# rke2
e
Here's the full kubectl describe on the pod:
# ./kubectl describe pod kube-apiserver-henry-cluster4-pool1-27cb5e2d-fsfxg -n kube-system
Name:                 kube-apiserver-henry-cluster4-pool1-27cb5e2d-fsfxg
Namespace:            kube-system
Priority:             2000000000
Priority Class Name:  system-cluster-critical
Node:                 henry-cluster4-pool1-27cb5e2d-fsfxg/xxxxxxxxxxx
Start Time:           Wed, 08 Feb 2023 14:06:08 -0500
Labels:               component=kube-apiserver
                      tier=control-plane
Annotations:          kubernetes.io/config.hash: 4874a08227e8932676b83ca998a390f3
                      kubernetes.io/config.mirror: 4874a08227e8932676b83ca998a390f3
                      kubernetes.io/config.seen: 2023-02-08T13:54:41.543487161-05:00
                      kubernetes.io/config.source: file
                      kubernetes.io/psp: global-unrestricted-psp
Status:               Running
IP:                   XXXXXXXXXX
IPs:
  IP:           XXXXXXXXXX
Controlled By:  Node/henry-cluster4-pool1-27cb5e2d-fsfxg
Containers:
  kube-apiserver:
    Container ID:  containerd://07c1f36b907b90784ed1a6d7e4c95c6817018a113ce7a9b555a87660b72e4fce
    Image:         index.docker.io/rancher/hardened-kubernetes:v1.24.9-rke2r2-build20230104
    Image ID:      docker.io/rancher/hardened-kubernetes@sha256:284ed583bf9011db9110b47683084c3238127c49ab937ad31be3efbf5656a0bc
    Port:          <none>
    Host Port:     <none>
    Command:
      kube-apiserver
    Args:
      --allow-privileged=true
      --anonymous-auth=false
      --api-audiences=https://kubernetes.default.svc.cluster.local,rke2
      --authorization-mode=Node,RBAC
      --bind-address=0.0.0.0
      --cert-dir=/var/lib/rancher/rke2/server/tls/temporary-certs
      --client-ca-file=/var/lib/rancher/rke2/server/tls/client-ca.crt
      --egress-selector-config-file=/var/lib/rancher/rke2/server/etc/egress-selector-config.yaml
      --enable-admission-plugins=NodeRestriction,PodSecurityPolicy
      --enable-aggregator-routing=true
      --encryption-provider-config=/var/lib/rancher/rke2/server/cred/encryption-config.json
      --etcd-cafile=/var/lib/rancher/rke2/server/tls/etcd/server-ca.crt
      --etcd-certfile=/var/lib/rancher/rke2/server/tls/etcd/client.crt
      --etcd-keyfile=/var/lib/rancher/rke2/server/tls/etcd/client.key
      --etcd-servers=https://127.0.0.1:2379
      --feature-gates=JobTrackingWithFinalizers=true
      --kubelet-certificate-authority=/var/lib/rancher/rke2/server/tls/server-ca.crt
      --kubelet-client-certificate=/var/lib/rancher/rke2/server/tls/client-kube-apiserver.crt
      --kubelet-client-key=/var/lib/rancher/rke2/server/tls/client-kube-apiserver.key
      --kubelet-preferred-address-types=InternalIP,ExternalIP,Hostname
      --profiling=false
      --proxy-client-cert-file=/var/lib/rancher/rke2/server/tls/client-auth-proxy.crt
      --proxy-client-key-file=/var/lib/rancher/rke2/server/tls/client-auth-proxy.key
      --requestheader-allowed-names=system:auth-proxy
      --requestheader-client-ca-file=/var/lib/rancher/rke2/server/tls/request-header-ca.crt
      --requestheader-extra-headers-prefix=X-Remote-Extra-
      --requestheader-group-headers=X-Remote-Group
      --requestheader-username-headers=X-Remote-User
      --secure-port=6443
      --service-account-issuer=https://kubernetes.default.svc.cluster.local
      --service-account-key-file=/var/lib/rancher/rke2/server/tls/service.key
      --service-account-signing-key-file=/var/lib/rancher/rke2/server/tls/service.key
      --service-cluster-ip-range=10.43.0.0/16
      --service-node-port-range=30000-32767
      --storage-backend=etcd3
      --tls-cert-file=/var/lib/rancher/rke2/server/tls/serving-kube-apiserver.crt
      --tls-private-key-file=/var/lib/rancher/rke2/server/tls/serving-kube-apiserver.key
    State:          Running
      Started:      Wed, 08 Feb 2023 14:12:32 -0500
    Last State:     Terminated
      Reason:       Error
      Exit Code:    137
      Started:      Wed, 08 Feb 2023 13:54:42 -0500
      Finished:     Wed, 08 Feb 2023 14:12:10 -0500
    Ready:          True
    Restart Count:  1
    Requests:
      cpu:      250m
      memory:   1Gi
    Liveness:   exec [kubectl get --server=https://localhost:6443/ --client-certificate=/var/lib/rancher/rke2/server/tls/client-kube-apiserver.crt --client-key=/var/lib/rancher/rke2/server/tls/client-kube-apiserver.key --certificate-authority=/var/lib/rancher/rke2/server/tls/server-ca.crt --raw=/livez] delay=10s timeout=15s period=10s #success=1 #failure=8
    Readiness:  exec [kubectl get --server=https://localhost:6443/ --client-certificate=/var/lib/rancher/rke2/server/tls/client-kube-apiserver.crt --client-key=/var/lib/rancher/rke2/server/tls/client-kube-apiserver.key --certificate-authority=/var/lib/rancher/rke2/server/tls/server-ca.crt --raw=/readyz] delay=0s timeout=15s period=5s #success=1 #failure=3
    Startup:    exec [kubectl get --server=https://localhost:6443/ --client-certificate=/var/lib/rancher/rke2/server/tls/client-kube-apiserver.crt --client-key=/var/lib/rancher/rke2/server/tls/client-kube-apiserver.key --certificate-authority=/var/lib/rancher/rke2/server/tls/server-ca.crt --raw=/livez] delay=10s timeout=5s period=10s #success=1 #failure=24
    Environment:
      FILE_HASH:  65c747a3981e887284258667d87df8f0aac1d5eb238d049859c1986c85b92190
      NO_PROXY:   .svc,.cluster.local,10.42.0.0/16,10.43.0.0/16
      POD_HASH:   0d017805ffe058495a528aa9431ffa9b
    Mounts:
      /etc/ca-certificates from dir1 (rw)
      /etc/ssl/certs from dir0 (rw)
      /var/lib/rancher/rke2/server/cred/encryption-config.json from file1 (ro)
      /var/lib/rancher/rke2/server/db/etcd/name from file0 (ro)
      /var/lib/rancher/rke2/server/etc/egress-selector-config.yaml from file2 (ro)
      /var/lib/rancher/rke2/server/logs from dir2 (rw)
      /var/lib/rancher/rke2/server/tls/client-auth-proxy.crt from file3 (ro)
      /var/lib/rancher/rke2/server/tls/client-auth-proxy.key from file4 (ro)
      /var/lib/rancher/rke2/server/tls/client-ca.crt from file5 (ro)
      /var/lib/rancher/rke2/server/tls/client-kube-apiserver.crt from file6 (ro)
      /var/lib/rancher/rke2/server/tls/client-kube-apiserver.key from file7 (ro)
      /var/lib/rancher/rke2/server/tls/etcd/client.crt from file8 (ro)
      /var/lib/rancher/rke2/server/tls/etcd/client.key from file9 (ro)
      /var/lib/rancher/rke2/server/tls/etcd/server-ca.crt from file10 (ro)
      /var/lib/rancher/rke2/server/tls/request-header-ca.crt from file11 (ro)
      /var/lib/rancher/rke2/server/tls/server-ca.crt from file12 (ro)
      /var/lib/rancher/rke2/server/tls/service.key from file13 (ro)
      /var/lib/rancher/rke2/server/tls/serving-kube-apiserver.crt from file14 (ro)
      /var/lib/rancher/rke2/server/tls/serving-kube-apiserver.key from file15 (ro)
Conditions:
  Type              Status
  Initialized       True 
  Ready             True 
  ContainersReady   True 
  PodScheduled      True 
Volumes:
  dir0:
    Type:          HostPath (bare host directory volume)
    Path:          /etc/ssl/certs
    HostPathType:  DirectoryOrCreate
  dir1:
    Type:          HostPath (bare host directory volume)
    Path:          /etc/ca-certificates
    HostPathType:  DirectoryOrCreate
  dir2:
    Type:          HostPath (bare host directory volume)
    Path:          /var/lib/rancher/rke2/server/logs
    HostPathType:  DirectoryOrCreate
  file0:
    Type:          HostPath (bare host directory volume)
    Path:          /var/lib/rancher/rke2/server/db/etcd/name
    HostPathType:  File
  file1:
    Type:          HostPath (bare host directory volume)
    Path:          /var/lib/rancher/rke2/server/cred/encryption-config.json
    HostPathType:  File
  file2:
    Type:          HostPath (bare host directory volume)
    Path:          /var/lib/rancher/rke2/server/etc/egress-selector-config.yaml
    HostPathType:  File
  file3:
    Type:          HostPath (bare host directory volume)
    Path:          /var/lib/rancher/rke2/server/tls/client-auth-proxy.crt
    HostPathType:  File
  file4:
    Type:          HostPath (bare host directory volume)
    Path:          /var/lib/rancher/rke2/server/tls/client-auth-proxy.key
    HostPathType:  File
  file5:
    Type:          HostPath (bare host directory volume)
    Path:          /var/lib/rancher/rke2/server/tls/client-ca.crt
    HostPathType:  File
  file6:
    Type:          HostPath (bare host directory volume)
    Path:          /var/lib/rancher/rke2/server/tls/client-kube-apiserver.crt
    HostPathType:  File
  file7:
    Type:          HostPath (bare host directory volume)
    Path:          /var/lib/rancher/rke2/server/tls/client-kube-apiserver.key
    HostPathType:  File
  file8:
    Type:          HostPath (bare host directory volume)
    Path:          /var/lib/rancher/rke2/server/tls/etcd/client.crt
    HostPathType:  File
  file9:
    Type:          HostPath (bare host directory volume)
    Path:          /var/lib/rancher/rke2/server/tls/etcd/client.key
    HostPathType:  File
  file10:
    Type:          HostPath (bare host directory volume)
    Path:          /var/lib/rancher/rke2/server/tls/etcd/server-ca.crt
    HostPathType:  File
  file11:
    Type:          HostPath (bare host directory volume)
    Path:          /var/lib/rancher/rke2/server/tls/request-header-ca.crt
    HostPathType:  File
  file12:
    Type:          HostPath (bare host directory volume)
    Path:          /var/lib/rancher/rke2/server/tls/server-ca.crt
    HostPathType:  File
  file13:
    Type:          HostPath (bare host directory volume)
    Path:          /var/lib/rancher/rke2/server/tls/service.key
    HostPathType:  File
  file14:
    Type:          HostPath (bare host directory volume)
    Path:          /var/lib/rancher/rke2/server/tls/serving-kube-apiserver.crt
    HostPathType:  File
  file15:
    Type:          HostPath (bare host directory volume)
    Path:          /var/lib/rancher/rke2/server/tls/serving-kube-apiserver.key
    HostPathType:  File
QoS Class:         Burstable
Node-Selectors:    <none>
Tolerations:       :NoExecute op=Exists
Events:
  Type     Reason     Age                 From     Message
  ----     ------     ----                ----     -------
  Warning  Unhealthy  28m (x32 over 22h)  kubelet  Readiness probe failed: Error from server (InternalError): an error on the server ("[+]ping ok\n[+]log ok\n[-]etcd failed: reason withheld\n[+]informer-sync ok\n[+]poststarthook/start-kube-apiserver-admission-initializer ok\n[+]poststarthook/generic-apiserver-start-informers ok\n[+]poststarthook/priority-and-fairness-config-consumer ok\n[+]poststarthook/priority-and-fairness-filter ok\n[+]poststarthook/start-apiextensions-informers ok\n[+]poststarthook/start-apiextensions-controllers ok\n[+]poststarthook/crd-informer-synced ok\n[+]poststarthook/bootstrap-controller ok\n[+]poststarthook/rbac/bootstrap-roles ok\n[+]poststarthook/scheduling/bootstrap-system-priority-classes ok\n[+]poststarthook/priority-and-fairness-config-producer ok\n[+]poststarthook/start-cluster-authentication-info-controller ok\n[+]poststarthook/aggregator-reload-proxy-client-cert ok\n[+]poststarthook/start-kube-aggregator-informers ok\n[+]poststarthook/apiservice-registration-controller ok\n[+]poststarthook/apiservice-status-available-controller ok\n[+]poststarthook/kube-apiserver-autoregistration ok\n[+]autoregister-completion ok\n[+]poststarthook/apiservice-openapi-controller ok\n[+]poststarthook/apiservice-openapiv3-controller ok\n[+]shutdown ok\nreadyz check failed") has prevented the request from succeeding
c
You don’t need to pass the cert and key, as it’s telling you those are already provided by your kubeconfig.
The only important bit here is this:
[-]etcd failed: reason withheld
You should look at the apiserver and etcd pod logs on that node to figure out why the apiserver is unhealthy.
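For reference, a minimal sketch of those checks on an RKE2 server node (assuming the default kubeconfig location at /etc/rancher/rke2/rke2.yaml; the node name is a placeholder):
export KUBECONFIG=/etc/rancher/rke2/rke2.yaml
# Ask the apiserver itself which health check is failing
kubectl get --raw='/readyz?verbose'
# Tail the static pod logs for the apiserver and etcd on the affected node
kubectl -n kube-system logs kube-apiserver-<node-name> --tail=100
kubectl -n kube-system logs etcd-<node-name> --tail=100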
e
Thanks. I will look at the logs on the apiserver and etcd pods. Right now the event has disappeared, but I know it will come back eventually.
c
it’s possible that the latency is too high and it’s mad about that
e
The etcd pod is spilling "prober found high clock drift" messages. I'm waiting on the admin to get the nodes synced up to the time server. This might be the cause of the issue. Will let you know.
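A rough sketch of how the drift could be confirmed per node, assuming the usual chrony/systemd tools are installed on these VMs:
# Run on each node and compare
timedatectl status      # reports whether the system clock is considered synchronized
chronyc tracking        # current offset from the configured time source (if chronyd is running)
chronyc sources -v      # which time servers chronyd is actually using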
c
ah yeah that’d do it
e
Still struggling with the time sync between the 3 nodes. They are between 1 and 10 seconds apart. The etcd log shows msgs like these, with a remote-peer-id I can't figure out. It's not the machine-id or the DUID of the VMs. Do you know what that id is referencing here?
{"level":"warn","ts":"2023-02-10T01:00:34.834Z","caller":"rafthttp/probing_status.go:82","msg":"prober found high clock drift","round-tripper-name":"ROUND_TRIPPER_RAFT_MESSAGE","remote-peer-id":"1d87be6feeb3c575","clock-drift":"3.517034636s","rtt":"6.270375ms"}
{"level":"warn","ts":"2023-02-10T01:00:34.910Z","caller":"rafthttp/probing_status.go:82","msg":"prober found high clock drift","round-tripper-name":"ROUND_TRIPPER_RAFT_MESSAGE","remote-peer-id":"38e76a16311e2a6c","clock-drift":"3.595203382s","rtt":"5.143833ms"}
{"level":"warn","ts":"2023-02-10T01:00:34.916Z","caller":"rafthttp/probing_status.go:82","msg":"prober found high clock drift","round-tripper-name":"ROUND_TRIPPER_SNAPSHOT","remote-peer-id":"38e76a16311e2a6c","clock-drift":"3.597571189s","rtt":"777.936µs"}
c
the id is randomly assigned by etcd itself. You could install etcdctl and try to figure out which one it is, but a better use of time would probably be to install ntpd, set it up to sync to a pool of servers, and then check which one is the most off from those.
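For completeness, a sketch of that lookup with etcdctl, run wherever the tool is available and these files are readable (e.g. inside the RKE2 etcd static pod or on the server node); the cert paths are the same ones the apiserver uses in the describe output above:
# The hex member IDs in the output correspond to the "remote-peer-id" values in the log
ETCDCTL_API=3 etcdctl member list -w table \
  --endpoints=https://127.0.0.1:2379 \
  --cacert=/var/lib/rancher/rke2/server/tls/etcd/server-ca.crt \
  --cert=/var/lib/rancher/rke2/server/tls/etcd/client.crt \
  --key=/var/lib/rancher/rke2/server/tls/etcd/client.key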
e
I got the clocks in sync on all nodes. These VMs have chronyd configured to sync to an internal time server, but they were also syncing to the ESXi hosts via VMware Tools, and those two sources were not in sync, so the VMs were pulled back and forth between them. I disabled chronyd on the VMs and now they are perfectly in sync. Now I'm seeing "apply request took too long" msgs in the etcd pods. I'm not sure why there's such huge latency between these VMs. There are no user workloads, only the system pods. Each node's CPU is at 20%, memory at 70%, and all 3 VMs are on the same ESXi host, so I can't imagine there's any network congestion. But I will look into it to make sure.
{"level":"info","ts":"2023-02-10T02:34:02.576Z","caller":"traceutil/trace.go:171","msg":"trace[1663112911] linearizableReadLoop","detail":"{readStateIndex:711272; appliedIndex:711272; }","duration":"613.489744ms","start":"2023-02-10T02:34:01.962Z","end":"2023-02-10T02:34:02.576Z","steps":["trace[1663112911] 'read index received' (duration: 613.482382ms)","trace[1663112911] 'applied index is now lower than readState.Index' (duration: 3.327µs)"],"step_count":2}
{"level":"warn","ts":"2023-02-10T02:34:02.578Z","caller":"etcdserver/util.go:166","msg":"apply request took too long","took":"615.409408ms","expected-duration":"100ms","prefix":"read-only range ","request":"key:\"/registry/namespaces/default\" ","response":"range_response_count:1 size:1142"}
{"level":"info","ts":"2023-02-10T02:34:02.578Z","caller":"traceutil/trace.go:171","msg":"trace[441043562] range","detail":"{range_begin:/registry/namespaces/default; range_end:; response_count:1; response_revision:641985; }","duration":"615.893108ms","start":"2023-02-10T02:34:01.962Z","end":"2023-02-10T02:34:02.578Z","steps":["trace[441043562] 'agreement among raft nodes before linearized reading' (duration: 615.371984ms)"],"step_count":1}
{"level":"warn","ts":"2023-02-10T02:34:02.578Z","caller":"v3rpc/interceptor.go:197","msg":"request stats","start time":"2023-02-10T02:34:01.962Z","time spent":"616.11993ms","remote":"127.0.0.1:33802","response type":"/etcdserverpb.KV/Range","request count":0,"request size":30,"response count":1,"response size":1164,"request content":"key:\"/registry/namespaces/default\" "}
{"level":"warn","ts":"2023-02-10T02:34:02.687Z","caller":"etcdserver/util.go:166","msg":"apply request took too long","took":"100.923121ms","expected-duration":"100ms","prefix":"read-only range ","request":"key:\"/registry/masterleases/10.246.159.95\" ","response":"range_response_count:1 size:141"}
{"level":"info","ts":"2023-02-10T02:34:02.688Z","caller":"traceutil/trace.go:171","msg":"trace[1058366229] range","detail":"{range_begin:/registry/masterleases/10.246.159.95; range_end:; response_count:1; response_revision:641985; }","duration":"101.046608ms","start":"2023-02-10T02:34:02.587Z","end":"2023-02-10T02:34:02.688Z","steps":["trace[1058366229] 'agreement among raft nodes before linearized reading' (duration: 89.398475ms)"],"step_count":1}
c
that sort of thing is usually more related to disk IO latency than network latency. I’m assuming these nodes are on VMFS on top of… something? Do you know what kind of disk it is?
SSD is usually recommended for etcd
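One way to quantify the disk side is the fio check commonly cited in the etcd docs; a sketch, assuming fio is installed and using a throwaway directory next to the RKE2 etcd data dir so the numbers are representative:
# Measures fdatasync latency the same way etcd's WAL writes do;
# the usual guidance is a 99th-percentile fdatasync latency well under 10ms.
mkdir -p /var/lib/rancher/rke2/server/db/etcd-fio-test
fio --rw=write --ioengine=sync --fdatasync=1 \
    --directory=/var/lib/rancher/rke2/server/db/etcd-fio-test \
    --size=22m --bs=2300 --name=etcd-disk-check
rm -rf /var/lib/rancher/rke2/server/db/etcd-fio-test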
e
Yes, the VM volumes are on VMFS backed by a SAN array using near-line drives. I do not have access to the array, so I can't see its actual configuration or performance. On the VMs I have been watching the I/O latency with iostat, and so far writes are under 1ms and reads are around 30ms.
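For reference, the kind of iostat invocation (from the sysstat package) that surfaces those numbers; the relevant disk is whichever device backs /var/lib/rancher/rke2/server/db/etcd:
# Extended device stats every 5 seconds; r_await / w_await are the average read/write
# latencies in ms, and %util shows how saturated the device is
iostat -x 5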
c
by near-line do you mean rotational storage?
e
Yes, spinning drives. Typically the writes are acknowledged as soon as they reach the cache, so the latency is not that bad unless the storage is under a heavy workload.