This message was deleted.
# rke2
a
This message was deleted.
a
@creamy-rainbow-46562
c
I have no idea what you’re showing here. What is actually consuming memory?
a
Yes. ETCDs memory
I need to confirm, but I think the node that crashes is always the master etcd node.
c
Are you talking about the etcd pod itself? I’m not aware of any memory leaks in etcd.
a
This behavior occurred only on the etcd instances. I'll improve my monitoring to isolate pod memory usage and check which pod or process is using the most memory.
Hi @creamy-pencil-82913 I was able to identify what is causing the increase in memory on the etcd node: it is the rke2-server process. I don't understand why, but I have collected some logs and maybe you can spot something strange. Looking at htop, we can see that this etcd node is using 35% of memory for the rke2-server process and it keeps increasing. We have 5 etcd nodes segregated from the control plane, and this behavior occurs on a single etcd node — I believe it is the master; the rest of the etcd nodes use an average of 4.5% of memory. Below is the image of the htop output:
This image below shows the memory usage behavior and it only occurs on one node and this will even lead to the node crash.
Logs:
c
logs won’t provide any information on what’s going on with process memory. Can you set
enable-pprof: true
on your servers, and collect a memory profile from when the utilization is high? You can get a profile from:
Copy code
curl -vks -o heap.pprof "https://localhost:9345/debug/pprof/heap?seconds=120"
a
Hi @creamy-pencil-82913 I'll check.
Hi @creamy-pencil-82913 I couldn't identify where I need to put this parameter. Does it go inside the RKE config? I configured the whole cluster using Terraform. My Terraform:
Copy code
# Rancher-provisioned RKE2 cluster with dedicated etcd / control-plane /
# worker node pools (selected via rke.cattle.io/*-role labels) and the
# external AWS cloud provider deployed as a HelmChart.
resource "rancher2_cluster_v2" "cluster" {
  provider = rancher2.admin

  name               = var.cluster_name
  kubernetes_version = var.cluster_kubernetes_version

  rke_config {
    # Applied to every node in the cluster.
    machine_global_config = yamlencode({
      cloud-provider-name = "aws"
    })

    # etcd nodes. enable-pprof exposes the Go pprof endpoint on the rke2
    # supervisor port (9345) so a heap profile can be captured while
    # rke2-server memory usage is high:
    #   curl -vks -o heap.pprof "https://localhost:9345/debug/pprof/heap?seconds=120"
    machine_selector_config {
      machine_label_selector {
        match_expressions {
          key      = "rke.cattle.io/etcd-role"
          operator = "In"
          values   = ["true"]
        }
      }
      config = yamlencode({
        enable-pprof = true
        kubelet-arg = [
          "cloud-provider=external"
        ]
      })
    }

    # Control-plane nodes: disable the in-tree cloud controller and point
    # every component at the external cloud provider.
    machine_selector_config {
      machine_label_selector {
        match_expressions {
          key      = "rke.cattle.io/control-plane-role"
          operator = "In"
          values   = ["true"]
        }
      }
      config = yamlencode({
        # NOTE(review): "true" is passed as a string here; confirm rke2
        # accepts a string boolean for disable-cloud-controller.
        disable-cloud-controller = "true"
        kubelet-arg = [
          "cloud-provider=external"
        ]
        kube-apiserver-arg = [
          "cloud-provider=external"
        ]
        kube-controller-manager-arg = [
          "cloud-provider=external"
        ]
      })
    }

    # Worker nodes: external cloud provider plus the ECR credential
    # provider plugin for pulling images from private ECR registries.
    machine_selector_config {
      machine_label_selector {
        match_expressions {
          key      = "rke.cattle.io/worker-role"
          operator = "In"
          values   = ["true"]
        }
      }
      config = yamlencode({
        kubelet-arg = [
          "cloud-provider=external",
          "image-credential-provider-bin-dir=/opt/xxx-toolbox/files/ecr-credential-provider",
          "image-credential-provider-config=/opt/xxx-toolbox/files/ecr-credential-provider/credential-provider-config.yaml"
        ]
      })
    }

    # Roll one node at a time; drain forcefully without eviction so
    # upgrades are not blocked by PodDisruptionBudgets.
    upgrade_strategy {
      control_plane_concurrency = "1"
      worker_concurrency        = "1"
      worker_drain_options {
        enabled               = true
        delete_empty_dir_data = true
        ignore_daemon_sets    = true
        disable_eviction      = true
        force                 = true
      }
      control_plane_drain_options {
        enabled               = true
        delete_empty_dir_data = true
        ignore_daemon_sets    = true
        disable_eviction      = true
        force                 = true
      }
    }

    # Scheduled etcd snapshots, shipped to S3.
    etcd {
      snapshot_schedule_cron = var.snapshot_schedule_cron
      snapshot_retention     = var.snapshot_retention
      s3_config {
        bucket   = var.bucket_etcd_bkp_name
        endpoint = "s3.amazonaws.com"
        folder   = "${var.cluster_name}-etcd-backup"
        region   = data.aws_s3_bucket.selected.region
      }
    }

    # Deploy the external AWS cloud-controller-manager at bootstrap time.
    # The repo URL previously had Slack-style angle brackets (<https://...>)
    # around it, which would produce an invalid chart repository value.
    additional_manifest = <<EOF
apiVersion: helm.cattle.io/v1
kind: HelmChart
metadata:
  name: aws-cloud-controller-manager
  namespace: kube-system
spec:
  chart: aws-cloud-controller-manager
  repo: https://kubernetes.github.io/cloud-provider-aws
  targetNamespace: kube-system
  bootstrap: true
  valuesContent: |-
    hostNetworking: true
    nodeSelector:
      node-role.kubernetes.io/control-plane: "true"
    args:
      - --configure-cloud-routes=false
      - --use-service-account-credentials=true
      - --v=2
      - --cloud-provider=aws
    clusterRoleRules:
      - apiGroups:
          - ""
        resources:
          - events
        verbs:
          - create
          - patch
          - update
      - apiGroups:
          - ""
        resources:
          - nodes
        verbs:
          - '*'
      - apiGroups:
          - ""
        resources:
          - nodes/status
        verbs:
          - patch
      - apiGroups:
          - ""
        resources:
          - services
        verbs:
          - list
          - patch
          - update
          - watch
      - apiGroups:
          - ""
        resources:
          - services/status
        verbs:
          - list
          - patch
          - update
          - watch
      - apiGroups:
          - ""
        resources:
          - serviceaccounts
        verbs:
          - create
          - get
      - apiGroups:
          - ""
        resources:
          - persistentvolumes
        verbs:
          - get
          - list
          - update
          - watch
      - apiGroups:
          - ""
        resources:
          - endpoints
        verbs:
          - create
          - get
          - list
          - watch
          - update
      - apiGroups:
          - coordination.k8s.io
        resources:
          - leases
        verbs:
          - create
          - get
          - list
          - watch
          - update
      - apiGroups:
          - ""
        resources:
          - serviceaccounts/token
        verbs:
          - create
EOF
  }
}
Can you help me?