zhong (钟鹏群) 1 月之前
父节点
当前提交
171dd5874b

+ 50 - 10
terraform/ansible-files/install-k3s-master.yml

@@ -9,17 +9,37 @@
     k3s_token: "my-secret-token"
 
   tasks:
-    - name: Stop and cleanup any existing k3s installation
-      shell: |
-        systemctl stop k3s 2>/dev/null || true
-        pkill -f k3s 2>/dev/null || true
-        rm -f /etc/systemd/system/k3s.service
-        rm -rf /var/lib/rancher/k3s
-        rm -rf /etc/rancher/k3s
-        rm -rf /root/.kube
-      register: cleanup_result
+    - name: Check if k3s service exists
+      stat:
+        path: /etc/systemd/system/k3s.service
+      register: k3s_service_exists
+
+    - name: Stop k3s service if it exists
+      systemd:
+        name: k3s
+        state: stopped
+      when: k3s_service_exists.stat.exists
       ignore_errors: yes
 
+    # Not gated on the unit file: k3s may be running even when it is absent.
+    - name: Kill any running k3s processes
+      shell: pkill -f k3s || true
+      ignore_errors: yes
+
+    - name: Remove k3s service file and data directories
+      file:
+        path: "{{ item }}"
+        state: absent
+      loop:
+        - /etc/systemd/system/k3s.service
+        - /var/lib/rancher/k3s
+        - /etc/rancher/k3s
+        - /root/.kube
+
+    - name: Reload systemd daemon
+      systemd:
+        daemon_reload: yes
+
     - name: Print cleanup status
       debug:
         msg: "Cleanup completed on master node"
@@ -60,9 +80,29 @@
     - name: Wait for node-token file to be created
       wait_for:
         path: /var/lib/rancher/k3s/server/node-token
-        timeout: 300
+        timeout: 120
       register: token_wait
 
+    - name: Check k3s service status
+      command: systemctl is-active k3s
+      register: service_status
+      failed_when: false  # is-active exits non-zero for every state except active
+
+    - name: Display k3s service status
+      debug:
+        msg: "k3s service status on master: {{ service_status.stdout }}"
+
+    - name: Get k3s logs if service is not active
+      command: journalctl -u k3s --no-pager -n 20
+      when: service_status.stdout != 'active'
+      register: service_logs
+      failed_when: false  # diagnostics only; never fail the play here
+
+    - name: Display k3s logs if needed
+      debug:
+        msg: "{{ service_logs.stdout_lines | default([]) }}"
+      when: service_status.stdout != 'active'
+
     - name: Display success message
       debug:
         msg: "k3s master node installed and running successfully"

+ 68 - 9
terraform/ansible-files/install-k3s-workers.yml

@@ -34,20 +34,49 @@
     k3s_download_url: "http://download.9981.tech/k3s-v1.35.0%2Bk3s1"
 
   tasks:
-    - name: Stop and cleanup any existing k3s installation on workers
-      shell: |
-        systemctl stop k3s-agent 2>/dev/null || true
-        pkill -f k3s 2>/dev/null || true
-        rm -rf /var/lib/rancher/k3s
-        rm -f /etc/systemd/system/k3s-agent.service
-        systemctl daemon-reload 2>/dev/null || true
-      register: cleanup_result
+    - name: Check if k3s-agent service exists
+      stat:
+        path: /etc/systemd/system/k3s-agent.service
+      register: k3s_agent_service_exists
+
+    - name: Stop k3s-agent service if it exists
+      systemd:
+        name: k3s-agent
+        state: stopped
+      when: k3s_agent_service_exists.stat.exists
       ignore_errors: yes
 
+    # Not gated on the unit file: k3s may be running even when it is absent.
+    - name: Kill any running k3s processes
+      shell: pkill -f k3s || true
+      ignore_errors: yes
+
+    - name: Remove k3s-agent service file
+      file:
+        path: /etc/systemd/system/k3s-agent.service
+        state: absent
+
+    - name: Remove k3s data directory
+      file:
+        path: /var/lib/rancher/k3s
+        state: absent
+
+    - name: Reload systemd daemon
+      systemd:
+        daemon_reload: yes
+
     - name: Print cleanup status
       debug:
         msg: "Cleanup completed on worker node {{ inventory_hostname }}"
 
+    - name: Test connectivity to master node
+      wait_for:
+        port: 6443
+        host: "{{ master_ip }}"
+        timeout: 10
+        state: started
+      ignore_errors: yes  # best-effort probe: a failure is reported but ignored
+
     - name: Check if k3s binary exists
       stat:
         path: /usr/local/bin/k3s
@@ -85,10 +114,40 @@
       wait_for:
         port: 10250
         host: "{{ inventory_hostname }}"
-        timeout: 300
+        timeout: 120
         delay: 10
       ignore_errors: yes
 
+    - name: Check k3s-agent service status
+      command: systemctl is-active k3s-agent
+      register: service_status
+      failed_when: false  # is-active exits non-zero for every state except active
+
+    - name: Display k3s-agent service status
+      debug:
+        msg: "k3s-agent service status on {{ inventory_hostname }}: {{ service_status.stdout }}"
+
+    - name: Get k3s-agent logs if service is not active
+      command: journalctl -u k3s-agent --no-pager -n 20
+      when: service_status.stdout != 'active'
+      register: service_logs
+      failed_when: false  # diagnostics only; never fail the play here
+
+    - name: Display k3s-agent logs if needed
+      debug:
+        msg: "{{ service_logs.stdout_lines | default([]) }}"
+      when: service_status.stdout != 'active'
+
+    - name: Check if k3s-agent process is running
+      command: pgrep k3s
+      register: k3s_process
+      failed_when: false  # pgrep exits 1 when nothing matches; not an error here
+
+    - name: Display k3s process info
+      debug:
+        msg: "k3s process IDs on {{ inventory_hostname }}: {{ k3s_process.stdout }}"
+      when: k3s_process.stdout is defined and k3s_process.stdout != ""
+
     - name: Display success message
       debug:
         msg: "k3s agent installed and joined to cluster on worker {{ inventory_hostname }}"

+ 1 - 0
terraform/ansible-files/main-playbook.yml

@@ -4,6 +4,7 @@
 
 - name: Install k3s workers and join to cluster
   import_playbook: install-k3s-workers.yml
+  ignore_errors: yes  # NOTE(review): likely not forwarded to the imported plays - confirm
 
 - name: Verify k3s cluster status
   import_playbook: verify-cluster.yml

+ 24 - 8
terraform/ansible-files/verify-cluster.yml

@@ -9,17 +9,25 @@
         path: /var/lib/rancher/k3s/server/manifests
         timeout: 300
 
-    - name: Check if kubectl is available
-      command: which kubectl
-      register: kubectl_check
-      changed_when: false
+    - name: Check if k3s kubectl is available
+      stat:
+        path: /usr/local/bin/k3s  # later invoked as "k3s kubectl"
+      register: k3s_binary
 
+    # One task instead of a k3s/kubectl pair sharing a register: a skipped
+    # task still writes its (skipped) result to the register, which would
+    # overwrite the real output and leave nodes_status without stdout_lines.
+    # Uses the kubectl bundled with k3s when the binary is present.
     - name: Get cluster nodes status
-      command: kubectl get nodes
+      command: "{{ kubectl_cmd }} get nodes"
+      vars:
+        kubectl_cmd: >-
+          {{ '/usr/local/bin/k3s kubectl' if k3s_binary.stat.exists
+          else 'kubectl' }}
       register: nodes_status
       environment:
         KUBECONFIG: /etc/rancher/k3s/k3s.yaml
-      when: kubectl_check.rc == 0
+      changed_when: false
       failed_when: false
 
     - name: Display cluster nodes status
@@ -27,12 +35,20 @@
         msg: "{{ nodes_status.stdout_lines }}"
       when: nodes_status is succeeded
 
+    # One task instead of a k3s/kubectl pair sharing a register: a skipped
+    # task still writes its (skipped) result to the register, which would
+    # overwrite the real output and leave cluster_info without its stdout.
+    # Uses the kubectl bundled with k3s when the binary is present.
     - name: Get cluster info
-      command: kubectl cluster-info
+      command: "{{ kubectl_cmd }} cluster-info"
+      vars:
+        kubectl_cmd: >-
+          {{ '/usr/local/bin/k3s kubectl' if k3s_binary.stat.exists
+          else 'kubectl' }}
       register: cluster_info
       environment:
         KUBECONFIG: /etc/rancher/k3s/k3s.yaml
-      when: kubectl_check.rc == 0
+      changed_when: false
       failed_when: false
 
     - name: Display cluster info