From 16c0d09d6b1cb177e093c3b885802465e0318a64 Mon Sep 17 00:00:00 2001 From: Konstantin Krastev Date: Wed, 14 May 2025 12:00:57 +0300 Subject: [PATCH] Update monitor-all.sh (#4437) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ✅ Summary of Changes from the Initial Version 🧩 1. Tag-Based Filtering (Core Feature) New feature: Only restart instances (VMs or containers) that have the mon-restart tag. This makes monitoring and auto-restart controllable directly from the Proxmox Web UI, without editing scripts or services. Set via GUI: VM → Options → Tags → mon-restart Set via CLI: qm set <vmid> -tags mon-restart or pct set <vmid> -tags mon-restart This is the primary new control mechanism, making the script safer, more flexible, and user-friendly. 🧰 2. Backward-Compatible Exclusion Mechanism The original feature that lets you exclude instances via CLI arguments is preserved: ./ping-instances.sh 101 300 These IDs will always be skipped regardless of tag. 🧠 3. Intelligent Responsiveness Checks For VMs: Uses qm guest cmd <vmid> ping to check responsiveness via the QEMU guest agent. No longer relies on network-level ping, which can be misleading or blocked. For containers (CTs): Uses traditional ping to IP addresses obtained from pct exec, since CTs don’t support the QEMU guest agent. ⛔ 4. Instance Skipping Improvements Instances are now skipped if: They are explicitly excluded via CLI. They are templates. They are configured with onboot: 0 or have no onboot setting. They lack the mon-restart tag, regardless of other status. 🪵 5. Same Logging Behavior All output continues to go to /var/log/ping-instances.log for persistent tracking. Verbose messages were added for traceability (e.g., why a VM or CT was skipped). 🎯 Why This Matters With tag-based control, admins can now manage restart behavior dynamically from the Proxmox Web UI, making the script: More secure (no accidental restarts). More maintainable (no script edits needed). 
More user-friendly (integrated with the UI workflow). --- tools/pve/monitor-all.sh | 120 ++++++++++++++++++++++----------------- 1 file changed, 68 insertions(+), 52 deletions(-) diff --git a/tools/pve/monitor-all.sh b/tools/pve/monitor-all.sh index b0f95786a..30795cce1 100644 --- a/tools/pve/monitor-all.sh +++ b/tools/pve/monitor-all.sh @@ -16,43 +16,52 @@ cat <<"EOF" EOF add() { + echo -e "\n IMPORTANT: Tag-Based Monitoring Enabled" + echo "Only VMs and containers with the tag 'mon-restart' will be automatically restarted by this service." + echo + echo "πŸ”§ How to add the tag:" + echo " β†’ Proxmox Web UI: Go to VM/CT β†’ Options β†’ Tags β†’ Add 'mon-restart'" + echo " β†’ CLI: qm set -tags mon-restart" + echo " pct set -tags mon-restart" + echo + while true; do - read -p "This script will add Monitor All to Proxmox VE. Proceed(y/n)?" yn + read -p "This script will add Monitor All to Proxmox VE. Proceed (y/n)? " yn case $yn in - [Yy]*) break ;; - [Nn]*) exit ;; - *) echo "Please answer yes or no." ;; + [Yy]*) break ;; + [Nn]*) exit ;; + *) echo "Please answer yes or no." 
;; esac done - echo '#!/usr/bin/env bash + cat <<'EOF' >/usr/local/bin/ping-instances.sh +#!/usr/bin/env bash + # Read excluded instances from command line arguments excluded_instances=("$@") echo "Excluded instances: ${excluded_instances[@]}" while true; do - for instance in $(pct list | awk '\''{if(NR>1) print $1}'\''; qm list | awk '\''{if(NR>1) print $1}'\''); do + for instance in $(pct list | awk 'NR>1 {print $1}'; qm list | awk 'NR>1 {print $1}'); do # Skip excluded instances if [[ " ${excluded_instances[@]} " =~ " ${instance} " ]]; then echo "Skipping $instance because it is excluded" continue fi - # Determine the type of the instance (container or virtual machine) + # Determine type and set config command if pct status $instance >/dev/null 2>&1; then - # It is a container + type="ct" config_cmd="pct config" - IP=$(pct exec $instance ip a s dev eth0 | awk '\''/inet / {print $2}'\'' | cut -d/ -f1) else - # It is a virtual machine + type="vm" config_cmd="qm config" - IP=$(qm guest cmd $instance network-get-interfaces | egrep -o "([0-9]{1,3}\.){3}[0-9]{1,3}" | grep -E "192\.|10\." | head -n 1) fi - # Skip instances based on onboot and templates + # Skip templates and onboot-disabled onboot=$($config_cmd $instance | grep -q "onboot: 0" || ( ! $config_cmd $instance | grep -q "onboot" ) && echo "true" || echo "false") - template=$($config_cmd $instance | grep template | grep -q "template:" && echo "true" || echo "false") + template=$($config_cmd $instance | grep -q "^template:" && echo "true" || echo "false") if [ "$onboot" == "true" ]; then echo "Skipping $instance because it is set not to boot" @@ -62,36 +71,49 @@ while true; do continue fi - # Ping the instance - if ! 
ping -c 1 $IP >/dev/null 2>&1; then - # If the instance can not be pinged, stop and start it - if pct status $instance >/dev/null 2>&1; then - # It is a container + # Check for mon-restart tag + has_tag=$($config_cmd $instance | grep -q "tags:.*mon-restart" && echo "true" || echo "false") + if [ "$has_tag" != "true" ]; then + echo "Skipping $instance because it does not have 'mon-restart' tag" + continue + fi + + # Responsiveness check and restart if needed + if [ "$type" == "vm" ]; then + # Check if guest agent responds + if qm guest cmd $instance ping >/dev/null 2>&1; then + echo "VM $instance is responsive via guest agent" + else + echo "$(date): VM $instance is not responding to agent ping, restarting..." + if qm status $instance | grep -q "status: running"; then + qm stop $instance >/dev/null 2>&1 + sleep 5 + fi + qm start $instance >/dev/null 2>&1 + fi + else + # Container: get IP and ping + IP=$(pct exec $instance ip a s dev eth0 | awk '/inet / {print $2}' | cut -d/ -f1) + if ! ping -c 1 $IP >/dev/null 2>&1; then echo "$(date): CT $instance is not responding, restarting..." pct stop $instance >/dev/null 2>&1 sleep 5 pct start $instance >/dev/null 2>&1 else - # It is a virtual machine - if qm status $instance | grep -q "status: running"; then - echo "$(date): VM $instance is not responding, restarting..." - qm stop $instance >/dev/null 2>&1 - sleep 5 - else - echo "$(date): VM $instance is not running, starting..." - fi - qm start $instance >/dev/null 2>&1 + echo "CT $instance is responsive" fi fi done - # Wait for 5 minutes. (Edit to your needs) echo "$(date): Pausing for 5 minutes..." 
sleep 300 -done >/var/log/ping-instances.log 2>&1' >/usr/local/bin/ping-instances.sh + +done >/var/log/ping-instances.log 2>&1 +EOF + touch /var/log/ping-instances.log - # Change file permissions to executable chmod +x /usr/local/bin/ping-instances.sh + cat </etc/systemd/system/ping-instances.timer [Unit] Description=Delay ping-instances.service by 5 minutes @@ -104,17 +126,17 @@ OnUnitActiveSec=300 WantedBy=timers.target EOF - # Create ping-instances.service cat </etc/systemd/system/ping-instances.service [Unit] -Description=Ping instances every 5 minutes and restarts if necessary +Description=Ping instances every 5 minutes and restart if necessary After=ping-instances.timer Requires=ping-instances.timer + [Service] Type=simple -# To specify which CT/VM should be excluded, add the CT/VM ID at the end of the line where ExecStart=/usr/local/bin/ping-instances.sh is specified. -# For example: ExecStart=/usr/local/bin/ping-instances.sh 100 102 -# Virtual machines without the QEMU guest agent installed must be excluded. +# To exclude specific instances, pass IDs to ExecStart, e.g.: +# ExecStart=/usr/local/bin/ping-instances.sh 100 200 +# Instances must also have the 'mon-restart' tag to be monitored ExecStart=/usr/local/bin/ping-instances.sh Restart=always @@ -125,39 +147,33 @@ StandardError=file:/var/log/ping-instances.log WantedBy=multi-user.target EOF - # Reload daemon, enable and start ping-instances.service systemctl daemon-reload systemctl enable -q --now ping-instances.timer systemctl enable -q --now ping-instances.service clear - echo -e "\n To view Monitor All logs: cat /var/log/ping-instances.log" + echo -e "\n Monitor All installed." + echo "πŸ“„ To view logs: cat /var/log/ping-instances.log" + echo "βš™οΈ Make sure your VMs or containers have the 'mon-restart' tag to be monitored." 
} remove() { systemctl disable -q --now ping-instances.timer systemctl disable -q --now ping-instances.service - rm /etc/systemd/system/ping-instances.service /etc/systemd/system/ping-instances.timer /usr/local/bin/ping-instances.sh /var/log/ping-instances.log - echo "Removed Monitor All from Proxmox VE" + rm -f /etc/systemd/system/ping-instances.service + rm -f /etc/systemd/system/ping-instances.timer + rm -f /usr/local/bin/ping-instances.sh + rm -f /var/log/ping-instances.log + echo "Monitor All removed from Proxmox VE" } -# Define options for the whiptail menu OPTIONS=(Add "Add Monitor-All to Proxmox VE" Remove "Remove Monitor-All from Proxmox VE") -# Show the whiptail menu and save the user's choice CHOICE=$(whiptail --backtitle "Proxmox VE Helper Scripts" --title "Monitor-All for Proxmox VE" --menu "Select an option:" 10 58 2 \ "${OPTIONS[@]}" 3>&1 1>&2 2>&3) -# Check the user's choice and perform the corresponding action case $CHOICE in -"Add") - add - ;; -"Remove") - remove - ;; -*) - echo "Exiting..." - exit 0 - ;; +"Add") add ;; +"Remove") remove ;; +*) echo "Exiting..."; exit 0 ;; esac