Update monitor-all.sh (#4437)

Summary of Changes from the Initial Version
🧩 1. Tag-Based Filtering (Core Feature)
New feature: Only restart instances (VMs or containers) that have the mon-restart tag.

This makes monitoring and auto-restart controllable directly from the Proxmox Web UI, without editing scripts or services.

Set via GUI: VM/CT → Options → Tags → mon-restart

Set via CLI: qm set <vmid> -tags mon-restart or pct set <ctid> -tags mon-restart

This is the primary new control mechanism, making the script safer, more flexible, and user-friendly.

🧰 2. Backward-Compatible Exclusion Mechanism
The original feature that lets you exclude instances via CLI arguments is preserved:

Example: ./ping-instances.sh 101 300
These IDs will always be skipped regardless of tag.

🧠 3. Intelligent Responsiveness Checks
For VMs:

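Uses qm guest cmd <id> ping to check responsiveness via the QEMU guest agent. A tagged VM whose guest agent does not answer is treated as unresponsive and is stopped and started again, so only tag VMs that have the guest agent installed and running.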

No longer relies on network-level ping, which can be misleading or blocked.

For containers (CTs):

Uses a network ping to the container's eth0 IP address, which is read with pct exec, since containers do not run the QEMU guest agent.

4. Instance Skipping Improvements
Instances are now skipped if:

They are explicitly excluded via CLI.

They are templates.

They have onboot: 0 in their configuration, or no onboot setting at all.

They lack the mon-restart tag, regardless of other status.

🪵 5. Same Logging Behavior
All output continues to go to /var/log/ping-instances.log for persistent tracking.

Verbose messages were added for traceability (e.g., why a VM or CT was skipped).

🎯 Why This Matters
With tag-based control, admins can now manage restart behavior dynamically from the Proxmox Web UI, making the script:

Safer (no accidental restarts of untagged instances).

More maintainable (no script edits needed).

More user-friendly (integrated with the UI workflow).
This commit is contained in:
Konstantin Krastev 2025-05-14 12:00:57 +03:00 committed by GitHub
parent 7740ab68f7
commit 16c0d09d6b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -16,43 +16,52 @@ cat <<"EOF"
EOF EOF
add() { add() {
echo -e "\n IMPORTANT: Tag-Based Monitoring Enabled"
echo "Only VMs and containers with the tag 'mon-restart' will be automatically restarted by this service."
echo
echo "🔧 How to add the tag:"
echo " → Proxmox Web UI: Go to VM/CT → Options → Tags → Add 'mon-restart'"
echo " → CLI: qm set <vmid> -tags mon-restart"
echo " pct set <ctid> -tags mon-restart"
echo
while true; do while true; do
read -p "This script will add Monitor All to Proxmox VE. Proceed(y/n)?" yn read -p "This script will add Monitor All to Proxmox VE. Proceed (y/n)? " yn
case $yn in case $yn in
[Yy]*) break ;; [Yy]*) break ;;
[Nn]*) exit ;; [Nn]*) exit ;;
*) echo "Please answer yes or no." ;; *) echo "Please answer yes or no." ;;
esac esac
done done
echo '#!/usr/bin/env bash cat <<'EOF' >/usr/local/bin/ping-instances.sh
#!/usr/bin/env bash
# Read excluded instances from command line arguments # Read excluded instances from command line arguments
excluded_instances=("$@") excluded_instances=("$@")
echo "Excluded instances: ${excluded_instances[@]}" echo "Excluded instances: ${excluded_instances[@]}"
while true; do while true; do
for instance in $(pct list | awk '\''{if(NR>1) print $1}'\''; qm list | awk '\''{if(NR>1) print $1}'\''); do for instance in $(pct list | awk 'NR>1 {print $1}'; qm list | awk 'NR>1 {print $1}'); do
# Skip excluded instances # Skip excluded instances
if [[ " ${excluded_instances[@]} " =~ " ${instance} " ]]; then if [[ " ${excluded_instances[@]} " =~ " ${instance} " ]]; then
echo "Skipping $instance because it is excluded" echo "Skipping $instance because it is excluded"
continue continue
fi fi
# Determine the type of the instance (container or virtual machine) # Determine type and set config command
if pct status $instance >/dev/null 2>&1; then if pct status $instance >/dev/null 2>&1; then
# It is a container type="ct"
config_cmd="pct config" config_cmd="pct config"
IP=$(pct exec $instance ip a s dev eth0 | awk '\''/inet / {print $2}'\'' | cut -d/ -f1)
else else
# It is a virtual machine type="vm"
config_cmd="qm config" config_cmd="qm config"
IP=$(qm guest cmd $instance network-get-interfaces | egrep -o "([0-9]{1,3}\.){3}[0-9]{1,3}" | grep -E "192\.|10\." | head -n 1)
fi fi
# Skip instances based on onboot and templates # Skip templates and onboot-disabled
onboot=$($config_cmd $instance | grep -q "onboot: 0" || ( ! $config_cmd $instance | grep -q "onboot" ) && echo "true" || echo "false") onboot=$($config_cmd $instance | grep -q "onboot: 0" || ( ! $config_cmd $instance | grep -q "onboot" ) && echo "true" || echo "false")
template=$($config_cmd $instance | grep template | grep -q "template:" && echo "true" || echo "false") template=$($config_cmd $instance | grep -q "^template:" && echo "true" || echo "false")
if [ "$onboot" == "true" ]; then if [ "$onboot" == "true" ]; then
echo "Skipping $instance because it is set not to boot" echo "Skipping $instance because it is set not to boot"
@ -62,36 +71,49 @@ while true; do
continue continue
fi fi
# Ping the instance # Check for mon-restart tag
if ! ping -c 1 $IP >/dev/null 2>&1; then has_tag=$($config_cmd $instance | grep -q "tags:.*mon-restart" && echo "true" || echo "false")
# If the instance can not be pinged, stop and start it if [ "$has_tag" != "true" ]; then
if pct status $instance >/dev/null 2>&1; then echo "Skipping $instance because it does not have 'mon-restart' tag"
# It is a container continue
fi
# Responsiveness check and restart if needed
if [ "$type" == "vm" ]; then
# Check if guest agent responds
if qm guest cmd $instance ping >/dev/null 2>&1; then
echo "VM $instance is responsive via guest agent"
else
echo "$(date): VM $instance is not responding to agent ping, restarting..."
if qm status $instance | grep -q "status: running"; then
qm stop $instance >/dev/null 2>&1
sleep 5
fi
qm start $instance >/dev/null 2>&1
fi
else
# Container: get IP and ping
IP=$(pct exec $instance ip a s dev eth0 | awk '/inet / {print $2}' | cut -d/ -f1)
if ! ping -c 1 $IP >/dev/null 2>&1; then
echo "$(date): CT $instance is not responding, restarting..." echo "$(date): CT $instance is not responding, restarting..."
pct stop $instance >/dev/null 2>&1 pct stop $instance >/dev/null 2>&1
sleep 5 sleep 5
pct start $instance >/dev/null 2>&1 pct start $instance >/dev/null 2>&1
else else
# It is a virtual machine echo "CT $instance is responsive"
if qm status $instance | grep -q "status: running"; then
echo "$(date): VM $instance is not responding, restarting..."
qm stop $instance >/dev/null 2>&1
sleep 5
else
echo "$(date): VM $instance is not running, starting..."
fi
qm start $instance >/dev/null 2>&1
fi fi
fi fi
done done
# Wait for 5 minutes. (Edit to your needs)
echo "$(date): Pausing for 5 minutes..." echo "$(date): Pausing for 5 minutes..."
sleep 300 sleep 300
done >/var/log/ping-instances.log 2>&1' >/usr/local/bin/ping-instances.sh
done >/var/log/ping-instances.log 2>&1
EOF
touch /var/log/ping-instances.log touch /var/log/ping-instances.log
# Change file permissions to executable
chmod +x /usr/local/bin/ping-instances.sh chmod +x /usr/local/bin/ping-instances.sh
cat <<EOF >/etc/systemd/system/ping-instances.timer cat <<EOF >/etc/systemd/system/ping-instances.timer
[Unit] [Unit]
Description=Delay ping-instances.service by 5 minutes Description=Delay ping-instances.service by 5 minutes
@ -104,17 +126,17 @@ OnUnitActiveSec=300
WantedBy=timers.target WantedBy=timers.target
EOF EOF
# Create ping-instances.service
cat <<EOF >/etc/systemd/system/ping-instances.service cat <<EOF >/etc/systemd/system/ping-instances.service
[Unit] [Unit]
Description=Ping instances every 5 minutes and restarts if necessary Description=Ping instances every 5 minutes and restart if necessary
After=ping-instances.timer After=ping-instances.timer
Requires=ping-instances.timer Requires=ping-instances.timer
[Service] [Service]
Type=simple Type=simple
# To specify which CT/VM should be excluded, add the CT/VM ID at the end of the line where ExecStart=/usr/local/bin/ping-instances.sh is specified. # To exclude specific instances, pass IDs to ExecStart, e.g.:
# For example: ExecStart=/usr/local/bin/ping-instances.sh 100 102 # ExecStart=/usr/local/bin/ping-instances.sh 100 200
# Virtual machines without the QEMU guest agent installed must be excluded. # Instances must also have the 'mon-restart' tag to be monitored
ExecStart=/usr/local/bin/ping-instances.sh ExecStart=/usr/local/bin/ping-instances.sh
Restart=always Restart=always
@ -125,39 +147,33 @@ StandardError=file:/var/log/ping-instances.log
WantedBy=multi-user.target WantedBy=multi-user.target
EOF EOF
# Reload daemon, enable and start ping-instances.service
systemctl daemon-reload systemctl daemon-reload
systemctl enable -q --now ping-instances.timer systemctl enable -q --now ping-instances.timer
systemctl enable -q --now ping-instances.service systemctl enable -q --now ping-instances.service
clear clear
echo -e "\n To view Monitor All logs: cat /var/log/ping-instances.log" echo -e "\n Monitor All installed."
echo "📄 To view logs: cat /var/log/ping-instances.log"
echo "⚙️ Make sure your VMs or containers have the 'mon-restart' tag to be monitored."
} }
remove() { remove() {
systemctl disable -q --now ping-instances.timer systemctl disable -q --now ping-instances.timer
systemctl disable -q --now ping-instances.service systemctl disable -q --now ping-instances.service
rm /etc/systemd/system/ping-instances.service /etc/systemd/system/ping-instances.timer /usr/local/bin/ping-instances.sh /var/log/ping-instances.log rm -f /etc/systemd/system/ping-instances.service
echo "Removed Monitor All from Proxmox VE" rm -f /etc/systemd/system/ping-instances.timer
rm -f /usr/local/bin/ping-instances.sh
rm -f /var/log/ping-instances.log
echo "Monitor All removed from Proxmox VE"
} }
# Define options for the whiptail menu
OPTIONS=(Add "Add Monitor-All to Proxmox VE" OPTIONS=(Add "Add Monitor-All to Proxmox VE"
Remove "Remove Monitor-All from Proxmox VE") Remove "Remove Monitor-All from Proxmox VE")
# Show the whiptail menu and save the user's choice
CHOICE=$(whiptail --backtitle "Proxmox VE Helper Scripts" --title "Monitor-All for Proxmox VE" --menu "Select an option:" 10 58 2 \ CHOICE=$(whiptail --backtitle "Proxmox VE Helper Scripts" --title "Monitor-All for Proxmox VE" --menu "Select an option:" 10 58 2 \
"${OPTIONS[@]}" 3>&1 1>&2 2>&3) "${OPTIONS[@]}" 3>&1 1>&2 2>&3)
# Check the user's choice and perform the corresponding action
case $CHOICE in case $CHOICE in
"Add") "Add") add ;;
add "Remove") remove ;;
;; *) echo "Exiting..."; exit 0 ;;
"Remove")
remove
;;
*)
echo "Exiting..."
exit 0
;;
esac esac