#!/bin/bash
###############################################################################
# pve-health-check.sh - Monthly Proxmox VE Health Check
# 
# Run as root on each Proxmox node. Checks storage health, backup status,
# LVM thin pool utilization, SMART data, ZFS pool status, system resources,
# and service health. Outputs a plaintext report to stdout and optionally
# to a file.
#
# Usage:
#   ./pve-health-check.sh              # Print to stdout
#   ./pve-health-check.sh -o /path     # Also save report to /path
#   ./pve-health-check.sh -m ADDRESS   # Also email report to ADDRESS (requires mailutils)
#
# Exit codes:
#   0 = All checks passed
#   1 = Warnings detected (also returned for invalid options or when not run as root)
#   2 = Critical issues detected
#
# Changelog:
#   v1.6.1 - Tiered SMART alerting: added NOTICE/WARN/CRITICAL thresholds for
#            reallocated sectors (5/50/200), pending sectors (1/25/100), and
#            uncorrectable sectors (0/5/50). NOTICE level is informational
#            (monitor only, no action required). Previous threshold of 5
#            reallocated sectors as WARNING caused false positives on aged drives.
#            Adjusted ZFS fragmentation thresholds: SSD CRITICAL raised from
#            55% to 70%, WARNING raised from 30% to 35%. Previous SSD threshold
#            triggered false CRITICAL on pools with low utilization and no
#            performance impact.
#            Backup recency now queries PBS datastores via pvesm for snapshot
#            history when local task logs do not contain a match. Replaced
#            fragile grep-based JSON parsing with python3 json module. Guests
#            backed up exclusively to PBS no longer produce false warnings.
#   v1.6.0 - Added NFS mount health, ISO integrity, vzdump.conf audit,
#            orphaned vzdump temp files, NVIDIA DKMS status, CT on
#            non-snapshottable storage, LVM snapshot audit, PBS datastore
#            utilization, backup recency, TrueNAS snapshot count,
#            corosync state consistency, duplicate storage paths
#   v1.5.0 - Workload-aware CPU overallocation, kernel cleanup, EFI/TPM
#            thin pool exclusion, directory-on-ZFS detection, stale mounts
###############################################################################

set +e  # Do not exit on command failures; health checks must tolerate non-zero exits

# --- Configuration -----------------------------------------------------------
# Tunable alert thresholds. The check sections below compare measured values
# against these; edit them here to adjust sensitivity.
#
# Thin pool and filesystem percentages trigger when the measured integer
# percentage is >= the threshold. CRIT_FS_PCT / WARN_FS_PCT are also reused
# for ZFS pool capacity.
WARN_THIN_DATA_PCT=75
CRIT_THIN_DATA_PCT=85
WARN_THIN_META_PCT=60
CRIT_THIN_META_PCT=75
WARN_FS_PCT=80
CRIT_FS_PCT=90
# NVMe wear indicators (strict comparisons: spare < threshold, used > threshold)
WARN_NVME_SPARE_PCT=90      # Warn if available spare drops below this
WARN_NVME_USED_PCT=80       # Warn if percentage used exceeds this
# SATA/SAS sector counters: three tiers, each triggered when the raw value is
# strictly greater than the threshold. NOTICE is informational only and does
# not increment the warning counter.
NOTICE_SATA_REALLOC=5        # Notice (monitor) if reallocated sector count exceeds this
WARN_SATA_REALLOC=50         # Warn if reallocated sector count exceeds this
CRIT_SATA_REALLOC=200        # Critical if reallocated sector count exceeds this
NOTICE_SATA_PENDING=1        # Notice (monitor) if pending sector count exceeds this
WARN_SATA_PENDING=25         # Warn if pending sector count exceeds this
CRIT_SATA_PENDING=100        # Critical if pending sector count exceeds this
NOTICE_SATA_UNCORRECT=0      # Notice (monitor) if uncorrectable sector count exceeds this
WARN_SATA_UNCORRECT=5        # Warn if uncorrectable sector count exceeds this
CRIT_SATA_UNCORRECT=50       # Critical if uncorrectable sector count exceeds this
WARN_ZFS_ERRORS=0            # Any errors = warning
# Drive temperatures in degrees Celsius (triggered when temperature >= threshold)
WARN_TEMP_CELSIUS=70
CRIT_TEMP_CELSIUS=80
BACKUP_STALE_HOURS=48        # Warn if running guest has no backup within this window
# ZFS fragmentation (triggered when frag% >= threshold). The critical level
# depends on whether the pool is detected as SSD- or HDD-backed.
WARN_ZFS_FRAG_PCT=35         # ZFS fragmentation warning
CRIT_ZFS_FRAG_SSD_PCT=70     # ZFS fragmentation critical (SSD)
CRIT_ZFS_FRAG_HDD_PCT=40     # ZFS fragmentation critical (HDD)
WARN_LOAD_RATIO=2            # Warn if 5-min load average exceeds CPU count by this multiplier
CRIT_LOAD_RATIO=3            # Critical if 5-min load average exceeds CPU count by this multiplier
NFS_TIMEOUT_SECS=5           # Timeout for NFS mount responsiveness test
WARN_ISO_MIN_SIZE_MB=500     # Flag ISOs smaller than this as potentially corrupt
VZDUMP_TEMP_STALE_HOURS=24   # Flag vzdump temp dirs older than this
WARN_PBS_USAGE_PCT=80        # PBS datastore utilization warning
CRIT_PBS_USAGE_PCT=90        # PBS datastore utilization critical
WARN_TRUENAS_SNAPSHOTS=5000  # TrueNAS snapshot count warning threshold
# vCPU overallocation thresholds (evaluated per workload type):
#   VMs:  warn 2:1, crit 4:1 (full kernel, dedicated memory, higher overhead)
#   CTs:  warn 6:1, crit 10:1 (shared kernel, minimal overhead when idle)
# -----------------------------------------------------------------------------

# --- Runtime state ------------------------------------------------------------
# HOSTNAME is shown in the report banner and is also used as the PVE node name
# for pvesh queries in the backup section.
# NOTE(review): if `hostname` returns an FQDN the node name may not match the
# PVE node name - confirm on clustered hosts.
# WARNINGS / CRITICALS are incremented by warn() / crit() respectively.
# NOTE(review): EXIT_CODE is presumably derived from those counters at the end
# of the script (not visible in this part of the file) - confirm.
VERSION="1.6.1"
HOSTNAME=$(hostname)
DATE=$(date '+%Y-%m-%d %H:%M:%S')
DATE_SHORT=$(date '+%Y%m%d_%H%M%S')
OUTPUT_FILE=""
MAIL_TO=""
EXIT_CODE=0
WARNINGS=0
CRITICALS=0

# --- Argument Parsing --------------------------------------------------------
# Recognised options:
#   -o PATH      store PATH in OUTPUT_FILE
#   -m ADDRESS   store ADDRESS in MAIL_TO
#   -h           print usage and exit 0
# Anything else prints the same usage line and exits 1.
usage_line="Usage: $0 [-o output_path] [-m email_address]"
while getopts "o:m:h" opt; do
    case "$opt" in
        h)
            echo "$usage_line"
            exit 0
            ;;
        m)
            MAIL_TO="$OPTARG"
            ;;
        o)
            OUTPUT_FILE="$OPTARG"
            ;;
        *)
            echo "$usage_line"
            exit 1
            ;;
    esac
done

# --- Helpers -----------------------------------------------------------------
# Report buffers filled by log(): REPORT keeps the raw message text,
# REPORT_PLAIN keeps the same text with ANSI colour sequences removed.
REPORT=""
REPORT_PLAIN=""

# Colour is used only when stdout is attached to a terminal.
USE_COLOR=false
if [[ -t 1 ]]; then
    USE_COLOR=true
fi

# ANSI colour sequences, stored as literal backslash escapes
# (log() expands them with `echo -e` when building the plain-text copy).
C_RESET='\033[0m'
C_GREEN='\033[1;32m'
C_YELLOW='\033[1;33m'
C_RED='\033[1;31m'
C_CYAN='\033[1;36m'
C_WHITE='\033[1;37m'
C_BLUE='\033[1;34m'
C_DIM='\033[0;37m'

# log MESSAGE
# Appends MESSAGE plus a newline to REPORT, and appends a copy with escape
# sequences expanded and ANSI colour codes stripped to REPORT_PLAIN.
log() {
    local message="$1"
    local stripped
    # Expand the backslash escapes first so sed sees real ESC bytes to remove
    stripped="$(echo -e "$message" | sed 's/\x1b\[[0-9;]*m//g')"
    REPORT+="${message}"$'\n'
    REPORT_PLAIN+="${stripped}"$'\n'
}

# header TITLE
# Logs a blank line followed by TITLE framed between two '=' rules.
# With USE_COLOR the rules are blue and the title is white.
header() {
    local rule="==============================================================================="
    local title="$1"
    if $USE_COLOR; then
        rule="${C_BLUE}${rule}${C_RESET}"
        title="${C_WHITE}${title}${C_RESET}"
    fi
    log ""
    log "$rule"
    log "  ${title}"
    log "$rule"
}

# subheader TITLE
# Logs a blank line followed by "--- TITLE ---" (cyan when USE_COLOR is set).
subheader() {
    local line="--- $1 ---"
    if $USE_COLOR; then
        line="${C_CYAN}${line}${C_RESET}"
    fi
    log ""
    log "$line"
}

# ok MESSAGE
# Logs MESSAGE with an [OK] tag (green when USE_COLOR is set).
ok() {
    local tag="[OK]"
    if $USE_COLOR; then
        tag="${C_GREEN}${tag}${C_RESET}"
    fi
    log "  ${tag}       $1"
}

# warn MESSAGE
# Logs MESSAGE with a [WARNING] tag (yellow when USE_COLOR is set) and
# increments the global WARNINGS counter.
warn() {
    local tag="[WARNING]"
    if $USE_COLOR; then
        tag="${C_YELLOW}${tag}${C_RESET}"
    fi
    log "  ${tag}  $1"
    WARNINGS=$(( WARNINGS + 1 ))
}

# crit MESSAGE
# Logs MESSAGE with a [CRITICAL] tag (red when USE_COLOR is set) and
# increments the global CRITICALS counter.
crit() {
    local tag="[CRITICAL]"
    if $USE_COLOR; then
        tag="${C_RED}${tag}${C_RESET}"
    fi
    log "  ${tag} $1"
    CRITICALS=$(( CRITICALS + 1 ))
}

# info MESSAGE
# Logs MESSAGE with an [INFO] tag (dim when USE_COLOR is set).
# Does not touch the warning/critical counters.
info() {
    local tag="[INFO]"
    if $USE_COLOR; then
        tag="${C_DIM}${tag}${C_RESET}"
    fi
    log "  ${tag}     $1"
}

# notice MESSAGE
# Logs MESSAGE with a [NOTICE] tag (cyan when USE_COLOR is set).
# Does not touch the warning/critical counters.
notice() {
    local tag="[NOTICE]"
    if $USE_COLOR; then
        tag="${C_CYAN}${tag}${C_RESET}"
    fi
    log "  ${tag}   $1"
}

# --- Pre-flight --------------------------------------------------------------
# Refuse to run without root privileges: report on stderr and exit 1.
if (( EUID != 0 )); then
    echo "ERROR: This script must be run as root." >&2
    exit 1
fi

# Report banner: host name, timestamp, script version, PVE version and kernel.
# pveversion output falls back to 'N/A' when the command fails or is missing.
banner_pve_version=$(pveversion 2>/dev/null || echo 'N/A')
banner_kernel=$(uname -r)

header "PROXMOX VE HEALTH CHECK REPORT"
log "  Host:    ${HOSTNAME}"
log "  Date:    ${DATE}"
log "  Script:  v${VERSION}"
log "  PVE:     ${banner_pve_version}"
log "  Kernel:  ${banner_kernel}"

###############################################################################
# 1. FILESYSTEM UTILIZATION
###############################################################################
header "1. FILESYSTEM UTILIZATION"

# check_fs_usage_lines
# Reads `df -hT`-style data rows (no header) from stdin, one filesystem per
# line with the columns
#     source  fstype  size  used  avail  use%  mountpoint
# and reports each one through ok/warn/crit according to WARN_FS_PCT and
# CRIT_FS_PCT (triggered when use% >= threshold).
#
# The mount point is the LAST field of `read`, so it receives the remainder of
# the line: mount paths that contain spaces are reported in full instead of
# being cut at the first blank. Rows whose use% is not an integer are skipped.
check_fs_usage_lines() {
    local fs size used avail pct mount summary
    while read -r fs _ size used avail pct mount; do
        pct=${pct//%/}

        # Skip lines where percentage is not a valid integer
        [[ "$pct" =~ ^[0-9]+$ ]] || continue

        summary="$mount ($fs): ${pct}% used - ${used}/${size} (${avail} free)"
        if (( pct >= CRIT_FS_PCT )); then
            crit "$summary"
        elif (( pct >= WARN_FS_PCT )); then
            warn "$summary"
        else
            ok "$summary"
        fi
    done
}

# Pseudo filesystems are excluded; `tail` drops the df header row.
check_fs_usage_lines < <(df -hT -x tmpfs -x devtmpfs -x overlay -x squashfs 2>/dev/null | tail -n +2)

###############################################################################
# 2. LVM THIN POOL STATUS
###############################################################################
header "2. LVM THIN POOL STATUS"

if command -v lvs &>/dev/null; then
    # thin_pool_autoextend_threshold is a global lvm.conf setting, not a
    # per-pool property, so read it once instead of once per pool.
    threshold=$(grep -E "^\s*thin_pool_autoextend_threshold" /etc/lvm/lvm.conf 2>/dev/null | grep -oP '\d+' | head -1)

    thin_found=false
    # Select thin POOLS only. The selection regex must be anchored: the first
    # lv_attr character is 't' for a thin pool, but thin VOLUMES also carry a
    # 't' (target type) later in the attribute string, so an unanchored
    # 'lv_attr=~t' lists every thin volume as if it were a pool.
    while read -r lv vg size data_pct meta_pct; do
        thin_found=true

        # Integer part of the percentages for arithmetic comparison
        data_int=${data_pct%%[.,]*}
        meta_int=${meta_pct%%[.,]*}

        subheader "Thin Pool: ${vg}/${lv} (${size})"

        # Determine if this is a small EFI/TPM volume (128 MiB or less) -- skip threshold alerts
        size_mib=$(lvs --noheadings --nosuffix --units m -o lv_size "${vg}/${lv}" 2>/dev/null | awk '{printf "%d", $1}')
        is_tiny_volume=false
        if [[ "$size_mib" =~ ^[0-9]+$ ]] && (( size_mib <= 128 )); then
            is_tiny_volume=true
        fi

        # Data utilization
        if $is_tiny_volume; then
            ok "Data: ${data_pct}% used (EFI/TPM volume - threshold check skipped)"
        elif ! [[ "$data_int" =~ ^[0-9]+$ ]]; then
            # lvs prints an empty column when it cannot report usage;
            # do not feed that into arithmetic or report it as OK.
            warn "Data: utilization unavailable (lvs reported '${data_pct}')"
        elif (( data_int >= CRIT_THIN_DATA_PCT )); then
            crit "Data: ${data_pct}% used (threshold: ${CRIT_THIN_DATA_PCT}%)"
        elif (( data_int >= WARN_THIN_DATA_PCT )); then
            warn "Data: ${data_pct}% used (threshold: ${WARN_THIN_DATA_PCT}%)"
        else
            ok "Data: ${data_pct}% used"
        fi

        # Metadata utilization
        if ! [[ "$meta_int" =~ ^[0-9]+$ ]]; then
            warn "Metadata: utilization unavailable (lvs reported '${meta_pct}')"
        elif (( meta_int >= CRIT_THIN_META_PCT )); then
            crit "Metadata: ${meta_pct}% used (threshold: ${CRIT_THIN_META_PCT}%)"
        elif (( meta_int >= WARN_THIN_META_PCT )); then
            warn "Metadata: ${meta_pct}% used (threshold: ${WARN_THIN_META_PCT}%)"
        else
            ok "Metadata: ${meta_pct}% used"
        fi

        # Check overprovisioning: sum of the virtual sizes of all thin volumes
        # in this pool versus the pool's own size (both in GiB, integer part).
        alloc_total=$(lvs --noheadings --nosuffix --units g -o lv_size -S "pool_lv=${lv},vg_name=${vg}" 2>/dev/null | awk '{sum += $1} END {printf "%.1f", sum}')
        pool_size=$(lvs --noheadings --nosuffix --units g -o lv_size "${vg}/${lv}" 2>/dev/null | awk '{printf "%.1f", $1}')
        if [[ -n "$alloc_total" && -n "$pool_size" ]]; then
            alloc_int=${alloc_total%%[.,]*}
            pool_int=${pool_size%%[.,]*}
            if (( alloc_int > pool_int )); then
                warn "Overprovisioned: ${alloc_total}G allocated across thin volumes vs ${pool_size}G pool size"
            else
                ok "Not overprovisioned: ${alloc_total}G allocated within ${pool_size}G pool"
            fi
        fi

        # Check autoextend (reported per pool, value read once above)
        if [[ -z "$threshold" || "$threshold" == "100" ]]; then
            warn "Thin pool autoextend protection is NOT enabled (threshold=100 or unset)"
            info "Set thin_pool_autoextend_threshold < 100 in /etc/lvm/lvm.conf"
        else
            ok "Thin pool autoextend threshold: ${threshold}%"
        fi

    done < <(lvs --noheadings -o lv_name,vg_name,lv_size,data_percent,metadata_percent -S 'lv_attr=~^t' 2>/dev/null | awk 'NF')

    if ! $thin_found; then
        info "No LVM thin pools found on this node."
    fi

    # LVM Snapshot Audit: thin volumes (lv_attr contains 'V') that have an origin
    subheader "LVM Snapshot Audit"
    lvm_snap_found=false
    while read -r snap_name snap_vg snap_size snap_origin; do
        [[ -z "$snap_name" ]] && continue
        lvm_snap_found=true
        warn "LVM snapshot '${snap_vg}/${snap_name}' (${snap_size}) of origin '${snap_origin}' still exists"
        info "Old snapshots consume thin pool space. Remove if no longer needed: lvremove ${snap_vg}/${snap_name}"
    done < <(lvs --noheadings -o lv_name,vg_name,lv_size,origin -S 'lv_attr=~V' 2>/dev/null | awk '$4 != "" {print}')

    if ! $lvm_snap_found; then
        ok "No stale LVM snapshots found"
    fi
else
    info "LVM not available on this node."
fi

###############################################################################
# 3. ZFS POOL STATUS
###############################################################################
header "3. ZFS POOL STATUS"

if command -v zpool &>/dev/null; then
    zfs_found=false
    while IFS= read -r pool; do
        zfs_found=true
        subheader "Pool: $pool"

        state=$(zpool get -H -o value health "$pool" 2>/dev/null)
        if [[ "$state" == "ONLINE" ]]; then
            ok "State: $state"
        elif [[ "$state" == "DEGRADED" ]]; then
            crit "State: $state - pool is running with reduced redundancy"
        else
            crit "State: $state"
        fi

        # Check for errors: sum the last three columns (READ WRITE CKSUM) of
        # every row in the config table of `zpool status`; non-numeric cells
        # are ignored.
        read_err=0; write_err=0; cksum_err=0
        while IFS= read -r devline; do
            r=$(echo "$devline" | awk '{print $(NF-2)}')
            w=$(echo "$devline" | awk '{print $(NF-1)}')
            c=$(echo "$devline" | awk '{print $NF}')
            [[ "$r" =~ ^[0-9]+$ ]] && read_err=$((read_err + r))
            [[ "$w" =~ ^[0-9]+$ ]] && write_err=$((write_err + w))
            [[ "$c" =~ ^[0-9]+$ ]] && cksum_err=$((cksum_err + c))
        done < <(zpool status "$pool" 2>/dev/null | awk '/NAME.*STATE.*READ/,0' | tail -n +2 | grep -v "^$" | grep -v "errors:")

        if (( read_err + write_err + cksum_err > WARN_ZFS_ERRORS )); then
            warn "Errors detected - Read: $read_err, Write: $write_err, Checksum: $cksum_err"
        else
            ok "No errors - Read: $read_err, Write: $write_err, Checksum: $cksum_err"
        fi

        # Check for permanent data errors
        perm_errors=$(zpool status -v "$pool" 2>/dev/null | grep -c "Permanent errors" || true)
        if (( perm_errors > 0 )); then
            crit "Permanent data errors detected. Run: zpool status -v $pool"
        fi

        # Last scrub
        last_scrub=$(zpool status "$pool" 2>/dev/null | grep "scan:" | head -1)
        if [[ -n "$last_scrub" ]]; then
            info "Last scrub: $last_scrub"
        else
            warn "No scrub history found. Schedule regular scrubs."
        fi

        # Capacity (only evaluated when zpool reports an integer percentage)
        cap=$(zpool get -H -o value capacity "$pool" 2>/dev/null | tr -d '%')
        size=$(zpool get -H -o value size "$pool" 2>/dev/null)
        free=$(zpool get -H -o value free "$pool" 2>/dev/null)
        if [[ "$cap" =~ ^[0-9]+$ ]]; then
            if (( cap >= CRIT_FS_PCT )); then
                crit "Capacity: ${cap}% used (${free} free of ${size})"
            elif (( cap >= WARN_FS_PCT )); then
                warn "Capacity: ${cap}% used (${free} free of ${size})"
            else
                ok "Capacity: ${cap}% used (${free} free of ${size})"
            fi
        fi

        # Media type detection. Done for EVERY pool, before the checks that
        # consume it: both the fragmentation threshold and the autotrim check
        # read pool_is_ssd, so it must never carry over from a previous pool
        # or be left unset when fragmentation is unavailable.
        # A pool counts as SSD-backed if any ONLINE member is non-rotational.
        pool_is_ssd=false
        while IFS= read -r pdev; do
            # Resolve the vdev name shown by zpool status to a block device
            dev_path=""
            if [[ -e "/dev/disk/by-id/$pdev" ]]; then
                dev_path=$(readlink -f "/dev/disk/by-id/$pdev" 2>/dev/null)
            elif [[ -b "/dev/$pdev" ]]; then
                dev_path="/dev/$pdev"
            fi

            # queue/rotational exists only for whole disks under /sys/block,
            # so map a partition (e.g. sda3) to its parent disk (sda) first.
            # PKNAME is empty for a whole disk; fall back to NAME then.
            base_dev=""
            if [[ -n "$dev_path" ]]; then
                base_dev=$(lsblk -dno PKNAME "$dev_path" 2>/dev/null | head -1)
                if [[ -z "$base_dev" ]]; then
                    base_dev=$(lsblk -dno NAME "$dev_path" 2>/dev/null | head -1)
                fi
            fi
            # Last resort: derive a name from the vdev label itself
            if [[ -z "$base_dev" ]]; then
                base_dev=$(echo "$pdev" | sed 's/-part[0-9]*$//' | sed 's|.*/||')
            fi

            if [[ -n "$base_dev" && -f "/sys/block/${base_dev}/queue/rotational" ]]; then
                rot=$(cat "/sys/block/${base_dev}/queue/rotational" 2>/dev/null)
                if [[ "$rot" == "0" ]]; then
                    pool_is_ssd=true
                    break
                fi
            fi
        done < <(zpool status "$pool" 2>/dev/null | awk '/ONLINE/{print $1}' | grep -vE "^${pool}$|^raidz|^mirror|^NAME|^state|^config|^errors|^scan|^spares|^logs|^cache|^special")

        # Fragmentation check (skipped when zpool reports '-' or nothing)
        frag=$(zpool list -H -o frag "$pool" 2>/dev/null | tr -d '%')
        if [[ "$frag" =~ ^[0-9]+$ ]]; then
            if $pool_is_ssd; then
                crit_frag=$CRIT_ZFS_FRAG_SSD_PCT
                drive_type_label="SSD"
            else
                crit_frag=$CRIT_ZFS_FRAG_HDD_PCT
                drive_type_label="HDD"
            fi

            if (( frag >= crit_frag )); then
                crit "Fragmentation: ${frag}% ($drive_type_label pool, threshold: ${crit_frag}%)"
            elif (( frag >= WARN_ZFS_FRAG_PCT )); then
                warn "Fragmentation: ${frag}% ($drive_type_label pool)"
            else
                ok "Fragmentation: ${frag}%"
            fi
        fi

        # Autotrim check (SSD pools only)
        autotrim=$(zpool get -H -o value autotrim "$pool" 2>/dev/null)
        if [[ -n "$autotrim" ]] && $pool_is_ssd; then
            if [[ "$autotrim" != "on" ]]; then
                warn "Autotrim is OFF on SSD-backed pool '$pool'. Enable with: zpool set autotrim=on $pool"
            else
                ok "Autotrim: on (SSD pool)"
            fi
        fi

    done < <(zpool list -H -o name 2>/dev/null)

    # ZFS version alignment (placed after per-pool checks): compare the first
    # dotted number of `zfs --version` with that of the loaded kernel module.
    subheader "ZFS Version Alignment"
    zfs_user_ver=$(zfs --version 2>/dev/null | head -1 | grep -oP '[\d.]+' | head -1)
    zfs_mod_ver=$(grep -oP '[\d.]+' /sys/module/zfs/version 2>/dev/null | head -1)
    if [[ -n "$zfs_user_ver" && -n "$zfs_mod_ver" ]]; then
        if [[ "$zfs_user_ver" == "$zfs_mod_ver" ]]; then
            ok "ZFS userspace ($zfs_user_ver) matches kernel module ($zfs_mod_ver)"
        else
            crit "ZFS version mismatch: userspace=$zfs_user_ver, kernel module=$zfs_mod_ver"
            info "This can cause scrub failures and data operation errors. Align packages or reboot."
            zfs_upgradable=$(apt list --upgradable 2>/dev/null | grep -c "zfs" || true)
            if (( zfs_upgradable > 0 )); then
                info "ZFS packages have updates available. Run: apt install libzfs7linux zfs-initramfs zfs-zed zfsutils-linux"
            fi
        fi
    fi

    # ZFS package hold status
    zfs_held=$(apt-mark showhold 2>/dev/null | grep -c "zfs" || true)
    if (( zfs_held > 0 )); then
        info "ZFS packages are held. Held packages: $(apt-mark showhold 2>/dev/null | grep zfs | tr '\n' ' ')"
    fi

    if ! $zfs_found; then
        info "No ZFS pools found on this node."
    fi
else
    info "ZFS not available on this node."
fi

###############################################################################
# 4. SMART DRIVE HEALTH
###############################################################################
header "4. DRIVE SMART HEALTH"

# Each drive is queried exactly three times (smartctl -i, -H, -A) and the
# captured text is reused for every field, instead of re-running smartctl for
# each attribute. Numeric attributes are validated before being compared so an
# unparseable value is reported as such rather than silently shown as OK.
if command -v smartctl &>/dev/null; then

    # NVMe drives
    while IFS= read -r dev; do
        [[ -z "$dev" ]] && continue
        subheader "NVMe: $dev"

        smart_info=$(smartctl -i "$dev" 2>/dev/null)
        smart_health=$(smartctl -H "$dev" 2>/dev/null)
        smart_attrs=$(smartctl -A "$dev" 2>/dev/null)

        model=$(grep "Model Number" <<< "$smart_info" | awk -F: '{print $2}' | xargs)
        serial=$(grep "Serial Number" <<< "$smart_info" | awk -F: '{print $2}' | xargs)
        info "Model: ${model:-Unknown} | Serial: ${serial:-Unknown}"

        health=$(grep "SMART overall" <<< "$smart_health" | awk -F: '{print $2}' | xargs)
        if [[ "$health" == "PASSED" ]]; then
            ok "SMART Health: PASSED"
        else
            crit "SMART Health: ${health:-UNKNOWN}"
        fi

        spare=$(grep "Available Spare:" <<< "$smart_attrs" | awk '{print $3}' | tr -d '%')
        if [[ "$spare" =~ ^[0-9]+$ ]]; then
            if (( spare < WARN_NVME_SPARE_PCT )); then
                warn "Available Spare: ${spare}% (below ${WARN_NVME_SPARE_PCT}%)"
            else
                ok "Available Spare: ${spare}%"
            fi
        elif [[ -n "$spare" ]]; then
            info "Available Spare: unrecognized value '${spare}' - check manually"
        fi

        used=$(grep "Percentage Used:" <<< "$smart_attrs" | awk '{print $3}' | tr -d '%')
        if [[ "$used" =~ ^[0-9]+$ ]]; then
            if (( used > WARN_NVME_USED_PCT )); then
                warn "Percentage Used: ${used}% (above ${WARN_NVME_USED_PCT}%)"
            else
                ok "Percentage Used: ${used}%"
            fi
        elif [[ -n "$used" ]]; then
            info "Percentage Used: unrecognized value '${used}' - check manually"
        fi

        media_err=$(grep "Media and Data Integrity" <<< "$smart_attrs" | awk '{print $NF}')
        if [[ -n "$media_err" && "$media_err" != "0" ]]; then
            crit "Media and Data Integrity Errors: $media_err"
        elif [[ -n "$media_err" ]]; then
            ok "Media and Data Integrity Errors: 0"
        fi

        unsafe=$(grep "Unsafe Shutdowns:" <<< "$smart_attrs" | awk '{print $NF}')
        if [[ -n "$unsafe" ]]; then
            info "Unsafe Shutdowns: $unsafe"
        fi

        poh=$(grep "Power On Hours:" <<< "$smart_attrs" | awk '{print $NF}' | tr -d ',')
        if [[ -n "$poh" ]]; then
            info "Power On Hours: $poh"
        fi

        # First "Temperature:" line only; additional sensor lines are ignored
        temp=$(grep "Temperature:" <<< "$smart_attrs" | head -1 | awk '{print $2}')
        if [[ "$temp" =~ ^[0-9]+$ ]]; then
            if (( temp >= CRIT_TEMP_CELSIUS )); then
                crit "Temperature: ${temp}C (above ${CRIT_TEMP_CELSIUS}C)"
            elif (( temp >= WARN_TEMP_CELSIUS )); then
                warn "Temperature: ${temp}C (above ${WARN_TEMP_CELSIUS}C)"
            else
                ok "Temperature: ${temp}C"
            fi
        elif [[ -n "$temp" ]]; then
            info "Temperature: unrecognized value '${temp}' - check manually"
        fi

    done < <(lsblk -dno NAME,TYPE 2>/dev/null | awk '$2=="disk" && $1~/nvme/' | awk '{print "/dev/"$1}')

    # SATA/SAS drives
    while IFS= read -r dev; do
        [[ -z "$dev" ]] && continue
        subheader "SATA: $dev"

        smart_info=$(smartctl -i "$dev" 2>/dev/null)
        smart_health=$(smartctl -H "$dev" 2>/dev/null)
        smart_attrs=$(smartctl -A "$dev" 2>/dev/null)

        model=$(grep -E "Device Model|Product:" <<< "$smart_info" | awk -F: '{print $2}' | xargs)
        serial=$(grep "Serial Number" <<< "$smart_info" | awk -F: '{print $2}' | xargs)
        info "Model: ${model:-Unknown} | Serial: ${serial:-Unknown}"

        health=$(grep -E "SMART overall|SMART Health" <<< "$smart_health" | awk -F: '{print $2}' | xargs)
        if [[ "$health" == "PASSED" || "$health" == "OK" ]]; then
            ok "SMART Health: $health"
        elif [[ -n "$health" ]]; then
            crit "SMART Health: $health"
        else
            warn "SMART Health: Unable to determine"
        fi

        # Sector counters: tiered NOTICE / WARN / CRITICAL, strict '>' compare
        realloc=$(grep "Reallocated_Sector" <<< "$smart_attrs" | awk '{print $NF}')
        if [[ "$realloc" =~ ^[0-9]+$ ]]; then
            if (( realloc > CRIT_SATA_REALLOC )); then
                crit "Reallocated Sectors: $realloc (above ${CRIT_SATA_REALLOC}) - drive replacement recommended"
            elif (( realloc > WARN_SATA_REALLOC )); then
                warn "Reallocated Sectors: $realloc (above ${WARN_SATA_REALLOC}) - drive degrading, plan replacement"
            elif (( realloc > NOTICE_SATA_REALLOC )); then
                notice "Reallocated Sectors: $realloc (above ${NOTICE_SATA_REALLOC}) - monitor for growth, not actionable yet"
            else
                ok "Reallocated Sectors: $realloc"
            fi
        elif [[ -n "$realloc" ]]; then
            info "Reallocated Sectors: unrecognized value '${realloc}' - check manually"
        fi

        pending=$(grep "Current_Pending_Sector" <<< "$smart_attrs" | awk '{print $NF}')
        if [[ "$pending" =~ ^[0-9]+$ ]]; then
            if (( pending > CRIT_SATA_PENDING )); then
                crit "Current Pending Sectors: $pending (above ${CRIT_SATA_PENDING}) - drive replacement recommended"
            elif (( pending > WARN_SATA_PENDING )); then
                warn "Current Pending Sectors: $pending (above ${WARN_SATA_PENDING}) - drive degrading, plan replacement"
            elif (( pending > NOTICE_SATA_PENDING )); then
                notice "Current Pending Sectors: $pending - monitor for growth; may resolve after next write pass or scrub"
            elif (( pending > 0 )); then
                notice "Current Pending Sectors: $pending - likely transient, monitor"
            else
                ok "Current Pending Sectors: 0"
            fi
        elif [[ -n "$pending" ]]; then
            info "Current Pending Sectors: unrecognized value '${pending}' - check manually"
        fi

        uncorrect=$(grep "Offline_Uncorrectable" <<< "$smart_attrs" | awk '{print $NF}')
        if [[ "$uncorrect" =~ ^[0-9]+$ ]]; then
            if (( uncorrect > CRIT_SATA_UNCORRECT )); then
                crit "Offline Uncorrectable: $uncorrect (above ${CRIT_SATA_UNCORRECT}) - data loss risk, replace drive"
            elif (( uncorrect > WARN_SATA_UNCORRECT )); then
                warn "Offline Uncorrectable: $uncorrect (above ${WARN_SATA_UNCORRECT}) - drive degrading, plan replacement"
            elif (( uncorrect > NOTICE_SATA_UNCORRECT )); then
                notice "Offline Uncorrectable: $uncorrect - monitor closely, early sign of surface degradation"
            else
                ok "Offline Uncorrectable: 0"
            fi
        elif [[ -n "$uncorrect" ]]; then
            info "Offline Uncorrectable: unrecognized value '${uncorrect}' - check manually"
        fi

        # Column 10 is the first token of the raw value of Temperature_Celsius
        temp=$(grep "Temperature_Celsius" <<< "$smart_attrs" | awk '{print $10}')
        if [[ "$temp" =~ ^[0-9]+$ ]]; then
            if (( temp >= CRIT_TEMP_CELSIUS )); then
                crit "Temperature: ${temp}C"
            elif (( temp >= WARN_TEMP_CELSIUS )); then
                warn "Temperature: ${temp}C"
            else
                ok "Temperature: ${temp}C"
            fi
        elif [[ -n "$temp" ]]; then
            info "Temperature: unrecognized value '${temp}' - check manually"
        fi

        poh=$(grep "Power_On_Hours" <<< "$smart_attrs" | awk '{print $NF}')
        if [[ -n "$poh" ]]; then
            info "Power On Hours: $poh"
        fi

    done < <(lsblk -dno NAME,TYPE 2>/dev/null | awk '$2=="disk" && $1~/^sd/' | awk '{print "/dev/"$1}')

else
    crit "smartctl not found. Drive health cannot be checked."
    info "Install with: apt install smartmontools"
fi

###############################################################################
# 5. BACKUP STATUS
###############################################################################
header "5. BACKUP STATUS"

if command -v pvesh &>/dev/null; then
    node="$HOSTNAME"

    # Check vzdump log for recent failures
    if [[ -d /var/log/vzdump/ ]]; then
        recent_failures=$(find /var/log/vzdump/ -name "*.log" -mtime -1 -exec grep -l "TASK ERROR\|Backup of VM.*failed" {} \; 2>/dev/null | wc -l)
        if (( recent_failures > 0 )); then
            warn "$recent_failures backup job(s) had failures in the last 24 hours"
            info "Review logs in /var/log/vzdump/"
        else
            ok "No backup failures detected in the last 24 hours"
        fi
    fi

    # Check last backup task status from task log
    recent_backup_tasks=$(pvesh get /nodes/"$node"/tasks --typefilter vzdump --limit 5 2>/dev/null || true)
    if [[ -n "$recent_backup_tasks" ]]; then
        error_count=$(echo "$recent_backup_tasks" | grep -c '"status"\s*:\s*".*[Ee]rror' || true)
        if (( error_count > 0 )); then
            warn "$error_count of last 5 backup tasks had errors"
        else
            ok "Last 5 backup tasks completed without errors"
        fi
    fi

    # Check for PBS backup group lock errors in recent logs
    subheader "PBS Backup Lock Status"
    lock_errors=0
    if [[ -d /var/log/vzdump/ ]]; then
        while IFS= read -r logfile; do
            [[ -z "$logfile" ]] && continue
            locked_group=$(grep -oP "creating locked backup group.*BackupGroup.*id: \"?\K[0-9]+" "$logfile" 2>/dev/null | head -1)
            if [[ -n "$locked_group" ]]; then
                warn "PBS lock error for VM/CT $locked_group detected in $(basename "$logfile")"
                info "A stale lock on the PBS datastore may be preventing backups for this guest."
                info "Check PBS server for orphaned tasks or restart proxmox-backup-proxy on the PBS host."
                lock_errors=$((lock_errors + 1))
            fi
        done < <(find /var/log/vzdump/ -name "*.log" -mtime -3 2>/dev/null)
    fi
    if (( lock_errors == 0 )); then
        ok "No PBS backup group lock errors in recent logs"
    fi

    # Backup recency check for running guests
    subheader "Backup Recency"
    backup_recency_issues=0
    cutoff_epoch=$(( $(date +%s) - (BACKUP_STALE_HOURS * 3600) ))

    # Build list of running guest IDs
    declare -A running_guest_names
    if command -v qm &>/dev/null; then
        while IFS= read -r vmline; do
            [[ -z "$vmline" ]] && continue
            vmid=$(echo "$vmline" | awk '{print $1}')
            vmstatus=$(echo "$vmline" | awk '{print $3}')
            vmname=$(echo "$vmline" | awk '{print $2}')
            if [[ "$vmstatus" == "running" ]]; then
                running_guest_names["$vmid"]="$vmname"
            fi
        done < <(qm list 2>/dev/null | tail -n +2 | awk '{print $1, $2, $3}')
    fi
    if command -v pct &>/dev/null; then
        while IFS= read -r ctline; do
            [[ -z "$ctline" ]] && continue
            ctid=$(echo "$ctline" | awk '{print $1}')
            ctstatus=$(echo "$ctline" | awk '{print $2}')
            ctname=$(echo "$ctline" | awk '{print $3}')
            if [[ "$ctstatus" == "running" ]]; then
                running_guest_names["$ctid"]="$ctname"
            fi
        done < <(pct list 2>/dev/null | tail -n +2 | awk '{print $1, $2, $3}')
    fi

    # Build list of PBS storage IDs for fallback snapshot check
    declare -a pbs_stores=()
    while IFS= read -r sline; do
        s_id=$(echo "$sline" | awk '{print $1}')
        s_type=$(echo "$sline" | awk '{print $2}')
        s_status=$(echo "$sline" | awk '{print $3}')
        if [[ "$s_type" == "pbs" && "$s_status" == "active" ]]; then
            pbs_stores+=("$s_id")
        fi
    done < <(pvesm status 2>/dev/null | tail -n +2)

    # Print the epoch (UTC seconds) of the newest PBS snapshot in a
    # `pvesm list` listing read from stdin; print nothing when the listing
    # contains no snapshot timestamp.
    #
    # The listing columns are "Volid Format Type Size VMID", so no column holds
    # the backup time. PBS volume IDs carry it as an RFC 3339 UTC timestamp
    # (e.g. store:backup/vm/100/2024-01-15T02:00:05Z), which is parsed here.
    # The header row and any line without such a timestamp are ignored.
    _pbs_latest_snapshot_epoch() {
        local ts_re='[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}Z'
        local volid newest_ts=""
        while read -r volid _; do
            if [[ "$volid" =~ $ts_re ]]; then
                # Fixed-width timestamps order chronologically as plain strings.
                if [[ "${BASH_REMATCH[0]}" > "$newest_ts" ]]; then
                    newest_ts="${BASH_REMATCH[0]}"
                fi
            fi
        done
        if [[ -n "$newest_ts" ]]; then
            date -u -d "$newest_ts" +%s 2>/dev/null || true
        fi
        return 0
    }

    # The node's vzdump task history does not depend on the guest, so it is
    # fetched once here rather than once per guest inside the loop.
    task_json=$(pvesh get /nodes/"$node"/tasks --typefilter vzdump --limit 100 --output-format json 2>/dev/null || true)

    # Check each running guest for a recent backup
    for guestid in "${!running_guest_names[@]}"; do
        guest_name="${running_guest_names[$guestid]}"
        backup_found=false
        newest_pbs_epoch=""     # newest snapshot time seen on any PBS store
        newest_pbs_store=""     # store holding that snapshot
        undated_pbs_store=""    # first store with snapshots but no parseable time

        # Method 1: Check local node task log (vzdump tasks)
        if [[ -n "$task_json" ]]; then
            # Most recent start time of a vzdump task for this guest with
            # status OK. The guest ID is handed over as an argument rather
            # than spliced into the Python source.
            last_ok_epoch=$(python3 -c '
import json, sys
try:
    tasks = json.load(sys.stdin)
    if isinstance(tasks, dict) and "data" in tasks:
        tasks = tasks["data"]
    matches = [t.get("starttime", 0) for t in tasks
               if str(t.get("id", "")) == sys.argv[1]
               and t.get("status", "") == "OK"]
    print(max(matches) if matches else "")
except Exception:
    print("")
' "$guestid" <<< "$task_json" 2>/dev/null || true)

            if [[ "$last_ok_epoch" =~ ^[0-9]+$ ]] && (( last_ok_epoch >= cutoff_epoch )); then
                backup_found=true
            fi
        fi

        # Method 2: If not found locally, query PBS datastores for snapshots.
        # The newest snapshot time is remembered so the report below can state
        # the backup age without listing the stores a second time.
        if ! $backup_found && (( ${#pbs_stores[@]} > 0 )); then
            for pbs_store in "${pbs_stores[@]}"; do
                snap_list=$(pvesm list "$pbs_store" --vmid "$guestid" 2>/dev/null || true)
                [[ -n "$snap_list" ]] || continue

                last_snap_epoch=$(_pbs_latest_snapshot_epoch <<< "$snap_list")
                if [[ "$last_snap_epoch" =~ ^[0-9]+$ ]]; then
                    if [[ -z "$newest_pbs_epoch" ]] || (( last_snap_epoch > newest_pbs_epoch )); then
                        newest_pbs_epoch=$last_snap_epoch
                        newest_pbs_store=$pbs_store
                    fi
                    if (( last_snap_epoch >= cutoff_epoch )); then
                        backup_found=true
                        break
                    fi
                elif [[ -z "$undated_pbs_store" && -n "$(tail -n +2 <<< "$snap_list")" ]]; then
                    # Rows exist below the header but none carried a timestamp.
                    undated_pbs_store=$pbs_store
                fi
            done
        fi

        # Method 3: Fallback to vzdump log files
        # NOTE(review): confirm that the log file names under /var/log/vzdump/
        # actually match this "*-<id>-*.log" glob on the target PVE version.
        if ! $backup_found; then
            last_log=$(find /var/log/vzdump/ -name "*-${guestid}-*.log" -newer /etc/hostname 2>/dev/null | sort | tail -1)
            if [[ -n "$last_log" ]]; then
                last_log_epoch=$(stat -c %Y "$last_log" 2>/dev/null || echo 0)
                if (( last_log_epoch >= cutoff_epoch )); then
                    backup_found=true
                fi
            fi
        fi

        # Report result: distinguish a stale backup, a backup of unknown age,
        # and no backup at all. Each case counts as one recency issue.
        if ! $backup_found; then
            if [[ -n "$newest_pbs_epoch" ]]; then
                hours_ago=$(( ( $(date +%s) - newest_pbs_epoch ) / 3600 ))
                warn "VM/CT $guestid ($guest_name): last backup on PBS '$newest_pbs_store' is ${hours_ago} hours old (threshold: ${BACKUP_STALE_HOURS}h)"
            elif [[ -n "$undated_pbs_store" ]]; then
                warn "VM/CT $guestid ($guest_name): backup exists on PBS '$undated_pbs_store' but age could not be determined"
            else
                warn "VM/CT $guestid ($guest_name): no backups found on this node or PBS"
            fi
            backup_recency_issues=$((backup_recency_issues + 1))
        fi
    done

    # Closing line for the recency check. Nothing is printed when guests exist
    # and at least one issue was already reported above.
    guest_total=${#running_guest_names[@]}
    if (( guest_total == 0 )); then
        info "No running guests to check backup recency for"
    elif (( backup_recency_issues == 0 )); then
        ok "All ${guest_total} running guests have backups within the last ${BACKUP_STALE_HOURS} hours"
    fi

else
    info "pvesh not available. Skipping backup status checks."
fi

###############################################################################
# 6. SYSTEM RESOURCES
###############################################################################
header "6. SYSTEM RESOURCES"

subheader "Memory"
# Read RAM and swap totals/usage (MiB) from a single `free -m` run so that all
# four figures describe the same moment. Missing values come back as 0.
read -r mem_total mem_used swap_total swap_used < <(
    free -m 2>/dev/null | awk '
        /^Mem:/  { mt = $2; mu = $3 }
        /^Swap:/ { st = $2; su = $3 }
        END      { print mt + 0, mu + 0, st + 0, su + 0 }'
)

if (( mem_total > 0 )); then
    mem_pct=$((mem_used * 100 / mem_total))

    # RAM thresholds: 90% and above is critical, 80% and above is a warning.
    if (( mem_pct >= 90 )); then
        crit "RAM: ${mem_used}M / ${mem_total}M (${mem_pct}%)"
    elif (( mem_pct >= 80 )); then
        warn "RAM: ${mem_used}M / ${mem_total}M (${mem_pct}%)"
    else
        ok "RAM: ${mem_used}M / ${mem_total}M (${mem_pct}%)"
    fi
else
    # No usable "Mem:" line; skip the percentage rather than divide by zero.
    mem_pct=0
    warn "RAM: unable to determine memory usage from 'free -m'"
fi

# Swap is only rated when swap exists and some of it is in use.
if (( swap_total > 0 && swap_used > 0 )); then
    swap_pct=$((swap_used * 100 / swap_total))
    if (( swap_pct > 20 )); then
        warn "Swap: ${swap_used}M / ${swap_total}M (${swap_pct}%) - high swap usage"
    else
        ok "Swap: ${swap_used}M / ${swap_total}M (${swap_pct}%)"
    fi
else
    ok "Swap: ${swap_used}M / ${swap_total}M (no swap pressure)"
fi

subheader "CPU Load"
# The first three fields of /proc/loadavg are the 1-, 5- and 15-minute load
# averages.
read -r load1 load5 load15 _ < /proc/loadavg
ncpu=$(nproc)

# Only the integer part of the 5-minute load takes part in the comparison;
# both thresholds scale with the CPU count.
load5_int=${load5%%.*}
load_warn_threshold=$((ncpu * WARN_LOAD_RATIO))
load_crit_threshold=$((ncpu * CRIT_LOAD_RATIO))

load_summary="Load average: $load1 / $load5 / $load15 ($ncpu CPUs)"
if (( load5_int >= load_crit_threshold )); then
    crit "${load_summary} - 5-min load is ${CRIT_LOAD_RATIO}x+ CPU count"
    info "System is severely overloaded. Check for runaway processes or I/O saturation."
elif (( load5_int >= load_warn_threshold )); then
    warn "${load_summary} - 5-min load is ${WARN_LOAD_RATIO}x+ CPU count"
    info "System is under heavy load. Review with: top -bn1 | head -20"
else
    ok "${load_summary}"
fi

subheader "CPU Overallocation"

# Running totals for the overallocation report.
phys_cpus=$(nproc)
total_vcpus=0
vm_vcpus_total=0
ct_vcpus_total=0
running_vms=0
running_cts=0
running_guests=""   # one human-readable line per running guest

# Add up the vCPUs (cores x sockets) of every running VM.
if command -v qm &>/dev/null; then
    while read -r vmid vmname vmstatus _; do
        [[ "$vmstatus" == "running" ]] || continue

        # Fetch the VM configuration once and read both keys from it. A key
        # that is absent or not a plain number counts as 1.
        vm_conf=$(qm config "$vmid" 2>/dev/null || true)
        cores=$(awk '/^cores:/ {print $2; exit}' <<< "$vm_conf")
        sockets=$(awk '/^sockets:/ {print $2; exit}' <<< "$vm_conf")
        [[ "$cores" =~ ^[0-9]+$ ]] || cores=1
        [[ "$sockets" =~ ^[0-9]+$ ]] || sockets=1

        vm_vcpus=$((cores * sockets))
        vm_vcpus_total=$((vm_vcpus_total + vm_vcpus))
        total_vcpus=$((total_vcpus + vm_vcpus))
        running_vms=$((running_vms + 1))
        running_guests+="  VM  ${vmid}  ${vmname}: ${vm_vcpus} vCPU(s)"$'\n'
    done < <(qm list 2>/dev/null | tail -n +2 | awk '{print $1, $2, $3}')
fi

# Add up the CPU allocation of every running container.
# `pct list` prints the columns VMID, Status, Lock, Name; the Lock column is
# blank unless the container is locked, so the name is taken from the last
# field instead of a fixed position.
if command -v pct &>/dev/null; then
    while read -r ctid ctstatus ctname; do
        [[ "$ctstatus" == "running" ]] || continue

        cores=$(pct config "$ctid" 2>/dev/null | awk '/^cores:/ {print $2; exit}')
        if [[ ! "$cores" =~ ^[1-9][0-9]*$ ]]; then
            # No positive core limit configured: count the host CPU number.
            cores=$phys_cpus
            running_guests+="  CT  ${ctid}  ${ctname}: ${cores} vCPU(s) (unlimited - using host CPU count)"$'\n'
        else
            running_guests+="  CT  ${ctid}  ${ctname}: ${cores} vCPU(s)"$'\n'
        fi
        ct_vcpus_total=$((ct_vcpus_total + cores))
        total_vcpus=$((total_vcpus + cores))
        running_cts=$((running_cts + 1))
    done < <(pct list 2>/dev/null | tail -n +2 \
        | awk 'NF >= 2 {print $1, $2, (NF >= 3 ? $NF : "")}')
fi

# Print $1 / $2 with one decimal place.
_vcpu_ratio() {
    awk "BEGIN {printf \"%.1f\", $1 / $2}"
}

# Rate the vCPU totals against the physical CPU count. VMs warn at 2:1 and are
# critical at 4:1; containers warn at 6:1 and are critical at 10:1. The worse
# of the two ratings decides how the summary line is reported.
if (( total_vcpus <= 0 )); then
    ok "No running guests detected."
else
    vcpu_ratio_display=$(_vcpu_ratio "$total_vcpus" "$phys_cpus")
    overalloc_status="ok"
    overalloc_details=""

    if (( vm_vcpus_total > 0 )); then
        vm_ratio_display=$(_vcpu_ratio "$vm_vcpus_total" "$phys_cpus")
        vm_warn_threshold=$((phys_cpus * 2))
        vm_crit_threshold=$((phys_cpus * 4))

        if (( vm_vcpus_total >= vm_crit_threshold )); then
            overalloc_status="crit"
            overalloc_details="VM vCPUs critically overallocated: ${vm_vcpus_total} vCPUs across ${running_vms} VM(s) (${vm_ratio_display}:1 ratio, threshold: 4:1)"
        elif (( vm_vcpus_total >= vm_warn_threshold )); then
            # The status is still "ok" here, so it can be raised directly.
            overalloc_status="warn"
            overalloc_details="VM vCPUs overallocated: ${vm_vcpus_total} vCPUs across ${running_vms} VM(s) (${vm_ratio_display}:1 ratio, threshold: 2:1)"
        fi
    fi

    if (( ct_vcpus_total > 0 )); then
        ct_ratio_display=$(_vcpu_ratio "$ct_vcpus_total" "$phys_cpus")
        ct_warn_threshold=$((phys_cpus * 6))
        ct_crit_threshold=$((phys_cpus * 10))

        # Container findings are appended after any VM finding, "; "-separated.
        if (( ct_vcpus_total >= ct_crit_threshold )); then
            overalloc_status="crit"
            overalloc_details+="${overalloc_details:+; }CT vCPUs critically overallocated: ${ct_vcpus_total} vCPUs across ${running_cts} CT(s) (${ct_ratio_display}:1 ratio, threshold: 10:1)"
        elif (( ct_vcpus_total >= ct_warn_threshold )); then
            # Never downgrade a critical VM rating to a warning.
            if [[ "$overalloc_status" != "crit" ]]; then
                overalloc_status="warn"
            fi
            overalloc_details+="${overalloc_details:+; }CT vCPUs overallocated: ${ct_vcpus_total} vCPUs across ${running_cts} CT(s) (${ct_ratio_display}:1 ratio, threshold: 6:1)"
        fi
    fi

    summary="vCPU allocation: ${total_vcpus} total (${vm_vcpus_total} VM, ${ct_vcpus_total} CT) across ${running_vms} VM(s) and ${running_cts} CT(s) vs ${phys_cpus} physical CPUs (${vcpu_ratio_display}:1 combined)"

    case "$overalloc_status" in
        crit)
            crit "$summary"
            info "$overalloc_details"
            info "Reduce guest CPU allocations or migrate workloads to other nodes."
            ;;
        warn)
            warn "$summary"
            info "$overalloc_details"
            info "Consider reducing guest CPU allocations if CPU contention is observed."
            ;;
        *)
            ok "$summary"
            ;;
    esac

    # Per-guest breakdown collected while tallying.
    if [[ -n "$running_guests" ]]; then
        log ""
        log "  Running Guest vCPU Breakdown:"
        log "$running_guests"
    fi
fi

subheader "Uptime"
# Human-readable uptime followed by the boot timestamp.
printf -v uptime_report '%s (since %s)' "$(uptime -p)" "$(uptime -s)"
info "$uptime_report"

###############################################################################
# 7. PROXMOX SERVICES
###############################################################################
header "7. PROXMOX SERVICES"

# Services checked on every node.
services=(
    "pvedaemon"
    "pveproxy"
    "pvestatd"
    "pve-cluster"
    "corosync"
    "cron"
)

# Optional services, added only when the matching component is present.
if command -v zfs &>/dev/null; then
    services+=("zfs-zed")
fi
# Look the backup proxy up by its full, quoted unit name and confirm that it
# actually appears in the listing, instead of trusting systemctl's exit status.
if systemctl list-unit-files 'proxmox-backup-proxy.service' 2>/dev/null \
    | grep -q '^proxmox-backup-proxy\.service'; then
    services+=("proxmox-backup-proxy")
fi

# A service that is enabled but not running is a warning; one that is neither
# active nor enabled is only noted.
for svc in "${services[@]}"; do
    if systemctl is-active --quiet "$svc" 2>/dev/null; then
        ok "$svc: active"
    elif systemctl is-enabled --quiet "$svc" 2>/dev/null; then
        warn "$svc: enabled but not running"
    else
        info "$svc: not enabled (may not be applicable)"
    fi
done

# Corosync state consistency
subheader "Corosync State Consistency"
# `systemctl is-enabled` / `is-active` print the state and also exit non-zero
# for states such as "disabled" or "inactive". The exit status is therefore
# ignored and "unknown" is substituted only when nothing was printed, so the
# captured value is always a single word.
corosync_enabled=$(systemctl is-enabled corosync 2>/dev/null || true)
corosync_active=$(systemctl is-active corosync 2>/dev/null || true)
corosync_enabled=${corosync_enabled:-unknown}
corosync_active=${corosync_active:-unknown}

if [[ "$corosync_enabled" == "enabled" && "$corosync_active" != "active" ]]; then
    warn "Corosync is enabled but not running. If clustering is not in use, disable it: systemctl disable corosync"
elif [[ "$corosync_enabled" == "disabled" && "$corosync_active" == "active" ]]; then
    warn "Corosync is running but not enabled. State is inconsistent."
elif [[ "$corosync_enabled" == "enabled" && "$corosync_active" == "active" ]]; then
    ok "Corosync: enabled and running (cluster mode)"
else
    # Every remaining combination is reported as a standalone node.
    ok "Corosync: not enabled (standalone node)"
fi

###############################################################################
# 8. PENDING UPDATES
###############################################################################
header "8. PENDING UPDATES"

# Refresh the package index, then take one listing of upgradable packages and
# derive both counts from it.
apt_update_output=$(apt update 2>&1)
upgradable_listing=$(apt list --upgradable 2>/dev/null || true)
upgradable=$(grep -c "upgradable" <<< "$upgradable_listing" || true)

if (( upgradable <= 0 )); then
    ok "System is up to date"
else
    info "$upgradable package(s) have updates available"
    # Lines mentioning pve or proxmox are called out separately as a warning.
    pve_updates=$(grep -c "pve\|proxmox" <<< "$upgradable_listing" || true)
    if (( pve_updates > 0 )); then
        warn "$pve_updates Proxmox-related update(s) pending"
    fi
fi

# Kernel cleanup
subheader "Kernel Cleanup"
running_kernel=$(uname -r)

# Installed kernel versions, taken from the /boot/vmlinuz-<version> image
# names and sorted in version order.
installed_kernels=""
for kernel_image in /boot/vmlinuz-*; do
    [[ -e "$kernel_image" ]] || continue   # glob matched nothing
    installed_kernels+="${kernel_image#/boot/vmlinuz-}"$'\n'
done
installed_kernels=$(printf '%s' "$installed_kernels" | sort -V)
# Count non-empty lines so that an empty list yields 0.
kernel_count=$(grep -c . <<< "$installed_kernels" || true)

# The detailed audit only runs when more than three kernels are installed.
if (( kernel_count > 3 )); then
    # Versions listed by proxmox-boot-tool are treated as still in use.
    boot_kernels=""
    if command -v proxmox-boot-tool &>/dev/null; then
        boot_kernels=$(proxmox-boot-tool kernel list 2>/dev/null | grep -E "^[0-9]" || true)
    fi

    old_kernels=""
    old_count=0
    old_module_size=0   # MiB used by the module trees of old kernels

    while IFS= read -r kver; do
        [[ -z "$kver" ]] && continue
        [[ "$kver" == "$running_kernel" ]] && continue
        # Fixed-string match: the dots in a version must not act as wildcards.
        if [[ -n "$boot_kernels" ]] && grep -qF -- "$kver" <<< "$boot_kernels"; then
            continue
        fi
        old_kernels+="$kver "
        old_count=$((old_count + 1))
        if [[ -d "/lib/modules/$kver" ]]; then
            mod_size=$(du -sm "/lib/modules/$kver" 2>/dev/null | awk '{print $1}')
            # du may print nothing; count that as 0 to keep the sum valid.
            [[ "$mod_size" =~ ^[0-9]+$ ]] || mod_size=0
            old_module_size=$((old_module_size + mod_size))
        fi
    done <<< "$installed_kernels"

    if (( old_count > 0 )); then
        old_module_size_gb=$(awk "BEGIN {printf \"%.1f\", $old_module_size / 1024}")
        warn "$old_count old kernel(s) found (~${old_module_size_gb} GiB reclaimable): $old_kernels"
        info "Remove with: apt purge proxmox-kernel-<version> for each old kernel"
        info "Then run: update-grub (GRUB) or proxmox-boot-tool refresh (systemd-boot)"
    else
        ok "No old kernels to clean up"
    fi

    # Check for orphaned module directories: a /lib/modules/<version> tree
    # with no /boot/vmlinuz-<version> image that is not the running kernel.
    orphan_modules=""
    orphan_count=0
    for moddir in /lib/modules/*/; do
        [[ -d "$moddir" ]] || continue   # glob matched nothing
        modver=$(basename "$moddir")
        if [[ ! -f "/boot/vmlinuz-$modver" ]] && [[ "$modver" != "$running_kernel" ]]; then
            orphan_modules+="$modver "
            orphan_count=$((orphan_count + 1))
        fi
    done

    if (( orphan_count > 0 )); then
        warn "$orphan_count orphaned module directory(s) with no matching kernel: $orphan_modules"
        info "Remove with: rm -rf /lib/modules/<version> for each orphaned directory"
    fi
else
    ok "Kernel count is clean ($kernel_count installed)"
fi

###############################################################################
# 9. KERNEL & SYSTEM ERRORS
###############################################################################
header "9. RECENT SYSTEM ERRORS (dmesg)"

# Case-insensitive extended regexes applied to the kernel ring buffer.
hw_error_regex="hardware error|machine check|mce|oom|hung_task|panic|kernel bug"
hw_benign_regex="drm.*panic|panic_notifier|panic handler"
io_error_regex="I/O error|blk_update_request|Buffer I/O error"

# hw_errors is the raw match count; hw_errors_real drops the lines that match
# the benign list and is the figure that gets reported.
hw_errors=$(dmesg 2>/dev/null | grep -icE "$hw_error_regex" | head -1 || true)
hw_errors_real=$(dmesg 2>/dev/null | grep -iE "$hw_error_regex" | grep -ivE "$hw_benign_regex" | wc -l || true)
if (( hw_errors_real <= 0 )); then
    ok "No hardware or critical errors in dmesg"
else
    warn "$hw_errors_real hardware/critical error message(s) found in dmesg"
    info "Review with: dmesg | grep -iE 'hardware error|mce|oom|hung_task|panic|kernel bug'"
fi

io_errors=$(dmesg 2>/dev/null | grep -icE "$io_error_regex" || true)
if (( io_errors <= 0 )); then
    ok "No I/O errors in dmesg"
else
    warn "$io_errors I/O error(s) found in dmesg"
    info "Review with: dmesg | grep -iE 'I/O error|blk_update_request|Buffer I/O'"
fi

###############################################################################
# 10. EFI CERTIFICATE CHECK
###############################################################################
header "10. EFI CERTIFICATE STATUS"

# Warn for every Windows VM (ostype win10/win11) that has an EFI disk whose
# settings do not contain "ms-cert=2023w".
if command -v qm &>/dev/null; then
    efi_warn_found=false
    while IFS= read -r vmid; do
        [[ -z "$vmid" ]] && continue
        conf="/etc/pve/qemu-server/${vmid}.conf"
        [[ -f "$conf" ]] || continue

        # Only the part of the file before the first "[section]" header is
        # inspected. Later sections (snapshots, pending changes) repeat keys
        # such as ostype, name and efidisk0, which would otherwise yield
        # multi-line values or match settings that are not currently active.
        current_conf=$(awk '/^\[/ {exit} {print}' "$conf" 2>/dev/null || true)

        grep -q "efidisk0" <<< "$current_conf" || continue

        ostype=$(awk '/^ostype:/ {print $2; exit}' <<< "$current_conf")
        [[ "$ostype" == "win10" || "$ostype" == "win11" ]] || continue

        if ! grep -q "ms-cert=2023w" <<< "$current_conf"; then
            vm_name=$(awk '/^name:/ {print $2; exit}' <<< "$current_conf")
            # A VM without a name: line is shown as "unknown".
            vm_name=${vm_name:-unknown}
            warn "VM $vmid ($vm_name): UEFI 2011 cert in use - expires June 2026. Run: qm enroll-efi-keys $vmid"
            efi_warn_found=true
        fi
    done < <(ls /etc/pve/qemu-server/ 2>/dev/null | grep -oP '^\d+' | sort -n)

    if ! $efi_warn_found; then
        ok "All EFI-enabled VMs have current certificates (or no EFI VMs found)"
    fi
else
    info "qm not available. Skipping EFI certificate check."
fi

###############################################################################
# 11. STORAGE MISCONFIGURATION CHECK
###############################################################################
header "11. STORAGE MISCONFIGURATION CHECK"

# Directory-on-ZFS Detection
# Flags "dir" storages whose path sits on a ZFS filesystem and whose content
# list includes VM images or container root directories.
subheader "Directory-on-ZFS Detection"
dir_on_zfs_found=false

if command -v pvesm &>/dev/null; then
    while read -r storage_id storage_type _; do
        [[ "$storage_type" == "dir" ]] || continue

        # storage.cfg stanza of this storage: from its "dir: <id>" line up to
        # the next blank line.
        dir_stanza=$(awk "/^dir: ${storage_id}\$/,/^\$/" /etc/pve/storage.cfg 2>/dev/null || true)

        storage_path=$(grep "path " <<< "$dir_stanza" | awk '{print $2}')
        [[ -n "$storage_path" ]] || continue

        # Filesystem type as reported by df for that path.
        mount_fs=$(df -T "$storage_path" 2>/dev/null | tail -1 | awk '{print $2}')
        [[ "$mount_fs" == "zfs" ]] || continue

        content=$(grep "content " <<< "$dir_stanza" | awk '{print $2}')
        if grep -qE "images|rootdir" <<< "$content"; then
            crit "Storage '$storage_id' is type 'dir' on ZFS at '$storage_path' with VM/CT content"
            info "Convert to type 'zfspool' for native zvol support and snapshot capability."
            dir_on_zfs_found=true
        fi
    done < <(pvesm status 2>/dev/null | tail -n +2)
fi

if ! $dir_on_zfs_found; then
    ok "No directory stores on ZFS with VM/CT content detected"
fi

# Stale Systemd Mount Units
# A mnt-pve-*.mount unit counts as stale when the path in its Where= line is
# not currently a mount point.
subheader "Stale Systemd Mount Units"
stale_mounts=0

for unit_file in /etc/systemd/system/mnt-pve-*.mount; do
    [[ -f "$unit_file" ]] || continue

    mount_where=$(grep "^Where=" "$unit_file" 2>/dev/null | cut -d= -f2)
    [[ -n "$mount_where" ]] || continue

    # Nothing to report when the target is mounted.
    if mountpoint -q "$mount_where" 2>/dev/null; then
        continue
    fi

    unit_name=$(basename "$unit_file")
    warn "Stale systemd mount unit: $unit_name (target $mount_where not mounted)"
    info "Remove with: systemctl disable '$unit_name' && rm /etc/systemd/system/'$unit_name' && systemctl daemon-reload"
    stale_mounts=$((stale_mounts + 1))
done

if (( stale_mounts == 0 )); then
    ok "No stale systemd mount units found"
fi

# Duplicate Storage Paths
# Maps each backing location to the first storage ID seen using it and warns
# when a later storage resolves to the same location. "dir" storages are keyed
# by their path, "zfspool" storages by "zfspool:<pool>"; other types are
# ignored.
subheader "Duplicate Storage Paths"
dup_paths_found=false
declare -A storage_path_map

while read -r s_id s_type _; do
    case "$s_type" in
        dir)
            s_path=$(awk "/^dir: ${s_id}\$/,/^\$/" /etc/pve/storage.cfg 2>/dev/null | grep "path " | awk '{print $2}')
            ;;
        zfspool)
            s_pool=$(awk "/^zfspool: ${s_id}\$/,/^\$/" /etc/pve/storage.cfg 2>/dev/null | grep "pool " | awk '{print $2}')
            s_path="zfspool:${s_pool}"
            ;;
        *)
            s_path=""
            ;;
    esac

    [[ -n "$s_path" ]] || continue

    first_owner="${storage_path_map[$s_path]}"
    if [[ -z "$first_owner" ]]; then
        storage_path_map["$s_path"]="$s_id"
    else
        warn "Duplicate storage path: '$s_path' used by both '$first_owner' and '$s_id'"
        dup_paths_found=true
    fi
done < <(pvesm status 2>/dev/null | tail -n +2)

if ! $dup_paths_found; then
    ok "No duplicate storage paths detected"
fi

###############################################################################
# 12. NFS MOUNT HEALTH
###############################################################################
header "12. NFS MOUNT HEALTH"

# Probe every mounted NFS/NFSv4 filesystem with a time-limited stat call.
nfs_mounts_found=false
while IFS= read -r nfs_line; do
    [[ -n "$nfs_line" ]] || continue
    nfs_mounts_found=true

    # Field 1 of the mount line is the source, field 3 the mount point.
    read -r nfs_src _ nfs_mount _ <<< "$nfs_line"

    # A stat that fails or exceeds the timeout marks the mount unresponsive.
    if ! timeout "$NFS_TIMEOUT_SECS" stat "$nfs_mount" &>/dev/null; then
        crit "NFS mount '$nfs_mount' ($nfs_src): NOT responding (timed out after ${NFS_TIMEOUT_SECS}s)"
        info "This can freeze the PVE web interface. Force unmount with: umount -l '$nfs_mount'"
        info "Then remove the storage definition if no longer needed: pvesm remove <storage-id>"
    else
        ok "NFS mount '$nfs_mount' ($nfs_src): responsive"
    fi
done < <(mount -t nfs,nfs4 2>/dev/null)

if ! $nfs_mounts_found; then
    ok "No NFS mounts present"
fi

###############################################################################
# 13. VZDUMP CONFIGURATION AUDIT
###############################################################################
header "13. VZDUMP CONFIGURATION AUDIT"

# Check tmpdir setting
# Warns when an active tmpdir: entry in vzdump.conf points at a network
# filesystem; otherwise reports which temporary directory is in effect.
subheader "Tmpdir Configuration"
vzdump_conf="/etc/vzdump.conf"
if [[ ! -f "$vzdump_conf" ]]; then
    ok "No vzdump.conf found (using defaults)"
else
    tmpdir_line=$(grep -E "^tmpdir:" "$vzdump_conf" 2>/dev/null | awk '{print $2}')
    if [[ -z "$tmpdir_line" ]]; then
        # No active entry; mention a commented-out one for information only.
        commented_tmpdir=$(grep -E "^#tmpdir:" "$vzdump_conf" 2>/dev/null | awk '{print $2}')
        if [[ -z "$commented_tmpdir" ]]; then
            ok "vzdump tmpdir: using default /var/tmp (local)"
        else
            info "vzdump tmpdir is commented out (using default /var/tmp). Commented value: $commented_tmpdir"
        fi
    else
        # Filesystem type backing the configured directory, as reported by df.
        tmpdir_fs=$(df -T "$tmpdir_line" 2>/dev/null | tail -1 | awk '{print $2}')
        case "$tmpdir_fs" in
            nfs|nfs4|cifs|smb)
                warn "vzdump tmpdir is set to network storage: $tmpdir_line ($tmpdir_fs)"
                info "Network-based tmpdir can cause GUI freezes if the share becomes unresponsive."
                info "Consider using local storage for tmpdir or comment out the line to use default /var/tmp."
                ;;
            *)
                ok "vzdump tmpdir: $tmpdir_line (local filesystem: $tmpdir_fs)"
                ;;
        esac
    fi
fi

# Check for orphaned vzdump temp files
# Looks directly inside /var/tmp and /tmp for entries named vzdump* or
# pveupload-* whose modification time is older than the stale cutoff.
subheader "Orphaned Vzdump Temp Files"
orphan_vzdump_count=0
stale_cutoff=$(date -d "${VZDUMP_TEMP_STALE_HOURS} hours ago" +%s 2>/dev/null || echo 0)

for scan_dir in /var/tmp /tmp; do
    while IFS= read -r leftover; do
        [[ -n "$leftover" ]] || continue

        leftover_mtime=$(stat -c %Y "$leftover" 2>/dev/null || echo 0)
        # Skip entries that could not be stat'ed or are newer than the cutoff.
        (( leftover_mtime > 0 && leftover_mtime < stale_cutoff )) || continue

        leftover_age_days=$(( ($(date +%s) - leftover_mtime) / 86400 ))
        leftover_size=$(du -sh "$leftover" 2>/dev/null | awk '{print $1}')
        warn "Orphaned vzdump temp: $leftover (${leftover_size}, ${leftover_age_days} days old)"
        info "Remove with: rm -rf '$leftover'"
        orphan_vzdump_count=$((orphan_vzdump_count + 1))
    done < <(find "$scan_dir" -maxdepth 1 \( -name "vzdump*" -o -name "pveupload-*" \) 2>/dev/null)
done

if (( orphan_vzdump_count == 0 )); then
    ok "No orphaned vzdump temp files found"
fi

###############################################################################
# 14. ISO & TEMPLATE INTEGRITY
###############################################################################
header "14. ISO & TEMPLATE INTEGRITY"

iso_issues=0
iso_dir="/var/lib/vz/template/iso"

# Directories to scan: the default ISO directory plus template/iso under
# every "dir" storage whose content list mentions iso.
declare -a iso_dirs=()
if [[ -d "$iso_dir" ]]; then
    iso_dirs+=("$iso_dir")
fi

while read -r s_id s_type _; do
    [[ "$s_type" == "dir" ]] || continue

    # storage.cfg stanza of this storage, up to the next blank line.
    dir_stanza=$(awk "/^dir: ${s_id}\$/,/^\$/" /etc/pve/storage.cfg 2>/dev/null || true)

    content=$(grep "content " <<< "$dir_stanza" | awk '{print $2}')
    grep -q "iso" <<< "$content" || continue

    s_path=$(grep "path " <<< "$dir_stanza" | awk '{print $2}')
    candidate_dir="${s_path}/template/iso"
    [[ -n "$s_path" && -d "$candidate_dir" ]] || continue

    # Do not list the same directory twice.
    already_added=false
    for d in "${iso_dirs[@]}"; do
        if [[ "$d" == "$candidate_dir" ]]; then
            already_added=true
            break
        fi
    done
    if ! $already_added; then
        iso_dirs+=("$candidate_dir")
    fi
done < <(pvesm status 2>/dev/null | tail -n +2)

# Flag every *.iso file smaller than the configured minimum size.
for check_dir in "${iso_dirs[@]}"; do
    while IFS= read -r iso_file; do
        [[ -n "$iso_file" ]] || continue
        iso_basename=$(basename "$iso_file")
        # Size in whole MiB.
        iso_size_mb=$(stat -c %s "$iso_file" 2>/dev/null | awk '{printf "%d", $1/1048576}')

        (( iso_size_mb < WARN_ISO_MIN_SIZE_MB )) || continue
        warn "Suspicious ISO: $iso_basename (${iso_size_mb} MiB) in $check_dir - may be corrupt or incomplete"
        info "A valid OS installer ISO is typically 2-6 GiB. Re-download if this is an OS image."
        iso_issues=$((iso_issues + 1))
    done < <(find "$check_dir" -maxdepth 1 -name "*.iso" -type f 2>/dev/null)
done

if (( iso_issues == 0 )); then
    ok "All ISO files are above minimum expected size (${WARN_ISO_MIN_SIZE_MB} MiB)"
fi

###############################################################################
# 15. NVIDIA DKMS STATUS
###############################################################################
header "15. NVIDIA DKMS STATUS"

if command -v dkms &>/dev/null; then
    running_kernel=$(uname -r)
    dkms_issues=false
    # One `dkms status` listing serves the per-module loop and both the NVIDIA
    # and Gasket checks below.
    dkms_status_out=$(dkms status 2>/dev/null || true)

    while IFS= read -r dkms_line; do
        [[ -n "$dkms_line" ]] || continue
        # Split the status line on its ',', '/' and ':' separators.
        module_name=$(awk -F'[,/]' '{print $1}' <<< "$dkms_line")
        module_ver=$(awk -F'[,/]' '{print $2}' <<< "$dkms_line" | sed 's/^ *//')
        module_kernel=$(awk -F'[,:]' '{print $2}' <<< "$dkms_line" | sed 's/^ *//')
        module_status=$(awk -F':' '{print $NF}' <<< "$dkms_line" | sed 's/^ *//')

        # Only lines that mention the running kernel are rated.
        grep -q "$running_kernel" <<< "$dkms_line" || continue

        status_lc="${module_status,,}"
        if [[ "$status_lc" == *installed* ]]; then
            ok "DKMS module '$module_name' ($module_ver): installed for $running_kernel"
        elif [[ "$status_lc" == *built* ]]; then
            warn "DKMS module '$module_name' ($module_ver): built but not installed for $running_kernel"
            info "Install with: dkms install $module_name/$module_ver -k $running_kernel"
            dkms_issues=true
        else
            warn "DKMS module '$module_name' ($module_ver): status '$module_status' for $running_kernel"
            dkms_issues=true
        fi
    done <<< "$dkms_status_out"

    # NVIDIA modules that exist, but with no entry for the running kernel.
    nvidia_modules=$(grep -i nvidia <<< "$dkms_status_out" || true)
    if [[ -n "$nvidia_modules" ]]; then
        nvidia_for_running=$(grep "$running_kernel" <<< "$nvidia_modules" || true)
        if [[ -z "$nvidia_for_running" ]]; then
            crit "NVIDIA DKMS modules exist but none are built for running kernel $running_kernel"
            info "GPU passthrough will not function. Rebuild with: dkms autoinstall -k $running_kernel"
            dkms_issues=true
        fi
    fi

    # Same check for Gasket/Coral modules.
    gasket_modules=$(grep -i gasket <<< "$dkms_status_out" || true)
    if [[ -n "$gasket_modules" ]]; then
        gasket_for_running=$(grep "$running_kernel" <<< "$gasket_modules" || true)
        if [[ -z "$gasket_for_running" ]]; then
            warn "Gasket DKMS modules exist but none are built for running kernel $running_kernel"
            info "Coral TPU passthrough may not function. Rebuild with: dkms autoinstall -k $running_kernel"
            dkms_issues=true
        fi
    fi

    # With modules present and no issues, the per-module lines above already
    # said everything; only the "no modules at all" case needs its own line.
    if ! $dkms_issues && [[ -z "$nvidia_modules" && -z "$gasket_modules" ]]; then
        ok "No NVIDIA/Gasket DKMS modules present (not a GPU node)"
    fi
elif lspci 2>/dev/null | grep -qi "nvidia"; then
    # An NVIDIA device is listed by lspci but dkms is not installed.
    warn "NVIDIA GPU detected but DKMS is not installed. GPU passthrough modules cannot be managed."
else
    ok "DKMS not installed (no GPU modules expected)"
fi

###############################################################################
# 16. CONTAINER STORAGE COMPATIBILITY
###############################################################################
header "16. CONTAINER STORAGE COMPATIBILITY"

ct_storage_issues=0

# Warn for every running container whose rootfs lives on a dir, nfs or cifs
# storage.
# `pct list` prints the columns VMID, Status, Lock, Name; the Lock column is
# blank unless the container is locked, so the name is taken from the last
# field instead of a fixed position.
if command -v pct &>/dev/null; then
    while read -r ctid ctstatus ctname; do
        # Only check running containers
        [[ "$ctstatus" == "running" ]] || continue

        # Check rootfs storage type
        rootfs_line=$(pct config "$ctid" 2>/dev/null | grep "^rootfs:" || true)
        [[ -n "$rootfs_line" ]] || continue

        # "rootfs: <storage>:<volume>,..." -> storage ID before the first colon.
        rootfs_storage=$(echo "$rootfs_line" | awk '{print $2}' | cut -d: -f1)
        # The storage type is the word before the colon on the storage.cfg
        # stanza header line "<type>: <storage-id>".
        storage_type=$(awk "/^[a-z]+: ${rootfs_storage}\$/,/^\$/" /etc/pve/storage.cfg 2>/dev/null | head -1 | awk '{print $1}' | tr -d ':')
        if [[ "$storage_type" == "dir" || "$storage_type" == "nfs" || "$storage_type" == "cifs" ]]; then
            warn "CT $ctid ($ctname): rootfs on '$rootfs_storage' (type: $storage_type) - does not support snapshots"
            info "Backups will use suspend mode (brief downtime). Consider migrating rootfs to ZFS or LVM-thin storage."
            ct_storage_issues=$((ct_storage_issues + 1))
        fi
    done < <(pct list 2>/dev/null | tail -n +2 \
        | awk 'NF >= 2 {print $1, $2, (NF >= 3 ? $NF : "")}')
fi

if (( ct_storage_issues == 0 )); then
    ok "All running containers are on snapshot-capable storage"
fi

###############################################################################
# 17. PBS DATASTORE UTILIZATION
###############################################################################
header "17. PBS DATASTORE UTILIZATION"

pbs_checked=false

# Rate the fill level of every active PBS storage listed by `pvesm status`.
if command -v pvesm &>/dev/null; then
    while read -r s_id s_type s_status s_total s_used s_avail s_pct_raw _; do
        [[ "$s_type" == "pbs" && "$s_status" == "active" ]] || continue
        pbs_checked=true

        # Drop the percent sign and any fractional part.
        s_pct_raw=$(tr -d '%' <<< "$s_pct_raw")
        s_pct=${s_pct_raw%%.*}
        if [[ ! "$s_pct" =~ ^[0-9]+$ ]]; then
            info "PBS '$s_id': unable to determine utilization percentage"
            continue
        fi

        # Sizes divided by 1048576 and shown with a "G" suffix.
        s_used_human=$(awk "BEGIN {printf \"%.1f\", $s_used / 1048576}" 2>/dev/null)
        s_total_human=$(awk "BEGIN {printf \"%.1f\", $s_total / 1048576}" 2>/dev/null)
        s_avail_human=$(awk "BEGIN {printf \"%.1f\", $s_avail / 1048576}" 2>/dev/null)
        pbs_usage_line="PBS '$s_id': ${s_pct}% used (${s_used_human}G / ${s_total_human}G, ${s_avail_human}G free)"

        if (( s_pct >= CRIT_PBS_USAGE_PCT )); then
            crit "$pbs_usage_line"
            info "Datastore is nearly full. Review retention policies and run garbage collection."
        elif (( s_pct >= WARN_PBS_USAGE_PCT )); then
            warn "$pbs_usage_line"
            info "Consider reviewing backup retention or expanding storage."
        else
            ok "$pbs_usage_line"
        fi
    done < <(pvesm status 2>/dev/null | tail -n +2)
fi

if ! $pbs_checked; then
    info "No PBS datastores configured on this node"
fi

###############################################################################
# 18. TRUENAS SNAPSHOT COUNT (optional, via SSH)
###############################################################################
header "18. TRUENAS SNAPSHOT COUNT"

# Collects candidate hosts from two places - the "server" entries in
# /etc/pve/storage.cfg and the guest-agent address of running VMs whose name
# contains "truenas" - then asks each one over SSH how many ZFS snapshots it
# holds and compares that against WARN_TRUENAS_SNAPSHOTS.

truenas_checked=false
declare -a truenas_hosts=()

# Append a host to truenas_hosts unless it is empty or already listed.
# Arguments: $1 - host name or address
truenas_add_host() {
    local candidate=$1 known
    [[ -n "$candidate" ]] || return 0
    for known in "${truenas_hosts[@]}"; do
        [[ "$known" == "$candidate" ]] && return 0
    done
    truenas_hosts+=("$candidate")
}

# Read guest-agent network JSON on stdin and print the first non-loopback
# IPv4 address. The closing quote is part of the match so only a complete
# dotted quad is accepted; a bare "[0-9.]+" would also accept the leading
# digits of an IPv6 address such as "2001:db8::1".
truenas_first_ipv4() {
    grep -oE '"ip-address"[[:space:]]*:[[:space:]]*"[0-9]+(\.[0-9]+){3}"' \
        | grep -oE '[0-9]+(\.[0-9]+){3}' \
        | grep -v '^127\.' \
        | head -1
}

# Query one host for its snapshot count and report the result.
# Arguments: $1 - host to contact as root
# Globals:   truenas_checked (set to true when a count was obtained)
#            WARN_TRUENAS_SNAPSHOTS (read)
# Outputs:   warn/info/ok findings; nothing when the host gives no usable count
truenas_check_host() {
    local tn_host=$1 snap_count

    # BatchMode: key-based auth only, never prompt. -n: keep ssh from reading
    # this script's stdin. `zfs list -H` omits the column header so the line
    # count equals the snapshot count (without it every count is one too high).
    # NOTE(review): StrictHostKeyChecking=no accepts unknown and changed host
    # keys without verification - consider accept-new or managed known_hosts.
    snap_count=$(ssh -n -o BatchMode=yes -o ConnectTimeout=5 -o StrictHostKeyChecking=no \
        root@"$tn_host" "zfs list -H -t snapshot -o name 2>/dev/null | wc -l" 2>/dev/null || echo "")

    # BSD wc pads its output with leading blanks, which would fail the
    # numeric check below and make the host look unreachable.
    snap_count=${snap_count//[[:space:]]/}

    [[ "$snap_count" =~ ^[0-9]+$ ]] || return 0

    truenas_checked=true
    if (( snap_count > WARN_TRUENAS_SNAPSHOTS )); then
        warn "TrueNAS ($tn_host): $snap_count snapshots (threshold: $WARN_TRUENAS_SNAPSHOTS)"
        info "Excessive snapshots degrade pool performance. Review snapshot retention tasks."
    else
        ok "TrueNAS ($tn_host): $snap_count snapshots"
    fi
}

# Candidate source 1: NFS/CIFS "server" entries in storage.cfg
while read -r _ server_addr _; do
    truenas_add_host "$server_addr"
done < <(grep -E "^\s+server " /etc/pve/storage.cfg 2>/dev/null)

# Candidate source 2: running VMs named like TrueNAS, address via guest agent
if command -v qm &>/dev/null; then
    while read -r vmid vmname vmstatus; do
        [[ "$vmstatus" == "running" ]] || continue
        [[ "${vmname,,}" == *truenas* ]] || continue
        # No match (or no guest agent) is normal; never let it abort the run.
        tn_ip=$(qm guest cmd "$vmid" network-get-interfaces 2>/dev/null | truenas_first_ipv4 || true)
        truenas_add_host "$tn_ip"
    done < <(qm list 2>/dev/null | tail -n +2 | awk '{print $1, $2, $3}')
fi

for tn_host in "${truenas_hosts[@]}"; do
    truenas_check_host "$tn_host"
done

if ! $truenas_checked; then
    info "No TrueNAS hosts reachable via SSH (key-based auth required)"
fi

###############################################################################
# SUMMARY
###############################################################################
header "SUMMARY"

log "  Total Warnings:  $WARNINGS"
log "  Total Criticals: $CRITICALS"
log ""

# Decide the verdict once: criticals outrank warnings, and a clean run is
# healthy. summary_tint holds the NAME of the colour variable so the colour
# itself is only expanded when colour output is actually enabled.
if (( CRITICALS > 0 )); then
    summary_rc=2
    summary_tint=C_RED
    summary_text="RESULT: CRITICAL - Immediate attention required"
elif (( WARNINGS > 0 )); then
    summary_rc=1
    summary_tint=C_YELLOW
    summary_text="RESULT: WARNING - Review items above"
else
    summary_rc=0
    summary_tint=C_GREEN
    summary_text="RESULT: HEALTHY - All checks passed"
fi

if $USE_COLOR; then
    log "  ${!summary_tint}${summary_text}${C_RESET}"
else
    log "  ${summary_text}"
fi

# Becomes the script's exit status: 0 healthy, 1 warnings, 2 criticals.
EXIT_CODE=$summary_rc

# Closing banner with the generation timestamp.
summary_rule="==============================================================================="
summary_stamp=$(date '+%Y-%m-%d %H:%M:%S')

log ""
log "$summary_rule"
log "  End of report - Generated ${summary_stamp}"
log "$summary_rule"

# --- Output ------------------------------------------------------------------
# Print the report, then optionally save and/or mail the plain-text copy.
# Delivery problems are reported on stderr but never change the exit status,
# which stays reserved for the health verdict held in EXIT_CODE.
echo -e "$REPORT"

if [[ -n "$OUTPUT_FILE" ]]; then
    # OUTPUT_FILE is used as a directory; the file name is derived here.
    report_path="${OUTPUT_FILE}/pve-health-${HOSTNAME}-${DATE_SHORT}.txt"
    # Only claim success when the write really succeeded (missing directory,
    # read-only filesystem and full disk all make the redirection fail).
    if printf '%s\n' "$REPORT_PLAIN" > "$report_path"; then
        echo "Report saved to: $report_path"
    else
        echo "ERROR: could not write report to: $report_path" >&2
    fi
fi

if [[ -n "$MAIL_TO" ]]; then
    if command -v mail &>/dev/null; then
        # The subject carries the same verdict as the summary section.
        if (( CRITICALS > 0 )); then
            mail_status='CRITICAL'
        elif (( WARNINGS > 0 )); then
            mail_status='WARNING'
        else
            mail_status='HEALTHY'
        fi

        if printf '%s\n' "$REPORT_PLAIN" | mail -s "PVE Health Check: ${HOSTNAME} - ${mail_status}" "$MAIL_TO"; then
            echo "Report emailed to: $MAIL_TO"
        else
            echo "ERROR: 'mail' failed to send the report to: $MAIL_TO" >&2
        fi
    else
        echo "WARNING: 'mail' command not found. Install mailutils to enable email reports."
    fi
fi

exit "$EXIT_CODE"