#!/bin/bash
# =============================================================================
# Deployment Verification - cluster-wide audit of all 5 cert servers
# =============================================================================
# Verifies that every server is deployed identically and completely.
# Can be run from any cluster server (SSH access required).
#
# Usage:
#   verify-deployment.sh              # All checks on all 5 servers
#   verify-deployment.sh <n>          # Only server <n> (1-5)
#   verify-deployment.sh --quiet      # Hide OK/INFO lines; FAIL, WARN and the summary are still printed
#
# Version: 1.0
# =============================================================================

# Shell options: treat unset variables as errors and let a pipeline fail when
# any stage fails. Note that -e (exit on error) is NOT enabled.
set -u -o pipefail

# Colour escape sequences, stored in their backslash form; they are turned
# into real escapes at print time.
NC='\033[0m'
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'

# Cluster inventory: server number -> "<ip>:<display name>".
declare -A SERVERS=(
    [1]="10.0.0.2:Cert-Server-1-NBG"
    [2]="10.0.0.3:Cert-Server-0-NBG"
    [3]="10.0.0.4:Cert-Server-0-FSN"
    [4]="10.0.0.5:Cert-Server-1-FSN"
    [5]="10.0.0.6:Cert-Server-HEL"
)

# Server whose files act as the comparison baseline.
REFERENCE_IP="10.0.0.2"

# Reference values; all start empty.
SCRIPT_CHECKSUM_REF=""
APACHE_CONFIG_REF=""
KERNEL_REF=""    # Filled by init_references: newest running kernel across cluster
CLAUDE_REF=""    # Filled by init_references: newest Claude Code version across cluster

# Command-line handling.
#   --quiet : sets QUIET=1
#   1..5    : restricts the run to that single server (last one given wins)
# Anything else is reported on stderr and the script exits with status 2.
QUIET=0
ONLY_SERVER=""
for arg; do
    if [[ "$arg" == "--quiet" ]]; then
        QUIET=1
    elif [[ "$arg" == [1-5] ]]; then
        ONLY_SERVER="$arg"
    else
        echo "Unknown arg: $arg" >&2
        exit 2
    fi
done

# Result tallies keyed by server number, plus the cluster-wide FAIL total.
declare -A FAIL_COUNT WARN_COUNT OK_COUNT
TOTAL_FAIL=0

# Record a passed check for the current server (global N).
# Arguments: $1 - message text
# Globals:   QUIET (read), GREEN/NC (read), N (read), OK_COUNT (written)
# Output:    one "[OK]" line on stdout unless QUIET=1
# Only the colour codes go through %b; the message is printed verbatim with %s
# so backslashes in it (e.g. from remote command output) are not interpreted.
ok() {
    if [ "$QUIET" != 1 ]; then
        printf '  %b[OK]%b   %s\n' "$GREEN" "$NC" "$1"
    fi
    OK_COUNT[$N]=$(( ${OK_COUNT[$N]:-0} + 1 ))
}
# Record a failed check for the current server (global N).
# Arguments: $1 - message text
# Globals:   RED/NC (read), N (read), FAIL_COUNT and TOTAL_FAIL (written)
# Output:    one "[FAIL]" line on stdout, always (QUIET is not consulted)
# Only the colour codes go through %b; the message is printed verbatim with %s
# so backslashes in it (e.g. from remote command output) are not interpreted.
fail() {
    printf '  %b[FAIL]%b %s\n' "$RED" "$NC" "$1"
    FAIL_COUNT[$N]=$(( ${FAIL_COUNT[$N]:-0} + 1 ))
    TOTAL_FAIL=$(( TOTAL_FAIL + 1 ))
}
# Record a warning for the current server (global N).
# Arguments: $1 - message text
# Globals:   YELLOW/NC (read), N (read), WARN_COUNT (written)
# Output:    one "[WARN]" line on stdout, always (QUIET is not consulted)
# Only the colour codes go through %b; the message is printed verbatim with %s
# so backslashes in it (e.g. from remote command output) are not interpreted.
warn() {
    printf '  %b[WARN]%b %s\n' "$YELLOW" "$NC" "$1"
    WARN_COUNT[$N]=$(( ${WARN_COUNT[$N]:-0} + 1 ))
}
# Print an informational line; nothing is counted.
# Arguments: $1 - message text
# Globals:   QUIET (read), BLUE/NC (read)
# Output:    one "[INFO]" line on stdout unless QUIET=1
# Only the colour codes go through %b; the message is printed verbatim with %s
# so backslashes in it are not interpreted.
info() {
    if [ "$QUIET" != 1 ]; then
        printf '  %b[INFO]%b %s\n' "$BLUE" "$NC" "$1"
    fi
}

# Run a command on a remote host via ssh.
# Arguments: $1   - host/IP to connect to
#            $2.. - command (and arguments) handed to ssh unchanged
# Output:    the remote command's stdout; stderr of ssh is discarded
# Returns:   the exit status of ssh
ssh_run() {
    local target_host="$1"
    shift
    local -a ssh_opts=(
        -o ConnectTimeout=5
        -o StrictHostKeyChecking=no
        -o LogLevel=ERROR
    )
    ssh "${ssh_opts[@]}" "$target_host" "$@" 2>/dev/null
}

# --- Reference values: script checksum from the reference server, ---
# --- newest kernel / Claude Code version across the whole cluster  ---
#
# Globals read:    REFERENCE_IP, SERVERS
# Globals written: SCRIPT_CHECKSUM_REF, KERNEL_REF, CLAUDE_REF
# Uses:            ssh_run (its stderr is already discarded there)
# Servers that return nothing (e.g. unreachable) are simply skipped; if none
# answers, the corresponding reference stays empty.
init_references() {
    # All loop state is local so nothing leaks into the global namespace.
    local srv_key srv_ip running_kernel claude_version

    SCRIPT_CHECKSUM_REF=$(ssh_run "$REFERENCE_IP" "sha256sum /root/projects/cert-pki/scripts/dns-maintenance.sh 2>/dev/null | cut -d' ' -f1")

    KERNEL_REF=""
    CLAUDE_REF=""
    # Iterate over whatever is in SERVERS; taking the maximum is independent
    # of the iteration order.
    for srv_key in "${!SERVERS[@]}"; do
        srv_ip="${SERVERS[$srv_key]%%:*}"

        running_kernel=$(ssh_run "$srv_ip" "uname -r")
        if [ -n "$running_kernel" ]; then
            # sort -V orders version strings; the last line is the newest.
            KERNEL_REF=$(printf '%s\n%s' "$KERNEL_REF" "$running_kernel" | sort -V | tail -1)
        fi

        claude_version=$(ssh_run "$srv_ip" "/root/.local/bin/claude --version 2>/dev/null | grep -oP '^\S+'")
        if [ -n "$claude_version" ]; then
            CLAUDE_REF=$(printf '%s\n%s' "$CLAUDE_REF" "$claude_version" | sort -V | tail -1)
        fi
    done
}

# Succeeds only when $1 is a non-empty run of decimal digits.
_vd_is_uint() {
    [[ "${1:-}" =~ ^[0-9]+$ ]]
}

# Run all deployment checks against one server and tally the results.
#
# Arguments: $1 - server number (key into SERVERS, value "<ip>:<name>")
# Globals read:    SERVERS, REFERENCE_IP, SCRIPT_CHECKSUM_REF, KERNEL_REF,
#                  CLAUDE_REF, BLUE, NC
# Globals written: N (current server number used by ok/fail/warn),
#                  FAIL_COUNT/OK_COUNT/WARN_COUNT[N] (reset to 0, then updated
#                  through ok/fail/warn), APACHE_CONFIG_REF (cached reference hash)
# Side effects:    check 19 may rewrite entries in the local
#                  /root/.ssh/known_hosts for this server.
# Returns early (after one FAIL) when the server is not reachable via SSH.
check_server() {
    local n="$1"
    local entry="${SERVERS[$n]}"
    local ip="${entry%%:*}"
    local name="${entry##*:}"
    N="$n"
    FAIL_COUNT[$N]=0
    OK_COUNT[$N]=0
    WARN_COUNT[$N]=0

    echo -e "${BLUE}=== [$n] $name ($ip) ===${NC}"

    # 1. SSH reachable
    if ! ssh_run "$ip" "true"; then
        fail "SSH not reachable (all subsequent checks skipped)"
        return
    fi
    ok "SSH reachable"

    # 2. dns-maintenance.sh present + checksum matches reference server
    local cksum
    cksum=$(ssh_run "$ip" "sha256sum /root/projects/cert-pki/scripts/dns-maintenance.sh 2>/dev/null | cut -d' ' -f1")
    if [ -z "$cksum" ]; then
        fail "dns-maintenance.sh missing"
    elif [ "$cksum" = "$SCRIPT_CHECKSUM_REF" ]; then
        ok "dns-maintenance.sh (checksum ${cksum:0:8} matches reference)"
    else
        fail "dns-maintenance.sh DIFFERS from reference (${cksum:0:8} vs ${SCRIPT_CHECKSUM_REF:0:8}) -- deploy latest"
    fi

    # 3. Symlink
    local link
    link=$(ssh_run "$ip" "readlink /usr/local/sbin/dns-maintenance")
    if [[ "$link" == *"dns-maintenance.sh" ]]; then
        ok "symlink /usr/local/sbin/dns-maintenance -> $link"
    else
        fail "symlink /usr/local/sbin/dns-maintenance missing or wrong (got: '$link')"
    fi

    # 4. Service installed
    if ssh_run "$ip" "test -f /etc/systemd/system/dns-maintenance.service"; then
        ok "dns-maintenance.service installed"
    else
        fail "dns-maintenance.service NOT INSTALLED"
    fi

    # 5. Service enabled
    local en
    en=$(ssh_run "$ip" "systemctl is-enabled dns-maintenance.service")
    if [ "$en" = "enabled" ]; then
        ok "dns-maintenance.service enabled"
    else
        fail "dns-maintenance.service NOT ENABLED (state: $en)"
    fi

    # 6. Service active
    local act
    act=$(ssh_run "$ip" "systemctl is-active dns-maintenance.service")
    if [ "$act" = "active" ]; then
        ok "dns-maintenance.service active"
    else
        fail "dns-maintenance.service NOT ACTIVE (state: $act)"
    fi

    # 7. CF tokens
    local tokens
    tokens=$(ssh_run "$ip" "grep -c '^CF_TOKEN_\(BUSINESS\|SERVICES\)=' /etc/environment")
    if [ "$tokens" = "2" ]; then
        ok "CF_TOKEN_BUSINESS + CF_TOKEN_SERVICES in /etc/environment"
    else
        fail "CF tokens missing in /etc/environment (found $tokens of 2)"
    fi

    # 8. Apache active
    if [ "$(ssh_run "$ip" "systemctl is-active apache2")" = "active" ]; then
        ok "apache2 active"
    else
        fail "apache2 NOT ACTIVE"
    fi

    # 9. Apache config diff vs reference
    # A difference is only reported as INFO (not counted).
    if [ "$ip" = "$REFERENCE_IP" ]; then
        ok "apache config (reference server, no diff)"
    else
        local cfg_hash_cmd cmp_cfg
        # One hash over the names and contents of all enabled site/conf files.
        cfg_hash_cmd="find /etc/apache2/sites-enabled /etc/apache2/conf-enabled -type l -o -type f 2>/dev/null | sort | xargs -I{} sh -c 'echo {}; cat {}' | sha256sum | cut -d' ' -f1"
        # Fetch the reference hash once and reuse it for the remaining servers
        # instead of re-running it on the reference host for every comparison.
        if [ -z "${APACHE_CONFIG_REF:-}" ]; then
            APACHE_CONFIG_REF=$(ssh_run "$REFERENCE_IP" "$cfg_hash_cmd")
        fi
        cmp_cfg=$(ssh_run "$ip" "$cfg_hash_cmd")
        if [ "$APACHE_CONFIG_REF" = "$cmp_cfg" ]; then
            ok "apache config matches reference"
        else
            info "apache config differs from reference (legitimate per-server config)"
        fi
    fi

    # 10. SSL certificate validity
    local cert_days
    cert_days=$(ssh_run "$ip" "openssl x509 -in /etc/letsencrypt/live/cert-business/fullchain.pem -noout -enddate 2>/dev/null | cut -d= -f2")
    if [ -n "$cert_days" ]; then
        local exp_epoch now_epoch days_left
        # An unparsable date becomes epoch 0 and therefore lands in the
        # "EXPIRED or invalid" branch below.
        exp_epoch=$(date -d "$cert_days" +%s 2>/dev/null || echo 0)
        now_epoch=$(date +%s)
        days_left=$(( (exp_epoch - now_epoch) / 86400 ))
        if [ "$days_left" -gt 14 ]; then
            ok "SSL cert valid ($days_left days left)"
        elif [ "$days_left" -gt 0 ]; then
            warn "SSL cert expires in $days_left days"
        else
            fail "SSL cert EXPIRED or invalid"
        fi
    else
        warn "SSL cert not found at expected paths"
    fi

    # 11. Wazuh active
    if [ "$(ssh_run "$ip" "systemctl is-active wazuh-agent")" = "active" ]; then
        ok "wazuh-agent active"
    else
        fail "wazuh-agent NOT ACTIVE"
    fi

    # 12. Cloudflared active
    if [ "$(ssh_run "$ip" "systemctl is-active cloudflared")" = "active" ]; then
        ok "cloudflared active"
    else
        fail "cloudflared NOT ACTIVE"
    fi

    # 13. DNS verify
    # The count is the first integer after "Missing:". Colour escapes are
    # stripped first so their digits cannot be mistaken for the count, and the
    # number is compared as a whole (so e.g. 10 is not accepted as "0").
    local verify_out verify_plain missing_n
    local esc=$'\033'
    verify_out=$(ssh_run "$ip" "dns-maintenance verify 2>&1 | grep -E 'Missing:' | head -1")
    verify_plain=$(printf '%s\n' "$verify_out" | sed "s/${esc}\[[0-9;]*[A-Za-z]//g")
    missing_n=$(printf '%s\n' "$verify_plain" | sed -nE 's/.*Missing:[^0-9]*([0-9]+).*/\1/p' | head -1)
    if _vd_is_uint "$missing_n" && [ "$((10#$missing_n))" -eq 0 ]; then
        ok "dns-maintenance verify: 0 missing records"
    else
        fail "dns-maintenance verify: records missing ($verify_out)"
    fi

    # 14. Kernel (compare against cluster newest + own installed)
    local k k_latest
    k=$(ssh_run "$ip" "uname -r")
    k_latest=$(ssh_run "$ip" "dpkg -l 'linux-image-*-generic' 2>/dev/null | awk '/^ii/{print \$2}' | sort -V | tail -1 | sed 's/linux-image-//'")
    if [ "$k" = "$KERNEL_REF" ]; then
        ok "kernel $k (cluster newest)"
    elif [ -n "$k_latest" ] && [ "$k" != "$k_latest" ]; then
        warn "kernel $k running, $k_latest installed — reboot needed (cluster newest: $KERNEL_REF)"
    else
        warn "kernel $k differs from cluster newest $KERNEL_REF"
    fi

    # 15. Pending updates
    local upd
    upd=$(ssh_run "$ip" "apt list --upgradable 2>/dev/null | grep -cv '^Listing'")
    if [ "$upd" = "0" ]; then
        ok "no pending apt updates"
    else
        warn "$upd pending apt updates"
    fi

    # 16. jq source (apt vs snap) -- CRITICAL for drain during shutdown
    local jq_path
    jq_path=$(ssh_run "$ip" "readlink -f \$(which jq 2>/dev/null) 2>/dev/null")
    if [ "$jq_path" = "/usr/bin/jq" ]; then
        ok "jq is apt-packaged (/usr/bin/jq)"
    elif [[ "$jq_path" == *snap* ]]; then
        fail "jq is SNAP-packaged ($jq_path) -- drain will fail during shutdown! Run: snap remove jq && apt install -y jq"
    else
        fail "jq not found or at unexpected path: '$jq_path'"
    fi

    # 17. FIPS MUST be inactive (runtime + kernel + grub.d + fips-kernel-pkgs)
    local fips_runtime fips_kernel fips_grub fips_pkgs
    fips_runtime=$(ssh_run "$ip" "cat /proc/sys/crypto/fips_enabled 2>/dev/null || echo 0")
    fips_kernel=$(ssh_run "$ip" "uname -r | grep -c -- '-fips' || true")
    fips_grub=$(ssh_run "$ip" "ls /etc/default/grub.d/*fips* 2>/dev/null | wc -l")
    fips_pkgs=$(ssh_run "$ip" "dpkg -l 2>/dev/null | grep -cE '^ii.*linux-image.*-fips' || true")
    if [ "$fips_runtime" = "0" ] && [ "$fips_kernel" = "0" ] && [ "$fips_grub" = "0" ] && [ "$fips_pkgs" = "0" ]; then
        ok "FIPS clean (runtime=0, kernel=non-fips, grub.d=0, fips-kernel-pkgs=0)"
    else
        fail "FIPS DETECTED -- REBOOT RISK! runtime=$fips_runtime kernel-suffix=$fips_kernel grub.d-cfgs=$fips_grub fips-kernel-pkgs=$fips_pkgs (see docs/FIPS.md)"
    fi

    # 18. Claude Code version (compare against cluster newest)
    local cc_ver
    cc_ver=$(ssh_run "$ip" "/root/.local/bin/claude --version 2>/dev/null | grep -oP '^\S+'")
    if [ -z "$cc_ver" ]; then
        warn "Claude Code not found"
    elif [ "$cc_ver" = "$CLAUDE_REF" ]; then
        ok "Claude Code $cc_ver (cluster newest)"
    else
        warn "Claude Code $cc_ver, cluster newest $CLAUDE_REF — run: claude update"
    fi

    # 19. known_hosts SSH banner matches live banner (auto-refresh if stale)
    # Detects stale entries e.g. after sshd package changes (FIPS add/remove, version bumps).
    # The banner is kept in known_hosts as a "# <ip>:22 <banner>" comment line.
    # This check runs ssh-keyscan locally and edits the LOCAL known_hosts file.
    local kh=/root/.ssh/known_hosts
    local stored_banner live_banner
    stored_banner=$(grep -m1 "^# $ip:22 " "$kh" 2>/dev/null | sed "s|^# $ip:22 ||")
    # Stdout is discarded and only stderr is parsed for the "# <ip>:22" line.
    live_banner=$(ssh-keyscan -T 3 -t ecdsa "$ip" 2>&1 1>/dev/null | sed -n "s|^# $ip:22 ||p" | head -1)
    if [ -z "$live_banner" ]; then
        warn "known_hosts banner: could not fetch live banner from $ip:22"
    elif [ "$stored_banner" = "$live_banner" ]; then
        ok "known_hosts banner matches live ($live_banner)"
    else
        # Drop the old keys and the old banner comment, then append a fresh scan.
        ssh-keygen -R "$ip" -f "$kh" >/dev/null 2>&1
        sed -i "\|^# $ip:22 |d" "$kh"
        local scan
        scan=$(ssh-keyscan -H -T 3 "$ip" 2>&1)
        echo "$scan" | grep "^# $ip:22 " | head -1 >> "$kh"
        echo "$scan" | grep -v "^#" >> "$kh"
        if [ -z "$stored_banner" ]; then
            warn "known_hosts had no banner for $ip — added ('$live_banner')"
        else
            warn "known_hosts banner was stale for $ip — refreshed (was: '$stored_banner', now: '$live_banner')"
        fi
    fi

    # 20. Reboot/restart needed (Ubuntu reboot-required flag + needrestart services)
    local reboot_pkgs restart_svcs
    if ssh_run "$ip" "test -f /var/run/reboot-required"; then
        reboot_pkgs=$(ssh_run "$ip" "cat /var/run/reboot-required.pkgs 2>/dev/null" | tr '\n' ',' | sed 's/,$//')
        warn "reboot required (pkgs: ${reboot_pkgs:-unknown})"
    else
        restart_svcs=$(ssh_run "$ip" "needrestart -b 2>/dev/null" | awk -F: '/^NEEDRESTART-SVC:/{print $2}' | sed 's/^ //' | paste -sd,)
        if [ -n "$restart_svcs" ]; then
            warn "services need restart (libs updated): $restart_svcs"
        else
            ok "no reboot/restart pending"
        fi
    fi

    # 21. ALERT_WEBHOOK_SLACK present + non-empty (prefer /etc/cert-pki/alerting.env 0600)
    local slack_hook hook_src env_mode env_leak
    # Primary source (value is truncated to 80 bytes; only prefix/length are used)
    slack_hook=$(ssh_run "$ip" "grep -E '^ALERT_WEBHOOK_SLACK=' /etc/cert-pki/alerting.env 2>/dev/null | head -1 | cut -d= -f2- | tr -d '\"'" | head -c 80)
    hook_src="alerting.env"
    # Fallback: legacy /etc/environment
    if [ -z "$slack_hook" ]; then
        slack_hook=$(ssh_run "$ip" "grep -E '^ALERT_WEBHOOK_SLACK=' /etc/environment 2>/dev/null | head -1 | cut -d= -f2- | tr -d '\"'" | head -c 80)
        hook_src="environment(legacy)"
    fi
    if [ -z "$slack_hook" ]; then
        fail "ALERT_WEBHOOK_SLACK missing in /etc/cert-pki/alerting.env AND /etc/environment — alerts would silently drop"
    elif [[ "$slack_hook" != https://hooks.slack.com/* ]]; then
        fail "ALERT_WEBHOOK_SLACK does not look like a Slack webhook URL (src: $hook_src)"
    else
        # Check 0600 mode on alerting.env if used + warn if legacy /etc/environment still leaks the secret
        if [ "$hook_src" = "alerting.env" ]; then
            env_mode=$(ssh_run "$ip" "stat -c '%a' /etc/cert-pki/alerting.env 2>/dev/null")
            env_leak=$(ssh_run "$ip" "grep -cE '^ALERT_WEBHOOK_SLACK=' /etc/environment 2>/dev/null")
            # An empty/garbled count (remote read failed) is treated as 0 so
            # the numeric comparison below cannot error out.
            _vd_is_uint "$env_leak" || env_leak=0
            if [ "$env_mode" != "600" ]; then
                fail "ALERT_WEBHOOK_SLACK in alerting.env but mode=$env_mode (expected 0600 — secret world-readable!)"
            elif [ "$env_leak" -gt 0 ]; then
                warn "ALERT_WEBHOOK_SLACK present in BOTH alerting.env AND /etc/environment — remove the legacy copy"
            else
                ok "ALERT_WEBHOOK_SLACK in alerting.env (mode 600, ${#slack_hook} chars)"
            fi
        else
            warn "ALERT_WEBHOOK_SLACK only in legacy /etc/environment (world-readable) — migrate to /etc/cert-pki/alerting.env 0600"
        fi
    fi

    # 22. verify-deployment.timer enabled + active on this server (monitoring-of-monitoring)
    local vd_timer_enabled vd_timer_active
    vd_timer_enabled=$(ssh_run "$ip" "systemctl is-enabled verify-deployment.timer 2>/dev/null")
    vd_timer_active=$(ssh_run "$ip" "systemctl is-active verify-deployment.timer 2>/dev/null")
    if [ "$vd_timer_enabled" = "enabled" ] && [ "$vd_timer_active" = "active" ]; then
        ok "verify-deployment.timer enabled + active"
    else
        fail "verify-deployment.timer not healthy (enabled=$vd_timer_enabled active=$vd_timer_active) — rotating alerts would stop"
    fi

    # 23. Time synchronization (systemd-timesyncd or chrony — cert validation is time-sensitive)
    local ntp_sync
    ntp_sync=$(ssh_run "$ip" "timedatectl show --property=NTPSynchronized --value 2>/dev/null")
    if [ "$ntp_sync" = "yes" ]; then
        ok "time synchronized (NTPSynchronized=yes)"
    else
        fail "time NOT synchronized (NTPSynchronized=$ntp_sync) — cert/OCSP/log timestamps drift"
    fi

    # 24. Live TLS handshake on localhost:443 — proves Apache serves the real cert, not just holds the file
    # SNI is the first non-127.* ServerName found in the enabled sites, with a
    # fixed fallback. The handshake is run twice: once for the diagnostic text
    # and once for the exit status.
    local tls_sni tls_out tls_exit
    tls_sni=$(ssh_run "$ip" "cat /etc/apache2/sites-enabled/*.conf 2>/dev/null | grep -oP 'ServerName\s+\K\S+' | grep -v '^127\.' | head -1")
    [ -z "$tls_sni" ] && tls_sni="cert.business.db-app.dev"
    tls_out=$(ssh_run "$ip" "echo | openssl s_client -connect 127.0.0.1:443 -servername '$tls_sni' -verify_return_error 2>&1 | grep -E 'Verify return code|subject=' | head -2")
    tls_exit=$(ssh_run "$ip" "echo | openssl s_client -connect 127.0.0.1:443 -servername '$tls_sni' -verify_return_error >/dev/null 2>&1; echo \$?")
    if [ "$tls_exit" = "0" ]; then
        ok "TLS handshake localhost:443 (SNI=$tls_sni) ok"
    else
        fail "TLS handshake localhost:443 (SNI=$tls_sni) FAILED: $tls_out"
    fi

    # 25. Disk: root filesystem must be < 85% used
    local disk_pct
    disk_pct=$(ssh_run "$ip" "df -P / | awk 'NR==2 {gsub(/%/,\"\"); print \$5}'")
    if ! _vd_is_uint "$disk_pct"; then
        warn "disk usage could not be read"
    elif [ "$disk_pct" -ge 85 ]; then
        fail "disk / is ${disk_pct}% full (threshold 85%)"
    else
        ok "disk / ${disk_pct}% used"
    fi

    # 26. Memory: used / total must be < 90%
    local mem_pct
    mem_pct=$(ssh_run "$ip" "free | awk '/^Mem:/{printf \"%d\", \$3*100/\$2}'")
    if ! _vd_is_uint "$mem_pct"; then
        warn "memory usage could not be read"
    elif [ "$mem_pct" -ge 90 ]; then
        fail "memory is ${mem_pct}% used (threshold 90%)"
    else
        ok "memory ${mem_pct}% used"
    fi

    # 27. CPU load: 1-min loadavg must be < (nproc * 2)
    # Compared as integers scaled by 100 because the shell has no float math.
    # Missing/garbled readings give a WARN instead of a bogus FAIL against a
    # threshold of 0.
    local load_1 nprocs load_x100 threshold
    load_1=$(ssh_run "$ip" "awk '{print \$1}' /proc/loadavg")
    nprocs=$(ssh_run "$ip" "nproc")
    if ! [[ "$load_1" =~ ^[0-9]+([.][0-9]+)?$ ]] || ! _vd_is_uint "$nprocs"; then
        warn "load could not be read"
    else
        load_x100=$(awk -v l="$load_1" 'BEGIN{printf "%d", l*100}')
        threshold=$(( 10#$nprocs * 200 ))
        if [ "$load_x100" -ge "$threshold" ]; then
            fail "load ${load_1} over ${nprocs} CPUs (1-min >= ${nprocs}*2)"
        else
            ok "load ${load_1} over ${nprocs} CPUs"
        fi
    fi

    # 28. Cloudflared tunnel: readyConnections via /ready metrics endpoint (QUIC-based, not visible in ss -t)
    local cf_ready cf_status
    cf_ready=$(ssh_run "$ip" "curl -s --max-time 3 http://127.0.0.1:20241/ready 2>/dev/null")
    cf_status=$(echo "$cf_ready" | grep -oP '"readyConnections":\K[0-9]+' | head -1)
    if [ -z "$cf_status" ]; then
        fail "cloudflared /ready endpoint unreachable (tunnel health unknown — service 'active' alone isn't enough)"
    elif [ "$cf_status" -lt 1 ]; then
        fail "cloudflared readyConnections=$cf_status (tunnel DOWN)"
    elif [ "$cf_status" -lt 4 ]; then
        warn "cloudflared readyConnections=$cf_status/4 (degraded)"
    else
        ok "cloudflared readyConnections=$cf_status"
    fi
}

# --- Main ---
# Print the banner, gather the reference values, then audit either the single
# requested server or all five (a blank line follows each server only in the
# all-servers case).
echo -e "${BLUE}Cert-PKI Deployment Verification${NC}  $(date -Iseconds)"
echo ""
init_references

if [ -z "$ONLY_SERVER" ]; then
    for n in 1 2 3 4 5; do
        check_server "$n"
        echo ""
    done
else
    check_server "$ONLY_SERVER"
fi

# --- Summary ---
# One line per audited server: green when it has no FAIL, red otherwise.
# The WARN count is appended only when it is non-zero.
echo -e "${BLUE}=== Summary ===${NC}"
for n in 1 2 3 4 5; do
    if [ -n "$ONLY_SERVER" ] && [ "$ONLY_SERVER" != "$n" ]; then
        continue
    fi

    sname="${SERVERS[$n]##*:}"
    sok="${OK_COUNT[$n]:-0}"
    sfail="${FAIL_COUNT[$n]:-0}"
    swarn="${WARN_COUNT[$n]:-0}"

    warn_part=""
    if [ "$swarn" -gt 0 ]; then
        warn_part=", $swarn WARN"
    fi

    line_color="$RED"
    if [ "$sfail" = "0" ]; then
        line_color="$GREEN"
    fi
    echo -e "  ${line_color}[$n] $sname: $sok OK, $sfail FAIL${warn_part}${NC}"
done
echo ""

# Exit status: 0 when no check failed anywhere, 1 otherwise.
if [ "$TOTAL_FAIL" != "0" ]; then
    echo -e "${RED}Total: $TOTAL_FAIL FAIL across cluster.${NC}"
    exit 1
fi
echo -e "${GREEN}All servers OK.${NC}"
exit 0