#!/bin/bash
# =============================================================================
# Verify-and-Alert — cluster health check with Slack/Canari alerting
# =============================================================================
# Runs verify-deployment.sh (without --quiet, so the per-server headers stay
# available for parsing) and sends the result to the configured webhooks.
# Runs in rotation on all 5 servers (Mon-Fri, staggered every 2h).
#
# Usage:
#   verify-and-alert.sh           # Normal run (via systemd timer)
#   verify-and-alert.sh --test    # Test mode: always sends an alert
#                                 # NOTE(review): no code in this script reads
#                                 # its arguments, and every run already sends
#                                 # a message — confirm whether --test is still
#                                 # needed.
# =============================================================================

# Treat unset variables as errors and let a pipeline fail when any stage
# fails. errexit (-e) is not enabled; later code inspects exit codes itself.
set -u
set -o pipefail

# Tag used for syslog entries written via logger.
LOG_TAG="verify-and-alert"

# Identity of the reporting host. MY_IPV4 is the first address field printed
# by `hostname -I` and stays empty when that call yields nothing.
HOSTNAME="$(hostname)"
MY_IPV4="$(hostname -I 2>/dev/null | awk '{ print $1 }')"

# The verify script is looked up in the directory that contains this file.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
VERIFY_SCRIPT="${SCRIPT_DIR}/verify-deployment.sh"

# Webhook secrets: prefer /etc/cert-pki/alerting.env (0600) — fallback /etc/environment (legacy, 0644)

#######################################
# Export every ALERT_WEBHOOK_* assignment found in the given env files.
# Globals:   ALERT_WEBHOOK_* (exported)
# Arguments: env files, most preferred first. Missing or unreadable files
#            are skipped.
# Notes:     Lines are plain KEY=VALUE; one pair of surrounding double
#            quotes is stripped from the value. When the same key appears in
#            several files, the value from the most preferred file wins.
#######################################
load_alert_webhooks() {
    local idx envfile key val

    # Walk the list back to front: the least preferred file is read first and
    # the preferred one last, so the preferred value overwrites the fallback.
    # (Reading front to back let /etc/environment clobber alerting.env.)
    for (( idx = $#; idx >= 1; idx-- )); do
        envfile="${!idx}"
        [[ -f "$envfile" && -r "$envfile" ]] || continue

        # `|| [[ -n "$key" ]]` keeps a final line that lacks a trailing newline.
        while IFS='=' read -r key val || [[ -n "$key" ]]; do
            # Only accept keys that are valid identifiers, so `export` cannot
            # be handed a malformed name.
            [[ "$key" =~ ^ALERT_WEBHOOK_[A-Za-z0-9_]*$ ]] || continue
            val="${val#\"}"
            val="${val%\"}"
            export "${key}=${val}"
        done < "$envfile"
    done
    return 0
}

load_alert_webhooks /etc/cert-pki/alerting.env /etc/environment

# Make sure both variables exist (possibly empty) so `set -u` does not trip.
ALERT_WEBHOOK_SLACK="${ALERT_WEBHOOK_SLACK:-}"
ALERT_WEBHOOK_CANARI="${ALERT_WEBHOOK_CANARI:-}"

# Web SSH URL per server name, used to build the SSH links in Slack messages
declare -A SSH_URLS=(
    ["Cert-Server-1-NBG"]="https://cert-server-1-nbg-ssh.db-app.dev/"
    ["Cert-Server-0-NBG"]="https://cert-server-0-nbg-ssh.db-app.dev/"
    ["Cert-Server-0-FSN"]="https://cert-server-0-fsn-ssh.db-app.dev/"
    ["Cert-Server-1-FSN"]="https://cert-server-1-fsn-ssh.db-app.dev/"
    ["Cert-Server-HEL"]="https://cert-server-hel-ssh.db-app.dev/"
)

# Literal newline for composing multi-line messages
NL=$'\n'

# Set to 1 by send_slack when the Slack message could not be delivered
SLACK_SEND_FAILED=0

#######################################
# Post a text message to the Slack webhook.
# Globals:   ALERT_WEBHOOK_SLACK (read), LOG_TAG (read),
#            SLACK_SEND_FAILED (set to 1 on any delivery problem)
# Arguments: $1 - message text
# Outputs:   a local0.err syslog entry when the webhook is not configured or
#            the response code is not 200
#######################################
send_slack() {
    local text="$1"
    local payload status

    # Without a configured webhook nothing can be sent; that counts as failure.
    if [[ -z "$ALERT_WEBHOOK_SLACK" ]]; then
        logger -t "$LOG_TAG" -p "local0.err" \
            "WEBHOOK MISSING: ALERT_WEBHOOK_SLACK not set"
        SLACK_SEND_FAILED=1
        return
    fi

    # jq builds the JSON body so the text is escaped correctly.
    payload=$(jq -n --arg t "$text" '{text: $t}')

    # curl prints only the HTTP status code; a curl error is reported as "000".
    if ! status=$(curl -s -o /dev/null -w '%{http_code}' -m 10 -X POST "$ALERT_WEBHOOK_SLACK" \
        -H "Content-Type: application/json" \
        -d "$payload" 2>/dev/null); then
        status="000"
    fi

    if [[ "$status" != "200" ]]; then
        logger -t "$LOG_TAG" -p "local0.err" \
            "WEBHOOK FAILED: Slack returned HTTP ${status}"
        SLACK_SEND_FAILED=1
    fi
}

#######################################
# Post a structured event to the Canari webhook, if one is configured.
# Globals:   ALERT_WEBHOOK_CANARI, HOSTNAME, MY_IPV4, SSH_URLS, LOG_TAG (read)
# Arguments: $1 - severity string
#            $2 - message text
# Outputs:   a local0.err syslog entry when the response code is not 200.
#            Unlike send_slack, a failure here does not set SLACK_SEND_FAILED.
#######################################
send_canari() {
    local severity="$1" message="$2"
    local now payload status

    # Canari is optional: skip when no webhook is configured.
    [[ -n "$ALERT_WEBHOOK_CANARI" ]] || return 0

    now=$(date -u +%Y-%m-%dT%H:%M:%SZ)
    payload=$(jq -n \
        --arg s "$severity" \
        --arg m "$message" \
        --arg server "$HOSTNAME" \
        --arg reporter "$HOSTNAME" \
        --arg reporter_ip "${MY_IPV4:-unknown}" \
        --arg ssh_url "${SSH_URLS[$HOSTNAME]:-}" \
        --arg t "$now" \
        '{severity: $s, message: $m, server: $server, reporter: $reporter, reporter_ip: $reporter_ip, ssh_url: $ssh_url, timestamp: $t}')

    # curl prints only the HTTP status code; a curl error is reported as "000".
    if ! status=$(curl -s -o /dev/null -w '%{http_code}' -m 5 -X POST "$ALERT_WEBHOOK_CANARI" \
        -H "Content-Type: application/json" \
        -d "$payload" 2>/dev/null); then
        status="000"
    fi

    if [[ "$status" != "200" ]]; then
        logger -t "$LOG_TAG" -p "local0.err" \
            "WEBHOOK FAILED: Canari returned HTTP ${status}"
    fi
}

# Run the verify script (without --quiet, so the per-server headers are
# visible) and keep both its combined output and its exit status.
output=$("$VERIFY_SCRIPT" 2>&1)
exit_code=$?

# Strip ANSI colour sequences from the whole output
clean_output=$(sed 's/\x1b\[[0-9;]*m//g' <<< "$output")

# Totals line: number after "Total: " (defaults to 0 when absent) and the
# count of lines containing "All servers OK"
total_fail=$(grep -oP 'Total: \K[0-9]+' <<< "$clean_output" || echo "0")
all_ok=$(grep -c "All servers OK" <<< "$clean_output" || true)

# Per-server summary lines ("[n] ... OK ... FAIL"), two leading spaces removed
summary=$(grep -E '\[[0-9]\].*OK.*FAIL' <<< "$clean_output" | sed 's/^  //')

# Collect [FAIL] and [WARN] lines in one pass. Each is prefixed with the
# server named in the most recent "=== [n]" header line.
fails=""
warns=""
current_server=""
header_re='=== \[[0-9]\]'
while IFS= read -r line; do
    if [[ "$line" =~ $header_re ]]; then
        current_server=$(grep -oP 'Cert-Server-[^ ]+' <<< "$line")
        continue
    fi
    entry="${current_server}: ${line#  }${NL}"
    if [[ "$line" == *'[FAIL]'* ]]; then
        fails+="$entry"
    fi
    if [[ "$line" == *'[WARN]'* ]]; then
        warns+="$entry"
    fi
done <<< "$clean_output"

# Drop empty lines (and the trailing newline) from the warning list
warns=$(sed '/^$/d' <<< "$warns")

# SSH link of the reporting host and a human-readable check time
my_ssh="${SSH_URLS[$HOSTNAME]:-}"
timestamp=$(date '+%Y-%m-%d %H:%M %Z')

#######################################
# Print the names of servers whose summary line reports at least one FAIL.
# Input:   summary lines on stdin; presumably of the shape
#          "[1] <server>: <n> OK, <m> FAIL ..." (inferred from the patterns
#          used to parse them — confirm against verify-deployment.sh).
# Outputs: one server name per line (text between "] " and the first ":")
#######################################
servers_with_failures() {
    local line
    # "0 FAIL" only counts as zero when no digit precedes it; a plain
    # substring match would also discard "10 FAIL", "20 FAIL", ...
    local -r zero_fail_re='(^|[^0-9])0 FAIL'
    local -r name_re='\] ([^:]+)'
    while IFS= read -r line; do
        [[ "$line" =~ $zero_fail_re ]] && continue
        if [[ "$line" =~ $name_re ]]; then
            printf '%s\n' "${BASH_REMATCH[1]}"
        fi
    done
    return 0
}

#######################################
# Turn server names into " | "-separated Slack links.
# Globals: SSH_URLS (read)
# Input:   server names on stdin, one per line; surrounding whitespace is
#          ignored, names without an SSH_URLS entry are skipped
# Outputs: the joined "<url|name>" links (no trailing newline)
#######################################
ssh_links_for() {
    local srv links=""
    while IFS= read -r srv; do
        srv="${srv#"${srv%%[![:space:]]*}"}"
        srv="${srv%"${srv##*[![:space:]]}"}"
        [[ -n "$srv" ]] || continue
        [[ -n "${SSH_URLS[$srv]:-}" ]] || continue
        [[ -n "$links" ]] && links+=" | "
        links+="<${SSH_URLS[$srv]}|${srv}>"
    done
    printf '%s' "$links"
}

# Reporter line, check time and (when parsed) the per-server summary are
# identical for both alert variants.
msg_context="Server: *${HOSTNAME}* (${MY_IPV4:-unknown})"
[[ -n "$my_ssh" ]] && msg_context+=" (<${my_ssh}|SSH>)"
msg_context+="${NL}Geprueft um: ${timestamp}"
if [[ -n "$summary" ]]; then
    msg_context+="${NL}\`\`\`${NL}${summary}${NL}\`\`\`"
fi

if [[ "$all_ok" -gt 0 && -z "$fails" ]]; then
    # Everything OK
    msg="ℹ️ *[INFO]* Cluster-Check OK — alle Server gesund${NL}${msg_context}"

    if [[ -n "$warns" ]]; then
        warn_count=$(wc -l <<< "$warns")
        # Newline before the closing fence, same as in the failure branch.
        msg+="${NL}⚠️ ${warn_count} Warnings:${NL}\`\`\`${NL}${warns}${NL}\`\`\`"

        # SSH links for the servers that have warnings
        affected=$(grep -oP 'Cert-Server-[^ :]+' <<< "$warns" | sort -u)
        ssh_links=$(ssh_links_for <<< "$affected")
        [[ -n "$ssh_links" ]] && msg+="${NL}SSH: ${ssh_links}"
    fi

    send_slack "$msg"
    send_canari "info" "Cluster-Check OK: alle Server gesund (geprueft von ${HOSTNAME})"
else
    # FAILs present, or the output could not be recognised as healthy
    msg="🔴 *[CRITICAL]* Cluster-Check FEHLGESCHLAGEN — ${total_fail} FAIL(s)!${NL}${msg_context}"

    if [[ -n "$fails" ]]; then
        # fails ends with a newline; drop it to avoid a blank line in the fence
        msg+="${NL}*Failures:*${NL}\`\`\`${NL}${fails%"$NL"}${NL}\`\`\`"
    else
        # No [FAIL] line was parsed (e.g. the verify script itself failed).
        # Show the exit code and the end of the output instead of an empty
        # Failures block, so the alert is actionable.
        msg+="${NL}*Keine [FAIL]-Zeilen in der Ausgabe gefunden* (Exit-Code verify-deployment.sh: ${exit_code:-unknown})"
        output_tail=""
        if [[ -n "${clean_output:-}" ]]; then
            output_tail=$(tail -n 15 <<< "$clean_output")
        fi
        [[ -n "$output_tail" ]] && msg+="${NL}\`\`\`${NL}${output_tail}${NL}\`\`\`"
    fi

    if [[ -n "$warns" ]]; then
        warn_count=$(wc -l <<< "$warns")
        msg+="${NL}⚠️ ${warn_count} Warnings:${NL}\`\`\`${NL}${warns}${NL}\`\`\`"
    fi

    # SSH links for the servers that report FAILs in the summary
    failed_servers=$(servers_with_failures <<< "$summary")
    ssh_links=$(ssh_links_for <<< "$failed_servers")
    [[ -n "$ssh_links" ]] && msg+="${NL}SSH: ${ssh_links}"

    send_slack "$msg"
    send_canari "critical" "Cluster-Check FEHLGESCHLAGEN: ${total_fail} FAIL(s) (geprueft von ${HOSTNAME})"
fi

# Record the outcome of this run in syslog
logger -t "verify-and-alert" \
    "Cluster-Check von ${HOSTNAME}: ${total_fail} FAIL, all_ok=${all_ok}, slack_send_failed=${SLACK_SEND_FAILED}"

# Slack delivery worked: pass the verify script's exit status through
if (( SLACK_SEND_FAILED == 0 )); then
    exit "$exit_code"
fi

# Slack delivery failed: log the reason and exit non-zero regardless of the
# verify result
logger -t "verify-and-alert" -p "local0.err" \
    "exit 1 due to failed Slack webhook — systemd OnFailure chain will fire alert-service-failure.sh"
exit 1
