#!/bin/bash
# =============================================================================
# Alert-Service-Failure — Slack-Alert für systemd-Service-Failures
# =============================================================================
# Wird von systemd via OnFailure= aufgerufen. Postet einen Slack-Alert mit
# Hostname, Unit-Name, Failure-Reason und den letzten 20 Journal-Zeilen der
# fehlgeschlagenen Invocation.
#
# Verwendung: alert-service-failure.sh <unit-name>
# Install: /etc/systemd/system/<unit>-alert@.service Template ruft dieses Script auf.
# =============================================================================

# Treat unset variables as errors and make a pipeline fail when any of its
# stages fails. errexit (-e) is not enabled, so a failing command does not
# stop the script by itself.
set -o nounset
set -o pipefail

# Tag used for every syslog entry written via logger.
LOG_TAG="alert-service-failure"

# Name of the failed unit: first positional argument, or a placeholder when
# the script is started without one.
UNIT="${1:-unknown.service}"

HOSTNAME="$(hostname)"
# First whitespace-separated field of `hostname -I`; stays empty when the
# command is unavailable or prints nothing.
MY_IPV4="$(hostname -I 2>/dev/null | awk '{ print $1 }')"

# Webhook secrets: prefer /etc/cert-pki/alerting.env (0600); /etc/environment
# (legacy, 0644) is only a fallback for keys the preferred file does not define.

#######################################
# Export ALERT_WEBHOOK_* variables found in KEY=VALUE env files.
#
# Files are passed in priority order. A key taken from an earlier file is
# never overridden by a later file; within one file the last assignment of a
# key wins. One leading and one trailing double quote are stripped from the
# value. Lines whose key is not a valid ALERT_WEBHOOK_* identifier are ignored.
#
# Arguments: $@ - env file paths, highest priority first; paths that are not
#                 readable regular files are skipped
# Globals:   exports every accepted ALERT_WEBHOOK_* variable
#######################################
load_alert_webhooks() {
    local envfile line key val
    local dq='"'
    local -A origin=()   # key -> file that supplied it

    for envfile in "$@"; do
        [[ -f "$envfile" && -r "$envfile" ]] || continue
        # `|| [[ -n "$line" ]]` keeps a last line that has no trailing newline.
        while IFS= read -r line || [[ -n "$line" ]]; do
            # Split on the first '=' only, so values may contain (or end in) '='.
            key="${line%%=*}"
            [[ "$key" =~ ^ALERT_WEBHOOK_[A-Za-z0-9_]*$ ]] || continue
            # Keep the value from the higher-priority file.
            if [[ -n "${origin[$key]:-}" && "${origin[$key]}" != "$envfile" ]]; then
                continue
            fi
            if [[ "$line" == *=* ]]; then
                val="${line#*=}"
            else
                val=""
            fi
            val="${val#"$dq"}"
            val="${val%"$dq"}"
            export "$key=$val"
            origin["$key"]="$envfile"
        done < "$envfile"
    done
}

load_alert_webhooks /etc/cert-pki/alerting.env /etc/environment
# Guarantee the variable exists (possibly empty) so later reads are safe
# under `set -u`.
ALERT_WEBHOOK_SLACK="${ALERT_WEBHOOK_SLACK:-}"

# Per-host SSH URL shown in the alert, keyed by the exact hostname string.
declare -A SSH_URLS=(
    ["Cert-Server-0-NBG"]="https://cert-server-0-nbg-ssh.db-app.dev/"
    ["Cert-Server-1-NBG"]="https://cert-server-1-nbg-ssh.db-app.dev/"
    ["Cert-Server-0-FSN"]="https://cert-server-0-fsn-ssh.db-app.dev/"
    ["Cert-Server-1-FSN"]="https://cert-server-1-fsn-ssh.db-app.dev/"
    ["Cert-Server-HEL"]="https://cert-server-hel-ssh.db-app.dev/"
)

#######################################
# Print a single systemd property of a unit.
# Arguments: $1 - unit name
#            $2 - property name (as accepted by `systemctl show -p`)
#            $3 - fallback text printed when systemctl fails
# Outputs:   the property value, or the fallback, on stdout
#######################################
unit_prop() {
    systemctl show "$1" -p "$2" --value 2>/dev/null || printf '%s\n' "$3"
}

#######################################
# Print the most recent journal lines for a failed unit.
#
# When an invocation ID is known, only entries of that invocation are
# requested: messages logged by the unit's own processes
# (_SYSTEMD_INVOCATION_ID=) or by the service manager about that invocation
# (INVOCATION_ID=). If no ID is available, or that query returns nothing,
# the unit-wide `journalctl -u` query is used instead so the alert still
# carries context.
#
# Arguments: $1 - unit name
#            $2 - invocation ID (may be empty)
#            $3 - maximum number of lines (default 20)
# Outputs:   journal messages (`-o cat`) on stdout
#######################################
failure_journal() {
    local unit="$1" invocation="$2" lines="${3:-20}" out=""

    if [[ -n "$invocation" ]]; then
        # "+" joins the two match groups with a logical OR.
        out=$(journalctl -n "$lines" --no-pager -o cat \
            "_SYSTEMD_INVOCATION_ID=${invocation}" + "INVOCATION_ID=${invocation}" \
            2>/dev/null)
    fi
    if [[ -z "$out" ]]; then
        out=$(journalctl -u "$unit" -n "$lines" --no-pager -o cat 2>/dev/null)
    fi
    printf '%s' "$out"
}

# Snapshot of the failed unit's state as reported by systemd.
RESULT=$(unit_prop "$UNIT" Result "unknown")
ACTIVE_STATE=$(unit_prop "$UNIT" ActiveState "unknown")
SUB_STATE=$(unit_prop "$UNIT" SubState "unknown")
EXEC_STATUS=$(unit_prop "$UNIT" ExecMainStatus "")
EXEC_CODE=$(unit_prop "$UNIT" ExecMainCode "")
INVOCATION=$(unit_prop "$UNIT" InvocationID "")

# Last 20 journal lines, limited to the failed invocation where possible.
JOURNAL_RAW=$(failure_journal "$UNIT" "$INVOCATION")

# SSH URL registered for this host; empty when the hostname has no entry.
SSH_URL="${SSH_URLS[$HOSTNAME]:-}"

# One-line failure summary. Exit details are appended only when
# ExecMainStatus yielded a non-empty value.
REASON="result=${RESULT} state=${ACTIVE_STATE}/${SUB_STATE}"
if [[ -n "$EXEC_STATUS" ]]; then
    REASON+=" exit=${EXEC_STATUS}(code=${EXEC_CODE})"
fi

# Assemble the Slack message line by line; the journal excerpt sits between
# two code fences. printf terminates every element with a newline, so the
# single trailing newline is removed afterwards.
_fence='```'
_alert_lines=(
    ":rotating_light: *Service failure on \`${HOSTNAME}\`* (${MY_IPV4:-?})"
    "*Unit:* \`${UNIT}\`"
    "*Reason:* ${REASON}"
    "*SSH:* ${SSH_URL:-n/a}"
    "*Last 20 journal lines:*"
    "${_fence}"
    "${JOURNAL_RAW}"
    "${_fence}"
)
printf -v TEXT '%s\n' "${_alert_lines[@]}"
TEXT="${TEXT%$'\n'}"
unset _fence _alert_lines

# Without a webhook URL nothing can be posted: note it in syslog and stop
# with exit status 0.
if [[ -z "$ALERT_WEBHOOK_SLACK" ]]; then
    logger -t "$LOG_TAG" -p "local0.err" \
        "WEBHOOK SKIPPED: ALERT_WEBHOOK_SLACK not set (unit=$UNIT)"
    exit 0
fi

# Build the JSON body first. If jq is missing or fails, posting an empty body
# would only surface as a misleading HTTP error from Slack, so report the
# real cause and skip the request.
if ! payload=$(jq -n --arg t "$TEXT" '{text: $t}' 2>/dev/null) || [[ -z "$payload" ]]; then
    logger -t "$LOG_TAG" -p "local0.err" \
        "WEBHOOK FAILED: could not build JSON payload with jq for unit=$UNIT"
else
    # curl prints only the HTTP status code (-w); the response body is
    # discarded. -m 10 caps the whole request at 10 seconds. Any curl failure
    # is reported as status "000".
    http_code=$(curl -s -o /dev/null -w '%{http_code}' -m 10 -X POST "$ALERT_WEBHOOK_SLACK" \
        -H "Content-Type: application/json" \
        -d "$payload" 2>/dev/null) || http_code="000"

    # Only HTTP 200 counts as a delivered alert.
    if [[ "$http_code" == "200" ]]; then
        logger -t "$LOG_TAG" -p "local0.info" \
            "Slack alert posted for failed unit=$UNIT (reason: $REASON)"
    else
        logger -t "$LOG_TAG" -p "local0.err" \
            "WEBHOOK FAILED: Slack returned HTTP ${http_code} for unit=$UNIT"
    fi
fi
