#!/usr/bin/env bash
# ClawPulse Agent v2 — deep OpenClaw + system telemetry
# Usage: curl -sS https://www.clawpulse.org/agent.sh | sudo bash -s <TOKEN>
set -euo pipefail

TOKEN="${1:-}"
API="${CLAWPULSE_URL:-https://www.clawpulse.org}"
INTERVAL="${CLAWPULSE_INTERVAL:-30}"
OPENCLAW_PORT="${OPENCLAW_PORT:-8080}"

# Print the standard error + usage hint and abort the install.
# (Messages go to stdout, matching the original installer's behavior.)
fail_usage() {
  echo "[clawpulse] ERROR: $1"
  echo "  Usage: curl -sS https://www.clawpulse.org/agent.sh | sudo bash -s <TOKEN>"
  exit 1
}

[ -n "$TOKEN" ] || fail_usage "connection token required"
[ "$(id -u)" -eq 0 ] || fail_usage "root required to install the service."

AGENT_DIR="/opt/clawpulse"
AGENT_BIN="$AGENT_DIR/agent.sh"
mkdir -p "$AGENT_DIR"

cat > "$AGENT_BIN" << 'AGENT_EOF'
#!/usr/bin/env bash
# ClawPulse telemetry agent — generated by the installer and run as a service.
# Positional arguments (supplied by the installer when it launches the agent):
#   $1 token, $2 API base URL, $3 poll interval (s), $4 OpenClaw gateway port.
# NOTE(review): unlike the installer, -e is NOT set here — presumably so a
# single failed probe cannot kill the long-running loop; confirm intent.
set -uo pipefail

TOKEN="$1"
API="$2"
INTERVAL="$3"
OPENCLAW_PORT="$4"
HOME="${HOME:-/root}"  # service managers may start us without HOME set
STATE_DIR="/opt/clawpulse/state"  # persisted counters/offsets live here
mkdir -p "$STATE_DIR"

# ═══════════════════════════════════════════════════════
#  SYSTEM INFO — static host facts, gathered once at startup
# ═══════════════════════════════════════════════════════
if ! SYS_HOSTNAME=$(hostname -f 2>/dev/null); then
  SYS_HOSTNAME=$(hostname)
fi
SYS_OS="unknown"
if [ -f /etc/os-release ]; then
  # Source in a subshell so os-release variables don't leak into the agent
  SYS_OS=$(. /etc/os-release && echo "$PRETTY_NAME")
fi
SYS_KERNEL=$(uname -r 2>/dev/null) || SYS_KERNEL="unknown"
SYS_ARCH=$(uname -m 2>/dev/null) || SYS_ARCH="unknown"

# ═══════════════════════════════════════════════════════
#  OPENCLAW DISCOVERY (runs once, refreshed on PID change)
# ═══════════════════════════════════════════════════════
# Globals populated by discover_openclaw(); empty string means "unknown".
OC_PID_CACHED=""    # PID the values below were discovered for
OC_CONFIG_PATH=""   # resolved config file path
OC_LOG_PATH=""      # primary log source: file path, or a journald:* tag
OC_DATA_DIR=""      # data directory, if found
OC_VERSION=""       # version from package.json or `binary --version`
OC_MODEL=""         # LLM model name
OC_PROVIDER=""      # LLM provider (anthropic/openai/google/…)
OC_TOOLS=""         # comma-separated tool list parsed from the config
OC_SERVICE_NAME=""  # systemd unit name, when the process runs under one
OC_FILE_LOG=""      # plain-file log path (kept even when journald is primary)

# Deep-probe a running OpenClaw process and fill in the OC_* globals:
# systemd unit, config path, model/provider/tools, log source(s), data dir,
# version and sessions dir.  Every probe is best-effort — failures leave the
# corresponding global empty rather than aborting.
# Arguments: $1 - PID of the OpenClaw process (empty → return immediately)
# Globals written: OC_SERVICE_NAME OC_CONFIG_PATH OC_MODEL OC_PROVIDER
#   OC_TOOLS OC_LOG_PATH OC_FILE_LOG OC_DATA_DIR OC_VERSION OC_SESSIONS_DIR
#   OC_PID_CACHED
discover_openclaw() {
  local pid="$1"
  [ -z "$pid" ] && return

  # Find systemd service name for this PID (for journalctl)
  OC_SERVICE_NAME=""
  if command -v systemctl &>/dev/null; then
    # NOTE(review): the status header is usually "● <unit>.service - …"; the
    # leading bullet may defeat this ^-anchored match, in which case the
    # fallback name scan below does the work — verify on target systemd.
    OC_SERVICE_NAME=$(systemctl status "$pid" 2>/dev/null | head -1 | grep -oP '^\S+\.service' | sed 's/\.service$//' || echo "")
    # Fallback: search by common names
    if [ -z "$OC_SERVICE_NAME" ]; then
      for svc in openclaw-gateway openclaw openclaw-server; do
        if systemctl is-active "$svc" &>/dev/null; then
          OC_SERVICE_NAME="$svc"
          break
        fi
      done
    fi
  fi

  # Find working directory
  local cwd=""
  cwd=$(readlink -f /proc/$pid/cwd 2>/dev/null || echo "")

  # Find config file — search multiple common locations
  OC_CONFIG_PATH=""
  local search_dirs=("$cwd" "$cwd/.." "/etc/openclaw" "/opt/openclaw" "$HOME/.openclaw" "$HOME")
  local config_names=("openclaw.json" "openclaw.yaml" "openclaw.yml" "openclaw.toml" "config.json" "config.yaml" "config.yml" ".openclawrc" ".env")
  for dir in "${search_dirs[@]}"; do
    # `a || b && continue`: continue unless $dir is a non-empty, real directory
    [ -z "$dir" ] || [ ! -d "$dir" ] && continue
    for cfg in "${config_names[@]}"; do
      if [ -f "$dir/$cfg" ]; then
        OC_CONFIG_PATH="$dir/$cfg"
        break 2
      fi
    done
  done

  # Also check cmdline for --config flag
  if [ -z "$OC_CONFIG_PATH" ] && [ -f /proc/$pid/cmdline ]; then
    local cmdargs
    # cmdline is NUL-separated; join into a single space-separated string
    cmdargs=$(tr '\0' ' ' < /proc/$pid/cmdline 2>/dev/null || echo "")
    local cfg_from_arg
    cfg_from_arg=$(echo "$cmdargs" | grep -oP '(?:--config[= ]|--cfg[= ]|-c )\K\S+' || echo "")
    [ -n "$cfg_from_arg" ] && [ -f "$cfg_from_arg" ] && OC_CONFIG_PATH="$cfg_from_arg"
  fi

  # Parse config for model, provider, tools
  if [ -n "$OC_CONFIG_PATH" ] && [ -f "$OC_CONFIG_PATH" ]; then
    local content
    content=$(cat "$OC_CONFIG_PATH" 2>/dev/null || echo "")

    # Model detection — match known model name patterns (including provider/model format)
    OC_MODEL=$(echo "$content" | grep -oiP '(?:"model"|model)\s*[:=]\s*"?\K((?:[a-z]+-[a-z]+/)?(?:gpt-[a-z0-9.-]+|claude-[a-z0-9.-]+|gemini-[a-z0-9.-]+|mistral-[a-z0-9.-]+|llama[a-z0-9.-]*|o[134]-[a-z0-9.-]+))' | head -1 || echo "")
    [ -z "$OC_MODEL" ] && OC_MODEL=$(echo "$content" | grep -oiP '(?:OPENAI_MODEL|MODEL_NAME|ANTHROPIC_MODEL|LLM_MODEL)\s*=\s*\K[a-zA-Z0-9._/-]+' | head -1 || echo "")

    # Provider detection — first matching keyword wins
    OC_PROVIDER=""
    if echo "$content" | grep -qiP 'anthropic|claude'; then
      OC_PROVIDER="anthropic"
    elif echo "$content" | grep -qiP 'openai|gpt-|o1-|o3-'; then
      OC_PROVIDER="openai"
    elif echo "$content" | grep -qiP 'google|gemini|palm'; then
      OC_PROVIDER="google"
    elif echo "$content" | grep -qiP 'mistral'; then
      OC_PROVIDER="mistral"
    elif echo "$content" | grep -qiP 'groq'; then
      OC_PROVIDER="groq"
    elif echo "$content" | grep -qiP 'ollama|localhost:11434'; then
      OC_PROVIDER="ollama"
    fi

    # Tools detection — extract tool names (alphanumeric words from array)
    # NOTE(review): tr maps '\n' to ',' only (the space in the set2 ", " is
    # ignored), so the list comes out as "a,b,c"; trailing comma stripped.
    OC_TOOLS=$(echo "$content" | grep -oiP '(?:"tools"|tools)\s*[:=]\s*\[\s*\K[^\]]+' | head -1 | grep -oP '[a-zA-Z_][a-zA-Z0-9_-]*' | tr '\n' ', ' | sed 's/,$//' || echo "")
    [ -z "$OC_TOOLS" ] && OC_TOOLS=$(echo "$content" | grep -oiP '(?:enabled_tools|TOOLS)\s*=\s*\K[a-zA-Z0-9_,. -]+' | head -1 | sed 's/^ *//;s/ *$//' || echo "")
  fi

  # Try to extract model from gateway startup logs (journalctl or log file)
  if [ -z "$OC_MODEL" ] && command -v journalctl &>/dev/null; then
    local log_model=""
    if [ -n "$OC_SERVICE_NAME" ]; then
      log_model=$(journalctl -u "$OC_SERVICE_NAME" --no-pager -o cat -n 50 2>/dev/null | grep -aoP '\[gateway\] agent model: \K\S+' | tail -1 || echo "")
    elif [ -n "$pid" ]; then
      log_model=$(journalctl _PID="$pid" --no-pager -o cat -n 50 2>/dev/null | grep -aoP '\[gateway\] agent model: \K\S+' | tail -1 || echo "")
    fi
    [ -n "$log_model" ] && OC_MODEL="$log_model"
  fi

  # Also check environment variables of the process (readable because the
  # agent runs as root)
  if [ -f /proc/$pid/environ ]; then
    local env_data
    env_data=$(tr '\0' '\n' < /proc/$pid/environ 2>/dev/null || echo "")

    [ -z "$OC_MODEL" ] && OC_MODEL=$(echo "$env_data" | grep -oP '^(?:OPENAI_MODEL|MODEL_NAME|ANTHROPIC_MODEL|LLM_MODEL)=\K.*' | head -1 || echo "")

    # Infer provider from which API key is present in the environment
    if [ -z "$OC_PROVIDER" ]; then
      if echo "$env_data" | grep -q 'ANTHROPIC_API_KEY'; then
        OC_PROVIDER="anthropic"
      elif echo "$env_data" | grep -q 'OPENAI_API_KEY'; then
        OC_PROVIDER="openai"
      elif echo "$env_data" | grep -q 'GOOGLE_API_KEY'; then
        OC_PROVIDER="google"
      fi
    fi
  fi

  # Find log file
  OC_LOG_PATH=""
  # Priority 1: OpenClaw default log location
  local today_log="/tmp/openclaw/openclaw-$(date +%Y-%m-%d).log"
  [ -f "$today_log" ] && OC_LOG_PATH="$today_log"
  # Priority 2: search known directories for openclaw-named logs
  if [ -z "$OC_LOG_PATH" ]; then
    local log_candidates=("$cwd/logs" "$cwd/log" "$cwd" "/var/log/openclaw" "/tmp/openclaw")
    for ldir in "${log_candidates[@]}"; do
      [ -z "$ldir" ] || [ ! -d "$ldir" ] && continue
      local newest_log
      # ls -t sorts the (at most 5) candidates newest-first
      newest_log=$(find "$ldir" -maxdepth 2 \( -name "openclaw*.log" -o -name "output.log" \) 2>/dev/null | head -5 | xargs ls -t 2>/dev/null | head -1 || echo "")
      if [ -n "$newest_log" ]; then
        OC_LOG_PATH="$newest_log"
        break
      fi
    done
  fi
  # Also check if process has stdout redirected to a file
  if [ -z "$OC_LOG_PATH" ] && [ -L "/proc/$pid/fd/1" ]; then
    local stdout_target
    stdout_target=$(readlink -f /proc/$pid/fd/1 2>/dev/null || echo "")
    if [ -n "$stdout_target" ] && [ -f "$stdout_target" ]; then
      OC_LOG_PATH="$stdout_target"
    fi
  fi

  # Find data directory
  OC_DATA_DIR=""
  for dd in "$cwd/data" "$cwd/db" "$cwd/.data" "/var/lib/openclaw"; do
    [ -d "$dd" ] && OC_DATA_DIR="$dd" && break
  done

  # Version detection
  OC_VERSION=""
  # Check package.json
  [ -f "$cwd/package.json" ] && OC_VERSION=$(grep -oP '"version"\s*:\s*"\K[^"]+' "$cwd/package.json" 2>/dev/null || echo "")
  # Check --version from cmdline output (cached)
  if [ -z "$OC_VERSION" ]; then
    local bin_path
    bin_path=$(readlink -f /proc/$pid/exe 2>/dev/null || echo "")
    [ -n "$bin_path" ] && [ -x "$bin_path" ] && OC_VERSION=$("$bin_path" --version 2>/dev/null | head -1 | grep -oP '[0-9]+\.[0-9]+[.0-9]*' || echo "")
  fi

  # Save file-based log path before potentially overriding with journald
  OC_FILE_LOG="$OC_LOG_PATH"

  # Also discover log path from gateway startup message in journald
  if [ -z "$OC_FILE_LOG" ] && command -v journalctl &>/dev/null; then
    local gw_log=""
    if [ -n "$OC_SERVICE_NAME" ]; then
      gw_log=$(journalctl -u "$OC_SERVICE_NAME" --no-pager -o cat -n 50 2>/dev/null | grep -aoP '\[gateway\] log file: \K\S+' | tail -1 || echo "")
    elif [ -n "$pid" ]; then
      gw_log=$(journalctl _PID="$pid" --no-pager -o cat -n 50 2>/dev/null | grep -aoP '\[gateway\] log file: \K\S+' | tail -1 || echo "")
    fi
    [ -n "$gw_log" ] && [ -f "$gw_log" ] && OC_FILE_LOG="$gw_log"
  fi

  # If no file log found, try OpenClaw default location
  # (grep -qv succeeds when the path does NOT contain "openclaw", i.e. we
  # prefer an openclaw-named log over a generic one)
  if [ -z "$OC_FILE_LOG" ] || [ ! -f "$OC_FILE_LOG" ] || echo "$OC_FILE_LOG" | grep -qv 'openclaw'; then
    local today_default="/tmp/openclaw/openclaw-$(date +%Y-%m-%d).log"
    [ -f "$today_default" ] && OC_FILE_LOG="$today_default"
  fi

  # Determine best primary log source for journald.  A stdout symlink to a
  # socket: target means the process logs into journald; encode pid+comm in
  # the tag so get_new_lines() can query by _PID/_COMM.
  if [ -n "$OC_SERVICE_NAME" ]; then
    OC_LOG_PATH="journald:$OC_SERVICE_NAME"
  elif [ -n "$pid" ] && [ -L "/proc/$pid/fd/1" ]; then
    local stdout_link
    stdout_link=$(readlink /proc/$pid/fd/1 2>/dev/null || echo "")
    if echo "$stdout_link" | grep -q 'socket:'; then
      local comm_name
      comm_name=$(cat /proc/$pid/comm 2>/dev/null || echo "")
      OC_LOG_PATH="journald-pid:$pid:$comm_name"
    fi
  fi

  # Find sessions directory (for dashboard gateway messages)
  OC_SESSIONS_DIR=""
  local session_candidates=("$HOME/.openclaw/agents/main/sessions" "$HOME/.openclaw/sessions" "$cwd/sessions" "/var/lib/openclaw/sessions")
  for sd in "${session_candidates[@]}"; do
    if [ -d "$sd" ]; then
      OC_SESSIONS_DIR="$sd"
      break
    fi
  done

  OC_PID_CACHED="$pid"
  echo "[clawpulse] Discovered OpenClaw: pid=$pid model=$OC_MODEL provider=$OC_PROVIDER config=$OC_CONFIG_PATH logs=$OC_LOG_PATH file_log=$OC_FILE_LOG sessions=$OC_SESSIONS_DIR service=$OC_SERVICE_NAME"
}

# ═══════════════════════════════════════════════════════
#  SYSTEM METRIC COLLECTORS
# ═══════════════════════════════════════════════════════
get_cpu() {
  # Whole-system CPU utilisation percent, sampled over a 1-second window.
  # Prints -1 when /proc/stat is unavailable (non-Linux host).
  if [ ! -f /proc/stat ]; then
    echo -1
    return
  fi
  local sample_a sample_b
  sample_a=$(awk '/^cpu /{print $2+$3+$4+$6+$7+$8, $5}' /proc/stat)
  sleep 1
  sample_b=$(awk '/^cpu /{print $2+$3+$4+$6+$7+$8, $5}' /proc/stat)
  # busy delta / total delta; guard against a zero-length interval
  echo "$sample_a $sample_b" | awk '{d=($3-$1)+($4-$2); if(d>0) printf "%d", 100*($3-$1)/d; else print 0}'
}

get_memory() {
  # Used-memory percentage ((MemTotal - MemAvailable) / MemTotal);
  # -1 when /proc/meminfo is missing.
  if [ -f /proc/meminfo ]; then
    awk '/MemTotal/{t=$2} /MemAvailable/{a=$2} END{if(t>0) printf "%d", 100*(t-a)/t; else print -1}' /proc/meminfo
  else
    echo -1
  fi
}

get_memory_total_mb() {
  # Total physical RAM in MB; 0 when /proc/meminfo is absent.
  if [ -f /proc/meminfo ]; then
    awk '/MemTotal/{printf "%d", $2/1024}' /proc/meminfo
  else
    echo 0
  fi
}

get_memory_used_mb() {
  # RAM in use (MemTotal - MemAvailable) in MB; 0 when /proc/meminfo is absent.
  if [ -f /proc/meminfo ]; then
    awk '/MemTotal/{t=$2} /MemAvailable/{a=$2} END{printf "%d", (t-a)/1024}' /proc/meminfo
  else
    echo 0
  fi
}

get_swap_usage() {
  # Swap usage percent; 0 when no swap is configured or /proc/meminfo is absent.
  if [ -f /proc/meminfo ]; then
    awk '/SwapTotal/{t=$2} /SwapFree/{f=$2} END{if(t>0) printf "%d", 100*(t-f)/t; else print 0}' /proc/meminfo
  else
    echo 0
  fi
}

# Small system probes; each degrades gracefully on non-Linux / missing tools.

# Root filesystem usage percent (df "Use%" column); -1 on failure.
get_disk_usage() {
  df -h / 2>/dev/null | awk 'NR==2{gsub(/%/,"",$5); print $5}' || echo -1
}

# Root filesystem size in whole GB; 0 on failure.
get_disk_total_gb() {
  df -BG / 2>/dev/null | awk 'NR==2{gsub(/G/,"",$2); print $2}' || echo 0
}

# 1/5/15-minute load averages as three space-separated values.
get_load_avg() {
  if [ -f /proc/loadavg ]; then
    awk '{print $1, $2, $3}' /proc/loadavg
  else
    echo "0 0 0"
  fi
}

# Seconds since boot, truncated to a whole number.
get_uptime_seconds() {
  if [ -f /proc/uptime ]; then
    awk '{printf "%d", $1}' /proc/uptime
  else
    echo 0
  fi
}

# Count of established TCP/UDP connections; 0 when ss is unavailable.
get_open_connections() {
  if ! command -v ss &>/dev/null; then
    echo 0
    return
  fi
  ss -tun state established 2>/dev/null | tail -n +2 | wc -l
}

# Number of running processes, counted from numeric /proc entries.
get_process_count() {
  ls -1d /proc/[0-9]* 2>/dev/null | wc -l
}

# ═══════════════════════════════════════════════════════
#  OPENCLAW PROCESS DEEP INSPECTION
# ═══════════════════════════════════════════════════════
find_openclaw_pid() {
  # Locate the OpenClaw gateway PID: first by process-name patterns, then by
  # whoever listens on $OPENCLAW_PORT.  Prints an empty string if not found.
  local found=""
  local pattern
  for pattern in "openclaw" "node.*openclaw" "python.*openclaw"; do
    found=$(pgrep -f "$pattern" 2>/dev/null | head -1 || echo "")
    [ -n "$found" ] && break
  done
  # Fall back to the process bound to the gateway port
  if [ -z "$found" ]; then
    found=$(ss -tlnp 2>/dev/null | grep ":$OPENCLAW_PORT " | grep -o 'pid=[0-9]*' | head -1 | grep -o '[0-9]*' || echo "")
  fi
  echo "$found"
}

get_process_details() {
  # Per-process stats as one line: "rss_mb cpu% uptime_s fds threads socket_fds".
  # Prints all zeros when $1 is empty or the process no longer exists.
  # Arguments: $1 - PID to inspect (may be empty).
  local pid="$1"
  local mem=0 cpupct=0 uptime_s=0 fds=0 threads=0 conns=0

  if [ -n "$pid" ] && [ -d "/proc/$pid" ]; then
    # Memory RSS in MB (kernel threads have no VmRSS line → keep 0)
    mem=$(awk '/VmRSS/{printf "%d", $2/1024}' /proc/$pid/status 2>/dev/null || echo 0)
    [ -n "$mem" ] || mem=0
    # CPU
    cpupct=$(ps -p "$pid" -o %cpu= 2>/dev/null | awk '{printf "%d", $1}' || echo 0)
    [ -n "$cpupct" ] || cpupct=0
    # Process uptime = system uptime - (start ticks / ticks-per-second);
    # start time is field 22 of /proc/<pid>/stat
    if [ -f /proc/uptime ]; then
      local sys_up proc_start hz
      sys_up=$(awk '{printf "%d", $1}' /proc/uptime)
      proc_start=$(awk '{print $22}' /proc/$pid/stat 2>/dev/null || echo 0)
      hz=$(getconf CLK_TCK 2>/dev/null || echo 100)
      [ "$proc_start" -gt 0 ] 2>/dev/null && uptime_s=$(( sys_up - proc_start / hz ))
      [ "$uptime_s" -lt 0 ] && uptime_s=0
    fi
    # File descriptors.
    # BUGFIX: under pipefail a failing `ls` made `|| echo 0` fire even though
    # `wc` had already printed "0", producing a two-line value that corrupted
    # the space-separated output; `|| true` keeps the single number.
    fds=$(ls /proc/$pid/fd 2>/dev/null | wc -l || true)
    # Threads
    threads=$(awk '/Threads/{print $2}' /proc/$pid/status 2>/dev/null || echo 0)
    [ -n "$threads" ] || threads=0
    # Socket fds held by the process.
    # BUGFIX: `grep socket | wc -l || echo 0` emitted "0\n0" under pipefail
    # when no sockets exist (grep exits 1); `grep -c … || true` is safe.
    conns=$(ls -la /proc/$pid/fd 2>/dev/null | grep -c socket || true)
  fi

  echo "$mem $cpupct $uptime_s $fds $threads $conns"
}

# Count this process's open connections to known LLM API endpoints.
# Arguments: $1 - PID (empty → 0).  Prints a single integer.
# NOTE(review): `ss -tn` prints numeric peer addresses, so these hostname
# patterns will rarely match real traffic — an IP/port-based match may be
# needed; confirm against live output.
get_llm_api_connections() {
  local pid="$1"
  local count=0
  if [ -n "$pid" ] && command -v ss &>/dev/null; then
    # BUGFIX: `grep -c` prints "0" AND exits 1 on zero matches, so the old
    # `|| echo 0` appended a second line ("0\n0"); `|| true` keeps the count
    # grep already printed (pipefail also trips the pipeline on upstream greps).
    count=$(ss -tnp 2>/dev/null | grep "pid=$pid" | grep -cE '(api\.openai\.com|api\.anthropic\.com|generativelanguage\.googleapis\.com|api\.mistral\.ai|api\.groq\.com|api\.cohere\.ai|localhost:11434)' || true)
    [ -n "$count" ] || count=0
  fi
  echo "$count"
}

# ═══════════════════════════════════════════════════════
#  LOG PARSING — extract live metrics from OpenClaw logs
#  Supports: journalctl (systemd), log files, stdout files
# ═══════════════════════════════════════════════════════
# Persisted log-parsing state: journald cursor / file byte offset, plus
# monotonically growing counters that survive agent restarts.
LOG_CURSOR_FILE="$STATE_DIR/journal_cursor"
LOG_OFFSET_FILE="$STATE_DIR/log_offset"
REQ_TOTAL_FILE="$STATE_DIR/req_total"
ERR_TOTAL_FILE="$STATE_DIR/err_total"
TOKENS_IN_FILE="$STATE_DIR/tokens_in"
TOKENS_OUT_FILE="$STATE_DIR/tokens_out"

# Seed each counter with 0 on first run
for _cp_counter in "$REQ_TOTAL_FILE" "$ERR_TOTAL_FILE" "$TOKENS_IN_FILE" "$TOKENS_OUT_FILE"; do
  [ -f "$_cp_counter" ] || echo 0 > "$_cp_counter"
done
unset _cp_counter

# Emit (on stdout) log lines that appeared since the previous call, using the
# best available source, tried in order:
#   1) journalctl -u <service>     — when the unit name is known
#   2) journalctl _PID= / _COMM=   — log_source tag "journald-pid:<pid>:<comm>"
#   3) plain file tail via a persisted byte offset
# Arguments: $1 - log source (file path or journald-pid tag)
#            $2 - systemd service name ("" when unknown)
# State: journald cursor in $LOG_CURSOR_FILE, file offset in $LOG_OFFSET_FILE.
# On the very first call (no cursor yet) returns up to the last 500 lines.
get_new_lines() {
  local log_source="$1"
  local svc_name="$2"

  # Strategy 1: journalctl -u <service> — preferred when service name is known
  if [ -n "$svc_name" ] && command -v journalctl &>/dev/null; then
    local cursor_arg=""
    if [ -f "$LOG_CURSOR_FILE" ]; then
      local saved_cursor
      saved_cursor=$(cat "$LOG_CURSOR_FILE" 2>/dev/null || echo "")
      [ -n "$saved_cursor" ] && cursor_arg="--after-cursor=$saved_cursor"
    fi
    if [ -z "$cursor_arg" ]; then
      cursor_arg="-n 500"
    fi
    local jout
    # $cursor_arg is deliberately unquoted so "-n 500" splits into two args
    jout=$(journalctl -u "$svc_name" --no-pager -o cat $cursor_arg 2>/dev/null || echo "")
    # Save cursor for next time
    # NOTE(review): the cursor comes from a second query; entries logged
    # between the read above and this query are skipped next cycle — verify
    # this loss is acceptable for the metrics.
    local new_cursor
    new_cursor=$(journalctl -u "$svc_name" --no-pager -o export -n 1 2>/dev/null | grep -aoP '^__CURSOR=\K.*' || echo "")
    [ -n "$new_cursor" ] && echo "$new_cursor" > "$LOG_CURSOR_FILE"
    echo "$jout"
    return
  fi

  # Strategy 2: journalctl _PID=<pid> or _COMM=<comm> — for non-systemd processes logging to journald
  if echo "$log_source" | grep -q '^journald-pid:'; then
    local jpid jcomm
    jpid=$(echo "$log_source" | cut -d: -f2)
    jcomm=$(echo "$log_source" | cut -d: -f3)
    local cursor_arg=""
    if [ -f "$LOG_CURSOR_FILE" ]; then
      local saved_cursor
      saved_cursor=$(cat "$LOG_CURSOR_FILE" 2>/dev/null || echo "")
      [ -n "$saved_cursor" ] && cursor_arg="--after-cursor=$saved_cursor"
    fi
    if [ -z "$cursor_arg" ]; then
      cursor_arg="-n 500"
    fi
    local jout=""
    # Try _PID first (most precise)
    if [ -n "$jpid" ]; then
      jout=$(journalctl _PID="$jpid" --no-pager -o cat $cursor_arg 2>/dev/null || echo "")
    fi
    # If empty and we have a comm name, try _COMM (works across restarts)
    if [ -z "$jout" ] && [ -n "$jcomm" ]; then
      jout=$(journalctl _COMM="$jcomm" --no-pager -o cat $cursor_arg 2>/dev/null || echo "")
    fi
    # Save cursor (same second-query caveat as strategy 1)
    local new_cursor=""
    if [ -n "$jpid" ]; then
      new_cursor=$(journalctl _PID="$jpid" --no-pager -o export -n 1 2>/dev/null | grep -aoP '^__CURSOR=\K.*' || echo "")
    fi
    if [ -z "$new_cursor" ] && [ -n "$jcomm" ]; then
      new_cursor=$(journalctl _COMM="$jcomm" --no-pager -o export -n 1 2>/dev/null | grep -aoP '^__CURSOR=\K.*' || echo "")
    fi
    [ -n "$new_cursor" ] && echo "$new_cursor" > "$LOG_CURSOR_FILE"
    echo "$jout"
    return
  fi

  # Strategy 3: log file with byte offset (fallback when no journald)
  if [ -n "$log_source" ] && [ -f "$log_source" ]; then
    local file_size prev_offset=0
    file_size=$(stat -c%s "$log_source" 2>/dev/null || echo 0)
    [ -f "$LOG_OFFSET_FILE" ] && prev_offset=$(cat "$LOG_OFFSET_FILE" 2>/dev/null || echo 0)
    # File shrank → assume rotation/truncation and re-read from the start
    [ "$file_size" -lt "$prev_offset" ] && prev_offset=0
    if [ "$file_size" -gt "$prev_offset" ]; then
      tail -c +"$((prev_offset + 1))" "$log_source" 2>/dev/null
    fi
    echo "$file_size" > "$LOG_OFFSET_FILE"
    return
  fi

  echo ""
}

analyze_lines() {
  # Reduce a batch of new log lines to aggregate metrics, echoed as one line:
  #   "<requests> <errors> <latency_sum_ms> <latency_count> <tokens_in>
  #    <tokens_out> <sessions> <last_error_text>"
  # The final field is free text and may contain spaces — callers must treat
  # everything after the 7th field as the last-error message.
  # Arguments: $1 - newline-separated log lines (possibly empty).
  local new_lines="$1"
  local new_requests=0
  local new_errors=0
  local latency_sum=0
  local latency_count=0
  local new_tokens_in=0
  local new_tokens_out=0
  local sessions=0
  local last_err=""

  if [ -z "$new_lines" ]; then
    echo "0 0 0 0 0 0 0 0"
    return
  fi

  # BUGFIX throughout: `grep -c` prints its count (including "0") but exits 1
  # when the count is zero, and under pipefail a no-match upstream grep fails
  # a `… | wc -l` pipeline even though wc printed "0".  The old `|| echo 0`
  # fallbacks therefore produced a two-line "0\n0" value that broke every
  # numeric `[ … -eq 0 ]` test below and garbled the output line.  `|| true`
  # keeps the single number the pipeline already printed.

  # ── Requests: OpenClaw logs "[ws] ⇄ res ✓ cmd Xms" (unicode may be stripped) ──
  new_requests=$(echo "$new_lines" | grep -acE '(\[ws\].*res.*[0-9]+ms|chat\.|node\.|device\.|config\.|completed|chat\.completion|message\.create|POST /|GET /|request completed)' || true)
  # Fallback if no OpenClaw-style matches
  [ "$new_requests" -eq 0 ] && new_requests=$(echo "$new_lines" | grep -aciE '(request|response|200 OK)' || true)

  # ── Errors: OpenClaw uses "error", "failed", "FailoverError", "timed out" ──
  new_errors=$(echo "$new_lines" | grep -aciE '(error|exception|fatal|failed|FailoverError|timed out|500|502|503|429|rate.limit)' || true)

  # ── Latency: OpenClaw logs "chat.history 66ms", "chat.abort 55ms" ──
  local latencies
  latencies=$(echo "$new_lines" | grep -aoP '\b[0-9]+ms\b' | grep -oP '[0-9]+' || echo "")
  if [ -n "$latencies" ]; then
    latency_sum=$(echo "$latencies" | awk '{s+=$1} END{print s+0}')
    latency_count=$(echo "$latencies" | wc -l)
  fi

  # ── Tokens: check for token usage in logs ──
  local t_in t_out
  t_in=$(echo "$new_lines" | grep -aoiP '(?:prompt.tokens|input.tokens|tokens.in|usage.*prompt)[=: ]*\K[0-9]+' || echo "")
  t_out=$(echo "$new_lines" | grep -aoiP '(?:completion.tokens|output.tokens|tokens.out|usage.*completion)[=: ]*\K[0-9]+' || echo "")
  [ -n "$t_in" ] && new_tokens_in=$(echo "$t_in" | awk '{s+=$1} END{print s+0}')
  [ -n "$t_out" ] && new_tokens_out=$(echo "$t_out" | awk '{s+=$1} END{print s+0}')

  # ── Active sessions: unique conn= IDs in recent lines ──
  sessions=$(echo "$new_lines" | grep -aoP 'conn=[a-f0-9-]+' | sort -u | wc -l || true)
  # Fallback: session IDs
  [ "$sessions" -eq 0 ] && sessions=$(echo "$new_lines" | grep -aoiP '(?:session.id|conversation.id|thread.id)[=: "]*\K[a-zA-Z0-9_-]+' | sort -u | wc -l || true)

  # ── Last error (truncated to 500 bytes) ──
  last_err=$(echo "$new_lines" | grep -aiE '(error|exception|fatal|failed|timed out)' | tail -1 | head -c 500 || echo "")

  echo "$new_requests $new_errors $latency_sum $latency_count $new_tokens_in $new_tokens_out $sessions $last_err"
}

TASKS_FILE="$STATE_DIR/tasks.json"  # JSON array of task events, built each cycle

# Build $TASKS_FILE (a JSON array) from new journald log lines: one object per
# recognized event — ws request, embedded agent run, lane-task error, ws
# connection.  The `first` flag handles comma separation between elements.
# Arguments: $1 - newline-separated log lines (empty → no-op, file untouched).
# NOTE(review): field values are only lightly escaped (quotes via sed);
# a log line containing backslashes or control characters could still
# produce invalid JSON — confirm the ingest endpoint tolerates this.
extract_tasks() {
  local new_lines="$1"
  [ -z "$new_lines" ] && return

  echo '[' > "$TASKS_FILE"
  local first=1

  # ── API requests: [ws] ... res ... command Xms conn=... id=... ──
  while IFS= read -r line; do
    [ -z "$line" ] && continue
    local ts cmd dur conn rid status
    ts=$(echo "$line" | grep -aoP '^\S+' || echo "")
    # The lone `.` allows for the ✓/✗ status glyph between "res" and the command
    cmd=$(echo "$line" | grep -aoP 'res\s+.\s+\K\S+' || echo "")
    [ -z "$cmd" ] && cmd=$(echo "$line" | grep -aoP 'res\s+\K\S+' || echo "")
    dur=$(echo "$line" | grep -aoP '\b([0-9]+)ms\b' | head -1 | grep -oP '[0-9]+' || echo "")
    conn=$(echo "$line" | grep -aoP 'conn=\K[a-f0-9-]+' | head -1 || echo "")
    rid=$(echo "$line" | grep -aoP 'id=\K[a-f0-9-]+' | head -1 || echo "")
    [ -z "$cmd" ] && continue
    [ "$first" -eq 1 ] && first=0 || echo ',' >> "$TASKS_FILE"
    printf '{"taskType":"request","command":"%s","status":"success","durationMs":%s,"connId":"%s","requestId":"%s","taskAt":"%s"}' \
      "$(echo "$cmd" | head -c 100)" "${dur:-0}" "$conn" "$rid" "$ts" >> "$TASKS_FILE"
  done <<< "$(echo "$new_lines" | grep -aE '\[ws\].*res.*[0-9]+ms')"

  # ── Agent runs: [agent/embedded] embedded run agent end: runId=... isError=... ──
  while IFS= read -r line; do
    [ -z "$line" ] && continue
    local ts run_id is_err err_msg
    ts=$(echo "$line" | grep -aoP '^\S+' || echo "")
    run_id=$(echo "$line" | grep -aoP 'runId=\K[a-f0-9-]+' || echo "")
    is_err=$(echo "$line" | grep -aoP 'isError=\K\w+' || echo "false")
    err_msg=""
    [ "$is_err" = "true" ] && err_msg=$(echo "$line" | grep -aoP 'error=\K.*' | head -c 300 || echo "")
    local st="success"
    [ "$is_err" = "true" ] && st="error"
    [ "$first" -eq 1 ] && first=0 || echo ',' >> "$TASKS_FILE"
    printf '{"taskType":"agent_run","command":"agent.run","status":"%s","requestId":"%s","error":"%s","taskAt":"%s"}' \
      "$st" "$run_id" "$(echo "$err_msg" | sed 's/"/\\"/g' | head -c 250)" "$ts" >> "$TASKS_FILE"
  done <<< "$(echo "$new_lines" | grep -aE '\[agent/embedded\].*agent end')"

  # ── Lane task errors: [diagnostic] lane task error: lane=... durationMs=... error=... ──
  while IFS= read -r line; do
    [ -z "$line" ] && continue
    local ts lane dur err_msg
    ts=$(echo "$line" | grep -aoP '^\S+' || echo "")
    lane=$(echo "$line" | grep -aoP 'lane=\K\S+' | head -1 || echo "")
    dur=$(echo "$line" | grep -aoP 'durationMs=\K[0-9]+' || echo "")
    err_msg=$(echo "$line" | grep -aoP 'error="\K[^"]+' | head -c 300 || echo "")
    [ "$first" -eq 1 ] && first=0 || echo ',' >> "$TASKS_FILE"
    printf '{"taskType":"error","command":"lane.task","status":"error","durationMs":%s,"lane":"%s","error":"%s","taskAt":"%s"}' \
      "${dur:-0}" "$(echo "$lane" | head -c 100)" "$(echo "$err_msg" | sed 's/"/\\"/g' | head -c 250)" "$ts" >> "$TASKS_FILE"
  done <<< "$(echo "$new_lines" | grep -aE '\[diagnostic\] lane task error')"

  # ── Connections: webchat connected/disconnected ──
  # (the feeding grep guarantees every line contains "connected", so $action
  # is always assigned; "disconnected" contains "connected" as a substring,
  # hence the nested check)
  while IFS= read -r line; do
    [ -z "$line" ] && continue
    local ts conn action
    ts=$(echo "$line" | grep -aoP '^\S+' || echo "")
    conn=$(echo "$line" | grep -aoP 'conn=\K[a-f0-9-]+' | head -1 || echo "")
    if echo "$line" | grep -aq 'connected'; then
      if echo "$line" | grep -aq 'disconnected'; then
        action="disconnected"
      else
        action="connected"
      fi
    fi
    [ "$first" -eq 1 ] && first=0 || echo ',' >> "$TASKS_FILE"
    printf '{"taskType":"connection","command":"ws.%s","status":"success","connId":"%s","taskAt":"%s"}' \
      "$action" "$conn" "$ts" >> "$TASKS_FILE"
  done <<< "$(echo "$new_lines" | grep -aE '\[ws\].*(connected|disconnected)' | grep -avE 'Proxy headers')"

  echo ']' >> "$TASKS_FILE"
}

FILE_LOG_OFFSET_FILE="$STATE_DIR/file_log_offset"   # byte offset into the JSON file log
FILE_TASKS_FILE="$STATE_DIR/file_tasks.json"        # tasks parsed from the file log
SESSION_TASKS_FILE="$STATE_DIR/session_tasks.json"  # tasks parsed from session .jsonl files
SESSION_OFFSETS_FILE="$STATE_DIR/session_offsets"   # per-session-file offsets, "name:offset" lines
OC_SESSIONS_DIR=""  # set by discover_openclaw(); dashboard chat sessions dir

# Incrementally parse OpenClaw's JSON-lines file log into $FILE_TASKS_FILE
# (a JSON array of task objects), resuming from a persisted byte offset.
# Arguments: $1 - path to the log file ("" or missing file → no-op).
# NOTE(review): the parser assumes the tslog-style layout where numeric keys
# hold the payload ("0" = module/subsystem object, "1" = detail, "2" = human
# description) — confirm against actual OpenClaw log output.
extract_file_log_tasks() {
  local log_file="$1"
  # `a || b && return`: bail unless we were given an existing file
  [ -z "$log_file" ] || [ ! -f "$log_file" ] && return

  local file_size prev_offset=0
  file_size=$(stat -c%s "$log_file" 2>/dev/null || echo 0)
  [ -f "$FILE_LOG_OFFSET_FILE" ] && prev_offset=$(cat "$FILE_LOG_OFFSET_FILE" 2>/dev/null || echo 0)
  # File shrank → rotated/truncated; start over from byte 0
  [ "$file_size" -lt "$prev_offset" ] && prev_offset=0
  [ "$file_size" -le "$prev_offset" ] && return

  local new_data
  new_data=$(tail -c +"$((prev_offset + 1))" "$log_file" 2>/dev/null)
  echo "$file_size" > "$FILE_LOG_OFFSET_FILE"

  [ -z "$new_data" ] && return

  # Parse JSON log lines into task entries
  echo '[' > "$FILE_TASKS_FILE"
  local first=1

  while IFS= read -r line; do
    [ -z "$line" ] && continue

    # Extract fields from JSON log entry
    local ts module desc
    ts=$(echo "$line" | grep -aoP '"date"\s*:\s*"\K[^"]+' | head -1 || echo "")
    [ -z "$ts" ] && continue
    # Module from escaped JSON in "0": {\"module\":\"web-auto-reply\"} or {\"subsystem\":\"gateway/...\" }
    module=$(echo "$line" | grep -aoP '\\?"module\\?"\s*:\\?\s*\\?"\K[^"\\]+' | head -1 || echo "")
    [ -z "$module" ] && module=$(echo "$line" | grep -aoP '\\?"subsystem\\?"\s*:\\?\s*\\?"\K[^"\\]+' | head -1 || echo "")
    # "2" field is the human-readable description
    desc=$(echo "$line" | grep -aoP '"2"\s*:\s*"\K[^"]+' | head -1 || echo "")
    # Skip lines with no desc AND no module
    [ -z "$desc" ] && [ -z "$module" ] && continue

    # Skip noise entries
    echo "$desc" | grep -qiE 'heartbeat|hook.*loaded|hook.*registered|canvas.*host.*mounted|proxy headers|armTimer|cron: started$|health-monitor.*restarting' && continue

    local task_type="event" cmd="" status="success" dur="" conn_id="" err="" raw=""

    # Helper: extract field1 as string (handles both JSON objects and strings in "1")
    local f1_str
    f1_str=$(echo "$line" | grep -aoP '"1"\s*:\s*"\K[^"]*' | head -1 2>/dev/null || echo "")
    # Common fields from JSON object in "1"
    local f_body f_text f_from f_to f_dur f_corr f_error f_jid f_mid f_connid f_method
    f_body=$(echo "$line" | grep -aoP '"body"\s*:\s*"\K[^"]*' | head -1 2>/dev/null | head -c 500 || echo "")
    f_text=$(echo "$line" | grep -aoP '"text"\s*:\s*"\K[^"]*' | head -1 2>/dev/null | head -c 500 || echo "")
    f_from=$(echo "$line" | grep -aoP '"from"\s*:\s*"\K[^"\\]+' | head -1 2>/dev/null || echo "")
    f_to=$(echo "$line" | grep -aoP '"to"\s*:\s*"\K[^"\\]+' | head -1 2>/dev/null || echo "")
    f_dur=$(echo "$line" | grep -aoP '"durationMs"\s*:\s*\K[0-9]+' | head -1 2>/dev/null || echo "")
    f_corr=$(echo "$line" | grep -aoP '"correlationId"\s*:\s*"\K[^"]+' | head -1 2>/dev/null || echo "")
    f_error=$(echo "$line" | grep -aoP '"error"\s*:\s*"\K[^"]*' | head -1 2>/dev/null | head -c 500 || echo "")
    f_jid=$(echo "$line" | grep -aoP '"jid"\s*:\s*"\K[^"]+' | head -1 2>/dev/null || echo "")
    f_mid=$(echo "$line" | grep -aoP '"messageId"\s*:\s*"\K[^"]+' | head -1 2>/dev/null || echo "")
    f_connid=$(echo "$line" | grep -aoP '"connectionId"\s*:\s*"\K[^"]+' | head -1 2>/dev/null || echo "")
    f_method=$(echo "$line" | grep -aoP '"method"\s*:\s*"\K[^"]+' | head -1 2>/dev/null || echo "")
    local f_logLevel
    f_logLevel=$(echo "$line" | grep -aoP '"logLevelName"\s*:\s*"\K[^"]+' | head -1 2>/dev/null || echo "")

    # Safe JSON string escaper
    # NOTE(review): redefined on every loop iteration — harmless in bash but
    # could be hoisted above the loop.
    _esc() { echo "$1" | sed 's/\\/\\\\/g;s/"/\\\\\\"/g' | tr -d '\n\r' | head -c 400; }

    # Classify by description first (most reliable), then module
    if echo "$desc" | grep -qi 'auto-reply sent'; then
      task_type="message"; cmd="message.reply"
      dur="$f_dur"; conn_id="$f_corr"
      raw="{\\\"text\\\":\\\"$(_esc "$f_text")\\\",\\\"to\\\":\\\"$f_to\\\",\\\"from\\\":\\\"$f_from\\\",\\\"durationMs\\\":${f_dur:-0},\\\"connectionId\\\":\\\"$f_connid\\\"}"
    elif echo "$desc" | grep -qi 'inbound.*message\|inbound message'; then
      task_type="message"; cmd="message.received"
      conn_id="$f_corr"
      raw="{\\\"body\\\":\\\"$(_esc "$f_body")\\\",\\\"from\\\":\\\"$f_from\\\",\\\"to\\\":\\\"$f_to\\\",\\\"connectionId\\\":\\\"$f_connid\\\"}"
    elif echo "$desc" | grep -qi 'sending'; then
      task_type="message"; cmd="message.sending"
      conn_id="$f_corr"
      local f_media
      f_media=$(echo "$line" | grep -aoP '"hasMedia"\s*:\s*\K\w+' | head -1 2>/dev/null || echo "")
      raw="{\\\"jid\\\":\\\"$f_jid\\\",\\\"hasMedia\\\":\\\"$f_media\\\"}"
    elif echo "$desc" | grep -qi 'sent message'; then
      task_type="message"; cmd="message.sent"
      conn_id="$f_corr"
      raw="{\\\"jid\\\":\\\"$f_jid\\\",\\\"messageId\\\":\\\"$f_mid\\\"}"
    elif echo "$desc" | grep -qi 'agent.*end\|agent.*start\|embedded.*run'; then
      task_type="agent_run"; cmd="agent.run"
      local f_runId f_sessId f_timeout
      f_runId=$(echo "$line" | grep -aoP '"runId"\s*:\s*"\K[^"]+' | head -1 2>/dev/null || echo "")
      f_sessId=$(echo "$line" | grep -aoP '"sessionId"\s*:\s*"\K[^"]+' | head -1 2>/dev/null || echo "")
      f_timeout=$(echo "$line" | grep -aoP '"timeoutMs"\s*:\s*\K[0-9]+' | head -1 2>/dev/null || echo "")
      conn_id="$f_runId"; dur="$f_dur"
      local is_err
      is_err=$(echo "$line" | grep -aoP '"isError"\s*:\s*\K\w+' | head -1 2>/dev/null || echo "false")
      [ "$is_err" = "true" ] && status="error"
      err="$f_error"
      raw="{\\\"runId\\\":\\\"$f_runId\\\",\\\"sessionId\\\":\\\"$f_sessId\\\",\\\"timeoutMs\\\":${f_timeout:-0},\\\"detail\\\":\\\"$(_esc "$desc")\\\"}"
    elif echo "$desc" | grep -qiE 'error|failed|timed out'; then
      task_type="error"
      cmd=$(echo "$module" | head -c 50)
      [ -z "$cmd" ] && cmd="error"
      status="error"
      err=$(echo "$desc" | head -c 300)
      # Extract error detail from field1
      [ -n "$f_error" ] && err="$f_error"
      raw="{\\\"module\\\":\\\"$(_esc "$module")\\\",\\\"detail\\\":\\\"$(_esc "$desc")\\\",\\\"error\\\":\\\"$(_esc "$f_error")\\\",\\\"method\\\":\\\"$f_method\\\"}"
    elif echo "$module" | grep -qi 'whatsapp'; then
      task_type="connection"
      if echo "$desc$f1_str" | grep -qi 'starting\|listening'; then
        cmd="whatsapp.start"
      elif echo "$desc$f1_str" | grep -qi 'exited\|closed\|stop\|restarting'; then
        cmd="whatsapp.restart"
        echo "$desc$f1_str" | grep -qi 'error\|exited' && status="error"
      else
        cmd="whatsapp.event"
      fi
      local detail
      detail=$(echo "$f1_str" | head -c 200)
      [ -z "$detail" ] && detail=$(echo "$desc" | head -c 200)
      raw="{\\\"module\\\":\\\"$(_esc "$module")\\\",\\\"detail\\\":\\\"$(_esc "$detail")\\\"}"
    elif echo "$module" | grep -qi 'gateway\|browser\|gmail\|agent'; then
      task_type="event"
      cmd=$(echo "$module" | sed 's|gateway/||;s|channels/||' | head -c 50)
      local detail2
      detail2="$f1_str"
      [ -z "$detail2" ] && detail2="$desc"
      raw="{\\\"module\\\":\\\"$(_esc "$module")\\\",\\\"detail\\\":\\\"$(_esc "$detail2")\\\"}"
    elif echo "$desc" | grep -qi 'signal'; then
      task_type="event"; cmd="system.signal"
      raw="{\\\"detail\\\":\\\"$(_esc "$f1_str")\\\"}"
    elif echo "$desc" | grep -qi 'inbound'; then
      task_type="message"; cmd="message.received"
      raw="{\\\"body\\\":\\\"$(_esc "$f_body")\\\",\\\"from\\\":\\\"$f_from\\\"}"
    else
      task_type="event"
      cmd=$(echo "$desc" | head -c 80)
      [ -z "$cmd" ] && cmd=$(echo "$module" | head -c 50)
      local detail3="$f1_str"
      [ -z "$detail3" ] && detail3="$desc"
      raw="{\\\"module\\\":\\\"$(_esc "$module")\\\",\\\"detail\\\":\\\"$(_esc "$detail3")\\\"}"
    fi

    [ "$first" -eq 1 ] && first=0 || echo ',' >> "$FILE_TASKS_FILE"
    # Sanitize for JSON
    err=$(echo "$err" | sed 's/"/\\"/g' | tr -d '\n\r' | head -c 250)
    cmd=$(echo "$cmd" | sed 's/"/\\"/g' | tr -d '\n\r' | head -c 100)
    printf '{"taskType":"%s","command":"%s","status":"%s","durationMs":%s,"connId":"%s","error":"%s","raw":"%s","taskAt":"%s"}' \
      "$task_type" "$cmd" "$status" "${dur:-0}" "$conn_id" "$err" "$raw" "$ts" >> "$FILE_TASKS_FILE"
  done <<< "$new_data"

  echo ']' >> "$FILE_TASKS_FILE"
}

# ═══════════════════════════════════════════════════════
#  SESSION LOG EXTRACTION (gateway dashboard chat)
# ═══════════════════════════════════════════════════════
extract_session_tasks() {
  # Harvest new dashboard-chat messages from OpenClaw session *.jsonl transcripts
  # and convert them into task objects, written as a JSON array to $SESSION_TASKS_FILE.
  # Silently no-ops when the sessions dir is missing or jq is not installed.
  # Globals:   STATE_DIR (scratch), SESSION_OFFSETS_FILE (per-file byte offsets,
  #            read/write), SESSION_TASKS_FILE (output, overwritten each call)
  # Arguments: $1 - sessions directory containing *.jsonl transcripts
  local sessions_dir="$1"
  [ -z "$sessions_dir" ] || [ ! -d "$sessions_dir" ] && return
  command -v jq &>/dev/null || return

  # Offsets persist across cycles as "<filename>:<byte-offset>" lines
  touch "$SESSION_OFFSETS_FILE"

  # Only check session files modified in the last 5 minutes
  local recent_files
  recent_files=$(find "$sessions_dir" -name "*.jsonl" -mmin -5 2>/dev/null)
  [ -z "$recent_files" ] && { echo '[]' > "$SESSION_TASKS_FILE"; return; }

  local tmp_tasks="$STATE_DIR/session_tasks_tmp.jsonl"
  > "$tmp_tasks"

  while IFS= read -r sfile; do
    [ -z "$sfile" ] && continue
    local fname prev_offset file_size
    fname=$(basename "$sfile")
    file_size=$(stat -c%s "$sfile" 2>/dev/null || echo 0)
    # Exact field match — filenames contain '.' which is a regex metachar,
    # so a grep "^$fname:" pattern could match the wrong entry.
    prev_offset=$(awk -F: -v k="$fname" '$1 == k {print $2; exit}' "$SESSION_OFFSETS_FILE" 2>/dev/null)
    [ -z "$prev_offset" ] && prev_offset=0
    [ "$file_size" -le "$prev_offset" ] && continue

    local new_data
    new_data=$(tail -c +"$((prev_offset + 1))" "$sfile" 2>/dev/null)
    # Update offset: drop the old entry (exact match, not regex) and append the new one
    local tmp_off="$STATE_DIR/session_offsets_tmp"
    awk -F: -v k="$fname" '$1 != k' "$SESSION_OFFSETS_FILE" > "$tmp_off" 2>/dev/null || true
    echo "$fname:$file_size" >> "$tmp_off"
    mv "$tmp_off" "$SESSION_OFFSETS_FILE"

    [ -z "$new_data" ] && continue

    local conn_id="${fname%.jsonl}"

    # Parse each message line into a task object. NOTE: capture() raises an
    # error (not null) when the regex does not match, and '//' does not catch
    # errors — the postfix '?' suppresses the error so the '//' fallback
    # applies instead of the whole line being dropped.
    echo "$new_data" | jq -c --arg cid "$conn_id" '
      select(.type == "message")
      | .timestamp as $ts
      | .message.role as $role
      | (.message.content // [] | map(select(.type == "text") | .text) | join("") | .[0:600]) as $text
      | select($text | length > 0)
      | if $role == "user" then
          ($text | capture("\\] (?<msg>.*)$")? // {msg: $text}) as $m
          | ($text | capture("\"label\"\\s*:\\s*\"(?<lbl>[^\"]+)\"")? // {lbl: "dashboard"}) as $s
          | {taskType: "message", command: "dashboard.received", status: "success", durationMs: 0, connId: $cid, error: "",
             raw: ({body: $m.msg, from: $s.lbl, channel: "dashboard"} | tostring), taskAt: $ts}
        elif $role == "assistant" then
          ($text | ltrimstr("[[reply_to_current]]") | .[0:400]) as $reply
          | (.message.model // "") as $model
          | (.message.usage.input // 0) as $tin
          | (.message.usage.output // 0) as $tout
          | {taskType: "message", command: "dashboard.reply", status: "success", durationMs: 0, connId: $cid, error: "",
             raw: ({text: $reply, model: $model, tokensIn: $tin, tokensOut: $tout, channel: "dashboard"} | tostring), taskAt: $ts}
        else empty end
    ' >> "$tmp_tasks" 2>/dev/null

  done <<< "$recent_files"

  # Collapse the JSONL lines into a single JSON array for upload
  if [ -s "$tmp_tasks" ]; then
    jq -s '.' "$tmp_tasks" > "$SESSION_TASKS_FILE" 2>/dev/null || echo '[]' > "$SESSION_TASKS_FILE"
  else
    echo '[]' > "$SESSION_TASKS_FILE"
  fi
  rm -f "$tmp_tasks"
}

NEW_LINES_FILE="$STATE_DIR/new_lines.tmp"

parse_logs() {
  # Digest the newly fetched log lines into rate/latency/token metrics and
  # roll them into the persistent cumulative counters.
  # Arguments: $1 - log source path  (unused here; kept for interface parity)
  #            $2 - service name     (unused here; kept for interface parity)
  #            $3 - file containing the lines fetched this cycle
  # Globals:   INTERVAL (read); REQ_TOTAL_FILE, ERR_TOTAL_FILE,
  #            TOKENS_IN_FILE, TOKENS_OUT_FILE (cumulative counters, read/write)
  # Outputs:   one line: "req/min err/min avgMs sessions tokIn tokOut reqTotal errTotal lastErr..."
  local log_source="$1"
  local svc_name="$2"
  local lines_file="$3"

  local new_lines=""
  [ -f "$lines_file" ] && new_lines=$(cat "$lines_file")

  # analyze_lines emits: "requests errors latencySum latencyCount tokIn tokOut sessions lastErr..."
  local analysis
  analysis=$(analyze_lines "$new_lines")

  # The awk '+0' coerces any garbage field to a safe 0
  local new_requests new_errors latency_sum latency_count new_tokens_in new_tokens_out sessions last_err
  new_requests=$(echo "$analysis" | awk '{print $1+0}')
  new_errors=$(echo "$analysis" | awk '{print $2+0}')
  latency_sum=$(echo "$analysis" | awk '{print $3+0}')
  latency_count=$(echo "$analysis" | awk '{print $4+0}')
  new_tokens_in=$(echo "$analysis" | awk '{print $5+0}')
  new_tokens_out=$(echo "$analysis" | awk '{print $6+0}')
  sessions=$(echo "$analysis" | awk '{print $7+0}')
  last_err=$(echo "$analysis" | awk '{for(i=8;i<=NF;i++) printf "%s ", $i; print ""}' | sed 's/ *$//')

  # Update cumulative totals (files survive restarts of the loop body)
  local prev_req prev_err prev_tin prev_tout
  prev_req=$(cat "$REQ_TOTAL_FILE" 2>/dev/null || echo 0)
  prev_err=$(cat "$ERR_TOTAL_FILE" 2>/dev/null || echo 0)
  prev_tin=$(cat "$TOKENS_IN_FILE" 2>/dev/null || echo 0)
  prev_tout=$(cat "$TOKENS_OUT_FILE" 2>/dev/null || echo 0)
  echo "$(( prev_req + new_requests ))" > "$REQ_TOTAL_FILE"
  echo "$(( prev_err + new_errors ))" > "$ERR_TOTAL_FILE"
  echo "$(( prev_tin + new_tokens_in ))" > "$TOKENS_IN_FILE"
  echo "$(( prev_tout + new_tokens_out ))" > "$TOKENS_OUT_FILE"

  # Compute avg latency (guard against division by zero)
  local avg_latency=0
  [ "$latency_count" -gt 0 ] && avg_latency=$(( latency_sum / latency_count ))

  # Compute per-minute rates over the reporting interval
  local req_per_min=0 err_per_min=0
  if [ "$INTERVAL" -gt 0 ]; then
    req_per_min=$(awk "BEGIN{printf \"%.2f\", $new_requests * 60 / $INTERVAL}")
    err_per_min=$(awk "BEGIN{printf \"%.2f\", $new_errors * 60 / $INTERVAL}")
  fi

  # BUGFIX: the $(cat ...) substitutions were unquoted, exposing the counter
  # file paths to word-splitting and globbing.
  echo "$req_per_min $err_per_min $avg_latency $sessions $new_tokens_in $new_tokens_out $(cat "$REQ_TOTAL_FILE") $(cat "$ERR_TOTAL_FILE") $last_err"
}

# ═══════════════════════════════════════════════════════
#  OPENCLAW API PROBE
# ═══════════════════════════════════════════════════════
probe_openclaw_api() {
  # Query the local OpenClaw HTTP API for a JSON status document.
  # Walks a list of well-known endpoints and prints the first response that
  # looks like a JSON object; prints "{}" when nothing usable answers.
  local base="http://127.0.0.1:$OPENCLAW_PORT"
  local endpoints=("/metrics" "/health" "/status" "/api/health" "/api/status" "/api/metrics" "/v1/health")

  local ep body
  for ep in "${endpoints[@]}"; do
    body=$(curl -sS --connect-timeout 2 --max-time 3 "$base$ep" 2>/dev/null || echo "")
    # Accept the first body that starts with '{' — good enough as a JSON sniff
    if [ -n "$body" ] && echo "$body" | grep -qE '^\{'; then
      echo "$body"
      return
    fi
  done

  echo "{}"
}

# ═══════════════════════════════════════════════════════
#  SAFE JSON BUILDER — writes to temp file
# ═══════════════════════════════════════════════════════
PAYLOAD_FILE="$STATE_DIR/payload.json"

# Sanitize a string for embedding in a JSON string value: strip control
# characters, collapse runs of spaces, trim, truncate to 250 chars, then
# escape backslashes and double quotes.
# BUGFIX: truncation now happens BEFORE escaping — the old order could cut an
# escape sequence in half and leave a dangling backslash, producing invalid
# JSON. printf replaces echo so values like "-n" are not eaten as echo flags.
sanitize() {
  printf '%s' "$1" | tr -d '\000-\037' | tr -s ' ' | sed 's/^ *//;s/ *$//' | head -c 250 | sed 's/\\/\\\\/g; s/"/\\"/g'
}

# Safe number: emit $1 if it is a well-formed (optionally negative) decimal
# number, otherwise emit the default ($2, falling back to 0).
# BUGFIX: the old character-class check (*[!0-9.-]*) accepted strings made
# only of digits/dots/dashes in any arrangement — "-", "1.2.3", "1-2", "." —
# all of which are invalid JSON numbers and would corrupt the payload.
safe_num() {
  local val="$1" default="${2:-0}"
  if [[ "$val" =~ ^-?[0-9]+(\.[0-9]+)?$ ]]; then
    echo "$val"
  else
    echo "$default"
  fi
}

build_payload() {
  # Assemble the telemetry JSON document at $PAYLOAD_FILE.
  # $1 is the overall status string; every other field comes from globals set
  # by the main loop. String fields pass through sanitize(), numeric fields
  # through safe_num(), and optional fields are emitted only when non-empty.
  # The final "_t":1 sentinel means no conditional entry is ever last, so
  # trailing commas are never a problem.
  local f="$PAYLOAD_FILE"
  {
    printf '{\n'
    printf '"status":"%s",\n' "$1"
    # System identity
    printf '"hostname":"%s",\n' "$(sanitize "$SYS_HOSTNAME")"
    printf '"osInfo":"%s",\n' "$(sanitize "$SYS_OS")"
    printf '"kernelVersion":"%s",\n' "$(sanitize "$SYS_KERNEL")"
    printf '"arch":"%s",\n' "$(sanitize "$SYS_ARCH")"
    # Host metrics — always coerced to safe numbers
    printf '"cpu":%s,\n' "$(safe_num "$CPU" 0)"
    printf '"memory":%s,\n' "$(safe_num "$MEM" 0)"
    printf '"memoryTotal":%s,\n' "$(safe_num "$MEM_TOTAL" 0)"
    printf '"memoryUsed":%s,\n' "$(safe_num "$MEM_USED" 0)"
    printf '"swapUsage":%s,\n' "$(safe_num "$SWAP" 0)"
    printf '"diskUsage":%s,\n' "$(safe_num "$DISK" 0)"
    printf '"diskTotal":%s,\n' "$(safe_num "$DISK_TOTAL" 0)"
    printf '"loadAvg1":%s,\n' "$(safe_num "$LOAD1" 0)"
    printf '"loadAvg5":%s,\n' "$(safe_num "$LOAD5" 0)"
    printf '"loadAvg15":%s,\n' "$(safe_num "$LOAD15" 0)"
    printf '"uptimeSeconds":%s,\n' "$(safe_num "$UPTIME" 0)"
    printf '"openConns":%s,\n' "$(safe_num "$CONNS" 0)"
    printf '"processCount":%s,\n' "$(safe_num "$PROCS" 0)"
    # OpenClaw process metrics
    printf '"openclawPid":%s,\n' "$(safe_num "$OC_PID" 0)"
    printf '"openclawMem":%s,\n' "$(safe_num "$OC_MEM" 0)"
    printf '"openclawCpu":%s,\n' "$(safe_num "$OC_CPU" 0)"
    printf '"openclawUp":%s,\n' "$(safe_num "$OC_UP" 0)"
    printf '"openclawFds":%s,\n' "$(safe_num "$OC_FDS" 0)"
    printf '"openclawThreads":%s,\n' "$(safe_num "$OC_THREADS" 0)"
    printf '"openclawConns":%s,\n' "$(safe_num "$OC_CONNS" 0)"
    # OpenClaw application facts — optional, emitted only when discovered
    [ -n "$FINAL_VERSION" ] && printf '"ocVersion":"%s",\n' "$(sanitize "$FINAL_VERSION")"
    [ -n "$FINAL_MODEL" ] && printf '"ocModel":"%s",\n' "$(sanitize "$FINAL_MODEL")"
    [ -n "$OC_PROVIDER" ] && printf '"ocProvider":"%s",\n' "$(sanitize "$OC_PROVIDER")"
    [ -n "$OC_TOOLS" ] && printf '"ocTools":"%s",\n' "$(sanitize "$OC_TOOLS")"
    [ -n "$OC_CONFIG_PATH" ] && printf '"ocConfigPath":"%s",\n' "$(sanitize "$OC_CONFIG_PATH")"
    [ -n "$OC_LOG_PATH" ] && printf '"ocLogPath":"%s",\n' "$(sanitize "$OC_LOG_PATH")"
    [ -n "$OC_DATA_DIR" ] && printf '"ocDataDir":"%s",\n' "$(sanitize "$OC_DATA_DIR")"
    # Live metrics derived from log parsing
    printf '"reqTotal":%s,\n' "$(safe_num "$L_REQ_TOTAL" 0)"
    printf '"reqPerMin":%s,\n' "$(safe_num "$L_REQ_PM" 0)"
    printf '"errTotal":%s,\n' "$(safe_num "$L_ERR_TOTAL" 0)"
    printf '"errPerMin":%s,\n' "$(safe_num "$L_ERR_PM" 0)"
    printf '"avgRespMs":%s,\n' "$(safe_num "$L_AVG_MS" 0)"
    printf '"activeSessions":%s,\n' "$(safe_num "$L_SESSIONS" 0)"
    printf '"tokensIn":%s,\n' "$(safe_num "$L_TOK_IN" 0)"
    printf '"tokensOut":%s,\n' "$(safe_num "$L_TOK_OUT" 0)"
    printf '"llmApiConns":%s,\n' "$(safe_num "$OC_LLM_CONNS" 0)"
    [ -n "$L_LAST_ERR" ] && printf '"lastError":"%s",\n' "$(sanitize "$L_LAST_ERR")"
    # Metrics reported by the OpenClaw API probe — optional
    [ -n "$API_LATENCY" ] && printf '"latencyMs":%s,\n' "$(safe_num "$API_LATENCY" 0)"
    [ -n "$API_TOKENS" ] && printf '"tokensPerMin":%s,\n' "$(safe_num "$API_TOKENS" 0)"
    [ -n "$API_SUCCESS" ] && printf '"successRate":%s,\n' "$(safe_num "$API_SUCCESS" 0)"
    [ -n "$API_JOBS" ] && printf '"jobsRunning":%s,\n' "$(safe_num "$API_JOBS" 0)"
    [ -n "$API_COST" ] && printf '"costToday":%s,\n' "$(safe_num "$API_COST" 0)"
    [ -n "$FINAL_MODEL" ] && printf '"model":"%s",\n' "$(sanitize "$FINAL_MODEL")"
    [ -n "$FINAL_VERSION" ] && printf '"version":"%s",\n' "$(sanitize "$FINAL_VERSION")"
    # Sentinel — always last, no trailing comma issue
    printf '"_t":1\n'
    printf '}\n'
  } > "$f"
}

# ═══════════════════════════════════════════════════════
#  MAIN LOOP
# ═══════════════════════════════════════════════════════
echo "[clawpulse] Agent v2 started — reporting every ${INTERVAL}s"
echo "[clawpulse] Endpoint: $API"
echo "[clawpulse] OpenClaw port: $OPENCLAW_PORT"

# Consecutive 401/410 responses; the agent self-disables after 10 of these
# (token revoked / instance deleted — see the main loop below).
AUTH_FAIL_COUNT=0
# Consecutive unreachable-server attempts (used in log output only).
CONN_FAIL_COUNT=0
# Sleep between attempts while the server is unreachable; doubles on each
# failure up to MAX_BACKOFF, resets to INTERVAL on success.
CURRENT_BACKOFF="$INTERVAL"
MAX_BACKOFF=600  # 10 minutes max between retries when server is down

# Main reporting loop: collect metrics, POST telemetry, then (on success)
# upload any extracted task batches. Never exits except on repeated auth
# failure; systemd restarts the process if it dies.
while true; do
  # ── System metrics ──
  CPU=$(get_cpu)
  MEM=$(get_memory)
  MEM_TOTAL=$(get_memory_total_mb)
  MEM_USED=$(get_memory_used_mb)
  SWAP=$(get_swap_usage)
  DISK=$(get_disk_usage)
  DISK_TOTAL=$(get_disk_total_gb)
  # get_load_avg emits the three load averages as one space-separated line
  LOAD=$(get_load_avg)
  LOAD1=$(echo "$LOAD" | awk '{print $1}')
  LOAD5=$(echo "$LOAD" | awk '{print $2}')
  LOAD15=$(echo "$LOAD" | awk '{print $3}')
  UPTIME=$(get_uptime_seconds)
  CONNS=$(get_open_connections)
  PROCS=$(get_process_count)

  # ── OpenClaw process ──
  OC_PID=$(find_openclaw_pid)

  # Re-discover config/log paths etc. if the PID changed (process restarted)
  if [ "$OC_PID" != "$OC_PID_CACHED" ] && [ -n "$OC_PID" ]; then
    discover_openclaw "$OC_PID"
  fi

  # Process details — fields are consumed positionally from get_process_details
  if [ -n "$OC_PID" ] && [ -d "/proc/$OC_PID" ]; then
    OC_DETAILS=$(get_process_details "$OC_PID")
    OC_MEM=$(echo "$OC_DETAILS" | awk '{print $1}')
    OC_CPU=$(echo "$OC_DETAILS" | awk '{print $2}')
    OC_UP=$(echo "$OC_DETAILS" | awk '{print $3}')
    OC_FDS=$(echo "$OC_DETAILS" | awk '{print $4}')
    OC_THREADS=$(echo "$OC_DETAILS" | awk '{print $5}')
    OC_CONNS=$(echo "$OC_DETAILS" | awk '{print $6}')
    OC_LLM_CONNS=$(get_llm_api_connections "$OC_PID")
  else
    # Process gone (or /proc entry vanished between checks): zero everything
    OC_PID="" OC_MEM=0 OC_CPU=0 OC_UP=0 OC_FDS=0 OC_THREADS=0 OC_CONNS=0 OC_LLM_CONNS=0
  fi

  # ── Log parsing ──
  # Fetch new lines in main shell (not subshell) so cursor/offset state persists
  get_new_lines "$OC_LOG_PATH" "$OC_SERVICE_NAME" > "$NEW_LINES_FILE"
  # Extract tasks from saved lines
  if [ -s "$NEW_LINES_FILE" ]; then
    extract_tasks "$(cat "$NEW_LINES_FILE")"
  fi
  # parse_logs prints its metrics as one space-separated line (see its header)
  LOG_DATA=$(parse_logs "$OC_LOG_PATH" "$OC_SERVICE_NAME" "$NEW_LINES_FILE")
  L_REQ_PM=$(echo "$LOG_DATA" | awk '{print $1+0}')
  L_ERR_PM=$(echo "$LOG_DATA" | awk '{print $2+0}')
  L_AVG_MS=$(echo "$LOG_DATA" | awk '{print $3+0}')
  L_SESSIONS=$(echo "$LOG_DATA" | awk '{print $4+0}')
  L_TOK_IN=$(echo "$LOG_DATA" | awk '{print $5+0}')
  L_TOK_OUT=$(echo "$LOG_DATA" | awk '{print $6+0}')
  L_REQ_TOTAL=$(echo "$LOG_DATA" | awk '{print $7+0}')
  L_ERR_TOTAL=$(echo "$LOG_DATA" | awk '{print $8+0}')
  # Field 9 onward is the free-text last error message
  L_LAST_ERR=$(echo "$LOG_DATA" | awk '{for(i=9;i<=NF;i++) printf "%s ", $i; print ""}' | sed 's/ *$//')

  # ── OpenClaw API probe ──
  # Pull individual scalar fields out of the probe JSON with grep
  # (no jq dependency on this path)
  OC_API=$(probe_openclaw_api)
  API_LATENCY=$(echo "$OC_API" | grep -o '"latencyMs"[[:space:]]*:[[:space:]]*[0-9.]*' | head -1 | grep -o '[0-9.]*$' || echo "")
  API_TOKENS=$(echo "$OC_API" | grep -o '"tokensPerMin"[[:space:]]*:[[:space:]]*[0-9.]*' | head -1 | grep -o '[0-9.]*$' || echo "")
  API_SUCCESS=$(echo "$OC_API" | grep -o '"successRate"[[:space:]]*:[[:space:]]*[0-9.]*' | head -1 | grep -o '[0-9.]*$' || echo "")
  API_JOBS=$(echo "$OC_API" | grep -o '"jobsRunning"[[:space:]]*:[[:space:]]*[0-9]*' | head -1 | grep -o '[0-9]*$' || echo "")
  API_COST=$(echo "$OC_API" | grep -o '"costToday"[[:space:]]*:[[:space:]]*[0-9.]*' | head -1 | grep -o '[0-9.]*$' || echo "")
  API_MODEL=$(echo "$OC_API" | grep -o '"model"[[:space:]]*:[[:space:]]*"[^"]*"' | head -1 | grep -o '"[^"]*"$' | tr -d '"' || echo "")
  API_VERSION=$(echo "$OC_API" | grep -o '"version"[[:space:]]*:[[:space:]]*"[^"]*"' | head -1 | grep -o '"[^"]*"$' | tr -d '"' || echo "")

  # Prefer values reported live by the API, fall back to discovery results
  FINAL_MODEL="${API_MODEL:-$OC_MODEL}"
  FINAL_VERSION="${API_VERSION:-$OC_VERSION}"

  # Determine status (currently always "healthy"; kept as a variable so a
  # degraded/unhealthy classification can be added here)
  STATUS="healthy"

  # Build and send telemetry; capture only the HTTP status code
  build_payload "$STATUS"

  HTTP_CODE=$(curl -sS -o /dev/null -w "%{http_code}" --connect-timeout 10 --max-time 15 \
    -X POST "$API/api/dashboard/telemetry" \
    -H "Authorization: Bearer $TOKEN" \
    -H "Content-Type: application/json" \
    -d @"$PAYLOAD_FILE" 2>/dev/null) || HTTP_CODE="000"

  # ── Extract tasks from file log (JSON format) ──
  extract_file_log_tasks "$OC_FILE_LOG"

  # ── Extract tasks from session JSONL files (dashboard gateway messages) ──
  extract_session_tasks "$OC_SESSIONS_DIR"

  NOW=$(date +"%H:%M:%S")

  if [ "$HTTP_CODE" = "200" ]; then
    # ── Success: reset all counters, send tasks ──
    echo "[clawpulse][$NOW] OK  cpu=$CPU% mem=$MEM% disk=$DISK% load=$LOAD1 oc_pid=${OC_PID:-none} req/m=$L_REQ_PM"
    AUTH_FAIL_COUNT=0
    CONN_FAIL_COUNT=0
    CURRENT_BACKOFF="$INTERVAL"

    # Only send tasks when telemetry succeeded (server is reachable)
    for tf in "$TASKS_FILE" "$FILE_TASKS_FILE" "$SESSION_TASKS_FILE"; do
      if [ -f "$tf" ] && [ -s "$tf" ]; then
        TF_SIZE=$(stat -c%s "$tf" 2>/dev/null || echo 0)
        # Skip empty arrays: "[]" plus newline is 3 bytes
        if [ "$TF_SIZE" -gt 3 ]; then
          curl -sS -o /dev/null --connect-timeout 10 --max-time 15 \
            -X POST "$API/api/dashboard/tasks" \
            -H "Authorization: Bearer $TOKEN" \
            -H "Content-Type: application/json" \
            -d @"$tf" 2>/dev/null || true
        fi
      fi
    done

    sleep "$INTERVAL"

  elif [ "$HTTP_CODE" = "401" ] || [ "$HTTP_CODE" = "410" ]; then
    # ── Token invalid or instance deleted ──
    AUTH_FAIL_COUNT=$(( AUTH_FAIL_COUNT + 1 ))
    echo "[clawpulse][$NOW] ERROR: invalid token ($AUTH_FAIL_COUNT/10)"
    if [ "$AUTH_FAIL_COUNT" -ge 10 ]; then
      echo "[clawpulse][$NOW] FATAL: token rejected 10 times — instance was likely deleted. Stopping agent."
      echo "[clawpulse] To re-register, create a new instance on clawpulse.org and re-run the install command."
      # Disable so systemd's Restart=always doesn't resurrect a dead token
      systemctl disable clawpulse-agent.service 2>/dev/null || true
      exit 1
    fi
    sleep "$INTERVAL"

  else
    # ── Server unreachable (000=timeout/dns, 5xx=server error, etc.) ──
    CONN_FAIL_COUNT=$(( CONN_FAIL_COUNT + 1 ))
    echo "[clawpulse][$NOW] WARN: server unreachable (HTTP $HTTP_CODE, attempt $CONN_FAIL_COUNT, next retry in ${CURRENT_BACKOFF}s)"

    sleep "$CURRENT_BACKOFF"

    # Exponential backoff: double the wait, cap at MAX_BACKOFF
    CURRENT_BACKOFF=$(( CURRENT_BACKOFF * 2 ))
    [ "$CURRENT_BACKOFF" -gt "$MAX_BACKOFF" ] && CURRENT_BACKOFF="$MAX_BACKOFF"
  fi

done
AGENT_EOF

chmod +x "$AGENT_BIN"

# Record the connection settings for reference/debugging.
# NOTE(review): the systemd unit below passes these values as argv, and the
# visible agent code reads $1..$4 — this config file does not appear to be
# sourced anywhere in view; confirm before relying on edits to it.
cat > "$AGENT_DIR/config" << CONF_EOF
TOKEN=$TOKEN
API=$API
INTERVAL=$INTERVAL
OPENCLAW_PORT=$OPENCLAW_PORT
CONF_EOF
# The token is a secret — restrict the file to root.
chmod 600 "$AGENT_DIR/config"

# Install the systemd unit. Settings are baked in as positional arguments on
# the ExecStart line; Restart=always keeps the agent alive across crashes.
cat > /etc/systemd/system/clawpulse-agent.service << SVC_EOF
[Unit]
Description=ClawPulse Telemetry Agent v2
After=network-online.target
Wants=network-online.target

[Service]
Type=simple
Environment=HOME=/root
ExecStart=$AGENT_BIN $TOKEN $API $INTERVAL $OPENCLAW_PORT
Restart=always
RestartSec=10
StandardOutput=journal
StandardError=journal
SyslogIdentifier=clawpulse

[Install]
WantedBy=multi-user.target
SVC_EOF

# Activate: reload unit files, enable at boot, and (re)start now — restart
# rather than start so re-running the installer picks up a new token.
systemctl daemon-reload
systemctl enable clawpulse-agent.service
systemctl restart clawpulse-agent.service

echo ""
echo "[clawpulse] Agent v2 installed and started!"
echo ""
echo "  Status:   sudo systemctl status clawpulse-agent"
echo "  Logs:     sudo journalctl -u clawpulse-agent -f"
echo "  Stop:     sudo systemctl stop clawpulse-agent"
echo "  Uninstall: sudo systemctl disable clawpulse-agent && sudo rm -rf /opt/clawpulse /etc/systemd/system/clawpulse-agent.service && sudo systemctl daemon-reload"
echo ""