#!/usr/bin/env bash
#
# Watchdog for the OpenClaw gateway.
#
# Each run probes the gateway (process, listening port, HTTP endpoint) and,
# when it looks unhealthy, relaunches it -- subject to a network check and a
# restart cooldown. Activity is appended to ${LOG_DIR}/watchdog.log.
# NOTE(review): presumably invoked periodically (cron / systemd timer) -- confirm.

set -euo pipefail

# --- Configuration -----------------------------------------------------------
RNTIER_HOME="REDACTED_RNTIER_PATH"
OPENCLAW_BIN="${RNTIER_HOME}/.npm-global/bin/openclaw"
GATEWAY_PORT=18789
GATEWAY_HOST="127.0.0.1"
PID_FILE="/tmp/openclaw-gateway.pid"
LOG_DIR="/tmp/openclaw"
LOG_FILE="${LOG_DIR}/watchdog.log"
GATEWAY_LOG="${LOG_DIR}/gateway.log"
# Rotation threshold in bytes (10 MiB).
MAX_LOG_SIZE=$((10 * 1024 * 1024))
# Minimum number of seconds between two restart attempts.
RESTART_COOLDOWN=120
# State files: epoch of the last restart / count of consecutive HTTP failures.
LAST_RESTART_FILE="/tmp/openclaw-last-restart"
CONSECUTIVE_FAIL_FILE="/tmp/openclaw-consecutive-fails"

# --- Environment handed to the gateway process -------------------------------
# ${PATH:+:${PATH}} appends the inherited PATH only when it is set and
# non-empty: an unset PATH would abort the script under `set -u`, and an empty
# one would leave a trailing ':' that puts the current directory on the
# search path.
export PATH="${RNTIER_HOME}/.npm-global/bin:/usr/bin:/usr/local/bin:/bin${PATH:+:${PATH}}"
# NOTE(review): HOME is hard-coded separately from RNTIER_HOME -- confirm the
# two are meant to differ.
export HOME="/home/ghong"
export OPENCLAW_STATE_DIR="${RNTIER_HOME}/.openclaw"
export OPENCLAW_CONFIG_PATH="${RNTIER_HOME}/.openclaw/openclaw.json"

# The log directory must exist before the first log() call.
mkdir -p "$LOG_DIR"

#######################################
# Append one timestamped line to the watchdog log.
# Globals:   LOG_FILE (read) - file the line is appended to
# Arguments: $* - message words; they are joined into a single line
# Outputs:   "[YYYY-MM-DD HH:MM:SS] message" appended to LOG_FILE
#######################################
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" >> "$LOG_FILE"
}

#######################################
# Rotate a log file once it grows beyond MAX_LOG_SIZE bytes.
#
# Rotation is copy-then-truncate rather than rename: a long-running writer
# that opened the file in append mode keeps its descriptor on the same inode,
# so after a rename it would keep writing into "<file>.old" and the size cap
# would never apply again. Truncating in place keeps such writers on the live
# file. Lines written between the copy and the truncate can be lost; that is
# accepted for these logs.
#
# Globals:   MAX_LOG_SIZE (read)
# Arguments: $1 - path of the log file
# Outputs:   one line via log() (rotation done, or rotation failed)
# Returns:   0 always; a failed rotation is logged, not fatal
#######################################
rotate_log() {
  local file="$1"
  local size

  [[ -f "$file" ]] || return 0

  size=$(stat -c%s "$file" 2>/dev/null || echo 0)
  if (( size > MAX_LOG_SIZE )); then
    if cp -f -- "$file" "${file}.old" && : > "$file"; then
      log "Log rotated: $file"
    else
      log "WARN: Log rotation failed: $file"
    fi
  fi
  return 0
}

#######################################
# Probe the gateway's HTTP endpoint with curl.
# Globals:   GATEWAY_HOST, GATEWAY_PORT (read)
# Returns:   curl's exit status (0 when the request succeeded within 5 s);
#            1 when curl is not available
#######################################
check_gateway_http() {
  # No curl, no probe: report failure.
  command -v curl &>/dev/null || return 1

  local url="http://${GATEWAY_HOST}:${GATEWAY_PORT}/__openclaw__/canvas/"
  # -f turns HTTP error responses into a non-zero exit status.
  curl -sf --max-time 5 -o /dev/null "$url" 2>/dev/null
}

#######################################
# Check whether something is listening on the gateway port.
#
# Prefers `ss`; its output is captured instead of piped into `grep -q`, so
# `pipefail` cannot turn an early grep exit (SIGPIPE on ss) into a false
# "closed". With -H and the sport filter, any output line is a listener on
# that port. If ss is missing or exits non-zero (for example a version that
# does not accept -H), fall back to a TCP connect to the gateway address.
#
# Globals:   GATEWAY_HOST, GATEWAY_PORT (read)
# Returns:   0 if a listener was found / the connect succeeded, 1 otherwise
#######################################
is_port_open() {
  local listeners

  if command -v ss &>/dev/null \
    && listeners=$(ss -tlnH "sport = :${GATEWAY_PORT}" 2>/dev/null); then
    if [[ -n "$listeners" ]]; then
      return 0
    fi
    return 1
  fi

  # Subshell so the descriptor opened on /dev/tcp is closed again right away.
  if (echo > /dev/tcp/"$GATEWAY_HOST"/"$GATEWAY_PORT") 2>/dev/null; then
    return 0
  fi
  return 1
}

#######################################
# Decide whether a gateway process is currently running.
#
# 1. If PID_FILE holds a positive integer and that PID accepts signal 0,
#    the gateway counts as alive. The content is validated first because
#    `kill -0 0` and negative values address process groups and would
#    succeed regardless of the gateway's state.
# 2. Otherwise look for any process whose command line matches
#    "openclaw.*gateway", ignoring this script's own PID so the watchdog
#    cannot mistake itself for the gateway.
#
# Globals:   PID_FILE (read)
# Returns:   0 if a gateway process was found, 1 otherwise
#######################################
is_process_alive() {
  local pid candidate

  if [[ -f "$PID_FILE" ]]; then
    pid=$(cat "$PID_FILE" 2>/dev/null) || pid=""
    if [[ "$pid" =~ ^[1-9][0-9]*$ ]] && kill -0 "$pid" 2>/dev/null; then
      return 0
    fi
  fi

  while IFS= read -r candidate; do
    if [[ "$candidate" =~ ^[0-9]+$ && "$candidate" != "$$" ]]; then
      return 0
    fi
  done < <(pgrep -f "openclaw.*gateway" 2>/dev/null)
  return 1
}

#######################################
# Report whether the network looks usable.
#
# Tries, in order, and succeeds on the first hit:
#   1. getent:   resolve api.telegram.org, then api.anthropic.com
#   2. nslookup: resolve api.telegram.org (5 s timeout)
#   3. a TCP connect to 168.126.63.1 port 53
#      NOTE(review): presumably a public DNS resolver -- confirm.
# The TCP probe runs under `timeout 5` when available; a bare /dev/tcp
# connect has no timeout of its own and could otherwise block the watchdog
# for the kernel's connect timeout.
#
# Returns:   0 if any probe succeeded, 1 otherwise
#######################################
check_network() {
  local host

  if command -v getent &>/dev/null; then
    for host in api.telegram.org api.anthropic.com; do
      if getent hosts "$host" >/dev/null 2>&1; then
        return 0
      fi
    done
  fi

  if command -v nslookup &>/dev/null; then
    if nslookup -timeout=5 api.telegram.org >/dev/null 2>&1; then
      return 0
    fi
  fi

  if command -v timeout &>/dev/null; then
    if timeout 5 bash -c 'echo > /dev/tcp/168.126.63.1/53' 2>/dev/null; then
      return 0
    fi
  elif (echo > /dev/tcp/168.126.63.1/53) 2>/dev/null; then
    return 0
  fi
  return 1
}

#######################################
# Tell whether the last restart happened less than RESTART_COOLDOWN seconds
# ago.
#
# The timestamp is read from a file, so it is validated as a plain decimal
# number before it reaches arithmetic: empty content would be a syntax
# error, non-numeric content would abort under `set -u`, and crafted content
# would be evaluated by the shell. A missing, unreadable or malformed
# timestamp means "no cooldown".
#
# Globals:   LAST_RESTART_FILE, RESTART_COOLDOWN (read)
# Returns:   0 if the cooldown is still active, 1 otherwise
#######################################
cooldown_active() {
  local last_restart now elapsed

  [[ -f "$LAST_RESTART_FILE" ]] || return 1
  last_restart=$(cat "$LAST_RESTART_FILE" 2>/dev/null) || return 1
  [[ "$last_restart" =~ ^[0-9]+$ ]] || return 1

  now=$(date +%s)
  # 10# forces base 10 so a leading zero is not read as octal.
  elapsed=$(( now - 10#$last_restart ))
  if (( elapsed < RESTART_COOLDOWN )); then
    return 0
  fi
  return 1
}

#######################################
# Print the stored count of consecutive failed health checks.
#
# Callers use the result in arithmetic, so the output is always a
# non-negative decimal integer: a missing, unreadable, empty or malformed
# counter file yields 0.
#
# Globals:   CONSECUTIVE_FAIL_FILE (read)
# Outputs:   the count on stdout
#######################################
get_consecutive_fails() {
  local count=""

  if [[ -f "$CONSECUTIVE_FAIL_FILE" ]]; then
    count=$(cat "$CONSECUTIVE_FAIL_FILE" 2>/dev/null) || count=""
  fi

  if [[ "$count" =~ ^[0-9]+$ ]]; then
    # 10# normalises values such as "08" that would otherwise be read as octal.
    printf '%s\n' "$(( 10#$count ))"
  else
    printf '0\n'
  fi
}

#######################################
# Store the count of consecutive failed health checks.
# Globals:   CONSECUTIVE_FAIL_FILE (written, previous content replaced)
# Arguments: $1 - the new count
#######################################
set_consecutive_fails() {
  local count="$1"
  echo "$count" > "$CONSECUTIVE_FAIL_FILE"
}

#######################################
# (Re)start the gateway as a detached background process.
#
# Steps:
#   1. Stop leftover processes whose command line matches
#      "openclaw.*gateway": SIGTERM first, SIGKILL only for those still
#      alive two seconds later. This script's own PID is excluded so the
#      watchdog cannot kill itself when its command line matches.
#   2. Launch "$OPENCLAW_BIN gateway run" under setsid/nohup with output
#      appended to GATEWAY_LOG; record the PID and the restart time.
#   3. After 8 s, check that the PID is still alive and log the port state.
#
# Globals:   OPENCLAW_BIN, GATEWAY_PORT, GATEWAY_LOG (read)
#            PID_FILE, LAST_RESTART_FILE (written)
# Returns:   0 if the launched PID is alive after the wait, 1 if it died
#######################################
start_gateway() {
  log "ACTION: Starting OpenClaw gateway on port $GATEWAY_PORT..."

  local -a stale_pids=()
  local candidate line
  local forced=false

  while IFS= read -r candidate; do
    if [[ "$candidate" =~ ^[0-9]+$ && "$candidate" != "$$" ]]; then
      stale_pids+=("$candidate")
    fi
  done < <(pgrep -f "openclaw.*gateway" 2>/dev/null || true)

  if (( ${#stale_pids[@]} > 0 )); then
    log "ACTION: Killing stale gateway processes: ${stale_pids[*]}"
    # Best effort: a PID may already be gone by the time it is signalled.
    kill -TERM "${stale_pids[@]}" 2>/dev/null || true
    sleep 2
    for candidate in "${stale_pids[@]}"; do
      if kill -0 "$candidate" 2>/dev/null; then
        log "ACTION: PID $candidate ignored SIGTERM; sending SIGKILL"
        kill -KILL "$candidate" 2>/dev/null || true
        forced=true
      fi
    done
    if [[ "$forced" == true ]]; then
      # Give the kernel a moment to tear the killed processes down.
      sleep 1
    fi
  fi

  # setsid + nohup + stdin from /dev/null detach the gateway from this
  # script's session so it outlives the watchdog run.
  setsid nohup "$OPENCLAW_BIN" gateway run \
    --port "$GATEWAY_PORT" \
    --bind loopback \
    >> "$GATEWAY_LOG" 2>&1 < /dev/null &

  local new_pid=$!
  printf '%s\n' "$new_pid" > "$PID_FILE"
  date +%s > "$LAST_RESTART_FILE"

  log "ACTION: Gateway launched with PID $new_pid (setsid)"

  # Startup grace period before judging the launch.
  sleep 8
  if kill -0 "$new_pid" 2>/dev/null; then
    log "OK: Gateway PID $new_pid is alive after startup"
    if is_port_open; then
      log "OK: Port $GATEWAY_PORT is listening"
    else
      log "WARN: Gateway alive but port $GATEWAY_PORT not yet listening (may need more time)"
    fi
    return 0
  fi

  log "ERROR: Gateway PID $new_pid died immediately after start"
  log "ERROR: Last 10 lines of gateway.log:"
  # IFS= keeps leading whitespace; the || clause keeps a final unterminated line.
  while IFS= read -r line || [[ -n "$line" ]]; do
    log " | $line"
  done < <(tail -n 10 "$GATEWAY_LOG" 2>/dev/null)
  return 1
}

# --- Housekeeping -------------------------------------------------------------
# Rotate the watchdog log first, then the gateway log.
for managed_log in "$LOG_FILE" "$GATEWAY_LOG"; do
  rotate_log "$managed_log"
done
unset managed_log

# Best effort: delete openclaw-*.log files under LOG_DIR that were last
# modified more than 7 days ago; a failure here must not stop the watchdog.
find "$LOG_DIR" -name "openclaw-*.log" -mtime +7 -delete 2>/dev/null || true

# --- Health probes -------------------------------------------------------------
# Each flag holds the literal string "true" or "false".
process_ok=false
port_ok=false
http_ok=false

is_process_alive && process_ok=true
is_port_open && port_ok=true

# The HTTP probe only makes sense when something is listening on the port.
if [[ "$port_ok" == true ]] && check_gateway_http; then
  http_ok=true
fi

# --- Decide whether a restart is warranted -------------------------------------
# process_ok / port_ok / http_ok hold the literal strings "true" or "false".
case "${process_ok}:${port_ok}" in
  true:true)
    if [[ "$http_ok" == true ]]; then
      # Fully healthy: clear the failure streak and stop here.
      set_consecutive_fails 0
      exit 0
    fi

    fails=$(get_consecutive_fails)
    # The counter is read back from a file; never feed unvalidated text to
    # arithmetic (non-numeric content would abort under `set -u` or be
    # evaluated by the shell).
    [[ "$fails" =~ ^[0-9]+$ ]] || fails=0
    fails=$(( 10#$fails + 1 ))
    set_consecutive_fails "$fails"
    log "WARN: Process alive, port open, but HTTP not responding (consecutive: $fails)"

    # Tolerate transient HTTP failures: restart only on the third in a row.
    if (( fails < 3 )); then
      log "INFO: Waiting more cycles before restart (transient check, $fails/3)"
      exit 0
    fi
    log "WARN: HTTP unresponsive for $fails consecutive checks - proceeding to restart"
    ;;
  true:false)
    log "WARN: Process alive but port $GATEWAY_PORT not listening. Possible hung state."
    ;;
  false:false)
    log "WARN: Gateway is completely down (no process, no port)."
    ;;
  false:true)
    log "WARN: No known gateway process but port $GATEWAY_PORT is in use. Stale process?"
    ;;
esac

# --- Preconditions for a restart -----------------------------------------------
# Skip the restart while the network looks down.
if ! check_network; then
  log "WARN: Network unreachable (DNS resolution failed). Skipping gateway restart."
  exit 0
fi

# Rate-limit restarts.
if cooldown_active; then
  log "INFO: Cooldown active (last restart < ${RESTART_COOLDOWN}s ago). Skipping."
  exit 0
fi

# --- Restart ---------------------------------------------------------------------
log "ACTION: Attempting gateway restart..."
if start_gateway; then
  log "OK: Gateway restart SUCCESS"
  set_consecutive_fails 0
else
  log "ERROR: Gateway restart FAILED"
  exit 1
fi
|