frankenstallm / source /scripts /openclaw-watchdog.sh
pathcosmos's picture
Upload folder using huggingface_hub (#17)
48ecd01
#!/usr/bin/env bash
#
# openclaw-watchdog.sh — OpenClaw Gateway health check + automatic restart
# Register in crontab to run every minute
#
# Usage:
# */1 * * * * /PROJECT/0325120031_A/ghong/taketimes/llm-bang/scripts/openclaw-watchdog.sh
#
# Change history:
# 2026-03-01 Switched the network check from ICMP to HTTP (for ICMP-blocked environments)
# Multi-endpoint fallback, added gateway HTTP response check
# Detached execution via setsid, more detailed logging
set -e -u -o pipefail

# ── Settings ──────────────────────────────────────────
RNTIER_HOME="REDACTED_RNTIER_PATH"
OPENCLAW_BIN="$RNTIER_HOME/.npm-global/bin/openclaw"

# Address the gateway is probed on, and the port it is started with.
GATEWAY_HOST="127.0.0.1"
GATEWAY_PORT=18789

# State files the watchdog reads and writes under /tmp.
PID_FILE="/tmp/openclaw-gateway.pid"
LAST_RESTART_FILE="/tmp/openclaw-last-restart"
CONSECUTIVE_FAIL_FILE="/tmp/openclaw-consecutive-fails"

# Log locations and the size threshold for rotation.
LOG_DIR="/tmp/openclaw"
LOG_FILE="$LOG_DIR/watchdog.log"
GATEWAY_LOG="$LOG_DIR/gateway.log"
MAX_LOG_SIZE=10485760 # bytes: 10 * 1024 * 1024 (10MB rotation threshold)

# Seconds after a restart during which another restart is not attempted.
RESTART_COOLDOWN=120

# Environment variables — so that openclaw can find its config.
PATH="$RNTIER_HOME/.npm-global/bin:/usr/bin:/usr/local/bin:/bin:$PATH"
HOME="/home/ghong"
OPENCLAW_STATE_DIR="$RNTIER_HOME/.openclaw"
OPENCLAW_CONFIG_PATH="$RNTIER_HOME/.openclaw/openclaw.json"
export PATH HOME OPENCLAW_STATE_DIR OPENCLAW_CONFIG_PATH

# The log directory must exist before the first log line is written.
mkdir -p "$LOG_DIR"

# ── Functions ─────────────────────────────────────────
# Append one timestamped line to the watchdog log.
# Globals:   LOG_FILE (read; appended to)
# Arguments: message words, joined with single spaces via "$*"
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" >> "$LOG_FILE"
}
# Rotate a log file once it grows beyond MAX_LOG_SIZE bytes.
#
# The file is copied to "<file>.old" and then truncated in place rather than
# renamed. A process that still has the log open for appending (the gateway is
# started with ">> $GATEWAY_LOG") keeps writing to the same inode, so a rename
# would leave it writing to "<file>.old" forever; truncating in place keeps
# its output going to <file>. Lines written between the copy and the
# truncation can be lost.
#
# Globals:   MAX_LOG_SIZE (read)
# Arguments: $1 - path of the log file to rotate
# Returns:   0 always; a failed rotation is logged and otherwise ignored so
#            that it cannot abort the watchdog under `set -e`
rotate_log() {
  local file="$1"
  local size

  # Nothing to do for a missing file.
  [[ -f "$file" ]] || return 0

  # An unreadable size is treated as 0, i.e. "do not rotate".
  size=$(stat -c%s "$file" 2>/dev/null || echo 0)
  (( size > MAX_LOG_SIZE )) || return 0

  if cp -f -- "$file" "${file}.old" && : > "$file"; then
    log "Log rotated: $file"
  else
    log "WARN: Log rotation failed: $file"
  fi
  return 0
}
# Check that the gateway answers HTTP locally, using one of its real endpoints.
# Globals:   GATEWAY_HOST, GATEWAY_PORT (read)
# Returns:   curl's exit status (0 when the request succeeds within 5 seconds);
#            1 when curl is not available
check_gateway_http() {
  command -v curl &>/dev/null || return 1

  local probe_url="http://${GATEWAY_HOST}:${GATEWAY_PORT}/__openclaw__/canvas/"
  curl -sf --max-time 5 -o /dev/null "$probe_url" 2>/dev/null
}
# Report whether something is listening on the gateway port.
# Uses `ss` when it is installed; otherwise attempts a TCP connect through
# bash's /dev/tcp pseudo-device.
# Globals:   GATEWAY_HOST, GATEWAY_PORT (read)
# Returns:   0 when the port is listening / accepts a connection, non-zero otherwise
is_port_open() {
  if ! command -v ss &>/dev/null; then
    # No ss: fall back to a plain connect attempt inside a subshell.
    (echo > "/dev/tcp/${GATEWAY_HOST}/${GATEWAY_PORT}") 2>/dev/null
    return
  fi

  # -t TCP, -l listening sockets, -n numeric, -H no header line.
  ss -tlnH "sport = :${GATEWAY_PORT}" 2>/dev/null | grep -q "$GATEWAY_PORT"
}
# Report whether a gateway process appears to be running.
# First trusts the PID recorded in PID_FILE (if that PID can be signalled),
# then falls back to searching command lines with pgrep.
# Globals:   PID_FILE (read)
# Returns:   0 when a matching process is found, non-zero otherwise
is_process_alive() {
  local recorded_pid

  if [[ -f "$PID_FILE" ]]; then
    recorded_pid=$(cat "$PID_FILE" 2>/dev/null)
    # kill -0 sends no signal; it only tests that the PID can be signalled.
    [[ -n "$recorded_pid" ]] && kill -0 "$recorded_pid" 2>/dev/null && return 0
  fi

  pgrep -f "openclaw.*gateway" >/dev/null 2>&1
}
# Network check — based on DNS resolution.
# On this server both ICMP (ping) and outbound HTTPS (curl) are blocked.
# DNS resolution still works, however, and the gateway (Node.js) can
# communicate via long-polling. So whether DNS resolution succeeds is used to
# decide whether "the network itself is alive".
# Returns:   0 as soon as any probe succeeds, 1 when all of them fail
check_network() {
  local dns_name

  # Method 1: getent (fastest and lightest)
  if command -v getent &>/dev/null; then
    for dns_name in api.telegram.org api.anthropic.com; do
      getent hosts "$dns_name" >/dev/null 2>&1 && return 0
    done
  fi

  # Method 2: nslookup
  if command -v nslookup &>/dev/null \
    && nslookup -timeout=5 api.telegram.org >/dev/null 2>&1; then
    return 0
  fi

  # Method 3: probe port 53 of the DNS server (168.126.63.1) through /dev/tcp
  if (echo > /dev/tcp/168.126.63.1/53) 2>/dev/null; then
    return 0
  fi

  return 1
}
# Report whether the last restart happened less than RESTART_COOLDOWN seconds ago.
#
# LAST_RESTART_FILE is expected to hold a Unix timestamp written by
# `date +%s`. A missing, unreadable, empty or non-numeric file is treated as
# "no restart recorded": without this validation an empty file triggers an
# arithmetic syntax error and arbitrary text is evaluated as a variable name,
# which aborts the script under `set -u`.
#
# Globals:   LAST_RESTART_FILE, RESTART_COOLDOWN (read)
# Returns:   0 while the cooldown is active, 1 otherwise
cooldown_active() {
  local last_restart now

  [[ -f "$LAST_RESTART_FILE" ]] || return 1
  last_restart=$(cat "$LAST_RESTART_FILE" 2>/dev/null) || return 1

  # Accept plain decimal digits only.
  [[ "$last_restart" =~ ^[0-9]+$ ]] || return 1

  now=$(date +%s)
  # 10# forces base 10 so a value with leading zeros is not parsed as octal.
  (( now - 10#$last_restart < RESTART_COOLDOWN ))
}
# Print the stored count of consecutive failed HTTP checks.
# Globals:   CONSECUTIVE_FAIL_FILE (read)
# Outputs:   the file's contents on stdout; "0" when the file does not exist
#            or cannot be read
get_consecutive_fails() {
  [[ -f "$CONSECUTIVE_FAIL_FILE" ]] || { echo 0; return; }
  cat "$CONSECUTIVE_FAIL_FILE" 2>/dev/null || echo 0
}
# Store the count of consecutive failed HTTP checks, replacing any old value.
# Globals:   CONSECUTIVE_FAIL_FILE (written)
# Arguments: $1 - the new count
set_consecutive_fails() {
  local count="$1"
  echo "$count" > "$CONSECUTIVE_FAIL_FILE"
}
# Stop any leftover gateway processes and launch a fresh gateway.
#
# Leftover processes (matched by command line with pgrep) are first sent
# SIGTERM so they get a chance to shut down cleanly; only those still present
# after a bounded grace period are sent SIGKILL.
#
# Globals:   GATEWAY_PORT, OPENCLAW_BIN, GATEWAY_LOG (read)
#            PID_FILE, LAST_RESTART_FILE (written)
# Returns:   0 when the launched PID is still alive after the startup wait,
#            1 when it has already exited (the tail of gateway.log is then
#            copied into the watchdog log)
start_gateway() {
  local old_pids pid survivors attempt new_pid line
  local -r grace_checks=5 # liveness polls (about 1s apart) before SIGKILL

  log "ACTION: Starting OpenClaw gateway on port $GATEWAY_PORT..."

  # Clean up leftover gateway processes
  old_pids=$(pgrep -f "openclaw.*gateway" 2>/dev/null || true)
  if [[ -n "$old_pids" ]]; then
    log "ACTION: Killing stale gateway processes: $old_pids"
    # The PID list is intentionally word-split (one argument per PID).
    # shellcheck disable=SC2086
    kill -TERM $old_pids 2>/dev/null || true

    survivors=""
    for (( attempt = 0; attempt < grace_checks; attempt++ )); do
      survivors=""
      for pid in $old_pids; do
        if kill -0 "$pid" 2>/dev/null; then
          survivors+=" $pid"
        fi
      done
      [[ -n "$survivors" ]] || break
      sleep 1
    done

    if [[ -n "$survivors" ]]; then
      log "ACTION: Force-killing gateway processes that ignored SIGTERM:$survivors"
      # shellcheck disable=SC2086
      kill -KILL $survivors 2>/dev/null || true
      sleep 1
    fi
  fi

  # Start the gateway — fully detached via setsid (so signals sent to the
  # parent process are not propagated to it)
  setsid nohup "$OPENCLAW_BIN" gateway run \
    --port "$GATEWAY_PORT" \
    --bind loopback \
    >> "$GATEWAY_LOG" 2>&1 < /dev/null &
  new_pid=$!
  echo "$new_pid" > "$PID_FILE"
  # Recorded at launch time; cooldown_active measures from this timestamp.
  date +%s > "$LAST_RESTART_FILE"
  log "ACTION: Gateway launched with PID $new_pid (setsid)"

  # Wait 8 seconds before checking (Telegram provider initialization takes time)
  sleep 8

  if ! kill -0 "$new_pid" 2>/dev/null; then
    log "ERROR: Gateway PID $new_pid died immediately after start"
    log "ERROR: Last 10 lines of gateway.log:"
    # IFS= keeps each line's leading whitespace intact in the quoted output.
    tail -n 10 "$GATEWAY_LOG" 2>/dev/null | while IFS= read -r line; do
      log " | $line"
    done
    return 1
  fi

  log "OK: Gateway PID $new_pid is alive after startup"
  if is_port_open; then
    log "OK: Port $GATEWAY_PORT is listening"
  else
    log "WARN: Gateway alive but port $GATEWAY_PORT not yet listening (may need more time)"
  fi
  return 0
}
# ── Main logic ────────────────────────────────────────
# One watchdog pass: rotate logs, probe the gateway (process, port, HTTP),
# and restart it only when it looks unhealthy, the network check passes and
# the restart cooldown has expired.
rotate_log "$LOG_FILE"
rotate_log "$GATEWAY_LOG"
# Clean up old openclaw log files (older than 7 days); failures are ignored
find "$LOG_DIR" -name "openclaw-*.log" -mtime +7 -delete 2>/dev/null || true
# 1) Run the process + port checks first (no network check is needed if the
#    gateway is alive). The flags hold the command names `true`/`false` and
#    are later executed directly as conditions.
process_ok=false
port_ok=false
http_ok=false
if is_process_alive; then
process_ok=true
fi
if is_port_open; then
port_ok=true
fi
# The HTTP probe is only attempted when the port is open.
if $port_ok && check_gateway_http; then
http_ok=true
fi
# 2) If the gateway is healthy, exit right away
if $process_ok && $port_ok; then
if $http_ok; then
# Fully healthy: reset the failure counter
set_consecutive_fails 0
exit 0
fi
# Process + port are OK but there is no HTTP response → possibly hung.
# Count the failure and only fall through to a restart on the 3rd
# consecutive one.
fails=$(get_consecutive_fails)
fails=$((fails + 1))
set_consecutive_fails "$fails"
log "WARN: Process alive, port open, but HTTP not responding (consecutive: $fails)"
if [[ $fails -lt 3 ]]; then
log "INFO: Waiting more cycles before restart (transient check, $fails/3)"
exit 0
fi
log "WARN: HTTP unresponsive for $fails consecutive checks β€” proceeding to restart"
fi
# 3) The gateway is unhealthy — log which state it is in, then decide whether
#    to restart after the network check
if $process_ok && ! $port_ok; then
log "WARN: Process alive but port $GATEWAY_PORT not listening. Possible hung state."
fi
if ! $process_ok && ! $port_ok; then
log "WARN: Gateway is completely down (no process, no port)."
fi
if ! $process_ok && $port_ok; then
log "WARN: No known gateway process but port $GATEWAY_PORT is in use. Stale process?"
fi
# 4) Network check — DNS based (only reached when the gateway is unhealthy).
#    No restart is attempted while the network check fails.
if ! check_network; then
log "WARN: Network unreachable (DNS resolution failed). Skipping gateway restart."
exit 0
fi
# 5) Cooldown check — skip if a restart was attempted too recently
if cooldown_active; then
log "INFO: Cooldown active (last restart < ${RESTART_COOLDOWN}s ago). Skipping."
exit 0
fi
# 6) Restart; the failure counter is reset only on success, and a failed
#    restart makes the script exit with status 1
log "ACTION: Attempting gateway restart..."
if start_gateway; then
log "OK: Gateway restart SUCCESS"
set_consecutive_fails 0
else
log "ERROR: Gateway restart FAILED"
exit 1
fi