Spaces:

GGSheng
/

page

Running

File size: 12,995 Bytes

ef72157

#!/usr/bin/env bash
# hf-entrypoint.sh - container entrypoint for HF Spaces
set -euo pipefail

printf '[%s] hf-entrypoint: starting...\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)"

# ============================================
# 0. Save environment variables for later bash sessions
#    (save-env.sh is expected to write them under /etc/profile.d; its
#    contents are not visible from this script)
# ============================================
if [[ ! -x /usr/local/bin/save-env.sh ]]; then
    printf '[%s] hf-entrypoint: warning: save-env.sh not found, skipping env export\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)"
else
    /usr/local/bin/save-env.sh
fi

# Pull the saved environment into this shell when the file exists.
if [[ -f /etc/profile.d/openclaw-env.sh ]]; then
    # shellcheck source=/dev/null
    . /etc/profile.d/openclaw-env.sh
    printf '[%s] hf-entrypoint: loaded environment from /etc/profile.d/openclaw-env.sh\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)"
fi

# ============================================
# 1. Start supervisord (expected to manage cron + openclaw-gateway; its
#    configuration is not part of this script)
# ============================================
echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: starting supervisord..."
mkdir -p /var/run /var/log/supervisor /var/log/hf-entrypoint
/usr/bin/supervisord -c /etc/supervisor/supervisord.conf \
    >> /var/log/hf-entrypoint/supervisord-stdout.log \
    2>> /var/log/hf-entrypoint/supervisord-stderr.log &
SUPERVISORD_PID=$!
echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: supervisord started (pid=$SUPERVISORD_PID)"

# Wait for the pid file, but never forever:
#   - if the launched process exits with a non-zero status, supervisord failed
#     to start and the pid file will never appear -> abort with that status;
#   - if the launched process exits with status 0 it may simply have
#     daemonized, so keep waiting for the pid file;
#   - after SUPERVISORD_READY_TIMEOUT seconds (default 60) warn and continue.
SUPERVISORD_READY_TIMEOUT="${SUPERVISORD_READY_TIMEOUT:-60}"
_sv_polls=0
_sv_launcher_checked=0
while [[ ! -f /var/run/supervisord.pid ]]; do
    if (( _sv_launcher_checked == 0 )) && ! kill -0 "$SUPERVISORD_PID" 2>/dev/null; then
        # `wait` can report a given child's status only once, hence the flag.
        _sv_launcher_checked=1
        _sv_launcher_rc=0
        wait "$SUPERVISORD_PID" || _sv_launcher_rc=$?
        if (( _sv_launcher_rc != 0 )); then
            echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: ERROR: supervisord exited with status $_sv_launcher_rc before becoming ready (see /var/log/hf-entrypoint/supervisord-stderr.log)" >&2
            exit "$_sv_launcher_rc"
        fi
    fi
    # Each poll sleeps 0.5s, so the timeout in seconds equals two polls each.
    if (( _sv_polls >= SUPERVISORD_READY_TIMEOUT * 2 )); then
        echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: WARNING: /var/run/supervisord.pid did not appear within ${SUPERVISORD_READY_TIMEOUT}s, continuing anyway" >&2
        break
    fi
    sleep 0.5
    _sv_polls=$((_sv_polls + 1))
done
if [[ -f /var/run/supervisord.pid ]]; then
    echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: supervisord is ready"
fi

# 1.0 Start the SSH service and its watchdog (keeps SSH available)
echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: starting SSH service and watchdog..."

# Set the root password when the ROOT_PASSWORD environment variable is provided.
if [[ -n "${ROOT_PASSWORD:-}" ]]; then
    echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: setting root password..."
    # Test the pipeline directly. With `set -e -o pipefail` active, a failing
    # pipeline followed by a separate `$?` check would abort the script before
    # the check ran, so the warning branch could never be reached.
    if printf 'root:%s\n' "$ROOT_PASSWORD" | chpasswd 2>/dev/null; then
        # Make sure the root account is not locked (best effort).
        passwd -u root 2>/dev/null || true
        echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: root password set successfully"
    else
        echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: WARNING: failed to set root password"
    fi
fi

# Create the sshd runtime directories and remove leftover pid files / sockets.
# All steps are best effort: failures are ignored on purpose.
echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: preparing SSH runtime directories..."
mkdir -p /run/sshd /var/run/sshd 2>/dev/null || true
chmod 755 /run/sshd /var/run/sshd 2>/dev/null || true
rm -f /var/run/sshd.pid /var/run/sshd.init.pid /tmp/ssh-* 2>/dev/null || true

# Generate SSH host keys when the RSA host key is missing.
if [[ ! -f "/etc/ssh/ssh_host_rsa_key" ]]; then
    echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: generating SSH host keys..."
    ssh-keygen -A 2>/dev/null || true
fi

# Make sure sshd is running; start it when no sshd process exists yet.
if ! pgrep -x "sshd" > /dev/null 2>&1; then
    # Use the first sshd binary found in the usual locations.
    _sshd_bin=""
    for _sshd_candidate in /usr/sbin/sshd /usr/bin/sshd; do
        if [[ -x "$_sshd_candidate" ]]; then
            _sshd_bin="$_sshd_candidate"
            break
        fi
    done

    if [[ -n "$_sshd_bin" ]]; then
        echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: starting sshd from $_sshd_bin..."
        # A failed launch must not abort the entrypoint (set -e is active):
        # the pgrep check below reports it and the watchdog is meant to retry.
        "$_sshd_bin" || echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: WARNING: $_sshd_bin exited with status $?"
        sleep 2
        if pgrep -x "sshd" > /dev/null 2>&1; then
            echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: sshd started successfully"
            # Lower sshd's OOM score so the OOM killer prefers other processes.
            # The write is wrapped in { } so that a failed redirection is also
            # silenced; success is only logged when at least one write worked.
            _oom_ok=0
            for _pid in $(pgrep -x "sshd" 2>/dev/null); do
                if { printf '%s\n' '-500' > "/proc/$_pid/oom_score_adj"; } 2>/dev/null; then
                    _oom_ok=$((_oom_ok + 1))
                fi
            done
            if (( _oom_ok > 0 )); then
                echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: sshd OOM protection applied (oom_score_adj=-500)"
            else
                echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: WARNING: could not adjust oom_score_adj for sshd"
            fi
            # Ask the kernel not to overcommit memory; report the real outcome
            # because /proc/sys may not be writable from inside a container.
            if { echo "2" > /proc/sys/vm/overcommit_memory; } 2>/dev/null; then
                echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: vm.overcommit_memory=2 set (never overcommit)"
            else
                echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: WARNING: could not set vm.overcommit_memory=2"
            fi
        else
            echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: WARNING: sshd failed to start, will be handled by watchdog"
        fi
    else
        echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: WARNING: sshd executable not found"
    fi
fi

# Launch the SSH watchdog in the background.
# SSH_WATCHDOG_DOCKER_LOG selects where the watchdog's output goes:
#   "true"          -> appended to PID 1's stdout/stderr (/proc/1/fd/1 and /proc/1/fd/2)
#   anything else   -> discarded here (the watchdog presumably keeps its own
#                      log file, /var/log/ssh_watchdog.log; not verifiable
#                      from this script)
# The value defaults to "false" when the variable is unset or empty.
SSH_WATCHDOG_DOCKER_LOG="${SSH_WATCHDOG_DOCKER_LOG:-false}"
if [[ ! -x "/usr/local/bin/ssh_service_watchdog.sh" ]]; then
    printf '[%s] hf-entrypoint: WARNING: ssh_service_watchdog.sh not found\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)"
else
    printf '[%s] hf-entrypoint: starting SSH watchdog (docker_log=%s)...\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)" "$SSH_WATCHDOG_DOCKER_LOG"
    case "$SSH_WATCHDOG_DOCKER_LOG" in
        true)
            # Route output to the container's main stdout/stderr so it stays
            # visible through the container logs even when no terminal can attach.
            nohup /usr/local/bin/ssh_service_watchdog.sh >> /proc/1/fd/1 2>> /proc/1/fd/2 &
            SSH_WATCHDOG_PID=$!
            printf '[%s] hf-entrypoint: SSH watchdog logs are available via: docker logs <container>\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)"
            ;;
        *)
            # Keep the watchdog's output out of the container logs.
            nohup /usr/local/bin/ssh_service_watchdog.sh > /dev/null 2>&1 &
            SSH_WATCHDOG_PID=$!
            ;;
    esac
    printf '[%s] hf-entrypoint: SSH watchdog started (pid=%s)\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)" "$SSH_WATCHDOG_PID"
fi

# 1.1 Start BT Panel before waiting for the restore
#     (NOTE(review): the `bt` commands run in the foreground here; the restore
#     itself presumably runs under supervisord in the meantime - confirm)
# ============================================
printf '[%s] hf-entrypoint: starting BT Panel...\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)"
if [[ ! -f "/www/server/panel/default.pl" ]]; then
    printf '[%s] hf-entrypoint: BT Panel not installed, skipping\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)"
else
    printf '[%s] hf-entrypoint: BT Panel is installed, starting...\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)"
    # Both commands are best effort: their stderr is dropped and failures ignored.
    bt start 2>/dev/null || true
    bt default 2>/dev/null || true
fi

# 1.2 Wait for openclaw-gateway to finish its restore
# ============================================
printf '[%s] hf-entrypoint: waiting for openclaw-gateway to complete restore...\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)"

# --- Paths ---
RESTORE_COMPLETED_FILE="/tmp/openclaw-restore-completed"   # marker file checked by the wait loop
OPENCLAW_LOG_FILE="/var/log/hf-entrypoint/openclaw-gateway-stdout.log"
RESTORE_LOG_FILE="/var/log/openclaw/restore.log"           # log echoed while waiting

# --- Mutable counters used by the wait loop ---
WAITED=0               # seconds waited so far
LAST_RESTORE_LINE=0    # restore-log lines already echoed
LAST_LOG_SIZE=0        # restore-log size (bytes) at the last progress check
TIME_NO_NEW_LOG=0      # seconds without restore-log growth

# --- Tunables (seconds) ---
PROGRESS_CHECK_INTERVAL=20
SLOW_WARN_THRESHOLD=900   # 15 minutes: start warning that the restore is slow
MAX_WAIT_TIMEOUT=3600     # 1 hour: stop waiting and continue startup
IDLE_WARN_THRESHOLD=120   # intended idle-log alert threshold

# Make sure the directory that holds the restore log exists.
mkdir -p "${RESTORE_LOG_FILE%/*}"

#######################################
# Echo the restore-log lines that appeared since the previous call.
# Globals:
#   RESTORE_LOG_FILE   (read)          path of the restore log
#   LAST_RESTORE_LINE  (read, written) number of log lines already echoed
#   LAST_LOG_SIZE      (written)       byte size of the log at this call
#   TIME_NO_NEW_LOG    (written)       reset to 0 whenever new lines are echoed
# Outputs:
#   A header plus every new log line on stdout, each prefixed with a UTC
#   timestamp. Nothing is printed when there are no new lines.
# Returns:
#   0, including when the log file does not exist (nothing is done then).
#######################################
show_restore_progress() {
    [[ -f "$RESTORE_LOG_FILE" ]] || return 0

    local current_lines
    current_lines=$(wc -l < "$RESTORE_LOG_FILE" 2>/dev/null || echo "0")
    # Normalize: drop padding some `wc` implementations add, and fall back to
    # 0 on anything that is not a plain number.
    current_lines=${current_lines//[[:space:]]/}
    [[ "$current_lines" =~ ^[0-9]+$ ]] || current_lines=0

    # The log shrank (truncated or rotated): treat its whole content as unseen
    # instead of staying silent until it outgrows the old line count.
    if (( current_lines < LAST_RESTORE_LINE )); then
        LAST_RESTORE_LINE=0
    fi

    if (( current_lines > LAST_RESTORE_LINE )); then
        local new_lines=$((current_lines - LAST_RESTORE_LINE))
        local first_new=$((LAST_RESTORE_LINE + 1))
        local line
        echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: --- Restore log ($new_lines new lines) ---"
        # Print exactly lines first_new..current_lines. Selecting by absolute
        # line number (rather than "the last N lines") stays correct even if
        # the log keeps growing between the count above and this read.
        while IFS= read -r line; do
            echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] $line"
        done < <(sed -n "${first_new},${current_lines}p;${current_lines}q" "$RESTORE_LOG_FILE")
        LAST_RESTORE_LINE="$current_lines"
        TIME_NO_NEW_LOG=0
    fi

    # Record the current size in bytes; the wait loop compares against it to
    # detect activity.
    LAST_LOG_SIZE=$(stat -c%s "$RESTORE_LOG_FILE" 2>/dev/null || echo "0")
}

# Poll every 2 seconds until the restore-completed marker file appears or
# MAX_WAIT_TIMEOUT seconds have passed, echoing restore-log progress meanwhile.
while true; do
    # First pass only: print what is being waited for.
    if [[ $WAITED -eq 0 ]]; then
        echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: waiting for restore completion..."
        echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint:   RESTORE_LOG_FILE=$RESTORE_LOG_FILE"
        echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint:   RESTORE_COMPLETED_FILE=$RESTORE_COMPLETED_FILE"
    fi

    # Every pass: echo any new restore-log lines.
    show_restore_progress

    # Safety net: after MAX_WAIT_TIMEOUT seconds stop waiting and carry on.
    if [[ $WAITED -ge $MAX_WAIT_TIMEOUT ]]; then
        echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: ⚠ WARNING: Restore timed out after ${WAITED}s ($((WAITED / 60))min), forcing proceed"
        echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: ⚠ The restore process may still be running in the background"
        break
    fi

    # Completion is signalled solely by the marker file.
    if [[ -f "$RESTORE_COMPLETED_FILE" ]]; then
        echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: ✓ Restore completed"
        # Show the tail of the restore log as a summary.
        if [[ -f "$RESTORE_LOG_FILE" ]]; then
            echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: --- Final restore log (last 10 lines) ---"
            tail -n 10 "$RESTORE_LOG_FILE" | while IFS= read -r line; do
                echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] $line"
            done
        fi
        break
    fi

    sleep 2
    WAITED=$((WAITED + 2))

    # Idle accounting: if the log size did not change during the sleep, add
    # the 2 seconds to the idle counter; otherwise reset it.
    if [[ -f "$RESTORE_LOG_FILE" ]]; then
        current_size=$(stat -c%s "$RESTORE_LOG_FILE" 2>/dev/null || echo "0")
        if [[ "$current_size" -eq "$LAST_LOG_SIZE" ]]; then
            TIME_NO_NEW_LOG=$((TIME_NO_NEW_LOG + 2))
            # Emit the idle alert that IDLE_WARN_THRESHOLD exists for: once
            # when the threshold is reached and again after each further
            # threshold-length stretch of silence. The "< 2" test tolerates
            # thresholds that are not a multiple of the 2-second step.
            idle_warn=${IDLE_WARN_THRESHOLD:-120}
            if (( idle_warn > 0 && TIME_NO_NEW_LOG >= idle_warn && TIME_NO_NEW_LOG % idle_warn < 2 )); then
                echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: ⚠ WARNING: restore log has not changed for ${TIME_NO_NEW_LOG}s, restore may be stalled"
            fi
        else
            TIME_NO_NEW_LOG=0
        fi
    fi

    # Every PROGRESS_CHECK_INTERVAL seconds: print one aggregated status line.
    if [[ $((WAITED % PROGRESS_CHECK_INTERVAL)) -eq 0 ]]; then
        elapsed_min=$((WAITED / 60))
        log_size_str=""
        if [[ -f "$RESTORE_LOG_FILE" ]]; then
            file_size=$(stat -c%s "$RESTORE_LOG_FILE" 2>/dev/null || echo "0")
            # Human-readable size with one decimal digit (integer arithmetic).
            if [[ $file_size -ge 1048576 ]]; then
                log_size_str="$((file_size / 1048576)).$(( (file_size % 1048576) * 10 / 1048576 ))MB"
            elif [[ $file_size -ge 1024 ]]; then
                log_size_str="$((file_size / 1024)).$(( (file_size % 1024) * 10 / 1024 ))KB"
            else
                log_size_str="${file_size}B"
            fi
        fi

        echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: still waiting for restore... (${WAITED}s / ${elapsed_min}min, log: ${log_size_str:-N/A})"

        # Past SLOW_WARN_THRESHOLD, add a slow-restore warning once a minute.
        if [[ $WAITED -ge $SLOW_WARN_THRESHOLD ]] && [[ $((WAITED % 60)) -eq 0 ]]; then
            echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: ⚠ WARNING: Restore is taking longer than expected (>${elapsed_min}min). Large backup (>10GB) may require more time."
        fi

        show_restore_progress
    fi
done

echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: ✓ Restore wait completed (${WAITED}s / $((WAITED / 60))min), proceeding with PM2 startup"

# 1.3 Make sure a cron daemon is running
if ! pgrep -x cron >/dev/null 2>&1; then
    echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: starting cron daemon..."
    # Do not let a cron start failure (for example when another cron instance
    # wins a start-up race) abort the whole entrypoint under `set -e`.
    /usr/sbin/cron || echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: WARNING: cron daemon failed to start (status $?)"
fi

# ============================================
# 2. Start PM2 for additional node processes (only when any are configured)
# ============================================
printf '[%s] hf-entrypoint: starting PM2 for others...\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)"
mkdir -p /root/.pm2 /var/log/hf-entrypoint

# The ecosystem file counts as "has applications" when it contains a
# double-quoted "name" key; a missing or unreadable file counts as empty.
if ! grep -qE '"name"\s*:' /app/pm2/ecosystem.config.js 2>/dev/null; then
    printf '[%s] hf-entrypoint: PM2: no applications defined in ecosystem.config.js, skipping...\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)"
    PM2_PID=""
else
    # Run pm2-runtime in the background, appending its output to log files.
    /usr/bin/pm2-runtime /app/pm2/ecosystem.config.js \
        >> /var/log/hf-entrypoint/pm2-stdout.log \
        2>> /var/log/hf-entrypoint/pm2-stderr.log &
    PM2_PID=$!
    printf '[%s] hf-entrypoint: PM2 started (pid=%s)\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)" "$PM2_PID"
fi

# ============================================
# 3. Signal forwarding (pass a stop signal on to the background services)
# ============================================
#######################################
# Forward a termination request to the services started by this script.
# Globals (read): SSH_WATCHDOG_PID (optional), SUPERVISORD_PID, PM2_PID
# Outputs: progress messages on stdout
# Notes:   every kill is best effort; failures are ignored. The function only
#          sends the signals, it does not wait for the processes to exit.
#######################################
signal_handler() {
    local ts_fmt='+%Y-%m-%dT%H:%M:%SZ'

    printf '[%s] hf-entrypoint: received SIGTERM, forwarding to all processes...\n' "$(date -u "$ts_fmt")"

    # SSH watchdog: only when one was started.
    if [[ -n "${SSH_WATCHDOG_PID:-}" ]]; then
        printf '[%s] hf-entrypoint: stopping SSH watchdog (pid=%s)...\n' "$(date -u "$ts_fmt")" "$SSH_WATCHDOG_PID"
        kill -TERM "$SSH_WATCHDOG_PID" 2>/dev/null || true
    fi

    # supervisord, then PM2 (an empty PM2_PID simply makes kill fail quietly).
    kill -TERM "$SUPERVISORD_PID" 2>/dev/null || true
    kill -TERM "$PM2_PID" 2>/dev/null || true

    # sshd: stop every instance if any is running.
    printf '[%s] hf-entrypoint: stopping SSH service...\n' "$(date -u "$ts_fmt")"
    if pgrep -x "sshd" > /dev/null 2>&1; then
        killall sshd 2>/dev/null || true
    fi

    printf '[%s] hf-entrypoint: all services stopped\n' "$(date -u "$ts_fmt")"
}

# Forward a stop signal: first to the background services (signal_handler),
# then to the node server once it has been started.
_forward_stop_signal() {
    signal_handler
    if [[ -n "${NODE_PID:-}" ]]; then
        kill -TERM "$NODE_PID" 2>/dev/null || true
    fi
}

trap _forward_stop_signal TERM INT QUIT

# ============================================
# 5. Run node hf-server.js as the main service of the container
# ============================================
echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: starting node server.js..."
cd /app

# node is started as a child instead of via `exec`: exec replaces this shell
# and resets caught signals to their default action, which would discard the
# trap above, so the forwarding handler could never run. Keeping the shell
# alive lets it forward signals and then exit with node's status.
node hf-server.js &
NODE_PID=$!

# `wait` returns early (status > 128) when a trapped signal arrives, so keep
# waiting until the node process is really gone.
NODE_STATUS=0
while true; do
    NODE_STATUS=0
    wait "$NODE_PID" || NODE_STATUS=$?
    kill -0 "$NODE_PID" 2>/dev/null || break
done

echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: node server exited (status=$NODE_STATUS)"
exit "$NODE_STATUS"