Spaces:
Running
fix: 12-point production audit — bugs, security, perf, reliability
Browse files
start.sh:
- BACKUP_DATASET_NAME/SYNC_INTERVAL are now defaulted before export — previously an empty string
was passed to deerflow-sync.py, causing an invalid repo name (bug)
- Export FRONTEND_PORT and BACKEND_PORT so health-server.js and child
processes receive correct values from environment
- openssl JWT generation now has a Python fallback — the openssl CLI is not
guaranteed in the python:3.12-slim-bookworm image
- Remove dead os.environ["CORS_ORIGINS"] in Python heredoc — Python
subprocess env never propagates back to parent bash (dead code)
- Graceful shutdown uses explicit PID list instead of $(jobs -p) which
is unreliable in bash subshells
- Truncate log files on startup to prevent unbounded disk growth
health-server.js:
- sync "error" status now renders as "off" (red) not "neutral" (grey)
- Dashboard response includes Cache-Control: no-store header
nginx.conf:
- gzip compression enabled for JSON/HTML/JS — large research reports
can be 60-80% smaller over the wire
- server_tokens off — stop leaking nginx version in error responses
Dockerfile:
- Replace jq (unused) with openssl (explicit, needed for JWT generation)
- HEALTHCHECK start-period 60s → 120s — restore + backend + frontend
can legitimately take up to 2 min; 60s caused premature unhealthy marks
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
- Dockerfile +3 -3
- health-server.js +2 -1
- nginx.conf +11 -0
- start.sh +18 -16
|
@@ -47,7 +47,7 @@ ARG NODE_MAJOR=22
|
|
| 47 |
|
| 48 |
# Layer 1: nginx + base tools (rarely changes — stays cached)
|
| 49 |
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 50 |
-
curl ca-certificates gnupg nginx jq \
|
| 51 |
&& rm -rf /var/lib/apt/lists/*
|
| 52 |
|
| 53 |
# Layer 2: Node.js (separate layer — apt network stall doesn't force pip re-run)
|
|
@@ -114,8 +114,8 @@ WORKDIR /app
|
|
| 114 |
|
| 115 |
EXPOSE 7860
|
| 116 |
|
| 117 |
-
#
|
| 118 |
-
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s \
|
| 119 |
CMD curl -fsS http://localhost:7860/health || exit 1
|
| 120 |
|
| 121 |
CMD ["/app/start.sh"]
|
|
|
|
| 47 |
|
| 48 |
# Layer 1: nginx + base tools (rarely changes — stays cached)
|
| 49 |
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 50 |
+
curl ca-certificates gnupg nginx openssl \
|
| 51 |
&& rm -rf /var/lib/apt/lists/*
|
| 52 |
|
| 53 |
# Layer 2: Node.js (separate layer — apt network stall doesn't force pip re-run)
|
|
|
|
| 114 |
|
| 115 |
EXPOSE 7860
|
| 116 |
|
| 117 |
+
# 120s start period — restore + backend + frontend startup can take up to 2 min
|
| 118 |
+
HEALTHCHECK --interval=30s --timeout=10s --start-period=120s \
|
| 119 |
CMD curl -fsS http://localhost:7860/health || exit 1
|
| 120 |
|
| 121 |
CMD ["/app/start.sh"]
|
|
@@ -95,6 +95,7 @@ function renderDashboard({ backendUp, frontendUp, uptimeHuman, sync, keepalive }
|
|
| 95 |
const appOnline = backendUp && frontendUp;
|
| 96 |
const syncStatus = String(sync?.status || "unknown");
|
| 97 |
const syncTone = ["success","restored","synced","configured"].includes(syncStatus) ? "ok"
|
|
|
|
| 98 |
: syncStatus === "disabled" ? "warn" : "neutral";
|
| 99 |
const kaOk = keepalive?.configured === true;
|
| 100 |
const kaTone = kaOk ? "ok" : process.env.CLOUDFLARE_WORKERS_TOKEN ? "warn" : "neutral";
|
|
@@ -255,7 +256,7 @@ const server = http.createServer(async (req, res) => {
|
|
| 255 |
probe(NGINX_HOST, NGINX_PORT, "/health"),
|
| 256 |
tcpProbe(NGINX_HOST, FRONTEND_PORT),
|
| 257 |
]);
|
| 258 |
-
res.writeHead(200, { "Content-Type": "text/html; charset=utf-8" });
|
| 259 |
return res.end(renderDashboard({
|
| 260 |
backendUp, frontendUp,
|
| 261 |
uptimeHuman: formatUptime(Date.now() - startTime),
|
|
|
|
| 95 |
const appOnline = backendUp && frontendUp;
|
| 96 |
const syncStatus = String(sync?.status || "unknown");
|
| 97 |
const syncTone = ["success","restored","synced","configured"].includes(syncStatus) ? "ok"
|
| 98 |
+
: syncStatus === "error" ? "off"
|
| 99 |
: syncStatus === "disabled" ? "warn" : "neutral";
|
| 100 |
const kaOk = keepalive?.configured === true;
|
| 101 |
const kaTone = kaOk ? "ok" : process.env.CLOUDFLARE_WORKERS_TOKEN ? "warn" : "neutral";
|
|
|
|
| 256 |
probe(NGINX_HOST, NGINX_PORT, "/health"),
|
| 257 |
tcpProbe(NGINX_HOST, FRONTEND_PORT),
|
| 258 |
]);
|
| 259 |
+
res.writeHead(200, { "Content-Type": "text/html; charset=utf-8", "Cache-Control": "no-store" });
|
| 260 |
return res.end(renderDashboard({
|
| 261 |
backendUp, frontendUp,
|
| 262 |
uptimeHuman: formatUptime(Date.now() - startTime),
|
|
@@ -21,6 +21,17 @@ http {
|
|
| 21 |
tcp_nopush on;
|
| 22 |
tcp_nodelay on;
|
| 23 |
keepalive_timeout 65;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
|
| 25 |
# ── DeerFlow on HF Spaces ─────────────────────────────────────
|
| 26 |
server {
|
|
|
|
| 21 |
tcp_nopush on;
|
| 22 |
tcp_nodelay on;
|
| 23 |
keepalive_timeout 65;
|
| 24 |
+
server_tokens off;
|
| 25 |
+
|
| 26 |
+
# Gzip — compresses API JSON, HTML, JS (big win for research reports)
|
| 27 |
+
gzip on;
|
| 28 |
+
gzip_vary on;
|
| 29 |
+
gzip_proxied any;
|
| 30 |
+
gzip_comp_level 5;
|
| 31 |
+
gzip_min_length 1024;
|
| 32 |
+
gzip_types text/plain text/css text/xml text/javascript
|
| 33 |
+
application/json application/javascript application/xml
|
| 34 |
+
application/x-javascript image/svg+xml;
|
| 35 |
|
| 36 |
# ── DeerFlow on HF Spaces ─────────────────────────────────────
|
| 37 |
server {
|
|
@@ -17,12 +17,16 @@ SYNC_INTERVAL="${SYNC_INTERVAL:-600}"
|
|
| 17 |
BACKEND_READY_TIMEOUT="${BACKEND_READY_TIMEOUT:-120}"
|
| 18 |
FRONTEND_READY_TIMEOUT="${FRONTEND_READY_TIMEOUT:-120}"
|
| 19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
# Export shell vars so inline Python scripts can read them via os.environ
|
| 21 |
-
export DATA_DIR CONFIG_PATH
|
| 22 |
export DEER_FLOW_HOME="$DATA_DIR"
|
| 23 |
export DEER_FLOW_CONFIG_PATH="$CONFIG_PATH"
|
| 24 |
export DEER_FLOW_SKILLS_PATH="/app/skills"
|
| 25 |
-
export NGINX_PORT PUBLIC_PORT
|
| 26 |
|
| 27 |
echo ""
|
| 28 |
echo " ╔══════════════════════════════════════════╗"
|
|
@@ -69,7 +73,8 @@ if [ -z "${AUTH_JWT_SECRET:-}" ]; then
|
|
| 69 |
AUTH_JWT_SECRET=$(cat "$AUTH_JWT_SECRET_FILE")
|
| 70 |
echo "AUTH_JWT_SECRET loaded from disk."
|
| 71 |
else
|
| 72 |
-
AUTH_JWT_SECRET=$(openssl rand -base64 48 | tr -d '\n')
|
|
|
|
| 73 |
printf '%s' "$AUTH_JWT_SECRET" > "$AUTH_JWT_SECRET_FILE"
|
| 74 |
chmod 600 "$AUTH_JWT_SECRET_FILE"
|
| 75 |
echo "AUTH_JWT_SECRET generated and saved to disk."
|
|
@@ -298,15 +303,6 @@ base["skills"]["path"] = "/app/skills"
|
|
| 298 |
base.setdefault("agents_api", {})
|
| 299 |
base["agents_api"]["enabled"] = True
|
| 300 |
|
| 301 |
-
# CORS: allow HF Space URL + localhost
|
| 302 |
-
space_host = os.environ.get("SPACE_HOST", "")
|
| 303 |
-
cors_origins = ["http://localhost:3000", "http://localhost:7860"]
|
| 304 |
-
if space_host:
|
| 305 |
-
cors_origins.append(f"https://{space_host}")
|
| 306 |
-
|
| 307 |
-
# Set via env (picked up by gateway config loader)
|
| 308 |
-
os.environ["CORS_ORIGINS"] = ",".join(cors_origins)
|
| 309 |
-
|
| 310 |
config_path.parent.mkdir(parents=True, exist_ok=True)
|
| 311 |
config_path.write_text(yaml.safe_dump(base, sort_keys=False, allow_unicode=True))
|
| 312 |
config_path.chmod(0o600)
|
|
@@ -353,15 +349,21 @@ graceful_shutdown() {
|
|
| 353 |
echo "Saving state to HF Dataset..."
|
| 354 |
python3 "$APP_DIR/deerflow-sync.py" sync-once || echo "Warning: shutdown sync failed."
|
| 355 |
fi
|
| 356 |
-
# Stop nginx daemon (nginx -s quit = graceful drain)
|
| 357 |
nginx -s quit 2>/dev/null || true
|
| 358 |
-
#
|
| 359 |
-
|
| 360 |
-
|
|
|
|
|
|
|
| 361 |
exit 0
|
| 362 |
}
|
| 363 |
trap graceful_shutdown SIGTERM SIGINT
|
| 364 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 365 |
# ── Start health-server (public port 7860) ────────────────────────
|
| 366 |
echo "Starting health-server on port $PUBLIC_PORT..."
|
| 367 |
node "$APP_DIR/health-server.js" 2>&1 | tee -a "$DATA_DIR/logs/health-server.log" &
|
|
|
|
| 17 |
BACKEND_READY_TIMEOUT="${BACKEND_READY_TIMEOUT:-120}"
|
| 18 |
FRONTEND_READY_TIMEOUT="${FRONTEND_READY_TIMEOUT:-120}"
|
| 19 |
|
| 20 |
+
# Apply defaults before exporting so downstream tools never see empty strings
|
| 21 |
+
export BACKUP_DATASET_NAME="${BACKUP_DATASET_NAME:-huggingflow-backup}"
|
| 22 |
+
export SYNC_INTERVAL="${SYNC_INTERVAL:-600}"
|
| 23 |
+
|
| 24 |
# Export shell vars so inline Python scripts can read them via os.environ
|
| 25 |
+
export DATA_DIR CONFIG_PATH
|
| 26 |
export DEER_FLOW_HOME="$DATA_DIR"
|
| 27 |
export DEER_FLOW_CONFIG_PATH="$CONFIG_PATH"
|
| 28 |
export DEER_FLOW_SKILLS_PATH="/app/skills"
|
| 29 |
+
export NGINX_PORT PUBLIC_PORT FRONTEND_PORT BACKEND_PORT
|
| 30 |
|
| 31 |
echo ""
|
| 32 |
echo " ╔══════════════════════════════════════════╗"
|
|
|
|
| 73 |
AUTH_JWT_SECRET=$(cat "$AUTH_JWT_SECRET_FILE")
|
| 74 |
echo "AUTH_JWT_SECRET loaded from disk."
|
| 75 |
else
|
| 76 |
+
AUTH_JWT_SECRET=$(openssl rand -base64 48 2>/dev/null | tr -d '\n' || \
|
| 77 |
+
python3 -c "import secrets; print(secrets.token_urlsafe(64))")
|
| 78 |
printf '%s' "$AUTH_JWT_SECRET" > "$AUTH_JWT_SECRET_FILE"
|
| 79 |
chmod 600 "$AUTH_JWT_SECRET_FILE"
|
| 80 |
echo "AUTH_JWT_SECRET generated and saved to disk."
|
|
|
|
| 303 |
base.setdefault("agents_api", {})
|
| 304 |
base["agents_api"]["enabled"] = True
|
| 305 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 306 |
config_path.parent.mkdir(parents=True, exist_ok=True)
|
| 307 |
config_path.write_text(yaml.safe_dump(base, sort_keys=False, allow_unicode=True))
|
| 308 |
config_path.chmod(0o600)
|
|
|
|
| 349 |
echo "Saving state to HF Dataset..."
|
| 350 |
python3 "$APP_DIR/deerflow-sync.py" sync-once || echo "Warning: shutdown sync failed."
|
| 351 |
fi
|
|
|
|
| 352 |
nginx -s quit 2>/dev/null || true
|
| 353 |
+
# Kill tracked PIDs explicitly — more reliable than $(jobs -p) in bash
|
| 354 |
+
for pid in "${FRONTEND_PID:-}" "${HEALTH_PID:-}" "${BACKEND_PID:-}"; do
|
| 355 |
+
[ -n "$pid" ] && kill "$pid" 2>/dev/null || true
|
| 356 |
+
done
|
| 357 |
+
sleep 3
|
| 358 |
exit 0
|
| 359 |
}
|
| 360 |
trap graceful_shutdown SIGTERM SIGINT
|
| 361 |
|
| 362 |
+
# ── Truncate logs on startup (prevent unbounded growth) ──────────
|
| 363 |
+
for _log in health-server backend frontend; do
|
| 364 |
+
: > "$DATA_DIR/logs/$_log.log" 2>/dev/null || true
|
| 365 |
+
done
|
| 366 |
+
|
| 367 |
# ── Start health-server (public port 7860) ────────────────────────
|
| 368 |
echo "Starting health-server on port $PUBLIC_PORT..."
|
| 369 |
node "$APP_DIR/health-server.js" 2>&1 | tee -a "$DATA_DIR/logs/health-server.log" &
|