File size: 4,029 Bytes
c994fd2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
# Main startup logic (run as: bash /app/scripts/entrypoint_body.sh).
# entrypoint.sh is generated in the Dockerfile so HF can exec it without CRLF/BOM issues.
# Strict mode: stop on command failure, on use of an unset variable, and on a
# failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail
cd /app

# Same-container vLLM: PyTorch may call getpass.getuser() before USER is set in
# some runtimes, so give USER/LOGNAME fallbacks. Values already present in the
# environment are kept as they are.
: "${USER:=huggingface}"
: "${LOGNAME:=$USER}"
# Cache directories fall back to locations under /tmp when not preconfigured.
: "${TORCHINDUCTOR_CACHE_DIR:=/tmp/torch_inductor_cache}"
: "${TRITON_CACHE_DIR:=/tmp/triton_cache}"
export USER LOGNAME TORCHINDUCTOR_CACHE_DIR TRITON_CACHE_DIR

# Put the vendored rllm checkout ahead of any PYTHONPATH inherited from the
# environment.
# NOTE(review): when PYTHONPATH is unset or empty the value ends in ':';
# presumably Python reads that empty entry as the current directory - confirm
# this is intended.
PYTHONPATH="/app/vendor/rllm:${PYTHONPATH:-}"
export PYTHONPATH

# Optional: Space secrets copied to /app/.env.gen_image. While the file is being
# sourced, allexport is switched on so that every variable it assigns is
# exported; the option is switched off again right afterwards.
if [[ -f /app/.env.gen_image ]]; then
  set -o allexport
  # shellcheck source=/dev/null
  . /app/.env.gen_image
  set +o allexport
fi

# Startup hints for the case where no in-container GenSearcher vLLM is requested
# (START_VLLM_GENSEARCHER is anything other than 1):
#   - OPENAI_BASE_URL unset/empty    -> explain how to enable the local vLLM;
#   - OPENAI_BASE_URL is a loopback  -> warn that nothing may be listening there.
# Nothing is changed here; the block only prints messages.
if [[ "${START_VLLM_GENSEARCHER:-0}" != "1" ]]; then
  if [[ -z "${OPENAI_BASE_URL:-}" ]]; then
    printf '%s\n' \
      "[entrypoint] OPENAI_BASE_URL is unset. For GenSearcher **inside this Space only**, set Space variable" \
      "[entrypoint] START_VLLM_GENSEARCHER=1 (entrypoint will start vLLM here and set OPENAI_BASE_URL to loopback)."
  elif [[ "${OPENAI_BASE_URL}" == *127.0.0.1* || "${OPENAI_BASE_URL}" == *localhost* ]]; then
    printf '%s\n' \
      "[entrypoint] WARNING: OPENAI_BASE_URL points to loopback but START_VLLM_GENSEARCHER is not 1." \
      "[entrypoint]          The GenSearcher agent will get 'Connection error' unless a server listens here," \
      "[entrypoint]          or you set OPENAI_BASE_URL to an external OpenAI-compatible URL (ending in /v1)."
  fi
fi

#######################################
# Poll an HTTP endpoint until curl reports success, or abort the script.
# Globals:
#   WAIT_HTTP_PROBE_TIMEOUT (read) - per-probe time limit in seconds, default 10
# Arguments:
#   $1 - URL to probe
#   $2 - human-readable service name used in the log lines
#   $3 - maximum number of probes before giving up (default: 90)
# Outputs:
#   Progress messages on stdout.
# Returns:
#   0 as soon as a probe succeeds. When the probe budget is exhausted the
#   whole script is terminated with exit status 1.
#######################################
wait_http() {
  local url=$1
  local name=$2
  local max_attempts=${3:-90}
  local i=0
  echo "[entrypoint] Waiting for ${name} (${url})..."
  # --max-time bounds each probe. Without it, a server that accepts the
  # connection but never answers would block curl indefinitely and the
  # max_attempts limit below would never be reached.
  until curl -sf --max-time "${WAIT_HTTP_PROBE_TIMEOUT:-10}" "$url" >/dev/null 2>&1; do
    i=$((i + 1))
    if [[ $i -ge $max_attempts ]]; then
      echo "[entrypoint] Timeout waiting for ${name}"
      exit 1
    fi
    # Fixed 2-second pause between consecutive probes.
    sleep 2
  done
  echo "[entrypoint] ${name} is up."
}

# Defaults: only the FireRed API and the Gradio app run inside this container. Point
# OPENAI_BASE_URL / BROWSE_SUMMARY_BASE_URL at your vLLM (or other OpenAI-compatible)
# endpoints via Space secrets.

# --- Optional local vLLM: GenSearcher-8B (OpenAI-compatible) ---
# Enabled with START_VLLM_GENSEARCHER=1. Starts `vllm serve` in the background
# on port 8002, blocks until /v1/models answers, then points OPENAI_BASE_URL at
# that server unless the variable already has a value.
if [[ "${START_VLLM_GENSEARCHER:-0}" == "1" ]]; then
  gensearcher_serve_args=(
    "${GENSEARCHER_MODEL_ID:-GenSearcher/Gen-Searcher-8B}"
    --host 0.0.0.0
    --port 8002
    --tensor-parallel-size "${GENSEARCHER_TP:-1}"
    --gpu-memory-utilization "${VLLM_GPU_MEMORY_UTIL:-0.85}"
    --served-model-name "${GEN_EVAL_MODEL:-Gen-Searcher-8B}"
    --max-model-len "${GENSEARCHER_MAX_MODEL_LEN:-65536}"
    --no-enable-prefix-caching
  )
  # The GPU selection applies to this one server process only.
  CUDA_VISIBLE_DEVICES="${GENSEARCHER_CUDA_VISIBLE_DEVICES:-0}" \
    vllm serve "${gensearcher_serve_args[@]}" &
  wait_http "http://127.0.0.1:8002/v1/models" "GenSearcher vLLM"
  : "${OPENAI_BASE_URL:=http://127.0.0.1:8002/v1}"
  export OPENAI_BASE_URL
fi

# --- Optional local vLLM: browse summarization (Qwen3-VL) ---
# Enabled with START_VLLM_BROWSE=1. Exports BROWSE_GENERATE_ENGINE=vllm, starts
# `vllm serve` in the background on port 8003, blocks until /v1/models answers,
# then points BROWSE_SUMMARY_BASE_URL at that server unless it is already set.
if [[ "${START_VLLM_BROWSE:-0}" == "1" ]]; then
  BROWSE_GENERATE_ENGINE=vllm
  export BROWSE_GENERATE_ENGINE
  browse_serve_args=(
    "${BROWSE_MODEL_ID:-Qwen/Qwen3-VL-30B-A3B-Instruct}"
    --host 0.0.0.0
    --port 8003
    --tensor-parallel-size "${BROWSE_TP:-1}"
    --gpu-memory-utilization "${VLLM_GPU_MEMORY_UTIL:-0.85}"
    --served-model-name "${BROWSE_SUMMARY_MODEL:-Qwen3-VL-30B-A3B-Instruct}"
    --max-model-len "${BROWSE_MAX_MODEL_LEN:-65536}"
    --mm-processor-cache-gb 0
    --no-enable-prefix-caching
  )
  # The GPU selection applies to this one server process only.
  CUDA_VISIBLE_DEVICES="${BROWSE_CUDA_VISIBLE_DEVICES:-1}" \
    vllm serve "${browse_serve_args[@]}" &
  wait_http "http://127.0.0.1:8003/v1/models" "Browse-summary vLLM"
  : "${BROWSE_SUMMARY_BASE_URL:=http://127.0.0.1:8003/v1}"
  export BROWSE_SUMMARY_BASE_URL
fi

# --- FireRed adapter (GenSearcher /generate contract) ---
# On by default; any START_FIRERED_API value other than 1 skips the local
# server and only prints a hint. Otherwise uvicorn serves
# services.firered_generate:app in the background on port 8765, the script
# blocks until /health answers (up to 120 probes), and QWEN_EDIT_APP_URL is
# pointed at the local server unless it already has a value.
if [[ "${START_FIRERED_API:-1}" != "1" ]]; then
  echo "[entrypoint] START_FIRERED_API=0 — use external QWEN_EDIT_APP_URL for generation."
else
  firered_uvicorn_args=(services.firered_generate:app --host 0.0.0.0 --port 8765)
  CUDA_VISIBLE_DEVICES="${FIRERED_CUDA_VISIBLE_DEVICES:-0}" \
    python -m uvicorn "${firered_uvicorn_args[@]}" &
  wait_http "http://127.0.0.1:8765/health" "FireRed API" 120
  : "${QWEN_EDIT_APP_URL:=http://127.0.0.1:8765}"
  export QWEN_EDIT_APP_URL
fi

# Hand over to the main application: exec replaces this shell with the python
# process, so nothing placed after this line would ever run. Servers started in
# the background above keep running.
exec python app.py