Fix vLLM/PyTorch getpwuid crash: USER, TORCHINDUCTOR_CACHE_DIR on HF Spaces
Browse files- scripts/entrypoint.sh +33 -79
scripts/entrypoint.sh
CHANGED
|
@@ -1,88 +1,42 @@
|
|
| 1 |
-
#
|
| 2 |
-
#
|
| 3 |
-
set -euo pipefail
|
| 4 |
-
cd /app
|
| 5 |
|
| 6 |
-
|
| 7 |
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
set +a
|
| 14 |
-
fi
|
| 15 |
|
| 16 |
-
|
| 17 |
-
case "${OPENAI_BASE_URL:-}" in
|
| 18 |
-
*127.0.0.1*|*localhost*)
|
| 19 |
-
echo "[entrypoint] WARNING: OPENAI_BASE_URL points to loopback but START_VLLM_GENSEARCHER is not 1."
|
| 20 |
-
echo "[entrypoint] The GenSearcher agent will get 'Connection error' unless a server listens here,"
|
| 21 |
-
echo "[entrypoint] or you set OPENAI_BASE_URL to an external OpenAI-compatible URL (ending in /v1)."
|
| 22 |
-
;;
|
| 23 |
-
esac
|
| 24 |
-
fi
|
| 25 |
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
echo "[entrypoint] Waiting for ${name} (${url})..."
|
| 32 |
-
until curl -sf "$url" >/dev/null 2>&1; do
|
| 33 |
-
i=$((i + 1))
|
| 34 |
-
if [[ $i -ge $max_attempts ]]; then
|
| 35 |
-
echo "[entrypoint] Timeout waiting for ${name}"
|
| 36 |
-
exit 1
|
| 37 |
-
fi
|
| 38 |
-
sleep 2
|
| 39 |
-
done
|
| 40 |
-
echo "[entrypoint] ${name} is up."
|
| 41 |
-
}
|
| 42 |
|
| 43 |
-
|
| 44 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
vllm serve "${GENSEARCHER_MODEL_ID:-GenSearcher/Gen-Searcher-8B}" \
|
| 50 |
-
--host 0.0.0.0 \
|
| 51 |
-
--port 8002 \
|
| 52 |
-
--tensor-parallel-size "${GENSEARCHER_TP:-1}" \
|
| 53 |
-
--gpu-memory-utilization "${VLLM_GPU_MEMORY_UTIL:-0.85}" \
|
| 54 |
-
--served-model-name "${GEN_EVAL_MODEL:-Gen-Searcher-8B}" \
|
| 55 |
-
--max-model-len "${GENSEARCHER_MAX_MODEL_LEN:-65536}" \
|
| 56 |
-
--no-enable-prefix-caching &
|
| 57 |
-
wait_http "http://127.0.0.1:8002/v1/models" "GenSearcher vLLM"
|
| 58 |
-
export OPENAI_BASE_URL="${OPENAI_BASE_URL:-http://127.0.0.1:8002/v1}"
|
| 59 |
-
fi
|
| 60 |
|
| 61 |
-
#
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
CUDA_VISIBLE_DEVICES="${BROWSE_CUDA_VISIBLE_DEVICES:-1}" \
|
| 65 |
-
vllm serve "${BROWSE_MODEL_ID:-Qwen/Qwen3-VL-30B-A3B-Instruct}" \
|
| 66 |
-
--host 0.0.0.0 \
|
| 67 |
-
--port 8003 \
|
| 68 |
-
--tensor-parallel-size "${BROWSE_TP:-1}" \
|
| 69 |
-
--gpu-memory-utilization "${VLLM_GPU_MEMORY_UTIL:-0.85}" \
|
| 70 |
-
--served-model-name "${BROWSE_SUMMARY_MODEL:-Qwen3-VL-30B-A3B-Instruct}" \
|
| 71 |
-
--max-model-len "${BROWSE_MAX_MODEL_LEN:-65536}" \
|
| 72 |
-
--mm-processor-cache-gb 0 \
|
| 73 |
-
--no-enable-prefix-caching &
|
| 74 |
-
wait_http "http://127.0.0.1:8003/v1/models" "Browse-summary vLLM"
|
| 75 |
-
export BROWSE_SUMMARY_BASE_URL="${BROWSE_SUMMARY_BASE_URL:-http://127.0.0.1:8003/v1}"
|
| 76 |
-
fi
|
| 77 |
|
| 78 |
-
#
|
| 79 |
-
|
| 80 |
-
CUDA_VISIBLE_DEVICES="${FIRERED_CUDA_VISIBLE_DEVICES:-0}" \
|
| 81 |
-
python -m uvicorn services.firered_generate:app --host 0.0.0.0 --port 8765 &
|
| 82 |
-
wait_http "http://127.0.0.1:8765/health" "FireRed API" 120
|
| 83 |
-
export QWEN_EDIT_APP_URL="${QWEN_EDIT_APP_URL:-http://127.0.0.1:8765}"
|
| 84 |
-
else
|
| 85 |
-
echo "[entrypoint] START_FIRERED_API=0 — use external QWEN_EDIT_APP_URL for generation."
|
| 86 |
-
fi
|
| 87 |
|
| 88 |
-
|
|
|
|
|
|
|
|
|
| 1 |
+
# Hugging Face Space (Docker) — GenSearcher + FireRed
|
| 2 |
+
# Requires a GPU. For multi-GPU full-local mode, set START_VLLM_*=1 and the CUDA device env vars described in the README.
|
|
|
|
|
|
|
| 3 |
|
| 4 |
+
FROM pytorch/pytorch:2.5.1-cuda12.4-cudnn9-runtime
|
| 5 |
|
| 6 |
+
ENV DEBIAN_FRONTEND=noninteractive
|
| 7 |
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 8 |
+
curl \
|
| 9 |
+
git \
|
| 10 |
+
&& rm -rf /var/lib/apt/lists/*
|
|
|
|
|
|
|
| 11 |
|
| 12 |
+
WORKDIR /app
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
+
COPY vendor/rllm /app/vendor/rllm
|
| 15 |
+
COPY requirements.txt /app/requirements.txt
|
| 16 |
+
COPY app.py space_gen.py space_health.py /app/
|
| 17 |
+
COPY services /app/services
|
| 18 |
+
COPY scripts /app/scripts
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
|
| 20 |
+
ENV PYTHONPATH=/app/vendor/rllm
|
| 21 |
+
ENV GRADIO_SERVER_PORT=7860
|
| 22 |
+
# HF Spaces / minimal images often have uid 1000 with no /etc/passwd entry; PyTorch Inductor calls
|
| 23 |
+
# getpass.getuser(), which falls back to pwd.getpwuid() and crashes with KeyError. Setting USER/LOGNAME short-circuits getuser(); explicit cache dirs avoid $HOME issues.
|
| 24 |
+
ENV USER=huggingface
|
| 25 |
+
ENV LOGNAME=huggingface
|
| 26 |
+
ENV TORCHINDUCTOR_CACHE_DIR=/tmp/torch_inductor_cache
|
| 27 |
+
ENV TRITON_CACHE_DIR=/tmp/triton_cache
|
| 28 |
|
| 29 |
+
RUN pip install --no-cache-dir --upgrade pip setuptools wheel \
|
| 30 |
+
&& pip install --no-cache-dir -e /app/vendor/rllm \
|
| 31 |
+
&& pip install --no-cache-dir -r /app/requirements.txt
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
|
| 33 |
+
# Optional: local vLLM inside the image (large). Disable with --build-arg INSTALL_VLLM=0 if you only use external APIs.
|
| 34 |
+
ARG INSTALL_VLLM=1
|
| 35 |
+
RUN if [ "$INSTALL_VLLM" = "1" ]; then pip install --no-cache-dir "vllm>=0.6.3"; fi
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
|
| 37 |
+
# Strip Windows CRLF if present (avoids: /usr/bin/env: 'bash\r': No such file or directory)
|
| 38 |
+
RUN sed -i 's/\r$//' /app/scripts/entrypoint.sh && chmod +x /app/scripts/entrypoint.sh
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
|
| 40 |
+
EXPOSE 7860
|
| 41 |
+
|
| 42 |
+
CMD ["/app/scripts/entrypoint.sh"]
|