#!/usr/bin/env bash
# Container entrypoint: start a vLLM OpenAI-compatible server in the
# background, wait until it accepts connections, then exec the Gradio
# front-end as the foreground (PID-1-friendly) process.
#
# Env:
#   MODEL_DIR  - path to the model weights (default: /app/models/model)
set -euo pipefail

# ---- runtime env fixes (stop libgomp and torchinductor errors) ----
export OMP_NUM_THREADS=1
export TORCHINDUCTOR_CACHE_DIR=/tmp/torchinductor_cache
mkdir -p "$TORCHINDUCTOR_CACHE_DIR"

# Start vLLM (OpenAI server) in the background. Capture its PID: under
# `set -e` a failing background job does NOT abort the script, so we must
# check on it explicitly below.
python3 -m vllm.entrypoints.openai.api_server \
  --model "${MODEL_DIR:-/app/models/model}" \
  --served-model-name "Qwen/Qwen1.5-4B-Chat-AWQ" \
  --host 0.0.0.0 \
  --port 8000 \
  --enforce-eager \
  --max-model-len 2048 \
  --gpu-memory-utilization 0.8 &
vllm_pid=$!

# Wait (up to 120 s) for the server to accept TCP connections so the UI
# does not come up against a backend that isn't listening yet. Uses bash's
# built-in /dev/tcp redirection — no curl/nc dependency. The probe runs in
# a subshell so the fd is closed automatically on exit.
ready=0
for (( i = 0; i < 120; i++ )); do
  if ! kill -0 "$vllm_pid" 2>/dev/null; then
    echo "error: vLLM server exited during startup" >&2
    exit 1
  fi
  if (exec 3<>/dev/tcp/127.0.0.1/8000) 2>/dev/null; then
    ready=1
    break
  fi
  sleep 1
done
if (( !ready )); then
  # Best effort: keep the original behavior of launching the UI anyway,
  # but make the problem visible in the logs.
  echo "warning: vLLM not reachable on :8000 after 120s; starting UI anyway" >&2
fi

# Replace the shell with the Gradio app so it receives signals directly;
# the vLLM process keeps running as a sibling.
exec python3 /app/app.py