#!/bin/bash
#
# Container entrypoint: start a vLLM OpenAI-compatible API server in the
# background, then run the Gradio front-end in the foreground.
#
# Environment:
#   MODEL_DIR - path to the model weights directory
#               (default: /app/models/model)
#
# NOTE(review): the original file had been collapsed onto a single line,
# which turned the whole script into a shebang-line comment (nothing ran).
# The commands below are the same commands, restored to one per line, with
# `set` moved off the shebang so the options survive `bash script.sh`.

set -euo pipefail

# ---- runtime env fixes (stop libgomp and torchinductor errors) ----
export OMP_NUM_THREADS=1
export TORCHINDUCTOR_CACHE_DIR=/tmp/torchinductor_cache
mkdir -p "$TORCHINDUCTOR_CACHE_DIR"

# Start vLLM (OpenAI-compatible server) in the background.
# --enforce-eager / --max-model-len / --gpu-memory-utilization keep memory
# usage bounded on a single GPU.
python3 -m vllm.entrypoints.openai.api_server \
  --model "${MODEL_DIR:-/app/models/model}" \
  --served-model-name "Qwen/Qwen1.5-4B-Chat-AWQ" \
  --host 0.0.0.0 \
  --port 8000 \
  --enforce-eager \
  --max-model-len 2048 \
  --gpu-memory-utilization 0.8 &

# Start the Gradio app in the foreground. `exec` replaces this shell so the
# app becomes PID of record and receives container signals (SIGTERM) directly.
exec python3 /app/app.py