# syntax=docker/dockerfile:1

# Serving image: vLLM OpenAI-compatible server + Gradio UI, with the model
# weights baked into the image at build time.
#
# NOTE(review): `latest` is not reproducible (hadolint DL3007). The tag is
# parameterized so CI can pin, e.g. --build-arg VLLM_TAG=v0.6.3; consider
# pinning by digest for production.
ARG VLLM_TAG=latest
FROM vllm/vllm-openai:${VLLM_TAG}

WORKDIR /app

# Python deps — copied separately from the app code so this layer stays
# cached when only the application source changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# ===== Model to bake into the image =====
ARG MODEL_REPO=Qwen/Qwen1.5-4B-Chat-AWQ
ENV MODEL_DIR=/app/models/model
ENV SERVED_MODEL_NAME=${MODEL_REPO}

# Faster downloads via the hf_transfer backend.
# NOTE(review): this only takes effect if the `hf_transfer` package is
# installed — presumably listed in requirements.txt; verify.
ENV HF_HUB_ENABLE_HF_TRANSFER=1

# Download the model into the image.
# SECURITY FIX: the Hugging Face token was previously passed via
# ARG HF_TOKEN / ENV HUGGINGFACE_HUB_TOKEN, which persists the credential in
# `docker history` and in every running container's environment; the
# `--add-to-git-credential` login also wrote it into the image's git
# credential store. It is now supplied through a BuildKit secret mount, which
# never lands in any layer. Build with:
#   docker build --secret id=hf_token,env=HF_TOKEN .
# The secret is optional — public models build with no secret at all.
RUN --mount=type=secret,id=hf_token \
    mkdir -p "${MODEL_DIR}" && \
    if [ -s /run/secrets/hf_token ]; then \
        export HF_TOKEN="$(cat /run/secrets/hf_token)"; \
    fi && \
    hf download "${MODEL_REPO}" --local-dir "${MODEL_DIR}" && \
    ls -lh "${MODEL_DIR}"

# App code + entrypoint (COPY --chmod avoids a separate chmod layer).
COPY app.py .
COPY --chmod=755 entrypoint.sh .

# Documentation only (does not publish): 8000 = vLLM OpenAI API, 7860 = Gradio.
EXPOSE 8000 7860

# Start both: vLLM (background) + Gradio (PID 1).
# NOTE(review): entrypoint.sh should end with `exec` so Gradio receives
# SIGTERM as PID 1 — confirm in the script (outside this file's view).
ENTRYPOINT ["/app/entrypoint.sh"]