# Base: the slim variant of the official Python 3.10 image.
FROM python:3.10-slim

# Process environment, one variable per line so future diffs stay small:
#   OMP_NUM_THREADS=2   - thread cap read by OpenMP-based native libraries
#   PORT=7860           - exported port number (matches EXPOSE/CMD in this file)
#   PYTHONUNBUFFERED=1  - Python writes stdout/stderr without buffering, so
#                         log lines reach the container log immediately
ENV OMP_NUM_THREADS=2 \
    PORT=7860 \
    PYTHONUNBUFFERED=1

# Relative paths in later instructions resolve against /code.
WORKDIR /code

# No build-essential/cmake is installed: llama-cpp-python comes from the
# precompiled wheel below, so nothing is compiled, which also saves build minutes.

# llama-cpp-python is installed from a prebuilt release wheel by direct URL,
# which avoids both the C++ compile and an --extra-index-url lookup.
RUN pip install --no-cache-dir \
    "https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.21/llama_cpp_python-0.3.21-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl"

# Web server stack plus the Hugging Face Hub client, one package per line
# (sorted) so additions and removals show up as single-line diffs.
# NOTE(review): these three are unpinned, so a rebuild can pull newer releases;
# consider pinning versions once a known-good set is established.
RUN pip install --no-cache-dir \
    fastapi \
    huggingface_hub \
    uvicorn

# Fetch the GGUF model while the image is being built rather than at startup.
# Storage on HF Spaces is ephemeral, so a download performed by main.py would
# be repeated (several GB) after every restart or sleep/wake cycle. As an
# image layer, the file is already present whenever the container starts.
RUN mkdir -p /code/models \
 && python3 -c "import huggingface_hub as hub; \
    hub.hf_hub_download( \
        repo_id='unsloth/gemma-4-E4B-it-GGUF', \
        filename='gemma-4-E4B-it-Q4_K_M.gguf', \
        local_dir='/code/models')"

# Hugging Face Spaces runs Docker containers as UID 1000, not root. Create a
# matching account and give it the app directory so the process can write
# there. The chown is deliberately non-recursive: /code/models is left
# root-owned, which avoids duplicating the multi-GB model layer
# (assumes the app only reads the model -- confirm against main.py).
RUN useradd --create-home --uid 1000 user \
 && chown user:user /code

# Application source goes in last so code edits do not invalidate the
# dependency and model layers above; files are owned by the runtime user.
COPY --chown=user:user . .

# Drop root for everything that runs from here on, including CMD.
USER user

# Documentation only: the port uvicorn binds to below.
EXPOSE 7860

# Exec form keeps uvicorn as PID 1 so it receives stop signals directly.
# Exec form does no variable expansion, so the port is spelled out literally.
CMD ["python3", "-m", "uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1", "--timeout-keep-alive", "30"]