# Image for a FastAPI/uvicorn service backed by llama-cpp-python, with the
# GGUF model file downloaded into the image at build time.
FROM python:3.10-slim

# PYTHONUNBUFFERED=1 -> Python output is not buffered, so logs show up immediately.
# PORT               -> matches the port passed to uvicorn in CMD below.
# OMP_NUM_THREADS    -> caps the number of OpenMP threads at 2.
ENV PYTHONUNBUFFERED=1 \
    PORT=7860 \
    OMP_NUM_THREADS=2

WORKDIR /code

# No build-essential/cmake needed anymore -- we install a precompiled wheel
# below instead of compiling anything, which also saves real build minutes.

# Direct prebuilt wheel -- skips both the C++ compile AND the extra-index-url
# lookup, so it's the fastest reliable path.
RUN pip install --no-cache-dir \
    "https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.21/llama_cpp_python-0.3.21-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl"

# NOTE(review): these packages are unpinned, so rebuilds can pick up different
# versions over time -- consider pinning once known-good versions are confirmed.
RUN pip install --no-cache-dir fastapi uvicorn huggingface_hub

# Bake the model into the image at BUILD time, not runtime.
# HF Spaces' disk is ephemeral -- if you download in main.py on startup,
# you re-pull several GB every single time the Space restarts/sleeps+wakes.
# Doing it here means it's part of the image layer and persists across restarts.
RUN mkdir -p /code/models && \
    python3 -c "from huggingface_hub import hf_hub_download; \
    hf_hub_download(repo_id='unsloth/gemma-4-E4B-it-GGUF', \
    filename='gemma-4-E4B-it-Q4_K_M.gguf', local_dir='/code/models')"

# Application source goes in last so code edits don't invalidate the
# dependency and model layers above.
COPY . .

# NOTE(review): there is no USER instruction, so the container runs as the base
# image's default user -- confirm whether the target platform expects non-root.

# Documentation only; the actual bind happens via --port in CMD.
EXPOSE 7860

# Exec form: uvicorn is started directly (no shell wrapper). A single worker
# is used, so only one server process is launched.
CMD ["python3", "-m", "uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1", "--timeout-keep-alive", "30"]