# Two-stage image that serves a GGUF model through the llama-cpp-python
# OpenAI-compatible server on port 7860.
#   Stage 1 (builder): compiles llama-cpp-python for CPU.
#   Stage 2 (runtime): slim image with the compiled packages and the model,
#                      running as the non-root user with UID 1000.

# --- STAGE 1: Build Environment ---
FROM python:3.11-slim-bookworm AS builder

# Build-time only: keep apt non-interactive without baking it into any image env.
ARG DEBIAN_FRONTEND=noninteractive

# CMAKE_ARGS / FORCE_CMAKE are read by the llama-cpp-python build so the wheel
# is compiled from source with these CMake flags.
ENV PYTHONUNBUFFERED=1 \
    CMAKE_ARGS="-DGGML_NATIVE=OFF -DGGML_AVX2=ON -DGGML_FLASH_ATTN=ON" \
    FORCE_CMAKE=1

# Toolchain required to compile llama.cpp; the apt lists are removed in the
# same layer so they do not persist.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        build-essential \
        cmake \
        curl \
        git \
    && rm -rf /var/lib/apt/lists/*

# Bring in the 'uv' installer binaries from the upstream image.
# NOTE(review): ':latest' is not reproducible; pin to a released uv tag or digest.
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/

WORKDIR /app

# Install llama-cpp-python with the server extra into the system interpreter.
# The requirement is quoted so the shell never treats "[server]" as a glob.
# NOTE(review): the package version is unpinned; consider pinning for reproducibility.
RUN uv pip install --system "llama-cpp-python[server]"

# --- STAGE 2: Runtime Environment ---
FROM python:3.11-slim-bookworm

# Build-time only, for the apt-get call below.
ARG DEBIAN_FRONTEND=noninteractive

# Hugging Face Spaces requires the container to run as UID 1000.
RUN useradd -m -u 1000 user

ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH \
    PYTHONUNBUFFERED=1

WORKDIR $HOME/app

# Copy the compiled Python packages and console scripts from the builder stage.
COPY --from=builder /usr/local/lib/python3.11/site-packages /usr/local/lib/python3.11/site-packages
COPY --from=builder /usr/local/bin /usr/local/bin

# Everything in this step needs root, so it must run BEFORE switching to the
# unprivileged user (apt-get fails under UID 1000).
#  - libgomp1: OpenMP runtime; the compiled llama.cpp library is expected to
#    link against it and the slim base image does not ship it. Installed
#    explicitly so the autoremove below keeps it.
#  - wget: only needed to fetch the model, purged again in the same layer.
#  - The app directory (including the model) is handed to 'user' in the same
#    layer so the ownership change does not duplicate the model in a new layer.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        ca-certificates \
        libgomp1 \
        wget \
    && wget --progress=dot:giga -O model.gguf \
        "https://huggingface.co/unsloth/LFM2-700M-GGUF/resolve/main/LFM2-700M-UD-Q6_K_XL.gguf?download=true" \
    && apt-get purge -y wget \
    && apt-get autoremove -y \
    && rm -rf /var/lib/apt/lists/* \
    && chown -R user:user /home/user/app

# Drop privileges for runtime.
USER user

# Port the server listens on (Hugging Face Spaces standard).
EXPOSE 7860

# --- INFERENCE CONFIGURATION ---
# --model        model.gguf  (relative to WORKDIR, downloaded above)
# --n_ctx        32768       (requested context window)
# --n_threads    2           (intended to match the Hugging Face free tier's 2 vCPUs)
# --host         0.0.0.0     (bind all interfaces so the HF proxy can reach the server)
# --port         7860        (matches EXPOSE)
# --model_alias  lfm2-700m   (model name exposed on the OpenAI-compatible endpoint)
ENTRYPOINT ["python3", "-m", "llama_cpp.server"]
CMD ["--model", "model.gguf", \
     "--n_ctx", "32768", \
     "--n_threads", "2", \
     "--host", "0.0.0.0", \
     "--port", "7860", \
     "--model_alias", "lfm2-700m"]