# Stage 1: Build llama.cpp
FROM ubuntu:22.04 AS builder

RUN apt-get update && apt-get install -y \
    build-essential \
    cmake \
    git \
    libcurl4-openssl-dev \
    python3-pip

# Clone the upstream llama.cpp source (this builds from the current
# master; pin a tag or commit here for reproducible builds)
WORKDIR /app
RUN git clone https://github.com/ggml-org/llama.cpp.git .

# Build with the web UI disabled for a pure headless API.
# cURL support is enabled for remote model loading if needed.
RUN cmake -B build \
    -DLLAMA_BUILD_WEBUI=OFF \
    -DLLAMA_CURL=ON \
    -DLLAMA_BUILD_EXAMPLES=OFF
RUN cmake --build build --config Release -j $(nproc) --target llama-server

# Stage 2: Runtime
FROM ubuntu:22.04

RUN apt-get update && apt-get install -y libcurl4 python3-pip \
    && rm -rf /var/lib/apt/lists/*
RUN pip install huggingface_hub

WORKDIR /app
COPY --from=builder /app/build/bin/llama-server /app/llama-server

# Download the official Qwen GGUF (non-Unsloth) at build time,
# so the model is baked into the image
RUN python3 -c 'from huggingface_hub import hf_hub_download; \
    hf_hub_download(repo_id="Qwen/Qwen2.5-7B-Instruct-GGUF", \
    filename="qwen2.5-7b-instruct-q4_k_m.gguf", local_dir="/app")'

# HF Spaces route traffic to port 7860
EXPOSE 7860

# Run the headless server with a 32K context and embeddings enabled
CMD ["./llama-server", \
     "-m", "/app/qwen2.5-7b-instruct-q4_k_m.gguf", \
     "--host", "0.0.0.0", \
     "--port", "7860", \
     "-c", "32768", \
     "--embedding"]
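
# ------------------------------------------------------------------
# Usage sketch (not part of the build): llama-server exposes an
# OpenAI-compatible HTTP API, so the deployed Space can be queried
# with plain curl. The URL below is a placeholder -- substitute your
# own <user> and <space> names.
#
#   # Liveness check
#   curl https://<user>-<space>.hf.space/health
#
#   # Chat completion
#   curl https://<user>-<space>.hf.space/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"messages": [{"role": "user", "content": "Hello"}]}'
#
#   # Embeddings (served because CMD passes --embedding above)
#   curl https://<user>-<space>.hf.space/v1/embeddings \
#     -H "Content-Type: application/json" \
#     -d '{"input": "Hello world"}'
# ------------------------------------------------------------------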