# Stage 1: Build llama.cpp
FROM ubuntu:22.04 AS builder

RUN apt-get update && apt-get install -y \
    build-essential \
    cmake \
    git \
    libcurl4-openssl-dev \
    python3-pip

# Clone the upstream llama.cpp source (this builds from the current
# master; pin a tag or commit here for reproducible builds)
WORKDIR /app
RUN git clone https://github.com/ggml-org/llama.cpp.git .

# Build with the web UI disabled for a pure headless API.
# cURL support is enabled for remote model loading if needed.
RUN cmake -B build \
    -DLLAMA_BUILD_WEBUI=OFF \
    -DLLAMA_CURL=ON \
    -DLLAMA_BUILD_EXAMPLES=OFF
RUN cmake --build build --config Release -j $(nproc) --target llama-server

# Stage 2: Runtime
FROM ubuntu:22.04

RUN apt-get update && apt-get install -y libcurl4 python3-pip \
    && rm -rf /var/lib/apt/lists/*
RUN pip install huggingface_hub

WORKDIR /app
COPY --from=builder /app/build/bin/llama-server /app/llama-server

# Download the official Qwen GGUF (non-Unsloth) at build time,
# so the model is baked into the image
RUN python3 -c 'from huggingface_hub import hf_hub_download; \
    hf_hub_download(repo_id="Qwen/Qwen2.5-7B-Instruct-GGUF", \
    filename="qwen2.5-7b-instruct-q4_k_m.gguf", local_dir="/app")'

# HF Spaces route traffic to port 7860
EXPOSE 7860

# Run the headless server with a 32K context and embeddings enabled
CMD ["./llama-server", \
     "-m", "/app/qwen2.5-7b-instruct-q4_k_m.gguf", \
     "--host", "0.0.0.0", \
     "--port", "7860", \
     "-c", "32768", \
     "--embedding"]
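
# ------------------------------------------------------------------
# Usage sketch (not part of the build): llama-server exposes an
# OpenAI-compatible HTTP API, so the deployed Space can be queried
# with plain curl. The URL below is a placeholder -- substitute your
# own <user> and <space> names.
#
#   # Liveness check
#   curl https://<user>-<space>.hf.space/health
#
#   # Chat completion
#   curl https://<user>-<space>.hf.space/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"messages": [{"role": "user", "content": "Hello"}]}'
#
#   # Embeddings (served because CMD passes --embedding above)
#   curl https://<user>-<space>.hf.space/v1/embeddings \
#     -H "Content-Type: application/json" \
#     -d '{"input": "Hello world"}'
# ------------------------------------------------------------------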