FROM debian:bookworm-slim

# 1. Install dependencies
# pkg-config is included to fix the "Could NOT find PkgConfig" CMake error
RUN apt-get update && apt-get install -y \
    build-essential \
    cmake \
    git \
    pkg-config \
    libcurl4-openssl-dev \
    libssl-dev \
    libopenblas-dev \
    && rm -rf /var/lib/apt/lists/*

# 2. Set up the Hugging Face Spaces user
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH
WORKDIR $HOME/app

# 3. Clone and build ONLY the llama-server target
# -j 2 keeps peak memory low and prevents the build hanging around 54% from RAM exhaustion
# Note: with -DGGML_NATIVE=ON, ggml auto-detects CPU features on the build host;
# the explicit AVX-512 flags only matter if native detection is turned off
RUN git clone --depth 1 https://github.com/ggerganov/llama.cpp.git . && \
    cmake -B build \
        -DCMAKE_BUILD_TYPE=Release \
        -DGGML_NATIVE=ON \
        -DGGML_AVX512=ON \
        -DGGML_AVX512_VNNI=ON \
        -DGGML_OPENMP=ON \
        -DGGML_BLAS=ON \
        -DGGML_BLAS_VENDOR=OpenBLAS \
        -DLLAMA_CURL=ON && \
    cmake --build build --config Release --target llama-server -j 2

# 4. Final server configuration
# -t 8:            matches the 8 physical cores (avoids hyperthreading slowdowns)
# -hf:             pulls the GGUF model directly from Hugging Face
# --host 0.0.0.0:  required for Hugging Face Spaces networking
# --flash-attn on: enables the flash-attention kernels
ENTRYPOINT ["./build/bin/llama-server"]
CMD [ \
    "-hf", "unsloth/Qwen3.5-4B-GGUF:Q8_0", \
    "--host", "0.0.0.0", \
    "--port", "7860", \
    "-t", "8", \
    "-c", "4096", \
    "--flash-attn", "on", \
    "--no-mmap" \
]
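
Because -DGGML_NATIVE=ON compiles for the build host's CPU, the resulting binary will only run on machines with the same instruction set. A quick sketch for confirming that the host actually exposes the AVX-512 features the build enables; this assumes a Linux host where /proc/cpuinfo is available:

# Report whether the host CPU advertises the AVX-512 features used by the build.
# Linux-only: reads the kernel's feature flags from /proc/cpuinfo.
from pathlib import Path

cpuinfo = Path("/proc/cpuinfo").read_text()
for feature in ("avx512f", "avx512_vnni"):
    status = "present" if feature in cpuinfo else "MISSING"
    print(f"{feature}: {status}")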
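
To smoke-test the server, a minimal client sketch using only the Python standard library. It assumes the container is reachable on localhost:7860 (e.g. started with "docker run -p 7860:7860 <image>"); llama-server exposes a /health endpoint and an OpenAI-compatible /v1/chat/completions endpoint:

# Minimal smoke test for the running llama-server container.
# Assumption: the container's port 7860 is published on localhost.
import json
import urllib.request

BASE = "http://localhost:7860"

# /health returns 200 once the model has finished loading;
# urlopen raises HTTPError (503) while it is still loading, so retry in that case.
with urllib.request.urlopen(f"{BASE}/health") as resp:
    print("health:", resp.status)

# Send a chat completion request to the OpenAI-compatible endpoint.
# No "model" field is needed since the server loads a single model.
payload = {
    "messages": [{"role": "user", "content": "Say hello in one sentence."}],
    "max_tokens": 64,
}
req = urllib.request.Request(
    f"{BASE}/v1/chat/completions",
    data=json.dumps(payload).encode(),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    body = json.load(resp)
    print(body["choices"][0]["message"]["content"])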