File size: 2,117 Bytes
31491b9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# --- STAGE 1: Build Environment ---
# Compiles llama-cpp-python from source. Only the paths that the runtime stage
# explicitly copies out of this stage end up in the final image.
FROM python:3.11-slim-bookworm AS builder

# Build-time only: stop apt from prompting. Declared as ARG (not ENV) so it is
# visible to the RUN steps below without being persisted in the image config.
ARG DEBIAN_FRONTEND=noninteractive

# Environment for the CPU build of llama-cpp-python:
#   CMAKE_ARGS  - flags forwarded to the llama.cpp CMake build. GGML_NATIVE=OFF
#                 avoids tuning for the build host's CPU; AVX2 is enabled
#                 explicitly instead.
#   FORCE_CMAKE - forces the package to be built with CMake from source.
ENV PYTHONUNBUFFERED=1 \
    CMAKE_ARGS="-DGGML_NATIVE=OFF -DGGML_AVX2=ON -DGGML_FLASH_ATTN=ON" \
    FORCE_CMAKE=1

# Install the build toolchain (packages sorted alphabetically for diffability)
# and drop the apt lists in the same layer.
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        cmake \
        curl \
        git \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Install 'uv' for fast dependency resolution.
# NOTE(review): ':latest' makes the build non-reproducible; pin a specific uv
# release tag (ideally with a digest) once a version has been chosen.
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
WORKDIR /app

# Install llama-cpp-python with server support (compiled for CPU).
# The requirement is quoted so the shell never treats "[server]" as a glob.
# NOTE(review): the package version is unpinned; consider pinning it.
RUN uv pip install --system "llama-cpp-python[server]"

# --- STAGE 2: Runtime Environment ---
# Slim image that carries only the installed Python packages, the model file
# and the server start-up configuration.
FROM python:3.11-slim-bookworm

# Hugging Face Spaces requires UID 1000.
# The account is created here, but the switch to it (USER) happens only after
# every step that needs root (apt-get, chown) has run.
RUN useradd -m -u 1000 user

ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH \
    PYTHONUNBUFFERED=1

WORKDIR $HOME/app

# Copy the compiled libraries and console scripts from the builder stage
COPY --from=builder /usr/local/lib/python3.11/site-packages /usr/local/lib/python3.11/site-packages
COPY --from=builder /usr/local/bin /usr/local/bin

# Download the specific Q6_K_XL model provided.
# Q6_K_XL is ~700MB; fits easily in the 16GB RAM alongside the 32k KV cache.
# This step runs as root because apt-get cannot run as UID 1000. wget is
# installed, used and purged inside the same layer so it never ships, and the
# app directory (including the model) is handed over to the runtime user.
# libgomp1 is kept: the slim base has no compiler runtime, and the llama.cpp
# library built in the builder stage presumably links against OpenMP
# (NOTE(review): confirm with `ldd` on the built libllama/libggml .so files).
RUN apt-get update && apt-get install -y --no-install-recommends \
        libgomp1 \
        wget \
    && wget -O model.gguf "https://huggingface.co/unsloth/LFM2-700M-GGUF/resolve/main/LFM2-700M-UD-Q6_K_XL.gguf?download=true" \
    && chown -R user:user "$HOME/app" \
    && apt-get purge -y wget \
    && apt-get autoremove -y \
    && rm -rf /var/lib/apt/lists/*

# Drop privileges for everything that runs in the container.
USER user

# EXPOSE port 7860 (Hugging Face standard)
EXPOSE 7860

# --- INFERENCE CONFIGURATION ---
# model: model.gguf (relative to WORKDIR, downloaded above)
# n_ctx: 32768 (Requested context window)
# n_threads: 2 (Matches Hugging Face Free Tier 2 vCPU)
# host: 0.0.0.0 (Binds to all interfaces for HF proxy)
# port: 7860 (Same port as EXPOSE above)
# model_alias: lfm2-700m (OpenAI compatible endpoint name)
# ENTRYPOINT holds the server module; CMD holds the default arguments so they
# can be overridden at `docker run` time without redefining the entrypoint.
ENTRYPOINT ["python3", "-m", "llama_cpp.server"]
CMD [ \
    "--model", "model.gguf", \
    "--n_ctx", "32768", \
    "--n_threads", "2", \
    "--host", "0.0.0.0", \
    "--port", "7860", \
    "--model_alias", "lfm2-700m" \
]