tusarway committed on
Commit
b72e18c
·
verified ·
1 Parent(s): 8dcb3c9
Files changed (1) hide show
  1. Dockerfile +14 -10
Dockerfile CHANGED
@@ -2,24 +2,29 @@ FROM python:3.11-slim
2
 
3
  WORKDIR /app
4
 
5
- # Only runtime libs needed — no build toolchain since we use pre-built wheels
 
6
  RUN apt-get update && apt-get install -y \
7
- libopenblas0 \
 
 
8
  curl \
9
  wget \
10
  && rm -rf /var/lib/apt/lists/*
11
 
12
- # Install all deps except llama-cpp-python first
13
  COPY requirements.txt .
14
  RUN pip install --no-cache-dir -r requirements.txt
15
 
16
- # ── KEY FIX: install pre-built CPU wheel (seconds, not hours) ─────────────────
17
- # abetlen's CPU wheel index has pre-compiled binaries — no C++ compilation needed
18
- RUN pip install --no-cache-dir \
19
- "llama-cpp-python==0.3.8" \
20
- --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
 
 
21
 
22
- # Download model at build time so cold starts are fast (~60s instead of 10min)
23
  RUN mkdir -p /app/models && \
24
  wget --progress=dot:giga \
25
  "https://huggingface.co/unsloth/gemma-4-26B-A4B-it-GGUF/resolve/main/gemma-4-26B-A4B-it-UD-IQ3_XXS.gguf" \
@@ -27,7 +32,6 @@ RUN mkdir -p /app/models && \
27
 
28
  COPY app.py .
29
 
30
- # HuggingFace Spaces requires port 7860
31
  EXPOSE 7860
32
 
33
  ENV SPACE_URL=""
 
2
 
3
  WORKDIR /app
4
 
5
+ # Build tools required — no pre-built CPU wheels exist for llama-cpp-python >= 0.3.x
6
+ # (abetlen's /whl/cpu index only has older versions)
7
  RUN apt-get update && apt-get install -y \
8
+ build-essential \
9
+ cmake \
10
+ libopenblas-dev \
11
  curl \
12
  wget \
13
  && rm -rf /var/lib/apt/lists/*
14
 
15
+ # Install all non-compiled deps first (fast, cached separately)
16
  COPY requirements.txt .
17
  RUN pip install --no-cache-dir -r requirements.txt
18
 
19
+ # ── Compile llama-cpp-python from source ──────────────────────────────────────
20
+ # CMAKE_BUILD_PARALLEL_LEVEL=4 → use all available build cores (~4x faster)
21
+ # GGML_BLAS=ON → link OpenBLAS for faster matrix ops on CPU
22
+ # This layer is Docker-cached: only re-runs if requirements change
23
+ RUN CMAKE_BUILD_PARALLEL_LEVEL=4 \
24
+ CMAKE_ARGS="-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DCMAKE_BUILD_TYPE=Release" \
25
+ pip install --no-cache-dir "llama-cpp-python==0.3.8"
26
 
27
+ # Download model at build time (cached separately from compilation)
28
  RUN mkdir -p /app/models && \
29
  wget --progress=dot:giga \
30
  "https://huggingface.co/unsloth/gemma-4-26B-A4B-it-GGUF/resolve/main/gemma-4-26B-A4B-it-UD-IQ3_XXS.gguf" \
 
32
 
33
  COPY app.py .
34
 
 
35
  EXPOSE 7860
36
 
37
  ENV SPACE_URL=""