File size: 4,401 Bytes
d7182a3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a27072c
d7182a3
 
 
 
a27072c
d7182a3
 
 
 
a27072c
d7182a3
 
 
 
a27072c
d7182a3
a27072c
d7182a3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a27072c
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
# ──────────────────────────────────────────────────────────────────────────
# Dockerfile – Children's Learning Router Service
# Target:  Hugging Face Spaces  (CPU-only, Docker SDK)
# Port:    7860  (required by HF Spaces)
#
# Model delivery: via `preload_from_hub` in README.md
#   HF Spaces downloads Qwen/Qwen2.5-1.5B-Instruct before container start
#   and places it under /repo-cache (HF_HOME=/repo-cache).
#   No in-build download is needed or possible (build env has no internet).
#
# OOM mitigation: packages are installed in small isolated groups so pip's
#   dependency resolver never spikes RAM.  --no-cache-dir and --no-compile
#   keep peak memory low throughout the build.
# ──────────────────────────────────────────────────────────────────────────

FROM python:3.10-slim

# ── System packages ───────────────────────────────────────────────────────
# Compiler toolchain, curl and git. The apt package index is removed in the
# same layer so it never ends up in the image.
RUN apt-get update \
    && apt-get install --yes --no-install-recommends \
        build-essential \
        curl \
        git \
    && rm -rf /var/lib/apt/lists/*

# ── Working directory ─────────────────────────────────────────────────────
# Created automatically by WORKDIR; later relative paths resolve against it.
WORKDIR /app

# ── Pip hygiene: bring pip and wheel up to date first (small, fast) ──────
RUN pip install \
        --upgrade \
        --no-cache-dir \
        --no-compile \
        pip wheel

# ── 1 of 4 · CPU-only PyTorch (largest wheel – install alone) ────────────
# Resolved against the PyTorch CPU wheel index instead of the default index.
RUN pip install \
        --no-cache-dir \
        --no-compile \
        --index-url https://download.pytorch.org/whl/cpu \
        torch==2.3.1

# ── 2 of 4 · HuggingFace stack (transformers pulls in tokenizers etc.) ───
RUN pip install \
        --no-cache-dir \
        --no-compile \
        accelerate==1.1.1 \
        transformers==4.46.3

# ── 3 of 4 · Serialisation libs ──────────────────────────────────────────
RUN pip install \
        --no-cache-dir \
        --no-compile \
        protobuf==5.28.3 \
        sentencepiece==0.2.0

# ── 4 of 4 · Async HTTP client + Web framework + ASGI server ─────────────
# The uvicorn requirement is quoted: in a shell-form RUN an unquoted
# "[standard]" is a glob character class and could be expanded by the shell
# instead of being passed to pip as the "standard" extra.
RUN pip install --no-cache-dir --no-compile \
    httpx==0.27.2 \
    fastapi==0.115.0 \
    "uvicorn[standard]==0.30.6"

# ── HuggingFace Spaces: run as non-root user (UID 1000) ──────────────────
# The user and its directories are set up BEFORE the application code is
# copied, so editing app.py does not invalidate this layer.
# mkdir -p /repo-cache/hub ensures the cache path exists and is writable
# by hfuser whether HF Spaces pre-populates it or the model downloads fresh.
RUN useradd -m -u 1000 hfuser \
    && mkdir -p /repo-cache/hub \
    && chown -R hfuser:hfuser /app /repo-cache

# ── Application code ──────────────────────────────────────────────────────
# --chown assigns ownership at copy time, so no follow-up chown layer has to
# rewrite the file.
COPY --chown=hfuser:hfuser app.py .

# Everything from here on (including the container process) runs unprivileged.
USER hfuser

# ── Runtime config ────────────────────────────────────────────────────────
# HF_HOME points the Hugging Face libraries at /repo-cache, which is where
# (per this file's header) HF Spaces places preload_from_hub models before
# the container starts. HF_HOME alone is sufficient; TRANSFORMERS_CACHE is
# deprecated since transformers v4 and removed in v5.
# PYTHONUNBUFFERED=1 disables stdout/stderr buffering;
# PYTHONDONTWRITEBYTECODE=1 stops Python from writing .pyc files.
ENV HF_HOME=/repo-cache \
    PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1

# Port the service listens on (7860 is the port this file's header lists as
# required by HF Spaces). EXPOSE is documentation only.
EXPOSE 7860

# ── Start-up command ──────────────────────────────────────────────────────
# Plain uvicorn — no gunicorn shim. Eliminates the gunicorn health-check
# race that was killing the worker mid-response and causing 502s.
# --timeout-keep-alive is raised to 300 s with long CPU inference in mind.
# NOTE(review): uvicorn applies this timeout to idle keep-alive connections,
# not to in-flight requests — confirm it has the intended effect. An earlier
# revision of this comment said "3B model", while the header of this file
# names Qwen2.5-1.5B-Instruct — confirm which model is deployed.
CMD ["uvicorn", "app:app", \
     "--host", "0.0.0.0", "--port", "7860", \
     "--timeout-keep-alive", "300", \
     "--log-level", "info"]