Spaces:
Build error
Build error
Commit ·
9d2777a
1
Parent(s): 3274ec4
Multi-stage Docker build: Stage 1 compiles llama-cpp-python into a wheel once, and Stage 2 installs the pre-built wheels instead of recompiling, so the build no longer times out. The first build takes 8-12 minutes; after that it is cached.
Browse files
- Dockerfile +8 -10
- model_manager.py +12 -18
Dockerfile
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
-
# Stage 1: Compile llama-cpp-python to wheel (
|
| 2 |
FROM python:3.11-slim AS builder
|
| 3 |
|
| 4 |
WORKDIR /tmp/build
|
| 5 |
|
| 6 |
-
# Install build tools
|
| 7 |
RUN apt-get update && apt-get install -y \
|
| 8 |
build-essential \
|
| 9 |
cmake \
|
|
@@ -12,29 +12,27 @@ RUN apt-get update && apt-get install -y \
|
|
| 12 |
# Copy requirements
|
| 13 |
COPY requirements.txt .
|
| 14 |
|
| 15 |
-
# Build
|
| 16 |
RUN pip wheel --no-cache-dir -r requirements.txt -w /tmp/wheels
|
| 17 |
|
| 18 |
-
# Stage 2: Production
|
| 19 |
FROM python:3.11-slim
|
| 20 |
|
| 21 |
WORKDIR /app
|
| 22 |
|
| 23 |
-
# Install only runtime dependencies (
|
| 24 |
RUN apt-get update && apt-get install -y \
|
| 25 |
tesseract-ocr \
|
| 26 |
libtesseract-dev \
|
| 27 |
&& rm -rf /var/lib/apt/lists/*
|
| 28 |
|
| 29 |
-
# Copy pre-built wheels from Stage 1 (
|
| 30 |
COPY --from=builder /tmp/wheels /tmp/wheels
|
| 31 |
|
| 32 |
-
# Install from pre-built wheels (
|
| 33 |
RUN pip install --no-cache-dir --no-index --find-links /tmp/wheels -r requirements.txt
|
| 34 |
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
# Copy application
|
| 38 |
COPY . .
|
| 39 |
|
| 40 |
# Create models directory
|
|
|
|
| 1 |
+
# Stage 1: Compile llama-cpp-python to wheel (one-time build)
|
| 2 |
FROM python:3.11-slim AS builder
|
| 3 |
|
| 4 |
WORKDIR /tmp/build
|
| 5 |
|
| 6 |
+
# Install build tools only in Stage 1
|
| 7 |
RUN apt-get update && apt-get install -y \
|
| 8 |
build-essential \
|
| 9 |
cmake \
|
|
|
|
| 12 |
# Copy requirements
|
| 13 |
COPY requirements.txt .
|
| 14 |
|
| 15 |
+
# Build ALL wheels (llama-cpp-python gets compiled here)
|
| 16 |
RUN pip wheel --no-cache-dir -r requirements.txt -w /tmp/wheels
|
| 17 |
|
| 18 |
+
# Stage 2: Production (just installs pre-built wheels from Stage 1)
|
| 19 |
FROM python:3.11-slim
|
| 20 |
|
| 21 |
WORKDIR /app
|
| 22 |
|
| 23 |
+
# Install only runtime dependencies (NO build tools!)
|
| 24 |
RUN apt-get update && apt-get install -y \
|
| 25 |
tesseract-ocr \
|
| 26 |
libtesseract-dev \
|
| 27 |
&& rm -rf /var/lib/apt/lists/*
|
| 28 |
|
| 29 |
+
# Copy pre-built wheels from Stage 1 (compilation already done!)
|
| 30 |
COPY --from=builder /tmp/wheels /tmp/wheels
|
| 31 |
|
| 32 |
+
# Install from pre-built wheels (INSTANT - no compilation!)
|
| 33 |
RUN pip install --no-cache-dir --no-index --find-links /tmp/wheels -r requirements.txt
|
| 34 |
|
| 35 |
+
# Copy application code
|
|
|
|
|
|
|
| 36 |
COPY . .
|
| 37 |
|
| 38 |
# Create models directory
|
model_manager.py
CHANGED
|
@@ -6,8 +6,13 @@ from typing import Generator
|
|
| 6 |
class ModelManager:
|
| 7 |
def __init__(self):
|
| 8 |
self.models = {}
|
| 9 |
-
# Templates for different model architectures
|
| 10 |
self.model_configs = {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
"tinyllama": {
|
| 12 |
"repo": "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
|
| 13 |
"file": "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
|
|
@@ -19,17 +24,10 @@ class ModelManager:
|
|
| 19 |
"file": "qwen2.5-coder-1.5b-instruct-q4_k_m.gguf",
|
| 20 |
"url": "https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF/resolve/main/qwen2.5-coder-1.5b-instruct-q4_k_m.gguf",
|
| 21 |
"format": "chatml"
|
| 22 |
-
},
|
| 23 |
-
"fast-chat": {
|
| 24 |
-
"repo": "Qwen/Qwen2.5-0.5B-Instruct-GGUF",
|
| 25 |
-
"file": "qwen2.5-0.5b-instruct-q4_k_m.gguf",
|
| 26 |
-
"url": "https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GGUF/resolve/main/qwen2.5-0.5b-instruct-q4_k_m.gguf",
|
| 27 |
-
"format": "chatml"
|
| 28 |
}
|
| 29 |
}
|
| 30 |
self.models_dir = os.path.join(os.getcwd(), "models")
|
| 31 |
os.makedirs(self.models_dir, exist_ok=True)
|
| 32 |
-
# Only download smallest model at startup (fast-chat: 0.5B)
|
| 33 |
self.critical_models = ["fast-chat"]
|
| 34 |
self.auto_download_critical()
|
| 35 |
|
|
@@ -39,7 +37,7 @@ class ModelManager:
|
|
| 39 |
for model_id in self.critical_models:
|
| 40 |
try:
|
| 41 |
path = self.download_model(model_id)
|
| 42 |
-
print(f"✓ {model_id} ready
|
| 43 |
except Exception as e:
|
| 44 |
print(f"✗ Failed to ensure {model_id}: {e}")
|
| 45 |
|
|
@@ -49,11 +47,10 @@ class ModelManager:
|
|
| 49 |
raise ValueError(f"Model {model_id} not configured")
|
| 50 |
|
| 51 |
target_path = os.path.join(self.models_dir, config["file"])
|
| 52 |
-
# Check if file exists AND has some size
|
| 53 |
if os.path.exists(target_path) and os.path.getsize(target_path) > 50000000:
|
| 54 |
return target_path
|
| 55 |
|
| 56 |
-
print(f"Downloading {model_id}
|
| 57 |
try:
|
| 58 |
response = requests.get(config["url"], stream=True, timeout=60)
|
| 59 |
response.raise_for_status()
|
|
@@ -61,12 +58,11 @@ class ModelManager:
|
|
| 61 |
for chunk in response.iter_content(chunk_size=1024*1024):
|
| 62 |
if chunk:
|
| 63 |
f.write(chunk)
|
| 64 |
-
print(f"
|
| 65 |
return target_path
|
| 66 |
except Exception as e:
|
| 67 |
if os.path.exists(target_path):
|
| 68 |
-
os.remove(target_path)
|
| 69 |
-
print(f"Download failed for {model_id}: {e}")
|
| 70 |
raise e
|
| 71 |
|
| 72 |
def load_model(self, model_id: str):
|
|
@@ -80,7 +76,6 @@ class ModelManager:
|
|
| 80 |
n_threads=2,
|
| 81 |
verbose=False
|
| 82 |
)
|
| 83 |
-
print(f"✓ Model {model_id} loaded")
|
| 84 |
return self.models[model_id]
|
| 85 |
|
| 86 |
def format_prompt(self, model_id: str, system: str, history: list, prompt: str):
|
|
@@ -108,8 +103,8 @@ class ModelManager:
|
|
| 108 |
llm = self.load_model(model_id)
|
| 109 |
|
| 110 |
system_text = (
|
| 111 |
-
"You are a
|
| 112 |
-
"For math,
|
| 113 |
)
|
| 114 |
|
| 115 |
full_prompt, stop_tokens = self.format_prompt(model_id, system_text, context or [], prompt)
|
|
@@ -132,4 +127,3 @@ class ModelManager:
|
|
| 132 |
if hasattr(model, 'close'):
|
| 133 |
model.close()
|
| 134 |
self.models.clear()
|
| 135 |
-
print("Cleanup complete")
|
|
|
|
| 6 |
class ModelManager:
|
| 7 |
def __init__(self):
|
| 8 |
self.models = {}
|
|
|
|
| 9 |
self.model_configs = {
|
| 10 |
+
"fast-chat": {
|
| 11 |
+
"repo": "Qwen/Qwen2.5-0.5B-Instruct-GGUF",
|
| 12 |
+
"file": "qwen2.5-0.5b-instruct-q4_k_m.gguf",
|
| 13 |
+
"url": "https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GGUF/resolve/main/qwen2.5-0.5b-instruct-q4_k_m.gguf",
|
| 14 |
+
"format": "chatml"
|
| 15 |
+
},
|
| 16 |
"tinyllama": {
|
| 17 |
"repo": "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
|
| 18 |
"file": "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
|
|
|
|
| 24 |
"file": "qwen2.5-coder-1.5b-instruct-q4_k_m.gguf",
|
| 25 |
"url": "https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF/resolve/main/qwen2.5-coder-1.5b-instruct-q4_k_m.gguf",
|
| 26 |
"format": "chatml"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
}
|
| 28 |
}
|
| 29 |
self.models_dir = os.path.join(os.getcwd(), "models")
|
| 30 |
os.makedirs(self.models_dir, exist_ok=True)
|
|
|
|
| 31 |
self.critical_models = ["fast-chat"]
|
| 32 |
self.auto_download_critical()
|
| 33 |
|
|
|
|
| 37 |
for model_id in self.critical_models:
|
| 38 |
try:
|
| 39 |
path = self.download_model(model_id)
|
| 40 |
+
print(f"✓ {model_id} ready")
|
| 41 |
except Exception as e:
|
| 42 |
print(f"✗ Failed to ensure {model_id}: {e}")
|
| 43 |
|
|
|
|
| 47 |
raise ValueError(f"Model {model_id} not configured")
|
| 48 |
|
| 49 |
target_path = os.path.join(self.models_dir, config["file"])
|
|
|
|
| 50 |
if os.path.exists(target_path) and os.path.getsize(target_path) > 50000000:
|
| 51 |
return target_path
|
| 52 |
|
| 53 |
+
print(f"Downloading {model_id}...")
|
| 54 |
try:
|
| 55 |
response = requests.get(config["url"], stream=True, timeout=60)
|
| 56 |
response.raise_for_status()
|
|
|
|
| 58 |
for chunk in response.iter_content(chunk_size=1024*1024):
|
| 59 |
if chunk:
|
| 60 |
f.write(chunk)
|
| 61 |
+
print(f"✓ {model_id} downloaded")
|
| 62 |
return target_path
|
| 63 |
except Exception as e:
|
| 64 |
if os.path.exists(target_path):
|
| 65 |
+
os.remove(target_path)
|
|
|
|
| 66 |
raise e
|
| 67 |
|
| 68 |
def load_model(self, model_id: str):
|
|
|
|
| 76 |
n_threads=2,
|
| 77 |
verbose=False
|
| 78 |
)
|
|
|
|
| 79 |
return self.models[model_id]
|
| 80 |
|
| 81 |
def format_prompt(self, model_id: str, system: str, history: list, prompt: str):
|
|
|
|
| 103 |
llm = self.load_model(model_id)
|
| 104 |
|
| 105 |
system_text = (
|
| 106 |
+
"You are a helpful AI assistant. "
|
| 107 |
+
"For math, use LaTeX with $ $ for display and \\( \\) for inline."
|
| 108 |
)
|
| 109 |
|
| 110 |
full_prompt, stop_tokens = self.format_prompt(model_id, system_text, context or [], prompt)
|
|
|
|
| 127 |
if hasattr(model, 'close'):
|
| 128 |
model.close()
|
| 129 |
self.models.clear()
|
|
|