Sabithulla committed on
Commit
9d2777a
·
1 Parent(s): 3274ec4

Multi-stage Docker build: Stage 1 compiles llama-cpp-python into wheels once, and Stage 2 installs those pre-built wheels, so the build no longer times out. The first build takes 8-12 minutes; subsequent builds are cached.

Browse files
Files changed (2) hide show
  1. Dockerfile +8 -10
  2. model_manager.py +12 -18
Dockerfile CHANGED
@@ -1,9 +1,9 @@
1
- # Stage 1: Compile llama-cpp-python to wheel (happens once)
2
  FROM python:3.11-slim AS builder
3
 
4
  WORKDIR /tmp/build
5
 
6
- # Install build tools
7
  RUN apt-get update && apt-get install -y \
8
  build-essential \
9
  cmake \
@@ -12,29 +12,27 @@ RUN apt-get update && apt-get install -y \
12
  # Copy requirements
13
  COPY requirements.txt .
14
 
15
- # Build wheel for llama-cpp-python (will save it)
16
  RUN pip wheel --no-cache-dir -r requirements.txt -w /tmp/wheels
17
 
18
- # Stage 2: Production image (just installs pre-built wheels)
19
  FROM python:3.11-slim
20
 
21
  WORKDIR /app
22
 
23
- # Install only runtime dependencies (no build tools needed)
24
  RUN apt-get update && apt-get install -y \
25
  tesseract-ocr \
26
  libtesseract-dev \
27
  && rm -rf /var/lib/apt/lists/*
28
 
29
- # Copy pre-built wheels from Stage 1 (NO COMPILATION!)
30
  COPY --from=builder /tmp/wheels /tmp/wheels
31
 
32
- # Install from pre-built wheels (instant, no compilation)
33
  RUN pip install --no-cache-dir --no-index --find-links /tmp/wheels -r requirements.txt
34
 
35
- COPY requirements.txt .
36
-
37
- # Copy application
38
  COPY . .
39
 
40
  # Create models directory
 
1
+ # Stage 1: Compile llama-cpp-python to wheel (one-time build)
2
  FROM python:3.11-slim AS builder
3
 
4
  WORKDIR /tmp/build
5
 
6
+ # Install build tools only in Stage 1
7
  RUN apt-get update && apt-get install -y \
8
  build-essential \
9
  cmake \
 
12
  # Copy requirements
13
  COPY requirements.txt .
14
 
15
+ # Build ALL wheels (llama-cpp-python gets compiled here)
16
  RUN pip wheel --no-cache-dir -r requirements.txt -w /tmp/wheels
17
 
18
+ # Stage 2: Production (just installs pre-built wheels from Stage 1)
19
  FROM python:3.11-slim
20
 
21
  WORKDIR /app
22
 
23
+ # Install only runtime dependencies (NO build tools!)
24
  RUN apt-get update && apt-get install -y \
25
  tesseract-ocr \
26
  libtesseract-dev \
27
  && rm -rf /var/lib/apt/lists/*
28
 
29
+ # Copy pre-built wheels from Stage 1 (compilation already done!)
30
  COPY --from=builder /tmp/wheels /tmp/wheels
31
 
32
+ # Install from pre-built wheels (INSTANT - no compilation!)
33
  RUN pip install --no-cache-dir --no-index --find-links /tmp/wheels -r requirements.txt
34
 
35
+ # Copy application code
 
 
36
  COPY . .
37
 
38
  # Create models directory
model_manager.py CHANGED
@@ -6,8 +6,13 @@ from typing import Generator
6
  class ModelManager:
7
  def __init__(self):
8
  self.models = {}
9
- # Templates for different model architectures
10
  self.model_configs = {
 
 
 
 
 
 
11
  "tinyllama": {
12
  "repo": "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
13
  "file": "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
@@ -19,17 +24,10 @@ class ModelManager:
19
  "file": "qwen2.5-coder-1.5b-instruct-q4_k_m.gguf",
20
  "url": "https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF/resolve/main/qwen2.5-coder-1.5b-instruct-q4_k_m.gguf",
21
  "format": "chatml"
22
- },
23
- "fast-chat": {
24
- "repo": "Qwen/Qwen2.5-0.5B-Instruct-GGUF",
25
- "file": "qwen2.5-0.5b-instruct-q4_k_m.gguf",
26
- "url": "https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GGUF/resolve/main/qwen2.5-0.5b-instruct-q4_k_m.gguf",
27
- "format": "chatml"
28
  }
29
  }
30
  self.models_dir = os.path.join(os.getcwd(), "models")
31
  os.makedirs(self.models_dir, exist_ok=True)
32
- # Only download smallest model at startup (fast-chat: 0.5B)
33
  self.critical_models = ["fast-chat"]
34
  self.auto_download_critical()
35
 
@@ -39,7 +37,7 @@ class ModelManager:
39
  for model_id in self.critical_models:
40
  try:
41
  path = self.download_model(model_id)
42
- print(f"✓ {model_id} ready ({path})")
43
  except Exception as e:
44
  print(f"✗ Failed to ensure {model_id}: {e}")
45
 
@@ -49,11 +47,10 @@ class ModelManager:
49
  raise ValueError(f"Model {model_id} not configured")
50
 
51
  target_path = os.path.join(self.models_dir, config["file"])
52
- # Check if file exists AND has some size
53
  if os.path.exists(target_path) and os.path.getsize(target_path) > 50000000:
54
  return target_path
55
 
56
- print(f"Downloading {model_id} from {config['url']}...")
57
  try:
58
  response = requests.get(config["url"], stream=True, timeout=60)
59
  response.raise_for_status()
@@ -61,12 +58,11 @@ class ModelManager:
61
  for chunk in response.iter_content(chunk_size=1024*1024):
62
  if chunk:
63
  f.write(chunk)
64
- print(f"Successfully downloaded {model_id}")
65
  return target_path
66
  except Exception as e:
67
  if os.path.exists(target_path):
68
- os.remove(target_path)
69
- print(f"Download failed for {model_id}: {e}")
70
  raise e
71
 
72
  def load_model(self, model_id: str):
@@ -80,7 +76,6 @@ class ModelManager:
80
  n_threads=2,
81
  verbose=False
82
  )
83
- print(f"✓ Model {model_id} loaded")
84
  return self.models[model_id]
85
 
86
  def format_prompt(self, model_id: str, system: str, history: list, prompt: str):
@@ -108,8 +103,8 @@ class ModelManager:
108
  llm = self.load_model(model_id)
109
 
110
  system_text = (
111
- "You are a highly accurate AI assistant. "
112
- "For math, ALWAYS use LaTeX wrapping display equations in $ $ and inline in \\( \\)."
113
  )
114
 
115
  full_prompt, stop_tokens = self.format_prompt(model_id, system_text, context or [], prompt)
@@ -132,4 +127,3 @@ class ModelManager:
132
  if hasattr(model, 'close'):
133
  model.close()
134
  self.models.clear()
135
- print("Cleanup complete")
 
6
  class ModelManager:
7
  def __init__(self):
8
  self.models = {}
 
9
  self.model_configs = {
10
+ "fast-chat": {
11
+ "repo": "Qwen/Qwen2.5-0.5B-Instruct-GGUF",
12
+ "file": "qwen2.5-0.5b-instruct-q4_k_m.gguf",
13
+ "url": "https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GGUF/resolve/main/qwen2.5-0.5b-instruct-q4_k_m.gguf",
14
+ "format": "chatml"
15
+ },
16
  "tinyllama": {
17
  "repo": "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
18
  "file": "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
 
24
  "file": "qwen2.5-coder-1.5b-instruct-q4_k_m.gguf",
25
  "url": "https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF/resolve/main/qwen2.5-coder-1.5b-instruct-q4_k_m.gguf",
26
  "format": "chatml"
 
 
 
 
 
 
27
  }
28
  }
29
  self.models_dir = os.path.join(os.getcwd(), "models")
30
  os.makedirs(self.models_dir, exist_ok=True)
 
31
  self.critical_models = ["fast-chat"]
32
  self.auto_download_critical()
33
 
 
37
  for model_id in self.critical_models:
38
  try:
39
  path = self.download_model(model_id)
40
+ print(f"✓ {model_id} ready")
41
  except Exception as e:
42
  print(f"✗ Failed to ensure {model_id}: {e}")
43
 
 
47
  raise ValueError(f"Model {model_id} not configured")
48
 
49
  target_path = os.path.join(self.models_dir, config["file"])
 
50
  if os.path.exists(target_path) and os.path.getsize(target_path) > 50000000:
51
  return target_path
52
 
53
+ print(f"Downloading {model_id}...")
54
  try:
55
  response = requests.get(config["url"], stream=True, timeout=60)
56
  response.raise_for_status()
 
58
  for chunk in response.iter_content(chunk_size=1024*1024):
59
  if chunk:
60
  f.write(chunk)
61
+ print(f" {model_id} downloaded")
62
  return target_path
63
  except Exception as e:
64
  if os.path.exists(target_path):
65
+ os.remove(target_path)
 
66
  raise e
67
 
68
  def load_model(self, model_id: str):
 
76
  n_threads=2,
77
  verbose=False
78
  )
 
79
  return self.models[model_id]
80
 
81
  def format_prompt(self, model_id: str, system: str, history: list, prompt: str):
 
103
  llm = self.load_model(model_id)
104
 
105
  system_text = (
106
+ "You are a helpful AI assistant. "
107
+ "For math, use LaTeX with $ $ for display and \\( \\) for inline."
108
  )
109
 
110
  full_prompt, stop_tokens = self.format_prompt(model_id, system_text, context or [], prompt)
 
127
  if hasattr(model, 'close'):
128
  model.close()
129
  self.models.clear()