Sabithulla committed on
Commit
3274ec4
·
1 Parent(s): 64f495c

Multi-stage Docker build: Stage 1 compiles llama-cpp-python to wheel, Stage 2 installs pre-built wheel - NO TIMEOUT! Pre-download fast-chat model at build time.

Browse files
Files changed (5) hide show
  1. Dockerfile +33 -14
  2. download_models.py +62 -0
  3. model_manager.py +98 -115
  4. requirements.txt +1 -1
  5. start.sh +2 -17
Dockerfile CHANGED
@@ -1,30 +1,49 @@
1
- FROM ollama/ollama:latest
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
  WORKDIR /app
4
 
5
- # Install Python and dependencies
6
  RUN apt-get update && apt-get install -y \
7
- python3.11 \
8
- python3-pip \
9
  tesseract-ocr \
10
  libtesseract-dev \
11
- curl \
12
  && rm -rf /var/lib/apt/lists/*
13
 
14
- # Copy Python requirements and install (no build tools needed)
 
 
 
 
 
15
  COPY requirements.txt .
16
- RUN pip install --no-cache-dir -r requirements.txt
17
 
18
- # Copy application code
19
  COPY . .
20
 
21
  # Create models directory
22
- RUN mkdir -p /root/.ollama/models
23
 
24
- EXPOSE 7860 11434
 
 
25
 
26
- # Startup script: start Ollama + FastAPI
27
- COPY start.sh .
28
- RUN chmod +x start.sh
29
 
30
- CMD ["./start.sh"]
 
1
# Stage 1: build wheels for every Python dependency. llama-cpp-python
# compiles its native code here once, so the runtime stage never needs
# a compiler toolchain (and never hits a build timeout).
FROM python:3.11-slim AS builder

WORKDIR /tmp/build

# Build tools required to compile llama-cpp-python from source.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    cmake \
    && rm -rf /var/lib/apt/lists/*

COPY requirements.txt .

# Produce wheels for all requirements into /tmp/wheels.
RUN pip wheel --no-cache-dir -r requirements.txt -w /tmp/wheels

# Stage 2: slim production image — installs only the pre-built wheels.
FROM python:3.11-slim

WORKDIR /app

# Runtime-only system dependencies (OCR); no build tools needed.
RUN apt-get update && apt-get install -y --no-install-recommends \
    tesseract-ocr \
    libtesseract-dev \
    && rm -rf /var/lib/apt/lists/*

# requirements.txt must be present BEFORE pip install references it
# (the original ordering copied it afterwards, which breaks the build).
COPY requirements.txt .

# Copy pre-built wheels from stage 1 and install without compilation,
# then drop the wheel directory so it does not bloat the image.
COPY --from=builder /tmp/wheels /tmp/wheels
RUN pip install --no-cache-dir --no-index --find-links /tmp/wheels -r requirements.txt \
    && rm -rf /tmp/wheels

# Application code (includes download_models.py).
COPY . .

# Local model storage; fetch the fast-chat model at build time so the
# first request does not pay the download cost. Failure is non-fatal —
# the app can download on demand at runtime.
RUN mkdir -p models
RUN python download_models.py || echo "Model download attempted"

EXPOSE 7860

CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860", "--timeout-keep-alive", "75"]
download_models.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""Download GGUF models at Docker build time so containers start instantly."""

import os

import requests

MODELS_DIR = "models"
# Files smaller than this are treated as failed/partial downloads.
MIN_VALID_SIZE = 50_000_000  # ~50 MB
os.makedirs(MODELS_DIR, exist_ok=True)

MODEL_CONFIGS = {
    "fast-chat": {
        "file": "qwen2.5-0.5b-instruct-q4_k_m.gguf",
        "url": "https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GGUF/resolve/main/qwen2.5-0.5b-instruct-q4_k_m.gguf"
    },
    "tinyllama": {
        "file": "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
        "url": "https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"
    },
    "coder": {
        "file": "qwen2.5-coder-1.5b-instruct-q4_k_m.gguf",
        "url": "https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF/resolve/main/qwen2.5-coder-1.5b-instruct-q4_k_m.gguf"
    }
}

def download_model(model_id, config):
    """Download a single model, skipping files that already look complete.

    Streams to a ``.part`` temp file and renames atomically on success,
    so an interrupted download can never leave a truncated ``.gguf``
    that would later pass the size check and be served as a real model.
    Failures are logged, not raised — the build should not abort.
    """
    filepath = os.path.join(MODELS_DIR, config["file"])

    # Skip if already exists and has a plausible size.
    if os.path.exists(filepath) and os.path.getsize(filepath) > MIN_VALID_SIZE:
        print(f"✓ {model_id} already exists ({os.path.getsize(filepath) / 1e9:.2f}GB)")
        return

    print(f"Downloading {model_id}...")
    tmp_path = filepath + ".part"
    try:
        response = requests.get(config["url"], stream=True, timeout=60)
        response.raise_for_status()

        total_size = int(response.headers.get('content-length', 0))
        downloaded = 0

        with open(tmp_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=10*1024*1024):  # 10MB chunks
                if chunk:
                    f.write(chunk)
                    downloaded += len(chunk)
                    if total_size:
                        pct = (downloaded / total_size) * 100
                        print(f"  {model_id}: {pct:.1f}%", end='\r')

        # Atomic publish: the final filename only appears once complete.
        os.replace(tmp_path, filepath)
        print(f"✓ {model_id} downloaded ({os.path.getsize(filepath) / 1e9:.2f}GB)")
    except Exception as e:
        # Clean up any partial temp file before reporting.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        print(f"✗ Failed to download {model_id}: {e}")

if __name__ == "__main__":
    print("Pre-downloading models at build time...")

    # Only download fast-chat at build time (others on-demand).
    download_model("fast-chat", MODEL_CONFIGS["fast-chat"])

    print(f"\n✓ Models ready in {MODELS_DIR}/")
model_manager.py CHANGED
@@ -1,152 +1,135 @@
1
  import os
 
2
  import requests
3
  from typing import Generator
4
- import time
5
- import json
6
-
7
- OLLAMA_API = "http://localhost:11434"
8
 
9
  class ModelManager:
10
  def __init__(self):
11
  self.models = {}
12
- self.ollama_ready = False
13
- self._wait_for_ollama()
14
-
15
- # Map model IDs to Ollama model names
16
- self.model_map = {
17
- "fast-chat": "qwen2.5:0.5b",
18
- "tinyllama": "tinyllama:latest",
19
- "phi": "neural-chat:7b",
20
- "coder": "mistral:latest",
21
- "orca": "llama2:latest",
22
- "mistral": "mistral:latest",
23
- "neural": "neural-chat:7b",
24
- "zephyr": "neural-chat:7b",
25
- "openhermes": "neural-chat:7b",
26
- "starling": "neural-chat:7b",
27
- "dolphin": "mistral:latest"
 
 
 
 
28
  }
29
-
30
  self.models_dir = os.path.join(os.getcwd(), "models")
31
  os.makedirs(self.models_dir, exist_ok=True)
32
-
33
- # Critical models to pull at startup
34
  self.critical_models = ["fast-chat"]
35
  self.auto_download_critical()
36
 
37
- def _wait_for_ollama(self, max_retries=30):
38
- """Wait for Ollama service to be ready"""
39
- for i in range(max_retries):
40
- try:
41
- response = requests.get(f"{OLLAMA_API}/api/version", timeout=2)
42
- if response.status_code == 200:
43
- print(f"✓ Ollama is ready")
44
- self.ollama_ready = True
45
- return
46
- except:
47
- pass
48
-
49
- if i < max_retries - 1:
50
- print(f"Waiting for Ollama... ({i+1}/{max_retries})")
51
- time.sleep(1)
52
-
53
- print("⚠ Ollama not responding, continuing anyway...")
54
-
55
  def auto_download_critical(self):
56
  """Download only critical lightweight models at startup"""
57
- if not self.ollama_ready:
58
- print("Skipping model download - Ollama not ready")
59
- return
60
-
61
- print("Pulling critical models...")
62
  for model_id in self.critical_models:
63
  try:
64
- ollama_model = self.model_map.get(model_id, model_id)
65
- self.pull_model(ollama_model)
66
- print(f"✓ {model_id} ({ollama_model}) ready")
67
  except Exception as e:
68
- print(f"✗ Failed to pull {model_id}: {e}")
69
 
70
- def pull_model(self, model_name: str):
71
- """Pull model from Ollama"""
72
- url = f"{OLLAMA_API}/api/pull"
73
- data = {"name": model_name, "stream": False}
74
 
75
- response = requests.post(url, json=data, timeout=300)
76
- response.raise_for_status()
77
- return True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
 
79
  def load_model(self, model_id: str):
80
- """Models are managed by Ollama, just return a reference"""
81
  if model_id in self.models:
82
  return self.models[model_id]
83
 
84
- ollama_model = self.model_map.get(model_id, model_id)
85
- self.models[model_id] = ollama_model
86
- return ollama_model
 
 
 
 
 
 
87
 
88
  def format_prompt(self, model_id: str, system: str, history: list, prompt: str):
89
- """Simple prompt formatting for Ollama (handles templates internally)"""
90
- # Ollama handles prompt formatting internally, just concatenate messages
91
- messages = []
92
- messages.append({"role": "system", "content": system})
93
 
94
- if history:
 
95
  for msg in history:
96
- messages.append(msg)
97
-
98
- messages.append({"role": "user", "content": prompt})
99
- return messages
 
 
 
 
 
 
 
 
 
 
100
 
101
  def generate_stream(self, model_id: str, prompt: str, context: list = None, **kwargs) -> Generator[str, None, None]:
102
- """Stream response from Ollama"""
103
- if not self.ollama_ready:
104
- yield "Error: Ollama service not ready"
105
- return
106
 
107
- try:
108
- ollama_model = self.load_model(model_id)
109
-
110
- system_text = (
111
- "You are a highly accurate AI assistant. "
112
- "For math, ALWAYS use LaTeX wrapping display equations in [ ] and inline in ( )."
113
- )
114
-
115
- messages = self.format_prompt(model_id, system_text, context or [], prompt)
116
-
117
- # Call Ollama generate endpoint with streaming
118
- url = f"{OLLAMA_API}/api/chat"
119
- payload = {
120
- "model": ollama_model,
121
- "messages": messages,
122
- "stream": True,
123
- "options": {
124
- "temperature": kwargs.get("temperature", 0.7),
125
- "top_p": kwargs.get("top_p", 0.95),
126
- "num_predict": kwargs.get("max_tokens", 512)
127
- }
128
- }
129
-
130
- response = requests.post(url, json=payload, stream=True, timeout=300)
131
- response.raise_for_status()
132
-
133
- for line in response.iter_lines():
134
- if line:
135
- try:
136
- chunk = json.loads(line)
137
- if "message" in chunk and "content" in chunk["message"]:
138
- token = chunk["message"]["content"]
139
- if token:
140
- yield token
141
- except json.JSONDecodeError:
142
- pass
143
-
144
- except Exception as e:
145
- print(f"Error generating response: {e}")
146
- yield f"Error: {str(e)}"
147
 
148
  def cleanup(self):
149
  """Cleanup resources"""
150
- # Ollama manages its own resources
 
 
151
  self.models.clear()
152
  print("Cleanup complete")
 
1
  import os
2
+ from llama_cpp import Llama
3
  import requests
4
  from typing import Generator
 
 
 
 
5
 
6
class ModelManager:
    """Loads and serves local GGUF models via llama-cpp-python.

    Model files are downloaded lazily from Hugging Face into ./models;
    the smallest one ("fast-chat", 0.5B) is ensured at construction time
    so the first request does not pay the download cost.
    """

    # Files smaller than this are treated as incomplete downloads.
    MIN_VALID_SIZE = 50000000  # ~50 MB

    def __init__(self):
        self.models = {}  # model_id -> loaded Llama instance
        # Download sources and prompt-template family per model.
        self.model_configs = {
            "tinyllama": {
                "repo": "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
                "file": "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
                "url": "https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
                "format": "tinyllama"
            },
            "coder": {
                "repo": "Qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF",
                "file": "qwen2.5-coder-1.5b-instruct-q4_k_m.gguf",
                "url": "https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF/resolve/main/qwen2.5-coder-1.5b-instruct-q4_k_m.gguf",
                "format": "chatml"
            },
            "fast-chat": {
                "repo": "Qwen/Qwen2.5-0.5B-Instruct-GGUF",
                "file": "qwen2.5-0.5b-instruct-q4_k_m.gguf",
                "url": "https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GGUF/resolve/main/qwen2.5-0.5b-instruct-q4_k_m.gguf",
                "format": "chatml"
            }
        }
        self.models_dir = os.path.join(os.getcwd(), "models")
        os.makedirs(self.models_dir, exist_ok=True)
        # Only download smallest model at startup (fast-chat: 0.5B).
        self.critical_models = ["fast-chat"]
        self.auto_download_critical()

    def auto_download_critical(self):
        """Ensure critical lightweight models are present at startup.

        Normally a no-op because the image pre-downloads fast-chat at
        build time; a failure here is non-fatal (retried on first use).
        """
        print("Checking for pre-downloaded models...")
        for model_id in self.critical_models:
            try:
                path = self.download_model(model_id)
                print(f"✓ {model_id} ready ({path})")
            except Exception as e:
                print(f"✗ Failed to ensure {model_id}: {e}")

    def download_model(self, model_id: str):
        """Return the local path of the model's GGUF file, downloading it
        if missing or obviously truncated.

        Raises ValueError for unknown model ids; re-raises download
        errors after removing any partial temp file.
        """
        config = self.model_configs.get(model_id)
        if not config:
            raise ValueError(f"Model {model_id} not configured")

        target_path = os.path.join(self.models_dir, config["file"])
        # Already present AND plausibly complete.
        if os.path.exists(target_path) and os.path.getsize(target_path) > self.MIN_VALID_SIZE:
            return target_path

        print(f"Downloading {model_id} from {config['url']}...")
        tmp_path = target_path + ".part"
        try:
            response = requests.get(config["url"], stream=True, timeout=60)
            response.raise_for_status()
            # Stream to a temp file, then rename atomically so an aborted
            # download can never masquerade as a complete model file.
            with open(tmp_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=1024*1024):
                    if chunk:
                        f.write(chunk)
            os.replace(tmp_path, target_path)
            print(f"Successfully downloaded {model_id}")
            return target_path
        except Exception as e:
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
            print(f"Download failed for {model_id}: {e}")
            raise

    def load_model(self, model_id: str):
        """Return a cached Llama instance, downloading/loading on first use."""
        if model_id in self.models:
            return self.models[model_id]

        path = self.download_model(model_id)
        # Small context window and thread count — sized for a tiny
        # CPU-only container; presumably 2 vCPUs, confirm against host.
        self.models[model_id] = Llama(
            model_path=path,
            n_ctx=1024,
            n_threads=2,
            verbose=False
        )
        print(f"✓ Model {model_id} loaded")
        return self.models[model_id]

    def format_prompt(self, model_id: str, system: str, history: list, prompt: str):
        """Build (prompt_text, stop_tokens) in the model's template family."""
        fmt = self.model_configs[model_id]["format"]

        if fmt == "chatml":
            full = f"<|im_start|>system\n{system}<|im_end|>\n"
            for msg in history:
                role = "user" if msg["role"] == "user" else "assistant"
                full += f"<|im_start|>{role}\n{msg['content']}<|im_end|>\n"
            full += f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
            return full, ["<|im_end|>", "###", "<|im_start|>", "</s>"]

        elif fmt == "tinyllama":
            full = f"<|system|>\n{system}</s>\n"
            for msg in history:
                role = "user" if msg["role"] == "user" else "assistant"
                full += f"<|{role}|>\n{msg['content']}</s>\n"
            full += f"<|user|>\n{prompt}</s>\n<|assistant|>\n"
            return full, ["</s>", "<|user|>", "<|assistant|>"]

        # Unknown template family: pass the prompt through untouched.
        return prompt, ["</s>"]

    def generate_stream(self, model_id: str, prompt: str, context: list = None, **kwargs) -> Generator[str, None, None]:
        """Stream completion tokens for *prompt*.

        Failures are yielded as a single "Error: ..." string rather than
        raised, so an in-flight HTTP streaming response stays well-formed
        (the previous Ollama-based version had this guarantee; the llama.cpp
        rewrite had dropped it).
        """
        try:
            llm = self.load_model(model_id)
        except Exception as e:
            yield f"Error: {e}"
            return

        system_text = (
            "You are a highly accurate AI assistant. "
            "For math, ALWAYS use LaTeX wrapping display equations in $ $ and inline in \\( \\)."
        )

        full_prompt, stop_tokens = self.format_prompt(model_id, system_text, context or [], prompt)

        params = {
            "max_tokens": kwargs.get("max_tokens", 512),
            "stop": stop_tokens,
            "stream": True,
            "temperature": kwargs.get("temperature", 0.7),
            "top_p": kwargs.get("top_p", 0.95)
        }

        try:
            for output in llm(full_prompt, **params):
                token = output["choices"][0]["text"]
                yield token
        except Exception as e:
            yield f"Error: {e}"

    def cleanup(self):
        """Release loaded models and free llama.cpp resources."""
        for model in self.models.values():
            if hasattr(model, 'close'):
                model.close()
        self.models.clear()
        print("Cleanup complete")
requirements.txt CHANGED
@@ -1,6 +1,6 @@
1
  fastapi
2
  uvicorn
3
- requests
4
  supabase
5
  python-multipart
6
  pytesseract
 
1
  fastapi
2
  uvicorn
3
+ llama-cpp-python
4
  supabase
5
  python-multipart
6
  pytesseract
start.sh CHANGED
@@ -1,18 +1,3 @@
1
  #!/bin/bash
2
- set -e
3
-
4
- # Start Ollama in background
5
- echo "Starting Ollama..."
6
- ollama serve --host 0.0.0.0 &
7
- OLLAMA_PID=$!
8
-
9
- # Wait for Ollama to be ready
10
- sleep 5
11
-
12
- # Pull the model
13
- echo "Pulling fast-chat model (qwen2.5-0.5b)..."
14
- ollama pull qwen2.5:0.5b || echo "Model may already exist"
15
-
16
- # Start FastAPI app
17
- echo "Starting FastAPI app..."
18
- exec python3 -m uvicorn main:app --host 0.0.0.0 --port 7860
 
1
#!/bin/bash
# Models are pre-downloaded at build time, just run the app.
# Fail fast on any unexpected error before handing off (restores the
# `set -e` the previous version of this script carried).
set -euo pipefail
# exec replaces the shell so uvicorn receives container signals
# (SIGTERM on shutdown) directly instead of through a bash parent.
exec uvicorn main:app --host 0.0.0.0 --port 7860 --timeout-keep-alive 75