Sabithulla committed on
Commit
64f495c
·
1 Parent(s): 939e78c

Switch to Ollama for zero-compilation deployment - pre-downloads models at startup

Browse files
Files changed (4) hide show
  1. Dockerfile +13 -17
  2. model_manager.py +119 -161
  3. requirements.txt +1 -1
  4. start.sh +18 -0
Dockerfile CHANGED
@@ -1,34 +1,30 @@
1
- FROM python:3.11-slim
2
 
3
  WORKDIR /app
4
 
5
- # Install minimal system dependencies (no compilation needed for binary wheels)
6
  RUN apt-get update && apt-get install -y \
 
 
7
  tesseract-ocr \
8
  libtesseract-dev \
 
9
  && rm -rf /var/lib/apt/lists/*
10
 
11
- # Copy requirements
12
  COPY requirements.txt .
13
-
14
- # Install using pre-compiled binary wheels only (NO compilation)
15
- RUN pip install --no-cache-dir --no-build --prefer-binary -r requirements.txt
16
 
17
  # Copy application code
18
  COPY . .
19
 
20
  # Create models directory
21
- RUN mkdir -p models
22
-
23
- EXPOSE 7860
24
 
25
- CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860", "--timeout-keep-alive", "75"]
26
-
27
- # Create models directory
28
- RUN mkdir -p models
29
 
30
- # Expose port 7860
31
- EXPOSE 7860
 
32
 
33
- # Run app
34
- CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860", "--timeout-keep-alive", "75"]
 
1
FROM ollama/ollama:latest

WORKDIR /app

# Install Python and OCR dependencies.
# Use the distro default python3/python3-pip: the ollama base image is Ubuntu,
# where a pinned "python3.11" package may not exist and "pip" is not on PATH.
RUN apt-get update && apt-get install -y \
    python3 \
    python3-pip \
    tesseract-ocr \
    libtesseract-dev \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Copy Python requirements and install (no build tools needed)
COPY requirements.txt .
RUN pip3 install --no-cache-dir -r requirements.txt

# Copy application code
COPY . .

# Create Ollama's model store directory
RUN mkdir -p /root/.ollama/models

# 7860 = FastAPI app, 11434 = Ollama API
EXPOSE 7860 11434

# Startup script: start Ollama + FastAPI
COPY start.sh .
RUN chmod +x start.sh

# Bug fix: the base image sets ENTRYPOINT ["/bin/ollama"], so a bare
# CMD ["./start.sh"] would be passed as an ARGUMENT to the ollama binary
# and the script would never run. Reset the entrypoint first.
ENTRYPOINT []
CMD ["./start.sh"]
 
model_manager.py CHANGED
@@ -1,194 +1,152 @@
1
  import os
2
- from llama_cpp import Llama
3
  import requests
4
  from typing import Generator
 
 
 
 
5
 
6
  class ModelManager:
7
  def __init__(self):
8
  self.models = {}
9
- # Templates for different model architectures
10
- self.model_configs = {
11
- "tinyllama": {
12
- "repo": "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
13
- "file": "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
14
- "url": "https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
15
- "format": "tinyllama"
16
- },
17
- "phi": {
18
- "repo": "TheBloke/phi-2-GGUF",
19
- "file": "phi-2.Q4_K_M.gguf",
20
- "url": "https://huggingface.co/TheBloke/phi-2-GGUF/resolve/main/phi-2.Q4_K_M.gguf",
21
- "format": "phi"
22
- },
23
- "coder": {
24
- "repo": "Qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF",
25
- "file": "qwen2.5-coder-1.5b-instruct-q4_k_m.gguf",
26
- "url": "https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF/resolve/main/qwen2.5-coder-1.5b-instruct-q4_k_m.gguf",
27
- "format": "chatml"
28
- },
29
- "orca": {
30
- "repo": "bartowski/Llama-3.2-3B-Instruct-GGUF",
31
- "file": "Llama-3.2-3B-Instruct-Q4_K_M.gguf",
32
- "url": "https://huggingface.co/bartowski/Llama-3.2-3B-Instruct-GGUF/resolve/main/Llama-3.2-3B-Instruct-Q4_K_M.gguf",
33
- "format": "llama3"
34
- },
35
- "fast-chat": {
36
- "repo": "Qwen/Qwen2.5-0.5B-Instruct-GGUF",
37
- "file": "qwen2.5-0.5b-instruct-q4_k_m.gguf",
38
- "url": "https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GGUF/resolve/main/qwen2.5-0.5b-instruct-q4_k_m.gguf",
39
- "format": "chatml"
40
- },
41
- "mistral": {
42
- "repo": "TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
43
- "file": "mistral-7b-instruct-v0.2.Q4_K_M.gguf",
44
- "url": "https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q4_K_M.gguf",
45
- "format": "chatml"
46
- },
47
- "neural": {
48
- "repo": "TheBloke/neural-chat-7B-v3-1-GGUF",
49
- "file": "neural-chat-7b-v3-1.Q4_K_M.gguf",
50
- "url": "https://huggingface.co/TheBloke/neural-chat-7B-v3-1-GGUF/resolve/main/neural-chat-7b-v3-1.Q4_K_M.gguf",
51
- "format": "chatml"
52
- },
53
- "zephyr": {
54
- "repo": "TheBloke/zephyr-7B-beta-GGUF",
55
- "file": "zephyr-7b-beta.Q4_K_M.gguf",
56
- "url": "https://huggingface.co/TheBloke/zephyr-7B-beta-GGUF/resolve/main/zephyr-7b-beta.Q4_K_M.gguf",
57
- "format": "chatml"
58
- },
59
- "openhermes": {
60
- "repo": "TheBloke/OpenHermes-2.5-Mistral-7B-GGUF",
61
- "file": "openhermes-2.5-mistral-7b.Q4_K_M.gguf",
62
- "url": "https://huggingface.co/TheBloke/OpenHermes-2.5-Mistral-7B-GGUF/resolve/main/openhermes-2.5-mistral-7b.Q4_K_M.gguf",
63
- "format": "chatml"
64
- },
65
- "starling": {
66
- "repo": "TheBloke/Starling-LM-7B-alpha-GGUF",
67
- "file": "starling-lm-7b-alpha.Q4_K_M.gguf",
68
- "url": "https://huggingface.co/TheBloke/Starling-LM-7B-alpha-GGUF/resolve/main/starling-lm-7b-alpha.Q4_K_M.gguf",
69
- "format": "chatml"
70
- },
71
- "dolphin": {
72
- "repo": "TheBloke/dolphin-2.5-mixtral-8x7b-GGUF",
73
- "file": "dolphin-2.5-mixtral-8x7b.Q4_K_M.gguf",
74
- "url": "https://huggingface.co/TheBloke/dolphin-2.5-mixtral-8x7b-GGUF/resolve/main/dolphin-2.5-mixtral-8x7b.Q4_K_M.gguf",
75
- "format": "chatml"
76
- }
77
  }
 
78
  self.models_dir = os.path.join(os.getcwd(), "models")
79
  os.makedirs(self.models_dir, exist_ok=True)
80
- # Only download smallest model at startup (fast-chat: 0.5B)
 
81
  self.critical_models = ["fast-chat"]
82
  self.auto_download_critical()
83
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
  def auto_download_critical(self):
85
  """Download only critical lightweight models at startup"""
86
- print("Downloading critical models...")
 
 
 
 
87
  for model_id in self.critical_models:
88
  try:
89
- path = self.download_model(model_id)
90
- print(f"✓ {model_id} ready ({path})")
 
91
  except Exception as e:
92
- print(f"✗ Failed to download {model_id}: {e}")
93
 
94
- def download_model(self, model_id: str):
95
- config = self.model_configs.get(model_id)
96
- if not config:
97
- raise ValueError(f"Model {model_id} not configured")
98
 
99
- target_path = os.path.join(self.models_dir, config["file"])
100
- # Check if file exists AND has some size
101
- if os.path.exists(target_path) and os.path.getsize(target_path) > 50000000: # Min 50MB
102
- return target_path
103
-
104
- print(f"Downloading {model_id} from {config['url']}...")
105
- try:
106
- # Using a more standard stream download with content-length check if possible
107
- response = requests.get(config["url"], stream=True, timeout=60)
108
- response.raise_for_status()
109
- with open(target_path, "wb") as f:
110
- for chunk in response.iter_content(chunk_size=1024*1024): # 1MB chunks
111
- if chunk:
112
- f.write(chunk)
113
- print(f"Successfully downloaded {model_id}")
114
- return target_path
115
- except Exception as e:
116
- if os.path.exists(target_path):
117
- os.remove(target_path)
118
- print(f"Download failed for {model_id}: {e}")
119
- raise e
120
 
121
  def load_model(self, model_id: str):
 
122
  if model_id in self.models:
123
  return self.models[model_id]
124
 
125
- path = self.download_model(model_id)
126
- self.models[model_id] = Llama(
127
- model_path=path,
128
- n_ctx=1024, # Reduced for memory
129
- n_threads=2, # Light weight
130
- verbose=False
131
- )
132
- print(f"✓ Model {model_id} loaded")
133
- return self.models[model_id]
134
 
135
  def format_prompt(self, model_id: str, system: str, history: list, prompt: str):
136
- fmt = self.model_configs[model_id]["format"]
 
 
 
137
 
138
- if fmt == "chatml":
139
- full = f"<|im_start|>system\n{system}<|im_end|>\n"
140
- for msg in history:
141
- role = "user" if msg["role"] == "user" else "assistant"
142
- full += f"<|im_start|>{role}\n{msg['content']}<|im_end|>\n"
143
- full += f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
144
- return full, ["<|im_end|>", "###", "<|im_start|>", "</s>"]
145
-
146
- elif fmt == "tinyllama":
147
- full = f"<|system|>\n{system}</s>\n"
148
  for msg in history:
149
- role = "user" if msg["role"] == "user" else "assistant"
150
- full += f"<|{role}|>\n{msg['content']}</s>\n"
151
- full += f"<|user|>\n{prompt}</s>\n<|assistant|>\n"
152
- return full, ["</s>", "<|user|>", "<|assistant|>"]
153
-
154
- elif fmt == "llama3":
155
- # Llama 3.2 template
156
- full = f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{system}<|eot_id|>"
157
- for msg in history:
158
- role = "user" if msg["role"] == "user" else "assistant"
159
- full += f"<|start_header_id|>{role}<|end_header_id|>\n\n{msg['content']}<|eot_id|>"
160
- full += f"<|start_header_id|>user<|end_header_id|>\n\n{prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
161
- return full, ["<|eot_id|>", "<|start_header_id|>", "</s>"]
162
-
163
- elif fmt == "phi":
164
- # Phi-2 optimized prompt
165
- full = f"Instruct: {system}\n{prompt}\nOutput:"
166
- return full, ["Instruct:", "Output:", "<|endoftext|>", "</s>"]
167
-
168
- return prompt, ["</s>"]
169
-
170
- return prompt, ["</s>"]
171
 
172
  def generate_stream(self, model_id: str, prompt: str, context: list = None, **kwargs) -> Generator[str, None, None]:
173
- llm = self.load_model(model_id)
174
-
175
- system_text = (
176
- "You are a highly accurate AI assistant. "
177
- "For math, ALWAYS use LaTeX wrapping display equations in [ ] and inline in ( )."
178
- )
179
 
180
- full_prompt, stop_tokens = self.format_prompt(model_id, system_text, context or [], prompt)
181
-
182
- params = {
183
- "max_tokens": kwargs.get("max_tokens", 512), # Reduced for memory
184
- "stop": stop_tokens,
185
- "stream": True,
186
- "temperature": kwargs.get("temperature", 0.7),
187
- "top_p": kwargs.get("top_p", 0.95)
188
- }
189
-
190
- for output in llm(full_prompt, **params):
191
- token = output["choices"][0]["text"]
192
- yield token
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
 
194
- model_manager = ModelManager()
 
 
 
 
 
1
  import os
 
2
  import requests
3
  from typing import Generator
4
+ import time
5
+ import json
6
+
7
+ OLLAMA_API = "http://localhost:11434"
8
 
9
class ModelManager:
    """Thin client around a local Ollama server: maps public model IDs to
    Ollama model names, pulls critical models at startup, and streams chat
    completions."""

    def __init__(self):
        # Cache of model_id -> resolved Ollama model name (see load_model()).
        self.models = {}
        # Flipped to True by _wait_for_ollama() once the local server answers.
        self.ollama_ready = False
        self._wait_for_ollama()

        # Map model IDs to Ollama model names.
        # NOTE(review): several IDs alias the same Ollama model ("phi",
        # "neural", "zephyr", "openhermes", "starling" -> neural-chat:7b;
        # "coder"/"dolphin" -> mistral) — confirm these substitutions are
        # intentional and acceptable to callers.
        self.model_map = {
            "fast-chat": "qwen2.5:0.5b",
            "tinyllama": "tinyllama:latest",
            "phi": "neural-chat:7b",
            "coder": "mistral:latest",
            "orca": "llama2:latest",
            "mistral": "mistral:latest",
            "neural": "neural-chat:7b",
            "zephyr": "neural-chat:7b",
            "openhermes": "neural-chat:7b",
            "starling": "neural-chat:7b",
            "dolphin": "mistral:latest"
        }

        # Local scratch directory; Ollama stores its own model files under
        # /root/.ollama, so this dir is presumably legacy — TODO confirm.
        self.models_dir = os.path.join(os.getcwd(), "models")
        os.makedirs(self.models_dir, exist_ok=True)

        # Critical models to pull at startup
        self.critical_models = ["fast-chat"]
        self.auto_download_critical()
37
def _wait_for_ollama(self, max_retries=30):
    """Poll the local Ollama HTTP API until it responds or retries run out.

    Sets ``self.ollama_ready = True`` on success. On failure it only logs a
    warning and returns, so the app can still start in a degraded state.
    """
    for attempt in range(max_retries):
        try:
            response = requests.get(f"{OLLAMA_API}/api/version", timeout=2)
            if response.status_code == 200:
                print("✓ Ollama is ready")
                self.ollama_ready = True
                return
        except requests.RequestException:
            # Bug fix: was a bare `except:`, which also swallowed
            # KeyboardInterrupt / SystemExit during startup. Only network
            # errors should be retried.
            pass

        if attempt < max_retries - 1:
            print(f"Waiting for Ollama... ({attempt+1}/{max_retries})")
            time.sleep(1)

    print("⚠ Ollama not responding, continuing anyway...")
55
def auto_download_critical(self):
    """Pull the lightweight "critical" models at startup so the first user
    request does not pay the download cost."""
    if not self.ollama_ready:
        print("Skipping model download - Ollama not ready")
        return

    print("Pulling critical models...")
    for model_id in self.critical_models:
        ollama_name = self.model_map.get(model_id, model_id)
        try:
            self.pull_model(ollama_name)
        except Exception as exc:
            # Startup is best-effort: report and keep going.
            print(f"✗ Failed to pull {model_id}: {exc}")
        else:
            print(f"✓ {model_id} ({ollama_name}) ready")
69
 
70
def pull_model(self, model_name: str):
    """Ask the local Ollama server to pull *model_name*.

    Returns True on success; raises requests.HTTPError (via
    raise_for_status) or a timeout error on failure.
    """
    payload = {"name": model_name, "stream": False}
    resp = requests.post(f"{OLLAMA_API}/api/pull", json=payload, timeout=300)
    resp.raise_for_status()
    return True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
 
79
def load_model(self, model_id: str):
    """Resolve *model_id* to an Ollama model name.

    Models live inside Ollama, so "loading" is just caching the name
    mapping; unknown IDs pass through unchanged.
    """
    try:
        return self.models[model_id]
    except KeyError:
        resolved = self.model_map.get(model_id, model_id)
        self.models[model_id] = resolved
        return resolved
 
 
 
 
 
 
87
 
88
def format_prompt(self, model_id: str, system: str, history: list, prompt: str):
    """Build an Ollama chat ``messages`` list.

    Ollama applies each model's own prompt template server-side, so no
    per-model token formatting is needed here; *model_id* is accepted for
    interface compatibility but unused.
    """
    messages = [{"role": "system", "content": system}]
    messages.extend(history or [])
    messages.append({"role": "user", "content": prompt})
    return messages
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
 
101
def generate_stream(self, model_id: str, prompt: str, context: list = None, **kwargs) -> Generator[str, None, None]:
    """Stream response tokens from Ollama's /api/chat endpoint.

    Yields text chunks; on any failure yields a single "Error: ..." string
    instead of raising, so callers can surface it to the client.
    Recognized kwargs: temperature (0.7), top_p (0.95), max_tokens (512).
    """
    if not self.ollama_ready:
        yield "Error: Ollama service not ready"
        return

    try:
        ollama_model = self.load_model(model_id)

        system_text = (
            "You are a highly accurate AI assistant. "
            "For math, ALWAYS use LaTeX wrapping display equations in [ ] and inline in ( )."
        )

        messages = self.format_prompt(model_id, system_text, context or [], prompt)

        payload = {
            "model": ollama_model,
            "messages": messages,
            "stream": True,
            "options": {
                "temperature": kwargs.get("temperature", 0.7),
                "top_p": kwargs.get("top_p", 0.95),
                "num_predict": kwargs.get("max_tokens", 512)
            }
        }

        # Context manager ensures the HTTP connection is released even if
        # the consumer abandons this generator mid-stream.
        with requests.post(f"{OLLAMA_API}/api/chat", json=payload,
                           stream=True, timeout=300) as response:
            response.raise_for_status()
            for line in response.iter_lines():
                if not line:
                    continue
                try:
                    chunk = json.loads(line)
                except json.JSONDecodeError:
                    continue
                # Bug fix: Ollama can stream {"error": "..."} mid-response;
                # the old code dropped such chunks silently.
                if "error" in chunk:
                    yield f"Error: {chunk['error']}"
                    return
                token = chunk.get("message", {}).get("content")
                if token:
                    yield token

    except Exception as e:
        # Best-effort contract: report the error downstream, don't raise.
        print(f"Error generating response: {e}")
        yield f"Error: {str(e)}"
147
 
148
def cleanup(self):
    """Drop local bookkeeping; the Ollama server owns the model memory."""
    self.models.clear()
    print("Cleanup complete")
requirements.txt CHANGED
@@ -1,6 +1,6 @@
1
  fastapi
2
  uvicorn
3
- llama-cpp-python==0.2.81
4
  supabase
5
  python-multipart
6
  pytesseract
 
1
  fastapi
2
  uvicorn
3
+ requests
4
  supabase
5
  python-multipart
6
  pytesseract
start.sh ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
set -e

# Start the Ollama server in the background.
# Bug fix: `ollama serve` has no --host flag; the bind address is taken
# from the OLLAMA_HOST environment variable.
echo "Starting Ollama..."
OLLAMA_HOST=0.0.0.0 ollama serve &
OLLAMA_PID=$!

# Wait until the API actually answers instead of a fixed `sleep 5`
# (curl is installed by the Dockerfile).
echo "Waiting for Ollama to become ready..."
for i in $(seq 1 30); do
    if curl -sf http://localhost:11434/api/version > /dev/null; then
        break
    fi
    sleep 1
done

# Pre-pull the default model (best effort: it may already be cached).
echo "Pulling fast-chat model (qwen2.5-0.5b)..."
ollama pull qwen2.5:0.5b || echo "Model may already exist"

# exec replaces the shell so uvicorn becomes PID 1 and receives signals.
echo "Starting FastAPI app..."
exec python3 -m uvicorn main:app --host 0.0.0.0 --port 7860