Sabithulla committed on
Commit
64f495c
·
1 Parent(s): 939e78c

Switch to Ollama for zero-compilation deployment - pre-downloads models at startup

Browse files
Files changed (4) hide show
  1. Dockerfile +13 -17
  2. model_manager.py +119 -161
  3. requirements.txt +1 -1
  4. start.sh +18 -0
Dockerfile CHANGED
@@ -1,34 +1,30 @@
1
- FROM python:3.11-slim
2
 
3
  WORKDIR /app
4
 
5
- # Install minimal system dependencies (no compilation needed for binary wheels)
6
  RUN apt-get update && apt-get install -y \
 
 
7
  tesseract-ocr \
8
  libtesseract-dev \
 
9
  && rm -rf /var/lib/apt/lists/*
10
 
11
- # Copy requirements
12
  COPY requirements.txt .
13
-
14
- # Install using pre-compiled binary wheels only (NO compilation)
15
- RUN pip install --no-cache-dir --no-build --prefer-binary -r requirements.txt
16
 
17
  # Copy application code
18
  COPY . .
19
 
20
  # Create models directory
21
- RUN mkdir -p models
22
-
23
- EXPOSE 7860
24
 
25
- CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860", "--timeout-keep-alive", "75"]
26
-
27
- # Create models directory
28
- RUN mkdir -p models
29
 
30
- # Expose port 7860
31
- EXPOSE 7860
 
32
 
33
- # Run app
34
- CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860", "--timeout-keep-alive", "75"]
 
1
FROM ollama/ollama:latest

WORKDIR /app

# Install Python and OCR dependencies.
# Use the distro default python3/python3-pip: the ollama base image is Ubuntu,
# where a pinned "python3.11" package may not exist and "pip" is not on PATH.
RUN apt-get update && apt-get install -y \
    python3 \
    python3-pip \
    tesseract-ocr \
    libtesseract-dev \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Copy Python requirements and install (no build tools needed)
COPY requirements.txt .
RUN pip3 install --no-cache-dir -r requirements.txt

# Copy application code
COPY . .

# Create Ollama's model store directory
RUN mkdir -p /root/.ollama/models

# 7860 = FastAPI app, 11434 = Ollama API
EXPOSE 7860 11434

# Startup script: start Ollama + FastAPI
COPY start.sh .
RUN chmod +x start.sh

# Bug fix: the base image sets ENTRYPOINT ["/bin/ollama"], so a bare
# CMD ["./start.sh"] would be passed as an ARGUMENT to the ollama binary
# and the script would never run. Reset the entrypoint first.
ENTRYPOINT []
CMD ["./start.sh"]
 
model_manager.py CHANGED
@@ -1,194 +1,152 @@
1
  import os
2
- from llama_cpp import Llama
3
  import requests
4
  from typing import Generator
 
 
 
 
5
 
6
  class ModelManager:
7
  def __init__(self):
8
  self.models = {}
9
- # Templates for different model architectures
10
- self.model_configs = {
11
- "tinyllama": {
12
- "repo": "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
13
- "file": "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
14
- "url": "https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
15
- "format": "tinyllama"
16
- },
17
- "phi": {
18
- "repo": "TheBloke/phi-2-GGUF",
19
- "file": "phi-2.Q4_K_M.gguf",
20
- "url": "https://huggingface.co/TheBloke/phi-2-GGUF/resolve/main/phi-2.Q4_K_M.gguf",
21
- "format": "phi"
22
- },
23
- "coder": {
24
- "repo": "Qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF",
25
- "file": "qwen2.5-coder-1.5b-instruct-q4_k_m.gguf",
26
- "url": "https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF/resolve/main/qwen2.5-coder-1.5b-instruct-q4_k_m.gguf",
27
- "format": "chatml"
28
- },
29
- "orca": {
30
- "repo": "bartowski/Llama-3.2-3B-Instruct-GGUF",
31
- "file": "Llama-3.2-3B-Instruct-Q4_K_M.gguf",
32
- "url": "https://huggingface.co/bartowski/Llama-3.2-3B-Instruct-GGUF/resolve/main/Llama-3.2-3B-Instruct-Q4_K_M.gguf",
33
- "format": "llama3"
34
- },
35
- "fast-chat": {
36
- "repo": "Qwen/Qwen2.5-0.5B-Instruct-GGUF",
37
- "file": "qwen2.5-0.5b-instruct-q4_k_m.gguf",
38
- "url": "https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GGUF/resolve/main/qwen2.5-0.5b-instruct-q4_k_m.gguf",
39
- "format": "chatml"
40
- },
41
- "mistral": {
42
- "repo": "TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
43
- "file": "mistral-7b-instruct-v0.2.Q4_K_M.gguf",
44
- "url": "https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q4_K_M.gguf",
45
- "format": "chatml"
46
- },
47
- "neural": {
48
- "repo": "TheBloke/neural-chat-7B-v3-1-GGUF",
49
- "file": "neural-chat-7b-v3-1.Q4_K_M.gguf",
50
- "url": "https://huggingface.co/TheBloke/neural-chat-7B-v3-1-GGUF/resolve/main/neural-chat-7b-v3-1.Q4_K_M.gguf",
51
- "format": "chatml"
52
- },
53
- "zephyr": {
54
- "repo": "TheBloke/zephyr-7B-beta-GGUF",
55
- "file": "zephyr-7b-beta.Q4_K_M.gguf",
56
- "url": "https://huggingface.co/TheBloke/zephyr-7B-beta-GGUF/resolve/main/zephyr-7b-beta.Q4_K_M.gguf",
57
- "format": "chatml"
58
- },
59
- "openhermes": {
60
- "repo": "TheBloke/OpenHermes-2.5-Mistral-7B-GGUF",
61
- "file": "openhermes-2.5-mistral-7b.Q4_K_M.gguf",
62
- "url": "https://huggingface.co/TheBloke/OpenHermes-2.5-Mistral-7B-GGUF/resolve/main/openhermes-2.5-mistral-7b.Q4_K_M.gguf",
63
- "format": "chatml"
64
- },
65
- "starling": {
66
- "repo": "TheBloke/Starling-LM-7B-alpha-GGUF",
67
- "file": "starling-lm-7b-alpha.Q4_K_M.gguf",
68
- "url": "https://huggingface.co/TheBloke/Starling-LM-7B-alpha-GGUF/resolve/main/starling-lm-7b-alpha.Q4_K_M.gguf",
69
- "format": "chatml"
70
- },
71
- "dolphin": {
72
- "repo": "TheBloke/dolphin-2.5-mixtral-8x7b-GGUF",
73
- "file": "dolphin-2.5-mixtral-8x7b.Q4_K_M.gguf",
74
- "url": "https://huggingface.co/TheBloke/dolphin-2.5-mixtral-8x7b-GGUF/resolve/main/dolphin-2.5-mixtral-8x7b.Q4_K_M.gguf",
75
- "format": "chatml"
76
- }
77
  }
 
78
  self.models_dir = os.path.join(os.getcwd(), "models")
79
  os.makedirs(self.models_dir, exist_ok=True)
80
- # Only download smallest model at startup (fast-chat: 0.5B)
 
81
  self.critical_models = ["fast-chat"]
82
  self.auto_download_critical()
83
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
  def auto_download_critical(self):
85
  """Download only critical lightweight models at startup"""
86
- print("Downloading critical models...")
 
 
 
 
87
  for model_id in self.critical_models:
88
  try:
89
- path = self.download_model(model_id)
90
- print(f"✓ {model_id} ready ({path})")
 
91
  except Exception as e:
92
- print(f"✗ Failed to download {model_id}: {e}")
93
 
94
- def download_model(self, model_id: str):
95
- config = self.model_configs.get(model_id)
96
- if not config:
97
- raise ValueError(f"Model {model_id} not configured")
98
 
99
- target_path = os.path.join(self.models_dir, config["file"])
100
- # Check if file exists AND has some size
101
- if os.path.exists(target_path) and os.path.getsize(target_path) > 50000000: # Min 50MB
102
- return target_path
103
-
104
- print(f"Downloading {model_id} from {config['url']}...")
105
- try:
106
- # Using a more standard stream download with content-length check if possible
107
- response = requests.get(config["url"], stream=True, timeout=60)
108
- response.raise_for_status()
109
- with open(target_path, "wb") as f:
110
- for chunk in response.iter_content(chunk_size=1024*1024): # 1MB chunks
111
- if chunk:
112
- f.write(chunk)
113
- print(f"Successfully downloaded {model_id}")
114
- return target_path
115
- except Exception as e:
116
- if os.path.exists(target_path):
117
- os.remove(target_path)
118
- print(f"Download failed for {model_id}: {e}")
119
- raise e
120
 
121
  def load_model(self, model_id: str):
 
122
  if model_id in self.models:
123
  return self.models[model_id]
124
 
125
- path = self.download_model(model_id)
126
- self.models[model_id] = Llama(
127
- model_path=path,
128
- n_ctx=1024, # Reduced for memory
129
- n_threads=2, # Light weight
130
- verbose=False
131
- )
132
- print(f"✓ Model {model_id} loaded")
133
- return self.models[model_id]
134
 
135
  def format_prompt(self, model_id: str, system: str, history: list, prompt: str):
136
- fmt = self.model_configs[model_id]["format"]
 
 
 
137
 
138
- if fmt == "chatml":
139
- full = f"<|im_start|>system\n{system}<|im_end|>\n"
140
- for msg in history:
141
- role = "user" if msg["role"] == "user" else "assistant"
142
- full += f"<|im_start|>{role}\n{msg['content']}<|im_end|>\n"
143
- full += f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
144
- return full, ["<|im_end|>", "###", "<|im_start|>", "</s>"]
145
-
146
- elif fmt == "tinyllama":
147
- full = f"<|system|>\n{system}</s>\n"
148
  for msg in history:
149
- role = "user" if msg["role"] == "user" else "assistant"
150
- full += f"<|{role}|>\n{msg['content']}</s>\n"
151
- full += f"<|user|>\n{prompt}</s>\n<|assistant|>\n"
152
- return full, ["</s>", "<|user|>", "<|assistant|>"]
153
-
154
- elif fmt == "llama3":
155
- # Llama 3.2 template
156
- full = f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{system}<|eot_id|>"
157
- for msg in history:
158
- role = "user" if msg["role"] == "user" else "assistant"
159
- full += f"<|start_header_id|>{role}<|end_header_id|>\n\n{msg['content']}<|eot_id|>"
160
- full += f"<|start_header_id|>user<|end_header_id|>\n\n{prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
161
- return full, ["<|eot_id|>", "<|start_header_id|>", "</s>"]
162
-
163
- elif fmt == "phi":
164
- # Phi-2 optimized prompt
165
- full = f"Instruct: {system}\n{prompt}\nOutput:"
166
- return full, ["Instruct:", "Output:", "<|endoftext|>", "</s>"]
167
-
168
- return prompt, ["</s>"]
169
-
170
- return prompt, ["</s>"]
171
 
172
  def generate_stream(self, model_id: str, prompt: str, context: list = None, **kwargs) -> Generator[str, None, None]:
173
- llm = self.load_model(model_id)
174
-
175
- system_text = (
176
- "You are a highly accurate AI assistant. "
177
- "For math, ALWAYS use LaTeX wrapping display equations in [ ] and inline in ( )."
178
- )
179
 
180
- full_prompt, stop_tokens = self.format_prompt(model_id, system_text, context or [], prompt)
181
-
182
- params = {
183
- "max_tokens": kwargs.get("max_tokens", 512), # Reduced for memory
184
- "stop": stop_tokens,
185
- "stream": True,
186
- "temperature": kwargs.get("temperature", 0.7),
187
- "top_p": kwargs.get("top_p", 0.95)
188
- }
189
-
190
- for output in llm(full_prompt, **params):
191
- token = output["choices"][0]["text"]
192
- yield token
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
 
194
- model_manager = ModelManager()
 
 
 
 
 
1
  import os
 
2
  import requests
3
  from typing import Generator
4
+ import time
5
+ import json
6
+
7
+ OLLAMA_API = "http://localhost:11434"
8
 
9
class ModelManager:
    """Thin client around a local Ollama server: maps public model IDs to
    Ollama model names, pulls critical models at startup, and streams chat
    completions."""

    def __init__(self):
        # Cache of model_id -> resolved Ollama model name (see load_model()).
        self.models = {}
        # Flipped to True by _wait_for_ollama() once the local server answers.
        self.ollama_ready = False
        self._wait_for_ollama()

        # Map model IDs to Ollama model names.
        # NOTE(review): several IDs alias the same Ollama model ("phi",
        # "neural", "zephyr", "openhermes", "starling" -> neural-chat:7b;
        # "coder"/"dolphin" -> mistral) — confirm these substitutions are
        # intentional and acceptable to callers.
        self.model_map = {
            "fast-chat": "qwen2.5:0.5b",
            "tinyllama": "tinyllama:latest",
            "phi": "neural-chat:7b",
            "coder": "mistral:latest",
            "orca": "llama2:latest",
            "mistral": "mistral:latest",
            "neural": "neural-chat:7b",
            "zephyr": "neural-chat:7b",
            "openhermes": "neural-chat:7b",
            "starling": "neural-chat:7b",
            "dolphin": "mistral:latest"
        }

        # Local scratch directory; Ollama stores its own model files under
        # /root/.ollama, so this dir is presumably legacy — TODO confirm.
        self.models_dir = os.path.join(os.getcwd(), "models")
        os.makedirs(self.models_dir, exist_ok=True)

        # Critical models to pull at startup
        self.critical_models = ["fast-chat"]
        self.auto_download_critical()
37
def _wait_for_ollama(self, max_retries=30):
    """Poll the local Ollama HTTP API until it responds or retries run out.

    Sets ``self.ollama_ready = True`` on success. On failure it only logs a
    warning and returns, so the app can still start in a degraded state.
    """
    for attempt in range(max_retries):
        try:
            response = requests.get(f"{OLLAMA_API}/api/version", timeout=2)
            if response.status_code == 200:
                print("✓ Ollama is ready")
                self.ollama_ready = True
                return
        except requests.RequestException:
            # Bug fix: was a bare `except:`, which also swallowed
            # KeyboardInterrupt / SystemExit during startup. Only network
            # errors should be retried.
            pass

        if attempt < max_retries - 1:
            print(f"Waiting for Ollama... ({attempt+1}/{max_retries})")
            time.sleep(1)

    print("⚠ Ollama not responding, continuing anyway...")
55
def auto_download_critical(self):
    """Pull the lightweight "critical" models at startup so the first user
    request does not pay the download cost."""
    if not self.ollama_ready:
        print("Skipping model download - Ollama not ready")
        return

    print("Pulling critical models...")
    for model_id in self.critical_models:
        ollama_name = self.model_map.get(model_id, model_id)
        try:
            self.pull_model(ollama_name)
        except Exception as exc:
            # Startup is best-effort: report and keep going.
            print(f"✗ Failed to pull {model_id}: {exc}")
        else:
            print(f"✓ {model_id} ({ollama_name}) ready")
69
 
70
def pull_model(self, model_name: str):
    """Ask the local Ollama server to pull *model_name*.

    Returns True on success; raises requests.HTTPError (via
    raise_for_status) or a timeout error on failure.
    """
    payload = {"name": model_name, "stream": False}
    resp = requests.post(f"{OLLAMA_API}/api/pull", json=payload, timeout=300)
    resp.raise_for_status()
    return True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
 
79
def load_model(self, model_id: str):
    """Resolve *model_id* to an Ollama model name.

    Models live inside Ollama, so "loading" is just caching the name
    mapping; unknown IDs pass through unchanged.
    """
    try:
        return self.models[model_id]
    except KeyError:
        resolved = self.model_map.get(model_id, model_id)
        self.models[model_id] = resolved
        return resolved
 
 
 
 
 
 
87
 
88
def format_prompt(self, model_id: str, system: str, history: list, prompt: str):
    """Build an Ollama chat ``messages`` list.

    Ollama applies each model's own prompt template server-side, so no
    per-model token formatting is needed here; *model_id* is accepted for
    interface compatibility but unused.
    """
    messages = [{"role": "system", "content": system}]
    messages.extend(history or [])
    messages.append({"role": "user", "content": prompt})
    return messages
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
 
101
def generate_stream(self, model_id: str, prompt: str, context: list = None, **kwargs) -> Generator[str, None, None]:
    """Stream response tokens from Ollama's /api/chat endpoint.

    Yields text chunks; on any failure yields a single "Error: ..." string
    instead of raising, so callers can surface it to the client.
    Recognized kwargs: temperature (0.7), top_p (0.95), max_tokens (512).
    """
    if not self.ollama_ready:
        yield "Error: Ollama service not ready"
        return

    try:
        ollama_model = self.load_model(model_id)

        system_text = (
            "You are a highly accurate AI assistant. "
            "For math, ALWAYS use LaTeX wrapping display equations in [ ] and inline in ( )."
        )

        messages = self.format_prompt(model_id, system_text, context or [], prompt)

        payload = {
            "model": ollama_model,
            "messages": messages,
            "stream": True,
            "options": {
                "temperature": kwargs.get("temperature", 0.7),
                "top_p": kwargs.get("top_p", 0.95),
                "num_predict": kwargs.get("max_tokens", 512)
            }
        }

        # Context manager ensures the HTTP connection is released even if
        # the consumer abandons this generator mid-stream.
        with requests.post(f"{OLLAMA_API}/api/chat", json=payload,
                           stream=True, timeout=300) as response:
            response.raise_for_status()
            for line in response.iter_lines():
                if not line:
                    continue
                try:
                    chunk = json.loads(line)
                except json.JSONDecodeError:
                    continue
                # Bug fix: Ollama can stream {"error": "..."} mid-response;
                # the old code dropped such chunks silently.
                if "error" in chunk:
                    yield f"Error: {chunk['error']}"
                    return
                token = chunk.get("message", {}).get("content")
                if token:
                    yield token

    except Exception as e:
        # Best-effort contract: report the error downstream, don't raise.
        print(f"Error generating response: {e}")
        yield f"Error: {str(e)}"
147
 
148
def cleanup(self):
    """Drop local bookkeeping; the Ollama server owns the model memory."""
    self.models.clear()
    print("Cleanup complete")
requirements.txt CHANGED
@@ -1,6 +1,6 @@
1
  fastapi
2
  uvicorn
3
- llama-cpp-python==0.2.81
4
  supabase
5
  python-multipart
6
  pytesseract
 
1
  fastapi
2
  uvicorn
3
+ requests
4
  supabase
5
  python-multipart
6
  pytesseract
start.sh ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
set -e

# Start the Ollama server in the background.
# Bug fix: `ollama serve` has no --host flag; the bind address is taken
# from the OLLAMA_HOST environment variable.
echo "Starting Ollama..."
OLLAMA_HOST=0.0.0.0 ollama serve &
OLLAMA_PID=$!

# Wait until the API actually answers instead of a fixed `sleep 5`
# (curl is installed by the Dockerfile).
echo "Waiting for Ollama to become ready..."
for i in $(seq 1 30); do
    if curl -sf http://localhost:11434/api/version > /dev/null; then
        break
    fi
    sleep 1
done

# Pre-pull the default model (best effort: it may already be cached).
echo "Pulling fast-chat model (qwen2.5-0.5b)..."
ollama pull qwen2.5:0.5b || echo "Model may already exist"

# exec replaces the shell so uvicorn becomes PID 1 and receives signals.
echo "Starting FastAPI app..."
exec python3 -m uvicorn main:app --host 0.0.0.0 --port 7860