Sabithulla committed on
Commit
3274ec4
·
1 Parent(s): 64f495c

Multi-stage Docker build: Stage 1 compiles llama-cpp-python to wheel, Stage 2 installs pre-built wheel - NO TIMEOUT! Pre-download fast-chat model at build time.

Browse files
Files changed (5) hide show
  1. Dockerfile +33 -14
  2. download_models.py +62 -0
  3. model_manager.py +98 -115
  4. requirements.txt +1 -1
  5. start.sh +2 -17
Dockerfile CHANGED
@@ -1,30 +1,49 @@
1
- FROM ollama/ollama:latest
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
  WORKDIR /app
4
 
5
- # Install Python and dependencies
6
  RUN apt-get update && apt-get install -y \
7
- python3.11 \
8
- python3-pip \
9
  tesseract-ocr \
10
  libtesseract-dev \
11
- curl \
12
  && rm -rf /var/lib/apt/lists/*
13
 
14
- # Copy Python requirements and install (no build tools needed)
 
 
 
 
 
15
  COPY requirements.txt .
16
- RUN pip install --no-cache-dir -r requirements.txt
17
 
18
- # Copy application code
19
  COPY . .
20
 
21
  # Create models directory
22
- RUN mkdir -p /root/.ollama/models
23
 
24
- EXPOSE 7860 11434
 
 
25
 
26
- # Startup script: start Ollama + FastAPI
27
- COPY start.sh .
28
- RUN chmod +x start.sh
29
 
30
- CMD ["./start.sh"]
 
1
# Stage 1: build wheels for every Python dependency. llama-cpp-python
# compiles its native code here once, so the runtime stage never needs
# a compiler toolchain (and never hits a build timeout).
FROM python:3.11-slim AS builder

WORKDIR /tmp/build

# Build tools required to compile llama-cpp-python from source.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    cmake \
    && rm -rf /var/lib/apt/lists/*

COPY requirements.txt .

# Produce wheels for all requirements into /tmp/wheels.
RUN pip wheel --no-cache-dir -r requirements.txt -w /tmp/wheels

# Stage 2: slim production image — installs only the pre-built wheels.
FROM python:3.11-slim

WORKDIR /app

# Runtime-only system dependencies (OCR); no build tools needed.
RUN apt-get update && apt-get install -y --no-install-recommends \
    tesseract-ocr \
    libtesseract-dev \
    && rm -rf /var/lib/apt/lists/*

# requirements.txt must be present BEFORE pip install references it
# (the original ordering copied it afterwards, which breaks the build).
COPY requirements.txt .

# Copy pre-built wheels from stage 1 and install without compilation,
# then drop the wheel directory so it does not bloat the image.
COPY --from=builder /tmp/wheels /tmp/wheels
RUN pip install --no-cache-dir --no-index --find-links /tmp/wheels -r requirements.txt \
    && rm -rf /tmp/wheels

# Application code (includes download_models.py).
COPY . .

# Local model storage; fetch the fast-chat model at build time so the
# first request does not pay the download cost. Failure is non-fatal —
# the app can download on demand at runtime.
RUN mkdir -p models
RUN python download_models.py || echo "Model download attempted"

EXPOSE 7860

CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860", "--timeout-keep-alive", "75"]
download_models.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""Download GGUF models at Docker build time so containers start instantly."""

import os

import requests

MODELS_DIR = "models"
# Files smaller than this are treated as failed/partial downloads.
MIN_VALID_SIZE = 50_000_000  # ~50 MB
os.makedirs(MODELS_DIR, exist_ok=True)

MODEL_CONFIGS = {
    "fast-chat": {
        "file": "qwen2.5-0.5b-instruct-q4_k_m.gguf",
        "url": "https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GGUF/resolve/main/qwen2.5-0.5b-instruct-q4_k_m.gguf"
    },
    "tinyllama": {
        "file": "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
        "url": "https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"
    },
    "coder": {
        "file": "qwen2.5-coder-1.5b-instruct-q4_k_m.gguf",
        "url": "https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF/resolve/main/qwen2.5-coder-1.5b-instruct-q4_k_m.gguf"
    }
}

def download_model(model_id, config):
    """Download a single model, skipping files that already look complete.

    Streams to a ``.part`` temp file and renames atomically on success,
    so an interrupted download can never leave a truncated ``.gguf``
    that would later pass the size check and be served as a real model.
    Failures are logged, not raised — the build should not abort.
    """
    filepath = os.path.join(MODELS_DIR, config["file"])

    # Skip if already exists and has a plausible size.
    if os.path.exists(filepath) and os.path.getsize(filepath) > MIN_VALID_SIZE:
        print(f"✓ {model_id} already exists ({os.path.getsize(filepath) / 1e9:.2f}GB)")
        return

    print(f"Downloading {model_id}...")
    tmp_path = filepath + ".part"
    try:
        response = requests.get(config["url"], stream=True, timeout=60)
        response.raise_for_status()

        total_size = int(response.headers.get('content-length', 0))
        downloaded = 0

        with open(tmp_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=10*1024*1024):  # 10MB chunks
                if chunk:
                    f.write(chunk)
                    downloaded += len(chunk)
                    if total_size:
                        pct = (downloaded / total_size) * 100
                        print(f"  {model_id}: {pct:.1f}%", end='\r')

        # Atomic publish: the final filename only appears once complete.
        os.replace(tmp_path, filepath)
        print(f"✓ {model_id} downloaded ({os.path.getsize(filepath) / 1e9:.2f}GB)")
    except Exception as e:
        # Clean up any partial temp file before reporting.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        print(f"✗ Failed to download {model_id}: {e}")

if __name__ == "__main__":
    print("Pre-downloading models at build time...")

    # Only download fast-chat at build time (others on-demand).
    download_model("fast-chat", MODEL_CONFIGS["fast-chat"])

    print(f"\n✓ Models ready in {MODELS_DIR}/")
model_manager.py CHANGED
@@ -1,152 +1,135 @@
1
  import os
 
2
  import requests
3
  from typing import Generator
4
- import time
5
- import json
6
-
7
- OLLAMA_API = "http://localhost:11434"
8
 
9
  class ModelManager:
10
  def __init__(self):
11
  self.models = {}
12
- self.ollama_ready = False
13
- self._wait_for_ollama()
14
-
15
- # Map model IDs to Ollama model names
16
- self.model_map = {
17
- "fast-chat": "qwen2.5:0.5b",
18
- "tinyllama": "tinyllama:latest",
19
- "phi": "neural-chat:7b",
20
- "coder": "mistral:latest",
21
- "orca": "llama2:latest",
22
- "mistral": "mistral:latest",
23
- "neural": "neural-chat:7b",
24
- "zephyr": "neural-chat:7b",
25
- "openhermes": "neural-chat:7b",
26
- "starling": "neural-chat:7b",
27
- "dolphin": "mistral:latest"
 
 
 
 
28
  }
29
-
30
  self.models_dir = os.path.join(os.getcwd(), "models")
31
  os.makedirs(self.models_dir, exist_ok=True)
32
-
33
- # Critical models to pull at startup
34
  self.critical_models = ["fast-chat"]
35
  self.auto_download_critical()
36
 
37
- def _wait_for_ollama(self, max_retries=30):
38
- """Wait for Ollama service to be ready"""
39
- for i in range(max_retries):
40
- try:
41
- response = requests.get(f"{OLLAMA_API}/api/version", timeout=2)
42
- if response.status_code == 200:
43
- print(f"✓ Ollama is ready")
44
- self.ollama_ready = True
45
- return
46
- except:
47
- pass
48
-
49
- if i < max_retries - 1:
50
- print(f"Waiting for Ollama... ({i+1}/{max_retries})")
51
- time.sleep(1)
52
-
53
- print("⚠ Ollama not responding, continuing anyway...")
54
-
55
  def auto_download_critical(self):
56
  """Download only critical lightweight models at startup"""
57
- if not self.ollama_ready:
58
- print("Skipping model download - Ollama not ready")
59
- return
60
-
61
- print("Pulling critical models...")
62
  for model_id in self.critical_models:
63
  try:
64
- ollama_model = self.model_map.get(model_id, model_id)
65
- self.pull_model(ollama_model)
66
- print(f"✓ {model_id} ({ollama_model}) ready")
67
  except Exception as e:
68
- print(f"✗ Failed to pull {model_id}: {e}")
69
 
70
- def pull_model(self, model_name: str):
71
- """Pull model from Ollama"""
72
- url = f"{OLLAMA_API}/api/pull"
73
- data = {"name": model_name, "stream": False}
74
 
75
- response = requests.post(url, json=data, timeout=300)
76
- response.raise_for_status()
77
- return True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
 
79
  def load_model(self, model_id: str):
80
- """Models are managed by Ollama, just return a reference"""
81
  if model_id in self.models:
82
  return self.models[model_id]
83
 
84
- ollama_model = self.model_map.get(model_id, model_id)
85
- self.models[model_id] = ollama_model
86
- return ollama_model
 
 
 
 
 
 
87
 
88
  def format_prompt(self, model_id: str, system: str, history: list, prompt: str):
89
- """Simple prompt formatting for Ollama (handles templates internally)"""
90
- # Ollama handles prompt formatting internally, just concatenate messages
91
- messages = []
92
- messages.append({"role": "system", "content": system})
93
 
94
- if history:
 
95
  for msg in history:
96
- messages.append(msg)
97
-
98
- messages.append({"role": "user", "content": prompt})
99
- return messages
 
 
 
 
 
 
 
 
 
 
100
 
101
  def generate_stream(self, model_id: str, prompt: str, context: list = None, **kwargs) -> Generator[str, None, None]:
102
- """Stream response from Ollama"""
103
- if not self.ollama_ready:
104
- yield "Error: Ollama service not ready"
105
- return
106
 
107
- try:
108
- ollama_model = self.load_model(model_id)
109
-
110
- system_text = (
111
- "You are a highly accurate AI assistant. "
112
- "For math, ALWAYS use LaTeX wrapping display equations in [ ] and inline in ( )."
113
- )
114
-
115
- messages = self.format_prompt(model_id, system_text, context or [], prompt)
116
-
117
- # Call Ollama generate endpoint with streaming
118
- url = f"{OLLAMA_API}/api/chat"
119
- payload = {
120
- "model": ollama_model,
121
- "messages": messages,
122
- "stream": True,
123
- "options": {
124
- "temperature": kwargs.get("temperature", 0.7),
125
- "top_p": kwargs.get("top_p", 0.95),
126
- "num_predict": kwargs.get("max_tokens", 512)
127
- }
128
- }
129
-
130
- response = requests.post(url, json=payload, stream=True, timeout=300)
131
- response.raise_for_status()
132
-
133
- for line in response.iter_lines():
134
- if line:
135
- try:
136
- chunk = json.loads(line)
137
- if "message" in chunk and "content" in chunk["message"]:
138
- token = chunk["message"]["content"]
139
- if token:
140
- yield token
141
- except json.JSONDecodeError:
142
- pass
143
-
144
- except Exception as e:
145
- print(f"Error generating response: {e}")
146
- yield f"Error: {str(e)}"
147
 
148
  def cleanup(self):
149
  """Cleanup resources"""
150
- # Ollama manages its own resources
 
 
151
  self.models.clear()
152
  print("Cleanup complete")
 
1
  import os
2
+ from llama_cpp import Llama
3
  import requests
4
  from typing import Generator
 
 
 
 
5
 
6
class ModelManager:
    """Loads and serves local GGUF models via llama-cpp-python.

    Model files are downloaded lazily from Hugging Face into ./models;
    the smallest one ("fast-chat", 0.5B) is ensured at construction time
    so the first request does not pay the download cost.
    """

    # Files smaller than this are treated as incomplete downloads.
    MIN_VALID_SIZE = 50000000  # ~50 MB

    def __init__(self):
        self.models = {}  # model_id -> loaded Llama instance
        # Download sources and prompt-template family per model.
        self.model_configs = {
            "tinyllama": {
                "repo": "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
                "file": "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
                "url": "https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
                "format": "tinyllama"
            },
            "coder": {
                "repo": "Qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF",
                "file": "qwen2.5-coder-1.5b-instruct-q4_k_m.gguf",
                "url": "https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF/resolve/main/qwen2.5-coder-1.5b-instruct-q4_k_m.gguf",
                "format": "chatml"
            },
            "fast-chat": {
                "repo": "Qwen/Qwen2.5-0.5B-Instruct-GGUF",
                "file": "qwen2.5-0.5b-instruct-q4_k_m.gguf",
                "url": "https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GGUF/resolve/main/qwen2.5-0.5b-instruct-q4_k_m.gguf",
                "format": "chatml"
            }
        }
        self.models_dir = os.path.join(os.getcwd(), "models")
        os.makedirs(self.models_dir, exist_ok=True)
        # Only download smallest model at startup (fast-chat: 0.5B).
        self.critical_models = ["fast-chat"]
        self.auto_download_critical()

    def auto_download_critical(self):
        """Ensure critical lightweight models are present at startup.

        Normally a no-op because the image pre-downloads fast-chat at
        build time; a failure here is non-fatal (retried on first use).
        """
        print("Checking for pre-downloaded models...")
        for model_id in self.critical_models:
            try:
                path = self.download_model(model_id)
                print(f"✓ {model_id} ready ({path})")
            except Exception as e:
                print(f"✗ Failed to ensure {model_id}: {e}")

    def download_model(self, model_id: str):
        """Return the local path of the model's GGUF file, downloading it
        if missing or obviously truncated.

        Raises ValueError for unknown model ids; re-raises download
        errors after removing any partial temp file.
        """
        config = self.model_configs.get(model_id)
        if not config:
            raise ValueError(f"Model {model_id} not configured")

        target_path = os.path.join(self.models_dir, config["file"])
        # Already present AND plausibly complete.
        if os.path.exists(target_path) and os.path.getsize(target_path) > self.MIN_VALID_SIZE:
            return target_path

        print(f"Downloading {model_id} from {config['url']}...")
        tmp_path = target_path + ".part"
        try:
            response = requests.get(config["url"], stream=True, timeout=60)
            response.raise_for_status()
            # Stream to a temp file, then rename atomically so an aborted
            # download can never masquerade as a complete model file.
            with open(tmp_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=1024*1024):
                    if chunk:
                        f.write(chunk)
            os.replace(tmp_path, target_path)
            print(f"Successfully downloaded {model_id}")
            return target_path
        except Exception as e:
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
            print(f"Download failed for {model_id}: {e}")
            raise

    def load_model(self, model_id: str):
        """Return a cached Llama instance, downloading/loading on first use."""
        if model_id in self.models:
            return self.models[model_id]

        path = self.download_model(model_id)
        # Small context window and thread count — sized for a tiny
        # CPU-only container; presumably 2 vCPUs, confirm against host.
        self.models[model_id] = Llama(
            model_path=path,
            n_ctx=1024,
            n_threads=2,
            verbose=False
        )
        print(f"✓ Model {model_id} loaded")
        return self.models[model_id]

    def format_prompt(self, model_id: str, system: str, history: list, prompt: str):
        """Build (prompt_text, stop_tokens) in the model's template family."""
        fmt = self.model_configs[model_id]["format"]

        if fmt == "chatml":
            full = f"<|im_start|>system\n{system}<|im_end|>\n"
            for msg in history:
                role = "user" if msg["role"] == "user" else "assistant"
                full += f"<|im_start|>{role}\n{msg['content']}<|im_end|>\n"
            full += f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
            return full, ["<|im_end|>", "###", "<|im_start|>", "</s>"]

        elif fmt == "tinyllama":
            full = f"<|system|>\n{system}</s>\n"
            for msg in history:
                role = "user" if msg["role"] == "user" else "assistant"
                full += f"<|{role}|>\n{msg['content']}</s>\n"
            full += f"<|user|>\n{prompt}</s>\n<|assistant|>\n"
            return full, ["</s>", "<|user|>", "<|assistant|>"]

        # Unknown template family: pass the prompt through untouched.
        return prompt, ["</s>"]

    def generate_stream(self, model_id: str, prompt: str, context: list = None, **kwargs) -> Generator[str, None, None]:
        """Stream completion tokens for *prompt*.

        Failures are yielded as a single "Error: ..." string rather than
        raised, so an in-flight HTTP streaming response stays well-formed
        (the previous Ollama-based version had this guarantee; the llama.cpp
        rewrite had dropped it).
        """
        try:
            llm = self.load_model(model_id)
        except Exception as e:
            yield f"Error: {e}"
            return

        system_text = (
            "You are a highly accurate AI assistant. "
            "For math, ALWAYS use LaTeX wrapping display equations in $ $ and inline in \\( \\)."
        )

        full_prompt, stop_tokens = self.format_prompt(model_id, system_text, context or [], prompt)

        params = {
            "max_tokens": kwargs.get("max_tokens", 512),
            "stop": stop_tokens,
            "stream": True,
            "temperature": kwargs.get("temperature", 0.7),
            "top_p": kwargs.get("top_p", 0.95)
        }

        try:
            for output in llm(full_prompt, **params):
                token = output["choices"][0]["text"]
                yield token
        except Exception as e:
            yield f"Error: {e}"

    def cleanup(self):
        """Release loaded models and free llama.cpp resources."""
        for model in self.models.values():
            if hasattr(model, 'close'):
                model.close()
        self.models.clear()
        print("Cleanup complete")
requirements.txt CHANGED
@@ -1,6 +1,6 @@
1
  fastapi
2
  uvicorn
3
- requests
4
  supabase
5
  python-multipart
6
  pytesseract
 
1
  fastapi
2
  uvicorn
3
+ llama-cpp-python
4
  supabase
5
  python-multipart
6
  pytesseract
start.sh CHANGED
@@ -1,18 +1,3 @@
1
  #!/bin/bash
2
- set -e
3
-
4
- # Start Ollama in background
5
- echo "Starting Ollama..."
6
- ollama serve --host 0.0.0.0 &
7
- OLLAMA_PID=$!
8
-
9
- # Wait for Ollama to be ready
10
- sleep 5
11
-
12
- # Pull the model
13
- echo "Pulling fast-chat model (qwen2.5-0.5b)..."
14
- ollama pull qwen2.5:0.5b || echo "Model may already exist"
15
-
16
- # Start FastAPI app
17
- echo "Starting FastAPI app..."
18
- exec python3 -m uvicorn main:app --host 0.0.0.0 --port 7860
 
1
#!/bin/bash
# Models are pre-downloaded at build time, just run the app.
# Fail fast on any unexpected error before handing off (restores the
# `set -e` the previous version of this script carried).
set -euo pipefail
# exec replaces the shell so uvicorn receives container signals
# (SIGTERM on shutdown) directly instead of through a bash parent.
exec uvicorn main:app --host 0.0.0.0 --port 7860 --timeout-keep-alive 75