MiniMax Agent committed
Commit ee3c612 · 1 Parent(s): 7ae4b71

v5: Minimal lazy-loading architecture for instant startup

- Remove ALL heavy imports from module level (torch, transformers)
- Use background thread to load model after server starts
- Server responds immediately to health checks
- API returns 503 if model is still loading
- Fixes 30-minute timeout issue on Hugging Face Spaces
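
The pattern, reduced to a minimal standalone sketch (the names state and load_in_background are illustrative, not this commit's actual identifiers; the real implementation is in the diff below):

import threading
from contextlib import asynccontextmanager

from fastapi import FastAPI, HTTPException

state = {"status": "LOADING", "model": None}


def load_in_background():
    """Stand-in for the real loader; heavy imports live here, off the main thread."""
    try:
        import torch  # deferred heavy import: the port is already bound by now
        state["model"] = object()  # placeholder for the real model load
        state["status"] = "READY"
    except Exception as exc:
        state["status"] = "ERROR"
        state["error"] = str(exc)


@asynccontextmanager
async def lifespan(app: FastAPI):
    # Start the loader and return control to uvicorn immediately,
    # so health checks get answered while the model is still loading.
    threading.Thread(target=load_in_background, daemon=True).start()
    yield


app = FastAPI(lifespan=lifespan)


@app.get("/health")
async def health():
    if state["status"] != "READY":
        raise HTTPException(status_code=503, detail=f"status: {state['status']}")
    return {"status": "healthy"}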

Files changed (1): app.py (+111 -212)
app.py CHANGED
@@ -1,186 +1,57 @@
 """
-OpenELM OpenAI & Anthropic API Compatible Wrapper
-
-This version properly handles OpenELM's custom configuration and tokenizer.
+OpenELM OpenAI & Anthropic API Compatible Wrapper - v5
+Minimal lazy-loading architecture for instant startup.
+Heavy imports (torch, transformers) are deferred to a background thread.
 """
 
 import uuid
+import os
 import sys
-import subprocess
+import time
+import asyncio
+import threading
 from contextlib import asynccontextmanager
 from typing import AsyncIterator, List, Optional, Dict, Any
 
-import torch
 from fastapi import FastAPI, HTTPException, Request
-from fastapi.responses import JSONResponse, StreamingResponse
+from fastapi.responses import JSONResponse
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel, Field
-from transformers import AutoTokenizer, AutoModelForCausalLM, LlamaTokenizer, PreTrainedTokenizerFast
-from huggingface_hub import hf_hub_download
-import os
-
-# Import for streaming
-from transformers import TextIteratorStreamer
-from threading import Thread
 
 
-# Global model and tokenizer references
-model = None
-tokenizer = None
-model_loaded = False
-model_id = "apple/OpenELM-450M-Instruct"
-
-
-def install_sentencepiece():
-    """Install SentencePiece if not available."""
-    try:
-        import sentencepiece
-        return True
-    except ImportError:
-        print("Installing SentencePiece...")
-        try:
-            subprocess.run([sys.executable, "-m", "pip", "install", "sentencepiece", "--quiet"], check=True)
-            print("SentencePiece installed successfully")
-            return True
-        except subprocess.CalledProcessError:
-            print("Failed to install SentencePiece")
-            return False
-
-
-def register_openelm_config():
-    """Register OpenELM configuration with transformers."""
-    try:
-        # Try to import and register the config
-        from transformers import AutoConfig, LlamaConfig
-
-        # Download the OpenELM config
-        config_path = hf_hub_download(
-            repo_id=model_id,
-            filename="configuration_openelm.py",
-            repo_type="model"
-        )
-
-        # Add to path and import
-        config_dir = os.path.dirname(config_path)
-        if config_dir not in sys.path:
-            sys.path.insert(0, config_dir)
-
-        # The config file should have the OpenELMConfig class
-        # We'll use LlamaConfig as a base since OpenELM is similar to LLaMA
-        print("OpenELM configuration registered (using LLaMA-compatible loading)")
-        return True
-
-    except Exception as e:
-        print(f"Could not register OpenELM config: {e}")
-        return False
+# Global state for lazy loading
+# This allows the server to respond immediately while model loads in background
+global_state = {
+    "status": "INITIALIZING",  # INITIALIZING -> LOADING -> READY -> ERROR
+    "model": None,
+    "tokenizer": None,
+    "error": None
+}
 
 
-def load_tokenizer():
-    """
-    Load tokenizer with multiple fallback strategies.
-    OpenELM uses a custom configuration that transformers doesn't natively support.
-    """
-    print("Loading tokenizer...")
+def model_loader_thread():
+    """Load model in background thread to avoid blocking startup."""
+    global global_state
 
-    # Install sentencepiece first
-    install_sentencepiece()
-
-    # Strategy 1: Try using the tokenizer files directly
     try:
-        from transformers import LlamaTokenizerFast
+        # Import heavy libraries INSIDE the thread
+        import torch
+        import sys
+        from transformers import AutoTokenizer, AutoModelForCausalLM
 
-        # Download tokenizer files
-        tokenizer_file = hf_hub_download(
-            repo_id=model_id,
-            filename="tokenizer.json",
-            repo_type="model"
-        )
+        from huggingface_hub import hf_hub_download
 
-        tokenizer = LlamaTokenizerFast(
-            tokenizer_file=tokenizer_file,
-            trust_remote_code=True
-        )
-        print(" Loaded tokenizer using tokenizer.json")
-        return tokenizer
-
-    except Exception as e:
-        print(f" Strategy 1 failed: {e}")
-
-    # Strategy 2: Try LlamaTokenizer with local files
-    try:
-        # Download vocab and merges
-        vocab_file = hf_hub_download(
-            repo_id=model_id,
-            filename="vocab.txt",
-            repo_type="model"
-        )
-
-        try:
-            merges_file = hf_hub_download(
-                repo_id=model_id,
-                filename="merges.txt",
-                repo_type="model"
-            )
-            tokenizer = LlamaTokenizer(
-                vocab_file=vocab_file,
-                merges_file=merges_file,
-                trust_remote_code=True
-            )
-        except:
-            tokenizer = LlamaTokenizer(
-                vocab_file=vocab_file,
-                trust_remote_code=True
-            )
+        global_state["status"] = "LOADING"
 
-        print(" Loaded tokenizer using vocab.txt")
-        return tokenizer
+        model_id = "apple/OpenELM-450M-Instruct"
 
-    except Exception as e:
-        print(f" Strategy 2 failed: {e}")
-
-    # Strategy 3: Try AutoTokenizer with use_fast=False
-    try:
+        print("Loading tokenizer...")
         tokenizer = AutoTokenizer.from_pretrained(
             model_id,
-            trust_remote_code=True,
-            use_fast=False
+            trust_remote_code=True
         )
-        print(" Loaded tokenizer using AutoTokenizer (slow)")
-        return tokenizer
-
-    except Exception as e:
-        print(f" Strategy 3 failed: {e}")
-
-    # Strategy 4: Use a basic GPT-2 style tokenizer
-    print(" Using fallback tokenizer")
-    tokenizer = PreTrainedTokenizerFast(
-        tokenizer_file=None,
-        bos_token="<s>",
-        eos_token="</s>",
-        unk_token="<unk>",
-        pad_token="<pad>"
-    )
-
-    return tokenizer
-
-
-def load_model():
-    """
-    Load the OpenELM model.
-    """
-    global model, tokenizer, model_loaded
-
-    if model_loaded:
-        return True
-
-    print("Initializing OpenELM model...")
-
-    try:
-        # Load tokenizer
-        print(" Loading tokenizer...")
-        tokenizer = load_tokenizer()
 
-        # Ensure pad token is set
+        # Set special tokens
         if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        if tokenizer.bos_token is None:
@@ -188,8 +59,10 @@ def load_model():
         if tokenizer.eos_token is None:
            tokenizer.eos_token = "</s>"
 
-        print(" Loading model...")
-        # Load model with simplified parameters
+        global_state["tokenizer"] = tokenizer
+        print("Tokenizer loaded")
+
+        print("Loading model...")
         model = AutoModelForCausalLM.from_pretrained(
            model_id,
            torch_dtype=torch.float32,
@@ -198,45 +71,53 @@
         )
 
         model.eval()
-        model_loaded = True
-
-        print(f" Model loaded successfully!")
-        print(f" Model device: {next(model.parameters()).device}")
-        return True
+        global_state["model"] = model
+        global_state["status"] = "READY"
+        print(f"Model loaded successfully! Device: {next(model.parameters()).device}")
 
     except Exception as e:
-        print(f" Error loading model: {e}")
-        import traceback
-        traceback.print_exc()
-        return False
+        global_state["error"] = str(e)
+        global_state["status"] = "ERROR"
+        print(f"Error loading model: {e}")
 
 
 @asynccontextmanager
 async def lifespan(app: FastAPI) -> AsyncIterator:
-    """Application lifespan with lazy loading."""
-    global model, tokenizer, model_loaded
+    """Application lifespan: Start background loader, then yield."""
+    global global_state
 
-    print("OpenELM API Ready (lazy loading)")
+    print("=" * 60)
+    print("OpenELM API v5 - Starting with background model loader")
+    print("=" * 60)
+    print("Server will respond immediately. Model loads in background.")
     print("Endpoints:")
    print(" POST /v1/chat/completions - OpenAI format")
    print(" POST /v1/messages - Anthropic format")
    print(" GET /health - Check model status")
+    print("=" * 60)
+
+    # Start background thread to load model
+    loader_thread = threading.Thread(target=model_loader_thread, daemon=True)
+    loader_thread.start()
 
     yield
 
-    # Cleanup
-    if model is not None:
-        del model
-    if tokenizer is not None:
-        del tokenizer
-    torch.cuda.empty_cache() if torch.cuda.is_available() else None
+    # Cleanup on shutdown
+    if global_state["model"] is not None:
+        del global_state["model"]
+    if global_state["tokenizer"] is not None:
+        del global_state["tokenizer"]
+    if "torch" in sys.modules:
+        import torch
+        torch.cuda.empty_cache() if torch.cuda.is_available() else None
 
 
 # Create FastAPI app
+# Note: No heavy imports at module level - only fastapi and pydantic
 app = FastAPI(
    title="OpenELM OpenAI API",
    description="OpenAI and Anthropic API compatible wrapper for OpenELM models",
-    version="2.1.0",
+    version="5.0.0",
    lifespan=lifespan
 )
 
@@ -374,19 +255,17 @@ def format_prompt_for_openelm(messages: List[Message], system: Optional[str] = None) -> str:
     return "\n\n".join(prompt_parts)
 
 
-def count_tokens(text: str) -> int:
-    """Estimate token count."""
-    if tokenizer:
-        try:
-            return len(tokenizer.encode(text))
-        except:
-            pass
-    return max(1, len(text) // 4)
+def count_tokens(text: str, tokenizer) -> int:
+    """Count tokens using the tokenizer."""
+    try:
+        return len(tokenizer.encode(text))
+    except:
+        return max(1, len(text) // 4)
 
 
-def truncate_prompt(prompt: str, max_tokens: int, system: Optional[str] = None) -> str:
+def truncate_prompt(prompt: str, max_tokens: int, tokenizer, system: Optional[str] = None) -> str:
     """Truncate prompt to fit within context window."""
-    current_tokens = count_tokens(prompt)
+    current_tokens = count_tokens(prompt, tokenizer)
 
     if current_tokens <= max_tokens:
        return prompt
@@ -402,7 +281,7 @@ def truncate_prompt(prompt: str, max_tokens: int, system: Optional[str] = None) -> str:
     for line in reversed(lines):
        truncated_lines.insert(0, line)
        test_prompt = "\n\n".join([system_line] + truncated_lines) if system_line else "\n\n".join(truncated_lines)
-        if count_tokens(test_prompt) <= max_tokens:
+        if count_tokens(test_prompt, tokenizer) <= max_tokens:
            break
 
     if system_line:
@@ -434,30 +313,41 @@ def extract_assistant_response(generated_text: str) -> str:
 
 @app.get("/", tags=["Root"])
 async def root():
+    """Root endpoint with API information."""
     return {
-        "name": "OpenELM OpenAI API",
-        "version": "2.1.0",
-        "status": "ready" if model_loaded else "initializing",
-        "model_loaded": model_loaded,
+        "name": "OpenELM OpenAI API v5",
+        "version": "5.0.0",
+        "status": global_state["status"],
+        "model_loaded": global_state["status"] == "READY",
        "endpoints": {
            "chat": "POST /v1/chat/completions",
            "messages": "POST /v1/messages",
            "health": "GET /health"
        },
-        "note": "Model loads on first request"
+        "note": "Model loads in background for instant startup"
    }
 
 
 @app.get("/health", tags=["Health"])
 async def health_check():
-    return {
-        "status": "healthy" if model_loaded else "initializing",
-        "model_loaded": model_loaded
-    }
+    """Health check endpoint."""
+    if global_state["status"] == "READY":
+        return {"status": "healthy", "model_loaded": True}
+    elif global_state["status"] == "ERROR":
+        raise HTTPException(
+            status_code=503,
+            detail=f"Model failed to load: {global_state.get('error', 'Unknown error')}"
+        )
+    else:
+        raise HTTPException(
+            status_code=503,
+            detail="Model is still loading. Please retry in a few moments."
+        )
 
 
 @app.get("/v1/models", response_model=OpenAIModelListResponse, tags=["Models"])
 async def list_models():
+    """List available models (OpenAI format)."""
     return OpenAIModelListResponse(
        data=[
            OpenAIModelInfo(
@@ -472,11 +362,13 @@ async def list_models():
 @app.post("/v1/chat/completions", tags=["OpenAI"])
 async def create_chat_completion(request: ChatCompletionRequest):
     """Create chat completion (OpenAI API format)."""
-    global model, tokenizer, model_loaded
+    if global_state["status"] != "READY":
+        if global_state["status"] == "ERROR":
+            raise HTTPException(status_code=503, detail="Model failed to load")
+        raise HTTPException(status_code=503, detail="Model is still loading. Please retry.")
 
-    if not model_loaded:
-        if not load_model():
-            raise HTTPException(status_code=503, detail="Failed to load model")
+    model = global_state["model"]
+    tokenizer = global_state["tokenizer"]
 
     try:
        system_message = None
@@ -490,7 +382,7 @@ async def create_chat_completion(request: ChatCompletionRequest):
 
         prompt = format_prompt_for_openelm(formatted_messages, system_message)
        max_tokens = request.max_tokens or 1024
-        prompt = truncate_prompt(prompt, 2048 - max_tokens, system_message)
+        prompt = truncate_prompt(prompt, 2048 - max_tokens, tokenizer, system_message)
 
         inputs = tokenizer(prompt, return_tensors="pt")
        input_tokens = len(inputs.input_ids[0])
@@ -510,6 +402,7 @@
         if request.top_p is not None:
            gen_params["top_p"] = request.top_p
 
+        import torch
         with torch.no_grad():
            outputs = model.generate(
                **inputs,
@@ -520,7 +413,7 @@
 
         generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        response_text = extract_assistant_response(generated_text)
-        output_tokens = count_tokens(response_text)
+        output_tokens = count_tokens(response_text, tokenizer)
 
         response_id = f"chatcmpl-{uuid.uuid4().hex[:12]}"
        timestamp = int(uuid.uuid1().time)
@@ -552,11 +445,13 @@
 @app.post("/v1/messages", response_model=MessageResponse, tags=["Messages"])
 async def create_message(params: MessageCreateParams):
     """Create message (Anthropic API format)."""
-    global model, tokenizer, model_loaded
+    if global_state["status"] != "READY":
+        if global_state["status"] == "ERROR":
+            raise HTTPException(status_code=503, detail="Model failed to load")
+        raise HTTPException(status_code=503, detail="Model is still loading. Please retry.")
 
-    if not model_loaded:
-        if not load_model():
-            raise HTTPException(status_code=503, detail="Failed to load model")
+    model = global_state["model"]
+    tokenizer = global_state["tokenizer"]
 
     try:
        formatted_messages = []
@@ -567,7 +462,7 @@ async def create_message(params: MessageCreateParams):
             formatted_messages.append(Message(role=msg.role, content=content))
 
         prompt = format_prompt_for_openelm(formatted_messages, params.system)
-        prompt = truncate_prompt(prompt, 2048 - params.max_tokens, params.system)
+        prompt = truncate_prompt(prompt, 2048 - params.max_tokens, tokenizer, params.system)
 
         inputs = tokenizer(prompt, return_tensors="pt")
        input_tokens = len(inputs.input_ids[0])
@@ -587,6 +482,7 @@
         if params.top_p is not None:
            gen_params["top_p"] = params.top_p
 
+        import torch
         with torch.no_grad():
            outputs = model.generate(
                **inputs,
@@ -597,7 +493,7 @@
 
         generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        response_text = extract_assistant_response(generated_text)
-        output_tokens = count_tokens(response_text)
+        output_tokens = count_tokens(response_text, tokenizer)
 
         return MessageResponse(
            id=f"msg_{uuid.uuid4().hex[:8]}",
@@ -619,7 +515,10 @@
 if __name__ == "__main__":
     import uvicorn
 
-    port = int(os.environ.get("PORT", 8000))
+    port = int(os.environ.get("PORT", 7860))
+
+    print(f"\nStarting OpenELM API v5 on port {port}...")
+    print("The server will respond immediately while the model loads in background.\n")
 
     uvicorn.run(
        "app:app",