"""
OpenELM API Server using Local Ollama

This version uses a local Ollama instance instead of Hugging Face,
providing much faster inference with GPU acceleration.

Requirements:
- Ollama running locally (docker run ollama/ollama)
- OpenELM model pulled (docker exec ollama ollama pull apple/OpenELM-3B-Instruct)
- Python packages: pip install -r requirements_local.txt
"""
import os
import time
import uuid
from typing import Any, Dict, List, Optional, Union

import requests
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field

# Configuration for local Ollama
OLLAMA_BASE_URL = os.environ.get("OLLAMA_BASE_URL", "http://127.0.0.1:11434")
OLLAMA_MODEL = os.environ.get("OLLAMA_MODEL", "apple/OpenELM-3B-Instruct")
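# Both values can be overridden at launch time, e.g. (illustrative):
#   OLLAMA_BASE_URL=http://localhost:11434 OLLAMA_MODEL=apple/OpenELM-3B-Instruct python app_ollama.py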

# Create FastAPI app
app = FastAPI(
    title="OpenELM API (Ollama)",
    description="OpenAI & Anthropic compatible API using local Ollama instance",
    version="3.0.0"
)

# Add CORS
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# ==================== Pydantic Models ====================

class ChatMessage(BaseModel):
    role: str
    content: str
    name: Optional[str] = None


class ChatCompletionRequest(BaseModel):
    model: str = OLLAMA_MODEL
    messages: List[ChatMessage]
    temperature: Optional[float] = Field(default=None, ge=0.0, le=2.0)
    top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0)
    max_tokens: Optional[int] = Field(default=None, ge=1, le=4096)
    stream: Optional[bool] = False


class ChatCompletionChoice(BaseModel):
    index: int
    message: ChatMessage
    finish_reason: Optional[str] = None


class ChatCompletionUsage(BaseModel):
    prompt_tokens: int
    completion_tokens: int
    total_tokens: int


class ChatCompletionResponse(BaseModel):
    id: str
    object: str = "chat.completion"
    created: int
    model: str
    choices: List[ChatCompletionChoice]
    usage: ChatCompletionUsage


class MessageContent(BaseModel):
    type: str = "text"
    text: str


class Message(BaseModel):
    role: str
    content: Union[str, List[MessageContent]]
    name: Optional[str] = None


class Usage(BaseModel):
    input_tokens: int = 0
    output_tokens: int = 0
    total_tokens: int = 0


class ContentBlock(BaseModel):
    type: str = "text"
    text: str


class MessageResponse(BaseModel):
    id: str
    type: str = "message"
    role: str = "assistant"
    content: List[ContentBlock]
    model: str
    stop_reason: Optional[str] = None
    usage: Usage


class MessageCreateParams(BaseModel):
    model: str = OLLAMA_MODEL
    messages: List[Message]
    system: Optional[str] = None
    max_tokens: int = Field(default=1024, ge=1, le=4096)
    temperature: Optional[float] = Field(default=None, ge=0.0, le=1.0)
    stream: Optional[bool] = False

# ==================== Ollama Helper Functions ====================

def generate_with_ollama(
    prompt: str,
    system: Optional[str] = None,
    max_tokens: int = 1024,
    temperature: Optional[float] = None,
    stream: bool = False
) -> Dict[str, Any]:
    """Generate text using local Ollama instance."""
    # Build the prompt in chat format
    full_prompt = ""
    if system:
        full_prompt += f"[System: {system}]\n\n"

    # Extract messages from prompt
    lines = prompt.split("\n\n")
    for line in lines:
        if line.startswith("User:"):
            full_prompt += f"User: {line[5:].strip()}\n"
        elif line.startswith("Assistant:"):
            full_prompt += f"Assistant: {line[10:].strip()}\n"
    # Add final assistant prefix
    full_prompt += "Assistant:"

    # Prepare options
    options = {
        "num_predict": max_tokens,
    }
    if temperature is not None:
        options["temperature"] = temperature

    # Make request to Ollama
    response = requests.post(
        f"{OLLAMA_BASE_URL}/api/generate",
        json={
            "model": OLLAMA_MODEL,
            "prompt": full_prompt,
            "stream": stream,
            "options": options
        }
    )
    if response.status_code != 200:
        raise HTTPException(
            status_code=500,
            detail=f"Ollama request failed: {response.text}"
        )
    return response.json()

def chat_with_ollama(
    messages: List[ChatMessage],
    max_tokens: int = 1024,
    temperature: Optional[float] = None,
    stream: bool = False
) -> Dict[str, Any]:
    """Chat completion using Ollama's chat API."""
    # Convert messages to Ollama format
    ollama_messages = []
    for msg in messages:
        ollama_messages.append({
            "role": msg.role,
            "content": msg.content
        })
    # Ollama's /v1/chat/completions endpoint is OpenAI-compatible, so sampling
    # parameters are passed at the top level rather than in an "options" dict
    payload = {
        "model": OLLAMA_MODEL,
        "messages": ollama_messages,
        "max_tokens": max_tokens,
        "stream": stream,
    }
    if temperature is not None:
        payload["temperature"] = temperature

    # Make request to Ollama's OpenAI-compatible chat API
    response = requests.post(
        f"{OLLAMA_BASE_URL}/v1/chat/completions",
        json=payload
    )
    if response.status_code != 200:
        raise HTTPException(
            status_code=500,
            detail=f"Ollama chat request failed: {response.text}"
        )
    return response.json()

# ==================== API Endpoints ====================

@app.get("/")
async def root():
| """Root endpoint with API information.""" | |
| return { | |
| "name": "OpenELM API (Ollama Local)", | |
| "version": "3.0.0", | |
| "model": OLLAMA_MODEL, | |
| "ollama_url": OLLAMA_BASE_URL, | |
| "endpoints": { | |
| "chat": "POST /v1/chat/completions", | |
| "messages": "POST /v1/messages", | |
| "health": "GET /health" | |
| } | |
| } | |
@app.get("/health")
async def health_check():
| """Health check endpoint.""" | |
| try: | |
| response = requests.get(f"{OLLAMA_BASE_URL}/api/tags", timeout=5) | |
| if response.status_code == 200: | |
| return { | |
| "status": "healthy", | |
| "ollama_connected": True, | |
| "model": OLLAMA_MODEL | |
| } | |
| else: | |
| return { | |
| "status": "unhealthy", | |
| "ollama_connected": False, | |
| "error": "Ollama not responding" | |
| } | |
| except Exception as e: | |
| return { | |
| "status": "unhealthy", | |
| "ollama_connected": False, | |
| "error": str(e) | |
| } | |
@app.post("/v1/chat/completions")
async def create_chat_completion(request: ChatCompletionRequest):
| """Create chat completion (OpenAI API format).""" | |
| try: | |
| # Use Ollama chat API | |
| result = chat_with_ollama( | |
| messages=request.messages, | |
| max_tokens=request.max_tokens or 1024, | |
| temperature=request.temperature, | |
| stream=request.stream | |
| ) | |
| # Convert to OpenAI format | |
| choice = result["choices"][0] | |
| message = choice["message"] | |
| response_id = f"chatcmpl-{uuid.uuid4().hex[:12]}" | |
| timestamp = int(uuid.uuid1().time) | |
        return ChatCompletionResponse(
            id=response_id,
            created=timestamp,
            model=OLLAMA_MODEL,
            choices=[
                ChatCompletionChoice(
                    index=0,
                    message=ChatMessage(role=message["role"], content=message["content"]),
                    finish_reason=choice.get("finish_reason", "stop")
                )
            ],
            usage=ChatCompletionUsage(
                prompt_tokens=result["usage"]["prompt_tokens"],
                completion_tokens=result["usage"]["completion_tokens"],
                total_tokens=result["usage"]["total_tokens"]
            )
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Generation failed: {str(e)}")

@app.post("/v1/messages")
async def create_message(params: MessageCreateParams):
| """Create message (Anthropic API format).""" | |
| try: | |
| # Convert Anthropic messages to prompt | |
| prompt_parts = [] | |
| if params.system: | |
| prompt_parts.append(f"[System: {params.system}]") | |
| for msg in params.messages: | |
| content = msg.content | |
| if isinstance(content, list): | |
| content = "".join(b.text for b in content if hasattr(b, 'text')) | |
| if msg.role == "user": | |
| prompt_parts.append(f"User: {content}") | |
| elif msg.role == "assistant": | |
| prompt_parts.append(f"Assistant: {content}") | |
| prompt_parts.append("Assistant:") | |
| prompt = "\n\n".join(prompt_parts) | |
| # Generate with Ollama | |
| result = generate_with_ollama( | |
| prompt=prompt, | |
| system=params.system, | |
| max_tokens=params.max_tokens, | |
| temperature=params.temperature | |
| ) | |
| # Extract response | |
| response_text = result.get("response", "") | |
| # Count tokens (approximate) | |
| input_tokens = len(prompt.split()) | |
| output_tokens = len(response_text.split()) | |
| return MessageResponse( | |
| id=f"msg_{uuid.uuid4().hex[:8]}", | |
| role="assistant", | |
| content=[ContentBlock(type="text", text=response_text)], | |
| model=OLLAMA_MODEL, | |
| stop_reason="end_turn", | |
| usage=Usage( | |
| input_tokens=input_tokens, | |
| output_tokens=output_tokens, | |
| total_tokens=input_tokens + output_tokens | |
| ) | |
| ) | |
| except Exception as e: | |
| raise HTTPException(status_code=500, detail=f"Generation failed: {str(e)}") | |

# ==================== Main Entry Point ====================

if __name__ == "__main__":
    import uvicorn

    port = int(os.environ.get("PORT", 8001))  # Different port than Hugging Face Space

    print(f"""
========================================
OpenELM API Server (Ollama Local)
========================================
Model:      {OLLAMA_MODEL}
Ollama URL: {OLLAMA_BASE_URL}
Server:     http://127.0.0.1:{port}

Endpoints:
  OpenAI:    POST http://127.0.0.1:{port}/v1/chat/completions
  Anthropic: POST http://127.0.0.1:{port}/v1/messages
  Health:    GET  http://127.0.0.1:{port}/health
========================================
""")

    uvicorn.run(
        "app_ollama:app",
        host="0.0.0.0",
        port=port,
        reload=False
    )
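
# Example client for the Anthropic-style endpoint (illustrative sketch; assumes
# the server is running locally on the default port 8001 with the default model):
#
#   import requests
#   resp = requests.post(
#       "http://127.0.0.1:8001/v1/messages",
#       json={
#           "model": "apple/OpenELM-3B-Instruct",
#           "system": "You are a helpful assistant.",
#           "messages": [{"role": "user", "content": "Hello!"}],
#           "max_tokens": 256,
#       },
#   )
#   print(resp.json()["content"][0]["text"])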