# agentic-api/app_ollama.py
"""
OpenELM API Server using Local Ollama
This version uses a local Ollama instance instead of Hugging Face,
providing much faster inference with GPU acceleration.
Requirements:
- Ollama running locally (e.g. docker run -d -p 11434:11434 --name ollama ollama/ollama)
- OpenELM model pulled (docker exec ollama ollama pull apple/OpenELM-3B-Instruct)
- Python packages: pip install -r requirements_local.txt
"""
import time
import uuid
from typing import List, Optional, Dict, Any
import requests
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field
import os
# Configuration for local Ollama
OLLAMA_BASE_URL = os.environ.get("OLLAMA_BASE_URL", "http://127.0.0.1:11434")
OLLAMA_MODEL = os.environ.get("OLLAMA_MODEL", "apple/OpenELM-3B-Instruct")
# Create FastAPI app
app = FastAPI(
title="OpenELM API (Ollama)",
description="OpenAI & Anthropic compatible API using local Ollama instance",
version="3.0.0"
)
# Add CORS
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
# ==================== Pydantic Models ====================
class ChatMessage(BaseModel):
role: str
content: str
name: Optional[str] = None
class ChatCompletionRequest(BaseModel):
model: str = OLLAMA_MODEL
messages: List[ChatMessage]
temperature: Optional[float] = Field(default=None, ge=0.0, le=2.0)
top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0)
max_tokens: Optional[int] = Field(default=None, ge=1, le=4096)
stream: Optional[bool] = False
class ChatCompletionChoice(BaseModel):
index: int
message: ChatMessage
finish_reason: Optional[str] = None
class ChatCompletionUsage(BaseModel):
prompt_tokens: int
completion_tokens: int
total_tokens: int
class ChatCompletionResponse(BaseModel):
id: str
object: str = "chat.completion"
created: int
model: str
choices: List[ChatCompletionChoice]
usage: ChatCompletionUsage
class MessageContent(BaseModel):
type: str = "text"
text: str
class Message(BaseModel):
role: str
content: str | List[MessageContent]
name: Optional[str] = None
class Usage(BaseModel):
input_tokens: int = 0
output_tokens: int = 0
total_tokens: int = 0
class ContentBlock(BaseModel):
type: str = "text"
text: str
class MessageResponse(BaseModel):
id: str
type: str = "message"
role: str = "assistant"
content: List[ContentBlock]
model: str
stop_reason: Optional[str] = None
usage: Usage
class MessageCreateParams(BaseModel):
model: str = OLLAMA_MODEL
messages: List[Message]
system: Optional[str] = None
max_tokens: int = Field(default=1024, ge=1, le=4096)
temperature: Optional[float] = Field(default=None, ge=0.0, le=1.0)
stream: Optional[bool] = False
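# Example request body accepted by MessageCreateParams (a sketch of the
# Anthropic-style payload handled by POST /v1/messages below):
#
#   {
#       "model": "apple/OpenELM-3B-Instruct",
#       "system": "You are a helpful assistant.",
#       "messages": [{"role": "user", "content": "Summarize OpenELM in one sentence."}],
#       "max_tokens": 256,
#       "temperature": 0.7
#   }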
# ==================== Ollama Helper Functions ====================
def generate_with_ollama(
prompt: str,
system: Optional[str] = None,
max_tokens: int = 1024,
temperature: Optional[float] = None,
stream: bool = False
) -> Dict[str, Any]:
"""Generate text using local Ollama instance."""
# Build the prompt in chat format
full_prompt = ""
if system:
full_prompt += f"[System: {system}]\n\n"
    # Re-emit the User/Assistant turns from the prompt, skipping empty segments
    segments = prompt.split("\n\n")
    for segment in segments:
        if segment.startswith("User:"):
            text = segment[len("User:"):].strip()
            if text:
                full_prompt += f"User: {text}\n"
        elif segment.startswith("Assistant:"):
            text = segment[len("Assistant:"):].strip()
            if text:
                full_prompt += f"Assistant: {text}\n"
    # Add the final assistant prefix so the model continues as the assistant
    full_prompt += "Assistant:"
# Prepare options
options = {
"num_predict": max_tokens,
}
if temperature is not None:
options["temperature"] = temperature
# Make request to Ollama
response = requests.post(
f"{OLLAMA_BASE_URL}/api/generate",
json={
"model": OLLAMA_MODEL,
"prompt": full_prompt,
"stream": stream,
"options": options
}
)
if response.status_code != 200:
raise HTTPException(
status_code=500,
detail=f"Ollama request failed: {response.text}"
)
return response.json()
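# Note: for non-streaming calls, Ollama's /api/generate returns a JSON object
# whose generated text is in the "response" field; the /v1/messages handler
# below reads that field and, where available, the "prompt_eval_count" /
# "eval_count" token counts.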
def chat_with_ollama(
messages: List[ChatMessage],
max_tokens: int = 1024,
temperature: Optional[float] = None,
stream: bool = False
) -> Dict[str, Any]:
"""Chat completion using Ollama's chat API."""
# Convert messages to Ollama format
ollama_messages = []
for msg in messages:
ollama_messages.append({
"role": msg.role,
"content": msg.content
})
    # Prepare an OpenAI-style payload; Ollama's /v1/chat/completions endpoint
    # expects OpenAI parameter names (max_tokens, temperature) rather than the
    # native "options" object used by /api/generate
    payload = {
        "model": OLLAMA_MODEL,
        "messages": ollama_messages,
        "stream": stream,
        "max_tokens": max_tokens,
    }
    if temperature is not None:
        payload["temperature"] = temperature
    # Make request to Ollama's OpenAI-compatible chat API
    response = requests.post(
        f"{OLLAMA_BASE_URL}/v1/chat/completions",
        json=payload
    )
if response.status_code != 200:
raise HTTPException(
status_code=500,
detail=f"Ollama chat request failed: {response.text}"
)
return response.json()
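# Note: Ollama's /v1/chat/completions endpoint mirrors the OpenAI response
# shape, so the handler below reads result["choices"][0]["message"] and
# result["usage"] directly; if the request fails, the HTTPException above
# surfaces Ollama's error text.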
# ==================== API Endpoints ====================
@app.get("/", tags=["Root"])
async def root():
"""Root endpoint with API information."""
return {
"name": "OpenELM API (Ollama Local)",
"version": "3.0.0",
"model": OLLAMA_MODEL,
"ollama_url": OLLAMA_BASE_URL,
"endpoints": {
"chat": "POST /v1/chat/completions",
"messages": "POST /v1/messages",
"health": "GET /health"
}
}
@app.get("/health", tags=["Health"])
async def health_check():
"""Health check endpoint."""
try:
response = requests.get(f"{OLLAMA_BASE_URL}/api/tags", timeout=5)
if response.status_code == 200:
return {
"status": "healthy",
"ollama_connected": True,
"model": OLLAMA_MODEL
}
else:
return {
"status": "unhealthy",
"ollama_connected": False,
"error": "Ollama not responding"
}
except Exception as e:
return {
"status": "unhealthy",
"ollama_connected": False,
"error": str(e)
}
@app.post("/v1/chat/completions", response_model=ChatCompletionResponse, tags=["OpenAI"])
async def create_chat_completion(request: ChatCompletionRequest):
"""Create chat completion (OpenAI API format)."""
try:
        # Use Ollama's OpenAI-compatible chat endpoint
        result = chat_with_ollama(
            messages=request.messages,
            max_tokens=request.max_tokens or 1024,
            temperature=request.temperature,
            stream=False  # streaming responses are not supported by this handler
        )
# Convert to OpenAI format
choice = result["choices"][0]
message = choice["message"]
        response_id = f"chatcmpl-{uuid.uuid4().hex[:12]}"
        timestamp = int(time.time())  # Unix timestamp, as expected by the OpenAI format
return ChatCompletionResponse(
id=response_id,
created=timestamp,
model=OLLAMA_MODEL,
choices=[
ChatCompletionChoice(
index=0,
message=ChatMessage(role=message["role"], content=message["content"]),
finish_reason=choice.get("finish_reason", "stop")
)
],
usage=ChatCompletionUsage(
prompt_tokens=result["usage"]["prompt_tokens"],
completion_tokens=result["usage"]["completion_tokens"],
total_tokens=result["usage"]["total_tokens"]
)
)
except Exception as e:
raise HTTPException(status_code=500, detail=f"Generation failed: {str(e)}")
@app.post("/v1/messages", response_model=MessageResponse, tags=["Anthropic"])
async def create_message(params: MessageCreateParams):
"""Create message (Anthropic API format)."""
try:
# Convert Anthropic messages to prompt
prompt_parts = []
if params.system:
prompt_parts.append(f"[System: {params.system}]")
for msg in params.messages:
content = msg.content
if isinstance(content, list):
content = "".join(b.text for b in content if hasattr(b, 'text'))
if msg.role == "user":
prompt_parts.append(f"User: {content}")
elif msg.role == "assistant":
prompt_parts.append(f"Assistant: {content}")
prompt_parts.append("Assistant:")
prompt = "\n\n".join(prompt_parts)
# Generate with Ollama
result = generate_with_ollama(
prompt=prompt,
system=params.system,
max_tokens=params.max_tokens,
temperature=params.temperature
)
# Extract response
response_text = result.get("response", "")
        # Token counts: prefer Ollama's reported counts when present,
        # falling back to a rough whitespace-token approximation
        input_tokens = result.get("prompt_eval_count") or len(prompt.split())
        output_tokens = result.get("eval_count") or len(response_text.split())
return MessageResponse(
id=f"msg_{uuid.uuid4().hex[:8]}",
role="assistant",
content=[ContentBlock(type="text", text=response_text)],
model=OLLAMA_MODEL,
stop_reason="end_turn",
usage=Usage(
input_tokens=input_tokens,
output_tokens=output_tokens,
total_tokens=input_tokens + output_tokens
)
)
except Exception as e:
raise HTTPException(status_code=500, detail=f"Generation failed: {str(e)}")
# ==================== Main Entry Point ====================
if __name__ == "__main__":
import uvicorn
port = int(os.environ.get("PORT", 8001)) # Different port than Hugging Face Space
print(f"""
========================================
OpenELM API Server (Ollama Local)
========================================
Model: {OLLAMA_MODEL}
Ollama URL: {OLLAMA_BASE_URL}
Server: http://127.0.0.1:{port}
Endpoints:
OpenAI: POST http://127.0.0.1:{port}/v1/chat/completions
Anthropic: POST http://127.0.0.1:{port}/v1/messages
Health: GET http://127.0.0.1:{port}/health
========================================
""")
uvicorn.run(
"app_ollama:app",
host="0.0.0.0",
port=port,
reload=False
)
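# Usage sketch (assumes the `openai` Python package, which is not listed in the
# requirements above): the official OpenAI client can be pointed at this server
# because the /v1/chat/completions route follows the OpenAI response format.
#
#   from openai import OpenAI
#   client = OpenAI(base_url="http://127.0.0.1:8001/v1", api_key="not-needed")
#   reply = client.chat.completions.create(
#       model="apple/OpenELM-3B-Instruct",
#       messages=[{"role": "user", "content": "Hello!"}],
#   )
#   print(reply.choices[0].message.content)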