# Ollama-Instance / app.py
from fastapi import FastAPI, HTTPException, Request
from pydantic import BaseModel
import requests
import subprocess
import time
import threading
import json
from typing import List, Optional, Dict, Any
app = FastAPI(title="Ollama OpenAI-Compatible API", description="OpenAI-compatible endpoint for Ollama")
# Add request logging middleware
@app.middleware("http")
async def log_requests(request: Request, call_next):
print(f"🌐 Incoming request: {request.method} {request.url}")
print(f"πŸ“‹ Headers: {dict(request.headers)}")
print(f"πŸ” Path: {request.url.path}")
print(f"❓ Query params: {request.url.query}")
response = await call_next(request)
print(f"πŸ“€ Response status: {response.status_code}")
return response
# OpenAI-compatible models
class ChatMessage(BaseModel):
role: str
content: str
class ChatCompletionRequest(BaseModel):
model: str
messages: List[ChatMessage]
temperature: Optional[float] = 0.7
max_tokens: Optional[int] = None
stream: Optional[bool] = False
class ChatCompletionResponse(BaseModel):
id: str
object: str = "chat.completion"
created: int
model: str
choices: List[Dict[str, Any]]
usage: Optional[Dict[str, int]] = None
class ModelInfo(BaseModel):
id: str
object: str = "model"
created: int
owned_by: str = "ollama"
# Global variable to track if Ollama is ready
ollama_ready = False
def wait_for_ollama():
"""Wait for Ollama server to be ready"""
global ollama_ready
max_retries = 30
for i in range(max_retries):
try:
response = requests.get("http://localhost:11434/api/tags", timeout=5)
if response.status_code == 200:
ollama_ready = True
print("Ollama server is ready!")
return
        except requests.exceptions.RequestException:
            pass
print(f"Waiting for Ollama server... ({i+1}/{max_retries})")
time.sleep(10)
print("Ollama server failed to start")
def pull_default_model():
"""Pull a small default model"""
try:
print("Checking for existing models...")
# Model should already be pulled during build, but check anyway
subprocess.run(["ollama", "list"], check=True, timeout=30)
print("Models available")
except Exception as e:
print(f"Error checking models: {e}")
# Start background tasks
def initialize():
wait_for_ollama()
if ollama_ready:
pull_default_model()
threading.Thread(target=initialize, daemon=True).start()
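# Initialization runs in a daemon thread so FastAPI can begin serving immediately;
# until wait_for_ollama() succeeds, endpoints that need Ollama return 503 and
# /health reports ollama_ready = False.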
def convert_to_ollama_messages(messages: List[ChatMessage]) -> List[Dict[str, str]]:
"""Convert OpenAI format messages to Ollama format"""
return [{"role": msg.role, "content": msg.content} for msg in messages]
@app.get("/")
async def root():
print("Root endpoint accessed")
return {
"message": "Ollama OpenAI-Compatible API is running",
"endpoints": {
"chat_completions": "/v1/chat/completions",
"models": "/v1/models"
},
"ready": ollama_ready
}
@app.get("/health")
async def health():
print("Health endpoint accessed")
return {"status": "healthy", "ollama_ready": ollama_ready}
# Add the missing /v1 endpoint that nanocoder checks
@app.get("/v1")
async def v1_root():
print("V1 root endpoint accessed")
return {
"message": "OpenAI-compatible API v1",
"endpoints": {
"chat_completions": "/v1/chat/completions",
"models": "/v1/models"
},
"ready": ollama_ready
}
@app.get("/v1/models")
async def list_models():
"""List available models in OpenAI format"""
print("V1 models endpoint accessed")
if not ollama_ready:
print("Ollama not ready, returning 503")
raise HTTPException(status_code=503, detail="Ollama server not ready")
try:
response = requests.get("http://localhost:11434/api/tags", timeout=10)
if response.status_code == 200:
ollama_models = response.json()
models = []
for model in ollama_models.get("models", []):
models.append({
"id": model["name"],
"object": "model",
"created": int(time.time()),
"owned_by": "ollama"
})
print(f"Returning {len(models)} models")
return {"object": "list", "data": models}
else:
print(f"Ollama tags returned {response.status_code}")
raise HTTPException(status_code=500, detail="Failed to fetch models")
except requests.exceptions.RequestException as e:
print(f"Error fetching models: {e}")
raise HTTPException(status_code=500, detail=str(e))
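# Illustrative /v1/models response (model ids depend on what has been pulled into Ollama):
# {"object": "list",
#  "data": [{"id": "llama3.2:1b", "object": "model", "created": 1700000000, "owned_by": "ollama"}]}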
@app.post("/v1/chat/completions")
async def create_chat_completion(request: ChatCompletionRequest):
"""OpenAI-compatible chat completions endpoint - no authentication required"""
print(f"Chat completions endpoint accessed with model: {request.model}")
if not ollama_ready:
print("Ollama not ready, returning 503")
raise HTTPException(status_code=503, detail="Ollama server not ready")
try:
# Convert to Ollama format
ollama_messages = convert_to_ollama_messages(request.messages)
print(f"Converted {len(ollama_messages)} messages for Ollama")
ollama_request = {
"model": request.model,
"messages": ollama_messages,
"stream": False,
"options": {
"num_ctx": 32768 # Maximum context length
}
}
if request.temperature is not None:
ollama_request["options"]["temperature"] = request.temperature
if request.max_tokens is not None:
ollama_request["options"]["num_predict"] = request.max_tokens
print(f"Making request to Ollama with: {ollama_request}")
# Make request to Ollama
response = requests.post(
"http://localhost:11434/api/chat",
json=ollama_request,
timeout=120
)
print(f"Ollama response status: {response.status_code}")
if response.status_code == 200:
ollama_response = response.json()
print("Successfully got response from Ollama")
# Convert to OpenAI format
openai_response = {
"id": f"chatcmpl-{int(time.time())}",
"object": "chat.completion",
"created": int(time.time()),
"model": request.model,
"choices": [{
"index": 0,
"message": {
"role": "assistant",
"content": ollama_response.get("message", {}).get("content", "")
},
"finish_reason": "stop"
}],
"usage": {
"prompt_tokens": ollama_response.get("prompt_eval_count", 0),
"completion_tokens": ollama_response.get("eval_count", 0),
"total_tokens": ollama_response.get("prompt_eval_count", 0) + ollama_response.get("eval_count", 0)
}
}
return openai_response
else:
print(f"Ollama returned error: {response.status_code} - {response.text}")
raise HTTPException(status_code=500, detail=f"Ollama error: {response.status_code}")
except requests.exceptions.RequestException as e:
print(f"Request exception: {e}")
raise HTTPException(status_code=500, detail=str(e))
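# Example request to this endpoint (illustrative; use a model id returned by /v1/models):
#   curl -s http://localhost:7860/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"model": "llama3.2:1b", "messages": [{"role": "user", "content": "Hello"}]}'
# The reply follows the OpenAI chat.completion shape assembled above.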
# Legacy endpoints for direct Ollama access
@app.post("/api/generate")
async def generate(request: dict):
print("Generate endpoint accessed")
if not ollama_ready:
raise HTTPException(status_code=503, detail="Ollama server not ready")
try:
response = requests.post(
"http://localhost:11434/api/generate",
json=request,
timeout=120
)
return response.json()
except requests.exceptions.RequestException as e:
raise HTTPException(status_code=500, detail=str(e))
@app.post("/api/chat")
async def chat(request: dict):
print("Chat endpoint accessed")
if not ollama_ready:
raise HTTPException(status_code=503, detail="Ollama server not ready")
try:
response = requests.post(
"http://localhost:11434/api/chat",
json=request,
timeout=120
)
return response.json()
except requests.exceptions.RequestException as e:
raise HTTPException(status_code=500, detail=str(e))
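# Example of the raw Ollama pass-through (illustrative payload, in the shape Ollama's /api/chat expects):
#   curl -s http://localhost:7860/api/chat \
#     -d '{"model": "llama3.2:1b", "messages": [{"role": "user", "content": "Hi"}], "stream": false}'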
if __name__ == "__main__":
import uvicorn
uvicorn.run(app, host="0.0.0.0", port=7860)
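# Client-side usage: a minimal sketch assuming the server is reachable on localhost:7860,
# the `openai` Python package (>=1.0) is installed, and "llama3.2:1b" stands in for a model
# actually listed by /v1/models. This proxy does not check API keys, so any placeholder works.
#
#   from openai import OpenAI
#   client = OpenAI(base_url="http://localhost:7860/v1", api_key="not-needed")
#   reply = client.chat.completions.create(
#       model="llama3.2:1b",
#       messages=[{"role": "user", "content": "Hello"}],
#   )
#   print(reply.choices[0].message.content)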