from fastapi import FastAPI, HTTPException, Request
from pydantic import BaseModel
import requests
import subprocess
import time
import threading
from typing import List, Optional, Dict, Any

app = FastAPI(
    title="Ollama OpenAI-Compatible API",
    description="OpenAI-compatible endpoint for Ollama",
)


# Add request logging middleware
@app.middleware("http")
async def log_requests(request: Request, call_next):
    print(f"🌐 Incoming request: {request.method} {request.url}")
    print(f"📋 Headers: {dict(request.headers)}")
    print(f"🔍 Path: {request.url.path}")
    print(f"❓ Query params: {request.url.query}")
    response = await call_next(request)
    print(f"📤 Response status: {response.status_code}")
    return response


# OpenAI-compatible models
class ChatMessage(BaseModel):
    role: str
    content: str


class ChatCompletionRequest(BaseModel):
    model: str
    messages: List[ChatMessage]
    temperature: Optional[float] = 0.7
    max_tokens: Optional[int] = None
    stream: Optional[bool] = False


class ChatCompletionResponse(BaseModel):
    id: str
    object: str = "chat.completion"
    created: int
    model: str
    choices: List[Dict[str, Any]]
    usage: Optional[Dict[str, int]] = None


class ModelInfo(BaseModel):
    id: str
    object: str = "model"
    created: int
    owned_by: str = "ollama"


# Global flag to track whether Ollama is ready
ollama_ready = False


def wait_for_ollama():
    """Wait for the Ollama server to become ready."""
    global ollama_ready
    max_retries = 30
    for i in range(max_retries):
        try:
            response = requests.get("http://localhost:11434/api/tags", timeout=5)
            if response.status_code == 200:
                ollama_ready = True
                print("Ollama server is ready!")
                return
        except requests.exceptions.RequestException:
            pass
        print(f"Waiting for Ollama server... ({i + 1}/{max_retries})")
        time.sleep(10)
    print("Ollama server failed to start")


def pull_default_model():
    """Check that a default model is already available."""
    try:
        print("Checking for existing models...")
        # Model should already be pulled during build, but check anyway
        subprocess.run(["ollama", "list"], check=True, timeout=30)
        print("Models available")
    except Exception as e:
        print(f"Error checking models: {e}")


# Start background tasks
def initialize():
    wait_for_ollama()
    if ollama_ready:
        pull_default_model()


threading.Thread(target=initialize, daemon=True).start()


def convert_to_ollama_messages(messages: List[ChatMessage]) -> List[Dict[str, str]]:
    """Convert OpenAI-format messages to Ollama format."""
    return [{"role": msg.role, "content": msg.content} for msg in messages]


@app.get("/")
async def root():
    print("Root endpoint accessed")
    return {
        "message": "Ollama OpenAI-Compatible API is running",
        "endpoints": {
            "chat_completions": "/v1/chat/completions",
            "models": "/v1/models",
        },
        "ready": ollama_ready,
    }


@app.get("/health")
async def health():
    print("Health endpoint accessed")
    return {"status": "healthy", "ollama_ready": ollama_ready}


# Add the missing /v1 endpoint that nanocoder checks
@app.get("/v1")
async def v1_root():
    print("V1 root endpoint accessed")
    return {
        "message": "OpenAI-compatible API v1",
        "endpoints": {
            "chat_completions": "/v1/chat/completions",
            "models": "/v1/models",
        },
        "ready": ollama_ready,
    }


@app.get("/v1/models")
async def list_models():
    """List available models in OpenAI format."""
    print("V1 models endpoint accessed")
    if not ollama_ready:
        print("Ollama not ready, returning 503")
        raise HTTPException(status_code=503, detail="Ollama server not ready")
    try:
        response = requests.get("http://localhost:11434/api/tags", timeout=10)
        if response.status_code == 200:
            ollama_models = response.json()
            models = []
            for model in ollama_models.get("models", []):
                models.append({
                    "id": model["name"],
                    "object": "model",
                    "created": int(time.time()),
                    "owned_by": "ollama",
                })
            print(f"Returning {len(models)} models")
            return {"object": "list", "data": models}
        else:
            print(f"Ollama tags returned {response.status_code}")
            raise HTTPException(status_code=500, detail="Failed to fetch models")
    except requests.exceptions.RequestException as e:
        print(f"Error fetching models: {e}")
        raise HTTPException(status_code=500, detail=str(e))


@app.post("/v1/chat/completions")
async def create_chat_completion(request: ChatCompletionRequest):
    """OpenAI-compatible chat completions endpoint - no authentication required."""
    print(f"Chat completions endpoint accessed with model: {request.model}")
    if not ollama_ready:
        print("Ollama not ready, returning 503")
        raise HTTPException(status_code=503, detail="Ollama server not ready")
    try:
        # Convert to Ollama format
        ollama_messages = convert_to_ollama_messages(request.messages)
        print(f"Converted {len(ollama_messages)} messages for Ollama")

        ollama_request = {
            "model": request.model,
            "messages": ollama_messages,
            "stream": False,
            "options": {
                "num_ctx": 32768  # Maximum context length
            },
        }
        if request.temperature is not None:
            ollama_request["options"]["temperature"] = request.temperature
        if request.max_tokens is not None:
            ollama_request["options"]["num_predict"] = request.max_tokens

        print(f"Making request to Ollama with: {ollama_request}")

        # Make request to Ollama
        response = requests.post(
            "http://localhost:11434/api/chat",
            json=ollama_request,
            timeout=120,
        )
        print(f"Ollama response status: {response.status_code}")

        if response.status_code == 200:
            ollama_response = response.json()
            print("Successfully got response from Ollama")

            # Convert to OpenAI format
            openai_response = {
                "id": f"chatcmpl-{int(time.time())}",
                "object": "chat.completion",
                "created": int(time.time()),
                "model": request.model,
                "choices": [{
                    "index": 0,
                    "message": {
                        "role": "assistant",
                        "content": ollama_response.get("message", {}).get("content", ""),
                    },
                    "finish_reason": "stop",
                }],
                "usage": {
                    "prompt_tokens": ollama_response.get("prompt_eval_count", 0),
                    "completion_tokens": ollama_response.get("eval_count", 0),
                    "total_tokens": ollama_response.get("prompt_eval_count", 0)
                    + ollama_response.get("eval_count", 0),
                },
            }
            return openai_response
        else:
            print(f"Ollama returned error: {response.status_code} - {response.text}")
            raise HTTPException(
                status_code=500, detail=f"Ollama error: {response.status_code}"
            )
    except requests.exceptions.RequestException as e:
        print(f"Request exception: {e}")
        raise HTTPException(status_code=500, detail=str(e))


# Legacy endpoints for direct Ollama access
@app.post("/api/generate")
async def generate(request: dict):
    print("Generate endpoint accessed")
    if not ollama_ready:
        raise HTTPException(status_code=503, detail="Ollama server not ready")
    try:
        response = requests.post(
            "http://localhost:11434/api/generate",
            json=request,
            timeout=120,
        )
        return response.json()
    except requests.exceptions.RequestException as e:
        raise HTTPException(status_code=500, detail=str(e))


@app.post("/api/chat")
async def chat(request: dict):
    print("Chat endpoint accessed")
    if not ollama_ready:
        raise HTTPException(status_code=503, detail="Ollama server not ready")
    try:
        response = requests.post(
            "http://localhost:11434/api/chat",
            json=request,
            timeout=120,
        )
        return response.json()
    except requests.exceptions.RequestException as e:
        raise HTTPException(status_code=500, detail=str(e))


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)