Spaces:
Paused
Paused
| from fastapi import FastAPI, HTTPException, Request | |
| from pydantic import BaseModel | |
| import requests | |
| import subprocess | |
| import time | |
| import threading | |
| import json | |
| from typing import List, Optional, Dict, Any | |
# FastAPI application instance; the handlers defined below attach to this app.
app = FastAPI(title="Ollama OpenAI-Compatible API", description="OpenAI-compatible endpoint for Ollama")
# Request-logging middleware: prints method, URL, headers, path and query for
# every request, then the response status after the handler runs.
# NOTE(review): the original file defined this function without registering it,
# so it never ran; @app.middleware("http") restores the registration the
# "Add request logging middleware" comment implies. The original log prefixes
# were mis-encoded emoji ("π", "β", ...); replaced with plain text.
@app.middleware("http")
async def log_requests(request: Request, call_next):
    """Log every incoming request and the resulting response status."""
    print(f"Incoming request: {request.method} {request.url}")
    print(f"Headers: {dict(request.headers)}")
    print(f"Path: {request.url.path}")
    print(f"Query params: {request.url.query}")
    response = await call_next(request)
    print(f"Response status: {response.status_code}")
    return response
| # OpenAI-compatible models | |
class ChatMessage(BaseModel):
    """One chat turn in the OpenAI wire format."""

    # Speaker role, e.g. "system" / "user" / "assistant" (not validated here).
    role: str
    # Message text.
    content: str
class ChatCompletionRequest(BaseModel):
    """Request body for the chat-completions endpoint (OpenAI-compatible subset)."""

    # Ollama model name to run the completion with.
    model: str
    # Conversation history, oldest message first.
    messages: List[ChatMessage]
    # Forwarded to Ollama as options["temperature"].
    temperature: Optional[float] = 0.7
    # Forwarded to Ollama as options["num_predict"].
    max_tokens: Optional[int] = None
    # Accepted for compatibility but ignored: the handler always requests a
    # non-streaming response from Ollama.
    stream: Optional[bool] = False
class ChatCompletionResponse(BaseModel):
    """OpenAI-shaped chat completion response.

    NOTE(review): not referenced anywhere in this file — the handler builds a
    plain dict instead. Kept as documentation of the intended response shape.
    """

    id: str                                # e.g. "chatcmpl-<timestamp>"
    object: str = "chat.completion"
    created: int                           # Unix timestamp (seconds)
    model: str
    choices: List[Dict[str, Any]]
    usage: Optional[Dict[str, int]] = None
class ModelInfo(BaseModel):
    """One entry of the OpenAI /v1/models listing.

    NOTE(review): not referenced anywhere in this file — the models handler
    builds plain dicts of this same shape. Kept for documentation.
    """

    id: str                                # Ollama model name
    object: str = "model"
    created: int                           # Unix timestamp (seconds)
    owned_by: str = "ollama"
# Global readiness flag: set to True by wait_for_ollama() once the local
# Ollama server answers /api/tags; handlers return 503 while it is False.
ollama_ready = False
def wait_for_ollama():
    """Block until the local Ollama server answers /api/tags, or give up.

    Polls http://localhost:11434/api/tags up to ``max_retries`` times with a
    10-second pause between attempts, setting the module-level
    ``ollama_ready`` flag on the first 200 response.
    """
    global ollama_ready
    max_retries = 30
    for attempt in range(max_retries):
        try:
            response = requests.get("http://localhost:11434/api/tags", timeout=5)
            if response.status_code == 200:
                ollama_ready = True
                print("Ollama server is ready!")
                return
        except requests.RequestException:
            # Server not up yet (connection refused / timeout) — retry below.
            # The original bare `except:` also swallowed KeyboardInterrupt
            # and SystemExit; narrowed to network errors only.
            pass
        print(f"Waiting for Ollama server... ({attempt+1}/{max_retries})")
        if attempt < max_retries - 1:  # no point sleeping after the last try
            time.sleep(10)
    print("Ollama server failed to start")
def pull_default_model():
    """Best-effort check that models are available via `ollama list`.

    The model is expected to have been pulled at build time; this only probes
    the CLI and logs the outcome, never raising to the caller.
    """
    print("Checking for existing models...")
    try:
        # Model should already be pulled during build, but check anyway.
        subprocess.run(["ollama", "list"], check=True, timeout=30)
    except Exception as exc:
        print(f"Error checking models: {exc}")
    else:
        print("Models available")
# Background startup task: wait for Ollama, then verify models are present.
def initialize():
    """Startup worker run on a daemon thread."""
    wait_for_ollama()
    if not ollama_ready:
        return
    pull_default_model()

threading.Thread(target=initialize, daemon=True).start()
def convert_to_ollama_messages(messages: List[ChatMessage]) -> List[Dict[str, str]]:
    """Convert OpenAI-format message objects into plain Ollama message dicts."""
    converted = []
    for message in messages:
        converted.append({"role": message.role, "content": message.content})
    return converted
# NOTE(review): the original file defined this handler without a route
# decorator, so it was unreachable; "/" matches its "Root endpoint" log line.
@app.get("/")
async def root():
    """Service banner: advertise the OpenAI-compatible endpoints and readiness."""
    print("Root endpoint accessed")
    return {
        "message": "Ollama OpenAI-Compatible API is running",
        "endpoints": {
            "chat_completions": "/v1/chat/completions",
            "models": "/v1/models"
        },
        "ready": ollama_ready
    }
# NOTE(review): no route decorator was present in the original; "/health" is
# the conventional path for a handler named this way — confirm against callers.
@app.get("/health")
async def health():
    """Liveness probe: always healthy, reports whether Ollama is reachable."""
    print("Health endpoint accessed")
    return {"status": "healthy", "ollama_ready": ollama_ready}
# The /v1 endpoint that nanocoder checks.
# NOTE(review): the original defined this handler without a decorator despite
# the comment calling it "the missing /v1 endpoint"; registration restored.
@app.get("/v1")
async def v1_root():
    """API-v1 banner: advertise endpoints and readiness (probed by nanocoder)."""
    print("V1 root endpoint accessed")
    return {
        "message": "OpenAI-compatible API v1",
        "endpoints": {
            "chat_completions": "/v1/chat/completions",
            "models": "/v1/models"
        },
        "ready": ollama_ready
    }
# NOTE(review): no route decorator was present in the original; "/v1/models"
# is the path this same file advertises from the root endpoints.
@app.get("/v1/models")
async def list_models():
    """List available Ollama models in OpenAI /v1/models format.

    Raises:
        HTTPException 503: Ollama has not come up yet.
        HTTPException 500: Ollama was reachable but the tags call failed.
    """
    print("V1 models endpoint accessed")
    if not ollama_ready:
        print("Ollama not ready, returning 503")
        raise HTTPException(status_code=503, detail="Ollama server not ready")
    try:
        response = requests.get("http://localhost:11434/api/tags", timeout=10)
        if response.status_code == 200:
            ollama_models = response.json()
            # Re-shape Ollama's {"models": [{"name": ...}, ...]} into the
            # OpenAI list envelope; "created" is synthesized at request time.
            models = []
            for model in ollama_models.get("models", []):
                models.append({
                    "id": model["name"],
                    "object": "model",
                    "created": int(time.time()),
                    "owned_by": "ollama"
                })
            print(f"Returning {len(models)} models")
            return {"object": "list", "data": models}
        else:
            print(f"Ollama tags returned {response.status_code}")
            raise HTTPException(status_code=500, detail="Failed to fetch models")
    except requests.exceptions.RequestException as e:
        # HTTPException above is NOT a RequestException, so it propagates.
        print(f"Error fetching models: {e}")
        raise HTTPException(status_code=500, detail=str(e))
# NOTE(review): no route decorator was present in the original; the path is
# the one this same file advertises ("chat_completions": "/v1/chat/completions").
@app.post("/v1/chat/completions")
async def create_chat_completion(request: ChatCompletionRequest):
    """OpenAI-compatible chat completions endpoint - no authentication required.

    Proxies the request to the local Ollama /api/chat endpoint and re-shapes
    the result into the OpenAI chat.completion envelope. ``request.stream``
    is accepted but ignored — the upstream call is always non-streaming.

    Raises:
        HTTPException 503: Ollama has not come up yet.
        HTTPException 500: the upstream call failed or returned an error.
    """
    print(f"Chat completions endpoint accessed with model: {request.model}")
    if not ollama_ready:
        print("Ollama not ready, returning 503")
        raise HTTPException(status_code=503, detail="Ollama server not ready")
    try:
        # Convert to Ollama format.
        ollama_messages = convert_to_ollama_messages(request.messages)
        print(f"Converted {len(ollama_messages)} messages for Ollama")
        ollama_request = {
            "model": request.model,
            "messages": ollama_messages,
            "stream": False,
            "options": {
                "num_ctx": 32768  # Maximum context length
            }
        }
        # Map the OpenAI knobs onto their Ollama equivalents when supplied.
        if request.temperature is not None:
            ollama_request["options"]["temperature"] = request.temperature
        if request.max_tokens is not None:
            ollama_request["options"]["num_predict"] = request.max_tokens
        print(f"Making request to Ollama with: {ollama_request}")
        # Make request to Ollama.
        response = requests.post(
            "http://localhost:11434/api/chat",
            json=ollama_request,
            timeout=120
        )
        print(f"Ollama response status: {response.status_code}")
        if response.status_code == 200:
            ollama_response = response.json()
            print("Successfully got response from Ollama")
            # Convert to OpenAI format. The id is timestamp-based, so two
            # requests within the same second share an id — acceptable here.
            openai_response = {
                "id": f"chatcmpl-{int(time.time())}",
                "object": "chat.completion",
                "created": int(time.time()),
                "model": request.model,
                "choices": [{
                    "index": 0,
                    "message": {
                        "role": "assistant",
                        "content": ollama_response.get("message", {}).get("content", "")
                    },
                    "finish_reason": "stop"
                }],
                "usage": {
                    "prompt_tokens": ollama_response.get("prompt_eval_count", 0),
                    "completion_tokens": ollama_response.get("eval_count", 0),
                    "total_tokens": ollama_response.get("prompt_eval_count", 0) + ollama_response.get("eval_count", 0)
                }
            }
            return openai_response
        else:
            print(f"Ollama returned error: {response.status_code} - {response.text}")
            raise HTTPException(status_code=500, detail=f"Ollama error: {response.status_code}")
    except requests.exceptions.RequestException as e:
        # The HTTPExceptions above are not RequestExceptions and propagate.
        print(f"Request exception: {e}")
        raise HTTPException(status_code=500, detail=str(e))
# Legacy endpoints for direct Ollama access.
# NOTE(review): no route decorator was present in the original; "/api/generate"
# mirrors the Ollama path this handler proxies — confirm against callers.
@app.post("/api/generate")
async def generate(request: dict):
    """Pass-through to Ollama /api/generate; body is forwarded untouched."""
    print("Generate endpoint accessed")
    if not ollama_ready:
        raise HTTPException(status_code=503, detail="Ollama server not ready")
    try:
        response = requests.post(
            "http://localhost:11434/api/generate",
            json=request,
            timeout=120
        )
        # Upstream status is not checked: the JSON body (even an error body)
        # is returned as-is, matching the original behavior.
        return response.json()
    except requests.exceptions.RequestException as e:
        raise HTTPException(status_code=500, detail=str(e))
# NOTE(review): no route decorator was present in the original; "/api/chat"
# mirrors the Ollama path this handler proxies — confirm against callers.
@app.post("/api/chat")
async def chat(request: dict):
    """Pass-through to Ollama /api/chat; body is forwarded untouched."""
    print("Chat endpoint accessed")
    if not ollama_ready:
        raise HTTPException(status_code=503, detail="Ollama server not ready")
    try:
        response = requests.post(
            "http://localhost:11434/api/chat",
            json=request,
            timeout=120
        )
        # Upstream status is not checked: the JSON body (even an error body)
        # is returned as-is, matching the original behavior.
        return response.json()
    except requests.exceptions.RequestException as e:
        raise HTTPException(status_code=500, detail=str(e))
# Script entry point: serve on all interfaces, port 7860 (presumably the
# Hugging Face Spaces default port — confirm against the deployment config).
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)