"""FastAPI proxy that relays requests to a local Ollama server.

Streams upstream responses chunk-by-chunk with caching disabled so
clients see tokens as soon as Ollama emits them.
"""

import json
import os  # NOTE(review): unused here — retained in case external tooling relies on it

import httpx
from fastapi import FastAPI, Request, Response
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import StreamingResponse

app = FastAPI(title="Ollama Compatible API Proxy", version="1.0.0")

# CORS middleware — wide open; acceptable only because this is a local proxy.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

OLLAMA_BASE_URL = "http://localhost:11434"

# Headers stamped on every response to defeat client/intermediary caching.
NO_CACHE_HEADERS = {
    "Cache-Control": "no-store, no-cache, must-revalidate, max-age=0",
    "Pragma": "no-cache",
    "Expires": "0",
}
# Streaming responses additionally disable nginx-style proxy buffering.
STREAM_HEADERS = {**NO_CACHE_HEADERS, "X-Accel-Buffering": "no"}


@app.middleware("http")
async def disable_cache_middleware(request, call_next):
    """Attach no-cache headers to every outgoing response."""
    response = await call_next(request)
    response.headers.update(NO_CACHE_HEADERS)
    return response


@app.get("/")
async def root():
    """Health check."""
    return {
        "status": "online",
        "service": "Ollama Compatible API",
        "model": "deepseek-r1:1.5b",
    }


def _stream_ollama(method: str, endpoint: str, body: bytes,
                   headers: dict | None = None) -> StreamingResponse:
    """Build a StreamingResponse that relays `body` to an Ollama endpoint.

    The httpx client is created *inside* the generator: if it were opened
    with `async with` in the calling handler, it would be closed the moment
    the handler returned the StreamingResponse — before FastAPI started
    consuming the stream — and every chunk read would fail with
    "Cannot send a request, as the client has been closed".
    """
    async def stream_response():
        async with httpx.AsyncClient(timeout=300.0) as client:
            async with client.stream(
                method,
                f"{OLLAMA_BASE_URL}{endpoint}",
                content=body,
                headers=headers,
            ) as upstream:
                async for chunk in upstream.aiter_raw():
                    yield chunk

    return StreamingResponse(
        stream_response(),
        media_type="application/x-ndjson",
        headers=STREAM_HEADERS,
    )


# Proxy all Ollama API endpoints.
@app.api_route("/api/{path:path}", methods=["GET", "POST", "PUT", "DELETE", "PATCH"])
async def proxy_ollama_api(path: str, request: Request):
    """Proxy all /api/* requests to Ollama — pure streaming, no buffering.

    Streams when the JSON body contains `"stream": true`; otherwise the
    upstream response is buffered and returned whole.

    NOTE(review): because this catch-all is registered first, the more
    specific /api/tags, /api/generate and /api/chat handlers below are
    shadowed and never invoked — confirm whether that is intentional.
    """
    body = await request.body()

    # Forward the caller's headers minus Host (httpx supplies the right one).
    headers = dict(request.headers)
    headers.pop("host", None)

    # Stream only when the body is a JSON object explicitly asking for it.
    is_streaming = False
    if body:
        try:
            is_streaming = json.loads(body).get("stream", False)
        except (ValueError, AttributeError):
            # Non-JSON body, or JSON that isn't an object: don't stream.
            is_streaming = False

    if is_streaming:
        return _stream_ollama(request.method, f"/api/{path}", body, headers)

    # Non-streaming: buffer the entire upstream response.
    async with httpx.AsyncClient(timeout=300.0) as client:
        upstream = await client.request(
            request.method,
            f"{OLLAMA_BASE_URL}/api/{path}",
            content=body,
            headers=headers,
        )
    return Response(
        content=upstream.content,
        status_code=upstream.status_code,
        headers=dict(upstream.headers),
        media_type=upstream.headers.get("content-type"),
    )


# Root-level endpoints (for compatibility).
@app.get("/api/tags")
async def list_models():
    """List available models — Ollama compatible.

    NOTE(review): shadowed by the /api/{path:path} catch-all above.
    """
    async with httpx.AsyncClient(timeout=10.0) as client:
        upstream = await client.get(f"{OLLAMA_BASE_URL}/api/tags")
    return Response(
        content=upstream.content,
        status_code=upstream.status_code,
        media_type="application/json",
    )


@app.post("/api/generate")
async def generate(request: Request):
    """Generate completion — Ollama compatible, always streams.

    NOTE(review): shadowed by the /api/{path:path} catch-all above.
    """
    return _stream_ollama("POST", "/api/generate", await request.body())


@app.post("/api/chat")
async def chat(request: Request):
    """Chat completion — Ollama compatible, always streams.

    NOTE(review): shadowed by the /api/{path:path} catch-all above.
    """
    return _stream_ollama("POST", "/api/chat", await request.body())


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860, log_level="info")