# Hugging Face Spaces page banner captured during extraction (not code):
# Spaces: Sleeping
# Sleeping
# Standard library
import json
import os

# Third-party
import httpx
from fastapi import FastAPI, Request, Response
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import StreamingResponse
# FastAPI application exposing an Ollama-compatible proxy API.
app = FastAPI(title="Ollama Compatible API Proxy", version="1.0.0")

# CORS middleware: wide open (any origin, method, header) so browser
# clients hosted elsewhere can call the proxy directly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Upstream Ollama server every handler below forwards to.
OLLAMA_BASE_URL = "http://localhost:11434"
# Middleware to disable all caching
async def disable_cache_middleware(request, call_next):
    """Stamp cache-busting headers onto every outgoing response.

    NOTE(review): no ``@app.middleware("http")`` decorator is visible in
    this file, so this function does not appear to be registered — confirm
    against the original source.
    """
    response = await call_next(request)
    no_cache_headers = {
        "Cache-Control": "no-store, no-cache, must-revalidate, max-age=0",
        "Pragma": "no-cache",
        "Expires": "0",
    }
    for name, value in no_cache_headers.items():
        response.headers[name] = value
    return response
async def root():
    """Health check: report service status and the model this proxy fronts.

    NOTE(review): no ``@app.get("/")`` decorator is visible in this file —
    presumably lost during extraction; confirm registration.
    """
    payload = {
        "status": "online",
        "service": "Ollama Compatible API",
        "model": "deepseek-r1:1.5b",
    }
    return payload
# Proxy all Ollama API endpoints
async def proxy_ollama_api(path: str, request: Request):
    """Proxy a request to the local Ollama API, streaming when requested.

    The request body is forwarded verbatim; if the JSON body explicitly sets
    ``"stream": true`` the upstream response is relayed chunk-by-chunk with
    no buffering, otherwise it is buffered and returned whole.

    NOTE(review): no route decorator is visible in this file (presumably
    ``@app.api_route("/api/{path:path}", ...)``) — confirm registration.
    """
    body = await request.body()

    # Forward the caller's headers minus Host, which must match the upstream.
    headers = dict(request.headers)
    headers.pop("host", None)

    # Streaming only when the body is a JSON object with "stream": true.
    # A malformed or non-object body falls back to the buffered path.
    is_streaming = False
    if body:
        try:
            payload = json.loads(body)
            is_streaming = bool(payload.get("stream", False))
        except (ValueError, AttributeError):
            # ValueError covers json.JSONDecodeError; AttributeError covers
            # a JSON body that is not an object (e.g. a list or a string).
            # (Narrowed from the original bare `except:`.)
            pass

    url = f"{OLLAMA_BASE_URL}/api/{path}"

    if is_streaming:
        # BUG FIX: the AsyncClient must be opened *inside* the generator.
        # The original wrapped `return StreamingResponse(...)` in an
        # `async with httpx.AsyncClient(...)` block, so the client was
        # closed the moment the handler returned — before the response
        # body was ever consumed.
        async def stream_response():
            async with httpx.AsyncClient(timeout=300.0) as client:
                async with client.stream(
                    request.method,
                    url,
                    content=body,
                    headers=headers,
                ) as upstream:
                    async for chunk in upstream.aiter_raw():
                        yield chunk

        return StreamingResponse(
            stream_response(),
            media_type="application/x-ndjson",
            headers={
                "Cache-Control": "no-store, no-cache, must-revalidate, max-age=0",
                "Pragma": "no-cache",
                "Expires": "0",
                # Disable buffering in nginx-style reverse proxies.
                "X-Accel-Buffering": "no",
            },
        )

    # Non-streaming: buffer the full upstream response and relay it as-is.
    async with httpx.AsyncClient(timeout=300.0) as client:
        upstream = await client.request(
            request.method,
            url,
            content=body,
            headers=headers,
        )
    return Response(
        content=upstream.content,
        status_code=upstream.status_code,
        headers=dict(upstream.headers),
        media_type=upstream.headers.get("content-type"),
    )
# Root level endpoints (for compatibility)
async def list_models():
    """List available models by passing through Ollama's ``/api/tags``.

    NOTE(review): no route decorator is visible in this file — presumably
    lost during extraction; confirm registration.
    """
    async with httpx.AsyncClient(timeout=10.0) as client:
        upstream = await client.get(f"{OLLAMA_BASE_URL}/api/tags")
    # Non-streaming httpx responses are fully read, so .content is safe
    # to access after the client context has closed.
    return Response(
        content=upstream.content,
        status_code=upstream.status_code,
        media_type="application/json",
    )
async def generate(request: Request):
    """Generate a completion (Ollama-compatible ``/api/generate``), streamed.

    The request body is forwarded verbatim and the upstream NDJSON stream is
    relayed chunk-by-chunk with no buffering.

    NOTE(review): no route decorator is visible in this file (presumably
    ``@app.post("/api/generate")``) — confirm registration.
    """
    body = await request.body()

    # BUG FIX: the AsyncClient must live *inside* the generator so it stays
    # open for the whole stream. The original opened it in an `async with`
    # around the `return`, which closed the client before the
    # StreamingResponse ever consumed the generator.
    async def stream_response():
        async with httpx.AsyncClient(timeout=300.0) as client:
            async with client.stream(
                "POST",
                f"{OLLAMA_BASE_URL}/api/generate",
                content=body,
            ) as upstream:
                async for chunk in upstream.aiter_raw():
                    yield chunk

    return StreamingResponse(
        stream_response(),
        media_type="application/x-ndjson",
        headers={
            "Cache-Control": "no-store, no-cache, must-revalidate, max-age=0",
            "Pragma": "no-cache",
            "Expires": "0",
            # Disable buffering in nginx-style reverse proxies.
            "X-Accel-Buffering": "no",
        },
    )
async def chat(request: Request):
    """Chat completion (Ollama-compatible ``/api/chat``), streamed.

    The request body is forwarded verbatim and the upstream NDJSON stream is
    relayed chunk-by-chunk with no buffering.

    NOTE(review): no route decorator is visible in this file (presumably
    ``@app.post("/api/chat")``) — confirm registration.
    """
    body = await request.body()

    # BUG FIX: the AsyncClient must live *inside* the generator so it stays
    # open for the whole stream. The original opened it in an `async with`
    # around the `return`, which closed the client before the
    # StreamingResponse ever consumed the generator.
    async def stream_response():
        async with httpx.AsyncClient(timeout=300.0) as client:
            async with client.stream(
                "POST",
                f"{OLLAMA_BASE_URL}/api/chat",
                content=body,
            ) as upstream:
                async for chunk in upstream.aiter_raw():
                    yield chunk

    return StreamingResponse(
        stream_response(),
        media_type="application/x-ndjson",
        headers={
            "Cache-Control": "no-store, no-cache, must-revalidate, max-age=0",
            "Pragma": "no-cache",
            "Expires": "0",
            # Disable buffering in nginx-style reverse proxies.
            "X-Accel-Buffering": "no",
        },
    )
if __name__ == "__main__":
    import uvicorn

    # Bind on all interfaces; port 7860 is presumably for Hugging Face
    # Spaces (its default app port) — confirm against the deployment.
    uvicorn.run(app, host="0.0.0.0", port=7860, log_level="info")