# Hugging Face Space: oki692/ollama-fastapi-streaming (status: Sleeping)
# File size: 4,592 bytes

import asyncio
import json
import os
import secrets

import httpx
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import StreamingResponse
from pydantic import BaseModel

app = FastAPI(title="Ollama Streaming API", version="1.0.0")

# CORS middleware for browser access.
# Every origin is allowed, so credentialed (cookie / HTTP-auth) cross-origin
# requests are switched off: FastAPI's CORS documentation states that
# allow_origins/allow_methods/allow_headers must not be ["*"] when
# allow_credentials is True. This API authenticates with the `key` field in
# the JSON body, so browser credentials are not needed.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Configuration. Each value can be overridden through an environment variable
# of the same name; an unset or empty variable falls back to the built-in
# default, so deployments that set nothing behave as before.
MODEL_NAME = os.getenv("MODEL_NAME") or "deepseek-r1:1.5b"
# Shared secret that clients must send as `key`; override it in production
# rather than relying on the default committed to source control.
CONNECT_KEY = os.getenv("CONNECT_KEY") or "manus-ollama-2024"
# Trailing slash removed because callers append paths such as "/api/tags".
OLLAMA_BASE_URL = (os.getenv("OLLAMA_BASE_URL") or "http://localhost:11434").rstrip("/")

# Request body accepted by POST /stream.
class ChatRequest(BaseModel):
    # Text handed to generate_stream() as the Ollama prompt.
    prompt: str
    # Shared secret; stream_chat() checks it against CONNECT_KEY.
    key: str

# Response schema of GET / (see root()).
class HealthResponse(BaseModel):
    # root() always reports "online".
    status: str
    # Name of the configured model (MODEL_NAME).
    model: str
    # URL taken from the SPACE_URL environment variable, or its local default.
    endpoint: str

# Middleware to disable all caching
@app.middleware("http")
async def disable_cache_middleware(request, call_next):
    """Run the downstream handler, then stamp anti-caching headers on its response.

    Any existing Cache-Control, Pragma or Expires value on the response is
    overwritten.
    """
    no_cache_headers = {
        "Cache-Control": "no-store, no-cache, must-revalidate, max-age=0",
        "Pragma": "no-cache",
        "Expires": "0",
    }
    outgoing = await call_next(request)
    for header_name, header_value in no_cache_headers.items():
        outgoing.headers[header_name] = header_value
    return outgoing

@app.get("/", response_model=HealthResponse)
async def root():
    """Health check endpoint"""
    # Docstring kept verbatim: FastAPI publishes it as the OpenAPI description.
    # The advertised endpoint comes from SPACE_URL, with a local fallback.
    fields = {
        "status": "online",
        "model": MODEL_NAME,
        "endpoint": os.getenv("SPACE_URL", "http://localhost:7860"),
    }
    return HealthResponse(**fields)

@app.get("/health")
async def health():
    """Detailed health check"""
    # Docstring kept verbatim: FastAPI publishes it as the OpenAPI description.
    #
    # Probes Ollama's /api/tags endpoint and always returns a dict:
    #   * HTTP 200 from Ollama    -> status "healthy"
    #   * any other HTTP status   -> status "degraded", with the status code
    #   * request raised an error -> status "degraded", with the error text
    try:
        async with httpx.AsyncClient(timeout=5.0) as client:
            response = await client.get(f"{OLLAMA_BASE_URL}/api/tags")
    except Exception as e:
        # Best-effort probe: report the failure instead of raising. Some
        # exceptions carry no message, so fall back to the exception type.
        return {
            "status": "degraded",
            "ollama": "disconnected",
            "error": str(e) or type(e).__name__,
        }

    if response.status_code == 200:
        return {"status": "healthy", "ollama": "connected", "model": MODEL_NAME}

    # Ollama answered but not with 200; without this branch the endpoint
    # would fall through and return None (a JSON `null` body).
    return {
        "status": "degraded",
        "ollama": "error",
        "error": f"Ollama returned HTTP {response.status_code}",
    }

def _sse_event(payload):
    """Serialize *payload* as one Server-Sent Events ``data:`` frame."""
    return f"data: {json.dumps(payload)}\n\n"


async def generate_stream(prompt: str):
    """Generate streaming response from Ollama without caching.

    Posts *prompt* to Ollama's ``/api/generate`` endpoint with streaming
    enabled and re-emits each JSON line it returns as an SSE frame.

    Yields:
        ``data: {"text": ..., "done": ...}`` for every line carrying a
        ``response`` field, or ``data: {"error": ...}`` when Ollama answers
        with a non-200 status (the frame then also carries ``status``), when
        Ollama reports an error inside the stream, or when the request raises.
        The generator ends after an error frame or after a line with
        ``done`` set.
    """
    payload = {
        "model": MODEL_NAME,
        "prompt": prompt,
        "stream": True,
        "options": {
            "temperature": 0.7,
            "num_predict": 2048,
            "top_k": 40,
            "top_p": 0.9,
            "num_ctx": 2048,
            "num_batch": 512,
            "num_gpu": 1,
            "num_thread": 4,
        },
    }

    try:
        # The client-level timeout already applies to the streamed request.
        async with httpx.AsyncClient(timeout=300.0) as client:
            async with client.stream(
                "POST",
                f"{OLLAMA_BASE_URL}/api/generate",
                json=payload,
            ) as response:
                if response.status_code != 200:
                    yield _sse_event(
                        {"error": "Ollama API error", "status": response.status_code}
                    )
                    return

                async for line in response.aiter_lines():
                    if not line.strip():
                        continue
                    try:
                        data = json.loads(line)
                    except json.JSONDecodeError:
                        # Skip lines that are not valid JSON.
                        continue
                    if not isinstance(data, dict):
                        # Valid JSON but not an object; nothing to forward.
                        continue

                    if "error" in data:
                        # Forward errors Ollama reports inside the stream
                        # instead of ending the stream silently.
                        yield _sse_event({"error": str(data["error"])})
                        return
                    if "response" in data:
                        yield _sse_event(
                            {"text": data["response"], "done": data.get("done", False)}
                        )
                    if data.get("done", False):
                        break
    except Exception as e:
        # The HTTP response has already started, so failures are reported as
        # an SSE frame. Some exceptions carry no message; use the type name.
        yield _sse_event({"error": str(e) or type(e).__name__})

@app.post("/stream")
async def stream_chat(request: ChatRequest):
    """Stream chat completions with key authentication - NO CACHING"""
    # Docstring kept verbatim: FastAPI publishes it as the OpenAPI description.
    #
    # Raises HTTPException 403 when the supplied key does not match
    # CONNECT_KEY, and 400 when the prompt is empty or whitespace-only.
    # Otherwise returns an SSE StreamingResponse fed by generate_stream().

    # Compare the secret in constant time so response timing does not reveal
    # how much of the key matched. Both sides are encoded to bytes because
    # compare_digest() rejects str values containing non-ASCII characters.
    supplied_key = request.key.encode("utf-8")
    expected_key = CONNECT_KEY.encode("utf-8")
    if not secrets.compare_digest(supplied_key, expected_key):
        raise HTTPException(status_code=403, detail="Invalid connect key")

    if not request.prompt.strip():
        raise HTTPException(status_code=400, detail="Prompt cannot be empty")

    return StreamingResponse(
        generate_stream(request.prompt),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-store, no-cache, must-revalidate, max-age=0, private",
            "Pragma": "no-cache",
            "Expires": "0",
            "Connection": "keep-alive",
            # Asks nginx-style reverse proxies not to buffer the stream.
            "X-Accel-Buffering": "no",
            "X-Content-Type-Options": "nosniff",
        },
    )

@app.get("/models")
async def list_models():
    """List available models"""
    # Docstring kept verbatim: FastAPI publishes it as the OpenAPI description.
    # Only one model is configured, so it is both the full list and the default.
    available = [MODEL_NAME]
    return {"models": available, "default": available[0]}

if __name__ == "__main__":
    # Script entry point: serve the app with uvicorn on all interfaces.
    import uvicorn

    server_options = {
        "host": "0.0.0.0",
        "port": 7860,
        "log_level": "info",
    }
    uvicorn.run(app, **server_options)