Spaces:

oki692
/

ollama-fastapi-streaming

Sleeping

App Files Files Community

oki692 committed 17 days ago

Commit

ca418d0

verified ·

1 Parent(s): 5ad1f05

Upload app.py with huggingface_hub

Browse files

Files changed (1) hide show

app.py +20 -4

app.py CHANGED Viewed

@@ -32,6 +32,15 @@ class HealthResponse(BaseModel):
     model: str
     endpoint: str
 @app.get("/", response_model=HealthResponse)
 async def root():
     """Health check endpoint"""
@@ -54,7 +63,7 @@ async def health():
         return {"status": "degraded", "ollama": "disconnected", "error": str(e)}
 async def generate_stream(prompt: str):
-    """Generate streaming response from Ollama"""
     try:
         async with httpx.AsyncClient(timeout=300.0) as client:
             payload = {
@@ -66,6 +75,10 @@ async def generate_stream(prompt: str):
                     "num_predict": 2048,
                     "top_k": 40,
                     "top_p": 0.9,
                 }
             }
@@ -95,7 +108,7 @@ async def generate_stream(prompt: str):
 @app.post("/stream")
 async def stream_chat(request: ChatRequest):
-    """Stream chat completions with key authentication"""
     if request.key != CONNECT_KEY:
         raise HTTPException(status_code=403, detail="Invalid connect key")
@@ -106,9 +119,12 @@ async def stream_chat(request: ChatRequest):
         generate_stream(request.prompt),
         media_type="text/event-stream",
         headers={
-            "Cache-Control": "no-cache",
             "Connection": "keep-alive",
-            "X-Accel-Buffering": "no"
         }
     )

     model: str
     endpoint: str
+# Middleware to disable all caching
+@app.middleware("http")
+async def disable_cache_middleware(request, call_next):
+    response = await call_next(request)
+    response.headers["Cache-Control"] = "no-store, no-cache, must-revalidate, max-age=0"
+    response.headers["Pragma"] = "no-cache"
+    response.headers["Expires"] = "0"
+    return response
 @app.get("/", response_model=HealthResponse)
 async def root():
     """Health check endpoint"""
         return {"status": "degraded", "ollama": "disconnected", "error": str(e)}
 async def generate_stream(prompt: str):
+    """Generate streaming response from Ollama without caching"""
     try:
         async with httpx.AsyncClient(timeout=300.0) as client:
             payload = {
                     "num_predict": 2048,
                     "top_k": 40,
                     "top_p": 0.9,
+                    "num_ctx": 2048,
+                    "num_batch": 512,
+                    "num_gpu": 1,
+                    "num_thread": 4,
                 }
             }
 @app.post("/stream")
 async def stream_chat(request: ChatRequest):
+    """Stream chat completions with key authentication - NO CACHING"""
     if request.key != CONNECT_KEY:
         raise HTTPException(status_code=403, detail="Invalid connect key")
         generate_stream(request.prompt),
         media_type="text/event-stream",
         headers={
+            "Cache-Control": "no-store, no-cache, must-revalidate, max-age=0, private",
+            "Pragma": "no-cache",
+            "Expires": "0",
             "Connection": "keep-alive",
+            "X-Accel-Buffering": "no",
+            "X-Content-Type-Options": "nosniff"
         }
     )