oki692 committed on
Commit
ca418d0
·
verified ·
1 Parent(s): 5ad1f05

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +20 -4
app.py CHANGED
@@ -32,6 +32,15 @@ class HealthResponse(BaseModel):
32
  model: str
33
  endpoint: str
34
 
 
 
 
 
 
 
 
 
 
35
  @app.get("/", response_model=HealthResponse)
36
  async def root():
37
  """Health check endpoint"""
@@ -54,7 +63,7 @@ async def health():
54
  return {"status": "degraded", "ollama": "disconnected", "error": str(e)}
55
 
56
  async def generate_stream(prompt: str):
57
- """Generate streaming response from Ollama"""
58
  try:
59
  async with httpx.AsyncClient(timeout=300.0) as client:
60
  payload = {
@@ -66,6 +75,10 @@ async def generate_stream(prompt: str):
66
  "num_predict": 2048,
67
  "top_k": 40,
68
  "top_p": 0.9,
 
 
 
 
69
  }
70
  }
71
 
@@ -95,7 +108,7 @@ async def generate_stream(prompt: str):
95
 
96
  @app.post("/stream")
97
  async def stream_chat(request: ChatRequest):
98
- """Stream chat completions with key authentication"""
99
  if request.key != CONNECT_KEY:
100
  raise HTTPException(status_code=403, detail="Invalid connect key")
101
 
@@ -106,9 +119,12 @@ async def stream_chat(request: ChatRequest):
106
  generate_stream(request.prompt),
107
  media_type="text/event-stream",
108
  headers={
109
- "Cache-Control": "no-cache",
 
 
110
  "Connection": "keep-alive",
111
- "X-Accel-Buffering": "no"
 
112
  }
113
  )
114
 
 
32
  model: str
33
  endpoint: str
34
 
35
+ # Middleware to disable all caching
36
+ @app.middleware("http")
37
+ async def disable_cache_middleware(request, call_next):
38
+ response = await call_next(request)
39
+ response.headers["Cache-Control"] = "no-store, no-cache, must-revalidate, max-age=0"
40
+ response.headers["Pragma"] = "no-cache"
41
+ response.headers["Expires"] = "0"
42
+ return response
43
+
44
  @app.get("/", response_model=HealthResponse)
45
  async def root():
46
  """Health check endpoint"""
 
63
  return {"status": "degraded", "ollama": "disconnected", "error": str(e)}
64
 
65
  async def generate_stream(prompt: str):
66
+ """Generate streaming response from Ollama without caching"""
67
  try:
68
  async with httpx.AsyncClient(timeout=300.0) as client:
69
  payload = {
 
75
  "num_predict": 2048,
76
  "top_k": 40,
77
  "top_p": 0.9,
78
+ "num_ctx": 2048,
79
+ "num_batch": 512,
80
+ "num_gpu": 1,
81
+ "num_thread": 4,
82
  }
83
  }
84
 
 
108
 
109
  @app.post("/stream")
110
  async def stream_chat(request: ChatRequest):
111
+ """Stream chat completions with key authentication - NO CACHING"""
112
  if request.key != CONNECT_KEY:
113
  raise HTTPException(status_code=403, detail="Invalid connect key")
114
 
 
119
  generate_stream(request.prompt),
120
  media_type="text/event-stream",
121
  headers={
122
+ "Cache-Control": "no-store, no-cache, must-revalidate, max-age=0, private",
123
+ "Pragma": "no-cache",
124
+ "Expires": "0",
125
  "Connection": "keep-alive",
126
+ "X-Accel-Buffering": "no",
127
+ "X-Content-Type-Options": "nosniff"
128
  }
129
  )
130