Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -18,9 +18,13 @@ app.add_middleware(
|
|
| 18 |
allow_headers=["*"],
|
| 19 |
)
|
| 20 |
|
| 21 |
-
OLLAMA_BASE
|
| 22 |
-
MODEL
|
| 23 |
-
API_TOKEN
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
|
| 25 |
|
| 26 |
def verify_token(creds: HTTPAuthorizationCredentials = Depends(security)):
|
|
@@ -33,7 +37,7 @@ def verify_token(creds: HTTPAuthorizationCredentials = Depends(security)):
|
|
| 33 |
|
| 34 |
@app.get("/")
|
| 35 |
async def root():
|
| 36 |
-
return {"status": "ok", "model": MODEL}
|
| 37 |
|
| 38 |
|
| 39 |
@app.get("/health")
|
|
@@ -41,7 +45,7 @@ async def health():
|
|
| 41 |
try:
|
| 42 |
r = requests.get(f"{OLLAMA_BASE}/api/tags", timeout=5)
|
| 43 |
models = [m["name"] for m in r.json().get("models", [])]
|
| 44 |
-
return {"status": "ok", "model": MODEL, "available_models": models}
|
| 45 |
except Exception as e:
|
| 46 |
return {"status": "starting", "error": str(e)}
|
| 47 |
|
|
@@ -67,7 +71,8 @@ async def chat_completions(request: Request, token: str = Depends(verify_token))
|
|
| 67 |
"messages": body.get("messages", []),
|
| 68 |
"stream": stream,
|
| 69 |
"options": {
|
| 70 |
-
"num_ctx":
|
|
|
|
| 71 |
"temperature": body.get("temperature", 0.7),
|
| 72 |
}
|
| 73 |
}
|
|
@@ -77,7 +82,7 @@ async def chat_completions(request: Request, token: str = Depends(verify_token))
|
|
| 77 |
try:
|
| 78 |
with requests.post(
|
| 79 |
f"{OLLAMA_BASE}/v1/chat/completions",
|
| 80 |
-
json=payload, stream=True, timeout=
|
| 81 |
) as r:
|
| 82 |
for chunk in r.iter_content(chunk_size=None):
|
| 83 |
if chunk:
|
|
@@ -86,8 +91,11 @@ async def chat_completions(request: Request, token: str = Depends(verify_token))
|
|
| 86 |
yield f"data: {{\"error\": \"{e}\"}}\n\n".encode()
|
| 87 |
return StreamingResponse(generate(), media_type="text/event-stream")
|
| 88 |
|
| 89 |
-
|
| 90 |
-
|
|
|
|
|
|
|
|
|
|
| 91 |
|
| 92 |
|
| 93 |
@app.post("/v1/messages")
|
|
@@ -101,7 +109,8 @@ async def messages(request: Request, token: str = Depends(verify_token)):
|
|
| 101 |
"messages": body.get("messages", []),
|
| 102 |
"stream": stream,
|
| 103 |
"options": {
|
| 104 |
-
"num_ctx":
|
|
|
|
| 105 |
"temperature": body.get("temperature", 0.7),
|
| 106 |
}
|
| 107 |
}
|
|
@@ -117,7 +126,7 @@ async def messages(request: Request, token: str = Depends(verify_token)):
|
|
| 117 |
try:
|
| 118 |
with requests.post(
|
| 119 |
f"{OLLAMA_BASE}/v1/chat/completions",
|
| 120 |
-
json=payload, stream=True, timeout=
|
| 121 |
) as r:
|
| 122 |
buf = ""
|
| 123 |
for chunk in r.iter_content(chunk_size=None):
|
|
@@ -153,21 +162,24 @@ async def messages(request: Request, token: str = Depends(verify_token)):
|
|
| 153 |
|
| 154 |
return StreamingResponse(generate_anthropic(), media_type="text/event-stream")
|
| 155 |
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
"
|
| 168 |
-
|
|
|
|
|
|
|
| 169 |
}
|
| 170 |
-
|
|
|
|
| 171 |
|
| 172 |
|
| 173 |
if __name__ == "__main__":
|
|
|
|
| 18 |
allow_headers=["*"],
|
| 19 |
)
|
| 20 |
|
| 21 |
+
OLLAMA_BASE = "http://localhost:11434"
|
| 22 |
+
MODEL = os.environ.get("DEFAULT_MODEL", "qwen2.5-coder:7b-instruct-q4_K_M")
|
| 23 |
+
API_TOKEN = os.environ.get("API_TOKEN", "")
|
| 24 |
+
# Free CPU tier: keep context small or requests will time out after 5 min
|
| 25 |
+
MAX_CTX = 4096
|
| 26 |
+
MAX_OUT = 1024
|
| 27 |
+
TIMEOUT = 240 # 4 min hard limit — under HF's 5 min kill
|
| 28 |
|
| 29 |
|
| 30 |
def verify_token(creds: HTTPAuthorizationCredentials = Depends(security)):
|
|
|
|
| 37 |
|
| 38 |
@app.get("/")
|
| 39 |
async def root():
|
| 40 |
+
return {"status": "ok", "model": MODEL, "max_ctx": MAX_CTX}
|
| 41 |
|
| 42 |
|
| 43 |
@app.get("/health")
|
|
|
|
| 45 |
try:
|
| 46 |
r = requests.get(f"{OLLAMA_BASE}/api/tags", timeout=5)
|
| 47 |
models = [m["name"] for m in r.json().get("models", [])]
|
| 48 |
+
return {"status": "ok", "model": MODEL, "available_models": models, "max_ctx": MAX_CTX}
|
| 49 |
except Exception as e:
|
| 50 |
return {"status": "starting", "error": str(e)}
|
| 51 |
|
|
|
|
| 71 |
"messages": body.get("messages", []),
|
| 72 |
"stream": stream,
|
| 73 |
"options": {
|
| 74 |
+
"num_ctx": MAX_CTX,
|
| 75 |
+
"num_predict": min(body.get("max_tokens", MAX_OUT), MAX_OUT),
|
| 76 |
"temperature": body.get("temperature", 0.7),
|
| 77 |
}
|
| 78 |
}
|
|
|
|
| 82 |
try:
|
| 83 |
with requests.post(
|
| 84 |
f"{OLLAMA_BASE}/v1/chat/completions",
|
| 85 |
+
json=payload, stream=True, timeout=TIMEOUT
|
| 86 |
) as r:
|
| 87 |
for chunk in r.iter_content(chunk_size=None):
|
| 88 |
if chunk:
|
|
|
|
| 91 |
yield f"data: {{\"error\": \"{e}\"}}\n\n".encode()
|
| 92 |
return StreamingResponse(generate(), media_type="text/event-stream")
|
| 93 |
|
| 94 |
+
try:
|
| 95 |
+
r = requests.post(f"{OLLAMA_BASE}/v1/chat/completions", json=payload, timeout=TIMEOUT)
|
| 96 |
+
return r.json()
|
| 97 |
+
except requests.Timeout:
|
| 98 |
+
raise HTTPException(504, "Inference timeout — try a shorter prompt")
|
| 99 |
|
| 100 |
|
| 101 |
@app.post("/v1/messages")
|
|
|
|
| 109 |
"messages": body.get("messages", []),
|
| 110 |
"stream": stream,
|
| 111 |
"options": {
|
| 112 |
+
"num_ctx": MAX_CTX,
|
| 113 |
+
"num_predict": min(body.get("max_tokens", MAX_OUT), MAX_OUT),
|
| 114 |
"temperature": body.get("temperature", 0.7),
|
| 115 |
}
|
| 116 |
}
|
|
|
|
| 126 |
try:
|
| 127 |
with requests.post(
|
| 128 |
f"{OLLAMA_BASE}/v1/chat/completions",
|
| 129 |
+
json=payload, stream=True, timeout=TIMEOUT
|
| 130 |
) as r:
|
| 131 |
buf = ""
|
| 132 |
for chunk in r.iter_content(chunk_size=None):
|
|
|
|
| 162 |
|
| 163 |
return StreamingResponse(generate_anthropic(), media_type="text/event-stream")
|
| 164 |
|
| 165 |
+
try:
|
| 166 |
+
r = requests.post(f"{OLLAMA_BASE}/v1/chat/completions", json=payload, timeout=TIMEOUT)
|
| 167 |
+
data = r.json()
|
| 168 |
+
content = (data.get("choices") or [{}])[0].get("message", {}).get("content", "")
|
| 169 |
+
return {
|
| 170 |
+
"id": data.get("id", f"msg_{int(time.time())}"),
|
| 171 |
+
"type": "message",
|
| 172 |
+
"role": "assistant",
|
| 173 |
+
"content": [{"type": "text", "text": content}],
|
| 174 |
+
"model": model,
|
| 175 |
+
"stop_reason": "end_turn",
|
| 176 |
+
"usage": {
|
| 177 |
+
"input_tokens": data.get("usage", {}).get("prompt_tokens", 0),
|
| 178 |
+
"output_tokens": data.get("usage", {}).get("completion_tokens", 0)
|
| 179 |
+
}
|
| 180 |
}
|
| 181 |
+
except requests.Timeout:
|
| 182 |
+
raise HTTPException(504, "Inference timeout — try a shorter prompt")
|
| 183 |
|
| 184 |
|
| 185 |
if __name__ == "__main__":
|