truegleai committed on
Commit
eaed04c
·
verified ·
1 Parent(s): 5ee525a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +37 -25
app.py CHANGED
@@ -18,9 +18,13 @@ app.add_middleware(
18
  allow_headers=["*"],
19
  )
20
 
21
- OLLAMA_BASE = "http://localhost:11434"
22
- MODEL = os.environ.get("DEFAULT_MODEL", "qwen2.5-coder:7b-instruct-q4_K_M")
23
- API_TOKEN = os.environ.get("API_TOKEN", "")
 
 
 
 
24
 
25
 
26
  def verify_token(creds: HTTPAuthorizationCredentials = Depends(security)):
@@ -33,7 +37,7 @@ def verify_token(creds: HTTPAuthorizationCredentials = Depends(security)):
33
 
34
  @app.get("/")
35
  async def root():
36
- return {"status": "ok", "model": MODEL}
37
 
38
 
39
  @app.get("/health")
@@ -41,7 +45,7 @@ async def health():
41
  try:
42
  r = requests.get(f"{OLLAMA_BASE}/api/tags", timeout=5)
43
  models = [m["name"] for m in r.json().get("models", [])]
44
- return {"status": "ok", "model": MODEL, "available_models": models}
45
  except Exception as e:
46
  return {"status": "starting", "error": str(e)}
47
 
@@ -67,7 +71,8 @@ async def chat_completions(request: Request, token: str = Depends(verify_token))
67
  "messages": body.get("messages", []),
68
  "stream": stream,
69
  "options": {
70
- "num_ctx": body.get("max_tokens", 32768),
 
71
  "temperature": body.get("temperature", 0.7),
72
  }
73
  }
@@ -77,7 +82,7 @@ async def chat_completions(request: Request, token: str = Depends(verify_token))
77
  try:
78
  with requests.post(
79
  f"{OLLAMA_BASE}/v1/chat/completions",
80
- json=payload, stream=True, timeout=300
81
  ) as r:
82
  for chunk in r.iter_content(chunk_size=None):
83
  if chunk:
@@ -86,8 +91,11 @@ async def chat_completions(request: Request, token: str = Depends(verify_token))
86
  yield f"data: {{\"error\": \"{e}\"}}\n\n".encode()
87
  return StreamingResponse(generate(), media_type="text/event-stream")
88
 
89
- r = requests.post(f"{OLLAMA_BASE}/v1/chat/completions", json=payload, timeout=300)
90
- return r.json()
 
 
 
91
 
92
 
93
  @app.post("/v1/messages")
@@ -101,7 +109,8 @@ async def messages(request: Request, token: str = Depends(verify_token)):
101
  "messages": body.get("messages", []),
102
  "stream": stream,
103
  "options": {
104
- "num_ctx": body.get("max_tokens", 32768),
 
105
  "temperature": body.get("temperature", 0.7),
106
  }
107
  }
@@ -117,7 +126,7 @@ async def messages(request: Request, token: str = Depends(verify_token)):
117
  try:
118
  with requests.post(
119
  f"{OLLAMA_BASE}/v1/chat/completions",
120
- json=payload, stream=True, timeout=300
121
  ) as r:
122
  buf = ""
123
  for chunk in r.iter_content(chunk_size=None):
@@ -153,21 +162,24 @@ async def messages(request: Request, token: str = Depends(verify_token)):
153
 
154
  return StreamingResponse(generate_anthropic(), media_type="text/event-stream")
155
 
156
- r = requests.post(f"{OLLAMA_BASE}/v1/chat/completions", json=payload, timeout=300)
157
- data = r.json()
158
- content = (data.get("choices") or [{}])[0].get("message", {}).get("content", "")
159
- return {
160
- "id": data.get("id", f"msg_{int(time.time())}"),
161
- "type": "message",
162
- "role": "assistant",
163
- "content": [{"type": "text", "text": content}],
164
- "model": model,
165
- "stop_reason": "end_turn",
166
- "usage": {
167
- "input_tokens": data.get("usage", {}).get("prompt_tokens", 0),
168
- "output_tokens": data.get("usage", {}).get("completion_tokens", 0)
 
 
169
  }
170
- }
 
171
 
172
 
173
  if __name__ == "__main__":
 
18
  allow_headers=["*"],
19
  )
20
 
21
+ OLLAMA_BASE = "http://localhost:11434"
22
+ MODEL = os.environ.get("DEFAULT_MODEL", "qwen2.5-coder:7b-instruct-q4_K_M")
23
+ API_TOKEN = os.environ.get("API_TOKEN", "")
24
+ # Free CPU tier: keep context small or requests will time out after 5 min
25
+ MAX_CTX = 4096
26
+ MAX_OUT = 1024
27
+ TIMEOUT = 240 # 4 min hard limit — under HF's 5 min kill
28
 
29
 
30
  def verify_token(creds: HTTPAuthorizationCredentials = Depends(security)):
 
37
 
38
  @app.get("/")
39
  async def root():
40
+ return {"status": "ok", "model": MODEL, "max_ctx": MAX_CTX}
41
 
42
 
43
  @app.get("/health")
 
45
  try:
46
  r = requests.get(f"{OLLAMA_BASE}/api/tags", timeout=5)
47
  models = [m["name"] for m in r.json().get("models", [])]
48
+ return {"status": "ok", "model": MODEL, "available_models": models, "max_ctx": MAX_CTX}
49
  except Exception as e:
50
  return {"status": "starting", "error": str(e)}
51
 
 
71
  "messages": body.get("messages", []),
72
  "stream": stream,
73
  "options": {
74
+ "num_ctx": MAX_CTX,
75
+ "num_predict": min(body.get("max_tokens", MAX_OUT), MAX_OUT),
76
  "temperature": body.get("temperature", 0.7),
77
  }
78
  }
 
82
  try:
83
  with requests.post(
84
  f"{OLLAMA_BASE}/v1/chat/completions",
85
+ json=payload, stream=True, timeout=TIMEOUT
86
  ) as r:
87
  for chunk in r.iter_content(chunk_size=None):
88
  if chunk:
 
91
  yield f"data: {{\"error\": \"{e}\"}}\n\n".encode()
92
  return StreamingResponse(generate(), media_type="text/event-stream")
93
 
94
+ try:
95
+ r = requests.post(f"{OLLAMA_BASE}/v1/chat/completions", json=payload, timeout=TIMEOUT)
96
+ return r.json()
97
+ except requests.Timeout:
98
+ raise HTTPException(504, "Inference timeout — try a shorter prompt")
99
 
100
 
101
  @app.post("/v1/messages")
 
109
  "messages": body.get("messages", []),
110
  "stream": stream,
111
  "options": {
112
+ "num_ctx": MAX_CTX,
113
+ "num_predict": min(body.get("max_tokens", MAX_OUT), MAX_OUT),
114
  "temperature": body.get("temperature", 0.7),
115
  }
116
  }
 
126
  try:
127
  with requests.post(
128
  f"{OLLAMA_BASE}/v1/chat/completions",
129
+ json=payload, stream=True, timeout=TIMEOUT
130
  ) as r:
131
  buf = ""
132
  for chunk in r.iter_content(chunk_size=None):
 
162
 
163
  return StreamingResponse(generate_anthropic(), media_type="text/event-stream")
164
 
165
+ try:
166
+ r = requests.post(f"{OLLAMA_BASE}/v1/chat/completions", json=payload, timeout=TIMEOUT)
167
+ data = r.json()
168
+ content = (data.get("choices") or [{}])[0].get("message", {}).get("content", "")
169
+ return {
170
+ "id": data.get("id", f"msg_{int(time.time())}"),
171
+ "type": "message",
172
+ "role": "assistant",
173
+ "content": [{"type": "text", "text": content}],
174
+ "model": model,
175
+ "stop_reason": "end_turn",
176
+ "usage": {
177
+ "input_tokens": data.get("usage", {}).get("prompt_tokens", 0),
178
+ "output_tokens": data.get("usage", {}).get("completion_tokens", 0)
179
+ }
180
  }
181
+ except requests.Timeout:
182
+ raise HTTPException(504, "Inference timeout — try a shorter prompt")
183
 
184
 
185
  if __name__ == "__main__":