truegleai commited on
Commit
fb2dba2
·
verified ·
1 Parent(s): eaed04c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +248 -117
app.py CHANGED
@@ -1,14 +1,46 @@
1
  import os
2
  import json
3
  import time
 
4
  import requests
5
  import uvicorn
6
  from fastapi import FastAPI, Depends, HTTPException, Request
7
  from fastapi.middleware.cors import CORSMiddleware
8
  from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
9
  from fastapi.responses import StreamingResponse
 
 
 
10
 
11
- app = FastAPI(title="o87Dev Cloud LLM API")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  security = HTTPBearer(auto_error=False)
13
 
14
  app.add_middleware(
@@ -18,14 +50,15 @@ app.add_middleware(
18
  allow_headers=["*"],
19
  )
20
 
21
- OLLAMA_BASE = "http://localhost:11434"
22
- MODEL = os.environ.get("DEFAULT_MODEL", "qwen2.5-coder:7b-instruct-q4_K_M")
23
- API_TOKEN = os.environ.get("API_TOKEN", "")
24
- # Free CPU tier: keep context small or requests will timeout after 5 min
25
- MAX_CTX = 4096
26
- MAX_OUT = 1024
27
- TIMEOUT = 240 # 4 min hard limit — under HF's 5 min kill
28
 
 
 
29
 
30
  def verify_token(creds: HTTPAuthorizationCredentials = Depends(security)):
31
  if not API_TOKEN:
@@ -35,9 +68,46 @@ def verify_token(creds: HTTPAuthorizationCredentials = Depends(security)):
35
  return creds.credentials
36
 
37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  @app.get("/")
39
  async def root():
40
- return {"status": "ok", "model": MODEL, "max_ctx": MAX_CTX}
 
 
 
 
 
41
 
42
 
43
  @app.get("/health")
@@ -45,7 +115,13 @@ async def health():
45
  try:
46
  r = requests.get(f"{OLLAMA_BASE}/api/tags", timeout=5)
47
  models = [m["name"] for m in r.json().get("models", [])]
48
- return {"status": "ok", "model": MODEL, "available_models": models, "max_ctx": MAX_CTX}
 
 
 
 
 
 
49
  except Exception as e:
50
  return {"status": "starting", "error": str(e)}
51
 
@@ -62,124 +138,179 @@ async def list_models(token: str = Depends(verify_token)):
62
 
63
  @app.post("/v1/chat/completions")
64
  async def chat_completions(request: Request, token: str = Depends(verify_token)):
65
- body = await request.json()
66
- model = body.get("model", MODEL)
67
- stream = body.get("stream", False)
68
-
69
- payload = {
70
- "model": model,
71
- "messages": body.get("messages", []),
72
- "stream": stream,
73
- "options": {
74
- "num_ctx": MAX_CTX,
75
- "num_predict": min(body.get("max_tokens", MAX_OUT), MAX_OUT),
76
- "temperature": body.get("temperature", 0.7),
 
 
 
 
 
 
 
 
 
 
 
77
  }
78
- }
79
-
80
- if stream:
81
- def generate():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  try:
83
- with requests.post(
84
  f"{OLLAMA_BASE}/v1/chat/completions",
85
- json=payload, stream=True, timeout=TIMEOUT
86
- ) as r:
87
- for chunk in r.iter_content(chunk_size=None):
88
- if chunk:
89
- yield chunk
 
 
 
 
 
 
 
 
 
 
 
 
90
  except Exception as e:
91
- yield f"data: {{\"error\": \"{e}\"}}\n\n".encode()
92
- return StreamingResponse(generate(), media_type="text/event-stream")
93
-
94
- try:
95
- r = requests.post(f"{OLLAMA_BASE}/v1/chat/completions", json=payload, timeout=TIMEOUT)
96
- return r.json()
97
- except requests.Timeout:
98
- raise HTTPException(504, "Inference timeout — try a shorter prompt")
99
 
100
 
101
  @app.post("/v1/messages")
102
  async def messages(request: Request, token: str = Depends(verify_token)):
103
- body = await request.json()
104
- model = body.get("model", MODEL)
105
- stream = body.get("stream", False)
106
-
107
- payload = {
108
- "model": model,
109
- "messages": body.get("messages", []),
110
- "stream": stream,
111
- "options": {
112
- "num_ctx": MAX_CTX,
113
- "num_predict": min(body.get("max_tokens", MAX_OUT), MAX_OUT),
114
- "temperature": body.get("temperature", 0.7),
115
- }
116
- }
117
-
118
- if stream:
119
- def generate_anthropic():
120
- msg_id = f"msg_{int(time.time())}"
121
- yield f"event: message_start\ndata: {json.dumps({'type':'message_start','message':{'id':msg_id,'type':'message','role':'assistant','content':[],'model':model,'stop_reason':None,'usage':{'input_tokens':0,'output_tokens':0}}})}\n\n".encode()
122
- yield f"event: content_block_start\ndata: {json.dumps({'type':'content_block_start','index':0,'content_block':{'type':'text','text':''}})}\n\n".encode()
123
- yield b"event: ping\ndata: {\"type\":\"ping\"}\n\n"
124
-
125
- out_tokens = 0
126
- try:
127
- with requests.post(
128
- f"{OLLAMA_BASE}/v1/chat/completions",
129
- json=payload, stream=True, timeout=TIMEOUT
130
- ) as r:
131
- buf = ""
132
- for chunk in r.iter_content(chunk_size=None):
133
- if not chunk:
134
- continue
135
- buf += chunk.decode("utf-8", errors="ignore")
136
- lines = buf.split("\n")
137
- buf = lines.pop()
138
- for line in lines:
139
- line = line.strip()
140
- if not line or not line.startswith("data: "):
141
- continue
142
- js = line[6:]
143
- if js == "[DONE]":
144
- break
145
- try:
146
- d = json.loads(js)
147
- if d.get("usage"):
148
- out_tokens = d["usage"].get("completion_tokens", 0)
149
- text = (d.get("choices") or [{}])[0].get("delta", {}).get("content", "")
150
- if text:
151
- yield f"event: content_block_delta\ndata: {json.dumps({'type':'content_block_delta','index':0,'delta':{'type':'text_delta','text':text}})}\n\n".encode()
152
- if (d.get("choices") or [{}])[0].get("finish_reason"):
153
- break
154
- except Exception:
155
- pass
156
- except Exception as e:
157
- yield f"event: content_block_delta\ndata: {json.dumps({'type':'content_block_delta','index':0,'delta':{'type':'text_delta','text':f'Error: {e}'}})}\n\n".encode()
158
-
159
- yield b"event: content_block_stop\ndata: {\"type\":\"content_block_stop\",\"index\":0}\n\n"
160
- yield f"event: message_delta\ndata: {json.dumps({'type':'message_delta','delta':{'stop_reason':'end_turn','stop_sequence':None},'usage':{'output_tokens':out_tokens}})}\n\n".encode()
161
- yield b"event: message_stop\ndata: {\"type\":\"message_stop\"}\n\n"
162
-
163
- return StreamingResponse(generate_anthropic(), media_type="text/event-stream")
164
-
165
- try:
166
- r = requests.post(f"{OLLAMA_BASE}/v1/chat/completions", json=payload, timeout=TIMEOUT)
167
- data = r.json()
168
- content = (data.get("choices") or [{}])[0].get("message", {}).get("content", "")
169
- return {
170
- "id": data.get("id", f"msg_{int(time.time())}"),
171
- "type": "message",
172
- "role": "assistant",
173
- "content": [{"type": "text", "text": content}],
174
  "model": model,
175
- "stop_reason": "end_turn",
176
- "usage": {
177
- "input_tokens": data.get("usage", {}).get("prompt_tokens", 0),
178
- "output_tokens": data.get("usage", {}).get("completion_tokens", 0)
 
 
179
  }
180
  }
181
- except requests.Timeout:
182
- raise HTTPException(504, "Inference timeout — try a shorter prompt")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183
 
184
 
185
  if __name__ == "__main__":
 
1
  import os
2
  import json
3
  import time
4
+ import asyncio
5
  import requests
6
  import uvicorn
7
  from fastapi import FastAPI, Depends, HTTPException, Request
8
  from fastapi.middleware.cors import CORSMiddleware
9
  from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
10
  from fastapi.responses import StreamingResponse
11
+ from contextlib import asynccontextmanager
12
+ import subprocess
13
+ import shutil
14
 
15
+ # Check if ollama is available
16
+ OLLAMA_AVAILABLE = shutil.which("ollama") is not None
17
+
18
+ @asynccontextmanager
19
+ async def lifespan(app: FastAPI):
20
+ """Startup and shutdown events"""
21
+ if OLLAMA_AVAILABLE:
22
+ print("Starting Ollama service...")
23
+ subprocess.Popen(["ollama", "serve"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
24
+ await asyncio.sleep(3) # Wait for Ollama to start
25
+
26
+ # Set keep-alive to prevent model unloading
27
+ os.environ["OLLAMA_KEEP_ALIVE"] = "24h"
28
+
29
+ # Pull model if needed
30
+ try:
31
+ r = requests.get(f"{OLLAMA_BASE}/api/tags", timeout=5)
32
+ models = [m["name"] for m in r.json().get("models", [])]
33
+ if MODEL not in models:
34
+ print(f"Pulling model {MODEL}...")
35
+ subprocess.run(["ollama", "pull", MODEL], check=False)
36
+ except Exception as e:
37
+ print(f"Warning: Could not check/pull model: {e}")
38
+
39
+ yield
40
+
41
+ print("Shutting down...")
42
+
43
+ app = FastAPI(title="o87Dev Cloud LLM API", lifespan=lifespan)
44
  security = HTTPBearer(auto_error=False)
45
 
46
  app.add_middleware(
 
50
  allow_headers=["*"],
51
  )
52
 
53
+ OLLAMA_BASE = "http://localhost:11434"
54
+ MODEL = os.environ.get("DEFAULT_MODEL", "qwen2.5-coder:7b-instruct-q4_K_M")
55
+ API_TOKEN = os.environ.get("API_TOKEN", "")
56
+ MAX_CTX = int(os.environ.get("MAX_CTX", "4096"))
57
+ MAX_OUT = int(os.environ.get("MAX_OUT", "1024"))
58
+ TIMEOUT = int(os.environ.get("TIMEOUT", "240")) # 4 min limit
 
59
 
60
+ # Semaphore to limit concurrent requests (prevents OOM)
61
+ semaphore = asyncio.Semaphore(1) # Only 1 request at a time for CPU Spaces
62
 
63
  def verify_token(creds: HTTPAuthorizationCredentials = Depends(security)):
64
  if not API_TOKEN:
 
68
  return creds.credentials
69
 
70
 
71
+ async def wait_for_ollama(max_retries=10, delay=1):
72
+ """Wait for Ollama to be ready, with retries"""
73
+ for i in range(max_retries):
74
+ try:
75
+ r = requests.get(f"{OLLAMA_BASE}/api/tags", timeout=2)
76
+ if r.status_code == 200:
77
+ return True
78
+ except:
79
+ pass
80
+ await asyncio.sleep(delay)
81
+ return False
82
+
83
+
84
+ async def ensure_model_loaded(model_name: str = None):
85
+ """Pre-load model with a dummy request to force it into memory"""
86
+ model = model_name or MODEL
87
+ try:
88
+ # Check if model is already loaded
89
+ r = requests.get(f"{OLLAMA_BASE}/api/ps", timeout=2)
90
+ loaded = [m.get("model") for m in r.json().get("models", [])]
91
+ if model not in loaded:
92
+ print(f"Pre-loading model {model}...")
93
+ requests.post(
94
+ f"{OLLAMA_BASE}/api/generate",
95
+ json={"model": model, "prompt": "test", "stream": False},
96
+ timeout=30
97
+ )
98
+ print(f"Model {model} loaded")
99
+ except Exception as e:
100
+ print(f"Warning: Could not pre-load model: {e}")
101
+
102
+
103
  @app.get("/")
104
  async def root():
105
+ return {
106
+ "status": "ok",
107
+ "model": MODEL,
108
+ "max_ctx": MAX_CTX,
109
+ "ollama_available": OLLAMA_AVAILABLE
110
+ }
111
 
112
 
113
  @app.get("/health")
 
115
  try:
116
  r = requests.get(f"{OLLAMA_BASE}/api/tags", timeout=5)
117
  models = [m["name"] for m in r.json().get("models", [])]
118
+ return {
119
+ "status": "ok" if MODEL in models else "model_missing",
120
+ "model": MODEL,
121
+ "model_available": MODEL in models,
122
+ "available_models": models,
123
+ "max_ctx": MAX_CTX
124
+ }
125
  except Exception as e:
126
  return {"status": "starting", "error": str(e)}
127
 
 
138
 
139
  @app.post("/v1/chat/completions")
140
  async def chat_completions(request: Request, token: str = Depends(verify_token)):
141
+ """OpenAI-compatible endpoint with retries and better error handling"""
142
+
143
+ # Wait for Ollama to be ready
144
+ if not await wait_for_ollama():
145
+ raise HTTPException(503, "Ollama service not ready")
146
+
147
+ async with semaphore:
148
+ body = await request.json()
149
+ model = body.get("model", MODEL)
150
+ stream = body.get("stream", False)
151
+
152
+ # Ensure model is loaded before proceeding
153
+ await ensure_model_loaded(model)
154
+
155
+ payload = {
156
+ "model": model,
157
+ "messages": body.get("messages", []),
158
+ "stream": stream,
159
+ "options": {
160
+ "num_ctx": MAX_CTX,
161
+ "num_predict": min(body.get("max_tokens", MAX_OUT), MAX_OUT),
162
+ "temperature": body.get("temperature", 0.7),
163
+ }
164
  }
165
+
166
+ if stream:
167
+ def generate():
168
+ try:
169
+ with requests.post(
170
+ f"{OLLAMA_BASE}/v1/chat/completions",
171
+ json=payload,
172
+ stream=True,
173
+ timeout=TIMEOUT
174
+ ) as r:
175
+ if r.status_code != 200:
176
+ error_msg = f"Ollama error: {r.status_code}"
177
+ yield f"data: {json.dumps({'error': error_msg})}\n\n".encode()
178
+ yield b"data: [DONE]\n\n"
179
+ return
180
+
181
+ for chunk in r.iter_content(chunk_size=None):
182
+ if chunk:
183
+ yield chunk
184
+ except requests.Timeout:
185
+ yield f"data: {json.dumps({'error': 'Request timeout - try a shorter prompt'})}\n\n".encode()
186
+ yield b"data: [DONE]\n\n"
187
+ except Exception as e:
188
+ yield f"data: {json.dumps({'error': str(e)})}\n\n".encode()
189
+ yield b"data: [DONE]\n\n"
190
+
191
+ return StreamingResponse(generate(), media_type="text/event-stream")
192
+
193
+ # Non-streaming request with retry logic
194
+ max_retries = 2
195
+ for attempt in range(max_retries):
196
  try:
197
+ r = requests.post(
198
  f"{OLLAMA_BASE}/v1/chat/completions",
199
+ json=payload,
200
+ timeout=TIMEOUT
201
+ )
202
+ if r.status_code == 200:
203
+ return r.json()
204
+ elif r.status_code == 404:
205
+ # Model not found - try to pull it
206
+ if attempt < max_retries - 1:
207
+ print(f"Model {model} not found, attempting pull...")
208
+ subprocess.run(["ollama", "pull", model], check=False)
209
+ await asyncio.sleep(5)
210
+ continue
211
+ raise HTTPException(r.status_code, f"Ollama error: {r.text}")
212
+ except requests.Timeout:
213
+ if attempt == max_retries - 1:
214
+ raise HTTPException(504, "Inference timeout — try a shorter prompt")
215
+ await asyncio.sleep(2)
216
  except Exception as e:
217
+ if attempt == max_retries - 1:
218
+ raise HTTPException(500, str(e))
219
+ await asyncio.sleep(2)
 
 
 
 
 
220
 
221
 
222
  @app.post("/v1/messages")
223
  async def messages(request: Request, token: str = Depends(verify_token)):
224
+ """Anthropic-compatible messages endpoint"""
225
+
226
+ if not await wait_for_ollama():
227
+ raise HTTPException(503, "Ollama service not ready")
228
+
229
+ async with semaphore:
230
+ body = await request.json()
231
+ model = body.get("model", MODEL)
232
+ stream = body.get("stream", False)
233
+
234
+ await ensure_model_loaded(model)
235
+
236
+ payload = {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
237
  "model": model,
238
+ "messages": body.get("messages", []),
239
+ "stream": stream,
240
+ "options": {
241
+ "num_ctx": MAX_CTX,
242
+ "num_predict": min(body.get("max_tokens", MAX_OUT), MAX_OUT),
243
+ "temperature": body.get("temperature", 0.7),
244
  }
245
  }
246
+
247
+ if stream:
248
+ def generate_anthropic():
249
+ msg_id = f"msg_{int(time.time())}"
250
+ yield f"event: message_start\ndata: {json.dumps({'type':'message_start','message':{'id':msg_id,'type':'message','role':'assistant','content':[],'model':model,'stop_reason':None,'usage':{'input_tokens':0,'output_tokens':0}}})}\n\n".encode()
251
+ yield f"event: content_block_start\ndata: {json.dumps({'type':'content_block_start','index':0,'content_block':{'type':'text','text':''}})}\n\n".encode()
252
+ yield b"event: ping\ndata: {\"type\":\"ping\"}\n\n"
253
+
254
+ out_tokens = 0
255
+ try:
256
+ with requests.post(
257
+ f"{OLLAMA_BASE}/v1/chat/completions",
258
+ json=payload, stream=True, timeout=TIMEOUT
259
+ ) as r:
260
+ if r.status_code != 200:
261
+ yield f"event: content_block_delta\ndata: {json.dumps({'type':'content_block_delta','index':0,'delta':{'type':'text_delta','text':f'Error: Ollama returned {r.status_code}'}})}\n\n".encode()
262
+ else:
263
+ buf = ""
264
+ for chunk in r.iter_content(chunk_size=None):
265
+ if not chunk:
266
+ continue
267
+ buf += chunk.decode("utf-8", errors="ignore")
268
+ lines = buf.split("\n")
269
+ buf = lines.pop()
270
+ for line in lines:
271
+ line = line.strip()
272
+ if not line or not line.startswith("data: "):
273
+ continue
274
+ js = line[6:]
275
+ if js == "[DONE]":
276
+ break
277
+ try:
278
+ d = json.loads(js)
279
+ if d.get("usage"):
280
+ out_tokens = d["usage"].get("completion_tokens", 0)
281
+ text = (d.get("choices") or [{}])[0].get("delta", {}).get("content", "")
282
+ if text:
283
+ yield f"event: content_block_delta\ndata: {json.dumps({'type':'content_block_delta','index':0,'delta':{'type':'text_delta','text':text}})}\n\n".encode()
284
+ except:
285
+ pass
286
+ except Exception as e:
287
+ yield f"event: content_block_delta\ndata: {json.dumps({'type':'content_block_delta','index':0,'delta':{'type':'text_delta','text':f'Error: {e}'}})}\n\n".encode()
288
+
289
+ yield b"event: content_block_stop\ndata: {\"type\":\"content_block_stop\",\"index\":0}\n\n"
290
+ yield f"event: message_delta\ndata: {json.dumps({'type':'message_delta','delta':{'stop_reason':'end_turn','stop_sequence':None},'usage':{'output_tokens':out_tokens}})}\n\n".encode()
291
+ yield b"event: message_stop\ndata: {\"type\":\"message_stop\"}\n\n"
292
+
293
+ return StreamingResponse(generate_anthropic(), media_type="text/event-stream")
294
+
295
+ # Non-streaming
296
+ try:
297
+ r = requests.post(f"{OLLAMA_BASE}/v1/chat/completions", json=payload, timeout=TIMEOUT)
298
+ data = r.json()
299
+ content = (data.get("choices") or [{}])[0].get("message", {}).get("content", "")
300
+ return {
301
+ "id": data.get("id", f"msg_{int(time.time())}"),
302
+ "type": "message",
303
+ "role": "assistant",
304
+ "content": [{"type": "text", "text": content}],
305
+ "model": model,
306
+ "stop_reason": "end_turn",
307
+ "usage": {
308
+ "input_tokens": data.get("usage", {}).get("prompt_tokens", 0),
309
+ "output_tokens": data.get("usage", {}).get("completion_tokens", 0)
310
+ }
311
+ }
312
+ except requests.Timeout:
313
+ raise HTTPException(504, "Inference timeout — try a shorter prompt")
314
 
315
 
316
  if __name__ == "__main__":