NOT-OMEGA committed on
Commit
aedc5e9
·
verified ·
1 Parent(s): b39e7fd

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +87 -192
main.py CHANGED
@@ -1,14 +1,5 @@
1
  """
2
- KVInfer — FastAPI Backend v2.1
3
- ========================================
4
- Fixes applied:
5
- #1 Persistent C++ process β€” model loads ONCE at startup via lifespan.
6
- #2 O(n) token cache β€” incremental tokens only per turn.
7
- #3 Session KV-cache reuse.
8
- #4 Stop-token bleed fix.
9
- #7 Chat template format fixed to match SFT training format.
10
- #HF Serves index.html at "/" for HF Spaces Docker deployment.
11
- #HF Automatically downloads model.bin & tokenizer.bin from HF Hub.
12
  """
13
  import asyncio
14
  import json
@@ -24,7 +15,7 @@ from fastapi import FastAPI, HTTPException
24
  from fastapi.middleware.cors import CORSMiddleware
25
  from fastapi.responses import FileResponse, StreamingResponse
26
  from pydantic import BaseModel, Field
27
- from huggingface_hub import hf_hub_download # <-- NAYA IMPORT
28
 
29
  # ─────────────────────────────────────────────────────────────────────────
30
  # Config
@@ -34,8 +25,6 @@ INFERENCE_EXE = BASE_DIR / "inference"
34
  MODEL_BIN = BASE_DIR / "model.bin"
35
  TOKENIZER_BIN = BASE_DIR / "tokenizer.bin"
36
 
37
- # ⚠️ YAHAN APNA HUGGING FACE REPO ID DAALO ⚠️
38
- # Example: "Sumeet/KVInfer-152M"
39
  HF_REPO_ID = "NOT-OMEGA/KVInfer-152M"
40
 
41
  SYSTEM_TOKEN = "System:"
@@ -46,7 +35,7 @@ SEP = "\n"
46
  BLOCK_SIZE = 1024
47
  MAX_GEN_CEILING = 500
48
  SAFETY_MARGIN = 24
49
- MAX_SESSION_TOKENS = BLOCK_SIZE - MAX_GEN_CEILING - SAFETY_MARGIN # = 500
50
 
51
  # ─────────────────────────────────────────────────────────────────────────
52
  # Tokenizer
@@ -100,7 +89,10 @@ class InferenceEngine:
100
  async with self._lock:
101
  self._proc.stdin.write(f"RESET|{session_id}\n".encode())
102
  await self._proc.stdin.drain()
103
- await self._proc.stdout.readline()
 
 
 
104
 
105
  async def generate(self, session_id, new_token_ids, max_new, temperature, top_k):
106
  if not self._ready or self._proc is None:
@@ -114,35 +106,38 @@ class InferenceEngine:
114
  async with self._lock:
115
  self._proc.stdin.write(cmd.encode())
116
  await self._proc.stdin.drain()
117
-
118
- while True:
119
- raw = await self._proc.stdout.readline()
120
- line = raw.decode("utf-8", errors="replace").strip()
121
- if not line:
122
- continue
123
-
124
- if line.startswith("TOKEN"):
125
- parts = line.split()
126
- tid = int(parts[1])
127
- ms = float(parts[2])
128
- yield {"type": "token", "id": tid,
129
- "text": enc.decode([tid]), "elapsed_ms": ms}
130
- elif line.startswith("DONE"):
131
- parts = line.split()
132
- total_t = int(parts[1])
133
- total_ms = float(parts[2])
134
- tps = round(total_t / (total_ms / 1000.0), 2) if total_ms > 0 else 0
135
- yield {"type": "done", "total_tokens": total_t,
136
- "total_ms": total_ms, "tps": tps}
137
- break
138
- elif line.startswith("ERROR"):
139
- yield {"type": "error", "message": line}
140
- break
 
 
 
141
 
142
  engine = InferenceEngine()
143
 
144
  # ─────────────────────────────────────────────────────────────────────────
145
- # Session State
146
  # ─────────────────────────────────────────────────────────────────────────
147
  class SessionData:
148
  def __init__(self, system_prompt: str):
@@ -158,105 +153,92 @@ class SessionData:
158
 
159
  def new_turn_tokens(self, user_msg):
160
  if self.tokens_in_engine == 0:
161
- full = (
162
- f"{SYSTEM_TOKEN} {self.system_prompt}{SEP}"
163
- f"{USER_TOKEN} {user_msg}{SEP}"
164
- f"{ASST_TOKEN} "
165
- )
166
  return enc.encode_ordinary(full)
167
  else:
168
- incremental = f"{USER_TOKEN} {user_msg}{SEP}{ASST_TOKEN} "
169
- return enc.encode_ordinary(incremental)
170
 
171
  sessions = {}
172
-
173
  metrics = {
174
- "total_requests": 0,
175
- "total_tokens": 0,
176
- "total_ms": 0.0,
177
- "errors": 0,
178
- "start_time": time.time(),
179
  }
180
 
181
  # ─────────────────────────────────────────────────────────────────────────
182
- # App + Lifespan (Naya HF Download Logic Yahan Hai)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183
  # ─────────────────────────────────────────────────────────────────────────
184
  @asynccontextmanager
185
  async def lifespan(app: FastAPI):
186
- # 1. Download Model and Tokenizer automatically if missing
187
  try:
188
  print("[HF HUB] Checking for model files...")
189
  if not MODEL_BIN.exists():
190
- print(f"[HF HUB] Downloading model.bin from {HF_REPO_ID}...")
191
  hf_hub_download(repo_id=HF_REPO_ID, filename="model.bin", local_dir=str(BASE_DIR))
192
-
193
  if not TOKENIZER_BIN.exists():
194
- print(f"[HF HUB] Downloading tokenizer.bin from {HF_REPO_ID}...")
195
  hf_hub_download(repo_id=HF_REPO_ID, filename="tokenizer.bin", local_dir=str(BASE_DIR))
196
  except Exception as e:
197
  print(f"[WARNING] Hugging Face Model download failed: {e}")
198
 
199
- # 2. Start the Inference Engine
200
  try:
201
  await engine.start()
202
  except Exception as e:
203
  print(f"[WARNING] Could not start engine: {e}")
204
- print("[WARNING] Server will start but /chat will return 503 until engine is ready.")
205
-
206
  yield
207
  await engine.stop()
208
 
209
- app = FastAPI(title="KVInfer", version="2.1.0", lifespan=lifespan)
210
- app.add_middleware(
211
- CORSMiddleware,
212
- allow_origins=["*"], allow_methods=["*"], allow_headers=["*"],
213
- )
214
 
215
  # ─────────────────────────────────────────────────────────────────────────
216
- # Pydantic Models & Routes
217
  # ─────────────────────────────────────────────────────────────────────────
218
  class ChatRequest(BaseModel):
219
- message: str
220
- session_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
221
- system_prompt: str = "You are a helpful assistant."
222
- max_new_tokens: int = Field(default=200, ge=1, le=500)
223
- temperature: float = Field(default=0.7, ge=0.01, le=2.0)
224
- top_k: int = Field(default=40, ge=1, le=200)
225
 
226
  class ResetRequest(BaseModel):
227
  session_id: str
228
 
229
- class GenerateRequest(BaseModel):
230
- prompt: str
231
- max_tokens: int = Field(default=100, ge=1, le=500)
232
- temperature: float = Field(default=0.7, ge=0.01, le=2.0)
233
- top_k: int = Field(default=40, ge=1, le=200)
234
-
235
-
236
  @app.get("/")
237
  async def serve_ui():
238
  return FileResponse(BASE_DIR / "index.html")
239
 
240
  @app.get("/health")
241
  async def health():
242
- mem = psutil.virtual_memory()
243
  uptime = time.time() - metrics["start_time"]
244
  return {
245
- "status": "ok" if engine._ready else "engine_loading",
246
- "engine_ready": engine._ready,
247
- "inference_exe_found": INFERENCE_EXE.exists(),
248
- "model_bin_found": MODEL_BIN.exists(),
249
- "model_size_mb": round(MODEL_BIN.stat().st_size/1e6, 1) if MODEL_BIN.exists() else None,
250
- "active_sessions": len(sessions),
251
- "memory_available_gb": round(mem.available/1e9, 2),
252
- "memory_used_pct": mem.percent,
253
- "uptime_seconds": round(uptime, 1),
254
  }
255
 
256
  @app.post("/chat")
257
  async def chat(req: ChatRequest):
258
- if not engine._ready:
259
- raise HTTPException(503, "Engine not ready. Check inference and model.bin.")
260
 
261
  sess = sessions.get(req.session_id)
262
  if sess is None:
@@ -264,7 +246,6 @@ async def chat(req: ChatRequest):
264
  sessions[req.session_id] = sess
265
 
266
  new_tokens = sess.new_turn_tokens(req.message)
267
-
268
  if sess.tokens_in_engine + len(new_tokens) + req.max_new_tokens > MAX_SESSION_TOKENS:
269
  await engine.reset_session(req.session_id)
270
  sess.tokens_in_engine = 0
@@ -277,10 +258,7 @@ async def chat(req: ChatRequest):
277
  response_parts = []
278
  t0 = time.time()
279
  try:
280
- async for chunk in engine.generate(
281
- req.session_id, new_tokens,
282
- req.max_new_tokens, req.temperature, req.top_k,
283
- ):
284
  if chunk["type"] == "token":
285
  response_parts.append(chunk["text"])
286
  joined = "".join(response_parts)
@@ -288,32 +266,24 @@ async def chat(req: ChatRequest):
288
  if hit_stop:
289
  for s in STOP_STRINGS[:-1]:
290
  idx = joined.find(f"\n{s}")
291
- if idx != -1:
292
- response_parts = [joined[:idx]]
293
  break
294
  yield f"data: {json.dumps(chunk)}\n\n"
295
  elif chunk["type"] == "done":
296
  reply = "".join(response_parts).strip()
297
  sess.append_assistant(reply)
298
  sess.tokens_in_engine += len(new_tokens) + chunk["total_tokens"]
299
- elapsed = (time.time() - t0) * 1000
300
  metrics["total_tokens"] += chunk["total_tokens"]
301
- metrics["total_ms"] += elapsed
302
  yield f"data: {json.dumps({**chunk, 'session_id': req.session_id, 'full_response': reply})}\n\n"
303
  elif chunk["type"] == "error":
304
- metrics["errors"] += 1
305
  yield f"data: {json.dumps(chunk)}\n\n"
306
  except Exception as e:
307
- metrics["errors"] += 1
308
  yield f"data: {json.dumps({'type':'error','message':str(e)})}\n\n"
309
  finally:
310
  yield "data: [DONE]\n\n"
311
 
312
- return StreamingResponse(
313
- event_stream(),
314
- media_type="text/event-stream",
315
- headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
316
- )
317
 
318
  @app.post("/chat/reset")
319
  async def reset_chat(req: ResetRequest):
@@ -324,96 +294,21 @@ async def reset_chat(req: ResetRequest):
324
  @app.get("/chat/history")
325
  async def get_history(session_id: str):
326
  sess = sessions.get(session_id)
327
- if not sess:
328
- return {"session_id": session_id, "turns": 0, "history": []}
329
- turns = len([m for m in sess.history if m["role"] == "user"])
330
- return {"session_id": session_id, "turns": turns,
331
- "tokens_in_engine": sess.tokens_in_engine,
332
- "history": sess.history}
333
-
334
- @app.post("/generate")
335
- async def generate(req: GenerateRequest):
336
- if not engine._ready:
337
- raise HTTPException(503, "Engine not ready.")
338
- token_ids = enc.encode_ordinary(req.prompt)
339
- tmp_sess = f"_gen_{uuid.uuid4().hex}"
340
- generated = []
341
- total_ms = 0.0
342
- async for chunk in engine.generate(tmp_sess, token_ids, req.max_tokens, req.temperature, req.top_k):
343
- if chunk["type"] == "token":
344
- generated.append(chunk["text"])
345
- elif chunk["type"] == "done":
346
- total_ms = chunk["total_ms"]
347
- elif chunk["type"] == "error":
348
- raise HTTPException(500, chunk["message"])
349
- await engine.reset_session(tmp_sess)
350
- text = "".join(generated)
351
- tps = len(generated) / (total_ms / 1000.0) if total_ms > 0 else 0
352
- return {
353
- "prompt": req.prompt, "generated_text": text,
354
- "tokens_in": len(token_ids), "tokens_out": len(generated),
355
- "latency_ms": round(total_ms, 2), "tokens_per_sec": round(tps, 2),
356
- }
357
 
358
  @app.get("/metrics")
359
  async def get_metrics():
360
- n = metrics["total_requests"]
361
- tok = metrics["total_tokens"]
362
- ms = metrics["total_ms"]
363
  mem = psutil.virtual_memory()
364
- proc = psutil.Process(os.getpid())
365
  return {
366
- "total_requests": n,
367
- "total_tokens": tok,
368
- "avg_tps": round(tok/(ms/1000), 2) if ms > 0 else 0,
369
- "avg_latency_ms": round(ms/n, 2) if n > 0 else 0,
370
- "errors": metrics["errors"],
371
- "active_sessions": len(sessions),
372
- "process_ram_mb": round(proc.memory_info().rss/1e6, 1),
373
  "system_ram_used_pct": mem.percent,
374
- "uptime_s": round(time.time()-metrics["start_time"], 1),
375
- }
376
-
377
- @app.get("/benchmark/run")
378
- async def benchmark_run():
379
- if not engine._ready:
380
- raise HTTPException(503, "Engine not ready.")
381
- prompts = [
382
- "What is artificial intelligence?",
383
- "How does a CPU work?",
384
- "Tell me something interesting.",
385
- "What are the benefits of exercise?",
386
- "How does photosynthesis work?",
387
- ]
388
- results = []
389
- for p in prompts:
390
- sid = f"_bench_{uuid.uuid4().hex}"
391
- toks = enc.encode_ordinary(f"{USER_TOKEN} {p}\n{ASST_TOKEN} ")
392
- gen = 0; total_ms = 0.0; ttft_ms = 0.0; first = True
393
- t0 = time.time()
394
- async for c in engine.generate(sid, toks, 80, 0.1, 1):
395
- if c["type"] == "token":
396
- gen += 1
397
- if first: ttft_ms = (time.time()-t0)*1000; first = False
398
- elif c["type"] == "done":
399
- total_ms = c["total_ms"]
400
- await engine.reset_session(sid)
401
- tps = gen/(total_ms/1000) if total_ms > 0 else 0
402
- results.append({
403
- "prompt_preview": p[:40],
404
- "tokens_in": len(toks),
405
- "tokens_out": gen,
406
- "ttft_ms": round(ttft_ms, 1),
407
- "total_ms": round(total_ms, 1),
408
- "tokens_per_sec": round(tps, 2),
409
- })
410
- avg_tps = sum(r["tokens_per_sec"] for r in results) / len(results)
411
- avg_ttft = sum(r["ttft_ms"] for r in results) / len(results)
412
- return {
413
- "summary": {"avg_tps": round(avg_tps, 2),
414
- "avg_ttft_ms": round(avg_ttft, 1),
415
- "runs": len(results)},
416
- "details": results,
417
  }
418
 
419
  if __name__ == "__main__":
 
1
  """
2
+ KVInfer — FastAPI Backend v2.3 (Memory & Sync Fixed)
 
 
 
 
 
 
 
 
 
3
  """
4
  import asyncio
5
  import json
 
15
  from fastapi.middleware.cors import CORSMiddleware
16
  from fastapi.responses import FileResponse, StreamingResponse
17
  from pydantic import BaseModel, Field
18
+ from huggingface_hub import hf_hub_download
19
 
20
  # ─────────────────────────────────────────────────────────────────────────
21
  # Config
 
25
  MODEL_BIN = BASE_DIR / "model.bin"
26
  TOKENIZER_BIN = BASE_DIR / "tokenizer.bin"
27
 
 
 
28
  HF_REPO_ID = "NOT-OMEGA/KVInfer-152M"
29
 
30
  SYSTEM_TOKEN = "System:"
 
35
  BLOCK_SIZE = 1024
36
  MAX_GEN_CEILING = 500
37
  SAFETY_MARGIN = 24
38
+ MAX_SESSION_TOKENS = BLOCK_SIZE - MAX_GEN_CEILING - SAFETY_MARGIN
39
 
40
  # ─────────────────────────────────────────────────────────────────────────
41
  # Tokenizer
 
89
  async with self._lock:
90
  self._proc.stdin.write(f"RESET|{session_id}\n".encode())
91
  await self._proc.stdin.drain()
92
+ while True:
93
+ raw = await self._proc.stdout.readline()
94
+ if not raw or raw.decode().strip() == "RESET_OK":
95
+ break
96
 
97
  async def generate(self, session_id, new_token_ids, max_new, temperature, top_k):
98
  if not self._ready or self._proc is None:
 
106
  async with self._lock:
107
  self._proc.stdin.write(cmd.encode())
108
  await self._proc.stdin.drain()
109
+ try:
110
+ while True:
111
+ raw = await self._proc.stdout.readline()
112
+ if not raw: break
113
+ line = raw.decode("utf-8", errors="replace").strip()
114
+ if not line: continue
115
+
116
+ if line.startswith("TOKEN"):
117
+ parts = line.split()
118
+ tid, ms = int(parts[1]), float(parts[2])
119
+ yield {"type": "token", "id": tid, "text": enc.decode([tid]), "elapsed_ms": ms}
120
+ elif line.startswith("DONE"):
121
+ parts = line.split()
122
+ total_t, total_ms = int(parts[1]), float(parts[2])
123
+ tps = round(total_t / (total_ms / 1000.0), 2) if total_ms > 0 else 0
124
+ yield {"type": "done", "total_tokens": total_t, "total_ms": total_ms, "tps": tps}
125
+ break
126
+ elif line.startswith("ERROR"):
127
+ yield {"type": "error", "message": line}
128
+ break
129
+ except asyncio.CancelledError:
130
+ # User disconnected, clear the pipe so engine doesn't hang!
131
+ while True:
132
+ raw = await self._proc.stdout.readline()
133
+ if not raw or raw.decode().strip().startswith(("DONE", "ERROR")):
134
+ break
135
+ raise
136
 
137
  engine = InferenceEngine()
138
 
139
  # ─────────────────────────────────────────────────────────────────────────
140
+ # Session State & Metrics
141
  # ─────────────────────────────────────────────────────────────────────────
142
  class SessionData:
143
  def __init__(self, system_prompt: str):
 
153
 
154
  def new_turn_tokens(self, user_msg):
155
  if self.tokens_in_engine == 0:
156
+ full = (f"{SYSTEM_TOKEN} {self.system_prompt}{SEP}{USER_TOKEN} {user_msg}{SEP}{ASST_TOKEN} ")
 
 
 
 
157
  return enc.encode_ordinary(full)
158
  else:
159
+ return enc.encode_ordinary(f"{USER_TOKEN} {user_msg}{SEP}{ASST_TOKEN} ")
 
160
 
161
  sessions = {}
 
162
  metrics = {
163
+ "total_requests": 0, "total_tokens": 0, "total_ms": 0.0, "errors": 0, "start_time": time.time(),
 
 
 
 
164
  }
165
 
166
  # ─────────────────────────────────────────────────────────────────────────
167
+ # Process RAM Helper (Gets Python + C++ RAM)
168
+ # ─────────────────────────────────────────────────────────────────────────
169
+ def get_total_ram_mb():
170
+ try:
171
+ proc = psutil.Process(os.getpid())
172
+ total_rss = proc.memory_info().rss
173
+ # Add C++ Engine Memory
174
+ if engine._proc and engine._proc.pid:
175
+ try:
176
+ child = psutil.Process(engine._proc.pid)
177
+ total_rss += child.memory_info().rss
178
+ except psutil.NoSuchProcess:
179
+ pass
180
+ return round(total_rss / 1e6, 1)
181
+ except:
182
+ return 0.0
183
+
184
+ # ─────────────────────────────────────────────────────────────────────────
185
+ # App + Lifespan
186
  # ─────────────────────────────────────────────────────────────────────────
187
  @asynccontextmanager
188
  async def lifespan(app: FastAPI):
 
189
  try:
190
  print("[HF HUB] Checking for model files...")
191
  if not MODEL_BIN.exists():
 
192
  hf_hub_download(repo_id=HF_REPO_ID, filename="model.bin", local_dir=str(BASE_DIR))
 
193
  if not TOKENIZER_BIN.exists():
 
194
  hf_hub_download(repo_id=HF_REPO_ID, filename="tokenizer.bin", local_dir=str(BASE_DIR))
195
  except Exception as e:
196
  print(f"[WARNING] Hugging Face Model download failed: {e}")
197
 
 
198
  try:
199
  await engine.start()
200
  except Exception as e:
201
  print(f"[WARNING] Could not start engine: {e}")
 
 
202
  yield
203
  await engine.stop()
204
 
205
+ app = FastAPI(title="KVInfer", version="2.3.0", lifespan=lifespan)
206
+ app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])
 
 
 
207
 
208
  # ─────────────────────────────────────────────────────────────────────────
209
+ # Routes
210
  # ─────────────────────────────────────────────────────────────────────────
211
  class ChatRequest(BaseModel):
212
+ message: str
213
+ session_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
214
+ system_prompt: str = "You are a helpful assistant."
215
+ max_new_tokens: int = Field(default=200, ge=1, le=500)
216
+ temperature: float = Field(default=0.7, ge=0.01, le=2.0)
217
+ top_k: int = Field(default=40, ge=1, le=200)
218
 
219
  class ResetRequest(BaseModel):
220
  session_id: str
221
 
 
 
 
 
 
 
 
222
  @app.get("/")
223
  async def serve_ui():
224
  return FileResponse(BASE_DIR / "index.html")
225
 
226
  @app.get("/health")
227
  async def health():
228
+ mem = psutil.virtual_memory()
229
  uptime = time.time() - metrics["start_time"]
230
  return {
231
+ "status": "ok" if engine._ready else "engine_loading",
232
+ "engine_ready": engine._ready,
233
+ "active_sessions": len(sessions),
234
+ "process_ram_mb": get_total_ram_mb(),
235
+ "memory_used_pct": mem.percent,
236
+ "uptime_seconds": round(uptime, 1),
 
 
 
237
  }
238
 
239
  @app.post("/chat")
240
  async def chat(req: ChatRequest):
241
+ if not engine._ready: raise HTTPException(503, "Engine not ready.")
 
242
 
243
  sess = sessions.get(req.session_id)
244
  if sess is None:
 
246
  sessions[req.session_id] = sess
247
 
248
  new_tokens = sess.new_turn_tokens(req.message)
 
249
  if sess.tokens_in_engine + len(new_tokens) + req.max_new_tokens > MAX_SESSION_TOKENS:
250
  await engine.reset_session(req.session_id)
251
  sess.tokens_in_engine = 0
 
258
  response_parts = []
259
  t0 = time.time()
260
  try:
261
+ async for chunk in engine.generate(req.session_id, new_tokens, req.max_new_tokens, req.temperature, req.top_k):
 
 
 
262
  if chunk["type"] == "token":
263
  response_parts.append(chunk["text"])
264
  joined = "".join(response_parts)
 
266
  if hit_stop:
267
  for s in STOP_STRINGS[:-1]:
268
  idx = joined.find(f"\n{s}")
269
+ if idx != -1: response_parts = [joined[:idx]]
 
270
  break
271
  yield f"data: {json.dumps(chunk)}\n\n"
272
  elif chunk["type"] == "done":
273
  reply = "".join(response_parts).strip()
274
  sess.append_assistant(reply)
275
  sess.tokens_in_engine += len(new_tokens) + chunk["total_tokens"]
 
276
  metrics["total_tokens"] += chunk["total_tokens"]
277
+ metrics["total_ms"] += (time.time() - t0) * 1000
278
  yield f"data: {json.dumps({**chunk, 'session_id': req.session_id, 'full_response': reply})}\n\n"
279
  elif chunk["type"] == "error":
 
280
  yield f"data: {json.dumps(chunk)}\n\n"
281
  except Exception as e:
 
282
  yield f"data: {json.dumps({'type':'error','message':str(e)})}\n\n"
283
  finally:
284
  yield "data: [DONE]\n\n"
285
 
286
+ return StreamingResponse(event_stream(), media_type="text/event-stream", headers={"Cache-Control": "no-cache"})
 
 
 
 
287
 
288
  @app.post("/chat/reset")
289
  async def reset_chat(req: ResetRequest):
 
294
  @app.get("/chat/history")
295
  async def get_history(session_id: str):
296
  sess = sessions.get(session_id)
297
+ if not sess: return {"session_id": session_id, "turns": 0, "history": []}
298
+ return {"session_id": session_id, "turns": len([m for m in sess.history if m["role"] == "user"]), "tokens_in_engine": sess.tokens_in_engine, "history": sess.history}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
299
 
300
  @app.get("/metrics")
301
  async def get_metrics():
302
+ n, tok, ms = metrics["total_requests"], metrics["total_tokens"], metrics["total_ms"]
 
 
303
  mem = psutil.virtual_memory()
 
304
  return {
305
+ "total_requests": n,
306
+ "total_tokens": tok,
307
+ "avg_tps": round(tok/(ms/1000), 2) if ms > 0 else 0,
308
+ "active_sessions": len(sessions),
309
+ "process_ram_mb": get_total_ram_mb(),
 
 
310
  "system_ram_used_pct": mem.percent,
311
+ "uptime_s": round(time.time()-metrics["start_time"], 1),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
312
  }
313
 
314
  if __name__ == "__main__":