Spaces:

NOT-OMEGA
/

KVInfer

Sleeping

App Files Files Community

NOT-OMEGA committed on Mar 20

Commit

28073f6

verified ·

1 Parent(s): 1fc8fbb

Update main.py

Browse files

Files changed (1) hide show

main.py +66 -209

main.py CHANGED Viewed

@@ -3,25 +3,17 @@ KVInfer — FastAPI Backend  v2.1
 ========================================
 Fixes applied:
   #1  Persistent C++ process — model loads ONCE at startup via lifespan.
-      All requests share one process via asyncio.Lock (serialized, no spawn overhead).
-  #2  O(n) token cache — each session stores which tokens have already been
-      sent to the C++ engine. New turns only encode + send NEW tokens.
-  #3  Session KV-cache reuse — C++ engine persists KV cache per session;
-      Python only sends the incremental new tokens each turn.
-  #4  Stop-token bleed fix — only EOS (50256) used as stop token since plain
-      text format ("User:") doesn't have a dedicated special token ID.
-  #7  Chat template format fixed to match actual SFT training format:
-      "System: ...\nUser: ...\nAssistant: " — NOT GPT-2 special angle tokens
-      which tiktoken would fragment into multiple pieces and the model never
-      saw during training.
 """
 import asyncio
 import json
 import os
 import time
 import uuid
-from collections import defaultdict
 from contextlib import asynccontextmanager
 from pathlib import Path
 from typing import AsyncGenerator
@@ -30,80 +22,47 @@ import psutil
 import tiktoken
 from fastapi import FastAPI, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
-from fastapi.responses import StreamingResponse
 from pydantic import BaseModel, Field
 # ─────────────────────────────────────────────────────────────────────────
 # Config
 # ─────────────────────────────────────────────────────────────────────────
 BASE_DIR      = Path(__file__).parent
 INFERENCE_EXE = BASE_DIR / "inference"
 MODEL_BIN     = BASE_DIR / "model.bin"
-# FIX #7 — Chat template MUST match your SFT training data format exactly.
-#
-# GPT-2 tiktoken has NO special tokens for <|system|>, <|user|>, <|assistant|>.
-# tiktoken breaks them into multiple fragments:
-#   "<|user|>"  →  [27, 91, 7220, 91, 29]  (5 separate tokens!)
-# Your SFT model NEVER saw these fragments during training → garbage output.
-#
-# Your SFT training used plain text format:
-#   "System: You are a helpful assistant.\n"
-#   "User: Hello\n"
-#   "Assistant: Hi\n"
-#
-# We MUST use the same format here.
 SYSTEM_TOKEN = "System:"
 USER_TOKEN   = "User:"
 ASST_TOKEN   = "Assistant:"
 SEP          = "\n"
-# Context limit: 1024 (block_size) - 500 (max generation) - 24 (safety margin)
-# This is the maximum tokens we allow in the KV cache before a soft reset.
-# Formula: block_size - max_new_tokens_ceiling - safety_margin
-BLOCK_SIZE        = 1024   # must match block_size config in model.bin
-MAX_GEN_CEILING   = 500    # max allowed by API (see ChatRequest)
-SAFETY_MARGIN     = 24     # newlines, role tokens, off-by-one buffer
 MAX_SESSION_TOKENS = BLOCK_SIZE - MAX_GEN_CEILING - SAFETY_MARGIN  # = 500
 # ─────────────────────────────────────────────────────────────────────────
 # Tokenizer
 # ─────────────────────────────────────────────────────────────────────────
-enc = tiktoken.get_encoding("gpt2")
-# Only EOS stop token needed. Plain text "User:" has no dedicated token ID
-# to stop on — the model was trained to emit 50256 at end of each reply.
 STOP_TOKEN_IDS = [50256]
-# String-level stop patterns — model may generate these as plain text since
-# training used plain "User:" / "System:" (not special tokens).
-# We catch them in the Python streaming loop before sending to the client.
-STOP_STRINGS = ["User:", "System:", "Assistant:"]
 # ─────────────────────────────────────────────────────────────────────────
-# Persistent Engine  (FIX #1)
 # ─────────────────────────────────────────────────────────────────────────
 class InferenceEngine:
-    """
-    Wraps one long-lived inference.exe process.
-    All requests are serialised through self._lock so the single
-    stdin/stdout pipe stays consistent.
-    """
     def __init__(self):
-        self._proc: asyncio.subprocess.Process | None = None
-        self._lock = asyncio.Lock()
         self._ready = False
     async def start(self):
         if not INFERENCE_EXE.exists():
-            raise RuntimeError(f"inference.exe not found at {INFERENCE_EXE}")
         if not MODEL_BIN.exists():
             raise RuntimeError(f"model.bin not found at {MODEL_BIN}")
         self._proc = await asyncio.create_subprocess_exec(
             str(INFERENCE_EXE),
             stdin=asyncio.subprocess.PIPE,
@@ -111,7 +70,6 @@ class InferenceEngine:
             stderr=asyncio.subprocess.DEVNULL,
             cwd=str(BASE_DIR),
         )
-        # Wait for READY signal (model loaded)
         while True:
             line = (await self._proc.stdout.readline()).decode().strip()
             if line == "READY":
@@ -132,101 +90,62 @@ class InferenceEngine:
     async def reset_session(self, session_id: str):
         async with self._lock:
-            cmd = f"RESET|{session_id}\n".encode()
-            self._proc.stdin.write(cmd)
             await self._proc.stdin.drain()
-            # read RESET_OK
             await self._proc.stdout.readline()
-    async def generate(
-        self,
-        session_id: str,
-        new_token_ids: list[int],
-        max_new: int,
-        temperature: float,
-        top_k: int,
-    ) -> AsyncGenerator[dict, None]:
-        """
-        Yields dicts:  {"type":"token","id":int,"text":str,"elapsed_ms":float}
-                       {"type":"done","total_tokens":int,"total_ms":float,"tps":float}
-                       {"type":"error","message":str}
-        """
         if not self._ready or self._proc is None:
             yield {"type": "error", "message": "Engine not ready"}
             return
         tokens_csv = ",".join(map(str, new_token_ids))
         stop_csv   = ",".join(map(str, STOP_TOKEN_IDS))
         cmd = f"REQUEST|{session_id}|{tokens_csv}|{max_new}|{temperature}|{top_k}|{stop_csv}\n"
         async with self._lock:
             self._proc.stdin.write(cmd.encode())
             await self._proc.stdin.drain()
-            gen_count = 0
             while True:
-                raw = await self._proc.stdout.readline()
                 line = raw.decode("utf-8", errors="replace").strip()
                 if not line:
                     continue
                 if line.startswith("TOKEN"):
                     parts = line.split()
                     tid   = int(parts[1])
                     ms    = float(parts[2])
-                    gen_count += 1
                     yield {"type": "token", "id": tid,
                            "text": enc.decode([tid]), "elapsed_ms": ms}
                 elif line.startswith("DONE"):
                     parts    = line.split()
                     total_t  = int(parts[1])
                     total_ms = float(parts[2])
-                    tps      = round(total_t / (total_ms / 1000.0), 2) if total_ms > 0 else 0
                     yield {"type": "done", "total_tokens": total_t,
                            "total_ms": total_ms, "tps": tps}
                     break
                 elif line.startswith("ERROR"):
                     yield {"type": "error", "message": line}
                     break
 engine = InferenceEngine()
 # ─────────────────────────────────────────────────────────────────────────
-# Session State  (FIX #2 + #3)
 # ─────────────────────────────────────────────────────────────────────────
 class SessionData:
-    """
-    Tracks what the C++ engine already knows for this session so we
-    only ever send NEW incremental tokens — O(1) per turn instead of O(n).
-    """
     def __init__(self, system_prompt: str):
-        self.system_prompt   = system_prompt
-        self.history: list[dict] = []           # {"role":..., "content":...}
-        self.tokens_in_engine: int = 0          # how many tokens C++ has processed
-        self.total_chars: int = 0
-    def append_user(self, content: str):
         self.history.append({"role": "user", "content": content})
-    def append_assistant(self, content: str):
         self.history.append({"role": "assistant", "content": content})
-    def new_turn_tokens(self, user_msg: str) -> list[int]:
-        """
-        Returns ONLY the token IDs the C++ engine has not seen yet.
-        Format matches EXACTLY what SFT training used:
-          System: <prompt>
-User: <msg>
-Assistant:
-        encode_ordinary() ensures tiktoken never interprets anything as
-        a special token (like <|endoftext|>) mid-prompt by accident.
-        """
         if self.tokens_in_engine == 0:
-            # First turn - send full context: system + first user message
             full = (
                 f"{SYSTEM_TOKEN} {self.system_prompt}{SEP}"
                 f"{USER_TOKEN} {user_msg}{SEP}"
@@ -234,27 +153,11 @@ Assistant:
             )
             return enc.encode_ordinary(full)
         else:
-            # Subsequent turns - engine already has prior context in KV cache.
-            # Only send new user message + assistant cue.
-            incremental = (
-                f"{USER_TOKEN} {user_msg}{SEP}"
-                f"{ASST_TOKEN} "
-            )
             return enc.encode_ordinary(incremental)
-    # NOTE: We intentionally do NOT re-encode the assistant reply to count tokens.
-    # chunk["total_tokens"] from C++ is the exact generated token count — using
-    # enc.encode_ordinary(reply) would re-tokenize decoded text and can differ
-    # due to BPE whitespace/boundary effects. C++ count is always ground truth.
-sessions: dict[str, SessionData] = {}
-# ─────────────────────────────────────────────────────────────────────────
-# Server Metrics
-# ─────────────────────────────────────────────────────────────────────────
-metrics = {
     "total_requests": 0,
     "total_tokens":   0,
     "total_ms":       0.0,
@@ -265,37 +168,25 @@ metrics = {
 # ─────────────────────────────────────────────────────────────────────────
 # App + Lifespan
 # ─────────────────────────────────────────────────────────────────────────
 @asynccontextmanager
 async def lifespan(app: FastAPI):
-    # Startup — launch C++ engine once
     try:
         await engine.start()
     except Exception as e:
         print(f"[WARNING] Could not start engine: {e}")
         print("[WARNING] Server will start but /chat will return 503 until engine is ready.")
     yield
-    # Shutdown
     await engine.stop()
-app = FastAPI(
-    title="KVInfer",
-    version="2.0.0",
-    lifespan=lifespan,
-)
 app.add_middleware(
     CORSMiddleware,
-    allow_origins=["*"],
-    allow_methods=["*"],
-    allow_headers=["*"],
 )
 # ─────────────────────────────────────────────────────────────────────────
 # Pydantic Models
 # ─────────────────────────────────────────────────────────────────────────
 class ChatRequest(BaseModel):
     message:        str
     session_id:     str   = Field(default_factory=lambda: str(uuid.uuid4()))
@@ -317,6 +208,12 @@ class GenerateRequest(BaseModel):
 # Routes
 # ─────────────────────────────────────────────────────────────────────────
 @app.get("/health")
 async def health():
     mem    = psutil.virtual_memory()
@@ -336,34 +233,23 @@ async def health():
 @app.post("/chat")
 async def chat(req: ChatRequest):
-    """SSE streaming chat — real-time token-by-token output."""
     if not engine._ready:
-        raise HTTPException(503, "Engine not ready. Check inference.exe and model.bin.")
-    # Get or create session
     sess = sessions.get(req.session_id)
     if sess is None:
         sess = SessionData(req.system_prompt)
         sessions[req.session_id] = sess
-    # FIX #2 — only encode NEW tokens (incremental)
     new_tokens = sess.new_turn_tokens(req.message)
-    # Guard: don't overflow context
     if sess.tokens_in_engine + len(new_tokens) + req.max_new_tokens > MAX_SESSION_TOKENS:
-        # Soft reset: clear C++ session KV cache, rebuild from full history
         await engine.reset_session(req.session_id)
         sess.tokens_in_engine = 0
-        # Re-encode as full prompt
         new_tokens = sess.new_turn_tokens(req.message)
     sess.append_user(req.message)
     metrics["total_requests"] += 1
     async def event_stream():
-        response_parts: list[str] = []
         t0 = time.time()
         try:
             async for chunk in engine.generate(
                 req.session_id, new_tokens,
@@ -371,40 +257,25 @@ async def chat(req: ChatRequest):
             ):
                 if chunk["type"] == "token":
                     response_parts.append(chunk["text"])
-                    # String-level stop detection (Fix #8).
-                    # The model was trained on plain "User:" text — it may
-                    # regenerate the next speaker role instead of stopping on EOS.
-                    # We catch this here before streaming the token to the client.
                     joined = "".join(response_parts)
-                    hit_stop = any(s in joined for s in STOP_STRINGS[:-1])  # User: / System:
-                    if hit_stop:
-                        # Trim the leaked role marker from the reply
                         for s in STOP_STRINGS[:-1]:
                             idx = joined.find(s)
                             if idx != -1:
                                 response_parts = [joined[:idx]]
                         break
                     yield f"data: {json.dumps(chunk)}\n\n"
                 elif chunk["type"] == "done":
                     reply = "".join(response_parts).strip()
                     sess.append_assistant(reply)
-                    # FIX #2 — update how many tokens the engine now holds
                     sess.tokens_in_engine += len(new_tokens) + chunk["total_tokens"]
                     elapsed = (time.time() - t0) * 1000
                     metrics["total_tokens"] += chunk["total_tokens"]
                     metrics["total_ms"]     += elapsed
                     yield f"data: {json.dumps({**chunk, 'session_id': req.session_id, 'full_response': reply})}\n\n"
                 elif chunk["type"] == "error":
                     metrics["errors"] += 1
                     yield f"data: {json.dumps(chunk)}\n\n"
         except Exception as e:
             metrics["errors"] += 1
             yield f"data: {json.dumps({'type':'error','message':str(e)})}\n\n"
@@ -438,30 +309,22 @@ async def get_history(session_id: str):
 @app.post("/generate")
 async def generate(req: GenerateRequest):
-    """Non-streaming single generation (backward-compat)."""
     if not engine._ready:
         raise HTTPException(503, "Engine not ready.")
     token_ids = enc.encode_ordinary(req.prompt)
     tmp_sess  = f"_gen_{uuid.uuid4().hex}"
-    generated: list[str] = []
-    total_ms = 0.0
-    async for chunk in engine.generate(
-        tmp_sess, token_ids, req.max_tokens, req.temperature, req.top_k
-    ):
         if chunk["type"] == "token":
             generated.append(chunk["text"])
         elif chunk["type"] == "done":
             total_ms = chunk["total_ms"]
         elif chunk["type"] == "error":
             raise HTTPException(500, chunk["message"])
-    # Clean up temp session from C++ engine
     await engine.reset_session(tmp_sess)
     text = "".join(generated)
     tps  = len(generated) / (total_ms / 1000.0) if total_ms > 0 else 0
     return {
         "prompt": req.prompt, "generated_text": text,
         "tokens_in": len(token_ids), "tokens_out": len(generated),
@@ -477,24 +340,22 @@ async def get_metrics():
     mem = psutil.virtual_memory()
     proc = psutil.Process(os.getpid())
     return {
-        "total_requests":       n,
-        "total_tokens":         tok,
-        "avg_tps":              round(tok/(ms/1000),2) if ms>0 else 0,
-        "avg_latency_ms":       round(ms/n,2) if n>0 else 0,
-        "errors":               metrics["errors"],
-        "active_sessions":      len(sessions),
-        "process_ram_mb":       round(proc.memory_info().rss/1e6,1),
-        "system_ram_used_pct":  mem.percent,
-        "uptime_s":             round(time.time()-metrics["start_time"],1),
     }
 @app.get("/benchmark/run")
 async def benchmark_run():
-    """Quick 5-prompt internal benchmark (used by frontend modal)."""
     if not engine._ready:
         raise HTTPException(503, "Engine not ready.")
     prompts = [
         "What is artificial intelligence?",
         "How does a CPU work?",
@@ -503,36 +364,32 @@ async def benchmark_run():
         "How does photosynthesis work?",
     ]
     results = []
     for p in prompts:
         sid  = f"_bench_{uuid.uuid4().hex}"
         toks = enc.encode_ordinary(f"{USER_TOKEN} {p}\n{ASST_TOKEN} ")
-        gen  = 0;  total_ms = 0.0;  ttft_ms = 0.0;  first = True
         t0   = time.time()
         async for c in engine.generate(sid, toks, 80, 0.7, 40):
             if c["type"] == "token":
                 gen += 1
-                if first: ttft_ms = (time.time()-t0)*1000; first=False
             elif c["type"] == "done":
                 total_ms = c["total_ms"]
         await engine.reset_session(sid)
-        tps = gen/(total_ms/1000) if total_ms>0 else 0
         results.append({
-            "prompt_preview": p[:40],
-            "tokens_in":  len(toks),
-            "tokens_out": gen,
-            "ttft_ms":    round(ttft_ms,1),
-            "total_ms":   round(total_ms,1),
-            "tokens_per_sec": round(tps,2),
         })
-    avg_tps  = sum(r["tokens_per_sec"] for r in results)/len(results)
-    avg_ttft = sum(r["ttft_ms"] for r in results)/len(results)
     return {
-        "summary": {"avg_tps": round(avg_tps,2),
-                    "avg_ttft_ms": round(avg_ttft,1),
                     "runs": len(results)},
         "details": results,
     }
@@ -540,4 +397,4 @@ async def benchmark_run():
 if __name__ == "__main__":
     import uvicorn
-    uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=False)

 ========================================
 Fixes applied:
   #1  Persistent C++ process — model loads ONCE at startup via lifespan.
+  #2  O(n) token cache — incremental tokens only per turn.
+  #3  Session KV-cache reuse.
+  #4  Stop-token bleed fix.
+  #7  Chat template format fixed to match SFT training format.
+  #HF Serves index.html at "/" for HF Spaces Docker deployment.
 """
 import asyncio
 import json
 import os
 import time
 import uuid
 from contextlib import asynccontextmanager
 from pathlib import Path
 from typing import AsyncGenerator
 import tiktoken
 from fastapi import FastAPI, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import FileResponse, StreamingResponse
 from pydantic import BaseModel, Field
 # ─────────────────────────────────────────────────────────────────────────
 # Config
 # ─────────────────────────────────────────────────────────────────────────
 BASE_DIR      = Path(__file__).parent
 INFERENCE_EXE = BASE_DIR / "inference"
 MODEL_BIN     = BASE_DIR / "model.bin"
 SYSTEM_TOKEN = "System:"
 USER_TOKEN   = "User:"
 ASST_TOKEN   = "Assistant:"
 SEP          = "\n"
+BLOCK_SIZE         = 1024
+MAX_GEN_CEILING    = 500
+SAFETY_MARGIN      = 24
 MAX_SESSION_TOKENS = BLOCK_SIZE - MAX_GEN_CEILING - SAFETY_MARGIN  # = 500
 # ─────────────────────────────────────────────────────────────────────────
 # Tokenizer
 # ─────────────────────────────────────────────────────────────────────────
+enc            = tiktoken.get_encoding("gpt2")
 STOP_TOKEN_IDS = [50256]
+STOP_STRINGS   = ["User:", "System:", "Assistant:"]
 # ─────────────────────────────────────────────────────────────────────────
+# Persistent Engine
 # ─────────────────────────────────────────────────────────────────────────
 class InferenceEngine:
     def __init__(self):
+        self._proc  = None
+        self._lock  = asyncio.Lock()
         self._ready = False
     async def start(self):
         if not INFERENCE_EXE.exists():
+            raise RuntimeError(f"inference not found at {INFERENCE_EXE}")
         if not MODEL_BIN.exists():
             raise RuntimeError(f"model.bin not found at {MODEL_BIN}")
         self._proc = await asyncio.create_subprocess_exec(
             str(INFERENCE_EXE),
             stdin=asyncio.subprocess.PIPE,
             stderr=asyncio.subprocess.DEVNULL,
             cwd=str(BASE_DIR),
         )
         while True:
             line = (await self._proc.stdout.readline()).decode().strip()
             if line == "READY":
     async def reset_session(self, session_id: str):
         async with self._lock:
+            self._proc.stdin.write(f"RESET|{session_id}\n".encode())
             await self._proc.stdin.drain()
             await self._proc.stdout.readline()
+    async def generate(self, session_id, new_token_ids, max_new, temperature, top_k):
         if not self._ready or self._proc is None:
             yield {"type": "error", "message": "Engine not ready"}
             return
         tokens_csv = ",".join(map(str, new_token_ids))
         stop_csv   = ",".join(map(str, STOP_TOKEN_IDS))
         cmd = f"REQUEST|{session_id}|{tokens_csv}|{max_new}|{temperature}|{top_k}|{stop_csv}\n"
         async with self._lock:
             self._proc.stdin.write(cmd.encode())
             await self._proc.stdin.drain()
             while True:
+                raw  = await self._proc.stdout.readline()
                 line = raw.decode("utf-8", errors="replace").strip()
                 if not line:
                     continue
                 if line.startswith("TOKEN"):
                     parts = line.split()
                     tid   = int(parts[1])
                     ms    = float(parts[2])
                     yield {"type": "token", "id": tid,
                            "text": enc.decode([tid]), "elapsed_ms": ms}
                 elif line.startswith("DONE"):
                     parts    = line.split()
                     total_t  = int(parts[1])
                     total_ms = float(parts[2])
+                    tps = round(total_t / (total_ms / 1000.0), 2) if total_ms > 0 else 0
                     yield {"type": "done", "total_tokens": total_t,
                            "total_ms": total_ms, "tps": tps}
                     break
                 elif line.startswith("ERROR"):
                     yield {"type": "error", "message": line}
                     break
 engine = InferenceEngine()
 # ─────────────────────────────────────────────────────────────────────────
+# Session State
 # ─────────────────────────────────────────────────────────────────────────
 class SessionData:
     def __init__(self, system_prompt: str):
+        self.system_prompt    = system_prompt
+        self.history          = []
+        self.tokens_in_engine = 0
+    def append_user(self, content):
         self.history.append({"role": "user", "content": content})
+    def append_assistant(self, content):
         self.history.append({"role": "assistant", "content": content})
+    def new_turn_tokens(self, user_msg):
         if self.tokens_in_engine == 0:
             full = (
                 f"{SYSTEM_TOKEN} {self.system_prompt}{SEP}"
                 f"{USER_TOKEN} {user_msg}{SEP}"
             )
             return enc.encode_ordinary(full)
         else:
+            incremental = f"{USER_TOKEN} {user_msg}{SEP}{ASST_TOKEN} "
             return enc.encode_ordinary(incremental)
+sessions = {}
+metrics  = {
     "total_requests": 0,
     "total_tokens":   0,
     "total_ms":       0.0,
 # ─────────────────────────────────────────────────────────────────────────
 # App + Lifespan
 # ─────────────────────────────────────────────────────────────────────────
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     try:
         await engine.start()
     except Exception as e:
         print(f"[WARNING] Could not start engine: {e}")
         print("[WARNING] Server will start but /chat will return 503 until engine is ready.")
     yield
     await engine.stop()
+app = FastAPI(title="KVInfer", version="2.1.0", lifespan=lifespan)
 app.add_middleware(
     CORSMiddleware,
+    allow_origins=["*"], allow_methods=["*"], allow_headers=["*"],
 )
 # ─────────────────────────────────────────────────────────────────────────
 # Pydantic Models
 # ─────────────────────────────────────────────────────────────────────────
 class ChatRequest(BaseModel):
     message:        str
     session_id:     str   = Field(default_factory=lambda: str(uuid.uuid4()))
 # Routes
 # ─────────────────────────────────────────────────────────────────────────
+@app.get("/")
+async def serve_ui():
+    """Serve the Chat UI — required for HF Spaces Docker deployment."""
+    return FileResponse(BASE_DIR / "index.html")
 @app.get("/health")
 async def health():
     mem    = psutil.virtual_memory()
 @app.post("/chat")
 async def chat(req: ChatRequest):
     if not engine._ready:
+        raise HTTPException(503, "Engine not ready. Check inference and model.bin.")
     sess = sessions.get(req.session_id)
     if sess is None:
         sess = SessionData(req.system_prompt)
         sessions[req.session_id] = sess
     new_tokens = sess.new_turn_tokens(req.message)
     if sess.tokens_in_engine + len(new_tokens) + req.max_new_tokens > MAX_SESSION_TOKENS:
         await engine.reset_session(req.session_id)
         sess.tokens_in_engine = 0
         new_tokens = sess.new_turn_tokens(req.message)
     sess.append_user(req.message)
     metrics["total_requests"] += 1
     async def event_stream():
+        response_parts = []
         t0 = time.time()
         try:
             async for chunk in engine.generate(
                 req.session_id, new_tokens,
             ):
                 if chunk["type"] == "token":
                     response_parts.append(chunk["text"])
                     joined = "".join(response_parts)
+                    if any(s in joined for s in STOP_STRINGS[:-1]):
                         for s in STOP_STRINGS[:-1]:
                             idx = joined.find(s)
                             if idx != -1:
                                 response_parts = [joined[:idx]]
                         break
                     yield f"data: {json.dumps(chunk)}\n\n"
                 elif chunk["type"] == "done":
                     reply = "".join(response_parts).strip()
                     sess.append_assistant(reply)
                     sess.tokens_in_engine += len(new_tokens) + chunk["total_tokens"]
                     elapsed = (time.time() - t0) * 1000
                     metrics["total_tokens"] += chunk["total_tokens"]
                     metrics["total_ms"]     += elapsed
                     yield f"data: {json.dumps({**chunk, 'session_id': req.session_id, 'full_response': reply})}\n\n"
                 elif chunk["type"] == "error":
                     metrics["errors"] += 1
                     yield f"data: {json.dumps(chunk)}\n\n"
         except Exception as e:
             metrics["errors"] += 1
             yield f"data: {json.dumps({'type':'error','message':str(e)})}\n\n"
 @app.post("/generate")
 async def generate(req: GenerateRequest):
     if not engine._ready:
         raise HTTPException(503, "Engine not ready.")
     token_ids = enc.encode_ordinary(req.prompt)
     tmp_sess  = f"_gen_{uuid.uuid4().hex}"
+    generated = []
+    total_ms  = 0.0
+    async for chunk in engine.generate(tmp_sess, token_ids, req.max_tokens, req.temperature, req.top_k):
         if chunk["type"] == "token":
             generated.append(chunk["text"])
         elif chunk["type"] == "done":
             total_ms = chunk["total_ms"]
         elif chunk["type"] == "error":
             raise HTTPException(500, chunk["message"])
     await engine.reset_session(tmp_sess)
     text = "".join(generated)
     tps  = len(generated) / (total_ms / 1000.0) if total_ms > 0 else 0
     return {
         "prompt": req.prompt, "generated_text": text,
         "tokens_in": len(token_ids), "tokens_out": len(generated),
     mem = psutil.virtual_memory()
     proc = psutil.Process(os.getpid())
     return {
+        "total_requests":      n,
+        "total_tokens":        tok,
+        "avg_tps":             round(tok/(ms/1000), 2) if ms > 0 else 0,
+        "avg_latency_ms":      round(ms/n, 2)          if n > 0  else 0,
+        "errors":              metrics["errors"],
+        "active_sessions":     len(sessions),
+        "process_ram_mb":      round(proc.memory_info().rss/1e6, 1),
+        "system_ram_used_pct": mem.percent,
+        "uptime_s":            round(time.time()-metrics["start_time"], 1),
     }
 @app.get("/benchmark/run")
 async def benchmark_run():
     if not engine._ready:
         raise HTTPException(503, "Engine not ready.")
     prompts = [
         "What is artificial intelligence?",
         "How does a CPU work?",
         "How does photosynthesis work?",
     ]
     results = []
     for p in prompts:
         sid  = f"_bench_{uuid.uuid4().hex}"
         toks = enc.encode_ordinary(f"{USER_TOKEN} {p}\n{ASST_TOKEN} ")
+        gen  = 0; total_ms = 0.0; ttft_ms = 0.0; first = True
         t0   = time.time()
         async for c in engine.generate(sid, toks, 80, 0.7, 40):
             if c["type"] == "token":
                 gen += 1
+                if first: ttft_ms = (time.time()-t0)*1000; first = False
             elif c["type"] == "done":
                 total_ms = c["total_ms"]
         await engine.reset_session(sid)
+        tps = gen/(total_ms/1000) if total_ms > 0 else 0
         results.append({
+            "prompt_preview":  p[:40],
+            "tokens_in":       len(toks),
+            "tokens_out":      gen,
+            "ttft_ms":         round(ttft_ms, 1),
+            "total_ms":        round(total_ms, 1),
+            "tokens_per_sec":  round(tps, 2),
         })
+    avg_tps  = sum(r["tokens_per_sec"] for r in results) / len(results)
+    avg_ttft = sum(r["ttft_ms"]        for r in results) / len(results)
     return {
+        "summary": {"avg_tps": round(avg_tps, 2),
+                    "avg_ttft_ms": round(avg_ttft, 1),
                     "runs": len(results)},
         "details": results,
     }
 if __name__ == "__main__":
     import uvicorn
+    uvicorn.run("main:app", host="0.0.0.0", port=7860, reload=False)