Update main.py
main.py CHANGED
@@ -1,14 +1,5 @@
 """
-KVInfer - FastAPI Backend v2.
-========================================
-Fixes applied:
-  #1 Persistent C++ process - model loads ONCE at startup via lifespan.
-  #2 O(n) token cache - incremental tokens only per turn.
-  #3 Session KV-cache reuse.
-  #4 Stop-token bleed fix.
-  #7 Chat template format fixed to match SFT training format.
-  #HF Serves index.html at "/" for HF Spaces Docker deployment.
-  #HF Automatically downloads model.bin & tokenizer.bin from HF Hub.
+KVInfer - FastAPI Backend v2.3 (Memory & Sync Fixed)
 """
 import asyncio
 import json
@@ -24,7 +15,7 @@ from fastapi import FastAPI, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import FileResponse, StreamingResponse
 from pydantic import BaseModel, Field
-from huggingface_hub import hf_hub_download
+from huggingface_hub import hf_hub_download
 
 # ─────────────────────────────────────────────────────────────────────────
 # Config
@@ -34,8 +25,6 @@ INFERENCE_EXE = BASE_DIR / "inference"
 MODEL_BIN = BASE_DIR / "model.bin"
 TOKENIZER_BIN = BASE_DIR / "tokenizer.bin"
 
-# ⚠️ PUT YOUR HUGGING FACE REPO ID HERE ⚠️
-# Example: "Sumeet/KVInfer-152M"
 HF_REPO_ID = "NOT-OMEGA/KVInfer-152M"
 
 SYSTEM_TOKEN = "System:"
@@ -46,7 +35,7 @@ SEP = "\n"
 BLOCK_SIZE = 1024
 MAX_GEN_CEILING = 500
 SAFETY_MARGIN = 24
-MAX_SESSION_TOKENS = BLOCK_SIZE - MAX_GEN_CEILING - SAFETY_MARGIN
+MAX_SESSION_TOKENS = BLOCK_SIZE - MAX_GEN_CEILING - SAFETY_MARGIN
 
 # ─────────────────────────────────────────────────────────────────────────
 # Tokenizer
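With the constants above, the budget is concrete: MAX_SESSION_TOKENS = 1024 - 500 - 24 = 500, i.e. a session may keep at most 500 prompt-side tokens in the engine before the overflow check in /chat resets the KV cache.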
@@ -100,7 +89,10 @@ class InferenceEngine:
         async with self._lock:
             self._proc.stdin.write(f"RESET|{session_id}\n".encode())
             await self._proc.stdin.drain()
-
+            while True:
+                raw = await self._proc.stdout.readline()
+                if not raw or raw.decode().strip() == "RESET_OK":
+                    break
 
     async def generate(self, session_id, new_token_ids, max_new, temperature, top_k):
         if not self._ready or self._proc is None:
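The new handshake assumes the C++ `inference` binary acknowledges every `RESET|<session_id>` command with a literal `RESET_OK` line on stdout, and (per the generate loop in the next hunk) emits `TOKEN <id> <ms>`, `DONE <total> <ms>`, and `ERROR ...` lines while generating. A throwaway stand-in that speaks this line protocol is enough to exercise the backend without the compiled binary; the sketch below is hypothetical (the exact generate-command format is not visible in this diff, so anything other than a RESET is treated as a generate request):

# mock_engine.py - hypothetical stdin/stdout stand-in for the C++ binary,
# based only on the protocol strings visible in this diff.
import sys
import time

for raw in sys.stdin:
    cmd = raw.strip()
    if not cmd:
        continue
    if cmd.startswith("RESET|"):
        print("RESET_OK", flush=True)          # ack expected by reset_session()
    else:
        t0 = time.time()
        for tok_id in (1, 2, 3):               # arbitrary token ids
            time.sleep(0.01)
            print(f"TOKEN {tok_id} {(time.time() - t0) * 1000:.2f}", flush=True)
        print(f"DONE 3 {(time.time() - t0) * 1000:.2f}", flush=True)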
@@ -114,35 +106,38 @@ class InferenceEngine:
         async with self._lock:
             self._proc.stdin.write(cmd.encode())
             await self._proc.stdin.drain()
-
-
-
-
-
-                    continue
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            try:
+                while True:
+                    raw = await self._proc.stdout.readline()
+                    if not raw: break
+                    line = raw.decode("utf-8", errors="replace").strip()
+                    if not line: continue
+
+                    if line.startswith("TOKEN"):
+                        parts = line.split()
+                        tid, ms = int(parts[1]), float(parts[2])
+                        yield {"type": "token", "id": tid, "text": enc.decode([tid]), "elapsed_ms": ms}
+                    elif line.startswith("DONE"):
+                        parts = line.split()
+                        total_t, total_ms = int(parts[1]), float(parts[2])
+                        tps = round(total_t / (total_ms / 1000.0), 2) if total_ms > 0 else 0
+                        yield {"type": "done", "total_tokens": total_t, "total_ms": total_ms, "tps": tps}
+                        break
+                    elif line.startswith("ERROR"):
+                        yield {"type": "error", "message": line}
+                        break
+            except asyncio.CancelledError:
+                # User disconnected, clear the pipe so engine doesn't hang!
+                while True:
+                    raw = await self._proc.stdout.readline()
+                    if not raw or raw.decode().strip().startswith(("DONE", "ERROR")):
+                        break
+                raise
 
 engine = InferenceEngine()
 
 # ─────────────────────────────────────────────────────────────────────────
-# Session State
+# Session State & Metrics
 # ─────────────────────────────────────────────────────────────────────────
 class SessionData:
     def __init__(self, system_prompt: str):
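The rewritten loop turns the engine into an async iterator of dicts whose "type" is "token", "done", or "error"; on client disconnect it now drains the pipe up to the next DONE/ERROR marker so the single locked stdout stream stays aligned for the following request. A minimal sketch of consuming it directly, assuming the module's own `engine`, `enc`, `USER_TOKEN`, `ASST_TOKEN`, and `SEP`, and an already-started engine:

import asyncio

async def demo():
    ids = enc.encode_ordinary(f"{USER_TOKEN} hello{SEP}{ASST_TOKEN} ")
    async for chunk in engine.generate("demo-session", ids, 32, 0.7, 40):
        if chunk["type"] == "token":
            print(chunk["text"], end="", flush=True)
        elif chunk["type"] == "done":
            print(f"\n[{chunk['total_tokens']} tokens @ {chunk['tps']} tok/s]")
        elif chunk["type"] == "error":
            print(chunk["message"])

# asyncio.run(demo())  # run after `await engine.start()` has completed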
@@ -158,105 +153,92 @@ class SessionData:
 
     def new_turn_tokens(self, user_msg):
         if self.tokens_in_engine == 0:
-            full = (
-                f"{SYSTEM_TOKEN} {self.system_prompt}{SEP}"
-                f"{USER_TOKEN} {user_msg}{SEP}"
-                f"{ASST_TOKEN} "
-            )
+            full = (f"{SYSTEM_TOKEN} {self.system_prompt}{SEP}{USER_TOKEN} {user_msg}{SEP}{ASST_TOKEN} ")
             return enc.encode_ordinary(full)
         else:
-            incremental = f"{USER_TOKEN} {user_msg}{SEP}{ASST_TOKEN} "
-            return enc.encode_ordinary(incremental)
+            return enc.encode_ordinary(f"{USER_TOKEN} {user_msg}{SEP}{ASST_TOKEN} ")
 
 sessions = {}
-
 metrics = {
-    "total_requests": 0,
-    "total_tokens": 0,
-    "total_ms": 0.0,
-    "errors": 0,
-    "start_time": time.time(),
+    "total_requests": 0, "total_tokens": 0, "total_ms": 0.0, "errors": 0, "start_time": time.time(),
 }
 
 # ─────────────────────────────────────────────────────────────────────────
-#
+# Process RAM Helper (Gets Python + C++ RAM)
+# ─────────────────────────────────────────────────────────────────────────
+def get_total_ram_mb():
+    try:
+        proc = psutil.Process(os.getpid())
+        total_rss = proc.memory_info().rss
+        # Add C++ Engine Memory
+        if engine._proc and engine._proc.pid:
+            try:
+                child = psutil.Process(engine._proc.pid)
+                total_rss += child.memory_info().rss
+            except psutil.NoSuchProcess:
+                pass
+        return round(total_rss / 1e6, 1)
+    except Exception:
+        return 0.0
+
+# ─────────────────────────────────────────────────────────────────────────
+# App + Lifespan
 # ─────────────────────────────────────────────────────────────────────────
 @asynccontextmanager
 async def lifespan(app: FastAPI):
-    # 1. Download Model and Tokenizer automatically if missing
     try:
         print("[HF HUB] Checking for model files...")
         if not MODEL_BIN.exists():
-            print(f"[HF HUB] Downloading model.bin from {HF_REPO_ID}...")
             hf_hub_download(repo_id=HF_REPO_ID, filename="model.bin", local_dir=str(BASE_DIR))
-
         if not TOKENIZER_BIN.exists():
-            print(f"[HF HUB] Downloading tokenizer.bin from {HF_REPO_ID}...")
             hf_hub_download(repo_id=HF_REPO_ID, filename="tokenizer.bin", local_dir=str(BASE_DIR))
     except Exception as e:
         print(f"[WARNING] Hugging Face Model download failed: {e}")
 
-    # 2. Start the Inference Engine
     try:
         await engine.start()
     except Exception as e:
         print(f"[WARNING] Could not start engine: {e}")
-        print("[WARNING] Server will start but /chat will return 503 until engine is ready.")
-
     yield
     await engine.stop()
 
-app = FastAPI(title="KVInfer", version="2.
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"], allow_methods=["*"], allow_headers=["*"],
-)
+app = FastAPI(title="KVInfer", version="2.3.0", lifespan=lifespan)
+app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])
 
 # ─────────────────────────────────────────────────────────────────────────
-#
+# Routes
 # ─────────────────────────────────────────────────────────────────────────
 class ChatRequest(BaseModel):
-    message:
-    session_id:
-    system_prompt:
-    max_new_tokens: int
-    temperature:
-    top_k:
+    message: str
+    session_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
+    system_prompt: str = "You are a helpful assistant."
+    max_new_tokens: int = Field(default=200, ge=1, le=500)
+    temperature: float = Field(default=0.7, ge=0.01, le=2.0)
+    top_k: int = Field(default=40, ge=1, le=200)
 
 class ResetRequest(BaseModel):
     session_id: str
 
-class GenerateRequest(BaseModel):
-    prompt: str
-    max_tokens: int = Field(default=100, ge=1, le=500)
-    temperature: float = Field(default=0.7, ge=0.01, le=2.0)
-    top_k: int = Field(default=40, ge=1, le=200)
-
-
 @app.get("/")
 async def serve_ui():
     return FileResponse(BASE_DIR / "index.html")
 
 @app.get("/health")
 async def health():
-    mem = psutil.virtual_memory()
+    mem = psutil.virtual_memory()
     uptime = time.time() - metrics["start_time"]
     return {
-        "status":
-        "engine_ready":
-        "
-        "
-        "
-        "
-        "memory_available_gb": round(mem.available/1e9, 2),
-        "memory_used_pct": mem.percent,
-        "uptime_seconds": round(uptime, 1),
+        "status": "ok" if engine._ready else "engine_loading",
+        "engine_ready": engine._ready,
+        "active_sessions": len(sessions),
+        "process_ram_mb": get_total_ram_mb(),
+        "memory_used_pct": mem.percent,
+        "uptime_seconds": round(uptime, 1),
     }
 
 @app.post("/chat")
 async def chat(req: ChatRequest):
-    if not engine._ready:
-        raise HTTPException(503, "Engine not ready. Check inference and model.bin.")
+    if not engine._ready: raise HTTPException(503, "Engine not ready.")
 
     sess = sessions.get(req.session_id)
     if sess is None:
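`new_turn_tokens` is the incremental-tokenization fix (#2 in the old docstring): the first turn encodes system prompt + user message + assistant header, and every later turn encodes only the new suffix, since the earlier tokens already sit in the engine's KV cache. A sketch of the two cases, assuming USER_TOKEN and ASST_TOKEN render as "User:" and "Assistant:" (their definitions are outside this diff):

sd = SessionData("You are a helpful assistant.")

# Turn 1 - tokens_in_engine == 0, so the full template is encoded:
#   "System: You are a helpful assistant.\nUser: hi\nAssistant: "
first = sd.new_turn_tokens("hi")

# /chat advances tokens_in_engine after generation, so...
sd.tokens_in_engine = len(first) + 42      # illustrative count

# Turn 2 - only the incremental suffix is encoded:
#   "User: and then?\nAssistant: "
later = sd.new_turn_tokens("and then?")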
@@ -264,7 +246,6 @@ async def chat(req: ChatRequest):
         sessions[req.session_id] = sess
 
     new_tokens = sess.new_turn_tokens(req.message)
-
    if sess.tokens_in_engine + len(new_tokens) + req.max_new_tokens > MAX_SESSION_TOKENS:
         await engine.reset_session(req.session_id)
         sess.tokens_in_engine = 0
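For example, with the 500-token budget computed above, a session already holding 350 tokens that receives an 80-token turn with max_new_tokens=100 trips the check (350 + 80 + 100 = 530 > 500), so the engine-side cache for that session is reset and tokens_in_engine cleared before generating.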
@@ -277,10 +258,7 @@
         response_parts = []
         t0 = time.time()
         try:
-            async for chunk in engine.generate(
-                req.session_id, new_tokens,
-                req.max_new_tokens, req.temperature, req.top_k,
-            ):
+            async for chunk in engine.generate(req.session_id, new_tokens, req.max_new_tokens, req.temperature, req.top_k):
                 if chunk["type"] == "token":
                     response_parts.append(chunk["text"])
                     joined = "".join(response_parts)
@@ -288,32 +266,24 @@
                     if hit_stop:
                         for s in STOP_STRINGS[:-1]:
                             idx = joined.find(f"\n{s}")
-                            if idx != -1:
-                                response_parts = [joined[:idx]]
+                            if idx != -1: response_parts = [joined[:idx]]
                             break
                     yield f"data: {json.dumps(chunk)}\n\n"
                 elif chunk["type"] == "done":
                     reply = "".join(response_parts).strip()
                     sess.append_assistant(reply)
                     sess.tokens_in_engine += len(new_tokens) + chunk["total_tokens"]
-                    elapsed = (time.time() - t0) * 1000
                     metrics["total_tokens"] += chunk["total_tokens"]
-                    metrics["total_ms"] += elapsed
+                    metrics["total_ms"] += (time.time() - t0) * 1000
                     yield f"data: {json.dumps({**chunk, 'session_id': req.session_id, 'full_response': reply})}\n\n"
                 elif chunk["type"] == "error":
-                    metrics["errors"] += 1
                     yield f"data: {json.dumps(chunk)}\n\n"
         except Exception as e:
-            metrics["errors"] += 1
             yield f"data: {json.dumps({'type':'error','message':str(e)})}\n\n"
         finally:
             yield "data: [DONE]\n\n"
 
-    return StreamingResponse(
-        event_stream(),
-        media_type="text/event-stream",
-        headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
-    )
+    return StreamingResponse(event_stream(), media_type="text/event-stream", headers={"Cache-Control": "no-cache"})
 
 @app.post("/chat/reset")
 async def reset_chat(req: ResetRequest):
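/chat streams Server-Sent Events: one `data: <json>` line per chunk, terminated by a literal `data: [DONE]`. A minimal client sketch; the base URL is an assumption (the host and port are not part of this diff):

import json
import requests

def chat(message, session_id="cli-demo", base="http://localhost:7860"):
    payload = {"message": message, "session_id": session_id}
    with requests.post(f"{base}/chat", json=payload, stream=True) as r:
        r.raise_for_status()
        for raw in r.iter_lines(decode_unicode=True):
            if not raw or not raw.startswith("data: "):
                continue
            data = raw[len("data: "):]
            if data == "[DONE]":
                break
            chunk = json.loads(data)
            if chunk["type"] == "token":
                print(chunk["text"], end="", flush=True)
    print()

chat("What is KV caching?")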
@@ -324,96 +294,21 @@ async def reset_chat(req: ResetRequest):
 @app.get("/chat/history")
 async def get_history(session_id: str):
     sess = sessions.get(session_id)
-    if not sess:
-
-    turns = len([m for m in sess.history if m["role"] == "user"])
-    return {"session_id": session_id, "turns": turns,
-            "tokens_in_engine": sess.tokens_in_engine,
-            "history": sess.history}
+    if not sess: return {"session_id": session_id, "turns": 0, "history": []}
+    return {"session_id": session_id, "turns": len([m for m in sess.history if m["role"] == "user"]), "tokens_in_engine": sess.tokens_in_engine, "history": sess.history}
-
-@app.post("/generate")
-async def generate(req: GenerateRequest):
-    if not engine._ready:
-        raise HTTPException(503, "Engine not ready.")
-    token_ids = enc.encode_ordinary(req.prompt)
-    tmp_sess = f"_gen_{uuid.uuid4().hex}"
-    generated = []
-    total_ms = 0.0
-    async for chunk in engine.generate(tmp_sess, token_ids, req.max_tokens, req.temperature, req.top_k):
-        if chunk["type"] == "token":
-            generated.append(chunk["text"])
-        elif chunk["type"] == "done":
-            total_ms = chunk["total_ms"]
-        elif chunk["type"] == "error":
-            raise HTTPException(500, chunk["message"])
-    await engine.reset_session(tmp_sess)
-    text = "".join(generated)
-    tps = len(generated) / (total_ms / 1000.0) if total_ms > 0 else 0
-    return {
-        "prompt": req.prompt, "generated_text": text,
-        "tokens_in": len(token_ids), "tokens_out": len(generated),
-        "latency_ms": round(total_ms, 2), "tokens_per_sec": round(tps, 2),
-    }
 
 @app.get("/metrics")
 async def get_metrics():
-    n = metrics["total_requests"]
-    tok = metrics["total_tokens"]
-    ms = metrics["total_ms"]
+    n, tok, ms = metrics["total_requests"], metrics["total_tokens"], metrics["total_ms"]
     mem = psutil.virtual_memory()
-    proc = psutil.Process(os.getpid())
     return {
-        "total_requests":
-        "total_tokens":
-        "avg_tps":
-        "
-        "
-        "active_sessions": len(sessions),
-        "process_ram_mb": round(proc.memory_info().rss/1e6, 1),
+        "total_requests": n,
+        "total_tokens": tok,
+        "avg_tps": round(tok/(ms/1000), 2) if ms > 0 else 0,
+        "active_sessions": len(sessions),
+        "process_ram_mb": get_total_ram_mb(),
         "system_ram_used_pct": mem.percent,
-        "uptime_s":
-    }
-
-@app.get("/benchmark/run")
-async def benchmark_run():
-    if not engine._ready:
-        raise HTTPException(503, "Engine not ready.")
-    prompts = [
-        "What is artificial intelligence?",
-        "How does a CPU work?",
-        "Tell me something interesting.",
-        "What are the benefits of exercise?",
-        "How does photosynthesis work?",
-    ]
-    results = []
-    for p in prompts:
-        sid = f"_bench_{uuid.uuid4().hex}"
-        toks = enc.encode_ordinary(f"{USER_TOKEN} {p}\n{ASST_TOKEN} ")
-        gen = 0; total_ms = 0.0; ttft_ms = 0.0; first = True
-        t0 = time.time()
-        async for c in engine.generate(sid, toks, 80, 0.1, 1):
-            if c["type"] == "token":
-                gen += 1
-                if first: ttft_ms = (time.time()-t0)*1000; first = False
-            elif c["type"] == "done":
-                total_ms = c["total_ms"]
-        await engine.reset_session(sid)
-        tps = gen/(total_ms/1000) if total_ms > 0 else 0
-        results.append({
-            "prompt_preview": p[:40],
-            "tokens_in": len(toks),
-            "tokens_out": gen,
-            "ttft_ms": round(ttft_ms, 1),
-            "total_ms": round(total_ms, 1),
-            "tokens_per_sec": round(tps, 2),
-        })
-    avg_tps = sum(r["tokens_per_sec"] for r in results) / len(results)
-    avg_ttft = sum(r["ttft_ms"] for r in results) / len(results)
-    return {
-        "summary": {"avg_tps": round(avg_tps, 2),
-                    "avg_ttft_ms": round(avg_ttft, 1),
-                    "runs": len(results)},
-        "details": results,
+        "uptime_s": round(time.time()-metrics["start_time"], 1),
     }
 
 if __name__ == "__main__":