LisaMegaWatts committed on
Commit
21fcfa2
·
verified ·
1 Parent(s): 6fe84c4

True token-by-token SSE streaming via thread + queue

Browse files
Files changed (1) hide show
  1. server.py +47 -17
server.py CHANGED
@@ -3,10 +3,14 @@
3
  SymbioGPT-10M base model with Grammar Expert LoRA adapter merged at startup.
4
  The LoRA was discovered via evolutionary search on CoLA (grammar acceptability).
5
  Downloads base checkpoint + LoRA weights from HuggingFace on first run.
 
 
6
  """
7
  import json as json_mod
8
  import math
9
  import os
 
 
10
  import time
11
  import uuid
12
 
@@ -167,12 +171,14 @@ print(f" Merged {n_merged} LoRA weight pairs (rank={LORA_RANK}, alpha={LORA_ALP
167
 
168
  model.eval()
169
  n_params = sum(p.numel() for p in model.parameters())
170
- print(f" Model ready: {n_params/1e6:.1f}M params (base) + LoRA merged")
171
 
172
  # ═══════════════════════════════════════════════════════════════════
173
  # Generation
174
  # ═══════════════════════════════════════════════════════════════════
175
 
 
 
176
 
177
  @torch.no_grad()
178
  def generate(
@@ -181,8 +187,10 @@ def generate(
181
  temperature: float = 0.8,
182
  top_k: int = 40,
183
  top_p: float = 1.0,
184
- on_token=None,
185
  ) -> str:
 
 
186
  tokens = tokenizer.encode(prompt)
187
  if not tokens:
188
  tokens = [0]
@@ -216,8 +224,11 @@ def generate(
216
  generated_ids.append(next_id)
217
  idx = torch.cat([idx, torch.tensor([[next_id]])], dim=1)
218
 
219
- if on_token is not None:
220
- on_token(tokenizer.decode([next_id]))
 
 
 
221
 
222
  return tokenizer.decode(generated_ids)
223
 
@@ -250,7 +261,7 @@ def extract_prompt(messages):
250
  def health():
251
  return {
252
  "name": "SymbioGPT-GrammarExpert",
253
- "version": "1.0.0",
254
  "description": "SymbioGPT-10M + Grammar Expert LoRA (evolved on CoLA)",
255
  "architecture": "4-organelle decoder (CausalConv + Monarch + LongConv + Attention) "
256
  "+ OrganelleGate + LoRA (rank=8, attn+ffn)",
@@ -305,6 +316,7 @@ async def chat_completions(request: Request):
305
 
306
  if stream:
307
  def sse_stream():
 
308
  initial = {
309
  "id": completion_id,
310
  "object": "chat.completion.chunk",
@@ -314,26 +326,41 @@ async def chat_completions(request: Request):
314
  }
315
  yield f"data: {json_mod.dumps(initial)}\n\n"
316
 
317
- token_count = 0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
318
 
319
- def on_token(token_str):
320
- nonlocal token_count
 
 
 
 
321
  token_count += 1
322
-
323
- text = generate(prompt_text, max_tokens=max_tokens, temperature=temperature,
324
- top_k=top_k_val, top_p=top_p_val, on_token=on_token)
325
-
326
- for word in text.split(" "):
327
- chunk_text = word + " " if word else ""
328
  chunk = {
329
  "id": completion_id,
330
  "object": "chat.completion.chunk",
331
  "created": created,
332
  "model": MODEL_ID,
333
- "choices": [{"index": 0, "delta": {"content": chunk_text}, "finish_reason": None}],
334
  }
335
  yield f"data: {json_mod.dumps(chunk)}\n\n"
336
 
 
 
 
337
  finish = {
338
  "id": completion_id,
339
  "object": "chat.completion.chunk",
@@ -342,8 +369,8 @@ async def chat_completions(request: Request):
342
  "choices": [{"index": 0, "delta": {}, "finish_reason": "length" if token_count >= max_tokens else "stop"}],
343
  "usage": {
344
  "prompt_tokens": prompt_tokens,
345
- "completion_tokens": max_tokens,
346
- "total_tokens": prompt_tokens + max_tokens,
347
  },
348
  }
349
  yield f"data: {json_mod.dumps(finish)}\n\n"
@@ -377,4 +404,7 @@ async def chat_completions(request: Request):
377
 
378
  if __name__ == "__main__":
379
  print(f"\nSymbioGPT-GrammarExpert server starting on 0.0.0.0:{PORT} ...")
 
 
 
380
  uvicorn.run(app, host="0.0.0.0", port=PORT)
 
3
  SymbioGPT-10M base model with Grammar Expert LoRA adapter merged at startup.
4
  The LoRA was discovered via evolutionary search on CoLA (grammar acceptability).
5
  Downloads base checkpoint + LoRA weights from HuggingFace on first run.
6
+
7
+ True token-by-token SSE streaming via background thread + queue.
8
  """
9
  import json as json_mod
10
  import math
11
  import os
12
+ import queue
13
+ import threading
14
  import time
15
  import uuid
16
 
 
171
 
172
  model.eval()
173
  n_params = sum(p.numel() for p in model.parameters())
174
+ print(f" Model ready: {n_params/1e6:.1f}M params (base + LoRA merged)")
175
 
176
  # ═══════════════════════════════════════════════════════════════════
177
  # Generation
178
  # ═══════════════════════════════════════════════════════════════════
179
 
180
+ _SENTINEL = object() # marks end of generation
181
+
182
 
183
  @torch.no_grad()
184
  def generate(
 
187
  temperature: float = 0.8,
188
  top_k: int = 40,
189
  top_p: float = 1.0,
190
+ token_queue: queue.Queue = None,
191
  ) -> str:
192
+ """Generate text. If token_queue is provided, pushes each token string
193
+ to the queue as it's generated for true streaming."""
194
  tokens = tokenizer.encode(prompt)
195
  if not tokens:
196
  tokens = [0]
 
224
  generated_ids.append(next_id)
225
  idx = torch.cat([idx, torch.tensor([[next_id]])], dim=1)
226
 
227
+ if token_queue is not None:
228
+ token_queue.put(tokenizer.decode([next_id]))
229
+
230
+ if token_queue is not None:
231
+ token_queue.put(_SENTINEL)
232
 
233
  return tokenizer.decode(generated_ids)
234
 
 
261
  def health():
262
  return {
263
  "name": "SymbioGPT-GrammarExpert",
264
+ "version": "1.1.0",
265
  "description": "SymbioGPT-10M + Grammar Expert LoRA (evolved on CoLA)",
266
  "architecture": "4-organelle decoder (CausalConv + Monarch + LongConv + Attention) "
267
  "+ OrganelleGate + LoRA (rank=8, attn+ffn)",
 
316
 
317
  if stream:
318
  def sse_stream():
319
+ # Initial chunk with role
320
  initial = {
321
  "id": completion_id,
322
  "object": "chat.completion.chunk",
 
326
  }
327
  yield f"data: {json_mod.dumps(initial)}\n\n"
328
 
329
+ # Start generation in background thread
330
+ q = queue.Queue()
331
+ gen_thread = threading.Thread(
332
+ target=generate,
333
+ kwargs={
334
+ "prompt": prompt_text,
335
+ "max_tokens": max_tokens,
336
+ "temperature": temperature,
337
+ "top_k": top_k_val,
338
+ "top_p": top_p_val,
339
+ "token_queue": q,
340
+ },
341
+ daemon=True,
342
+ )
343
+ gen_thread.start()
344
 
345
+ # Stream tokens as they arrive
346
+ token_count = 0
347
+ while True:
348
+ tok = q.get()
349
+ if tok is _SENTINEL:
350
+ break
351
  token_count += 1
 
 
 
 
 
 
352
  chunk = {
353
  "id": completion_id,
354
  "object": "chat.completion.chunk",
355
  "created": created,
356
  "model": MODEL_ID,
357
+ "choices": [{"index": 0, "delta": {"content": tok}, "finish_reason": None}],
358
  }
359
  yield f"data: {json_mod.dumps(chunk)}\n\n"
360
 
361
+ gen_thread.join(timeout=5.0)
362
+
363
+ # Final chunk
364
  finish = {
365
  "id": completion_id,
366
  "object": "chat.completion.chunk",
 
369
  "choices": [{"index": 0, "delta": {}, "finish_reason": "length" if token_count >= max_tokens else "stop"}],
370
  "usage": {
371
  "prompt_tokens": prompt_tokens,
372
+ "completion_tokens": token_count,
373
+ "total_tokens": prompt_tokens + token_count,
374
  },
375
  }
376
  yield f"data: {json_mod.dumps(finish)}\n\n"
 
404
 
405
  if __name__ == "__main__":
406
  print(f"\nSymbioGPT-GrammarExpert server starting on 0.0.0.0:{PORT} ...")
407
+ print(f" GET http://localhost:{PORT}/")
408
+ print(f" GET http://localhost:{PORT}/v1/models")
409
+ print(f" POST http://localhost:{PORT}/v1/chat/completions")
410
  uvicorn.run(app, host="0.0.0.0", port=PORT)