fix: real token-by-token streaming (was generating all tokens then splitting by spaces)
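The old path ran the whole generation to completion and only then re-chunked the finished string on spaces, so clients saw nothing until the last token was sampled, and chunk boundaries did not line up with real tokens. A quick illustration of the mismatch (token ids made up; any BPE-style tokenizer behaves this way, since tokens are not space-delimited):

    text = "streaming"                           # one word...
    ids = tokenizer.encode(text)                 # ...but usually several tokens, e.g. [1204, 88, 302]
    text.split(" ")                              # ['streaming']: old code sent one chunk, after generation finished
    [tokenizer.decode([i]) for i in ids]         # e.g. ['stre', 'am', 'ing']: new code sends one chunk per sampled token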
server.py CHANGED
@@ -93,37 +93,33 @@ print(f" Organelles: {MODEL_CONFIG.organelles}")
 
 
 @torch.no_grad()
-def generate(
+def generate_streaming(
     prompt: str,
     max_tokens: int = 200,
     temperature: float = 0.8,
     top_k: int = 40,
     top_p: float = 1.0,
-    on_token=None,
-) -> str:
+):
+    """Generator yielding token strings one at a time for real SSE streaming."""
     tokens = tokenizer.encode(prompt)
     if not tokens:
         tokens = [0]
     idx = torch.tensor([tokens], dtype=torch.long)
-    generated_ids = []
 
     for _ in range(max_tokens):
         idx_cond = idx[:, -MODEL_CONFIG.context_length:]
         logits = model(idx_cond)
         logits_last = logits[0, -1, :].float()
 
-        # Temperature
         if temperature > 0.01:
             logits_last = logits_last / temperature
         else:
             logits_last = logits_last / 0.01
 
-        # Top-k
         if 0 < top_k < logits_last.size(0):
             threshold = torch.topk(logits_last, top_k).values[-1]
             logits_last[logits_last < threshold] = float("-inf")
 
-        # Top-p
         if top_p < 1.0:
             sorted_logits, sorted_indices = torch.sort(logits_last, descending=True)
             probs_sorted = F.softmax(sorted_logits, dim=-1)
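The hunk above cuts off right after probs_sorted; the nucleus cutoff itself sits in the unchanged lines between the two hunks and is not shown. For orientation only, a sketch of what that elided step conventionally looks like (an assumption about lines this diff does not display, not the file's verbatim contents):

    # assumed continuation: mask tokens outside the smallest set whose
    # cumulative probability exceeds top_p, then restore original order
    cumulative = torch.cumsum(probs_sorted, dim=-1)
    cutoff = cumulative - probs_sorted > top_p   # mass before this token already exceeds top_p
    sorted_logits[cutoff] = float("-inf")
    logits_last = torch.full_like(logits_last, float("-inf")).scatter(0, sorted_indices, sorted_logits)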
@@ -134,14 +130,20 @@ def generate(
 
         probs = F.softmax(logits_last, dim=-1)
         next_id = torch.multinomial(probs, 1).item()
-        generated_ids.append(next_id)
         idx = torch.cat([idx, torch.tensor([[next_id]])], dim=1)
+        yield tokenizer.decode([next_id])
 
-        if on_token is not None:
-            token_str = tokenizer.decode([next_id])
-            on_token(token_str)
 
-    return tokenizer.decode(generated_ids)
+@torch.no_grad()
+def generate(
+    prompt: str,
+    max_tokens: int = 200,
+    temperature: float = 0.8,
+    top_k: int = 40,
+    top_p: float = 1.0,
+) -> str:
+    """Generate complete text (non-streaming wrapper)."""
+    return "".join(generate_streaming(prompt, max_tokens, temperature, top_k, top_p))
 
 
 # ───────────────────────────────────────────────────────────────────
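After the split, the SSE handler below iterates generate_streaming directly, and non-streaming callers keep the blocking generate. A minimal usage sketch, assuming the module-level model and tokenizer are loaded:

    # streaming: each token string arrives as soon as it is sampled
    for tok in generate_streaming("Once upon a time", max_tokens=5):
        print(repr(tok), flush=True)

    # non-streaming: same sampling path, joined into one string
    print(generate("Once upon a time", max_tokens=5))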
@@ -231,7 +233,6 @@ async def chat_completions(request: Request):
     import json as json_mod
 
     def sse_stream():
-        # Initial chunk
         initial = {
             "id": completion_id,
             "object": "chat.completion.chunk",
@@ -242,27 +243,20 @@ async def chat_completions(request: Request):
         yield f"data: {json_mod.dumps(initial)}\n\n"
 
         token_count = 0
-
-        def on_token(token_str):
-            nonlocal token_count
+        for token_str in generate_streaming(
+            prompt_text, max_tokens=max_tokens, temperature=temperature,
+            top_k=top_k_val, top_p=top_p_val,
+        ):
             token_count += 1
-
-        text = generate(prompt_text, max_tokens=max_tokens, temperature=temperature,
-                        top_k=top_k_val, top_p=top_p_val, on_token=on_token)
-
-        # Send all generated text as chunks (word-level for readability)
-        for word in text.split(" "):
-            chunk_text = word + " " if word else ""
             chunk = {
                 "id": completion_id,
                 "object": "chat.completion.chunk",
                 "created": created,
                 "model": "symbiogpt-10m",
-                "choices": [{"index": 0, "delta": {"content": chunk_text}, "finish_reason": None}],
+                "choices": [{"index": 0, "delta": {"content": token_str}, "finish_reason": None}],
             }
             yield f"data: {json_mod.dumps(chunk)}\n\n"
 
-        # Final chunk
         finish = {
             "id": completion_id,
             "object": "chat.completion.chunk",
@@ -271,8 +265,8 @@ async def chat_completions(request: Request):
             "choices": [{"index": 0, "delta": {}, "finish_reason": "length" if token_count >= max_tokens else "stop"}],
             "usage": {
                 "prompt_tokens": prompt_tokens,
-                "completion_tokens": token_count,
-                "total_tokens": prompt_tokens + token_count,
+                "completion_tokens": token_count,
+                "total_tokens": prompt_tokens + token_count,
             },
         }
         yield f"data: {json_mod.dumps(finish)}\n\n"
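Each yield above is one server-sent event: a data: line carrying a JSON chunk, terminated by a blank line. A client can consume the stream by reading lines off the response; a sketch using requests (the route path and port are assumptions, and the data: [DONE] sentinel, if this server emits one after the finish chunk, is handled defensively):

    import json
    import requests

    resp = requests.post(
        "http://localhost:8000/v1/chat/completions",  # assumed path/port
        json={"model": "symbiogpt-10m", "stream": True,
              "messages": [{"role": "user", "content": "Hello"}]},
        stream=True,
    )
    for line in resp.iter_lines():
        if not line or not line.startswith(b"data: "):
            continue
        payload = line[len(b"data: "):]
        if payload == b"[DONE]":                      # sentinel, if sent; not shown in this diff
            break
        delta = json.loads(payload)["choices"][0]["delta"]
        print(delta.get("content", ""), end="", flush=True)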
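The user-visible effect is time-to-first-token: before this change the first content chunk could not leave the server until all max_tokens forward passes had finished. A rough way to see the difference locally (illustrative, using the functions as defined above):

    import time

    start = time.perf_counter()
    for i, tok in enumerate(generate_streaming("Hello", max_tokens=20)):
        if i == 0:
            print(f"first token after {time.perf_counter() - start:.2f}s")  # about one forward pass
    print(f"all tokens after {time.perf_counter() - start:.2f}s")

    start = time.perf_counter()
    generate("Hello", max_tokens=20)
    print(f"generate() returned after {time.perf_counter() - start:.2f}s")  # old code sent its first chunk only now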