Stop distributed inference on EOS

Browse files

Files changed (1) hide show

distributed/inference/agillm35_distributed_infer.py +29 -21

distributed/inference/agillm35_distributed_infer.py CHANGED Viewed

@@ -576,11 +576,13 @@ def cmd_infer(args: argparse.Namespace) -> None:
     prompt_tokens = runtime.tok.encode(args.prompt)
     if not prompt_tokens:
         prompt_tokens = [runtime.EOS]
-    ids = torch.tensor([prompt_tokens], dtype=torch.long)
-    prompt_len = ids.size(1)
-    stage_stats: list[dict[str, Any]] = []
-    session_id = args.session_id or f"agillm35-{uuid.uuid4().hex}"
-    start = time.time()
     with torch.no_grad():
         if args.cache_mode == "kv":
             hidden = emb(ids.to(args.device)).detach().cpu()
@@ -595,12 +597,15 @@ def cmd_infer(args: argparse.Namespace) -> None:
             )
             stage_stats.extend(stats)
             for step in range(int(args.max_new)):
-                h = ln(hidden.to(args.device))
-                nxt = sample_next(runtime, ar_h, h, ids, args)
-                ids = torch.cat([ids, nxt.detach().cpu()], dim=1)
-                if step + 1 >= int(args.max_new):
-                    break
-                hidden = emb(nxt.to(args.device)).detach().cpu()
                 hidden, stats = run_stage_pipeline(
                     stages,
                     hidden,
@@ -616,9 +621,12 @@ def cmd_infer(args: argparse.Namespace) -> None:
                 hidden = emb(ids.to(args.device)).detach().cpu()
                 hidden, stats = run_stage_pipeline(stages, hidden, args, use_cache=False)
                 stage_stats.extend(stats)
-                h = ln(hidden.to(args.device))
-                nxt = sample_next(runtime, ar_h, h, ids, args)
-                ids = torch.cat([ids, nxt.detach().cpu()], dim=1)
     elapsed = time.time() - start
     all_ids = ids[0].tolist()
     prompt = runtime.tok.decode(all_ids[:prompt_len], skip_special_tokens=True)
@@ -631,13 +639,13 @@ def cmd_infer(args: argparse.Namespace) -> None:
         item["wall_sec"] += float(stat.get("wall_sec", stat.get("sec", 0.0)))
     result = {
         "event": "distributed_infer_done",
-        "mode": args.mode,
-        "cache_mode": args.cache_mode,
-        "session_id": session_id if args.cache_mode == "kv" else None,
-        "tokens": int(args.max_new),
-        "elapsed_sec": round(elapsed, 3),
-        "tok_per_sec": round(int(args.max_new) / max(elapsed, 1e-9), 3),
-        "stages": by_stage,
     }
     if args.json:
         result["prompt"] = prompt

     prompt_tokens = runtime.tok.encode(args.prompt)
     if not prompt_tokens:
         prompt_tokens = [runtime.EOS]
+    ids = torch.tensor([prompt_tokens], dtype=torch.long)
+    prompt_len = ids.size(1)
+    stage_stats: list[dict[str, Any]] = []
+    session_id = args.session_id or f"agillm35-{uuid.uuid4().hex}"
+    eos_id = getattr(runtime, "EOS", None)
+    generated_tokens = 0
+    start = time.time()
     with torch.no_grad():
         if args.cache_mode == "kv":
             hidden = emb(ids.to(args.device)).detach().cpu()
             )
             stage_stats.extend(stats)
             for step in range(int(args.max_new)):
+                h = ln(hidden.to(args.device))
+                nxt = sample_next(runtime, ar_h, h, ids, args)
+                ids = torch.cat([ids, nxt.detach().cpu()], dim=1)
+                generated_tokens += 1
+                if eos_id is not None and int(nxt.reshape(-1)[0].item()) == int(eos_id):
+                    break
+                if step + 1 >= int(args.max_new):
+                    break
+                hidden = emb(nxt.to(args.device)).detach().cpu()
                 hidden, stats = run_stage_pipeline(
                     stages,
                     hidden,
                 hidden = emb(ids.to(args.device)).detach().cpu()
                 hidden, stats = run_stage_pipeline(stages, hidden, args, use_cache=False)
                 stage_stats.extend(stats)
+                h = ln(hidden.to(args.device))
+                nxt = sample_next(runtime, ar_h, h, ids, args)
+                ids = torch.cat([ids, nxt.detach().cpu()], dim=1)
+                generated_tokens += 1
+                if eos_id is not None and int(nxt.reshape(-1)[0].item()) == int(eos_id):
+                    break
     elapsed = time.time() - start
     all_ids = ids[0].tolist()
     prompt = runtime.tok.decode(all_ids[:prompt_len], skip_special_tokens=True)
         item["wall_sec"] += float(stat.get("wall_sec", stat.get("sec", 0.0)))
     result = {
         "event": "distributed_infer_done",
+        "mode": args.mode,
+        "cache_mode": args.cache_mode,
+        "session_id": session_id if args.cache_mode == "kv" else None,
+        "tokens": generated_tokens,
+        "elapsed_sec": round(elapsed, 3),
+        "tok_per_sec": round(generated_tokens / max(elapsed, 1e-9), 3),
+        "stages": by_stage,
     }
     if args.json:
         result["prompt"] = prompt