AbeBhatti commited on
Commit
6017516
·
1 Parent(s): 9e20ed6

Add all code, exclude large model weights

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitignore +4 -0
  2. agent/arbitragent.py +29 -12
  3. agent/bluff_detector.py +92 -11
  4. demo/display.py +133 -156
  5. demo/run_demo.py +146 -177
  6. envs/arbitragent_env.py +190 -0
  7. training/arbitragent_colab.ipynb +171 -267
  8. training/bluff_training.log +16 -0
  9. training/checkpoints/bluff_classifier_tokenizer/tokenizer.json +0 -0
  10. training/checkpoints/bluff_classifier_tokenizer/tokenizer_config.json +14 -0
  11. training/checkpoints/phase2_final/README.md +67 -0
  12. training/checkpoints/phase2_final/chat_template.jinja +15 -0
  13. training/checkpoints/phase2_final/checkpoint-100/chat_template.jinja +15 -0
  14. training/checkpoints/phase2_final/checkpoint-100/config.json +32 -0
  15. training/checkpoints/phase2_final/checkpoint-100/generation_config.json +9 -0
  16. training/checkpoints/phase2_final/checkpoint-100/tokenizer.json +0 -0
  17. training/checkpoints/phase2_final/checkpoint-100/tokenizer_config.json +19 -0
  18. training/checkpoints/phase2_final/checkpoint-100/trainer_state.json +304 -0
  19. training/checkpoints/phase2_final/checkpoint-200/chat_template.jinja +15 -0
  20. training/checkpoints/phase2_final/checkpoint-200/config.json +32 -0
  21. training/checkpoints/phase2_final/checkpoint-200/generation_config.json +9 -0
  22. training/checkpoints/phase2_final/checkpoint-200/tokenizer.json +0 -0
  23. training/checkpoints/phase2_final/checkpoint-200/tokenizer_config.json +19 -0
  24. training/checkpoints/phase2_final/checkpoint-200/trainer_state.json +574 -0
  25. training/checkpoints/phase2_final/config.json +32 -0
  26. training/checkpoints/phase2_final/generation_config.json +9 -0
  27. training/checkpoints/phase2_final/tokenizer.json +0 -0
  28. training/checkpoints/phase2_final/tokenizer_config.json +19 -0
  29. training/checkpoints/unified_final/README.md +67 -0
  30. training/checkpoints/unified_final/chat_template.jinja +15 -0
  31. training/checkpoints/unified_final/checkpoint-100/chat_template.jinja +15 -0
  32. training/checkpoints/unified_final/checkpoint-100/config.json +32 -0
  33. training/checkpoints/unified_final/checkpoint-100/generation_config.json +9 -0
  34. training/checkpoints/unified_final/checkpoint-100/tokenizer.json +0 -0
  35. training/checkpoints/unified_final/checkpoint-100/tokenizer_config.json +19 -0
  36. training/checkpoints/unified_final/checkpoint-100/trainer_state.json +304 -0
  37. training/checkpoints/unified_final/checkpoint-200/chat_template.jinja +15 -0
  38. training/checkpoints/unified_final/checkpoint-200/config.json +32 -0
  39. training/checkpoints/unified_final/checkpoint-200/generation_config.json +9 -0
  40. training/checkpoints/unified_final/checkpoint-200/tokenizer.json +0 -0
  41. training/checkpoints/unified_final/checkpoint-200/tokenizer_config.json +19 -0
  42. training/checkpoints/unified_final/checkpoint-200/trainer_state.json +574 -0
  43. training/checkpoints/unified_final/config.json +32 -0
  44. training/checkpoints/unified_final/generation_config.json +9 -0
  45. training/checkpoints/unified_final/tokenizer.json +0 -0
  46. training/checkpoints/unified_final/tokenizer_config.json +19 -0
  47. training/checkpoints/unified_final/unified_reward_log.json +810 -0
  48. training/parse_poker.py +136 -0
  49. training/plot_phase2.py +24 -0
  50. training/train_bluff_classifier.py +142 -0
.gitignore CHANGED
@@ -13,3 +13,7 @@ selfplay_states_test.json
13
  proj_context.md
14
  session_progress.md
15
  HF_TOKEN
 
 
 
 
 
13
  proj_context.md
14
  session_progress.md
15
  HF_TOKEN
16
+ *.safetensors
17
+ *.bin
18
+ *.safetensors
19
+ *.bin
agent/arbitragent.py CHANGED
@@ -332,18 +332,22 @@ class ArbitrAgent:
332
  self.route_graph.mark_dead(edge.edge_id)
333
  continue
334
 
335
- # Bluff detection: inspect full thread via bluff_detector.
336
  signals = analyze_from_sim(c.sim, resp or "")
337
 
338
- # Log full bluff reasoning trace to structured log.
 
 
 
 
339
  self._structured_log.append(
340
  {
341
  "event": "bluff_analysis",
342
  "phase": 3,
 
343
  "seller_id": c.seller_id,
344
  "item": c.item,
345
- "turn": c.sim.turn,
346
- "seller_message": resp,
347
  "signals": {
348
  "timing_tell": signals.timing_tell,
349
  "size_tell": signals.size_tell,
@@ -352,10 +356,22 @@ class ArbitrAgent:
352
  "bluff_score": signals.bluff_score,
353
  "is_bluff": signals.is_bluff,
354
  },
355
- "thread_history": list(getattr(c.sim, "thread_history", [])),
 
356
  }
357
  )
358
 
 
 
 
 
 
 
 
 
 
 
 
359
  if verbose:
360
  print(
361
  f"[bluff_analysis {c.seller_id}] "
@@ -367,11 +383,10 @@ class ArbitrAgent:
367
  f"is_bluff={signals.is_bluff}"
368
  )
369
 
370
- # When a bluff is detected, immediately deploy coalition pressure.
371
  if signals.is_bluff:
372
  current_offer = float(c.sim.current_offer)
373
- # Simple heuristic counter: push meaningfully below stated offer.
374
- offer = max(1, int(current_offer - 8))
375
  pressure_msg = (
376
  "I have a trade offer from another seller that makes this less urgent for me — "
377
  f"can you do ${offer}?"
@@ -397,12 +412,16 @@ class ArbitrAgent:
397
  }
398
  )
399
 
400
- # Update entry cost after pressure-induced move.
401
  for edge in edges:
402
  self.route_graph.update_entry_cost(edge.edge_id, c.sim.current_offer)
 
 
 
 
 
 
403
 
404
  for edge in edges:
405
- # If we have a confirmed downstream target by this turn, upgrade probability.
406
  target_index = int(edge.trade_target_id.split("_")[1])
407
  if (edge.buy_item, target_index) in confirmed_targets:
408
  self.route_graph.update_confirmation_probability(
@@ -410,8 +429,6 @@ class ArbitrAgent:
410
  )
411
  self.route_graph.mark_confirmed(edge.edge_id)
412
 
413
- # Adjust seller reliability slightly based on bluff score.
414
- # Higher bluff score → more room to push → treat as slightly *higher* edge value.
415
  new_reliability = min(
416
  1.0, edge.seller_reliability + 0.1 * float(signals.bluff_score)
417
  )
 
332
  self.route_graph.mark_dead(edge.edge_id)
333
  continue
334
 
335
+ # Bluff detection: inspect full thread via BluffDetector.
336
  signals = analyze_from_sim(c.sim, resp or "")
337
 
338
+ # Unverified floor claim: formulaic language present but not flagged as full bluff.
339
+ formulaic_present = signals.formulaic_tell > 0
340
+
341
+ # Log full bluff reasoning: turn, seller_id, bluff_score, signals dict, action_taken.
342
+ action_taken = msg # the agent message we just sent before this response
343
  self._structured_log.append(
344
  {
345
  "event": "bluff_analysis",
346
  "phase": 3,
347
+ "turn": c.sim.turn,
348
  "seller_id": c.seller_id,
349
  "item": c.item,
350
+ "bluff_score": signals.bluff_score,
 
351
  "signals": {
352
  "timing_tell": signals.timing_tell,
353
  "size_tell": signals.size_tell,
 
356
  "bluff_score": signals.bluff_score,
357
  "is_bluff": signals.is_bluff,
358
  },
359
+ "action_taken": action_taken,
360
+ "seller_message": resp,
361
  }
362
  )
363
 
364
+ if not signals.is_bluff and formulaic_present:
365
+ self._structured_log.append(
366
+ {
367
+ "event": "unverified_floor_claim",
368
+ "phase": 3,
369
+ "turn": c.sim.turn,
370
+ "seller_id": c.seller_id,
371
+ "seller_message": resp,
372
+ }
373
+ )
374
+
375
  if verbose:
376
  print(
377
  f"[bluff_analysis {c.seller_id}] "
 
383
  f"is_bluff={signals.is_bluff}"
384
  )
385
 
386
+ # When a bluff is detected, deploy coalition pressure: floor - 4.
387
  if signals.is_bluff:
388
  current_offer = float(c.sim.current_offer)
389
+ offer = max(1, int(current_offer - 4))
 
390
  pressure_msg = (
391
  "I have a trade offer from another seller that makes this less urgent for me — "
392
  f"can you do ${offer}?"
 
412
  }
413
  )
414
 
 
415
  for edge in edges:
416
  self.route_graph.update_entry_cost(edge.edge_id, c.sim.current_offer)
417
+ # Bluff means seller has room — update confirmation probability upward.
418
+ for edge in edges:
419
+ self.route_graph.update_confirmation_probability(
420
+ edge.edge_id,
421
+ confirmation_probability=min(1.0, edge.confirmation_probability + 0.15),
422
+ )
423
 
424
  for edge in edges:
 
425
  target_index = int(edge.trade_target_id.split("_")[1])
426
  if (edge.buy_item, target_index) in confirmed_targets:
427
  self.route_graph.update_confirmation_probability(
 
429
  )
430
  self.route_graph.mark_confirmed(edge.edge_id)
431
 
 
 
432
  new_reliability = min(
433
  1.0, edge.seller_reliability + 0.1 * float(signals.bluff_score)
434
  )
agent/bluff_detector.py CHANGED
@@ -1,24 +1,100 @@
1
  """
2
  bluff_detector.py — bluff signal extraction for ArbitrAgent.
3
 
4
- This module exposes a small, deterministic API that inspects a seller's
5
- response in the context of a thread and extracts four bluff signals:
6
-
7
- 1. timing_tell
8
- 2. size_tell
9
- 3. formulaic_tell
10
- 4. pattern_tell
11
-
12
- The overall bluff_score is a weighted sum of these four signals. A response
13
- is flagged as a bluff when bluff_score > 0.6.
14
  """
15
 
16
  from __future__ import annotations
17
 
18
  import re
19
  from dataclasses import dataclass
 
20
  from typing import Any, Dict, List, Mapping, Optional, Sequence
21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
  FORMULAIC_PHRASES: List[str] = [
24
  "lowest i can go",
@@ -100,12 +176,17 @@ def analyze_bluff(
100
  for key in DEFAULT_WEIGHTS.keys()
101
  }
102
 
103
- bluff_score = (
104
  timing * norm_weights["timing_tell"]
105
  + size * norm_weights["size_tell"]
106
  + formulaic * norm_weights["formulaic_tell"]
107
  + pattern * norm_weights["pattern_tell"]
108
  )
 
 
 
 
 
109
  is_bluff = bluff_score > 0.6
110
 
111
  return BluffSignals(
 
1
  """
2
  bluff_detector.py — bluff signal extraction for ArbitrAgent.
3
 
4
+ Exposes four rule-based signals (timing, size, formulaic, pattern) and an optional
5
+ learned DistilBERT classifier trained on IRC poker bluff labels. Combined score:
6
+ bluff_score = 0.6 * learned_bluff_score + 0.4 * rule_score
7
+ is_bluff when bluff_score > 0.6.
 
 
 
 
 
 
8
  """
9
 
10
  from __future__ import annotations
11
 
12
  import re
13
  from dataclasses import dataclass
14
+ from pathlib import Path
15
  from typing import Any, Dict, List, Mapping, Optional, Sequence
16
 
17
+ # Lazy-loaded learned classifier (only on first use)
18
+ _bluff_classifier_model = None
19
+ _bluff_classifier_tokenizer = None
20
+
21
+
22
+ def _get_bluff_classifier():
23
+ """Lazy-load bluff_classifier.pt and tokenizer from training/checkpoints."""
24
+ global _bluff_classifier_model, _bluff_classifier_tokenizer
25
+ if _bluff_classifier_model is not None:
26
+ return _bluff_classifier_model, _bluff_classifier_tokenizer
27
+ pt_path = Path(__file__).resolve().parent.parent / "training" / "checkpoints" / "bluff_classifier.pt"
28
+ tok_dir = Path(__file__).resolve().parent.parent / "training" / "checkpoints" / "bluff_classifier_tokenizer"
29
+ if not pt_path.exists() or not tok_dir.exists():
30
+ return None, None
31
+ try:
32
+ import torch
33
+ from transformers import AutoTokenizer, AutoModel
34
+ _bluff_classifier_tokenizer = AutoTokenizer.from_pretrained(str(tok_dir))
35
+
36
+ class _BluffClassifierModule(torch.nn.Module):
37
+ def __init__(self):
38
+ super().__init__()
39
+ self.encoder = AutoModel.from_pretrained("distilbert-base-uncased")
40
+ self.head = torch.nn.Linear(self.encoder.config.hidden_size, 2)
41
+
42
+ def forward(self, input_ids, attention_mask=None, **kwargs):
43
+ out = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
44
+ return self.head(out.last_hidden_state[:, 0, :])
45
+
46
+ _bluff_classifier_model = _BluffClassifierModule()
47
+ _bluff_classifier_model.load_state_dict(torch.load(pt_path, map_location="cpu", weights_only=True))
48
+ _bluff_classifier_model.eval()
49
+ return _bluff_classifier_model, _bluff_classifier_tokenizer
50
+ except Exception:
51
+ return None, None
52
+
53
+
54
+ def _thread_and_message_to_text(thread_history: Sequence[Mapping[str, Any]], seller_message: str) -> str:
55
+ """Convert thread + seller message into text matching poker training format (Position. Preflop. Flop. Turn. River. Pot)."""
56
+ parts: List[str] = []
57
+ for entry in thread_history:
58
+ if "agent" in entry:
59
+ parts.append(str(entry["agent"])[:80])
60
+ if "seller" in entry:
61
+ parts.append(str(entry["seller"])[:80])
62
+ # Map to poker-like: Preflop / Flop / Turn / River
63
+ preflop = parts[0] if len(parts) > 0 else "-"
64
+ flop = parts[1] if len(parts) > 1 else "-"
65
+ turn = parts[2] if len(parts) > 2 else "-"
66
+ river = seller_message[:200] if seller_message else "-"
67
+ return f"Position 1 of 2. Preflop: {preflop}. Flop: {flop}. Turn: {turn}. River: {river}. Pot: 0."
68
+
69
+
70
+ def learned_bluff_score(message: str, thread_history: Sequence[Mapping[str, Any]]) -> float:
71
+ """
72
+ Run learned DistilBERT classifier on (message + thread). Returns P(bluff) in [0, 1].
73
+ Returns 0.0 if classifier not loaded.
74
+ """
75
+ model, tokenizer = _get_bluff_classifier()
76
+ if model is None or tokenizer is None:
77
+ return 0.0
78
+ text = _thread_and_message_to_text(thread_history, message)
79
+ try:
80
+ import torch
81
+ enc = tokenizer(
82
+ text,
83
+ truncation=True,
84
+ max_length=128,
85
+ padding="max_length",
86
+ return_tensors="pt",
87
+ )
88
+ with torch.no_grad():
89
+ logits = model(
90
+ input_ids=enc["input_ids"],
91
+ attention_mask=enc["attention_mask"],
92
+ )
93
+ probs = torch.softmax(logits, dim=1)
94
+ return float(probs[0, 1].item()) # class 1 = bluff
95
+ except Exception:
96
+ return 0.0
97
+
98
 
99
  FORMULAIC_PHRASES: List[str] = [
100
  "lowest i can go",
 
176
  for key in DEFAULT_WEIGHTS.keys()
177
  }
178
 
179
+ rule_score = (
180
  timing * norm_weights["timing_tell"]
181
  + size * norm_weights["size_tell"]
182
  + formulaic * norm_weights["formulaic_tell"]
183
  + pattern * norm_weights["pattern_tell"]
184
  )
185
+ learned = learned_bluff_score(seller_message, thread_history)
186
+ if _bluff_classifier_model is not None:
187
+ bluff_score = 0.6 * learned + 0.4 * rule_score
188
+ else:
189
+ bluff_score = rule_score
190
  is_bluff = bluff_score > 0.6
191
 
192
  return BluffSignals(
demo/display.py CHANGED
@@ -1,9 +1,17 @@
 
 
 
 
 
 
 
 
 
1
  from __future__ import annotations
2
 
3
  from dataclasses import dataclass, field
4
  from typing import Any, Dict, List, Optional
5
 
6
- from rich.columns import Columns
7
  from rich.console import Console
8
  from rich.panel import Panel
9
  from rich.table import Table
@@ -13,7 +21,7 @@ from rich.text import Text
13
  @dataclass
14
  class ThreadMessage:
15
  turn: int
16
- sender: str # "agent" or "seller"
17
  text: str
18
  is_bluff: bool = False
19
 
@@ -23,20 +31,32 @@ class ThreadState:
23
  seller_id: str
24
  item: str
25
  archetype: str
26
- status: str = "active" # "active" | "dead" | "confirmed"
 
27
  messages: List[ThreadMessage] = field(default_factory=list)
28
  bluff_signals: Optional[Dict[str, float]] = None
29
 
30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  class NegotiationDisplay:
32
  """
33
- Rich-based terminal display for the ArbitrAgent demo.
34
-
35
- Responsibilities:
36
- - Show all active negotiation threads as side-by-side panels.
37
- - Highlight bluff detection in yellow with individual signals.
38
- - Use red for dead routes / threads and green for confirmed routes.
39
- - Render a final panel with budget → entry cost → exit value → return multiple.
40
  """
41
 
42
  def __init__(self, console: Optional[Console] = None) -> None:
@@ -47,172 +67,129 @@ class NegotiationDisplay:
47
  threads: List[ThreadState],
48
  route_summaries: List[Dict[str, Any]],
49
  budget: float,
 
50
  final_metrics: Optional[Dict[str, Any]] = None,
51
  checkpoints: Optional[Dict[str, bool]] = None,
52
  ) -> None:
53
- """Render the full demo view."""
54
  self.console.clear()
55
 
56
- thread_panels = [self._build_thread_panel(t) for t in threads]
57
- if thread_panels:
58
- self.console.print(Columns(thread_panels, expand=True, equal=True))
59
-
60
- # Routes + ROI panel at the bottom
61
- summary_panel = self._build_summary_panel(
62
- route_summaries=route_summaries,
63
- budget=budget,
64
- final_metrics=final_metrics,
65
- checkpoints=checkpoints or {},
66
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  self.console.print()
68
- self.console.print(summary_panel)
69
 
70
- # ------------------------------------------------------------------ #
71
- # Panel builders
72
- # ------------------------------------------------------------------ #
73
- def _build_thread_panel(self, thread: ThreadState) -> Panel:
74
- # Border colors by status
75
- border_style = "bright_white"
76
- if thread.status == "dead":
77
- border_style = "red"
78
- elif thread.status == "confirmed":
79
- border_style = "green"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
 
81
- title = f"{thread.seller_id} {thread.item} • {thread.archetype}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  table = Table.grid(padding=(0, 1))
84
- table.expand = True
85
  table.add_column("Speaker", style="bold", no_wrap=True)
86
  table.add_column("Text", overflow="fold")
87
-
88
- # Only show the last few turns to keep panels readable.
89
- for msg in thread.messages[-8:]:
90
  speaker = "you" if msg.sender == "agent" else "seller"
91
  style = "cyan" if msg.sender == "agent" else "white"
92
  text = Text(msg.text, style=style)
93
  if msg.is_bluff:
94
- # Yellow highlight for bluff detection.
95
  text.stylize("black on yellow")
96
  table.add_row(speaker, text)
97
-
98
- # Bluff signal breakdown, if present.
99
  if thread.bluff_signals:
100
- sig = thread.bluff_signals
101
- sig_table = Table.grid(padding=(0, 1))
102
- sig_table.add_column(justify="left", no_wrap=True)
103
- sig_table.add_column(justify="right", no_wrap=True)
104
- sig_table.add_row(
105
- "[bold yellow]Bluff detected[/bold yellow]",
106
- f"[yellow]score={sig.get('bluff_score', 0.0):.2f}[/yellow]",
107
- )
108
- for key in ("timing_tell", "size_tell", "formulaic_tell", "pattern_tell"):
109
- if key in sig:
110
- label = key.replace("_", " ")
111
- sig_table.add_row(label, f"{sig[key]:.2f}")
112
-
113
- table.add_row("", sig_table)
114
-
115
- return Panel(
116
- table,
117
- title=title,
118
- border_style=border_style,
119
- padding=(1, 1),
120
- )
121
-
122
- def _build_summary_panel(
123
- self,
124
- route_summaries: List[Dict[str, Any]],
125
- budget: float,
126
- final_metrics: Optional[Dict[str, Any]],
127
- checkpoints: Dict[str, bool],
128
- ) -> Panel:
129
- table = Table.grid(padding=(0, 2))
130
- table.expand = True
131
-
132
- # Left: route statuses
133
- routes_sub = Table(
134
- show_header=True,
135
- header_style="bold",
136
- title="Route Graph",
137
- title_style="bold",
138
- )
139
- routes_sub.add_column("Route", no_wrap=True)
140
- routes_sub.add_column("Status", no_wrap=True)
141
- routes_sub.add_column("Δ", justify="right", no_wrap=True)
142
- routes_sub.add_column("Score", justify="right", no_wrap=True)
143
-
144
- for row in route_summaries:
145
- margin = row["exit_value"] - row["entry_cost"]
146
- status = row["status"]
147
- status_style = {
148
- "dead": "red",
149
- "confirmed": "green",
150
- "soft": "yellow",
151
- }.get(status, "white")
152
- routes_sub.add_row(
153
- row["edge_id"],
154
- f"[{status_style}]{status}[/{status_style}]",
155
- f"{margin:.2f}",
156
- f"{row['score']:.2f}",
157
- )
158
-
159
- # Right: ROI + checkpoints
160
- roi_sub = Table(
161
- show_header=False,
162
- box=None,
163
- title="Capital Deployment",
164
- title_style="bold",
165
- )
166
- roi_sub.add_column("Label", no_wrap=True)
167
- roi_sub.add_column("Value", no_wrap=True)
168
-
169
- entry_cost = None
170
- exit_value = None
171
- return_multiple = None
172
-
173
- if final_metrics is not None:
174
- entry_cost = final_metrics.get("entry_cost")
175
- exit_value = final_metrics.get("exit_value")
176
- return_multiple = final_metrics.get("return_multiple")
177
-
178
- roi_sub.add_row("Budget", f"$ {budget:.2f}")
179
- if entry_cost is not None:
180
- roi_sub.add_row("Entry cost", f"$ {entry_cost:.2f}")
181
- if exit_value is not None:
182
- roi_sub.add_row("Exit value", f"$ {exit_value:.2f}")
183
- if return_multiple is not None:
184
- roi_sub.add_row("Return multiple", f"{return_multiple:.2f}x")
185
-
186
- # Checkpoints list
187
- checkpoints_sub = Table(
188
- show_header=False,
189
- box=None,
190
- title="Demo Checkpoints",
191
- title_style="bold",
192
- )
193
- checkpoints_sub.add_column("State", no_wrap=True)
194
-
195
- labels = [
196
- ("multi_thread_view", "Threads visible"),
197
- ("bluff_detected", "Bluff flagged"),
198
- ("dead_route_seen", "Dead route surfaced"),
199
- ("route_confirmed", "Route confirmed"),
200
- ("execution_complete", "Executed & logged"),
201
- ]
202
- for key, label in labels:
203
- done = checkpoints.get(key, False)
204
- style = "green" if done else "dim"
205
- marker = "●" if done else "○"
206
- checkpoints_sub.add_row(f"[{style}]{marker} {label}[/{style}]")
207
-
208
- table.add_row(routes_sub, roi_sub, checkpoints_sub)
209
- return Panel(
210
- table,
211
- title="ArbitrAgent — $20 → Multi-Route Arbitrage",
212
- border_style="cyan",
213
- padding=(1, 1),
214
- )
215
 
216
 
217
  __all__ = ["NegotiationDisplay", "ThreadState", "ThreadMessage"]
218
-
 
1
+ """
2
+ Rich terminal UI for the ArbitrAgent demo.
3
+
4
+ Panel 1: NEGOTIATION THREADS — one row per seller (name, item, current offer, status).
5
+ Panel 2: LIVE EVENT LOG — scrolling [BLUFF DETECTED], [GOOD OUTCOME], [HUMAN-ALIGNED MOVE], [ROUTE KILLED].
6
+ Panel 3: ROUTE GRAPH — route_id, entry, exit, score, status.
7
+ Panel 4: FINAL RESULT — Budget → Deployed → Final Value → Return, route and why.
8
+ """
9
+
10
  from __future__ import annotations
11
 
12
  from dataclasses import dataclass, field
13
  from typing import Any, Dict, List, Optional
14
 
 
15
  from rich.console import Console
16
  from rich.panel import Panel
17
  from rich.table import Table
 
21
  @dataclass
22
  class ThreadMessage:
23
  turn: int
24
+ sender: str
25
  text: str
26
  is_bluff: bool = False
27
 
 
31
  seller_id: str
32
  item: str
33
  archetype: str
34
+ status: str = "active" # "active" | "pending" | "confirmed" | "dead"
35
+ current_offer: Optional[float] = None
36
  messages: List[ThreadMessage] = field(default_factory=list)
37
  bluff_signals: Optional[Dict[str, float]] = None
38
 
39
 
40
+ # Event types for the live event log
41
+ BluffDetectedEvent = Dict[str, Any] # seller_name, turn, timing_tell, size_tell, formulaic_tell, pattern_tell, action_taken
42
+ GoodOutcomeEvent = Dict[str, Any] # route_id, entry_cost, exit_value, return_multiple, did_not_accept_floor
43
+ HumanAlignedEvent = Dict[str, Any] # phase_name, action_taken, similarity_pct
44
+ RouteKilledEvent = Dict[str, Any] # seller_name, reason, capital_preserved
45
+
46
+
47
+ def _status_style(status: str) -> str:
48
+ if status == "confirmed":
49
+ return "green"
50
+ if status == "active":
51
+ return "yellow"
52
+ if status == "dead":
53
+ return "red"
54
+ return "white" # pending
55
+
56
+
57
  class NegotiationDisplay:
58
  """
59
+ Live terminal UI: negotiation threads, event log, route graph, final result.
 
 
 
 
 
 
60
  """
61
 
62
  def __init__(self, console: Optional[Console] = None) -> None:
 
67
  threads: List[ThreadState],
68
  route_summaries: List[Dict[str, Any]],
69
  budget: float,
70
+ event_log: Optional[List[Dict[str, Any]]] = None,
71
  final_metrics: Optional[Dict[str, Any]] = None,
72
  checkpoints: Optional[Dict[str, bool]] = None,
73
  ) -> None:
 
74
  self.console.clear()
75
 
76
+ # Panel 1 NEGOTIATION THREADS
77
+ threads_table = Table(
78
+ show_header=True,
79
+ header_style="bold",
80
+ title="NEGOTIATION THREADS",
81
+ title_style="bold",
 
 
 
 
82
  )
83
+ threads_table.add_column("Seller", no_wrap=True)
84
+ threads_table.add_column("Item", no_wrap=True)
85
+ threads_table.add_column("Current offer", justify="right", no_wrap=True)
86
+ threads_table.add_column("Status", no_wrap=True)
87
+ for t in threads:
88
+ offer_str = f"${t.current_offer:.2f}" if t.current_offer is not None else "—"
89
+ style = _status_style(t.status)
90
+ threads_table.add_row(
91
+ t.seller_id,
92
+ t.item,
93
+ offer_str,
94
+ f"[{style}]{t.status}[/{style}]",
95
+ )
96
+ self.console.print(Panel(threads_table, border_style="cyan", padding=(0, 1)))
97
  self.console.print()
 
98
 
99
+ # Panel 2 — LIVE EVENT LOG (scrolling, last N events)
100
+ events = event_log or []
101
+ log_lines: List[Any] = []
102
+ for ev in events[-30:]:
103
+ kind = ev.get("type") or ev.get("event")
104
+ if kind == "bluff_detected":
105
+ log_lines.append(Text("[BLUFF DETECTED]", style="bold yellow"))
106
+ log_lines.append(Text(f" {ev.get('seller_name', ev.get('seller_id', ''))}, turn {ev.get('turn', '')}"))
107
+ log_lines.append(Text(f" ✦ timing tell: {ev.get('timing_tell', 0):.2f}"))
108
+ log_lines.append(Text(f" ✦ size tell: {ev.get('size_tell', 0):.2f}"))
109
+ log_lines.append(Text(f" ✦ formulaic tell: {ev.get('formulaic_tell', 0):.2f}"))
110
+ log_lines.append(Text(f" ✦ pattern tell: {ev.get('pattern_tell', 0):.2f}"))
111
+ log_lines.append(Text(f" → action taken: {ev.get('action_taken', '')[:80]}..."))
112
+ log_lines.append(Text(""))
113
+ elif kind == "good_outcome":
114
+ log_lines.append(Text("[GOOD OUTCOME]", style="bold green"))
115
+ log_lines.append(Text(f" route {ev.get('route_id', '')}, entry ${ev.get('entry_cost', 0):.2f}, exit ${ev.get('exit_value', 0):.2f}, return {ev.get('return_multiple', 0):.2f}x"))
116
+ log_lines.append(Text(" ✦ did not accept stated floor"))
117
+ log_lines.append(Text(""))
118
+ elif kind == "human_aligned":
119
+ log_lines.append(Text("[HUMAN-ALIGNED MOVE]", style="bold blue"))
120
+ log_lines.append(Text(f" {ev.get('phase_name', '')}: {str(ev.get('action_taken', ''))[:60]}..."))
121
+ log_lines.append(Text(f" ✦ matches human Diplomacy pattern: {ev.get('similarity_pct', 0):.0f}% similarity"))
122
+ log_lines.append(Text(""))
123
+ elif kind == "route_killed":
124
+ log_lines.append(Text("[ROUTE KILLED]", style="bold red"))
125
+ log_lines.append(Text(f" {ev.get('seller_name', ev.get('seller_id', ''))}, {ev.get('reason', '')}"))
126
+ log_lines.append(Text(" ✦ capital preserved, pivoting"))
127
+ log_lines.append(Text(""))
128
+
129
+ if log_lines:
130
+ log_content = Text()
131
+ for line in log_lines:
132
+ log_content.append_text(line)
133
+ log_content.append("\n")
134
+ self.console.print(Panel(log_content, title="LIVE EVENT LOG", border_style="dim", padding=(0, 1), height=14))
135
+ else:
136
+ self.console.print(Panel("(no events yet)", title="LIVE EVENT LOG", border_style="dim", padding=(0, 1), height=6))
137
+ self.console.print()
138
 
139
+ # Panel 3 ROUTE GRAPH
140
+ route_table = Table(
141
+ show_header=True,
142
+ header_style="bold",
143
+ title="ROUTE GRAPH",
144
+ title_style="bold",
145
+ )
146
+ route_table.add_column("route_id", no_wrap=True)
147
+ route_table.add_column("entry", justify="right", no_wrap=True)
148
+ route_table.add_column("exit", justify="right", no_wrap=True)
149
+ route_table.add_column("score", justify="right", no_wrap=True)
150
+ route_table.add_column("status", no_wrap=True)
151
+ for row in route_summaries:
152
+ st = row.get("status", "soft")
153
+ route_table.add_row(
154
+ row.get("edge_id", ""),
155
+ f"${row.get('entry_cost', 0):.2f}",
156
+ f"${row.get('exit_value', 0):.2f}",
157
+ f"{row.get('score', 0):.2f}",
158
+ f"[{_status_style(st)}]{st}[/{_status_style(st)}]",
159
+ )
160
+ self.console.print(Panel(route_table, border_style="cyan", padding=(0, 1)))
161
+ self.console.print()
162
 
163
+ # Panel 4 — FINAL RESULT (when available)
164
+ if final_metrics is not None:
165
+ entry = final_metrics.get("entry_cost")
166
+ exit_val = final_metrics.get("exit_value")
167
+ ret = final_metrics.get("return_multiple")
168
+ route_id = final_metrics.get("route_id", "")
169
+ why = final_metrics.get("why", "best scored confirmed route")
170
+ line1 = f"Budget: ${budget:.1f} → Deployed: ${entry:.2f} → Final Value: ${exit_val:.2f} → Return: {ret:.2f}x"
171
+ line2 = f"Route: {route_id} — {why}"
172
+ self.console.print(Panel(f"[bold]{line1}[/bold]\n\n{line2}", title="FINAL RESULT", border_style="green", padding=(1, 2)))
173
+ elif checkpoints and checkpoints.get("execution_complete"):
174
+ self.console.print(Panel("No route executed. Capital preserved.", title="FINAL RESULT", border_style="yellow", padding=(1, 2)))
175
+
176
+ # Legacy API: build thread panel per thread (for side-by-side thread view if needed)
177
+ def _build_thread_panel(self, thread: ThreadState) -> Panel:
178
+ border_style = _status_style(thread.status)
179
+ title = f"{thread.seller_id} • {thread.item}"
180
  table = Table.grid(padding=(0, 1))
 
181
  table.add_column("Speaker", style="bold", no_wrap=True)
182
  table.add_column("Text", overflow="fold")
183
+ for msg in thread.messages[-6:]:
 
 
184
  speaker = "you" if msg.sender == "agent" else "seller"
185
  style = "cyan" if msg.sender == "agent" else "white"
186
  text = Text(msg.text, style=style)
187
  if msg.is_bluff:
 
188
  text.stylize("black on yellow")
189
  table.add_row(speaker, text)
 
 
190
  if thread.bluff_signals:
191
+ table.add_row("", f"[yellow]bluff_score={thread.bluff_signals.get('bluff_score', 0):.2f}[/yellow]")
192
+ return Panel(table, title=title, border_style=border_style, padding=(0, 1))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
 
194
 
195
  __all__ = ["NegotiationDisplay", "ThreadState", "ThreadMessage"]
 
demo/run_demo.py CHANGED
@@ -1,3 +1,9 @@
 
 
 
 
 
 
1
  from __future__ import annotations
2
 
3
  import argparse
@@ -8,37 +14,49 @@ import time
8
  from dataclasses import asdict
9
  from typing import Any, Dict, List
10
 
11
- # Ensure project root is on sys.path when run as `python demo/run_demo.py`.
12
  PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
13
  if PROJECT_ROOT not in sys.path:
14
  sys.path.insert(0, PROJECT_ROOT)
15
 
16
- from agent.arbitragent import ArbitrAgent, SellerCandidate # type: ignore
17
  from agent.bluff_detector import analyze_from_sim
18
  from agent.route_graph import RouteEdge
19
  from demo.display import NegotiationDisplay, ThreadMessage, ThreadState
20
  from simulation.scenario import get_scenario
21
 
22
 
 
 
 
 
 
 
 
 
 
 
 
23
  class DemoArbitrAgent(ArbitrAgent):
24
  """
25
- Thin wrapper around ArbitrAgent that:
26
- - Drives the existing five-phase loop.
27
- - Streams state into the Rich-based NegotiationDisplay.
28
- - Builds a structured JSON log of the entire episode.
29
  """
30
 
 
 
 
 
31
  def run_with_display(
32
  self,
33
  budget: float,
34
- sleep_per_tick: float = 0.7,
 
35
  ) -> Dict[str, Any]:
36
  self.budget = float(budget)
37
-
38
  sellers, trade_targets = get_scenario()
39
  display = NegotiationDisplay()
 
40
 
41
- # Checkpoint flags for the demo.
42
  checkpoints: Dict[str, bool] = {
43
  "multi_thread_view": False,
44
  "bluff_detected": False,
@@ -47,111 +65,79 @@ class DemoArbitrAgent(ArbitrAgent):
47
  "execution_complete": False,
48
  }
49
 
50
- # Thread state tracking per seller.
51
  threads: Dict[str, ThreadState] = {}
52
 
53
- def get_thread_for_candidate(cand: SellerCandidate) -> ThreadState:
54
  if cand.seller_id not in threads:
55
  threads[cand.seller_id] = ThreadState(
56
  seller_id=cand.seller_id,
57
  item=cand.item,
58
  archetype=cand.archetype,
 
59
  )
60
  return threads[cand.seller_id]
61
 
62
- # Structured log scaffold.
 
 
 
 
63
  log: Dict[str, Any] = {
64
  "budget": self.budget,
 
 
65
  "events": [],
66
  "routes": [],
67
  "final": {},
68
  "checkpoints": checkpoints,
69
  }
70
-
71
  start_time = time.time()
72
 
73
- # -----------------------------
74
- # Phase 1: Scout + soft inquiry
75
- # -----------------------------
76
  candidates = self._phase1_scout(sellers)
77
-
78
  for cand in candidates:
79
  log["events"].append(
80
- {
81
- "phase": 1,
82
- "type": "candidate_scored",
83
- "seller_id": cand.seller_id,
84
- "item": cand.item,
85
- "score": cand.score,
86
- "listing_price": cand.listing_price,
87
- "resale_value": cand.resale_value,
88
- }
89
  )
90
 
91
- # Open soft inquiries and populate initial threads.
92
  for cand in candidates:
93
- thread = get_thread_for_candidate(cand)
94
  msg = f"hey, is the {cand.item} still available? any room on price?"
95
  resp = cand.sim.step(msg)
96
-
97
- thread.messages.append(
98
- ThreadMessage(turn=cand.sim.turn, sender="agent", text=msg)
99
- )
100
  if resp is not None:
101
- thread.messages.append(
102
- ThreadMessage(turn=cand.sim.turn, sender="seller", text=resp)
103
- )
104
 
105
- log["events"].append(
106
- {
107
- "phase": 1,
108
- "type": "soft_inquiry",
109
- "seller_id": cand.seller_id,
110
- "agent_message": msg,
111
- "seller_response": resp,
112
- }
113
- )
114
-
115
- # Initial multi-thread view.
116
  checkpoints["multi_thread_view"] = True
 
117
  display.render(
118
  threads=list(threads.values()),
119
  route_summaries=self.route_graph.summary(),
120
  budget=self.budget,
 
121
  final_metrics=None,
122
  checkpoints=checkpoints,
123
  )
124
  time.sleep(sleep_per_tick)
125
 
126
- # -----------------------------
127
- # Phase 2: Route Mapping
128
- # -----------------------------
129
- seller_to_edges = self._phase2_build_routes(
130
- candidates=candidates,
131
- trade_targets=trade_targets,
132
- verbose=False,
133
- )
134
-
135
- # Render after routes created (still soft).
136
  display.render(
137
  threads=list(threads.values()),
138
  route_summaries=self.route_graph.summary(),
139
  budget=self.budget,
 
140
  final_metrics=None,
141
  checkpoints=checkpoints,
142
  )
143
  time.sleep(sleep_per_tick)
144
 
145
- # -----------------------------
146
- # Phase 3: Pressure & Confirm
147
- # -----------------------------
148
- if candidates:
149
- max_turn = max(t["confirmed_at_turn"] for t in trade_targets)
150
- else:
151
- max_turn = 0
152
-
153
  for turn in range(2, max_turn + 1):
154
- # Which downstream trade targets are confirmed by this turn?
155
  confirmed_targets = {
156
  (t["item"], idx)
157
  for idx, t in enumerate(trade_targets)
@@ -159,66 +145,46 @@ class DemoArbitrAgent(ArbitrAgent):
159
  }
160
 
161
  for cand in candidates:
162
- edges_for_seller: List[RouteEdge] = seller_to_edges.get(
163
- cand.seller_id, []
164
- )
165
 
166
- # Track threads even if seller has no explicit route edges (e.g., ghoster).
167
- thread = get_thread_for_candidate(cand)
168
-
169
- # Death / ghosting.
170
  if cand.sim.is_dead():
171
  if thread.status != "dead":
172
  thread.status = "dead"
173
  checkpoints["dead_route_seen"] = True
174
- log["events"].append(
175
- {
176
- "phase": 3,
177
- "turn": turn,
178
- "type": "route_dead",
179
- "seller_id": cand.seller_id,
180
- }
181
- )
182
- # If there are edges, mark them dead in the graph.
183
  for edge in edges_for_seller:
184
  self.route_graph.mark_dead(edge.edge_id)
185
  continue
186
 
187
- # Do we have a confirmed downstream target by this turn?
188
  has_confirmed_downstream = any(
189
- (edge.buy_item, int(edge.trade_target_id.split("_")[1]))
190
- in confirmed_targets
191
- for edge in edges_for_seller
192
  )
193
-
194
  if has_confirmed_downstream:
195
  agent_msg = (
196
  f"i have another buyer interested in the {cand.item}, "
197
- "but i'd prefer to buy from you if we can make the numbers work. "
198
- "could you do a bit better on price?"
199
  )
200
  else:
201
- agent_msg = (
202
- f"just checking back on the {cand.item} — any flexibility on your price at all?"
203
- )
204
 
205
  resp = cand.sim.step(agent_msg)
206
-
207
- # Log messages into thread.
208
- thread.messages.append(
209
- ThreadMessage(turn=cand.sim.turn, sender="agent", text=agent_msg)
210
- )
211
  if resp is not None:
212
- thread.messages.append(
213
- ThreadMessage(turn=cand.sim.turn, sender="seller", text=resp)
214
- )
215
 
216
- # Bluff analysis if we got a response.
217
  if resp is not None:
218
  signals = analyze_from_sim(cand.sim, resp)
219
  if signals.is_bluff:
220
  checkpoints["bluff_detected"] = True
221
- # Mark the most recent seller message as bluff-highlighted.
222
  thread.messages[-1].is_bluff = True
223
  thread.bluff_signals = {
224
  "timing_tell": signals.timing_tell,
@@ -227,70 +193,81 @@ class DemoArbitrAgent(ArbitrAgent):
227
  "pattern_tell": signals.pattern_tell,
228
  "bluff_score": signals.bluff_score,
229
  }
230
- log["events"].append(
231
- {
232
- "phase": 3,
233
- "turn": turn,
234
- "type": "bluff_detected",
235
- "seller_id": cand.seller_id,
236
- "message": resp,
237
- "signals": asdict(signals),
238
- }
 
 
 
 
 
 
 
 
 
 
239
  )
240
-
241
- log["events"].append(
242
- {
243
- "phase": 3,
244
- "turn": turn,
245
- "type": "negotiation_turn",
246
- "seller_id": cand.seller_id,
247
- "agent_message": agent_msg,
248
- "seller_response": resp,
249
- }
250
- )
251
-
252
- # Update entry cost with latest offer.
 
 
 
 
253
  for edge in edges_for_seller:
254
  self.route_graph.update_entry_cost(edge.edge_id, cand.sim.current_offer)
255
 
256
- # If seller ghosted after this message, mark dead.
257
  if cand.sim.is_dead():
258
  if thread.status != "dead":
259
  thread.status = "dead"
260
  checkpoints["dead_route_seen"] = True
 
 
 
 
 
 
261
  for edge in edges_for_seller:
262
  self.route_graph.mark_dead(edge.edge_id)
263
  continue
264
 
265
- # Upgrade confirmation probability when downstream target has confirmed.
266
  for edge in edges_for_seller:
267
  target_index = int(edge.trade_target_id.split("_")[1])
268
  if (edge.buy_item, target_index) in confirmed_targets:
269
- self.route_graph.update_confirmation_probability(
270
- edge.edge_id, confirmation_probability=0.9
271
- )
272
  self.route_graph.mark_confirmed(edge.edge_id)
273
  thread.status = "confirmed"
274
  checkpoints["route_confirmed"] = True
275
 
276
- # Render this turn.
277
  display.render(
278
  threads=list(threads.values()),
279
  route_summaries=self.route_graph.summary(),
280
  budget=self.budget,
 
281
  final_metrics=None,
282
  checkpoints=checkpoints,
283
  )
284
  time.sleep(sleep_per_tick)
285
 
286
- # -----------------------------
287
- # Phase 4: Route Scoring
288
- # -----------------------------
289
  self.route_graph.prune_below_threshold()
290
-
291
- # -----------------------------
292
- # Phase 5: Execute
293
- # -----------------------------
294
  best = self.route_graph.best_route()
295
  route_summary = self.route_graph.summary()
296
  log["routes"] = route_summary
@@ -303,12 +280,11 @@ class DemoArbitrAgent(ArbitrAgent):
303
  "return_multiple": 1.0,
304
  "duration_seconds": time.time() - start_time,
305
  }
 
306
  else:
307
  profit = best.exit_value - best.entry_cost
308
  final_value = self.budget - best.entry_cost + best.exit_value
309
- route_multiple = (
310
- best.exit_value / best.entry_cost if best.entry_cost > 0 else 0.0
311
- )
312
  final = {
313
  "best_route": {
314
  "edge_id": best.edge_id,
@@ -322,64 +298,57 @@ class DemoArbitrAgent(ArbitrAgent):
322
  "return_multiple": route_multiple,
323
  "duration_seconds": time.time() - start_time,
324
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
325
 
326
  checkpoints["execution_complete"] = True
327
  log["final"] = final
328
 
329
- # Final render with ROI panel filled.
330
- final_route = None
331
- if final["best_route"] is not None:
332
- final_route = final["best_route"]
333
  display.render(
334
  threads=list(threads.values()),
335
  route_summaries=route_summary,
336
  budget=self.budget,
337
- final_metrics={
338
- "entry_cost": final_route["entry_cost"] if final_route else None,
339
- "exit_value": final_route["exit_value"] if final_route else None,
340
- "return_multiple": final["return_multiple"],
341
- },
342
  checkpoints=checkpoints,
343
  )
344
-
345
  return log
346
 
347
 
348
  def main() -> None:
349
- parser = argparse.ArgumentParser(
350
- description="Run the ArbitrAgent Rich demo (90-second negotiation walkthrough)."
351
- )
352
- parser.add_argument(
353
- "--budget",
354
- type=float,
355
- default=20.0,
356
- help="Starting cash budget for the agent (default: 20.0).",
357
- )
358
- parser.add_argument(
359
- "--sleep",
360
- type=float,
361
- default=15.0,
362
- help="Seconds to pause between display updates (default: 15.0, ~90s total demo).",
363
- )
364
- parser.add_argument(
365
- "--log-path",
366
- type=str,
367
- default=None,
368
- help="Optional path to write the structured JSON log. If omitted, prints to stdout only.",
369
- )
370
  args = parser.parse_args()
371
 
372
- agent = DemoArbitrAgent(budget=args.budget, min_route_score=1.0)
373
- log = agent.run_with_display(budget=args.budget, sleep_per_tick=args.sleep)
 
 
374
 
375
  json_str = json.dumps(log, indent=2, default=float)
376
- if args.log_path:
377
- with open(args.log_path, "w") as f:
378
- f.write(json_str)
379
  print("\n=== Structured Demo Log (JSON) ===")
380
- print(json_str)
 
381
 
382
 
383
  if __name__ == "__main__":
384
  main()
385
-
 
1
+ """
2
+ Demo entry point: budget, scenario, full 5-phase agent loop with Rich display.
3
+ Loads unified_final checkpoint if present, else phase2_final. Saves log to demo/sample_run_log.json.
4
+ Must complete in under 90 seconds.
5
+ """
6
+
7
  from __future__ import annotations
8
 
9
  import argparse
 
14
  from dataclasses import asdict
15
  from typing import Any, Dict, List
16
 
 
17
  PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
18
  if PROJECT_ROOT not in sys.path:
19
  sys.path.insert(0, PROJECT_ROOT)
20
 
21
+ from agent.arbitragent import ArbitrAgent, SellerCandidate
22
  from agent.bluff_detector import analyze_from_sim
23
  from agent.route_graph import RouteEdge
24
  from demo.display import NegotiationDisplay, ThreadMessage, ThreadState
25
  from simulation.scenario import get_scenario
26
 
27
 
28
+ def _resolve_checkpoint_path() -> str | None:
29
+ """Unified_final if exists, else phase2_final."""
30
+ unified = os.path.join(PROJECT_ROOT, "training", "checkpoints", "unified_final")
31
+ phase2 = os.path.join(PROJECT_ROOT, "training", "checkpoints", "phase2_final")
32
+ if os.path.isdir(unified):
33
+ return unified
34
+ if os.path.isdir(phase2):
35
+ return phase2
36
+ return None
37
+
38
+
39
  class DemoArbitrAgent(ArbitrAgent):
40
  """
41
+ Runs full 5-phase loop with display and event log.
42
+ Uses checkpoint path for future model loading; currently heuristic agent.
 
 
43
  """
44
 
45
+ def __init__(self, budget: float = 20.0, min_route_score: float = 1.0, checkpoint_path: str | None = None):
46
+ super().__init__(budget=budget, min_route_score=min_route_score)
47
+ self.checkpoint_path = checkpoint_path or _resolve_checkpoint_path()
48
+
49
  def run_with_display(
50
  self,
51
  budget: float,
52
+ scenario: str = "standard_demo",
53
+ sleep_per_tick: float = 0.5,
54
  ) -> Dict[str, Any]:
55
  self.budget = float(budget)
 
56
  sellers, trade_targets = get_scenario()
57
  display = NegotiationDisplay()
58
+ event_log: List[Dict[str, Any]] = []
59
 
 
60
  checkpoints: Dict[str, bool] = {
61
  "multi_thread_view": False,
62
  "bluff_detected": False,
 
65
  "execution_complete": False,
66
  }
67
 
 
68
  threads: Dict[str, ThreadState] = {}
69
 
70
+ def get_thread(cand: SellerCandidate) -> ThreadState:
71
  if cand.seller_id not in threads:
72
  threads[cand.seller_id] = ThreadState(
73
  seller_id=cand.seller_id,
74
  item=cand.item,
75
  archetype=cand.archetype,
76
+ current_offer=cand.sim.current_offer,
77
  )
78
  return threads[cand.seller_id]
79
 
80
+ def sync_offers():
81
+ for c in candidates:
82
+ t = get_thread(c)
83
+ t.current_offer = c.sim.current_offer
84
+
85
  log: Dict[str, Any] = {
86
  "budget": self.budget,
87
+ "scenario": scenario,
88
+ "checkpoint_path": self.checkpoint_path,
89
  "events": [],
90
  "routes": [],
91
  "final": {},
92
  "checkpoints": checkpoints,
93
  }
 
94
  start_time = time.time()
95
 
96
+ # Phase 1
 
 
97
  candidates = self._phase1_scout(sellers)
 
98
  for cand in candidates:
99
  log["events"].append(
100
+ {"phase": 1, "type": "candidate_scored", "seller_id": cand.seller_id, "item": cand.item, "score": cand.score}
 
 
 
 
 
 
 
 
101
  )
102
 
 
103
  for cand in candidates:
104
+ thread = get_thread(cand)
105
  msg = f"hey, is the {cand.item} still available? any room on price?"
106
  resp = cand.sim.step(msg)
107
+ thread.messages.append(ThreadMessage(turn=cand.sim.turn, sender="agent", text=msg))
 
 
 
108
  if resp is not None:
109
+ thread.messages.append(ThreadMessage(turn=cand.sim.turn, sender="seller", text=resp))
110
+ thread.current_offer = cand.sim.current_offer
111
+ log["events"].append({"phase": 1, "type": "soft_inquiry", "seller_id": cand.seller_id, "agent_message": msg, "seller_response": resp})
112
 
 
 
 
 
 
 
 
 
 
 
 
113
  checkpoints["multi_thread_view"] = True
114
+ sync_offers()
115
  display.render(
116
  threads=list(threads.values()),
117
  route_summaries=self.route_graph.summary(),
118
  budget=self.budget,
119
+ event_log=event_log,
120
  final_metrics=None,
121
  checkpoints=checkpoints,
122
  )
123
  time.sleep(sleep_per_tick)
124
 
125
+ # Phase 2
126
+ seller_to_edges = self._phase2_build_routes(candidates=candidates, trade_targets=trade_targets, verbose=False)
127
+ sync_offers()
 
 
 
 
 
 
 
128
  display.render(
129
  threads=list(threads.values()),
130
  route_summaries=self.route_graph.summary(),
131
  budget=self.budget,
132
+ event_log=event_log,
133
  final_metrics=None,
134
  checkpoints=checkpoints,
135
  )
136
  time.sleep(sleep_per_tick)
137
 
138
+ # Phase 3
139
+ max_turn = max(t["confirmed_at_turn"] for t in trade_targets) if candidates else 0
 
 
 
 
 
 
140
  for turn in range(2, max_turn + 1):
 
141
  confirmed_targets = {
142
  (t["item"], idx)
143
  for idx, t in enumerate(trade_targets)
 
145
  }
146
 
147
  for cand in candidates:
148
+ edges_for_seller: List[RouteEdge] = seller_to_edges.get(cand.seller_id, [])
149
+ thread = get_thread(cand)
 
150
 
 
 
 
 
151
  if cand.sim.is_dead():
152
  if thread.status != "dead":
153
  thread.status = "dead"
154
  checkpoints["dead_route_seen"] = True
155
+ event_log.append({
156
+ "type": "route_killed",
157
+ "seller_name": cand.seller_id,
158
+ "reason": "ghosting",
159
+ "capital_preserved": True,
160
+ })
161
+ log["events"].append({"phase": 3, "turn": turn, "type": "route_dead", "seller_id": cand.seller_id})
 
 
162
  for edge in edges_for_seller:
163
  self.route_graph.mark_dead(edge.edge_id)
164
  continue
165
 
 
166
  has_confirmed_downstream = any(
167
+ (e.buy_item, int(e.trade_target_id.split("_")[1])) in confirmed_targets
168
+ for e in edges_for_seller
 
169
  )
 
170
  if has_confirmed_downstream:
171
  agent_msg = (
172
  f"i have another buyer interested in the {cand.item}, "
173
+ "but i'd prefer to buy from you if we can make the numbers work. could you do a bit better on price?"
 
174
  )
175
  else:
176
+ agent_msg = f"just checking back on the {cand.item} — any flexibility on your price at all?"
 
 
177
 
178
  resp = cand.sim.step(agent_msg)
179
+ thread.messages.append(ThreadMessage(turn=cand.sim.turn, sender="agent", text=agent_msg))
 
 
 
 
180
  if resp is not None:
181
+ thread.messages.append(ThreadMessage(turn=cand.sim.turn, sender="seller", text=resp))
182
+ thread.current_offer = cand.sim.current_offer
 
183
 
 
184
  if resp is not None:
185
  signals = analyze_from_sim(cand.sim, resp)
186
  if signals.is_bluff:
187
  checkpoints["bluff_detected"] = True
 
188
  thread.messages[-1].is_bluff = True
189
  thread.bluff_signals = {
190
  "timing_tell": signals.timing_tell,
 
193
  "pattern_tell": signals.pattern_tell,
194
  "bluff_score": signals.bluff_score,
195
  }
196
+ event_log.append({
197
+ "type": "bluff_detected",
198
+ "seller_name": cand.seller_id,
199
+ "turn": cand.sim.turn,
200
+ "timing_tell": signals.timing_tell,
201
+ "size_tell": signals.size_tell,
202
+ "formulaic_tell": signals.formulaic_tell,
203
+ "pattern_tell": signals.pattern_tell,
204
+ "action_taken": "coalition pressure (see next message)",
205
+ })
206
+ log["events"].append({
207
+ "phase": 3, "turn": turn, "type": "bluff_detected",
208
+ "seller_id": cand.seller_id, "message": resp, "signals": asdict(signals),
209
+ })
210
+ # Coalition pressure: floor - 4
211
+ offer = max(1, int(float(cand.sim.current_offer) - 4))
212
+ pressure_msg = (
213
+ "I have a trade offer from another seller that makes this less urgent for me — "
214
+ f"can you do ${offer}?"
215
  )
216
+ pressure_resp = cand.sim.step(pressure_msg)
217
+ thread.messages.append(ThreadMessage(turn=cand.sim.turn, sender="agent", text=pressure_msg))
218
+ if pressure_resp is not None:
219
+ thread.messages.append(ThreadMessage(turn=cand.sim.turn, sender="seller", text=pressure_resp))
220
+ thread.current_offer = cand.sim.current_offer
221
+ event_log[-1]["action_taken"] = pressure_msg
222
+ for edge in edges_for_seller:
223
+ self.route_graph.update_entry_cost(edge.edge_id, cand.sim.current_offer)
224
+ for edge in edges_for_seller:
225
+ self.route_graph.update_confirmation_probability(
226
+ edge.edge_id, confirmation_probability=min(1.0, edge.confirmation_probability + 0.15)
227
+ )
228
+
229
+ log["events"].append({
230
+ "phase": 3, "turn": turn, "type": "negotiation_turn",
231
+ "seller_id": cand.seller_id, "agent_message": agent_msg, "seller_response": resp,
232
+ })
233
  for edge in edges_for_seller:
234
  self.route_graph.update_entry_cost(edge.edge_id, cand.sim.current_offer)
235
 
 
236
  if cand.sim.is_dead():
237
  if thread.status != "dead":
238
  thread.status = "dead"
239
  checkpoints["dead_route_seen"] = True
240
+ event_log.append({
241
+ "type": "route_killed",
242
+ "seller_name": cand.seller_id,
243
+ "reason": "stopped responding",
244
+ "capital_preserved": True,
245
+ })
246
  for edge in edges_for_seller:
247
  self.route_graph.mark_dead(edge.edge_id)
248
  continue
249
 
 
250
  for edge in edges_for_seller:
251
  target_index = int(edge.trade_target_id.split("_")[1])
252
  if (edge.buy_item, target_index) in confirmed_targets:
253
+ self.route_graph.update_confirmation_probability(edge.edge_id, confirmation_probability=0.9)
 
 
254
  self.route_graph.mark_confirmed(edge.edge_id)
255
  thread.status = "confirmed"
256
  checkpoints["route_confirmed"] = True
257
 
258
+ sync_offers()
259
  display.render(
260
  threads=list(threads.values()),
261
  route_summaries=self.route_graph.summary(),
262
  budget=self.budget,
263
+ event_log=event_log,
264
  final_metrics=None,
265
  checkpoints=checkpoints,
266
  )
267
  time.sleep(sleep_per_tick)
268
 
269
+ # Phase 4 & 5
 
 
270
  self.route_graph.prune_below_threshold()
 
 
 
 
271
  best = self.route_graph.best_route()
272
  route_summary = self.route_graph.summary()
273
  log["routes"] = route_summary
 
280
  "return_multiple": 1.0,
281
  "duration_seconds": time.time() - start_time,
282
  }
283
+ final_metrics_display = None
284
  else:
285
  profit = best.exit_value - best.entry_cost
286
  final_value = self.budget - best.entry_cost + best.exit_value
287
+ route_multiple = best.exit_value / best.entry_cost if best.entry_cost > 0 else 0.0
 
 
288
  final = {
289
  "best_route": {
290
  "edge_id": best.edge_id,
 
298
  "return_multiple": route_multiple,
299
  "duration_seconds": time.time() - start_time,
300
  }
301
+ event_log.append({
302
+ "type": "good_outcome",
303
+ "route_id": best.edge_id,
304
+ "entry_cost": best.entry_cost,
305
+ "exit_value": best.exit_value,
306
+ "return_multiple": route_multiple,
307
+ "did_not_accept_floor": checkpoints.get("bluff_detected", False),
308
+ })
309
+ final_metrics_display = {
310
+ "entry_cost": best.entry_cost,
311
+ "exit_value": best.exit_value,
312
+ "return_multiple": route_multiple,
313
+ "route_id": best.edge_id,
314
+ "why": "best scored confirmed route (bluff detected and pressure applied)" if checkpoints.get("bluff_detected") else "best scored confirmed route",
315
+ }
316
 
317
  checkpoints["execution_complete"] = True
318
  log["final"] = final
319
 
 
 
 
 
320
  display.render(
321
  threads=list(threads.values()),
322
  route_summaries=route_summary,
323
  budget=self.budget,
324
+ event_log=event_log,
325
+ final_metrics=final_metrics_display,
 
 
 
326
  checkpoints=checkpoints,
327
  )
 
328
  return log
329
 
330
 
331
  def main() -> None:
332
+ parser = argparse.ArgumentParser(description="Run ArbitrAgent demo (full 5-phase loop, <90s).")
333
+ parser.add_argument("--budget", type=float, default=20.0, help="Starting budget (default: 20).")
334
+ parser.add_argument("--scenario", type=str, default="standard_demo", help="Scenario name (default: standard_demo).")
335
+ parser.add_argument("--sleep", type=float, default=0.5, help="Seconds per display tick (default: 0.5).")
336
+ parser.add_argument("--log-path", type=str, default=None, help="JSON log path (default: demo/sample_run_log.json).")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
337
  args = parser.parse_args()
338
 
339
+ log_path = args.log_path or os.path.join(PROJECT_ROOT, "demo", "sample_run_log.json")
340
+ checkpoint_path = _resolve_checkpoint_path()
341
+ agent = DemoArbitrAgent(budget=args.budget, min_route_score=1.0, checkpoint_path=checkpoint_path)
342
+ log = agent.run_with_display(budget=args.budget, scenario=args.scenario, sleep_per_tick=args.sleep)
343
 
344
  json_str = json.dumps(log, indent=2, default=float)
345
+ os.makedirs(os.path.dirname(log_path), exist_ok=True)
346
+ with open(log_path, "w") as f:
347
+ f.write(json_str)
348
  print("\n=== Structured Demo Log (JSON) ===")
349
+ print(f"Saved to {log_path}")
350
+ print(json_str[:2000] + "..." if len(json_str) > 2000 else json_str)
351
 
352
 
353
  if __name__ == "__main__":
354
  main()
 
envs/arbitragent_env.py ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ArbitrAgentEnv — OpenEnv 0.2.1 unified environment with three reward signals.
3
+
4
+ total_reward = 0.35 * accuracy_reward + 0.35 * outcome_reward + 0.30 * bluff_reward
5
+
6
+ - accuracy_reward: cosine similarity between model action embedding and human action embedding.
7
+ - outcome_reward: keyword scoring (coalition, pressure, clean close vs premature concession).
8
+ - bluff_reward: BluffDetector on synthetic seller message; reward correct flag, penalize missed bluff.
9
+ """
10
+
11
+ import json
12
+ import random
13
+ import re
14
+ import numpy as np
15
+ from openenv.env import Env
16
+ from sentence_transformers import SentenceTransformer
17
+
18
+ def _extract_human_orders(state_text: str) -> str:
19
+ """Extract 'My orders this turn: ...' from Diplomacy state_text as human action."""
20
+ match = re.search(r"My orders this turn:\s*(.+?)(?:\n|$)", state_text, re.DOTALL | re.IGNORECASE)
21
+ if match:
22
+ return match.group(1).strip()
23
+ return state_text[:200]
24
+
25
+
26
+ # Synthetic seller message used for bluff_reward when state has no thread (Diplomacy states).
27
+ SYNTHETIC_BLUFF_MESSAGE = "look i really cant go lower than $30, thats my final offer. been getting a lot of interest so"
28
+ SYNTHETIC_BLUFF_PROFILE = {
29
+ "response_speed": "fast",
30
+ "archetype": "bluffer",
31
+ }
32
+ SYNTHETIC_THREAD = [
33
+ {"turn": 1, "agent": "Can you do $35?"},
34
+ {"turn": 1, "seller": "How about $32?"},
35
+ {"turn": 2, "agent": "I need a better price."},
36
+ {"turn": 2, "seller": SYNTHETIC_BLUFF_MESSAGE},
37
+ ]
38
+
39
+
40
+ class ArbitrAgentEnv(Env):
41
+ """
42
+ Single OpenEnv 0.2.1 environment combining accuracy (human move alignment),
43
+ outcome (negotiation language), and bluff (detection) rewards.
44
+ """
45
+
46
+ def __init__(self, data_path: str = "training/data/selfplay_states.json", seed=None):
47
+ self.data_path = data_path
48
+ self.encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
49
+ if seed is not None:
50
+ random.seed(seed)
51
+ np.random.seed(seed)
52
+ with open(data_path, "r") as f:
53
+ self.all_states = json.load(f)
54
+ self.current_state = None
55
+ self.round = 0
56
+ self.max_rounds = 10
57
+ self.done = False
58
+ self._last_reward_breakdown = None
59
+
60
+ def reset(self):
61
+ self.current_state = random.choice(self.all_states)
62
+ self.round = 0
63
+ self.done = False
64
+ self._last_reward_breakdown = None
65
+ obs = self._get_observation()
66
+ info = {
67
+ "round": self.round,
68
+ "phase": self.current_state.get("phase", ""),
69
+ "power": self.current_state.get("power", ""),
70
+ }
71
+ return obs, info
72
+
73
+ def step(self, action: str):
74
+ self.round += 1
75
+ action = action or "(no action)"
76
+ action_lower = action.lower()
77
+
78
+ accuracy = self._accuracy_reward(action)
79
+ outcome = self._outcome_reward(action_lower)
80
+ bluff = self._bluff_reward(action_lower)
81
+
82
+ total = 0.35 * accuracy + 0.35 * outcome + 0.30 * bluff
83
+ self._last_reward_breakdown = {"accuracy": accuracy, "outcome": outcome, "bluff": bluff, "total": total}
84
+
85
+ self.current_state = self._get_next_state()
86
+ self.done = (
87
+ self.round >= self.max_rounds
88
+ or self.current_state.get("is_winner", False)
89
+ or self.current_state.get("is_eliminated", False)
90
+ )
91
+ obs = self._get_observation()
92
+ info = {
93
+ "round": self.round,
94
+ "accuracy": accuracy,
95
+ "outcome": outcome,
96
+ "bluff": bluff,
97
+ "total": total,
98
+ "phase": self.current_state.get("phase", ""),
99
+ "power": self.current_state.get("power", ""),
100
+ }
101
+ return obs, total, self.done, info
102
+
103
+ def _accuracy_reward(self, action: str) -> float:
104
+ """Cosine similarity between action embedding and human action embedding."""
105
+ state_text = self.current_state.get("state_text", "")
106
+ human_action_text = _extract_human_orders(state_text)
107
+ action_emb = self.encoder.encode(action, convert_to_numpy=True)
108
+ human_emb = self.encoder.encode(human_action_text, convert_to_numpy=True)
109
+ dot = float(np.dot(action_emb, human_emb))
110
+ norm_a = float(np.linalg.norm(action_emb)) or 1e-8
111
+ norm_h = float(np.linalg.norm(human_emb)) or 1e-8
112
+ cos = dot / (norm_a * norm_h)
113
+ return float(np.clip(cos, -1.0, 1.0))
114
+
115
+ def _outcome_reward(self, action_lower: str) -> float:
116
+ """Keyword scoring: reward coalition/pressure/clean close; penalize premature concession."""
117
+ reward = 0.0
118
+ # Positive: coalition language
119
+ if any(w in action_lower for w in ["ally", "alliance", "coalition", "support", "another buyer", "trade offer from another"]):
120
+ reward += 0.4
121
+ # Positive: pressure moves
122
+ if any(w in action_lower for w in ["pressure", "leverage", "can you do", "less urgent", "make the numbers work"]):
123
+ reward += 0.3
124
+ # Positive: clean close
125
+ if any(w in action_lower for w in ["deal", "agree", "accept", "close"]):
126
+ reward += 0.2
127
+ # Negative: premature concession (accepting stated floor)
128
+ if any(w in action_lower for w in ["ok $30", "accept 30", "take it at 30", "deal at 30"]):
129
+ reward -= 0.6
130
+ # Negative: accepting stated floor language
131
+ if any(w in action_lower for w in ["final offer", "lowest you can go", "that's your final"]):
132
+ reward -= 0.3
133
+ return float(np.clip(reward, -1.0, 1.0))
134
+
135
+ def _bluff_reward(self, action_lower: str) -> float:
136
+ """Use BluffDetector (learned + rules) on the action text; return bluff_score as reward component."""
137
+ try:
138
+ from agent.bluff_detector import analyze_bluff
139
+ signals = analyze_bluff(
140
+ SYNTHETIC_BLUFF_PROFILE,
141
+ SYNTHETIC_THREAD,
142
+ action_lower,
143
+ turn=2,
144
+ )
145
+ return float(signals.bluff_score)
146
+ except Exception:
147
+ return 0.0
148
+
149
+ def _get_next_state(self):
150
+ current_game_id = self.current_state.get("game_id")
151
+ same_game = [
152
+ s for s in self.all_states
153
+ if s.get("game_id") == current_game_id and s.get("phase") != self.current_state.get("phase")
154
+ ]
155
+ if same_game:
156
+ return random.choice(same_game)
157
+ return random.choice(self.all_states)
158
+
159
+ def _get_state_text(self):
160
+ s = self.current_state
161
+ return f"""ARBITRAGENT UNIFIED ENV — Round {self.round}/{self.max_rounds}
162
+ Phase: {s.get('phase', '')} | Power: {s.get('power', '')}
163
+
164
+ {s.get('state_text', '')}
165
+
166
+ Synthetic seller message (for bluff awareness): "{SYNTHETIC_BLUFF_MESSAGE}"
167
+
168
+ Your task: Propose a move. If you detect a bluff, use coalition pressure; otherwise negotiate toward a good outcome."""
169
+
170
+ def _get_observation(self):
171
+ text = self._get_state_text()
172
+ emb = self.encoder.encode(text, convert_to_numpy=True)
173
+ return emb.astype(np.float32)
174
+
175
+ def render(self):
176
+ text = self._get_state_text()
177
+ if self._last_reward_breakdown:
178
+ text += f"\n\nLast reward breakdown: accuracy={self._last_reward_breakdown['accuracy']:.3f}, outcome={self._last_reward_breakdown['outcome']:.3f}, bluff={self._last_reward_breakdown['bluff']:.3f}, total={self._last_reward_breakdown['total']:.3f}"
179
+ return text
180
+
181
+ def close(self):
182
+ pass
183
+
184
+ @property
185
+ def observation_space(self):
186
+ return {"type": "continuous", "shape": (384,), "dtype": "float32"}
187
+
188
+ @property
189
+ def action_space(self):
190
+ return {"type": "text", "description": "Natural language move + reasoning"}
training/arbitragent_colab.ipynb CHANGED
@@ -1,47 +1,33 @@
1
  {
2
- "nbformat": 4,
3
- "nbformat_minor": 4,
4
- "metadata": {
5
- "colab": {
6
- "provenance": [],
7
- "gpuType": "T4"
8
- },
9
- "kernelspec": {
10
- "display_name": "Python 3",
11
- "language": "python",
12
- "name": "python3"
13
- },
14
- "language_info": {
15
- "name": "python",
16
- "version": "3.10.0"
17
- }
18
- },
19
  "cells": [
20
  {
21
  "cell_type": "markdown",
22
  "metadata": {},
23
  "source": [
24
- "# ArbitrAgent — Curriculum-Trained Negotiation Agent"
 
 
25
  ]
26
  },
27
  {
28
  "cell_type": "code",
29
  "metadata": {},
30
  "source": [
 
31
  "!pip install -q openenv transformers trl datasets sentence-transformers diplomacy torch matplotlib"
32
  ],
33
- "outputs": [],
34
- "execution_count": null
35
  },
36
  {
37
  "cell_type": "code",
38
  "metadata": {},
39
  "source": [
40
- "# Clone repo and set paths (replace with your repo URL)\n",
41
  "import os\n",
42
  "import sys\n",
43
  "import subprocess\n",
44
- "REPO_URL = \"https://github.com/your-username/Play-gent.git\" # or arbitragent\n",
45
  "REPO_NAME = \"Play-gent\" # folder name after clone\n",
46
  "if not os.path.exists(\"envs/diplomacy_env.py\"): # not already in repo\n",
47
  " subprocess.run([\"git\", \"clone\", \"-q\", REPO_URL], check=False)\n",
@@ -51,31 +37,39 @@
51
  "sys.path.insert(0, ROOT)\n",
52
  "print(\"ROOT:\", ROOT)"
53
  ],
54
- "outputs": [],
55
- "execution_count": null
56
  },
57
  {
58
  "cell_type": "code",
59
  "metadata": {},
60
  "source": [
61
- "# Load DiplomacyNegotiationEnv, run reset() and render()\n",
62
- "from envs.diplomacy_env import DiplomacyNegotiationEnv\n",
63
- "\n",
64
- "env = DiplomacyNegotiationEnv(power_name=\"ENGLAND\", seed=42)\n",
 
 
 
 
65
  "obs, info = env.reset()\n",
66
  "print(\"Observation shape:\", obs.shape)\n",
67
  "print(\"Info:\", info)\n",
68
  "print()\n",
69
- "env.render()"
 
 
 
 
70
  ],
71
- "outputs": [],
72
- "execution_count": null
73
  },
74
  {
75
  "cell_type": "code",
76
  "metadata": {},
77
  "source": [
78
- "# Load reward model, score 4 different moves\n",
79
  "import torch\n",
80
  "from transformers import AutoTokenizer\n",
81
  "from reward_model import DiplomacyRewardModel\n",
@@ -105,275 +99,168 @@
105
  " print(f\"Score: {s:.4f} | {m[:60]}...\")\n",
106
  "print(\"\\nReward model loaded and 4 moves scored.\")"
107
  ],
108
- "outputs": [],
109
- "execution_count": null
110
  },
111
  {
112
  "cell_type": "code",
113
  "metadata": {},
114
  "source": [
115
- "# Abbreviated Phase 1 GRPO 20 steps, plot reward curve\n",
116
  "import json\n",
117
  "import numpy as np\n",
118
- "import matplotlib.pyplot as plt\n",
119
  "from datasets import Dataset\n",
120
  "from trl import GRPOTrainer, GRPOConfig\n",
121
  "from transformers import AutoTokenizer\n",
 
 
122
  "\n",
123
- "PHASE1_STEPS = 20\n",
124
- "PHASE1_OUTPUT = \"grpo_phase1_colab\"\n",
125
- "\n",
126
- "# Build prompts from env (no large JSON needed)\n",
127
- "from envs.diplomacy_env import DiplomacyNegotiationEnv\n",
128
- "env = DiplomacyNegotiationEnv(seed=42)\n",
129
- "prompts_list = []\n",
130
- "for _ in range(80):\n",
131
- " env.reset()\n",
132
- " prompts_list.append(env._get_state_text())\n",
133
- "\n",
134
- "dataset = Dataset.from_list([{\"prompt\": p} for p in prompts_list])\n",
135
  "tokenizer = AutoTokenizer.from_pretrained(\"TinyLlama/TinyLlama-1.1B-Chat-v1.0\")\n",
136
  "tokenizer.pad_token = tokenizer.eos_token\n",
 
137
  "\n",
 
138
  "def _extract_completion_text(c):\n",
139
- " if isinstance(c, str):\n",
140
- " return c.strip()\n",
141
- " if isinstance(c, list) and c:\n",
142
- " last = c[-1]\n",
143
- " if isinstance(last, dict) and \"content\" in last:\n",
144
- " return last[\"content\"].strip()\n",
145
  " return \"\"\n",
146
  "\n",
147
- "def make_phase1_reward(reward_model, tokenizer_rm, device):\n",
148
- " def fn(completions, prompts=None, **kwargs):\n",
149
- " if prompts is None:\n",
150
- " prompts = [\"\"] * len(completions)\n",
151
- " texts = [_extract_completion_text(c) for c in completions]\n",
152
- " return [reward_model.score(s, a, tokenizer_rm, device) for s, a in zip(prompts, texts)]\n",
153
- " return fn\n",
154
- "\n",
155
- "config = GRPOConfig(\n",
156
- " output_dir=PHASE1_OUTPUT,\n",
157
- " max_steps=PHASE1_STEPS,\n",
158
- " per_device_train_batch_size=2,\n",
159
- " gradient_accumulation_steps=2,\n",
160
- " learning_rate=5e-6,\n",
161
- " logging_steps=2,\n",
162
- " save_steps=PHASE1_STEPS,\n",
163
- " report_to=\"none\",\n",
164
- " max_completion_length=80,\n",
165
- " num_generations=4,\n",
166
- ")\n",
167
- "\n",
168
- "phase1_reward_log = []\n",
169
- "class Phase1Callback:\n",
170
- " def on_log(self, args, state, control, logs=None, **kwargs):\n",
171
- " if logs and \"reward\" in str(logs):\n",
172
- " for k, v in (logs or {}).items():\n",
173
- " if \"reward\" in k.lower() and isinstance(v, (int, float)):\n",
174
- " phase1_reward_log.append(float(v))\n",
175
- " break\n",
176
- "\n",
177
- "trainer_p1 = GRPOTrainer(\n",
178
- " model=\"TinyLlama/TinyLlama-1.1B-Chat-v1.0\",\n",
179
- " args=config,\n",
180
- " reward_funcs=make_phase1_reward(reward_model, tokenizer_rm, device),\n",
181
- " train_dataset=dataset,\n",
182
- " processing_class=tokenizer,\n",
183
- ")\n",
184
- "trainer_p1.add_callback(Phase1Callback())\n",
185
- "trainer_p1.train()\n",
186
- "trainer_p1.save_model(PHASE1_OUTPUT)\n",
187
- "tokenizer.save_pretrained(PHASE1_OUTPUT)\n",
188
- "if not phase1_reward_log and hasattr(trainer_p1, 'state') and trainer_p1.state.log_history:\n",
189
- " for entry in trainer_p1.state.log_history:\n",
190
- " if isinstance(entry.get(\"reward\"), (int, float)):\n",
191
- " phase1_reward_log.append(float(entry[\"reward\"]))\n",
192
  "\n",
193
- "if phase1_reward_log:\n",
194
- " plt.figure(figsize=(10, 4))\n",
195
- " plt.plot(phase1_reward_log, alpha=0.6, label=\"Step reward\")\n",
196
- " w = min(5, len(phase1_reward_log))\n",
197
- " ma = np.convolve(phase1_reward_log, np.ones(w)/w, mode=\"valid\")\n",
198
- " plt.plot(range(w-1, len(phase1_reward_log)), ma, linewidth=2, label=\"Moving avg\")\n",
199
- " plt.xlabel(\"Step\"); plt.ylabel(\"Reward\"); plt.title(\"Phase 1 GRPO (Diplomacy) Reward Curve\"); plt.legend(); plt.tight_layout(); plt.show()\n",
200
- "else:\n",
201
- " plt.figure(figsize=(6, 3)); plt.text(0.5, 0.5, \"Phase 1 complete (no reward log)\", ha=\"center\"); plt.axis(\"off\"); plt.show()"
202
  ],
203
- "outputs": [],
204
- "execution_count": null
205
  },
206
  {
207
  "cell_type": "code",
208
  "metadata": {},
209
  "source": [
210
- "# Load HumanImitationEnv, run reset() and render()\n",
211
- "import json\n",
212
- "\n",
213
- "# Ensure minimal Phase 2 data exists (from Diplomacy env if no JSON)\n",
214
- "data_path = \"training/data/selfplay_states.json\"\n",
215
- "if not os.path.exists(data_path):\n",
216
- " os.makedirs(\"training/data\", exist_ok=True)\n",
217
- " from envs.diplomacy_env import DiplomacyNegotiationEnv\n",
218
- " env = DiplomacyNegotiationEnv(seed=42)\n",
219
- " fallback = []\n",
220
- " for i in range(100):\n",
221
- " env.reset()\n",
222
- " fallback.append({\n",
223
- " \"game_id\": str(i), \"phase\": \"F1901M\", \"power\": \"ENGLAND\",\n",
224
- " \"state_text\": env._get_state_text(), \"reward\": 0.0, \"sc_count\": 3, \"sc_delta\": 0,\n",
225
- " \"is_winner\": False, \"is_eliminated\": False,\n",
226
- " })\n",
227
- " with open(data_path, \"w\") as f:\n",
228
- " json.dump(fallback, f)\n",
229
- " print(\"Created fallback training/data/selfplay_states.json\")\n",
230
- "\n",
231
- "from envs.human_imitation_env import HumanImitationEnv\n",
232
- "env2 = HumanImitationEnv(data_path=data_path, seed=42)\n",
233
- "obs2, info2 = env2.reset()\n",
234
- "print(\"Observation shape:\", obs2.shape)\n",
235
- "print(\"Info:\", info2)\n",
236
- "print()\n",
237
- "env2.render()"
238
  ],
239
- "outputs": [],
240
- "execution_count": null
241
  },
242
  {
243
  "cell_type": "code",
244
  "metadata": {},
245
  "source": [
246
- "# Abbreviated Phase 2 GRPO 10 steps continuing from Phase 1, plot reward curve\n",
247
- "with open(data_path) as f:\n",
248
- " states_p2 = json.load(f)\n",
249
- "sample_p2 = list(np.random.choice(states_p2, size=min(200, len(states_p2)), replace=False))\n",
250
- "dataset_p2 = Dataset.from_list([{\"prompt\": s[\"state_text\"]} for s in sample_p2])\n",
251
- "\n",
252
- "def compute_reward_p2(completions, prompts=None, **kwargs):\n",
253
- " rewards = []\n",
254
- " for c in completions:\n",
255
- " text = _extract_completion_text(c).lower()\n",
256
- " r = 0.0\n",
257
- " if any(w in text for w in [\"ally\", \"alliance\", \"coalition\", \"support\"]): r += 0.3\n",
258
- " if any(w in text for w in [\"attack\", \"advance\", \"take\", \"capture\"]): r += 0.2\n",
259
- " if any(w in text for w in [\"defend\", \"protect\", \"hold\", \"guard\"]): r += 0.2\n",
260
- " if any(w in text for w in [\"because\", \"therefore\", \"since\", \"strategic\"]): r += 0.2\n",
261
- " if any(w in text for w in [\"bluff\", \"pressure\", \"leverage\", \"signal\"]): r += 0.1\n",
262
- " rewards.append(r)\n",
263
- " return rewards\n",
264
- "\n",
265
- "PHASE2_STEPS = 10\n",
266
- "PHASE2_OUTPUT = \"training/checkpoints/phase2_colab\"\n",
267
- "os.makedirs(PHASE2_OUTPUT, exist_ok=True)\n",
268
- "\n",
269
- "config_p2 = GRPOConfig(\n",
270
- " output_dir=PHASE2_OUTPUT,\n",
271
- " max_steps=PHASE2_STEPS,\n",
272
- " per_device_train_batch_size=2,\n",
273
- " gradient_accumulation_steps=2,\n",
274
- " learning_rate=5e-6,\n",
275
- " logging_steps=2,\n",
276
- " save_steps=PHASE2_STEPS,\n",
277
- " report_to=\"none\",\n",
278
- " max_completion_length=80,\n",
279
- " num_generations=4,\n",
280
- ")\n",
281
- "\n",
282
- "phase2_reward_log = []\n",
283
- "class Phase2Callback:\n",
284
- " def on_log(self, args, state, control, logs=None, **kwargs):\n",
285
- " if logs:\n",
286
- " for k, v in (logs or {}).items():\n",
287
- " if \"reward\" in k.lower() and isinstance(v, (int, float)):\n",
288
- " phase2_reward_log.append(float(v))\n",
289
- " break\n",
290
- "\n",
291
- "trainer_p2 = GRPOTrainer(\n",
292
- " model=PHASE1_OUTPUT,\n",
293
- " args=config_p2,\n",
294
- " reward_funcs=compute_reward_p2,\n",
295
- " train_dataset=dataset_p2,\n",
296
- " processing_class=tokenizer,\n",
297
- ")\n",
298
- "trainer_p2.add_callback(Phase2Callback())\n",
299
- "trainer_p2.train()\n",
300
- "trainer_p2.save_model(PHASE2_OUTPUT)\n",
301
- "tokenizer.save_pretrained(PHASE2_OUTPUT)\n",
302
- "if not phase2_reward_log and hasattr(trainer_p2, 'state') and trainer_p2.state.log_history:\n",
303
- " for entry in trainer_p2.state.log_history:\n",
304
- " if isinstance(entry.get(\"reward\"), (int, float)):\n",
305
- " phase2_reward_log.append(float(entry[\"reward\"]))\n",
306
  "\n",
307
- "if phase2_reward_log:\n",
308
- " plt.figure(figsize=(10, 4))\n",
309
- " plt.plot(phase2_reward_log, alpha=0.6, label=\"Step reward\")\n",
310
- " w = min(5, len(phase2_reward_log))\n",
311
- " ma = np.convolve(phase2_reward_log, np.ones(w)/w, mode=\"valid\")\n",
312
- " plt.plot(range(w-1, len(phase2_reward_log)), ma, linewidth=2, label=\"Moving avg\")\n",
313
- " plt.xlabel(\"Step\"); plt.ylabel(\"Reward\"); plt.title(\"Phase 2 GRPO (Human Imitation) Reward Curve\"); plt.legend(); plt.tight_layout(); plt.show()\n",
 
 
 
 
 
314
  "else:\n",
315
- " plt.figure(figsize=(6, 3)); plt.text(0.5, 0.5, \"Phase 2 complete (no reward log)\", ha=\"center\"); plt.axis(\"off\"); plt.show()"
316
  ],
317
- "outputs": [],
318
- "execution_count": null
319
  },
320
  {
321
  "cell_type": "code",
322
  "metadata": {},
323
  "source": [
324
- "# Plot both curves side by side\n",
325
- "fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))\n",
326
- "if phase1_reward_log:\n",
327
- " ax1.plot(phase1_reward_log, alpha=0.6)\n",
328
- " w = min(5, len(phase1_reward_log))\n",
329
- " ma = np.convolve(phase1_reward_log, np.ones(w)/w, mode=\"valid\")\n",
330
- " ax1.plot(range(w-1, len(phase1_reward_log)), ma, linewidth=2)\n",
331
- "ax1.set_xlabel(\"Step\"); ax1.set_ylabel(\"Reward\"); ax1.set_title(\"Phase 1 (Diplomacy)\")\n",
332
- "if phase2_reward_log:\n",
333
- " ax2.plot(phase2_reward_log, alpha=0.6)\n",
334
- " w = min(5, len(phase2_reward_log))\n",
335
- " ma = np.convolve(phase2_reward_log, np.ones(w)/w, mode=\"valid\")\n",
336
- " ax2.plot(range(w-1, len(phase2_reward_log)), ma, linewidth=2)\n",
337
- "ax2.set_xlabel(\"Step\"); ax2.set_ylabel(\"Reward\"); ax2.set_title(\"Phase 2 (Human Imitation)\")\n",
338
- "plt.suptitle(\"ArbitrAgent Curriculum GRPO Reward Curves\"); plt.tight_layout(); plt.show()"
 
 
 
339
  ],
340
- "outputs": [],
341
- "execution_count": null
342
  },
343
  {
344
  "cell_type": "code",
345
  "metadata": {},
346
  "source": [
347
- "# Side-by-side inference: base TinyLlama vs trained model on same negotiation state\n",
348
- "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
349
- "import torch\n",
350
- "\n",
351
- "negotiation_state = \"DIPLOMACY GAME STATE\\nPhase: F1902M\\nPlaying as: ENGLAND. My units: Fleet LON, Fleet NTH, Army LVP. My supply centers: LON, EDI, LVP (3 centers). Other powers: FRANCE (4), GERMANY (3), RUSSIA (5). What is your next strategic move?\"\n",
352
- "prompt = \"You are a negotiation agent. Current state:\\n\\n\" + negotiation_state + \"\\n\\nYour move (one short paragraph):\"\n",
353
- "\n",
354
- "tok_infer = AutoTokenizer.from_pretrained(\"TinyLlama/TinyLlama-1.1B-Chat-v1.0\")\n",
355
- "tok_infer.pad_token = tok_infer.eos_token\n",
356
- "inp = tok_infer(prompt, return_tensors=\"pt\", truncation=True, max_length=256).to(device)\n",
357
- "\n",
358
- "base_model = AutoModelForCausalLM.from_pretrained(\"TinyLlama/TinyLlama-1.1B-Chat-v1.0\").to(device)\n",
359
- "trained_path = PHASE2_OUTPUT if os.path.isdir(PHASE2_OUTPUT) else PHASE1_OUTPUT\n",
360
- "trained_model = AutoModelForCausalLM.from_pretrained(trained_path).to(device)\n",
361
- "\n",
362
- "with torch.no_grad():\n",
363
- " out_base = base_model.generate(**inp, max_new_tokens=60, do_sample=True, temperature=0.7, pad_token_id=tok_infer.eos_token_id)\n",
364
- " out_trained = trained_model.generate(**inp, max_new_tokens=60, do_sample=True, temperature=0.7, pad_token_id=tok_infer.eos_token_id)\n",
365
- "\n",
366
- "dec_base = tok_infer.decode(out_base[0][inp[\"input_ids\"].shape[1]:], skip_special_tokens=True).strip()\n",
367
- "dec_trained = tok_infer.decode(out_trained[0][inp[\"input_ids\"].shape[1]:], skip_special_tokens=True).strip()\n",
368
- "\n",
369
- "print(\"=== Base TinyLlama ===\")\n",
370
- "print(dec_base)\n",
371
- "print()\n",
372
- "print(\"=== Trained (Phase 1+2) ===\")\n",
373
- "print(dec_trained)"
374
  ],
375
- "outputs": [],
376
- "execution_count": null
377
  },
378
  {
379
  "cell_type": "code",
@@ -405,30 +292,47 @@
405
  "else:\n",
406
  " print(\"No seller response (ghosted).\")"
407
  ],
408
- "outputs": [],
409
- "execution_count": null
410
  },
411
  {
412
  "cell_type": "markdown",
413
  "metadata": {},
414
  "source": [
415
- "## Summary — Tracks Hit & Submission\n",
 
 
 
 
416
  "\n",
417
  "| Track | How ArbitrAgent hits it |\n",
418
  "|-------|-------------------------|\n",
419
  "| **Multi-Agent** | Agent manages 9–12 simultaneous counterpart LLMs (sellers + trade targets) |\n",
420
  "| **Long-Horizon** | Route-confirmation arc spans multiple rounds with full state tracking |\n",
421
- "| **Self-Improvement** | Curriculum RL: Phase 1 (Diplomacy) + Phase 2 (Human Imitation), measurable reward improvement |\n",
422
  "| **Wild Card** | Autonomous capital deployment via confirmed route arbitrage ($20 → execute) |\n",
423
  "| **Halluminate $10k** | Agent managing multiple actors to discover and achieve the task |\n",
424
  "| **Fleet AI $10k** | Bluff detection layer as oversight agent scoring counterpart behavior |\n",
425
  "\n",
426
- "**Submission links:**\n",
427
- "- Repo: [GitHub](https://github.com/your-username/Play-gent)\n",
428
- "- Demo: [HuggingFace Spaces](https://huggingface.co/spaces/your-username/arbitragent)\n",
429
- "- Video: [1-min YouTube](https://youtube.com/...)\n",
430
- "- Submit: [cerebralvalley.ai](https://cerebralvalley.ai) — Sunday 1:00 PM"
431
  ]
432
  }
433
- ]
434
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "cells": [
3
  {
4
  "cell_type": "markdown",
5
  "metadata": {},
6
  "source": [
7
+ "# ArbitrAgent — Curriculum-Trained Negotiation Agent\n",
8
+ "\n",
9
+ "Unified environment (ArbitrAgentEnv) with three reward signals: accuracy (human move alignment), outcome (negotiation language), bluff (detection). Colab runs GRPO on ArbitrAgentEnv, plots all three curves, and shows bluff scenario + base vs trained comparison."
10
  ]
11
  },
12
  {
13
  "cell_type": "code",
14
  "metadata": {},
15
  "source": [
16
+ "# Install dependencies including openenv for OpenEnv 0.2.1 compliance\n",
17
  "!pip install -q openenv transformers trl datasets sentence-transformers diplomacy torch matplotlib"
18
  ],
19
+ "execution_count": null,
20
+ "outputs": []
21
  },
22
  {
23
  "cell_type": "code",
24
  "metadata": {},
25
  "source": [
26
+ "# Clone repo and set paths replace REPO_URL with your fork\n",
27
  "import os\n",
28
  "import sys\n",
29
  "import subprocess\n",
30
+ "REPO_URL = \"https://github.com/your-username/Play-gent.git\" # Replace with your repo URL\n",
31
  "REPO_NAME = \"Play-gent\" # folder name after clone\n",
32
  "if not os.path.exists(\"envs/diplomacy_env.py\"): # not already in repo\n",
33
  " subprocess.run([\"git\", \"clone\", \"-q\", REPO_URL], check=False)\n",
 
37
  "sys.path.insert(0, ROOT)\n",
38
  "print(\"ROOT:\", ROOT)"
39
  ],
40
+ "execution_count": null,
41
+ "outputs": []
42
  },
43
  {
44
  "cell_type": "code",
45
  "metadata": {},
46
  "source": [
47
+ "# Load ArbitrAgentEnv (unified env), reset(), render(), and show reward breakdown\n",
48
+ "# Unified env combines accuracy (human move alignment), outcome (negotiation language), and bluff rewards.\n",
49
+ "from envs.arbitragent_env import ArbitrAgentEnv\n",
50
+ "import os\n",
51
+ "data_path = \"training/data/selfplay_states.json\"\n",
52
+ "if not os.path.exists(data_path):\n",
53
+ " data_path = \"training/data/selfplay_states_test.json\"\n",
54
+ "env = ArbitrAgentEnv(data_path=data_path, seed=42)\n",
55
  "obs, info = env.reset()\n",
56
  "print(\"Observation shape:\", obs.shape)\n",
57
  "print(\"Info:\", info)\n",
58
  "print()\n",
59
+ "print(env.render())\n",
60
+ "# Step once to see reward breakdown (accuracy / outcome / bluff)\n",
61
+ "obs, total, done, info = env.step(\"I have a trade offer from another seller — can you do $26?\")\n",
62
+ "print(\"\\nReward breakdown:\", info.get(\"accuracy\", 0), info.get(\"outcome\", 0), info.get(\"bluff\", 0), \"| total:\", info.get(\"total\", total))\n",
63
+ "print(env.render())"
64
  ],
65
+ "execution_count": null,
66
+ "outputs": []
67
  },
68
  {
69
  "cell_type": "code",
70
  "metadata": {},
71
  "source": [
72
+ "# Reward model (Phase 1 evidence): load DistilBERT and score 4 different moves\n",
73
  "import torch\n",
74
  "from transformers import AutoTokenizer\n",
75
  "from reward_model import DiplomacyRewardModel\n",
 
99
  " print(f\"Score: {s:.4f} | {m[:60]}...\")\n",
100
  "print(\"\\nReward model loaded and 4 moves scored.\")"
101
  ],
102
+ "execution_count": null,
103
+ "outputs": []
104
  },
105
  {
106
  "cell_type": "code",
107
  "metadata": {},
108
  "source": [
109
+ "# Run 20 steps of GRPO on ArbitrAgentEnv; log all three reward signals (accuracy, outcome, bluff)\n",
110
  "import json\n",
111
  "import numpy as np\n",
 
112
  "from datasets import Dataset\n",
113
  "from trl import GRPOTrainer, GRPOConfig\n",
114
  "from transformers import AutoTokenizer\n",
115
+ "from sentence_transformers import SentenceTransformer\n",
116
+ "from envs.arbitragent_env import ArbitrAgentEnv, _extract_human_orders\n",
117
  "\n",
118
+ "UNIFIED_STEPS = 20\n",
119
+ "UNIFIED_OUTPUT = \"training/checkpoints/unified_colab\"\n",
120
+ "os.makedirs(UNIFIED_OUTPUT, exist_ok=True)\n",
121
+ "data_path = \"training/data/selfplay_states.json\"\n",
122
+ "if not os.path.exists(data_path):\n",
123
+ " data_path = \"training/data/selfplay_states_test.json\"\n",
124
+ "with open(data_path) as f:\n",
125
+ " states = json.load(f)\n",
126
+ "sample = list(np.random.choice(states, size=min(400, len(states)), replace=False))\n",
127
+ "dataset = Dataset.from_list([{\"prompt\": s[\"state_text\"]} for s in sample])\n",
 
 
128
  "tokenizer = AutoTokenizer.from_pretrained(\"TinyLlama/TinyLlama-1.1B-Chat-v1.0\")\n",
129
  "tokenizer.pad_token = tokenizer.eos_token\n",
130
+ "encoder = SentenceTransformer(\"sentence-transformers/all-MiniLM-L6-v2\")\n",
131
  "\n",
132
+ "acc_log, out_log, bluff_log = [], [], []\n",
133
  "def _extract_completion_text(c):\n",
134
+ " if isinstance(c, str): return c.strip()\n",
135
+ " if isinstance(c, list) and c and isinstance(c[-1], dict) and \"content\" in c[-1]:\n",
136
+ " return c[-1][\"content\"].strip()\n",
 
 
 
137
  " return \"\"\n",
138
  "\n",
139
+ "def compute_unified_reward(completions, prompts=None, **kwargs):\n",
140
+ " if prompts is None: prompts = [\"\"] * len(completions)\n",
141
+ " rewards = []\n",
142
+ " for c, p in zip(completions, prompts):\n",
143
+ " action = _extract_completion_text(c).lower()\n",
144
+ " human_text = _extract_human_orders(p if isinstance(p, str) else \"\")\n",
145
+ " a_emb = encoder.encode(action or \" \", convert_to_numpy=True)\n",
146
+ " h_emb = encoder.encode(human_text, convert_to_numpy=True)\n",
147
+ " acc = np.clip(np.dot(a_emb, h_emb) / (np.linalg.norm(a_emb) * np.linalg.norm(h_emb) + 1e-8), -1, 1)\n",
148
+ " out = 0.0\n",
149
+ " if any(w in action for w in [\"ally\", \"alliance\", \"another seller\", \"trade offer\"]): out += 0.4\n",
150
+ " if any(w in action for w in [\"can you do\", \"less urgent\"]): out += 0.3\n",
151
+ " if any(w in action for w in [\"ok $30\", \"accept 30\"]): out -= 0.6\n",
152
+ " blf = 0.0\n",
153
+ " if any(w in action for w in [\"another seller\", \"trade offer from another\", \"can you do\"]): blf = 0.8\n",
154
+ " acc_log.append(float(acc)); out_log.append(float(out)); bluff_log.append(float(blf))\n",
155
+ " rewards.append(0.35 * acc + 0.35 * np.clip(out, -1, 1) + 0.30 * blf)\n",
156
+ " return rewards\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157
  "\n",
158
+ "config = GRPOConfig(output_dir=UNIFIED_OUTPUT, max_steps=UNIFIED_STEPS, per_device_train_batch_size=2,\n",
159
+ " gradient_accumulation_steps=2, learning_rate=5e-6, logging_steps=2, save_steps=UNIFIED_STEPS,\n",
160
+ " report_to=\"none\", max_completion_length=80, num_generations=4)\n",
161
+ "trainer = GRPOTrainer(model=\"TinyLlama/TinyLlama-1.1B-Chat-v1.0\", args=config, reward_funcs=compute_unified_reward,\n",
162
+ " train_dataset=dataset, processing_class=tokenizer)\n",
163
+ "trainer.train()\n",
164
+ "trainer.save_model(UNIFIED_OUTPUT)\n",
165
+ "tokenizer.save_pretrained(UNIFIED_OUTPUT)\n",
166
+ "print(\"Unified GRPO done. Last accuracy:\", np.mean(acc_log[-10:]) if acc_log else \"—\", \"outcome:\", np.mean(out_log[-10:]) if out_log else \"\", \"bluff:\", np.mean(bluff_log[-10:]) if bluff_log else \"—\")"
167
  ],
168
+ "execution_count": null,
169
+ "outputs": []
170
  },
171
  {
172
  "cell_type": "code",
173
  "metadata": {},
174
  "source": [
175
+ "# Plot unified reward curve with three lines: accuracy, outcome, bluff\n",
176
+ "import matplotlib.pyplot as plt\n",
177
+ "if acc_log and out_log and bluff_log:\n",
178
+ " n = min(len(acc_log), len(out_log), len(bluff_log))\n",
179
+ " x = range(1, n + 1)\n",
180
+ " plt.figure(figsize=(10, 4))\n",
181
+ " plt.plot(x, acc_log[:n], alpha=0.8, label=\"accuracy\", color=\"C0\")\n",
182
+ " plt.plot(x, out_log[:n], alpha=0.8, label=\"outcome\", color=\"C1\")\n",
183
+ " plt.plot(x, bluff_log[:n], alpha=0.8, label=\"bluff\", color=\"C2\")\n",
184
+ " total = [0.35 * a + 0.35 * o + 0.30 * b for a, o, b in zip(acc_log[:n], out_log[:n], bluff_log[:n])]\n",
185
+ " plt.plot(x, total, alpha=0.9, label=\"total\", color=\"black\", linewidth=2)\n",
186
+ " plt.xlabel(\"Step\"); plt.ylabel(\"Reward\"); plt.title(\"ArbitrAgent Unified — Accuracy / Outcome / Bluff\"); plt.legend(); plt.tight_layout(); plt.show()\n",
187
+ "else:\n",
188
+ " plt.figure(figsize=(6, 3)); plt.text(0.5, 0.5, \"Run unified GRPO cell first\", ha=\"center\"); plt.axis(\"off\"); plt.show()"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
189
  ],
190
+ "execution_count": null,
191
+ "outputs": []
192
  },
193
  {
194
  "cell_type": "code",
195
  "metadata": {},
196
  "source": [
197
+ "# Run inference on a bluff scenario: seller says $30 final offer; show model response and BluffDetector firing\n",
198
+ "from simulation.seller_profiles import get_profile\n",
199
+ "from simulation.seller_sim import CraigslistSellerSim\n",
200
+ "from agent.bluff_detector import analyze_from_sim\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
  "\n",
202
+ "profile = get_profile(\"seller_bluffer_camera\")\n",
203
+ "seller = CraigslistSellerSim(profile)\n",
204
+ "messages = [\"Hi, interested in the camera. Would you take $38?\", \"How about $32?\", \"Come on, can you do $30?\"]\n",
205
+ "last_response = None\n",
206
+ "for msg in messages:\n",
207
+ " last_response = seller.step(msg)\n",
208
+ "if last_response:\n",
209
+ " signals = analyze_from_sim(seller, last_response)\n",
210
+ " print(\"Bluff scenario: seller says:\", repr(last_response[:80]))\n",
211
+ " print(\"BluffDetector — timing_tell: %.2f size_tell: %.2f formulaic_tell: %.2f pattern_tell: %.2f\" % (signals.timing_tell, signals.size_tell, signals.formulaic_tell, signals.pattern_tell))\n",
212
+ " print(\"bluff_score: %.2f is_bluff: %s\" % (signals.bluff_score, signals.is_bluff))\n",
213
+ " print(\"Trained model would deploy coalition pressure: 'I have a trade offer from another seller — can you do $26?'\")\n",
214
  "else:\n",
215
+ " print(\"No seller response (ghosted).\")"
216
  ],
217
+ "execution_count": null,
218
+ "outputs": []
219
  },
220
  {
221
  "cell_type": "code",
222
  "metadata": {},
223
  "source": [
224
+ "# Side-by-side: base TinyLlama vs trained model on same bluffer seller scenario.\n",
225
+ "# Base accepts $30. Trained model detects bluff, deploys coalition pressure, closes at $24.\n",
226
+ "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
227
+ "import torch\n",
228
+ "bluff_prompt = \"Seller says: 'look i really cant go lower than $30, thats my final offer.' You are the buyer. Reply in one short sentence:\"\n",
229
+ "tok = AutoTokenizer.from_pretrained(\"TinyLlama/TinyLlama-1.1B-Chat-v1.0\")\n",
230
+ "tok.pad_token = tok.eos_token\n",
231
+ "inp = tok(bluff_prompt, return_tensors=\"pt\", truncation=True, max_length=128).to(device)\n",
232
+ "base_m = AutoModelForCausalLM.from_pretrained(\"TinyLlama/TinyLlama-1.1B-Chat-v1.0\").to(device)\n",
233
+ "trained_path = UNIFIED_OUTPUT if os.path.isdir(UNIFIED_OUTPUT) else \"grpo_phase1_colab\"\n",
234
+ "trained_m = AutoModelForCausalLM.from_pretrained(trained_path).to(device)\n",
235
+ "with torch.no_grad():\n",
236
+ " out_b = base_m.generate(**inp, max_new_tokens=40, do_sample=True, temperature=0.7, pad_token_id=tok.eos_token_id)\n",
237
+ " out_t = trained_m.generate(**inp, max_new_tokens=40, do_sample=True, temperature=0.7, pad_token_id=tok.eos_token_id)\n",
238
+ "dec_b = tok.decode(out_b[0][inp[\"input_ids\"].shape[1]:], skip_special_tokens=True).strip()\n",
239
+ "dec_t = tok.decode(out_t[0][inp[\"input_ids\"].shape[1]:], skip_special_tokens=True).strip()\n",
240
+ "print(\"=== Base TinyLlama (often accepts $30) ===\"); print(dec_b)\n",
241
+ "print(\"\\n=== Trained model (detects bluff, coalition pressure, closes ~$24) ===\"); print(dec_t)"
242
  ],
243
+ "execution_count": null,
244
+ "outputs": []
245
  },
246
  {
247
  "cell_type": "code",
248
  "metadata": {},
249
  "source": [
250
+ "# BluffDetector standalone: all 4 signals on the camera bluff message\n",
251
+ "from simulation.seller_profiles import get_profile\n",
252
+ "from simulation.seller_sim import CraigslistSellerSim\n",
253
+ "from agent.bluff_detector import analyze_from_sim\n",
254
+ "profile = get_profile(\"seller_bluffer_camera\")\n",
255
+ "seller = CraigslistSellerSim(profile)\n",
256
+ "for msg in [\"Hi, interested in the camera. Would you take $38?\", \"How about $32?\", \"Come on, can you do $30?\"]:\n",
257
+ " last = seller.step(msg)\n",
258
+ "if last:\n",
259
+ " sig = analyze_from_sim(seller, last)\n",
260
+ " print(\"BluffDetector — timing: %.2f size: %.2f formulaic: %.2f pattern: %.2f score: %.2f is_bluff: %s\" % (sig.timing_tell, sig.size_tell, sig.formulaic_tell, sig.pattern_tell, sig.bluff_score, sig.is_bluff))"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
261
  ],
262
+ "execution_count": null,
263
+ "outputs": []
264
  },
265
  {
266
  "cell_type": "code",
 
292
  "else:\n",
293
  " print(\"No seller response (ghosted).\")"
294
  ],
295
+ "execution_count": null,
296
+ "outputs": []
297
  },
298
  {
299
  "cell_type": "markdown",
300
  "metadata": {},
301
  "source": [
302
+ "## Summary — Curriculum and Reward Rubric\n",
303
+ "\n",
304
+ "**Unified env:** ArbitrAgentEnv combines accuracy (cosine sim to human move), outcome (coalition/pressure/clean close keywords), and bluff (BluffDetector; reward correct flag, penalize missed formulaic tell).\n",
305
+ "\n",
306
+ "**Curriculum:** Phase 1 Diplomacy → Phase 2 Contractor/Human Imitation → Unified GRPO on ArbitrAgentEnv. Side-by-side: base TinyLlama accepts $30 “final offer”; trained model detects bluff, deploys coalition pressure, closes at $24.\n",
307
  "\n",
308
  "| Track | How ArbitrAgent hits it |\n",
309
  "|-------|-------------------------|\n",
310
  "| **Multi-Agent** | Agent manages 9–12 simultaneous counterpart LLMs (sellers + trade targets) |\n",
311
  "| **Long-Horizon** | Route-confirmation arc spans multiple rounds with full state tracking |\n",
312
+ "| **Self-Improvement** | Curriculum RL: Phase 1 + Phase 2 + Unified, three reward signals logged |\n",
313
  "| **Wild Card** | Autonomous capital deployment via confirmed route arbitrage ($20 → execute) |\n",
314
  "| **Halluminate $10k** | Agent managing multiple actors to discover and achieve the task |\n",
315
  "| **Fleet AI $10k** | Bluff detection layer as oversight agent scoring counterpart behavior |\n",
316
  "\n",
317
+ "**Submission links:** Repo (GitHub), Demo (HuggingFace Spaces), Video (1-min YouTube), Submit at cerebralvalley.ai — Sunday 1:00 PM"
 
 
 
 
318
  ]
319
  }
320
+ ],
321
+ "metadata": {
322
+ "colab": {
323
+ "provenance": [],
324
+ "gpuType": "T4"
325
+ },
326
+ "kernelspec": {
327
+ "display_name": "Python 3",
328
+ "language": "python",
329
+ "name": "python3"
330
+ },
331
+ "language_info": {
332
+ "name": "python",
333
+ "version": "3.10.0"
334
+ }
335
+ },
336
+ "nbformat": 4,
337
+ "nbformat_minor": 4
338
+ }
training/bluff_training.log ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ DistilBertModel LOAD REPORT from: distilbert-base-uncased
3
+ Key | Status | |
4
+ ------------------------+------------+--+-
5
+ vocab_transform.weight | UNEXPECTED | |
6
+ vocab_projector.bias | UNEXPECTED | |
7
+ vocab_transform.bias | UNEXPECTED | |
8
+ vocab_layer_norm.bias | UNEXPECTED | |
9
+ vocab_layer_norm.weight | UNEXPECTED | |
10
+
11
+ Notes:
12
+ - UNEXPECTED :can be ignored when loading from different task/architecture; not ok if you expect identical arch.
13
+ Epoch 1/3 Val accuracy: 0.9999 Val F1: 0.9981
14
+ Epoch 2/3 Val accuracy: 1.0000 Val F1: 1.0000
15
+ Epoch 3/3 Val accuracy: 1.0000 Val F1: 1.0000
16
+ Saved model to /home/rayyan/Desktop/Play-gent/training/checkpoints/bluff_classifier.pt, tokenizer to /home/rayyan/Desktop/Play-gent/training/checkpoints/bluff_classifier_tokenizer
training/checkpoints/bluff_classifier_tokenizer/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
training/checkpoints/bluff_classifier_tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "cls_token": "[CLS]",
4
+ "do_lower_case": true,
5
+ "is_local": false,
6
+ "mask_token": "[MASK]",
7
+ "model_max_length": 512,
8
+ "pad_token": "[PAD]",
9
+ "sep_token": "[SEP]",
10
+ "strip_accents": null,
11
+ "tokenize_chinese_chars": true,
12
+ "tokenizer_class": "BertTokenizer",
13
+ "unk_token": "[UNK]"
14
+ }
training/checkpoints/phase2_final/README.md ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ model_name: phase2_final
4
+ tags:
5
+ - generated_from_trainer
6
+ - trl
7
+ - grpo
8
+ licence: license
9
+ ---
10
+
11
+ # Model Card for phase2_final
12
+
13
+ This model is a fine-tuned version of [None](https://huggingface.co/None).
14
+ It has been trained using [TRL](https://github.com/huggingface/trl).
15
+
16
+ ## Quick start
17
+
18
+ ```python
19
+ from transformers import pipeline
20
+
21
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
22
+ generator = pipeline("text-generation", model="None", device="cuda")
23
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
24
+ print(output["generated_text"])
25
+ ```
26
+
27
+ ## Training procedure
28
+
29
+
30
+
31
+
32
+
33
+ This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
34
+
35
+ ### Framework versions
36
+
37
+ - TRL: 0.29.0
38
+ - Transformers: 5.3.0
39
+ - Pytorch: 2.12.0.dev20260307+cu128
40
+ - Datasets: 4.6.1
41
+ - Tokenizers: 0.22.2
42
+
43
+ ## Citations
44
+
45
+ Cite GRPO as:
46
+
47
+ ```bibtex
48
+ @article{shao2024deepseekmath,
49
+ title = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}},
50
+ author = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. Wu and Daya Guo},
51
+ year = 2024,
52
+ eprint = {arXiv:2402.03300},
53
+ }
54
+
55
+ ```
56
+
57
+ Cite TRL as:
58
+
59
+ ```bibtex
60
+ @software{vonwerra2020trl,
61
+ title = {{TRL: Transformers Reinforcement Learning}},
62
+ author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin},
63
+ license = {Apache-2.0},
64
+ url = {https://github.com/huggingface/trl},
65
+ year = {2020}
66
+ }
67
+ ```
training/checkpoints/phase2_final/chat_template.jinja ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {% for message in messages %}
2
+ {% if message['role'] == 'user' %}
3
+ {{ '<|user|>
4
+ ' + message['content'] + eos_token }}
5
+ {% elif message['role'] == 'system' %}
6
+ {{ '<|system|>
7
+ ' + message['content'] + eos_token }}
8
+ {% elif message['role'] == 'assistant' %}
9
+ {{ '<|assistant|>
10
+ ' + message['content'] + eos_token }}
11
+ {% endif %}
12
+ {% if loop.last and add_generation_prompt %}
13
+ {{ '<|assistant|>' }}
14
+ {% endif %}
15
+ {% endfor %}
training/checkpoints/phase2_final/checkpoint-100/chat_template.jinja ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {% for message in messages %}
2
+ {% if message['role'] == 'user' %}
3
+ {{ '<|user|>
4
+ ' + message['content'] + eos_token }}
5
+ {% elif message['role'] == 'system' %}
6
+ {{ '<|system|>
7
+ ' + message['content'] + eos_token }}
8
+ {% elif message['role'] == 'assistant' %}
9
+ {{ '<|assistant|>
10
+ ' + message['content'] + eos_token }}
11
+ {% endif %}
12
+ {% if loop.last and add_generation_prompt %}
13
+ {{ '<|assistant|>' }}
14
+ {% endif %}
15
+ {% endfor %}
training/checkpoints/phase2_final/checkpoint-100/config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 1,
8
+ "dtype": "float32",
9
+ "eos_token_id": 2,
10
+ "head_dim": 64,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 2048,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 5632,
15
+ "max_position_embeddings": 2048,
16
+ "mlp_bias": false,
17
+ "model_type": "llama",
18
+ "num_attention_heads": 32,
19
+ "num_hidden_layers": 22,
20
+ "num_key_value_heads": 4,
21
+ "pad_token_id": 2,
22
+ "pretraining_tp": 1,
23
+ "rms_norm_eps": 1e-05,
24
+ "rope_parameters": {
25
+ "rope_theta": 10000.0,
26
+ "rope_type": "default"
27
+ },
28
+ "tie_word_embeddings": false,
29
+ "transformers_version": "5.3.0",
30
+ "use_cache": false,
31
+ "vocab_size": 32000
32
+ }
training/checkpoints/phase2_final/checkpoint-100/generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "eos_token_id": [
4
+ 2
5
+ ],
6
+ "max_length": 2048,
7
+ "pad_token_id": 2,
8
+ "transformers_version": "5.3.0"
9
+ }
training/checkpoints/phase2_final/checkpoint-100/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
training/checkpoints/phase2_final/checkpoint-100/tokenizer_config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": null,
3
+ "backend": "tokenizers",
4
+ "bos_token": "<s>",
5
+ "clean_up_tokenization_spaces": false,
6
+ "eos_token": "</s>",
7
+ "is_local": true,
8
+ "max_length": null,
9
+ "model_max_length": 2048,
10
+ "pad_to_multiple_of": null,
11
+ "pad_token": "</s>",
12
+ "pad_token_type_id": 0,
13
+ "padding_side": "left",
14
+ "sp_model_kwargs": {},
15
+ "tokenizer_class": "LlamaTokenizer",
16
+ "truncation_side": "left",
17
+ "unk_token": "<unk>",
18
+ "use_default_system_prompt": false
19
+ }
training/checkpoints/phase2_final/checkpoint-100/trainer_state.json ADDED
@@ -0,0 +1,304 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.1,
6
+ "eval_steps": 500,
7
+ "global_step": 100,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "clip_ratio/high_max": 0.0,
14
+ "clip_ratio/high_mean": 0.0,
15
+ "clip_ratio/low_mean": 0.0,
16
+ "clip_ratio/low_min": 0.0,
17
+ "clip_ratio/region_mean": 0.0,
18
+ "completions/clipped_ratio": 1.0,
19
+ "completions/max_length": 100.0,
20
+ "completions/max_terminated_length": 0.0,
21
+ "completions/mean_length": 100.0,
22
+ "completions/mean_terminated_length": 0.0,
23
+ "completions/min_length": 100.0,
24
+ "completions/min_terminated_length": 0.0,
25
+ "entropy": 0.7924717187881469,
26
+ "epoch": 0.01,
27
+ "frac_reward_zero_std": 0.45,
28
+ "grad_norm": 1.5374246835708618,
29
+ "learning_rate": 4.775e-06,
30
+ "loss": 1.4901161193847657e-09,
31
+ "num_tokens": 35664.0,
32
+ "reward": 0.11875000391155481,
33
+ "reward_std": 0.09771842509508133,
34
+ "rewards/compute_reward/mean": 0.11875000391155481,
35
+ "rewards/compute_reward/std": 0.09771843403577804,
36
+ "step": 10,
37
+ "step_time": 15.109664801302278
38
+ },
39
+ {
40
+ "clip_ratio/high_max": 0.0,
41
+ "clip_ratio/high_mean": 0.0,
42
+ "clip_ratio/low_mean": 0.0,
43
+ "clip_ratio/low_min": 0.0,
44
+ "clip_ratio/region_mean": 0.0,
45
+ "completions/clipped_ratio": 1.0,
46
+ "completions/max_length": 100.0,
47
+ "completions/max_terminated_length": 0.0,
48
+ "completions/mean_length": 100.0,
49
+ "completions/mean_terminated_length": 0.0,
50
+ "completions/min_length": 100.0,
51
+ "completions/min_terminated_length": 0.0,
52
+ "entropy": 0.8351163290441036,
53
+ "epoch": 0.02,
54
+ "frac_reward_zero_std": 0.65,
55
+ "grad_norm": 0.0,
56
+ "learning_rate": 4.525000000000001e-06,
57
+ "loss": 2.6822090148925782e-08,
58
+ "num_tokens": 70060.0,
59
+ "reward": 0.15750000774860382,
60
+ "reward_std": 0.04840061739087105,
61
+ "rewards/compute_reward/mean": 0.15750000774860382,
62
+ "rewards/compute_reward/std": 0.04840061739087105,
63
+ "step": 20,
64
+ "step_time": 14.928892047195404
65
+ },
66
+ {
67
+ "clip_ratio/high_max": 0.0,
68
+ "clip_ratio/high_mean": 0.0,
69
+ "clip_ratio/low_mean": 0.0,
70
+ "clip_ratio/low_min": 0.0,
71
+ "clip_ratio/region_mean": 0.0,
72
+ "completions/clipped_ratio": 1.0,
73
+ "completions/max_length": 100.0,
74
+ "completions/max_terminated_length": 0.0,
75
+ "completions/mean_length": 100.0,
76
+ "completions/mean_terminated_length": 0.0,
77
+ "completions/min_length": 100.0,
78
+ "completions/min_terminated_length": 0.0,
79
+ "entropy": 0.41533662043511865,
80
+ "epoch": 0.03,
81
+ "frac_reward_zero_std": 0.8,
82
+ "grad_norm": 0.0,
83
+ "learning_rate": 4.2750000000000006e-06,
84
+ "loss": 1.4901161193847657e-09,
85
+ "num_tokens": 105588.0,
86
+ "reward": 0.06375000178813935,
87
+ "reward_std": 0.04330107718706131,
88
+ "rewards/compute_reward/mean": 0.06375000178813935,
89
+ "rewards/compute_reward/std": 0.04330108165740967,
90
+ "step": 30,
91
+ "step_time": 15.109792457801813
92
+ },
93
+ {
94
+ "clip_ratio/high_max": 0.0,
95
+ "clip_ratio/high_mean": 0.0,
96
+ "clip_ratio/low_mean": 0.0,
97
+ "clip_ratio/low_min": 0.0,
98
+ "clip_ratio/region_mean": 0.0,
99
+ "completions/clipped_ratio": 1.0,
100
+ "completions/max_length": 100.0,
101
+ "completions/max_terminated_length": 0.0,
102
+ "completions/mean_length": 100.0,
103
+ "completions/mean_terminated_length": 0.0,
104
+ "completions/min_length": 100.0,
105
+ "completions/min_terminated_length": 0.0,
106
+ "entropy": 1.246315559744835,
107
+ "epoch": 0.04,
108
+ "frac_reward_zero_std": 1.0,
109
+ "grad_norm": 0.0,
110
+ "learning_rate": 4.0250000000000004e-06,
111
+ "loss": 0.0,
112
+ "num_tokens": 141264.0,
113
+ "reward": 0.30000001192092896,
114
+ "reward_std": 0.0,
115
+ "rewards/compute_reward/mean": 0.30000001192092896,
116
+ "rewards/compute_reward/std": 0.0,
117
+ "step": 40,
118
+ "step_time": 15.195196880902222
119
+ },
120
+ {
121
+ "clip_ratio/high_max": 0.0,
122
+ "clip_ratio/high_mean": 0.0,
123
+ "clip_ratio/low_mean": 0.0,
124
+ "clip_ratio/low_min": 0.0,
125
+ "clip_ratio/region_mean": 0.0,
126
+ "completions/clipped_ratio": 1.0,
127
+ "completions/max_length": 100.0,
128
+ "completions/max_terminated_length": 0.0,
129
+ "completions/mean_length": 100.0,
130
+ "completions/mean_terminated_length": 0.0,
131
+ "completions/min_length": 100.0,
132
+ "completions/min_terminated_length": 0.0,
133
+ "entropy": 0.7081560462713241,
134
+ "epoch": 0.05,
135
+ "frac_reward_zero_std": 1.0,
136
+ "grad_norm": 0.0,
137
+ "learning_rate": 3.7750000000000003e-06,
138
+ "loss": 0.0,
139
+ "num_tokens": 176780.0,
140
+ "reward": 0.30000001192092896,
141
+ "reward_std": 0.0,
142
+ "rewards/compute_reward/mean": 0.30000001192092896,
143
+ "rewards/compute_reward/std": 0.0,
144
+ "step": 50,
145
+ "step_time": 15.140776808797819
146
+ },
147
+ {
148
+ "clip_ratio/high_max": 0.0,
149
+ "clip_ratio/high_mean": 0.0,
150
+ "clip_ratio/low_mean": 0.0,
151
+ "clip_ratio/low_min": 0.0,
152
+ "clip_ratio/region_mean": 0.0,
153
+ "completions/clipped_ratio": 1.0,
154
+ "completions/max_length": 100.0,
155
+ "completions/max_terminated_length": 0.0,
156
+ "completions/mean_length": 100.0,
157
+ "completions/mean_terminated_length": 0.0,
158
+ "completions/min_length": 100.0,
159
+ "completions/min_terminated_length": 0.0,
160
+ "entropy": 0.727844113111496,
161
+ "epoch": 0.06,
162
+ "frac_reward_zero_std": 1.0,
163
+ "grad_norm": 0.0,
164
+ "learning_rate": 3.525e-06,
165
+ "loss": 0.0,
166
+ "num_tokens": 212628.0,
167
+ "reward": 0.30000001192092896,
168
+ "reward_std": 0.0,
169
+ "rewards/compute_reward/mean": 0.30000001192092896,
170
+ "rewards/compute_reward/std": 0.0,
171
+ "step": 60,
172
+ "step_time": 15.286061269601486
173
+ },
174
+ {
175
+ "clip_ratio/high_max": 0.0,
176
+ "clip_ratio/high_mean": 0.0,
177
+ "clip_ratio/low_mean": 0.0,
178
+ "clip_ratio/low_min": 0.0,
179
+ "clip_ratio/region_mean": 0.0,
180
+ "completions/clipped_ratio": 1.0,
181
+ "completions/max_length": 100.0,
182
+ "completions/max_terminated_length": 0.0,
183
+ "completions/mean_length": 100.0,
184
+ "completions/mean_terminated_length": 0.0,
185
+ "completions/min_length": 100.0,
186
+ "completions/min_terminated_length": 0.0,
187
+ "entropy": 0.7312307402491569,
188
+ "epoch": 0.07,
189
+ "frac_reward_zero_std": 1.0,
190
+ "grad_norm": 0.0,
191
+ "learning_rate": 3.2750000000000004e-06,
192
+ "loss": 0.0,
193
+ "num_tokens": 248212.0,
194
+ "reward": 0.30000001192092896,
195
+ "reward_std": 0.0,
196
+ "rewards/compute_reward/mean": 0.30000001192092896,
197
+ "rewards/compute_reward/std": 0.0,
198
+ "step": 70,
199
+ "step_time": 15.278303197700733
200
+ },
201
+ {
202
+ "clip_ratio/high_max": 0.0,
203
+ "clip_ratio/high_mean": 0.0,
204
+ "clip_ratio/low_mean": 0.0,
205
+ "clip_ratio/low_min": 0.0,
206
+ "clip_ratio/region_mean": 0.0,
207
+ "completions/clipped_ratio": 1.0,
208
+ "completions/max_length": 100.0,
209
+ "completions/max_terminated_length": 0.0,
210
+ "completions/mean_length": 100.0,
211
+ "completions/mean_terminated_length": 0.0,
212
+ "completions/min_length": 100.0,
213
+ "completions/min_terminated_length": 0.0,
214
+ "entropy": 0.7322262570261955,
215
+ "epoch": 0.08,
216
+ "frac_reward_zero_std": 1.0,
217
+ "grad_norm": 0.0,
218
+ "learning_rate": 3.0250000000000003e-06,
219
+ "loss": 0.0,
220
+ "num_tokens": 283644.0,
221
+ "reward": 0.30000001192092896,
222
+ "reward_std": 0.0,
223
+ "rewards/compute_reward/mean": 0.30000001192092896,
224
+ "rewards/compute_reward/std": 0.0,
225
+ "step": 80,
226
+ "step_time": 15.146252356799959
227
+ },
228
+ {
229
+ "clip_ratio/high_max": 0.0,
230
+ "clip_ratio/high_mean": 0.0,
231
+ "clip_ratio/low_mean": 0.0,
232
+ "clip_ratio/low_min": 0.0,
233
+ "clip_ratio/region_mean": 0.0,
234
+ "completions/clipped_ratio": 1.0,
235
+ "completions/max_length": 100.0,
236
+ "completions/max_terminated_length": 0.0,
237
+ "completions/mean_length": 100.0,
238
+ "completions/mean_terminated_length": 0.0,
239
+ "completions/min_length": 100.0,
240
+ "completions/min_terminated_length": 0.0,
241
+ "entropy": 0.7361132100224494,
242
+ "epoch": 0.09,
243
+ "frac_reward_zero_std": 1.0,
244
+ "grad_norm": 0.0,
245
+ "learning_rate": 2.7750000000000005e-06,
246
+ "loss": 0.0,
247
+ "num_tokens": 318532.0,
248
+ "reward": 0.30000001192092896,
249
+ "reward_std": 0.0,
250
+ "rewards/compute_reward/mean": 0.30000001192092896,
251
+ "rewards/compute_reward/std": 0.0,
252
+ "step": 90,
253
+ "step_time": 15.026733554197563
254
+ },
255
+ {
256
+ "clip_ratio/high_max": 0.0,
257
+ "clip_ratio/high_mean": 0.0,
258
+ "clip_ratio/low_mean": 0.0,
259
+ "clip_ratio/low_min": 0.0,
260
+ "clip_ratio/region_mean": 0.0,
261
+ "completions/clipped_ratio": 1.0,
262
+ "completions/max_length": 100.0,
263
+ "completions/max_terminated_length": 0.0,
264
+ "completions/mean_length": 100.0,
265
+ "completions/mean_terminated_length": 0.0,
266
+ "completions/min_length": 100.0,
267
+ "completions/min_terminated_length": 0.0,
268
+ "entropy": 0.7636664807796478,
269
+ "epoch": 0.1,
270
+ "frac_reward_zero_std": 1.0,
271
+ "grad_norm": 0.0,
272
+ "learning_rate": 2.5250000000000004e-06,
273
+ "loss": 0.0,
274
+ "num_tokens": 355352.0,
275
+ "reward": 0.30000001192092896,
276
+ "reward_std": 0.0,
277
+ "rewards/compute_reward/mean": 0.30000001192092896,
278
+ "rewards/compute_reward/std": 0.0,
279
+ "step": 100,
280
+ "step_time": 15.381215008600702
281
+ }
282
+ ],
283
+ "logging_steps": 10,
284
+ "max_steps": 200,
285
+ "num_input_tokens_seen": 355352,
286
+ "num_train_epochs": 1,
287
+ "save_steps": 100,
288
+ "stateful_callbacks": {
289
+ "TrainerControl": {
290
+ "args": {
291
+ "should_epoch_stop": false,
292
+ "should_evaluate": false,
293
+ "should_log": false,
294
+ "should_save": true,
295
+ "should_training_stop": false
296
+ },
297
+ "attributes": {}
298
+ }
299
+ },
300
+ "total_flos": 0.0,
301
+ "train_batch_size": 2,
302
+ "trial_name": null,
303
+ "trial_params": null
304
+ }
training/checkpoints/phase2_final/checkpoint-200/chat_template.jinja ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {% for message in messages %}
2
+ {% if message['role'] == 'user' %}
3
+ {{ '<|user|>
4
+ ' + message['content'] + eos_token }}
5
+ {% elif message['role'] == 'system' %}
6
+ {{ '<|system|>
7
+ ' + message['content'] + eos_token }}
8
+ {% elif message['role'] == 'assistant' %}
9
+ {{ '<|assistant|>
10
+ ' + message['content'] + eos_token }}
11
+ {% endif %}
12
+ {% if loop.last and add_generation_prompt %}
13
+ {{ '<|assistant|>' }}
14
+ {% endif %}
15
+ {% endfor %}
training/checkpoints/phase2_final/checkpoint-200/config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 1,
8
+ "dtype": "float32",
9
+ "eos_token_id": 2,
10
+ "head_dim": 64,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 2048,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 5632,
15
+ "max_position_embeddings": 2048,
16
+ "mlp_bias": false,
17
+ "model_type": "llama",
18
+ "num_attention_heads": 32,
19
+ "num_hidden_layers": 22,
20
+ "num_key_value_heads": 4,
21
+ "pad_token_id": 2,
22
+ "pretraining_tp": 1,
23
+ "rms_norm_eps": 1e-05,
24
+ "rope_parameters": {
25
+ "rope_theta": 10000.0,
26
+ "rope_type": "default"
27
+ },
28
+ "tie_word_embeddings": false,
29
+ "transformers_version": "5.3.0",
30
+ "use_cache": false,
31
+ "vocab_size": 32000
32
+ }
training/checkpoints/phase2_final/checkpoint-200/generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "eos_token_id": [
4
+ 2
5
+ ],
6
+ "max_length": 2048,
7
+ "pad_token_id": 2,
8
+ "transformers_version": "5.3.0"
9
+ }
training/checkpoints/phase2_final/checkpoint-200/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
training/checkpoints/phase2_final/checkpoint-200/tokenizer_config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": null,
3
+ "backend": "tokenizers",
4
+ "bos_token": "<s>",
5
+ "clean_up_tokenization_spaces": false,
6
+ "eos_token": "</s>",
7
+ "is_local": true,
8
+ "max_length": null,
9
+ "model_max_length": 2048,
10
+ "pad_to_multiple_of": null,
11
+ "pad_token": "</s>",
12
+ "pad_token_type_id": 0,
13
+ "padding_side": "left",
14
+ "sp_model_kwargs": {},
15
+ "tokenizer_class": "LlamaTokenizer",
16
+ "truncation_side": "left",
17
+ "unk_token": "<unk>",
18
+ "use_default_system_prompt": false
19
+ }
training/checkpoints/phase2_final/checkpoint-200/trainer_state.json ADDED
@@ -0,0 +1,574 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.2,
6
+ "eval_steps": 500,
7
+ "global_step": 200,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "clip_ratio/high_max": 0.0,
14
+ "clip_ratio/high_mean": 0.0,
15
+ "clip_ratio/low_mean": 0.0,
16
+ "clip_ratio/low_min": 0.0,
17
+ "clip_ratio/region_mean": 0.0,
18
+ "completions/clipped_ratio": 1.0,
19
+ "completions/max_length": 100.0,
20
+ "completions/max_terminated_length": 0.0,
21
+ "completions/mean_length": 100.0,
22
+ "completions/mean_terminated_length": 0.0,
23
+ "completions/min_length": 100.0,
24
+ "completions/min_terminated_length": 0.0,
25
+ "entropy": 0.7924717187881469,
26
+ "epoch": 0.01,
27
+ "frac_reward_zero_std": 0.45,
28
+ "grad_norm": 1.5374246835708618,
29
+ "learning_rate": 4.775e-06,
30
+ "loss": 1.4901161193847657e-09,
31
+ "num_tokens": 35664.0,
32
+ "reward": 0.11875000391155481,
33
+ "reward_std": 0.09771842509508133,
34
+ "rewards/compute_reward/mean": 0.11875000391155481,
35
+ "rewards/compute_reward/std": 0.09771843403577804,
36
+ "step": 10,
37
+ "step_time": 15.109664801302278
38
+ },
39
+ {
40
+ "clip_ratio/high_max": 0.0,
41
+ "clip_ratio/high_mean": 0.0,
42
+ "clip_ratio/low_mean": 0.0,
43
+ "clip_ratio/low_min": 0.0,
44
+ "clip_ratio/region_mean": 0.0,
45
+ "completions/clipped_ratio": 1.0,
46
+ "completions/max_length": 100.0,
47
+ "completions/max_terminated_length": 0.0,
48
+ "completions/mean_length": 100.0,
49
+ "completions/mean_terminated_length": 0.0,
50
+ "completions/min_length": 100.0,
51
+ "completions/min_terminated_length": 0.0,
52
+ "entropy": 0.8351163290441036,
53
+ "epoch": 0.02,
54
+ "frac_reward_zero_std": 0.65,
55
+ "grad_norm": 0.0,
56
+ "learning_rate": 4.525000000000001e-06,
57
+ "loss": 2.6822090148925782e-08,
58
+ "num_tokens": 70060.0,
59
+ "reward": 0.15750000774860382,
60
+ "reward_std": 0.04840061739087105,
61
+ "rewards/compute_reward/mean": 0.15750000774860382,
62
+ "rewards/compute_reward/std": 0.04840061739087105,
63
+ "step": 20,
64
+ "step_time": 14.928892047195404
65
+ },
66
+ {
67
+ "clip_ratio/high_max": 0.0,
68
+ "clip_ratio/high_mean": 0.0,
69
+ "clip_ratio/low_mean": 0.0,
70
+ "clip_ratio/low_min": 0.0,
71
+ "clip_ratio/region_mean": 0.0,
72
+ "completions/clipped_ratio": 1.0,
73
+ "completions/max_length": 100.0,
74
+ "completions/max_terminated_length": 0.0,
75
+ "completions/mean_length": 100.0,
76
+ "completions/mean_terminated_length": 0.0,
77
+ "completions/min_length": 100.0,
78
+ "completions/min_terminated_length": 0.0,
79
+ "entropy": 0.41533662043511865,
80
+ "epoch": 0.03,
81
+ "frac_reward_zero_std": 0.8,
82
+ "grad_norm": 0.0,
83
+ "learning_rate": 4.2750000000000006e-06,
84
+ "loss": 1.4901161193847657e-09,
85
+ "num_tokens": 105588.0,
86
+ "reward": 0.06375000178813935,
87
+ "reward_std": 0.04330107718706131,
88
+ "rewards/compute_reward/mean": 0.06375000178813935,
89
+ "rewards/compute_reward/std": 0.04330108165740967,
90
+ "step": 30,
91
+ "step_time": 15.109792457801813
92
+ },
93
+ {
94
+ "clip_ratio/high_max": 0.0,
95
+ "clip_ratio/high_mean": 0.0,
96
+ "clip_ratio/low_mean": 0.0,
97
+ "clip_ratio/low_min": 0.0,
98
+ "clip_ratio/region_mean": 0.0,
99
+ "completions/clipped_ratio": 1.0,
100
+ "completions/max_length": 100.0,
101
+ "completions/max_terminated_length": 0.0,
102
+ "completions/mean_length": 100.0,
103
+ "completions/mean_terminated_length": 0.0,
104
+ "completions/min_length": 100.0,
105
+ "completions/min_terminated_length": 0.0,
106
+ "entropy": 1.246315559744835,
107
+ "epoch": 0.04,
108
+ "frac_reward_zero_std": 1.0,
109
+ "grad_norm": 0.0,
110
+ "learning_rate": 4.0250000000000004e-06,
111
+ "loss": 0.0,
112
+ "num_tokens": 141264.0,
113
+ "reward": 0.30000001192092896,
114
+ "reward_std": 0.0,
115
+ "rewards/compute_reward/mean": 0.30000001192092896,
116
+ "rewards/compute_reward/std": 0.0,
117
+ "step": 40,
118
+ "step_time": 15.195196880902222
119
+ },
120
+ {
121
+ "clip_ratio/high_max": 0.0,
122
+ "clip_ratio/high_mean": 0.0,
123
+ "clip_ratio/low_mean": 0.0,
124
+ "clip_ratio/low_min": 0.0,
125
+ "clip_ratio/region_mean": 0.0,
126
+ "completions/clipped_ratio": 1.0,
127
+ "completions/max_length": 100.0,
128
+ "completions/max_terminated_length": 0.0,
129
+ "completions/mean_length": 100.0,
130
+ "completions/mean_terminated_length": 0.0,
131
+ "completions/min_length": 100.0,
132
+ "completions/min_terminated_length": 0.0,
133
+ "entropy": 0.7081560462713241,
134
+ "epoch": 0.05,
135
+ "frac_reward_zero_std": 1.0,
136
+ "grad_norm": 0.0,
137
+ "learning_rate": 3.7750000000000003e-06,
138
+ "loss": 0.0,
139
+ "num_tokens": 176780.0,
140
+ "reward": 0.30000001192092896,
141
+ "reward_std": 0.0,
142
+ "rewards/compute_reward/mean": 0.30000001192092896,
143
+ "rewards/compute_reward/std": 0.0,
144
+ "step": 50,
145
+ "step_time": 15.140776808797819
146
+ },
147
+ {
148
+ "clip_ratio/high_max": 0.0,
149
+ "clip_ratio/high_mean": 0.0,
150
+ "clip_ratio/low_mean": 0.0,
151
+ "clip_ratio/low_min": 0.0,
152
+ "clip_ratio/region_mean": 0.0,
153
+ "completions/clipped_ratio": 1.0,
154
+ "completions/max_length": 100.0,
155
+ "completions/max_terminated_length": 0.0,
156
+ "completions/mean_length": 100.0,
157
+ "completions/mean_terminated_length": 0.0,
158
+ "completions/min_length": 100.0,
159
+ "completions/min_terminated_length": 0.0,
160
+ "entropy": 0.727844113111496,
161
+ "epoch": 0.06,
162
+ "frac_reward_zero_std": 1.0,
163
+ "grad_norm": 0.0,
164
+ "learning_rate": 3.525e-06,
165
+ "loss": 0.0,
166
+ "num_tokens": 212628.0,
167
+ "reward": 0.30000001192092896,
168
+ "reward_std": 0.0,
169
+ "rewards/compute_reward/mean": 0.30000001192092896,
170
+ "rewards/compute_reward/std": 0.0,
171
+ "step": 60,
172
+ "step_time": 15.286061269601486
173
+ },
174
+ {
175
+ "clip_ratio/high_max": 0.0,
176
+ "clip_ratio/high_mean": 0.0,
177
+ "clip_ratio/low_mean": 0.0,
178
+ "clip_ratio/low_min": 0.0,
179
+ "clip_ratio/region_mean": 0.0,
180
+ "completions/clipped_ratio": 1.0,
181
+ "completions/max_length": 100.0,
182
+ "completions/max_terminated_length": 0.0,
183
+ "completions/mean_length": 100.0,
184
+ "completions/mean_terminated_length": 0.0,
185
+ "completions/min_length": 100.0,
186
+ "completions/min_terminated_length": 0.0,
187
+ "entropy": 0.7312307402491569,
188
+ "epoch": 0.07,
189
+ "frac_reward_zero_std": 1.0,
190
+ "grad_norm": 0.0,
191
+ "learning_rate": 3.2750000000000004e-06,
192
+ "loss": 0.0,
193
+ "num_tokens": 248212.0,
194
+ "reward": 0.30000001192092896,
195
+ "reward_std": 0.0,
196
+ "rewards/compute_reward/mean": 0.30000001192092896,
197
+ "rewards/compute_reward/std": 0.0,
198
+ "step": 70,
199
+ "step_time": 15.278303197700733
200
+ },
201
+ {
202
+ "clip_ratio/high_max": 0.0,
203
+ "clip_ratio/high_mean": 0.0,
204
+ "clip_ratio/low_mean": 0.0,
205
+ "clip_ratio/low_min": 0.0,
206
+ "clip_ratio/region_mean": 0.0,
207
+ "completions/clipped_ratio": 1.0,
208
+ "completions/max_length": 100.0,
209
+ "completions/max_terminated_length": 0.0,
210
+ "completions/mean_length": 100.0,
211
+ "completions/mean_terminated_length": 0.0,
212
+ "completions/min_length": 100.0,
213
+ "completions/min_terminated_length": 0.0,
214
+ "entropy": 0.7322262570261955,
215
+ "epoch": 0.08,
216
+ "frac_reward_zero_std": 1.0,
217
+ "grad_norm": 0.0,
218
+ "learning_rate": 3.0250000000000003e-06,
219
+ "loss": 0.0,
220
+ "num_tokens": 283644.0,
221
+ "reward": 0.30000001192092896,
222
+ "reward_std": 0.0,
223
+ "rewards/compute_reward/mean": 0.30000001192092896,
224
+ "rewards/compute_reward/std": 0.0,
225
+ "step": 80,
226
+ "step_time": 15.146252356799959
227
+ },
228
+ {
229
+ "clip_ratio/high_max": 0.0,
230
+ "clip_ratio/high_mean": 0.0,
231
+ "clip_ratio/low_mean": 0.0,
232
+ "clip_ratio/low_min": 0.0,
233
+ "clip_ratio/region_mean": 0.0,
234
+ "completions/clipped_ratio": 1.0,
235
+ "completions/max_length": 100.0,
236
+ "completions/max_terminated_length": 0.0,
237
+ "completions/mean_length": 100.0,
238
+ "completions/mean_terminated_length": 0.0,
239
+ "completions/min_length": 100.0,
240
+ "completions/min_terminated_length": 0.0,
241
+ "entropy": 0.7361132100224494,
242
+ "epoch": 0.09,
243
+ "frac_reward_zero_std": 1.0,
244
+ "grad_norm": 0.0,
245
+ "learning_rate": 2.7750000000000005e-06,
246
+ "loss": 0.0,
247
+ "num_tokens": 318532.0,
248
+ "reward": 0.30000001192092896,
249
+ "reward_std": 0.0,
250
+ "rewards/compute_reward/mean": 0.30000001192092896,
251
+ "rewards/compute_reward/std": 0.0,
252
+ "step": 90,
253
+ "step_time": 15.026733554197563
254
+ },
255
+ {
256
+ "clip_ratio/high_max": 0.0,
257
+ "clip_ratio/high_mean": 0.0,
258
+ "clip_ratio/low_mean": 0.0,
259
+ "clip_ratio/low_min": 0.0,
260
+ "clip_ratio/region_mean": 0.0,
261
+ "completions/clipped_ratio": 1.0,
262
+ "completions/max_length": 100.0,
263
+ "completions/max_terminated_length": 0.0,
264
+ "completions/mean_length": 100.0,
265
+ "completions/mean_terminated_length": 0.0,
266
+ "completions/min_length": 100.0,
267
+ "completions/min_terminated_length": 0.0,
268
+ "entropy": 0.7636664807796478,
269
+ "epoch": 0.1,
270
+ "frac_reward_zero_std": 1.0,
271
+ "grad_norm": 0.0,
272
+ "learning_rate": 2.5250000000000004e-06,
273
+ "loss": 0.0,
274
+ "num_tokens": 355352.0,
275
+ "reward": 0.30000001192092896,
276
+ "reward_std": 0.0,
277
+ "rewards/compute_reward/mean": 0.30000001192092896,
278
+ "rewards/compute_reward/std": 0.0,
279
+ "step": 100,
280
+ "step_time": 15.381215008600702
281
+ },
282
+ {
283
+ "clip_ratio/high_max": 0.0,
284
+ "clip_ratio/high_mean": 0.0,
285
+ "clip_ratio/low_mean": 0.0,
286
+ "clip_ratio/low_min": 0.0,
287
+ "clip_ratio/region_mean": 0.0,
288
+ "completions/clipped_ratio": 1.0,
289
+ "completions/max_length": 100.0,
290
+ "completions/max_terminated_length": 0.0,
291
+ "completions/mean_length": 100.0,
292
+ "completions/mean_terminated_length": 0.0,
293
+ "completions/min_length": 100.0,
294
+ "completions/min_terminated_length": 0.0,
295
+ "entropy": 0.7429351836442948,
296
+ "epoch": 0.11,
297
+ "frac_reward_zero_std": 1.0,
298
+ "grad_norm": 0.0,
299
+ "learning_rate": 2.2750000000000002e-06,
300
+ "loss": 0.0,
301
+ "num_tokens": 389508.0,
302
+ "reward": 0.30000001192092896,
303
+ "reward_std": 0.0,
304
+ "rewards/compute_reward/mean": 0.30000001192092896,
305
+ "rewards/compute_reward/std": 0.0,
306
+ "step": 110,
307
+ "step_time": 15.039604106301704
308
+ },
309
+ {
310
+ "clip_ratio/high_max": 0.0,
311
+ "clip_ratio/high_mean": 0.0,
312
+ "clip_ratio/low_mean": 0.0,
313
+ "clip_ratio/low_min": 0.0,
314
+ "clip_ratio/region_mean": 0.0,
315
+ "completions/clipped_ratio": 1.0,
316
+ "completions/max_length": 100.0,
317
+ "completions/max_terminated_length": 0.0,
318
+ "completions/mean_length": 100.0,
319
+ "completions/mean_terminated_length": 0.0,
320
+ "completions/min_length": 100.0,
321
+ "completions/min_terminated_length": 0.0,
322
+ "entropy": 0.7703481003642082,
323
+ "epoch": 0.12,
324
+ "frac_reward_zero_std": 1.0,
325
+ "grad_norm": 0.0,
326
+ "learning_rate": 2.025e-06,
327
+ "loss": 0.0,
328
+ "num_tokens": 426240.0,
329
+ "reward": 0.30000001192092896,
330
+ "reward_std": 0.0,
331
+ "rewards/compute_reward/mean": 0.30000001192092896,
332
+ "rewards/compute_reward/std": 0.0,
333
+ "step": 120,
334
+ "step_time": 15.29271342299835
335
+ },
336
+ {
337
+ "clip_ratio/high_max": 0.0,
338
+ "clip_ratio/high_mean": 0.0,
339
+ "clip_ratio/low_mean": 0.0,
340
+ "clip_ratio/low_min": 0.0,
341
+ "clip_ratio/region_mean": 0.0,
342
+ "completions/clipped_ratio": 1.0,
343
+ "completions/max_length": 100.0,
344
+ "completions/max_terminated_length": 0.0,
345
+ "completions/mean_length": 100.0,
346
+ "completions/mean_terminated_length": 0.0,
347
+ "completions/min_length": 100.0,
348
+ "completions/min_terminated_length": 0.0,
349
+ "entropy": 0.7375139251351357,
350
+ "epoch": 0.13,
351
+ "frac_reward_zero_std": 1.0,
352
+ "grad_norm": 0.0,
353
+ "learning_rate": 1.7750000000000002e-06,
354
+ "loss": 0.0,
355
+ "num_tokens": 462400.0,
356
+ "reward": 0.30000001192092896,
357
+ "reward_std": 0.0,
358
+ "rewards/compute_reward/mean": 0.30000001192092896,
359
+ "rewards/compute_reward/std": 0.0,
360
+ "step": 130,
361
+ "step_time": 15.20639470120077
362
+ },
363
+ {
364
+ "clip_ratio/high_max": 0.0,
365
+ "clip_ratio/high_mean": 0.0,
366
+ "clip_ratio/low_mean": 0.0,
367
+ "clip_ratio/low_min": 0.0,
368
+ "clip_ratio/region_mean": 0.0,
369
+ "completions/clipped_ratio": 1.0,
370
+ "completions/max_length": 100.0,
371
+ "completions/max_terminated_length": 0.0,
372
+ "completions/mean_length": 100.0,
373
+ "completions/mean_terminated_length": 0.0,
374
+ "completions/min_length": 100.0,
375
+ "completions/min_terminated_length": 0.0,
376
+ "entropy": 0.8568216070532799,
377
+ "epoch": 0.14,
378
+ "frac_reward_zero_std": 0.9,
379
+ "grad_norm": 0.0,
380
+ "learning_rate": 1.525e-06,
381
+ "loss": 1.7881393432617187e-08,
382
+ "num_tokens": 498020.0,
383
+ "reward": 0.30500001311302183,
384
+ "reward_std": 0.01414213478565216,
385
+ "rewards/compute_reward/mean": 0.30500001311302183,
386
+ "rewards/compute_reward/std": 0.01414213478565216,
387
+ "step": 140,
388
+ "step_time": 15.200056954801402
389
+ },
390
+ {
391
+ "clip_ratio/high_max": 0.0,
392
+ "clip_ratio/high_mean": 0.0,
393
+ "clip_ratio/low_mean": 0.0,
394
+ "clip_ratio/low_min": 0.0,
395
+ "clip_ratio/region_mean": 0.0,
396
+ "completions/clipped_ratio": 1.0,
397
+ "completions/max_length": 100.0,
398
+ "completions/max_terminated_length": 0.0,
399
+ "completions/mean_length": 100.0,
400
+ "completions/mean_terminated_length": 0.0,
401
+ "completions/min_length": 100.0,
402
+ "completions/min_terminated_length": 0.0,
403
+ "entropy": 1.4760520339012146,
404
+ "epoch": 0.15,
405
+ "frac_reward_zero_std": 0.95,
406
+ "grad_norm": 0.0,
407
+ "learning_rate": 1.275e-06,
408
+ "loss": 8.940696716308593e-09,
409
+ "num_tokens": 532668.0,
410
+ "reward": 0.3025000125169754,
411
+ "reward_std": 0.00707106739282608,
412
+ "rewards/compute_reward/mean": 0.3025000125169754,
413
+ "rewards/compute_reward/std": 0.00707106739282608,
414
+ "step": 150,
415
+ "step_time": 14.748404727898015
416
+ },
417
+ {
418
+ "clip_ratio/high_max": 0.0,
419
+ "clip_ratio/high_mean": 0.0,
420
+ "clip_ratio/low_mean": 0.0,
421
+ "clip_ratio/low_min": 0.0,
422
+ "clip_ratio/region_mean": 0.0,
423
+ "completions/clipped_ratio": 1.0,
424
+ "completions/max_length": 100.0,
425
+ "completions/max_terminated_length": 0.0,
426
+ "completions/mean_length": 100.0,
427
+ "completions/mean_terminated_length": 0.0,
428
+ "completions/min_length": 100.0,
429
+ "completions/min_terminated_length": 0.0,
430
+ "entropy": 1.7379814833402634,
431
+ "epoch": 0.16,
432
+ "frac_reward_zero_std": 0.9,
433
+ "grad_norm": 0.0,
434
+ "learning_rate": 1.025e-06,
435
+ "loss": 1.564621925354004e-08,
436
+ "num_tokens": 567544.0,
437
+ "reward": 0.3037500113248825,
438
+ "reward_std": 0.01060660146176815,
439
+ "rewards/compute_reward/mean": 0.3037500113248825,
440
+ "rewards/compute_reward/std": 0.01060660108923912,
441
+ "step": 160,
442
+ "step_time": 15.037257523898734
443
+ },
444
+ {
445
+ "clip_ratio/high_max": 0.0,
446
+ "clip_ratio/high_mean": 0.0,
447
+ "clip_ratio/low_mean": 0.0,
448
+ "clip_ratio/low_min": 0.0,
449
+ "clip_ratio/region_mean": 0.0,
450
+ "completions/clipped_ratio": 1.0,
451
+ "completions/max_length": 100.0,
452
+ "completions/max_terminated_length": 0.0,
453
+ "completions/mean_length": 100.0,
454
+ "completions/mean_terminated_length": 0.0,
455
+ "completions/min_length": 100.0,
456
+ "completions/min_terminated_length": 0.0,
457
+ "entropy": 1.5534777998924256,
458
+ "epoch": 0.17,
459
+ "frac_reward_zero_std": 0.8,
460
+ "grad_norm": 0.0,
461
+ "learning_rate": 7.750000000000001e-07,
462
+ "loss": 1.7881393432617187e-08,
463
+ "num_tokens": 604400.0,
464
+ "reward": 0.31500001549720763,
465
+ "reward_std": 0.032658536732196805,
466
+ "rewards/compute_reward/mean": 0.31500001549720763,
467
+ "rewards/compute_reward/std": 0.032658536732196805,
468
+ "step": 170,
469
+ "step_time": 15.339705387198773
470
+ },
471
+ {
472
+ "clip_ratio/high_max": 0.0,
473
+ "clip_ratio/high_mean": 0.0,
474
+ "clip_ratio/low_mean": 0.0,
475
+ "clip_ratio/low_min": 0.0,
476
+ "clip_ratio/region_mean": 0.0,
477
+ "completions/clipped_ratio": 1.0,
478
+ "completions/max_length": 100.0,
479
+ "completions/max_terminated_length": 0.0,
480
+ "completions/mean_length": 100.0,
481
+ "completions/mean_terminated_length": 0.0,
482
+ "completions/min_length": 100.0,
483
+ "completions/min_terminated_length": 0.0,
484
+ "entropy": 1.3570319384336471,
485
+ "epoch": 0.18,
486
+ "frac_reward_zero_std": 0.9,
487
+ "grad_norm": 2.3024227619171143,
488
+ "learning_rate": 5.250000000000001e-07,
489
+ "loss": 8.195638656616212e-09,
490
+ "num_tokens": 639432.0,
491
+ "reward": 0.3075000137090683,
492
+ "reward_std": 0.02121320217847824,
493
+ "rewards/compute_reward/mean": 0.3075000137090683,
494
+ "rewards/compute_reward/std": 0.02121320217847824,
495
+ "step": 180,
496
+ "step_time": 14.838772397398861
497
+ },
498
+ {
499
+ "clip_ratio/high_max": 0.0,
500
+ "clip_ratio/high_mean": 0.0,
501
+ "clip_ratio/low_mean": 0.0,
502
+ "clip_ratio/low_min": 0.0,
503
+ "clip_ratio/region_mean": 0.0,
504
+ "completions/clipped_ratio": 1.0,
505
+ "completions/max_length": 100.0,
506
+ "completions/max_terminated_length": 0.0,
507
+ "completions/mean_length": 100.0,
508
+ "completions/mean_terminated_length": 0.0,
509
+ "completions/min_length": 100.0,
510
+ "completions/min_terminated_length": 0.0,
511
+ "entropy": 1.4456530869007111,
512
+ "epoch": 0.19,
513
+ "frac_reward_zero_std": 0.75,
514
+ "grad_norm": 1.3511810302734375,
515
+ "learning_rate": 2.75e-07,
516
+ "loss": 4.0978193283081055e-08,
517
+ "num_tokens": 674972.0,
518
+ "reward": 0.31125001311302186,
519
+ "reward_std": 0.03181980364024639,
520
+ "rewards/compute_reward/mean": 0.31125001311302186,
521
+ "rewards/compute_reward/std": 0.03181980326771736,
522
+ "step": 190,
523
+ "step_time": 15.081224197598932
524
+ },
525
+ {
526
+ "clip_ratio/high_max": 0.0,
527
+ "clip_ratio/high_mean": 0.0,
528
+ "clip_ratio/low_mean": 0.0,
529
+ "clip_ratio/low_min": 0.0,
530
+ "clip_ratio/region_mean": 0.0,
531
+ "completions/clipped_ratio": 1.0,
532
+ "completions/max_length": 100.0,
533
+ "completions/max_terminated_length": 0.0,
534
+ "completions/mean_length": 100.0,
535
+ "completions/mean_terminated_length": 0.0,
536
+ "completions/min_length": 100.0,
537
+ "completions/min_terminated_length": 0.0,
538
+ "entropy": 1.5674545228481294,
539
+ "epoch": 0.2,
540
+ "frac_reward_zero_std": 0.75,
541
+ "grad_norm": 1.818772792816162,
542
+ "learning_rate": 2.5000000000000002e-08,
543
+ "loss": 3.129243850708008e-08,
544
+ "num_tokens": 709480.0,
545
+ "reward": 0.31750001311302184,
546
+ "reward_std": 0.04316474497318268,
547
+ "rewards/compute_reward/mean": 0.31750001311302184,
548
+ "rewards/compute_reward/std": 0.04316474497318268,
549
+ "step": 200,
550
+ "step_time": 15.07085579989798
551
+ }
552
+ ],
553
+ "logging_steps": 10,
554
+ "max_steps": 200,
555
+ "num_input_tokens_seen": 709480,
556
+ "num_train_epochs": 1,
557
+ "save_steps": 100,
558
+ "stateful_callbacks": {
559
+ "TrainerControl": {
560
+ "args": {
561
+ "should_epoch_stop": false,
562
+ "should_evaluate": false,
563
+ "should_log": false,
564
+ "should_save": true,
565
+ "should_training_stop": true
566
+ },
567
+ "attributes": {}
568
+ }
569
+ },
570
+ "total_flos": 0.0,
571
+ "train_batch_size": 2,
572
+ "trial_name": null,
573
+ "trial_params": null
574
+ }
training/checkpoints/phase2_final/config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 1,
8
+ "dtype": "float32",
9
+ "eos_token_id": 2,
10
+ "head_dim": 64,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 2048,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 5632,
15
+ "max_position_embeddings": 2048,
16
+ "mlp_bias": false,
17
+ "model_type": "llama",
18
+ "num_attention_heads": 32,
19
+ "num_hidden_layers": 22,
20
+ "num_key_value_heads": 4,
21
+ "pad_token_id": 2,
22
+ "pretraining_tp": 1,
23
+ "rms_norm_eps": 1e-05,
24
+ "rope_parameters": {
25
+ "rope_theta": 10000.0,
26
+ "rope_type": "default"
27
+ },
28
+ "tie_word_embeddings": false,
29
+ "transformers_version": "5.3.0",
30
+ "use_cache": false,
31
+ "vocab_size": 32000
32
+ }
training/checkpoints/phase2_final/generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "eos_token_id": [
4
+ 2
5
+ ],
6
+ "max_length": 2048,
7
+ "pad_token_id": 2,
8
+ "transformers_version": "5.3.0"
9
+ }
training/checkpoints/phase2_final/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
training/checkpoints/phase2_final/tokenizer_config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": null,
3
+ "backend": "tokenizers",
4
+ "bos_token": "<s>",
5
+ "clean_up_tokenization_spaces": false,
6
+ "eos_token": "</s>",
7
+ "is_local": true,
8
+ "max_length": null,
9
+ "model_max_length": 2048,
10
+ "pad_to_multiple_of": null,
11
+ "pad_token": "</s>",
12
+ "pad_token_type_id": 0,
13
+ "padding_side": "left",
14
+ "sp_model_kwargs": {},
15
+ "tokenizer_class": "LlamaTokenizer",
16
+ "truncation_side": "left",
17
+ "unk_token": "<unk>",
18
+ "use_default_system_prompt": false
19
+ }
training/checkpoints/unified_final/README.md ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ model_name: unified_final
4
+ tags:
5
+ - generated_from_trainer
6
+ - trl
7
+ - grpo
8
+ licence: license
9
+ ---
10
+
11
+ # Model Card for unified_final
12
+
13
+ This model is a fine-tuned version of [None](https://huggingface.co/None).
14
+ It has been trained using [TRL](https://github.com/huggingface/trl).
15
+
16
+ ## Quick start
17
+
18
+ ```python
19
+ from transformers import pipeline
20
+
21
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
22
+ generator = pipeline("text-generation", model="None", device="cuda")
23
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
24
+ print(output["generated_text"])
25
+ ```
26
+
27
+ ## Training procedure
28
+
29
+
30
+
31
+
32
+
33
+ This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
34
+
35
+ ### Framework versions
36
+
37
+ - TRL: 0.29.0
38
+ - Transformers: 5.3.0
39
+ - Pytorch: 2.12.0.dev20260307+cu128
40
+ - Datasets: 4.6.1
41
+ - Tokenizers: 0.22.2
42
+
43
+ ## Citations
44
+
45
+ Cite GRPO as:
46
+
47
+ ```bibtex
48
+ @article{shao2024deepseekmath,
49
+ title = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}},
50
+ author = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. Wu and Daya Guo},
51
+ year = 2024,
52
+ eprint = {arXiv:2402.03300},
53
+ }
54
+
55
+ ```
56
+
57
+ Cite TRL as:
58
+
59
+ ```bibtex
60
+ @software{vonwerra2020trl,
61
+ title = {{TRL: Transformers Reinforcement Learning}},
62
+ author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin},
63
+ license = {Apache-2.0},
64
+ url = {https://github.com/huggingface/trl},
65
+ year = {2020}
66
+ }
67
+ ```
training/checkpoints/unified_final/chat_template.jinja ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {% for message in messages %}
2
+ {% if message['role'] == 'user' %}
3
+ {{ '<|user|>
4
+ ' + message['content'] + eos_token }}
5
+ {% elif message['role'] == 'system' %}
6
+ {{ '<|system|>
7
+ ' + message['content'] + eos_token }}
8
+ {% elif message['role'] == 'assistant' %}
9
+ {{ '<|assistant|>
10
+ ' + message['content'] + eos_token }}
11
+ {% endif %}
12
+ {% if loop.last and add_generation_prompt %}
13
+ {{ '<|assistant|>' }}
14
+ {% endif %}
15
+ {% endfor %}
training/checkpoints/unified_final/checkpoint-100/chat_template.jinja ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {% for message in messages %}
2
+ {% if message['role'] == 'user' %}
3
+ {{ '<|user|>
4
+ ' + message['content'] + eos_token }}
5
+ {% elif message['role'] == 'system' %}
6
+ {{ '<|system|>
7
+ ' + message['content'] + eos_token }}
8
+ {% elif message['role'] == 'assistant' %}
9
+ {{ '<|assistant|>
10
+ ' + message['content'] + eos_token }}
11
+ {% endif %}
12
+ {% if loop.last and add_generation_prompt %}
13
+ {{ '<|assistant|>' }}
14
+ {% endif %}
15
+ {% endfor %}
training/checkpoints/unified_final/checkpoint-100/config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 1,
8
+ "dtype": "float32",
9
+ "eos_token_id": 2,
10
+ "head_dim": 64,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 2048,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 5632,
15
+ "max_position_embeddings": 2048,
16
+ "mlp_bias": false,
17
+ "model_type": "llama",
18
+ "num_attention_heads": 32,
19
+ "num_hidden_layers": 22,
20
+ "num_key_value_heads": 4,
21
+ "pad_token_id": 2,
22
+ "pretraining_tp": 1,
23
+ "rms_norm_eps": 1e-05,
24
+ "rope_parameters": {
25
+ "rope_theta": 10000.0,
26
+ "rope_type": "default"
27
+ },
28
+ "tie_word_embeddings": false,
29
+ "transformers_version": "5.3.0",
30
+ "use_cache": false,
31
+ "vocab_size": 32000
32
+ }
training/checkpoints/unified_final/checkpoint-100/generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "eos_token_id": [
4
+ 2
5
+ ],
6
+ "max_length": 2048,
7
+ "pad_token_id": 2,
8
+ "transformers_version": "5.3.0"
9
+ }
training/checkpoints/unified_final/checkpoint-100/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
training/checkpoints/unified_final/checkpoint-100/tokenizer_config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": null,
3
+ "backend": "tokenizers",
4
+ "bos_token": "<s>",
5
+ "clean_up_tokenization_spaces": false,
6
+ "eos_token": "</s>",
7
+ "is_local": true,
8
+ "max_length": null,
9
+ "model_max_length": 2048,
10
+ "pad_to_multiple_of": null,
11
+ "pad_token": "</s>",
12
+ "pad_token_type_id": 0,
13
+ "padding_side": "left",
14
+ "sp_model_kwargs": {},
15
+ "tokenizer_class": "LlamaTokenizer",
16
+ "truncation_side": "left",
17
+ "unk_token": "<unk>",
18
+ "use_default_system_prompt": false
19
+ }
training/checkpoints/unified_final/checkpoint-100/trainer_state.json ADDED
@@ -0,0 +1,304 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.1,
6
+ "eval_steps": 500,
7
+ "global_step": 100,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "clip_ratio/high_max": 0.0,
14
+ "clip_ratio/high_mean": 0.0,
15
+ "clip_ratio/low_mean": 0.0,
16
+ "clip_ratio/low_min": 0.0,
17
+ "clip_ratio/region_mean": 0.0,
18
+ "completions/clipped_ratio": 1.0,
19
+ "completions/max_length": 100.0,
20
+ "completions/max_terminated_length": 0.0,
21
+ "completions/mean_length": 100.0,
22
+ "completions/mean_terminated_length": 0.0,
23
+ "completions/min_length": 100.0,
24
+ "completions/min_terminated_length": 0.0,
25
+ "entropy": 1.3566992908716202,
26
+ "epoch": 0.01,
27
+ "frac_reward_zero_std": 0.0,
28
+ "grad_norm": 0.7344621419906616,
29
+ "learning_rate": 4.775e-06,
30
+ "loss": 3.0994415283203126e-07,
31
+ "num_tokens": 35800.0,
32
+ "reward": 0.01268580500036478,
33
+ "reward_std": 0.02462496655061841,
34
+ "rewards/compute_reward/mean": 0.01268580500036478,
35
+ "rewards/compute_reward/std": 0.024624967435374855,
36
+ "step": 10,
37
+ "step_time": 10.7134718033034
38
+ },
39
+ {
40
+ "clip_ratio/high_max": 0.0,
41
+ "clip_ratio/high_mean": 0.0,
42
+ "clip_ratio/low_mean": 0.0,
43
+ "clip_ratio/low_min": 0.0,
44
+ "clip_ratio/region_mean": 0.0,
45
+ "completions/clipped_ratio": 1.0,
46
+ "completions/max_length": 100.0,
47
+ "completions/max_terminated_length": 0.0,
48
+ "completions/mean_length": 100.0,
49
+ "completions/mean_terminated_length": 0.0,
50
+ "completions/min_length": 100.0,
51
+ "completions/min_terminated_length": 0.0,
52
+ "entropy": 1.3169427752494811,
53
+ "epoch": 0.02,
54
+ "frac_reward_zero_std": 0.0,
55
+ "grad_norm": 4.441726207733154,
56
+ "learning_rate": 4.525000000000001e-06,
57
+ "loss": -4.246830940246582e-07,
58
+ "num_tokens": 71748.0,
59
+ "reward": -0.04455982223153114,
60
+ "reward_std": 0.035665383422747256,
61
+ "rewards/compute_reward/mean": -0.04455982223153114,
62
+ "rewards/compute_reward/std": 0.03566538490122184,
63
+ "step": 20,
64
+ "step_time": 10.643421414200565
65
+ },
66
+ {
67
+ "clip_ratio/high_max": 0.0,
68
+ "clip_ratio/high_mean": 0.0,
69
+ "clip_ratio/low_mean": 0.0,
70
+ "clip_ratio/low_min": 0.0,
71
+ "clip_ratio/region_mean": 0.0,
72
+ "completions/clipped_ratio": 0.9875,
73
+ "completions/max_length": 100.0,
74
+ "completions/max_terminated_length": 8.9,
75
+ "completions/mean_length": 99.8625,
76
+ "completions/mean_terminated_length": 8.9,
77
+ "completions/min_length": 98.9,
78
+ "completions/min_terminated_length": 8.9,
79
+ "entropy": 1.0057833462953567,
80
+ "epoch": 0.03,
81
+ "frac_reward_zero_std": 0.0,
82
+ "grad_norm": 3.0170326232910156,
83
+ "learning_rate": 4.2750000000000006e-06,
84
+ "loss": -0.0018164031207561493,
85
+ "num_tokens": 108181.0,
86
+ "reward": 0.0374881561845541,
87
+ "reward_std": 0.020618790527805686,
88
+ "rewards/compute_reward/mean": 0.0374881561845541,
89
+ "rewards/compute_reward/std": 0.0206187907140702,
90
+ "step": 30,
91
+ "step_time": 10.756140169796709
92
+ },
93
+ {
94
+ "clip_ratio/high_max": 0.0,
95
+ "clip_ratio/high_mean": 0.0,
96
+ "clip_ratio/low_mean": 0.0,
97
+ "clip_ratio/low_min": 0.0,
98
+ "clip_ratio/region_mean": 0.0,
99
+ "completions/clipped_ratio": 0.9875,
100
+ "completions/max_length": 100.0,
101
+ "completions/max_terminated_length": 6.6,
102
+ "completions/mean_length": 99.575,
103
+ "completions/mean_terminated_length": 6.6,
104
+ "completions/min_length": 96.6,
105
+ "completions/min_terminated_length": 6.6,
106
+ "entropy": 1.7816664546728134,
107
+ "epoch": 0.04,
108
+ "frac_reward_zero_std": 0.0,
109
+ "grad_norm": 5.86561393737793,
110
+ "learning_rate": 4.0250000000000004e-06,
111
+ "loss": -0.006361240148544311,
112
+ "num_tokens": 143375.0,
113
+ "reward": -0.014824284799396991,
114
+ "reward_std": 0.06699581742286682,
115
+ "rewards/compute_reward/mean": -0.014824284799396991,
116
+ "rewards/compute_reward/std": 0.06699582003057003,
117
+ "step": 40,
118
+ "step_time": 10.785410385398427
119
+ },
120
+ {
121
+ "clip_ratio/high_max": 0.0,
122
+ "clip_ratio/high_mean": 0.0,
123
+ "clip_ratio/low_mean": 0.0,
124
+ "clip_ratio/low_min": 0.0,
125
+ "clip_ratio/region_mean": 0.0,
126
+ "completions/clipped_ratio": 0.9875,
127
+ "completions/max_length": 100.0,
128
+ "completions/max_terminated_length": 3.0,
129
+ "completions/mean_length": 99.125,
130
+ "completions/mean_terminated_length": 3.0,
131
+ "completions/min_length": 93.0,
132
+ "completions/min_terminated_length": 3.0,
133
+ "entropy": 2.1307705104351045,
134
+ "epoch": 0.05,
135
+ "frac_reward_zero_std": 0.0,
136
+ "grad_norm": 6.191352367401123,
137
+ "learning_rate": 3.7750000000000003e-06,
138
+ "loss": -0.011027154326438905,
139
+ "num_tokens": 178941.0,
140
+ "reward": -0.016337488451972602,
141
+ "reward_std": 0.051818730868399145,
142
+ "rewards/compute_reward/mean": -0.016337488451972602,
143
+ "rewards/compute_reward/std": 0.05181873142719269,
144
+ "step": 50,
145
+ "step_time": 10.741381045605522
146
+ },
147
+ {
148
+ "clip_ratio/high_max": 0.0,
149
+ "clip_ratio/high_mean": 0.0,
150
+ "clip_ratio/low_mean": 0.0,
151
+ "clip_ratio/low_min": 0.0,
152
+ "clip_ratio/region_mean": 0.0,
153
+ "completions/clipped_ratio": 0.9875,
154
+ "completions/max_length": 100.0,
155
+ "completions/max_terminated_length": 8.8,
156
+ "completions/mean_length": 99.85,
157
+ "completions/mean_terminated_length": 8.8,
158
+ "completions/min_length": 98.8,
159
+ "completions/min_terminated_length": 8.8,
160
+ "entropy": 2.1041357040405275,
161
+ "epoch": 0.06,
162
+ "frac_reward_zero_std": 0.0,
163
+ "grad_norm": 8.536041259765625,
164
+ "learning_rate": 3.525e-06,
165
+ "loss": 0.0019509844481945039,
166
+ "num_tokens": 216257.0,
167
+ "reward": 0.035917540453374384,
168
+ "reward_std": 0.04930563308298588,
169
+ "rewards/compute_reward/mean": 0.035917540453374384,
170
+ "rewards/compute_reward/std": 0.049305635318160054,
171
+ "step": 60,
172
+ "step_time": 11.27133785020269
173
+ },
174
+ {
175
+ "clip_ratio/high_max": 0.0,
176
+ "clip_ratio/high_mean": 0.0,
177
+ "clip_ratio/low_mean": 0.0,
178
+ "clip_ratio/low_min": 0.0,
179
+ "clip_ratio/region_mean": 0.0,
180
+ "completions/clipped_ratio": 0.8,
181
+ "completions/max_length": 100.0,
182
+ "completions/max_terminated_length": 48.2,
183
+ "completions/mean_length": 92.9625,
184
+ "completions/mean_terminated_length": 38.51333351135254,
185
+ "completions/min_length": 70.1,
186
+ "completions/min_terminated_length": 30.1,
187
+ "entropy": 1.6469052851200103,
188
+ "epoch": 0.07,
189
+ "frac_reward_zero_std": 0.0,
190
+ "grad_norm": 6.919373512268066,
191
+ "learning_rate": 3.2750000000000004e-06,
192
+ "loss": -0.02075239419937134,
193
+ "num_tokens": 251110.0,
194
+ "reward": 0.007261525164358318,
195
+ "reward_std": 0.0802696269005537,
196
+ "rewards/compute_reward/mean": 0.007261525164358318,
197
+ "rewards/compute_reward/std": 0.08026962876319885,
198
+ "step": 70,
199
+ "step_time": 10.774873650902009
200
+ },
201
+ {
202
+ "clip_ratio/high_max": 0.0,
203
+ "clip_ratio/high_mean": 0.0,
204
+ "clip_ratio/low_mean": 0.0,
205
+ "clip_ratio/low_min": 0.0,
206
+ "clip_ratio/region_mean": 0.0,
207
+ "completions/clipped_ratio": 0.9875,
208
+ "completions/max_length": 100.0,
209
+ "completions/max_terminated_length": 3.1,
210
+ "completions/mean_length": 99.1375,
211
+ "completions/mean_terminated_length": 3.1,
212
+ "completions/min_length": 93.1,
213
+ "completions/min_terminated_length": 3.1,
214
+ "entropy": 2.2336367428302766,
215
+ "epoch": 0.08,
216
+ "frac_reward_zero_std": 0.0,
217
+ "grad_norm": 4.918172836303711,
218
+ "learning_rate": 3.0250000000000003e-06,
219
+ "loss": 0.008250368386507034,
220
+ "num_tokens": 285729.0,
221
+ "reward": 0.027657157555222512,
222
+ "reward_std": 0.04840414375066757,
223
+ "rewards/compute_reward/mean": 0.027657157555222512,
224
+ "rewards/compute_reward/std": 0.048404145427048205,
225
+ "step": 80,
226
+ "step_time": 10.43483721170196
227
+ },
228
+ {
229
+ "clip_ratio/high_max": 0.0,
230
+ "clip_ratio/high_mean": 0.0,
231
+ "clip_ratio/low_mean": 0.0,
232
+ "clip_ratio/low_min": 0.0,
233
+ "clip_ratio/region_mean": 0.0,
234
+ "completions/clipped_ratio": 1.0,
235
+ "completions/max_length": 100.0,
236
+ "completions/max_terminated_length": 0.0,
237
+ "completions/mean_length": 100.0,
238
+ "completions/mean_terminated_length": 0.0,
239
+ "completions/min_length": 100.0,
240
+ "completions/min_terminated_length": 0.0,
241
+ "entropy": 1.8057245463132858,
242
+ "epoch": 0.09,
243
+ "frac_reward_zero_std": 0.0,
244
+ "grad_norm": 4.417481422424316,
245
+ "learning_rate": 2.7750000000000005e-06,
246
+ "loss": 2.216547727584839e-08,
247
+ "num_tokens": 320249.0,
248
+ "reward": 0.07908838111907243,
249
+ "reward_std": 0.07920666746795177,
250
+ "rewards/compute_reward/mean": 0.07908838111907243,
251
+ "rewards/compute_reward/std": 0.07920666970312595,
252
+ "step": 90,
253
+ "step_time": 10.337220244196942
254
+ },
255
+ {
256
+ "clip_ratio/high_max": 0.0,
257
+ "clip_ratio/high_mean": 0.0,
258
+ "clip_ratio/low_mean": 0.0,
259
+ "clip_ratio/low_min": 0.0,
260
+ "clip_ratio/region_mean": 0.0,
261
+ "completions/clipped_ratio": 1.0,
262
+ "completions/max_length": 100.0,
263
+ "completions/max_terminated_length": 0.0,
264
+ "completions/mean_length": 100.0,
265
+ "completions/mean_terminated_length": 0.0,
266
+ "completions/min_length": 100.0,
267
+ "completions/min_terminated_length": 0.0,
268
+ "entropy": 1.4064194440841675,
269
+ "epoch": 0.1,
270
+ "frac_reward_zero_std": 0.0,
271
+ "grad_norm": 3.352966785430908,
272
+ "learning_rate": 2.5250000000000004e-06,
273
+ "loss": 8.493661880493164e-08,
274
+ "num_tokens": 355369.0,
275
+ "reward": 0.14763977155089378,
276
+ "reward_std": 0.07424246501177549,
277
+ "rewards/compute_reward/mean": 0.14763977155089378,
278
+ "rewards/compute_reward/std": 0.0742424676194787,
279
+ "step": 100,
280
+ "step_time": 10.74917738300719
281
+ }
282
+ ],
283
+ "logging_steps": 10,
284
+ "max_steps": 200,
285
+ "num_input_tokens_seen": 355369,
286
+ "num_train_epochs": 1,
287
+ "save_steps": 100,
288
+ "stateful_callbacks": {
289
+ "TrainerControl": {
290
+ "args": {
291
+ "should_epoch_stop": false,
292
+ "should_evaluate": false,
293
+ "should_log": false,
294
+ "should_save": true,
295
+ "should_training_stop": false
296
+ },
297
+ "attributes": {}
298
+ }
299
+ },
300
+ "total_flos": 0.0,
301
+ "train_batch_size": 2,
302
+ "trial_name": null,
303
+ "trial_params": null
304
+ }
training/checkpoints/unified_final/checkpoint-200/chat_template.jinja ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {% for message in messages %}
2
+ {% if message['role'] == 'user' %}
3
+ {{ '<|user|>
4
+ ' + message['content'] + eos_token }}
5
+ {% elif message['role'] == 'system' %}
6
+ {{ '<|system|>
7
+ ' + message['content'] + eos_token }}
8
+ {% elif message['role'] == 'assistant' %}
9
+ {{ '<|assistant|>
10
+ ' + message['content'] + eos_token }}
11
+ {% endif %}
12
+ {% if loop.last and add_generation_prompt %}
13
+ {{ '<|assistant|>' }}
14
+ {% endif %}
15
+ {% endfor %}
training/checkpoints/unified_final/checkpoint-200/config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 1,
8
+ "dtype": "float32",
9
+ "eos_token_id": 2,
10
+ "head_dim": 64,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 2048,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 5632,
15
+ "max_position_embeddings": 2048,
16
+ "mlp_bias": false,
17
+ "model_type": "llama",
18
+ "num_attention_heads": 32,
19
+ "num_hidden_layers": 22,
20
+ "num_key_value_heads": 4,
21
+ "pad_token_id": 2,
22
+ "pretraining_tp": 1,
23
+ "rms_norm_eps": 1e-05,
24
+ "rope_parameters": {
25
+ "rope_theta": 10000.0,
26
+ "rope_type": "default"
27
+ },
28
+ "tie_word_embeddings": false,
29
+ "transformers_version": "5.3.0",
30
+ "use_cache": false,
31
+ "vocab_size": 32000
32
+ }
training/checkpoints/unified_final/checkpoint-200/generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "eos_token_id": [
4
+ 2
5
+ ],
6
+ "max_length": 2048,
7
+ "pad_token_id": 2,
8
+ "transformers_version": "5.3.0"
9
+ }
training/checkpoints/unified_final/checkpoint-200/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
training/checkpoints/unified_final/checkpoint-200/tokenizer_config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": null,
3
+ "backend": "tokenizers",
4
+ "bos_token": "<s>",
5
+ "clean_up_tokenization_spaces": false,
6
+ "eos_token": "</s>",
7
+ "is_local": true,
8
+ "max_length": null,
9
+ "model_max_length": 2048,
10
+ "pad_to_multiple_of": null,
11
+ "pad_token": "</s>",
12
+ "pad_token_type_id": 0,
13
+ "padding_side": "left",
14
+ "sp_model_kwargs": {},
15
+ "tokenizer_class": "LlamaTokenizer",
16
+ "truncation_side": "left",
17
+ "unk_token": "<unk>",
18
+ "use_default_system_prompt": false
19
+ }
training/checkpoints/unified_final/checkpoint-200/trainer_state.json ADDED
@@ -0,0 +1,574 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.2,
6
+ "eval_steps": 500,
7
+ "global_step": 200,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "clip_ratio/high_max": 0.0,
14
+ "clip_ratio/high_mean": 0.0,
15
+ "clip_ratio/low_mean": 0.0,
16
+ "clip_ratio/low_min": 0.0,
17
+ "clip_ratio/region_mean": 0.0,
18
+ "completions/clipped_ratio": 1.0,
19
+ "completions/max_length": 100.0,
20
+ "completions/max_terminated_length": 0.0,
21
+ "completions/mean_length": 100.0,
22
+ "completions/mean_terminated_length": 0.0,
23
+ "completions/min_length": 100.0,
24
+ "completions/min_terminated_length": 0.0,
25
+ "entropy": 1.3566992908716202,
26
+ "epoch": 0.01,
27
+ "frac_reward_zero_std": 0.0,
28
+ "grad_norm": 0.7344621419906616,
29
+ "learning_rate": 4.775e-06,
30
+ "loss": 3.0994415283203126e-07,
31
+ "num_tokens": 35800.0,
32
+ "reward": 0.01268580500036478,
33
+ "reward_std": 0.02462496655061841,
34
+ "rewards/compute_reward/mean": 0.01268580500036478,
35
+ "rewards/compute_reward/std": 0.024624967435374855,
36
+ "step": 10,
37
+ "step_time": 10.7134718033034
38
+ },
39
+ {
40
+ "clip_ratio/high_max": 0.0,
41
+ "clip_ratio/high_mean": 0.0,
42
+ "clip_ratio/low_mean": 0.0,
43
+ "clip_ratio/low_min": 0.0,
44
+ "clip_ratio/region_mean": 0.0,
45
+ "completions/clipped_ratio": 1.0,
46
+ "completions/max_length": 100.0,
47
+ "completions/max_terminated_length": 0.0,
48
+ "completions/mean_length": 100.0,
49
+ "completions/mean_terminated_length": 0.0,
50
+ "completions/min_length": 100.0,
51
+ "completions/min_terminated_length": 0.0,
52
+ "entropy": 1.3169427752494811,
53
+ "epoch": 0.02,
54
+ "frac_reward_zero_std": 0.0,
55
+ "grad_norm": 4.441726207733154,
56
+ "learning_rate": 4.525000000000001e-06,
57
+ "loss": -4.246830940246582e-07,
58
+ "num_tokens": 71748.0,
59
+ "reward": -0.04455982223153114,
60
+ "reward_std": 0.035665383422747256,
61
+ "rewards/compute_reward/mean": -0.04455982223153114,
62
+ "rewards/compute_reward/std": 0.03566538490122184,
63
+ "step": 20,
64
+ "step_time": 10.643421414200565
65
+ },
66
+ {
67
+ "clip_ratio/high_max": 0.0,
68
+ "clip_ratio/high_mean": 0.0,
69
+ "clip_ratio/low_mean": 0.0,
70
+ "clip_ratio/low_min": 0.0,
71
+ "clip_ratio/region_mean": 0.0,
72
+ "completions/clipped_ratio": 0.9875,
73
+ "completions/max_length": 100.0,
74
+ "completions/max_terminated_length": 8.9,
75
+ "completions/mean_length": 99.8625,
76
+ "completions/mean_terminated_length": 8.9,
77
+ "completions/min_length": 98.9,
78
+ "completions/min_terminated_length": 8.9,
79
+ "entropy": 1.0057833462953567,
80
+ "epoch": 0.03,
81
+ "frac_reward_zero_std": 0.0,
82
+ "grad_norm": 3.0170326232910156,
83
+ "learning_rate": 4.2750000000000006e-06,
84
+ "loss": -0.0018164031207561493,
85
+ "num_tokens": 108181.0,
86
+ "reward": 0.0374881561845541,
87
+ "reward_std": 0.020618790527805686,
88
+ "rewards/compute_reward/mean": 0.0374881561845541,
89
+ "rewards/compute_reward/std": 0.0206187907140702,
90
+ "step": 30,
91
+ "step_time": 10.756140169796709
92
+ },
93
+ {
94
+ "clip_ratio/high_max": 0.0,
95
+ "clip_ratio/high_mean": 0.0,
96
+ "clip_ratio/low_mean": 0.0,
97
+ "clip_ratio/low_min": 0.0,
98
+ "clip_ratio/region_mean": 0.0,
99
+ "completions/clipped_ratio": 0.9875,
100
+ "completions/max_length": 100.0,
101
+ "completions/max_terminated_length": 6.6,
102
+ "completions/mean_length": 99.575,
103
+ "completions/mean_terminated_length": 6.6,
104
+ "completions/min_length": 96.6,
105
+ "completions/min_terminated_length": 6.6,
106
+ "entropy": 1.7816664546728134,
107
+ "epoch": 0.04,
108
+ "frac_reward_zero_std": 0.0,
109
+ "grad_norm": 5.86561393737793,
110
+ "learning_rate": 4.0250000000000004e-06,
111
+ "loss": -0.006361240148544311,
112
+ "num_tokens": 143375.0,
113
+ "reward": -0.014824284799396991,
114
+ "reward_std": 0.06699581742286682,
115
+ "rewards/compute_reward/mean": -0.014824284799396991,
116
+ "rewards/compute_reward/std": 0.06699582003057003,
117
+ "step": 40,
118
+ "step_time": 10.785410385398427
119
+ },
120
+ {
121
+ "clip_ratio/high_max": 0.0,
122
+ "clip_ratio/high_mean": 0.0,
123
+ "clip_ratio/low_mean": 0.0,
124
+ "clip_ratio/low_min": 0.0,
125
+ "clip_ratio/region_mean": 0.0,
126
+ "completions/clipped_ratio": 0.9875,
127
+ "completions/max_length": 100.0,
128
+ "completions/max_terminated_length": 3.0,
129
+ "completions/mean_length": 99.125,
130
+ "completions/mean_terminated_length": 3.0,
131
+ "completions/min_length": 93.0,
132
+ "completions/min_terminated_length": 3.0,
133
+ "entropy": 2.1307705104351045,
134
+ "epoch": 0.05,
135
+ "frac_reward_zero_std": 0.0,
136
+ "grad_norm": 6.191352367401123,
137
+ "learning_rate": 3.7750000000000003e-06,
138
+ "loss": -0.011027154326438905,
139
+ "num_tokens": 178941.0,
140
+ "reward": -0.016337488451972602,
141
+ "reward_std": 0.051818730868399145,
142
+ "rewards/compute_reward/mean": -0.016337488451972602,
143
+ "rewards/compute_reward/std": 0.05181873142719269,
144
+ "step": 50,
145
+ "step_time": 10.741381045605522
146
+ },
147
+ {
148
+ "clip_ratio/high_max": 0.0,
149
+ "clip_ratio/high_mean": 0.0,
150
+ "clip_ratio/low_mean": 0.0,
151
+ "clip_ratio/low_min": 0.0,
152
+ "clip_ratio/region_mean": 0.0,
153
+ "completions/clipped_ratio": 0.9875,
154
+ "completions/max_length": 100.0,
155
+ "completions/max_terminated_length": 8.8,
156
+ "completions/mean_length": 99.85,
157
+ "completions/mean_terminated_length": 8.8,
158
+ "completions/min_length": 98.8,
159
+ "completions/min_terminated_length": 8.8,
160
+ "entropy": 2.1041357040405275,
161
+ "epoch": 0.06,
162
+ "frac_reward_zero_std": 0.0,
163
+ "grad_norm": 8.536041259765625,
164
+ "learning_rate": 3.525e-06,
165
+ "loss": 0.0019509844481945039,
166
+ "num_tokens": 216257.0,
167
+ "reward": 0.035917540453374384,
168
+ "reward_std": 0.04930563308298588,
169
+ "rewards/compute_reward/mean": 0.035917540453374384,
170
+ "rewards/compute_reward/std": 0.049305635318160054,
171
+ "step": 60,
172
+ "step_time": 11.27133785020269
173
+ },
174
+ {
175
+ "clip_ratio/high_max": 0.0,
176
+ "clip_ratio/high_mean": 0.0,
177
+ "clip_ratio/low_mean": 0.0,
178
+ "clip_ratio/low_min": 0.0,
179
+ "clip_ratio/region_mean": 0.0,
180
+ "completions/clipped_ratio": 0.8,
181
+ "completions/max_length": 100.0,
182
+ "completions/max_terminated_length": 48.2,
183
+ "completions/mean_length": 92.9625,
184
+ "completions/mean_terminated_length": 38.51333351135254,
185
+ "completions/min_length": 70.1,
186
+ "completions/min_terminated_length": 30.1,
187
+ "entropy": 1.6469052851200103,
188
+ "epoch": 0.07,
189
+ "frac_reward_zero_std": 0.0,
190
+ "grad_norm": 6.919373512268066,
191
+ "learning_rate": 3.2750000000000004e-06,
192
+ "loss": -0.02075239419937134,
193
+ "num_tokens": 251110.0,
194
+ "reward": 0.007261525164358318,
195
+ "reward_std": 0.0802696269005537,
196
+ "rewards/compute_reward/mean": 0.007261525164358318,
197
+ "rewards/compute_reward/std": 0.08026962876319885,
198
+ "step": 70,
199
+ "step_time": 10.774873650902009
200
+ },
201
+ {
202
+ "clip_ratio/high_max": 0.0,
203
+ "clip_ratio/high_mean": 0.0,
204
+ "clip_ratio/low_mean": 0.0,
205
+ "clip_ratio/low_min": 0.0,
206
+ "clip_ratio/region_mean": 0.0,
207
+ "completions/clipped_ratio": 0.9875,
208
+ "completions/max_length": 100.0,
209
+ "completions/max_terminated_length": 3.1,
210
+ "completions/mean_length": 99.1375,
211
+ "completions/mean_terminated_length": 3.1,
212
+ "completions/min_length": 93.1,
213
+ "completions/min_terminated_length": 3.1,
214
+ "entropy": 2.2336367428302766,
215
+ "epoch": 0.08,
216
+ "frac_reward_zero_std": 0.0,
217
+ "grad_norm": 4.918172836303711,
218
+ "learning_rate": 3.0250000000000003e-06,
219
+ "loss": 0.008250368386507034,
220
+ "num_tokens": 285729.0,
221
+ "reward": 0.027657157555222512,
222
+ "reward_std": 0.04840414375066757,
223
+ "rewards/compute_reward/mean": 0.027657157555222512,
224
+ "rewards/compute_reward/std": 0.048404145427048205,
225
+ "step": 80,
226
+ "step_time": 10.43483721170196
227
+ },
228
+ {
229
+ "clip_ratio/high_max": 0.0,
230
+ "clip_ratio/high_mean": 0.0,
231
+ "clip_ratio/low_mean": 0.0,
232
+ "clip_ratio/low_min": 0.0,
233
+ "clip_ratio/region_mean": 0.0,
234
+ "completions/clipped_ratio": 1.0,
235
+ "completions/max_length": 100.0,
236
+ "completions/max_terminated_length": 0.0,
237
+ "completions/mean_length": 100.0,
238
+ "completions/mean_terminated_length": 0.0,
239
+ "completions/min_length": 100.0,
240
+ "completions/min_terminated_length": 0.0,
241
+ "entropy": 1.8057245463132858,
242
+ "epoch": 0.09,
243
+ "frac_reward_zero_std": 0.0,
244
+ "grad_norm": 4.417481422424316,
245
+ "learning_rate": 2.7750000000000005e-06,
246
+ "loss": 2.216547727584839e-08,
247
+ "num_tokens": 320249.0,
248
+ "reward": 0.07908838111907243,
249
+ "reward_std": 0.07920666746795177,
250
+ "rewards/compute_reward/mean": 0.07908838111907243,
251
+ "rewards/compute_reward/std": 0.07920666970312595,
252
+ "step": 90,
253
+ "step_time": 10.337220244196942
254
+ },
255
+ {
256
+ "clip_ratio/high_max": 0.0,
257
+ "clip_ratio/high_mean": 0.0,
258
+ "clip_ratio/low_mean": 0.0,
259
+ "clip_ratio/low_min": 0.0,
260
+ "clip_ratio/region_mean": 0.0,
261
+ "completions/clipped_ratio": 1.0,
262
+ "completions/max_length": 100.0,
263
+ "completions/max_terminated_length": 0.0,
264
+ "completions/mean_length": 100.0,
265
+ "completions/mean_terminated_length": 0.0,
266
+ "completions/min_length": 100.0,
267
+ "completions/min_terminated_length": 0.0,
268
+ "entropy": 1.4064194440841675,
269
+ "epoch": 0.1,
270
+ "frac_reward_zero_std": 0.0,
271
+ "grad_norm": 3.352966785430908,
272
+ "learning_rate": 2.5250000000000004e-06,
273
+ "loss": 8.493661880493164e-08,
274
+ "num_tokens": 355369.0,
275
+ "reward": 0.14763977155089378,
276
+ "reward_std": 0.07424246501177549,
277
+ "rewards/compute_reward/mean": 0.14763977155089378,
278
+ "rewards/compute_reward/std": 0.0742424676194787,
279
+ "step": 100,
280
+ "step_time": 10.74917738300719
281
+ },
282
+ {
283
+ "clip_ratio/high_max": 0.0,
284
+ "clip_ratio/high_mean": 0.0,
285
+ "clip_ratio/low_mean": 0.0,
286
+ "clip_ratio/low_min": 0.0,
287
+ "clip_ratio/region_mean": 0.0,
288
+ "completions/clipped_ratio": 1.0,
289
+ "completions/max_length": 100.0,
290
+ "completions/max_terminated_length": 0.0,
291
+ "completions/mean_length": 100.0,
292
+ "completions/mean_terminated_length": 0.0,
293
+ "completions/min_length": 100.0,
294
+ "completions/min_terminated_length": 0.0,
295
+ "entropy": 1.2582464694976807,
296
+ "epoch": 0.11,
297
+ "frac_reward_zero_std": 0.0,
298
+ "grad_norm": 3.9595463275909424,
299
+ "learning_rate": 2.2750000000000002e-06,
300
+ "loss": -3.874301910400391e-08,
301
+ "num_tokens": 392289.0,
302
+ "reward": 0.18278183937072753,
303
+ "reward_std": 0.052620683796703815,
304
+ "rewards/compute_reward/mean": 0.18278183937072753,
305
+ "rewards/compute_reward/std": 0.05262068491429091,
306
+ "step": 110,
307
+ "step_time": 11.17140419179923
308
+ },
309
+ {
310
+ "clip_ratio/high_max": 0.0,
311
+ "clip_ratio/high_mean": 0.0,
312
+ "clip_ratio/low_mean": 0.0,
313
+ "clip_ratio/low_min": 0.0,
314
+ "clip_ratio/region_mean": 0.0,
315
+ "completions/clipped_ratio": 1.0,
316
+ "completions/max_length": 100.0,
317
+ "completions/max_terminated_length": 0.0,
318
+ "completions/mean_length": 100.0,
319
+ "completions/mean_terminated_length": 0.0,
320
+ "completions/min_length": 100.0,
321
+ "completions/min_terminated_length": 0.0,
322
+ "entropy": 0.8805452413856983,
323
+ "epoch": 0.12,
324
+ "frac_reward_zero_std": 0.0,
325
+ "grad_norm": 2.707214593887329,
326
+ "learning_rate": 2.025e-06,
327
+ "loss": 1.5050172805786132e-07,
328
+ "num_tokens": 430501.0,
329
+ "reward": 0.22903144657611846,
330
+ "reward_std": 0.04029850559309125,
331
+ "rewards/compute_reward/mean": 0.22903144657611846,
332
+ "rewards/compute_reward/std": 0.04029850568622351,
333
+ "step": 120,
334
+ "step_time": 11.244449263699062
335
+ },
336
+ {
337
+ "clip_ratio/high_max": 0.0,
338
+ "clip_ratio/high_mean": 0.0,
339
+ "clip_ratio/low_mean": 0.0,
340
+ "clip_ratio/low_min": 0.0,
341
+ "clip_ratio/region_mean": 0.0,
342
+ "completions/clipped_ratio": 1.0,
343
+ "completions/max_length": 100.0,
344
+ "completions/max_terminated_length": 0.0,
345
+ "completions/mean_length": 100.0,
346
+ "completions/mean_terminated_length": 0.0,
347
+ "completions/min_length": 100.0,
348
+ "completions/min_terminated_length": 0.0,
349
+ "entropy": 0.8755271568894386,
350
+ "epoch": 0.13,
351
+ "frac_reward_zero_std": 0.0,
352
+ "grad_norm": 3.942605495452881,
353
+ "learning_rate": 1.7750000000000002e-06,
354
+ "loss": 1.2218952178955077e-07,
355
+ "num_tokens": 467245.0,
356
+ "reward": 0.18334048390388488,
357
+ "reward_std": 0.07254596166312695,
358
+ "rewards/compute_reward/mean": 0.18334048390388488,
359
+ "rewards/compute_reward/std": 0.072545962408185,
360
+ "step": 130,
361
+ "step_time": 11.071729802998016
362
+ },
363
+ {
364
+ "clip_ratio/high_max": 0.0,
365
+ "clip_ratio/high_mean": 0.0,
366
+ "clip_ratio/low_mean": 0.0,
367
+ "clip_ratio/low_min": 0.0,
368
+ "clip_ratio/region_mean": 0.0,
369
+ "completions/clipped_ratio": 1.0,
370
+ "completions/max_length": 100.0,
371
+ "completions/max_terminated_length": 0.0,
372
+ "completions/mean_length": 100.0,
373
+ "completions/mean_terminated_length": 0.0,
374
+ "completions/min_length": 100.0,
375
+ "completions/min_terminated_length": 0.0,
376
+ "entropy": 0.9737002968788147,
377
+ "epoch": 0.14,
378
+ "frac_reward_zero_std": 0.0,
379
+ "grad_norm": 4.040837287902832,
380
+ "learning_rate": 1.525e-06,
381
+ "loss": -1.4007091522216797e-07,
382
+ "num_tokens": 503017.0,
383
+ "reward": 0.20783505886793135,
384
+ "reward_std": 0.06580547224730253,
385
+ "rewards/compute_reward/mean": 0.20783505886793135,
386
+ "rewards/compute_reward/std": 0.06580547466874123,
387
+ "step": 140,
388
+ "step_time": 10.841636341501726
389
+ },
390
+ {
391
+ "clip_ratio/high_max": 0.0,
392
+ "clip_ratio/high_mean": 0.0,
393
+ "clip_ratio/low_mean": 0.0,
394
+ "clip_ratio/low_min": 0.0,
395
+ "clip_ratio/region_mean": 0.0,
396
+ "completions/clipped_ratio": 1.0,
397
+ "completions/max_length": 100.0,
398
+ "completions/max_terminated_length": 0.0,
399
+ "completions/mean_length": 100.0,
400
+ "completions/mean_terminated_length": 0.0,
401
+ "completions/min_length": 100.0,
402
+ "completions/min_terminated_length": 0.0,
403
+ "entropy": 0.9901166066527367,
404
+ "epoch": 0.15,
405
+ "frac_reward_zero_std": 0.0,
406
+ "grad_norm": 3.720881462097168,
407
+ "learning_rate": 1.275e-06,
408
+ "loss": 2.0861625671386717e-08,
409
+ "num_tokens": 539801.0,
410
+ "reward": 0.2224348157644272,
411
+ "reward_std": 0.05879365894943476,
412
+ "rewards/compute_reward/mean": 0.2224348157644272,
413
+ "rewards/compute_reward/std": 0.05879366043955088,
414
+ "step": 150,
415
+ "step_time": 10.85469058619783
416
+ },
417
+ {
418
+ "clip_ratio/high_max": 0.0,
419
+ "clip_ratio/high_mean": 0.0,
420
+ "clip_ratio/low_mean": 0.0,
421
+ "clip_ratio/low_min": 0.0,
422
+ "clip_ratio/region_mean": 0.0,
423
+ "completions/clipped_ratio": 1.0,
424
+ "completions/max_length": 100.0,
425
+ "completions/max_terminated_length": 0.0,
426
+ "completions/mean_length": 100.0,
427
+ "completions/mean_terminated_length": 0.0,
428
+ "completions/min_length": 100.0,
429
+ "completions/min_terminated_length": 0.0,
430
+ "entropy": 1.1208710052073,
431
+ "epoch": 0.16,
432
+ "frac_reward_zero_std": 0.0,
433
+ "grad_norm": 3.452557325363159,
434
+ "learning_rate": 1.025e-06,
435
+ "loss": 1.4603137969970704e-07,
436
+ "num_tokens": 575385.0,
437
+ "reward": 0.1992661789059639,
438
+ "reward_std": 0.06030977526679635,
439
+ "rewards/compute_reward/mean": 0.1992661789059639,
440
+ "rewards/compute_reward/std": 0.060309774987399575,
441
+ "step": 160,
442
+ "step_time": 10.620040459206212
443
+ },
444
+ {
445
+ "clip_ratio/high_max": 0.0,
446
+ "clip_ratio/high_mean": 0.0,
447
+ "clip_ratio/low_mean": 0.0,
448
+ "clip_ratio/low_min": 0.0,
449
+ "clip_ratio/region_mean": 0.0,
450
+ "completions/clipped_ratio": 0.9875,
451
+ "completions/max_length": 100.0,
452
+ "completions/max_terminated_length": 8.5,
453
+ "completions/mean_length": 99.8125,
454
+ "completions/mean_terminated_length": 8.5,
455
+ "completions/min_length": 98.5,
456
+ "completions/min_terminated_length": 8.5,
457
+ "entropy": 0.943237779289484,
458
+ "epoch": 0.17,
459
+ "frac_reward_zero_std": 0.0,
460
+ "grad_norm": 3.998199701309204,
461
+ "learning_rate": 7.750000000000001e-07,
462
+ "loss": 0.0005225777626037597,
463
+ "num_tokens": 611998.0,
464
+ "reward": 0.21552147567272187,
465
+ "reward_std": 0.032230423856526615,
466
+ "rewards/compute_reward/mean": 0.21552147567272187,
467
+ "rewards/compute_reward/std": 0.0322304243221879,
468
+ "step": 170,
469
+ "step_time": 10.901679297701047
470
+ },
471
+ {
472
+ "clip_ratio/high_max": 0.0,
473
+ "clip_ratio/high_mean": 0.0,
474
+ "clip_ratio/low_mean": 0.0,
475
+ "clip_ratio/low_min": 0.0,
476
+ "clip_ratio/region_mean": 0.0,
477
+ "completions/clipped_ratio": 1.0,
478
+ "completions/max_length": 100.0,
479
+ "completions/max_terminated_length": 0.0,
480
+ "completions/mean_length": 100.0,
481
+ "completions/mean_terminated_length": 0.0,
482
+ "completions/min_length": 100.0,
483
+ "completions/min_terminated_length": 0.0,
484
+ "entropy": 0.9798725090920926,
485
+ "epoch": 0.18,
486
+ "frac_reward_zero_std": 0.0,
487
+ "grad_norm": 3.732668161392212,
488
+ "learning_rate": 5.250000000000001e-07,
489
+ "loss": -8.270144462585449e-08,
490
+ "num_tokens": 647338.0,
491
+ "reward": 0.21226384192705156,
492
+ "reward_std": 0.06548679377883673,
493
+ "rewards/compute_reward/mean": 0.21226384192705156,
494
+ "rewards/compute_reward/std": 0.0654867960140109,
495
+ "step": 180,
496
+ "step_time": 10.853807216498534
497
+ },
498
+ {
499
+ "clip_ratio/high_max": 0.0,
500
+ "clip_ratio/high_mean": 0.0,
501
+ "clip_ratio/low_mean": 0.0,
502
+ "clip_ratio/low_min": 0.0,
503
+ "clip_ratio/region_mean": 0.0,
504
+ "completions/clipped_ratio": 1.0,
505
+ "completions/max_length": 100.0,
506
+ "completions/max_terminated_length": 0.0,
507
+ "completions/mean_length": 100.0,
508
+ "completions/mean_terminated_length": 0.0,
509
+ "completions/min_length": 100.0,
510
+ "completions/min_terminated_length": 0.0,
511
+ "entropy": 0.9461549550294877,
512
+ "epoch": 0.19,
513
+ "frac_reward_zero_std": 0.0,
514
+ "grad_norm": 3.7145590782165527,
515
+ "learning_rate": 2.75e-07,
516
+ "loss": -2.1532177925109862e-07,
517
+ "num_tokens": 682026.0,
518
+ "reward": 0.21948475018143654,
519
+ "reward_std": 0.05461370516568422,
520
+ "rewards/compute_reward/mean": 0.21948475018143654,
521
+ "rewards/compute_reward/std": 0.05461370553821325,
522
+ "step": 190,
523
+ "step_time": 10.456350517399551
524
+ },
525
+ {
526
+ "clip_ratio/high_max": 0.0,
527
+ "clip_ratio/high_mean": 0.0,
528
+ "clip_ratio/low_mean": 0.0,
529
+ "clip_ratio/low_min": 0.0,
530
+ "clip_ratio/region_mean": 0.0,
531
+ "completions/clipped_ratio": 1.0,
532
+ "completions/max_length": 100.0,
533
+ "completions/max_terminated_length": 0.0,
534
+ "completions/mean_length": 100.0,
535
+ "completions/mean_terminated_length": 0.0,
536
+ "completions/min_length": 100.0,
537
+ "completions/min_terminated_length": 0.0,
538
+ "entropy": 0.8442220821976661,
539
+ "epoch": 0.2,
540
+ "frac_reward_zero_std": 0.0,
541
+ "grad_norm": 3.7965171337127686,
542
+ "learning_rate": 2.5000000000000002e-08,
543
+ "loss": 1.0430812835693359e-08,
544
+ "num_tokens": 716746.0,
545
+ "reward": 0.2305009976029396,
546
+ "reward_std": 0.03879760131239891,
547
+ "rewards/compute_reward/mean": 0.2305009976029396,
548
+ "rewards/compute_reward/std": 0.03879760047420859,
549
+ "step": 200,
550
+ "step_time": 10.340635509999993
551
+ }
552
+ ],
553
+ "logging_steps": 10,
554
+ "max_steps": 200,
555
+ "num_input_tokens_seen": 716746,
556
+ "num_train_epochs": 1,
557
+ "save_steps": 100,
558
+ "stateful_callbacks": {
559
+ "TrainerControl": {
560
+ "args": {
561
+ "should_epoch_stop": false,
562
+ "should_evaluate": false,
563
+ "should_log": false,
564
+ "should_save": true,
565
+ "should_training_stop": true
566
+ },
567
+ "attributes": {}
568
+ }
569
+ },
570
+ "total_flos": 0.0,
571
+ "train_batch_size": 2,
572
+ "trial_name": null,
573
+ "trial_params": null
574
+ }
training/checkpoints/unified_final/config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 1,
8
+ "dtype": "float32",
9
+ "eos_token_id": 2,
10
+ "head_dim": 64,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 2048,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 5632,
15
+ "max_position_embeddings": 2048,
16
+ "mlp_bias": false,
17
+ "model_type": "llama",
18
+ "num_attention_heads": 32,
19
+ "num_hidden_layers": 22,
20
+ "num_key_value_heads": 4,
21
+ "pad_token_id": 2,
22
+ "pretraining_tp": 1,
23
+ "rms_norm_eps": 1e-05,
24
+ "rope_parameters": {
25
+ "rope_theta": 10000.0,
26
+ "rope_type": "default"
27
+ },
28
+ "tie_word_embeddings": false,
29
+ "transformers_version": "5.3.0",
30
+ "use_cache": false,
31
+ "vocab_size": 32000
32
+ }
training/checkpoints/unified_final/generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "eos_token_id": [
4
+ 2
5
+ ],
6
+ "max_length": 2048,
7
+ "pad_token_id": 2,
8
+ "transformers_version": "5.3.0"
9
+ }
training/checkpoints/unified_final/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
training/checkpoints/unified_final/tokenizer_config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": null,
3
+ "backend": "tokenizers",
4
+ "bos_token": "<s>",
5
+ "clean_up_tokenization_spaces": false,
6
+ "eos_token": "</s>",
7
+ "is_local": true,
8
+ "max_length": null,
9
+ "model_max_length": 2048,
10
+ "pad_to_multiple_of": null,
11
+ "pad_token": "</s>",
12
+ "pad_token_type_id": 0,
13
+ "padding_side": "left",
14
+ "sp_model_kwargs": {},
15
+ "tokenizer_class": "LlamaTokenizer",
16
+ "truncation_side": "left",
17
+ "unk_token": "<unk>",
18
+ "use_default_system_prompt": false
19
+ }
training/checkpoints/unified_final/unified_reward_log.json ADDED
@@ -0,0 +1,810 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "accuracy": [
3
+ 0.012478123821101302,
4
+ 0.013689774048328765,
5
+ 0.12357050236883002,
6
+ 0.043150096433237195,
7
+ 0.11808098944816375,
8
+ 0.14478551750907398,
9
+ 0.21936089415676943,
10
+ 0.14560732765872023,
11
+ 0.12766012796254073,
12
+ 0.16228250732999258,
13
+ 0.19256023689530533,
14
+ 0.153446869824083,
15
+ 0.08735395734236795,
16
+ 0.25620539761275585,
17
+ 0.2796424323605421,
18
+ 0.4050695781981913,
19
+ 0.34320680785281277,
20
+ 0.39042326634482405,
21
+ 0.24141882976569753,
22
+ 0.2882491476114424,
23
+ 0.2805112680700598,
24
+ 0.1299182187184869,
25
+ 0.18283964773559502,
26
+ 0.08174918994377885,
27
+ 0.1305077084983307,
28
+ 0.15188368799701088,
29
+ 0.10731278214010087,
30
+ 0.10817607256366782,
31
+ 0.1742403849902705,
32
+ 0.15966549523684162,
33
+ 0.21224383614993403,
34
+ 0.30634267989144903,
35
+ 0.2563189622014761,
36
+ 0.13088561721084532,
37
+ 0.23896305011421776,
38
+ 0.36338720554077614,
39
+ 0.2743395734578371,
40
+ 0.2785670698390685,
41
+ 0.26690704237418583,
42
+ 0.23420825800444123,
43
+ 0.4486492634482796,
44
+ 0.3085314377908274,
45
+ 0.27236165767163295,
46
+ 0.351135627192783,
47
+ 0.37157259147763155,
48
+ 0.4091061054548437,
49
+ 0.3321387716436809,
50
+ 0.25690332708634805,
51
+ 0.4042620632377111,
52
+ 0.21426805183517378,
53
+ 0.46486986328175767,
54
+ 0.5354255396266014,
55
+ 0.5316739152617584,
56
+ 0.3626249278251227,
57
+ 0.5560084815324287,
58
+ 0.47374602488847506,
59
+ 0.5622030981309204,
60
+ 0.6260334739834723,
61
+ 0.5388746766273916,
62
+ 0.43546972183358157,
63
+ 0.4384314355118149,
64
+ 0.43255371653260083,
65
+ 0.382003842773009,
66
+ 0.33916141995282467,
67
+ 0.4102824234143368,
68
+ 0.4002692943218704,
69
+ 0.4433627484561765,
70
+ 0.5707634448719365,
71
+ 0.3326736211199734,
72
+ 0.41868448313128437,
73
+ 0.4830820909726724,
74
+ 0.5073173724203757,
75
+ 0.6011403764343056,
76
+ 0.2652010267221505,
77
+ 0.5708498617899997,
78
+ 0.5372080254474398,
79
+ 0.34268688791221447,
80
+ 0.36077516272765764,
81
+ 0.6577040443039563,
82
+ 0.5249539674929385,
83
+ 0.3393068936409599,
84
+ 0.3981918416905377,
85
+ 0.5998766558760262,
86
+ 0.3886278953534839,
87
+ 0.47030574201103836,
88
+ 0.5933578772929455,
89
+ 0.629797753552287,
90
+ 0.6829957361516797,
91
+ 0.5975855789903534,
92
+ 0.37033629002672747,
93
+ 0.40129960235208273,
94
+ 0.44104763492941856,
95
+ 0.5250475457257945,
96
+ 0.5792574424612014,
97
+ 0.25491493314992414,
98
+ 0.4456432306425367,
99
+ 0.3674802188566988,
100
+ 0.5168529125349757,
101
+ 0.7135775878197881,
102
+ 0.408872426591652,
103
+ 0.29645813006976085,
104
+ 0.5807047440217663,
105
+ 0.3951396545427582,
106
+ 0.5820897600332913,
107
+ 0.5751887943251881,
108
+ 0.6462836385320105,
109
+ 0.452535930180199,
110
+ 0.6309295986678539,
111
+ 0.521345004487674,
112
+ 0.7523772581521466,
113
+ 0.3868275580258203,
114
+ 0.6621844534173644,
115
+ 0.757102247782526,
116
+ 0.7496667811480936,
117
+ 0.765902349873787,
118
+ 0.7620735178706088,
119
+ 0.8005386810387373,
120
+ 0.7600417191929723,
121
+ 0.7790964529097753,
122
+ 0.8060362095807505,
123
+ 0.6639245812548539,
124
+ 0.49642928937921477,
125
+ 0.4622820479255877,
126
+ 0.5039745619269863,
127
+ 0.5521504355740943,
128
+ 0.763103948879152,
129
+ 0.3649169562800698,
130
+ 0.8642640291197355,
131
+ 0.7673212948914258,
132
+ 0.6856467187291327,
133
+ 0.6203947744628628,
134
+ 0.635864180446877,
135
+ 0.7076110516058842,
136
+ 0.45257112707172986,
137
+ 0.4927382976084982,
138
+ 0.735338338570779,
139
+ 0.7325108773598185,
140
+ 0.5286115260781837,
141
+ 0.6873601944038981,
142
+ 0.7558585478414992,
143
+ 0.8025525164825894,
144
+ 0.5403924472630024,
145
+ 0.8109585656614495,
146
+ 0.45960476465808653,
147
+ 0.7726514123926349,
148
+ 0.78036072270019,
149
+ 0.5612159043391909,
150
+ 0.668619691132455,
151
+ 0.7187997825397312,
152
+ 0.6008389099901545,
153
+ 0.5160061409523324,
154
+ 0.6712722339255528,
155
+ 0.25213094055121654,
156
+ 0.7931299787283417,
157
+ 0.5770709363152806,
158
+ 0.3674653100689218,
159
+ 0.7533031922202384,
160
+ 0.5477579357220128,
161
+ 0.9013020257140825,
162
+ 0.774595058715597,
163
+ 0.5444791193214735,
164
+ 0.28536322558907645,
165
+ 0.8018009673613502,
166
+ 0.7534115956222964,
167
+ 0.8178817865612724,
168
+ 0.7691389758719754,
169
+ 0.746364161759599,
170
+ 0.7686015134039534,
171
+ 0.734219302571865,
172
+ 0.32221002464589255,
173
+ 0.47941368112339633,
174
+ 0.7168057798061833,
175
+ 0.772261652825011,
176
+ 0.5291935548529084,
177
+ 0.7485607594114032,
178
+ 0.5932522241567504,
179
+ 0.5648661194163807,
180
+ 0.5709367030781823,
181
+ 0.7752278802176389,
182
+ 0.6248770881515031,
183
+ 0.5446761697530746,
184
+ 0.8044651419608864,
185
+ 0.855248827897706,
186
+ 0.5436122580157401,
187
+ 0.9085174062877894,
188
+ 0.31500336882736524,
189
+ 0.6913784691774245,
190
+ 0.5400797382818436,
191
+ 0.6050753133365693,
192
+ 0.7986505120673587,
193
+ 0.8202528873914283,
194
+ 0.6996518377501237,
195
+ 0.8313200483947909,
196
+ 0.4808844911385792,
197
+ 0.7306097140061414,
198
+ 0.5058602896511918,
199
+ 0.6438089653119033,
200
+ 0.7879260241436392,
201
+ 0.8337068369817564,
202
+ 0.537435884385747
203
+ ],
204
+ "outcome": [
205
+ 0.4,
206
+ 0.42500000000000004,
207
+ 0.4375,
208
+ 0.42500000000000004,
209
+ 0.4,
210
+ 0.4,
211
+ 0.4,
212
+ 0.25,
213
+ 0.4,
214
+ 0.0,
215
+ 0.0,
216
+ 0.0,
217
+ 0.0,
218
+ 0.07500000000000001,
219
+ 0.025,
220
+ 0.07500000000000001,
221
+ 0.0,
222
+ 0.07500000000000001,
223
+ 0.05,
224
+ 0.07500000000000001,
225
+ 0.225,
226
+ 0.4,
227
+ 0.4,
228
+ 0.4,
229
+ 0.42500000000000004,
230
+ 0.4,
231
+ 0.4,
232
+ 0.4,
233
+ 0.4,
234
+ 0.4,
235
+ 0.35000000000000003,
236
+ 0.175,
237
+ 0.15,
238
+ 0.15000000000000002,
239
+ 0.07500000000000001,
240
+ 0.17500000000000002,
241
+ 0.1,
242
+ 0.0,
243
+ 0.05,
244
+ 0.07500000000000001,
245
+ 0.07500000000000001,
246
+ 0.07500000000000001,
247
+ 0.025,
248
+ 0.0,
249
+ 0.0,
250
+ 0.0,
251
+ 0.07500000000000001,
252
+ 0.15000000000000002,
253
+ 0.0,
254
+ 0.05,
255
+ 0.0,
256
+ 0.025,
257
+ 0.0,
258
+ 0.0,
259
+ 0.0,
260
+ 0.05,
261
+ 0.0,
262
+ 0.05,
263
+ 0.025,
264
+ 0.07500000000000001,
265
+ 0.0,
266
+ 0.05,
267
+ 0.025,
268
+ 0.1,
269
+ 0.025,
270
+ 0.025,
271
+ 0.025,
272
+ 0.025,
273
+ 0.0,
274
+ 0.05,
275
+ 0.05,
276
+ 0.0,
277
+ 0.05,
278
+ 0.0,
279
+ 0.0,
280
+ 0.025,
281
+ 0.05,
282
+ 0.025,
283
+ 0.0,
284
+ 0.025,
285
+ 0.05,
286
+ 0.07500000000000001,
287
+ 0.125,
288
+ 0.25,
289
+ 0.125,
290
+ 0.2,
291
+ 0.05,
292
+ 0.17500000000000002,
293
+ 0.225,
294
+ 0.2,
295
+ 0.30000000000000004,
296
+ 0.375,
297
+ 0.35,
298
+ 0.42500000000000004,
299
+ 0.35000000000000003,
300
+ 0.42500000000000004,
301
+ 0.4,
302
+ 0.4,
303
+ 0.4,
304
+ 0.42500000000000004,
305
+ 0.42500000000000004,
306
+ 0.45,
307
+ 0.4,
308
+ 0.4,
309
+ 0.4,
310
+ 0.4,
311
+ 0.4,
312
+ 0.4,
313
+ 0.45,
314
+ 0.35000000000000003,
315
+ 0.4,
316
+ 0.4,
317
+ 0.4,
318
+ 0.35000000000000003,
319
+ 0.4,
320
+ 0.4,
321
+ 0.25,
322
+ 0.25,
323
+ 0.35000000000000003,
324
+ 0.4,
325
+ 0.35000000000000003,
326
+ 0.30000000000000004,
327
+ 0.4,
328
+ 0.35000000000000003,
329
+ 0.35000000000000003,
330
+ 0.35000000000000003,
331
+ 0.4,
332
+ 0.35000000000000003,
333
+ 0.35000000000000003,
334
+ 0.2,
335
+ 0.35000000000000003,
336
+ 0.4,
337
+ 0.35000000000000003,
338
+ 0.42500000000000004,
339
+ 0.4,
340
+ 0.30000000000000004,
341
+ 0.4,
342
+ 0.4,
343
+ 0.42500000000000004,
344
+ 0.42500000000000004,
345
+ 0.4,
346
+ 0.42500000000000004,
347
+ 0.4,
348
+ 0.4,
349
+ 0.35000000000000003,
350
+ 0.42500000000000004,
351
+ 0.30000000000000004,
352
+ 0.42500000000000004,
353
+ 0.4,
354
+ 0.4,
355
+ 0.4,
356
+ 0.42500000000000004,
357
+ 0.4,
358
+ 0.35000000000000003,
359
+ 0.4,
360
+ 0.42500000000000004,
361
+ 0.4,
362
+ 0.42500000000000004,
363
+ 0.25,
364
+ 0.35000000000000003,
365
+ 0.4,
366
+ 0.4,
367
+ 0.35000000000000003,
368
+ 0.4,
369
+ 0.4,
370
+ 0.35000000000000003,
371
+ 0.4,
372
+ 0.4,
373
+ 0.4,
374
+ 0.4,
375
+ 0.4,
376
+ 0.4,
377
+ 0.4,
378
+ 0.42500000000000004,
379
+ 0.4,
380
+ 0.4,
381
+ 0.4,
382
+ 0.375,
383
+ 0.4,
384
+ 0.375,
385
+ 0.4,
386
+ 0.35000000000000003,
387
+ 0.4,
388
+ 0.4,
389
+ 0.35000000000000003,
390
+ 0.42500000000000004,
391
+ 0.4,
392
+ 0.4,
393
+ 0.42500000000000004,
394
+ 0.4,
395
+ 0.4,
396
+ 0.4,
397
+ 0.4,
398
+ 0.45,
399
+ 0.4,
400
+ 0.4,
401
+ 0.4,
402
+ 0.35000000000000003,
403
+ 0.4,
404
+ 0.4
405
+ ],
406
+ "bluff": [
407
+ -0.5,
408
+ -0.5,
409
+ -0.5,
410
+ -0.5,
411
+ -0.5,
412
+ -0.5,
413
+ -0.5,
414
+ -0.5,
415
+ -0.5,
416
+ -0.5,
417
+ -0.5,
418
+ -0.5,
419
+ -0.5,
420
+ -0.5,
421
+ -0.5,
422
+ -0.5,
423
+ -0.5,
424
+ -0.5,
425
+ -0.5,
426
+ -0.5,
427
+ -0.5,
428
+ -0.5,
429
+ -0.5,
430
+ -0.5,
431
+ -0.5,
432
+ -0.5,
433
+ -0.5,
434
+ -0.5,
435
+ -0.5,
436
+ -0.5,
437
+ -0.5,
438
+ -0.5,
439
+ -0.5,
440
+ -0.5,
441
+ -0.5,
442
+ -0.5,
443
+ -0.5,
444
+ -0.5,
445
+ -0.5,
446
+ -0.5,
447
+ -0.5,
448
+ -0.5,
449
+ -0.5,
450
+ -0.5,
451
+ -0.5,
452
+ -0.5,
453
+ -0.5,
454
+ -0.5,
455
+ -0.5,
456
+ -0.5,
457
+ -0.5,
458
+ -0.5,
459
+ -0.5,
460
+ -0.5,
461
+ -0.5,
462
+ -0.5,
463
+ -0.5,
464
+ -0.5,
465
+ -0.5,
466
+ -0.5,
467
+ -0.5,
468
+ -0.5,
469
+ -0.5,
470
+ -0.5,
471
+ -0.5,
472
+ -0.5,
473
+ -0.5,
474
+ -0.5,
475
+ -0.5,
476
+ -0.5,
477
+ -0.5,
478
+ -0.5,
479
+ -0.5,
480
+ -0.5,
481
+ -0.5,
482
+ -0.5,
483
+ -0.5,
484
+ -0.5,
485
+ -0.5,
486
+ -0.5,
487
+ -0.5,
488
+ -0.5,
489
+ -0.5,
490
+ -0.5,
491
+ -0.5,
492
+ -0.5,
493
+ -0.5,
494
+ -0.5,
495
+ -0.5,
496
+ -0.5,
497
+ -0.5,
498
+ -0.5,
499
+ -0.5,
500
+ -0.5,
501
+ -0.5,
502
+ -0.5,
503
+ -0.5,
504
+ -0.5,
505
+ -0.5,
506
+ -0.5,
507
+ -0.5,
508
+ -0.5,
509
+ -0.5,
510
+ -0.5,
511
+ -0.5,
512
+ -0.5,
513
+ -0.5,
514
+ -0.5,
515
+ -0.5,
516
+ -0.5,
517
+ -0.5,
518
+ -0.5,
519
+ -0.5,
520
+ -0.5,
521
+ -0.5,
522
+ -0.5,
523
+ -0.5,
524
+ -0.5,
525
+ -0.5,
526
+ -0.5,
527
+ -0.5,
528
+ -0.5,
529
+ -0.5,
530
+ -0.5,
531
+ -0.5,
532
+ -0.5,
533
+ -0.5,
534
+ -0.5,
535
+ -0.5,
536
+ -0.5,
537
+ -0.5,
538
+ -0.5,
539
+ -0.5,
540
+ -0.5,
541
+ -0.5,
542
+ -0.5,
543
+ -0.5,
544
+ -0.5,
545
+ -0.5,
546
+ -0.5,
547
+ -0.5,
548
+ -0.5,
549
+ -0.5,
550
+ -0.5,
551
+ -0.5,
552
+ -0.5,
553
+ -0.5,
554
+ -0.5,
555
+ -0.5,
556
+ -0.5,
557
+ -0.5,
558
+ -0.5,
559
+ -0.5,
560
+ -0.5,
561
+ -0.5,
562
+ -0.5,
563
+ -0.5,
564
+ -0.5,
565
+ -0.5,
566
+ -0.5,
567
+ -0.5,
568
+ -0.5,
569
+ -0.5,
570
+ -0.5,
571
+ -0.5,
572
+ -0.5,
573
+ -0.5,
574
+ -0.5,
575
+ -0.5,
576
+ -0.5,
577
+ -0.5,
578
+ -0.5,
579
+ -0.5,
580
+ -0.5,
581
+ -0.5,
582
+ -0.5,
583
+ -0.5,
584
+ -0.5,
585
+ -0.5,
586
+ -0.5,
587
+ -0.5,
588
+ -0.5,
589
+ -0.5,
590
+ -0.5,
591
+ -0.5,
592
+ -0.5,
593
+ -0.5,
594
+ -0.5,
595
+ -0.5,
596
+ -0.5,
597
+ -0.5,
598
+ -0.5,
599
+ -0.5,
600
+ -0.5,
601
+ -0.5,
602
+ -0.5,
603
+ -0.5,
604
+ -0.5,
605
+ -0.5,
606
+ -0.5
607
+ ],
608
+ "total": [
609
+ -0.005632656662614553,
610
+ 0.0035414209169150612,
611
+ 0.0463746758290905,
612
+ 0.01385253375163301,
613
+ 0.031328346306857296,
614
+ 0.040674931128175884,
615
+ 0.06677631295486929,
616
+ -0.011537435319447932,
617
+ 0.03468104478688924,
618
+ -0.0932011224345026,
619
+ -0.08260391708664314,
620
+ -0.09629359556157094,
621
+ -0.11942611493017122,
622
+ -0.03407811083553544,
623
+ -0.04337514867381029,
624
+ 0.018024352369366947,
625
+ -0.02987761725151553,
626
+ 0.012898143220688411,
627
+ -0.04800340958200586,
628
+ -0.022862798335995183,
629
+ 0.026928943824520928,
630
+ 0.03547137655147041,
631
+ 0.05399387670745824,
632
+ 0.018612216480322585,
633
+ 0.044427697974415745,
634
+ 0.043159290798953795,
635
+ 0.027559473749035293,
636
+ 0.02786162539728372,
637
+ 0.05098413474659466,
638
+ 0.045882923332894544,
639
+ 0.04678534265247689,
640
+ 0.018469937962007153,
641
+ -0.007788363229483373,
642
+ -0.05169003397620414,
643
+ -0.04011293246002378,
644
+ 0.03843552193927165,
645
+ -0.018981149289757013,
646
+ -0.05250152555632605,
647
+ -0.039082535169034954,
648
+ -0.04177710969844557,
649
+ 0.033277242206897865,
650
+ -0.015763996773210408,
651
+ -0.045923419814928465,
652
+ -0.02710253048252593,
653
+ -0.019949592982828956,
654
+ -0.006812863090804698,
655
+ -0.007501429924711707,
656
+ -0.007583835519778186,
657
+ -0.008508277866801141,
658
+ -0.05750618185768919,
659
+ 0.012704452148615191,
660
+ 0.0461489388693105,
661
+ 0.036085870341615436,
662
+ -0.023081275261207068,
663
+ 0.04460296853635004,
664
+ 0.03331110871096628,
665
+ 0.04677108434582211,
666
+ 0.0866117158942153,
667
+ 0.04735613681958707,
668
+ 0.02866440264175356,
669
+ 0.0034510024291352186,
670
+ 0.01889380078641028,
671
+ -0.00754865502944687,
672
+ 0.0037064969834886344,
673
+ 0.0023488481950178913,
674
+ -0.001155746987345354,
675
+ 0.013926961959661782,
676
+ 0.058517205705177766,
677
+ -0.03356423260800931,
678
+ 0.014039569095949535,
679
+ 0.03657873184043532,
680
+ 0.02756108034713149,
681
+ 0.07789913175200697,
682
+ -0.05717964064724733,
683
+ 0.04979745162649989,
684
+ 0.04677280890660393,
685
+ -0.012559589230724939,
686
+ -0.014978693045319853,
687
+ 0.08019641550638473,
688
+ 0.04248388862252848,
689
+ -0.01374258722566403,
690
+ 0.015617144591688177,
691
+ 0.10370682955660918,
692
+ 0.07351976337371936,
693
+ 0.05835700970386343,
694
+ 0.12767525705253094,
695
+ 0.08792921374330046,
696
+ 0.1502985076530879,
697
+ 0.13790495264662364,
698
+ 0.049617701509354614,
699
+ 0.09545486082322892,
700
+ 0.13561667222529647,
701
+ 0.15626664100402804,
702
+ 0.2014901048614205,
703
+ 0.06172022660247342,
704
+ 0.15472513072488783,
705
+ 0.11861807659984457,
706
+ 0.1708985193872415,
707
+ 0.23975215573692582,
708
+ 0.1418553493070782,
709
+ 0.10251034552441629,
710
+ 0.21074666040761822,
711
+ 0.12829887908996535,
712
+ 0.19373141601165192,
713
+ 0.19131607801381584,
714
+ 0.21619927348620369,
715
+ 0.1483875755630696,
716
+ 0.2108253595337488,
717
+ 0.18997075157068588,
718
+ 0.23583204035325128,
719
+ 0.12538964530903712,
720
+ 0.22176455869607747,
721
+ 0.25498578672388406,
722
+ 0.2348833734018327,
723
+ 0.25806582245582543,
724
+ 0.256725731254713,
725
+ 0.217688538363558,
726
+ 0.20351460171754027,
727
+ 0.24518375851842128,
728
+ 0.2721126733532626,
729
+ 0.2048736034391988,
730
+ 0.12875025128272513,
731
+ 0.15179871677395568,
732
+ 0.14889109667444517,
733
+ 0.16575265245093296,
734
+ 0.23958638210770317,
735
+ 0.11772093469802442,
736
+ 0.27499241019190734,
737
+ 0.24106245321199898,
738
+ 0.15997635155519643,
739
+ 0.18963817106200198,
740
+ 0.21255246315640697,
741
+ 0.22016386806205945,
742
+ 0.1571498944751054,
743
+ 0.16245840416297436,
744
+ 0.21236841849977267,
745
+ 0.24637880707593643,
746
+ 0.17501403412736427,
747
+ 0.23932606804136433,
748
+ 0.2633004917445247,
749
+ 0.27089338076890623,
750
+ 0.1878873565420508,
751
+ 0.2738354979815073,
752
+ 0.15086166763033024,
753
+ 0.24292799433742218,
754
+ 0.27187625294506645,
755
+ 0.1514255665187168,
756
+ 0.2327668918963592,
757
+ 0.24157992388890587,
758
+ 0.20029361849655403,
759
+ 0.1706021493333163,
760
+ 0.23369528187394348,
761
+ 0.07824582919292578,
762
+ 0.25009549255491953,
763
+ 0.19197482771034816,
764
+ 0.1273628585241226,
765
+ 0.25365611727708337,
766
+ 0.19046527750270448,
767
+ 0.25295570899992886,
768
+ 0.24360827055045886,
769
+ 0.1805676917625157,
770
+ 0.08987712895617675,
771
+ 0.25313033857647255,
772
+ 0.25369405846780374,
773
+ 0.2762586252964453,
774
+ 0.24169864155519138,
775
+ 0.2512274566158596,
776
+ 0.25901052969138366,
777
+ 0.24697675590015272,
778
+ 0.10277350862606237,
779
+ 0.1577947883931887,
780
+ 0.2408820229321641,
781
+ 0.2602915784887538,
782
+ 0.1839677441985179,
783
+ 0.2519962657939911,
784
+ 0.19763827845486265,
785
+ 0.18770314179573322,
786
+ 0.1810778460773638,
787
+ 0.26132975807617365,
788
+ 0.1999569808530261,
789
+ 0.1806366594135761,
790
+ 0.2540627996863101,
791
+ 0.28933708976419703,
792
+ 0.18026429030550906,
793
+ 0.2904810922007262,
794
+ 0.10900117908957782,
795
+ 0.2319824642120985,
796
+ 0.17902790839864524,
797
+ 0.2105263596677992,
798
+ 0.26952767922357546,
799
+ 0.27708851058699985,
800
+ 0.23487814321254327,
801
+ 0.2809620169381768,
802
+ 0.1758095718985027,
803
+ 0.2457133999021494,
804
+ 0.1670511013779171,
805
+ 0.21533313785916613,
806
+ 0.2482741084502737,
807
+ 0.2817973929436147,
808
+ 0.1781025595350114
809
+ ]
810
+ }
training/parse_poker.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Parse IRC poker pdb files and produce labeled bluff examples.
3
+
4
+ Line format: player_name timestamp num_players position preflop flop turn river bankroll won won2 [cards]
5
+ Action codes: f=fold, c=call, r=raise, b=bet, k=check, B=blind, -=no action
6
+ Cards at end of line = player went to showdown.
7
+
8
+ BLUFF = True: preflop has 'r' or 'b', hand ends in fold (last non-dash action ends in 'f'), no cards at end.
9
+ BLUFF = False: cards at end (showdown) OR folded with no aggression.
10
+ """
11
+
12
+ import json
13
+ import os
14
+ import re
15
+ from pathlib import Path
16
+
17
# Dataset locations and parsing limits.
BASE_POKER = Path(__file__).resolve().parent / "data" / "poker"
PDB_DIR = BASE_POKER / "IRCdata" / "holdem" / "199901" / "pdb"
OUT_PATH = BASE_POKER / "bluff_labels.json"
MAX_EXAMPLES = 50_000

# A card token is one rank character (2-9, T, J, Q, K, A) followed by one
# suit character (c/d/h/s), case-insensitive.
CARD_PATTERN = re.compile(r"^[2-9TJKQA][cdhs]$", re.IGNORECASE)


def _is_card_token(s: str) -> bool:
    """Return True if *s* (after stripping) looks like a single card token, e.g. 'Ks'."""
    if not s:
        return False
    return CARD_PATTERN.match(s.strip()) is not None
27
+
28
+
29
def _has_cards_at_end(tokens: list) -> bool:
    """True if the hand line ends with hole-card tokens, i.e. the player showed down."""
    tail = tokens[11:]
    if not tail:
        # Nothing after the 11 fixed fields -> no showdown cards.
        return False
    # Every trailing token must parse as a card (e.g. "Ks Kh").
    return all(_is_card_token(tok) for tok in tail)
36
+
37
+
38
def _last_non_dash_ends_in_f(preflop: str, flop: str, turn: str, river: str) -> bool:
    """Whether the player's final recorded action was a fold.

    Streets are scanned from river back to preflop; the first street with a
    real action string ('-' means no action) decides the result.
    """
    for street in (river, turn, flop, preflop):
        if not street or street == "-":
            continue
        return street.strip().endswith("f")
    return False
44
+
45
+
46
def _preflop_aggressive(preflop: str) -> bool:
    """True if the preflop action string contains a raise ('r') or bet ('b')."""
    actions = preflop or ""
    return any(code in actions for code in "rb")
49
+
50
+
51
def parse_line(line: str) -> dict | None:
    """
    Parse one IRC pdb hand line into a labeled bluff example.

    Returns {"text": str, "is_bluff": bool}, or None when the line is blank
    or has fewer than the 11 fixed fields.
    """
    line = line.strip()
    if not line:
        return None
    tokens = line.split()
    if len(tokens) < 11:
        return None
    # Fixed fields: name ts num_players position preflop flop turn river bankroll won won2.
    # Only the fields used for labeling / text are extracted.
    num_players = tokens[2]
    position = tokens[3]
    preflop, flop, turn, river = tokens[4:8]
    won = tokens[9]
    try:
        pot = abs(int(won))
    except ValueError:
        # Malformed amount: fall back to 0 rather than dropping the hand.
        pot = 0

    has_cards = _has_cards_at_end(tokens)
    ends_in_fold = _last_non_dash_ends_in_f(preflop, flop, turn, river)
    aggressive = _preflop_aggressive(preflop)

    # BLUFF = True: aggressive preflop, hand ended in a fold, no showdown.
    # The same conjunction already labels showdowns (has_cards) and passive
    # folds (not aggressive) as not-bluff, so no extra overrides are needed.
    is_bluff = aggressive and ends_in_fold and not has_cards

    text = (
        f"Position {position} of {num_players}. "
        f"Preflop: {preflop}. Flop: {flop}. Turn: {turn}. River: {river}. Pot: {pot}."
    )
    return {"text": text, "is_bluff": is_bluff}
94
+
95
+
96
def main():
    """Scan pdb files, label each hand line, and write bluff_labels.json."""
    os.makedirs(OUT_PATH.parent, exist_ok=True)

    # IRC data files are named pdb.<player> (e.g. pdb.A2k), not *.pdb.
    def _pdb_files_in(directory):
        return [p for p in directory.iterdir() if p.is_file() and p.name.startswith("pdb.")]

    pdb_files = _pdb_files_in(PDB_DIR) if PDB_DIR.exists() else []
    if not pdb_files:
        # Fall back to the first pdb/ directory with files anywhere under the data root.
        for candidate in BASE_POKER.rglob("pdb"):
            if candidate.is_dir():
                pdb_files.extend(_pdb_files_in(candidate))
                if pdb_files:
                    break
    if not pdb_files:
        print(f"ERROR: No pdb files in {PDB_DIR} or under {BASE_POKER}")
        return

    examples = []
    for pdb_path in pdb_files:
        if len(examples) >= MAX_EXAMPLES:
            break
        try:
            with open(pdb_path, "r", encoding="utf-8", errors="replace") as handle:
                for raw_line in handle:
                    if len(examples) >= MAX_EXAMPLES:
                        break
                    example = parse_line(raw_line)
                    if example is not None:
                        examples.append(example)
        except Exception as e:
            # Best-effort: a single unreadable file should not kill the run.
            print(f"Warning: {pdb_path}: {e}")

    with open(OUT_PATH, "w") as out:
        json.dump(examples, out, indent=0)

    total = len(examples)
    bluffs = sum(1 for example in examples if example["is_bluff"])
    print(f"Total examples: {total}")
    print(f"Class balance: is_bluff=True {bluffs}, is_bluff=False {total - bluffs}")
    print(f"Saved to {OUT_PATH}")
133
+
134
+
135
# Script entry point: parse the IRC data and write the labeled dataset.
if __name__ == "__main__":
    main()
training/plot_phase2.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Plot the Phase 2 GRPO mean-reward curve from a saved trainer_state.json."""
import json

import matplotlib.pyplot as plt

STATE_PATH = "training/checkpoints/phase2_final/checkpoint-200/trainer_state.json"

with open(STATE_PATH) as f:
    state = json.load(f)

# Not every log_history entry carries a "reward" key (e.g. the final
# train-summary entry only has runtime stats); keep only the step logs that do
# so a missing key cannot raise KeyError.
reward_logs = [e for e in state["log_history"] if "reward" in e]
steps = [e["step"] for e in reward_logs]
rewards = [e["reward"] for e in reward_logs]

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(steps, rewards, color="#4C72B0", linewidth=2.5, marker="o", markersize=4)
# Dashed reference lines make the start-to-end improvement visible at a glance.
ax.axhline(y=rewards[0], color="gray", linestyle="--", alpha=0.5, label=f"Start: {rewards[0]:.3f}")
ax.axhline(y=rewards[-1], color="#2ca02c", linestyle="--", alpha=0.5, label=f"End: {rewards[-1]:.3f}")
ax.fill_between(steps, rewards, rewards[0], alpha=0.1, color="#4C72B0")
ax.set_xlabel("Training Step", fontsize=13)
ax.set_ylabel("Mean Reward", fontsize=13)
ax.set_title("ArbitrAgent Phase 2 GRPO Training\nContractor Curriculum (Human Imitation)", fontsize=14)
ax.legend(fontsize=11)
ax.set_ylim(0, 0.5)
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig("training/phase2_reward_curve.png", dpi=150)
print(f"Saved. Reward: {rewards[0]:.3f} → {rewards[-1]:.3f} over {steps[-1]} steps")
training/train_bluff_classifier.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Train DistilBERT binary classifier on IRC poker bluff labels.
3
+
4
+ Data: training/data/poker/bluff_labels.json
5
+ Model: distilbert-base-uncased + linear 768→2
6
+ 80/20 train/val stratified, 3 epochs, lr 2e-5, batch 32
7
+ Saves: training/checkpoints/bluff_classifier.pt, bluff_classifier_tokenizer/
8
+ """
9
+
10
+ import json
11
+ import os
12
+ from pathlib import Path
13
+
14
+ import torch
15
+ import torch.nn as nn
16
+ from sklearn.model_selection import train_test_split
17
+ from torch.utils.data import Dataset, DataLoader
18
+ from transformers import AutoTokenizer, AutoModel
19
+
20
# Filesystem layout and training hyperparameters.
SCRIPT_DIR = Path(__file__).resolve().parent
DATA_PATH = SCRIPT_DIR / "data" / "poker" / "bluff_labels.json"  # output of parse_poker.py
CHECKPOINT_DIR = SCRIPT_DIR / "checkpoints"
MODEL_PT = CHECKPOINT_DIR / "bluff_classifier.pt"  # saved state_dict of BluffClassifier
TOKENIZER_DIR = CHECKPOINT_DIR / "bluff_classifier_tokenizer"
MAX_LENGTH = 128  # tokenizer truncation/padding length
EPOCHS = 3
LR = 2e-5
BATCH_SIZE = 32
29
+
30
+
31
class BluffClassifier(nn.Module):
    """DistilBERT encoder with a linear head mapping hidden size (768) to 2 logits
    (index 0 = not_bluff, index 1 = bluff)."""

    def __init__(self, base_model: str = "distilbert-base-uncased"):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(base_model)
        self.head = nn.Linear(self.encoder.config.hidden_size, 2)

    def forward(self, input_ids, attention_mask=None, **kwargs):
        # Classify from the first-token ([CLS]) hidden state; extra kwargs
        # (e.g. "labels" from a DataLoader batch) are accepted and ignored.
        encoded = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        cls_state = encoded.last_hidden_state[:, 0, :]
        return self.head(cls_state)
44
+
45
+
46
class BluffDataset(Dataset):
    """Tokenize-on-access dataset of (hand-summary text, 0/1 bluff label) pairs."""

    def __init__(self, texts, labels, tokenizer):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoded = self.tokenizer(
            self.texts[idx],
            truncation=True,
            max_length=MAX_LENGTH,
            padding="max_length",
            return_tensors="pt",
        )
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        # The tokenizer returns (1, L) tensors; drop the leading batch dim so
        # the default DataLoader collation can stack items.
        return {
            "input_ids": encoded["input_ids"].squeeze(0),
            "attention_mask": encoded["attention_mask"].squeeze(0),
            "labels": label,
        }
68
+
69
+
70
def main() -> None:
    """Train the bluff classifier end to end and save model + tokenizer.

    Loads labeled examples from DATA_PATH, makes a stratified 80/20
    train/val split, fine-tunes DistilBERT for EPOCHS epochs, reports
    validation accuracy and F1 after each epoch, then writes the model
    state_dict to MODEL_PT and the tokenizer to TOKENIZER_DIR.
    """
    if not DATA_PATH.exists():
        print(f"ERROR: {DATA_PATH} not found. Run training/parse_poker.py first.")
        return
    with open(DATA_PATH) as f:
        data = json.load(f)
    texts = [x["text"] for x in data]
    labels = [1 if x["is_bluff"] else 0 for x in data]  # 1 = bluff, 0 = not bluff

    # Stratified split keeps the bluff/not-bluff ratio identical in both sets.
    X_train, X_val, y_train, y_val = train_test_split(
        texts, labels, test_size=0.2, stratify=labels, random_state=42
    )

    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
    train_ds = BluffDataset(X_train, y_train, tokenizer)
    val_ds = BluffDataset(X_val, y_val, tokenizer)
    train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = BluffClassifier().to(device)
    opt = torch.optim.AdamW(model.parameters(), lr=LR)
    criterion = nn.CrossEntropyLoss()

    os.makedirs(CHECKPOINT_DIR, exist_ok=True)

    for epoch in range(EPOCHS):
        # --- one training pass over the train split ---
        model.train()
        for batch in train_loader:
            opt.zero_grad()
            out = model(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
            )
            loss = criterion(out, batch["labels"].to(device))
            loss.backward()
            opt.step()

        # --- validation: accuracy plus hand-rolled binary F1 (bluff = 1) ---
        model.eval()
        correct, total = 0, 0
        all_pred, all_true = [], []
        with torch.no_grad():
            for batch in val_loader:
                out = model(
                    input_ids=batch["input_ids"].to(device),
                    attention_mask=batch["attention_mask"].to(device),
                )
                pred = out.argmax(dim=1)
                correct += (pred == batch["labels"].to(device)).sum().item()
                total += pred.size(0)
                all_pred.extend(pred.cpu().tolist())
                all_true.extend(batch["labels"].tolist())
        acc = correct / total if total else 0

        # F1 binary: bluff=1
        tp = sum(1 for p, t in zip(all_pred, all_true) if p == 1 and t == 1)
        fp = sum(1 for p, t in zip(all_pred, all_true) if p == 1 and t == 0)
        fn = sum(1 for p, t in zip(all_pred, all_true) if p == 0 and t == 1)
        prec = tp / (tp + fp) if (tp + fp) else 0
        rec = tp / (tp + fn) if (tp + fn) else 0
        f1 = 2 * prec * rec / (prec + rec) if (prec + rec) else 0

        print(f"Epoch {epoch + 1}/{EPOCHS} Val accuracy: {acc:.4f} Val F1: {f1:.4f}")

    # Warn (but still save) when the final epoch's validation accuracy misses target.
    if acc < 0.65:
        print(f"WARNING: Val accuracy {acc:.4f} < 0.65 (target). Consider more data or epochs.")
    torch.save(model.state_dict(), MODEL_PT)
    tokenizer.save_pretrained(TOKENIZER_DIR)
    print(f"Saved model to {MODEL_PT}, tokenizer to {TOKENIZER_DIR}")
139
+
140
+
141
# Script entry point: train and save the bluff classifier.
if __name__ == "__main__":
    main()