agharsallah
feat: Implement early termination for the judge in Twenty Sprouts game and enhance event handling
0460922 | """Competition handlers — turning a judge's ruling into a machine-readable winner. | |
| The arena contract (ADR-0029) says a winner is *data*, not prose: a | |
| ``judge.verdict`` carries ``payload["winner"]`` naming a cast member (or team), so | |
| the leaderboard can attribute the win to a model. Responsibilities split cleanly: | |
| * **The model + the engine handle the live path.** A judge manifest lists | |
| ``winner`` in ``output_extra_fields`` (a well-known typed field, | |
| ``src/core/structured.py``), and :class:`~src.agents.base.ManifestAgent` | |
| validates that name against the injected ``cast_names`` — one corrective re-ask, | |
| then ``no_contest`` if the model still won't name a real player. | |
| * **This handler keeps the OFFLINE demo watchable.** The deterministic stub never | |
| emits a ``winner`` (the field is optional, ADR-0029), so without help an offline | |
| judged run would crown no one. :class:`JudgedCompetition` fills an *empty* winner | |
| by reading the name out of the verdict prose, then falling back to the most active | |
| competitor — deterministic, so offline runs are reproducible. It defers entirely | |
| when the engine already forfeited the round (``no_contest``). | |
| A judge with a known ground truth (The Steeped's :class:`~src.agents.handlers.SpyHost`, | |
| Twenty Sprouts' :class:`~src.agents.twenty_sprouts.SproutJudge`) computes the winner in | |
| code instead β ``SproutJudge`` subclasses this and overrides :meth:`decide_winner`. | |
| The best-practice split the roadmap calls for: AI is load-bearing for judgment, code is | |
| load-bearing for bookkeeping. | |
| """ | |
| from __future__ import annotations | |
| from src.agents.base import ManifestAgent | |
| from src.core.events import Event | |
| from src.core.projections import StageProjection | |
| from src.core.registry import register_handler | |
# Kinds a *competitor* emits — used to find who is eligible to win. A judge emits
# ``judge.verdict`` and is excluded; the scene narrator emits ``world.observed`` (and at
# genesis its actor is the *scenario name*, not a player), so that is excluded too.
# Candidates are thus only the minds that actually spoke, derived from the run's events.
_COMPETITOR_KINDS = frozenset({"agent.spoke", "agent.thought", "oracle.spoke"})
class JudgedCompetition(ManifestAgent):
    """A judge that fills an empty offline ``winner`` so the stub demo still crowns one.

    The generic turn (and the engine's live validation) handle a model-named winner.
    This handler runs *after* that: when the verdict carries no winner — the offline
    stub, which never emits the field — it derives one from the verdict prose, then
    from the most active competitor. It honours a model-named winner that is already a
    real competitor, and defers when the engine forfeited the round (``no_contest``).
    Deterministic, so offline runs are reproducible.
    """

    # NOTE(review): ``register_handler`` is imported by this module but no registration
    # decorator is visible on this class — confirm the handler is still registered.

    def act(
        self,
        run_id: str,
        turn: int,
        projection: StageProjection,
        recent_events: tuple[Event, ...],
    ) -> Event | None:
        """Rule on the competition, stamping ``payload["winner"]`` when one is decided.

        Args:
            run_id: Passed through unchanged to the base ``act``.
            turn: Current turn number; also used to detect the scheduled finale tick.
            projection: Passed through unchanged to the base ``act``.
            recent_events: Events used to detect a prior verdict, find the
                competitors, and pick a winner.

        Returns:
            The verdict event produced by the base ``act`` (with ``winner`` filled in
            and ``no_contest`` cleared when a winner was decided), or ``None`` when
            this judge abstains or the base ``act`` produced no event.
        """
        # A reactive judge (one that subscribes to the events that might end the game)
        # abstains until either its win condition is met or the show reaches its finale —
        # otherwise its first firing would rule on turn 1 and end the show (the "first
        # verdict ends the show" trap). Returns None to abstain (no event, no budget cost).
        if self._abstains(turn, recent_events):
            return None
        event = super().act(run_id, turn, projection, recent_events)
        if event is None:
            # The shared ``act`` contract allows ``None``; with no verdict there is
            # nothing to attribute a winner to, so pass the abstention through.
            return None
        candidates = self._candidates(recent_events)
        if not candidates:
            return event  # nothing to attribute — leave the verdict as prose only
        winner = self.decide_winner(event, candidates, recent_events)
        if winner:
            event.payload["winner"] = winner
            # We crowned someone — clear any forfeit the engine stamped (a ground-truth
            # judge overrides no-contest; a repaired offline winner supersedes the empty).
            event.payload.pop("no_contest", None)
        return event

    # ── early termination (override ``has_early_winner`` to end a game the moment it's won) ──

    def _abstains(self, turn: int, recent_events: tuple[Event, ...]) -> bool:
        """Should this judge decline to rule on *this* invocation?

        The default judge (no subscriptions) never abstains — it fires once, on its
        scheduled tick or a forced curtain call, exactly as before. A *reactive* judge (one
        with ``subscribes_to`` set, woken by every spoken line) instead waits: it rules only
        when the game is already decided (:meth:`has_early_winner`), when its scheduled
        finale tick lands, or when forced (``_forced``, set by ``Conductor.force_verdict``
        for the curtain call / budget end). Either way it never rules twice — once a verdict
        is on the ledger, every later invocation abstains.
        """
        if any(e.kind == "judge.verdict" for e in recent_events):
            return True  # already ruled — never emit a second verdict
        if not self.manifest.subscribes_to:
            return False  # a plain tick/forced judge always rules when invoked
        if getattr(self, "_forced", False):
            return False  # the curtain call must produce a ruling
        tick = self.manifest.schedule.tick_every
        if tick and turn % tick == 0:
            return False  # the scheduled finale — rule now (timeout)
        return not self.has_early_winner(recent_events)  # reactive: rule only once it's won

    def has_early_winner(self, recent_events: tuple[Event, ...]) -> bool:
        """True when the game is already decided and the judge should rule *now*.

        Default ``False``: a generic judged scenario has no code-known early win, so it
        rules only at its finale. A ground-truth scenario overrides this (Twenty Sprouts'
        :class:`~src.agents.twenty_sprouts.SproutJudge` returns True the moment the secret
        word is guessed) so the show ends on the win instead of running to the timeout.
        """
        return False

    # ── decision (override for ground-truth judges) ──────────────────────────────

    def decide_winner(
        self,
        event: Event,
        candidates: list[str],
        recent_events: tuple[Event, ...],
    ) -> str | None:
        """Return the winning cast name (or team label).

        Honours the model's ``winner`` when it already names a real competitor or team;
        otherwise repairs it deterministically (prose mention → most-active fallback).
        Defers when the engine forfeited the round (``no_contest``) — a live model that
        refused to name a real player keeps its forfeit; only the offline empty-winner
        path (no forfeit) is repaired. Ground-truth judges override this and ignore
        ``no_contest`` so their code-decided winner always lands.
        """
        if event.payload.get("no_contest"):
            return None
        named = (event.payload.get("winner") or "").strip()
        if named and (named in candidates or named in self._team_labels()):
            return named
        return self._winner_from_prose(event, candidates) or self._most_active(
            candidates, recent_events
        )

    # ── helpers ──────────────────────────────────────────────────────────────────

    def _team_labels(self) -> set[str]:
        """Valid team labels for this scenario (empty for judged / symmetric-seat duels)."""
        return set((getattr(self.competition, "teams", None) or {}).keys())

    def _candidates(self, recent_events: tuple[Event, ...]) -> list[str]:
        """On-stage competitors: actors who actually spoke, minus this judge.

        Insertion-ordered (first appearance) so the fallback is stable and readable.
        """
        seen: dict[str, None] = {}
        for e in recent_events:
            if e.kind in _COMPETITOR_KINDS and e.actor and e.actor != self.name:
                seen.setdefault(e.actor, None)
        return list(seen)

    @staticmethod
    def _winner_from_prose(event: Event, candidates: list[str]) -> str | None:
        """Find the candidate the verdict text names, by full-slug substring match.

        Matches on the exact cast slug (``debater-a``) — and the hyphen→space variant
        (``debater a``) a live model might write — so it distinguishes symmetric seats
        that share a stem (``debater-a`` vs ``debater-b``) instead of matching the stem.
        When several are named, the earliest-mentioned wins (the one the judge leads
        with). Returns ``None`` when the prose names no competitor.
        """
        text = (event.payload.get("text") or "").lower()
        best: str | None = None
        best_pos = len(text) + 1
        for name in candidates:
            slug = name.lower()
            # Check BOTH spellings for every candidate: stopping at the first spelling
            # that matches could miss an earlier mention written in the other form.
            for needle in (slug, slug.replace("-", " ")):
                pos = text.find(needle)
                if 0 <= pos < best_pos:
                    best, best_pos = name, pos
        return best

    @staticmethod
    def _most_active(candidates: list[str], recent_events: tuple[Event, ...]) -> str:
        """Deterministic fallback: the competitor who spoke most (ties → cast order).

        Raises:
            ValueError: if ``candidates`` is empty (``act`` never calls it that way).
        """
        counts = {name: 0 for name in candidates}
        for e in recent_events:
            if e.actor in counts and e.kind in _COMPETITOR_KINDS:
                counts[e.actor] += 1
        # ``max`` returns the first maximal item it encounters, so iterating the
        # candidates in cast order resolves ties to the earliest-listed competitor.
        return max(candidates, key=counts.__getitem__)