File size: 11,270 Bytes
5424fe6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0d0c561
5424fe6
 
0d0c561
 
5424fe6
 
 
 
 
 
 
 
 
 
 
 
7df0a45
 
 
 
 
 
 
5424fe6
 
 
7df0a45
 
 
 
 
 
 
 
 
 
5424fe6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0d0c561
 
 
ce159dc
 
1c7b5c9
 
 
 
 
 
 
 
 
 
0d0c561
 
 
 
ce159dc
1c7b5c9
 
 
0d0c561
 
ce159dc
0d0c561
 
1c7b5c9
 
 
0d0c561
 
ce159dc
 
 
 
1c7b5c9
0d0c561
1c7b5c9
 
ce159dc
1c7b5c9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0d0c561
 
 
 
5424fe6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ce159dc
1c7b5c9
 
 
 
0d0c561
5424fe6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ce159dc
1c7b5c9
ce159dc
 
1c7b5c9
 
 
 
 
ce159dc
 
1c7b5c9
ce159dc
 
 
 
1c7b5c9
ce159dc
0d0c561
ce159dc
 
0d0c561
5424fe6
0d0c561
5424fe6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
"""Declarative configuration schemas — config as validatable data.

These Pydantic models are the contract for the 'easily configurable' surface.
Every knob — which agents exist, what they emit, which model tier they use, the
scenario goal, the cast that participates, tool grants, budgets — is expressed as
data that:

  * round-trips to/from YAML files under ``config/`` (the on-disk surface), and
  * can equally be produced by a UI form or an LLM and checked with one call.

That last property is the point: ``validate_world`` / ``validate_agent`` /
``validate_scenario`` turn an arbitrary dict into a typed, cross-checked object or
a precise error.  So "let an agent build the configuration from a prompt" reduces
to "emit JSON, validate it, run it."  See ADR-0011.

The agent schema itself is :class:`AgentManifest` (``src/core/manifest.py``) — we
reuse it here rather than duplicating, so the four stable contracts stay singular.
"""

from __future__ import annotations

from typing import Literal

from pydantic import BaseModel, ConfigDict, Field, model_validator

from src.core.manifest import AgentManifest

# ── model profiles ─────────────────────────────────────────────────────────────


class ModelProfileConfig(BaseModel):
    # One concrete model binding: the model identifier, an optional endpoint and
    # key, and the sampling defaults.  ``extra="forbid"`` makes unknown keys a
    # validation error; ``protected_namespaces=()`` switches off pydantic's
    # protected-namespace check on field names.
    model_config = ConfigDict(extra="forbid", protected_namespaces=())

    model: str

    base_url: str | None = None
    """OpenAI-compatible endpoint URL (ends in /v1).  Env-templatable in YAML via
    ``${MODAL_LLM_BASE_URL}`` so the Modal workspace is never hard-coded."""

    api_key: str | None = None
    """Key for the endpoint (env-templatable, e.g. ``${MODAL_LLM_KEY}``).  vLLM
    accepts any token unless the server enforces one."""

    temperature: float = 0.8
    max_tokens: int = 256

    @model_validator(mode="after")
    def _blank_to_none(self) -> "ModelProfileConfig":
        """Collapse empty ``base_url`` / ``api_key`` bindings to ``None``.

        An unset ``${VAR}`` template expands to "" (see registry._expand_env), so
        any falsy binding is normalised back to ``None`` and the live transport
        can omit it.

        Returns:
            The same instance, with blank bindings replaced by ``None``.
        """
        for binding in ("base_url", "api_key"):
            # Assign only when the value is falsy, so populated bindings are
            # never re-written.
            if not getattr(self, binding):
                setattr(self, binding, None)
        return self


class ModelsConfig(BaseModel):
    # Model-layer settings: the offline/live switch plus the concrete binding for
    # each logical profile.  ``extra="forbid"`` makes unknown keys a validation
    # error rather than silently ignoring them.
    model_config = ConfigDict(extra="forbid")

    # Tri-state switch; omitted or null means "auto" (see the note below).
    offline: bool | None = None
    """True = always use the deterministic stub; False = always live; None = auto
    (live when credentials are present, stub otherwise)."""

    # Keyed by profile name.  Nothing in this class restricts the keys to the
    # four names listed below; any string key is accepted.
    profiles: dict[str, ModelProfileConfig] = Field(default_factory=dict)
    """Concrete binding per logical profile (tiny/fast/balanced/strong)."""


# ── budgets ─────────────────────────────────────────────────────────────────────


class GovernorConfig(BaseModel):
    # Budget limits for a run, declared as plain data.  This class only carries
    # the limits and their defaults; whatever enforces them is not in this file.
    # ``extra="forbid"`` makes unknown keys a validation error.
    model_config = ConfigDict(extra="forbid")

    # Defaults apply when a key is omitted.  No range constraints are declared,
    # so zero or negative values pass validation here.
    # NOTE(review): per-field meaning is inferred from the names — confirm
    # against the governor implementation.
    max_turns: int = 100
    max_calls_per_turn: int = 8
    max_total_calls: int = 500
    # The two optional limits default to None — presumably "no cap"; verify
    # against the consumer.
    max_total_tokens: int | None = None
    hourly_budget_usd: float | None = None


# ── competition ──────────────────────────────────────────────────────────────────

class CompetitionConfig(BaseModel):
    """Declares whether — and how — a scenario produces a winner (ADR-0029).

    A scenario can be a ``versus`` contest — between named ``teams`` (asymmetric
    sides, e.g. The Steeped's spy vs herd) or between ``symmetric_seats`` (identical
    seats that differ only by model, e.g. Debate Duel / Beat Battle — the comparison
    that makes the model-leaderboard meaningful) — a ``judged`` pick where the
    judge's verdict *is* the result, or ``none`` (the default) where nobody wins.
    ``winner`` downstream carries either an agent name or a team label, so the team
    labels here must stay distinct from agent names — that cross-cast check lives in
    :meth:`WorldConfig._check_cast_references`, while the rules a competition can
    enforce on its own (team shape, disjointness, seat count) live in the validator
    below.
    """

    model_config = ConfigDict(extra="forbid")

    kind: Literal["versus", "judged", "none"] = "none"
    """How a winner is derived — ``versus`` (team or seat contest), ``judged`` (the
    judge's pick is the answer), or ``none`` (no winner; the default and the absent
    block)."""

    teams: dict[str, list[str]] | None = None
    """Team label → member agent names.  Permitted only when ``kind == 'versus'``."""

    symmetric_seats: list[str] | None = None
    """Cast members occupying *identical* seats that differ only by which model fills
    them — the "which model argues better" comparison.  ``versus`` only; needs ≥2
    entries.  An alternative to ``teams`` (a versus scenario declares one or the other)."""

    @model_validator(mode="after")
    def _check_teams(self) -> "CompetitionConfig":
        """Enforce the rules a competition block can check without seeing the cast.

        Checks run in a fixed order and the first failure wins.

        Returns:
            The validated instance, unchanged.

        Raises:
            ValueError: if ``teams`` or ``symmetric_seats`` is set on a
                non-``versus`` block; if a ``versus`` block declares neither; if
                ``symmetric_seats`` has fewer than two entries; if ``teams`` is an
                empty mapping or contains a team with no members; or if an agent
                is listed under more than one team.
        """
        if self.kind != "versus":
            # Only a versus contest has sides; reject stray side declarations.
            if self.teams is not None:
                raise ValueError(
                    f"competition.teams is only allowed when kind is 'versus' (got kind={self.kind!r})"
                )
            if self.symmetric_seats is not None:
                raise ValueError(
                    f"competition.symmetric_seats is only allowed when kind is 'versus' (got kind={self.kind!r})"
                )
            return self

        # kind == "versus": the contest is described by teams OR symmetric_seats.
        if self.teams is None and self.symmetric_seats is None:
            raise ValueError("competition.kind 'versus' requires either a 'teams' mapping or 'symmetric_seats'")
        if self.symmetric_seats is not None and len(self.symmetric_seats) < 2:
            raise ValueError("competition.symmetric_seats needs ≥2 entries to be a contest")
        if self.teams is not None:
            if not self.teams:
                raise ValueError("competition.kind 'versus' requires a non-empty 'teams' mapping")
            empty = [label for label, members in self.teams.items() if not members]
            if empty:
                raise ValueError(f"competition.teams has empty member lists for teams: {sorted(empty)}")
            # Map each member to the last team it was seen on; a member that turns
            # up under a different label sits on more than one team.  Repeats
            # within a single team are not treated as overlap.
            seen: dict[str, str] = {}
            overlap: set[str] = set()
            for label, members in self.teams.items():
                for member in members:
                    if member in seen and seen[member] != label:
                        overlap.add(member)
                    seen[member] = label
            if overlap:
                raise ValueError(
                    f"competition.teams must be mutually disjoint; agents on more than one team: {sorted(overlap)}"
                )
        return self


# ── scenario ─────────────────────────────────────────────────────────────────────


class ScenarioConfig(BaseModel):
    # One scenario definition.  Only ``name`` and ``default_seed`` are required;
    # every other field has a default.  ``extra="forbid"`` makes unknown keys a
    # validation error.
    model_config = ConfigDict(extra="forbid")

    # Used to identify the scenario in WorldConfig's cross-reference error messages.
    name: str
    title: str = ""
    goal: str = ""
    """The shared objective handed to the whole cast (rendered into every prompt)."""

    # Required, with no default.
    # NOTE(review): what the seed feeds is not visible here — confirm with the runner.
    default_seed: str
    example_seeds: list[str] = Field(default_factory=list)

    # This class does not check the names itself; inside a WorldConfig they are
    # cross-checked against the world's agents by
    # WorldConfig._check_cast_references.
    cast: list[str] = Field(default_factory=list)
    """Agent names that participate, resolved against the agent registry.
    Selecting who participates is just editing this list."""

    genesis_text: str | None = None
    # Presumably a per-scenario override of WorldConfig.governor — TODO confirm;
    # nothing in this file merges the two.
    governor: GovernorConfig | None = None

    competition: CompetitionConfig | None = None
    """Optional winner contract (ADR-0029); absent == ``none`` (no winner).  The
    authoring checklist (``tests/test_scenario_contract.py``) requires an explicit
    block on every *shipped* scenario, but the schema stays permissive so a partial
    world from the Lab/an LLM still validates."""


# ── the whole world ──────────────────────────────────────────────────────────────


class WorldConfig(BaseModel):
    """A complete, self-contained, validatable description of a runnable world.

    Agents, scenarios, model bindings and budgets are all carried inline, so a UI
    or an LLM can emit a single document and ``validate_world`` can confirm it is
    coherent before anything runs — including that every scenario's cast only
    names agents defined in the same world.
    """

    model_config = ConfigDict(extra="forbid")

    models: ModelsConfig = Field(default_factory=ModelsConfig)
    governor: GovernorConfig = Field(default_factory=GovernorConfig)
    agents: list[AgentManifest] = Field(default_factory=list)
    scenarios: list[ScenarioConfig] = Field(default_factory=list)

    @model_validator(mode="after")
    def _check_cast_references(self) -> "WorldConfig":
        """Cross-check every scenario against the agents defined in this world.

        Scenarios are checked in order and the first violation raises.

        Returns:
            The validated instance, unchanged.

        Raises:
            ValueError: if a scenario's cast names an undefined agent, if a
                competition member (team member or symmetric seat) is not in the
                scenario's cast, or if a team label equals a defined agent name.
        """
        known_agents = {agent.name for agent in self.agents}
        for scn in self.scenarios:
            unknown = [member for member in scn.cast if member not in known_agents]
            if unknown:
                raise ValueError(
                    f"scenario {scn.name!r} references undefined agents: {unknown}. "
                    f"Defined agents: {sorted(known_agents)}"
                )

            contest = scn.competition
            if contest is not None:
                roster = set(scn.cast)
                team_map = contest.teams or {}

                # Team members and symmetric seats alike must come from this
                # scenario's cast (ADR-0029 §1).
                participants = set(contest.symmetric_seats or [])
                for team_members in team_map.values():
                    participants.update(team_members)
                strays = sorted(participants - roster)
                if strays:
                    raise ValueError(
                        f"scenario {scn.name!r} competition members not in its cast: {strays}. "
                        f"Cast: {sorted(roster)}"
                    )

                # The winner is an agent name OR a team label; a label that is
                # also an agent name would make it ambiguous (ADR-0029 §1).
                clashes = sorted(set(team_map) & known_agents)
                if clashes:
                    raise ValueError(
                        f"scenario {scn.name!r} competition team labels collide with agent names: {clashes}. "
                        "Team labels must be distinct from agent names to keep the winner unambiguous."
                    )
        return self


# ── validation entrypoints (the 'configure from a prompt' surface) ───────────────


def validate_agent(data: dict) -> AgentManifest:
    """Turn a raw agent definition into a typed :class:`AgentManifest`.

    Intended for untrusted input such as UI form output or an LLM-proposed agent.
    The work is delegated to ``AgentManifest.model_validate``, so any failure
    surfaces as that call's validation error.
    """
    manifest = AgentManifest.model_validate(data)
    return manifest


def validate_scenario(data: dict) -> ScenarioConfig:
    """Turn a raw scenario definition into a typed :class:`ScenarioConfig`.

    Only the scenario's own schema is checked (including any ``competition``
    block).  Whether the cast names defined agents is checked only when the
    scenario is validated as part of a :class:`WorldConfig`.
    """
    scenario = ScenarioConfig.model_validate(data)
    return scenario


def validate_world(data: dict) -> WorldConfig:
    """Turn a raw world document into a typed, cross-checked :class:`WorldConfig`.

    Agents, scenarios, models and budgets are validated together, so the
    cast/competition cross-reference checks on :class:`WorldConfig` run as part
    of this call.
    """
    world = WorldConfig.model_validate(data)
    return world