ayushozha Claude Opus 4.6 committed on
Commit
5f8c92c
·
1 Parent(s): b1d2209

Add MOD 11 typed StepInfo/RewardBreakdown and import completed foundation modules

Browse files

MOD 11: Replace untyped StepResult.info dict with StepInfo model
(extra="allow") and RewardBreakdown model with constrained score
fields. Stub server now explicitly constructs typed objects.

Also imports previously completed work: config.py (MOD 12),
seed.py (SCN 01), normalized scenario layer with three domain
adapters (SCN 02-09), scientist policy parser, and all tests.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

replicalab/agents/__init__.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Agent policy helpers exposed as importable modules."""

# Re-export the scientist policy's public API at the package root so
# callers can import from ``replicalab.agents`` directly.
from .scientist_policy import (
    ScientistOutputParseError,
    build_scientist_system_prompt,
    parse_scientist_output,
)

__all__ = [
    "ScientistOutputParseError",
    "build_scientist_system_prompt",
    "parse_scientist_output",
]
replicalab/agents/scientist_policy.py ADDED
@@ -0,0 +1,290 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Scientist policy helpers.

MOD 09 introduced strict parsing from raw model output into
``ScientistAction``. AGT 01 adds the first domain-neutral system prompt
builder so prompt assembly can be driven by the normalized scenario pack
instead of hard-coded domain text.
"""

from __future__ import annotations

import json
import re
from typing import Any, Literal, Mapping

from pydantic import ValidationError

from replicalab.models import ScientistAction, ScientistActionType
from replicalab.scenarios import NormalizedScenarioPack

# Matches an optionally ``json``-tagged fenced code block and captures its
# body; DOTALL lets the payload span multiple lines, and the non-greedy
# group stops at the first closing fence.
_JSON_FENCE_RE = re.compile(r"```(?:json)?\s*(.*?)```", re.IGNORECASE | re.DOTALL)
21
+
22
+
23
class ScientistOutputParseError(ValueError):
    """Parser failure for Scientist output that is malformed or invalid.

    Carries a machine-readable ``code`` plus enough context (the raw text
    and any partially decoded payload) for callers to log or retry.
    """

    def __init__(
        self,
        code: Literal["no_json", "invalid_json", "invalid_action"],
        message: str,
        raw_text: str,
        *,
        parsed_payload: dict[str, Any] | None = None,
    ) -> None:
        super().__init__(message)
        # Keep every input on the instance so downstream handlers can build
        # a retry prompt or structured log entry without re-parsing.
        self.code = code
        self.message = message
        self.raw_text = raw_text
        self.parsed_payload = parsed_payload

    def to_dict(self) -> dict[str, Any]:
        """Return a stable error shape for callers and future retries."""

        return dict(
            code=self.code,
            message=self.message,
            raw_text=self.raw_text,
            parsed_payload=self.parsed_payload,
        )
49
+
50
+
51
def build_scientist_system_prompt(
    scenario: NormalizedScenarioPack | Mapping[str, Any],
) -> str:
    """Build a domain-neutral Scientist system prompt from normalized data.

    Accepts either a ready ``NormalizedScenarioPack`` or a raw mapping,
    which is validated into one. Returns the full system prompt as one
    string of blank-line-separated sections.
    """

    pack = _coerce_scenario_pack(scenario)
    # Enumerate the enum's values so the prompt always matches the contract.
    allowed_actions = ", ".join(action.value for action in ScientistActionType)

    sections = [
        "You are the Scientist agent in ReplicaLab.",
        (
            "Your job is to negotiate toward the strongest feasible plan under the "
            "provided constraints. You do not invent resources, loosen constraints, "
            "or assume access to hidden ground truth."
        ),
        f"Domain: {pack.domain_id}",
        f"Task: {pack.task_summary}",
        "Success criteria:",
        _render_bullets(pack.success_criteria),
        "Constraints:",
        _render_constraints(pack),
        "Available resources:",
        _render_resources(pack),
        "Allowed substitutions:",
        _render_substitutions(pack),
        (
            "Output contract: return exactly one JSON object with all "
            "ScientistAction fields and no extra keys."
        ),
        f"Allowed action_type values: {allowed_actions}.",
        (
            "Use propose_protocol or revise_protocol only when you can provide a full "
            "protocol payload. Use request_info only when a blocking question remains. "
            "Use accept only when the plan is ready without further edits."
        ),
        (
            "For propose_protocol and revise_protocol, the JSON must include: "
            "sample_size >= 1, controls, technique, duration_days >= 0, "
            "required_equipment, required_reagents, questions = [], and rationale."
        ),
        (
            "For request_info, all protocol fields must stay empty or zero and "
            "questions must contain at least one concrete question."
        ),
        (
            "For accept, questions must be empty and protocol-edit fields must stay "
            "empty or zero."
        ),
    ]

    # Empty rendered sections (e.g. no constraints) are dropped so the
    # prompt never contains stray blank bullets.
    return "\n\n".join(section for section in sections if section)
102
+
103
+
104
def parse_scientist_output(raw_text: str) -> ScientistAction:
    """Parse raw model text into a validated ``ScientistAction``.

    Accepts plain JSON objects, fenced JSON blocks, and prose containing
    one JSON object.
    """

    payload = _parse_json_payload(raw_text)
    try:
        action = ScientistAction.model_validate(payload)
    except ValidationError as exc:
        # Surface schema failures with the decoded payload attached so the
        # caller can inspect exactly what the model produced.
        raise ScientistOutputParseError(
            "invalid_action",
            _format_validation_error(exc),
            raw_text,
            parsed_payload=payload,
        ) from exc
    return action
123
+
124
+
125
def _parse_json_payload(raw_text: str) -> dict[str, Any]:
    """Decode the first usable JSON object from *raw_text*.

    Raises ``ScientistOutputParseError`` with code ``no_json`` when no
    JSON-like text exists, or ``invalid_json`` when JSON-like text exists
    but cannot be decoded to an object.
    """

    if not raw_text.strip():
        raise ScientistOutputParseError(
            "no_json",
            "Scientist output is empty and does not contain a JSON object.",
            raw_text,
        )

    saw_json_like_text = False
    # Keep only the most recent decode error; it is reported if every
    # candidate fails.
    last_json_error: json.JSONDecodeError | None = None

    for candidate in _iter_json_candidates(raw_text):
        saw_json_like_text = True
        try:
            decoded = json.loads(candidate)
        except json.JSONDecodeError as exc:
            last_json_error = exc
            continue

        # A candidate that decodes to a non-object (list, string, number)
        # aborts immediately rather than trying later candidates.
        if not isinstance(decoded, dict):
            raise ScientistOutputParseError(
                "invalid_json",
                "Scientist output must decode to a JSON object.",
                raw_text,
            )
        return decoded

    if saw_json_like_text and last_json_error is not None:
        raise ScientistOutputParseError(
            "invalid_json",
            (
                "Scientist output contains JSON-like text but it could not be decoded: "
                f"{last_json_error.msg} at line {last_json_error.lineno}, "
                f"column {last_json_error.colno}."
            ),
            raw_text,
        ) from last_json_error

    # No candidate was ever produced: the text held no brace or fence.
    raise ScientistOutputParseError(
        "no_json",
        "Scientist output does not contain a JSON object.",
        raw_text,
    )
168
+
169
+
170
def _iter_json_candidates(raw_text: str) -> list[str]:
    """Collect unique, stripped JSON candidate strings in priority order."""

    ordered: list[str] = []
    known: set[str] = set()

    def _push(snippet: str | None) -> None:
        # Normalize and deduplicate while preserving first-seen order.
        if snippet is None:
            return
        trimmed = snippet.strip()
        if trimmed and trimmed not in known:
            known.add(trimmed)
            ordered.append(trimmed)

    # The whole message is a candidate only when it starts like JSON or a
    # code fence; the first balanced object is always tried as well.
    head = raw_text.strip()
    if head.startswith(("{", "```")):
        _push(raw_text)
    _push(_extract_first_json_object(raw_text))

    for fence_match in _JSON_FENCE_RE.finditer(raw_text):
        block = fence_match.group(1)
        _push(block)
        _push(_extract_first_json_object(block))

    return ordered
194
+
195
+
196
+ def _extract_first_json_object(text: str) -> str | None:
197
+ start = text.find("{")
198
+ if start < 0:
199
+ return None
200
+
201
+ depth = 0
202
+ in_string = False
203
+ escaped = False
204
+
205
+ for index in range(start, len(text)):
206
+ char = text[index]
207
+
208
+ if in_string:
209
+ if escaped:
210
+ escaped = False
211
+ elif char == "\\":
212
+ escaped = True
213
+ elif char == '"':
214
+ in_string = False
215
+ continue
216
+
217
+ if char == '"':
218
+ in_string = True
219
+ elif char == "{":
220
+ depth += 1
221
+ elif char == "}":
222
+ depth -= 1
223
+ if depth == 0:
224
+ return text[start : index + 1]
225
+
226
+ return None
227
+
228
+
229
+ def _format_validation_error(error: ValidationError) -> str:
230
+ parts: list[str] = []
231
+ for item in error.errors():
232
+ path = ".".join(str(segment) for segment in item.get("loc", ()))
233
+ message = item.get("msg", "Validation error")
234
+ parts.append(f"{path}: {message}" if path else message)
235
+
236
+ detail = "; ".join(parts) if parts else str(error)
237
+ return f"Scientist output JSON failed ScientistAction validation: {detail}"
238
+
239
+
240
def _coerce_scenario_pack(
    scenario: NormalizedScenarioPack | Mapping[str, Any],
) -> NormalizedScenarioPack:
    """Accept a ready pack or raw mapping data and return a typed pack."""

    if not isinstance(scenario, NormalizedScenarioPack):
        # Raw dict-like payloads are validated into the typed model.
        return NormalizedScenarioPack.model_validate(scenario)
    return scenario
246
+
247
+
248
+ def _render_bullets(items: list[str]) -> str:
249
+ return "\n".join(f"- {item}" for item in items)
250
+
251
+
252
+ def _render_constraints(pack: NormalizedScenarioPack) -> str:
253
+ lines = []
254
+ for constraint in pack.constraints:
255
+ amount = ""
256
+ if constraint.quantity is not None:
257
+ unit = f" {constraint.unit}" if constraint.unit else ""
258
+ amount = f" ({constraint.comparator} {constraint.quantity}{unit})"
259
+ hardness = "hard" if constraint.hard else "soft"
260
+ lines.append(f"- [{hardness}] {constraint.label}{amount}: {constraint.details}")
261
+ return "\n".join(lines)
262
+
263
+
264
+ def _render_resources(pack: NormalizedScenarioPack) -> str:
265
+ lines = []
266
+ for resource in pack.resources:
267
+ availability = "available" if resource.available else "unavailable"
268
+ amount = ""
269
+ if resource.quantity is not None:
270
+ unit = f" {resource.unit}" if resource.unit else ""
271
+ amount = f" ({resource.quantity}{unit})"
272
+ lines.append(
273
+ f"- [{availability}] {resource.label}{amount}: {resource.details}"
274
+ )
275
+ return "\n".join(lines)
276
+
277
+
278
+ def _render_substitutions(pack: NormalizedScenarioPack) -> str:
279
+ if not pack.allowed_substitutions:
280
+ return "- No substitutions are pre-approved."
281
+
282
+ lines = []
283
+ for substitution in pack.allowed_substitutions:
284
+ lines.append(
285
+ (
286
+ f"- {substitution.original} -> {substitution.alternative}. "
287
+ f"Condition: {substitution.condition} Tradeoff: {substitution.tradeoff}"
288
+ )
289
+ )
290
+ return "\n".join(lines)
replicalab/config.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Shared configuration constants for ReplicaLab.

MOD 12 centralizes the small set of repo-wide defaults that were previously
scattered across the stub server and scenario builders. Future environment,
scoring, and client modules should import from here instead of introducing
new magic numbers.
"""

from __future__ import annotations

# Scenario generation defaults.
DEFAULT_SCENARIO_TEMPLATE = "math_reasoning"
DEFAULT_DIFFICULTY = "easy"

# Episode limits shared by scenario builders and the environment loop.
MAX_ROUNDS = 6
MAX_BUDGET = 5000.0

# Wall-clock limits, in seconds.
TIMEOUT_SECONDS = 300
ROUND_TIME_LIMIT_SECONDS = 300

# Session and websocket lifetimes deliberately track the global timeout so
# a single knob controls all idle expirations.
SESSION_TTL_SECONDS = TIMEOUT_SECONDS
WS_IDLE_TIMEOUT_SECONDS = TIMEOUT_SECONDS

# Fixed reward the stub server grants when a plan is accepted.
STUB_ACCEPT_REWARD = 5.0

# Network binding for the API server.
API_HOST = "0.0.0.0"
API_PORT = 7860
replicalab/models.py CHANGED
@@ -307,18 +307,50 @@ class Observation(BaseModel):
307
  lab_manager: Optional[LabManagerObservation]
308
 
309
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
310
  # ---------------------------------------------------------------------------
311
  # Step result
312
  # ---------------------------------------------------------------------------
313
 
314
  class StepResult(BaseModel):
315
  """Returned by env.step(). Contains the next observation, reward,
316
- termination flag, and optional info dict."""
317
 
318
  observation: Optional[Observation] = None
319
  reward: float = 0.0
320
  done: bool = False
321
- info: dict = Field(default_factory=dict)
322
 
323
 
324
  # ---------------------------------------------------------------------------
 
307
  lab_manager: Optional[LabManagerObservation]
308
 
309
 
310
+ # ---------------------------------------------------------------------------
311
+ # Reward breakdown and step metadata
312
+ # ---------------------------------------------------------------------------
313
+
314
+
315
class RewardBreakdown(BaseModel):
    """Component scores and adjustments produced by the judge rubric engine."""

    # Core rubric scores, each constrained to the closed interval [0, 1].
    rigor: float = Field(default=0.0, ge=0, le=1)
    feasibility: float = Field(default=0.0, ge=0, le=1)
    fidelity: float = Field(default=0.0, ge=0, le=1)
    # Unbounded additive adjustments applied on top of the core scores.
    efficiency_bonus: float = 0.0
    communication_bonus: float = 0.0
    # Named deductions; presumably keyed by penalty reason — confirm
    # against the judge rubric engine.
    penalties: dict[str, float] = Field(default_factory=dict)
324
+
325
+
326
class StepInfo(BaseModel):
    """Typed metadata returned alongside each step result.

    Reserved keys from the frozen contract are typed fields.
    Additional debug or runtime metadata is allowed via extra="allow".
    """

    # Unknown keys are retained rather than rejected, so debug metadata
    # survives round-trips through this model.
    model_config = ConfigDict(extra="allow")

    agreement_reached: bool = False
    # Human-readable error description; None when the step succeeded.
    error: Optional[str] = None
    # Judge outputs; populated only when a judged evaluation occurred —
    # TODO confirm against the environment that constructs StepInfo.
    reward_breakdown: Optional[RewardBreakdown] = None
    judge_notes: Optional[str] = None
    verdict: Optional[str] = None
340
+
341
+
342
  # ---------------------------------------------------------------------------
343
  # Step result
344
  # ---------------------------------------------------------------------------
345
 
346
class StepResult(BaseModel):
    """Returned by env.step(). Contains the next observation, reward,
    termination flag, and typed step info."""

    observation: Optional[Observation] = None
    reward: float = 0.0
    done: bool = False
    # Always a StepInfo instance; defaults to an empty one so callers can
    # read reserved keys without None checks.
    info: StepInfo = Field(default_factory=StepInfo)
354
 
355
 
356
  # ---------------------------------------------------------------------------
replicalab/scenarios/__init__.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Scenario generation exports."""

# Re-export the template layer's public names at the package root so
# callers can import from ``replicalab.scenarios`` directly.
from .templates import (
    GOLDEN_SCENARIO_SPECS_PATH,
    HiddenReferenceSpec,
    NormalizedScenarioPack,
    ScenarioConstraint,
    ScenarioResource,
    available_scenario_families,
    apply_difficulty,
    generate_scenario,
    load_template,
)

__all__ = [
    "GOLDEN_SCENARIO_SPECS_PATH",
    "HiddenReferenceSpec",
    "NormalizedScenarioPack",
    "ScenarioConstraint",
    "ScenarioResource",
    "available_scenario_families",
    "apply_difficulty",
    "generate_scenario",
    "load_template",
]
replicalab/scenarios/finance_trading.py ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Finance and trading planning scenario templates."""

from __future__ import annotations

import random
from typing import Any

from replicalab.config import MAX_ROUNDS


def build_finance_trading_template(rng: random.Random) -> dict[str, Any]:
    """Return one randomly chosen finance/trading scenario template.

    Each case is a hand-authored raw template dict (constraints, resources,
    substitutions, budget, and round limits) consumed by the normalized
    scenario layer. *rng* supplies the selection so callers control
    reproducibility via seeding.
    """

    cases = [
        {
            "domain_id": "finance_trading",
            "paper_title": "Planning an offline mean-reversion backtest for SPY and QQQ",
            "paper_hypothesis": "A simple mean-reversion design can be evaluated fairly without live execution.",
            "paper_method": "Run an offline daily-bar backtest with transaction costs, slippage assumptions, and fixed entry rules.",
            "paper_key_finding": "The plan is accepted only if risk limits and evaluation hygiene remain explicit.",
            "task_summary": "Design a mean-reversion backtest workflow for SPY and QQQ under capital, drawdown, and deadline limits.",
            "success_criteria": [
                "Use only offline historical data with explicit slippage assumptions.",
                "Keep position sizing inside the stated capital and drawdown rules.",
                "Separate strategy design from final evaluation.",
            ],
            "reference_summary": "A valid plan keeps the workflow offline, constrains drawdown, and documents slippage assumptions.",
            "required_elements": [
                "offline historical data only",
                "transaction cost assumption",
                "drawdown guardrail",
                "final evaluation split",
            ],
            "flexible_elements": [
                "lookback window",
                "entry threshold",
                "report visualization format",
            ],
            "target_metric": "risk_adjusted_return",
            "target_value": "positive Sharpe with drawdown inside the guardrail",
            "constraints": [
                {
                    "key": "max_capital",
                    "label": "Maximum simulated capital",
                    "quantity": 50000,
                    "unit": "usd",
                    "comparator": "<=",
                    "hard": True,
                    "details": "The simulation must stay within the stated capital cap.",
                },
                {
                    "key": "max_drawdown",
                    "label": "Maximum allowed drawdown",
                    "quantity": 8,
                    "unit": "percent",
                    "comparator": "<=",
                    "hard": True,
                    "details": "Any accepted plan must respect the drawdown guardrail.",
                },
                {
                    "key": "live_execution",
                    "label": "Execution mode",
                    "quantity": None,
                    "unit": None,
                    "comparator": "=",
                    "hard": True,
                    "details": "Only offline or backtest planning is allowed. No live trading.",
                },
            ],
            "resources": [
                {
                    "key": "historical_bars",
                    "label": "Historical daily bar dataset",
                    "quantity": 1,
                    "unit": "dataset",
                    "available": True,
                    "category": "data",
                    "details": "Contains adjusted SPY and QQQ bars with metadata.",
                },
                {
                    "key": "backtest_engine",
                    "label": "Backtest engine",
                    "quantity": 1,
                    "unit": "engine",
                    "available": True,
                    "category": "tool",
                    "details": "Supports offline simulation with transaction costs and slippage.",
                },
                {
                    "key": "risk_reviewer",
                    "label": "Risk reviewer",
                    "quantity": 1,
                    "unit": "reviewer",
                    "available": True,
                    "category": "personnel",
                    "details": "Reviews risk assumptions and evaluation hygiene.",
                },
            ],
            "allowed_substitutions": [
                {
                    "original": "daily bars",
                    "alternative": "hourly bars aggregated to daily decisions",
                    "condition": "Use if the daily dataset is delayed or incomplete.",
                    "tradeoff": "The plan must justify any slippage-model change.",
                },
                {
                    "original": "risk reviewer",
                    "alternative": "pre-committed risk checklist",
                    "condition": "Use if the reviewer is unavailable.",
                    "tradeoff": "The plan must include explicit drawdown checks.",
                },
            ],
            "budget_total": 950.0,
            "staff_count": 1,
            "time_limit_days": 3,
            "max_rounds": MAX_ROUNDS,
        },
        {
            "domain_id": "finance_trading",
            "paper_title": "Planning an offline momentum backtest for liquid futures",
            "paper_hypothesis": "A disciplined momentum design can be evaluated offline with strict liquidity and cost assumptions.",
            "paper_method": "Run a futures momentum backtest with predefined roll logic, cost model, and walk-forward evaluation.",
            "paper_key_finding": "The plan is accepted only if walk-forward evaluation and liquidity constraints are explicit.",
            "task_summary": "Design an offline momentum futures backtest under liquidity, slippage, and review constraints.",
            "success_criteria": [
                "Use only offline walk-forward evaluation.",
                "Model roll handling and transaction costs explicitly.",
                "Keep liquidity and concentration rules visible in the final plan.",
            ],
            "reference_summary": "A valid plan models roll logic, transaction costs, and walk-forward evaluation with liquidity limits.",
            "required_elements": [
                "walk-forward evaluation",
                "roll logic",
                "transaction cost assumption",
                "liquidity limit",
            ],
            "flexible_elements": [
                "lookback horizon",
                "rebalance frequency",
                "reporting template",
            ],
            "target_metric": "risk_adjusted_return",
            "target_value": "positive out-of-sample Sharpe with liquidity-compliant trades",
            "constraints": [
                {
                    "key": "max_markets",
                    "label": "Maximum simultaneous markets",
                    "quantity": 4,
                    "unit": "markets",
                    "comparator": "<=",
                    "hard": False,
                    "details": "Keep the design narrow enough to review in one session.",
                },
                {
                    "key": "max_drawdown",
                    "label": "Maximum allowed drawdown",
                    "quantity": 10,
                    "unit": "percent",
                    "comparator": "<=",
                    "hard": True,
                    "details": "The plan must remain inside the drawdown guardrail.",
                },
                {
                    "key": "live_execution",
                    "label": "Execution mode",
                    "quantity": None,
                    "unit": None,
                    "comparator": "=",
                    "hard": True,
                    "details": "Only offline design and backtesting are allowed.",
                },
            ],
            "resources": [
                {
                    "key": "futures_dataset",
                    "label": "Historical futures dataset",
                    "quantity": 1,
                    "unit": "dataset",
                    "available": True,
                    "category": "data",
                    "details": "Includes roll metadata and contract-level liquidity fields.",
                },
                {
                    "key": "backtest_engine",
                    "label": "Walk-forward backtest engine",
                    "quantity": 1,
                    "unit": "engine",
                    "available": True,
                    "category": "tool",
                    "details": "Supports walk-forward slicing and execution-cost modeling.",
                },
                {
                    "key": "risk_reviewer",
                    "label": "Risk reviewer",
                    "quantity": 1,
                    "unit": "reviewer",
                    "available": True,
                    "category": "personnel",
                    "details": "Checks liquidity and concentration assumptions.",
                },
            ],
            "allowed_substitutions": [
                {
                    "original": "contract-level backtest",
                    "alternative": "continuous-series backtest with explicit caveat",
                    "condition": "Use if contract roll metadata is incomplete.",
                    "tradeoff": "The plan must document the fidelity loss clearly.",
                }
            ],
            "budget_total": 1100.0,
            "staff_count": 1,
            "time_limit_days": 4,
            "max_rounds": MAX_ROUNDS,
        },
    ]
    # Uniform choice over the hand-authored cases using the caller's RNG.
    return rng.choice(cases)
replicalab/scenarios/math_reasoning.py ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Mathematics scenario templates."""

from __future__ import annotations

import random
from typing import Any

from replicalab.config import MAX_ROUNDS


def build_math_reasoning_template(rng: random.Random) -> dict[str, Any]:
    """Return one randomly chosen mathematics scenario template.

    Each case is a hand-authored raw template dict (constraints, resources,
    substitutions, budget, and round limits) consumed by the normalized
    scenario layer. *rng* supplies the selection so callers control
    reproducibility via seeding.
    """

    cases = [
        {
            "domain_id": "mathematics",
            "paper_title": "Planning a proof of the Cauchy-Schwarz inequality",
            "paper_hypothesis": "A square-expansion argument gives the cleanest proof path.",
            "paper_method": "Outline the proof using one algebraic identity, one equality-case check, and reviewer notes.",
            "paper_key_finding": "The proof is accepted only if every inequality step and equality case is justified.",
            "task_summary": "Produce a proof-planning workflow for the Cauchy-Schwarz inequality for an undergraduate seminar handout.",
            "success_criteria": [
                "Every inequality step is justified in plain language.",
                "The equality case is checked explicitly.",
                "The final plan fits within the review and deadline constraints.",
            ],
            "reference_summary": "A valid plan uses a square-expansion route, checks equality, and includes one verification pass.",
            "required_elements": [
                "explicit target inequality",
                "square-expansion or inner-product setup",
                "equality-case check",
                "final verification pass",
            ],
            "flexible_elements": [
                "notation style",
                "ordering of supporting lemmas",
                "proof-sketch granularity",
            ],
            "target_metric": "proof_validity",
            "target_value": "all required justification steps are present",
            "constraints": [
                {
                    "key": "deadline_days",
                    "label": "Proof planning deadline",
                    "quantity": 3,
                    "unit": "days",
                    "comparator": "<=",
                    "hard": True,
                    "details": "The seminar notes must be ready within three days.",
                },
                {
                    "key": "review_passes",
                    "label": "Required review passes",
                    "quantity": 1,
                    "unit": "pass",
                    "comparator": ">=",
                    "hard": True,
                    "details": "At least one verification pass is required before acceptance.",
                },
                {
                    "key": "max_pages",
                    "label": "Maximum proof outline length",
                    "quantity": 2,
                    "unit": "pages",
                    "comparator": "<=",
                    "hard": False,
                    "details": "The outline should stay concise enough for seminar notes.",
                },
            ],
            "resources": [
                {
                    "key": "proof_notebook",
                    "label": "Structured proof notebook",
                    "quantity": 1,
                    "unit": "workspace",
                    "available": True,
                    "category": "tool",
                    "details": "A shared note workspace for the outline and checks.",
                },
                {
                    "key": "theorem_library",
                    "label": "Reference theorem library",
                    "quantity": 1,
                    "unit": "library",
                    "available": True,
                    "category": "reference",
                    "details": "Contains previous inequality proofs and notation conventions.",
                },
                {
                    "key": "reviewer",
                    "label": "Graduate reviewer",
                    "quantity": 1,
                    "unit": "reviewer",
                    "available": True,
                    "category": "personnel",
                    "details": "A reviewer can check one draft before the deadline.",
                },
            ],
            "allowed_substitutions": [
                {
                    "original": "graduate reviewer",
                    "alternative": "self-check rubric",
                    "condition": "Use only if the reviewer is unavailable.",
                    "tradeoff": "Requires a stricter written checklist inside the plan.",
                },
                {
                    "original": "full derivation",
                    "alternative": "proof sketch with explicit checkpoints",
                    "condition": "Use when page budget is tight.",
                    "tradeoff": "The plan must still spell out all justification steps.",
                },
            ],
            "budget_total": 300.0,
            "staff_count": 1,
            "time_limit_days": 3,
            "max_rounds": MAX_ROUNDS,
        },
        {
            "domain_id": "mathematics",
            "paper_title": "Planning a proof of Jensen's inequality for convex quadratics",
            "paper_hypothesis": "A convexity-first outline is shorter than an expectation-expansion route.",
            "paper_method": "Use the convexity definition, midpoint intuition, and one numerical sanity check.",
            "paper_key_finding": "The plan succeeds only if the convexity assumption and averaging step are both explicit.",
            "task_summary": "Produce a proof-planning workflow for Jensen's inequality on convex quadratics for a revision session.",
            "success_criteria": [
                "The convexity assumption is named before the main argument.",
                "Averaging and expectation steps are justified.",
                "The plan includes at least one sanity check example.",
            ],
            "reference_summary": "A valid plan states convexity early, justifies averaging, and uses one sanity check.",
            "required_elements": [
                "convexity assumption",
                "averaging step",
                "sanity check example",
                "closing statement tied to the task objective",
            ],
            "flexible_elements": [
                "example choice",
                "notation style",
                "proof sketch ordering",
            ],
            "target_metric": "proof_validity",
            "target_value": "convexity and averaging are justified with one sanity check",
            "constraints": [
                {
                    "key": "deadline_days",
                    "label": "Proof planning deadline",
                    "quantity": 2,
                    "unit": "days",
                    "comparator": "<=",
                    "hard": True,
                    "details": "The revision notes are due within two days.",
                },
                {
                    "key": "review_passes",
                    "label": "Required review passes",
                    "quantity": 1,
                    "unit": "pass",
                    "comparator": ">=",
                    "hard": True,
                    "details": "The plan needs at least one self-check or peer review.",
                },
                {
                    "key": "max_pages",
                    "label": "Maximum proof outline length",
                    "quantity": 1,
                    "unit": "page",
                    "comparator": "<=",
                    "hard": False,
                    "details": "The final outline should fit on one page.",
                },
            ],
            "resources": [
                {
                    "key": "whiteboard",
                    "label": "Whiteboard workspace",
                    "quantity": 1,
                    "unit": "workspace",
                    "available": True,
                    "category": "tool",
                    "details": "Used to sketch the proof structure and sanity check.",
                },
                {
                    "key": "reference_notes",
                    "label": "Reference lecture notes",
                    "quantity": 1,
                    "unit": "packet",
                    "available": True,
                    "category": "reference",
                    "details": "Contains the convexity definition and worked examples.",
                },
                {
                    "key": "peer_reviewer",
                    "label": "Peer reviewer",
                    "quantity": 1,
                    "unit": "reviewer",
                    "available": True,
                    "category": "personnel",
                    "details": "Available for one short review pass.",
                },
            ],
            "allowed_substitutions": [
                {
                    "original": "peer reviewer",
                    "alternative": "checklist-driven self-review",
                    "condition": "Use if the peer reviewer is unavailable.",
                    "tradeoff": "The final plan must include explicit verification checkpoints.",
                }
            ],
            "budget_total": 220.0,
            "staff_count": 1,
            "time_limit_days": 2,
            "max_rounds": MAX_ROUNDS,
        },
    ]
    # Uniform choice over the hand-authored cases using the caller's RNG.
    return rng.choice(cases)
replicalab/scenarios/ml_benchmark.py ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Machine learning benchmark scenario templates."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import random
6
+ from typing import Any
7
+
8
+ from replicalab.config import MAX_ROUNDS
9
+
10
+
11
def build_ml_benchmark_template(rng: random.Random) -> dict[str, Any]:
    """Return one randomly selected ML-benchmark scenario draft.

    Each entry of ``cases`` is a raw draft dict that the normalized
    scenario layer later validates and converts into a
    ``NormalizedScenarioPack``. ``rng`` is the deterministically seeded
    RNG supplied by the scenario generator, so for a fixed seed the
    same case is always chosen.
    """
    cases = [
        {
            "domain_id": "machine_learning",
            "paper_title": "Reproducing an AG News TinyBERT baseline",
            "paper_hypothesis": "A distilled model can match the published accuracy within the stated compute budget.",
            "paper_method": "Fine-tune TinyBERT on AG News with the published split, tokenizer, and evaluation script.",
            "paper_key_finding": "The baseline is accepted only if the held-out accuracy is within one point of the target.",
            "task_summary": "Plan an ML benchmark replication for AG News classification with strict GPU and deadline limits.",
            "success_criteria": [
                "Use the published train-validation-test split.",
                "Report held-out accuracy with the same metric definition as the paper.",
                "Fit the full plan within the available GPU budget and time window.",
            ],
            # Fields below feed the hidden grading reference, not the agents.
            "reference_summary": "A valid plan keeps the published split and evaluation metric while staying inside the compute budget.",
            "required_elements": [
                "published data split",
                "matching tokenizer family",
                "held-out accuracy evaluation",
                "run logging",
            ],
            "flexible_elements": [
                "batch size",
                "learning-rate schedule",
                "checkpoint cadence",
            ],
            "target_metric": "held_out_accuracy",
            "target_value": "within one point of the reported AG News baseline",
            "constraints": [
                {
                    "key": "gpu_hours",
                    "label": "Maximum GPU budget",
                    "quantity": 8,
                    "unit": "gpu_hours",
                    "comparator": "<=",
                    "hard": True,
                    "details": "The full run must fit within eight GPU-hours.",
                },
                {
                    "key": "deadline_days",
                    "label": "Replication deadline",
                    "quantity": 4,
                    "unit": "days",
                    "comparator": "<=",
                    "hard": True,
                    "details": "The benchmark must be reproduced within four days.",
                },
                {
                    # Qualitative policy rule: no numeric bound, so
                    # quantity/unit stay None with comparator "=".
                    "key": "evaluation_policy",
                    "label": "Evaluation policy",
                    "quantity": None,
                    "unit": None,
                    "comparator": "=",
                    "hard": True,
                    "details": "Use only the held-out split; no test-set peeking.",
                },
            ],
            "resources": [
                {
                    "key": "gpu_node",
                    "label": "A100 GPU node",
                    "quantity": 1,
                    "unit": "node",
                    "available": True,
                    "category": "compute",
                    "details": "Reserved for one benchmark run at a time.",
                },
                {
                    "key": "dataset_mirror",
                    "label": "AG News dataset mirror",
                    "quantity": 1,
                    "unit": "mirror",
                    "available": True,
                    "category": "data",
                    "details": "Local mirror with the published split manifest.",
                },
                {
                    "key": "tracking_tool",
                    "label": "Experiment tracking workspace",
                    "quantity": 1,
                    "unit": "workspace",
                    "available": True,
                    "category": "tool",
                    "details": "Captures configs, metrics, and artifacts.",
                },
            ],
            "allowed_substitutions": [
                {
                    "original": "full training schedule",
                    "alternative": "shorter schedule with early stopping",
                    "condition": "Use when the GPU budget is tight.",
                    "tradeoff": "The plan must justify why the metric remains trustworthy.",
                },
                {
                    "original": "large batch size",
                    "alternative": "smaller batch size with accumulation",
                    "condition": "Use when the node has limited memory.",
                    "tradeoff": "Training takes longer and must still fit the deadline.",
                },
            ],
            "budget_total": 1800.0,
            "staff_count": 2,
            "time_limit_days": 4,
            # Shared cap from config; validated against MAX_ROUNDS downstream.
            "max_rounds": MAX_ROUNDS,
        },
        {
            "domain_id": "machine_learning",
            "paper_title": "Reproducing a CIFAR-10 ResNet-18 baseline",
            "paper_hypothesis": "The reported top-1 accuracy is reachable with the stated data pipeline and a smaller tuning budget.",
            "paper_method": "Train ResNet-18 on CIFAR-10 with the published augmentation recipe and evaluation checkpoint.",
            "paper_key_finding": "The baseline is accepted only if the final accuracy and training recipe are reproducible.",
            "task_summary": "Plan a CIFAR-10 benchmark replication with limited compute, strict evaluation rules, and one reviewer pass.",
            "success_criteria": [
                "Use the published augmentation recipe or justify a compatible substitution.",
                "Keep evaluation isolated from any tuning loop.",
                "Log all seeds, configs, and final metrics for reproducibility.",
            ],
            "reference_summary": "A valid plan preserves the published augmentation and evaluation rules while logging every run.",
            "required_elements": [
                "published augmentation recipe",
                "fixed evaluation checkpoint",
                "seed logging",
                "final metric report",
            ],
            "flexible_elements": [
                "optimizer implementation",
                "checkpoint interval",
                "data-loader worker count",
            ],
            "target_metric": "top1_accuracy",
            "target_value": "within one point of the CIFAR-10 baseline",
            "constraints": [
                {
                    "key": "gpu_hours",
                    "label": "Maximum GPU budget",
                    "quantity": 10,
                    "unit": "gpu_hours",
                    "comparator": "<=",
                    "hard": True,
                    "details": "The benchmark must fit within ten GPU-hours.",
                },
                {
                    "key": "deadline_days",
                    "label": "Replication deadline",
                    "quantity": 5,
                    "unit": "days",
                    "comparator": "<=",
                    "hard": True,
                    "details": "The plan must finish inside the review window.",
                },
                {
                    # Soft (hard=False) constraint: advisory review pass.
                    "key": "review_passes",
                    "label": "Required review passes",
                    "quantity": 1,
                    "unit": "pass",
                    "comparator": ">=",
                    "hard": False,
                    "details": "A teammate should review the config before launch.",
                },
            ],
            "resources": [
                {
                    "key": "gpu_node",
                    "label": "L40S GPU node",
                    "quantity": 1,
                    "unit": "node",
                    "available": True,
                    "category": "compute",
                    "details": "Shared node with moderate queue pressure.",
                },
                {
                    "key": "dataset_archive",
                    "label": "CIFAR-10 dataset archive",
                    "quantity": 1,
                    "unit": "archive",
                    "available": True,
                    "category": "data",
                    "details": "Local archive with checksum verification.",
                },
                {
                    "key": "reviewer",
                    "label": "Benchmark reviewer",
                    "quantity": 1,
                    "unit": "reviewer",
                    "available": True,
                    "category": "personnel",
                    "details": "Can review the config once before training.",
                },
            ],
            "allowed_substitutions": [
                {
                    "original": "full epoch schedule",
                    "alternative": "reduced epoch schedule with checkpoint comparison",
                    "condition": "Use if queue time or GPU budget becomes tight.",
                    "tradeoff": "Needs a clear explanation for any metric gap.",
                }
            ],
            "budget_total": 2100.0,
            "staff_count": 2,
            "time_limit_days": 5,
            "max_rounds": MAX_ROUNDS,
        },
    ]
    # Deterministic pick given the seeded rng from the generator.
    return rng.choice(cases)
replicalab/scenarios/templates.py ADDED
@@ -0,0 +1,299 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Normalized scenario generation and mapping helpers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import copy
6
+ from pathlib import Path
7
+ from typing import Any, Callable, Literal
8
+
9
+ from pydantic import BaseModel, ConfigDict
10
+
11
+ from replicalab.config import MAX_BUDGET, MAX_ROUNDS
12
+ from replicalab.models import LabManagerObservation, ScientistObservation
13
+ from replicalab.scenarios.finance_trading import build_finance_trading_template
14
+ from replicalab.scenarios.math_reasoning import build_math_reasoning_template
15
+ from replicalab.scenarios.ml_benchmark import build_ml_benchmark_template
16
+ from replicalab.utils.seed import seed_rng
17
+
18
# Difficulty tiers supported by every scenario family.
Difficulty = Literal["easy", "medium", "hard"]
# Names of the three registered domain template families.
TemplateName = Literal["math_reasoning", "ml_benchmark", "finance_trading"]

# Checked-in golden scenario specs used by the test suite. parents[2]
# climbs replicalab/scenarios/ up to the repository root.
GOLDEN_SCENARIO_SPECS_PATH = (
    Path(__file__).resolve().parents[2] / "tests" / "fixtures" / "golden_scenarios.json"
)
24
+
25
+
26
class ScenarioConstraint(BaseModel):
    """One scenario constraint (budget cap, deadline, policy rule, ...).

    Purely qualitative rules leave ``quantity``/``unit`` as ``None``
    and use the default ``"="`` comparator.
    """

    model_config = ConfigDict(extra="forbid", str_strip_whitespace=True)

    key: str  # stable machine-readable identifier (e.g. "gpu_hours")
    label: str  # short human-readable name
    quantity: float | int | None = None  # numeric bound; None for qualitative rules
    unit: str | None = None  # unit for `quantity`, when applicable
    comparator: Literal["<=", ">=", "="] = "="  # how `quantity` bounds the plan
    hard: bool = True  # False marks advisory/soft constraints (template data uses "should")
    details: str  # full-sentence description surfaced to agents
36
+
37
+
38
class ScenarioResource(BaseModel):
    """A lab resource (equipment, data, personnel, ...) in a scenario."""

    model_config = ConfigDict(extra="forbid", str_strip_whitespace=True)

    key: str  # stable machine identifier (e.g. "gpu_node")
    label: str  # human-readable name shown in observations
    quantity: float | int | None = None  # amount on hand, when countable
    unit: str | None = None  # unit for `quantity`, when applicable
    available: bool = True  # flipped to False by difficulty scaling (_tighten_one_resource)
    category: str  # observed values: "tool", "compute", "data", "reference", "personnel"
    details: str  # full-sentence description
48
+
49
+
50
class AllowedSubstitution(BaseModel):
    """A sanctioned swap of one planned element for an alternative."""

    model_config = ConfigDict(extra="forbid", str_strip_whitespace=True)

    original: str  # the element being replaced
    alternative: str  # the permitted replacement
    condition: str  # when the substitution may be used
    tradeoff: str  # cost/obligation the plan must absorb when substituting
57
+
58
+
59
class HiddenReferenceSpec(BaseModel):
    """Grading reference for a scenario.

    Not included in either role observation, so it is presumably meant
    to stay hidden from the acting agents — confirm with the grader
    integration.
    """

    model_config = ConfigDict(extra="forbid", str_strip_whitespace=True)

    summary: str  # one-line description of a valid plan
    required_elements: list[str]  # must appear in an acceptable plan
    flexible_elements: list[str]  # may be varied without penalty
    target_metric: str  # metric the replication is judged on
    target_value: str  # acceptance band, stated in prose
67
+
68
+
69
class NormalizedScenarioPack(BaseModel):
    """Fully validated scenario produced by :func:`generate_scenario`.

    Bundles shared metadata, the constraint/resource lists, the hidden
    grading reference, and the initial (round 0) observations for both
    roles.
    """

    model_config = ConfigDict(extra="forbid", str_strip_whitespace=True)

    scenario_id: str  # formatted "<template>-<difficulty>-<seed>"
    template: TemplateName
    domain_id: str  # e.g. "machine_learning"
    difficulty: Difficulty
    seed: int  # base seed the pack was generated from
    task_summary: str
    success_criteria: list[str]
    constraints: list[ScenarioConstraint]
    resources: list[ScenarioResource]
    allowed_substitutions: list[AllowedSubstitution]
    hidden_reference_spec: HiddenReferenceSpec  # grader-only; not copied into observations
    scientist_observation: ScientistObservation  # initial round-0 view for the scientist
    lab_manager_observation: LabManagerObservation  # initial round-0 view for the lab manager
85
+
86
+
87
# Signature shared by all domain template builders: take a seeded RNG,
# return a raw scenario draft dict (validated later in _build_pack).
TemplateBuilder = Callable[[Any], dict[str, Any]]

# Registry of template name -> builder. Insertion order defines the
# order reported by available_scenario_families().
_TEMPLATE_BUILDERS: dict[TemplateName, TemplateBuilder] = {
    "math_reasoning": build_math_reasoning_template,
    "ml_benchmark": build_ml_benchmark_template,
    "finance_trading": build_finance_trading_template,
}
94
+
95
+
96
def available_scenario_families() -> list[dict[str, Any]]:
    """List every registered scenario family with its difficulty tiers."""
    tiers = ["easy", "medium", "hard"]
    families: list[dict[str, Any]] = []
    for family_name in _TEMPLATE_BUILDERS:
        families.append({"family": family_name, "difficulties": list(tiers)})
    return families
101
+
102
+
103
def load_template(template: TemplateName) -> TemplateBuilder:
    """Look up the builder for *template*; raise ValueError if unregistered."""
    try:
        builder = _TEMPLATE_BUILDERS[template]
    except KeyError as err:
        # Preserve the KeyError as the cause for easier debugging.
        raise ValueError(f"Unknown scenario template: {template}") from err
    return builder
108
+
109
+
110
def apply_difficulty(
    draft: dict[str, Any],
    difficulty: Difficulty,
    rng: Any,
) -> dict[str, Any]:
    """Return a deep copy of *draft* scaled for *difficulty*.

    Scaling rules (visible below):
      - easy:   +15 % budget, nothing else changes.
      - medium: -5 % budget, one fewer day, one resource tightened,
                one extra conflict constraint.
      - hard:   -20 % budget, one fewer day, one fewer staff member,
                two resources tightened, two extra conflict constraints.

    NOTE(review): the hard branch is a fall-through — any value other
    than "easy"/"medium" takes the hard path. Typing narrows this to
    the Difficulty literal, but an untyped string caller would silently
    get hard scaling; confirm that is intended. The RNG draws inside
    _tighten_one_resource are order-sensitive, so the statement order
    here is part of the deterministic-seed contract.
    """
    # Work on a deep copy so nested resource/constraint dicts in the
    # caller's draft are never mutated.
    scaled = copy.deepcopy(draft)
    scaled["difficulty"] = difficulty

    if difficulty == "easy":
        scaled["budget_total"] = round(float(draft["budget_total"]) * 1.15, 2)
        return scaled

    if difficulty == "medium":
        scaled["budget_total"] = round(float(draft["budget_total"]) * 0.95, 2)
        scaled["time_limit_days"] = max(1, int(draft["time_limit_days"]) - 1)
        _tighten_one_resource(scaled["resources"], rng)
        _append_conflict_constraint(
            scaled["constraints"],
            "One resource is partially constrained, so the plan must justify a fallback path.",
        )
        return scaled

    # Hard (fall-through) path: tightest budget and two tightened resources.
    scaled["budget_total"] = round(float(draft["budget_total"]) * 0.8, 2)
    scaled["time_limit_days"] = max(1, int(draft["time_limit_days"]) - 1)
    scaled["staff_count"] = max(1, int(draft["staff_count"]) - 1)
    _tighten_one_resource(scaled["resources"], rng)
    _tighten_one_resource(scaled["resources"], rng)
    _append_conflict_constraint(
        scaled["constraints"],
        "At least one primary resource is unavailable, so the plan must use an allowed substitution or reduced scope.",
    )
    _append_conflict_constraint(
        scaled["constraints"],
        "The final plan must remain concise because review capacity is limited under hard mode.",
    )
    return scaled
146
+
147
+
148
def generate_scenario(
    seed: int,
    template: TemplateName,
    difficulty: Difficulty,
) -> NormalizedScenarioPack:
    """Build the normalized scenario pack for (seed, template, difficulty)."""
    # Namespace the RNG per template so families draw independent streams.
    scenario_rng = seed_rng(seed, namespace=f"scenario:{template}")
    builder = load_template(template)
    draft = apply_difficulty(builder(scenario_rng), difficulty, scenario_rng)
    return _build_pack(seed=seed, template=template, draft=draft)
157
+
158
+
159
def _build_pack(seed: int, template: TemplateName, draft: dict[str, Any]) -> NormalizedScenarioPack:
    """Validate a scaled draft dict and assemble the final scenario pack.

    Raises ValueError when the draft exceeds the shared MAX_BUDGET or
    MAX_ROUNDS caps from replicalab.config.
    """
    # Validate raw draft entries into typed models up front so bad
    # template data fails loudly here rather than deep in the pack.
    constraints = [ScenarioConstraint.model_validate(item) for item in draft["constraints"]]
    resources = [ScenarioResource.model_validate(item) for item in draft["resources"]]
    substitutions = [
        AllowedSubstitution.model_validate(item)
        for item in draft["allowed_substitutions"]
    ]

    time_limit_days = int(draft["time_limit_days"])
    budget_total = float(draft["budget_total"])
    staff_count = int(draft["staff_count"])
    max_rounds = int(draft["max_rounds"])

    # Enforce the shared environment-wide caps.
    if budget_total > MAX_BUDGET:
        raise ValueError(
            f"Scenario budget {budget_total} exceeds configured MAX_BUDGET={MAX_BUDGET}."
        )
    if max_rounds > MAX_ROUNDS:
        raise ValueError(
            f"Scenario max_rounds {max_rounds} exceeds configured MAX_ROUNDS={MAX_ROUNDS}."
        )

    # Map domain-neutral resource categories onto the lab-manager
    # observation's equipment/reagent vocabulary.
    equipment_available, equipment_booked = _split_resources(
        resources,
        include_categories={"tool", "compute"},
    )
    reagents_in_stock, reagents_out_of_stock = _split_resources(
        resources,
        include_categories={"reference", "data", "personnel"},
    )

    # NOTE(review): soft constraints (hard=False) are surfaced as safety
    # restrictions alongside the two named policy keys — confirm that
    # treating every soft constraint as a "safety" rule is intended.
    safety_restrictions = [
        constraint.details
        for constraint in constraints
        if not constraint.hard or constraint.key in {"live_execution", "evaluation_policy"}
    ]
    if not safety_restrictions:
        safety_restrictions = ["No policy exceptions are allowed."]

    # Round-0 view for the scientist role.
    scientist_observation = ScientistObservation(
        paper_title=draft["paper_title"],
        paper_hypothesis=draft["paper_hypothesis"],
        paper_method=draft["paper_method"],
        paper_key_finding=draft["paper_key_finding"],
        experiment_goal=draft["task_summary"],
        conversation_history=[],
        current_protocol=None,
        round_number=0,
        max_rounds=max_rounds,
    )

    # Round-0 view for the lab-manager role; budget starts untouched.
    lab_manager_observation = LabManagerObservation(
        budget_total=budget_total,
        budget_remaining=budget_total,
        equipment_available=equipment_available,
        equipment_booked=equipment_booked,
        reagents_in_stock=reagents_in_stock,
        reagents_out_of_stock=reagents_out_of_stock,
        staff_count=staff_count,
        time_limit_days=time_limit_days,
        safety_restrictions=safety_restrictions,
        conversation_history=[],
        current_protocol=None,
        round_number=0,
        max_rounds=max_rounds,
    )

    # Grader-only reference spec; kept out of both observations.
    hidden_reference = HiddenReferenceSpec(
        summary=draft["reference_summary"],
        required_elements=list(draft["required_elements"]),
        flexible_elements=list(draft["flexible_elements"]),
        target_metric=draft["target_metric"],
        target_value=draft["target_value"],
    )

    return NormalizedScenarioPack(
        scenario_id=f"{template}-{draft['difficulty']}-{seed}",
        template=template,
        domain_id=draft["domain_id"],
        difficulty=draft["difficulty"],
        seed=seed,
        task_summary=draft["task_summary"],
        success_criteria=list(draft["success_criteria"]),
        constraints=constraints,
        resources=resources,
        allowed_substitutions=substitutions,
        hidden_reference_spec=hidden_reference,
        scientist_observation=scientist_observation,
        lab_manager_observation=lab_manager_observation,
    )
249
+
250
+
251
+ def _split_resources(
252
+ resources: list[ScenarioResource],
253
+ *,
254
+ include_categories: set[str],
255
+ ) -> tuple[list[str], list[str]]:
256
+ available: list[str] = []
257
+ unavailable: list[str] = []
258
+
259
+ for resource in resources:
260
+ if resource.category not in include_categories:
261
+ continue
262
+ target = available if resource.available else unavailable
263
+ target.append(resource.label)
264
+
265
+ return available, unavailable
266
+
267
+
268
+ def _tighten_one_resource(resources: list[dict[str, Any]], rng: Any) -> None:
269
+ available_indices = [
270
+ index
271
+ for index, resource in enumerate(resources)
272
+ if resource.get("available", True)
273
+ ]
274
+ if not available_indices:
275
+ return
276
+
277
+ chosen_index = rng.choice(available_indices)
278
+ chosen = resources[chosen_index]
279
+ chosen["available"] = False
280
+ chosen["details"] = (
281
+ f"{chosen['details']} Availability is constrained under the current difficulty."
282
+ )
283
+
284
+
285
+ def _append_conflict_constraint(
286
+ constraints: list[dict[str, Any]],
287
+ details: str,
288
+ ) -> None:
289
+ constraints.append(
290
+ {
291
+ "key": f"conflict_{len(constraints) + 1}",
292
+ "label": "Difficulty-induced conflict",
293
+ "quantity": None,
294
+ "unit": None,
295
+ "comparator": "=",
296
+ "hard": True,
297
+ "details": details,
298
+ }
299
+ )
replicalab/utils/seed.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Deterministic seeding helpers shared by scenarios and the environment."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import hashlib
6
+ import random
7
+
8
+
9
def get_deterministic_seed(seed: int, namespace: str = "") -> int:
    """Derive a stable 64-bit child seed from a base seed plus namespace."""
    # SHA-256 of "seed:namespace"; the first 8 digest bytes become an
    # unsigned big-endian integer.
    token = f"{seed}:{namespace}".encode("utf-8")
    head = hashlib.sha256(token).digest()[:8]
    return int.from_bytes(head, byteorder="big", signed=False)
15
+
16
+
17
def seed_rng(seed: int, namespace: str = "") -> random.Random:
    """Return a fresh RNG seeded deterministically from (seed, namespace)."""
    child_seed = get_deterministic_seed(seed, namespace)
    return random.Random(child_seed)
server/app.py CHANGED
@@ -31,13 +31,25 @@ from fastapi import FastAPI, HTTPException, WebSocket, WebSocketDisconnect
31
  from fastapi.middleware.cors import CORSMiddleware
32
  from pydantic import BaseModel
33
 
 
 
 
 
 
 
 
 
 
 
34
  from replicalab.models import (
35
  EpisodeLog,
36
  EpisodeState,
37
  LabManagerObservation,
38
  Observation,
 
39
  ScientistAction,
40
  ScientistObservation,
 
41
  StepResult,
42
  )
43
 
@@ -65,18 +77,18 @@ except ImportError:
65
  log.warning("ReplicaLabEnv not found — using _StubEnv (replace when Person A ships env)")
66
 
67
 
68
- def _reward_breakdown_from_state(state: EpisodeState) -> dict[str, Any]:
69
- return {
70
- "rigor": state.rigor_score,
71
- "feasibility": state.feasibility_score,
72
- "fidelity": state.fidelity_score,
73
- "efficiency_bonus": 0.0,
74
- "communication_bonus": 0.0,
75
- "penalties": {
76
  "invalid_action": 0.0,
77
  "timeout": 0.0,
78
  },
79
- }
80
 
81
 
82
  def _build_episode_log(episode_id: str, state: EpisodeState) -> EpisodeLog:
@@ -113,29 +125,33 @@ class _StubEnv:
113
  def reset(
114
  self,
115
  seed: int = 0,
116
- scenario: str = "cell_biology",
117
- difficulty: str = "easy",
118
  ) -> Observation:
119
  self._episode_id = str(uuid.uuid4())
120
  self._logs = []
 
121
  self._state = EpisodeState(
122
  seed=seed,
123
  scenario_template=scenario,
124
  difficulty=difficulty,
125
- paper_title="[stub] Effect of compound X on cell proliferation",
126
  paper_hypothesis="Compound X inhibits cell growth at 10 µM",
127
- paper_method="MTT assay, 96-well plate, 72 h incubation",
128
  paper_key_finding="IC50 = 8.3 µM",
129
- experiment_goal="Replicate IC50 measurement within 20 % margin",
130
- lab_budget_total=5000.0,
131
- lab_budget_remaining=5000.0,
132
- lab_equipment=["96-well plate reader", "incubator", "pipettes"],
133
  lab_reagents=["MTT reagent", "DMSO", "cell culture media"],
134
- lab_staff_count=2,
135
- lab_time_limit_days=14,
136
- max_rounds=6,
137
  round_number=0,
138
  )
 
 
 
139
  self._state.conversation_history = list(self._logs)
140
  log.info("Stub reset | episode=%s seed=%d scenario=%s", self._episode_id, seed, scenario)
141
  return self._make_observation()
@@ -150,7 +166,7 @@ class _StubEnv:
150
  action.action_type == "accept"
151
  or self._state.round_number >= self._state.max_rounds
152
  )
153
- reward = 5.0 if done and action.action_type == "accept" else 0.0
154
  if done:
155
  self._state.done = True
156
  self._state.agreement_reached = action.action_type == "accept"
@@ -163,11 +179,16 @@ class _StubEnv:
163
  observation=self._make_observation(),
164
  reward=reward,
165
  done=done,
166
- info={
167
- "round": self._state.round_number,
168
- "stub": True,
169
- "episode_id": self._episode_id,
170
- },
 
 
 
 
 
171
  )
172
 
173
  def state(self) -> EpisodeState:
@@ -264,7 +285,7 @@ def _make_env() -> "_StubEnv":
264
  # In-memory session store (REST sessions)
265
  # ---------------------------------------------------------------------------
266
 
267
- _SESSION_TTL_SECONDS = 300 # 5 minutes idle before cleanup
268
 
269
  _sessions: dict[str, dict[str, Any]] = {}
270
  # { session_id: { "env": env_instance, "last_active": float, "episode_id": str } }
@@ -340,11 +361,7 @@ app.add_middleware(
340
  # Available scenarios constant
341
  # ---------------------------------------------------------------------------
342
 
343
- SCENARIOS = [
344
- {"family": "cell_biology", "difficulties": ["easy", "medium", "hard"]},
345
- {"family": "ml_benchmark", "difficulties": ["easy", "medium", "hard"]},
346
- {"family": "behavioral_psych", "difficulties": ["easy", "medium", "hard"]},
347
- ]
348
 
349
  # ---------------------------------------------------------------------------
350
  # REST request/response schemas
@@ -353,8 +370,8 @@ SCENARIOS = [
353
 
354
  class ResetRequest(BaseModel):
355
  seed: int = 0
356
- scenario: str = "cell_biology"
357
- difficulty: str = "easy"
358
  session_id: Optional[str] = None # pass to reuse an existing session slot
359
 
360
 
@@ -451,7 +468,7 @@ async def get_replay(episode_id: str):
451
 
452
  # WebSocket message protocol:
453
  # Client → Server:
454
- # { "type": "reset", "seed": 42, "scenario": "cell_biology", "difficulty": "easy" }
455
  # { "type": "step", "action": { ...ScientistAction fields... } }
456
  # { "type": "ping" }
457
  #
@@ -461,14 +478,14 @@ async def get_replay(episode_id: str):
461
  # { "type": "pong" }
462
  # { "type": "error", "message": "..." }
463
 
464
- _WS_IDLE_TIMEOUT = 300 # seconds before server closes an idle WebSocket
465
 
466
 
467
  async def _ws_send(ws: WebSocket, payload: dict) -> None:
468
  await ws.send_text(json.dumps(payload))
469
 
470
 
471
- def main(host: str = "0.0.0.0", port: int = 7860) -> None:
472
  import uvicorn
473
 
474
  uvicorn.run("server.app:app", host=host, port=port, reload=False)
@@ -503,8 +520,8 @@ async def websocket_endpoint(ws: WebSocket):
503
 
504
  elif msg_type == "reset":
505
  seed = int(msg.get("seed", 0))
506
- scenario = str(msg.get("scenario", "cell_biology"))
507
- difficulty = str(msg.get("difficulty", "easy"))
508
 
509
  try:
510
  obs = env.reset(seed=seed, scenario=scenario, difficulty=difficulty)
@@ -584,10 +601,10 @@ if __name__ == "__main__":
584
  import argparse
585
 
586
  parser = argparse.ArgumentParser()
587
- parser.add_argument("--port", type=int, default=7860)
588
- parser.add_argument("--host", default="0.0.0.0")
589
  args = parser.parse_args()
590
- if args.host == "0.0.0.0" and args.port == 7860:
591
  main()
592
  else:
593
  main(host=args.host, port=args.port)
 
31
  from fastapi.middleware.cors import CORSMiddleware
32
  from pydantic import BaseModel
33
 
34
+ from replicalab.config import (
35
+ API_HOST,
36
+ API_PORT,
37
+ DEFAULT_DIFFICULTY,
38
+ DEFAULT_SCENARIO_TEMPLATE,
39
+ SESSION_TTL_SECONDS,
40
+ STUB_ACCEPT_REWARD,
41
+ WS_IDLE_TIMEOUT_SECONDS,
42
+ )
43
+ from replicalab.scenarios import available_scenario_families, generate_scenario
44
  from replicalab.models import (
45
  EpisodeLog,
46
  EpisodeState,
47
  LabManagerObservation,
48
  Observation,
49
+ RewardBreakdown,
50
  ScientistAction,
51
  ScientistObservation,
52
+ StepInfo,
53
  StepResult,
54
  )
55
 
 
77
  log.warning("ReplicaLabEnv not found — using _StubEnv (replace when Person A ships env)")
78
 
79
 
80
def _reward_breakdown_from_state(state: EpisodeState) -> RewardBreakdown:
    """Build a typed RewardBreakdown from the episode's current scores.

    Bonuses and penalties are hard-coded to 0.0 for now; the stub env
    does not award them (judge integration pending per the stub notes).
    """
    return RewardBreakdown(
        rigor=state.rigor_score,
        feasibility=state.feasibility_score,
        fidelity=state.fidelity_score,
        efficiency_bonus=0.0,
        communication_bonus=0.0,
        penalties={
            "invalid_action": 0.0,
            "timeout": 0.0,
        },
    )
92
 
93
 
94
  def _build_episode_log(episode_id: str, state: EpisodeState) -> EpisodeLog:
 
125
  def reset(
126
  self,
127
  seed: int = 0,
128
+ scenario: str = DEFAULT_SCENARIO_TEMPLATE,
129
+ difficulty: str = DEFAULT_DIFFICULTY,
130
  ) -> Observation:
131
  self._episode_id = str(uuid.uuid4())
132
  self._logs = []
133
+ pack = generate_scenario(seed=seed, template=scenario, difficulty=difficulty)
134
  self._state = EpisodeState(
135
  seed=seed,
136
  scenario_template=scenario,
137
  difficulty=difficulty,
138
+ paper_title=pack.scientist_observation.paper_title,
139
  paper_hypothesis="Compound X inhibits cell growth at 10 µM",
140
+ paper_method=pack.scientist_observation.paper_method,
141
  paper_key_finding="IC50 = 8.3 µM",
142
+ experiment_goal=pack.scientist_observation.experiment_goal,
143
+ lab_budget_total=pack.lab_manager_observation.budget_total,
144
+ lab_budget_remaining=pack.lab_manager_observation.budget_remaining,
145
+ lab_equipment=list(pack.lab_manager_observation.equipment_available),
146
  lab_reagents=["MTT reagent", "DMSO", "cell culture media"],
147
+ lab_staff_count=pack.lab_manager_observation.staff_count,
148
+ lab_time_limit_days=pack.lab_manager_observation.time_limit_days,
149
+ max_rounds=pack.scientist_observation.max_rounds,
150
  round_number=0,
151
  )
152
+ self._state.paper_hypothesis = pack.scientist_observation.paper_hypothesis
153
+ self._state.paper_key_finding = pack.scientist_observation.paper_key_finding
154
+ self._state.lab_reagents = list(pack.lab_manager_observation.reagents_in_stock)
155
  self._state.conversation_history = list(self._logs)
156
  log.info("Stub reset | episode=%s seed=%d scenario=%s", self._episode_id, seed, scenario)
157
  return self._make_observation()
 
166
  action.action_type == "accept"
167
  or self._state.round_number >= self._state.max_rounds
168
  )
169
+ reward = STUB_ACCEPT_REWARD if done and action.action_type == "accept" else 0.0
170
  if done:
171
  self._state.done = True
172
  self._state.agreement_reached = action.action_type == "accept"
 
179
  observation=self._make_observation(),
180
  reward=reward,
181
  done=done,
182
+ info=StepInfo(
183
+ agreement_reached=self._state.agreement_reached,
184
+ error=None,
185
+ reward_breakdown=_reward_breakdown_from_state(self._state) if done else None,
186
+ judge_notes="Stub audit until judge integration lands." if done else None,
187
+ verdict=("accept" if self._state.agreement_reached else "revise") if done else None,
188
+ round=self._state.round_number,
189
+ stub=True,
190
+ episode_id=self._episode_id,
191
+ ),
192
  )
193
 
194
  def state(self) -> EpisodeState:
 
285
  # In-memory session store (REST sessions)
286
  # ---------------------------------------------------------------------------
287
 
288
+ _SESSION_TTL_SECONDS = SESSION_TTL_SECONDS
289
 
290
  _sessions: dict[str, dict[str, Any]] = {}
291
  # { session_id: { "env": env_instance, "last_active": float, "episode_id": str } }
 
361
  # Available scenarios constant
362
  # ---------------------------------------------------------------------------
363
 
364
+ SCENARIOS = available_scenario_families()
 
 
 
 
365
 
366
  # ---------------------------------------------------------------------------
367
  # REST request/response schemas
 
370
 
371
  class ResetRequest(BaseModel):
372
  seed: int = 0
373
+ scenario: str = DEFAULT_SCENARIO_TEMPLATE
374
+ difficulty: str = DEFAULT_DIFFICULTY
375
  session_id: Optional[str] = None # pass to reuse an existing session slot
376
 
377
 
 
468
 
469
  # WebSocket message protocol:
470
  # Client → Server:
471
+ # { "type": "reset", "seed": 42, "scenario": DEFAULT_SCENARIO_TEMPLATE, "difficulty": DEFAULT_DIFFICULTY }
472
  # { "type": "step", "action": { ...ScientistAction fields... } }
473
  # { "type": "ping" }
474
  #
 
478
  # { "type": "pong" }
479
  # { "type": "error", "message": "..." }
480
 
481
+ _WS_IDLE_TIMEOUT = WS_IDLE_TIMEOUT_SECONDS
482
 
483
 
484
async def _ws_send(ws: WebSocket, payload: dict) -> None:
    """JSON-encode *payload* and send it as a text frame on *ws*."""
    await ws.send_text(json.dumps(payload))
486
 
487
 
488
def main(host: str = API_HOST, port: int = API_PORT) -> None:
    """Run the FastAPI app under uvicorn on the configured host/port."""
    # Local import keeps uvicorn out of module import time.
    import uvicorn

    uvicorn.run("server.app:app", host=host, port=port, reload=False)
 
520
 
521
  elif msg_type == "reset":
522
  seed = int(msg.get("seed", 0))
523
+ scenario = str(msg.get("scenario", DEFAULT_SCENARIO_TEMPLATE))
524
+ difficulty = str(msg.get("difficulty", DEFAULT_DIFFICULTY))
525
 
526
  try:
527
  obs = env.reset(seed=seed, scenario=scenario, difficulty=difficulty)
 
601
  import argparse
602
 
603
  parser = argparse.ArgumentParser()
604
+ parser.add_argument("--port", type=int, default=API_PORT)
605
+ parser.add_argument("--host", default=API_HOST)
606
  args = parser.parse_args()
607
+ if args.host == API_HOST and args.port == API_PORT:
608
  main()
609
  else:
610
  main(host=args.host, port=args.port)
tests/fixtures/golden_scenarios.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "id": "golden_math_easy",
4
+ "template": "math_reasoning",
5
+ "difficulty": "easy",
6
+ "seed": 101,
7
+ "expected_domain_id": "mathematics",
8
+ "expected_title_contains": "Jensen"
9
+ },
10
+ {
11
+ "id": "golden_ml_medium",
12
+ "template": "ml_benchmark",
13
+ "difficulty": "medium",
14
+ "seed": 202,
15
+ "expected_domain_id": "machine_learning",
16
+ "expected_title_contains": "CIFAR-10"
17
+ },
18
+ {
19
+ "id": "golden_finance_hard",
20
+ "template": "finance_trading",
21
+ "difficulty": "hard",
22
+ "seed": 303,
23
+ "expected_domain_id": "finance_trading",
24
+ "expected_title_contains": "momentum"
25
+ }
26
+ ]
tests/test_config.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from replicalab.config import (
4
+ DEFAULT_DIFFICULTY,
5
+ DEFAULT_SCENARIO_TEMPLATE,
6
+ MAX_BUDGET,
7
+ MAX_ROUNDS,
8
+ SESSION_TTL_SECONDS,
9
+ WS_IDLE_TIMEOUT_SECONDS,
10
+ )
11
+ from replicalab.scenarios import generate_scenario
12
+ from server.app import ResetRequest
13
+
14
+
15
+ def test_reset_request_defaults_match_shared_config() -> None:
16
+ request = ResetRequest()
17
+
18
+ assert request.scenario == DEFAULT_SCENARIO_TEMPLATE
19
+ assert request.difficulty == DEFAULT_DIFFICULTY
20
+
21
+
22
+ def test_generated_scenarios_respect_shared_round_and_budget_caps() -> None:
23
+ for template in ("math_reasoning", "ml_benchmark", "finance_trading"):
24
+ for difficulty in ("easy", "medium", "hard"):
25
+ pack = generate_scenario(seed=123, template=template, difficulty=difficulty)
26
+ assert pack.scientist_observation.max_rounds == MAX_ROUNDS
27
+ assert pack.lab_manager_observation.max_rounds == MAX_ROUNDS
28
+ assert pack.lab_manager_observation.budget_total <= MAX_BUDGET
29
+
30
+
31
+ def test_timeout_exports_share_the_same_default_value() -> None:
32
+ assert SESSION_TTL_SECONDS == WS_IDLE_TIMEOUT_SECONDS
tests/test_scenarios.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+
5
+ from replicalab.scenarios import (
6
+ GOLDEN_SCENARIO_SPECS_PATH,
7
+ available_scenario_families,
8
+ generate_scenario,
9
+ )
10
+
11
+
12
def test_generate_scenario_is_deterministic_for_same_seed() -> None:
    """Identical seed/template/difficulty inputs must produce identical packs."""
    request = {"seed": 101, "template": "math_reasoning", "difficulty": "easy"}

    pack_a = generate_scenario(**request)
    pack_b = generate_scenario(**request)

    assert pack_a.model_dump(mode="json") == pack_b.model_dump(mode="json")
17
+
18
+
19
def test_generate_scenario_varies_across_seeded_cases() -> None:
    """Changing only the seed should change at least the paper title."""
    titles = [
        generate_scenario(
            seed=seed, template="math_reasoning", difficulty="easy"
        ).scientist_observation.paper_title
        for seed in (101, 102)
    ]

    assert titles[0] != titles[1]
24
+
25
+
26
def test_available_scenario_families_exposes_three_domain_families() -> None:
    """The registry advertises exactly three families, each with all three levels."""
    expected = [
        {"family": family, "difficulties": ["easy", "medium", "hard"]}
        for family in ("math_reasoning", "ml_benchmark", "finance_trading")
    ]

    assert available_scenario_families() == expected
32
+
33
+
34
def test_hard_finance_scenario_exposes_unavailable_resource_and_safety_rules() -> None:
    """Hard finance runs must surface scarcity (an unavailable resource) and safety rules."""
    scenario = generate_scenario(seed=303, template="finance_trading", difficulty="hard")
    manager_view = scenario.lab_manager_observation

    unavailable = [resource for resource in scenario.resources if not resource.available]
    assert unavailable
    assert manager_view.reagents_out_of_stock
    assert manager_view.safety_restrictions
40
+
41
+
42
def test_difficulty_levels_mechanically_change_budget_and_constraints() -> None:
    """Budgets must strictly shrink and constraint counts strictly grow with difficulty."""
    packs = [
        generate_scenario(seed=202, template="ml_benchmark", difficulty=level)
        for level in ("easy", "medium", "hard")
    ]
    budgets = [pack.lab_manager_observation.budget_total for pack in packs]
    constraint_counts = [len(pack.constraints) for pack in packs]

    assert budgets[0] > budgets[1] > budgets[2]
    assert constraint_counts[0] < constraint_counts[1] < constraint_counts[2]
50
+
51
+
52
def test_generated_scenarios_keep_unique_constraint_and_resource_keys() -> None:
    """Constraint/resource keys stay unique, and hidden-spec fields are non-empty."""
    for family in ("math_reasoning", "ml_benchmark", "finance_trading"):
        scenario = generate_scenario(seed=303, template=family, difficulty="hard")

        # Duplicate keys would collapse in the set, shrinking its size.
        for keys in (
            [constraint.key for constraint in scenario.constraints],
            [resource.key for resource in scenario.resources],
        ):
            assert len(keys) == len(set(keys))

        assert scenario.hidden_reference_spec.required_elements
        assert scenario.allowed_substitutions
61
+
62
+
63
def test_golden_scenario_specs_exist_for_manual_prompt_checks() -> None:
    """The checked-in golden spec file lists the three expected scenario ids in order."""
    raw = GOLDEN_SCENARIO_SPECS_PATH.read_text(encoding="utf-8")
    specs = json.loads(raw)

    expected_ids = ["golden_math_easy", "golden_ml_medium", "golden_finance_hard"]
    assert len(specs) == 3
    assert [entry["id"] for entry in specs] == expected_ids
72
+
73
+
74
def test_golden_scenarios_match_expected_title_and_domain() -> None:
    """Regenerating each golden spec reproduces its recorded domain and title fragment."""
    specs = json.loads(GOLDEN_SCENARIO_SPECS_PATH.read_text(encoding="utf-8"))

    for entry in specs:
        regenerated = generate_scenario(
            seed=entry["seed"],
            template=entry["template"],
            difficulty=entry["difficulty"],
        )

        assert regenerated.domain_id == entry["expected_domain_id"]
        assert entry["expected_title_contains"] in regenerated.scientist_observation.paper_title
tests/test_scientist_policy.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import pytest
4
+
5
+ from replicalab.agents.scientist_policy import (
6
+ ScientistOutputParseError,
7
+ build_scientist_system_prompt,
8
+ parse_scientist_output,
9
+ )
10
+ from replicalab.models import ScientistActionType
11
+ from replicalab.scenarios import generate_scenario
12
+
13
+
14
def test_parse_scientist_output_accepts_plain_json() -> None:
    """A bare JSON object with surrounding whitespace parses into a ScientistAction."""
    # NOTE(review): in-string indentation below is assumed insignificant — the
    # parser should locate the JSON object regardless of surrounding whitespace;
    # confirm against parse_scientist_output's extraction logic.
    raw_text = """
    {
        "action_type": "request_info",
        "sample_size": 0,
        "controls": [],
        "technique": "",
        "duration_days": 0,
        "required_equipment": [],
        "required_reagents": [],
        "questions": ["What compute budget is available?"],
        "rationale": ""
    }
    """

    action = parse_scientist_output(raw_text)

    # Enum identity (not just equality) plus the round-tripped question list.
    assert action.action_type is ScientistActionType.REQUEST_INFO
    assert action.questions == ["What compute budget is available?"]
33
+
34
+
35
def test_parse_scientist_output_accepts_fenced_json_with_prose() -> None:
    """A ```json fenced block preceded by free-form prose still parses."""
    # NOTE(review): exercises the fenced-extraction path — prose before the
    # fence must be ignored, and the fence markers stripped, before decoding.
    raw_text = """
    I would revise the plan as follows:

    ```json
    {
        "action_type": "revise_protocol",
        "sample_size": 24,
        "controls": ["baseline", "ablation"],
        "technique": "small_scale_backtest",
        "duration_days": 3,
        "required_equipment": ["gpu_node"],
        "required_reagents": [],
        "questions": [],
        "rationale": "Shrink the trial to fit the available compute window."
    }
    ```
    """

    action = parse_scientist_output(raw_text)

    assert action.action_type is ScientistActionType.REVISE_PROTOCOL
    assert action.technique == "small_scale_backtest"
58
+
59
+
60
def test_parse_scientist_output_raises_explicit_error_when_json_is_missing() -> None:
    """Pure prose with no JSON object raises a typed error carrying code "no_json"."""
    prose_only = "I need more context before I can answer."

    with pytest.raises(ScientistOutputParseError) as caught:
        parse_scientist_output(prose_only)

    error = caught.value
    assert error.code == "no_json"
    assert "does not contain a JSON object" in error.message
66
+
67
+
68
def test_parse_scientist_output_raises_explicit_error_when_json_is_invalid() -> None:
    """Malformed JSON inside a fence raises a typed error with code "invalid_json"."""
    # The trailing comma after the "questions" entry is the deliberate defect:
    # it is illegal JSON, so decoding (not extraction) is what fails here.
    raw_text = """
    ```json
    {
        "action_type": "request_info",
        "questions": ["What budget do we have?"],
    }
    ```
    """

    with pytest.raises(ScientistOutputParseError) as exc_info:
        parse_scientist_output(raw_text)

    assert exc_info.value.code == "invalid_json"
    assert "could not be decoded" in exc_info.value.message
83
+
84
+
85
def test_parse_scientist_output_raises_explicit_error_when_schema_is_invalid() -> None:
    """Valid JSON that fails ScientistAction validation raises code "invalid_action"."""
    # NOTE(review): this payload is syntactically valid JSON; the schema defect
    # appears to be a request_info action with an empty "questions" list —
    # confirm against ScientistAction's validators.
    raw_text = """
    {
        "action_type": "request_info",
        "sample_size": 0,
        "controls": [],
        "technique": "",
        "duration_days": 0,
        "required_equipment": [],
        "required_reagents": [],
        "questions": [],
        "rationale": ""
    }
    """

    with pytest.raises(ScientistOutputParseError) as exc_info:
        parse_scientist_output(raw_text)

    assert exc_info.value.code == "invalid_action"
    assert "ScientistAction validation" in exc_info.value.message
105
+
106
+
107
def test_build_scientist_system_prompt_uses_normalized_scenario_data() -> None:
    """The system prompt embeds normalized scenario data and the action schema."""
    scenario = generate_scenario(seed=202, template="ml_benchmark", difficulty="medium")

    prompt = build_scientist_system_prompt(scenario)

    expected_fragments = (
        "You are the Scientist agent in ReplicaLab.",
        scenario.task_summary,
        scenario.success_criteria[0],
        scenario.resources[0].label,
        "action_type values",
        "propose_protocol",
        "request_info",
    )
    for fragment in expected_fragments:
        assert fragment in prompt