Spaces:
Running
Running
Add MOD 11 typed StepInfo/RewardBreakdown and import completed foundation modules
MOD 11: Replace untyped StepResult.info dict with StepInfo model
(extra="allow") and RewardBreakdown model with constrained score
fields. Stub server now explicitly constructs typed objects.
Also imports previously completed work: config.py (MOD 12),
seed.py (SCN 01), normalized scenario layer with three domain
adapters (SCN 02-09), scientist policy parser, and all tests.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
- replicalab/agents/__init__.py +13 -0
- replicalab/agents/scientist_policy.py +290 -0
- replicalab/config.py +26 -0
- replicalab/models.py +34 -2
- replicalab/scenarios/__init__.py +25 -0
- replicalab/scenarios/finance_trading.py +214 -0
- replicalab/scenarios/math_reasoning.py +214 -0
- replicalab/scenarios/ml_benchmark.py +214 -0
- replicalab/scenarios/templates.py +299 -0
- replicalab/utils/seed.py +20 -0
- server/app.py +59 -42
- tests/fixtures/golden_scenarios.json +26 -0
- tests/test_config.py +32 -0
- tests/test_scenarios.py +84 -0
- tests/test_scientist_policy.py +118 -0
replicalab/agents/__init__.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Agent policy helpers exposed as importable modules."""
|
| 2 |
+
|
| 3 |
+
from .scientist_policy import (
|
| 4 |
+
ScientistOutputParseError,
|
| 5 |
+
build_scientist_system_prompt,
|
| 6 |
+
parse_scientist_output,
|
| 7 |
+
)
|
| 8 |
+
|
| 9 |
+
__all__ = [
|
| 10 |
+
"ScientistOutputParseError",
|
| 11 |
+
"build_scientist_system_prompt",
|
| 12 |
+
"parse_scientist_output",
|
| 13 |
+
]
|
replicalab/agents/scientist_policy.py
ADDED
|
@@ -0,0 +1,290 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Scientist policy helpers.
|
| 2 |
+
|
| 3 |
+
MOD 09 introduced strict parsing from raw model output into
|
| 4 |
+
``ScientistAction``. AGT 01 adds the first domain-neutral system prompt
|
| 5 |
+
builder so prompt assembly can be driven by the normalized scenario pack
|
| 6 |
+
instead of hard-coded domain text.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
from __future__ import annotations
|
| 10 |
+
|
| 11 |
+
import json
|
| 12 |
+
import re
|
| 13 |
+
from typing import Any, Literal, Mapping
|
| 14 |
+
|
| 15 |
+
from pydantic import ValidationError
|
| 16 |
+
|
| 17 |
+
from replicalab.models import ScientistAction, ScientistActionType
|
| 18 |
+
from replicalab.scenarios import NormalizedScenarioPack
|
| 19 |
+
|
| 20 |
+
_JSON_FENCE_RE = re.compile(r"```(?:json)?\s*(.*?)```", re.IGNORECASE | re.DOTALL)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class ScientistOutputParseError(ValueError):
    """Explicit parser error for malformed or invalid Scientist output.

    Attributes:
        code: Machine-readable failure category.
        message: Human-readable explanation (also the exception message).
        raw_text: The unmodified model output that failed to parse.
        parsed_payload: The decoded JSON object when decoding succeeded but
            validation failed; otherwise ``None``.
    """

    def __init__(
        self,
        code: Literal["no_json", "invalid_json", "invalid_action"],
        message: str,
        raw_text: str,
        *,
        parsed_payload: dict[str, Any] | None = None,
    ) -> None:
        super().__init__(message)
        self.code = code
        self.message = message
        self.raw_text = raw_text
        self.parsed_payload = parsed_payload

    def to_dict(self) -> dict[str, Any]:
        """Return a stable error shape for callers and future retries."""
        field_names = ("code", "message", "raw_text", "parsed_payload")
        return {name: getattr(self, name) for name in field_names}
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def build_scientist_system_prompt(
    scenario: NormalizedScenarioPack | Mapping[str, Any],
) -> str:
    """Build a domain-neutral Scientist system prompt from normalized data.

    Args:
        scenario: A ``NormalizedScenarioPack`` or a mapping that validates
            into one (see ``_coerce_scenario_pack``).

    Returns:
        Prompt text assembled from blank-line-separated sections.
    """

    pack = _coerce_scenario_pack(scenario)
    # Enumerate every legal action value so the model never guesses the enum.
    allowed_actions = ", ".join(action.value for action in ScientistActionType)

    sections = [
        "You are the Scientist agent in ReplicaLab.",
        (
            "Your job is to negotiate toward the strongest feasible plan under the "
            "provided constraints. You do not invent resources, loosen constraints, "
            "or assume access to hidden ground truth."
        ),
        f"Domain: {pack.domain_id}",
        f"Task: {pack.task_summary}",
        "Success criteria:",
        _render_bullets(pack.success_criteria),
        "Constraints:",
        _render_constraints(pack),
        "Available resources:",
        _render_resources(pack),
        "Allowed substitutions:",
        _render_substitutions(pack),
        (
            "Output contract: return exactly one JSON object with all "
            "ScientistAction fields and no extra keys."
        ),
        f"Allowed action_type values: {allowed_actions}.",
        (
            "Use propose_protocol or revise_protocol only when you can provide a full "
            "protocol payload. Use request_info only when a blocking question remains. "
            "Use accept only when the plan is ready without further edits."
        ),
        (
            "For propose_protocol and revise_protocol, the JSON must include: "
            "sample_size >= 1, controls, technique, duration_days >= 0, "
            "required_equipment, required_reagents, questions = [], and rationale."
        ),
        (
            "For request_info, all protocol fields must stay empty or zero and "
            "questions must contain at least one concrete question."
        ),
        (
            "For accept, questions must be empty and protocol-edit fields must stay "
            "empty or zero."
        ),
    ]

    # Skip empty section strings (e.g. a renderer returning "") so the joined
    # prompt never contains doubled blank separators.
    return "\n\n".join(section for section in sections if section)
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def parse_scientist_output(raw_text: str) -> ScientistAction:
    """Parse raw model text into a validated ``ScientistAction``.

    The parser accepts:
    - plain JSON objects
    - fenced JSON blocks
    - prose that contains one JSON object

    Raises:
        ScientistOutputParseError: with code ``no_json``/``invalid_json``
            (propagated from payload extraction) or ``invalid_action`` when
            the decoded object fails ``ScientistAction`` validation.
    """

    payload = _parse_json_payload(raw_text)
    try:
        return ScientistAction.model_validate(payload)
    except ValidationError as exc:
        # Keep both the raw text and the decoded payload on the error so
        # callers can build a retry prompt without re-parsing.
        raise ScientistOutputParseError(
            "invalid_action",
            _format_validation_error(exc),
            raw_text,
            parsed_payload=payload,
        ) from exc
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
def _parse_json_payload(raw_text: str) -> dict[str, Any]:
    """Extract and decode the first JSON object found in *raw_text*.

    Raises:
        ScientistOutputParseError: ``no_json`` when the text is empty or has
            no JSON-like candidates; ``invalid_json`` when JSON-like text is
            present but cannot be decoded, or decodes to a non-object value.
    """
    if not raw_text.strip():
        raise ScientistOutputParseError(
            "no_json",
            "Scientist output is empty and does not contain a JSON object.",
            raw_text,
        )

    saw_json_like_text = False
    last_json_error: json.JSONDecodeError | None = None

    for candidate in _iter_json_candidates(raw_text):
        saw_json_like_text = True
        try:
            decoded = json.loads(candidate)
        except json.JSONDecodeError as exc:
            # Remember the failure but keep trying later candidates.
            last_json_error = exc
            continue

        # A decodable scalar or array is still not a valid action payload.
        if not isinstance(decoded, dict):
            raise ScientistOutputParseError(
                "invalid_json",
                "Scientist output must decode to a JSON object.",
                raw_text,
            )
        return decoded

    # Candidates existed but none decoded: surface the last decode failure.
    if saw_json_like_text and last_json_error is not None:
        raise ScientistOutputParseError(
            "invalid_json",
            (
                "Scientist output contains JSON-like text but it could not be decoded: "
                f"{last_json_error.msg} at line {last_json_error.lineno}, "
                f"column {last_json_error.colno}."
            ),
            raw_text,
        ) from last_json_error

    raise ScientistOutputParseError(
        "no_json",
        "Scientist output does not contain a JSON object.",
        raw_text,
    )
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
def _iter_json_candidates(raw_text: str) -> list[str]:
    """Collect de-duplicated JSON-object candidate strings from *raw_text*.

    Order matters to the caller: the whole text (only when it already looks
    like bare JSON or a fence), then the first balanced brace span, then the
    contents of each ``` fenced block (and its first brace span).
    """
    candidates: list[str] = []
    seen: set[str] = set()

    def add(candidate: str | None) -> None:
        # Normalize and de-duplicate so the caller never decodes the same
        # text twice.
        if candidate is None:
            return
        cleaned = candidate.strip()
        if not cleaned or cleaned in seen:
            return
        seen.add(cleaned)
        candidates.append(cleaned)

    stripped = raw_text.strip()
    if stripped.startswith("{") or stripped.startswith("```"):
        add(raw_text)
    add(_extract_first_json_object(raw_text))

    for match in _JSON_FENCE_RE.finditer(raw_text):
        fenced = match.group(1)
        add(fenced)
        add(_extract_first_json_object(fenced))

    return candidates
|
| 194 |
+
|
| 195 |
+
|
| 196 |
+
def _extract_first_json_object(text: str) -> str | None:
|
| 197 |
+
start = text.find("{")
|
| 198 |
+
if start < 0:
|
| 199 |
+
return None
|
| 200 |
+
|
| 201 |
+
depth = 0
|
| 202 |
+
in_string = False
|
| 203 |
+
escaped = False
|
| 204 |
+
|
| 205 |
+
for index in range(start, len(text)):
|
| 206 |
+
char = text[index]
|
| 207 |
+
|
| 208 |
+
if in_string:
|
| 209 |
+
if escaped:
|
| 210 |
+
escaped = False
|
| 211 |
+
elif char == "\\":
|
| 212 |
+
escaped = True
|
| 213 |
+
elif char == '"':
|
| 214 |
+
in_string = False
|
| 215 |
+
continue
|
| 216 |
+
|
| 217 |
+
if char == '"':
|
| 218 |
+
in_string = True
|
| 219 |
+
elif char == "{":
|
| 220 |
+
depth += 1
|
| 221 |
+
elif char == "}":
|
| 222 |
+
depth -= 1
|
| 223 |
+
if depth == 0:
|
| 224 |
+
return text[start : index + 1]
|
| 225 |
+
|
| 226 |
+
return None
|
| 227 |
+
|
| 228 |
+
|
| 229 |
+
def _format_validation_error(error: ValidationError) -> str:
|
| 230 |
+
parts: list[str] = []
|
| 231 |
+
for item in error.errors():
|
| 232 |
+
path = ".".join(str(segment) for segment in item.get("loc", ()))
|
| 233 |
+
message = item.get("msg", "Validation error")
|
| 234 |
+
parts.append(f"{path}: {message}" if path else message)
|
| 235 |
+
|
| 236 |
+
detail = "; ".join(parts) if parts else str(error)
|
| 237 |
+
return f"Scientist output JSON failed ScientistAction validation: {detail}"
|
| 238 |
+
|
| 239 |
+
|
| 240 |
+
def _coerce_scenario_pack(
    scenario: NormalizedScenarioPack | Mapping[str, Any],
) -> NormalizedScenarioPack:
    """Return *scenario* as a ``NormalizedScenarioPack``, validating mappings."""
    if isinstance(scenario, NormalizedScenarioPack):
        return scenario
    return NormalizedScenarioPack.model_validate(scenario)
|
| 246 |
+
|
| 247 |
+
|
| 248 |
+
def _render_bullets(items: list[str]) -> str:
|
| 249 |
+
return "\n".join(f"- {item}" for item in items)
|
| 250 |
+
|
| 251 |
+
|
| 252 |
+
def _render_constraints(pack: NormalizedScenarioPack) -> str:
|
| 253 |
+
lines = []
|
| 254 |
+
for constraint in pack.constraints:
|
| 255 |
+
amount = ""
|
| 256 |
+
if constraint.quantity is not None:
|
| 257 |
+
unit = f" {constraint.unit}" if constraint.unit else ""
|
| 258 |
+
amount = f" ({constraint.comparator} {constraint.quantity}{unit})"
|
| 259 |
+
hardness = "hard" if constraint.hard else "soft"
|
| 260 |
+
lines.append(f"- [{hardness}] {constraint.label}{amount}: {constraint.details}")
|
| 261 |
+
return "\n".join(lines)
|
| 262 |
+
|
| 263 |
+
|
| 264 |
+
def _render_resources(pack: NormalizedScenarioPack) -> str:
|
| 265 |
+
lines = []
|
| 266 |
+
for resource in pack.resources:
|
| 267 |
+
availability = "available" if resource.available else "unavailable"
|
| 268 |
+
amount = ""
|
| 269 |
+
if resource.quantity is not None:
|
| 270 |
+
unit = f" {resource.unit}" if resource.unit else ""
|
| 271 |
+
amount = f" ({resource.quantity}{unit})"
|
| 272 |
+
lines.append(
|
| 273 |
+
f"- [{availability}] {resource.label}{amount}: {resource.details}"
|
| 274 |
+
)
|
| 275 |
+
return "\n".join(lines)
|
| 276 |
+
|
| 277 |
+
|
| 278 |
+
def _render_substitutions(pack: NormalizedScenarioPack) -> str:
|
| 279 |
+
if not pack.allowed_substitutions:
|
| 280 |
+
return "- No substitutions are pre-approved."
|
| 281 |
+
|
| 282 |
+
lines = []
|
| 283 |
+
for substitution in pack.allowed_substitutions:
|
| 284 |
+
lines.append(
|
| 285 |
+
(
|
| 286 |
+
f"- {substitution.original} -> {substitution.alternative}. "
|
| 287 |
+
f"Condition: {substitution.condition} Tradeoff: {substitution.tradeoff}"
|
| 288 |
+
)
|
| 289 |
+
)
|
| 290 |
+
return "\n".join(lines)
|
replicalab/config.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Shared configuration constants for ReplicaLab.
|
| 2 |
+
|
| 3 |
+
MOD 12 centralizes the small set of repo-wide defaults that were previously
|
| 4 |
+
scattered across the stub server and scenario builders. Future environment,
|
| 5 |
+
scoring, and client modules should import from here instead of introducing
|
| 6 |
+
new magic numbers.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
from __future__ import annotations
|
| 10 |
+
|
| 11 |
+
DEFAULT_SCENARIO_TEMPLATE = "math_reasoning"
|
| 12 |
+
DEFAULT_DIFFICULTY = "easy"
|
| 13 |
+
|
| 14 |
+
MAX_ROUNDS = 6
|
| 15 |
+
MAX_BUDGET = 5000.0
|
| 16 |
+
|
| 17 |
+
TIMEOUT_SECONDS = 300
|
| 18 |
+
ROUND_TIME_LIMIT_SECONDS = 300
|
| 19 |
+
|
| 20 |
+
SESSION_TTL_SECONDS = TIMEOUT_SECONDS
|
| 21 |
+
WS_IDLE_TIMEOUT_SECONDS = TIMEOUT_SECONDS
|
| 22 |
+
|
| 23 |
+
STUB_ACCEPT_REWARD = 5.0
|
| 24 |
+
|
| 25 |
+
API_HOST = "0.0.0.0"
|
| 26 |
+
API_PORT = 7860
|
replicalab/models.py
CHANGED
|
@@ -307,18 +307,50 @@ class Observation(BaseModel):
|
|
| 307 |
lab_manager: Optional[LabManagerObservation]
|
| 308 |
|
| 309 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 310 |
# ---------------------------------------------------------------------------
|
| 311 |
# Step result
|
| 312 |
# ---------------------------------------------------------------------------
|
| 313 |
|
| 314 |
class StepResult(BaseModel):
|
| 315 |
"""Returned by env.step(). Contains the next observation, reward,
|
| 316 |
-
termination flag, and
|
| 317 |
|
| 318 |
observation: Optional[Observation] = None
|
| 319 |
reward: float = 0.0
|
| 320 |
done: bool = False
|
| 321 |
-
info:
|
| 322 |
|
| 323 |
|
| 324 |
# ---------------------------------------------------------------------------
|
|
|
|
| 307 |
lab_manager: Optional[LabManagerObservation]
|
| 308 |
|
| 309 |
|
| 310 |
+
# ---------------------------------------------------------------------------
|
| 311 |
+
# Reward breakdown and step metadata
|
| 312 |
+
# ---------------------------------------------------------------------------
|
| 313 |
+
|
| 314 |
+
|
| 315 |
+
class RewardBreakdown(BaseModel):
    """Component scores and adjustments produced by the judge rubric engine.

    ``rigor``, ``feasibility``, and ``fidelity`` are constrained to [0, 1];
    the bonus fields and ``penalties`` values are unconstrained floats.
    """

    rigor: float = Field(default=0.0, ge=0, le=1)
    feasibility: float = Field(default=0.0, ge=0, le=1)
    fidelity: float = Field(default=0.0, ge=0, le=1)
    efficiency_bonus: float = 0.0
    communication_bonus: float = 0.0
    # Per-reason penalty amounts keyed by label; sign convention is not
    # enforced here — TODO confirm with the rubric engine.
    penalties: dict[str, float] = Field(default_factory=dict)
|
| 324 |
+
|
| 325 |
+
|
| 326 |
+
class StepInfo(BaseModel):
    """Typed metadata returned alongside each step result.

    Reserved keys from the frozen contract are typed fields.
    Additional debug or runtime metadata is allowed via extra="allow".
    """

    # Unknown keys are retained rather than rejected, so runtime debug
    # metadata can ride along without a schema change.
    model_config = ConfigDict(extra="allow")

    agreement_reached: bool = False
    error: Optional[str] = None
    reward_breakdown: Optional[RewardBreakdown] = None
    judge_notes: Optional[str] = None
    verdict: Optional[str] = None
|
| 340 |
+
|
| 341 |
+
|
| 342 |
# ---------------------------------------------------------------------------
|
| 343 |
# Step result
|
| 344 |
# ---------------------------------------------------------------------------
|
| 345 |
|
| 346 |
class StepResult(BaseModel):
    """Returned by env.step(). Contains the next observation, reward,
    termination flag, and typed step info."""

    observation: Optional[Observation] = None
    reward: float = 0.0
    done: bool = False
    # default_factory keeps each StepResult's info instance independent.
    info: StepInfo = Field(default_factory=StepInfo)
|
| 354 |
|
| 355 |
|
| 356 |
# ---------------------------------------------------------------------------
|
replicalab/scenarios/__init__.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Scenario generation exports."""
|
| 2 |
+
|
| 3 |
+
from .templates import (
|
| 4 |
+
GOLDEN_SCENARIO_SPECS_PATH,
|
| 5 |
+
HiddenReferenceSpec,
|
| 6 |
+
NormalizedScenarioPack,
|
| 7 |
+
ScenarioConstraint,
|
| 8 |
+
ScenarioResource,
|
| 9 |
+
available_scenario_families,
|
| 10 |
+
apply_difficulty,
|
| 11 |
+
generate_scenario,
|
| 12 |
+
load_template,
|
| 13 |
+
)
|
| 14 |
+
|
| 15 |
+
__all__ = [
|
| 16 |
+
"GOLDEN_SCENARIO_SPECS_PATH",
|
| 17 |
+
"HiddenReferenceSpec",
|
| 18 |
+
"NormalizedScenarioPack",
|
| 19 |
+
"ScenarioConstraint",
|
| 20 |
+
"ScenarioResource",
|
| 21 |
+
"available_scenario_families",
|
| 22 |
+
"apply_difficulty",
|
| 23 |
+
"generate_scenario",
|
| 24 |
+
"load_template",
|
| 25 |
+
]
|
replicalab/scenarios/finance_trading.py
ADDED
|
@@ -0,0 +1,214 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Finance and trading planning scenario templates."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import random
|
| 6 |
+
from typing import Any
|
| 7 |
+
|
| 8 |
+
from replicalab.config import MAX_ROUNDS
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def build_finance_trading_template(rng: random.Random) -> dict[str, Any]:
    """Return one finance/trading scenario template chosen with *rng*.

    Each case is a plain dict matching the normalized scenario schema
    (task framing, success criteria, constraints, resources, substitutions,
    and budget/round limits). Using the caller-supplied RNG keeps scenario
    selection seedable and reproducible.
    """
    cases = [
        {
            "domain_id": "finance_trading",
            "paper_title": "Planning an offline mean-reversion backtest for SPY and QQQ",
            "paper_hypothesis": "A simple mean-reversion design can be evaluated fairly without live execution.",
            "paper_method": "Run an offline daily-bar backtest with transaction costs, slippage assumptions, and fixed entry rules.",
            "paper_key_finding": "The plan is accepted only if risk limits and evaluation hygiene remain explicit.",
            "task_summary": "Design a mean-reversion backtest workflow for SPY and QQQ under capital, drawdown, and deadline limits.",
            "success_criteria": [
                "Use only offline historical data with explicit slippage assumptions.",
                "Keep position sizing inside the stated capital and drawdown rules.",
                "Separate strategy design from final evaluation.",
            ],
            "reference_summary": "A valid plan keeps the workflow offline, constrains drawdown, and documents slippage assumptions.",
            "required_elements": [
                "offline historical data only",
                "transaction cost assumption",
                "drawdown guardrail",
                "final evaluation split",
            ],
            "flexible_elements": [
                "lookback window",
                "entry threshold",
                "report visualization format",
            ],
            "target_metric": "risk_adjusted_return",
            "target_value": "positive Sharpe with drawdown inside the guardrail",
            "constraints": [
                {
                    "key": "max_capital",
                    "label": "Maximum simulated capital",
                    "quantity": 50000,
                    "unit": "usd",
                    "comparator": "<=",
                    "hard": True,
                    "details": "The simulation must stay within the stated capital cap.",
                },
                {
                    "key": "max_drawdown",
                    "label": "Maximum allowed drawdown",
                    "quantity": 8,
                    "unit": "percent",
                    "comparator": "<=",
                    "hard": True,
                    "details": "Any accepted plan must respect the drawdown guardrail.",
                },
                {
                    # No quantity/unit: this is a categorical (mode) constraint.
                    "key": "live_execution",
                    "label": "Execution mode",
                    "quantity": None,
                    "unit": None,
                    "comparator": "=",
                    "hard": True,
                    "details": "Only offline or backtest planning is allowed. No live trading.",
                },
            ],
            "resources": [
                {
                    "key": "historical_bars",
                    "label": "Historical daily bar dataset",
                    "quantity": 1,
                    "unit": "dataset",
                    "available": True,
                    "category": "data",
                    "details": "Contains adjusted SPY and QQQ bars with metadata.",
                },
                {
                    "key": "backtest_engine",
                    "label": "Backtest engine",
                    "quantity": 1,
                    "unit": "engine",
                    "available": True,
                    "category": "tool",
                    "details": "Supports offline simulation with transaction costs and slippage.",
                },
                {
                    "key": "risk_reviewer",
                    "label": "Risk reviewer",
                    "quantity": 1,
                    "unit": "reviewer",
                    "available": True,
                    "category": "personnel",
                    "details": "Reviews risk assumptions and evaluation hygiene.",
                },
            ],
            "allowed_substitutions": [
                {
                    "original": "daily bars",
                    "alternative": "hourly bars aggregated to daily decisions",
                    "condition": "Use if the daily dataset is delayed or incomplete.",
                    "tradeoff": "The plan must justify any slippage-model change.",
                },
                {
                    "original": "risk reviewer",
                    "alternative": "pre-committed risk checklist",
                    "condition": "Use if the reviewer is unavailable.",
                    "tradeoff": "The plan must include explicit drawdown checks.",
                },
            ],
            "budget_total": 950.0,
            "staff_count": 1,
            "time_limit_days": 3,
            "max_rounds": MAX_ROUNDS,
        },
        {
            "domain_id": "finance_trading",
            "paper_title": "Planning an offline momentum backtest for liquid futures",
            "paper_hypothesis": "A disciplined momentum design can be evaluated offline with strict liquidity and cost assumptions.",
            "paper_method": "Run a futures momentum backtest with predefined roll logic, cost model, and walk-forward evaluation.",
            "paper_key_finding": "The plan is accepted only if walk-forward evaluation and liquidity constraints are explicit.",
            "task_summary": "Design an offline momentum futures backtest under liquidity, slippage, and review constraints.",
            "success_criteria": [
                "Use only offline walk-forward evaluation.",
                "Model roll handling and transaction costs explicitly.",
                "Keep liquidity and concentration rules visible in the final plan.",
            ],
            "reference_summary": "A valid plan models roll logic, transaction costs, and walk-forward evaluation with liquidity limits.",
            "required_elements": [
                "walk-forward evaluation",
                "roll logic",
                "transaction cost assumption",
                "liquidity limit",
            ],
            "flexible_elements": [
                "lookback horizon",
                "rebalance frequency",
                "reporting template",
            ],
            "target_metric": "risk_adjusted_return",
            "target_value": "positive out-of-sample Sharpe with liquidity-compliant trades",
            "constraints": [
                {
                    # Soft constraint: reviewable scope, not a hard guardrail.
                    "key": "max_markets",
                    "label": "Maximum simultaneous markets",
                    "quantity": 4,
                    "unit": "markets",
                    "comparator": "<=",
                    "hard": False,
                    "details": "Keep the design narrow enough to review in one session.",
                },
                {
                    "key": "max_drawdown",
                    "label": "Maximum allowed drawdown",
                    "quantity": 10,
                    "unit": "percent",
                    "comparator": "<=",
                    "hard": True,
                    "details": "The plan must remain inside the drawdown guardrail.",
                },
                {
                    "key": "live_execution",
                    "label": "Execution mode",
                    "quantity": None,
                    "unit": None,
                    "comparator": "=",
                    "hard": True,
                    "details": "Only offline design and backtesting are allowed.",
                },
            ],
            "resources": [
                {
                    "key": "futures_dataset",
                    "label": "Historical futures dataset",
                    "quantity": 1,
                    "unit": "dataset",
                    "available": True,
                    "category": "data",
                    "details": "Includes roll metadata and contract-level liquidity fields.",
                },
                {
                    "key": "backtest_engine",
                    "label": "Walk-forward backtest engine",
                    "quantity": 1,
                    "unit": "engine",
                    "available": True,
                    "category": "tool",
                    "details": "Supports walk-forward slicing and execution-cost modeling.",
                },
                {
                    "key": "risk_reviewer",
                    "label": "Risk reviewer",
                    "quantity": 1,
                    "unit": "reviewer",
                    "available": True,
                    "category": "personnel",
                    "details": "Checks liquidity and concentration assumptions.",
                },
            ],
            "allowed_substitutions": [
                {
                    "original": "contract-level backtest",
                    "alternative": "continuous-series backtest with explicit caveat",
                    "condition": "Use if contract roll metadata is incomplete.",
                    "tradeoff": "The plan must document the fidelity loss clearly.",
                }
            ],
            "budget_total": 1100.0,
            "staff_count": 1,
            "time_limit_days": 4,
            "max_rounds": MAX_ROUNDS,
        },
    ]
    # Seeded callers get deterministic case selection.
    return rng.choice(cases)
|
replicalab/scenarios/math_reasoning.py
ADDED
|
@@ -0,0 +1,214 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Mathematics scenario templates."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import random
|
| 6 |
+
from typing import Any
|
| 7 |
+
|
| 8 |
+
from replicalab.config import MAX_ROUNDS
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def build_math_reasoning_template(rng: random.Random) -> dict[str, Any]:
    """Return one randomly chosen mathematics proof-planning scenario draft.

    The draft is an untyped dict consumed by the normalized scenario layer
    (``templates._build_pack``); keys must match what that layer reads.
    ``rng`` should come from ``seed_rng`` so the choice is deterministic
    per (seed, template) pair.
    """
    cases = [
        # Case 1: Cauchy-Schwarz proof-planning scenario.
        {
            "domain_id": "mathematics",
            "paper_title": "Planning a proof of the Cauchy-Schwarz inequality",
            "paper_hypothesis": "A square-expansion argument gives the cleanest proof path.",
            "paper_method": "Outline the proof using one algebraic identity, one equality-case check, and reviewer notes.",
            "paper_key_finding": "The proof is accepted only if every inequality step and equality case is justified.",
            "task_summary": "Produce a proof-planning workflow for the Cauchy-Schwarz inequality for an undergraduate seminar handout.",
            "success_criteria": [
                "Every inequality step is justified in plain language.",
                "The equality case is checked explicitly.",
                "The final plan fits within the review and deadline constraints.",
            ],
            "reference_summary": "A valid plan uses a square-expansion route, checks equality, and includes one verification pass.",
            "required_elements": [
                "explicit target inequality",
                "square-expansion or inner-product setup",
                "equality-case check",
                "final verification pass",
            ],
            "flexible_elements": [
                "notation style",
                "ordering of supporting lemmas",
                "proof-sketch granularity",
            ],
            "target_metric": "proof_validity",
            "target_value": "all required justification steps are present",
            "constraints": [
                {
                    "key": "deadline_days",
                    "label": "Proof planning deadline",
                    "quantity": 3,
                    "unit": "days",
                    "comparator": "<=",
                    "hard": True,
                    "details": "The seminar notes must be ready within three days.",
                },
                {
                    "key": "review_passes",
                    "label": "Required review passes",
                    "quantity": 1,
                    "unit": "pass",
                    "comparator": ">=",
                    "hard": True,
                    "details": "At least one verification pass is required before acceptance.",
                },
                {
                    "key": "max_pages",
                    "label": "Maximum proof outline length",
                    "quantity": 2,
                    "unit": "pages",
                    "comparator": "<=",
                    "hard": False,
                    "details": "The outline should stay concise enough for seminar notes.",
                },
            ],
            "resources": [
                {
                    "key": "proof_notebook",
                    "label": "Structured proof notebook",
                    "quantity": 1,
                    "unit": "workspace",
                    "available": True,
                    "category": "tool",
                    "details": "A shared note workspace for the outline and checks.",
                },
                {
                    "key": "theorem_library",
                    "label": "Reference theorem library",
                    "quantity": 1,
                    "unit": "library",
                    "available": True,
                    "category": "reference",
                    "details": "Contains previous inequality proofs and notation conventions.",
                },
                {
                    "key": "reviewer",
                    "label": "Graduate reviewer",
                    "quantity": 1,
                    "unit": "reviewer",
                    "available": True,
                    "category": "personnel",
                    "details": "A reviewer can check one draft before the deadline.",
                },
            ],
            "allowed_substitutions": [
                {
                    "original": "graduate reviewer",
                    "alternative": "self-check rubric",
                    "condition": "Use only if the reviewer is unavailable.",
                    "tradeoff": "Requires a stricter written checklist inside the plan.",
                },
                {
                    "original": "full derivation",
                    "alternative": "proof sketch with explicit checkpoints",
                    "condition": "Use when page budget is tight.",
                    "tradeoff": "The plan must still spell out all justification steps.",
                },
            ],
            "budget_total": 300.0,
            "staff_count": 1,
            "time_limit_days": 3,
            "max_rounds": MAX_ROUNDS,
        },
        # Case 2: Jensen's inequality (convex quadratics) proof-planning scenario.
        {
            "domain_id": "mathematics",
            "paper_title": "Planning a proof of Jensen's inequality for convex quadratics",
            "paper_hypothesis": "A convexity-first outline is shorter than an expectation-expansion route.",
            "paper_method": "Use the convexity definition, midpoint intuition, and one numerical sanity check.",
            "paper_key_finding": "The plan succeeds only if the convexity assumption and averaging step are both explicit.",
            "task_summary": "Produce a proof-planning workflow for Jensen's inequality on convex quadratics for a revision session.",
            "success_criteria": [
                "The convexity assumption is named before the main argument.",
                "Averaging and expectation steps are justified.",
                "The plan includes at least one sanity check example.",
            ],
            "reference_summary": "A valid plan states convexity early, justifies averaging, and uses one sanity check.",
            "required_elements": [
                "convexity assumption",
                "averaging step",
                "sanity check example",
                "closing statement tied to the task objective",
            ],
            "flexible_elements": [
                "example choice",
                "notation style",
                "proof sketch ordering",
            ],
            "target_metric": "proof_validity",
            "target_value": "convexity and averaging are justified with one sanity check",
            "constraints": [
                {
                    "key": "deadline_days",
                    "label": "Proof planning deadline",
                    "quantity": 2,
                    "unit": "days",
                    "comparator": "<=",
                    "hard": True,
                    "details": "The revision notes are due within two days.",
                },
                {
                    "key": "review_passes",
                    "label": "Required review passes",
                    "quantity": 1,
                    "unit": "pass",
                    "comparator": ">=",
                    "hard": True,
                    "details": "The plan needs at least one self-check or peer review.",
                },
                {
                    "key": "max_pages",
                    "label": "Maximum proof outline length",
                    "quantity": 1,
                    "unit": "page",
                    "comparator": "<=",
                    "hard": False,
                    "details": "The final outline should fit on one page.",
                },
            ],
            "resources": [
                {
                    "key": "whiteboard",
                    "label": "Whiteboard workspace",
                    "quantity": 1,
                    "unit": "workspace",
                    "available": True,
                    "category": "tool",
                    "details": "Used to sketch the proof structure and sanity check.",
                },
                {
                    "key": "reference_notes",
                    "label": "Reference lecture notes",
                    "quantity": 1,
                    "unit": "packet",
                    "available": True,
                    "category": "reference",
                    "details": "Contains the convexity definition and worked examples.",
                },
                {
                    "key": "peer_reviewer",
                    "label": "Peer reviewer",
                    "quantity": 1,
                    "unit": "reviewer",
                    "available": True,
                    "category": "personnel",
                    "details": "Available for one short review pass.",
                },
            ],
            "allowed_substitutions": [
                {
                    "original": "peer reviewer",
                    "alternative": "checklist-driven self-review",
                    "condition": "Use if the peer reviewer is unavailable.",
                    "tradeoff": "The final plan must include explicit verification checkpoints.",
                }
            ],
            "budget_total": 220.0,
            "staff_count": 1,
            "time_limit_days": 2,
            "max_rounds": MAX_ROUNDS,
        },
    ]
    # Uniform pick; deterministic for a fixed seed/namespace-derived rng.
    return rng.choice(cases)
|
replicalab/scenarios/ml_benchmark.py
ADDED
|
@@ -0,0 +1,214 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Machine learning benchmark scenario templates."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import random
|
| 6 |
+
from typing import Any
|
| 7 |
+
|
| 8 |
+
from replicalab.config import MAX_ROUNDS
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def build_ml_benchmark_template(rng: random.Random) -> dict[str, Any]:
    """Return one randomly chosen ML benchmark-replication scenario draft.

    The draft is an untyped dict consumed by the normalized scenario layer
    (``templates._build_pack``); keys must match what that layer reads.
    ``rng`` should come from ``seed_rng`` so the choice is deterministic
    per (seed, template) pair.
    """
    cases = [
        # Case 1: AG News TinyBERT replication scenario.
        {
            "domain_id": "machine_learning",
            "paper_title": "Reproducing an AG News TinyBERT baseline",
            "paper_hypothesis": "A distilled model can match the published accuracy within the stated compute budget.",
            "paper_method": "Fine-tune TinyBERT on AG News with the published split, tokenizer, and evaluation script.",
            "paper_key_finding": "The baseline is accepted only if the held-out accuracy is within one point of the target.",
            "task_summary": "Plan an ML benchmark replication for AG News classification with strict GPU and deadline limits.",
            "success_criteria": [
                "Use the published train-validation-test split.",
                "Report held-out accuracy with the same metric definition as the paper.",
                "Fit the full plan within the available GPU budget and time window.",
            ],
            "reference_summary": "A valid plan keeps the published split and evaluation metric while staying inside the compute budget.",
            "required_elements": [
                "published data split",
                "matching tokenizer family",
                "held-out accuracy evaluation",
                "run logging",
            ],
            "flexible_elements": [
                "batch size",
                "learning-rate schedule",
                "checkpoint cadence",
            ],
            "target_metric": "held_out_accuracy",
            "target_value": "within one point of the reported AG News baseline",
            "constraints": [
                {
                    "key": "gpu_hours",
                    "label": "Maximum GPU budget",
                    "quantity": 8,
                    "unit": "gpu_hours",
                    "comparator": "<=",
                    "hard": True,
                    "details": "The full run must fit within eight GPU-hours.",
                },
                {
                    "key": "deadline_days",
                    "label": "Replication deadline",
                    "quantity": 4,
                    "unit": "days",
                    "comparator": "<=",
                    "hard": True,
                    "details": "The benchmark must be reproduced within four days.",
                },
                {
                    # Policy constraint: no quantity/unit, equality comparator.
                    "key": "evaluation_policy",
                    "label": "Evaluation policy",
                    "quantity": None,
                    "unit": None,
                    "comparator": "=",
                    "hard": True,
                    "details": "Use only the held-out split; no test-set peeking.",
                },
            ],
            "resources": [
                {
                    "key": "gpu_node",
                    "label": "A100 GPU node",
                    "quantity": 1,
                    "unit": "node",
                    "available": True,
                    "category": "compute",
                    "details": "Reserved for one benchmark run at a time.",
                },
                {
                    "key": "dataset_mirror",
                    "label": "AG News dataset mirror",
                    "quantity": 1,
                    "unit": "mirror",
                    "available": True,
                    "category": "data",
                    "details": "Local mirror with the published split manifest.",
                },
                {
                    "key": "tracking_tool",
                    "label": "Experiment tracking workspace",
                    "quantity": 1,
                    "unit": "workspace",
                    "available": True,
                    "category": "tool",
                    "details": "Captures configs, metrics, and artifacts.",
                },
            ],
            "allowed_substitutions": [
                {
                    "original": "full training schedule",
                    "alternative": "shorter schedule with early stopping",
                    "condition": "Use when the GPU budget is tight.",
                    "tradeoff": "The plan must justify why the metric remains trustworthy.",
                },
                {
                    "original": "large batch size",
                    "alternative": "smaller batch size with accumulation",
                    "condition": "Use when the node has limited memory.",
                    "tradeoff": "Training takes longer and must still fit the deadline.",
                },
            ],
            "budget_total": 1800.0,
            "staff_count": 2,
            "time_limit_days": 4,
            "max_rounds": MAX_ROUNDS,
        },
        # Case 2: CIFAR-10 ResNet-18 replication scenario.
        {
            "domain_id": "machine_learning",
            "paper_title": "Reproducing a CIFAR-10 ResNet-18 baseline",
            "paper_hypothesis": "The reported top-1 accuracy is reachable with the stated data pipeline and a smaller tuning budget.",
            "paper_method": "Train ResNet-18 on CIFAR-10 with the published augmentation recipe and evaluation checkpoint.",
            "paper_key_finding": "The baseline is accepted only if the final accuracy and training recipe are reproducible.",
            "task_summary": "Plan a CIFAR-10 benchmark replication with limited compute, strict evaluation rules, and one reviewer pass.",
            "success_criteria": [
                "Use the published augmentation recipe or justify a compatible substitution.",
                "Keep evaluation isolated from any tuning loop.",
                "Log all seeds, configs, and final metrics for reproducibility.",
            ],
            "reference_summary": "A valid plan preserves the published augmentation and evaluation rules while logging every run.",
            "required_elements": [
                "published augmentation recipe",
                "fixed evaluation checkpoint",
                "seed logging",
                "final metric report",
            ],
            "flexible_elements": [
                "optimizer implementation",
                "checkpoint interval",
                "data-loader worker count",
            ],
            "target_metric": "top1_accuracy",
            "target_value": "within one point of the CIFAR-10 baseline",
            "constraints": [
                {
                    "key": "gpu_hours",
                    "label": "Maximum GPU budget",
                    "quantity": 10,
                    "unit": "gpu_hours",
                    "comparator": "<=",
                    "hard": True,
                    "details": "The benchmark must fit within ten GPU-hours.",
                },
                {
                    "key": "deadline_days",
                    "label": "Replication deadline",
                    "quantity": 5,
                    "unit": "days",
                    "comparator": "<=",
                    "hard": True,
                    "details": "The plan must finish inside the review window.",
                },
                {
                    "key": "review_passes",
                    "label": "Required review passes",
                    "quantity": 1,
                    "unit": "pass",
                    "comparator": ">=",
                    "hard": False,
                    "details": "A teammate should review the config before launch.",
                },
            ],
            "resources": [
                {
                    "key": "gpu_node",
                    "label": "L40S GPU node",
                    "quantity": 1,
                    "unit": "node",
                    "available": True,
                    "category": "compute",
                    "details": "Shared node with moderate queue pressure.",
                },
                {
                    "key": "dataset_archive",
                    "label": "CIFAR-10 dataset archive",
                    "quantity": 1,
                    "unit": "archive",
                    "available": True,
                    "category": "data",
                    "details": "Local archive with checksum verification.",
                },
                {
                    "key": "reviewer",
                    "label": "Benchmark reviewer",
                    "quantity": 1,
                    "unit": "reviewer",
                    "available": True,
                    "category": "personnel",
                    "details": "Can review the config once before training.",
                },
            ],
            "allowed_substitutions": [
                {
                    "original": "full epoch schedule",
                    "alternative": "reduced epoch schedule with checkpoint comparison",
                    "condition": "Use if queue time or GPU budget becomes tight.",
                    "tradeoff": "Needs a clear explanation for any metric gap.",
                }
            ],
            "budget_total": 2100.0,
            "staff_count": 2,
            "time_limit_days": 5,
            "max_rounds": MAX_ROUNDS,
        },
    ]
    # Uniform pick; deterministic for a fixed seed/namespace-derived rng.
    return rng.choice(cases)
|
replicalab/scenarios/templates.py
ADDED
|
@@ -0,0 +1,299 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Normalized scenario generation and mapping helpers."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import copy
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
from typing import Any, Callable, Literal
|
| 8 |
+
|
| 9 |
+
from pydantic import BaseModel, ConfigDict
|
| 10 |
+
|
| 11 |
+
from replicalab.config import MAX_BUDGET, MAX_ROUNDS
|
| 12 |
+
from replicalab.models import LabManagerObservation, ScientistObservation
|
| 13 |
+
from replicalab.scenarios.finance_trading import build_finance_trading_template
|
| 14 |
+
from replicalab.scenarios.math_reasoning import build_math_reasoning_template
|
| 15 |
+
from replicalab.scenarios.ml_benchmark import build_ml_benchmark_template
|
| 16 |
+
from replicalab.utils.seed import seed_rng
|
| 17 |
+
|
| 18 |
+
Difficulty = Literal["easy", "medium", "hard"]
|
| 19 |
+
TemplateName = Literal["math_reasoning", "ml_benchmark", "finance_trading"]
|
| 20 |
+
|
| 21 |
+
GOLDEN_SCENARIO_SPECS_PATH = (
|
| 22 |
+
Path(__file__).resolve().parents[2] / "tests" / "fixtures" / "golden_scenarios.json"
|
| 23 |
+
)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class ScenarioConstraint(BaseModel):
    """A single quantified or policy constraint on a scenario."""

    model_config = ConfigDict(extra="forbid", str_strip_whitespace=True)

    # Stable machine-readable identifier (e.g. "deadline_days").
    key: str
    # Human-readable short name for display.
    label: str
    # Numeric bound; None for pure policy constraints.
    quantity: float | int | None = None
    # Unit of `quantity`; None when quantity is None.
    unit: str | None = None
    # How `quantity` bounds the plan; "=" also covers policy constraints.
    comparator: Literal["<=", ">=", "="] = "="
    # True if violating the constraint invalidates the plan outright.
    hard: bool = True
    # Full-sentence explanation shown to agents.
    details: str
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
class ScenarioResource(BaseModel):
    """A resource (tool, data, compute, reference, or person) in a scenario."""

    model_config = ConfigDict(extra="forbid", str_strip_whitespace=True)

    # Stable machine-readable identifier (e.g. "gpu_node").
    key: str
    # Human-readable short name for display.
    label: str
    # How many units exist; None when a count is not meaningful.
    quantity: float | int | None = None
    # Unit of `quantity`; None when quantity is None.
    unit: str | None = None
    # False when the resource is constrained/unavailable at this difficulty.
    available: bool = True
    # Grouping used by _split_resources (e.g. "tool", "compute", "data").
    category: str
    # Full-sentence explanation shown to agents.
    details: str
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
class AllowedSubstitution(BaseModel):
    """A sanctioned swap of one planned element for an alternative."""

    model_config = ConfigDict(extra="forbid", str_strip_whitespace=True)

    # The element the plan would normally use.
    original: str
    # The permitted replacement.
    alternative: str
    # When the substitution is allowed.
    condition: str
    # Cost or obligation the plan incurs by substituting.
    tradeoff: str
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
class HiddenReferenceSpec(BaseModel):
    """Grading reference kept hidden from agents; used to score plans."""

    model_config = ConfigDict(extra="forbid", str_strip_whitespace=True)

    # One-sentence description of what a valid plan looks like.
    summary: str
    # Elements a plan must contain to be accepted.
    required_elements: list[str]
    # Elements where the plan may deviate without penalty.
    flexible_elements: list[str]
    # Name of the metric the scenario targets (e.g. "proof_validity").
    target_metric: str
    # Human-readable statement of the success threshold.
    target_value: str
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
class NormalizedScenarioPack(BaseModel):
    """Fully validated scenario bundle produced by generate_scenario.

    Combines the agent-visible observations with the hidden grading spec
    and the normalized constraint/resource lists.
    """

    model_config = ConfigDict(extra="forbid", str_strip_whitespace=True)

    # "<template>-<difficulty>-<seed>"; unique per generation call.
    scenario_id: str
    template: TemplateName
    domain_id: str
    difficulty: Difficulty
    # Base seed the pack was generated from (reproducibility handle).
    seed: int
    task_summary: str
    success_criteria: list[str]
    constraints: list[ScenarioConstraint]
    resources: list[ScenarioResource]
    allowed_substitutions: list[AllowedSubstitution]
    # Hidden from agents; consumed by the grader only.
    hidden_reference_spec: HiddenReferenceSpec
    # Initial observation for the scientist agent (round 0).
    scientist_observation: ScientistObservation
    # Initial observation for the lab-manager agent (round 0).
    lab_manager_observation: LabManagerObservation
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
TemplateBuilder = Callable[[Any], dict[str, Any]]
|
| 88 |
+
|
| 89 |
+
_TEMPLATE_BUILDERS: dict[TemplateName, TemplateBuilder] = {
|
| 90 |
+
"math_reasoning": build_math_reasoning_template,
|
| 91 |
+
"ml_benchmark": build_ml_benchmark_template,
|
| 92 |
+
"finance_trading": build_finance_trading_template,
|
| 93 |
+
}
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def available_scenario_families() -> list[dict[str, Any]]:
    """List every registered scenario family with its supported difficulties."""
    families: list[dict[str, Any]] = []
    for family_name in _TEMPLATE_BUILDERS:
        families.append(
            {"family": family_name, "difficulties": ["easy", "medium", "hard"]}
        )
    return families
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
def load_template(template: TemplateName) -> TemplateBuilder:
    """Resolve *template* to its builder callable.

    Raises:
        ValueError: if the name is not a registered template (the original
            KeyError is chained as the cause).
    """
    try:
        builder = _TEMPLATE_BUILDERS[template]
    except KeyError as unknown_name:
        raise ValueError(f"Unknown scenario template: {template}") from unknown_name
    return builder
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
def apply_difficulty(
    draft: dict[str, Any],
    difficulty: Difficulty,
    rng: Any,
) -> dict[str, Any]:
    """Return a deep copy of *draft* scaled for *difficulty*.

    Easy loosens the budget; medium and hard tighten budget/time/staff and
    disable one or two resources (chosen via *rng*) while appending
    difficulty-induced conflict constraints. The input draft is never mutated.
    """
    adjusted = copy.deepcopy(draft)
    adjusted["difficulty"] = difficulty

    base_budget = float(draft["budget_total"])

    if difficulty == "easy":
        # Easy mode: 15% more budget, everything else untouched.
        adjusted["budget_total"] = round(base_budget * 1.15, 2)
    elif difficulty == "medium":
        # Medium mode: slight budget cut, one fewer day, one tightened resource.
        adjusted["budget_total"] = round(base_budget * 0.95, 2)
        adjusted["time_limit_days"] = max(1, int(draft["time_limit_days"]) - 1)
        _tighten_one_resource(adjusted["resources"], rng)
        _append_conflict_constraint(
            adjusted["constraints"],
            "One resource is partially constrained, so the plan must justify a fallback path.",
        )
    else:
        # Hard mode: 20% budget cut, fewer days and staff, two tightened
        # resources, and two extra hard conflict constraints.
        adjusted["budget_total"] = round(base_budget * 0.8, 2)
        adjusted["time_limit_days"] = max(1, int(draft["time_limit_days"]) - 1)
        adjusted["staff_count"] = max(1, int(draft["staff_count"]) - 1)
        _tighten_one_resource(adjusted["resources"], rng)
        _tighten_one_resource(adjusted["resources"], rng)
        _append_conflict_constraint(
            adjusted["constraints"],
            "At least one primary resource is unavailable, so the plan must use an allowed substitution or reduced scope.",
        )
        _append_conflict_constraint(
            adjusted["constraints"],
            "The final plan must remain concise because review capacity is limited under hard mode.",
        )

    return adjusted
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
def generate_scenario(
    seed: int,
    template: TemplateName,
    difficulty: Difficulty,
) -> NormalizedScenarioPack:
    """Deterministically build a validated scenario pack.

    The RNG is namespaced by template so the same seed yields independent
    draws across families.
    """
    template_rng = seed_rng(seed, namespace=f"scenario:{template}")
    builder = load_template(template)
    draft = builder(template_rng)
    adjusted = apply_difficulty(draft, difficulty, template_rng)
    return _build_pack(seed=seed, template=template, draft=adjusted)
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
def _build_pack(seed: int, template: TemplateName, draft: dict[str, Any]) -> NormalizedScenarioPack:
    """Validate a scaled draft dict into a NormalizedScenarioPack.

    Raises ValueError when the draft exceeds the configured MAX_BUDGET or
    MAX_ROUNDS caps. Expects the draft to carry a "difficulty" key (set by
    apply_difficulty) in addition to the template-builder fields.
    """
    # Validate the untyped draft lists into strict pydantic models first so
    # any schema drift in a template fails loudly here.
    constraints = [ScenarioConstraint.model_validate(item) for item in draft["constraints"]]
    resources = [ScenarioResource.model_validate(item) for item in draft["resources"]]
    substitutions = [
        AllowedSubstitution.model_validate(item)
        for item in draft["allowed_substitutions"]
    ]

    time_limit_days = int(draft["time_limit_days"])
    budget_total = float(draft["budget_total"])
    staff_count = int(draft["staff_count"])
    max_rounds = int(draft["max_rounds"])

    # Enforce global caps from replicalab.config.
    if budget_total > MAX_BUDGET:
        raise ValueError(
            f"Scenario budget {budget_total} exceeds configured MAX_BUDGET={MAX_BUDGET}."
        )
    if max_rounds > MAX_ROUNDS:
        raise ValueError(
            f"Scenario max_rounds {max_rounds} exceeds configured MAX_ROUNDS={MAX_ROUNDS}."
        )

    # Map resource categories onto the lab-manager observation's
    # equipment/reagent vocabulary.
    equipment_available, equipment_booked = _split_resources(
        resources,
        include_categories={"tool", "compute"},
    )
    reagents_in_stock, reagents_out_of_stock = _split_resources(
        resources,
        include_categories={"reference", "data", "personnel"},
    )

    # Soft constraints and known policy keys surface as safety restrictions.
    safety_restrictions = [
        constraint.details
        for constraint in constraints
        if not constraint.hard or constraint.key in {"live_execution", "evaluation_policy"}
    ]
    if not safety_restrictions:
        safety_restrictions = ["No policy exceptions are allowed."]

    # Round-0 observation for the scientist agent.
    scientist_observation = ScientistObservation(
        paper_title=draft["paper_title"],
        paper_hypothesis=draft["paper_hypothesis"],
        paper_method=draft["paper_method"],
        paper_key_finding=draft["paper_key_finding"],
        experiment_goal=draft["task_summary"],
        conversation_history=[],
        current_protocol=None,
        round_number=0,
        max_rounds=max_rounds,
    )

    # Round-0 observation for the lab-manager agent; budget starts untouched.
    lab_manager_observation = LabManagerObservation(
        budget_total=budget_total,
        budget_remaining=budget_total,
        equipment_available=equipment_available,
        equipment_booked=equipment_booked,
        reagents_in_stock=reagents_in_stock,
        reagents_out_of_stock=reagents_out_of_stock,
        staff_count=staff_count,
        time_limit_days=time_limit_days,
        safety_restrictions=safety_restrictions,
        conversation_history=[],
        current_protocol=None,
        round_number=0,
        max_rounds=max_rounds,
    )

    # Grader-only reference; never exposed to agents.
    hidden_reference = HiddenReferenceSpec(
        summary=draft["reference_summary"],
        required_elements=list(draft["required_elements"]),
        flexible_elements=list(draft["flexible_elements"]),
        target_metric=draft["target_metric"],
        target_value=draft["target_value"],
    )

    return NormalizedScenarioPack(
        scenario_id=f"{template}-{draft['difficulty']}-{seed}",
        template=template,
        domain_id=draft["domain_id"],
        difficulty=draft["difficulty"],
        seed=seed,
        task_summary=draft["task_summary"],
        success_criteria=list(draft["success_criteria"]),
        constraints=constraints,
        resources=resources,
        allowed_substitutions=substitutions,
        hidden_reference_spec=hidden_reference,
        scientist_observation=scientist_observation,
        lab_manager_observation=lab_manager_observation,
    )
|
| 249 |
+
|
| 250 |
+
|
| 251 |
+
def _split_resources(
    resources: list[ScenarioResource],
    *,
    include_categories: set[str],
) -> tuple[list[str], list[str]]:
    """Partition matching resources into (available, unavailable) label lists.

    Resources whose category is not in *include_categories* are ignored.
    Relative order from *resources* is preserved in both outputs.
    """
    relevant = [item for item in resources if item.category in include_categories]
    ready = [item.label for item in relevant if item.available]
    missing = [item.label for item in relevant if not item.available]
    return ready, missing
|
| 266 |
+
|
| 267 |
+
|
| 268 |
+
def _tighten_one_resource(resources: list[dict[str, Any]], rng: Any) -> None:
    """Mark one randomly chosen available resource as unavailable, in place.

    No-op when nothing is available. The selected dict gets
    ``available=False`` and an explanatory sentence appended to its details.
    """
    candidates = [item for item in resources if item.get("available", True)]
    if not candidates:
        return

    picked = rng.choice(candidates)
    picked["available"] = False
    picked["details"] = (
        f"{picked['details']} Availability is constrained under the current difficulty."
    )
|
| 283 |
+
|
| 284 |
+
|
| 285 |
+
def _append_conflict_constraint(
    constraints: list[dict[str, Any]],
    details: str,
) -> None:
    """Append a hard, policy-style conflict constraint to *constraints*.

    The key is numbered from the list's current length, so call order
    determines the suffix.
    """
    conflict_entry = {
        "key": f"conflict_{len(constraints) + 1}",
        "label": "Difficulty-induced conflict",
        "quantity": None,
        "unit": None,
        "comparator": "=",
        "hard": True,
        "details": details,
    }
    constraints.append(conflict_entry)
|
replicalab/utils/seed.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Deterministic seeding helpers shared by scenarios and the environment."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import hashlib
|
| 6 |
+
import random
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def get_deterministic_seed(seed: int, namespace: str = "") -> int:
    """Derive a stable child seed from a base seed plus namespace.

    The "seed:namespace" pair is hashed with SHA-256 and the first eight
    digest bytes are read as an unsigned big-endian integer, so the result
    is reproducible across processes and platforms (unlike ``hash()``).
    """
    digest = hashlib.sha256(f"{seed}:{namespace}".encode("utf-8")).digest()
    child_seed = int.from_bytes(digest[:8], byteorder="big", signed=False)
    return child_seed


def seed_rng(seed: int, namespace: str = "") -> random.Random:
    """Return a dedicated RNG instance seeded deterministically."""
    child_seed = get_deterministic_seed(seed, namespace)
    return random.Random(child_seed)
|
server/app.py
CHANGED
|
@@ -31,13 +31,25 @@ from fastapi import FastAPI, HTTPException, WebSocket, WebSocketDisconnect
|
|
| 31 |
from fastapi.middleware.cors import CORSMiddleware
|
| 32 |
from pydantic import BaseModel
|
| 33 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
from replicalab.models import (
|
| 35 |
EpisodeLog,
|
| 36 |
EpisodeState,
|
| 37 |
LabManagerObservation,
|
| 38 |
Observation,
|
|
|
|
| 39 |
ScientistAction,
|
| 40 |
ScientistObservation,
|
|
|
|
| 41 |
StepResult,
|
| 42 |
)
|
| 43 |
|
|
@@ -65,18 +77,18 @@ except ImportError:
|
|
| 65 |
log.warning("ReplicaLabEnv not found — using _StubEnv (replace when Person A ships env)")
|
| 66 |
|
| 67 |
|
| 68 |
-
def _reward_breakdown_from_state(state: EpisodeState) ->
|
| 69 |
-
return
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
"invalid_action": 0.0,
|
| 77 |
"timeout": 0.0,
|
| 78 |
},
|
| 79 |
-
|
| 80 |
|
| 81 |
|
| 82 |
def _build_episode_log(episode_id: str, state: EpisodeState) -> EpisodeLog:
|
|
@@ -113,29 +125,33 @@ class _StubEnv:
|
|
| 113 |
def reset(
|
| 114 |
self,
|
| 115 |
seed: int = 0,
|
| 116 |
-
scenario: str =
|
| 117 |
-
difficulty: str =
|
| 118 |
) -> Observation:
|
| 119 |
self._episode_id = str(uuid.uuid4())
|
| 120 |
self._logs = []
|
|
|
|
| 121 |
self._state = EpisodeState(
|
| 122 |
seed=seed,
|
| 123 |
scenario_template=scenario,
|
| 124 |
difficulty=difficulty,
|
| 125 |
-
paper_title=
|
| 126 |
paper_hypothesis="Compound X inhibits cell growth at 10 µM",
|
| 127 |
-
paper_method=
|
| 128 |
paper_key_finding="IC50 = 8.3 µM",
|
| 129 |
-
experiment_goal=
|
| 130 |
-
lab_budget_total=
|
| 131 |
-
lab_budget_remaining=
|
| 132 |
-
lab_equipment=
|
| 133 |
lab_reagents=["MTT reagent", "DMSO", "cell culture media"],
|
| 134 |
-
lab_staff_count=
|
| 135 |
-
lab_time_limit_days=
|
| 136 |
-
max_rounds=
|
| 137 |
round_number=0,
|
| 138 |
)
|
|
|
|
|
|
|
|
|
|
| 139 |
self._state.conversation_history = list(self._logs)
|
| 140 |
log.info("Stub reset | episode=%s seed=%d scenario=%s", self._episode_id, seed, scenario)
|
| 141 |
return self._make_observation()
|
|
@@ -150,7 +166,7 @@ class _StubEnv:
|
|
| 150 |
action.action_type == "accept"
|
| 151 |
or self._state.round_number >= self._state.max_rounds
|
| 152 |
)
|
| 153 |
-
reward =
|
| 154 |
if done:
|
| 155 |
self._state.done = True
|
| 156 |
self._state.agreement_reached = action.action_type == "accept"
|
|
@@ -163,11 +179,16 @@ class _StubEnv:
|
|
| 163 |
observation=self._make_observation(),
|
| 164 |
reward=reward,
|
| 165 |
done=done,
|
| 166 |
-
info=
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 171 |
)
|
| 172 |
|
| 173 |
def state(self) -> EpisodeState:
|
|
@@ -264,7 +285,7 @@ def _make_env() -> "_StubEnv":
|
|
| 264 |
# In-memory session store (REST sessions)
|
| 265 |
# ---------------------------------------------------------------------------
|
| 266 |
|
| 267 |
-
_SESSION_TTL_SECONDS =
|
| 268 |
|
| 269 |
_sessions: dict[str, dict[str, Any]] = {}
|
| 270 |
# { session_id: { "env": env_instance, "last_active": float, "episode_id": str } }
|
|
@@ -340,11 +361,7 @@ app.add_middleware(
|
|
| 340 |
# Available scenarios constant
|
| 341 |
# ---------------------------------------------------------------------------
|
| 342 |
|
| 343 |
-
SCENARIOS =
|
| 344 |
-
{"family": "cell_biology", "difficulties": ["easy", "medium", "hard"]},
|
| 345 |
-
{"family": "ml_benchmark", "difficulties": ["easy", "medium", "hard"]},
|
| 346 |
-
{"family": "behavioral_psych", "difficulties": ["easy", "medium", "hard"]},
|
| 347 |
-
]
|
| 348 |
|
| 349 |
# ---------------------------------------------------------------------------
|
| 350 |
# REST request/response schemas
|
|
@@ -353,8 +370,8 @@ SCENARIOS = [
|
|
| 353 |
|
| 354 |
class ResetRequest(BaseModel):
|
| 355 |
seed: int = 0
|
| 356 |
-
scenario: str =
|
| 357 |
-
difficulty: str =
|
| 358 |
session_id: Optional[str] = None # pass to reuse an existing session slot
|
| 359 |
|
| 360 |
|
|
@@ -451,7 +468,7 @@ async def get_replay(episode_id: str):
|
|
| 451 |
|
| 452 |
# WebSocket message protocol:
|
| 453 |
# Client → Server:
|
| 454 |
-
# { "type": "reset", "seed": 42, "scenario":
|
| 455 |
# { "type": "step", "action": { ...ScientistAction fields... } }
|
| 456 |
# { "type": "ping" }
|
| 457 |
#
|
|
@@ -461,14 +478,14 @@ async def get_replay(episode_id: str):
|
|
| 461 |
# { "type": "pong" }
|
| 462 |
# { "type": "error", "message": "..." }
|
| 463 |
|
| 464 |
-
_WS_IDLE_TIMEOUT =
|
| 465 |
|
| 466 |
|
| 467 |
async def _ws_send(ws: WebSocket, payload: dict) -> None:
|
| 468 |
await ws.send_text(json.dumps(payload))
|
| 469 |
|
| 470 |
|
| 471 |
-
def main(host: str =
|
| 472 |
import uvicorn
|
| 473 |
|
| 474 |
uvicorn.run("server.app:app", host=host, port=port, reload=False)
|
|
@@ -503,8 +520,8 @@ async def websocket_endpoint(ws: WebSocket):
|
|
| 503 |
|
| 504 |
elif msg_type == "reset":
|
| 505 |
seed = int(msg.get("seed", 0))
|
| 506 |
-
scenario = str(msg.get("scenario",
|
| 507 |
-
difficulty = str(msg.get("difficulty",
|
| 508 |
|
| 509 |
try:
|
| 510 |
obs = env.reset(seed=seed, scenario=scenario, difficulty=difficulty)
|
|
@@ -584,10 +601,10 @@ if __name__ == "__main__":
|
|
| 584 |
import argparse
|
| 585 |
|
| 586 |
parser = argparse.ArgumentParser()
|
| 587 |
-
parser.add_argument("--port", type=int, default=
|
| 588 |
-
parser.add_argument("--host", default=
|
| 589 |
args = parser.parse_args()
|
| 590 |
-
if args.host ==
|
| 591 |
main()
|
| 592 |
else:
|
| 593 |
main(host=args.host, port=args.port)
|
|
|
|
| 31 |
from fastapi.middleware.cors import CORSMiddleware
|
| 32 |
from pydantic import BaseModel
|
| 33 |
|
| 34 |
+
from replicalab.config import (
|
| 35 |
+
API_HOST,
|
| 36 |
+
API_PORT,
|
| 37 |
+
DEFAULT_DIFFICULTY,
|
| 38 |
+
DEFAULT_SCENARIO_TEMPLATE,
|
| 39 |
+
SESSION_TTL_SECONDS,
|
| 40 |
+
STUB_ACCEPT_REWARD,
|
| 41 |
+
WS_IDLE_TIMEOUT_SECONDS,
|
| 42 |
+
)
|
| 43 |
+
from replicalab.scenarios import available_scenario_families, generate_scenario
|
| 44 |
from replicalab.models import (
|
| 45 |
EpisodeLog,
|
| 46 |
EpisodeState,
|
| 47 |
LabManagerObservation,
|
| 48 |
Observation,
|
| 49 |
+
RewardBreakdown,
|
| 50 |
ScientistAction,
|
| 51 |
ScientistObservation,
|
| 52 |
+
StepInfo,
|
| 53 |
StepResult,
|
| 54 |
)
|
| 55 |
|
|
|
|
| 77 |
log.warning("ReplicaLabEnv not found — using _StubEnv (replace when Person A ships env)")
|
| 78 |
|
| 79 |
|
| 80 |
+
def _reward_breakdown_from_state(state: EpisodeState) -> RewardBreakdown:
    """Map the episode state's judge scores into a typed RewardBreakdown.

    Bonuses are fixed at zero until the judge integration lands; both
    penalty keys are pre-seeded so downstream consumers always see them.
    """
    zero_penalties = {
        "invalid_action": 0.0,
        "timeout": 0.0,
    }
    return RewardBreakdown(
        rigor=state.rigor_score,
        feasibility=state.feasibility_score,
        fidelity=state.fidelity_score,
        efficiency_bonus=0.0,
        communication_bonus=0.0,
        penalties=zero_penalties,
    )
|
| 92 |
|
| 93 |
|
| 94 |
def _build_episode_log(episode_id: str, state: EpisodeState) -> EpisodeLog:
|
|
|
|
| 125 |
def reset(
|
| 126 |
self,
|
| 127 |
seed: int = 0,
|
| 128 |
+
scenario: str = DEFAULT_SCENARIO_TEMPLATE,
|
| 129 |
+
difficulty: str = DEFAULT_DIFFICULTY,
|
| 130 |
) -> Observation:
|
| 131 |
self._episode_id = str(uuid.uuid4())
|
| 132 |
self._logs = []
|
| 133 |
+
pack = generate_scenario(seed=seed, template=scenario, difficulty=difficulty)
|
| 134 |
self._state = EpisodeState(
|
| 135 |
seed=seed,
|
| 136 |
scenario_template=scenario,
|
| 137 |
difficulty=difficulty,
|
| 138 |
+
paper_title=pack.scientist_observation.paper_title,
|
| 139 |
paper_hypothesis="Compound X inhibits cell growth at 10 µM",
|
| 140 |
+
paper_method=pack.scientist_observation.paper_method,
|
| 141 |
paper_key_finding="IC50 = 8.3 µM",
|
| 142 |
+
experiment_goal=pack.scientist_observation.experiment_goal,
|
| 143 |
+
lab_budget_total=pack.lab_manager_observation.budget_total,
|
| 144 |
+
lab_budget_remaining=pack.lab_manager_observation.budget_remaining,
|
| 145 |
+
lab_equipment=list(pack.lab_manager_observation.equipment_available),
|
| 146 |
lab_reagents=["MTT reagent", "DMSO", "cell culture media"],
|
| 147 |
+
lab_staff_count=pack.lab_manager_observation.staff_count,
|
| 148 |
+
lab_time_limit_days=pack.lab_manager_observation.time_limit_days,
|
| 149 |
+
max_rounds=pack.scientist_observation.max_rounds,
|
| 150 |
round_number=0,
|
| 151 |
)
|
| 152 |
+
self._state.paper_hypothesis = pack.scientist_observation.paper_hypothesis
|
| 153 |
+
self._state.paper_key_finding = pack.scientist_observation.paper_key_finding
|
| 154 |
+
self._state.lab_reagents = list(pack.lab_manager_observation.reagents_in_stock)
|
| 155 |
self._state.conversation_history = list(self._logs)
|
| 156 |
log.info("Stub reset | episode=%s seed=%d scenario=%s", self._episode_id, seed, scenario)
|
| 157 |
return self._make_observation()
|
|
|
|
| 166 |
action.action_type == "accept"
|
| 167 |
or self._state.round_number >= self._state.max_rounds
|
| 168 |
)
|
| 169 |
+
reward = STUB_ACCEPT_REWARD if done and action.action_type == "accept" else 0.0
|
| 170 |
if done:
|
| 171 |
self._state.done = True
|
| 172 |
self._state.agreement_reached = action.action_type == "accept"
|
|
|
|
| 179 |
observation=self._make_observation(),
|
| 180 |
reward=reward,
|
| 181 |
done=done,
|
| 182 |
+
info=StepInfo(
|
| 183 |
+
agreement_reached=self._state.agreement_reached,
|
| 184 |
+
error=None,
|
| 185 |
+
reward_breakdown=_reward_breakdown_from_state(self._state) if done else None,
|
| 186 |
+
judge_notes="Stub audit until judge integration lands." if done else None,
|
| 187 |
+
verdict=("accept" if self._state.agreement_reached else "revise") if done else None,
|
| 188 |
+
round=self._state.round_number,
|
| 189 |
+
stub=True,
|
| 190 |
+
episode_id=self._episode_id,
|
| 191 |
+
),
|
| 192 |
)
|
| 193 |
|
| 194 |
def state(self) -> EpisodeState:
|
|
|
|
| 285 |
# In-memory session store (REST sessions)
|
| 286 |
# ---------------------------------------------------------------------------
|
| 287 |
|
| 288 |
+
_SESSION_TTL_SECONDS = SESSION_TTL_SECONDS
|
| 289 |
|
| 290 |
_sessions: dict[str, dict[str, Any]] = {}
|
| 291 |
# { session_id: { "env": env_instance, "last_active": float, "episode_id": str } }
|
|
|
|
| 361 |
# Available scenarios constant
|
| 362 |
# ---------------------------------------------------------------------------
|
| 363 |
|
| 364 |
+
SCENARIOS = available_scenario_families()
|
|
|
|
|
|
|
|
|
|
|
|
|
| 365 |
|
| 366 |
# ---------------------------------------------------------------------------
|
| 367 |
# REST request/response schemas
|
|
|
|
| 370 |
|
| 371 |
class ResetRequest(BaseModel):
|
| 372 |
seed: int = 0
|
| 373 |
+
scenario: str = DEFAULT_SCENARIO_TEMPLATE
|
| 374 |
+
difficulty: str = DEFAULT_DIFFICULTY
|
| 375 |
session_id: Optional[str] = None # pass to reuse an existing session slot
|
| 376 |
|
| 377 |
|
|
|
|
| 468 |
|
| 469 |
# WebSocket message protocol:
|
| 470 |
# Client → Server:
|
| 471 |
+
# { "type": "reset", "seed": 42, "scenario": DEFAULT_SCENARIO_TEMPLATE, "difficulty": DEFAULT_DIFFICULTY }
|
| 472 |
# { "type": "step", "action": { ...ScientistAction fields... } }
|
| 473 |
# { "type": "ping" }
|
| 474 |
#
|
|
|
|
| 478 |
# { "type": "pong" }
|
| 479 |
# { "type": "error", "message": "..." }
|
| 480 |
|
| 481 |
+
_WS_IDLE_TIMEOUT = WS_IDLE_TIMEOUT_SECONDS
|
| 482 |
|
| 483 |
|
| 484 |
async def _ws_send(ws: WebSocket, payload: dict) -> None:
|
| 485 |
await ws.send_text(json.dumps(payload))
|
| 486 |
|
| 487 |
|
| 488 |
+
def main(host: str = API_HOST, port: int = API_PORT) -> None:
    """Run the ReplicaLab API server under uvicorn (blocks until stopped)."""
    # Imported lazily so merely importing this module never pulls in uvicorn.
    import uvicorn

    app_path = "server.app:app"
    uvicorn.run(app_path, host=host, port=port, reload=False)
|
|
|
|
| 520 |
|
| 521 |
elif msg_type == "reset":
|
| 522 |
seed = int(msg.get("seed", 0))
|
| 523 |
+
scenario = str(msg.get("scenario", DEFAULT_SCENARIO_TEMPLATE))
|
| 524 |
+
difficulty = str(msg.get("difficulty", DEFAULT_DIFFICULTY))
|
| 525 |
|
| 526 |
try:
|
| 527 |
obs = env.reset(seed=seed, scenario=scenario, difficulty=difficulty)
|
|
|
|
| 601 |
import argparse
|
| 602 |
|
| 603 |
parser = argparse.ArgumentParser()
|
| 604 |
+
parser.add_argument("--port", type=int, default=API_PORT)
|
| 605 |
+
parser.add_argument("--host", default=API_HOST)
|
| 606 |
args = parser.parse_args()
|
| 607 |
+
if args.host == API_HOST and args.port == API_PORT:
|
| 608 |
main()
|
| 609 |
else:
|
| 610 |
main(host=args.host, port=args.port)
|
tests/fixtures/golden_scenarios.json
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"id": "golden_math_easy",
|
| 4 |
+
"template": "math_reasoning",
|
| 5 |
+
"difficulty": "easy",
|
| 6 |
+
"seed": 101,
|
| 7 |
+
"expected_domain_id": "mathematics",
|
| 8 |
+
"expected_title_contains": "Jensen"
|
| 9 |
+
},
|
| 10 |
+
{
|
| 11 |
+
"id": "golden_ml_medium",
|
| 12 |
+
"template": "ml_benchmark",
|
| 13 |
+
"difficulty": "medium",
|
| 14 |
+
"seed": 202,
|
| 15 |
+
"expected_domain_id": "machine_learning",
|
| 16 |
+
"expected_title_contains": "CIFAR-10"
|
| 17 |
+
},
|
| 18 |
+
{
|
| 19 |
+
"id": "golden_finance_hard",
|
| 20 |
+
"template": "finance_trading",
|
| 21 |
+
"difficulty": "hard",
|
| 22 |
+
"seed": 303,
|
| 23 |
+
"expected_domain_id": "finance_trading",
|
| 24 |
+
"expected_title_contains": "momentum"
|
| 25 |
+
}
|
| 26 |
+
]
|
tests/test_config.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from replicalab.config import (
|
| 4 |
+
DEFAULT_DIFFICULTY,
|
| 5 |
+
DEFAULT_SCENARIO_TEMPLATE,
|
| 6 |
+
MAX_BUDGET,
|
| 7 |
+
MAX_ROUNDS,
|
| 8 |
+
SESSION_TTL_SECONDS,
|
| 9 |
+
WS_IDLE_TIMEOUT_SECONDS,
|
| 10 |
+
)
|
| 11 |
+
from replicalab.scenarios import generate_scenario
|
| 12 |
+
from server.app import ResetRequest
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def test_reset_request_defaults_match_shared_config() -> None:
    """The REST reset schema must default to the shared config values."""
    req = ResetRequest()

    assert req.scenario == DEFAULT_SCENARIO_TEMPLATE
    assert req.difficulty == DEFAULT_DIFFICULTY


def test_generated_scenarios_respect_shared_round_and_budget_caps() -> None:
    """Every template/difficulty combination stays within the shared caps."""
    templates = ("math_reasoning", "ml_benchmark", "finance_trading")
    difficulties = ("easy", "medium", "hard")
    for template in templates:
        for difficulty in difficulties:
            pack = generate_scenario(seed=123, template=template, difficulty=difficulty)
            assert pack.scientist_observation.max_rounds == MAX_ROUNDS
            assert pack.lab_manager_observation.max_rounds == MAX_ROUNDS
            assert pack.lab_manager_observation.budget_total <= MAX_BUDGET


def test_timeout_exports_share_the_same_default_value() -> None:
    """REST session TTL and WS idle timeout are intentionally kept equal."""
    assert SESSION_TTL_SECONDS == WS_IDLE_TIMEOUT_SECONDS
|
tests/test_scenarios.py
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
|
| 5 |
+
from replicalab.scenarios import (
|
| 6 |
+
GOLDEN_SCENARIO_SPECS_PATH,
|
| 7 |
+
available_scenario_families,
|
| 8 |
+
generate_scenario,
|
| 9 |
+
)
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def test_generate_scenario_is_deterministic_for_same_seed() -> None:
    """Identical (seed, template, difficulty) inputs yield identical packs."""
    pack_a = generate_scenario(seed=101, template="math_reasoning", difficulty="easy")
    pack_b = generate_scenario(seed=101, template="math_reasoning", difficulty="easy")

    assert pack_a.model_dump(mode="json") == pack_b.model_dump(mode="json")


def test_generate_scenario_varies_across_seeded_cases() -> None:
    """Different seeds must produce observably different scenarios."""
    pack_a = generate_scenario(seed=101, template="math_reasoning", difficulty="easy")
    pack_b = generate_scenario(seed=102, template="math_reasoning", difficulty="easy")

    assert pack_a.scientist_observation.paper_title != pack_b.scientist_observation.paper_title


def test_available_scenario_families_exposes_three_domain_families() -> None:
    """The registry advertises exactly the three domain families, in order."""
    expected = [
        {"family": family, "difficulties": ["easy", "medium", "hard"]}
        for family in ("math_reasoning", "ml_benchmark", "finance_trading")
    ]
    assert available_scenario_families() == expected


def test_hard_finance_scenario_exposes_unavailable_resource_and_safety_rules() -> None:
    """Hard finance scenarios must surface scarcity and safety restrictions."""
    pack = generate_scenario(seed=303, template="finance_trading", difficulty="hard")
    manager_view = pack.lab_manager_observation

    assert any(not res.available for res in pack.resources)
    assert manager_view.reagents_out_of_stock
    assert manager_view.safety_restrictions


def test_difficulty_levels_mechanically_change_budget_and_constraints() -> None:
    """Budget shrinks and constraint count grows as difficulty rises."""
    levels = ("easy", "medium", "hard")
    packs = {
        level: generate_scenario(seed=202, template="ml_benchmark", difficulty=level)
        for level in levels
    }
    budgets = [packs[level].lab_manager_observation.budget_total for level in levels]
    constraint_counts = [len(packs[level].constraints) for level in levels]

    assert budgets[0] > budgets[1] > budgets[2]
    assert constraint_counts[0] < constraint_counts[1] < constraint_counts[2]


def test_generated_scenarios_keep_unique_constraint_and_resource_keys() -> None:
    """Constraint/resource keys stay unique; grading metadata is populated."""
    for template in ("math_reasoning", "ml_benchmark", "finance_trading"):
        pack = generate_scenario(seed=303, template=template, difficulty="hard")
        constraint_keys = [constraint.key for constraint in pack.constraints]
        resource_keys = [resource.key for resource in pack.resources]
        assert len(set(constraint_keys)) == len(constraint_keys)
        assert len(set(resource_keys)) == len(resource_keys)
        assert pack.hidden_reference_spec.required_elements
        assert pack.allowed_substitutions


def test_golden_scenario_specs_exist_for_manual_prompt_checks() -> None:
    """The golden-spec fixture holds the three known entries, in order."""
    specs = json.loads(GOLDEN_SCENARIO_SPECS_PATH.read_text(encoding="utf-8"))

    assert len(specs) == 3
    assert [spec["id"] for spec in specs] == [
        "golden_math_easy",
        "golden_ml_medium",
        "golden_finance_hard",
    ]


def test_golden_scenarios_match_expected_title_and_domain() -> None:
    """Regenerating each golden spec reproduces its recorded domain/title."""
    specs = json.loads(GOLDEN_SCENARIO_SPECS_PATH.read_text(encoding="utf-8"))

    for spec in specs:
        pack = generate_scenario(
            seed=spec["seed"],
            template=spec["template"],
            difficulty=spec["difficulty"],
        )
        assert pack.domain_id == spec["expected_domain_id"]
        assert spec["expected_title_contains"] in pack.scientist_observation.paper_title
|
tests/test_scientist_policy.py
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import pytest
|
| 4 |
+
|
| 5 |
+
from replicalab.agents.scientist_policy import (
|
| 6 |
+
ScientistOutputParseError,
|
| 7 |
+
build_scientist_system_prompt,
|
| 8 |
+
parse_scientist_output,
|
| 9 |
+
)
|
| 10 |
+
from replicalab.models import ScientistActionType
|
| 11 |
+
from replicalab.scenarios import generate_scenario
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def test_parse_scientist_output_accepts_plain_json() -> None:
    """A bare JSON object parses into a typed request_info action."""
    payload = """
    {
      "action_type": "request_info",
      "sample_size": 0,
      "controls": [],
      "technique": "",
      "duration_days": 0,
      "required_equipment": [],
      "required_reagents": [],
      "questions": ["What compute budget is available?"],
      "rationale": ""
    }
    """

    parsed = parse_scientist_output(payload)

    assert parsed.action_type is ScientistActionType.REQUEST_INFO
    assert parsed.questions == ["What compute budget is available?"]


def test_parse_scientist_output_accepts_fenced_json_with_prose() -> None:
    """A fenced ```json block surrounded by prose is still extracted."""
    payload = """
    I would revise the plan as follows:

    ```json
    {
      "action_type": "revise_protocol",
      "sample_size": 24,
      "controls": ["baseline", "ablation"],
      "technique": "small_scale_backtest",
      "duration_days": 3,
      "required_equipment": ["gpu_node"],
      "required_reagents": [],
      "questions": [],
      "rationale": "Shrink the trial to fit the available compute window."
    }
    ```
    """

    parsed = parse_scientist_output(payload)

    assert parsed.action_type is ScientistActionType.REVISE_PROTOCOL
    assert parsed.technique == "small_scale_backtest"


def test_parse_scientist_output_raises_explicit_error_when_json_is_missing() -> None:
    """Plain prose with no JSON object yields the no_json error code."""
    with pytest.raises(ScientistOutputParseError) as exc_info:
        parse_scientist_output("I need more context before I can answer.")

    error = exc_info.value
    assert error.code == "no_json"
    assert "does not contain a JSON object" in error.message


def test_parse_scientist_output_raises_explicit_error_when_json_is_invalid() -> None:
    """Malformed JSON (trailing comma) yields the invalid_json error code."""
    payload = """
    ```json
    {
      "action_type": "request_info",
      "questions": ["What budget do we have?"],
    }
    ```
    """

    with pytest.raises(ScientistOutputParseError) as exc_info:
        parse_scientist_output(payload)

    error = exc_info.value
    assert error.code == "invalid_json"
    assert "could not be decoded" in error.message


def test_parse_scientist_output_raises_explicit_error_when_schema_is_invalid() -> None:
    """Well-formed JSON that fails model validation yields invalid_action."""
    # request_info with an empty questions list violates the action schema.
    payload = """
    {
      "action_type": "request_info",
      "sample_size": 0,
      "controls": [],
      "technique": "",
      "duration_days": 0,
      "required_equipment": [],
      "required_reagents": [],
      "questions": [],
      "rationale": ""
    }
    """

    with pytest.raises(ScientistOutputParseError) as exc_info:
        parse_scientist_output(payload)

    error = exc_info.value
    assert error.code == "invalid_action"
    assert "ScientistAction validation" in error.message


def test_build_scientist_system_prompt_uses_normalized_scenario_data() -> None:
    """The system prompt embeds the normalized scenario fields and action menu."""
    scenario = generate_scenario(seed=202, template="ml_benchmark", difficulty="medium")

    prompt = build_scientist_system_prompt(scenario)

    assert "You are the Scientist agent in ReplicaLab." in prompt
    assert scenario.task_summary in prompt
    assert scenario.success_criteria[0] in prompt
    assert scenario.resources[0].label in prompt
    assert "action_type values" in prompt
    assert "propose_protocol" in prompt
    assert "request_info" in prompt
|