Spaces:
Running
Running
Add MOD 11 typed StepInfo/RewardBreakdown and import completed foundation modules
MOD 11: Replace untyped StepResult.info dict with StepInfo model
(extra="allow") and RewardBreakdown model with constrained score
fields. Stub server now explicitly constructs typed objects.
Also imports previously completed work: config.py (MOD 12),
seed.py (SCN 01), normalized scenario layer with three domain
adapters (SCN 02-09), scientist policy parser, and all tests.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
- replicalab/agents/__init__.py +13 -0
- replicalab/agents/scientist_policy.py +290 -0
- replicalab/config.py +26 -0
- replicalab/models.py +34 -2
- replicalab/scenarios/__init__.py +25 -0
- replicalab/scenarios/finance_trading.py +214 -0
- replicalab/scenarios/math_reasoning.py +214 -0
- replicalab/scenarios/ml_benchmark.py +214 -0
- replicalab/scenarios/templates.py +299 -0
- replicalab/utils/seed.py +20 -0
- server/app.py +59 -42
- tests/fixtures/golden_scenarios.json +26 -0
- tests/test_config.py +32 -0
- tests/test_scenarios.py +84 -0
- tests/test_scientist_policy.py +118 -0
replicalab/agents/__init__.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Agent policy helpers exposed as importable modules."""
|
| 2 |
+
|
| 3 |
+
from .scientist_policy import (
|
| 4 |
+
ScientistOutputParseError,
|
| 5 |
+
build_scientist_system_prompt,
|
| 6 |
+
parse_scientist_output,
|
| 7 |
+
)
|
| 8 |
+
|
| 9 |
+
__all__ = [
|
| 10 |
+
"ScientistOutputParseError",
|
| 11 |
+
"build_scientist_system_prompt",
|
| 12 |
+
"parse_scientist_output",
|
| 13 |
+
]
|
replicalab/agents/scientist_policy.py
ADDED
|
@@ -0,0 +1,290 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Scientist policy helpers.
|
| 2 |
+
|
| 3 |
+
MOD 09 introduced strict parsing from raw model output into
|
| 4 |
+
``ScientistAction``. AGT 01 adds the first domain-neutral system prompt
|
| 5 |
+
builder so prompt assembly can be driven by the normalized scenario pack
|
| 6 |
+
instead of hard-coded domain text.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
from __future__ import annotations
|
| 10 |
+
|
| 11 |
+
import json
|
| 12 |
+
import re
|
| 13 |
+
from typing import Any, Literal, Mapping
|
| 14 |
+
|
| 15 |
+
from pydantic import ValidationError
|
| 16 |
+
|
| 17 |
+
from replicalab.models import ScientistAction, ScientistActionType
|
| 18 |
+
from replicalab.scenarios import NormalizedScenarioPack
|
| 19 |
+
|
| 20 |
+
_JSON_FENCE_RE = re.compile(r"```(?:json)?\s*(.*?)```", re.IGNORECASE | re.DOTALL)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class ScientistOutputParseError(ValueError):
    """Explicit parser error for malformed or invalid Scientist output.

    Attributes:
        code: Machine-readable failure category.
        message: Human-readable explanation (also the exception message).
        raw_text: The unmodified model output that failed to parse.
        parsed_payload: The decoded JSON object when decoding succeeded but
            validation failed; otherwise ``None``.
    """

    def __init__(
        self,
        code: Literal["no_json", "invalid_json", "invalid_action"],
        message: str,
        raw_text: str,
        *,
        parsed_payload: dict[str, Any] | None = None,
    ) -> None:
        super().__init__(message)
        self.code = code
        self.message = message
        self.raw_text = raw_text
        self.parsed_payload = parsed_payload

    def to_dict(self) -> dict[str, Any]:
        """Return a stable error shape for callers and future retries."""
        field_names = ("code", "message", "raw_text", "parsed_payload")
        return {name: getattr(self, name) for name in field_names}
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def build_scientist_system_prompt(
    scenario: NormalizedScenarioPack | Mapping[str, Any],
) -> str:
    """Build a domain-neutral Scientist system prompt from normalized data.

    Args:
        scenario: A ``NormalizedScenarioPack`` or a mapping that validates
            into one (see ``_coerce_scenario_pack``).

    Returns:
        Prompt text assembled from blank-line-separated sections.
    """

    pack = _coerce_scenario_pack(scenario)
    # Enumerate every legal action value so the model never guesses the enum.
    allowed_actions = ", ".join(action.value for action in ScientistActionType)

    sections = [
        "You are the Scientist agent in ReplicaLab.",
        (
            "Your job is to negotiate toward the strongest feasible plan under the "
            "provided constraints. You do not invent resources, loosen constraints, "
            "or assume access to hidden ground truth."
        ),
        f"Domain: {pack.domain_id}",
        f"Task: {pack.task_summary}",
        "Success criteria:",
        _render_bullets(pack.success_criteria),
        "Constraints:",
        _render_constraints(pack),
        "Available resources:",
        _render_resources(pack),
        "Allowed substitutions:",
        _render_substitutions(pack),
        (
            "Output contract: return exactly one JSON object with all "
            "ScientistAction fields and no extra keys."
        ),
        f"Allowed action_type values: {allowed_actions}.",
        (
            "Use propose_protocol or revise_protocol only when you can provide a full "
            "protocol payload. Use request_info only when a blocking question remains. "
            "Use accept only when the plan is ready without further edits."
        ),
        (
            "For propose_protocol and revise_protocol, the JSON must include: "
            "sample_size >= 1, controls, technique, duration_days >= 0, "
            "required_equipment, required_reagents, questions = [], and rationale."
        ),
        (
            "For request_info, all protocol fields must stay empty or zero and "
            "questions must contain at least one concrete question."
        ),
        (
            "For accept, questions must be empty and protocol-edit fields must stay "
            "empty or zero."
        ),
    ]

    # Skip empty section strings (e.g. a renderer returning "") so the joined
    # prompt never contains doubled blank separators.
    return "\n\n".join(section for section in sections if section)
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def parse_scientist_output(raw_text: str) -> ScientistAction:
    """Parse raw model text into a validated ``ScientistAction``.

    The parser accepts:
    - plain JSON objects
    - fenced JSON blocks
    - prose that contains one JSON object

    Raises:
        ScientistOutputParseError: with code ``no_json``/``invalid_json``
            (propagated from payload extraction) or ``invalid_action`` when
            the decoded object fails ``ScientistAction`` validation.
    """

    payload = _parse_json_payload(raw_text)
    try:
        return ScientistAction.model_validate(payload)
    except ValidationError as exc:
        # Keep both the raw text and the decoded payload on the error so
        # callers can build a retry prompt without re-parsing.
        raise ScientistOutputParseError(
            "invalid_action",
            _format_validation_error(exc),
            raw_text,
            parsed_payload=payload,
        ) from exc
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
def _parse_json_payload(raw_text: str) -> dict[str, Any]:
    """Extract and decode the first JSON object found in *raw_text*.

    Raises:
        ScientistOutputParseError: ``no_json`` when the text is empty or has
            no JSON-like candidates; ``invalid_json`` when JSON-like text is
            present but cannot be decoded, or decodes to a non-object value.
    """
    if not raw_text.strip():
        raise ScientistOutputParseError(
            "no_json",
            "Scientist output is empty and does not contain a JSON object.",
            raw_text,
        )

    saw_json_like_text = False
    last_json_error: json.JSONDecodeError | None = None

    for candidate in _iter_json_candidates(raw_text):
        saw_json_like_text = True
        try:
            decoded = json.loads(candidate)
        except json.JSONDecodeError as exc:
            # Remember the failure but keep trying later candidates.
            last_json_error = exc
            continue

        # A decodable scalar or array is still not a valid action payload.
        if not isinstance(decoded, dict):
            raise ScientistOutputParseError(
                "invalid_json",
                "Scientist output must decode to a JSON object.",
                raw_text,
            )
        return decoded

    # Candidates existed but none decoded: surface the last decode failure.
    if saw_json_like_text and last_json_error is not None:
        raise ScientistOutputParseError(
            "invalid_json",
            (
                "Scientist output contains JSON-like text but it could not be decoded: "
                f"{last_json_error.msg} at line {last_json_error.lineno}, "
                f"column {last_json_error.colno}."
            ),
            raw_text,
        ) from last_json_error

    raise ScientistOutputParseError(
        "no_json",
        "Scientist output does not contain a JSON object.",
        raw_text,
    )
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
def _iter_json_candidates(raw_text: str) -> list[str]:
    """Collect de-duplicated JSON-object candidate strings from *raw_text*.

    Order matters to the caller: the whole text (only when it already looks
    like bare JSON or a fence), then the first balanced brace span, then the
    contents of each ``` fenced block (and its first brace span).
    """
    candidates: list[str] = []
    seen: set[str] = set()

    def add(candidate: str | None) -> None:
        # Normalize and de-duplicate so the caller never decodes the same
        # text twice.
        if candidate is None:
            return
        cleaned = candidate.strip()
        if not cleaned or cleaned in seen:
            return
        seen.add(cleaned)
        candidates.append(cleaned)

    stripped = raw_text.strip()
    if stripped.startswith("{") or stripped.startswith("```"):
        add(raw_text)
    add(_extract_first_json_object(raw_text))

    for match in _JSON_FENCE_RE.finditer(raw_text):
        fenced = match.group(1)
        add(fenced)
        add(_extract_first_json_object(fenced))

    return candidates
|
| 194 |
+
|
| 195 |
+
|
| 196 |
+
def _extract_first_json_object(text: str) -> str | None:
|
| 197 |
+
start = text.find("{")
|
| 198 |
+
if start < 0:
|
| 199 |
+
return None
|
| 200 |
+
|
| 201 |
+
depth = 0
|
| 202 |
+
in_string = False
|
| 203 |
+
escaped = False
|
| 204 |
+
|
| 205 |
+
for index in range(start, len(text)):
|
| 206 |
+
char = text[index]
|
| 207 |
+
|
| 208 |
+
if in_string:
|
| 209 |
+
if escaped:
|
| 210 |
+
escaped = False
|
| 211 |
+
elif char == "\\":
|
| 212 |
+
escaped = True
|
| 213 |
+
elif char == '"':
|
| 214 |
+
in_string = False
|
| 215 |
+
continue
|
| 216 |
+
|
| 217 |
+
if char == '"':
|
| 218 |
+
in_string = True
|
| 219 |
+
elif char == "{":
|
| 220 |
+
depth += 1
|
| 221 |
+
elif char == "}":
|
| 222 |
+
depth -= 1
|
| 223 |
+
if depth == 0:
|
| 224 |
+
return text[start : index + 1]
|
| 225 |
+
|
| 226 |
+
return None
|
| 227 |
+
|
| 228 |
+
|
| 229 |
+
def _format_validation_error(error: ValidationError) -> str:
|
| 230 |
+
parts: list[str] = []
|
| 231 |
+
for item in error.errors():
|
| 232 |
+
path = ".".join(str(segment) for segment in item.get("loc", ()))
|
| 233 |
+
message = item.get("msg", "Validation error")
|
| 234 |
+
parts.append(f"{path}: {message}" if path else message)
|
| 235 |
+
|
| 236 |
+
detail = "; ".join(parts) if parts else str(error)
|
| 237 |
+
return f"Scientist output JSON failed ScientistAction validation: {detail}"
|
| 238 |
+
|
| 239 |
+
|
| 240 |
+
def _coerce_scenario_pack(
    scenario: NormalizedScenarioPack | Mapping[str, Any],
) -> NormalizedScenarioPack:
    """Return *scenario* as a ``NormalizedScenarioPack``, validating mappings."""
    if isinstance(scenario, NormalizedScenarioPack):
        return scenario
    return NormalizedScenarioPack.model_validate(scenario)
|
| 246 |
+
|
| 247 |
+
|
| 248 |
+
def _render_bullets(items: list[str]) -> str:
|
| 249 |
+
return "\n".join(f"- {item}" for item in items)
|
| 250 |
+
|
| 251 |
+
|
| 252 |
+
def _render_constraints(pack: NormalizedScenarioPack) -> str:
|
| 253 |
+
lines = []
|
| 254 |
+
for constraint in pack.constraints:
|
| 255 |
+
amount = ""
|
| 256 |
+
if constraint.quantity is not None:
|
| 257 |
+
unit = f" {constraint.unit}" if constraint.unit else ""
|
| 258 |
+
amount = f" ({constraint.comparator} {constraint.quantity}{unit})"
|
| 259 |
+
hardness = "hard" if constraint.hard else "soft"
|
| 260 |
+
lines.append(f"- [{hardness}] {constraint.label}{amount}: {constraint.details}")
|
| 261 |
+
return "\n".join(lines)
|
| 262 |
+
|
| 263 |
+
|
| 264 |
+
def _render_resources(pack: NormalizedScenarioPack) -> str:
|
| 265 |
+
lines = []
|
| 266 |
+
for resource in pack.resources:
|
| 267 |
+
availability = "available" if resource.available else "unavailable"
|
| 268 |
+
amount = ""
|
| 269 |
+
if resource.quantity is not None:
|
| 270 |
+
unit = f" {resource.unit}" if resource.unit else ""
|
| 271 |
+
amount = f" ({resource.quantity}{unit})"
|
| 272 |
+
lines.append(
|
| 273 |
+
f"- [{availability}] {resource.label}{amount}: {resource.details}"
|
| 274 |
+
)
|
| 275 |
+
return "\n".join(lines)
|
| 276 |
+
|
| 277 |
+
|
| 278 |
+
def _render_substitutions(pack: NormalizedScenarioPack) -> str:
|
| 279 |
+
if not pack.allowed_substitutions:
|
| 280 |
+
return "- No substitutions are pre-approved."
|
| 281 |
+
|
| 282 |
+
lines = []
|
| 283 |
+
for substitution in pack.allowed_substitutions:
|
| 284 |
+
lines.append(
|
| 285 |
+
(
|
| 286 |
+
f"- {substitution.original} -> {substitution.alternative}. "
|
| 287 |
+
f"Condition: {substitution.condition} Tradeoff: {substitution.tradeoff}"
|
| 288 |
+
)
|
| 289 |
+
)
|
| 290 |
+
return "\n".join(lines)
|
replicalab/config.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Shared configuration constants for ReplicaLab.
|
| 2 |
+
|
| 3 |
+
MOD 12 centralizes the small set of repo-wide defaults that were previously
|
| 4 |
+
scattered across the stub server and scenario builders. Future environment,
|
| 5 |
+
scoring, and client modules should import from here instead of introducing
|
| 6 |
+
new magic numbers.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
from __future__ import annotations
|
| 10 |
+
|
| 11 |
+
DEFAULT_SCENARIO_TEMPLATE = "math_reasoning"
|
| 12 |
+
DEFAULT_DIFFICULTY = "easy"
|
| 13 |
+
|
| 14 |
+
MAX_ROUNDS = 6
|
| 15 |
+
MAX_BUDGET = 5000.0
|
| 16 |
+
|
| 17 |
+
TIMEOUT_SECONDS = 300
|
| 18 |
+
ROUND_TIME_LIMIT_SECONDS = 300
|
| 19 |
+
|
| 20 |
+
SESSION_TTL_SECONDS = TIMEOUT_SECONDS
|
| 21 |
+
WS_IDLE_TIMEOUT_SECONDS = TIMEOUT_SECONDS
|
| 22 |
+
|
| 23 |
+
STUB_ACCEPT_REWARD = 5.0
|
| 24 |
+
|
| 25 |
+
API_HOST = "0.0.0.0"
|
| 26 |
+
API_PORT = 7860
|
replicalab/models.py
CHANGED
|
@@ -307,18 +307,50 @@ class Observation(BaseModel):
|
|
| 307 |
lab_manager: Optional[LabManagerObservation]
|
| 308 |
|
| 309 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 310 |
# ---------------------------------------------------------------------------
|
| 311 |
# Step result
|
| 312 |
# ---------------------------------------------------------------------------
|
| 313 |
|
| 314 |
class StepResult(BaseModel):
|
| 315 |
"""Returned by env.step(). Contains the next observation, reward,
|
| 316 |
-
termination flag, and
|
| 317 |
|
| 318 |
observation: Optional[Observation] = None
|
| 319 |
reward: float = 0.0
|
| 320 |
done: bool = False
|
| 321 |
-
info:
|
| 322 |
|
| 323 |
|
| 324 |
# ---------------------------------------------------------------------------
|
|
|
|
| 307 |
lab_manager: Optional[LabManagerObservation]
|
| 308 |
|
| 309 |
|
| 310 |
+
# ---------------------------------------------------------------------------
|
| 311 |
+
# Reward breakdown and step metadata
|
| 312 |
+
# ---------------------------------------------------------------------------
|
| 313 |
+
|
| 314 |
+
|
| 315 |
+
class RewardBreakdown(BaseModel):
    """Component scores and adjustments produced by the judge rubric engine.

    ``rigor``, ``feasibility``, and ``fidelity`` are constrained to [0, 1];
    the bonus fields and ``penalties`` values are unconstrained floats.
    """

    rigor: float = Field(default=0.0, ge=0, le=1)
    feasibility: float = Field(default=0.0, ge=0, le=1)
    fidelity: float = Field(default=0.0, ge=0, le=1)
    efficiency_bonus: float = 0.0
    communication_bonus: float = 0.0
    # Per-reason penalty amounts keyed by label; sign convention is not
    # enforced here — TODO confirm with the rubric engine.
    penalties: dict[str, float] = Field(default_factory=dict)
|
| 324 |
+
|
| 325 |
+
|
| 326 |
+
class StepInfo(BaseModel):
    """Typed metadata returned alongside each step result.

    Reserved keys from the frozen contract are typed fields.
    Additional debug or runtime metadata is allowed via extra="allow".
    """

    # Unknown keys are retained rather than rejected, so runtime debug
    # metadata can ride along without a schema change.
    model_config = ConfigDict(extra="allow")

    agreement_reached: bool = False
    error: Optional[str] = None
    reward_breakdown: Optional[RewardBreakdown] = None
    judge_notes: Optional[str] = None
    verdict: Optional[str] = None
|
| 340 |
+
|
| 341 |
+
|
| 342 |
# ---------------------------------------------------------------------------
|
| 343 |
# Step result
|
| 344 |
# ---------------------------------------------------------------------------
|
| 345 |
|
| 346 |
class StepResult(BaseModel):
    """Returned by env.step(). Contains the next observation, reward,
    termination flag, and typed step info."""

    observation: Optional[Observation] = None
    reward: float = 0.0
    done: bool = False
    # default_factory keeps each StepResult's info instance independent.
    info: StepInfo = Field(default_factory=StepInfo)
|
| 354 |
|
| 355 |
|
| 356 |
# ---------------------------------------------------------------------------
|
replicalab/scenarios/__init__.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Scenario generation exports."""
|
| 2 |
+
|
| 3 |
+
from .templates import (
|
| 4 |
+
GOLDEN_SCENARIO_SPECS_PATH,
|
| 5 |
+
HiddenReferenceSpec,
|
| 6 |
+
NormalizedScenarioPack,
|
| 7 |
+
ScenarioConstraint,
|
| 8 |
+
ScenarioResource,
|
| 9 |
+
available_scenario_families,
|
| 10 |
+
apply_difficulty,
|
| 11 |
+
generate_scenario,
|
| 12 |
+
load_template,
|
| 13 |
+
)
|
| 14 |
+
|
| 15 |
+
__all__ = [
|
| 16 |
+
"GOLDEN_SCENARIO_SPECS_PATH",
|
| 17 |
+
"HiddenReferenceSpec",
|
| 18 |
+
"NormalizedScenarioPack",
|
| 19 |
+
"ScenarioConstraint",
|
| 20 |
+
"ScenarioResource",
|
| 21 |
+
"available_scenario_families",
|
| 22 |
+
"apply_difficulty",
|
| 23 |
+
"generate_scenario",
|
| 24 |
+
"load_template",
|
| 25 |
+
]
|
replicalab/scenarios/finance_trading.py
ADDED
|
@@ -0,0 +1,214 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Finance and trading planning scenario templates."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import random
|
| 6 |
+
from typing import Any
|
| 7 |
+
|
| 8 |
+
from replicalab.config import MAX_ROUNDS
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def build_finance_trading_template(rng: random.Random) -> dict[str, Any]:
    """Return one finance/trading scenario template chosen with *rng*.

    Each case is a plain dict matching the normalized scenario schema
    (task framing, success criteria, constraints, resources, substitutions,
    and budget/round limits). Using the caller-supplied RNG keeps scenario
    selection seedable and reproducible.
    """
    cases = [
        {
            "domain_id": "finance_trading",
            "paper_title": "Planning an offline mean-reversion backtest for SPY and QQQ",
            "paper_hypothesis": "A simple mean-reversion design can be evaluated fairly without live execution.",
            "paper_method": "Run an offline daily-bar backtest with transaction costs, slippage assumptions, and fixed entry rules.",
            "paper_key_finding": "The plan is accepted only if risk limits and evaluation hygiene remain explicit.",
            "task_summary": "Design a mean-reversion backtest workflow for SPY and QQQ under capital, drawdown, and deadline limits.",
            "success_criteria": [
                "Use only offline historical data with explicit slippage assumptions.",
                "Keep position sizing inside the stated capital and drawdown rules.",
                "Separate strategy design from final evaluation.",
            ],
            "reference_summary": "A valid plan keeps the workflow offline, constrains drawdown, and documents slippage assumptions.",
            "required_elements": [
                "offline historical data only",
                "transaction cost assumption",
                "drawdown guardrail",
                "final evaluation split",
            ],
            "flexible_elements": [
                "lookback window",
                "entry threshold",
                "report visualization format",
            ],
            "target_metric": "risk_adjusted_return",
            "target_value": "positive Sharpe with drawdown inside the guardrail",
            "constraints": [
                {
                    "key": "max_capital",
                    "label": "Maximum simulated capital",
                    "quantity": 50000,
                    "unit": "usd",
                    "comparator": "<=",
                    "hard": True,
                    "details": "The simulation must stay within the stated capital cap.",
                },
                {
                    "key": "max_drawdown",
                    "label": "Maximum allowed drawdown",
                    "quantity": 8,
                    "unit": "percent",
                    "comparator": "<=",
                    "hard": True,
                    "details": "Any accepted plan must respect the drawdown guardrail.",
                },
                {
                    # No quantity/unit: this is a categorical (mode) constraint.
                    "key": "live_execution",
                    "label": "Execution mode",
                    "quantity": None,
                    "unit": None,
                    "comparator": "=",
                    "hard": True,
                    "details": "Only offline or backtest planning is allowed. No live trading.",
                },
            ],
            "resources": [
                {
                    "key": "historical_bars",
                    "label": "Historical daily bar dataset",
                    "quantity": 1,
                    "unit": "dataset",
                    "available": True,
                    "category": "data",
                    "details": "Contains adjusted SPY and QQQ bars with metadata.",
                },
                {
                    "key": "backtest_engine",
                    "label": "Backtest engine",
                    "quantity": 1,
                    "unit": "engine",
                    "available": True,
                    "category": "tool",
                    "details": "Supports offline simulation with transaction costs and slippage.",
                },
                {
                    "key": "risk_reviewer",
                    "label": "Risk reviewer",
                    "quantity": 1,
                    "unit": "reviewer",
                    "available": True,
                    "category": "personnel",
                    "details": "Reviews risk assumptions and evaluation hygiene.",
                },
            ],
            "allowed_substitutions": [
                {
                    "original": "daily bars",
                    "alternative": "hourly bars aggregated to daily decisions",
                    "condition": "Use if the daily dataset is delayed or incomplete.",
                    "tradeoff": "The plan must justify any slippage-model change.",
                },
                {
                    "original": "risk reviewer",
                    "alternative": "pre-committed risk checklist",
                    "condition": "Use if the reviewer is unavailable.",
                    "tradeoff": "The plan must include explicit drawdown checks.",
                },
            ],
            "budget_total": 950.0,
            "staff_count": 1,
            "time_limit_days": 3,
            "max_rounds": MAX_ROUNDS,
        },
        {
            "domain_id": "finance_trading",
            "paper_title": "Planning an offline momentum backtest for liquid futures",
            "paper_hypothesis": "A disciplined momentum design can be evaluated offline with strict liquidity and cost assumptions.",
            "paper_method": "Run a futures momentum backtest with predefined roll logic, cost model, and walk-forward evaluation.",
            "paper_key_finding": "The plan is accepted only if walk-forward evaluation and liquidity constraints are explicit.",
            "task_summary": "Design an offline momentum futures backtest under liquidity, slippage, and review constraints.",
            "success_criteria": [
                "Use only offline walk-forward evaluation.",
                "Model roll handling and transaction costs explicitly.",
                "Keep liquidity and concentration rules visible in the final plan.",
            ],
            "reference_summary": "A valid plan models roll logic, transaction costs, and walk-forward evaluation with liquidity limits.",
            "required_elements": [
                "walk-forward evaluation",
                "roll logic",
                "transaction cost assumption",
                "liquidity limit",
            ],
            "flexible_elements": [
                "lookback horizon",
                "rebalance frequency",
                "reporting template",
            ],
            "target_metric": "risk_adjusted_return",
            "target_value": "positive out-of-sample Sharpe with liquidity-compliant trades",
            "constraints": [
                {
                    # Soft constraint: reviewable scope, not a hard guardrail.
                    "key": "max_markets",
                    "label": "Maximum simultaneous markets",
                    "quantity": 4,
                    "unit": "markets",
                    "comparator": "<=",
                    "hard": False,
                    "details": "Keep the design narrow enough to review in one session.",
                },
                {
                    "key": "max_drawdown",
                    "label": "Maximum allowed drawdown",
                    "quantity": 10,
                    "unit": "percent",
                    "comparator": "<=",
                    "hard": True,
                    "details": "The plan must remain inside the drawdown guardrail.",
                },
                {
                    "key": "live_execution",
                    "label": "Execution mode",
                    "quantity": None,
                    "unit": None,
                    "comparator": "=",
                    "hard": True,
                    "details": "Only offline design and backtesting are allowed.",
                },
            ],
            "resources": [
                {
                    "key": "futures_dataset",
                    "label": "Historical futures dataset",
                    "quantity": 1,
                    "unit": "dataset",
                    "available": True,
                    "category": "data",
                    "details": "Includes roll metadata and contract-level liquidity fields.",
                },
                {
                    "key": "backtest_engine",
                    "label": "Walk-forward backtest engine",
                    "quantity": 1,
                    "unit": "engine",
                    "available": True,
                    "category": "tool",
                    "details": "Supports walk-forward slicing and execution-cost modeling.",
                },
                {
                    "key": "risk_reviewer",
                    "label": "Risk reviewer",
                    "quantity": 1,
                    "unit": "reviewer",
                    "available": True,
                    "category": "personnel",
                    "details": "Checks liquidity and concentration assumptions.",
                },
            ],
            "allowed_substitutions": [
                {
                    "original": "contract-level backtest",
                    "alternative": "continuous-series backtest with explicit caveat",
                    "condition": "Use if contract roll metadata is incomplete.",
                    "tradeoff": "The plan must document the fidelity loss clearly.",
                }
            ],
            "budget_total": 1100.0,
            "staff_count": 1,
            "time_limit_days": 4,
            "max_rounds": MAX_ROUNDS,
        },
    ]
    # Seeded callers get deterministic case selection.
    return rng.choice(cases)
|
replicalab/scenarios/math_reasoning.py
ADDED
|
@@ -0,0 +1,214 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Mathematics scenario templates."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import random
|
| 6 |
+
from typing import Any
|
| 7 |
+
|
| 8 |
+
from replicalab.config import MAX_ROUNDS
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def build_math_reasoning_template(rng: random.Random) -> dict[str, Any]:
    """Return one randomly chosen mathematics proof-planning scenario draft.

    The draft is an untyped dict consumed by the normalized scenario layer
    (``templates._build_pack``); keys must match what that layer reads.
    ``rng`` should come from ``seed_rng`` so the choice is deterministic
    per (seed, template) pair.
    """
    cases = [
        # Case 1: Cauchy-Schwarz proof-planning scenario.
        {
            "domain_id": "mathematics",
            "paper_title": "Planning a proof of the Cauchy-Schwarz inequality",
            "paper_hypothesis": "A square-expansion argument gives the cleanest proof path.",
            "paper_method": "Outline the proof using one algebraic identity, one equality-case check, and reviewer notes.",
            "paper_key_finding": "The proof is accepted only if every inequality step and equality case is justified.",
            "task_summary": "Produce a proof-planning workflow for the Cauchy-Schwarz inequality for an undergraduate seminar handout.",
            "success_criteria": [
                "Every inequality step is justified in plain language.",
                "The equality case is checked explicitly.",
                "The final plan fits within the review and deadline constraints.",
            ],
            "reference_summary": "A valid plan uses a square-expansion route, checks equality, and includes one verification pass.",
            "required_elements": [
                "explicit target inequality",
                "square-expansion or inner-product setup",
                "equality-case check",
                "final verification pass",
            ],
            "flexible_elements": [
                "notation style",
                "ordering of supporting lemmas",
                "proof-sketch granularity",
            ],
            "target_metric": "proof_validity",
            "target_value": "all required justification steps are present",
            "constraints": [
                {
                    "key": "deadline_days",
                    "label": "Proof planning deadline",
                    "quantity": 3,
                    "unit": "days",
                    "comparator": "<=",
                    "hard": True,
                    "details": "The seminar notes must be ready within three days.",
                },
                {
                    "key": "review_passes",
                    "label": "Required review passes",
                    "quantity": 1,
                    "unit": "pass",
                    "comparator": ">=",
                    "hard": True,
                    "details": "At least one verification pass is required before acceptance.",
                },
                {
                    "key": "max_pages",
                    "label": "Maximum proof outline length",
                    "quantity": 2,
                    "unit": "pages",
                    "comparator": "<=",
                    "hard": False,
                    "details": "The outline should stay concise enough for seminar notes.",
                },
            ],
            "resources": [
                {
                    "key": "proof_notebook",
                    "label": "Structured proof notebook",
                    "quantity": 1,
                    "unit": "workspace",
                    "available": True,
                    "category": "tool",
                    "details": "A shared note workspace for the outline and checks.",
                },
                {
                    "key": "theorem_library",
                    "label": "Reference theorem library",
                    "quantity": 1,
                    "unit": "library",
                    "available": True,
                    "category": "reference",
                    "details": "Contains previous inequality proofs and notation conventions.",
                },
                {
                    "key": "reviewer",
                    "label": "Graduate reviewer",
                    "quantity": 1,
                    "unit": "reviewer",
                    "available": True,
                    "category": "personnel",
                    "details": "A reviewer can check one draft before the deadline.",
                },
            ],
            "allowed_substitutions": [
                {
                    "original": "graduate reviewer",
                    "alternative": "self-check rubric",
                    "condition": "Use only if the reviewer is unavailable.",
                    "tradeoff": "Requires a stricter written checklist inside the plan.",
                },
                {
                    "original": "full derivation",
                    "alternative": "proof sketch with explicit checkpoints",
                    "condition": "Use when page budget is tight.",
                    "tradeoff": "The plan must still spell out all justification steps.",
                },
            ],
            "budget_total": 300.0,
            "staff_count": 1,
            "time_limit_days": 3,
            "max_rounds": MAX_ROUNDS,
        },
        # Case 2: Jensen's inequality (convex quadratics) proof-planning scenario.
        {
            "domain_id": "mathematics",
            "paper_title": "Planning a proof of Jensen's inequality for convex quadratics",
            "paper_hypothesis": "A convexity-first outline is shorter than an expectation-expansion route.",
            "paper_method": "Use the convexity definition, midpoint intuition, and one numerical sanity check.",
            "paper_key_finding": "The plan succeeds only if the convexity assumption and averaging step are both explicit.",
            "task_summary": "Produce a proof-planning workflow for Jensen's inequality on convex quadratics for a revision session.",
            "success_criteria": [
                "The convexity assumption is named before the main argument.",
                "Averaging and expectation steps are justified.",
                "The plan includes at least one sanity check example.",
            ],
            "reference_summary": "A valid plan states convexity early, justifies averaging, and uses one sanity check.",
            "required_elements": [
                "convexity assumption",
                "averaging step",
                "sanity check example",
                "closing statement tied to the task objective",
            ],
            "flexible_elements": [
                "example choice",
                "notation style",
                "proof sketch ordering",
            ],
            "target_metric": "proof_validity",
            "target_value": "convexity and averaging are justified with one sanity check",
            "constraints": [
                {
                    "key": "deadline_days",
                    "label": "Proof planning deadline",
                    "quantity": 2,
                    "unit": "days",
                    "comparator": "<=",
                    "hard": True,
                    "details": "The revision notes are due within two days.",
                },
                {
                    "key": "review_passes",
                    "label": "Required review passes",
                    "quantity": 1,
                    "unit": "pass",
                    "comparator": ">=",
                    "hard": True,
                    "details": "The plan needs at least one self-check or peer review.",
                },
                {
                    "key": "max_pages",
                    "label": "Maximum proof outline length",
                    "quantity": 1,
                    "unit": "page",
                    "comparator": "<=",
                    "hard": False,
                    "details": "The final outline should fit on one page.",
                },
            ],
            "resources": [
                {
                    "key": "whiteboard",
                    "label": "Whiteboard workspace",
                    "quantity": 1,
                    "unit": "workspace",
                    "available": True,
                    "category": "tool",
                    "details": "Used to sketch the proof structure and sanity check.",
                },
                {
                    "key": "reference_notes",
                    "label": "Reference lecture notes",
                    "quantity": 1,
                    "unit": "packet",
                    "available": True,
                    "category": "reference",
                    "details": "Contains the convexity definition and worked examples.",
                },
                {
                    "key": "peer_reviewer",
                    "label": "Peer reviewer",
                    "quantity": 1,
                    "unit": "reviewer",
                    "available": True,
                    "category": "personnel",
                    "details": "Available for one short review pass.",
                },
            ],
            "allowed_substitutions": [
                {
                    "original": "peer reviewer",
                    "alternative": "checklist-driven self-review",
                    "condition": "Use if the peer reviewer is unavailable.",
                    "tradeoff": "The final plan must include explicit verification checkpoints.",
                }
            ],
            "budget_total": 220.0,
            "staff_count": 1,
            "time_limit_days": 2,
            "max_rounds": MAX_ROUNDS,
        },
    ]
    # Uniform pick; deterministic for a fixed seed/namespace-derived rng.
    return rng.choice(cases)
|
replicalab/scenarios/ml_benchmark.py
ADDED
|
@@ -0,0 +1,214 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Machine learning benchmark scenario templates."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import random
|
| 6 |
+
from typing import Any
|
| 7 |
+
|
| 8 |
+
from replicalab.config import MAX_ROUNDS
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def build_ml_benchmark_template(rng: random.Random) -> dict[str, Any]:
    """Return one randomly chosen ML benchmark-replication scenario draft.

    The draft is an untyped dict consumed by the normalized scenario layer
    (``templates._build_pack``); keys must match what that layer reads.
    ``rng`` should come from ``seed_rng`` so the choice is deterministic
    per (seed, template) pair.
    """
    cases = [
        # Case 1: AG News TinyBERT replication scenario.
        {
            "domain_id": "machine_learning",
            "paper_title": "Reproducing an AG News TinyBERT baseline",
            "paper_hypothesis": "A distilled model can match the published accuracy within the stated compute budget.",
            "paper_method": "Fine-tune TinyBERT on AG News with the published split, tokenizer, and evaluation script.",
            "paper_key_finding": "The baseline is accepted only if the held-out accuracy is within one point of the target.",
            "task_summary": "Plan an ML benchmark replication for AG News classification with strict GPU and deadline limits.",
            "success_criteria": [
                "Use the published train-validation-test split.",
                "Report held-out accuracy with the same metric definition as the paper.",
                "Fit the full plan within the available GPU budget and time window.",
            ],
            "reference_summary": "A valid plan keeps the published split and evaluation metric while staying inside the compute budget.",
            "required_elements": [
                "published data split",
                "matching tokenizer family",
                "held-out accuracy evaluation",
                "run logging",
            ],
            "flexible_elements": [
                "batch size",
                "learning-rate schedule",
                "checkpoint cadence",
            ],
            "target_metric": "held_out_accuracy",
            "target_value": "within one point of the reported AG News baseline",
            "constraints": [
                {
                    "key": "gpu_hours",
                    "label": "Maximum GPU budget",
                    "quantity": 8,
                    "unit": "gpu_hours",
                    "comparator": "<=",
                    "hard": True,
                    "details": "The full run must fit within eight GPU-hours.",
                },
                {
                    "key": "deadline_days",
                    "label": "Replication deadline",
                    "quantity": 4,
                    "unit": "days",
                    "comparator": "<=",
                    "hard": True,
                    "details": "The benchmark must be reproduced within four days.",
                },
                {
                    # Policy constraint: no quantity/unit, equality comparator.
                    "key": "evaluation_policy",
                    "label": "Evaluation policy",
                    "quantity": None,
                    "unit": None,
                    "comparator": "=",
                    "hard": True,
                    "details": "Use only the held-out split; no test-set peeking.",
                },
            ],
            "resources": [
                {
                    "key": "gpu_node",
                    "label": "A100 GPU node",
                    "quantity": 1,
                    "unit": "node",
                    "available": True,
                    "category": "compute",
                    "details": "Reserved for one benchmark run at a time.",
                },
                {
                    "key": "dataset_mirror",
                    "label": "AG News dataset mirror",
                    "quantity": 1,
                    "unit": "mirror",
                    "available": True,
                    "category": "data",
                    "details": "Local mirror with the published split manifest.",
                },
                {
                    "key": "tracking_tool",
                    "label": "Experiment tracking workspace",
                    "quantity": 1,
                    "unit": "workspace",
                    "available": True,
                    "category": "tool",
                    "details": "Captures configs, metrics, and artifacts.",
                },
            ],
            "allowed_substitutions": [
                {
                    "original": "full training schedule",
                    "alternative": "shorter schedule with early stopping",
                    "condition": "Use when the GPU budget is tight.",
                    "tradeoff": "The plan must justify why the metric remains trustworthy.",
                },
                {
                    "original": "large batch size",
                    "alternative": "smaller batch size with accumulation",
                    "condition": "Use when the node has limited memory.",
                    "tradeoff": "Training takes longer and must still fit the deadline.",
                },
            ],
            "budget_total": 1800.0,
            "staff_count": 2,
            "time_limit_days": 4,
            "max_rounds": MAX_ROUNDS,
        },
        # Case 2: CIFAR-10 ResNet-18 replication scenario.
        {
            "domain_id": "machine_learning",
            "paper_title": "Reproducing a CIFAR-10 ResNet-18 baseline",
            "paper_hypothesis": "The reported top-1 accuracy is reachable with the stated data pipeline and a smaller tuning budget.",
            "paper_method": "Train ResNet-18 on CIFAR-10 with the published augmentation recipe and evaluation checkpoint.",
            "paper_key_finding": "The baseline is accepted only if the final accuracy and training recipe are reproducible.",
            "task_summary": "Plan a CIFAR-10 benchmark replication with limited compute, strict evaluation rules, and one reviewer pass.",
            "success_criteria": [
                "Use the published augmentation recipe or justify a compatible substitution.",
                "Keep evaluation isolated from any tuning loop.",
                "Log all seeds, configs, and final metrics for reproducibility.",
            ],
            "reference_summary": "A valid plan preserves the published augmentation and evaluation rules while logging every run.",
            "required_elements": [
                "published augmentation recipe",
                "fixed evaluation checkpoint",
                "seed logging",
                "final metric report",
            ],
            "flexible_elements": [
                "optimizer implementation",
                "checkpoint interval",
                "data-loader worker count",
            ],
            "target_metric": "top1_accuracy",
            "target_value": "within one point of the CIFAR-10 baseline",
            "constraints": [
                {
                    "key": "gpu_hours",
                    "label": "Maximum GPU budget",
                    "quantity": 10,
                    "unit": "gpu_hours",
                    "comparator": "<=",
                    "hard": True,
                    "details": "The benchmark must fit within ten GPU-hours.",
                },
                {
                    "key": "deadline_days",
                    "label": "Replication deadline",
                    "quantity": 5,
                    "unit": "days",
                    "comparator": "<=",
                    "hard": True,
                    "details": "The plan must finish inside the review window.",
                },
                {
                    "key": "review_passes",
                    "label": "Required review passes",
                    "quantity": 1,
                    "unit": "pass",
                    "comparator": ">=",
                    "hard": False,
                    "details": "A teammate should review the config before launch.",
                },
            ],
            "resources": [
                {
                    "key": "gpu_node",
                    "label": "L40S GPU node",
                    "quantity": 1,
                    "unit": "node",
                    "available": True,
                    "category": "compute",
                    "details": "Shared node with moderate queue pressure.",
                },
                {
                    "key": "dataset_archive",
                    "label": "CIFAR-10 dataset archive",
                    "quantity": 1,
                    "unit": "archive",
                    "available": True,
                    "category": "data",
                    "details": "Local archive with checksum verification.",
                },
                {
                    "key": "reviewer",
                    "label": "Benchmark reviewer",
                    "quantity": 1,
                    "unit": "reviewer",
                    "available": True,
                    "category": "personnel",
                    "details": "Can review the config once before training.",
                },
            ],
            "allowed_substitutions": [
                {
                    "original": "full epoch schedule",
                    "alternative": "reduced epoch schedule with checkpoint comparison",
                    "condition": "Use if queue time or GPU budget becomes tight.",
                    "tradeoff": "Needs a clear explanation for any metric gap.",
                }
            ],
            "budget_total": 2100.0,
            "staff_count": 2,
            "time_limit_days": 5,
            "max_rounds": MAX_ROUNDS,
        },
    ]
    # Uniform pick; deterministic for a fixed seed/namespace-derived rng.
    return rng.choice(cases)
|
replicalab/scenarios/templates.py
ADDED
|
@@ -0,0 +1,299 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Normalized scenario generation and mapping helpers."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import copy
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
from typing import Any, Callable, Literal
|
| 8 |
+
|
| 9 |
+
from pydantic import BaseModel, ConfigDict
|
| 10 |
+
|
| 11 |
+
from replicalab.config import MAX_BUDGET, MAX_ROUNDS
|
| 12 |
+
from replicalab.models import LabManagerObservation, ScientistObservation
|
| 13 |
+
from replicalab.scenarios.finance_trading import build_finance_trading_template
|
| 14 |
+
from replicalab.scenarios.math_reasoning import build_math_reasoning_template
|
| 15 |
+
from replicalab.scenarios.ml_benchmark import build_ml_benchmark_template
|
| 16 |
+
from replicalab.utils.seed import seed_rng
|
| 17 |
+
|
| 18 |
+
Difficulty = Literal["easy", "medium", "hard"]
|
| 19 |
+
TemplateName = Literal["math_reasoning", "ml_benchmark", "finance_trading"]
|
| 20 |
+
|
| 21 |
+
GOLDEN_SCENARIO_SPECS_PATH = (
|
| 22 |
+
Path(__file__).resolve().parents[2] / "tests" / "fixtures" / "golden_scenarios.json"
|
| 23 |
+
)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class ScenarioConstraint(BaseModel):
    """A single quantified or policy constraint on a scenario."""

    model_config = ConfigDict(extra="forbid", str_strip_whitespace=True)

    # Stable machine-readable identifier (e.g. "deadline_days").
    key: str
    # Human-readable short name for display.
    label: str
    # Numeric bound; None for pure policy constraints.
    quantity: float | int | None = None
    # Unit of `quantity`; None when quantity is None.
    unit: str | None = None
    # How `quantity` bounds the plan; "=" also covers policy constraints.
    comparator: Literal["<=", ">=", "="] = "="
    # True if violating the constraint invalidates the plan outright.
    hard: bool = True
    # Full-sentence explanation shown to agents.
    details: str
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
class ScenarioResource(BaseModel):
    """A resource (tool, data, compute, reference, or person) in a scenario."""

    model_config = ConfigDict(extra="forbid", str_strip_whitespace=True)

    # Stable machine-readable identifier (e.g. "gpu_node").
    key: str
    # Human-readable short name for display.
    label: str
    # How many units exist; None when a count is not meaningful.
    quantity: float | int | None = None
    # Unit of `quantity`; None when quantity is None.
    unit: str | None = None
    # False when the resource is constrained/unavailable at this difficulty.
    available: bool = True
    # Grouping used by _split_resources (e.g. "tool", "compute", "data").
    category: str
    # Full-sentence explanation shown to agents.
    details: str
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
class AllowedSubstitution(BaseModel):
    """A sanctioned swap of one planned element for an alternative."""

    model_config = ConfigDict(extra="forbid", str_strip_whitespace=True)

    # The element the plan would normally use.
    original: str
    # The permitted replacement.
    alternative: str
    # When the substitution is allowed.
    condition: str
    # Cost or obligation the plan incurs by substituting.
    tradeoff: str
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
class HiddenReferenceSpec(BaseModel):
    """Grading reference kept hidden from agents; used to score plans."""

    model_config = ConfigDict(extra="forbid", str_strip_whitespace=True)

    # One-sentence description of what a valid plan looks like.
    summary: str
    # Elements a plan must contain to be accepted.
    required_elements: list[str]
    # Elements where the plan may deviate without penalty.
    flexible_elements: list[str]
    # Name of the metric the scenario targets (e.g. "proof_validity").
    target_metric: str
    # Human-readable statement of the success threshold.
    target_value: str
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
class NormalizedScenarioPack(BaseModel):
    """Fully validated scenario bundle produced by generate_scenario.

    Combines the agent-visible observations with the hidden grading spec
    and the normalized constraint/resource lists.
    """

    model_config = ConfigDict(extra="forbid", str_strip_whitespace=True)

    # "<template>-<difficulty>-<seed>"; unique per generation call.
    scenario_id: str
    template: TemplateName
    domain_id: str
    difficulty: Difficulty
    # Base seed the pack was generated from (reproducibility handle).
    seed: int
    task_summary: str
    success_criteria: list[str]
    constraints: list[ScenarioConstraint]
    resources: list[ScenarioResource]
    allowed_substitutions: list[AllowedSubstitution]
    # Hidden from agents; consumed by the grader only.
    hidden_reference_spec: HiddenReferenceSpec
    # Initial observation for the scientist agent (round 0).
    scientist_observation: ScientistObservation
    # Initial observation for the lab-manager agent (round 0).
    lab_manager_observation: LabManagerObservation
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
TemplateBuilder = Callable[[Any], dict[str, Any]]
|
| 88 |
+
|
| 89 |
+
_TEMPLATE_BUILDERS: dict[TemplateName, TemplateBuilder] = {
|
| 90 |
+
"math_reasoning": build_math_reasoning_template,
|
| 91 |
+
"ml_benchmark": build_ml_benchmark_template,
|
| 92 |
+
"finance_trading": build_finance_trading_template,
|
| 93 |
+
}
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def available_scenario_families() -> list[dict[str, Any]]:
    """List every registered scenario family with its supported difficulties."""
    families: list[dict[str, Any]] = []
    for family_name in _TEMPLATE_BUILDERS:
        families.append(
            {"family": family_name, "difficulties": ["easy", "medium", "hard"]}
        )
    return families
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
def load_template(template: TemplateName) -> TemplateBuilder:
    """Resolve *template* to its builder callable.

    Raises:
        ValueError: if the name is not a registered template (the original
            KeyError is chained as the cause).
    """
    try:
        builder = _TEMPLATE_BUILDERS[template]
    except KeyError as unknown_name:
        raise ValueError(f"Unknown scenario template: {template}") from unknown_name
    return builder
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
def apply_difficulty(
    draft: dict[str, Any],
    difficulty: Difficulty,
    rng: Any,
) -> dict[str, Any]:
    """Return a deep copy of *draft* scaled for *difficulty*.

    Easy loosens the budget; medium and hard tighten budget/time/staff and
    disable one or two resources (chosen via *rng*) while appending
    difficulty-induced conflict constraints. The input draft is never mutated.
    """
    adjusted = copy.deepcopy(draft)
    adjusted["difficulty"] = difficulty

    base_budget = float(draft["budget_total"])

    if difficulty == "easy":
        # Easy mode: 15% more budget, everything else untouched.
        adjusted["budget_total"] = round(base_budget * 1.15, 2)
    elif difficulty == "medium":
        # Medium mode: slight budget cut, one fewer day, one tightened resource.
        adjusted["budget_total"] = round(base_budget * 0.95, 2)
        adjusted["time_limit_days"] = max(1, int(draft["time_limit_days"]) - 1)
        _tighten_one_resource(adjusted["resources"], rng)
        _append_conflict_constraint(
            adjusted["constraints"],
            "One resource is partially constrained, so the plan must justify a fallback path.",
        )
    else:
        # Hard mode: 20% budget cut, fewer days and staff, two tightened
        # resources, and two extra hard conflict constraints.
        adjusted["budget_total"] = round(base_budget * 0.8, 2)
        adjusted["time_limit_days"] = max(1, int(draft["time_limit_days"]) - 1)
        adjusted["staff_count"] = max(1, int(draft["staff_count"]) - 1)
        _tighten_one_resource(adjusted["resources"], rng)
        _tighten_one_resource(adjusted["resources"], rng)
        _append_conflict_constraint(
            adjusted["constraints"],
            "At least one primary resource is unavailable, so the plan must use an allowed substitution or reduced scope.",
        )
        _append_conflict_constraint(
            adjusted["constraints"],
            "The final plan must remain concise because review capacity is limited under hard mode.",
        )

    return adjusted
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
def generate_scenario(
    seed: int,
    template: TemplateName,
    difficulty: Difficulty,
) -> NormalizedScenarioPack:
    """Deterministically build a validated scenario pack.

    The RNG is namespaced by template so the same seed yields independent
    draws across families.
    """
    template_rng = seed_rng(seed, namespace=f"scenario:{template}")
    builder = load_template(template)
    draft = builder(template_rng)
    adjusted = apply_difficulty(draft, difficulty, template_rng)
    return _build_pack(seed=seed, template=template, draft=adjusted)
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
def _build_pack(seed: int, template: TemplateName, draft: dict[str, Any]) -> NormalizedScenarioPack:
    """Validate a scaled draft dict into a NormalizedScenarioPack.

    Raises ValueError when the draft exceeds the configured MAX_BUDGET or
    MAX_ROUNDS caps. Expects the draft to carry a "difficulty" key (set by
    apply_difficulty) in addition to the template-builder fields.
    """
    # Validate the untyped draft lists into strict pydantic models first so
    # any schema drift in a template fails loudly here.
    constraints = [ScenarioConstraint.model_validate(item) for item in draft["constraints"]]
    resources = [ScenarioResource.model_validate(item) for item in draft["resources"]]
    substitutions = [
        AllowedSubstitution.model_validate(item)
        for item in draft["allowed_substitutions"]
    ]

    time_limit_days = int(draft["time_limit_days"])
    budget_total = float(draft["budget_total"])
    staff_count = int(draft["staff_count"])
    max_rounds = int(draft["max_rounds"])

    # Enforce global caps from replicalab.config.
    if budget_total > MAX_BUDGET:
        raise ValueError(
            f"Scenario budget {budget_total} exceeds configured MAX_BUDGET={MAX_BUDGET}."
        )
    if max_rounds > MAX_ROUNDS:
        raise ValueError(
            f"Scenario max_rounds {max_rounds} exceeds configured MAX_ROUNDS={MAX_ROUNDS}."
        )

    # Map resource categories onto the lab-manager observation's
    # equipment/reagent vocabulary.
    equipment_available, equipment_booked = _split_resources(
        resources,
        include_categories={"tool", "compute"},
    )
    reagents_in_stock, reagents_out_of_stock = _split_resources(
        resources,
        include_categories={"reference", "data", "personnel"},
    )

    # Soft constraints and known policy keys surface as safety restrictions.
    safety_restrictions = [
        constraint.details
        for constraint in constraints
        if not constraint.hard or constraint.key in {"live_execution", "evaluation_policy"}
    ]
    if not safety_restrictions:
        safety_restrictions = ["No policy exceptions are allowed."]

    # Round-0 observation for the scientist agent.
    scientist_observation = ScientistObservation(
        paper_title=draft["paper_title"],
        paper_hypothesis=draft["paper_hypothesis"],
        paper_method=draft["paper_method"],
        paper_key_finding=draft["paper_key_finding"],
        experiment_goal=draft["task_summary"],
        conversation_history=[],
        current_protocol=None,
        round_number=0,
        max_rounds=max_rounds,
    )

    # Round-0 observation for the lab-manager agent; budget starts untouched.
    lab_manager_observation = LabManagerObservation(
        budget_total=budget_total,
        budget_remaining=budget_total,
        equipment_available=equipment_available,
        equipment_booked=equipment_booked,
        reagents_in_stock=reagents_in_stock,
        reagents_out_of_stock=reagents_out_of_stock,
        staff_count=staff_count,
        time_limit_days=time_limit_days,
        safety_restrictions=safety_restrictions,
        conversation_history=[],
        current_protocol=None,
        round_number=0,
        max_rounds=max_rounds,
    )

    # Grader-only reference; never exposed to agents.
    hidden_reference = HiddenReferenceSpec(
        summary=draft["reference_summary"],
        required_elements=list(draft["required_elements"]),
        flexible_elements=list(draft["flexible_elements"]),
        target_metric=draft["target_metric"],
        target_value=draft["target_value"],
    )

    return NormalizedScenarioPack(
        scenario_id=f"{template}-{draft['difficulty']}-{seed}",
        template=template,
        domain_id=draft["domain_id"],
        difficulty=draft["difficulty"],
        seed=seed,
        task_summary=draft["task_summary"],
        success_criteria=list(draft["success_criteria"]),
        constraints=constraints,
        resources=resources,
        allowed_substitutions=substitutions,
        hidden_reference_spec=hidden_reference,
        scientist_observation=scientist_observation,
        lab_manager_observation=lab_manager_observation,
    )
|
| 249 |
+
|
| 250 |
+
|
| 251 |
+
def _split_resources(
    resources: list[ScenarioResource],
    *,
    include_categories: set[str],
) -> tuple[list[str], list[str]]:
    """Partition matching resources into (available, unavailable) label lists.

    Resources whose category is not in *include_categories* are ignored.
    Relative order from *resources* is preserved in both outputs.
    """
    relevant = [item for item in resources if item.category in include_categories]
    ready = [item.label for item in relevant if item.available]
    missing = [item.label for item in relevant if not item.available]
    return ready, missing
|
| 266 |
+
|
| 267 |
+
|
| 268 |
+
def _tighten_one_resource(resources: list[dict[str, Any]], rng: Any) -> None:
    """Mark one randomly chosen available resource as unavailable, in place.

    No-op when nothing is available. The selected dict gets
    ``available=False`` and an explanatory sentence appended to its details.
    """
    candidates = [item for item in resources if item.get("available", True)]
    if not candidates:
        return

    picked = rng.choice(candidates)
    picked["available"] = False
    picked["details"] = (
        f"{picked['details']} Availability is constrained under the current difficulty."
    )
|
| 283 |
+
|
| 284 |
+
|
| 285 |
+
def _append_conflict_constraint(
    constraints: list[dict[str, Any]],
    details: str,
) -> None:
    """Append a hard, policy-style conflict constraint to *constraints*.

    The key is numbered from the list's current length, so call order
    determines the suffix.
    """
    conflict_entry = {
        "key": f"conflict_{len(constraints) + 1}",
        "label": "Difficulty-induced conflict",
        "quantity": None,
        "unit": None,
        "comparator": "=",
        "hard": True,
        "details": details,
    }
    constraints.append(conflict_entry)
|
replicalab/utils/seed.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Deterministic seeding helpers shared by scenarios and the environment."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import hashlib
|
| 6 |
+
import random
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def get_deterministic_seed(seed: int, namespace: str = "") -> int:
    """Derive a stable child seed from a base seed plus namespace.

    The "seed:namespace" pair is hashed with SHA-256 and the first eight
    digest bytes are read as an unsigned big-endian integer, so the result
    is reproducible across processes and platforms (unlike ``hash()``).
    """
    digest = hashlib.sha256(f"{seed}:{namespace}".encode("utf-8")).digest()
    child_seed = int.from_bytes(digest[:8], byteorder="big", signed=False)
    return child_seed


def seed_rng(seed: int, namespace: str = "") -> random.Random:
    """Return a dedicated RNG instance seeded deterministically."""
    child_seed = get_deterministic_seed(seed, namespace)
    return random.Random(child_seed)
|
server/app.py
CHANGED
|
@@ -31,13 +31,25 @@ from fastapi import FastAPI, HTTPException, WebSocket, WebSocketDisconnect
|
|
| 31 |
from fastapi.middleware.cors import CORSMiddleware
|
| 32 |
from pydantic import BaseModel
|
| 33 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
from replicalab.models import (
|
| 35 |
EpisodeLog,
|
| 36 |
EpisodeState,
|
| 37 |
LabManagerObservation,
|
| 38 |
Observation,
|
|
|
|
| 39 |
ScientistAction,
|
| 40 |
ScientistObservation,
|
|
|
|
| 41 |
StepResult,
|
| 42 |
)
|
| 43 |
|
|
@@ -65,18 +77,18 @@ except ImportError:
|
|
| 65 |
log.warning("ReplicaLabEnv not found — using _StubEnv (replace when Person A ships env)")
|
| 66 |
|
| 67 |
|
| 68 |
-
def _reward_breakdown_from_state(state: EpisodeState) ->
|
| 69 |
-
return
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
"invalid_action": 0.0,
|
| 77 |
"timeout": 0.0,
|
| 78 |
},
|
| 79 |
-
|
| 80 |
|
| 81 |
|
| 82 |
def _build_episode_log(episode_id: str, state: EpisodeState) -> EpisodeLog:
|
|
@@ -113,29 +125,33 @@ class _StubEnv:
|
|
| 113 |
def reset(
|
| 114 |
self,
|
| 115 |
seed: int = 0,
|
| 116 |
-
scenario: str =
|
| 117 |
-
difficulty: str =
|
| 118 |
) -> Observation:
|
| 119 |
self._episode_id = str(uuid.uuid4())
|
| 120 |
self._logs = []
|
|
|
|
| 121 |
self._state = EpisodeState(
|
| 122 |
seed=seed,
|
| 123 |
scenario_template=scenario,
|
| 124 |
difficulty=difficulty,
|
| 125 |
-
paper_title=
|
| 126 |
paper_hypothesis="Compound X inhibits cell growth at 10 µM",
|
| 127 |
-
paper_method=
|
| 128 |
paper_key_finding="IC50 = 8.3 µM",
|
| 129 |
-
experiment_goal=
|
| 130 |
-
lab_budget_total=
|
| 131 |
-
lab_budget_remaining=
|
| 132 |
-
lab_equipment=
|
| 133 |
lab_reagents=["MTT reagent", "DMSO", "cell culture media"],
|
| 134 |
-
lab_staff_count=
|
| 135 |
-
lab_time_limit_days=
|
| 136 |
-
max_rounds=
|
| 137 |
round_number=0,
|
| 138 |
)
|
|
|
|
|
|
|
|
|
|
| 139 |
self._state.conversation_history = list(self._logs)
|
| 140 |
log.info("Stub reset | episode=%s seed=%d scenario=%s", self._episode_id, seed, scenario)
|
| 141 |
return self._make_observation()
|
|
@@ -150,7 +166,7 @@ class _StubEnv:
|
|
| 150 |
action.action_type == "accept"
|
| 151 |
or self._state.round_number >= self._state.max_rounds
|
| 152 |
)
|
| 153 |
-
reward =
|
| 154 |
if done:
|
| 155 |
self._state.done = True
|
| 156 |
self._state.agreement_reached = action.action_type == "accept"
|
|
@@ -163,11 +179,16 @@ class _StubEnv:
|
|
| 163 |
observation=self._make_observation(),
|
| 164 |
reward=reward,
|
| 165 |
done=done,
|
| 166 |
-
info=
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 171 |
)
|
| 172 |
|
| 173 |
def state(self) -> EpisodeState:
|
|
@@ -264,7 +285,7 @@ def _make_env() -> "_StubEnv":
|
|
| 264 |
# In-memory session store (REST sessions)
|
| 265 |
# ---------------------------------------------------------------------------
|
| 266 |
|
| 267 |
-
_SESSION_TTL_SECONDS =
|
| 268 |
|
| 269 |
_sessions: dict[str, dict[str, Any]] = {}
|
| 270 |
# { session_id: { "env": env_instance, "last_active": float, "episode_id": str } }
|
|
@@ -340,11 +361,7 @@ app.add_middleware(
|
|
| 340 |
# Available scenarios constant
|
| 341 |
# ---------------------------------------------------------------------------
|
| 342 |
|
| 343 |
-
SCENARIOS =
|
| 344 |
-
{"family": "cell_biology", "difficulties": ["easy", "medium", "hard"]},
|
| 345 |
-
{"family": "ml_benchmark", "difficulties": ["easy", "medium", "hard"]},
|
| 346 |
-
{"family": "behavioral_psych", "difficulties": ["easy", "medium", "hard"]},
|
| 347 |
-
]
|
| 348 |
|
| 349 |
# ---------------------------------------------------------------------------
|
| 350 |
# REST request/response schemas
|
|
@@ -353,8 +370,8 @@ SCENARIOS = [
|
|
| 353 |
|
| 354 |
class ResetRequest(BaseModel):
|
| 355 |
seed: int = 0
|
| 356 |
-
scenario: str =
|
| 357 |
-
difficulty: str =
|
| 358 |
session_id: Optional[str] = None # pass to reuse an existing session slot
|
| 359 |
|
| 360 |
|
|
@@ -451,7 +468,7 @@ async def get_replay(episode_id: str):
|
|
| 451 |
|
| 452 |
# WebSocket message protocol:
|
| 453 |
# Client → Server:
|
| 454 |
-
# { "type": "reset", "seed": 42, "scenario":
|
| 455 |
# { "type": "step", "action": { ...ScientistAction fields... } }
|
| 456 |
# { "type": "ping" }
|
| 457 |
#
|
|
@@ -461,14 +478,14 @@ async def get_replay(episode_id: str):
|
|
| 461 |
# { "type": "pong" }
|
| 462 |
# { "type": "error", "message": "..." }
|
| 463 |
|
| 464 |
-
_WS_IDLE_TIMEOUT =
|
| 465 |
|
| 466 |
|
| 467 |
async def _ws_send(ws: WebSocket, payload: dict) -> None:
|
| 468 |
await ws.send_text(json.dumps(payload))
|
| 469 |
|
| 470 |
|
| 471 |
-
def main(host: str =
|
| 472 |
import uvicorn
|
| 473 |
|
| 474 |
uvicorn.run("server.app:app", host=host, port=port, reload=False)
|
|
@@ -503,8 +520,8 @@ async def websocket_endpoint(ws: WebSocket):
|
|
| 503 |
|
| 504 |
elif msg_type == "reset":
|
| 505 |
seed = int(msg.get("seed", 0))
|
| 506 |
-
scenario = str(msg.get("scenario",
|
| 507 |
-
difficulty = str(msg.get("difficulty",
|
| 508 |
|
| 509 |
try:
|
| 510 |
obs = env.reset(seed=seed, scenario=scenario, difficulty=difficulty)
|
|
@@ -584,10 +601,10 @@ if __name__ == "__main__":
|
|
| 584 |
import argparse
|
| 585 |
|
| 586 |
parser = argparse.ArgumentParser()
|
| 587 |
-
parser.add_argument("--port", type=int, default=
|
| 588 |
-
parser.add_argument("--host", default=
|
| 589 |
args = parser.parse_args()
|
| 590 |
-
if args.host ==
|
| 591 |
main()
|
| 592 |
else:
|
| 593 |
main(host=args.host, port=args.port)
|
|
|
|
| 31 |
from fastapi.middleware.cors import CORSMiddleware
|
| 32 |
from pydantic import BaseModel
|
| 33 |
|
| 34 |
+
from replicalab.config import (
|
| 35 |
+
API_HOST,
|
| 36 |
+
API_PORT,
|
| 37 |
+
DEFAULT_DIFFICULTY,
|
| 38 |
+
DEFAULT_SCENARIO_TEMPLATE,
|
| 39 |
+
SESSION_TTL_SECONDS,
|
| 40 |
+
STUB_ACCEPT_REWARD,
|
| 41 |
+
WS_IDLE_TIMEOUT_SECONDS,
|
| 42 |
+
)
|
| 43 |
+
from replicalab.scenarios import available_scenario_families, generate_scenario
|
| 44 |
from replicalab.models import (
|
| 45 |
EpisodeLog,
|
| 46 |
EpisodeState,
|
| 47 |
LabManagerObservation,
|
| 48 |
Observation,
|
| 49 |
+
RewardBreakdown,
|
| 50 |
ScientistAction,
|
| 51 |
ScientistObservation,
|
| 52 |
+
StepInfo,
|
| 53 |
StepResult,
|
| 54 |
)
|
| 55 |
|
|
|
|
| 77 |
log.warning("ReplicaLabEnv not found — using _StubEnv (replace when Person A ships env)")
|
| 78 |
|
| 79 |
|
| 80 |
+
def _reward_breakdown_from_state(state: EpisodeState) -> RewardBreakdown:
    """Map the episode state's judge scores into a typed RewardBreakdown.

    Bonuses are fixed at zero until the judge integration lands; both
    penalty keys are pre-seeded so downstream consumers always see them.
    """
    zero_penalties = {
        "invalid_action": 0.0,
        "timeout": 0.0,
    }
    return RewardBreakdown(
        rigor=state.rigor_score,
        feasibility=state.feasibility_score,
        fidelity=state.fidelity_score,
        efficiency_bonus=0.0,
        communication_bonus=0.0,
        penalties=zero_penalties,
    )
|
| 92 |
|
| 93 |
|
| 94 |
def _build_episode_log(episode_id: str, state: EpisodeState) -> EpisodeLog:
|
|
|
|
| 125 |
def reset(
|
| 126 |
self,
|
| 127 |
seed: int = 0,
|
| 128 |
+
scenario: str = DEFAULT_SCENARIO_TEMPLATE,
|
| 129 |
+
difficulty: str = DEFAULT_DIFFICULTY,
|
| 130 |
) -> Observation:
|
| 131 |
self._episode_id = str(uuid.uuid4())
|
| 132 |
self._logs = []
|
| 133 |
+
pack = generate_scenario(seed=seed, template=scenario, difficulty=difficulty)
|
| 134 |
self._state = EpisodeState(
|
| 135 |
seed=seed,
|
| 136 |
scenario_template=scenario,
|
| 137 |
difficulty=difficulty,
|
| 138 |
+
paper_title=pack.scientist_observation.paper_title,
|
| 139 |
paper_hypothesis="Compound X inhibits cell growth at 10 µM",
|
| 140 |
+
paper_method=pack.scientist_observation.paper_method,
|
| 141 |
paper_key_finding="IC50 = 8.3 µM",
|
| 142 |
+
experiment_goal=pack.scientist_observation.experiment_goal,
|
| 143 |
+
lab_budget_total=pack.lab_manager_observation.budget_total,
|
| 144 |
+
lab_budget_remaining=pack.lab_manager_observation.budget_remaining,
|
| 145 |
+
lab_equipment=list(pack.lab_manager_observation.equipment_available),
|
| 146 |
lab_reagents=["MTT reagent", "DMSO", "cell culture media"],
|
| 147 |
+
lab_staff_count=pack.lab_manager_observation.staff_count,
|
| 148 |
+
lab_time_limit_days=pack.lab_manager_observation.time_limit_days,
|
| 149 |
+
max_rounds=pack.scientist_observation.max_rounds,
|
| 150 |
round_number=0,
|
| 151 |
)
|
| 152 |
+
self._state.paper_hypothesis = pack.scientist_observation.paper_hypothesis
|
| 153 |
+
self._state.paper_key_finding = pack.scientist_observation.paper_key_finding
|
| 154 |
+
self._state.lab_reagents = list(pack.lab_manager_observation.reagents_in_stock)
|
| 155 |
self._state.conversation_history = list(self._logs)
|
| 156 |
log.info("Stub reset | episode=%s seed=%d scenario=%s", self._episode_id, seed, scenario)
|
| 157 |
return self._make_observation()
|
|
|
|
| 166 |
action.action_type == "accept"
|
| 167 |
or self._state.round_number >= self._state.max_rounds
|
| 168 |
)
|
| 169 |
+
reward = STUB_ACCEPT_REWARD if done and action.action_type == "accept" else 0.0
|
| 170 |
if done:
|
| 171 |
self._state.done = True
|
| 172 |
self._state.agreement_reached = action.action_type == "accept"
|
|
|
|
| 179 |
observation=self._make_observation(),
|
| 180 |
reward=reward,
|
| 181 |
done=done,
|
| 182 |
+
info=StepInfo(
|
| 183 |
+
agreement_reached=self._state.agreement_reached,
|
| 184 |
+
error=None,
|
| 185 |
+
reward_breakdown=_reward_breakdown_from_state(self._state) if done else None,
|
| 186 |
+
judge_notes="Stub audit until judge integration lands." if done else None,
|
| 187 |
+
verdict=("accept" if self._state.agreement_reached else "revise") if done else None,
|
| 188 |
+
round=self._state.round_number,
|
| 189 |
+
stub=True,
|
| 190 |
+
episode_id=self._episode_id,
|
| 191 |
+
),
|
| 192 |
)
|
| 193 |
|
| 194 |
def state(self) -> EpisodeState:
|
|
|
|
| 285 |
# In-memory session store (REST sessions)
|
| 286 |
# ---------------------------------------------------------------------------
|
| 287 |
|
| 288 |
+
_SESSION_TTL_SECONDS = SESSION_TTL_SECONDS
|
| 289 |
|
| 290 |
_sessions: dict[str, dict[str, Any]] = {}
|
| 291 |
# { session_id: { "env": env_instance, "last_active": float, "episode_id": str } }
|
|
|
|
| 361 |
# Available scenarios constant
|
| 362 |
# ---------------------------------------------------------------------------
|
| 363 |
|
| 364 |
+
SCENARIOS = available_scenario_families()
|
|
|
|
|
|
|
|
|
|
|
|
|
| 365 |
|
| 366 |
# ---------------------------------------------------------------------------
|
| 367 |
# REST request/response schemas
|
|
|
|
| 370 |
|
| 371 |
class ResetRequest(BaseModel):
|
| 372 |
seed: int = 0
|
| 373 |
+
scenario: str = DEFAULT_SCENARIO_TEMPLATE
|
| 374 |
+
difficulty: str = DEFAULT_DIFFICULTY
|
| 375 |
session_id: Optional[str] = None # pass to reuse an existing session slot
|
| 376 |
|
| 377 |
|
|
|
|
| 468 |
|
| 469 |
# WebSocket message protocol:
|
| 470 |
# Client → Server:
|
| 471 |
+
# { "type": "reset", "seed": 42, "scenario": DEFAULT_SCENARIO_TEMPLATE, "difficulty": DEFAULT_DIFFICULTY }
|
| 472 |
# { "type": "step", "action": { ...ScientistAction fields... } }
|
| 473 |
# { "type": "ping" }
|
| 474 |
#
|
|
|
|
| 478 |
# { "type": "pong" }
|
| 479 |
# { "type": "error", "message": "..." }
|
| 480 |
|
| 481 |
+
_WS_IDLE_TIMEOUT = WS_IDLE_TIMEOUT_SECONDS
|
| 482 |
|
| 483 |
|
| 484 |
async def _ws_send(ws: WebSocket, payload: dict) -> None:
|
| 485 |
await ws.send_text(json.dumps(payload))
|
| 486 |
|
| 487 |
|
| 488 |
+
def main(host: str = API_HOST, port: int = API_PORT) -> None:
    """Run the ReplicaLab API server under uvicorn (blocks until stopped)."""
    # Imported lazily so merely importing this module never pulls in uvicorn.
    import uvicorn

    app_path = "server.app:app"
    uvicorn.run(app_path, host=host, port=port, reload=False)
|
|
|
|
| 520 |
|
| 521 |
elif msg_type == "reset":
|
| 522 |
seed = int(msg.get("seed", 0))
|
| 523 |
+
scenario = str(msg.get("scenario", DEFAULT_SCENARIO_TEMPLATE))
|
| 524 |
+
difficulty = str(msg.get("difficulty", DEFAULT_DIFFICULTY))
|
| 525 |
|
| 526 |
try:
|
| 527 |
obs = env.reset(seed=seed, scenario=scenario, difficulty=difficulty)
|
|
|
|
| 601 |
import argparse
|
| 602 |
|
| 603 |
parser = argparse.ArgumentParser()
|
| 604 |
+
parser.add_argument("--port", type=int, default=API_PORT)
|
| 605 |
+
parser.add_argument("--host", default=API_HOST)
|
| 606 |
args = parser.parse_args()
|
| 607 |
+
if args.host == API_HOST and args.port == API_PORT:
|
| 608 |
main()
|
| 609 |
else:
|
| 610 |
main(host=args.host, port=args.port)
|
tests/fixtures/golden_scenarios.json
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"id": "golden_math_easy",
|
| 4 |
+
"template": "math_reasoning",
|
| 5 |
+
"difficulty": "easy",
|
| 6 |
+
"seed": 101,
|
| 7 |
+
"expected_domain_id": "mathematics",
|
| 8 |
+
"expected_title_contains": "Jensen"
|
| 9 |
+
},
|
| 10 |
+
{
|
| 11 |
+
"id": "golden_ml_medium",
|
| 12 |
+
"template": "ml_benchmark",
|
| 13 |
+
"difficulty": "medium",
|
| 14 |
+
"seed": 202,
|
| 15 |
+
"expected_domain_id": "machine_learning",
|
| 16 |
+
"expected_title_contains": "CIFAR-10"
|
| 17 |
+
},
|
| 18 |
+
{
|
| 19 |
+
"id": "golden_finance_hard",
|
| 20 |
+
"template": "finance_trading",
|
| 21 |
+
"difficulty": "hard",
|
| 22 |
+
"seed": 303,
|
| 23 |
+
"expected_domain_id": "finance_trading",
|
| 24 |
+
"expected_title_contains": "momentum"
|
| 25 |
+
}
|
| 26 |
+
]
|
tests/test_config.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from replicalab.config import (
|
| 4 |
+
DEFAULT_DIFFICULTY,
|
| 5 |
+
DEFAULT_SCENARIO_TEMPLATE,
|
| 6 |
+
MAX_BUDGET,
|
| 7 |
+
MAX_ROUNDS,
|
| 8 |
+
SESSION_TTL_SECONDS,
|
| 9 |
+
WS_IDLE_TIMEOUT_SECONDS,
|
| 10 |
+
)
|
| 11 |
+
from replicalab.scenarios import generate_scenario
|
| 12 |
+
from server.app import ResetRequest
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def test_reset_request_defaults_match_shared_config() -> None:
    """The REST reset schema must default to the shared config values."""
    req = ResetRequest()

    assert req.scenario == DEFAULT_SCENARIO_TEMPLATE
    assert req.difficulty == DEFAULT_DIFFICULTY


def test_generated_scenarios_respect_shared_round_and_budget_caps() -> None:
    """Every template/difficulty combination stays within the shared caps."""
    templates = ("math_reasoning", "ml_benchmark", "finance_trading")
    difficulties = ("easy", "medium", "hard")
    for template in templates:
        for difficulty in difficulties:
            pack = generate_scenario(seed=123, template=template, difficulty=difficulty)
            assert pack.scientist_observation.max_rounds == MAX_ROUNDS
            assert pack.lab_manager_observation.max_rounds == MAX_ROUNDS
            assert pack.lab_manager_observation.budget_total <= MAX_BUDGET


def test_timeout_exports_share_the_same_default_value() -> None:
    """REST session TTL and WS idle timeout are intentionally kept equal."""
    assert SESSION_TTL_SECONDS == WS_IDLE_TIMEOUT_SECONDS
|
tests/test_scenarios.py
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
|
| 5 |
+
from replicalab.scenarios import (
|
| 6 |
+
GOLDEN_SCENARIO_SPECS_PATH,
|
| 7 |
+
available_scenario_families,
|
| 8 |
+
generate_scenario,
|
| 9 |
+
)
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def test_generate_scenario_is_deterministic_for_same_seed() -> None:
    """Identical (seed, template, difficulty) inputs yield identical packs."""
    pack_a = generate_scenario(seed=101, template="math_reasoning", difficulty="easy")
    pack_b = generate_scenario(seed=101, template="math_reasoning", difficulty="easy")

    assert pack_a.model_dump(mode="json") == pack_b.model_dump(mode="json")


def test_generate_scenario_varies_across_seeded_cases() -> None:
    """Different seeds must produce observably different scenarios."""
    pack_a = generate_scenario(seed=101, template="math_reasoning", difficulty="easy")
    pack_b = generate_scenario(seed=102, template="math_reasoning", difficulty="easy")

    assert pack_a.scientist_observation.paper_title != pack_b.scientist_observation.paper_title


def test_available_scenario_families_exposes_three_domain_families() -> None:
    """The registry advertises exactly the three domain families, in order."""
    expected = [
        {"family": family, "difficulties": ["easy", "medium", "hard"]}
        for family in ("math_reasoning", "ml_benchmark", "finance_trading")
    ]
    assert available_scenario_families() == expected


def test_hard_finance_scenario_exposes_unavailable_resource_and_safety_rules() -> None:
    """Hard finance scenarios must surface scarcity and safety restrictions."""
    pack = generate_scenario(seed=303, template="finance_trading", difficulty="hard")
    manager_view = pack.lab_manager_observation

    assert any(not res.available for res in pack.resources)
    assert manager_view.reagents_out_of_stock
    assert manager_view.safety_restrictions


def test_difficulty_levels_mechanically_change_budget_and_constraints() -> None:
    """Budget shrinks and constraint count grows as difficulty rises."""
    levels = ("easy", "medium", "hard")
    packs = {
        level: generate_scenario(seed=202, template="ml_benchmark", difficulty=level)
        for level in levels
    }
    budgets = [packs[level].lab_manager_observation.budget_total for level in levels]
    constraint_counts = [len(packs[level].constraints) for level in levels]

    assert budgets[0] > budgets[1] > budgets[2]
    assert constraint_counts[0] < constraint_counts[1] < constraint_counts[2]


def test_generated_scenarios_keep_unique_constraint_and_resource_keys() -> None:
    """Constraint/resource keys stay unique; grading metadata is populated."""
    for template in ("math_reasoning", "ml_benchmark", "finance_trading"):
        pack = generate_scenario(seed=303, template=template, difficulty="hard")
        constraint_keys = [constraint.key for constraint in pack.constraints]
        resource_keys = [resource.key for resource in pack.resources]
        assert len(set(constraint_keys)) == len(constraint_keys)
        assert len(set(resource_keys)) == len(resource_keys)
        assert pack.hidden_reference_spec.required_elements
        assert pack.allowed_substitutions


def test_golden_scenario_specs_exist_for_manual_prompt_checks() -> None:
    """The golden-spec fixture holds the three known entries, in order."""
    specs = json.loads(GOLDEN_SCENARIO_SPECS_PATH.read_text(encoding="utf-8"))

    assert len(specs) == 3
    assert [spec["id"] for spec in specs] == [
        "golden_math_easy",
        "golden_ml_medium",
        "golden_finance_hard",
    ]


def test_golden_scenarios_match_expected_title_and_domain() -> None:
    """Regenerating each golden spec reproduces its recorded domain/title."""
    specs = json.loads(GOLDEN_SCENARIO_SPECS_PATH.read_text(encoding="utf-8"))

    for spec in specs:
        pack = generate_scenario(
            seed=spec["seed"],
            template=spec["template"],
            difficulty=spec["difficulty"],
        )
        assert pack.domain_id == spec["expected_domain_id"]
        assert spec["expected_title_contains"] in pack.scientist_observation.paper_title
|
tests/test_scientist_policy.py
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import pytest
|
| 4 |
+
|
| 5 |
+
from replicalab.agents.scientist_policy import (
|
| 6 |
+
ScientistOutputParseError,
|
| 7 |
+
build_scientist_system_prompt,
|
| 8 |
+
parse_scientist_output,
|
| 9 |
+
)
|
| 10 |
+
from replicalab.models import ScientistActionType
|
| 11 |
+
from replicalab.scenarios import generate_scenario
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def test_parse_scientist_output_accepts_plain_json() -> None:
    """A bare JSON object parses into a typed request_info action."""
    payload = """
    {
      "action_type": "request_info",
      "sample_size": 0,
      "controls": [],
      "technique": "",
      "duration_days": 0,
      "required_equipment": [],
      "required_reagents": [],
      "questions": ["What compute budget is available?"],
      "rationale": ""
    }
    """

    parsed = parse_scientist_output(payload)

    assert parsed.action_type is ScientistActionType.REQUEST_INFO
    assert parsed.questions == ["What compute budget is available?"]


def test_parse_scientist_output_accepts_fenced_json_with_prose() -> None:
    """A fenced ```json block surrounded by prose is still extracted."""
    payload = """
    I would revise the plan as follows:

    ```json
    {
      "action_type": "revise_protocol",
      "sample_size": 24,
      "controls": ["baseline", "ablation"],
      "technique": "small_scale_backtest",
      "duration_days": 3,
      "required_equipment": ["gpu_node"],
      "required_reagents": [],
      "questions": [],
      "rationale": "Shrink the trial to fit the available compute window."
    }
    ```
    """

    parsed = parse_scientist_output(payload)

    assert parsed.action_type is ScientistActionType.REVISE_PROTOCOL
    assert parsed.technique == "small_scale_backtest"


def test_parse_scientist_output_raises_explicit_error_when_json_is_missing() -> None:
    """Plain prose with no JSON object yields the no_json error code."""
    with pytest.raises(ScientistOutputParseError) as exc_info:
        parse_scientist_output("I need more context before I can answer.")

    error = exc_info.value
    assert error.code == "no_json"
    assert "does not contain a JSON object" in error.message


def test_parse_scientist_output_raises_explicit_error_when_json_is_invalid() -> None:
    """Malformed JSON (trailing comma) yields the invalid_json error code."""
    payload = """
    ```json
    {
      "action_type": "request_info",
      "questions": ["What budget do we have?"],
    }
    ```
    """

    with pytest.raises(ScientistOutputParseError) as exc_info:
        parse_scientist_output(payload)

    error = exc_info.value
    assert error.code == "invalid_json"
    assert "could not be decoded" in error.message


def test_parse_scientist_output_raises_explicit_error_when_schema_is_invalid() -> None:
    """Well-formed JSON that fails model validation yields invalid_action."""
    # request_info with an empty questions list violates the action schema.
    payload = """
    {
      "action_type": "request_info",
      "sample_size": 0,
      "controls": [],
      "technique": "",
      "duration_days": 0,
      "required_equipment": [],
      "required_reagents": [],
      "questions": [],
      "rationale": ""
    }
    """

    with pytest.raises(ScientistOutputParseError) as exc_info:
        parse_scientist_output(payload)

    error = exc_info.value
    assert error.code == "invalid_action"
    assert "ScientistAction validation" in error.message


def test_build_scientist_system_prompt_uses_normalized_scenario_data() -> None:
    """The system prompt embeds the normalized scenario fields and action menu."""
    scenario = generate_scenario(seed=202, template="ml_benchmark", difficulty="medium")

    prompt = build_scientist_system_prompt(scenario)

    assert "You are the Scientist agent in ReplicaLab." in prompt
    assert scenario.task_summary in prompt
    assert scenario.success_criteria[0] in prompt
    assert scenario.resources[0].label in prompt
    assert "action_type values" in prompt
    assert "propose_protocol" in prompt
    assert "request_info" in prompt
|