from __future__ import annotations import json import httpx import pytest from replicalab.agents.scientist_policy import ( RetryMetadata, ScientistCallResult, ScientistOutputParseError, build_anthropic_scientist_policy, build_baseline_scientist_action, build_ollama_scientist_policy, build_scientist_system_prompt, call_scientist_with_retry, format_scientist_observation, parse_scientist_output, ) from replicalab.models import ( ConversationEntry, Protocol, ScientistActionType, ScientistObservation, ) from replicalab.scenarios import generate_scenario # --------------------------------------------------------------------------- # Shared valid JSON payloads for retry tests # --------------------------------------------------------------------------- _VALID_REQUEST_INFO_JSON = """{ "action_type": "request_info", "sample_size": 0, "controls": [], "technique": "", "duration_days": 0, "required_equipment": [], "required_reagents": [], "questions": ["What compute budget is available?"], "rationale": "" }""" def test_parse_scientist_output_accepts_plain_json() -> None: raw_text = """ { "action_type": "request_info", "sample_size": 0, "controls": [], "technique": "", "duration_days": 0, "required_equipment": [], "required_reagents": [], "questions": ["What compute budget is available?"], "rationale": "" } """ action = parse_scientist_output(raw_text) assert action.action_type is ScientistActionType.REQUEST_INFO assert action.questions == ["What compute budget is available?"] def test_parse_scientist_output_accepts_fenced_json_with_prose() -> None: raw_text = """ I would revise the plan as follows: ```json { "action_type": "revise_protocol", "sample_size": 24, "controls": ["baseline", "ablation"], "technique": "small_scale_backtest", "duration_days": 3, "required_equipment": ["gpu_node"], "required_reagents": [], "questions": [], "rationale": "Shrink the trial to fit the available compute window." } ``` """ action = parse_scientist_output(raw_text) assert action.action_type is ScientistActionType.REVISE_PROTOCOL assert action.technique == "small_scale_backtest" def test_parse_scientist_output_raises_explicit_error_when_json_is_missing() -> None: with pytest.raises(ScientistOutputParseError) as exc_info: parse_scientist_output("I need more context before I can answer.") assert exc_info.value.code == "no_json" assert "does not contain a JSON object" in exc_info.value.message def test_parse_scientist_output_raises_explicit_error_when_json_is_invalid() -> None: raw_text = """ ```json { "action_type": "request_info", "questions": ["What budget do we have?"], } ``` """ with pytest.raises(ScientistOutputParseError) as exc_info: parse_scientist_output(raw_text) assert exc_info.value.code == "invalid_json" assert "could not be decoded" in exc_info.value.message def test_parse_scientist_output_raises_explicit_error_when_schema_is_invalid() -> None: raw_text = """ { "action_type": "request_info", "sample_size": 0, "controls": [], "technique": "", "duration_days": 0, "required_equipment": [], "required_reagents": [], "questions": [], "rationale": "" } """ with pytest.raises(ScientistOutputParseError) as exc_info: parse_scientist_output(raw_text) assert exc_info.value.code == "invalid_action" assert "ScientistAction validation" in exc_info.value.message def test_build_scientist_system_prompt_uses_normalized_scenario_data() -> None: scenario = generate_scenario(seed=202, template="ml_benchmark", difficulty="medium") prompt = build_scientist_system_prompt(scenario) assert "You are the Scientist agent in ReplicaLab." in prompt assert scenario.task_summary in prompt assert scenario.success_criteria[0] in prompt assert scenario.resources[0].label in prompt assert "action_type values" in prompt assert "propose_protocol" in prompt assert "request_info" in prompt # --------------------------------------------------------------------------- # AGT 02 — format_scientist_observation # --------------------------------------------------------------------------- def _base_observation(**overrides) -> ScientistObservation: defaults = { "paper_title": "Test Paper Title", "paper_hypothesis": "X improves Y.", "paper_method": "Run benchmark Z.", "paper_key_finding": "10% improvement.", "experiment_goal": "Replicate the 10% improvement.", "conversation_history": [], "current_protocol": None, "round_number": 0, "max_rounds": 6, } defaults.update(overrides) return ScientistObservation(**defaults) def test_format_observation_empty_history_no_protocol() -> None: obs = _base_observation() result = format_scientist_observation(obs) assert "Round 0 of 6" in result assert "Test Paper Title" in result assert "X improves Y." in result assert "Run benchmark Z." in result assert "10% improvement." in result assert "Replicate the 10% improvement." in result assert "No conversation history yet" in result assert "No protocol has been proposed yet" in result assert "ScientistAction schema reminder:" in result assert ( "allowed action_type values: propose_protocol, revise_protocol, " "request_info, accept" ) in result assert "include all ScientistAction fields in every response" in result assert "Respond with exactly one JSON object" in result def test_format_observation_with_history_and_protocol() -> None: history = [ ConversationEntry( role="scientist", message="I propose a grid search protocol.", round_number=1, action_type="propose_protocol", ), ConversationEntry( role="lab_manager", message="The compute cluster is booked. Consider a smaller run.", round_number=1, action_type="suggest_alternative", ), ] protocol = Protocol( sample_size=24, controls=["baseline", "ablation"], technique="grid_search", duration_days=3, required_equipment=["gpu_node"], required_reagents=["benchmark_dataset"], rationale="Standard hyperparameter sweep.", ) obs = _base_observation( conversation_history=history, current_protocol=protocol, round_number=2, ) result = format_scientist_observation(obs) assert "Round 2 of 6" in result assert "Conversation so far:" in result assert "[SCIENTIST r1 [propose_protocol]]" in result assert "I propose a grid search protocol." in result assert "[LAB_MANAGER r1 [suggest_alternative]]" in result assert "compute cluster is booked" in result assert "Current protocol:" in result assert "technique: grid_search" in result assert "sample_size: 24" in result assert "controls: baseline, ablation" in result assert "duration_days: 3" in result assert "gpu_node" in result assert "benchmark_dataset" in result assert "Standard hyperparameter sweep." in result def test_format_observation_stable_section_order() -> None: obs = _base_observation( conversation_history=[ ConversationEntry( role="scientist", message="First proposal.", round_number=1, action_type="propose_protocol", ), ], current_protocol=Protocol( sample_size=10, controls=["ctrl"], technique="method_a", duration_days=2, required_equipment=["tool_a"], required_reagents=["reagent_a"], rationale="Simple test.", ), round_number=1, ) result = format_scientist_observation(obs) # Sections must appear in this fixed order round_pos = result.index("Round 1 of 6") paper_pos = result.index("Paper:") history_pos = result.index("Conversation so far:") protocol_pos = result.index("Current protocol:") action_pos = result.index("Respond with exactly one JSON") assert round_pos < paper_pos < history_pos < protocol_pos < action_pos def test_format_observation_history_entry_without_action_type() -> None: obs = _base_observation( conversation_history=[ ConversationEntry( role="system", message="Episode started.", round_number=0, action_type=None, ), ], ) result = format_scientist_observation(obs) assert "[SYSTEM r0]:" in result assert "Episode started." in result def test_format_observation_from_generated_scenario() -> None: scenario = generate_scenario(seed=77, template="finance_trading", difficulty="easy") obs = scenario.scientist_observation result = format_scientist_observation(obs) assert obs.paper_title in result assert obs.experiment_goal in result assert "Round 0" in result assert "No conversation history yet" in result assert "Respond with exactly one JSON" in result # --------------------------------------------------------------------------- # AGT 03 — call_scientist_with_retry # --------------------------------------------------------------------------- def _make_system_prompt() -> str: scenario = generate_scenario(seed=1, template="math_reasoning", difficulty="easy") return build_scientist_system_prompt(scenario) def test_retry_success_on_first_try() -> None: def gen_fn(messages: list[dict[str, str]]) -> str: return _VALID_REQUEST_INFO_JSON obs = _base_observation() result = call_scientist_with_retry(gen_fn, _make_system_prompt(), obs) assert isinstance(result, ScientistCallResult) assert result.action.action_type is ScientistActionType.REQUEST_INFO assert result.metadata.attempt_count == 1 assert result.metadata.retry_count == 0 assert result.metadata.last_error_code is None assert result.metadata.last_error_message is None def test_retry_malformed_json_then_valid() -> None: call_count = 0 def gen_fn(messages: list[dict[str, str]]) -> str: nonlocal call_count call_count += 1 if call_count == 1: return '{"action_type": "request_info", trailing garbage' return _VALID_REQUEST_INFO_JSON obs = _base_observation() result = call_scientist_with_retry(gen_fn, _make_system_prompt(), obs) assert result.action.action_type is ScientistActionType.REQUEST_INFO assert result.metadata.attempt_count == 2 assert result.metadata.retry_count == 1 assert result.metadata.last_error_code == "invalid_json" def test_retry_invalid_action_then_valid() -> None: # First attempt: valid JSON but questions is empty for request_info invalid_json = """{ "action_type": "request_info", "sample_size": 0, "controls": [], "technique": "", "duration_days": 0, "required_equipment": [], "required_reagents": [], "questions": [], "rationale": "" }""" call_count = 0 def gen_fn(messages: list[dict[str, str]]) -> str: nonlocal call_count call_count += 1 if call_count == 1: return invalid_json return _VALID_REQUEST_INFO_JSON obs = _base_observation() result = call_scientist_with_retry(gen_fn, _make_system_prompt(), obs) assert result.action.action_type is ScientistActionType.REQUEST_INFO assert result.metadata.attempt_count == 2 assert result.metadata.retry_count == 1 assert result.metadata.last_error_code == "invalid_action" assert "ScientistAction validation" in result.metadata.last_error_message def test_retry_exhausted_raises_last_error() -> None: def gen_fn(messages: list[dict[str, str]]) -> str: return "I cannot produce JSON right now." obs = _base_observation() with pytest.raises(ScientistOutputParseError) as exc_info: call_scientist_with_retry(gen_fn, _make_system_prompt(), obs, max_retries=2) assert exc_info.value.code == "no_json" def test_retry_correction_message_includes_parser_error() -> None: """The correction prompt sent to the model must include the specific error.""" captured_messages: list[list[dict[str, str]]] = [] call_count = 0 def gen_fn(messages: list[dict[str, str]]) -> str: nonlocal call_count call_count += 1 captured_messages.append(list(messages)) if call_count == 1: return "Just some prose, no JSON here." return _VALID_REQUEST_INFO_JSON obs = _base_observation() call_scientist_with_retry(gen_fn, _make_system_prompt(), obs) # Second call should have 4 messages: system, user, assistant (bad), user (correction) assert len(captured_messages) == 2 retry_messages = captured_messages[1] assert len(retry_messages) == 4 assert retry_messages[2]["role"] == "assistant" assert retry_messages[2]["content"] == "Just some prose, no JSON here." assert retry_messages[3]["role"] == "user" correction = retry_messages[3]["content"] assert "did not contain a JSON object" in correction assert "No markdown fences, no prose" in correction def test_retry_correction_for_invalid_action_includes_validation_detail() -> None: """invalid_action correction must include the schema validation message.""" captured_messages: list[list[dict[str, str]]] = [] invalid_json = """{ "action_type": "request_info", "sample_size": 0, "controls": [], "technique": "", "duration_days": 0, "required_equipment": [], "required_reagents": [], "questions": [], "rationale": "" }""" call_count = 0 def gen_fn(messages: list[dict[str, str]]) -> str: nonlocal call_count call_count += 1 captured_messages.append(list(messages)) if call_count == 1: return invalid_json return _VALID_REQUEST_INFO_JSON obs = _base_observation() call_scientist_with_retry(gen_fn, _make_system_prompt(), obs) retry_messages = captured_messages[1] correction = retry_messages[3]["content"] assert "failed ScientistAction validation" in correction assert "Fix the validation error" in correction def test_retry_metadata_serializable() -> None: def gen_fn(messages: list[dict[str, str]]) -> str: return _VALID_REQUEST_INFO_JSON obs = _base_observation() result = call_scientist_with_retry(gen_fn, _make_system_prompt(), obs) dumped = result.metadata.model_dump_json() restored = RetryMetadata.model_validate_json(dumped) assert restored.attempt_count == 1 assert restored.retry_count == 0 # --------------------------------------------------------------------------- # AGT 04 - build_baseline_scientist_action # --------------------------------------------------------------------------- def test_baseline_scientist_proposes_protocol_for_fresh_observation() -> None: action = build_baseline_scientist_action(_base_observation()) assert action.action_type is ScientistActionType.PROPOSE_PROTOCOL assert action.sample_size >= 1 assert action.duration_days >= 1 assert action.questions == [] assert action.rationale def test_baseline_scientist_accepts_existing_protocol_without_blocker() -> None: obs = _base_observation( current_protocol=Protocol( sample_size=10, controls=["baseline_check"], technique="published_split_replication", duration_days=2, required_equipment=[], required_reagents=[], rationale="Initial protocol is already in place.", ), conversation_history=[ ConversationEntry( role="lab_manager", message="The current plan remains feasible.", round_number=1, action_type="report_feasibility", ) ], round_number=1, ) action = build_baseline_scientist_action(obs) assert action.action_type is ScientistActionType.ACCEPT assert action.sample_size == 0 assert action.controls == [] def test_baseline_scientist_revises_when_latest_feedback_has_blocker() -> None: obs = _base_observation( current_protocol=Protocol( sample_size=12, controls=["published_split_check", "heldout_evaluation"], technique="published_split_replication", duration_days=3, required_equipment=[], required_reagents=[], rationale="Original scope is full-size.", ), conversation_history=[ ConversationEntry( role="lab_manager", message="The current GPU plan is booked, so the schedule is too tight.", round_number=1, action_type="suggest_alternative", ) ], round_number=1, ) action = build_baseline_scientist_action(obs) assert action.action_type is ScientistActionType.REVISE_PROTOCOL assert action.sample_size == 6 assert action.duration_days == 2 assert "latest Lab Manager concern" in action.rationale def test_baseline_scientist_finishes_stub_episode_without_crashing() -> None: from server.app import _StubEnv env = _StubEnv() first_observation = env.reset( seed=14, scenario="ml_benchmark", difficulty="easy", ).scientist assert first_observation is not None first_action = build_baseline_scientist_action(first_observation) first_step = env.step(first_action) assert first_step.done is False assert first_step.observation is not None assert first_step.observation.scientist is not None second_action = build_baseline_scientist_action(first_step.observation.scientist) second_step = env.step(second_action) assert second_step.done is True assert second_step.info.agreement_reached is True # --------------------------------------------------------------------------- # AGT 08 — Extended prompt, parser, formatter, and baseline coverage # --------------------------------------------------------------------------- # --- Parser happy paths --- def test_parse_scientist_output_accepts_propose_protocol() -> None: raw_text = """{ "action_type": "propose_protocol", "sample_size": 48, "controls": ["vehicle_control", "positive_control"], "technique": "wst1_assay", "duration_days": 5, "required_equipment": ["plate_reader"], "required_reagents": ["wst1", "dmso"], "questions": [], "rationale": "Standard viability assay with two controls." }""" action = parse_scientist_output(raw_text) assert action.action_type is ScientistActionType.PROPOSE_PROTOCOL assert action.sample_size == 48 assert action.technique == "wst1_assay" assert action.controls == ["vehicle_control", "positive_control"] assert action.questions == [] def test_parse_scientist_output_accepts_accept_action() -> None: raw_text = """{ "action_type": "accept", "sample_size": 0, "controls": [], "technique": "", "duration_days": 0, "required_equipment": [], "required_reagents": [], "questions": [], "rationale": "" }""" action = parse_scientist_output(raw_text) assert action.action_type is ScientistActionType.ACCEPT assert action.sample_size == 0 assert action.rationale == "" def test_parse_scientist_output_accepts_prose_wrapped_json() -> None: raw_text = ( "After reviewing the constraints I think a request is in order.\n\n" '{"action_type": "request_info", "sample_size": 0, ' '"controls": [], "technique": "", "duration_days": 0, ' '"required_equipment": [], "required_reagents": [], ' '"questions": ["Is the GPU available?"], "rationale": ""}\n\n' "That should clarify the compute situation." ) action = parse_scientist_output(raw_text) assert action.action_type is ScientistActionType.REQUEST_INFO assert action.questions == ["Is the GPU available?"] # --- Parser edge cases --- def test_parse_scientist_output_raises_on_empty_string() -> None: with pytest.raises(ScientistOutputParseError) as exc_info: parse_scientist_output("") assert exc_info.value.code == "no_json" def test_parse_scientist_output_raises_on_whitespace_only() -> None: with pytest.raises(ScientistOutputParseError) as exc_info: parse_scientist_output(" \n\t ") assert exc_info.value.code == "no_json" def test_parse_scientist_output_raises_on_json_list() -> None: # The parser's brace extractor finds the inner object from the list, # so this surfaces as an invalid_action (missing required fields) # rather than an invalid_json error. with pytest.raises(ScientistOutputParseError) as exc_info: parse_scientist_output('[{"action_type": "accept"}]') assert exc_info.value.code == "invalid_action" def test_parse_scientist_output_raises_on_extra_forbidden_keys() -> None: raw_text = """{ "action_type": "accept", "sample_size": 0, "controls": [], "technique": "", "duration_days": 0, "required_equipment": [], "required_reagents": [], "questions": [], "rationale": "", "secret_field": "should not be here" }""" with pytest.raises(ScientistOutputParseError) as exc_info: parse_scientist_output(raw_text) assert exc_info.value.code == "invalid_action" assert exc_info.value.parsed_payload is not None assert "secret_field" in exc_info.value.parsed_payload def test_parse_error_to_dict_serialization() -> None: try: parse_scientist_output("no json here") except ScientistOutputParseError as exc: result = exc.to_dict() assert result["code"] == "no_json" assert result["raw_text"] == "no json here" assert result["parsed_payload"] is None assert "message" in result else: pytest.fail("Expected ScientistOutputParseError") def test_parse_error_to_dict_with_parsed_payload() -> None: raw_text = """{ "action_type": "request_info", "sample_size": 0, "controls": [], "technique": "", "duration_days": 0, "required_equipment": [], "required_reagents": [], "questions": [], "rationale": "" }""" try: parse_scientist_output(raw_text) except ScientistOutputParseError as exc: result = exc.to_dict() assert result["code"] == "invalid_action" assert result["parsed_payload"] is not None assert result["parsed_payload"]["action_type"] == "request_info" else: pytest.fail("Expected ScientistOutputParseError") # --- System prompt: domain coverage --- def test_system_prompt_math_domain() -> None: scenario = generate_scenario(seed=10, template="math_reasoning", difficulty="easy") prompt = build_scientist_system_prompt(scenario) assert "Domain: mathematics" in prompt assert scenario.task_summary in prompt assert "You are the Scientist agent" in prompt def test_system_prompt_finance_domain() -> None: scenario = generate_scenario(seed=10, template="finance_trading", difficulty="easy") prompt = build_scientist_system_prompt(scenario) assert "Domain: finance_trading" in prompt assert scenario.task_summary in prompt def test_system_prompt_ml_domain() -> None: scenario = generate_scenario(seed=10, template="ml_benchmark", difficulty="easy") prompt = build_scientist_system_prompt(scenario) assert "Domain: machine_learning" in prompt assert scenario.task_summary in prompt def test_system_prompt_accepts_dict_input() -> None: scenario = generate_scenario(seed=5, template="math_reasoning", difficulty="easy") pack_dict = scenario.model_dump() prompt = build_scientist_system_prompt(pack_dict) assert "You are the Scientist agent" in prompt assert scenario.task_summary in prompt assert "Domain: mathematics" in prompt # --- System prompt: bounded-tool policy assertions --- def test_system_prompt_contains_bounded_tool_policy() -> None: scenario = generate_scenario(seed=1, template="math_reasoning", difficulty="easy") prompt = build_scientist_system_prompt(scenario) assert "search_evidence" in prompt assert "run_code_check" in prompt assert "inspect_image" in prompt def test_system_prompt_bounded_tool_policy_rules() -> None: scenario = generate_scenario(seed=1, template="ml_benchmark", difficulty="medium") prompt = build_scientist_system_prompt(scenario) assert "No unrestricted web browsing" in prompt assert "No audio" in prompt assert "do not override constraints" in prompt or "Tools do not override constraints" in prompt def test_system_prompt_bounded_tool_policy_present_in_all_domains() -> None: for template in ("math_reasoning", "ml_benchmark", "finance_trading"): scenario = generate_scenario(seed=42, template=template, difficulty="easy") prompt = build_scientist_system_prompt(scenario) assert "Bounded tool policy" in prompt, f"Missing in {template}" assert "search_evidence" in prompt, f"Missing search_evidence in {template}" assert "run_code_check" in prompt, f"Missing run_code_check in {template}" assert "inspect_image" in prompt, f"Missing inspect_image in {template}" # --- System prompt: role-boundary assertions --- def test_system_prompt_contains_role_boundaries() -> None: scenario = generate_scenario(seed=1, template="math_reasoning", difficulty="easy") prompt = build_scientist_system_prompt(scenario) assert "do not invent resources" in prompt assert "do not assume access to hidden ground truth" in prompt.lower() or \ "hidden ground truth" in prompt def test_system_prompt_contains_output_contract() -> None: scenario = generate_scenario(seed=1, template="math_reasoning", difficulty="easy") prompt = build_scientist_system_prompt(scenario) assert "Output contract" in prompt assert "exactly one JSON object" in prompt assert "no extra keys" in prompt # --- Observation formatter edge cases --- def test_format_observation_final_round() -> None: obs = _base_observation(round_number=5, max_rounds=6) result = format_scientist_observation(obs) assert "Round 5 of 6" in result assert "Respond with exactly one JSON" in result def test_format_observation_protocol_with_empty_lists() -> None: protocol = Protocol( sample_size=1, controls=[], technique="minimal_check", duration_days=1, required_equipment=[], required_reagents=[], rationale="Minimal protocol.", ) obs = _base_observation(current_protocol=protocol, round_number=1) result = format_scientist_observation(obs) assert "Current protocol:" in result assert "technique: minimal_check" in result assert "controls: (none)" in result assert "required_equipment: (none)" in result assert "required_reagents: (none)" in result # --- Baseline: domain inference --- def test_baseline_scientist_infers_ml_domain() -> None: obs = _base_observation( paper_title="Reproducing CIFAR-10 accuracy with ResNet", paper_method="Train on CIFAR dataset with GPU", experiment_goal="Match the published benchmark accuracy.", ) action = build_baseline_scientist_action(obs) assert action.action_type is ScientistActionType.PROPOSE_PROTOCOL assert action.technique == "published_split_replication" def test_baseline_scientist_infers_finance_domain() -> None: obs = _base_observation( paper_title="Offline backtest of SPY mean-reversion", paper_method="Daily bar backtest with slippage modeling", experiment_goal="Evaluate Sharpe ratio under drawdown limits.", ) action = build_baseline_scientist_action(obs) assert action.action_type is ScientistActionType.PROPOSE_PROTOCOL assert action.technique == "offline_backtest_workflow" def test_baseline_scientist_infers_math_domain() -> None: obs = _base_observation( paper_title="Planning a proof of AM-GM inequality", paper_method="Algebraic manipulation with induction.", experiment_goal="Verify the proof outline.", ) action = build_baseline_scientist_action(obs) assert action.action_type is ScientistActionType.PROPOSE_PROTOCOL assert action.technique == "structured_proof_outline" # --- Baseline: forced accept at final round --- def test_baseline_scientist_accepts_at_final_round_even_with_blocker() -> None: obs = _base_observation( current_protocol=Protocol( sample_size=20, controls=["ctrl"], technique="method_a", duration_days=5, required_equipment=[], required_reagents=[], rationale="Full scope plan.", ), conversation_history=[ ConversationEntry( role="lab_manager", message="Budget is tight and equipment is booked.", round_number=4, action_type="suggest_alternative", ), ], round_number=5, max_rounds=6, ) action = build_baseline_scientist_action(obs) assert action.action_type is ScientistActionType.ACCEPT def test_build_anthropic_scientist_policy_calls_messages_api() -> None: captured: dict[str, object] = {} def handler(request: httpx.Request) -> httpx.Response: captured["headers"] = dict(request.headers) captured["payload"] = json.loads(request.content.decode("utf-8")) return httpx.Response( 200, json={ "content": [ { "type": "text", "text": _VALID_REQUEST_INFO_JSON, } ] }, ) client = httpx.Client(transport=httpx.MockTransport(handler)) policy = build_anthropic_scientist_policy( api_key="test-key", model="claude-test", client=client, base_url="https://api.anthropic.com/v1/messages", ) action = policy( _base_observation(), scenario="ml_benchmark", difficulty="medium", ) assert action.action_type is ScientistActionType.REQUEST_INFO payload = captured["payload"] assert isinstance(payload, dict) assert payload["model"] == "claude-test" assert payload["temperature"] == 0.0 assert payload["messages"][0]["role"] == "user" headers = captured["headers"] assert isinstance(headers, dict) assert headers["x-api-key"] == "test-key" def test_build_ollama_scientist_policy_calls_local_chat_api() -> None: captured: dict[str, object] = {} def handler(request: httpx.Request) -> httpx.Response: captured["payload"] = json.loads(request.content.decode("utf-8")) return httpx.Response( 200, json={ "message": { "role": "assistant", "content": _VALID_REQUEST_INFO_JSON, } }, ) client = httpx.Client(transport=httpx.MockTransport(handler)) policy = build_ollama_scientist_policy( model="glm-5:cloud", client=client, base_url="http://127.0.0.1:11434/api/chat", ) action = policy( _base_observation(), scenario="finance_trading", difficulty="medium", ) assert action.action_type is ScientistActionType.REQUEST_INFO payload = captured["payload"] assert isinstance(payload, dict) assert payload["model"] == "glm-5:cloud" assert payload["stream"] is False assert payload["format"] == "json" assert payload["messages"][0]["role"] == "system"