| """Prompt asset tests for AGT 10.""" | |
| from __future__ import annotations | |
| from replicalab.prompts import ( | |
| load_prompt_asset, | |
| load_prompt_template, | |
| render_judge_prompt, | |
| render_lab_manager_prompt, | |
| render_scientist_prompt, | |
| ) | |
| from replicalab.scenarios import generate_scenario | |
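# A fixed seed and difficulty keep the generated scenario deterministic per template.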
def _scenario(template: str = "ml_benchmark"):
    return generate_scenario(seed=42, template=template, difficulty="medium")


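# Smoke check: every role template file must load and carry the shared
# "ReplicaLab" marker; the 100-character floor guards against empty or
# truncated assets.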
def test_load_prompt_template_reads_all_role_files() -> None:
    for role in ("scientist", "lab_manager", "judge"):
        template = load_prompt_template(role)
        assert len(template) > 100
        assert "ReplicaLab" in template


def test_load_oracle_prompt_assets_reads_all_oracle_files() -> None:
    for name in (
        "oracle_world_architect",
        "oracle_adjudicator",
        "oracle_event_injector",
        "oracle_post_mortem",
        "oracle_lab_manager",
    ):
        template = load_prompt_asset(name)
        assert len(template) > 100


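# The render_* tests below pin scenario-specific fields and the bounded
# toolset into the rendered prompt text.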
def test_render_scientist_prompt_injects_task_and_bounded_tools() -> None:
    prompt = render_scientist_prompt(_scenario("ml_benchmark"))
    assert "You are the Scientist agent" in prompt
    assert "search_evidence" in prompt
    assert "run_code_check" in prompt
    assert "inspect_image" in prompt
    assert "Allowed action_type values:" in prompt
    assert "Task:" in prompt
    assert "Domain:" in prompt


def test_render_lab_manager_prompt_injects_grounding_rules() -> None:
    prompt = render_lab_manager_prompt(_scenario("finance_trading"))
    assert "You are the Lab Manager agent" in prompt
    assert "deterministic checker outputs" in prompt
    assert "suggest_alternative" in prompt
    assert "No unrestricted browsing" in prompt
    assert "Available resources:" in prompt


def test_render_judge_prompt_injects_rubric_rules() -> None:
    prompt = render_judge_prompt(_scenario("math_reasoning"))
    assert "You are the Judge agent" in prompt
    assert "Explain rigor, feasibility, and fidelity" in prompt
    assert "Never rescore" in prompt
    assert "top failure reasons" in prompt


def test_rendered_prompts_leave_no_unformatted_placeholders() -> None:
    scenario = _scenario("ml_benchmark")
    prompts = (
        render_scientist_prompt(scenario),
        render_lab_manager_prompt(scenario),
        render_judge_prompt(scenario),
    )
    for prompt in prompts:
        assert "{domain_id}" not in prompt
        assert "{task_summary}" not in prompt
        assert "{constraints}" not in prompt


def test_rendered_prompts_are_domain_neutral_but_scenario_specific() -> None:
    ml_scenario = _scenario("ml_benchmark")
    finance_scenario = _scenario("finance_trading")
    ml_prompt = render_scientist_prompt(ml_scenario)
    finance_prompt = render_scientist_prompt(finance_scenario)
    assert ml_prompt != finance_prompt
    assert ml_scenario.task_summary in ml_prompt
    assert finance_scenario.task_summary in finance_prompt