"""Prompt asset tests for AGT 10."""

from __future__ import annotations

from replicalab.prompts import (
    load_prompt_asset,
    load_prompt_template,
    render_judge_prompt,
    render_lab_manager_prompt,
    render_scientist_prompt,
)
from replicalab.scenarios import generate_scenario

def _scenario(template: str = "ml_benchmark"):
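    """Build a scenario from a fixed seed so rendered prompts stay reproducible."""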
    return generate_scenario(seed=42, template=template, difficulty="medium")

def test_load_prompt_template_reads_all_role_files() -> None:
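    """Each role template should load with substantial content that names ReplicaLab."""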
    for role in ("scientist", "lab_manager", "judge"):
        template = load_prompt_template(role)
        assert len(template) > 100
        assert "ReplicaLab" in template

def test_load_oracle_prompt_assets_reads_all_oracle_files() -> None:
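    """Each oracle prompt asset should load with non-trivial content."""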
    for name in (
        "oracle_world_architect",
        "oracle_adjudicator",
        "oracle_event_injector",
        "oracle_post_mortem",
        "oracle_lab_manager",
    ):
        template = load_prompt_asset(name)
        assert len(template) > 100

def test_render_scientist_prompt_injects_task_and_bounded_tools() -> None:
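    """The scientist prompt should embed the task fields and the bounded tool list."""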
    prompt = render_scientist_prompt(_scenario("ml_benchmark"))
    assert "You are the Scientist agent" in prompt
    assert "search_evidence" in prompt
    assert "run_code_check" in prompt
    assert "inspect_image" in prompt
    assert "Allowed action_type values:" in prompt
    assert "Task:" in prompt
    assert "Domain:" in prompt

def test_render_lab_manager_prompt_injects_grounding_rules() -> None:
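    """The lab manager prompt should state its grounding and browsing restrictions."""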
    prompt = render_lab_manager_prompt(_scenario("finance_trading"))
    assert "You are the Lab Manager agent" in prompt
    assert "deterministic checker outputs" in prompt
    assert "suggest_alternative" in prompt
    assert "No unrestricted browsing" in prompt
    assert "Available resources:" in prompt

def test_render_judge_prompt_injects_rubric_rules() -> None:
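    """The judge prompt should state its rubric and no-rescore rules."""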
    prompt = render_judge_prompt(_scenario("math_reasoning"))
    assert "You are the Judge agent" in prompt
    assert "Explain rigor, feasibility, and fidelity" in prompt
    assert "Never rescore" in prompt
    assert "top failure reasons" in prompt

def test_rendered_prompts_leave_no_unformatted_placeholders() -> None:
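    """No raw format placeholder should survive rendering in any role prompt."""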
    scenario = _scenario("ml_benchmark")
    prompts = (
        render_scientist_prompt(scenario),
        render_lab_manager_prompt(scenario),
        render_judge_prompt(scenario),
    )
    for prompt in prompts:
        assert "{domain_id}" not in prompt
        assert "{task_summary}" not in prompt
        assert "{constraints}" not in prompt

def test_rendered_prompts_are_domain_neutral_but_scenario_specific() -> None:
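    """One shared template, two scenarios: prompts differ and embed their own task summary."""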
    ml_scenario = _scenario("ml_benchmark")
    finance_scenario = _scenario("finance_trading")
    ml_prompt = render_scientist_prompt(ml_scenario)
    finance_prompt = render_scientist_prompt(finance_scenario)
    assert ml_prompt != finance_prompt
    assert ml_scenario.task_summary in ml_prompt
    assert finance_scenario.task_summary in finance_prompt