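# Source: mutationgym-env/examples/baseline_eval.py
# Repository page header captured with this file:
#   avatar alt-text "paradox44's picture", commit message "Initial OpenEnv upload",
#   commit 8850413 (verified).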
from __future__ import annotations
import argparse
import json
from pathlib import Path
from typing import Dict
from mutationgym_env.models import MutationGymAction
from mutationgym_env.server.mutationgym_environment import MutationGymEnvironment
# Hand-written baseline test suites, keyed by task id.
#
# Each value is the full source text of a test module that imports one function
# from a module named ``solution`` and asserts on a handful of inputs. main()
# looks a template up by the task id reported by the environment and submits
# the text as the ``tests_py`` field of a MutationGymAction.
#
# Escapes written as ``\\t`` / ``\\n`` inside these (non-raw) literals become
# the two-character sequences ``\t`` / ``\n`` in the template text, so they are
# interpreted as tab/newline only when the generated test source is parsed.
#
# NOTE(review): in this view the assert lines inside each template start at
# column 0; confirm the on-disk file indents them under their ``def`` line,
# otherwise the generated test modules will not parse.
TEST_TEMPLATES: Dict[str, str] = {
# clamp: below range, lower bound, in range, upper bound, above range.
"task_001": """
from solution import clamp
def test_clamp_bounds():
assert clamp(-1, 0, 10) == 0
assert clamp(0, 0, 10) == 0
assert clamp(5, 0, 10) == 5
assert clamp(10, 0, 10) == 10
assert clamp(11, 0, 10) == 10
""",
# fizzbuzz: multiple of 3, of 5, of both, and of neither.
"task_002": """
from solution import fizzbuzz
def test_fizzbuzz():
assert fizzbuzz(3) == "Fizz"
assert fizzbuzz(5) == "Buzz"
assert fizzbuzz(15) == "FizzBuzz"
assert fizzbuzz(7) == "7"
""",
# safe_divide: ordinary division, negative divisor, and a zero divisor
# that is expected to yield None.
"task_003": """
from solution import safe_divide
def test_safe_divide():
assert safe_divide(10, 2) == 5
assert safe_divide(5, -2) == -2.5
assert safe_divide(1, 0) is None
""",
# normalize_whitespace: inputs containing tab and newline escapes as well
# as plain spaces.
"task_004": """
from solution import normalize_whitespace
def test_normalize_whitespace():
assert normalize_whitespace(" a\\t b ") == "a b"
assert normalize_whitespace("a\\n\\n b") == "a b"
assert normalize_whitespace("a b c") == "a b c"
""",
# unique_preserve_order: duplicates dropped in first-seen order, empty
# list, and a list made of a single repeated value.
"task_005": """
from solution import unique_preserve_order
def test_unique_preserve_order():
assert unique_preserve_order(["a", "b", "a", "c", "b"]) == ["a", "b", "c"]
assert unique_preserve_order([]) == []
assert unique_preserve_order(["x", "x", "x"]) == ["x"]
""",
# median: odd length, even length, single element.
"task_006": """
from solution import median
def test_median():
assert median([3, 1, 2]) == 2
assert median([4, 1, 2, 3]) == 2.5
assert median([10]) == 10
""",
# rotate_right: shift by 1, by 0, by more than the length, by a negative
# amount, and on an empty list.
"task_007": """
from solution import rotate_right
def test_rotate_right():
assert rotate_right([1, 2, 3], 1) == [3, 1, 2]
assert rotate_right([1, 2, 3], 0) == [1, 2, 3]
assert rotate_right([1, 2, 3], 4) == [3, 1, 2]
assert rotate_right([1, 2, 3], -1) == [2, 3, 1]
assert rotate_right([], 3) == []
""",
# count_vowels: upper-case vowels, a word expected to count zero ("sky"),
# and a word with repeated vowels.
"task_008": """
from solution import count_vowels
def test_count_vowels():
assert count_vowels("AEIOU") == 5
assert count_vowels("sky") == 0
assert count_vowels("banana") == 3
""",
}
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the baseline evaluation script.

    Options (all optional):
        --seed      int, default 42.
        --episodes  int, default 8.
        --output    path of the JSON report, default
                    ``outputs/eval_baseline.json``.

    Returns:
        The namespace produced by argparse from ``sys.argv``.
    """
    cli = argparse.ArgumentParser(description="Run baseline MutationGym eval.")

    # (flag, converter, default) for every supported option.
    option_table = (
        ("--seed", int, 42),
        ("--episodes", int, 8),
        ("--output", Path, Path("outputs/eval_baseline.json")),
    )
    for flag, converter, fallback in option_table:
        cli.add_argument(flag, type=converter, default=fallback)

    return cli.parse_args()
def main() -> None:
    """Run the baseline evaluation and write a JSON report.

    For every episode the environment is reset with ``seed + episode``, the
    test template registered for the reported task id is submitted in a single
    finalizing step, and the fields of the step outcome are recorded.

    The report (``{"summary": ..., "results": [...]}``) is written to the
    ``--output`` path, creating parent directories as needed, and the summary
    is also printed to stdout.
    """
    args = parse_args()
    env = MutationGymEnvironment()

    results = []
    killed_total = 0
    mutant_total = 0
    passed_reference_total = 0

    for episode in range(args.episodes):
        # A distinct seed per episode, derived from the base seed.
        obs = env.reset(seed=args.seed + episode)
        task_id = obs.task_id

        tests = TEST_TEMPLATES.get(task_id)
        if tests is None:
            # No hand-written template for this task: submit a trivial smoke
            # test. The newlines must be real ``\n`` characters; a doubled
            # backslash would put a literal backslash + "n" into the source
            # and make the generated test module unparsable.
            # NOTE(review): this imports a name equal to the task id from
            # ``solution``; the templates import function names instead, so
            # confirm that such a name exists for tasks without a template.
            tests = (
                f"from solution import {task_id}\n\n"
                "def test_smoke():\n"
                "    assert True\n"
            )

        # One step per episode: submit the tests and finalize immediately.
        outcome = env.step(MutationGymAction(tests_py=tests, finalize=True))
        results.append(
            {
                "task_id": task_id,
                "reward": outcome.reward,
                "killed": outcome.killed,
                "total_mutants": outcome.total_mutants,
                "passed_reference": outcome.passed_reference,
                "runtime_ms": outcome.runtime_ms,
                "error": outcome.error,
            }
        )
        killed_total += outcome.killed
        mutant_total += outcome.total_mutants
        if outcome.passed_reference:
            passed_reference_total += 1

    # Guard both ratios: with ``--episodes 0`` (or a negative value) the loop
    # never runs and an unguarded division would raise ZeroDivisionError.
    episodes_run = len(results)
    summary = {
        "episodes": args.episodes,
        "avg_kill_rate": (killed_total / mutant_total) if mutant_total else 0.0,
        "reference_pass_rate": (
            (passed_reference_total / episodes_run) if episodes_run else 0.0
        ),
    }
    payload = {"summary": summary, "results": results}

    args.output.parent.mkdir(parents=True, exist_ok=True)
    args.output.write_text(json.dumps(payload, indent=2), encoding="utf-8")
    print(json.dumps(summary, indent=2))
# Run the evaluation only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()