File size: 4,136 Bytes
8850413
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
from __future__ import annotations

import argparse
import json
from pathlib import Path
from typing import Dict

from mutationgym_env.models import MutationGymAction
from mutationgym_env.server.mutationgym_environment import MutationGymEnvironment


# Hand-written baseline test modules, keyed by task id.
#
# Each value is the complete source text of a test module: it imports one
# function from `solution` and defines a single `test_*` function made of plain
# asserts.  `main()` looks a template up by the `task_id` of the observation
# returned from `env.reset(...)` and submits the text as `tests_py`.
TEST_TEMPLATES: Dict[str, str] = {
    "task_001": """
from solution import clamp

def test_clamp_bounds():
    assert clamp(-1, 0, 10) == 0
    assert clamp(0, 0, 10) == 0
    assert clamp(5, 0, 10) == 5
    assert clamp(10, 0, 10) == 10
    assert clamp(11, 0, 10) == 10
""",
    "task_002": """
from solution import fizzbuzz

def test_fizzbuzz():
    assert fizzbuzz(3) == "Fizz"
    assert fizzbuzz(5) == "Buzz"
    assert fizzbuzz(15) == "FizzBuzz"
    assert fizzbuzz(7) == "7"
""",
    "task_003": """
from solution import safe_divide

def test_safe_divide():
    assert safe_divide(10, 2) == 5
    assert safe_divide(5, -2) == -2.5
    assert safe_divide(1, 0) is None
""",
    # The doubled backslashes below are intentional: this outer literal is not
    # a raw string, so `\\t` / `\\n` become the two-character sequences `\t` /
    # `\n` in the template text, which are then interpreted as tab / newline
    # escapes when the generated test source itself is parsed.
    "task_004": """
from solution import normalize_whitespace

def test_normalize_whitespace():
    assert normalize_whitespace("  a\\t b  ") == "a b"
    assert normalize_whitespace("a\\n\\n b") == "a b"
    assert normalize_whitespace("a    b   c") == "a b c"
""",
    "task_005": """
from solution import unique_preserve_order

def test_unique_preserve_order():
    assert unique_preserve_order(["a", "b", "a", "c", "b"]) == ["a", "b", "c"]
    assert unique_preserve_order([]) == []
    assert unique_preserve_order(["x", "x", "x"]) == ["x"]
""",
    "task_006": """
from solution import median

def test_median():
    assert median([3, 1, 2]) == 2
    assert median([4, 1, 2, 3]) == 2.5
    assert median([10]) == 10
""",
    "task_007": """
from solution import rotate_right

def test_rotate_right():
    assert rotate_right([1, 2, 3], 1) == [3, 1, 2]
    assert rotate_right([1, 2, 3], 0) == [1, 2, 3]
    assert rotate_right([1, 2, 3], 4) == [3, 1, 2]
    assert rotate_right([1, 2, 3], -1) == [2, 3, 1]
    assert rotate_right([], 3) == []
""",
    "task_008": """
from solution import count_vowels

def test_count_vowels():
    assert count_vowels("AEIOU") == 5
    assert count_vowels("sky") == 0
    assert count_vowels("banana") == 3
""",
}


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Run baseline MutationGym eval.")
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--episodes", type=int, default=8)
    parser.add_argument("--output", type=Path, default=Path("outputs/eval_baseline.json"))
    return parser.parse_args()


def main() -> None:
    """Run the baseline evaluation and write a JSON report.

    For every episode the environment is reset with ``seed + episode``, the
    hand-written test module for the returned task (or a trivial smoke test
    when no template exists) is submitted with ``finalize=True``, and the
    outcome fields are recorded.  The payload ``{"summary", "results"}`` is
    written to the ``--output`` path (parent directories are created) and the
    summary is printed to stdout as JSON.
    """
    args = parse_args()
    env = MutationGymEnvironment()

    results = []
    killed_total = 0
    mutant_total = 0
    passed_reference_total = 0

    for episode in range(args.episodes):
        obs = env.reset(seed=args.seed + episode)
        task_id = obs.task_id
        tests = TEST_TEMPLATES.get(task_id)
        if tests is None:
            # The fallback must contain real newlines: with escaped backslashes
            # the generated source is a single line holding literal "\n" text,
            # which is not valid Python.
            # NOTE(review): this imports a name equal to the task id from
            # `solution`; confirm such a name exists, otherwise the smoke test
            # fails at import time.
            tests = (
                f"from solution import {task_id}\n\n"
                "def test_smoke():\n"
                "    assert True\n"
            )
        outcome = env.step(MutationGymAction(tests_py=tests, finalize=True))
        results.append(
            {
                "task_id": task_id,
                "reward": outcome.reward,
                "killed": outcome.killed,
                "total_mutants": outcome.total_mutants,
                "passed_reference": outcome.passed_reference,
                "runtime_ms": outcome.runtime_ms,
                "error": outcome.error,
            }
        )
        killed_total += outcome.killed
        mutant_total += outcome.total_mutants
        if outcome.passed_reference:
            passed_reference_total += 1

    # Divide by the number of episodes actually run so that `--episodes 0`
    # (or a negative value, which runs nothing) reports 0.0 instead of raising
    # ZeroDivisionError; this mirrors the guard on avg_kill_rate.
    episodes_run = len(results)
    summary = {
        "episodes": args.episodes,
        "avg_kill_rate": (killed_total / mutant_total) if mutant_total else 0.0,
        "reference_pass_rate": (
            (passed_reference_total / episodes_run) if episodes_run else 0.0
        ),
    }

    payload = {"summary": summary, "results": results}
    args.output.parent.mkdir(parents=True, exist_ok=True)
    args.output.write_text(json.dumps(payload, indent=2), encoding="utf-8")

    print(json.dumps(summary, indent=2))


if __name__ == "__main__":
    main()