# File size: 6,005 Bytes | 30bf68a
import unittest
from env.anti_hacking import AntiHackingDetector
from env.graders.deterministic import DeterministicGrader
from env.hidden_tests import HiddenTestRunner
from env.rewards import RewardCalculator
# Reference CI workflow text; the tests in this file pass it as the expected
# configuration when grading candidate fixes.
# NOTE(review): every YAML line starts at column 0 here (no nesting under
# `jobs:`); confirm this flat layout is intended and not lost indentation.
EXPECTED_CONFIG = (
    "\n"
    "name: CI\n"
    "on: [push]\n"
    "jobs:\n"
    "test:\n"
    "runs-on: ubuntu-latest\n"
    "steps:\n"
    "- uses: actions/checkout@v4\n"
    "- run: npm ci\n"
    "- run: npm test\n"
)
# Same workflow text as the reference except that the last step runs the
# misspelled command `npm tset`; used by the tests as an incorrect fix.
# NOTE(review): every YAML line starts at column 0 here (no nesting under
# `jobs:`); confirm this flat layout is intended and not lost indentation.
WRONG_CONFIG = (
    "\n"
    "name: CI\n"
    "on: [push]\n"
    "jobs:\n"
    "test:\n"
    "runs-on: ubuntu-latest\n"
    "steps:\n"
    "- uses: actions/checkout@v4\n"
    "- run: npm ci\n"
    "- run: npm tset\n"
)
# Workflow-like text with the colons dropped after `name`, `steps` and `run`;
# the tests feed it to the grader as a syntactically broken submission.
# NOTE(review): presumably meant to be unparseable YAML, per its name - verify
# against the grader's parser.
BROKEN_YAML = (
    "\n"
    "name CI\n"
    "jobs:\n"
    "test:\n"
    "steps\n"
    "- run npm test\n"
)
class FakeJudge:
    """Stand-in LLM judge that answers every request with the same fixed scores."""

    def evaluate_fix(self, original, fixed, error):
        """Return a new dict of canned scores; all three arguments are ignored."""
        return dict(correctness=0.9, minimalism=0.8, quality=0.9)
class Day2EngineTests(unittest.TestCase):
    """Checks for the grader, anti-hacking detector, hidden-test runner and reward calculator.

    Each test gets fresh collaborators from ``setUp``; the reward calculator is
    wired to a ``FakeJudge`` so no real LLM judge is involved.
    """

    def setUp(self):
        """Create the collaborators and expose them as instance attributes."""
        grader = DeterministicGrader()
        detector = AntiHackingDetector()
        hidden_runner = HiddenTestRunner(grader=grader)

        self.grader = grader
        self.detector = detector
        self.hidden_runner = hidden_runner
        self.reward_calculator = RewardCalculator(
            llm_judge=FakeJudge(),
            anti_hacking_detector=detector,
            deterministic_grader=grader,
            hidden_test_runner=hidden_runner,
        )

    # ------------------------------------------------------------------
    # Deterministic grader
    # ------------------------------------------------------------------

    def test_deterministic_grader_high_for_correct_fix(self):
        # Grading the reference against itself must land in [0.85, 1.0].
        grade = self.grader.grade(EXPECTED_CONFIG, EXPECTED_CONFIG)
        self.assertGreaterEqual(grade, 0.85)
        self.assertLessEqual(grade, 1.0)

    def test_deterministic_grader_low_for_wrong_fix(self):
        # The misspelled test command must score below 0.6.
        self.assertLess(self.grader.grade(WRONG_CONFIG, EXPECTED_CONFIG), 0.6)

    def test_deterministic_grader_penalizes_broken_yaml(self):
        # The colon-less fixture must score below 0.4.
        self.assertLess(self.grader.grade(BROKEN_YAML, EXPECTED_CONFIG), 0.4)

    def test_deterministic_grader_is_reproducible(self):
        # Two identical calls must produce equal scores.
        runs = [self.grader.grade(WRONG_CONFIG, EXPECTED_CONFIG) for _ in range(2)]
        self.assertEqual(runs[0], runs[1])

    # ------------------------------------------------------------------
    # Anti-hacking detector
    # ------------------------------------------------------------------

    def test_anti_hacking_detects_stage_skipping(self):
        # Appending an `if: false` line must yield a negative penalty.
        # NOTE(review): the single space before `if` may be collapsed
        # indentation; the literal is kept exactly as found.
        skipped = "".join((EXPECTED_CONFIG, "\n if: false\n"))
        self.assertLess(self.detector.penalty_stage_skipping(skipped), 0.0)

    def test_anti_hacking_detects_fake_success(self):
        # Swapping the real test command for an echo + exit 0 must be penalized.
        fake_command = 'echo "tests passed" && exit 0'
        faked = EXPECTED_CONFIG.replace("npm test", fake_command)
        self.assertLess(self.detector.penalty_fake_success(faked), 0.0)

    def test_anti_hacking_detects_breaking_pipeline(self):
        # Dropping the `test` stage/job between revisions must be penalized.
        # NOTE(review): both snippets are flat (no YAML nesting), exactly as
        # found; confirm that indentation was not lost upstream.
        before = (
            "\n"
            "stages:\n"
            "- build\n"
            "- test\n"
            "jobs:\n"
            "build:\n"
            "run: npm ci\n"
            "test:\n"
            "run: npm test\n"
        )
        after = (
            "\n"
            "stages:\n"
            "- build\n"
            "jobs:\n"
            "build:\n"
            "run: npm ci\n"
        )
        self.assertLess(self.detector.penalty_breaking_pipeline(before, after), 0.0)

    def test_anti_hacking_detects_excessive_edits(self):
        # 12 changed files / 400 changed lines must be penalized.
        verdict = self.detector.penalty_excessive_edits(
            changed_files_count=12,
            changed_lines_count=400,
        )
        self.assertLess(verdict, 0.0)

    def test_anti_hacking_detects_timeout_abuse(self):
        # A step count of 25 must be penalized.
        self.assertLess(self.detector.penalty_timeout_abuse(step_count=25), 0.0)

    # ------------------------------------------------------------------
    # Hidden tests
    # ------------------------------------------------------------------

    def test_hidden_tests_returns_high_pass_rate_for_good_fix(self):
        # The reference config must pass at least 80% of hidden tests.
        rate = self.hidden_runner.evaluate_fix(
            fixed_config=EXPECTED_CONFIG,
            expected_config=EXPECTED_CONFIG,
        )
        self.assertGreaterEqual(rate, 0.8)

    def test_hidden_tests_returns_lower_pass_rate_for_bad_fix(self):
        # The misspelled config must pass fewer than 80% of hidden tests.
        rate = self.hidden_runner.evaluate_fix(
            fixed_config=WRONG_CONFIG,
            expected_config=EXPECTED_CONFIG,
        )
        self.assertLess(rate, 0.8)

    # ------------------------------------------------------------------
    # Reward calculator
    # ------------------------------------------------------------------

    def test_reward_calculator_positive_for_good_behavior(self):
        # A clean one-line fix that passes everything must earn more than 0.5.
        failure_text = "command not found"
        episode_state = {
            "step_count": 3,
            "previous_config": WRONG_CONFIG,
            "expected_config": EXPECTED_CONFIG,
            "original_config": WRONG_CONFIG,
            "error": failure_text,
        }
        step_result = {
            "logs_analyzed": True,
            "error_diagnosed": True,
            "fix_proposed": True,
            "pipeline_run": True,
            "tests_passed": True,
            "command_succeeded": True,
            "current_config": EXPECTED_CONFIG,
            "expected_config": EXPECTED_CONFIG,
            "changed_files_count": 1,
            "changed_lines_count": 1,
        }
        token_hint = {"broken_token": "npm tset", "fixed_token": "npm test"}

        earned = self.reward_calculator.calculate_step_reward(
            state=episode_state,
            action="validate_fix",
            result=step_result,
            original_config=WRONG_CONFIG,
            fixed_config=EXPECTED_CONFIG,
            error_message=failure_text,
            expected_config=EXPECTED_CONFIG,
            metadata=token_hint,
        )
        self.assertGreater(earned, 0.5)

    def test_reward_calculator_penalizes_hacking(self):
        # Fake success + skipped stage + huge edits + many steps must keep the
        # reward inside [0.0, 0.3].
        fake_command = 'echo "tests passed" && exit 0'
        hacked = EXPECTED_CONFIG.replace("npm test", fake_command) + "\n if: false\n"
        episode_state = {
            "step_count": 25,
            "previous_config": EXPECTED_CONFIG,
            "changed_files_count": 15,
            "changed_lines_count": 500,
        }
        step_result = {
            "current_config": hacked,
            "expected_config": EXPECTED_CONFIG,
            "hacking_attempt": True,
            "changed_files_count": 15,
            "changed_lines_count": 500,
        }

        earned = self.reward_calculator.calculate_step_reward(
            state=episode_state,
            action="edit_config",
            result=step_result,
            original_config=EXPECTED_CONFIG,
            fixed_config=hacked,
            error_message="",
            expected_config=EXPECTED_CONFIG,
        )
        self.assertGreaterEqual(earned, 0.0)
        self.assertLessEqual(earned, 0.3)
# Run the suite through unittest's command-line runner when this file is
# executed directly, not imported.
if __name__ == "__main__":
    unittest.main()
|