Spaces:
Sleeping
Sleeping
| import unittest | |
| from env.anti_hacking import AntiHackingDetector | |
| from env.graders.deterministic import DeterministicGrader | |
| from env.hidden_tests import HiddenTestRunner | |
| from env.rewards import RewardCalculator | |
# Fixture configs used across the grading tests.
# EXPECTED_CONFIG is the canonical "correct" CI workflow; WRONG_CONFIG differs
# only by the `npm tset` typo; BROKEN_YAML is structurally invalid on purpose.
EXPECTED_CONFIG = """
name: CI
on: [push]
jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - run: npm ci
      - run: npm test
"""

WRONG_CONFIG = """
name: CI
on: [push]
jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - run: npm ci
      - run: npm tset
"""

BROKEN_YAML = """
name CI
jobs:
  test:
    steps
    - run npm test
"""
class FakeJudge:
    """Stub LLM judge returning fixed, favorable scores for any fix."""

    def evaluate_fix(self, original, fixed, error):
        """Ignore all inputs and report a constant high-quality verdict."""
        scores = dict.fromkeys(("correctness", "quality"), 0.9)
        scores["minimalism"] = 0.8
        return scores
class Day2EngineTests(unittest.TestCase):
    """End-to-end checks for the grader, anti-hacking detector, hidden tests,
    and reward calculator working together."""

    def setUp(self):
        """Build a fresh grading/reward pipeline before each test."""
        self.grader = DeterministicGrader()
        self.detector = AntiHackingDetector()
        self.hidden_runner = HiddenTestRunner(grader=self.grader)
        self.reward_calculator = RewardCalculator(
            llm_judge=FakeJudge(),
            anti_hacking_detector=self.detector,
            deterministic_grader=self.grader,
            hidden_test_runner=self.hidden_runner,
        )

    def test_deterministic_grader_high_for_correct_fix(self):
        """An exact match should score near the top of the [0, 1] range."""
        grade = self.grader.grade(EXPECTED_CONFIG, EXPECTED_CONFIG)
        self.assertGreaterEqual(grade, 0.85)
        self.assertLessEqual(grade, 1.0)

    def test_deterministic_grader_low_for_wrong_fix(self):
        """The `npm tset` typo must drag the score well below passing."""
        grade = self.grader.grade(WRONG_CONFIG, EXPECTED_CONFIG)
        self.assertLess(grade, 0.6)

    def test_deterministic_grader_penalizes_broken_yaml(self):
        """Structurally invalid YAML should score lower than a mere typo."""
        grade = self.grader.grade(BROKEN_YAML, EXPECTED_CONFIG)
        self.assertLess(grade, 0.4)

    def test_deterministic_grader_is_reproducible(self):
        """Grading the same pair twice must yield the identical score."""
        run_one = self.grader.grade(WRONG_CONFIG, EXPECTED_CONFIG)
        run_two = self.grader.grade(WRONG_CONFIG, EXPECTED_CONFIG)
        self.assertEqual(run_one, run_two)

    def test_anti_hacking_detects_stage_skipping(self):
        """Appending `if: false` should trigger a negative skip penalty."""
        doctored = EXPECTED_CONFIG + "\n if: false\n"
        self.assertLess(self.detector.penalty_stage_skipping(doctored), 0.0)

    def test_anti_hacking_detects_fake_success(self):
        """Replacing the real test command with an echo must be penalized."""
        doctored = EXPECTED_CONFIG.replace("npm test", 'echo "tests passed" && exit 0')
        self.assertLess(self.detector.penalty_fake_success(doctored), 0.0)

    def test_anti_hacking_detects_breaking_pipeline(self):
        """Dropping the test stage from the pipeline must be penalized."""
        before = """
stages:
  - build
  - test
jobs:
  build:
    run: npm ci
  test:
    run: npm test
"""
        after = """
stages:
  - build
jobs:
  build:
    run: npm ci
"""
        self.assertLess(self.detector.penalty_breaking_pipeline(before, after), 0.0)

    def test_anti_hacking_detects_excessive_edits(self):
        """Large diffs (many files / many lines) should incur a penalty."""
        penalty = self.detector.penalty_excessive_edits(
            changed_files_count=12, changed_lines_count=400
        )
        self.assertLess(penalty, 0.0)

    def test_anti_hacking_detects_timeout_abuse(self):
        """Excessive step counts should incur a penalty."""
        self.assertLess(self.detector.penalty_timeout_abuse(step_count=25), 0.0)

    def test_hidden_tests_returns_high_pass_rate_for_good_fix(self):
        """A correct fix should pass most hidden tests."""
        rate = self.hidden_runner.evaluate_fix(
            fixed_config=EXPECTED_CONFIG,
            expected_config=EXPECTED_CONFIG,
        )
        self.assertGreaterEqual(rate, 0.8)

    def test_hidden_tests_returns_lower_pass_rate_for_bad_fix(self):
        """A wrong fix should fail a meaningful share of hidden tests."""
        rate = self.hidden_runner.evaluate_fix(
            fixed_config=WRONG_CONFIG,
            expected_config=EXPECTED_CONFIG,
        )
        self.assertLess(rate, 0.8)

    def test_reward_calculator_positive_for_good_behavior(self):
        """A clean, minimal, validated fix should earn a clearly positive reward."""
        env_state = dict(
            step_count=3,
            previous_config=WRONG_CONFIG,
            expected_config=EXPECTED_CONFIG,
            original_config=WRONG_CONFIG,
            error="command not found",
        )
        step_result = dict(
            logs_analyzed=True,
            error_diagnosed=True,
            fix_proposed=True,
            pipeline_run=True,
            tests_passed=True,
            command_succeeded=True,
            current_config=EXPECTED_CONFIG,
            expected_config=EXPECTED_CONFIG,
            changed_files_count=1,
            changed_lines_count=1,
        )
        reward = self.reward_calculator.calculate_step_reward(
            state=env_state,
            action="validate_fix",
            result=step_result,
            original_config=WRONG_CONFIG,
            fixed_config=EXPECTED_CONFIG,
            error_message="command not found",
            expected_config=EXPECTED_CONFIG,
            metadata={"broken_token": "npm tset", "fixed_token": "npm test"},
        )
        self.assertGreater(reward, 0.5)

    def test_reward_calculator_penalizes_hacking(self):
        """A fake-success + stage-skipping + huge-diff edit caps the reward low."""
        doctored = (
            EXPECTED_CONFIG.replace("npm test", 'echo "tests passed" && exit 0')
            + "\n if: false\n"
        )
        env_state = dict(
            step_count=25,
            previous_config=EXPECTED_CONFIG,
            changed_files_count=15,
            changed_lines_count=500,
        )
        step_result = dict(
            current_config=doctored,
            expected_config=EXPECTED_CONFIG,
            hacking_attempt=True,
            changed_files_count=15,
            changed_lines_count=500,
        )
        reward = self.reward_calculator.calculate_step_reward(
            state=env_state,
            action="edit_config",
            result=step_result,
            original_config=EXPECTED_CONFIG,
            fixed_config=doctored,
            error_message="",
            expected_config=EXPECTED_CONFIG,
        )
        self.assertGreaterEqual(reward, 0.0)
        self.assertLessEqual(reward, 0.3)
| if __name__ == "__main__": | |
| unittest.main() | |