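# NOTE: the text "Spaces: Sleeping Sleeping" appeared here; it is web-page status residue captured with the file, not part of the script.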
import os
import sys

# Put the parent of this file's directory at the front of sys.path. This has
# to happen before the project imports below (presumably `env` and `tasks`
# live in that parent directory - confirm against the repo layout).
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from env import DebugEnv  # noqa: E402 - must follow the sys.path change
from tasks.registry import list_tasks  # noqa: E402

# One environment instance shared by every test section in this script.
env = DebugEnv()
# -----------------------------------------
# HELPERS
# -----------------------------------------
def print_state(state) -> None:
    """Print a short, human-readable summary of an environment state.

    Args:
        state: Object exposing ``step_count``, ``tests_passed``,
            ``tests_total``, ``done`` and ``last_test_output``.

    The last test output is stripped of surrounding whitespace before being
    printed. A missing (``None``) output is printed as an empty string rather
    than raising ``AttributeError``.
    """
    print(f" step : {state.step_count}")
    print(f" tests passed: {state.tests_passed} / {state.tests_total}")
    print(f" done : {state.done}")
    # `or ""` guards against a state that has no test output yet.
    print(f" last output :\n{(state.last_test_output or '').strip()}")
    print()
def section(title) -> None:
    """Print ``title`` framed by two 50-character ``=`` rules.

    A blank line is emitted before the top rule so consecutive sections are
    visually separated in the console output.

    Args:
        title: Text shown between the rules.
    """
    rule = "=" * 50
    print("\n" + rule)
    print(f" {title}")
    print(rule)
# -----------------------------------------
# TEST 1 - list tasks
# -----------------------------------------
section("TEST 1: list_tasks()")
# Print every task returned by the registry, then check how many there are.
tasks = list_tasks()
for t in tasks:
    print(f" [{t['difficulty']}] {t['task_id']} - {t['description']}")
assert len(tasks) == 3, f"Should have 3 tasks, got {len(tasks)}"
print("\n PASSED")
# -----------------------------------------
# TEST 2 - reset loads buggy code
# -----------------------------------------
section("TEST 2: reset() loads buggy state")
# A fresh "easy" episode must expose code, 4 tests, and a zeroed step counter.
state = env.reset("easy")
assert state.code is not None
assert state.tests_total == 4, f"Expected 4 tests for easy task, got {state.tests_total}"
assert state.step_count == 0
assert not state.done
# Only the first 40 characters of the code are shown to keep the log short.
print(f" code loaded : {state.code[:40]!r}...")
print(f" tests_total : {state.tests_total}")
print("\n PASSED")
# -----------------------------------------
# TEST 3 - run tests on buggy code
# -----------------------------------------
section("TEST 3: run_tests on buggy code - should fail")
# Continues the episode started by the preceding reset("easy").
state, reward, done = env.step({"type": "run_tests"})
print_state(state)
assert state.tests_passed < state.tests_total, "Buggy code should not pass all tests"
assert reward < 0, f"Reward should be negative, got {reward}"
print(f" reward: {reward}")
print("\n PASSED")
# -----------------------------------------
# TEST 4 - edit_function then run tests
# -----------------------------------------
section("TEST 4: edit_function apply_discount fix - run tests")
# Replacement source for apply_discount. The body lines must be indented,
# otherwise the string is not valid Python once handed to the environment.
fixed_apply_discount = """\
def apply_discount(price, percent):
    discount = price * percent / 100
    return round(discount, 2)
"""
state, reward, done = env.step({
    "type": "edit_function",
    "function_name": "apply_discount",
    "new_code": fixed_apply_discount,
})
print(f" after edit apply_discount - tests_passed: {state.tests_passed}")
state, reward, done = env.step({"type": "run_tests"})
print_state(state)
print(f" reward: {reward}")
# This section only reports the intermediate result; it makes no assertions.
print("\n PASSED")
# -----------------------------------------
# TEST 5 - fix second function, all tests pass
# -----------------------------------------
section("TEST 5: edit_function compute_final fix - all tests pass")
# Replacement source for compute_final. The body lines must be indented,
# otherwise the string is not valid Python once handed to the environment.
fixed_compute_final = """\
def compute_final(price, percent):
    discount = apply_discount(price, percent)
    return round(price - discount, 2)
"""
state, reward, done = env.step({
    "type": "edit_function",
    "function_name": "compute_final",
    "new_code": fixed_compute_final,
})
print(f" after edit compute_final - tests_passed: {state.tests_passed}")
state, reward, done = env.step({"type": "run_tests"})
print_state(state)
assert state.tests_passed == state.tests_total, "Fixed code should pass all tests"
assert done, "Episode should be done after all tests pass"
print(f" reward: {reward}")
print("\n PASSED")
# -----------------------------------------
# TEST 6 - state() returns current state
# -----------------------------------------
section("TEST 6: state() returns correct snapshot")
# Relies on the episode having just been completed by the preceding section.
snapshot = env.state()
assert snapshot.done
assert snapshot.tests_passed == 4, f"Expected 4 tests passed, got {snapshot.tests_passed}"
print(f" snapshot done : {snapshot.done}")
print(f" snapshot tests_passed: {snapshot.tests_passed}")
print("\n PASSED")
# -----------------------------------------
# TEST 7 - step after done raises error
# -----------------------------------------
section("TEST 7: step() after done raises RuntimeError")
# Relies on the episode still being finished from the preceding sections.
try:
    env.step({"type": "run_tests"})
except RuntimeError as e:
    print(f" caught expected error: {e}")
else:
    # Raised explicitly (not `assert False`) so the check survives `python -O`
    # and the failure path sits outside the guarded call.
    raise AssertionError("Should have raised RuntimeError")
print("\n PASSED")
# -----------------------------------------
# TEST 8 - medium task loads and runs
# -----------------------------------------
section("TEST 8: medium task - reset and run buggy tests")
state = env.reset("medium")
assert state.tests_total == 5, f"Expected 5 tests for medium task, got {state.tests_total}"
# The unmodified task code is expected to fail at least one test.
state, reward, done = env.step({"type": "run_tests"})
print_state(state)
assert state.tests_passed < state.tests_total
print(f" reward: {reward}")
print("\n PASSED")
# -----------------------------------------
# TEST 9 - hard task loads and runs
# -----------------------------------------
section("TEST 9: hard task - reset and run buggy tests")
state = env.reset("hard")
assert state.tests_total == 10, f"Expected 10 tests for hard task, got {state.tests_total}"
# The unmodified task code is expected to fail at least one test.
state, reward, done = env.step({"type": "run_tests"})
print_state(state)
assert state.tests_passed < state.tests_total
print(f" reward: {reward}")
print("\n PASSED")
# -----------------------------------------
# TEST 10 - step limit ends episode
# -----------------------------------------
section("TEST 10: step limit - episode ends at MAX_STEPS")
state = env.reset("easy")
done = False
steps = 0
# Safety net: if the environment never reports `done`, fail instead of
# looping forever. The cap is assumed to be far above the real step limit -
# confirm against the environment's MAX_STEPS.
step_cap = 10_000
while not done:
    assert steps < step_cap, f"Episode did not end within {step_cap} steps"
    # Running the tests without ever fixing the code keeps the episode
    # unsolved, so only a step limit can end it.
    state, reward, done = env.step({"type": "run_tests"})
    steps += 1
assert done
print(f" episode ended after {steps} steps")
print(f" tests_passed: {state.tests_passed} / {state.tests_total}")
print("\n PASSED")
# -----------------------------------------
# TEST 11 - unknown action raises error
# -----------------------------------------
section("TEST 11: unknown action type raises ValueError")
env.reset("easy")
try:
    env.step({"type": "fly_to_moon"})
except ValueError as e:
    print(f" caught expected error: {e}")
else:
    # Raised explicitly (not `assert False`) so the check survives `python -O`
    # and the failure path sits outside the guarded call.
    raise AssertionError("Should have raised ValueError")
print("\n PASSED")
# -----------------------------------------
# DONE
# -----------------------------------------
# Reached only if no assertion above failed.
section("ALL TESTS PASSED")