from __future__ import annotations
import sys
from pathlib import Path
# Directory two levels above this file (the parent of the folder holding it).
ROOT = Path(__file__).resolve().parents[1]
# Prepend ROOT to the import search path unless it is already listed;
# presumably this is what lets the `verifier` import that follows resolve
# when the script is launched directly.
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))
from verifier.verifier import verify
# Hidden cases for the "sum of the even numbers" task. Each stdin holds a
# count line followed by the numbers; the expected stdout is the sum of the
# even values.
test_cases = [
    {"input": stdin, "output": expected, "is_visible": False}
    for stdin, expected in (
        ("5\n2 3 4 5 6\n", "12"),
        ("4\n1 3 5 7\n", "0"),
        ("6\n-2 -3 -4 -5 -6 -7\n", "-12"),
        ("3\n0 10 11\n", "10"),
        ("5\n8 8 8 8 8\n", "40"),
    )
]

# Hidden cases with a single integer n on stdin and growing n; each expected
# stdout equals sum(range(n)).
empirical_test_cases = [
    {"input": stdin, "output": expected, "is_visible": False}
    for stdin, expected in (
        ("10\n", "45"),
        ("200\n", "19900"),
        ("1000\n", "499500"),
    )
]
# Sample submissions, stored as program text. Each literal below is one line
# of the submitted program; every program starts with an empty first line.

# Sums only the even values using a generator expression.
correct_code = (
    "\n"
    "n = int(input())\n"
    "nums = list(map(int, input().split()))\n"
    "print(sum(x for x in nums if x % 2 == 0))\n"
)

# Sums every value, even or odd.
wrong_code = (
    "\n"
    "n = int(input())\n"
    "nums = list(map(int, input().split()))\n"
    "print(sum(nums))\n"
)

# Same result as correct_code, but materialises the even values in a list
# before summing them.
less_optimized_code = (
    "\n"
    "n = int(input())\n"
    "nums = list(map(int, input().split()))\n"
    "evens = [x for x in nums if x % 2 == 0]\n"
    "print(sum(evens))\n"
)

# Consumes both input lines and prints only an empty line.
invalid_output_code = (
    "\n"
    "n = int(input())\n"
    "input()\n"
    "print()\n"
)
# Submission that loops forever. The loop body must be indented inside the
# program text: without the indent the submission fails to compile with an
# IndentationError and never actually runs long.
timeout_code = """
while True:
    pass
"""
# Indexes one past the end of the list whenever exactly n numbers were read,
# which raises IndexError.
runtime_error_code = (
    "\n"
    "n = int(input())\n"
    "nums = list(map(int, input().split()))\n"
    "print(nums[n])\n"
)

# Imports os and lists the working directory instead of solving the task.
safety_violation_code = (
    "\n"
    "import os\n"
    'print(os.listdir("."))\n'
)
# Run every sample submission against the hidden test cases and print the
# verifier's reward and diagnostic counters, one report per submission.
for name, code in [
    ("correct", correct_code),
    ("wrong", wrong_code),
    ("less_optimized", less_optimized_code),
    ("invalid_output", invalid_output_code),
    ("timeout", timeout_code),
    ("runtime_error", runtime_error_code),
    ("safety_violation", safety_violation_code),
]:
    reward, info = verify(code, test_cases)
    print("\nCASE:", name)
    print("Reward:", reward)
    print("Pass rate:", info["pass_rate"])
    print("Passed:", info["passed"], "/", info["total"])
    print("Timeouts:", info["timeout_count"])
    print("Runtime errors:", info["runtime_error_count"])
    print("Invalid output:", info["invalid_output_count"])
    print("Wrong answers:", info["wrong_answer_count"])
    print("Status:", info["execution_status"])
    # Read with .get() so a result without this key prints None instead of
    # raising KeyError.
    print("Efficiency:", info.get("efficiency_score"))
# Verify the three submissions whose results are compared below.
reward_optimal, info_optimal = verify(correct_code, test_cases)
reward_less_optimal, info_less_optimal = verify(less_optimized_code, test_cases)
reward_safety, info_safety = verify(safety_violation_code, test_cases)

# The generator-based solution must score strictly higher on efficiency than
# the variant that builds an intermediate list.
optimal_efficiency = info_optimal["efficiency_score"]
less_optimal_efficiency = info_less_optimal["efficiency_score"]
assert optimal_efficiency > less_optimal_efficiency

# The list-building variant must register at least one list comprehension.
less_optimal_signals = info_less_optimal["complexity_signals"]
assert less_optimal_signals["list_comprehensions"] > 0

# The reference solution gets full marks on both of these components.
optimal_components = info_optimal["verifier_components"]
assert optimal_components["hidden_correctness"] == 1.0
assert optimal_components["anti_cheat_compliance"] == 1.0

# The submission that imports os earns no reward and is flagged as such.
assert reward_safety == 0.0
assert info_safety["execution_status"] == "safety_violation"
# Compare a single-pass program with a nested-loop one on inputs of growing n.
# NOTE(review): quadratic_code prints sum(i*j), which differs from the
# expected outputs in empirical_test_cases (e.g. 2025 vs 45 for n=10);
# confirm efficiency_score is still meaningful for a failing submission.
linear_code = "n = int(input()); print(sum(range(n)))"
quadratic_code = "n = int(input()); print(sum(i*j for i in range(n) for j in range(n)))"

_, linear_result = verify(linear_code, empirical_test_cases)
_, quadratic_result = verify(quadratic_code, empirical_test_cases)

linear_score = linear_result["efficiency_score"]
quadratic_score = quadratic_result["efficiency_score"]
assert linear_score >= quadratic_score, "Empirical complexity: O(n) should score >= O(n^2)"
# Compare a program that allocates nothing extra with one that builds a list
# of n * 10000 elements.
# NOTE(review): neither program prints the expected outputs listed in
# empirical_test_cases (those equal sum(range(n))); confirm efficiency_score
# is still meaningful for failing submissions.
small_alloc = "n = int(input()); print(n)"
large_alloc = "n = int(input()); x = [0] * (n * 10000); print(len(x))"

_, small_result = verify(small_alloc, empirical_test_cases)
_, large_result = verify(large_alloc, empirical_test_cases)

small_score = small_result["efficiency_score"]
large_score = large_result["efficiency_score"]
assert small_score >= large_score, "Empirical complexity: small allocation should score >= large allocation"
|