File size: 3,680 Bytes
96b50a5
 
 
 
 
 
 
 
 
378972b
9d95848
 
378972b
267d60a
 
 
 
 
378972b
 
c0a3e8d
 
 
 
 
 
9d95848
378972b
267d60a
 
378972b
 
9d95848
 
267d60a
 
 
 
2be5c6e
 
 
 
 
 
 
267d60a
 
 
 
9d95848
 
e3f224d
 
 
 
 
267d60a
 
 
 
 
 
7f2d9e7
 
 
 
 
9d95848
 
 
2be5c6e
267d60a
e3f224d
267d60a
7f2d9e7
9d95848
 
378972b
9d95848
 
 
e3f224d
 
267d60a
 
 
 
2be5c6e
 
 
 
7f2d9e7
2be5c6e
 
7f2d9e7
 
 
 
c0a3e8d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
from __future__ import annotations

import sys
from pathlib import Path

# ROOT is the directory one level above the directory containing this file.
# It is put at the front of sys.path (only if absent), presumably so that the
# `verifier` package imported right after this resolves when the script is
# run directly -- confirm against the repository layout.
ROOT = Path(__file__).resolve().parents[1]
_root_entry = str(ROOT)
if sys.path.count(_root_entry) == 0:
    sys.path.insert(0, _root_entry)

from verifier.verifier import verify


# Hidden test cases for the "sum of the even numbers" task.
# Each stdin payload is "<count>\n<space-separated integers>\n" and the
# expected stdout is the sum of the even values, as a string.
test_cases = [
    {"input": stdin_text, "output": expected_stdout, "is_visible": False}
    for stdin_text, expected_stdout in (
        ("5\n2 3 4 5 6\n", "12"),           # mixed parity
        ("4\n1 3 5 7\n", "0"),              # no even values at all
        ("6\n-2 -3 -4 -5 -6 -7\n", "-12"),  # negative values
        ("3\n0 10 11\n", "10"),             # zero is among the inputs
        ("5\n8 8 8 8 8\n", "40"),           # repeated even value
    )
]

# Single-integer inputs of growing size used by the efficiency comparisons.
# Every expected output equals 0 + 1 + ... + (n - 1) for the given n.
empirical_test_cases = [
    {"input": stdin_text, "output": expected_stdout, "is_visible": False}
    for stdin_text, expected_stdout in (
        ("10\n", "45"),
        ("200\n", "19900"),
        ("1000\n", "499500"),
    )
]

# Candidate submissions fed to the verifier. Each snippet is a complete
# program that reads from stdin; every string starts and ends with a newline.

# Reference solution: sums only the even numbers, using a generator.
correct_code = (
    "\n"
    "n = int(input())\n"
    "nums = list(map(int, input().split()))\n"
    "print(sum(x for x in nums if x % 2 == 0))\n"
)

# Wrong answer: sums every number instead of only the even ones.
wrong_code = (
    "\n"
    "n = int(input())\n"
    "nums = list(map(int, input().split()))\n"
    "print(sum(nums))\n"
)

# Same result as the reference, but builds an intermediate list first.
less_optimized_code = (
    "\n"
    "n = int(input())\n"
    "nums = list(map(int, input().split()))\n"
    "evens = [x for x in nums if x % 2 == 0]\n"
    "print(sum(evens))\n"
)

# Consumes both input lines and prints only an empty line.
invalid_output_code = (
    "\n"
    "n = int(input())\n"
    "input()\n"
    "print()\n"
)

# Never terminates.
timeout_code = (
    "\n"
    "while True:\n"
    "    pass\n"
)

# Indexes nums[n], which is out of range when the second line holds
# exactly n numbers.
runtime_error_code = (
    "\n"
    "n = int(input())\n"
    "nums = list(map(int, input().split()))\n"
    "print(nums[n])\n"
)

# Imports os and lists the current working directory.
safety_violation_code = (
    "\n"
    "import os\n"
    'print(os.listdir("."))\n'
)

# Run every candidate submission through the verifier against the hidden
# test cases and print a short report for each one.
labelled_submissions = (
    ("correct", correct_code),
    ("wrong", wrong_code),
    ("less_optimized", less_optimized_code),
    ("invalid_output", invalid_output_code),
    ("timeout", timeout_code),
    ("runtime_error", runtime_error_code),
    ("safety_violation", safety_violation_code),
)

for name, code in labelled_submissions:
    reward, info = verify(code, test_cases)

    print("\nCASE:", name)
    print("Reward:", reward)
    print("Pass rate:", info["pass_rate"])
    print("Passed:", info["passed"], "/", info["total"])
    # Per-failure-type counters and overall status, in fixed report order.
    for label, key in (
        ("Timeouts:", "timeout_count"),
        ("Runtime errors:", "runtime_error_count"),
        ("Invalid output:", "invalid_output_count"),
        ("Wrong answers:", "wrong_answer_count"),
        ("Status:", "execution_status"),
    ):
        print(label, info[key])
    # Read with .get(), so a missing efficiency score prints as None.
    print("Efficiency:", info.get("efficiency_score"))

# Verify three submissions again and pin the scoring contract:
#   * the generator-based solution must score strictly higher on efficiency
#     than the variant that builds an intermediate list,
#   * the list-building variant must be reported with at least one list
#     comprehension in its complexity signals,
#   * the reference solution must get full marks on the hidden-correctness
#     and anti-cheat components,
#   * the snippet that imports os must earn zero reward and be reported with
#     the "safety_violation" status.
# Every assertion carries a message with the observed values so a failure
# identifies which contract broke.
reward_optimal, info_optimal = verify(correct_code, test_cases)
# reward_less_optimal is not asserted on; only the info dict is inspected.
reward_less_optimal, info_less_optimal = verify(less_optimized_code, test_cases)
reward_safety, info_safety = verify(safety_violation_code, test_cases)

assert info_optimal["efficiency_score"] > info_less_optimal["efficiency_score"], (
    "Efficiency: generator solution should score above the list-building one "
    f"(got {info_optimal['efficiency_score']!r} vs "
    f"{info_less_optimal['efficiency_score']!r})"
)
assert info_less_optimal["complexity_signals"]["list_comprehensions"] > 0, (
    "Complexity signals: list-building solution should report a list comprehension "
    f"(got {info_less_optimal['complexity_signals']['list_comprehensions']!r})"
)
assert info_optimal["verifier_components"]["hidden_correctness"] == 1.0, (
    "Components: correct solution should have hidden_correctness == 1.0 "
    f"(got {info_optimal['verifier_components']['hidden_correctness']!r})"
)
assert info_optimal["verifier_components"]["anti_cheat_compliance"] == 1.0, (
    "Components: correct solution should have anti_cheat_compliance == 1.0 "
    f"(got {info_optimal['verifier_components']['anti_cheat_compliance']!r})"
)
assert reward_safety == 0.0, (
    f"Safety: violating submission should earn zero reward (got {reward_safety!r})"
)
assert info_safety["execution_status"] == "safety_violation", (
    "Safety: violating submission should be reported as 'safety_violation' "
    f"(got {info_safety['execution_status']!r})"
)

# Efficiency comparison: one pass over range(n) versus a nested n-by-n
# iteration, both run on the growing inputs in empirical_test_cases.
# NOTE(review): quadratic_code prints (0 + 1 + ... + (n - 1)) ** 2 (2025 for
# n = 10), which differs from the expected outputs in empirical_test_cases
# ("45" for n = 10). Confirm the efficiency score is meant to be compared for
# a submission that fails correctness.
linear_code = "n = int(input()); print(sum(range(n)))"
quadratic_code = (
    "n = int(input()); "
    "print(sum(i*j for i in range(n) for j in range(n)))"
)

_, linear_result = verify(linear_code, empirical_test_cases)
_, quadratic_result = verify(quadratic_code, empirical_test_cases)

linear_score = linear_result["efficiency_score"]
quadratic_score = quadratic_result["efficiency_score"]
assert linear_score >= quadratic_score, (
    "Empirical complexity: O(n) should score >= O(n^2)"
)

# Efficiency comparison: a snippet that allocates nothing extra versus one
# that builds a list of n * 10000 zeros, both run on empirical_test_cases.
# NOTE(review): small_alloc prints n and large_alloc prints n * 10000, so
# neither matches the expected outputs in empirical_test_cases. Confirm the
# efficiency score is meant to be compared for wrong-answer submissions.
small_alloc = "n = int(input()); print(n)"
large_alloc = (
    "n = int(input()); "
    "x = [0] * (n * 10000); "
    "print(len(x))"
)

_, small_result = verify(small_alloc, empirical_test_cases)
_, large_result = verify(large_alloc, empirical_test_cases)

small_score = small_result["efficiency_score"]
large_score = large_result["efficiency_score"]
assert small_score >= large_score, (
    "Empirical complexity: small allocation should score >= large allocation"
)