# meta-rl-dsa-solver/scripts/test_verifier.py
# Dishaaa25: "Smoke-train deployment update" (c0a3e8d)
from __future__ import annotations
import sys
from pathlib import Path
# Directory two levels above this file (for scripts/test_verifier.py that is
# the repository root). It is put at the front of sys.path so the project
# import that follows resolves when this script is run directly.
ROOT = Path(__file__).resolve().parents[1]
if str(ROOT) not in sys.path:
    # Guarded so that re-running or re-importing does not add duplicates.
    sys.path.insert(0, str(ROOT))
from verifier.verifier import verify
# (stdin, expected stdout) pairs for the "sum of the even numbers" task:
# the first line is the element count, the second line the integers.
_EVEN_SUM_SAMPLES = (
    ("5\n2 3 4 5 6\n", "12"),
    ("4\n1 3 5 7\n", "0"),  # no even values at all
    ("6\n-2 -3 -4 -5 -6 -7\n", "-12"),  # negative values
    ("3\n0 10 11\n", "10"),  # zero counts as even
    ("5\n8 8 8 8 8\n", "40"),  # every value is even
)

# Test cases in the dict form passed to verify(); all are marked not visible.
test_cases = [
    {"input": stdin_text, "output": expected, "is_visible": False}
    for stdin_text, expected in _EVEN_SUM_SAMPLES
]
# Growing input sizes for the empirical-complexity checks. Each expected
# answer equals sum(range(n)) for the matching size n.
_EMPIRICAL_SIZES = ("10", "200", "1000")
_EMPIRICAL_ANSWERS = ("45", "19900", "499500")

# Single-line inputs (just n); all cases are marked not visible.
empirical_test_cases = [
    dict(input=size + "\n", output=answer, is_visible=False)
    for size, answer in zip(_EMPIRICAL_SIZES, _EMPIRICAL_ANSWERS)
]
# Candidate programs for the "sum of the even numbers" task. Each is a
# complete stdin -> stdout script held as a string; every snippet begins and
# ends with a newline.

# Shared preamble: read the count line, then the line of integers.
_READ_NUMBERS = "\nn = int(input())\nnums = list(map(int, input().split()))\n"

# Reference solution: sums the even values with a generator expression.
correct_code = _READ_NUMBERS + "print(sum(x for x in nums if x % 2 == 0))\n"

# Wrong answer: sums every value, so it is only right when all inputs are even.
wrong_code = _READ_NUMBERS + "print(sum(nums))\n"

# Same answer as correct_code, but builds a list of the even values first.
less_optimized_code = (
    _READ_NUMBERS
    + "evens = [x for x in nums if x % 2 == 0]\n"
    + "print(sum(evens))\n"
)

# Consumes both input lines, then prints an empty line instead of a number.
invalid_output_code = "\nn = int(input())\ninput()\nprint()\n"
# Program that never terminates; it is run under the "timeout" case of the
# smoke report. The loop body must be indented inside the string: without the
# indentation the snippet is an IndentationError and would fail to compile
# instead of spinning forever.
timeout_code = """
while True:
    pass
"""
# Reads the input like a real solution, then indexes nums[n]. When the second
# line holds exactly n values that index is one past the end of the list.
runtime_error_code = "\n".join(
    [
        "",
        "n = int(input())",
        "nums = list(map(int, input().split()))",
        "print(nums[n])",
        "",
    ]
)

# Imports os and lists the working directory. The checks later in this script
# expect the verifier to report "safety_violation" with a reward of 0.0 for it.
safety_violation_code = "\n".join(
    [
        "",
        "import os",
        'print(os.listdir("."))',
        "",
    ]
)
# Smoke report: run every candidate program against test_cases and print the
# reward together with the diagnostic fields of the info dict that verify()
# returns alongside it.
for name, code in [
    ("correct", correct_code),
    ("wrong", wrong_code),
    ("less_optimized", less_optimized_code),
    ("invalid_output", invalid_output_code),
    ("timeout", timeout_code),
    ("runtime_error", runtime_error_code),
    ("safety_violation", safety_violation_code),
]:
    reward, info = verify(code, test_cases)
    print("\nCASE:", name)
    print("Reward:", reward)
    print("Pass rate:", info["pass_rate"])
    print("Passed:", info["passed"], "/", info["total"])
    print("Timeouts:", info["timeout_count"])
    print("Runtime errors:", info["runtime_error_count"])
    print("Invalid output:", info["invalid_output_count"])
    print("Wrong answers:", info["wrong_answer_count"])
    print("Status:", info["execution_status"])
    # .get() rather than indexing: prints None if the score is absent instead
    # of raising KeyError.
    print("Efficiency:", info.get("efficiency_score"))
# Re-run the three programs whose results are asserted on below.
reward_optimal, info_optimal = verify(correct_code, test_cases)
reward_less_optimal, info_less_optimal = verify(less_optimized_code, test_cases)
reward_safety, info_safety = verify(safety_violation_code, test_cases)

# Each assertion carries a message so a failure reports the offending value,
# in line with the empirical-complexity assertions further down the script.
assert info_optimal["efficiency_score"] > info_less_optimal["efficiency_score"], (
    "correct_code should score strictly higher on efficiency than "
    "less_optimized_code: "
    f"{info_optimal['efficiency_score']!r} vs "
    f"{info_less_optimal['efficiency_score']!r}"
)
assert info_less_optimal["complexity_signals"]["list_comprehensions"] > 0, (
    "less_optimized_code contains a list comprehension that should be counted: "
    f"{info_less_optimal['complexity_signals']!r}"
)
assert info_optimal["verifier_components"]["hidden_correctness"] == 1.0, (
    "correct_code should get full hidden_correctness: "
    f"{info_optimal['verifier_components']!r}"
)
assert info_optimal["verifier_components"]["anti_cheat_compliance"] == 1.0, (
    "correct_code should get full anti_cheat_compliance: "
    f"{info_optimal['verifier_components']!r}"
)
assert reward_safety == 0.0, (
    f"safety_violation_code should receive zero reward, got {reward_safety!r}"
)
assert info_safety["execution_status"] == "safety_violation", (
    "safety_violation_code should be reported as a safety violation, got "
    f"{info_safety['execution_status']!r}"
)
# Empirical-complexity check: a single pass over range(n) versus a nested
# double loop, both run on the growing inputs in empirical_test_cases.
# NOTE(review): the expected outputs equal sum(range(n)), so only linear_code
# prints matching answers; this check compares efficiency_score only.
linear_code = "n = int(input()); print(sum(range(n)))"
quadratic_code = "n = int(input()); print(sum(i*j for i in range(n) for j in range(n)))"

_, linear_result = verify(linear_code, empirical_test_cases)
_, quadratic_result = verify(quadratic_code, empirical_test_cases)

linear_score = linear_result["efficiency_score"]
quadratic_score = quadratic_result["efficiency_score"]
# Non-strict comparison: a tie between the two scores is accepted.
assert linear_score >= quadratic_score, "Empirical complexity: O(n) should score >= O(n^2)"
# Allocation check: echoing n versus building a list of n * 10000 zeros,
# both run on the growing inputs in empirical_test_cases.
# NOTE(review): neither program prints the expected outputs of those cases;
# this check compares efficiency_score only.
small_alloc = "n = int(input()); print(n)"
large_alloc = "n = int(input()); x = [0] * (n * 10000); print(len(x))"

_, small_result = verify(small_alloc, empirical_test_cases)
_, large_result = verify(large_alloc, empirical_test_cases)

small_score = small_result["efficiency_score"]
large_score = large_result["efficiency_score"]
# Non-strict comparison: a tie between the two scores is accepted.
assert small_score >= large_score, "Empirical complexity: small allocation should score >= large allocation"