# mvm2-math-verification / scripts / benchmark_competitive.py
# Robust MVM2 system sync: fixed imports and restored services.
import time
import asyncio
import sys
import os
# Ensure the root of the project is in the path
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
from core.verification_engine import run_verification_parallel
# Mock GATE/JEE problems used as the benchmark corpus. Each entry carries the
# exam label, the question text, the worked solution steps (fed to the
# verification engine), and the expected final answer string.
COMPETITIVE_EXAM_MOCK = [
    {
        "exam": "GATE (CS) - Linear Algebra",
        "question": "Let M be a 2x2 matrix such that M = [[4, 1], [2, 3]]. Find the sum of the eigenvalues of M.",
        "steps": [
            "The sum of the eigenvalues of a matrix is equal to its trace.",
            "Trace(M) = 4 + 3",
            "Trace(M) = 7",
            "Therefore, the sum of the eigenvalues is 7."
        ],
        "answer": "7"
    },
    {
        "exam": "JEE Advanced - Calculus",
        "question": "Evaluate the definite integral of x * e^x from x=0 to x=1.",
        "steps": [
            # Raw strings keep the LaTeX backslashes intact.
            r"Use integration by parts: \int u dv = uv - \int v du",
            r"Let u = x, so du = dx. Let dv = e^x dx, so v = e^x.",
            r"\int_0^1 x e^x dx = [x e^x]_0^1 - \int_0^1 e^x dx",
            "= (1 * e^1 - 0 * e^0) - [e^x]_0^1",
            "= e - (e^1 - e^0)",
            "= e - e + 1 = 1",
            "The final evaluated definite integral is 1."
        ],
        "answer": "1"
    },
    {
        "exam": "GATE (EC) - Probability",
        "question": "A box contains 4 red balls and 6 black balls. Three balls are drawn at random without replacement. What is the probability that exactly two are red?",
        "steps": [
            "Total ways to draw 3 balls from 10 is C(10,3).",
            "C(10,3) = (10*9*8)/(3*2*1) = 120",
            "Ways to draw exactly 2 red balls from 4 is C(4,2) = 6.",
            "Ways to draw 1 black ball from 6 is C(6,1) = 6.",
            "Total favorable ways = 6 * 6 = 36.",
            "Probability = 36 / 120 = 3 / 10 = 0.3."
        ],
        "answer": "0.3"
    },
    {
        "exam": "JEE Mains - Kinematics Paradox",
        "question": "A particle moves such that its velocity v is given by v = t^2 - 4t + 3. Find the acceleration when velocity is zero.",
        "steps": [
            "Velocity v = t^2 - 4t + 3 = 0",
            "(t - 1)(t - 3) = 0",
            "So t = 1 or t = 3.",
            "Acceleration a = dv/dt = 2t - 4.",
            "At t = 1, a = 2(1) - 4 = -2.",
            "At t = 3, a = 2(3) - 4 = 2.",
            "The accelerations are -2 and 2."
        ],
        "answer": "2"  # or -2, testing logic handling branching paths
    },
    {
        "exam": "GATE (ME) - Differential Equations",
        "question": "Solve the initial value problem dy/dx = 2xy, y(0) = 1. Find y at x = 1.",
        "steps": [
            "dy/y = 2x dx",
            r"\int dy/y = \int 2x dx",
            "ln(y) = x^2 + C",
            "Use y(0) = 1: ln(1) = 0 + C => C = 0",
            "ln(y) = x^2 => y = e^(x^2)",
            "At x = 1, y = e^(1^2) = e."
        ],
        "answer": "e"
    }
]
def evaluate_competitive_problem(item: dict) -> tuple:
    """Run the MVM2 ensemble verifier on one mock exam problem.

    Args:
        item: A mock-problem dict with "exam", "question", "steps", and
            "answer" keys (see COMPETITIVE_EXAM_MOCK).

    Returns:
        A (is_correct, latency, confidence) tuple where ``is_correct`` is
        True only when the ensemble's final verdict is ``"VALID"``,
        ``latency`` is the reported processing time in seconds, and
        ``confidence`` is the overall consensus confidence in [0, 1].
    """
    problem = item["question"]
    steps = item["steps"]
    exam = item["exam"]
    print(f"\n[EVALUATING] {exam}")
    print(f"Problem: {problem[:80]}...")
    # We use all 4 models to simulate max rigorous verification for
    # competitive exams.
    active_models = ["GPT-4", "Claude 3.5 Sonnet", "Gemini 1.5 Pro", "Llama 3"]
    # Consume the streaming generator; only the terminal "final" payload
    # carries the consensus we report on.
    result = None
    for partial_res in run_verification_parallel(
        problem, steps, model_name="Ensemble", model_list=active_models
    ):
        if partial_res.get("type") == "final":
            result = partial_res
    # Fix: previously `result.get(...)` raised AttributeError when the stream
    # never yielded a "final" event. Fail soft with a flagged, zero-confidence
    # result instead of crashing the whole benchmark run.
    if result is None:
        print(" [ERROR] Verification stream produced no final result.")
        return False, 0.0, 0.0
    consensus = result.get("consensus", {})
    verdict = consensus.get("final_verdict", "ERROR")
    latency = result.get("processing_time", 0.0)
    confidence = consensus.get("overall_confidence", 0.0)
    is_correct = verdict == "VALID"
    if not is_correct:
        print(f" [ATTENTION] System flagged errors in logic: {result.get('classified_errors', [])}")
    return is_correct, latency, confidence
def run_competitive_benchmark():
    """Run every mock GATE/JEE problem through the verifier and print metrics.

    Iterates COMPETITIVE_EXAM_MOCK, collects per-problem correctness,
    latency, and confidence, then prints aggregate accuracy and averages.
    """
    banner = "=" * 60
    total = len(COMPETITIVE_EXAM_MOCK)
    verified = 0
    latencies = []
    confidences = []

    print(banner)
    print("🎓 MVM² ADVANCED COMPETITIVE EXAM BENCHMARK (GATE / JEE)")
    print(banner)
    print(f"Total problems queued: {total}")

    for entry in COMPETITIVE_EXAM_MOCK:
        ok, lat, conf = evaluate_competitive_problem(entry)
        if ok:
            verified += 1
        latencies.append(lat)
        confidences.append(conf)
        status = '✅ VERIFIED' if ok else '❌ FLAGGED'
        print(f" -> Result: {status} | Confidence: {conf*100:.1f}% | Latency: {lat:.3f}s")

    # Aggregate metrics; averages fall back to 0 on empty lists.
    accuracy = (verified / total) * 100
    avg_latency = sum(latencies) / len(latencies) if latencies else 0
    avg_conf = sum(confidences) / len(confidences) if confidences else 0

    print("\n" + banner)
    print("🏆 FINAL COMPETITIVE BENCHMARK METRICS")
    print(banner)
    print(f"Advanced Exam Accuracy: {accuracy:.1f}% (Expected > 85%)")
    print(f"Average Confidence: {avg_conf*100:.1f}%")
    print(f"Average Latency: {avg_latency:.3f}s")
    print(banner)
# Script entry point: reconfigure stdout for UTF-8 (the banners use emoji and
# the superscript in "MVM²"), then run the full benchmark suite.
if __name__ == "__main__":
    # Ensure UTF-8 output
    sys.stdout.reconfigure(encoding='utf-8')
    run_competitive_benchmark()