import time
import asyncio
import sys
import os
# Ensure the root of the project is in the path
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
from core.verification_engine import run_verification_parallel
# Mock dataset of competitive-exam problems (GATE / JEE) with worked solutions.
# Each record carries:
#   "exam"     - label identifying the exam and topic,
#   "question" - the problem statement,
#   "steps"    - ordered reasoning steps fed to the verification engine,
#   "answer"   - the expected final result, as a string.
COMPETITIVE_EXAM_MOCK: list[dict] = [
{
"exam": "GATE (CS) - Linear Algebra",
"question": "Let M be a 2x2 matrix such that M = [[4, 1], [2, 3]]. Find the sum of the eigenvalues of M.",
"steps": [
"The sum of the eigenvalues of a matrix is equal to its trace.",
"Trace(M) = 4 + 3",
"Trace(M) = 7",
"Therefore, the sum of the eigenvalues is 7."
],
"answer": "7"
},
{
"exam": "JEE Advanced - Calculus",
"question": "Evaluate the definite integral of x * e^x from x=0 to x=1.",
"steps": [
r"Use integration by parts: \int u dv = uv - \int v du",
r"Let u = x, so du = dx. Let dv = e^x dx, so v = e^x.",
r"\int_0^1 x e^x dx = [x e^x]_0^1 - \int_0^1 e^x dx",
"= (1 * e^1 - 0 * e^0) - [e^x]_0^1",
"= e - (e^1 - e^0)",
"= e - e + 1 = 1",
"The final evaluated definite integral is 1."
],
"answer": "1"
},
{
"exam": "GATE (EC) - Probability",
"question": "A box contains 4 red balls and 6 black balls. Three balls are drawn at random without replacement. What is the probability that exactly two are red?",
"steps": [
"Total ways to draw 3 balls from 10 is C(10,3).",
"C(10,3) = (10*9*8)/(3*2*1) = 120",
"Ways to draw exactly 2 red balls from 4 is C(4,2) = 6.",
"Ways to draw 1 black ball from 6 is C(6,1) = 6.",
"Total favorable ways = 6 * 6 = 36.",
"Probability = 36 / 120 = 3 / 10 = 0.3."
],
"answer": "0.3"
},
{
"exam": "JEE Mains - Kinematics Paradox",
"question": "A particle moves such that its velocity v is given by v = t^2 - 4t + 3. Find the acceleration when velocity is zero.",
"steps": [
"Velocity v = t^2 - 4t + 3 = 0",
"(t - 1)(t - 3) = 0",
"So t = 1 or t = 3.",
"Acceleration a = dv/dt = 2t - 4.",
"At t = 1, a = 2(1) - 4 = -2.",
"At t = 3, a = 2(3) - 4 = 2.",
"The accelerations are -2 and 2."
],
"answer": "2" # or -2, testing logic handling branching paths
},
{
"exam": "GATE (ME) - Differential Equations",
"question": "Solve the initial value problem dy/dx = 2xy, y(0) = 1. Find y at x = 1.",
"steps": [
"dy/y = 2x dx",
r"\int dy/y = \int 2x dx",
"ln(y) = x^2 + C",
"Use y(0) = 1: ln(1) = 0 + C => C = 0",
"ln(y) = x^2 => y = e^(x^2)",
"At x = 1, y = e^(1^2) = e."
],
"answer": "e"
}
]
def evaluate_competitive_problem(item: dict) -> tuple[bool, float, float]:
    """Run the verification engine on one mock exam problem.

    Args:
        item: Problem record with "exam", "question", "steps", "answer" keys.

    Returns:
        A tuple ``(is_correct, latency, confidence)`` where ``is_correct``
        is True when the engine's final verdict is "VALID", ``latency`` is
        the reported processing time in seconds, and ``confidence`` is the
        consensus confidence in [0, 1].
    """
    problem = item["question"]
    steps = item["steps"]
    exam = item["exam"]
    print(f"\n[EVALUATING] {exam}")
    print(f"Problem: {problem[:80]}...")
    # Consume the streaming generator; only the terminal "final" payload
    # carries the consensus we need.
    result = None
    # We use all 4 models to simulate max rigorous verification for competitive exams
    active_models = ["GPT-4", "Claude 3.5 Sonnet", "Gemini 1.5 Pro", "Llama 3"]
    for partial_res in run_verification_parallel(problem, steps, model_name="Ensemble", model_list=active_models):
        if partial_res["type"] == "final":
            result = partial_res
    # Robustness: if the engine never emitted a "final" event, the original
    # code crashed with AttributeError on result.get(...). Treat that case
    # as a failed verification instead.
    if result is None:
        print(" [ATTENTION] Verification engine produced no final result.")
        return False, 0.0, 0.0
    consensus = result.get("consensus", {})
    verdict = consensus.get("final_verdict", "ERROR")
    latency = result.get("processing_time", 0.0)
    confidence = consensus.get("overall_confidence", 0.0)
    is_correct = verdict == "VALID"
    if not is_correct:
        print(f" [ATTENTION] System flagged errors in logic: {result.get('classified_errors', [])}")
    return is_correct, latency, confidence
def run_competitive_benchmark() -> None:
    """Evaluate every mock GATE/JEE problem and print aggregate metrics.

    Prints per-problem verdicts followed by overall accuracy, average
    confidence, and average latency. Returns nothing; all output goes
    to stdout.
    """
    num_samples = len(COMPETITIVE_EXAM_MOCK)
    correct_count = 0
    latencies = []
    confidences = []
    print("=" * 60)
    print("🎓 MVM² ADVANCED COMPETITIVE EXAM BENCHMARK (GATE / JEE)")
    print("=" * 60)
    print(f"Total problems queued: {num_samples}")
    for item in COMPETITIVE_EXAM_MOCK:
        is_correct, lat, conf = evaluate_competitive_problem(item)
        if is_correct:
            correct_count += 1
        latencies.append(lat)
        confidences.append(conf)
        print(f" -> Result: {'✅ VERIFIED' if is_correct else '❌ FLAGGED'} | Confidence: {conf*100:.1f}% | Latency: {lat:.3f}s")
    # Robustness: guard the accuracy division like the averages below, so an
    # empty dataset cannot raise ZeroDivisionError.
    accuracy = (correct_count / num_samples) * 100 if num_samples else 0.0
    avg_latency = sum(latencies) / len(latencies) if latencies else 0
    avg_conf = sum(confidences) / len(confidences) if confidences else 0
    print("\n" + "=" * 60)
    print("🏆 FINAL COMPETITIVE BENCHMARK METRICS")
    print("=" * 60)
    print(f"Advanced Exam Accuracy: {accuracy:.1f}% (Expected > 85%)")
    print(f"Average Confidence: {avg_conf*100:.1f}%")
    print(f"Average Latency: {avg_latency:.3f}s")
    print("=" * 60)
if __name__ == "__main__":
    # Force UTF-8 output so the emoji banners render on consoles whose
    # default encoding cannot represent them (e.g. Windows cp1252).
    sys.stdout.reconfigure(encoding='utf-8')
    run_competitive_benchmark()