| import time
|
| import asyncio
|
| import sys
|
| import os
|
|
|
|
|
| sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
|
|
|
| from core.verification_engine import run_verification_parallel
|
|
|
# Mock dataset of worked solutions styled after Indian competitive exams
# (GATE / JEE). Each entry carries:
#   "exam"     - exam/topic label, used only for console output
#   "question" - problem statement handed to the verification engine
#   "steps"    - ordered reasoning steps to verify (raw strings hold LaTeX)
#   "answer"   - expected final answer (informational; the benchmark scores
#                the verifier's verdict, not this field)
COMPETITIVE_EXAM_MOCK = [
    {
        "exam": "GATE (CS) - Linear Algebra",
        "question": "Let M be a 2x2 matrix such that M = [[4, 1], [2, 3]]. Find the sum of the eigenvalues of M.",
        "steps": [
            "The sum of the eigenvalues of a matrix is equal to its trace.",
            "Trace(M) = 4 + 3",
            "Trace(M) = 7",
            "Therefore, the sum of the eigenvalues is 7."
        ],
        "answer": "7"
    },
    {
        "exam": "JEE Advanced - Calculus",
        "question": "Evaluate the definite integral of x * e^x from x=0 to x=1.",
        "steps": [
            # Raw strings preserve LaTeX backslashes (\int etc.) verbatim.
            r"Use integration by parts: \int u dv = uv - \int v du",
            r"Let u = x, so du = dx. Let dv = e^x dx, so v = e^x.",
            r"\int_0^1 x e^x dx = [x e^x]_0^1 - \int_0^1 e^x dx",
            "= (1 * e^1 - 0 * e^0) - [e^x]_0^1",
            "= e - (e^1 - e^0)",
            "= e - e + 1 = 1",
            "The final evaluated definite integral is 1."
        ],
        "answer": "1"
    },
    {
        "exam": "GATE (EC) - Probability",
        "question": "A box contains 4 red balls and 6 black balls. Three balls are drawn at random without replacement. What is the probability that exactly two are red?",
        "steps": [
            "Total ways to draw 3 balls from 10 is C(10,3).",
            "C(10,3) = (10*9*8)/(3*2*1) = 120",
            "Ways to draw exactly 2 red balls from 4 is C(4,2) = 6.",
            "Ways to draw 1 black ball from 6 is C(6,1) = 6.",
            "Total favorable ways = 6 * 6 = 36.",
            "Probability = 36 / 120 = 3 / 10 = 0.3."
        ],
        "answer": "0.3"
    },
    {
        "exam": "JEE Mains - Kinematics Paradox",
        "question": "A particle moves such that its velocity v is given by v = t^2 - 4t + 3. Find the acceleration when velocity is zero.",
        "steps": [
            "Velocity v = t^2 - 4t + 3 = 0",
            "(t - 1)(t - 3) = 0",
            "So t = 1 or t = 3.",
            "Acceleration a = dv/dt = 2t - 4.",
            "At t = 1, a = 2(1) - 4 = -2.",
            "At t = 3, a = 2(3) - 4 = 2.",
            "The accelerations are -2 and 2."
        ],
        # NOTE(review): the steps derive both -2 and 2 but "answer" lists
        # only "2" — presumably intentional given the "Paradox" label, to
        # probe the verifier's handling of ambiguous answers. Confirm.
        "answer": "2"
    },
    {
        "exam": "GATE (ME) - Differential Equations",
        "question": "Solve the initial value problem dy/dx = 2xy, y(0) = 1. Find y at x = 1.",
        "steps": [
            "dy/y = 2x dx",
            r"\int dy/y = \int 2x dx",
            "ln(y) = x^2 + C",
            "Use y(0) = 1: ln(1) = 0 + C => C = 0",
            "ln(y) = x^2 => y = e^(x^2)",
            "At x = 1, y = e^(1^2) = e."
        ],
        "answer": "e"
    }
]
|
|
|
def evaluate_competitive_problem(item: dict):
    """Run one mock exam problem through the ensemble verification engine.

    Args:
        item: Dataset entry with "exam", "question", "steps", and "answer"
            keys (see COMPETITIVE_EXAM_MOCK).

    Returns:
        Tuple ``(is_correct, latency, confidence)`` where ``is_correct`` is
        True when the ensemble's final verdict equals "VALID", ``latency`` is
        the reported processing time in seconds, and ``confidence`` is the
        consensus confidence in [0, 1].
    """
    problem = item["question"]
    steps = item["steps"]
    exam = item["exam"]

    print(f"\n[EVALUATING] {exam}")
    print(f"Problem: {problem[:80]}...")

    active_models = ["GPT-4", "Claude 3.5 Sonnet", "Gemini 1.5 Pro", "Llama 3"]

    # The engine streams partial results; keep only the terminal "final"
    # payload and discard intermediate progress messages.
    result = None
    for partial_res in run_verification_parallel(
        problem, steps, model_name="Ensemble", model_list=active_models
    ):
        if partial_res["type"] == "final":
            result = partial_res

    # BUG FIX: the original dereferenced `result` unconditionally, crashing
    # with AttributeError whenever the engine never yielded a "final" message.
    if result is None:
        print(" [ERROR] Verification engine produced no final result.")
        return False, 0.0, 0.0

    consensus = result.get("consensus", {})
    verdict = consensus.get("final_verdict", "ERROR")
    latency = result.get("processing_time", 0.0)
    confidence = consensus.get("overall_confidence", 0.0)

    is_correct = verdict == "VALID"

    if not is_correct:
        print(f" [ATTENTION] System flagged errors in logic: {result.get('classified_errors', [])}")

    return is_correct, latency, confidence
|
|
|
def run_competitive_benchmark():
    """Evaluate every mock GATE/JEE problem and print summary metrics.

    Iterates COMPETITIVE_EXAM_MOCK, scoring each item via
    evaluate_competitive_problem, then reports accuracy, mean confidence,
    and mean latency to stdout. Returns None.
    """
    num_samples = len(COMPETITIVE_EXAM_MOCK)
    correct_count = 0
    latencies = []
    confidences = []

    print("="*60)
    print("🎓 MVM² ADVANCED COMPETITIVE EXAM BENCHMARK (GATE / JEE)")
    print("="*60)
    print(f"Total problems queued: {num_samples}")

    for item in COMPETITIVE_EXAM_MOCK:
        is_correct, lat, conf = evaluate_competitive_problem(item)
        if is_correct:
            correct_count += 1
        latencies.append(lat)
        confidences.append(conf)
        print(f" -> Result: {'✅ VERIFIED' if is_correct else '❌ FLAGGED'} | Confidence: {conf*100:.1f}% | Latency: {lat:.3f}s")

    # FIX: guard the accuracy division like the two averages below — the
    # original raised ZeroDivisionError on an empty dataset.
    accuracy = (correct_count / num_samples) * 100 if num_samples else 0.0
    avg_latency = sum(latencies) / len(latencies) if latencies else 0
    avg_conf = sum(confidences) / len(confidences) if confidences else 0

    print("\n" + "="*60)
    print("🏆 FINAL COMPETITIVE BENCHMARK METRICS")
    print("="*60)
    print(f"Advanced Exam Accuracy: {accuracy:.1f}% (Expected > 85%)")
    print(f"Average Confidence: {avg_conf*100:.1f}%")
    print(f"Average Latency: {avg_latency:.3f}s")
    print("="*60)
|
|
|
if __name__ == "__main__":
    # Force UTF-8 output so the emoji banners print correctly on consoles
    # with a legacy default encoding (notably Windows cp1252).
    # FIX: reconfigure() only exists on io.TextIOWrapper; guard it so the
    # script still runs when stdout has been swapped for another stream
    # (e.g. under test harnesses or output-capturing wrappers).
    if hasattr(sys.stdout, "reconfigure"):
        sys.stdout.reconfigure(encoding='utf-8')
    run_competitive_benchmark()
|
|
|