File size: 5,492 Bytes
b25b8f2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import time
import asyncio
import sys
import os

# Ensure the root of the project is in the path
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))

from core.verification_engine import run_verification_parallel

# Hand-written benchmark problems in the style of GATE / JEE exam questions.
# Every entry is a dict with four string-keyed fields:
#   "exam"     - label of the exam and topic, printed while evaluating
#   "question" - the problem statement
#   "steps"    - ordered list of solution steps (plain text, some with
#                LaTeX-style markup kept in raw strings)
#   "answer"   - the expected final answer, stored as a string
COMPETITIVE_EXAM_MOCK = [
    {
        "exam": "GATE (CS) - Linear Algebra",
        "question": "Let M be a 2x2 matrix such that M = [[4, 1], [2, 3]]. Find the sum of the eigenvalues of M.",
        "steps": [
            "The sum of the eigenvalues of a matrix is equal to its trace.",
            "Trace(M) = 4 + 3",
            "Trace(M) = 7",
            "Therefore, the sum of the eigenvalues is 7."
        ],
        "answer": "7"
    },
    {
        "exam": "JEE Advanced - Calculus",
        "question": "Evaluate the definite integral of x * e^x from x=0 to x=1.",
        "steps": [
            r"Use integration by parts: \int u dv = uv - \int v du",
            r"Let u = x, so du = dx. Let dv = e^x dx, so v = e^x.",
            r"\int_0^1 x e^x dx = [x e^x]_0^1 - \int_0^1 e^x dx",
            "= (1 * e^1 - 0 * e^0) - [e^x]_0^1",
            "= e - (e^1 - e^0)",
            "= e - e + 1 = 1",
            "The final evaluated definite integral is 1."
        ],
        "answer": "1"
    },
    {
        "exam": "GATE (EC) - Probability",
        "question": "A box contains 4 red balls and 6 black balls. Three balls are drawn at random without replacement. What is the probability that exactly two are red?",
        "steps": [
            "Total ways to draw 3 balls from 10 is C(10,3).",
            "C(10,3) = (10*9*8)/(3*2*1) = 120",
            "Ways to draw exactly 2 red balls from 4 is C(4,2) = 6.",
            "Ways to draw 1 black ball from 6 is C(6,1) = 6.",
            "Total favorable ways = 6 * 6 = 36.",
            "Probability = 36 / 120 = 3 / 10 = 0.3."
        ],
        "answer": "0.3"
    },
    {
        "exam": "JEE Mains - Kinematics Paradox",
        "question": "A particle moves such that its velocity v is given by v = t^2 - 4t + 3. Find the acceleration when velocity is zero.",
        "steps": [
            "Velocity v = t^2 - 4t + 3 = 0",
            "(t - 1)(t - 3) = 0",
            "So t = 1 or t = 3.",
            "Acceleration a = dv/dt = 2t - 4.",
            "At t = 1, a = 2(1) - 4 = -2.",
            "At t = 3, a = 2(3) - 4 = 2.",
            "The accelerations are -2 and 2."
        ],
        # The steps yield two accelerations (-2 at t = 1, 2 at t = 3); only one
        # is recorded here. This entry exercises handling of branching paths.
        "answer": "2"
    },
    {
        "exam": "GATE (ME) - Differential Equations",
        "question": "Solve the initial value problem dy/dx = 2xy, y(0) = 1. Find y at x = 1.",
        "steps": [
            "dy/y = 2x dx",
            r"\int dy/y = \int 2x dx",
            "ln(y) = x^2 + C",
            "Use y(0) = 1: ln(1) = 0 + C => C = 0",
            "ln(y) = x^2 => y = e^(x^2)",
            "At x = 1, y = e^(1^2) = e."
        ],
        "answer": "e"
    }
]

def evaluate_competitive_problem(item: dict, *, verifier=None):
    """Verify the solution steps of one benchmark problem and summarize the outcome.

    Args:
        item: Problem record providing the "exam", "question" and "steps"
            keys. Any "answer" key is not consulted here: the verdict only
            reflects whether the verifier accepts the supplied steps.
        verifier: Optional callable used instead of
            ``run_verification_parallel``. It is invoked as
            ``verifier(problem, steps, model_name=..., model_list=...)`` and
            must return an iterable of dicts. The last dict whose "type" is
            "final" is taken as the result.

    Returns:
        A ``(is_correct, latency, confidence)`` tuple. ``is_correct`` is True
        only when the final consensus verdict is "VALID". When the verifier
        never produces a final record the tuple is ``(False, 0.0, 0.0)``.

    Raises:
        KeyError: If ``item`` lacks "question", "steps" or "exam".
    """
    problem = item["question"]
    steps = item["steps"]
    exam = item["exam"]

    print(f"\n[EVALUATING] {exam}")
    print(f"Problem: {problem[:80]}...")

    if verifier is None:
        # Looked up at call time so a caller-supplied verifier never needs
        # the project-level verification engine to be importable.
        verifier = run_verification_parallel

    # We use all 4 models to simulate max rigorous verification for competitive exams
    active_models = ["GPT-4", "Claude 3.5 Sonnet", "Gemini 1.5 Pro", "Llama 3"]

    # Consume the whole stream; intermediate records are ignored and the last
    # "final" record wins.
    result = None
    for partial_res in verifier(problem, steps, model_name="Ensemble", model_list=active_models):
        if partial_res.get("type") == "final":
            result = partial_res

    if result is None:
        # Without this guard the lookups below would fail on None.
        print("    [ATTENTION] Verification finished without a final result.")
        return False, 0.0, 0.0

    # `or` fallbacks also cover keys that are present but explicitly None, so
    # the caller can always do arithmetic on latency and confidence.
    consensus = result.get("consensus") or {}
    verdict = consensus.get("final_verdict", "ERROR")
    latency = result.get("processing_time") or 0.0
    confidence = consensus.get("overall_confidence") or 0.0

    is_correct = verdict == "VALID"

    if not is_correct:
        print(f"    [ATTENTION] System flagged errors in logic: {result.get('classified_errors', [])}")

    return is_correct, latency, confidence

def run_competitive_benchmark(problems=None):
    """Evaluate every benchmark problem and print per-problem and summary metrics.

    Args:
        problems: Optional iterable of problem records to evaluate, each in
            the form accepted by ``evaluate_competitive_problem``. Defaults
            to ``COMPETITIVE_EXAM_MOCK``.

    Returns:
        None. All results are written to standard output.
    """
    if problems is None:
        problems = COMPETITIVE_EXAM_MOCK
    # Materialize once: the count is printed before the single pass below.
    problems = list(problems)

    num_samples = len(problems)
    correct_count = 0
    latencies = []
    confidences = []

    print("="*60)
    print("🎓 MVM² ADVANCED COMPETITIVE EXAM BENCHMARK (GATE / JEE)")
    print("="*60)
    print(f"Total problems queued: {num_samples}")

    for item in problems:
        is_correct, lat, conf = evaluate_competitive_problem(item)
        if is_correct:
            correct_count += 1
        latencies.append(lat)
        confidences.append(conf)
        print(f"  -> Result: {'✅ VERIFIED' if is_correct else '❌ FLAGGED'} | Confidence: {conf*100:.1f}% | Latency: {lat:.3f}s")

    # Guard every ratio the same way so an empty problem set reports zeros
    # instead of raising ZeroDivisionError.
    accuracy = (correct_count / num_samples) * 100 if num_samples else 0.0
    avg_latency = sum(latencies) / len(latencies) if latencies else 0
    avg_conf = sum(confidences) / len(confidences) if confidences else 0

    print("\n" + "="*60)
    print("🏆 FINAL COMPETITIVE BENCHMARK METRICS")
    print("="*60)
    print(f"Advanced Exam Accuracy: {accuracy:.1f}% (Expected > 85%)")
    print(f"Average Confidence:     {avg_conf*100:.1f}%")
    print(f"Average Latency:        {avg_latency:.3f}s")
    print("="*60)

if __name__ == "__main__":
    # Ensure UTF-8 output: the benchmark banners contain emoji and other
    # non-ASCII characters. reconfigure() is looked up defensively because
    # stdout may be None or a replaced stream that does not provide it.
    _reconfigure = getattr(sys.stdout, "reconfigure", None)
    if callable(_reconfigure):
        _reconfigure(encoding='utf-8')
    run_competitive_benchmark()