import time
import asyncio
import sys
import os
# Ensure the root of the project is in the path
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
from core.verification_engine import run_verification_parallel
# Mock dataset of competitive-exam problems (GATE / JEE) with worked solutions.
# Each record carries:
#   "exam"     - label identifying the exam and topic,
#   "question" - the problem statement,
#   "steps"    - ordered reasoning steps fed to the verification engine,
#   "answer"   - the expected final result, as a string.
COMPETITIVE_EXAM_MOCK: list[dict] = [
{
"exam": "GATE (CS) - Linear Algebra",
"question": "Let M be a 2x2 matrix such that M = [[4, 1], [2, 3]]. Find the sum of the eigenvalues of M.",
"steps": [
"The sum of the eigenvalues of a matrix is equal to its trace.",
"Trace(M) = 4 + 3",
"Trace(M) = 7",
"Therefore, the sum of the eigenvalues is 7."
],
"answer": "7"
},
{
"exam": "JEE Advanced - Calculus",
"question": "Evaluate the definite integral of x * e^x from x=0 to x=1.",
"steps": [
r"Use integration by parts: \int u dv = uv - \int v du",
r"Let u = x, so du = dx. Let dv = e^x dx, so v = e^x.",
r"\int_0^1 x e^x dx = [x e^x]_0^1 - \int_0^1 e^x dx",
"= (1 * e^1 - 0 * e^0) - [e^x]_0^1",
"= e - (e^1 - e^0)",
"= e - e + 1 = 1",
"The final evaluated definite integral is 1."
],
"answer": "1"
},
{
"exam": "GATE (EC) - Probability",
"question": "A box contains 4 red balls and 6 black balls. Three balls are drawn at random without replacement. What is the probability that exactly two are red?",
"steps": [
"Total ways to draw 3 balls from 10 is C(10,3).",
"C(10,3) = (10*9*8)/(3*2*1) = 120",
"Ways to draw exactly 2 red balls from 4 is C(4,2) = 6.",
"Ways to draw 1 black ball from 6 is C(6,1) = 6.",
"Total favorable ways = 6 * 6 = 36.",
"Probability = 36 / 120 = 3 / 10 = 0.3."
],
"answer": "0.3"
},
{
"exam": "JEE Mains - Kinematics Paradox",
"question": "A particle moves such that its velocity v is given by v = t^2 - 4t + 3. Find the acceleration when velocity is zero.",
"steps": [
"Velocity v = t^2 - 4t + 3 = 0",
"(t - 1)(t - 3) = 0",
"So t = 1 or t = 3.",
"Acceleration a = dv/dt = 2t - 4.",
"At t = 1, a = 2(1) - 4 = -2.",
"At t = 3, a = 2(3) - 4 = 2.",
"The accelerations are -2 and 2."
],
"answer": "2" # or -2, testing logic handling branching paths
},
{
"exam": "GATE (ME) - Differential Equations",
"question": "Solve the initial value problem dy/dx = 2xy, y(0) = 1. Find y at x = 1.",
"steps": [
"dy/y = 2x dx",
r"\int dy/y = \int 2x dx",
"ln(y) = x^2 + C",
"Use y(0) = 1: ln(1) = 0 + C => C = 0",
"ln(y) = x^2 => y = e^(x^2)",
"At x = 1, y = e^(1^2) = e."
],
"answer": "e"
}
]
def evaluate_competitive_problem(item: dict) -> tuple[bool, float, float]:
    """Run the verification engine on one mock exam problem.

    Args:
        item: Problem record with "exam", "question", "steps", "answer" keys.

    Returns:
        A tuple ``(is_correct, latency, confidence)`` where ``is_correct``
        is True when the engine's final verdict is "VALID", ``latency`` is
        the reported processing time in seconds, and ``confidence`` is the
        consensus confidence in [0, 1].
    """
    problem = item["question"]
    steps = item["steps"]
    exam = item["exam"]
    print(f"\n[EVALUATING] {exam}")
    print(f"Problem: {problem[:80]}...")
    # Consume the streaming generator; only the terminal "final" payload
    # carries the consensus we need.
    result = None
    # We use all 4 models to simulate max rigorous verification for competitive exams
    active_models = ["GPT-4", "Claude 3.5 Sonnet", "Gemini 1.5 Pro", "Llama 3"]
    for partial_res in run_verification_parallel(problem, steps, model_name="Ensemble", model_list=active_models):
        if partial_res["type"] == "final":
            result = partial_res
    # Robustness: if the engine never emitted a "final" event, the original
    # code crashed with AttributeError on result.get(...). Treat that case
    # as a failed verification instead.
    if result is None:
        print(" [ATTENTION] Verification engine produced no final result.")
        return False, 0.0, 0.0
    consensus = result.get("consensus", {})
    verdict = consensus.get("final_verdict", "ERROR")
    latency = result.get("processing_time", 0.0)
    confidence = consensus.get("overall_confidence", 0.0)
    is_correct = verdict == "VALID"
    if not is_correct:
        print(f" [ATTENTION] System flagged errors in logic: {result.get('classified_errors', [])}")
    return is_correct, latency, confidence
def run_competitive_benchmark() -> None:
    """Evaluate every mock GATE/JEE problem and print aggregate metrics.

    Prints per-problem verdicts followed by overall accuracy, average
    confidence, and average latency. Returns nothing; all output goes
    to stdout.
    """
    num_samples = len(COMPETITIVE_EXAM_MOCK)
    correct_count = 0
    latencies = []
    confidences = []
    print("=" * 60)
    print("🎓 MVM² ADVANCED COMPETITIVE EXAM BENCHMARK (GATE / JEE)")
    print("=" * 60)
    print(f"Total problems queued: {num_samples}")
    for item in COMPETITIVE_EXAM_MOCK:
        is_correct, lat, conf = evaluate_competitive_problem(item)
        if is_correct:
            correct_count += 1
        latencies.append(lat)
        confidences.append(conf)
        print(f" -> Result: {'✅ VERIFIED' if is_correct else '❌ FLAGGED'} | Confidence: {conf*100:.1f}% | Latency: {lat:.3f}s")
    # Robustness: guard the accuracy division like the averages below, so an
    # empty dataset cannot raise ZeroDivisionError.
    accuracy = (correct_count / num_samples) * 100 if num_samples else 0.0
    avg_latency = sum(latencies) / len(latencies) if latencies else 0
    avg_conf = sum(confidences) / len(confidences) if confidences else 0
    print("\n" + "=" * 60)
    print("🏆 FINAL COMPETITIVE BENCHMARK METRICS")
    print("=" * 60)
    print(f"Advanced Exam Accuracy: {accuracy:.1f}% (Expected > 85%)")
    print(f"Average Confidence: {avg_conf*100:.1f}%")
    print(f"Average Latency: {avg_latency:.3f}s")
    print("=" * 60)
if __name__ == "__main__":
    # Force UTF-8 output so the emoji banners render on consoles whose
    # default encoding cannot represent them (e.g. Windows cp1252).
    sys.stdout.reconfigure(encoding='utf-8')
    run_competitive_benchmark()