# mvm2-math-verification / scripts / benchmark_competitive.py
# Robust MVM2 system sync: fixed imports and restored services.
import time
import asyncio
import sys
import os
# Ensure the root of the project is in the path
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
from core.verification_engine import run_verification_parallel
# Mock GATE/JEE problems used as the benchmark corpus. Each entry carries the
# exam label, the question text, the worked solution steps (fed to the
# verification engine), and the expected final answer string.
COMPETITIVE_EXAM_MOCK = [
    {
        "exam": "GATE (CS) - Linear Algebra",
        "question": "Let M be a 2x2 matrix such that M = [[4, 1], [2, 3]]. Find the sum of the eigenvalues of M.",
        "steps": [
            "The sum of the eigenvalues of a matrix is equal to its trace.",
            "Trace(M) = 4 + 3",
            "Trace(M) = 7",
            "Therefore, the sum of the eigenvalues is 7."
        ],
        "answer": "7"
    },
    {
        "exam": "JEE Advanced - Calculus",
        "question": "Evaluate the definite integral of x * e^x from x=0 to x=1.",
        "steps": [
            # Raw strings keep the LaTeX backslashes intact.
            r"Use integration by parts: \int u dv = uv - \int v du",
            r"Let u = x, so du = dx. Let dv = e^x dx, so v = e^x.",
            r"\int_0^1 x e^x dx = [x e^x]_0^1 - \int_0^1 e^x dx",
            "= (1 * e^1 - 0 * e^0) - [e^x]_0^1",
            "= e - (e^1 - e^0)",
            "= e - e + 1 = 1",
            "The final evaluated definite integral is 1."
        ],
        "answer": "1"
    },
    {
        "exam": "GATE (EC) - Probability",
        "question": "A box contains 4 red balls and 6 black balls. Three balls are drawn at random without replacement. What is the probability that exactly two are red?",
        "steps": [
            "Total ways to draw 3 balls from 10 is C(10,3).",
            "C(10,3) = (10*9*8)/(3*2*1) = 120",
            "Ways to draw exactly 2 red balls from 4 is C(4,2) = 6.",
            "Ways to draw 1 black ball from 6 is C(6,1) = 6.",
            "Total favorable ways = 6 * 6 = 36.",
            "Probability = 36 / 120 = 3 / 10 = 0.3."
        ],
        "answer": "0.3"
    },
    {
        "exam": "JEE Mains - Kinematics Paradox",
        "question": "A particle moves such that its velocity v is given by v = t^2 - 4t + 3. Find the acceleration when velocity is zero.",
        "steps": [
            "Velocity v = t^2 - 4t + 3 = 0",
            "(t - 1)(t - 3) = 0",
            "So t = 1 or t = 3.",
            "Acceleration a = dv/dt = 2t - 4.",
            "At t = 1, a = 2(1) - 4 = -2.",
            "At t = 3, a = 2(3) - 4 = 2.",
            "The accelerations are -2 and 2."
        ],
        "answer": "2"  # or -2, testing logic handling branching paths
    },
    {
        "exam": "GATE (ME) - Differential Equations",
        "question": "Solve the initial value problem dy/dx = 2xy, y(0) = 1. Find y at x = 1.",
        "steps": [
            "dy/y = 2x dx",
            r"\int dy/y = \int 2x dx",
            "ln(y) = x^2 + C",
            "Use y(0) = 1: ln(1) = 0 + C => C = 0",
            "ln(y) = x^2 => y = e^(x^2)",
            "At x = 1, y = e^(1^2) = e."
        ],
        "answer": "e"
    }
]
def evaluate_competitive_problem(item: dict) -> tuple:
    """Run the MVM2 ensemble verifier on one mock exam problem.

    Args:
        item: A mock-problem dict with "exam", "question", "steps", and
            "answer" keys (see COMPETITIVE_EXAM_MOCK).

    Returns:
        A (is_correct, latency, confidence) tuple where ``is_correct`` is
        True only when the ensemble's final verdict is ``"VALID"``,
        ``latency`` is the reported processing time in seconds, and
        ``confidence`` is the overall consensus confidence in [0, 1].
    """
    problem = item["question"]
    steps = item["steps"]
    exam = item["exam"]
    print(f"\n[EVALUATING] {exam}")
    print(f"Problem: {problem[:80]}...")
    # We use all 4 models to simulate max rigorous verification for
    # competitive exams.
    active_models = ["GPT-4", "Claude 3.5 Sonnet", "Gemini 1.5 Pro", "Llama 3"]
    # Consume the streaming generator; only the terminal "final" payload
    # carries the consensus we report on.
    result = None
    for partial_res in run_verification_parallel(
        problem, steps, model_name="Ensemble", model_list=active_models
    ):
        if partial_res.get("type") == "final":
            result = partial_res
    # Fix: previously `result.get(...)` raised AttributeError when the stream
    # never yielded a "final" event. Fail soft with a flagged, zero-confidence
    # result instead of crashing the whole benchmark run.
    if result is None:
        print(" [ERROR] Verification stream produced no final result.")
        return False, 0.0, 0.0
    consensus = result.get("consensus", {})
    verdict = consensus.get("final_verdict", "ERROR")
    latency = result.get("processing_time", 0.0)
    confidence = consensus.get("overall_confidence", 0.0)
    is_correct = verdict == "VALID"
    if not is_correct:
        print(f" [ATTENTION] System flagged errors in logic: {result.get('classified_errors', [])}")
    return is_correct, latency, confidence
def run_competitive_benchmark():
    """Run every mock GATE/JEE problem through the verifier and print metrics.

    Iterates COMPETITIVE_EXAM_MOCK, collects per-problem correctness,
    latency, and confidence, then prints aggregate accuracy and averages.
    """
    banner = "=" * 60
    total = len(COMPETITIVE_EXAM_MOCK)
    verified = 0
    latencies = []
    confidences = []

    print(banner)
    print("🎓 MVM² ADVANCED COMPETITIVE EXAM BENCHMARK (GATE / JEE)")
    print(banner)
    print(f"Total problems queued: {total}")

    for entry in COMPETITIVE_EXAM_MOCK:
        ok, lat, conf = evaluate_competitive_problem(entry)
        if ok:
            verified += 1
        latencies.append(lat)
        confidences.append(conf)
        status = '✅ VERIFIED' if ok else '❌ FLAGGED'
        print(f" -> Result: {status} | Confidence: {conf*100:.1f}% | Latency: {lat:.3f}s")

    # Aggregate metrics; averages fall back to 0 on empty lists.
    accuracy = (verified / total) * 100
    avg_latency = sum(latencies) / len(latencies) if latencies else 0
    avg_conf = sum(confidences) / len(confidences) if confidences else 0

    print("\n" + banner)
    print("🏆 FINAL COMPETITIVE BENCHMARK METRICS")
    print(banner)
    print(f"Advanced Exam Accuracy: {accuracy:.1f}% (Expected > 85%)")
    print(f"Average Confidence: {avg_conf*100:.1f}%")
    print(f"Average Latency: {avg_latency:.3f}s")
    print(banner)
# Script entry point: reconfigure stdout for UTF-8 (the banners use emoji and
# the superscript in "MVM²"), then run the full benchmark suite.
if __name__ == "__main__":
    # Ensure UTF-8 output
    sys.stdout.reconfigure(encoding='utf-8')
    run_competitive_benchmark()