# mvm2-math-verification / scripts/test_step_by_step.py
# Author: Varshith dharmaj
# Commit: "Robust MVM2 System Sync: Fixed Imports and Restored Services" (b25b8f2, verified)
"""
Step-by-step testing of all components
"""
# Banner shared by the opening header lines.
_BANNER = "=" * 60

print(_BANNER)
print("TESTING ALL COMPONENTS ONE BY ONE")
print(_BANNER)
# Test 1: Model 1 - SymbolicVerifier
print("\n[1/10] Testing Model 1: SymbolicVerifier")
try:
    from models.symbolic_verifier import SymbolicVerifier

    verifier = SymbolicVerifier()

    # A step whose arithmetic is correct should verify cleanly.
    ok = verifier.verify(["3 + 2 = 5"])
    print(f" โœ“ Valid test: {ok['verdict']} ({ok['confidence']*100:.0f}% confidence)")

    # A step with a deliberate arithmetic mistake should be flagged.
    bad = verifier.verify(["5 - 1 = 6"])
    print(f" โœ“ Error test: {bad['verdict']} ({bad['confidence']*100:.0f}% confidence, {len(bad['errors'])} errors found)")

    print(" โœ… Model 1 PASSED")
except Exception as e:
    print(f" โŒ Model 1 FAILED: {e}")
# Test 2: Model 2 - LLMLogicalChecker
print("\n[2/10] Testing Model 2: LLMLogicalChecker")
try:
    from models.llm_logical_checker import LLMLogicalChecker

    # Instantiate against a named backing model and verify one step.
    checker = LLMLogicalChecker("GPT-4")
    outcome = checker.verify(["She buys 2 more: 3 + 2 = 5 apples"])

    print(f" โœ“ Test: {outcome['verdict']} ({outcome['confidence']*100:.0f}% confidence)")
    print(f" โœ“ Model name: {outcome['model_name']}")
    print(" โœ… Model 2 PASSED")
except Exception as e:
    print(f" โŒ Model 2 FAILED: {e}")
# Test 3: Model 3 - EnsembleNeuralChecker
print("\n[3/10] Testing Model 3: EnsembleNeuralChecker")
try:
    from models.ensemble_neural_checker import EnsembleNeuralChecker

    # The ensemble aggregates several sub-models; feed it a faulty step.
    voting_pool = EnsembleNeuralChecker(["GPT-4", "Llama 2", "Gemini"])
    outcome = voting_pool.verify(["5 - 1 = 6"])

    print(f" โœ“ Test: {outcome['verdict']} ({outcome['confidence']*100:.0f}% confidence)")
    print(f" โœ“ Agreement: {outcome['agreement']}")
    print(f" โœ“ Sub-models: {outcome['sub_models']}")
    print(" โœ… Model 3 PASSED")
except Exception as e:
    print(f" โŒ Model 3 FAILED: {e}")
# Test 4: Consensus Mechanism
print("\n[4/10] Testing Consensus Mechanism")
try:
    from consensus.consensus_mechanism import compute_consensus
    from models.symbolic_verifier import SymbolicVerifier
    from models.llm_logical_checker import LLMLogicalChecker
    from models.ensemble_neural_checker import EnsembleNeuralChecker

    faulty_steps = ["5 - 1 = 6"]

    # Run all three verifiers on the same faulty step, then fuse
    # their individual verdicts into a single consensus result.
    individual = [
        SymbolicVerifier().verify(faulty_steps),
        LLMLogicalChecker().verify(faulty_steps),
        EnsembleNeuralChecker().verify(faulty_steps),
    ]
    fused = compute_consensus(*individual)

    print(f" โœ“ Final verdict: {fused['final_verdict']}")
    print(f" โœ“ Confidence: {fused['overall_confidence']*100:.1f}%")
    print(f" โœ“ Agreement: {fused['agreement_type']}")
    print(f" โœ“ Error score: {fused['error_score']:.3f}")
    print(" โœ… Consensus Mechanism PASSED")
except Exception as e:
    print(f" โŒ Consensus Mechanism FAILED: {e}")
# Test 5: Error Classification
print("\n[5/10] Testing Error Classification")
try:
    from utils.error_classifier import classify_error

    # A representative subtraction mistake as the classifier's input.
    sample_error = {
        "type": "calculation_error",
        "found": "5 - 1 = 6",
        "correct": "5 - 1 = 4",
        "operation": "-",
        "step_number": 1,
    }
    verdict = classify_error(sample_error)

    print(f" โœ“ Category: {verdict['category']}")
    print(f" โœ“ Severity: {verdict['severity']}")
    print(f" โœ“ Fixable: {verdict['fixable']}")
    print(f" โœ“ Fixability score: {verdict['fixability_score']*100:.0f}%")
    print(" โœ… Error Classification PASSED")
except Exception as e:
    print(f" โŒ Error Classification FAILED: {e}")
# Test 6: Explanation Generation
print("\n[6/10] Testing Explanation Generation")
try:
    from utils.explanation_generator import generate_explanation

    # Same representative subtraction mistake used for classification.
    sample_error = {
        "type": "calculation_error",
        "found": "5 - 1 = 6",
        "correct": "5 - 1 = 4",
        "operation": "-",
        "step_number": 1,
    }
    text = generate_explanation(sample_error)

    print(f" โœ“ Explanation generated ({len(text)} chars)")
    print(f" โœ“ Preview: {text[:80]}...")
    print(" โœ… Explanation Generation PASSED")
except Exception as e:
    print(f" โŒ Explanation Generation FAILED: {e}")
# Test 7: Error Correction
print("\n[7/10] Testing Error Correction")
try:
    from utils.error_corrector import correct_solution

    # One faulty solution step plus the error record describing the fix.
    faulty_steps = ["She gives 1 away: 5 - 1 = 6 apples"]
    known_errors = [
        {
            "type": "calculation_error",
            "found": "5 - 1 = 6",
            "correct": "5 - 1 = 4",
            "operation": "-",
            "step_number": 1,
            "fixable": True,
        }
    ]
    outcome = correct_solution(faulty_steps, known_errors)

    print(f" โœ“ Fixed: {outcome['fixed_count']}/{outcome['total_fixable']} errors")
    print(f" โœ“ Success rate: {outcome['success_rate']*100:.0f}%")
    # Only show before/after when at least one correction was logged.
    if outcome['correction_log']:
        first_fix = outcome['correction_log'][0]
        print(f" โœ“ Original: {first_fix['original']}")
        print(f" โœ“ Corrected: {first_fix['corrected']}")
    print(" โœ… Error Correction PASSED")
except Exception as e:
    print(f" โŒ Error Correction FAILED: {e}")
# Test 8: Full Integration
print("\n[8/10] Testing Full Integration (Parallel Execution)")
try:
    from core.verification_engine import run_verification_parallel

    word_problem = "Janet has 3 apples. She buys 2 more. She gives 1 away. How many?"
    solution_steps = [
        "Janet starts with 3 apples",
        "She buys 2 more: 3 + 2 = 5 apples",
        "She gives 1 away: 5 - 1 = 6 apples",  # deliberate ERROR
    ]

    # Drive the whole pipeline end-to-end with all three models in parallel.
    outcome = run_verification_parallel(
        problem=word_problem,
        steps=solution_steps,
        model_name="GPT-4",
        model_list=["GPT-4", "Llama 2", "Gemini"],
    )

    print(f" โœ“ Processing time: {outcome['processing_time']:.2f}s")
    print(f" โœ“ Final verdict: {outcome['consensus']['final_verdict']}")
    print(f" โœ“ Confidence: {outcome['consensus']['overall_confidence']*100:.1f}%")
    print(f" โœ“ Errors found: {len(outcome['classified_errors'])}")
    print(f" โœ“ All 3 models executed: {len(outcome['model_results']) == 3}")
    print(" โœ… Full Integration PASSED")
except Exception as e:
    print(f" โŒ Full Integration FAILED: {e}")
    # Integration failures can come from deep in the pipeline, so
    # dump the full traceback rather than just the message.
    import traceback
    traceback.print_exc()
# Test 9: Check Dependencies
# Probe each third-party dependency individually so one missing package
# does not hide the status of the others. (The original single try block
# stopped at the first failing import, and only reported "installed" for
# any of the three when all three succeeded.)
print("\n[9/10] Checking Dependencies")
import importlib

_missing_deps = []
for _dep in ("streamlit", "sympy", "pytest"):
    try:
        importlib.import_module(_dep)
        print(f" โœ“ {_dep} installed")
    except ImportError as e:
        _missing_deps.append(_dep)
        print(f" โš ๏ธ Missing dependency: {e}")

if _missing_deps:
    print(" Run: pip install -r requirements.txt")
else:
    print(" โœ… All Dependencies Available")
# Test 10: Import Check for App
print("\n[10/10] Testing App Imports")
try:
    # Pull in everything app.py needs so startup failures surface
    # here instead of inside the Streamlit process.
    import streamlit as st
    import time
    from typing import List, Dict, Any
    from core import run_verification_parallel

    print(" โœ“ All app imports successful")
    print(" โœ… App Ready to Run")
except Exception as e:
    print(f" โŒ App Import FAILED: {e}")

# Closing banner and usage hints.
_rule = "=" * 60
print("\n" + _rule)
print("TESTING COMPLETE")
print(_rule)
print("\nNext steps:")
for _hint in (
    "1. Run: streamlit run app.py",
    "2. Or test: python run_example.py",
    "3. Or run tests: pytest",
):
    print(_hint)