"""
Step-by-step testing of all components.

Runs each subsystem in sequence (models, consensus, error utilities,
integration, dependencies, app imports) and prints a PASS/FAIL line
per section. Each section is wrapped in try/except so one failure
does not stop the remaining checks.
"""

print("=" * 60)
print("TESTING ALL COMPONENTS ONE BY ONE")
print("=" * 60)
|
|
|
|
|
# [1/10] SymbolicVerifier: deterministic arithmetic-step checking.
print("\n[1/10] Testing Model 1: SymbolicVerifier")
try:
    from models.symbolic_verifier import SymbolicVerifier

    verifier = SymbolicVerifier()

    # A correct arithmetic step should be accepted.
    result1 = verifier.verify(["3 + 2 = 5"])
    print(f" ✓ Valid test: {result1['verdict']} ({result1['confidence']*100:.0f}% confidence)")

    # An incorrect step should be rejected and reported as an error.
    result2 = verifier.verify(["5 - 1 = 6"])
    print(f" ✓ Error test: {result2['verdict']} ({result2['confidence']*100:.0f}% confidence, {len(result2['errors'])} errors found)")

    print(" ✅ Model 1 PASSED")
except Exception as e:
    # Broad catch is deliberate: report the failure and keep testing.
    print(f" ❌ Model 1 FAILED: {e}")
|
|
|
|
|
# [2/10] LLMLogicalChecker: single-model logical verification.
print("\n[2/10] Testing Model 2: LLMLogicalChecker")
try:
    from models.llm_logical_checker import LLMLogicalChecker

    checker = LLMLogicalChecker("GPT-4")

    result = checker.verify(["She buys 2 more: 3 + 2 = 5 apples"])
    print(f" ✓ Test: {result['verdict']} ({result['confidence']*100:.0f}% confidence)")
    print(f" ✓ Model name: {result['model_name']}")

    print(" ✅ Model 2 PASSED")
except Exception as e:
    # Broad catch is deliberate: report the failure and keep testing.
    print(f" ❌ Model 2 FAILED: {e}")
|
|
|
|
|
# [3/10] EnsembleNeuralChecker: multi-model ensemble with agreement score.
print("\n[3/10] Testing Model 3: EnsembleNeuralChecker")
try:
    from models.ensemble_neural_checker import EnsembleNeuralChecker

    ensemble = EnsembleNeuralChecker(["GPT-4", "Llama 2", "Gemini"])

    result = ensemble.verify(["5 - 1 = 6"])
    print(f" ✓ Test: {result['verdict']} ({result['confidence']*100:.0f}% confidence)")
    print(f" ✓ Agreement: {result['agreement']}")
    print(f" ✓ Sub-models: {result['sub_models']}")

    print(" ✅ Model 3 PASSED")
except Exception as e:
    # Broad catch is deliberate: report the failure and keep testing.
    print(f" ❌ Model 3 FAILED: {e}")
|
|
|
|
|
# [4/10] Consensus: combine the three verifiers' verdicts on one input.
print("\n[4/10] Testing Consensus Mechanism")
try:
    from consensus.consensus_mechanism import compute_consensus
    from models.symbolic_verifier import SymbolicVerifier
    from models.llm_logical_checker import LLMLogicalChecker
    from models.ensemble_neural_checker import EnsembleNeuralChecker

    # Deliberately incorrect step so consensus has an error to agree on.
    steps = ["5 - 1 = 6"]
    symbolic = SymbolicVerifier()
    llm = LLMLogicalChecker()
    ensemble = EnsembleNeuralChecker()

    r1 = symbolic.verify(steps)
    r2 = llm.verify(steps)
    r3 = ensemble.verify(steps)

    consensus = compute_consensus(r1, r2, r3)
    print(f" ✓ Final verdict: {consensus['final_verdict']}")
    print(f" ✓ Confidence: {consensus['overall_confidence']*100:.1f}%")
    print(f" ✓ Agreement: {consensus['agreement_type']}")
    print(f" ✓ Error score: {consensus['error_score']:.3f}")

    print(" ✅ Consensus Mechanism PASSED")
except Exception as e:
    # Broad catch is deliberate: report the failure and keep testing.
    print(f" ❌ Consensus Mechanism FAILED: {e}")
|
|
|
|
|
# [5/10] Error classification: category/severity/fixability for one error.
print("\n[5/10] Testing Error Classification")
try:
    from utils.error_classifier import classify_error

    # Minimal error record in the shape the verifiers emit.
    error = {
        "type": "calculation_error",
        "found": "5 - 1 = 6",
        "correct": "5 - 1 = 4",
        "operation": "-",
        "step_number": 1,
    }

    classified = classify_error(error)
    print(f" ✓ Category: {classified['category']}")
    print(f" ✓ Severity: {classified['severity']}")
    print(f" ✓ Fixable: {classified['fixable']}")
    print(f" ✓ Fixability score: {classified['fixability_score']*100:.0f}%")

    print(" ✅ Error Classification PASSED")
except Exception as e:
    # Broad catch is deliberate: report the failure and keep testing.
    print(f" ❌ Error Classification FAILED: {e}")
|
|
|
|
|
# [6/10] Explanation generation: human-readable text for a classified error.
print("\n[6/10] Testing Explanation Generation")
try:
    from utils.explanation_generator import generate_explanation

    # Same error shape as the classification test above.
    error = {
        "type": "calculation_error",
        "found": "5 - 1 = 6",
        "correct": "5 - 1 = 4",
        "operation": "-",
        "step_number": 1,
    }

    explanation = generate_explanation(error)
    print(f" ✓ Explanation generated ({len(explanation)} chars)")
    print(f" ✓ Preview: {explanation[:80]}...")

    print(" ✅ Explanation Generation PASSED")
except Exception as e:
    # Broad catch is deliberate: report the failure and keep testing.
    print(f" ❌ Explanation Generation FAILED: {e}")
|
|
|
|
|
# [7/10] Error correction: apply a known fix to a faulty solution step.
print("\n[7/10] Testing Error Correction")
try:
    from utils.error_corrector import correct_solution

    steps = ["She gives 1 away: 5 - 1 = 6 apples"]
    errors = [{
        "type": "calculation_error",
        "found": "5 - 1 = 6",
        "correct": "5 - 1 = 4",
        "operation": "-",
        "step_number": 1,
        "fixable": True,
    }]

    correction = correct_solution(steps, errors)
    print(f" ✓ Fixed: {correction['fixed_count']}/{correction['total_fixable']} errors")
    print(f" ✓ Success rate: {correction['success_rate']*100:.0f}%")
    # Only show before/after when at least one correction was logged.
    if correction['correction_log']:
        print(f" ✓ Original: {correction['correction_log'][0]['original']}")
        print(f" ✓ Corrected: {correction['correction_log'][0]['corrected']}")

    print(" ✅ Error Correction PASSED")
except Exception as e:
    # Broad catch is deliberate: report the failure and keep testing.
    print(f" ❌ Error Correction FAILED: {e}")
|
|
|
|
|
# [8/10] Full pipeline: all three models run in parallel plus consensus,
# classification, and timing — the end-to-end path the app uses.
print("\n[8/10] Testing Full Integration (Parallel Execution)")
try:
    from core.verification_engine import run_verification_parallel

    problem = "Janet has 3 apples. She buys 2 more. She gives 1 away. How many?"
    # Last step is intentionally wrong so the pipeline must find an error.
    steps = [
        "Janet starts with 3 apples",
        "She buys 2 more: 3 + 2 = 5 apples",
        "She gives 1 away: 5 - 1 = 6 apples",
    ]

    result = run_verification_parallel(
        problem=problem,
        steps=steps,
        model_name="GPT-4",
        model_list=["GPT-4", "Llama 2", "Gemini"],
    )

    print(f" ✓ Processing time: {result['processing_time']:.2f}s")
    print(f" ✓ Final verdict: {result['consensus']['final_verdict']}")
    print(f" ✓ Confidence: {result['consensus']['overall_confidence']*100:.1f}%")
    print(f" ✓ Errors found: {len(result['classified_errors'])}")
    print(f" ✓ All 3 models executed: {len(result['model_results']) == 3}")

    print(" ✅ Full Integration PASSED")
except Exception as e:
    print(f" ❌ Full Integration FAILED: {e}")
    # Integration failures are the hardest to diagnose — dump the traceback.
    import traceback
    traceback.print_exc()
|
|
|
|
|
# [9/10] Third-party dependencies required by the app and test suite.
print("\n[9/10] Checking Dependencies")
try:
    import streamlit
    import sympy
    import pytest

    print(" ✓ streamlit installed")
    print(" ✓ sympy installed")
    print(" ✓ pytest installed")
    print(" ✅ All Dependencies Available")
except ImportError as e:
    # Missing packages are recoverable — point at the install command.
    print(f" ⚠️ Missing dependency: {e}")
    print(" Run: pip install -r requirements.txt")
|
|
|
|
|
# [10/10] Smoke-test the exact imports app.py performs at startup.
# The names are intentionally unused — only importability is checked.
print("\n[10/10] Testing App Imports")
try:
    import streamlit as st
    import time
    from typing import List, Dict, Any
    from core import run_verification_parallel

    print(" ✓ All app imports successful")
    print(" ✅ App Ready to Run")
except Exception as e:
    print(f" ❌ App Import FAILED: {e}")
|
|
|
# Closing banner and follow-up instructions for the user.
print("\n" + "=" * 60)
print("TESTING COMPLETE")
print("=" * 60)
print("\nNext steps:")
print("1. Run: streamlit run app.py")
print("2. Or test: python run_example.py")
print("3. Or run tests: pytest")
|
|
|
|
|