| """
|
| Streamlit Dashboard for Mathematical Reasoning Verification System
|
| Interactive UI with real-time processing logs and results display
|
| """
|
|
|
| import streamlit as st
|
| import time
|
| from typing import List, Dict, Any
|
|
|
| from core import run_verification_parallel
|
|
|
|
|
|
|
# Seed session-state keys on first page load so later code can read them
# unconditionally. Streamlit re-runs the script top-to-bottom on every
# interaction; the membership check keeps existing values across re-runs.
for _key, _default in (("steps_log", []), ("results", None)):
    if _key not in st.session_state:
        st.session_state[_key] = _default
|
|
|
|
|
def add_log(step: str, model: str, status: str, details: str):
    """Append one timestamped entry to the session-state processing log.

    Args:
        step: Short name of the pipeline stage being logged.
        model: Which model/component produced the event.
        status: Status marker string (icon prefix drives log coloring).
        details: Free-form human-readable description.
    """
    st.session_state.steps_log.append({
        "step": step,
        "model": model,
        "status": status,
        "details": details,
        "timestamp": time.time(),  # wall-clock seconds; used for ordering only
    })
|
|
|
|
|
def display_flowchart(problem="", steps_input=""):
    """Render the live ASCII problem flowchart plus the 8-step pipeline explainer.

    Args:
        problem: Math problem text currently entered in the UI.
        steps_input: Raw multi-line text of solution steps (one per line).

    Reads the last verification run from ``st.session_state.results`` (set by
    the verify handler) and, when present, decorates each step and the final
    verdict with error status.
    """
    # Results are None until the first verification run completes.
    has_results = st.session_state.results is not None
    results = st.session_state.results if has_results else None

    # Prefer the live text-area contents; fall back to the parsed steps
    # stored with the last verification results.
    steps = []
    if steps_input:
        steps = [s.strip() for s in steps_input.split('\n') if s.strip()]
    elif has_results:
        steps = results.get('steps', [])

    # Hoisted: the original re-scanned the classified-error list once per
    # step (O(steps * errors)); compute the error lookup structures once.
    classified_errors = results.get('classified_errors', []) if has_results else []
    error_step_numbers = {e.get('step_number') for e in classified_errors}

    if problem or steps:
        st.markdown("### ๐ Problem Flowchart")
        st.markdown("**Problem:**")
        st.info(problem if problem else "No problem entered yet")

        if steps:
            st.markdown("**Solution Flow:**")

            flowchart_lines = ["```"]

            # Truncate long problem text so it fits inside the ASCII box.
            problem_display = problem[:45] + "..." if len(problem) > 45 else problem
            problem_display = problem_display.ljust(50)
            flowchart_lines.append("โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ")
            flowchart_lines.append(f"โ ๐ฅ PROBLEM: {problem_display} โ")
            flowchart_lines.append("โโโโโโโโโโโโโโโโโโโโโโฌโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ")
            flowchart_lines.append(" โ")
            flowchart_lines.append(" โผ")

            for i, step in enumerate(steps, 1):
                step_short = step[:45] + "..." if len(step) > 45 else step
                has_error = i in error_step_numbers

                # NOTE(review): both icon strings below look identical due to
                # encoding damage in the source file; originally these were
                # presumably distinct error/valid emoji — confirm and restore.
                status_icon = "โ" if has_error else "โ"
                step_display = step_short.ljust(45)

                # The original had an if/else on `i < len(steps)` whose two
                # branches were byte-identical; collapsed into one body.
                # Every step box is followed by a connector arrow down to the
                # next box (or to the FINAL ANSWER box for the last step).
                flowchart_lines.append("โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ")
                flowchart_lines.append(f"โ {status_icon} STEP {i}: {step_display} โ")
                flowchart_lines.append("โโโโโโโโโโโโโโโโโโโโโโฌโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ")
                flowchart_lines.append(" โ")
                flowchart_lines.append(" โผ")

            # FINAL ANSWER box; the verdict row is only available after a run,
            # but the box's bottom border is always emitted so it stays closed.
            flowchart_lines.append("โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ")
            flowchart_lines.append("โ ๐ค FINAL ANSWER โ")
            if has_results:
                consensus = results.get('consensus', {})
                final_verdict = consensus.get('final_verdict', 'UNKNOWN')
                verdict_icon = "โ ERROR" if final_verdict == "ERROR" else "โ VALID"
                verdict_display = verdict_icon.ljust(55)
                flowchart_lines.append(f"โ {verdict_display} โ")
            flowchart_lines.append("โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ")

            flowchart_lines.append("```")
            flowchart_text = "\n".join(flowchart_lines)
            st.code(flowchart_text, language=None)

            # Per-step expandable details with any classified errors and
            # generated explanations for that step.
            st.markdown("**Step Details:**")
            for i, step in enumerate(steps, 1):
                step_errors = [e for e in classified_errors if e.get('step_number') == i]
                title = f"Step {i}: {step[:60]}{'...' if len(step) > 60 else ''}"

                if step_errors:
                    with st.expander(f"{title} โ", expanded=False):
                        st.write(f"**Full Step:** {step}")
                        for error in step_errors:
                            st.error(f"**Error Found:** {error.get('category', 'Unknown')}")
                            st.write(f"- Found: `{error.get('found', 'N/A')}`")
                            st.write(f"- Correct: `{error.get('correct', 'N/A')}`")
                        # Explanations are keyed by step number.
                        explanations = results.get('explanations', {})
                        if i in explanations:
                            st.info(f"**Explanation:** {explanations[i]}")
                else:
                    with st.expander(f"{title} โ", expanded=False):
                        st.write(f"**Full Step:** {step}")
                        st.success("No errors detected in this step")
    else:
        st.info("Enter solution steps to see the problem flowchart")

    st.markdown("---")

    # ------------------------------------------------------------------
    # Static 8-step pipeline explainer; expanders auto-open once results
    # exist (or, for error-related steps, only when relevant data exists).
    # ------------------------------------------------------------------

    with st.expander("๐ฅ **STEP 1: INPUT** - Problem & Solution Steps", expanded=True):
        st.markdown("""
**What happens here?**
- The system receives the math problem and step-by-step solution
- Input is validated and prepared for processing
- Steps are parsed and segmented for analysis
""")
        if problem:
            st.success(f"โ Received problem: {problem}")
        if steps:
            st.success(f"โ Received {len(steps)} solution steps")
        if has_results:
            st.code(f"Problem: {results.get('problem', '')[:100]}...")

    with st.expander("๐ **STEP 2: PARSING** - Extract Mathematical Expressions", expanded=has_results):
        st.markdown("""
**What happens here?**
- Mathematical expressions are extracted using regex patterns
- Operations (+, -, *, /) are identified
- Numbers and variables are recognized
- Each step is prepared for verification
""")
        if has_results:
            # Renamed from `steps` to avoid shadowing the outer variable.
            parsed_steps = results.get('steps', [])
            st.success(f"โ Parsed {len(parsed_steps)} steps")
            for i, step in enumerate(parsed_steps[:3], 1):
                st.write(f" Step {i}: {step[:60]}...")
            if len(parsed_steps) > 3:
                st.write(f" ... and {len(parsed_steps) - 3} more steps")

    with st.expander("๐ **STEP 3: PARALLEL EXECUTION** - 3 Models Running Simultaneously", expanded=has_results):
        st.markdown("""
**What happens here?**
- **Model 1 (Symbolic) ๐ข**: Uses SymPy to verify all arithmetic calculations
  - Weight: 40% (most reliable for math)
  - Not affected by sidebar selection

- **Model 2 (LLM Logical) ๐ง **: Checks for logical consistency and contradictions
  - Weight: 35%
  - Uses first selected model from sidebar (e.g., GPT-4)
  - Currently: Pattern-based simulation

- **Model 3 (Ensemble) ๐ค**: Simulates multiple LLMs voting on solution validity
  - Weight: 25%
  - Uses ALL selected models from sidebar (GPT-4, Llama 2, Gemini)
  - Each model votes, majority wins
  - Currently: Pattern-based simulation

All three models run **in parallel** using ThreadPoolExecutor for speed!
""")
        if has_results:
            model_results = results.get('model_results', {})
            col1, col2, col3 = st.columns(3)

            with col1:
                if 'symbolic' in model_results:
                    verdict = model_results['symbolic']['verdict']
                    conf = model_results['symbolic']['confidence'] * 100
                    errors = len(model_results['symbolic'].get('errors', []))
                    if verdict == "ERROR":
                        st.error(f"**๐ข Symbolic**\n\nโ {verdict}\n\nConfidence: {conf:.1f}%\n\nErrors: {errors}")
                    else:
                        st.success(f"**๐ข Symbolic**\n\nโ {verdict}\n\nConfidence: {conf:.1f}%\n\nErrors: {errors}")

            with col2:
                if 'llm_logical' in model_results:
                    verdict = model_results['llm_logical']['verdict']
                    conf = model_results['llm_logical']['confidence'] * 100
                    errors = len(model_results['llm_logical'].get('errors', []))
                    if verdict == "ERROR":
                        st.error(f"**๐ง LLM Logical**\n\nโ {verdict}\n\nConfidence: {conf:.1f}%\n\nErrors: {errors}")
                    else:
                        st.success(f"**๐ง LLM Logical**\n\nโ {verdict}\n\nConfidence: {conf:.1f}%\n\nErrors: {errors}")

            with col3:
                if 'ensemble' in model_results:
                    verdict = model_results['ensemble']['verdict']
                    conf = model_results['ensemble']['confidence'] * 100
                    agreement = model_results['ensemble'].get('agreement', 'N/A')
                    if verdict == "ERROR":
                        st.error(f"**๐ค Ensemble**\n\nโ {verdict}\n\nConfidence: {conf:.1f}%\n\nAgreement: {agreement}")
                    else:
                        st.success(f"**๐ค Ensemble**\n\nโ {verdict}\n\nConfidence: {conf:.1f}%\n\nAgreement: {agreement}")
        else:
            st.info("โณ Models will execute in parallel when you click 'Verify Solution'")

    with st.expander("โ๏ธ **STEP 4: CONSENSUS** - Weighted Voting Mechanism", expanded=has_results):
        st.markdown("""
**What happens here?**
- The system combines results from all 3 models using **weighted voting**:
  - **Symbolic Model**: 40% weight (most reliable for arithmetic)
  - **LLM Logical Model**: 35% weight (good for reasoning)
  - **Ensemble Model**: 25% weight (provides diversity)

- An **error score** is calculated: if > 0.50, verdict = ERROR
- **Confidence** is adjusted based on agreement:
  - All 3 agree: confidence boosted by 10%
  - 2/3 agree: uses average of agreeing models
  - Mixed: confidence penalized by 20%
""")
        if has_results:
            consensus = results.get('consensus', {})
            final_verdict = consensus.get('final_verdict', 'UNKNOWN')
            overall_conf = consensus.get('overall_confidence', 0) * 100
            error_score = consensus.get('error_score', 0)
            agreement = consensus.get('agreement_type', 'UNKNOWN')

            st.markdown(f"""
**Consensus Results:**
- **Final Verdict**: {'โ ERROR' if final_verdict == 'ERROR' else 'โ VALID'}
- **Overall Confidence**: {overall_conf:.1f}%
- **Error Score**: {error_score:.3f} (threshold: 0.50)
- **Agreement Type**: {agreement}
""")

            # Show how each model's weighted confidence feeds the consensus.
            # NOTE(review): these weights duplicate constants presumably
            # defined in core.py — keep in sync.
            st.markdown("**Model Contributions:**")
            individual_verdicts = consensus.get('individual_verdicts', {})
            individual_confidences = consensus.get('individual_confidences', {})
            weights = {"symbolic": 0.40, "llm_logical": 0.35, "ensemble": 0.25}

            for model_name, verdict in individual_verdicts.items():
                weight = weights.get(model_name, 0)
                confidence = individual_confidences.get(model_name, 0) * 100
                contribution = weight * individual_confidences.get(model_name, 0)
                st.write(f" - **{model_name.title()}**: {verdict} ({confidence:.1f}% confidence) โ {weight*100:.0f}% weight โ contributes {contribution:.3f}")

    with st.expander("๐ท๏ธ **STEP 5: ERROR CLASSIFICATION** - Categorize & Analyze Errors", expanded=has_results and len(classified_errors) > 0):
        st.markdown("""
**What happens here?**
- Each detected error is classified into one of 10+ error types:
  - Arithmetic Error (calculation mistakes)
  - Logical Error (contradictions)
  - Operation Mismatch (says one thing, does another)
  - Semantic Error (meaning doesn't match)
  - And more...

- **Severity** is assigned: HIGH, MEDIUM, or LOW
- **Fixability** is assessed: can the error be auto-corrected?
""")
        if has_results:
            if classified_errors:
                st.success(f"โ Classified {len(classified_errors)} error(s)")
                for error in classified_errors[:3]:
                    st.markdown(f"""
**Error in Step {error.get('step_number', '?')}:**
- **Category**: {error.get('category', 'Unknown')}
- **Severity**: {error.get('severity', 'Unknown')}
- **Fixable**: {'Yes' if error.get('fixable', False) else 'No'}
- **Fixability Score**: {error.get('fixability_score', 0)*100:.0f}%
""")
            else:
                st.info("โ No errors found - solution is valid!")

    with st.expander("๐ฌ **STEP 6: EXPLANATION GENERATION** - Create Human-Readable Explanations", expanded=has_results and len(results.get('explanations', {})) > 0):
        st.markdown("""
**What happens here?**
- For each error, a natural language explanation is generated
- Explains **why** the error occurred
- Provides educational context
- Suggests how to avoid similar mistakes
- Includes learning tips
""")
        if has_results:
            explanations = results.get('explanations', {})
            if explanations:
                st.success(f"โ Generated {len(explanations)} explanation(s)")
                for step_num, explanation in list(explanations.items())[:2]:
                    with st.container():
                        st.markdown(f"**Step {step_num} Explanation:**")
                        st.info(explanation)
            else:
                st.info("โ No explanations needed - solution is correct!")

    with st.expander("๐ง **STEP 7: ERROR CORRECTION** - Automatic Fixes", expanded=has_results and results.get('correction', {}).get('fixed_count', 0) > 0):
        st.markdown("""
**What happens here?**
- Fixable errors are automatically corrected
- Arithmetic errors: correct values are calculated and replaced
- Operation mismatches: operations are corrected
- Success rate is tracked for each error type
- Errors requiring manual review are flagged
""")
        if has_results:
            correction = results.get('correction', {})
            fixed_count = correction.get('fixed_count', 0)
            if fixed_count > 0:
                st.success(f"โ Fixed {fixed_count} error(s)")
                st.write(f"**Success Rate**: {correction.get('success_rate', 0)*100:.1f}%")
                correction_log = correction.get('correction_log', [])
                if correction_log:
                    for log_entry in correction_log[:2]:
                        st.markdown(f"""
**Step {log_entry['step']} ({log_entry['type']}):**
- Original: `{log_entry['original']}`
- Corrected: `{log_entry['corrected']}`
""")
            else:
                st.info("โ No corrections needed")

    with st.expander("๐ค **STEP 8: OUTPUT** - Final Results", expanded=has_results):
        st.markdown("""
**What happens here?**
- Final verdict is displayed (VALID or ERROR)
- Overall confidence score is shown
- All errors with explanations are presented
- Processing time is reported
- Results are ready for review
""")
        if has_results:
            consensus = results.get('consensus', {})
            final_verdict = consensus.get('final_verdict', 'UNKNOWN')
            overall_conf = consensus.get('overall_confidence', 0) * 100
            processing_time = results.get('processing_time', 0)
            total_errors = len(classified_errors)

            if final_verdict == "ERROR":
                st.error(f"**Final Verdict**: โ {final_verdict}")
            else:
                st.success(f"**Final Verdict**: โ {final_verdict}")

            st.metric("Overall Confidence", f"{overall_conf:.1f}%")
            st.metric("Processing Time", f"{processing_time:.3f}s")
            st.metric("Total Errors Found", total_errors)

            st.success("โ Verification complete! Results displayed above.")
        else:
            st.info("โณ Results will appear here after verification")

    # Static end-to-end pipeline diagram (pure documentation for the user).
    st.markdown("---")
    st.markdown("### ๐ Processing Flow Diagram")
    st.markdown("""
```
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
โ ๐ฅ INPUT โ
โ Problem + Solution Steps โ
โโโโโโโโโโโโโโโโโโโโโโฌโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
 โ
 โผ
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
โ ๐ PARSING โ
โ Extract expressions, identify operations โ
โโโโโโโโโโโโโโโโโโโโโโฌโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
 โ
 โผ
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
โ ๐ PARALLEL EXECUTION โ
โ โโโโโโโโโโโโ โโโโโโโโโโโโ โโโโโโโโโโโโ โ
โ โ Symbolic โ โ LLM โ โ Ensemble โ โ
โ โ (SymPy) โ โ Logical โ โ (Voting)โ โ
โ โ 40% โ โ 35% โ โ 25% โ โ
โ โโโโโโฌโโโโโโ โโโโโโฌโโโโโโ โโโโโโฌโโโโโโ โ
โโโโโโโโโผโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโ
 โ โ โ
 โโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโ
 โ
 โผ
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
โ โ๏ธ CONSENSUS โ
โ Weighted Voting โ Final Verdict & Confidence โ
โโโโโโโโโโโโโโโโโโโโโโฌโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
 โ
 โผ
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
โ ๐ท๏ธ ERROR CLASSIFICATION โ
โ Categorize โ Severity โ Fixability โ
โโโโโโโโโโโโโโโโโโโโโโฌโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
 โ
 โผ
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
โ ๐ฌ EXPLANATION GENERATION โ
โ Natural language explanations for each error โ
โโโโโโโโโโโโโโโโโโโโโโฌโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
 โ
 โผ
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
โ ๐ง ERROR CORRECTION โ
โ Auto-fix fixable errors โ Track success rate โ
โโโโโโโโโโโโโโโโโโโโโโฌโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
 โ
 โผ
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
โ ๐ค OUTPUT โ
โ Final Verdict + Confidence + All Details โ
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
```""")
|
|
|
|
|
def display_logs():
    """Display processing logs with color coding.

    Renders each entry appended by ``add_log`` as a colored Streamlit
    message, dispatching on the status string's leading character.
    """
    # Nothing to show before the first verification run.
    if not st.session_state.steps_log:
        return

    st.subheader("๐ Processing Flow")

    for log_entry in st.session_state.steps_log:
        status = log_entry["status"]
        step = log_entry["step"]
        model = log_entry["model"]
        details = log_entry["details"]

        # Color is chosen by the status icon prefix.
        # NOTE(review): the first two prefixes below render identically due
        # to encoding damage in this file; originally they were presumably
        # distinct success/error emoji, making the elif reachable — confirm
        # and restore the intended characters.
        if status.startswith("โ"):
            st.success(f"**{status}** [{model}] {step}: {details}")
        elif status.startswith("โ"):
            st.error(f"**{status}** [{model}] {step}: {details}")
        elif status.startswith("โ ๏ธ"):
            st.warning(f"**{status}** [{model}] {step}: {details}")
        else:
            # Fallback for informational statuses (e.g. spinner/clock icons).
            st.info(f"**{status}** [{model}] {step}: {details}")
|
|
|
|
|
def display_results():
    """Display verification results.

    Reads the result dict produced by ``run_verification_parallel`` from
    session state and renders: headline verdict metrics, per-model verdicts,
    the consensus breakdown, classified error details, and any corrections.
    """
    # No-op until a verification run has stored results.
    if not st.session_state.results:
        return

    results = st.session_state.results

    st.header("๐ฏ Results")

    # --- Headline metrics: verdict, confidence, timing -------------------
    col1, col2, col3 = st.columns(3)

    consensus = results.get("consensus", {})
    final_verdict = consensus.get("final_verdict", "UNKNOWN")
    overall_confidence = consensus.get("overall_confidence", 0.0)
    processing_time = results.get("processing_time", 0.0)

    with col1:
        if final_verdict == "ERROR":
            st.error(f"**Final Verdict:** โ {final_verdict}")
        else:
            st.success(f"**Final Verdict:** โ {final_verdict}")

    with col2:
        # Confidence is stored as a 0..1 fraction; shown as a percentage.
        st.metric("**Confidence**", f"{overall_confidence * 100:.1f}%")

    with col3:
        st.metric("**Processing Time**", f"{processing_time:.2f}s")

    # --- Per-model verdict cards -----------------------------------------
    st.subheader("๐ค Model Verdicts")
    model_results = results.get("model_results", {})

    cols = st.columns(3)
    # Fixed display order matching the pipeline's three verifier models.
    model_names = ["symbolic", "llm_logical", "ensemble"]
    model_display_names = {
        "symbolic": "๐ข Symbolic",
        "llm_logical": "๐ง LLM Logical",
        "ensemble": "๐ค Ensemble"
    }

    for idx, model_name in enumerate(model_names):
        with cols[idx]:
            if model_name in model_results:
                model_result = model_results[model_name]
                verdict = model_result.get("verdict", "UNKNOWN")
                confidence = model_result.get("confidence", 0.0)
                errors_count = len(model_result.get("errors", []))

                if verdict == "ERROR":
                    st.error(f"**{model_display_names[model_name]}**\n\nVerdict: โ {verdict}\n\nConfidence: {confidence * 100:.1f}%\n\nErrors: {errors_count}")
                else:
                    st.success(f"**{model_display_names[model_name]}**\n\nVerdict: โ {verdict}\n\nConfidence: {confidence * 100:.1f}%\n\nErrors: {errors_count}")

    # --- Consensus breakdown ---------------------------------------------
    st.subheader("โ๏ธ Consensus Mechanism")
    agreement_type = consensus.get("agreement_type", "UNKNOWN")
    error_score = consensus.get("error_score", 0.0)
    individual_verdicts = consensus.get("individual_verdicts", {})
    individual_confidences = consensus.get("individual_confidences", {})

    st.info(f"**Agreement:** {agreement_type}")
    # 0.50 mirrors the decision threshold used by the consensus step.
    st.info(f"**Error Score:** {error_score:.3f} (threshold: 0.50)")

    st.write("**Individual Model Results:**")
    for model_name, verdict in individual_verdicts.items():
        confidence = individual_confidences.get(model_name, 0.0)
        st.write(f"- {model_display_names.get(model_name, model_name)}: {verdict} (confidence: {confidence * 100:.1f}%)")

    # --- Classified error details, one expander per error ----------------
    classified_errors = results.get("classified_errors", [])
    if classified_errors:
        st.subheader("๐ด Error Details")
        for error in classified_errors:
            with st.expander(f"Error in Step {error.get('step_number', 0)}: {error.get('category', 'Unknown')}"):
                st.write(f"**Type:** {error.get('type', 'unknown')}")
                st.write(f"**Found:** {error.get('found', 'N/A')}")
                st.write(f"**Correct:** {error.get('correct', 'N/A')}")
                st.write(f"**Severity:** {error.get('severity', 'UNKNOWN')}")
                st.write(f"**Fixable:** {'Yes' if error.get('fixable', False) else 'No'}")

                # Explanations are keyed by step number; show one if present.
                explanations = results.get("explanations", {})
                step_num = error.get("step_number", 0)
                if step_num in explanations:
                    st.write(f"**Explanation:** {explanations[step_num]}")

    # --- Auto-correction summary (only shown when something was fixed) ---
    correction = results.get("correction", {})
    if correction and correction.get("fixed_count", 0) > 0:
        st.subheader("๐ง Corrections Applied")
        st.success(f"**Fixed:** {correction.get('fixed_count', 0)} / {correction.get('total_fixable', 0)} errors")
        st.write(f"**Success Rate:** {correction.get('success_rate', 0.0) * 100:.1f}%")

        correction_log = correction.get("correction_log", [])
        if correction_log:
            with st.expander("View Correction Log"):
                for log_entry in correction_log:
                    st.write(f"**Step {log_entry['step']}:** {log_entry['type']}")
                    st.write(f"Original: {log_entry['original']}")
                    st.write(f"Corrected: {log_entry['corrected']}")
|
|
|
|
|
|
|
# Page-level configuration; must be the first Streamlit call in the script.
st.set_page_config(page_title="Math Verification System", page_icon="๐ข", layout="wide")

# App header shown above both columns.
st.title("๐ข Mathematical Reasoning Verification System")
st.markdown("3-Model Parallel Verification with Weighted Consensus")
|
|
|
|
|
# Sidebar: model selection plus inline documentation of how the three
# verifier models consume the selected LLMs.
with st.sidebar:
    st.header("โ๏ธ Configuration")

    # Independent toggles for the three simulated LLM backends.
    gpt4_enabled = st.checkbox("GPT-4", value=True)
    llama_enabled = st.checkbox("Llama 2", value=True)
    gemini_enabled = st.checkbox("Gemini", value=True)

    selected_models = []
    if gpt4_enabled:
        selected_models.append("GPT-4")
    if llama_enabled:
        selected_models.append("Llama 2")
    if gemini_enabled:
        selected_models.append("Gemini")

    # Never run with an empty selection: silently fall back to all models.
    if not selected_models:
        selected_models = ["GPT-4", "Llama 2", "Gemini"]

    st.info(f"Selected models: {', '.join(selected_models)}")

    st.markdown("---")
    st.markdown("### ๐ How Models Are Used")
    with st.expander("โน๏ธ Click to see how sidebar models work"):
        st.markdown("""
**Model 1 (Symbolic) ๐ข:**
- Uses SymPy library (not affected by sidebar)
- Verifies arithmetic calculations
- Weight: 40%

**Model 2 (LLM Logical) ๐ง :**
- Uses first selected model from sidebar
- Checks logical consistency
- Weight: 35%

**Model 3 (Ensemble) ๐ค:**
- Uses ALL selected models from sidebar
- Each model votes on solution validity
- Majority voting determines verdict
- Weight: 25%

**Note:** Currently using pattern-based simulation.
For production, integrate real LLM APIs (OpenAI, Anthropic, Google).
""")

    # Static ASCII diagram of how the sidebar selection feeds the models.
    st.markdown("### ๐ Model Flow Diagram")
    st.code("""
Sidebar Selection
โโโโโโโโโโโโโโโโโโโ
โ GPT-4 โ โ
โ Llama 2 โ โโโโ
โ Gemini โ โ โ
โโโโโโโโโโโโโโโโโโโ โ
 โ
โโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโ
โ โ
โผ โผ
Model 2 (LLM Logical) Model 3 (Ensemble)
Uses: GPT-4 (first selected) Uses: All selected
Weight: 35% Weight: 25%
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
 โ
 โผ
 Consensus Mechanism
 (Weighted Voting)
""", language=None)
|
|
|
|
|
# Two-column main layout: inputs on the left, live flowchart on the right.
col_left, col_right = st.columns([1, 1])

with col_left:
    st.header("๐ Input")

    problem = st.text_area(
        "Problem:",
        height=80,
        placeholder="Enter the math problem here...",
        value="Janet has 3 apples. She buys 2 more. She gives 1 away. How many?"
    )

    # One solution step per line. The default's last step is deliberately
    # wrong ("5 - 1 = 6") so the demo has an error for the pipeline to catch.
    steps_input = st.text_area(
        "Solution Steps (one per line):",
        height=120,
        placeholder="Enter solution steps, one per line...",
        value="Janet starts with 3 apples\nShe buys 2 more: 3 + 2 = 5 apples\nShe gives 1 away: 5 - 1 = 6 apples"
    )

    col_btn1, col_btn2 = st.columns(2)

    with col_btn1:
        # Pressing this triggers the verification handler further below.
        verify_button = st.button("๐ Verify Solution", type="primary", use_container_width=True)

    with col_btn2:
        clear_button = st.button("๐ Clear", use_container_width=True)

with col_right:
    st.header("๐ฏ Live Flowchart")
    # Re-renders on every script run, reflecting the current text areas
    # and any results stored in session state.
    display_flowchart(problem=problem, steps_input=steps_input)
|
|
|
|
|
# Verification handler: runs once per click of the "Verify Solution" button.
if verify_button:
    # Reset state from any previous run so old logs/results don't mix in.
    st.session_state.steps_log = []
    st.session_state.results = None

    # One solution step per non-blank line of the text area.
    steps = [s.strip() for s in steps_input.split('\n') if s.strip()]

    if not problem or not steps:
        st.warning("Please enter both problem and solution steps.")
    else:
        add_log("Verification Started", "System", "โณ", "Initializing models...")

        with st.spinner("Running verification..."):
            try:
                # These "started" entries are logged up front; the actual
                # models run inside run_verification_parallel below.
                add_log("Model 1 Started", "Symbolic", "๐ข", "Checking arithmetic...")
                add_log("Model 2 Started", "LLM Logical", "๐ง ", "Checking logical consistency...")
                add_log("Model 3 Started", "Ensemble", "๐ค", "Running ensemble voting...")

                # Model 2 (LLM Logical) uses only the first sidebar selection;
                # selected_models is never empty here, so the fallback is
                # defensive only.
                llm_model_name = selected_models[0] if selected_models else "GPT-4"

                results = run_verification_parallel(
                    problem=problem,
                    steps=steps,
                    model_name=llm_model_name,
                    model_list=selected_models
                )

                # Log each model's outcome for the processing-flow panel.
                for model_name, model_result in results["model_results"].items():
                    verdict = model_result.get("verdict", "UNKNOWN")
                    errors_count = len(model_result.get("errors", []))
                    status = "โ ERROR" if verdict == "ERROR" else "โ VALID"
                    add_log(
                        f"Model {model_name} Completed",
                        model_name.title(),
                        status,
                        f"Found {errors_count} error(s)"
                    )

                consensus_verdict = results["consensus"].get("final_verdict", "UNKNOWN")
                add_log(
                    "Consensus Computed",
                    "Consensus",
                    "โ๏ธ",
                    f"Final verdict: {consensus_verdict}"
                )

                # Persist for display_logs/display_results on this same run
                # and on subsequent re-runs.
                st.session_state.results = results

            except Exception as e:
                # Surface the failure in the UI and in the log; deliberately
                # broad so any backend error is reported instead of crashing
                # the page.
                st.error(f"Error during verification: {str(e)}")
                add_log("Error", "System", "โ", str(e))
|
|
|
# Clear handler: wipe logs/results and force an immediate re-run so the
# page redraws in its empty state.
if clear_button:
    st.session_state.steps_log = []
    st.session_state.results = None
    st.rerun()

# Always render the log panel and results (both no-op when state is empty).
display_logs()
display_results()
|
|
|
|
|