Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python | |
| """ | |
| Gradio Web Interface for Math Validator | |
| """ | |
import json
import os
import queue
import re
import subprocess
import sys
import threading
import time
from datetime import datetime

import gradio as gr
import pandas as pd
from dotenv import load_dotenv
# Load environment variables (e.g. OPENAI_API_KEY, OPENROUTER_API_KEY) from a
# local .env file, if one exists, before any configuration checks run.
load_dotenv()
class ValidatorGUI:
    """State holder and helpers behind the Gradio math-validator web UI."""

    def __init__(self):
        """Set up run state, progress counters, and selectable model lists."""
        # Handle to the validator subprocess; None whenever nothing is running.
        self.process = None
        self.output_queue = queue.Queue()
        self.is_running = False

        # Counts taken from the currently analyzed Excel file.
        self.total_questions = 0
        self.math_questions = 0

        # Live counters, advanced while validator output is parsed.
        self.questions_processed = 0
        self.correct_answers = 0
        self.incorrect_answers = 0
        self.timeouts = 0
        self.errors = 0

        # Models served directly by the OpenAI API.
        self.openai_models = [
            "o3-mini",
            "gpt-4o",
            "gpt-5",
            "gpt-5-mini",
            "gpt-5-nano",
            "gpt-4-turbo",
        ]

        # Models reached through OpenRouter, grouped by provider.
        anthropic = [
            "anthropic/claude-4-opus",
            "anthropic/claude-4-sonnet",
            "anthropic/claude-3.5-sonnet",
            "anthropic/claude-3-5-sonnet-20241022",
            "anthropic/claude-3-opus",
            "anthropic/claude-3-haiku",
        ]
        xai = [
            "x-ai/grok-4",
            "x-ai/grok-2",
            "x-ai/grok-2-1212",
        ]
        deepseek = [
            "deepseek/deepseek-r1",
            "deepseek/deepseek-v3",
            "deepseek/deepseek-chat",
        ]
        google = [
            "google/gemini-2.0-pro",
            "google/gemini-2.0-flash",
            "google/gemini-pro-1.5",
            "google/gemini-flash-1.5",
        ]
        baidu = [
            "baidu/ernie-4.0-turbo-8k",
            "baidu/ernie-bot-4",
        ]
        meta = [
            "meta-llama/llama-3.2-405b",
            "meta-llama/llama-3.1-405b-instruct",
        ]
        mistral = [
            "mistralai/mistral-large",
            "mistralai/mixtral-8x22b-instruct",
        ]
        self.openrouter_models = (
            anthropic + xai + deepseek + google + baidu + meta + mistral
        )

        # Flat list used to populate the model dropdowns.
        self.all_models = self.openai_models + self.openrouter_models
| def get_excel_files(self): | |
| """Get list of Excel files in current directory""" | |
| files = [f for f in os.listdir('.') if f.endswith('.xlsx') and not f.endswith('_validated.xlsx')] | |
| return files | |
| def analyze_file(self, file_path): | |
| """Analyze Excel file and return summary and question count""" | |
| if not file_path: | |
| return "No file selected", 0, 0 | |
| try: | |
| df = pd.read_excel(file_path, sheet_name='Data') | |
| # Store total questions | |
| self.total_questions = len(df) | |
| # Count math questions | |
| if 'raw_subject' in df.columns: | |
| math_filter = df['raw_subject'].str.lower().str.contains( | |
| 'math|statistic|calculus|algebra|geometry|trigonometry', | |
| na=False, regex=True | |
| ) | |
| self.math_questions = math_filter.sum() | |
| else: | |
| self.math_questions = len(df) | |
| # Check for images | |
| image_count = 0 | |
| if 'file_url' in df.columns: | |
| image_count = df['file_url'].notna().sum() | |
| summary = f"""### File Analysis | |
| **File:** {os.path.basename(file_path)} | |
| **Total rows:** {self.total_questions} | |
| **Math questions:** {self.math_questions} | |
| **Questions with images:** {image_count} | |
| **Columns found:** {', '.join(df.columns[:10])}{'...' if len(df.columns) > 10 else ''} | |
| **Estimated processing time:** | |
| - Serial: ~{self.math_questions * 30 // 60} minutes | |
| - Parallel (4 processes): ~{self.math_questions * 30 // (60 * 4)} minutes | |
| """ | |
| return summary, self.total_questions, self.math_questions | |
| except Exception as e: | |
| return f"Error analyzing file: {str(e)}", 0, 0 | |
| def validate_config(self, file_path, solver_model, recon_model, num_processes, batch_size): | |
| """Validate configuration before running""" | |
| errors = [] | |
| if not file_path or not os.path.exists(file_path): | |
| errors.append("Please select a valid Excel file") | |
| if not solver_model: | |
| errors.append("Please select a solver model") | |
| if not recon_model: | |
| errors.append("Please select a reconciliation model") | |
| # Check API keys | |
| needs_openai = solver_model in self.openai_models or recon_model in self.openai_models | |
| needs_openrouter = solver_model in self.openrouter_models or recon_model in self.openrouter_models | |
| if needs_openai and not os.getenv('OPENAI_API_KEY'): | |
| errors.append("OPENAI_API_KEY not found in environment") | |
| if needs_openrouter and not os.getenv('OPENROUTER_API_KEY'): | |
| errors.append("OPENROUTER_API_KEY not found in environment") | |
| return errors | |
| def generate_output_filename(self, file_path, start_q, end_q): | |
| """Generate output filename with timestamp and range""" | |
| base_name = os.path.basename(file_path).replace('.xlsx', '') | |
| timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") | |
| if start_q is not None and end_q is not None and (start_q > 0 or end_q < self.math_questions): | |
| # Add range to filename | |
| range_str = f"_q{start_q+1}_q{end_q}" | |
| else: | |
| range_str = "_full" | |
| return f"{base_name}_validated_{timestamp}{range_str}.xlsx" | |
| def parse_progress_line(self, line): | |
| """Parse output line for progress information""" | |
| # Parse based on the new [TAG] format | |
| line_lower = line.lower() | |
| if "[ok] got answer" in line_lower and "chars" in line_lower: | |
| self.questions_processed += 1 | |
| elif "[fail] failed to get answer" in line_lower: | |
| self.errors += 1 | |
| self.questions_processed += 1 # Still count as processed | |
| elif "[match]" in line_lower: | |
| self.correct_answers += 1 | |
| elif "[mismatch]" in line_lower: | |
| self.incorrect_answers += 1 | |
| elif "[timeout]" in line_lower: | |
| self.timeouts += 1 | |
| elif "[error]" in line_lower: | |
| if "failed after" in line_lower: | |
| self.errors += 1 | |
| elif "[warning]" in line_lower: | |
| # Just a warning, not an error | |
| pass | |
| elif "question" in line_lower and "getting answer from" in line_lower: | |
| # This indicates a question is starting to be processed | |
| pass | |
| # Also parse parallel processing output | |
| elif "starting process for questions" in line_lower: | |
| # Parallel process starting | |
| pass | |
| elif "completed range" in line_lower: | |
| # Parallel process completed a range | |
| import re | |
| # Try to extract question count from "Completed range X-Y" | |
| match = re.search(r'range (\d+)-(\d+)', line_lower) | |
| if match: | |
| start, end = int(match.group(1)), int(match.group(2)) | |
| # This is approximate since we don't know exact results | |
| self.questions_processed = max(self.questions_processed, end) | |
| def get_progress_stats(self): | |
| """Get formatted progress statistics""" | |
| if self.questions_processed == 0: | |
| return "Waiting for processing to start..." | |
| accuracy = (self.correct_answers / self.questions_processed * 100) if self.questions_processed > 0 else 0 | |
| return f"""**Progress Stats:** | |
| - Processed: {self.questions_processed} | |
| - Correct: {self.correct_answers} ({accuracy:.1f}%) | |
| - Incorrect: {self.incorrect_answers} | |
| - Timeouts: {self.timeouts} | |
| - Errors: {self.errors} | |
| """ | |
    def run_validation(self, file_path, solver_model, recon_model, image_mode,
                       num_processes, batch_size, start_q, end_q, compile_latex, progress=gr.Progress()):
        """Run the validator as a subprocess and stream progress to the UI.

        Generator: yields (output_text, output_file_or_None, stats_markdown)
        tuples while the subprocess runs, ending with the final result tuple.
        ``start_q``/``end_q`` are a 0-indexed start and exclusive end into the
        math questions. NOTE(review): the ``gr.Progress()`` default is
        evaluated once at class-definition time — this is Gradio's documented
        injection pattern, but confirm it matches the installed Gradio version.
        """
        # Reset progress counters so a second run starts from zero.
        self.questions_processed = 0
        self.correct_answers = 0
        self.incorrect_answers = 0
        self.timeouts = 0
        self.errors = 0
        # Validate configuration up front; surface all problems at once.
        errors = self.validate_config(file_path, solver_model, recon_model, num_processes, batch_size)
        if errors:
            yield f"### Configuration Errors\n" + "\n".join(f"- {e}" for e in errors), None, ""
            return
        self.is_running = True
        output_log = []
        # Output goes next to the input file, with a timestamped/range-tagged name.
        output_file = self.generate_output_filename(file_path, start_q, end_q)
        output_path = os.path.join(os.path.dirname(file_path), output_file)
        try:
            # Baseline single-process command.
            base_cmd = [
                sys.executable, "universal_validator.py", file_path,
                "--model", solver_model,
                "--reconciliation-model", recon_model,
                "--images", image_mode,
                "--batch-size", str(batch_size),
                "--output", output_path
            ]
            # Add range parameters if specified.
            if start_q is not None and start_q >= 0:
                base_cmd.extend(["--start", str(start_q)])
            if end_q is not None and end_q > 0:
                base_cmd.extend(["--end", str(end_q)])
            # Add LaTeX compilation flag if requested.
            if compile_latex:
                base_cmd.append("--compile-latex")
            # Parallel runner is only worth its startup cost for > 20 questions.
            if num_processes > 1 and (end_q - start_q) > 20:
                cmd = [
                    sys.executable, "run_parallel.py", file_path,
                    "--num-processes", str(num_processes),
                    "--solver", solver_model,
                    "--reconciler", recon_model,
                    "--images", image_mode,
                    "--batch-size", str(batch_size),
                    "--output", output_path,
                    "--start-range", str(start_q),
                    "--end-range", str(end_q)
                ]
                if compile_latex:
                    cmd.append("--compile-latex")
                print(f"[GUI] Using parallel processing with {num_processes} processes")
            else:
                # Small ranges run serially even if more processes were requested.
                cmd = base_cmd
                if num_processes > 1 and (end_q - start_q) <= 20:
                    print(f"[GUI] Range too small for parallel processing, using single process")
            # Start process and record the command line in the visible log.
            progress(0, desc="Starting validation...")
            output_log.append(f"Running: {' '.join(cmd)}\n")
            output_log.append(f"Output file: {output_path}\n")
            output_log.append(f"Question range: {start_q+1} to {end_q}\n\n")
            print(f"[GUI] Starting subprocess: {' '.join(cmd)}")
            try:
                # stderr is merged into stdout so one reader loop sees everything;
                # line-buffered text mode with replacement for undecodable bytes.
                self.process = subprocess.Popen(
                    cmd,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.STDOUT,
                    text=True,
                    bufsize=1,
                    universal_newlines=True,
                    encoding='utf-8',
                    errors='replace'
                )
                print(f"[GUI] Process started with PID: {self.process.pid}")
            except Exception as e:
                error_msg = f"Failed to start validator: {str(e)}"
                print(f"[GUI Error] {error_msg}")
                yield error_msg, None, ""
                return
            # Read subprocess output line by line until it exits.
            lines_processed = 0
            last_update_time = time.time()
            while True:
                line = self.process.stdout.readline()
                if not line:
                    # Empty read: either the process exited, or output is just
                    # slow — poll to distinguish, then back off briefly.
                    if self.process.poll() is not None:
                        break
                    time.sleep(0.1)
                    continue
                output_log.append(line)
                self.parse_progress_line(line)
                # Debug: echo every subprocess line to the server console.
                print(f"[GUI Debug] {line.strip()}")
                # Advance the Gradio progress bar on question/batch lines.
                if "processing batch" in line.lower() or "question" in line.lower():
                    lines_processed += 1
                    if self.math_questions > 0 and self.questions_processed > 0:
                        actual_progress = min(self.questions_processed / (end_q - start_q), 1.0)
                        progress(actual_progress, desc=f"Processing question {self.questions_processed}/{end_q - start_q}")
                # Yield intermediate results (stats + last 50 log lines) every
                # 5 counted lines or every 2 seconds, whichever comes first.
                current_time = time.time()
                if lines_processed % 5 == 0 or (current_time - last_update_time) > 2:
                    stats = self.get_progress_stats()
                    output_text = stats + "\n\n" + "="*60 + "\n" + "".join(output_log[-50:])
                    yield output_text, None, stats
                    last_update_time = current_time
            self.process.wait()
            # Assemble final results: stats plus the tail of the full log.
            final_stats = self.get_progress_stats()
            output_text = f"### Validation Complete\n\n{final_stats}\n\n" + "="*60 + "\n\nFull Log:\n" + "".join(output_log[-200:])
            # Offer the output file for download if the validator produced it.
            if os.path.exists(output_path):
                yield output_text, output_path, final_stats
            else:
                # Fall back to the validator's legacy naming convention.
                fallback_path = file_path.replace('.xlsx', '_validated.xlsx')
                if os.path.exists(fallback_path):
                    yield output_text, fallback_path, final_stats
                else:
                    yield output_text, None, final_stats
        except Exception as e:
            stats = self.get_progress_stats()
            yield f"Error: {str(e)}\n\n{stats}\n\n{''.join(output_log)}", None, stats
        finally:
            # Always clear run state, even on error, so Stop/Start stay usable.
            self.is_running = False
            self.process = None
| def stop_validation(self): | |
| """Stop the running validation""" | |
| if self.process: | |
| self.process.terminate() | |
| time.sleep(1) | |
| if self.process.poll() is None: | |
| self.process.kill() | |
| return "Validation stopped" | |
| return "No validation running" | |
    def create_interface(self):
        """Build and return the Gradio Blocks UI (three tabs: Validation,
        Configuration, Results Analysis).

        All event handlers are defined as closures so they can capture the
        component references and ``self`` without module-level globals.
        """
        with gr.Blocks(title="Math Validator", theme=gr.themes.Soft()) as interface:
            gr.Markdown("# Math Question Validator")
            gr.Markdown("Web interface for validating mathematical questions and answers")
            with gr.Tab("Validation"):
                with gr.Row():
                    with gr.Column(scale=1):
                        # File selection — choices are .xlsx files in the CWD.
                        file_dropdown = gr.Dropdown(
                            choices=self.get_excel_files(),
                            label="Select Excel File",
                            value=self.get_excel_files()[0] if self.get_excel_files() else None
                        )
                        refresh_btn = gr.Button("🔄 Refresh Files", size="sm")
                        file_info = gr.Markdown("Select a file to see analysis")
                        # Question range selection (end value updated on file change).
                        gr.Markdown("### Question Range")
                        with gr.Row():
                            start_question = gr.Number(
                                label="Start Question",
                                value=1,
                                minimum=1,
                                step=1,
                                info="First question to process"
                            )
                            end_question = gr.Number(
                                label="End Question",
                                value=100,
                                minimum=1,
                                step=1,
                                info="Last question to process"
                            )
                        use_all_questions = gr.Checkbox(
                            label="Process all questions",
                            value=True,
                            info="Uncheck to specify custom range"
                        )
                    with gr.Column(scale=2):
                        with gr.Row():
                            # Model selection; "(recommended)" suffix is stripped
                            # by clean_model_name before running.
                            solver_dropdown = gr.Dropdown(
                                choices=["o3-mini (recommended)"] + self.all_models,
                                value="o3-mini (recommended)",
                                label="Solver Model",
                                info="Model for answering questions"
                            )
                            recon_dropdown = gr.Dropdown(
                                choices=["gpt-4o (recommended)"] + self.all_models,
                                value="gpt-4o (recommended)",
                                label="Reconciliation Model",
                                info="Model for comparing answers"
                            )
                        with gr.Row():
                            image_mode = gr.Radio(
                                choices=["when_needed", "always", "never"],
                                value="when_needed",
                                label="Image Handling",
                                info="When to include images with questions"
                            )
                            parallel_slider = gr.Slider(
                                minimum=1,
                                maximum=8,
                                value=1,
                                step=1,
                                label="Parallel Processes",
                                info="Number of concurrent processes (1 = serial)"
                            )
                            batch_slider = gr.Slider(
                                minimum=1,
                                maximum=20,
                                value=5,
                                step=1,
                                label="Batch Size",
                                info="Questions per batch"
                            )
                        # LaTeX compilation option (passed through as a CLI flag).
                        compile_latex = gr.Checkbox(
                            label="Compile LaTeX reconciliation documents to PDF",
                            value=False,
                            info="Requires pdflatex installed (slower but produces PDFs)"
                        )
                with gr.Row():
                    run_btn = gr.Button("▶️ Start Validation", variant="primary", size="lg")
                    stop_btn = gr.Button("⏹️ Stop", variant="stop", size="lg")
                # Output section: live stats, scrolling log, and download link.
                progress_stats = gr.Markdown("**Progress:** Waiting to start...")
                output_text = gr.Textbox(
                    label="Validation Output",
                    lines=20,
                    max_lines=30,
                    value="Click 'Start Validation' to begin..."
                )
                output_file = gr.File(
                    label="Download Results",
                    visible=False
                )

                # --- Event handlers -------------------------------------------

                def update_file_info(file_path):
                    # Re-analyze the selected file; second return value resets
                    # the End Question field to the file's math-question count.
                    if file_path:
                        full_path = os.path.join(os.getcwd(), file_path)
                        summary, total, math_q = self.analyze_file(full_path)
                        return summary, math_q
                    return "No file selected", 100

                def refresh_files():
                    # Rescan the working directory for input files.
                    files = self.get_excel_files()
                    return gr.update(choices=files, value=files[0] if files else None)

                def clean_model_name(model):
                    # Strip the " (recommended)" suffix from dropdown defaults.
                    if "(recommended)" in model:
                        return model.split(" (")[0]
                    return model

                def toggle_range_inputs(use_all):
                    # Range fields are only editable when "all questions" is off.
                    return gr.update(interactive=not use_all), gr.update(interactive=not use_all)

                def run_with_clean_models(file_path, solver, recon, images, parallel, batch,
                                          use_all, start_q, end_q, compile_tex):
                    # Bridge between UI values and run_validation: cleans model
                    # names, converts the 1-indexed range to 0-indexed/exclusive,
                    # and re-yields results while toggling download visibility.
                    solver_clean = clean_model_name(solver)
                    recon_clean = clean_model_name(recon)
                    if file_path:
                        full_path = os.path.join(os.getcwd(), file_path)
                        if use_all:
                            actual_start = 0
                            actual_end = self.math_questions
                        else:
                            actual_start = max(0, int(start_q) - 1)  # 1-indexed UI -> 0-indexed
                            actual_end = min(self.math_questions, int(end_q))
                        # Stream progress updates from the generator to the UI.
                        for result in self.run_validation(
                            full_path, solver_clean, recon_clean, images, parallel, batch,
                            actual_start, actual_end, compile_tex
                        ):
                            if len(result) == 3:
                                result_text, result_file, stats = result
                                if result_file:
                                    yield result_text, gr.update(value=result_file, visible=True), stats
                                else:
                                    yield result_text, gr.update(visible=False), stats
                            else:
                                # Defensive: tolerate shorter tuples from run_validation.
                                yield result[0], gr.update(visible=False), result[1] if len(result) > 1 else ""
                    else:
                        yield "No file selected", gr.update(visible=False), ""

                # --- Event wiring ---------------------------------------------
                file_dropdown.change(update_file_info, inputs=[file_dropdown],
                                     outputs=[file_info, end_question])
                refresh_btn.click(refresh_files, outputs=[file_dropdown])
                # Toggle range inputs when the "all questions" checkbox changes.
                use_all_questions.change(toggle_range_inputs, inputs=[use_all_questions],
                                         outputs=[start_question, end_question])
                run_btn.click(
                    run_with_clean_models,
                    inputs=[file_dropdown, solver_dropdown, recon_dropdown,
                            image_mode, parallel_slider, batch_slider,
                            use_all_questions, start_question, end_question, compile_latex],
                    outputs=[output_text, output_file, progress_stats]
                )
                stop_btn.click(self.stop_validation, outputs=[output_text])
            with gr.Tab("Configuration"):
                gr.Markdown("""
                ### API Configuration
                Make sure you have the required API keys set as environment variables:
                - **OPENAI_API_KEY**: Required for OpenAI models (o3-mini, GPT-5, GPT-4o)
                - **OPENROUTER_API_KEY**: Required for Claude, Grok, Gemini, and other models
                ### Model Recommendations
                **For best results:**
                - Solver: o3-mini (best accuracy)
                - Reconciliation: gpt-4o (fast and reliable)
                **For speed:**
                - Use 4-6 parallel processes
                - Batch size of 5-10
                **For GPT-5 testing:**
                - Use gpt-5-mini (faster than gpt-5)
                - Use gpt-4o for reconciliation (GPT-5 has timeout issues)
                """)
                # Report which API keys are present at interface-build time.
                config_status = []
                if os.getenv('OPENAI_API_KEY'):
                    config_status.append("✅ OPENAI_API_KEY is set")
                else:
                    config_status.append("❌ OPENAI_API_KEY is not set")
                if os.getenv('OPENROUTER_API_KEY'):
                    config_status.append("✅ OPENROUTER_API_KEY is set")
                else:
                    config_status.append("❌ OPENROUTER_API_KEY is not set")
                gr.Markdown("### Current Status\n" + "\n".join(config_status))
            with gr.Tab("Results Analysis"):
                gr.Markdown("""
                ### How to Analyze Results
                After validation completes:
                1. **Download the validated Excel file** - Contains all results
                2. **Check the latex_documents folder** - Contains reconciliation documents
                3. **Run analysis scripts:**
                   - `python analyze_reconciliations.py` - Analyze which answers were vindicated
                   - `python summarize_results.py` - Get overall statistics
                ### Understanding Results
                - **answer_match = Yes**: Model answer matches reference
                - **answer_match = No**: Mismatch (see LaTeX reconciliation)
                - **latex_file**: Path to detailed reconciliation document
                - **model_answer_file**: Path to model's complete response
                """)
        return interface
def main():
    """Launch the validator GUI on a local-only Gradio server."""
    app = ValidatorGUI().create_interface()
    app.launch(
        share=False,
        server_name="127.0.0.1",  # bind loopback only; not exposed externally
        server_port=7860,
        inbrowser=True,
    )


if __name__ == "__main__":
    main()