Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python | |
| """ | |
| Gradio Web Interface for Math Validator | |
| """ | |
import json
import os
import queue
import re
import subprocess
import sys
import threading
import time
from datetime import datetime

import gradio as gr
import pandas as pd
from dotenv import load_dotenv
# Load environment variables (e.g. OPENAI_API_KEY, OPENROUTER_API_KEY) from a
# local .env file, if one exists, before any configuration checks run.
load_dotenv()
class ValidatorGUI:
    """State holder and helpers behind the Gradio math-validator web UI."""

    def __init__(self):
        """Set up run state, progress counters, and selectable model lists."""
        # Handle to the validator subprocess; None whenever nothing is running.
        self.process = None
        self.output_queue = queue.Queue()
        self.is_running = False

        # Counts taken from the currently analyzed Excel file.
        self.total_questions = 0
        self.math_questions = 0

        # Live counters, advanced while validator output is parsed.
        self.questions_processed = 0
        self.correct_answers = 0
        self.incorrect_answers = 0
        self.timeouts = 0
        self.errors = 0

        # Models served directly by the OpenAI API.
        self.openai_models = [
            "o3-mini",
            "gpt-4o",
            "gpt-5",
            "gpt-5-mini",
            "gpt-5-nano",
            "gpt-4-turbo",
        ]

        # Models reached through OpenRouter, grouped by provider.
        anthropic = [
            "anthropic/claude-4-opus",
            "anthropic/claude-4-sonnet",
            "anthropic/claude-3.5-sonnet",
            "anthropic/claude-3-5-sonnet-20241022",
            "anthropic/claude-3-opus",
            "anthropic/claude-3-haiku",
        ]
        xai = [
            "x-ai/grok-4",
            "x-ai/grok-2",
            "x-ai/grok-2-1212",
        ]
        deepseek = [
            "deepseek/deepseek-r1",
            "deepseek/deepseek-v3",
            "deepseek/deepseek-chat",
        ]
        google = [
            "google/gemini-2.0-pro",
            "google/gemini-2.0-flash",
            "google/gemini-pro-1.5",
            "google/gemini-flash-1.5",
        ]
        baidu = [
            "baidu/ernie-4.0-turbo-8k",
            "baidu/ernie-bot-4",
        ]
        meta = [
            "meta-llama/llama-3.2-405b",
            "meta-llama/llama-3.1-405b-instruct",
        ]
        mistral = [
            "mistralai/mistral-large",
            "mistralai/mixtral-8x22b-instruct",
        ]
        self.openrouter_models = (
            anthropic + xai + deepseek + google + baidu + meta + mistral
        )

        # Flat list used to populate the model dropdowns.
        self.all_models = self.openai_models + self.openrouter_models
| def get_excel_files(self): | |
| """Get list of Excel files in current directory""" | |
| files = [f for f in os.listdir('.') if f.endswith('.xlsx') and not f.endswith('_validated.xlsx')] | |
| return files | |
| def analyze_file(self, file_path): | |
| """Analyze Excel file and return summary and question count""" | |
| if not file_path: | |
| return "No file selected", 0, 0 | |
| try: | |
| df = pd.read_excel(file_path, sheet_name='Data') | |
| # Store total questions | |
| self.total_questions = len(df) | |
| # Count math questions | |
| if 'raw_subject' in df.columns: | |
| math_filter = df['raw_subject'].str.lower().str.contains( | |
| 'math|statistic|calculus|algebra|geometry|trigonometry', | |
| na=False, regex=True | |
| ) | |
| self.math_questions = math_filter.sum() | |
| else: | |
| self.math_questions = len(df) | |
| # Check for images | |
| image_count = 0 | |
| if 'file_url' in df.columns: | |
| image_count = df['file_url'].notna().sum() | |
| summary = f"""### File Analysis | |
| **File:** {os.path.basename(file_path)} | |
| **Total rows:** {self.total_questions} | |
| **Math questions:** {self.math_questions} | |
| **Questions with images:** {image_count} | |
| **Columns found:** {', '.join(df.columns[:10])}{'...' if len(df.columns) > 10 else ''} | |
| **Estimated processing time:** | |
| - Serial: ~{self.math_questions * 30 // 60} minutes | |
| - Parallel (4 processes): ~{self.math_questions * 30 // (60 * 4)} minutes | |
| """ | |
| return summary, self.total_questions, self.math_questions | |
| except Exception as e: | |
| return f"Error analyzing file: {str(e)}", 0, 0 | |
| def validate_config(self, file_path, solver_model, recon_model, num_processes, batch_size): | |
| """Validate configuration before running""" | |
| errors = [] | |
| if not file_path or not os.path.exists(file_path): | |
| errors.append("Please select a valid Excel file") | |
| if not solver_model: | |
| errors.append("Please select a solver model") | |
| if not recon_model: | |
| errors.append("Please select a reconciliation model") | |
| # Check API keys | |
| needs_openai = solver_model in self.openai_models or recon_model in self.openai_models | |
| needs_openrouter = solver_model in self.openrouter_models or recon_model in self.openrouter_models | |
| if needs_openai and not os.getenv('OPENAI_API_KEY'): | |
| errors.append("OPENAI_API_KEY not found in environment") | |
| if needs_openrouter and not os.getenv('OPENROUTER_API_KEY'): | |
| errors.append("OPENROUTER_API_KEY not found in environment") | |
| return errors | |
| def generate_output_filename(self, file_path, start_q, end_q): | |
| """Generate output filename with timestamp and range""" | |
| base_name = os.path.basename(file_path).replace('.xlsx', '') | |
| timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") | |
| if start_q is not None and end_q is not None and (start_q > 0 or end_q < self.math_questions): | |
| # Add range to filename | |
| range_str = f"_q{start_q+1}_q{end_q}" | |
| else: | |
| range_str = "_full" | |
| return f"{base_name}_validated_{timestamp}{range_str}.xlsx" | |
| def parse_progress_line(self, line): | |
| """Parse output line for progress information""" | |
| # Parse based on the new [TAG] format | |
| line_lower = line.lower() | |
| if "[ok] got answer" in line_lower and "chars" in line_lower: | |
| self.questions_processed += 1 | |
| elif "[fail] failed to get answer" in line_lower: | |
| self.errors += 1 | |
| self.questions_processed += 1 # Still count as processed | |
| elif "[match]" in line_lower: | |
| self.correct_answers += 1 | |
| elif "[mismatch]" in line_lower: | |
| self.incorrect_answers += 1 | |
| elif "[timeout]" in line_lower: | |
| self.timeouts += 1 | |
| elif "[error]" in line_lower: | |
| if "failed after" in line_lower: | |
| self.errors += 1 | |
| elif "[warning]" in line_lower: | |
| # Just a warning, not an error | |
| pass | |
| elif "question" in line_lower and "getting answer from" in line_lower: | |
| # This indicates a question is starting to be processed | |
| pass | |
| # Also parse parallel processing output | |
| elif "starting process for questions" in line_lower: | |
| # Parallel process starting | |
| pass | |
| elif "completed range" in line_lower: | |
| # Parallel process completed a range | |
| import re | |
| # Try to extract question count from "Completed range X-Y" | |
| match = re.search(r'range (\d+)-(\d+)', line_lower) | |
| if match: | |
| start, end = int(match.group(1)), int(match.group(2)) | |
| # This is approximate since we don't know exact results | |
| self.questions_processed = max(self.questions_processed, end) | |
| def get_progress_stats(self): | |
| """Get formatted progress statistics""" | |
| if self.questions_processed == 0: | |
| return "Waiting for processing to start..." | |
| accuracy = (self.correct_answers / self.questions_processed * 100) if self.questions_processed > 0 else 0 | |
| return f"""**Progress Stats:** | |
| - Processed: {self.questions_processed} | |
| - Correct: {self.correct_answers} ({accuracy:.1f}%) | |
| - Incorrect: {self.incorrect_answers} | |
| - Timeouts: {self.timeouts} | |
| - Errors: {self.errors} | |
| """ | |
    def run_validation(self, file_path, solver_model, recon_model, image_mode,
                       num_processes, batch_size, start_q, end_q, compile_latex, progress=gr.Progress()):
        """Run the validator as a subprocess and stream progress to the UI.

        Generator: yields (output_text, output_file_or_None, stats_markdown)
        tuples while the subprocess runs, ending with the final result tuple.
        ``start_q``/``end_q`` are a 0-indexed start and exclusive end into the
        math questions. NOTE(review): the ``gr.Progress()`` default is
        evaluated once at class-definition time — this is Gradio's documented
        injection pattern, but confirm it matches the installed Gradio version.
        """
        # Reset progress counters so a second run starts from zero.
        self.questions_processed = 0
        self.correct_answers = 0
        self.incorrect_answers = 0
        self.timeouts = 0
        self.errors = 0
        # Validate configuration up front; surface all problems at once.
        errors = self.validate_config(file_path, solver_model, recon_model, num_processes, batch_size)
        if errors:
            yield f"### Configuration Errors\n" + "\n".join(f"- {e}" for e in errors), None, ""
            return
        self.is_running = True
        output_log = []
        # Output goes next to the input file, with a timestamped/range-tagged name.
        output_file = self.generate_output_filename(file_path, start_q, end_q)
        output_path = os.path.join(os.path.dirname(file_path), output_file)
        try:
            # Baseline single-process command.
            base_cmd = [
                sys.executable, "universal_validator.py", file_path,
                "--model", solver_model,
                "--reconciliation-model", recon_model,
                "--images", image_mode,
                "--batch-size", str(batch_size),
                "--output", output_path
            ]
            # Add range parameters if specified.
            if start_q is not None and start_q >= 0:
                base_cmd.extend(["--start", str(start_q)])
            if end_q is not None and end_q > 0:
                base_cmd.extend(["--end", str(end_q)])
            # Add LaTeX compilation flag if requested.
            if compile_latex:
                base_cmd.append("--compile-latex")
            # Parallel runner is only worth its startup cost for > 20 questions.
            if num_processes > 1 and (end_q - start_q) > 20:
                cmd = [
                    sys.executable, "run_parallel.py", file_path,
                    "--num-processes", str(num_processes),
                    "--solver", solver_model,
                    "--reconciler", recon_model,
                    "--images", image_mode,
                    "--batch-size", str(batch_size),
                    "--output", output_path,
                    "--start-range", str(start_q),
                    "--end-range", str(end_q)
                ]
                if compile_latex:
                    cmd.append("--compile-latex")
                print(f"[GUI] Using parallel processing with {num_processes} processes")
            else:
                # Small ranges run serially even if more processes were requested.
                cmd = base_cmd
                if num_processes > 1 and (end_q - start_q) <= 20:
                    print(f"[GUI] Range too small for parallel processing, using single process")
            # Start process and record the command line in the visible log.
            progress(0, desc="Starting validation...")
            output_log.append(f"Running: {' '.join(cmd)}\n")
            output_log.append(f"Output file: {output_path}\n")
            output_log.append(f"Question range: {start_q+1} to {end_q}\n\n")
            print(f"[GUI] Starting subprocess: {' '.join(cmd)}")
            try:
                # stderr is merged into stdout so one reader loop sees everything;
                # line-buffered text mode with replacement for undecodable bytes.
                self.process = subprocess.Popen(
                    cmd,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.STDOUT,
                    text=True,
                    bufsize=1,
                    universal_newlines=True,
                    encoding='utf-8',
                    errors='replace'
                )
                print(f"[GUI] Process started with PID: {self.process.pid}")
            except Exception as e:
                error_msg = f"Failed to start validator: {str(e)}"
                print(f"[GUI Error] {error_msg}")
                yield error_msg, None, ""
                return
            # Read subprocess output line by line until it exits.
            lines_processed = 0
            last_update_time = time.time()
            while True:
                line = self.process.stdout.readline()
                if not line:
                    # Empty read: either the process exited, or output is just
                    # slow — poll to distinguish, then back off briefly.
                    if self.process.poll() is not None:
                        break
                    time.sleep(0.1)
                    continue
                output_log.append(line)
                self.parse_progress_line(line)
                # Debug: echo every subprocess line to the server console.
                print(f"[GUI Debug] {line.strip()}")
                # Advance the Gradio progress bar on question/batch lines.
                if "processing batch" in line.lower() or "question" in line.lower():
                    lines_processed += 1
                    if self.math_questions > 0 and self.questions_processed > 0:
                        actual_progress = min(self.questions_processed / (end_q - start_q), 1.0)
                        progress(actual_progress, desc=f"Processing question {self.questions_processed}/{end_q - start_q}")
                # Yield intermediate results (stats + last 50 log lines) every
                # 5 counted lines or every 2 seconds, whichever comes first.
                current_time = time.time()
                if lines_processed % 5 == 0 or (current_time - last_update_time) > 2:
                    stats = self.get_progress_stats()
                    output_text = stats + "\n\n" + "="*60 + "\n" + "".join(output_log[-50:])
                    yield output_text, None, stats
                    last_update_time = current_time
            self.process.wait()
            # Assemble final results: stats plus the tail of the full log.
            final_stats = self.get_progress_stats()
            output_text = f"### Validation Complete\n\n{final_stats}\n\n" + "="*60 + "\n\nFull Log:\n" + "".join(output_log[-200:])
            # Offer the output file for download if the validator produced it.
            if os.path.exists(output_path):
                yield output_text, output_path, final_stats
            else:
                # Fall back to the validator's legacy naming convention.
                fallback_path = file_path.replace('.xlsx', '_validated.xlsx')
                if os.path.exists(fallback_path):
                    yield output_text, fallback_path, final_stats
                else:
                    yield output_text, None, final_stats
        except Exception as e:
            stats = self.get_progress_stats()
            yield f"Error: {str(e)}\n\n{stats}\n\n{''.join(output_log)}", None, stats
        finally:
            # Always clear run state, even on error, so Stop/Start stay usable.
            self.is_running = False
            self.process = None
| def stop_validation(self): | |
| """Stop the running validation""" | |
| if self.process: | |
| self.process.terminate() | |
| time.sleep(1) | |
| if self.process.poll() is None: | |
| self.process.kill() | |
| return "Validation stopped" | |
| return "No validation running" | |
    def create_interface(self):
        """Build and return the Gradio Blocks UI (three tabs: Validation,
        Configuration, Results Analysis).

        All event handlers are defined as closures so they can capture the
        component references and ``self`` without module-level globals.
        """
        with gr.Blocks(title="Math Validator", theme=gr.themes.Soft()) as interface:
            gr.Markdown("# Math Question Validator")
            gr.Markdown("Web interface for validating mathematical questions and answers")
            with gr.Tab("Validation"):
                with gr.Row():
                    with gr.Column(scale=1):
                        # File selection — choices are .xlsx files in the CWD.
                        file_dropdown = gr.Dropdown(
                            choices=self.get_excel_files(),
                            label="Select Excel File",
                            value=self.get_excel_files()[0] if self.get_excel_files() else None
                        )
                        refresh_btn = gr.Button("🔄 Refresh Files", size="sm")
                        file_info = gr.Markdown("Select a file to see analysis")
                        # Question range selection (end value updated on file change).
                        gr.Markdown("### Question Range")
                        with gr.Row():
                            start_question = gr.Number(
                                label="Start Question",
                                value=1,
                                minimum=1,
                                step=1,
                                info="First question to process"
                            )
                            end_question = gr.Number(
                                label="End Question",
                                value=100,
                                minimum=1,
                                step=1,
                                info="Last question to process"
                            )
                        use_all_questions = gr.Checkbox(
                            label="Process all questions",
                            value=True,
                            info="Uncheck to specify custom range"
                        )
                    with gr.Column(scale=2):
                        with gr.Row():
                            # Model selection; "(recommended)" suffix is stripped
                            # by clean_model_name before running.
                            solver_dropdown = gr.Dropdown(
                                choices=["o3-mini (recommended)"] + self.all_models,
                                value="o3-mini (recommended)",
                                label="Solver Model",
                                info="Model for answering questions"
                            )
                            recon_dropdown = gr.Dropdown(
                                choices=["gpt-4o (recommended)"] + self.all_models,
                                value="gpt-4o (recommended)",
                                label="Reconciliation Model",
                                info="Model for comparing answers"
                            )
                        with gr.Row():
                            image_mode = gr.Radio(
                                choices=["when_needed", "always", "never"],
                                value="when_needed",
                                label="Image Handling",
                                info="When to include images with questions"
                            )
                            parallel_slider = gr.Slider(
                                minimum=1,
                                maximum=8,
                                value=1,
                                step=1,
                                label="Parallel Processes",
                                info="Number of concurrent processes (1 = serial)"
                            )
                            batch_slider = gr.Slider(
                                minimum=1,
                                maximum=20,
                                value=5,
                                step=1,
                                label="Batch Size",
                                info="Questions per batch"
                            )
                        # LaTeX compilation option (passed through as a CLI flag).
                        compile_latex = gr.Checkbox(
                            label="Compile LaTeX reconciliation documents to PDF",
                            value=False,
                            info="Requires pdflatex installed (slower but produces PDFs)"
                        )
                with gr.Row():
                    run_btn = gr.Button("▶️ Start Validation", variant="primary", size="lg")
                    stop_btn = gr.Button("⏹️ Stop", variant="stop", size="lg")
                # Output section: live stats, scrolling log, and download link.
                progress_stats = gr.Markdown("**Progress:** Waiting to start...")
                output_text = gr.Textbox(
                    label="Validation Output",
                    lines=20,
                    max_lines=30,
                    value="Click 'Start Validation' to begin..."
                )
                output_file = gr.File(
                    label="Download Results",
                    visible=False
                )

                # --- Event handlers -------------------------------------------

                def update_file_info(file_path):
                    # Re-analyze the selected file; second return value resets
                    # the End Question field to the file's math-question count.
                    if file_path:
                        full_path = os.path.join(os.getcwd(), file_path)
                        summary, total, math_q = self.analyze_file(full_path)
                        return summary, math_q
                    return "No file selected", 100

                def refresh_files():
                    # Rescan the working directory for input files.
                    files = self.get_excel_files()
                    return gr.update(choices=files, value=files[0] if files else None)

                def clean_model_name(model):
                    # Strip the " (recommended)" suffix from dropdown defaults.
                    if "(recommended)" in model:
                        return model.split(" (")[0]
                    return model

                def toggle_range_inputs(use_all):
                    # Range fields are only editable when "all questions" is off.
                    return gr.update(interactive=not use_all), gr.update(interactive=not use_all)

                def run_with_clean_models(file_path, solver, recon, images, parallel, batch,
                                          use_all, start_q, end_q, compile_tex):
                    # Bridge between UI values and run_validation: cleans model
                    # names, converts the 1-indexed range to 0-indexed/exclusive,
                    # and re-yields results while toggling download visibility.
                    solver_clean = clean_model_name(solver)
                    recon_clean = clean_model_name(recon)
                    if file_path:
                        full_path = os.path.join(os.getcwd(), file_path)
                        if use_all:
                            actual_start = 0
                            actual_end = self.math_questions
                        else:
                            actual_start = max(0, int(start_q) - 1)  # 1-indexed UI -> 0-indexed
                            actual_end = min(self.math_questions, int(end_q))
                        # Stream progress updates from the generator to the UI.
                        for result in self.run_validation(
                            full_path, solver_clean, recon_clean, images, parallel, batch,
                            actual_start, actual_end, compile_tex
                        ):
                            if len(result) == 3:
                                result_text, result_file, stats = result
                                if result_file:
                                    yield result_text, gr.update(value=result_file, visible=True), stats
                                else:
                                    yield result_text, gr.update(visible=False), stats
                            else:
                                # Defensive: tolerate shorter tuples from run_validation.
                                yield result[0], gr.update(visible=False), result[1] if len(result) > 1 else ""
                    else:
                        yield "No file selected", gr.update(visible=False), ""

                # --- Event wiring ---------------------------------------------
                file_dropdown.change(update_file_info, inputs=[file_dropdown],
                                     outputs=[file_info, end_question])
                refresh_btn.click(refresh_files, outputs=[file_dropdown])
                # Toggle range inputs when the "all questions" checkbox changes.
                use_all_questions.change(toggle_range_inputs, inputs=[use_all_questions],
                                         outputs=[start_question, end_question])
                run_btn.click(
                    run_with_clean_models,
                    inputs=[file_dropdown, solver_dropdown, recon_dropdown,
                            image_mode, parallel_slider, batch_slider,
                            use_all_questions, start_question, end_question, compile_latex],
                    outputs=[output_text, output_file, progress_stats]
                )
                stop_btn.click(self.stop_validation, outputs=[output_text])
            with gr.Tab("Configuration"):
                gr.Markdown("""
                ### API Configuration
                Make sure you have the required API keys set as environment variables:
                - **OPENAI_API_KEY**: Required for OpenAI models (o3-mini, GPT-5, GPT-4o)
                - **OPENROUTER_API_KEY**: Required for Claude, Grok, Gemini, and other models
                ### Model Recommendations
                **For best results:**
                - Solver: o3-mini (best accuracy)
                - Reconciliation: gpt-4o (fast and reliable)
                **For speed:**
                - Use 4-6 parallel processes
                - Batch size of 5-10
                **For GPT-5 testing:**
                - Use gpt-5-mini (faster than gpt-5)
                - Use gpt-4o for reconciliation (GPT-5 has timeout issues)
                """)
                # Report which API keys are present at interface-build time.
                config_status = []
                if os.getenv('OPENAI_API_KEY'):
                    config_status.append("✅ OPENAI_API_KEY is set")
                else:
                    config_status.append("❌ OPENAI_API_KEY is not set")
                if os.getenv('OPENROUTER_API_KEY'):
                    config_status.append("✅ OPENROUTER_API_KEY is set")
                else:
                    config_status.append("❌ OPENROUTER_API_KEY is not set")
                gr.Markdown("### Current Status\n" + "\n".join(config_status))
            with gr.Tab("Results Analysis"):
                gr.Markdown("""
                ### How to Analyze Results
                After validation completes:
                1. **Download the validated Excel file** - Contains all results
                2. **Check the latex_documents folder** - Contains reconciliation documents
                3. **Run analysis scripts:**
                   - `python analyze_reconciliations.py` - Analyze which answers were vindicated
                   - `python summarize_results.py` - Get overall statistics
                ### Understanding Results
                - **answer_match = Yes**: Model answer matches reference
                - **answer_match = No**: Mismatch (see LaTeX reconciliation)
                - **latex_file**: Path to detailed reconciliation document
                - **model_answer_file**: Path to model's complete response
                """)
        return interface
def main():
    """Launch the validator GUI on a local-only Gradio server."""
    app = ValidatorGUI().create_interface()
    app.launch(
        share=False,
        server_name="127.0.0.1",  # bind loopback only; not exposed externally
        server_port=7860,
        inbrowser=True,
    )


if __name__ == "__main__":
    main()