igriv commited on
Commit
1ea9c72
·
verified ·
1 Parent(s): acf0d28

Update validator app

Browse files
Files changed (9) hide show
  1. README.md +98 -12
  2. app.py +38 -0
  3. compile_all_pdfs.py +154 -0
  4. latex_compiler.py +252 -0
  5. packages.txt +6 -0
  6. requirements.txt +8 -0
  7. run_parallel.py +248 -0
  8. universal_validator.py +697 -0
  9. validator_gui.py +646 -0
README.md CHANGED
@@ -1,12 +1,98 @@
1
- ---
2
- title: Math Validator
3
- emoji: 🏆
4
- colorFrom: blue
5
- colorTo: red
6
- sdk: gradio
7
- sdk_version: 5.42.0
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Math Question Validator with OpenAI o3
2
+
3
+ A Python tool for validating mathematical questions and answers using OpenAI's o3 model, with automatic reconciliation and quality assessment.
4
+
5
+ ## Features
6
+
7
+ - **Automated Answer Validation**: Uses OpenAI o3 model to solve math problems
8
+ - **Quality Assessment**: Evaluates question clarity, difficulty, and pedagogical value
9
+ - **Smart Reconciliation**: Generates detailed LaTeX documents comparing different solutions
10
+ - **Batch Processing**: Handles large datasets with progress tracking
11
+ - **File-based Output**: Avoids truncation issues with cloud storage by saving outputs as separate files
12
+
13
+ ## Setup
14
+
15
+ ### Prerequisites
16
+ - Python 3.8+
17
+ - OpenAI API key with o3 access
18
+ - MiKTeX (optional, for PDF compilation)
19
+
20
+ ### Installation
21
+
22
+ 1. Clone the repository:
23
+ ```bash
24
+ git clone https://github.com/YOUR_USERNAME/validator.git
25
+ cd validator
26
+ ```
27
+
28
+ 2. Install dependencies:
29
+ ```bash
30
+ pip install -r requirements.txt
31
+ ```
32
+
33
+ 3. Create `.env` file with your OpenAI API key:
34
+ ```
35
+ OPENAI_API_KEY=your_key_here
36
+ ```
37
+
38
+ ## Usage
39
+
40
+ ### Basic Validation
41
+ ```bash
42
+ python universal_validator.py
43
+ ```
44
+
45
+ This will:
46
+ 1. Load questions from the Excel file
47
+ 2. Filter for math/statistics questions
48
+ 3. Assess each question's quality
49
+ 4. Generate o3 model answers
50
+ 5. Compare with reference answers
51
+ 6. Create LaTeX reconciliation documents for mismatches
52
+
53
+ ### Compile LaTeX to PDF
54
+ ```bash
55
+ python compile_all_pdfs.py
56
+ ```
57
+
58
+ ## Output Structure
59
+
60
+ ```
61
+ validation_results/
62
+ └── run_YYYYMMDD_HHMMSS/
63
+ ├── manifest.json # Index of all results
64
+ ├── model_answers/ # Full model responses
65
+ │ └── q_XXXX_answer.txt
66
+ ├── latex_documents/ # Reconciliation documents
67
+ │ └── q_XXXX_reconciliation.tex
68
+ └── compiled_pdfs/ # Compiled PDFs (if generated)
69
+ └── q_XXXX_reconciliation.pdf
70
+ ```
71
+
72
+ ## File Naming Convention
73
+
74
+ Files are named using Excel row numbers for easy cross-reference:
75
+ - `q_0116_reconciliation.tex` → Excel row 116
76
+ - `q_0117_answer.txt` → Excel row 117
77
+
78
+ ## Models Used
79
+
80
+ - **o3**: Primary model for solving mathematical problems
81
+ - **gpt-4o**: Quality assessment and question evaluation
82
+
83
+ ## Excel Output Columns
84
+
85
+ - `model_answer_file`: Path to model's answer
86
+ - `answer_match`: MATCH/DIFFERENT/ERROR
87
+ - `latex_file`: Path to reconciliation document
88
+ - `quality_rating`: excellent/good/fair/poor
89
+ - `difficulty_level`: too_easy/appropriate/too_hard/unclear
90
+ - `quality_comment`: Detailed assessment
91
+
92
+ ## License
93
+
94
+ [Your chosen license]
95
+
96
+ ## Author
97
+
98
+ [Your name]
app.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python
"""
Hugging Face Spaces app for Math Validator.

Main entry point for HF Spaces deployment: builds the Gradio interface
from validator_gui and launches it with Spaces-compatible settings.
"""

import os
import sys

# Ensure sibling modules are importable regardless of the working
# directory HF Spaces uses when it starts the app.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

# Import and run the GUI
from validator_gui import ValidatorGUI

def main():
    """Build the Gradio interface and launch it for HF Spaces."""
    gui = ValidatorGUI()
    interface = gui.create_interface()

    # Launch with HF Spaces settings.
    # NOTE: `cache_examples` is an Interface-constructor option, not a
    # launch() option — passing it to launch() raises TypeError on
    # current Gradio, so it was removed here.
    interface.launch(
        server_name="0.0.0.0",  # Required for HF Spaces
        server_port=7860,       # Default HF Spaces port
        share=False,            # Sharing handled by HF
    )

if __name__ == "__main__":
    # Check if we're in HF Spaces (the platform sets SPACE_ID).
    if os.getenv("SPACE_ID"):
        print("Running in Hugging Face Spaces")
        print("Note: Set your API keys in the Spaces Settings > Variables and secrets")
    else:
        print("Running locally")

    main()
compile_all_pdfs.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ """
3
+ Batch compile all LaTeX reconciliation documents to PDFs
4
+ Can be run after validation to generate all PDFs at once
5
+ """
6
+
7
+ import os
8
+ import sys
9
+ import argparse
10
+ from pathlib import Path
11
+ from latex_compiler import compile_latex_batch, check_latex_available
12
+ import time
13
+
14
def find_tex_files(base_dir="validation_results"):
    """Recursively collect .tex source files under *base_dir*.

    Returns a list of path strings; prints a warning and returns an
    empty list when the directory does not exist.
    """
    root = Path(base_dir)

    if not root.exists():
        print(f"Directory not found: {base_dir}")
        return []

    # Keep only genuine sources: skip names that embed an auxiliary
    # extension (e.g. "foo.aux.tex" left behind by a previous run).
    skip_markers = ('.aux', '.log', '.out')
    return [
        str(candidate)
        for candidate in root.rglob("*.tex")
        if not any(marker in candidate.name for marker in skip_markers)
    ]
30
+
31
def compile_validation_pdfs(run_dir=None, max_workers=4):
    """
    Compile all LaTeX files from a validation run.

    Args:
        run_dir: Specific run directory, or None to auto-select the most
            recently modified "run_*" directory under validation_results/.
        max_workers: Number of parallel compile workers.

    Prints progress and a summary; returns None in all cases.
    """
    if not check_latex_available():
        print("Error: pdflatex not installed")
        print("Install with:")
        print(" Linux: apt-get install texlive-latex-base")
        print(" Windows: Install MiKTeX")
        print(" macOS: brew install --cask mactex")
        return

    # Resolve which run directory to process.
    if run_dir:
        base_dir = run_dir
    else:
        # No explicit run given: pick the latest run_* directory.
        base_path = Path("validation_results")
        if not base_path.exists():
            print("No validation_results directory found")
            return

        runs = [d for d in base_path.iterdir() if d.is_dir() and d.name.startswith("run_")]
        if not runs:
            print("No validation runs found")
            return

        # "Latest" = most recently modified, not lexicographic name order.
        latest_run = max(runs, key=lambda x: x.stat().st_mtime)
        base_dir = str(latest_run)
        print(f"Using latest run: {latest_run.name}")

    # The validator writes reconciliation documents into latex_documents/.
    latex_dir = Path(base_dir) / "latex_documents"
    if not latex_dir.exists():
        print(f"No latex_documents directory in {base_dir}")
        return

    tex_files = list(latex_dir.glob("*.tex"))
    if not tex_files:
        print(f"No .tex files found in {latex_dir}")
        return

    print(f"Found {len(tex_files)} LaTeX files to compile")

    # Skip documents that already have a matching PDF (same stem),
    # unless the user explicitly asks for a full recompile.
    existing_pdfs = list(latex_dir.glob("*.pdf"))
    if existing_pdfs:
        print(f" ({len(existing_pdfs)} PDFs already exist)")

        pdf_names = {f.stem for f in existing_pdfs}
        new_tex = [f for f in tex_files if f.stem not in pdf_names]

        if new_tex:
            print(f" Compiling {len(new_tex)} new PDFs...")
            tex_files = new_tex
        else:
            print(" All PDFs already compiled")

            recompile = input("Recompile all? (y/N): ").strip().lower()
            if recompile != 'y':
                return

    # Compile in parallel.
    print(f"\nCompiling with {max_workers} parallel workers...")
    start_time = time.time()

    results = compile_latex_batch(
        [str(f) for f in tex_files],
        output_dir=str(latex_dir),
        max_workers=max_workers,
        timeout=30
    )

    # Summary: results maps tex path -> (success, pdf_path, error).
    elapsed = time.time() - start_time
    successful = sum(1 for r in results.values() if r[0])
    failed = len(results) - successful

    print(f"\n{'='*60}")
    print(f"Compilation complete in {elapsed:.1f} seconds")
    print(f" Successful: {successful}")
    print(f" Failed: {failed}")

    if failed > 0:
        print("\nFailed files:")
        for tex_file, (success, _, error) in results.items():
            if not success:
                print(f" - {Path(tex_file).name}: {error[:50]}...")

    print(f"\nPDFs saved to: {latex_dir}")
129
+
130
def main():
    """Command-line entry point: parse options and compile PDFs."""
    parser = argparse.ArgumentParser(description='Compile LaTeX reconciliation documents to PDFs')
    parser.add_argument('--run-dir', help='Specific run directory (default: latest)')
    parser.add_argument('--workers', type=int, default=4, help='Number of parallel workers')
    parser.add_argument('--all', action='store_true', help='Compile all runs, not just latest')

    args = parser.parse_args()

    if not args.all:
        # Single run: either the one requested or the latest.
        compile_validation_pdfs(args.run_dir, args.workers)
        return

    # --all: walk every run_* directory under validation_results/.
    base_path = Path("validation_results")
    if not base_path.exists():
        return

    runs = [d for d in base_path.iterdir() if d.is_dir() and d.name.startswith("run_")]
    print(f"Found {len(runs)} validation runs")

    for run in runs:
        print(f"\n{'='*60}")
        print(f"Processing: {run.name}")
        print('='*60)
        compile_validation_pdfs(str(run), args.workers)

if __name__ == "__main__":
    main()
latex_compiler.py ADDED
@@ -0,0 +1,252 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ """
3
+ Async LaTeX compilation handler
4
+ Works efficiently on Linux/HF Spaces with forking
5
+ Falls back to sequential on Windows
6
+ """
7
+
8
+ import os
9
+ import sys
10
+ import subprocess
11
+ import platform
12
+ from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
13
+ from pathlib import Path
14
+ import time
15
+
16
def is_linux():
    """Return True on POSIX-like systems (Linux or macOS), else False."""
    system_name = platform.system()
    return system_name == 'Linux' or system_name == 'Darwin'
19
+
20
def compile_latex_file(tex_path, output_dir=None, timeout=30):
    """
    Compile a single LaTeX file to PDF with pdflatex.

    Args:
        tex_path: Path to .tex file (str or Path)
        output_dir: Output directory (str or Path; default: same as tex file)
        timeout: Compilation timeout in seconds

    Returns:
        tuple: (success: bool, pdf_path: str or None, error_msg: str or None)
    """
    tex_path = Path(tex_path)
    if not tex_path.exists():
        return False, None, f"File not found: {tex_path}"

    # Normalize to Path: callers (e.g. compile_latex_batch via
    # compile_all_pdfs) pass a plain string, and `str / str` has no `/`
    # operator — the original crashed with TypeError in that case.
    output_dir = Path(output_dir) if output_dir else tex_path.parent
    pdf_path = output_dir / tex_path.with_suffix('.pdf').name

    # Remove a stale PDF so a leftover file is never mistaken for success.
    if pdf_path.exists():
        try:
            pdf_path.unlink()
        except OSError:
            # Best-effort: pdflatex will try to overwrite it anyway.
            pass

    # Compile command: non-interactive, stop on first error.
    cmd = [
        'pdflatex',
        '-interaction=nonstopmode',
        '-halt-on-error',
        f'-output-directory={output_dir}',
        str(tex_path)
    ]

    try:
        # Run compilation from the source directory so relative inputs
        # (\input, images) resolve.
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=timeout,
            cwd=str(tex_path.parent)
        )

        # Success is judged by the PDF existing, not by the return code.
        if pdf_path.exists():
            return True, str(pdf_path), None

        # Extract the first error block from the pdflatex transcript
        # (pdflatex reports errors on stdout, prefixed with '!').
        error_msg = "Compilation failed"
        if result.stdout:
            lines = result.stdout.split('\n')
            for i, line in enumerate(lines):
                if 'Error' in line or '!' in line[:2]:
                    error_msg = '\n'.join(lines[i:i+5])
                    break
        return False, None, error_msg

    except subprocess.TimeoutExpired:
        return False, None, f"Timeout after {timeout} seconds"
    except FileNotFoundError:
        return False, None, "pdflatex not found - install texlive"
    except Exception as e:
        return False, None, str(e)
85
+
86
def compile_latex_batch(tex_files, output_dir=None, max_workers=4, timeout=30):
    """
    Compile multiple LaTeX files in parallel.

    Args:
        tex_files: List of .tex file paths
        output_dir: Output directory for PDFs
        max_workers: Number of parallel workers
        timeout: Timeout per file

    Returns:
        dict: {tex_path: (success, pdf_path, error_msg)}
    """
    results = {}

    if not tex_files:
        return results

    # Processes give true parallelism on POSIX; threads are the portable
    # fallback on Windows (less efficient but works).
    if is_linux():
        pool_cls = ProcessPoolExecutor
        print(f"Using process-based parallelism ({max_workers} workers)")
    else:
        pool_cls = ThreadPoolExecutor
        print(f"Using thread-based parallelism ({max_workers} workers)")

    with pool_cls(max_workers=max_workers) as pool:
        # Fan out one compile task per source file.
        pending = {
            pool.submit(compile_latex_file, src, output_dir, timeout): src
            for src in tex_files
        }

        # Gather results in submission order; a small grace period is
        # allowed on top of the per-file compile timeout.
        for future, src in pending.items():
            try:
                ok, pdf_path, error = future.result(timeout=timeout+5)
                results[src] = (ok, pdf_path, error)

                if ok:
                    print(f" ✓ Compiled: {Path(src).name}")
                else:
                    print(f" ✗ Failed: {Path(src).name}")

            except Exception as e:
                results[src] = (False, None, str(e))
                print(f" ✗ Error: {Path(src).name}: {e}")

    return results
137
+
138
def compile_latex_async(tex_path, output_dir=None, callback=None):
    """
    Compile LaTeX file asynchronously (fire-and-forget)

    Args:
        tex_path: Path to .tex file
        output_dir: Output directory
        callback: Optional callback function(success, pdf_path, error)
    """
    if is_linux():
        # On Linux/macOS, fork a child process so the caller returns
        # immediately.
        # NOTE(review): the parent never calls os.waitpid(), so finished
        # children linger as zombies until the parent exits — confirm this
        # is acceptable for long-running validator processes.
        pid = os.fork()
        if pid == 0:
            # Child process: run the compile, then hard-exit. os._exit(0)
            # skips atexit handlers and flushing inherited from the parent,
            # which is intentional for a forked worker.
            try:
                success, pdf_path, error = compile_latex_file(tex_path, output_dir)
                if callback:
                    callback(success, pdf_path, error)
            finally:
                os._exit(0)
        else:
            # Parent process continues immediately
            print(f" → Compiling {Path(tex_path).name} in background (PID: {pid})")
    else:
        # On Windows there is no fork; a daemon thread gives fire-and-forget
        # semantics (the thread dies with the process, possibly mid-compile).
        from threading import Thread

        def compile_thread():
            # Runs compile_latex_file off the caller's thread, then invokes
            # the callback (if any) from this worker thread.
            success, pdf_path, error = compile_latex_file(tex_path, output_dir)
            if callback:
                callback(success, pdf_path, error)

        thread = Thread(target=compile_thread, daemon=True)
        thread.start()
        print(f" → Compiling {Path(tex_path).name} in background thread")
173
+
174
def check_latex_available():
    """Return True if a working `pdflatex` binary is on PATH.

    Runs `pdflatex --version` with a short timeout and prints the
    detected version line on success.
    """
    try:
        result = subprocess.run(
            ['pdflatex', '--version'],
            capture_output=True,
            text=True,
            timeout=5
        )
    # Narrowed from a bare `except:` (which also swallowed
    # KeyboardInterrupt): missing binary, hung binary, or OS failure.
    except (FileNotFoundError, subprocess.TimeoutExpired, OSError):
        return False

    if result.returncode == 0:
        # Report which TeX distribution answered.
        for line in result.stdout.split('\n'):
            if 'TeX' in line:
                print(f"LaTeX available: {line.strip()}")
                return True
    return False
192
+
193
+ # Integration with universal_validator.py
194
def setup_async_latex_compilation():
    """
    Setup async LaTeX compilation for the validator
    Returns a function that can be used to compile LaTeX files
    """
    if not check_latex_available():
        print("Warning: LaTeX not available, PDF compilation disabled")
        return None

    def compile_reconciliation(tex_path):
        """Compile reconciliation document asynchronously"""
        # Named callback instead of an inline lambda; same output string.
        def report(success, _pdf_path, _error):
            outcome = 'Success' if success else 'Failed'
            print(f" [PDF] {outcome}: {Path(tex_path).name}")

        compile_latex_async(tex_path, callback=report)

    return compile_reconciliation
211
+
212
if __name__ == "__main__":
    # Self-test: report platform capabilities, then round-trip a tiny
    # document through pdflatex if it is installed.
    import tempfile

    print("Testing LaTeX compilation...")
    print(f"Platform: {platform.system()}")
    print(f"Async support: {'Yes' if is_linux() else 'Limited (Windows)'}")

    if check_latex_available():
        # Create a minimal test document. delete=False so pdflatex can
        # reopen the file by name after this handle is closed.
        with tempfile.NamedTemporaryFile(mode='w', suffix='.tex', delete=False) as f:
            f.write(r"""\documentclass{article}
\begin{document}
\title{Test Document}
\author{Validator}
\maketitle
This is a test: $x^2 + y^2 = z^2$
\end{document}""")
            test_file = f.name

        print(f"\nCompiling test file: {test_file}")
        success, pdf_path, error = compile_latex_file(test_file)

        if success:
            print(f"✓ Success! PDF created: {pdf_path}")
            print(f" Size: {os.path.getsize(pdf_path)} bytes")
        else:
            print(f"✗ Failed: {error}")

        # Clean up the temp .tex and any produced PDF; best-effort only
        # (pdf_path is None on failure, hence the guard).
        try:
            os.unlink(test_file)
            if pdf_path and os.path.exists(pdf_path):
                os.unlink(pdf_path)
        except:
            pass
    else:
        print("✗ LaTeX not installed")
        print(" On Linux: apt-get install texlive-latex-base")
        print(" On Windows: Install MiKTeX")
        print(" On macOS: brew install --cask mactex")
packages.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ texlive-latex-base
2
+ texlive-latex-extra
3
+ texlive-fonts-recommended
4
+ texlive-fonts-extra
5
+ texlive-latex-recommended
6
+ texlive-xetex
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ gradio>=4.0.0
2
+ pandas>=2.0.0
3
+ openpyxl>=3.1.0
4
+ python-dotenv>=1.0.0
5
+ openai>=1.0.0
6
+ requests>=2.31.0
7
+ tqdm>=4.65.0
8
+ httpx>=0.24.0
run_parallel.py ADDED
@@ -0,0 +1,248 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ """
3
+ Run validator in parallel across multiple processes
4
+ """
5
+
6
+ import subprocess
7
+ import sys
8
+ import os
9
+ import argparse
10
+ import math
11
+ from concurrent.futures import ProcessPoolExecutor, as_completed
12
+ import pandas as pd
13
+
14
def run_validator_range(args):
    """Run universal_validator.py as a subprocess over one question range.

    Args:
        args: Tuple of (excel_file, solver, reconciler, start, end, images,
            batch_size, output_base, compile_latex) — packed into one tuple
            so the function can be dispatched through an executor map.

    Returns:
        tuple: (start, end, status, error_text) where status is one of
        "success", "failed", or "error".
    """
    excel_file, solver, reconciler, start, end, images, batch_size, output_base, compile_latex = args

    # Each range writes to its own file so parallel workers never clash.
    range_output = output_base.replace('.xlsx', f'_p{start}_{end}.xlsx')

    cmd = [
        sys.executable, "universal_validator.py",
        excel_file,
        "--model", solver,
        "--reconciliation-model", reconciler,
        "--images", images,
        "--start", str(start),
        "--end", str(end),
        "--batch-size", str(batch_size),
        "--output", range_output
    ]

    if compile_latex:
        cmd.append("--compile-latex")

    print(f"[PARALLEL] Starting process for questions {start+1}-{end}...")

    try:
        # Capture combined stdout/stderr and re-print each line with a
        # per-process prefix so interleaved output stays attributable.
        # (The original comment claimed output was NOT captured — it is.)
        process = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            encoding='utf-8',
            errors='replace',
            bufsize=1  # line-buffered so lines stream as they are produced
        )

        # Stream output line by line, keeping a copy for error reporting.
        output_lines = []
        for line in process.stdout:
            print(f"[P{start//100+1}] {line.rstrip()}")
            output_lines.append(line)

        process.wait()

        if process.returncode == 0:
            print(f"[PARALLEL] Completed range {start+1}-{end}")
            return (start, end, "success", "")
        else:
            error_msg = "".join(output_lines[-20:])  # Last 20 lines
            print(f"[FAIL] Failed range {start+1}-{end}")
            return (start, end, "failed", error_msg)

    except Exception as e:
        print(f"[ERROR] Error in range {start+1}-{end}: {e}")
        return (start, end, "error", str(e))
72
+
73
def main():
    """CLI entry point: split the question set into ranges and validate
    each range in a separate subprocess, then merge the per-range files."""
    parser = argparse.ArgumentParser(description='Run validator in parallel')
    parser.add_argument('file', help='Excel file to process')
    parser.add_argument('--num-processes', type=int, default=4,
                        help='Number of parallel processes (default: 4)')
    parser.add_argument('--solver', default='o3-mini',
                        help='Solver model (default: o3-mini)')
    parser.add_argument('--reconciler', default='gpt-4o',
                        help='Reconciliation model (default: gpt-4o)')
    parser.add_argument('--images', default='when_needed',
                        help='Image handling (default: when_needed)')
    parser.add_argument('--batch-size', type=int, default=5,
                        help='Questions per batch (default: 5)')
    parser.add_argument('--questions-per-process', type=int, default=100,
                        help='Questions per process (default: 100)')
    parser.add_argument('--output', type=str, default=None,
                        help='Output filename for merged results')
    parser.add_argument('--start-range', type=int, default=0,
                        help='Start of question range')
    parser.add_argument('--end-range', type=int, default=None,
                        help='End of question range')
    parser.add_argument('--compile-latex', action='store_true',
                        help='Compile LaTeX files to PDF')

    args = parser.parse_args()

    # Count total questions (read only to size the ranges; the subprocesses
    # re-read the file themselves).
    print(f"Loading {args.file} to count questions...")
    df = pd.read_excel(args.file, sheet_name='Data')

    # Filter for math questions by subject keyword (case-insensitive regex).
    if 'raw_subject' in df.columns:
        math_filter = df['raw_subject'].str.lower().str.contains(
            'math|statistic|calculus|algebra|geometry|trigonometry',
            na=False, regex=True
        )
        df = df[math_filter]

    # Apply range if specified (positional slice over the filtered frame).
    if args.start_range > 0 or args.end_range:
        start_idx = args.start_range
        end_idx = args.end_range if args.end_range else len(df)
        df = df.iloc[start_idx:end_idx]
        print(f"Processing range: questions {start_idx+1} to {end_idx}")

    total_questions = len(df)
    print(f"Found {total_questions} math questions to process")

    # Calculate ranges.
    # NOTE(review): max() makes --questions-per-process a *floor* — asking
    # for fewer questions per process than an even split is ignored;
    # confirm that is the intended semantics.
    questions_per_process = max(args.questions_per_process, math.ceil(total_questions / args.num_processes))
    num_processes = min(args.num_processes, math.ceil(total_questions / questions_per_process))

    # Generate output base filename (timestamped unless supplied).
    if args.output:
        output_base = args.output
    else:
        from datetime import datetime
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        base_name = os.path.basename(args.file).replace('.xlsx', '')
        output_base = f"{base_name}_validated_{timestamp}_parallel.xlsx"

    # Build one argument tuple per subprocess (see run_validator_range).
    ranges = []
    base_start = args.start_range if args.start_range else 0

    for i in range(num_processes):
        start = base_start + i * questions_per_process
        end = min(base_start + (i + 1) * questions_per_process, base_start + total_questions)
        if start < base_start + total_questions:
            ranges.append((
                args.file,
                args.solver,
                args.reconciler,
                start,
                end,
                args.images,
                args.batch_size,
                output_base,
                args.compile_latex
            ))

    print(f"\nWill run {len(ranges)} parallel processes:")
    for i, (_, _, _, start, end, _, _, _, _) in enumerate(ranges, 1):
        print(f" Process {i}: questions {start+1}-{end}")

    # Skip confirmation in GUI mode (when output is specified).
    if not args.output:
        confirm = input("\nProceed? (Y/n): ").strip().lower()
        if confirm == 'n':
            print("Cancelled")
            return

    # Run in parallel; each worker shells out to universal_validator.py.
    print(f"\nStarting {len(ranges)} parallel processes...")

    with ProcessPoolExecutor(max_workers=num_processes) as executor:
        futures = {executor.submit(run_validator_range, r): r for r in ranges}

        completed = 0
        failed = []

        for future in as_completed(futures):
            completed += 1
            start, end, status, error = future.result()

            if status != "success":
                failed.append((start, end, error))

            print(f"Progress: {completed}/{len(ranges)} processes completed")

    # Summary
    print("\n" + "="*60)
    print("PARALLEL VALIDATION COMPLETE")
    print("="*60)

    if failed:
        print(f"\nFailed ranges ({len(failed)}):")
        for start, end, error in failed:
            print(f" {start}-{end}: {error[:100]}")
        print("\nRerun these ranges individually to retry")
    else:
        print("\nAll ranges completed successfully!")

    # Merge results from all processes into a single workbook.
    print("\nMerging results from all processes...")
    merge_results(args.file, output_base, ranges)

    # Clean up the intermediate per-range files produced by the workers.
    for _, _, _, start, end, _, _, _, _ in ranges:
        temp_file = output_base.replace('.xlsx', f'_p{start}_{end}.xlsx')
        if os.path.exists(temp_file):
            os.remove(temp_file)
            print(f" Cleaned up: {temp_file}")

    print(f"\nFinal results saved to: {output_base}")
    print(f"Results from {len(ranges)} processes have been merged")
208
+
209
def merge_results(original_file, output_file, ranges):
    """Merge results from parallel processes into a single file"""
    import pandas as pd

    # Load original data; result columns are copied onto this frame.
    original_df = pd.read_excel(original_file, sheet_name='Data')

    # Process each range file and update the dataframe.
    # NOTE(review): rows are addressed by the same absolute index in both
    # temp_df and original_df — this assumes each per-range output file
    # preserves the full sheet length/order; verify against the writer in
    # universal_validator.py.
    for _, _, _, start, end, _, _, _, _ in ranges:
        temp_file = output_file.replace('.xlsx', f'_p{start}_{end}.xlsx')
        if os.path.exists(temp_file):
            try:
                temp_df = pd.read_excel(temp_file, sheet_name='Data')
                # Copy only the validator-produced columns for this range.
                for idx in range(start, min(end, len(temp_df))):
                    if idx < len(original_df):
                        for col in ['model_answer_file', 'answer_match', 'latex_file',
                                    'quality_rating', 'difficulty_level', 'quality_comment']:
                            if col in temp_df.columns:
                                original_df.at[idx, col] = temp_df.at[idx, col]
                print(f" Merged results from questions {start+1}-{end}")
            except Exception as e:
                # Best-effort merge: a bad/partial range file is reported
                # but does not abort the overall merge.
                print(f" Warning: Could not merge {temp_file}: {e}")

    # Save merged results.
    with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
        original_df.to_excel(writer, sheet_name='Data', index=False)

        # Copy other sheets if they exist (best-effort; any read failure
        # simply leaves the output with only the Data sheet).
        try:
            xl = pd.ExcelFile(original_file)
            for sheet_name in xl.sheet_names:
                if sheet_name != 'Data':
                    df = pd.read_excel(original_file, sheet_name=sheet_name)
                    df.to_excel(writer, sheet_name=sheet_name, index=False)
        except:
            pass

if __name__ == "__main__":
    main()
universal_validator.py ADDED
@@ -0,0 +1,697 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import os
3
+ from dotenv import load_dotenv
4
+ import time
5
+ from typing import Dict, Any, Optional, List
6
+ import re
7
+ from datetime import datetime
8
+ import json
9
+ from tqdm import tqdm
10
+ import base64
11
+ import requests
12
+ from io import BytesIO
13
+
14
+ load_dotenv()
15
+
16
+ class UniversalMathValidator:
17
+ """Universal validator that can handle different Excel formats and API providers"""
18
+
19
+ def __init__(self, excel_file: str, provider: str = "openai", include_images: str = "when_needed",
20
+ solver_model: str = None, reconciliation_model: str = None):
21
+ """
22
+ Initialize validator
23
+
24
+ Args:
25
+ excel_file: Path to Excel file
26
+ provider: "openai" or "openrouter"
27
+ include_images: "always", "never", or "when_needed"
28
+ solver_model: Model for solving questions
29
+ reconciliation_model: Model for reconciliation
30
+ """
31
+ self.excel_file = excel_file
32
+ self.include_images = include_images
33
+
34
+ # Determine provider based on models
35
+ # If any model requires OpenRouter, use OpenRouter for everything
36
+ openrouter_prefixes = ["anthropic/", "x-ai/", "google/", "meta-llama/", "mistral/", "openai/"]
37
+ openai_models = ["o3-mini", "gpt-4o", "gpt-5", "gpt-5-mini", "gpt-5-nano", "gpt-4-turbo"]
38
+
39
+ # Check if any model needs OpenRouter (has a prefix or is not an OpenAI model)
40
+ solver_needs_or = solver_model and (
41
+ any(solver_model.startswith(p) for p in openrouter_prefixes) or
42
+ solver_model not in openai_models
43
+ )
44
+ recon_needs_or = reconciliation_model and (
45
+ any(reconciliation_model.startswith(p) for p in openrouter_prefixes) or
46
+ reconciliation_model not in openai_models
47
+ )
48
+
49
+ needs_openrouter = solver_needs_or or recon_needs_or
50
+
51
+ # Override provider if OpenRouter is needed
52
+ if needs_openrouter:
53
+ self.provider = "openrouter"
54
+ if provider == "openai":
55
+ print("Note: Using OpenRouter for all models since non-OpenAI model specified")
56
+ else:
57
+ self.provider = provider
58
+
59
+ # Store original model names for later prefixing if needed
60
+ self.solver_model_input = solver_model
61
+ self.reconciliation_model_input = reconciliation_model
62
+
63
+ self.df = None
64
+ self.output_file = None # Will be set later
65
+ self.compile_latex = False # Will be set from args
66
+
67
+ # Detect file format
68
+ self.file_format = self._detect_format()
69
+
70
+ # Create directories for outputs
71
+ self.base_dir = "validation_results"
72
+ self.timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
73
+ self.run_dir = os.path.join(self.base_dir, f"run_{self.timestamp}")
74
+ self.latex_dir = os.path.join(self.run_dir, "latex_documents")
75
+ self.answers_dir = os.path.join(self.run_dir, "model_answers")
76
+
77
+ os.makedirs(self.latex_dir, exist_ok=True)
78
+ os.makedirs(self.answers_dir, exist_ok=True)
79
+
80
+ # Initialize API client
81
+ if self.provider == "openai":
82
+ from openai import OpenAI
83
+ import httpx
84
+ # Set 5 minute timeout for GPT-5 models which can be very slow
85
+ self.client = OpenAI(
86
+ api_key=os.getenv('OPENAI_API_KEY'),
87
+ timeout=httpx.Timeout(300.0, connect=10.0) # 300 second timeout, 10 second connect
88
+ )
89
+ # Default models for OpenAI
90
+ self.model = self.solver_model_input or "o3-mini"
91
+ self.reconciliation_model = self.reconciliation_model_input or "gpt-4o"
92
+ self.assessment_model = "gpt-4o"
93
+ elif self.provider == "openrouter":
94
+ import httpx
95
+ self.client = self._setup_openrouter()
96
+
97
+ # Helper to add openai/ prefix if needed
98
+ def format_for_openrouter(model_name):
99
+ if not model_name:
100
+ return None
101
+ # If already has a prefix, use as-is
102
+ if "/" in model_name:
103
+ return model_name
104
+ # If it's an OpenAI model, add prefix
105
+ openai_models = ["o3-mini", "gpt-4o", "gpt-5", "gpt-5-mini", "gpt-5-nano", "gpt-4-turbo"]
106
+ if model_name in openai_models:
107
+ return f"openai/{model_name}"
108
+ # Otherwise assume it needs no prefix (for backwards compatibility)
109
+ return model_name
110
+
111
+ # Format models for OpenRouter
112
+ self.model = format_for_openrouter(self.solver_model_input) or "openai/o3-mini"
113
+ self.reconciliation_model = format_for_openrouter(self.reconciliation_model_input) or "openai/gpt-4o"
114
+ self.assessment_model = "openai/gpt-4o"
115
+
116
+ # System prompts
117
+ self.system_prompt_answer = """You are a highly skilled mathematics graduate student.
118
+ Solve the following problem step by step.
119
+ IMPORTANT: First show your complete reasoning and work.
120
+ Then clearly state the final answer.
121
+ Your response should include both the reasoning process and the final answer."""
122
+
123
+ self.system_prompt_assess = """You are an experienced mathematics educator. Evaluate mathematical questions."""
124
+
125
+ self.system_prompt_reconcile = """You are a graduate student who produces detailed justifications in LaTeX format.
126
+ You excel at analyzing mathematical solutions and identifying potential errors.
127
+ Your output should be a complete LaTeX document that can be compiled directly."""
128
+
129
+ # Create manifest
130
+ self.manifest_file = os.path.join(self.run_dir, "manifest.json")
131
+ self.manifest = {
132
+ "timestamp": self.timestamp,
133
+ "source_file": excel_file,
134
+ "file_format": self.file_format,
135
+ "provider": provider,
136
+ "model": self.model,
137
+ "questions": {}
138
+ }
139
+
140
+ def _detect_format(self) -> str:
141
+ """Detect which format the Excel file uses"""
142
+ xl = pd.ExcelFile(self.excel_file)
143
+
144
+ # Check for specific sheets
145
+ if 'rationale_images' in xl.sheet_names:
146
+ return "HLE_B3" # HLE_Verified_B3 format
147
+ elif 'model_responses' in xl.sheet_names:
148
+ return "HLE_335" # HLE_335 format
149
+ else:
150
+ return "unknown"
151
+
152
+ def _setup_openrouter(self):
153
+ """Setup OpenRouter client"""
154
+ from openai import OpenAI
155
+ import httpx
156
+
157
+ # OpenRouter uses OpenAI-compatible API
158
+ client = OpenAI(
159
+ base_url="https://openrouter.ai/api/v1",
160
+ api_key=os.getenv('OPENROUTER_API_KEY'),
161
+ timeout=httpx.Timeout(300.0, connect=10.0), # Same timeout as OpenAI
162
+ default_headers={
163
+ "HTTP-Referer": "https://github.com/yourusername/validator",
164
+ "X-Title": "Math Validator"
165
+ }
166
+ )
167
+ return client
168
+
169
+ def load_data(self):
170
+ """Load and normalize data based on file format"""
171
+ if self.file_format == "HLE_B3":
172
+ # Load HLE_Verified_B3 format
173
+ self.df = pd.read_excel(self.excel_file, sheet_name='Data')
174
+
175
+ # Normalize column names
176
+ self.df['task_name'] = self.df.get('id', '')
177
+ self.df['answer type'] = self.df.get('answer_type', 'exactMatch')
178
+
179
+ # Create image mapping from file_url column (question images)
180
+ self.image_mapping = {}
181
+ if 'file_url' in self.df.columns:
182
+ for idx, row in self.df.iterrows():
183
+ if pd.notna(row.get('file_url')) and pd.notna(row.get('id')):
184
+ self.image_mapping[row['id']] = row['file_url']
185
+ print(f"Loaded {len(self.image_mapping)} question images from file_url column")
186
+
187
+ # Also load rationale images if needed (these are for rationales, not questions)
188
+ try:
189
+ rationale_images = pd.read_excel(self.excel_file, sheet_name='rationale_images')
190
+ # Don't overwrite question images with rationale images
191
+ rationale_mapping = dict(zip(rationale_images['ID'], rationale_images['gcp']))
192
+ print(f"Found {len(rationale_mapping)} rationale images (not used for questions)")
193
+ except:
194
+ pass
195
+
196
+ elif self.file_format == "HLE_335":
197
+ # Load HLE_335 format
198
+ self.df = pd.read_excel(self.excel_file, sheet_name='Data')
199
+ self.image_mapping = {}
200
+ else:
201
+ # Generic format - assume Data sheet exists
202
+ self.df = pd.read_excel(self.excel_file, sheet_name='Data')
203
+ self.image_mapping = {}
204
+
205
+ # Filter for math questions but KEEP ORIGINAL INDICES
206
+ if 'raw_subject' in self.df.columns:
207
+ math_filter = self.df['raw_subject'].str.lower().str.contains(
208
+ 'math|statistic|calculus|algebra|geometry|trigonometry',
209
+ na=False, regex=True
210
+ )
211
+ # Keep original indices by not resetting them
212
+ self.df = self.df[math_filter] # Don't use .copy() with reset indices
213
+
214
+ # Add result columns
215
+ self.df['model_answer_file'] = ''
216
+ self.df['answer_match'] = ''
217
+ self.df['latex_file'] = ''
218
+ self.df['quality_rating'] = ''
219
+ self.df['difficulty_level'] = ''
220
+ self.df['quality_comment'] = ''
221
+
222
+ print(f"Loaded {len(self.df)} math/statistics questions from {self.file_format} format")
223
+ return self.df
224
+
225
+ def _get_image_for_question(self, row) -> Optional[str]:
226
+ """Get image URL or path for a question if needed"""
227
+ if self.include_images == "never":
228
+ return None
229
+
230
+ # Check if question has an image reference
231
+ question_id = row.get('id') or row.get('task_name')
232
+ question_text = str(row.get('question', '')).lower()
233
+
234
+ # Check if question mentions an image
235
+ has_image_reference = any(keyword in question_text for keyword in [
236
+ "image", "figure", "diagram", "picture", "attached",
237
+ "graph", "plot", "shown", "below", "above"
238
+ ])
239
+
240
+ if self.include_images == "always" or (
241
+ self.include_images == "when_needed" and has_image_reference
242
+ ):
243
+ # First check file_url column directly (primary source for question images)
244
+ if 'file_url' in row and pd.notna(row['file_url']):
245
+ return row['file_url']
246
+
247
+ # Then try to get image from mapping
248
+ if question_id in self.image_mapping:
249
+ return self.image_mapping[question_id]
250
+
251
+ # Finally check for generic image column
252
+ if 'image' in row and pd.notna(row['image']):
253
+ return row['image']
254
+
255
+ # Log warning if image was expected but not found
256
+ if has_image_reference:
257
+ original_idx = row.name if hasattr(row, 'name') else 'unknown'
258
+ print(f" [WARNING] Question {original_idx} mentions image but none found (ID: {question_id[:20]}...)")
259
+
260
+ return None
261
+
262
+ def _encode_image(self, image_url: str) -> Optional[str]:
263
+ """Download and encode image as base64"""
264
+ try:
265
+ response = requests.get(image_url, timeout=10)
266
+ if response.status_code == 200:
267
+ return base64.b64encode(response.content).decode('utf-8')
268
+ except:
269
+ pass
270
+ return None
271
+
272
    def get_model_answer(self, question: str, image_url: Optional[str] = None, attempt: int = 1) -> Optional[str]:
        """Get answer from model with optional image support.

        Args:
            question: Problem statement sent as the user message.
            image_url: Optional image location; only attached for the OpenAI
                provider, and only when it is an http(s) URL.
            attempt: 1-based retry counter; retries up to 3 times with
                exponential backoff (2**attempt seconds) via recursion.

        Returns:
            The model's answer text, or None after 3 failed attempts.
        """
        try:
            messages = [
                {"role": "system", "content": self.system_prompt_answer}
            ]

            # Build user message with optional image
            if image_url and self.provider == "openai":
                # OpenAI vision format
                user_content = [
                    {"type": "text", "text": question}
                ]

                # Local file paths are silently skipped - only URLs are sent.
                if image_url.startswith('http'):
                    user_content.append({
                        "type": "image_url",
                        "image_url": {"url": image_url}
                    })

                messages.append({"role": "user", "content": user_content})
            else:
                # Text-only or OpenRouter (handle differently if needed)
                messages.append({"role": "user", "content": question})

            # Make API call
            # Check the original model name (before prefixing) for special handling
            # Handle case where solver_model_input might not be set
            if hasattr(self, 'solver_model_input'):
                original_model = self.solver_model_input or self.model
            else:
                original_model = self.model

            # Reasoning models reject `temperature` and use
            # `max_completion_tokens` instead of `max_tokens`.
            if original_model in ["o3-mini", "gpt-5", "gpt-5-mini", "gpt-5-nano"]:
                # Use higher token limit for GPT-5 and o3 models to allow for reasoning
                if original_model == "o3-mini":
                    max_tokens = 10000
                elif original_model in ["gpt-5", "gpt-5-mini", "gpt-5-nano"]:
                    max_tokens = 8000  # Increased for reasoning + answer
                else:
                    # NOTE(review): unreachable - the outer `in` check already
                    # covers every model the two branches above handle.
                    max_tokens = 3000

                response = self.client.chat.completions.create(
                    model=self.model,
                    messages=messages,
                    max_completion_tokens=max_tokens
                )
            else:
                response = self.client.chat.completions.create(
                    model=self.model,
                    messages=messages,
                    temperature=0.1,
                    max_tokens=2000
                )

            return response.choices[0].message.content.strip()

        except Exception as e:
            error_msg = str(e)
            if "timeout" in error_msg.lower():
                print(f" [TIMEOUT] Timeout getting model answer (attempt {attempt}/3)")
            else:
                print(f" [ERROR] Error getting model answer (attempt {attempt}): {e}")

            if attempt < 3:
                time.sleep(2 ** attempt)
                return self.get_model_answer(question, image_url, attempt + 1)

            print(f" [ERROR] Failed after 3 attempts")
            return None
342
+
343
    def generate_reconciliation_latex(self, question: str, model_answer: str,
                                      reference_answer: str, rationale: str = None, attempt: int = 1) -> str:
        """Generate LaTeX reconciliation document for mismatched answers.

        Asks the reconciliation model for a complete, compilable LaTeX
        document comparing the model's answer with the reference answer.
        Retries up to 3 times with exponential backoff; returns None when all
        attempts fail.
        """
        prompt = f"""Compare and reconcile these two answers to the following problem.

PROBLEM:
{question}

MODEL'S ANSWER:
{model_answer}

REFERENCE ANSWER:
{reference_answer}

REFERENCE RATIONALE:
{rationale if pd.notna(rationale) else "Not provided"}

Please create a complete LaTeX document that:
1. States the problem
2. Shows the model's approach and solution
3. Shows the reference approach and solution
4. Analyzes where any differences or errors might occur
5. Provides your assessment of which answer is correct and why

The document should be properly formatted with sections and mathematical notation.
Begin with \\documentclass and end with \\end{{document}}."""

        try:
            # Handle GPT-5 models parameter differences
            messages = [
                {"role": "system", "content": self.system_prompt_reconcile},
                {"role": "user", "content": prompt}
            ]

            # Use the configured reconciliation model
            reconciliation_model = self.reconciliation_model

            # Check the original model name (before prefixing) for special handling
            # Handle case where reconciliation_model_input might not be set
            if hasattr(self, 'reconciliation_model_input'):
                original_recon = self.reconciliation_model_input or reconciliation_model
            else:
                original_recon = reconciliation_model

            # Check if reconciliation model needs special handling
            if original_recon in ["gpt-5", "gpt-5-mini", "gpt-5-nano"]:
                # GPT-5 models don't support temperature
                response = self.client.chat.completions.create(
                    model=reconciliation_model,
                    messages=messages,
                    max_completion_tokens=8000  # Allow longer for reconciliation
                )
            elif original_recon in ["o3-mini"]:
                response = self.client.chat.completions.create(
                    model=reconciliation_model,
                    messages=messages,
                    max_completion_tokens=10000
                )
            else:
                # Standard models (gpt-4o, claude, etc.)
                response = self.client.chat.completions.create(
                    model=reconciliation_model,
                    messages=messages,
                    temperature=0.3,
                    max_tokens=3000
                )

            return response.choices[0].message.content.strip()
        except Exception as e:
            error_msg = str(e)
            if "timeout" in error_msg.lower():
                print(f" [TIMEOUT] Timeout generating reconciliation (attempt {attempt}/3)")
            else:
                print(f" [ERROR] Error generating reconciliation (attempt {attempt}): {e}")

            if attempt < 3:
                time.sleep(2 ** attempt)  # Exponential backoff
                return self.generate_reconciliation_latex(question, model_answer, reference_answer, rationale, attempt + 1)

            print(f" [ERROR] Failed to generate reconciliation after 3 attempts")
            return None
424
+
425
    def process_questions(self, start_idx: int = 0, batch_size: int = 5):
        """Process questions with progress tracking.

        For each row of ``self.df``: get a model answer, write it to a file,
        compare it against the reference answer (exact string match, then a
        first-number fallback), and on mismatch generate (and optionally
        compile) a LaTeX reconciliation document. Results are written into
        ``self.df`` and saved to Excel after every batch.

        Args:
            start_idx: Positional index of the first row to process.
            batch_size: Rows per batch; ``save_results()`` runs after each.
        """
        total = len(self.df)

        with tqdm(total=total, desc="Overall Progress", position=0, leave=True) as pbar_main:
            pbar_main.update(start_idx)

            for i in range(start_idx, total, batch_size):
                batch_end = min(i + batch_size, total)
                print(f"\n{'='*60}")
                print(f"Processing batch: questions {i+1} to {batch_end} of {total}")
                print(f"Using {self.provider} with model {self.model}")
                print(f"{'='*60}")

                # Inner bar counts 3 ticks per question (answer / compare / record).
                batch_size_actual = batch_end - i
                with tqdm(total=batch_size_actual * 3, desc="Current Batch", position=1, leave=False) as pbar_batch:
                    for idx in range(i, batch_end):
                        row = self.df.iloc[idx]
                        # Original row label is used for file names and .at writes.
                        original_idx = self.df.index[idx]

                        # Get question and check for image
                        question = row['question']
                        image_url = self._get_image_for_question(row)

                        if image_url:
                            print(f" Including image for question {original_idx}")

                        # Get model answer
                        print(f" Question {original_idx}: Getting answer from {self.model}...")
                        model_answer = self.get_model_answer(question, image_url)

                        # The [OK]/[FAIL]/[MATCH]/[MISMATCH] tags below are
                        # parsed by the GUI's progress tracker - keep them stable.
                        if model_answer:
                            print(f" [OK] Got answer ({len(model_answer)} chars)")
                        else:
                            print(f" [FAIL] Failed to get answer")
                        pbar_batch.update(1)

                        if model_answer:
                            # Save model answer to file
                            question_id = f"q_{original_idx:04d}"
                            answer_filename = f"{question_id}_answer.txt"
                            answer_path = os.path.join(self.answers_dir, answer_filename)

                            with open(answer_path, 'w', encoding='utf-8') as f:
                                f.write(f"Question: {question}\n\n")
                                f.write(f"Model Answer: {model_answer}\n")

                            self.df.at[original_idx, 'model_answer_file'] = answer_filename

                            # Check if answer matches reference
                            reference_answer = str(row.get('correct_answer', row.get('answer', '')))

                            # Simple string matching (could be enhanced)
                            model_norm = str(model_answer).strip().lower()
                            ref_norm = str(reference_answer).strip().lower()

                            # Check for exact match or numerical equivalence
                            match = (model_norm == ref_norm)
                            if not match and reference_answer:
                                # Try extracting numbers for comparison:
                                # only the FIRST number of each text is compared.
                                import re
                                model_nums = re.findall(r'-?\d+\.?\d*', model_norm)
                                ref_nums = re.findall(r'-?\d+\.?\d*', ref_norm)
                                if model_nums and ref_nums:
                                    match = (model_nums[0] == ref_nums[0])

                            self.df.at[original_idx, 'answer_match'] = 'Yes' if match else 'No'

                            # Print match result for GUI tracking
                            if match:
                                print(f" [MATCH] Answer matches reference")
                            else:
                                print(f" [MISMATCH] Answer differs from reference")

                            # Generate LaTeX reconciliation if mismatch
                            if not match and reference_answer:
                                print(f" Generating reconciliation for question {original_idx}")
                                rationale = row.get('rationale', '')
                                latex_doc = self.generate_reconciliation_latex(
                                    question, model_answer, reference_answer, rationale
                                )

                                # Only save LaTeX if generation was successful
                                if latex_doc:
                                    latex_filename = f"{question_id}_reconciliation.tex"
                                    latex_path = os.path.join(self.latex_dir, latex_filename)
                                    with open(latex_path, 'w', encoding='utf-8') as f:
                                        f.write(latex_doc)

                                    self.df.at[original_idx, 'latex_file'] = latex_filename

                                    # Compile LaTeX if requested
                                    if self.compile_latex:
                                        # Try async compilation first (better on Linux/HF Spaces)
                                        try:
                                            from latex_compiler import compile_latex_async, is_linux
                                            if is_linux():
                                                # Async compilation on Linux - doesn't block
                                                compile_latex_async(
                                                    latex_path,
                                                    self.latex_dir,
                                                    callback=lambda s, p, e: None  # Silent callback
                                                )
                                                print(f" [PDF] Compiling in background: {latex_filename}")
                                            else:
                                                # Fallback to synchronous on Windows
                                                import subprocess
                                                pdf_path = latex_path.replace('.tex', '.pdf')
                                                result = subprocess.run(
                                                    ['pdflatex', '-interaction=nonstopmode', '-output-directory', self.latex_dir, latex_path],
                                                    capture_output=True,
                                                    timeout=30
                                                )
                                                if os.path.exists(pdf_path):
                                                    print(f" [OK] Compiled to PDF: {os.path.basename(pdf_path)}")
                                        except ImportError:
                                            # latex_compiler.py not available, use old method
                                            try:
                                                import subprocess
                                                pdf_path = latex_path.replace('.tex', '.pdf')
                                                result = subprocess.run(
                                                    ['pdflatex', '-interaction=nonstopmode', '-output-directory', self.latex_dir, latex_path],
                                                    capture_output=True,
                                                    timeout=30
                                                )
                                                if os.path.exists(pdf_path):
                                                    print(f" [OK] Compiled to PDF: {os.path.basename(pdf_path)}")
                                            except Exception as e:
                                                print(f" Warning: Could not compile LaTeX: {e}")
                                        except Exception as e:
                                            print(f" Warning: Could not compile LaTeX: {e}")
                                else:
                                    print(f" Failed to generate reconciliation after retries")
                                    self.df.at[original_idx, 'latex_file'] = 'GENERATION_ERROR'

                            pbar_batch.update(2)
                        else:
                            self.df.at[original_idx, 'model_answer_file'] = 'ERROR'
                            self.df.at[original_idx, 'answer_match'] = 'ERROR'
                            pbar_batch.update(2)

                        pbar_main.update(1)
                        time.sleep(0.5)  # Rate limiting

                # Persist after every batch so an interrupted run loses at
                # most one batch of results.
                self.save_results()
                print(f"\nBatch complete. Progress saved to {self.output_file}")

                if batch_end < total:
                    time.sleep(5)
574
+
575
+ def save_results(self):
576
+ """Save results back to Excel"""
577
+ with pd.ExcelWriter(self.output_file, engine='openpyxl') as writer:
578
+ original = pd.ExcelFile(self.excel_file)
579
+
580
+ for sheet_name in original.sheet_names:
581
+ if sheet_name == 'Data':
582
+ original_df = pd.read_excel(self.excel_file, sheet_name='Data')
583
+
584
+ # Update only processed rows
585
+ for idx in self.df.index:
586
+ for col in ['model_answer_file', 'answer_match', 'latex_file',
587
+ 'quality_rating', 'difficulty_level', 'quality_comment']:
588
+ if col in self.df.columns:
589
+ original_df.at[idx, col] = self.df.at[idx, col]
590
+
591
+ original_df.to_excel(writer, sheet_name=sheet_name, index=False)
592
+ else:
593
+ df_other = pd.read_excel(self.excel_file, sheet_name=sheet_name)
594
+ df_other.to_excel(writer, sheet_name=sheet_name, index=False)
595
+
596
+ def run(self):
597
+ """Main execution"""
598
+ # Set default output file if not already set
599
+ if not self.output_file:
600
+ from datetime import datetime
601
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
602
+ base_name = os.path.basename(self.excel_file).replace('.xlsx', '')
603
+ self.output_file = f"{base_name}_validated_{timestamp}.xlsx"
604
+
605
+ print(f"Starting Universal Math Validator")
606
+ print(f" File: {self.excel_file}")
607
+ print(f" Format: {self.file_format}")
608
+ print(f" Provider: {self.provider}")
609
+ print(f" Model: {self.model}")
610
+ print(f" Image handling: {self.include_images}")
611
+ print(f" Output: {self.output_file}")
612
+ print("=" * 60)
613
+
614
+ self.load_data()
615
+ self.process_questions()
616
+
617
+ # Calculate and display summary statistics
618
+ if 'answer_match' in self.df.columns:
619
+ total = len(self.df)
620
+ correct = (self.df['answer_match'] == 'Yes').sum()
621
+ incorrect = (self.df['answer_match'] == 'No').sum()
622
+ errors = (self.df['answer_match'] == 'ERROR').sum()
623
+
624
+ print("\n" + "="*60)
625
+ print("VALIDATION COMPLETE")
626
+ print("="*60)
627
+ print(f"\nTotal questions processed: {total}")
628
+ print(f"Correct answers: {correct} ({correct/total*100:.1f}%)")
629
+ print(f"Incorrect answers: {incorrect} ({incorrect/total*100:.1f}%)")
630
+ if errors > 0:
631
+ print(f"Errors: {errors}")
632
+
633
+ # Count LaTeX files generated
634
+ latex_count = (self.df['latex_file'] != '').sum()
635
+ if latex_count > 0:
636
+ print(f"\nLaTeX reconciliation documents generated: {latex_count}")
637
+ print(f"Location: {self.latex_dir}")
638
+
639
+ print(f"\nResults saved to: {self.output_file}")
640
+ print(f"Model answers saved to: {self.answers_dir}")
641
+ else:
642
+ print("\nValidation Complete!")
643
+ print(f"Results saved to: {self.output_file}")
644
+
645
+
646
if __name__ == "__main__":
    import argparse

    # Command-line entry point: validate one Excel file, optionally a
    # sub-range of questions (used by the parallel runner).
    parser = argparse.ArgumentParser(description='Universal Math Question Validator')
    parser.add_argument('file', help='Excel file to process')
    parser.add_argument('--provider', choices=['openai', 'openrouter'], default='openai',
                        help='API provider to use')
    parser.add_argument('--model', help='Model for solving questions (default: o3-mini)')
    parser.add_argument('--reconciliation-model', help='Model for reconciliation (default: gpt-4o)')
    parser.add_argument('--images', choices=['always', 'never', 'when_needed'],
                        default='when_needed', help='When to include images')
    parser.add_argument('--start', type=int, default=0, help='Start from question index')
    parser.add_argument('--end', type=int, default=None, help='End at question index (for parallel processing)')
    parser.add_argument('--batch-size', type=int, default=5, help='Number of questions per batch')
    parser.add_argument('--output', type=str, default=None, help='Output filename (default: auto-generated)')
    parser.add_argument('--compile-latex', action='store_true', help='Compile LaTeX files to PDF')

    args = parser.parse_args()

    validator = UniversalMathValidator(
        excel_file=args.file,
        provider=args.provider,
        include_images=args.images,
        solver_model=args.model,
        reconciliation_model=args.reconciliation_model
    )

    # Set output filename if provided
    if args.output:
        validator.output_file = args.output
    else:
        # Generate default filename with timestamp; partial runs get a
        # question-range suffix so parallel workers don't collide.
        from datetime import datetime
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        base_name = os.path.basename(args.file).replace('.xlsx', '')
        if args.start > 0 or args.end:
            range_str = f"_q{args.start+1}_q{args.end}" if args.end else f"_from_q{args.start+1}"
        else:
            range_str = ""
        validator.output_file = f"{base_name}_validated_{timestamp}{range_str}.xlsx"

    # Set LaTeX compilation flag
    validator.compile_latex = args.compile_latex

    # Handle parallel processing by limiting range
    if args.end:
        validator.load_data()
        # Filter to specific range for parallel processing
        validator.df = validator.df.iloc[args.start:args.end]
        validator.process_questions(start_idx=0, batch_size=args.batch_size)
    else:
        # NOTE(review): run() calls process_questions() with default
        # arguments, so --batch-size (and --start without --end) is
        # effectively ignored on this path - confirm whether intended.
        validator.run()
validator_gui.py ADDED
@@ -0,0 +1,646 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ """
3
+ Gradio Web Interface for Math Validator
4
+ """
5
+
6
+ import gradio as gr
7
+ import pandas as pd
8
+ import os
9
+ import subprocess
10
+ import sys
11
+ import json
12
+ from datetime import datetime
13
+ import threading
14
+ import queue
15
+ import time
16
+ from dotenv import load_dotenv
17
+
18
+ # Load environment variables from .env file
19
+ load_dotenv()
20
+
21
+ class ValidatorGUI:
22
    def __init__(self):
        """Initialize GUI state, progress counters, and the model catalogs."""
        # Handle to the spawned validator subprocess (None until started).
        self.process = None
        # Queue for subprocess output lines - presumably consumed by the run
        # loop later in this file; verify against run_validation.
        self.output_queue = queue.Queue()
        self.is_running = False
        self.total_questions = 0
        self.math_questions = 0

        # Progress tracking (updated by parse_progress_line)
        self.questions_processed = 0
        self.correct_answers = 0
        self.incorrect_answers = 0
        self.timeouts = 0
        self.errors = 0

        # Model options
        self.openai_models = [
            "o3-mini",
            "gpt-4o",
            "gpt-5",
            "gpt-5-mini",
            "gpt-5-nano",
            "gpt-4-turbo"
        ]

        self.openrouter_models = [
            # Anthropic Claude 4 Series (NEW)
            "anthropic/claude-4-opus",
            "anthropic/claude-4-sonnet",

            # Anthropic Claude 3.5 Series
            "anthropic/claude-3.5-sonnet",
            "anthropic/claude-3-5-sonnet-20241022",
            "anthropic/claude-3-opus",
            "anthropic/claude-3-haiku",

            # xAI Grok Series (including Grok 4)
            "x-ai/grok-4",
            "x-ai/grok-2",
            "x-ai/grok-2-1212",

            # DeepSeek Reasoning Models (NEW)
            "deepseek/deepseek-r1",
            "deepseek/deepseek-v3",
            "deepseek/deepseek-chat",

            # Google Gemini
            "google/gemini-2.0-pro",
            "google/gemini-2.0-flash",
            "google/gemini-pro-1.5",
            "google/gemini-flash-1.5",

            # Baidu ERNIE (NEW)
            "baidu/ernie-4.0-turbo-8k",
            "baidu/ernie-bot-4",

            # Meta Llama
            "meta-llama/llama-3.2-405b",
            "meta-llama/llama-3.1-405b-instruct",

            # Mistral
            "mistralai/mistral-large",
            "mistralai/mixtral-8x22b-instruct"
        ]

        # Combined catalog used to populate the model dropdowns.
        self.all_models = self.openai_models + self.openrouter_models
+
88
+ def get_excel_files(self):
89
+ """Get list of Excel files in current directory"""
90
+ files = [f for f in os.listdir('.') if f.endswith('.xlsx') and not f.endswith('_validated.xlsx')]
91
+ return files
92
+
93
+ def analyze_file(self, file_path):
94
+ """Analyze Excel file and return summary and question count"""
95
+ if not file_path:
96
+ return "No file selected", 0, 0
97
+
98
+ try:
99
+ df = pd.read_excel(file_path, sheet_name='Data')
100
+
101
+ # Store total questions
102
+ self.total_questions = len(df)
103
+
104
+ # Count math questions
105
+ if 'raw_subject' in df.columns:
106
+ math_filter = df['raw_subject'].str.lower().str.contains(
107
+ 'math|statistic|calculus|algebra|geometry|trigonometry',
108
+ na=False, regex=True
109
+ )
110
+ self.math_questions = math_filter.sum()
111
+ else:
112
+ self.math_questions = len(df)
113
+
114
+ # Check for images
115
+ image_count = 0
116
+ if 'file_url' in df.columns:
117
+ image_count = df['file_url'].notna().sum()
118
+
119
+ summary = f"""### File Analysis
120
+
121
+ **File:** {os.path.basename(file_path)}
122
+ **Total rows:** {self.total_questions}
123
+ **Math questions:** {self.math_questions}
124
+ **Questions with images:** {image_count}
125
+
126
+ **Columns found:** {', '.join(df.columns[:10])}{'...' if len(df.columns) > 10 else ''}
127
+
128
+ **Estimated processing time:**
129
+ - Serial: ~{self.math_questions * 30 // 60} minutes
130
+ - Parallel (4 processes): ~{self.math_questions * 30 // (60 * 4)} minutes
131
+ """
132
+ return summary, self.total_questions, self.math_questions
133
+
134
+ except Exception as e:
135
+ return f"Error analyzing file: {str(e)}", 0, 0
136
+
137
+ def validate_config(self, file_path, solver_model, recon_model, num_processes, batch_size):
138
+ """Validate configuration before running"""
139
+ errors = []
140
+
141
+ if not file_path or not os.path.exists(file_path):
142
+ errors.append("Please select a valid Excel file")
143
+
144
+ if not solver_model:
145
+ errors.append("Please select a solver model")
146
+
147
+ if not recon_model:
148
+ errors.append("Please select a reconciliation model")
149
+
150
+ # Check API keys
151
+ needs_openai = solver_model in self.openai_models or recon_model in self.openai_models
152
+ needs_openrouter = solver_model in self.openrouter_models or recon_model in self.openrouter_models
153
+
154
+ if needs_openai and not os.getenv('OPENAI_API_KEY'):
155
+ errors.append("OPENAI_API_KEY not found in environment")
156
+
157
+ if needs_openrouter and not os.getenv('OPENROUTER_API_KEY'):
158
+ errors.append("OPENROUTER_API_KEY not found in environment")
159
+
160
+ return errors
161
+
162
+ def generate_output_filename(self, file_path, start_q, end_q):
163
+ """Generate output filename with timestamp and range"""
164
+ base_name = os.path.basename(file_path).replace('.xlsx', '')
165
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
166
+
167
+ if start_q is not None and end_q is not None and (start_q > 0 or end_q < self.math_questions):
168
+ # Add range to filename
169
+ range_str = f"_q{start_q+1}_q{end_q}"
170
+ else:
171
+ range_str = "_full"
172
+
173
+ return f"{base_name}_validated_{timestamp}{range_str}.xlsx"
174
+
175
+ def parse_progress_line(self, line):
176
+ """Parse output line for progress information"""
177
+ # Parse based on the new [TAG] format
178
+ line_lower = line.lower()
179
+
180
+ if "[ok] got answer" in line_lower and "chars" in line_lower:
181
+ self.questions_processed += 1
182
+ elif "[fail] failed to get answer" in line_lower:
183
+ self.errors += 1
184
+ self.questions_processed += 1 # Still count as processed
185
+ elif "[match]" in line_lower:
186
+ self.correct_answers += 1
187
+ elif "[mismatch]" in line_lower:
188
+ self.incorrect_answers += 1
189
+ elif "[timeout]" in line_lower:
190
+ self.timeouts += 1
191
+ elif "[error]" in line_lower:
192
+ if "failed after" in line_lower:
193
+ self.errors += 1
194
+ elif "[warning]" in line_lower:
195
+ # Just a warning, not an error
196
+ pass
197
+ elif "question" in line_lower and "getting answer from" in line_lower:
198
+ # This indicates a question is starting to be processed
199
+ pass
200
+
201
+ # Also parse parallel processing output
202
+ elif "starting process for questions" in line_lower:
203
+ # Parallel process starting
204
+ pass
205
+ elif "completed range" in line_lower:
206
+ # Parallel process completed a range
207
+ import re
208
+ # Try to extract question count from "Completed range X-Y"
209
+ match = re.search(r'range (\d+)-(\d+)', line_lower)
210
+ if match:
211
+ start, end = int(match.group(1)), int(match.group(2))
212
+ # This is approximate since we don't know exact results
213
+ self.questions_processed = max(self.questions_processed, end)
214
+
215
+ def get_progress_stats(self):
216
+ """Get formatted progress statistics"""
217
+ if self.questions_processed == 0:
218
+ return "Waiting for processing to start..."
219
+
220
+ accuracy = (self.correct_answers / self.questions_processed * 100) if self.questions_processed > 0 else 0
221
+
222
+ return f"""**Progress Stats:**
223
+ - Processed: {self.questions_processed}
224
+ - Correct: {self.correct_answers} ({accuracy:.1f}%)
225
+ - Incorrect: {self.incorrect_answers}
226
+ - Timeouts: {self.timeouts}
227
+ - Errors: {self.errors}
228
+ """
229
+
230
+ def run_validation(self, file_path, solver_model, recon_model, image_mode,
231
+ num_processes, batch_size, start_q, end_q, compile_latex, progress=gr.Progress()):
232
+ """Run the validation process"""
233
+
234
+ # Reset progress counters
235
+ self.questions_processed = 0
236
+ self.correct_answers = 0
237
+ self.incorrect_answers = 0
238
+ self.timeouts = 0
239
+ self.errors = 0
240
+
241
+ # Validate configuration
242
+ errors = self.validate_config(file_path, solver_model, recon_model, num_processes, batch_size)
243
+ if errors:
244
+ return f"### Configuration Errors\n" + "\n".join(f"- {e}" for e in errors), None, ""
245
+
246
+ self.is_running = True
247
+ output_log = []
248
+
249
+ # Generate output filename
250
+ output_file = self.generate_output_filename(file_path, start_q, end_q)
251
+ output_path = os.path.join(os.path.dirname(file_path), output_file)
252
+
253
+ try:
254
+ # Prepare command
255
+ base_cmd = [
256
+ sys.executable, "universal_validator.py", file_path,
257
+ "--model", solver_model,
258
+ "--reconciliation-model", recon_model,
259
+ "--images", image_mode,
260
+ "--batch-size", str(batch_size),
261
+ "--output", output_path
262
+ ]
263
+
264
+ # Add range parameters if specified
265
+ if start_q is not None and start_q >= 0:
266
+ base_cmd.extend(["--start", str(start_q)])
267
+ if end_q is not None and end_q > 0:
268
+ base_cmd.extend(["--end", str(end_q)])
269
+
270
+ # Add LaTeX compilation flag if requested
271
+ if compile_latex:
272
+ base_cmd.append("--compile-latex")
273
+
274
+ # Use parallel processing for larger ranges
275
+ if num_processes > 1 and (end_q - start_q) > 20:
276
+ cmd = [
277
+ sys.executable, "run_parallel.py", file_path,
278
+ "--num-processes", str(num_processes),
279
+ "--solver", solver_model,
280
+ "--reconciler", recon_model,
281
+ "--images", image_mode,
282
+ "--batch-size", str(batch_size),
283
+ "--output", output_path,
284
+ "--start-range", str(start_q),
285
+ "--end-range", str(end_q)
286
+ ]
287
+ if compile_latex:
288
+ cmd.append("--compile-latex")
289
+ print(f"[GUI] Using parallel processing with {num_processes} processes")
290
+ else:
291
+ # Use single process for small ranges
292
+ cmd = base_cmd
293
+ if num_processes > 1 and (end_q - start_q) <= 20:
294
+ print(f"[GUI] Range too small for parallel processing, using single process")
295
+
296
+ # Start process
297
+ progress(0, desc="Starting validation...")
298
+ output_log.append(f"Running: {' '.join(cmd)}\n")
299
+ output_log.append(f"Output file: {output_path}\n")
300
+ output_log.append(f"Question range: {start_q+1} to {end_q}\n\n")
301
+
302
+ print(f"[GUI] Starting subprocess: {' '.join(cmd)}")
303
+
304
+ try:
305
+ self.process = subprocess.Popen(
306
+ cmd,
307
+ stdout=subprocess.PIPE,
308
+ stderr=subprocess.STDOUT,
309
+ text=True,
310
+ bufsize=1,
311
+ universal_newlines=True,
312
+ encoding='utf-8',
313
+ errors='replace'
314
+ )
315
+ print(f"[GUI] Process started with PID: {self.process.pid}")
316
+ except Exception as e:
317
+ error_msg = f"Failed to start validator: {str(e)}"
318
+ print(f"[GUI Error] {error_msg}")
319
+ return error_msg, None, ""
320
+
321
+ # Read output
322
+ lines_processed = 0
323
+ last_update_time = time.time()
324
+
325
+ while True:
326
+ line = self.process.stdout.readline()
327
+ if not line:
328
+ # Check if process is still running
329
+ if self.process.poll() is not None:
330
+ break
331
+ time.sleep(0.1)
332
+ continue
333
+
334
+ output_log.append(line)
335
+ self.parse_progress_line(line)
336
+
337
+ # Debug: Print every line to see what's happening
338
+ print(f"[GUI Debug] {line.strip()}")
339
+
340
+ # Update progress based on output
341
+ if "processing batch" in line.lower() or "question" in line.lower():
342
+ lines_processed += 1
343
+ if self.math_questions > 0 and self.questions_processed > 0:
344
+ actual_progress = min(self.questions_processed / (end_q - start_q), 1.0)
345
+ progress(actual_progress, desc=f"Processing question {self.questions_processed}/{end_q - start_q}")
346
+
347
+ # Yield intermediate results with stats every 2 seconds or every 5 lines
348
+ current_time = time.time()
349
+ if lines_processed % 5 == 0 or (current_time - last_update_time) > 2:
350
+ stats = self.get_progress_stats()
351
+ output_text = stats + "\n\n" + "="*60 + "\n" + "".join(output_log[-50:])
352
+ yield output_text, None, stats
353
+ last_update_time = current_time
354
+
355
+ self.process.wait()
356
+
357
+ # Get final results
358
+ final_stats = self.get_progress_stats()
359
+ output_text = f"### Validation Complete\n\n{final_stats}\n\n" + "="*60 + "\n\nFull Log:\n" + "".join(output_log[-200:])
360
+
361
+ # Check if output file exists
362
+ if os.path.exists(output_path):
363
+ return output_text, output_path, final_stats
364
+ else:
365
+ # Try original naming convention as fallback
366
+ fallback_path = file_path.replace('.xlsx', '_validated.xlsx')
367
+ if os.path.exists(fallback_path):
368
+ return output_text, fallback_path, final_stats
369
+ return output_text, None, final_stats
370
+
371
+ except Exception as e:
372
+ stats = self.get_progress_stats()
373
+ return f"Error: {str(e)}\n\n{stats}\n\n{''.join(output_log)}", None, stats
374
+ finally:
375
+ self.is_running = False
376
+ self.process = None
377
+
378
+ def stop_validation(self):
379
+ """Stop the running validation"""
380
+ if self.process:
381
+ self.process.terminate()
382
+ time.sleep(1)
383
+ if self.process.poll() is None:
384
+ self.process.kill()
385
+ return "Validation stopped"
386
+ return "No validation running"
387
+
388
    def create_interface(self):
        """Create and return the Gradio Blocks interface.

        Three tabs: "Validation" (file/model/range selection and the run
        controls), "Configuration" (static help plus an API-key status
        check evaluated once at build time), and "Results Analysis"
        (static usage notes).
        """

        with gr.Blocks(title="Math Validator", theme=gr.themes.Soft()) as interface:
            gr.Markdown("# Math Question Validator")
            gr.Markdown("Web interface for validating mathematical questions and answers")

            with gr.Tab("Validation"):
                with gr.Row():
                    with gr.Column(scale=1):
                        # File selection: choices come from scanning the
                        # working directory for .xlsx files
                        file_dropdown = gr.Dropdown(
                            choices=self.get_excel_files(),
                            label="Select Excel File",
                            value=self.get_excel_files()[0] if self.get_excel_files() else None
                        )

                        refresh_btn = gr.Button("🔄 Refresh Files", size="sm")

                        file_info = gr.Markdown("Select a file to see analysis")

                        # Question range selection (dynamically updated when
                        # a file is analyzed)
                        gr.Markdown("### Question Range")
                        with gr.Row():
                            start_question = gr.Number(
                                label="Start Question",
                                value=1,
                                minimum=1,
                                step=1,
                                info="First question to process"
                            )
                            end_question = gr.Number(
                                label="End Question",
                                value=100,
                                minimum=1,
                                step=1,
                                info="Last question to process"
                            )

                        use_all_questions = gr.Checkbox(
                            label="Process all questions",
                            value=True,
                            info="Uncheck to specify custom range"
                        )

                    with gr.Column(scale=2):
                        with gr.Row():
                            # Model selection; "(recommended)" suffixes are
                            # stripped before being passed to the validator
                            solver_dropdown = gr.Dropdown(
                                choices=["o3-mini (recommended)"] + self.all_models,
                                value="o3-mini (recommended)",
                                label="Solver Model",
                                info="Model for answering questions"
                            )

                            recon_dropdown = gr.Dropdown(
                                choices=["gpt-4o (recommended)"] + self.all_models,
                                value="gpt-4o (recommended)",
                                label="Reconciliation Model",
                                info="Model for comparing answers"
                            )

                        with gr.Row():
                            image_mode = gr.Radio(
                                choices=["when_needed", "always", "never"],
                                value="when_needed",
                                label="Image Handling",
                                info="When to include images with questions"
                            )

                            parallel_slider = gr.Slider(
                                minimum=1,
                                maximum=8,
                                value=1,
                                step=1,
                                label="Parallel Processes",
                                info="Number of concurrent processes (1 = serial)"
                            )

                            batch_slider = gr.Slider(
                                minimum=1,
                                maximum=20,
                                value=5,
                                step=1,
                                label="Batch Size",
                                info="Questions per batch"
                            )

                        # LaTeX compilation option
                        compile_latex = gr.Checkbox(
                            label="Compile LaTeX reconciliation documents to PDF",
                            value=False,
                            info="Requires pdflatex installed (slower but produces PDFs)"
                        )

                with gr.Row():
                    run_btn = gr.Button("▶️ Start Validation", variant="primary", size="lg")
                    stop_btn = gr.Button("⏹️ Stop", variant="stop", size="lg")

                # Output section with progress stats
                progress_stats = gr.Markdown("**Progress:** Waiting to start...")

                output_text = gr.Textbox(
                    label="Validation Output",
                    lines=20,
                    max_lines=30,
                    value="Click 'Start Validation' to begin..."
                )

                # Hidden until a results file exists to download
                output_file = gr.File(
                    label="Download Results",
                    visible=False
                )

                # Event handlers (defined inside the Blocks context so they
                # can close over self and the components above)
                def update_file_info(file_path):
                    # Re-analyze the chosen workbook; also resizes the
                    # end-question input to the file's question count
                    if file_path:
                        full_path = os.path.join(os.getcwd(), file_path)
                        summary, total, math_q = self.analyze_file(full_path)
                        # Update end question to match file
                        return summary, gr.Number(value=math_q, maximum=math_q)
                    return "No file selected", gr.Number(value=100)

                def refresh_files():
                    # Re-scan the working directory for .xlsx files
                    files = self.get_excel_files()
                    return gr.Dropdown(choices=files, value=files[0] if files else None)

                def clean_model_name(model):
                    # Remove "(recommended)" suffix if present
                    if "(recommended)" in model:
                        return model.split(" (")[0]
                    return model

                def toggle_range_inputs(use_all):
                    # Enable/disable range inputs based on checkbox
                    return gr.Number(interactive=not use_all), gr.Number(interactive=not use_all)

                def run_with_clean_models(file_path, solver, recon, images, parallel, batch,
                                          use_all, start_q, end_q, compile_tex):
                    # Generator handler: forwards run_validation's streaming
                    # updates to (output_text, output_file, progress_stats)
                    solver_clean = clean_model_name(solver)
                    recon_clean = clean_model_name(recon)

                    if file_path:
                        full_path = os.path.join(os.getcwd(), file_path)

                        # Adjust question range (convert to 0-indexed)
                        if use_all:
                            actual_start = 0
                            actual_end = self.math_questions
                        else:
                            actual_start = max(0, int(start_q) - 1)  # Convert to 0-indexed
                            actual_end = min(self.math_questions, int(end_q))

                        # Run validation with progress updates
                        for result in self.run_validation(
                            full_path, solver_clean, recon_clean, images, parallel, batch,
                            actual_start, actual_end, compile_tex
                        ):
                            if len(result) == 3:
                                result_text, result_file, stats = result
                                if result_file:
                                    # Reveal the download widget once a file exists
                                    yield result_text, gr.File(value=result_file, visible=True), stats
                                else:
                                    yield result_text, gr.File(visible=False), stats
                            else:
                                yield result[0], gr.File(visible=False), result[1] if len(result) > 1 else ""
                    else:
                        yield "No file selected", gr.File(visible=False), ""

                file_dropdown.change(update_file_info, inputs=[file_dropdown],
                                     outputs=[file_info, end_question])
                refresh_btn.click(refresh_files, outputs=[file_dropdown])

                # Toggle range inputs when checkbox changes
                use_all_questions.change(toggle_range_inputs, inputs=[use_all_questions],
                                         outputs=[start_question, end_question])

                run_btn.click(
                    run_with_clean_models,
                    inputs=[file_dropdown, solver_dropdown, recon_dropdown,
                            image_mode, parallel_slider, batch_slider,
                            use_all_questions, start_question, end_question, compile_latex],
                    outputs=[output_text, output_file, progress_stats]
                )

                stop_btn.click(self.stop_validation, outputs=[output_text])

            with gr.Tab("Configuration"):
                gr.Markdown("""
                ### API Configuration

                Make sure you have the required API keys set as environment variables:

                - **OPENAI_API_KEY**: Required for OpenAI models (o3-mini, GPT-5, GPT-4o)
                - **OPENROUTER_API_KEY**: Required for Claude, Grok, Gemini, and other models

                ### Model Recommendations

                **For best results:**
                - Solver: o3-mini (best accuracy)
                - Reconciliation: gpt-4o (fast and reliable)

                **For speed:**
                - Use 4-6 parallel processes
                - Batch size of 5-10

                **For GPT-5 testing:**
                - Use gpt-5-mini (faster than gpt-5)
                - Use gpt-4o for reconciliation (GPT-5 has timeout issues)
                """)

                # Check current configuration (evaluated once, when the
                # interface is built -- not refreshed at runtime)
                config_status = []
                if os.getenv('OPENAI_API_KEY'):
                    config_status.append("✅ OPENAI_API_KEY is set")
                else:
                    config_status.append("❌ OPENAI_API_KEY is not set")

                if os.getenv('OPENROUTER_API_KEY'):
                    config_status.append("✅ OPENROUTER_API_KEY is set")
                else:
                    config_status.append("❌ OPENROUTER_API_KEY is not set")

                gr.Markdown("### Current Status\n" + "\n".join(config_status))

            with gr.Tab("Results Analysis"):
                gr.Markdown("""
                ### How to Analyze Results

                After validation completes:

                1. **Download the validated Excel file** - Contains all results
                2. **Check the latex_documents folder** - Contains reconciliation documents
                3. **Run analysis scripts:**
                   - `python analyze_reconciliations.py` - Analyze which answers were vindicated
                   - `python summarize_results.py` - Get overall statistics

                ### Understanding Results

                - **answer_match = Yes**: Model answer matches reference
                - **answer_match = No**: Mismatch (see LaTeX reconciliation)
                - **latex_file**: Path to detailed reconciliation document
                - **model_answer_file**: Path to model's complete response
                """)

        return interface
634
+
635
def main():
    """Build the validator GUI and serve it locally on port 7860."""
    app = ValidatorGUI().create_interface()
    launch_opts = {
        "share": False,              # keep the app local-only
        "server_name": "127.0.0.1",
        "server_port": 7860,
        "inbrowser": True,           # pop open the default browser
    }
    app.launch(**launch_opts)


if __name__ == "__main__":
    main()