""" MathTok Benchmark Runner Evaluates the MathTok pipeline against baseline tokenizers on a curated dataset of mathematical expressions and mixed text+math problems. Usage ───── python -m evaluation.benchmark # run full benchmark python -m evaluation.benchmark --quick # 20 examples only python -m evaluation.benchmark --json # JSON output python -m evaluation.benchmark --baselines # include GPT-2 baseline """ from __future__ import annotations import argparse import json import logging import time from pathlib import Path from typing import Callable from mathtok.pipeline import MathTokPipeline from .metrics import ( EvaluationReport, MetricResult, structural_compression_ratio, canonical_consistency_score, operator_preservation_score, token_stability, tree_depth_fidelity, make_gpt2_tokenizer, tokenize_character_level, ) logger = logging.getLogger(__name__) _DATASET_PATH = Path(__file__).parent / "datasets" / "sample_problems.json" # ── Dataset loading ─────────────────────────────────────────────────────── def load_dataset(path: Path = _DATASET_PATH) -> dict: """Load the benchmark dataset JSON.""" with open(path, "r", encoding="utf-8") as f: return json.load(f) # ── Benchmark runner ────────────────────────────────────────────────────── class MathTokBenchmark: """ Run all five evaluation metrics on the benchmark dataset. Parameters ---------- pipeline : MathTokPipeline to evaluate dataset : loaded benchmark dict (from load_dataset()) max_n : maximum number of examples to evaluate (None = all) """ def __init__( self, pipeline: MathTokPipeline, dataset: dict, max_n: int | None = None, ) -> None: self.pipeline = pipeline self.dataset = dataset self.max_n = max_n def run(self) -> EvaluationReport: """Run all five metrics and return an EvaluationReport.""" ds = self.dataset # Slice if max_n is set exprs = ds.get("expressions", [])[:self.max_n] eq_pairs = ds.get("equivalent_pairs", [])[:self.max_n] expr_groups = ds.get("rewriting_groups", [])[:self.max_n] mixed = ds.get("mixed_text_math", [])[:self.max_n] # Build the primary tokenizer function def tokenize(text: str) -> list[str]: return self.pipeline.encode(text).tokens def tokenize_math(expr: str) -> list[str]: return self.pipeline.encode_math_only(expr).tokens print(f"Running MathTok benchmark on {len(exprs)} expressions...") t0 = time.time() # ── SCR ────────────────────────────────────────────────────────── print(" Computing SCR...") tok_lengths = [] for expr in exprs: try: out = self.pipeline.encode_math_only(expr) tok_lengths.append(len(out.tokens)) except Exception: tok_lengths.append(0) scr = structural_compression_ratio(exprs, tok_lengths) # ── CCS ────────────────────────────────────────────────────────── print(" Computing CCS...") ccs = canonical_consistency_score(eq_pairs, tokenize_math) # ── OPS ────────────────────────────────────────────────────────── print(" Computing OPS...") ops = operator_preservation_score(exprs, tokenize_math) # ── TS ─────────────────────────────────────────────────────────── print(" Computing TS...") ts = token_stability(expr_groups, tokenize_math) # ── TDF ────────────────────────────────────────────────────────── print(" Computing TDF...") tdf = tree_depth_fidelity(exprs, self.pipeline.encode_math_only) elapsed = time.time() - t0 print(f" Done in {elapsed:.1f}s") return EvaluationReport( scr=scr, ccs=ccs, ops=ops, ts=ts, tdf=tdf, num_examples=len(exprs), ) def run_baseline_comparison(self, baseline_name: str = "gpt2") -> dict: """ Compare MathTok against a baseline tokenizer on SCR and CCS. Returns a dict with 'mathtok' and 'baseline' results. """ ds = self.dataset exprs = ds.get("expressions", [])[:self.max_n] eq_pairs = ds.get("equivalent_pairs", [])[:self.max_n] if baseline_name == "gpt2": baseline_fn = make_gpt2_tokenizer() elif baseline_name == "char": baseline_fn = tokenize_character_level else: raise ValueError(f"Unknown baseline: {baseline_name}") def mathtok_fn(expr: str) -> list[str]: return self.pipeline.encode_math_only(expr).tokens # MathTok metrics mt_tok_lengths = [len(mathtok_fn(e)) for e in exprs] mt_scr = structural_compression_ratio(exprs, mt_tok_lengths) mt_ccs = canonical_consistency_score(eq_pairs, mathtok_fn) # Baseline metrics bl_tok_lengths = [] for e in exprs: try: bl_tok_lengths.append(len(baseline_fn(e))) except Exception: bl_tok_lengths.append(0) bl_scr = structural_compression_ratio(exprs, bl_tok_lengths) bl_ccs = canonical_consistency_score(eq_pairs, baseline_fn) return { "mathtok": {"SCR": mt_scr.value, "CCS": mt_ccs.value}, "baseline": {"name": baseline_name, "SCR": bl_scr.value, "CCS": bl_ccs.value}, } # ── CLI ─────────────────────────────────────────────────────────────────── def main() -> None: logging.basicConfig(level=logging.WARNING) parser = argparse.ArgumentParser(description="MathTok Benchmark Runner") parser.add_argument("--quick", action="store_true", help="Run on first 20 examples only") parser.add_argument("--json", action="store_true", help="Output JSON") parser.add_argument("--baselines", action="store_true", help="Include GPT-2 baseline comparison") parser.add_argument("--dataset", default=str(_DATASET_PATH), help="Dataset JSON path") args = parser.parse_args() dataset = load_dataset(Path(args.dataset)) pipeline = MathTokPipeline() max_n = 20 if args.quick else None bench = MathTokBenchmark(pipeline, dataset, max_n=max_n) report = bench.run() if args.json: result = report.to_dict() if args.baselines: result["baseline_comparison"] = bench.run_baseline_comparison("char") print(json.dumps(result, indent=2)) else: print(report.summary()) if args.baselines: comp = bench.run_baseline_comparison("char") print("\nBaseline comparison (char-level):") print(f" MathTok SCR={comp['mathtok']['SCR']:.4f} CCS={comp['mathtok']['CCS']:.4f}") print(f" CharLvl SCR={comp['baseline']['SCR']:.4f} CCS={comp['baseline']['CCS']:.4f}") if __name__ == "__main__": main()