Initial clean MathTok release

edede4c 5 days ago

8.01 kB

	"""
	MathTok Benchmark Runner

	Evaluates the MathTok pipeline against baseline tokenizers on a curated
	dataset of mathematical expressions and mixed text+math problems.

	Usage
	─────
	python -m evaluation.benchmark # run full benchmark
	python -m evaluation.benchmark --quick # 20 examples only
	python -m evaluation.benchmark --json # JSON output
	python -m evaluation.benchmark --baselines # include character-level baseline
	"""

	from __future__ import annotations

	import argparse
	import json
	import logging
	import time
	from pathlib import Path
	from typing import Callable

	from mathtok.pipeline import MathTokPipeline
	from .metrics import (
	EvaluationReport, MetricResult,
	structural_compression_ratio,
	canonical_consistency_score,
	operator_preservation_score,
	token_stability,
	tree_depth_fidelity,
	make_gpt2_tokenizer,
	tokenize_character_level,
	)

	# Module-level logger, named after this module.
	logger = logging.getLogger(__name__)

	# Default dataset location: datasets/sample_problems.json in this file's directory.
	_DATASET_PATH = Path(__file__).parent / "datasets" / "sample_problems.json"


	# ── Dataset loading ───────────────────────────────────────────────────────

	def load_dataset(path: Path = _DATASET_PATH) -> dict:
	    """Read the benchmark dataset file at *path* and return the parsed JSON.

	    The file is opened as UTF-8 text. When *path* is omitted, the
	    module-level default dataset path is used.
	    """
	    with open(path, mode="r", encoding="utf-8") as handle:
	        parsed = json.load(handle)
	    return parsed


	# ── Benchmark runner ──────────────────────────────────────────────────────

	class MathTokBenchmark:
	    """
	    Run the five MathTok evaluation metrics on a benchmark dataset.

	    The metrics computed by ``run()`` are SCR, CCS, OPS, TS and TDF; each is
	    delegated to the corresponding function imported from ``.metrics``.

	    Parameters
	    ----------
	    pipeline : MathTokPipeline to evaluate
	    dataset : loaded benchmark dict (from load_dataset())
	    max_n : maximum number of examples to evaluate (None = all)
	    """

	    def __init__(
	        self,
	        pipeline: MathTokPipeline,
	        dataset: dict,
	        max_n: int | None = None,
	    ) -> None:
	        self.pipeline = pipeline
	        self.dataset = dataset
	        self.max_n = max_n

	    def _section(self, key: str) -> list:
	        """Return the first ``max_n`` entries of dataset section *key*.

	        A missing section yields an empty list; ``max_n=None`` keeps every
	        entry because ``seq[:None]`` is the full sequence.
	        """
	        return self.dataset.get(key, [])[:self.max_n]

	    @staticmethod
	    def _safe_token_lengths(
	        exprs: list[str],
	        tokenize: Callable[[str], list[str]],
	    ) -> list[int]:
	        """Return the token count of each expression under *tokenize*.

	        An expression whose tokenization raises is recorded as length 0 so
	        that one bad example does not abort the whole benchmark. The failure
	        is logged as a warning instead of being silently discarded.
	        """
	        lengths: list[int] = []
	        for expr in exprs:
	            try:
	                lengths.append(len(tokenize(expr)))
	            except Exception:
	                # Deliberate best-effort: keep going, but leave a trace.
	                logger.warning(
	                    "Tokenization failed for %r; counting it as 0 tokens",
	                    expr,
	                    exc_info=True,
	                )
	                lengths.append(0)
	        return lengths

	    def _tokenize_math(self, expr: str) -> list[str]:
	        """Tokenize *expr* with the pipeline's math-only encoder."""
	        return self.pipeline.encode_math_only(expr).tokens

	    def run(self) -> EvaluationReport:
	        """Run all five metrics and return an EvaluationReport.

	        Reads the ``expressions``, ``equivalent_pairs`` and
	        ``rewriting_groups`` dataset sections (each truncated to ``max_n``)
	        and prints progress messages to stdout while computing.
	        """
	        exprs = self._section("expressions")
	        eq_pairs = self._section("equivalent_pairs")
	        expr_groups = self._section("rewriting_groups")
	        tokenize_math = self._tokenize_math

	        print(f"Running MathTok benchmark on {len(exprs)} expressions...")
	        # perf_counter is monotonic, so the elapsed time cannot be skewed by
	        # system clock adjustments.
	        t0 = time.perf_counter()

	        # ── SCR ──────────────────────────────────────────────────────────
	        print(" Computing SCR...")
	        tok_lengths = self._safe_token_lengths(exprs, tokenize_math)
	        scr = structural_compression_ratio(exprs, tok_lengths)

	        # ── CCS ──────────────────────────────────────────────────────────
	        print(" Computing CCS...")
	        ccs = canonical_consistency_score(eq_pairs, tokenize_math)

	        # ── OPS ──────────────────────────────────────────────────────────
	        print(" Computing OPS...")
	        ops = operator_preservation_score(exprs, tokenize_math)

	        # ── TS ───────────────────────────────────────────────────────────
	        print(" Computing TS...")
	        ts = token_stability(expr_groups, tokenize_math)

	        # ── TDF ──────────────────────────────────────────────────────────
	        print(" Computing TDF...")
	        # TDF is handed the encoder itself (full encode output), not the
	        # token-list wrapper used by the other metrics.
	        tdf = tree_depth_fidelity(exprs, self.pipeline.encode_math_only)

	        elapsed = time.perf_counter() - t0
	        print(f" Done in {elapsed:.1f}s")

	        return EvaluationReport(
	            scr=scr, ccs=ccs, ops=ops, ts=ts, tdf=tdf,
	            num_examples=len(exprs),
	        )

	    def run_baseline_comparison(self, baseline_name: str = "gpt2") -> dict:
	        """
	        Compare MathTok against a baseline tokenizer on SCR and CCS.

	        Parameters
	        ----------
	        baseline_name : "gpt2" (built via make_gpt2_tokenizer()) or "char"
	            (tokenize_character_level)

	        Returns a dict with 'mathtok' and 'baseline' results.

	        Raises ValueError if baseline_name is not one of the known names.
	        """
	        exprs = self._section("expressions")
	        eq_pairs = self._section("equivalent_pairs")

	        if baseline_name == "gpt2":
	            baseline_fn = make_gpt2_tokenizer()
	        elif baseline_name == "char":
	            baseline_fn = tokenize_character_level
	        else:
	            raise ValueError(f"Unknown baseline: {baseline_name}")

	        mathtok_fn = self._tokenize_math

	        # MathTok metrics. Lengths use the same guarded counting as run(),
	        # so a single failing expression cannot abort the comparison.
	        mt_tok_lengths = self._safe_token_lengths(exprs, mathtok_fn)
	        mt_scr = structural_compression_ratio(exprs, mt_tok_lengths)
	        mt_ccs = canonical_consistency_score(eq_pairs, mathtok_fn)

	        # Baseline metrics
	        bl_tok_lengths = self._safe_token_lengths(exprs, baseline_fn)
	        bl_scr = structural_compression_ratio(exprs, bl_tok_lengths)
	        bl_ccs = canonical_consistency_score(eq_pairs, baseline_fn)

	        return {
	            "mathtok": {"SCR": mt_scr.value, "CCS": mt_ccs.value},
	            "baseline": {"name": baseline_name, "SCR": bl_scr.value, "CCS": bl_ccs.value},
	        }


	# ── CLI ───────────────────────────────────────────────────────────────────

	def main(argv: list[str] | None = None) -> None:
	    """Command-line entry point: run the benchmark and print the results.

	    Parameters
	    ----------
	    argv : argument list to parse; None (the default) lets argparse read
	        the process command line.

	    With ``--json`` the report is printed to stdout as a single JSON
	    document; otherwise the report's text summary is printed. With
	    ``--baselines`` a character-level baseline comparison is added.
	    """
	    # Function-scope imports keep this entry point self-contained.
	    import contextlib
	    import sys

	    logging.basicConfig(level=logging.WARNING)
	    parser = argparse.ArgumentParser(description="MathTok Benchmark Runner")
	    parser.add_argument("--quick", action="store_true", help="Run on first 20 examples only")
	    parser.add_argument("--json", action="store_true", help="Output JSON")
	    # The comparison below runs the "char" baseline, so the help says so.
	    parser.add_argument("--baselines", action="store_true", help="Include character-level baseline comparison")
	    parser.add_argument("--dataset", default=str(_DATASET_PATH), help="Dataset JSON path")
	    args = parser.parse_args(argv)

	    dataset = load_dataset(Path(args.dataset))
	    pipeline = MathTokPipeline()
	    max_n = 20 if args.quick else None

	    bench = MathTokBenchmark(pipeline, dataset, max_n=max_n)

	    # bench.run() prints progress lines to stdout. In --json mode send them
	    # to stderr so that stdout carries nothing but the JSON document.
	    if args.json:
	        progress_scope = contextlib.redirect_stdout(sys.stderr)
	    else:
	        progress_scope = contextlib.nullcontext()
	    with progress_scope:
	        report = bench.run()

	    if args.json:
	        result = report.to_dict()
	        if args.baselines:
	            result["baseline_comparison"] = bench.run_baseline_comparison("char")
	        print(json.dumps(result, indent=2))
	    else:
	        print(report.summary())
	        if args.baselines:
	            comp = bench.run_baseline_comparison("char")
	            print("\nBaseline comparison (char-level):")
	            print(f" MathTok SCR={comp['mathtok']['SCR']:.4f} CCS={comp['mathtok']['CCS']:.4f}")
	            print(f" CharLvl SCR={comp['baseline']['SCR']:.4f} CCS={comp['baseline']['CCS']:.4f}")


	# Run the CLI only when this module is executed as a script, not on import.
	if __name__ == "__main__":
	    main()