mathtok / evaluation /visualize.py
SurweeshSP's picture
Initial clean MathTok release
edede4c
"""
Visualization Script for MathTok Evaluation Results
===================================================
Generates visual charts from the benchmark comparison results, making
it easy to understand the performance differences in Semantic Compression Ratio (SCR),
Canonical Consistency Score (CCS), and more.
Usage:
python -m evaluation.visualize
"""
import json
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
_RESULTS_DIR = Path(__file__).parent / "results"
def load_summary():
summary_path = _RESULTS_DIR / "comparison_summary.json"
if not summary_path.exists():
raise FileNotFoundError(f"Results summary not found at {summary_path}. Run comparison.py first.")
with open(summary_path, "r", encoding="utf-8") as f:
return json.load(f)
def load_jsonl_results():
results_path = _RESULTS_DIR / "comparison_results.jsonl"
records = []
if not results_path.exists():
return records
with open(results_path, "r", encoding="utf-8") as f:
for line in f:
records.append(json.loads(line))
return records
def plot_aggregated_scr(summary):
"""Plot the overall mean Semantic Compression Ratio."""
fig, ax = plt.subplots(figsize=(8, 6))
models = ["Char-level", "GPT-2", "SentencePiece", "MathTok"]
scrs = [
summary.get("charlevel_mean_scr", 0),
summary.get("gpt2_scr", 0),
summary.get("sentencepiece_mean_scr", 0),
summary.get("mathtok_mean_scr", 0)
]
# Filter out missing models (like GPT-2 if not run)
valid_models = []
valid_scrs = []
colors = []
all_models = [("Char-level", scrs[0], "#EF4444"),
("GPT-2", scrs[1], "#6B7280"),
("SentencePiece", scrs[2], "#3B82F6"),
("MathTok", scrs[3], "#10B981")]
for m, s, c in all_models:
if s is not None and s > 0:
valid_models.append(m)
valid_scrs.append(s)
colors.append(c)
sns.barplot(x=valid_models, y=valid_scrs, palette=colors, ax=ax)
ax.set_title("Mean Semantic Compression Ratio (SCR)\n(Higher is Better)", fontsize=14, fontweight='bold', pad=15)
ax.set_ylabel("SCR (Structural Score / Tokens)", fontsize=12)
sns.despine(ax=ax)
# Add value labels
for i, v in enumerate(valid_scrs):
ax.text(i, v + 0.02, f"{v:.3f}", ha='center', fontweight='bold', fontsize=11)
plt.tight_layout()
out_path = _RESULTS_DIR / "scr_comparison.png"
plt.savefig(out_path, dpi=300)
print(f"Saved {out_path}")
plt.close()
def plot_category_scr(records):
"""Plot SCR breakdown by category."""
data = []
for r in records:
cat = r["category"]
if "mixed" in cat or "latex_vs_ascii" in cat:
continue # Focus on standard mathematical metrics for SCR
data.append({"Category": cat, "Model": "MathTok", "SCR": r["mathtok"]["raw_scr"]})
data.append({"Category": cat, "Model": "Char-level", "SCR": r["char_level"]["raw_scr"]})
if r.get("gpt2") and r["gpt2"].get("raw_scr") is not None:
data.append({"Category": cat, "Model": "GPT-2", "SCR": r["gpt2"]["raw_scr"]})
if r.get("sentencepiece") and r["sentencepiece"].get("raw_scr") is not None:
data.append({"Category": cat, "Model": "SentencePiece", "SCR": r["sentencepiece"]["raw_scr"]})
if not data:
return
df = pd.DataFrame(data)
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=df, x="Category", y="SCR", hue="Model",
palette={"MathTok": "#10B981", "GPT-2": "#6B7280", "SentencePiece": "#3B82F6", "Char-level": "#EF4444"},
errorbar=None, ax=ax)
ax.set_title("Semantic Compression Ratio by Category", fontsize=14, fontweight='bold', pad=15)
ax.set_ylabel("Mean SCR", fontsize=12)
ax.set_xlabel("Expression Category", fontsize=12)
sns.despine(ax=ax)
plt.xticks(rotation=15)
plt.legend(title="Tokenizer")
plt.tight_layout()
out_path = _RESULTS_DIR / "scr_by_category.png"
plt.savefig(out_path, dpi=300)
print(f"Saved {out_path}")
plt.close()
def plot_token_counts(summary):
"""Plot total token counts as a bar chart to show efficiency."""
per_record = summary.get("per_record", [])
if not per_record:
return
# We'll just plot the first 15 for readability
subset = per_record[:15]
df_data = []
for i, r in enumerate(subset):
expr_short = r["expression"][:15] + ".." if len(r["expression"]) > 15 else r["expression"]
df_data.append({"Expression": expr_short, "Model": "MathTok", "Tokens": r["mt_tokens"], "Order": i})
df_data.append({"Expression": expr_short, "Model": "Char-level", "Tokens": r["ch_tokens"], "Order": i})
if r.get("gp_tokens"):
df_data.append({"Expression": expr_short, "Model": "GPT-2", "Tokens": r["gp_tokens"], "Order": i})
if r.get("sp_tokens"):
df_data.append({"Expression": expr_short, "Model": "SentencePiece", "Tokens": r["sp_tokens"], "Order": i})
df = pd.DataFrame(df_data)
fig, ax = plt.subplots(figsize=(12, 6))
# Sort by original order
df = df.sort_values("Order")
sns.barplot(data=df, x="Expression", y="Tokens", hue="Model",
palette={"MathTok": "#10B981", "GPT-2": "#6B7280", "SentencePiece": "#3B82F6", "Char-level": "#EF4444"}, ax=ax)
ax.set_title("Token Counts per Expression (Fewer is usually better, but SCR is the true metric)", fontsize=14, fontweight='bold', pad=15)
ax.set_ylabel("Number of Tokens", fontsize=12)
sns.despine(ax=ax)
plt.xticks(rotation=45, ha='right')
plt.legend(title="Tokenizer")
plt.tight_layout()
out_path = _RESULTS_DIR / "token_counts_sample.png"
plt.savefig(out_path, dpi=300)
print(f"Saved {out_path}")
plt.close()
def plot_semantic_density(records):
"""Plot the overall mean Semantic Density."""
ch_dens = [r["char_level"]["semantic_density"] for r in records if r.get("char_level")]
gp_dens = [r["gpt2"]["semantic_density"] for r in records if r.get("gpt2") and r["gpt2"].get("semantic_density") is not None]
sp_dens = [r["sentencepiece"]["semantic_density"] for r in records if r.get("sentencepiece") and r["sentencepiece"].get("semantic_density") is not None]
mt_dens = [r["mathtok"]["semantic_density"] for r in records if r.get("mathtok")]
mean_ch = sum(ch_dens) / len(ch_dens) if ch_dens else 0.0
mean_gp = sum(gp_dens) / len(gp_dens) if gp_dens else 0.0
mean_sp = sum(sp_dens) / len(sp_dens) if sp_dens else 0.0
mean_mt = sum(mt_dens) / len(mt_dens) if mt_dens else 0.0
valid_models = []
valid_dens = []
colors = []
all_models = [("Char-level", mean_ch, "#EF4444"),
("GPT-2", mean_gp, "#6B7280"),
("SentencePiece", mean_sp, "#3B82F6"),
("MathTok", mean_mt, "#10B981")]
for model, val, color in all_models:
if val > 0:
valid_models.append(model)
valid_dens.append(val)
colors.append(color)
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(x=valid_models, y=valid_dens, palette=colors, ax=ax)
ax.set_title("Mean Semantic Density\n(Ratio of Math-Centric Tokens to Total Tokens)", fontsize=14, fontweight='bold', pad=15)
ax.set_ylabel("Semantic Density Score (Higher is Better)", fontsize=12)
sns.despine(ax=ax)
for i, v in enumerate(valid_dens):
ax.text(i, v + 0.01, f"{v:.3f}", ha='center', fontweight='bold', fontsize=11)
plt.tight_layout()
out_path = _RESULTS_DIR / "semantic_density_comparison.png"
plt.savefig(out_path, dpi=300)
print(f"Saved {out_path}")
plt.close()
def plot_structural_efficiency(records):
"""Plot the overall mean Structural Efficiency."""
ch_eff = [r["char_level"]["structural_efficiency"] for r in records if r.get("char_level")]
gp_eff = [r["gpt2"]["structural_efficiency"] for r in records if r.get("gpt2") and r["gpt2"].get("structural_efficiency") is not None]
sp_eff = [r["sentencepiece"]["structural_efficiency"] for r in records if r.get("sentencepiece") and r["sentencepiece"].get("structural_efficiency") is not None]
mt_eff = [r["mathtok"]["structural_efficiency"] for r in records if r.get("mathtok")]
mean_ch = sum(ch_eff) / len(ch_eff) if ch_eff else 0.0
mean_gp = sum(gp_eff) / len(gp_eff) if gp_eff else 0.0
mean_sp = sum(sp_eff) / len(sp_eff) if sp_eff else 0.0
mean_mt = sum(mt_eff) / len(mt_eff) if mt_eff else 0.0
valid_models = []
valid_eff = []
colors = []
all_models = [("Char-level", mean_ch, "#EF4444"),
("GPT-2", mean_gp, "#6B7280"),
("SentencePiece", mean_sp, "#3B82F6"),
("MathTok", mean_mt, "#10B981")]
for model, val, color in all_models:
if val > 0:
valid_models.append(model)
valid_eff.append(val)
colors.append(color)
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(x=valid_models, y=valid_eff, palette=colors, ax=ax)
ax.set_title("Mean Structural Efficiency\n(Parent-Child Relations per Token)", fontsize=14, fontweight='bold', pad=15)
ax.set_ylabel("Structural Efficiency Score (Higher is Better)", fontsize=12)
sns.despine(ax=ax)
for i, v in enumerate(valid_eff):
ax.text(i, v + 0.01, f"{v:.3f}", ha='center', fontweight='bold', fontsize=11)
plt.tight_layout()
out_path = _RESULTS_DIR / "structural_efficiency_comparison.png"
plt.savefig(out_path, dpi=300)
print(f"Saved {out_path}")
plt.close()
def plot_unified_dashboard(summary, records):
"""Generates a side-by-side three-panel dashboard showing SCR, Semantic Density, and Structural Efficiency."""
fig, axes = plt.subplots(1, 3, figsize=(18, 5.5))
# 1. SCR
models = ["Char-level", "GPT-2", "SentencePiece", "MathTok"]
scrs = [
summary.get("charlevel_mean_scr", 0),
summary.get("gpt2_scr", 0),
summary.get("sentencepiece_mean_scr", 0),
summary.get("mathtok_mean_scr", 0)
]
valid_models_scr = []
valid_scrs = []
colors_scr = []
all_scr = [("Char-level", scrs[0], "#EF4444"),
("GPT-2", scrs[1], "#6B7280"),
("SentencePiece", scrs[2], "#3B82F6"),
("MathTok", scrs[3], "#10B981")]
for m, v, c in all_scr:
if v is not None and v > 0:
valid_models_scr.append(m)
valid_scrs.append(v)
colors_scr.append(c)
sns.barplot(x=valid_models_scr, y=valid_scrs, palette=colors_scr, ax=axes[0])
axes[0].set_title("Semantic Compression Ratio (SCR)", fontsize=12, fontweight='bold', pad=10)
axes[0].set_ylabel("SCR Score (Higher is Better)", fontsize=10)
sns.despine(ax=axes[0])
for i, v in enumerate(valid_scrs):
axes[0].text(i, v + 0.02, f"{v:.3f}", ha='center', fontweight='bold', fontsize=10)
# 2. Semantic Density
ch_dens = [r["char_level"]["semantic_density"] for r in records if r.get("char_level")]
gp_dens = [r["gpt2"]["semantic_density"] for r in records if r.get("gpt2") and r["gpt2"].get("semantic_density") is not None]
sp_dens = [r["sentencepiece"]["semantic_density"] for r in records if r.get("sentencepiece") and r["sentencepiece"].get("semantic_density") is not None]
mt_dens = [r["mathtok"]["semantic_density"] for r in records if r.get("mathtok")]
mean_ch_d = sum(ch_dens) / len(ch_dens) if ch_dens else 0.0
mean_gp_d = sum(gp_dens) / len(gp_dens) if gp_dens else 0.0
mean_sp_d = sum(sp_dens) / len(sp_dens) if sp_dens else 0.0
mean_mt_d = sum(mt_dens) / len(mt_dens) if mt_dens else 0.0
valid_models_d = []
valid_dens = []
colors_d = []
all_dens = [("Char-level", mean_ch_d, "#EF4444"),
("GPT-2", mean_gp_d, "#6B7280"),
("SentencePiece", mean_sp_d, "#3B82F6"),
("MathTok", mean_mt_d, "#10B981")]
for m, v, c in all_dens:
if v > 0:
valid_models_d.append(m)
valid_dens.append(v)
colors_d.append(c)
sns.barplot(x=valid_models_d, y=valid_dens, palette=colors_d, ax=axes[1])
axes[1].set_title("Semantic Density", fontsize=12, fontweight='bold', pad=10)
axes[1].set_ylabel("Density Score (Higher is Better)", fontsize=10)
sns.despine(ax=axes[1])
for i, v in enumerate(valid_dens):
axes[1].text(i, v + 0.01, f"{v:.3f}", ha='center', fontweight='bold', fontsize=10)
# 3. Structural Efficiency
ch_eff = [r["char_level"]["structural_efficiency"] for r in records if r.get("char_level")]
gp_eff = [r["gpt2"]["structural_efficiency"] for r in records if r.get("gpt2") and r["gpt2"].get("structural_efficiency") is not None]
sp_eff = [r["sentencepiece"]["structural_efficiency"] for r in records if r.get("sentencepiece") and r["sentencepiece"].get("structural_efficiency") is not None]
mt_eff = [r["mathtok"]["structural_efficiency"] for r in records if r.get("mathtok")]
mean_ch_e = sum(ch_eff) / len(ch_eff) if ch_eff else 0.0
mean_gp_e = sum(gp_eff) / len(gp_eff) if gp_eff else 0.0
mean_sp_e = sum(sp_eff) / len(sp_eff) if sp_eff else 0.0
mean_mt_e = sum(mt_eff) / len(mt_eff) if mt_eff else 0.0
valid_models_e = []
valid_eff = []
colors_e = []
all_eff = [("Char-level", mean_ch_e, "#EF4444"),
("GPT-2", mean_gp_e, "#6B7280"),
("SentencePiece", mean_sp_e, "#3B82F6"),
("MathTok", mean_mt_e, "#10B981")]
for m, v, c in all_eff:
if v > 0:
valid_models_e.append(m)
valid_eff.append(v)
colors_e.append(c)
sns.barplot(x=valid_models_e, y=valid_eff, palette=colors_e, ax=axes[2])
axes[2].set_title("Structural Efficiency", fontsize=12, fontweight='bold', pad=10)
axes[2].set_ylabel("Efficiency Score (Higher is Better)", fontsize=10)
sns.despine(ax=axes[2])
for i, v in enumerate(valid_eff):
axes[2].text(i, v + 0.01, f"{v:.3f}", ha='center', fontweight='bold', fontsize=10)
plt.suptitle("MathTok Comparative Evaluation Framework — Unified Dashboard", fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
out_path = _RESULTS_DIR / "metrics_dashboard.png"
plt.savefig(out_path, dpi=300, bbox_inches='tight')
print(f"Saved {out_path}")
plt.close()
def main():
print("Generating visualizations from benchmark results...")
# Set nice styling
sns.set_theme(style="whitegrid", rc={"grid.alpha": 0.3})
try:
summary = load_summary()
records = load_jsonl_results()
plot_aggregated_scr(summary)
if records:
plot_category_scr(records)
plot_semantic_density(records)
plot_structural_efficiency(records)
plot_unified_dashboard(summary, records)
plot_token_counts(summary)
print("\nAll visualizations generated successfully in evaluation/results/.")
except Exception as e:
print(f"Error generating visualizations: {e}")
if __name__ == "__main__":
main()