Text Generation
Transformers
English
custom
tokenizer
symbolic-ai
mathematics
llm
reasoning
ast
compiler
nlp
deep-learning
machine-learning
mathematical-reasoning
symbolic-reasoning
tokenization
parser
artificial-intelligence
Eval Results (legacy)
Instructions to use SurweeshSP/mathtok with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use SurweeshSP/mathtok with Transformers:
# Use a pipeline as a high-level helper from transformers import pipeline pipe = pipeline("text-generation", model="SurweeshSP/mathtok")# Load model directly from transformers import AutoModel model = AutoModel.from_pretrained("SurweeshSP/mathtok", dtype="auto") - Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use SurweeshSP/mathtok with vLLM:
Install from pip and serve model
# Install vLLM from pip: pip install vllm # Start the vLLM server: vllm serve "SurweeshSP/mathtok" # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:8000/v1/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "SurweeshSP/mathtok", "prompt": "Once upon a time,", "max_tokens": 512, "temperature": 0.5 }'Use Docker
docker model run hf.co/SurweeshSP/mathtok
- SGLang
How to use SurweeshSP/mathtok with SGLang:
Install from pip and serve model
# Install SGLang from pip: pip install sglang # Start the SGLang server: python3 -m sglang.launch_server \ --model-path "SurweeshSP/mathtok" \ --host 0.0.0.0 \ --port 30000 # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:30000/v1/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "SurweeshSP/mathtok", "prompt": "Once upon a time,", "max_tokens": 512, "temperature": 0.5 }'Use Docker images
docker run --gpus all \ --shm-size 32g \ -p 30000:30000 \ -v ~/.cache/huggingface:/root/.cache/huggingface \ --env "HF_TOKEN=<secret>" \ --ipc=host \ lmsysorg/sglang:latest \ python3 -m sglang.launch_server \ --model-path "SurweeshSP/mathtok" \ --host 0.0.0.0 \ --port 30000 # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:30000/v1/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "SurweeshSP/mathtok", "prompt": "Once upon a time,", "max_tokens": 512, "temperature": 0.5 }' - Docker Model Runner
How to use SurweeshSP/mathtok with Docker Model Runner:
docker model run hf.co/SurweeshSP/mathtok
| """ | |
| Visualization Script for MathTok Evaluation Results | |
| =================================================== | |
| Generates visual charts from the benchmark comparison results, making | |
| it easy to understand the performance differences in Semantic Compression Ratio (SCR), | |
| Canonical Consistency Score (CCS), and more. | |
| Usage: | |
| python -m evaluation.visualize | |
| """ | |
| import json | |
| from pathlib import Path | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| import pandas as pd | |
| _RESULTS_DIR = Path(__file__).parent / "results" | |
| def load_summary(): | |
| summary_path = _RESULTS_DIR / "comparison_summary.json" | |
| if not summary_path.exists(): | |
| raise FileNotFoundError(f"Results summary not found at {summary_path}. Run comparison.py first.") | |
| with open(summary_path, "r", encoding="utf-8") as f: | |
| return json.load(f) | |
| def load_jsonl_results(): | |
| results_path = _RESULTS_DIR / "comparison_results.jsonl" | |
| records = [] | |
| if not results_path.exists(): | |
| return records | |
| with open(results_path, "r", encoding="utf-8") as f: | |
| for line in f: | |
| records.append(json.loads(line)) | |
| return records | |
| def plot_aggregated_scr(summary): | |
| """Plot the overall mean Semantic Compression Ratio.""" | |
| fig, ax = plt.subplots(figsize=(8, 6)) | |
| models = ["Char-level", "GPT-2", "SentencePiece", "MathTok"] | |
| scrs = [ | |
| summary.get("charlevel_mean_scr", 0), | |
| summary.get("gpt2_scr", 0), | |
| summary.get("sentencepiece_mean_scr", 0), | |
| summary.get("mathtok_mean_scr", 0) | |
| ] | |
| # Filter out missing models (like GPT-2 if not run) | |
| valid_models = [] | |
| valid_scrs = [] | |
| colors = [] | |
| all_models = [("Char-level", scrs[0], "#EF4444"), | |
| ("GPT-2", scrs[1], "#6B7280"), | |
| ("SentencePiece", scrs[2], "#3B82F6"), | |
| ("MathTok", scrs[3], "#10B981")] | |
| for m, s, c in all_models: | |
| if s is not None and s > 0: | |
| valid_models.append(m) | |
| valid_scrs.append(s) | |
| colors.append(c) | |
| sns.barplot(x=valid_models, y=valid_scrs, palette=colors, ax=ax) | |
| ax.set_title("Mean Semantic Compression Ratio (SCR)\n(Higher is Better)", fontsize=14, fontweight='bold', pad=15) | |
| ax.set_ylabel("SCR (Structural Score / Tokens)", fontsize=12) | |
| sns.despine(ax=ax) | |
| # Add value labels | |
| for i, v in enumerate(valid_scrs): | |
| ax.text(i, v + 0.02, f"{v:.3f}", ha='center', fontweight='bold', fontsize=11) | |
| plt.tight_layout() | |
| out_path = _RESULTS_DIR / "scr_comparison.png" | |
| plt.savefig(out_path, dpi=300) | |
| print(f"Saved {out_path}") | |
| plt.close() | |
| def plot_category_scr(records): | |
| """Plot SCR breakdown by category.""" | |
| data = [] | |
| for r in records: | |
| cat = r["category"] | |
| if "mixed" in cat or "latex_vs_ascii" in cat: | |
| continue # Focus on standard mathematical metrics for SCR | |
| data.append({"Category": cat, "Model": "MathTok", "SCR": r["mathtok"]["raw_scr"]}) | |
| data.append({"Category": cat, "Model": "Char-level", "SCR": r["char_level"]["raw_scr"]}) | |
| if r.get("gpt2") and r["gpt2"].get("raw_scr") is not None: | |
| data.append({"Category": cat, "Model": "GPT-2", "SCR": r["gpt2"]["raw_scr"]}) | |
| if r.get("sentencepiece") and r["sentencepiece"].get("raw_scr") is not None: | |
| data.append({"Category": cat, "Model": "SentencePiece", "SCR": r["sentencepiece"]["raw_scr"]}) | |
| if not data: | |
| return | |
| df = pd.DataFrame(data) | |
| fig, ax = plt.subplots(figsize=(10, 6)) | |
| sns.barplot(data=df, x="Category", y="SCR", hue="Model", | |
| palette={"MathTok": "#10B981", "GPT-2": "#6B7280", "SentencePiece": "#3B82F6", "Char-level": "#EF4444"}, | |
| errorbar=None, ax=ax) | |
| ax.set_title("Semantic Compression Ratio by Category", fontsize=14, fontweight='bold', pad=15) | |
| ax.set_ylabel("Mean SCR", fontsize=12) | |
| ax.set_xlabel("Expression Category", fontsize=12) | |
| sns.despine(ax=ax) | |
| plt.xticks(rotation=15) | |
| plt.legend(title="Tokenizer") | |
| plt.tight_layout() | |
| out_path = _RESULTS_DIR / "scr_by_category.png" | |
| plt.savefig(out_path, dpi=300) | |
| print(f"Saved {out_path}") | |
| plt.close() | |
| def plot_token_counts(summary): | |
| """Plot total token counts as a bar chart to show efficiency.""" | |
| per_record = summary.get("per_record", []) | |
| if not per_record: | |
| return | |
| # We'll just plot the first 15 for readability | |
| subset = per_record[:15] | |
| df_data = [] | |
| for i, r in enumerate(subset): | |
| expr_short = r["expression"][:15] + ".." if len(r["expression"]) > 15 else r["expression"] | |
| df_data.append({"Expression": expr_short, "Model": "MathTok", "Tokens": r["mt_tokens"], "Order": i}) | |
| df_data.append({"Expression": expr_short, "Model": "Char-level", "Tokens": r["ch_tokens"], "Order": i}) | |
| if r.get("gp_tokens"): | |
| df_data.append({"Expression": expr_short, "Model": "GPT-2", "Tokens": r["gp_tokens"], "Order": i}) | |
| if r.get("sp_tokens"): | |
| df_data.append({"Expression": expr_short, "Model": "SentencePiece", "Tokens": r["sp_tokens"], "Order": i}) | |
| df = pd.DataFrame(df_data) | |
| fig, ax = plt.subplots(figsize=(12, 6)) | |
| # Sort by original order | |
| df = df.sort_values("Order") | |
| sns.barplot(data=df, x="Expression", y="Tokens", hue="Model", | |
| palette={"MathTok": "#10B981", "GPT-2": "#6B7280", "SentencePiece": "#3B82F6", "Char-level": "#EF4444"}, ax=ax) | |
| ax.set_title("Token Counts per Expression (Fewer is usually better, but SCR is the true metric)", fontsize=14, fontweight='bold', pad=15) | |
| ax.set_ylabel("Number of Tokens", fontsize=12) | |
| sns.despine(ax=ax) | |
| plt.xticks(rotation=45, ha='right') | |
| plt.legend(title="Tokenizer") | |
| plt.tight_layout() | |
| out_path = _RESULTS_DIR / "token_counts_sample.png" | |
| plt.savefig(out_path, dpi=300) | |
| print(f"Saved {out_path}") | |
| plt.close() | |
| def plot_semantic_density(records): | |
| """Plot the overall mean Semantic Density.""" | |
| ch_dens = [r["char_level"]["semantic_density"] for r in records if r.get("char_level")] | |
| gp_dens = [r["gpt2"]["semantic_density"] for r in records if r.get("gpt2") and r["gpt2"].get("semantic_density") is not None] | |
| sp_dens = [r["sentencepiece"]["semantic_density"] for r in records if r.get("sentencepiece") and r["sentencepiece"].get("semantic_density") is not None] | |
| mt_dens = [r["mathtok"]["semantic_density"] for r in records if r.get("mathtok")] | |
| mean_ch = sum(ch_dens) / len(ch_dens) if ch_dens else 0.0 | |
| mean_gp = sum(gp_dens) / len(gp_dens) if gp_dens else 0.0 | |
| mean_sp = sum(sp_dens) / len(sp_dens) if sp_dens else 0.0 | |
| mean_mt = sum(mt_dens) / len(mt_dens) if mt_dens else 0.0 | |
| valid_models = [] | |
| valid_dens = [] | |
| colors = [] | |
| all_models = [("Char-level", mean_ch, "#EF4444"), | |
| ("GPT-2", mean_gp, "#6B7280"), | |
| ("SentencePiece", mean_sp, "#3B82F6"), | |
| ("MathTok", mean_mt, "#10B981")] | |
| for model, val, color in all_models: | |
| if val > 0: | |
| valid_models.append(model) | |
| valid_dens.append(val) | |
| colors.append(color) | |
| fig, ax = plt.subplots(figsize=(8, 6)) | |
| sns.barplot(x=valid_models, y=valid_dens, palette=colors, ax=ax) | |
| ax.set_title("Mean Semantic Density\n(Ratio of Math-Centric Tokens to Total Tokens)", fontsize=14, fontweight='bold', pad=15) | |
| ax.set_ylabel("Semantic Density Score (Higher is Better)", fontsize=12) | |
| sns.despine(ax=ax) | |
| for i, v in enumerate(valid_dens): | |
| ax.text(i, v + 0.01, f"{v:.3f}", ha='center', fontweight='bold', fontsize=11) | |
| plt.tight_layout() | |
| out_path = _RESULTS_DIR / "semantic_density_comparison.png" | |
| plt.savefig(out_path, dpi=300) | |
| print(f"Saved {out_path}") | |
| plt.close() | |
| def plot_structural_efficiency(records): | |
| """Plot the overall mean Structural Efficiency.""" | |
| ch_eff = [r["char_level"]["structural_efficiency"] for r in records if r.get("char_level")] | |
| gp_eff = [r["gpt2"]["structural_efficiency"] for r in records if r.get("gpt2") and r["gpt2"].get("structural_efficiency") is not None] | |
| sp_eff = [r["sentencepiece"]["structural_efficiency"] for r in records if r.get("sentencepiece") and r["sentencepiece"].get("structural_efficiency") is not None] | |
| mt_eff = [r["mathtok"]["structural_efficiency"] for r in records if r.get("mathtok")] | |
| mean_ch = sum(ch_eff) / len(ch_eff) if ch_eff else 0.0 | |
| mean_gp = sum(gp_eff) / len(gp_eff) if gp_eff else 0.0 | |
| mean_sp = sum(sp_eff) / len(sp_eff) if sp_eff else 0.0 | |
| mean_mt = sum(mt_eff) / len(mt_eff) if mt_eff else 0.0 | |
| valid_models = [] | |
| valid_eff = [] | |
| colors = [] | |
| all_models = [("Char-level", mean_ch, "#EF4444"), | |
| ("GPT-2", mean_gp, "#6B7280"), | |
| ("SentencePiece", mean_sp, "#3B82F6"), | |
| ("MathTok", mean_mt, "#10B981")] | |
| for model, val, color in all_models: | |
| if val > 0: | |
| valid_models.append(model) | |
| valid_eff.append(val) | |
| colors.append(color) | |
| fig, ax = plt.subplots(figsize=(8, 6)) | |
| sns.barplot(x=valid_models, y=valid_eff, palette=colors, ax=ax) | |
| ax.set_title("Mean Structural Efficiency\n(Parent-Child Relations per Token)", fontsize=14, fontweight='bold', pad=15) | |
| ax.set_ylabel("Structural Efficiency Score (Higher is Better)", fontsize=12) | |
| sns.despine(ax=ax) | |
| for i, v in enumerate(valid_eff): | |
| ax.text(i, v + 0.01, f"{v:.3f}", ha='center', fontweight='bold', fontsize=11) | |
| plt.tight_layout() | |
| out_path = _RESULTS_DIR / "structural_efficiency_comparison.png" | |
| plt.savefig(out_path, dpi=300) | |
| print(f"Saved {out_path}") | |
| plt.close() | |
| def plot_unified_dashboard(summary, records): | |
| """Generates a side-by-side three-panel dashboard showing SCR, Semantic Density, and Structural Efficiency.""" | |
| fig, axes = plt.subplots(1, 3, figsize=(18, 5.5)) | |
| # 1. SCR | |
| models = ["Char-level", "GPT-2", "SentencePiece", "MathTok"] | |
| scrs = [ | |
| summary.get("charlevel_mean_scr", 0), | |
| summary.get("gpt2_scr", 0), | |
| summary.get("sentencepiece_mean_scr", 0), | |
| summary.get("mathtok_mean_scr", 0) | |
| ] | |
| valid_models_scr = [] | |
| valid_scrs = [] | |
| colors_scr = [] | |
| all_scr = [("Char-level", scrs[0], "#EF4444"), | |
| ("GPT-2", scrs[1], "#6B7280"), | |
| ("SentencePiece", scrs[2], "#3B82F6"), | |
| ("MathTok", scrs[3], "#10B981")] | |
| for m, v, c in all_scr: | |
| if v is not None and v > 0: | |
| valid_models_scr.append(m) | |
| valid_scrs.append(v) | |
| colors_scr.append(c) | |
| sns.barplot(x=valid_models_scr, y=valid_scrs, palette=colors_scr, ax=axes[0]) | |
| axes[0].set_title("Semantic Compression Ratio (SCR)", fontsize=12, fontweight='bold', pad=10) | |
| axes[0].set_ylabel("SCR Score (Higher is Better)", fontsize=10) | |
| sns.despine(ax=axes[0]) | |
| for i, v in enumerate(valid_scrs): | |
| axes[0].text(i, v + 0.02, f"{v:.3f}", ha='center', fontweight='bold', fontsize=10) | |
| # 2. Semantic Density | |
| ch_dens = [r["char_level"]["semantic_density"] for r in records if r.get("char_level")] | |
| gp_dens = [r["gpt2"]["semantic_density"] for r in records if r.get("gpt2") and r["gpt2"].get("semantic_density") is not None] | |
| sp_dens = [r["sentencepiece"]["semantic_density"] for r in records if r.get("sentencepiece") and r["sentencepiece"].get("semantic_density") is not None] | |
| mt_dens = [r["mathtok"]["semantic_density"] for r in records if r.get("mathtok")] | |
| mean_ch_d = sum(ch_dens) / len(ch_dens) if ch_dens else 0.0 | |
| mean_gp_d = sum(gp_dens) / len(gp_dens) if gp_dens else 0.0 | |
| mean_sp_d = sum(sp_dens) / len(sp_dens) if sp_dens else 0.0 | |
| mean_mt_d = sum(mt_dens) / len(mt_dens) if mt_dens else 0.0 | |
| valid_models_d = [] | |
| valid_dens = [] | |
| colors_d = [] | |
| all_dens = [("Char-level", mean_ch_d, "#EF4444"), | |
| ("GPT-2", mean_gp_d, "#6B7280"), | |
| ("SentencePiece", mean_sp_d, "#3B82F6"), | |
| ("MathTok", mean_mt_d, "#10B981")] | |
| for m, v, c in all_dens: | |
| if v > 0: | |
| valid_models_d.append(m) | |
| valid_dens.append(v) | |
| colors_d.append(c) | |
| sns.barplot(x=valid_models_d, y=valid_dens, palette=colors_d, ax=axes[1]) | |
| axes[1].set_title("Semantic Density", fontsize=12, fontweight='bold', pad=10) | |
| axes[1].set_ylabel("Density Score (Higher is Better)", fontsize=10) | |
| sns.despine(ax=axes[1]) | |
| for i, v in enumerate(valid_dens): | |
| axes[1].text(i, v + 0.01, f"{v:.3f}", ha='center', fontweight='bold', fontsize=10) | |
| # 3. Structural Efficiency | |
| ch_eff = [r["char_level"]["structural_efficiency"] for r in records if r.get("char_level")] | |
| gp_eff = [r["gpt2"]["structural_efficiency"] for r in records if r.get("gpt2") and r["gpt2"].get("structural_efficiency") is not None] | |
| sp_eff = [r["sentencepiece"]["structural_efficiency"] for r in records if r.get("sentencepiece") and r["sentencepiece"].get("structural_efficiency") is not None] | |
| mt_eff = [r["mathtok"]["structural_efficiency"] for r in records if r.get("mathtok")] | |
| mean_ch_e = sum(ch_eff) / len(ch_eff) if ch_eff else 0.0 | |
| mean_gp_e = sum(gp_eff) / len(gp_eff) if gp_eff else 0.0 | |
| mean_sp_e = sum(sp_eff) / len(sp_eff) if sp_eff else 0.0 | |
| mean_mt_e = sum(mt_eff) / len(mt_eff) if mt_eff else 0.0 | |
| valid_models_e = [] | |
| valid_eff = [] | |
| colors_e = [] | |
| all_eff = [("Char-level", mean_ch_e, "#EF4444"), | |
| ("GPT-2", mean_gp_e, "#6B7280"), | |
| ("SentencePiece", mean_sp_e, "#3B82F6"), | |
| ("MathTok", mean_mt_e, "#10B981")] | |
| for m, v, c in all_eff: | |
| if v > 0: | |
| valid_models_e.append(m) | |
| valid_eff.append(v) | |
| colors_e.append(c) | |
| sns.barplot(x=valid_models_e, y=valid_eff, palette=colors_e, ax=axes[2]) | |
| axes[2].set_title("Structural Efficiency", fontsize=12, fontweight='bold', pad=10) | |
| axes[2].set_ylabel("Efficiency Score (Higher is Better)", fontsize=10) | |
| sns.despine(ax=axes[2]) | |
| for i, v in enumerate(valid_eff): | |
| axes[2].text(i, v + 0.01, f"{v:.3f}", ha='center', fontweight='bold', fontsize=10) | |
| plt.suptitle("MathTok Comparative Evaluation Framework — Unified Dashboard", fontsize=16, fontweight='bold', y=1.02) | |
| plt.tight_layout() | |
| out_path = _RESULTS_DIR / "metrics_dashboard.png" | |
| plt.savefig(out_path, dpi=300, bbox_inches='tight') | |
| print(f"Saved {out_path}") | |
| plt.close() | |
| def main(): | |
| print("Generating visualizations from benchmark results...") | |
| # Set nice styling | |
| sns.set_theme(style="whitegrid", rc={"grid.alpha": 0.3}) | |
| try: | |
| summary = load_summary() | |
| records = load_jsonl_results() | |
| plot_aggregated_scr(summary) | |
| if records: | |
| plot_category_scr(records) | |
| plot_semantic_density(records) | |
| plot_structural_efficiency(records) | |
| plot_unified_dashboard(summary, records) | |
| plot_token_counts(summary) | |
| print("\nAll visualizations generated successfully in evaluation/results/.") | |
| except Exception as e: | |
| print(f"Error generating visualizations: {e}") | |
| if __name__ == "__main__": | |
| main() | |