Code/Tokenizers_Comparison.py · hafeez007/balochi-tokenizers at main

balochi-tokenizers / Code /Tokenizers_Comparison.py

Update tokenizer models and README

e899795 verified 27 days ago

52.8 kB

	# -*- coding: utf-8 -*-
	"""
	Comprehensive Balochi + Cross-Language Tokenizer Comparison & Evaluation
	=========================================================================
	Extended version of the original Balochi tokenizer comparison script.

	Tokenizers (15 total — original 7 + 8 new):

	━━━ BALOCHI (Custom-Trained) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
	1. Balochi BPE (80K) – Custom HF tokenizers BPE
	2. Balochi WordPiece (64K) – Custom HF tokenizers WordPiece
	3. Balochi SentencePiece (64K) – Custom Google SentencePiece
	4. Balochi 30K – balochiml/balochi-tokenizer (HuggingFace)

	━━━ BASELINE ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
	5. NLTK – Rule-based word tokenizer
	6. BERT Multilingual – bert-base-multilingual-cased (WordPiece 119K)
	7. Gemma – google/gemma-2b (SentencePiece BPE 256K)

	━━━ ARABIC (Custom-Trained) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
	8. AraBERT v2 – aubmindlab/bert-base-arabertv2 (WP 64K)
	9. CAMeLBERT-MSA – CAMeL-Lab/bert-base-arabic-camelbert-msa (WP 30K)
	10. ARBERT – UBC-NLP/ARBERT (WP 100K)
	11. AraGPT2 – aubmindlab/aragpt2-base (BPE 50K)

	━━━ PERSIAN (Custom-Trained) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
	12. ParsBERT – HooshvareLab/bert-base-parsbert-uncased (WP 100K)
	13. PersianBERT (HooshvareBase) – HooshvareLab/bert-fa-base-uncased (WP 100K)
	14. PersianBPETokenizer – mshojaei77/PersianBPETokenizer (BPE)

	━━━ URDU (Custom-Trained) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
	15. UrduBERT – urduhack/UrduBERT (WP ~60K) [HF: uer/roberta-base-finetuned-tnews-chinese]
	Note: falls back to iamxds/UrduBERT-base if primary unavailable

	━━━ VOCAB ABLATION (from Notebook) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
	NOTE: Ablation tokenizers (BPE/WP/SP at 32K, 47K, 64K, 80K, 128K) are loaded
	from local .json/.model files under Output/Ablation/Models/. If the files are
	not present, these entries are skipped gracefully.

	Comparison Groups:
	Group A: Balochi WordPiece vs AraBERT v2 vs CAMeLBERT vs ARBERT vs BERT (WordPiece family)
	Group B: Balochi SP vs ParsBERT vs PersianBERT vs Gemma (SentencePiece/WP)
	Group C: Balochi BPE vs AraGPT2 vs PersianBPE vs 30K-Balochi vs NLTK (BPE/Rule family)
	Group D: UrduBERT vs Balochi BPE vs Balochi WordPiece vs AraBERT v2 vs ParsBERT (Perso-Arabic script)
	Group E: Original 7 Balochi comparison (WordPiece / SentencePiece / BPE groups)
	Group F: Vocab Ablation — same algorithm, different vocab sizes
	"""

	import os
	import sys
	import time
	import re
	import unicodedata

	# On Windows, switch both standard streams to UTF-8 so the non-ASCII text
	# printed by this script does not fail to encode; characters that still
	# cannot be encoded are replaced instead of raising.
	if sys.platform == "win32":
	    for _stream in (sys.stdout, sys.stderr):
	        _stream.reconfigure(encoding='utf-8', errors='replace')

	# ============================================================
	# 0. Install & Import Dependencies
	# ============================================================

	def install_if_missing(package, pip_name=None):
	    """Ensure *package* is importable, installing it with pip when it is not.

	    Args:
	        package: Module name to probe with an import.
	        pip_name: Distribution name handed to pip when it differs from the
	            module name; defaults to *package*.

	    Raises:
	        subprocess.CalledProcessError: If the pip command exits non-zero.
	    """
	    import importlib

	    try:
	        importlib.import_module(package)
	    except ImportError:
	        import subprocess
	        subprocess.check_call([sys.executable, "-m", "pip", "install", pip_name or package, "-q"])
	        # Finders cache directory contents; refresh them so the package that
	        # was just installed can be imported by this same process.
	        importlib.invalidate_caches()

	# Make sure each third-party package imported below is available.
	for _required in ("tokenizers", "sentencepiece", "transformers", "nltk", "huggingface_hub"):
	    install_if_missing(_required)

	import sentencepiece as spm
	from tokenizers import Tokenizer
	from transformers import AutoTokenizer, BertTokenizer
	import nltk
	nltk.download('punkt', quiet=True)
	nltk.download('punkt_tab', quiet=True)
	from nltk.tokenize import word_tokenize

	# ============================================================
	# 1. Path Configuration
	# ============================================================

	SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
	BASE_DIR = SCRIPT_DIR
	INPUT_FILE = os.path.join(BASE_DIR, "..", "Tokens", "liberal capitalism.txt")
	TOKENIZERS_DIR = os.path.join(BASE_DIR, "Tokenizers")
	OUTPUT_DIR = os.path.join(SCRIPT_DIR, "Output")
	os.makedirs(OUTPUT_DIR, exist_ok=True)

	# ── Ablation tokenizer paths (from notebook training) ────────
	# Layout: Output/Ablation/Models/<algo>_<size>/<algo>_<size><ext>
	_ABLATION_ROOT = os.path.join(OUTPUT_DIR, "Ablation", "Models")
	# (key tag, directory/file stem, file extension) per algorithm
	_ABLATION_ALGOS = (
	    ("BPE", "bpe", ".json"),
	    ("WP", "wordpiece", ".json"),
	    ("SP", "sentencepiece", ".model"),
	)
	_ABLATION_SIZES = (32000, 47000, 64000, 80000, 128000)

	# Keys look like "Balochi_BPE_32K"; insertion order is algorithm-major,
	# then ascending vocabulary size.
	ABLATION_MODELS = {
	    f"Balochi_{_tag}_{_size // 1000}K": os.path.join(
	        _ABLATION_ROOT, f"{_stem}_{_size}", f"{_stem}_{_size}{_ext}"
	    )
	    for _tag, _stem, _ext in _ABLATION_ALGOS
	    for _size in _ABLATION_SIZES
	}

	# ── Balochi local models ─────────────────────────────────────
	# The headline Balochi tokenizers are the same files as three ablation entries.
	BPE_80K = ABLATION_MODELS["Balochi_BPE_80K"]
	WP_64K = ABLATION_MODELS["Balochi_WP_64K"]
	SP_64K = ABLATION_MODELS["Balochi_SP_64K"]

	# ── HuggingFace remote IDs ────────────────────────────────────
	BALOCHI_30K_REPO = "balochiml/balochi-tokenizer"
	BALOCHI_30K_FILENAME = "models/30k-balochi-tokenizer.json"

	# ── HF token for gated models (Gemma) ─────────────────────────
	# Read from the HF_TOKEN environment variable. None (no explicit token) is
	# used when the variable is unset, empty, or still the template placeholder,
	# so a dummy string is never sent to the Hub as a credential.
	HF_TOKEN = os.environ.get("HF_TOKEN") or None
	if HF_TOKEN == "YOUR_HF_TOKEN_HERE":
	    HF_TOKEN = None

	# ============================================================
	# 2. Load Input Text
	# ============================================================

	def normalize_balochi(text: str, drop_diacritics: bool = True, preserve_ye: bool = True) -> str:
	    """Normalize Balochi text before tokenization (adapted from AraToken methodology).

	    Steps, in order: NFKC normalization; hamza-carrying Alif variants folded
	    to bare Alif; optional Urdu Ye -> Farsi Ye folding; Arabic-Indic digits
	    and Arabic question mark / semicolon / comma mapped to ASCII; tatweel and
	    the invisible ZWNJ, ZWJ, RLM and ALM characters removed; optional removal
	    of combining marks that do not directly follow a standalone hamza;
	    whitespace runs collapsed to one space and the ends trimmed.

	    Args:
	        text: Raw input text.
	        drop_diacritics: Remove combining marks when True.
	        preserve_ye: Keep Urdu Ye distinct when True; fold it into Farsi Ye
	            when False.

	    Returns:
	        The normalized string.
	    """
	    normalized = unicodedata.normalize('NFKC', text)
	    normalized = re.sub(r'[أإآٱ]', 'ا', normalized)  # Hamza variants → bare Alif

	    if not preserve_ye:
	        normalized = normalized.replace('ے', 'ی')  # Urdu Ye → Farsi Ye (collapse)

	    # One translation pass covers every single-character rewrite; the source
	    # and target characters are disjoint, so the order of the rules is moot.
	    char_map = str.maketrans('٠١٢٣٤٥٦٧٨٩', '0123456789')
	    char_map.update(str.maketrans({
	        '؟': '?',
	        '؛': ';',
	        '،': ',',
	        '\u0640': None,  # Kashida/Tatweel
	        '\u200C': None,  # ZWNJ
	        '\u200D': None,  # ZWJ
	        '\u200F': None,  # RLM
	        '\u061C': None,  # ALM
	    }))
	    normalized = normalized.translate(char_map)

	    if drop_diacritics:
	        normalized = re.sub(r'(?<!ء)[\u064B-\u065F\u0610-\u061A\u06D6-\u06DC]', '', normalized)

	    return re.sub(r'\s+', ' ', normalized).strip()

	def load_text(filepath):
	    """Read a UTF-8 text file and return its normalized contents.

	    The text is passed through ``normalize_balochi`` with diacritics dropped
	    and Urdu Ye preserved. If *filepath* does not exist, an error message is
	    printed and the process exits with status 1.
	    """
	    if os.path.exists(filepath):
	        with open(filepath, "r", encoding="utf-8") as handle:
	            contents = handle.read()
	        return normalize_balochi(contents, drop_diacritics=True, preserve_ye=True)

	    print(f"ERROR: Input file not found: {filepath}")
	    sys.exit(1)

	def split_sentences(text):
	    """Split *text* into sentences and drop very short fragments.

	    A boundary is either whitespace that follows a sentence terminator
	    (Urdu full stop, '.', '!' or '?') or a run of newlines. Fragments of
	    10 characters or fewer after stripping are discarded.

	    Returns:
	        list[str]: Stripped sentences in their original order.
	    """
	    # The two alternatives must be joined by a bare "|"; an escaped "\|"
	    # would demand a literal pipe character and the text would never split.
	    fragments = re.split(r'(?<=[۔.!?])\s+|\n+', text)
	    stripped = (fragment.strip() for fragment in fragments)
	    return [sentence for sentence in stripped if len(sentence) > 10]

	# ============================================================
	# 3. Load All Tokenizers
	# ============================================================

	def _try_hf_tokenizer(key, label, repo_id, fallback_repos=None, tok_dict=None, idx=None, total=None):
	    """Load an AutoTokenizer from *repo_id*, trying fallbacks in order.

	    Args:
	        key: Unused by this helper.
	        label: Human-readable name used in progress output and in the result.
	        repo_id: Repository tried first.
	        fallback_repos: Optional repositories tried after *repo_id*.
	        tok_dict: Unused by this helper.
	        idx, total: Optional step counter shown in the progress line.

	    Returns:
	        A dict with "type", "obj", "vocab_size", "hf_id" (the repository that
	        actually loaded) and "label", or None when every candidate failed.
	    """
	    step = f" [{idx}/{total}]" if idx else " "
	    print(f"{step} Loading {label} ({repo_id})...")

	    candidates = [repo_id]
	    candidates.extend(fallback_repos or [])
	    for candidate in candidates:
	        try:
	            tokenizer = AutoTokenizer.from_pretrained(candidate, token=HF_TOKEN)
	            vocab = tokenizer.vocab_size
	            print(f" ✓ Loaded from {candidate} (vocab: {vocab:,})")
	        except Exception as err:
	            # Report the (truncated) failure and move on to the next candidate.
	            print(f" ✗ {candidate} failed: {str(err)[:80]}")
	            continue
	        return {
	            "type": "transformers",
	            "obj": tokenizer,
	            "vocab_size": vocab,
	            "hf_id": candidate,
	            "label": label,
	        }
	    return None


	def load_all_tokenizers():
	    """Load the 15 tokenizers compared by this script.

	    The three custom Balochi models are read from local files, NLTK needs no
	    loading, the Balochi 30K tokenizer file is downloaded from the Hub, and
	    the rest are loaded through ``_try_hf_tokenizer``. A tokenizer whose file
	    is missing or whose download fails is reported and left out.

	    Returns:
	        dict: registry key -> info dict with "type", "obj", "vocab_size" and
	        "label" (entries from ``_try_hf_tokenizer`` also carry "hf_id"), in
	        loading order.
	    """
	    toks = {}
	    total = 15

	    def load_local(idx, key, heading, label, path, kind):
	        """Register a tokenizer stored at *path* (HF JSON or SentencePiece model)."""
	        print(f" [{idx}/{total}] Loading {heading}...")
	        if not os.path.exists(path):
	            print(f" ✗ NOT FOUND: {path}")
	            return
	        if kind == "sentencepiece":
	            obj = spm.SentencePieceProcessor()
	            obj.load(path)
	            vocab = obj.get_piece_size()
	        else:
	            obj = Tokenizer.from_file(path)
	            vocab = obj.get_vocab_size()
	        toks[key] = {"type": kind, "obj": obj, "vocab_size": vocab, "label": label}
	        print(f" ✓ vocab: {vocab:,}")

	    def load_remote(idx, key, label, repo_id, fallback_repos=None):
	        """Register a Hub tokenizer under *key* if any candidate repo loads."""
	        entry = _try_hf_tokenizer(key, label, repo_id,
	                                  fallback_repos=fallback_repos, idx=idx, total=total)
	        if entry:
	            toks[key] = entry

	    # ── 1-3. Custom Balochi tokenizers (local files) ─────────
	    load_local(1, "Balochi_BPE", "Balochi BPE (80K)", "Balochi BPE 80K",
	               BPE_80K, "hf_tokenizers")
	    load_local(2, "Balochi_WordPiece", "Balochi WordPiece (64K)", "Balochi WordPiece 64K",
	               WP_64K, "hf_tokenizers")
	    load_local(3, "Balochi_SentencePiece", "Balochi SentencePiece (64K)", "Balochi SP 64K",
	               SP_64K, "sentencepiece")

	    # ── 4. NLTK ──────────────────────────────────────────────
	    print(f" [4/{total}] Loading NLTK word_tokenize...")
	    toks["NLTK"] = {"type": "nltk", "obj": None, "vocab_size": None, "label": "NLTK (rule-based)"}
	    print(" ✓ Ready")

	    # ── 5-6. Generic baselines ───────────────────────────────
	    load_remote(5, "BERT", "BERT Multilingual", "bert-base-multilingual-cased")
	    load_remote(6, "Gemma", "Gemma 2B", "google/gemma-2b", ["google/gemma-7b"])

	    # ── 7. Balochi 30K (HuggingFace Hub) ─────────────────────
	    print(f" [7/{total}] Loading Balochi 30K (balochiml/balochi-tokenizer)...")
	    try:
	        from huggingface_hub import hf_hub_download
	        lpath = hf_hub_download(repo_id=BALOCHI_30K_REPO, filename=BALOCHI_30K_FILENAME,
	                                token=HF_TOKEN)
	        obj = Tokenizer.from_file(lpath)
	        toks["Balochi_30K"] = {"type": "hf_tokenizers", "obj": obj,
	                               "vocab_size": obj.get_vocab_size(), "label": "Balochi 30K"}
	        print(f" ✓ vocab: {toks['Balochi_30K']['vocab_size']:,}")
	    except Exception as e:
	        print(f" ✗ Failed: {str(e)[:80]}")

	    # ── 8-11. Arabic tokenizers ──────────────────────────────
	    load_remote(8, "AraBERT_v2", "AraBERT v2", "aubmindlab/bert-base-arabertv2",
	                ["aubmindlab/bert-large-arabertv02"])
	    load_remote(9, "CAMeLBERT_MSA", "CAMeLBERT-MSA", "CAMeL-Lab/bert-base-arabic-camelbert-msa",
	                ["CAMeL-Lab/bert-base-arabic-camelbert-msa-quarter"])
	    load_remote(10, "ARBERT", "ARBERT (100K)", "UBC-NLP/ARBERT", ["UBC-NLP/MARBERTv2"])
	    load_remote(11, "AraGPT2", "AraGPT2 Base", "aubmindlab/aragpt2-base",
	                ["aubmindlab/aragpt2-mega"])

	    # ── 12-14. Persian tokenizers ────────────────────────────
	    load_remote(12, "ParsBERT", "ParsBERT", "HooshvareLab/bert-base-parsbert-uncased")
	    load_remote(13, "PersianBERT_FA", "PersianBERT FA-Base", "HooshvareLab/bert-fa-base-uncased",
	                ["HooshvareLab/bert-fa-zwnj-base-uncased"])
	    load_remote(14, "PersianBPE", "Persian BPE Tokenizer", "mshojaei77/PersianBPETokenizer")

	    # ── 15. UrduBERT ─────────────────────────────────────────
	    # Only an Urdu model is accepted as fallback. The repository names of the
	    # former extra fallbacks (roberta-base-mr, ...-tnews-chinese) indicate
	    # non-Urdu models, and loading one would report its numbers under the
	    # "UrduBERT" key. If no Urdu repo loads, the entry is left out.
	    load_remote(15, "UrduBERT", "UrduBERT", "urduhack/UrduBERT", ["iamxds/UrduBERT-base"])

	    return toks


	def load_ablation_tokenizers():
	    """Load the vocabulary-size ablation tokenizers listed in ABLATION_MODELS.

	    Paths that do not exist are skipped without any message. ``.json`` files
	    are loaded as HF ``tokenizers`` models and ``.model`` files as
	    SentencePiece models; a load failure is printed and that entry skipped.

	    Returns:
	        dict: ablation key -> {"type", "obj", "vocab_size", "label"}.
	    """
	    loaded = {}
	    for key, path in ABLATION_MODELS.items():
	        if os.path.exists(path):
	            try:
	                if path.endswith(".json"):
	                    tokenizer = Tokenizer.from_file(path)
	                    vocab = tokenizer.get_vocab_size()
	                    loaded[key] = {"type": "hf_tokenizers", "obj": tokenizer,
	                                   "vocab_size": vocab, "label": key}
	                    print(f" ✓ Ablation {key} (vocab: {vocab:,})")
	                elif path.endswith(".model"):
	                    processor = spm.SentencePieceProcessor()
	                    processor.load(path)
	                    vocab = processor.get_piece_size()
	                    loaded[key] = {"type": "sentencepiece", "obj": processor,
	                                   "vocab_size": vocab, "label": key}
	                    print(f" ✓ Ablation {key} (vocab: {vocab:,})")
	            except Exception as err:
	                print(f" ✗ Ablation {key} failed: {str(err)[:60]}")
	    return loaded

	# ============================================================
	# 4. Tokenization Engine
	# ============================================================

	def tokenize_text(name, tok_info, text):
	tok_type = tok_info["type"]
	obj = tok_info["obj"]
	start = time.perf_counter()
	if tok_type == "hf_tokenizers":
	tokens = obj.encode(text).tokens
	elif tok_type == "sentencepiece":
	tokens = obj.encode_as_pieces(text)
	elif tok_type == "nltk":
	tokens = word_tokenize(text)
	elif tok_type == "transformers":
	try:
	tokens = obj.tokenize(text[:100000]) # cap at 100K chars for speed
	except Exception as e:
	print(f"\n ✗ Tokenization failed for {name}: {e}")
	tokens = []
	else:
	tokens = []
	elapsed = time.perf_counter() - start
	return tokens, elapsed


	def decode_text(name, tok_info, text_snippet):
	    """Encode *text_snippet* and decode it again for a round-trip check.

	    Args:
	        name: Tokenizer key (not used in the computation).
	        tok_info: Info dict with "type" and "obj".
	        text_snippet: Text to send through the tokenizer.

	    Returns:
	        The decoded string; for NLTK, the word tokens joined by spaces.
	        None for an unknown type or when encoding/decoding raises.
	    """
	    tok_type = tok_info["type"]
	    obj = tok_info["obj"]
	    try:
	        if tok_type == "hf_tokenizers":
	            return obj.decode(obj.encode(text_snippet).ids)
	        if tok_type == "sentencepiece":
	            return obj.decode(obj.encode(text_snippet))
	        if tok_type == "transformers":
	            # Leave special tokens ([CLS], [SEP], <bos>, ...) out of both
	            # directions; otherwise they end up in the decoded string and
	            # the round-trip comparison can never succeed.
	            ids = obj.encode(text_snippet, add_special_tokens=False)
	            return obj.decode(ids, skip_special_tokens=True)
	        if tok_type == "nltk":
	            return " ".join(word_tokenize(text_snippet))
	    except Exception:
	        # Best effort: the caller treats None as "fidelity not available".
	        return None
	    return None

	# ============================================================
	# 5. Save Tokens to Files
	# ============================================================

	def save_tokens(filename, tokens):
	    """Write *tokens* to OUTPUT_DIR/*filename*, one token per line (UTF-8).

	    Returns:
	        The path of the file that was written.
	    """
	    destination = os.path.join(OUTPUT_DIR, filename)
	    with open(destination, "w", encoding="utf-8") as out:
	        for token in tokens:
	            out.write(token + "\n")
	    return destination

	# ============================================================
	# 6. Compute Evaluation Metrics
	# ============================================================

	def compute_metrics(name, tok_info, tokens, elapsed, text):
	    """Compute the evaluation metrics for one tokenizer run.

	    Args:
	        name: Tokenizer key; selects how continuation tokens are counted.
	        tok_info: Info dict; "vocab_size" is read and the dict is passed on
	            to ``decode_text``.
	        tokens: Tokens produced for *text*.
	        elapsed: Tokenization time in seconds.
	        text: The text that was tokenized.

	    Returns:
	        dict with token_count, unique_tokens, vocab_size, vocab_utilization
	        (None without a vocab size), compression_ratio (chars per token),
	        fertility (tokens per whitespace word), avg_token_length, unk_count,
	        unk_rate, continuation_rate, speed (tokens per second), time_sec and
	        roundtrip_fidelity (None when decoding was not possible).
	    """
	    n_tokens = len(tokens)
	    n_unique = len(set(tokens))
	    vocab_size = tok_info.get("vocab_size", None)
	    n_chars = len(text)
	    n_words = len(text.split())
	    has_tokens = n_tokens > 0

	    compression_ratio = n_chars / n_tokens if has_tokens else 0
	    fertility = n_tokens / n_words if n_words > 0 else 0
	    avg_token_len = sum(len(t) for t in tokens) / n_tokens if has_tokens else 0
	    vocab_util = (n_unique / vocab_size * 100) if vocab_size else None

	    unk_patterns = {"[UNK]", "<unk>", "⁇", "<UNK>"}
	    unk_count = len([t for t in tokens if t in unk_patterns])
	    unk_rate = (unk_count / n_tokens * 100) if has_tokens else 0

	    # Continuation rate: share of tokens that continue a word.
	    # NOTE(review): the first branch assumes these tokenizers mark word starts
	    # with "▁"; confirm this holds for AraGPT2 and PersianBPE.
	    word_start_marked = (
	        name in ["Balochi_SentencePiece", "Gemma", "PersianBPE"]
	        or "SP_" in name
	        or "AraGPT" in name
	    )
	    if word_start_marked:
	        cont_count = len([
	            t for t in tokens
	            if len(t) > 0 and t not in unk_patterns and not t.startswith("▁")
	        ])
	    else:
	        cont_count = len([t for t in tokens if t.startswith("##")])
	    continuation_rate = (cont_count / n_tokens * 100) if has_tokens else 0

	    speed = n_tokens / elapsed if elapsed > 0 else 0

	    # Round-trip check on the first 500 characters, ignoring whitespace layout.
	    snippet = text[:500]
	    decoded = decode_text(name, tok_info, snippet)
	    if decoded is None:
	        fidelity = None
	    else:
	        fidelity = " ".join(snippet.split()) == " ".join(decoded.split())

	    metrics = {}
	    metrics["token_count"] = n_tokens
	    metrics["unique_tokens"] = n_unique
	    metrics["vocab_size"] = vocab_size
	    metrics["vocab_utilization"] = vocab_util
	    metrics["compression_ratio"] = compression_ratio
	    metrics["fertility"] = fertility
	    metrics["avg_token_length"] = avg_token_len
	    metrics["unk_count"] = unk_count
	    metrics["unk_rate"] = unk_rate
	    metrics["continuation_rate"] = continuation_rate
	    metrics["speed"] = speed
	    metrics["time_sec"] = elapsed
	    metrics["roundtrip_fidelity"] = fidelity
	    return metrics

	# ============================================================
	# 7. Print Sample Tokens
	# ============================================================

	def print_sample_tokens(tokenizers_dict, sentences):
	    """Print how every tokenizer splits the first two sentences.

	    Sentences longer than 120 characters are shortened for display only,
	    and at most 25 tokens are shown per tokenizer.
	    """
	    heavy_rule = "=" * 110
	    light_rule = "─" * 110

	    print("\n" + heavy_rule)
	    print(" SAMPLE TOKEN OUTPUT (First 2 Sentences)")
	    print(heavy_rule)

	    for number, sentence in enumerate(sentences[:2], 1):
	        if len(sentence) > 120:
	            shown_sentence = sentence[:120] + "..."
	        else:
	            shown_sentence = sentence
	        print(f"\n{light_rule}")
	        print(f" Sentence {number}: {shown_sentence}")
	        print(f"{light_rule}")

	        for name, tok_info in tokenizers_dict.items():
	            tokens, _ = tokenize_text(name, tok_info, sentence)
	            preview = tokens[:25]
	            overflow = f" ... (+{len(tokens)-25} more)" if len(tokens) > 25 else ""
	            print(f"\n [{name}] ({len(tokens)} tokens):")
	            print(f" {preview}{overflow}")

	    print(f"\n{heavy_rule}\n")

	# ============================================================
	# 8. Print Comparison Tables
	# ============================================================

	def print_metrics_table(title, names, all_metrics):
	    """Print a fixed-width console table with one column per tokenizer.

	    Args:
	        title: Heading printed above the table.
	        names: Tokenizer keys to show, in column order. May be empty, in
	            which case only the metric labels are printed.
	        all_metrics: Mapping of tokenizer key -> metrics dict. A metric that
	            is missing or None is shown as "N/A"; booleans are shown as
	            "✓ Yes" / "✗ No".
	    """
	    def render(val, fmt):
	        """Format one cell, falling back to str() when *fmt* does not apply."""
	        if val is None:
	            return 'N/A'
	        if isinstance(val, bool):
	            return '✓ Yes' if val else '✗ No'
	        try:
	            return fmt.format(val)
	        except Exception:
	            return str(val)

	    print(f"\n{'=' * 110}")
	    print(f" {title}")
	    print(f"{'=' * 110}")
	    # default=0 keeps an empty `names` from raising ValueError in max().
	    col = max(22, max((len(n) for n in names), default=0) + 2)
	    header = f" {'Metric':<35}" + "".join(f"{n:>{col}}" for n in names)
	    print(header)
	    print(f" {'─' * (35 + col * len(names))}")
	    rows = [
	        ("Token Count", "token_count", "{:,}"),
	        ("Unique Tokens", "unique_tokens", "{:,}"),
	        ("Vocabulary Size", "vocab_size", "{}"),
	        ("Vocab Utilization", "vocab_utilization", "{:.2f}%"),
	        ("Compression Ratio", "compression_ratio", "{:.2f}"),
	        ("Fertility (tok/word)", "fertility", "{:.3f}"),
	        ("Avg Token Length", "avg_token_length", "{:.2f}"),
	        ("Unknown Tokens", "unk_count", "{:,}"),
	        ("Unknown Rate (%)", "unk_rate", "{:.4f}%"),
	        ("Continuation Rate", "continuation_rate", "{:.2f}%"),
	        ("Speed (tok/sec)", "speed", "{:,.0f}"),
	        ("Time (seconds)", "time_sec", "{:.4f}"),
	        ("Roundtrip Fidelity", "roundtrip_fidelity", "{}"),
	    ]
	    for label, key, fmt in rows:
	        row = f" {label:<35}"
	        for n in names:
	            row += f"{render(all_metrics[n].get(key), fmt):>{col}}"
	        print(row)
	    print(f" {'─' * (35 + col * len(names))}")


	def print_all_comparisons(all_metrics):
	    """Print the overall table followed by every group comparison table.

	    A group table is printed only when at least two of its members are
	    present in *all_metrics*; members keep the order listed here.
	    """
	    print_metrics_table("OVERALL — All Tokenizers", list(all_metrics.keys()), all_metrics)

	    comparison_groups = [
	        # Group A: WordPiece family
	        ("GROUP A: WordPiece Family — Balochi vs Arabic vs Multilingual",
	         ["Balochi_WordPiece", "AraBERT_v2", "CAMeLBERT_MSA", "ARBERT", "BERT"]),
	        # Group B: SentencePiece / WP Persian
	        ("GROUP B: SentencePiece/WP — Balochi vs Persian vs Gemma",
	         ["Balochi_SentencePiece", "ParsBERT", "PersianBERT_FA", "Gemma"]),
	        # Group C: BPE family
	        ("GROUP C: BPE/Rule Family — Balochi vs Arabic vs Persian vs Baseline",
	         ["Balochi_BPE", "AraGPT2", "PersianBPE", "Balochi_30K", "NLTK"]),
	        # Group D: Perso-Arabic script family
	        ("GROUP D: Perso-Arabic Script Family — Urdu / Balochi / Arabic / Persian",
	         ["UrduBERT", "Balochi_BPE", "Balochi_WordPiece", "AraBERT_v2", "ParsBERT"]),
	        # Group E: original 7-tokenizer sub-groups
	        ("GROUP E1: WordPiece — Balochi WordPiece vs BERT Multilingual",
	         ["Balochi_WordPiece", "BERT"]),
	        ("GROUP E2: SentencePiece — Balochi SP vs Gemma",
	         ["Balochi_SentencePiece", "Gemma"]),
	        ("GROUP E3: BPE — Balochi BPE vs NLTK vs Balochi 30K",
	         ["Balochi_BPE", "NLTK", "Balochi_30K"]),
	    ]
	    for heading, members in comparison_groups:
	        present = [member for member in members if member in all_metrics]
	        if len(present) >= 2:
	            print_metrics_table(heading, present, all_metrics)

	# ============================================================
	# 9. Analysis & Interpretation
	# ============================================================

	def analyze_results(all_metrics):
	    """Print a narrative analysis of the collected metrics.

	    Sections: compression ratio (best/worst plus per-language averages),
	    fertility ranking, unknown-token coverage, speed ranking, compression
	    across the Perso-Arabic script tokenizers, and, when ablation keys
	    (``Balochi_BPE_*``, ``Balochi_WP_*``, ``Balochi_SP_*``) are present,
	    a vocabulary-size ablation summary.

	    Args:
	        all_metrics: Mapping of tokenizer key -> metrics dict.
	    """
	    print("\n" + "=" * 110)
	    print(" ANALYSIS & INTERPRETATION")
	    print("=" * 110)

	    def best_worst(key, higher=True):
	        """Return (best_name, worst_name, values) for a numeric metric."""
	        valid = {k: v[key] for k, v in all_metrics.items()
	                 if v.get(key) is not None and isinstance(v[key], (int, float))}
	        if not valid:
	            return None, None, valid
	        b = max(valid, key=valid.get) if higher else min(valid, key=valid.get)
	        w = min(valid, key=valid.get) if higher else max(valid, key=valid.get)
	        return b, w, valid

	    # 1. Compression
	    print("\n 1. COMPRESSION RATIO (higher = better)")
	    print(" " + "─" * 100)
	    b, w, vals = best_worst("compression_ratio")
	    if b:
	        print(f" Best: {b} ({vals[b]:.2f})")
	        print(f" Worst: {w} ({vals[w]:.2f})")

	    # Cross-language comparison: Balochi vs Arabic vs Persian vs Urdu
	    lang_groups = {
	        "Balochi (custom)": ["Balochi_BPE", "Balochi_WordPiece", "Balochi_SentencePiece"],
	        "Arabic (custom)": ["AraBERT_v2", "CAMeLBERT_MSA", "ARBERT", "AraGPT2"],
	        "Persian (custom)": ["ParsBERT", "PersianBERT_FA", "PersianBPE"],
	        "Urdu (custom)": ["UrduBERT"],
	        "Baseline (generic)": ["BERT", "Gemma", "NLTK"],
	    }
	    print()
	    for lang, members in lang_groups.items():
	        active = [m for m in members if m in all_metrics]
	        if active:
	            avg_cr = sum(all_metrics[m]["compression_ratio"] for m in active) / len(active)
	            avg_fe = sum(all_metrics[m]["fertility"] for m in active) / len(active)
	            print(f" {lang:<22} avg compression={avg_cr:.2f} avg fertility={avg_fe:.3f} "
	                  f"(n={len(active)} tokenizers: {', '.join(active)})")

	    # 2. Fertility
	    print("\n\n 2. FERTILITY ANALYSIS (tokens/word — lower = better)")
	    print(" " + "─" * 100)
	    b, w, vals = best_worst("fertility", higher=False)
	    if b:
	        print(f" Best (lowest): {b} ({vals[b]:.3f})")
	        print(f" Worst (highest):{w} ({vals[w]:.3f})")
	    print("\n Tokenizer fertility ranking:")
	    for name, m in sorted(all_metrics.items(), key=lambda x: x[1].get("fertility", 99)):
	        print(f" {name:<35s} {m['fertility']:.3f} tok/word")

	    # 3. UNK Coverage
	    print("\n\n 3. UNKNOWN TOKEN COVERAGE (lower = better)")
	    print(" " + "─" * 100)
	    for name, m in all_metrics.items():
	        cnt = m.get("unk_count", 0)
	        rate = m.get("unk_rate", 0)
	        status = "✓ EXCELLENT" if cnt == 0 else ("⚠ ACCEPTABLE" if rate < 1.0 else "✗ POOR")
	        print(f" {name:<35s} UNK: {cnt:>5,} ({rate:.4f}%) [{status}]")

	    # 4. Speed
	    print("\n\n 4. TOKENIZATION SPEED RANKING")
	    print(" " + "─" * 100)
	    for name, m in sorted(all_metrics.items(), key=lambda x: x[1].get("speed", 0), reverse=True):
	        print(f" {name:<35s} {m['speed']:>12,.0f} tok/sec ({m['time_sec']:.4f}s)")

	    # 5. Script Family Insight
	    print("\n\n 5. PERSO-ARABIC SCRIPT FAMILY INSIGHT")
	    print(" " + "─" * 100)
	    script_family = [n for n in ["Balochi_BPE", "Balochi_WordPiece", "Balochi_SentencePiece",
	                                 "AraBERT_v2", "CAMeLBERT_MSA", "ARBERT", "AraGPT2",
	                                 "ParsBERT", "PersianBERT_FA", "PersianBPE", "UrduBERT"]
	                     if n in all_metrics]
	    if script_family:
	        print(" Compression ratios across Perso-Arabic script tokenizers:")
	        for n in sorted(script_family, key=lambda x: all_metrics[x]["compression_ratio"], reverse=True):
	            cr = all_metrics[n]["compression_ratio"]
	            vs = all_metrics[n]["vocab_size"]
	            vs_str = f"{vs:,}" if vs else "N/A"
	            print(f" {n:<35s} compression={cr:.2f} vocab={vs_str}")

	    # 6. Vocab Size vs Compression (Ablation insight if available)
	    ablation_keys = [k for k in all_metrics
	                     if k.startswith(("Balochi_BPE_", "Balochi_WP_", "Balochi_SP_"))]
	    if ablation_keys:
	        print("\n\n 6. VOCAB SIZE ABLATION SUMMARY")
	        print(" " + "─" * 100)
	        for algo in ["BPE", "WP", "SP"]:
	            group = sorted([k for k in ablation_keys if f"_{algo}_" in k],
	                           key=lambda x: all_metrics[x]["vocab_size"] or 0)
	            if group:
	                print(f"\n Algorithm: {algo}")
	                for n in group:
	                    m = all_metrics[n]
	                    # The sort key above tolerates a missing vocab size, so the
	                    # output must too: "{None:,}" would raise TypeError.
	                    vs = m['vocab_size']
	                    vs_str = f"{vs:,}" if vs else "N/A"
	                    print(f" {n:<35s} vocab={vs_str} "
	                          f"compression={m['compression_ratio']:.2f} "
	                          f"fertility={m['fertility']:.3f}")

	    print(f"\n{'=' * 110}")

	# ============================================================
	# 10. Generate Comprehensive Markdown Report
	# ============================================================

	def generate_markdown_report(text, sentences, all_tokens, all_metrics, output_filenames,
	ablation_metrics=None):
	md = []

	md.append("# Comprehensive Balochi + Cross-Language Tokenizer Comparison\n")
	md.append(f"Generated: {time.strftime('%Y-%m-%d %H:%M:%S')} \n")
	md.append("Script: `Tokenizers_Comparison_Extended.py`\n")

	# Input
	words = text.split()
	md.append("## 1. Input Text Summary\n")
	md.append("\| Property \| Value \|")
	md.append("\|----------\|-------\|")
	md.append(f"\| File \| `liberal capitalism.txt` \|")
	md.append(f"\| Characters \| {len(text):,} \|")
	md.append(f"\| Words \| {len(words):,} \|")
	md.append(f"\| Sentences \| {len(sentences):,} \|\n")

	# Tokenizers loaded
	md.append("## 2. Tokenizers Loaded\n")
	md.append("\| # \| Tokenizer \| Family \| Script \| Vocab Size \| HuggingFace ID \|")
	md.append("\|---\|-----------\|--------\|--------\|------------\|----------------\|")
	tok_meta = [
	("1", "Balochi_BPE", "Balochi", "BPE", "Perso-Arabic", "Local file"),
	("2", "Balochi_WordPiece", "Balochi", "WordPiece", "Perso-Arabic", "Local file"),
	("3", "Balochi_SentencePiece", "Balochi", "SentencePiece","Perso-Arabic", "Local file"),
	("4", "Balochi_30K", "Balochi", "BPE", "Perso-Arabic", "balochiml/balochi-tokenizer"),
	("5", "NLTK", "Baseline", "Rule-based", "Any", "—"),
	("6", "BERT", "Generic", "WordPiece", "Multilingual", "bert-base-multilingual-cased"),
	("7", "Gemma", "Generic", "SP-BPE", "Multilingual", "google/gemma-2b"),
	("8", "AraBERT_v2", "Arabic", "WordPiece", "Arabic", "aubmindlab/bert-base-arabertv2"),
	("9", "CAMeLBERT_MSA", "Arabic", "WordPiece", "Arabic", "CAMeL-Lab/bert-base-arabic-camelbert-msa"),
	("10", "ARBERT", "Arabic", "WordPiece", "Arabic", "UBC-NLP/ARBERT"),
	("11", "AraGPT2", "Arabic", "BPE", "Arabic", "aubmindlab/aragpt2-base"),
	("12", "ParsBERT", "Persian", "WordPiece", "Perso-Arabic", "HooshvareLab/bert-base-parsbert-uncased"),
	("13", "PersianBERT_FA", "Persian", "WordPiece", "Perso-Arabic", "HooshvareLab/bert-fa-base-uncased"),
	("14", "PersianBPE", "Persian", "BPE", "Perso-Arabic", "mshojaei77/PersianBPETokenizer"),
	("15", "UrduBERT", "Urdu", "WordPiece", "Perso-Arabic", "urduhack/UrduBERT"),
	]
	for num, key, family, algo, script, hf_id in tok_meta:
	if key in all_metrics:
	vs = all_metrics[key]["vocab_size"]
	vs_str = f"{vs:,}" if vs else "N/A"
	md.append(f"\| {num} \| {key} \| {family} \| {algo} \| {vs_str} \| `{hf_id}` \|")
	else:
	md.append(f"\| {num} \| ~~{key}~~ \| {family} \| {algo} \| — \| Not loaded \|")
	md.append("")

	# Tokenization results
	md.append("## 3. Tokenization Results\n")
	md.append("\| Tokenizer \| Language \| Tokens \| Speed (tok/s) \| Time (s) \|")
	md.append("\|-----------\|----------\|--------\|---------------\|----------\|")
	lang_map = {
	"Balochi_BPE":"Balochi","Balochi_WordPiece":"Balochi","Balochi_SentencePiece":"Balochi",
	"Balochi_30K":"Balochi","NLTK":"Baseline","BERT":"Multilingual","Gemma":"Multilingual",
	"AraBERT_v2":"Arabic","CAMeLBERT_MSA":"Arabic","ARBERT":"Arabic","AraGPT2":"Arabic",
	"ParsBERT":"Persian","PersianBERT_FA":"Persian","PersianBPE":"Persian","UrduBERT":"Urdu",
	}
	for name, m in all_metrics.items():
	lang = lang_map.get(name, "—")
	fname = output_filenames.get(name, f"tokens_{name.lower()}.txt")
	md.append(f"\| {name} \| {lang} \| {m['token_count']:,} \| {m['speed']:,.0f} \| {m['time_sec']:.4f} \|")
	md.append("")

	# ── Master Metrics Table ───────────────────────────────────
	md.append("## 4. Master Metrics Table\n")
	all_names = list(all_metrics.keys())
	hdr = "\| Metric \|" + "".join(f" {n} \|" for n in all_names)
	sep = "\|--------\|" + "".join("--------\|" for _ in all_names)
	md.append(hdr); md.append(sep)
	rows = [
	("Token Count", "token_count", "{:,}"),
	("Unique Tokens", "unique_tokens", "{:,}"),
	("Vocab Size", "vocab_size", "{}"),
	("Vocab Util. (%)", "vocab_utilization", "{:.2f}%"),
	("Compression Ratio", "compression_ratio", "{:.2f}"),
	("Fertility", "fertility", "{:.3f}"),
	("Avg Token Length", "avg_token_length", "{:.2f}"),
	("UNK Count", "unk_count", "{:,}"),
	("UNK Rate (%)", "unk_rate", "{:.4f}%"),
	("Continuation Rate", "continuation_rate", "{:.2f}%"),
	("Speed (tok/s)", "speed", "{:,.0f}"),
	("Time (s)", "time_sec", "{:.4f}"),
	("Roundtrip Fidelity", "roundtrip_fidelity", "{}"),
	]
	for label, key, fmt in rows:
	row = f"\| {label} \|"
	for n in all_names:
	val = all_metrics[n].get(key)
	if val is None:
	row += " N/A \|"
	elif isinstance(val, bool):
	row += f" {'✓' if val else '✗'} \|"
	else:
	try:
	row += f" {fmt.format(val)} \|"
	except Exception:
	row += f" {val} \|"
	md.append(row)
	md.append("")

	# ── Group Comparisons ─────────────────────────────────────
	groups = [
	("5", "Group A: WordPiece Family", ["Balochi_WordPiece","AraBERT_v2","CAMeLBERT_MSA","ARBERT","BERT"]),
	("6", "Group B: SP/WP — Balochi vs Persian vs Gemma", ["Balochi_SentencePiece","ParsBERT","PersianBERT_FA","Gemma"]),
	("7", "Group C: BPE Family", ["Balochi_BPE","AraGPT2","PersianBPE","Balochi_30K","NLTK"]),
	("8", "Group D: Perso-Arabic Script", ["UrduBERT","Balochi_BPE","Balochi_WordPiece","AraBERT_v2","ParsBERT"]),
	("9", "Group E1: Balochi WP vs BERT", ["Balochi_WordPiece","BERT"]),
	("10", "Group E2: Balochi SP vs Gemma", ["Balochi_SentencePiece","Gemma"]),
	("11", "Group E3: Balochi BPE vs NLTK vs 30K", ["Balochi_BPE","NLTK","Balochi_30K"]),
	]
	for sec, title, members in groups:
	active = [m for m in members if m in all_metrics]
	if len(active) < 2:
	continue
	md.append(f"## {sec}. {title}\n")
	hdr = "\| Metric \|" + "".join(f" {n} \|" for n in active)
	sep = "\|--------\|" + "".join("--------\|" for _ in active)
	md.append(hdr); md.append(sep)
	for label, key, fmt in rows:
	row = f"\| {label} \|"
	for n in active:
	val = all_metrics[n].get(key)
	if val is None:
	row += " N/A \|"
	elif isinstance(val, bool):
	row += f" {'✓' if val else '✗'} \|"
	else:
	try:
	row += f" {fmt.format(val)} \|"
	except Exception:
	row += f" {val} \|"
	md.append(row)
	md.append("")

	# ── Ablation section ─────────────────────────────────────
	if ablation_metrics:
	md.append("## 12. Vocabulary Size Ablation Results\n")
	md.append("> Tokenizers trained at different vocabulary sizes from the "
	"`Balochi_Tokenizer_Vocab_Ablation.ipynb` notebook.\n")
	ab_names = list(ablation_metrics.keys())
	hdr = "\| Metric \|" + "".join(f" {n} \|" for n in ab_names)
	sep = "\|--------\|" + "".join("--------\|" for _ in ab_names)
	md.append(hdr); md.append(sep)
	for label, key, fmt in rows:
	row = f"\| {label} \|"
	for n in ab_names:
	val = ablation_metrics[n].get(key)
	if val is None:
	row += " N/A \|"
	elif isinstance(val, bool):
	row += f" {'✓' if val else '✗'} \|"
	else:
	try:
	row += f" {fmt.format(val)} \|"
	except Exception:
	row += f" {val} \|"
	md.append(row)
	md.append("")

	# ── Analysis ─────────────────────────────────────────────
	md.append("## 13. Analysis & Interpretation\n")
	md.append("### 13.1 Cross-Language Compression Comparison\n")

	lang_groups = {
	"Balochi (custom)": ["Balochi_BPE","Balochi_WordPiece","Balochi_SentencePiece"],
	"Arabic (custom)": ["AraBERT_v2","CAMeLBERT_MSA","ARBERT","AraGPT2"],
	"Persian (custom)": ["ParsBERT","PersianBERT_FA","PersianBPE"],
	"Urdu (custom)": ["UrduBERT"],
	"Baseline (generic)":["BERT","Gemma","NLTK"],
	}
	md.append("\| Language Group \| Avg Compression \| Avg Fertility \| Tokenizers \|")
	md.append("\|----------------\|-----------------\|---------------\|------------\|")
	for lang, members in lang_groups.items():
	active = [m for m in members if m in all_metrics]
	if active:
	avg_cr = sum(all_metrics[m]["compression_ratio"] for m in active) / len(active)
	avg_fe = sum(all_metrics[m]["fertility"] for m in active) / len(active)
	md.append(f"\| {lang} \| {avg_cr:.2f} \| {avg_fe:.3f} \| {', '.join(active)} \|")
	md.append("")

	md.append("### 13.2 Key Findings\n")
	md.append("- Domain specificity advantage: Custom Balochi tokenizers are expected to "
	"outperform generic multilingual tokenizers (BERT, Gemma) on Balochi text by "
	"producing fewer subword fragments and lower fertility.\n")
	md.append("- Script-family proximity: Arabic and Persian tokenizers share the same "
	"Perso-Arabic script family as Balochi, making their fertility and UNK rates "
	"the most meaningful external benchmarks — more so than mBERT or Gemma.\n")
	md.append("- Vocabulary size effect: Larger vocabulary (80K–128K) generally reduces "
	"fertility but increases memory overhead. The optimal point for Balochi is "
	"determined by the Rényi efficiency ablation in the companion notebook.\n")
	md.append("- AraBERT v2 (64K WP) and ParsBERT (100K WP) serve as the primary "
	"WordPiece upper-bound references — both trained on 70M+ token Perso-Arabic corpora.\n")
	md.append("- AraGPT2 (BPE 50K) provides the cleanest Arabic BPE reference for "
	"comparison against Balochi BPE (80K) in the BPE group.\n")
	md.append("- UrduBERT is the most linguistically proximate external tokenizer to "
	"Balochi due to shared Nastaliq script conventions and similar morphological "
	"complexity.\n")

	md.append("### 13.3 Roundtrip Fidelity\n")
	md.append("\| Tokenizer \| Fidelity \|")
	md.append("\|-----------\|----------\|")
	for name, m in all_metrics.items():
	fid = m.get("roundtrip_fidelity")
	md.append(f"\| {name} \| {'✓ Lossless' if fid is True else ('✗ Lossy' if fid is False else '— N/A')} \|")
	md.append("")

	md.append("## 14. Tokenizer Selection Guide\n")
	md.append("\| Use Case \| Recommended Tokenizer \| Rationale \|")
	md.append("\|----------\|----------------------\|-----------\|")
	guide = [
	("Balochi BERT fine-tuning", "Balochi_WordPiece 64K", "Native WP; matches BERT architecture exactly"),
	("Balochi GPT/Gemma CPT", "Balochi_BPE 80K", "BPE aligns with GPT-2/Gemma training conventions"),
	("Balochi SentencePiece pipeline","Balochi_SP 64K", "Zero UNK via byte_fallback; SP models (T5, mT5)"),
	("Arabic NER / SA tasks", "AraBERT v2 or CAMeLBERT", "Proven Arabic BERT baselines"),
	("Arabic text generation", "AraGPT2", "Arabic GPT-2 with BPE tokenizer"),
	("Persian BERT tasks", "ParsBERT", "Standard Persian BERT baseline"),
	("Persian text analysis", "PersianBERT_FA", "HooshvareLab FA-base, widely used"),
	("Urdu NLP tasks", "UrduBERT", "Nastaliq script; same script family as Balochi"),
	("Cross-lingual baseline", "BERT Multilingual 119K", "Covers 104 languages for transfer comparison"),
	("Word-count baseline", "NLTK", "Raw word count before subword splitting"),
	]
	for row in guide:
	md.append(f"\| {row[0]} \| {row[1]} \| {row[2]} \|")
	md.append("")

	md.append("## 15. Citation\n")
	md.append("```bibtex")
	md.append("@misc{hafeezullah2025balochi,")
	md.append(" title = {Comprehensive Balochi Tokenizer Comparison: Custom vs. Cross-Language Baselines},")
	md.append(" author = {Hafeez Ullah},")
	md.append(" year = {2025},")
	md.append(" url = {https://huggingface.co/balochiml},")
	md.append(" note = {University of Gwadar, Department of Computer Science}")
	md.append("}")
	md.append("```\n")

	# Save
	md_content = "\n".join(md)
	md_path = os.path.join(OUTPUT_DIR, "Tokenizer_Comparison_Extended_Report.md")
	with open(md_path, "w", encoding="utf-8") as f:
	f.write(md_content)
	return md_path

	# ============================================================
	# MAIN
	# ============================================================

	def main():
	    """Run the full tokenizer comparison pipeline from the command line.

	    Steps, in order:
	      1. Load the input text and derive word / sentence lists.
	      2. Load the primary tokenizers.
	      3. Load the optional vocabulary-ablation tokenizers.
	      4. Tokenize the text with every loaded tokenizer, compute metrics
	         and hand each token list to ``save_tokens``.
	      5-7. Print sample tokens, comparison tables and the analysis
	         (primary tokenizers only).
	      8. Generate the Markdown report.
	    A final summary lists each expected output file with its size.

	    Exits the process with status 1 (via ``sys.exit``) when no primary
	    tokenizer could be loaded.
	    """
	    print("╔" + "═" * 108 + "╗")
	    print("║" + " BALOCHI + CROSS-LANGUAGE TOKENIZER COMPARISON (EXTENDED) ".center(108) + "║")
	    print("╚" + "═" * 108 + "╝")

	    # Step 1 — Load text
	    print("\n[STEP 1] Loading input text...")
	    text = load_text(INPUT_FILE)
	    words = text.split()
	    sentences = split_sentences(text)
	    print(f" Characters : {len(text):,}")
	    print(f" Words : {len(words):,}")
	    print(f" Sentences : {len(sentences):,}")

	    # Step 2 — Load tokenizers
	    print("\n[STEP 2] Loading primary tokenizers (15 total)...")
	    toks = load_all_tokenizers()
	    print(f"\n ✓ Loaded {len(toks)}/15 primary tokenizers.")

	    # Step 3 — Load ablation tokenizers
	    print("\n[STEP 3] Loading ablation tokenizers (vocab-size study)...")
	    ablation_toks = load_ablation_tokenizers()
	    if ablation_toks:
	        print(f" ✓ Loaded {len(ablation_toks)} ablation tokenizers.")
	    else:
	        print(" ℹ No ablation tokenizer files found — run the notebook first to generate them.")

	    # Ablation tokenizers are optional, primary ones are not: without at
	    # least one primary tokenizer there is nothing to compare.
	    if not toks:
	        print("ERROR: No tokenizers loaded. Exiting.")
	        sys.exit(1)

	    # Step 4 — Tokenize
	    print("\n[STEP 4] Running tokenization...")
	    all_tokens = {}
	    all_metrics = {}
	    ablation_metrics = {}

	    # Maps tokenizer name -> token dump filename, for primary and ablation
	    # tokenizers alike; consumed by the report and by the final summary.
	    output_filenames = {}
	    # Fixed filenames for the primary tokenizers; names missing from this
	    # table fall back to "tokens_<lowercased name>.txt".
	    name_map = {
	        "Balochi_BPE": "tokens_balochi_bpe.txt",
	        "Balochi_WordPiece": "tokens_balochi_wordpiece.txt",
	        "Balochi_SentencePiece": "tokens_balochi_sentencepiece.txt",
	        "Balochi_30K": "tokens_balochi_30k.txt",
	        "NLTK": "tokens_nltk.txt",
	        "BERT": "tokens_bert_multilingual.txt",
	        "Gemma": "tokens_gemma.txt",
	        "AraBERT_v2": "tokens_arabert_v2.txt",
	        "CAMeLBERT_MSA": "tokens_camelbert_msa.txt",
	        "ARBERT": "tokens_arbert.txt",
	        "AraGPT2": "tokens_aragpt2.txt",
	        "ParsBERT": "tokens_parsbert.txt",
	        "PersianBERT_FA": "tokens_persianbert_fa.txt",
	        "PersianBPE": "tokens_persian_bpe.txt",
	        "UrduBERT": "tokens_urdubert.txt",
	    }

	    for name, tok_info in toks.items():
	        print(f" Tokenizing {name}...", end=" ", flush=True)
	        tokens, elapsed = tokenize_text(name, tok_info, text)
	        all_tokens[name] = tokens
	        all_metrics[name] = compute_metrics(name, tok_info, tokens, elapsed, text)
	        fname = name_map.get(name, f"tokens_{name.lower()}.txt")
	        output_filenames[name] = fname
	        save_tokens(fname, tokens)
	        print(f"Done! ({len(tokens):,} tokens in {elapsed:.4f}s)")

	    # Ablation runs only contribute metrics and a token dump; their token
	    # lists are not kept in all_tokens.
	    for name, tok_info in ablation_toks.items():
	        print(f" [Ablation] Tokenizing {name}...", end=" ", flush=True)
	        tokens, elapsed = tokenize_text(name, tok_info, text)
	        ablation_metrics[name] = compute_metrics(name, tok_info, tokens, elapsed, text)
	        fname = f"tokens_{name.lower()}.txt"
	        output_filenames[name] = fname
	        save_tokens(fname, tokens)
	        print(f"Done! ({len(tokens):,} tokens in {elapsed:.4f}s)")

	    # Step 5 — Sample output
	    print("\n[STEP 5] Sample token output...")
	    print_sample_tokens(toks, sentences)

	    # Step 6 — Comparison tables (primary only)
	    print("\n[STEP 6] Comparison tables...")
	    print_all_comparisons(all_metrics)

	    # Step 7 — Analysis
	    print("\n[STEP 7] Analysis & interpretation...")
	    analyze_results(all_metrics)

	    # Step 8 — Markdown report
	    print("\n[STEP 8] Generating Markdown report...")
	    md_path = generate_markdown_report(
	        text, sentences, all_tokens, all_metrics, output_filenames,
	        # An empty dict is passed as None, exactly as before.
	        ablation_metrics=ablation_metrics or None,
	    )
	    print(f" ✓ Report saved: {md_path}")

	    # Final summary
	    print(f"\n{'═' * 110}")
	    print(f" OUTPUT → {OUTPUT_DIR}")
	    print(f"{'═' * 110}")
	    # Merge both tokenizer dicts and iterate the keys (names). The previous
	    # `{toks, ablation_toks}` was a *set* literal holding two dicts, which
	    # raises TypeError because dicts are unhashable.
	    for name in {**toks, **ablation_toks}:
	        fname = output_filenames.get(name, f"tokens_{name.lower()}.txt")
	        fpath = os.path.join(OUTPUT_DIR, fname)
	        # A missing file is reported as 0 bytes instead of aborting the summary.
	        size = os.path.getsize(fpath) if os.path.exists(fpath) else 0
	        print(f" ✓ {fname:<50s} ({size:>10,} bytes)")
	    md_size = os.path.getsize(md_path) if os.path.exists(md_path) else 0
	    print(f" ✓ {'Tokenizer_Comparison_Extended_Report.md':<50s} ({md_size:>10,} bytes)")
	    print(f"{'═' * 110}")
	    print(f"\n ✅ ALL DONE — {len(toks)} primary + {len(ablation_toks)} ablation tokenizers compared.\n")


	# Script entry point: call main() only when this file is executed directly,
	# not when it is imported as a module.
	if __name__ == "__main__":
	main()