# -*- coding: utf-8 -*-
"""
Comprehensive Balochi + Cross-Language Tokenizer Comparison & Evaluation
=========================================================================
Extended version of the original Balochi tokenizer comparison script.

Tokenizers (15 total — original 7 + 8 new):

━━━ BALOCHI (Custom-Trained) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
   1. Balochi BPE (80K)            – Custom HF tokenizers BPE
   2. Balochi WordPiece (64K)      – Custom HF tokenizers WordPiece
   3. Balochi SentencePiece (64K)  – Custom Google SentencePiece
   4. Balochi 30K                  – balochiml/balochi-tokenizer (HuggingFace)

━━━ BASELINE ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
   5. NLTK                         – Rule-based word tokenizer
   6. BERT Multilingual            – bert-base-multilingual-cased (WordPiece 119K)
   7. Gemma                        – google/gemma-2b (SentencePiece BPE 256K)

━━━ ARABIC (Custom-Trained) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
   8. AraBERT v2                   – aubmindlab/bert-base-arabertv2 (WP 64K)
   9. CAMeLBERT-MSA                – CAMeL-Lab/bert-base-arabic-camelbert-msa (WP 30K)
  10. ARBERT                       – UBC-NLP/ARBERT (WP 100K)
  11. AraGPT2                      – aubmindlab/aragpt2-base (BPE 50K)

━━━ PERSIAN (Custom-Trained) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
  12. ParsBERT                     – HooshvareLab/bert-base-parsbert-uncased (WP 100K)
  13. PersianBERT (HooshvareBase)  – HooshvareLab/bert-fa-base-uncased (WP 100K)
  14. PersianBPETokenizer          – mshojaei77/PersianBPETokenizer (BPE)

━━━ URDU (Custom-Trained) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
  15. UrduBERT                     – urduhack/UrduBERT (WP ~60K)
      Note: if the primary repo is unavailable the loader tries, in order,
      iamxds/UrduBERT-base, flax-community/roberta-base-mr and
      uer/roberta-base-finetuned-tnews-chinese. The last two are not Urdu
      tokenizers; results obtained through them are not Urdu baselines.

━━━ VOCAB ABLATION (from Notebook) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
  NOTE: Ablation tokenizers (BPE/WP/SP at 32K, 47K, 64K, 80K, 128K) are
  loaded from local .json/.model files under Output/Ablation/Models/.
  If the files are not present, these entries are skipped gracefully.

Comparison Groups:
  Group A: Balochi WordPiece vs AraBERT v2 vs CAMeLBERT vs ARBERT vs BERT (WordPiece family)
  Group B: Balochi SP vs ParsBERT vs PersianBERT vs Gemma (SentencePiece/WP)
  Group C: Balochi BPE vs AraGPT2 vs PersianBPE vs 30K-Balochi vs NLTK (BPE/Rule family)
  Group D: UrduBERT vs Balochi BPE vs Balochi WordPiece vs AraBERT v2 vs ParsBERT (Perso-Arabic script)
  Group E: Original 7 Balochi comparison (WordPiece / SentencePiece / BPE groups)
  Group F: Vocab Ablation — same algorithm, different vocab sizes
"""

import os
import sys
import time
import re
import unicodedata

# Fix Windows console encoding
if sys.platform == "win32":
    sys.stdout.reconfigure(encoding='utf-8', errors='replace')
    sys.stderr.reconfigure(encoding='utf-8', errors='replace')

# ============================================================
# 0. Install & Import Dependencies
# ============================================================


def install_if_missing(package, pip_name=None):
    """Install a package with pip if it cannot be imported.

    Args:
        package: Importable module name, probed with ``__import__``.
        pip_name: Distribution name passed to pip when it differs from
            ``package``; defaults to ``package``.

    Raises:
        subprocess.CalledProcessError: If the pip command exits non-zero.
    """
    try:
        __import__(package)
    except ImportError:
        import importlib
        import subprocess

        subprocess.check_call(
            [sys.executable, "-m", "pip", "install", pip_name or package, "-q"]
        )
        # The package was installed while this interpreter is running; the
        # path finders cache directory contents, so refresh them before the
        # module-level imports that follow try to load the new package.
        importlib.invalidate_caches()


install_if_missing("tokenizers")
install_if_missing("sentencepiece")
install_if_missing("transformers")
install_if_missing("nltk")
install_if_missing("huggingface_hub")

import sentencepiece as spm
from tokenizers import Tokenizer
from transformers import AutoTokenizer, BertTokenizer
import nltk

nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
from nltk.tokenize import word_tokenize

# ============================================================
# 1. 
# Path Configuration
# ============================================================
# All paths are resolved relative to the directory containing this script.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
BASE_DIR = SCRIPT_DIR
# Evaluation corpus; lives in a sibling "Tokens" directory one level up.
INPUT_FILE = os.path.join(BASE_DIR, "..", "Tokens", "liberal capitalism.txt")
TOKENIZERS_DIR = os.path.join(BASE_DIR, "Tokenizers")
OUTPUT_DIR = os.path.join(SCRIPT_DIR, "Output")
# Created at import time so later writers can assume the directory exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ── Balochi local models ─────────────────────────────────────
# The three "main" Balochi tokenizers are the same files as the matching
# entries in ABLATION_MODELS below (bpe_80000, wordpiece_64000,
# sentencepiece_64000).
BPE_80K = os.path.join(OUTPUT_DIR, "Ablation", "Models", "bpe_80000", "bpe_80000.json")
WP_64K = os.path.join(OUTPUT_DIR, "Ablation", "Models", "wordpiece_64000", "wordpiece_64000.json")
SP_64K = os.path.join(OUTPUT_DIR, "Ablation", "Models", "sentencepiece_64000", "sentencepiece_64000.model")

# ── Ablation tokenizer paths (from notebook training) ────────
# Maps a display key to the model file for each algorithm / vocab size.
# ".json" files are HF `tokenizers` models, ".model" files are SentencePiece.
ABLATION_MODELS = {
    "Balochi_BPE_32K": os.path.join(OUTPUT_DIR, "Ablation", "Models", "bpe_32000", "bpe_32000.json"),
    "Balochi_BPE_47K": os.path.join(OUTPUT_DIR, "Ablation", "Models", "bpe_47000", "bpe_47000.json"),
    "Balochi_BPE_64K": os.path.join(OUTPUT_DIR, "Ablation", "Models", "bpe_64000", "bpe_64000.json"),
    "Balochi_BPE_80K": os.path.join(OUTPUT_DIR, "Ablation", "Models", "bpe_80000", "bpe_80000.json"),
    "Balochi_BPE_128K": os.path.join(OUTPUT_DIR, "Ablation", "Models", "bpe_128000", "bpe_128000.json"),
    "Balochi_WP_32K": os.path.join(OUTPUT_DIR, "Ablation", "Models", "wordpiece_32000", "wordpiece_32000.json"),
    "Balochi_WP_47K": os.path.join(OUTPUT_DIR, "Ablation", "Models", "wordpiece_47000", "wordpiece_47000.json"),
    "Balochi_WP_64K": os.path.join(OUTPUT_DIR, "Ablation", "Models", "wordpiece_64000", "wordpiece_64000.json"),
    "Balochi_WP_80K": os.path.join(OUTPUT_DIR, "Ablation", "Models", "wordpiece_80000", "wordpiece_80000.json"),
    "Balochi_WP_128K": os.path.join(OUTPUT_DIR, "Ablation", "Models", "wordpiece_128000", "wordpiece_128000.json"),
    "Balochi_SP_32K": os.path.join(OUTPUT_DIR, "Ablation", "Models", "sentencepiece_32000", "sentencepiece_32000.model"),
    "Balochi_SP_47K": os.path.join(OUTPUT_DIR, "Ablation", "Models", "sentencepiece_47000", "sentencepiece_47000.model"),
    "Balochi_SP_64K": os.path.join(OUTPUT_DIR, "Ablation", "Models", "sentencepiece_64000", "sentencepiece_64000.model"),
    "Balochi_SP_80K": os.path.join(OUTPUT_DIR, "Ablation", "Models", "sentencepiece_80000", "sentencepiece_80000.model"),
    "Balochi_SP_128K": os.path.join(OUTPUT_DIR, "Ablation", "Models", "sentencepiece_128000", "sentencepiece_128000.model"),
}

# ── HuggingFace remote IDs ────────────────────────────────────
BALOCHI_30K_REPO = "balochiml/balochi-tokenizer"
BALOCHI_30K_FILENAME = "models/30k-balochi-tokenizer.json"

# ── HF token for gated models (Gemma) ─────────────────────────
# Read from the HF_TOKEN environment variable. When the variable is unset the
# value is the literal placeholder string below, which is not a credential.
HF_TOKEN = os.environ.get("HF_TOKEN", "YOUR_HF_TOKEN_HERE")

# ============================================================
# 2. Load Input Text
# ============================================================
def normalize_balochi(text: str, drop_diacritics: bool = True, preserve_ye: bool = True) -> str:
    """
    Balochi text normalization pipeline — adapted from AraToken methodology.

    Steps visible in this function, in order: NFKC normalization, Hamza-carrying
    Alif variants folded to bare Alif, optional Urdu Ye → Farsi Ye folding
    (only when ``preserve_ye`` is False), Arabic-Indic digits mapped to ASCII
    digits, Arabic question mark / semicolon / comma mapped to their ASCII
    forms, and removal of Tatweel, ZWNJ, ZWJ, RLM and ALM characters.

    NOTE(review): the diacritics step and everything after it are missing from
    this copy of the file (see the note at the end of the body), so the
    remaining behaviour and the return statement cannot be documented here.
    """
    text = unicodedata.normalize('NFKC', text)
    text = re.sub(r'[أإآٱ]', 'ا', text)  # Hamza variants → bare Alif
    if not preserve_ye:
        text = text.replace('ے', 'ی')  # Urdu Ye → Farsi Ye (collapse)
    arabic_indic = str.maketrans('٠١٢٣٤٥٦٧٨٩', '0123456789')
    text = text.translate(arabic_indic)
    text = text.replace('؟', '?').replace('؛', ';').replace('،', ',')
    text = text.replace('\u0640', '')  # Kashida/Tatweel
    # Balochi invisible chars
    text = text.replace('\u200C', '')  # ZWNJ
    text = text.replace('\u200D', '')  # ZWJ
    text = text.replace('\u200F', '')  # RLM
    text = text.replace('\u061C', '')  # ALM
    if drop_diacritics:
        # NOTE(review): the source is truncated inside the regex on the next
        # line. Everything between "(?" and " 10]" has been lost — presumably
        # the rest of this function plus the code that reads INPUT_FILE and
        # builds the sentence list. Restore it from version control before
        # running this file; the line is kept exactly as found.
        text = re.sub(r'(? 10]
# ============================================================
# 3. 
# Load All Tokenizers
# ============================================================

# Value HF_TOKEN takes when the environment variable is unset. It is a
# placeholder, not a credential, and must never be sent to the Hub.
_HF_TOKEN_PLACEHOLDER = "YOUR_HF_TOKEN_HERE"

# Repositories whose names identify them as Urdu tokenizers. The remaining
# UrduBERT fallbacks are a Marathi and a Chinese RoBERTa checkpoint.
# NOTE(review): confirm iamxds/UrduBERT-base really hosts an Urdu vocabulary.
_URDU_REPOS = ("urduhack/UrduBERT", "iamxds/UrduBERT-base")


def _resolve_hf_token():
    """Return the configured Hugging Face token, or None if unset/placeholder."""
    token = HF_TOKEN
    if not token or token == _HF_TOKEN_PLACEHOLDER:
        return None
    return token


def _try_hf_tokenizer(key, label, repo_id, fallback_repos=None, tok_dict=None, idx=None, total=None):
    """Load a Hugging Face ``AutoTokenizer``, trying fallbacks in order.

    Args:
        key: Unused; kept for call-site compatibility.
        label: Human-readable name stored in the returned entry and printed.
        repo_id: Primary Hub repository to load.
        fallback_repos: Optional list of repositories tried after ``repo_id``.
        tok_dict: Unused; kept for call-site compatibility.
        idx: Optional position used in the "[idx/total]" progress prefix.
        total: Total count shown in the progress prefix.

    Returns:
        A dict with keys ``type`` ("transformers"), ``obj``, ``vocab_size``,
        ``hf_id`` (the repository that actually loaded) and ``label``, or
        ``None`` if every repository failed. Failures are printed, not raised.
    """
    prefix = f" [{idx}/{total}]" if idx else " "
    print(f"{prefix} Loading {label} ({repo_id})...")
    # Sending the placeholder as a bearer token makes the Hub reject requests
    # even for public repositories; None lets the library use its own
    # credential lookup (or anonymous access).
    token = _resolve_hf_token()
    for repo in [repo_id, *(fallback_repos or [])]:
        try:
            tok = AutoTokenizer.from_pretrained(repo, token=token)
            vs = tok.vocab_size
            print(f" ✓ Loaded from {repo} (vocab: {vs:,})")
            return {"type": "transformers", "obj": tok, "vocab_size": vs, "hf_id": repo, "label": label}
        except Exception as e:
            # Best-effort by design: report and move on to the next candidate.
            print(f" ✗ {repo} failed: {str(e)[:80]}")
    return None


def _load_local_entry(path, label):
    """Build a registry entry from a local tokenizer file, or None if absent.

    ``.model`` files are loaded with SentencePiece; anything else is loaded as
    an HF ``tokenizers`` JSON file. A missing file is reported and skipped.
    """
    if not os.path.exists(path):
        print(f" ✗ NOT FOUND: {path}")
        return None
    if path.endswith(".model"):
        sp = spm.SentencePieceProcessor()
        sp.load(path)
        entry = {"type": "sentencepiece", "obj": sp, "vocab_size": sp.get_piece_size(), "label": label}
    else:
        obj = Tokenizer.from_file(path)
        entry = {"type": "hf_tokenizers", "obj": obj, "vocab_size": obj.get_vocab_size(), "label": label}
    print(f" ✓ vocab: {entry['vocab_size']:,}")
    return entry


def load_all_tokenizers():
    """Load the 15 comparison tokenizers and return them keyed by name.

    Tokenizers that cannot be loaded (missing local file, Hub failure) are
    reported on stdout and left out of the result. Insertion order is the
    loading order: local Balochi models, NLTK, BERT, Gemma, Balochi 30K,
    Arabic, Persian, Urdu.

    Returns:
        dict mapping tokenizer key to an entry dict with at least ``type``,
        ``obj``, ``vocab_size`` and ``label``.
    """
    toks = {}
    total = 15

    def add_remote(idx, helper_key, toks_key, label, repo, fallbacks=None):
        # Registers the tokenizer under toks_key only if some repo loaded.
        result = _try_hf_tokenizer(helper_key, label, repo,
                                   fallback_repos=fallbacks, idx=idx, total=total)
        if result:
            toks[toks_key] = result
        return result

    # ── 1-3. Local Balochi models ─────────────────────────────
    local_specs = [
        (1, "Balochi_BPE", BPE_80K, "Balochi BPE (80K)", "Balochi BPE 80K"),
        (2, "Balochi_WordPiece", WP_64K, "Balochi WordPiece (64K)", "Balochi WordPiece 64K"),
        (3, "Balochi_SentencePiece", SP_64K, "Balochi SentencePiece (64K)", "Balochi SP 64K"),
    ]
    for idx, toks_key, path, announce, label in local_specs:
        print(f" [{idx}/{total}] Loading {announce}...")
        entry = _load_local_entry(path, label)
        if entry is not None:
            toks[toks_key] = entry

    # ── 4. NLTK ──────────────────────────────────────────────
    print(f" [4/{total}] Loading NLTK word_tokenize...")
    toks["NLTK"] = {"type": "nltk", "obj": None, "vocab_size": None, "label": "NLTK (rule-based)"}
    print(" ✓ Ready")

    # ── 5. BERT Multilingual ──────────────────────────────────
    add_remote(5, "BERT", "BERT", "BERT Multilingual", "bert-base-multilingual-cased")

    # ── 6. Gemma ─────────────────────────────────────────────
    add_remote(6, "Gemma", "Gemma", "Gemma 2B", "google/gemma-2b", ["google/gemma-7b"])

    # ── 7. Balochi 30K (HuggingFace Hub) ─────────────────────
    print(f" [7/{total}] Loading Balochi 30K (balochiml/balochi-tokenizer)...")
    try:
        from huggingface_hub import hf_hub_download
        lpath = hf_hub_download(repo_id=BALOCHI_30K_REPO, filename=BALOCHI_30K_FILENAME,
                                token=_resolve_hf_token())
        obj = Tokenizer.from_file(lpath)
        toks["Balochi_30K"] = {"type": "hf_tokenizers", "obj": obj,
                               "vocab_size": obj.get_vocab_size(), "label": "Balochi 30K"}
        print(f" ✓ vocab: {toks['Balochi_30K']['vocab_size']:,}")
    except Exception as e:
        print(f" ✗ Failed: {str(e)[:80]}")

    # ─────────────────────────────────────────────────────────
    # ARABIC TOKENIZERS (8-11)
    # ─────────────────────────────────────────────────────────
    add_remote(8, "AraBERT", "AraBERT_v2", "AraBERT v2", "aubmindlab/bert-base-arabertv2",
               ["aubmindlab/bert-large-arabertv02"])
    add_remote(9, "CAMeLBERT", "CAMeLBERT_MSA", "CAMeLBERT-MSA",
               "CAMeL-Lab/bert-base-arabic-camelbert-msa",
               ["CAMeL-Lab/bert-base-arabic-camelbert-msa-quarter"])
    add_remote(10, "ARBERT", "ARBERT", "ARBERT (100K)", "UBC-NLP/ARBERT", ["UBC-NLP/MARBERTv2"])
    add_remote(11, "AraGPT2", "AraGPT2", "AraGPT2 Base", "aubmindlab/aragpt2-base",
               ["aubmindlab/aragpt2-mega"])

    # ─────────────────────────────────────────────────────────
    # PERSIAN TOKENIZERS (12-14)
    # ─────────────────────────────────────────────────────────
    add_remote(12, "ParsBERT", "ParsBERT", "ParsBERT", "HooshvareLab/bert-base-parsbert-uncased")
    add_remote(13, "PersianBERT_FA", "PersianBERT_FA", "PersianBERT FA-Base",
               "HooshvareLab/bert-fa-base-uncased", ["HooshvareLab/bert-fa-zwnj-base-uncased"])
    add_remote(14, "PersianBPE", "PersianBPE", "Persian BPE Tokenizer",
               "mshojaei77/PersianBPETokenizer")

    # ─────────────────────────────────────────────────────────
    # URDU TOKENIZER (15)
    # ─────────────────────────────────────────────────────────
    result = add_remote(15, "UrduBERT", "UrduBERT", "UrduBERT", "urduhack/UrduBERT", [
        "iamxds/UrduBERT-base",
        "flax-community/roberta-base-mr",
        "uer/roberta-base-finetuned-tnews-chinese",
    ])
    if result:
        if result["hf_id"] in _URDU_REPOS:
            result["label"] = "UrduBERT"
        else:
            # The slot was filled by a non-Urdu checkpoint. Keep the key so the
            # comparison groups still resolve, but do not present the numbers
            # as an Urdu baseline.
            result["label"] = f"UrduBERT (fallback: {result['hf_id']})"
            print(f" ⚠ UrduBERT slot filled by non-Urdu fallback {result['hf_id']}; "
                  f"do not read its metrics as an Urdu baseline")

    return toks


def load_ablation_tokenizers():
    """
    Load vocabulary-size ablation tokenizers from local paths.

    These are produced by the Balochi_Tokenizer_Vocab_Ablation notebook.
    Entries whose file does not exist are skipped silently; entries whose file
    exists but fails to load are reported on stdout and skipped.

    Returns:
        dict mapping each ABLATION_MODELS key that loaded to an entry dict with
        ``type``, ``obj``, ``vocab_size`` and ``label`` (the key itself).
    """
    ablation = {}
    for key, path in ABLATION_MODELS.items():
        if not os.path.exists(path):
            continue  # Not yet trained — skip silently
        try:
            if path.endswith(".json"):
                obj = Tokenizer.from_file(path)
                vs = obj.get_vocab_size()
                ablation[key] = {"type": "hf_tokenizers", "obj": obj, "vocab_size": vs, "label": key}
                print(f" ✓ Ablation {key} (vocab: {vs:,})")
            elif path.endswith(".model"):
                sp = spm.SentencePieceProcessor()
                sp.load(path)
                ablation[key] = {"type": "sentencepiece", "obj": sp,
                                 "vocab_size": sp.get_piece_size(), "label": key}
                print(f" ✓ Ablation {key} (vocab: {sp.get_piece_size():,})")
        except Exception as e:
            print(f" ✗ Ablation {key} failed: {str(e)[:60]}")
    return ablation


# ============================================================
# 4. 
# Tokenization Engine
# ============================================================
def tokenize_text(name, tok_info, text):
    """Tokenize ``text`` with one registry entry and time the call.

    Args:
        name: Tokenizer key; used only in the failure message.
        tok_info: Registry entry; ``tok_info["type"]`` selects the backend and
            ``tok_info["obj"]`` is the tokenizer object (``None`` for NLTK).
        text: Text to tokenize.

    Returns:
        ``(tokens, elapsed)`` where ``tokens`` is a list of token strings and
        ``elapsed`` is wall time in seconds from ``time.perf_counter``.
        An unknown type, or a failing ``transformers`` tokenizer, yields ``[]``.
    """
    tok_type = tok_info["type"]
    obj = tok_info["obj"]
    start = time.perf_counter()
    if tok_type == "hf_tokenizers":
        tokens = obj.encode(text).tokens
    elif tok_type == "sentencepiece":
        tokens = obj.encode_as_pieces(text)
    elif tok_type == "nltk":
        tokens = word_tokenize(text)
    elif tok_type == "transformers":
        try:
            # Only the first 100K characters are tokenized for this backend,
            # so its token count covers less text than the other backends
            # whenever the input is longer than that.
            tokens = obj.tokenize(text[:100000])  # cap at 100K chars for speed
        except Exception as e:
            print(f"\n ✗ Tokenization failed for {name}: {e}")
            tokens = []
    else:
        tokens = []
    elapsed = time.perf_counter() - start
    return tokens, elapsed


def decode_text(name, tok_info, text_snippet):
    """Encode then decode ``text_snippet`` for the round-trip fidelity check.

    Args:
        name: Tokenizer key (unused here; kept for a uniform signature).
        tok_info: Registry entry with ``type`` and ``obj``.
        text_snippet: Text to round-trip.

    Returns:
        The decoded string, the space-joined word tokens for NLTK, or ``None``
        when the type is unknown or the backend raises.
    """
    tok_type = tok_info["type"]
    obj = tok_info["obj"]
    try:
        if tok_type == "hf_tokenizers":
            enc = obj.encode(text_snippet)
            return obj.decode(enc.ids)
        if tok_type == "sentencepiece":
            return obj.decode(obj.encode(text_snippet))
        if tok_type == "transformers":
            # encode() adds model special tokens (e.g. [CLS]/[SEP]) by default
            # and decode() renders them back as text, which would make every
            # such tokenizer look lossy. Keep them out of both directions so
            # the comparison is about the text itself.
            ids = obj.encode(text_snippet, add_special_tokens=False)
            return obj.decode(ids, skip_special_tokens=True)
        if tok_type == "nltk":
            return " ".join(word_tokenize(text_snippet))
    except Exception:
        # Best-effort probe: a failing backend reports "unknown", not a crash.
        return None
    return None


# ============================================================
# 5. Save Tokens to Files
# ============================================================
def save_tokens(filename, tokens):
    """Write one token per line to ``OUTPUT_DIR/filename`` (UTF-8).

    Returns:
        The full path of the written file.
    """
    filepath = os.path.join(OUTPUT_DIR, filename)
    with open(filepath, "w", encoding="utf-8") as f:
        f.writelines(t + "\n" for t in tokens)
    return filepath


# ============================================================
# 6. 
# Compute Evaluation Metrics
# ============================================================

# tokenize_text() feeds only this many leading characters to "transformers"
# tokenizers; the text-side denominators below must use the same window.
_TRANSFORMERS_CHAR_CAP = 100000

# Spellings of the unknown-token marker across the compared tokenizers.
# The angle-bracket forms had been reduced to empty strings in this file,
# so "<unk>"-style tokens were never counted.
_UNK_TOKENS = frozenset({"[UNK]", "<unk>", "<UNK>", "⁇"})

# Tokenizers whose pieces mark a word start with "▁" rather than marking
# continuations with "##".
_WORD_START_NAMES = ("Balochi_SentencePiece", "Gemma", "PersianBPE")
_WORD_START_NAME_PARTS = ("SP_", "AraGPT")


def compute_metrics(name, tok_info, tokens, elapsed, text):
    """Compute the evaluation metrics for one tokenizer run.

    Args:
        name: Tokenizer key; selects the continuation-marker convention.
        tok_info: Registry entry; ``vocab_size`` and ``type`` are read, and the
            entry is passed to ``decode_text`` for the round-trip check.
        tokens: Token strings produced for ``text``.
        elapsed: Tokenization wall time in seconds.
        text: The text that was tokenized.

    Returns:
        dict with ``token_count``, ``unique_tokens``, ``vocab_size``,
        ``vocab_utilization`` (percent, ``None`` without a vocab size),
        ``compression_ratio`` (chars per token), ``fertility`` (tokens per
        whitespace-separated word), ``avg_token_length``, ``unk_count``,
        ``unk_rate`` (percent), ``continuation_rate`` (percent), ``speed``
        (tokens per second), ``time_sec`` and ``roundtrip_fidelity``
        (``True``/``False``, or ``None`` if decoding was unavailable).
        Ratios fall back to 0 when their denominator is 0.
    """
    total_tokens = len(tokens)
    unique_tokens = len(set(tokens))
    vocab_size = tok_info.get("vocab_size", None)

    # Compare tokens against the text they were actually produced from;
    # otherwise compression is overstated and fertility understated for
    # "transformers" tokenizers on inputs longer than the cap.
    if tok_info.get("type") == "transformers":
        scored_text = text[:_TRANSFORMERS_CHAR_CAP]
    else:
        scored_text = text
    total_chars = len(scored_text)
    word_count = len(scored_text.split())

    has_tokens = total_tokens > 0
    compression_ratio = total_chars / total_tokens if has_tokens else 0
    fertility = total_tokens / word_count if word_count > 0 else 0
    avg_token_len = sum(len(t) for t in tokens) / total_tokens if has_tokens else 0
    vocab_util = (unique_tokens / vocab_size * 100) if vocab_size else None

    unk_count = sum(1 for t in tokens if t in _UNK_TOKENS)
    unk_rate = (unk_count / total_tokens * 100) if has_tokens else 0

    # Continuation rate
    marks_word_start = name in _WORD_START_NAMES or any(
        part in name for part in _WORD_START_NAME_PARTS
    )
    if marks_word_start:
        # NOTE(review): GPT-2 style byte-level BPE vocabularies usually mark
        # word starts with "Ġ" rather than "▁"; confirm what AraGPT2 emits
        # before trusting its continuation rate.
        cont_count = sum(
            1 for t in tokens
            if len(t) > 0 and t not in _UNK_TOKENS and not t.startswith("▁")
        )
    else:
        cont_count = sum(1 for t in tokens if t.startswith("##"))
    continuation_rate = (cont_count / total_tokens * 100) if has_tokens else 0

    speed = total_tokens / elapsed if elapsed > 0 else 0

    # Round-trip check on the first 500 characters, ignoring whitespace runs.
    snippet = text[:500]
    decoded = decode_text(name, tok_info, snippet)
    if decoded is not None:
        fidelity = " ".join(snippet.split()) == " ".join(decoded.split())
    else:
        fidelity = None

    return {
        "token_count": total_tokens,
        "unique_tokens": unique_tokens,
        "vocab_size": vocab_size,
        "vocab_utilization": vocab_util,
        "compression_ratio": compression_ratio,
        "fertility": fertility,
        "avg_token_length": avg_token_len,
        "unk_count": unk_count,
        "unk_rate": unk_rate,
        "continuation_rate": continuation_rate,
        "speed": speed,
        "time_sec": elapsed,
        "roundtrip_fidelity": fidelity,
    }


# ============================================================
# 7. 
# Print Sample Tokens
# ============================================================
def print_sample_tokens(tokenizers_dict, sentences):
    """Print the first 25 tokens each tokenizer produces for two sentences.

    Args:
        tokenizers_dict: Mapping of tokenizer key to registry entry.
        sentences: Sentences of the corpus; only the first two are shown, and
            each is displayed truncated to 120 characters.
    """
    sample = sentences[:2]
    print("\n" + "=" * 110)
    print(" SAMPLE TOKEN OUTPUT (First 2 Sentences)")
    print("=" * 110)
    for i, sent in enumerate(sample, 1):
        disp = sent[:120] + "..." if len(sent) > 120 else sent
        print(f"\n{'─' * 110}")
        print(f" Sentence {i}: {disp}")
        print(f"{'─' * 110}")
        for name, tok_info in tokenizers_dict.items():
            tokens, _ = tokenize_text(name, tok_info, sent)
            show = tokens[:25]
            suffix = f" ... (+{len(tokens)-25} more)" if len(tokens) > 25 else ""
            print(f"\n [{name}] ({len(tokens)} tokens):")
            print(f" {show}{suffix}")
    print(f"\n{'=' * 110}\n")


# ============================================================
# 8. Print Comparison Tables
# ============================================================

# (row label, metrics key, format spec) for every line of a metrics table.
_METRIC_ROWS = (
    ("Token Count", "token_count", "{:,}"),
    ("Unique Tokens", "unique_tokens", "{:,}"),
    ("Vocabulary Size", "vocab_size", "{}"),
    ("Vocab Utilization", "vocab_utilization", "{:.2f}%"),
    ("Compression Ratio", "compression_ratio", "{:.2f}"),
    ("Fertility (tok/word)", "fertility", "{:.3f}"),
    ("Avg Token Length", "avg_token_length", "{:.2f}"),
    ("Unknown Tokens", "unk_count", "{:,}"),
    ("Unknown Rate (%)", "unk_rate", "{:.4f}%"),
    ("Continuation Rate", "continuation_rate", "{:.2f}%"),
    ("Speed (tok/sec)", "speed", "{:,.0f}"),
    ("Time (seconds)", "time_sec", "{:.4f}"),
    ("Roundtrip Fidelity", "roundtrip_fidelity", "{}"),
)

# (title, member keys) for every sub-table printed after the overall table.
_COMPARISON_GROUPS = (
    ("GROUP A: WordPiece Family — Balochi vs Arabic vs Multilingual",
     ["Balochi_WordPiece", "AraBERT_v2", "CAMeLBERT_MSA", "ARBERT", "BERT"]),
    ("GROUP B: SentencePiece/WP — Balochi vs Persian vs Gemma",
     ["Balochi_SentencePiece", "ParsBERT", "PersianBERT_FA", "Gemma"]),
    ("GROUP C: BPE/Rule Family — Balochi vs Arabic vs Persian vs Baseline",
     ["Balochi_BPE", "AraGPT2", "PersianBPE", "Balochi_30K", "NLTK"]),
    ("GROUP D: Perso-Arabic Script Family — Urdu / Balochi / Arabic / Persian",
     ["UrduBERT", "Balochi_BPE", "Balochi_WordPiece", "AraBERT_v2", "ParsBERT"]),
    ("GROUP E1: WordPiece — Balochi WordPiece vs BERT Multilingual",
     ["Balochi_WordPiece", "BERT"]),
    ("GROUP E2: SentencePiece — Balochi SP vs Gemma",
     ["Balochi_SentencePiece", "Gemma"]),
    ("GROUP E3: BPE — Balochi BPE vs NLTK vs Balochi 30K",
     ["Balochi_BPE", "NLTK", "Balochi_30K"]),
)


def _format_metric_cell(val, fmt):
    """Render one metric value: N/A for None, Yes/No for bools, else ``fmt``."""
    if val is None:
        return "N/A"
    # bool is tested before formatting because True/False would otherwise be
    # rendered through the "{}" format spec as plain text.
    if isinstance(val, bool):
        return "✓ Yes" if val else "✗ No"
    try:
        return fmt.format(val)
    except Exception:
        # A value that does not fit the numeric spec is shown as-is.
        return str(val)


def print_metrics_table(title, names, all_metrics):
    """Print a metrics table with one column per tokenizer in ``names``.

    Args:
        title: Heading printed above the table.
        names: Tokenizer keys to show, in column order. May be empty, in which
            case only the row labels are printed.
        all_metrics: Mapping of tokenizer key to its metrics dict; every name
            in ``names`` must be present.
    """
    print(f"\n{'=' * 110}")
    print(f" {title}")
    print(f"{'=' * 110}")
    # default=0 keeps an empty column list from raising in max().
    col = max(22, max((len(n) for n in names), default=0) + 2)
    rule = f" {'─' * (35 + col * len(names))}"
    print(f" {'Metric':<35}" + "".join(f"{n:>{col}}" for n in names))
    print(rule)
    for label, key, fmt in _METRIC_ROWS:
        cells = (_format_metric_cell(all_metrics[n].get(key), fmt) for n in names)
        print(f" {label:<35}" + "".join(f"{cell:>{col}}" for cell in cells))
    print(rule)


def print_all_comparisons(all_metrics):
    """Print the overall table, then each comparison group with ≥2 members.

    Args:
        all_metrics: Mapping of tokenizer key to its metrics dict. Group
            members that are missing from it are dropped from their table.
    """
    # ── Overall ───────────────────────────────────────────────
    print_metrics_table("OVERALL — All Tokenizers", list(all_metrics.keys()), all_metrics)

    # ── Groups A-D and the original E1-E3 sub-groups ──────────
    for title, members in _COMPARISON_GROUPS:
        active = [n for n in members if n in all_metrics]
        if len(active) >= 2:
            print_metrics_table(title, active, all_metrics)


#
# ============================================================
# 9. Analysis & Interpretation
# ============================================================
def analyze_results(all_metrics):
    """Print a six-part textual analysis of the collected metrics.

    Sections, in order: compression ratio (best/worst plus per-language-group
    averages), fertility ranking, unknown-token coverage, speed ranking,
    compression across the Perso-Arabic script tokenizers, and — only when
    ablation keys are present in ``all_metrics`` — a vocabulary-size summary.

    Args:
        all_metrics: Mapping of tokenizer key to the metrics dict for that
            tokenizer. Output goes to stdout; nothing is returned.
    """
    print("\n" + "=" * 110)
    print(" ANALYSIS & INTERPRETATION")
    print("=" * 110)

    def best_worst(key, higher=True):
        # Returns (best_name, worst_name, {name: value}) for one metric,
        # considering only tokenizers whose value is a number. When no
        # tokenizer qualifies the names are None and the dict is empty.
        valid = {k: v[key] for k, v in all_metrics.items()
                 if v.get(key) is not None and isinstance(v[key], (int, float))}
        if not valid:
            return None, None, valid
        b = max(valid, key=valid.get) if higher else min(valid, key=valid.get)
        w = min(valid, key=valid.get) if higher else max(valid, key=valid.get)
        return b, w, valid

    # 1. Compression
    print("\n 1. COMPRESSION RATIO (higher = better)")
    print(" " + "─" * 100)
    b, w, vals = best_worst("compression_ratio")
    if b:
        print(f" Best: {b} ({vals[b]:.2f})")
        print(f" Worst: {w} ({vals[w]:.2f})")

    # Cross-language comparison: Balochi vs Arabic vs Persian vs Urdu
    lang_groups = {
        "Balochi (custom)": ["Balochi_BPE","Balochi_WordPiece","Balochi_SentencePiece"],
        "Arabic (custom)": ["AraBERT_v2","CAMeLBERT_MSA","ARBERT","AraGPT2"],
        "Persian (custom)": ["ParsBERT","PersianBERT_FA","PersianBPE"],
        "Urdu (custom)": ["UrduBERT"],
        "Baseline (generic)": ["BERT","Gemma","NLTK"],
    }
    print()
    for lang, members in lang_groups.items():
        # Average only over the group members that were actually evaluated.
        active = [m for m in members if m in all_metrics]
        if active:
            avg_cr = sum(all_metrics[m]["compression_ratio"] for m in active) / len(active)
            avg_fe = sum(all_metrics[m]["fertility"] for m in active) / len(active)
            print(f" {lang:<22} avg compression={avg_cr:.2f} avg fertility={avg_fe:.3f} "
                  f"(n={len(active)} tokenizers: {', '.join(active)})")

    # 2. Fertility
    print("\n\n 2. FERTILITY ANALYSIS (tokens/word — lower = better)")
    print(" " + "─" * 100)
    b, w, vals = best_worst("fertility", higher=False)
    if b:
        print(f" Best (lowest): {b} ({vals[b]:.3f})")
        print(f" Worst (highest):{w} ({vals[w]:.3f})")
    print("\n Tokenizer fertility ranking:")
    # Ascending order; an entry without a fertility value sorts as 99.
    for name, m in sorted(all_metrics.items(), key=lambda x: x[1].get("fertility", 99)):
        print(f" {name:<35s} {m['fertility']:.3f} tok/word")

    # 3. UNK Coverage
    print("\n\n 3. UNKNOWN TOKEN COVERAGE (lower = better)")
    print(" " + "─" * 100)
    for name, m in all_metrics.items():
        cnt = m.get("unk_count", 0)
        rate = m.get("unk_rate", 0)
        # Zero unknowns is EXCELLENT, under 1% is ACCEPTABLE, otherwise POOR.
        status = "✓ EXCELLENT" if cnt == 0 else ("⚠ ACCEPTABLE" if rate < 1.0 else "✗ POOR")
        print(f" {name:<35s} UNK: {cnt:>5,} ({rate:.4f}%) [{status}]")

    # 4. Speed
    print("\n\n 4. TOKENIZATION SPEED RANKING")
    print(" " + "─" * 100)
    # Fastest first.
    for name, m in sorted(all_metrics.items(), key=lambda x: x[1].get("speed", 0), reverse=True):
        print(f" {name:<35s} {m['speed']:>12,.0f} tok/sec ({m['time_sec']:.4f}s)")

    # 5. Script Family Insight
    print("\n\n 5. PERSO-ARABIC SCRIPT FAMILY INSIGHT")
    print(" " + "─" * 100)
    script_family = [n for n in ["Balochi_BPE","Balochi_WordPiece","Balochi_SentencePiece",
                                 "AraBERT_v2","CAMeLBERT_MSA","ARBERT","AraGPT2",
                                 "ParsBERT","PersianBERT_FA","PersianBPE","UrduBERT"]
                     if n in all_metrics]
    if script_family:
        print(" Compression ratios across Perso-Arabic script tokenizers:")
        # Highest compression first.
        for n in sorted(script_family, key=lambda x: all_metrics[x]["compression_ratio"], reverse=True):
            cr = all_metrics[n]["compression_ratio"]
            vs = all_metrics[n]["vocab_size"]
            vs_str = f"{vs:,}" if vs else "N/A"
            print(f" {n:<35s} compression={cr:.2f} vocab={vs_str}")

    # 6. Vocab Size vs Compression (Ablation insight if available)
    # Ablation entries are recognised by their key prefix; the section is
    # skipped entirely when none are present in all_metrics.
    ablation_keys = [k for k in all_metrics
                     if any(k.startswith(p) for p in ["Balochi_BPE_","Balochi_WP_","Balochi_SP_"])]
    if ablation_keys:
        print("\n\n 6. VOCAB SIZE ABLATION SUMMARY")
        print(" " + "─" * 100)
        for algo in ["BPE","WP","SP"]:
            # Within one algorithm, list the tokenizers by ascending vocab size.
            group = sorted([k for k in ablation_keys if f"_{algo}_" in k],
                           key=lambda x: all_metrics[x]["vocab_size"] or 0)
            if group:
                print(f"\n Algorithm: {algo}")
                for n in group:
                    m = all_metrics[n]
                    print(f" {n:<35s} vocab={m['vocab_size']:,} "
                          f"compression={m['compression_ratio']:.2f} "
                          f"fertility={m['fertility']:.3f}")
    print(f"\n{'=' * 110}")


# ============================================================
# 10. Generate Comprehensive Markdown Report
# ============================================================
# NOTE(review): only the opening of this function falls inside this span; it
# continues on the following lines, and the final statement below is cut
# mid-string at the span boundary, so it is reproduced exactly as found.
def generate_markdown_report(text, sentences, all_tokens, all_metrics, output_filenames, ablation_metrics=None):
    md = []
    md.append("# Comprehensive Balochi + Cross-Language Tokenizer Comparison\n")
    md.append(f"**Generated:** {time.strftime('%Y-%m-%d %H:%M:%S')} \n")
    md.append("**Script:** `Tokenizers_Comparison_Extended.py`\n")
    # Input
    words = text.split()
    md.append("## 1. Input Text Summary\n")
    md.append("| Property | Value |")
    md.append("|----------|-------|")
    md.append(f"| **File** | `liberal capitalism.txt` |")
    md.append(f"| **Characters** | {len(text):,} |")
    md.append(f"| **Words** | {len(words):,} |")
    md.append(f"| **Sentences** | {len(sentences):,} |\n")
    # Tokenizers loaded
    md.append("## 2. 
Tokenizers Loaded\n") md.append("| # | Tokenizer | Family | Script | Vocab Size | HuggingFace ID |") md.append("|---|-----------|--------|--------|------------|----------------|") tok_meta = [ ("1", "Balochi_BPE", "Balochi", "BPE", "Perso-Arabic", "Local file"), ("2", "Balochi_WordPiece", "Balochi", "WordPiece", "Perso-Arabic", "Local file"), ("3", "Balochi_SentencePiece", "Balochi", "SentencePiece","Perso-Arabic", "Local file"), ("4", "Balochi_30K", "Balochi", "BPE", "Perso-Arabic", "balochiml/balochi-tokenizer"), ("5", "NLTK", "Baseline", "Rule-based", "Any", "—"), ("6", "BERT", "Generic", "WordPiece", "Multilingual", "bert-base-multilingual-cased"), ("7", "Gemma", "Generic", "SP-BPE", "Multilingual", "google/gemma-2b"), ("8", "AraBERT_v2", "Arabic", "WordPiece", "Arabic", "aubmindlab/bert-base-arabertv2"), ("9", "CAMeLBERT_MSA", "Arabic", "WordPiece", "Arabic", "CAMeL-Lab/bert-base-arabic-camelbert-msa"), ("10", "ARBERT", "Arabic", "WordPiece", "Arabic", "UBC-NLP/ARBERT"), ("11", "AraGPT2", "Arabic", "BPE", "Arabic", "aubmindlab/aragpt2-base"), ("12", "ParsBERT", "Persian", "WordPiece", "Perso-Arabic", "HooshvareLab/bert-base-parsbert-uncased"), ("13", "PersianBERT_FA", "Persian", "WordPiece", "Perso-Arabic", "HooshvareLab/bert-fa-base-uncased"), ("14", "PersianBPE", "Persian", "BPE", "Perso-Arabic", "mshojaei77/PersianBPETokenizer"), ("15", "UrduBERT", "Urdu", "WordPiece", "Perso-Arabic", "urduhack/UrduBERT"), ] for num, key, family, algo, script, hf_id in tok_meta: if key in all_metrics: vs = all_metrics[key]["vocab_size"] vs_str = f"{vs:,}" if vs else "N/A" md.append(f"| {num} | **{key}** | {family} | {algo} | {vs_str} | `{hf_id}` |") else: md.append(f"| {num} | ~~{key}~~ | {family} | {algo} | — | Not loaded |") md.append("") # Tokenization results md.append("## 3. 
Tokenization Results\n") md.append("| Tokenizer | Language | Tokens | Speed (tok/s) | Time (s) |") md.append("|-----------|----------|--------|---------------|----------|") lang_map = { "Balochi_BPE":"Balochi","Balochi_WordPiece":"Balochi","Balochi_SentencePiece":"Balochi", "Balochi_30K":"Balochi","NLTK":"Baseline","BERT":"Multilingual","Gemma":"Multilingual", "AraBERT_v2":"Arabic","CAMeLBERT_MSA":"Arabic","ARBERT":"Arabic","AraGPT2":"Arabic", "ParsBERT":"Persian","PersianBERT_FA":"Persian","PersianBPE":"Persian","UrduBERT":"Urdu", } for name, m in all_metrics.items(): lang = lang_map.get(name, "—") fname = output_filenames.get(name, f"tokens_{name.lower()}.txt") md.append(f"| {name} | {lang} | {m['token_count']:,} | {m['speed']:,.0f} | {m['time_sec']:.4f} |") md.append("") # ── Master Metrics Table ─────────────────────────────────── md.append("## 4. Master Metrics Table\n") all_names = list(all_metrics.keys()) hdr = "| Metric |" + "".join(f" {n} |" for n in all_names) sep = "|--------|" + "".join("--------|" for _ in all_names) md.append(hdr); md.append(sep) rows = [ ("Token Count", "token_count", "{:,}"), ("Unique Tokens", "unique_tokens", "{:,}"), ("Vocab Size", "vocab_size", "{}"), ("Vocab Util. 
(%)", "vocab_utilization", "{:.2f}%"), ("Compression Ratio", "compression_ratio", "{:.2f}"), ("Fertility", "fertility", "{:.3f}"), ("Avg Token Length", "avg_token_length", "{:.2f}"), ("UNK Count", "unk_count", "{:,}"), ("UNK Rate (%)", "unk_rate", "{:.4f}%"), ("Continuation Rate", "continuation_rate", "{:.2f}%"), ("Speed (tok/s)", "speed", "{:,.0f}"), ("Time (s)", "time_sec", "{:.4f}"), ("Roundtrip Fidelity", "roundtrip_fidelity", "{}"), ] for label, key, fmt in rows: row = f"| **{label}** |" for n in all_names: val = all_metrics[n].get(key) if val is None: row += " N/A |" elif isinstance(val, bool): row += f" {'✓' if val else '✗'} |" else: try: row += f" {fmt.format(val)} |" except Exception: row += f" {val} |" md.append(row) md.append("") # ── Group Comparisons ───────────────────────────────────── groups = [ ("5", "Group A: WordPiece Family", ["Balochi_WordPiece","AraBERT_v2","CAMeLBERT_MSA","ARBERT","BERT"]), ("6", "Group B: SP/WP — Balochi vs Persian vs Gemma", ["Balochi_SentencePiece","ParsBERT","PersianBERT_FA","Gemma"]), ("7", "Group C: BPE Family", ["Balochi_BPE","AraGPT2","PersianBPE","Balochi_30K","NLTK"]), ("8", "Group D: Perso-Arabic Script", ["UrduBERT","Balochi_BPE","Balochi_WordPiece","AraBERT_v2","ParsBERT"]), ("9", "Group E1: Balochi WP vs BERT", ["Balochi_WordPiece","BERT"]), ("10", "Group E2: Balochi SP vs Gemma", ["Balochi_SentencePiece","Gemma"]), ("11", "Group E3: Balochi BPE vs NLTK vs 30K", ["Balochi_BPE","NLTK","Balochi_30K"]), ] for sec, title, members in groups: active = [m for m in members if m in all_metrics] if len(active) < 2: continue md.append(f"## {sec}. 
{title}\n") hdr = "| Metric |" + "".join(f" {n} |" for n in active) sep = "|--------|" + "".join("--------|" for _ in active) md.append(hdr); md.append(sep) for label, key, fmt in rows: row = f"| **{label}** |" for n in active: val = all_metrics[n].get(key) if val is None: row += " N/A |" elif isinstance(val, bool): row += f" {'✓' if val else '✗'} |" else: try: row += f" {fmt.format(val)} |" except Exception: row += f" {val} |" md.append(row) md.append("") # ── Ablation section ───────────────────────────────────── if ablation_metrics: md.append("## 12. Vocabulary Size Ablation Results\n") md.append("> Tokenizers trained at different vocabulary sizes from the " "`Balochi_Tokenizer_Vocab_Ablation.ipynb` notebook.\n") ab_names = list(ablation_metrics.keys()) hdr = "| Metric |" + "".join(f" {n} |" for n in ab_names) sep = "|--------|" + "".join("--------|" for _ in ab_names) md.append(hdr); md.append(sep) for label, key, fmt in rows: row = f"| **{label}** |" for n in ab_names: val = ablation_metrics[n].get(key) if val is None: row += " N/A |" elif isinstance(val, bool): row += f" {'✓' if val else '✗'} |" else: try: row += f" {fmt.format(val)} |" except Exception: row += f" {val} |" md.append(row) md.append("") # ── Analysis ───────────────────────────────────────────── md.append("## 13. 
Analysis & Interpretation\n") md.append("### 13.1 Cross-Language Compression Comparison\n") lang_groups = { "Balochi (custom)": ["Balochi_BPE","Balochi_WordPiece","Balochi_SentencePiece"], "Arabic (custom)": ["AraBERT_v2","CAMeLBERT_MSA","ARBERT","AraGPT2"], "Persian (custom)": ["ParsBERT","PersianBERT_FA","PersianBPE"], "Urdu (custom)": ["UrduBERT"], "Baseline (generic)":["BERT","Gemma","NLTK"], } md.append("| Language Group | Avg Compression | Avg Fertility | Tokenizers |") md.append("|----------------|-----------------|---------------|------------|") for lang, members in lang_groups.items(): active = [m for m in members if m in all_metrics] if active: avg_cr = sum(all_metrics[m]["compression_ratio"] for m in active) / len(active) avg_fe = sum(all_metrics[m]["fertility"] for m in active) / len(active) md.append(f"| {lang} | {avg_cr:.2f} | {avg_fe:.3f} | {', '.join(active)} |") md.append("") md.append("### 13.2 Key Findings\n") md.append("- **Domain specificity advantage:** Custom Balochi tokenizers are expected to " "outperform generic multilingual tokenizers (BERT, Gemma) on Balochi text by " "producing fewer subword fragments and lower fertility.\n") md.append("- **Script-family proximity:** Arabic and Persian tokenizers share the same " "Perso-Arabic script family as Balochi, making their fertility and UNK rates " "the most meaningful external benchmarks — more so than mBERT or Gemma.\n") md.append("- **Vocabulary size effect:** Larger vocabulary (80K–128K) generally reduces " "fertility but increases memory overhead. 
The optimal point for Balochi is " "determined by the Rényi efficiency ablation in the companion notebook.\n") md.append("- **AraBERT v2 (64K WP)** and **ParsBERT (100K WP)** serve as the primary " "WordPiece upper-bound references — both trained on 70M+ token Perso-Arabic corpora.\n") md.append("- **AraGPT2 (BPE 50K)** provides the cleanest Arabic BPE reference for " "comparison against Balochi BPE (80K) in the BPE group.\n") md.append("- **UrduBERT** is the most linguistically proximate external tokenizer to " "Balochi due to shared Nastaliq script conventions and similar morphological " "complexity.\n") md.append("### 13.3 Roundtrip Fidelity\n") md.append("| Tokenizer | Fidelity |") md.append("|-----------|----------|") for name, m in all_metrics.items(): fid = m.get("roundtrip_fidelity") md.append(f"| {name} | {'✓ Lossless' if fid is True else ('✗ Lossy' if fid is False else '— N/A')} |") md.append("") md.append("## 14. Tokenizer Selection Guide\n") md.append("| Use Case | Recommended Tokenizer | Rationale |") md.append("|----------|----------------------|-----------|") guide = [ ("Balochi BERT fine-tuning", "Balochi_WordPiece 64K", "Native WP; matches BERT architecture exactly"), ("Balochi GPT/Gemma CPT", "Balochi_BPE 80K", "BPE aligns with GPT-2/Gemma training conventions"), ("Balochi SentencePiece pipeline","Balochi_SP 64K", "Zero UNK via byte_fallback; SP models (T5, mT5)"), ("Arabic NER / SA tasks", "AraBERT v2 or CAMeLBERT", "Proven Arabic BERT baselines"), ("Arabic text generation", "AraGPT2", "Arabic GPT-2 with BPE tokenizer"), ("Persian BERT tasks", "ParsBERT", "Standard Persian BERT baseline"), ("Persian text analysis", "PersianBERT_FA", "HooshvareLab FA-base, widely used"), ("Urdu NLP tasks", "UrduBERT", "Nastaliq script; same script family as Balochi"), ("Cross-lingual baseline", "BERT Multilingual 119K", "Covers 104 languages for transfer comparison"), ("Word-count baseline", "NLTK", "Raw word count before subword splitting"), ] for row in guide: 
def _tokenize_and_save(name, tok_info, text, fname, label=""):
    """Run one tokenizer over *text*, save its tokens, and compute its metrics.

    Shared by the primary and ablation passes of ``main`` so that both report
    progress and persist output in exactly the same way.

    Args:
        name: Tokenizer name; shown in the progress line and forwarded to
            ``tokenize_text`` and ``compute_metrics``.
        tok_info: The tokenizer entry taken from the loader's result dict.
        text: The full input text to tokenize.
        fname: File name handed to ``save_tokens``.
        label: Optional prefix for the progress line (e.g. ``"[Ablation] "``).

    Returns:
        A ``(tokens, metrics)`` tuple: the token sequence returned by
        ``tokenize_text`` and the result of ``compute_metrics`` for it.
    """
    print(f" {label}Tokenizing {name}...", end=" ", flush=True)
    tokens, elapsed = tokenize_text(name, tok_info, text)
    metrics = compute_metrics(name, tok_info, tokens, elapsed, text)
    save_tokens(fname, tokens)
    print(f"Done! ({len(tokens):,} tokens in {elapsed:.4f}s)")
    return tokens, metrics


def _print_output_row(fname, fpath):
    """Print one summary row for an output file.

    A file that cannot be stat'ed is flagged with ``✗`` and a size of 0
    instead of being reported as a success.
    """
    try:
        size = os.path.getsize(fpath)
        mark = "✓"
    except OSError:
        size = 0
        mark = "✗"
    print(f" {mark} {fname:<50s} ({size:>10,} bytes)")


def main():
    """Run the full comparison pipeline from input text to Markdown report.

    Steps: load the input text, load the primary and ablation tokenizers,
    tokenize the text with each of them (saving one token file per tokenizer),
    print sample output, comparison tables and analysis for the primary
    tokenizers, write the Markdown report, and list the generated files.

    Exits with status 1 if no primary tokenizer could be loaded.
    """
    # Output file names for the primary tokenizers; any tokenizer not listed
    # here falls back to ``tokens_<lowercased name>.txt``.
    name_map = {
        "Balochi_BPE": "tokens_balochi_bpe.txt",
        "Balochi_WordPiece": "tokens_balochi_wordpiece.txt",
        "Balochi_SentencePiece": "tokens_balochi_sentencepiece.txt",
        "Balochi_30K": "tokens_balochi_30k.txt",
        "NLTK": "tokens_nltk.txt",
        "BERT": "tokens_bert_multilingual.txt",
        "Gemma": "tokens_gemma.txt",
        "AraBERT_v2": "tokens_arabert_v2.txt",
        "CAMeLBERT_MSA": "tokens_camelbert_msa.txt",
        "ARBERT": "tokens_arbert.txt",
        "AraGPT2": "tokens_aragpt2.txt",
        "ParsBERT": "tokens_parsbert.txt",
        "PersianBERT_FA": "tokens_persianbert_fa.txt",
        "PersianBPE": "tokens_persian_bpe.txt",
        "UrduBERT": "tokens_urdubert.txt",
    }

    print("╔" + "═" * 108 + "╗")
    print("║" + " BALOCHI + CROSS-LANGUAGE TOKENIZER COMPARISON (EXTENDED) ".center(108) + "║")
    print("╚" + "═" * 108 + "╝")

    # Step 1 — Load text
    print("\n[STEP 1] Loading input text...")
    text = load_text(INPUT_FILE)
    words = text.split()
    sentences = split_sentences(text)
    print(f" Characters : {len(text):,}")
    print(f" Words : {len(words):,}")
    print(f" Sentences : {len(sentences):,}")

    # Step 2 — Load tokenizers
    print("\n[STEP 2] Loading primary tokenizers (15 total)...")
    toks = load_all_tokenizers()
    print(f"\n ✓ Loaded {len(toks)}/15 primary tokenizers.")

    # Bail out before doing any further loading: nothing can be compared
    # without at least one primary tokenizer.
    if not toks:
        print("ERROR: No tokenizers loaded. Exiting.", file=sys.stderr)
        sys.exit(1)

    # Step 3 — Load ablation tokenizers
    print("\n[STEP 3] Loading ablation tokenizers (vocab-size study)...")
    ablation_toks = load_ablation_tokenizers()
    if ablation_toks:
        print(f" ✓ Loaded {len(ablation_toks)} ablation tokenizers.")
    else:
        print(" ℹ No ablation tokenizer files found — run the notebook first to generate them.")

    # Step 4 — Tokenize
    print("\n[STEP 4] Running tokenization...")
    all_tokens = {}
    all_metrics = {}
    ablation_metrics = {}
    output_filenames = {}

    for name, tok_info in toks.items():
        fname = name_map.get(name, f"tokens_{name.lower()}.txt")
        output_filenames[name] = fname
        all_tokens[name], all_metrics[name] = _tokenize_and_save(
            name, tok_info, text, fname
        )

    # Ablation runs contribute metrics and token files only; their token
    # lists are not kept in ``all_tokens``.
    for name, tok_info in ablation_toks.items():
        fname = f"tokens_{name.lower()}.txt"
        output_filenames[name] = fname
        _, ablation_metrics[name] = _tokenize_and_save(
            name, tok_info, text, fname, label="[Ablation] "
        )

    # Step 5 — Sample output
    print("\n[STEP 5] Sample token output...")
    print_sample_tokens(toks, sentences)

    # Step 6 — Comparison tables (primary only)
    print("\n[STEP 6] Comparison tables...")
    print_all_comparisons(all_metrics)

    # Step 7 — Analysis
    print("\n[STEP 7] Analysis & interpretation...")
    analyze_results(all_metrics)

    # Step 8 — Markdown report
    print("\n[STEP 8] Generating Markdown report...")
    md_path = generate_markdown_report(
        text,
        sentences,
        all_tokens,
        all_metrics,
        output_filenames,
        # An empty dict is passed as None so the report omits the section.
        ablation_metrics=ablation_metrics or None,
    )
    print(f" ✓ Report saved: {md_path}")

    # Final summary
    rule = "═" * 110
    print(f"\n{rule}")
    print(f" OUTPUT → {OUTPUT_DIR}")
    print(rule)
    # ``output_filenames`` already holds every primary and ablation name in
    # processing order, so no merged dict is needed.
    for fname in output_filenames.values():
        _print_output_row(fname, os.path.join(OUTPUT_DIR, fname))
    _print_output_row(os.path.basename(md_path), md_path)
    print(rule)
    print(f"\n ✅ ALL DONE — {len(toks)} primary + {len(ablation_toks)} ablation tokenizers compared.\n")


if __name__ == "__main__":
    main()