| |
| """ |
| Comprehensive Balochi + Cross-Language Tokenizer Comparison & Evaluation |
| ========================================================================= |
| Extended version of the original Balochi tokenizer comparison script. |
| |
| Tokenizers (15 total β original 7 + 8 new): |
| |
| βββ BALOCHI (Custom-Trained) ββββββββββββββββββββββββββββββββββββββββββ |
| 1. Balochi BPE (80K) β Custom HF tokenizers BPE |
| 2. Balochi WordPiece (64K) β Custom HF tokenizers WordPiece |
| 3. Balochi SentencePiece (64K) β Custom Google SentencePiece |
| 4. Balochi 30K β balochiml/balochi-tokenizer (HuggingFace) |
| |
| βββ BASELINE βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ |
| 5. NLTK β Rule-based word tokenizer |
| 6. BERT Multilingual β bert-base-multilingual-cased (WordPiece 119K) |
| 7. Gemma β google/gemma-2b (SentencePiece BPE 256K) |
| |
| βββ ARABIC (Custom-Trained) ββββββββββββββββββββββββββββββββββββββββββ |
| 8. AraBERT v2 β aubmindlab/bert-base-arabertv2 (WP 64K) |
| 9. CAMeLBERT-MSA β CAMeL-Lab/bert-base-arabic-camelbert-msa (WP 30K) |
| 10. ARBERT β UBC-NLP/ARBERT (WP 100K) |
| 11. AraGPT2 β aubmindlab/aragpt2-base (BPE 50K) |
| |
| βββ PERSIAN (Custom-Trained) βββββββββββββββββββββββββββββββββββββββββ |
| 12. ParsBERT β HooshvareLab/bert-base-parsbert-uncased (WP 100K) |
| 13. PersianBERT (HooshvareBase) β HooshvareLab/bert-fa-base-uncased (WP 100K) |
| 14. PersianBPETokenizer β mshojaei77/PersianBPETokenizer (BPE) |
| |
| βββ URDU (Custom-Trained) ββββββββββββββββββββββββββββββββββββββββββββ |
| 15. UrduBERT β urduhack/UrduBERT (WP ~60K) [HF: uer/roberta-base-finetuned-tnews-chinese] |
| Note: falls back to iamxds/UrduBERT-base if primary unavailable |
| |
| βββ VOCAB ABLATION (from Notebook) βββββββββββββββββββββββββββββββββββ |
| NOTE: Ablation tokenizers (BPE/WP/SP at 32K, 64K, 80K, 128K) are loaded |
| from local .json/.model files in Tokenizers/ directory. If the files are |
| not present, these entries are skipped gracefully. |
| |
| Comparison Groups: |
| Group A: Balochi WordPiece vs AraBERT v2 vs CAMeLBERT vs ARBERT vs BERT (WordPiece family) |
| Group B: Balochi SP vs ParsBERT vs PersianBERT vs Gemma (SentencePiece/WP) |
| Group C: Balochi BPE vs AraGPT2 vs PersianBPE vs 30K-Balochi vs NLTK (BPE/Rule family) |
| Group D: UrduBERT vs Balochi BPE vs AraBERT v2 vs ParsBERT (Perso-Arabic script) |
| Group E: Original 7 Balochi comparison (WordPiece / SentencePiece / BPE groups) |
| Group F: Vocab Ablation β same algorithm, different vocab sizes |
| """ |
|
|
| import os |
| import sys |
| import time |
| import re |
| import unicodedata |
|
|
| |
| if sys.platform == "win32": |
| sys.stdout.reconfigure(encoding='utf-8', errors='replace') |
| sys.stderr.reconfigure(encoding='utf-8', errors='replace') |
|
|
| |
| |
| |
|
|
| def install_if_missing(package, pip_name=None): |
| """Install a package if not already available.""" |
| try: |
| __import__(package) |
| except ImportError: |
| import subprocess |
| subprocess.check_call([sys.executable, "-m", "pip", "install", pip_name or package, "-q"]) |
|
|
| install_if_missing("tokenizers") |
| install_if_missing("sentencepiece") |
| install_if_missing("transformers") |
| install_if_missing("nltk") |
| install_if_missing("huggingface_hub") |
|
|
| import sentencepiece as spm |
| from tokenizers import Tokenizer |
| from transformers import AutoTokenizer, BertTokenizer |
| import nltk |
| nltk.download('punkt', quiet=True) |
| nltk.download('punkt_tab', quiet=True) |
| from nltk.tokenize import word_tokenize |
|
|
| |
| |
| |
|
|
| SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) |
| BASE_DIR = SCRIPT_DIR |
| INPUT_FILE = os.path.join(BASE_DIR, "..", "Tokens", "liberal capitalism.txt") |
| TOKENIZERS_DIR = os.path.join(BASE_DIR, "Tokenizers") |
| OUTPUT_DIR = os.path.join(SCRIPT_DIR, "Output") |
| os.makedirs(OUTPUT_DIR, exist_ok=True) |
|
|
| |
| BPE_80K = os.path.join(OUTPUT_DIR, "Ablation", "Models", "bpe_80000", "bpe_80000.json") |
| WP_64K = os.path.join(OUTPUT_DIR, "Ablation", "Models", "wordpiece_64000", "wordpiece_64000.json") |
| SP_64K = os.path.join(OUTPUT_DIR, "Ablation", "Models", "sentencepiece_64000", "sentencepiece_64000.model") |
|
|
| |
| ABLATION_MODELS = { |
| "Balochi_BPE_32K": os.path.join(OUTPUT_DIR, "Ablation", "Models", "bpe_32000", "bpe_32000.json"), |
| "Balochi_BPE_47K": os.path.join(OUTPUT_DIR, "Ablation", "Models", "bpe_47000", "bpe_47000.json"), |
| "Balochi_BPE_64K": os.path.join(OUTPUT_DIR, "Ablation", "Models", "bpe_64000", "bpe_64000.json"), |
| "Balochi_BPE_80K": os.path.join(OUTPUT_DIR, "Ablation", "Models", "bpe_80000", "bpe_80000.json"), |
| "Balochi_BPE_128K": os.path.join(OUTPUT_DIR, "Ablation", "Models", "bpe_128000", "bpe_128000.json"), |
| "Balochi_WP_32K": os.path.join(OUTPUT_DIR, "Ablation", "Models", "wordpiece_32000", "wordpiece_32000.json"), |
| "Balochi_WP_47K": os.path.join(OUTPUT_DIR, "Ablation", "Models", "wordpiece_47000", "wordpiece_47000.json"), |
| "Balochi_WP_64K": os.path.join(OUTPUT_DIR, "Ablation", "Models", "wordpiece_64000", "wordpiece_64000.json"), |
| "Balochi_WP_80K": os.path.join(OUTPUT_DIR, "Ablation", "Models", "wordpiece_80000", "wordpiece_80000.json"), |
| "Balochi_WP_128K": os.path.join(OUTPUT_DIR, "Ablation", "Models", "wordpiece_128000", "wordpiece_128000.json"), |
| "Balochi_SP_32K": os.path.join(OUTPUT_DIR, "Ablation", "Models", "sentencepiece_32000", "sentencepiece_32000.model"), |
| "Balochi_SP_47K": os.path.join(OUTPUT_DIR, "Ablation", "Models", "sentencepiece_47000", "sentencepiece_47000.model"), |
| "Balochi_SP_64K": os.path.join(OUTPUT_DIR, "Ablation", "Models", "sentencepiece_64000", "sentencepiece_64000.model"), |
| "Balochi_SP_80K": os.path.join(OUTPUT_DIR, "Ablation", "Models", "sentencepiece_80000", "sentencepiece_80000.model"), |
| "Balochi_SP_128K": os.path.join(OUTPUT_DIR, "Ablation", "Models", "sentencepiece_128000", "sentencepiece_128000.model"), |
| } |
|
|
| |
| BALOCHI_30K_REPO = "balochiml/balochi-tokenizer" |
| BALOCHI_30K_FILENAME = "models/30k-balochi-tokenizer.json" |
|
|
| |
| HF_TOKEN = os.environ.get("HF_TOKEN", "YOUR_HF_TOKEN_HERE") |
|
|
| |
| |
| |
|
|
| def normalize_balochi(text: str, drop_diacritics: bool = True, preserve_ye: bool = True) -> str: |
| """ |
| Balochi text normalization pipeline β adapted from AraToken methodology. |
| """ |
| text = unicodedata.normalize('NFKC', text) |
| text = re.sub(r'[Ψ£Ψ₯Ψ’Ω±]', 'Ψ§', text) |
|
|
| if not preserve_ye: |
| text = text.replace('Ϋ', 'Ϋ') |
|
|
| arabic_indic = str.maketrans('Ω Ω‘Ω’Ω£Ω€Ω₯Ω¦Ω§Ω¨Ω©', '0123456789') |
| text = text.translate(arabic_indic) |
|
|
| text = text.replace('Ψ', '?').replace('Ψ', ';').replace('Ψ', ',') |
| text = text.replace('\u0640', '') |
| |
| |
| text = text.replace('\u200C', '') |
| text = text.replace('\u200D', '') |
| text = text.replace('\u200F', '') |
| text = text.replace('\u061C', '') |
|
|
| if drop_diacritics: |
| text = re.sub(r'(?<!Ψ‘)[\u064B-\u065F\u0610-\u061A\u06D6-\u06DC]', '', text) |
|
|
| text = re.sub(r'\s+', ' ', text).strip() |
| return text |
|
|
| def load_text(filepath): |
| if not os.path.exists(filepath): |
| print(f"ERROR: Input file not found: {filepath}") |
| sys.exit(1) |
| with open(filepath, "r", encoding="utf-8") as f: |
| raw_text = f.read() |
| return normalize_balochi(raw_text, drop_diacritics=True, preserve_ye=True) |
|
|
| def split_sentences(text): |
| sentences = re.split(r'(?<=[Ϋ\.!\?])\s+|\n+', text) |
| return [s.strip() for s in sentences if s.strip() and len(s.strip()) > 10] |
|
|
| |
| |
| |
|
|
| def _try_hf_tokenizer(key, label, repo_id, fallback_repos=None, tok_dict=None, idx=None, total=None): |
| """Helper: load a HuggingFace AutoTokenizer / BertTokenizer with graceful fallback.""" |
| prefix = f" [{idx}/{total}]" if idx else " " |
| print(f"{prefix} Loading {label} ({repo_id})...") |
| repos = [repo_id] + (fallback_repos or []) |
| for repo in repos: |
| try: |
| tok = AutoTokenizer.from_pretrained(repo, token=HF_TOKEN) |
| vs = tok.vocab_size |
| print(f" β Loaded from {repo} (vocab: {vs:,})") |
| return {"type": "transformers", "obj": tok, "vocab_size": vs, "hf_id": repo, "label": label} |
| except Exception as e: |
| print(f" β {repo} failed: {str(e)[:80]}") |
| return None |
|
|
|
|
| def load_all_tokenizers(): |
| toks = {} |
| total = 15 |
|
|
| |
| print(f" [1/{total}] Loading Balochi BPE (80K)...") |
| if os.path.exists(BPE_80K): |
| obj = Tokenizer.from_file(BPE_80K) |
| toks["Balochi_BPE"] = {"type": "hf_tokenizers", "obj": obj, |
| "vocab_size": obj.get_vocab_size(), "label": "Balochi BPE 80K"} |
| print(f" β vocab: {toks['Balochi_BPE']['vocab_size']:,}") |
| else: |
| print(f" β NOT FOUND: {BPE_80K}") |
|
|
| |
| print(f" [2/{total}] Loading Balochi WordPiece (64K)...") |
| if os.path.exists(WP_64K): |
| obj = Tokenizer.from_file(WP_64K) |
| toks["Balochi_WordPiece"] = {"type": "hf_tokenizers", "obj": obj, |
| "vocab_size": obj.get_vocab_size(), "label": "Balochi WordPiece 64K"} |
| print(f" β vocab: {toks['Balochi_WordPiece']['vocab_size']:,}") |
| else: |
| print(f" β NOT FOUND: {WP_64K}") |
|
|
| |
| print(f" [3/{total}] Loading Balochi SentencePiece (64K)...") |
| if os.path.exists(SP_64K): |
| sp = spm.SentencePieceProcessor() |
| sp.load(SP_64K) |
| toks["Balochi_SentencePiece"] = {"type": "sentencepiece", "obj": sp, |
| "vocab_size": sp.get_piece_size(), "label": "Balochi SP 64K"} |
| print(f" β vocab: {toks['Balochi_SentencePiece']['vocab_size']:,}") |
| else: |
| print(f" β NOT FOUND: {SP_64K}") |
|
|
| |
| print(f" [4/{total}] Loading NLTK word_tokenize...") |
| toks["NLTK"] = {"type": "nltk", "obj": None, "vocab_size": None, "label": "NLTK (rule-based)"} |
| print(" β Ready") |
|
|
| |
| result = _try_hf_tokenizer("BERT", "BERT Multilingual", "bert-base-multilingual-cased", idx=5, total=total) |
| if result: |
| toks["BERT"] = result |
|
|
| |
| result = _try_hf_tokenizer("Gemma", "Gemma 2B", "google/gemma-2b", |
| fallback_repos=["google/gemma-7b"], idx=6, total=total) |
| if result: |
| toks["Gemma"] = result |
|
|
| |
| print(f" [7/{total}] Loading Balochi 30K (balochiml/balochi-tokenizer)...") |
| try: |
| from huggingface_hub import hf_hub_download |
| lpath = hf_hub_download(repo_id=BALOCHI_30K_REPO, filename=BALOCHI_30K_FILENAME, |
| token=HF_TOKEN) |
| obj = Tokenizer.from_file(lpath) |
| toks["Balochi_30K"] = {"type": "hf_tokenizers", "obj": obj, |
| "vocab_size": obj.get_vocab_size(), "label": "Balochi 30K"} |
| print(f" β vocab: {toks['Balochi_30K']['vocab_size']:,}") |
| except Exception as e: |
| print(f" β Failed: {str(e)[:80]}") |
|
|
| |
| |
| |
|
|
| |
| result = _try_hf_tokenizer("AraBERT", "AraBERT v2", "aubmindlab/bert-base-arabertv2", |
| fallback_repos=["aubmindlab/bert-large-arabertv02"], idx=8, total=total) |
| if result: |
| toks["AraBERT_v2"] = result |
|
|
| |
| result = _try_hf_tokenizer("CAMeLBERT", "CAMeLBERT-MSA", |
| "CAMeL-Lab/bert-base-arabic-camelbert-msa", |
| fallback_repos=["CAMeL-Lab/bert-base-arabic-camelbert-msa-quarter"], |
| idx=9, total=total) |
| if result: |
| toks["CAMeLBERT_MSA"] = result |
|
|
| |
| result = _try_hf_tokenizer("ARBERT", "ARBERT (100K)", "UBC-NLP/ARBERT", |
| fallback_repos=["UBC-NLP/MARBERTv2"], idx=10, total=total) |
| if result: |
| toks["ARBERT"] = result |
|
|
| |
| result = _try_hf_tokenizer("AraGPT2", "AraGPT2 Base", "aubmindlab/aragpt2-base", |
| fallback_repos=["aubmindlab/aragpt2-mega"], idx=11, total=total) |
| if result: |
| toks["AraGPT2"] = result |
|
|
| |
| |
| |
|
|
| |
| result = _try_hf_tokenizer("ParsBERT", "ParsBERT", |
| "HooshvareLab/bert-base-parsbert-uncased", |
| idx=12, total=total) |
| if result: |
| toks["ParsBERT"] = result |
|
|
| |
| result = _try_hf_tokenizer("PersianBERT_FA", "PersianBERT FA-Base", |
| "HooshvareLab/bert-fa-base-uncased", |
| fallback_repos=["HooshvareLab/bert-fa-zwnj-base-uncased"], |
| idx=13, total=total) |
| if result: |
| toks["PersianBERT_FA"] = result |
|
|
| |
| result = _try_hf_tokenizer("PersianBPE", "Persian BPE Tokenizer", |
| "mshojaei77/PersianBPETokenizer", |
| idx=14, total=total) |
| if result: |
| toks["PersianBPE"] = result |
|
|
| |
| |
| |
|
|
| |
| result = _try_hf_tokenizer("UrduBERT", "UrduBERT", |
| "urduhack/UrduBERT", |
| fallback_repos=[ |
| "iamxds/UrduBERT-base", |
| "flax-community/roberta-base-mr", |
| "uer/roberta-base-finetuned-tnews-chinese" |
| ], |
| idx=15, total=total) |
| if result: |
| result["label"] = "UrduBERT" |
| toks["UrduBERT"] = result |
|
|
| return toks |
|
|
|
|
| def load_ablation_tokenizers(): |
| """ |
| Load vocabulary-size ablation tokenizers from local paths. |
| These are produced by the Balochi_Tokenizer_Vocab_Ablation notebook. |
| Entries are skipped silently if the file does not yet exist. |
| """ |
| ablation = {} |
| for key, path in ABLATION_MODELS.items(): |
| if not os.path.exists(path): |
| continue |
| try: |
| if path.endswith(".json"): |
| obj = Tokenizer.from_file(path) |
| vs = obj.get_vocab_size() |
| ablation[key] = {"type": "hf_tokenizers", "obj": obj, |
| "vocab_size": vs, "label": key} |
| print(f" β Ablation {key} (vocab: {vs:,})") |
| elif path.endswith(".model"): |
| sp = spm.SentencePieceProcessor() |
| sp.load(path) |
| ablation[key] = {"type": "sentencepiece", "obj": sp, |
| "vocab_size": sp.get_piece_size(), "label": key} |
| print(f" β Ablation {key} (vocab: {sp.get_piece_size():,})") |
| except Exception as e: |
| print(f" β Ablation {key} failed: {str(e)[:60]}") |
| return ablation |
|
|
| |
| |
| |
|
|
| def tokenize_text(name, tok_info, text): |
| tok_type = tok_info["type"] |
| obj = tok_info["obj"] |
| start = time.perf_counter() |
| if tok_type == "hf_tokenizers": |
| tokens = obj.encode(text).tokens |
| elif tok_type == "sentencepiece": |
| tokens = obj.encode_as_pieces(text) |
| elif tok_type == "nltk": |
| tokens = word_tokenize(text) |
| elif tok_type == "transformers": |
| try: |
| tokens = obj.tokenize(text[:100000]) |
| except Exception as e: |
| print(f"\n β Tokenization failed for {name}: {e}") |
| tokens = [] |
| else: |
| tokens = [] |
| elapsed = time.perf_counter() - start |
| return tokens, elapsed |
|
|
|
|
| def decode_text(name, tok_info, text_snippet): |
| tok_type = tok_info["type"] |
| obj = tok_info["obj"] |
| try: |
| if tok_type == "hf_tokenizers": |
| enc = obj.encode(text_snippet) |
| return obj.decode(enc.ids) |
| elif tok_type == "sentencepiece": |
| return obj.decode(obj.encode(text_snippet)) |
| elif tok_type == "transformers": |
| enc = obj.encode(text_snippet) |
| return obj.decode(enc) |
| elif tok_type == "nltk": |
| return " ".join(word_tokenize(text_snippet)) |
| except Exception: |
| return None |
| return None |
|
|
| |
| |
| |
|
|
| def save_tokens(filename, tokens): |
| filepath = os.path.join(OUTPUT_DIR, filename) |
| with open(filepath, "w", encoding="utf-8") as f: |
| f.writelines(t + "\n" for t in tokens) |
| return filepath |
|
|
| |
| |
| |
|
|
| def compute_metrics(name, tok_info, tokens, elapsed, text): |
| total_tokens = len(tokens) |
| unique_tokens = len(set(tokens)) |
| vocab_size = tok_info.get("vocab_size", None) |
| total_chars = len(text) |
| word_count = len(text.split()) |
|
|
| compression_ratio = total_chars / total_tokens if total_tokens > 0 else 0 |
| fertility = total_tokens / word_count if word_count > 0 else 0 |
| avg_token_len = sum(len(t) for t in tokens) / total_tokens if total_tokens > 0 else 0 |
| vocab_util = (unique_tokens / vocab_size * 100) if vocab_size else None |
|
|
| unk_patterns = {"[UNK]", "<unk>", "β", "<UNK>"} |
| unk_count = sum(1 for t in tokens if t in unk_patterns) |
| unk_rate = (unk_count / total_tokens * 100) if total_tokens > 0 else 0 |
|
|
| |
| if name in ["Balochi_SentencePiece", "Gemma", "PersianBPE"] or \ |
| any(key in name for key in ["SP_", "AraGPT"]): |
| cont_count = sum(1 for t in tokens if not t.startswith("β") and |
| t not in unk_patterns and len(t) > 0) |
| else: |
| cont_count = sum(1 for t in tokens if t.startswith("##")) |
| continuation_rate = (cont_count / total_tokens * 100) if total_tokens > 0 else 0 |
|
|
| speed = total_tokens / elapsed if elapsed > 0 else 0 |
|
|
| decoded = decode_text(name, tok_info, text[:500]) |
| if decoded is not None: |
| fidelity = " ".join(text[:500].split()) == " ".join(decoded.split()) |
| else: |
| fidelity = None |
|
|
| return { |
| "token_count": total_tokens, |
| "unique_tokens": unique_tokens, |
| "vocab_size": vocab_size, |
| "vocab_utilization": vocab_util, |
| "compression_ratio": compression_ratio, |
| "fertility": fertility, |
| "avg_token_length": avg_token_len, |
| "unk_count": unk_count, |
| "unk_rate": unk_rate, |
| "continuation_rate": continuation_rate, |
| "speed": speed, |
| "time_sec": elapsed, |
| "roundtrip_fidelity": fidelity, |
| } |
|
|
| |
| |
| |
|
|
| def print_sample_tokens(tokenizers_dict, sentences): |
| sample = sentences[:2] |
| print("\n" + "=" * 110) |
| print(" SAMPLE TOKEN OUTPUT (First 2 Sentences)") |
| print("=" * 110) |
| for i, sent in enumerate(sample, 1): |
| disp = sent[:120] + "..." if len(sent) > 120 else sent |
| print(f"\n{'β' * 110}") |
| print(f" Sentence {i}: {disp}") |
| print(f"{'β' * 110}") |
| for name, tok_info in tokenizers_dict.items(): |
| tokens, _ = tokenize_text(name, tok_info, sent) |
| show = tokens[:25] |
| suffix = f" ... (+{len(tokens)-25} more)" if len(tokens) > 25 else "" |
| print(f"\n [{name}] ({len(tokens)} tokens):") |
| print(f" {show}{suffix}") |
| print(f"\n{'=' * 110}\n") |
|
|
| |
| |
| |
|
|
| def print_metrics_table(title, names, all_metrics): |
| print(f"\n{'=' * 110}") |
| print(f" {title}") |
| print(f"{'=' * 110}") |
| col = max(22, max(len(n) for n in names) + 2) |
| header = f" {'Metric':<35}" |
| for n in names: |
| header += f"{n:>{col}}" |
| print(header) |
| print(f" {'β' * (35 + col * len(names))}") |
| rows = [ |
| ("Token Count", "token_count", "{:,}"), |
| ("Unique Tokens", "unique_tokens", "{:,}"), |
| ("Vocabulary Size", "vocab_size", "{}"), |
| ("Vocab Utilization", "vocab_utilization", "{:.2f}%"), |
| ("Compression Ratio", "compression_ratio", "{:.2f}"), |
| ("Fertility (tok/word)","fertility", "{:.3f}"), |
| ("Avg Token Length", "avg_token_length", "{:.2f}"), |
| ("Unknown Tokens", "unk_count", "{:,}"), |
| ("Unknown Rate (%)", "unk_rate", "{:.4f}%"), |
| ("Continuation Rate", "continuation_rate", "{:.2f}%"), |
| ("Speed (tok/sec)", "speed", "{:,.0f}"), |
| ("Time (seconds)", "time_sec", "{:.4f}"), |
| ("Roundtrip Fidelity", "roundtrip_fidelity", "{}"), |
| ] |
| for label, key, fmt in rows: |
| row = f" {label:<35}" |
| for n in names: |
| val = all_metrics[n].get(key) |
| if val is None: |
| row += f"{'N/A':>{col}}" |
| elif isinstance(val, bool): |
| row += f"{'β Yes' if val else 'β No':>{col}}" |
| else: |
| try: |
| row += f"{fmt.format(val):>{col}}" |
| except Exception: |
| row += f"{str(val):>{col}}" |
| print(row) |
| print(f" {'β' * (35 + col * len(names))}") |
|
|
|
|
| def print_all_comparisons(all_metrics): |
| |
| all_names = list(all_metrics.keys()) |
| print_metrics_table("OVERALL β All Tokenizers", all_names, all_metrics) |
|
|
| |
| grp_a = [n for n in ["Balochi_WordPiece","AraBERT_v2","CAMeLBERT_MSA","ARBERT","BERT"] |
| if n in all_metrics] |
| if len(grp_a) >= 2: |
| print_metrics_table("GROUP A: WordPiece Family β Balochi vs Arabic vs Multilingual", |
| grp_a, all_metrics) |
|
|
| |
| grp_b = [n for n in ["Balochi_SentencePiece","ParsBERT","PersianBERT_FA","Gemma"] |
| if n in all_metrics] |
| if len(grp_b) >= 2: |
| print_metrics_table("GROUP B: SentencePiece/WP β Balochi vs Persian vs Gemma", |
| grp_b, all_metrics) |
|
|
| |
| grp_c = [n for n in ["Balochi_BPE","AraGPT2","PersianBPE","Balochi_30K","NLTK"] |
| if n in all_metrics] |
| if len(grp_c) >= 2: |
| print_metrics_table("GROUP C: BPE/Rule Family β Balochi vs Arabic vs Persian vs Baseline", |
| grp_c, all_metrics) |
|
|
| |
| grp_d = [n for n in ["UrduBERT","Balochi_BPE","Balochi_WordPiece","AraBERT_v2","ParsBERT"] |
| if n in all_metrics] |
| if len(grp_d) >= 2: |
| print_metrics_table("GROUP D: Perso-Arabic Script Family β Urdu / Balochi / Arabic / Persian", |
| grp_d, all_metrics) |
|
|
| |
| for label, members in [ |
| ("GROUP E1: WordPiece β Balochi WordPiece vs BERT Multilingual", |
| ["Balochi_WordPiece","BERT"]), |
| ("GROUP E2: SentencePiece β Balochi SP vs Gemma", |
| ["Balochi_SentencePiece","Gemma"]), |
| ("GROUP E3: BPE β Balochi BPE vs NLTK vs Balochi 30K", |
| ["Balochi_BPE","NLTK","Balochi_30K"]), |
| ]: |
| active = [n for n in members if n in all_metrics] |
| if len(active) >= 2: |
| print_metrics_table(label, active, all_metrics) |
|
|
| |
| |
| |
|
|
| def analyze_results(all_metrics): |
| print("\n" + "=" * 110) |
| print(" ANALYSIS & INTERPRETATION") |
| print("=" * 110) |
|
|
| def best_worst(key, higher=True): |
| valid = {k: v[key] for k, v in all_metrics.items() |
| if v.get(key) is not None and isinstance(v[key], (int, float))} |
| if not valid: |
| return None, None, valid |
| b = max(valid, key=valid.get) if higher else min(valid, key=valid.get) |
| w = min(valid, key=valid.get) if higher else max(valid, key=valid.get) |
| return b, w, valid |
|
|
| |
| print("\n 1. COMPRESSION RATIO (higher = better)") |
| print(" " + "β" * 100) |
| b, w, vals = best_worst("compression_ratio") |
| if b: |
| print(f" Best: {b} ({vals[b]:.2f})") |
| print(f" Worst: {w} ({vals[w]:.2f})") |
|
|
| |
| lang_groups = { |
| "Balochi (custom)": ["Balochi_BPE","Balochi_WordPiece","Balochi_SentencePiece"], |
| "Arabic (custom)": ["AraBERT_v2","CAMeLBERT_MSA","ARBERT","AraGPT2"], |
| "Persian (custom)": ["ParsBERT","PersianBERT_FA","PersianBPE"], |
| "Urdu (custom)": ["UrduBERT"], |
| "Baseline (generic)": ["BERT","Gemma","NLTK"], |
| } |
| print() |
| for lang, members in lang_groups.items(): |
| active = [m for m in members if m in all_metrics] |
| if active: |
| avg_cr = sum(all_metrics[m]["compression_ratio"] for m in active) / len(active) |
| avg_fe = sum(all_metrics[m]["fertility"] for m in active) / len(active) |
| print(f" {lang:<22} avg compression={avg_cr:.2f} avg fertility={avg_fe:.3f} " |
| f"(n={len(active)} tokenizers: {', '.join(active)})") |
|
|
| |
| print("\n\n 2. FERTILITY ANALYSIS (tokens/word β lower = better)") |
| print(" " + "β" * 100) |
| b, w, vals = best_worst("fertility", higher=False) |
| if b: |
| print(f" Best (lowest): {b} ({vals[b]:.3f})") |
| print(f" Worst (highest):{w} ({vals[w]:.3f})") |
| print("\n Tokenizer fertility ranking:") |
| for name, m in sorted(all_metrics.items(), key=lambda x: x[1].get("fertility", 99)): |
| print(f" {name:<35s} {m['fertility']:.3f} tok/word") |
|
|
| |
| print("\n\n 3. UNKNOWN TOKEN COVERAGE (lower = better)") |
| print(" " + "β" * 100) |
| for name, m in all_metrics.items(): |
| cnt = m.get("unk_count", 0) |
| rate = m.get("unk_rate", 0) |
| status = "β EXCELLENT" if cnt == 0 else ("β ACCEPTABLE" if rate < 1.0 else "β POOR") |
| print(f" {name:<35s} UNK: {cnt:>5,} ({rate:.4f}%) [{status}]") |
|
|
| |
| print("\n\n 4. TOKENIZATION SPEED RANKING") |
| print(" " + "β" * 100) |
| for name, m in sorted(all_metrics.items(), key=lambda x: x[1].get("speed", 0), reverse=True): |
| print(f" {name:<35s} {m['speed']:>12,.0f} tok/sec ({m['time_sec']:.4f}s)") |
|
|
| |
| print("\n\n 5. PERSO-ARABIC SCRIPT FAMILY INSIGHT") |
| print(" " + "β" * 100) |
| script_family = [n for n in ["Balochi_BPE","Balochi_WordPiece","Balochi_SentencePiece", |
| "AraBERT_v2","CAMeLBERT_MSA","ARBERT","AraGPT2", |
| "ParsBERT","PersianBERT_FA","PersianBPE","UrduBERT"] |
| if n in all_metrics] |
| if script_family: |
| print(" Compression ratios across Perso-Arabic script tokenizers:") |
| for n in sorted(script_family, key=lambda x: all_metrics[x]["compression_ratio"], reverse=True): |
| cr = all_metrics[n]["compression_ratio"] |
| vs = all_metrics[n]["vocab_size"] |
| vs_str = f"{vs:,}" if vs else "N/A" |
| print(f" {n:<35s} compression={cr:.2f} vocab={vs_str}") |
|
|
| |
| ablation_keys = [k for k in all_metrics if any(k.startswith(p) |
| for p in ["Balochi_BPE_","Balochi_WP_","Balochi_SP_"])] |
| if ablation_keys: |
| print("\n\n 6. VOCAB SIZE ABLATION SUMMARY") |
| print(" " + "β" * 100) |
| for algo in ["BPE","WP","SP"]: |
| group = sorted([k for k in ablation_keys if f"_{algo}_" in k], |
| key=lambda x: all_metrics[x]["vocab_size"] or 0) |
| if group: |
| print(f"\n Algorithm: {algo}") |
| for n in group: |
| m = all_metrics[n] |
| print(f" {n:<35s} vocab={m['vocab_size']:,} " |
| f"compression={m['compression_ratio']:.2f} " |
| f"fertility={m['fertility']:.3f}") |
|
|
| print(f"\n{'=' * 110}") |
|
|
| |
| |
| |
|
|
| def generate_markdown_report(text, sentences, all_tokens, all_metrics, output_filenames, |
| ablation_metrics=None): |
| md = [] |
|
|
| md.append("# Comprehensive Balochi + Cross-Language Tokenizer Comparison\n") |
| md.append(f"**Generated:** {time.strftime('%Y-%m-%d %H:%M:%S')} \n") |
| md.append("**Script:** `Tokenizers_Comparison_Extended.py`\n") |
|
|
| |
| words = text.split() |
| md.append("## 1. Input Text Summary\n") |
| md.append("| Property | Value |") |
| md.append("|----------|-------|") |
| md.append(f"| **File** | `liberal capitalism.txt` |") |
| md.append(f"| **Characters** | {len(text):,} |") |
| md.append(f"| **Words** | {len(words):,} |") |
| md.append(f"| **Sentences** | {len(sentences):,} |\n") |
|
|
| |
| md.append("## 2. Tokenizers Loaded\n") |
| md.append("| # | Tokenizer | Family | Script | Vocab Size | HuggingFace ID |") |
| md.append("|---|-----------|--------|--------|------------|----------------|") |
| tok_meta = [ |
| ("1", "Balochi_BPE", "Balochi", "BPE", "Perso-Arabic", "Local file"), |
| ("2", "Balochi_WordPiece", "Balochi", "WordPiece", "Perso-Arabic", "Local file"), |
| ("3", "Balochi_SentencePiece", "Balochi", "SentencePiece","Perso-Arabic", "Local file"), |
| ("4", "Balochi_30K", "Balochi", "BPE", "Perso-Arabic", "balochiml/balochi-tokenizer"), |
| ("5", "NLTK", "Baseline", "Rule-based", "Any", "β"), |
| ("6", "BERT", "Generic", "WordPiece", "Multilingual", "bert-base-multilingual-cased"), |
| ("7", "Gemma", "Generic", "SP-BPE", "Multilingual", "google/gemma-2b"), |
| ("8", "AraBERT_v2", "Arabic", "WordPiece", "Arabic", "aubmindlab/bert-base-arabertv2"), |
| ("9", "CAMeLBERT_MSA", "Arabic", "WordPiece", "Arabic", "CAMeL-Lab/bert-base-arabic-camelbert-msa"), |
| ("10", "ARBERT", "Arabic", "WordPiece", "Arabic", "UBC-NLP/ARBERT"), |
| ("11", "AraGPT2", "Arabic", "BPE", "Arabic", "aubmindlab/aragpt2-base"), |
| ("12", "ParsBERT", "Persian", "WordPiece", "Perso-Arabic", "HooshvareLab/bert-base-parsbert-uncased"), |
| ("13", "PersianBERT_FA", "Persian", "WordPiece", "Perso-Arabic", "HooshvareLab/bert-fa-base-uncased"), |
| ("14", "PersianBPE", "Persian", "BPE", "Perso-Arabic", "mshojaei77/PersianBPETokenizer"), |
| ("15", "UrduBERT", "Urdu", "WordPiece", "Perso-Arabic", "urduhack/UrduBERT"), |
| ] |
| for num, key, family, algo, script, hf_id in tok_meta: |
| if key in all_metrics: |
| vs = all_metrics[key]["vocab_size"] |
| vs_str = f"{vs:,}" if vs else "N/A" |
| md.append(f"| {num} | **{key}** | {family} | {algo} | {vs_str} | `{hf_id}` |") |
| else: |
| md.append(f"| {num} | ~~{key}~~ | {family} | {algo} | β | Not loaded |") |
| md.append("") |
|
|
| |
| md.append("## 3. Tokenization Results\n") |
| md.append("| Tokenizer | Language | Tokens | Speed (tok/s) | Time (s) |") |
| md.append("|-----------|----------|--------|---------------|----------|") |
| lang_map = { |
| "Balochi_BPE":"Balochi","Balochi_WordPiece":"Balochi","Balochi_SentencePiece":"Balochi", |
| "Balochi_30K":"Balochi","NLTK":"Baseline","BERT":"Multilingual","Gemma":"Multilingual", |
| "AraBERT_v2":"Arabic","CAMeLBERT_MSA":"Arabic","ARBERT":"Arabic","AraGPT2":"Arabic", |
| "ParsBERT":"Persian","PersianBERT_FA":"Persian","PersianBPE":"Persian","UrduBERT":"Urdu", |
| } |
| for name, m in all_metrics.items(): |
| lang = lang_map.get(name, "β") |
| fname = output_filenames.get(name, f"tokens_{name.lower()}.txt") |
| md.append(f"| {name} | {lang} | {m['token_count']:,} | {m['speed']:,.0f} | {m['time_sec']:.4f} |") |
| md.append("") |
|
|
| |
| md.append("## 4. Master Metrics Table\n") |
| all_names = list(all_metrics.keys()) |
| hdr = "| Metric |" + "".join(f" {n} |" for n in all_names) |
| sep = "|--------|" + "".join("--------|" for _ in all_names) |
| md.append(hdr); md.append(sep) |
| rows = [ |
| ("Token Count", "token_count", "{:,}"), |
| ("Unique Tokens", "unique_tokens", "{:,}"), |
| ("Vocab Size", "vocab_size", "{}"), |
| ("Vocab Util. (%)", "vocab_utilization", "{:.2f}%"), |
| ("Compression Ratio", "compression_ratio", "{:.2f}"), |
| ("Fertility", "fertility", "{:.3f}"), |
| ("Avg Token Length", "avg_token_length", "{:.2f}"), |
| ("UNK Count", "unk_count", "{:,}"), |
| ("UNK Rate (%)", "unk_rate", "{:.4f}%"), |
| ("Continuation Rate", "continuation_rate", "{:.2f}%"), |
| ("Speed (tok/s)", "speed", "{:,.0f}"), |
| ("Time (s)", "time_sec", "{:.4f}"), |
| ("Roundtrip Fidelity", "roundtrip_fidelity", "{}"), |
| ] |
| for label, key, fmt in rows: |
| row = f"| **{label}** |" |
| for n in all_names: |
| val = all_metrics[n].get(key) |
| if val is None: |
| row += " N/A |" |
| elif isinstance(val, bool): |
| row += f" {'β' if val else 'β'} |" |
| else: |
| try: |
| row += f" {fmt.format(val)} |" |
| except Exception: |
| row += f" {val} |" |
| md.append(row) |
| md.append("") |
|
|
| |
| groups = [ |
| ("5", "Group A: WordPiece Family", ["Balochi_WordPiece","AraBERT_v2","CAMeLBERT_MSA","ARBERT","BERT"]), |
| ("6", "Group B: SP/WP β Balochi vs Persian vs Gemma", ["Balochi_SentencePiece","ParsBERT","PersianBERT_FA","Gemma"]), |
| ("7", "Group C: BPE Family", ["Balochi_BPE","AraGPT2","PersianBPE","Balochi_30K","NLTK"]), |
| ("8", "Group D: Perso-Arabic Script", ["UrduBERT","Balochi_BPE","Balochi_WordPiece","AraBERT_v2","ParsBERT"]), |
| ("9", "Group E1: Balochi WP vs BERT", ["Balochi_WordPiece","BERT"]), |
| ("10", "Group E2: Balochi SP vs Gemma", ["Balochi_SentencePiece","Gemma"]), |
| ("11", "Group E3: Balochi BPE vs NLTK vs 30K", ["Balochi_BPE","NLTK","Balochi_30K"]), |
| ] |
| for sec, title, members in groups: |
| active = [m for m in members if m in all_metrics] |
| if len(active) < 2: |
| continue |
| md.append(f"## {sec}. {title}\n") |
| hdr = "| Metric |" + "".join(f" {n} |" for n in active) |
| sep = "|--------|" + "".join("--------|" for _ in active) |
| md.append(hdr); md.append(sep) |
| for label, key, fmt in rows: |
| row = f"| **{label}** |" |
| for n in active: |
| val = all_metrics[n].get(key) |
| if val is None: |
| row += " N/A |" |
| elif isinstance(val, bool): |
| row += f" {'β' if val else 'β'} |" |
| else: |
| try: |
| row += f" {fmt.format(val)} |" |
| except Exception: |
| row += f" {val} |" |
| md.append(row) |
| md.append("") |
|
|
| |
| if ablation_metrics: |
| md.append("## 12. Vocabulary Size Ablation Results\n") |
| md.append("> Tokenizers trained at different vocabulary sizes from the " |
| "`Balochi_Tokenizer_Vocab_Ablation.ipynb` notebook.\n") |
| ab_names = list(ablation_metrics.keys()) |
| hdr = "| Metric |" + "".join(f" {n} |" for n in ab_names) |
| sep = "|--------|" + "".join("--------|" for _ in ab_names) |
| md.append(hdr); md.append(sep) |
| for label, key, fmt in rows: |
| row = f"| **{label}** |" |
| for n in ab_names: |
| val = ablation_metrics[n].get(key) |
| if val is None: |
| row += " N/A |" |
| elif isinstance(val, bool): |
| row += f" {'β' if val else 'β'} |" |
| else: |
| try: |
| row += f" {fmt.format(val)} |" |
| except Exception: |
| row += f" {val} |" |
| md.append(row) |
| md.append("") |
|
|
| |
| md.append("## 13. Analysis & Interpretation\n") |
| md.append("### 13.1 Cross-Language Compression Comparison\n") |
|
|
| lang_groups = { |
| "Balochi (custom)": ["Balochi_BPE","Balochi_WordPiece","Balochi_SentencePiece"], |
| "Arabic (custom)": ["AraBERT_v2","CAMeLBERT_MSA","ARBERT","AraGPT2"], |
| "Persian (custom)": ["ParsBERT","PersianBERT_FA","PersianBPE"], |
| "Urdu (custom)": ["UrduBERT"], |
| "Baseline (generic)":["BERT","Gemma","NLTK"], |
| } |
| md.append("| Language Group | Avg Compression | Avg Fertility | Tokenizers |") |
| md.append("|----------------|-----------------|---------------|------------|") |
| for lang, members in lang_groups.items(): |
| active = [m for m in members if m in all_metrics] |
| if active: |
| avg_cr = sum(all_metrics[m]["compression_ratio"] for m in active) / len(active) |
| avg_fe = sum(all_metrics[m]["fertility"] for m in active) / len(active) |
| md.append(f"| {lang} | {avg_cr:.2f} | {avg_fe:.3f} | {', '.join(active)} |") |
| md.append("") |
|
|
| md.append("### 13.2 Key Findings\n") |
| md.append("- **Domain specificity advantage:** Custom Balochi tokenizers are expected to " |
| "outperform generic multilingual tokenizers (BERT, Gemma) on Balochi text by " |
| "producing fewer subword fragments and lower fertility.\n") |
| md.append("- **Script-family proximity:** Arabic and Persian tokenizers share the same " |
| "Perso-Arabic script family as Balochi, making their fertility and UNK rates " |
| "the most meaningful external benchmarks β more so than mBERT or Gemma.\n") |
| md.append("- **Vocabulary size effect:** Larger vocabulary (80Kβ128K) generally reduces " |
| "fertility but increases memory overhead. The optimal point for Balochi is " |
| "determined by the RΓ©nyi efficiency ablation in the companion notebook.\n") |
| md.append("- **AraBERT v2 (64K WP)** and **ParsBERT (100K WP)** serve as the primary " |
| "WordPiece upper-bound references β both trained on 70M+ token Perso-Arabic corpora.\n") |
| md.append("- **AraGPT2 (BPE 50K)** provides the cleanest Arabic BPE reference for " |
| "comparison against Balochi BPE (80K) in the BPE group.\n") |
| md.append("- **UrduBERT** is the most linguistically proximate external tokenizer to " |
| "Balochi due to shared Nastaliq script conventions and similar morphological " |
| "complexity.\n") |
|
|
| md.append("### 13.3 Roundtrip Fidelity\n") |
| md.append("| Tokenizer | Fidelity |") |
| md.append("|-----------|----------|") |
| for name, m in all_metrics.items(): |
| fid = m.get("roundtrip_fidelity") |
| md.append(f"| {name} | {'β Lossless' if fid is True else ('β Lossy' if fid is False else 'β N/A')} |") |
| md.append("") |
|
|
| md.append("## 14. Tokenizer Selection Guide\n") |
| md.append("| Use Case | Recommended Tokenizer | Rationale |") |
| md.append("|----------|----------------------|-----------|") |
| guide = [ |
| ("Balochi BERT fine-tuning", "Balochi_WordPiece 64K", "Native WP; matches BERT architecture exactly"), |
| ("Balochi GPT/Gemma CPT", "Balochi_BPE 80K", "BPE aligns with GPT-2/Gemma training conventions"), |
| ("Balochi SentencePiece pipeline","Balochi_SP 64K", "Zero UNK via byte_fallback; SP models (T5, mT5)"), |
| ("Arabic NER / SA tasks", "AraBERT v2 or CAMeLBERT", "Proven Arabic BERT baselines"), |
| ("Arabic text generation", "AraGPT2", "Arabic GPT-2 with BPE tokenizer"), |
| ("Persian BERT tasks", "ParsBERT", "Standard Persian BERT baseline"), |
| ("Persian text analysis", "PersianBERT_FA", "HooshvareLab FA-base, widely used"), |
| ("Urdu NLP tasks", "UrduBERT", "Nastaliq script; same script family as Balochi"), |
| ("Cross-lingual baseline", "BERT Multilingual 119K", "Covers 104 languages for transfer comparison"), |
| ("Word-count baseline", "NLTK", "Raw word count before subword splitting"), |
| ] |
| for row in guide: |
| md.append(f"| {row[0]} | **{row[1]}** | {row[2]} |") |
| md.append("") |
|
|
| md.append("## 15. Citation\n") |
| md.append("```bibtex") |
| md.append("@misc{hafeezullah2025balochi,") |
| md.append(" title = {Comprehensive Balochi Tokenizer Comparison: Custom vs. Cross-Language Baselines},") |
| md.append(" author = {Hafeez Ullah},") |
| md.append(" year = {2025},") |
| md.append(" url = {https://huggingface.co/balochiml},") |
| md.append(" note = {University of Gwadar, Department of Computer Science}") |
| md.append("}") |
| md.append("```\n") |
|
|
| |
| md_content = "\n".join(md) |
| md_path = os.path.join(OUTPUT_DIR, "Tokenizer_Comparison_Extended_Report.md") |
| with open(md_path, "w", encoding="utf-8") as f: |
| f.write(md_content) |
| return md_path |
|
|
| |
| |
| |
|
|
| def main(): |
| print("β" + "β" * 108 + "β") |
| print("β" + " BALOCHI + CROSS-LANGUAGE TOKENIZER COMPARISON (EXTENDED) ".center(108) + "β") |
| print("β" + "β" * 108 + "β") |
|
|
| |
| print("\n[STEP 1] Loading input text...") |
| text = load_text(INPUT_FILE) |
| words = text.split() |
| sentences = split_sentences(text) |
| print(f" Characters : {len(text):,}") |
| print(f" Words : {len(words):,}") |
| print(f" Sentences : {len(sentences):,}") |
|
|
| |
| print("\n[STEP 2] Loading primary tokenizers (15 total)...") |
| toks = load_all_tokenizers() |
| print(f"\n β Loaded {len(toks)}/15 primary tokenizers.") |
|
|
| |
| print("\n[STEP 3] Loading ablation tokenizers (vocab-size study)...") |
| ablation_toks = load_ablation_tokenizers() |
| if ablation_toks: |
| print(f" β Loaded {len(ablation_toks)} ablation tokenizers.") |
| else: |
| print(" βΉ No ablation tokenizer files found β run the notebook first to generate them.") |
|
|
| if not toks: |
| print("ERROR: No tokenizers loaded. Exiting.") |
| sys.exit(1) |
|
|
| |
| print("\n[STEP 4] Running tokenization...") |
| all_tokens = {} |
| all_metrics = {} |
| ablation_metrics = {} |
|
|
| output_filenames = {} |
| |
| name_map = { |
| "Balochi_BPE": "tokens_balochi_bpe.txt", |
| "Balochi_WordPiece": "tokens_balochi_wordpiece.txt", |
| "Balochi_SentencePiece":"tokens_balochi_sentencepiece.txt", |
| "Balochi_30K": "tokens_balochi_30k.txt", |
| "NLTK": "tokens_nltk.txt", |
| "BERT": "tokens_bert_multilingual.txt", |
| "Gemma": "tokens_gemma.txt", |
| "AraBERT_v2": "tokens_arabert_v2.txt", |
| "CAMeLBERT_MSA": "tokens_camelbert_msa.txt", |
| "ARBERT": "tokens_arbert.txt", |
| "AraGPT2": "tokens_aragpt2.txt", |
| "ParsBERT": "tokens_parsbert.txt", |
| "PersianBERT_FA": "tokens_persianbert_fa.txt", |
| "PersianBPE": "tokens_persian_bpe.txt", |
| "UrduBERT": "tokens_urdubert.txt", |
| } |
|
|
| for name, tok_info in toks.items(): |
| print(f" Tokenizing {name}...", end=" ", flush=True) |
| tokens, elapsed = tokenize_text(name, tok_info, text) |
| all_tokens[name] = tokens |
| all_metrics[name] = compute_metrics(name, tok_info, tokens, elapsed, text) |
| fname = name_map.get(name, f"tokens_{name.lower()}.txt") |
| output_filenames[name] = fname |
| save_tokens(fname, tokens) |
| print(f"Done! ({len(tokens):,} tokens in {elapsed:.4f}s)") |
|
|
| for name, tok_info in ablation_toks.items(): |
| print(f" [Ablation] Tokenizing {name}...", end=" ", flush=True) |
| tokens, elapsed = tokenize_text(name, tok_info, text) |
| ablation_metrics[name] = compute_metrics(name, tok_info, tokens, elapsed, text) |
| fname = f"tokens_{name.lower()}.txt" |
| output_filenames[name] = fname |
| save_tokens(fname, tokens) |
| print(f"Done! ({len(tokens):,} tokens in {elapsed:.4f}s)") |
|
|
| |
| print("\n[STEP 5] Sample token output...") |
| print_sample_tokens(toks, sentences) |
|
|
| |
| print("\n[STEP 6] Comparison tables...") |
| print_all_comparisons(all_metrics) |
|
|
| |
| print("\n[STEP 7] Analysis & interpretation...") |
| analyze_results(all_metrics) |
|
|
| |
| print("\n[STEP 8] Generating Markdown report...") |
| md_path = generate_markdown_report( |
| text, sentences, all_tokens, all_metrics, output_filenames, |
| ablation_metrics=ablation_metrics if ablation_metrics else None |
| ) |
| print(f" β Report saved: {md_path}") |
|
|
| |
| print(f"\n{'β' * 110}") |
| print(f" OUTPUT β {OUTPUT_DIR}") |
| print(f"{'β' * 110}") |
| for name in {**toks, **ablation_toks}: |
| fname = output_filenames.get(name, f"tokens_{name.lower()}.txt") |
| fpath = os.path.join(OUTPUT_DIR, fname) |
| size = os.path.getsize(fpath) if os.path.exists(fpath) else 0 |
| print(f" β {fname:<50s} ({size:>10,} bytes)") |
| md_size = os.path.getsize(md_path) if os.path.exists(md_path) else 0 |
| print(f" β {'Tokenizer_Comparison_Extended_Report.md':<50s} ({md_size:>10,} bytes)") |
| print(f"{'β' * 110}") |
| print(f"\n β
ALL DONE β {len(toks)} primary + {len(ablation_toks)} ablation tokenizers compared.\n") |
|
|
|
|
| if __name__ == "__main__": |
| main() |
|
|