"""
5-run averaged tokenizer benchmark for final results.

Samples Arabic and English documents from the parquet shards in ``DATA_DIR``,
measures fertility (tokens per word) and characters-per-token for every
tokenizer, and averages the metrics over ``NUM_RUNS`` differently seeded
samples.  The results are printed as a table ranked by parity (Arabic
chars/token divided by English chars/token, closest to 1.0 first) and written
to ``OUTPUT_PATH`` as JSON.
"""
import glob as globmod
import json
import os
import random
import re
import sys
import time
from collections import Counter

# Must run before the project-local import further down.
sys.path.insert(0, "/root/workspace/smctm")


def _load_env_file(path):
    """Populate ``os.environ`` from a ``KEY=VALUE`` file without overriding.

    Blank lines, ``#`` comments and lines without ``=`` are ignored.  Variables
    that are already set in the environment win (``setdefault``).  A single
    pair of matching surrounding quotes is removed from the value, so
    ``KEY="value"`` yields ``value`` rather than ``"value"``.
    A missing file is silently ignored.
    """
    if not os.path.exists(path):
        return
    with open(path, encoding="utf-8") as env_file:
        for raw_line in env_file:
            line = raw_line.strip()
            if not line or line.startswith("#") or "=" not in line:
                continue
            key, value = line.split("=", 1)
            value = value.strip()
            if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
                value = value[1:-1]
            os.environ.setdefault(key.strip(), value)


# Load .env
_env_path = "/root/workspace/smctm/.env"
_load_env_file(_env_path)

# If the optional hf_transfer package is missing, drop the flag that asks for it.
try:
    import hf_transfer  # noqa: F401
except ImportError:
    os.environ.pop("HF_HUB_ENABLE_HF_TRANSFER", None)

# These imports intentionally come after the sys.path / environment setup above.
import pyarrow.parquet as pq  # noqa: E402
from scripts.rewrite_bytes import ByteRewriter  # noqa: E402


# ── Tokenizer wrappers ──────────────────────────────────────────────
# Every wrapper exposes the same small interface used by compute_metrics():
# ``encode(text) -> list of token ids``, ``vocab_size`` and ``name``.

class SarfTokenizer:
    """Tokenizer loaded from ``tokenizer.json`` that pre-rewrites its input.

    Text is passed through ``ByteRewriter.rewrite_text`` (configured with
    ``morf_map_path``) before being encoded.
    """

    def __init__(self, tokenizer_dir, morf_map_path, display_name="SARF"):
        from transformers import PreTrainedTokenizerFast
        self._tok = PreTrainedTokenizerFast(
            tokenizer_file=os.path.join(tokenizer_dir, "tokenizer.json")
        )
        self._rewriter = ByteRewriter(morf_map_path)
        self._name = display_name

    def encode(self, text):
        """Rewrite ``text`` and return its token ids (no special tokens added)."""
        return self._tok.encode(self._rewriter.rewrite_text(text), add_special_tokens=False)

    @property
    def vocab_size(self):
        return len(self._tok)

    @property
    def name(self):
        return self._name


class TiktokenTokenizer:
    """Wrapper around a named tiktoken encoding."""

    def __init__(self, encoding_name, display_name=None):
        import tiktoken
        self._enc = tiktoken.get_encoding(encoding_name)
        self._name = display_name or encoding_name

    def encode(self, text):
        """Return token ids; special-token strings in ``text`` are allowed."""
        # allowed_special="all" keeps encode() from raising when the raw text
        # happens to contain a special-token string.
        return self._enc.encode(text, allowed_special="all")

    @property
    def vocab_size(self):
        return self._enc.n_vocab

    @property
    def name(self):
        return self._name


class HFTokenizer:
    """Wrapper around a Hugging Face ``AutoTokenizer`` identified by model id."""

    def __init__(self, model_id, display_name=None):
        from transformers import AutoTokenizer
        # NOTE(review): trust_remote_code=True lets the model repository run
        # its own Python code while loading; only use with trusted model ids.
        try:
            self._tok = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
        except Exception:
            # Deliberate best-effort: retry with the slow implementation when
            # the default load fails for any reason.
            self._tok = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True, use_fast=False)
        self._name = display_name or model_id.split("/")[-1]

    def encode(self, text):
        """Return the token ids of ``text`` (no special tokens added)."""
        return self._tok.encode(text, add_special_tokens=False)

    @property
    def vocab_size(self):
        return len(self._tok)

    @property
    def name(self):
        return self._name


# ── Data loading ─────────────────────────────────────────────────────
AR_DETECT = re.compile(r'[\u0600-\u06FF]')


def load_all_samples(data_dir, min_chars=100, max_chars=2000,
                     target_per_language=25000):
    """Collect Arabic and English text samples from ``*.parquet`` files.

    Files are visited in sorted order and only their ``text`` column is read.
    A document is classified by the share of its characters in the
    U+0600-U+06FF block: above 0.3 it is Arabic, below 0.05 it is English,
    anything in between is discarded.

    Args:
        data_dir: Directory containing the parquet files.
        min_chars: Documents shorter than this (or null/empty) are skipped.
        max_chars: Each kept document is truncated to this many characters.
        target_per_language: Loading stops after the first file at whose end
            both pools hold at least this many samples.  The check is made per
            file, so the pools may end up larger than the target.

    Returns:
        ``(ar_samples, en_samples)``, two lists of strings.
    """
    parquet_files = sorted(globmod.glob(os.path.join(data_dir, '*.parquet')))
    ar_samples, en_samples = [], []
    for filepath in parquet_files:
        pf = pq.ParquetFile(filepath)
        for rg_idx in range(pf.num_row_groups):
            # Only the text column is needed; avoid decoding the others.
            rg = pf.read_row_group(rg_idx, columns=["text"])
            for text in rg.column("text").to_pylist():
                # Null rows come back as None; skip them instead of crashing.
                if not text or len(text) < min_chars:
                    continue
                ar_ratio = len(AR_DETECT.findall(text)) / len(text)
                if ar_ratio > 0.3:
                    ar_samples.append(text[:max_chars])
                elif ar_ratio < 0.05:
                    en_samples.append(text[:max_chars])
        if (len(ar_samples) >= target_per_language
                and len(en_samples) >= target_per_language):
            break
    print(f"Loaded {len(ar_samples)} Arabic, {len(en_samples)} English samples total")
    return ar_samples, en_samples


# ── Metrics ─────────────────────────────────────────────────────────
AR_WORD = re.compile(r'[\u0600-\u06FF]+')
EN_WORD = re.compile(r'[a-zA-Z]+')


def _corpus_stats(tokenizer, texts, word_pattern):
    """Return ``(fertility, chars_per_token)`` of ``tokenizer`` on ``texts``.

    Fertility is the total number of tokens produced when each word matched by
    ``word_pattern`` is encoded on its own, divided by the number of words.
    Chars-per-token is total characters divided by total tokens of the full
    texts.  Either value is 0 when its denominator is 0.
    """
    total_chars = 0
    total_tokens = 0
    word_counts = Counter()
    for text in texts:
        total_chars += len(text)
        total_tokens += len(tokenizer.encode(text))
        word_counts.update(word_pattern.findall(text))

    # Encode each distinct word once and weight by its frequency; the totals
    # equal those of encoding every occurrence separately.
    total_words = sum(word_counts.values())
    total_word_tokens = sum(
        count * len(tokenizer.encode(word))
        for word, count in word_counts.items()
    )

    fertility = total_word_tokens / total_words if total_words else 0
    chars_per_token = total_chars / total_tokens if total_tokens else 0
    return fertility, chars_per_token


def compute_metrics(tokenizer, ar_texts, en_texts):
    """Compute fertility, chars-per-token and parity for one tokenizer.

    Args:
        tokenizer: Object with an ``encode(text)`` method returning a sequence.
        ar_texts: Arabic documents.
        en_texts: English documents.

    Returns:
        Dict with ``ar_fertility``, ``ar_cpt``, ``en_fertility``, ``en_cpt``,
        ``parity`` (``ar_cpt / en_cpt``, 0 if ``en_cpt`` is 0) and
        ``avg_fertility`` (mean of the two fertilities).
    """
    ar_fertility, ar_cpt = _corpus_stats(tokenizer, ar_texts, AR_WORD)
    en_fertility, en_cpt = _corpus_stats(tokenizer, en_texts, EN_WORD)
    parity = ar_cpt / en_cpt if en_cpt else 0
    return {
        "ar_fertility": ar_fertility,
        "ar_cpt": ar_cpt,
        "en_fertility": en_fertility,
        "en_cpt": en_cpt,
        "parity": parity,
        "avg_fertility": (ar_fertility + en_fertility) / 2,
    }


# ── Main ─────────────────────────────────────────────────────────────
BASE = "/root/.cache/DeepLatent"
FULL_DIR = f"{BASE}/tokenizer_parity_runs/full"
DATA_DIR = f"{BASE}/base_data"
BASIC_MAP = f"{BASE}/morfessor_models/morf_map.basic.json"
SUPP_MAP = f"{BASE}/morfessor_models/morf_map.supp.json"

# All tokenizers: (display name, tokenizer directory, morf map path)
SARF_TOKENIZERS = [
    ("SARF-65k-v2", f"{FULL_DIR}/basic_vs64000_ar115_en135", BASIC_MAP),
    ("SARF-65k", f"{FULL_DIR}/basic_vs64000_ar125_en135", BASIC_MAP),
    ("SARF-88k-plus", f"{FULL_DIR}/supp_vs64000_ar115_en145", SUPP_MAP),
    ("SARF-115k-plus", f"{FULL_DIR}/supp_vs96000_ar125_en135", SUPP_MAP),
]

# Baselines: (display name, loader type, encoding name or model id)
BASELINE_TOKENIZERS = [
    ("GPT-4o", "tiktoken", "o200k_base"),
    ("Gemma-3-4B", "hf", "google/gemma-3-4b-it"),
    ("Command-R-Arabic", "hf", "CohereLabs/c4ai-command-r7b-arabic-02-2025"),
    ("Fanar-1-9B", "hf", "QCRI/Fanar-1-9B-Instruct"),
    ("Qwen3-4B", "hf", "Qwen/Qwen3-4B-Instruct-2507"),
]

NUM_RUNS = 5
SAMPLES_PER_RUN = 5000
OUTPUT_PATH = "/tmp/benchmark_5runs_final.json"


def _load_tokenizers():
    """Instantiate every configured tokenizer, skipping those that fail."""
    tokenizers = []
    for name, tok_dir, morf_map in SARF_TOKENIZERS:
        print(f" {name}...", end=" ", flush=True)
        try:
            tok = SarfTokenizer(tok_dir, morf_map, name)
            print(f"OK (vocab={tok.vocab_size:,})")
            tokenizers.append(tok)
        except Exception as e:
            # Best-effort: report the failure and benchmark the rest.
            print(f"FAILED: {e}")

    for name, typ, source in BASELINE_TOKENIZERS:
        print(f" {name}...", end=" ", flush=True)
        try:
            if typ == "tiktoken":
                tok = TiktokenTokenizer(source, name)
            else:
                tok = HFTokenizer(source, name)
            print(f"OK (vocab={tok.vocab_size:,})")
            tokenizers.append(tok)
        except Exception as e:
            print(f"FAILED: {e}")
    return tokenizers


def _run_benchmark(tokenizers, all_ar, all_en):
    """Evaluate every tokenizer on ``NUM_RUNS`` seeded random samples.

    Returns a dict mapping tokenizer name to its list of per-run metric dicts.
    """
    all_runs = {tok.name: [] for tok in tokenizers}
    for run in range(NUM_RUNS):
        print(f"\n{'='*80}")
        print(f"RUN {run+1}/{NUM_RUNS}")
        print(f"{'='*80}")

        # A distinct fixed seed per run keeps the samples reproducible.
        random.seed(42 + run)
        ar_sample = random.sample(all_ar, min(SAMPLES_PER_RUN, len(all_ar)))
        en_sample = random.sample(all_en, min(SAMPLES_PER_RUN, len(all_en)))
        print(f"Sampled {len(ar_sample)} AR, {len(en_sample)} EN")

        for tok in tokenizers:
            print(f" {tok.name}...", end=" ", flush=True)
            t0 = time.time()
            m = compute_metrics(tok, ar_sample, en_sample)
            all_runs[tok.name].append(m)
            print(f"parity={m['parity']:.4f} ({time.time()-t0:.1f}s)")
    return all_runs


def _summarize(tokenizers, all_runs):
    """Average the per-run metrics and rank tokenizers by |1 - parity|."""
    results = []
    for tok in tokenizers:
        runs = all_runs[tok.name]
        n = len(runs)
        parity_vals = [r["parity"] for r in runs]
        parity_avg = sum(parity_vals) / n
        # Population standard deviation (divides by n, not n - 1).
        parity_std = (sum((v - parity_avg)**2 for v in parity_vals) / n) ** 0.5
        results.append({
            "name": tok.name,
            "vocab_size": tok.vocab_size,
            "ar_fertility_avg": sum(r["ar_fertility"] for r in runs) / n,
            "en_fertility_avg": sum(r["en_fertility"] for r in runs) / n,
            "avg_fertility_avg": sum(r["avg_fertility"] for r in runs) / n,
            "ar_cpt_avg": sum(r["ar_cpt"] for r in runs) / n,
            "en_cpt_avg": sum(r["en_cpt"] for r in runs) / n,
            "parity_avg": parity_avg,
            "parity_std": parity_std,
            "runs": runs,
        })
    # Sort by parity (closer to 1.0)
    return sorted(results, key=lambda r: abs(1.0 - r["parity_avg"]))


def _print_table(results_sorted):
    """Print the ranked results table."""
    print("\n" + "=" * 130)
    print(f"FINAL RESULTS (averaged over {NUM_RUNS} runs)")
    print("=" * 130)
    header = (
        f"{'Rank':<5} {'Tokenizer':<20} {'Vocab':>10} {'AR Fert':>10} "
        f"{'EN Fert':>10} {'Avg Fert':>10} {'AR C/T':>10} {'EN C/T':>10} "
        f"{'Parity':>10} {'±Std':>8}"
    )
    print(header)
    print("-" * 130)
    for rank, r in enumerate(results_sorted, 1):
        is_best = rank == 1
        is_sarf = "SARF" in r["name"]
        marker = " 🏆" if is_best else (" ***" if is_sarf else "")
        print(
            f"{rank:<5} {r['name']:<20} {r['vocab_size']:>10,} "
            f"{r['ar_fertility_avg']:>10.3f} {r['en_fertility_avg']:>10.3f} "
            f"{r['avg_fertility_avg']:>10.3f} {r['ar_cpt_avg']:>10.3f} "
            f"{r['en_cpt_avg']:>10.3f} {r['parity_avg']:>10.4f} "
            f"{r['parity_std']:>7.4f}{marker}"
        )
    print("=" * 130)
    print("*** = SARF tokenizers | 🏆 = Best parity")


def _save_results(results_sorted, path):
    """Write the averaged and the per-run metrics to ``path`` as JSON."""
    output = {
        "num_runs": NUM_RUNS,
        "samples_per_run": SAMPLES_PER_RUN,
        "results": [{k: v for k, v in r.items() if k != "runs"} for r in results_sorted],
        "detailed_runs": {r["name"]: r["runs"] for r in results_sorted},
    }
    # ensure_ascii=False can emit non-ASCII text, so pin the file encoding.
    with open(path, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2, ensure_ascii=False)
    print(f"\nResults saved to {path}")


def main():
    """Load tokenizers and data, run the benchmark, then print and save results."""
    print("=" * 100)
    print(f"{NUM_RUNS}-RUN AVERAGED BENCHMARK")
    print("=" * 100)

    # Load tokenizers
    print("\nLoading tokenizers...")
    tokenizers = _load_tokenizers()
    print(f"\nLoaded {len(tokenizers)} tokenizers.")

    # Load all samples
    print("\nLoading evaluation data...")
    all_ar, all_en = load_all_samples(DATA_DIR)

    # Run benchmark NUM_RUNS times
    all_runs = _run_benchmark(tokenizers, all_ar, all_en)

    # Compute averages
    print("\n" + "=" * 100)
    print("COMPUTING AVERAGES")
    print("=" * 100)
    results_sorted = _summarize(tokenizers, all_runs)

    # Print table
    _print_table(results_sorted)

    # Save results
    _save_results(results_sorted, OUTPUT_PATH)


if __name__ == "__main__":
    main()