"""
5-run averaged benchmark for final results.
"""
import os, sys, json, time, re, random
sys.path.insert(0, "/root/workspace/smctm")
# Load .env (KEY=VALUE lines; blanks and comments are skipped, and existing
# environment variables take precedence via setdefault).
_env_path = "/root/workspace/smctm/.env"
if os.path.exists(_env_path):
    with open(_env_path) as _f:
        for _line in _f:
            _line = _line.strip()
            if _line and not _line.startswith("#") and "=" in _line:
                _k, _v = _line.split("=", 1)
                os.environ.setdefault(_k.strip(), _v.strip())
# hf_transfer is optional; if it is missing, drop the fast-download flag so
# huggingface_hub falls back to its default downloader.
try:
    import hf_transfer
except ImportError:
    os.environ.pop("HF_HUB_ENABLE_HF_TRANSFER", None)
import pyarrow.parquet as pq
import glob as globmod
from scripts.rewrite_bytes import ByteRewriter
# ── Tokenizer wrappers ──────────────────────────────────────────────
class SarfTokenizer:
    """SARF tokenizer: applies a ByteRewriter morph map to the text, then
    encodes with a trained PreTrainedTokenizerFast."""

    def __init__(self, tokenizer_dir, morf_map_path, display_name="SARF"):
        from transformers import PreTrainedTokenizerFast
        self._tok = PreTrainedTokenizerFast(
            tokenizer_file=os.path.join(tokenizer_dir, "tokenizer.json")
        )
        self._rewriter = ByteRewriter(morf_map_path)
        self._name = display_name

    def encode(self, text):
        return self._tok.encode(self._rewriter.rewrite_text(text), add_special_tokens=False)

    @property
    def vocab_size(self):
        return len(self._tok)

    @property
    def name(self):
        return self._name
class TiktokenTokenizer:
    """Wrapper around a tiktoken encoding (e.g. o200k_base)."""

    def __init__(self, encoding_name, display_name=None):
        import tiktoken
        self._enc = tiktoken.get_encoding(encoding_name)
        self._name = display_name or encoding_name

    def encode(self, text):
        return self._enc.encode(text, allowed_special="all")

    @property
    def vocab_size(self):
        return self._enc.n_vocab

    @property
    def name(self):
        return self._name
class HFTokenizer:
    """Wrapper around a Hugging Face AutoTokenizer, falling back to the slow
    tokenizer when the fast one fails to load."""

    def __init__(self, model_id, display_name=None):
        from transformers import AutoTokenizer
        try:
            self._tok = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
        except Exception:
            self._tok = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True, use_fast=False)
        self._name = display_name or model_id.split("/")[-1]

    def encode(self, text):
        return self._tok.encode(text, add_special_tokens=False)

    @property
    def vocab_size(self):
        return len(self._tok)

    @property
    def name(self):
        return self._name
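# The three wrappers above share a duck-typed interface — encode(text),
# .vocab_size, .name — so the benchmark loop can treat them uniformly.
# Minimal usage sketch (illustrative only; assumes tiktoken is installed):
#
#     tok = TiktokenTokenizer("o200k_base", "GPT-4o")
#     ids = tok.encode("hello world")   # -> list[int], no special tokens added
#     print(tok.name, tok.vocab_size)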
# ── Data loading ─────────────────────────────────────────────────────
AR_DETECT = re.compile(r'[\u0600-\u06FF]')

def load_all_samples(data_dir):
    """Scan the parquet files in data_dir and bucket texts into Arabic and
    English samples by the ratio of Arabic-block codepoints."""
    parquet_files = sorted(globmod.glob(os.path.join(data_dir, '*.parquet')))
    ar_samples, en_samples = [], []
    for filepath in parquet_files:
        pf = pq.ParquetFile(filepath)
        for rg_idx in range(pf.num_row_groups):
            rg = pf.read_row_group(rg_idx)
            for text in rg.column("text").to_pylist():
                if len(text) < 100:
                    continue
                ar_chars = len(AR_DETECT.findall(text))
                ar_ratio = ar_chars / len(text)
                if ar_ratio > 0.3:
                    ar_samples.append(text[:2000])
                elif ar_ratio < 0.05:
                    en_samples.append(text[:2000])
        # Stop scanning further files once both pools are large enough.
        if len(ar_samples) >= 25000 and len(en_samples) >= 25000:
            break
    print(f"Loaded {len(ar_samples)} Arabic, {len(en_samples)} English samples total")
    return ar_samples, en_samples
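# Illustration of the bucketing thresholds above (hypothetical text): a
# 400-char document with 150 Arabic-block codepoints has ar_ratio = 0.375
# (> 0.3), so it lands in the Arabic bucket; one with 10 such codepoints has
# ar_ratio = 0.025 (< 0.05) and lands in the English bucket; anything in
# between is treated as mixed-language and skipped.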
# ── Metrics ─────────────────────────────────────────────────────────
AR_WORD = re.compile(r'[\u0600-\u06FF]+')
EN_WORD = re.compile(r'[a-zA-Z]+')

def compute_metrics(tokenizer, ar_texts, en_texts):
    """Per-language fertility (tokens per word) and chars-per-token (CPT),
    plus parity = Arabic CPT / English CPT; 1.0 means both languages are
    compressed equally well."""
    ar_total_chars = ar_total_tokens = ar_total_words = ar_total_word_tokens = 0
    for text in ar_texts:
        tokens = tokenizer.encode(text)
        ar_total_chars += len(text)
        ar_total_tokens += len(tokens)
        words = AR_WORD.findall(text)
        ar_total_words += len(words)
        for w in words:
            ar_total_word_tokens += len(tokenizer.encode(w))
    en_total_chars = en_total_tokens = en_total_words = en_total_word_tokens = 0
    for text in en_texts:
        tokens = tokenizer.encode(text)
        en_total_chars += len(text)
        en_total_tokens += len(tokens)
        words = EN_WORD.findall(text)
        en_total_words += len(words)
        for w in words:
            en_total_word_tokens += len(tokenizer.encode(w))
    ar_fertility = ar_total_word_tokens / ar_total_words if ar_total_words else 0
    ar_cpt = ar_total_chars / ar_total_tokens if ar_total_tokens else 0
    en_fertility = en_total_word_tokens / en_total_words if en_total_words else 0
    en_cpt = en_total_chars / en_total_tokens if en_total_tokens else 0
    parity = ar_cpt / en_cpt if en_cpt else 0
    return {
        "ar_fertility": ar_fertility,
        "ar_cpt": ar_cpt,
        "en_fertility": en_fertility,
        "en_cpt": en_cpt,
        "parity": parity,
        "avg_fertility": (ar_fertility + en_fertility) / 2,
    }
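# Worked example with illustrative (not measured) numbers: if a tokenizer
# averages ar_cpt = 3.2 chars/token on Arabic and en_cpt = 4.0 on English,
# then parity = 3.2 / 4.0 = 0.80, i.e. Arabic text consumes ~25% more tokens
# per character than English under that tokenizer.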
# ── Main ─────────────────────────────────────────────────────────────
BASE = "/root/.cache/DeepLatent"
FULL_DIR = f"{BASE}/tokenizer_parity_runs/full"
DATA_DIR = f"{BASE}/base_data"
BASIC_MAP = f"{BASE}/morfessor_models/morf_map.basic.json"
SUPP_MAP = f"{BASE}/morfessor_models/morf_map.supp.json"
# All tokenizers
SARF_TOKENIZERS = [
    ("SARF-65k-v2", f"{FULL_DIR}/basic_vs64000_ar115_en135", BASIC_MAP),
    ("SARF-65k", f"{FULL_DIR}/basic_vs64000_ar125_en135", BASIC_MAP),
    ("SARF-88k-plus", f"{FULL_DIR}/supp_vs64000_ar115_en145", SUPP_MAP),
    ("SARF-115k-plus", f"{FULL_DIR}/supp_vs96000_ar125_en135", SUPP_MAP),
]
BASELINE_TOKENIZERS = [
    ("GPT-4o", "tiktoken", "o200k_base"),
    ("Gemma-3-4B", "hf", "google/gemma-3-4b-it"),
    ("Command-R-Arabic", "hf", "CohereLabs/c4ai-command-r7b-arabic-02-2025"),
    ("Fanar-1-9B", "hf", "QCRI/Fanar-1-9B-Instruct"),
    ("Qwen3-4B", "hf", "Qwen/Qwen3-4B-Instruct-2507"),
]
NUM_RUNS = 5
SAMPLES_PER_RUN = 5000
print("=" * 100)
print(f"5-RUN AVERAGED BENCHMARK")
print("=" * 100)
# Load tokenizers
print("\nLoading tokenizers...")
tokenizers = []
for name, tok_dir, morf_map in SARF_TOKENIZERS:
    print(f" {name}...", end=" ", flush=True)
    try:
        tok = SarfTokenizer(tok_dir, morf_map, name)
        print(f"OK (vocab={tok.vocab_size:,})")
        tokenizers.append(tok)
    except Exception as e:
        print(f"FAILED: {e}")
for name, typ, source in BASELINE_TOKENIZERS:
    print(f" {name}...", end=" ", flush=True)
    try:
        if typ == "tiktoken":
            tok = TiktokenTokenizer(source, name)
        else:
            tok = HFTokenizer(source, name)
        print(f"OK (vocab={tok.vocab_size:,})")
        tokenizers.append(tok)
    except Exception as e:
        print(f"FAILED: {e}")
print(f"\nLoaded {len(tokenizers)} tokenizers.")
# Load all samples
print("\nLoading evaluation data...")
all_ar, all_en = load_all_samples(DATA_DIR)
# Run benchmark 5 times
all_runs = {tok.name: [] for tok in tokenizers}
for run in range(NUM_RUNS):
    print(f"\n{'='*80}")
    print(f"RUN {run+1}/{NUM_RUNS}")
    print(f"{'='*80}")
    random.seed(42 + run)  # distinct but reproducible sample per run
    ar_sample = random.sample(all_ar, min(SAMPLES_PER_RUN, len(all_ar)))
    en_sample = random.sample(all_en, min(SAMPLES_PER_RUN, len(all_en)))
    print(f"Sampled {len(ar_sample)} AR, {len(en_sample)} EN")
    for tok in tokenizers:
        print(f" {tok.name}...", end=" ", flush=True)
        t0 = time.time()
        m = compute_metrics(tok, ar_sample, en_sample)
        all_runs[tok.name].append(m)
        print(f"parity={m['parity']:.4f} ({time.time()-t0:.1f}s)")
# Compute averages
print("\n" + "=" * 100)
print("COMPUTING AVERAGES")
print("=" * 100)
results = []
for tok in tokenizers:
    runs = all_runs[tok.name]
    n = len(runs)
    parity_vals = [r["parity"] for r in runs]
    parity_avg = sum(parity_vals) / n
    # Population standard deviation of parity across the runs.
    parity_std = (sum((v - parity_avg)**2 for v in parity_vals) / n) ** 0.5
    avg = {
        "name": tok.name,
        "vocab_size": tok.vocab_size,
        "ar_fertility_avg": sum(r["ar_fertility"] for r in runs) / n,
        "en_fertility_avg": sum(r["en_fertility"] for r in runs) / n,
        "avg_fertility_avg": sum(r["avg_fertility"] for r in runs) / n,
        "ar_cpt_avg": sum(r["ar_cpt"] for r in runs) / n,
        "en_cpt_avg": sum(r["en_cpt"] for r in runs) / n,
        "parity_avg": parity_avg,
        "parity_std": parity_std,
        "runs": runs,
    }
    results.append(avg)
# Rank by distance of average parity from the ideal value 1.0.
results_sorted = sorted(results, key=lambda r: abs(1.0 - r["parity_avg"]))
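# e.g. parity_avg = 0.97 gives |1.0 - 0.97| = 0.03 and ranks ahead of
# parity_avg = 1.05 (|1.0 - 1.05| = 0.05): undershooting and overshooting
# 1.0 are penalized symmetrically.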
# Print table
print("\n" + "=" * 130)
print(f"FINAL RESULTS (averaged over {NUM_RUNS} runs)")
print("=" * 130)
header = f"{'Rank':<5} {'Tokenizer':<20} {'Vocab':>10} {'AR Fert':>10} {'EN Fert':>10} {'Avg Fert':>10} {'AR C/T':>10} {'EN C/T':>10} {'Parity':>10} {'Β±Std':>8}"
print(header)
print("-" * 130)
for rank, r in enumerate(results_sorted, 1):
    is_best = rank == 1
    is_sarf = "SARF" in r["name"]
    marker = " 🏆" if is_best else (" ***" if is_sarf else "")
    print(f"{rank:<5} {r['name']:<20} {r['vocab_size']:>10,} {r['ar_fertility_avg']:>10.3f} {r['en_fertility_avg']:>10.3f} {r['avg_fertility_avg']:>10.3f} {r['ar_cpt_avg']:>10.3f} {r['en_cpt_avg']:>10.3f} {r['parity_avg']:>10.4f} {r['parity_std']:>7.4f}{marker}")
print("=" * 130)
print("*** = SARF tokenizers | πŸ† = Best parity")
# Save results
output = {
    "num_runs": NUM_RUNS,
    "samples_per_run": SAMPLES_PER_RUN,
    "results": [{k: v for k, v in r.items() if k != "runs"} for r in results_sorted],
    "detailed_runs": {r["name"]: r["runs"] for r in results_sorted},
}
with open("/tmp/benchmark_5runs_final.json", "w") as f:
    json.dump(output, f, indent=2, ensure_ascii=False)
print("\nResults saved to /tmp/benchmark_5runs_final.json")