|
|
""" |
|
|
5-run averaged benchmark for final results. |
|
|
""" |
|
|
import os, sys, json, time, re, random

sys.path.insert(0, "/root/workspace/smctm")

# Load KEY=VALUE pairs from the project .env file (comment lines skipped;
# variables that are already set in the environment are not overridden).
_env_path = "/root/workspace/smctm/.env"
if os.path.exists(_env_path):
    with open(_env_path) as _f:
        for _line in _f:
            _line = _line.strip()
            if _line and not _line.startswith("#") and "=" in _line:
                _k, _v = _line.split("=", 1)
                os.environ.setdefault(_k.strip(), _v.strip())

# hf_transfer is optional: if it is not installed, drop the flag so that
# huggingface_hub does not fail when downloading tokenizer files.
try:
    import hf_transfer
except ImportError:
    os.environ.pop("HF_HUB_ENABLE_HF_TRANSFER", None)

import pyarrow.parquet as pq
import glob as globmod
from scripts.rewrite_bytes import ByteRewriter
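
# ---------------------------------------------------------------------------
# Tokenizer wrappers: each class exposes the same minimal interface used by
# the benchmark below -- encode(text) -> list of token ids, plus vocab_size
# and name properties.
# ---------------------------------------------------------------------------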
|
|
|
|
|
|
|
|
|
|
|
class SarfTokenizer:
    """Applies the ByteRewriter morph map to the text, then encodes with a fast tokenizer."""

    def __init__(self, tokenizer_dir, morf_map_path, display_name="SARF"):
        from transformers import PreTrainedTokenizerFast
        self._tok = PreTrainedTokenizerFast(
            tokenizer_file=os.path.join(tokenizer_dir, "tokenizer.json")
        )
        self._rewriter = ByteRewriter(morf_map_path)
        self._name = display_name

    def encode(self, text):
        # Rewrite the raw text with the morph map before tokenizing.
        return self._tok.encode(self._rewriter.rewrite_text(text), add_special_tokens=False)

    @property
    def vocab_size(self):
        return len(self._tok)

    @property
    def name(self):
        return self._name
|
|
|
|
|
|
|
|
class TiktokenTokenizer:
    def __init__(self, encoding_name, display_name=None):
        import tiktoken
        self._enc = tiktoken.get_encoding(encoding_name)
        self._name = display_name or encoding_name

    def encode(self, text):
        return self._enc.encode(text, allowed_special="all")

    @property
    def vocab_size(self):
        return self._enc.n_vocab

    @property
    def name(self):
        return self._name
|
|
|
|
|
|
|
|
class HFTokenizer:
    def __init__(self, model_id, display_name=None):
        from transformers import AutoTokenizer
        try:
            self._tok = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
        except Exception:
            # Fall back to the slow tokenizer if the fast one cannot be loaded.
            self._tok = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True, use_fast=False)
        self._name = display_name or model_id.split("/")[-1]

    def encode(self, text):
        return self._tok.encode(text, add_special_tokens=False)

    @property
    def vocab_size(self):
        return len(self._tok)

    @property
    def name(self):
        return self._name
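
# ---------------------------------------------------------------------------
# Evaluation data: stream parquet files and split documents into Arabic and
# English pools based on the share of Arabic-script characters.
# ---------------------------------------------------------------------------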
|
|
|
|
|
|
|
|
|
|
|
|
|
|
AR_DETECT = re.compile(r'[\u0600-\u06FF]')  # Arabic Unicode block


def load_all_samples(data_dir):
    parquet_files = sorted(globmod.glob(os.path.join(data_dir, '*.parquet')))
    ar_samples, en_samples = [], []
    for filepath in parquet_files:
        pf = pq.ParquetFile(filepath)
        for rg_idx in range(pf.num_row_groups):
            rg = pf.read_row_group(rg_idx)
            for text in rg.column("text").to_pylist():
                if len(text) < 100:
                    continue  # skip very short documents
                ar_chars = len(AR_DETECT.findall(text))
                ar_ratio = ar_chars / len(text)
                if ar_ratio > 0.3:
                    ar_samples.append(text[:2000])
                elif ar_ratio < 0.05:
                    en_samples.append(text[:2000])
        # Stop once both pools are large enough to cover all runs.
        if len(ar_samples) >= 25000 and len(en_samples) >= 25000:
            break
    print(f"Loaded {len(ar_samples)} Arabic, {len(en_samples)} English samples total")
    return ar_samples, en_samples
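
# ---------------------------------------------------------------------------
# Metrics. For each language:
#   fertility       = tokens per word, where each regex-matched word is
#                     encoded in isolation
#   chars-per-token = total characters / total tokens
# Parity = Arabic chars-per-token / English chars-per-token; 1.0 means the
# tokenizer compresses both languages equally well.
# ---------------------------------------------------------------------------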
|
|
|
|
|
|
|
|
|
|
|
|
|
|
AR_WORD = re.compile(r'[\u0600-\u06FF]+')
EN_WORD = re.compile(r'[a-zA-Z]+')


def compute_metrics(tokenizer, ar_texts, en_texts):
    ar_total_chars = ar_total_tokens = ar_total_words = ar_total_word_tokens = 0
    for text in ar_texts:
        tokens = tokenizer.encode(text)
        ar_total_chars += len(text)
        ar_total_tokens += len(tokens)
        words = AR_WORD.findall(text)
        ar_total_words += len(words)
        for w in words:
            ar_total_word_tokens += len(tokenizer.encode(w))

    en_total_chars = en_total_tokens = en_total_words = en_total_word_tokens = 0
    for text in en_texts:
        tokens = tokenizer.encode(text)
        en_total_chars += len(text)
        en_total_tokens += len(tokens)
        words = EN_WORD.findall(text)
        en_total_words += len(words)
        for w in words:
            en_total_word_tokens += len(tokenizer.encode(w))

    ar_fertility = ar_total_word_tokens / ar_total_words if ar_total_words else 0
    ar_cpt = ar_total_chars / ar_total_tokens if ar_total_tokens else 0
    en_fertility = en_total_word_tokens / en_total_words if en_total_words else 0
    en_cpt = en_total_chars / en_total_tokens if en_total_tokens else 0
    parity = ar_cpt / en_cpt if en_cpt else 0

    return {
        "ar_fertility": ar_fertility,
        "ar_cpt": ar_cpt,
        "en_fertility": en_fertility,
        "en_cpt": en_cpt,
        "parity": parity,
        "avg_fertility": (ar_fertility + en_fertility) / 2,
    }
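
# ---------------------------------------------------------------------------
# Configuration: tokenizer artifacts, evaluation data, and benchmark settings.
# ---------------------------------------------------------------------------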
|
|
|
|
|
|
|
|
|
|
|
|
|
|
BASE = "/root/.cache/DeepLatent" |
|
|
FULL_DIR = f"{BASE}/tokenizer_parity_runs/full" |
|
|
DATA_DIR = f"{BASE}/base_data" |
|
|
BASIC_MAP = f"{BASE}/morfessor_models/morf_map.basic.json" |
|
|
SUPP_MAP = f"{BASE}/morfessor_models/morf_map.supp.json" |
|
|
|
|
|
|
|
|
SARF_TOKENIZERS = [ |
|
|
("SARF-65k-v2", f"{FULL_DIR}/basic_vs64000_ar115_en135", BASIC_MAP), |
|
|
("SARF-65k", f"{FULL_DIR}/basic_vs64000_ar125_en135", BASIC_MAP), |
|
|
("SARF-88k-plus", f"{FULL_DIR}/supp_vs64000_ar115_en145", SUPP_MAP), |
|
|
("SARF-115k-plus", f"{FULL_DIR}/supp_vs96000_ar125_en135", SUPP_MAP), |
|
|
] |
|
|
|
|
|
BASELINE_TOKENIZERS = [ |
|
|
("GPT-4o", "tiktoken", "o200k_base"), |
|
|
("Gemma-3-4B", "hf", "google/gemma-3-4b-it"), |
|
|
("Command-R-Arabic", "hf", "CohereLabs/c4ai-command-r7b-arabic-02-2025"), |
|
|
("Fanar-1-9B", "hf", "QCRI/Fanar-1-9B-Instruct"), |
|
|
("Qwen3-4B", "hf", "Qwen/Qwen3-4B-Instruct-2507"), |
|
|
] |
|
|
|
|
|
NUM_RUNS = 5 |
|
|
SAMPLES_PER_RUN = 5000 |
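
# Each run draws a fresh random sample of SAMPLES_PER_RUN documents per
# language (seeded with 42 + run index) from the shared pools built by
# load_all_samples, which stops loading once both pools reach 25,000 documents.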
|
|
|
|
|
print("=" * 100) |
|
|
print(f"5-RUN AVERAGED BENCHMARK") |
|
|
print("=" * 100) |
|
|
|
|
|
|
|
|
print("\nLoading tokenizers...") |
|
|
tokenizers = [] |
|
|
|
|
|
for name, tok_dir, morf_map in SARF_TOKENIZERS:
    print(f"  {name}...", end=" ", flush=True)
    try:
        tok = SarfTokenizer(tok_dir, morf_map, name)
        print(f"OK (vocab={tok.vocab_size:,})")
        tokenizers.append(tok)
    except Exception as e:
        print(f"FAILED: {e}")

for name, typ, source in BASELINE_TOKENIZERS:
    print(f"  {name}...", end=" ", flush=True)
    try:
        if typ == "tiktoken":
            tok = TiktokenTokenizer(source, name)
        else:
            tok = HFTokenizer(source, name)
        print(f"OK (vocab={tok.vocab_size:,})")
        tokenizers.append(tok)
    except Exception as e:
        print(f"FAILED: {e}")

print(f"\nLoaded {len(tokenizers)} tokenizers.")
|
|
|
|
|
|
|
|
print("\nLoading evaluation data...") |
|
|
all_ar, all_en = load_all_samples(DATA_DIR) |
|
|
|
|
|
|
|
|
all_runs = {tok.name: [] for tok in tokenizers} |
|
|
|
|
|
for run in range(NUM_RUNS): |
|
|
print(f"\n{'='*80}") |
|
|
print(f"RUN {run+1}/{NUM_RUNS}") |
|
|
print(f"{'='*80}") |
|
|
|
|
|
random.seed(42 + run) |
|
|
ar_sample = random.sample(all_ar, min(SAMPLES_PER_RUN, len(all_ar))) |
|
|
en_sample = random.sample(all_en, min(SAMPLES_PER_RUN, len(all_en))) |
|
|
print(f"Sampled {len(ar_sample)} AR, {len(en_sample)} EN") |
|
|
|
|
|
for tok in tokenizers: |
|
|
print(f" {tok.name}...", end=" ", flush=True) |
|
|
t0 = time.time() |
|
|
m = compute_metrics(tok, ar_sample, en_sample) |
|
|
all_runs[tok.name].append(m) |
|
|
print(f"parity={m['parity']:.4f} ({time.time()-t0:.1f}s)") |
|
|
|
|
|
|
|
|
print("\n" + "=" * 100) |
|
|
print("COMPUTING AVERAGES") |
|
|
print("=" * 100) |
|
|
|
|
|
results = []
for tok in tokenizers:
    runs = all_runs[tok.name]
    n = len(runs)

    parity_vals = [r["parity"] for r in runs]
    parity_avg = sum(parity_vals) / n
    parity_std = (sum((v - parity_avg)**2 for v in parity_vals) / n) ** 0.5

    avg = {
        "name": tok.name,
        "vocab_size": tok.vocab_size,
        "ar_fertility_avg": sum(r["ar_fertility"] for r in runs) / n,
        "en_fertility_avg": sum(r["en_fertility"] for r in runs) / n,
        "avg_fertility_avg": sum(r["avg_fertility"] for r in runs) / n,
        "ar_cpt_avg": sum(r["ar_cpt"] for r in runs) / n,
        "en_cpt_avg": sum(r["en_cpt"] for r in runs) / n,
        "parity_avg": parity_avg,
        "parity_std": parity_std,
        "runs": runs,
    }
    results.append(avg)

# Rank tokenizers by how close their average parity is to 1.0.
results_sorted = sorted(results, key=lambda r: abs(1.0 - r["parity_avg"]))
|
|
|
|
|
|
|
|
print("\n" + "=" * 130) |
|
|
print(f"FINAL RESULTS (averaged over {NUM_RUNS} runs)") |
|
|
print("=" * 130) |
|
|
header = f"{'Rank':<5} {'Tokenizer':<20} {'Vocab':>10} {'AR Fert':>10} {'EN Fert':>10} {'Avg Fert':>10} {'AR C/T':>10} {'EN C/T':>10} {'Parity':>10} {'Β±Std':>8}" |
|
|
print(header) |
|
|
print("-" * 130) |
|
|
|
|
|
for rank, r in enumerate(results_sorted, 1): |
|
|
is_best = rank == 1 |
|
|
is_sarf = "SARF" in r["name"] |
|
|
marker = " π" if is_best else (" ***" if is_sarf else "") |
|
|
print(f"{rank:<5} {r['name']:<20} {r['vocab_size']:>10,} {r['ar_fertility_avg']:>10.3f} {r['en_fertility_avg']:>10.3f} {r['avg_fertility_avg']:>10.3f} {r['ar_cpt_avg']:>10.3f} {r['en_cpt_avg']:>10.3f} {r['parity_avg']:>10.4f} {r['parity_std']:>7.4f}{marker}") |
|
|
|
|
|
print("=" * 130) |
|
|
print("*** = SARF tokenizers | π = Best parity") |
|
|
|
|
|
|
|
|
output = {
    "num_runs": NUM_RUNS,
    "samples_per_run": SAMPLES_PER_RUN,
    "results": [{k: v for k, v in r.items() if k != "runs"} for r in results_sorted],
    "detailed_runs": {r["name"]: r["runs"] for r in results_sorted},
}
with open("/tmp/benchmark_5runs_final.json", "w") as f:
    json.dump(output, f, indent=2, ensure_ascii=False)
print("\nResults saved to /tmp/benchmark_5runs_final.json")
|
|
|