almaghrabima committed on
Commit
169c0b4
·
verified ·
1 Parent(s): ce0f0a1

Upload benchmark_pypi.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. benchmark_pypi.py +338 -0
benchmark_pypi.py ADDED
@@ -0,0 +1,338 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""
Tokenizer Parity Benchmark - Compare SARF tokenizers against state-of-the-art.

This script compares SARFTokenizer (from deeplatent-nlp) against GPT-4o, Gemma-3,
Command-R, Fanar, Qwen3, and other popular tokenizers.

Datasets:
- Benchmark data (60k samples): https://huggingface.co/datasets/almaghrabima/deeplatent-benchmark-data
- Eval test data: https://huggingface.co/datasets/almaghrabima/eval-test-data

Usage:
    pip install -r requirements.txt
    python benchmark_pypi.py

Requirements: see benchmarks/requirements.txt
"""

import os
import re
import json
import time
import random

import pyarrow.parquet as pq

# Import from PyPI package
from deeplatent import SARFTokenizer, version, RUST_AVAILABLE

# Echo the package build up front so benchmark logs record exactly which
# deeplatent-nlp version (and backend) produced the numbers below.
print(f"deeplatent-nlp version: {version()}")
print(f"Rust available: {RUST_AVAILABLE}")
32
+
33
+
34
+ # ── Tokenizer wrappers ──────────────────────────────────────────────
35
+
36
class SarfTokenizerWrapper:
    """Adapter exposing a SARFTokenizer (PyPI deeplatent-nlp) behind the
    common benchmark interface: encode(), .vocab_size, .name."""

    def __init__(self, name_or_path: str, display_name: str = "SARFTokenizer"):
        # Resolve the tokenizer from a HuggingFace repo id or local path.
        self._tokenizer = SARFTokenizer.from_pretrained(name_or_path)
        self._display_name = display_name

    def encode(self, text: str) -> list:
        """Return the token ids for *text*."""
        return self._tokenizer.encode(text)

    @property
    def name(self) -> str:
        """Human-readable label used in the results table."""
        return self._display_name

    @property
    def vocab_size(self) -> int:
        """Size of the underlying tokenizer vocabulary."""
        return self._tokenizer.vocab_size
53
+
54
+
55
class TiktokenTokenizer:
    """Adapter exposing a tiktoken encoding behind the common benchmark
    interface: encode(), .vocab_size, .name."""

    def __init__(self, encoding_name: str, display_name: str = None):
        # Imported lazily so the module loads even without tiktoken installed.
        import tiktoken
        self._encoding = tiktoken.get_encoding(encoding_name)
        self._label = display_name or encoding_name

    def encode(self, text: str) -> list:
        """Return token ids; special tokens are encoded rather than rejected."""
        return self._encoding.encode(text, allowed_special="all")

    @property
    def name(self) -> str:
        """Human-readable label used in the results table."""
        return self._label

    @property
    def vocab_size(self) -> int:
        """Total number of tokens in the encoding."""
        return self._encoding.n_vocab
71
+
72
+
73
class HFTokenizer:
    """Adapter exposing a HuggingFace AutoTokenizer behind the common
    benchmark interface: encode(), .vocab_size, .name."""

    def __init__(self, model_id: str, display_name: str = None):
        # Imported lazily so the module loads even without transformers installed.
        from transformers import AutoTokenizer
        try:
            tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
        except Exception:
            # Some repos only ship a slow (Python) tokenizer; retry without fast.
            tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True, use_fast=False)
        self._tok = tokenizer
        self._name = display_name or model_id.split("/")[-1]

    def encode(self, text: str) -> list:
        """Return token ids without BOS/EOS so counts are comparable across models."""
        return self._tok.encode(text, add_special_tokens=False)

    @property
    def name(self) -> str:
        """Human-readable label used in the results table."""
        return self._name

    @property
    def vocab_size(self) -> int:
        """Vocabulary size including any added tokens (len of the tokenizer)."""
        return len(self._tok)
92
+
93
+
94
# ── Data loading ─────────────────────────────────────────────────────

# Matches any single character in the Arabic Unicode block (U+0600–U+06FF).
# NOTE(review): defined but not referenced anywhere in this file — confirm
# whether it is still needed before removing.
AR_DETECT = re.compile(r'[\u0600-\u06FF]')

# HuggingFace datasets
HF_BENCHMARK_DATA = "almaghrabima/deeplatent-benchmark-data"  # 60k samples (30k AR + 30k EN)
HF_EVAL_DATA = "almaghrabima/eval-test-data"  # Eval test data
101
+
102
+
103
def load_samples_from_hf(dataset_id: str = HF_BENCHMARK_DATA):
    """
    Load Arabic and English samples from HuggingFace dataset.

    Args:
        dataset_id: HuggingFace dataset ID
            - "almaghrabima/deeplatent-benchmark-data" (default): 60k samples for benchmarking
            - "almaghrabima/eval-test-data": Eval test data

    Returns:
        Tuple of (arabic_samples, english_samples)
    """
    from huggingface_hub import hf_hub_download

    # Local cache so repeated benchmark runs don't re-download the parquet files.
    cache_dir = os.path.expanduser("~/.cache/deeplatent/benchmark_data")
    os.makedirs(cache_dir, exist_ok=True)

    def fetch_texts(filename):
        # Download (or reuse the cached copy of) one parquet file from the
        # dataset repo, then pull its "text" column as a plain Python list.
        local_path = hf_hub_download(
            repo_id=dataset_id,
            filename=filename,
            repo_type="dataset",
            cache_dir=cache_dir,
        )
        return pq.read_table(local_path).column("text").to_pylist()

    ar_samples = fetch_texts("arabic_samples.parquet")
    en_samples = fetch_texts("english_samples.parquet")

    print(f"Loaded {len(ar_samples)} Arabic, {len(en_samples)} English samples from {dataset_id}")
    return ar_samples, en_samples
143
+
144
+
145
# ── Metrics ─────────────────────────────────────────────────────────

AR_WORD = re.compile(r'[\u0600-\u06FF]+')
EN_WORD = re.compile(r'[a-zA-Z]+')


def _accumulate_stats(tokenizer, texts, word_pattern):
    """Sum characters, tokens, words, and per-word token counts over *texts*.

    Returns (total_chars, total_tokens, total_words, total_word_tokens)."""
    total_chars = total_tokens = total_words = total_word_tokens = 0
    for text in texts:
        total_chars += len(text)
        total_tokens += len(tokenizer.encode(text))
        matched_words = word_pattern.findall(text)
        total_words += len(matched_words)
        # Fertility counts tokens per word tokenized in isolation.
        total_word_tokens += sum(len(tokenizer.encode(w)) for w in matched_words)
    return total_chars, total_tokens, total_words, total_word_tokens


def compute_metrics(tokenizer, ar_texts: list, en_texts: list) -> dict:
    """Compute fertility and parity metrics.

    Fertility = tokens per word; cpt = characters per token;
    parity = AR cpt / EN cpt (1.0 means both languages are treated equally).
    All ratios fall back to 0 when their denominator is 0."""
    ar_chars, ar_tokens, ar_words, ar_word_tokens = _accumulate_stats(tokenizer, ar_texts, AR_WORD)
    en_chars, en_tokens, en_words, en_word_tokens = _accumulate_stats(tokenizer, en_texts, EN_WORD)

    ar_fertility = ar_word_tokens / ar_words if ar_words else 0
    ar_cpt = ar_chars / ar_tokens if ar_tokens else 0
    en_fertility = en_word_tokens / en_words if en_words else 0
    en_cpt = en_chars / en_tokens if en_tokens else 0

    return {
        "ar_fertility": ar_fertility,
        "ar_cpt": ar_cpt,
        "en_fertility": en_fertility,
        "en_cpt": en_cpt,
        "parity": ar_cpt / en_cpt if en_cpt else 0,
        "avg_fertility": (ar_fertility + en_fertility) / 2,
    }
189
+
190
+
191
# ── Configuration ───────────────────────────────────────────────────

# SARF tokenizers from HuggingFace: (display name, HF repo id)
SARF_TOKENIZERS = [
    ("SARFTokenizer", "almaghrabima/SARFTokenizer"),
]

# Baseline tokenizers: (display name, loader type "tiktoken" or "hf", source id)
BASELINE_TOKENIZERS = [
    ("GPT-4o", "tiktoken", "o200k_base"),
    ("GPT-4", "tiktoken", "cl100k_base"),
    ("Gemma-3-4B", "hf", "google/gemma-3-4b-it"),
    ("Command-R-Arabic", "hf", "CohereLabs/c4ai-command-r7b-arabic-02-2025"),
    ("Fanar-1-9B", "hf", "QCRI/Fanar-1-9B-Instruct"),
    ("Qwen3-4B", "hf", "Qwen/Qwen3-4B-Instruct-2507"),
]

# Number of benchmark repetitions (each run re-samples with a different seed).
NUM_RUNS = 5
# Samples drawn per language per run.
SAMPLES_PER_RUN = 5000
210
+
211
+
212
# ── Main ────────────────────────────────────────────────────────────

def _load_tokenizers():
    """Instantiate every configured tokenizer; report and skip any that fail."""
    print("\nLoading tokenizers...")
    tokenizers = []

    for name, hf_repo in SARF_TOKENIZERS:
        print(f" {name}...", end=" ", flush=True)
        try:
            tok = SarfTokenizerWrapper(hf_repo, name)
            print(f"OK (vocab={tok.vocab_size:,})")
            tokenizers.append(tok)
        except Exception as e:
            # Best-effort: a missing/broken tokenizer shouldn't abort the benchmark.
            print(f"FAILED: {e}")

    for name, typ, source in BASELINE_TOKENIZERS:
        print(f" {name}...", end=" ", flush=True)
        try:
            if typ == "tiktoken":
                tok = TiktokenTokenizer(source, name)
            else:
                tok = HFTokenizer(source, name)
            print(f"OK (vocab={tok.vocab_size:,})")
            tokenizers.append(tok)
        except Exception as e:
            print(f"FAILED: {e}")

    print(f"\nLoaded {len(tokenizers)} tokenizers.")
    return tokenizers


def _run_benchmark(tokenizers, all_ar, all_en):
    """Run NUM_RUNS passes over fresh random samples.

    Returns {tokenizer name: [metrics dict per run]}."""
    all_runs = {tok.name: [] for tok in tokenizers}

    for run in range(NUM_RUNS):
        print(f"\n{'='*80}")
        print(f"RUN {run+1}/{NUM_RUNS}")
        print(f"{'='*80}")

        # Per-run seed: deterministic overall, but each run sees a different sample.
        random.seed(42 + run)
        ar_sample = random.sample(all_ar, min(SAMPLES_PER_RUN, len(all_ar)))
        en_sample = random.sample(all_en, min(SAMPLES_PER_RUN, len(all_en)))
        print(f"Sampled {len(ar_sample)} AR, {len(en_sample)} EN")

        for tok in tokenizers:
            print(f" {tok.name}...", end=" ", flush=True)
            t0 = time.time()
            m = compute_metrics(tok, ar_sample, en_sample)
            all_runs[tok.name].append(m)
            print(f"parity={m['parity']:.4f} ({time.time()-t0:.1f}s)")

    return all_runs


def _aggregate(tokenizers, all_runs):
    """Average each tokenizer's metrics over its runs.

    Parity additionally gets a population standard deviation across runs."""
    results = []
    for tok in tokenizers:
        runs = all_runs[tok.name]
        n = len(runs)

        parity_vals = [r["parity"] for r in runs]
        parity_avg = sum(parity_vals) / n
        parity_std = (sum((v - parity_avg)**2 for v in parity_vals) / n) ** 0.5

        results.append({
            "name": tok.name,
            "vocab_size": tok.vocab_size,
            "ar_fertility_avg": sum(r["ar_fertility"] for r in runs) / n,
            "en_fertility_avg": sum(r["en_fertility"] for r in runs) / n,
            "avg_fertility_avg": sum(r["avg_fertility"] for r in runs) / n,
            "ar_cpt_avg": sum(r["ar_cpt"] for r in runs) / n,
            "en_cpt_avg": sum(r["en_cpt"] for r in runs) / n,
            "parity_avg": parity_avg,
            "parity_std": parity_std,
            "runs": runs,  # kept for the detailed JSON dump
        })
    return results


def _print_results(results_sorted):
    """Print the final ranking table (best parity first)."""
    print("\n" + "=" * 140)
    print(f"FINAL RESULTS (averaged over {NUM_RUNS} runs, {SAMPLES_PER_RUN} samples each)")
    print("=" * 140)
    header = f"{'Rank':<5} {'Tokenizer':<22} {'Vocab':>10} {'AR Fert':>10} {'EN Fert':>10} {'Avg Fert':>10} {'AR C/T':>10} {'EN C/T':>10} {'Parity':>10} {'±Std':>8}"
    print(header)
    print("-" * 140)

    for rank, r in enumerate(results_sorted, 1):
        is_best = rank == 1
        is_sarf = "SARF" in r["name"]
        marker = " 🏆" if is_best else (" ***" if is_sarf else "")
        print(f"{rank:<5} {r['name']:<22} {r['vocab_size']:>10,} {r['ar_fertility_avg']:>10.3f} {r['en_fertility_avg']:>10.3f} {r['avg_fertility_avg']:>10.3f} {r['ar_cpt_avg']:>10.3f} {r['en_cpt_avg']:>10.3f} {r['parity_avg']:>10.4f} {r['parity_std']:>7.4f}{marker}")

    print("=" * 140)
    print("*** = SARF tokenizers (using PyPI deeplatent-nlp) | 🏆 = Best parity (closest to 1.0)")
    print("Parity = AR chars/token ÷ EN chars/token (1.0 = equal treatment)")


def _save_results(results_sorted):
    """Write aggregate and per-run metrics to benchmark_results.json."""
    output = {
        "package": "deeplatent-nlp",
        "version": version(),
        "dataset": HF_BENCHMARK_DATA,
        "num_runs": NUM_RUNS,
        "samples_per_run": SAMPLES_PER_RUN,
        "results": [{k: v for k, v in r.items() if k != "runs"} for r in results_sorted],
        "detailed_runs": {r["name"]: r["runs"] for r in results_sorted},
    }

    output_path = "benchmark_results.json"
    # Fix: pin UTF-8 explicitly — we dump with ensure_ascii=False, and the
    # default locale encoding on some platforms is not UTF-8, which would
    # raise or garble non-ASCII output.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2, ensure_ascii=False)
    print(f"\nResults saved to {output_path}")


def main():
    """Run the full benchmark: load tokenizers and data, benchmark, report, save."""
    print("=" * 100)
    print("TOKENIZER PARITY BENCHMARK")
    print("Dataset: almaghrabima/deeplatent-benchmark-data")
    print("=" * 100)

    tokenizers = _load_tokenizers()

    # Load all samples from HuggingFace
    print("\nLoading evaluation data from HuggingFace...")
    all_ar, all_en = load_samples_from_hf(HF_BENCHMARK_DATA)

    all_runs = _run_benchmark(tokenizers, all_ar, all_en)

    print("\n" + "=" * 100)
    print("COMPUTING AVERAGES")
    print("=" * 100)
    results = _aggregate(tokenizers, all_runs)

    # Rank by distance of the average parity from the ideal value 1.0.
    results_sorted = sorted(results, key=lambda r: abs(1.0 - r["parity_avg"]))

    _print_results(results_sorted)
    _save_results(results_sorted)


if __name__ == "__main__":
    main()