almaghrabima commited on
Commit
4e2b74e
·
verified ·
1 Parent(s): e86ee8e

Upload benchmark_pypi_full.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. benchmark_pypi_full.py +338 -0
benchmark_pypi_full.py ADDED
@@ -0,0 +1,338 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Tokenizer Parity Benchmark - Compare SARF tokenizers against state-of-the-art.
4
+
5
+ This script compares SARFTokenizer (from deeplatent-nlp) against GPT-4o, Gemma-3,
6
+ Command-R, Fanar, Qwen3, and other popular tokenizers.
7
+
8
+ Datasets:
9
+ - Benchmark data (60k samples): https://huggingface.co/datasets/almaghrabima/deeplatent-benchmark-data
10
+ - Eval test data: https://huggingface.co/datasets/almaghrabima/eval-test-data
11
+
12
+ Usage:
13
+ pip install -r requirements.txt
14
+ python benchmark_pypi.py
15
+
16
+ Requirements: see benchmarks/requirements.txt
17
+ """
18
+
19
+ import os
20
+ import re
21
+ import json
22
+ import time
23
+ import random
24
+
25
+ import pyarrow.parquet as pq
26
+
27
+ # Import from PyPI package
28
+ from deeplatent import SARFTokenizer, version, RUST_AVAILABLE
29
+
30
+ print(f"deeplatent-nlp version: {version()}")
31
+ print(f"Rust available: {RUST_AVAILABLE}")
32
+
33
+
34
+ # โ”€โ”€ Tokenizer wrappers โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
35
+
36
class SarfTokenizerWrapper:
    """Adapter giving the PyPI SARFTokenizer this benchmark's common interface.

    Every wrapper in this file exposes the same trio: ``encode``,
    ``vocab_size``, and ``name``.
    """

    def __init__(self, name_or_path: str, display_name: str = "SARFTokenizer"):
        # Resolve from a local path or hub id via the package's own loader.
        self._tok = SARFTokenizer.from_pretrained(name_or_path)
        self._name = display_name

    def encode(self, text: str) -> list:
        """Return token ids for *text*."""
        ids = self._tok.encode(text)
        return ids

    @property
    def name(self) -> str:
        """Display name shown in the results table."""
        return self._name

    @property
    def vocab_size(self) -> int:
        """Vocabulary size reported by the underlying tokenizer."""
        return self._tok.vocab_size
53
+
54
+
55
class TiktokenTokenizer:
    """Wrapper around a tiktoken encoding (GPT-4 / GPT-4o family)."""

    def __init__(self, encoding_name: str, display_name: str = None):
        # Deferred import: the script stays loadable without tiktoken installed.
        import tiktoken

        self._enc = tiktoken.get_encoding(encoding_name)
        # Fall back to the encoding name when no display name was given.
        if not display_name:
            display_name = encoding_name
        self._name = display_name

    def encode(self, text: str) -> list:
        """Token ids for *text*; special-token text is allowed to pass through."""
        return self._enc.encode(text, allowed_special="all")

    @property
    def name(self) -> str:
        """Display name shown in the results table."""
        return self._name

    @property
    def vocab_size(self) -> int:
        """Vocabulary size reported by tiktoken."""
        return self._enc.n_vocab
71
+
72
+
73
class HFTokenizer:
    """Wrapper around a Hugging Face ``AutoTokenizer``."""

    def __init__(self, model_id: str, display_name: str = None):
        # Deferred import: transformers is heavy and not always installed.
        from transformers import AutoTokenizer

        try:
            self._tok = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
        except Exception:
            # Retry with the slow tokenizer; some repos fail on the fast path.
            self._tok = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True, use_fast=False)
        # Default display name: last path segment of the repo id.
        if not display_name:
            display_name = model_id.split("/")[-1]
        self._name = display_name

    def encode(self, text: str) -> list:
        """Token ids for *text*, excluding BOS/EOS special tokens."""
        return self._tok.encode(text, add_special_tokens=False)

    @property
    def name(self) -> str:
        """Display name shown in the results table."""
        return self._name

    @property
    def vocab_size(self) -> int:
        """Vocabulary size as reported by ``len(tokenizer)``."""
        return len(self._tok)
92
+
93
+
94
+ # โ”€โ”€ Data loading โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
95
+
96
# Matches any character in the basic Arabic Unicode block (U+0600–U+06FF).
# NOTE(review): AR_DETECT is not referenced anywhere else in this file —
# confirm it is vestigial before removing.
AR_DETECT = re.compile(r'[\u0600-\u06FF]')

# HuggingFace datasets
HF_BENCHMARK_DATA = "almaghrabima/deeplatent-benchmark-data"  # 60k samples (30k AR + 30k EN)
HF_EVAL_DATA = "almaghrabima/eval-test-data"  # Eval test data
101
+
102
+
103
def load_samples_from_hf(dataset_id: str = HF_BENCHMARK_DATA):
    """Download and load Arabic and English samples from a HuggingFace dataset.

    Args:
        dataset_id: HuggingFace dataset ID
            - "almaghrabima/deeplatent-benchmark-data" (default): 60k samples for benchmarking
            - "almaghrabima/eval-test-data": Eval test data

    Returns:
        Tuple of (arabic_samples, english_samples)
    """
    from huggingface_hub import hf_hub_download

    cache_dir = os.path.expanduser("~/.cache/deeplatent/benchmark_data")
    os.makedirs(cache_dir, exist_ok=True)

    # Fetch both parquet shards first (hub cache makes repeat runs cheap).
    shard_paths = {}
    for filename in ("arabic_samples.parquet", "english_samples.parquet"):
        shard_paths[filename] = hf_hub_download(
            repo_id=dataset_id,
            filename=filename,
            repo_type="dataset",
            cache_dir=cache_dir,
        )

    # Materialize the "text" column of each shard as a Python list.
    ar_samples = pq.read_table(shard_paths["arabic_samples.parquet"]).column("text").to_pylist()
    en_samples = pq.read_table(shard_paths["english_samples.parquet"]).column("text").to_pylist()

    print(f"Loaded {len(ar_samples)} Arabic, {len(en_samples)} English samples from {dataset_id}")
    return ar_samples, en_samples
143
+
144
+
145
+ # โ”€โ”€ Metrics โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
146
+
147
AR_WORD = re.compile(r'[\u0600-\u06FF]+')
EN_WORD = re.compile(r'[a-zA-Z]+')


def _tally(tokenizer, texts: list, word_re) -> tuple:
    """Accumulate (chars, tokens, words, word_tokens) over *texts*.

    Each full text is encoded once; each regex-matched word is additionally
    encoded on its own so per-word fertility can be computed.
    """
    total_chars = total_tokens = total_words = total_word_tokens = 0
    for text in texts:
        token_ids = tokenizer.encode(text)
        total_chars += len(text)
        total_tokens += len(token_ids)
        found = word_re.findall(text)
        total_words += len(found)
        total_word_tokens += sum(len(tokenizer.encode(w)) for w in found)
    return total_chars, total_tokens, total_words, total_word_tokens


def compute_metrics(tokenizer, ar_texts: list, en_texts: list) -> dict:
    """Compute fertility and parity metrics.

    Fertility = tokens per word; cpt = characters per token;
    parity = AR cpt / EN cpt (1.0 means both languages are tokenized
    equally efficiently). All ratios default to 0 on empty input.
    """
    ar_chars, ar_tokens, ar_words, ar_word_tokens = _tally(tokenizer, ar_texts, AR_WORD)
    en_chars, en_tokens, en_words, en_word_tokens = _tally(tokenizer, en_texts, EN_WORD)

    ar_fertility = ar_word_tokens / ar_words if ar_words else 0
    ar_cpt = ar_chars / ar_tokens if ar_tokens else 0
    en_fertility = en_word_tokens / en_words if en_words else 0
    en_cpt = en_chars / en_tokens if en_tokens else 0
    parity = ar_cpt / en_cpt if en_cpt else 0

    return {
        "ar_fertility": ar_fertility,
        "ar_cpt": ar_cpt,
        "en_fertility": en_fertility,
        "en_cpt": en_cpt,
        "parity": parity,
        "avg_fertility": (ar_fertility + en_fertility) / 2,
    }
189
+
190
+
191
+ # โ”€โ”€ Configuration โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
192
+
193
# SARF tokenizers: (display name, local path or hub id) pairs,
# loaded via SarfTokenizerWrapper.
SARF_TOKENIZERS = [
    ("SARFTokenizer", "/root/.cache/deeplatent/SARFTokenizer"),
]

# Baseline tokenizers: (display name, loader type, source).
# loader type "tiktoken" -> tiktoken encoding name; anything else -> HF model id.
BASELINE_TOKENIZERS = [
    ("GPT-4o", "tiktoken", "o200k_base"),
    ("GPT-4", "tiktoken", "cl100k_base"),
    ("Gemma-3-4B", "hf", "unsloth/gemma-3-4b-it"),
    ("Command-R-Arabic", "hf", "CohereLabs/c4ai-command-r7b-arabic-02-2025"),
    ("Fanar-1-9B", "hf", "QCRI/Fanar-1-9B-Instruct"),
    ("Qwen3-4B", "hf", "Qwen/Qwen3-4B-Instruct-2507"),
    ("Hala-9B", "hf", "hammh0a/Hala-9B"),
    ("Falcon-H1-7B", "hf", "tiiuae/Falcon-H1-7B-Instruct"),
    ("ALLaM-7B", "hf", "humain-ai/ALLaM-7B-Instruct-preview"),
    ("Mistral-7B-v0.3", "hf", "mistralai/Mistral-7B-Instruct-v0.3"),
]

# NOTE(review): main() averages metrics and reports a std dev across runs,
# but only a single run is configured here.
NUM_RUNS = 1
SAMPLES_PER_RUN = 60000
214
+
215
+
216
+ # โ”€โ”€ Main โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
217
+
218
def _load_tokenizers() -> list:
    """Instantiate every configured tokenizer, printing per-tokenizer status.

    Failures are reported and skipped so one unavailable model does not
    abort the whole benchmark. Returns the successfully loaded wrappers.
    """
    tokenizers = []
    for name, hf_repo in SARF_TOKENIZERS:
        print(f" {name}...", end=" ", flush=True)
        try:
            tok = SarfTokenizerWrapper(hf_repo, name)
            print(f"OK (vocab={tok.vocab_size:,})")
            tokenizers.append(tok)
        except Exception as e:
            print(f"FAILED: {e}")

    for name, typ, source in BASELINE_TOKENIZERS:
        print(f" {name}...", end=" ", flush=True)
        try:
            if typ == "tiktoken":
                tok = TiktokenTokenizer(source, name)
            else:
                tok = HFTokenizer(source, name)
            print(f"OK (vocab={tok.vocab_size:,})")
            tokenizers.append(tok)
        except Exception as e:
            print(f"FAILED: {e}")
    return tokenizers


def _summarize(tok, runs: list) -> dict:
    """Average per-run metrics for one tokenizer; parity also gets a std dev.

    *runs* is the list of dicts returned by compute_metrics for this tokenizer.
    """
    n = len(runs)
    parity_vals = [r["parity"] for r in runs]
    parity_avg = sum(parity_vals) / n
    # Population standard deviation across runs (0.0 when NUM_RUNS == 1).
    parity_std = (sum((v - parity_avg) ** 2 for v in parity_vals) / n) ** 0.5
    return {
        "name": tok.name,
        "vocab_size": tok.vocab_size,
        "ar_fertility_avg": sum(r["ar_fertility"] for r in runs) / n,
        "en_fertility_avg": sum(r["en_fertility"] for r in runs) / n,
        "avg_fertility_avg": sum(r["avg_fertility"] for r in runs) / n,
        "ar_cpt_avg": sum(r["ar_cpt"] for r in runs) / n,
        "en_cpt_avg": sum(r["en_cpt"] for r in runs) / n,
        "parity_avg": parity_avg,
        "parity_std": parity_std,
        "runs": runs,
    }


def _print_results(results: list) -> None:
    """Render the final results table in load order (no ranking)."""
    print("\n" + "=" * 140)
    print(f"FINAL RESULTS (averaged over {NUM_RUNS} runs, {SAMPLES_PER_RUN} samples each)")
    print("=" * 140)
    header = f"{'Tokenizer':<22} {'Vocab':>10} {'AR Fert':>10} {'EN Fert':>10} {'Avg Fert':>10} {'AR C/T':>10} {'EN C/T':>10} {'Parity':>10} {'±Std':>8}"
    print(header)
    print("-" * 140)

    for r in results:
        # SARF rows are flagged so they stand out among the baselines.
        marker = " ***" if "SARF" in r["name"] else ""
        print(
            f"{r['name']:<22} {r['vocab_size']:>10,} {r['ar_fertility_avg']:>10.3f} "
            f"{r['en_fertility_avg']:>10.3f} {r['avg_fertility_avg']:>10.3f} "
            f"{r['ar_cpt_avg']:>10.3f} {r['en_cpt_avg']:>10.3f} "
            f"{r['parity_avg']:>10.4f} {r['parity_std']:>7.4f}{marker}"
        )

    print("=" * 140)
    print("*** = SARF tokenizers (using PyPI deeplatent-nlp)")
    print("Parity = AR chars/token ÷ EN chars/token (1.0 = equal treatment)")


def _save_results(results: list) -> None:
    """Write averaged and per-run metrics to benchmark_results.json."""
    output = {
        "package": "deeplatent-nlp",
        "version": version(),
        "dataset": HF_BENCHMARK_DATA,
        "num_runs": NUM_RUNS,
        "samples_per_run": SAMPLES_PER_RUN,
        "results": [{k: v for k, v in r.items() if k != "runs"} for r in results],
        "detailed_runs": {r["name"]: r["runs"] for r in results},
    }

    output_path = "benchmark_results.json"
    # encoding="utf-8" is required: ensure_ascii=False emits raw Arabic text,
    # which would raise UnicodeEncodeError under a non-UTF-8 default locale
    # (e.g. Windows cp1252).
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2, ensure_ascii=False)
    print(f"\nResults saved to {output_path}")


def main():
    """Run the tokenizer parity benchmark end to end.

    Loads tokenizers, pulls the evaluation dataset from HuggingFace, runs
    NUM_RUNS benchmark passes, then prints and saves averaged results.
    """
    print("=" * 100)
    print("TOKENIZER PARITY BENCHMARK")
    print("Dataset: almaghrabima/deeplatent-benchmark-data")
    print("=" * 100)

    print("\nLoading tokenizers...")
    tokenizers = _load_tokenizers()
    print(f"\nLoaded {len(tokenizers)} tokenizers.")

    # Load all samples from HuggingFace.
    print("\nLoading evaluation data from HuggingFace...")
    all_ar, all_en = load_samples_from_hf(HF_BENCHMARK_DATA)

    # Run the benchmark NUM_RUNS times with a reproducible, per-run sample.
    all_runs = {tok.name: [] for tok in tokenizers}

    for run in range(NUM_RUNS):
        print(f"\n{'='*80}")
        print(f"RUN {run+1}/{NUM_RUNS}")
        print(f"{'='*80}")

        # Distinct but deterministic seed per run.
        random.seed(42 + run)
        ar_sample = random.sample(all_ar, min(SAMPLES_PER_RUN, len(all_ar)))
        en_sample = random.sample(all_en, min(SAMPLES_PER_RUN, len(all_en)))
        print(f"Sampled {len(ar_sample)} AR, {len(en_sample)} EN")

        for tok in tokenizers:
            print(f" {tok.name}...", end=" ", flush=True)
            t0 = time.time()
            m = compute_metrics(tok, ar_sample, en_sample)
            all_runs[tok.name].append(m)
            print(f"parity={m['parity']:.4f} ({time.time()-t0:.1f}s)")

    # Average across runs.
    print("\n" + "=" * 100)
    print("COMPUTING AVERAGES")
    print("=" * 100)
    results = [_summarize(tok, all_runs[tok.name]) for tok in tokenizers]

    _print_results(results)
    _save_results(results)


if __name__ == "__main__":
    main()