almaghrabima committed on
Commit
e86ee8e
·
verified ·
1 Parent(s): dfd4850

Delete benchmark_pypi.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. benchmark_pypi.py +0 -338
benchmark_pypi.py DELETED
@@ -1,338 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Tokenizer Parity Benchmark - Compare SARF tokenizers against state-of-the-art.
4
-
5
- This script compares SARFTokenizer (from deeplatent-nlp) against GPT-4o, Gemma-3,
6
- Command-R, Fanar, Qwen3, and other popular tokenizers.
7
-
8
- Datasets:
9
- - Benchmark data (60k samples): https://huggingface.co/datasets/almaghrabima/deeplatent-benchmark-data
10
- - Eval test data: https://huggingface.co/datasets/almaghrabima/eval-test-data
11
-
12
- Usage:
13
- pip install -r requirements.txt
14
- python benchmark_pypi.py
15
-
16
- Requirements: see benchmarks/requirements.txt
17
- """
18
-
19
- import os
20
- import re
21
- import json
22
- import time
23
- import random
24
-
25
- import pyarrow.parquet as pq
26
-
27
- # Import from PyPI package
28
- from deeplatent import SARFTokenizer, version, RUST_AVAILABLE
29
-
30
- print(f"deeplatent-nlp version: {version()}")
31
- print(f"Rust available: {RUST_AVAILABLE}")
32
-
33
-
34
- # โ”€โ”€ Tokenizer wrappers โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
35
-
36
class SarfTokenizerWrapper:
    """SARF tokenizer (PyPI ``deeplatent-nlp``) behind the benchmark's
    uniform ``encode`` / ``vocab_size`` / ``name`` interface."""

    def __init__(self, name_or_path: str, display_name: str = "SARFTokenizer"):
        # from_pretrained resolves either a local path or an HF repo id.
        self._tokenizer = SARFTokenizer.from_pretrained(name_or_path)
        self._label = display_name

    def encode(self, text: str) -> list:
        return self._tokenizer.encode(text)

    @property
    def vocab_size(self) -> int:
        return self._tokenizer.vocab_size

    @property
    def name(self) -> str:
        return self._label
53
-
54
-
55
class TiktokenTokenizer:
    """Wrapper around an OpenAI ``tiktoken`` encoding (e.g. ``o200k_base``)."""

    def __init__(self, encoding_name: str, display_name: str = None):
        # Deferred import: tiktoken is only needed when this baseline is used.
        import tiktoken

        self._encoding = tiktoken.get_encoding(encoding_name)
        self._label = display_name if display_name else encoding_name

    def encode(self, text: str) -> list:
        # allowed_special="all" keeps text containing special-token strings
        # from raising during encoding.
        return self._encoding.encode(text, allowed_special="all")

    @property
    def vocab_size(self) -> int:
        return self._encoding.n_vocab

    @property
    def name(self) -> str:
        return self._label
71
-
72
-
73
class HFTokenizer:
    """Wrapper around a HuggingFace ``transformers`` AutoTokenizer."""

    def __init__(self, model_id: str, display_name: str = None):
        from transformers import AutoTokenizer

        try:
            tok = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
        except Exception:
            # Some repos only load with the slow (non-Rust) tokenizer.
            tok = AutoTokenizer.from_pretrained(
                model_id, trust_remote_code=True, use_fast=False
            )
        self._tokenizer = tok
        if display_name:
            self._label = display_name
        else:
            self._label = model_id.split("/")[-1]

    def encode(self, text: str) -> list:
        # Count raw subword tokens only -- no BOS/EOS specials.
        return self._tokenizer.encode(text, add_special_tokens=False)

    @property
    def vocab_size(self) -> int:
        return len(self._tokenizer)

    @property
    def name(self) -> str:
        return self._label
92
-
93
-
94
- # โ”€โ”€ Data loading โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
95
-
96
# Quick membership test for characters in the Arabic Unicode block
# (U+0600-U+06FF).
AR_DETECT = re.compile(r'[\u0600-\u06FF]')

# HuggingFace dataset repositories used by this benchmark.
HF_BENCHMARK_DATA = "almaghrabima/deeplatent-benchmark-data"  # 60k samples (30k AR + 30k EN)
HF_EVAL_DATA = "almaghrabima/eval-test-data"  # Eval test data
101
-
102
-
103
def load_samples_from_hf(dataset_id: str = HF_BENCHMARK_DATA):
    """Load Arabic and English samples from a HuggingFace dataset.

    Args:
        dataset_id: HuggingFace dataset ID
            - "almaghrabima/deeplatent-benchmark-data" (default): 60k samples for benchmarking
            - "almaghrabima/eval-test-data": Eval test data

    Returns:
        Tuple of (arabic_samples, english_samples), each a list of strings.
    """
    from huggingface_hub import hf_hub_download

    cache_dir = os.path.expanduser("~/.cache/deeplatent/benchmark_data")
    os.makedirs(cache_dir, exist_ok=True)

    def _fetch_texts(parquet_name: str) -> list:
        # Download (or reuse from cache) one parquet file, then pull its
        # "text" column into a plain Python list.
        local_path = hf_hub_download(
            repo_id=dataset_id,
            filename=parquet_name,
            repo_type="dataset",
            cache_dir=cache_dir,
        )
        return pq.read_table(local_path).column("text").to_pylist()

    ar_samples = _fetch_texts("arabic_samples.parquet")
    en_samples = _fetch_texts("english_samples.parquet")

    print(f"Loaded {len(ar_samples)} Arabic, {len(en_samples)} English samples from {dataset_id}")
    return ar_samples, en_samples
143
-
144
-
145
- # โ”€โ”€ Metrics โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
146
-
147
# Word extractors used for fertility: a "word" is a maximal run of
# Arabic-block letters or of ASCII letters, respectively.
AR_WORD = re.compile(r'[\u0600-\u06FF]+')
EN_WORD = re.compile(r'[a-zA-Z]+')


def _corpus_stats(tokenizer, texts: list, word_re) -> tuple:
    """Accumulate (fertility, chars_per_token) for one language corpus.

    fertility = total tokens per word, where each word matched by *word_re*
    is encoded in isolation; chars_per_token = total characters / total
    tokens over the full texts. Both fall back to 0 on empty denominators.
    """
    total_chars = total_tokens = total_words = total_word_tokens = 0
    for text in texts:
        tokens = tokenizer.encode(text)
        total_chars += len(text)
        total_tokens += len(tokens)
        words = word_re.findall(text)
        total_words += len(words)
        for w in words:
            total_word_tokens += len(tokenizer.encode(w))
    fertility = total_word_tokens / total_words if total_words else 0
    cpt = total_chars / total_tokens if total_tokens else 0
    return fertility, cpt


def compute_metrics(tokenizer, ar_texts: list, en_texts: list) -> dict:
    """Compute fertility and parity metrics.

    The original implementation duplicated the accumulation loop once per
    language; both passes now share ``_corpus_stats``.

    Args:
        tokenizer: any object exposing ``encode(text) -> list``.
        ar_texts: Arabic sample texts.
        en_texts: English sample texts.

    Returns:
        Dict with per-language fertility and chars/token ("cpt"), their
        average fertility, and parity = AR cpt / EN cpt (1.0 means both
        languages are treated equally; 0 when EN cpt is 0).
    """
    ar_fertility, ar_cpt = _corpus_stats(tokenizer, ar_texts, AR_WORD)
    en_fertility, en_cpt = _corpus_stats(tokenizer, en_texts, EN_WORD)
    parity = ar_cpt / en_cpt if en_cpt else 0

    return {
        "ar_fertility": ar_fertility,
        "ar_cpt": ar_cpt,
        "en_fertility": en_fertility,
        "en_cpt": en_cpt,
        "parity": parity,
        "avg_fertility": (ar_fertility + en_fertility) / 2,
    }
189
-
190
-
191
- # โ”€โ”€ Configuration โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
192
-
193
# SARF tokenizers from HuggingFace: (display name, HF repo id).
SARF_TOKENIZERS = [
    ("SARFTokenizer", "almaghrabima/SARFTokenizer"),
]

# Baseline tokenizers: (display name, loader kind, source).
# kind "tiktoken" -> a tiktoken encoding name; kind "hf" -> an HF model id.
BASELINE_TOKENIZERS = [
    ("GPT-4o", "tiktoken", "o200k_base"),
    ("GPT-4", "tiktoken", "cl100k_base"),
    ("Gemma-3-4B", "hf", "google/gemma-3-4b-it"),
    ("Command-R-Arabic", "hf", "CohereLabs/c4ai-command-r7b-arabic-02-2025"),
    ("Fanar-1-9B", "hf", "QCRI/Fanar-1-9B-Instruct"),
    ("Qwen3-4B", "hf", "Qwen/Qwen3-4B-Instruct-2507"),
]

# Benchmark schedule: each tokenizer is measured on NUM_RUNS independent
# random subsamples of SAMPLES_PER_RUN texts per language.
NUM_RUNS = 5
SAMPLES_PER_RUN = 5000
210
-
211
-
212
- # โ”€โ”€ Main โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
213
-
214
def main():
    """Run the parity benchmark end to end: load tokenizers, fetch data,
    compute metrics over NUM_RUNS random subsamples, print a ranked table,
    and write the full results to benchmark_results.json."""
    print("=" * 100)
    print("TOKENIZER PARITY BENCHMARK")
    print("Dataset: almaghrabima/deeplatent-benchmark-data")
    print("=" * 100)

    # Load tokenizers. Failures are reported and skipped rather than fatal,
    # so the benchmark still runs with whichever tokenizers load successfully.
    print("\nLoading tokenizers...")
    tokenizers = []

    for name, hf_repo in SARF_TOKENIZERS:
        print(f" {name}...", end=" ", flush=True)
        try:
            tok = SarfTokenizerWrapper(hf_repo, name)
            print(f"OK (vocab={tok.vocab_size:,})")
            tokenizers.append(tok)
        except Exception as e:
            print(f"FAILED: {e}")

    for name, typ, source in BASELINE_TOKENIZERS:
        print(f" {name}...", end=" ", flush=True)
        try:
            if typ == "tiktoken":
                tok = TiktokenTokenizer(source, name)
            else:
                tok = HFTokenizer(source, name)
            print(f"OK (vocab={tok.vocab_size:,})")
            tokenizers.append(tok)
        except Exception as e:
            print(f"FAILED: {e}")

    print(f"\nLoaded {len(tokenizers)} tokenizers.")

    # Load all samples from HuggingFace (full 60k pool; subsampled per run).
    print("\nLoading evaluation data from HuggingFace...")
    all_ar, all_en = load_samples_from_hf(HF_BENCHMARK_DATA)

    # Run benchmark NUM_RUNS times; per-tokenizer metrics collected by name.
    all_runs = {tok.name: [] for tok in tokenizers}

    for run in range(NUM_RUNS):
        print(f"\n{'='*80}")
        print(f"RUN {run+1}/{NUM_RUNS}")
        print(f"{'='*80}")

        # Fixed per-run seed: every tokenizer within a run sees the same
        # sample, and the whole benchmark is reproducible across invocations.
        random.seed(42 + run)
        ar_sample = random.sample(all_ar, min(SAMPLES_PER_RUN, len(all_ar)))
        en_sample = random.sample(all_en, min(SAMPLES_PER_RUN, len(all_en)))
        print(f"Sampled {len(ar_sample)} AR, {len(en_sample)} EN")

        for tok in tokenizers:
            print(f" {tok.name}...", end=" ", flush=True)
            t0 = time.time()
            m = compute_metrics(tok, ar_sample, en_sample)
            all_runs[tok.name].append(m)
            print(f"parity={m['parity']:.4f} ({time.time()-t0:.1f}s)")

    # Compute per-tokenizer averages across runs.
    print("\n" + "=" * 100)
    print("COMPUTING AVERAGES")
    print("=" * 100)

    results = []
    for tok in tokenizers:
        runs = all_runs[tok.name]
        n = len(runs)

        # Population standard deviation of parity across the NUM_RUNS runs.
        parity_vals = [r["parity"] for r in runs]
        parity_avg = sum(parity_vals) / n
        parity_std = (sum((v - parity_avg)**2 for v in parity_vals) / n) ** 0.5

        avg = {
            "name": tok.name,
            "vocab_size": tok.vocab_size,
            "ar_fertility_avg": sum(r["ar_fertility"] for r in runs) / n,
            "en_fertility_avg": sum(r["en_fertility"] for r in runs) / n,
            "avg_fertility_avg": sum(r["avg_fertility"] for r in runs) / n,
            "ar_cpt_avg": sum(r["ar_cpt"] for r in runs) / n,
            "en_cpt_avg": sum(r["en_cpt"] for r in runs) / n,
            "parity_avg": parity_avg,
            "parity_std": parity_std,
            "runs": runs,  # per-run detail kept for the JSON dump below
        }
        results.append(avg)

    # Sort by parity: distance from the ideal 1.0, best first.
    results_sorted = sorted(results, key=lambda r: abs(1.0 - r["parity_avg"]))

    # Print table.
    print("\n" + "=" * 140)
    print(f"FINAL RESULTS (averaged over {NUM_RUNS} runs, {SAMPLES_PER_RUN} samples each)")
    print("=" * 140)
    # NOTE(review): "ยฑ" in the header looks like mojibake of "±" introduced
    # by the page extraction -- confirm against the original file's encoding.
    header = f"{'Rank':<5} {'Tokenizer':<22} {'Vocab':>10} {'AR Fert':>10} {'EN Fert':>10} {'Avg Fert':>10} {'AR C/T':>10} {'EN C/T':>10} {'Parity':>10} {'ยฑStd':>8}"
    print(header)
    print("-" * 140)

    for rank, r in enumerate(results_sorted, 1):
        is_best = rank == 1
        is_sarf = "SARF" in r["name"]
        # NOTE(review): "๐Ÿ†" below looks like mojibake of a trophy emoji --
        # verify against the original source before relying on the output.
        marker = " ๐Ÿ†" if is_best else (" ***" if is_sarf else "")
        print(f"{rank:<5} {r['name']:<22} {r['vocab_size']:>10,} {r['ar_fertility_avg']:>10.3f} {r['en_fertility_avg']:>10.3f} {r['avg_fertility_avg']:>10.3f} {r['ar_cpt_avg']:>10.3f} {r['en_cpt_avg']:>10.3f} {r['parity_avg']:>10.4f} {r['parity_std']:>7.4f}{marker}")

    print("=" * 140)
    print("*** = SARF tokenizers (using PyPI deeplatent-nlp) | ๐Ÿ† = Best parity (closest to 1.0)")
    print("Parity = AR chars/token รท EN chars/token (1.0 = equal treatment)")

    # Save results: sorted summaries (without per-run detail) plus the full
    # per-run metrics, keyed by tokenizer display name.
    output = {
        "package": "deeplatent-nlp",
        "version": version(),
        "dataset": HF_BENCHMARK_DATA,
        "num_runs": NUM_RUNS,
        "samples_per_run": SAMPLES_PER_RUN,
        "results": [{k: v for k, v in r.items() if k != "runs"} for r in results_sorted],
        "detailed_runs": {r["name"]: r["runs"] for r in results_sorted},
    }

    output_path = "benchmark_results.json"
    with open(output_path, "w") as f:
        json.dump(output, f, indent=2, ensure_ascii=False)
    print(f"\nResults saved to {output_path}")
335
-
336
-
337
# Script entry point: run the benchmark only when executed directly.
if __name__ == "__main__":
    main()