almaghrabima committed on
Commit
9770614
·
verified ·
1 Parent(s): e180732

Upload benchmark_tiktoken_style.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. benchmark_tiktoken_style.py +264 -0
benchmark_tiktoken_style.py ADDED
@@ -0,0 +1,264 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Tiktoken-style benchmark comparing SARFTokenizer vs tiktoken vs HuggingFace.
4
+
5
+ Measures throughput in MB/s with proper thread isolation using multiprocessing.
6
+
7
+ Usage:
8
+ python benchmark_tiktoken_style.py --samples 1000000 --threads 1 2 4 8
9
+ """
10
+
11
+ import os
12
+ import sys
13
+ import time
14
+ import argparse
15
+ from pathlib import Path
16
+ from typing import List, Tuple
17
+ from multiprocessing import Process, Queue, cpu_count
18
+
19
+ import pyarrow.parquet as pq
20
+
21
+ # Add parent to path
22
+ sys.path.insert(0, str(Path(__file__).parent))
23
+
24
# Configuration
# Directory of parquet shards holding the benchmark corpus — TODO confirm layout.
DATA_DIR = "/root/.cache/deeplatent/base_data/"
# Pretrained SARFTokenizer directory (HF layout; contains tokenizer.json).
HF_TOKENIZER_PATH = os.path.expanduser("~/.cache/deeplatent/tokenizers/SARFTokenizer")
# Powers of two (1, 2, 4, ...) capped at the machine's CPU count.
DEFAULT_THREADS = [2**i for i in range(8) if 2**i <= cpu_count()]
28
+
29
+
30
def format_byte_size(num_bytes: float) -> Tuple[str, str]:
    """Return *num_bytes* as a human-readable string together with its unit.

    Walks the binary-prefix ladder (dividing by 1024 per step) and stops at
    the first unit where the value drops below 1024; anything beyond TB is
    reported in petabytes.
    """
    remaining = num_bytes
    for unit in ("B", "KB", "MB", "GB", "TB"):
        if remaining < 1024:
            return f"{remaining:.2f} {unit}", unit
        remaining /= 1024
    return f"{remaining:.2f} PB", "PB"
37
+
38
+
39
def load_samples(data_dir: str, num_samples: int) -> Tuple[List[str], int]:
    """Load up to *num_samples* non-empty text documents from parquet shards.

    Scans ``shard_*.parquet`` files in *data_dir* in sorted order, reading
    only the ``text`` column, and stops as soon as enough samples have been
    collected.

    Args:
        data_dir: Directory containing ``shard_*.parquet`` files.
        num_samples: Maximum number of documents to collect.

    Returns:
        ``(samples, total_bytes)`` where *total_bytes* is the UTF-8 encoded
        size of all collected samples.

    Raises:
        FileNotFoundError: If *data_dir* contains no matching parquet files.
    """
    # NOTE: removed an unused `import re` / Arabic-range regex (AR_DETECT)
    # that was defined here but never applied to any text.
    parquet_files = sorted(Path(data_dir).glob("shard_*.parquet"))
    if not parquet_files:
        raise FileNotFoundError(f"No parquet files found in {data_dir}")

    samples: List[str] = []
    for pq_file in parquet_files:
        if len(samples) >= num_samples:
            break

        table = pq.read_table(pq_file, columns=["text"])
        texts = table.column("text").to_pylist()

        for text in texts:
            if len(samples) >= num_samples:
                break
            # Skip nulls / non-string values that parquet columns may yield.
            if text and isinstance(text, str):
                samples.append(text)

    total_bytes = sum(len(t.encode('utf-8')) for t in samples)
    return samples, total_bytes
66
+
67
+
68
def benchmark_sarf(documents: List[str], num_threads: int, result_queue: Queue):
    """Benchmark SARFTokenizer in this (child) process.

    Puts a single ``("SARFTokenizer", bytes_per_sec, texts_per_sec)`` tuple
    on *result_queue*.
    """
    # Fix: set the rayon pool size BEFORE importing the extension module —
    # rayon reads RAYON_NUM_THREADS when its global pool is initialized,
    # which can happen as early as import time; setting it afterwards may
    # silently have no effect.
    os.environ["RAYON_NUM_THREADS"] = str(num_threads)

    from deeplatent import SARFTokenizer

    tok = SARFTokenizer.from_pretrained(HF_TOKENIZER_PATH)
    num_bytes = sum(len(d.encode('utf-8')) for d in documents)

    # Warmup outside the timed region (lazy pool construction, caches).
    tok.encode(documents[0])

    # Benchmark: prefer the batch API when available.
    start = time.perf_counter_ns()
    if hasattr(tok, 'encode_batch'):
        tok.encode_batch(documents)
    else:
        for d in documents:
            tok.encode(d)
    end = time.perf_counter_ns()

    elapsed_ns = end - start
    bytes_per_sec = num_bytes / elapsed_ns * 1e9
    texts_per_sec = len(documents) / elapsed_ns * 1e9

    result_queue.put(("SARFTokenizer", bytes_per_sec, texts_per_sec))
94
+
95
+
96
def benchmark_tiktoken(documents: List[str], num_threads: int, encoding: str, result_queue: Queue):
    """Benchmark tiktoken with the given *encoding* and thread count.

    Puts a single ``(name, bytes_per_sec, texts_per_sec)`` tuple on
    *result_queue*.
    """
    import tiktoken

    os.environ["RAYON_NUM_THREADS"] = str(num_threads)

    enc = tiktoken.get_encoding(encoding)
    total_bytes = sum(len(doc.encode('utf-8')) for doc in documents)

    # Warmup run keeps one-time setup cost out of the measurement.
    enc.encode(documents[0])

    # Timed region: tiktoken manages its own worker threads here.
    t0 = time.perf_counter_ns()
    enc.encode_ordinary_batch(documents, num_threads=num_threads)
    t1 = time.perf_counter_ns()

    duration_ns = t1 - t0
    byte_rate = total_bytes / duration_ns * 1e9
    doc_rate = len(documents) / duration_ns * 1e9

    result_queue.put((f"tiktoken ({encoding})", byte_rate, doc_rate))
118
+
119
+
120
def benchmark_hf_tokenizers(documents: List[str], num_threads: int, result_queue: Queue):
    """Benchmark the HuggingFace `tokenizers` backend in this (child) process.

    Loads the SARFTokenizer's underlying ``tokenizer.json`` and puts a single
    ``("HF tokenizers", bytes_per_sec, texts_per_sec)`` tuple on *result_queue*.
    """
    # Fix: set the rayon pool size BEFORE importing `tokenizers` — the env
    # var is read when the rayon global pool is initialized, which can happen
    # at import time; setting it afterwards may silently have no effect.
    os.environ["RAYON_NUM_THREADS"] = str(num_threads)

    from tokenizers import Tokenizer

    # Load the SARFTokenizer's underlying HF tokenizer
    tokenizer_path = os.path.join(HF_TOKENIZER_PATH, "tokenizer.json")
    tok = Tokenizer.from_file(tokenizer_path)
    num_bytes = sum(len(d.encode('utf-8')) for d in documents)

    # Warmup outside the timed region (lazy pool construction, caches).
    tok.encode(documents[0])

    # Benchmark
    start = time.perf_counter_ns()
    tok.encode_batch_fast(documents)
    end = time.perf_counter_ns()

    elapsed_ns = end - start
    bytes_per_sec = num_bytes / elapsed_ns * 1e9
    texts_per_sec = len(documents) / elapsed_ns * 1e9

    result_queue.put(("HF tokenizers", bytes_per_sec, texts_per_sec))
144
+
145
+
146
def _run_isolated(target, args) -> list:
    """Run *target* in a fresh process and drain its result queue.

    One child process per benchmark keeps native thread-pool state (rayon,
    tiktoken workers) from leaking between runs. Returns the queued results;
    empty if the child crashed before reporting.
    """
    q = Queue()
    p = Process(target=target, args=(*args, q))
    p.start()
    p.join()
    results = []
    # NOTE(review): Queue.empty() after join() is best-effort per the
    # multiprocessing docs; matches the original behavior of skipping
    # a benchmark whose child produced nothing.
    while not q.empty():
        results.append(q.get())
    return results


def run_benchmark(documents: List[str], num_threads: int, num_bytes: int):
    """Run all tokenizer benchmarks at *num_threads*, printing one line each.

    Returns a list of ``(name, bytes_per_sec, texts_per_sec)`` tuples for
    the benchmarks that completed.
    """
    readable_size, _ = format_byte_size(num_bytes)
    avg_len = sum(len(d) for d in documents) / len(documents)

    print(f"\n{'='*70}")
    print(f"Threads: {num_threads}, Data: {readable_size}, Documents: {len(documents):,}, Avg Length: {avg_len:.0f}")
    print(f"{'='*70}")

    # (target, positional args); the result queue is appended by the helper.
    # Replaces four copy-pasted spawn/join/report stanzas.
    benchmarks = [
        (benchmark_sarf, (documents, num_threads)),
        (benchmark_tiktoken, (documents, num_threads, "o200k_base")),
        (benchmark_tiktoken, (documents, num_threads, "cl100k_base")),
        (benchmark_hf_tokenizers, (documents, num_threads)),
    ]

    results = []
    for target, args in benchmarks:
        for name, bps, tps in _run_isolated(target, args):
            readable, _ = format_byte_size(bps)
            print(f"{name:<20}\t{readable}/s\t({tps:,.0f} texts/s)")
            results.append((name, bps, tps))

    return results
202
+
203
+
204
def _print_summary(all_results: dict, thread_counts: List[int]) -> None:
    """Print the cross-thread-count summary table in MB/s."""
    print("\n" + "=" * 100)
    print("SUMMARY TABLE (MB/s)")
    print("=" * 100)

    # Header row: one right-aligned column per thread count.
    header = f"{'Tokenizer':<25}"
    for t in thread_counts:
        header += f"{t}T".rjust(15)
    print(header)
    print("-" * 100)

    # Pivot results: tokenizer name -> {thread count -> MB/s}.
    tokenizers = {}
    for threads, results in all_results.items():
        for name, bps, _tps in results:
            tokenizers.setdefault(name, {})[threads] = bps / 1024 / 1024

    for name, thread_results in tokenizers.items():
        row = f"{name:<25}"
        for t in thread_counts:
            if t in thread_results:
                # Fix: width 15 to line up with the header and N/A cells
                # (was 14, leaving every value column one char out of line).
                row += f"{thread_results[t]:>15.2f}"
            else:
                row += "N/A".rjust(15)
        print(row)

    print("=" * 100)


def main():
    """Parse CLI arguments, load the corpus, and run every benchmark suite."""
    parser = argparse.ArgumentParser(description="Tiktoken-style tokenizer benchmark")
    parser.add_argument("--samples", type=int, default=10000, help="Number of samples")
    parser.add_argument("--threads", type=int, nargs="+", default=DEFAULT_THREADS, help="Thread counts")
    parser.add_argument("--data-dir", type=str, default=DATA_DIR, help="Data directory")
    args = parser.parse_args()

    print("=" * 70)
    print("TIKTOKEN-STYLE TOKENIZER BENCHMARK")
    print("=" * 70)
    print(f"CPU count: {cpu_count()}")
    print(f"Samples: {args.samples:,}")
    print(f"Threads: {args.threads}")

    # Load data once; the same documents feed every thread-count run.
    print("\nLoading data...")
    documents, total_bytes = load_samples(args.data_dir, args.samples)
    readable_size, _ = format_byte_size(total_bytes)
    print(f"Loaded {len(documents):,} documents ({readable_size})")

    # Run benchmarks per thread count.
    all_results = {}
    for num_threads in args.threads:
        all_results[num_threads] = run_benchmark(documents, num_threads, total_bytes)

    _print_summary(all_results, args.threads)
261
+
262
+
263
# Script entry point.
if __name__ == "__main__":
    main()