Spaces:
Running
Running
File size: 785 Bytes
f55f92e | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 | from __future__ import annotations
from pathlib import Path
import pyarrow.parquet as pq
import tiktoken
class LiveShardTokenizer:
    """Count rows and tokens in the ``text`` column of a Parquet shard.

    The tiktoken encoding is resolved once at construction time and reused
    for every shard passed to :meth:`tokenize_shard_text`.
    """

    def __init__(self, encoding_name: str = "cl100k_base") -> None:
        """Load the tiktoken encoding identified by *encoding_name*."""
        self.encoding = tiktoken.get_encoding(encoding_name)

    def tokenize_shard_text(self, shard_path: Path) -> tuple[int, int]:
        """Return ``(rows, token_count)`` for the shard at *shard_path*.

        ``rows`` is the number of non-null values in the ``text`` column and
        ``token_count`` is the total number of tokens those values encode to.
        A shard that has no ``text`` column yields ``(0, 0)``.
        """
        # Check the footer schema *before* reading: asking ``read_table`` for
        # a column that does not exist raises on current pyarrow releases
        # (dataset-based reader) instead of returning a table without it, so
        # a guard placed only after the read would never get a chance to run.
        if "text" not in pq.read_schema(shard_path).names:
            return 0, 0

        table = pq.read_table(shard_path, columns=["text"])
        # Fallback for readers that silently drop unknown columns.
        if "text" not in table.column_names:
            return 0, 0

        rows = 0
        token_count = 0
        for value in table.column("text").to_pylist():
            if value is None:
                # Null cells count as neither a row nor any tokens.
                continue
            rows += 1
            # ``disallowed_special=()`` turns off tiktoken's special-token
            # check, so text that happens to contain a special-token string
            # is encoded as ordinary text rather than raising.
            token_count += len(
                self.encoding.encode(str(value), disallowed_special=())
            )
        return rows, token_count
|