from __future__ import annotations

from pathlib import Path

import pyarrow.parquet as pq
import tiktoken


class LiveShardTokenizer:
    """Count rows and tokens in the ``text`` column of Parquet shards.

    The tiktoken encoding is resolved once at construction time and reused
    for every shard passed to :meth:`tokenize_shard_text`.
    """

    def __init__(self, encoding_name: str = "cl100k_base") -> None:
        """Load the tiktoken encoding used for counting.

        Args:
            encoding_name: Name handed to ``tiktoken.get_encoding``.
        """
        self.encoding = tiktoken.get_encoding(encoding_name)

    def tokenize_shard_text(self, shard_path: Path) -> tuple[int, int]:
        """Return ``(rows, token_count)`` for the shard's ``text`` column.

        Null values are skipped and do not count as rows. Every other value
        is converted with ``str()`` and encoded with special-token checking
        disabled (``disallowed_special=()``), so text that looks like a
        special token is counted as ordinary text instead of raising.

        Args:
            shard_path: Location of the Parquet shard to read.

        Returns:
            A ``(rows, token_count)`` tuple: the number of non-null values
            and the total number of tokens across them. ``(0, 0)`` when the
            shard has no ``text`` column.

        Errors raised by pyarrow while opening or reading the shard are not
        caught here and propagate to the caller.
        """
        dataset = pq.ParquetDataset(shard_path)

        # Inspect the schema *before* requesting the column: pyarrow's
        # dataset-based reader rejects a projection that names an absent
        # column, so a check made only after the read would never be reached.
        if "text" not in dataset.schema.names:
            return 0, 0

        table = dataset.read(columns=["text"])

        # Retained for readers that silently drop unknown columns.
        if "text" not in table.column_names:
            return 0, 0

        encode = self.encoding.encode
        rows = 0
        token_count = 0

        # Convert one Arrow chunk at a time so only a single chunk's worth of
        # Python strings is alive at once, instead of the whole column.
        for chunk in table.column("text").chunks:
            for value in chunk.to_pylist():
                if value is None:
                    continue
                rows += 1
                token_count += len(encode(str(value), disallowed_special=()))

        return rows, token_count