Spaces:
Running
Running
| from __future__ import annotations | |
| from pathlib import Path | |
| import pyarrow.parquet as pq | |
| import tiktoken | |
class LiveShardTokenizer:
    """Count rows and tiktoken tokens in the ``text`` column of a Parquet shard."""

    def __init__(self, encoding_name: str = "cl100k_base") -> None:
        """Load the tiktoken encoding used for token counting.

        Args:
            encoding_name: Encoding name handed to ``tiktoken.get_encoding``.
        """
        self.encoding = tiktoken.get_encoding(encoding_name)

    def tokenize_shard_text(self, shard_path: Path) -> tuple[int, int]:
        """Return ``(rows, token_count)`` for the shard's ``text`` column.

        ``rows`` is the number of non-null values in the column and
        ``token_count`` is the summed length of their encodings. Null values
        are skipped and contribute to neither number. Non-null values are
        passed through ``str()`` before encoding.

        Args:
            shard_path: Location of the Parquet shard to read.
                NOTE(review): the schema probe below assumes this is a single
                Parquet file rather than a dataset directory -- confirm
                against callers.

        Returns:
            ``(0, 0)`` when the shard has no ``text`` column, otherwise the
            ``(rows, token_count)`` pair described above.
        """
        # Look at the schema *before* requesting the column: asking
        # ``read_table`` for a column the file does not contain can raise
        # instead of returning a table without it, which made the former
        # post-read membership check unreachable in the missing-column case.
        if "text" not in pq.read_schema(shard_path).names:
            return 0, 0

        table = pq.read_table(shard_path, columns=["text"])

        rows = 0
        token_count = 0
        for value in table.column("text").to_pylist():
            if value is None:
                continue
            rows += 1
            # ``disallowed_special=()`` turns off tiktoken's special-token
            # check, so text that happens to contain special-token strings
            # is encoded as ordinary text.
            token_count += len(
                self.encoding.encode(str(value), disallowed_special=())
            )
        return rows, token_count