# AutoWS/crawler/tokenizer.py
# Uploaded by Roman190928
# Commit f55f92e (verified): Upload AutoWS app files without plan/readme
from __future__ import annotations
from pathlib import Path
import pyarrow.parquet as pq
import tiktoken
class LiveShardTokenizer:
    """Count rows and tokens in the ``text`` column of Parquet shards.

    The tiktoken encoding is resolved once in the constructor and reused
    for every shard that is tokenized afterwards.
    """

    def __init__(self, encoding_name: str = "cl100k_base") -> None:
        """Load the tiktoken encoding identified by *encoding_name*."""
        self.encoding = tiktoken.get_encoding(encoding_name)

    def tokenize_shard_text(self, shard_path: Path) -> tuple[int, int]:
        """Return ``(rows, token_count)`` for the ``text`` column of a shard.

        Args:
            shard_path: Path of the Parquet file to read.

        Returns:
            A pair of the number of non-null ``text`` values and the total
            number of tokens they encode to. ``(0, 0)`` is returned when
            the shard has no ``text`` column.
        """
        # Check the schema before reading: asking read_table for a column
        # that is absent can raise rather than return a table without it,
        # which would make a post-read membership check unreachable.
        if "text" not in pq.read_schema(shard_path).names:
            return 0, 0

        table = pq.read_table(shard_path, columns=["text"])
        encode = self.encoding.encode  # hoist the attribute lookup out of the loop

        rows = 0
        token_count = 0
        for value in table.column("text").to_pylist():
            if value is None:
                # Null entries count neither as rows nor as tokens.
                continue
            rows += 1
            # disallowed_special=() lets special-token markers that appear in
            # the text be encoded as ordinary text instead of being rejected.
            token_count += len(encode(str(value), disallowed_special=()))
        return rows, token_count