File size: 785 Bytes
f55f92e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
from __future__ import annotations

from pathlib import Path

import pyarrow.parquet as pq
import tiktoken


class LiveShardTokenizer:
    """Count rows and tokens in the ``text`` column of a Parquet shard.

    The tokenizer is a tiktoken encoding selected by name at construction
    time and reused for every shard processed by this instance.
    """

    def __init__(self, encoding_name: str = "cl100k_base") -> None:
        """Load the tiktoken encoding identified by *encoding_name*."""
        self.encoding = tiktoken.get_encoding(encoding_name)

    def tokenize_shard_text(self, shard_path: Path) -> tuple[int, int]:
        """Return ``(rows, token_count)`` for the shard at *shard_path*.

        ``rows`` is the number of non-null values in the ``text`` column and
        ``token_count`` is the total number of tokens those values encode to.
        A shard without a ``text`` column yields ``(0, 0)``.

        Errors raised by pyarrow while opening or reading the file are not
        caught here and propagate to the caller.
        """
        # Look at the schema *before* requesting the column: depending on the
        # pyarrow version, read_table(columns=["text"]) may raise when the
        # column is absent instead of returning a table without it, which
        # would make a post-read check unreachable.
        if "text" not in pq.read_schema(shard_path).names:
            return 0, 0

        table = pq.read_table(shard_path, columns=["text"])

        rows = 0
        token_count = 0
        for value in table.column("text").to_pylist():
            if value is None:
                # Null cells count neither as rows nor as tokens.
                continue
            text = str(value)
            rows += 1
            # disallowed_special=() turns off tiktoken's check for special
            # token text, so such text is encoded as ordinary text rather
            # than raising.
            token_count += len(self.encoding.encode(text, disallowed_special=()))
        return rows, token_count