File size: 1,458 Bytes
9aed7c4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
"""SmartCore V1 tokenizer wrapper — SentencePiece doğrudan (transformers'a bağımsız).



Eğitim/veri pipeline'ı bunu kullanır (referans_kod/tokenizer.py'nin AutoTokenizer

arayüzü yerine). encode/decode/vocab_size + EOS ekleme.

"""
from __future__ import annotations
from typing import Iterable, List
import sentencepiece as spm

DEFAULT_MODEL = "kod/tokenizer/tokenizer.model"


class SCTokenizer:
    def __init__(self, model_file: str = DEFAULT_MODEL):
        self.sp = spm.SentencePieceProcessor(model_file=model_file)
        self.vocab_size = self.sp.get_piece_size()
        self.unk_id = self.sp.unk_id()
        self.bos_id = self.sp.bos_id()
        self.eos_id = self.sp.eos_id()
        self.pad_id = self.sp.pad_id()

    def encode(self, text: str, add_eos: bool = True) -> List[int]:
        ids = self.sp.encode(text, out_type=int)
        if add_eos:
            ids.append(self.eos_id)
        return ids

    def encode_stream(self, docs: Iterable[str], add_eos: bool = True) -> Iterable[List[int]]:
        for d in docs:
            yield self.encode(d, add_eos=add_eos)

    def decode(self, ids: List[int]) -> str:
        return self.sp.decode([i for i in ids if i != self.eos_id])


if __name__ == "__main__":
    tok = SCTokenizer()
    print(f"vocab={tok.vocab_size} eos={tok.eos_id}")
    ids = tok.encode("Merhaba dünya! Hello world 2026.")
    print(ids, "->", tok.decode(ids))