File size: 1,458 Bytes
"""SmartCore V1 tokenizer wrapper — uses SentencePiece directly (independent of transformers).
The training/data pipeline uses this (instead of the AutoTokenizer interface in
referans_kod/tokenizer.py). Provides encode/decode/vocab_size plus EOS appending.
"""
from __future__ import annotations
from typing import Iterable, List
import sentencepiece as spm
# Model path used when SCTokenizer() is constructed without a model_file.
DEFAULT_MODEL = "kod/tokenizer/tokenizer.model"


class SCTokenizer:
    """Thin wrapper around a SentencePiece model.

    Exposes ``encode``/``decode`` together with the vocabulary size and the
    special-token ids reported by the loaded model.
    """

    def __init__(self, model_file: str = DEFAULT_MODEL):
        """Load the SentencePiece model at *model_file* and cache its metadata."""
        self.sp = spm.SentencePieceProcessor(model_file=model_file)
        self.vocab_size = self.sp.get_piece_size()
        # SentencePiece reports -1 for a special token the model does not define.
        self.unk_id = self.sp.unk_id()
        self.bos_id = self.sp.bos_id()
        self.eos_id = self.sp.eos_id()
        self.pad_id = self.sp.pad_id()

    def encode(self, text: str, add_eos: bool = True) -> List[int]:
        """Tokenize *text* into ids, optionally followed by the EOS id.

        Args:
            text: the string to tokenize.
            add_eos: when true, append ``self.eos_id`` to the result.

        Returns:
            The list of token ids.

        Raises:
            ValueError: if *add_eos* is true but the model has no EOS token
                (``self.eos_id`` is negative). Appending that sentinel would
                silently put an invalid id into the output.
        """
        if add_eos and self.eos_id < 0:
            raise ValueError(
                f"add_eos=True but the tokenizer model defines no EOS token "
                f"(eos_id={self.eos_id})"
            )
        ids = self.sp.encode(text, out_type=int)
        if add_eos:
            ids.append(self.eos_id)
        return ids

    def encode_stream(self, docs: Iterable[str], add_eos: bool = True) -> Iterable[List[int]]:
        """Lazily encode each document in *docs*, yielding one id list per document."""
        for d in docs:
            yield self.encode(d, add_eos=add_eos)

    def decode(self, ids: Iterable[int]) -> str:
        """Turn *ids* back into text, dropping every occurrence of the EOS id first."""
        return self.sp.decode([i for i in ids if i != self.eos_id])
if __name__ == "__main__":
    # Smoke test: load the default model, report its size and EOS id, then
    # round-trip a mixed Turkish/English sample through encode and decode.
    tokenizer = SCTokenizer()
    print(f"vocab={tokenizer.vocab_size} eos={tokenizer.eos_id}")
    sample_ids = tokenizer.encode("Merhaba dünya! Hello world 2026.")
    round_trip = tokenizer.decode(sample_ids)
    print(sample_ids, "->", round_trip)
|