smartcore-v1 / code /kod /sc_tokenizer.py
kdirgul's picture
kod (data hariç) Colab için
9aed7c4 verified
Raw
History Blame Contribute Delete
1.46 kB
"""SmartCore V1 tokenizer wrapper — SentencePiece doğrudan (transformers'a bağımsız).
Eğitim/veri pipeline'ı bunu kullanır (referans_kod/tokenizer.py'nin AutoTokenizer
arayüzü yerine). encode/decode/vocab_size + EOS ekleme.
"""
from __future__ import annotations
from typing import Iterable, List
import sentencepiece as spm
# Default model location; a relative path, so it is resolved against the current working directory.
DEFAULT_MODEL = "kod/tokenizer/tokenizer.model"


class SCTokenizer:
    """Thin wrapper around a SentencePiece model.

    Exposes ``encode``/``decode``, the vocabulary size and the special-token
    ids reported by the underlying ``SentencePieceProcessor``.
    """

    def __init__(self, model_file: str = DEFAULT_MODEL):
        """Load the SentencePiece model at *model_file* and cache its metadata.

        Args:
            model_file: Path to the serialized SentencePiece ``.model`` file.
        """
        self.sp = spm.SentencePieceProcessor(model_file=model_file)
        self.vocab_size = self.sp.get_piece_size()
        # Special-token ids exactly as the model reports them. SentencePiece
        # uses -1 for a special token that is disabled in the model.
        self.unk_id = self.sp.unk_id()
        self.bos_id = self.sp.bos_id()
        self.eos_id = self.sp.eos_id()
        self.pad_id = self.sp.pad_id()

    def encode(self, text: str, add_eos: bool = True) -> List[int]:
        """Tokenize *text* into a list of token ids.

        Args:
            text: The string to tokenize.
            add_eos: When true, append the model's EOS id to the result.

        Returns:
            The token ids, followed by the EOS id if *add_eos* is true.

        Raises:
            ValueError: If *add_eos* is true but the model has no EOS token
                (``eos_id`` is negative). Appending that id would put an
                invalid token into the output.
        """
        if add_eos and self.eos_id < 0:
            raise ValueError(
                "add_eos=True but the tokenizer model defines no EOS token "
                f"(eos_id={self.eos_id})"
            )
        ids = self.sp.encode(text, out_type=int)
        if add_eos:
            ids.append(self.eos_id)
        return ids

    def encode_stream(self, docs: Iterable[str], add_eos: bool = True) -> Iterable[List[int]]:
        """Lazily encode each document in *docs*, yielding one id list per document.

        Args:
            docs: Iterable of strings to tokenize.
            add_eos: Forwarded to :meth:`encode` for every document.
        """
        for doc in docs:
            yield self.encode(doc, add_eos=add_eos)

    def decode(self, ids: Iterable[int]) -> str:
        """Convert token ids back to text, dropping every EOS id.

        Args:
            ids: Any iterable of integer-like token ids (list, tuple,
                generator, NumPy/torch integer scalars, ...).

        Returns:
            The decoded string.
        """
        eos = self.eos_id
        # Coerce to built-in int and materialize a list so that integer-like
        # scalars and arbitrary iterables are handed to SentencePiece in the
        # plain ``list[int]`` form.
        return self.sp.decode([int(i) for i in ids if i != eos])
if __name__ == "__main__":
    # Smoke test: load the default model, report its vocabulary size and EOS
    # id, then round-trip a mixed Turkish/English sample through encode/decode.
    tok = SCTokenizer()
    print(f"vocab={tok.vocab_size} eos={tok.eos_id}")
    sample = "Merhaba dünya! Hello world 2026."
    ids = tok.encode(sample)
    decoded = tok.decode(ids)
    print(ids, "->", decoded)