"""SmartCore V1 tokenizer wrapper — SentencePiece doğrudan (transformers'a bağımsız). Eğitim/veri pipeline'ı bunu kullanır (referans_kod/tokenizer.py'nin AutoTokenizer arayüzü yerine). encode/decode/vocab_size + EOS ekleme. """ from __future__ import annotations from typing import Iterable, List import sentencepiece as spm DEFAULT_MODEL = "kod/tokenizer/tokenizer.model" class SCTokenizer: def __init__(self, model_file: str = DEFAULT_MODEL): self.sp = spm.SentencePieceProcessor(model_file=model_file) self.vocab_size = self.sp.get_piece_size() self.unk_id = self.sp.unk_id() self.bos_id = self.sp.bos_id() self.eos_id = self.sp.eos_id() self.pad_id = self.sp.pad_id() def encode(self, text: str, add_eos: bool = True) -> List[int]: ids = self.sp.encode(text, out_type=int) if add_eos: ids.append(self.eos_id) return ids def encode_stream(self, docs: Iterable[str], add_eos: bool = True) -> Iterable[List[int]]: for d in docs: yield self.encode(d, add_eos=add_eos) def decode(self, ids: List[int]) -> str: return self.sp.decode([i for i in ids if i != self.eos_id]) if __name__ == "__main__": tok = SCTokenizer() print(f"vocab={tok.vocab_size} eos={tok.eos_id}") ids = tok.encode("Merhaba dünya! Hello world 2026.") print(ids, "->", tok.decode(ids))