"""SmartCore V1 tokenizer wrapper — uses SentencePiece directly (no dependency on transformers).

The training/data pipeline uses this wrapper instead of the AutoTokenizer
interface in referans_kod/tokenizer.py. It provides encode/decode/vocab_size
plus optional EOS appending.
"""
| from __future__ import annotations | |
| from typing import Iterable, List | |
| import sentencepiece as spm | |
# Model path used when SCTokenizer is constructed without an explicit model_file.
DEFAULT_MODEL = "kod/tokenizer/tokenizer.model"


class SCTokenizer:
    """Thin wrapper around a SentencePiece model.

    Attributes:
        sp: The underlying ``SentencePieceProcessor``.
        vocab_size: Number of pieces in the loaded model.
        unk_id, bos_id, eos_id, pad_id: Special-token ids as reported by the
            model. SentencePiece reports -1 for a special token the model
            does not define.
    """

    def __init__(self, model_file: str = DEFAULT_MODEL):
        """Load the SentencePiece model at *model_file* and cache its ids."""
        self.sp = spm.SentencePieceProcessor(model_file=model_file)
        self.vocab_size = self.sp.get_piece_size()
        self.unk_id = self.sp.unk_id()
        self.bos_id = self.sp.bos_id()
        self.eos_id = self.sp.eos_id()
        self.pad_id = self.sp.pad_id()

    def encode(self, text: str, add_eos: bool = True) -> List[int]:
        """Encode *text* into token ids, optionally followed by the EOS id.

        Args:
            text: Text to tokenize.
            add_eos: When true, append ``self.eos_id`` to the result.

        Returns:
            The list of token ids.

        Raises:
            ValueError: If ``add_eos`` is true but the model defines no EOS
                token (``eos_id`` is negative).
        """
        ids = self.sp.encode(text, out_type=int)
        if add_eos:
            # A negative eos_id means "no EOS token"; appending it would put
            # an out-of-vocabulary id into the token stream.
            if self.eos_id < 0:
                raise ValueError(
                    "add_eos=True, but the tokenizer model defines no EOS token "
                    f"(eos_id={self.eos_id}); pass add_eos=False"
                )
            ids.append(self.eos_id)
        return ids

    def encode_stream(self, docs: Iterable[str], add_eos: bool = True) -> Iterable[List[int]]:
        """Lazily encode each document in *docs*, yielding one id list per document.

        Each document is encoded with :meth:`encode`, so *add_eos* has the
        same meaning and the same error condition.
        """
        for doc in docs:
            yield self.encode(doc, add_eos=add_eos)

    def decode(self, ids: List[int]) -> str:
        """Decode token ids back to text, dropping every EOS id first."""
        eos = self.eos_id
        return self.sp.decode([i for i in ids if i != eos])
if __name__ == "__main__":
    # Smoke test: load the default model, report its vocabulary size and EOS
    # id, then round-trip a mixed Turkish/English sample through encode/decode.
    tok = SCTokenizer()
    print(f"vocab={tok.vocab_size} eos={tok.eos_id}")

    sample_ids = tok.encode("Merhaba dünya! Hello world 2026.")
    round_trip = tok.decode(sample_ids)
    print(sample_ids, "->", round_trip)