SymbolicLight-V1 / src /train_tokenizer.py
symboliclight-ai's picture
Upload SymbolicLight V1 open weights
5762a7c verified
#!/usr/bin/env python3
"""Public tokenizer wrapper for SymbolicLight.
This release file intentionally contains no corpus download or tokenizer-training
logic. It only loads the released SentencePiece model.
"""
from __future__ import annotations
from pathlib import Path
from typing import Iterable
import sentencepiece as spm
def _resolve_model_path(model_path: str | Path | None = None) -> str:
    """Locate the SentencePiece model file and return its path as a string.

    Candidates are probed in order and the first one that is a regular file
    wins:

    1. ``model_path`` exactly as given,
    2. ``model_path`` relative to the directory containing this module,
    3. ``model_path`` relative to that directory's parent,
    4. ``<parent>/tokenizer/sl_tokenizer.model``,
    5. ``sl_tokenizer.model`` next to this module.

    Steps 1-3 are skipped when ``model_path`` is ``None`` or empty. Note that
    a ``model_path`` which cannot be found does not raise on its own: the
    default locations (4 and 5) are still tried afterwards.

    Args:
        model_path: Optional explicit location of the ``.model`` file.

    Returns:
        The first candidate path that is an existing regular file.

    Raises:
        FileNotFoundError: If no candidate is a regular file. The message
            lists every distinct path that was tried.
    """
    here = Path(__file__).resolve().parent
    package_root = here.parent

    candidates: list[Path] = []
    if model_path:
        path = Path(model_path)
        candidates += [path, here / path, package_root / path]
    candidates += [
        package_root / "tokenizer" / "sl_tokenizer.model",
        here / "sl_tokenizer.model",
    ]

    # Joining onto an absolute path yields that same absolute path, so an
    # absolute model_path would otherwise be probed (and reported) three
    # times. dict.fromkeys drops duplicates while keeping the probe order.
    candidates = list(dict.fromkeys(candidates))

    for candidate in candidates:
        # is_file() rather than exists(): a directory that happens to carry
        # the expected name is not a loadable model and must not be returned.
        if candidate.is_file():
            return str(candidate)

    tried = ", ".join(str(candidate) for candidate in candidates)
    raise FileNotFoundError(f"SentencePiece model not found. Tried: {tried}")
class SLTokenizer:
    """Thin convenience layer over a loaded ``SentencePieceProcessor``.

    Attributes set by the constructor:
        model_path: Resolved path of the model file that was loaded.
        sp: The underlying SentencePiece processor.
        vocab_size: Vocabulary size reported by the processor.
        bos_id / eos_id: Special-token ids, or ``None`` when the processor
            reports a negative id for them.
    """

    def __init__(self, model_path: str | Path | None = None):
        """Resolve ``model_path`` and load the SentencePiece model from it."""
        self.model_path = _resolve_model_path(model_path)
        self.sp = spm.SentencePieceProcessor(model_file=self.model_path)
        self.vocab_size = int(self.sp.vocab_size())

        # A negative id from the processor is stored as None.
        if self.sp.bos_id() >= 0:
            self.bos_id = int(self.sp.bos_id())
        else:
            self.bos_id = None

        if self.sp.eos_id() >= 0:
            self.eos_id = int(self.sp.eos_id())
        else:
            self.eos_id = None

    def encode(self, text: str, add_bos: bool = False, add_eos: bool = False) -> list[int]:
        """Encode ``text`` (coerced with ``str``) into a list of token ids.

        ``add_bos`` / ``add_eos`` prepend / append the corresponding special
        id; each flag is ignored when that id is ``None``.
        """
        token_ids = self.sp.encode(str(text), out_type=int)
        want_bos = add_bos and self.bos_id is not None
        want_eos = add_eos and self.eos_id is not None
        head = [self.bos_id] if want_bos else []
        tail = [self.eos_id] if want_eos else []
        return head + list(token_ids) + tail

    def decode(self, ids: Iterable[int]) -> str:
        """Decode an iterable of ids (each coerced with ``int``) into text."""
        as_ints = list(map(int, ids))
        return self.sp.decode(as_ints)

    def __len__(self) -> int:
        """Return the vocabulary size."""
        return self.vocab_size