File size: 2,017 Bytes

5762a7c

#!/usr/bin/env python3
"""Public tokenizer wrapper for SymbolicLight.

This release file intentionally contains no corpus download or tokenizer-training
logic. It only loads the released SentencePiece model.
"""

from __future__ import annotations

from pathlib import Path
from typing import Iterable

import sentencepiece as spm


def _resolve_model_path(model_path: str | Path | None = None) -> str:
    """Locate the SentencePiece model file and return its path as a string.

    Candidates are probed in order and the first one that is a regular file
    wins:

    1. ``model_path`` exactly as given, then joined onto this module's
       directory, then joined onto the parent of that directory (only when
       ``model_path`` is truthy);
    2. ``<parent dir>/tokenizer/sl_tokenizer.model``;
    3. ``<module dir>/sl_tokenizer.model``.

    Note that an explicit ``model_path`` that cannot be found does not raise
    by itself; the bundled default locations are still tried afterwards.

    Args:
        model_path: Optional explicit location of the model file. ``None``
            and the empty string skip straight to the default locations.

    Returns:
        The first candidate that is an existing regular file, as ``str``.

    Raises:
        FileNotFoundError: If none of the candidates is an existing file.
            The message lists every distinct path that was tried.
    """
    here = Path(__file__).resolve().parent
    package_root = here.parent

    candidates: list[Path] = []
    if model_path:
        path = Path(model_path)
        candidates += [path, here / path, package_root / path]
    candidates += [
        package_root / "tokenizer" / "sl_tokenizer.model",
        here / "sl_tokenizer.model",
    ]
    # Joining a directory with an absolute path yields that absolute path
    # again, so the list can contain repeats; drop them, keeping the order.
    candidates = list(dict.fromkeys(candidates))

    for candidate in candidates:
        # is_file() rather than exists(): a directory that happens to match
        # can never be loaded as a model and must not shadow a real file.
        if candidate.is_file():
            return str(candidate)

    tried = ", ".join(str(candidate) for candidate in candidates)
    raise FileNotFoundError(f"SentencePiece model not found. Tried: {tried}")


class SLTokenizer:
    """Thin convenience layer over a loaded ``SentencePieceProcessor``.

    Instance attributes set by ``__init__``:
      * ``model_path`` - resolved model location (string).
      * ``sp``         - the underlying SentencePiece processor.
      * ``vocab_size`` - vocabulary size reported by the processor.
      * ``bos_id`` / ``eos_id`` - special-token ids, or ``None`` when the
        processor reports a negative id for them.
    """

    def __init__(self, model_path: str | Path | None = None):
        """Resolve the model file, load it and cache vocabulary metadata."""
        resolved = _resolve_model_path(model_path)
        processor = spm.SentencePieceProcessor(model_file=resolved)

        self.model_path = resolved
        self.sp = processor
        self.vocab_size = int(processor.vocab_size())

        # A negative id is stored as None so callers can test "is not None".
        raw_bos = processor.bos_id()
        raw_eos = processor.eos_id()
        self.bos_id = None if raw_bos < 0 else int(raw_bos)
        self.eos_id = None if raw_eos < 0 else int(raw_eos)

    def encode(self, text: str, add_bos: bool = False, add_eos: bool = False) -> list[int]:
        """Tokenize ``text`` (coerced with ``str``) into a list of ids.

        ``add_bos`` / ``add_eos`` prepend / append the corresponding special
        id, but only when that id is available (not ``None``).
        """
        body = self.sp.encode(str(text), out_type=int)
        head = [self.bos_id] if (add_bos and self.bos_id is not None) else []
        tail = [self.eos_id] if (add_eos and self.eos_id is not None) else []
        return [*head, *body, *tail]

    def decode(self, ids: Iterable[int]) -> str:
        """Turn an iterable of ids (each coerced with ``int``) back into text."""
        return self.sp.decode(list(map(int, ids)))

    def __len__(self) -> int:
        """Return the cached vocabulary size."""
        return self.vocab_size