tiny-press / core /tokenizer_utils.py
sriharsha-cr's picture
pr/2 (#2)
6ea3105
raw
history blame contribute delete
453 Bytes
from models.model_loader import get_tokenizer_only
def count_tokens(text: str) -> int:
    """Return how many token ids the shared tokenizer produces for *text*.

    The text is encoded with ``add_special_tokens=False``, so only the ids
    produced for the text itself are counted.
    """
    token_ids = get_tokenizer_only().encode(text, add_special_tokens=False)
    return len(token_ids)
def get_token_strings(text: str) -> list[str]:
    """Return one decoded string per token id obtained by encoding *text*.

    The text is encoded once with ``add_special_tokens=False``; each
    resulting id is then decoded on its own, so the returned list has the
    same length and order as the encoded id sequence.
    """
    tok = get_tokenizer_only()
    # Decode ids one at a time (single-element lists) rather than as a whole
    # sequence, so every token keeps its own entry in the result.
    # NOTE(review): on byte-level vocabularies a lone id may decode to a
    # partial/replacement character — confirm this is acceptable to callers.
    return [
        tok.decode([token_id])
        for token_id in tok.encode(text, add_special_tokens=False)
    ]