File size: 453 Bytes
6ea3105
ebc3bf5
 
 
6ea3105
ebc3bf5
 
 
 
 
6ea3105
ebc3bf5
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
from models.model_loader import get_tokenizer_only


def count_tokens(text: str) -> int:
    """Return how many token ids ``text`` encodes to, excluding special tokens."""
    token_ids = get_tokenizer_only().encode(text, add_special_tokens=False)
    return len(token_ids)


def get_token_strings(text: str) -> list[str]:
    """Return one decoded string per token of ``text``.

    The text is encoded without special tokens, then each resulting id is
    decoded on its own, so the list has one entry per token in encoding order.
    """
    tok = get_tokenizer_only()
    token_ids = tok.encode(text, add_special_tokens=False)
    # Decode single-id lists so every token maps to its own string.
    return [tok.decode([token_id]) for token_id in token_ids]