Spaces:
Running
Running
File size: 593 Bytes
"""
Fallback segmentation utilities.
Used for offline tests or snapshot generation when model tokenizers
are unavailable.
"""
from typing import Dict, List
def fallback_token_info(text: str) -> Dict[str, List]:
    """Build placeholder token info from UTF-8 code point boundaries.

    Walks ``text`` one character at a time and records the cumulative
    UTF-8 byte offset reached after each character, starting from 0.
    No tokenizer is consulted, so every tokenizer-specific field is
    returned empty.

    Args:
        text: The string to segment.

    Returns:
        A dict with the keys ``common_boundaries`` (list of byte offsets,
        always beginning with 0 and containing ``len(text) + 1`` entries),
        ``qwen_tokens`` and ``rwkv_tokens`` (empty lists), and
        ``byte_to_qwen`` and ``byte_to_rwkv`` (empty dicts).
    """
    offsets = [0]
    for char in text:
        # Each character contributes 1-4 bytes once encoded as UTF-8;
        # the next boundary is the previous one plus that width.
        width = len(char.encode("utf-8"))
        offsets.append(offsets[-1] + width)

    info = {
        "common_boundaries": offsets,
        "qwen_tokens": [],
        "rwkv_tokens": [],
        "byte_to_qwen": {},
        "byte_to_rwkv": {},
    }
    return info
|