## Usage

Load the tokenizer from the Hub and inspect how a sample Uyghur sentence is split into sub-word pieces:
```python
from transformers import AutoTokenizer

# trust_remote_code is required because the tokenizer ships custom code
# in its repository.
tokenizer = AutoTokenizer.from_pretrained(
    "piyazon/uyghur-text-tokenizer",
    trust_remote_code=True,
)

text = "بۈگۈن ھاۋا ناھايىتى ياخشى، باغچىغا بېرىپ سەيلە قىلايلى."

# Encode without special tokens and request character offsets so each
# piece can be mapped back to the span of the source string it covers.
enc = tokenizer(
    text,
    add_special_tokens=False,
    return_offsets_mapping=True,
)
input_ids = enc["input_ids"]
offsets = enc["offset_mapping"]

# Print each piece with its id, raw token string, and source span.
for i, (tid, (start, end)) in enumerate(zip(input_ids, offsets)):
    piece_text = text[start:end]
    raw_tok = tokenizer.convert_ids_to_tokens(tid)
    print(
        f"{i:02d}  id={tid:<8} raw_token={raw_tok!r:<20} "
        f"offset=({start},{end}) piece={piece_text!r}"
    )

# Round-trip the ids back to text, then report a simple fertility metric:
# the average number of pieces per whitespace-delimited word.
print("\nDecoded:")
print(tokenizer.decode(input_ids, skip_special_tokens=True))
print("\nNum tokens:", len(input_ids))
words = text.split()
print("PIECES/WORD:", round(len(input_ids) / max(len(words), 1), 2))
```
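Because `return_offsets_mapping` only works with a fast (Rust-backed) tokenizer, the same `BatchEncoding` also exposes `word_ids()`, which maps each piece to the word it came from. The sketch below reuses `tokenizer` and `text` from the snippet above and assumes the tokenizer does load as a fast tokenizer:

```python
from collections import defaultdict

# word_ids() returns one word index per piece (None for special tokens,
# which we skip). Group piece ids by that index to see how each surface
# word was segmented.
enc = tokenizer(text, add_special_tokens=False)
pieces_per_word = defaultdict(list)
for idx, wid in enumerate(enc.word_ids()):
    if wid is not None:
        pieces_per_word[wid].append(enc["input_ids"][idx])

for wid, ids in sorted(pieces_per_word.items()):
    print(f"word {wid}: {tokenizer.convert_ids_to_tokens(ids)}")
```

Note that word boundaries here come from the tokenizer's own pre-tokenizer, so they may differ slightly from a plain whitespace split.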

## Example result

*(Screenshot of the per-token output for the sample sentence; not reproduced here.)*
