"""
Sample usage examples for the Mon language tokenizer.

This script demonstrates various ways to use the Mon tokenizer with the
Hugging Face Transformers library.
"""

import logging
import sys
import time

import torch  # PyTorch backs the return_tensors="pt" outputs used below
from transformers import AutoTokenizer

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def basic_usage_example():
    """Demonstrate basic tokenizer usage."""
    print("=== Basic Usage Example ===")

    tokenizer = AutoTokenizer.from_pretrained("janakhpon/mon_tokenizer")
    print(f"✓ Loaded tokenizer (vocab size: {tokenizer.vocab_size:,})")

    texts = [
        "ဘာသာမန်",
        "ဘာသာမန် ပရူပရာတံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။",
        "ပ္ဍဲအခိင်မာံနဲသဵု မဒှ်ဘဝကွးဘာတက္ကသိုလ်ဂှ် ပါလုပ်ချဳဓရာင်ကၠုင် ပ္ဍဲပရေင်ကမၠောန်ယေန်သၞာင် ကေုာံ လိက်ပတ်မန် ဗွဲကတိုင်ကၟဟ်ရ။",
    ]

    for i, text in enumerate(texts, 1):
        print(f"\nExample {i}:")
        print(f"Input: {text}")

        # Encode the text into token IDs.
        tokens = tokenizer(text, return_tensors="pt")
        input_ids = tokens["input_ids"][0]
        print(f"Token IDs: {input_ids.tolist()}")

        # Map the IDs back to their token strings.
        token_strings = tokenizer.convert_ids_to_tokens(input_ids)
        print(f"Tokens: {token_strings}")

        # Decode and check that the original text survives the round trip.
        decoded = tokenizer.decode(input_ids, skip_special_tokens=True)
        print(f"Decoded: {decoded}")
        print(f"Round-trip success: {text == decoded}")


def batch_processing_example():
    """Demonstrate batch processing."""
    print("\n=== Batch Processing Example ===")

    tokenizer = AutoTokenizer.from_pretrained("janakhpon/mon_tokenizer")

    batch_texts = [
        "ဘာသာမန် ပရူပရာတံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။",
        "မန်တံဂှ် မံင်ပ္ဍဲ ရးမန် ကဵု ရးသေံ။",
        "အရေဝ်ဘာသာမန် ပ္ဍဲလောကဏအ် ဂွံဆဵုကေတ်ရ။",
    ]

    # Tokenize the whole batch at once, padding to the longest sequence
    # and truncating anything beyond max_length.
    batch_tokens = tokenizer(
        batch_texts,
        padding=True,
        truncation=True,
        return_tensors="pt",
        max_length=128,
    )

    print(f"Batch shape: {batch_tokens['input_ids'].shape}")
    print(f"Attention mask shape: {batch_tokens['attention_mask'].shape}")

    for i, text in enumerate(batch_texts):
        # The attention mask sums to the number of non-padding tokens.
        tokens_count = batch_tokens['attention_mask'][i].sum().item()
        decoded = tokenizer.decode(batch_tokens['input_ids'][i], skip_special_tokens=True)
        print(f"Text {i+1}: {tokens_count} tokens -> '{decoded}'")


def advanced_features_example():
    """Demonstrate advanced features."""
    print("\n=== Advanced Features Example ===")

    tokenizer = AutoTokenizer.from_pretrained("janakhpon/mon_tokenizer")
    text = "ဘာသာမန် ပရူပရာတံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။"

    print("Special token handling:")

    # Compare sequence lengths with and without special tokens.
    with_special = tokenizer(text, add_special_tokens=True, return_tensors="pt")
    print(f"  With special tokens: {with_special['input_ids'].shape[1]} tokens")

    without_special = tokenizer(text, add_special_tokens=False, return_tensors="pt")
    print(f"  Without special tokens: {without_special['input_ids'].shape[1]} tokens")

    print("\nSpecial tokens:")
    print(f"  BOS: '{tokenizer.bos_token}' (ID: {tokenizer.bos_token_id})")
    print(f"  EOS: '{tokenizer.eos_token}' (ID: {tokenizer.eos_token_id})")
    print(f"  UNK: '{tokenizer.unk_token}' (ID: {tokenizer.unk_token_id})")
    print(f"  PAD: '{tokenizer.pad_token}' (ID: {tokenizer.pad_token_id})")


def performance_example():
    """Demonstrate performance characteristics."""
    print("\n=== Performance Example ===")

    tokenizer = AutoTokenizer.from_pretrained("janakhpon/mon_tokenizer")

    test_texts = [
        ("Short", "ဘာသာမန်"),
        ("Medium", "ဘာသာမန် ပရူပရာတံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။ မန်တံဂှ် မံင်ပ္ဍဲ ရးမန် ကဵု ရးသေံ။"),
        ("Long", "အရေဝ်ဘာသာမန် ပ္ဍဲလောကဏအ် ဂွံဆဵုကေတ် ပ္ဍဲဍုင်သေံ ကဵု ဍုင်ဗၟာ ရ။ " * 10),
    ]

    for name, text in test_texts:
        char_count = len(text)

        # Average the tokenization time over 100 runs.
        start_time = time.time()
        for _ in range(100):
            tokens = tokenizer(text, return_tensors="pt")
        avg_time = (time.time() - start_time) / 100

        token_count = tokens['input_ids'].shape[1]
        chars_per_sec = char_count / avg_time if avg_time > 0 else 0

        print(f"{name}: {char_count} chars -> {token_count} tokens")
        print(f"  Time: {avg_time*1000:.2f}ms ({chars_per_sec:.0f} chars/sec)")


if __name__ == "__main__":
    print("🚀 Mon Tokenizer Usage Examples")
    print("=" * 50)

    try:
        basic_usage_example()
        batch_processing_example()
        advanced_features_example()
        performance_example()

        print("\n🎉 All examples completed successfully!")
        print("\nFor more information, visit:")
        print("https://huggingface.co/janakhpon/mon_tokenizer")
    except Exception as e:
        print(f"❌ Error running examples: {e}")
        sys.exit(1)