File size: 6,135 Bytes
81cf36d f51150c 81cf36d f51150c 81cf36d f51150c 81cf36d f51150c 81cf36d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 |
#!/usr/bin/env python3
"""
Sample usage examples for the Mon language tokenizer.
This script demonstrates various ways to use the Mon tokenizer with
the Hugging Face Transformers library.
"""
import logging
import time
from typing import List, Dict, Any
import torch
from transformers import AutoTokenizer
# Configure logging: set the root logger to INFO level, then create a
# module-level logger named after this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def basic_usage_example():
    """Walk through a tokenize -> inspect -> decode round trip.

    Loads the Mon tokenizer, then for each sample sentence prints the
    token IDs, the token strings, the decoded text, and whether the
    decoded text is equal to the original input.
    """
    print("=== Basic Usage Example ===")

    tokenizer = AutoTokenizer.from_pretrained("janakhpon/mon_tokenizer")
    print(f"✓ Loaded tokenizer (vocab size: {tokenizer.vocab_size:,})")

    # Sample Mon sentences, ordered from shortest to longest.
    samples = (
        "ဘာသာမန်",
        "ဘာသာမန် ပရူပရာတံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။",
        "ပ္ဍဲအခိင်မာံနဲသဵု မဒှ်ဘဝကွးဘာတက္ကသိုလ်ဂှ် ပါလုပ်ချဳဓရာင်ကၠုင် ပ္ဍဲပရေင်ကမၠောန်ယေန်သၞာင် ကေုာံ လိက်ပတ်မန် ဗွဲကတိုင်ကၟဟ်ရ။",
    )

    for number, sample in enumerate(samples, start=1):
        print(f"\nExample {number}:")
        print(f"Input: {sample}")

        # The tokenizer returns a batch of one; take its only row.
        encoding = tokenizer(sample, return_tensors="pt")
        ids = encoding["input_ids"][0]
        print(f"Token IDs: {ids.tolist()}")

        # Map the IDs back to their token strings.
        pieces = tokenizer.convert_ids_to_tokens(ids)
        print(f"Tokens: {pieces}")

        # Decode to plain text, leaving special tokens out, and compare
        # the result with the input.
        restored = tokenizer.decode(ids, skip_special_tokens=True)
        print(f"Decoded: {restored}")
        round_trip_ok = sample == restored
        print(f"Round-trip success: {round_trip_ok}")
def batch_processing_example():
    """Tokenize several sentences in one padded batch and report on each.

    Prints the shapes of the ``input_ids`` and ``attention_mask`` tensors,
    then for every row prints the sum of its attention mask (labelled as
    the token count) together with the decoded text.
    """
    print("\n=== Batch Processing Example ===")

    tokenizer = AutoTokenizer.from_pretrained("janakhpon/mon_tokenizer")

    # Sentences that are encoded together in a single call.
    batch_texts = [
        "ဘာသာမန် ပရူပရာတံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။",
        "မန်တံဂှ် မံင်ပ္ဍဲ ရးမန် ကဵု ရးသေံ။",
        "အရေဝ်ဘာသာမန် ပ္ဍဲလောကဏအ် ဂွံဆဵုကေတ်ရ။",
    ]

    # Padding and truncation are enabled so the batch can be returned as
    # one tensor per field.
    encoded = tokenizer(
        batch_texts,
        max_length=128,
        truncation=True,
        padding=True,
        return_tensors="pt",
    )

    print(f"Batch shape: {encoded['input_ids'].shape}")
    print(f"Attention mask shape: {encoded['attention_mask'].shape}")

    # Walk the batch row by row; rows come back in input order.
    rows = zip(encoded["attention_mask"], encoded["input_ids"])
    for position, (mask_row, id_row) in enumerate(rows, start=1):
        real_tokens = mask_row.sum().item()
        restored = tokenizer.decode(id_row, skip_special_tokens=True)
        print(f"Text {position}: {real_tokens} tokens -> '{restored}'")
def advanced_features_example():
    """Show special-token handling and list the tokenizer's special tokens.

    Encodes one sentence with and without ``add_special_tokens`` and prints
    the resulting sequence lengths, then prints the BOS, EOS, UNK and PAD
    tokens together with their IDs.
    """
    print("\n=== Advanced Features Example ===")

    tokenizer = AutoTokenizer.from_pretrained("janakhpon/mon_tokenizer")
    sentence = "ဘာသာမန် ပရူပရာတံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။"

    def sequence_length(add_special):
        # Length of the single encoded sequence (second tensor dimension).
        encoding = tokenizer(
            sentence, add_special_tokens=add_special, return_tensors="pt"
        )
        return encoding["input_ids"].shape[1]

    print("Special token handling:")
    print(f" With special tokens: {sequence_length(True)} tokens")
    print(f" Without special tokens: {sequence_length(False)} tokens")

    # Report each special token and its ID, in BOS/EOS/UNK/PAD order.
    print("\nSpecial tokens:")
    for kind in ("bos", "eos", "unk", "pad"):
        token = getattr(tokenizer, f"{kind}_token")
        token_id = getattr(tokenizer, f"{kind}_token_id")
        print(f" {kind.upper()}: '{token}' (ID: {token_id})")
def performance_example():
    """Measure average tokenization time for short, medium and long inputs.

    For each sample text the tokenizer is called once untimed (warm-up),
    then timed over a fixed number of runs. Prints the character count,
    token count, average time per call in milliseconds, and the derived
    characters-per-second throughput.
    """
    print("\n=== Performance Example ===")

    tokenizer = AutoTokenizer.from_pretrained("janakhpon/mon_tokenizer")

    # Number of timed calls averaged per sample text.
    num_runs = 100

    test_texts = [
        ("Short", "ဘာသာမန်"),
        ("Medium", "ဘာသာမန် ပရူပရာတံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။ မန်တံဂှ် မံင်ပ္ဍဲ ရးမန် ကဵု ရးသေံ။"),
        ("Long", "အရေဝ်ဘာသာမန် ပ္ဍဲလောကဏအ် ဂွံဆဵုကေတ် ပ္ဍဲဍုင်သေံ ကဵု ဍုင်ဗၟာ ရ။ " * 10),
    ]

    for name, text in test_texts:
        char_count = len(text)

        # Untimed warm-up call: keeps any one-time setup cost out of the
        # measurement and supplies the token count for the report.
        tokens = tokenizer(text, return_tensors="pt")
        token_count = tokens["input_ids"].shape[1]

        # perf_counter() is monotonic and uses the highest-resolution clock
        # available, unlike time.time(), which can jump when the system
        # clock is adjusted and may be too coarse for sub-millisecond calls.
        start_time = time.perf_counter()
        for _ in range(num_runs):
            tokenizer(text, return_tensors="pt")
        avg_time = (time.perf_counter() - start_time) / num_runs

        # Guard against a zero elapsed time on very coarse clocks.
        chars_per_sec = char_count / avg_time if avg_time > 0 else 0

        print(f"{name}: {char_count} chars -> {token_count} tokens")
        print(f" Time: {avg_time*1000:.2f}ms ({chars_per_sec:.0f} chars/sec)")
if __name__ == "__main__":
    # Script entry point: run every example in order and exit with status 1
    # if any of them raises.
    print("🚀 Mon Tokenizer Usage Examples")
    print("=" * 50)
    try:
        basic_usage_example()
        batch_processing_example()
        advanced_features_example()
        performance_example()
    except Exception as e:
        # Top-level boundary: log the full traceback so the failure can be
        # debugged, then show the short user-facing message.
        logger.exception("Example run failed")
        print(f"❌ Error running examples: {e}")
        # Raise SystemExit directly; the bare exit() helper is injected by
        # the site module and is not guaranteed to be available.
        raise SystemExit(1) from e
    else:
        print("\n🎉 All examples completed successfully!")
        print("\nFor more information, visit:")
        print("https://huggingface.co/janakhpon/mon_tokenizer")