#!/usr/bin/env python3
"""
Sample usage examples for the Mon language tokenizer.

This script demonstrates various ways to use the Mon tokenizer with
the Hugging Face Transformers library.
"""

import logging
import sys
import time

import torch  # not referenced directly, but required for return_tensors="pt"
from transformers import AutoTokenizer

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def basic_usage_example():
    """Demonstrate basic tokenizer usage."""
    print("=== Basic Usage Example ===")
    
    # Load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained("janakhpon/mon_tokenizer")
    print(f"✓ Loaded tokenizer (vocab size: {tokenizer.vocab_size:,})")

    # Example Mon texts
    texts = [
        "ဘာသာမန်",
        "ဘာသာမန် ပရူပရာတံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။",
        "ပ္ဍဲအခိင်မာံနဲသဵု မဒှ်ဘဝကွးဘာတက္ကသိုလ်ဂှ် ပါလုပ်ချဳဓရာင်ကၠုင် ပ္ဍဲပရေင်ကမၠောန်ယေန်သၞာင် ကေုာံ လိက်ပတ်မန် ဗွဲကတိုင်ကၟဟ်ရ။"
    ]

    for i, text in enumerate(texts, 1):
        print(f"\nExample {i}:")
        print(f"Input: {text}")
        
        # Tokenize the text
        tokens = tokenizer(text, return_tensors="pt")
        input_ids = tokens["input_ids"][0]

        # Print results
        print(f"Token IDs: {input_ids.tolist()}")
        
        # Convert to token strings
        token_strings = tokenizer.convert_ids_to_tokens(input_ids)
        print(f"Tokens: {token_strings}")

        # Decode back to text
        decoded = tokenizer.decode(input_ids, skip_special_tokens=True)
        print(f"Decoded: {decoded}")
        print(f"Round-trip success: {text == decoded}")


def batch_processing_example():
    """Demonstrate batch processing."""
    print("\n=== Batch Processing Example ===")
    
    tokenizer = AutoTokenizer.from_pretrained("janakhpon/mon_tokenizer")
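
    # Padding requires a PAD token. Some tokenizers are published without one;
    # if that is the case here (an assumption, not verified for mon_tokenizer),
    # falling back to the EOS token is a common workaround.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token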
    
    # Multiple texts for batch processing
    batch_texts = [
        "ဘာသာမန် ပရူပရာတံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။",
        "မန်တံဂှ် မံင်ပ္ဍဲ ရးမန် ကဵု ရးသေံ။",
        "အရေဝ်ဘာသာမန် ပ္ဍဲလောကဏအ် ဂွံဆဵုကေတ်ရ။"
    ]

    # Batch tokenization with padding
    batch_tokens = tokenizer(
        batch_texts,
        padding=True,
        truncation=True,
        return_tensors="pt",
        max_length=128
    )

    print(f"Batch shape: {batch_tokens['input_ids'].shape}")
    print(f"Attention mask shape: {batch_tokens['attention_mask'].shape}")
    
    # Process each item
    for i, text in enumerate(batch_texts):
        tokens_count = batch_tokens['attention_mask'][i].sum().item()
        decoded = tokenizer.decode(batch_tokens['input_ids'][i], skip_special_tokens=True)
        print(f"Text {i+1}: {tokens_count} tokens -> '{decoded}'")


def advanced_features_example():
    """Demonstrate advanced features."""
    print("\n=== Advanced Features Example ===")
    
    tokenizer = AutoTokenizer.from_pretrained("janakhpon/mon_tokenizer")
    text = "ဘာသာမန် ပရူပရာတံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။"

    # Different tokenization options
    print("Special token handling:")
    
    # With special tokens
    with_special = tokenizer(text, add_special_tokens=True, return_tensors="pt")
    print(f"  With special tokens: {with_special['input_ids'].shape[1]} tokens")
    
    # Without special tokens  
    without_special = tokenizer(text, add_special_tokens=False, return_tensors="pt")
    print(f"  Without special tokens: {without_special['input_ids'].shape[1]} tokens")
    
    # Special token info
    print(f"\nSpecial tokens:")
    print(f"  BOS: '{tokenizer.bos_token}' (ID: {tokenizer.bos_token_id})")
    print(f"  EOS: '{tokenizer.eos_token}' (ID: {tokenizer.eos_token_id})")
    print(f"  UNK: '{tokenizer.unk_token}' (ID: {tokenizer.unk_token_id})")
    print(f"  PAD: '{tokenizer.pad_token}' (ID: {tokenizer.pad_token_id})")


def performance_example():
    """Demonstrate performance characteristics."""
    print("\n=== Performance Example ===")
    
    tokenizer = AutoTokenizer.from_pretrained("janakhpon/mon_tokenizer")
    
    test_texts = [
        ("Short", "ဘာသာမန်"),
        ("Medium", "ဘာသာမန် ပရူပရာတံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။ မန်တံဂှ် မံင်ပ္ဍဲ ရးမန် ကဵု ရးသေံ။"),
        ("Long", "အရေဝ်ဘာသာမန် ပ္ဍဲလောကဏအ် ဂွံဆဵုကေတ် ပ္ဍဲဍုင်သေံ ကဵု ဍုင်ဗၟာ ရ။ " * 10)
    ]
    
    for name, text in test_texts:
        char_count = len(text)
        
        # Measure tokenization time
        start_time = time.time()
        for _ in range(100):  # Average over 100 runs
            tokens = tokenizer(text, return_tensors="pt")
        avg_time = (time.time() - start_time) / 100
        
        token_count = tokens['input_ids'].shape[1] 
        chars_per_sec = char_count / avg_time if avg_time > 0 else 0
        
        print(f"{name}: {char_count} chars -> {token_count} tokens")
        print(f"  Time: {avg_time*1000:.2f}ms ({chars_per_sec:.0f} chars/sec)")


if __name__ == "__main__":
    print("🚀 Mon Tokenizer Usage Examples")
    print("=" * 50)
    
    try:
        basic_usage_example()
        batch_processing_example() 
        advanced_features_example()
        performance_example()
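        local_save_example()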
        
        print(f"\n🎉 All examples completed successfully!")
        print(f"\nFor more information, visit:")
        print(f"https://huggingface.co/janakhpon/mon_tokenizer")
        
    except Exception as e:
        print(f"❌ Error running examples: {e}")
        sys.exit(1)