| """ |
| EthioBBPE Example Usage |
| |
| Demonstrates how to use the trained EthioBBPE tokenizer for Ethiopian languages. |
| """ |
|
|
| from tokenizers import Tokenizer |
| import os |
|
|
def load_tokenizer(model_path: str = "models/EthioBBPE/tokenizer.json") -> "Tokenizer":
    """Load the trained EthioBBPE tokenizer, falling back to the demo model.

    Args:
        model_path: Path to a serialized ``tokenizer.json`` file.

    Returns:
        The tokenizer loaded from ``model_path`` or, when that file is
        missing, from ``models/demo_tokenizer/tokenizer.json``.

    Raises:
        FileNotFoundError: If neither ``model_path`` nor the demo tokenizer
            is an existing file.
    """
    demo_path = "models/demo_tokenizer/tokenizer.json"

    # Use isfile rather than exists: a directory at the path (for example the
    # model folder passed instead of its tokenizer.json) cannot be loaded and
    # should take the same fallback/error route as a missing file.
    if not os.path.isfile(model_path):
        if not os.path.isfile(demo_path):
            raise FileNotFoundError(
                f"No tokenizer found at {model_path}. Please train a model first."
            )
        print("⚠️ EthioBBPE model not found, using demo model instead.")
        model_path = demo_path

    return Tokenizer.from_file(model_path)
|
|
|
|
def _preview(items, limit=20):
    """Render the first ``limit`` items, appending '...' when truncated."""
    suffix = "..." if len(items) > limit else ""
    return f"{items[:limit]}{suffix}"


def main():
    """Run the EthioBBPE demo: encode, decode and round-trip sample texts.

    Loads the tokenizer via ``load_tokenizer()``, then for each sample
    sentence prints the original text, a preview of its tokens and ids, the
    decoded text, and whether decoding reproduced the input exactly.
    """
    print("🇪🇹 EthioBBPE Tokenizer - Example Usage\n")
    print("=" * 50)

    tokenizer = load_tokenizer()
    # `tokenizers.Tokenizer` exposes no `model_filename` attribute, so the
    # load confirmation reports the vocabulary size instead of a path.
    print(f"✅ Loaded tokenizer (vocabulary size: {tokenizer.get_vocab_size()})\n")

    test_texts = [
        ("Amharic", "ሰላም! እንዴት ነህ? የኢትዮጵያ ህዝብ በጣም ተቀራራቢ ነው።"),
        ("Oromo", "Akkam! Akkam jirta? Ummanni Itoophiyaa baay'ee wal-qabaataa dha."),
        ("Tigrinya", "ሰላም! ከመይ ኣለኻ? ህዝቢ ኢትዮጵያ ኣዝዩ ሓደ እዩ።"),
        ("English", "Hello! How are you? The people of Ethiopia are very united."),
        ("Mixed", "ሰላም Hello! እንዴት ነህ? How are you? 🇪🇹"),
    ]

    for lang_name, text in test_texts:
        print(f"\n--- {lang_name} ---")
        print(f"Original: {text}")

        encoded = tokenizer.encode(text)
        print(f"Tokens ({len(encoded.tokens)}): {_preview(encoded.tokens)}")
        print(f"IDs ({len(encoded.ids)}): {_preview(encoded.ids)}")

        decoded = tokenizer.decode(encoded.ids)
        print(f"Decoded: {decoded}")

        # Compare once and derive both the marker and the message from it.
        if decoded == text:
            print("Round-trip: ✅ Perfect match!")
        else:
            print("Round-trip: ⚠️ Minor differences")

    print("\n" + "=" * 50)
    print("✨ Example usage complete!")
    print("\nTo train your own EthioBBPE tokenizer:")
    print(" python scripts/train_tokenizer.py --data_dir ./data --model_name EthioBBPE")
|
|
|
|
# Run the demo only when this file is executed as a script, so importing the
# module (e.g. to reuse load_tokenizer) has no side effects.
if __name__ == "__main__":
    main()
|
|