"""
EthioBBPE Example Usage
Demonstrates how to use the trained EthioBBPE tokenizer for Ethiopian languages.
"""
from tokenizers import Tokenizer
import os
def load_tokenizer(model_path="models/EthioBBPE/tokenizer.json"):
    """Return a ``Tokenizer`` loaded from ``model_path`` or the demo model.

    If ``model_path`` exists it is loaded directly. Otherwise the demo
    tokenizer at ``models/demo_tokenizer/tokenizer.json`` is used (with a
    warning printed to stdout) when that file exists.

    Raises:
        FileNotFoundError: if neither ``model_path`` nor the demo
            tokenizer file exists.
    """
    # Happy path: the requested model is present.
    if os.path.exists(model_path):
        return Tokenizer.from_file(model_path)

    # Requested model is missing; the demo model is the only fallback.
    fallback_path = "models/demo_tokenizer/tokenizer.json"
    if not os.path.exists(fallback_path):
        message = f"No tokenizer found at {model_path}. Please train a model first."
        raise FileNotFoundError(message)

    print("⚠️ EthioBBPE model not found, using demo model instead.")
    return Tokenizer.from_file(fallback_path)
def main():
    """Run the EthioBBPE demo on a fixed set of sample sentences.

    Loads a tokenizer with ``load_tokenizer()``, then for each sample text
    prints a preview of its tokens and IDs (first 20 of each), the decoded
    string, and whether decoding reproduced the input exactly. Finishes by
    printing the command for training a new tokenizer.
    """
    print("🇪🇹 EthioBBPE Tokenizer - Example Usage\n")
    print("=" * 50)

    # Load tokenizer
    tokenizer = load_tokenizer()
    # The ``tokenizers.Tokenizer`` class has no ``model_filename`` attribute
    # (reading it raises AttributeError), and this function does not know
    # which path the loader picked, so report the vocabulary size instead.
    print(f"✅ Loaded tokenizer (vocab size: {tokenizer.get_vocab_size()})\n")

    # Test texts in multiple Ethiopian languages
    test_texts = [
        ("Amharic", "ሰላም! እንዴት ነህ? የኢትዮጵያ ህዝብ በጣም ተቀራራቢ ነው።"),
        ("Oromo", "Akkam! Akkam jirta? Ummanni Itoophiyaa baay'ee wal-qabaataa dha."),
        ("Tigrinya", "ሰላም! ከመይ ኣለኻ? ህዝቢ ኢትዮጵያ ኣዝዩ ሓደ እዩ።"),
        ("English", "Hello! How are you? The people of Ethiopia are very united."),
        ("Mixed", "ሰላም Hello! እንዴት ነህ? How are you? 🇪🇹"),
    ]

    # Only this many tokens/IDs are shown per sample to keep output short.
    preview_limit = 20

    for lang_name, text in test_texts:
        print(f"\n--- {lang_name} ---")
        print(f"Original: {text}")

        # Encode
        encoded = tokenizer.encode(text)
        tokens, ids = encoded.tokens, encoded.ids
        print(
            f"Tokens ({len(tokens)}): {tokens[:preview_limit]}"
            f"{'...' if len(tokens) > preview_limit else ''}"
        )
        print(
            f"IDs ({len(ids)}): {ids[:preview_limit]}"
            f"{'...' if len(ids) > preview_limit else ''}"
        )

        # Decode
        decoded = tokenizer.decode(ids)
        print(f"Decoded: {decoded}")

        # Verify round-trip (compare once, reuse for both icon and message)
        round_trip_ok = decoded == text
        match = "✅" if round_trip_ok else "⚠️"
        verdict = "Perfect match!" if round_trip_ok else "Minor differences"
        print(f"Round-trip: {match} {verdict}")

    print("\n" + "=" * 50)
    print("✨ Example usage complete!")
    print("\nTo train your own EthioBBPE tokenizer:")
    print(" python scripts/train_tokenizer.py --data_dir ./data --model_name EthioBBPE")
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()