# File size: 2,499 Bytes
#
# d681ea8

"""
EthioBBPE Example Usage

Demonstrates how to use the trained EthioBBPE tokenizer for Ethiopian languages.
"""

from tokenizers import Tokenizer
import os

def load_tokenizer(model_path="models/EthioBBPE/tokenizer.json"):
    """Return a ``Tokenizer`` read from ``model_path``, or from the demo model.

    If ``model_path`` does not exist on disk, the demo tokenizer at
    ``models/demo_tokenizer/tokenizer.json`` is loaded instead and a warning
    is printed.

    Raises:
        FileNotFoundError: neither ``model_path`` nor the demo tokenizer file
            exists.
    """
    fallback_path = "models/demo_tokenizer/tokenizer.json"

    # Preferred case: the requested model file is present.
    if os.path.exists(model_path):
        return Tokenizer.from_file(model_path)

    # Nothing to fall back to; the message names the path that was requested.
    if not os.path.exists(fallback_path):
        raise FileNotFoundError(
            f"No tokenizer found at {model_path}. Please train a model first."
        )

    print("⚠️  EthioBBPE model not found, using demo model instead.")
    return Tokenizer.from_file(fallback_path)


def main():
    """Run the EthioBBPE demo: encode and decode a set of sample sentences.

    Loads the tokenizer with ``load_tokenizer()`` and, for each sample text,
    prints the original text, a preview of its tokens and IDs (at most the
    first 20 of each), the text decoded back from the IDs, and whether the
    decoded text equals the input exactly.

    Raises:
        FileNotFoundError: propagated from ``load_tokenizer()`` when no
            tokenizer file is available.
    """
    print("🇪🇹 EthioBBPE Tokenizer - Example Usage\n")
    print("=" * 50)

    # Load tokenizer
    tokenizer = load_tokenizer()
    # ``Tokenizer`` has no ``model_filename`` attribute (reading it raises
    # AttributeError), so report the vocabulary size instead of a path.
    print(f"✅ Loaded tokenizer (vocab size: {tokenizer.get_vocab_size()})\n")

    # Test texts in multiple Ethiopian languages
    test_texts = [
        ("Amharic", "ሰላም! እንዴት ነህ? የኢትዮጵያ ህዝብ በጣም ተቀራራቢ ነው።"),
        ("Oromo", "Akkam! Akkam jirta? Ummanni Itoophiyaa baay'ee wal-qabaataa dha."),
        ("Tigrinya", "ሰላም! ከመይ ኣለኻ? ህዝቢ ኢትዮጵያ ኣዝዩ ሓደ እዩ።"),
        ("English", "Hello! How are you? The people of Ethiopia are very united."),
        ("Mixed", "ሰላም Hello! እንዴት ነህ? How are you? 🇪🇹"),
    ]

    # Maximum number of tokens / IDs shown per sample before eliding with "...".
    preview_limit = 20

    for lang_name, text in test_texts:
        print(f"\n--- {lang_name} ---")
        print(f"Original: {text}")

        # Encode
        encoded = tokenizer.encode(text)
        tokens, ids = encoded.tokens, encoded.ids
        tokens_more = "..." if len(tokens) > preview_limit else ""
        ids_more = "..." if len(ids) > preview_limit else ""
        print(f"Tokens ({len(tokens)}): {tokens[:preview_limit]}{tokens_more}")
        print(f"IDs ({len(ids)}): {ids[:preview_limit]}{ids_more}")

        # Decode
        decoded = tokenizer.decode(ids)
        print(f"Decoded: {decoded}")

        # Verify round-trip; compare once and derive both the icon and the text.
        is_exact = decoded == text
        match = "✅" if is_exact else "⚠️"
        verdict = "Perfect match!" if is_exact else "Minor differences"
        print(f"Round-trip: {match} {verdict}")

    print("\n" + "=" * 50)
    print("✨ Example usage complete!")
    print("\nTo train your own EthioBBPE tokenizer:")
    print("  python scripts/train_tokenizer.py --data_dir ./data --model_name EthioBBPE")


# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()