"""
Example usage of Turkish Tokenizer in the data pipeline.
This demonstrates how to use the Turkish morphological tokenizer
for training language models on Turkish text.
"""
from data import create_dataloader, TurkishTokenizerWrapper, TURKISH_TOKENIZER_AVAILABLE
from model import ModelArgs
def _demo_tokenizer():
    """Stage 1: build the wrapper and round-trip a sample sentence.

    Returns the constructed TurkishTokenizerWrapper so later stages can
    reuse it for vocab size and decoding.
    """
    print("\n1️⃣ Testing Turkish Tokenizer Wrapper")
    tokenizer = TurkishTokenizerWrapper()
    print(f" Tokenizer: {tokenizer.name}")
    print(f" Vocabulary size: {tokenizer.n_vocab:,}")

    # Test encoding/decoding on a short sentence.
    sample = "Kitapları okuyorum ve öğreniyorum."
    tokens = tokenizer.encode(sample)
    decoded = tokenizer.decode(tokens)
    print(f"\n Original: {sample}")
    # Truncate long token lists so the console output stays readable.
    if len(tokens) > 20:
        print(f" Tokens ({len(tokens)}): {tokens[:20]}...")
    else:
        print(f" Tokens: {tokens}")
    print(f" Decoded: {decoded}")
    return tokenizer


def _demo_dataloader(tokenizer, text):
    """Stage 2: create a DataLoader over `text` with the Turkish tokenizer enabled.

    Returns the DataLoader so stage 3 can iterate it.
    """
    print("\n2️⃣ Creating DataLoader with Turkish Tokenizer")
    args = ModelArgs(
        max_seq_len=128,
        max_batch_size=8,
        vocab_size=tokenizer.n_vocab,  # Important: set vocab size for model
    )
    dataloader = create_dataloader(
        txt=text,
        args=args,
        stride=64,  # 50% overlap
        shuffle=True,
        num_workers=0,
        max_samples=50,  # Limit for testing
        use_turkish_tokenizer=True,  # Enable Turkish tokenizer
    )
    print("\n ✅ DataLoader created successfully!")
    print(f" Sequence length: {args.max_seq_len}")
    print(f" Batch size: {args.max_batch_size}")
    print(f" Total batches: {len(dataloader)}")
    print(f" Total samples: {len(dataloader.dataset)}")
    return dataloader


def _demo_first_batch(dataloader, tokenizer):
    """Stage 3: pull one (input, target) batch and show shapes plus a decoded sample."""
    print("\n3️⃣ Testing First Batch")
    for batch_idx, (input_ids, target_ids) in enumerate(dataloader):
        print(f"\n Batch {batch_idx}:")
        print(f" input_ids shape: {input_ids.shape}")
        print(f" target_ids shape: {target_ids.shape}")
        print(f" input_ids range: [{input_ids.min().item()}, {input_ids.max().item()}]")
        print(f" Sample input (first 10 tokens): {input_ids[0, :10].tolist()}")
        print(f" Decoded sample: {tokenizer.decode(input_ids[0, :30].tolist())}")
        break  # One batch is enough for the demo.


def _print_usage_tips():
    """Print quick-reference notes on using the tokenizer in training code."""
    print("\n💡 Usage Tips:")
    print(" • Set vocab_size in ModelArgs to tokenizer.n_vocab")
    print(" • Use use_turkish_tokenizer=True in create_dataloader()")
    print(" • Turkish tokenizer handles morphological analysis automatically")
    print(" • Vocabulary size is optimized for Turkish language")
    print("\n📚 To use in training:")
    print(" tokenizer = TurkishTokenizerWrapper()")
    print(" args = ModelArgs(vocab_size=tokenizer.n_vocab, ...)")
    print(" dataloader = create_dataloader(..., use_turkish_tokenizer=True)")


def main():
    """Example of using the Turkish tokenizer with the data pipeline.

    Runs three stages — tokenizer round-trip, DataLoader construction, and
    first-batch inspection — then prints usage tips. Exits early with an
    install hint when the optional `turkish-tokenizer` package is missing.
    Returns None.
    """
    if not TURKISH_TOKENIZER_AVAILABLE:
        print("❌ Turkish tokenizer is not installed!")
        print("Install it with: pip install turkish-tokenizer")
        return

    # Sample Turkish text, repeated so there is enough data to form batches.
    turkish_text = """
    Merhaba! Bu bir Türkçe metin örneğidir.
    İstanbul'da yaşıyorum ve Türkçe dilini öğreniyorum.
    Kitap okumak çok güzeldir ve bilgi verir.
    Türkiye Cumhuriyeti'nin başkenti Ankara'dır.
    Yapay zeka ve makine öğrenmesi teknolojileri gelişiyor.
    """ * 100  # Repeat to have enough text for training

    print("=" * 60)
    print("TURKISH TOKENIZER EXAMPLE")
    print("=" * 60)

    tokenizer = _demo_tokenizer()
    dataloader = _demo_dataloader(tokenizer, turkish_text)
    _demo_first_batch(dataloader, tokenizer)

    print("\n" + "=" * 60)
    print("✅ Turkish Tokenizer Example Complete!")
    print("=" * 60)
    _print_usage_tips()
# Run the demo only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()