"""
Example usage of the Turkish tokenizer in the data pipeline.

This demonstrates how to use the Turkish morphological tokenizer
for training language models on Turkish text.
"""

from data import create_dataloader, TurkishTokenizerWrapper, TURKISH_TOKENIZER_AVAILABLE
from model import ModelArgs

def _demo_tokenizer():
    """Instantiate the Turkish tokenizer and print an encode/decode round trip.

    Returns:
        The ``TurkishTokenizerWrapper`` instance, so the caller can reuse it
        to size the model vocabulary and to decode batches.
    """
    print("\n1️⃣  Testing Turkish Tokenizer Wrapper")
    tokenizer = TurkishTokenizerWrapper()
    print(f"   Tokenizer: {tokenizer.name}")
    print(f"   Vocabulary size: {tokenizer.n_vocab:,}")

    # Round-trip one sentence through encode() and decode()
    sample = "Kitapları okuyorum ve öğreniyorum."
    tokens = tokenizer.encode(sample)
    decoded = tokenizer.decode(tokens)

    print(f"\n   Original: {sample}")
    # Long token lists are truncated to the first 20 ids to keep output short
    if len(tokens) > 20:
        print(f"   Tokens ({len(tokens)}): {tokens[:20]}...")
    else:
        print(f"   Tokens: {tokens}")
    print(f"   Decoded: {decoded}")

    return tokenizer


def _report_first_batch(dataloader, tokenizer):
    """Print shapes, id range and a decoded preview of the first batch.

    Args:
        dataloader: Iterable whose items unpack into ``(input_ids, target_ids)``.
        tokenizer: Object whose ``decode`` turns a list of token ids into text.
    """
    print("\n3️⃣  Testing First Batch")

    # next() with a default instead of for/break, so that an empty dataloader
    # is reported explicitly rather than silently printing nothing.
    first_batch = next(iter(dataloader), None)
    if first_batch is None:
        print("   ❌ DataLoader produced no batches - nothing to inspect.")
        return

    input_ids, target_ids = first_batch
    print("\n   Batch 0:")
    print(f"   input_ids shape: {input_ids.shape}")
    print(f"   target_ids shape: {target_ids.shape}")
    print(f"   input_ids range: [{input_ids.min().item()}, {input_ids.max().item()}]")
    print(f"   Sample input (first 10 tokens): {input_ids[0, :10].tolist()}")
    print(f"   Decoded sample: {tokenizer.decode(input_ids[0, :30].tolist())}")


def _print_usage_tips():
    """Print the closing usage tips and the minimal training snippet."""
    print("\n💡 Usage Tips:")
    print("   • Set vocab_size in ModelArgs to tokenizer.n_vocab")
    print("   • Use use_turkish_tokenizer=True in create_dataloader()")
    print("   • Turkish tokenizer handles morphological analysis automatically")
    print("   • Vocabulary size is optimized for Turkish language")
    print("\n📚 To use in training:")
    print("   tokenizer = TurkishTokenizerWrapper()")
    print("   args = ModelArgs(vocab_size=tokenizer.n_vocab, ...)")
    print("   dataloader = create_dataloader(..., use_turkish_tokenizer=True)")


def main() -> None:
    """Run the Turkish tokenizer demo end to end, printing each step.

    Steps, in order: check that the tokenizer package is available, round-trip
    a sample sentence, build a dataloader over repeated sample text, and
    inspect the first batch. Prints install instructions and returns early
    when ``TURKISH_TOKENIZER_AVAILABLE`` is false.
    """
    if not TURKISH_TOKENIZER_AVAILABLE:
        print("❌ Turkish tokenizer is not installed!")
        print("Install it with: pip install turkish-tokenizer")
        return

    # Sample Turkish text
    # NOTE: the literal keeps its source indentation, so every line of the
    # sample text starts with four spaces.
    turkish_text = """
    Merhaba! Bu bir Türkçe metin örneğidir.
    İstanbul'da yaşıyorum ve Türkçe dilini öğreniyorum.
    Kitap okumak çok güzeldir ve bilgi verir.
    Türkiye Cumhuriyeti'nin başkenti Ankara'dır.
    Yapay zeka ve makine öğrenmesi teknolojileri gelişiyor.
    """ * 100  # Repeat to have enough text for training

    print("=" * 60)
    print("TURKISH TOKENIZER EXAMPLE")
    print("=" * 60)

    # Step 1: test the tokenizer directly
    tokenizer = _demo_tokenizer()

    # Step 2: create dataloader with Turkish tokenizer
    print("\n2️⃣  Creating DataLoader with Turkish Tokenizer")
    args = ModelArgs(
        max_seq_len=128,
        max_batch_size=8,
        vocab_size=tokenizer.n_vocab,  # Important: set vocab size for model
    )

    dataloader = create_dataloader(
        txt=turkish_text,
        args=args,
        stride=64,  # 50% overlap
        shuffle=True,
        num_workers=0,
        max_samples=50,  # Limit for testing
        use_turkish_tokenizer=True,  # Enable Turkish tokenizer
    )

    print("\n   ✅ DataLoader created successfully!")
    print(f"   Sequence length: {args.max_seq_len}")
    print(f"   Batch size: {args.max_batch_size}")
    print(f"   Total batches: {len(dataloader)}")
    print(f"   Total samples: {len(dataloader.dataset)}")

    # Step 3: test a batch
    _report_first_batch(dataloader, tokenizer)

    print("\n" + "=" * 60)
    print("✅ Turkish Tokenizer Example Complete!")
    print("=" * 60)

    # Usage tips
    _print_usage_tips()

# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()