"""
Example usage of Turkish Tokenizer in the data pipeline.
This demonstrates how to use the Turkish morphological tokenizer
for training language models on Turkish text.
"""
from data import create_dataloader, TurkishTokenizerWrapper, TURKISH_TOKENIZER_AVAILABLE
from model import ModelArgs


def main():
"""Example of using Turkish tokenizer with the data pipeline"""
if not TURKISH_TOKENIZER_AVAILABLE:
print("❌ Turkish tokenizer is not installed!")
print("Install it with: pip install turkish-tokenizer")
return
# Sample Turkish text
turkish_text = """
Merhaba! Bu bir Türkçe metin örneğidir.
İstanbul'da yaşıyorum ve Türkçe dilini öğreniyorum.
Kitap okumak çok güzeldir ve bilgi verir.
Türkiye Cumhuriyeti'nin başkenti Ankara'dır.
Yapay zeka ve makine öğrenmesi teknolojileri gelişiyor.
""" * 100 # Repeat to have enough text for training
print("=" * 60)
print("TURKISH TOKENIZER EXAMPLE")
print("=" * 60)
# Test the tokenizer directly
print("\n1️⃣ Testing Turkish Tokenizer Wrapper")
tokenizer = TurkishTokenizerWrapper()
print(f" Tokenizer: {tokenizer.name}")
print(f" Vocabulary size: {tokenizer.n_vocab:,}")
# Test encoding/decoding
sample = "Kitapları okuyorum ve öğreniyorum."
tokens = tokenizer.encode(sample)
decoded = tokenizer.decode(tokens)
print(f"\n Original: {sample}")
print(f" Tokens ({len(tokens)}): {tokens[:20]}..." if len(tokens) > 20 else f" Tokens: {tokens}")
print(f" Decoded: {decoded}")
# Create dataloader with Turkish tokenizer
print("\n2️⃣ Creating DataLoader with Turkish Tokenizer")
args = ModelArgs(
max_seq_len=128,
max_batch_size=8,
vocab_size=tokenizer.n_vocab # Important: set vocab size for model
)
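    # Matching vocab_size to tokenizer.n_vocab matters: a smaller embedding
    # table would be indexed out of range by high token ids, and the output
    # head must have the same width for the loss computation to line up.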
dataloader = create_dataloader(
txt=turkish_text,
args=args,
stride=64, # 50% overlap
shuffle=True,
num_workers=0,
max_samples=50, # Limit for testing
use_turkish_tokenizer=True # Enable Turkish tokenizer
)
print(f"\n ✅ DataLoader created successfully!")
print(f" Sequence length: {args.max_seq_len}")
print(f" Batch size: {args.max_batch_size}")
print(f" Total batches: {len(dataloader)}")
print(f" Total samples: {len(dataloader.dataset)}")
# Test a batch
print("\n3️⃣ Testing First Batch")
for batch_idx, (input_ids, target_ids) in enumerate(dataloader):
print(f"\n Batch {batch_idx}:")
print(f" input_ids shape: {input_ids.shape}")
print(f" target_ids shape: {target_ids.shape}")
print(f" input_ids range: [{input_ids.min().item()}, {input_ids.max().item()}]")
print(f" Sample input (first 10 tokens): {input_ids[0, :10].tolist()}")
print(f" Decoded sample: {tokenizer.decode(input_ids[0, :30].tolist())}")
break
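
    # Hedged check: next-token pipelines usually emit target_ids as
    # input_ids shifted left by one position. Whether create_dataloader
    # follows that convention is an assumption here, so we report the
    # result instead of asserting it.
    input_ids, target_ids = next(iter(dataloader))
    shifted = bool((input_ids[0, 1:] == target_ids[0, :-1]).all())
    print(f"\n targets are inputs shifted by one: {shifted}")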
print("\n" + "=" * 60)
print("✅ Turkish Tokenizer Example Complete!")
print("=" * 60)
# Usage tips
print("\n💡 Usage Tips:")
print(" • Set vocab_size in ModelArgs to tokenizer.n_vocab")
print(" • Use use_turkish_tokenizer=True in create_dataloader()")
print(" • Turkish tokenizer handles morphological analysis automatically")
print(" • Vocabulary size is optimized for Turkish language")
print("\n📚 To use in training:")
print(" tokenizer = TurkishTokenizerWrapper()")
print(" args = ModelArgs(vocab_size=tokenizer.n_vocab, ...)")
print(" dataloader = create_dataloader(..., use_turkish_tokenizer=True)")
if __name__ == "__main__":
main()