"""
Example usage of Turkish Tokenizer in the data pipeline.
This demonstrates how to use the Turkish morphological tokenizer
for training language models on Turkish text.
"""
from data import create_dataloader, TurkishTokenizerWrapper, TURKISH_TOKENIZER_AVAILABLE
from model import ModelArgs
def _demo_tokenizer():
    """Stage 1: build the wrapper and round-trip a sample sentence.

    Returns the constructed TurkishTokenizerWrapper so later stages can
    reuse it for vocab size and decoding.
    """
    print("\n1️⃣ Testing Turkish Tokenizer Wrapper")
    tokenizer = TurkishTokenizerWrapper()
    print(f" Tokenizer: {tokenizer.name}")
    print(f" Vocabulary size: {tokenizer.n_vocab:,}")

    # Test encoding/decoding on a short sentence.
    sample = "Kitapları okuyorum ve öğreniyorum."
    tokens = tokenizer.encode(sample)
    decoded = tokenizer.decode(tokens)
    print(f"\n Original: {sample}")
    # Truncate long token lists so the console output stays readable.
    if len(tokens) > 20:
        print(f" Tokens ({len(tokens)}): {tokens[:20]}...")
    else:
        print(f" Tokens: {tokens}")
    print(f" Decoded: {decoded}")
    return tokenizer


def _demo_dataloader(tokenizer, text):
    """Stage 2: create a DataLoader over `text` with the Turkish tokenizer enabled.

    Returns the DataLoader so stage 3 can iterate it.
    """
    print("\n2️⃣ Creating DataLoader with Turkish Tokenizer")
    args = ModelArgs(
        max_seq_len=128,
        max_batch_size=8,
        vocab_size=tokenizer.n_vocab,  # Important: set vocab size for model
    )
    dataloader = create_dataloader(
        txt=text,
        args=args,
        stride=64,  # 50% overlap
        shuffle=True,
        num_workers=0,
        max_samples=50,  # Limit for testing
        use_turkish_tokenizer=True,  # Enable Turkish tokenizer
    )
    print("\n ✅ DataLoader created successfully!")
    print(f" Sequence length: {args.max_seq_len}")
    print(f" Batch size: {args.max_batch_size}")
    print(f" Total batches: {len(dataloader)}")
    print(f" Total samples: {len(dataloader.dataset)}")
    return dataloader


def _demo_first_batch(dataloader, tokenizer):
    """Stage 3: pull one (input, target) batch and show shapes plus a decoded sample."""
    print("\n3️⃣ Testing First Batch")
    for batch_idx, (input_ids, target_ids) in enumerate(dataloader):
        print(f"\n Batch {batch_idx}:")
        print(f" input_ids shape: {input_ids.shape}")
        print(f" target_ids shape: {target_ids.shape}")
        print(f" input_ids range: [{input_ids.min().item()}, {input_ids.max().item()}]")
        print(f" Sample input (first 10 tokens): {input_ids[0, :10].tolist()}")
        print(f" Decoded sample: {tokenizer.decode(input_ids[0, :30].tolist())}")
        break  # One batch is enough for the demo.


def _print_usage_tips():
    """Print quick-reference notes on using the tokenizer in training code."""
    print("\n💡 Usage Tips:")
    print(" • Set vocab_size in ModelArgs to tokenizer.n_vocab")
    print(" • Use use_turkish_tokenizer=True in create_dataloader()")
    print(" • Turkish tokenizer handles morphological analysis automatically")
    print(" • Vocabulary size is optimized for Turkish language")
    print("\n📚 To use in training:")
    print(" tokenizer = TurkishTokenizerWrapper()")
    print(" args = ModelArgs(vocab_size=tokenizer.n_vocab, ...)")
    print(" dataloader = create_dataloader(..., use_turkish_tokenizer=True)")


def main():
    """Example of using the Turkish tokenizer with the data pipeline.

    Runs three stages — tokenizer round-trip, DataLoader construction, and
    first-batch inspection — then prints usage tips. Exits early with an
    install hint when the optional `turkish-tokenizer` package is missing.
    Returns None.
    """
    if not TURKISH_TOKENIZER_AVAILABLE:
        print("❌ Turkish tokenizer is not installed!")
        print("Install it with: pip install turkish-tokenizer")
        return

    # Sample Turkish text, repeated so there is enough data to form batches.
    turkish_text = """
    Merhaba! Bu bir Türkçe metin örneğidir.
    İstanbul'da yaşıyorum ve Türkçe dilini öğreniyorum.
    Kitap okumak çok güzeldir ve bilgi verir.
    Türkiye Cumhuriyeti'nin başkenti Ankara'dır.
    Yapay zeka ve makine öğrenmesi teknolojileri gelişiyor.
    """ * 100  # Repeat to have enough text for training

    print("=" * 60)
    print("TURKISH TOKENIZER EXAMPLE")
    print("=" * 60)

    tokenizer = _demo_tokenizer()
    dataloader = _demo_dataloader(tokenizer, turkish_text)
    _demo_first_batch(dataloader, tokenizer)

    print("\n" + "=" * 60)
    print("✅ Turkish Tokenizer Example Complete!")
    print("=" * 60)
    _print_usage_tips()
# Run the demo only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()