|
|
""" |
|
|
Example usage of Turkish Tokenizer in the data pipeline. |
|
|
|
|
|
This demonstrates how to use the Turkish morphological tokenizer |
|
|
for training language models on Turkish text. |
|
|
""" |
|
|
|
|
|
from data import create_dataloader, TurkishTokenizerWrapper, TURKISH_TOKENIZER_AVAILABLE |
|
|
from model import ModelArgs |
|
|
|
|
|
def main(): |
|
|
"""Example of using Turkish tokenizer with the data pipeline""" |
|
|
|
|
|
if not TURKISH_TOKENIZER_AVAILABLE: |
|
|
print("❌ Turkish tokenizer is not installed!") |
|
|
print("Install it with: pip install turkish-tokenizer") |
|
|
return |
|
|
|
|
|
|
|
|
turkish_text = """ |
|
|
Merhaba! Bu bir Türkçe metin örneğidir. |
|
|
İstanbul'da yaşıyorum ve Türkçe dilini öğreniyorum. |
|
|
Kitap okumak çok güzeldir ve bilgi verir. |
|
|
Türkiye Cumhuriyeti'nin başkenti Ankara'dır. |
|
|
Yapay zeka ve makine öğrenmesi teknolojileri gelişiyor. |
|
|
""" * 100 |
|
|
|
|
|
print("=" * 60) |
|
|
print("TURKISH TOKENIZER EXAMPLE") |
|
|
print("=" * 60) |
|
|
|
|
|
|
|
|
print("\n1️⃣ Testing Turkish Tokenizer Wrapper") |
|
|
tokenizer = TurkishTokenizerWrapper() |
|
|
print(f" Tokenizer: {tokenizer.name}") |
|
|
print(f" Vocabulary size: {tokenizer.n_vocab:,}") |
|
|
|
|
|
|
|
|
sample = "Kitapları okuyorum ve öğreniyorum." |
|
|
tokens = tokenizer.encode(sample) |
|
|
decoded = tokenizer.decode(tokens) |
|
|
|
|
|
print(f"\n Original: {sample}") |
|
|
print(f" Tokens ({len(tokens)}): {tokens[:20]}..." if len(tokens) > 20 else f" Tokens: {tokens}") |
|
|
print(f" Decoded: {decoded}") |
|
|
|
|
|
|
|
|
print("\n2️⃣ Creating DataLoader with Turkish Tokenizer") |
|
|
args = ModelArgs( |
|
|
max_seq_len=128, |
|
|
max_batch_size=8, |
|
|
vocab_size=tokenizer.n_vocab |
|
|
) |
|
|
|
|
|
dataloader = create_dataloader( |
|
|
txt=turkish_text, |
|
|
args=args, |
|
|
stride=64, |
|
|
shuffle=True, |
|
|
num_workers=0, |
|
|
max_samples=50, |
|
|
use_turkish_tokenizer=True |
|
|
) |
|
|
|
|
|
print(f"\n ✅ DataLoader created successfully!") |
|
|
print(f" Sequence length: {args.max_seq_len}") |
|
|
print(f" Batch size: {args.max_batch_size}") |
|
|
print(f" Total batches: {len(dataloader)}") |
|
|
print(f" Total samples: {len(dataloader.dataset)}") |
|
|
|
|
|
|
|
|
print("\n3️⃣ Testing First Batch") |
|
|
for batch_idx, (input_ids, target_ids) in enumerate(dataloader): |
|
|
print(f"\n Batch {batch_idx}:") |
|
|
print(f" input_ids shape: {input_ids.shape}") |
|
|
print(f" target_ids shape: {target_ids.shape}") |
|
|
print(f" input_ids range: [{input_ids.min().item()}, {input_ids.max().item()}]") |
|
|
print(f" Sample input (first 10 tokens): {input_ids[0, :10].tolist()}") |
|
|
print(f" Decoded sample: {tokenizer.decode(input_ids[0, :30].tolist())}") |
|
|
break |
|
|
|
|
|
print("\n" + "=" * 60) |
|
|
print("✅ Turkish Tokenizer Example Complete!") |
|
|
print("=" * 60) |
|
|
|
|
|
|
|
|
print("\n💡 Usage Tips:") |
|
|
print(" • Set vocab_size in ModelArgs to tokenizer.n_vocab") |
|
|
print(" • Use use_turkish_tokenizer=True in create_dataloader()") |
|
|
print(" • Turkish tokenizer handles morphological analysis automatically") |
|
|
print(" • Vocabulary size is optimized for Turkish language") |
|
|
print("\n📚 To use in training:") |
|
|
print(" tokenizer = TurkishTokenizerWrapper()") |
|
|
print(" args = ModelArgs(vocab_size=tokenizer.n_vocab, ...)") |
|
|
print(" dataloader = create_dataloader(..., use_turkish_tokenizer=True)") |
|
|
|
|
|
|
|
|
if __name__ == "__main__": |
|
|
main() |
|
|
|