"""
Example usage of Turkish Tokenizer in the data pipeline.
This demonstrates how to use the Turkish morphological tokenizer
for training language models on Turkish text.
"""
from data import create_dataloader, TurkishTokenizerWrapper, TURKISH_TOKENIZER_AVAILABLE
from model import ModelArgs


def main():
"""Example of using Turkish tokenizer with the data pipeline"""
if not TURKISH_TOKENIZER_AVAILABLE:
print("❌ Turkish tokenizer is not installed!")
print("Install it with: pip install turkish-tokenizer")
return
# Sample Turkish text
turkish_text = """
Merhaba! Bu bir Türkçe metin örneğidir.
İstanbul'da yaşıyorum ve Türkçe dilini öğreniyorum.
Kitap okumak çok güzeldir ve bilgi verir.
Türkiye Cumhuriyeti'nin başkenti Ankara'dır.
Yapay zeka ve makine öğrenmesi teknolojileri gelişiyor.
""" * 100 # Repeat to have enough text for training
print("=" * 60)
print("TURKISH TOKENIZER EXAMPLE")
print("=" * 60)
# Test the tokenizer directly
print("\n1️⃣ Testing Turkish Tokenizer Wrapper")
tokenizer = TurkishTokenizerWrapper()
print(f" Tokenizer: {tokenizer.name}")
print(f" Vocabulary size: {tokenizer.n_vocab:,}")
# Test encoding/decoding
sample = "Kitapları okuyorum ve öğreniyorum."
tokens = tokenizer.encode(sample)
decoded = tokenizer.decode(tokens)
print(f"\n Original: {sample}")
print(f" Tokens ({len(tokens)}): {tokens[:20]}..." if len(tokens) > 20 else f" Tokens: {tokens}")
print(f" Decoded: {decoded}")
# Create dataloader with Turkish tokenizer
print("\n2️⃣ Creating DataLoader with Turkish Tokenizer")
args = ModelArgs(
max_seq_len=128,
max_batch_size=8,
vocab_size=tokenizer.n_vocab # Important: set vocab size for model
)
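    # Matching vocab_size to tokenizer.n_vocab matters: a smaller embedding
    # table would be indexed out of range by high token ids, and the output
    # head must have the same width for the loss computation to line up.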
dataloader = create_dataloader(
txt=turkish_text,
args=args,
stride=64, # 50% overlap
shuffle=True,
num_workers=0,
max_samples=50, # Limit for testing
use_turkish_tokenizer=True # Enable Turkish tokenizer
)
print(f"\n ✅ DataLoader created successfully!")
print(f" Sequence length: {args.max_seq_len}")
print(f" Batch size: {args.max_batch_size}")
print(f" Total batches: {len(dataloader)}")
print(f" Total samples: {len(dataloader.dataset)}")
# Test a batch
print("\n3️⃣ Testing First Batch")
for batch_idx, (input_ids, target_ids) in enumerate(dataloader):
print(f"\n Batch {batch_idx}:")
print(f" input_ids shape: {input_ids.shape}")
print(f" target_ids shape: {target_ids.shape}")
print(f" input_ids range: [{input_ids.min().item()}, {input_ids.max().item()}]")
print(f" Sample input (first 10 tokens): {input_ids[0, :10].tolist()}")
print(f" Decoded sample: {tokenizer.decode(input_ids[0, :30].tolist())}")
break
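
    # Hedged check: next-token pipelines usually emit target_ids as
    # input_ids shifted left by one position. Whether create_dataloader
    # follows that convention is an assumption here, so we report the
    # result instead of asserting it.
    input_ids, target_ids = next(iter(dataloader))
    shifted = bool((input_ids[0, 1:] == target_ids[0, :-1]).all())
    print(f"\n targets are inputs shifted by one: {shifted}")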
print("\n" + "=" * 60)
print("✅ Turkish Tokenizer Example Complete!")
print("=" * 60)
# Usage tips
print("\n💡 Usage Tips:")
print(" • Set vocab_size in ModelArgs to tokenizer.n_vocab")
print(" • Use use_turkish_tokenizer=True in create_dataloader()")
print(" • Turkish tokenizer handles morphological analysis automatically")
print(" • Vocabulary size is optimized for Turkish language")
print("\n📚 To use in training:")
print(" tokenizer = TurkishTokenizerWrapper()")
print(" args = ModelArgs(vocab_size=tokenizer.n_vocab, ...)")
print(" dataloader = create_dataloader(..., use_turkish_tokenizer=True)")
if __name__ == "__main__":
main()