Commit
·
ad359f7
1
Parent(s):
3d942a0
Turkish Tokenizer
Browse files- .gitignore +1 -0
- Model_Architecture/data.py +194 -11
- Model_Architecture/turkish_tokenizer_example.py +99 -0
- turkish_tiktokenizer +1 -0
.gitignore
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
*.pyc
|
Model_Architecture/data.py
CHANGED
|
@@ -9,6 +9,67 @@ import numpy as np
|
|
| 9 |
|
| 10 |
from model import ModelArgs
|
| 11 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
#####################################
|
| 13 |
# DATA
|
| 14 |
#####################################
|
|
@@ -28,9 +89,15 @@ class TextDataset(Dataset):
|
|
| 28 |
self.stride = stride if stride is not None else self.max_seq_len // 2
|
| 29 |
|
| 30 |
# Handle file paths efficiently with memory mapping
|
| 31 |
-
if Path(
|
| 32 |
-
|
| 33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
text_content = txt
|
| 35 |
|
| 36 |
# Validate input
|
|
@@ -126,11 +193,12 @@ def create_dataloader(
|
|
| 126 |
num_workers: int = 0,
|
| 127 |
pin_memory: bool = True,
|
| 128 |
persistent_workers: bool = False,
|
| 129 |
-
max_samples: Optional[int] = None
|
|
|
|
| 130 |
) -> DataLoader:
|
| 131 |
"""
|
| 132 |
Optimized DataLoader with proper memory pinning and worker settings.
|
| 133 |
-
|
| 134 |
Args:
|
| 135 |
txt: Text content or file path
|
| 136 |
args: ModelArgs configuration
|
|
@@ -141,12 +209,24 @@ def create_dataloader(
|
|
| 141 |
pin_memory: Pin memory for faster GPU transfer (recommended)
|
| 142 |
persistent_workers: Keep workers alive between epochs (if num_workers > 0)
|
| 143 |
max_samples: Limit samples for testing
|
|
|
|
| 144 |
"""
|
| 145 |
-
#
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 150 |
|
| 151 |
# Create dataset with size validation
|
| 152 |
try:
|
|
@@ -185,4 +265,107 @@ def get_sample_data(url: str = "https://raw.githubusercontent.com/karpathy/char-
|
|
| 185 |
return response.text
|
| 186 |
except Exception as e:
|
| 187 |
print(f"⚠️ Could not download sample data: {e}")
|
| 188 |
-
return ""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
from model import ModelArgs
|
| 11 |
|
| 12 |
+
# Turkish Tokenizer support
# Optional dependency: the morphological tokenizer is only needed when
# use_turkish_tokenizer=True is requested; everything else works without it.
try:
    from turkish_tokenizer import TurkishTokenizer as TurkishTokenizerBase
    TURKISH_TOKENIZER_AVAILABLE = True
except ImportError:
    # Package not installed — record the fact so callers can degrade
    # gracefully (or raise a helpful ImportError) instead of crashing here.
    TURKISH_TOKENIZER_AVAILABLE = False
    TurkishTokenizerBase = None

#####################################
# TURKISH TOKENIZER WRAPPER
#####################################
|
| 23 |
+
class TurkishTokenizerWrapper:
    """
    Adapter that exposes the Turkish morphological tokenizer through a
    tiktoken-style interface (``encode``/``decode``/``n_vocab``/
    ``max_token_value``) so it can be dropped into the existing
    TextDataset pipeline unchanged.
    """

    def __init__(self):
        # Fail fast with an actionable message when the optional
        # dependency is missing.
        if not TURKISH_TOKENIZER_AVAILABLE:
            raise ImportError(
                "turkish-tokenizer package is not installed. "
                "Install it with: pip install turkish-tokenizer"
            )
        self.name = "turkish-tokenizer"
        self.tokenizer = TurkishTokenizerBase()

    def encode(self, text: str, allowed_special: Optional[set] = None) -> List[int]:
        """
        Tokenize *text* into a list of token IDs.

        ``allowed_special`` is accepted (and ignored) purely so the
        signature matches tiktoken encoders.
        """
        return self.tokenizer.encode(text)

    def decode(self, tokens: List[int]) -> str:
        """Reconstruct the text represented by *tokens*."""
        return self.tokenizer.decode(tokens)

    @property
    def n_vocab(self) -> int:
        """Vocabulary size of the wrapped tokenizer."""
        return self.tokenizer.vocab_size

    @property
    def max_token_value(self) -> int:
        """Largest valid token ID (vocabulary size minus one)."""
        return self.n_vocab - 1
|
| 71 |
+
|
| 72 |
+
|
| 73 |
#####################################
|
| 74 |
# DATA
|
| 75 |
#####################################
|
|
|
|
| 89 |
self.stride = stride if stride is not None else self.max_seq_len // 2
|
| 90 |
|
| 91 |
# Handle file paths efficiently with memory mapping
|
| 92 |
+
# Check if txt is a file path (avoid Path().exists() for long strings)
|
| 93 |
+
try:
|
| 94 |
+
path = Path(txt)
|
| 95 |
+
if len(txt) < 4096 and path.exists(): # Reasonable path length check
|
| 96 |
+
text_content = self._read_file_mmap(txt)
|
| 97 |
+
else:
|
| 98 |
+
text_content = txt
|
| 99 |
+
except (OSError, ValueError):
|
| 100 |
+
# If Path() fails or string is too long, treat as raw text
|
| 101 |
text_content = txt
|
| 102 |
|
| 103 |
# Validate input
|
|
|
|
| 193 |
num_workers: int = 0,
|
| 194 |
pin_memory: bool = True,
|
| 195 |
persistent_workers: bool = False,
|
| 196 |
+
max_samples: Optional[int] = None,
|
| 197 |
+
use_turkish_tokenizer: bool = False
|
| 198 |
) -> DataLoader:
|
| 199 |
"""
|
| 200 |
Optimized DataLoader with proper memory pinning and worker settings.
|
| 201 |
+
|
| 202 |
Args:
|
| 203 |
txt: Text content or file path
|
| 204 |
args: ModelArgs configuration
|
|
|
|
| 209 |
pin_memory: Pin memory for faster GPU transfer (recommended)
|
| 210 |
persistent_workers: Keep workers alive between epochs (if num_workers > 0)
|
| 211 |
max_samples: Limit samples for testing
|
| 212 |
+
use_turkish_tokenizer: Use Turkish morphological tokenizer instead of tiktoken
|
| 213 |
"""
|
| 214 |
+
# Select tokenizer based on user preference
|
| 215 |
+
if use_turkish_tokenizer:
|
| 216 |
+
if not TURKISH_TOKENIZER_AVAILABLE:
|
| 217 |
+
raise ImportError(
|
| 218 |
+
"Turkish tokenizer requested but not available. "
|
| 219 |
+
"Install it with: pip install turkish-tokenizer"
|
| 220 |
+
)
|
| 221 |
+
tokenizer = TurkishTokenizerWrapper()
|
| 222 |
+
print(f"🇹🇷 Using Turkish Tokenizer (vocab size: {tokenizer.n_vocab:,})")
|
| 223 |
+
else:
|
| 224 |
+
# Use the best default tokenizer for your setup
|
| 225 |
+
# tiktoken's gpt2 is fast, well-tested, and has reasonable vocab size (~50k)
|
| 226 |
+
# For multilingual or code, consider "cl100k_base" or "o200k_base"
|
| 227 |
+
tokenizer_name = getattr(args, "tokenizer_name", "gpt2")
|
| 228 |
+
tokenizer = tiktoken.get_encoding(tokenizer_name)
|
| 229 |
+
print(f"📚 Using tiktoken tokenizer: {tokenizer_name} (vocab size: {tokenizer.n_vocab:,})")
|
| 230 |
|
| 231 |
# Create dataset with size validation
|
| 232 |
try:
|
|
|
|
| 265 |
return response.text
|
| 266 |
except Exception as e:
|
| 267 |
print(f"⚠️ Could not download sample data: {e}")
|
| 268 |
+
return ""
|
| 269 |
+
|
| 270 |
+
|
| 271 |
+
if __name__ == "__main__":
    # Manual smoke test: exercises the selected tokenizer on a few sample
    # sentences, then builds a DataLoader from downloaded sample data and
    # inspects a single batch. Network access is required for the second part.
    print("=" * 60)
    print("TOKENIZER TESTING")
    print("=" * 60)

    # Choose which tokenizer to test
    USE_TURKISH = True  # Set to False to test tiktoken instead

    if USE_TURKISH and TURKISH_TOKENIZER_AVAILABLE:
        print("\n🇹🇷 Testing Turkish Tokenizer")
        tokenizer = TurkishTokenizerWrapper()
        print(f"📚 Tokenizer: {tokenizer.name}")
        print(f"📊 Vocabulary Size: {tokenizer.n_vocab:,}")
        print(f"📝 Max Token Value: {tokenizer.max_token_value:,}")
    else:
        # Test different tokenizers
        tokenizer_name = "gpt2"  # Change to "cl100k_base" or "o200k_base" to test others
        tokenizer = tiktoken.get_encoding(tokenizer_name)

        print(f"\n📚 Tokenizer: {tokenizer_name}")
        print(f"📊 Vocabulary Size: {tokenizer.n_vocab:,}")
        print(f"📝 Max Token Value: {tokenizer.max_token_value:,}")
        print(f"🔤 Name: {tokenizer.name}")

    # Test encoding/decoding — pick language-appropriate samples.
    if USE_TURKISH and TURKISH_TOKENIZER_AVAILABLE:
        test_samples = [
            "Merhaba Dünya!",
            "İstanbul'da yaşıyorum ve Türkçe dilini öğreniyorum.",
            "Kitap okumak çok güzeldir ve bilgi verir.",
            "Türkiye Cumhuriyeti'nin başkenti Ankara'dır.",
            "Yapay zeka ve makine öğrenmesi teknolojileri gelişiyor.",
        ]
    else:
        test_samples = [
            "Hello, world!",
            "The quick brown fox jumps over the lazy dog.",
            "Machine learning is fascinating.",
            "print('Hello, World!')",  # Code sample
            "日本語のテキスト",  # Non-English
        ]

    print("\n" + "=" * 60)
    print("ENCODING EXAMPLES")
    print("=" * 60)

    # Round-trip each sample so a lossy tokenizer is immediately visible.
    for text in test_samples:
        tokens = tokenizer.encode(text)
        decoded = tokenizer.decode(tokens)
        print(f"\nText: {text}")
        print(f"Tokens ({len(tokens)}): {tokens}")
        print(f"Token range: [{min(tokens)}, {max(tokens)}]")
        print(f"Decoded: {decoded}")

    # Test with actual data
    print("\n" + "=" * 60)
    print("DATALOADER TESTING")
    print("=" * 60)

    sample_text = get_sample_data()
    if sample_text:
        print(f"\n📄 Sample text length: {len(sample_text):,} characters")

        # Tokenize sample
        if USE_TURKISH and TURKISH_TOKENIZER_AVAILABLE:
            full_tokens = tokenizer.encode(sample_text)
        else:
            # tiktoken requires special tokens to be explicitly allowed.
            full_tokens = tokenizer.encode(sample_text, allowed_special={"<|endoftext|>"})

        print(f"🔢 Total tokens: {len(full_tokens):,}")
        print(f"📈 Unique tokens used: {len(set(full_tokens)):,}")
        print(f"📊 Vocabulary coverage: {len(set(full_tokens)) / tokenizer.n_vocab * 100:.2f}%")

        # Create dataloader
        args = ModelArgs(max_seq_len=128, max_batch_size=16)
        dataloader = create_dataloader(
            sample_text,
            args,
            num_workers=0,
            max_samples=100,
            use_turkish_tokenizer=USE_TURKISH and TURKISH_TOKENIZER_AVAILABLE
        )

        print(f"\n⚙️ DataLoader Config:")
        print(f"   Sequence length: {args.max_seq_len}")
        print(f"   Batch size: {args.max_batch_size}")
        print(f"   Total batches: {len(dataloader)}")

        # Test first batch — one batch is enough for a smoke test.
        for batch_idx, (input_ids, target_ids) in enumerate(dataloader):
            print(f"\n🎯 Batch {batch_idx}:")
            print(f"   input_ids shape: {input_ids.shape}")
            print(f"   target_ids shape: {target_ids.shape}")
            print(f"   input_ids range: [{input_ids.min().item()}, {input_ids.max().item()}]")
            print(f"   Sample input (first 10 tokens): {input_ids[0, :10].tolist()}")
            print(f"   Decoded: {tokenizer.decode(input_ids[0, :10].tolist())}")
            break

    print("\n" + "=" * 60)
    print("✅ Testing complete!")
    print("=" * 60)
|
Model_Architecture/turkish_tokenizer_example.py
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Example usage of Turkish Tokenizer in the data pipeline.
|
| 3 |
+
|
| 4 |
+
This demonstrates how to use the Turkish morphological tokenizer
|
| 5 |
+
for training language models on Turkish text.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from data import create_dataloader, TurkishTokenizerWrapper, TURKISH_TOKENIZER_AVAILABLE
|
| 9 |
+
from model import ModelArgs
|
| 10 |
+
|
| 11 |
+
def main():
    """Example of using Turkish tokenizer with the data pipeline.

    Demonstrates: (1) direct encode/decode through TurkishTokenizerWrapper,
    (2) building a DataLoader with use_turkish_tokenizer=True, and
    (3) inspecting one batch. Requires the turkish-tokenizer package.
    """

    if not TURKISH_TOKENIZER_AVAILABLE:
        # Bail out early with install instructions instead of crashing later.
        print("❌ Turkish tokenizer is not installed!")
        print("Install it with: pip install turkish-tokenizer")
        return

    # Sample Turkish text
    turkish_text = """
    Merhaba! Bu bir Türkçe metin örneğidir.
    İstanbul'da yaşıyorum ve Türkçe dilini öğreniyorum.
    Kitap okumak çok güzeldir ve bilgi verir.
    Türkiye Cumhuriyeti'nin başkenti Ankara'dır.
    Yapay zeka ve makine öğrenmesi teknolojileri gelişiyor.
    """ * 100  # Repeat to have enough text for training

    print("=" * 60)
    print("TURKISH TOKENIZER EXAMPLE")
    print("=" * 60)

    # Test the tokenizer directly
    print("\n1️⃣ Testing Turkish Tokenizer Wrapper")
    tokenizer = TurkishTokenizerWrapper()
    print(f"   Tokenizer: {tokenizer.name}")
    print(f"   Vocabulary size: {tokenizer.n_vocab:,}")

    # Test encoding/decoding (round trip)
    sample = "Kitapları okuyorum ve öğreniyorum."
    tokens = tokenizer.encode(sample)
    decoded = tokenizer.decode(tokens)

    print(f"\n   Original: {sample}")
    # Truncate the token dump when the list is long.
    print(f"   Tokens ({len(tokens)}): {tokens[:20]}..." if len(tokens) > 20 else f"   Tokens: {tokens}")
    print(f"   Decoded: {decoded}")

    # Create dataloader with Turkish tokenizer
    print("\n2️⃣ Creating DataLoader with Turkish Tokenizer")
    args = ModelArgs(
        max_seq_len=128,
        max_batch_size=8,
        vocab_size=tokenizer.n_vocab  # Important: set vocab size for model
    )

    dataloader = create_dataloader(
        txt=turkish_text,
        args=args,
        stride=64,  # 50% overlap
        shuffle=True,
        num_workers=0,
        max_samples=50,  # Limit for testing
        use_turkish_tokenizer=True  # Enable Turkish tokenizer
    )

    print(f"\n   ✅ DataLoader created successfully!")
    print(f"   Sequence length: {args.max_seq_len}")
    print(f"   Batch size: {args.max_batch_size}")
    print(f"   Total batches: {len(dataloader)}")
    print(f"   Total samples: {len(dataloader.dataset)}")

    # Test a batch — inspecting one batch is sufficient for the example.
    print("\n3️⃣ Testing First Batch")
    for batch_idx, (input_ids, target_ids) in enumerate(dataloader):
        print(f"\n   Batch {batch_idx}:")
        print(f"   input_ids shape: {input_ids.shape}")
        print(f"   target_ids shape: {target_ids.shape}")
        print(f"   input_ids range: [{input_ids.min().item()}, {input_ids.max().item()}]")
        print(f"   Sample input (first 10 tokens): {input_ids[0, :10].tolist()}")
        print(f"   Decoded sample: {tokenizer.decode(input_ids[0, :30].tolist())}")
        break

    print("\n" + "=" * 60)
    print("✅ Turkish Tokenizer Example Complete!")
    print("=" * 60)

    # Usage tips
    print("\n💡 Usage Tips:")
    print("   • Set vocab_size in ModelArgs to tokenizer.n_vocab")
    print("   • Use use_turkish_tokenizer=True in create_dataloader()")
    print("   • Turkish tokenizer handles morphological analysis automatically")
    print("   • Vocabulary size is optimized for Turkish language")
    print("\n📚 To use in training:")
    print("   tokenizer = TurkishTokenizerWrapper()")
    print("   args = ModelArgs(vocab_size=tokenizer.n_vocab, ...)")
    print("   dataloader = create_dataloader(..., use_turkish_tokenizer=True)")


if __name__ == "__main__":
    main()
|
turkish_tiktokenizer
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
Subproject commit da42085da3969c1e6822c6df7e4a879a1d9d1583
|