ikaganacar committed on
Commit
ad359f7
·
1 Parent(s): 3d942a0

Turkish Tokenizer

Browse files
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ *.pyc
Model_Architecture/data.py CHANGED
@@ -9,6 +9,67 @@ import numpy as np
9
 
10
  from model import ModelArgs
11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  #####################################
13
  # DATA
14
  #####################################
@@ -28,9 +89,15 @@ class TextDataset(Dataset):
28
  self.stride = stride if stride is not None else self.max_seq_len // 2
29
 
30
  # Handle file paths efficiently with memory mapping
31
- if Path(txt).exists():
32
- text_content = self._read_file_mmap(txt)
33
- else:
 
 
 
 
 
 
34
  text_content = txt
35
 
36
  # Validate input
@@ -126,11 +193,12 @@ def create_dataloader(
126
  num_workers: int = 0,
127
  pin_memory: bool = True,
128
  persistent_workers: bool = False,
129
- max_samples: Optional[int] = None
 
130
  ) -> DataLoader:
131
  """
132
  Optimized DataLoader with proper memory pinning and worker settings.
133
-
134
  Args:
135
  txt: Text content or file path
136
  args: ModelArgs configuration
@@ -141,12 +209,24 @@ def create_dataloader(
141
  pin_memory: Pin memory for faster GPU transfer (recommended)
142
  persistent_workers: Keep workers alive between epochs (if num_workers > 0)
143
  max_samples: Limit samples for testing
 
144
  """
145
- # Use the best default tokenizer for your setup
146
- # tiktoken's gpt2 is fast, well-tested, and has reasonable vocab size (~50k)
147
- # For multilingual or code, consider "cl100k_base" or "o200k_base"
148
- tokenizer_name = getattr(args, "tokenizer_name", "gpt2")
149
- tokenizer = tiktoken.get_encoding(tokenizer_name)
 
 
 
 
 
 
 
 
 
 
 
150
 
151
  # Create dataset with size validation
152
  try:
@@ -185,4 +265,107 @@ def get_sample_data(url: str = "https://raw.githubusercontent.com/karpathy/char-
185
  return response.text
186
  except Exception as e:
187
  print(f"⚠️ Could not download sample data: {e}")
188
- return ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
  from model import ModelArgs
11
 
12
# Turkish Tokenizer support (optional dependency — degrade gracefully if absent)
try:
    from turkish_tokenizer import TurkishTokenizer as TurkishTokenizerBase
except ImportError:
    TurkishTokenizerBase = None
    TURKISH_TOKENIZER_AVAILABLE = False
else:
    TURKISH_TOKENIZER_AVAILABLE = True

#####################################
# TURKISH TOKENIZER WRAPPER
#####################################
class TurkishTokenizerWrapper:
    """Adapter that exposes the Turkish morphological tokenizer through a
    tiktoken-style interface (``encode``/``decode``/``n_vocab``/``max_token_value``),
    so the existing TextDataset pipeline can use it unchanged.
    """

    def __init__(self):
        # Fail fast with an actionable message when the backend is missing.
        if not TURKISH_TOKENIZER_AVAILABLE:
            raise ImportError(
                "turkish-tokenizer package is not installed. "
                "Install it with: pip install turkish-tokenizer"
            )
        self.tokenizer = TurkishTokenizerBase()
        self.name = "turkish-tokenizer"

    def encode(self, text: str, allowed_special: Optional[set] = None) -> List[int]:
        """Tokenize *text* into a list of token IDs.

        ``allowed_special`` exists only for tiktoken signature compatibility
        and is ignored by the Turkish backend.
        """
        return self.tokenizer.encode(text)

    def decode(self, tokens: List[int]) -> str:
        """Reconstruct a text string from a sequence of token IDs."""
        return self.tokenizer.decode(tokens)

    @property
    def n_vocab(self) -> int:
        # Mirrors tiktoken's vocabulary-size attribute.
        return self.tokenizer.vocab_size

    @property
    def max_token_value(self) -> int:
        # Largest valid token ID (IDs are zero-based).
        return self.n_vocab - 1
73
  #####################################
74
  # DATA
75
  #####################################
 
89
  self.stride = stride if stride is not None else self.max_seq_len // 2
90
 
91
  # Handle file paths efficiently with memory mapping
92
+ # Check if txt is a file path (avoid Path().exists() for long strings)
93
+ try:
94
+ path = Path(txt)
95
+ if len(txt) < 4096 and path.exists(): # Reasonable path length check
96
+ text_content = self._read_file_mmap(txt)
97
+ else:
98
+ text_content = txt
99
+ except (OSError, ValueError):
100
+ # If Path() fails or string is too long, treat as raw text
101
  text_content = txt
102
 
103
  # Validate input
 
193
  num_workers: int = 0,
194
  pin_memory: bool = True,
195
  persistent_workers: bool = False,
196
+ max_samples: Optional[int] = None,
197
+ use_turkish_tokenizer: bool = False
198
  ) -> DataLoader:
199
  """
200
  Optimized DataLoader with proper memory pinning and worker settings.
201
+
202
  Args:
203
  txt: Text content or file path
204
  args: ModelArgs configuration
 
209
  pin_memory: Pin memory for faster GPU transfer (recommended)
210
  persistent_workers: Keep workers alive between epochs (if num_workers > 0)
211
  max_samples: Limit samples for testing
212
+ use_turkish_tokenizer: Use Turkish morphological tokenizer instead of tiktoken
213
  """
214
+ # Select tokenizer based on user preference
215
+ if use_turkish_tokenizer:
216
+ if not TURKISH_TOKENIZER_AVAILABLE:
217
+ raise ImportError(
218
+ "Turkish tokenizer requested but not available. "
219
+ "Install it with: pip install turkish-tokenizer"
220
+ )
221
+ tokenizer = TurkishTokenizerWrapper()
222
+ print(f"🇹🇷 Using Turkish Tokenizer (vocab size: {tokenizer.n_vocab:,})")
223
+ else:
224
+ # Use the best default tokenizer for your setup
225
+ # tiktoken's gpt2 is fast, well-tested, and has reasonable vocab size (~50k)
226
+ # For multilingual or code, consider "cl100k_base" or "o200k_base"
227
+ tokenizer_name = getattr(args, "tokenizer_name", "gpt2")
228
+ tokenizer = tiktoken.get_encoding(tokenizer_name)
229
+ print(f"📚 Using tiktoken tokenizer: {tokenizer_name} (vocab size: {tokenizer.n_vocab:,})")
230
 
231
  # Create dataset with size validation
232
  try:
 
265
  return response.text
266
  except Exception as e:
267
  print(f"⚠️ Could not download sample data: {e}")
268
+ return ""
269
+
270
+
271
if __name__ == "__main__":
    # Smoke-test script: exercises tokenizer encode/decode round-trips and the
    # DataLoader pipeline end to end.  Runs only on direct execution.
    print("=" * 60)
    print("TOKENIZER TESTING")
    print("=" * 60)

    # Choose which tokenizer to test
    USE_TURKISH = True  # Set to False to test tiktoken instead

    if USE_TURKISH and TURKISH_TOKENIZER_AVAILABLE:
        print("\n🇹🇷 Testing Turkish Tokenizer")
        tokenizer = TurkishTokenizerWrapper()
        print(f"📚 Tokenizer: {tokenizer.name}")
        print(f"📊 Vocabulary Size: {tokenizer.n_vocab:,}")
        print(f"📝 Max Token Value: {tokenizer.max_token_value:,}")
    else:
        # Test different tokenizers
        tokenizer_name = "gpt2"  # Change to "cl100k_base" or "o200k_base" to test others
        tokenizer = tiktoken.get_encoding(tokenizer_name)

        print(f"\n📚 Tokenizer: {tokenizer_name}")
        print(f"📊 Vocabulary Size: {tokenizer.n_vocab:,}")
        print(f"📝 Max Token Value: {tokenizer.max_token_value:,}")
        print(f"🔤 Name: {tokenizer.name}")

    # Test encoding/decoding on language-appropriate samples
    if USE_TURKISH and TURKISH_TOKENIZER_AVAILABLE:
        test_samples = [
            "Merhaba Dünya!",
            "İstanbul'da yaşıyorum ve Türkçe dilini öğreniyorum.",
            "Kitap okumak çok güzeldir ve bilgi verir.",
            "Türkiye Cumhuriyeti'nin başkenti Ankara'dır.",
            "Yapay zeka ve makine öğrenmesi teknolojileri gelişiyor.",
        ]
    else:
        test_samples = [
            "Hello, world!",
            "The quick brown fox jumps over the lazy dog.",
            "Machine learning is fascinating.",
            "print('Hello, World!')",  # Code sample
            "日本語のテキスト",  # Non-English
        ]

    print("\n" + "=" * 60)
    print("ENCODING EXAMPLES")
    print("=" * 60)

    for text in test_samples:
        tokens = tokenizer.encode(text)
        decoded = tokenizer.decode(tokens)
        print(f"\nText: {text}")
        print(f"Tokens ({len(tokens)}): {tokens}")
        # FIX: min()/max() raise ValueError on an empty sequence; guard against
        # inputs that encode to zero tokens.
        if tokens:
            print(f"Token range: [{min(tokens)}, {max(tokens)}]")
        else:
            print("Token range: [no tokens]")
        print(f"Decoded: {decoded}")

    # Test with actual data
    print("\n" + "=" * 60)
    print("DATALOADER TESTING")
    print("=" * 60)

    sample_text = get_sample_data()
    if sample_text:
        print(f"\n📄 Sample text length: {len(sample_text):,} characters")

        # Tokenize the full sample (tiktoken needs special-token allowance)
        if USE_TURKISH and TURKISH_TOKENIZER_AVAILABLE:
            full_tokens = tokenizer.encode(sample_text)
        else:
            full_tokens = tokenizer.encode(sample_text, allowed_special={"<|endoftext|>"})

        # Hoisted: build the unique-token set once instead of twice.
        unique_tokens = set(full_tokens)
        print(f"🔢 Total tokens: {len(full_tokens):,}")
        print(f"📈 Unique tokens used: {len(unique_tokens):,}")
        print(f"📊 Vocabulary coverage: {len(unique_tokens) / tokenizer.n_vocab * 100:.2f}%")

        # Create dataloader
        args = ModelArgs(max_seq_len=128, max_batch_size=16)
        dataloader = create_dataloader(
            sample_text,
            args,
            num_workers=0,
            max_samples=100,
            use_turkish_tokenizer=USE_TURKISH and TURKISH_TOKENIZER_AVAILABLE
        )

        print(f"\n⚙️ DataLoader Config:")
        print(f"   Sequence length: {args.max_seq_len}")
        print(f"   Batch size: {args.max_batch_size}")
        print(f"   Total batches: {len(dataloader)}")

        # Inspect the first batch only, then stop.
        for batch_idx, (input_ids, target_ids) in enumerate(dataloader):
            print(f"\n🎯 Batch {batch_idx}:")
            print(f"   input_ids shape: {input_ids.shape}")
            print(f"   target_ids shape: {target_ids.shape}")
            print(f"   input_ids range: [{input_ids.min().item()}, {input_ids.max().item()}]")
            print(f"   Sample input (first 10 tokens): {input_ids[0, :10].tolist()}")
            print(f"   Decoded: {tokenizer.decode(input_ids[0, :10].tolist())}")
            break

    print("\n" + "=" * 60)
    print("✅ Testing complete!")
    print("=" * 60)
Model_Architecture/turkish_tokenizer_example.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Example usage of Turkish Tokenizer in the data pipeline.
3
+
4
+ This demonstrates how to use the Turkish morphological tokenizer
5
+ for training language models on Turkish text.
6
+ """
7
+
8
+ from data import create_dataloader, TurkishTokenizerWrapper, TURKISH_TOKENIZER_AVAILABLE
9
+ from model import ModelArgs
10
+
11
def main():
    """Walk through Turkish-tokenizer usage: wrapper round-trip, DataLoader
    construction, and inspection of the first training batch."""

    if not TURKISH_TOKENIZER_AVAILABLE:
        print("❌ Turkish tokenizer is not installed!")
        print("Install it with: pip install turkish-tokenizer")
        return

    banner = "=" * 60

    # Sample Turkish text, repeated so there is enough material for training
    turkish_text = """
    Merhaba! Bu bir Türkçe metin örneğidir.
    İstanbul'da yaşıyorum ve Türkçe dilini öğreniyorum.
    Kitap okumak çok güzeldir ve bilgi verir.
    Türkiye Cumhuriyeti'nin başkenti Ankara'dır.
    Yapay zeka ve makine öğrenmesi teknolojileri gelişiyor.
    """ * 100

    print(banner)
    print("TURKISH TOKENIZER EXAMPLE")
    print(banner)

    # --- 1: exercise the wrapper directly -------------------------------
    print("\n1️⃣ Testing Turkish Tokenizer Wrapper")
    tokenizer = TurkishTokenizerWrapper()
    print(f"   Tokenizer: {tokenizer.name}")
    print(f"   Vocabulary size: {tokenizer.n_vocab:,}")

    # Round-trip a sentence through encode/decode.
    sample = "Kitapları okuyorum ve öğreniyorum."
    tokens = tokenizer.encode(sample)
    decoded = tokenizer.decode(tokens)

    print(f"\n   Original: {sample}")
    if len(tokens) > 20:
        print(f"   Tokens ({len(tokens)}): {tokens[:20]}...")
    else:
        print(f"   Tokens: {tokens}")
    print(f"   Decoded: {decoded}")

    # --- 2: build a DataLoader backed by the Turkish tokenizer ----------
    print("\n2️⃣ Creating DataLoader with Turkish Tokenizer")
    args = ModelArgs(
        max_seq_len=128,
        max_batch_size=8,
        vocab_size=tokenizer.n_vocab  # Important: set vocab size for model
    )

    dataloader = create_dataloader(
        txt=turkish_text,
        args=args,
        stride=64,  # 50% overlap
        shuffle=True,
        num_workers=0,
        max_samples=50,  # Limit for testing
        use_turkish_tokenizer=True  # Enable Turkish tokenizer
    )

    print(f"\n   ✅ DataLoader created successfully!")
    print(f"   Sequence length: {args.max_seq_len}")
    print(f"   Batch size: {args.max_batch_size}")
    print(f"   Total batches: {len(dataloader)}")
    print(f"   Total samples: {len(dataloader.dataset)}")

    # --- 3: pull just the first batch and show its shape/content --------
    print("\n3️⃣ Testing First Batch")
    input_ids, target_ids = next(iter(dataloader))
    print(f"\n   Batch 0:")
    print(f"   input_ids shape: {input_ids.shape}")
    print(f"   target_ids shape: {target_ids.shape}")
    print(f"   input_ids range: [{input_ids.min().item()}, {input_ids.max().item()}]")
    print(f"   Sample input (first 10 tokens): {input_ids[0, :10].tolist()}")
    print(f"   Decoded sample: {tokenizer.decode(input_ids[0, :30].tolist())}")

    print("\n" + banner)
    print("✅ Turkish Tokenizer Example Complete!")
    print(banner)

    # Usage tips
    print("\n💡 Usage Tips:")
    print("   • Set vocab_size in ModelArgs to tokenizer.n_vocab")
    print("   • Use use_turkish_tokenizer=True in create_dataloader()")
    print("   • Turkish tokenizer handles morphological analysis automatically")
    print("   • Vocabulary size is optimized for Turkish language")
    print("\n📚 To use in training:")
    print("   tokenizer = TurkishTokenizerWrapper()")
    print("   args = ModelArgs(vocab_size=tokenizer.n_vocab, ...)")
    print("   dataloader = create_dataloader(..., use_turkish_tokenizer=True)")


if __name__ == "__main__":
    main()
turkish_tiktokenizer ADDED
@@ -0,0 +1 @@
 
 
1
+ Subproject commit da42085da3969c1e6822c6df7e4a879a1d9d1583