ikaganacar committed on
Commit
326b359
·
1 Parent(s): ad359f7

Generation with Turkish Tokenizer

Browse files
Model_Architecture/data.py CHANGED
@@ -21,10 +21,6 @@ except ImportError:
21
  # TURKISH TOKENIZER WRAPPER
22
  #####################################
23
  class TurkishTokenizerWrapper:
24
- """
25
- Wrapper for Turkish Tokenizer to make it compatible with tiktoken interface.
26
- This allows seamless integration with the existing TextDataset class.
27
- """
28
  def __init__(self):
29
  if not TURKISH_TOKENIZER_AVAILABLE:
30
  raise ImportError(
@@ -35,28 +31,9 @@ class TurkishTokenizerWrapper:
35
  self.name = "turkish-tokenizer"
36
 
37
  def encode(self, text: str, allowed_special: Optional[set] = None) -> List[int]:
38
- """
39
- Encode text to token IDs (compatible with tiktoken interface).
40
-
41
- Args:
42
- text: Input text to tokenize
43
- allowed_special: Not used for Turkish tokenizer, kept for compatibility
44
-
45
- Returns:
46
- List of token IDs
47
- """
48
  return self.tokenizer.encode(text)
49
 
50
  def decode(self, tokens: List[int]) -> str:
51
- """
52
- Decode token IDs back to text.
53
-
54
- Args:
55
- tokens: List of token IDs
56
-
57
- Returns:
58
- Decoded text string
59
- """
60
  return self.tokenizer.decode(tokens)
61
 
62
  @property
@@ -75,16 +52,6 @@ class TurkishTokenizerWrapper:
75
  #####################################
76
  class TextDataset(Dataset):
77
  def __init__(self, txt: str, tokenizer, args: ModelArgs, stride: Optional[int] = None, max_samples: Optional[int] = None):
78
- """
79
- Optimized text dataset with memory-mapped reading and batched tokenization.
80
-
81
- Args:
82
- txt: Text content or path to file
83
- tokenizer: Pretrained tokenizer with .encode() method
84
- args: ModelArgs containing max_seq_len, max_batch_size
85
- stride: Sliding window stride. Defaults to max_seq_len // 2
86
- max_samples: Limit number of samples for quick testing
87
- """
88
  self.max_seq_len = args.max_seq_len
89
  self.stride = stride if stride is not None else self.max_seq_len // 2
90
 
@@ -115,7 +82,6 @@ class TextDataset(Dataset):
115
  print(f"✅ Created {len(self.samples)} training samples")
116
 
117
  def _read_file_mmap(self, file_path: str) -> str:
118
- """Memory-efficient file reading for large files"""
119
  try:
120
  with open(file_path, 'r', encoding='utf-8') as f:
121
  with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm:
@@ -124,7 +90,6 @@ class TextDataset(Dataset):
124
  raise RuntimeError(f"Failed to read file {file_path}: {e}")
125
 
126
  def _tokenize_with_progress(self, tokenizer, text: str) -> List[int]:
127
- """Tokenize with progress bar for large texts"""
128
  # Process in chunks for memory efficiency
129
  chunk_size = 10_000_000 # 10MB chunks
130
  tokens = []
@@ -148,7 +113,6 @@ class TextDataset(Dataset):
148
  return tokens
149
 
150
  def _create_sliding_windows(self, token_ids: List[int], max_samples: Optional[int]) -> torch.Tensor:
151
- """Create overlapping sequences using vectorized operations"""
152
  if len(token_ids) < self.max_seq_len + 1:
153
  raise ValueError(f"Not enough tokens. Need {self.max_seq_len + 1}, got {len(token_ids)}")
154
 
@@ -194,23 +158,9 @@ def create_dataloader(
194
  pin_memory: bool = True,
195
  persistent_workers: bool = False,
196
  max_samples: Optional[int] = None,
197
- use_turkish_tokenizer: bool = False
198
  ) -> DataLoader:
199
- """
200
- Optimized DataLoader with proper memory pinning and worker settings.
201
-
202
- Args:
203
- txt: Text content or file path
204
- args: ModelArgs configuration
205
- stride: Sliding window stride
206
- shuffle: Whether to shuffle samples
207
- drop_last: Drop incomplete batches
208
- num_workers: Number of data loading workers (0 = main process)
209
- pin_memory: Pin memory for faster GPU transfer (recommended)
210
- persistent_workers: Keep workers alive between epochs (if num_workers > 0)
211
- max_samples: Limit samples for testing
212
- use_turkish_tokenizer: Use Turkish morphological tokenizer instead of tiktoken
213
- """
214
  # Select tokenizer based on user preference
215
  if use_turkish_tokenizer:
216
  if not TURKISH_TOKENIZER_AVAILABLE:
 
21
  # TURKISH TOKENIZER WRAPPER
22
  #####################################
23
  class TurkishTokenizerWrapper:
 
 
 
 
24
  def __init__(self):
25
  if not TURKISH_TOKENIZER_AVAILABLE:
26
  raise ImportError(
 
31
  self.name = "turkish-tokenizer"
32
 
33
  def encode(self, text: str, allowed_special: Optional[set] = None) -> List[int]:
 
 
 
 
 
 
 
 
 
 
34
  return self.tokenizer.encode(text)
35
 
36
  def decode(self, tokens: List[int]) -> str:
 
 
 
 
 
 
 
 
 
37
  return self.tokenizer.decode(tokens)
38
 
39
  @property
 
52
  #####################################
53
  class TextDataset(Dataset):
54
  def __init__(self, txt: str, tokenizer, args: ModelArgs, stride: Optional[int] = None, max_samples: Optional[int] = None):
 
 
 
 
 
 
 
 
 
 
55
  self.max_seq_len = args.max_seq_len
56
  self.stride = stride if stride is not None else self.max_seq_len // 2
57
 
 
82
  print(f"✅ Created {len(self.samples)} training samples")
83
 
84
  def _read_file_mmap(self, file_path: str) -> str:
 
85
  try:
86
  with open(file_path, 'r', encoding='utf-8') as f:
87
  with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm:
 
90
  raise RuntimeError(f"Failed to read file {file_path}: {e}")
91
 
92
  def _tokenize_with_progress(self, tokenizer, text: str) -> List[int]:
 
93
  # Process in chunks for memory efficiency
94
  chunk_size = 10_000_000 # 10MB chunks
95
  tokens = []
 
113
  return tokens
114
 
115
  def _create_sliding_windows(self, token_ids: List[int], max_samples: Optional[int]) -> torch.Tensor:
 
116
  if len(token_ids) < self.max_seq_len + 1:
117
  raise ValueError(f"Not enough tokens. Need {self.max_seq_len + 1}, got {len(token_ids)}")
118
 
 
158
  pin_memory: bool = True,
159
  persistent_workers: bool = False,
160
  max_samples: Optional[int] = None,
161
+ use_turkish_tokenizer: bool = True
162
  ) -> DataLoader:
163
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
164
  # Select tokenizer based on user preference
165
  if use_turkish_tokenizer:
166
  if not TURKISH_TOKENIZER_AVAILABLE:
Model_Architecture/generation.py CHANGED
@@ -1,6 +1,7 @@
1
  import torch
2
  import tiktoken
3
  from model import ismail, ModelArgs
 
4
 
5
 
6
  #####################################
@@ -93,12 +94,17 @@ def text_to_token_ids(text, tokenizer):
93
 
94
  Args:
95
  text: Input text string
96
- tokenizer: Tokenizer instance
97
 
98
  Returns:
99
  Tensor of token IDs with shape (1, seq_len)
100
  """
101
- encoded = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
 
 
 
 
 
102
  encoded_tensor = torch.tensor(encoded).unsqueeze(0)
103
  return encoded_tensor
104
 
@@ -109,7 +115,7 @@ def token_ids_to_text(token_ids, tokenizer):
109
 
110
  Args:
111
  token_ids: Tensor of token IDs, can be 1D or 2D
112
- tokenizer: Tokenizer instance
113
 
114
  Returns:
115
  Decoded text string
@@ -123,6 +129,32 @@ def token_ids_to_text(token_ids, tokenizer):
123
  return tokenizer.decode(flat)
124
 
125
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
  #####################################
127
  # EXAMPLE USAGE
128
  #####################################
@@ -131,6 +163,9 @@ if __name__ == "__main__":
131
  import json
132
  from pathlib import Path
133
 
 
 
 
134
  # Example configuration - smaller model for testing
135
  config_path = Path("config.json")
136
  if config_path.exists():
@@ -142,22 +177,34 @@ if __name__ == "__main__":
142
  print("⚠️ config.json not found, using default ModelArgs")
143
  args = ModelArgs()
144
 
 
 
 
 
 
 
 
 
 
 
 
145
 
146
- # Initialize model and tokenizer
147
  print("Initializing model...")
148
  torch.manual_seed(123)
149
  model = ismail(args)
150
  model.eval()
151
 
152
- tokenizer_name = getattr(args, "tokenizer_name", "gpt2")
153
- tokenizer = tiktoken.get_encoding(tokenizer_name)
154
-
155
  # Example 1: Greedy generation (argmax)
156
  print(f"\n{'='*60}")
157
  print("EXAMPLE 1: GREEDY GENERATION (ARGMAX)")
158
  print(f"{'='*60}")
159
 
160
- start_context = "Hello, I am"
 
 
 
 
161
  print(f"\nInput: '{start_context}'")
162
 
163
  token_ids = text_to_token_ids(start_context, tokenizer)
@@ -179,7 +226,10 @@ if __name__ == "__main__":
179
  print("EXAMPLE 2: SAMPLING WITH TEMPERATURE")
180
  print(f"{'='*60}")
181
 
182
- start_context = "Once upon a time"
 
 
 
183
  print(f"\nInput: '{start_context}'")
184
 
185
  token_ids = text_to_token_ids(start_context, tokenizer)
@@ -202,7 +252,10 @@ if __name__ == "__main__":
202
  print("EXAMPLE 3: TOP-K SAMPLING")
203
  print(f"{'='*60}")
204
 
205
- start_context = "The future of AI is"
 
 
 
206
  print(f"\nInput: '{start_context}'")
207
 
208
  token_ids = text_to_token_ids(start_context, tokenizer)
 
1
  import torch
2
  import tiktoken
3
  from model import ismail, ModelArgs
4
+ from data import TurkishTokenizerWrapper, TURKISH_TOKENIZER_AVAILABLE
5
 
6
 
7
  #####################################
 
94
 
95
  Args:
96
  text: Input text string
97
+ tokenizer: Tokenizer instance (tiktoken or TurkishTokenizerWrapper)
98
 
99
  Returns:
100
  Tensor of token IDs with shape (1, seq_len)
101
  """
102
+ # Turkish tokenizer ignores allowed_special (the wrapper accepts it only for tiktoken compatibility), so omit it
103
+ if isinstance(tokenizer, TurkishTokenizerWrapper):
104
+ encoded = tokenizer.encode(text)
105
+ else:
106
+ encoded = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
107
+
108
  encoded_tensor = torch.tensor(encoded).unsqueeze(0)
109
  return encoded_tensor
110
 
 
115
 
116
  Args:
117
  token_ids: Tensor of token IDs, can be 1D or 2D
118
+ tokenizer: Tokenizer instance (tiktoken or TurkishTokenizerWrapper)
119
 
120
  Returns:
121
  Decoded text string
 
129
  return tokenizer.decode(flat)
130
 
131
 
132
+ def get_tokenizer(use_turkish=False, tokenizer_name="gpt2"):
133
+ """
134
+ Get the appropriate tokenizer based on user preference.
135
+
136
+ Args:
137
+ use_turkish: Whether to use Turkish tokenizer
138
+ tokenizer_name: Name of tiktoken tokenizer to use if not using Turkish
139
+
140
+ Returns:
141
+ Tokenizer instance (TurkishTokenizerWrapper or tiktoken tokenizer)
142
+ """
143
+ if use_turkish:
144
+ if not TURKISH_TOKENIZER_AVAILABLE:
145
+ raise ImportError(
146
+ "Turkish tokenizer requested but not available. "
147
+ "Install it with: pip install turkish-tokenizer"
148
+ )
149
+ tokenizer = TurkishTokenizerWrapper()
150
+ print(f"🇹🇷 Using Turkish Tokenizer (vocab size: {tokenizer.n_vocab:,})")
151
+ return tokenizer
152
+ else:
153
+ tokenizer = tiktoken.get_encoding(tokenizer_name)
154
+ print(f"📚 Using tiktoken tokenizer: {tokenizer_name} (vocab size: {tokenizer.n_vocab:,})")
155
+ return tokenizer
156
+
157
+
158
  #####################################
159
  # EXAMPLE USAGE
160
  #####################################
 
163
  import json
164
  from pathlib import Path
165
 
166
+ # Configuration: Set to True to use Turkish tokenizer, False for tiktoken
167
+ USE_TURKISH_TOKENIZER = False # Change this to True for Turkish text generation
168
+
169
  # Example configuration - smaller model for testing
170
  config_path = Path("config.json")
171
  if config_path.exists():
 
177
  print("⚠️ config.json not found, using default ModelArgs")
178
  args = ModelArgs()
179
 
180
+ # Initialize tokenizer
181
+ tokenizer_name = getattr(args, "tokenizer_name", "gpt2")
182
+ tokenizer = get_tokenizer(
183
+ use_turkish=USE_TURKISH_TOKENIZER,
184
+ tokenizer_name=tokenizer_name
185
+ )
186
+
187
+ # Update vocab size if using Turkish tokenizer
188
+ if USE_TURKISH_TOKENIZER and isinstance(tokenizer, TurkishTokenizerWrapper):
189
+ args.vocab_size = tokenizer.n_vocab
190
+ print(f"📊 Updated vocab_size to {args.vocab_size:,} for Turkish tokenizer")
191
 
192
+ # Initialize model
193
  print("Initializing model...")
194
  torch.manual_seed(123)
195
  model = ismail(args)
196
  model.eval()
197
 
 
 
 
198
  # Example 1: Greedy generation (argmax)
199
  print(f"\n{'='*60}")
200
  print("EXAMPLE 1: GREEDY GENERATION (ARGMAX)")
201
  print(f"{'='*60}")
202
 
203
+ # Use Turkish or English prompts based on tokenizer
204
+ if USE_TURKISH_TOKENIZER:
205
+ start_context = "Merhaba, ben"
206
+ else:
207
+ start_context = "Hello, I am"
208
  print(f"\nInput: '{start_context}'")
209
 
210
  token_ids = text_to_token_ids(start_context, tokenizer)
 
226
  print("EXAMPLE 2: SAMPLING WITH TEMPERATURE")
227
  print(f"{'='*60}")
228
 
229
+ if USE_TURKISH_TOKENIZER:
230
+ start_context = "Bir varmış bir yokmuş"
231
+ else:
232
+ start_context = "Once upon a time"
233
  print(f"\nInput: '{start_context}'")
234
 
235
  token_ids = text_to_token_ids(start_context, tokenizer)
 
252
  print("EXAMPLE 3: TOP-K SAMPLING")
253
  print(f"{'='*60}")
254
 
255
+ if USE_TURKISH_TOKENIZER:
256
+ start_context = "Yapay zekanın geleceği"
257
+ else:
258
+ start_context = "The future of AI is"
259
  print(f"\nInput: '{start_context}'")
260
 
261
  token_ids = text_to_token_ids(start_context, tokenizer)