Commit ·
326b359
1
Parent(s): ad359f7
Generation with Turkish Tokenizer
Browse files
- Model_Architecture/data.py +2 -52
- Model_Architecture/generation.py +63 -10
Model_Architecture/data.py
CHANGED
|
@@ -21,10 +21,6 @@ except ImportError:
|
|
| 21 |
# TURKISH TOKENIZER WRAPPER
|
| 22 |
#####################################
|
| 23 |
class TurkishTokenizerWrapper:
|
| 24 |
-
"""
|
| 25 |
-
Wrapper for Turkish Tokenizer to make it compatible with tiktoken interface.
|
| 26 |
-
This allows seamless integration with the existing TextDataset class.
|
| 27 |
-
"""
|
| 28 |
def __init__(self):
|
| 29 |
if not TURKISH_TOKENIZER_AVAILABLE:
|
| 30 |
raise ImportError(
|
|
@@ -35,28 +31,9 @@ class TurkishTokenizerWrapper:
|
|
| 35 |
self.name = "turkish-tokenizer"
|
| 36 |
|
| 37 |
def encode(self, text: str, allowed_special: Optional[set] = None) -> List[int]:
|
| 38 |
-
"""
|
| 39 |
-
Encode text to token IDs (compatible with tiktoken interface).
|
| 40 |
-
|
| 41 |
-
Args:
|
| 42 |
-
text: Input text to tokenize
|
| 43 |
-
allowed_special: Not used for Turkish tokenizer, kept for compatibility
|
| 44 |
-
|
| 45 |
-
Returns:
|
| 46 |
-
List of token IDs
|
| 47 |
-
"""
|
| 48 |
return self.tokenizer.encode(text)
|
| 49 |
|
| 50 |
def decode(self, tokens: List[int]) -> str:
|
| 51 |
-
"""
|
| 52 |
-
Decode token IDs back to text.
|
| 53 |
-
|
| 54 |
-
Args:
|
| 55 |
-
tokens: List of token IDs
|
| 56 |
-
|
| 57 |
-
Returns:
|
| 58 |
-
Decoded text string
|
| 59 |
-
"""
|
| 60 |
return self.tokenizer.decode(tokens)
|
| 61 |
|
| 62 |
@property
|
|
@@ -75,16 +52,6 @@ class TurkishTokenizerWrapper:
|
|
| 75 |
#####################################
|
| 76 |
class TextDataset(Dataset):
|
| 77 |
def __init__(self, txt: str, tokenizer, args: ModelArgs, stride: Optional[int] = None, max_samples: Optional[int] = None):
|
| 78 |
-
"""
|
| 79 |
-
Optimized text dataset with memory-mapped reading and batched tokenization.
|
| 80 |
-
|
| 81 |
-
Args:
|
| 82 |
-
txt: Text content or path to file
|
| 83 |
-
tokenizer: Pretrained tokenizer with .encode() method
|
| 84 |
-
args: ModelArgs containing max_seq_len, max_batch_size
|
| 85 |
-
stride: Sliding window stride. Defaults to max_seq_len // 2
|
| 86 |
-
max_samples: Limit number of samples for quick testing
|
| 87 |
-
"""
|
| 88 |
self.max_seq_len = args.max_seq_len
|
| 89 |
self.stride = stride if stride is not None else self.max_seq_len // 2
|
| 90 |
|
|
@@ -115,7 +82,6 @@ class TextDataset(Dataset):
|
|
| 115 |
print(f"✅ Created {len(self.samples)} training samples")
|
| 116 |
|
| 117 |
def _read_file_mmap(self, file_path: str) -> str:
|
| 118 |
-
"""Memory-efficient file reading for large files"""
|
| 119 |
try:
|
| 120 |
with open(file_path, 'r', encoding='utf-8') as f:
|
| 121 |
with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm:
|
|
@@ -124,7 +90,6 @@ class TextDataset(Dataset):
|
|
| 124 |
raise RuntimeError(f"Failed to read file {file_path}: {e}")
|
| 125 |
|
| 126 |
def _tokenize_with_progress(self, tokenizer, text: str) -> List[int]:
|
| 127 |
-
"""Tokenize with progress bar for large texts"""
|
| 128 |
# Process in chunks for memory efficiency
|
| 129 |
chunk_size = 10_000_000 # 10MB chunks
|
| 130 |
tokens = []
|
|
@@ -148,7 +113,6 @@ class TextDataset(Dataset):
|
|
| 148 |
return tokens
|
| 149 |
|
| 150 |
def _create_sliding_windows(self, token_ids: List[int], max_samples: Optional[int]) -> torch.Tensor:
|
| 151 |
-
"""Create overlapping sequences using vectorized operations"""
|
| 152 |
if len(token_ids) < self.max_seq_len + 1:
|
| 153 |
raise ValueError(f"Not enough tokens. Need {self.max_seq_len + 1}, got {len(token_ids)}")
|
| 154 |
|
|
@@ -194,23 +158,9 @@ def create_dataloader(
|
|
| 194 |
pin_memory: bool = True,
|
| 195 |
persistent_workers: bool = False,
|
| 196 |
max_samples: Optional[int] = None,
|
| 197 |
-
use_turkish_tokenizer: bool =
|
| 198 |
) -> DataLoader:
|
| 199 |
-
|
| 200 |
-
Optimized DataLoader with proper memory pinning and worker settings.
|
| 201 |
-
|
| 202 |
-
Args:
|
| 203 |
-
txt: Text content or file path
|
| 204 |
-
args: ModelArgs configuration
|
| 205 |
-
stride: Sliding window stride
|
| 206 |
-
shuffle: Whether to shuffle samples
|
| 207 |
-
drop_last: Drop incomplete batches
|
| 208 |
-
num_workers: Number of data loading workers (0 = main process)
|
| 209 |
-
pin_memory: Pin memory for faster GPU transfer (recommended)
|
| 210 |
-
persistent_workers: Keep workers alive between epochs (if num_workers > 0)
|
| 211 |
-
max_samples: Limit samples for testing
|
| 212 |
-
use_turkish_tokenizer: Use Turkish morphological tokenizer instead of tiktoken
|
| 213 |
-
"""
|
| 214 |
# Select tokenizer based on user preference
|
| 215 |
if use_turkish_tokenizer:
|
| 216 |
if not TURKISH_TOKENIZER_AVAILABLE:
|
|
|
|
| 21 |
# TURKISH TOKENIZER WRAPPER
|
| 22 |
#####################################
|
| 23 |
class TurkishTokenizerWrapper:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
def __init__(self):
|
| 25 |
if not TURKISH_TOKENIZER_AVAILABLE:
|
| 26 |
raise ImportError(
|
|
|
|
| 31 |
self.name = "turkish-tokenizer"
|
| 32 |
|
| 33 |
def encode(self, text: str, allowed_special: Optional[set] = None) -> List[int]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
return self.tokenizer.encode(text)
|
| 35 |
|
| 36 |
def decode(self, tokens: List[int]) -> str:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
return self.tokenizer.decode(tokens)
|
| 38 |
|
| 39 |
@property
|
|
|
|
| 52 |
#####################################
|
| 53 |
class TextDataset(Dataset):
|
| 54 |
def __init__(self, txt: str, tokenizer, args: ModelArgs, stride: Optional[int] = None, max_samples: Optional[int] = None):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
self.max_seq_len = args.max_seq_len
|
| 56 |
self.stride = stride if stride is not None else self.max_seq_len // 2
|
| 57 |
|
|
|
|
| 82 |
print(f"✅ Created {len(self.samples)} training samples")
|
| 83 |
|
| 84 |
def _read_file_mmap(self, file_path: str) -> str:
|
|
|
|
| 85 |
try:
|
| 86 |
with open(file_path, 'r', encoding='utf-8') as f:
|
| 87 |
with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm:
|
|
|
|
| 90 |
raise RuntimeError(f"Failed to read file {file_path}: {e}")
|
| 91 |
|
| 92 |
def _tokenize_with_progress(self, tokenizer, text: str) -> List[int]:
|
|
|
|
| 93 |
# Process in chunks for memory efficiency
|
| 94 |
chunk_size = 10_000_000 # 10MB chunks
|
| 95 |
tokens = []
|
|
|
|
| 113 |
return tokens
|
| 114 |
|
| 115 |
def _create_sliding_windows(self, token_ids: List[int], max_samples: Optional[int]) -> torch.Tensor:
|
|
|
|
| 116 |
if len(token_ids) < self.max_seq_len + 1:
|
| 117 |
raise ValueError(f"Not enough tokens. Need {self.max_seq_len + 1}, got {len(token_ids)}")
|
| 118 |
|
|
|
|
| 158 |
pin_memory: bool = True,
|
| 159 |
persistent_workers: bool = False,
|
| 160 |
max_samples: Optional[int] = None,
|
| 161 |
+
use_turkish_tokenizer: bool = True
|
| 162 |
) -> DataLoader:
|
| 163 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 164 |
# Select tokenizer based on user preference
|
| 165 |
if use_turkish_tokenizer:
|
| 166 |
if not TURKISH_TOKENIZER_AVAILABLE:
|
Model_Architecture/generation.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
import torch
|
| 2 |
import tiktoken
|
| 3 |
from model import ismail, ModelArgs
|
|
|
|
| 4 |
|
| 5 |
|
| 6 |
#####################################
|
|
@@ -93,12 +94,17 @@ def text_to_token_ids(text, tokenizer):
|
|
| 93 |
|
| 94 |
Args:
|
| 95 |
text: Input text string
|
| 96 |
-
tokenizer: Tokenizer instance
|
| 97 |
|
| 98 |
Returns:
|
| 99 |
Tensor of token IDs with shape (1, seq_len)
|
| 100 |
"""
|
| 101 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
encoded_tensor = torch.tensor(encoded).unsqueeze(0)
|
| 103 |
return encoded_tensor
|
| 104 |
|
|
@@ -109,7 +115,7 @@ def token_ids_to_text(token_ids, tokenizer):
|
|
| 109 |
|
| 110 |
Args:
|
| 111 |
token_ids: Tensor of token IDs, can be 1D or 2D
|
| 112 |
-
tokenizer: Tokenizer instance
|
| 113 |
|
| 114 |
Returns:
|
| 115 |
Decoded text string
|
|
@@ -123,6 +129,32 @@ def token_ids_to_text(token_ids, tokenizer):
|
|
| 123 |
return tokenizer.decode(flat)
|
| 124 |
|
| 125 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
#####################################
|
| 127 |
# EXAMPLE USAGE
|
| 128 |
#####################################
|
|
@@ -131,6 +163,9 @@ if __name__ == "__main__":
|
|
| 131 |
import json
|
| 132 |
from pathlib import Path
|
| 133 |
|
|
|
|
|
|
|
|
|
|
| 134 |
# Example configuration - smaller model for testing
|
| 135 |
config_path = Path("config.json")
|
| 136 |
if config_path.exists():
|
|
@@ -142,22 +177,34 @@ if __name__ == "__main__":
|
|
| 142 |
print("⚠️ config.json not found, using default ModelArgs")
|
| 143 |
args = ModelArgs()
|
| 144 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
|
| 146 |
-
# Initialize model
|
| 147 |
print("Initializing model...")
|
| 148 |
torch.manual_seed(123)
|
| 149 |
model = ismail(args)
|
| 150 |
model.eval()
|
| 151 |
|
| 152 |
-
tokenizer_name = getattr(args, "tokenizer_name", "gpt2")
|
| 153 |
-
tokenizer = tiktoken.get_encoding(tokenizer_name)
|
| 154 |
-
|
| 155 |
# Example 1: Greedy generation (argmax)
|
| 156 |
print(f"\n{'='*60}")
|
| 157 |
print("EXAMPLE 1: GREEDY GENERATION (ARGMAX)")
|
| 158 |
print(f"{'='*60}")
|
| 159 |
|
| 160 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 161 |
print(f"\nInput: '{start_context}'")
|
| 162 |
|
| 163 |
token_ids = text_to_token_ids(start_context, tokenizer)
|
|
@@ -179,7 +226,10 @@ if __name__ == "__main__":
|
|
| 179 |
print("EXAMPLE 2: SAMPLING WITH TEMPERATURE")
|
| 180 |
print(f"{'='*60}")
|
| 181 |
|
| 182 |
-
|
|
|
|
|
|
|
|
|
|
| 183 |
print(f"\nInput: '{start_context}'")
|
| 184 |
|
| 185 |
token_ids = text_to_token_ids(start_context, tokenizer)
|
|
@@ -202,7 +252,10 @@ if __name__ == "__main__":
|
|
| 202 |
print("EXAMPLE 3: TOP-K SAMPLING")
|
| 203 |
print(f"{'='*60}")
|
| 204 |
|
| 205 |
-
|
|
|
|
|
|
|
|
|
|
| 206 |
print(f"\nInput: '{start_context}'")
|
| 207 |
|
| 208 |
token_ids = text_to_token_ids(start_context, tokenizer)
|
|
|
|
| 1 |
import torch
|
| 2 |
import tiktoken
|
| 3 |
from model import ismail, ModelArgs
|
| 4 |
+
from data import TurkishTokenizerWrapper, TURKISH_TOKENIZER_AVAILABLE
|
| 5 |
|
| 6 |
|
| 7 |
#####################################
|
|
|
|
| 94 |
|
| 95 |
Args:
|
| 96 |
text: Input text string
|
| 97 |
+
tokenizer: Tokenizer instance (tiktoken or TurkishTokenizerWrapper)
|
| 98 |
|
| 99 |
Returns:
|
| 100 |
Tensor of token IDs with shape (1, seq_len)
|
| 101 |
"""
|
| 102 |
+
# Turkish tokenizer doesn't support allowed_special parameter
|
| 103 |
+
if isinstance(tokenizer, TurkishTokenizerWrapper):
|
| 104 |
+
encoded = tokenizer.encode(text)
|
| 105 |
+
else:
|
| 106 |
+
encoded = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
|
| 107 |
+
|
| 108 |
encoded_tensor = torch.tensor(encoded).unsqueeze(0)
|
| 109 |
return encoded_tensor
|
| 110 |
|
|
|
|
| 115 |
|
| 116 |
Args:
|
| 117 |
token_ids: Tensor of token IDs, can be 1D or 2D
|
| 118 |
+
tokenizer: Tokenizer instance (tiktoken or TurkishTokenizerWrapper)
|
| 119 |
|
| 120 |
Returns:
|
| 121 |
Decoded text string
|
|
|
|
| 129 |
return tokenizer.decode(flat)
|
| 130 |
|
| 131 |
|
| 132 |
+
def get_tokenizer(use_turkish=False, tokenizer_name="gpt2"):
|
| 133 |
+
"""
|
| 134 |
+
Get the appropriate tokenizer based on user preference.
|
| 135 |
+
|
| 136 |
+
Args:
|
| 137 |
+
use_turkish: Whether to use Turkish tokenizer
|
| 138 |
+
tokenizer_name: Name of tiktoken tokenizer to use if not using Turkish
|
| 139 |
+
|
| 140 |
+
Returns:
|
| 141 |
+
Tokenizer instance (TurkishTokenizerWrapper or tiktoken tokenizer)
|
| 142 |
+
"""
|
| 143 |
+
if use_turkish:
|
| 144 |
+
if not TURKISH_TOKENIZER_AVAILABLE:
|
| 145 |
+
raise ImportError(
|
| 146 |
+
"Turkish tokenizer requested but not available. "
|
| 147 |
+
"Install it with: pip install turkish-tokenizer"
|
| 148 |
+
)
|
| 149 |
+
tokenizer = TurkishTokenizerWrapper()
|
| 150 |
+
print(f"🇹🇷 Using Turkish Tokenizer (vocab size: {tokenizer.n_vocab:,})")
|
| 151 |
+
return tokenizer
|
| 152 |
+
else:
|
| 153 |
+
tokenizer = tiktoken.get_encoding(tokenizer_name)
|
| 154 |
+
print(f"📚 Using tiktoken tokenizer: {tokenizer_name} (vocab size: {tokenizer.n_vocab:,})")
|
| 155 |
+
return tokenizer
|
| 156 |
+
|
| 157 |
+
|
| 158 |
#####################################
|
| 159 |
# EXAMPLE USAGE
|
| 160 |
#####################################
|
|
|
|
| 163 |
import json
|
| 164 |
from pathlib import Path
|
| 165 |
|
| 166 |
+
# Configuration: Set to True to use Turkish tokenizer, False for tiktoken
|
| 167 |
+
USE_TURKISH_TOKENIZER = False # Change this to True for Turkish text generation
|
| 168 |
+
|
| 169 |
# Example configuration - smaller model for testing
|
| 170 |
config_path = Path("config.json")
|
| 171 |
if config_path.exists():
|
|
|
|
| 177 |
print("⚠️ config.json not found, using default ModelArgs")
|
| 178 |
args = ModelArgs()
|
| 179 |
|
| 180 |
+
# Initialize tokenizer
|
| 181 |
+
tokenizer_name = getattr(args, "tokenizer_name", "gpt2")
|
| 182 |
+
tokenizer = get_tokenizer(
|
| 183 |
+
use_turkish=USE_TURKISH_TOKENIZER,
|
| 184 |
+
tokenizer_name=tokenizer_name
|
| 185 |
+
)
|
| 186 |
+
|
| 187 |
+
# Update vocab size if using Turkish tokenizer
|
| 188 |
+
if USE_TURKISH_TOKENIZER and isinstance(tokenizer, TurkishTokenizerWrapper):
|
| 189 |
+
args.vocab_size = tokenizer.n_vocab
|
| 190 |
+
print(f"📊 Updated vocab_size to {args.vocab_size:,} for Turkish tokenizer")
|
| 191 |
|
| 192 |
+
# Initialize model
|
| 193 |
print("Initializing model...")
|
| 194 |
torch.manual_seed(123)
|
| 195 |
model = ismail(args)
|
| 196 |
model.eval()
|
| 197 |
|
|
|
|
|
|
|
|
|
|
| 198 |
# Example 1: Greedy generation (argmax)
|
| 199 |
print(f"\n{'='*60}")
|
| 200 |
print("EXAMPLE 1: GREEDY GENERATION (ARGMAX)")
|
| 201 |
print(f"{'='*60}")
|
| 202 |
|
| 203 |
+
# Use Turkish or English prompts based on tokenizer
|
| 204 |
+
if USE_TURKISH_TOKENIZER:
|
| 205 |
+
start_context = "Merhaba, ben"
|
| 206 |
+
else:
|
| 207 |
+
start_context = "Hello, I am"
|
| 208 |
print(f"\nInput: '{start_context}'")
|
| 209 |
|
| 210 |
token_ids = text_to_token_ids(start_context, tokenizer)
|
|
|
|
| 226 |
print("EXAMPLE 2: SAMPLING WITH TEMPERATURE")
|
| 227 |
print(f"{'='*60}")
|
| 228 |
|
| 229 |
+
if USE_TURKISH_TOKENIZER:
|
| 230 |
+
start_context = "Bir varmış bir yokmuş"
|
| 231 |
+
else:
|
| 232 |
+
start_context = "Once upon a time"
|
| 233 |
print(f"\nInput: '{start_context}'")
|
| 234 |
|
| 235 |
token_ids = text_to_token_ids(start_context, tokenizer)
|
|
|
|
| 252 |
print("EXAMPLE 3: TOP-K SAMPLING")
|
| 253 |
print(f"{'='*60}")
|
| 254 |
|
| 255 |
+
if USE_TURKISH_TOKENIZER:
|
| 256 |
+
start_context = "Yapay zekanın geleceği"
|
| 257 |
+
else:
|
| 258 |
+
start_context = "The future of AI is"
|
| 259 |
print(f"\nInput: '{start_context}'")
|
| 260 |
|
| 261 |
token_ids = text_to_token_ids(start_context, tokenizer)
|