ikaganacar
/

ismail

Model card Files Files and versions

xet

Community

ikaganacar committed on Nov 12, 2025

Commit

326b359

1 Parent(s): ad359f7

Generation with Turkish Tokenizer

Browse files

Files changed (2) hide show

Model_Architecture/data.py +2 -52
Model_Architecture/generation.py +63 -10

Model_Architecture/data.py CHANGED Viewed

@@ -21,10 +21,6 @@ except ImportError:
 # TURKISH TOKENIZER WRAPPER
 #####################################
 class TurkishTokenizerWrapper:
-    """
-    Wrapper for Turkish Tokenizer to make it compatible with tiktoken interface.
-    This allows seamless integration with the existing TextDataset class.
-    """
     def __init__(self):
         if not TURKISH_TOKENIZER_AVAILABLE:
             raise ImportError(
@@ -35,28 +31,9 @@ class TurkishTokenizerWrapper:
         self.name = "turkish-tokenizer"
     def encode(self, text: str, allowed_special: Optional[set] = None) -> List[int]:
-        """
-        Encode text to token IDs (compatible with tiktoken interface).
-        Args:
-            text: Input text to tokenize
-            allowed_special: Not used for Turkish tokenizer, kept for compatibility
-        Returns:
-            List of token IDs
-        """
         return self.tokenizer.encode(text)
     def decode(self, tokens: List[int]) -> str:
-        """
-        Decode token IDs back to text.
-        Args:
-            tokens: List of token IDs
-        Returns:
-            Decoded text string
-        """
         return self.tokenizer.decode(tokens)
     @property
@@ -75,16 +52,6 @@ class TurkishTokenizerWrapper:
 #####################################
 class TextDataset(Dataset):
     def __init__(self, txt: str, tokenizer, args: ModelArgs, stride: Optional[int] = None, max_samples: Optional[int] = None):
-        """
-        Optimized text dataset with memory-mapped reading and batched tokenization.
-        Args:
-            txt: Text content or path to file
-            tokenizer: Pretrained tokenizer with .encode() method
-            args: ModelArgs containing max_seq_len, max_batch_size
-            stride: Sliding window stride. Defaults to max_seq_len // 2
-            max_samples: Limit number of samples for quick testing
-        """
         self.max_seq_len = args.max_seq_len
         self.stride = stride if stride is not None else self.max_seq_len // 2
@@ -115,7 +82,6 @@ class TextDataset(Dataset):
         print(f"✅ Created {len(self.samples)} training samples")
     def _read_file_mmap(self, file_path: str) -> str:
-        """Memory-efficient file reading for large files"""
         try:
             with open(file_path, 'r', encoding='utf-8') as f:
                 with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm:
@@ -124,7 +90,6 @@ class TextDataset(Dataset):
             raise RuntimeError(f"Failed to read file {file_path}: {e}")
     def _tokenize_with_progress(self, tokenizer, text: str) -> List[int]:
-        """Tokenize with progress bar for large texts"""
         # Process in chunks for memory efficiency
         chunk_size = 10_000_000  # 10MB chunks
         tokens = []
@@ -148,7 +113,6 @@ class TextDataset(Dataset):
         return tokens
     def _create_sliding_windows(self, token_ids: List[int], max_samples: Optional[int]) -> torch.Tensor:
-        """Create overlapping sequences using vectorized operations"""
         if len(token_ids) < self.max_seq_len + 1:
             raise ValueError(f"Not enough tokens. Need {self.max_seq_len + 1}, got {len(token_ids)}")
@@ -194,23 +158,9 @@ def create_dataloader(
     pin_memory: bool = True,
     persistent_workers: bool = False,
     max_samples: Optional[int] = None,
-    use_turkish_tokenizer: bool = False
 ) -> DataLoader:
-    """
-    Optimized DataLoader with proper memory pinning and worker settings.
-    Args:
-        txt: Text content or file path
-        args: ModelArgs configuration
-        stride: Sliding window stride
-        shuffle: Whether to shuffle samples
-        drop_last: Drop incomplete batches
-        num_workers: Number of data loading workers (0 = main process)
-        pin_memory: Pin memory for faster GPU transfer (recommended)
-        persistent_workers: Keep workers alive between epochs (if num_workers > 0)
-        max_samples: Limit samples for testing
-        use_turkish_tokenizer: Use Turkish morphological tokenizer instead of tiktoken
-    """
     # Select tokenizer based on user preference
     if use_turkish_tokenizer:
         if not TURKISH_TOKENIZER_AVAILABLE:

 # TURKISH TOKENIZER WRAPPER
 #####################################
 class TurkishTokenizerWrapper:
     def __init__(self):
         if not TURKISH_TOKENIZER_AVAILABLE:
             raise ImportError(
         self.name = "turkish-tokenizer"
     def encode(self, text: str, allowed_special: Optional[set] = None) -> List[int]:
         return self.tokenizer.encode(text)
     def decode(self, tokens: List[int]) -> str:
         return self.tokenizer.decode(tokens)
     @property
 #####################################
 class TextDataset(Dataset):
     def __init__(self, txt: str, tokenizer, args: ModelArgs, stride: Optional[int] = None, max_samples: Optional[int] = None):
         self.max_seq_len = args.max_seq_len
         self.stride = stride if stride is not None else self.max_seq_len // 2
         print(f"✅ Created {len(self.samples)} training samples")
     def _read_file_mmap(self, file_path: str) -> str:
         try:
             with open(file_path, 'r', encoding='utf-8') as f:
                 with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm:
             raise RuntimeError(f"Failed to read file {file_path}: {e}")
     def _tokenize_with_progress(self, tokenizer, text: str) -> List[int]:
         # Process in chunks for memory efficiency
         chunk_size = 10_000_000  # 10MB chunks
         tokens = []
         return tokens
     def _create_sliding_windows(self, token_ids: List[int], max_samples: Optional[int]) -> torch.Tensor:
         if len(token_ids) < self.max_seq_len + 1:
             raise ValueError(f"Not enough tokens. Need {self.max_seq_len + 1}, got {len(token_ids)}")
     pin_memory: bool = True,
     persistent_workers: bool = False,
     max_samples: Optional[int] = None,
+    use_turkish_tokenizer: bool = True
 ) -> DataLoader:
     # Select tokenizer based on user preference
     if use_turkish_tokenizer:
         if not TURKISH_TOKENIZER_AVAILABLE:

Model_Architecture/generation.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import torch
 import tiktoken
 from model import ismail, ModelArgs
 #####################################
@@ -93,12 +94,17 @@ def text_to_token_ids(text, tokenizer):
     Args:
         text: Input text string
-        tokenizer: Tokenizer instance
     Returns:
         Tensor of token IDs with shape (1, seq_len)
     """
-    encoded = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
     encoded_tensor = torch.tensor(encoded).unsqueeze(0)
     return encoded_tensor
@@ -109,7 +115,7 @@ def token_ids_to_text(token_ids, tokenizer):
     Args:
         token_ids: Tensor of token IDs, can be 1D or 2D
-        tokenizer: Tokenizer instance
     Returns:
         Decoded text string
@@ -123,6 +129,32 @@ def token_ids_to_text(token_ids, tokenizer):
     return tokenizer.decode(flat)
 #####################################
 # EXAMPLE USAGE
 #####################################
@@ -131,6 +163,9 @@ if __name__ == "__main__":
     import json
     from pathlib import Path
     # Example configuration - smaller model for testing
     config_path = Path("config.json")
     if config_path.exists():
@@ -142,22 +177,34 @@ if __name__ == "__main__":
         print("⚠️ config.json not found, using default ModelArgs")
         args = ModelArgs()
-    # Initialize model and tokenizer
     print("Initializing model...")
     torch.manual_seed(123)
     model = ismail(args)
     model.eval()
-    tokenizer_name = getattr(args, "tokenizer_name", "gpt2")
-    tokenizer = tiktoken.get_encoding(tokenizer_name)
     # Example 1: Greedy generation (argmax)
     print(f"\n{'='*60}")
     print("EXAMPLE 1: GREEDY GENERATION (ARGMAX)")
     print(f"{'='*60}")
-    start_context = "Hello, I am"
     print(f"\nInput: '{start_context}'")
     token_ids = text_to_token_ids(start_context, tokenizer)
@@ -179,7 +226,10 @@ if __name__ == "__main__":
     print("EXAMPLE 2: SAMPLING WITH TEMPERATURE")
     print(f"{'='*60}")
-    start_context = "Once upon a time"
     print(f"\nInput: '{start_context}'")
     token_ids = text_to_token_ids(start_context, tokenizer)
@@ -202,7 +252,10 @@ if __name__ == "__main__":
     print("EXAMPLE 3: TOP-K SAMPLING")
     print(f"{'='*60}")
-    start_context = "The future of AI is"
     print(f"\nInput: '{start_context}'")
     token_ids = text_to_token_ids(start_context, tokenizer)

 import torch
 import tiktoken
 from model import ismail, ModelArgs
+from data import TurkishTokenizerWrapper, TURKISH_TOKENIZER_AVAILABLE
 #####################################
     Args:
         text: Input text string
+        tokenizer: Tokenizer instance (tiktoken or TurkishTokenizerWrapper)
     Returns:
         Tensor of token IDs with shape (1, seq_len)
     """
+    # Turkish tokenizer doesn't support allowed_special parameter
+    if isinstance(tokenizer, TurkishTokenizerWrapper):
+        encoded = tokenizer.encode(text)
+    else:
+        encoded = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
     encoded_tensor = torch.tensor(encoded).unsqueeze(0)
     return encoded_tensor
     Args:
         token_ids: Tensor of token IDs, can be 1D or 2D
+        tokenizer: Tokenizer instance (tiktoken or TurkishTokenizerWrapper)
     Returns:
         Decoded text string
     return tokenizer.decode(flat)
+def get_tokenizer(use_turkish=False, tokenizer_name="gpt2"):
+    """
+    Get the appropriate tokenizer based on user preference.
+    Args:
+        use_turkish: Whether to use Turkish tokenizer
+        tokenizer_name: Name of tiktoken tokenizer to use if not using Turkish
+    Returns:
+        Tokenizer instance (TurkishTokenizerWrapper or tiktoken tokenizer)
+    """
+    if use_turkish:
+        if not TURKISH_TOKENIZER_AVAILABLE:
+            raise ImportError(
+                "Turkish tokenizer requested but not available. "
+                "Install it with: pip install turkish-tokenizer"
+            )
+        tokenizer = TurkishTokenizerWrapper()
+        print(f"🇹🇷 Using Turkish Tokenizer (vocab size: {tokenizer.n_vocab:,})")
+        return tokenizer
+    else:
+        tokenizer = tiktoken.get_encoding(tokenizer_name)
+        print(f"📚 Using tiktoken tokenizer: {tokenizer_name} (vocab size: {tokenizer.n_vocab:,})")
+        return tokenizer
 #####################################
 # EXAMPLE USAGE
 #####################################
     import json
     from pathlib import Path
+    # Configuration: Set to True to use Turkish tokenizer, False for tiktoken
+    USE_TURKISH_TOKENIZER = False  # Change this to True for Turkish text generation
     # Example configuration - smaller model for testing
     config_path = Path("config.json")
     if config_path.exists():
         print("⚠️ config.json not found, using default ModelArgs")
         args = ModelArgs()
+    # Initialize tokenizer
+    tokenizer_name = getattr(args, "tokenizer_name", "gpt2")
+    tokenizer = get_tokenizer(
+        use_turkish=USE_TURKISH_TOKENIZER,
+        tokenizer_name=tokenizer_name
+    )
+    # Update vocab size if using Turkish tokenizer
+    if USE_TURKISH_TOKENIZER and isinstance(tokenizer, TurkishTokenizerWrapper):
+        args.vocab_size = tokenizer.n_vocab
+        print(f"📊 Updated vocab_size to {args.vocab_size:,} for Turkish tokenizer")
+    # Initialize model
     print("Initializing model...")
     torch.manual_seed(123)
     model = ismail(args)
     model.eval()
     # Example 1: Greedy generation (argmax)
     print(f"\n{'='*60}")
     print("EXAMPLE 1: GREEDY GENERATION (ARGMAX)")
     print(f"{'='*60}")
+    # Use Turkish or English prompts based on tokenizer
+    if USE_TURKISH_TOKENIZER:
+        start_context = "Merhaba, ben"
+    else:
+        start_context = "Hello, I am"
     print(f"\nInput: '{start_context}'")
     token_ids = text_to_token_ids(start_context, tokenizer)
     print("EXAMPLE 2: SAMPLING WITH TEMPERATURE")
     print(f"{'='*60}")
+    if USE_TURKISH_TOKENIZER:
+        start_context = "Bir varmış bir yokmuş"
+    else:
+        start_context = "Once upon a time"
     print(f"\nInput: '{start_context}'")
     token_ids = text_to_token_ids(start_context, tokenizer)
     print("EXAMPLE 3: TOP-K SAMPLING")
     print(f"{'='*60}")
+    if USE_TURKISH_TOKENIZER:
+        start_context = "Yapay zekanın geleceği"
+    else:
+        start_context = "The future of AI is"
     print(f"\nInput: '{start_context}'")
     token_ids = text_to_token_ids(start_context, tokenizer)