karthick committed
Commit fb67af8 · 1 Parent(s): d99ca15

Upload TinyStories 24.5M model - article generation success

src/__init__.py ADDED
@@ -0,0 +1,3 @@
+ """TinyStories Language Model - 24.5M Parameters"""
+
+ __version__ = "1.0.0"
src/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (265 Bytes).
 
src/data/__init__.py ADDED
@@ -0,0 +1,15 @@
+ """Data processing modules for TinyStories training."""
+
+ from .tokenizer import load_tokenizer, train_tokenizer, test_tokenizer
+ from .dataset import TinyStoriesDataset, create_dataloaders
+ from .quality_checker import check_dataset_quality, DataQualityChecker
+
+ __all__ = [
+     'load_tokenizer',
+     'train_tokenizer',
+     'test_tokenizer',
+     'TinyStoriesDataset',
+     'create_dataloaders',
+     'check_dataset_quality',
+     'DataQualityChecker',
+ ]
src/data/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (584 Bytes).
 
src/data/__pycache__/dataset.cpython-313.pyc ADDED
Binary file (11.6 kB).
 
src/data/__pycache__/quality_checker.cpython-313.pyc ADDED
Binary file (17.8 kB).
 
src/data/__pycache__/tokenizer.cpython-313.pyc ADDED
Binary file (11.1 kB).
 
src/data/dataset.py ADDED
@@ -0,0 +1,302 @@
+ """Dataset and DataLoader utilities for TinyStories training.
+
+ This module provides:
+ 1. TinyStoriesDataset class for loading and processing TinyStories
+ 2. create_dataloaders function for creating train/val DataLoaders
+ 3. Sequence packing for efficient training
+
+ TinyStories is a synthetic dataset of short stories generated by GPT-3.5/4
+ using a limited vocabulary suitable for children, which makes it ideal for
+ fast training and for testing language models.
+ """
+
+ import torch
+ from torch.utils.data import Dataset, DataLoader
+ from datasets import load_dataset
+ from pathlib import Path
+ import pickle
+ import logging
+ from typing import Dict, List, Tuple, Optional
+ from tqdm import tqdm
+
+ logger = logging.getLogger(__name__)
+
+
+ class TinyStoriesDataset(Dataset):
+     """TinyStories dataset with sequence packing for efficient training.
+
+     TinyStories is a synthetic dataset of short stories generated by GPT-3.5/4
+     using a limited vocabulary suitable for children. The dataset contains
+     ~2.1M stories and is well suited for:
+     - Fast training (only ~1GB)
+     - Clean, well-formed English
+     - Testing model architecture
+     - Educational purposes
+
+     This dataset:
+     1. Loads TinyStories from HuggingFace datasets
+     2. Tokenizes the text
+     3. Packs sequences to max_seq_len for efficiency
+     4. Caches processed data for fast subsequent loading
+     """
+
+     def __init__(
+         self,
+         tokenizer,
+         split: str = "train",
+         max_seq_len: int = 512,
+         cache_dir: Optional[str] = None,
+     ):
+         """Initialize TinyStories dataset.
+
+         Args:
+             tokenizer: Tokenizer instance (must have an encode method)
+             split: Dataset split ("train" or "validation")
+             max_seq_len: Maximum sequence length (default: 512, matching the official paper)
+             cache_dir: Directory for caching processed data
+         """
+         self.tokenizer = tokenizer
+         self.split = split
+         self.max_seq_len = max_seq_len
+         self.cache_dir = Path(cache_dir) if cache_dir else Path("./data/cache")
+         self.cache_dir.mkdir(parents=True, exist_ok=True)
+
+         # Cache file path
+         cache_file = self.cache_dir / f"tinystories_{split}_{max_seq_len}.pkl"
+
+         # Try to load from cache
+         if cache_file.exists():
+             logger.info(f"Loading cached dataset from {cache_file}")
+             with open(cache_file, "rb") as f:
+                 cache_data = pickle.load(f)
+             self.input_ids = cache_data["input_ids"]
+             self.labels = cache_data["labels"]
+             logger.info(f"Loaded {len(self.input_ids)} sequences from cache")
+         else:
+             # Process dataset
+             logger.info(f"Processing TinyStories {split} split...")
+             self.input_ids, self.labels = self._process_dataset()
+
+             # Save to cache
+             logger.info(f"Saving processed dataset to {cache_file}")
+             cache_data = {
+                 "input_ids": self.input_ids,
+                 "labels": self.labels,
+             }
+             with open(cache_file, "wb") as f:
+                 pickle.dump(cache_data, f)
+
+         logger.info(f"Dataset ready: {len(self.input_ids)} sequences")
+
+     def _process_dataset(self) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
+         """Process TinyStories dataset into packed sequences.
+
+         Returns:
+             Tuple of (input_ids, labels) lists
+         """
+         # Load dataset
+         dataset = load_dataset(
+             "roneneldan/TinyStories",
+             split=self.split,
+         )
+
+         # Tokenize all text
+         logger.info("Tokenizing dataset...")
+         all_token_ids = []
+
+         for example in tqdm(dataset, desc="Tokenizing"):
+             text = example["text"].strip()
+             if len(text) > 0:  # Skip empty stories
+                 # Encode text
+                 if hasattr(self.tokenizer, 'encode'):
+                     token_ids = self.tokenizer.encode(text, add_special_tokens=False)
+                 else:
+                     # Fallback for tokenizers.Tokenizer
+                     token_ids = self.tokenizer.tokenizer.encode(text).ids
+
+                 all_token_ids.extend(token_ids)
+
+         logger.info(f"Total tokens: {len(all_token_ids):,}")
+
+         # Pack into sequences
+         logger.info("Packing sequences...")
+         input_ids_list = []
+         labels_list = []
+
+         # Cut the token stream into non-overlapping windows of max_seq_len tokens
+         for i in range(0, len(all_token_ids) - 1, self.max_seq_len):
+             # Get sequence
+             seq = all_token_ids[i : i + self.max_seq_len]
+
+             # Skip if too short
+             if len(seq) < 2:
+                 continue
+
+             # Create input_ids and labels (next-token shift)
+             # input_ids: [0, 1, 2, ..., n-1]
+             # labels:    [1, 2, 3, ..., n]
+             input_ids = torch.tensor(seq[:-1], dtype=torch.long)
+             labels = torch.tensor(seq[1:], dtype=torch.long)
+
+             # Pad if necessary
+             if len(input_ids) < self.max_seq_len:
+                 pad_len = self.max_seq_len - len(input_ids)
+                 input_ids = torch.cat([
+                     input_ids,
+                     torch.full((pad_len,), self.tokenizer.pad_token_id, dtype=torch.long)
+                 ])
+                 labels = torch.cat([
+                     labels,
+                     torch.full((pad_len,), -100, dtype=torch.long)  # -100 is ignored in the loss
+                 ])
+
+             input_ids_list.append(input_ids)
+             labels_list.append(labels)
+
+         logger.info(f"Created {len(input_ids_list)} packed sequences")
+
+         return input_ids_list, labels_list
+
+     def __len__(self) -> int:
+         """Return number of sequences."""
+         return len(self.input_ids)
+
+     def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
+         """Get a single sequence.
+
+         Args:
+             idx: Sequence index
+
+         Returns:
+             Dictionary with 'input_ids' and 'labels'
+         """
+         return {
+             "input_ids": self.input_ids[idx],
+             "labels": self.labels[idx],
+         }
+
+
+ def collate_fn(batch: List[Dict[str, torch.Tensor]]) -> Dict[str, torch.Tensor]:
+     """Collate function for DataLoader.
+
+     Args:
+         batch: List of dictionaries with 'input_ids' and 'labels'
+
+     Returns:
+         Batched dictionary
+     """
+     input_ids = torch.stack([item["input_ids"] for item in batch])
+     labels = torch.stack([item["labels"] for item in batch])
+
+     return {
+         "input_ids": input_ids,
+         "labels": labels,
+     }
+
+
+ def create_dataloaders(
+     tokenizer,
+     batch_size: int,
+     max_seq_len: int,
+     cache_dir: str,
+     dataset_name: str = "tinystories",
+     num_workers: int = 0,
+     pin_memory: bool = True,
+     drop_last: bool = True,
+ ) -> Tuple[DataLoader, DataLoader]:
+     """Create train and validation DataLoaders for TinyStories.
+
+     Args:
+         tokenizer: Tokenizer instance
+         batch_size: Batch size per device
+         max_seq_len: Maximum sequence length (512 recommended for TinyStories)
+         cache_dir: Directory for caching processed data
+         dataset_name: Dataset to use (default: "tinystories")
+         num_workers: Number of data loading workers (use 0 on Windows)
+         pin_memory: Whether to pin memory for faster GPU transfer
+         drop_last: Whether to drop the last incomplete batch
+
+     Returns:
+         Tuple of (train_loader, val_loader)
+     """
+     logger.info("Using TinyStories dataset")
+
+     logger.info("Creating train dataset...")
+     train_dataset = TinyStoriesDataset(
+         tokenizer=tokenizer,
+         split="train",
+         max_seq_len=max_seq_len,
+         cache_dir=cache_dir,
+     )
+
+     logger.info("Creating validation dataset...")
+     val_dataset = TinyStoriesDataset(
+         tokenizer=tokenizer,
+         split="validation",
+         max_seq_len=max_seq_len,
+         cache_dir=cache_dir,
+     )
+
+     # Create DataLoaders
+     train_loader = DataLoader(
+         train_dataset,
+         batch_size=batch_size,
+         shuffle=True,
+         num_workers=num_workers,
+         pin_memory=pin_memory,
+         drop_last=drop_last,
+         collate_fn=collate_fn,
+     )
+
+     val_loader = DataLoader(
+         val_dataset,
+         batch_size=batch_size,
+         shuffle=False,
+         num_workers=num_workers,
+         pin_memory=pin_memory,
+         drop_last=False,
+         collate_fn=collate_fn,
+     )
+
+     logger.info(f"Train batches: {len(train_loader)}")
+     logger.info(f"Validation batches: {len(val_loader)}")
+
+     return train_loader, val_loader
+
+
+ # Test the dataset
+ if __name__ == "__main__":
+     from .tokenizer import load_tokenizer
+
+     print("Testing TinyStoriesDataset...")
+
+     # Load tokenizer (assumes it exists)
+     tokenizer_path = "./tokenizer/wikimini_32k"
+     if Path(tokenizer_path).exists():
+         tokenizer = load_tokenizer(tokenizer_path)
+
+         # Create a small dataset for testing
+         dataset = TinyStoriesDataset(
+             tokenizer=tokenizer,
+             split="validation",  # Use the smaller split for testing
+             max_seq_len=128,
+             cache_dir="./data/cache_test",
+         )
+
+         print(f"\nDataset size: {len(dataset)}")
+         print("Sample batch:")
+         sample = dataset[0]
+         print(f"  Input IDs shape: {sample['input_ids'].shape}")
+         print(f"  Labels shape: {sample['labels'].shape}")
+         print(f"  First 10 input IDs: {sample['input_ids'][:10]}")
+         print(f"  First 10 labels: {sample['labels'][:10]}")
+
+         # Test DataLoader
+         loader = DataLoader(dataset, batch_size=4, collate_fn=collate_fn)
+         batch = next(iter(loader))
+         print("\nDataLoader batch:")
+         print(f"  Input IDs shape: {batch['input_ids'].shape}")
+         print(f"  Labels shape: {batch['labels'].shape}")
+     else:
+         print(f"Tokenizer not found at {tokenizer_path}")
+         print("Please train the tokenizer first: python scripts/train_tokenizer.py")
src/data/quality_checker.py ADDED
@@ -0,0 +1,343 @@
+ """Data Quality Checker for training datasets.
+
+ This module provides tools to validate dataset quality before training:
+ - Detects artifacts (HTML tags, URLs, special tokens)
+ - Checks for malformed text
+ - Validates text statistics
+ - Reports quality issues
+
+ Prevents training on corrupted or low-quality data.
+ """
+
+ import re
+ import logging
+ from typing import Dict, List, Tuple, Optional
+ from datasets import load_dataset
+ from tqdm import tqdm
+
+ logger = logging.getLogger(__name__)
+
+
+ class DataQualityChecker:
+     """Check dataset quality before training."""
+
+     def __init__(
+         self,
+         dataset_name: str,
+         split: str = "train",
+         sample_size: Optional[int] = 10000,
+         strict: bool = False,
+     ):
+         """Initialize quality checker.
+
+         Args:
+             dataset_name: Name of dataset (e.g., "roneneldan/TinyStories")
+             split: Dataset split to check ("train" or "validation")
+             sample_size: Number of samples to check (None for all)
+             strict: If True, borderline ("ACCEPTABLE") quality also fails the check
+         """
+         self.dataset_name = dataset_name
+         self.split = split
+         self.sample_size = sample_size
+         self.strict = strict
+
+         # Quality metrics
+         self.issues: Dict[str, List[Tuple[int, str]]] = {
+             "html_tags": [],
+             "urls": [],
+             "emails": [],
+             "excessive_punctuation": [],
+             "malformed_unicode": [],
+             "empty_text": [],
+             "extremely_short": [],
+             "extremely_long": [],
+             "suspicious_patterns": [],
+             "special_tokens": [],
+         }
+
+         self.stats = {
+             "total_samples": 0,
+             "total_chars": 0,
+             "total_words": 0,
+             "avg_length": 0,
+             "vocabulary_size": 0,
+         }
+
+     def check_quality(self) -> Dict:
+         """Run all quality checks and return the results.
+
+         Returns:
+             Dictionary with quality report and pass/fail status
+         """
+         logger.info(f"Loading dataset {self.dataset_name} ({self.split} split)...")
+
+         # Load dataset
+         if "tinystories" in self.dataset_name.lower():
+             dataset = load_dataset("roneneldan/TinyStories", split=self.split)
+         elif "wikitext" in self.dataset_name.lower():
+             dataset = load_dataset("wikitext", "wikitext-103-raw-v1", split=self.split, trust_remote_code=True)
+         else:
+             dataset = load_dataset(self.dataset_name, split=self.split)
+
+         # Limit sample size if requested
+         if self.sample_size and len(dataset) > self.sample_size:
+             logger.info(f"Sampling {self.sample_size} examples from {len(dataset)} total")
+             indices = range(0, len(dataset), len(dataset) // self.sample_size)
+             dataset = dataset.select(list(indices)[:self.sample_size])
+
+         logger.info(f"Checking quality of {len(dataset)} examples...")
+
+         # Run checks
+         vocabulary = set()
+
+         for idx, example in enumerate(tqdm(dataset, desc="Quality Check")):
+             text = example.get("text", "")
+
+             # Update stats
+             self.stats["total_samples"] += 1
+             self.stats["total_chars"] += len(text)
+             words = text.split()
+             self.stats["total_words"] += len(words)
+             vocabulary.update(words)
+
+             # Run individual checks
+             self._check_html_tags(idx, text)
+             self._check_urls(idx, text)
+             self._check_emails(idx, text)
+             self._check_excessive_punctuation(idx, text)
+             self._check_malformed_unicode(idx, text)
+             self._check_empty_text(idx, text)
+             self._check_length_extremes(idx, text)
+             self._check_suspicious_patterns(idx, text)
+             self._check_special_tokens(idx, text)
+
+         # Calculate final stats
+         if self.stats["total_samples"] > 0:
+             self.stats["avg_length"] = self.stats["total_chars"] / self.stats["total_samples"]
+             self.stats["avg_words"] = self.stats["total_words"] / self.stats["total_samples"]
+             self.stats["vocabulary_size"] = len(vocabulary)
+
+         # Generate report
+         report = self._generate_report()
+
+         return report
+
+     def _check_html_tags(self, idx: int, text: str):
+         """Check for HTML tags."""
+         html_pattern = r'<[^>]+>'
+         if re.search(html_pattern, text):
+             self.issues["html_tags"].append((idx, text[:100]))
+
+     def _check_urls(self, idx: int, text: str):
+         """Check for URLs."""
+         url_pattern = r'https?://\S+'
+         if re.search(url_pattern, text):
+             self.issues["urls"].append((idx, text[:100]))
+
+     def _check_emails(self, idx: int, text: str):
+         """Check for email addresses."""
+         email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
+         if re.search(email_pattern, text):
+             self.issues["emails"].append((idx, text[:100]))
+
+     def _check_excessive_punctuation(self, idx: int, text: str):
+         """Check for excessive punctuation (possible artifacts)."""
+         # Five or more consecutive punctuation marks
+         if re.search(r'[!?.,;:]{5,}', text):
+             self.issues["excessive_punctuation"].append((idx, text[:100]))
+
+         # More than 20% punctuation
+         if len(text) > 0:
+             punct_count = sum(1 for c in text if c in '!?.,;:')
+             if punct_count / len(text) > 0.2:
+                 self.issues["excessive_punctuation"].append((idx, text[:100]))
+
+     def _check_malformed_unicode(self, idx: int, text: str):
+         """Check for malformed Unicode characters."""
+         # Look for the Unicode replacement character
+         if '\ufffd' in text:
+             self.issues["malformed_unicode"].append((idx, text[:100]))
+
+         # Control characters (excluding whitespace)
+         if re.search(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]', text):
+             self.issues["malformed_unicode"].append((idx, text[:100]))
+
+     def _check_empty_text(self, idx: int, text: str):
+         """Check for empty or whitespace-only text."""
+         if not text or not text.strip():
+             self.issues["empty_text"].append((idx, text))
+
+     def _check_length_extremes(self, idx: int, text: str):
+         """Check for extremely short or long text."""
+         if len(text.strip()) < 10:
+             self.issues["extremely_short"].append((idx, text))
+         elif len(text) > 50000:  # Suspiciously long
+             self.issues["extremely_long"].append((idx, text[:100]))
+
+     def _check_suspicious_patterns(self, idx: int, text: str):
+         """Check for suspicious patterns."""
+         # The same character repeated more than 10 times in a row (e.g., "aaaaaaaaaaa")
+         if re.search(r'(.)\1{10,}', text):
+             self.issues["suspicious_patterns"].append((idx, text[:100]))
+
+         # Excessive whitespace
+         if re.search(r'\s{10,}', text):
+             self.issues["suspicious_patterns"].append((idx, text[:100]))
+
+     def _check_special_tokens(self, idx: int, text: str):
+         """Check for special tokens that shouldn't appear in raw text."""
+         # Common tokenizer special tokens
+         special_tokens = ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]', '<|endoftext|>', '<pad>', '<unk>']
+         for token in special_tokens:
+             if token in text:
+                 self.issues["special_tokens"].append((idx, text[:100]))
+                 break
+
+     def _generate_report(self) -> Dict:
+         """Generate quality report.
+
+         Returns:
+             Dictionary with quality metrics and pass/fail status
+         """
+         total_issues = sum(len(issues) for issues in self.issues.values())
+         issue_percentage = (total_issues / self.stats["total_samples"] * 100) if self.stats["total_samples"] > 0 else 0
+
+         # Determine quality level
+         if issue_percentage == 0:
+             quality_level = "EXCELLENT"
+             passed = True
+         elif issue_percentage < 1:
+             quality_level = "GOOD"
+             passed = True
+         elif issue_percentage < 5:
+             quality_level = "ACCEPTABLE"
+             passed = not self.strict
+         elif issue_percentage < 10:
+             quality_level = "POOR"
+             passed = False
+         else:
+             quality_level = "CRITICAL"
+             passed = False
+
+         report = {
+             "dataset": self.dataset_name,
+             "split": self.split,
+             "quality_level": quality_level,
+             "passed": passed,
+             "stats": self.stats,
+             "issues": {
+                 key: {
+                     "count": len(value),
+                     "percentage": (len(value) / self.stats["total_samples"] * 100) if self.stats["total_samples"] > 0 else 0,
+                     "samples": value[:3],  # First 3 examples
+                 }
+                 for key, value in self.issues.items() if len(value) > 0
+             },
+             "total_issues": total_issues,
+             "issue_percentage": issue_percentage,
+         }
+
+         return report
+
+     def print_report(self, report: Dict):
+         """Print formatted quality report.
+
+         Args:
+             report: Report dictionary from check_quality()
+         """
+         logger.info("\n" + "=" * 70)
+         logger.info("DATA QUALITY REPORT")
+         logger.info("=" * 70)
+         logger.info(f"Dataset: {report['dataset']} ({report['split']} split)")
+         logger.info(f"Quality Level: {report['quality_level']}")
+         logger.info(f"Status: {'✅ PASSED' if report['passed'] else '❌ FAILED'}")
+         logger.info("")
+
+         # Statistics
+         logger.info("Statistics:")
+         logger.info(f"  Total Samples: {report['stats']['total_samples']:,}")
+         logger.info(f"  Avg Length: {report['stats']['avg_length']:.1f} chars")
+         logger.info(f"  Avg Words: {report['stats'].get('avg_words', 0):.1f} words")
+         logger.info(f"  Vocabulary Size: {report['stats']['vocabulary_size']:,}")
+         logger.info("")
+
+         # Issues
+         if report['issues']:
+             logger.warning(f"Found {report['total_issues']} issues ({report['issue_percentage']:.2f}% of samples)")
+             logger.warning("")
+             for issue_type, details in report['issues'].items():
+                 logger.warning(f"  {issue_type.replace('_', ' ').title()}:")
+                 logger.warning(f"    Count: {details['count']} ({details['percentage']:.2f}%)")
+                 if details['samples']:
+                     logger.warning(f"    Example: {details['samples'][0][1][:80]}...")
+                 logger.warning("")
+         else:
+             logger.info("✅ No quality issues found!")
+
+         logger.info("=" * 70)
+
+         # Recommendations
+         if not report['passed']:
+             logger.error("\n⚠️ DATA HAS QUALITY ISSUES - Training not recommended!")
+             logger.error("Recommendations:")
+             if report['issues'].get('html_tags'):
+                 logger.error("  - Remove HTML tags from text")
+             if report['issues'].get('urls'):
+                 logger.error("  - Remove or mask URLs")
+             if report['issues'].get('malformed_unicode'):
+                 logger.error("  - Fix Unicode encoding issues")
+             if report['issues'].get('empty_text'):
+                 logger.error("  - Remove empty samples")
+             logger.error("")
+
+
+ def check_dataset_quality(
+     dataset_name: str,
+     split: str = "train",
+     sample_size: Optional[int] = 10000,
+     strict: bool = False,
+ ) -> bool:
+     """Convenience function to check dataset quality.
+
+     Args:
+         dataset_name: Dataset name or HuggingFace ID
+         split: Split to check
+         sample_size: Number of samples to check (None for all)
+         strict: If True, apply the stricter pass threshold
+
+     Returns:
+         True if quality is acceptable, False otherwise
+     """
+     checker = DataQualityChecker(
+         dataset_name=dataset_name,
+         split=split,
+         sample_size=sample_size,
+         strict=strict,
+     )
+
+     report = checker.check_quality()
+     checker.print_report(report)
+
+     return report["passed"]
+
+
+ if __name__ == "__main__":
+     import argparse
+     import sys
+
+     parser = argparse.ArgumentParser(description="Check dataset quality")
+     parser.add_argument("--dataset", type=str, required=True, help="Dataset name")
+     parser.add_argument("--split", type=str, default="train", help="Dataset split")
+     parser.add_argument("--sample-size", type=int, default=10000, help="Number of samples to check")
+     parser.add_argument("--strict", action="store_true", help="Apply the stricter pass threshold")
+
+     args = parser.parse_args()
+
+     passed = check_dataset_quality(
+         dataset_name=args.dataset,
+         split=args.split,
+         sample_size=args.sample_size,
+         strict=args.strict,
+     )
+
+     sys.exit(0 if passed else 1)
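A minimal sketch of using this checker as a pre-training gate from Python rather than the CLI (the import path assumes the repo root is on `sys.path`; the sample size is illustrative):

```python
# Hypothetical quality gate before kicking off a training run
from src.data.quality_checker import check_dataset_quality

ok = check_dataset_quality(
    dataset_name="roneneldan/TinyStories",
    split="validation",
    sample_size=1000,  # illustrative; use None to scan the whole split
)
if not ok:
    raise SystemExit("Dataset failed the quality gate - fix the data before training.")
```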
src/data/tokenizer.py ADDED
@@ -0,0 +1,272 @@
+ """Tokenizer training and loading utilities for the WikiMini model.
+
+ This module provides functions to:
+ 1. Train a BPE tokenizer on WikiText-103
+ 2. Load a trained tokenizer from disk
+ 3. Test tokenizer functionality
+ """
+
+ import json
+ import logging
+ from pathlib import Path
+ from typing import Optional
+ from tokenizers import Tokenizer, models, trainers, pre_tokenizers, decoders, processors
+ from datasets import load_dataset
+
+ logger = logging.getLogger(__name__)
+
+
+ def train_tokenizer(
+     vocab_size: int = 32000,
+     min_frequency: int = 2,
+     output_dir: str = "./tokenizer/wikimini_32k",
+     show_progress: bool = True,
+ ) -> Tokenizer:
+     """Train a BPE tokenizer on the WikiText-103 dataset.
+
+     Args:
+         vocab_size: Size of the vocabulary
+         min_frequency: Minimum frequency for merged tokens
+         output_dir: Directory to save the trained tokenizer
+         show_progress: Whether to show progress during training
+
+     Returns:
+         Trained tokenizer
+     """
+     logger.info(f"Training BPE tokenizer with vocab_size={vocab_size}")
+
+     # Initialize BPE tokenizer
+     tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))
+
+     # Byte-level pre-tokenization (GPT-2 style, operates on raw bytes)
+     tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
+
+     # Decoder
+     tokenizer.decoder = decoders.ByteLevel()
+
+     # Configure trainer
+     special_tokens = [
+         "<unk>",   # Unknown token
+         "<s>",     # Beginning of sequence
+         "</s>",    # End of sequence
+         "<pad>",   # Padding token
+     ]
+
+     trainer = trainers.BpeTrainer(
+         vocab_size=vocab_size,
+         min_frequency=min_frequency,
+         special_tokens=special_tokens,
+         show_progress=show_progress,
+     )
+
+     # Load WikiText-103 dataset
+     logger.info("Loading WikiText-103 dataset...")
+     dataset = load_dataset("wikitext", "wikitext-103-raw-v1", split="train")
+
+     # Create iterator for training
+     def batch_iterator(batch_size: int = 1000):
+         """Yield batches of text for training."""
+         for i in range(0, len(dataset), batch_size):
+             batch = dataset[i : i + batch_size]
+             yield batch["text"]
+
+     # Train tokenizer
+     logger.info("Training tokenizer...")
+     tokenizer.train_from_iterator(batch_iterator(), trainer=trainer)
+
+     # Add byte-level post-processor
+     tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)
+
+     # Enable padding
+     tokenizer.enable_padding(
+         pad_id=tokenizer.token_to_id("<pad>"),
+         pad_token="<pad>",
+     )
+
+     # Enable truncation
+     tokenizer.enable_truncation(max_length=2048)
+
+     # Save tokenizer
+     output_path = Path(output_dir)
+     output_path.mkdir(parents=True, exist_ok=True)
+
+     tokenizer_file = output_path / "tokenizer.json"
+     tokenizer.save(str(tokenizer_file))
+     logger.info(f"Tokenizer saved to {tokenizer_file}")
+
+     # Save config
+     config = {
+         "vocab_size": vocab_size,
+         "model_type": "BPE",
+         "unk_token": "<unk>",
+         "bos_token": "<s>",
+         "eos_token": "</s>",
+         "pad_token": "<pad>",
+     }
+
+     config_file = output_path / "config.json"
+     with open(config_file, 'w') as f:
+         json.dump(config, f, indent=2)
+     logger.info(f"Config saved to {config_file}")
+
+     return tokenizer
+
+
+ def load_tokenizer(tokenizer_path: str, return_wrapper: bool = True):
+     """Load a trained tokenizer from disk.
+
+     Args:
+         tokenizer_path: Path to the tokenizer directory or file
+         return_wrapper: If True, return a TokenizerWrapper (default); else the raw Tokenizer
+
+     Returns:
+         Loaded tokenizer (wrapped by default for compatibility)
+     """
+     tokenizer_path = Path(tokenizer_path)
+
+     # Handle both directory and file paths
+     if tokenizer_path.is_dir():
+         tokenizer_file = tokenizer_path / "tokenizer.json"
+     else:
+         tokenizer_file = tokenizer_path
+
+     if not tokenizer_file.exists():
+         raise FileNotFoundError(f"Tokenizer file not found: {tokenizer_file}")
+
+     logger.info(f"Loading tokenizer from {tokenizer_file}")
+     tokenizer = Tokenizer.from_file(str(tokenizer_file))
+
+     # Return the wrapped version for easier use (supports len(), etc.)
+     if return_wrapper:
+         return TokenizerWrapper(tokenizer)
+
+     return tokenizer
+
+
+ def test_tokenizer(tokenizer: Tokenizer) -> None:
+     """Test tokenizer with sample text.
+
+     Args:
+         tokenizer: Tokenizer to test
+     """
+     print("\n" + "=" * 70)
+     print(" " * 25 + "Tokenizer Test")
+     print("=" * 70)
+
+     # Get vocab info
+     vocab_size = tokenizer.get_vocab_size()
+     print(f"\nVocabulary size: {vocab_size:,}")
+
+     # Test special tokens
+     print("\nSpecial tokens:")
+     special_tokens = ["<unk>", "<s>", "</s>", "<pad>"]
+     for token in special_tokens:
+         token_id = tokenizer.token_to_id(token)
+         print(f"  {token:8s} -> ID {token_id}")
+
+     # Test encoding/decoding
+     test_texts = [
+         "The quick brown fox jumps over the lazy dog.",
+         "Machine learning is a subset of artificial intelligence.",
+         "WikiText-103 is a large-scale language modeling benchmark.",
+     ]
+
+     print("\nEncoding/Decoding tests:")
+     print("-" * 70)
+
+     for i, text in enumerate(test_texts, 1):
+         # Encode
+         encoding = tokenizer.encode(text)
+         tokens = encoding.tokens
+         ids = encoding.ids
+
+         # Decode
+         decoded = tokenizer.decode(ids)
+
+         print(f"\nTest {i}:")
+         print(f"  Original: {text}")
+         print(f"  Tokens: {len(tokens)}")
+         print(f"  IDs: {ids[:10]}..." if len(ids) > 10 else f"  IDs: {ids}")
+         print(f"  Decoded: {decoded}")
+
+         # Check round-trip
+         if decoded.strip() == text.strip():
+             print("  ✅ Round-trip successful")
+         else:
+             print("  ⚠️ Round-trip differs slightly (common with BPE)")
+
+     # Test batch encoding
+     print("\n\nBatch encoding test:")
+     print("-" * 70)
+     encodings = tokenizer.encode_batch(test_texts)
+     print(f"  Batch size: {len(encodings)}")
+     print(f"  Token counts: {[len(enc.ids) for enc in encodings]}")
+
+     print("\n" + "=" * 70)
+     print(" " * 25 + "✅ Test Complete")
+     print("=" * 70 + "\n")
+
+
+ # Wrapper class for compatibility with a HuggingFace-style interface
+ class TokenizerWrapper:
+     """Wrapper to make tokenizers.Tokenizer compatible with the expected interface."""
+
+     def __init__(self, tokenizer: Tokenizer):
+         self.tokenizer = tokenizer
+         self._vocab_size = tokenizer.get_vocab_size()
+
+         def first_id(*names: str) -> Optional[int]:
+             """Return the first token ID that exists.
+
+             An `or`-chain would misread a valid ID of 0 as missing, so the
+             lookup has to compare against None explicitly.
+             """
+             for name in names:
+                 token_id = tokenizer.token_to_id(name)
+                 if token_id is not None:
+                     return token_id
+             return None
+
+         # Get special token IDs - try the standard format first,
+         # then the TinyStories custom format
+         pad_id = first_id("<pad>", "<|padding|>")
+         self.pad_token_id = pad_id if pad_id is not None else 0  # Fall back to 0 if not found
+         self.bos_token_id = first_id("<s>", "<|startoftext|>")
+         self.eos_token_id = first_id("</s>", "<|endoftext|>")
+         self.unk_token_id = tokenizer.token_to_id("<unk>")
+
+     def __call__(self, text, **kwargs):
+         """Encode text (callable interface)."""
+         if isinstance(text, str):
+             return self.tokenizer.encode(text).ids
+         elif isinstance(text, list):
+             return [self.tokenizer.encode(t).ids for t in text]
+
+     def encode(self, text, add_special_tokens=True):
+         """Encode text to token IDs."""
+         encoding = self.tokenizer.encode(text)
+         return encoding.ids
+
+     def decode(self, token_ids, skip_special_tokens=True):
+         """Decode token IDs to text."""
+         return self.tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)
+
+     def __len__(self):
+         """Return vocabulary size."""
+         return self._vocab_size
+
+     @property
+     def vocab_size(self):
+         """Vocabulary size property."""
+         return self._vocab_size
+
+
+ def create_tokenizer_wrapper(tokenizer_path: str) -> TokenizerWrapper:
+     """Create a wrapped tokenizer for easier use.
+
+     Args:
+         tokenizer_path: Path to tokenizer directory or file
+
+     Returns:
+         TokenizerWrapper instance
+     """
+     tokenizer = load_tokenizer(tokenizer_path, return_wrapper=False)
+     return TokenizerWrapper(tokenizer)
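A minimal end-to-end sketch of this API (the vocab size and output directory are illustrative, and training on the full WikiText-103 split takes a while):

```python
# Hypothetical round trip: train a small tokenizer, reload it, encode/decode
from src.data.tokenizer import train_tokenizer, load_tokenizer

train_tokenizer(vocab_size=8000, output_dir="./tokenizer/demo_8k")  # illustrative size/path

tok = load_tokenizer("./tokenizer/demo_8k")  # returns a TokenizerWrapper
ids = tok.encode("Once upon a time", add_special_tokens=False)
print(ids, "->", tok.decode(ids))
print("vocab:", len(tok), "pad id:", tok.pad_token_id)
```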
src/model/__init__.py ADDED
@@ -0,0 +1,20 @@
+ """Model components for WikiMini 95M."""
+
+ from .rmsnorm import RMSNorm, RMSNormOptimized
+ from .rope import RotaryPositionEmbeddings, RotaryPositionEmbeddingsComplex
+ from .swiglu import SwiGLU, SwiGLUParallel, GeGLU
+ from .attention import MultiHeadAttention
+ from .transformer_block import TransformerBlock, WikiMiniModel
+
+ __all__ = [
+     "RMSNorm",
+     "RMSNormOptimized",
+     "RotaryPositionEmbeddings",
+     "RotaryPositionEmbeddingsComplex",
+     "SwiGLU",
+     "SwiGLUParallel",
+     "GeGLU",
+     "MultiHeadAttention",
+     "TransformerBlock",
+     "WikiMiniModel",
+ ]
src/model/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (692 Bytes).
 
src/model/__pycache__/attention.cpython-313.pyc ADDED
Binary file (12.3 kB).
 
src/model/__pycache__/rmsnorm.cpython-313.pyc ADDED
Binary file (7.81 kB).
 
src/model/__pycache__/rope.cpython-313.pyc ADDED
Binary file (9.95 kB).
 
src/model/__pycache__/swiglu.cpython-313.pyc ADDED
Binary file (8.97 kB).
 
src/model/__pycache__/transformer_block.cpython-313.pyc ADDED
Binary file (18.7 kB).
 
src/model/attention.py ADDED
@@ -0,0 +1,301 @@
+ """Multi-Head Attention with RoPE integration and memory optimizations.
+
+ Critical implementation details:
+ 1. Apply RoPE only to Q and K, never to V
+ 2. Use SDPA for Flash Attention 2 support
+ 3. Pre-normalization architecture
+ 4. Memory-efficient implementation
+ """
+
+ import logging
+ import math
+ import sys
+ from typing import Optional, Tuple
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ from .rope import RotaryPositionEmbeddings
+
+ logger = logging.getLogger(__name__)
+
+
+ class MultiHeadAttention(nn.Module):
+     """Multi-Head Attention with RoPE and Flash Attention support.
+
+     This implementation:
+     - Uses Rotary Position Embeddings (RoPE) on Q and K only
+     - Supports Flash Attention 2 via torch.nn.functional.scaled_dot_product_attention
+     - Uses no bias terms (the modern approach)
+     - Includes proper causal masking
+     - Is memory-efficient
+     """
+
+     def __init__(
+         self,
+         d_model: int = 768,
+         n_heads: int = 12,
+         dropout: float = 0.1,
+         max_seq_len: int = 2048,
+         rope_base: int = 10000,
+         rope_percentage: float = 0.5,
+         use_flash_attention: bool = True,
+     ):
+         super().__init__()
+
+         assert d_model % n_heads == 0, f"d_model ({d_model}) must be divisible by n_heads ({n_heads})"
+
+         self.d_model = d_model
+         self.n_heads = n_heads
+         self.head_dim = d_model // n_heads
+
+         # Windows Flash Attention: test with PyTorch 2.10+ nightly.
+         # Older versions had freezing issues, but newer versions may work.
+         if sys.platform == 'win32' and use_flash_attention:
+             # Allow Flash Attention on Windows with PyTorch 2.10+.
+             # If freezing occurs, set use_flash_attention: false in the config.
+             self.use_flash_attention = use_flash_attention
+             logger.info("[Windows] Attempting Flash Attention with PyTorch 2.10+ - if freezing occurs, disable in config")
+         elif sys.platform == 'win32':
+             self.use_flash_attention = False
+             logger.info("[Windows] Flash Attention disabled - using manual attention")
+         else:
+             self.use_flash_attention = use_flash_attention
+
+         self.dropout = dropout
+         self.scale = 1.0 / math.sqrt(self.head_dim)
+
+         # Q, K, V projections (no bias)
+         self.q_proj = nn.Linear(d_model, d_model, bias=False)
+         self.k_proj = nn.Linear(d_model, d_model, bias=False)
+         self.v_proj = nn.Linear(d_model, d_model, bias=False)
+         self.o_proj = nn.Linear(d_model, d_model, bias=False)
+
+         # RoPE for positional encoding,
+         # applied to only part of the head dimensions (typically 50%)
+         rope_dim = int(self.head_dim * rope_percentage)
+         self.rope_dim = rope_dim
+         self.rope = RotaryPositionEmbeddings(
+             head_dim=rope_dim,
+             max_seq_len=max_seq_len,
+             base=rope_base,
+         )
+
+         # Dropout
+         self.attn_dropout = nn.Dropout(dropout)
+         self.resid_dropout = nn.Dropout(dropout)
+
+         # The causal mask is created on demand based on sequence length and cached
+         self.register_buffer('cached_mask', None, persistent=False)
+         self.register_buffer('cached_mask_size', torch.tensor(0), persistent=False)
+
+     def _get_causal_mask(self, seq_len: int, device: torch.device) -> torch.Tensor:
+         """Get or create a causal mask for the given sequence length.
+
+         CRITICAL: Always returns the mask on the specified device to prevent CPU OOM errors.
+         """
+         if self.cached_mask is None or self.cached_mask_size < seq_len:
+             # Create a new mask directly on the target device
+             mask = torch.triu(torch.ones(seq_len, seq_len, device=device), diagonal=1)
+             mask = mask.masked_fill(mask == 1, float('-inf'))
+             self.cached_mask = mask
+             self.cached_mask_size = torch.tensor(seq_len)
+
+         # CRITICAL: Ensure the returned mask is on the correct device.
+         # This prevents CPU OOM when broadcasting during attn_scores + causal_mask.
+         return self.cached_mask[:seq_len, :seq_len].to(device)
+
+     def _apply_rope(
+         self,
+         q: torch.Tensor,
+         k: torch.Tensor,
+         position_ids: Optional[torch.Tensor] = None,
+     ) -> Tuple[torch.Tensor, torch.Tensor]:
+         """Apply RoPE to partial dimensions of Q and K.
+
+         Args:
+             q: Query tensor [batch, seq_len, n_heads, head_dim]
+             k: Key tensor [batch, seq_len, n_heads, head_dim]
+             position_ids: Optional custom position IDs
+
+         Returns:
+             Rotated Q and K tensors
+         """
+         # Split into RoPE and pass-through dimensions
+         if self.rope_dim > 0:
+             q_rope, q_pass = q[..., :self.rope_dim], q[..., self.rope_dim:]
+             k_rope, k_pass = k[..., :self.rope_dim], k[..., self.rope_dim:]
+
+             # Apply RoPE to the first part
+             q_rope, k_rope = self.rope(q_rope, k_rope, position_ids)
+
+             # Concatenate back
+             q = torch.cat([q_rope, q_pass], dim=-1)
+             k = torch.cat([k_rope, k_pass], dim=-1)
+
+         return q, k
+
+     def forward(
+         self,
+         x: torch.Tensor,
+         attention_mask: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.Tensor] = None,
+         use_cache: bool = False,
+         past_kv: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+     ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
+         """Forward pass of multi-head attention.
+
+         Args:
+             x: Input tensor [batch, seq_len, d_model]
+             attention_mask: Optional attention mask
+             position_ids: Optional position IDs for RoPE
+             use_cache: Whether to return the KV cache for inference
+             past_kv: Past key-value cache for inference
+
+         Returns:
+             Output tensor and optional KV cache
+         """
+         batch_size, seq_len, _ = x.size()
+
+         # Project to Q, K, V
+         q = self.q_proj(x)  # [batch, seq_len, d_model]
+         k = self.k_proj(x)  # [batch, seq_len, d_model]
+         v = self.v_proj(x)  # [batch, seq_len, d_model]
+
+         # Reshape for multi-head attention:
+         # [batch, seq_len, d_model] -> [batch, seq_len, n_heads, head_dim]
+         q = q.view(batch_size, seq_len, self.n_heads, self.head_dim)
+         k = k.view(batch_size, seq_len, self.n_heads, self.head_dim)
+         v = v.view(batch_size, seq_len, self.n_heads, self.head_dim)
+
+         # Apply RoPE to Q and K only (not V!)
+         q, k = self._apply_rope(q, k, position_ids)
+
+         # Handle KV cache for inference
+         if use_cache and past_kv is not None:
+             past_k, past_v = past_kv
+             k = torch.cat([past_k, k], dim=1)
+             v = torch.cat([past_v, v], dim=1)
+
+         kv_cache = (k, v) if use_cache else None
+
+         # Transpose for attention computation:
+         # [batch, seq_len, n_heads, head_dim] -> [batch, n_heads, seq_len, head_dim]
+         q = q.transpose(1, 2).contiguous()
+         k = k.transpose(1, 2).contiguous()
+         v = v.transpose(1, 2).contiguous()
+
+         # Use Flash Attention 2 via SDPA when available.
+         # This is MUCH more memory-efficient than manual attention.
+         if self.use_flash_attention and hasattr(F, 'scaled_dot_product_attention'):
+             # Flash Attention 2 is used automatically when available.
+             # SDPA handles the causal mask internally when is_causal=True.
+             # NOTE: Windows compatibility - skip the context manager to avoid freezing.
+             if sys.platform == 'win32':
+                 # On Windows, use SDPA without explicit kernel selection
+                 attn_output = F.scaled_dot_product_attention(
+                     q, k, v,
+                     attn_mask=attention_mask,
+                     dropout_p=self.dropout if self.training else 0.0,
+                     is_causal=attention_mask is None,
+                     scale=self.scale,
+                 )
+             else:
+                 # Elsewhere, select kernels explicitly for best performance.
+                 # (torch.backends.cuda.sdp_kernel is deprecated in newer PyTorch
+                 # releases in favor of torch.nn.attention.sdpa_kernel.)
+                 with torch.backends.cuda.sdp_kernel(
+                     enable_flash=True,          # Use Flash Attention when possible
+                     enable_math=True,           # Fall back to the math implementation
+                     enable_mem_efficient=True,  # Use memory-efficient attention
+                 ):
+                     attn_output = F.scaled_dot_product_attention(
+                         q, k, v,
+                         attn_mask=attention_mask,
+                         dropout_p=self.dropout if self.training else 0.0,
+                         is_causal=attention_mask is None,
+                         scale=self.scale,
+                     )
+         else:
+             # Manual attention computation (fallback).
+             # This is memory-intensive and should only be used for small sequences.
+             attn_scores = torch.matmul(q, k.transpose(-2, -1)) * self.scale
+
+             # Apply causal mask
+             if attention_mask is None:
+                 causal_mask = self._get_causal_mask(seq_len, x.device)
+                 # Expand mask for batch and heads
+                 causal_mask = causal_mask.unsqueeze(0).unsqueeze(0)
+                 attn_scores = attn_scores + causal_mask
+             else:
+                 attn_scores = attn_scores + attention_mask
+
+             # Apply softmax
+             attn_weights = F.softmax(attn_scores, dim=-1, dtype=torch.float32).to(q.dtype)
+             attn_weights = self.attn_dropout(attn_weights)
+
+             # Compute output
+             attn_output = torch.matmul(attn_weights, v)
+
+         # Reshape back:
+         # [batch, n_heads, seq_len, head_dim] -> [batch, seq_len, d_model]
+         attn_output = attn_output.transpose(1, 2).contiguous()
+         attn_output = attn_output.view(batch_size, seq_len, self.d_model)
+
+         # Output projection
+         output = self.o_proj(attn_output)
+         output = self.resid_dropout(output)
+
+         return output, kv_cache
+
+
+ # Test the attention implementation
+ def test_attention():
+     """Test multi-head attention with a typical configuration."""
+     print("Testing Multi-Head Attention...")
+
+     # Test configuration
+     batch_size = 2
+     seq_len = 128
+     d_model = 768
+     n_heads = 12
+
+     # Create attention module
+     attention = MultiHeadAttention(
+         d_model=d_model,
+         n_heads=n_heads,
+         dropout=0.1,
+         max_seq_len=2048,
+         rope_percentage=0.5,
+         use_flash_attention=True,  # Enable Flash Attention
+     )
+
+     # Move to GPU if available; cast inputs and weights to the same dtype
+     # (bfloat16 on GPU, float32 on CPU) so the projections don't mismatch
+     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+     dtype = torch.bfloat16 if device.type == 'cuda' else torch.float32
+     attention = attention.to(device=device, dtype=dtype)
+     attention.eval()  # Set to eval mode for testing
+
+     # Create dummy input
+     x = torch.randn(batch_size, seq_len, d_model, device=device, dtype=dtype)
+
+     # Forward pass
+     with torch.no_grad():
+         output, _ = attention(x)
+
+     # Check output shape
+     assert output.shape == (batch_size, seq_len, d_model), \
+         f"Expected shape {(batch_size, seq_len, d_model)}, got {output.shape}"
+
+     # Check for NaN
+     assert not torch.isnan(output).any(), "Output contains NaN values!"
+
+     print("✓ Multi-Head Attention test passed!")
+     print(f"  Input shape: {x.shape}")
+     print(f"  Output shape: {output.shape}")
+     print(f"  Device: {device}")
+     if device.type == 'cuda':
+         print(f"  Memory allocated: {torch.cuda.memory_allocated(device) / 1024**3:.2f} GB")
+
+     return True
+
+
+ if __name__ == "__main__":
+     test_attention()
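As a quick sanity check of the causal masking described above, here is a minimal sketch (CPU, eval mode so dropout is off; the import path assumes the repo root is on `sys.path`): the outputs at positions 0..7 must not change when the future tokens are removed.

```python
# Minimal causality sanity check for MultiHeadAttention
import torch
from src.model.attention import MultiHeadAttention

attn = MultiHeadAttention(d_model=768, n_heads=12, dropout=0.0)
attn.eval()

x = torch.randn(1, 16, 768)
with torch.no_grad():
    out_full, _ = attn(x)           # attend over all 16 positions
    out_prefix, _ = attn(x[:, :8])  # attend over the first 8 only

# With correct causal masking, positions 0..7 never see positions 8..15
assert torch.allclose(out_full[:, :8], out_prefix, atol=1e-5)
print("causal masking OK")
```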
src/model/rmsnorm.py ADDED
@@ -0,0 +1,185 @@
+ """Root Mean Square Layer Normalization (RMSNorm) implementation.
+
+ Critical implementation details:
+ 1. Use multiplication with rsqrt, NOT division
+ 2. No mean subtraction (unlike LayerNorm)
+ 3. Compute in FP32 for numerical stability even when using BF16/FP16
+ """
+
+ import torch
+ import torch.nn as nn
+ from typing import Optional, Tuple, Union
+
+
+ class RMSNorm(nn.Module):
+     """Root Mean Square Layer Normalization.
+
+     RMSNorm is a simplification of LayerNorm that removes the mean subtraction
+     and only performs re-scaling via the root mean square.
+
+     Based on the paper: 'Root Mean Square Layer Normalization'
+     https://arxiv.org/abs/1910.07467
+     """
+
+     def __init__(self, hidden_size: int, eps: float = 1e-6):
+         """
+         Args:
+             hidden_size: Size of the hidden dimension
+             eps: Small constant for numerical stability (1e-6 for BF16, 1e-5 for FP16)
+         """
+         super().__init__()
+         self.hidden_size = hidden_size
+         # CRITICAL FIX: Ensure eps is stored as a float, not a string
+         self.eps = float(eps)
+
+         # Learnable scale parameter (gamma)
+         self.weight = nn.Parameter(torch.ones(hidden_size))
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         """Apply RMSNorm to the input tensor.
+
+         CRITICAL BUG TO AVOID:
+         The most common bug is using division with torch.rsqrt:
+         WRONG: x / torch.rsqrt(variance + eps)  # This is x * sqrt(variance)
+         RIGHT: x * torch.rsqrt(variance + eps)  # This is x / sqrt(variance)
+
+         Args:
+             x: Input tensor of shape [..., hidden_size]
+
+         Returns:
+             Normalized tensor of the same shape as the input
+         """
+         # Store the original dtype (for mixed precision training)
+         input_dtype = x.dtype
+
+         # CRITICAL: Compute in float32 for numerical stability
+         x_float32 = x.float()
+
+         # Compute the mean square (RMS = sqrt(mean(x^2)))
+         variance = x_float32.pow(2).mean(dim=-1, keepdim=True)
+
+         # CRITICAL: Use rsqrt (reciprocal square root) with multiplication:
+         # rsqrt(x) = 1/sqrt(x), so x * rsqrt(variance) = x / sqrt(variance).
+         # PERFORMANCE: PyTorch broadcasts the scalar eps automatically.
+         x_normalized = x_float32 * torch.rsqrt(variance + self.eps)
+
+         # Apply the learned scale and cast back to the original dtype
+         return self.weight * x_normalized.to(input_dtype)
+
+     def extra_repr(self) -> str:
+         return f'hidden_size={self.hidden_size}, eps={self.eps}'
+
+
+ class RMSNormOptimized(nn.Module):
+     """Optimized RMSNorm with optional fused operations.
+
+     This version includes optimizations for better performance:
+     1. Option for in-place operations
+     2. Support for sequence parallelism
+     3. Optional residual connection fusion
+     """
+
+     def __init__(
+         self,
+         hidden_size: int,
+         eps: float = 1e-6,
+         elementwise_affine: bool = True,
+         memory_efficient: bool = False,
+     ):
+         super().__init__()
+         self.hidden_size = hidden_size
+         # CRITICAL FIX: Ensure eps is stored as a float, not a string
+         self.eps = float(eps)
+         self.elementwise_affine = elementwise_affine
+         self.memory_efficient = memory_efficient
+
+         if self.elementwise_affine:
+             self.weight = nn.Parameter(torch.ones(hidden_size))
+         else:
+             self.register_parameter('weight', None)
+
+     def forward(
+         self,
+         x: torch.Tensor,
+         residual: Optional[torch.Tensor] = None,
+     ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+         """Apply RMSNorm with an optional residual connection.
+
+         Args:
+             x: Input tensor
+             residual: Optional residual to add before normalization
+
+         Returns:
+             Normalized tensor (and the updated residual if one was provided)
+         """
+         # Add residual if provided (pre-norm architecture)
+         if residual is not None:
+             x = x + residual
+             residual = x  # Save for the skip connection
+
+         # Original dtype for mixed precision
+         input_dtype = x.dtype
+
+         # Compute in FP32
+         if self.memory_efficient:
+             # Rescale the FP32 copy in place to save memory.
+             # NOTE: the variance must be computed with an out-of-place pow;
+             # an in-place pow_ would leave x already squared when it is rescaled.
+             x = x.float()
+             variance = x.pow(2).mean(dim=-1, keepdim=True)
+             x.mul_(torch.rsqrt(variance + self.eps))
+         else:
+             # Standard computation
+             x_float32 = x.float()
+             variance = x_float32.pow(2).mean(dim=-1, keepdim=True)
+             x = x_float32 * torch.rsqrt(variance + self.eps)
+
+         # Apply the weight and cast back
+         if self.elementwise_affine:
+             x = self.weight * x
+
+         x = x.to(input_dtype)
+
+         if residual is not None:
+             return x, residual
+         return x
+
+
+ def rmsnorm_func(x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
+     """Functional version of RMSNorm for use with torch.compile or custom kernels."""
+     input_dtype = x.dtype
+     x = x.float()
+     variance = x.pow(2).mean(dim=-1, keepdim=True)
+     # Ensure eps is handled as a float
+     x = x * torch.rsqrt(variance + float(eps))
+     return (weight * x).to(input_dtype)
+
+
+ # Comparison with LayerNorm for reference
+ def compare_normalization():
+     """Compare RMSNorm with LayerNorm to understand the differences."""
+     batch_size, seq_len, hidden = 2, 10, 768
+     x = torch.randn(batch_size, seq_len, hidden)
+
+     # LayerNorm: normalizes by mean and variance
+     layer_norm = nn.LayerNorm(hidden)
+     ln_out = layer_norm(x)
+
+     # RMSNorm: normalizes by RMS only (no mean subtraction)
+     rms_norm = RMSNorm(hidden)
+     rms_out = rms_norm(x)
+
+     print(f"Input shape: {x.shape}")
+     print(f"LayerNorm output shape: {ln_out.shape}")
+     print(f"RMSNorm output shape: {rms_out.shape}")
+     print(f"Mean difference: {(ln_out - rms_out).abs().mean().item():.6f}")
+
+     # RMSNorm is typically faster than LayerNorm due to the simpler computation
+     return ln_out, rms_out
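A minimal numerical check of the rsqrt formulation above (the import path assumes the repo root is on `sys.path`): with the weight at its initial value of ones, RMSNorm should match the explicit x / sqrt(mean(x²) + eps).

```python
# Verify RMSNorm against the explicit formula x / sqrt(mean(x^2) + eps)
import torch
from src.model.rmsnorm import RMSNorm

torch.manual_seed(0)
x = torch.randn(2, 4, 8)
eps = 1e-6

norm = RMSNorm(8, eps=eps)  # weight is initialized to ones
manual = x / torch.sqrt(x.pow(2).mean(dim=-1, keepdim=True) + eps)

assert torch.allclose(norm(x), manual, atol=1e-6)
print("RMSNorm matches the explicit formula")
```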
src/model/rope.py ADDED
@@ -0,0 +1,217 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Rotary Position Embeddings (RoPE) implementation.
2
+
3
+ Critical implementation details:
4
+ 1. Apply RoPE only to Q and K, never to V
5
+ 2. Use head_dim, not full model dimension
6
+ 3. Ensure proper dimension pairing for rotation
7
+ """
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ import math
12
+ from typing import Optional, Tuple
13
+
14
+
15
+ class RotaryPositionEmbeddings(nn.Module):
16
+ """Rotary Position Embeddings (RoPE) for transformer models.
17
+
18
+ Based on the paper: 'RoFormer: Enhanced Transformer with Rotary Position Embedding'
19
+ https://arxiv.org/abs/2104.09864
20
+ """
21
+
22
+ def __init__(
23
+ self,
24
+ head_dim: int,
25
+ max_seq_len: int = 2048,
26
+ base: int = 10000,
27
+ device: Optional[torch.device] = None,
28
+ ):
29
+ super().__init__()
30
+ self.head_dim = head_dim
31
+ self.max_seq_len = max_seq_len
32
+ self.base = base
33
+
34
+ # CRITICAL: head_dim must be even for proper pairing
35
+ assert head_dim % 2 == 0, f"head_dim must be even, got {head_dim}"
36
+
37
+ # Precompute frequencies
38
+ self._precompute_freqs(device)
39
+
40
+ def _precompute_freqs(self, device: Optional[torch.device] = None):
41
+ """Precompute the frequency tensor for RoPE."""
42
+ # Calculate theta frequencies
43
+ # theta_i = base^(-2i/d) for i in [0, 1, ..., d/2-1]
44
+ theta = 1.0 / (self.base ** (torch.arange(0, self.head_dim, 2).float() / self.head_dim))
45
+
46
+ # Create position indices
47
+ positions = torch.arange(self.max_seq_len).float()
48
+
49
+ # Compute outer product: [seq_len, head_dim/2]
50
+ freqs = torch.einsum('i,j->ij', positions, theta)
51
+
52
+ # Convert to cos and sin for rotation
53
+ freqs_cos = torch.cos(freqs) # [seq_len, head_dim/2]
54
+ freqs_sin = torch.sin(freqs) # [seq_len, head_dim/2]
55
+
56
+ # Duplicate for full dimension coverage
57
+ # [seq_len, head_dim/2] -> [seq_len, head_dim]
58
+ freqs_cos = torch.cat([freqs_cos, freqs_cos], dim=-1)
59
+ freqs_sin = torch.cat([freqs_sin, freqs_sin], dim=-1)
60
+
61
+ # Register as buffers (not trainable, moves with model to device)
62
+ self.register_buffer('freqs_cos', freqs_cos, persistent=False)
63
+ self.register_buffer('freqs_sin', freqs_sin, persistent=False)
64
+
65
+ def rotate_half(self, x: torch.Tensor) -> torch.Tensor:
66
+ """Rotate half the hidden dims of the input.
67
+
68
+ CRITICAL: This is the most common bug - incorrect dimension pairing.
69
+ For input [1, 2, 3, 4], output should be [-3, -4, 1, 2].
70
+ """
71
+ x1 = x[..., :x.shape[-1] // 2]
72
+ x2 = x[..., x.shape[-1] // 2:]
73
+ return torch.cat([-x2, x1], dim=-1)
74
+
75
+ def apply_rotary_pos_emb(
76
+ self,
77
+ q: torch.Tensor,
78
+ k: torch.Tensor,
79
+ position_ids: Optional[torch.Tensor] = None,
80
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
81
+ """Apply rotary position embeddings to query and key tensors.
82
+
83
+ Args:
84
+ q: Query tensor of shape [batch, seq_len, num_heads, head_dim]
85
+ k: Key tensor of shape [batch, seq_len, num_heads, head_dim]
86
+ position_ids: Optional custom position IDs
87
+
88
+ Returns:
89
+ Tuple of rotated (q, k) tensors
90
+ """
91
+ seq_len = q.shape[1]
92
+
93
+ # Get the frequency tensors for current sequence length
94
+ if position_ids is not None:
95
+ # Batched gather: [batch, seq_len] -> [batch, seq_len, 1, head_dim]
96
+ freqs_cos = self.freqs_cos[position_ids][:, :, None, :]
97
+ freqs_sin = self.freqs_sin[position_ids][:, :, None, :]
98
+ else:
99
+ # Contiguous prefix: [seq_len, head_dim] -> [1, seq_len, 1, head_dim]
100
+ freqs_cos = self.freqs_cos[:seq_len][None, :, None, :]
101
+ freqs_sin = self.freqs_sin[:seq_len][None, :, None, :]
102
+
103
+ # Both branches now broadcast against q/k of
104
+ # shape [batch, seq_len, n_heads, head_dim]
105
+
106
+ # Apply rotation using the formula:
107
+ # x_rotated = x * cos + rotate_half(x) * sin
108
+ q_rotated = q * freqs_cos + self.rotate_half(q) * freqs_sin
109
+ k_rotated = k * freqs_cos + self.rotate_half(k) * freqs_sin
110
+
111
+ return q_rotated, k_rotated
112
+
113
+ def forward(
114
+ self,
115
+ q: torch.Tensor,
116
+ k: torch.Tensor,
117
+ position_ids: Optional[torch.Tensor] = None,
118
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
119
+ """Forward pass - apply RoPE to Q and K only.
120
+
121
+ CRITICAL: Never apply RoPE to V (value) tensor!
122
+ """
123
+ return self.apply_rotary_pos_emb(q, k, position_ids)
124
+
125
+
126
+ # Alternative implementation derived from the complex-number view of RoPE
127
+ class RotaryPositionEmbeddingsComplex(nn.Module):
128
+ """Alternative RoPE implementation derived from the complex-number form.
129
+
130
+ Despite the name, it applies the rotation via precomputed cos/sin caches, which can be more efficient on some hardware.
131
+ """
132
+
133
+ def __init__(
134
+ self,
135
+ head_dim: int,
136
+ max_seq_len: int = 2048,
137
+ base: int = 10000,
138
+ device: Optional[torch.device] = None,
139
+ ):
140
+ super().__init__()
141
+ self.head_dim = head_dim
142
+ self.max_seq_len = max_seq_len
143
+ self.base = base
144
+
145
+ assert head_dim % 2 == 0, f"head_dim must be even, got {head_dim}"
146
+
147
+ # Precompute complex exponentials
148
+ inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2).float() / head_dim))
149
+ t = torch.arange(max_seq_len, dtype=inv_freq.dtype)
150
+ freqs = torch.einsum('i,j->ij', t, inv_freq)
151
+
152
+ # Store as cos/sin values
153
+ emb = torch.cat([freqs, freqs], dim=-1)
154
+ self.register_buffer('cos_cached', emb.cos()[None, :, None, :], persistent=False)
155
+ self.register_buffer('sin_cached', emb.sin()[None, :, None, :], persistent=False)
156
+
157
+ def forward(
158
+ self,
159
+ q: torch.Tensor,
160
+ k: torch.Tensor,
161
+ seq_len: Optional[int] = None,
162
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
163
+ """Apply RoPE using cached cos/sin values."""
164
+ if seq_len is None:
165
+ seq_len = q.shape[1]
166
+
167
+ # Apply rotation
168
+ q_embed = (q * self.cos_cached[:, :seq_len]) + \
169
+ (self.rotate_half(q) * self.sin_cached[:, :seq_len])
170
+ k_embed = (k * self.cos_cached[:, :seq_len]) + \
171
+ (self.rotate_half(k) * self.sin_cached[:, :seq_len])
172
+
173
+ return q_embed, k_embed
174
+
175
+ def rotate_half(self, x: torch.Tensor) -> torch.Tensor:
176
+ """Rotate half the hidden dims."""
177
+ x1, x2 = x.chunk(2, dim=-1)
178
+ return torch.cat([-x2, x1], dim=-1)
179
+
180
+
181
+ # Test function for RoPE
182
+ def test_rope():
183
+ """Test RoPE implementation."""
184
+ print("Testing RoPE implementation...")
185
+
186
+ batch_size = 2
187
+ seq_len = 128
188
+ n_heads = 12
189
+ head_dim = 64
190
+
191
+ # Create RoPE module
192
+ rope = RotaryPositionEmbeddings(head_dim=head_dim, max_seq_len=2048)
193
+
194
+ # Create dummy Q and K tensors
195
+ q = torch.randn(batch_size, seq_len, n_heads, head_dim)
196
+ k = torch.randn(batch_size, seq_len, n_heads, head_dim)
197
+
198
+ # Apply RoPE
199
+ q_rot, k_rot = rope(q, k)
200
+
201
+ # Check shapes
202
+ assert q_rot.shape == q.shape, f"Q shape mismatch: {q_rot.shape} != {q.shape}"
203
+ assert k_rot.shape == k.shape, f"K shape mismatch: {k_rot.shape} != {k.shape}"
204
+
205
+ # Check for NaN
206
+ assert not torch.isnan(q_rot).any(), "Q contains NaN after RoPE"
207
+ assert not torch.isnan(k_rot).any(), "K contains NaN after RoPE"
208
+
209
+ print("✓ RoPE test passed!")
210
+ print(f" Input shape: {q.shape}")
211
+ print(f" Output shape: {q_rot.shape}")
212
+
213
+ return True
214
+
215
+
216
+ if __name__ == "__main__":
217
+ test_rope()
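
Beyond the shape checks above, RoPE's defining property can be tested directly: after rotation, the query-key dot product depends only on the relative offset between positions, never on their absolute values. A minimal sketch of such a check (assuming the module above is importable as src.model.rope; the tolerance is a loose float32 allowance):

    import torch
    from src.model.rope import RotaryPositionEmbeddings

    rope = RotaryPositionEmbeddings(head_dim=64, max_seq_len=256)
    q = torch.randn(1, 1, 1, 64)  # [batch, seq_len, n_heads, head_dim]
    k = torch.randn(1, 1, 1, 64)

    def score(m: int, n: int) -> float:
        """Dot product of q rotated to position m with k rotated to position n."""
        q_m, _ = rope(q, q, position_ids=torch.tensor([[m]]))
        _, k_n = rope(k, k, position_ids=torch.tensor([[n]]))
        return (q_m * k_n).sum().item()

    # Shifting both positions by the same offset must leave the score unchanged.
    assert abs(score(3, 10) - score(53, 60)) < 1e-3
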
src/model/swiglu.py ADDED
@@ -0,0 +1,231 @@
1
+ """SwiGLU (Swish-Gated Linear Unit) activation function implementation.
2
+
3
+ Critical implementation details:
4
+ 1. Requires THREE weight matrices (gate, value, down-projection)
5
+ 2. Hidden dimension should be adjusted to ~8/3 * d_model for parameter parity
6
+ 3. No bias terms in modern implementations
7
+ """
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ import torch.nn.functional as F
12
+ from typing import Optional
13
+
14
+
15
+ class SwiGLU(nn.Module):
16
+ """Swish-Gated Linear Unit activation function.
17
+
18
+ SwiGLU combines the Swish activation (SiLU) with a gating mechanism
19
+ for improved gradient flow in deep networks.
20
+
21
+ Based on the paper: 'GLU Variants Improve Transformer'
22
+ https://arxiv.org/abs/2002.05202
23
+ """
24
+
25
+ def __init__(
26
+ self,
27
+ input_dim: int,
28
+ hidden_dim: Optional[int] = None,
29
+ output_dim: Optional[int] = None,
30
+ multiple_of: int = 256,
31
+ bias: bool = False,
32
+ ):
33
+ """
34
+ Args:
35
+ input_dim: Input dimension (d_model)
36
+ hidden_dim: Hidden dimension for FFN. If None, uses 8/3 * input_dim
37
+ output_dim: Output dimension. If None, uses input_dim
38
+ multiple_of: Round hidden_dim to nearest multiple for hardware efficiency
39
+ bias: Whether to use bias terms (modern LLMs use False)
40
+ """
41
+ super().__init__()
42
+
43
+ self.input_dim = input_dim
44
+ self.output_dim = output_dim or input_dim
45
+
46
+ # CRITICAL: Adjust hidden dimension for parameter parity
47
+ # Standard FFN with ReLU/GELU uses 4 * d_model
48
+ # SwiGLU needs 3 matrices, so use (8/3) * d_model for same param count
49
+ if hidden_dim is None:
50
+ hidden_dim = int(8 * input_dim / 3)
51
+
52
+ # Round up to the next multiple for better hardware utilization
53
+ if multiple_of > 1:
54
+ hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
55
+
56
+ self.hidden_dim = hidden_dim
57
+
58
+ # Three linear projections required for SwiGLU
59
+ self.w_gate = nn.Linear(input_dim, hidden_dim, bias=bias) # Gate projection
60
+ self.w_up = nn.Linear(input_dim, hidden_dim, bias=bias) # Value/up projection
61
+ self.w_down = nn.Linear(hidden_dim, self.output_dim, bias=bias) # Down projection
62
+
63
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
64
+ """Apply SwiGLU activation.
65
+
66
+ Formula: SwiGLU(x) = (Swish(xW_gate) ⊗ xW_up) W_down
67
+ where Swish(x) = x * sigmoid(x) = SiLU(x)
68
+
69
+ Args:
70
+ x: Input tensor of shape [..., input_dim]
71
+
72
+ Returns:
73
+ Output tensor of shape [..., output_dim]
74
+ """
75
+ # Gate path with Swish/SiLU activation
76
+ gate = F.silu(self.w_gate(x))
77
+
78
+ # Value path (no activation)
79
+ value = self.w_up(x)
80
+
81
+ # Element-wise multiplication (gating)
82
+ hidden = gate * value
83
+
84
+ # Down projection to output dimension
85
+ output = self.w_down(hidden)
86
+
87
+ return output
88
+
89
+ def extra_repr(self) -> str:
90
+ return (
91
+ f'input_dim={self.input_dim}, '
92
+ f'hidden_dim={self.hidden_dim}, '
93
+ f'output_dim={self.output_dim}'
94
+ )
95
+
96
+
97
+ class SwiGLUParallel(nn.Module):
98
+ """Parallel version of SwiGLU that combines gate and up projections.
99
+
100
+ This is more efficient as it reduces the number of separate matmuls.
101
+ Used in models like LLaMA and Mistral.
102
+ """
103
+
104
+ def __init__(
105
+ self,
106
+ input_dim: int,
107
+ hidden_dim: Optional[int] = None,
108
+ output_dim: Optional[int] = None,
109
+ multiple_of: int = 256,
110
+ bias: bool = False,
111
+ ):
112
+ super().__init__()
113
+
114
+ self.input_dim = input_dim
115
+ self.output_dim = output_dim or input_dim
116
+
117
+ if hidden_dim is None:
118
+ hidden_dim = int(8 * input_dim / 3)
119
+
120
+ if multiple_of > 1:
121
+ hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
122
+
123
+ self.hidden_dim = hidden_dim
124
+
125
+ # Combined gate and up projection for efficiency
126
+ # Output shape: [batch, seq, 2 * hidden_dim]
127
+ self.w_gate_up = nn.Linear(input_dim, 2 * hidden_dim, bias=bias)
128
+ self.w_down = nn.Linear(hidden_dim, self.output_dim, bias=bias)
129
+
130
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
131
+ """Apply SwiGLU with parallel projections."""
132
+ # Single matmul for both gate and up projections
133
+ gate_up = self.w_gate_up(x)
134
+
135
+ # Split into gate and up components
136
+ gate, up = gate_up.chunk(2, dim=-1)
137
+
138
+ # Apply SwiGLU
139
+ hidden = F.silu(gate) * up
140
+ output = self.w_down(hidden)
141
+
142
+ return output
143
+
144
+
145
+ class GeGLU(nn.Module):
146
+ """GELU-Gated Linear Unit - alternative to SwiGLU.
147
+
148
+ Some models use GeGLU instead of SwiGLU. The difference is using
149
+ GELU instead of SiLU for the gating activation.
150
+ """
151
+
152
+ def __init__(
153
+ self,
154
+ input_dim: int,
155
+ hidden_dim: Optional[int] = None,
156
+ output_dim: Optional[int] = None,
157
+ bias: bool = False,
158
+ ):
159
+ super().__init__()
160
+
161
+ self.input_dim = input_dim
162
+ self.output_dim = output_dim or input_dim
163
+
164
+ if hidden_dim is None:
165
+ hidden_dim = int(8 * input_dim / 3)
166
+
167
+ self.hidden_dim = hidden_dim
168
+
169
+ self.w_gate = nn.Linear(input_dim, hidden_dim, bias=bias)
170
+ self.w_up = nn.Linear(input_dim, hidden_dim, bias=bias)
171
+ self.w_down = nn.Linear(hidden_dim, self.output_dim, bias=bias)
172
+
173
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
174
+ """Apply GeGLU activation."""
175
+ gate = F.gelu(self.w_gate(x))
176
+ value = self.w_up(x)
177
+ hidden = gate * value
178
+ output = self.w_down(hidden)
179
+ return output
180
+
181
+
182
+ def calculate_ffn_params(d_model: int, activation: str = "swiglu") -> dict:
183
+ """Calculate FFN parameters for different activation functions.
184
+
185
+ This helper ensures parameter parity across different activation types.
186
+ """
187
+ if activation == "relu" or activation == "gelu":
188
+ # Standard FFN: 2 matrices
189
+ hidden_dim = 4 * d_model
190
+ num_params = 2 * d_model * hidden_dim
191
+ elif activation in ["swiglu", "geglu"]:
192
+ # Gated FFN: 3 matrices, adjust hidden dimension
193
+ hidden_dim = int(8 * d_model / 3)
194
+ # Round up to a multiple of 256 for hardware efficiency
195
+ hidden_dim = 256 * ((hidden_dim + 255) // 256)
196
+ num_params = d_model * hidden_dim * 2 + hidden_dim * d_model
197
+ else:
198
+ raise ValueError(f"Unknown activation: {activation}")
199
+
200
+ return {
201
+ "activation": activation,
202
+ "d_model": d_model,
203
+ "hidden_dim": hidden_dim,
204
+ "num_params": num_params,
205
+ "params_millions": num_params / 1e6,
206
+ }
207
+
208
+
209
+ # Example usage and parameter comparison
210
+ if __name__ == "__main__":
211
+ d_model = 768
212
+
213
+ # Compare parameter counts
214
+ print("FFN Parameter Comparison:")
215
+ for act in ["relu", "gelu", "swiglu"]:
216
+ params = calculate_ffn_params(d_model, act)
217
+ print(f"{act.upper()}:")
218
+ print(f" Hidden dim: {params['hidden_dim']}")
219
+ print(f" Parameters: {params['params_millions']:.2f}M")
220
+
221
+ # Test SwiGLU
222
+ batch_size, seq_len = 2, 512
223
+ x = torch.randn(batch_size, seq_len, d_model)
224
+
225
+ swiglu = SwiGLU(d_model)
226
+ output = swiglu(x)
227
+
228
+ print(f"\nSwiGLU Test:")
229
+ print(f"Input shape: {x.shape}")
230
+ print(f"Output shape: {output.shape}")
231
+ print(f"SwiGLU parameters: {sum(p.numel() for p in swiglu.parameters()) / 1e6:.2f}M")
src/model/transformer_block.py ADDED
@@ -0,0 +1,454 @@
1
+ """Transformer block with pre-normalization architecture and memory optimizations.
2
+
3
+ Critical implementation details:
4
+ 1. Pre-normalization: RMSNorm BEFORE attention and FFN
5
+ 2. Residual connections after each sub-layer
6
+ 3. Modern component stack: RoPE + RMSNorm + SwiGLU
7
+ 4. Gradient checkpointing support for memory efficiency
8
+ """
9
+
10
+ import torch
11
+ import torch.nn as nn
12
+ from typing import Optional, Tuple, Dict, Any
13
+ from torch.utils.checkpoint import checkpoint
14
+
15
+ from .rmsnorm import RMSNorm
16
+ from .attention import MultiHeadAttention
17
+ from .swiglu import SwiGLU
18
+
19
+
20
+ class TransformerBlock(nn.Module):
21
+ """Single transformer block with pre-normalization.
22
+
23
+ This follows the modern architecture used in LLaMA, Mistral, etc:
24
+ - Pre-normalization with RMSNorm
25
+ - Multi-head attention with RoPE
26
+ - SwiGLU activation in FFN
27
+ - Residual connections
28
+ - Gradient checkpointing support
29
+ """
30
+
31
+ def __init__(
32
+ self,
33
+ d_model: int = 768,
34
+ n_heads: int = 12,
35
+ d_ffn: Optional[int] = None,
36
+ dropout: float = 0.1,
37
+ max_seq_len: int = 2048,
38
+ rope_base: int = 10000,
39
+ rope_percentage: float = 0.5,
40
+ rms_norm_eps: float = 1e-6,
41
+ use_flash_attention: bool = True,
42
+ use_gradient_checkpointing: bool = False,
43
+ ):
44
+ super().__init__()
45
+
46
+ self.d_model = d_model
47
+ self.n_heads = n_heads
48
+ self.use_gradient_checkpointing = use_gradient_checkpointing
49
+
50
+ # Pre-normalization layers
51
+ self.attn_norm = RMSNorm(d_model, eps=rms_norm_eps)
52
+ self.ffn_norm = RMSNorm(d_model, eps=rms_norm_eps)
53
+
54
+ # Multi-head attention with RoPE
55
+ self.attention = MultiHeadAttention(
56
+ d_model=d_model,
57
+ n_heads=n_heads,
58
+ dropout=dropout,
59
+ max_seq_len=max_seq_len,
60
+ rope_base=rope_base,
61
+ rope_percentage=rope_percentage,
62
+ use_flash_attention=use_flash_attention,
63
+ )
64
+
65
+ # SwiGLU FFN
66
+ # Default hidden dimension: 8/3 * d_model for parameter parity
67
+ if d_ffn is None:
68
+ d_ffn = int(8 * d_model / 3)
69
+ # Round to multiple of 256 for hardware efficiency
70
+ d_ffn = 256 * ((d_ffn + 255) // 256)
71
+
72
+ self.ffn = SwiGLU(
73
+ input_dim=d_model,
74
+ hidden_dim=d_ffn,
75
+ output_dim=d_model,
76
+ bias=False,
77
+ )
78
+
79
+ def _attention_block(
80
+ self,
81
+ x: torch.Tensor,
82
+ attention_mask: Optional[torch.Tensor] = None,
83
+ position_ids: Optional[torch.Tensor] = None,
84
+ use_cache: bool = False,
85
+ past_kv: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
86
+ ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
87
+ """Attention sub-block with pre-norm."""
88
+ # Pre-normalization
89
+ x_norm = self.attn_norm(x)
90
+
91
+ # Multi-head attention
92
+ attn_output, kv_cache = self.attention(
93
+ x_norm,
94
+ attention_mask=attention_mask,
95
+ position_ids=position_ids,
96
+ use_cache=use_cache,
97
+ past_kv=past_kv,
98
+ )
99
+
100
+ # Residual connection is added by the caller in forward()
101
+ return attn_output, kv_cache
102
+
103
+ def _ffn_block(self, x: torch.Tensor) -> torch.Tensor:
104
+ """Feed-forward sub-block with pre-norm."""
105
+ # Pre-normalization
106
+ x_norm = self.ffn_norm(x)
107
+
108
+ # Feed-forward
109
+ ffn_output = self.ffn(x_norm)
110
+
111
+ return ffn_output
112
+
113
+ def forward(
114
+ self,
115
+ x: torch.Tensor,
116
+ attention_mask: Optional[torch.Tensor] = None,
117
+ position_ids: Optional[torch.Tensor] = None,
118
+ use_cache: bool = False,
119
+ past_kv: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
120
+ ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
121
+ """Forward pass of transformer block.
122
+
123
+ Args:
124
+ x: Input tensor [batch, seq_len, d_model]
125
+ attention_mask: Optional attention mask
126
+ position_ids: Optional position IDs for RoPE
127
+ use_cache: Whether to return KV cache
128
+ past_kv: Past key-value cache
129
+
130
+ Returns:
131
+ Output tensor and optional KV cache
132
+ """
133
+ # Attention block with residual
134
+ if self.use_gradient_checkpointing and self.training:
135
+ # Use gradient checkpointing to save memory during training
136
+ def attention_fn(x_in):
137
+ attn_out, _ = self._attention_block(
138
+ x_in,
139
+ attention_mask=attention_mask,
140
+ position_ids=position_ids,
141
+ use_cache=False, # Can't use cache with checkpointing
142
+ past_kv=None,
143
+ )
144
+ return attn_out
145
+
146
+ attn_output = checkpoint(attention_fn, x, use_reentrant=False)
147
+ kv_cache = None
148
+ else:
149
+ attn_output, kv_cache = self._attention_block(
150
+ x,
151
+ attention_mask=attention_mask,
152
+ position_ids=position_ids,
153
+ use_cache=use_cache,
154
+ past_kv=past_kv,
155
+ )
156
+
157
+ # Add residual for attention
158
+ x = x + attn_output
159
+
160
+ # FFN block with residual
161
+ if self.use_gradient_checkpointing and self.training:
162
+ # Use gradient checkpointing for FFN as well
163
+ ffn_output = checkpoint(self._ffn_block, x, use_reentrant=False)
164
+ else:
165
+ ffn_output = self._ffn_block(x)
166
+
167
+ # Add residual for FFN
168
+ x = x + ffn_output
169
+
170
+ return x, kv_cache
171
+
172
+
173
+ class WikiMiniModel(nn.Module):
174
+ """Complete WikiMini 95M language model.
175
+
176
+ Architecture:
177
+ - Token embeddings with weight tying
178
+ - Stack of transformer blocks
179
+ - Final RMSNorm
180
+ - LM head (tied with embeddings)
181
+ """
182
+
183
+ def __init__(self, config: Dict[str, Any]):
184
+ super().__init__()
185
+
186
+ # Extract config values with defaults
187
+ self.vocab_size = config.get('vocab_size', 32000)
188
+ self.d_model = config.get('d_model', 768)
189
+ self.n_layers = config.get('n_layers', 12)
190
+ self.n_heads = config.get('n_heads', 12)
191
+ self.d_ffn = config.get('d_ffn', None)
192
+ self.max_seq_len = config.get('max_seq_len', 2048)
193
+ self.dropout = config.get('dropout', 0.1)
194
+ self.rope_percentage = config.get('rope_percentage', 0.5)
195
+ self.rope_base = config.get('rope_base', 10000)
196
+ self.rms_norm_eps = config.get('rms_norm_eps', 1e-6)
197
+ self.tie_embeddings = config.get('tie_embeddings', True)
198
+ self.use_flash_attention = config.get('use_flash_attention', True)
199
+ self.use_gradient_checkpointing = config.get('gradient_checkpointing', False)
200
+
201
+ # Token embeddings
202
+ self.token_embedding = nn.Embedding(self.vocab_size, self.d_model)
203
+
204
+ # Transformer blocks
205
+ self.blocks = nn.ModuleList([
206
+ TransformerBlock(
207
+ d_model=self.d_model,
208
+ n_heads=self.n_heads,
209
+ d_ffn=self.d_ffn,
210
+ dropout=self.dropout,
211
+ max_seq_len=self.max_seq_len,
212
+ rope_base=self.rope_base,
213
+ rope_percentage=self.rope_percentage,
214
+ rms_norm_eps=self.rms_norm_eps,
215
+ use_flash_attention=self.use_flash_attention,
216
+ use_gradient_checkpointing=self.use_gradient_checkpointing,
217
+ )
218
+ for _ in range(self.n_layers)
219
+ ])
220
+
221
+ # Final normalization
222
+ self.final_norm = RMSNorm(self.d_model, eps=self.rms_norm_eps)
223
+
224
+ # Language modeling head
225
+ self.lm_head = nn.Linear(self.d_model, self.vocab_size, bias=False)
226
+
227
+ # Weight tying
228
+ if self.tie_embeddings:
229
+ self.lm_head.weight = self.token_embedding.weight
230
+
231
+ # Initialize weights
232
+ self._init_weights()
233
+
234
+ def _init_weights(self):
235
+ """Initialize weights with scaled normal distribution."""
236
+ for module in self.modules():
237
+ if isinstance(module, nn.Linear):
238
+ torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
239
+ if module.bias is not None:
240
+ torch.nn.init.zeros_(module.bias)
241
+ elif isinstance(module, nn.Embedding):
242
+ torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
243
+
244
+ def enable_gradient_checkpointing(self):
245
+ """Enable gradient checkpointing for all transformer blocks."""
246
+ self.use_gradient_checkpointing = True
247
+ for block in self.blocks:
248
+ block.use_gradient_checkpointing = True
249
+
250
+ def disable_gradient_checkpointing(self):
251
+ """Disable gradient checkpointing for all transformer blocks."""
252
+ self.use_gradient_checkpointing = False
253
+ for block in self.blocks:
254
+ block.use_gradient_checkpointing = False
255
+
256
+ def count_parameters(self) -> dict:
257
+ """Count model parameters by component.
258
+
259
+ Returns:
260
+ Dictionary with parameter counts for each component
261
+ """
262
+ # Count by component type
263
+ embedding_params = sum(p.numel() for p in self.token_embedding.parameters())
264
+
265
+ attention_params = 0
266
+ ffn_params = 0
267
+ norm_params = 0
268
+
269
+ for block in self.blocks:
270
+ # Attention parameters
271
+ attention_params += sum(p.numel() for p in block.attention.parameters())
272
+ # FFN parameters
273
+ ffn_params += sum(p.numel() for p in block.ffn.parameters())
274
+ # Norm parameters (attention + ffn norms)
275
+ norm_params += sum(p.numel() for p in block.attn_norm.parameters())
276
+ norm_params += sum(p.numel() for p in block.ffn_norm.parameters())
277
+
278
+ # Final norm
279
+ norm_params += sum(p.numel() for p in self.final_norm.parameters())
280
+
281
+ # LM head (only if not tied)
282
+ if not self.tie_embeddings:
283
+ lm_head_params = sum(p.numel() for p in self.lm_head.parameters())
284
+ else:
285
+ lm_head_params = 0 # Shared with embeddings
286
+
287
+ total_params = sum(p.numel() for p in self.parameters())
288
+
289
+ return {
290
+ 'total': total_params,
291
+ 'total_millions': total_params / 1e6,
292
+ 'embedding': embedding_params,
293
+ 'attention': attention_params,
294
+ 'ffn': ffn_params,
295
+ 'norm': norm_params,
296
+ 'lm_head': lm_head_params,
297
+ }
298
+
299
+ def forward(
300
+ self,
301
+ input_ids: torch.Tensor,
302
+ attention_mask: Optional[torch.Tensor] = None,
303
+ position_ids: Optional[torch.Tensor] = None,
304
+ labels: Optional[torch.Tensor] = None,
305
+ use_cache: bool = False,
306
+ past_key_values: Optional[list] = None,
307
+ ) -> Dict[str, torch.Tensor]:
308
+ """Forward pass of the model.
309
+
310
+ Args:
311
+ input_ids: Token IDs [batch, seq_len]
312
+ attention_mask: Optional attention mask
313
+ position_ids: Optional position IDs
314
+ labels: Optional labels for language modeling loss
315
+ use_cache: Whether to return KV cache
316
+ past_key_values: Past KV cache for inference
317
+
318
+ Returns:
319
+ Dictionary with 'logits' and optionally 'loss' and 'past_key_values'
320
+ """
321
+ batch_size, seq_len = input_ids.shape
322
+
323
+ # Token embeddings
324
+ x = self.token_embedding(input_ids)
325
+
326
+ # Apply dropout to embeddings
327
+ x = nn.functional.dropout(x, p=self.dropout, training=self.training)
328
+
329
+ # Process through transformer blocks
330
+ past_key_values_out = [] if use_cache else None
331
+
332
+ for i, block in enumerate(self.blocks):
333
+ # Get past KV for this layer if available
334
+ past_kv = past_key_values[i] if past_key_values is not None else None
335
+
336
+ # Process through block
337
+ x, kv_cache = block(
338
+ x,
339
+ attention_mask=attention_mask,
340
+ position_ids=position_ids,
341
+ use_cache=use_cache,
342
+ past_kv=past_kv,
343
+ )
344
+
345
+ # Store KV cache if needed
346
+ if use_cache:
347
+ past_key_values_out.append(kv_cache)
348
+
349
+ # Final normalization
350
+ x = self.final_norm(x)
351
+
352
+ # Language modeling head
353
+ logits = self.lm_head(x)
354
+
355
+ # Prepare output
356
+ output = {'logits': logits}
357
+
358
+ # Calculate loss if labels provided
359
+ if labels is not None:
360
+ # Shift for next-token prediction
361
+ shift_logits = logits[..., :-1, :].contiguous()
362
+ shift_labels = labels[..., 1:].contiguous()
363
+
364
+ # Flatten for cross-entropy
365
+ shift_logits = shift_logits.view(-1, self.vocab_size)
366
+ shift_labels = shift_labels.view(-1)
367
+
368
+ # Calculate cross-entropy loss
369
+ loss = nn.functional.cross_entropy(
370
+ shift_logits,
371
+ shift_labels,
372
+ ignore_index=-100, # Standard ignore index
373
+ )
374
+
375
+ output['loss'] = loss
376
+
377
+ # Add KV cache to output if requested
378
+ if use_cache:
379
+ output['past_key_values'] = past_key_values_out
380
+
381
+ return output
382
+
383
+
384
+ def create_model(config: Dict[str, Any]) -> WikiMiniModel:
385
+ """Create a WikiMini model from configuration.
386
+
387
+ Args:
388
+ config: Model configuration dictionary
389
+
390
+ Returns:
391
+ WikiMiniModel instance
392
+ """
393
+ return WikiMiniModel(config)
394
+
395
+
396
+ # Test the complete model
397
+ if __name__ == "__main__":
398
+ # Test configuration for ~95M parameters
399
+ config = {
400
+ 'vocab_size': 32000,
401
+ 'd_model': 768,
402
+ 'n_layers': 12,
403
+ 'n_heads': 12,
404
+ 'd_ffn': 2048, # Adjusted for SwiGLU
405
+ 'max_seq_len': 2048,
406
+ 'dropout': 0.1,
407
+ 'rope_percentage': 0.5,
408
+ 'rope_base': 10000,
409
+ 'rms_norm_eps': 1e-6,
410
+ 'tie_embeddings': True,
411
+ 'use_flash_attention': True,
412
+ 'gradient_checkpointing': True, # Enable for memory efficiency
413
+ }
414
+
415
+ # Create model
416
+ model = WikiMiniModel(config)
417
+
418
+ # Count parameters
419
+ total_params = sum(p.numel() for p in model.parameters())
420
+ trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
421
+
422
+ print(f"WikiMini Model:")
423
+ print(f" Total parameters: {total_params:,} ({total_params/1e6:.2f}M)")
424
+ print(f" Trainable parameters: {trainable_params:,} ({trainable_params/1e6:.2f}M)")
425
+ print(f" Layers: {model.n_layers}")
426
+ print(f" Hidden size: {model.d_model}")
427
+ print(f" Attention heads: {model.n_heads}")
428
+
429
+ # Test forward pass
430
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
431
+ model = model.to(device)
432
+ model.eval()
433
+
434
+ # Small test batch
435
+ batch_size = 2
436
+ seq_len = 128
437
+ input_ids = torch.randint(0, config['vocab_size'], (batch_size, seq_len), device=device)
438
+
439
+ # Enable gradient checkpointing
440
+ model.enable_gradient_checkpointing()
441
+
442
+ # Forward pass
443
+ with torch.no_grad():
444
+ outputs = model(input_ids=input_ids)
445
+
446
+ print(f"\nTest forward pass:")
447
+ print(f" Input shape: {input_ids.shape}")
448
+ print(f" Output logits shape: {outputs['logits'].shape}")
449
+ print(f" Device: {device}")
450
+
451
+ if torch.cuda.is_available():
452
+ print(f" Memory allocated: {torch.cuda.memory_allocated(device) / 1024**3:.2f} GB")
453
+
454
+ print("\n✓ Model test passed!")