nameissakthi committed on
Commit
c27df58
·
1 Parent(s): e3ef0ba

Remove pycache, add gitignore

.gitignore ADDED
@@ -0,0 +1,5 @@
1
+ __pycache__/
2
+ *.py[cod]
3
+ *$py.class
4
+ *.so
5
+ .DS_Store
README.md CHANGED
@@ -111,10 +111,6 @@ Training Time: ~40 minutes
 
 ```bash
 pip install torch tokenizers huggingface_hub
-
- # Clone model architecture code
- git clone https://github.com/nameissakthi/slm-qualcomm
- cd slm-qualcomm
 ```
 
 ### Download Model
@@ -132,6 +128,8 @@ tokenizer_path = hf_hub_download(repo_id="nameissakthi/PebbleLM-117M-Chat", file
 ```python
 import torch
 from tokenizers import Tokenizer
+
+ # Model architecture is included in this repo
 from src.model.transformer import SLMForCausalLM
 from src.model.config import SLMConfig
 
src/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ # SLM Qualcomm - Conversational Small Language Model
2
+ __version__ = "1.0.0"
src/data/__init__.py ADDED
@@ -0,0 +1,31 @@
1
+ # Data loading and tokenizer components
2
+
3
+ from .tokenizer import SLMTokenizer
4
+ from .dataset import (
5
+ ConversationalDataset,
6
+ StreamingTextDataset,
7
+ PackedDataset,
8
+ create_train_val_split,
9
+ load_jsonl,
10
+ save_jsonl,
11
+ )
12
+ from .dataloader import (
13
+ DataModule,
14
+ StreamingDataModule,
15
+ create_dataloader,
16
+ estimate_dataset_tokens,
17
+ )
18
+
19
+ __all__ = [
20
+ "SLMTokenizer",
21
+ "ConversationalDataset",
22
+ "StreamingTextDataset",
23
+ "PackedDataset",
24
+ "create_train_val_split",
25
+ "load_jsonl",
26
+ "save_jsonl",
27
+ "DataModule",
28
+ "StreamingDataModule",
29
+ "create_dataloader",
30
+ "estimate_dataset_tokens",
31
+ ]
src/data/dataloader.py ADDED
@@ -0,0 +1,300 @@
1
+ """
2
+ DataLoader utilities for SLM training.
3
+
4
+ Provides efficient batching and data loading for training.
5
+ """
6
+
7
+ import os
8
+ from typing import Dict, Optional, List
9
+
10
+ import torch
11
+ from torch.utils.data import DataLoader, Dataset, DistributedSampler
12
+
13
+ from .dataset import ConversationalDataset, StreamingTextDataset, PackedDataset
14
+ from .tokenizer import SLMTokenizer
15
+
16
+
17
+ def create_dataloader(
18
+ dataset: Dataset,
19
+ batch_size: int,
20
+ shuffle: bool = True,
21
+ num_workers: int = 4,
22
+ pin_memory: bool = None, # Auto-detect based on device
23
+ drop_last: bool = True,
24
+ distributed: bool = False,
25
+ world_size: int = 1,
26
+ rank: int = 0,
27
+ ) -> DataLoader:
28
+ """Create a DataLoader with optimal settings.
29
+
30
+ Args:
31
+ dataset: The dataset to load from
32
+ batch_size: Batch size per device
33
+ shuffle: Whether to shuffle data
34
+ num_workers: Number of data loading workers
35
+ pin_memory: Pin memory for faster GPU transfer
36
+ drop_last: Drop last incomplete batch
37
+ distributed: Whether using distributed training
38
+ world_size: Number of distributed processes
39
+ rank: Current process rank
40
+
41
+ Returns:
42
+ Configured DataLoader
43
+ """
44
+ sampler = None
45
+ if distributed:
46
+ sampler = DistributedSampler(
47
+ dataset,
48
+ num_replicas=world_size,
49
+ rank=rank,
50
+ shuffle=shuffle,
51
+ )
52
+ shuffle = False # Sampler handles shuffling
53
+
54
+ # Auto-detect pin_memory: disable for MPS (not supported)
55
+ if pin_memory is None:
56
+ import torch
57
+ pin_memory = torch.cuda.is_available() # Only True for CUDA
58
+
59
+ return DataLoader(
60
+ dataset,
61
+ batch_size=batch_size,
62
+ shuffle=shuffle if sampler is None else False,
63
+ sampler=sampler,
64
+ num_workers=num_workers,
65
+ pin_memory=pin_memory,
66
+ drop_last=drop_last,
67
+ collate_fn=default_collate_fn,
68
+ )
69
+
70
+
71
+ def default_collate_fn(batch: List[Dict[str, torch.Tensor]]) -> Dict[str, torch.Tensor]:
72
+ """Collate function for batching samples.
73
+
74
+ Args:
75
+ batch: List of sample dictionaries
76
+
77
+ Returns:
78
+ Batched dictionary with stacked tensors
79
+ """
80
+ return {
81
+ "input_ids": torch.stack([s["input_ids"] for s in batch]),
82
+ "attention_mask": torch.stack([s["attention_mask"] for s in batch]),
83
+ "labels": torch.stack([s["labels"] for s in batch]),
84
+ }
85
+
86
+
87
+ class DataModule:
88
+ """Data module for managing train/val dataloaders.
89
+
90
+ Provides a unified interface for data loading during training.
91
+ """
92
+
93
+ def __init__(
94
+ self,
95
+ data_dir: str,
96
+ tokenizer_path: str,
97
+ max_length: int = 1024,
98
+ batch_size: int = 32,
99
+ num_workers: int = 4,
100
+ val_batch_size: Optional[int] = None,
101
+ ):
102
+ """Initialize data module.
103
+
104
+ Args:
105
+ data_dir: Directory containing processed data
106
+ tokenizer_path: Path to tokenizer.json
107
+ max_length: Maximum sequence length
108
+ batch_size: Training batch size
109
+ num_workers: Number of data loading workers
110
+ val_batch_size: Validation batch size (defaults to batch_size)
111
+ """
112
+ self.data_dir = data_dir
113
+ self.max_length = max_length
114
+ self.batch_size = batch_size
115
+ self.val_batch_size = val_batch_size or batch_size
116
+ self.num_workers = num_workers
117
+
118
+ # Load tokenizer
119
+ self.tokenizer = SLMTokenizer.from_file(tokenizer_path)
120
+
121
+ # Datasets (created on first access)
122
+ self._train_dataset = None
123
+ self._val_dataset = None
124
+
125
+ @property
126
+ def train_dataset(self) -> Dataset:
127
+ """Get or create training dataset."""
128
+ if self._train_dataset is None:
129
+ self._train_dataset = ConversationalDataset(
130
+ data_path=self.data_dir,
131
+ tokenizer=self.tokenizer,
132
+ max_length=self.max_length,
133
+ split="train",
134
+ )
135
+ return self._train_dataset
136
+
137
+ @property
138
+ def val_dataset(self) -> Dataset:
139
+ """Get or create validation dataset."""
140
+ if self._val_dataset is None:
141
+ self._val_dataset = ConversationalDataset(
142
+ data_path=self.data_dir,
143
+ tokenizer=self.tokenizer,
144
+ max_length=self.max_length,
145
+ split="val",
146
+ )
147
+ return self._val_dataset
148
+
149
+ def train_dataloader(
150
+ self,
151
+ distributed: bool = False,
152
+ world_size: int = 1,
153
+ rank: int = 0,
154
+ ) -> DataLoader:
155
+ """Get training dataloader."""
156
+ return create_dataloader(
157
+ self.train_dataset,
158
+ batch_size=self.batch_size,
159
+ shuffle=True,
160
+ num_workers=self.num_workers,
161
+ drop_last=True,
162
+ distributed=distributed,
163
+ world_size=world_size,
164
+ rank=rank,
165
+ )
166
+
167
+ def val_dataloader(self) -> DataLoader:
168
+ """Get validation dataloader."""
169
+ return create_dataloader(
170
+ self.val_dataset,
171
+ batch_size=self.val_batch_size,
172
+ shuffle=False,
173
+ num_workers=self.num_workers,
174
+ drop_last=False,
175
+ )
176
+
177
+
178
+ class StreamingDataModule:
179
+ """Data module for streaming large datasets.
180
+
181
+ Memory-efficient loading for large text corpora.
182
+ """
183
+
184
+ def __init__(
185
+ self,
186
+ data_files: List[str],
187
+ tokenizer_path: str,
188
+ max_length: int = 1024,
189
+ batch_size: int = 32,
190
+ num_workers: int = 4,
191
+ ):
192
+ """Initialize streaming data module.
193
+
194
+ Args:
195
+ data_files: List of text file paths
196
+ tokenizer_path: Path to tokenizer.json
197
+ max_length: Maximum sequence length
198
+ batch_size: Batch size
199
+ num_workers: Number of data loading workers
200
+ """
201
+ self.data_files = data_files
202
+ self.max_length = max_length
203
+ self.batch_size = batch_size
204
+ self.num_workers = num_workers
205
+
206
+ # Load tokenizer
207
+ self.tokenizer = SLMTokenizer.from_file(tokenizer_path)
208
+
209
+ def train_dataloader(self) -> DataLoader:
210
+ """Get training dataloader for streaming data."""
211
+ dataset = StreamingTextDataset(
212
+ data_files=self.data_files,
213
+ tokenizer=self.tokenizer,
214
+ max_length=self.max_length,
215
+ shuffle=True,
216
+ )
217
+
218
+ return DataLoader(
219
+ dataset,
220
+ batch_size=self.batch_size,
221
+ num_workers=self.num_workers,
222
+ pin_memory=True,
223
+ collate_fn=default_collate_fn,
224
+ )
225
+
226
+
227
+ def estimate_dataset_tokens(data_dir: str, tokenizer_path: str) -> Dict[str, int]:
228
+ """Estimate total tokens in a dataset.
229
+
230
+ Args:
231
+ data_dir: Directory containing data files
232
+ tokenizer_path: Path to tokenizer
233
+
234
+ Returns:
235
+ Dictionary with token counts
236
+ """
237
+ import json
238
+ from pathlib import Path
239
+
240
+ tokenizer = SLMTokenizer.from_file(tokenizer_path)
241
+
242
+ total_tokens = 0
243
+ total_samples = 0
244
+
245
+ for file_path in Path(data_dir).glob("*.json*"):
246
+ with open(file_path, "r") as f:
247
+ if file_path.suffix == ".jsonl":
248
+ samples = [json.loads(line) for line in f if line.strip()]
249
+ else:
250
+ samples = json.load(f)
251
+ if not isinstance(samples, list):
252
+ samples = [samples]
253
+
254
+ for sample in samples:
255
+ if "user" in sample and "assistant" in sample:
256
+ tokens = tokenizer.encode_conversation(
257
+ sample["user"], sample["assistant"]
258
+ )
259
+ elif "text" in sample:
260
+ tokens = tokenizer.encode(sample["text"])
261
+ else:
262
+ continue
263
+
264
+ total_tokens += len(tokens)
265
+ total_samples += 1
266
+
267
+ return {
268
+ "total_tokens": total_tokens,
269
+ "total_samples": total_samples,
270
+ "avg_tokens_per_sample": total_tokens / max(total_samples, 1),
271
+ }
272
+
273
+
274
+ def get_dataloader_stats(dataloader: DataLoader) -> Dict[str, float]:
275
+ """Get statistics from a dataloader.
276
+
277
+ Args:
278
+ dataloader: The dataloader to analyze
279
+
280
+ Returns:
281
+ Dictionary with statistics
282
+ """
283
+ total_batches = 0
284
+ total_tokens = 0
285
+ total_non_pad_tokens = 0
286
+
287
+ for batch in dataloader:
288
+ total_batches += 1
289
+ total_tokens += batch["input_ids"].numel()
290
+ total_non_pad_tokens += batch["attention_mask"].sum().item()
291
+
292
+ # Only sample first 100 batches
293
+ if total_batches >= 100:
294
+ break
295
+
296
+ return {
297
+ "batches_sampled": total_batches,
298
+ "tokens_per_batch": total_tokens / max(total_batches, 1),
299
+ "non_pad_ratio": total_non_pad_tokens / max(total_tokens, 1),
300
+ }
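
For orientation, a minimal usage sketch of the `DataModule` defined above. The directory and tokenizer paths are hypothetical placeholders, not files shipped in this commit; it assumes `train.jsonl`/`val.jsonl` exist under the data directory as `ConversationalDataset` expects.

```python
# Minimal sketch (paths are hypothetical placeholders).
from src.data.dataloader import DataModule

dm = DataModule(
    data_dir="data/processed",                  # expects train.jsonl / val.jsonl inside
    tokenizer_path="tokenizer/tokenizer.json",  # trained tokenizer file
    max_length=1024,
    batch_size=32,
    num_workers=4,
)

train_loader = dm.train_dataloader()
val_loader = dm.val_dataloader()

for batch in train_loader:
    # Each batch holds stacked [batch_size, max_length] LongTensors.
    print(batch["input_ids"].shape, batch["attention_mask"].shape, batch["labels"].shape)
    break
```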
src/data/dataset.py ADDED
@@ -0,0 +1,434 @@
1
+ """
2
+ Dataset classes for SLM training.
3
+
4
+ Handles loading, preprocessing, and tokenization of conversational data.
5
+ """
6
+
7
+ import os
8
+ import json
9
+ import random
10
+ from typing import List, Dict, Optional, Iterator, Tuple
11
+ from pathlib import Path
12
+
13
+ import torch
14
+ from torch.utils.data import Dataset, IterableDataset
15
+
16
+ from .tokenizer import SLMTokenizer
17
+
18
+
19
+ class ConversationalDataset(Dataset):
20
+ """Dataset for conversational/instruction-following data.
21
+
22
+ Loads pre-tokenized data from disk for efficient training.
23
+ Format: Each sample is a tokenized conversation with user/assistant turns.
24
+ """
25
+
26
+ def __init__(
27
+ self,
28
+ data_path: str,
29
+ tokenizer: SLMTokenizer,
30
+ max_length: int = 1024,
31
+ split: str = "train",
32
+ ):
33
+ """Initialize the dataset.
34
+
35
+ Args:
36
+ data_path: Path to the processed data directory
37
+ tokenizer: Tokenizer instance
38
+ max_length: Maximum sequence length
39
+ split: "train" or "val"
40
+ """
41
+ self.tokenizer = tokenizer
42
+ self.max_length = max_length
43
+ self.split = split
44
+
45
+ # Load data
46
+ self.samples = self._load_data(data_path)
47
+ print(f"Loaded {len(self.samples)} samples for {split} split")
48
+
49
+ def _load_data(self, data_path: str) -> List[Dict]:
50
+ """Load data from JSON or JSONL files."""
51
+ samples = []
52
+
53
+ # Check for split-specific JSONL file first (preferred for large datasets)
54
+ split_jsonl = os.path.join(data_path, f"{self.split}.jsonl")
55
+ if os.path.exists(split_jsonl):
56
+ with open(split_jsonl, "r", encoding="utf-8") as f:
57
+ for line in f:
58
+ line = line.strip()
59
+ if line:
60
+ samples.append(json.loads(line))
61
+ return samples
62
+
63
+ # Check for split-specific JSON file
64
+ split_file = os.path.join(data_path, f"{self.split}.json")
65
+ if os.path.exists(split_file):
66
+ with open(split_file, "r", encoding="utf-8") as f:
67
+ # Detect format: try a single JSON array first, then fall back to JSONL
68
+ content = f.read()
69
+ f.seek(0)
70
+ try:
71
+ # Try loading as single JSON array
72
+ samples = json.loads(content)
73
+ if isinstance(samples, list):
74
+ return samples
75
+ except json.JSONDecodeError:
76
+ pass
77
+
78
+ # Load as JSONL (one JSON per line)
79
+ for line in f:
80
+ line = line.strip()
81
+ if line:
82
+ samples.append(json.loads(line))
83
+ return samples
84
+
85
+ # Check for combined file with splits
86
+ combined_file = os.path.join(data_path, "data.json")
87
+ if os.path.exists(combined_file):
88
+ with open(combined_file, "r") as f:
89
+ all_data = json.load(f)
90
+ if isinstance(all_data, dict) and self.split in all_data:
91
+ return all_data[self.split]
92
+ return all_data
93
+
94
+ # Load all .json and .jsonl files in directory
95
+ for ext in ["*.jsonl", "*.json"]:
96
+ for file in sorted(Path(data_path).glob(ext)):
97
+ with open(file, "r", encoding="utf-8") as f:
98
+ if file.suffix == ".jsonl":
99
+ for line in f:
100
+ line = line.strip()
101
+ if line:
102
+ samples.append(json.loads(line))
103
+ else:
104
+ data = json.load(f)
105
+ if isinstance(data, list):
106
+ samples.extend(data)
107
+ else:
108
+ samples.append(data)
109
+
110
+ return samples
111
+
112
+ def __len__(self) -> int:
113
+ return len(self.samples)
114
+
115
+ def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
116
+ """Get a single sample.
117
+
118
+ Returns:
119
+ Dictionary with:
120
+ - input_ids: Token IDs for the full sequence
121
+ - attention_mask: 1 for real tokens, 0 for padding
122
+ - labels: Same as input_ids but with -100 for padding (for loss)
123
+ """
124
+ sample = self.samples[idx]
125
+
126
+ # Handle different data formats
127
+ if "input_ids" in sample:
128
+ # Pre-tokenized data
129
+ input_ids = sample["input_ids"]
130
+ elif "user" in sample and "assistant" in sample:
131
+ # Raw conversation format
132
+ input_ids = self.tokenizer.encode_conversation(
133
+ user_message=sample["user"],
134
+ assistant_message=sample["assistant"],
135
+ max_length=self.max_length,
136
+ )
137
+ elif "text" in sample:
138
+ # Raw text format
139
+ input_ids = self.tokenizer.encode(
140
+ sample["text"],
141
+ add_special_tokens=True,
142
+ max_length=self.max_length,
143
+ truncation=True,
144
+ )
145
+ elif "question" in sample and "answer" in sample:
146
+ # Q&A format
147
+ input_ids = self.tokenizer.encode_conversation(
148
+ user_message=sample["question"],
149
+ assistant_message=sample["answer"],
150
+ max_length=self.max_length,
151
+ )
152
+ else:
153
+ raise ValueError(f"Unknown sample format: {list(sample.keys())}")
154
+
155
+ # Pad or truncate
156
+ if len(input_ids) > self.max_length:
157
+ input_ids = input_ids[:self.max_length]
158
+ # Ensure EOS at the end
159
+ if input_ids[-1] != self.tokenizer.eos_token_id:
160
+ input_ids[-1] = self.tokenizer.eos_token_id
161
+
162
+ # Create attention mask (before padding)
163
+ attention_mask = [1] * len(input_ids)
164
+
165
+ # Pad if needed
166
+ padding_length = self.max_length - len(input_ids)
167
+ if padding_length > 0:
168
+ input_ids = input_ids + [self.tokenizer.pad_token_id] * padding_length
169
+ attention_mask = attention_mask + [0] * padding_length
170
+
171
+ # Labels for language modeling (shift happens in loss function)
172
+ # Use -100 for padding tokens so they're ignored in loss
173
+ labels = [
174
+ id if mask == 1 else -100
175
+ for id, mask in zip(input_ids, attention_mask)
176
+ ]
177
+
178
+ return {
179
+ "input_ids": torch.tensor(input_ids, dtype=torch.long),
180
+ "attention_mask": torch.tensor(attention_mask, dtype=torch.long),
181
+ "labels": torch.tensor(labels, dtype=torch.long),
182
+ }
183
+
184
+
185
+ class StreamingTextDataset(IterableDataset):
186
+ """Streaming dataset for large text files.
187
+
188
+ Memory-efficient dataset that streams data from disk.
189
+ Useful for training on large text corpora.
190
+ """
191
+
192
+ def __init__(
193
+ self,
194
+ data_files: List[str],
195
+ tokenizer: SLMTokenizer,
196
+ max_length: int = 1024,
197
+ shuffle: bool = True,
198
+ seed: int = 42,
199
+ ):
200
+ """Initialize streaming dataset.
201
+
202
+ Args:
203
+ data_files: List of text file paths
204
+ tokenizer: Tokenizer instance
205
+ max_length: Maximum sequence length
206
+ shuffle: Whether to shuffle files and lines
207
+ seed: Random seed for shuffling
208
+ """
209
+ self.data_files = data_files
210
+ self.tokenizer = tokenizer
211
+ self.max_length = max_length
212
+ self.shuffle = shuffle
213
+ self.seed = seed
214
+
215
+ # Verify files exist
216
+ for f in data_files:
217
+ if not os.path.exists(f):
218
+ raise FileNotFoundError(f"Data file not found: {f}")
219
+
220
+ def __iter__(self) -> Iterator[Dict[str, torch.Tensor]]:
221
+ """Iterate over all samples in all files."""
222
+ worker_info = torch.utils.data.get_worker_info()
223
+
224
+ # Handle multi-worker data loading
225
+ if worker_info is None:
226
+ files_to_process = self.data_files
227
+ else:
228
+ # Split files among workers
229
+ per_worker = len(self.data_files) // worker_info.num_workers
230
+ worker_id = worker_info.id
231
+ start = worker_id * per_worker
232
+ end = start + per_worker if worker_id < worker_info.num_workers - 1 else len(self.data_files)
233
+ files_to_process = self.data_files[start:end]
234
+
235
+ # Shuffle files if needed
236
+ if self.shuffle:
237
+ rng = random.Random(self.seed)
238
+ files_to_process = list(files_to_process)
239
+ rng.shuffle(files_to_process)
240
+
241
+ # Buffer for accumulating text
242
+ buffer = []
243
+ buffer_tokens = 0
244
+
245
+ for file_path in files_to_process:
246
+ with open(file_path, "r", encoding="utf-8") as f:
247
+ for line in f:
248
+ line = line.strip()
249
+ if not line:
250
+ continue
251
+
252
+ # Try to parse as JSON (for conversational data)
253
+ try:
254
+ data = json.loads(line)
255
+ if "user" in data and "assistant" in data:
256
+ tokens = self.tokenizer.encode_conversation(
257
+ data["user"], data["assistant"]
258
+ )
259
+ elif "text" in data:
260
+ tokens = self.tokenizer.encode(
261
+ data["text"], add_special_tokens=True
262
+ )
263
+ else:
264
+ tokens = self.tokenizer.encode(
265
+ line, add_special_tokens=True
266
+ )
267
+ except json.JSONDecodeError:
268
+ # Plain text line
269
+ tokens = self.tokenizer.encode(
270
+ line, add_special_tokens=True
271
+ )
272
+
273
+ buffer.extend(tokens)
274
+
275
+ # Yield chunks of max_length
276
+ while len(buffer) >= self.max_length:
277
+ chunk = buffer[:self.max_length]
278
+ buffer = buffer[self.max_length:]
279
+
280
+ yield self._create_sample(chunk)
281
+
282
+ # Handle remaining buffer (pad to max_length)
283
+ if len(buffer) > 0:
284
+ yield self._create_sample(buffer)
285
+
286
+ def _create_sample(self, tokens: List[int]) -> Dict[str, torch.Tensor]:
287
+ """Create a training sample from tokens."""
288
+ input_ids = tokens[:self.max_length]
289
+
290
+ # Pad if needed
291
+ attention_mask = [1] * len(input_ids)
292
+ padding_length = self.max_length - len(input_ids)
293
+ if padding_length > 0:
294
+ input_ids = input_ids + [self.tokenizer.pad_token_id] * padding_length
295
+ attention_mask = attention_mask + [0] * padding_length
296
+
297
+ labels = [
298
+ id if mask == 1 else -100
299
+ for id, mask in zip(input_ids, attention_mask)
300
+ ]
301
+
302
+ return {
303
+ "input_ids": torch.tensor(input_ids, dtype=torch.long),
304
+ "attention_mask": torch.tensor(attention_mask, dtype=torch.long),
305
+ "labels": torch.tensor(labels, dtype=torch.long),
306
+ }
307
+
308
+
309
+ class PackedDataset(Dataset):
310
+ """Dataset that packs multiple short sequences into one.
311
+
312
+ Efficient for training when samples are shorter than max_length.
313
+ Concatenates samples with separator tokens to fill sequences.
314
+ """
315
+
316
+ def __init__(
317
+ self,
318
+ samples: List[Dict],
319
+ tokenizer: SLMTokenizer,
320
+ max_length: int = 1024,
321
+ ):
322
+ """Initialize packed dataset.
323
+
324
+ Args:
325
+ samples: List of samples with "user" and "assistant" keys
326
+ tokenizer: Tokenizer instance
327
+ max_length: Maximum sequence length
328
+ """
329
+ self.tokenizer = tokenizer
330
+ self.max_length = max_length
331
+
332
+ # Pack sequences
333
+ self.packed_samples = self._pack_sequences(samples)
334
+ print(f"Packed {len(samples)} samples into {len(self.packed_samples)} sequences")
335
+
336
+ def _pack_sequences(self, samples: List[Dict]) -> List[List[int]]:
337
+ """Pack short sequences together."""
338
+ packed = []
339
+ current_sequence = []
340
+
341
+ for sample in samples:
342
+ # Tokenize
343
+ if "user" in sample and "assistant" in sample:
344
+ tokens = self.tokenizer.encode_conversation(
345
+ sample["user"], sample["assistant"]
346
+ )
347
+ elif "text" in sample:
348
+ tokens = self.tokenizer.encode(sample["text"], add_special_tokens=True)
349
+ else:
350
+ continue
351
+
352
+ # Check if we can add to current sequence
353
+ if len(current_sequence) + len(tokens) <= self.max_length:
354
+ current_sequence.extend(tokens)
355
+ else:
356
+ # Save current and start new
357
+ if current_sequence:
358
+ packed.append(current_sequence)
359
+ current_sequence = tokens[:self.max_length]
360
+
361
+ # Don't forget the last sequence
362
+ if current_sequence:
363
+ packed.append(current_sequence)
364
+
365
+ return packed
366
+
367
+ def __len__(self) -> int:
368
+ return len(self.packed_samples)
369
+
370
+ def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
371
+ """Get a packed sample."""
372
+ tokens = self.packed_samples[idx]
373
+
374
+ # Pad if needed
375
+ attention_mask = [1] * len(tokens)
376
+ padding_length = self.max_length - len(tokens)
377
+ if padding_length > 0:
378
+ tokens = tokens + [self.tokenizer.pad_token_id] * padding_length
379
+ attention_mask = attention_mask + [0] * padding_length
380
+
381
+ labels = [
382
+ id if mask == 1 else -100
383
+ for id, mask in zip(tokens, attention_mask)
384
+ ]
385
+
386
+ return {
387
+ "input_ids": torch.tensor(tokens, dtype=torch.long),
388
+ "attention_mask": torch.tensor(attention_mask, dtype=torch.long),
389
+ "labels": torch.tensor(labels, dtype=torch.long),
390
+ }
391
+
392
+
393
+ def create_train_val_split(
394
+ samples: List[Dict],
395
+ val_ratio: float = 0.01,
396
+ seed: int = 42,
397
+ ) -> Tuple[List[Dict], List[Dict]]:
398
+ """Split samples into train and validation sets.
399
+
400
+ Args:
401
+ samples: List of all samples
402
+ val_ratio: Ratio for validation set
403
+ seed: Random seed
404
+
405
+ Returns:
406
+ Tuple of (train_samples, val_samples)
407
+ """
408
+ random.seed(seed)
409
+ shuffled = list(samples)
410
+ random.shuffle(shuffled)
411
+
412
+ val_size = int(len(shuffled) * val_ratio)
413
+ val_samples = shuffled[:val_size]
414
+ train_samples = shuffled[val_size:]
415
+
416
+ return train_samples, val_samples
417
+
418
+
419
+ def load_jsonl(file_path: str) -> List[Dict]:
420
+ """Load data from a JSONL file."""
421
+ samples = []
422
+ with open(file_path, "r", encoding="utf-8") as f:
423
+ for line in f:
424
+ line = line.strip()
425
+ if line:
426
+ samples.append(json.loads(line))
427
+ return samples
428
+
429
+
430
+ def save_jsonl(samples: List[Dict], file_path: str):
431
+ """Save data to a JSONL file."""
432
+ with open(file_path, "w", encoding="utf-8") as f:
433
+ for sample in samples:
434
+ f.write(json.dumps(sample) + "\n")
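
As a quick illustration of the record shapes `ConversationalDataset` accepts ("user"/"assistant", "question"/"answer", or "text") and of the split/IO helpers above, a small sketch; the file names are illustrative only:

```python
# Sketch of accepted JSONL record shapes (file names are illustrative).
from src.data.dataset import create_train_val_split, save_jsonl, load_jsonl

samples = [
    {"user": "What is RMSNorm?", "assistant": "A layer-norm variant without mean centering."},
    {"question": "2+2?", "answer": "4"},
    {"text": "Plain text samples are also supported."},
]

train, val = create_train_val_split(samples, val_ratio=0.34)
save_jsonl(train, "train.jsonl")
save_jsonl(val, "val.jsonl")
assert load_jsonl("train.jsonl") == train  # JSONL round-trip preserves the records
```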
src/data/tokenizer.py ADDED
@@ -0,0 +1,300 @@
1
+ """
2
+ Custom BPE Tokenizer for SLM v1.
3
+ 16,384 vocabulary size optimized for conversational use.
4
+ """
5
+
6
+ import os
7
+ import json
8
+ from typing import List, Optional, Union
9
+ from tokenizers import Tokenizer, models, trainers, pre_tokenizers, processors, decoders
10
+ from tokenizers.normalizers import NFKC, Lowercase, Sequence
11
+
12
+
13
+ class SLMTokenizer:
14
+ """Custom BPE tokenizer for the SLM model.
15
+
16
+ Features:
17
+ - 16,384 token vocabulary (memory efficient)
18
+ - Special tokens for conversation format
19
+ - Compatible with the model's embedding layer
20
+ """
21
+
22
+ # Special tokens
23
+ PAD_TOKEN = "<|pad|>"
24
+ BOS_TOKEN = "<|bos|>"
25
+ EOS_TOKEN = "<|eos|>"
26
+ UNK_TOKEN = "<|unk|>"
27
+ USER_TOKEN = "<|user|>"
28
+ ASSISTANT_TOKEN = "<|assistant|>"
29
+
30
+ SPECIAL_TOKENS = [PAD_TOKEN, BOS_TOKEN, EOS_TOKEN, UNK_TOKEN, USER_TOKEN, ASSISTANT_TOKEN]
31
+
32
+ def __init__(self, tokenizer: Optional[Tokenizer] = None):
33
+ """Initialize tokenizer.
34
+
35
+ Args:
36
+ tokenizer: Pre-trained HuggingFace tokenizer object
37
+ """
38
+ self.tokenizer = tokenizer
39
+ self._setup_special_token_ids()
40
+
41
+ def _setup_special_token_ids(self):
42
+ """Setup special token IDs for easy access."""
43
+ if self.tokenizer is not None:
44
+ self.pad_token_id = self.tokenizer.token_to_id(self.PAD_TOKEN)
45
+ self.bos_token_id = self.tokenizer.token_to_id(self.BOS_TOKEN)
46
+ self.eos_token_id = self.tokenizer.token_to_id(self.EOS_TOKEN)
47
+ self.unk_token_id = self.tokenizer.token_to_id(self.UNK_TOKEN)
48
+ self.user_token_id = self.tokenizer.token_to_id(self.USER_TOKEN)
49
+ self.assistant_token_id = self.tokenizer.token_to_id(self.ASSISTANT_TOKEN)
50
+
51
+ @classmethod
52
+ def train(
53
+ cls,
54
+ files: List[str],
55
+ vocab_size: int = 16384,
56
+ min_frequency: int = 2,
57
+ save_path: Optional[str] = None,
58
+ ) -> "SLMTokenizer":
59
+ """Train a new BPE tokenizer on the given files.
60
+
61
+ Args:
62
+ files: List of text file paths to train on
63
+ vocab_size: Size of vocabulary (default 16,384)
64
+ min_frequency: Minimum token frequency to include
65
+ save_path: Optional path to save the trained tokenizer
66
+
67
+ Returns:
68
+ Trained SLMTokenizer instance
69
+ """
70
+ print(f"Training BPE tokenizer with vocab_size={vocab_size}...")
71
+ print(f"Training files: {files}")
72
+
73
+ # Initialize a BPE tokenizer
74
+ tokenizer = Tokenizer(models.BPE(unk_token=cls.UNK_TOKEN))
75
+
76
+ # Set up normalizer (optional - keeps text mostly as-is)
77
+ # We use NFKC normalization to standardize unicode
78
+ tokenizer.normalizer = NFKC()
79
+
80
+ # Set up pre-tokenizer (splits on whitespace and punctuation)
81
+ tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
82
+
83
+ # Set up decoder
84
+ tokenizer.decoder = decoders.ByteLevel()
85
+
86
+ # Set up trainer
87
+ trainer = trainers.BpeTrainer(
88
+ vocab_size=vocab_size,
89
+ min_frequency=min_frequency,
90
+ special_tokens=cls.SPECIAL_TOKENS,
91
+ show_progress=True,
92
+ )
93
+
94
+ # Train the tokenizer
95
+ tokenizer.train(files, trainer)
96
+
97
+ # Set up post-processor for adding special tokens
98
+ tokenizer.post_processor = processors.TemplateProcessing(
99
+ single=f"{cls.BOS_TOKEN} $A {cls.EOS_TOKEN}",
100
+ pair=f"{cls.BOS_TOKEN} $A {cls.EOS_TOKEN} {cls.BOS_TOKEN} $B {cls.EOS_TOKEN}",
101
+ special_tokens=[
102
+ (cls.BOS_TOKEN, tokenizer.token_to_id(cls.BOS_TOKEN)),
103
+ (cls.EOS_TOKEN, tokenizer.token_to_id(cls.EOS_TOKEN)),
104
+ ],
105
+ )
106
+
107
+ print(f"Tokenizer trained! Vocabulary size: {tokenizer.get_vocab_size()}")
108
+
109
+ # Create instance
110
+ instance = cls(tokenizer)
111
+
112
+ # Save if path provided
113
+ if save_path:
114
+ instance.save(save_path)
115
+
116
+ return instance
117
+
118
+ @classmethod
119
+ def from_file(cls, path: str) -> "SLMTokenizer":
120
+ """Load a tokenizer from a saved file.
121
+
122
+ Args:
123
+ path: Path to the tokenizer.json file
124
+
125
+ Returns:
126
+ Loaded SLMTokenizer instance
127
+ """
128
+ tokenizer = Tokenizer.from_file(path)
129
+ return cls(tokenizer)
130
+
131
+ def save(self, path: str):
132
+ """Save the tokenizer to a file.
133
+
134
+ Args:
135
+ path: Path to save the tokenizer (directory or file)
136
+ """
137
+ if os.path.isdir(path):
138
+ save_path = os.path.join(path, "tokenizer.json")
139
+ else:
140
+ save_path = path
141
+ os.makedirs(os.path.dirname(save_path), exist_ok=True)
142
+
143
+ self.tokenizer.save(save_path)
144
+ print(f"Tokenizer saved to: {save_path}")
145
+
146
+ # Also save config
147
+ config_path = save_path.replace("tokenizer.json", "tokenizer_config.json")
148
+ config = {
149
+ "vocab_size": self.vocab_size,
150
+ "pad_token": self.PAD_TOKEN,
151
+ "bos_token": self.BOS_TOKEN,
152
+ "eos_token": self.EOS_TOKEN,
153
+ "unk_token": self.UNK_TOKEN,
154
+ "user_token": self.USER_TOKEN,
155
+ "assistant_token": self.ASSISTANT_TOKEN,
156
+ }
157
+ with open(config_path, "w") as f:
158
+ json.dump(config, f, indent=2)
159
+ print(f"Tokenizer config saved to: {config_path}")
160
+
161
+ def encode(
162
+ self,
163
+ text: str,
164
+ add_special_tokens: bool = True,
165
+ max_length: Optional[int] = None,
166
+ padding: bool = False,
167
+ truncation: bool = False,
168
+ ) -> List[int]:
169
+ """Encode text to token IDs.
170
+
171
+ Args:
172
+ text: Input text string
173
+ add_special_tokens: Whether to add BOS/EOS tokens
174
+ max_length: Maximum sequence length
175
+ padding: Whether to pad to max_length
176
+ truncation: Whether to truncate to max_length
177
+
178
+ Returns:
179
+ List of token IDs
180
+ """
181
+ # Encode
182
+ if add_special_tokens:
183
+ encoding = self.tokenizer.encode(text)
184
+ else:
185
+ encoding = self.tokenizer.encode(text, add_special_tokens=False)
186
+
187
+ ids = encoding.ids
188
+
189
+ # Truncation
190
+ if truncation and max_length and len(ids) > max_length:
191
+ ids = ids[:max_length]
192
+ # Ensure EOS at end if we had special tokens
193
+ if add_special_tokens and ids[-1] != self.eos_token_id:
194
+ ids[-1] = self.eos_token_id
195
+
196
+ # Padding
197
+ if padding and max_length and len(ids) < max_length:
198
+ ids = ids + [self.pad_token_id] * (max_length - len(ids))
199
+
200
+ return ids
201
+
202
+ def decode(self, ids: List[int], skip_special_tokens: bool = True) -> str:
203
+ """Decode token IDs to text.
204
+
205
+ Args:
206
+ ids: List of token IDs
207
+ skip_special_tokens: Whether to remove special tokens
208
+
209
+ Returns:
210
+ Decoded text string
211
+ """
212
+ return self.tokenizer.decode(ids, skip_special_tokens=skip_special_tokens)
213
+
214
+ def encode_conversation(
215
+ self,
216
+ user_message: str,
217
+ assistant_message: Optional[str] = None,
218
+ max_length: Optional[int] = None,
219
+ ) -> List[int]:
220
+ """Encode a conversation turn.
221
+
222
+ Format: <|bos|><|user|>message<|assistant|>response<|eos|>
223
+
224
+ Args:
225
+ user_message: The user's message
226
+ assistant_message: Optional assistant response
227
+ max_length: Maximum sequence length
228
+
229
+ Returns:
230
+ List of token IDs
231
+ """
232
+ # Build conversation string
233
+ if assistant_message:
234
+ text = f"{self.USER_TOKEN}{user_message}{self.ASSISTANT_TOKEN}{assistant_message}"
235
+ else:
236
+ # For inference - no response yet
237
+ text = f"{self.USER_TOKEN}{user_message}{self.ASSISTANT_TOKEN}"
238
+
239
+ return self.encode(text, add_special_tokens=True, max_length=max_length, truncation=True)
240
+
241
+ @property
242
+ def vocab_size(self) -> int:
243
+ """Get vocabulary size."""
244
+ return self.tokenizer.get_vocab_size()
245
+
246
+ def get_vocab(self) -> dict:
247
+ """Get the vocabulary as a dictionary."""
248
+ return self.tokenizer.get_vocab()
249
+
250
+ def __len__(self) -> int:
251
+ """Return vocabulary size."""
252
+ return self.vocab_size
253
+
254
+ def __call__(
255
+ self,
256
+ text: Union[str, List[str]],
257
+ max_length: Optional[int] = None,
258
+ padding: bool = False,
259
+ truncation: bool = False,
260
+ return_tensors: Optional[str] = None,
261
+ ) -> dict:
262
+ """Tokenize text (HuggingFace-style interface).
263
+
264
+ Args:
265
+ text: Input text or list of texts
266
+ max_length: Maximum sequence length
267
+ padding: Whether to pad sequences
268
+ truncation: Whether to truncate sequences
269
+ return_tensors: If "pt", return PyTorch tensors
270
+
271
+ Returns:
272
+ Dictionary with input_ids and attention_mask
273
+ """
274
+ if isinstance(text, str):
275
+ text = [text]
276
+
277
+ all_ids = []
278
+ for t in text:
279
+ ids = self.encode(
280
+ t,
281
+ max_length=max_length,
282
+ padding=padding,
283
+ truncation=truncation,
284
+ )
285
+ all_ids.append(ids)
286
+
287
+ # Create attention mask (1 for real tokens, 0 for padding)
288
+ attention_mask = [[1 if id != self.pad_token_id else 0 for id in ids] for ids in all_ids]
289
+
290
+ result = {
291
+ "input_ids": all_ids,
292
+ "attention_mask": attention_mask,
293
+ }
294
+
295
+ if return_tensors == "pt":
296
+ import torch
297
+ result["input_ids"] = torch.tensor(all_ids)
298
+ result["attention_mask"] = torch.tensor(attention_mask)
299
+
300
+ return result
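
A short usage sketch of the conversation encoding described above, assuming a trained `tokenizer.json` already exists at the (hypothetical) path shown:

```python
# Sketch; assumes a trained tokenizer.json exists at this placeholder path.
from src.data.tokenizer import SLMTokenizer

tok = SLMTokenizer.from_file("tokenizer/tokenizer.json")

# Inference-style prompt: <|bos|><|user|>...<|assistant|> (no response yet)
prompt_ids = tok.encode_conversation("What is RoPE?")

# Training-style pair: <|bos|><|user|>...<|assistant|>...<|eos|>
pair_ids = tok.encode_conversation("What is RoPE?", "Rotary position embedding.")

print(len(prompt_ids))
print(tok.decode(pair_ids, skip_special_tokens=True))
```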
src/export/__init__.py ADDED
@@ -0,0 +1 @@
1
+ # ONNX export components
src/inference/__init__.py ADDED
@@ -0,0 +1 @@
1
+ # Inference and generation components
src/model/__init__.py ADDED
@@ -0,0 +1,24 @@
1
+ """SLM Model Components."""
2
+
3
+ from .config import SLMConfig
4
+ from .transformer import SLMForCausalLM, SLMModel, SLMOutput
5
+ from .kv_cache import KVCache
6
+ from .normalization import RMSNorm
7
+ from .rope import RotaryEmbedding
8
+ from .attention import MultiHeadAttention, create_causal_mask
9
+ from .ffn import FeedForward
10
+ from .decoder import DecoderBlock
11
+
12
+ __all__ = [
13
+ "SLMConfig",
14
+ "SLMForCausalLM",
15
+ "SLMModel",
16
+ "SLMOutput",
17
+ "KVCache",
18
+ "RMSNorm",
19
+ "RotaryEmbedding",
20
+ "MultiHeadAttention",
21
+ "create_causal_mask",
22
+ "FeedForward",
23
+ "DecoderBlock",
24
+ ]
src/model/attention.py ADDED
@@ -0,0 +1,172 @@
1
+ """
2
+ Multi-Head Attention with explicit KV cache for SLM.
3
+ Qualcomm-safe: No FlashAttention, no fused ops, clean ONNX export.
4
+ """
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+ from typing import Optional, Tuple
10
+
11
+ from .config import SLMConfig
12
+ from .rope import RotaryEmbedding
13
+ from .kv_cache import KVCache
14
+
15
+
16
+ class MultiHeadAttention(nn.Module):
17
+ """Multi-Head Self-Attention with RoPE and explicit KV cache.
18
+
19
+ Design choices for Qualcomm compatibility:
20
+ - Standard attention (no FlashAttention)
21
+ - No grouped/multi-query attention (simpler, v1.1 will add GQA)
22
+ - Explicit KV cache management
23
+ - Clean tensor operations for ONNX export
24
+ """
25
+
26
+ def __init__(self, config: SLMConfig, layer_idx: int):
27
+ """Initialize attention layer.
28
+
29
+ Args:
30
+ config: Model configuration
31
+ layer_idx: Index of this layer (for KV cache)
32
+ """
33
+ super().__init__()
34
+ self.config = config
35
+ self.layer_idx = layer_idx
36
+
37
+ self.hidden_size = config.hidden_size
38
+ self.num_heads = config.num_heads
39
+ self.head_dim = config.head_dim
40
+ self.dropout = config.attention_dropout
41
+
42
+ # Q, K, V projections
43
+ self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
44
+ self.k_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
45
+ self.v_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
46
+
47
+ # Output projection
48
+ self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
49
+
50
+ # Rotary embeddings
51
+ self.rotary_emb = RotaryEmbedding(
52
+ dim=self.head_dim,
53
+ max_position_embeddings=config.max_position_embeddings,
54
+ base=config.rope_theta,
55
+ )
56
+
57
+ def forward(
58
+ self,
59
+ hidden_states: torch.Tensor,
60
+ position_ids: torch.Tensor,
61
+ attention_mask: Optional[torch.Tensor] = None,
62
+ kv_cache: Optional[KVCache] = None,
63
+ use_cache: bool = False,
64
+ ) -> Tuple[torch.Tensor, Optional[KVCache]]:
65
+ """Forward pass for attention.
66
+
67
+ Args:
68
+ hidden_states: Input tensor [batch, seq_len, hidden_size]
69
+ position_ids: Position indices [batch, seq_len]
70
+ attention_mask: Causal mask [batch, 1, seq_len, kv_seq_len]
71
+ kv_cache: Optional KV cache for inference
72
+ use_cache: Whether to use/update KV cache
73
+
74
+ Returns:
75
+ Tuple of (output, kv_cache)
76
+ """
77
+ batch_size, seq_len, _ = hidden_states.shape
78
+
79
+ # Project to Q, K, V
80
+ query = self.q_proj(hidden_states)
81
+ key = self.k_proj(hidden_states)
82
+ value = self.v_proj(hidden_states)
83
+
84
+ # Reshape: [batch, seq, hidden] -> [batch, seq, heads, head_dim]
85
+ query = query.view(batch_size, seq_len, self.num_heads, self.head_dim)
86
+ key = key.view(batch_size, seq_len, self.num_heads, self.head_dim)
87
+ value = value.view(batch_size, seq_len, self.num_heads, self.head_dim)
88
+
89
+ # Transpose for attention: [batch, heads, seq, head_dim]
90
+ query = query.transpose(1, 2)
91
+ key = key.transpose(1, 2)
92
+ value = value.transpose(1, 2)
93
+
94
+ # Apply rotary embeddings to Q and K
95
+ query, key = self.rotary_emb(query, key, position_ids)
96
+
97
+ # Handle KV cache
98
+ if use_cache and kv_cache is not None:
99
+ # Get the position to write to cache
100
+ cache_position = position_ids[0, 0].item()
101
+
102
+ # Update cache and get full K, V
103
+ key, value = kv_cache.update(
104
+ layer_idx=self.layer_idx,
105
+ key=key,
106
+ value=value,
107
+ position=cache_position,
108
+ )
109
+
110
+ # Compute attention scores
111
+ # [batch, heads, seq, head_dim] @ [batch, heads, head_dim, kv_seq]
112
+ # -> [batch, heads, seq, kv_seq]
113
+ scale = 1.0 / (self.head_dim ** 0.5)
114
+ attn_weights = torch.matmul(query, key.transpose(-2, -1)) * scale
115
+
116
+ # Apply causal mask
117
+ if attention_mask is not None:
118
+ attn_weights = attn_weights + attention_mask
119
+
120
+ # Softmax and dropout
121
+ attn_weights = F.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
122
+
123
+ if self.training and self.dropout > 0:
124
+ attn_weights = F.dropout(attn_weights, p=self.dropout)
125
+
126
+ # Apply attention to values
127
+ # [batch, heads, seq, kv_seq] @ [batch, heads, kv_seq, head_dim]
128
+ # -> [batch, heads, seq, head_dim]
129
+ attn_output = torch.matmul(attn_weights, value)
130
+
131
+ # Reshape back: [batch, heads, seq, head_dim] -> [batch, seq, hidden]
132
+ attn_output = attn_output.transpose(1, 2).contiguous()
133
+ attn_output = attn_output.view(batch_size, seq_len, self.hidden_size)
134
+
135
+ # Output projection
136
+ output = self.o_proj(attn_output)
137
+
138
+ return output, kv_cache
139
+
140
+
141
+ def create_causal_mask(
142
+ seq_len: int,
143
+ kv_seq_len: int,
144
+ dtype: torch.dtype,
145
+ device: torch.device,
146
+ ) -> torch.Tensor:
147
+ """Create a causal attention mask.
148
+
149
+ Args:
150
+ seq_len: Query sequence length
151
+ kv_seq_len: Key/value sequence length
152
+ dtype: Data type for mask
153
+ device: Device for mask
154
+
155
+ Returns:
156
+ Causal mask tensor [1, 1, seq_len, kv_seq_len]
157
+ """
158
+ # Create lower triangular mask
159
+ mask = torch.full((seq_len, kv_seq_len), float("-inf"), dtype=dtype, device=device)
160
+
161
+ # For decode (seq_len=1), we can attend to all previous tokens
162
+ if seq_len == 1:
163
+ mask = torch.zeros((seq_len, kv_seq_len), dtype=dtype, device=device)
164
+ else:
165
+ # For prefill, create standard causal mask
166
+ # Position i can attend to positions 0..i
167
+ for i in range(seq_len):
168
+ # Offset for KV cache
169
+ offset = kv_seq_len - seq_len
170
+ mask[i, : offset + i + 1] = 0.0
171
+
172
+ return mask.unsqueeze(0).unsqueeze(0)
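
A small sketch of what `create_causal_mask` produces for a 3-token prefill with no cached tokens (so `kv_seq_len == seq_len`); row i is zero up to position i and `-inf` beyond it:

```python
# Sketch: inspect the causal mask for a 3-token prefill.
import torch
from src.model.attention import create_causal_mask

mask = create_causal_mask(seq_len=3, kv_seq_len=3, dtype=torch.float32, device=torch.device("cpu"))
print(mask.shape)   # torch.Size([1, 1, 3, 3])
print(mask[0, 0])
# tensor([[0., -inf, -inf],
#         [0., 0., -inf],
#         [0., 0., 0.]])
```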
src/model/config.py ADDED
@@ -0,0 +1,116 @@
1
+ """
2
+ Model configuration for SLM v1.
3
+ Defines all hyperparameters based on architecture specification.
4
+ """
5
+
6
+ from dataclasses import dataclass
7
+ from typing import Optional
8
+ import yaml
9
+
10
+
11
+ @dataclass
12
+ class SLMConfig:
13
+ """Configuration class for the SLM model.
14
+
15
+ Architecture: 120M parameter decoder-only transformer
16
+ - 8 layers, 1024 hidden size, 16 attention heads
17
+ - RMSNorm (pre-norm), GELU FFN, RoPE positions
18
+ - Explicit KV cache for efficient inference
19
+ """
20
+
21
+ # Model architecture
22
+ vocab_size: int = 16384
23
+ hidden_size: int = 1024
24
+ num_layers: int = 8
25
+ num_heads: int = 16
26
+ head_dim: int = 64
27
+ intermediate_size: int = 4096 # 4 * hidden_size
28
+
29
+ # Position encoding
30
+ max_position_embeddings: int = 1024
31
+ rope_theta: float = 10000.0
32
+
33
+ # Normalization
34
+ rms_norm_eps: float = 1e-6
35
+
36
+ # Embeddings
37
+ tie_word_embeddings: bool = True
38
+
39
+ # Dropout (disabled for inference, optional for training)
40
+ dropout: float = 0.0
41
+ attention_dropout: float = 0.0
42
+
43
+ # Precision
44
+ torch_dtype: str = "float16"
45
+
46
+ def __post_init__(self):
47
+ """Validate configuration after initialization."""
48
+ assert self.hidden_size % self.num_heads == 0, \
49
+ f"hidden_size ({self.hidden_size}) must be divisible by num_heads ({self.num_heads})"
50
+ assert self.head_dim == self.hidden_size // self.num_heads, \
51
+ f"head_dim ({self.head_dim}) must equal hidden_size // num_heads ({self.hidden_size // self.num_heads})"
52
+
53
+ @classmethod
54
+ def from_yaml(cls, path: str) -> "SLMConfig":
55
+ """Load configuration from YAML file."""
56
+ with open(path, "r") as f:
57
+ config_dict = yaml.safe_load(f)
58
+
59
+ model_config = config_dict.get("model", {})
60
+ return cls(**model_config)
61
+
62
+ def to_dict(self) -> dict:
63
+ """Convert configuration to dictionary."""
64
+ return {
65
+ "vocab_size": self.vocab_size,
66
+ "hidden_size": self.hidden_size,
67
+ "num_layers": self.num_layers,
68
+ "num_heads": self.num_heads,
69
+ "head_dim": self.head_dim,
70
+ "intermediate_size": self.intermediate_size,
71
+ "max_position_embeddings": self.max_position_embeddings,
72
+ "rope_theta": self.rope_theta,
73
+ "rms_norm_eps": self.rms_norm_eps,
74
+ "tie_word_embeddings": self.tie_word_embeddings,
75
+ "dropout": self.dropout,
76
+ "attention_dropout": self.attention_dropout,
77
+ "torch_dtype": self.torch_dtype,
78
+ }
79
+
80
+ @property
81
+ def num_parameters(self) -> int:
82
+ """Estimate total number of parameters."""
83
+ # Embedding: vocab_size * hidden_size
84
+ embedding_params = self.vocab_size * self.hidden_size
85
+
86
+ # Per layer:
87
+ # - Attention: 4 * hidden_size^2 (Q, K, V, O projections)
88
+ # - FFN: 2 * hidden_size * intermediate_size
89
+ # - Norms: 2 * hidden_size
90
+ attention_params = 4 * self.hidden_size * self.hidden_size
91
+ ffn_params = 2 * self.hidden_size * self.intermediate_size
92
+ norm_params = 2 * self.hidden_size
93
+
94
+ layer_params = attention_params + ffn_params + norm_params
95
+ total_layer_params = self.num_layers * layer_params
96
+
97
+ # Output head (tied with embedding if enabled)
98
+ output_params = 0 if self.tie_word_embeddings else self.vocab_size * self.hidden_size
99
+
100
+ # Final norm
101
+ final_norm_params = self.hidden_size
102
+
103
+ return embedding_params + total_layer_params + output_params + final_norm_params
104
+
105
+ def __repr__(self) -> str:
106
+ params_m = self.num_parameters / 1e6
107
+ return (
108
+ f"SLMConfig(\n"
109
+ f" vocab_size={self.vocab_size},\n"
110
+ f" hidden_size={self.hidden_size},\n"
111
+ f" num_layers={self.num_layers},\n"
112
+ f" num_heads={self.num_heads},\n"
113
+ f" max_position_embeddings={self.max_position_embeddings},\n"
114
+ f" estimated_params={params_m:.1f}M\n"
115
+ f")"
116
+ )
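
As a worked check of the `num_parameters` estimate with the defaults above: embeddings contribute 16384 × 1024 ≈ 16.8M, each layer 4 × 1024² ≈ 4.2M (attention) plus 2 × 1024 × 4096 ≈ 8.4M (FFN), so 8 layers ≈ 100.7M; with tied embeddings the total is about 117.5M, matching the PebbleLM-117M name.

```python
# Sketch: the default config's parameter estimate works out to ~117M.
from src.model.config import SLMConfig

cfg = SLMConfig()
print(cfg.num_parameters)  # 117457920 (~117.5M)
print(cfg)                 # __repr__ reports estimated_params=117.5M
```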
src/model/decoder.py ADDED
@@ -0,0 +1,106 @@
1
+ """
2
+ Decoder Block for SLM.
3
+ Pre-norm architecture with residual connections.
4
+ """
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ from typing import Optional, Tuple
9
+
10
+ from .config import SLMConfig
11
+ from .normalization import RMSNorm
12
+ from .attention import MultiHeadAttention
13
+ from .ffn import FeedForward
14
+ from .kv_cache import KVCache
15
+
16
+
17
+ class DecoderBlock(nn.Module):
18
+ """Single decoder block with pre-norm architecture.
19
+
20
+ Structure (Pre-Norm):
21
+ ```
22
+ x
23
+ ├─ RMSNorm
24
+ ├─ Multi-Head Attention
25
+ ├─ Residual Add
26
+ ├─ RMSNorm
27
+ ├─ Feed-Forward Network
28
+ └─ Residual Add
29
+ ```
30
+
31
+ Why pre-norm:
32
+ - More stable gradients in FP16 training
33
+ - Better quantization behavior
34
+ - Easier ONNX export (no layer-crossing dependencies)
35
+ """
36
+
37
+ def __init__(self, config: SLMConfig, layer_idx: int):
38
+ """Initialize decoder block.
39
+
40
+ Args:
41
+ config: Model configuration
42
+ layer_idx: Index of this layer
43
+ """
44
+ super().__init__()
45
+ self.config = config
46
+ self.layer_idx = layer_idx
47
+
48
+ # Pre-attention norm
49
+ self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
50
+
51
+ # Self-attention
52
+ self.self_attn = MultiHeadAttention(config, layer_idx)
53
+
54
+ # Pre-FFN norm
55
+ self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
56
+
57
+ # Feed-forward network
58
+ self.mlp = FeedForward(config)
59
+
60
+ def forward(
61
+ self,
62
+ hidden_states: torch.Tensor,
63
+ position_ids: torch.Tensor,
64
+ attention_mask: Optional[torch.Tensor] = None,
65
+ kv_cache: Optional[KVCache] = None,
66
+ use_cache: bool = False,
67
+ ) -> Tuple[torch.Tensor, Optional[KVCache]]:
68
+ """Forward pass through decoder block.
69
+
70
+ Args:
71
+ hidden_states: Input tensor [batch, seq, hidden_size]
72
+ position_ids: Position indices [batch, seq]
73
+ attention_mask: Causal attention mask
74
+ kv_cache: Optional KV cache
75
+ use_cache: Whether to use/update cache
76
+
77
+ Returns:
78
+ Tuple of (output, kv_cache)
79
+ """
80
+ # Store residual
81
+ residual = hidden_states
82
+
83
+ # Pre-norm -> Attention
84
+ hidden_states = self.input_layernorm(hidden_states)
85
+ hidden_states, kv_cache = self.self_attn(
86
+ hidden_states=hidden_states,
87
+ position_ids=position_ids,
88
+ attention_mask=attention_mask,
89
+ kv_cache=kv_cache,
90
+ use_cache=use_cache,
91
+ )
92
+
93
+ # Residual connection
94
+ hidden_states = residual + hidden_states
95
+
96
+ # Store residual
97
+ residual = hidden_states
98
+
99
+ # Pre-norm -> FFN
100
+ hidden_states = self.post_attention_layernorm(hidden_states)
101
+ hidden_states = self.mlp(hidden_states)
102
+
103
+ # Residual connection
104
+ hidden_states = residual + hidden_states
105
+
106
+ return hidden_states, kv_cache
src/model/ffn.py ADDED
@@ -0,0 +1,67 @@
1
+ """
2
+ Feed-Forward Network for SLM.
3
+ Uses GELU activation (not SwiGLU) for better INT8 quantization.
4
+ """
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+
10
+ from .config import SLMConfig
11
+
12
+
13
+ class FeedForward(nn.Module):
14
+ """Feed-Forward Network with GELU activation.
15
+
16
+ Architecture: Linear -> GELU -> Linear
17
+ - Input: [batch, seq, hidden_size=1024]
18
+ - Hidden: [batch, seq, intermediate_size=4096]
19
+ - Output: [batch, seq, hidden_size=1024]
20
+
21
+ Why GELU over SwiGLU:
22
+ - Fewer operations (2 matmuls vs 3)
23
+ - Better INT8 quantization behavior
24
+ - Full QNN support without decomposition
25
+ - SwiGLU benefits mainly appear at >1B parameters
26
+ """
27
+
28
+ def __init__(self, config: SLMConfig):
29
+ """Initialize FFN.
30
+
31
+ Args:
32
+ config: Model configuration
33
+ """
34
+ super().__init__()
35
+
36
+ self.hidden_size = config.hidden_size
37
+ self.intermediate_size = config.intermediate_size
38
+
39
+ # Up projection: hidden -> intermediate
40
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
41
+
42
+ # Down projection: intermediate -> hidden
43
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
44
+
45
+ self.dropout = config.dropout
46
+
47
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
48
+ """Forward pass through FFN.
49
+
50
+ Args:
51
+ x: Input tensor [batch, seq, hidden_size]
52
+
53
+ Returns:
54
+ Output tensor [batch, seq, hidden_size]
55
+ """
56
+ # Up project and apply GELU
57
+ hidden = self.up_proj(x)
58
+ hidden = F.gelu(hidden, approximate="tanh")
59
+
60
+ # Down project
61
+ output = self.down_proj(hidden)
62
+
63
+ # Apply dropout during training
64
+ if self.training and self.dropout > 0:
65
+ output = F.dropout(output, p=self.dropout)
66
+
67
+ return output
src/model/kv_cache.py ADDED
@@ -0,0 +1,127 @@
1
+ """
2
+ Explicit KV Cache management for efficient inference.
3
+ This is critical for Qualcomm deployment and agent control loops.
4
+ """
5
+
6
+ import torch
7
+ from typing import Optional, Tuple
8
+ from dataclasses import dataclass
9
+
10
+
11
+ @dataclass
12
+ class KVCache:
13
+ """Key-Value cache for transformer inference.
14
+
15
+ Layout: [num_layers, batch_size, num_heads, max_seq_len, head_dim]
16
+
17
+ This explicit cache enables:
18
+ - Efficient autoregressive decoding
19
+ - Cache offloading for memory management
20
+ - Sliding window attention (future)
21
+ - Agent control loops with cache manipulation
22
+ """
23
+
24
+ key_cache: torch.Tensor # [num_layers, batch, heads, max_len, head_dim]
25
+ value_cache: torch.Tensor # [num_layers, batch, heads, max_len, head_dim]
26
+ seq_len: int # Current sequence length in cache
27
+
28
+ @classmethod
29
+ def create(
30
+ cls,
31
+ num_layers: int,
32
+ batch_size: int,
33
+ num_heads: int,
34
+ max_seq_len: int,
35
+ head_dim: int,
36
+ dtype: torch.dtype = torch.float16,
37
+ device: torch.device = None,
38
+ ) -> "KVCache":
39
+ """Create an empty KV cache.
40
+
41
+ Args:
42
+ num_layers: Number of transformer layers
43
+ batch_size: Batch size
44
+ num_heads: Number of attention heads
45
+ max_seq_len: Maximum sequence length
46
+ head_dim: Dimension per attention head
47
+ dtype: Data type for cache tensors
48
+ device: Device to create cache on
49
+
50
+ Returns:
51
+ Initialized KVCache with zero tensors
52
+ """
53
+ shape = (num_layers, batch_size, num_heads, max_seq_len, head_dim)
54
+
55
+ key_cache = torch.zeros(shape, dtype=dtype, device=device)
56
+ value_cache = torch.zeros(shape, dtype=dtype, device=device)
57
+
58
+ return cls(key_cache=key_cache, value_cache=value_cache, seq_len=0)
59
+
60
+ def update(
61
+ self,
62
+ layer_idx: int,
63
+ key: torch.Tensor,
64
+ value: torch.Tensor,
65
+ position: int,
66
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
67
+ """Update cache for a specific layer and return full K, V.
68
+
69
+ Args:
70
+ layer_idx: Index of the transformer layer
71
+ key: New key tensor [batch, heads, seq_len, head_dim]
72
+ value: New value tensor [batch, heads, seq_len, head_dim]
73
+ position: Starting position for the new tokens
74
+
75
+ Returns:
76
+ Tuple of (full_key, full_value) including cached values
77
+ """
78
+ seq_len = key.shape[2]
79
+ end_pos = position + seq_len
80
+
81
+ # Store new keys and values
82
+ self.key_cache[layer_idx, :, :, position:end_pos, :] = key
83
+ self.value_cache[layer_idx, :, :, position:end_pos, :] = value
84
+
85
+ # Update sequence length
86
+ self.seq_len = max(self.seq_len, end_pos)
87
+
88
+ # Return full K, V up to current position
89
+ return (
90
+ self.key_cache[layer_idx, :, :, :end_pos, :],
91
+ self.value_cache[layer_idx, :, :, :end_pos, :],
92
+ )
93
+
94
+ def get(
95
+ self,
96
+ layer_idx: int,
97
+ end_pos: Optional[int] = None,
98
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
99
+ """Get cached K, V for a specific layer.
100
+
101
+ Args:
102
+ layer_idx: Index of the transformer layer
103
+ end_pos: End position (defaults to current seq_len)
104
+
105
+ Returns:
106
+ Tuple of (key, value) tensors
107
+ """
108
+ if end_pos is None:
109
+ end_pos = self.seq_len
110
+
111
+ return (
112
+ self.key_cache[layer_idx, :, :, :end_pos, :],
113
+ self.value_cache[layer_idx, :, :, :end_pos, :],
114
+ )
115
+
116
+ def reset(self):
117
+ """Reset the cache to empty state."""
118
+ self.key_cache.zero_()
119
+ self.value_cache.zero_()
120
+ self.seq_len = 0
121
+
122
+ @property
123
+ def memory_usage_mb(self) -> float:
124
+ """Calculate memory usage in megabytes."""
125
+ total_bytes = self.key_cache.numel() * self.key_cache.element_size()
126
+ total_bytes += self.value_cache.numel() * self.value_cache.element_size()
127
+ return total_bytes / (1024 * 1024)
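
A sketch of allocating a cache sized for the default configuration (8 layers, 16 heads, 1024 max positions, head_dim 64, FP16) and checking its footprint: keys plus values hold 2 × 8 × 1 × 16 × 1024 × 64 half-precision values, i.e. 32 MiB.

```python
# Sketch: allocate a cache for the default config and check its footprint.
import torch
from src.model.kv_cache import KVCache

cache = KVCache.create(
    num_layers=8,
    batch_size=1,
    num_heads=16,
    max_seq_len=1024,
    head_dim=64,
    dtype=torch.float16,
    device=torch.device("cpu"),
)
print(cache.memory_usage_mb)  # 32.0
```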
src/model/normalization.py ADDED
@@ -0,0 +1,50 @@
1
+ """
2
+ RMSNorm implementation for SLM.
3
+ Pre-norm architecture for stable FP16 training and better quantization.
4
+ """
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+
9
+
10
+ class RMSNorm(nn.Module):
11
+ """Root Mean Square Layer Normalization.
12
+
13
+ RMSNorm is computationally simpler than LayerNorm as it doesn't
14
+ compute mean statistics. This makes it:
15
+ - Faster to compute
16
+ - More stable in FP16
17
+ - Better for quantization
18
+
19
+ Reference: https://arxiv.org/abs/1910.07467
20
+ """
21
+
22
+ def __init__(self, hidden_size: int, eps: float = 1e-6):
23
+ """Initialize RMSNorm.
24
+
25
+ Args:
26
+ hidden_size: The size of the hidden dimension
27
+ eps: Small constant for numerical stability
28
+ """
29
+ super().__init__()
30
+ self.weight = nn.Parameter(torch.ones(hidden_size))
31
+ self.eps = eps
32
+
33
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
34
+ """Apply RMS normalization.
35
+
36
+ Args:
37
+ x: Input tensor of shape [..., hidden_size]
38
+
39
+ Returns:
40
+ Normalized tensor of same shape
41
+ """
42
+ # Compute RMS: sqrt(mean(x^2))
43
+ # Use float32 for numerical stability, then cast back
44
+ input_dtype = x.dtype
45
+ x = x.float()
46
+
47
+ variance = x.pow(2).mean(dim=-1, keepdim=True)
48
+ x = x * torch.rsqrt(variance + self.eps)
49
+
50
+ return (self.weight * x).to(input_dtype)
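For illustration (not part of the commit): a quick sanity check of the layer on random activations, with an arbitrary hidden size. With the weight at its initial value of ones, each position comes out with roughly unit RMS.

```python
import torch

from src.model.normalization import RMSNorm  # assumed import path

norm = RMSNorm(hidden_size=512)
x = torch.randn(2, 10, 512)            # [batch, seq, hidden]
y = norm(x)

print(y.shape)                         # torch.Size([2, 10, 512])
rms = y.pow(2).mean(dim=-1).sqrt()     # per-position RMS of the output
print(round(rms.mean().item(), 3))     # ~1.0 at initialization
```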
src/model/rope.py ADDED
@@ -0,0 +1,110 @@
1
+ """
2
+ Rotary Position Embedding (RoPE) implementation.
3
+ Applied to Q and K only, with fixed base (no dynamic scaling).
4
+ """
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ from typing import Tuple
9
+
10
+
11
+ class RotaryEmbedding(nn.Module):
12
+ """Rotary Position Embedding (RoPE).
13
+
14
+ RoPE encodes position information by rotating the query and key vectors.
15
+ Key properties:
16
+ - Parameter-free (no learnable embeddings)
17
+ - Naturally encodes relative positions
18
+ - Extrapolates well to longer sequences
19
+
20
+ Reference: https://arxiv.org/abs/2104.09864
21
+ """
22
+
23
+ def __init__(
24
+ self,
25
+ dim: int,
26
+ max_position_embeddings: int = 1024,
27
+ base: float = 10000.0,
28
+ ):
29
+ """Initialize RoPE.
30
+
31
+ Args:
32
+ dim: Dimension of the rotary embedding (usually head_dim)
33
+ max_position_embeddings: Maximum sequence length
34
+ base: Base for the frequency computation
35
+ """
36
+ super().__init__()
37
+ self.dim = dim
38
+ self.max_position_embeddings = max_position_embeddings
39
+ self.base = base
40
+
41
+ # Precompute inverse frequencies
42
+ inv_freq = 1.0 / (
43
+ self.base ** (torch.arange(0, self.dim, 2).float() / self.dim)
44
+ )
45
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
46
+
47
+ # Precompute cos and sin for all positions
48
+ self._set_cos_sin_cache(max_position_embeddings)
49
+
50
+ def _set_cos_sin_cache(self, seq_len: int):
51
+ """Precompute cos and sin values for positions."""
52
+ self.max_seq_len_cached = seq_len
53
+ t = torch.arange(seq_len, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
54
+
55
+ # Outer product: [seq_len] x [dim/2] -> [seq_len, dim/2]
56
+ freqs = torch.outer(t, self.inv_freq)
57
+
58
+ # Concatenate to get [seq_len, dim]
59
+ emb = torch.cat((freqs, freqs), dim=-1)
60
+
61
+ self.register_buffer("cos_cached", emb.cos(), persistent=False)
62
+ self.register_buffer("sin_cached", emb.sin(), persistent=False)
63
+
64
+ def forward(
65
+ self,
66
+ q: torch.Tensor,
67
+ k: torch.Tensor,
68
+ position_ids: torch.Tensor,
69
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
70
+ """Apply rotary embeddings to query and key tensors.
71
+
72
+ Args:
73
+ q: Query tensor of shape [batch, num_heads, seq_len, head_dim]
74
+ k: Key tensor of shape [batch, num_heads, seq_len, head_dim]
75
+ position_ids: Position indices of shape [batch, seq_len]
76
+
77
+ Returns:
78
+ Tuple of (rotated_q, rotated_k) with same shapes as inputs
79
+ """
80
+ seq_len = position_ids.max() + 1
81
+
82
+ # Extend cache if needed
83
+ if seq_len > self.max_seq_len_cached:
84
+ self._set_cos_sin_cache(seq_len)
85
+
86
+ # Get cos and sin for the positions
87
+ # Shape: [batch, seq_len, dim]
88
+ cos = self.cos_cached[position_ids]
89
+ sin = self.sin_cached[position_ids]
90
+
91
+ # Add head dimension: [batch, 1, seq_len, dim]
92
+ cos = cos.unsqueeze(1)
93
+ sin = sin.unsqueeze(1)
94
+
95
+ # Apply rotation
96
+ q_embed = (q * cos) + (self._rotate_half(q) * sin)
97
+ k_embed = (k * cos) + (self._rotate_half(k) * sin)
98
+
99
+ return q_embed, k_embed
100
+
101
+ @staticmethod
102
+ def _rotate_half(x: torch.Tensor) -> torch.Tensor:
103
+ """Rotate half the hidden dims of the input.
104
+
105
+ Splits the input into two halves and rotates:
106
+ [x1, x2, x3, x4] -> [-x3, -x4, x1, x2]
107
+ """
108
+ x1 = x[..., : x.shape[-1] // 2]
109
+ x2 = x[..., x.shape[-1] // 2 :]
110
+ return torch.cat((-x2, x1), dim=-1)
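For illustration (not part of the commit): applying the rotary embedding to random Q/K tensors with arbitrary sizes. Because RoPE is a pure rotation, it leaves per-head vector norms unchanged while injecting position information.

```python
import torch

from src.model.rope import RotaryEmbedding  # assumed import path

rope = RotaryEmbedding(dim=64, max_position_embeddings=1024)

batch, heads, seq, head_dim = 1, 8, 16, 64
q = torch.randn(batch, heads, seq, head_dim)
k = torch.randn(batch, heads, seq, head_dim)
position_ids = torch.arange(seq).unsqueeze(0)    # [1, seq]

q_rot, k_rot = rope(q, k, position_ids)
print(q_rot.shape, k_rot.shape)                  # shapes unchanged

# Rotations are norm-preserving.
print(torch.allclose(q.norm(dim=-1), q_rot.norm(dim=-1), atol=1e-5))  # True
```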
src/model/transformer.py ADDED
@@ -0,0 +1,323 @@
1
+ """
2
+ Full Transformer model for SLM.
3
+ Implements the mandatory prefill/decode API for Qualcomm deployment.
4
+ """
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ from typing import Optional, Tuple, Union
9
+ from dataclasses import dataclass
10
+
11
+ from .config import SLMConfig
12
+ from .normalization import RMSNorm
13
+ from .decoder import DecoderBlock
14
+ from .attention import create_causal_mask
15
+ from .kv_cache import KVCache
16
+
17
+
18
+ @dataclass
19
+ class SLMOutput:
20
+ """Output from SLM forward pass."""
21
+
22
+ logits: torch.Tensor # [batch, seq, vocab_size]
23
+ kv_cache: Optional[KVCache] = None
24
+ hidden_states: Optional[torch.Tensor] = None
25
+
26
+
27
+ class SLMModel(nn.Module):
28
+ """Core transformer model (without LM head).
29
+
30
+ This is the decoder stack:
31
+ - Token embedding
32
+ - N decoder blocks
33
+ - Final RMSNorm
34
+ """
35
+
36
+ def __init__(self, config: SLMConfig):
37
+ """Initialize transformer model.
38
+
39
+ Args:
40
+ config: Model configuration
41
+ """
42
+ super().__init__()
43
+ self.config = config
44
+
45
+ # Token embeddings
46
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)
47
+
48
+ # Decoder layers
49
+ self.layers = nn.ModuleList([
50
+ DecoderBlock(config, layer_idx=i)
51
+ for i in range(config.num_layers)
52
+ ])
53
+
54
+ # Final normalization
55
+ self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
56
+
57
+ def forward(
58
+ self,
59
+ input_ids: torch.Tensor,
60
+ position_ids: Optional[torch.Tensor] = None,
61
+ attention_mask: Optional[torch.Tensor] = None,
62
+ kv_cache: Optional[KVCache] = None,
63
+ use_cache: bool = False,
64
+ ) -> Tuple[torch.Tensor, Optional[KVCache]]:
65
+ """Forward pass through transformer.
66
+
67
+ Args:
68
+ input_ids: Token IDs [batch, seq]
69
+ position_ids: Position indices [batch, seq]
70
+ attention_mask: Causal mask
71
+ kv_cache: Optional KV cache
72
+ use_cache: Whether to use/update cache
73
+
74
+ Returns:
75
+ Tuple of (hidden_states, kv_cache)
76
+ """
77
+ batch_size, seq_len = input_ids.shape
78
+
79
+ # Create position IDs if not provided
80
+ if position_ids is None:
81
+ if kv_cache is not None and kv_cache.seq_len > 0:
82
+ # For decode: position is the current cache length
83
+ position_ids = torch.arange(
84
+ kv_cache.seq_len, kv_cache.seq_len + seq_len,
85
+ device=input_ids.device
86
+ ).unsqueeze(0).expand(batch_size, -1)
87
+ else:
88
+ # For prefill: positions are 0..seq_len-1
89
+ position_ids = torch.arange(
90
+ seq_len, device=input_ids.device
91
+ ).unsqueeze(0).expand(batch_size, -1)
92
+
93
+ # Create attention mask if not provided
94
+ if attention_mask is None:
95
+ kv_seq_len = seq_len
96
+ if kv_cache is not None and kv_cache.seq_len > 0:
97
+ kv_seq_len = kv_cache.seq_len + seq_len
98
+
99
+ attention_mask = create_causal_mask(
100
+ seq_len=seq_len,
101
+ kv_seq_len=kv_seq_len,
102
+ dtype=self.embed_tokens.weight.dtype,
103
+ device=input_ids.device,
104
+ )
105
+
106
+ # Token embeddings
107
+ hidden_states = self.embed_tokens(input_ids)
108
+
109
+ # Pass through decoder layers
110
+ for layer in self.layers:
111
+ hidden_states, kv_cache = layer(
112
+ hidden_states=hidden_states,
113
+ position_ids=position_ids,
114
+ attention_mask=attention_mask,
115
+ kv_cache=kv_cache,
116
+ use_cache=use_cache,
117
+ )
118
+
119
+ # Final normalization
120
+ hidden_states = self.norm(hidden_states)
121
+
122
+ return hidden_states, kv_cache
123
+
124
+
125
+ class SLMForCausalLM(nn.Module):
126
+ """SLM with language modeling head.
127
+
128
+ This is the full model with:
129
+ - Transformer backbone
130
+ - LM head (tied with embeddings)
131
+ - Prefill/Decode API for Qualcomm deployment
132
+ """
133
+
134
+ def __init__(self, config: SLMConfig):
135
+ """Initialize causal LM.
136
+
137
+ Args:
138
+ config: Model configuration
139
+ """
140
+ super().__init__()
141
+ self.config = config
142
+
143
+ # Transformer backbone
144
+ self.model = SLMModel(config)
145
+
146
+ # LM head
147
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
148
+
149
+ # Tie weights if configured
150
+ if config.tie_word_embeddings:
151
+ self.lm_head.weight = self.model.embed_tokens.weight
152
+
153
+ # Initialize weights
154
+ self.apply(self._init_weights)
155
+
156
+ def _init_weights(self, module: nn.Module):
157
+ """Initialize model weights."""
158
+ std = 0.02
159
+ if isinstance(module, nn.Linear):
160
+ module.weight.data.normal_(mean=0.0, std=std)
161
+ if module.bias is not None:
162
+ module.bias.data.zero_()
163
+ elif isinstance(module, nn.Embedding):
164
+ module.weight.data.normal_(mean=0.0, std=std)
165
+
166
+ def forward(
167
+ self,
168
+ input_ids: torch.Tensor,
169
+ position_ids: Optional[torch.Tensor] = None,
170
+ attention_mask: Optional[torch.Tensor] = None,
171
+ kv_cache: Optional[KVCache] = None,
172
+ use_cache: bool = False,
173
+ labels: Optional[torch.Tensor] = None,
174
+ ) -> SLMOutput:
175
+ """Forward pass for causal LM.
176
+
177
+ Args:
178
+ input_ids: Token IDs [batch, seq]
179
+ position_ids: Position indices [batch, seq]
180
+ attention_mask: Causal mask
181
+ kv_cache: Optional KV cache
182
+ use_cache: Whether to use/update cache
183
+ labels: Optional labels, accepted for API compatibility (the loss is computed externally by the training loss function)
184
+
185
+ Returns:
186
+ SLMOutput with logits, updated KV cache, and hidden states
187
+ """
188
+ # Get hidden states from transformer
189
+ hidden_states, kv_cache = self.model(
190
+ input_ids=input_ids,
191
+ position_ids=position_ids,
192
+ attention_mask=attention_mask,
193
+ kv_cache=kv_cache,
194
+ use_cache=use_cache,
195
+ )
196
+
197
+ # Compute logits
198
+ logits = self.lm_head(hidden_states)
199
+
200
+ return SLMOutput(
201
+ logits=logits,
202
+ kv_cache=kv_cache,
203
+ hidden_states=hidden_states,
204
+ )
205
+
206
+ # =========================================================================
207
+ # MANDATORY KV CACHE API (from architecture.txt)
208
+ # =========================================================================
209
+
210
+ def prefill(
211
+ self,
212
+ input_ids: torch.Tensor,
213
+ kv_cache: Optional[KVCache] = None,
214
+ ) -> Tuple[torch.Tensor, KVCache]:
215
+ """Prefill: Process full prompt and populate KV cache.
216
+
217
+ This is Graph 1 for Qualcomm deployment.
218
+
219
+ Args:
220
+ input_ids: Token IDs [batch, seq]
221
+ kv_cache: Empty or existing KV cache
222
+
223
+ Returns:
224
+ Tuple of (logits [batch, seq, vocab], populated_kv_cache)
225
+ """
226
+ batch_size = input_ids.shape[0]
227
+
228
+ # Create empty cache if not provided
229
+ if kv_cache is None:
230
+ kv_cache = KVCache.create(
231
+ num_layers=self.config.num_layers,
232
+ batch_size=batch_size,
233
+ num_heads=self.config.num_heads,
234
+ max_seq_len=self.config.max_position_embeddings,
235
+ head_dim=self.config.head_dim,
236
+ dtype=self.model.embed_tokens.weight.dtype,
237
+ device=input_ids.device,
238
+ )
239
+
240
+ # Forward pass with cache
241
+ output = self.forward(
242
+ input_ids=input_ids,
243
+ kv_cache=kv_cache,
244
+ use_cache=True,
245
+ )
246
+
247
+ return output.logits, output.kv_cache
248
+
249
+ def decode(
250
+ self,
251
+ input_id: torch.Tensor,
252
+ kv_cache: KVCache,
253
+ position: Optional[int] = None,
254
+ ) -> Tuple[torch.Tensor, KVCache]:
255
+ """Decode: Generate single token using KV cache.
256
+
257
+ This is Graph 2 for Qualcomm deployment.
258
+
259
+ Args:
260
+ input_id: Single token ID [batch, 1]
261
+ kv_cache: Populated KV cache from prefill or previous decode
262
+ position: Position index (defaults to cache.seq_len)
263
+
264
+ Returns:
265
+ Tuple of (logits [batch, 1, vocab], updated_kv_cache)
266
+ """
267
+ batch_size = input_id.shape[0]
268
+
269
+ # Get position from cache if not provided
270
+ if position is None:
271
+ position = kv_cache.seq_len
272
+
273
+ # Create position IDs
274
+ position_ids = torch.tensor(
275
+ [[position]], device=input_id.device
276
+ ).expand(batch_size, -1)
277
+
278
+ # Forward pass with cache
279
+ output = self.forward(
280
+ input_ids=input_id,
281
+ position_ids=position_ids,
282
+ kv_cache=kv_cache,
283
+ use_cache=True,
284
+ )
285
+
286
+ return output.logits, output.kv_cache
287
+
288
+ def create_empty_cache(
289
+ self,
290
+ batch_size: int = 1,
291
+ device: Optional[torch.device] = None,
292
+ ) -> KVCache:
293
+ """Create an empty KV cache for inference.
294
+
295
+ Args:
296
+ batch_size: Batch size
297
+ device: Device for cache tensors
298
+
299
+ Returns:
300
+ Empty KVCache ready for prefill
301
+ """
302
+ if device is None:
303
+ device = self.model.embed_tokens.weight.device
304
+
305
+ return KVCache.create(
306
+ num_layers=self.config.num_layers,
307
+ batch_size=batch_size,
308
+ num_heads=self.config.num_heads,
309
+ max_seq_len=self.config.max_position_embeddings,
310
+ head_dim=self.config.head_dim,
311
+ dtype=self.model.embed_tokens.weight.dtype,
312
+ device=device,
313
+ )
314
+
315
+ @property
316
+ def num_parameters(self) -> int:
317
+ """Count total trainable parameters."""
318
+ return sum(p.numel() for p in self.parameters() if p.requires_grad)
319
+
320
+ @property
321
+ def device(self) -> torch.device:
322
+ """Get model device."""
323
+ return self.model.embed_tokens.weight.device
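For illustration (not part of the commit): a greedy generation loop built only on the `prefill`/`decode` API above. The helper is a sketch; it assumes an already constructed `SLMForCausalLM` instance and does not handle EOS stopping or sampling.

```python
import torch


@torch.no_grad()
def greedy_generate(model, input_ids: torch.Tensor, max_new_tokens: int = 32) -> torch.Tensor:
    """Greedy decoding on top of the two-graph prefill/decode API."""
    model.eval()

    # Graph 1: run the whole prompt once and populate the KV cache.
    logits, cache = model.prefill(input_ids)
    next_token = logits[:, -1, :].argmax(dim=-1, keepdim=True)   # [batch, 1]
    generated = [next_token]

    # Graph 2: feed one token at a time, reusing the cache.
    for _ in range(max_new_tokens - 1):
        logits, cache = model.decode(next_token, cache)
        next_token = logits[:, -1, :].argmax(dim=-1, keepdim=True)
        generated.append(next_token)

    return torch.cat([input_ids] + generated, dim=-1)


# Usage, assuming `model = SLMForCausalLM(config)` has been built and weights loaded:
# prompt_ids = torch.tensor([[1, 42, 7]])   # placeholder token IDs
# output_ids = greedy_generate(model, prompt_ids, max_new_tokens=16)
```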
src/training/__init__.py ADDED
@@ -0,0 +1,16 @@
1
+ # Training components
2
+
3
+ from .loss import LanguageModelingLoss, compute_perplexity, compute_accuracy
4
+ from .optimizer import create_optimizer, create_scheduler, clip_grad_norm
5
+ from .trainer import Trainer, TrainingConfig
6
+
7
+ __all__ = [
8
+ "LanguageModelingLoss",
9
+ "compute_perplexity",
10
+ "compute_accuracy",
11
+ "create_optimizer",
12
+ "create_scheduler",
13
+ "clip_grad_norm",
14
+ "Trainer",
15
+ "TrainingConfig",
16
+ ]
src/training/loss.py ADDED
@@ -0,0 +1,123 @@
1
+ """
2
+ Loss functions for SLM training.
3
+
4
+ Cross-entropy loss with optional label smoothing.
5
+ """
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+ from typing import Optional
11
+
12
+
13
+ class LanguageModelingLoss(nn.Module):
14
+ """Cross-entropy loss for language modeling.
15
+
16
+ Handles:
17
+ - Automatic shifting of labels
18
+ - Ignoring padding tokens (-100)
19
+ - Optional label smoothing
20
+ """
21
+
22
+ def __init__(
23
+ self,
24
+ vocab_size: int,
25
+ label_smoothing: float = 0.0,
26
+ ignore_index: int = -100,
27
+ ):
28
+ """Initialize loss function.
29
+
30
+ Args:
31
+ vocab_size: Size of vocabulary
32
+ label_smoothing: Label smoothing factor (0.0 = no smoothing)
33
+ ignore_index: Index to ignore in loss calculation (padding)
34
+ """
35
+ super().__init__()
36
+ self.vocab_size = vocab_size
37
+ self.label_smoothing = label_smoothing
38
+ self.ignore_index = ignore_index
39
+
40
+ self.ce_loss = nn.CrossEntropyLoss(
41
+ ignore_index=ignore_index,
42
+ label_smoothing=label_smoothing,
43
+ )
44
+
45
+ def forward(
46
+ self,
47
+ logits: torch.Tensor,
48
+ labels: torch.Tensor,
49
+ shift_labels: bool = True,
50
+ ) -> torch.Tensor:
51
+ """Compute loss.
52
+
53
+ Args:
54
+ logits: Model output logits [batch_size, seq_len, vocab_size]
55
+ labels: Target token IDs [batch_size, seq_len]
56
+ shift_labels: Whether to shift labels (for autoregressive LM)
57
+
58
+ Returns:
59
+ Scalar loss tensor
60
+ """
61
+ if shift_labels:
62
+ # Shift so we predict next token
63
+ # logits: predict tokens 1..n
64
+ # labels: actual tokens 1..n
65
+ shift_logits = logits[..., :-1, :].contiguous()
66
+ shift_labels = labels[..., 1:].contiguous()
67
+ else:
68
+ shift_logits = logits
69
+ shift_labels = labels
70
+
71
+ # Flatten for cross-entropy
72
+ # [batch * seq_len, vocab_size]
73
+ flat_logits = shift_logits.view(-1, self.vocab_size)
74
+ # [batch * seq_len]
75
+ flat_labels = shift_labels.view(-1)
76
+
77
+ loss = self.ce_loss(flat_logits, flat_labels)
78
+
79
+ return loss
80
+
81
+
82
+ def compute_perplexity(loss: torch.Tensor) -> torch.Tensor:
83
+ """Compute perplexity from cross-entropy loss.
84
+
85
+ Args:
86
+ loss: Cross-entropy loss value
87
+
88
+ Returns:
89
+ Perplexity (exp of loss)
90
+ """
91
+ return torch.exp(loss)
92
+
93
+
94
+ def compute_accuracy(
95
+ logits: torch.Tensor,
96
+ labels: torch.Tensor,
97
+ ignore_index: int = -100,
98
+ ) -> torch.Tensor:
99
+ """Compute token prediction accuracy.
100
+
101
+ Args:
102
+ logits: Model output logits [batch_size, seq_len, vocab_size]
103
+ labels: Target token IDs [batch_size, seq_len]
104
+ ignore_index: Index to ignore in accuracy calculation
105
+
106
+ Returns:
107
+ Accuracy as a scalar tensor
108
+ """
109
+ # Shift for autoregressive prediction
110
+ shift_logits = logits[..., :-1, :].contiguous()
111
+ shift_labels = labels[..., 1:].contiguous()
112
+
113
+ # Get predictions
114
+ predictions = shift_logits.argmax(dim=-1)
115
+
116
+ # Mask for valid positions
117
+ mask = shift_labels != ignore_index
118
+
119
+ # Compute accuracy on valid positions
120
+ correct = (predictions == shift_labels) & mask
121
+ accuracy = correct.sum().float() / mask.sum().float()
122
+
123
+ return accuracy
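For illustration (not part of the commit): the helpers above on random data, showing that positions labelled -100 are excluded from both loss and accuracy. With purely random logits, the loss sits near ln(vocab_size) and the perplexity near vocab_size.

```python
import torch

from src.training.loss import LanguageModelingLoss, compute_perplexity, compute_accuracy  # assumed path

vocab_size = 100
loss_fn = LanguageModelingLoss(vocab_size=vocab_size)

logits = torch.randn(2, 8, vocab_size)           # [batch, seq, vocab]
labels = torch.randint(0, vocab_size, (2, 8))    # [batch, seq]
labels[:, -2:] = -100                            # mark the last two positions as padding

loss = loss_fn(logits, labels)
print(f"loss       : {loss.item():.3f}")                              # ~4.6 (= ln 100)
print(f"perplexity : {compute_perplexity(loss).item():.1f}")          # ~100
print(f"accuracy   : {compute_accuracy(logits, labels).item():.3f}")  # ~1/vocab_size
```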
src/training/optimizer.py ADDED
@@ -0,0 +1,197 @@
1
+ """
2
+ Optimizer and learning rate scheduler for SLM training.
3
+
4
+ Uses AdamW with cosine annealing and warmup.
5
+ """
6
+
7
+ import math
8
+ from typing import Optional, Tuple, List
9
+
10
+ import torch
11
+ from torch.optim import AdamW
12
+ from torch.optim.lr_scheduler import LambdaLR
13
+
14
+
15
+ def create_optimizer(
16
+ model: torch.nn.Module,
17
+ learning_rate: float = 3e-4,
18
+ weight_decay: float = 0.1,
19
+ betas: Tuple[float, float] = (0.9, 0.95),
20
+ eps: float = 1e-8,
21
+ ) -> AdamW:
22
+ """Create AdamW optimizer with weight decay.
23
+
24
+ Applies weight decay only to 2D parameters (weights, not biases/norms).
25
+
26
+ Args:
27
+ model: The model to optimize
28
+ learning_rate: Base learning rate
29
+ weight_decay: Weight decay coefficient
30
+ betas: Adam beta parameters
31
+ eps: Adam epsilon for numerical stability
32
+
33
+ Returns:
34
+ Configured AdamW optimizer
35
+ """
36
+ # Separate parameters into decay and no-decay groups
37
+ decay_params = []
38
+ no_decay_params = []
39
+
40
+ for name, param in model.named_parameters():
41
+ if not param.requires_grad:
42
+ continue
43
+
44
+ # No weight decay for:
45
+ # - 1D parameters (biases, layer norms)
46
+ # - Embedding layers
47
+ if param.dim() == 1 or "embedding" in name.lower():
48
+ no_decay_params.append(param)
49
+ else:
50
+ decay_params.append(param)
51
+
52
+ param_groups = [
53
+ {"params": decay_params, "weight_decay": weight_decay},
54
+ {"params": no_decay_params, "weight_decay": 0.0},
55
+ ]
56
+
57
+ optimizer = AdamW(
58
+ param_groups,
59
+ lr=learning_rate,
60
+ betas=betas,
61
+ eps=eps,
62
+ )
63
+
64
+ return optimizer
65
+
66
+
67
+ def create_scheduler(
68
+ optimizer: torch.optim.Optimizer,
69
+ num_training_steps: int,
70
+ warmup_ratio: float = 0.1,
71
+ min_lr_ratio: float = 0.1,
72
+ scheduler_type: str = "cosine",
73
+ ) -> LambdaLR:
74
+ """Create learning rate scheduler.
75
+
76
+ Args:
77
+ optimizer: The optimizer to schedule
78
+ num_training_steps: Total number of training steps
79
+ warmup_ratio: Ratio of warmup steps (e.g., 0.1 = 10%)
80
+ min_lr_ratio: Minimum LR as ratio of max (e.g., 0.1 = 10% of peak LR)
81
+ scheduler_type: Type of scheduler ("cosine", "linear", "constant")
82
+
83
+ Returns:
84
+ LambdaLR scheduler
85
+ """
86
+ num_warmup_steps = int(num_training_steps * warmup_ratio)
87
+
88
+ if scheduler_type == "cosine":
89
+ def lr_lambda(current_step: int) -> float:
90
+ # Warmup phase
91
+ if current_step < num_warmup_steps:
92
+ return float(current_step) / float(max(1, num_warmup_steps))
93
+
94
+ # Cosine annealing phase
95
+ progress = float(current_step - num_warmup_steps) / float(
96
+ max(1, num_training_steps - num_warmup_steps)
97
+ )
98
+ cosine_decay = 0.5 * (1.0 + math.cos(math.pi * progress))
99
+
100
+ # Scale between min_lr_ratio and 1.0
101
+ return min_lr_ratio + (1.0 - min_lr_ratio) * cosine_decay
102
+
103
+ elif scheduler_type == "linear":
104
+ def lr_lambda(current_step: int) -> float:
105
+ if current_step < num_warmup_steps:
106
+ return float(current_step) / float(max(1, num_warmup_steps))
107
+
108
+ progress = float(current_step - num_warmup_steps) / float(
109
+ max(1, num_training_steps - num_warmup_steps)
110
+ )
111
+ return max(min_lr_ratio, 1.0 - progress * (1.0 - min_lr_ratio))
112
+
113
+ elif scheduler_type == "constant":
114
+ def lr_lambda(current_step: int) -> float:
115
+ if current_step < num_warmup_steps:
116
+ return float(current_step) / float(max(1, num_warmup_steps))
117
+ return 1.0
118
+
119
+ else:
120
+ raise ValueError(f"Unknown scheduler type: {scheduler_type}")
121
+
122
+ return LambdaLR(optimizer, lr_lambda)
123
+
124
+
125
+ def get_parameter_count(model: torch.nn.Module) -> dict:
126
+ """Get detailed parameter count for a model.
127
+
128
+ Args:
129
+ model: The model to analyze
130
+
131
+ Returns:
132
+ Dictionary with parameter counts
133
+ """
134
+ total_params = 0
135
+ trainable_params = 0
136
+ embedding_params = 0
137
+
138
+ for name, param in model.named_parameters():
139
+ num_params = param.numel()
140
+ total_params += num_params
141
+
142
+ if param.requires_grad:
143
+ trainable_params += num_params
144
+
145
+ if "embedding" in name.lower():
146
+ embedding_params += num_params
147
+
148
+ return {
149
+ "total": total_params,
150
+ "trainable": trainable_params,
151
+ "embedding": embedding_params,
152
+ "non_embedding": total_params - embedding_params,
153
+ }
154
+
155
+
156
+ def get_optimizer_state(optimizer: torch.optim.Optimizer) -> dict:
157
+ """Get optimizer state statistics.
158
+
159
+ Args:
160
+ optimizer: The optimizer to analyze
161
+
162
+ Returns:
163
+ Dictionary with optimizer state info
164
+ """
165
+ num_params = sum(
166
+ sum(p.numel() for p in group["params"])
167
+ for group in optimizer.param_groups
168
+ )
169
+
170
+ current_lrs = [group["lr"] for group in optimizer.param_groups]
171
+
172
+ return {
173
+ "num_param_groups": len(optimizer.param_groups),
174
+ "total_params": num_params,
175
+ "learning_rates": current_lrs,
176
+ }
177
+
178
+
179
+ def clip_grad_norm(
180
+ model: torch.nn.Module,
181
+ max_norm: float = 1.0,
182
+ ) -> float:
183
+ """Clip gradient norm and return the norm value.
184
+
185
+ Args:
186
+ model: The model with gradients
187
+ max_norm: Maximum gradient norm
188
+
189
+ Returns:
190
+ The gradient norm before clipping
191
+ """
192
+ parameters = [p for p in model.parameters() if p.grad is not None]
193
+ if len(parameters) == 0:
194
+ return 0.0
195
+
196
+ total_norm = torch.nn.utils.clip_grad_norm_(parameters, max_norm)
197
+ return total_norm.item()
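For illustration (not part of the commit): the warmup-then-cosine schedule these helpers produce on a dummy model, with arbitrary step counts. The LR climbs linearly over the first 10% of steps, peaks at the configured learning rate, and decays to 10% of it.

```python
import torch

from src.training.optimizer import create_optimizer, create_scheduler  # assumed path

model = torch.nn.Linear(16, 16)
optimizer = create_optimizer(model, learning_rate=3e-4, weight_decay=0.1)
scheduler = create_scheduler(optimizer, num_training_steps=1000, warmup_ratio=0.1, min_lr_ratio=0.1)

lrs = []
for _ in range(1000):
    optimizer.step()    # normally preceded by loss.backward()
    scheduler.step()
    lrs.append(scheduler.get_last_lr()[0])

print(f"peak LR  : {max(lrs):.2e}")   # ~3.0e-04, reached at the end of warmup
print(f"final LR : {lrs[-1]:.2e}")    # ~3.0e-05, i.e. min_lr_ratio * peak
```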
src/training/trainer.py ADDED
@@ -0,0 +1,511 @@
1
+ """
2
+ Training loop for SLM.
3
+
4
+ Handles the complete training process including:
5
+ - Mixed precision training
6
+ - Gradient accumulation
7
+ - Checkpointing
8
+ - Logging
9
+ """
10
+
11
+ import os
12
+ import time
13
+ import json
14
+ from dataclasses import dataclass, asdict
15
+ from typing import Optional, Dict, Any, Callable
16
+ from pathlib import Path
17
+
18
+ import torch
19
+ import torch.nn as nn
20
+ from torch.utils.data import DataLoader
21
+ from torch.cuda.amp import autocast, GradScaler
22
+ from tqdm import tqdm
23
+
24
+ from .loss import LanguageModelingLoss, compute_perplexity, compute_accuracy
25
+ from .optimizer import create_optimizer, create_scheduler, clip_grad_norm
26
+
27
+
28
+ @dataclass
29
+ class TrainingConfig:
30
+ """Configuration for training."""
31
+
32
+ # Optimization
33
+ learning_rate: float = 3e-4
34
+ weight_decay: float = 0.1
35
+ warmup_ratio: float = 0.1
36
+ min_lr_ratio: float = 0.1
37
+ max_grad_norm: float = 1.0
38
+ label_smoothing: float = 0.0
39
+
40
+ # Training
41
+ num_epochs: int = 5
42
+ gradient_accumulation_steps: int = 4
43
+ fp16: bool = True
44
+
45
+ # Checkpointing
46
+ checkpoint_dir: str = "checkpoints"
47
+ save_steps: int = 1000
48
+ save_total_limit: int = 3
49
+
50
+ # Evaluation
51
+ eval_steps: int = 500
52
+ logging_steps: int = 10
53
+
54
+ # Early stopping
55
+ early_stopping_patience: int = 5 # Stop after N evals without improvement
56
+ early_stopping_threshold: float = 0.01 # Minimum improvement to reset patience
57
+
58
+ # Device
59
+ device: str = "auto"
60
+
61
+ # Compile model (torch.compile)
62
+ compile_model: bool = False
63
+
64
+ def to_dict(self) -> Dict[str, Any]:
65
+ return asdict(self)
66
+
67
+
68
+ class Trainer:
69
+ """Training loop for SLM model."""
70
+
71
+ def __init__(
72
+ self,
73
+ model: nn.Module,
74
+ config: TrainingConfig,
75
+ train_dataloader: DataLoader,
76
+ val_dataloader: Optional[DataLoader] = None,
77
+ wandb_project: Optional[str] = None,
78
+ ):
79
+ """Initialize trainer.
80
+
81
+ Args:
82
+ model: The model to train
83
+ config: Training configuration
84
+ train_dataloader: Training data loader
85
+ val_dataloader: Optional validation data loader
86
+ wandb_project: Optional W&B project name for logging
87
+ """
88
+ self.config = config
89
+ self.train_dataloader = train_dataloader
90
+ self.val_dataloader = val_dataloader
91
+
92
+ # Setup device
93
+ if config.device == "auto":
94
+ if torch.cuda.is_available():
95
+ self.device = torch.device("cuda")
96
+ elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
97
+ self.device = torch.device("mps")
98
+ else:
99
+ self.device = torch.device("cpu")
100
+ else:
101
+ self.device = torch.device(config.device)
102
+
103
+ print(f"Training on device: {self.device}")
104
+
105
+ # Move model to device
106
+ self.model = model.to(self.device)
107
+
108
+ # Get vocab size from model
109
+ if hasattr(model, "config"):
110
+ self.vocab_size = model.config.vocab_size
111
+ else:
112
+ self.vocab_size = model.embed_tokens.num_embeddings
113
+
114
+ # Setup loss function
115
+ self.loss_fn = LanguageModelingLoss(
116
+ vocab_size=self.vocab_size,
117
+ label_smoothing=config.label_smoothing,
118
+ )
119
+
120
+ # Calculate total steps
121
+ self.steps_per_epoch = len(train_dataloader)
122
+ self.total_steps = self.steps_per_epoch * config.num_epochs
123
+ self.total_steps = self.total_steps // config.gradient_accumulation_steps
124
+
125
+ # Setup optimizer and scheduler
126
+ self.optimizer = create_optimizer(
127
+ model,
128
+ learning_rate=config.learning_rate,
129
+ weight_decay=config.weight_decay,
130
+ )
131
+
132
+ self.scheduler = create_scheduler(
133
+ self.optimizer,
134
+ num_training_steps=self.total_steps,
135
+ warmup_ratio=config.warmup_ratio,
136
+ min_lr_ratio=config.min_lr_ratio,
137
+ )
138
+
139
+ # Setup mixed precision
140
+ self.use_amp = config.fp16 and self.device.type == "cuda"
141
+ self.scaler = GradScaler() if self.use_amp else None
142
+
143
+ # Tracking
144
+ self.global_step = 0
145
+ self.epoch = 0
146
+ self.best_val_loss = float("inf")
147
+
148
+ # Early stopping tracking
149
+ self.early_stopping_counter = 0
150
+ self.should_stop = False
151
+
152
+ # Checkpoint directory
153
+ os.makedirs(config.checkpoint_dir, exist_ok=True)
154
+
155
+ # W&B logging
156
+ self.wandb = None
157
+ if wandb_project:
158
+ try:
159
+ import wandb
160
+ wandb.init(project=wandb_project, config=config.to_dict())
161
+ self.wandb = wandb
162
+ except ImportError:
163
+ print("wandb not installed, skipping logging")
164
+
165
+ def train(self) -> Dict[str, Any]:
166
+ """Run the full training loop.
167
+
168
+ Returns:
169
+ Dictionary with training results
170
+ """
171
+ print(f"\n{'='*60}")
172
+ print("STARTING TRAINING")
173
+ print(f"{'='*60}")
174
+ print(f"Total epochs: {self.config.num_epochs}")
175
+ print(f"Steps per epoch: {self.steps_per_epoch}")
176
+ print(f"Total optimization steps: {self.total_steps}")
177
+ print(f"Gradient accumulation: {self.config.gradient_accumulation_steps}")
178
+ print(f"Mixed precision: {self.use_amp}")
179
+ if self.config.early_stopping_patience > 0:
180
+ print(f"Early stopping: patience={self.config.early_stopping_patience}")
181
+ print(f"{'='*60}\n")
182
+
183
+ training_start = time.time()
184
+
185
+ # FIX: Start from loaded epoch (for resume), not always from 0
186
+ start_epoch = self.epoch
187
+ if start_epoch > 0:
188
+ print(f"Resuming from epoch {start_epoch + 1}")
189
+
190
+ for epoch in range(start_epoch, self.config.num_epochs):
191
+ self.epoch = epoch
192
+ epoch_loss = self._train_epoch()
193
+
194
+ print(f"\nEpoch {epoch + 1}/{self.config.num_epochs} - Loss: {epoch_loss:.4f}")
195
+
196
+ # Validation
197
+ if self.val_dataloader is not None:
198
+ val_metrics = self.evaluate()
199
+ print(f"Validation - Loss: {val_metrics['loss']:.4f}, PPL: {val_metrics['perplexity']:.2f}")
200
+
201
+ # Early stopping check
202
+ if val_metrics["loss"] < self.best_val_loss - self.config.early_stopping_threshold:
203
+ self.best_val_loss = val_metrics["loss"]
204
+ self.early_stopping_counter = 0
205
+ self.save_checkpoint("best")
206
+ print(f" New best model saved!")
207
+ else:
208
+ self.early_stopping_counter += 1
209
+ print(f" No improvement. Early stopping: {self.early_stopping_counter}/{self.config.early_stopping_patience}")
210
+
211
+ if self.config.early_stopping_patience > 0 and self.early_stopping_counter >= self.config.early_stopping_patience:
212
+ print(f"\nEarly stopping triggered after {self.early_stopping_counter} evaluations without improvement.")
213
+ self.should_stop = True
214
+
215
+ # Save epoch checkpoint
216
+ self.save_checkpoint(f"epoch_{epoch + 1}")
217
+
218
+ # Check early stopping
219
+ if self.should_stop:
220
+ print("Stopping training early.")
221
+ break
222
+
223
+ training_time = time.time() - training_start
224
+ print(f"\n{'='*60}")
225
+ print(f"TRAINING COMPLETE")
226
+ print(f"Total time: {training_time / 3600:.2f} hours")
227
+ print(f"Best validation loss: {self.best_val_loss:.4f}")
228
+ if self.should_stop:
229
+ print(f"Stopped early at epoch {self.epoch + 1}")
230
+ print(f"{'='*60}")
231
+
232
+ return {
233
+ "total_steps": self.global_step,
234
+ "training_time": training_time,
235
+ "best_val_loss": self.best_val_loss,
236
+ }
237
+
238
+ def _train_epoch(self) -> float:
239
+ """Train for one epoch.
240
+
241
+ Returns:
242
+ Average training loss for the epoch
243
+ """
244
+ self.model.train()
245
+ total_loss = 0.0
246
+ num_batches = 0
247
+ accumulated_loss = 0.0
248
+ num_accumulated_batches = 0 # FIX: Track actual number of batches for correct averaging
249
+
250
+ # Create progress bar
251
+ pbar = tqdm(
252
+ enumerate(self.train_dataloader),
253
+ total=len(self.train_dataloader),
254
+ desc=f"Epoch {self.epoch + 1}",
255
+ ncols=100,
256
+ )
257
+
258
+ for step, batch in pbar:
259
+ # Move batch to device
260
+ input_ids = batch["input_ids"].to(self.device)
261
+ labels = batch["labels"].to(self.device)
262
+ # Note: attention_mask from dataloader is padding mask (1/0)
263
+ # The model creates its own causal mask internally
264
+ # We handle padding via -100 labels in the loss function
265
+
266
+ # Forward pass with optional mixed precision
267
+ with autocast(enabled=self.use_amp):
268
+ outputs = self.model(input_ids)
269
+ # Handle different output types (tensor, tuple, or dataclass)
270
+ if isinstance(outputs, torch.Tensor):
271
+ logits = outputs
272
+ elif hasattr(outputs, 'logits'):
273
+ logits = outputs.logits
274
+ else:
275
+ logits = outputs[0]
276
+ loss = self.loss_fn(logits, labels)
277
+ loss = loss / self.config.gradient_accumulation_steps
278
+
279
+ # Backward pass
280
+ if self.use_amp:
281
+ self.scaler.scale(loss).backward()
282
+ else:
283
+ loss.backward()
284
+
285
+ # FIX: Track unscaled loss correctly
286
+ unscaled_loss = loss.item() * self.config.gradient_accumulation_steps
287
+ accumulated_loss += unscaled_loss
288
+ num_accumulated_batches += 1
289
+ total_loss += unscaled_loss
290
+ num_batches += 1
291
+
292
+ # Gradient accumulation
293
+ if (step + 1) % self.config.gradient_accumulation_steps == 0:
294
+ # Gradient clipping
295
+ if self.use_amp:
296
+ self.scaler.unscale_(self.optimizer)
297
+
298
+ grad_norm = clip_grad_norm(self.model, self.config.max_grad_norm)
299
+
300
+ # Optimizer step
301
+ if self.use_amp:
302
+ self.scaler.step(self.optimizer)
303
+ self.scaler.update()
304
+ else:
305
+ self.optimizer.step()
306
+
307
+ self.scheduler.step()
308
+ self.optimizer.zero_grad()
309
+
310
+ self.global_step += 1
311
+
312
+ # Logging
313
+ if self.global_step % self.config.logging_steps == 0:
314
+ # FIX: Divide by actual number of accumulated batches, not gradient_accumulation_steps
315
+ avg_loss = accumulated_loss / max(num_accumulated_batches, 1)
316
+ lr = self.scheduler.get_last_lr()[0]
317
+
318
+ # Update progress bar
319
+ pbar.set_postfix({
320
+ 'loss': f'{avg_loss:.4f}',
321
+ 'lr': f'{lr:.2e}',
322
+ 'step': f'{self.global_step}/{self.total_steps}'
323
+ })
324
+
325
+ tqdm.write(
326
+ f"Step {self.global_step}/{self.total_steps} | "
327
+ f"Loss: {avg_loss:.4f} | "
328
+ f"LR: {lr:.2e} | "
329
+ f"Grad: {grad_norm:.2f}"
330
+ )
331
+
332
+ if self.wandb:
333
+ self.wandb.log({
334
+ "train/loss": avg_loss,
335
+ "train/learning_rate": lr,
336
+ "train/grad_norm": grad_norm,
337
+ "train/epoch": self.epoch,
338
+ }, step=self.global_step)
339
+
340
+ # Reset accumulators
341
+ accumulated_loss = 0.0
342
+ num_accumulated_batches = 0
343
+
344
+ # Evaluation
345
+ if self.config.eval_steps > 0 and self.global_step % self.config.eval_steps == 0:
346
+ if self.val_dataloader is not None:
347
+ val_metrics = self.evaluate()
348
+ print(f" Eval - Loss: {val_metrics['loss']:.4f}, PPL: {val_metrics['perplexity']:.2f}")
349
+
350
+ if self.wandb:
351
+ self.wandb.log({
352
+ "eval/loss": val_metrics["loss"],
353
+ "eval/perplexity": val_metrics["perplexity"],
354
+ }, step=self.global_step)
355
+
356
+ # Early stopping check during training
357
+ if val_metrics["loss"] < self.best_val_loss - self.config.early_stopping_threshold:
358
+ self.best_val_loss = val_metrics["loss"]
359
+ self.early_stopping_counter = 0
360
+ self.save_checkpoint("best")
361
+ print(f" New best model! Loss: {self.best_val_loss:.4f}")
362
+ else:
363
+ self.early_stopping_counter += 1
364
+ if self.config.early_stopping_patience > 0:
365
+ print(f" No improvement ({self.early_stopping_counter}/{self.config.early_stopping_patience})")
366
+ if self.early_stopping_counter >= self.config.early_stopping_patience:
367
+ print(f"\n Early stopping triggered!")
368
+ self.should_stop = True
369
+ break # Exit the training loop
370
+
371
+ # Checkpointing
372
+ if self.config.save_steps > 0 and self.global_step % self.config.save_steps == 0:
373
+ self.save_checkpoint(f"step_{self.global_step}")
374
+
375
+ # Check if early stopping was triggered
376
+ if self.should_stop:
377
+ break
378
+
379
+ return total_loss / max(num_batches, 1)
380
+
381
+ @torch.no_grad()
382
+ def evaluate(self) -> Dict[str, float]:
383
+ """Evaluate the model on validation data.
384
+
385
+ Returns:
386
+ Dictionary with evaluation metrics
387
+ """
388
+ self.model.eval()
389
+ total_loss = 0.0
390
+ total_accuracy = 0.0
391
+ num_batches = 0
392
+
393
+ for batch in self.val_dataloader:
394
+ input_ids = batch["input_ids"].to(self.device)
395
+ labels = batch["labels"].to(self.device)
396
+
397
+ with autocast(enabled=self.use_amp):
398
+ outputs = self.model(input_ids)
399
+ # Handle different output types (tensor, tuple, or dataclass)
400
+ if isinstance(outputs, torch.Tensor):
401
+ logits = outputs
402
+ elif hasattr(outputs, 'logits'):
403
+ logits = outputs.logits
404
+ else:
405
+ logits = outputs[0]
406
+ loss = self.loss_fn(logits, labels)
407
+
408
+ total_loss += loss.item()
409
+ total_accuracy += compute_accuracy(logits, labels).item()
410
+ num_batches += 1
411
+
412
+ self.model.train()
413
+
414
+ avg_loss = total_loss / max(num_batches, 1)
415
+ avg_accuracy = total_accuracy / max(num_batches, 1)
416
+
417
+ return {
418
+ "loss": avg_loss,
419
+ "perplexity": compute_perplexity(torch.tensor(avg_loss)).item(),
420
+ "accuracy": avg_accuracy,
421
+ }
422
+
423
+ def save_checkpoint(self, name: str):
424
+ """Save a checkpoint.
425
+
426
+ Args:
427
+ name: Checkpoint name (e.g., "best", "epoch_1", "step_1000")
428
+ """
429
+ checkpoint_path = os.path.join(self.config.checkpoint_dir, name)
430
+ os.makedirs(checkpoint_path, exist_ok=True)
431
+
432
+ # Save model
433
+ model_path = os.path.join(checkpoint_path, "model.pt")
434
+ torch.save(self.model.state_dict(), model_path)
435
+
436
+ # Save optimizer and scheduler
437
+ optimizer_path = os.path.join(checkpoint_path, "optimizer.pt")
438
+ torch.save({
439
+ "optimizer": self.optimizer.state_dict(),
440
+ "scheduler": self.scheduler.state_dict(),
441
+ "global_step": self.global_step,
442
+ "epoch": self.epoch,
443
+ "best_val_loss": self.best_val_loss,
444
+ "early_stopping_counter": self.early_stopping_counter,
445
+ }, optimizer_path)
446
+
447
+ # Save config
448
+ config_path = os.path.join(checkpoint_path, "config.json")
449
+ with open(config_path, "w") as f:
450
+ json.dump(self.config.to_dict(), f, indent=2)
451
+
452
+ print(f"Saved checkpoint: {checkpoint_path}")
453
+
454
+ # Cleanup old checkpoints
455
+ self._cleanup_checkpoints()
456
+
457
+ def load_checkpoint(self, checkpoint_path: str):
458
+ """Load a checkpoint.
459
+
460
+ Args:
461
+ checkpoint_path: Path to checkpoint directory
462
+ """
463
+ # Load model
464
+ model_path = os.path.join(checkpoint_path, "model.pt")
465
+ state_dict = torch.load(model_path, map_location=self.device)
466
+
467
+ # FIX: Handle torch.compile prefix (_orig_mod.) if present
468
+ if any(k.startswith("_orig_mod.") for k in state_dict.keys()):
469
+ print(" Detected compiled model checkpoint, removing _orig_mod. prefix...")
470
+ state_dict = {k.replace("_orig_mod.", ""): v for k, v in state_dict.items()}
471
+
472
+ self.model.load_state_dict(state_dict)
473
+
474
+ # Load optimizer and scheduler
475
+ optimizer_path = os.path.join(checkpoint_path, "optimizer.pt")
476
+ if os.path.exists(optimizer_path):
477
+ state = torch.load(optimizer_path, map_location=self.device)
478
+ self.optimizer.load_state_dict(state["optimizer"])
479
+ self.scheduler.load_state_dict(state["scheduler"])
480
+ self.global_step = state["global_step"]
481
+ self.epoch = state["epoch"]
482
+ self.best_val_loss = state.get("best_val_loss", float("inf"))
483
+ self.early_stopping_counter = state.get("early_stopping_counter", 0)
484
+
485
+ # FIX: Increment epoch to start from next epoch (we saved after completing this epoch)
486
+ # Only if checkpoint was saved at end of epoch (epoch_* checkpoints)
487
+ if "epoch_" in checkpoint_path:
488
+ self.epoch += 1
489
+ print(f" Checkpoint was end-of-epoch, will start from epoch {self.epoch + 1}")
490
+
491
+ print(f"Loaded checkpoint: {checkpoint_path}")
492
+ print(f" Resuming from step {self.global_step}, epoch {self.epoch}")
493
+ print(f" Best val loss so far: {self.best_val_loss:.4f}")
494
+
495
+ def _cleanup_checkpoints(self):
496
+ """Remove old checkpoints to save disk space."""
497
+ if self.config.save_total_limit <= 0:
498
+ return
499
+
500
+ checkpoint_dir = Path(self.config.checkpoint_dir)
501
+ checkpoints = sorted(
502
+ [d for d in checkpoint_dir.iterdir() if d.is_dir() and d.name.startswith("step_")],
503
+ key=lambda x: int(x.name.split("_")[1]),
504
+ )
505
+
506
+ # Keep only the most recent checkpoints (plus "best" and "epoch_*")
507
+ while len(checkpoints) > self.config.save_total_limit:
508
+ old_checkpoint = checkpoints.pop(0)
509
+ print(f"Removing old checkpoint: {old_checkpoint}")
510
+ import shutil
511
+ shutil.rmtree(old_checkpoint)
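For illustration (not part of the commit): a minimal end-to-end sketch that wires the model and this trainer together on toy random batches. Treating `SLMConfig()` defaults as the released architecture is an assumption here; real training uses the DataModule in `src/data` and the repo's actual configuration.

```python
import torch
from torch.utils.data import DataLoader

from src.model.config import SLMConfig            # assumed import paths within this repo
from src.model.transformer import SLMForCausalLM
from src.training.trainer import Trainer, TrainingConfig

config = SLMConfig()                 # assumption: defaults describe the released architecture
model = SLMForCausalLM(config)


def toy_loader(num_samples: int = 32, seq_len: int = 64) -> DataLoader:
    """Random next-token-prediction batches, just to exercise the loop."""
    ids = torch.randint(0, config.vocab_size, (num_samples, seq_len))
    samples = [{"input_ids": x, "labels": x.clone()} for x in ids]
    return DataLoader(samples, batch_size=4)


training_config = TrainingConfig(
    num_epochs=1,
    gradient_accumulation_steps=1,
    fp16=False,                      # keep the demo on CPU-friendly settings
    checkpoint_dir="checkpoints_demo",
    eval_steps=0,
    save_steps=0,
    logging_steps=1,
)

trainer = Trainer(
    model=model,
    config=training_config,
    train_dataloader=toy_loader(),
    val_dataloader=toy_loader(8),
)
results = trainer.train()
print(results)   # {"total_steps": ..., "training_time": ..., "best_val_loss": ...}
```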