Upload FrawdLLMForCausalLM
- README.md +199 -0
- attention.py +162 -0
- block.py +118 -0
- config.json +24 -0
- config.py +209 -0
- embeddings.py +124 -0
- generation_config.json +7 -0
- gpt.py +223 -0
- hf_wrapper.py +258 -0
- mlp.py +105 -0
- model.safetensors +3 -0
- rope.py +153 -0
README.md
ADDED
@@ -0,0 +1,199 @@
---
library_name: transformers
tags: []
---

# Model Card for Model ID

<!-- Provide a quick summary of what the model is/does. -->

## Model Details

### Model Description

<!-- Provide a longer summary of what this model is. -->

This is the model card of a 🤗 transformers model that has been pushed on the Hub. This model card has been automatically generated.

- **Developed by:** [More Information Needed]
- **Funded by [optional]:** [More Information Needed]
- **Shared by [optional]:** [More Information Needed]
- **Model type:** [More Information Needed]
- **Language(s) (NLP):** [More Information Needed]
- **License:** [More Information Needed]
- **Finetuned from model [optional]:** [More Information Needed]

### Model Sources [optional]

<!-- Provide the basic links for the model. -->

- **Repository:** [More Information Needed]
- **Paper [optional]:** [More Information Needed]
- **Demo [optional]:** [More Information Needed]

## Uses

<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->

### Direct Use

<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->

[More Information Needed]

### Downstream Use [optional]

<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->

[More Information Needed]

### Out-of-Scope Use

<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->

[More Information Needed]

## Bias, Risks, and Limitations

<!-- This section is meant to convey both technical and sociotechnical limitations. -->

[More Information Needed]

### Recommendations

<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->

Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.

## How to Get Started with the Model

Use the code below to get started with the model.

[More Information Needed]
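
As a starting point, the sketch below follows the loading snippet in this repo's `hf_wrapper.py` docstring; the repo id and the special-token ids come from there and from `config.json`, while the rest is a minimal assumption (no tokenizer ships in this commit, so the prompt is built from a raw BOS id):

```python
import torch
from transformers import AutoModelForCausalLM

# The architecture is custom code: auto_map in config.json points at
# hf_wrapper.py, so trust_remote_code=True is required.
model = AutoModelForCausalLM.from_pretrained(
    "tsingla1998/frawdllm-100m", trust_remote_code=True
)
model.eval()

# Prompt with the raw BOS id (2, per config.json) and print token ids back.
prompt = torch.tensor([[2]])
out = model.generate(prompt, max_new_tokens=20, do_sample=True, top_k=50)
print(out[0].tolist())
```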
## Training Details

### Training Data

<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->

[More Information Needed]

### Training Procedure

<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->

#### Preprocessing [optional]

[More Information Needed]

#### Training Hyperparameters

- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->

#### Speeds, Sizes, Times [optional]

<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->

[More Information Needed]

## Evaluation

<!-- This section describes the evaluation protocols and provides the results. -->

### Testing Data, Factors & Metrics

#### Testing Data

<!-- This should link to a Dataset Card if possible. -->

[More Information Needed]

#### Factors

<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->

[More Information Needed]

#### Metrics

<!-- These are the evaluation metrics being used, ideally with a description of why. -->

[More Information Needed]

### Results

[More Information Needed]

#### Summary

## Model Examination [optional]

<!-- Relevant interpretability work for the model goes here -->

[More Information Needed]

## Environmental Impact

<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->

Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).

- **Hardware Type:** [More Information Needed]
- **Hours used:** [More Information Needed]
- **Cloud Provider:** [More Information Needed]
- **Compute Region:** [More Information Needed]
- **Carbon Emitted:** [More Information Needed]

## Technical Specifications [optional]

### Model Architecture and Objective

[More Information Needed]

### Compute Infrastructure

[More Information Needed]

#### Hardware

[More Information Needed]

#### Software

[More Information Needed]

## Citation [optional]

<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->

**BibTeX:**

[More Information Needed]

**APA:**

[More Information Needed]

## Glossary [optional]

<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->

[More Information Needed]

## More Information [optional]

[More Information Needed]

## Model Card Authors [optional]

[More Information Needed]

## Model Card Contact

[More Information Needed]

attention.py
ADDED
@@ -0,0 +1,162 @@
"""
Multi-Head Self-Attention for FrawdLLM.

This is the core mechanism that lets tokens "look at" each other.
Each token creates:
- Query (Q): "What am I looking for?"
- Key (K): "What do I contain?"
- Value (V): "What information do I give?"

Attention score = how well Q matches K
Output = weighted sum of V based on attention scores
"""

import torch
import torch.nn as nn
import torch.nn.functional as F
import math

from .config import ModelConfig
from .rope import RotaryEmbedding


class CausalSelfAttention(nn.Module):
    """
    Multi-head causal (masked) self-attention.

    "Causal" means tokens can only attend to past tokens, not future ones.
    This is required for language models (we can't peek at what we're predicting!).
    """

    def __init__(self, config: ModelConfig):
        super().__init__()

        self.config = config
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        self.head_dim = config.n_embd // config.n_head  # e.g., 768/12 = 64
        self.use_rope = config.use_rope

        # Linear projections to create Q, K, V
        # Each transforms [batch, seq, n_embd] -> [batch, seq, n_embd]
        # We do all three in one big matrix for efficiency, then split
        self.qkv_proj = nn.Linear(config.n_embd, 3 * config.n_embd)

        # Output projection: combines all heads back together
        self.out_proj = nn.Linear(config.n_embd, config.n_embd)

        # Dropout for regularization
        self.attn_dropout = nn.Dropout(config.dropout)
        self.resid_dropout = nn.Dropout(config.dropout)

        # RoPE for position encoding (if enabled)
        if self.use_rope:
            self.rope = RotaryEmbedding(
                dim=self.head_dim,
                max_seq_len=config.context_length * 4,  # Allow extrapolation
            )

        # Causal mask: lower triangular matrix
        # This prevents attending to future tokens
        # We register it as a buffer (saved with model, but not a parameter)
        max_len = config.context_length * 4 if self.use_rope else config.context_length
        mask = torch.tril(torch.ones(max_len, max_len))
        self.register_buffer("mask", mask.view(1, 1, max_len, max_len))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Apply multi-head causal self-attention.

        Args:
            x: [batch_size, seq_len, n_embd] - input embeddings

        Returns:
            [batch_size, seq_len, n_embd] - attended embeddings
        """
        batch_size, seq_len, n_embd = x.shape

        # Step 1: Project to Q, K, V (all at once for efficiency)
        # [batch, seq, n_embd] -> [batch, seq, 3 * n_embd]
        qkv = self.qkv_proj(x)

        # Step 2: Split into Q, K, V
        # [batch, seq, 3 * n_embd] -> 3 x [batch, seq, n_embd]
        q, k, v = qkv.chunk(3, dim=-1)

        # Step 3: Reshape for multi-head attention
        # [batch, seq, n_embd] -> [batch, n_head, seq, head_dim]
        # Example: [32, 512, 768] -> [32, 12, 512, 64]
        q = q.view(batch_size, seq_len, self.n_head, self.head_dim).transpose(1, 2)
        k = k.view(batch_size, seq_len, self.n_head, self.head_dim).transpose(1, 2)
        v = v.view(batch_size, seq_len, self.n_head, self.head_dim).transpose(1, 2)

        # Step 3.5: Apply RoPE (if enabled)
        # This rotates Q and K based on position - encodes position info
        if self.use_rope:
            q = self.rope(q)
            k = self.rope(k)
            # Note: V is not rotated - only Q and K need position info

        # Step 4: Compute attention scores
        # Q @ K^T: [batch, n_head, seq, head_dim] @ [batch, n_head, head_dim, seq]
        # = [batch, n_head, seq, seq]
        # Each (i,j) entry = "how much should position i attend to position j?"
        attn_scores = (q @ k.transpose(-2, -1)) / math.sqrt(self.head_dim)

        # Step 5: Apply causal mask (prevent attending to future)
        # Mask is 1 for allowed positions, 0 for disallowed
        # We set disallowed positions to -inf so softmax gives 0
        attn_scores = attn_scores.masked_fill(
            self.mask[:, :, :seq_len, :seq_len] == 0,
            float('-inf')
        )

        # Step 6: Softmax to get attention weights (probabilities)
        # [batch, n_head, seq, seq] - each row sums to 1
        attn_weights = F.softmax(attn_scores, dim=-1)
        attn_weights = self.attn_dropout(attn_weights)

        # Step 7: Apply attention to values
        # [batch, n_head, seq, seq] @ [batch, n_head, seq, head_dim]
        # = [batch, n_head, seq, head_dim]
        out = attn_weights @ v

        # Step 8: Reshape back: combine all heads
        # [batch, n_head, seq, head_dim] -> [batch, seq, n_head, head_dim] -> [batch, seq, n_embd]
        out = out.transpose(1, 2).contiguous().view(batch_size, seq_len, n_embd)

        # Step 9: Final output projection
        out = self.out_proj(out)
        out = self.resid_dropout(out)

        return out


if __name__ == "__main__":
    # Test the attention module
    from .config import get_config

    print("Testing CausalSelfAttention...")
    print("=" * 50)

    config = get_config("tiny")
    print(f"Config: n_embd={config.n_embd}, n_head={config.n_head}, "
          f"head_dim={config.head_dim}")

    attn = CausalSelfAttention(config)

    # Count parameters
    num_params = sum(p.numel() for p in attn.parameters())
    print(f"Attention parameters: {num_params:,}")

    # Test input: [batch=2, seq=8, n_embd=256]
    x = torch.randn(2, 8, config.n_embd)
    print(f"\nInput shape: {x.shape}")

    # Forward pass
    out = attn(x)
    print(f"Output shape: {out.shape}")

    # Verify shapes match
    assert x.shape == out.shape, "Input and output shapes should match!"
    print("\nAttention working!")

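As a sanity check (not part of this commit), the mask-and-softmax path above should agree with PyTorch 2.x's fused F.scaled_dot_product_attention when dropout is off; a minimal sketch with random Q/K/V shaped [batch, n_head, seq, head_dim]:

import math
import torch
import torch.nn.functional as F

q, k, v = (torch.randn(2, 8, 16, 64) for _ in range(3))

# Manual path, mirroring CausalSelfAttention.forward
scores = (q @ k.transpose(-2, -1)) / math.sqrt(q.size(-1))
mask = torch.tril(torch.ones(16, 16))
scores = scores.masked_fill(mask == 0, float("-inf"))
manual = F.softmax(scores, dim=-1) @ v

# Fused kernel with the same causal semantics
fused = F.scaled_dot_product_attention(q, k, v, is_causal=True)
assert torch.allclose(manual, fused, atol=1e-4)
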
block.py
ADDED
@@ -0,0 +1,118 @@
"""
Transformer Block for FrawdLLM.

A transformer block combines:
1. Multi-head self-attention (tokens gather info from each other)
2. MLP (each token processes info independently)

With two important additions:
- LayerNorm: Keeps values stable during training
- Residual connections: Add input to output ("don't lose what you had")

Structure (Pre-LN, which is more stable):

    Input
      ↓
    ┌─────────────┐
    │  LayerNorm  │
    └─────────────┘
      ↓
    ┌─────────────┐
    │  Attention  │───────┐
    └─────────────┘       │ (residual)
      ↓                   │
      + ←─────────────────┘
      ↓
    ┌─────────────┐
    │  LayerNorm  │
    └─────────────┘
      ↓
    ┌─────────────┐
    │     MLP     │───────┐
    └─────────────┘       │ (residual)
      ↓                   │
      + ←─────────────────┘
      ↓
    Output
"""

import torch
import torch.nn as nn

from .config import ModelConfig
from .attention import CausalSelfAttention
from .mlp import MLP


class TransformerBlock(nn.Module):
    """
    One transformer block = Attention + MLP with norms and residuals.

    Input: [batch_size, seq_len, n_embd]
    Output: [batch_size, seq_len, n_embd]
    """

    def __init__(self, config: ModelConfig):
        super().__init__()

        self.config = config

        # Layer norms (one before attention, one before MLP)
        self.ln1 = nn.LayerNorm(config.n_embd)
        self.ln2 = nn.LayerNorm(config.n_embd)

        # Attention and MLP
        self.attn = CausalSelfAttention(config)
        self.mlp = MLP(config)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Apply transformer block.

        Args:
            x: [batch_size, seq_len, n_embd]

        Returns:
            [batch_size, seq_len, n_embd]
        """
        # Attention with residual connection
        # x + attention(norm(x))
        # "Keep x, add attention's contribution"
        x = x + self.attn(self.ln1(x))

        # MLP with residual connection
        # x + mlp(norm(x))
        # "Keep x, add MLP's contribution"
        x = x + self.mlp(self.ln2(x))

        return x


if __name__ == "__main__":
    # Test the transformer block
    from .config import get_config

    print("Testing TransformerBlock...")
    print("=" * 50)

    config = get_config("tiny")
    print(f"Config: n_embd={config.n_embd}, n_head={config.n_head}, "
          f"n_layer={config.n_layer}")

    block = TransformerBlock(config)

    # Count parameters
    num_params = sum(p.numel() for p in block.parameters())
    print(f"Block parameters: {num_params:,}")

    # Test input: [batch=2, seq=8, n_embd=256]
    x = torch.randn(2, 8, config.n_embd)
    print(f"\nInput shape: {x.shape}")

    # Forward pass
    out = block(x)
    print(f"Output shape: {out.shape}")

    # Verify shapes match
    assert x.shape == out.shape, "Input and output shapes should match!"
    print("\nTransformerBlock working!")

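For contrast with the Pre-LN wiring above, here is a sketch of the two orderings (not part of this commit; attn, mlp, ln1, ln2 stand in for the block's submodules):

# Pre-LN (what TransformerBlock.forward does): normalize the input to
# each sublayer; the residual stream itself is never normalized away.
x = x + attn(ln1(x))
x = x + mlp(ln2(x))

# Post-LN (original Transformer): normalize after the residual add.
# This tends to train less stably in deep stacks, which is why
# Pre-LN is used here.
x = ln1(x + attn(x))
x = ln2(x + mlp(x))
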
config.json
ADDED
@@ -0,0 +1,24 @@
{
  "architectures": [
    "FrawdLLMForCausalLM"
  ],
  "auto_map": {
    "AutoConfig": "hf_wrapper.FrawdLLMConfig",
    "AutoModelForCausalLM": "hf_wrapper.FrawdLLMForCausalLM"
  },
  "bos_token_id": 2,
  "context_length": 1024,
  "dropout": 0.1,
  "dtype": "float32",
  "eos_token_id": 3,
  "model_type": "frawdllm",
  "n_embd": 768,
  "n_head": 12,
  "n_layer": 12,
  "pad_token_id": 0,
  "transformers_version": "4.57.3",
  "use_rmsnorm": false,
  "use_rope": true,
  "use_swiglu": false,
  "vocab_size": 32000
}

config.py
ADDED
@@ -0,0 +1,209 @@
"""
Model configuration for FrawdLLM.

This module defines the hyperparameters that control model architecture.
We'll define multiple sizes to experiment with.

Learning Notes:
--------------
Key hyperparameters and their effects:

1. vocab_size: Size of tokenizer vocabulary
   - Must match your trained tokenizer
   - Larger = more memory for embedding table

2. n_embd (embedding dimension): Size of hidden representations
   - Larger = more expressive, but slower and more memory
   - GPT-2 small: 768, GPT-2 large: 1280, GPT-3: 12288

3. n_layer: Number of transformer blocks
   - More layers = deeper reasoning, but harder to train
   - GPT-2 small: 12, GPT-2 large: 36

4. n_head: Number of attention heads
   - Usually n_embd / n_head = 64 (head dimension)
   - More heads = more parallel attention patterns

5. context_length: Maximum sequence length
   - Longer = can process more text, but O(n²) memory for attention
   - GPT-2: 1024, GPT-3: 2048, modern models: 4096-128K

6. dropout: Regularization to prevent overfitting
   - 0.0 for small datasets (we need all the learning we can get)
   - 0.1-0.2 for larger datasets
"""

from dataclasses import dataclass


@dataclass
class ModelConfig:
    """Configuration for FrawdLLM model."""

    # Vocabulary (must match tokenizer)
    vocab_size: int = 8192

    # Model dimensions
    n_embd: int = 768    # Embedding dimension
    n_layer: int = 12    # Number of transformer blocks
    n_head: int = 12     # Number of attention heads

    # Sequence length
    context_length: int = 512  # Maximum sequence length

    # Regularization
    dropout: float = 0.0  # Dropout probability (0 for small data)

    # Architecture choices (we'll implement both!)
    use_rope: bool = False     # Use Rotary Position Embeddings (Llama-style)
    use_rmsnorm: bool = False  # Use RMSNorm instead of LayerNorm (Llama-style)
    use_swiglu: bool = False   # Use SwiGLU activation (Llama-style)

    # Special token IDs (must match tokenizer)
    pad_token_id: int = 0
    bos_token_id: int = 2
    eos_token_id: int = 3

    def __post_init__(self):
        """Validate configuration."""
        assert self.n_embd % self.n_head == 0, \
            f"n_embd ({self.n_embd}) must be divisible by n_head ({self.n_head})"

        self.head_dim = self.n_embd // self.n_head

    @property
    def num_parameters(self) -> int:
        """Estimate total number of parameters."""
        # Token embeddings: vocab_size * n_embd
        token_emb = self.vocab_size * self.n_embd

        # Position embeddings (if not using RoPE): context_length * n_embd
        pos_emb = 0 if self.use_rope else self.context_length * self.n_embd

        # Per transformer block:
        # - Attention: 4 * n_embd^2 (Q, K, V, O projections)
        # - MLP: 8 * n_embd^2 (up, down) or 12 * n_embd^2 (SwiGLU has gate)
        # - LayerNorms: 2 * n_embd (or 4 * n_embd with biases)
        mlp_factor = 12 if self.use_swiglu else 8
        per_block = 4 * self.n_embd**2 + mlp_factor * self.n_embd**2 + 4 * self.n_embd
        total_blocks = self.n_layer * per_block

        # Output projection (tied with token embeddings usually, so not counted)
        # Final layer norm: n_embd
        final_ln = self.n_embd

        return token_emb + pos_emb + total_blocks + final_ln


# Predefined configurations for different sizes
# These are designed to be trainable on different hardware

# ~10M parameters - For quick debugging on CPU/M3
# Can train in minutes on a laptop
FRAWDLLM_TINY = ModelConfig(
    vocab_size=8192,
    n_embd=256,
    n_layer=6,
    n_head=8,
    context_length=256,
    dropout=0.0,
)

# ~50M parameters - Good for learning on M3/single GPU
# Can train in hours on M3, generates reasonable text
FRAWDLLM_SMALL = ModelConfig(
    vocab_size=8192,
    n_embd=512,
    n_layer=8,
    n_head=8,
    context_length=512,
    dropout=0.0,
)

# ~125M parameters - Similar to GPT-2 small
# Needs GPU (AWS), generates good quality text
FRAWDLLM_BASE = ModelConfig(
    vocab_size=8192,
    n_embd=768,
    n_layer=12,
    n_head=12,
    context_length=1024,
    dropout=0.1,
)


# Llama-style variants (modern architecture)
FRAWDLLM_TINY_LLAMA = ModelConfig(
    vocab_size=8192,
    n_embd=256,
    n_layer=6,
    n_head=8,
    context_length=256,
    dropout=0.0,
    use_rope=True,
    use_rmsnorm=True,
    use_swiglu=True,
)

FRAWDLLM_SMALL_LLAMA = ModelConfig(
    vocab_size=8192,
    n_embd=512,
    n_layer=8,
    n_head=8,
    context_length=512,
    dropout=0.0,
    use_rope=True,
    use_rmsnorm=True,
    use_swiglu=True,
)

# ~100M parameters - Similar to GPT-2 Small but with modern architecture
# Uses RoPE for position encoding, allowing longer context at inference
FRAWDLLM_100M = ModelConfig(
    vocab_size=32000,     # Larger vocab for diverse data
    n_embd=768,
    n_layer=12,
    n_head=12,
    context_length=1024,  # Train on 1024, can extrapolate to 2048+
    dropout=0.1,
    use_rope=True,        # Rotary position embeddings
    use_rmsnorm=False,    # Keep LayerNorm for now
    use_swiglu=False,     # Keep GELU for now
)


def get_config(name: str) -> ModelConfig:
    """Get a predefined configuration by name."""
    configs = {
        "tiny": FRAWDLLM_TINY,
        "small": FRAWDLLM_SMALL,
        "base": FRAWDLLM_BASE,
        "tiny-llama": FRAWDLLM_TINY_LLAMA,
        "small-llama": FRAWDLLM_SMALL_LLAMA,
        "100m": FRAWDLLM_100M,
    }

    if name not in configs:
        raise ValueError(f"Unknown config: {name}. Available: {list(configs.keys())}")

    return configs[name]


if __name__ == "__main__":
    # Print parameter counts for each config
    print("FrawdLLM Model Configurations")
    print("=" * 50)

    for name in ["tiny", "small", "base", "tiny-llama", "small-llama"]:
        config = get_config(name)
        params = config.num_parameters
        print(f"\n{name}:")
        print(f"  Parameters: {params:,} ({params/1e6:.1f}M)")
        print(f"  Embedding dim: {config.n_embd}")
        print(f"  Layers: {config.n_layer}")
        print(f"  Heads: {config.n_head}")
        print(f"  Context: {config.context_length}")
        if config.use_rope:
            print(f"  Style: Llama (RoPE, RMSNorm, SwiGLU)")
        else:
            print(f"  Style: GPT-2 (learned pos, LayerNorm, GELU)")

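A quick usage sketch (not part of this commit; the flat "from config import" path assumes the file is run from its own directory, since the repo modules use relative imports):

from config import ModelConfig, get_config  # import path is an assumption

cfg = get_config("100m")
print(cfg.head_dim)        # 768 // 12 = 64, derived in __post_init__
print(cfg.num_parameters)  # rough analytic estimate, not an exact count

# Invalid combinations fail fast: n_embd must divide evenly by n_head.
custom = ModelConfig(vocab_size=8192, n_embd=384, n_layer=4, n_head=6)
print(custom.head_dim)     # 384 // 6 = 64
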
embeddings.py
ADDED
@@ -0,0 +1,124 @@
"""
Token and Position Embeddings for FrawdLLM.

This is the first layer of the model - converts token IDs into vectors
that the transformer can process.

Two lookup tables:
1. Token embeddings: WHAT the token is (vocab_size x n_embd)
2. Position embeddings: WHERE the token is (context_length x n_embd)

Final output = token_emb + pos_emb (just addition!)
"""

import torch
import torch.nn as nn

from .config import ModelConfig


class Embeddings(nn.Module):
    """
    Combined token + position embeddings.

    Input: token_ids [batch_size, seq_len] - integers from tokenizer
    Output: vectors [batch_size, seq_len, n_embd] - dense representations
    """

    def __init__(self, config: ModelConfig):
        super().__init__()  # Initialize nn.Module tracking

        self.config = config
        self.use_rope = config.use_rope

        # Token embedding table: one vector per vocabulary word
        # Shape: [vocab_size, n_embd] = [8192, 768]
        self.token_emb = nn.Embedding(config.vocab_size, config.n_embd)

        # Position embedding table: one vector per position (only if NOT using RoPE)
        # Shape: [context_length, n_embd] = [512, 768]
        # With RoPE, position is encoded in attention via rotation instead
        if not self.use_rope:
            self.pos_emb = nn.Embedding(config.context_length, config.n_embd)
        else:
            self.pos_emb = None

        # Dropout for regularization (usually 0 for small datasets)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, token_ids: torch.Tensor) -> torch.Tensor:
        """
        Convert token IDs to embeddings.

        Args:
            token_ids: [batch_size, seq_len] tensor of token IDs

        Returns:
            [batch_size, seq_len, n_embd] tensor of embeddings
        """
        batch_size, seq_len = token_ids.shape

        # Safety check: don't exceed context window (relaxed for RoPE)
        max_len = self.config.context_length * 4 if self.use_rope else self.config.context_length
        if seq_len > max_len:
            raise ValueError(
                f"Sequence length {seq_len} exceeds maximum length {max_len}"
            )

        # Step 1: Look up token embeddings
        # [batch_size, seq_len] -> [batch_size, seq_len, n_embd]
        embeddings = self.token_emb(token_ids)

        # Step 2: Add position embeddings (only if NOT using RoPE)
        # With RoPE, position is encoded via rotation in attention instead
        if not self.use_rope:
            positions = torch.arange(seq_len, device=token_ids.device)
            pos_emb = self.pos_emb(positions)
            embeddings = embeddings + pos_emb

        # Step 3: Apply dropout (if any)
        embeddings = self.dropout(embeddings)

        return embeddings


if __name__ == "__main__":
    # Quick test to verify it works
    from .config import get_config

    print("Testing Embeddings...")
    print("=" * 50)

    # Use tiny config for testing
    config = get_config("tiny")
    print(f"Config: vocab={config.vocab_size}, n_embd={config.n_embd}, "
          f"context={config.context_length}")

    # Create embedding layer
    emb = Embeddings(config)

    # Count parameters
    num_params = sum(p.numel() for p in emb.parameters())
    print(f"Embedding parameters: {num_params:,}")

    # Test forward pass
    # Fake batch: 2 sequences of 4 tokens each
    token_ids = torch.tensor([
        [2, 531, 892, 12],  # Sequence 1
        [2, 100, 200, 3],   # Sequence 2
    ])

    print(f"\nInput shape: {token_ids.shape}")
    print(f"Input tokens: {token_ids.tolist()}")

    # Forward pass
    output = emb(token_ids)

    print(f"\nOutput shape: {output.shape}")
    print(f"Each token is now a {output.shape[-1]}-dimensional vector")

    # Show a snippet of the output
    print(f"\nFirst token's vector (first 10 dims):")
    print(f"  {output[0, 0, :10].tolist()}")

    print("\nEmbeddings working!")

generation_config.json
ADDED
@@ -0,0 +1,7 @@
{
  "_from_model_config": true,
  "bos_token_id": 2,
  "eos_token_id": 3,
  "pad_token_id": 0,
  "transformers_version": "4.57.3"
}

gpt.py
ADDED
@@ -0,0 +1,223 @@
"""
Full GPT Model for FrawdLLM.

This is the complete model that:
1. Takes token IDs as input
2. Converts to embeddings (token + position)
3. Passes through N transformer blocks
4. Predicts the next token

Architecture:
    Token IDs [batch, seq]
        ↓
    Embeddings [batch, seq, n_embd]
        ↓
    Transformer Block × N
        ↓
    Final LayerNorm
        ↓
    Output Head → [batch, seq, vocab_size]
        ↓
    Logits (unnormalized probabilities for each vocab word)
"""

import torch
import torch.nn as nn
import torch.nn.functional as F

from .config import ModelConfig
from .embeddings import Embeddings
from .block import TransformerBlock


class FrawdLLM(nn.Module):
    """
    The complete FrawdLLM model.

    Input: token_ids [batch_size, seq_len]
    Output: logits [batch_size, seq_len, vocab_size]
    """

    def __init__(self, config: ModelConfig):
        super().__init__()

        self.config = config

        # Token + position embeddings
        self.embeddings = Embeddings(config)

        # Stack of transformer blocks
        self.blocks = nn.ModuleList([
            TransformerBlock(config) for _ in range(config.n_layer)
        ])

        # Final layer norm (before output projection)
        self.ln_f = nn.LayerNorm(config.n_embd)

        # Output head: project from n_embd to vocab_size
        # This gives us a score for each possible next token
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        # Weight tying: share weights between token embeddings and output head
        # This is a common trick that:
        # 1. Reduces parameters
        # 2. Makes sense: similar tokens should have similar embeddings AND predictions
        self.lm_head.weight = self.embeddings.token_emb.weight

        # Initialize weights
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialize weights for better training."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(
        self,
        token_ids: torch.Tensor,
        targets: torch.Tensor | None = None,
    ) -> tuple[torch.Tensor, torch.Tensor | None]:
        """
        Forward pass through the model.

        Args:
            token_ids: [batch_size, seq_len] - input token IDs
            targets: [batch_size, seq_len] - target token IDs (for computing loss)

        Returns:
            logits: [batch_size, seq_len, vocab_size] - prediction scores
            loss: scalar tensor if targets provided, else None
        """
        # Step 1: Convert token IDs to embeddings
        # [batch, seq] → [batch, seq, n_embd]
        x = self.embeddings(token_ids)

        # Step 2: Pass through all transformer blocks
        for block in self.blocks:
            x = block(x)

        # Step 3: Final layer norm
        x = self.ln_f(x)

        # Step 4: Project to vocabulary size
        # [batch, seq, n_embd] → [batch, seq, vocab_size]
        logits = self.lm_head(x)

        # Step 5: Compute loss if targets provided
        loss = None
        if targets is not None:
            # Flatten for cross-entropy
            # logits: [batch * seq, vocab_size]
            # targets: [batch * seq]
            loss = F.cross_entropy(
                logits.view(-1, logits.size(-1)),
                targets.view(-1),
                ignore_index=self.config.pad_token_id,  # Don't compute loss on padding
            )

        return logits, loss

    @torch.no_grad()
    def generate(
        self,
        token_ids: torch.Tensor,
        max_new_tokens: int = 100,
        temperature: float = 1.0,
        top_k: int | None = None,
    ) -> torch.Tensor:
        """
        Generate new tokens autoregressively.

        Args:
            token_ids: [batch_size, seq_len] - starting tokens (prompt)
            max_new_tokens: How many new tokens to generate
            temperature: Higher = more random, lower = more deterministic
            top_k: If set, only sample from top k most likely tokens

        Returns:
            [batch_size, seq_len + max_new_tokens] - original + generated tokens
        """
        for _ in range(max_new_tokens):
            # Crop to context length if needed
            context = token_ids[:, -self.config.context_length:]

            # Get predictions
            logits, _ = self.forward(context)

            # Take logits for the last position only
            # [batch, vocab_size]
            logits = logits[:, -1, :]

            # Apply temperature
            logits = logits / temperature

            # Optionally apply top-k filtering
            if top_k is not None:
                # Keep only top k values, set rest to -inf
                top_values, _ = torch.topk(logits, top_k, dim=-1)
                min_top_value = top_values[:, -1].unsqueeze(-1)
                logits = torch.where(
                    logits < min_top_value,
                    torch.full_like(logits, float('-inf')),
                    logits,
                )

            # Convert to probabilities
            probs = F.softmax(logits, dim=-1)

            # Sample next token
            next_token = torch.multinomial(probs, num_samples=1)

            # Append to sequence
            token_ids = torch.cat([token_ids, next_token], dim=1)

            # Stop if we generated EOS token
            if (next_token == self.config.eos_token_id).all():
                break

        return token_ids

    def count_parameters(self) -> int:
        """Count total trainable parameters."""
        return sum(p.numel() for p in self.parameters() if p.requires_grad)


if __name__ == "__main__":
    from .config import get_config

    print("Testing FrawdLLM...")
    print("=" * 50)

    config = get_config("tiny")
    print(f"Config: vocab={config.vocab_size}, n_embd={config.n_embd}, "
          f"n_layer={config.n_layer}, n_head={config.n_head}")

    model = FrawdLLM(config)

    # Count parameters
    num_params = model.count_parameters()
    print(f"Total parameters: {num_params:,} ({num_params/1e6:.1f}M)")

    # Test forward pass
    batch_size, seq_len = 2, 16
    token_ids = torch.randint(0, config.vocab_size, (batch_size, seq_len))
    targets = torch.randint(0, config.vocab_size, (batch_size, seq_len))

    print(f"\nInput shape: {token_ids.shape}")

    logits, loss = model(token_ids, targets)

    print(f"Output logits shape: {logits.shape}")
    print(f"Loss: {loss.item():.4f}")

    # Test generation
    prompt = torch.tensor([[config.bos_token_id]])  # Start with BOS
    generated = model.generate(prompt, max_new_tokens=10)
    print(f"\nGenerated shape: {generated.shape}")
    print(f"Generated tokens: {generated[0].tolist()}")

    print("\nFrawdLLM working!")

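To make the temperature and top-k steps in generate concrete, here is a small worked sketch on a toy 5-token vocabulary (illustrative values only, not part of this commit):

import torch
import torch.nn.functional as F

logits = torch.tensor([[2.0, 1.0, 0.5, 0.1, -1.0]])

# Temperature: dividing logits by T < 1 sharpens the distribution,
# T > 1 flattens it toward uniform.
print(F.softmax(logits / 0.7, dim=-1))
print(F.softmax(logits / 1.5, dim=-1))

# Top-k (k=2): anything below the 2nd-largest logit becomes -inf,
# so softmax puts all probability mass on the top two tokens.
top_values, _ = torch.topk(logits, k=2, dim=-1)
logits = torch.where(
    logits < top_values[:, -1:],
    torch.full_like(logits, float("-inf")),
    logits,
)
probs = F.softmax(logits, dim=-1)
next_token = torch.multinomial(probs, num_samples=1)
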
hf_wrapper.py
ADDED
@@ -0,0 +1,258 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
"""
|
| 3 |
+
HuggingFace wrapper for FrawdLLM.
|
| 4 |
+
|
| 5 |
+
This allows the model to be loaded with:
|
| 6 |
+
from transformers import AutoModelForCausalLM
|
| 7 |
+
model = AutoModelForCausalLM.from_pretrained("tsingla1998/frawdllm-100m", trust_remote_code=True)
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
from typing import Optional, Tuple, Union
|
| 11 |
+
|
| 12 |
+
import torch
|
| 13 |
+
import torch.nn as nn
|
| 14 |
+
import torch.nn.functional as F
|
| 15 |
+
from transformers import PretrainedConfig, PreTrainedModel, GenerationMixin
|
| 16 |
+
from transformers.modeling_outputs import CausalLMOutputWithPast
|
| 17 |
+
|
| 18 |
+
from .config import ModelConfig
|
| 19 |
+
from .gpt import FrawdLLM
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class FrawdLLMConfig(PretrainedConfig):
|
| 23 |
+
"""HuggingFace-compatible configuration for FrawdLLM."""
|
| 24 |
+
|
| 25 |
+
model_type = "frawdllm"
|
| 26 |
+
|
| 27 |
+
def __init__(
|
| 28 |
+
self,
|
| 29 |
+
vocab_size: int = 32000,
|
| 30 |
+
n_embd: int = 768,
|
| 31 |
+
n_layer: int = 12,
|
| 32 |
+
n_head: int = 12,
|
| 33 |
+
context_length: int = 1024,
|
| 34 |
+
dropout: float = 0.1,
|
| 35 |
+
use_rope: bool = True,
|
| 36 |
+
use_rmsnorm: bool = False,
|
| 37 |
+
use_swiglu: bool = False,
|
| 38 |
+
pad_token_id: int = 0,
|
| 39 |
+
bos_token_id: int = 2,
|
| 40 |
+
eos_token_id: int = 3,
|
| 41 |
+
**kwargs,
|
| 42 |
+
):
|
| 43 |
+
self.vocab_size = vocab_size
|
| 44 |
+
self.n_embd = n_embd
|
| 45 |
+
self.n_layer = n_layer
|
| 46 |
+
self.n_head = n_head
|
| 47 |
+
self.context_length = context_length
|
| 48 |
+
self.dropout = dropout
|
| 49 |
+
self.use_rope = use_rope
|
| 50 |
+
self.use_rmsnorm = use_rmsnorm
|
| 51 |
+
self.use_swiglu = use_swiglu
|
| 52 |
+
|
| 53 |
+
super().__init__(
|
| 54 |
+
pad_token_id=pad_token_id,
|
| 55 |
+
bos_token_id=bos_token_id,
|
| 56 |
+
eos_token_id=eos_token_id,
|
| 57 |
+
**kwargs,
|
| 58 |
+
)
|
| 59 |
+
|
| 60 |
+
def to_model_config(self) -> ModelConfig:
|
| 61 |
+
"""Convert to internal ModelConfig for the model."""
|
| 62 |
+
return ModelConfig(
|
| 63 |
+
vocab_size=self.vocab_size,
|
| 64 |
+
n_embd=self.n_embd,
|
| 65 |
+
n_layer=self.n_layer,
|
| 66 |
+
n_head=self.n_head,
|
| 67 |
+
context_length=self.context_length,
|
| 68 |
+
dropout=self.dropout,
|
| 69 |
+
use_rope=self.use_rope,
|
| 70 |
+
use_rmsnorm=self.use_rmsnorm,
|
| 71 |
+
use_swiglu=self.use_swiglu,
|
| 72 |
+
pad_token_id=self.pad_token_id,
|
| 73 |
+
bos_token_id=self.bos_token_id,
|
| 74 |
+
eos_token_id=self.eos_token_id,
|
| 75 |
+
)
|
| 76 |
+
|
| 77 |
+
@classmethod
|
| 78 |
+
def from_model_config(cls, config: ModelConfig) -> "FrawdLLMConfig":
|
| 79 |
+
"""Create from internal ModelConfig."""
|
| 80 |
+
return cls(
|
| 81 |
+
vocab_size=config.vocab_size,
|
| 82 |
+
n_embd=config.n_embd,
|
| 83 |
+
n_layer=config.n_layer,
|
| 84 |
+
n_head=config.n_head,
|
| 85 |
+
context_length=config.context_length,
|
| 86 |
+
dropout=config.dropout,
|
| 87 |
+
use_rope=config.use_rope,
|
| 88 |
+
use_rmsnorm=config.use_rmsnorm,
|
| 89 |
+
use_swiglu=config.use_swiglu,
|
| 90 |
+
pad_token_id=config.pad_token_id,
|
| 91 |
+
bos_token_id=config.bos_token_id,
|
| 92 |
+
eos_token_id=config.eos_token_id,
|
| 93 |
+
)
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
class FrawdLLMForCausalLM(PreTrainedModel, GenerationMixin):
|
| 97 |
+
"""HuggingFace-compatible wrapper for FrawdLLM."""
|
| 98 |
+
|
| 99 |
+
config_class = FrawdLLMConfig
|
| 100 |
+
base_model_prefix = "model"
|
| 101 |
+
supports_gradient_checkpointing = False
|
| 102 |
+
_no_split_modules = ["TransformerBlock"]
|
| 103 |
+
_tied_weights_keys = ["model.lm_head.weight"]
|
| 104 |
+
|
| 105 |
+
def __init__(self, config: FrawdLLMConfig):
|
| 106 |
+
super().__init__(config)
|
| 107 |
+
|
| 108 |
+
# Convert HF config to internal config
|
| 109 |
+
model_config = config.to_model_config()
|
| 110 |
+
|
| 111 |
+
# Create the actual model
|
| 112 |
+
self.model = FrawdLLM(model_config)
|
| 113 |
+
|
| 114 |
+
# For generation
|
| 115 |
+
self.main_input_name = "input_ids"
|
| 116 |
+
|
| 117 |
+
def get_input_embeddings(self):
|
| 118 |
+
return self.model.embeddings.token_emb
|
| 119 |
+
|
| 120 |
+
def set_input_embeddings(self, value):
|
| 121 |
+
self.model.embeddings.token_emb = value
|
| 122 |
+
|
| 123 |
+
def get_output_embeddings(self):
|
| 124 |
+
return self.model.lm_head
|
| 125 |
+
|
| 126 |
+
def set_output_embeddings(self, new_embeddings):
|
| 127 |
+
self.model.lm_head = new_embeddings
|
| 128 |
+
|
| 129 |
+
def tie_weights(self):
|
| 130 |
+
"""Tie input and output embeddings."""
|
| 131 |
+
self.model.lm_head.weight = self.model.embeddings.token_emb.weight
|
| 132 |
+
|
| 133 |
+
def forward(
|
| 134 |
+
self,
|
| 135 |
+
input_ids: torch.LongTensor,
|
| 136 |
+
attention_mask: Optional[torch.Tensor] = None,
|
| 137 |
+
labels: Optional[torch.LongTensor] = None,
|
| 138 |
+
past_key_values: Optional[Tuple] = None,
|
| 139 |
+
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        """
        Forward pass compatible with HuggingFace API.

        Note: attention_mask, past_key_values, use_cache are accepted but
        not fully implemented (our model doesn't use KV caching yet).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Get logits from our model
        logits, _ = self.model(input_ids, None)

        # Compute loss if labels provided
        loss = None
        if labels is not None:
            # Shift for causal LM loss
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()

            loss = F.cross_entropy(
                shift_logits.view(-1, shift_logits.size(-1)),
                shift_labels.view(-1),
                ignore_index=-100,
            )

        if not return_dict:
            output = (logits,)
            return (loss,) + output if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=None,
            hidden_states=None,
            attentions=None,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids: torch.LongTensor,
        past_key_values: Optional[Tuple] = None,
        attention_mask: Optional[torch.Tensor] = None,
        **kwargs,
    ):
        """Prepare inputs for generation (called by HF generate())."""
        # Our model doesn't use KV cache yet, so just return input_ids
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
        }

    @classmethod
    def from_frawdllm_checkpoint(
        cls,
        checkpoint_path: str,
        device: str = "cpu",
    ) -> "FrawdLLMForCausalLM":
        """
        Load from a FrawdLLM .pt checkpoint.

        Args:
            checkpoint_path: Path to the .pt checkpoint file
            device: Device to load the model on

        Returns:
            FrawdLLMForCausalLM instance
        """
        # Load the checkpoint
        checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=False)

        # Get the internal config
        model_config = checkpoint["config"]

        # Create HF config
        hf_config = FrawdLLMConfig.from_model_config(model_config)

        # Create the wrapper model
        model = cls(hf_config)

        # Load the weights
        model.model.load_state_dict(checkpoint["model_state_dict"])

        return model

    def save_pretrained_simple(self, save_directory: str):
        """
        Save in HuggingFace format.

        This saves:
        - config.json
        - model.safetensors (or pytorch_model.bin)
        """
        import os
        from safetensors.torch import save_file

        os.makedirs(save_directory, exist_ok=True)

        # Save config
        self.config.save_pretrained(save_directory)

        # Save model weights
        # Note: We have weight tying (token_emb.weight == lm_head.weight)
        # Remove the duplicate to avoid safetensors error
        state_dict = self.state_dict()
        if "model.lm_head.weight" in state_dict:
            del state_dict["model.lm_head.weight"]

        save_file(state_dict, os.path.join(save_directory, "model.safetensors"))

        print(f"Saved model to {save_directory}")


# Register for AutoClass - this adds auto_map to config when saving
FrawdLLMConfig.register_for_auto_class()
FrawdLLMForCausalLM.register_for_auto_class("AutoModelForCausalLM")
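Taken together, the conversion path is: training checkpoint → `from_frawdllm_checkpoint` → `save_pretrained_simple` → generic reload. A minimal round-trip sketch, assuming the file layout above; the checkpoint path and output directory here are placeholders, not files from this repo:

```python
# Hypothetical paths for illustration only.
from hf_wrapper import FrawdLLMForCausalLM  # import path depends on your layout
from transformers import AutoModelForCausalLM

# Wrap a raw FrawdLLM training checkpoint and export it in HF format.
model = FrawdLLMForCausalLM.from_frawdllm_checkpoint("ckpt/final.pt")
model.save_pretrained_simple("frawdllm-hf")

# Because register_for_auto_class() puts an auto_map in config.json, the saved
# directory can be reloaded generically (custom code needs trust_remote_code).
reloaded = AutoModelForCausalLM.from_pretrained("frawdllm-hf", trust_remote_code=True)
```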
mlp.py
ADDED
@@ -0,0 +1,105 @@
"""
MLP (Multi-Layer Perceptron) for FrawdLLM.

This is the "feed-forward" part of the transformer block.
After attention lets tokens gather information from each other,
the MLP lets each token process that information independently.

Structure:
    Input (768) → Expand (3072) → GELU → Shrink (768) → Output

The 4x expansion gives the model more "thinking room" before
compressing back to the original size.
"""

import torch
import torch.nn as nn

from .config import ModelConfig


class MLP(nn.Module):
    """
    Simple feed-forward network with GELU activation.

    Input:  [batch_size, seq_len, n_embd]
    Output: [batch_size, seq_len, n_embd]
    """

    def __init__(self, config: ModelConfig):
        super().__init__()

        self.config = config

        # Hidden dimension is 4x the embedding dimension
        # This is a common ratio used in most transformers
        hidden_dim = 4 * config.n_embd

        # Expand: 768 → 3072
        self.fc1 = nn.Linear(config.n_embd, hidden_dim)

        # Activation function
        self.act = nn.GELU()

        # Shrink: 3072 → 768
        self.fc2 = nn.Linear(hidden_dim, config.n_embd)

        # Dropout for regularization
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Apply MLP to each token independently.

        Args:
            x: [batch_size, seq_len, n_embd]

        Returns:
            [batch_size, seq_len, n_embd]
        """
        # Step 1: Expand
        # [batch, seq, 768] → [batch, seq, 3072]
        x = self.fc1(x)

        # Step 2: Non-linearity
        # [batch, seq, 3072] → [batch, seq, 3072] (same shape, different values)
        x = self.act(x)

        # Step 3: Shrink back
        # [batch, seq, 3072] → [batch, seq, 768]
        x = self.fc2(x)

        # Step 4: Dropout
        x = self.dropout(x)

        return x


if __name__ == "__main__":
    # Test the MLP module
    from .config import get_config

    print("Testing MLP...")
    print("=" * 50)

    config = get_config("tiny")
    hidden_dim = 4 * config.n_embd
    print(f"Config: n_embd={config.n_embd}, hidden_dim={hidden_dim}")

    mlp = MLP(config)

    # Count parameters
    num_params = sum(p.numel() for p in mlp.parameters())
    print(f"MLP parameters: {num_params:,}")

    # Test input: [batch=2, seq=8, n_embd=256]
    x = torch.randn(2, 8, config.n_embd)
    print(f"\nInput shape: {x.shape}")

    # Forward pass
    out = mlp(x)
    print(f"Output shape: {out.shape}")

    # Verify shapes match
    assert x.shape == out.shape, "Input and output shapes should match!"
    print("\nMLP working!")
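Since both linear layers use the 4× hidden size, the parameter count printed by the test should equal 8·n_embd² + 5·n_embd (weights plus biases). A quick sanity check, using the n_embd=256 of the tiny config from the test above:

```python
# fc1: n_embd*(4*n_embd) weights + 4*n_embd biases
# fc2: (4*n_embd)*n_embd weights + n_embd biases
n_embd = 256  # the tiny config's embedding size used in the test above
expected = (n_embd * 4 * n_embd + 4 * n_embd) + (4 * n_embd * n_embd + n_embd)
print(f"{expected:,}")  # 525,568 == 8*n_embd**2 + 5*n_embd
```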
model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a70aec25201815a785d3731a9b204a149cae2f0e7788a24c1d853f1375ad5cd8
size 1243850448
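As a rough cross-check, the 1,243,850,448-byte weight file corresponds to about 311M stored values if the tensors are float32 (an assumption; the LFS pointer does not record the dtype):

```python
size_bytes = 1_243_850_448  # "size" field from the LFS pointer above
print(size_bytes / 4)       # ≈ 3.11e8 float32 values, i.e. ~311M parameters
```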
rope.py
ADDED
@@ -0,0 +1,153 @@
"""
Rotary Position Embedding (RoPE) for FrawdLLM.

RoPE encodes position by rotating the Q and K vectors. This has several advantages:
1. No learned position embeddings (saves parameters)
2. Better length generalization (can extrapolate beyond training length)
3. Relative position encoding (attention depends on distance, not absolute position)

How it works:
- Each position gets a rotation angle based on its index
- Q and K are rotated by their position's angle
- The dot product Q·K then naturally encodes relative distance

Reference: https://arxiv.org/abs/2104.09864
"""

import torch
import torch.nn as nn
import math


def precompute_freqs(dim: int, max_seq_len: int, theta: float = 10000.0) -> torch.Tensor:
    """
    Precompute the frequency tensor for RoPE.

    Args:
        dim: Dimension of each head (must be even)
        max_seq_len: Maximum sequence length
        theta: Base for frequency computation (10000 is standard)

    Returns:
        Complex tensor of shape [max_seq_len, dim//2] containing rotation frequencies
    """
    # Frequency for each dimension pair: theta^(-2i/dim) for i = 0, 1, ..., dim/2-1
    # Low-index pairs rotate quickly, high-index pairs rotate slowly
    freqs = 1.0 / (theta ** (torch.arange(0, dim, 2).float() / dim))

    # Position indices
    positions = torch.arange(max_seq_len)

    # Outer product: [max_seq_len, dim//2]
    # Each position gets a different rotation angle for each frequency
    angles = torch.outer(positions, freqs)

    # Convert to complex numbers for easy rotation
    # e^(i*angle) = cos(angle) + i*sin(angle)
    freqs_complex = torch.polar(torch.ones_like(angles), angles)

    return freqs_complex


def apply_rope(
    x: torch.Tensor,
    freqs: torch.Tensor,
    start_pos: int = 0,
) -> torch.Tensor:
    """
    Apply rotary position embedding to Q or K tensor.

    Args:
        x: [batch, n_head, seq_len, head_dim] - Q or K tensor
        freqs: [max_seq_len, head_dim//2] - precomputed frequencies
        start_pos: Starting position (for KV cache during generation)

    Returns:
        Rotated tensor with same shape as input
    """
    batch, n_head, seq_len, head_dim = x.shape

    # Get frequencies for this sequence
    # [seq_len, head_dim//2]
    seq_freqs = freqs[start_pos:start_pos + seq_len]

    # Reshape x to pairs: [batch, n_head, seq_len, head_dim//2, 2]
    # We rotate adjacent pairs of dimensions together
    x_pairs = x.float().reshape(batch, n_head, seq_len, -1, 2)

    # Convert to complex: [batch, n_head, seq_len, head_dim//2]
    x_complex = torch.view_as_complex(x_pairs)

    # Reshape freqs for broadcasting: [1, 1, seq_len, head_dim//2]
    seq_freqs = seq_freqs.unsqueeze(0).unsqueeze(0)

    # Rotate by multiplying complex numbers
    x_rotated = x_complex * seq_freqs

    # Convert back to real: [batch, n_head, seq_len, head_dim//2, 2]
    x_out = torch.view_as_real(x_rotated)

    # Flatten back: [batch, n_head, seq_len, head_dim]
    x_out = x_out.reshape(batch, n_head, seq_len, head_dim)

    return x_out.type_as(x)


class RotaryEmbedding(nn.Module):
    """
    Module wrapper for rotary embeddings.

    Precomputes and caches the frequency tensor.
    """

    def __init__(self, dim: int, max_seq_len: int = 4096, theta: float = 10000.0):
        super().__init__()
        self.dim = dim
        self.max_seq_len = max_seq_len
        self.theta = theta

        # Precompute and register as buffer (saved with model but not trained)
        freqs = precompute_freqs(dim, max_seq_len, theta)
        self.register_buffer("freqs", freqs, persistent=False)

    def forward(self, x: torch.Tensor, start_pos: int = 0) -> torch.Tensor:
        """Apply RoPE to input tensor."""
        return apply_rope(x, self.freqs, start_pos)


if __name__ == "__main__":
    print("Testing RoPE...")
    print("=" * 50)

    # Test parameters
    batch, n_head, seq_len, head_dim = 2, 4, 16, 64

    # Create rotary embedding
    rope = RotaryEmbedding(dim=head_dim, max_seq_len=512)

    # Create random Q and K
    q = torch.randn(batch, n_head, seq_len, head_dim)
    k = torch.randn(batch, n_head, seq_len, head_dim)

    print(f"Input shape: {q.shape}")

    # Apply RoPE
    q_rotated = rope(q)
    k_rotated = rope(k)

    print(f"Output shape: {q_rotated.shape}")

    # Verify relative position property
    # Attention at (i, j) should only depend on (i - j), not absolute positions
    print("\nVerifying relative position property...")

    # Use the SAME content vector at every position - with independent random
    # vectors per position, scores at different positions aren't comparable
    q_same = torch.randn(batch, n_head, 1, head_dim).repeat(1, 1, seq_len, 1)
    k_same = torch.randn(batch, n_head, 1, head_dim).repeat(1, 1, seq_len, 1)
    q_rot = rope(q_same)
    k_rot = rope(k_same)

    # Compute attention scores for two pairs with the same relative distance of 1
    attn_0_1 = q_rot[:, :, 0:1, :] @ k_rot[:, :, 1:2, :].transpose(-2, -1)
    attn_5_6 = q_rot[:, :, 5:6, :] @ k_rot[:, :, 6:7, :].transpose(-2, -1)

    # These should match up to floating-point error (same relative distance of 1)
    diff = (attn_0_1 - attn_5_6).abs().mean().item()
    print(f"  Attention (0,1) vs (5,6) difference: {diff:.6f}")
    print(f"  (Should be very small - same relative distance)")

    print("\nRoPE working!")
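Why the check above should pass, in one line: viewing each adjacent dimension pair as a complex number, RoPE multiplies position m by e^{imθ_j}, so the real dot product between a rotated query at position m and a rotated key at position n is (notation mine, following the code above):

```latex
\langle R_m q,\ R_n k \rangle
  = \operatorname{Re}\!\Big[\sum_{j=0}^{d/2-1} q_j\,\overline{k_j}\,e^{\,i(m-n)\theta_j}\Big],
\qquad \theta_j = 10000^{-2j/d},
```

which depends on m and n only through the offset m − n — exactly what the test measures for the pairs (0, 1) and (5, 6).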