Upload 4 files
README.md (CHANGED)
# 🚀 Small Language Model Training

This project implements a **125M parameter language model** optimized for training on consumer hardware with limited VRAM (4GB+). It includes efficient training with gradient accumulation and length-based batch scheduling.

## 📂 Project Structure
```
│── model.py           # Transformer-based language model (125M params)
│── train.py           # Training script with memory optimizations
│── inference.py       # Text generation script
│── requirements.txt   # Required dependencies
│── README.md          # Project documentation
```

## 📌 Features
- **Memory-Efficient Transformer Model** (~125M parameters)
- **Length-Based Batch Scheduling** for efficient training
- **Gradient Accumulation** for effective larger batch sizes
- **Autoregressive Text Generation**
- **Wikitext-2 Dataset Integration**

## 🛠 Installation
Install dependencies:
```bash
pip install -r requirements.txt
```

## 🎯 Training the Model
Run the training script:
```bash
python train.py
```

The training process includes:
- Automatic GPU/CPU device selection
- Dynamic batch scheduling by sequence length
- Gradient accumulation (effective batch size: 16, sketched below)
- Automatic checkpointing
- Cosine learning rate scheduling

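Gradient accumulation is what makes the small per-step batches behave like a batch of 16. A simplified sketch of the update loop in `train.py` (it assumes `model`, `optimizer`, `scheduler`, `train_loader`, and `device` are already set up as in that script):

```python
import torch

ACCUM_STEPS = 4  # optimizer steps once every 4 micro-batches -> effective batch size 16

optimizer.zero_grad()
for batch_idx, batch in enumerate(train_loader):
    batch = batch.to(device)
    input_ids, targets = batch[:, :-1], batch[:, 1:]

    _, loss = model(input_ids, targets)
    (loss / ACCUM_STEPS).backward()          # accumulate scaled gradients

    if (batch_idx + 1) % ACCUM_STEPS == 0:   # update weights once per 4 batches
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
```
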
## 📝 Inference
Generate text using the trained model:
```bash
python inference.py
```

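`inference.py` generates from a fixed list of example prompts. To try your own prompt, you can also import its helpers; a minimal sketch, assuming training has already produced `small_language_model.pt`:

```python
import torch
from transformers import AutoTokenizer
from model import SmallLanguageModel
from inference import create_model_config, generate_text

device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

model = SmallLanguageModel(create_model_config(tokenizer.vocab_size)).to(device)
model.load_state_dict(torch.load("small_language_model.pt", map_location=device))

print(generate_text("Once upon a time", model, tokenizer,
                    max_length=50, temperature=0.8, top_k=50))
```
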
## 🏗 Model Architecture
- **Layers:** 12 transformer blocks
- **Attention Heads:** 12 heads
- **Embedding Dimension:** 768
- **Context Window:** 512 tokens
- **Total Parameters:** ~125M
- **Activation:** GELU
- **Layer Normalization:** Pre-norm architecture

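These values match the `ModelConfig` built by `create_model_config` in `train.py` and `inference.py` (50257 is the GPT-2 tokenizer vocabulary size):

```python
from model import ModelConfig, SmallLanguageModel

config = ModelConfig(
    vocab_size=50257,  # GPT-2 tokenizer vocabulary
    block_size=512,    # context window
    n_layer=12,
    n_head=12,
    n_embd=768,
    dropout=0.1,
    bias=True,
)
model = SmallLanguageModel(config)  # prints a parameter-count breakdown on construction
```
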
## ⚡ Performance Optimizations
- ✅ Length-based batch scheduling
- ✅ Gradient accumulation (4 steps)
- ✅ Efficient memory usage
- ✅ Optimized for 4GB VRAM GPUs
- ✅ No padding: each batch contains sequences of identical length

## 🔧 Training Configuration
- **Batch Size:** 4 (16 with gradient accumulation)
- **Learning Rate:** 3e-4 with cosine decay
- **Weight Decay:** 0.1
- **Training Data:** Wikitext-2
- **Epochs:** 3

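The optimizer and schedule above are created in `train.py` essentially as follows (a sketch; `model` and `train_loader` come from the same script):

```python
import torch.optim as optim

optimizer = optim.AdamW(model.parameters(), lr=3e-4, weight_decay=0.1)

# Cosine decay from 3e-4 down to 1e-5 across the three training epochs
scheduler = optim.lr_scheduler.CosineAnnealingLR(
    optimizer,
    T_max=len(train_loader) * 3,
    eta_min=1e-5,
)
```
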
## 📊 Memory Usage
- **GPU VRAM:** ~3.5GB peak
- **Recommended GPU:** 4GB+ VRAM
- **CPU RAM:** ~8GB recommended

## 📜 License
This project is licensed under the MIT License.

---
🚀 Happy Training! Feel free to contribute or raise issues. 🎯

inference.py (ADDED)
import torch
from transformers import AutoTokenizer
from model import SmallLanguageModel, ModelConfig

def create_model_config(vocab_size):
    """Create model configuration matching training"""
    return ModelConfig(
        vocab_size=vocab_size,
        block_size=512,
        n_layer=12,
        n_head=12,
        n_embd=768,
        dropout=0.1,
        bias=True
    )

def generate_text(prompt, model, tokenizer, max_length=100, temperature=0.8, top_k=50):
    model.eval()
    # Run on whatever device the model lives on (also works when imported from another script)
    device = next(model.parameters()).device
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

    with torch.no_grad():
        for _ in range(max_length):
            # Get model predictions
            outputs = model(input_ids)
            next_token_logits = outputs[:, -1, :] / temperature

            # Apply top-k filtering
            top_k_logits, top_k_indices = torch.topk(next_token_logits, top_k, dim=-1)
            next_token_logits[0, :] = float('-inf')
            next_token_logits[0, top_k_indices[0]] = top_k_logits[0]

            # Sample from the filtered distribution
            probs = torch.softmax(next_token_logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)

            # Append to input_ids
            input_ids = torch.cat([input_ids, next_token], dim=-1)

            # Stop if we generate the EOS token
            if next_token[0].item() == tokenizer.eos_token_id:
                break

    return tokenizer.decode(input_ids[0], skip_special_tokens=True)

if __name__ == "__main__":
    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    tokenizer.pad_token = tokenizer.eos_token

    # Setup device
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")

    # Create and load model
    config = create_model_config(tokenizer.vocab_size)
    model = SmallLanguageModel(config).to(device)

    # Load trained weights
    try:
        checkpoint = torch.load("small_language_model.pt", map_location=device)
        model.load_state_dict(checkpoint)
        print("Loaded model from small_language_model.pt")
    except FileNotFoundError:
        print("No saved model found. Please train the model first.")
        exit(1)

    # Generate some example texts
    prompts = [
        "Once upon a time",
        "The meaning of life is",
        "In the distant future",
        "The best way to learn programming is"
    ]

    print("\nGenerating text samples:\n")
    for prompt in prompts:
        print(f"Prompt: {prompt}")
        generated_text = generate_text(
            prompt,
            model,
            tokenizer,
            max_length=100,
            temperature=0.8,
            top_k=50
        )
        print(f"Generated: {generated_text}\n")

model.py (ADDED)
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

class LayerNorm(nn.Module):
    def __init__(self, ndim, bias=True):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(ndim))
        self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None

    def forward(self, x):
        return F.layer_norm(x, self.weight.shape, self.weight, self.bias, 1e-5)

class MultiHeadAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.n_head = config.n_head
        self.head_dim = config.n_embd // config.n_head

        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
        self.attn_dropout = nn.Dropout(config.dropout)
        self.resid_dropout = nn.Dropout(config.dropout)

        self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size))
                             .view(1, 1, config.block_size, config.block_size))

    def forward(self, x):
        B, T, C = x.size()  # batch, sequence length, embedding dim

        # calculate query, key, values
        q, k, v = self.c_attn(x).split(self.config.n_embd, dim=2)
        k = k.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
        q = q.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
        v = v.view(B, T, self.n_head, self.head_dim).transpose(1, 2)

        # causal self-attention
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
        att = F.softmax(att, dim=-1)
        att = self.attn_dropout(att)
        y = att @ v
        y = y.transpose(1, 2).contiguous().view(B, T, C)

        return self.resid_dropout(self.c_proj(y))

class MLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias)
        self.gelu = nn.GELU()
        self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        x = self.c_fc(x)
        x = self.gelu(x)
        x = self.c_proj(x)
        x = self.dropout(x)
        return x

class Block(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.ln_1 = LayerNorm(config.n_embd, bias=config.bias)
        self.attn = MultiHeadAttention(config)
        self.ln_2 = LayerNorm(config.n_embd, bias=config.bias)
        self.mlp = MLP(config)

    def forward(self, x):
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x

class ModelConfig:
    def __init__(self, vocab_size=50257, block_size=1024, n_layer=24, n_head=16,
                 n_embd=1024, dropout=0.1, bias=True):
        self.vocab_size = vocab_size
        self.block_size = block_size
        self.n_layer = n_layer
        self.n_head = n_head
        self.n_embd = n_embd
        self.dropout = dropout
        self.bias = bias

def count_parameters(model):
    """Count number of trainable parameters in the model"""
    total = sum(p.numel() for p in model.parameters() if p.requires_grad)

    # Calculate parameters for each component
    embedding_params = model.transformer.wte.weight.numel() + model.transformer.wpe.weight.numel()

    attention_params = 0
    mlp_params = 0
    layer_norm_params = 0

    for block in model.transformer.h:
        # Attention parameters
        attention_params += (
            block.attn.c_attn.weight.numel() +
            (block.attn.c_attn.bias.numel() if block.attn.c_attn.bias is not None else 0) +
            block.attn.c_proj.weight.numel() +
            (block.attn.c_proj.bias.numel() if block.attn.c_proj.bias is not None else 0)
        )

        # MLP parameters
        mlp_params += (
            block.mlp.c_fc.weight.numel() +
            (block.mlp.c_fc.bias.numel() if block.mlp.c_fc.bias is not None else 0) +
            block.mlp.c_proj.weight.numel() +
            (block.mlp.c_proj.bias.numel() if block.mlp.c_proj.bias is not None else 0)
        )

        # Layer norm parameters
        layer_norm_params += (
            block.ln_1.weight.numel() +
            (block.ln_1.bias.numel() if block.ln_1.bias is not None else 0) +
            block.ln_2.weight.numel() +
            (block.ln_2.bias.numel() if block.ln_2.bias is not None else 0)
        )

    # Final layer norm
    layer_norm_params += (
        model.transformer.ln_f.weight.numel() +
        (model.transformer.ln_f.bias.numel() if model.transformer.ln_f.bias is not None else 0)
    )

    # Print detailed breakdown
    print(f"\nParameter Count Breakdown:")
    print(f"Embeddings: {embedding_params:,} parameters")
    print(f"Attention Layers: {attention_params:,} parameters")
    print(f"MLP Layers: {mlp_params:,} parameters")
    print(f"Layer Normalization: {layer_norm_params:,} parameters")
    print(f"Total: {total:,} parameters")

    return total

class SmallLanguageModel(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            wpe = nn.Embedding(config.block_size, config.n_embd),
            drop = nn.Dropout(config.dropout),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f = LayerNorm(config.n_embd, bias=config.bias),
        ))

        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        self.transformer.wte.weight = self.lm_head.weight

        # Initialize weights
        self.apply(self._init_weights)

        print("\nModel Configuration:")
        print(f"Layers: {config.n_layer}")
        print(f"Heads: {config.n_head}")
        print(f"Embedding Dimension: {config.n_embd}")
        print(f"Context Window: {config.block_size}")
        count_parameters(self)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, input_ids, targets=None):
        device = input_ids.device
        b, t = input_ids.size()
        pos = torch.arange(0, t, dtype=torch.long, device=device)

        # forward the model
        tok_emb = self.transformer.wte(input_ids)
        pos_emb = self.transformer.wpe(pos)
        x = self.transformer.drop(tok_emb + pos_emb)

        for block in self.transformer.h:
            x = block(x)

        x = self.transformer.ln_f(x)
        logits = self.lm_head(x)

        if targets is not None:
            # Reshape logits and targets for loss calculation
            logits = logits.reshape(-1, logits.size(-1))
            targets = targets.reshape(-1)
            loss = F.cross_entropy(logits, targets)
            return logits, loss

        return logits

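A quick standalone sanity check for the model above (this snippet is not one of the uploaded files): run a dummy batch through it and confirm the output shapes.

```python
import torch
from model import ModelConfig, SmallLanguageModel

# Same hyperparameters the training script uses
config = ModelConfig(vocab_size=50257, block_size=512, n_layer=12, n_head=12, n_embd=768)
model = SmallLanguageModel(config)

dummy = torch.randint(0, config.vocab_size, (2, 16))    # batch of 2 sequences, 16 tokens each

logits = model(dummy)                                   # no targets -> per-token logits
print(logits.shape)                                     # torch.Size([2, 16, 50257])

logits_flat, loss = model(dummy[:, :-1], dummy[:, 1:])  # with targets -> (flattened logits, loss)
print(logits_flat.shape, loss.item())                   # torch.Size([30, 50257]) and a scalar loss
```
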
train.py (ADDED)
import torch
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer
from datasets import load_dataset
from model import SmallLanguageModel, ModelConfig
import random

def create_model_config(vocab_size):
    """Create a ~125M parameter model configuration"""
    return ModelConfig(
        vocab_size=vocab_size,
        block_size=512,   # Reduced from 1024
        n_layer=12,       # Reduced from 24
        n_head=12,        # Reduced from 16
        n_embd=768,       # Reduced from 1024
        dropout=0.1,
        bias=True
    )

def setup_training():
    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    tokenizer.pad_token = tokenizer.eos_token

    # Create model configuration
    config = create_model_config(tokenizer.vocab_size)

    # Initialize model
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = SmallLanguageModel(config).to(device)

    return model, tokenizer, device

class TextDataset(Dataset):
    def __init__(self, tokenized_texts, block_size, tokenizer):
        self.examples = []
        self.block_size = block_size
        self.tokenizer = tokenizer

        # Group texts by exact length
        self.length_groups = {}  # Keep as instance variable

        for text in tokenized_texts["input_ids"]:
            if len(text) > 1:  # Ensure text is at least 2 tokens
                # Truncate if longer than block_size + 1
                if len(text) > block_size + 1:
                    text = text[:block_size + 1]

                length = len(text)
                if length not in self.length_groups:
                    self.length_groups[length] = []
                self.length_groups[length].append(torch.tensor(text, dtype=torch.long))

        # Sort lengths for more efficient batching
        self.lengths = sorted(self.length_groups.keys())

        # Create index mapping
        self.length_to_idx = {}
        start_idx = 0
        for length in self.lengths:
            group = self.length_groups[length]
            self.length_to_idx[length] = (start_idx, start_idx + len(group))
            start_idx += len(group)
            self.examples.extend(group)

        print(f"Created {len(self.examples)} sequences across {len(self.lengths)} different lengths")

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        return self.examples[idx]

class BatchSchedulerSampler(torch.utils.data.Sampler):
    """Samples batches according to sequence length"""
    def __init__(self, dataset, batch_size):
        super().__init__(dataset)
        self.dataset = dataset
        self.batch_size = batch_size

        # Create batches for each length
        self.batches = []
        for length in dataset.lengths:
            start_idx, end_idx = dataset.length_to_idx[length]
            # Create batches of indices for this length
            indices = list(range(start_idx, end_idx))
            for i in range(0, len(indices), batch_size):
                self.batches.append(indices[i:i + batch_size])

    def __iter__(self):
        # Shuffle batches
        random.shuffle(self.batches)
        for batch in self.batches:
            yield batch

    def __len__(self):
        return len(self.batches)

def prepare_dataset(tokenizer, block_size):
    # Load and tokenize dataset
    dataset = load_dataset("wikitext", "wikitext-2-raw-v1")

    def tokenize_function(examples):
        # Remove empty strings and concatenate all texts
        texts = [text for text in examples["text"] if len(text.strip()) > 0]
        return tokenizer(texts, truncation=False, padding=False)

    tokenized_dataset = dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=dataset["train"].column_names,
        desc="Tokenizing texts"
    )

    # Create training dataset with tokenizer
    train_dataset = TextDataset(tokenized_dataset["train"], block_size=block_size, tokenizer=tokenizer)
    print(f"Created dataset with {len(train_dataset)} examples")
    return train_dataset

def collate_batch(batch):
    # All tensors in a batch should be the same length
    return torch.stack(batch)

def train_model(model, train_loader, optimizer, scheduler, device, num_epochs=3, gradient_accumulation_steps=4):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        optimizer.zero_grad()  # Zero gradients at start of epoch

        for batch_idx, batch in enumerate(train_loader):
            batch = batch.to(device)

            # Get input_ids and targets
            input_ids = batch[:, :-1].contiguous()
            targets = batch[:, 1:].contiguous()

            # Forward pass
            logits, loss = model(input_ids, targets)

            # Scale loss for gradient accumulation
            loss = loss / gradient_accumulation_steps
            loss.backward()

            # Update weights every gradient_accumulation_steps
            if (batch_idx + 1) % gradient_accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()

            total_loss += loss.item() * gradient_accumulation_steps

            if batch_idx % 10 == 0:
                print(f"Epoch {epoch+1}, Batch {batch_idx}, Loss: {loss.item() * gradient_accumulation_steps:.4f}, LR: {scheduler.get_last_lr()[0]:.6f}")

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1} completed. Average Loss: {avg_loss:.4f}")

        # Save checkpoint
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': avg_loss,
        }, f'checkpoint_epoch_{epoch+1}.pt')

def main():
    # Setup
    model, tokenizer, device = setup_training()

    # Prepare dataset
    train_dataset = prepare_dataset(tokenizer, model.config.block_size)

    # Use custom sampler instead of shuffle
    train_loader = DataLoader(
        train_dataset,
        batch_sampler=BatchSchedulerSampler(train_dataset, batch_size=4),  # Reduced batch size from 8 to 4
        num_workers=4
    )

    # Training setup with gradient accumulation
    optimizer = optim.AdamW(model.parameters(),
                            lr=3e-4,
                            weight_decay=0.1)

    # Learning rate scheduler
    scheduler = optim.lr_scheduler.CosineAnnealingLR(
        optimizer,
        T_max=len(train_loader) * 3,  # 3 epochs
        eta_min=1e-5
    )

    # Train the model
    train_model(model, train_loader, optimizer, scheduler, device)

    # Save the final model
    torch.save(model.state_dict(), "small_language_model.pt")

if __name__ == "__main__":
    main()