Nitin2004
/

APE

PyTorch

custom

Model card Files Files and versions

xet

Community

Nitin2004 committed on Mar 27, 2025

Commit

f881e8c

verified ·

1 Parent(s): 5626bda

Upload full model_ADPB folder

Browse files

Files changed (3) hide show

APE.py +509 -0
config.json +15 -0
pytorch_model.bin +3 -0

APE.py ADDED Viewed

	@@ -0,0 +1,509 @@

+import os
+import time
+import math
+import pickle
+import random
+import json
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import matplotlib.pyplot as plt
+# We use Hugging Face’s transformers only for pretrained weight loading and tokenizer.
+from transformers import GPT2LMHeadModel, GPT2Tokenizer
+from dataclasses import dataclass
+# ----------------------------
+# Helper: ALiBi slopes computation
+# ----------------------------
def get_alibi_slopes(n_head):
    """Return the per-head ALiBi slopes as a 1-D float32 tensor of length ``n_head``.

    For a power-of-two head count ``n`` the slopes form the geometric sequence
    ``b, b**2, ..., b**n`` with ``b = 2 ** (-8 / n)``.  For any other head
    count, the slopes of the closest lower power of two come first, followed by
    every second slope of the sequence for twice that power of two until
    ``n_head`` values have been collected.
    """
    def geometric_slopes(count):
        # First term and common ratio are the same number: 2 ** (-8 / count).
        base = 2 ** (-2 ** -(math.log2(count) - 3))
        return [base * (base ** exponent) for exponent in range(count)]

    if math.log2(n_head).is_integer():
        head_slopes = geometric_slopes(n_head)
    else:
        lower_pow2 = 2 ** math.floor(math.log2(n_head))
        missing = n_head - lower_pow2
        interleaved = geometric_slopes(2 * lower_pow2)[0::2]
        head_slopes = geometric_slopes(lower_pow2) + interleaved[:missing]
    return torch.tensor(head_slopes, dtype=torch.float32)
+# ----------------------------
+# Model Components
+# ----------------------------
class LayerNorm(nn.Module):
    """Layer normalisation over the last dimension with a switchable bias term.

    The scale (``weight``) is always learned and starts at one; the shift
    (``bias``) starts at zero and is only created when ``bias`` is true.
    """

    def __init__(self, ndim, bias: bool):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(ndim))
        if bias:
            self.bias = nn.Parameter(torch.zeros(ndim))
        else:
            self.bias = None

    def forward(self, input):
        # The normalised shape is taken from the weight so both always agree.
        return F.layer_norm(
            input,
            self.weight.shape,
            weight=self.weight,
            bias=self.bias,
            eps=1e-5,
        )
class CausalSelfAttention(nn.Module):
    """Multi-head causal self-attention with optional RoPE, ALiBi or APE.

    Positional schemes (read from ``config``):

    * ``use_rope``  - rotary embeddings applied to queries and keys.  Disabled
      automatically when ALiBi is on, matching what ``GPT`` does.
    * ``use_alibi`` - a per-head linear penalty ``-slope * (i - j)`` is added
      to the score of query ``i`` attending to key ``j``.
    * ``use_ape``   - scores are scaled by ``1 / (1 + |i - j|)`` and shifted by
      ``-beta * log(1 + |i - j|)`` with a learned scalar ``beta``.

    The fused ``scaled_dot_product_attention`` kernel is used only when no
    score modification and no entropy is required.  Every other case goes
    through the explicit path so that the bias is really applied and the
    entropy is computed from the causally masked distribution.

    NOTE: earlier revisions silently ignored the ALiBi bias whenever the fused
    kernel was available, applied it with the opposite sign otherwise, and
    kept RoPE active under ALiBi.  Checkpoints trained with ``use_alibi=True``
    under those revisions will therefore behave differently here.  APE and
    plain RoPE behaviour is unchanged.
    """

    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
        self.attn_dropout = nn.Dropout(config.dropout)
        self.resid_dropout = nn.Dropout(config.dropout)
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        self.dropout = config.dropout
        self.rope_base = config.rope_base
        self.use_ape = getattr(config, 'use_ape', False)
        self.use_alibi = getattr(config, 'use_alibi', False)
        if self.use_alibi and self.use_ape:
            raise ValueError("Cannot use both ALiBi and APE simultaneously.")
        # ALiBi replaces rotary embeddings instead of stacking on top of them.
        self.use_rope = config.use_rope and not self.use_alibi
        # For APE, learn a scalar beta that scales the log-distance penalty.
        if self.use_ape:
            self.beta = nn.Parameter(torch.tensor(1.0))
        self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')
        # The registration condition is kept as-is so state_dict keys of
        # existing checkpoints still match; other configurations build the
        # causal mask on demand in ``_causal_mask``.
        if (not self.flash) or self.use_ape:
            self.register_buffer(
                "bias",
                torch.tril(torch.ones(config.block_size, config.block_size))
                .view(1, 1, config.block_size, config.block_size),
            )

    def _apply_rope(self, q, k):
        """Rotate consecutive channel pairs of ``q`` and ``k`` by position-dependent angles."""
        T, head_dim = q.size(-2), q.size(-1)
        half = head_dim // 2
        # Angles are computed in at least float32 so long sequences keep
        # distinct positions under fp16/bf16.
        low_precision = q.dtype in (torch.float16, torch.bfloat16)
        angle_dtype = torch.float32 if low_precision else q.dtype
        theta = 1.0 / (
            self.rope_base
            ** (2 * torch.arange(0, half, dtype=angle_dtype, device=q.device) / head_dim)
        )
        positions = torch.arange(T, device=q.device, dtype=angle_dtype)
        freqs = torch.outer(positions, theta)
        cos = torch.cos(freqs).to(q.dtype).unsqueeze(0).unsqueeze(0)
        sin = torch.sin(freqs).to(q.dtype).unsqueeze(0).unsqueeze(0)

        def rotate(tensor):
            pairs = tensor.reshape(*tensor.shape[:-1], -1, 2)
            x0 = pairs[..., 0]
            x1 = pairs[..., 1]
            x0_rot = x0 * cos - x1 * sin
            x1_rot = x0 * sin + x1 * cos
            return torch.stack([x0_rot, x1_rot], dim=-1).flatten(start_dim=-2)

        return rotate(q), rotate(k)

    def _apply_position_bias(self, att):
        """Return the scores ``att`` [B, n_head, T, T] with the ALiBi/APE term applied."""
        if not (self.use_alibi or self.use_ape):
            return att
        T = att.size(-1)
        pos = torch.arange(T, device=att.device)
        # distance[i, j] = i - j: how far key j lies behind query i.
        distance = pos.unsqueeze(1) - pos.unsqueeze(0)
        if self.use_alibi:
            slopes = get_alibi_slopes(self.n_head).to(device=att.device, dtype=att.dtype)
            penalty = slopes.view(1, self.n_head, 1, 1) * distance.to(att.dtype).view(1, 1, T, T)
            return att - penalty
        abs_rel = distance.abs().float()
        temp_matrix = (1.0 / (1.0 + abs_rel)).unsqueeze(0).unsqueeze(0)
        bias_matrix = (-self.beta * torch.log(1.0 + abs_rel)).unsqueeze(0).unsqueeze(0)
        return temp_matrix * att + bias_matrix

    def _causal_mask(self, T, device):
        """Return a [1, 1, T, T] lower-triangular mask of ones."""
        cached = getattr(self, "bias", None)
        if cached is not None and T <= cached.size(-1):
            return cached[:, :, :T, :T]
        # No buffer registered, or the sequence is longer than block_size.
        return torch.tril(torch.ones(T, T, device=device)).view(1, 1, T, T)

    def forward(self, x, return_attn_entropy=False, aggregate_heads=False):
        """
        Args:
            x: Input tensor [B, T, C]
            return_attn_entropy (bool): If True, also return the entropy of
                each query's causal attention distribution.
            aggregate_heads (bool): If True, average the entropy across heads.
        Returns:
            y: Output tensor [B, T, C], or (y, entropy) where entropy is
            [B, n_head, T] ([B, T] when aggregate_heads is True).
        """
        B, T, C = x.size()
        head_dim = C // self.n_head
        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
        # Reshape to [B, n_head, T, head_dim]
        q = q.view(B, T, self.n_head, head_dim).transpose(1, 2)
        k = k.view(B, T, self.n_head, head_dim).transpose(1, 2)
        v = v.view(B, T, self.n_head, head_dim).transpose(1, 2)
        if self.use_rope:
            q, k = self._apply_rope(q, k)

        needs_scores = (
            self.use_ape or self.use_alibi or return_attn_entropy or not self.flash
        )
        entropy = None
        if not needs_scores:
            y = F.scaled_dot_product_attention(
                q, k, v,
                attn_mask=None,
                dropout_p=self.dropout if self.training else 0,
                is_causal=True,
            )
        else:
            att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(head_dim))
            att = self._apply_position_bias(att)
            att = att.masked_fill(self._causal_mask(T, x.device) == 0, float('-inf'))
            p_att = F.softmax(att, dim=-1)
            if return_attn_entropy:
                entropy = -(p_att * torch.log(p_att + 1e-9)).sum(dim=-1)  # [B, n_head, T]
            y = self.attn_dropout(p_att) @ v  # [B, n_head, T, head_dim]
        y = y.transpose(1, 2).contiguous().view(B, T, C)
        y = self.resid_dropout(self.c_proj(y))
        if return_attn_entropy:
            if aggregate_heads:
                entropy = entropy.mean(dim=1)  # [B, T]
            return y, entropy
        return y
class MLP(nn.Module):
    """Position-wise feed-forward network: expand 4x, GELU, project back, dropout."""

    def __init__(self, config):
        super().__init__()
        width = config.n_embd
        hidden = 4 * width
        self.c_fc = nn.Linear(width, hidden, bias=config.bias)
        self.gelu = nn.GELU()
        self.c_proj = nn.Linear(hidden, width, bias=config.bias)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        expanded = self.gelu(self.c_fc(x))
        return self.dropout(self.c_proj(expanded))
class Block(nn.Module):
    """Pre-norm transformer block: attention and MLP, each wrapped in a residual connection."""

    def __init__(self, config):
        super().__init__()
        self.ln_1 = LayerNorm(config.n_embd, bias=config.bias)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = LayerNorm(config.n_embd, bias=config.bias)
        self.mlp = MLP(config)

    def forward(self, x, return_attn_entropy=False, aggregate_heads=False):
        """Return the block output, plus the attention entropy when requested."""
        normed = self.ln_1(x)
        if return_attn_entropy:
            attn_output, entropy = self.attn(
                normed, return_attn_entropy=True, aggregate_heads=aggregate_heads
            )
        else:
            attn_output = self.attn(normed, return_attn_entropy=False)
            entropy = None
        x = x + attn_output
        x = x + self.mlp(self.ln_2(x))
        return (x, entropy) if return_attn_entropy else x
@dataclass
class GPTConfig:
    """Hyperparameters consumed by ``GPT`` and its sub-modules."""

    # Sequence length used for the causal-mask buffer and the learned
    # position table.
    block_size: int = 128
    # Vocabulary size.
    vocab_size: int = 50304  # For GPT-2
    # Number of transformer blocks.
    n_layer: int = 6
    # Attention heads per block; must divide n_embd.
    n_head: int = 6
    # Embedding / residual width.
    n_embd: int = 384
    # Dropout probability used throughout the model.
    dropout: float = 0.0
    # Whether Linear and LayerNorm modules carry a bias term.
    bias: bool = True
    # Rotary position embeddings on queries and keys.
    use_rope: bool = True
    # Base of the rotary frequency schedule.
    rope_base: float = 10000.0
    # Distance-dependent score scaling and log-distance bias in attention.
    use_ape: bool = False
    # Not read by any module in this file.
    lambda_temp: float = 0.1
    # ALiBi linear attention bias; mutually exclusive with use_ape.
    use_alibi: bool = False
class GPT(nn.Module):
    """Decoder-only transformer language model.

    The model consists of a token embedding (weight-tied to the output head),
    an optional learned position embedding, ``config.n_layer`` ``Block``
    layers and a final LayerNorm.  When RoPE or ALiBi is active no learned
    position embedding is added to the token embeddings.
    """

    def __init__(self, config):
        super().__init__()
        assert config.vocab_size is not None and config.block_size is not None
        self.config = config
        # If using ALiBi, disable RoPE.
        self.use_rope = config.use_rope and not config.use_alibi
        print(f"Using RoPE in GPT init: {self.use_rope}")
        self.transformer = nn.ModuleDict(dict(
            wte=nn.Embedding(config.vocab_size, config.n_embd),
            # A position table is still created under ALiBi even though
            # forward() does not read it then; kept for checkpoint
            # compatibility.
            wpe=None if self.use_rope else nn.Embedding(config.block_size, config.n_embd),
            drop=nn.Dropout(config.dropout),
            h=nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f=LayerNorm(config.n_embd, bias=config.bias),
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        # Weight tying: the input embedding and the output projection share
        # one matrix.
        self.transformer.wte.weight = self.lm_head.weight
        self.apply(self._init_weights)
        # Residual projections get a smaller init, scaled by network depth.
        for pn, p in self.named_parameters():
            if pn.endswith('c_proj.weight'):
                torch.nn.init.normal_(p, mean=0.0, std=0.02 / math.sqrt(2 * config.n_layer))
        print("number of parameters: %.2fM" % (self.get_num_params() / 1e6,))

    def get_num_params(self, non_embedding=True):
        """Count parameters; with ``non_embedding`` the position table is excluded."""
        n_params = sum(p.numel() for p in self.parameters())
        if non_embedding and (not self.use_rope) and (self.transformer.wpe is not None):
            n_params -= self.transformer.wpe.weight.numel()
        return n_params

    def _init_weights(self, module):
        """Initialise Linear/Embedding weights from N(0, 0.02) and zero Linear biases."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None, return_attn_entropy=False, aggregate_heads=False):
        """Run the model on token ids ``idx`` of shape [b, t].

        Args:
            idx: LongTensor of token ids, shape [b, t].
            targets: Optional LongTensor [b, t]; entries equal to -1 are
                ignored by the loss.
            return_attn_entropy: If True, also return one attention-entropy
                tensor per layer.
            aggregate_heads: Forwarded to the blocks; averages entropy over heads.

        Returns:
            ``(logits, loss)`` or ``(logits, loss, attn_entropies)``.  With
            ``targets`` the logits cover every position, [b, t, vocab];
            without, only the last position is projected, [b, 1, vocab], and
            ``loss`` is None.
        """
        tok_emb = self.transformer.wte(idx)
        if self.use_rope or self.config.use_alibi:
            x = self.transformer.drop(tok_emb)
        else:
            if self.transformer.wpe is not None:
                pos = torch.arange(0, idx.size(1), dtype=torch.long, device=idx.device)
                pos_emb = self.transformer.wpe(pos)
            else:
                pos_emb = 0
            x = self.transformer.drop(tok_emb + pos_emb)
        attn_entropies = []
        for block in self.transformer.h:
            if return_attn_entropy:
                x, entropy = block(x, return_attn_entropy=True, aggregate_heads=aggregate_heads)
                attn_entropies.append(entropy)
            else:
                x = block(x)
        x = self.transformer.ln_f(x)
        if targets is not None:
            logits = self.lm_head(x)
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
        else:
            # Inference shortcut: only the final position is projected.
            logits = self.lm_head(x[:, [-1], :])
            loss = None
        if return_attn_entropy:
            return logits, loss, attn_entropies
        return logits, loss

    @torch.no_grad()
    def generate_and_compute_perplexity(self, prompt, ground_truth, temperature=1.0, return_attn_entropy=False, aggregate_heads=False):
        """Teacher-forced perplexity of the tokens of ``ground_truth`` that follow ``prompt``.

        Args:
            prompt: LongTensor [B, P]; expected to be the first P tokens of
                ``ground_truth``.
            ground_truth: LongTensor [B, L] with L >= P.
            temperature: Divides the logits before the loss is computed.
            return_attn_entropy: If True, also report the mean attention
                entropy over the prompt, averaged across layers.
            aggregate_heads: Forwarded to the model when entropy is requested.

        Returns:
            ``(idx, perplexity, avg_entropy)``: the prompt extended with the
            ground-truth continuation, the perplexity (``inf`` when there is
            nothing to score or the value overflows a float), and the average
            entropy or None.
        """
        if return_attn_entropy:
            _, _, attn_entropies = self(prompt, return_attn_entropy=True, aggregate_heads=aggregate_heads)
            per_layer_avgs = [entropy.mean().item() for entropy in attn_entropies]
            avg_entropy = np.mean(per_layer_avgs)
        else:
            avg_entropy = None
        total_loss = 0.0
        total_tokens = 0
        prompt_length = prompt.size(1)
        num_target_tokens = ground_truth.size(1) - prompt_length
        idx = prompt.clone()
        for i in range(num_target_tokens):
            logits, _ = self(idx)
            logits = logits[:, -1, :] / temperature
            target = ground_truth[:, prompt_length + i]
            loss = F.cross_entropy(logits, target, reduction='sum')
            total_loss += loss.item()
            total_tokens += target.numel()
            # Teacher forcing: feed the true token, not a sampled one.
            idx = torch.cat((idx, target.unsqueeze(1)), dim=1)
        avg_neg_log_likelihood = total_loss / total_tokens if total_tokens > 0 else float('inf')
        try:
            perplexity = math.exp(avg_neg_log_likelihood)
        except OverflowError:
            # A very large average loss does not fit in a float; report inf.
            perplexity = float('inf')
        return idx, perplexity, avg_entropy

    @torch.no_grad()
    def generate_until_end(self, idx, temperature=1.0, top_k=None, max_new_tokens=1000, eos_token_id=50256):
        """Sample tokens until every sequence has emitted EOS or the budget is spent.

        Args:
            idx: LongTensor [B, T] of conditioning tokens.
            temperature: Divides the logits before sampling.
            top_k: If given, sampling is restricted to the k most likely tokens.
            max_new_tokens: Upper bound on the number of generated tokens.
            eos_token_id: Token id that ends a sequence (default 50256, the
                value previously hard-coded here); None disables early stopping.

        Returns:
            LongTensor [B, T + n] with the generated tokens appended.  Rows
            that finish early are padded with ``eos_token_id``.
        """
        finished = torch.zeros(idx.size(0), dtype=torch.bool, device=idx.device)
        for _ in range(max_new_tokens):
            logits, _ = self(idx)
            logits = logits[:, -1, :] / temperature
            if top_k is not None:
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < v[:, [-1]]] = -float('Inf')
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            if eos_token_id is not None:
                # Rows that already ended keep emitting EOS as padding.
                idx_next = idx_next.masked_fill(finished.unsqueeze(1), eos_token_id)
                finished = finished | (idx_next.squeeze(1) == eos_token_id)
            idx = torch.cat((idx, idx_next), dim=1)
            if eos_token_id is not None and bool(finished.all()):
                break
        return idx
+# ----------------------------
+# Utility Functions for Training & Evaluation
+# ----------------------------
+# Data Loader Functions
# Locations of the pre-tokenised TinyStories splits (flat uint16 token files).
train_data_path = "/data1/home/nitinvetcha/Topics in AI/Streamlined/COLM2025/train_tinystories.bin"
val_data_path   = "/data1/home/nitinvetcha/Topics in AI/Streamlined/COLM2025/val_tinystories.bin"


def get_batch(split):
    """Sample a random batch of (input, next-token target) windows.

    ``split`` selects the training file when it equals ``'train'`` and the
    validation file otherwise.  Relies on the module-level ``gptconf``,
    ``batch_size`` and ``device``.  Returns two int64 tensors of shape
    [batch_size, block_size] on ``device``; the second is the first shifted
    one token to the right.
    """
    source = val_data_path if split != 'train' else train_data_path
    # The memmap is opened on every call, so the file is never loaded whole.
    tokens = np.memmap(source, dtype=np.uint16, mode='r')
    span = gptconf.block_size
    upper = max(1, len(tokens) - span)
    starts = torch.randint(0, upper, (batch_size,))

    def window(offset):
        return torch.from_numpy(tokens[offset:offset + span].astype(np.int64))

    X = torch.stack([window(i) for i in starts])
    Y = torch.stack([window(i + 1) for i in starts])
    return X.to(device), Y.to(device)
def evaluate_prompt_perplexity(model, token_file, prompt_length, num_trials, generation_params, device):
    """Average teacher-forced perplexity and attention entropy over random windows.

    For each trial a window of ``prompt_length + max_new_tokens`` tokens is
    drawn at a random offset of ``token_file``.  The first ``prompt_length``
    tokens form the prompt and the whole window is the ground truth passed to
    ``model.generate_and_compute_perplexity``.

    Args:
        model: Object exposing ``generate_and_compute_perplexity``.
        token_file: Path to a flat file of uint16 token ids.
        prompt_length: Number of prompt tokens per trial.
        num_trials: Number of random windows to evaluate.
        generation_params: Dict with optional ``"max_new_tokens"`` (default
            50) and ``"temperature"`` (default 1.0).
        device: Device the token tensors are moved to.

    Returns:
        ``(avg_ppl, avg_entropy)`` averaged over the trials.

    Raises:
        ValueError: If the file holds fewer tokens than one window needs.
    """
    tokens = np.fromfile(token_file, dtype=np.uint16)
    total_tokens = len(tokens)
    max_new_tokens = generation_params.get("max_new_tokens", 50)
    temperature = generation_params.get("temperature", 1.0)
    total_length = prompt_length + max_new_tokens
    if total_length > total_tokens:
        raise ValueError(
            f"{token_file} holds {total_tokens} tokens but a window of "
            f"{total_length} tokens (prompt {prompt_length} + {max_new_tokens} new) is required"
        )
    perplexities = []
    entropy_trials = []
    for trial in range(num_trials):
        start_idx = random.randint(0, total_tokens - total_length)
        window = tokens[start_idx : start_idx + total_length]
        # Cast to int64 before handing over to torch, as get_batch does:
        # uint16 arrays are not convertible on every torch version.
        ground_truth_tensor = torch.from_numpy(window.astype(np.int64)).unsqueeze(0).to(device)
        prompt_tensor = ground_truth_tensor[:, :prompt_length]
        _, ppl, trial_entropy = model.generate_and_compute_perplexity(
            prompt_tensor, ground_truth_tensor,
            temperature=temperature,
            return_attn_entropy=True, aggregate_heads=True
        )
        perplexities.append(ppl)
        entropy_trials.append(trial_entropy)
        print(f"Trial {trial+1}/{num_trials} for prompt length {prompt_length}: Perplexity = {ppl:.2f}, Avg Entropy = {trial_entropy:.4f}")
    avg_ppl = np.mean(perplexities)
    avg_entropy = np.mean(entropy_trials)
    print(f"Prompt Length {prompt_length} - Avg Perplexity: {avg_ppl:.2f}, Avg Attention Entropy: {avg_entropy:.4f}\n")
    return avg_ppl, avg_entropy
+# ----------------------------
+# Training Loop
+# ----------------------------
+# Training hyperparameters
# Optimisation settings.
batch_size = 12
max_iters = 25001
save_interval = 5000
learning_rate = 6e-4
weight_decay = 1e-1
grad_clip = 1.0
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
# Seed before the model is built so weight initialisation is reproducible.
torch.manual_seed(1337)
# Model configuration: flip the positional-encoding flags as needed.
model_args = {
    "n_layer": 6,
    "n_head": 6,
    "n_embd": 384,
    "block_size": 64,    # Training context length.
    "bias": False,
    "use_rope": True,
    "use_ape": True,     # APE attention modification.
    "use_alibi": False,  # ALiBi; cannot be combined with APE.
    "rope_base": 10000.0,
    "vocab_size": 50304,
    "dropout": 0.0,
}
gptconf = GPTConfig(**model_args)
model = GPT(gptconf).to(device)
model.train()
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=learning_rate,
    weight_decay=weight_decay,
)
iter_num = 0
start_time = time.time()
# Per-iteration loss histories and the iterations at which checkpoints were written.
training_losses = []
validation_losses = []
save_iters = []
# Tag output names with the active positional schemes, e.g. "rope_ape" or "alibi".
flag_parts = [
    label
    for label, enabled in (
        ("rope", gptconf.use_rope),
        ("ape", gptconf.use_ape),
        ("alibi", gptconf.use_alibi),
    )
    if enabled
]
flag_str = "_".join(flag_parts) if flag_parts else "none"
weight_dir = f"weights_{flag_str}_{gptconf.block_size}"
os.makedirs(weight_dir, exist_ok=True)
while iter_num < max_iters:
    # --- one optimisation step on a fresh training batch ---
    X_train, Y_train = get_batch('train')
    optimizer.zero_grad()
    logits, loss_train = model(X_train, Y_train)
    loss_train.backward()
    if grad_clip > 0:
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
    optimizer.step()
    training_losses.append(loss_train.item())

    # --- validation loss on one batch, evaluated every iteration ---
    model.eval()
    X_val, Y_val = get_batch('val')
    with torch.no_grad():
        logits_val, loss_val = model(X_val, Y_val)
    validation_losses.append(loss_val.item())
    model.train()

    # --- progress report every 100 iterations ---
    if iter_num % 100 == 0:
        elapsed = time.time() - start_time
        print(f"Iter {iter_num:5d}: train loss = {loss_train.item():.4f}, val loss = {loss_val.item():.4f}, time/iter = {elapsed/(iter_num+1):.4f}s")

    # --- periodic checkpoint (never at iteration 0) ---
    checkpoint_due = iter_num > 0 and iter_num % save_interval == 0
    if checkpoint_due:
        save_iters.append(iter_num)
        ckpt_path = os.path.join(weight_dir, f"ckpt_{iter_num}.pt")
        ckpt = dict(
            iter_num=iter_num,
            model_state_dict=model.state_dict(),
            optimizer_state_dict=optimizer.state_dict(),
            training_losses=training_losses,
            validation_losses=validation_losses,
            save_iters=save_iters,
        )
        torch.save(ckpt, ckpt_path)
        print(f"Checkpoint saved to {ckpt_path}")
    iter_num += 1
print("Training complete.")

# Loss curves over all iterations.
plt.figure(figsize=(10, 6))
for history, label, extra in (
    (training_losses, "Training Loss", {}),
    (validation_losses, "Validation Loss", {"alpha": 0.7}),
):
    plt.plot(range(len(history)), history, label=label, **extra)
plt.xlabel("Iteration")
plt.ylabel("Loss")
plt.title("Training and Validation Loss per Iteration")
plt.legend()
plt.grid(True)
plt.show()
+# ----------------------------
+# Perplexity & Entropy Evaluation
+# ----------------------------
# Evaluate on the validation tokens at increasingly long prompts.
token_file = val_data_path
prompt_lengths = [64, 128, 256, 512, 1024, 2048, 4096, 8192]
num_trials = 5
generation_params = {"temperature": 1.0, "max_new_tokens": 50}
avg_perplexities = []
avg_entropies = []
for pl in prompt_lengths:
    print(f"Evaluating for prompt length: {pl}")
    avg_ppl, avg_entropy = evaluate_prompt_perplexity(
        model, token_file, pl, num_trials, generation_params, device
    )
    avg_perplexities.append(avg_ppl)
    avg_entropies.append(avg_entropy)

# Persist the raw numbers next to the figures.
results = dict(
    prompt_lengths=prompt_lengths,
    avg_perplexities=avg_perplexities,
    avg_entropies=avg_entropies,
)
results_filename = f"results_{flag_str}_{gptconf.block_size}.json"
with open(results_filename, "w") as f:
    json.dump(results, f)
print(f"Results saved to {results_filename}")


def _plot_vs_prompt_length(values, ylabel, title, filename, **line_style):
    """Draw one metric against prompt length on a log x-axis, save it, then show it."""
    plt.figure(figsize=(8, 6))
    plt.plot(prompt_lengths, values, marker='o', **line_style)
    plt.xlabel("Prompt Length")
    plt.ylabel(ylabel)
    plt.title(title)
    plt.grid(True)
    plt.xscale('log')
    plt.savefig(filename)
    plt.show()


_plot_vs_prompt_length(
    avg_perplexities,
    "Avg Generated Perplexity",
    "Avg Generated Perplexity vs Prompt Length",
    f"avg_generated_perplexity_{flag_str}_{gptconf.block_size}.png",
)
_plot_vs_prompt_length(
    avg_entropies,
    "Avg Attention Entropy",
    "Avg Attention Entropy vs Prompt Length\n(Averaged over Layers)",
    f"avg_attention_entropy_{flag_str}_{gptconf.block_size}.png",
    color='red',
)

config.json ADDED Viewed

	@@ -0,0 +1,15 @@

+{
+    "model_type": "custom",
+    "architectures": ["APE"],
+    "bias" : "False",
+    "use_rope" : "True",
+    "use_ape" : "True",
+    "use_alibi" : "False",
+    "n_layer": 6,
+    "n_head": 6,
+    "n_embd": 384,
+    "block_size": 64,
+    "vocab_size": 50304,
+    "rope_base": 10000.0,
+    "dropout" : 0
+  }

pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6ed529ea1054b5d58bc32ab9f3a1ff524cc3ade9788d909260aefcf144ab4c40
+size 359869364