eyad-silx
/

llm

Model card Files and versions

xet

Community

eyad-silx committed on Jan 1, 2025

Commit

5556482

verified ·

1 Parent(s): 200f2a8

Update model_baseline.py

Browse files

Files changed (1) hide show

model_baseline.py +64 -32

model_baseline.py CHANGED Viewed

@@ -16,7 +16,7 @@ class CausalSelfAttention(nn.Module):
         self.n_head = config.n_head
         self.n_embd = config.n_embd
         self.dropout = config.dropout
-        self.block_size = config.block_size
         # Key, Query, Value projections
         self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
@@ -26,30 +26,46 @@ class CausalSelfAttention(nn.Module):
         self.attn_dropout = nn.Dropout(config.dropout)
         self.resid_dropout = nn.Dropout(config.dropout)
-        # Flash attention style computation
-        self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size))
-                                        .view(1, 1, config.block_size, config.block_size))
     def forward(self, x):
-        B, T, C = x.size()
         # Calculate query, key, values
-        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
         k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
         q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
         v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
-        # Causal self-attention
-        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
-        att = att.masked_fill(self.bias[:,:,:T,:T] == 0, float('-inf'))
-        att = F.softmax(att, dim=-1)
-        att = self.attn_dropout(att)
-        y = att @ v
-        # Re-assemble all head outputs side by side
         y = y.transpose(1, 2).contiguous().view(B, T, C)
-        # Output projection
         y = self.resid_dropout(self.c_proj(y))
         return y
@@ -98,12 +114,17 @@ class BaselineTransformer(nn.Module):
         # Report number of parameters
         print("number of parameters: %.2fM" % (self.get_num_params()/1e6,))
-    def get_num_params(self, non_embedding=True):
-        n_params = sum(p.numel() for p in self.parameters())
-        if non_embedding:
-            n_params -= self.transformer.wpe.weight.numel()
-        return n_params
     def _init_weights(self, module):
         if isinstance(module, nn.Linear):
@@ -116,28 +137,33 @@ class BaselineTransformer(nn.Module):
     def forward(self, idx, targets=None):
         device = idx.device
         b, t = idx.size()
-        assert t <= self.config.block_size, f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
-        pos = torch.arange(0, t, dtype=torch.long, device=device)
-        # Forward the GPT model itself
         tok_emb = self.transformer.wte(idx)
         pos_emb = self.transformer.wpe(pos)
         x = self.transformer.drop(tok_emb + pos_emb)
-        for block in self.transformer.h:
-            x = block(x)
         x = self.transformer.ln_f(x)
-        # Get logits and compute loss
         logits = self.lm_head(x)
         loss = None
         if targets is not None:
-            # Calculate loss directly in BPC
-            B, T, C = logits.shape
-            logits = logits.view(B*T, C)
-            targets = targets.view(B*T)
-            loss = F.cross_entropy(logits, targets) * math.log2(math.e)
         return logits, loss
@@ -167,3 +193,9 @@ class BaselineTransformer(nn.Module):
             idx = torch.cat((idx, idx_next), dim=1)
         return idx

         self.n_head = config.n_head
         self.n_embd = config.n_embd
         self.dropout = config.dropout
+        self.head_size = config.n_embd // config.n_head
         # Key, Query, Value projections
         self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
         self.attn_dropout = nn.Dropout(config.dropout)
         self.resid_dropout = nn.Dropout(config.dropout)
+        # Flash attention optimization if available
+        self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')
+        if not self.flash:
+            print("WARNING: Flash Attention not available, using manual attention")
+            # Manual causal mask
+            self.register_buffer(
+                "bias",
+                torch.tril(torch.ones(config.block_size, config.block_size))
+                .view(1, 1, config.block_size, config.block_size)
+            )
     def forward(self, x):
+        B, T, C = x.size() # batch size, sequence length, embedding dimensionality
         # Calculate query, key, values
+        q, k, v  = self.c_attn(x).split(self.n_embd, dim=2)
         k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
         q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
         v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
+        # Causal self-attention with memory optimization
+        if self.flash:
+            # Use flash attention if available (faster and more memory efficient)
+            with torch.backends.cuda.sdp_kernel(enable_flash=True):
+                y = torch.nn.functional.scaled_dot_product_attention(
+                    q, k, v,
+                    attn_mask=None,
+                    dropout_p=self.dropout if self.training else 0,
+                    is_causal=True
+                )
+        else:
+            # Manual attention
+            att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
+            att = att.masked_fill(self.bias[:,:,:T,:T] == 0, float('-inf'))
+            att = F.softmax(att, dim=-1)
+            att = self.attn_dropout(att)
+            y = att @ v
+        # Reshape and project back
         y = y.transpose(1, 2).contiguous().view(B, T, C)
         y = self.resid_dropout(self.c_proj(y))
         return y
         # Report number of parameters
         print("number of parameters: %.2fM" % (self.get_num_params()/1e6,))
+        # Gradient checkpointing flag
+        self.gradient_checkpointing = False
+    def gradient_checkpointing_enable(self):
+        """Enable gradient checkpointing for memory efficiency"""
+        self.gradient_checkpointing = True
+    def gradient_checkpointing_disable(self):
+        """Disable gradient checkpointing"""
+        self.gradient_checkpointing = False
     def _init_weights(self, module):
         if isinstance(module, nn.Linear):
     def forward(self, idx, targets=None):
         device = idx.device
         b, t = idx.size()
+        # Token and position embeddings
         tok_emb = self.transformer.wte(idx)
+        pos = torch.arange(0, t, dtype=torch.long, device=device).unsqueeze(0)
         pos_emb = self.transformer.wpe(pos)
+        # Add embeddings and apply dropout
         x = self.transformer.drop(tok_emb + pos_emb)
+        # Apply transformer blocks with optional gradient checkpointing
+        if self.gradient_checkpointing and self.training:
+            for block in self.transformer.h:
+                x = torch.utils.checkpoint.checkpoint(block, x)
+        else:
+            for block in self.transformer.h:
+                x = block(x)
         x = self.transformer.ln_f(x)
+        # Language model head
         logits = self.lm_head(x)
+        # Loss calculation (in BPC)
         loss = None
         if targets is not None:
+            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
+            loss = loss / math.log(2)  # Convert to BPC
         return logits, loss
             idx = torch.cat((idx, idx_next), dim=1)
         return idx
+    def get_num_params(self, non_embedding=True):
+        n_params = sum(p.numel() for p in self.parameters())
+        if non_embedding:
+            n_params -= self.transformer.wpe.weight.numel()
+        return n_params