Upload app.py with huggingface_hub
app.py CHANGED

@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 """
-OpenLLM Real Models App -
+OpenLLM Real Models App - Ultimate fixed version with correct attention bias handling
 """
 
 import gradio as gr

@@ -133,7 +133,7 @@ class Block(nn.Module):
         return x
 
 class CausalSelfAttention(nn.Module):
-    """Multi-head self-attention with causal masking -
+    """Multi-head self-attention with causal masking - ULTIMATE FIX"""
     def __init__(self, config):
         super().__init__()
         assert config.n_embd % config.n_head == 0

@@ -146,11 +146,15 @@ class CausalSelfAttention(nn.Module):
         self.dropout = config.dropout
         self.bias = config.bias
 
-        #
+        # REGISTER THE ATTENTION BIAS as a buffer (not parameter) to match saved model
+        # This is actually an attention mask, not a learnable bias
         if config.bias:
-
+            # Create a causal attention mask buffer
+            mask = torch.tril(torch.ones(config.block_size, config.block_size))
+            mask = mask.view(1, 1, config.block_size, config.block_size)
+            self.register_buffer('bias', mask)
         else:
-            self.
+            self.register_buffer('bias', None)
 
     def forward(self, x):
         B, T, C = x.size()

@@ -161,17 +165,19 @@ class CausalSelfAttention(nn.Module):
         q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
         v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
 
-        # Causal self-attention
-
+        # Causal self-attention using the bias mask
+        if self.bias is not None:
+            # Use the causal mask
+            attn_mask = self.bias[:, :, :T, :T]
+            y = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask, dropout_p=self.dropout if self.training else 0, is_causal=False)
+        else:
+            # Use built-in causal attention
+            y = F.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=self.dropout if self.training else 0, is_causal=True)
+
         y = y.transpose(1, 2).contiguous().view(B, T, C)
 
         # Output projection
         y = self.resid_dropout(self.c_proj(y))
-
-        # Add the bias if it exists
-        if self.bias is not None:
-            y = y + self.bias
-
         return y
 
 class MLP(nn.Module):
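Why the change works, as a minimal self-contained sketch (hypothetical Config values and a trimmed-down module, not the Space's actual GPTConfig or full app.py): registering the causal mask under the name 'bias' with register_buffer puts it in state_dict(), so a checkpoint saved with an attn.bias entry loads without missing/unexpected-key errors, while the forward pass slices the stored mask to the current sequence length before calling F.scaled_dot_product_attention.

from dataclasses import dataclass

import torch
import torch.nn as nn
import torch.nn.functional as F


@dataclass
class Config:
    # Hypothetical toy values for illustration only
    block_size: int = 8
    n_embd: int = 16
    n_head: int = 4
    dropout: float = 0.0
    bias: bool = True


class CausalSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
        self.resid_dropout = nn.Dropout(config.dropout)
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        self.dropout = config.dropout
        # Registering the mask as a buffer named 'bias' puts it in state_dict(),
        # so it lines up with a checkpoint that contains an 'attn.bias' entry.
        mask = torch.tril(torch.ones(config.block_size, config.block_size))
        self.register_buffer('bias', mask.view(1, 1, config.block_size, config.block_size))

    def forward(self, x):
        B, T, C = x.size()
        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        # Slice the stored 0/1 mask to the current sequence length and pass it to SDPA
        # as a boolean mask (True = may attend).
        attn_mask = self.bias[:, :, :T, :T].bool()
        y = F.scaled_dot_product_attention(
            q, k, v, attn_mask=attn_mask,
            dropout_p=self.dropout if self.training else 0, is_causal=False)
        y = y.transpose(1, 2).contiguous().view(B, T, C)
        return self.resid_dropout(self.c_proj(y))


cfg = Config()
attn = CausalSelfAttention(cfg)
print('bias' in attn.state_dict())                    # True: the mask travels with the checkpoint
attn.load_state_dict(attn.state_dict(), strict=True)  # no missing/unexpected keys
out = attn(torch.randn(2, cfg.block_size, cfg.n_embd))
print(out.shape)                                      # torch.Size([2, 8, 16])

Two details the sketch handles explicitly: register_buffer refuses a name that already exists as a plain attribute, so the sketch keeps only the buffer under the name 'bias'; and scaled_dot_product_attention treats a floating-point attn_mask as an additive score offset rather than a mask, so the 0/1 buffer is converted to bool before being passed in.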