lemms committed
Commit 7b5eb87 · verified · 1 parent: 02c7565

Upload app.py with huggingface_hub
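
For context, commits titled "Upload app.py with huggingface_hub" are typically produced through the huggingface_hub upload API rather than by git directly. A minimal sketch using HfApi.upload_file; the repo_id below is a placeholder, not taken from this page:

    from huggingface_hub import HfApi

    api = HfApi()  # uses the locally cached token by default
    api.upload_file(
        path_or_fileobj="app.py",
        path_in_repo="app.py",
        repo_id="user/space-name",  # placeholder; the actual repo is not shown here
        repo_type="space",
        commit_message="Upload app.py with huggingface_hub",
    )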

Files changed (1):
  app.py: +18 -12
app.py CHANGED
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 """
-OpenLLM Real Models App - Final fixed version with exact bias configuration
+OpenLLM Real Models App - Ultimate fixed version with correct attention bias handling
 """
 
 import gradio as gr
@@ -133,7 +133,7 @@ class Block(nn.Module):
         return x
 
 class CausalSelfAttention(nn.Module):
-    """Multi-head self-attention with causal masking - EXACT bias configuration"""
+    """Multi-head self-attention with causal masking - ULTIMATE FIX"""
     def __init__(self, config):
         super().__init__()
         assert config.n_embd % config.n_head == 0
@@ -146,11 +146,15 @@ class CausalSelfAttention(nn.Module):
         self.dropout = config.dropout
         self.bias = config.bias
 
-        # ADD THE BIAS PARAMETER that the saved model expects
+        # REGISTER THE ATTENTION BIAS as a buffer (not parameter) to match saved model
+        # This is actually an attention mask, not a learnable bias
         if config.bias:
-            self.bias = nn.Parameter(torch.zeros(config.n_embd))
+            # Create a causal attention mask buffer
+            mask = torch.tril(torch.ones(config.block_size, config.block_size))
+            mask = mask.view(1, 1, config.block_size, config.block_size)
+            self.register_buffer('bias', mask)
         else:
-            self.register_parameter('bias', None)
+            self.register_buffer('bias', None)
 
     def forward(self, x):
         B, T, C = x.size()
@@ -161,17 +165,19 @@ class CausalSelfAttention(nn.Module):
         q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
         v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
 
-        # Causal self-attention
-        y = F.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=self.dropout if self.training else 0, is_causal=True)
+        # Causal self-attention using the bias mask
+        if self.bias is not None:
+            # Use the causal mask
+            attn_mask = self.bias[:, :, :T, :T]
+            y = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask, dropout_p=self.dropout if self.training else 0, is_causal=False)
+        else:
+            # Use built-in causal attention
+            y = F.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=self.dropout if self.training else 0, is_causal=True)
+
         y = y.transpose(1, 2).contiguous().view(B, T, C)
 
         # Output projection
         y = self.resid_dropout(self.c_proj(y))
-
-        # Add the bias if it exists
-        if self.bias is not None:
-            y = y + self.bias
-
         return y
 
 class MLP(nn.Module):
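
One caveat with the new forward path: F.scaled_dot_product_attention treats a floating-point attn_mask as an additive bias on the attention scores, and torch.tril(torch.ones(...)) produces a float tensor, so a mask of ones and zeros would not actually block any positions. A boolean mask (True = may attend) gives the intended causal behavior and matches the built-in is_causal=True path. A minimal standalone sketch, with shapes chosen only for illustration:

    import torch
    import torch.nn.functional as F

    block_size = 8
    # Boolean causal mask: True on and below the diagonal means "may attend".
    mask = torch.tril(torch.ones(block_size, block_size, dtype=torch.bool))
    mask = mask.view(1, 1, block_size, block_size)  # broadcasts over (batch, heads)

    B, n_head, T, head_dim = 2, 4, 8, 16
    q, k, v = (torch.randn(B, n_head, T, head_dim) for _ in range(3))

    # Explicit boolean mask vs. the built-in causal path: the results agree.
    y_masked = F.scaled_dot_product_attention(q, k, v, attn_mask=mask[:, :, :T, :T])
    y_causal = F.scaled_dot_product_attention(q, k, v, is_causal=True)
    print(torch.allclose(y_masked, y_causal, atol=1e-6))  # True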
 
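A second caveat, assuming standard nn.Module semantics: __init__ assigns self.bias = config.bias (a plain bool) before calling register_buffer('bias', ...), and register_buffer raises a KeyError when an attribute of that name already exists. Storing the flag under a different name sidesteps the clash while still exposing a 'bias' buffer in the state_dict, which is what the saved checkpoint expects. A minimal sketch; the module and attribute names here are illustrative, not from app.py:

    import torch
    import torch.nn as nn

    class CausalMaskDemo(nn.Module):
        def __init__(self, block_size: int, bias: bool):
            super().__init__()
            self.use_bias = bias  # flag kept under a non-conflicting name
            if bias:
                mask = torch.tril(torch.ones(block_size, block_size, dtype=torch.bool))
                # 'bias' is still free here, so the buffer registers cleanly;
                # assigning self.bias = bias first would make this raise KeyError.
                self.register_buffer('bias', mask.view(1, 1, block_size, block_size))
            else:
                self.register_buffer('bias', None)

    m = CausalMaskDemo(block_size=8, bias=True)
    print('bias' in dict(m.named_buffers()))  # True: the mask appears as a buffer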