jetuned committed (verified)
Commit d4fec2f · 1 Parent(s): f5e4dac

Upload folder using huggingface_hub

Files changed (1):
  modeling_deepseek.py +6 -2
modeling_deepseek.py CHANGED
@@ -1374,9 +1374,13 @@ class LatentTransformerLayer(nn.Module):
     def __init__(self, config, dropout=0.0):
         super().__init__()
         self.norm1 = DeepseekRMSNorm(config.hidden_size)
-        # Correctly initialize DeepseekAttention with modified config
+        original_config_dict = config.to_dict()
+        original_config_dict.pop("num_key_value_heads", None)
+        original_config_dict.pop("attention_bias", None)
+        original_config_dict.pop("_attn_implementation", None)
+        # Correctly initialize DeepseekAttention with modified config using the filtered dictionary
         attn_config = DeepseekConfig(
-            **config.to_dict(),
+            **original_config_dict,
             num_key_value_heads=config.num_attention_heads,  # Force Multi-Head Attention
             attention_bias=config.attention_bias,
             _attn_implementation="eager"  # Disable SDPA for stability
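
Why the change: config.to_dict() already contains num_key_value_heads, attention_bias, and _attn_implementation, so unpacking it into DeepseekConfig(...) alongside the explicit overrides passes each of those keywords twice and raises "TypeError: got multiple values for keyword argument". Popping the keys before unpacking removes the collision. Below is a minimal sketch of the failure mode and the fix; DemoConfig is a hypothetical stand-in for DeepseekConfig, not the real transformers API.

# Hypothetical stand-in for DeepseekConfig; only the keyword-collision
# behavior that this commit fixes is illustrated here.
class DemoConfig:
    def __init__(self, hidden_size=1024, num_attention_heads=32,
                 num_key_value_heads=8, attention_bias=False,
                 _attn_implementation="sdpa"):
        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.attention_bias = attention_bias
        self._attn_implementation = _attn_implementation

    def to_dict(self):
        # Simplified serialization; the real PretrainedConfig.to_dict()
        # is more involved, but also returns these keys.
        return dict(self.__dict__)

config = DemoConfig()

# Before the fix: to_dict() already holds num_key_value_heads, so the
# explicit override duplicates the keyword and Python raises TypeError.
try:
    DemoConfig(**config.to_dict(),
               num_key_value_heads=config.num_attention_heads)
except TypeError as err:
    print(err)  # ... got multiple values for keyword argument 'num_key_value_heads'

# After the fix: drop the keys that will be overridden, then unpack.
filtered = config.to_dict()
for key in ("num_key_value_heads", "attention_bias", "_attn_implementation"):
    filtered.pop(key, None)

attn_config = DemoConfig(
    **filtered,
    num_key_value_heads=config.num_attention_heads,  # force MHA: KV heads == query heads
    attention_bias=config.attention_bias,
    _attn_implementation="eager",
)
print(attn_config.num_key_value_heads)  # 32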