Upload folder using huggingface_hub
modeling_deepseek.py  +6 -2
modeling_deepseek.py  CHANGED
@@ -1374,9 +1374,13 @@ class LatentTransformerLayer(nn.Module):
     def __init__(self, config, dropout=0.0):
         super().__init__()
         self.norm1 = DeepseekRMSNorm(config.hidden_size)
-
+        original_config_dict = config.to_dict()
+        original_config_dict.pop("num_key_value_heads", None)
+        original_config_dict.pop("attention_bias", None)
+        original_config_dict.pop("_attn_implementation", None)
+        # Correctly initialize DeepseekAttention with modified config using the filtered dictionary
         attn_config = DeepseekConfig(
-            **
+            **original_config_dict,
             num_key_value_heads=config.num_attention_heads,  # Force Multi-Head Attention
             attention_bias=config.attention_bias,
             _attn_implementation="eager"  # Disable SDPA for stability
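
The added lines build a filtered copy of the config dictionary so that the keys passed explicitly to DeepseekConfig are not also supplied through the ** expansion. A minimal, self-contained sketch of the Python failure mode this appears to avoid, using a plain stand-in function rather than the actual DeepseekConfig class (an assumption for illustration only):

# Stand-in sketch, not the DeepSeek code: passing a key both inside a
# **-expanded dict and as an explicit keyword argument raises TypeError,
# which is why the overridden keys are popped from the dict first.

def make_config(**kwargs):
    # Hypothetical stand-in for DeepseekConfig(**kwargs); just echoes kwargs.
    return kwargs

base = {"hidden_size": 4096, "num_key_value_heads": 8, "attention_bias": False}

try:
    # Old pattern: the unfiltered dict still carries num_key_value_heads,
    # so the explicit override collides with the ** expansion.
    make_config(**base, num_key_value_heads=32)
except TypeError as exc:
    print(exc)  # ... got multiple values for keyword argument 'num_key_value_heads'

# New pattern: drop the keys that will be set explicitly, then expand.
filtered = dict(base)
for key in ("num_key_value_heads", "attention_bias", "_attn_implementation"):
    filtered.pop(key, None)

cfg = make_config(
    **filtered,
    num_key_value_heads=32,        # force Multi-Head Attention, as in the diff
    attention_bias=False,
    _attn_implementation="eager",  # disable SDPA, as in the diff
)
print(cfg["num_key_value_heads"])  # 32

Popping with a default of None keeps the pattern safe even when one of the keys is absent from the serialized config.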