Upload tiny-random deepseek_v32 model
Files changed:
- model.safetensors +1 -1
- modeling_deepseek_v32.py +5 -9
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:e7630389ac118c34d12846521ff5102b4ba0b97fa733ad63eb780c38aed731f0
 size 545819392
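The pointer above follows the git-lfs v1 pointer spec: a `version` line, an `oid sha256:<hex digest>` line, and a `size` line giving the blob's byte count. As a minimal sketch (not part of this commit; file paths are hypothetical), a local blob can be checked against such a pointer like so:

```python
import hashlib
import os

def verify_lfs_pointer(pointer_path: str, blob_path: str) -> bool:
    """Parse a git-lfs pointer file and check a local blob against it."""
    fields = {}
    with open(pointer_path) as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    digest = hashlib.sha256()
    with open(blob_path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    expected_oid = fields["oid"].split(":", 1)[1]  # strip the "sha256:" prefix
    return (digest.hexdigest() == expected_oid
            and os.path.getsize(blob_path) == int(fields["size"]))

# e.g. verify_lfs_pointer("model.safetensors.pointer", "model.safetensors")  # hypothetical paths
```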
modeling_deepseek_v32.py CHANGED
@@ -796,7 +796,7 @@ class DeepseekV32Attention(nn.Module):
                     "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
                     "with a layer index."
                 )
-            kv_seq_len += past_key_value.
+            kv_seq_len += past_key_value.get_seq_length(self.layer_idx)
         cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
 
         q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids)
@@ -930,7 +930,7 @@ class DeepseekV32FlashAttention2(DeepseekV32Attention):
 
         kv_seq_len = value_states.shape[-2]
         if past_key_value is not None:
-            kv_seq_len += past_key_value.
+            kv_seq_len += past_key_value.get_seq_length(self.layer_idx)
 
         cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
         q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids)
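Both attention hunks settle on `Cache.get_seq_length(layer_idx)`, which reports how many tokens are already cached for a given layer. A minimal sketch of the pattern, assuming the `transformers` Cache API (tensor shapes invented for illustration):

```python
import torch
from transformers.cache_utils import DynamicCache

past_key_value = DynamicCache()
# Pretend an earlier decoding step cached 7 tokens for layer 0;
# shapes are (batch, num_kv_heads, seq_len, head_dim).
past_key_value.update(torch.randn(1, 2, 7, 16), torch.randn(1, 2, 7, 16), layer_idx=0)

# The pattern used in the updated attention code:
value_states = torch.randn(1, 2, 1, 16)  # the single new token of this step
kv_seq_len = value_states.shape[-2]
if past_key_value is not None:
    kv_seq_len += past_key_value.get_seq_length(0)

print(kv_seq_len)  # 8: one new token plus seven cached ones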
@@ -1430,8 +1430,8 @@ class DeepseekV32Model(DeepseekV32PreTrainedModel):
         if use_cache:
             use_legacy_cache = not isinstance(past_key_values, Cache)
             if use_legacy_cache:
-                past_key_values = DynamicCache
-            past_key_values_length = past_key_values.
+                past_key_values = DynamicCache()
+            past_key_values_length = past_key_values.get_seq_length()
 
         if position_ids is None:
             device = input_ids.device if input_ids is not None else inputs_embeds.device
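After this change, any `past_key_values` input that is not already a `Cache` instance is replaced by a fresh `DynamicCache`, whose reported length starts at zero. A small sketch of the new branch in isolation, assuming the `transformers` Cache API:

```python
from transformers.cache_utils import Cache, DynamicCache

past_key_values = None  # e.g. the first forward pass
use_cache = True

if use_cache:
    use_legacy_cache = not isinstance(past_key_values, Cache)
    if use_legacy_cache:
        past_key_values = DynamicCache()
    past_key_values_length = past_key_values.get_seq_length()

print(past_key_values_length)  # 0: a fresh cache holds no tokens yet
```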
@@ -1499,11 +1499,7 @@ class DeepseekV32Model(DeepseekV32PreTrainedModel):
 
         next_cache = None
         if use_cache:
-            next_cache = (
-                next_decoder_cache.to_legacy_cache()
-                if use_legacy_cache
-                else next_decoder_cache
-            )
+            next_cache = next_decoder_cache
         if not return_dict:
             return tuple(
                 v
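With the `to_legacy_cache()` conversion dropped, callers receive the `Cache` object itself and can pass it straight back in as `past_key_values` on the next step. A minimal sketch of the new return path, assuming the `transformers` Cache API:

```python
from transformers.cache_utils import DynamicCache

next_decoder_cache = DynamicCache()
use_cache = True

next_cache = None
if use_cache:
    next_cache = next_decoder_cache  # returned as-is, no legacy-tuple round trip

assert isinstance(next_cache, DynamicCache)
# The caller can feed next_cache back in as past_key_values on the following call.
```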
|