HyperAccel committed
Commit aea997a · verified · 1 Parent(s): 082cab5

Upload tiny-random deepseek_v32 model

Files changed (2)
  1. model.safetensors +1 -1
  2. modeling_deepseek_v32.py +5 -9
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:03da7356ad6e835009b261e79b42873a223209903e8ef7d4a69993faee6e3a2f
+oid sha256:e7630389ac118c34d12846521ff5102b4ba0b97fa733ad63eb780c38aed731f0
 size 545819392
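Only the LFS pointer changes here: a new blob SHA-256, same size (545,819,392 bytes). Not part of the commit, but as a minimal sketch, a locally downloaded model.safetensors can be checked against the updated pointer like this; the local path is an assumption:

```python
# Sketch: verify a local model.safetensors against the updated LFS pointer.
# The file path is an assumption; point it at wherever the blob was downloaded.
import hashlib
import os

EXPECTED_OID = "e7630389ac118c34d12846521ff5102b4ba0b97fa733ad63eb780c38aed731f0"
EXPECTED_SIZE = 545819392
path = "model.safetensors"  # assumed local path

sha256 = hashlib.sha256()
with open(path, "rb") as f:
    # Hash in 1 MiB chunks to avoid loading the whole ~545 MB file at once.
    for chunk in iter(lambda: f.read(1 << 20), b""):
        sha256.update(chunk)

print("size ok:", os.path.getsize(path) == EXPECTED_SIZE)
print("oid ok:", sha256.hexdigest() == EXPECTED_OID)
```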
modeling_deepseek_v32.py CHANGED
@@ -796,7 +796,7 @@ class DeepseekV32Attention(nn.Module):
                     "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
                     "with a layer index."
                 )
-            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
+            kv_seq_len += past_key_value.get_seq_length(self.layer_idx)
         cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
 
         q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids)
@@ -930,7 +930,7 @@ class DeepseekV32FlashAttention2(DeepseekV32Attention):
 
         kv_seq_len = value_states.shape[-2]
         if past_key_value is not None:
-            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
+            kv_seq_len += past_key_value.get_seq_length(self.layer_idx)
 
         cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
         q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids)
@@ -1430,8 +1430,8 @@ class DeepseekV32Model(DeepseekV32PreTrainedModel):
         if use_cache:
             use_legacy_cache = not isinstance(past_key_values, Cache)
             if use_legacy_cache:
-                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
-            past_key_values_length = past_key_values.get_usable_length(seq_length)
+                past_key_values = DynamicCache()
+            past_key_values_length = past_key_values.get_seq_length()
 
         if position_ids is None:
             device = input_ids.device if input_ids is not None else inputs_embeds.device
@@ -1499,11 +1499,7 @@ class DeepseekV32Model(DeepseekV32PreTrainedModel):
 
         next_cache = None
         if use_cache:
-            next_cache = (
-                next_decoder_cache.to_legacy_cache()
-                if use_legacy_cache
-                else next_decoder_cache
-            )
+            next_cache = next_decoder_cache
         if not return_dict:
             return tuple(
                 v
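The diff swaps the legacy cache helpers (get_usable_length, DynamicCache.from_legacy_cache, to_legacy_cache) for the plain DynamicCache API (empty constructor plus get_seq_length). As a minimal sketch of that API, assuming a recent transformers release where transformers.cache_utils.DynamicCache exposes get_seq_length and update:

```python
# Sketch (assumption: recent `transformers` with DynamicCache.get_seq_length/update).
# Illustrates the cache calls the updated modeling code relies on.
import torch
from transformers.cache_utils import DynamicCache

past_key_values = DynamicCache()           # new path: start from an empty cache
print(past_key_values.get_seq_length())    # 0 before any tokens are cached

# Caching one layer's key/value states grows the tracked sequence length,
# which is the offset added to kv_seq_len in the attention modules above.
k = torch.zeros(1, 2, 4, 8)                # (batch, num_heads, seq_len, head_dim)
v = torch.zeros(1, 2, 4, 8)
past_key_values.update(k, v, layer_idx=0)
print(past_key_values.get_seq_length(0))   # 4 cached positions for layer 0
```

With this API the cache object itself is returned as next_cache, so no conversion back to the legacy tuple format is needed.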