microsoft
/

Phi-3-mini-128k-instruct

Text Generation

text-generation-inference

Model card Files Files and versions

sylwia-kuros committed on Jul 1, 2025

Commit

dd769b5

·

verified ·

1 Parent(s): c4a8a23

Update modeling_phi3.py

Files changed (1) hide show

modeling_phi3.py +1 -5

modeling_phi3.py CHANGED Viewed

@@ -1294,10 +1294,6 @@ class Phi3ForCausalLM(Phi3PreTrainedModel):
                 cache_length = past_length = past_key_values[0][0].shape[2]
                 max_cache_length = None
-            else:
-                cache_length = past_length = past_key_values[0][0].shape[2]
-                max_cache_length = None
             # Keep only the unprocessed tokens:
             # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
             # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
@@ -1564,4 +1560,4 @@ class Phi3ForTokenClassification(Phi3PreTrainedModel):
             logits=logits,
             hidden_states=model_outputs.hidden_states,
             attentions=model_outputs.attentions,
-        )

                 cache_length = past_length = past_key_values[0][0].shape[2]
                 max_cache_length = None
             # Keep only the unprocessed tokens:
             # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
             # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
             logits=logits,
             hidden_states=model_outputs.hidden_states,
             attentions=model_outputs.attentions,
+        )