maxholsman committed on
Commit
2cac9f5
·
verified ·
1 Parent(s): 83f939c

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. custom_generate/generate.py +152 -68
custom_generate/generate.py CHANGED
@@ -62,6 +62,10 @@ class GenerateDecoderOnlyOutput(ModelOutput):
62
  attentions: tuple[tuple[torch.FloatTensor]] | None = None
63
  hidden_states: tuple[tuple[torch.FloatTensor]] | None = None
64
  past_key_values: Cache | None = None
 
 
 
 
65
 
66
 
67
  @dataclass
@@ -77,6 +81,10 @@ class GenerateEncoderDecoderOutput(ModelOutput):
77
  cross_attentions: tuple[tuple[torch.FloatTensor]] | None = None
78
  decoder_hidden_states: tuple[tuple[torch.FloatTensor]] | None = None
79
  past_key_values: Cache | None = None
 
 
 
 
80
 
81
 
82
  def _split_model_outputs(outputs, new_outputs, cur_len, added_len, is_decoder_attention=False):
@@ -115,9 +123,16 @@ class RawLogitsCandidateGenerator(AssistedCandidateGenerator):
115
  """Initialize the custom candidate generator."""
116
  super().__init__(*args, **kwargs)
117
  # Initialize probs list if sklearn is available and confidence threshold is enabled
 
 
 
 
 
 
118
  if (
119
  is_sklearn_available()
120
- and self.assistant_generation_config.assistant_confidence_threshold
 
121
  ):
122
  if not hasattr(self, 'probs'):
123
  self.probs = []
@@ -149,9 +164,15 @@ class RawLogitsCandidateGenerator(AssistedCandidateGenerator):
149
  self.assistant_kwargs["past_key_values"] = assistant_output.past_key_values
150
 
151
  # Handle sklearn confidence threshold tracking (if enabled)
 
 
 
 
 
152
  if (
153
  is_sklearn_available()
154
- and self.assistant_generation_config.assistant_confidence_threshold
 
155
  and type(self) is RawLogitsCandidateGenerator
156
  ):
157
  scores_tensor = torch.cat(assistant_output.scores, dim=0)
@@ -181,7 +202,7 @@ def _speculative_sampling(
181
  is_done_candidate,
182
  candidate_logits_raw,
183
  fsd_threshold: float = 0.0,
184
- fsd_div_type: str = "kl"
185
  ):
186
  """
187
  Applies sampling as in the speculative decoding paper (https://huggingface.co/papers/2211.17192, algorithm 1). Returns
@@ -210,21 +231,24 @@ def _speculative_sampling(
210
  ).sum(dim=-1)
211
  elif fsd_div_type == "js":
212
 
213
- m = 0.5 * (cand_probs + target_probs[:, :-1, :]) # Mixture distribution
 
 
 
214
 
215
- # Compute KL(P || M) and KL(Q || M)
216
- kl_pm = kl_div(
217
- m.log().clamp(min=-1e10), # log-probabilities of mixture
218
- cand_probs, # probabilities of candidate
219
- reduction='none'
220
- )
221
- kl_qm = kl_div(
222
- m.log().clamp(min=-1e10), # log-probabilities of mixture
223
- target_probs[:, :-1, :], # probabilities of target
224
- reduction='none'
225
- )
226
 
227
- divs = 0.5 * (kl_pm + kl_qm).sum(dim=-1)
228
 
229
  elif fsd_div_type == "draft_tokens":
230
  draft_token_ids = new_candidate_input_ids # shape: (batch, candidate_length)
@@ -287,7 +311,8 @@ def _assisted_decoding(
287
  assistant_tokenizer: Optional["PreTrainedTokenizerBase"] = None,
288
  tokenizer: Optional["PreTrainedTokenizerBase"] = None,
289
  fsd_threshold: float = 0.0,
290
- fsd_div_type: str = "kl",
 
291
  **model_kwargs,
292
  ) -> Union[GenerateDecoderOnlyOutput, GenerateEncoderDecoderOutput, torch.LongTensor]:
293
  r"""
@@ -328,6 +353,14 @@ def _assisted_decoding(
328
  output_scores = generation_config.output_scores
329
  output_logits = generation_config.output_logits
330
  return_dict_in_generate = generation_config.return_dict_in_generate
 
 
 
 
 
 
 
 
331
 
332
  # init attention / hidden states / scores tuples
333
  scores = () if (return_dict_in_generate and output_scores) else None
@@ -417,6 +450,10 @@ def _assisted_decoding(
417
  fsd_threshold=fsd_threshold,
418
  fsd_div_type=fsd_div_type,
419
  )
 
 
 
 
420
 
421
  # Case 2: all other cases (originally from assisted generation) 👉 Compare the tokens selected from the
422
  # original model logits with the candidate tokens. We can keep the candidate tokens until the first
@@ -435,6 +472,11 @@ def _assisted_decoding(
435
  if is_done_candidate and n_matches == candidate_length:
436
  n_matches -= 1
437
  valid_tokens = selected_tokens[:, : n_matches + 1]
 
 
 
 
 
438
 
439
  # 4. Update variables according to the number of matching assistant tokens. Remember: the token generated
440
  # by the model after the last candidate match is also valid, as it is generated from a correct sequence.
@@ -518,32 +560,69 @@ def _assisted_decoding(
518
  candidate_generator.assistant_model.generation_config.num_assistant_tokens = (
519
  candidate_generator.num_assistant_tokens
520
  )
 
 
 
 
 
 
 
 
521
  if return_dict_in_generate:
522
  cache = None
523
  if any(cache_key in model_kwargs for cache_key in ALL_CACHE_NAMES):
524
  cache_key = next(cache_key for cache_key in ALL_CACHE_NAMES if cache_key in model_kwargs)
525
  cache = model_kwargs[cache_key]
 
526
  if model.config.is_encoder_decoder:
527
- return GenerateEncoderDecoderOutput(
528
- sequences=input_ids,
529
- scores=scores,
530
- logits=raw_logits,
531
- encoder_attentions=encoder_attentions,
532
- encoder_hidden_states=encoder_hidden_states,
533
- decoder_attentions=decoder_attentions,
534
- cross_attentions=cross_attentions,
535
- decoder_hidden_states=decoder_hidden_states,
536
- past_key_values=cache,
537
- )
 
538
  else:
539
- return GenerateDecoderOnlyOutput(
540
- sequences=input_ids,
541
- scores=scores,
542
- logits=raw_logits,
543
- attentions=decoder_attentions,
544
- hidden_states=decoder_hidden_states,
545
- past_key_values=cache,
546
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
547
  else:
548
  return input_ids
549
 
@@ -570,8 +649,12 @@ def generate(
570
  """
571
  # 1. Handle kwargs, `generation_config`, validate them and obtain generation mode
572
  # Extract custom parameters before validation (they're not standard generation config params)
 
 
 
573
  fsd_threshold = kwargs.pop("fsd_threshold", 0.0)
574
- fsd_div_type = kwargs.pop("fsd_div_type", "kl")
 
575
 
576
  generation_mode_kwargs = model._extract_generation_mode_kwargs(
577
  None, # custom_generate
@@ -583,6 +666,7 @@ def generate(
583
  # Add custom FSD parameters to generation_mode_kwargs so they're passed to _assisted_decoding
584
  generation_mode_kwargs["fsd_threshold"] = fsd_threshold
585
  generation_mode_kwargs["fsd_div_type"] = fsd_div_type
 
586
 
587
  # Check length values before updating the config with defaults
588
  has_default_max_length = kwargs.get("max_length") is None and (
@@ -830,47 +914,47 @@ def generate(
830
  # new_candidate_input_ids = candidate_input_ids[:, -candidate_length:]
831
  # correction_term = 0
832
 
833
- # if div_type != 'sd':
834
 
835
- # if div_type == 'kl_div_processed' or div_type == 'js_div_processed' or div_type == 'tv_div_processed':
836
- # epsilon = 1e-10
837
- # q = candidate_logits.softmax(dim=-1)
838
- # p = new_logits[:, :candidate_length, :].softmax(dim=-1) # need to be cropped because M_L logits include logits for ungenerated position
839
 
840
- # q_nonzero = (p > 0).int()
841
- # p_nonzero = (q > 0).int()
842
- # both_nonzero = (q_nonzero & p_nonzero).int()
843
 
844
- # # print(f"nonzero q: {q_nonzero.sum(dim=-1)}")
845
- # # print(f"nonzero p: {p_nonzero.sum(dim=-1)}")
846
- # # print(f"both nonzero: {both_nonzero.sum(dim=-1)}")
847
 
848
- # q = q + epsilon
849
- # p = p + epsilon
850
 
851
- # p = p / p.sum(dim=-1, keepdim=True)
852
- # q = q / q.sum(dim=-1, keepdim=True)
853
 
854
 
855
- # else:
856
- # q = candidate_logits_unprocessed.softmax(dim=-1)
857
- # p = new_logits_unprocessed[:, :candidate_length, :].softmax(dim=-1) # need to be cropped because M_L logits include logits for ungenerated position
858
 
859
- # if len(div_logits_processor) > 0:
860
- # epsilon = 1e-10
861
- # q = q + epsilon
862
- # p = p + epsilon
863
 
864
- # p = p / p.sum(dim=-1, keepdim=True)
865
- # q = q / q.sum(dim=-1, keepdim=True)
866
 
867
- # if div_type == 'kl_div' or div_type == 'kl_div_processed':
868
- # divs = torch.nn.functional.kl_div(torch.log(p), q, reduction='none').sum(dim=-1) # shape = [bs, seq_len]
869
- # elif div_type == 'kl_div_reversed' or div_type == 'kl_div_reversed_processed':
870
- # divs = torch.nn.functional.kl_div(torch.log(q), p, reduction='none').sum(dim=-1) # shape = [bs, seq_len]
871
- # elif div_type == 'js_div' or div_type == 'js_div_processed':
872
- # m = 0.5 * (p + q) # Midpoint distribution
873
- # divs = (0.5 * torch.nn.functional.kl_div(torch.log(p), m, reduction='none') + 0.5 * torch.nn.functional.kl_div(torch.log(q), m, reduction='none')).sum(dim=-1)
874
  # elif div_type == 'tv_div' or div_type == 'tv_div_processed':
875
  # divs = 0.5 * torch.abs(p - q).sum(dim=-1)
876
 
 
62
  attentions: tuple[tuple[torch.FloatTensor]] | None = None
63
  hidden_states: tuple[tuple[torch.FloatTensor]] | None = None
64
  past_key_values: Cache | None = None
65
+ # Draft token acceptance tracking fields (optional for backward compatibility)
66
+ draft_token_acceptance_rate: float | None = None
67
+ total_draft_tokens: int | None = None
68
+ total_accepted_tokens: int | None = None
69
 
70
 
71
  @dataclass
 
81
  cross_attentions: tuple[tuple[torch.FloatTensor]] | None = None
82
  decoder_hidden_states: tuple[tuple[torch.FloatTensor]] | None = None
83
  past_key_values: Cache | None = None
84
+ # Draft token acceptance tracking fields (optional for backward compatibility)
85
+ draft_token_acceptance_rate: float | None = None
86
+ total_draft_tokens: int | None = None
87
+ total_accepted_tokens: int | None = None
88
 
89
 
90
  def _split_model_outputs(outputs, new_outputs, cur_len, added_len, is_decoder_attention=False):
 
123
  """Initialize the custom candidate generator."""
124
  super().__init__(*args, **kwargs)
125
  # Initialize probs list if sklearn is available and confidence threshold is enabled
126
+ # Handle both transformers versions (with and without assistant_generation_config)
127
+ assistant_config = getattr(self, 'assistant_generation_config', None)
128
+ if assistant_config is None:
129
+ # Fallback for transformers versions that don't set assistant_generation_config
130
+ assistant_config = self.assistant_model.generation_config
131
+
132
  if (
133
  is_sklearn_available()
134
+ and hasattr(assistant_config, 'assistant_confidence_threshold')
135
+ and assistant_config.assistant_confidence_threshold
136
  ):
137
  if not hasattr(self, 'probs'):
138
  self.probs = []
 
164
  self.assistant_kwargs["past_key_values"] = assistant_output.past_key_values
165
 
166
  # Handle sklearn confidence threshold tracking (if enabled)
167
+ # Handle both transformers versions (with and without assistant_generation_config)
168
+ assistant_config = getattr(self, 'assistant_generation_config', None)
169
+ if assistant_config is None:
170
+ assistant_config = self.assistant_model.generation_config
171
+
172
  if (
173
  is_sklearn_available()
174
+ and hasattr(assistant_config, 'assistant_confidence_threshold')
175
+ and assistant_config.assistant_confidence_threshold
176
  and type(self) is RawLogitsCandidateGenerator
177
  ):
178
  scores_tensor = torch.cat(assistant_output.scores, dim=0)
 
202
  is_done_candidate,
203
  candidate_logits_raw,
204
  fsd_threshold: float = 0.0,
205
+ fsd_div_type: str = "js"
206
  ):
207
  """
208
  Applies sampling as in the speculative decoding paper (https://huggingface.co/papers/2211.17192, algorithm 1). Returns
 
231
  ).sum(dim=-1)
232
  elif fsd_div_type == "js":
233
 
234
+ m = 0.5 * (cand_probs + target_probs[:, :-1, :]) # Midpoint distribution
235
+ divs = (0.5 * torch.nn.functional.kl_div(torch.log(cand_probs), m, reduction='none') + 0.5 * torch.nn.functional.kl_div(torch.log(target_probs[:, :-1, :]), m, reduction='none')).sum(dim=-1)
236
+
237
+ # m = 0.5 * (cand_probs + target_probs[:, :-1, :]) # Mixture distribution
238
 
239
+ # # Compute KL(P || M) and KL(Q || M)
240
+ # kl_pm = kl_div(
241
+ # m.log().clamp(min=-1e10), # log-probabilities of mixture
242
+ # cand_probs, # probabilities of candidate
243
+ # reduction='none'
244
+ # )
245
+ # kl_qm = kl_div(
246
+ # m.log().clamp(min=-1e10), # log-probabilities of mixture
247
+ # target_probs[:, :-1, :], # probabilities of target
248
+ # reduction='none'
249
+ # )
250
 
251
+ # divs = 0.5 * (kl_pm + kl_qm).sum(dim=-1)
252
 
253
  elif fsd_div_type == "draft_tokens":
254
  draft_token_ids = new_candidate_input_ids # shape: (batch, candidate_length)
 
311
  assistant_tokenizer: Optional["PreTrainedTokenizerBase"] = None,
312
  tokenizer: Optional["PreTrainedTokenizerBase"] = None,
313
  fsd_threshold: float = 0.0,
314
+ fsd_div_type: str = "js",
315
+ track_acceptance_metrics: bool = False,
316
  **model_kwargs,
317
  ) -> Union[GenerateDecoderOnlyOutput, GenerateEncoderDecoderOutput, torch.LongTensor]:
318
  r"""
 
353
  output_scores = generation_config.output_scores
354
  output_logits = generation_config.output_logits
355
  return_dict_in_generate = generation_config.return_dict_in_generate
356
+
357
+ # Track draft token acceptance statistics (only if enabled)
358
+ if track_acceptance_metrics:
359
+ total_draft_tokens = 0
360
+ total_accepted_tokens = 0
361
+ else:
362
+ total_draft_tokens = None
363
+ total_accepted_tokens = None
364
 
365
  # init attention / hidden states / scores tuples
366
  scores = () if (return_dict_in_generate and output_scores) else None
 
450
  fsd_threshold=fsd_threshold,
451
  fsd_div_type=fsd_div_type,
452
  )
453
+ # Track acceptance statistics (only if we have draft tokens and tracking is enabled)
454
+ if track_acceptance_metrics and candidate_length > 0:
455
+ total_draft_tokens += candidate_length
456
+ total_accepted_tokens += n_matches
457
 
458
  # Case 2: all other cases (originally from assisted generation) 👉 Compare the tokens selected from the
459
  # original model logits with the candidate tokens. We can keep the candidate tokens until the first
 
472
  if is_done_candidate and n_matches == candidate_length:
473
  n_matches -= 1
474
  valid_tokens = selected_tokens[:, : n_matches + 1]
475
+
476
+ # Track acceptance statistics (for non-sampling case, only if we have draft tokens and tracking is enabled)
477
+ if track_acceptance_metrics and candidate_length > 0:
478
+ total_draft_tokens += candidate_length
479
+ total_accepted_tokens += n_matches
480
 
481
  # 4. Update variables according to the number of matching assistant tokens. Remember: the token generated
482
  # by the model after the last candidate match is also valid, as it is generated from a correct sequence.
 
560
  candidate_generator.assistant_model.generation_config.num_assistant_tokens = (
561
  candidate_generator.num_assistant_tokens
562
  )
563
+ # Calculate draft token acceptance rate (only if tracking is enabled)
564
+ if track_acceptance_metrics:
565
+ acceptance_rate = total_accepted_tokens / total_draft_tokens if total_draft_tokens > 0 else 0.0
566
+ else:
567
+ acceptance_rate = None
568
+ total_draft_tokens = None
569
+ total_accepted_tokens = None
570
+
571
  if return_dict_in_generate:
572
  cache = None
573
  if any(cache_key in model_kwargs for cache_key in ALL_CACHE_NAMES):
574
  cache_key = next(cache_key for cache_key in ALL_CACHE_NAMES if cache_key in model_kwargs)
575
  cache = model_kwargs[cache_key]
576
+ # Build base output dict
577
  if model.config.is_encoder_decoder:
578
+ base_dict = {
579
+ "sequences": input_ids,
580
+ "scores": scores,
581
+ "logits": raw_logits,
582
+ "encoder_attentions": encoder_attentions,
583
+ "encoder_hidden_states": encoder_hidden_states,
584
+ "decoder_attentions": decoder_attentions,
585
+ "cross_attentions": cross_attentions,
586
+ "decoder_hidden_states": decoder_hidden_states,
587
+ "past_key_values": cache,
588
+ }
589
+ output_class = GenerateEncoderDecoderOutput
590
  else:
591
+ base_dict = {
592
+ "sequences": input_ids,
593
+ "scores": scores,
594
+ "logits": raw_logits,
595
+ "attentions": decoder_attentions,
596
+ "hidden_states": decoder_hidden_states,
597
+ "past_key_values": cache,
598
+ }
599
+ output_class = GenerateDecoderOnlyOutput
600
+
601
+ # Try to create output with acceptance rate fields (only if tracking is enabled)
602
+ # If the Hub version doesn't support these fields, create without them
603
+ if track_acceptance_metrics:
604
+ try:
605
+ return output_class(
606
+ **base_dict,
607
+ draft_token_acceptance_rate=acceptance_rate,
608
+ total_draft_tokens=total_draft_tokens,
609
+ total_accepted_tokens=total_accepted_tokens,
610
+ )
611
+ except TypeError:
612
+ # Hub version doesn't support these fields, create without them
613
+ output = output_class(**base_dict)
614
+ # Try to set the fields as attributes (ModelOutput should allow this)
615
+ try:
616
+ output.draft_token_acceptance_rate = acceptance_rate
617
+ output.total_draft_tokens = total_draft_tokens
618
+ output.total_accepted_tokens = total_accepted_tokens
619
+ except Exception:
620
+ # If setting attributes fails, just return without them
621
+ pass
622
+ return output
623
+ else:
624
+ # Tracking disabled, return without metrics
625
+ return output_class(**base_dict)
626
  else:
627
  return input_ids
628
 
 
649
  """
650
  # 1. Handle kwargs, `generation_config`, validate them and obtain generation mode
651
  # Extract custom parameters before validation (they're not standard generation config params)
652
+ # These are used for loading the custom generate function, not for the generation process itself
653
+ custom_generate = kwargs.pop("custom_generate", None)
654
+ trust_remote_code = kwargs.pop("trust_remote_code", None)
655
  fsd_threshold = kwargs.pop("fsd_threshold", 0.0)
656
+ fsd_div_type = kwargs.pop("fsd_div_type", "js")
657
+ track_acceptance_metrics = kwargs.pop("track_acceptance_metrics", False)
658
 
659
  generation_mode_kwargs = model._extract_generation_mode_kwargs(
660
  None, # custom_generate
 
666
  # Add custom FSD parameters to generation_mode_kwargs so they're passed to _assisted_decoding
667
  generation_mode_kwargs["fsd_threshold"] = fsd_threshold
668
  generation_mode_kwargs["fsd_div_type"] = fsd_div_type
669
+ generation_mode_kwargs["track_acceptance_metrics"] = track_acceptance_metrics
670
 
671
  # Check length values before updating the config with defaults
672
  has_default_max_length = kwargs.get("max_length") is None and (
 
914
  # new_candidate_input_ids = candidate_input_ids[:, -candidate_length:]
915
  # correction_term = 0
916
 
917
+ # if div_type != 'sd':
918
 
919
+ # if div_type == 'kl_div_processed' or div_type == 'js_div_processed' or div_type == 'tv_div_processed':
920
+ # epsilon = 1e-10
921
+ # q = candidate_logits.softmax(dim=-1)
922
+ # p = new_logits[:, :candidate_length, :].softmax(dim=-1) # need to be cropped because M_L logits include logits for ungenerated position
923
 
924
+ # q_nonzero = (p > 0).int()
925
+ # p_nonzero = (q > 0).int()
926
+ # both_nonzero = (q_nonzero & p_nonzero).int()
927
 
928
+ # # print(f"nonzero q: {q_nonzero.sum(dim=-1)}")
929
+ # # print(f"nonzero p: {p_nonzero.sum(dim=-1)}")
930
+ # # print(f"both nonzero: {both_nonzero.sum(dim=-1)}")
931
 
932
+ # q = q + epsilon
933
+ # p = p + epsilon
934
 
935
+ # p = p / p.sum(dim=-1, keepdim=True)
936
+ # q = q / q.sum(dim=-1, keepdim=True)
937
 
938
 
939
+ # else:
940
+ # q = candidate_logits_unprocessed.softmax(dim=-1)
941
+ # p = new_logits_unprocessed[:, :candidate_length, :].softmax(dim=-1) # need to be cropped because M_L logits include logits for ungenerated position
942
 
943
+ # if len(div_logits_processor) > 0:
944
+ # epsilon = 1e-10
945
+ # q = q + epsilon
946
+ # p = p + epsilon
947
 
948
+ # p = p / p.sum(dim=-1, keepdim=True)
949
+ # q = q / q.sum(dim=-1, keepdim=True)
950
 
951
+ # if div_type == 'kl_div' or div_type == 'kl_div_processed':
952
+ # divs = torch.nn.functional.kl_div(torch.log(p), q, reduction='none').sum(dim=-1) # shape = [bs, seq_len]
953
+ # elif div_type == 'kl_div_reversed' or div_type == 'kl_div_reversed_processed':
954
+ # divs = torch.nn.functional.kl_div(torch.log(q), p, reduction='none').sum(dim=-1) # shape = [bs, seq_len]
955
+ # elif div_type == 'js_div' or div_type == 'js_div_processed':
956
+ # m = 0.5 * (p + q) # Midpoint distribution
957
+ # divs = (0.5 * torch.nn.functional.kl_div(torch.log(p), m, reduction='none') + 0.5 * torch.nn.functional.kl_div(torch.log(q), m, reduction='none')).sum(dim=-1)
958
  # elif div_type == 'tv_div' or div_type == 'tv_div_processed':
959
  # divs = 0.5 * torch.abs(p - q).sum(dim=-1)
960