Replace `model.generate` with a custom generation function to optimize KV-cache handling
Replaced the problematic high-level generate call
```
model_outputs = self.model.generate(
    input_ids=current_input_id,
    attention_mask=current_attention_mask,  # This mask is for (history in KV cache + current_input_id)
    eos_token_id=[eos_id, code_id[1]],  # code_id[1] is assumed to be </code>'s last token ID
    past_key_values=current_kv,
    generation_config=self.generation_config,  # Ensure this has return_dict_in_generate=True, use_cache=True
    # max_new_tokens should be set in self.generation_config appropriately for a segment
)
```
with custom generation and decoding functions.
This is because the high-level `model.generate` cannot carry a stateful KV cache across calls.
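What we need instead is a manual decode loop that keeps the cache alive between generation segments, so interpreter feedback can be appended without re-encoding the whole history. A minimal sketch of that pattern, assuming a generic Hugging Face causal LM, batch size 1, and greedy decoding for brevity (not the trainer code itself):

```
import torch

@torch.no_grad()
def generate_segment(model, input_ids, past_key_values=None, max_new_tokens=32, stop_ids=()):
    """Decode one segment, reusing and returning the KV cache."""
    tokens = []
    ids = input_ids  # full prompt on the first step, one token per step afterwards
    for _ in range(max_new_tokens):
        out = model(input_ids=ids, past_key_values=past_key_values, use_cache=True)
        past_key_values = out.past_key_values  # the stateful cache survives the call
        ids = out.logits[:, -1, :].argmax(dim=-1, keepdim=True)  # greedy for brevity
        tokens.append(ids)
        if ids.item() in stop_ids:
            break
    return torch.cat(tokens, dim=1), past_key_values

# Segment 1 prefills the prompt and decodes until </code>; segment 2 then feeds
# only the tokenized interpreter feedback, because the cache already covers
# everything generated before it.
```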
- src/retool_trainer.py (+168 -101)

Imports:

```
@@ -25,6 +25,7 @@ from transformers import (
     is_wandb_available,
     PreTrainedTokenizer,
 )
+from transformers import AutoTokenizer, AutoModelForCausalLM, DynamicCache
```
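The newly imported `DynamicCache` is used below to normalize the cache object between forward passes. Roughly, the conversion behaves like this (a sketch with a hypothetical `as_dynamic_cache` helper; `from_legacy_cache` exists in recent transformers releases, but exact cache types vary by version):

```
from transformers import DynamicCache

def as_dynamic_cache(past_key_values):
    """Wrap a legacy tuple-of-tuples KV cache in a DynamicCache.

    Older transformers versions return past_key_values as a tuple of
    (key, value) tensor pairs; newer forward passes expect Cache objects.
    Normalizing keeps a hand-rolled decode loop version-agnostic.
    """
    if past_key_values is None or isinstance(past_key_values, DynamicCache):
        return past_key_values
    return DynamicCache.from_legacy_cache(past_key_values)
```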
Generation loop:

```
@@ -257,128 +258,194 @@ class ReToolTrainer(Trainer): # Change this line
```

Removed: the old `_retool_generate_with_interpreter`, built around per-turn `model.generate` calls that tried to resume from `past_key_values` (abridged; elided parts are marked with `...`):

```
        return advantages

    def _retool_generate_with_interpreter(self, prompt_ids_batch, attention_mask_batch, eos_id, interpreter_id, code_id, max_turns=10):
        batch_size = prompt_ids_batch.size(0)
        batch_completion = []
        batch_interpreter_positions = []
        for i in range(batch_size):
            current_input_id = prompt_ids_batch[i:i+1]
            current_attention_mask = attention_mask_batch[i:i+1]
            current_kv = None
            cumulative_completion_ids = torch.empty((1, 0), dtype=torch.long, device=prompt_ids_batch.device)
            interpreter_positions = []
            for turn_idx in range(max_turns):
                model_outputs = self.model.generate(
                    input_ids=current_input_id,
                    attention_mask=current_attention_mask,
                    eos_token_id=[eos_id, code_id[1]],  # code_id[1] is assumed to be </code>'s last token ID
                    past_key_values=current_kv,
                    generation_config=self.generation_config,
                )
                # ... extract completion_id and current_full_ids from model_outputs ...
                cumulative_completion_ids = torch.cat([cumulative_completion_ids, completion_id], dim=1)
                # Update current_input_id for the next generation step
                # Update current_attention_mask: it was for (history + current_input_id),
                # now append 1s for completion_id
                current_attention_mask = torch.cat([
                    current_attention_mask,
                    torch.ones_like(completion_id)
                ], dim=1)
                current_kv = model_outputs.past_key_values  # Cache for the new current_full_ids
                last_token_id = current_full_ids[0, -1].item()
                if last_token_id == eos_id or turn_idx == max_turns - 1:
                    batch_completion.append(cumulative_completion_ids.squeeze(0))
                    batch_interpreter_positions.append(interpreter_positions)
                    break
                # Extract code from the full text
                full_text = self.processing_class.decode(...)
                code_match = re.search(r'<code>(.*?)</code>', full_text, re.DOTALL)
                if code_match:
                    code_block = code_match.group(1)
                    interpreter_text = self._execute_code(code_block)
                else:
                    ...
                formatted_feedback_text = f"{self.processing_class.decode(interpreter_id[0])}{interpreter_text}{self.processing_class.decode(interpreter_id[1])}"
                interpreter_feedback_id = self.processing_class(
                    formatted_feedback_text,
                    return_tensors="pt",
                    add_special_tokens=False
                ).input_ids.to(current_full_ids.device)
                # Record positions relative to cumulative_completion_ids *before* appending feedback
                interpreter_start_idx = cumulative_completion_ids.size(1)
                cumulative_completion_ids = torch.cat([cumulative_completion_ids, interpreter_feedback_id], dim=1)  # Use cumulative, not current
                interpreter_end_idx = cumulative_completion_ids.size(1) - 1
                interpreter_positions.append((interpreter_start_idx, interpreter_end_idx))
                # Update attention mask for the appended tool feedback
                current_attention_mask = torch.cat([
                    current_attention_mask,
                    torch.ones_like(interpreter_feedback_id)
                ], dim=1)
                # Prepare for the next LM generation step:
                # The model needs to "process" the tool_output_tokens to update its KV cache.
                # The `current_input_id` for the next generate call will be `interpreter_feedback_id`.
                # `current_kv` already holds the cache for `current_full_ids` *before* the tool feedback was appended.
                # The `current_attention_mask` now correctly covers `current_full_ids` (which includes tool feedback).
                current_input_id = interpreter_feedback_id
                # `current_kv` is correct (it's for the prefix before `interpreter_feedback_id`).
                # The next `model.generate` call will use this `current_input_id`, `current_attention_mask`, and `current_kv`.
            else:  # Executed if the loop finished due to max_turns without a break
                batch_completion.append(cumulative_completion_ids.squeeze(0))
                batch_interpreter_positions.append(interpreter_positions)
        ...
        return padded_sequences, batch_interpreter_positions
```
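The new implementation samples each token manually: temperature scaling, then top-k/top-p filtering, then a multinomial draw. In isolation, that sampling step looks like this (a toy, self-contained sketch, not the committed code):

```
import torch

# Toy version of the sampling step in _custom_generate: temperature-scale
# the last-position logits, then draw a single token id.
logits = torch.tensor([[2.0, 1.0, 0.1]])  # stand-in for outputs.logits[:, -1, :]
temperature = 0.7
probs = torch.nn.functional.softmax(logits / temperature, dim=-1)
next_token = torch.multinomial(probs, num_samples=1)  # shape (1, 1)
```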
Added: the new `_custom_generate` and `_filter_logits` helpers:

```
    def _custom_generate(self, input_ids, attention_mask=None, past_key_values=None, max_new_tokens=50, eos_token_ids=None):
        """Custom generation function that avoids KV cache issues"""
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)

        if eos_token_ids is None:
            eos_token_ids = [self.processing_class.eos_token_id]

        # Initialize
        current_ids = input_ids.clone()
        current_mask = attention_mask.clone()
        current_kv = past_key_values

        # Generate tokens in batches for efficiency
        all_tokens = []
        batch_size = 10  # Process this many tokens at once

        for start_idx in range(0, max_new_tokens, batch_size):
            # How many tokens to generate in this batch
            batch_tokens = min(batch_size, max_new_tokens - start_idx)

            # Accumulate new tokens
            new_tokens = []

            for _ in range(batch_tokens):
                # Forward pass with proper cache handling
                with torch.no_grad():
                    outputs = self.model(
                        input_ids=current_ids if current_kv is None else current_ids[:, -1:],
                        attention_mask=current_mask if current_kv is None else current_mask[:, -1:],
                        past_key_values=DynamicCache.from_legacy_cache(current_kv) if current_kv is not None else None,
                        use_cache=True
                    )

                # Sample next token
                next_token_logits = outputs.logits[:, -1, :] / self.temperature
                filtered_logits = self._filter_logits(next_token_logits)
                probs = torch.nn.functional.softmax(filtered_logits, dim=-1)
                next_token = torch.multinomial(probs, num_samples=1)

                # Add to accumulated tokens
                token_id = next_token.item()
                new_tokens.append(token_id)

                # Update for next iteration
                current_ids = torch.cat([current_ids, next_token], dim=1)
                token_mask = torch.ones((1, 1), device=current_mask.device, dtype=current_mask.dtype)
                current_mask = torch.cat([current_mask, token_mask], dim=1)
                current_kv = outputs.past_key_values

                # Check for stop tokens - include both EOS and code_end
                if token_id in eos_token_ids:
                    break

            # Add batch tokens to overall result
            all_tokens.extend(new_tokens)

            # Check if we hit a stop token
            if len(new_tokens) < batch_tokens:
                break

        # Convert to tensor
        result = torch.tensor([all_tokens], device=input_ids.device)
        return result, current_kv

    def _filter_logits(self, logits):
        """Apply top-k and top-p filtering"""
        if self.top_k > 0:
            top_k_logits, top_k_indices = torch.topk(logits, self.top_k, dim=-1)
            logits[0, :] = torch.full_like(logits[0, :], float('-inf'))
            logits[0, top_k_indices[0]] = top_k_logits[0]

        if self.top_p < 1.0:
            sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1)
            cumulative_probs = torch.cumsum(torch.nn.functional.softmax(sorted_logits, dim=-1), dim=-1)

            # Remove tokens with cumulative probability above threshold
            sorted_indices_to_remove = cumulative_probs > self.top_p
            # Shift the indices to the right to keep the first token above threshold
            sorted_indices_to_remove[:, 1:] = sorted_indices_to_remove[:, :-1].clone()
            sorted_indices_to_remove[:, 0] = 0

            # Scatter sorted tensors to original indexing
            indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
            logits[indices_to_remove] = float('-inf')

        return logits
```
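For instance, with `top_k=2` the filter leaves only the two largest logits finite, so the multinomial draw is restricted to those tokens (a toy check mirroring the top-k branch above):

```
import torch

# Toy check of the top-k branch: all but the two largest logits become -inf.
logits = torch.tensor([[1.0, 3.0, 2.0, 0.5]])
top_k_logits, top_k_indices = torch.topk(logits, 2, dim=-1)
filtered = torch.full_like(logits, float('-inf'))
filtered[0, top_k_indices[0]] = top_k_logits[0]
print(filtered)  # tensor([[-inf, 3., 2., -inf]])
```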
And the rewritten `_retool_generate_with_interpreter` built on top of them:

```
    def _retool_generate_with_interpreter(self, prompt_ids_batch, attention_mask_batch, eos_id, interpreter_id, code_id, max_turns=10):
        """Implementation with custom generation to avoid KV cache issues"""
        batch_size = prompt_ids_batch.size(0)
        batch_completion = []
        batch_interpreter_positions = []

        for i in range(batch_size):
            # Initialize
            current_input_id = prompt_ids_batch[i:i+1]
            current_attention_mask = attention_mask_batch[i:i+1]
            current_kv = None

            # Track completion (excludes prompt)
            cumulative_completion_ids = torch.empty((1, 0), dtype=torch.long, device=prompt_ids_batch.device)
            interpreter_positions = []

            for turn_idx in range(max_turns):
                # Check if input is empty
                if current_input_id.size(1) == 0:
                    break

                # Generate with custom function
                newly_generated_tokens, current_kv = self._custom_generate(
                    input_ids=current_input_id,
                    attention_mask=current_attention_mask,
                    past_key_values=current_kv,
                    max_new_tokens=self.max_completion_length,  # Use class attribute
                    eos_token_ids=[eos_id, code_id[1]]
                )

                # Add to completion
                cumulative_completion_ids = torch.cat([cumulative_completion_ids, newly_generated_tokens], dim=1)

                # Check last token
                last_token_id = newly_generated_tokens[0, -1].item() if newly_generated_tokens.size(1) > 0 else None

                # Check for end conditions
                if last_token_id == eos_id or turn_idx == max_turns - 1:
                    batch_completion.append(cumulative_completion_ids.squeeze(0))
                    batch_interpreter_positions.append(interpreter_positions)
                    break

                # Check for code end token
                if last_token_id == code_id[1]:
                    # Extract code from the full text
                    full_text = self.processing_class.decode(
                        torch.cat([prompt_ids_batch[i], cumulative_completion_ids[0]], dim=0)
                    )
                    code_match = re.search(r'<code>(.*?)</code>', full_text, re.DOTALL)

                    if code_match:
                        code_block = code_match.group(1).strip()
                        interpreter_text = self._execute_code(code_block)

                        # Format and add interpreter output
                        formatted_feedback = f"{self.processing_class.decode(interpreter_id[0])}{interpreter_text}{self.processing_class.decode(interpreter_id[1])}"
                        interpreter_ids = self.processing_class(
                            formatted_feedback,
                            return_tensors="pt",
                            add_special_tokens=False
                        ).input_ids.to(prompt_ids_batch.device)

                        # Record positions
                        interpreter_start_idx = cumulative_completion_ids.size(1)
                        cumulative_completion_ids = torch.cat([cumulative_completion_ids, interpreter_ids], dim=1)
                        interpreter_end_idx = cumulative_completion_ids.size(1) - 1
                        interpreter_positions.append((interpreter_start_idx, interpreter_end_idx))

                        # Set up for next turn
                        current_input_id = interpreter_ids
                        current_attention_mask = torch.ones_like(current_input_id)
                        # Keep current_kv from previous generation
                    else:
                        # No code block found despite </code> token
                        break
                else:
                    # Continue with the newly generated tokens
                    current_input_id = newly_generated_tokens
                    current_attention_mask = torch.ones_like(current_input_id)
            else:
                # Loop finished due to max_turns without a break
                batch_completion.append(cumulative_completion_ids.squeeze(0))
                batch_interpreter_positions.append(interpreter_positions)

        # Pad sequences
        if len(batch_completion) > 0:
            # Ensure padding_value is a valid integer
            padding_value = self.processing_class.pad_token_id
            if padding_value is None:
                padding_value = 0  # Use 0 as a default if pad_token_id is None

            padded_sequences = torch.nn.utils.rnn.pad_sequence(
                batch_completion,
                batch_first=True,
                padding_value=padding_value
            )
        else:
            padded_sequences = torch.empty((0, 0), dtype=torch.long, device=prompt_ids_batch.device)

        return padded_sequences, batch_interpreter_positions


    def _create_interpreter_mask(
```
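A call site would then look roughly like this (hypothetical snippet: `trainer` stands in for a constructed `ReToolTrainer`, and the marker IDs are placeholders that depend on the tokenizer):

```
# Hypothetical usage; look up the real marker IDs with, e.g.,
# tokenizer.convert_tokens_to_ids("<code>").
prompt = tokenizer("Compute 12 * 34 using Python.", return_tensors="pt")
completions, positions = trainer._retool_generate_with_interpreter(
    prompt_ids_batch=prompt.input_ids,
    attention_mask_batch=prompt.attention_mask,
    eos_id=tokenizer.eos_token_id,
    interpreter_id=(interpreter_open_id, interpreter_close_id),  # <interpreter> ... </interpreter>
    code_id=(code_open_id, code_close_id),                       # <code> ... </code>
    max_turns=10,
)
# `positions` records the interpreter-feedback spans inside each completion,
# presumably consumed by _create_interpreter_mask below.
```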