andrewqian123 committed on
Commit
34b9361
·
verified ·
1 Parent(s): 609abef

Update modeling_minicpmv.py

Browse files
Files changed (1) hide show
  1. modeling_minicpmv.py +24 -4
modeling_minicpmv.py CHANGED
@@ -197,17 +197,27 @@ class MiniCPMV(MiniCPMVPreTrainedModel):
197
  result_text.append(tokenizer.decode(result).strip())
198
  return result_text
199
 
200
- def _decode(self, inputs_embeds, tokenizer, decode_text=False, **kwargs):
201
  terminators = [
202
  tokenizer.eos_token_id,
203
  tokenizer.convert_tokens_to_ids("<|eot_id|>")
204
  ]
205
- output = self.llm.generate(
 
 
206
  inputs_embeds=inputs_embeds,
 
207
  pad_token_id=0,
208
  eos_token_id=terminators,
209
  **kwargs
210
- )
 
 
 
 
 
 
 
211
  if decode_text:
212
  return self._decode_text(output, tokenizer)
213
  return output
@@ -277,6 +287,16 @@ class MiniCPMV(MiniCPMVPreTrainedModel):
277
  max_x = max(tensor.shape[1] for tensor in batch)
278
 
279
  # Step 2: Automatically pad each tensor to have the same length (L) in the last dimension
 
 
 
 
 
 
 
 
 
 
280
  padded_tensors = [torch.nn.functional.pad(tensor, (0, 0, 0, max_x - tensor.shape[1])) for tensor in batch]
281
 
282
  # Step 3: Stack the padded tensors into a single batch
@@ -289,7 +309,7 @@ class MiniCPMV(MiniCPMVPreTrainedModel):
289
  kwargs.pop("decode_text")
290
  result = self._decode_stream(batch, tokenizer, **kwargs)
291
  else:
292
- result = self._decode(batch, tokenizer, **kwargs)
293
 
294
  return result
295
 
 
197
  result_text.append(tokenizer.decode(result).strip())
198
  return result_text
199
 
200
+ def _decode(self, inputs_embeds, tokenizer, attention_mask=None, decode_text=False, **kwargs):
201
  terminators = [
202
  tokenizer.eos_token_id,
203
  tokenizer.convert_tokens_to_ids("<|eot_id|>")
204
  ]
205
+ output = None
206
+ if (attention_mask != None):
207
+ output = self.llm.generate(
208
  inputs_embeds=inputs_embeds,
209
+ attention_mask=attention_mask,
210
  pad_token_id=0,
211
  eos_token_id=terminators,
212
  **kwargs
213
+ )
214
+ else:
215
+ output = self.llm.generate(
216
+ inputs_embeds=inputs_embeds,
217
+ pad_token_id=0,
218
+ eos_token_id=terminators,
219
+ **kwargs
220
+ )
221
  if decode_text:
222
  return self._decode_text(output, tokenizer)
223
  return output
 
287
  max_x = max(tensor.shape[1] for tensor in batch)
288
 
289
  # Step 2: Automatically pad each tensor to have the same length (L) in the last dimension
290
# Build a (batch, max_x) attention mask in one pass: 1 for real positions,
# 0 for the right-padding added to each tensor below. The comprehension
# replaces the original hand-rolled nested append loops.
# NOTE(review): created on the batch tensors' device so `generate` is not
# handed a CPU mask alongside GPU embeddings — the original always built it
# on CPU; confirm against GPU callers.
attention_mask = torch.tensor(
    [[1] * t.shape[1] + [0] * (max_x - t.shape[1]) for t in batch],
    device=batch[0].device,
)
299
+
300
  padded_tensors = [torch.nn.functional.pad(tensor, (0, 0, 0, max_x - tensor.shape[1])) for tensor in batch]
301
 
302
  # Step 3: Stack the padded tensors into a single batch
 
309
  kwargs.pop("decode_text")
310
  result = self._decode_stream(batch, tokenizer, **kwargs)
311
  else:
312
+ result = self._decode(batch, tokenizer, attention_mask=attention_mask, **kwargs)
313
 
314
  return result
315