Commit ·
6a479f7
1
Parent(s): ce5bd59
Upload processor
Browse files- processor.py +6 -21
processor.py
CHANGED
|
@@ -1,10 +1,11 @@
|
|
| 1 |
from itertools import chain
|
| 2 |
from transformers import GitProcessor
|
| 3 |
|
|
|
|
| 4 |
class GIAProcessor(GitProcessor):
|
| 5 |
-
def __init__(self, image_processor, tokenizer):
|
| 6 |
super().__init__(image_processor, tokenizer)
|
| 7 |
-
self.
|
| 8 |
|
| 9 |
def _cut_text(self, examples):
|
| 10 |
results = {
|
|
@@ -13,28 +14,12 @@ class GIAProcessor(GitProcessor):
|
|
| 13 |
}
|
| 14 |
for i in range(len(examples["input_ids"])):
|
| 15 |
_input_size = len(examples["input_ids"][i])
|
| 16 |
-
for j in range(max(1, _input_size // self.
|
| 17 |
-
results["input_ids"].append(examples["input_ids"][i][j*self.
|
| 18 |
-
results["attention_mask"].append(examples["attention_mask"][i][j * self.
|
| 19 |
|
| 20 |
return results
|
| 21 |
|
| 22 |
-
# def _group_texts(self, examples):
|
| 23 |
-
# # Concatenate all texts.
|
| 24 |
-
# concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
|
| 25 |
-
# total_length = len(concatenated_examples[list(examples.keys())[0]])
|
| 26 |
-
# # We drop the small remainder, and if the total_length < block_size we exclude this batch and return an empty dict.
|
| 27 |
-
# # We could add padding if the model supported it instead of this drop, you can customize this part to your needs.
|
| 28 |
-
# if total_length > self._block_size:
|
| 29 |
-
# total_length = (total_length // self._block_size) * self._block_size
|
| 30 |
-
#
|
| 31 |
-
# # Split by chunks of max_len.
|
| 32 |
-
# result = {
|
| 33 |
-
# k: [t[i: i + self._block_size] for i in range(0, total_length, self._block_size)]
|
| 34 |
-
# for k, t in concatenated_examples.items()
|
| 35 |
-
# }
|
| 36 |
-
# return result
|
| 37 |
-
|
| 38 |
def __call__(self, examples, return_tensors=None, **kwargs):
|
| 39 |
if "text" in examples and not "images" in examples:
|
| 40 |
encoded_text = self.tokenizer(examples["text"], return_tensors=return_tensors)
|
|
|
|
| 1 |
from itertools import chain
|
| 2 |
from transformers import GitProcessor
|
| 3 |
|
| 4 |
+
|
| 5 |
class GIAProcessor(GitProcessor):
|
| 6 |
+
def __init__(self, image_processor, tokenizer, max_input_size):
    """Extend GitProcessor with a chunk-length limit used for splitting text.

    Args:
        image_processor: image processor forwarded unchanged to GitProcessor.
        tokenizer: tokenizer forwarded unchanged to GitProcessor.
        max_input_size: maximum number of tokens per chunk produced by
            ``_cut_text``.
    """
    super().__init__(image_processor, tokenizer)
    # Private: read only by _cut_text when slicing long token sequences.
    self._max_input_size = max_input_size
| 9 |
|
| 10 |
def _cut_text(self, examples):
|
| 11 |
results = {
|
|
|
|
| 14 |
}
|
| 15 |
for i in range(len(examples["input_ids"])):
|
| 16 |
_input_size = len(examples["input_ids"][i])
|
| 17 |
+
for j in range(max(1, _input_size // self._max_input_size)):
|
| 18 |
+
results["input_ids"].append(examples["input_ids"][i][j*self._max_input_size:(j + 1) * self._max_input_size])
|
| 19 |
+
results["attention_mask"].append(examples["attention_mask"][i][j * self._max_input_size:(j + 1) * self._max_input_size])
|
| 20 |
|
| 21 |
return results
|
| 22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
def __call__(self, examples, return_tensors=None, **kwargs):
|
| 24 |
if "text" in examples and not "images" in examples:
|
| 25 |
encoded_text = self.tokenizer(examples["text"], return_tensors=return_tensors)
|