Commit ·
327431a
1
Parent(s): af34ccc
Upload processor
Browse files — processor.py: +28 −13
processor.py
CHANGED
|
@@ -6,24 +6,39 @@ class GIAProcessor(GitProcessor):
|
|
| 6 |
super().__init__(image_processor, tokenizer)
|
| 7 |
self._block_size = 1024
|
| 8 |
|
| 9 |
-
def
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
# We drop the small remainder, and if the total_length < block_size we exclude this batch and return an empty dict.
|
| 14 |
-
# We could add padding if the model supported it instead of this drop, you can customize this part to your needs.
|
| 15 |
-
total_length = (total_length // self._block_size) * self._block_size
|
| 16 |
-
# Split by chunks of max_len.
|
| 17 |
-
result = {
|
| 18 |
-
k: [t[i: i + self._block_size] for i in range(0, total_length, self._block_size)]
|
| 19 |
-
for k, t in concatenated_examples.items()
|
| 20 |
}
|
| 21 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
def __call__(self, examples, return_tensors=None, **kwargs):
|
| 24 |
if "text" in examples and not "images" in examples:
|
| 25 |
encoded_text = self.tokenizer(examples["text"], return_tensors=return_tensors)
|
| 26 |
-
encoding = self.
|
| 27 |
elif "text" in examples and "images" in examples:
|
| 28 |
encoding = super().__call__(examples["text"], examples["images"], return_tensors, **kwargs)
|
| 29 |
|
|
|
|
| 6 |
super().__init__(image_processor, tokenizer)
|
| 7 |
self._block_size = 1024
|
| 8 |
|
| 9 |
+
def _cut_text(self, examples):
|
| 10 |
+
results = {
|
| 11 |
+
"input_ids": [],
|
| 12 |
+
"attention_mask": []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
}
|
| 14 |
+
for i in range(len(examples["input_ids"])):
|
| 15 |
+
_input_size = len(examples["input_ids"][i])
|
| 16 |
+
for j in range(_input_size // self._block_size):
|
| 17 |
+
results["input_ids"].append(examples["input_ids"][i][j*self._block_size:(j+1)*self._block_size])
|
| 18 |
+
results["attention_mask"].append(examples["attention_mask"][i][j * self._block_size:(j + 1) * self._block_size])
|
| 19 |
+
|
| 20 |
+
return results
|
| 21 |
+
|
| 22 |
+
# def _group_texts(self, examples):
|
| 23 |
+
# # Concatenate all texts.
|
| 24 |
+
# concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
|
| 25 |
+
# total_length = len(concatenated_examples[list(examples.keys())[0]])
|
| 26 |
+
# # We drop the small remainder, and if the total_length < block_size we exclude this batch and return an empty dict.
|
| 27 |
+
# # We could add padding if the model supported it instead of this drop, you can customize this part to your needs.
|
| 28 |
+
# if total_length > self._block_size:
|
| 29 |
+
# total_length = (total_length // self._block_size) * self._block_size
|
| 30 |
+
#
|
| 31 |
+
# # Split by chunks of max_len.
|
| 32 |
+
# result = {
|
| 33 |
+
# k: [t[i: i + self._block_size] for i in range(0, total_length, self._block_size)]
|
| 34 |
+
# for k, t in concatenated_examples.items()
|
| 35 |
+
# }
|
| 36 |
+
# return result
|
| 37 |
|
| 38 |
def __call__(self, examples, return_tensors=None, **kwargs):
|
| 39 |
if "text" in examples and not "images" in examples:
|
| 40 |
encoded_text = self.tokenizer(examples["text"], return_tensors=return_tensors)
|
| 41 |
+
encoding = self._cut_text(encoded_text)
|
| 42 |
elif "text" in examples and "images" in examples:
|
| 43 |
encoding = super().__call__(examples["text"], examples["images"], return_tensors, **kwargs)
|
| 44 |
|