Commit ·
6a479f7
1
Parent(s): ce5bd59
Upload processor
Browse files- processor.py +6 -21
processor.py
CHANGED
|
@@ -1,10 +1,11 @@
|
|
| 1 |
from itertools import chain
|
| 2 |
from transformers import GitProcessor
|
| 3 |
|
|
|
|
| 4 |
class GIAProcessor(GitProcessor):
|
| 5 |
-
def __init__(self, image_processor, tokenizer):
|
| 6 |
super().__init__(image_processor, tokenizer)
|
| 7 |
-
self.
|
| 8 |
|
| 9 |
def _cut_text(self, examples):
|
| 10 |
results = {
|
|
@@ -13,28 +14,12 @@ class GIAProcessor(GitProcessor):
|
|
| 13 |
}
|
| 14 |
for i in range(len(examples["input_ids"])):
|
| 15 |
_input_size = len(examples["input_ids"][i])
|
| 16 |
-
for j in range(max(1, _input_size // self.
|
| 17 |
-
results["input_ids"].append(examples["input_ids"][i][j*self.
|
| 18 |
-
results["attention_mask"].append(examples["attention_mask"][i][j * self.
|
| 19 |
|
| 20 |
return results
|
| 21 |
|
| 22 |
-
# def _group_texts(self, examples):
|
| 23 |
-
# # Concatenate all texts.
|
| 24 |
-
# concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
|
| 25 |
-
# total_length = len(concatenated_examples[list(examples.keys())[0]])
|
| 26 |
-
# # We drop the small remainder, and if the total_length < block_size we exclude this batch and return an empty dict.
|
| 27 |
-
# # We could add padding if the model supported it instead of this drop, you can customize this part to your needs.
|
| 28 |
-
# if total_length > self._block_size:
|
| 29 |
-
# total_length = (total_length // self._block_size) * self._block_size
|
| 30 |
-
#
|
| 31 |
-
# # Split by chunks of max_len.
|
| 32 |
-
# result = {
|
| 33 |
-
# k: [t[i: i + self._block_size] for i in range(0, total_length, self._block_size)]
|
| 34 |
-
# for k, t in concatenated_examples.items()
|
| 35 |
-
# }
|
| 36 |
-
# return result
|
| 37 |
-
|
| 38 |
def __call__(self, examples, return_tensors=None, **kwargs):
|
| 39 |
if "text" in examples and not "images" in examples:
|
| 40 |
encoded_text = self.tokenizer(examples["text"], return_tensors=return_tensors)
|
|
|
|
| 1 |
from itertools import chain
|
| 2 |
from transformers import GitProcessor
|
| 3 |
|
| 4 |
+
|
| 5 |
class GIAProcessor(GitProcessor):
|
| 6 |
+
def __init__(self, image_processor, tokenizer, max_input_size):
    """Extend GitProcessor with a chunk-length limit used for splitting text.

    Args:
        image_processor: image processor forwarded unchanged to GitProcessor.
        tokenizer: tokenizer forwarded unchanged to GitProcessor.
        max_input_size: maximum number of tokens per chunk produced by
            ``_cut_text``.
    """
    super().__init__(image_processor, tokenizer)
    # Private: read only by _cut_text when slicing long token sequences.
    self._max_input_size = max_input_size
| 9 |
|
| 10 |
def _cut_text(self, examples):
|
| 11 |
results = {
|
|
|
|
| 14 |
}
|
| 15 |
for i in range(len(examples["input_ids"])):
|
| 16 |
_input_size = len(examples["input_ids"][i])
|
| 17 |
+
for j in range(max(1, _input_size // self._max_input_size)):
|
| 18 |
+
results["input_ids"].append(examples["input_ids"][i][j*self._max_input_size:(j + 1) * self._max_input_size])
|
| 19 |
+
results["attention_mask"].append(examples["attention_mask"][i][j * self._max_input_size:(j + 1) * self._max_input_size])
|
| 20 |
|
| 21 |
return results
|
| 22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
def __call__(self, examples, return_tensors=None, **kwargs):
|
| 24 |
if "text" in examples and not "images" in examples:
|
| 25 |
encoded_text = self.tokenizer(examples["text"], return_tensors=return_tensors)
|