Commit ·
327431a
1
Parent(s): af34ccc
Upload processor
Browse files — processor.py: +28 −13
processor.py
CHANGED
|
@@ -6,24 +6,39 @@ class GIAProcessor(GitProcessor):
|
|
| 6 |
super().__init__(image_processor, tokenizer)
|
| 7 |
self._block_size = 1024
|
| 8 |
|
| 9 |
-
def
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
# We drop the small remainder, and if the total_length < block_size we exclude this batch and return an empty dict.
|
| 14 |
-
# We could add padding if the model supported it instead of this drop, you can customize this part to your needs.
|
| 15 |
-
total_length = (total_length // self._block_size) * self._block_size
|
| 16 |
-
# Split by chunks of max_len.
|
| 17 |
-
result = {
|
| 18 |
-
k: [t[i: i + self._block_size] for i in range(0, total_length, self._block_size)]
|
| 19 |
-
for k, t in concatenated_examples.items()
|
| 20 |
}
|
| 21 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
def __call__(self, examples, return_tensors=None, **kwargs):
|
| 24 |
if "text" in examples and not "images" in examples:
|
| 25 |
encoded_text = self.tokenizer(examples["text"], return_tensors=return_tensors)
|
| 26 |
-
encoding = self.
|
| 27 |
elif "text" in examples and "images" in examples:
|
| 28 |
encoding = super().__call__(examples["text"], examples["images"], return_tensors, **kwargs)
|
| 29 |
|
|
|
|
| 6 |
super().__init__(image_processor, tokenizer)
|
| 7 |
self._block_size = 1024
|
| 8 |
|
| 9 |
+
def _cut_text(self, examples):
|
| 10 |
+
results = {
|
| 11 |
+
"input_ids": [],
|
| 12 |
+
"attention_mask": []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
}
|
| 14 |
+
for i in range(len(examples["input_ids"])):
|
| 15 |
+
_input_size = len(examples["input_ids"][i])
|
| 16 |
+
for j in range(_input_size // self._block_size):
|
| 17 |
+
results["input_ids"].append(examples["input_ids"][i][j*self._block_size:(j+1)*self._block_size])
|
| 18 |
+
results["attention_mask"].append(examples["attention_mask"][i][j * self._block_size:(j + 1) * self._block_size])
|
| 19 |
+
|
| 20 |
+
return results
|
| 21 |
+
|
| 22 |
+
# def _group_texts(self, examples):
|
| 23 |
+
# # Concatenate all texts.
|
| 24 |
+
# concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
|
| 25 |
+
# total_length = len(concatenated_examples[list(examples.keys())[0]])
|
| 26 |
+
# # We drop the small remainder, and if the total_length < block_size we exclude this batch and return an empty dict.
|
| 27 |
+
# # We could add padding if the model supported it instead of this drop, you can customize this part to your needs.
|
| 28 |
+
# if total_length > self._block_size:
|
| 29 |
+
# total_length = (total_length // self._block_size) * self._block_size
|
| 30 |
+
#
|
| 31 |
+
# # Split by chunks of max_len.
|
| 32 |
+
# result = {
|
| 33 |
+
# k: [t[i: i + self._block_size] for i in range(0, total_length, self._block_size)]
|
| 34 |
+
# for k, t in concatenated_examples.items()
|
| 35 |
+
# }
|
| 36 |
+
# return result
|
| 37 |
|
| 38 |
def __call__(self, examples, return_tensors=None, **kwargs):
|
| 39 |
if "text" in examples and not "images" in examples:
|
| 40 |
encoded_text = self.tokenizer(examples["text"], return_tensors=return_tensors)
|
| 41 |
+
encoding = self._cut_text(encoded_text)
|
| 42 |
elif "text" in examples and "images" in examples:
|
| 43 |
encoding = super().__call__(examples["text"], examples["images"], return_tensors, **kwargs)
|
| 44 |
|