Update generic_ner.py
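This change simplifies `preprocess` and `_forward` in `MultitaskTokenClassificationPipeline`: language detection and sentence segmentation are dropped, the whole input text is tokenized as a single example (padded and truncated to 512 tokens), and `_forward` performs one batched forward pass instead of looping over sentences and concatenating per-task logits. The previous sentence-level implementations are retained as commented-out code.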
generic_ner.py  +60 -37  CHANGED
@@ -202,54 +202,77 @@ class MultitaskTokenClassificationPipeline(Pipeline):
         }
         return preprocess_kwargs, {}, {}
 
+    # def preprocess(self, text, **kwargs):
+    #
+    #     language = detect(text)
+    #     sentences = segment_and_trim_sentences(text, language, 512)
+    #
+    #     tokenized_inputs = self.tokenizer(
+    #         text,
+    #         padding="max_length",
+    #         truncation=True,
+    #         max_length=512,
+    #         return_tensors="pt",
+    #     )
+    #
+    #     text_sentences = [
+    #         tokenize(add_spaces_around_punctuation(sentence)) for sentence in sentences
+    #     ]
+    #     return tokenized_inputs, text_sentences, text
     def preprocess(self, text, **kwargs):
 
-        language = detect(text)
-        sentences = segment_and_trim_sentences(text, language, 512)
+        # sentences = segment_and_trim_sentences(text, language, 512)
 
         tokenized_inputs = self.tokenizer(
-            text,
-            padding="max_length",
-            truncation=True,
-            max_length=512,
-            return_tensors="pt",
+            text, padding="max_length", truncation=True, max_length=512
         )
 
-        text_sentences = [
-            tokenize(add_spaces_around_punctuation(sentence)) for sentence in sentences
-        ]
-        return tokenized_inputs, text_sentences, text
+        text_sentence = tokenize(add_spaces_around_punctuation(text))
+        return tokenized_inputs, text_sentence, text
 
     def _forward(self, inputs):
         inputs, text_sentences, text = inputs
-        all_logits = {}
-
-        for i in range(len(text_sentences)):
-            print(inputs["input_ids"][i].shape)
-            input_ids = torch.tensor([inputs["input_ids"][i]], dtype=torch.long).to(
-                self.model.device
-            )
-            attention_mask = torch.tensor(
-                [inputs["attention_mask"][i]], dtype=torch.long
-            ).to(self.model.device)
-
-            with torch.no_grad():
-                outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
-
-            # Accumulate logits for each task
-            if not all_logits:
-                all_logits = {task: logits for task, logits in outputs.logits.items()}
-            else:
-                for task in all_logits:
-                    all_logits[task] = torch.cat(
-                        (all_logits[task], outputs.logits[task]), dim=1
-                    )
-
-        # Replace outputs.logits with accumulated logits
-        outputs.logits = all_logits
-
+        input_ids = torch.tensor([inputs["input_ids"]], dtype=torch.long).to(
+            self.model.device
+        )
+        print(input_ids.shape)
+        attention_mask = torch.tensor([inputs["attention_mask"]], dtype=torch.long).to(
+            self.model.device
+        )
+        with torch.no_grad():
+            outputs = self.model(input_ids, attention_mask)
         return outputs, text_sentences, text
 
+    # def _forward(self, inputs):
+    #     inputs, text_sentences, text = inputs
+    #     all_logits = {}
+    #
+    #     for i in range(len(text_sentences)):
+    #         print(inputs["input_ids"][i].shape)
+    #         input_ids = torch.tensor([inputs["input_ids"][i]], dtype=torch.long).to(
+    #             self.model.device
+    #         )
+    #         attention_mask = torch.tensor(
+    #             [inputs["attention_mask"][i]], dtype=torch.long
+    #         ).to(self.model.device)
+    #
+    #         with torch.no_grad():
+    #             outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
+    #
+    #         # Accumulate logits for each task
+    #         if not all_logits:
+    #             all_logits = {task: logits for task, logits in outputs.logits.items()}
+    #         else:
+    #             for task in all_logits:
+    #                 all_logits[task] = torch.cat(
+    #                     (all_logits[task], outputs.logits[task]), dim=1
+    #                 )
+    #
+    #     # Replace outputs.logits with accumulated logits
+    #     outputs.logits = all_logits
+    #
+    #     return outputs, text_sentences, text
+
     def postprocess(self, outputs, **kwargs):
         """
         Postprocess the outputs of the model
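For context, the handoff between the new preprocess and _forward can be reproduced outside the pipeline. A minimal sketch, assuming a generic token-classification checkpoint (bert-base-cased is a placeholder here, not the multitask model this repo ships): because preprocess no longer passes return_tensors="pt", the tokenizer returns plain Python lists, and _forward adds the batch dimension itself by wrapping each list in torch.tensor([...]).

import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Placeholder checkpoint; the actual pipeline uses the multitask model from this repo.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
model = AutoModelForTokenClassification.from_pretrained("bert-base-cased")
model.eval()

# As in the new preprocess: no return_tensors, so the encoding holds plain lists of ints.
enc = tokenizer("A sample sentence.", padding="max_length", truncation=True, max_length=512)

# As in the new _forward: wrap each list to add the batch dimension by hand.
input_ids = torch.tensor([enc["input_ids"]], dtype=torch.long).to(model.device)
attention_mask = torch.tensor([enc["attention_mask"]], dtype=torch.long).to(model.device)

with torch.no_grad():
    outputs = model(input_ids, attention_mask)

print(outputs.logits.shape)  # torch.Size([1, 512, num_labels])

One tradeoff worth noting: the new path truncates anything beyond 512 tokens in a single window, whereas the removed sentence-level path segmented long inputs before tokenizing.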