emanuelaboros committed
Commit 471ce47 · verified · 1 Parent(s): 095fd51

Update generic_ner.py

Files changed (1)
  1. generic_ner.py +32 -46
generic_ner.py CHANGED
@@ -200,72 +200,58 @@ class MultitaskTokenClassificationPipeline(Pipeline):
         }
         return preprocess_kwargs, {}, {}

-    def chunk_text_exact(self, text, tokenizer, max_subtokens):
-        """
-        Splits text into exact subtoken chunks based on the tokenizer's max length.
-        """
-        subtokens = tokenizer.encode(text, add_special_tokens=False)
-        for i in range(0, len(subtokens), max_subtokens):
-            chunk = subtokens[i : i + max_subtokens]
-            yield tokenizer.decode(chunk, clean_up_tokenization_spaces=False)
-
     def preprocess(self, text, **kwargs):
-        # Get the model's max input length
-        max_input_length = self.tokenizer.model_max_length - 2  # Reserve space for [CLS] and [SEP]

-        # Split the text into subtoken chunks
-        text_chunks = list(self.chunk_text_exact(text, self.tokenizer, max_input_length))
+        tokenized_inputs = self.tokenizer(
+            text, padding="max_length", truncation=True, max_length=512
+        )

-        # Tokenize and add special tokens for each chunk
-        tokenized_chunks = [
-            self.tokenizer(
-                chunk, padding="max_length", truncation=True, max_length=self.tokenizer.model_max_length
-            )
-            for chunk in text_chunks
-        ]
-
-        return tokenized_chunks, text_chunks, text
+        text_sentence = tokenize(add_spaces_around_punctuation(text))
+        return tokenized_inputs, text_sentence, text

     def _forward(self, inputs):
-        tokenized_chunks, text_chunks, text = inputs
-        outputs = []
+        inputs, text_sentences, text = inputs
+        input_ids = torch.tensor([inputs["input_ids"]], dtype=torch.long).to(
+            self.model.device
+        )
+        attention_mask = torch.tensor([inputs["attention_mask"]], dtype=torch.long).to(
+            self.model.device
+        )
         with torch.no_grad():
-            for tokenized_input in tokenized_chunks:
-                input_ids = torch.tensor([tokenized_input["input_ids"]], dtype=torch.long).to(self.model.device)
-                attention_mask = torch.tensor([tokenized_input["attention_mask"]], dtype=torch.long).to(self.model.device)
-                outputs.append(self.model(input_ids, attention_mask))
-        return outputs, text_chunks, text
+            outputs = self.model(input_ids, attention_mask)
+        return outputs, text_sentences, text

-    def postprocess(self, outputs, **kwargs):
-        tokens_result, text_chunks, text = outputs

-        # Initialize variables for collecting results across chunks
-        predictions = {task: [] for task in self.label_map.keys()}
-        confidence_scores = {task: [] for task in self.label_map.keys()}
+    def postprocess(self, outputs, **kwargs):
+        """
+        Postprocess the outputs of the model
+        :param outputs:
+        :param kwargs:
+        :return:
+        """
+        tokens_result, text_sentence, text = outputs

-        # Collect predictions from each chunk
-        for chunk_result in tokens_result:
-            for task, logits in chunk_result.logits.items():
-                predictions[task].extend(torch.argmax(logits, dim=-1).tolist())
-                confidence_scores[task].extend(F.softmax(logits, dim=-1).tolist())
+        predictions = {}
+        confidence_scores = {}
+        for task, logits in tokens_result.logits.items():
+            predictions[task] = torch.argmax(logits, dim=-1).tolist()
+            confidence_scores[task] = F.softmax(logits, dim=-1).tolist()

-        # Decode and process the predictions
         decoded_predictions = {}
         for task, preds in predictions.items():
             decoded_predictions[task] = [
                 [self.id2label[task][label] for label in seq] for seq in preds
             ]
-
-        # Extract entities from the combined predictions
         entities = {}
         for task, preds in predictions.items():
             words_list, preds_list, confidence_list = realign(
-                text_chunks,
-                preds,
-                confidence_scores[task],
+                text_sentence,
+                preds[0],
+                confidence_scores[task][0],
                 self.tokenizer,
                 self.id2label[task],
             )
+
             entities[task] = get_entities(words_list, preds_list, confidence_list, text)
-            print(entities[task])
+
         return entities
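
The net effect of the change is that preprocess() no longer splits long inputs into exact subtoken chunks: a single tokenizer call with truncation=True, max_length=512 keeps only the first window, and _forward()/postprocess() now handle one sequence instead of a list of chunk outputs. Below is a minimal sketch of that truncation behaviour, not part of the commit; "bert-base-cased" is only a placeholder checkpoint, not the model this pipeline ships with.

# Sketch only, not part of the commit: illustrates what the new single-call
# tokenization does to inputs longer than one 512-token window.
# "bert-base-cased" is a placeholder checkpoint, not the pipeline's model.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
long_text = "Geneva " * 2000  # far more than one window of subtokens

# Removed behaviour: chunk the subtokens so no text is discarded.
subtokens = tokenizer.encode(long_text, add_special_tokens=False)
max_subtokens = tokenizer.model_max_length - 2  # reserve [CLS] and [SEP]
chunks = [
    subtokens[i : i + max_subtokens]
    for i in range(0, len(subtokens), max_subtokens)
]
print(len(subtokens), "subtokens ->", len(chunks), "chunks")

# New behaviour: one encoding, truncated to a single window.
encoded = tokenizer(long_text, padding="max_length", truncation=True, max_length=512)
print(len(encoded["input_ids"]))  # 512 -- text past the first window is dropped

If full coverage of longer documents is still needed, chunking (or an overlap-aware variant of it) would have to happen upstream of the pipeline, since this commit removes it from preprocess().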