impresso-project
/

ner-stacked-bert-multilingual

@@ -1,15 +1,16 @@
 import numpy as np
 from nltk.chunk import conlltags2tree
 from nltk import pos_tag
 from nltk.tree import Tree
-import re, string
-import pysbd
-import torch
 import torch.nn.functional as F
-from transformers import Pipeline
 from langdetect import detect
-from nltk.tokenize import sent_tokenize
-from typing import List
 def tokenize(text):
@@ -201,74 +202,59 @@ class MultitaskTokenClassificationPipeline(Pipeline):
         }
         return preprocess_kwargs, {}, {}
-    class MultitaskTokenClassificationPipeline(Pipeline):  # removed: nested class carrying the same name as the enclosing pipeline class in the hunk header
-        def _sanitize_parameters(self, **kwargs):
-            preprocess_kwargs = {}
-            if "text" in kwargs:
-                preprocess_kwargs["text"] = kwargs["text"]
-            self.label_map = self.model.config.label_map  # per-task label -> id mapping read from the model config
-            self.id2label = {  # invert each task's mapping to id -> label
-                task: {id_: label for label, id_ in labels.items()}
-                for task, labels in self.label_map.items()
-            }
-            return preprocess_kwargs, {}, {}  # only preprocess receives kwargs; the other two dicts are empty
-        def preprocess(self, text, **kwargs):
-            language = detect(text)
-            sentences = segment_and_trim_sentences(text, language, 512)
-            tokenized_inputs = self.tokenizer(
-                sentences,  # here the segmented sentences, not the raw text, are tokenized
-                padding="max_length",
-                truncation=True,
-                max_length=512,
-                return_tensors="pt",
-            )
-            text_sentence = [  # one tokenize() result per sentence
-                tokenize(add_spaces_around_punctuation(sentence))
-                for sentence in sentences
             ]
-            return tokenized_inputs, text_sentence, text
-        def _forward(self, inputs):
-            inputs, text_sentence, text = inputs
-            input_ids = inputs["input_ids"].to(self.model.device)
-            attention_mask = inputs["attention_mask"].to(self.model.device)
-            with torch.no_grad():
-                outputs = self.model(input_ids, attention_mask)
-            return outputs, text_sentence, text
-        def postprocess(self, outputs, **kwargs):
-            tokens_result, text_sentence, text = outputs
-            predictions = {}
-            confidence_scores = {}
-            for task, logits in tokens_result.logits.items():
-                predictions[task] = torch.argmax(logits, dim=-1).tolist()
-                confidence_scores[task] = F.softmax(logits, dim=-1).tolist()
-            decoded_predictions = {}  # NOTE(review): filled below but never read in this method
-            for task, preds in predictions.items():
-                decoded_predictions[task] = [
-                    [self.id2label[task][label] for label in seq] for seq in preds
-                ]
-            entities = {}
-            for task, preds in predictions.items():
-                words_list, preds_list, confidence_list = realign(
-                    text_sentence,
-                    preds[0],  # NOTE(review): only index 0 is passed on although several sentences were tokenized — confirm intent
-                    confidence_scores[task][0],
-                    self.tokenizer,
-                    self.id2label[task],
-                )
-                entities[task] = get_entities(
-                    words_list, preds_list, confidence_list, text
-                )
-            return entities

+from transformers import Pipeline
 import numpy as np
+import torch
 from nltk.chunk import conlltags2tree
 from nltk import pos_tag
 from nltk.tree import Tree
+import string
 import torch.nn.functional as F
 from langdetect import detect
+import re, string
+import pysbd
 def tokenize(text):
         }
         return preprocess_kwargs, {}, {}
+    def preprocess(self, text, **kwargs):  # build the model inputs for one raw text; kwargs is not read in this body
+        language = detect(text)  # language guess from langdetect
+        sentences = segment_and_trim_sentences(text, language, 512)  # NOTE(review): result is never used below — confirm whether this call is still needed
+        tokenized_inputs = self.tokenizer(
+            text, padding="max_length", truncation=True, max_length=512  # whole text as one sequence, padded/truncated to 512; no return_tensors, _forward builds the tensors
+        )
+        text_sentence = tokenize(add_spaces_around_punctuation(text))  # presumably the word-level tokens that postprocess hands to realign() — confirm
+        return tokenized_inputs, text_sentence, text  # this 3-tuple is unpacked again in _forward
+    def _forward(self, inputs):  # run the model on the 3-tuple returned by preprocess
+        inputs, text_sentence, text = inputs  # rebinds `inputs` to the tokenizer output
+        input_ids = torch.tensor([inputs["input_ids"]], dtype=torch.long).to(  # wrapped in a list before conversion — presumably to add a leading batch axis of size 1
+            self.model.device
+        )
+        attention_mask = torch.tensor([inputs["attention_mask"]], dtype=torch.long).to(  # same wrapping and device as input_ids
+            self.model.device
+        )
+        with torch.no_grad():  # no gradient tracking during the forward pass
+            outputs = self.model(input_ids, attention_mask)
+        return outputs, text_sentence, text  # text_sentence and text are passed through unchanged for postprocess
+    def postprocess(self, outputs, **kwargs):  # turn the model's per-task logits into entity results; kwargs is not read in this body
+        """
+        Postprocess the outputs of the model
+        :param outputs:
+        :param kwargs:
+        :return:
+        """
+        tokens_result, text_sentence, text = outputs  # the 3-tuple returned by _forward
+        predictions = {}
+        confidence_scores = {}
+        for task, logits in tokens_result.logits.items():  # logits is iterated as a mapping; keys are reused to index self.id2label
+            predictions[task] = torch.argmax(logits, dim=-1).tolist()  # highest-scoring label id along the last axis
+            confidence_scores[task] = F.softmax(logits, dim=-1).tolist()  # softmax over the last axis, kept as nested lists
+        decoded_predictions = {}  # NOTE(review): filled below but never read in this method
+        for task, preds in predictions.items():
+            decoded_predictions[task] = [
+                [self.id2label[task][label] for label in seq] for seq in preds
             ]
+        entities = {}
+        for task, preds in predictions.items():
+            words_list, preds_list, confidence_list = realign(
+                text_sentence,
+                preds[0],  # only the first sequence of the batch is used
+                confidence_scores[task][0],
+                self.tokenizer,
+                self.id2label[task],
+            )
+            entities[task] = get_entities(words_list, preds_list, confidence_list, text)
+        return entities  # dict keyed by task, values are whatever get_entities returns