Update generic_ner.py
generic_ner.py  +114 -59  CHANGED
@@ -1,15 +1,15 @@
-from transformers import Pipeline
 import numpy as np
-import torch
 from nltk.chunk import conlltags2tree
 from nltk import pos_tag
 from nltk.tree import Tree
-import string
-import torch.nn.functional as F
-import re
-
-
 import re, string
+import pysbd
+import torch
+import torch.nn.functional as F
+from transformers import Pipeline
+from langdetect import detect
+from nltk.tokenize import sent_tokenize
+from typing import List
 
 
 def tokenize(text):
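The import block now pulls in the pieces the new preprocessing path needs: `pysbd` for sentence segmentation and `langdetect` for picking the segmentation language (`sent_tokenize` and `List` are imported here but unused in the hunks shown). A minimal sketch, not part of the commit, of how the two libraries hand off:

from langdetect import detect
import pysbd

# detect() returns an ISO 639-1 code such as "en" or "de"; pysbd
# accepts the same code as its `language` argument.
text = "Dr. Smith arrived at 5 p.m. He was late."
lang = detect(text)
segmenter = pysbd.Segmenter(language=lang, clean=False)
print(segmenter.segment(text))  # abbreviation-aware sentence list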
@@ -88,14 +88,20 @@ def get_entities(tokens, tags, confidences, text):
         entity_start_position = indices[0]
         entity_end_position = indices[1]
         if (
-            "_".join(…
+            "_".join(
+                [original_label, original_string, str(entity_start_position)]
+            )
             in already_done
         ):
             continue
         else:
             already_done.append(
                 "_".join(
-                    […
+                    [
+                        original_label,
+                        original_string,
+                        str(entity_start_position),
+                    ]
                 )
             )
             entities.append(
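The rewritten condition spells out the deduplication key in `get_entities`: an entity is skipped only when the same label and surface string occur at the same start offset, so identical strings appearing elsewhere in the text are kept. A standalone sketch of that key logic, with `seen_before` as a hypothetical wrapper (variable names mirror the diff):

already_done = []

def seen_before(original_label, original_string, entity_start_position):
    # Key = label + surface string + start offset, as in the diff.
    key = "_".join([original_label, original_string, str(entity_start_position)])
    if key in already_done:
        return True
    already_done.append(key)
    return False

assert not seen_before("loc", "Paris", 10)  # first mention at offset 10
assert seen_before("loc", "Paris", 10)      # exact duplicate: skipped
assert not seen_before("loc", "Paris", 87)  # same string, new offset: kept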
@@ -141,6 +147,37 @@ def realign(
     return words_list, preds_list, confidence_list
 
 
+def segment_and_trim_sentences(article, language, max_length):
+
+    try:
+        segmenter = pysbd.Segmenter(language=language, clean=False)
+    except:
+        segmenter = pysbd.Segmenter(language="en", clean=False)
+
+    sentences = segmenter.segment(article)
+
+    trimmed_sentences = []
+    for sentence in sentences:
+        while len(sentence) > max_length:
+            # Find the last space within max_length
+            cut_index = sentence.rfind(" ", 0, max_length)
+            if cut_index == -1:
+                # If no space found, forcibly cut at max_length
+                cut_index = max_length
+
+            # Cut the sentence and add the first part to trimmed sentences
+            trimmed_sentences.append(sentence[:cut_index])
+
+            # Update the sentence to be the remaining part
+            sentence = sentence[cut_index:].lstrip()
+
+        # Add the remaining part of the sentence if it's not empty
+        if sentence:
+            trimmed_sentences.append(sentence)
+
+    return trimmed_sentences
+
+
 # List of additional "strange" punctuation marks
 additional_punctuation = "‘’“”„«»•–—―‣◦…§¶†‡‰′″〈〉"
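`segment_and_trim_sentences` segments with `pysbd`, falling back to an English segmenter when the language code is unsupported, then splits any sentence longer than `max_length` characters at the last space before the limit (or hard-cuts when there is no space). `preprocess` below calls it with 512, mirroring the tokenizer's `max_length`, though the cap here counts characters rather than tokens. A hypothetical call with a deliberately small cap to make the trimming visible:

chunks = segment_and_trim_sentences(
    "This opening sentence is long enough to need trimming. Short one.",
    language="en",
    max_length=30,
)
for chunk in chunks:
    assert len(chunk) <= 30  # every chunk respects the cap
    print(repr(chunk))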
@@ -164,56 +201,74 @@ class MultitaskTokenClassificationPipeline(Pipeline):
         }
         return preprocess_kwargs, {}, {}
 
-        [… 24 removed lines not preserved in this view …]
-        :param kwargs:
-        :return:
-        """
-        tokens_result, text_sentence, text = outputs
-
-        predictions = {}
-        confidence_scores = {}
-        for task, logits in tokens_result.logits.items():
-            predictions[task] = torch.argmax(logits, dim=-1).tolist()
-            confidence_scores[task] = F.softmax(logits, dim=-1).tolist()
-
-        decoded_predictions = {}
-        for task, preds in predictions.items():
-            decoded_predictions[task] = [
-                [self.id2label[task][label] for label in seq] for seq in preds
-            ]
-        entities = {}
-        for task, preds in predictions.items():
-            words_list, preds_list, confidence_list = realign(
-                text_sentence,
-                preds[0],
-                confidence_scores[task][0],
-                self.tokenizer,
-                self.id2label[task],
+class MultitaskTokenClassificationPipeline(Pipeline):
+
+    def _sanitize_parameters(self, **kwargs):
+        preprocess_kwargs = {}
+        if "text" in kwargs:
+            preprocess_kwargs["text"] = kwargs["text"]
+        self.label_map = self.model.config.label_map
+        self.id2label = {
+            task: {id_: label for label, id_ in labels.items()}
+            for task, labels in self.label_map.items()
+        }
+        return preprocess_kwargs, {}, {}
+
+    def preprocess(self, text, **kwargs):
+
+        language = detect(text)
+        sentences = segment_and_trim_sentences(text, language, 512)
+
+        tokenized_inputs = self.tokenizer(
+            sentences,
+            padding="max_length",
+            truncation=True,
+            max_length=512,
+            return_tensors="pt",
         )
 
-
+        text_sentence = [
+            tokenize(add_spaces_around_punctuation(sentence))
+            for sentence in sentences
+        ]
+        return tokenized_inputs, text_sentence, text
+
+    def _forward(self, inputs):
+        inputs, text_sentence, text = inputs
+        input_ids = inputs["input_ids"].to(self.model.device)
+        attention_mask = inputs["attention_mask"].to(self.model.device)
+
+        with torch.no_grad():
+            outputs = self.model(input_ids, attention_mask)
+
+        return outputs, text_sentence, text
+
+    def postprocess(self, outputs, **kwargs):
+        tokens_result, text_sentence, text = outputs
+
+        predictions = {}
+        confidence_scores = {}
+        for task, logits in tokens_result.logits.items():
+            predictions[task] = torch.argmax(logits, dim=-1).tolist()
+            confidence_scores[task] = F.softmax(logits, dim=-1).tolist()
+
+        decoded_predictions = {}
+        for task, preds in predictions.items():
+            decoded_predictions[task] = [
+                [self.id2label[task][label] for label in seq] for seq in preds
+            ]
+        entities = {}
+        for task, preds in predictions.items():
+            words_list, preds_list, confidence_list = realign(
+                text_sentence,
+                preds[0],
+                confidence_scores[task][0],
+                self.tokenizer,
+                self.id2label[task],
+            )
+
+            entities[task] = get_entities(
+                words_list, preds_list, confidence_list, text
+            )
 
-
+        return entities
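Together the three methods give the pipeline a complete preprocess / _forward / postprocess path: detect the language, segment and cap the sentences, run the padded batch through the multitask head without gradients, then argmax/softmax each task's logits, realign them to words, and group them into entities. A hedged usage sketch, not from the commit; the checkpoint name is a placeholder, and it assumes a model whose config defines `label_map` and whose forward pass returns per-task logits:

from transformers import AutoModelForTokenClassification, AutoTokenizer

checkpoint = "org/multitask-ner-model"  # placeholder name
model = AutoModelForTokenClassification.from_pretrained(
    checkpoint, trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

nlp = MultitaskTokenClassificationPipeline(model=model, tokenizer=tokenizer)
entities = nlp("Albert Einstein was born in Ulm and later moved to Bern.")
for task, ents in entities.items():  # one entity list per task
    print(task, ents)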