Commit
·
c549c79
1
Parent(s):
4efcbf3
update handler
Browse files — generic_ner.py (+48 −4)
generic_ner.py
CHANGED
|
@@ -2,8 +2,9 @@ from transformers import Pipeline
|
|
| 2 |
import numpy as np
|
| 3 |
import torch
|
| 4 |
import nltk
|
| 5 |
-
|
| 6 |
-
nltk.download(
|
|
|
|
| 7 |
from nltk.chunk import conlltags2tree
|
| 8 |
from nltk import pos_tag
|
| 9 |
from nltk.tree import Tree
|
|
@@ -107,9 +108,13 @@ def get_entities(tokens, tags, confidences, text):
|
|
| 107 |
entities.append(
|
| 108 |
{
|
| 109 |
"entity": original_label,
|
| 110 |
-
"score": round(
|
|
|
|
|
|
|
| 111 |
"index": (idx, idx + len(subtree)),
|
| 112 |
-
"word": text[
|
|
|
|
|
|
|
| 113 |
"start": entity_start_position,
|
| 114 |
"end": entity_end_position,
|
| 115 |
}
|
|
@@ -221,6 +226,44 @@ class MultitaskTokenClassificationPipeline(Pipeline):
|
|
| 221 |
outputs = self.model(input_ids, attention_mask)
|
| 222 |
return outputs, text_sentences, text
|
| 223 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 224 |
|
| 225 |
def postprocess(self, outputs, **kwargs):
|
| 226 |
"""
|
|
@@ -249,4 +292,5 @@ class MultitaskTokenClassificationPipeline(Pipeline):
|
|
| 249 |
|
| 250 |
entities[task] = get_entities(words_list, preds_list, confidence_list, text)
|
| 251 |
|
|
|
|
| 252 |
return entities
|
|
|
|
| 2 |
import numpy as np
|
| 3 |
import torch
|
| 4 |
import nltk
|
| 5 |
+
|
| 6 |
+
nltk.download("averaged_perceptron_tagger")
|
| 7 |
+
nltk.download("averaged_perceptron_tagger_eng")
|
| 8 |
from nltk.chunk import conlltags2tree
|
| 9 |
from nltk import pos_tag
|
| 10 |
from nltk.tree import Tree
|
|
|
|
| 108 |
entities.append(
|
| 109 |
{
|
| 110 |
"entity": original_label,
|
| 111 |
+
"score": round(
|
| 112 |
+
np.average(confidences[idx : idx + len(subtree)]) * 100, 2
|
| 113 |
+
),
|
| 114 |
"index": (idx, idx + len(subtree)),
|
| 115 |
+
"word": text[
|
| 116 |
+
entity_start_position:entity_end_position
|
| 117 |
+
], # original_string,
|
| 118 |
"start": entity_start_position,
|
| 119 |
"end": entity_end_position,
|
| 120 |
}
|
|
|
|
| 226 |
outputs = self.model(input_ids, attention_mask)
|
| 227 |
return outputs, text_sentences, text
|
| 228 |
|
| 229 |
+
def is_within(self, entity1, entity2):
    """Return True when *entity1*'s span lies entirely inside *entity2*'s.

    Both arguments are entity dicts carrying integer "start" and "end"
    character offsets; containment is inclusive at both ends.
    """
    inner_start, inner_end = entity1["start"], entity1["end"]
    outer_start, outer_end = entity2["start"], entity2["end"]
    return outer_start <= inner_start and inner_end <= outer_end
| 232 |
+
|
| 233 |
+
def postprocess_entities(self, ner_results):
    """Flatten per-task NER results and fold nested entities into their parents.

    Parameters
    ----------
    ner_results : dict
        Maps a task name to a list of entity dicts; each entity dict carries
        at least "entity" (a dotted label string) plus integer "start" and
        "end" character offsets.

    Returns
    -------
    list
        The outermost entities, ordered by (start ascending, end descending).
        Any entity whose span is contained in an earlier-kept entity is
        attached to that parent under a key derived from the last component
        of its dotted label, instead of appearing at the top level.

    Notes
    -----
    Parent entity dicts are mutated in place, so the dicts inside
    ``ner_results`` gain the nested-entity list fields as a side effect.
    """
    # Collect every entity from every task into a single working list.
    # (Iterate the values directly instead of indexing by key.)
    all_entities = []
    for task_entities in ner_results.values():
        all_entities.extend(task_entities)

    # Sort by start ascending and end descending so that any containing
    # entity is processed before the entities nested inside it.
    all_entities.sort(key=lambda ent: (ent["start"], -ent["end"]))

    final_entities = []
    for entity in all_entities:
        nested = False

        # Attach the entity to the first already-kept entity containing it.
        for parent_entity in final_entities:
            if self.is_within(entity, parent_entity):
                # The last part of the dotted label names the parent field,
                # e.g. "pers.name" -> "name".
                field_name = entity["entity"].split(".")[-1]
                parent_entity.setdefault(field_name, []).append(entity)
                nested = True
                break

        if not nested:
            # Not contained in any kept entity: a new outermost entity.
            final_entities.append(entity)

    return final_entities
| 267 |
|
| 268 |
def postprocess(self, outputs, **kwargs):
|
| 269 |
"""
|
|
|
|
| 292 |
|
| 293 |
entities[task] = get_entities(words_list, preds_list, confidence_list, text)
|
| 294 |
|
| 295 |
+
print(self.postprocess_entities(entities))
|
| 296 |
return entities
|