impresso-project
/

ner-stacked-bert-multilingual

@@ -236,6 +236,21 @@ def attach_comp_to_closest(entities):
     return other_entities
 class MultitaskTokenClassificationPipeline(Pipeline):
     def _sanitize_parameters(self, **kwargs):
@@ -274,48 +289,48 @@ class MultitaskTokenClassificationPipeline(Pipeline):
         """Check if entity1 is fully within the bounds of entity2."""
         return entity1["start"] >= entity2["start"] and entity1["end"] <= entity2["end"]
-    def postprocess_entities(self, all_entities):
-        # Sort entities by start position, then by end position (to handle nested structures)
-        all_entities.sort(key=lambda x: (x["start"], -x["end"]))
-        # Create a new list for final processed entities
-        final_entities = []
-        # Process each entity and check for nesting
-        for i, entity in enumerate(all_entities):
-            nested = False
-            # Compare the current entity with already processed entities
-            for parent_entity in final_entities:
-                if self.is_within(entity, parent_entity):
-                    # If the current entity is nested, add it as a field in the parent entity
-                    main_field_name = entity["entity"].split(".")[0]
-                    field_name = entity["entity"].split(".")[
-                        -1
-                    ]  # Last part of the label as the field
-                    if main_field_name not in parent_entity["entity"]:
-                        # print(
-                        #     "main_field_name:",
-                        #     main_field_name,
-                        #     "parent_entity:",
-                        #     parent_entity["entity"],
-                        # )
-                        parent_entity[field_name] = entity["word"]
-                        nested = True
-                        break
-                    else:
-                        nested = True
-                    if "comp" in entity["entity"]:
-                        nested = True
-            if not nested:
-                # If not nested, add the entity as a new outermost entity
-                entity["text"] = entity["word"]
-                entity.pop("word")
-                final_entities.append(entity)
-        return final_entities
     def postprocess(self, outputs, **kwargs):
         """
@@ -355,10 +370,13 @@ class MultitaskTokenClassificationPipeline(Pipeline):
             if key not in ["NE-COARSE-LIT"]:
                 all_entities.extend(entities[key])
-        print("Skipping 1")
-        # all_entities = self.postprocess_entities(all_entities, text_sentence)
-        # print("After 1:")
-        # pprint(all_entities)
         # Attach "comp.function" entities to the closest non-"comp.function" entity
         all_entities = attach_comp_to_closest(all_entities)
         print("After 2:")

     return other_entities
def postprocess_entities(entities):
    """Collapse entities that share the same text, keeping the most specific label.

    Specificity is measured by the number of dots in the ``"entity"`` label
    (e.g. ``"loc.adm.town"`` is more specific than ``"loc.adm"``, which is
    more specific than ``"loc"``). When two entities with the same text are
    equally specific, the one seen first is kept.

    Args:
        entities: Iterable of entity dicts; each must provide the keys
            ``"text"`` and ``"entity"`` (the label string).

    Returns:
        A list with one entity dict per distinct text, ordered by the first
        occurrence of each text in ``entities``. The input dicts are reused,
        not copied.
    """
    # Maps entity text -> best (most specific) entity seen so far. A plain
    # dict keeps insertion order, so the output order is deterministic.
    entity_map = {}

    for entity in entities:
        entity_text = entity["text"]

        if entity_text not in entity_map:
            entity_map[entity_text] = entity
            continue

        # Only replace when the new label is strictly more specific, so an
        # already-stored fine-grained label is never downgraded by a coarser
        # dotted label that happens to come later.
        stored_depth = entity_map[entity_text]["entity"].count(".")
        if entity["entity"].count(".") > stored_depth:
            entity_map[entity_text] = entity

    # Return a list of entities (not the internal mapping) so downstream
    # steps that iterate over entity dicts keep working.
    return list(entity_map.values())
 class MultitaskTokenClassificationPipeline(Pipeline):
     def _sanitize_parameters(self, **kwargs):
         """Check if entity1 is fully within the bounds of entity2."""
         return entity1["start"] >= entity2["start"] and entity1["end"] <= entity2["end"]
+    # def postprocess_entities(self, all_entities):
+    #
+    #     # Sort entities by start position, then by end position (to handle nested structures)
+    #     all_entities.sort(key=lambda x: (x["start"], -x["end"]))
+    #
+    #     # Create a new list for final processed entities
+    #     final_entities = []
+    #
+    #     # Process each entity and check for nesting
+    #     for i, entity in enumerate(all_entities):
+    #         nested = False
+    #
+    #         # Compare the current entity with already processed entities
+    #         for parent_entity in final_entities:
+    #             if self.is_within(entity, parent_entity):
+    #
+    #                 # If the current entity is nested, add it as a field in the parent entity
+    #                 main_field_name = entity["entity"].split(".")[0]
+    #                 field_name = entity["entity"].split(".")[
+    #                     -1
+    #                 ]  # Last part of the label as the field
+    #                 if main_field_name not in parent_entity["entity"]:
+    #                     # print(
+    #                     #     "main_field_name:",
+    #                     #     main_field_name,
+    #                     #     "parent_entity:",
+    #                     #     parent_entity["entity"],
+    #                     # )
+    #                     parent_entity[field_name] = entity["word"]
+    #                     nested = True
+    #                     break
+    #                 else:
+    #                     nested = True
+    #                 if "comp" in entity["entity"]:
+    #                     nested = True
+    #         if not nested:
+    #             # If not nested, add the entity as a new outermost entity
+    #             entity["text"] = entity["word"]
+    #             entity.pop("word")
+    #             final_entities.append(entity)
+    #
+    #     return final_entities
     def postprocess(self, outputs, **kwargs):
         """
             if key not in ["NE-COARSE-LIT"]:
                 all_entities.extend(entities[key])
+        # print("Skipping 1")
+        all_entities = postprocess_entities(
+            all_entities,
+        )
+        print("After 1:")
+        pprint(all_entities)
         # Attach "comp.function" entities to the closest non-"comp.function" entity
         all_entities = attach_comp_to_closest(all_entities)
         print("After 2:")