impresso-project
/

ner-stacked-bert-multilingual

@@ -121,12 +121,12 @@ def get_entities(tokens, tags, confidences, text):
                 entities.append(
                     {
-                        "entity": original_label,
                         "score": round(
                             np.average(confidences[idx : idx + len(subtree)]) * 100, 2
                         ),
                         "index": (idx, idx + len(subtree)),
-                        "text": text[
                             entity_start_position:entity_end_position
                         ],  # original_string,
                         "start": entity_start_position,
@@ -211,8 +211,8 @@ def attach_comp_to_closest(entities):
     valid_entity_types = {"org", "pers", "org.ent", "pers.ind"}
     # Separate "comp.function" and "comp.name" entities from other entities
-    comp_entities = [ent for ent in entities if ent["entity"].startswith("comp")]
-    other_entities = [ent for ent in entities if not ent["entity"].startswith("comp")]
     for comp_entity in comp_entities:
         closest_entity = None
@@ -227,7 +227,7 @@ def attach_comp_to_closest(entities):
             # Ensure that the other entity's type is valid for the attachment
             if (
                 distance < min_distance
-                and other_entity["entity"].split(".")[0] in valid_entity_types
             ):
                 # Additional context check: Ensure the comp_entity's text doesn't conflict with the target entity's text
                 if not conflicting_context(comp_entity, other_entity):
@@ -237,9 +237,9 @@ def attach_comp_to_closest(entities):
         # Attach the "comp.function" entity using the suffix of 'entity' field if a valid entity is found
         if closest_entity:
             # Extract the suffix (e.g., "comp.title" becomes "title")
-            suffix = comp_entity["entity"].split(".")[-1]
             closest_entity[suffix] = comp_entity[
-                "text"
             ]  # Attach the text using the suffix as the key
     return other_entities
@@ -250,19 +250,17 @@ def conflicting_context(comp_entity, target_entity):
     Determines if there is a conflict in the context between the comp_entity and the target entity.
     For example, don't attach 'Maximilien Robespierre' as a name to 'Marie Antoinette'.
     """
-    comp_text = comp_entity["text"].lower()
-    target_text = target_entity["text"].lower()
     # Case 1: Avoid attaching the name "Washington" to "George Washington" if it relates to a location.
-    if "pers" in target_entity["entity"] and comp_entity["entity"].startswith(
-        "comp.name"
-    ):
         # Ensure we allow attaching "Washington" to "George Washington" but not to other irrelevant contexts
         if comp_text not in target_text and target_text.startswith("george washington"):
             return True  # Conflict: Unrelated or partial name mismatch
     # Case 2: Ensure the target entity text matches the role or function (e.g., "President of the United States").
-    if "pers" in target_entity["entity"] and comp_entity["entity"].startswith(
         "comp.function"
     ):
         # Ensure the function context makes sense (e.g., attach 'President' to 'George Washington')
@@ -300,14 +298,14 @@ def repair_names_in_entities(entities):
     from the text of the entity if a partial name (e.g., 'Washington') is incorrectly attached.
     """
     for entity in entities:
-        if "name" in entity and "pers" in entity["entity"]:
             name = entity["name"].lower()
-            text = entity["text"].lower()
             # Check if the attached name is part of the entity's text
             if name in text:
                 # Extract the full name from the text by splitting around the attached name
-                full_name = extract_name_from_text(entity["text"], name)
                 entity["name"] = (
                     full_name  # Replace the partial name with the full name
                 )
@@ -321,13 +319,13 @@ def postprocess_entities(entities):
     # Loop over the entities and prioritize the one with the most dots
     for entity in entities:
-        entity_text = entity["text"]
-        num_dots = entity["entity"].count(".")
         # If the entity text is new, or this entity has more dots, update the map
         if (
             entity_text not in entity_map
-            or entity_map[entity_text]["entity"].count(".") < num_dots
         ):
             entity_map[entity_text] = entity
@@ -348,10 +346,10 @@ def remove_included_entities(entities):
     for i, entity in enumerate(entities):
         is_included = False
         for other_entity in entities:
-            if entity["text"] != other_entity["text"]:
-                if "comp" in other_entity["entity"]:
                     # Check if entity's text is a substring of another entity's text
-                    if entity["text"] in other_entity["text"]:
                         is_included = True
                         break
         if not is_included:
@@ -367,8 +365,8 @@ stop_words = stopwords(["en", "fr", "de"])
 def remove_trailing_stopwords(entities):
     # This function removes stopwords from both the beginning and end of each entity's text
     for entity in entities:
-        if "comp" not in entity["entity"]:
-            words = entity["text"].split()
             # Remove stopwords from the beginning
             while words and words[0].lower() in stop_words:
@@ -379,7 +377,7 @@ def remove_trailing_stopwords(entities):
                 words.pop()  # Remove the last word if it's a stopword
             # Join the remaining words back together and update the entity's text
-            entity["text"] = " ".join(words)
     return entities

                 entities.append(
                     {
+                        "type": original_label,
                         "score": round(
                             np.average(confidences[idx : idx + len(subtree)]) * 100, 2
                         ),
                         "index": (idx, idx + len(subtree)),
+                        "surface": text[
                             entity_start_position:entity_end_position
                         ],  # original_string,
                         "start": entity_start_position,
     valid_entity_types = {"org", "pers", "org.ent", "pers.ind"}
     # Separate all "comp.*" entities (e.g. "comp.function", "comp.name") from other entities
+    comp_entities = [ent for ent in entities if ent["type"].startswith("comp")]
+    other_entities = [ent for ent in entities if not ent["type"].startswith("comp")]
     for comp_entity in comp_entities:
         closest_entity = None
             # Ensure that the other entity's type is valid for the attachment
             if (
                 distance < min_distance
+                and other_entity["type"].split(".")[0] in valid_entity_types
             ):
                 # Additional context check: Ensure the comp_entity's text doesn't conflict with the target entity's text
                 if not conflicting_context(comp_entity, other_entity):
         # Attach the comp entity's surface text using the suffix of its 'type' field as the key, if a valid entity is found
         if closest_entity:
             # Extract the suffix (e.g., "comp.title" becomes "title")
+            suffix = comp_entity["type"].split(".")[-1]
             closest_entity[suffix] = comp_entity[
+                "surface"
             ]  # Attach the text using the suffix as the key
     return other_entities
     Determines if there is a conflict in the context between the comp_entity and the target entity.
     For example, don't attach 'Maximilien Robespierre' as a name to 'Marie Antoinette'.
     """
+    comp_text = comp_entity["surface"].lower()
+    target_text = target_entity["surface"].lower()
     # Case 1: Avoid attaching the name "Washington" to "George Washington" if it relates to a location.
+    if "pers" in target_entity["type"] and comp_entity["type"].startswith("comp.name"):
         # Ensure we allow attaching "Washington" to "George Washington" but not to other irrelevant contexts
         if comp_text not in target_text and target_text.startswith("george washington"):
             return True  # Conflict: Unrelated or partial name mismatch
     # Case 2: Ensure the target entity text matches the role or function (e.g., "President of the United States").
+    if "pers" in target_entity["type"] and comp_entity["type"].startswith(
         "comp.function"
     ):
         # Ensure the function context makes sense (e.g., attach 'President' to 'George Washington')
     from the text of the entity if a partial name (e.g., 'Washington') is incorrectly attached.
     """
     for entity in entities:
+        if "name" in entity and "pers" in entity["type"]:
             name = entity["name"].lower()
+            text = entity["surface"].lower()
             # Check if the attached name is part of the entity's text
             if name in text:
                 # Extract the full name from the text by splitting around the attached name
+                full_name = extract_name_from_text(entity["surface"], name)
                 entity["name"] = (
                     full_name  # Replace the partial name with the full name
                 )
     # Loop over the entities and prioritize the one with the most dots
     for entity in entities:
+        entity_text = entity["surface"]
+        num_dots = entity["type"].count(".")
         # If the entity text is new, or this entity has more dots, update the map
         if (
             entity_text not in entity_map
+            or entity_map[entity_text]["type"].count(".") < num_dots
         ):
             entity_map[entity_text] = entity
     for i, entity in enumerate(entities):
         is_included = False
         for other_entity in entities:
+            if entity["surface"] != other_entity["surface"]:
+                if "comp" in other_entity["type"]:
                     # Check if entity's text is a substring of another entity's text
+                    if entity["surface"] in other_entity["surface"]:
                         is_included = True
                         break
         if not is_included:
 def remove_trailing_stopwords(entities):
     # This function removes stopwords from both the beginning and end of each entity's text
     for entity in entities:
+        if "comp" not in entity["type"]:
+            words = entity["surface"].split()
             # Remove stopwords from the beginning
             while words and words[0].lower() in stop_words:
                 words.pop()  # Remove the last word if it's a stopword
             # Join the remaining words back together and update the entity's text
+            entity["surface"] = " ".join(words)
     return entities