Commit
·
9ccbbd3
1
Parent(s):
38b2f9d
update fields
Browse files- generic_ner.py +23 -25
generic_ner.py
CHANGED
|
@@ -121,12 +121,12 @@ def get_entities(tokens, tags, confidences, text):
|
|
| 121 |
|
| 122 |
entities.append(
|
| 123 |
{
|
| 124 |
-
"
|
| 125 |
"score": round(
|
| 126 |
np.average(confidences[idx : idx + len(subtree)]) * 100, 2
|
| 127 |
),
|
| 128 |
"index": (idx, idx + len(subtree)),
|
| 129 |
-
"
|
| 130 |
entity_start_position:entity_end_position
|
| 131 |
], # original_string,
|
| 132 |
"start": entity_start_position,
|
|
@@ -211,8 +211,8 @@ def attach_comp_to_closest(entities):
|
|
| 211 |
valid_entity_types = {"org", "pers", "org.ent", "pers.ind"}
|
| 212 |
|
| 213 |
# Separate "comp.function" and "comp.name" entities from other entities
|
| 214 |
-
comp_entities = [ent for ent in entities if ent["
|
| 215 |
-
other_entities = [ent for ent in entities if not ent["
|
| 216 |
|
| 217 |
for comp_entity in comp_entities:
|
| 218 |
closest_entity = None
|
|
@@ -227,7 +227,7 @@ def attach_comp_to_closest(entities):
|
|
| 227 |
# Ensure that the other entity's type is valid for the attachment
|
| 228 |
if (
|
| 229 |
distance < min_distance
|
| 230 |
-
and other_entity["
|
| 231 |
):
|
| 232 |
# Additional context check: Ensure the comp_entity's text doesn't conflict with the target entity's text
|
| 233 |
if not conflicting_context(comp_entity, other_entity):
|
|
@@ -237,9 +237,9 @@ def attach_comp_to_closest(entities):
|
|
| 237 |
# Attach the "comp.function" entity using the suffix of 'entity' field if a valid entity is found
|
| 238 |
if closest_entity:
|
| 239 |
# Extract the suffix (e.g., "comp.title" becomes "title")
|
| 240 |
-
suffix = comp_entity["
|
| 241 |
closest_entity[suffix] = comp_entity[
|
| 242 |
-
"
|
| 243 |
] # Attach the text using the suffix as the key
|
| 244 |
|
| 245 |
return other_entities
|
|
@@ -250,19 +250,17 @@ def conflicting_context(comp_entity, target_entity):
|
|
| 250 |
Determines if there is a conflict in the context between the comp_entity and the target entity.
|
| 251 |
For example, don't attach 'Maximilien Robespierre' as a name to 'Marie Antoinette'.
|
| 252 |
"""
|
| 253 |
-
comp_text = comp_entity["
|
| 254 |
-
target_text = target_entity["
|
| 255 |
|
| 256 |
# Case 1: Avoid attaching the name "Washington" to "George Washington" if it relates to a location.
|
| 257 |
-
if "pers" in target_entity["
|
| 258 |
-
"comp.name"
|
| 259 |
-
):
|
| 260 |
# Ensure we allow attaching "Washington" to "George Washington" but not to other irrelevant contexts
|
| 261 |
if comp_text not in target_text and target_text.startswith("george washington"):
|
| 262 |
return True # Conflict: Unrelated or partial name mismatch
|
| 263 |
|
| 264 |
# Case 2: Ensure the target entity text matches the role or function (e.g., "President of the United States").
|
| 265 |
-
if "pers" in target_entity["
|
| 266 |
"comp.function"
|
| 267 |
):
|
| 268 |
# Ensure the function context makes sense (e.g., attach 'President' to 'George Washington')
|
|
@@ -300,14 +298,14 @@ def repair_names_in_entities(entities):
|
|
| 300 |
from the text of the entity if a partial name (e.g., 'Washington') is incorrectly attached.
|
| 301 |
"""
|
| 302 |
for entity in entities:
|
| 303 |
-
if "name" in entity and "pers" in entity["
|
| 304 |
name = entity["name"].lower()
|
| 305 |
-
text = entity["
|
| 306 |
|
| 307 |
# Check if the attached name is part of the entity's text
|
| 308 |
if name in text:
|
| 309 |
# Extract the full name from the text by splitting around the attached name
|
| 310 |
-
full_name = extract_name_from_text(entity["
|
| 311 |
entity["name"] = (
|
| 312 |
full_name # Replace the partial name with the full name
|
| 313 |
)
|
|
@@ -321,13 +319,13 @@ def postprocess_entities(entities):
|
|
| 321 |
|
| 322 |
# Loop over the entities and prioritize the one with the most dots
|
| 323 |
for entity in entities:
|
| 324 |
-
entity_text = entity["
|
| 325 |
-
num_dots = entity["
|
| 326 |
|
| 327 |
# If the entity text is new, or this entity has more dots, update the map
|
| 328 |
if (
|
| 329 |
entity_text not in entity_map
|
| 330 |
-
or entity_map[entity_text]["
|
| 331 |
):
|
| 332 |
entity_map[entity_text] = entity
|
| 333 |
|
|
@@ -348,10 +346,10 @@ def remove_included_entities(entities):
|
|
| 348 |
for i, entity in enumerate(entities):
|
| 349 |
is_included = False
|
| 350 |
for other_entity in entities:
|
| 351 |
-
if entity["
|
| 352 |
-
if "comp" in other_entity["
|
| 353 |
# Check if entity's text is a substring of another entity's text
|
| 354 |
-
if entity["
|
| 355 |
is_included = True
|
| 356 |
break
|
| 357 |
if not is_included:
|
|
@@ -367,8 +365,8 @@ stop_words = stopwords(["en", "fr", "de"])
|
|
| 367 |
def remove_trailing_stopwords(entities):
|
| 368 |
# This function removes stopwords from both the beginning and end of each entity's text
|
| 369 |
for entity in entities:
|
| 370 |
-
if "comp" not in entity["
|
| 371 |
-
words = entity["
|
| 372 |
|
| 373 |
# Remove stopwords from the beginning
|
| 374 |
while words and words[0].lower() in stop_words:
|
|
@@ -379,7 +377,7 @@ def remove_trailing_stopwords(entities):
|
|
| 379 |
words.pop() # Remove the last word if it's a stopword
|
| 380 |
|
| 381 |
# Join the remaining words back together and update the entity's text
|
| 382 |
-
entity["
|
| 383 |
|
| 384 |
return entities
|
| 385 |
|
|
|
|
| 121 |
|
| 122 |
entities.append(
|
| 123 |
{
|
| 124 |
+
"type": original_label,
|
| 125 |
"score": round(
|
| 126 |
np.average(confidences[idx : idx + len(subtree)]) * 100, 2
|
| 127 |
),
|
| 128 |
"index": (idx, idx + len(subtree)),
|
| 129 |
+
"surface": text[
|
| 130 |
entity_start_position:entity_end_position
|
| 131 |
], # original_string,
|
| 132 |
"start": entity_start_position,
|
|
|
|
| 211 |
valid_entity_types = {"org", "pers", "org.ent", "pers.ind"}
|
| 212 |
|
| 213 |
# Separate all "comp.*" entities (e.g. "comp.function", "comp.name") from other entities
|
| 214 |
+
comp_entities = [ent for ent in entities if ent["type"].startswith("comp")]
|
| 215 |
+
other_entities = [ent for ent in entities if not ent["type"].startswith("comp")]
|
| 216 |
|
| 217 |
for comp_entity in comp_entities:
|
| 218 |
closest_entity = None
|
|
|
|
| 227 |
# Ensure that the other entity's type is valid for the attachment
|
| 228 |
if (
|
| 229 |
distance < min_distance
|
| 230 |
+
and other_entity["type"].split(".")[0] in valid_entity_types
|
| 231 |
):
|
| 232 |
# Additional context check: Ensure the comp_entity's text doesn't conflict with the target entity's text
|
| 233 |
if not conflicting_context(comp_entity, other_entity):
|
|
|
|
| 237 |
# Attach the comp entity's surface text to the closest entity, keyed by the suffix of its 'type' field, if a valid entity is found
|
| 238 |
if closest_entity:
|
| 239 |
# Extract the suffix (e.g., "comp.title" becomes "title")
|
| 240 |
+
suffix = comp_entity["type"].split(".")[-1]
|
| 241 |
closest_entity[suffix] = comp_entity[
|
| 242 |
+
"surface"
|
| 243 |
] # Attach the text using the suffix as the key
|
| 244 |
|
| 245 |
return other_entities
|
|
|
|
| 250 |
Determines if there is a conflict in the context between the comp_entity and the target entity.
|
| 251 |
For example, don't attach 'Maximilien Robespierre' as a name to 'Marie Antoinette'.
|
| 252 |
"""
|
| 253 |
+
comp_text = comp_entity["surface"].lower()
|
| 254 |
+
target_text = target_entity["surface"].lower()
|
| 255 |
|
| 256 |
# Case 1: Avoid attaching the name "Washington" to "George Washington" if it relates to a location.
|
| 257 |
+
if "pers" in target_entity["type"] and comp_entity["type"].startswith("comp.name"):
|
|
|
|
|
|
|
| 258 |
# Ensure we allow attaching "Washington" to "George Washington" but not to other irrelevant contexts
|
| 259 |
if comp_text not in target_text and target_text.startswith("george washington"):
|
| 260 |
return True # Conflict: Unrelated or partial name mismatch
|
| 261 |
|
| 262 |
# Case 2: Ensure the target entity text matches the role or function (e.g., "President of the United States").
|
| 263 |
+
if "pers" in target_entity["type"] and comp_entity["type"].startswith(
|
| 264 |
"comp.function"
|
| 265 |
):
|
| 266 |
# Ensure the function context makes sense (e.g., attach 'President' to 'George Washington')
|
|
|
|
| 298 |
from the text of the entity if a partial name (e.g., 'Washington') is incorrectly attached.
|
| 299 |
"""
|
| 300 |
for entity in entities:
|
| 301 |
+
if "name" in entity and "pers" in entity["type"]:
|
| 302 |
name = entity["name"].lower()
|
| 303 |
+
text = entity["surface"].lower()
|
| 304 |
|
| 305 |
# Check if the attached name is part of the entity's text
|
| 306 |
if name in text:
|
| 307 |
# Extract the full name from the text by splitting around the attached name
|
| 308 |
+
full_name = extract_name_from_text(entity["surface"], name)
|
| 309 |
entity["name"] = (
|
| 310 |
full_name # Replace the partial name with the full name
|
| 311 |
)
|
|
|
|
| 319 |
|
| 320 |
# Loop over the entities and, for each surface text, keep the entity whose type contains the most dots (on ties, the first one seen is kept)
|
| 321 |
for entity in entities:
|
| 322 |
+
entity_text = entity["surface"]
|
| 323 |
+
num_dots = entity["type"].count(".")
|
| 324 |
|
| 325 |
# If the entity text is new, or this entity has more dots, update the map
|
| 326 |
if (
|
| 327 |
entity_text not in entity_map
|
| 328 |
+
or entity_map[entity_text]["type"].count(".") < num_dots
|
| 329 |
):
|
| 330 |
entity_map[entity_text] = entity
|
| 331 |
|
|
|
|
| 346 |
for i, entity in enumerate(entities):
|
| 347 |
is_included = False
|
| 348 |
for other_entity in entities:
|
| 349 |
+
if entity["surface"] != other_entity["surface"]:
|
| 350 |
+
if "comp" in other_entity["type"]:
|
| 351 |
# Check if entity's text is a substring of another entity's text
|
| 352 |
+
if entity["surface"] in other_entity["surface"]:
|
| 353 |
is_included = True
|
| 354 |
break
|
| 355 |
if not is_included:
|
|
|
|
| 365 |
def remove_trailing_stopwords(entities):
|
| 366 |
# This function removes stopwords from both the beginning and end of each entity's text
|
| 367 |
for entity in entities:
|
| 368 |
+
if "comp" not in entity["type"]:
|
| 369 |
+
words = entity["surface"].split()
|
| 370 |
|
| 371 |
# Remove stopwords from the beginning
|
| 372 |
while words and words[0].lower() in stop_words:
|
|
|
|
| 377 |
words.pop() # Remove the last word if it's a stopword
|
| 378 |
|
| 379 |
# Join the remaining words back together and update the entity's text
|
| 380 |
+
entity["surface"] = " ".join(words)
|
| 381 |
|
| 382 |
return entities
|
| 383 |
|