Commit
·
ff295cd
1
Parent(s):
5c23370
debug
Browse files- generic_ner.py +27 -11
generic_ner.py
CHANGED
|
@@ -206,12 +206,11 @@ def add_spaces_around_punctuation(text):
|
|
| 206 |
return re.sub(r"([{}])".format(re.escape(all_punctuation)), r" \1 ", text)
|
| 207 |
|
| 208 |
|
| 209 |
-
# Function to find the closest entity to the "comp.function"
|
| 210 |
def attach_comp_to_closest(entities):
|
| 211 |
# Define valid entity types that can receive a "comp.function" or "comp.name" attachment
|
| 212 |
valid_entity_types = {"org", "pers", "org.ent", "pers.ind"}
|
| 213 |
|
| 214 |
-
# Separate "comp.function" entities from other entities
|
| 215 |
comp_entities = [ent for ent in entities if ent["entity"].startswith("comp")]
|
| 216 |
other_entities = [ent for ent in entities if not ent["entity"].startswith("comp")]
|
| 217 |
|
|
@@ -221,20 +220,27 @@ def attach_comp_to_closest(entities):
|
|
| 221 |
|
| 222 |
# Find the closest non-"comp.function" entity that is valid for attaching
|
| 223 |
for other_entity in other_entities:
|
| 224 |
-
distance = abs(
|
|
|
|
|
|
|
| 225 |
|
| 226 |
# Ensure that the other entity's type is valid for the attachment
|
| 227 |
-
if
|
|
|
|
|
|
|
|
|
|
| 228 |
# Additional context check: Ensure the comp_entity's text doesn't conflict with the target entity's text
|
| 229 |
if not conflicting_context(comp_entity, other_entity):
|
| 230 |
min_distance = distance
|
| 231 |
closest_entity = other_entity
|
| 232 |
|
| 233 |
-
# Attach the "comp.function"
|
| 234 |
if closest_entity:
|
| 235 |
# Extract the suffix (e.g., "comp.title" becomes "title")
|
| 236 |
suffix = comp_entity["entity"].split(".")[-1]
|
| 237 |
-
closest_entity[suffix] = comp_entity[
|
|
|
|
|
|
|
| 238 |
|
| 239 |
return other_entities
|
| 240 |
|
|
@@ -247,15 +253,25 @@ def conflicting_context(comp_entity, target_entity):
|
|
| 247 |
comp_text = comp_entity["text"].lower()
|
| 248 |
target_text = target_entity["text"].lower()
|
| 249 |
|
| 250 |
-
#
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 254 |
|
| 255 |
return False # No conflict
|
| 256 |
|
| 257 |
|
| 258 |
-
|
| 259 |
def postprocess_entities(entities):
|
| 260 |
# Step 1: Filter entities with the same text, keeping the one with the most dots in the 'entity' field
|
| 261 |
entity_map = {}
|
|
|
|
| 206 |
return re.sub(r"([{}])".format(re.escape(all_punctuation)), r" \1 ", text)
|
| 207 |
|
| 208 |
|
|
|
|
| 209 |
def attach_comp_to_closest(entities):
|
| 210 |
# Define valid entity types that can receive a "comp.function" or "comp.name" attachment
|
| 211 |
valid_entity_types = {"org", "pers", "org.ent", "pers.ind"}
|
| 212 |
|
| 213 |
+
# Separate "comp.function" and "comp.name" entities from other entities
|
| 214 |
comp_entities = [ent for ent in entities if ent["entity"].startswith("comp")]
|
| 215 |
other_entities = [ent for ent in entities if not ent["entity"].startswith("comp")]
|
| 216 |
|
|
|
|
| 220 |
|
| 221 |
# Find the closest non-"comp.function" entity that is valid for attaching
|
| 222 |
for other_entity in other_entities:
|
| 223 |
+
distance = abs(
|
| 224 |
+
comp_entity["start"] - other_entity["end"]
|
| 225 |
+
) # Calculate the distance
|
| 226 |
|
| 227 |
# Ensure that the other entity's type is valid for the attachment
|
| 228 |
+
if (
|
| 229 |
+
distance < min_distance
|
| 230 |
+
and other_entity["entity"].split(".")[0] in valid_entity_types
|
| 231 |
+
):
|
| 232 |
# Additional context check: Ensure the comp_entity's text doesn't conflict with the target entity's text
|
| 233 |
if not conflicting_context(comp_entity, other_entity):
|
| 234 |
min_distance = distance
|
| 235 |
closest_entity = other_entity
|
| 236 |
|
| 237 |
+
# Attach the "comp.function" entity using the suffix of 'entity' field if a valid entity is found
|
| 238 |
if closest_entity:
|
| 239 |
# Extract the suffix (e.g., "comp.title" becomes "title")
|
| 240 |
suffix = comp_entity["entity"].split(".")[-1]
|
| 241 |
+
closest_entity[suffix] = comp_entity[
|
| 242 |
+
"text"
|
| 243 |
+
] # Attach the text using the suffix as the key
|
| 244 |
|
| 245 |
return other_entities
|
| 246 |
|
|
|
|
| 253 |
comp_text = comp_entity["text"].lower()
|
| 254 |
target_text = target_entity["text"].lower()
|
| 255 |
|
| 256 |
+
# Case 1: Avoid attaching the name "Washington" to "George Washington" if it relates to a location.
|
| 257 |
+
if "pers" in target_entity["entity"] and comp_entity["entity"].startswith(
|
| 258 |
+
"comp.name"
|
| 259 |
+
):
|
| 260 |
+
# Ensure we allow attaching "Washington" to "George Washington" but not to other irrelevant contexts
|
| 261 |
+
if comp_text not in target_text and target_text.startswith("george washington"):
|
| 262 |
+
return True # Conflict: Unrelated or partial name mismatch
|
| 263 |
+
|
| 264 |
+
# Case 2: Ensure the target entity text matches the role or function (e.g., "President of the United States").
|
| 265 |
+
if "pers" in target_entity["entity"] and comp_entity["entity"].startswith(
|
| 266 |
+
"comp.function"
|
| 267 |
+
):
|
| 268 |
+
# Ensure the function context makes sense (e.g., attach 'President' to 'George Washington')
|
| 269 |
+
if "president" not in comp_text and "george washington" in target_text:
|
| 270 |
+
return True # Conflict: Unrelated function
|
| 271 |
|
| 272 |
return False # No conflict
|
| 273 |
|
| 274 |
|
|
|
|
| 275 |
def postprocess_entities(entities):
|
| 276 |
# Step 1: Filter entities with the same text, keeping the one with the most dots in the 'entity' field
|
| 277 |
entity_map = {}
|