emanuelaboros committed on
Commit
ff295cd
·
1 Parent(s): 5c23370
Files changed (1) hide show
  1. generic_ner.py +27 -11
generic_ner.py CHANGED
@@ -206,12 +206,11 @@ def add_spaces_around_punctuation(text):
206
  return re.sub(r"([{}])".format(re.escape(all_punctuation)), r" \1 ", text)
207
 
208
 
209
- # Function to find the closest entity to the "comp.function"
210
  def attach_comp_to_closest(entities):
211
  # Define valid entity types that can receive a "comp.function" or "comp.name" attachment
212
  valid_entity_types = {"org", "pers", "org.ent", "pers.ind"}
213
 
214
- # Separate "comp.function" entities from other entities
215
  comp_entities = [ent for ent in entities if ent["entity"].startswith("comp")]
216
  other_entities = [ent for ent in entities if not ent["entity"].startswith("comp")]
217
 
@@ -221,20 +220,27 @@ def attach_comp_to_closest(entities):
221
 
222
  # Find the closest non-"comp.function" entity that is valid for attaching
223
  for other_entity in other_entities:
224
- distance = abs(comp_entity["start"] - other_entity["end"]) # Calculate the distance
 
 
225
 
226
  # Ensure that the other entity's type is valid for the attachment
227
- if distance < min_distance and other_entity["entity"].split(".")[0] in valid_entity_types:
 
 
 
228
  # Additional context check: Ensure the comp_entity's text doesn't conflict with the target entity's text
229
  if not conflicting_context(comp_entity, other_entity):
230
  min_distance = distance
231
  closest_entity = other_entity
232
 
233
- # Attach the "comp.function" or "comp.name" entity using the suffix of 'entity' field if a valid entity is found
234
  if closest_entity:
235
  # Extract the suffix (e.g., "comp.title" becomes "title")
236
  suffix = comp_entity["entity"].split(".")[-1]
237
- closest_entity[suffix] = comp_entity["text"] # Attach the text using the suffix as the key
 
 
238
 
239
  return other_entities
240
 
@@ -247,15 +253,25 @@ def conflicting_context(comp_entity, target_entity):
247
  comp_text = comp_entity["text"].lower()
248
  target_text = target_entity["text"].lower()
249
 
250
- # Simple conflict check: don't attach a name if the target entity text contains a different person or entity name
251
- # Example: If target entity is 'Marie Antoinette', don't attach 'Maximilien Robespierre'
252
- if "pers" in target_entity["entity"] and comp_text not in target_text:
253
- return True # Conflict: Different person/entity
 
 
 
 
 
 
 
 
 
 
 
254
 
255
  return False # No conflict
256
 
257
 
258
-
259
  def postprocess_entities(entities):
260
  # Step 1: Filter entities with the same text, keeping the one with the most dots in the 'entity' field
261
  entity_map = {}
 
206
  return re.sub(r"([{}])".format(re.escape(all_punctuation)), r" \1 ", text)
207
 
208
 
 
209
  def attach_comp_to_closest(entities):
210
  # Define valid entity types that can receive a "comp.function" or "comp.name" attachment
211
  valid_entity_types = {"org", "pers", "org.ent", "pers.ind"}
212
 
213
+ # Separate "comp.function" and "comp.name" entities from other entities
214
  comp_entities = [ent for ent in entities if ent["entity"].startswith("comp")]
215
  other_entities = [ent for ent in entities if not ent["entity"].startswith("comp")]
216
 
 
220
 
221
  # Find the closest non-"comp.function" entity that is valid for attaching
222
  for other_entity in other_entities:
223
+ distance = abs(
224
+ comp_entity["start"] - other_entity["end"]
225
+ ) # Calculate the distance
226
 
227
  # Ensure that the other entity's type is valid for the attachment
228
+ if (
229
+ distance < min_distance
230
+ and other_entity["entity"].split(".")[0] in valid_entity_types
231
+ ):
232
  # Additional context check: Ensure the comp_entity's text doesn't conflict with the target entity's text
233
  if not conflicting_context(comp_entity, other_entity):
234
  min_distance = distance
235
  closest_entity = other_entity
236
 
237
+ # Attach the "comp.*" entity's text (e.g. "comp.function" or "comp.name") to the closest valid entity, keyed by the suffix of its 'entity' field
238
  if closest_entity:
239
  # Extract the suffix (e.g., "comp.title" becomes "title")
240
  suffix = comp_entity["entity"].split(".")[-1]
241
+ closest_entity[suffix] = comp_entity[
242
+ "text"
243
+ ] # Attach the text using the suffix as the key
244
 
245
  return other_entities
246
 
 
253
  comp_text = comp_entity["text"].lower()
254
  target_text = target_entity["text"].lower()
255
 
256
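+ # Case 1: For a "pers" target, a "comp.name" conflicts only when the target text starts with "george washington" and does not contain the comp text (hard-coded example; no location check is performed).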
257
+ if "pers" in target_entity["entity"] and comp_entity["entity"].startswith(
258
+ "comp.name"
259
+ ):
260
+ # Allow attaching "Washington" to "George Washington"; reject a name not contained in a "George Washington" target (targets with any other text are never rejected here)
261
+ if comp_text not in target_text and target_text.startswith("george washington"):
262
+ return True # Conflict: Unrelated or partial name mismatch
263
+
264
+ # Case 2: For a "pers" target, a "comp.function" conflicts only when the target text contains "george washington" and the function text does not contain "president" (hard-coded example).
265
+ if "pers" in target_entity["entity"] and comp_entity["entity"].startswith(
266
+ "comp.function"
267
+ ):
268
+ # Only functions mentioning "president" may attach to a "George Washington" target (targets with any other text are never rejected here)
269
+ if "president" not in comp_text and "george washington" in target_text:
270
+ return True # Conflict: Unrelated function
271
 
272
  return False # No conflict
273
 
274
 
 
275
  def postprocess_entities(entities):
276
  # Step 1: Filter entities with the same text, keeping the one with the most dots in the 'entity' field
277
  entity_map = {}