emanuelaboros committed on
Commit
9ccbbd3
·
1 Parent(s): 38b2f9d

update fields

Browse files
Files changed (1) hide show
  1. generic_ner.py +23 -25
generic_ner.py CHANGED
@@ -121,12 +121,12 @@ def get_entities(tokens, tags, confidences, text):
121
 
122
  entities.append(
123
  {
124
- "entity": original_label,
125
  "score": round(
126
  np.average(confidences[idx : idx + len(subtree)]) * 100, 2
127
  ),
128
  "index": (idx, idx + len(subtree)),
129
- "text": text[
130
  entity_start_position:entity_end_position
131
  ], # original_string,
132
  "start": entity_start_position,
@@ -211,8 +211,8 @@ def attach_comp_to_closest(entities):
211
  valid_entity_types = {"org", "pers", "org.ent", "pers.ind"}
212
 
213
  # Separate "comp.function" and "comp.name" entities from other entities
214
- comp_entities = [ent for ent in entities if ent["entity"].startswith("comp")]
215
- other_entities = [ent for ent in entities if not ent["entity"].startswith("comp")]
216
 
217
  for comp_entity in comp_entities:
218
  closest_entity = None
@@ -227,7 +227,7 @@ def attach_comp_to_closest(entities):
227
  # Ensure that the other entity's type is valid for the attachment
228
  if (
229
  distance < min_distance
230
- and other_entity["entity"].split(".")[0] in valid_entity_types
231
  ):
232
  # Additional context check: Ensure the comp_entity's text doesn't conflict with the target entity's text
233
  if not conflicting_context(comp_entity, other_entity):
@@ -237,9 +237,9 @@ def attach_comp_to_closest(entities):
237
  # Attach the "comp.function" entity using the suffix of 'entity' field if a valid entity is found
238
  if closest_entity:
239
  # Extract the suffix (e.g., "comp.title" becomes "title")
240
- suffix = comp_entity["entity"].split(".")[-1]
241
  closest_entity[suffix] = comp_entity[
242
- "text"
243
  ] # Attach the text using the suffix as the key
244
 
245
  return other_entities
@@ -250,19 +250,17 @@ def conflicting_context(comp_entity, target_entity):
250
  Determines if there is a conflict in the context between the comp_entity and the target entity.
251
  For example, don't attach 'Maximilien Robespierre' as a name to 'Marie Antoinette'.
252
  """
253
- comp_text = comp_entity["text"].lower()
254
- target_text = target_entity["text"].lower()
255
 
256
  # Case 1: Avoid attaching the name "Washington" to "George Washington" if it relates to a location.
257
- if "pers" in target_entity["entity"] and comp_entity["entity"].startswith(
258
- "comp.name"
259
- ):
260
  # Ensure we allow attaching "Washington" to "George Washington" but not to other irrelevant contexts
261
  if comp_text not in target_text and target_text.startswith("george washington"):
262
  return True # Conflict: Unrelated or partial name mismatch
263
 
264
  # Case 2: Ensure the target entity text matches the role or function (e.g., "President of the United States").
265
- if "pers" in target_entity["entity"] and comp_entity["entity"].startswith(
266
  "comp.function"
267
  ):
268
  # Ensure the function context makes sense (e.g., attach 'President' to 'George Washington')
@@ -300,14 +298,14 @@ def repair_names_in_entities(entities):
300
  from the text of the entity if a partial name (e.g., 'Washington') is incorrectly attached.
301
  """
302
  for entity in entities:
303
- if "name" in entity and "pers" in entity["entity"]:
304
  name = entity["name"].lower()
305
- text = entity["text"].lower()
306
 
307
  # Check if the attached name is part of the entity's text
308
  if name in text:
309
  # Extract the full name from the text by splitting around the attached name
310
- full_name = extract_name_from_text(entity["text"], name)
311
  entity["name"] = (
312
  full_name # Replace the partial name with the full name
313
  )
@@ -321,13 +319,13 @@ def postprocess_entities(entities):
321
 
322
  # Loop over the entities and prioritize the one with the most dots
323
  for entity in entities:
324
- entity_text = entity["text"]
325
- num_dots = entity["entity"].count(".")
326
 
327
  # If the entity text is new, or this entity has more dots, update the map
328
  if (
329
  entity_text not in entity_map
330
- or entity_map[entity_text]["entity"].count(".") < num_dots
331
  ):
332
  entity_map[entity_text] = entity
333
 
@@ -348,10 +346,10 @@ def remove_included_entities(entities):
348
  for i, entity in enumerate(entities):
349
  is_included = False
350
  for other_entity in entities:
351
- if entity["text"] != other_entity["text"]:
352
- if "comp" in other_entity["entity"]:
353
  # Check if entity's text is a substring of another entity's text
354
- if entity["text"] in other_entity["text"]:
355
  is_included = True
356
  break
357
  if not is_included:
@@ -367,8 +365,8 @@ stop_words = stopwords(["en", "fr", "de"])
367
  def remove_trailing_stopwords(entities):
368
  # This function removes stopwords from both the beginning and end of each entity's text
369
  for entity in entities:
370
- if "comp" not in entity["entity"]:
371
- words = entity["text"].split()
372
 
373
  # Remove stopwords from the beginning
374
  while words and words[0].lower() in stop_words:
@@ -379,7 +377,7 @@ def remove_trailing_stopwords(entities):
379
  words.pop() # Remove the last word if it's a stopword
380
 
381
  # Join the remaining words back together and update the entity's text
382
- entity["text"] = " ".join(words)
383
 
384
  return entities
385
 
 
121
 
122
  entities.append(
123
  {
124
+ "type": original_label,
125
  "score": round(
126
  np.average(confidences[idx : idx + len(subtree)]) * 100, 2
127
  ),
128
  "index": (idx, idx + len(subtree)),
129
+ "surface": text[
130
  entity_start_position:entity_end_position
131
  ], # original_string,
132
  "start": entity_start_position,
 
211
  valid_entity_types = {"org", "pers", "org.ent", "pers.ind"}
212
 
213
  # Separate "comp.function" and "comp.name" entities from other entities
214
+ comp_entities = [ent for ent in entities if ent["type"].startswith("comp")]
215
+ other_entities = [ent for ent in entities if not ent["type"].startswith("comp")]
216
 
217
  for comp_entity in comp_entities:
218
  closest_entity = None
 
227
  # Ensure that the other entity's type is valid for the attachment
228
  if (
229
  distance < min_distance
230
+ and other_entity["type"].split(".")[0] in valid_entity_types
231
  ):
232
  # Additional context check: Ensure the comp_entity's text doesn't conflict with the target entity's text
233
  if not conflicting_context(comp_entity, other_entity):
 
237
  # Attach the "comp.function" entity using the suffix of its 'type' field if a valid entity is found
238
  if closest_entity:
239
  # Extract the suffix (e.g., "comp.title" becomes "title")
240
+ suffix = comp_entity["type"].split(".")[-1]
241
  closest_entity[suffix] = comp_entity[
242
+ "surface"
243
  ] # Attach the text using the suffix as the key
244
 
245
  return other_entities
 
250
  Determines if there is a conflict in the context between the comp_entity and the target entity.
251
  For example, don't attach 'Maximilien Robespierre' as a name to 'Marie Antoinette'.
252
  """
253
+ comp_text = comp_entity["surface"].lower()
254
+ target_text = target_entity["surface"].lower()
255
 
256
  # Case 1: Avoid attaching the name "Washington" to "George Washington" if it relates to a location.
257
+ if "pers" in target_entity["type"] and comp_entity["type"].startswith("comp.name"):
 
 
258
  # Ensure we allow attaching "Washington" to "George Washington" but not to other irrelevant contexts
259
  if comp_text not in target_text and target_text.startswith("george washington"):
260
  return True # Conflict: Unrelated or partial name mismatch
261
 
262
  # Case 2: Ensure the target entity text matches the role or function (e.g., "President of the United States").
263
+ if "pers" in target_entity["type"] and comp_entity["type"].startswith(
264
  "comp.function"
265
  ):
266
  # Ensure the function context makes sense (e.g., attach 'President' to 'George Washington')
 
298
  from the text of the entity if a partial name (e.g., 'Washington') is incorrectly attached.
299
  """
300
  for entity in entities:
301
+ if "name" in entity and "pers" in entity["type"]:
302
  name = entity["name"].lower()
303
+ text = entity["surface"].lower()
304
 
305
  # Check if the attached name is part of the entity's text
306
  if name in text:
307
  # Extract the full name from the text by splitting around the attached name
308
+ full_name = extract_name_from_text(entity["surface"], name)
309
  entity["name"] = (
310
  full_name # Replace the partial name with the full name
311
  )
 
319
 
320
  # Loop over the entities and prioritize the one with the most dots
321
  for entity in entities:
322
+ entity_text = entity["surface"]
323
+ num_dots = entity["type"].count(".")
324
 
325
  # If the entity text is new, or this entity has more dots, update the map
326
  if (
327
  entity_text not in entity_map
328
+ or entity_map[entity_text]["type"].count(".") < num_dots
329
  ):
330
  entity_map[entity_text] = entity
331
 
 
346
  for i, entity in enumerate(entities):
347
  is_included = False
348
  for other_entity in entities:
349
+ if entity["surface"] != other_entity["surface"]:
350
+ if "comp" in other_entity["type"]:
351
  # Check if entity's text is a substring of another entity's text
352
+ if entity["surface"] in other_entity["surface"]:
353
  is_included = True
354
  break
355
  if not is_included:
 
365
  def remove_trailing_stopwords(entities):
366
  # This function removes stopwords from both the beginning and end of each entity's text
367
  for entity in entities:
368
+ if "comp" not in entity["type"]:
369
+ words = entity["surface"].split()
370
 
371
  # Remove stopwords from the beginning
372
  while words and words[0].lower() in stop_words:
 
377
  words.pop() # Remove the last word if it's a stopword
378
 
379
  # Join the remaining words back together and update the entity's text
380
+ entity["surface"] = " ".join(words)
381
 
382
  return entities
383