emanuelaboros committed on
Commit
4378340
·
1 Parent(s): 4fc0c31
Files changed (1) hide show
  1. generic_ner.py +29 -11
generic_ner.py CHANGED
@@ -271,7 +271,6 @@ def conflicting_context(comp_entity, target_entity):
271
  return False # No conflict
272
 
273
 
274
-
275
  def extract_name_from_text(text, partial_name):
276
  """
277
  Extracts the full name from the entity's text based on the partial name.
@@ -363,23 +362,42 @@ from stopwordsiso import stopwords
363
 
364
  stop_words = stopwords(["en", "fr", "de"])
365
 
 
 
366
 
367
- def remove_trailing_stopwords(entities):
368
- # This function removes stopwords from both the beginning and end of each entity's text
 
 
 
369
  for entity in entities:
370
  if "comp" not in entity["type"]:
371
- words = entity["surface"].split()
 
 
372
 
373
- # Remove stopwords from the beginning
374
- while words and words[0].lower() in stop_words:
375
- words.pop(0) # Remove the first word if it's a stopword
376
 
377
- # Remove stopwords from the end
378
- while words and words[-1].lower() in stop_words:
379
- words.pop() # Remove the last word if it's a stopword
 
 
 
 
 
 
 
 
 
 
380
 
381
  # Join the remaining words back together and update the entity's text
382
  entity["surface"] = " ".join(words)
 
 
383
 
384
  return entities
385
 
@@ -465,7 +483,7 @@ class MultitaskTokenClassificationPipeline(Pipeline):
465
 
466
  all_entities = remove_trailing_stopwords(all_entities)
467
  # print("After remove_trailing_stopwords:")
468
- # pprint(all_entities)
469
  # Attach "comp.function" entities to the closest non-"comp.function" entity
470
  # print("After postprocess_entities")
471
  all_entities = postprocess_entities(all_entities)
 
271
  return False # No conflict
272
 
273
 
 
274
  def extract_name_from_text(text, partial_name):
275
  """
276
  Extracts the full name from the entity's text based on the partial name.
 
362
 
363
  stop_words = stopwords(["en", "fr", "de"])
364
 
365
+ import string
366
+
367
 
368
def remove_trailing_stopwords(entities, stop_words=None):
    """
    Strip stopwords and punctuation from both ends of each entity's surface text
    and repair ``lOffset`` / ``rOffset`` so they keep pointing at the kept text.

    Entities whose ``type`` contains ``"comp"`` are left untouched. Every other
    entity is updated in place: ``surface`` becomes the kept words joined by a
    single space, and ``lOffset`` / ``rOffset`` are always written back.

    Args:
        entities: List of entity dicts. Each must provide ``"type"`` and
            ``"surface"``; ``"lOffset"`` and ``"rOffset"`` are optional and
            default to ``0`` and ``lOffset + len(surface)`` respectively.
        stop_words: Container of lowercase stopwords. Optional so that callers
            using the older one-argument form keep working; when omitted, the
            module-level ``stop_words`` is used if it exists, otherwise no
            stopwords are applied (punctuation is still stripped).

    Returns:
        The same ``entities`` list, for call chaining.
    """
    if stop_words is None:
        # The parameter shadows the module-level name, so read it via globals().
        stop_words = globals().get("stop_words") or ()

    punctuation_chars = set(string.punctuation)

    def _is_strippable(token):
        # A token is dropped when it is a stopword or consists solely of
        # punctuation characters (covers ",", "...", "--", etc.).
        return token.lower() in stop_words or all(
            ch in punctuation_chars for ch in token
        )

    for entity in entities:
        if "comp" in entity["type"]:
            continue

        surface = entity["surface"]
        words = surface.split()

        # Locate every word inside the original surface so that offsets are
        # adjusted by real character positions, not by an assumed single space.
        spans = []
        cursor = 0
        for word in words:
            start = surface.index(word, cursor)
            cursor = start + len(word)
            spans.append((start, cursor))

        left_offset = entity.get("lOffset", 0)
        right_offset = entity.get("rOffset", left_offset + len(surface))

        # Narrow the [first, last) window instead of popping from the list.
        first, last = 0, len(words)
        while first < last and _is_strippable(words[first]):
            first += 1
        while last > first and _is_strippable(words[last - 1]):
            last -= 1

        if first < last:
            right_offset -= len(surface) - spans[last - 1][1]
            left_offset += spans[first][0]
        else:
            # Nothing survived: collapse to an empty span rather than letting
            # the left offset run past the right one.
            right_offset = left_offset

        # Join the remaining words back together and update the entity's text
        entity["surface"] = " ".join(words[first:last])
        entity["lOffset"] = left_offset
        entity["rOffset"] = right_offset

    return entities
403
 
 
483
 
484
  all_entities = remove_trailing_stopwords(all_entities)
485
  # print("After remove_trailing_stopwords:")
486
+ pprint(all_entities)
487
  # Attach "comp.function" entities to the closest non-"comp.function" entity
488
  # print("After postprocess_entities")
489
  all_entities = postprocess_entities(all_entities)