emanuelaboros committed on
Commit
f401b99
·
1 Parent(s): dd5a61f

remove previous offsets

Browse files
Files changed (1) hide show
  1. generic_ner.py +7 -15
generic_ner.py CHANGED
@@ -132,8 +132,8 @@ def get_entities(tokens, tags, confidences, text):
132
  "surface": text[
133
  entity_start_position:entity_end_position
134
  ], # original_string,
135
- "start": entity_start_position,
136
- "end": entity_end_position,
137
  }
138
  )
139
 
@@ -224,7 +224,7 @@ def attach_comp_to_closest(entities):
224
  # Find the closest non-"comp.function" entity that is valid for attaching
225
  for other_entity in other_entities:
226
  distance = abs(
227
- comp_entity["start"] - other_entity["end"]
228
  ) # Calculate the distance
229
 
230
  # Ensure that the other entity's type is valid for the attachment
@@ -363,12 +363,6 @@ def remove_included_entities(entities):
363
  return final_entities
364
 
365
 
366
- # from stopwordsiso import stopwords
367
- #
368
- # stop_words = stopwords(["en", "fr", "de"])
369
- # get the stopwords from nltk
370
-
371
-
372
  def remove_trailing_stopwords(entities):
373
  """
374
  This function removes stopwords and punctuation from both the beginning and end of each entity's text
@@ -444,7 +438,10 @@ class MultitaskTokenClassificationPipeline(Pipeline):
444
 
445
  def is_within(self, entity1, entity2):
446
  """Check if entity1 is fully within the bounds of entity2."""
447
- return entity1["start"] >= entity2["start"] and entity1["end"] <= entity2["end"]
 
 
 
448
 
449
  def postprocess(self, outputs, **kwargs):
450
  """
@@ -486,12 +483,7 @@ class MultitaskTokenClassificationPipeline(Pipeline):
486
 
487
  # print("After remove_included_entities:")
488
  all_entities = remove_included_entities(all_entities)
489
-
490
  all_entities = remove_trailing_stopwords(all_entities)
491
- # print("After remove_trailing_stopwords:")
492
- # pprint(all_entities)
493
- # Attach "comp.function" entities to the closest non-"comp.function" entity
494
- # print("After postprocess_entities")
495
  all_entities = postprocess_entities(all_entities)
496
 
497
  # print("After attach_comp_to_closest:")
 
132
  "surface": text[
133
  entity_start_position:entity_end_position
134
  ], # original_string,
135
+ "lOffset": entity_start_position,
136
+ "rOffset": entity_end_position,
137
  }
138
  )
139
 
 
224
  # Find the closest non-"comp.function" entity that is valid for attaching
225
  for other_entity in other_entities:
226
  distance = abs(
227
+ comp_entity["lOffset"] - other_entity["rOffset"]
228
  ) # Calculate the distance
229
 
230
  # Ensure that the other entity's type is valid for the attachment
 
363
  return final_entities
364
 
365
 
 
 
 
 
 
 
366
  def remove_trailing_stopwords(entities):
367
  """
368
  This function removes stopwords and punctuation from both the beginning and end of each entity's text
 
438
 
439
  def is_within(self, entity1, entity2):
440
  """Check if entity1 is fully within the bounds of entity2."""
441
+ return (
442
+ entity1["lOffset"] >= entity2["lOffset"]
443
+ and entity1["rOffset"] <= entity2["rOffset"]
444
+ )
445
 
446
  def postprocess(self, outputs, **kwargs):
447
  """
 
483
 
484
  # print("After remove_included_entities:")
485
  all_entities = remove_included_entities(all_entities)
 
486
  all_entities = remove_trailing_stopwords(all_entities)
 
 
 
 
487
  all_entities = postprocess_entities(all_entities)
488
 
489
  # print("After attach_comp_to_closest:")