Commit
·
f401b99
1 Parent(s): dd5a61f
remove previous offsets
Browse files
- generic_ner.py +7 -15
generic_ner.py
CHANGED
|
@@ -132,8 +132,8 @@ def get_entities(tokens, tags, confidences, text):
|
|
| 132 |
"surface": text[
|
| 133 |
entity_start_position:entity_end_position
|
| 134 |
], # original_string,
|
| 135 |
-
"
|
| 136 |
-
"
|
| 137 |
}
|
| 138 |
)
|
| 139 |
|
|
@@ -224,7 +224,7 @@ def attach_comp_to_closest(entities):
|
|
| 224 |
# Find the closest non-"comp.function" entity that is valid for attaching
|
| 225 |
for other_entity in other_entities:
|
| 226 |
distance = abs(
|
| 227 |
-
comp_entity["
|
| 228 |
) # Calculate the distance
|
| 229 |
|
| 230 |
# Ensure that the other entity's type is valid for the attachment
|
|
@@ -363,12 +363,6 @@ def remove_included_entities(entities):
|
|
| 363 |
return final_entities
|
| 364 |
|
| 365 |
|
| 366 |
-
# from stopwordsiso import stopwords
|
| 367 |
-
#
|
| 368 |
-
# stop_words = stopwords(["en", "fr", "de"])
|
| 369 |
-
# get the stopwords from nltk
|
| 370 |
-
|
| 371 |
-
|
| 372 |
def remove_trailing_stopwords(entities):
|
| 373 |
"""
|
| 374 |
This function removes stopwords and punctuation from both the beginning and end of each entity's text
|
|
@@ -444,7 +438,10 @@ class MultitaskTokenClassificationPipeline(Pipeline):
|
|
| 444 |
|
| 445 |
def is_within(self, entity1, entity2):
|
| 446 |
"""Check if entity1 is fully within the bounds of entity2."""
|
| 447 |
-
return
|
|
|
|
|
|
|
|
|
|
| 448 |
|
| 449 |
def postprocess(self, outputs, **kwargs):
|
| 450 |
"""
|
|
@@ -486,12 +483,7 @@ class MultitaskTokenClassificationPipeline(Pipeline):
|
|
| 486 |
|
| 487 |
# print("After remove_included_entities:")
|
| 488 |
all_entities = remove_included_entities(all_entities)
|
| 489 |
-
|
| 490 |
all_entities = remove_trailing_stopwords(all_entities)
|
| 491 |
-
# print("After remove_trailing_stopwords:")
|
| 492 |
-
# pprint(all_entities)
|
| 493 |
-
# Attach "comp.function" entities to the closest non-"comp.function" entity
|
| 494 |
-
# print("After postprocess_entities")
|
| 495 |
all_entities = postprocess_entities(all_entities)
|
| 496 |
|
| 497 |
# print("After attach_comp_to_closest:")
|
|
|
|
| 132 |
"surface": text[
|
| 133 |
entity_start_position:entity_end_position
|
| 134 |
], # original_string,
|
| 135 |
+
"lOffset": entity_start_position,
|
| 136 |
+
"rOffset": entity_end_position,
|
| 137 |
}
|
| 138 |
)
|
| 139 |
|
|
|
|
| 224 |
# Find the closest non-"comp.function" entity that is valid for attaching
|
| 225 |
for other_entity in other_entities:
|
| 226 |
distance = abs(
|
| 227 |
+
comp_entity["lOffset"] - other_entity["rOffset"]
|
| 228 |
) # Calculate the distance
|
| 229 |
|
| 230 |
# Ensure that the other entity's type is valid for the attachment
|
|
|
|
| 363 |
return final_entities
|
| 364 |
|
| 365 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 366 |
def remove_trailing_stopwords(entities):
|
| 367 |
"""
|
| 368 |
This function removes stopwords and punctuation from both the beginning and end of each entity's text
|
|
|
|
| 438 |
|
| 439 |
def is_within(self, entity1, entity2):
|
| 440 |
"""Check if entity1 is fully within the bounds of entity2."""
|
| 441 |
+
return (
|
| 442 |
+
entity1["lOffset"] >= entity2["lOffset"]
|
| 443 |
+
and entity1["rOffset"] <= entity2["rOffset"]
|
| 444 |
+
)
|
| 445 |
|
| 446 |
def postprocess(self, outputs, **kwargs):
|
| 447 |
"""
|
|
|
|
| 483 |
|
| 484 |
# print("After remove_included_entities:")
|
| 485 |
all_entities = remove_included_entities(all_entities)
|
|
|
|
| 486 |
all_entities = remove_trailing_stopwords(all_entities)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 487 |
all_entities = postprocess_entities(all_entities)
|
| 488 |
|
| 489 |
# print("After attach_comp_to_closest:")
|