Commit ·
4378340
1
Parent(s): 4fc0c31
update
Browse files- generic_ner.py +29 -11
generic_ner.py
CHANGED
|
@@ -271,7 +271,6 @@ def conflicting_context(comp_entity, target_entity):
|
|
| 271 |
return False # No conflict
|
| 272 |
|
| 273 |
|
| 274 |
-
|
| 275 |
def extract_name_from_text(text, partial_name):
|
| 276 |
"""
|
| 277 |
Extracts the full name from the entity's text based on the partial name.
|
|
@@ -363,23 +362,42 @@ from stopwordsiso import stopwords
|
|
| 363 |
|
| 364 |
stop_words = stopwords(["en", "fr", "de"])
|
| 365 |
|
|
|
|
|
|
|
| 366 |
|
| 367 |
-
def remove_trailing_stopwords(entities):
|
| 368 |
-
|
|
|
|
|
|
|
|
|
|
| 369 |
for entity in entities:
|
| 370 |
if "comp" not in entity["type"]:
|
| 371 |
-
|
|
|
|
|
|
|
| 372 |
|
| 373 |
-
#
|
| 374 |
-
|
| 375 |
-
|
| 376 |
|
| 377 |
-
# Remove stopwords from the
|
| 378 |
-
while words and
|
| 379 |
-
words.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 380 |
|
| 381 |
# Join the remaining words back together and update the entity's text
|
| 382 |
entity["surface"] = " ".join(words)
|
|
|
|
|
|
|
| 383 |
|
| 384 |
return entities
|
| 385 |
|
|
@@ -465,7 +483,7 @@ class MultitaskTokenClassificationPipeline(Pipeline):
|
|
| 465 |
|
| 466 |
all_entities = remove_trailing_stopwords(all_entities)
|
| 467 |
# print("After remove_trailing_stopwords:")
|
| 468 |
-
|
| 469 |
# Attach "comp.function" entities to the closest non-"comp.function" entity
|
| 470 |
# print("After postprocess_entities")
|
| 471 |
all_entities = postprocess_entities(all_entities)
|
|
|
|
| 271 |
return False # No conflict
|
| 272 |
|
| 273 |
|
|
|
|
| 274 |
def extract_name_from_text(text, partial_name):
|
| 275 |
"""
|
| 276 |
Extracts the full name from the entity's text based on the partial name.
|
|
|
|
| 362 |
|
| 363 |
stop_words = stopwords(["en", "fr", "de"])
|
| 364 |
|
| 365 |
+
import string
|
| 366 |
+
|
| 367 |
|
| 368 |
+
def remove_trailing_stopwords(entities, stop_words=None):
    """
    Strip stopwords and punctuation from both ends of each entity's surface
    text, repairing ``lOffset``/``rOffset`` to stay consistent with the trim.

    Only entities whose ``type`` does not contain ``"comp"`` are modified;
    component entities (e.g. ``comp.function``) are left untouched.

    Args:
        entities: list of entity dicts carrying at least ``"type"`` and
            ``"surface"`` keys, and optionally ``"lOffset"``/``"rOffset"``.
        stop_words: collection of lowercase stopwords to trim. Defaults to
            the module-level ``stop_words`` set when omitted, so the
            existing one-argument call site
            (``remove_trailing_stopwords(all_entities)``) keeps working
            instead of raising TypeError.

    Returns:
        The same ``entities`` list, mutated in place and returned for
        convenience.
    """
    if stop_words is None:
        # Backward-compatible default: the parameter shadows the module-level
        # `stop_words`, so fetch it via globals(); fall back to an empty set
        # if the module-level variable is absent.
        stop_words = globals().get("stop_words", set())

    for entity in entities:
        if "comp" not in entity["type"]:
            original_words = entity["surface"].split()
            words = original_words[:]
            original_len = len(" ".join(original_words))

            # Initial offsets; rOffset defaults to the normalized surface
            # length when the entity does not carry one.
            lOffset = entity.get("lOffset", 0)
            rOffset = entity.get("rOffset", original_len)

            # Trim stopwords/punctuation from the beginning.
            # NOTE: `w in string.punctuation` only matches single-character
            # tokens; multi-char punctuation runs (e.g. "...") are not
            # stripped. The `+ 1` assumes words are single-space separated,
            # which holds because the surface is re-joined with " " below.
            while words and (
                words[0].lower() in stop_words or words[0] in string.punctuation
            ):
                lOffset += len(words[0]) + 1  # advance past word + space
                words.pop(0)

            # Trim stopwords/punctuation from the end.
            while words and (
                words[-1].lower() in stop_words or words[-1] in string.punctuation
            ):
                rOffset -= len(words[-1]) + 1  # retreat past space + word
                words.pop()

            # Re-join the surviving words and record the repaired offsets.
            entity["surface"] = " ".join(words)
            entity["lOffset"] = lOffset
            entity["rOffset"] = rOffset

    return entities
|
| 403 |
|
|
|
|
| 483 |
|
| 484 |
all_entities = remove_trailing_stopwords(all_entities)
|
| 485 |
# print("After remove_trailing_stopwords:")
|
| 486 |
+
pprint(all_entities)
|
| 487 |
# Attach "comp.function" entities to the closest non-"comp.function" entity
|
| 488 |
# print("After postprocess_entities")
|
| 489 |
all_entities = postprocess_entities(all_entities)
|