Update generic_ner.py
Browse files- generic_ner.py +32 -43
generic_ner.py
CHANGED
|
@@ -528,7 +528,7 @@ def remove_trailing_stopwords(entities):
|
|
| 528 |
and repairs the lOffset and rOffset accordingly.
|
| 529 |
"""
|
| 530 |
if DEBUG:
|
| 531 |
-
print(f"Initial entities: {len(entities)}")
|
| 532 |
new_entities = []
|
| 533 |
for entity in entities:
|
| 534 |
if "comp" not in entity["type"]:
|
|
@@ -540,7 +540,6 @@ def remove_trailing_stopwords(entities):
|
|
| 540 |
rOffset = entity.get("rOffset", original_len)
|
| 541 |
|
| 542 |
# Remove stopwords and punctuation from the beginning
|
| 543 |
-
i = 0
|
| 544 |
while entity_text and (
|
| 545 |
entity_text.split()[0].lower() in stop_words
|
| 546 |
or entity_text[0] in punctuation
|
|
@@ -562,48 +561,36 @@ def remove_trailing_stopwords(entities):
|
|
| 562 |
print(
|
| 563 |
f"Removed leading punctuation from entity: {entity['surface']} --> {entity_text} ({entity['type']}"
|
| 564 |
)
|
| 565 |
-
i += 1
|
| 566 |
|
| 567 |
-
i = 0
|
| 568 |
# Remove stopwords and punctuation from the end
|
| 569 |
-
|
| 570 |
-
|
| 571 |
-
|
| 572 |
-
|
| 573 |
-
|
| 574 |
-
|
| 575 |
-
|
| 576 |
-
|
| 577 |
-
|
| 578 |
-
|
| 579 |
-
|
| 580 |
-
|
| 581 |
-
|
| 582 |
-
|
| 583 |
-
|
| 584 |
-
|
| 585 |
-
|
| 586 |
-
|
| 587 |
-
|
| 588 |
-
|
| 589 |
-
|
| 590 |
-
|
| 591 |
-
|
| 592 |
-
|
| 593 |
-
|
| 594 |
-
|
| 595 |
-
|
| 596 |
-
# Exit loop if neither stopwords nor punctuation are found
|
| 597 |
-
break
|
| 598 |
-
|
| 599 |
-
iteration += 1
|
| 600 |
-
# print(f"ITERATION: {iteration} [{entity['surface']}] for {entity_text}")
|
| 601 |
|
| 602 |
-
if len(entity_text.strip()) == 1:
|
| 603 |
-
entities.remove(entity)
|
| 604 |
-
if DEBUG:
|
| 605 |
-
print(f"Skipping entity: {entity_text}")
|
| 606 |
-
continue
|
| 607 |
# Skip certain entities based on rules
|
| 608 |
if entity_text in string.punctuation:
|
| 609 |
if DEBUG:
|
|
@@ -682,10 +669,12 @@ def remove_trailing_stopwords(entities):
|
|
| 682 |
entities.remove(entity)
|
| 683 |
else:
|
| 684 |
new_entities.append(entity)
|
| 685 |
-
|
|
|
|
| 686 |
if DEBUG:
|
| 687 |
-
print(f"Remained entities: {len(new_entities)}")
|
| 688 |
return new_entities
|
|
|
|
| 689 |
|
| 690 |
class MultitaskTokenClassificationPipeline(Pipeline):
|
| 691 |
|
|
|
|
| 528 |
and repairs the lOffset and rOffset accordingly.
|
| 529 |
"""
|
| 530 |
if DEBUG:
|
| 531 |
+
print(f"Initial entities in remove_trailing_stopwords: {len(entities)}")
|
| 532 |
new_entities = []
|
| 533 |
for entity in entities:
|
| 534 |
if "comp" not in entity["type"]:
|
|
|
|
| 540 |
rOffset = entity.get("rOffset", original_len)
|
| 541 |
|
| 542 |
# Remove stopwords and punctuation from the beginning
|
|
|
|
| 543 |
while entity_text and (
|
| 544 |
entity_text.split()[0].lower() in stop_words
|
| 545 |
or entity_text[0] in punctuation
|
|
|
|
| 561 |
print(
|
| 562 |
f"Removed leading punctuation from entity: {entity['surface']} --> {entity_text} ({entity['type']}"
|
| 563 |
)
|
|
|
|
| 564 |
|
|
|
|
| 565 |
# Remove stopwords and punctuation from the end
|
| 566 |
+
if len(entity_text.strip()) > 1:
|
| 567 |
+
while entity_text and (
|
| 568 |
+
entity_text.split()[-1].lower() in stop_words
|
| 569 |
+
or entity_text[-1] in punctuation
|
| 570 |
+
):
|
| 571 |
+
if entity_text.split()[-1].lower() in stop_words:
|
| 572 |
+
stopword_len = (
|
| 573 |
+
len(entity_text.split()[-1]) + 1
|
| 574 |
+
) # Adjust length for stopword and preceding space
|
| 575 |
+
entity_text = entity_text[
|
| 576 |
+
:-stopword_len
|
| 577 |
+
] # Remove trailing stopword
|
| 578 |
+
rOffset -= stopword_len # Adjust the right offset
|
| 579 |
+
if DEBUG:
|
| 580 |
+
print(
|
| 581 |
+
f"Removed trailing stopword from entity: {entity['surface']} --> {entity_text} ({entity['type']}"
|
| 582 |
+
)
|
| 583 |
+
if entity_text:
|
| 584 |
+
if entity_text[-1] in punctuation:
|
| 585 |
+
entity_text = entity_text[
|
| 586 |
+
:-1
|
| 587 |
+
] # Remove trailing punctuation
|
| 588 |
+
rOffset -= 1 # Adjust the right offset
|
| 589 |
+
if DEBUG:
|
| 590 |
+
print(
|
| 591 |
+
f"Removed trailing punctuation from entity: {entity['surface']} --> {entity_text} ({entity['type']}"
|
| 592 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 593 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 594 |
# Skip certain entities based on rules
|
| 595 |
if entity_text in string.punctuation:
|
| 596 |
if DEBUG:
|
|
|
|
| 669 |
entities.remove(entity)
|
| 670 |
else:
|
| 671 |
new_entities.append(entity)
|
| 672 |
+
else:
|
| 673 |
+
new_entities.append(entity)
|
| 674 |
if DEBUG:
|
| 675 |
+
print(f"Remained entities in remove_trailing_stopwords: {len(new_entities)}")
|
| 676 |
return new_entities
|
| 677 |
+
|
| 678 |
|
| 679 |
class MultitaskTokenClassificationPipeline(Pipeline):
|
| 680 |
|