Update generic_ner.py
Browse files - generic_ner.py +41 -30
generic_ner.py
CHANGED
|
@@ -540,6 +540,7 @@ def remove_trailing_stopwords(entities):
|
|
| 540 |
rOffset = entity.get("rOffset", original_len)
|
| 541 |
|
| 542 |
# Remove stopwords and punctuation from the beginning
|
|
|
|
| 543 |
while entity_text and (
|
| 544 |
entity_text.split()[0].lower() in stop_words
|
| 545 |
or entity_text[0] in punctuation
|
|
@@ -561,36 +562,48 @@ def remove_trailing_stopwords(entities):
|
|
| 561 |
print(
|
| 562 |
f"Removed leading punctuation from entity: {entity['surface']} --> {entity_text} ({entity['type']}"
|
| 563 |
)
|
|
|
|
| 564 |
|
|
|
|
| 565 |
# Remove stopwords and punctuation from the end
|
| 566 |
-
|
| 567 |
-
|
| 568 |
-
|
| 569 |
-
|
| 570 |
-
|
| 571 |
-
|
| 572 |
-
|
| 573 |
-
|
| 574 |
-
|
| 575 |
-
|
| 576 |
-
|
| 577 |
-
|
| 578 |
-
|
| 579 |
-
|
| 580 |
-
|
| 581 |
-
|
| 582 |
-
|
| 583 |
-
|
| 584 |
-
|
| 585 |
-
|
| 586 |
-
|
| 587 |
-
|
| 588 |
-
|
| 589 |
-
|
| 590 |
-
|
| 591 |
-
|
| 592 |
-
|
|
|
|
|
|
|
| 593 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 594 |
# Skip certain entities based on rules
|
| 595 |
if entity_text in string.punctuation:
|
| 596 |
if DEBUG:
|
|
@@ -669,13 +682,11 @@ def remove_trailing_stopwords(entities):
|
|
| 669 |
entities.remove(entity)
|
| 670 |
else:
|
| 671 |
new_entities.append(entity)
|
| 672 |
-
|
| 673 |
-
new_entities.append(entity)
|
| 674 |
if DEBUG:
|
| 675 |
print(f"Remained entities: {len(new_entities)}")
|
| 676 |
return new_entities
|
| 677 |
|
| 678 |
-
|
| 679 |
class MultitaskTokenClassificationPipeline(Pipeline):
|
| 680 |
|
| 681 |
def _sanitize_parameters(self, **kwargs):
|
|
|
|
| 540 |
rOffset = entity.get("rOffset", original_len)
|
| 541 |
|
| 542 |
# Remove stopwords and punctuation from the beginning
|
| 543 |
+
i = 0
|
| 544 |
while entity_text and (
|
| 545 |
entity_text.split()[0].lower() in stop_words
|
| 546 |
or entity_text[0] in punctuation
|
|
|
|
| 562 |
print(
|
| 563 |
f"Removed leading punctuation from entity: {entity['surface']} --> {entity_text} ({entity['type']}"
|
| 564 |
)
|
| 565 |
+
i += 1
|
| 566 |
|
| 567 |
+
i = 0
|
| 568 |
# Remove stopwords and punctuation from the end
|
| 569 |
+
iteration = 0
|
| 570 |
+
max_iterations = len(entity_text) # Prevent infinite loops
|
| 571 |
+
|
| 572 |
+
while entity_text and iteration < max_iterations:
|
| 573 |
+
# Check if the last word is a stopword or the last character is punctuation
|
| 574 |
+
last_word = entity_text.split()[-1] if entity_text.split() else ""
|
| 575 |
+
last_char = entity_text[-1]
|
| 576 |
+
|
| 577 |
+
if last_word.lower() in stop_words:
|
| 578 |
+
# Remove trailing stopword and adjust rOffset
|
| 579 |
+
stopword_len = len(last_word) + 1 # Include space before stopword
|
| 580 |
+
entity_text = entity_text[:-stopword_len].rstrip()
|
| 581 |
+
rOffset -= stopword_len
|
| 582 |
+
if DEBUG:
|
| 583 |
+
print(
|
| 584 |
+
f"Removed trailing stopword from entity: {entity_text} (rOffset={rOffset})"
|
| 585 |
+
)
|
| 586 |
+
|
| 587 |
+
elif last_char in punctuation:
|
| 588 |
+
# Remove trailing punctuation and adjust rOffset
|
| 589 |
+
entity_text = entity_text[:-1].rstrip()
|
| 590 |
+
rOffset -= 1
|
| 591 |
+
if DEBUG:
|
| 592 |
+
print(
|
| 593 |
+
f"Removed trailing punctuation from entity: {entity_text} (rOffset={rOffset})"
|
| 594 |
+
)
|
| 595 |
+
else:
|
| 596 |
+
# Exit loop if neither stopwords nor punctuation are found
|
| 597 |
+
break
|
| 598 |
|
| 599 |
+
iteration += 1
|
| 600 |
+
# print(f"ITERATION: {iteration} [{entity['surface']}] for {entity_text}")
|
| 601 |
+
|
| 602 |
+
if len(entity_text.strip()) == 1:
|
| 603 |
+
entities.remove(entity)
|
| 604 |
+
if DEBUG:
|
| 605 |
+
print(f"Skipping entity: {entity_text}")
|
| 606 |
+
continue
|
| 607 |
# Skip certain entities based on rules
|
| 608 |
if entity_text in string.punctuation:
|
| 609 |
if DEBUG:
|
|
|
|
| 682 |
entities.remove(entity)
|
| 683 |
else:
|
| 684 |
new_entities.append(entity)
|
| 685 |
+
|
|
|
|
| 686 |
if DEBUG:
|
| 687 |
print(f"Remained entities: {len(new_entities)}")
|
| 688 |
return new_entities
|
| 689 |
|
|
|
|
| 690 |
class MultitaskTokenClassificationPipeline(Pipeline):
|
| 691 |
|
| 692 |
def _sanitize_parameters(self, **kwargs):
|