impresso-project
/

ner-stacked-bert-multilingual

@@ -540,6 +540,7 @@ def remove_trailing_stopwords(entities):
             rOffset = entity.get("rOffset", original_len)
             # Remove stopwords and punctuation from the beginning
             while entity_text and (
                 entity_text.split()[0].lower() in stop_words
                 or entity_text[0] in punctuation
@@ -561,36 +562,48 @@ def remove_trailing_stopwords(entities):
                         print(
                             f"Removed leading punctuation from entity: {entity['surface']} --> {entity_text} ({entity['type']}"
                         )
             # Remove stopwords and punctuation from the end
-            if len(entity_text.strip()) > 1:
-                while entity_text and (
-                    entity_text.split()[-1].lower() in stop_words
-                    or entity_text[-1] in punctuation
-                ):
-                    if entity_text.split()[-1].lower() in stop_words:
-                        stopword_len = (
-                            len(entity_text.split()[-1]) + 1
-                        )  # Adjust length for stopword and preceding space
-                        entity_text = entity_text[
-                            :-stopword_len
-                        ]  # Remove trailing stopword
-                        rOffset -= stopword_len  # Adjust the right offset
-                        if DEBUG:
-                            print(
-                                f"Removed trailing stopword from entity: {entity['surface']} --> {entity_text} ({entity['type']}"
-                            )
-                    if entity_text:
-                        if entity_text[-1] in punctuation:
-                            entity_text = entity_text[
-                                :-1
-                            ]  # Remove trailing punctuation
-                            rOffset -= 1  # Adjust the right offset
-                            if DEBUG:
-                                print(
-                                    f"Removed trailing punctuation from entity: {entity['surface']} --> {entity_text} ({entity['type']}"
-                                )
             # Skip certain entities based on rules
             if entity_text in string.punctuation:
                 if DEBUG:
@@ -669,13 +682,11 @@ def remove_trailing_stopwords(entities):
                 entities.remove(entity)
             else:
                 new_entities.append(entity)
-        else:
-            new_entities.append(entity)
     if DEBUG:
         print(f"Remained entities: {len(new_entities)}")
     return new_entities
 class MultitaskTokenClassificationPipeline(Pipeline):
     def _sanitize_parameters(self, **kwargs):

             rOffset = entity.get("rOffset", original_len)
             # Remove stopwords and punctuation from the beginning
+            i = 0
             while entity_text and (
                 entity_text.split()[0].lower() in stop_words
                 or entity_text[0] in punctuation
                         print(
                             f"Removed leading punctuation from entity: {entity['surface']} --> {entity_text} ({entity['type']}"
                         )
+                i += 1
+            i = 0
             # Remove stopwords and punctuation from the end
+            iteration = 0
+            max_iterations = len(entity_text)  # Prevent infinite loops
+            while entity_text and iteration < max_iterations:
+                # Check if the last word is a stopword or the last character is punctuation
+                last_word = entity_text.split()[-1] if entity_text.split() else ""
+                last_char = entity_text[-1]
+                if last_word.lower() in stop_words:
+                    # Remove trailing stopword and adjust rOffset
+                    stopword_len = len(last_word) + 1  # Include space before stopword
+                    entity_text = entity_text[:-stopword_len].rstrip()
+                    rOffset -= stopword_len
+                    if DEBUG:
+                        print(
+                            f"Removed trailing stopword from entity: {entity_text} (rOffset={rOffset})"
+                        )
+                elif last_char in punctuation:
+                    # Remove trailing punctuation and adjust rOffset
+                    entity_text = entity_text[:-1].rstrip()
+                    rOffset -= 1
+                    if DEBUG:
+                        print(
+                            f"Removed trailing punctuation from entity: {entity_text} (rOffset={rOffset})"
+                        )
+                else:
+                    # Exit loop if neither stopwords nor punctuation are found
+                    break
+                iteration += 1
+                # print(f"ITERATION: {iteration} [{entity['surface']}] for {entity_text}")
+            if len(entity_text.strip()) == 1:
+                entities.remove(entity)
+                if DEBUG:
+                    print(f"Skipping entity: {entity_text}")
+                continue
             # Skip certain entities based on rules
             if entity_text in string.punctuation:
                 if DEBUG:
                 entities.remove(entity)
             else:
                 new_entities.append(entity)
     if DEBUG:
         print(f"Remained entities: {len(new_entities)}")
     return new_entities
 class MultitaskTokenClassificationPipeline(Pipeline):
     def _sanitize_parameters(self, **kwargs):