Commit ·
4378340
1
Parent(s): 4fc0c31
update
Browse files- generic_ner.py +29 -11
generic_ner.py
CHANGED
|
@@ -271,7 +271,6 @@ def conflicting_context(comp_entity, target_entity):
|
|
| 271 |
return False # No conflict
|
| 272 |
|
| 273 |
|
| 274 |
-
|
| 275 |
def extract_name_from_text(text, partial_name):
|
| 276 |
"""
|
| 277 |
Extracts the full name from the entity's text based on the partial name.
|
|
@@ -363,23 +362,42 @@ from stopwordsiso import stopwords
|
|
| 363 |
|
| 364 |
stop_words = stopwords(["en", "fr", "de"])
|
| 365 |
|
|
|
|
|
|
|
| 366 |
|
| 367 |
-
def remove_trailing_stopwords(entities):
|
| 368 |
-
|
|
|
|
|
|
|
|
|
|
| 369 |
for entity in entities:
|
| 370 |
if "comp" not in entity["type"]:
|
| 371 |
-
|
|
|
|
|
|
|
| 372 |
|
| 373 |
-
#
|
| 374 |
-
|
| 375 |
-
|
| 376 |
|
| 377 |
-
# Remove stopwords from the
|
| 378 |
-
while words and
|
| 379 |
-
words.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 380 |
|
| 381 |
# Join the remaining words back together and update the entity's text
|
| 382 |
entity["surface"] = " ".join(words)
|
|
|
|
|
|
|
| 383 |
|
| 384 |
return entities
|
| 385 |
|
|
@@ -465,7 +483,7 @@ class MultitaskTokenClassificationPipeline(Pipeline):
|
|
| 465 |
|
| 466 |
all_entities = remove_trailing_stopwords(all_entities)
|
| 467 |
# print("After remove_trailing_stopwords:")
|
| 468 |
-
|
| 469 |
# Attach "comp.function" entities to the closest non-"comp.function" entity
|
| 470 |
# print("After postprocess_entities")
|
| 471 |
all_entities = postprocess_entities(all_entities)
|
|
|
|
| 271 |
return False # No conflict
|
| 272 |
|
| 273 |
|
|
|
|
| 274 |
def extract_name_from_text(text, partial_name):
|
| 275 |
"""
|
| 276 |
Extracts the full name from the entity's text based on the partial name.
|
|
|
|
| 362 |
|
| 363 |
stop_words = stopwords(["en", "fr", "de"])
|
| 364 |
|
| 365 |
+
import string
|
| 366 |
+
|
| 367 |
|
| 368 |
+
def remove_trailing_stopwords(entities, stop_words=None):
    """
    Strip stopwords and punctuation from both ends of each entity's surface
    text, repairing ``lOffset``/``rOffset`` to stay consistent with the trim.

    Only entities whose ``type`` does not contain ``"comp"`` are modified;
    component entities (e.g. ``comp.function``) are left untouched.

    Args:
        entities: list of entity dicts carrying at least ``"type"`` and
            ``"surface"`` keys, and optionally ``"lOffset"``/``"rOffset"``.
        stop_words: collection of lowercase stopwords to trim. Defaults to
            the module-level ``stop_words`` set when omitted, so the
            existing one-argument call site
            (``remove_trailing_stopwords(all_entities)``) keeps working
            instead of raising TypeError.

    Returns:
        The same ``entities`` list, mutated in place and returned for
        convenience.
    """
    if stop_words is None:
        # Backward-compatible default: the parameter shadows the module-level
        # `stop_words`, so fetch it via globals(); fall back to an empty set
        # if the module-level variable is absent.
        stop_words = globals().get("stop_words", set())

    for entity in entities:
        if "comp" not in entity["type"]:
            original_words = entity["surface"].split()
            words = original_words[:]
            original_len = len(" ".join(original_words))

            # Initial offsets; rOffset defaults to the normalized surface
            # length when the entity does not carry one.
            lOffset = entity.get("lOffset", 0)
            rOffset = entity.get("rOffset", original_len)

            # Trim stopwords/punctuation from the beginning.
            # NOTE: `w in string.punctuation` only matches single-character
            # tokens; multi-char punctuation runs (e.g. "...") are not
            # stripped. The `+ 1` assumes words are single-space separated,
            # which holds because the surface is re-joined with " " below.
            while words and (
                words[0].lower() in stop_words or words[0] in string.punctuation
            ):
                lOffset += len(words[0]) + 1  # advance past word + space
                words.pop(0)

            # Trim stopwords/punctuation from the end.
            while words and (
                words[-1].lower() in stop_words or words[-1] in string.punctuation
            ):
                rOffset -= len(words[-1]) + 1  # retreat past space + word
                words.pop()

            # Re-join the surviving words and record the repaired offsets.
            entity["surface"] = " ".join(words)
            entity["lOffset"] = lOffset
            entity["rOffset"] = rOffset

    return entities
|
| 403 |
|
|
|
|
| 483 |
|
| 484 |
all_entities = remove_trailing_stopwords(all_entities)
|
| 485 |
# print("After remove_trailing_stopwords:")
|
| 486 |
+
pprint(all_entities)
|
| 487 |
# Attach "comp.function" entities to the closest non-"comp.function" entity
|
| 488 |
# print("After postprocess_entities")
|
| 489 |
all_entities = postprocess_entities(all_entities)
|