impresso-project
/

ner-stacked-bert-multilingual

Token Classification

Model card | Files and versions

emanuelaboros committed on Aug 9, 2024

Commit

ed9f086

·

verified ·

1 Parent(s): 7d319f7

Update generic_ner.py

Files changed (1) hide show

generic_ner.py +3 -24

generic_ner.py CHANGED Viewed

@@ -57,7 +57,7 @@ def find_entity_indices(article_text, search_text):
             original_end_index += 1  # Increment to include the last character
         # Append the found indices to the list
-        if article_text[original_start_index] == ' ':
             original_start_index += 1
         indices.append((original_start_index, original_end_index))
@@ -67,27 +67,6 @@ def find_entity_indices(article_text, search_text):
     return indices
-# def find_entity_indices(article, entity):
-#     """
-#     Find all occurrences of an entity in the article and return their indices.
-#
-#     :param article: The complete article text.
-#     :param entity: The entity to search for.
-#     :return: A list of tuples (lArticleOffset, rArticleOffset) for each occurrence.
-#     """
-#
-#     # normalized_target = normalize_text(entity)
-#     # normalized_document = normalize_text(article)
-#
-#     entity_indices = []
-#     for match in re.finditer(re.escape(entity), article):
-#         start_idx = match.start()
-#         end_idx = match.end()
-#         entity_indices.append((start_idx, end_idx))
-#
-#     return entity_indices
 def get_entities(tokens, tags, confidences, text):
     tags = [tag.replace("S-", "B-").replace("E-", "I-") for tag in tags]
@@ -111,8 +90,8 @@ def get_entities(tokens, tags, confidences, text):
                 entities.append(
                     {
                         "entity": original_label,
-                        "score": int(
-                            np.average(confidences[idx : idx + len(subtree)]) * 100
                         ),
                         "index": (idx, idx + len(subtree)),
                         "word": original_string,

             original_end_index += 1  # Increment to include the last character
         # Append the found indices to the list
+        if article_text[original_start_index] == " ":
             original_start_index += 1
         indices.append((original_start_index, original_end_index))
     return indices
 def get_entities(tokens, tags, confidences, text):
     tags = [tag.replace("S-", "B-").replace("E-", "I-") for tag in tags]
                 entities.append(
                     {
                         "entity": original_label,
+                        "score": round(
+                            np.average(confidences[idx : idx + len(subtree)]) * 100, 2
                         ),
                         "index": (idx, idx + len(subtree)),
                         "word": original_string,