emanuelaboros commited on
Commit
71e5c2e
·
1 Parent(s): 79fdcae
.DS_Store ADDED
Binary file (6.15 kB). View file
 
generic_ner.py CHANGED
@@ -16,21 +16,21 @@ import re, string
16
  stop_words = set(nltk.corpus.stopwords.words("english"))
17
  DEBUG = False
18
  punctuation = (
19
- string.punctuation
20
- + "«»—…“”"
21
- + "—."
22
- + "–"
23
- + "’"
24
- + "‘"
25
- + "´"
26
- + "•"
27
- + "°"
28
- + "»"
29
- + "“"
30
- + "”"
31
- + "–"
32
- + "—"
33
- + "‘’“”„«»•–—―‣◦…§¶†‡‰′″〈〉"
34
  )
35
 
36
  # List of additional "strange" punctuation marks
@@ -87,53 +87,6 @@ WHITESPACE_RULES = {
87
  }
88
 
89
 
90
- # def tokenize(text: str, language: str = "other") -> list[str]:
91
- # """Apply whitespace rules to the given text and language, separating it into tokens.
92
- #
93
- # Args:
94
- # text (str): The input text to separate into a list of tokens.
95
- # language (str): Language of the text.
96
- #
97
- # Returns:
98
- # list[str]: List of tokens with punctuation as separate tokens.
99
- # """
100
- # # text = add_spaces_around_punctuation(text)
101
- # if not text:
102
- # return []
103
- #
104
- # if language not in WHITESPACE_RULES:
105
- # # Default behavior for languages without specific rules:
106
- # # tokenize using standard whitespace splitting
107
- # language = "other"
108
- #
109
- # wsrules = WHITESPACE_RULES[language]
110
- # tokenized_text = []
111
- # current_token = ""
112
- #
113
- # for char in text:
114
- # if char in wsrules["pct_no_ws_before_after"]:
115
- # if current_token:
116
- # tokenized_text.append(current_token)
117
- # tokenized_text.append(char)
118
- # current_token = ""
119
- # elif char in wsrules["pct_no_ws_before"] or char in wsrules["pct_no_ws_after"]:
120
- # if current_token:
121
- # tokenized_text.append(current_token)
122
- # tokenized_text.append(char)
123
- # current_token = ""
124
- # elif char.isspace():
125
- # if current_token:
126
- # tokenized_text.append(current_token)
127
- # current_token = ""
128
- # else:
129
- # current_token += char
130
- #
131
- # if current_token:
132
- # tokenized_text.append(current_token)
133
- #
134
- # return tokenized_text
135
-
136
-
137
  def normalize_text(text):
138
  # Remove spaces and tabs for the search but keep newline characters
139
  return re.sub(r"[ \t]+", "", text)
@@ -183,7 +136,6 @@ def find_entity_indices(article_text, search_text):
183
 
184
 
185
  def get_entities(tokens, tags, confidences, text):
186
-
187
  tags = [tag.replace("S-", "B-").replace("E-", "I-") for tag in tags]
188
  pos_tags = [pos for token, pos in pos_tag(tokens)]
189
 
@@ -208,10 +160,10 @@ def get_entities(tokens, tags, confidences, text):
208
  entity_start_position = indices[0]
209
  entity_end_position = indices[1]
210
  if (
211
- "_".join(
212
- [original_label, original_string, str(entity_start_position)]
213
- )
214
- in already_done
215
  ):
216
  continue
217
  else:
@@ -225,24 +177,24 @@ def get_entities(tokens, tags, confidences, text):
225
  )
226
  )
227
  if len(text[entity_start_position:entity_end_position].strip()) < len(
228
- text[entity_start_position:entity_end_position]
229
  ):
230
  entity_start_position = (
231
- entity_start_position
232
- + len(text[entity_start_position:entity_end_position])
233
- - len(text[entity_start_position:entity_end_position].strip())
234
  )
235
 
236
  entities.append(
237
  {
238
  "type": original_label,
239
  "confidence_ner": round(
240
- np.average(confidences[idx : idx + len(subtree)]), 2
241
  ),
242
  "index": (idx, idx + len(subtree)),
243
  "surface": text[
244
- entity_start_position:entity_end_position
245
- ], # original_string,
246
  "lOffset": entity_start_position,
247
  "rOffset": entity_end_position,
248
  }
@@ -282,6 +234,7 @@ def realign(word_ids, tokens, out_label_preds, softmax_scores, tokenizer, revert
282
 
283
  return words_list, preds_list, confidence_list
284
 
 
285
  def add_spaces_around_punctuation(text):
286
  # Add a space before and after all punctuation
287
  all_punctuation = string.punctuation + punctuation
@@ -312,8 +265,8 @@ def attach_comp_to_closest(entities):
312
 
313
  # Ensure the entity type is valid and check for minimal distance
314
  if (
315
- distance < min_distance
316
- and other_entity["type"].split(".")[0] in valid_entity_types
317
  ):
318
  min_distance = distance
319
  closest_entity = other_entity
@@ -363,8 +316,8 @@ def extract_name_from_text(text, partial_name):
363
  # Find the position of the partial name in the word list
364
  for i, word in enumerate(words):
365
  if DEBUG:
366
- print(words, "---", words[i : i + len(partial_words)])
367
- if words[i : i + len(partial_words)] == partial_words:
368
  # Initialize full name with the partial name
369
  full_name = partial_words[:]
370
 
@@ -443,8 +396,8 @@ def postprocess_entities(entities):
443
 
444
  # If the entity text is new, or this entity has more dots, update the map
445
  if (
446
- entity_text not in entity_map
447
- or entity_map[entity_text]["type"].count(".") < num_dots
448
  ):
449
  entity_map[entity_text] = entity
450
 
@@ -480,9 +433,9 @@ def remove_included_entities(entities):
480
  is_included = True
481
  break
482
  elif (
483
- entity["type"].split(".")[0] in other_entity["type"].split(".")[0]
484
- or other_entity["type"].split(".")[0]
485
- in entity["type"].split(".")[0]
486
  ):
487
  if entity["surface"] in other_entity["surface"]:
488
  is_included = True
@@ -547,12 +500,12 @@ def remove_trailing_stopwords(entities):
547
  if len(entity_text.split()) < 1:
548
  continue
549
  while entity_text and (
550
- entity_text.split()[0].lower() in stop_words
551
- or entity_text[0] in punctuation
552
  ):
553
  if entity_text.split()[0].lower() in stop_words:
554
  stopword_len = (
555
- len(entity_text.split()[0]) + 1
556
  ) # Adjust length for stopword and following space
557
  entity_text = entity_text[stopword_len:] # Remove leading stopword
558
  lOffset += stopword_len # Adjust the left offset
@@ -571,11 +524,11 @@ def remove_trailing_stopwords(entities):
571
  # Remove stopwords and punctuation from the end
572
  if len(entity_text.strip()) > 1:
573
  while (
574
- entity_text.strip().split()
575
- and (
576
- entity_text.strip().split()[-1].lower() in stop_words
577
- or entity_text[-1] in punctuation
578
- )
579
  ):
580
  if entity_text.strip().split() and entity_text.strip().split()[-1].lower() in stop_words:
581
  stopword_len = len(entity_text.strip().split()[-1]) + 1 # account for space
@@ -613,7 +566,7 @@ def remove_trailing_stopwords(entities):
613
  continue
614
  # Check if the entire entity is made up of stopwords characters
615
  if all(
616
- [char.lower() in stop_words for char in entity_text if char.isalpha()]
617
  ):
618
  if DEBUG:
619
  print(
@@ -630,11 +583,11 @@ def remove_trailing_stopwords(entities):
630
  # entities.remove(entity)
631
  continue
632
  if all(
633
- [
634
- char.lower() in string.punctuation
635
- for char in entity_text
636
- if char.isalpha()
637
- ]
638
  ):
639
  if DEBUG:
640
  print(
@@ -676,7 +629,7 @@ def remove_trailing_stopwords(entities):
676
  if DEBUG:
677
  print(f"Remained entities in remove_trailing_stopwords: {len(new_entities)}")
678
  return new_entities
679
-
680
 
681
  class MultitaskTokenClassificationPipeline(Pipeline):
682
 
@@ -723,8 +676,8 @@ class MultitaskTokenClassificationPipeline(Pipeline):
723
  def is_within(self, entity1, entity2):
724
  """Check if entity1 is fully within the bounds of entity2."""
725
  return (
726
- entity1["lOffset"] >= entity2["lOffset"]
727
- and entity1["rOffset"] <= entity2["rOffset"]
728
  )
729
 
730
  def postprocess(self, outputs, **kwargs):
 
16
  stop_words = set(nltk.corpus.stopwords.words("english"))
17
  DEBUG = False
18
  punctuation = (
19
+ string.punctuation
20
+ + "«»—…“”"
21
+ + "—."
22
+ + "–"
23
+ + "’"
24
+ + "‘"
25
+ + "´"
26
+ + "•"
27
+ + "°"
28
+ + "»"
29
+ + "“"
30
+ + "”"
31
+ + "–"
32
+ + "—"
33
+ + "‘’“”„«»•–—―‣◦…§¶†‡‰′″〈〉"
34
  )
35
 
36
  # List of additional "strange" punctuation marks
 
87
  }
88
 
89
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  def normalize_text(text):
91
  # Remove spaces and tabs for the search but keep newline characters
92
  return re.sub(r"[ \t]+", "", text)
 
136
 
137
 
138
  def get_entities(tokens, tags, confidences, text):
 
139
  tags = [tag.replace("S-", "B-").replace("E-", "I-") for tag in tags]
140
  pos_tags = [pos for token, pos in pos_tag(tokens)]
141
 
 
160
  entity_start_position = indices[0]
161
  entity_end_position = indices[1]
162
  if (
163
+ "_".join(
164
+ [original_label, original_string, str(entity_start_position)]
165
+ )
166
+ in already_done
167
  ):
168
  continue
169
  else:
 
177
  )
178
  )
179
  if len(text[entity_start_position:entity_end_position].strip()) < len(
180
+ text[entity_start_position:entity_end_position]
181
  ):
182
  entity_start_position = (
183
+ entity_start_position
184
+ + len(text[entity_start_position:entity_end_position])
185
+ - len(text[entity_start_position:entity_end_position].strip())
186
  )
187
 
188
  entities.append(
189
  {
190
  "type": original_label,
191
  "confidence_ner": round(
192
+ np.average(confidences[idx: idx + len(subtree)]), 2
193
  ),
194
  "index": (idx, idx + len(subtree)),
195
  "surface": text[
196
+ entity_start_position:entity_end_position
197
+ ], # original_string,
198
  "lOffset": entity_start_position,
199
  "rOffset": entity_end_position,
200
  }
 
234
 
235
  return words_list, preds_list, confidence_list
236
 
237
+
238
  def add_spaces_around_punctuation(text):
239
  # Add a space before and after all punctuation
240
  all_punctuation = string.punctuation + punctuation
 
265
 
266
  # Ensure the entity type is valid and check for minimal distance
267
  if (
268
+ distance < min_distance
269
+ and other_entity["type"].split(".")[0] in valid_entity_types
270
  ):
271
  min_distance = distance
272
  closest_entity = other_entity
 
316
  # Find the position of the partial name in the word list
317
  for i, word in enumerate(words):
318
  if DEBUG:
319
+ print(words, "---", words[i: i + len(partial_words)])
320
+ if words[i: i + len(partial_words)] == partial_words:
321
  # Initialize full name with the partial name
322
  full_name = partial_words[:]
323
 
 
396
 
397
  # If the entity text is new, or this entity has more dots, update the map
398
  if (
399
+ entity_text not in entity_map
400
+ or entity_map[entity_text]["type"].count(".") < num_dots
401
  ):
402
  entity_map[entity_text] = entity
403
 
 
433
  is_included = True
434
  break
435
  elif (
436
+ entity["type"].split(".")[0] in other_entity["type"].split(".")[0]
437
+ or other_entity["type"].split(".")[0]
438
+ in entity["type"].split(".")[0]
439
  ):
440
  if entity["surface"] in other_entity["surface"]:
441
  is_included = True
 
500
  if len(entity_text.split()) < 1:
501
  continue
502
  while entity_text and (
503
+ entity_text.split()[0].lower() in stop_words
504
+ or entity_text[0] in punctuation
505
  ):
506
  if entity_text.split()[0].lower() in stop_words:
507
  stopword_len = (
508
+ len(entity_text.split()[0]) + 1
509
  ) # Adjust length for stopword and following space
510
  entity_text = entity_text[stopword_len:] # Remove leading stopword
511
  lOffset += stopword_len # Adjust the left offset
 
524
  # Remove stopwords and punctuation from the end
525
  if len(entity_text.strip()) > 1:
526
  while (
527
+ entity_text.strip().split()
528
+ and (
529
+ entity_text.strip().split()[-1].lower() in stop_words
530
+ or entity_text[-1] in punctuation
531
+ )
532
  ):
533
  if entity_text.strip().split() and entity_text.strip().split()[-1].lower() in stop_words:
534
  stopword_len = len(entity_text.strip().split()[-1]) + 1 # account for space
 
566
  continue
567
  # Check if the entire entity is made up of stopwords characters
568
  if all(
569
+ [char.lower() in stop_words for char in entity_text if char.isalpha()]
570
  ):
571
  if DEBUG:
572
  print(
 
583
  # entities.remove(entity)
584
  continue
585
  if all(
586
+ [
587
+ char.lower() in string.punctuation
588
+ for char in entity_text
589
+ if char.isalpha()
590
+ ]
591
  ):
592
  if DEBUG:
593
  print(
 
629
  if DEBUG:
630
  print(f"Remained entities in remove_trailing_stopwords: {len(new_entities)}")
631
  return new_entities
632
+
633
 
634
  class MultitaskTokenClassificationPipeline(Pipeline):
635
 
 
676
  def is_within(self, entity1, entity2):
677
  """Check if entity1 is fully within the bounds of entity2."""
678
  return (
679
+ entity1["lOffset"] >= entity2["lOffset"]
680
+ and entity1["rOffset"] <= entity2["rOffset"]
681
  )
682
 
683
  def postprocess(self, outputs, **kwargs):
modeling_stacked.py CHANGED
@@ -16,29 +16,26 @@ def get_info(label_map):
16
  return num_token_labels_dict
17
 
18
 
19
- class ExtendedMultitaskModelForTokenClassification(PreTrainedModel):
20
-
21
  config_class = ImpressoConfig
22
  _keys_to_ignore_on_load_missing = [r"position_ids"]
23
 
24
- def __init__(self, config):
25
  super().__init__(config)
26
- self.num_token_labels_dict = get_info(config.label_map)
27
  self.config = config
28
-
29
- self.bert = AutoModel.from_pretrained(
30
- config.pretrained_config["_name_or_path"], config=config.pretrained_config
31
- )
32
- if "classifier_dropout" not in config.__dict__:
33
- classifier_dropout = 0.1
34
- else:
35
- classifier_dropout = (
36
- config.classifier_dropout
37
- if config.classifier_dropout is not None
38
- else config.hidden_dropout_prob
39
- )
40
  self.dropout = nn.Dropout(classifier_dropout)
41
 
 
 
 
42
  # Additional transformer layers
43
  self.transformer_encoder = nn.TransformerEncoder(
44
  nn.TransformerEncoderLayer(
@@ -46,71 +43,72 @@ class ExtendedMultitaskModelForTokenClassification(PreTrainedModel):
46
  ),
47
  num_layers=2,
48
  )
 
 
 
 
49
 
50
- # For token classification, create a classifier for each task
51
- self.token_classifiers = nn.ModuleDict(
52
- {
53
- task: nn.Linear(config.hidden_size, num_labels)
54
- for task, num_labels in self.num_token_labels_dict.items()
55
- }
56
- )
57
-
58
- # Initialize weights and apply final processing
59
  self.post_init()
60
 
61
  def forward(
62
- self,
63
- input_ids: Optional[torch.Tensor] = None,
64
- attention_mask: Optional[torch.Tensor] = None,
65
- token_type_ids: Optional[torch.Tensor] = None,
66
- position_ids: Optional[torch.Tensor] = None,
67
- head_mask: Optional[torch.Tensor] = None,
68
- inputs_embeds: Optional[torch.Tensor] = None,
69
- labels: Optional[torch.Tensor] = None,
70
- token_labels: Optional[dict] = None,
71
- output_attentions: Optional[bool] = None,
72
- output_hidden_states: Optional[bool] = None,
73
- return_dict: Optional[bool] = None,
 
 
 
 
74
  ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
75
- r"""
76
- token_labels (`dict` of `torch.LongTensor` of shape `(batch_size, seq_length)`, *optional*):
77
- Labels for computing the token classification loss. Keys should match the tasks.
78
- """
79
- return_dict = (
80
- return_dict if return_dict is not None else self.config.use_return_dict
81
- )
 
 
 
82
 
83
  bert_kwargs = {
84
- "input_ids": input_ids,
 
85
  "attention_mask": attention_mask,
86
  "token_type_ids": token_type_ids,
87
  "position_ids": position_ids,
88
  "head_mask": head_mask,
89
- "inputs_embeds": inputs_embeds,
90
  "output_attentions": output_attentions,
91
  "output_hidden_states": output_hidden_states,
92
  "return_dict": return_dict,
93
  }
94
 
95
- if any(
96
- keyword in self.config.name_or_path.lower()
97
- for keyword in ["llama", "deberta"]
98
- ):
99
- bert_kwargs.pop("token_type_ids")
100
- bert_kwargs.pop("head_mask")
101
 
102
- outputs = self.bert(**bert_kwargs)
103
-
104
- # For token classification
105
- token_output = outputs[0]
106
- token_output = self.dropout(token_output)
107
 
108
  # Pass through additional transformer layers
109
  token_output = self.transformer_encoder(token_output.transpose(0, 1)).transpose(
110
  0, 1
111
  )
 
 
 
 
 
112
 
113
- # Collect the logits and compute the loss for each task
114
  task_logits = {}
115
  total_loss = 0
116
  for task, classifier in self.token_classifiers.items():
@@ -131,6 +129,115 @@ class ExtendedMultitaskModelForTokenClassification(PreTrainedModel):
131
  return TokenClassifierOutput(
132
  loss=total_loss,
133
  logits=task_logits,
134
- hidden_states=outputs.hidden_states,
135
- attentions=outputs.attentions,
136
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  return num_token_labels_dict
17
 
18
 
19
+ class ExtendedMultitaskTimeModelForTokenClassification(PreTrainedModel):
 
20
  config_class = ImpressoConfig
21
  _keys_to_ignore_on_load_missing = [r"position_ids"]
22
 
23
+ def __init__(self, config, num_token_labels_dict, temporal_fusion_strategy="baseline", num_years=327):
24
  super().__init__(config)
 
25
  self.config = config
26
+ self.num_token_labels_dict = num_token_labels_dict
27
+ self.temporal_fusion_strategy = temporal_fusion_strategy
28
+ self.model = AutoModel.from_pretrained(config.name_or_path, config=config)
29
+ self.model.config.use_cache = False
30
+ self.model.config.pretraining_tp = 1
31
+ self.num_years = num_years
32
+
33
+ classifier_dropout = getattr(config, "classifier_dropout", 0.1) or config.hidden_dropout_prob
 
 
 
 
34
  self.dropout = nn.Dropout(classifier_dropout)
35
 
36
+ self.temporal_fusion = TemporalFusion(config.hidden_size, strategy=self.temporal_fusion_strategy,
37
+ num_years=num_years)
38
+
39
  # Additional transformer layers
40
  self.transformer_encoder = nn.TransformerEncoder(
41
  nn.TransformerEncoderLayer(
 
43
  ),
44
  num_layers=2,
45
  )
46
+ self.token_classifiers = nn.ModuleDict({
47
+ task: nn.Linear(config.hidden_size, num_labels)
48
+ for task, num_labels in num_token_labels_dict.items()
49
+ })
50
 
 
 
 
 
 
 
 
 
 
51
  self.post_init()
52
 
53
  def forward(
54
+ self,
55
+ input_ids: Optional[torch.Tensor] = None,
56
+ attention_mask: Optional[torch.Tensor] = None,
57
+ token_type_ids: Optional[torch.Tensor] = None,
58
+ position_ids: Optional[torch.Tensor] = None,
59
+ head_mask: Optional[torch.Tensor] = None,
60
+ labels: Optional[torch.Tensor] = None,
61
+ inputs_embeds: Optional[torch.Tensor] = None,
62
+ token_labels: Optional[dict] = None,
63
+ date_indices: Optional[torch.Tensor] = None,
64
+ year_index: Optional[torch.Tensor] = None,
65
+ decade_index: Optional[torch.Tensor] = None,
66
+ century_index: Optional[torch.Tensor] = None,
67
+ output_attentions: Optional[bool] = None,
68
+ output_hidden_states: Optional[bool] = None,
69
+ return_dict: Optional[bool] = None,
70
  ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
71
+
72
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
73
+
74
+ if inputs_embeds is None:
75
+ inputs_embeds = self.model.embeddings(input_ids)
76
+
77
+ # Early cross-attention fusion
78
+ if self.temporal_fusion_strategy == "early-cross-attention":
79
+ year_emb = self.temporal_fusion.compute_time_embedding(year_index) # (B, H)
80
+ inputs_embeds = self.temporal_fusion.cross_attn(inputs_embeds, year_emb)
81
 
82
  bert_kwargs = {
83
+ "inputs_embeds": inputs_embeds if self.temporal_fusion_strategy == "early-cross-attention" else None,
84
+ "input_ids": input_ids if self.temporal_fusion_strategy != "early-cross-attention" else None,
85
  "attention_mask": attention_mask,
86
  "token_type_ids": token_type_ids,
87
  "position_ids": position_ids,
88
  "head_mask": head_mask,
 
89
  "output_attentions": output_attentions,
90
  "output_hidden_states": output_hidden_states,
91
  "return_dict": return_dict,
92
  }
93
 
94
+ if any(keyword in self.config.name_or_path.lower() for keyword in ["llama", "deberta"]):
95
+ bert_kwargs.pop("token_type_ids", None)
96
+ bert_kwargs.pop("head_mask", None)
 
 
 
97
 
98
+ outputs = self.model(**bert_kwargs)
99
+ token_output = self.dropout(outputs[0]) # (B, T, H)
100
+ hidden_states = list(outputs.hidden_states) if output_hidden_states else None
 
 
101
 
102
  # Pass through additional transformer layers
103
  token_output = self.transformer_encoder(token_output.transpose(0, 1)).transpose(
104
  0, 1
105
  )
106
+ # Apply fusion after transformer if needed
107
+ if self.temporal_fusion_strategy not in ["baseline", "early-cross-attention"]:
108
+ token_output = self.temporal_fusion(token_output, year_index)
109
+ if output_hidden_states:
110
+ hidden_states.append(token_output) # add the final fused state
111
 
 
112
  task_logits = {}
113
  total_loss = 0
114
  for task, classifier in self.token_classifiers.items():
 
129
  return TokenClassifierOutput(
130
  loss=total_loss,
131
  logits=task_logits,
132
+ hidden_states=tuple(hidden_states) if hidden_states is not None else None,
133
+ attentions=outputs.attentions if output_attentions else None,
134
  )
135
+
136
+
137
+ class TemporalFusion(nn.Module):
138
+ def __init__(self, hidden_size, strategy="add", num_years=327, min_year=1700):
139
+ super().__init__()
140
+ self.strategy = strategy
141
+ self.hidden_size = hidden_size
142
+ self.min_year = min_year
143
+ self.max_year = min_year + num_years - 1
144
+
145
+ self.year_emb = nn.Embedding(num_years, hidden_size)
146
+
147
+ if strategy == "concat":
148
+ self.concat_proj = nn.Linear(hidden_size * 2, hidden_size)
149
+ elif strategy == "film":
150
+ self.film_gamma = nn.Linear(hidden_size, hidden_size)
151
+ self.film_beta = nn.Linear(hidden_size, hidden_size)
152
+ elif strategy == "adapter":
153
+ self.adapter = nn.Sequential(
154
+ nn.Linear(hidden_size, hidden_size),
155
+ nn.ReLU(),
156
+ nn.Linear(hidden_size, hidden_size),
157
+ )
158
+ elif strategy == "relative":
159
+ self.relative_encoder = nn.Sequential(
160
+ nn.Linear(hidden_size, hidden_size),
161
+ nn.SiLU(),
162
+ nn.LayerNorm(hidden_size),
163
+ )
164
+ self.film_gamma = nn.Linear(hidden_size, hidden_size)
165
+ self.film_beta = nn.Linear(hidden_size, hidden_size)
166
+ elif strategy == "multiscale":
167
+ self.decade_emb = nn.Embedding(1000, hidden_size)
168
+ self.century_emb = nn.Embedding(100, hidden_size)
169
+ elif strategy in ["early-cross-attention", "late-cross-attention"]:
170
+ self.year_encoder = nn.Sequential(
171
+ nn.Linear(hidden_size, hidden_size),
172
+ nn.SiLU()
173
+ )
174
+ self.cross_attn = TemporalCrossAttention(hidden_size)
175
+
176
+ def compute_time_embedding(self, year_index):
177
+ if self.strategy in ["early-cross-attention", "late-cross-attention"]:
178
+ return self.year_encoder(self.year_emb(year_index))
179
+ elif self.strategy == "multiscale":
180
+ year_index = year_index.long()
181
+ year = year_index + self.min_year
182
+ decade = (year // 10).long()
183
+ century = (year // 100).long()
184
+ return (
185
+ self.year_emb(year_index) +
186
+ self.decade_emb(decade) +
187
+ self.century_emb(century)
188
+ )
189
+ else:
190
+ return self.year_emb(year_index)
191
+
192
+ def forward(self, token_output, year_index):
193
+ B, T, H = token_output.size()
194
+
195
+ if self.strategy == "baseline":
196
+ return token_output
197
+
198
+ year_emb = self.compute_time_embedding(year_index)
199
+
200
+ if self.strategy == "concat":
201
+ expanded_year = year_emb.unsqueeze(1).repeat(1, T, 1)
202
+ fused = torch.cat([token_output, expanded_year], dim=-1)
203
+ return self.concat_proj(fused)
204
+
205
+ elif self.strategy == "film":
206
+ gamma = self.film_gamma(year_emb).unsqueeze(1)
207
+ beta = self.film_beta(year_emb).unsqueeze(1)
208
+ return gamma * token_output + beta
209
+
210
+ elif self.strategy == "adapter":
211
+ return token_output + self.adapter(year_emb).unsqueeze(1)
212
+
213
+ elif self.strategy == "add":
214
+ expanded_year = year_emb.unsqueeze(1).repeat(1, T, 1)
215
+ return token_output + expanded_year
216
+
217
+ elif self.strategy == "relative":
218
+ encoded = self.relative_encoder(year_emb)
219
+ gamma = self.film_gamma(encoded).unsqueeze(1)
220
+ beta = self.film_beta(encoded).unsqueeze(1)
221
+ return gamma * token_output + beta
222
+
223
+ elif self.strategy == "multiscale":
224
+ expanded_year = year_emb.unsqueeze(1).expand(-1, T, -1)
225
+ return token_output + expanded_year
226
+
227
+ elif self.strategy == "late-cross-attention":
228
+ return self.cross_attn(token_output, year_emb)
229
+
230
+ else:
231
+ raise ValueError(f"Unknown fusion strategy: {self.strategy}")
232
+
233
+
234
+ class TemporalCrossAttention(nn.Module):
235
+ def __init__(self, hidden_size, num_heads=4):
236
+ super().__init__()
237
+ self.attn = nn.MultiheadAttention(embed_dim=hidden_size, num_heads=num_heads, batch_first=True)
238
+
239
+ def forward(self, token_output, time_embedding):
240
+ # token_output: (B, T, H), time_embedding: (B, H)
241
+ time_as_seq = time_embedding.unsqueeze(1) # (B, 1, H)
242
+ attn_output, _ = self.attn(token_output, time_as_seq, time_as_seq)
243
+ return token_output + attn_output
old/config.json ADDED
@@ -0,0 +1,233 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "experiments_final/model_dbmdz_bert_medium_historic_multilingual_cased_max_sequence_length_512_epochs_5_run_extended_suffix_baseline/checkpoint-450",
3
+ "architectures": [
4
+ "ExtendedMultitaskModelForTokenClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "auto_map": {
8
+ "AutoConfig": "configuration_stacked.ImpressoConfig",
9
+ "AutoModelForTokenClassification": "modeling_stacked.ExtendedMultitaskModelForTokenClassification"
10
+ },
11
+ "classifier_dropout": null,
12
+ "custom_pipelines": {
13
+ "generic-ner": {
14
+ "impl": "generic_ner.MultitaskTokenClassificationPipeline",
15
+ "pt": "AutoModelForTokenClassification"
16
+ }
17
+ },
18
+ "hidden_act": "gelu",
19
+ "hidden_dropout_prob": 0.1,
20
+ "hidden_size": 512,
21
+ "initializer_range": 0.02,
22
+ "intermediate_size": 2048,
23
+ "label_map": {
24
+ "NE-COARSE-LIT": {
25
+ "B-loc": 8,
26
+ "B-org": 0,
27
+ "B-pers": 7,
28
+ "B-prod": 4,
29
+ "B-time": 5,
30
+ "I-loc": 1,
31
+ "I-org": 2,
32
+ "I-pers": 9,
33
+ "I-prod": 10,
34
+ "I-time": 6,
35
+ "O": 3
36
+ },
37
+ "NE-COARSE-METO": {
38
+ "B-loc": 3,
39
+ "B-org": 0,
40
+ "B-time": 5,
41
+ "I-loc": 4,
42
+ "I-org": 2,
43
+ "O": 1
44
+ },
45
+ "NE-FINE-COMP": {
46
+ "B-comp.demonym": 8,
47
+ "B-comp.function": 5,
48
+ "B-comp.name": 1,
49
+ "B-comp.qualifier": 9,
50
+ "B-comp.title": 2,
51
+ "I-comp.demonym": 7,
52
+ "I-comp.function": 3,
53
+ "I-comp.name": 0,
54
+ "I-comp.qualifier": 10,
55
+ "I-comp.title": 4,
56
+ "O": 6
57
+ },
58
+ "NE-FINE-LIT": {
59
+ "B-loc.add.elec": 32,
60
+ "B-loc.add.phys": 5,
61
+ "B-loc.adm.nat": 34,
62
+ "B-loc.adm.reg": 39,
63
+ "B-loc.adm.sup": 12,
64
+ "B-loc.adm.town": 33,
65
+ "B-loc.fac": 36,
66
+ "B-loc.oro": 19,
67
+ "B-loc.phys.geo": 13,
68
+ "B-loc.phys.hydro": 28,
69
+ "B-loc.unk": 4,
70
+ "B-org.adm": 3,
71
+ "B-org.ent": 24,
72
+ "B-org.ent.pressagency": 37,
73
+ "B-pers.coll": 9,
74
+ "B-pers.ind": 0,
75
+ "B-pers.ind.articleauthor": 20,
76
+ "B-prod.doctr": 2,
77
+ "B-prod.media": 10,
78
+ "B-time.date.abs": 23,
79
+ "I-loc.add.elec": 22,
80
+ "I-loc.add.phys": 6,
81
+ "I-loc.adm.nat": 11,
82
+ "I-loc.adm.reg": 35,
83
+ "I-loc.adm.sup": 15,
84
+ "I-loc.adm.town": 8,
85
+ "I-loc.fac": 27,
86
+ "I-loc.oro": 21,
87
+ "I-loc.phys.geo": 25,
88
+ "I-loc.phys.hydro": 17,
89
+ "I-loc.unk": 40,
90
+ "I-org.adm": 29,
91
+ "I-org.ent": 1,
92
+ "I-org.ent.pressagency": 14,
93
+ "I-pers.coll": 26,
94
+ "I-pers.ind": 16,
95
+ "I-pers.ind.articleauthor": 31,
96
+ "I-prod.doctr": 30,
97
+ "I-prod.media": 38,
98
+ "I-time.date.abs": 7,
99
+ "O": 18
100
+ },
101
+ "NE-FINE-METO": {
102
+ "B-loc.adm.town": 6,
103
+ "B-loc.fac": 3,
104
+ "B-loc.oro": 5,
105
+ "B-org.adm": 1,
106
+ "B-org.ent": 7,
107
+ "B-time.date.abs": 9,
108
+ "I-loc.fac": 8,
109
+ "I-org.adm": 2,
110
+ "I-org.ent": 0,
111
+ "O": 4
112
+ },
113
+ "NE-NESTED": {
114
+ "B-loc.adm.nat": 13,
115
+ "B-loc.adm.reg": 15,
116
+ "B-loc.adm.sup": 10,
117
+ "B-loc.adm.town": 9,
118
+ "B-loc.fac": 18,
119
+ "B-loc.oro": 17,
120
+ "B-loc.phys.geo": 11,
121
+ "B-loc.phys.hydro": 1,
122
+ "B-org.adm": 4,
123
+ "B-org.ent": 20,
124
+ "B-pers.coll": 7,
125
+ "B-pers.ind": 2,
126
+ "B-prod.media": 23,
127
+ "I-loc.adm.nat": 8,
128
+ "I-loc.adm.reg": 14,
129
+ "I-loc.adm.town": 6,
130
+ "I-loc.fac": 0,
131
+ "I-loc.oro": 19,
132
+ "I-loc.phys.geo": 21,
133
+ "I-loc.phys.hydro": 22,
134
+ "I-org.adm": 5,
135
+ "I-org.ent": 3,
136
+ "I-pers.ind": 12,
137
+ "I-prod.media": 24,
138
+ "O": 16
139
+ }
140
+ },
141
+ "layer_norm_eps": 1e-12,
142
+ "max_position_embeddings": 512,
143
+ "model_type": "stacked_bert",
144
+ "num_attention_heads": 8,
145
+ "num_hidden_layers": 8,
146
+ "pad_token_id": 0,
147
+ "position_embedding_type": "absolute",
148
+ "pretrained_config": {
149
+ "_name_or_path": "dbmdz/bert-medium-historic-multilingual-cased",
150
+ "add_cross_attention": false,
151
+ "architectures": [
152
+ "BertForMaskedLM"
153
+ ],
154
+ "attention_probs_dropout_prob": 0.1,
155
+ "bad_words_ids": null,
156
+ "begin_suppress_tokens": null,
157
+ "bos_token_id": null,
158
+ "chunk_size_feed_forward": 0,
159
+ "classifier_dropout": null,
160
+ "cross_attention_hidden_size": null,
161
+ "decoder_start_token_id": null,
162
+ "diversity_penalty": 0.0,
163
+ "do_sample": false,
164
+ "early_stopping": false,
165
+ "encoder_no_repeat_ngram_size": 0,
166
+ "eos_token_id": null,
167
+ "exponential_decay_length_penalty": null,
168
+ "finetuning_task": null,
169
+ "forced_bos_token_id": null,
170
+ "forced_eos_token_id": null,
171
+ "hidden_act": "gelu",
172
+ "hidden_dropout_prob": 0.1,
173
+ "hidden_size": 512,
174
+ "id2label": {
175
+ "0": "LABEL_0",
176
+ "1": "LABEL_1"
177
+ },
178
+ "initializer_range": 0.02,
179
+ "intermediate_size": 2048,
180
+ "is_decoder": false,
181
+ "is_encoder_decoder": false,
182
+ "label2id": {
183
+ "LABEL_0": 0,
184
+ "LABEL_1": 1
185
+ },
186
+ "layer_norm_eps": 1e-12,
187
+ "length_penalty": 1.0,
188
+ "max_length": 20,
189
+ "max_position_embeddings": 512,
190
+ "min_length": 0,
191
+ "model_type": "bert",
192
+ "no_repeat_ngram_size": 0,
193
+ "num_attention_heads": 8,
194
+ "num_beam_groups": 1,
195
+ "num_beams": 1,
196
+ "num_hidden_layers": 8,
197
+ "num_return_sequences": 1,
198
+ "output_attentions": false,
199
+ "output_hidden_states": false,
200
+ "output_scores": false,
201
+ "pad_token_id": 0,
202
+ "position_embedding_type": "absolute",
203
+ "prefix": null,
204
+ "problem_type": null,
205
+ "pruned_heads": {},
206
+ "remove_invalid_values": false,
207
+ "repetition_penalty": 1.0,
208
+ "return_dict": true,
209
+ "return_dict_in_generate": false,
210
+ "sep_token_id": null,
211
+ "suppress_tokens": null,
212
+ "task_specific_params": null,
213
+ "temperature": 1.0,
214
+ "tf_legacy_loss": false,
215
+ "tie_encoder_decoder": false,
216
+ "tie_word_embeddings": true,
217
+ "tokenizer_class": null,
218
+ "top_k": 50,
219
+ "top_p": 1.0,
220
+ "torch_dtype": null,
221
+ "torchscript": false,
222
+ "type_vocab_size": 2,
223
+ "typical_p": 1.0,
224
+ "use_bfloat16": false,
225
+ "use_cache": true,
226
+ "vocab_size": 32000
227
+ },
228
+ "torch_dtype": "float32",
229
+ "transformers_version": "4.40.0.dev0",
230
+ "type_vocab_size": 2,
231
+ "use_cache": true,
232
+ "vocab_size": 32000
233
+ }
old/configuration_stacked.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import PretrainedConfig
2
+ import torch
3
+
4
+
5
+ class ImpressoConfig(PretrainedConfig):
6
+ model_type = "stacked_bert"
7
+
8
+ def __init__(
9
+ self,
10
+ vocab_size=30522,
11
+ hidden_size=768,
12
+ num_hidden_layers=12,
13
+ num_attention_heads=12,
14
+ intermediate_size=3072,
15
+ hidden_act="gelu",
16
+ hidden_dropout_prob=0.1,
17
+ attention_probs_dropout_prob=0.1,
18
+ max_position_embeddings=512,
19
+ type_vocab_size=2,
20
+ initializer_range=0.02,
21
+ layer_norm_eps=1e-12,
22
+ pad_token_id=0,
23
+ position_embedding_type="absolute",
24
+ use_cache=True,
25
+ classifier_dropout=None,
26
+ pretrained_config=None,
27
+ values_override=None,
28
+ label_map=None,
29
+ **kwargs,
30
+ ):
31
+ super().__init__(pad_token_id=pad_token_id, **kwargs)
32
+
33
+ self.vocab_size = vocab_size
34
+ self.hidden_size = hidden_size
35
+ self.num_hidden_layers = num_hidden_layers
36
+ self.num_attention_heads = num_attention_heads
37
+ self.hidden_act = hidden_act
38
+ self.intermediate_size = intermediate_size
39
+ self.hidden_dropout_prob = hidden_dropout_prob
40
+ self.attention_probs_dropout_prob = attention_probs_dropout_prob
41
+ self.max_position_embeddings = max_position_embeddings
42
+ self.type_vocab_size = type_vocab_size
43
+ self.initializer_range = initializer_range
44
+ self.layer_norm_eps = layer_norm_eps
45
+ self.position_embedding_type = position_embedding_type
46
+ self.use_cache = use_cache
47
+ self.classifier_dropout = classifier_dropout
48
+ self.pretrained_config = pretrained_config
49
+ self.label_map = label_map
50
+
51
+ self.values_override = values_override or {}
52
+ self.outputs = {
53
+ "logits": {"shape": [None, None, self.hidden_size], "dtype": "float32"}
54
+ }
55
+
56
+ @classmethod
57
+ def is_torch_support_available(cls):
58
+ """
59
+ Indicate whether Torch support is available for this configuration.
60
+ Required for compatibility with certain parts of the Transformers library.
61
+ """
62
+ return True
63
+
64
+ @classmethod
65
+ def patch_ops(self):
66
+ """
67
+ A method required by some Hugging Face utilities to modify operator mappings.
68
+ Currently, it performs no operation and is included for compatibility.
69
+ Args:
70
+ ops: A dictionary of operations to potentially patch.
71
+ Returns:
72
+ The (unmodified) ops dictionary.
73
+ """
74
+ return None
75
+
76
+ def generate_dummy_inputs(self, tokenizer, batch_size=1, seq_length=8, framework="pt"):
77
+ """
78
+ Generate dummy inputs for testing or export.
79
+ Args:
80
+ tokenizer: The tokenizer used to tokenize inputs.
81
+ batch_size: Number of input samples in the batch.
82
+ seq_length: Length of each sequence.
83
+ framework: Framework ("pt" for PyTorch, "tf" for TensorFlow).
84
+ Returns:
85
+ Dummy inputs as a dictionary.
86
+ """
87
+ if framework == "pt":
88
+ input_ids = torch.randint(
89
+ low=0,
90
+ high=self.vocab_size,
91
+ size=(batch_size, seq_length),
92
+ dtype=torch.long
93
+ )
94
+ attention_mask = torch.ones((batch_size, seq_length), dtype=torch.long)
95
+ return {"input_ids": input_ids, "attention_mask": attention_mask}
96
+ else:
97
+ raise ValueError("Framework '{}' not supported.".format(framework))
98
+
99
+
100
+ # Register the configuration with the transformers library
101
+ ImpressoConfig.register_for_auto_class()
old/generic_ner.py ADDED
@@ -0,0 +1,789 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from transformers import Pipeline
3
+ import numpy as np
4
+ import torch
5
+ import nltk
6
+
7
+ nltk.download("averaged_perceptron_tagger")
8
+ nltk.download("averaged_perceptron_tagger_eng")
9
+ nltk.download("stopwords")
10
+ from nltk.chunk import conlltags2tree
11
+ from nltk import pos_tag
12
+ from nltk.tree import Tree
13
+ import torch.nn.functional as F
14
+ import re, string
15
+
16
+ stop_words = set(nltk.corpus.stopwords.words("english"))
17
+ DEBUG = False
18
+ punctuation = (
19
+ string.punctuation
20
+ + "«»—…“”"
21
+ + "—."
22
+ + "–"
23
+ + "’"
24
+ + "‘"
25
+ + "´"
26
+ + "•"
27
+ + "°"
28
+ + "»"
29
+ + "“"
30
+ + "”"
31
+ + "–"
32
+ + "—"
33
+ + "‘’“”„«»•–—―‣◦…§¶†‡‰′″〈〉"
34
+ )
35
+
36
+ # List of additional "strange" punctuation marks
37
+ # additional_punctuation = "‘’“”„«»•–—―‣◦…§¶†‡‰′″〈〉"
38
+
39
+
40
+ WHITESPACE_RULES = {
41
+ "fr": {
42
+ "pct_no_ws_before": [".", ",", ")", "]", "}", "°", "...", ".-", "%"],
43
+ "pct_no_ws_after": ["(", "[", "{"],
44
+ "pct_no_ws_before_after": ["'", "-"],
45
+ "pct_number": [".", ","],
46
+ },
47
+ "de": {
48
+ "pct_no_ws_before": [
49
+ ".",
50
+ ",",
51
+ ")",
52
+ "]",
53
+ "}",
54
+ "°",
55
+ "...",
56
+ "?",
57
+ "!",
58
+ ":",
59
+ ";",
60
+ ".-",
61
+ "%",
62
+ ],
63
+ "pct_no_ws_after": ["(", "[", "{"],
64
+ "pct_no_ws_before_after": ["'", "-"],
65
+ "pct_number": [".", ","],
66
+ },
67
+ "other": {
68
+ "pct_no_ws_before": [
69
+ ".",
70
+ ",",
71
+ ")",
72
+ "]",
73
+ "}",
74
+ "°",
75
+ "...",
76
+ "?",
77
+ "!",
78
+ ":",
79
+ ";",
80
+ ".-",
81
+ "%",
82
+ ],
83
+ "pct_no_ws_after": ["(", "[", "{"],
84
+ "pct_no_ws_before_after": ["'", "-"],
85
+ "pct_number": [".", ","],
86
+ },
87
+ }
88
+
89
+
90
+ # def tokenize(text: str, language: str = "other") -> list[str]:
91
+ # """Apply whitespace rules to the given text and language, separating it into tokens.
92
+ #
93
+ # Args:
94
+ # text (str): The input text to separate into a list of tokens.
95
+ # language (str): Language of the text.
96
+ #
97
+ # Returns:
98
+ # list[str]: List of tokens with punctuation as separate tokens.
99
+ # """
100
+ # # text = add_spaces_around_punctuation(text)
101
+ # if not text:
102
+ # return []
103
+ #
104
+ # if language not in WHITESPACE_RULES:
105
+ # # Default behavior for languages without specific rules:
106
+ # # tokenize using standard whitespace splitting
107
+ # language = "other"
108
+ #
109
+ # wsrules = WHITESPACE_RULES[language]
110
+ # tokenized_text = []
111
+ # current_token = ""
112
+ #
113
+ # for char in text:
114
+ # if char in wsrules["pct_no_ws_before_after"]:
115
+ # if current_token:
116
+ # tokenized_text.append(current_token)
117
+ # tokenized_text.append(char)
118
+ # current_token = ""
119
+ # elif char in wsrules["pct_no_ws_before"] or char in wsrules["pct_no_ws_after"]:
120
+ # if current_token:
121
+ # tokenized_text.append(current_token)
122
+ # tokenized_text.append(char)
123
+ # current_token = ""
124
+ # elif char.isspace():
125
+ # if current_token:
126
+ # tokenized_text.append(current_token)
127
+ # current_token = ""
128
+ # else:
129
+ # current_token += char
130
+ #
131
+ # if current_token:
132
+ # tokenized_text.append(current_token)
133
+ #
134
+ # return tokenized_text
135
+
136
+
137
+ def normalize_text(text):
138
+ # Remove spaces and tabs for the search but keep newline characters
139
+ return re.sub(r"[ \t]+", "", text)
140
+
141
+
142
+ def find_entity_indices(article_text, search_text):
143
+ # Normalize texts by removing spaces and tabs
144
+ normalized_article = normalize_text(article_text)
145
+ normalized_search = normalize_text(search_text)
146
+
147
+ # Initialize a list to hold all start and end indices
148
+ indices = []
149
+
150
+ # Find all occurrences of the search text in the normalized article text
151
+ start_index = 0
152
+ while True:
153
+ start_index = normalized_article.find(normalized_search, start_index)
154
+ if start_index == -1:
155
+ break
156
+
157
+ # Calculate the actual start and end indices in the original article text
158
+ original_chars = 0
159
+ original_start_index = 0
160
+ for i in range(start_index):
161
+ while article_text[original_start_index] in (" ", "\t"):
162
+ original_start_index += 1
163
+ if article_text[original_start_index] not in (" ", "\t", "\n"):
164
+ original_chars += 1
165
+ original_start_index += 1
166
+
167
+ original_end_index = original_start_index
168
+ search_chars = 0
169
+ while search_chars < len(normalized_search):
170
+ if article_text[original_end_index] not in (" ", "\t", "\n"):
171
+ search_chars += 1
172
+ original_end_index += 1 # Increment to include the last character
173
+
174
+ # Append the found indices to the list
175
+ if article_text[original_start_index] == " ":
176
+ original_start_index += 1
177
+ indices.append((original_start_index, original_end_index))
178
+
179
+ # Move start_index to the next position to continue searching
180
+ start_index += 1
181
+
182
+ return indices
183
+
184
+
185
+ def get_entities(tokens, tags, confidences, text):
186
+
187
+ tags = [tag.replace("S-", "B-").replace("E-", "I-") for tag in tags]
188
+ pos_tags = [pos for token, pos in pos_tag(tokens)]
189
+
190
+ for i in range(1, len(tags)):
191
+ # If a 'B-' tag is followed by another 'B-' without an 'O' in between, change the second to 'I-'
192
+ if tags[i].startswith("B-") and tags[i - 1].startswith("I-"):
193
+ tags[i] = "I-" + tags[i][2:] # Change 'B-' to 'I-' for the same entity type
194
+
195
+ conlltags = [(token, pos, tg) for token, pos, tg in zip(tokens, pos_tags, tags)]
196
+ ne_tree = conlltags2tree(conlltags)
197
+
198
+ entities = []
199
+ idx: int = 0
200
+ already_done = []
201
+ for subtree in ne_tree:
202
+ # skipping 'O' tags
203
+ if isinstance(subtree, Tree):
204
+ original_label = subtree.label()
205
+ original_string = " ".join([token for token, pos in subtree.leaves()])
206
+
207
+ for indices in find_entity_indices(text, original_string):
208
+ entity_start_position = indices[0]
209
+ entity_end_position = indices[1]
210
+ if (
211
+ "_".join(
212
+ [original_label, original_string, str(entity_start_position)]
213
+ )
214
+ in already_done
215
+ ):
216
+ continue
217
+ else:
218
+ already_done.append(
219
+ "_".join(
220
+ [
221
+ original_label,
222
+ original_string,
223
+ str(entity_start_position),
224
+ ]
225
+ )
226
+ )
227
+ if len(text[entity_start_position:entity_end_position].strip()) < len(
228
+ text[entity_start_position:entity_end_position]
229
+ ):
230
+ entity_start_position = (
231
+ entity_start_position
232
+ + len(text[entity_start_position:entity_end_position])
233
+ - len(text[entity_start_position:entity_end_position].strip())
234
+ )
235
+
236
+ entities.append(
237
+ {
238
+ "type": original_label,
239
+ "confidence_ner": round(
240
+ np.average(confidences[idx : idx + len(subtree)]), 2
241
+ ),
242
+ "index": (idx, idx + len(subtree)),
243
+ "surface": text[
244
+ entity_start_position:entity_end_position
245
+ ], # original_string,
246
+ "lOffset": entity_start_position,
247
+ "rOffset": entity_end_position,
248
+ }
249
+ )
250
+
251
+ idx += len(subtree)
252
+
253
+ # Update the current character position
254
+ # We add the length of the original string + 1 (for the space)
255
+ else:
256
+ token, pos = subtree
257
+ # If it's not a named entity, we still need to update the character
258
+ # position
259
+ idx += 1
260
+
261
+ return entities
262
+
263
+
264
+ def realign(word_ids, tokens, out_label_preds, softmax_scores, tokenizer, reverted_label_map):
265
+ preds_list, words_list, confidence_list = [], [], []
266
+
267
+ seen_word_ids = set()
268
+ for i, word_id in enumerate(word_ids):
269
+ if word_id is None or word_id in seen_word_ids:
270
+ continue # skip special tokens or repeated subwords
271
+
272
+ seen_word_ids.add(word_id)
273
+
274
+ try:
275
+ preds_list.append(reverted_label_map[out_label_preds[i]])
276
+ confidence_list.append(max(softmax_scores[i]))
277
+ except Exception:
278
+ preds_list.append("O")
279
+ confidence_list.append(0.0)
280
+
281
+ words_list.append(tokens[word_id]) # original word list index
282
+
283
+ return words_list, preds_list, confidence_list
284
+
285
+ def add_spaces_around_punctuation(text):
286
+ # Add a space before and after all punctuation
287
+ all_punctuation = string.punctuation + punctuation
288
+ return re.sub(r"([{}])".format(re.escape(all_punctuation)), r" \1 ", text)
289
+
290
+
291
+ def attach_comp_to_closest(entities):
292
+ # Define valid entity types that can receive a "comp.function" or "comp.name" attachment
293
+ valid_entity_types = {"org", "pers", "org.ent", "pers.ind"}
294
+
295
+ # Separate "comp.function" and "comp.name" entities from other entities
296
+ comp_entities = [ent for ent in entities if ent["type"].startswith("comp")]
297
+ other_entities = [ent for ent in entities if not ent["type"].startswith("comp")]
298
+
299
+ for comp_entity in comp_entities:
300
+ closest_entity = None
301
+ min_distance = float("inf")
302
+
303
+ # Find the closest non-"comp" entity that is valid for attaching
304
+ for other_entity in other_entities:
305
+ # Calculate distance between the comp entity and the other entity
306
+ if comp_entity["lOffset"] > other_entity["rOffset"]:
307
+ distance = comp_entity["lOffset"] - other_entity["rOffset"]
308
+ elif comp_entity["rOffset"] < other_entity["lOffset"]:
309
+ distance = other_entity["lOffset"] - comp_entity["rOffset"]
310
+ else:
311
+ distance = 0 # They overlap or touch
312
+
313
+ # Ensure the entity type is valid and check for minimal distance
314
+ if (
315
+ distance < min_distance
316
+ and other_entity["type"].split(".")[0] in valid_entity_types
317
+ ):
318
+ min_distance = distance
319
+ closest_entity = other_entity
320
+
321
+ # Attach the "comp.function" or "comp.name" if a valid entity is found
322
+ if closest_entity:
323
+ suffix = comp_entity["type"].split(".")[
324
+ -1
325
+ ] # Extract the suffix (e.g., 'name', 'function')
326
+ closest_entity[suffix] = comp_entity["surface"] # Attach the text
327
+
328
+ return other_entities
329
+
330
+
331
+ def conflicting_context(comp_entity, target_entity):
332
+ """
333
+ Determines if there is a conflict between the comp_entity and the target entity.
334
+ Prevents incorrect name and function attachments by using a rule-based approach.
335
+ """
336
+ # Case 1: Check for correct function attachment to person or organization entities
337
+ if comp_entity["type"].startswith("comp.function"):
338
+ if not ("pers" in target_entity["type"] or "org" in target_entity["type"]):
339
+ return True # Conflict: Function should only attach to persons or organizations
340
+
341
+ # Case 2: Avoid attaching comp.* entities to non-person, non-organization types (like locations)
342
+ if "loc" in target_entity["type"]:
343
+ return True # Conflict: comp.* entities should not attach to locations or similar types
344
+
345
+ return False # No conflict
346
+
347
+
348
+ def extract_name_from_text(text, partial_name):
349
+ """
350
+ Extracts the full name from the entity's text based on the partial name.
351
+ This function assumes that the full name starts with capitalized letters and does not
352
+ include any words that come after the partial name.
353
+ """
354
+ # Split the text and partial name into words
355
+ words = text.split()
356
+ partial_words = partial_name.split()
357
+
358
+ if DEBUG:
359
+ print("text:", text)
360
+ if DEBUG:
361
+ print("partial_name:", partial_name)
362
+
363
+ # Find the position of the partial name in the word list
364
+ for i, word in enumerate(words):
365
+ if DEBUG:
366
+ print(words, "---", words[i : i + len(partial_words)])
367
+ if words[i : i + len(partial_words)] == partial_words:
368
+ # Initialize full name with the partial name
369
+ full_name = partial_words[:]
370
+
371
+ if DEBUG:
372
+ print("full_name:", full_name)
373
+
374
+ # Check previous words and only add capitalized words (skip lowercase words)
375
+ j = i - 1
376
+ while j >= 0 and words[j][0].isupper():
377
+ full_name.insert(0, words[j])
378
+ j -= 1
379
+ if DEBUG:
380
+ print("full_name:", full_name)
381
+
382
+ # Return only the full name up to the partial name (ignore words after the name)
383
+ return " ".join(full_name).strip() # Join the words to form the full name
384
+
385
+ # If not found, return the original text (as a fallback)
386
+ return text.strip()
387
+
388
+
389
+ def repair_names_in_entities(entities):
390
+ """
391
+ This function repairs the names in the entities by extracting the full name
392
+ from the text of the entity if a partial name (e.g., 'Washington') is incorrectly attached.
393
+ """
394
+ for entity in entities:
395
+ if "name" in entity and "pers" in entity["type"]:
396
+ name = entity["name"]
397
+ text = entity["surface"]
398
+
399
+ # Check if the attached name is part of the entity's text
400
+ if name in text:
401
+ # Extract the full name from the text by splitting around the attached name
402
+ full_name = extract_name_from_text(entity["surface"], name)
403
+ entity["name"] = (
404
+ full_name # Replace the partial name with the full name
405
+ )
406
+ # if "name" not in entity:
407
+ # entity["name"] = entity["surface"]
408
+
409
+ return entities
410
+
411
+
412
+ def clean_coarse_entities(entities):
413
+ """
414
+ This function removes entities that are not useful for the NEL process.
415
+ """
416
+ # Define a set of entity types that are considered useful for NEL
417
+ useful_types = {
418
+ "pers", # Person
419
+ "loc", # Location
420
+ "org", # Organization
421
+ "date", # Product
422
+ "time", # Time
423
+ }
424
+
425
+ # Filter out entities that are not in the useful_types set unless they are comp.* entities
426
+ cleaned_entities = [
427
+ entity
428
+ for entity in entities
429
+ if entity["type"] in useful_types or "comp" in entity["type"]
430
+ ]
431
+
432
+ return cleaned_entities
433
+
434
+
435
+ def postprocess_entities(entities):
436
+ # Step 1: Filter entities with the same text, keeping the one with the most dots in the 'entity' field
437
+ entity_map = {}
438
+
439
+ # Loop over the entities and prioritize the one with the most dots
440
+ for entity in entities:
441
+ entity_text = entity["surface"]
442
+ num_dots = entity["type"].count(".")
443
+
444
+ # If the entity text is new, or this entity has more dots, update the map
445
+ if (
446
+ entity_text not in entity_map
447
+ or entity_map[entity_text]["type"].count(".") < num_dots
448
+ ):
449
+ entity_map[entity_text] = entity
450
+
451
+ # Collect the filtered entities from the map
452
+ filtered_entities = list(entity_map.values())
453
+
454
+ # Step 2: Attach "comp.function" entities to the closest other entities
455
+ filtered_entities = attach_comp_to_closest(filtered_entities)
456
+ if DEBUG:
457
+ print("After attach_comp_to_closest:", filtered_entities, "\n")
458
+ filtered_entities = repair_names_in_entities(filtered_entities)
459
+ if DEBUG:
460
+ print("After repair_names_in_entities:", filtered_entities, "\n")
461
+
462
+ # Step 3: Remove entities that are not useful for NEL
463
+ # filtered_entities = clean_coarse_entities(filtered_entities)
464
+
465
+ # filtered_entities = remove_blacklisted_entities(filtered_entities)
466
+
467
+ return filtered_entities
468
+
469
+
470
+ def remove_included_entities(entities):
471
+ # Loop through entities and remove those whose text is included in another with the same label
472
+ final_entities = []
473
+ for i, entity in enumerate(entities):
474
+ is_included = False
475
+ for other_entity in entities:
476
+ if entity["surface"] != other_entity["surface"]:
477
+ if "comp" in other_entity["type"]:
478
+ # Check if entity's text is a substring of another entity's text
479
+ if entity["surface"] in other_entity["surface"]:
480
+ is_included = True
481
+ break
482
+ elif (
483
+ entity["type"].split(".")[0] in other_entity["type"].split(".")[0]
484
+ or other_entity["type"].split(".")[0]
485
+ in entity["type"].split(".")[0]
486
+ ):
487
+ if entity["surface"] in other_entity["surface"]:
488
+ is_included = True
489
+ if not is_included:
490
+ final_entities.append(entity)
491
+ return final_entities
492
+
493
+
494
+ def refine_entities_with_coarse(all_entities, coarse_entities):
495
+ """
496
+ Looks through all entities and refines them based on the coarse entities.
497
+ If a surface match is found in the coarse entities and the types match,
498
+ the entity's confidence_ner and type are updated based on the coarse entity.
499
+ """
500
+ # Create a dictionary for coarse entities based on surface and type for quick lookup
501
+ coarse_lookup = {}
502
+ for coarse_entity in coarse_entities:
503
+ key = (coarse_entity["surface"], coarse_entity["type"].split(".")[0])
504
+ coarse_lookup[key] = coarse_entity
505
+
506
+ # Iterate through all entities and compare with the coarse entities
507
+ for entity in all_entities:
508
+ key = (
509
+ entity["surface"],
510
+ entity["type"].split(".")[0],
511
+ ) # Use the coarse type for comparison
512
+
513
+ if key in coarse_lookup:
514
+ coarse_entity = coarse_lookup[key]
515
+ # If a match is found, update the confidence_ner and type in the entity
516
+ if entity["confidence_ner"] < coarse_entity["confidence_ner"]:
517
+ entity["confidence_ner"] = coarse_entity["confidence_ner"]
518
+ entity["type"] = coarse_entity[
519
+ "type"
520
+ ] # Update the type if the confidence is higher
521
+
522
+ # No need to append to refined_entities, we're modifying in place
523
+ for entity in all_entities:
524
+ entity["type"] = entity["type"].split(".")[0]
525
+ return all_entities
526
+
527
+
528
+ def remove_trailing_stopwords(entities):
529
+ """
530
+ This function removes stopwords and punctuation from both the beginning and end of each entity's text
531
+ and repairs the lOffset and rOffset accordingly.
532
+ """
533
+ if DEBUG:
534
+ print(f"Initial entities in remove_trailing_stopwords: {len(entities)}")
535
+ new_entities = []
536
+ for entity in entities:
537
+ if "comp" not in entity["type"]:
538
+ entity_text = entity["surface"]
539
+ original_len = len(entity_text)
540
+
541
+ # Initial offsets
542
+ lOffset = entity.get("lOffset", 0)
543
+ rOffset = entity.get("rOffset", original_len)
544
+
545
+ # Remove stopwords and punctuation from the beginning
546
+ # print('----', entity_text)
547
+ if len(entity_text.split()) < 1:
548
+ continue
549
+ while entity_text and (
550
+ entity_text.split()[0].lower() in stop_words
551
+ or entity_text[0] in punctuation
552
+ ):
553
+ if entity_text.split()[0].lower() in stop_words:
554
+ stopword_len = (
555
+ len(entity_text.split()[0]) + 1
556
+ ) # Adjust length for stopword and following space
557
+ entity_text = entity_text[stopword_len:] # Remove leading stopword
558
+ lOffset += stopword_len # Adjust the left offset
559
+ if DEBUG:
560
+ print(
561
+ f"Removed leading stopword from entity: {entity['surface']} --> {entity_text} ({entity['type']}"
562
+ )
563
+ elif entity_text[0] in punctuation:
564
+ entity_text = entity_text[1:] # Remove leading punctuation
565
+ lOffset += 1 # Adjust the left offset
566
+ if DEBUG:
567
+ print(
568
+ f"Removed leading punctuation from entity: {entity['surface']} --> {entity_text} ({entity['type']}"
569
+ )
570
+
571
+ # Remove stopwords and punctuation from the end
572
+ if len(entity_text.strip()) > 1:
573
+ while (
574
+ entity_text.strip().split()
575
+ and (
576
+ entity_text.strip().split()[-1].lower() in stop_words
577
+ or entity_text[-1] in punctuation
578
+ )
579
+ ):
580
+ if entity_text.strip().split() and entity_text.strip().split()[-1].lower() in stop_words:
581
+ stopword_len = len(entity_text.strip().split()[-1]) + 1 # account for space
582
+ entity_text = entity_text[:-stopword_len]
583
+ rOffset -= stopword_len
584
+ if DEBUG:
585
+ print(
586
+ f"Removed trailing stopword from entity: {entity['surface']} --> {entity_text} ({entity['type']})"
587
+ )
588
+ if entity_text and entity_text[-1] in punctuation:
589
+ entity_text = entity_text[:-1]
590
+ rOffset -= 1
591
+ if DEBUG:
592
+ print(
593
+ f"Removed trailing punctuation from entity: {entity['surface']} --> {entity_text} ({entity['type']})"
594
+ )
595
+
596
+ # Skip certain entities based on rules
597
+ if entity_text in string.punctuation:
598
+ if DEBUG:
599
+ print(f"Skipping entity: {entity_text}")
600
+ # entities.remove(entity)
601
+ continue
602
+ # check now if its in stopwords
603
+ if entity_text.lower() in stop_words:
604
+ if DEBUG:
605
+ print(f"Skipping entity: {entity_text}")
606
+ # entities.remove(entity)
607
+ continue
608
+ # check now if the entire entity is a list of stopwords:
609
+ if all([word.lower() in stop_words for word in entity_text.split()]):
610
+ if DEBUG:
611
+ print(f"Skipping entity: {entity_text}")
612
+ # entities.remove(entity)
613
+ continue
614
+ # Check if the entire entity is made up of stopwords characters
615
+ if all(
616
+ [char.lower() in stop_words for char in entity_text if char.isalpha()]
617
+ ):
618
+ if DEBUG:
619
+ print(
620
+ f"Skipping entity: {entity_text} (all characters are stopwords)"
621
+ )
622
+ # entities.remove(entity)
623
+ continue
624
+ # check now if all entity is in a list of punctuation
625
+ if all([word in string.punctuation for word in entity_text.split()]):
626
+ if DEBUG:
627
+ print(
628
+ f"Skipping entity: {entity_text} (all characters are punctuation)"
629
+ )
630
+ # entities.remove(entity)
631
+ continue
632
+ if all(
633
+ [
634
+ char.lower() in string.punctuation
635
+ for char in entity_text
636
+ if char.isalpha()
637
+ ]
638
+ ):
639
+ if DEBUG:
640
+ print(
641
+ f"Skipping entity: {entity_text} (all characters are punctuation)"
642
+ )
643
+ # entities.remove(entity)
644
+ continue
645
+
646
+ # if it's a number and "time" no in it, then continue
647
+ if entity_text.isdigit() and "time" not in entity["type"]:
648
+ if DEBUG:
649
+ print(f"Skipping entity: {entity_text}")
650
+ # entities.remove(entity)
651
+ continue
652
+
653
+ if entity_text.startswith(" "):
654
+ entity_text = entity_text[1:]
655
+ # update lOffset, rOffset
656
+ lOffset += 1
657
+ if entity_text.endswith(" "):
658
+ entity_text = entity_text[:-1]
659
+ # update lOffset, rOffset
660
+ rOffset -= 1
661
+
662
+ # Update the entity surface and offsets
663
+ entity["surface"] = entity_text
664
+ entity["lOffset"] = lOffset
665
+ entity["rOffset"] = rOffset
666
+
667
+ # Remove the entity if the surface is empty after cleaning
668
+ if len(entity["surface"].strip()) == 0:
669
+ if DEBUG:
670
+ print(f"Deleted entity: {entity['surface']}")
671
+ # entities.remove(entity)
672
+ else:
673
+ new_entities.append(entity)
674
+ else:
675
+ new_entities.append(entity)
676
+ if DEBUG:
677
+ print(f"Remained entities in remove_trailing_stopwords: {len(new_entities)}")
678
+ return new_entities
679
+
680
+
681
+ class MultitaskTokenClassificationPipeline(Pipeline):
682
+
683
+ def _sanitize_parameters(self, **kwargs):
684
+ preprocess_kwargs = {}
685
+ if "text" in kwargs:
686
+ preprocess_kwargs["text"] = kwargs["text"]
687
+ if "tokens" in kwargs:
688
+ preprocess_kwargs["tokens"] = kwargs["tokens"]
689
+ self.label_map = self.model.config.label_map
690
+ self.id2label = {
691
+ task: {id_: label for label, id_ in labels.items()}
692
+ for task, labels in self.label_map.items()
693
+ }
694
+ return preprocess_kwargs, {}, {}
695
+
696
+ def preprocess(self, text, **kwargs):
697
+
698
+ tokens = kwargs["tokens"]
699
+ tokenized_inputs = self.tokenizer(
700
+ tokens, # a list of strings
701
+ is_split_into_words=True,
702
+ padding="max_length",
703
+ truncation=True,
704
+ max_length=512,
705
+ )
706
+ word_ids = tokenized_inputs.word_ids()
707
+
708
+ return tokenized_inputs, word_ids, text, tokens
709
+
710
+ def _forward(self, inputs):
711
+ inputs, word_ids, text, tokens = inputs
712
+
713
+ input_ids = torch.tensor([inputs["input_ids"]], dtype=torch.long).to(
714
+ self.model.device
715
+ )
716
+ attention_mask = torch.tensor([inputs["attention_mask"]], dtype=torch.long).to(
717
+ self.model.device
718
+ )
719
+ with torch.no_grad():
720
+ outputs = self.model(input_ids, attention_mask)
721
+ return outputs, word_ids, text, tokens
722
+
723
+ def is_within(self, entity1, entity2):
724
+ """Check if entity1 is fully within the bounds of entity2."""
725
+ return (
726
+ entity1["lOffset"] >= entity2["lOffset"]
727
+ and entity1["rOffset"] <= entity2["rOffset"]
728
+ )
729
+
730
+ def postprocess(self, outputs, **kwargs):
731
+ """
732
+ Postprocess the outputs of the model
733
+ :param outputs:
734
+ :param kwargs:
735
+ :return:
736
+ """
737
+ tokens_result, word_ids, text, tokens = outputs
738
+
739
+ predictions = {}
740
+ confidence_scores = {}
741
+ for task, logits in tokens_result.logits.items():
742
+ predictions[task] = torch.argmax(logits, dim=-1).tolist()[0]
743
+ confidence_scores[task] = F.softmax(logits, dim=-1).tolist()[0]
744
+
745
+ entities = {}
746
+ for task in predictions.keys():
747
+ words_list, preds_list, confidence_list = realign(
748
+ word_ids,
749
+ tokens,
750
+ predictions[task],
751
+ confidence_scores[task],
752
+ self.tokenizer,
753
+ self.id2label[task],
754
+ )
755
+
756
+ entities[task] = get_entities(words_list, preds_list, confidence_list, text)
757
+
758
+ # add titles to comp entities
759
+ # from pprint import pprint
760
+
761
+ # print("Before:")
762
+ # pprint(entities)
763
+
764
+ all_entities = []
765
+ coarse_entities = []
766
+ for key in entities:
767
+ if key in ["NE-COARSE-LIT"]:
768
+ coarse_entities = entities[key]
769
+ all_entities.extend(entities[key])
770
+
771
+ if DEBUG:
772
+ print(all_entities)
773
+ # print("After remove_included_entities:")
774
+ all_entities = remove_included_entities(all_entities)
775
+ if DEBUG:
776
+ print("After remove_included_entities:", all_entities)
777
+ all_entities = remove_trailing_stopwords(all_entities)
778
+ if DEBUG:
779
+ print("After remove_trailing_stopwords:", all_entities)
780
+ all_entities = postprocess_entities(all_entities)
781
+ if DEBUG:
782
+ print("After postprocess_entities:", all_entities)
783
+ all_entities = refine_entities_with_coarse(all_entities, coarse_entities)
784
+ if DEBUG:
785
+ print("After refine_entities_with_coarse:", all_entities)
786
+ # print("After attach_comp_to_closest:")
787
+ # pprint(all_entities)
788
+ # print("\n")
789
+ return all_entities
old/label_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"NE-COARSE-LIT": {"B-org": 0, "I-loc": 1, "I-org": 2, "O": 3, "B-prod": 4, "B-time": 5, "I-time": 6, "B-pers": 7, "B-loc": 8, "I-pers": 9, "I-prod": 10}, "NE-COARSE-METO": {"B-org": 0, "O": 1, "I-org": 2, "B-loc": 3, "I-loc": 4, "B-time": 5}, "NE-FINE-LIT": {"B-pers.ind": 0, "I-org.ent": 1, "B-prod.doctr": 2, "B-org.adm": 3, "B-loc.unk": 4, "B-loc.add.phys": 5, "I-loc.add.phys": 6, "I-time.date.abs": 7, "I-loc.adm.town": 8, "B-pers.coll": 9, "B-prod.media": 10, "I-loc.adm.nat": 11, "B-loc.adm.sup": 12, "B-loc.phys.geo": 13, "I-org.ent.pressagency": 14, "I-loc.adm.sup": 15, "I-pers.ind": 16, "I-loc.phys.hydro": 17, "O": 18, "B-loc.oro": 19, "B-pers.ind.articleauthor": 20, "I-loc.oro": 21, "I-loc.add.elec": 22, "B-time.date.abs": 23, "B-org.ent": 24, "I-loc.phys.geo": 25, "I-pers.coll": 26, "I-loc.fac": 27, "B-loc.phys.hydro": 28, "I-org.adm": 29, "I-prod.doctr": 30, "I-pers.ind.articleauthor": 31, "B-loc.add.elec": 32, "B-loc.adm.town": 33, "B-loc.adm.nat": 34, "I-loc.adm.reg": 35, "B-loc.fac": 36, "B-org.ent.pressagency": 37, "I-prod.media": 38, "B-loc.adm.reg": 39, "I-loc.unk": 40}, "NE-FINE-METO": {"I-org.ent": 0, "B-org.adm": 1, "I-org.adm": 2, "B-loc.fac": 3, "O": 4, "B-loc.oro": 5, "B-loc.adm.town": 6, "B-org.ent": 7, "I-loc.fac": 8, "B-time.date.abs": 9}, "NE-FINE-COMP": {"I-comp.name": 0, "B-comp.name": 1, "B-comp.title": 2, "I-comp.function": 3, "I-comp.title": 4, "B-comp.function": 5, "O": 6, "I-comp.demonym": 7, "B-comp.demonym": 8, "B-comp.qualifier": 9, "I-comp.qualifier": 10}, "NE-NESTED": {"I-loc.fac": 0, "B-loc.phys.hydro": 1, "B-pers.ind": 2, "I-org.ent": 3, "B-org.adm": 4, "I-org.adm": 5, "I-loc.adm.town": 6, "B-pers.coll": 7, "I-loc.adm.nat": 8, "B-loc.adm.town": 9, "B-loc.adm.sup": 10, "B-loc.phys.geo": 11, "I-pers.ind": 12, "B-loc.adm.nat": 13, "I-loc.adm.reg": 14, "B-loc.adm.reg": 15, "O": 16, "B-loc.oro": 17, "B-loc.fac": 18, "I-loc.oro": 19, "B-org.ent": 20, "I-loc.phys.geo": 21, "I-loc.phys.hydro": 22, "B-prod.media": 23, "I-prod.media": 
24}}
old/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:03a807b124debff782406c816eacb7ced1f2e25b9a5198b27e1616a41faa0662
3
+ size 193971960
old/modeling_stacked.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers.modeling_outputs import TokenClassifierOutput
2
+ import torch
3
+ import torch.nn as nn
4
+ from transformers import PreTrainedModel, AutoModel, AutoConfig, BertConfig
5
+ from torch.nn import CrossEntropyLoss
6
+ from typing import Optional, Tuple, Union
7
+ import logging, json, os
8
+
9
+ from .configuration_stacked import ImpressoConfig
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
def get_info(label_map):
    """Map each task name to the number of labels it defines.

    Args:
        label_map: Mapping of task name -> label collection (e.g. a
            label->index dict or a list of label strings).

    Returns:
        dict: task name -> label count, one entry per task.
    """
    counts = {}
    for task_name, task_labels in label_map.items():
        counts[task_name] = len(task_labels)
    return counts
17
+
18
+
19
class ExtendedMultitaskModelForTokenClassification(PreTrainedModel):
    """Multitask token-classification model on top of a shared pretrained encoder.

    A backbone encoder (loaded via ``AutoModel``) feeds two additional
    Transformer encoder layers, whose output is routed to one linear
    classifier per task (the keys of ``config.label_map``, e.g. the
    NE-COARSE/NE-FINE/NE-NESTED tasks). Losses of all supervised tasks are
    summed into a single scalar.
    """

    config_class = ImpressoConfig
    _keys_to_ignore_on_load_missing = [r"position_ids"]

    def __init__(self, config):
        super().__init__(config)
        # Per-task label counts derived from the label map, e.g.
        # {"NE-COARSE-LIT": 11, ...}; also sizes the per-task classifiers.
        self.num_token_labels_dict = get_info(config.label_map)
        self.config = config

        # Shared backbone; `pretrained_config` is expected to carry the
        # original checkpoint path under "_name_or_path" — TODO confirm
        # against ImpressoConfig.
        self.bert = AutoModel.from_pretrained(
            config.pretrained_config["_name_or_path"], config=config.pretrained_config
        )
        # Dropout before classification; fall back to 0.1 when the config
        # carries no `classifier_dropout` attribute at all, otherwise prefer
        # it over the generic hidden dropout.
        if "classifier_dropout" not in config.__dict__:
            classifier_dropout = 0.1
        else:
            classifier_dropout = (
                config.classifier_dropout
                if config.classifier_dropout is not None
                else config.hidden_dropout_prob
            )
        self.dropout = nn.Dropout(classifier_dropout)

        # Additional transformer layers refining the backbone output.
        # NOTE(review): built without batch_first, hence the transpose
        # round-trip in `forward`.
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=config.hidden_size, nhead=config.num_attention_heads
            ),
            num_layers=2,
        )

        # For token classification, create a classifier for each task.
        self.token_classifiers = nn.ModuleDict(
            {
                task: nn.Linear(config.hidden_size, num_labels)
                for task, num_labels in self.num_token_labels_dict.items()
            }
        )

        # Initialize weights and apply final processing (HF convention).
        self.post_init()

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,  # NOTE(review): accepted but unused; kept for HF API compatibility?
        token_labels: Optional[dict] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
        r"""
        token_labels (`dict` of `torch.LongTensor` of shape `(batch_size, seq_length)`, *optional*):
            Labels for computing the token classification loss. Keys should match the tasks.
        """
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        bert_kwargs = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "token_type_ids": token_type_ids,
            "position_ids": position_ids,
            "head_mask": head_mask,
            "inputs_embeds": inputs_embeds,
            "output_attentions": output_attentions,
            "output_hidden_states": output_hidden_states,
            "return_dict": return_dict,
        }

        # Some backbones (LLaMA-, DeBERTa-style) do not accept these kwargs;
        # detection is by substring of the model name/path.
        if any(
            keyword in self.config.name_or_path.lower()
            for keyword in ["llama", "deberta"]
        ):
            bert_kwargs.pop("token_type_ids")
            bert_kwargs.pop("head_mask")

        outputs = self.bert(**bert_kwargs)

        # For token classification: outputs[0] is the last hidden state,
        # presumably (batch, seq, hidden) — standard HF layout.
        token_output = outputs[0]
        token_output = self.dropout(token_output)

        # Pass through additional transformer layers:
        # (batch, seq, dim) -> (seq, batch, dim) and back, since the extra
        # encoder was constructed without batch_first.
        token_output = self.transformer_encoder(token_output.transpose(0, 1)).transpose(
            0, 1
        )

        # Collect the logits and compute the loss for each task; losses of
        # all tasks present in `token_labels` are summed.
        task_logits = {}
        total_loss = 0
        for task, classifier in self.token_classifiers.items():
            logits = classifier(token_output)
            task_logits[task] = logits
            if token_labels and task in token_labels:
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(
                    logits.view(-1, self.num_token_labels_dict[task]),
                    token_labels[task].view(-1),
                )
                total_loss += loss

        if not return_dict:
            output = (task_logits,) + outputs[2:]
            # NOTE(review): `total_loss != 0` drops the loss from the tuple
            # when the summed loss is exactly zero (and when no labels were
            # given); intentional-looking but worth confirming.
            return ((total_loss,) + output) if total_loss != 0 else output

        return TokenClassifierOutput(
            loss=total_loss,
            logits=task_logits,  # dict of per-task logit tensors, not a single tensor
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
old/special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": {
3
+ "content": "[CLS]",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "mask_token": {
10
+ "content": "[MASK]",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "[PAD]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "sep_token": {
24
+ "content": "[SEP]",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "unk_token": {
31
+ "content": "[UNK]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ }
37
+ }
old/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
old/tokenizer_config.json ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "4": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": true,
45
+ "cls_token": "[CLS]",
46
+ "do_basic_tokenize": true,
47
+ "do_lower_case": false,
48
+ "mask_token": "[MASK]",
49
+ "max_len": 512,
50
+ "model_max_length": 512,
51
+ "never_split": null,
52
+ "pad_token": "[PAD]",
53
+ "sep_token": "[SEP]",
54
+ "strip_accents": false,
55
+ "tokenize_chinese_chars": true,
56
+ "tokenizer_class": "BertTokenizer",
57
+ "unk_token": "[UNK]"
58
+ }
old/vocab.txt ADDED
The diff for this file is too large to render. See raw diff