Commit
·
47aff3b
1
Parent(s):
d2959f2
testing the trick
Browse files- modeling_stacked.py +23 -4
modeling_stacked.py
CHANGED
|
@@ -27,6 +27,22 @@ def get_info(label_map):
|
|
| 27 |
# return cls()
|
| 28 |
|
| 29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
class ExtendedMultitaskModelForTokenClassification(PreTrainedModel):
|
| 31 |
config_class = ImpressoConfig
|
| 32 |
_keys_to_ignore_on_load_missing = [r"position_ids"]
|
|
@@ -37,16 +53,19 @@ class ExtendedMultitaskModelForTokenClassification(PreTrainedModel):
|
|
| 37 |
|
| 38 |
# Load floret model
|
| 39 |
self.dummy_param = nn.Parameter(torch.zeros(1))
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
|
|
|
|
|
|
| 43 |
|
|
|
|
| 44 |
def forward(self, input_ids, attention_mask=None, **kwargs):
|
| 45 |
# Convert input_ids to strings using tokenizer
|
| 46 |
print(
|
| 47 |
f"Check if it arrives here: {input_ids}, ---, {type(input_ids)} ----- {type(self.model_floret)}"
|
| 48 |
)
|
| 49 |
-
|
| 50 |
# if input_ids is not None:
|
| 51 |
# tokenizer = kwargs.get("tokenizer")
|
| 52 |
# texts = tokenizer.batch_decode(input_ids, skip_special_tokens=True)
|
|
|
|
| 27 |
# return cls()
|
| 28 |
|
| 29 |
|
| 30 |
+
class SafeFloretWrapper(nn.Module):
    """
    A safe wrapper for a floret (fastText-style) model that keeps it
    off-device to avoid segmentation faults.

    The native floret model is held as a plain Python attribute, so it
    carries no registered parameters/buffers and `.to(device)` on the
    enclosing model never touches it.
    """

    def __init__(self, floret_model):
        """
        Args:
            floret_model: a loaded floret model exposing
                ``predict(list[str], k=int) -> (labels, probabilities)``.
        """
        super().__init__()
        self.floret_model = floret_model

    def forward(self, texts):
        """
        Run floret prediction on raw text and return the result as a tensor.

        Args:
            texts: a raw string (floret expects strings, not token-id tensors).

        Returns:
            torch.Tensor built from the floret prediction scores.
        """
        # BUG FIX: the original called `self.model_floret`, an attribute that
        # does not exist on this class — the model is stored as
        # `self.floret_model` in __init__ (`model_floret` is the *outer*
        # model's attribute name). The original would raise AttributeError.
        _, predictions = self.floret_model.predict([texts], k=1)
        # Convert predictions to a tensor for Hugging Face compatibility.
        return torch.tensor(predictions)
|
| 44 |
+
|
| 45 |
+
|
| 46 |
class ExtendedMultitaskModelForTokenClassification(PreTrainedModel):
|
| 47 |
config_class = ImpressoConfig
|
| 48 |
_keys_to_ignore_on_load_missing = [r"position_ids"]
|
|
|
|
| 53 |
|
| 54 |
# Load floret model
|
| 55 |
self.dummy_param = nn.Parameter(torch.zeros(1))
|
| 56 |
+
model_floret = floret.load_model(self.config.filename)
|
| 57 |
+
self.model_floret = SafeFloretWrapper(model_floret)
|
| 58 |
+
# input_ids = "this is a text"
|
| 59 |
+
|
| 60 |
+
# predictions, probabilities = self.model_floret.predict([input_ids], k=1)
|
| 61 |
|
| 62 |
+
#
|
| 63 |
def forward(self, input_ids, attention_mask=None, **kwargs):
|
| 64 |
# Convert input_ids to strings using tokenizer
|
| 65 |
print(
|
| 66 |
f"Check if it arrives here: {input_ids}, ---, {type(input_ids)} ----- {type(self.model_floret)}"
|
| 67 |
)
|
| 68 |
+
print(self.model_floret(input_ids))
|
| 69 |
# if input_ids is not None:
|
| 70 |
# tokenizer = kwargs.get("tokenizer")
|
| 71 |
# texts = tokenizer.batch_decode(input_ids, skip_special_tokens=True)
|