emanuelaboros committed on
Commit
a887bec
·
1 Parent(s): 03cf626

testing the trick

Browse files
Files changed (2) hide show
  1. lang_detect.py +3 -3
  2. modeling_stacked.py +3 -192
lang_detect.py CHANGED
@@ -14,8 +14,8 @@ class MultitaskTokenClassificationPipeline(Pipeline):
14
  return text
15
 
16
  def _forward(self, text):
17
- print(f"Do we arrive here? {text}")
18
- print(f"Let's check the model: {self.model.get_floret_model()}")
19
  predictions, probabilities = self.model(text)
20
  return predictions, probabilities
21
 
@@ -37,6 +37,6 @@ class MultitaskTokenClassificationPipeline(Pipeline):
37
 
38
  # Format as JSON-compatible dictionary
39
  model_output = {"label": label, "confidence": round(confidence * 100, 2)}
40
- print("Formatted Model Output:", model_output)
41
 
42
  return model_output
 
14
  return text
15
 
16
  def _forward(self, text):
17
+ # print(f"Do we arrive here? {text}")
18
+ # print(f"Let's check the model: {self.model.get_floret_model()}")
19
  predictions, probabilities = self.model(text)
20
  return predictions, probabilities
21
 
 
37
 
38
  # Format as JSON-compatible dictionary
39
  model_output = {"label": label, "confidence": round(confidence * 100, 2)}
40
+ # print("Formatted Model Output:", model_output)
41
 
42
  return model_output
modeling_stacked.py CHANGED
@@ -11,38 +11,6 @@ from .configuration_stacked import ImpressoConfig
11
  logger = logging.getLogger(__name__)
12
 
13
 
14
- def get_info(label_map):
15
- num_token_labels_dict = {task: len(labels) for task, labels in label_map.items()}
16
- return num_token_labels_dict
17
-
18
-
19
- # class MyCustomModel:
20
- # def __init__(self):
21
- # # Custom initialization
22
- # pass
23
- #
24
- # @classmethod
25
- # def from_pretrained(cls, *args, **kwargs):
26
- # print("Ignoring weights and using custom initialization.")
27
- # return cls()
28
-
29
-
30
- class SafeFloretWrapper:
31
- """
32
- A safe wrapper for floret model that keeps it off-device to avoid segmentation faults.
33
- This class is pure Python and never interacts with PyTorch tensors or devices.
34
- """
35
-
36
- def __init__(self, model_path):
37
- print(f"Loading floret model from {model_path}")
38
- self.model_floret = floret.load_model(model_path)
39
-
40
- def predict(self, texts, k=1):
41
- # Floret expects strings, not tensors
42
- predictions, probabilities = self.model_floret.predict(texts, k=k)
43
- return predictions, probabilities
44
-
45
-
46
  class ExtendedMultitaskModelForTokenClassification(PreTrainedModel):
47
  config_class = ImpressoConfig
48
  _keys_to_ignore_on_load_missing = [r"position_ids"]
@@ -51,16 +19,10 @@ class ExtendedMultitaskModelForTokenClassification(PreTrainedModel):
51
  super().__init__(config)
52
  self.config = config
53
 
54
- # Load floret model
55
  self.dummy_param = nn.Parameter(torch.zeros(1))
 
56
  self.model_floret = floret.load_model(self.config.config.filename)
57
- # print(self.config.config)
58
- print(type(self.config))
59
- print(self.config.config.filename)
60
- # self.model_floret = SafeFloretWrapper(model_floret)
61
- # input_ids = "this is a text"
62
-
63
- # predictions, probabilities = self.model_floret.predict([input_ids], k=1)
64
 
65
  #
66
  def forward(self, input_ids, attention_mask=None, **kwargs):
@@ -78,22 +40,7 @@ class ExtendedMultitaskModelForTokenClassification(PreTrainedModel):
78
 
79
  # Use the loaded floret model to get predictions
80
  predictions, probabilities = self.model_floret.predict(texts, k=1)
81
- # print(f"Predictions: {predictions}")
82
- # print(f"Probabilities: {probabilities}")
83
- # print(self.model_floret(input_ids))
84
- # if input_ids is not None:
85
- # tokenizer = kwargs.get("tokenizer")
86
- # texts = tokenizer.batch_decode(input_ids, skip_special_tokens=True)
87
- # else:
88
- # texts = kwargs.get("text", None)
89
- #
90
- # if texts:
91
- # # Floret expects strings, not tensors
92
- # predictions = [self.model_floret(text) for text in texts]
93
- # # Convert predictions to tensors for Hugging Face compatibility
94
- # return torch.tensor(predictions)
95
- # else:
96
- # If no text is found, return dummy output
97
  return (
98
  predictions,
99
  probabilities,
@@ -132,139 +79,3 @@ class ExtendedMultitaskModelForTokenClassification(PreTrainedModel):
132
  # Pass the manually created config to the class
133
  model = cls(config)
134
  return model
135
-
136
-
137
- # class ExtendedMultitaskModelForTokenClassification(PreTrainedModel):
138
- #
139
- # config_class = ImpressoConfig
140
- # _keys_to_ignore_on_load_missing = [r"position_ids"]
141
- #
142
- # def __init__(self, config):
143
- # super().__init__(config)
144
- # # self.num_token_labels_dict = get_info(config.label_map)
145
- # # self.config = config
146
- # # # print(f"I dont think it arrives here: {self.config}")
147
- # # self.bert = AutoModel.from_pretrained(
148
- # # config.pretrained_config["_name_or_path"], config=config.pretrained_config
149
- # # )
150
- # self.model_floret = floret.load_model(self.config.filename)
151
- # # print(f"Model loaded: {self.model_floret}")
152
- # # if "classifier_dropout" not in config.__dict__:
153
- # # classifier_dropout = 0.1
154
- # # else:
155
- # # classifier_dropout = (
156
- # # config.classifier_dropout
157
- # # if config.classifier_dropout is not None
158
- # # else config.hidden_dropout_prob
159
- # # )
160
- # # self.dropout = nn.Dropout(classifier_dropout)
161
- # #
162
- # # # Additional transformer layers
163
- # # self.transformer_encoder = nn.TransformerEncoder(
164
- # # nn.TransformerEncoderLayer(
165
- # # d_model=config.hidden_size, nhead=config.num_attention_heads
166
- # # ),
167
- # # num_layers=2,
168
- # # )
169
- #
170
- # # For token classification, create a classifier for each task
171
- # # self.token_classifiers = nn.ModuleDict(
172
- # # {
173
- # # task: nn.Linear(config.hidden_size, num_labels)
174
- # # for task, num_labels in self.num_token_labels_dict.items()
175
- # # }
176
- # # )
177
- # #
178
- # # # Initialize weights and apply final processing
179
- # # self.post_init()
180
- #
181
- # def get_floret_model(self):
182
- # return self.model_floret
183
- #
184
- # @classmethod
185
- # def from_pretrained(cls, *args, **kwargs):
186
- # print("Ignoring weights and using custom initialization.")
187
- #
188
- # # Manually create the config
189
- # config = ImpressoConfig()
190
- #
191
- # # Pass the manually created config to the class
192
- # model = cls(config)
193
- # return model
194
- #
195
- # # def forward(
196
- # # self,
197
- # # input_ids: Optional[torch.Tensor] = None,
198
- # # attention_mask: Optional[torch.Tensor] = None,
199
- # # token_type_ids: Optional[torch.Tensor] = None,
200
- # # position_ids: Optional[torch.Tensor] = None,
201
- # # head_mask: Optional[torch.Tensor] = None,
202
- # # inputs_embeds: Optional[torch.Tensor] = None,
203
- # # labels: Optional[torch.Tensor] = None,
204
- # # token_labels: Optional[dict] = None,
205
- # # output_attentions: Optional[bool] = None,
206
- # # output_hidden_states: Optional[bool] = None,
207
- # # return_dict: Optional[bool] = None,
208
- # # ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
209
- # # r"""
210
- # # token_labels (`dict` of `torch.LongTensor` of shape `(batch_size, seq_length)`, *optional*):
211
- # # Labels for computing the token classification loss. Keys should match the tasks.
212
- # # """
213
- # # return_dict = (
214
- # # return_dict if return_dict is not None else self.config.use_return_dict
215
- # # )
216
- # #
217
- # # bert_kwargs = {
218
- # # "input_ids": input_ids,
219
- # # "attention_mask": attention_mask,
220
- # # "token_type_ids": token_type_ids,
221
- # # "position_ids": position_ids,
222
- # # "head_mask": head_mask,
223
- # # "inputs_embeds": inputs_embeds,
224
- # # "output_attentions": output_attentions,
225
- # # "output_hidden_states": output_hidden_states,
226
- # # "return_dict": return_dict,
227
- # # }
228
- # #
229
- # # if any(
230
- # # keyword in self.config.name_or_path.lower()
231
- # # for keyword in ["llama", "deberta"]
232
- # # ):
233
- # # bert_kwargs.pop("token_type_ids")
234
- # # bert_kwargs.pop("head_mask")
235
- # #
236
- # # outputs = self.bert(**bert_kwargs)
237
- # #
238
- # # # For token classification
239
- # # token_output = outputs[0]
240
- # # token_output = self.dropout(token_output)
241
- # #
242
- # # # Pass through additional transformer layers
243
- # # token_output = self.transformer_encoder(token_output.transpose(0, 1)).transpose(
244
- # # 0, 1
245
- # # )
246
- # #
247
- # # # Collect the logits and compute the loss for each task
248
- # # task_logits = {}
249
- # # total_loss = 0
250
- # # for task, classifier in self.token_classifiers.items():
251
- # # logits = classifier(token_output)
252
- # # task_logits[task] = logits
253
- # # if token_labels and task in token_labels:
254
- # # loss_fct = CrossEntropyLoss()
255
- # # loss = loss_fct(
256
- # # logits.view(-1, self.num_token_labels_dict[task]),
257
- # # token_labels[task].view(-1),
258
- # # )
259
- # # total_loss += loss
260
- # #
261
- # # if not return_dict:
262
- # # output = (task_logits,) + outputs[2:]
263
- # # return ((total_loss,) + output) if total_loss != 0 else output
264
- # # print(f"Is there anobidy coming here?")
265
- # # return TokenClassifierOutput(
266
- # # loss=total_loss,
267
- # # logits=task_logits,
268
- # # hidden_states=outputs.hidden_states,
269
- # # attentions=outputs.attentions,
270
- # # )
 
11
  logger = logging.getLogger(__name__)
12
 
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  class ExtendedMultitaskModelForTokenClassification(PreTrainedModel):
15
  config_class = ImpressoConfig
16
  _keys_to_ignore_on_load_missing = [r"position_ids"]
 
19
  super().__init__(config)
20
  self.config = config
21
 
22
+ # Dummy for device checking
23
  self.dummy_param = nn.Parameter(torch.zeros(1))
24
+ # Load floret model
25
  self.model_floret = floret.load_model(self.config.config.filename)
 
 
 
 
 
 
 
26
 
27
  #
28
  def forward(self, input_ids, attention_mask=None, **kwargs):
 
40
 
41
  # Use the loaded floret model to get predictions
42
  predictions, probabilities = self.model_floret.predict(texts, k=1)
43
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  return (
45
  predictions,
46
  probabilities,
 
79
  # Pass the manually created config to the class
80
  model = cls(config)
81
  return model