Flansma committed
Commit 62249d1 (verified) · 1 Parent(s): 220c98d

Upload folder using huggingface_hub

__pycache__/__init__.cpython-311.pyc ADDED
Binary file (174 Bytes)

__pycache__/configuration_helmbert.cpython-311.pyc ADDED
Binary file (4.61 kB)

__pycache__/modeling_helmbert.cpython-311.pyc ADDED
Binary file (51 kB)

configuration_helmbert.py CHANGED
@@ -66,6 +66,8 @@ class HELMBertConfig(PretrainedConfig):
         # Classification/regression
         num_labels: int = 2,
         problem_type: str = None,
+        classifier_num_layers: int = 0,
+        classifier_dropout: float = 0.1,
         **kwargs,
     ):
         super().__init__(
@@ -102,3 +104,5 @@ class HELMBertConfig(PretrainedConfig):
         # Classification/regression
         self.num_labels = num_labels
         self.problem_type = problem_type
+        self.classifier_num_layers = classifier_num_layers
+        self.classifier_dropout = classifier_dropout
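
For orientation, a minimal sketch of how the two new fields might be set when building a config. The field names and defaults (classifier_num_layers=0, classifier_dropout=0.1) come from the diff above; the import path follows the docstring example in modeling_helmbert.py and is otherwise an assumption about how this repo is loaded.

# Sketch only: import path assumed from the modeling_helmbert.py docstring example.
from helmbert import HELMBertConfig

# Default (classifier_num_layers=0) keeps the original single Linear head.
linear_cfg = HELMBertConfig(num_labels=1)

# classifier_num_layers > 0 switches the head to the MLPHead added in this commit;
# classifier_dropout is used by either head variant.
mlp_cfg = HELMBertConfig(num_labels=1, classifier_num_layers=2, classifier_dropout=0.2)
print(mlp_cfg.classifier_num_layers, mlp_cfg.classifier_dropout)  # 2 0.2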
modeling_helmbert.py CHANGED
@@ -56,7 +56,9 @@ class XSoftmax(torch.autograd.Function):
     """Masked Softmax optimized for memory efficiency."""

     @staticmethod
-    def forward(ctx, input: torch.Tensor, mask: Optional[torch.Tensor], dim: int) -> torch.Tensor:
+    def forward(
+        ctx, input: torch.Tensor, mask: Optional[torch.Tensor], dim: int
+    ) -> torch.Tensor:
         ctx.dim = dim
         if mask is not None:
             rmask = ~(mask.bool())
@@ -77,7 +79,9 @@ class XSoftmax(torch.autograd.Function):
     def backward(ctx, grad_output: torch.Tensor) -> Tuple[torch.Tensor, None, None]:
         (output,) = ctx.saved_tensors
         if version.Version(torch.__version__) >= version.Version("1.11.0"):
-            input_grad = _softmax_backward_data(grad_output, output, ctx.dim, output.dtype)
+            input_grad = _softmax_backward_data(
+                grad_output, output, ctx.dim, output.dtype
+            )
         else:
             input_grad = _softmax_backward_data(grad_output, output, ctx.dim, output)
         return input_grad, None, None
@@ -104,11 +108,14 @@ def build_relative_position(
     max_exact = num_buckets // 4
     is_small = rel_pos < max_exact

-    rel_pos_if_large = max_exact + (
-        torch.log(rel_pos.float() / max_exact)
-        / math.log(max_position / max_exact)
-        * (num_buckets // 4 - 1)
-    ).long()
+    rel_pos_if_large = (
+        max_exact
+        + (
+            torch.log(rel_pos.float() / max_exact)
+            / math.log(max_position / max_exact)
+            * (num_buckets // 4 - 1)
+        ).long()
+    )
     rel_pos_if_large = torch.min(
         rel_pos_if_large, torch.full_like(rel_pos_if_large, num_buckets // 2 - 1)
     )
@@ -167,9 +174,13 @@ class DisentangledSelfAttention(nn.Module):
         # Position projections
         if not self.share_att_key:
             if "c2p" in self.pos_att_type or "p2p" in self.pos_att_type:
-                self.pos_key_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=True)
+                self.pos_key_proj = nn.Linear(
+                    config.hidden_size, self.all_head_size, bias=True
+                )
             if "p2c" in self.pos_att_type or "p2p" in self.pos_att_type:
-                self.pos_query_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=False)
+                self.pos_query_proj = nn.Linear(
+                    config.hidden_size, self.all_head_size, bias=False
+                )

         # Dropout
         self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
@@ -223,7 +234,9 @@ class DisentangledSelfAttention(nn.Module):
             attention_scores = attention_scores + rel_att

         # Normalize scores for numerical stability
-        attention_scores = attention_scores - attention_scores.max(dim=-1, keepdim=True)[0].detach()
+        attention_scores = (
+            attention_scores - attention_scores.max(dim=-1, keepdim=True)[0].detach()
+        )
         attention_scores = attention_scores.to(hidden_states.dtype)

         # Reshape for XSoftmax
@@ -236,11 +249,15 @@ class DisentangledSelfAttention(nn.Module):
         attention_probs = self.dropout(attention_probs)

         # Apply attention to values
-        attention_probs_flat = attention_probs.view(-1, attention_probs.size(-2), attention_probs.size(-1))
+        attention_probs_flat = attention_probs.view(
+            -1, attention_probs.size(-2), attention_probs.size(-1)
+        )
         context_layer = torch.bmm(attention_probs_flat, value_layer)

         # Reshape output
-        context_layer = context_layer.view(-1, self.num_heads, context_layer.size(-2), context_layer.size(-1))
+        context_layer = context_layer.view(
+            -1, self.num_heads, context_layer.size(-2), context_layer.size(-1)
+        )
         context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
         new_shape = context_layer.size()[:-2] + (self.all_head_size,)
         context_layer = context_layer.view(*new_shape)
@@ -285,33 +302,49 @@ class DisentangledSelfAttention(nn.Module):
         ].unsqueeze(0)
         rel_embeddings = self.pos_dropout(rel_embeddings)

-        score = torch.zeros_like(query_layer[:, :, :1]).expand(-1, -1, key_layer.size(-2))
+        score = torch.zeros_like(query_layer[:, :, :1]).expand(
+            -1, -1, key_layer.size(-2)
+        )

         # Prepare position indices
         c2p_pos = torch.clamp(relative_pos + att_span, 0, att_span * 2 - 1)
-        c2p_pos = c2p_pos.squeeze(0).expand(query_layer.size(0), query_layer.size(1), relative_pos.size(-1))
+        c2p_pos = c2p_pos.squeeze(0).expand(
+            query_layer.size(0), query_layer.size(1), relative_pos.size(-1)
+        )

         # Content-to-position (c2p)
         if "c2p" in self.pos_att_type:
             pos_key_layer = (
-                self.pos_key_proj(rel_embeddings) if not self.share_att_key else self.key_proj(rel_embeddings)
+                self.pos_key_proj(rel_embeddings)
+                if not self.share_att_key
+                else self.key_proj(rel_embeddings)
+            )
+            pos_key_layer = self.transpose_for_scores(pos_key_layer).repeat(
+                batch_size, 1, 1
             )
-            pos_key_layer = self.transpose_for_scores(pos_key_layer).repeat(batch_size, 1, 1)

             c2p_scale = 1.0 / math.sqrt(self.head_size * scale_factor)
-            c2p_att = torch.bmm(query_layer, pos_key_layer.transpose(-1, -2) * c2p_scale)
+            c2p_att = torch.bmm(
+                query_layer, pos_key_layer.transpose(-1, -2) * c2p_scale
+            )
             c2p_att = torch.gather(c2p_att, dim=-1, index=c2p_pos)
             score = score + c2p_att

         # Position-to-content (p2c)
         if "p2c" in self.pos_att_type:
             pos_query_layer = (
-                self.pos_query_proj(rel_embeddings) if not self.share_att_key else self.query_proj(rel_embeddings)
+                self.pos_query_proj(rel_embeddings)
+                if not self.share_att_key
+                else self.query_proj(rel_embeddings)
+            )
+            pos_query_layer = self.transpose_for_scores(pos_query_layer).repeat(
+                batch_size, 1, 1
             )
-            pos_query_layer = self.transpose_for_scores(pos_query_layer).repeat(batch_size, 1, 1)

             p2c_scale = 1.0 / math.sqrt(self.head_size * scale_factor)
-            p2c_att = torch.bmm(pos_query_layer * p2c_scale, key_layer.transpose(-1, -2))
+            p2c_att = torch.bmm(
+                pos_query_layer * p2c_scale, key_layer.transpose(-1, -2)
+            )
             p2c_att = torch.gather(p2c_att, dim=-2, index=c2p_pos)
             score = score + p2c_att

@@ -331,7 +364,9 @@ class HELMBertEmbeddings(nn.Module):
         self.word_embeddings = nn.Embedding(
             config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id
         )
-        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
+        self.position_embeddings = nn.Embedding(
+            config.max_position_embeddings, config.hidden_size
+        )
         self.layer_norm = nn.LayerNorm(config.hidden_size)
         self.dropout = nn.Dropout(config.hidden_dropout_prob)

@@ -399,7 +434,11 @@ class NgieLayer(nn.Module):
             Output with n-gram information incorporated
         """
         # Apply 1D convolution
-        out = self.conv(hidden_states.permute(0, 2, 1).contiguous()).permute(0, 2, 1).contiguous()
+        out = (
+            self.conv(hidden_states.permute(0, 2, 1).contiguous())
+            .permute(0, 2, 1)
+            .contiguous()
+        )

         # Create reverse mask for padding
         if version.Version(torch.__version__) >= version.Version("1.2.0a"):
@@ -414,7 +453,9 @@ class NgieLayer(nn.Module):
         out = self.activation(self.dropout(out))

         # Residual connection with LayerNorm
-        output_states = masked_layer_norm(self.layer_norm, residual_states + out, attention_mask)
+        output_states = masked_layer_norm(
+            self.layer_norm, residual_states + out, attention_mask
+        )

         return output_states

@@ -523,13 +564,17 @@ class HELMBertEncoder(nn.Module):
         self.ngie_layer = NgieLayer(config)

         # Transformer blocks
-        self.layers = nn.ModuleList([TransformerBlock(config) for _ in range(config.num_hidden_layers)])
+        self.layers = nn.ModuleList(
+            [TransformerBlock(config) for _ in range(config.num_hidden_layers)]
+        )

     def get_rel_embedding(self) -> Optional[torch.Tensor]:
         """Get relative position embeddings from first layer."""
         if len(self.layers) > 0:
             first_layer = self.layers[0]
-            if hasattr(first_layer, "self_attn") and hasattr(first_layer.self_attn, "rel_embeddings"):
+            if hasattr(first_layer, "self_attn") and hasattr(
+                first_layer.self_attn, "rel_embeddings"
+            ):
                 return first_layer.self_attn.rel_embeddings.weight
         return None

@@ -589,7 +634,9 @@ class HELMBertEncoder(nn.Module):
             # Apply nGiE after first layer
             if layer_idx == 0:
                 hidden_states_batch = hidden_states.transpose(0, 1)
-                hidden_states_batch = self.ngie_layer(ngie_input_states, hidden_states_batch, attention_mask)
+                hidden_states_batch = self.ngie_layer(
+                    ngie_input_states, hidden_states_batch, attention_mask
+                )
                 hidden_states = hidden_states_batch.transpose(0, 1)

             # Store layer[-2] for EMD
@@ -647,7 +694,9 @@ class HELMBertPooler(nn.Module):
             Pooled output [batch, hidden]
         """
         if attention_mask is not None:
-            mask_expanded = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
+            mask_expanded = (
+                attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
+            )
             sum_embeddings = torch.sum(hidden_states * mask_expanded, 1)
             eps = torch.finfo(hidden_states.dtype).eps
             sum_mask = torch.clamp(mask_expanded.sum(1), min=eps)
@@ -858,7 +907,9 @@ class HELMBertForMaskedLM(HELMBertPreTrainedModel):
            attention_mask = torch.ones_like(input_ids)

         # Embeddings
-        embeddings, position_embeddings = self.helmbert.embeddings(input_ids, attention_mask)
+        embeddings, position_embeddings = self.helmbert.embeddings(
+            input_ids, attention_mask
+        )

         # Encoder with optional EMD
         encoder_outputs = self.helmbert.encoder(
@@ -886,7 +937,9 @@ class HELMBertForMaskedLM(HELMBertPreTrainedModel):
         loss = None
         if labels is not None:
             loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
-            loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
+            loss = loss_fct(
+                prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)
+            )

         if not return_dict:
             output = (prediction_scores, hidden_states, attentions)
@@ -900,12 +953,57 @@ class HELMBertForMaskedLM(HELMBertPreTrainedModel):
         )

+class MLPHead(nn.Module):
+    """MLP head with skip connections for classification/regression.
+
+    Architecture: input -> [Linear -> GELU -> LayerNorm -> Dropout (+ skip)] x N -> Linear -> output
+    """
+
+    def __init__(
+        self,
+        input_dim: int,
+        output_dim: int,
+        hidden_dims: list,
+        dropout: float = 0.1,
+    ):
+        super().__init__()
+        self.layers = nn.ModuleList()
+        self.norms = nn.ModuleList()
+        self.dropouts = nn.ModuleList()
+
+        prev_dim = input_dim
+        for hidden_dim in hidden_dims:
+            self.layers.append(nn.Linear(prev_dim, hidden_dim))
+            self.norms.append(nn.LayerNorm(hidden_dim))
+            self.dropouts.append(nn.Dropout(dropout))
+            prev_dim = hidden_dim
+
+        self.output_layer = nn.Linear(prev_dim, output_dim)
+        self.activation = nn.GELU()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        for layer, norm, dropout in zip(self.layers, self.norms, self.dropouts):
+            identity = x
+            x = layer(x)
+            if x.shape == identity.shape:
+                x = x + identity  # Skip connection
+            x = self.activation(x)
+            x = norm(x)
+            x = dropout(x)
+        return self.output_layer(x)
+
+
 class HELMBertForSequenceClassification(HELMBertPreTrainedModel):
     """HELM-BERT for sequence classification/regression.

     Example:
         >>> from helmbert import HELMBertForSequenceClassification, HELMBertConfig
-        >>> config = HELMBertConfig(num_labels=1)  # Regression
+        >>> # Simple linear head (default)
+        >>> config = HELMBertConfig(num_labels=1)
+        >>> model = HELMBertForSequenceClassification(config)
+        >>>
+        >>> # MLP head with 2 layers (for permeability prediction)
+        >>> config = HELMBertConfig(num_labels=1, classifier_num_layers=2)
         >>> model = HELMBertForSequenceClassification(config)
     """

@@ -915,8 +1013,19 @@ class HELMBertForSequenceClassification(HELMBertPreTrainedModel):
         self.config = config

         self.helmbert = HELMBertModel(config)
-        self.dropout = nn.Dropout(config.hidden_dropout_prob)
-        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+        # Use MLP head if num_layers > 0, otherwise simple linear
+        if config.classifier_num_layers > 0:
+            hidden_dims = [config.hidden_size] * config.classifier_num_layers
+            self.classifier = MLPHead(
+                input_dim=config.hidden_size,
+                output_dim=config.num_labels,
+                hidden_dims=hidden_dims,
+                dropout=config.classifier_dropout,
+            )
+        else:
+            self.dropout = nn.Dropout(config.classifier_dropout)
+            self.classifier = nn.Linear(config.hidden_size, config.num_labels)

         self.post_init()

@@ -951,7 +1060,9 @@ class HELMBertForSequenceClassification(HELMBertPreTrainedModel):
         )

         pooled_output = outputs.pooler_output
-        pooled_output = self.dropout(pooled_output)
+        # MLP head has internal dropout, simple linear needs separate dropout
+        if hasattr(self, "dropout"):
+            pooled_output = self.dropout(pooled_output)
         logits = self.classifier(pooled_output)

         loss = None
@@ -959,7 +1070,9 @@ class HELMBertForSequenceClassification(HELMBertPreTrainedModel):
             if self.config.problem_type is None:
                 if self.num_labels == 1:
                     self.config.problem_type = "regression"
-                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                elif self.num_labels > 1 and (
+                    labels.dtype == torch.long or labels.dtype == torch.int
+                ):
                     self.config.problem_type = "single_label_classification"
                 else:
                     self.config.problem_type = "multi_label_classification"
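
To make the new classification head concrete, a hedged end-to-end sketch of a regression forward pass with the MLP head enabled. The config fields and class names are taken from this diff; the import path, the dummy token ids, and the assumption that the model returns a standard HF-style output with a logits attribute are illustrative, not confirmed by the commit.

# Sketch under assumptions: import path, dummy inputs, and output attribute names
# are illustrative; the config fields and classes are from this commit.
import torch
from helmbert import HELMBertConfig, HELMBertForSequenceClassification

config = HELMBertConfig(
    num_labels=1,              # single regression target, as in the docstring example
    classifier_num_layers=2,   # use the MLPHead added in this commit
    classifier_dropout=0.1,
)
model = HELMBertForSequenceClassification(config)
model.eval()

# Dummy batch of token ids; a real batch would come from HELMBertTokenizer.
input_ids = torch.randint(5, 25, (2, 16))
attention_mask = torch.ones_like(input_ids)

with torch.no_grad():
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
print(outputs.logits.shape)  # expected (2, 1), assuming an HF-style output object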
tokenization_helmbert.py CHANGED
@@ -2,7 +2,7 @@

 import json
 import os
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Dict, List, Optional, Tuple

 from transformers import PreTrainedTokenizer

@@ -10,43 +10,89 @@ from transformers import PreTrainedTokenizer
 # Default vocabulary for HELM notation
 HELM_VOCAB = {
     # Special tokens (0-4)
-    " ": 0,  # PAD
-    "@": 1,  # BOS/CLS
+    " ": 0,  # PAD
+    "@": 1,  # BOS/CLS
     "\n": 2,  # EOS/SEP
-    "§": 3,  # UNK
-    "¶": 4,  # MASK
-
+    "§": 3,  # UNK
+    "¶": 4,  # MASK
     # Natural amino acids (5-25)
-    "A": 5, "R": 6, "N": 7, "D": 8, "C": 9,
-    "E": 10, "Q": 11, "G": 12, "H": 13, "I": 14,
-    "L": 15, "K": 16, "M": 17, "F": 18, "P": 19,
-    "S": 20, "T": 21, "W": 22, "Y": 23, "V": 24,
+    "A": 5,
+    "R": 6,
+    "N": 7,
+    "D": 8,
+    "C": 9,
+    "E": 10,
+    "Q": 11,
+    "G": 12,
+    "H": 13,
+    "I": 14,
+    "L": 15,
+    "K": 16,
+    "M": 17,
+    "F": 18,
+    "P": 19,
+    "S": 20,
+    "T": 21,
+    "W": 22,
+    "Y": 23,
+    "V": 24,
     "X": 25,  # Unknown amino acid
-
     # Structure symbols (26-37)
-    "[": 26, "]": 27, "{": 28, "}": 29, "(": 30, ")": 31,
-    "$": 32, ",": 33, ":": 34, "|": 35, "-": 36, ".": 37,
-
+    "[": 26,
+    "]": 27,
+    "{": 28,
+    "}": 29,
+    "(": 30,
+    ")": 31,
+    "$": 32,
+    ",": 33,
+    ":": 34,
+    "|": 35,
+    "-": 36,
+    ".": 37,
     # Numbers (38-47)
-    "0": 38, "1": 39, "2": 40, "3": 41, "4": 42,
-    "5": 43, "6": 44, "7": 45, "8": 46, "9": 47,
-
+    "0": 38,
+    "1": 39,
+    "2": 40,
+    "3": 41,
+    "4": 42,
+    "5": 43,
+    "6": 44,
+    "7": 45,
+    "8": 46,
+    "9": 47,
     # Uppercase non-amino acids (48-50)
-    "B": 48, "O": 49, ">": 50,
-
+    "B": 48,
+    "O": 49,
+    ">": 50,
     # Lowercase letters (51-72)
-    "a": 51, "b": 52, "c": 53, "d": 54, "e": 55,
-    "f": 56, "g": 57, "h": 58, "i": 59, "l": 60,
-    "m": 61, "n": 62, "o": 63, "p": 64, "r": 65,
-    "s": 66, "t": 67, "u": 68, "v": 69, "x": 70,
-    "y": 71, "z": 72,
-
+    "a": 51,
+    "b": 52,
+    "c": 53,
+    "d": 54,
+    "e": 55,
+    "f": 56,
+    "g": 57,
+    "h": 58,
+    "i": 59,
+    "l": 60,
+    "m": 61,
+    "n": 62,
+    "o": 63,
+    "p": 64,
+    "r": 65,
+    "s": 66,
+    "t": 67,
+    "u": 68,
+    "v": 69,
+    "x": 70,
+    "y": 71,
+    "z": 72,
     # Encoded polymer markers (73-76)
-    "/": 73,  # PEPTIDE
-    "*": 74,  # me
+    "/": 73,  # PEPTIDE
+    "*": 74,  # me
     "\t": 75,  # am
-    "&": 76,  # ac
-
+    "&": 76,  # ac
     # Miscellaneous (77)
     "_": 77,
 }
@@ -227,7 +273,12 @@ class HELMBertTokenizer(PreTrainedTokenizer):
             List of 0s and 1s (1 = special token)
         """
         if already_has_special_tokens:
-            return [1 if x in [self.cls_token_id, self.sep_token_id, self.pad_token_id] else 0 for x in token_ids_0]
+            return [
+                1
+                if x in [self.cls_token_id, self.sep_token_id, self.pad_token_id]
+                else 0
+                for x in token_ids_0
+            ]

         if token_ids_1 is None:
             return [1] + [0] * len(token_ids_0) + [1]
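
For reference, a self-contained illustration of the character-level lookup that the vocabulary above defines. helm_to_ids is a hypothetical helper for demonstration only; the actual HELMBertTokenizer wraps this mapping in the transformers PreTrainedTokenizer API and handles the "@" (CLS), "\n" (SEP), and padding tokens itself.

# Demonstration only: a subset of HELM_VOCAB as listed above, plus a hypothetical
# helper; this snippet is not part of tokenization_helmbert.py.
HELM_VOCAB_SUBSET = {"A": 5, "R": 6, "N": 7, "D": 8, "§": 3}

def helm_to_ids(text: str) -> list:
    # Fall back to the UNK id (3, the "§" token) for characters outside the subset.
    return [HELM_VOCAB_SUBSET.get(ch, 3) for ch in text]

print(helm_to_ids("ARND"))  # [5, 6, 7, 8]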