Mart78 committed
Commit 33477af · verified · 1 Parent(s): 18dfc75

Upload 8 files

config.json ADDED
@@ -0,0 +1,33 @@
+ {
+   "architectures": [
+     "MolformerForMaskedLM"
+   ],
+   "auto_map": {
+     "AutoConfig": "configuration_molformer.MolformerConfig",
+     "AutoModel": "modeling_molformer.MolformerModel",
+     "AutoModelForMaskedLM": "modeling_molformer.MolformerForMaskedLM",
+     "AutoModelForSequenceClassification": "modeling_molformer.MolformerForSequenceClassification"
+   },
+   "classifier_dropout_prob": null,
+   "classifier_skip_connection": true,
+   "deterministic_eval": false,
+   "dtype": "float32",
+   "embedding_dropout_prob": 0.2,
+   "feature_map_kernel": "relu",
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 768,
+   "layer_norm_eps": 1e-12,
+   "linear_attention_eps": 1e-06,
+   "max_position_embeddings": 202,
+   "model_type": "molformer",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "num_random_features": 32,
+   "pad_token_id": 2,
+   "tie_word_embeddings": false,
+   "transformers_version": "4.56.1",
+   "vocab_size": 2362
+ }
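
Because `auto_map` routes the `Auto*` classes to the `configuration_molformer` and `modeling_molformer` modules shipped in this repository, loading the checkpoint goes through Transformers' custom-code path and requires `trust_remote_code=True`. A minimal loading sketch; the repo id below is a placeholder, not the actual repository name:

```python
from transformers import AutoConfig, AutoModelForMaskedLM

REPO_ID = "<namespace>/<repo>"  # placeholder: wherever these files are hosted

# trust_remote_code=True is required because the classes live in the repo's
# own .py files rather than in the transformers library itself.
config = AutoConfig.from_pretrained(REPO_ID, trust_remote_code=True)
model = AutoModelForMaskedLM.from_pretrained(REPO_ID, trust_remote_code=True)
```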
configuration_molformer.py ADDED
@@ -0,0 +1,158 @@
+ # coding=utf-8
+ # Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Molformer model configuration"""
+
+ from collections import OrderedDict
+ from typing import Mapping
+
+ from transformers.configuration_utils import PretrainedConfig
+ from transformers.onnx import OnnxConfig
+ from transformers.utils import logging
+
+
+ logger = logging.get_logger(__name__)
+
+ MOLFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+     "ibm/MoLFormer-XL-both-10pct": "https://huggingface.co/ibm/MoLFormer-XL-both-10pct/resolve/main/config.json",
+ }
+
+
+ class MolformerConfig(PretrainedConfig):
+     r"""
+     This is the configuration class to store the configuration of a [`MolformerModel`]. It is used to instantiate a
+     Molformer model according to the specified arguments, defining the model architecture. Instantiating a
+     configuration with the defaults will yield a similar configuration to that of the Molformer
+     [ibm/MoLFormer-XL-both-10pct](https://huggingface.co/ibm/MoLFormer-XL-both-10pct) architecture.
+
+     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+     documentation from [`PretrainedConfig`] for more information.
+
+     Args:
+         vocab_size (`int`, *optional*, defaults to 2362):
+             Vocabulary size of the Molformer model. Defines the number of different tokens that can be represented by
+             the `input_ids` passed when calling [`MolformerModel`] or [`TFMolformerModel`].
+         hidden_size (`int`, *optional*, defaults to 768):
+             Dimension of the encoder layers and the pooler layer.
+         num_hidden_layers (`int`, *optional*, defaults to 12):
+             Number of hidden layers in the Transformer encoder.
+         num_attention_heads (`int`, *optional*, defaults to 12):
+             Number of attention heads for each attention layer in the Transformer encoder.
+         intermediate_size (`int`, *optional*, defaults to 768):
+             Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+         hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+             The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+             `"relu"`, `"selu"` and `"gelu_new"` are supported.
+         hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
+             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+         embedding_dropout_prob (`float`, *optional*, defaults to 0.2):
+             The dropout probability for the word embeddings.
+         max_position_embeddings (`int`, *optional*, defaults to 202):
+             The maximum sequence length that this model might ever be used with. Typically set this to something large
+             just in case (e.g., 512 or 1024 or 1536).
+         initializer_range (`float`, *optional*, defaults to 0.02):
+             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+         layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+             The epsilon used by the layer normalization layers.
+         linear_attention_eps (`float`, *optional*, defaults to 1e-06):
+             The epsilon used by the normalization step of the linear attention layers.
+         num_random_features (`int`, *optional*, defaults to 32):
+             Random feature map dimension used in linear attention.
+         feature_map_kernel (`str` or `function`, *optional*, defaults to `"relu"`):
+             The non-linear activation function (function or string) in the generalized random features. If string,
+             `"gelu"`, `"relu"`, `"selu"`, and `"gelu_new"` are supported.
+         deterministic_eval (`bool`, *optional*, defaults to `False`):
+             Whether the random features should only be redrawn when training or not. If `True` and `model.training` is
+             `False`, linear attention random feature weights will be constant, i.e., deterministic.
+         classifier_dropout_prob (`float`, *optional*):
+             The dropout probability for the classification head. If `None`, use `hidden_dropout_prob`.
+         classifier_skip_connection (`bool`, *optional*, defaults to `True`):
+             Whether a skip connection should be made between the layers of the classification head or not.
+         pad_token_id (`int`, *optional*, defaults to 2):
+             The id of the _padding_ token.
+
+     Example:
+
+     ```python
+     >>> from transformers import MolformerModel, MolformerConfig
+
+     >>> # Initializing a Molformer ibm/MoLFormer-XL-both-10pct style configuration
+     >>> configuration = MolformerConfig()
+
+     >>> # Initializing a model from the ibm/MoLFormer-XL-both-10pct style configuration
+     >>> model = MolformerModel(configuration)
+
+     >>> # Accessing the model configuration
+     >>> configuration = model.config
+     ```"""
+     model_type = "molformer"
+
+     def __init__(
+         self,
+         vocab_size=2362,
+         hidden_size=768,
+         num_hidden_layers=12,
+         num_attention_heads=12,
+         intermediate_size=768,
+         hidden_act="gelu",
+         hidden_dropout_prob=0.1,
+         embedding_dropout_prob=0.2,
+         max_position_embeddings=202,
+         initializer_range=0.02,
+         layer_norm_eps=1e-12,
+         linear_attention_eps=1e-6,
+         num_random_features=32,
+         feature_map_kernel="relu",
+         deterministic_eval=False,
+         classifier_dropout_prob=None,
+         classifier_skip_connection=True,
+         pad_token_id=2,
+         **kwargs,
+     ):
+         super().__init__(pad_token_id=pad_token_id, **kwargs)
+
+         self.vocab_size = vocab_size
+         self.hidden_size = hidden_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+         self.hidden_act = hidden_act
+         self.intermediate_size = intermediate_size
+         self.hidden_dropout_prob = hidden_dropout_prob
+         self.embedding_dropout_prob = embedding_dropout_prob
+         self.max_position_embeddings = max_position_embeddings
+         self.initializer_range = initializer_range
+         self.layer_norm_eps = layer_norm_eps
+         self.linear_attention_eps = linear_attention_eps
+         self.num_random_features = num_random_features
+         self.feature_map_kernel = feature_map_kernel
+         self.deterministic_eval = deterministic_eval
+         self.classifier_dropout_prob = classifier_dropout_prob
+         self.classifier_skip_connection = classifier_skip_connection
+
+
+ # Copied from transformers.models.roberta.configuration_roberta.RobertaOnnxConfig with Roberta->Molformer
+ class MolformerOnnxConfig(OnnxConfig):
+     @property
+     def inputs(self) -> Mapping[str, Mapping[int, str]]:
+         if self.task == "multiple-choice":
+             dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
+         else:
+             dynamic_axis = {0: "batch", 1: "sequence"}
+         return OrderedDict(
+             [
+                 ("input_ids", dynamic_axis),
+                 ("attention_mask", dynamic_axis),
+             ]
+         )
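
As a quick illustration of the parameters documented above, a config can be built with non-default values and serialized back to the same JSON shape as config.json. A sketch, assuming the module is importable from the working directory; the override values are illustrative, not recommendations:

```python
from configuration_molformer import MolformerConfig

# Override a few of the documented defaults (illustrative values only).
config = MolformerConfig(
    hidden_dropout_prob=0.2,      # default 0.1
    deterministic_eval=True,      # keep random features fixed at eval time
    classifier_dropout_prob=0.1,  # default None -> falls back to hidden_dropout_prob
)
print(config.to_json_string())
```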
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e244cc3502c32472c268f671e50fdf0d8779f1d1b7d05b47267a8e7aa1f789c4
+ size 187248784
modeling_molformer.py ADDED
@@ -0,0 +1,921 @@
+ # coding=utf-8
+ # Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """PyTorch Molformer model."""
+
+
+ import math
+ from typing import Optional, Tuple, Union
+
+ import torch
+ import torch.utils.checkpoint
+ from torch import nn
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+ from transformers.activations import ACT2FN
+ from transformers.modeling_outputs import (
+     BaseModelOutput,
+     BaseModelOutputWithPooling,
+     MaskedLMOutput,
+     SequenceClassifierOutput,
+ )
+ from transformers.modeling_utils import PreTrainedModel
+ from transformers.pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
+ from transformers.utils import (
+     add_code_sample_docstrings,
+     add_start_docstrings,
+     add_start_docstrings_to_model_forward,
+     logging,
+ )
+ from .configuration_molformer import MolformerConfig
+
+
+ logger = logging.get_logger(__name__)
+
+ _CHECKPOINT_FOR_DOC = "ibm/MoLFormer-XL-both-10pct"
+ _CONFIG_FOR_DOC = "MolformerConfig"
+
+ MOLFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
+     "ibm/MoLFormer-XL-both-10pct",
+     # See all MoLFormer models at https://huggingface.co/models?filter=molformer
+ ]
+
+
+ # Copied from transformers.models.esm.modeling_esm.rotate_half
+ def rotate_half(x):
+     x1, x2 = x.chunk(2, dim=-1)
+     return torch.cat((-x2, x1), dim=-1)
+
+
+ # Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
+     cos = cos[position_ids].unsqueeze(1)  # [seq_len, dim] -> [batch_size, 1, seq_len, head_dim]
+     sin = sin[position_ids].unsqueeze(1)
+     q_embed = (q * cos) + (rotate_half(q) * sin)
+     k_embed = (k * cos) + (rotate_half(k) * sin)
+     return q_embed, k_embed
+
+
+ # Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Molformer
+ class MolformerRotaryEmbedding(nn.Module):
+     def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
+         super().__init__()
+
+         self.dim = dim
+         self.max_position_embeddings = max_position_embeddings
+         self.base = base
+         inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
+         self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+         # Build here to make `torch.jit.trace` work.
+         self._set_cos_sin_cache(
+             seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
+         )
+
+     def _set_cos_sin_cache(self, seq_len, device, dtype):
+         self.max_seq_len_cached = seq_len
+         t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+
+         freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+         # Different from paper, but it uses a different permutation in order to obtain the same calculation
+         emb = torch.cat((freqs, freqs), dim=-1)
+         self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
+         self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
+
+     def forward(self, x, seq_len=None):
+         # x: [bs, num_attention_heads, seq_len, head_size]
+         if seq_len > self.max_seq_len_cached:
+             self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
+
+         return (
+             self.cos_cached[:seq_len].to(dtype=x.dtype),
+             self.sin_cached[:seq_len].to(dtype=x.dtype),
+         )
+
+
+ class MolformerEmbeddings(nn.Module):
+     """Construct the embeddings from word embeddings."""
+
+     def __init__(self, config):
+         super().__init__()
+         self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
+         self.dropout = nn.Dropout(config.embedding_dropout_prob)
+
+     def forward(
+         self, input_ids: Optional[torch.LongTensor] = None, inputs_embeds: Optional[torch.FloatTensor] = None
+     ) -> torch.Tensor:
+         if inputs_embeds is None:
+             inputs_embeds = self.word_embeddings(input_ids)
+
+         embeddings = inputs_embeds
+         embeddings = self.dropout(embeddings)
+         return embeddings
+
+
+ class MolformerFeatureMap(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.query_size = config.hidden_size // config.num_attention_heads
+         self.num_components = config.num_random_features
+         self.orthogonal_random_weights()
+         if isinstance(config.feature_map_kernel, str):
+             self.kernel = ACT2FN[config.feature_map_kernel]
+         else:
+             self.kernel = config.feature_map_kernel
+         self.deterministic = config.deterministic_eval
+
+     def orthogonal_random_weights(self, device=None):
+         # make sure query size evenly divides feature size (round up)
+         num_batches = math.ceil(self.num_components / self.query_size)
+
+         def orthogonal_batch(size):
+             block = torch.randn(size, size, device=device)
+             norms = torch.linalg.norm(block, dim=1).unsqueeze(0)
+             Q, _ = torch.linalg.qr(block)
+             return Q * norms
+
+         random_weights = torch.cat([orthogonal_batch(self.query_size) for _ in range(num_batches)], dim=1)
+         random_weights = random_weights[:, : self.num_components]
+         self.register_buffer("weight", random_weights)
+
+     def forward(self, query, key):
+         if not self.deterministic or self.training:
+             self.orthogonal_random_weights(query.device)
+         # generalized random fourier features
+         query = torch.matmul(query, self.weight)
+         key = torch.matmul(key, self.weight)
+         return self.kernel(query), self.kernel(key)
+
+
+ class MolformerSelfAttention(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
+             raise ValueError(
+                 f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
+                 f"heads ({config.num_attention_heads})"
+             )
+
+         self.num_attention_heads = config.num_attention_heads
+         self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+         self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+         self.query = nn.Linear(config.hidden_size, self.all_head_size)
+         self.key = nn.Linear(config.hidden_size, self.all_head_size)
+         self.value = nn.Linear(config.hidden_size, self.all_head_size)
+
+         self.eps = config.linear_attention_eps
+
+         self.rotary_embeddings = MolformerRotaryEmbedding(
+             dim=self.attention_head_size, max_position_embeddings=config.max_position_embeddings
+         )
+         self.feature_map = MolformerFeatureMap(config)
+
+     # Copied from transformers.models.bert.modeling_bert.BertSelfAttention.transpose_for_scores
+     def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
+         new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
+         x = x.view(new_x_shape)
+         return x.permute(0, 2, 1, 3)
+
+     def forward(
+         self,
+         hidden_states: torch.Tensor,
+         attention_mask: Optional[torch.FloatTensor] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+         head_mask: Optional[torch.FloatTensor] = None,
+         output_attentions: Optional[bool] = False,
+     ) -> Tuple[torch.Tensor]:
+         query_layer = self.transpose_for_scores(self.query(hidden_states))
+         key_layer = self.transpose_for_scores(self.key(hidden_states))
+         value_layer = self.transpose_for_scores(self.value(hidden_states))
+
+         kv_seq_len = key_layer.shape[-2]
+         cos, sin = self.rotary_embeddings(value_layer, seq_len=kv_seq_len)
+         query_layer, key_layer = apply_rotary_pos_emb(query_layer, key_layer, cos, sin, position_ids)
+         # Apply the feature map to the queries and keys
+         query_layer, key_layer = self.feature_map(query_layer, key_layer)
+
+         if attention_mask is not None:
+             # since we don't use softmax, we need to reconvert this mask to 1/0
+             attention_mask = (attention_mask == 0).to(attention_mask.dtype)
+             # separate original mask from causal mask
+             per_query_attn = attention_mask[:, 0, -1]
+             per_query_extended = per_query_attn[:, None, None, :]
+             if not torch.equal(attention_mask, per_query_extended):
+                 raise ValueError(
+                     "MolformerSelfAttention does not support arbitrary 3D attention. attention_mask must be 2D (i.e., [batch size, sequence length])"
+                 )
+
+             key_layer = key_layer * per_query_attn[:, None, -kv_seq_len:, None]
+
+         # linear attention
+         key_value = torch.matmul(key_layer.transpose(-1, -2), value_layer)
+         norm = torch.matmul(query_layer, key_layer.sum(dim=-2).unsqueeze(-1)).clamp(min=self.eps)
+         context_layer = torch.matmul(query_layer, key_value) / norm
+
+         if head_mask is not None:
+             context_layer = context_layer * head_mask
+
+         if output_attentions:
+             logger.warning(
+                 "Outputting attentions in linear attention negates the efficiency gains! Only use for visualization/debugging."
+             )
+             attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+             if attention_mask is not None:
+                 attention_scores = attention_scores * attention_mask
+             attention_probs = nn.functional.normalize(attention_scores, p=1, dim=-1, eps=self.eps)
+             if head_mask is not None:
+                 attention_probs = attention_probs * head_mask
+             # recompute context_layer for grad
+             context_layer = torch.matmul(attention_probs, value_layer)
+
+         context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+         new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+         context_layer = context_layer.view(*new_context_layer_shape)
+
+         outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
+
+         return outputs
+
+
+ # Copied from transformers.models.bert.modeling_bert.BertSelfOutput
+ class MolformerSelfOutput(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+         self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+         self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+     def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+         hidden_states = self.dense(hidden_states)
+         hidden_states = self.dropout(hidden_states)
+         hidden_states = self.LayerNorm(hidden_states + input_tensor)
+         return hidden_states
+
+
+ class MolformerAttention(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.self = MolformerSelfAttention(config)
+         self.output = MolformerSelfOutput(config)
+         self.pruned_heads = set()
+
+     # Copied from transformers.models.bert.modeling_bert.BertAttention.prune_heads
+     def prune_heads(self, heads):
+         if len(heads) == 0:
+             return
+         heads, index = find_pruneable_heads_and_indices(
+             heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
+         )
+
+         # Prune linear layers
+         self.self.query = prune_linear_layer(self.self.query, index)
+         self.self.key = prune_linear_layer(self.self.key, index)
+         self.self.value = prune_linear_layer(self.self.value, index)
+         self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
+
+         # Update hyper params and store pruned heads
+         self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
+         self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
+         self.pruned_heads = self.pruned_heads.union(heads)
+
+     def forward(
+         self,
+         hidden_states: torch.Tensor,
+         attention_mask: Optional[torch.FloatTensor] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+         head_mask: Optional[torch.FloatTensor] = None,
+         output_attentions: Optional[bool] = False,
+     ) -> Tuple[torch.Tensor]:
+         self_outputs = self.self(
+             hidden_states,
+             attention_mask,
+             position_ids,
+             head_mask,
+             output_attentions,
+         )
+         attention_output = self.output(self_outputs[0], hidden_states)
+         outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
+         return outputs
+
+
+ # Copied from transformers.models.bert.modeling_bert.BertIntermediate
+ class MolformerIntermediate(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
+         if isinstance(config.hidden_act, str):
+             self.intermediate_act_fn = ACT2FN[config.hidden_act]
+         else:
+             self.intermediate_act_fn = config.hidden_act
+
+     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+         hidden_states = self.dense(hidden_states)
+         hidden_states = self.intermediate_act_fn(hidden_states)
+         return hidden_states
+
+
+ # Copied from transformers.models.bert.modeling_bert.BertOutput
+ class MolformerOutput(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
+         self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+         self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+     def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+         hidden_states = self.dense(hidden_states)
+         hidden_states = self.dropout(hidden_states)
+         hidden_states = self.LayerNorm(hidden_states + input_tensor)
+         return hidden_states
+
+
+ class MolformerLayer(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.chunk_size_feed_forward = config.chunk_size_feed_forward
+         self.seq_len_dim = 1
+         self.attention = MolformerAttention(config)
+         self.intermediate = MolformerIntermediate(config)
+         self.output = MolformerOutput(config)
+
+     def forward(
+         self,
+         hidden_states: torch.Tensor,
+         attention_mask: Optional[torch.FloatTensor] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+         head_mask: Optional[torch.FloatTensor] = None,
+         output_attentions: Optional[bool] = False,
+     ) -> Tuple[torch.Tensor]:
+         self_attention_outputs = self.attention(
+             hidden_states,
+             attention_mask,
+             position_ids,
+             head_mask,
+             output_attentions=output_attentions,
+         )
+         attention_output = self_attention_outputs[0]
+         outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights
+
+         layer_output = apply_chunking_to_forward(
+             self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
+         )
+         outputs = (layer_output,) + outputs
+
+         return outputs
+
+     def feed_forward_chunk(self, attention_output):
+         intermediate_output = self.intermediate(attention_output)
+         layer_output = self.output(intermediate_output, attention_output)
+         return layer_output
+
+
+ class MolformerEncoder(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.config = config
+         self.layer = nn.ModuleList([MolformerLayer(config) for _ in range(config.num_hidden_layers)])
+         self.gradient_checkpointing = False
+
+     def forward(
+         self,
+         hidden_states: torch.Tensor,
+         attention_mask: Optional[torch.FloatTensor] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+         head_mask: Optional[torch.FloatTensor] = None,
+         output_attentions: Optional[bool] = False,
+         output_hidden_states: Optional[bool] = False,
+         return_dict: Optional[bool] = True,
+     ) -> Union[Tuple[torch.Tensor], BaseModelOutput]:
+         all_hidden_states = () if output_hidden_states else None
+         all_self_attentions = () if output_attentions else None
+
+         for i, layer_module in enumerate(self.layer):
+             if output_hidden_states:
+                 all_hidden_states = all_hidden_states + (hidden_states,)
+
+             layer_head_mask = head_mask[i] if head_mask is not None else None
+
+             if self.gradient_checkpointing and self.training:
+
+                 def create_custom_forward(module):
+                     def custom_forward(*inputs):
+                         return module(*inputs, output_attentions)
+
+                     return custom_forward
+
+                 layer_outputs = torch.utils.checkpoint.checkpoint(
+                     create_custom_forward(layer_module),
+                     hidden_states,
+                     attention_mask,
+                     position_ids,
+                     layer_head_mask,
+                 )
+             else:
+                 layer_outputs = layer_module(
+                     hidden_states,
+                     attention_mask,
+                     position_ids,
+                     layer_head_mask,
+                     output_attentions,
+                 )
+
+             hidden_states = layer_outputs[0]
+             if output_attentions:
+                 all_self_attentions = all_self_attentions + (layer_outputs[1],)
+
+         if output_hidden_states:
+             all_hidden_states = all_hidden_states + (hidden_states,)
+
+         if not return_dict:
+             return tuple(
+                 v
+                 for v in [
+                     hidden_states,
+                     all_hidden_states,
+                     all_self_attentions,
+                 ]
+                 if v is not None
+             )
+         return BaseModelOutput(
+             last_hidden_state=hidden_states,
+             hidden_states=all_hidden_states,
+             attentions=all_self_attentions,
+         )
+
+
+ # Copied from transformers.models.bert.modeling_bert.BertPredictionHeadTransform
+ class MolformerPredictionHeadTransform(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+         if isinstance(config.hidden_act, str):
+             self.transform_act_fn = ACT2FN[config.hidden_act]
+         else:
+             self.transform_act_fn = config.hidden_act
+         self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+         hidden_states = self.dense(hidden_states)
+         hidden_states = self.transform_act_fn(hidden_states)
+         hidden_states = self.LayerNorm(hidden_states)
+         return hidden_states
+
+
+ class MolformerLMPredictionHead(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.transform = MolformerPredictionHeadTransform(config)
+         self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+     def forward(self, hidden_states):
+         hidden_states = self.transform(hidden_states)
+         hidden_states = self.decoder(hidden_states)
+         return hidden_states
+
+
+ # Copied from transformers.models.roberta.modeling_roberta.RobertaPreTrainedModel with Roberta->Molformer,roberta->molformer
+ class MolformerPreTrainedModel(PreTrainedModel):
+     """
+     An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+     models.
+     """
+
+     config_class = MolformerConfig
+     base_model_prefix = "molformer"
+     supports_gradient_checkpointing = True
+     _no_split_modules = ["MolformerEmbeddings", "MolformerSelfAttention"]
+
+     # Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights
+     def _init_weights(self, module):
+         """Initialize the weights"""
+         if isinstance(module, nn.Linear):
+             # Slightly different from the TF version which uses truncated_normal for initialization
+             # cf https://github.com/pytorch/pytorch/pull/5617
+             module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+             if module.bias is not None:
+                 module.bias.data.zero_()
+         elif isinstance(module, nn.Embedding):
+             module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+             if module.padding_idx is not None:
+                 module.weight.data[module.padding_idx].zero_()
+         elif isinstance(module, nn.LayerNorm):
+             module.bias.data.zero_()
+             module.weight.data.fill_(1.0)
+
+     def _set_gradient_checkpointing(self, module, value=False):
+         if isinstance(module, MolformerEncoder):
+             module.gradient_checkpointing = value
+
+
+ def masked_avg_pool1d(hidden_states, attention_mask, eps=1e-9):
+     attention_mask = attention_mask.unsqueeze(-1).expand_as(hidden_states).float()
+     sum_embeddings = torch.sum(hidden_states * attention_mask, dim=1)
+     sum_mask = torch.clamp(attention_mask.sum(dim=1), min=eps)
+     embedding = sum_embeddings / sum_mask
+     return embedding
+
+
+ MOLFORMER_START_DOCSTRING = r"""
+     This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
+     it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage and
+     behavior.
+
+     Parameters:
+         config ([`MolformerConfig`]): Model configuration class with all the parameters of the model.
+             Initializing with a config file does not load the weights associated with the model, only the
+             configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+ """
+
+ MOLFORMER_INPUTS_DOCSTRING = r"""
+     Args:
+         input_ids (`torch.LongTensor` of shape `({0})`):
+             Indices of input sequence tokens in the vocabulary.
+
+             Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+             [`PreTrainedTokenizer.__call__`] for details.
+
+             [What are input IDs?](../glossary#input-ids)
+         attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
+             Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+             - 1 for tokens that are **not masked**,
+             - 0 for tokens that are **masked**.
+
+             [What are attention masks?](../glossary#attention-mask)
+         position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+             Indices of positions of each input sequence token in the position embeddings. Selected in the range `[0,
+             config.n_positions - 1]`.
+
+             [What are position IDs?](../glossary#position-ids)
+         head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+             Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
+
+             - 1 indicates the head is **not masked**,
+             - 0 indicates the head is **masked**.
+
+         inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
+             Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+             is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
+             model's internal embedding lookup matrix.
+         output_attentions (`bool`, *optional*):
+             Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+             tensors for more detail.
+         output_hidden_states (`bool`, *optional*):
+             Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+             more detail.
+         return_dict (`bool`, *optional*):
+             Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+ """
+
+
+ @add_start_docstrings(
+     "The bare Molformer Model transformer outputting raw hidden-states without any specific head on top.",
+     MOLFORMER_START_DOCSTRING,
+     """
+     add_pooling_layer (`bool`, *optional*, defaults to `True`):
+         Whether or not to apply a pooling layer.
+     """,
+ )
+ class MolformerModel(MolformerPreTrainedModel):
+     """
+     The model can behave as an encoder (with only self-attention).
+     """
+
+     def __init__(self, config, add_pooling_layer=True):
+         super().__init__(config)
+         self.config = config
+
+         self.embeddings = MolformerEmbeddings(config)
+         self.encoder = MolformerEncoder(config)
+
+         self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+         self.pooler = masked_avg_pool1d if add_pooling_layer else None
+
+         # Initialize weights and apply final processing
+         self.post_init()
+
+     def get_input_embeddings(self):
+         return self.embeddings.word_embeddings
+
+     def set_input_embeddings(self, value):
+         self.embeddings.word_embeddings = value
+
+     def _prune_heads(self, heads_to_prune):
+         """
+         Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+         class PreTrainedModel
+         """
+         for layer, heads in heads_to_prune.items():
+             self.encoder.layer[layer].attention.prune_heads(heads)
+
+     @add_start_docstrings_to_model_forward(MOLFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+     @add_code_sample_docstrings(
+         checkpoint=_CHECKPOINT_FOR_DOC,
+         output_type=BaseModelOutputWithPooling,
+         config_class=_CONFIG_FOR_DOC,
+     )
+     def forward(
+         self,
+         input_ids: Optional[torch.LongTensor] = None,
+         attention_mask: Optional[torch.FloatTensor] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+         head_mask: Optional[torch.FloatTensor] = None,
+         inputs_embeds: Optional[torch.FloatTensor] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+     ) -> Union[BaseModelOutputWithPooling, Tuple[torch.Tensor]]:
+         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+         output_hidden_states = (
+             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+         )
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+         if input_ids is not None and inputs_embeds is not None:
+             raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+         elif input_ids is not None:
+             self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
+             input_shape = input_ids.size()
+         elif inputs_embeds is not None:
+             input_shape = inputs_embeds.size()[:-1]
+         else:
+             raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+         batch_size, seq_length = input_shape
+         device = input_ids.device if input_ids is not None else inputs_embeds.device
+
+         if position_ids is None:
+             position_ids = torch.arange(seq_length, dtype=torch.long, device=device)
+             position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
+         else:
+             position_ids = position_ids.view(-1, seq_length).long()
+
+         if attention_mask is None:
+             attention_mask = torch.ones((batch_size, seq_length), device=device)
+
+         # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
+         # ourselves in which case we just need to make it broadcastable to all heads.
+         extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
+
+         # Prepare head mask if needed
+         # 1.0 in head_mask indicate we keep the head
+         # attention_probs has shape bsz x n_heads x N x N
+         # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+         # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+         head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+
+         embedding_output = self.embeddings(input_ids=input_ids, inputs_embeds=inputs_embeds)
+
+         encoder_outputs = self.encoder(
+             embedding_output,
+             attention_mask=extended_attention_mask,
+             position_ids=position_ids,
+             head_mask=head_mask,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+         )
+         sequence_output = encoder_outputs[0]
+         sequence_output = self.LayerNorm(sequence_output)
+         pooled_output = self.pooler(sequence_output, attention_mask) if self.pooler is not None else None
+
+         if not return_dict:
+             return (sequence_output, pooled_output) + encoder_outputs[1:]
+
+         return BaseModelOutputWithPooling(
+             last_hidden_state=sequence_output,
+             pooler_output=pooled_output,
+             hidden_states=encoder_outputs.hidden_states,
+             attentions=encoder_outputs.attentions,
+         )
+
+
+ @add_start_docstrings("""Molformer Model with a `language modeling` head on top.""", MOLFORMER_START_DOCSTRING)
+ class MolformerForMaskedLM(MolformerPreTrainedModel):
+     _tied_weights_keys = ["lm_head.decoder.weight"]
+
+     # Copied from transformers.models.roberta.modeling_roberta.RobertaForMaskedLM.__init__ with Roberta->Molformer,roberta->molformer,LMHead->LMPredictionHead
+     def __init__(self, config):
+         super().__init__(config)
+
+         if config.is_decoder:
+             logger.warning(
+                 "If you want to use `MolformerForMaskedLM` make sure `config.is_decoder=False` for "
+                 "bi-directional self-attention."
+             )
+
+         self.molformer = MolformerModel(config, add_pooling_layer=False)
+         self.lm_head = MolformerLMPredictionHead(config)
+
+         # Initialize weights and apply final processing
+         self.post_init()
+
+     def get_output_embeddings(self):
+         return self.lm_head.decoder
+
+     def set_output_embeddings(self, new_embeddings):
+         self.lm_head.decoder = new_embeddings
+
+     @add_start_docstrings_to_model_forward(MOLFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+     @add_code_sample_docstrings(
+         checkpoint=_CHECKPOINT_FOR_DOC,
+         output_type=MaskedLMOutput,
+         config_class=_CONFIG_FOR_DOC,
+         mask="P<mask>",  # add extra token so labels line up
+     )
+     def forward(
+         self,
+         input_ids: Optional[torch.LongTensor] = None,
+         attention_mask: Optional[torch.FloatTensor] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+         head_mask: Optional[torch.FloatTensor] = None,
+         inputs_embeds: Optional[torch.FloatTensor] = None,
+         encoder_hidden_states: Optional[torch.FloatTensor] = None,
+         encoder_attention_mask: Optional[torch.FloatTensor] = None,
+         labels: Optional[torch.LongTensor] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+     ) -> Union[MaskedLMOutput, Tuple[torch.Tensor]]:
+         r"""
+         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+             Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
+             config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked);
+             the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+         """
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+         outputs = self.molformer(
+             input_ids,
+             attention_mask=attention_mask,
+             position_ids=position_ids,
+             head_mask=head_mask,
+             inputs_embeds=inputs_embeds,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+         )
+
+         sequence_output = outputs[0]
+         prediction_scores = self.lm_head(sequence_output)
+
+         masked_lm_loss = None
+         if labels is not None:
+             # move labels to correct device to enable model parallelism
+             labels = labels.to(prediction_scores.device)
+             loss_fct = CrossEntropyLoss()  # -100 index = padding token
+             masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
+
+         if not return_dict:
+             output = (prediction_scores,) + outputs[2:]
+             return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
+
+         return MaskedLMOutput(
+             loss=masked_lm_loss,
+             logits=prediction_scores,
+             hidden_states=outputs.hidden_states,
+             attentions=outputs.attentions,
+         )
+
+
+ class MolformerClassificationHead(nn.Module):
+     """Head for sequence-level classification tasks."""
+
+     def __init__(self, config):
+         super().__init__()
+         self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+         self.dense2 = nn.Linear(config.hidden_size, config.hidden_size)
+         self.dropout = nn.Dropout(
+             config.classifier_dropout_prob
+             if config.classifier_dropout_prob is not None
+             else config.hidden_dropout_prob
+         )
+         self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
+         if isinstance(config.hidden_act, str):
+             self.classifier_act_fn = ACT2FN[config.hidden_act]
+         else:
+             self.classifier_act_fn = config.hidden_act
+         self.skip_connection = config.classifier_skip_connection
+
+     def forward(self, pooled_output):
+         hidden_state = self.dense(pooled_output)
+         hidden_state = self.dropout(hidden_state)
+         hidden_state = self.classifier_act_fn(hidden_state)
+         if self.skip_connection:
+             hidden_state = residual = hidden_state + pooled_output
+         hidden_state = self.dense2(hidden_state)
+         hidden_state = self.dropout(hidden_state)
+         hidden_state = self.classifier_act_fn(hidden_state)
+         if self.skip_connection:
+             hidden_state = hidden_state + residual
+         logits = self.out_proj(hidden_state)
+         return logits
+
+
+ @add_start_docstrings(
+     """
+     Molformer Model transformer with a sequence classification/regression head on top (a linear layer on top of the
+     pooled output) e.g. for MoleculeNet tasks.
+     """,
+     MOLFORMER_START_DOCSTRING,
+ )
+ class MolformerForSequenceClassification(MolformerPreTrainedModel):
+     def __init__(self, config):
+         super().__init__(config)
+         self.num_labels = config.num_labels
+         self.config = config
+
+         self.molformer = MolformerModel(config, add_pooling_layer=True)
+         self.classifier = MolformerClassificationHead(config)
+
+         # Initialize weights and apply final processing
+         self.post_init()
+
+     @add_start_docstrings_to_model_forward(MOLFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+     @add_code_sample_docstrings(
+         checkpoint=_CHECKPOINT_FOR_DOC,
+         output_type=SequenceClassifierOutput,
+         config_class=_CONFIG_FOR_DOC,
+     )
+     def forward(
+         self,
+         input_ids: Optional[torch.LongTensor] = None,
+         attention_mask: Optional[torch.FloatTensor] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+         head_mask: Optional[torch.FloatTensor] = None,
+         inputs_embeds: Optional[torch.FloatTensor] = None,
+         labels: Optional[torch.LongTensor] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+     ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
+         r"""
+         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+             Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+             config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss); if
+             `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+         """
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+         outputs = self.molformer(
+             input_ids,
+             attention_mask=attention_mask,
+             position_ids=position_ids,
+             head_mask=head_mask,
+             inputs_embeds=inputs_embeds,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+         )
+
+         pooled_output = outputs[1]
+         logits = self.classifier(pooled_output)
+
+         loss = None
+         if labels is not None:
+             # move labels to correct device to enable model parallelism
+             labels = labels.to(logits.device)
+             if self.config.problem_type is None:
+                 if self.num_labels == 1:
+                     self.config.problem_type = "regression"
+                 elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                     self.config.problem_type = "single_label_classification"
+                 else:
+                     self.config.problem_type = "multi_label_classification"
+
+             if self.config.problem_type == "regression":
+                 loss_fct = MSELoss()
+                 if self.num_labels == 1:
+                     loss = loss_fct(logits.squeeze(), labels.squeeze())
+                 else:
+                     loss = loss_fct(logits, labels)
+             elif self.config.problem_type == "single_label_classification":
+                 loss_fct = CrossEntropyLoss()
+                 loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+             elif self.config.problem_type == "multi_label_classification":
+                 loss_fct = BCEWithLogitsLoss()
+                 loss = loss_fct(logits, labels)
+
+         if not return_dict:
+             output = (logits,) + outputs[2:]
+             return ((loss,) + output) if loss is not None else output
+
+         return SequenceClassifierOutput(
+             loss=loss,
+             logits=logits,
+             hidden_states=outputs.hidden_states,
+             attentions=outputs.attentions,
+         )
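
The attention in `MolformerSelfAttention.forward` uses the linear-attention identity: once the feature map makes queries and keys non-negative, per-query L1-normalized attention equals φ(Q)(φ(K)ᵀV) divided by φ(Q)·Σⱼφ(K)ⱼ, so the N×N score matrix never has to be materialized. A self-contained sketch of that equivalence; all shapes and the epsilon here are illustrative toy values, not the model's:

```python
import torch

# Toy shapes: batch=2, heads=4, seq len N=16, feature dim m=8, head dim d=8.
q = torch.rand(2, 4, 16, 8)   # stands in for phi(Q): non-negative features
k = torch.rand(2, 4, 16, 8)   # stands in for phi(K)
v = torch.randn(2, 4, 16, 8)
eps = 1e-6

# Linear form, as in the model: O(N * m * d) memory/compute.
kv = k.transpose(-1, -2) @ v                              # [.., m, d]
norm = (q @ k.sum(dim=-2).unsqueeze(-1)).clamp(min=eps)   # [.., N, 1]
ctx_linear = (q @ kv) / norm

# Equivalent quadratic form: explicit N x N scores, L1-normalized per query
# (what the output_attentions branch above computes for visualization).
scores = q @ k.transpose(-1, -2)                          # [.., N, N]
ctx_quadratic = (scores / scores.sum(-1, keepdim=True).clamp(min=eps)) @ v

print(torch.allclose(ctx_linear, ctx_quadratic, atol=1e-5))  # True
```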
special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
+ {
+   "cls_token": {
+     "content": "<bos>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "mask_token": {
+     "content": "<mask>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<pad>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "sep_token": {
+     "content": "<eos>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenization_molformer.py ADDED
@@ -0,0 +1,226 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2023 The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Tokenization classes for Molformer."""
16
+
17
+ import collections
18
+ import json
19
+ import os
20
+ import re
21
+ from typing import List, Optional, Tuple
22
+
23
+ from transformers.tokenization_utils import PreTrainedTokenizer
24
+ from transformers.utils import logging
25
+
26
+
27
+ logger = logging.get_logger(__name__)
28
+
29
+ VOCAB_FILES_NAMES = {"vocab_file": "vocab.json"}
30
+
31
+ PRETRAINED_VOCAB_FILES_MAP = {
32
+ "vocab_file": {
33
+ "ibm/MoLFormer-XL-both-10pct": "https://huggingface.co/ibm/MoLFormer-XL-both-10pct/resolve/main/vocab.json",
34
+ }
35
+ }
36
+
37
+ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
38
+ "ibm/MoLFormer-XL-both-10pct": 202,
39
+ }
40
+
41
+
42
+ class MolformerTokenizer(PreTrainedTokenizer):
43
+ r"""
44
+ Construct a Molformer tokenizer. Based on regex.
45
+
46
+ This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
47
+ this superclass for more information regarding those methods.
48
+
49
+ Args:
50
+ vocab_file (`str`):
51
+ File containing the vocabulary.
52
+ unk_token (`str`, *optional*, defaults to `"<unk>"`):
53
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
54
+ token instead.
55
+ sep_token (`str`, *optional*, defaults to `"<eos>"`):
56
+ The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
57
+ sequence classification or for a text and a question for question answering. It is also used as the last
58
+ token of a sequence built with special tokens.
59
+ pad_token (`str`, *optional*, defaults to `"<pad>"`):
60
+ The token used for padding, for example when batching sequences of different lengths.
61
+ cls_token (`str`, *optional*, defaults to `"<bos>"`):
62
+ The classifier token which is used when doing sequence classification (classification of the whole sequence
63
+ instead of per-token classification). It is the first token of the sequence when built with special tokens.
64
+ mask_token (`str`, *optional*, defaults to `"<mask>"`):
65
+ The token used for masking values. This is the token used when training this model with masked language
66
+ modeling. This is the token which the model will try to predict.
67
+ """
68
+
69
+ vocab_files_names = VOCAB_FILES_NAMES
70
+ pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
71
+ max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
72
+ model_input_names = ["input_ids", "attention_mask"]
73
+
74
+ def __init__(
75
+ self,
76
+ vocab_file,
77
+ unk_token="<unk>",
78
+ sep_token="<eos>",
79
+ pad_token="<pad>",
80
+ cls_token="<bos>",
81
+ mask_token="<mask>",
82
+ **kwargs,
83
+ ):
84
+ if not os.path.isfile(vocab_file):
85
+ raise ValueError(
86
+ f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from an IBM pretrained"
87
+ " model use `tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
88
+ )
89
+ with open(vocab_file, encoding="utf-8") as vocab_handle:
90
+ self.vocab = json.load(vocab_handle)
91
+ self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
92
+ self.pattern = (
93
+ r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9])"
94
+ )
95
+ self.regex_tokenizer = re.compile(self.pattern)
96
+
97
+ super().__init__(
98
+ unk_token=unk_token,
99
+ sep_token=sep_token,
100
+ pad_token=pad_token,
101
+ cls_token=cls_token,
102
+ mask_token=mask_token,
103
+ **kwargs,
104
+ )
105
+
106
+ @property
107
+ def vocab_size(self):
108
+ return len(self.vocab)
109
+
110
+ def get_vocab(self):
111
+ return dict(self.vocab, **self.added_tokens_encoder)
112
+
113
+ def _tokenize(self, text):
114
+ split_tokens = self.regex_tokenizer.findall(text)
115
+ return split_tokens
116
+
117
+ def _convert_token_to_id(self, token):
118
+ """Converts a token (str) in an id using the vocab."""
119
+ return self.vocab.get(token, self.vocab.get(self.unk_token))
120
+
121
+ def _convert_id_to_token(self, index):
122
+ """Converts an index (integer) in a token (str) using the vocab."""
123
+ return self.ids_to_tokens.get(index, self.unk_token)
124
+
125
+ def convert_tokens_to_string(self, tokens):
126
+ """Converts a sequence of tokens (string) in a single string."""
127
+ out_string = "".join(tokens).strip()
128
+ return out_string
129
+
+     # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.build_inputs_with_special_tokens
+     def build_inputs_with_special_tokens(
+         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+     ) -> List[int]:
+         """
+         Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
+         and adding special tokens. A BERT sequence has the following format:
+
+         - single sequence: `[CLS] X [SEP]`
+         - pair of sequences: `[CLS] A [SEP] B [SEP]`
+
+         Args:
+             token_ids_0 (`List[int]`):
+                 List of IDs to which the special tokens will be added.
+             token_ids_1 (`List[int]`, *optional*):
+                 Optional second list of IDs for sequence pairs.
+
+         Returns:
+             `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+         """
+         if token_ids_1 is None:
+             return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+         cls = [self.cls_token_id]
+         sep = [self.sep_token_id]
+         return cls + token_ids_0 + sep + token_ids_1 + sep
+
+     # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.get_special_tokens_mask
+     def get_special_tokens_mask(
+         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
+     ) -> List[int]:
+         """
+         Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+         special tokens using the tokenizer `prepare_for_model` method.
+
+         Args:
+             token_ids_0 (`List[int]`):
+                 List of IDs.
+             token_ids_1 (`List[int]`, *optional*):
+                 Optional second list of IDs for sequence pairs.
+             already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                 Whether or not the token list is already formatted with special tokens for the model.
+
+         Returns:
+             `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+         """
+
+         if already_has_special_tokens:
+             return super().get_special_tokens_mask(
+                 token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+             )
+
+         if token_ids_1 is not None:
+             return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
+         return [1] + ([0] * len(token_ids_0)) + [1]
+
+     # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.create_token_type_ids_from_sequences
+     def create_token_type_ids_from_sequences(
+         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+     ) -> List[int]:
+         """
+         Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence
+         pair mask has the following format:
+
+         ```
+         0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+         | first sequence    | second sequence |
+         ```
+
+         If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
+
+         Args:
+             token_ids_0 (`List[int]`):
+                 List of IDs.
+             token_ids_1 (`List[int]`, *optional*):
+                 Optional second list of IDs for sequence pairs.
+
+         Returns:
+             `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
+         """
+         sep = [self.sep_token_id]
+         cls = [self.cls_token_id]
+         if token_ids_1 is None:
+             return len(cls + token_ids_0 + sep) * [0]
+         return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
+
+     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+         if not os.path.isdir(save_directory):
+             logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+             return
+         vocab_file = os.path.join(
+             save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+         )
+
+         with open(vocab_file, "w", encoding="utf-8") as f:
+             f.write(json.dumps(self.vocab, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
+
+         return (vocab_file,)
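
The `_tokenize` method above delegates all SMILES splitting to the compiled regex, so its behavior can be previewed outside the class. A minimal standalone sketch, reusing the same pattern on the aspirin SMILES string:

```python
import re

# The same atom-level SMILES pattern compiled in MolformerTokenizer.__init__ above.
pattern = (
    r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9])"
)
regex_tokenizer = re.compile(pattern)

smiles = "CC(=O)Oc1ccccc1C(=O)O"  # aspirin
print(regex_tokenizer.findall(smiles))
# ['C', 'C', '(', '=', 'O', ')', 'O', 'c', '1', 'c', 'c', 'c', 'c', 'c', '1', 'C', '(', '=', 'O', ')', 'O']
```

Because the pattern matches SMILES atom-wise with no whitespace involved, `convert_tokens_to_string` can reassemble the molecule by plain concatenation, which is why it simply joins the tokens.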
tokenization_molformer_fast.py ADDED
@@ -0,0 +1,153 @@
+ # coding=utf-8
+ # Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Tokenization classes for Molformer."""
+ from typing import List, Optional, Tuple
+
+ from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
+ from transformers.utils import logging
+ from .tokenization_molformer import MolformerTokenizer
+
+
+ logger = logging.get_logger(__name__)
+
+ VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "tokenizer_file": "tokenizer.json"}
+
+ PRETRAINED_VOCAB_FILES_MAP = {
+     "vocab_file": {
+         "ibm/MoLFormer-XL-both-10pct": "https://huggingface.co/ibm/MoLFormer-XL-both-10pct/resolve/main/vocab.json",
+     }
+ }
+
+ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+     "ibm/MoLFormer-XL-both-10pct": 202,
+ }
+
+
+ class MolformerTokenizerFast(PreTrainedTokenizerFast):
+     r"""
+     Construct a "fast" Molformer tokenizer.
+
+     This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
+     refer to this superclass for more information regarding those methods.
+
+     Args:
+         vocab_file (`str`, *optional*):
+             File containing the vocabulary.
+         tokenizer_file (`str`, *optional*):
+             The path to a tokenizer file to use instead of the vocab file.
+         unk_token (`str`, *optional*, defaults to `"<unk>"`):
+             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+             token instead.
+         sep_token (`str`, *optional*, defaults to `"<eos>"`):
+             The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+             sequence classification or for a text and a question for question answering. It is also used as the last
+             token of a sequence built with special tokens.
+         pad_token (`str`, *optional*, defaults to `"<pad>"`):
+             The token used for padding, for example when batching sequences of different lengths.
+         cls_token (`str`, *optional*, defaults to `"<bos>"`):
+             The classifier token which is used when doing sequence classification (classification of the whole sequence
+             instead of per-token classification). It is the first token of the sequence when built with special tokens.
+         mask_token (`str`, *optional*, defaults to `"<mask>"`):
+             The token used for masking values. This is the token used when training this model with masked language
+             modeling. This is the token which the model will try to predict.
+     """
+
+     vocab_files_names = VOCAB_FILES_NAMES
+     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+     model_input_names = ["input_ids", "attention_mask"]
+     slow_tokenizer_class = MolformerTokenizer
+
+     def __init__(
+         self,
+         vocab_file=None,
+         tokenizer_file=None,
+         unk_token="<unk>",
+         sep_token="<eos>",
+         pad_token="<pad>",
+         cls_token="<bos>",
+         mask_token="<mask>",
+         **kwargs,
+     ):
+         super().__init__(
+             vocab_file,
+             tokenizer_file=tokenizer_file,
+             unk_token=unk_token,
+             sep_token=sep_token,
+             pad_token=pad_token,
+             cls_token=cls_token,
+             mask_token=mask_token,
+             **kwargs,
+         )
+
+     # Copied from transformers.models.bert.tokenization_bert_fast.BertTokenizerFast.build_inputs_with_special_tokens
+     def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+         """
+         Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
+         and adding special tokens. A BERT sequence has the following format:
+
+         - single sequence: `[CLS] X [SEP]`
+         - pair of sequences: `[CLS] A [SEP] B [SEP]`
+
+         Args:
+             token_ids_0 (`List[int]`):
+                 List of IDs to which the special tokens will be added.
+             token_ids_1 (`List[int]`, *optional*):
+                 Optional second list of IDs for sequence pairs.
+
+         Returns:
+             `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+         """
+         output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+
+         if token_ids_1 is not None:
+             output += token_ids_1 + [self.sep_token_id]
+
+         return output
+
+     # Copied from transformers.models.bert.tokenization_bert_fast.BertTokenizerFast.create_token_type_ids_from_sequences
+     def create_token_type_ids_from_sequences(
+         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+     ) -> List[int]:
+         """
+         Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence
+         pair mask has the following format:
+
+         ```
+         0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+         | first sequence    | second sequence |
+         ```
+
+         If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
+
+         Args:
+             token_ids_0 (`List[int]`):
+                 List of IDs.
+             token_ids_1 (`List[int]`, *optional*):
+                 Optional second list of IDs for sequence pairs.
+
+         Returns:
+             `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
+         """
+         sep = [self.sep_token_id]
+         cls = [self.cls_token_id]
+         if token_ids_1 is None:
+             return len(cls + token_ids_0 + sep) * [0]
+         return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
+
+     # Copied from transformers.models.bert.tokenization_bert_fast.BertTokenizerFast.save_vocabulary
+     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+         files = self._tokenizer.model.save(save_directory, name=filename_prefix)
+         return tuple(files)
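
Both the slow and fast tokenizers produce the same special-token layout. A standalone sketch of what `build_inputs_with_special_tokens` and `create_token_type_ids_from_sequences` compute, using the special-token ids declared in `tokenizer_config.json` below (`<bos>` = 0, `<eos>` = 1; the sequence ids 10–21 are placeholders):

```python
CLS_ID, SEP_ID = 0, 1  # cls_token "<bos>" and sep_token "<eos>" per tokenizer_config.json

def build_inputs(token_ids_0, token_ids_1=None):
    # Mirrors build_inputs_with_special_tokens: <bos> A <eos> (B <eos>).
    output = [CLS_ID] + token_ids_0 + [SEP_ID]
    if token_ids_1 is not None:
        output += token_ids_1 + [SEP_ID]
    return output

def token_type_ids(token_ids_0, token_ids_1=None):
    # Mirrors create_token_type_ids_from_sequences: 0s cover the first
    # sequence and its special tokens, 1s cover the second.
    first = len([CLS_ID] + token_ids_0 + [SEP_ID]) * [0]
    if token_ids_1 is None:
        return first
    return first + len(token_ids_1 + [SEP_ID]) * [1]

a, b = [10, 11, 12], [20, 21]
print(build_inputs(a, b))    # [0, 10, 11, 12, 1, 20, 21, 1]
print(token_type_ids(a, b))  # [0, 0, 0, 0, 0, 1, 1, 1]
```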
tokenizer_config.json ADDED
@@ -0,0 +1,59 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<bos>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<eos>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "<pad>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "<mask>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2361": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "auto_map": {
+     "AutoTokenizer": [
+       "tokenization_molformer.MolformerTokenizer",
+       "tokenization_molformer_fast.MolformerTokenizerFast"
+     ]
+   },
+   "clean_up_tokenization_spaces": true,
+   "cls_token": "<bos>",
+   "extra_special_tokens": {},
+   "mask_token": "<mask>",
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "<pad>",
+   "sep_token": "<eos>",
+   "tokenizer_class": "MolformerTokenizer",
+   "unk_token": "<unk>"
+ }
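
The `auto_map` entry above routes `AutoTokenizer` to the custom classes uploaded in this commit; because the code lives in the repo rather than in `transformers`, loading requires `trust_remote_code=True`. A minimal usage sketch, assuming the files are available under the `ibm/MoLFormer-XL-both-10pct` checkpoint referenced throughout this upload (substitute a local path otherwise):

```python
from transformers import AutoTokenizer

# trust_remote_code=True is needed so the auto_map classes in this repo are used.
tokenizer = AutoTokenizer.from_pretrained("ibm/MoLFormer-XL-both-10pct", trust_remote_code=True)

encoding = tokenizer("CC(=O)Oc1ccccc1C(=O)O")  # aspirin
print(encoding["input_ids"])
print(tokenizer.convert_ids_to_tokens(encoding["input_ids"]))
```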