Minor changes and README.md fix
Browse files- README.md +3 -3
- config.json +1 -1
- modeling_m5_encoder.py +1 -0
README.md
CHANGED
|
@@ -61,7 +61,7 @@ rel_pos = torch.tensor(pos_encod).unsqueeze(0) # (1, seq_len, seq_len)
|
|
| 61 |
|
| 62 |
outputs = model(input_ids=input_ids, attention_mask=attn_mask, relative_position=rel_pos)
|
| 63 |
hidden = outputs.last_hidden_state # (1, seq_len, 512)
|
| 64 |
-
```
|
| 65 |
|
| 66 |
A function ``model.collate_for_dataset`` is also available to perform collation for use in PyTorch's DataLoader. The function gets a list of tuples, each of which is composed of:
|
| 67 |
- the first element is a dictionary with keys ``"input_ids"`` (``np.ndarray``, shape ``(L,)``) and ``"attention_mask"`` (``np.ndarray``, shape ``(L,)``), as produced by a tokenizer
|
|
@@ -79,8 +79,8 @@ A function ``model.collate_for_dataset`` is also available to perform collation
|
|
| 79 |
| `num_heads` | 12 |
|
| 80 |
| `vocab_size` | 1 032 |
|
| 81 |
| `feed_forward_proj` | gated-gelu |
|
| 82 |
-
| `relative_attention_num_buckets` |
|
| 83 |
-
| `relative_attention_max_distance` |
|
| 84 |
|
| 85 |
Position biases are replaced by molecular-graph distances computed
|
| 86 |
with RDKit and binned with a modified T5 logarithmic binning algorithm, giving the model awareness of molecular topology without being too strict on precise distances.
|
|
|
|
| 61 |
|
| 62 |
outputs = model(input_ids=input_ids, attention_mask=attn_mask, relative_position=rel_pos)
|
| 63 |
hidden = outputs.last_hidden_state # (1, seq_len, 512)
|
| 64 |
+
```
|
| 65 |
|
| 66 |
A function ``model.collate_for_dataset`` is also available to perform collation for use in PyTorch's DataLoader. The function gets a list of tuples, each of which is composed of:
|
| 67 |
- the first element is a dictionary with keys ``"input_ids"`` (``np.ndarray``, shape ``(L,)``) and ``"attention_mask"`` (``np.ndarray``, shape ``(L,)``), as produced by a tokenizer
|
|
|
|
| 79 |
| `num_heads` | 12 |
|
| 80 |
| `vocab_size` | 1 032 |
|
| 81 |
| `feed_forward_proj` | gated-gelu |
|
| 82 |
+
| `relative_attention_num_buckets` | 32 |
|
| 83 |
+
| `relative_attention_max_distance` | 96 |
|
| 84 |
|
| 85 |
Position biases are replaced by molecular-graph distances computed
|
| 86 |
with RDKit and binned with a modified T5 logarithmic binning algorithm, giving the model awareness of molecular topology without being too strict on precise distances.
|
config.json
CHANGED
|
@@ -148,7 +148,7 @@
|
|
| 148 |
},
|
| 149 |
"layer_norm_epsilon": 1e-06,
|
| 150 |
"model_type": "m5_model",
|
| 151 |
-
"num_decoder_layers":
|
| 152 |
"num_heads": 12,
|
| 153 |
"num_layers": 24,
|
| 154 |
"pad_token_id": 2,
|
|
|
|
| 148 |
},
|
| 149 |
"layer_norm_epsilon": 1e-06,
|
| 150 |
"model_type": "m5_model",
|
| 151 |
+
"num_decoder_layers": 0,
|
| 152 |
"num_heads": 12,
|
| 153 |
"num_layers": 24,
|
| 154 |
"pad_token_id": 2,
|
modeling_m5_encoder.py
CHANGED
|
@@ -50,6 +50,7 @@ class M5EncoderConfig(T5Config):
|
|
| 50 |
relative_attention_max_distance=relative_attention_max_distance,
|
| 51 |
relative_attention_num_buckets=relative_attention_num_buckets,
|
| 52 |
vocab_size=vocab_size,
|
|
|
|
| 53 |
**kwargs)
|
| 54 |
|
| 55 |
class M5Encoder(PreTrainedModel):
|
|
|
|
| 50 |
relative_attention_max_distance=relative_attention_max_distance,
|
| 51 |
relative_attention_num_buckets=relative_attention_num_buckets,
|
| 52 |
vocab_size=vocab_size,
|
| 53 |
+
num_decoder_layers=0,
|
| 54 |
**kwargs)
|
| 55 |
|
| 56 |
class M5Encoder(PreTrainedModel):
|