judewells committed · verified · 1 Parent(s): 39d8a18

Commit afd882c

Upload folder using huggingface_hub

Files changed (3)
  1. .hydra/config.yaml +417 -0
  2. .hydra/gym_config.yaml +242 -0
  3. checkpoints/last.ckpt +3 -0
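
Aside: per the commit message, this folder was uploaded with huggingface_hub. A minimal sketch of mirroring the commit locally ("<user>/<repo>" is a placeholder, since this page does not show the repository id; assumes huggingface_hub is installed):

    # Download the repo contents pinned to the commit shown above.
    # "<user>/<repo>" is a placeholder -- substitute the real repository id.
    from huggingface_hub import snapshot_download

    local_dir = snapshot_download(repo_id="<user>/<repo>", revision="afd882c")
    print(local_dir)  # contains the .hydra/ configs and checkpoints/last.ckpt
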
.hydra/config.yaml ADDED
@@ -0,0 +1,417 @@
+ task_name: train
+ experiment_group: openfold_fs50_ur90_memmap
+ tags:
+ - ${experiment_group}
+ train: true
+ test: false
+ ckpt_path: null
+ seed: 12345
+ float32_matmul_precision: high
+ model:
+   _target_: src.models.llama.LlamaLitModule
+   scheduler_name: constant_with_warmup
+   num_warmup_steps: 200
+   num_training_steps: 1000000
+   lr: 0.001
+   embed_coords: false
+   embed_sequence_index: false
+   embed_residue_index: false
+   use_kv_cache_for_scoring: true
+   max_seq_pos_in_doc: 1024
+   max_res_pos_in_seq: 4096
+   pass_res_pos_in_doc_as_position_ids: true
+   optimizer: adamw
+   config:
+     _target_: transformers.LlamaConfig
+     vocab_size: ${constants.vocab_size}
+     hidden_size: 1024
+     intermediate_size: 4096
+     num_attention_heads: 16
+     num_hidden_layers: 16
+     num_key_value_heads: 8
+     rope_theta: 500000
+     max_position_embeddings: 131072
+     scoring_max_tokens: 10240
+     attn_implementation: flash_attention_2
+     attention_bias: false
+     attention_dropout: 0.0
+     rms_norm_eps: 1.0e-05
+     hidden_act: silu
+     torch_dtype: bfloat16
+     use_cache: true
+     pretraining_tp: 1
+     rope_scaling:
+       factor: 32.0
+       high_freq_factor: 4.0
+       low_freq_factor: 1.0
+       original_max_position_embeddings: 32000
+       rope_type: llama3
+ callbacks:
+   throughput:
+     _target_: src.utils.callbacks.TokenThroughputMonitor
+   model_checkpoint:
+     _target_: lightning.pytorch.callbacks.ModelCheckpoint
+     dirpath: ${paths.output_dir}/checkpoints
+     filename: epoch_{epoch:03d}
+     monitor: val/loss
+     verbose: false
+     save_last: true
+     save_top_k: 1
+     mode: min
+     auto_insert_metric_name: false
+     save_weights_only: false
+     every_n_train_steps: null
+     train_time_interval: null
+     every_n_epochs: null
+     save_on_train_epoch_end: null
+   model_summary:
+     _target_: lightning.pytorch.callbacks.RichModelSummary
+     max_depth: -1
+   rich_progress_bar:
+     _target_: lightning.pytorch.callbacks.RichProgressBar
+   timer:
+     _target_: src.utils.callbacks.EpochTimerCallback
+   print:
+     _target_: src.utils.callbacks.PrintCallback
+   sample_counter:
+     _target_: src.utils.callbacks.SampleCounter
+ logger:
+   wandb:
+     _target_: src.utils.loggers.WandbLogger
+     save_dir: ${paths.output_dir}
+     offline: false
+     id: null
+     anonymous: null
+     project: profam
+     log_model: false
+     prefix: ''
+     entity: ProFam
+     group: ''
+     name: null
+     tags: ${tags}
+     job_type: ''
+     log_hydra_config_file: true
+     log_git_hash: true
+ trainer:
+   _target_: src.utils.trainer.ProFamTrainer
+   default_root_dir: ${paths.output_dir}
+   max_epochs: 10000
+   max_steps: -1
+   accelerator: gpu
+   devices: auto
+   check_val_every_n_epoch: 1
+   val_check_interval: 50000
+   target_tokens_per_batch: null
+   tokens_per_document: 30000
+   batch_size: ${data.batch_size}
+   deterministic: false
+   log_every_n_steps: 10
+   timeout: 120
+   profiler:
+     name: null
+     log_tensorboard: false
+     simple:
+       _target_: SimpleProfiler
+     advance:
+       _target_: AdvancedProfiler
+       filename: advanced_perf_logs
+       dirpath: ./profiler_logs
+     pytorch:
+       _target_: PyTorchProfiler
+       filename: pytorch_perf_logs
+       dirpath: ./profiler_logs
+       record_shapes: true
+       profile_memory: true
+       with_stack: true
+       with_flops: false
+       with_modules: false
+       acc_events: false
+   strategy: ddp
+   num_nodes: 1
+   sync_batchnorm: true
+   precision: bf16-true
+   min_epochs: 1000
+   accumulate_grad_batches: 2
+   use_distributed_sampler: false
+ paths:
+   root_dir: ${oc.env:PROJECT_ROOT}
+   data_dir: /home/jovyan/shared/judewells/profam/data
+   log_dir: ${paths.root_dir}/logs/${experiment_group}
+   output_dir: ${hydra:runtime.output_dir}
+   work_dir: ${hydra:runtime.cwd}
+ extras:
+   ignore_warnings: false
+   enforce_tags: true
+   print_config: true
+ tokenizer:
+   _target_: src.data.tokenizers.ProFamTokenizer
+   tokenizer_file: data/profam_tokenizer.json
+   unk_token: '[UNK]'
+   pad_token: '[PAD]'
+   bos_token: '[start-of-document]'
+   sep_token: '[SEP]'
+   mask_token: '?'
+   seq_struct_sep_token: '|'
+   add_final_sep: true
+   add_bos_token: true
+   add_document_token: true
+   mask_below_plddt: 80
+   max_res_pos_in_seq: ${model.max_res_pos_in_seq}
+   embed_residue_index: ${model.embed_residue_index}
+ constants:
+   vocab_size: 68
+   gym_val_assay_list:
+   - BLAT_ECOLX_Jacquier_2013
+   - CALM1_HUMAN_Weile_2017
+   - DYR_ECOLI_Thompson_2019
+   - DLG4_RAT_McLaughlin_2012
+   - REV_HV1H2_Fernandes_2016
+   - TAT_HV1BR_Fernandes_2016
+   - RL40A_YEAST_Roscoe_2013
+   - P53_HUMAN_Giacomelli_2018_WT_Nutlin
+   sequence_features:
+   - ds_name
+   - identifier
+   - input_ids
+   - attention_mask
+   - original_size
+   - residue_index
+   - batch_size
+   structure_features:
+   - ds_name
+   - identifier
+   - input_ids
+   - attention_mask
+   - original_size
+   - residue_index
+   - coords
+   - coords_mask
+   - interleaved_coords_mask
+   - aa_mask
+   - plddts
+   - structure_mask
+ extra_callbacks:
+   unconditional_sampling_callback:
+     prompt_builder:
+       preprocessor:
+         cfg:
+           _target_: src.data.processors.PreprocessingConfig
+           document_token: '[RAW]'
+           drop_first_protein: false
+           keep_first_protein: false
+           allow_unk: false
+           max_tokens_per_example: 8192
+           shuffle_proteins_in_document: true
+           padding: do_not_pad
+           transform_fns:
+           - _target_: src.data.processors.transforms.replace_nans_in_coords
+             _partial_: true
+             fill_value: 0.0
+         _target_: src.data.processors.ProteinDocumentPreprocessor
+       _target_: src.models.inference.PromptBuilder
+     _target_: src.pipelines.callback.SamplingEvaluationPipelineCallback
+     pipeline:
+       _target_: src.pipelines.unconditional_sequence.UnconditionalSequenceEvaluationPipeline
+       num_generations: 5
+       max_tokens: 20000
+       max_generated_length: 300
+     pipeline_id: unconditional_sampling
+     save_results_to_file: false
+     evaluators:
+       _target_: src.evaluators.esmfold.ESMFoldSamplingEvaluator
+       name: esmfold_example
+ data:
+   _target_: src.data.datamodule.ProteinDataMixture
+   dataset_builders:
+     openfold_train:
+       _target_: src.data.builders.family_text_memmap_datasets.ProteinFamilyMemmapDatasetBuilder
+       name: openfold_train
+       dataset_root: ${paths.data_dir}/openfold/uniclust30_clustered_shuffled_final_text/train_test_split_v2/train_filtered
+       tokenizer: ${tokenizer}
+       preprocessor:
+         _target_: src.data.processors.ProteinDocumentPreprocessor
+         cfg:
+           _target_: src.data.processors.AlignedProteinPreprocessingConfig
+           document_token: '[RAW]'
+           drop_first_protein: false
+           keep_first_protein: false
+           allow_unk: false
+           max_tokens_per_example: 8192
+           shuffle_proteins_in_document: true
+           padding: do_not_pad
+           keep_gaps: false
+           keep_insertions: true
+           to_upper: true
+           use_msa_pos: false
+           transform_fns:
+           - _target_: src.data.processors.transforms.replace_nans_in_coords
+             _partial_: true
+             fill_value: 0.0
+     proteingym:
+       _target_: src.data.builders.proteingym.ProteinGymDataset
+       name: proteingym
+       dms_ids: ${constants.gym_val_assay_list}
+       seed: 42
+       max_mutated_sequences: null
+       mutant_bos_token: sep
+       keep_gaps: false
+       use_filtered_msa: true
+       extra_tokens_per_document: 2
+       use_msa_pos: false
+       num_proc: null
+       max_tokens_per_example: 7500
+       max_context_seqs: null
+       keep_wt: false
+       drop_wt: true
+     foldseek_s50_train:
+       _target_: src.data.builders.family_text_memmap_datasets.ProteinFamilyMemmapDatasetBuilder
+       name: foldseek_s50_train
+       dataset_root: ${paths.data_dir}/foldseek/foldseek_s50_seq_only_text/train_test_split_v2/train_filtered
+       tokenizer: ${tokenizer}
+       seed: ${seed}
+       preprocessor:
+         _target_: src.data.processors.ProteinDocumentPreprocessor
+         cfg:
+           _target_: src.data.processors.PreprocessingConfig
+           document_token: '[RAW]'
+           drop_first_protein: false
+           keep_first_protein: false
+           allow_unk: false
+           max_tokens_per_example: 8192
+           shuffle_proteins_in_document: true
+           padding: do_not_pad
+           transform_fns:
+           - _target_: src.data.processors.transforms.replace_nans_in_coords
+             _partial_: true
+             fill_value: 0.0
+     uniref90_train:
+       _target_: src.data.builders.family_text_memmap_datasets.ProteinFamilyMemmapDatasetBuilder
+       name: uniref90_train
+       dataset_root: ${paths.data_dir}/uniref/uniref90_text_shuffled/train_test_split_v2/train_filtered
+       tokenizer: ${tokenizer}
+       preprocessor:
+         _target_: src.data.processors.ProteinDocumentPreprocessor
+         cfg:
+           _target_: src.data.processors.PreprocessingConfig
+           document_token: '[RAW]'
+           drop_first_protein: false
+           keep_first_protein: false
+           allow_unk: false
+           max_tokens_per_example: 320
+           shuffle_proteins_in_document: true
+           padding: do_not_pad
+           transform_fns:
+           - _target_: src.data.processors.transforms.replace_nans_in_coords
+             _partial_: true
+             fill_value: 0.0
+     uniref90_val:
+       _target_: src.data.builders.family_text_memmap_datasets.ProteinFamilyMemmapDatasetBuilder
+       name: uniref90_val
+       dataset_root: ${paths.data_dir}/uniref/uniref90_text_shuffled/train_test_split_v2/val_filtered
+       tokenizer: ${tokenizer}
+       preprocessor:
+         _target_: src.data.processors.ProteinDocumentPreprocessor
+         cfg:
+           _target_: src.data.processors.PreprocessingConfig
+           document_token: '[RAW]'
+           drop_first_protein: false
+           keep_first_protein: false
+           allow_unk: false
+           max_tokens_per_example: 320
+           shuffle_proteins_in_document: true
+           padding: do_not_pad
+           transform_fns:
+           - _target_: src.data.processors.transforms.replace_nans_in_coords
+             _partial_: true
+             fill_value: 0.0
+     funfams_s50_train:
+       _target_: src.data.builders.family_text_memmap_datasets.ProteinFamilyMemmapDatasetBuilder
+       name: funfams_s50_train
+       dataset_root: ${paths.data_dir}/funfams/s50_text/train_test_split_v2/train_filtered
+       tokenizer: ${tokenizer}
+       seed: ${seed}
+       preprocessor:
+         _target_: src.data.processors.ProteinDocumentPreprocessor
+         cfg:
+           _target_: src.data.processors.AlignedProteinPreprocessingConfig
+           document_token: '[RAW]'
+           drop_first_protein: false
+           keep_first_protein: false
+           allow_unk: false
+           max_tokens_per_example: 8192
+           shuffle_proteins_in_document: true
+           padding: do_not_pad
+           keep_gaps: false
+           keep_insertions: true
+           to_upper: true
+           use_msa_pos: false
+           transform_fns:
+           - _target_: src.data.processors.transforms.replace_nans_in_coords
+             _partial_: true
+             fill_value: 0.0
+     funfams_s50_val:
+       _target_: src.data.builders.family_text_memmap_datasets.ProteinFamilyMemmapDatasetBuilder
+       name: funfams_s50_val
+       dataset_root: ${paths.data_dir}/funfams/s50_text/train_test_split_v2/val_filtered
+       tokenizer: ${tokenizer}
+       preprocessor:
+         _target_: src.data.processors.ProteinDocumentPreprocessor
+         cfg:
+           _target_: src.data.processors.AlignedProteinPreprocessingConfig
+           document_token: '[RAW]'
+           drop_first_protein: false
+           keep_first_protein: false
+           allow_unk: false
+           max_tokens_per_example: 8192
+           shuffle_proteins_in_document: true
+           padding: do_not_pad
+           keep_gaps: false
+           keep_insertions: true
+           to_upper: true
+           use_msa_pos: false
+           transform_fns:
+           - _target_: src.data.processors.transforms.replace_nans_in_coords
+             _partial_: true
+             fill_value: 0.0
+     foldseek_s50_val:
+       _target_: src.data.builders.family_text_memmap_datasets.ProteinFamilyMemmapDatasetBuilder
+       name: foldseek_s50_val
+       dataset_root: ${paths.data_dir}/foldseek/foldseek_s50_seq_only_text/train_test_split_v2/val_filtered
+       tokenizer: ${tokenizer}
+       seed: ${seed}
+       preprocessor:
+         _target_: src.data.processors.ProteinDocumentPreprocessor
+         cfg:
+           _target_: src.data.processors.PreprocessingConfig
+           document_token: '[RAW]'
+           drop_first_protein: false
+           keep_first_protein: false
+           allow_unk: false
+           max_tokens_per_example: 8192
+           shuffle_proteins_in_document: true
+           padding: do_not_pad
+           transform_fns:
+           - _target_: src.data.processors.transforms.replace_nans_in_coords
+             _partial_: true
+             fill_value: 0.0
+   data_weights:
+     foldseek_s50_train: 1
+     uniref90_train: 1
+     openfold_train: 1
+     funfams_s50_train: 0.03
+   val_dataset_batch_sizes:
+     funfams_s50_val: 1
+     proteingym: 1
+     foldseek_s50_val: 1
+     uniref90_val: 1
+   batch_size: 100
+   data_dir: ${paths.data_dir}
+   num_workers: 32
+   ignore_gaps: true
+   feature_names: ${constants.sequence_features}
+   pack_to_max_tokens: 52000
+   prefetch_factor: 4
+   shuffle: true
+   interleaved: true
+   interleaved_block_size: 1000
+   total_num_train_samples: null
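
Aside: a minimal sketch of inspecting this config outside a Hydra run (assumes omegaconf is installed and a local copy of the file; Hydra-only resolvers such as ${hydra:runtime.output_dir} will not resolve outside a run):

    # Load the training config and read a few values.
    from omegaconf import OmegaConf

    cfg = OmegaConf.load(".hydra/config.yaml")
    print(cfg.model.config.hidden_size)  # 1024
    print(cfg.model.config.vocab_size)   # 68, resolved from ${constants.vocab_size}
    print(OmegaConf.to_yaml(cfg.data.data_weights))
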
.hydra/gym_config.yaml ADDED
@@ -0,0 +1,242 @@
+ task_name: train
+ experiment_group: openfold_fs50_ur90_memmap_GYM_ONLY
+ tags:
+ - ${experiment_group}
+ train: true
+ test: false
+ ckpt_path: null
+ seed: 12345
+ float32_matmul_precision: high
+ model:
+   _target_: src.models.llama.LlamaLitModule
+   scheduler_name: constant_with_warmup
+   num_warmup_steps: 200
+   num_training_steps: 1000000
+   lr: 0.001
+   embed_coords: false
+   embed_sequence_index: false
+   embed_residue_index: false
+   use_kv_cache_for_scoring: true
+   max_seq_pos_in_doc: 1024
+   max_res_pos_in_seq: 4096
+   pass_res_pos_in_doc_as_position_ids: true
+   optimizer: adamw
+   scoring_max_tokens: 10240
+   config:
+     _target_: transformers.LlamaConfig
+     vocab_size: ${constants.vocab_size}
+     hidden_size: 1024
+     intermediate_size: 4096
+     num_attention_heads: 16
+     num_hidden_layers: 16
+     num_key_value_heads: 8
+     rope_theta: 500000
+     max_position_embeddings: 131072
+     scoring_max_tokens: 10240
+     attn_implementation: flash_attention_2
+     attention_bias: false
+     attention_dropout: 0.0
+     rms_norm_eps: 1.0e-05
+     hidden_act: silu
+     torch_dtype: bfloat16
+     use_cache: true
+     pretraining_tp: 1
+     rope_scaling:
+       factor: 32.0
+       high_freq_factor: 4.0
+       low_freq_factor: 1.0
+       original_max_position_embeddings: 32000
+       rope_type: llama3
+ callbacks:
+   throughput:
+     _target_: src.utils.callbacks.TokenThroughputMonitor
+   model_checkpoint:
+     _target_: lightning.pytorch.callbacks.ModelCheckpoint
+     dirpath: ${paths.output_dir}/checkpoints
+     filename: epoch_{epoch:03d}
+     monitor: val/loss
+     verbose: false
+     save_last: true
+     save_top_k: 1
+     mode: min
+     auto_insert_metric_name: false
+     save_weights_only: false
+     every_n_train_steps: null
+     train_time_interval: null
+     every_n_epochs: null
+     save_on_train_epoch_end: null
+   model_summary:
+     _target_: lightning.pytorch.callbacks.RichModelSummary
+     max_depth: -1
+   rich_progress_bar:
+     _target_: lightning.pytorch.callbacks.RichProgressBar
+   timer:
+     _target_: src.utils.callbacks.EpochTimerCallback
+   print:
+     _target_: src.utils.callbacks.PrintCallback
+   sample_counter:
+     _target_: src.utils.callbacks.SampleCounter
+ logger: null
+
+ trainer:
+   _target_: src.utils.trainer.ProFamTrainer
+   default_root_dir: ${paths.output_dir}
+   max_epochs: 10000
+   max_steps: -1
+   accelerator: gpu
+   devices: auto
+   check_val_every_n_epoch: 1
+   val_check_interval: 50000
+   target_tokens_per_batch: null
+   tokens_per_document: 30000
+   batch_size: ${data.batch_size}
+   deterministic: false
+   log_every_n_steps: 10
+   timeout: 120
+   profiler:
+     name: null
+     log_tensorboard: false
+     simple:
+       _target_: SimpleProfiler
+     advance:
+       _target_: AdvancedProfiler
+       filename: advanced_perf_logs
+       dirpath: ./profiler_logs
+     pytorch:
+       _target_: PyTorchProfiler
+       filename: pytorch_perf_logs
+       dirpath: ./profiler_logs
+       record_shapes: true
+       profile_memory: true
+       with_stack: true
+       with_flops: false
+       with_modules: false
+       acc_events: false
+   strategy: ddp
+   num_nodes: 1
+   sync_batchnorm: true
+   precision: bf16-true
+   min_epochs: 1000
+   accumulate_grad_batches: 2
+   use_distributed_sampler: false
+ paths:
+   root_dir: ${oc.env:PROJECT_ROOT}
+   data_dir: "../data"
+   log_dir: ${paths.root_dir}/logs/${experiment_group}
+   output_dir: ${hydra:runtime.output_dir}
+   work_dir: ${hydra:runtime.cwd}
+ extras:
+   ignore_warnings: false
+   enforce_tags: true
+   print_config: true
+ tokenizer:
+   _target_: src.data.tokenizers.ProFamTokenizer
+   tokenizer_file: data/profam_tokenizer.json
+   unk_token: '[UNK]'
+   pad_token: '[PAD]'
+   bos_token: '[start-of-document]'
+   sep_token: '[SEP]'
+   mask_token: '?'
+   seq_struct_sep_token: '|'
+   add_final_sep: true
+   add_bos_token: true
+   add_document_token: true
+   mask_below_plddt: 80
+   max_res_pos_in_seq: ${model.max_res_pos_in_seq}
+   embed_residue_index: ${model.embed_residue_index}
+ constants:
+   vocab_size: 68
+   gym_val_assay_list:
+   - BLAT_ECOLX_Jacquier_2013
+   - CALM1_HUMAN_Weile_2017
+   - DYR_ECOLI_Thompson_2019
+   - DLG4_RAT_McLaughlin_2012
+   - REV_HV1H2_Fernandes_2016
+   - TAT_HV1BR_Fernandes_2016
+   - RL40A_YEAST_Roscoe_2013
+   - P53_HUMAN_Giacomelli_2018_WT_Nutlin
+   sequence_features:
+   - ds_name
+   - identifier
+   - input_ids
+   - attention_mask
+   - original_size
+   - residue_index
+   - batch_size
+   structure_features:
+   - ds_name
+   - identifier
+   - input_ids
+   - attention_mask
+   - original_size
+   - residue_index
+   - coords
+   - coords_mask
+   - interleaved_coords_mask
+   - aa_mask
+   - plddts
+   - structure_mask
+ extra_callbacks:
+   unconditional_sampling_callback:
+     prompt_builder:
+       preprocessor:
+         cfg:
+           _target_: src.data.processors.PreprocessingConfig
+           document_token: '[RAW]'
+           drop_first_protein: false
+           keep_first_protein: false
+           allow_unk: false
+           max_tokens_per_example: 8192
+           shuffle_proteins_in_document: true
+           padding: do_not_pad
+           transform_fns:
+           - _target_: src.data.processors.transforms.replace_nans_in_coords
+             _partial_: true
+             fill_value: 0.0
+         _target_: src.data.processors.ProteinDocumentPreprocessor
+       _target_: src.models.inference.PromptBuilder
+     _target_: src.pipelines.callback.SamplingEvaluationPipelineCallback
+     pipeline:
+       _target_: src.pipelines.unconditional_sequence.UnconditionalSequenceEvaluationPipeline
+       num_generations: 5
+       max_tokens: 20000
+       max_generated_length: 300
+     pipeline_id: unconditional_sampling
+     save_results_to_file: false
+     evaluators:
+       _target_: src.evaluators.esmfold.ESMFoldSamplingEvaluator
+       name: esmfold_example
+ data:
+   _target_: src.data.datamodule.ProteinDataMixture
+   dataset_builders:
+     proteingym:
+       _target_: src.data.builders.proteingym.ProteinGymDataset
+       name: proteingym
+       dms_ids: ${constants.gym_val_assay_list}
+       seed: 42
+       max_mutated_sequences: null
+       mutant_bos_token: sep
+       keep_gaps: false
+       use_filtered_msa: true
+       extra_tokens_per_document: 2
+       use_msa_pos: false
+       num_proc: null
+       max_tokens_per_example: 7500
+       max_context_seqs: null
+       keep_wt: false
+       drop_wt: true
+   data_weights:
+     foldseek_s50_train: 1
+   val_dataset_batch_sizes:
+     proteingym: 1
+   batch_size: 100
+   data_dir: ${paths.data_dir}
+   num_workers: 32
+   ignore_gaps: true
+   feature_names: ${constants.sequence_features}
+   pack_to_max_tokens: 52000
+   prefetch_factor: 4
+   shuffle: true
+   interleaved: true
+   interleaved_block_size: 1000
+   total_num_train_samples: null
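
Aside: a hedged sketch of materialising the model.config node above as a transformers.LlamaConfig via Hydra's instantiate (assumes hydra-core and transformers are installed; non-standard keys such as scoring_max_tokens are simply stored as extra attributes by PretrainedConfig):

    # Build the LlamaConfig declared under model.config.
    from hydra.utils import instantiate
    from omegaconf import OmegaConf

    cfg = OmegaConf.load(".hydra/gym_config.yaml")
    llama_cfg = instantiate(cfg.model.config)  # a transformers.LlamaConfig
    print(llama_cfg.num_hidden_layers, llama_cfg.num_key_value_heads)  # 16 8
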
checkpoints/last.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e5bcec672ba8a3f5d98d817db3027b08c019752d6c3579977f7e001a2d65d350
+ size 1511190296
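
The .ckpt is stored as a Git LFS pointer (roughly 1.5 GB of weights). A hedged sketch of fetching and inspecting it ("<user>/<repo>" is again a placeholder; assumes huggingface_hub and torch are installed):

    # Download the LFS-tracked Lightning checkpoint and peek at its keys.
    import torch
    from huggingface_hub import hf_hub_download

    ckpt_path = hf_hub_download(repo_id="<user>/<repo>",
                                filename="checkpoints/last.ckpt")
    # weights_only=False: Lightning checkpoints hold more than bare tensors.
    ckpt = torch.load(ckpt_path, map_location="cpu", weights_only=False)
    print(list(ckpt.keys()))  # e.g. epoch, global_step, state_dict, ...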