judewells committed on
Commit
2cc62d2
·
verified ·
1 Parent(s): 2beaee8

Fix module paths: src.* -> profam.* (.hydra/config.yaml)

Browse files
Files changed (1) hide show
  1. .hydra/config.yaml +45 -45
.hydra/config.yaml CHANGED
@@ -8,7 +8,7 @@ ckpt_path: null
8
  seed: 12345
9
  float32_matmul_precision: high
10
  model:
11
- _target_: src.models.llama.LlamaLitModule
12
  scheduler_name: constant_with_warmup
13
  num_warmup_steps: 200
14
  num_training_steps: 1000000
@@ -48,7 +48,7 @@ model:
48
  rope_type: llama3
49
  callbacks:
50
  throughput:
51
- _target_: src.utils.callbacks.TokenThroughputMonitor
52
  model_checkpoint:
53
  _target_: lightning.pytorch.callbacks.ModelCheckpoint
54
  dirpath: ${paths.output_dir}/checkpoints
@@ -70,14 +70,14 @@ callbacks:
70
  rich_progress_bar:
71
  _target_: lightning.pytorch.callbacks.RichProgressBar
72
  timer:
73
- _target_: src.utils.callbacks.EpochTimerCallback
74
  print:
75
- _target_: src.utils.callbacks.PrintCallback
76
  sample_counter:
77
- _target_: src.utils.callbacks.SampleCounter
78
  logger:
79
  wandb:
80
- _target_: src.utils.loggers.WandbLogger
81
  save_dir: ${paths.output_dir}
82
  offline: false
83
  id: null
@@ -93,7 +93,7 @@ logger:
93
  log_hydra_config_file: true
94
  log_git_hash: true
95
  trainer:
96
- _target_: src.utils.trainer.ProFamTrainer
97
  default_root_dir: ${paths.output_dir}
98
  max_epochs: 10000
99
  max_steps: -1
@@ -144,7 +144,7 @@ extras:
144
  enforce_tags: true
145
  print_config: true
146
  tokenizer:
147
- _target_: src.data.tokenizers.ProFamTokenizer
148
  tokenizer_file: data/profam_tokenizer.json
149
  unk_token: '[UNK]'
150
  pad_token: '[PAD]'
@@ -195,7 +195,7 @@ extra_callbacks:
195
  prompt_builder:
196
  preprocessor:
197
  cfg:
198
- _target_: src.data.processors.PreprocessingConfig
199
  document_token: '[RAW]'
200
  drop_first_protein: false
201
  keep_first_protein: false
@@ -204,34 +204,34 @@ extra_callbacks:
204
  shuffle_proteins_in_document: true
205
  padding: do_not_pad
206
  transform_fns:
207
- - _target_: src.data.processors.transforms.replace_nans_in_coords
208
  _partial_: true
209
  fill_value: 0.0
210
- _target_: src.data.processors.ProteinDocumentPreprocessor
211
- _target_: src.models.inference.PromptBuilder
212
- _target_: src.pipelines.callback.SamplingEvaluationPipelineCallback
213
  pipeline:
214
- _target_: src.pipelines.unconditional_sequence.UnconditionalSequenceEvaluationPipeline
215
  num_generations: 5
216
  max_tokens: 20000
217
  max_generated_length: 300
218
  pipeline_id: unconditional_sampling
219
  save_results_to_file: false
220
  evaluators:
221
- _target_: src.evaluators.esmfold.ESMFoldSamplingEvaluator
222
  name: esmfold_example
223
  data:
224
- _target_: src.data.datamodule.ProteinDataMixture
225
  dataset_builders:
226
  openfold_train:
227
- _target_: src.data.builders.family_text_memmap_datasets.ProteinFamilyMemmapDatasetBuilder
228
  name: openfold_train
229
  dataset_root: ${paths.data_dir}/openfold/uniclust30_clustered_shuffled_final_text/train_test_split_v2/train_filtered
230
  tokenizer: ${tokenizer}
231
  preprocessor:
232
- _target_: src.data.processors.ProteinDocumentPreprocessor
233
  cfg:
234
- _target_: src.data.processors.AlignedProteinPreprocessingConfig
235
  document_token: '[RAW]'
236
  drop_first_protein: false
237
  keep_first_protein: false
@@ -244,11 +244,11 @@ data:
244
  to_upper: true
245
  use_msa_pos: false
246
  transform_fns:
247
- - _target_: src.data.processors.transforms.replace_nans_in_coords
248
  _partial_: true
249
  fill_value: 0.0
250
  proteingym:
251
- _target_: src.data.builders.proteingym.ProteinGymDataset
252
  name: proteingym
253
  dms_ids: ${constants.gym_val_assay_list}
254
  seed: 42
@@ -264,15 +264,15 @@ data:
264
  keep_wt: false
265
  drop_wt: true
266
  foldseek_s50_train:
267
- _target_: src.data.builders.family_text_memmap_datasets.ProteinFamilyMemmapDatasetBuilder
268
  name: foldseek_s50_train
269
  dataset_root: ${paths.data_dir}/foldseek/foldseek_s50_seq_only_text/train_test_split_v2/train_filtered
270
  tokenizer: ${tokenizer}
271
  seed: ${seed}
272
  preprocessor:
273
- _target_: src.data.processors.ProteinDocumentPreprocessor
274
  cfg:
275
- _target_: src.data.processors.PreprocessingConfig
276
  document_token: '[RAW]'
277
  drop_first_protein: false
278
  keep_first_protein: false
@@ -281,18 +281,18 @@ data:
281
  shuffle_proteins_in_document: true
282
  padding: do_not_pad
283
  transform_fns:
284
- - _target_: src.data.processors.transforms.replace_nans_in_coords
285
  _partial_: true
286
  fill_value: 0.0
287
  uniref90_train:
288
- _target_: src.data.builders.family_text_memmap_datasets.ProteinFamilyMemmapDatasetBuilder
289
  name: uniref90_train
290
  dataset_root: ${paths.data_dir}/uniref/uniref90_text_shuffled/train_test_split_v2/train_filtered
291
  tokenizer: ${tokenizer}
292
  preprocessor:
293
- _target_: src.data.processors.ProteinDocumentPreprocessor
294
  cfg:
295
- _target_: src.data.processors.PreprocessingConfig
296
  document_token: '[RAW]'
297
  drop_first_protein: false
298
  keep_first_protein: false
@@ -301,18 +301,18 @@ data:
301
  shuffle_proteins_in_document: true
302
  padding: do_not_pad
303
  transform_fns:
304
- - _target_: src.data.processors.transforms.replace_nans_in_coords
305
  _partial_: true
306
  fill_value: 0.0
307
  uniref90_val:
308
- _target_: src.data.builders.family_text_memmap_datasets.ProteinFamilyMemmapDatasetBuilder
309
  name: uniref90_val
310
  dataset_root: ${paths.data_dir}/uniref/uniref90_text_shuffled/train_test_split_v2/val_filtered
311
  tokenizer: ${tokenizer}
312
  preprocessor:
313
- _target_: src.data.processors.ProteinDocumentPreprocessor
314
  cfg:
315
- _target_: src.data.processors.PreprocessingConfig
316
  document_token: '[RAW]'
317
  drop_first_protein: false
318
  keep_first_protein: false
@@ -321,19 +321,19 @@ data:
321
  shuffle_proteins_in_document: true
322
  padding: do_not_pad
323
  transform_fns:
324
- - _target_: src.data.processors.transforms.replace_nans_in_coords
325
  _partial_: true
326
  fill_value: 0.0
327
  funfams_s50_train:
328
- _target_: src.data.builders.family_text_memmap_datasets.ProteinFamilyMemmapDatasetBuilder
329
  name: funfams_s50_train
330
  dataset_root: ${paths.data_dir}/funfams/s50_text/train_test_split_v2/train_filtered
331
  tokenizer: ${tokenizer}
332
  seed: ${seed}
333
  preprocessor:
334
- _target_: src.data.processors.ProteinDocumentPreprocessor
335
  cfg:
336
- _target_: src.data.processors.AlignedProteinPreprocessingConfig
337
  document_token: '[RAW]'
338
  drop_first_protein: false
339
  keep_first_protein: false
@@ -346,18 +346,18 @@ data:
346
  to_upper: true
347
  use_msa_pos: false
348
  transform_fns:
349
- - _target_: src.data.processors.transforms.replace_nans_in_coords
350
  _partial_: true
351
  fill_value: 0.0
352
  funfams_s50_val:
353
- _target_: src.data.builders.family_text_memmap_datasets.ProteinFamilyMemmapDatasetBuilder
354
  name: funfams_s50_val
355
  dataset_root: ${paths.data_dir}/funfams/s50_text/train_test_split_v2/val_filtered
356
  tokenizer: ${tokenizer}
357
  preprocessor:
358
- _target_: src.data.processors.ProteinDocumentPreprocessor
359
  cfg:
360
- _target_: src.data.processors.AlignedProteinPreprocessingConfig
361
  document_token: '[RAW]'
362
  drop_first_protein: false
363
  keep_first_protein: false
@@ -370,19 +370,19 @@ data:
370
  to_upper: true
371
  use_msa_pos: false
372
  transform_fns:
373
- - _target_: src.data.processors.transforms.replace_nans_in_coords
374
  _partial_: true
375
  fill_value: 0.0
376
  foldseek_s50_val:
377
- _target_: src.data.builders.family_text_memmap_datasets.ProteinFamilyMemmapDatasetBuilder
378
  name: foldseek_s50_val
379
  dataset_root: ${paths.data_dir}/foldseek/foldseek_s50_seq_only_text/train_test_split_v2/val_filtered
380
  tokenizer: ${tokenizer}
381
  seed: ${seed}
382
  preprocessor:
383
- _target_: src.data.processors.ProteinDocumentPreprocessor
384
  cfg:
385
- _target_: src.data.processors.PreprocessingConfig
386
  document_token: '[RAW]'
387
  drop_first_protein: false
388
  keep_first_protein: false
@@ -391,7 +391,7 @@ data:
391
  shuffle_proteins_in_document: true
392
  padding: do_not_pad
393
  transform_fns:
394
- - _target_: src.data.processors.transforms.replace_nans_in_coords
395
  _partial_: true
396
  fill_value: 0.0
397
  data_weights:
 
8
  seed: 12345
9
  float32_matmul_precision: high
10
  model:
11
+ _target_: profam.models.llama.LlamaLitModule
12
  scheduler_name: constant_with_warmup
13
  num_warmup_steps: 200
14
  num_training_steps: 1000000
 
48
  rope_type: llama3
49
  callbacks:
50
  throughput:
51
+ _target_: profam.utils.callbacks.TokenThroughputMonitor
52
  model_checkpoint:
53
  _target_: lightning.pytorch.callbacks.ModelCheckpoint
54
  dirpath: ${paths.output_dir}/checkpoints
 
70
  rich_progress_bar:
71
  _target_: lightning.pytorch.callbacks.RichProgressBar
72
  timer:
73
+ _target_: profam.utils.callbacks.EpochTimerCallback
74
  print:
75
+ _target_: profam.utils.callbacks.PrintCallback
76
  sample_counter:
77
+ _target_: profam.utils.callbacks.SampleCounter
78
  logger:
79
  wandb:
80
+ _target_: profam.utils.loggers.WandbLogger
81
  save_dir: ${paths.output_dir}
82
  offline: false
83
  id: null
 
93
  log_hydra_config_file: true
94
  log_git_hash: true
95
  trainer:
96
+ _target_: profam.utils.trainer.ProFamTrainer
97
  default_root_dir: ${paths.output_dir}
98
  max_epochs: 10000
99
  max_steps: -1
 
144
  enforce_tags: true
145
  print_config: true
146
  tokenizer:
147
+ _target_: profam.data.tokenizers.ProFamTokenizer
148
  tokenizer_file: data/profam_tokenizer.json
149
  unk_token: '[UNK]'
150
  pad_token: '[PAD]'
 
195
  prompt_builder:
196
  preprocessor:
197
  cfg:
198
+ _target_: profam.data.processors.PreprocessingConfig
199
  document_token: '[RAW]'
200
  drop_first_protein: false
201
  keep_first_protein: false
 
204
  shuffle_proteins_in_document: true
205
  padding: do_not_pad
206
  transform_fns:
207
+ - _target_: profam.data.processors.transforms.replace_nans_in_coords
208
  _partial_: true
209
  fill_value: 0.0
210
+ _target_: profam.data.processors.ProteinDocumentPreprocessor
211
+ _target_: profam.models.inference.PromptBuilder
212
+ _target_: profam.pipelines.callback.SamplingEvaluationPipelineCallback
213
  pipeline:
214
+ _target_: profam.pipelines.unconditional_sequence.UnconditionalSequenceEvaluationPipeline
215
  num_generations: 5
216
  max_tokens: 20000
217
  max_generated_length: 300
218
  pipeline_id: unconditional_sampling
219
  save_results_to_file: false
220
  evaluators:
221
+ _target_: profam.evaluators.esmfold.ESMFoldSamplingEvaluator
222
  name: esmfold_example
223
  data:
224
+ _target_: profam.data.datamodule.ProteinDataMixture
225
  dataset_builders:
226
  openfold_train:
227
+ _target_: profam.data.builders.family_text_memmap_datasets.ProteinFamilyMemmapDatasetBuilder
228
  name: openfold_train
229
  dataset_root: ${paths.data_dir}/openfold/uniclust30_clustered_shuffled_final_text/train_test_split_v2/train_filtered
230
  tokenizer: ${tokenizer}
231
  preprocessor:
232
+ _target_: profam.data.processors.ProteinDocumentPreprocessor
233
  cfg:
234
+ _target_: profam.data.processors.AlignedProteinPreprocessingConfig
235
  document_token: '[RAW]'
236
  drop_first_protein: false
237
  keep_first_protein: false
 
244
  to_upper: true
245
  use_msa_pos: false
246
  transform_fns:
247
+ - _target_: profam.data.processors.transforms.replace_nans_in_coords
248
  _partial_: true
249
  fill_value: 0.0
250
  proteingym:
251
+ _target_: profam.data.builders.proteingym.ProteinGymDataset
252
  name: proteingym
253
  dms_ids: ${constants.gym_val_assay_list}
254
  seed: 42
 
264
  keep_wt: false
265
  drop_wt: true
266
  foldseek_s50_train:
267
+ _target_: profam.data.builders.family_text_memmap_datasets.ProteinFamilyMemmapDatasetBuilder
268
  name: foldseek_s50_train
269
  dataset_root: ${paths.data_dir}/foldseek/foldseek_s50_seq_only_text/train_test_split_v2/train_filtered
270
  tokenizer: ${tokenizer}
271
  seed: ${seed}
272
  preprocessor:
273
+ _target_: profam.data.processors.ProteinDocumentPreprocessor
274
  cfg:
275
+ _target_: profam.data.processors.PreprocessingConfig
276
  document_token: '[RAW]'
277
  drop_first_protein: false
278
  keep_first_protein: false
 
281
  shuffle_proteins_in_document: true
282
  padding: do_not_pad
283
  transform_fns:
284
+ - _target_: profam.data.processors.transforms.replace_nans_in_coords
285
  _partial_: true
286
  fill_value: 0.0
287
  uniref90_train:
288
+ _target_: profam.data.builders.family_text_memmap_datasets.ProteinFamilyMemmapDatasetBuilder
289
  name: uniref90_train
290
  dataset_root: ${paths.data_dir}/uniref/uniref90_text_shuffled/train_test_split_v2/train_filtered
291
  tokenizer: ${tokenizer}
292
  preprocessor:
293
+ _target_: profam.data.processors.ProteinDocumentPreprocessor
294
  cfg:
295
+ _target_: profam.data.processors.PreprocessingConfig
296
  document_token: '[RAW]'
297
  drop_first_protein: false
298
  keep_first_protein: false
 
301
  shuffle_proteins_in_document: true
302
  padding: do_not_pad
303
  transform_fns:
304
+ - _target_: profam.data.processors.transforms.replace_nans_in_coords
305
  _partial_: true
306
  fill_value: 0.0
307
  uniref90_val:
308
+ _target_: profam.data.builders.family_text_memmap_datasets.ProteinFamilyMemmapDatasetBuilder
309
  name: uniref90_val
310
  dataset_root: ${paths.data_dir}/uniref/uniref90_text_shuffled/train_test_split_v2/val_filtered
311
  tokenizer: ${tokenizer}
312
  preprocessor:
313
+ _target_: profam.data.processors.ProteinDocumentPreprocessor
314
  cfg:
315
+ _target_: profam.data.processors.PreprocessingConfig
316
  document_token: '[RAW]'
317
  drop_first_protein: false
318
  keep_first_protein: false
 
321
  shuffle_proteins_in_document: true
322
  padding: do_not_pad
323
  transform_fns:
324
+ - _target_: profam.data.processors.transforms.replace_nans_in_coords
325
  _partial_: true
326
  fill_value: 0.0
327
  funfams_s50_train:
328
+ _target_: profam.data.builders.family_text_memmap_datasets.ProteinFamilyMemmapDatasetBuilder
329
  name: funfams_s50_train
330
  dataset_root: ${paths.data_dir}/funfams/s50_text/train_test_split_v2/train_filtered
331
  tokenizer: ${tokenizer}
332
  seed: ${seed}
333
  preprocessor:
334
+ _target_: profam.data.processors.ProteinDocumentPreprocessor
335
  cfg:
336
+ _target_: profam.data.processors.AlignedProteinPreprocessingConfig
337
  document_token: '[RAW]'
338
  drop_first_protein: false
339
  keep_first_protein: false
 
346
  to_upper: true
347
  use_msa_pos: false
348
  transform_fns:
349
+ - _target_: profam.data.processors.transforms.replace_nans_in_coords
350
  _partial_: true
351
  fill_value: 0.0
352
  funfams_s50_val:
353
+ _target_: profam.data.builders.family_text_memmap_datasets.ProteinFamilyMemmapDatasetBuilder
354
  name: funfams_s50_val
355
  dataset_root: ${paths.data_dir}/funfams/s50_text/train_test_split_v2/val_filtered
356
  tokenizer: ${tokenizer}
357
  preprocessor:
358
+ _target_: profam.data.processors.ProteinDocumentPreprocessor
359
  cfg:
360
+ _target_: profam.data.processors.AlignedProteinPreprocessingConfig
361
  document_token: '[RAW]'
362
  drop_first_protein: false
363
  keep_first_protein: false
 
370
  to_upper: true
371
  use_msa_pos: false
372
  transform_fns:
373
+ - _target_: profam.data.processors.transforms.replace_nans_in_coords
374
  _partial_: true
375
  fill_value: 0.0
376
  foldseek_s50_val:
377
+ _target_: profam.data.builders.family_text_memmap_datasets.ProteinFamilyMemmapDatasetBuilder
378
  name: foldseek_s50_val
379
  dataset_root: ${paths.data_dir}/foldseek/foldseek_s50_seq_only_text/train_test_split_v2/val_filtered
380
  tokenizer: ${tokenizer}
381
  seed: ${seed}
382
  preprocessor:
383
+ _target_: profam.data.processors.ProteinDocumentPreprocessor
384
  cfg:
385
+ _target_: profam.data.processors.PreprocessingConfig
386
  document_token: '[RAW]'
387
  drop_first_protein: false
388
  keep_first_protein: false
 
391
  shuffle_proteins_in_document: true
392
  padding: do_not_pad
393
  transform_fns:
394
+ - _target_: profam.data.processors.transforms.replace_nans_in_coords
395
  _partial_: true
396
  fill_value: 0.0
397
  data_weights: