Fix module paths: src.* -> profam.* (.hydra/config.yaml)
Browse files - .hydra/config.yaml +45 -45
.hydra/config.yaml
CHANGED
|
@@ -8,7 +8,7 @@ ckpt_path: null
|
|
| 8 |
seed: 12345
|
| 9 |
float32_matmul_precision: high
|
| 10 |
model:
|
| 11 |
-
_target_: src.models.llama.LlamaLitModule
|
| 12 |
scheduler_name: constant_with_warmup
|
| 13 |
num_warmup_steps: 200
|
| 14 |
num_training_steps: 1000000
|
|
@@ -48,7 +48,7 @@ model:
|
|
| 48 |
rope_type: llama3
|
| 49 |
callbacks:
|
| 50 |
throughput:
|
| 51 |
-
_target_: src.utils.callbacks.TokenThroughputMonitor
|
| 52 |
model_checkpoint:
|
| 53 |
_target_: lightning.pytorch.callbacks.ModelCheckpoint
|
| 54 |
dirpath: ${paths.output_dir}/checkpoints
|
|
@@ -70,14 +70,14 @@ callbacks:
|
|
| 70 |
rich_progress_bar:
|
| 71 |
_target_: lightning.pytorch.callbacks.RichProgressBar
|
| 72 |
timer:
|
| 73 |
-
_target_: src.utils.callbacks.EpochTimerCallback
|
| 74 |
print:
|
| 75 |
-
_target_: src.utils.callbacks.PrintCallback
|
| 76 |
sample_counter:
|
| 77 |
-
_target_: src.utils.callbacks.SampleCounter
|
| 78 |
logger:
|
| 79 |
wandb:
|
| 80 |
-
_target_: src.utils.loggers.WandbLogger
|
| 81 |
save_dir: ${paths.output_dir}
|
| 82 |
offline: false
|
| 83 |
id: null
|
|
@@ -93,7 +93,7 @@ logger:
|
|
| 93 |
log_hydra_config_file: true
|
| 94 |
log_git_hash: true
|
| 95 |
trainer:
|
| 96 |
-
_target_: src.utils.trainer.ProFamTrainer
|
| 97 |
default_root_dir: ${paths.output_dir}
|
| 98 |
max_epochs: 10000
|
| 99 |
max_steps: -1
|
|
@@ -144,7 +144,7 @@ extras:
|
|
| 144 |
enforce_tags: true
|
| 145 |
print_config: true
|
| 146 |
tokenizer:
|
| 147 |
-
_target_: src.data.tokenizers.ProFamTokenizer
|
| 148 |
tokenizer_file: data/profam_tokenizer.json
|
| 149 |
unk_token: '[UNK]'
|
| 150 |
pad_token: '[PAD]'
|
|
@@ -195,7 +195,7 @@ extra_callbacks:
|
|
| 195 |
prompt_builder:
|
| 196 |
preprocessor:
|
| 197 |
cfg:
|
| 198 |
-
_target_: src.data.processors.PreprocessingConfig
|
| 199 |
document_token: '[RAW]'
|
| 200 |
drop_first_protein: false
|
| 201 |
keep_first_protein: false
|
|
@@ -204,34 +204,34 @@ extra_callbacks:
|
|
| 204 |
shuffle_proteins_in_document: true
|
| 205 |
padding: do_not_pad
|
| 206 |
transform_fns:
|
| 207 |
-
- _target_: src.data.processors.transforms.replace_nans_in_coords
|
| 208 |
_partial_: true
|
| 209 |
fill_value: 0.0
|
| 210 |
-
_target_: src.data.processors.ProteinDocumentPreprocessor
|
| 211 |
-
_target_: src.models.inference.PromptBuilder
|
| 212 |
-
_target_: src.pipelines.callback.SamplingEvaluationPipelineCallback
|
| 213 |
pipeline:
|
| 214 |
-
_target_: src.pipelines.unconditional_sequence.UnconditionalSequenceEvaluationPipeline
|
| 215 |
num_generations: 5
|
| 216 |
max_tokens: 20000
|
| 217 |
max_generated_length: 300
|
| 218 |
pipeline_id: unconditional_sampling
|
| 219 |
save_results_to_file: false
|
| 220 |
evaluators:
|
| 221 |
-
_target_: src.evaluators.esmfold.ESMFoldSamplingEvaluator
|
| 222 |
name: esmfold_example
|
| 223 |
data:
|
| 224 |
-
_target_: src.data.datamodule.ProteinDataMixture
|
| 225 |
dataset_builders:
|
| 226 |
openfold_train:
|
| 227 |
-
_target_: src.data.builders.family_text_memmap_datasets.ProteinFamilyMemmapDatasetBuilder
|
| 228 |
name: openfold_train
|
| 229 |
dataset_root: ${paths.data_dir}/openfold/uniclust30_clustered_shuffled_final_text/train_test_split_v2/train_filtered
|
| 230 |
tokenizer: ${tokenizer}
|
| 231 |
preprocessor:
|
| 232 |
-
_target_: src.data.processors.ProteinDocumentPreprocessor
|
| 233 |
cfg:
|
| 234 |
-
_target_: src.data.processors.AlignedProteinPreprocessingConfig
|
| 235 |
document_token: '[RAW]'
|
| 236 |
drop_first_protein: false
|
| 237 |
keep_first_protein: false
|
|
@@ -244,11 +244,11 @@ data:
|
|
| 244 |
to_upper: true
|
| 245 |
use_msa_pos: false
|
| 246 |
transform_fns:
|
| 247 |
-
- _target_: src.data.processors.transforms.replace_nans_in_coords
|
| 248 |
_partial_: true
|
| 249 |
fill_value: 0.0
|
| 250 |
proteingym:
|
| 251 |
-
_target_: src.data.builders.proteingym.ProteinGymDataset
|
| 252 |
name: proteingym
|
| 253 |
dms_ids: ${constants.gym_val_assay_list}
|
| 254 |
seed: 42
|
|
@@ -264,15 +264,15 @@ data:
|
|
| 264 |
keep_wt: false
|
| 265 |
drop_wt: true
|
| 266 |
foldseek_s50_train:
|
| 267 |
-
_target_: src.data.builders.family_text_memmap_datasets.ProteinFamilyMemmapDatasetBuilder
|
| 268 |
name: foldseek_s50_train
|
| 269 |
dataset_root: ${paths.data_dir}/foldseek/foldseek_s50_seq_only_text/train_test_split_v2/train_filtered
|
| 270 |
tokenizer: ${tokenizer}
|
| 271 |
seed: ${seed}
|
| 272 |
preprocessor:
|
| 273 |
-
_target_: src.data.processors.ProteinDocumentPreprocessor
|
| 274 |
cfg:
|
| 275 |
-
_target_: src.data.processors.PreprocessingConfig
|
| 276 |
document_token: '[RAW]'
|
| 277 |
drop_first_protein: false
|
| 278 |
keep_first_protein: false
|
|
@@ -281,18 +281,18 @@ data:
|
|
| 281 |
shuffle_proteins_in_document: true
|
| 282 |
padding: do_not_pad
|
| 283 |
transform_fns:
|
| 284 |
-
- _target_: src.data.processors.transforms.replace_nans_in_coords
|
| 285 |
_partial_: true
|
| 286 |
fill_value: 0.0
|
| 287 |
uniref90_train:
|
| 288 |
-
_target_: src.data.builders.family_text_memmap_datasets.ProteinFamilyMemmapDatasetBuilder
|
| 289 |
name: uniref90_train
|
| 290 |
dataset_root: ${paths.data_dir}/uniref/uniref90_text_shuffled/train_test_split_v2/train_filtered
|
| 291 |
tokenizer: ${tokenizer}
|
| 292 |
preprocessor:
|
| 293 |
-
_target_: src.data.processors.ProteinDocumentPreprocessor
|
| 294 |
cfg:
|
| 295 |
-
_target_: src.data.processors.PreprocessingConfig
|
| 296 |
document_token: '[RAW]'
|
| 297 |
drop_first_protein: false
|
| 298 |
keep_first_protein: false
|
|
@@ -301,18 +301,18 @@ data:
|
|
| 301 |
shuffle_proteins_in_document: true
|
| 302 |
padding: do_not_pad
|
| 303 |
transform_fns:
|
| 304 |
-
- _target_: src.data.processors.transforms.replace_nans_in_coords
|
| 305 |
_partial_: true
|
| 306 |
fill_value: 0.0
|
| 307 |
uniref90_val:
|
| 308 |
-
_target_: src.data.builders.family_text_memmap_datasets.ProteinFamilyMemmapDatasetBuilder
|
| 309 |
name: uniref90_val
|
| 310 |
dataset_root: ${paths.data_dir}/uniref/uniref90_text_shuffled/train_test_split_v2/val_filtered
|
| 311 |
tokenizer: ${tokenizer}
|
| 312 |
preprocessor:
|
| 313 |
-
_target_: src.data.processors.ProteinDocumentPreprocessor
|
| 314 |
cfg:
|
| 315 |
-
_target_: src.data.processors.PreprocessingConfig
|
| 316 |
document_token: '[RAW]'
|
| 317 |
drop_first_protein: false
|
| 318 |
keep_first_protein: false
|
|
@@ -321,19 +321,19 @@ data:
|
|
| 321 |
shuffle_proteins_in_document: true
|
| 322 |
padding: do_not_pad
|
| 323 |
transform_fns:
|
| 324 |
-
- _target_: src.data.processors.transforms.replace_nans_in_coords
|
| 325 |
_partial_: true
|
| 326 |
fill_value: 0.0
|
| 327 |
funfams_s50_train:
|
| 328 |
-
_target_: src.data.builders.family_text_memmap_datasets.ProteinFamilyMemmapDatasetBuilder
|
| 329 |
name: funfams_s50_train
|
| 330 |
dataset_root: ${paths.data_dir}/funfams/s50_text/train_test_split_v2/train_filtered
|
| 331 |
tokenizer: ${tokenizer}
|
| 332 |
seed: ${seed}
|
| 333 |
preprocessor:
|
| 334 |
-
_target_: src.data.processors.ProteinDocumentPreprocessor
|
| 335 |
cfg:
|
| 336 |
-
_target_: src.data.processors.AlignedProteinPreprocessingConfig
|
| 337 |
document_token: '[RAW]'
|
| 338 |
drop_first_protein: false
|
| 339 |
keep_first_protein: false
|
|
@@ -346,18 +346,18 @@ data:
|
|
| 346 |
to_upper: true
|
| 347 |
use_msa_pos: false
|
| 348 |
transform_fns:
|
| 349 |
-
- _target_: src.data.processors.transforms.replace_nans_in_coords
|
| 350 |
_partial_: true
|
| 351 |
fill_value: 0.0
|
| 352 |
funfams_s50_val:
|
| 353 |
-
_target_: src.data.builders.family_text_memmap_datasets.ProteinFamilyMemmapDatasetBuilder
|
| 354 |
name: funfams_s50_val
|
| 355 |
dataset_root: ${paths.data_dir}/funfams/s50_text/train_test_split_v2/val_filtered
|
| 356 |
tokenizer: ${tokenizer}
|
| 357 |
preprocessor:
|
| 358 |
-
_target_: src.data.processors.ProteinDocumentPreprocessor
|
| 359 |
cfg:
|
| 360 |
-
_target_: src.data.processors.AlignedProteinPreprocessingConfig
|
| 361 |
document_token: '[RAW]'
|
| 362 |
drop_first_protein: false
|
| 363 |
keep_first_protein: false
|
|
@@ -370,19 +370,19 @@ data:
|
|
| 370 |
to_upper: true
|
| 371 |
use_msa_pos: false
|
| 372 |
transform_fns:
|
| 373 |
-
- _target_: src.data.processors.transforms.replace_nans_in_coords
|
| 374 |
_partial_: true
|
| 375 |
fill_value: 0.0
|
| 376 |
foldseek_s50_val:
|
| 377 |
-
_target_: src.data.builders.family_text_memmap_datasets.ProteinFamilyMemmapDatasetBuilder
|
| 378 |
name: foldseek_s50_val
|
| 379 |
dataset_root: ${paths.data_dir}/foldseek/foldseek_s50_seq_only_text/train_test_split_v2/val_filtered
|
| 380 |
tokenizer: ${tokenizer}
|
| 381 |
seed: ${seed}
|
| 382 |
preprocessor:
|
| 383 |
-
_target_: src.data.processors.ProteinDocumentPreprocessor
|
| 384 |
cfg:
|
| 385 |
-
_target_: src.data.processors.PreprocessingConfig
|
| 386 |
document_token: '[RAW]'
|
| 387 |
drop_first_protein: false
|
| 388 |
keep_first_protein: false
|
|
@@ -391,7 +391,7 @@ data:
|
|
| 391 |
shuffle_proteins_in_document: true
|
| 392 |
padding: do_not_pad
|
| 393 |
transform_fns:
|
| 394 |
-
- _target_: src.data.processors.transforms.replace_nans_in_coords
|
| 395 |
_partial_: true
|
| 396 |
fill_value: 0.0
|
| 397 |
data_weights:
|
|
|
|
| 8 |
seed: 12345
|
| 9 |
float32_matmul_precision: high
|
| 10 |
model:
|
| 11 |
+
_target_: profam.models.llama.LlamaLitModule
|
| 12 |
scheduler_name: constant_with_warmup
|
| 13 |
num_warmup_steps: 200
|
| 14 |
num_training_steps: 1000000
|
|
|
|
| 48 |
rope_type: llama3
|
| 49 |
callbacks:
|
| 50 |
throughput:
|
| 51 |
+
_target_: profam.utils.callbacks.TokenThroughputMonitor
|
| 52 |
model_checkpoint:
|
| 53 |
_target_: lightning.pytorch.callbacks.ModelCheckpoint
|
| 54 |
dirpath: ${paths.output_dir}/checkpoints
|
|
|
|
| 70 |
rich_progress_bar:
|
| 71 |
_target_: lightning.pytorch.callbacks.RichProgressBar
|
| 72 |
timer:
|
| 73 |
+
_target_: profam.utils.callbacks.EpochTimerCallback
|
| 74 |
print:
|
| 75 |
+
_target_: profam.utils.callbacks.PrintCallback
|
| 76 |
sample_counter:
|
| 77 |
+
_target_: profam.utils.callbacks.SampleCounter
|
| 78 |
logger:
|
| 79 |
wandb:
|
| 80 |
+
_target_: profam.utils.loggers.WandbLogger
|
| 81 |
save_dir: ${paths.output_dir}
|
| 82 |
offline: false
|
| 83 |
id: null
|
|
|
|
| 93 |
log_hydra_config_file: true
|
| 94 |
log_git_hash: true
|
| 95 |
trainer:
|
| 96 |
+
_target_: profam.utils.trainer.ProFamTrainer
|
| 97 |
default_root_dir: ${paths.output_dir}
|
| 98 |
max_epochs: 10000
|
| 99 |
max_steps: -1
|
|
|
|
| 144 |
enforce_tags: true
|
| 145 |
print_config: true
|
| 146 |
tokenizer:
|
| 147 |
+
_target_: profam.data.tokenizers.ProFamTokenizer
|
| 148 |
tokenizer_file: data/profam_tokenizer.json
|
| 149 |
unk_token: '[UNK]'
|
| 150 |
pad_token: '[PAD]'
|
|
|
|
| 195 |
prompt_builder:
|
| 196 |
preprocessor:
|
| 197 |
cfg:
|
| 198 |
+
_target_: profam.data.processors.PreprocessingConfig
|
| 199 |
document_token: '[RAW]'
|
| 200 |
drop_first_protein: false
|
| 201 |
keep_first_protein: false
|
|
|
|
| 204 |
shuffle_proteins_in_document: true
|
| 205 |
padding: do_not_pad
|
| 206 |
transform_fns:
|
| 207 |
+
- _target_: profam.data.processors.transforms.replace_nans_in_coords
|
| 208 |
_partial_: true
|
| 209 |
fill_value: 0.0
|
| 210 |
+
_target_: profam.data.processors.ProteinDocumentPreprocessor
|
| 211 |
+
_target_: profam.models.inference.PromptBuilder
|
| 212 |
+
_target_: profam.pipelines.callback.SamplingEvaluationPipelineCallback
|
| 213 |
pipeline:
|
| 214 |
+
_target_: profam.pipelines.unconditional_sequence.UnconditionalSequenceEvaluationPipeline
|
| 215 |
num_generations: 5
|
| 216 |
max_tokens: 20000
|
| 217 |
max_generated_length: 300
|
| 218 |
pipeline_id: unconditional_sampling
|
| 219 |
save_results_to_file: false
|
| 220 |
evaluators:
|
| 221 |
+
_target_: profam.evaluators.esmfold.ESMFoldSamplingEvaluator
|
| 222 |
name: esmfold_example
|
| 223 |
data:
|
| 224 |
+
_target_: profam.data.datamodule.ProteinDataMixture
|
| 225 |
dataset_builders:
|
| 226 |
openfold_train:
|
| 227 |
+
_target_: profam.data.builders.family_text_memmap_datasets.ProteinFamilyMemmapDatasetBuilder
|
| 228 |
name: openfold_train
|
| 229 |
dataset_root: ${paths.data_dir}/openfold/uniclust30_clustered_shuffled_final_text/train_test_split_v2/train_filtered
|
| 230 |
tokenizer: ${tokenizer}
|
| 231 |
preprocessor:
|
| 232 |
+
_target_: profam.data.processors.ProteinDocumentPreprocessor
|
| 233 |
cfg:
|
| 234 |
+
_target_: profam.data.processors.AlignedProteinPreprocessingConfig
|
| 235 |
document_token: '[RAW]'
|
| 236 |
drop_first_protein: false
|
| 237 |
keep_first_protein: false
|
|
|
|
| 244 |
to_upper: true
|
| 245 |
use_msa_pos: false
|
| 246 |
transform_fns:
|
| 247 |
+
- _target_: profam.data.processors.transforms.replace_nans_in_coords
|
| 248 |
_partial_: true
|
| 249 |
fill_value: 0.0
|
| 250 |
proteingym:
|
| 251 |
+
_target_: profam.data.builders.proteingym.ProteinGymDataset
|
| 252 |
name: proteingym
|
| 253 |
dms_ids: ${constants.gym_val_assay_list}
|
| 254 |
seed: 42
|
|
|
|
| 264 |
keep_wt: false
|
| 265 |
drop_wt: true
|
| 266 |
foldseek_s50_train:
|
| 267 |
+
_target_: profam.data.builders.family_text_memmap_datasets.ProteinFamilyMemmapDatasetBuilder
|
| 268 |
name: foldseek_s50_train
|
| 269 |
dataset_root: ${paths.data_dir}/foldseek/foldseek_s50_seq_only_text/train_test_split_v2/train_filtered
|
| 270 |
tokenizer: ${tokenizer}
|
| 271 |
seed: ${seed}
|
| 272 |
preprocessor:
|
| 273 |
+
_target_: profam.data.processors.ProteinDocumentPreprocessor
|
| 274 |
cfg:
|
| 275 |
+
_target_: profam.data.processors.PreprocessingConfig
|
| 276 |
document_token: '[RAW]'
|
| 277 |
drop_first_protein: false
|
| 278 |
keep_first_protein: false
|
|
|
|
| 281 |
shuffle_proteins_in_document: true
|
| 282 |
padding: do_not_pad
|
| 283 |
transform_fns:
|
| 284 |
+
- _target_: profam.data.processors.transforms.replace_nans_in_coords
|
| 285 |
_partial_: true
|
| 286 |
fill_value: 0.0
|
| 287 |
uniref90_train:
|
| 288 |
+
_target_: profam.data.builders.family_text_memmap_datasets.ProteinFamilyMemmapDatasetBuilder
|
| 289 |
name: uniref90_train
|
| 290 |
dataset_root: ${paths.data_dir}/uniref/uniref90_text_shuffled/train_test_split_v2/train_filtered
|
| 291 |
tokenizer: ${tokenizer}
|
| 292 |
preprocessor:
|
| 293 |
+
_target_: profam.data.processors.ProteinDocumentPreprocessor
|
| 294 |
cfg:
|
| 295 |
+
_target_: profam.data.processors.PreprocessingConfig
|
| 296 |
document_token: '[RAW]'
|
| 297 |
drop_first_protein: false
|
| 298 |
keep_first_protein: false
|
|
|
|
| 301 |
shuffle_proteins_in_document: true
|
| 302 |
padding: do_not_pad
|
| 303 |
transform_fns:
|
| 304 |
+
- _target_: profam.data.processors.transforms.replace_nans_in_coords
|
| 305 |
_partial_: true
|
| 306 |
fill_value: 0.0
|
| 307 |
uniref90_val:
|
| 308 |
+
_target_: profam.data.builders.family_text_memmap_datasets.ProteinFamilyMemmapDatasetBuilder
|
| 309 |
name: uniref90_val
|
| 310 |
dataset_root: ${paths.data_dir}/uniref/uniref90_text_shuffled/train_test_split_v2/val_filtered
|
| 311 |
tokenizer: ${tokenizer}
|
| 312 |
preprocessor:
|
| 313 |
+
_target_: profam.data.processors.ProteinDocumentPreprocessor
|
| 314 |
cfg:
|
| 315 |
+
_target_: profam.data.processors.PreprocessingConfig
|
| 316 |
document_token: '[RAW]'
|
| 317 |
drop_first_protein: false
|
| 318 |
keep_first_protein: false
|
|
|
|
| 321 |
shuffle_proteins_in_document: true
|
| 322 |
padding: do_not_pad
|
| 323 |
transform_fns:
|
| 324 |
+
- _target_: profam.data.processors.transforms.replace_nans_in_coords
|
| 325 |
_partial_: true
|
| 326 |
fill_value: 0.0
|
| 327 |
funfams_s50_train:
|
| 328 |
+
_target_: profam.data.builders.family_text_memmap_datasets.ProteinFamilyMemmapDatasetBuilder
|
| 329 |
name: funfams_s50_train
|
| 330 |
dataset_root: ${paths.data_dir}/funfams/s50_text/train_test_split_v2/train_filtered
|
| 331 |
tokenizer: ${tokenizer}
|
| 332 |
seed: ${seed}
|
| 333 |
preprocessor:
|
| 334 |
+
_target_: profam.data.processors.ProteinDocumentPreprocessor
|
| 335 |
cfg:
|
| 336 |
+
_target_: profam.data.processors.AlignedProteinPreprocessingConfig
|
| 337 |
document_token: '[RAW]'
|
| 338 |
drop_first_protein: false
|
| 339 |
keep_first_protein: false
|
|
|
|
| 346 |
to_upper: true
|
| 347 |
use_msa_pos: false
|
| 348 |
transform_fns:
|
| 349 |
+
- _target_: profam.data.processors.transforms.replace_nans_in_coords
|
| 350 |
_partial_: true
|
| 351 |
fill_value: 0.0
|
| 352 |
funfams_s50_val:
|
| 353 |
+
_target_: profam.data.builders.family_text_memmap_datasets.ProteinFamilyMemmapDatasetBuilder
|
| 354 |
name: funfams_s50_val
|
| 355 |
dataset_root: ${paths.data_dir}/funfams/s50_text/train_test_split_v2/val_filtered
|
| 356 |
tokenizer: ${tokenizer}
|
| 357 |
preprocessor:
|
| 358 |
+
_target_: profam.data.processors.ProteinDocumentPreprocessor
|
| 359 |
cfg:
|
| 360 |
+
_target_: profam.data.processors.AlignedProteinPreprocessingConfig
|
| 361 |
document_token: '[RAW]'
|
| 362 |
drop_first_protein: false
|
| 363 |
keep_first_protein: false
|
|
|
|
| 370 |
to_upper: true
|
| 371 |
use_msa_pos: false
|
| 372 |
transform_fns:
|
| 373 |
+
- _target_: profam.data.processors.transforms.replace_nans_in_coords
|
| 374 |
_partial_: true
|
| 375 |
fill_value: 0.0
|
| 376 |
foldseek_s50_val:
|
| 377 |
+
_target_: profam.data.builders.family_text_memmap_datasets.ProteinFamilyMemmapDatasetBuilder
|
| 378 |
name: foldseek_s50_val
|
| 379 |
dataset_root: ${paths.data_dir}/foldseek/foldseek_s50_seq_only_text/train_test_split_v2/val_filtered
|
| 380 |
tokenizer: ${tokenizer}
|
| 381 |
seed: ${seed}
|
| 382 |
preprocessor:
|
| 383 |
+
_target_: profam.data.processors.ProteinDocumentPreprocessor
|
| 384 |
cfg:
|
| 385 |
+
_target_: profam.data.processors.PreprocessingConfig
|
| 386 |
document_token: '[RAW]'
|
| 387 |
drop_first_protein: false
|
| 388 |
keep_first_protein: false
|
|
|
|
| 391 |
shuffle_proteins_in_document: true
|
| 392 |
padding: do_not_pad
|
| 393 |
transform_fns:
|
| 394 |
+
- _target_: profam.data.processors.transforms.replace_nans_in_coords
|
| 395 |
_partial_: true
|
| 396 |
fill_value: 0.0
|
| 397 |
data_weights:
|