| batch_size: 112 | |
| epochs: 3 | |
| stage: 1 | |
| unfrozen_ratio: 0.3 | |
| ckpt_weights_only: false | |
| checkpoint_dir: ./checkpoints/hydra_mark | |
| train_data_dir: ./data/train_shards1 | |
| val_data_dir: ./data/val_shards | |
| weights_path: ./models/hydra_hypernet_mark.pt | |
| shuffle: true | |
| use_early_stopping: false | |
| max_patience_counter: 5 | |
| min_delta: 0.01 | |
| use_gradient_clipping: true | |
| gradient_clipping_norm: 1.0 | |
| pad_length: 4096 | |
| learning_rate_mark: 0.0006 | |
| learning_rate_hydra: 3.0e-05 | |
| learning_rate_cls: 0.0001 | |
| no_cache: false | |
| num_workers: 8 | |
| matmul_precision: high | |
| multi_shot: false | |
| intervals: 3 | |
| is_prenorm: false | |
| accumulate_grad_batches: 2 | |
| cart: true | |
| cart_p: 0.45 | |
| cart_scale: 1.0 | |
| distillation: false | |
| lr_scheduler: | |
| type: cosine | |
| warmup_steps: 720 | |
| total_steps: 14400 | |
| min_lr_ratio: 0.1 | |
| polynomial: | |
| end_lr_ratio: 0.0 | |
| power: 1.0 | |
| plateau: | |
| factor: 0.5 | |
| patience: 3 | |
| min_lr: 1.0e-06 | |
| trainer: | |
| accelerator: gpu | |
| devices: -1 | |
| check_val_every_n_epoch: null | |
| num_sanity_val_steps: 0 | |
| accumulate_grad_batches: 1 | |
| precision: bf16-true | |
| enable_checkpointing: true | |
| default_root_dir: ./checkpoints/hydra_mark | |
| wandb: | |
| project: hydra-training_hypernet | |
| model_name: HydraForMaskedLM | |
| watch_log: all | |
| log_freq: 20 | |
| hydra_config: | |
| hidden_size: 768 | |
| vocab_size: 30522 | |
| type_vocab_size: 2 | |
| pad_token_id: 0 | |
| use_position_embeddings: false | |
| max_position_embeddings: 4096 | |
| use_timestep_embeddings: true | |
| layer_norm_eps: 1.0e-12 | |
| dropout: 0.0 | |
| max_timestep_embeddings: 1000 | |
| current_timestep: 0 | |
| d_state: 64 | |
| d_conv: 7 | |
| head_dim: 64 | |
| expand: 2 | |
| chunk_size: 256 | |
| is_prenorm: false | |
| use_eff_compute: false | |
| gradient_checkpointing: true | |
| num_hidden_layers: 23 | |
| guider_hidden_layers: 12 | |
| device: cpu | |
| pool_all: false | |
| mark_kernel: hypernet | |
| mark_ensemble: false | |
| rank: 2 | |
| degree: 5 | |
| L_timepoints: 256 | |
| n_freqs: 8 | |
| mark_mlp_dim: 256 | |
| hidden_act: swish | |
| initializer_range: 0.02 | |