---
# Training configuration.
# NOTE(review): this file had been flattened onto a single line, which is not
# a valid YAML block mapping. The nesting below was reconstructed from the key
# grouping; all keys and values are unchanged. Confirm the section boundaries
# against the code that loads this file.

# Top-level run settings.
batch_size: 112
epochs: 3
stage: 1
unfrozen_ratio: 0.3
ckpt_weights_only: false

# Filesystem locations.
checkpoint_dir: ./checkpoints/hydra_mark
train_data_dir: ./data/train_shards1
val_data_dir: ./data/val_shards
weights_path: ./models/hydra_hypernet_mark.pt

shuffle: true

# Early stopping is switched off; max_patience_counter and min_delta are
# presumably only read when use_early_stopping is true — TODO confirm.
use_early_stopping: false
max_patience_counter: 5
min_delta: 0.01

use_gradient_clipping: true
gradient_clipping_norm: 1.0

# Same value as hydra_config.max_position_embeddings (4096).
pad_length: 4096

# Three separate learning rates, keyed by the suffixes mark / hydra / cls.
learning_rate_mark: 0.0006
learning_rate_hydra: 3.0e-05
learning_rate_cls: 0.0001

no_cache: false
num_workers: 8
matmul_precision: high
multi_shot: false
intervals: 3
# Also set (to the same value) under hydra_config.is_prenorm.
is_prenorm: false
# NOTE(review): trainer.accumulate_grad_batches below is 1 while this is 2 —
# confirm which of the two the training script actually uses.
accumulate_grad_batches: 2

cart: true
cart_p: 0.45
cart_scale: 1.0
distillation: false

lr_scheduler:
  type: cosine
  # warmup_steps is 5% of total_steps (720 / 14400).
  warmup_steps: 720
  total_steps: 14400
  min_lr_ratio: 0.1
  # The sub-sections below are presumably only consulted when `type` selects
  # the matching scheduler — TODO confirm.
  polynomial:
    end_lr_ratio: 0.0
    power: 1.0
  plateau:
    factor: 0.5
    patience: 3
    min_lr: 1.0e-06

trainer:
  accelerator: gpu
  devices: -1
  check_val_every_n_epoch: null
  num_sanity_val_steps: 0
  # NOTE(review): differs from the top-level accumulate_grad_batches (2).
  accumulate_grad_batches: 1
  precision: bf16-true
  enable_checkpointing: true
  # Same path as the top-level checkpoint_dir.
  default_root_dir: ./checkpoints/hydra_mark

wandb:
  project: hydra-training_hypernet
  model_name: HydraForMaskedLM
  watch_log: all
  log_freq: 20

# NOTE(review): every key from here to the end of the file was placed under
# hydra_config during reconstruction — confirm none belongs at top level.
hydra_config:
  hidden_size: 768
  vocab_size: 30522
  type_vocab_size: 2
  pad_token_id: 0
  use_position_embeddings: false
  max_position_embeddings: 4096
  use_timestep_embeddings: true
  layer_norm_eps: 1.0e-12
  dropout: 0.0
  max_timestep_embeddings: 1000
  current_timestep: 0
  d_state: 64
  d_conv: 7
  head_dim: 64
  expand: 2
  chunk_size: 256
  is_prenorm: false
  use_eff_compute: false
  gradient_checkpointing: true
  num_hidden_layers: 23
  guider_hidden_layers: 12
  # NOTE(review): cpu here while trainer.accelerator is gpu — confirm intended.
  device: cpu
  pool_all: false
  mark_kernel: hypernet
  mark_ensemble: false
  rank: 2
  degree: 5
  L_timepoints: 256
  n_freqs: 8
  mark_mlp_dim: 256
  hidden_act: swish
  initializer_range: 0.02