# Source: Hugging Face Hub upload by ibitec
# Commit message: Upload folder using huggingface_hub
# Commit: 08e4228 (verified)
# Top-level training settings: flat scalar options read by the training script.
# Run size and stage selection.
batch_size: 112
epochs: 3
stage: 1
unfrozen_ratio: 0.3
# Checkpoint, dataset and weight locations (all relative paths).
ckpt_weights_only: false
checkpoint_dir: ./checkpoints/hydra_mark
# NOTE(review): `train_shards1` carries a numeric suffix while `val_shards`
# does not - confirm the directory name is intentional.
train_data_dir: ./data/train_shards1
val_data_dir: ./data/val_shards
weights_path: ./models/hydra_hypernet_mark.pt
shuffle: true
# Early stopping is switched off here; the two values below presumably only
# take effect when `use_early_stopping` is true - TODO confirm.
use_early_stopping: false
max_patience_counter: 5
min_delta: 0.01
# Gradient clipping is switched on, with the norm value given below.
use_gradient_clipping: true
gradient_clipping_norm: 1.0
pad_length: 4096
# Three separate learning rates, one per `_mark` / `_hydra` / `_cls` suffix;
# presumably one per parameter group - verify against the optimizer setup.
learning_rate_mark: 0.0006
learning_rate_hydra: 3.0e-05
learning_rate_cls: 0.0001
# Data loading and numeric settings.
no_cache: false
num_workers: 8
matmul_precision: high
multi_shot: false
intervals: 3
# NOTE(review): `is_prenorm` appears a second time later in this file (in the
# hydra_config section); both are false. Confirm which one the code reads.
is_prenorm: false
# NOTE(review): a second `accumulate_grad_batches: 1` appears later in this
# file (in the trainer section). Confirm which value is actually applied.
accumulate_grad_batches: 2
# `cart` options: an on/off flag plus the `cart_p` and `cart_scale` values.
cart: true
cart_p: 0.45
cart_scale: 1.0
distillation: false
# Learning-rate schedule settings.
# The nesting below is reconstructed: the source had every key at column 0,
# which parsed `lr_scheduler`, `polynomial` and `plateau` as null and left
# their options as unrelated top-level keys.
lr_scheduler:
  # Selected schedule; the other sub-sections are presumably ignored unless
  # `type` names them - TODO confirm against the scheduler factory.
  type: cosine
  # 720 of 14400 total steps, i.e. 5% of the schedule.
  warmup_steps: 720
  total_steps: 14400
  min_lr_ratio: 0.1
  # NOTE(review): `polynomial` and `plateau` are assumed to be sub-sections of
  # `lr_scheduler` - confirm against the config loader.
  polynomial:
    end_lr_ratio: 0.0
    power: 1.0
  plateau:
    factor: 0.5
    patience: 3
    min_lr: 1.0e-06
# Trainer settings.
# The nesting below is reconstructed: with every key at column 0 the options
# were top-level keys, and `accumulate_grad_batches: 1` duplicated the earlier
# top-level `accumulate_grad_batches: 2` (most parsers keep the last value).
# NOTE(review): the key names match PyTorch Lightning `Trainer` keyword
# arguments; presumably this mapping is passed through to it - confirm.
trainer:
  accelerator: gpu
  devices: -1
  check_val_every_n_epoch: null
  num_sanity_val_steps: 0
  # NOTE(review): differs from the top-level `accumulate_grad_batches: 2`;
  # confirm which of the two the training loop actually uses.
  accumulate_grad_batches: 1
  precision: bf16-true
  enable_checkpointing: true
  # Same path as the top-level `checkpoint_dir`.
  default_root_dir: ./checkpoints/hydra_mark
# Weights & Biases logging settings.
# The nesting below is reconstructed: with every key at column 0, `wandb`
# parsed as null and its options became top-level keys.
wandb:
  project: hydra-training_hypernet
  # NOTE(review): `model_name` sits between `project` and the watch options in
  # the source, so it is assumed to belong to this section - confirm.
  model_name: HydraForMaskedLM
  # Presumably forwarded to `wandb.watch(log=..., log_freq=...)` - TODO confirm.
  watch_log: all
  log_freq: 20
# Model configuration.
# The nesting below is reconstructed: with every key at column 0 these options
# were top-level keys, and `is_prenorm` duplicated the earlier top-level
# `is_prenorm` entry.
# NOTE(review): every key from here to the end of the visible file is assumed
# to belong to `hydra_config` - confirm against the model config class.
hydra_config:
  # Embedding and vocabulary settings.
  hidden_size: 768
  vocab_size: 30522
  type_vocab_size: 2
  pad_token_id: 0
  # Position embeddings are switched off while timestep embeddings are on.
  use_position_embeddings: false
  # Same value as the top-level `pad_length`.
  max_position_embeddings: 4096
  use_timestep_embeddings: true
  layer_norm_eps: 1.0e-12
  dropout: 0.0
  max_timestep_embeddings: 1000
  current_timestep: 0
  # Layer and block dimensions.
  d_state: 64
  d_conv: 7
  head_dim: 64
  expand: 2
  chunk_size: 256
  # NOTE(review): also set at the top level of this file (both false).
  is_prenorm: false
  use_eff_compute: false
  gradient_checkpointing: true
  num_hidden_layers: 23
  guider_hidden_layers: 12
  # NOTE(review): `cpu` here while the trainer section requests `gpu`;
  # presumably the trainer moves the model - confirm.
  device: cpu
  pool_all: false
  # `mark_*` options and related hyperparameters.
  mark_kernel: hypernet
  mark_ensemble: false
  rank: 2
  degree: 5
  L_timepoints: 256
  n_freqs: 8
  mark_mlp_dim: 256
  hidden_act: swish
  initializer_range: 0.02