model:
  d_model: 384
  n_layer: 2
  d_inner: ${eval:4 * ${.d_model}}
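  # With d_model = 384, the eval resolver expands this to 4 * 384 = 1536.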
  vocab_size: ${tokenizer.vocab_size}
  resid_dropout: 0.0
  embed_dropout: 0.1
  residual_in_fp32: true
  pad_vocab_size_multiple: 8
  mamba_ver: mamba2
  layer:
    d_model: ${model.d_model}
    d_state: 64
    d_conv: 4
    expand: 2
    headdim: 48
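    # Note: Mamba2 requires headdim to divide the inner width; here
    # expand * d_model = 2 * 384 = 768, i.e. 768 / 48 = 16 heads.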
  n_classes: null
dataset:
  __train_len: ${div_up:1_000_000_000, ${.max_len}}
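  # Assuming div_up is ceiling division, this resolves to
  # ceil(1,000,000,000 / 660) = 1,515,152 samples per (virtual) epoch.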
  __l_max: ${.max_len}
  randomize_offset: true
  input_path: ./data/
  max_len: 660
  use_padding: true
  add_eos: false
  rc_aug: true
  phase: pretrain
  classify_level: null
  num_workers: 0
  batch_size: 16
  pretrain_method: ntp
  mask_ratio: 0.5
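  # Presumably mask_ratio matters only for masked pretraining; with
  # pretrain_method: ntp (next-token prediction) it should be inert.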
tokenizer:
  use_unk_token: true
  name: char
  characters:
  - A
  - C
  - G
  - T
  - 'N'
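  # 'N' stays quoted because YAML 1.1 parsers can read a bare N as boolean false.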
  model_max_length: ${eval:${dataset.max_len} + 2}
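  # Resolves to 660 + 2 = 662, leaving two slots of headroom for special tokens.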
  add_special_tokens: false
  padding_side: left
  vocab_size: 8
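  # 8 presumably covers the five characters plus special tokens, and is already
  # a multiple of model.pad_vocab_size_multiple, so no padding rows are added.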
trainer:
  accelerator: gpu
  devices: -1
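  # In PyTorch Lightning, devices: -1 selects every visible GPU.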
  num_nodes: 1
  max_epochs: 50
  gradient_clip_val: 1.0
  fast_dev_run: false
  strategy: ddp
train:
  logger: wandb
  run_name: null
  gpu_mem: ${eval:"round(float(__import__('subprocess').check_output('nvidia-smi -i 0 --query-gpu=memory.total --format=csv,noheader,nounits', shell=True).strip().decode()) / 1000)"}
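  # Shells out to nvidia-smi for GPU 0's total memory (MiB) and rounds it to
  # whole GB; this assumes an NVIDIA GPU is visible when the config resolves.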
  seed: 2222
  global_batch_size: 256
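  # Presumably reconciled with dataset.batch_size: 16 through gradient
  # accumulation, e.g. 256 / (16 * n_gpus) accumulation steps.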
  ckpt: null
  ema: 0.0
  test: true
  interval: step
  monitor: val/loss_epoch
  mode: min
  validate_at_start: false
  pretrained_model_strict_load: false
  pretrained_model_path: null
scheduler:
  t_in_epochs: false
  t_initial: ${eval:${div_up:${dataset.__train_len}, ${train.global_batch_size}} * ${trainer.max_epochs}}
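  # Worked example: ceil(1,515,152 / 256) = 5,919 optimizer steps per epoch,
  # so t_initial = 5,919 * 50 = 295,950 total steps.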
  warmup_lr_init: 1.0e-06
  warmup_t: ${eval:${div_up:${dataset.__train_len}, ${train.global_batch_size}} * ${trainer.max_epochs} * 0.01}
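  # The same step count scaled by 0.01, i.e. roughly 2,960 warmup steps (1%).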
  lr_min: ${eval:0.1 * ${optimizer.lr}}
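  # Resolves to 0.1 * 0.0008 = 8.0e-05.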
optimizer:
  lr: 0.0008
  weight_decay: 0.1
  betas:
  - 0.9
  - 0.999
model_checkpoint:
  monitor: ${train.monitor}
  mode: ${train.mode}
  save_top_k: 1
  save_last: true
  dirpath: checkpoints/
  filename: barcode-mamba-${dataset.phase}-{epoch:02d}
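  # With auto_insert_metric_name: true, Lightning expands {epoch:02d} to
  # e.g. "epoch=07", yielding files like barcode-mamba-pretrain-epoch=07.ckpt.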
  save_on_train_epoch_end: true
  auto_insert_metric_name: true
  verbose: true
debug: false