---
data_config:
  cache_size: 10000
  dataset: train_30M_sqrt_s42
  enable_reverse_augmentation: true
  force_rebuild_index: false
  glm_probability: 0.333
  lineage_file: /rna-multiverse/data/training_data/lineage_greengenes.tsv
  max_samples: null
  max_seq_length: 8192
  mode: mixed
  pretrain_ratio: 0.2
  span_config:
    allow_overlap: false
    coverage_probs:
      - 0.2
      - 0.3
      - 0.3
      - 0.2
    max_coverage_ratios:
      - 0.15
      - 0.35
      - 0.55
      - 0.75
    max_num_spans: 30
    span_distributions:
      - - 6
        - 3
      - - 40
        - 15
      - - 60
        - 15
      - - 150
        - 50
      - - 400
        - 100
      - - 800
        - 200
  train_file: /rna-multiverse/data/training_datasets/sampled_30M_sqrt_s42_split_new/train_30M_sqrt_s42_50only_new.fa
  use_chunked: true
  use_direction_tokens: true
  use_lineage_prefix: true
  use_rna_type_prefix: true
distributed_config:
  backend: nccl
  data_parallel_size: 4
  expert_parallel_size: 4
  weight_parallel_size: 1
logging_config:
  enable_wandb: true
  log_dir: /rna-multiverse/results/mid_training/mid_training_v5_from_v31_checkpoint_22000/logs
  wandb_project: rna-mid-training
  wandb_run_name: mid_training_v5_from_v31_checkpoint_22000
memory_config:
  cleanup_frequency: 100
  enable_monitoring: true
  gc_frequency: 50
model_config:
  attention_dropout: 0.0
  dropout_ramp_steps: 0
  dropout_schedule: linear
  dropout_warmup_steps: 0
  eos_loss_weight: 1.0
  expert_capacity_factor: 1.5
  gradient_clip_norm: 0.0
  hidden_dropout: 0.0
  hidden_size: 1024
  initializer_range: 0.02
  intermediate_size: 3072
  label_smoothing: 0.0
  max_position_embeddings: 8192
  moe_implementation: megablocks
  moe_world_size: 4
  num_attention_heads: 16
  num_experts: 8
  num_experts_per_tok: 2
  num_hidden_layers: 26
  num_key_value_heads: 16
  resid_dropout: 0.0
  # 1.0e-6 (not plain 1e-6): YAML 1.1 resolvers such as PyYAML only
  # recognize floats containing a '.', so bare 1e-6 loads as a string.
  rms_norm_eps: 1.0e-6
  router_aux_loss_coef: 0.01
  use_cache: true
  vocab_size: 114
training_config:
  adam_beta1: 0.9
  adam_beta2: 0.95
  # 1.0e-8 (not plain 1e-8): bare 1e-8 loads as a string under PyYAML's
  # YAML 1.1 float resolver; matches the 5.0e-06 style used below.
  adam_epsilon: 1.0e-8
  bf16: true
  dataloader_drop_last: true
  dataloader_num_workers: 8
  dataloader_pin_memory: true
  fp16: false
  gradient_accumulation_steps: 16
  gradient_checkpointing: false
  learning_rate: 0.0001
  logging_steps: 30
  max_epochs: 1
  max_wall_time_hours: 100
  min_lr_ratio: 0.1
  output_dir: /rna-multiverse/results/mid_training/mid_training_v5_from_v31_checkpoint_22000
  per_device_train_batch_size: 8
  resume_from_pretrain: /rna-multiverse/results/experiments/scaling_1.4B_v31_pretrain/checkpoint-22000
  run_name: mid_training_v5_from_v31_checkpoint_22000
  save_steps: 500
  seed: 42
  warmup_steps: 3000
  weight_decay: 5.0e-06
|
|