{ "adamw_betas_for_muon_others": [ 0.9, 0.95 ], "adamw_eps_for_muon_others": 1e-10, "adamw_lr": 0.00064, "adamw_lr_for_muon_others": 0.000267, "adamw_max_grad_norm": 0.5, "adamw_weight_decay": 0.01, "adamw_weight_decay_for_muon_others": 0.05, "adjust_learning_rate_for_accumulation": true, "architectures": [ "ViralBERTForContrastiveBinning" ], "attention_head_size": 64, "attention_probs_dropout_prob": 0.0, "batch_size": 256, "cls_token_id": 1, "compile_backend": "inductor", "compile_fullgraph": true, "compile_mode": "default", "data_dir": "data/raw", "dtype": "float32", "fasta_file": "meta-1-vg.fna", "feed_forward_activation": "swiglu", "filter_n": false, "fp16": true, "global_attn_every_n_layers": 0, "global_max_grad_norm": 1.0, "gradient_accumulation_steps": 1, "hidden_dropout_prob": 0.0, "hidden_size": 768, "high_lr_multiplier": 1.0, "high_lr_steps_ratio": 0.0, "initializer_range": 0.02, "intermediate_size": 2048, "layer_norm_eps": 1e-12, "logging_steps": 1000, "lr_scheduler_type": "warmup_cosine", "mask_token_id": 3, "masking_strategy": "structural", "max_eval_samples": 2048, "max_steps_for_sweep": null, "min_lr_ratio": 0.05, "mlm_probability": 0.15, "moco_dim": 128, "moco_k": 65536, "moco_m": 0.999, "moco_t": 0.07, "model_type": "viralbert_for_contrastive_binning", "muon_lr": 0.0015, "muon_max_grad_norm": 1.0, "muon_momentum": 0.95, "muon_weight_decay": 0.05, "norm_layer_type": "rmsnorm", "num_attention_heads": 12, "num_hidden_layers": 12, "num_train_epochs": 3, "num_workers": 4, "optimizer_type": "muon_adamw", "p_codon": 0.5, "pad_token_id": 0, "position_embedding_type": "rope", "resume_from_checkpoint": null, "resume_mode": null, "reverse_complement_prob": 0.5, "rope_interpolation_factor": 1.0, "run_name": "", "save_steps": 10000, "save_total_limit": 5, "scale_loss_for_accumulation": true, "seed": 42, "sep_token_id": 2, "seq_length": 512, "sliding_window_size": 0, "stride": 256, "sweep_early_stopping_patience_steps": 1000, "sweep_early_stopping_threshold": 50.0, "tie_word_embeddings": false, "transformers_version": "4.56.1", "unfreeze_last_n_layers": 6, "use_compile": true, "use_per_group_clipping": false, "use_qk_norm": true, "use_xpos": false, "vocab_size": 14, "wandb_enabled": true, "wandb_group": "", "wandb_name": "", "wandb_notes": "", "wandb_project": "", "wandb_tags": [], "wandb_watch_freq": null, "wandb_watch_model": false, "warmup_steps": 4000, "warmup_steps_ratio": 0.1 }