{
  "adamw_betas_for_muon_others": [
    0.9,
    0.95
  ],
  "adamw_eps_for_muon_others": 1e-10,
  "adamw_lr": 0.00064,
  "adamw_lr_for_muon_others": 0.000267,
  "adamw_max_grad_norm": 0.5,
  "adamw_weight_decay": 0.01,
  "adamw_weight_decay_for_muon_others": 0.05,
  "adjust_learning_rate_for_accumulation": true,
  "architectures": [
    "ViralBERTForSequenceClassification"
  ],
  "attention_head_size": 64,
  "attention_probs_dropout_prob": 0.0,
  "batch_size": 256,
  "class_weights": null,
  "classifier_dropout_prob": 0.1,
  "cls_token_id": 1,
  "compile_backend": "inductor",
  "compile_fullgraph": true,
  "compile_mode": "default",
  "data_dir": "",
  "dtype": "float32",
  "fasta_file": "",
  "feed_forward_activation": "swiglu",
  "filter_n": false,
  "fp16": true,
  "freeze_bert_layers": 8,
  "global_attn_every_n_layers": 0,
  "global_max_grad_norm": 1.0,
  "gradient_accumulation_steps": 8,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "high_lr_multiplier": 1.0,
  "high_lr_steps_ratio": 0.0,
  "id2label": {
    "0": "bac",
    "1": "virus"
  },
  "initializer_range": 0.02,
  "intermediate_size": 2048,
  "label2id": {
    "bac": 0,
    "virus": 1
  },
  "label_smoothing_factor": 0.1,
  "layer_norm_eps": 1e-12,
  "logging_steps": 1000,
  "loss_type": "ce",
  "lr_scheduler_type": "cosine",
  "mask_token_id": 3,
  "masking_strategy": "structural",
  "max_eval_samples": 2048,
  "max_steps_for_sweep": null,
  "min_lr_ratio": 0.05,
  "mlm_probability": 0.15,
  "model_type": "viralbert_for_sequence_classification",
  "muon_lr": 0.0015,
  "muon_max_grad_norm": 1.0,
  "muon_momentum": 0.95,
  "muon_weight_decay": 0.05,
  "n_token_id": 9,
  "norm_layer_type": "rmsnorm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_train_epochs": 3,
  "num_workers": 4,
  "optimizer_type": "muon_adamw",
  "p_codon": 0.5,
  "pad_token_id": 0,
  "pos_weight": null,
  "position_embedding_type": "rope",
  "resume_from_checkpoint": null,
  "resume_mode": null,
  "reverse_complement_prob": 0.5,
  "rope_interpolation_factor": 1.0,
  "run_name": "",
  "save_steps": 10000,
  "save_total_limit": 5,
  "scale_loss_for_accumulation": true,
  "seed": 42,
  "sep_token_id": 2,
  "seq_length": 512,
  "seq_mask_prob": 0.5,
  "seq_mask_ratio": 0.15,
  "sliding_window_size": 0,
  "stride": 256,
  "sweep_early_stopping_patience_steps": 1000,
  "sweep_early_stopping_threshold": 50.0,
  "tie_word_embeddings": false,
  "transformers_version": "4.56.1",
  "use_compile": true,
  "use_per_group_clipping": false,
  "use_qk_norm": true,
  "use_seq_augment": true,
  "use_xpos": false,
  "vocab_size": 14,
  "wandb_enabled": true,
  "wandb_group": "",
  "wandb_name": "",
  "wandb_notes": "",
  "wandb_project": "",
  "wandb_tags": [],
  "wandb_watch_freq": null,
  "wandb_watch_model": false,
  "warmup_steps": 4000,
  "warmup_steps_ratio": 0.1
}