{
  "model_class": "BERTModel",
  "model_config": {
    "name": "Mini-Albertina-001",
    "hidden_size": 768,
    "ffn_factor": 4.0,
    "vocab_size": 32769,
    "bos_token_id": 1,
    "eos_token_id": 2,
    "pad_token_id": 0,
    "mask_token_id": 32768,
    "masked_substitution_rate": 0.15,
    "num_hidden_layers": 12,
    "num_attention_heads": 12,
    "tie_word_embeddings": false,
    "rms_norm_eps": 1e-06,
    "attention_type": [],
    "max_position_embeddings": 1024,
    "block_size_for_attention": 128,
    "rope_theta": 10000.0,
    "compile_flexattn": false,
    "bias": false,
    "training_objective": "masked",
    "is_causal": false,
    "default_layer": {
      "attn_impl": "flash",
      "sliding_window_size": null,
      "positional_encoding": "alibi",
      "normalization": "rmsnorm",
      "normalization_position": "post",
      "ffn_activation": "swiglu",
      "hooks": {}
    },
    "custom_layers": {}
  },
  "training": {
    "optimizer": "muon",
    "lr_scheduling": true,
    "lr": 0.0005,
    "final_lr": 2e-05,
    "hold_steps": 0.21,
    "weight_decay": 0.01,
    "scheduler": "custom",
    "gradient_clip_val": 1.0,
    "warmup_steps": 0.05,
    "max_epochs": 1,
    "accumulate_grad_batches": 5,
    "seed": 27,
    "save_every_n_steps": 5000,
    "checkpoint_name": "mini_albertina_001"
  },
  "tokenizer": {
    "type": "huggingface",
    "pretrained_name": "sapienzanlp/Minerva-350M-base-v1.0",
    "varlen_strategy": "unpadding"
  },
  "data": {
    "data_root": "/home/matteo/Albertone/Albertina/Albertina_mdat",
    "batch_size": 48,
    "num_workers": 1,
    "mdat_strategy": "Minerva1024",
    "mdat_view": null
  },
  "save_dir": "./checkpoints_albertone",
  "wandb_project": "Albertone",
  "wandb_run_name": "Mini-Albertina-001"
}