{
  "model_name_or_path": "answerdotai/ModernBERT-base",
  "rope_scaling_type": "yarn",
  "rope_scaling_factor": 4.0,
  "rope_original_max_position_embeddings": 8192,
  "yarn_beta_fast": 32.0,
  "yarn_beta_slow": 1.0,
  "yarn_extrapolation_factor": 1.0,
  "yarn_attn_factor": 1.0,
  "dataset_name": "/data/datasets/slimpajama_32k_1B",
  "model_max_length": 32768,
  "mlm_probability": 0.3,
  "max_train_samples": null,
  "preprocessing_num_workers": 4,
  "dataloader_num_workers": 4,
  "output_dir": "/data/outputs/modernbert-32k-retrieval",
  "per_device_train_batch_size": 6,
  "torch_compile": false,
  "gradient_accumulation_steps": 1,
  "learning_rate": 1e-05,
  "weight_decay": 0.01,
  "warmup_steps": 100,
  "warmup_ratio": 0.1,
  "num_train_epochs": 1,
  "max_grad_norm": 1.0,
  "lr_scheduler_type": "constant_with_warmup",
  "use_retrieval_masking": true,
  "retrieval_probability": 0.1,
  "min_distance_for_retrieval": 512,
  "use_ewc": true,
  "ewc_lambda": 1000.0,
  "ewc_samples": 100,
  "bf16": true,
  "logging_steps": 10,
  "save_steps": 500,
  "seed": 42
}