{
  "ampere_pruning_method": "disabled",
  "attention_block_cols": 32,
  "attention_block_rows": 32,
  "attention_lambda": 1.0,
  "attention_output_with_dense": 0,
  "attention_pruning_method": "sigmoied_threshold",
  "bias_mask": true,
  "dense_block_cols": 1,
  "dense_block_rows": 1,
  "dense_lambda": 1.0,
  "dense_pruning_method": "sigmoied_threshold:1d_alt",
  "distil_alpha_ce": 0.1,
  "distil_alpha_teacher": 0.9,
  "distil_teacher_name_or_path": null,
  "distil_temperature": 2.0,
  "eval_with_current_patch_params": 1,
  "final_ampere_temperature": 20.0,
  "final_finetune": false,
  "final_threshold": 1.0,
  "final_warmup": 0.0,
  "gelu_patch": 0,
  "gelu_patch_steps": 50000,
  "initial_ampere_temperature": 0.0,
  "initial_threshold": 1.0,
  "initial_warmup": 1,
  "layer_norm_patch": 0,
  "layer_norm_patch_start_delta": 0.99,
  "layer_norm_patch_steps": 50000,
  "linear_min_parameters": 0,
  "mask_init": "constant",
  "mask_scale": 0.0,
  "mask_scores_learning_rate": 0.01,
  "regularization": "l1",
  "regularization_final_lambda": 0,
  "rewind_model_name_or_path": null
}