{
  "backbone_id": "Qwen/Qwen2.5-7B",
  "backbone_dtype": "bfloat16",
  "mah_layer_indices": [
    7,
    14,
    21
  ],
  "rrm_inject_indices": [
    14,
    21
  ],
  "community_layer_idx": 4,
  "num_mah_layers": 3,
  "mah": {
    "d_sub": 512,
    "d_divergence": 256,
    "num_heads": 4,
    "dropout": 0.1
  },
  "rrm": {
    "d_meta": 512,
    "inject_scale": 1.0
  },
  "ben": {
    "d_hidden": 256
  },
  "community": {
    "num_prototypes": 32,
    "d_community": 64,
    "temperature": 1.0,
    "use_prototypes": false
  },
  "loss": {
    "ce_weight": 1.0,
    "chain_weight": 0.5,
    "bif_weight": 1.0,
    "regime_weight": 5.0,
    "div_alive_weight": 0.1,
    "inject_reg_weight": 0.0,
    "inject_target_norm": 1.0,
    "community_entropy_weight": 0.01,
    "community_supcon_weight": 2.0,
    "community_supcon_temperature": 0.1,
    "divergence_supcon_weight": 0.3,
    "divergence_supcon_temperature": 0.1,
    "listnet_weight": 0.5,
    "listnet_temperature": 1.0,
    "chain_residual_aux_weight": 0.05,
    "chain_residual_aux_target": 0.5
  }
}