{
  "activation_dropout": 0.1,
  "activation_fn": "gelu",
  "architectures": [
    "RDDistillerModel"
  ],
  "attention_dropout": 0.1,
  "attention_type": "original",
  "auto_map": {
    "AutoConfig": "configuration_distiller.DistillerConfig",
    "AutoModel": "distiller_model.RDDistillerModel"
  },
  "conv_pos": 128,
  "conv_pos_groups": 16,
  "cosine_loss": 1.0,
  "dropout": 0.1,
  "dtype": "float32",
  "encoder_attention_heads": 12,
  "encoder_embed_dim": 768,
  "encoder_ffn_embed_dim": 3072,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 2,
  "extractor_conv_feature_layers": "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2",
  "extractor_dropout": 0.0,
  "extractor_mode": "default",
  "feat_pen_loss": 0.0,
  "feature_grad_mult": 0.1,
  "final_dim": 768,
  "init_teacher_conv_layers": true,
  "init_teacher_encoder_layers": true,
  "layer_emb_size": 0,
  "layer_norm_first": false,
  "loss_type": "l1",
  "model_type": "rd_distiller",
  "n_tasks": 3,
  "out_layer_inter_dim": -1,
  "out_layer_type": "expand-last",
  "pred_layer_id": [
    4,
    8,
    12
  ],
  "task_emb_size": 0,
  "task_emb_type": "expand-last",
  "transformers_version": "5.1.0"
}