{ "activation_dropout": 0.1, "activation_fn": "gelu", "architectures": [ "RDDistillerModel" ], "attention_dropout": 0.1, "attention_type": "original", "auto_map": { "AutoConfig": "configuration_distiller.DistillerConfig", "AutoModel": "distiller_model.RDDistillerModel" }, "conv_pos": 128, "conv_pos_groups": 16, "cosine_loss": 1.0, "dropout": 0.1, "dtype": "float32", "encoder_attention_heads": 12, "encoder_embed_dim": 768, "encoder_ffn_embed_dim": 3072, "encoder_layerdrop": 0.0, "encoder_layers": 2, "extractor_conv_feature_layers": "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2", "extractor_dropout": 0.0, "extractor_mode": "default", "feat_pen_loss": 0.0, "feature_grad_mult": 0.1, "final_dim": 768, "init_teacher_conv_layers": true, "init_teacher_encoder_layers": true, "layer_emb_size": 0, "layer_norm_first": false, "loss_type": "l1", "model_type": "rd_distiller", "n_tasks": 3, "out_layer_inter_dim": -1, "out_layer_type": "expand-last", "pred_layer_id": [ 4, 8, 12 ], "task_emb_size": 0, "task_emb_type": "expand-last", "transformers_version": "5.1.0" }