TagSpeech-AMI / config.json
yshao18's picture
Upload config.json with huggingface_hub
7cf7695 verified
{
"audio_encoder_config": {
"model_type": "zipformer",
"feature_dim": 80,
"output_downsampling_factor": 2,
"num_encoder_layers": [
2,
2,
4,
5,
4,
2
],
"downsampling_factor": [
1,
2,
4,
8,
4,
2
],
"encoder_dim": [
192,
256,
512,
768,
512,
256
],
"feedforward_dim": [
576,
768,
1536,
2304,
1536,
768
],
"warmup_batches": 4000.0,
"dropout": null,
"num_heads": [
4,
4,
4,
8,
4,
4
],
"query_head_dim": [
32
],
"value_head_dim": [
12
],
"pos_head_dim": [
4
],
"pos_dim": 48,
"encoder_unmasked_dim": [
192,
192,
256,
256,
256,
192
],
"cnn_module_kernel": [
31,
31,
15,
15,
15,
31
],
"causal": false,
"chunk_size": [
16,
32,
64,
-1
],
"left_context_frames": [
64,
128,
256,
-1
]
},
"llm_config": {
"vocab_size": 152064,
"max_position_embeddings": 32768,
"hidden_size": 3584,
"intermediate_size": 18944,
"num_hidden_layers": 28,
"num_attention_heads": 28,
"use_sliding_window": false,
"sliding_window": 131072,
"max_window_layers": 28,
"num_key_value_heads": 4,
"hidden_act": "silu",
"initializer_range": 0.02,
"rms_norm_eps": 1e-06,
"use_cache": true,
"rope_theta": 1000000.0,
"attention_dropout": 0.0,
"torch_dtype": "float16",
"tie_word_embeddings": false,
"architectures": [
"Qwen2ForCausalLM"
],
"bos_token_id": 151643,
"eos_token_id": 151645,
"_name_or_path": "/projects/bejv/models/Qwen2.5-7B-Instruct",
"transformers_version": "4.38.2",
"model_type": "qwen2"
},
"use_flash_attn": false,
"audio_encoder_projector_ds_rate": 8,
"exclude_from_checkpoint": [
"audio_encoder",
"voice_encoder",
"llm"
],
"tag_audio_boundary": false,
"audio_token": "<|AUDIO|>",
"model_type": "tagspeech",
"max_length": 800,
"voice_encoder_config": {
"model_type": "zipformer",
"feature_dim": 80,
"output_downsampling_factor": 2,
"num_encoder_layers": [
2,
2,
4,
5,
4,
2
],
"downsampling_factor": [
1,
2,
4,
8,
4,
2
],
"encoder_dim": [
192,
256,
512,
768,
512,
256
],
"feedforward_dim": [
576,
768,
1536,
2304,
1536,
768
],
"warmup_batches": 4000.0,
"dropout": null,
"num_heads": [
4,
4,
4,
8,
4,
4
],
"query_head_dim": [
32
],
"value_head_dim": [
12
],
"pos_head_dim": [
4
],
"pos_dim": 48,
"encoder_unmasked_dim": [
192,
192,
256,
256,
256,
192
],
"cnn_module_kernel": [
31,
31,
15,
15,
15,
31
],
"causal": false,
"chunk_size": [
16,
32,
64,
-1
],
"left_context_frames": [
64,
128,
256,
-1
]
},
"semantic_projector_ds_rate": 4,
"voice_projector_ds_rate": 4,
"semantic_anchor_interval": 8,
"voice_anchor_interval": 8,
"insert_anchors_at_ends": true
}