AudenAI
/

TagSpeech-AMI

+{
+  "audio_encoder_config": {
+    "model_type": "zipformer",
+    "feature_dim": 80,
+    "output_downsampling_factor": 2,
+    "num_encoder_layers": [
+      2,
+      2,
+      4,
+      5,
+      4,
+      2
+    ],
+    "downsampling_factor": [
+      1,
+      2,
+      4,
+      8,
+      4,
+      2
+    ],
+    "encoder_dim": [
+      192,
+      256,
+      512,
+      768,
+      512,
+      256
+    ],
+    "feedforward_dim": [
+      576,
+      768,
+      1536,
+      2304,
+      1536,
+      768
+    ],
+    "warmup_batches": 4000.0,
+    "dropout": null,
+    "num_heads": [
+      4,
+      4,
+      4,
+      8,
+      4,
+      4
+    ],
+    "query_head_dim": [
+      32
+    ],
+    "value_head_dim": [
+      12
+    ],
+    "pos_head_dim": [
+      4
+    ],
+    "pos_dim": 48,
+    "encoder_unmasked_dim": [
+      192,
+      192,
+      256,
+      256,
+      256,
+      192
+    ],
+    "cnn_module_kernel": [
+      31,
+      31,
+      15,
+      15,
+      15,
+      31
+    ],
+    "causal": false,
+    "chunk_size": [
+      16,
+      32,
+      64,
+      -1
+    ],
+    "left_context_frames": [
+      64,
+      128,
+      256,
+      -1
+    ]
+  },
+  "llm_config": {
+    "vocab_size": 152064,
+    "max_position_embeddings": 32768,
+    "hidden_size": 3584,
+    "intermediate_size": 18944,
+    "num_hidden_layers": 28,
+    "num_attention_heads": 28,
+    "use_sliding_window": false,
+    "sliding_window": 131072,
+    "max_window_layers": 28,
+    "num_key_value_heads": 4,
+    "hidden_act": "silu",
+    "initializer_range": 0.02,
+    "rms_norm_eps": 1e-06,
+    "use_cache": true,
+    "rope_theta": 1000000.0,
+    "attention_dropout": 0.0,
+    "torch_dtype": "float16",
+    "tie_word_embeddings": false,
+    "architectures": [
+      "Qwen2ForCausalLM"
+    ],
+    "bos_token_id": 151643,
+    "eos_token_id": 151645,
+    "_name_or_path": "/projects/bejv/models/Qwen2.5-7B-Instruct",
+    "transformers_version": "4.38.2",
+    "model_type": "qwen2"
+  },
+  "use_flash_attn": false,
+  "audio_encoder_projector_ds_rate": 8,
+  "exclude_from_checkpoint": [
+    "audio_encoder",
+    "voice_encoder",
+    "llm"
+  ],
+  "tag_audio_boundary": false,
+  "audio_token": "<|AUDIO|>",
+  "model_type": "audio-llm-dual-audio-tokens-anchor-num",
+  "max_length": 800,
+  "voice_encoder_config": {
+    "model_type": "zipformer",
+    "feature_dim": 80,
+    "output_downsampling_factor": 2,
+    "num_encoder_layers": [
+      2,
+      2,
+      4,
+      5,
+      4,
+      2
+    ],
+    "downsampling_factor": [
+      1,
+      2,
+      4,
+      8,
+      4,
+      2
+    ],
+    "encoder_dim": [
+      192,
+      256,
+      512,
+      768,
+      512,
+      256
+    ],
+    "feedforward_dim": [
+      576,
+      768,
+      1536,
+      2304,
+      1536,
+      768
+    ],
+    "warmup_batches": 4000.0,
+    "dropout": null,
+    "num_heads": [
+      4,
+      4,
+      4,
+      8,
+      4,
+      4
+    ],
+    "query_head_dim": [
+      32
+    ],
+    "value_head_dim": [
+      12
+    ],
+    "pos_head_dim": [
+      4
+    ],
+    "pos_dim": 48,
+    "encoder_unmasked_dim": [
+      192,
+      192,
+      256,
+      256,
+      256,
+      192
+    ],
+    "cnn_module_kernel": [
+      31,
+      31,
+      15,
+      15,
+      15,
+      31
+    ],
+    "causal": false,
+    "chunk_size": [
+      16,
+      32,
+      64,
+      -1
+    ],
+    "left_context_frames": [
+      64,
+      128,
+      256,
+      -1
+    ]
+  },
+  "semantic_projector_ds_rate": 4,
+  "voice_projector_ds_rate": 4,
+  "semantic_anchor_interval": 8,
+  "voice_anchor_interval": 8,
+  "insert_anchors_at_ends": true,
+  "digit_embedding_path": "/projects/bejv/code/Auden/examples/multi_asr_llm/models/audio_llm_dual_audio_tokens_anchor_num/digit_token_embeddings.pt"
+}