---
# Configuration with two top-level sections: `data` and `model`.
#
# NOTE(review): this file was recovered from a copy in which every line was
# wrapped in table pipes ("| ... | |") and all indentation was lost. The
# nesting below is reconstructed from key order; confirm it against the code
# that loads this file.

data:
  sampling_rate: 32000  # presumably Hz - TODO confirm
  segment_seconds: 10
  # Same identifier as model.decoder.text_decoder below.
  tokenizer_type: "HuggingFaceTB/SmolLM2-135M"
  text_tokenization_len: 129

model:
  encoder:
    audioenc_name: 'HTSAT'
    transformer_embed_dim: 768
    out_emb: 768
    d_proj: 576
  decoder:
    # Same identifier as data.tokenizer_type above.
    text_decoder: "HuggingFaceTB/SmolLM2-135M"
    prefix_length: 389
  # NOTE(review): the original nesting level of `model_type` could not be
  # recovered (it may belong under `decoder` or at the top level). Verify
  # against the loader before relying on this placement.
  model_type: Mellow