---
# Data and model configuration.
# Restored to block style: the mapping had been collapsed onto a single line,
# which does not parse as YAML (several "key: value" pairs on one line).

data:
  # presumably Hz; confirm against the audio loader
  sampling_rate: 32000
  segment_seconds: 10
  # Same identifier as model.decoder.text_decoder below; keep the two in sync.
  tokenizer_type: "HuggingFaceTB/SmolLM2-135M"
  text_tokenization_len: 129

model:
  encoder:
    audioenc_name: 'HTSAT'
    transformer_embed_dim: 768
    out_emb: 768
    # NOTE(review): presumably the projection width fed to the text decoder;
    # confirm it matches the decoder's hidden size.
    d_proj: 576
  decoder:
    text_decoder: "HuggingFaceTB/SmolLM2-135M"
    prefix_length: 389
  # NOTE(review): nesting level reconstructed from a flattened source. This key
  # followed the decoder keys; confirm whether the consumer expects it here,
  # under `decoder`, or at the document root.
  model_type: Mellow