---
# Training configuration for a music-to-text (m2t) captioning/QA model:
# MATPAC audio encoder -> linear projector -> SmolLM2-135M text decoder.

data:
  # NOTE: train_json_files and train_audio_dirs are paired positionally —
  # train_json_files[i] is resolved against train_audio_dirs[i].
  # Both lists must stay the same length (currently 19 entries each).
  train_json_files:
    # MusicCaps (MC)
    - data/MC/MusicCaps_train.json
    - data/MC/LP_MusicCaps_MC_train.json
    - data/MC/MusicCapsBinary.json
    - data/MC/MusicCapsMCQ.json
    - data/MC/MusicCapsDetailQA.json
    - data/MC/MusicInstruct_train.json
    - data/MC_NEW/MusicCapsMCQ.json
    # MagnaTagATune (MTT)
    - data/MTT/LP_MusicCaps_MTT_train.json
    - data/MTT/MTT_Binary.json
    - data/MTT/MTT_MCQ.json
    - data/MTT/OpenMU_MTT_train.json
    - data/MTT_NEW/MTT_MCQ.json
    # AudioSet music subset (ASM)
    - data/ASM_NEW/AudioSetMusicBinary.json
    - data/ASM_NEW/AudioSetMusicMCQ.json
    - data/ASM_NEW/AudioSetMusicQA.json
    - data/ASM_NEW/Captioning.json
    # Free Music Archive (FMA)
    - data/FMA/FMA_Caption_Train.json
    - data/FMA/FMA_QA_MIX.json
    - data/FMA/FMA_QA_MIX_Part2.json
  train_audio_dirs:
    # MusicCaps (7 entries, one per MC json above)
    - ../../data/LP_MusicCaps_MC/Audio
    - ../../data/LP_MusicCaps_MC/Audio
    - ../../data/LP_MusicCaps_MC/Audio
    - ../../data/LP_MusicCaps_MC/Audio
    - ../../data/LP_MusicCaps_MC/Audio
    - ../../data/LP_MusicCaps_MC/Audio
    - ../../data/LP_MusicCaps_MC/Audio
    # MagnaTagATune (5 entries)
    - ../../data/LP_MusicCaps_MTT/Audio
    - ../../data/LP_MusicCaps_MTT/Audio
    - ../../data/LP_MusicCaps_MTT/Audio
    - ../../data/LP_MusicCaps_MTT/Audio
    - ../../data/LP_MusicCaps_MTT/Audio
    # AudioSet music subset (4 entries)
    - /apdcephfs_gy4/share_302507476/xiquanli/data/AudioSet/data/audio
    - /apdcephfs_gy4/share_302507476/xiquanli/data/AudioSet/data/audio
    - /apdcephfs_gy4/share_302507476/xiquanli/data/AudioSet/data/audio
    - /apdcephfs_gy4/share_302507476/xiquanli/data/AudioSet/data/audio
    # FMA (3 entries)
    - /apdcephfs_gy2/share_302507476/0_public_datasets/fma_dataset/fma_split_15s
    - /apdcephfs_gy2/share_302507476/0_public_datasets/fma_dataset/fma_split_15s
    - /apdcephfs_gy2/share_302507476/0_public_datasets/fma_dataset/fma_split_15s
  val_json_files:
    - data/captioning/MusicCaps_test.json
  val_audio_dirs:
    - ../../data/LP_MusicCaps_MC/Audio
  sample_rate: 32000          # Hz; must match the encoder's expected input rate
  max_length: 10              # presumably seconds of audio per clip — TODO confirm against dataloader
  tokenizer_type: HuggingFaceTB/SmolLM2-135M
  max_text_token_len: 129     # keep in sync with model.decoder.max_text_token_len
  batch_size: 16
  num_workers: 8
  wav_aug: false              # waveform augmentation disabled

model:
  encoder:
    audioenc_name: MATPAC
    matpac_ckpt_path: ./weights/matpac_plus_as_48_1_map_enconly.pt
    freeze: true              # encoder weights are not updated during training
    ds_rate: 1
    c2l_first: true
    wo_repeat: true
  projector:
    proj_name: linearprojector
    d_in: 3840                # encoder feature dim
    d_h: 2048                 # hidden dim
    d_out: 576                # must equal SmolLM2-135M hidden size
  decoder:
    textdec_name: HuggingFaceTB/SmolLM2-135M
    max_text_token_len: 129   # keep in sync with data.max_text_token_len
    freeze: false             # decoder is fine-tuned
    use_lora: false           # lora_config below is ignored unless this is true
    lora_config:
      r: 32
      lora_alpha: 128
      lora_dropout: 0.1
      # Quoted: PEFT expects the literal string "none" here; an unquoted
      # bare token is ambiguous to linters and some YAML loaders.
      bias: "none"
      target_modules:
        - q_proj
        - v_proj
  model_type: m2t             # music-to-text

training:
  epochs: 3
  clip_grad: 2                # gradient-norm clipping threshold
  dropout: 0.2
  seed: 42
  device: cuda
  validation_step: 3000       # run validation every N optimizer steps
  optim_args:
    lr: 0.0001
    warmup_steps: 0           # warmup_ratio below likely takes precedence — TODO confirm in trainer
    scheduler: cosine
    optimizer_name: adam
    betas:
      - 0.9
      - 0.999
    eps: 1.0e-08
    momentum: 0.9             # only used if optimizer is SGD-family; ignored by adam
    warmup_ratio: 0.05