| data: | |
| train_json_files: | |
| - data/MC/MusicCaps_train.json | |
| - data/MC/LP_MusicCaps_MC_train.json | |
| - data/MC/MusicCapsBinary.json | |
| - data/MC/MusicCapsMCQ.json | |
| - data/MC/MusicCapsDetailQA.json | |
| - data/MC/MusicInstruct_train.json | |
| - data/MC_NEW/MusicCapsMCQ.json | |
| - data/MTT/LP_MusicCaps_MTT_train.json | |
| - data/MTT/MTT_Binary.json | |
| - data/MTT/MTT_MCQ.json | |
| - data/MTT/OpenMU_MTT_train.json | |
| - data/MTT_NEW/MTT_MCQ.json | |
| - data/ASM_NEW/AudioSetMusicBinary.json | |
| - data/ASM_NEW/AudioSetMusicMCQ.json | |
| - data/ASM_NEW/AudioSetMusicQA.json | |
| - data/ASM_NEW/Captioning.json | |
| - data/FMA/FMA_Caption_Train.json | |
| - data/FMA/FMA_QA_MIX.json | |
| - data/FMA/FMA_QA_MIX_Part2.json | |
| train_audio_dirs: | |
| - ../../data/LP_MusicCaps_MC/Audio | |
| - ../../data/LP_MusicCaps_MC/Audio | |
| - ../../data/LP_MusicCaps_MC/Audio | |
| - ../../data/LP_MusicCaps_MC/Audio | |
| - ../../data/LP_MusicCaps_MC/Audio | |
| - ../../data/LP_MusicCaps_MC/Audio | |
| - ../../data/LP_MusicCaps_MC/Audio | |
| - ../../data/LP_MusicCaps_MTT/Audio | |
| - ../../data/LP_MusicCaps_MTT/Audio | |
| - ../../data/LP_MusicCaps_MTT/Audio | |
| - ../../data/LP_MusicCaps_MTT/Audio | |
| - ../../data/LP_MusicCaps_MTT/Audio | |
| - /apdcephfs_gy4/share_302507476/xiquanli/data/AudioSet/data/audio | |
| - /apdcephfs_gy4/share_302507476/xiquanli/data/AudioSet/data/audio | |
| - /apdcephfs_gy4/share_302507476/xiquanli/data/AudioSet/data/audio | |
| - /apdcephfs_gy4/share_302507476/xiquanli/data/AudioSet/data/audio | |
| - /apdcephfs_gy2/share_302507476/0_public_datasets/fma_dataset/fma_split_15s | |
| - /apdcephfs_gy2/share_302507476/0_public_datasets/fma_dataset/fma_split_15s | |
| - /apdcephfs_gy2/share_302507476/0_public_datasets/fma_dataset/fma_split_15s | |
| val_json_files: | |
| - data/captioning/MusicCaps_test.json | |
| val_audio_dirs: | |
| - ../../data/LP_MusicCaps_MC/Audio | |
| sample_rate: 32000 | |
| max_length: 10 | |
| tokenizer_type: HuggingFaceTB/SmolLM2-135M | |
| max_text_token_len: 129 | |
| batch_size: 16 | |
| num_workers: 8 | |
| wav_aug: false | |
| model: | |
| encoder: | |
| audioenc_name: MATPAC | |
| matpac_ckpt_path: ./weights/matpac_plus_as_48_1_map_enconly.pt | |
| freeze: true | |
| ds_rate: 1 | |
| c2l_first: true | |
| wo_repeat: true | |
| projector: | |
| proj_name: linearprojector | |
| d_in: 3840 | |
| d_h: 2048 | |
| d_out: 576 | |
| decoder: | |
| textdec_name: HuggingFaceTB/SmolLM2-135M | |
| max_text_token_len: 129 | |
| freeze: false | |
| use_lora: false | |
| lora_config: | |
| r: 32 | |
| lora_alpha: 128 | |
| lora_dropout: 0.1 | |
| bias: none | |
| target_modules: | |
| - q_proj | |
| - v_proj | |
| model_type: m2t | |
| training: | |
| epochs: 3 | |
| clip_grad: 2 | |
| dropout: 0.2 | |
| seed: 42 | |
| device: cuda | |
| validation_step: 3000 | |
| optim_args: | |
| lr: 0.0001 | |
| warmup_steps: 0 | |
| scheduler: cosine | |
| optimizer_name: adam | |
| betas: | |
| - 0.9 | |
| - 0.999 | |
| eps: 1.0e-08 | |
| momentum: 0.9 | |
| warmup_ratio: 0.05 | |