# Residue from the file viewer this config was copied from (not configuration):
# File size: 2,757 Bytes
# 64a4c0d
# (line-number gutter 1-98 removed)
# Dataset and dataloader settings.
# NOTE(review): this copy arrived with its indentation stripped. The nesting
# below is reconstructed (every key up to `model:` placed under `data`) —
# confirm against the code that reads this file.
data:
  # 19 training annotation files. `train_audio_dirs` below also has 19
  # entries; presumably the two lists are paired by index — verify against
  # the loader before adding, removing or reordering entries in either list.
  train_json_files:
    # data/MC and data/MC_NEW (7 files)
    - data/MC/MusicCaps_train.json
    - data/MC/LP_MusicCaps_MC_train.json
    - data/MC/MusicCapsBinary.json
    - data/MC/MusicCapsMCQ.json
    - data/MC/MusicCapsDetailQA.json
    - data/MC/MusicInstruct_train.json
    - data/MC_NEW/MusicCapsMCQ.json
    # data/MTT and data/MTT_NEW (5 files)
    - data/MTT/LP_MusicCaps_MTT_train.json
    - data/MTT/MTT_Binary.json
    - data/MTT/MTT_MCQ.json
    - data/MTT/OpenMU_MTT_train.json
    - data/MTT_NEW/MTT_MCQ.json
    # data/ASM_NEW (4 files)
    - data/ASM_NEW/AudioSetMusicBinary.json
    - data/ASM_NEW/AudioSetMusicMCQ.json
    - data/ASM_NEW/AudioSetMusicQA.json
    - data/ASM_NEW/Captioning.json
    # data/FMA (3 files)
    - data/FMA/FMA_Caption_Train.json
    - data/FMA/FMA_QA_MIX.json
    - data/FMA/FMA_QA_MIX_Part2.json

  # 19 audio roots, in the same 7 / 5 / 4 / 3 grouping as the list above.
  train_audio_dirs:
    # LP_MusicCaps_MC audio, listed 7 times
    - ../../data/LP_MusicCaps_MC/Audio
    - ../../data/LP_MusicCaps_MC/Audio
    - ../../data/LP_MusicCaps_MC/Audio
    - ../../data/LP_MusicCaps_MC/Audio
    - ../../data/LP_MusicCaps_MC/Audio
    - ../../data/LP_MusicCaps_MC/Audio
    - ../../data/LP_MusicCaps_MC/Audio
    # LP_MusicCaps_MTT audio, listed 5 times
    - ../../data/LP_MusicCaps_MTT/Audio
    - ../../data/LP_MusicCaps_MTT/Audio
    - ../../data/LP_MusicCaps_MTT/Audio
    - ../../data/LP_MusicCaps_MTT/Audio
    - ../../data/LP_MusicCaps_MTT/Audio
    # AudioSet audio, listed 4 times.
    # NOTE(review): absolute, machine-specific path — will not resolve elsewhere.
    - /apdcephfs_gy4/share_302507476/xiquanli/data/AudioSet/data/audio
    - /apdcephfs_gy4/share_302507476/xiquanli/data/AudioSet/data/audio
    - /apdcephfs_gy4/share_302507476/xiquanli/data/AudioSet/data/audio
    - /apdcephfs_gy4/share_302507476/xiquanli/data/AudioSet/data/audio
    # FMA audio, listed 3 times (also an absolute, machine-specific path).
    - /apdcephfs_gy2/share_302507476/0_public_datasets/fma_dataset/fma_split_15s
    - /apdcephfs_gy2/share_302507476/0_public_datasets/fma_dataset/fma_split_15s
    - /apdcephfs_gy2/share_302507476/0_public_datasets/fma_dataset/fma_split_15s

  # Validation uses a single annotation file and a single audio root.
  val_json_files:
    - data/captioning/MusicCaps_test.json
  val_audio_dirs:
    - ../../data/LP_MusicCaps_MC/Audio

  sample_rate: 32000  # presumably Hz — confirm
  max_length: 10  # unit not visible here (presumably seconds) — TODO confirm
  # Same identifier as `textdec_name` in the decoder settings further down.
  tokenizer_type: HuggingFaceTB/SmolLM2-135M
  # The same value (129) appears again in the decoder settings further down.
  max_text_token_len: 129
  batch_size: 16
  num_workers: 8
  wav_aug: false
# Model settings: audio encoder, projector and text decoder.
# NOTE(review): this copy arrived with its indentation stripped. The nesting
# below is reconstructed from the key names — confirm against the code that
# reads this file.
model:
  encoder:
    audioenc_name: MATPAC
    matpac_ckpt_path: ./weights/matpac_plus_as_48_1_map_enconly.pt
    freeze: true
    # NOTE(review): the next three keys are assumed to belong to the encoder
    # because they sit between `freeze` and `projector` — confirm.
    ds_rate: 1
    c2l_first: true
    wo_repeat: true

  projector:
    proj_name: linearprojector
    d_in: 3840
    d_h: 2048
    d_out: 576  # presumably the decoder's input width — verify

  decoder:
    textdec_name: HuggingFaceTB/SmolLM2-135M
    max_text_token_len: 129
    freeze: false
    # NOTE(review): with use_lora false, `lora_config` is presumably unused.
    use_lora: false
    lora_config:
      r: 32
      lora_alpha: 128
      lora_dropout: 0.1
      # Quoted to make clear this is the string "none", not a YAML null.
      bias: 'none'
      target_modules: [q_proj, v_proj]

  # NOTE(review): placed at the `model` level; in the flattened copy it follows
  # `target_modules`, so it could instead belong to `decoder` — confirm.
  model_type: m2t
# Training-loop and optimizer settings.
# NOTE(review): this copy arrived with its indentation stripped. The nesting
# below is reconstructed from the key names — confirm against the code that
# reads this file.
training:
  epochs: 3
  clip_grad: 2
  dropout: 0.2
  seed: 42
  device: cuda
  validation_step: 3000

  # NOTE(review): every key from `lr` to the end of the section is assumed to
  # belong to `optim_args`; where that mapping ends is not recoverable from
  # the flattened copy — confirm.
  optim_args:
    lr: 0.0001
    # NOTE(review): `warmup_steps` (0) and `warmup_ratio` (0.05) are both set;
    # which one takes effect is not visible here.
    warmup_steps: 0
    scheduler: cosine
    optimizer_name: adam
    betas: [0.9, 0.999]
    # Keep the decimal point and signed exponent: YAML 1.1 loaders such as
    # PyYAML read `1e-8` as a string rather than a float.
    eps: 1.0e-08
    # NOTE(review): presumably ignored when optimizer_name is adam — confirm.
    momentum: 0.9
    warmup_ratio: 0.05
# (stray "|" table delimiter from the file-viewer extraction, commented out)