zhouchushu commited on
Commit
cd27f88
·
verified ·
1 Parent(s): 6c16d7d

Add files using upload-large-folder tool

Browse files
Files changed (50) hide show
  1. ast_1_AS20k/ast_new_audioset/checkpoint_1.pt +3 -0
  2. ast_1_AS20k/ast_new_audioset/checkpoint_11.pt +3 -0
  3. ast_1_AS20k/ast_new_audioset/checkpoint_15.pt +3 -0
  4. ast_1_AS20k/ast_new_audioset/checkpoint_16.pt +3 -0
  5. ast_1_AS20k/ast_new_audioset/checkpoint_2.pt +3 -0
  6. ast_1_AS20k/ast_new_audioset/checkpoint_20.pt +3 -0
  7. ast_1_AS20k/ast_new_audioset/checkpoint_21.pt +3 -0
  8. ast_1_AS20k/ast_new_audioset/checkpoint_22.pt +3 -0
  9. ast_1_AS20k/ast_new_audioset/checkpoint_3.pt +3 -0
  10. ast_1_AS20k/ast_new_audioset/checkpoint_9.pt +3 -0
  11. ast_1_AS20k/ast_new_audioset/checkpoint_best.pt +3 -0
  12. ast_1_AS20k/ast_origin_implement/test-balanced-f10-t10-pTrue-b12-lr5e-5-decoupe/result.csv +25 -0
  13. ast_1_AS20k/ast_origin_implement/test-balanced-f10-t10-pTrue-b12-lr5e-5-decoupe/train.log +837 -0
  14. ast_1_AS20k/ast_origin_implement/test-balanced-f10-t10-pTrue-b12-lr5e-5-decoupe/wa_result.csv +5 -0
  15. pre_4_AS2M/conv_clap_1_2025-09-30_06-58-32/pretraining_AS2M.sh +340 -0
  16. pre_4_AS2M/conv_clap_1_2025-09-30_06-59-40/pretraining_AS2M.sh +339 -0
  17. pre_4_AS2M/conv_clap_1_2025-09-30_07-01-07/pretraining_AS2M.sh +339 -0
  18. pre_4_AS2M/conv_clap_1_2025-09-30_07-08-58/pretraining_AS2M.sh +336 -0
  19. pre_4_AS2M/conv_clap_1_2025-09-30_07-14-17/pretraining_AS2M.sh +336 -0
  20. pre_4_AS2M/conv_clap_1_2025-09-30_07-19-43/pretraining_AS2M.sh +336 -0
  21. pre_4_AS2M/conv_clap_1_2025-09-30_07-25-52/pretraining_AS2M.sh +336 -0
  22. pre_4_AS2M/conv_clap_1_2025-09-30_08-31-42/pretraining_AS2M.sh +418 -0
  23. pre_4_AS2M/conv_clap_1_2025-09-30_08-31-59/pretraining_AS2M.sh +416 -0
  24. pre_4_AS2M/conv_clap_2_2025-09-30_09-12-51/pretraining_AS2M.sh +416 -0
  25. pre_4_AS2M/conv_clap_4_2025-09-30_07-37-48/pretraining_AS2M.sh +387 -0
  26. pre_4_AS2M/conv_clap_4_2025-09-30_07-38-18/pretraining_AS2M.sh +384 -0
  27. pre_4_AS2M/conv_clap_4_2025-09-30_07-42-31/pretraining_AS2M.sh +384 -0
  28. pre_4_AS2M/conv_clap_4_2025-09-30_07-45-39/pretraining_AS2M.sh +384 -0
  29. pre_4_AS2M/conv_clap_4_2025-09-30_07-49-28/pretraining_AS2M.sh +384 -0
  30. pre_4_AS2M/conv_clap_4_2025-09-30_07-57-18/pretraining_AS2M.sh +384 -0
  31. pre_4_AS2M/conv_clap_4_2025-09-30_08-05-21/pretraining_AS2M.sh +384 -0
  32. pre_4_AS2M/conv_clap_4_2025-09-30_08-13-17/pretraining_AS2M.sh +384 -0
  33. pre_4_AS2M/conv_clap_4_2025-09-30_08-23-09/pretraining_AS2M.sh +384 -0
  34. pre_4_AS2M/disp_0_2025-09-24_13-58-24/pretraining_AS2M.sh +246 -0
  35. pre_4_AS2M/disp_0_2025-09-24_14-09-31/pretraining_AS2M.sh +246 -0
  36. pre_4_AS2M/disp_0_2025-09-24_14-12-12/pretraining_AS2M.sh +246 -0
  37. pre_4_AS2M/disp_0_2025-09-24_14-17-47/pretraining_AS2M.sh +246 -0
  38. pre_4_AS2M/disp_1_2025-09-26_14-32-16/pretraining_AS2M.sh +258 -0
  39. pre_4_AS2M/disp_1_2025-09-26_14-33-34/pretraining_AS2M.sh +258 -0
  40. pre_4_AS2M/disp_1_2025-09-26_14-34-35/pretraining_AS2M.sh +258 -0
  41. pre_4_AS2M/disp_1_2025-09-26_14-39-04/pretraining_AS2M.sh +258 -0
  42. pre_4_AS2M/disp_1_2025-09-26_14-57-51/pretraining_AS2M.sh +258 -0
  43. pre_4_AS2M/disp_3_2025-09-27_05-57-32/pretraining_AS2M.sh +282 -0
  44. pre_4_AS2M/disp_4_2025-09-28_05-38-34/pretraining_AS2M.sh +294 -0
  45. pre_4_AS2M/disp_5_2025-09-28_06-51-25/pretraining_AS2M.sh +306 -0
  46. pre_4_AS2M/disp_5_2025-09-28_07-56-38/pretraining_AS2M.sh +318 -0
  47. pre_4_AS2M/disp_6_2025-09-28_08-28-48/pretraining_AS2M.sh +318 -0
  48. pre_4_AS2M/disp_6_2025-09-28_08-49-54/pretraining_AS2M.sh +318 -0
  49. pre_4_AS2M/disp_6_2025-09-28_08-55-19/pretraining_AS2M.sh +318 -0
  50. pre_4_AS2M/disp_6_2025-09-28_08-58-05/pretraining_AS2M.sh +318 -0
ast_1_AS20k/ast_new_audioset/checkpoint_1.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:22f4d1bdccbf34b3986cdfaf97bceaa9e82b4d3a8e011ea4e111904f294f8f6f
3
+ size 352586874
ast_1_AS20k/ast_new_audioset/checkpoint_11.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:153f81afacd4d63f575871978aeb9d4d0bfdeb0b4a77c2ce2a1564dd07608579
3
+ size 352587039
ast_1_AS20k/ast_new_audioset/checkpoint_15.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:10cb65aeca1f773590b23ce9d2d705aead1e54a46f5fdf886d06f5aac1f3da41
3
+ size 352587039
ast_1_AS20k/ast_new_audioset/checkpoint_16.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ebed3d5c168a32dadc61357b1a234ebf59651252ce1d5eb1880b6e469e82365b
3
+ size 352587039
ast_1_AS20k/ast_new_audioset/checkpoint_2.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2be2bef5d1da4dcbbcf358260ca6f88b9322fdd3308399302c897cde1413f8f7
3
+ size 352586874
ast_1_AS20k/ast_new_audioset/checkpoint_20.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c740289dd3129cdcb9e18ebbf228d47b9113f89cd5d4d0f5d6fe098e2a7ae9b6
3
+ size 352587039
ast_1_AS20k/ast_new_audioset/checkpoint_21.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be020848a6bbca8a5eb8de7f3c10109fbc4284f5b9edb364d011ca4bddc6fa52
3
+ size 352587039
ast_1_AS20k/ast_new_audioset/checkpoint_22.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:164f6312bb7956a80a31add7003edd9363dc9d69e28c76cdfd6223061bfc74c8
3
+ size 352587039
ast_1_AS20k/ast_new_audioset/checkpoint_3.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:34a58857d6206ff01e85982306ead2139a232cba9c49c334667a1893faf46abd
3
+ size 352586874
ast_1_AS20k/ast_new_audioset/checkpoint_9.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7da4625fa3f5a8bd3adabf856aa377917ae7da7ee2f177d83ec8e66094d14ead
3
+ size 352586874
ast_1_AS20k/ast_new_audioset/checkpoint_best.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d74a92edfb701a1e7fbfba743caab8d687ab122e679115e44d99b55809aa41ce
3
+ size 352587369
ast_1_AS20k/ast_origin_implement/test-balanced-f10-t10-pTrue-b12-lr5e-5-decoupe/result.csv ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 1.422225945558316940e-02,6.487563597807765037e-01,6.059869982241350601e-03,9.984290564287322534e-01,5.401801698178747557e-01,1.123867675634848823e-01,6.956494450569152832e-01,1.422225945558316940e-02,6.487563597807765037e-01,5.000000000000000240e-05
2
+ 5.325997280235295062e-02,8.158944676321755463e-01,1.306437393490925657e-02,9.709086275491799478e-01,1.272550891389265137e+00,2.355064642799161584e-02,6.952877044677734375e-01,5.093040520527827852e-02,8.065371915632957300e-01,5.000000000000000240e-05
3
+ 1.104590830848553862e-01,8.968618757921559270e-01,6.706614959781720398e-02,7.171915221978218957e-01,1.787383839223240622e+00,2.194607003870671647e-02,6.950482130050659180e-01,1.073419470179194091e-01,8.839559858178680507e-01,5.000000000000000240e-05
4
+ 1.848396644615896978e-01,9.313106471184702251e-01,7.167611495216880124e-02,6.694785396948753631e-01,2.100989061540869507e+00,2.017162883966943515e-02,6.948846578598022461e-01,1.791385835122070447e-01,9.215415158207399537e-01,5.000000000000000240e-05
5
+ 2.224462178579142690e-01,9.413168661608459775e-01,7.231485140869021999e-02,7.035087224749546619e-01,2.214551820998618137e+00,1.874961108785041033e-02,6.944540739059448242e-01,2.185575499842252467e-01,9.345964582357189077e-01,5.000000000000000240e-05
6
+ 2.548203316499514925e-01,9.479657052440092491e-01,5.870892769216756735e-02,7.725844616170880474e-01,2.298720947675281678e+00,1.763229149318959466e-02,6.945921778678894043e-01,2.526630268584063033e-01,9.435292535915005274e-01,5.000000000000000240e-05
7
+ 2.742160424132608632e-01,9.515682622442890315e-01,5.749621905254873044e-02,8.066989852343421363e-01,2.347952949601005201e+00,1.670030884553481282e-02,6.943714618682861328e-01,2.735843322137676004e-01,9.486307065585155573e-01,5.000000000000000240e-05
8
+ 2.891340671033661436e-01,9.539887857444313557e-01,5.727079787194905985e-02,8.220393549750605322e-01,2.382701716918723900e+00,1.596927918330596358e-02,6.942600607872009277e-01,2.902446482600923860e-01,9.520384123928512521e-01,5.000000000000000240e-05
9
+ 2.997937190939304331e-01,9.556058565055307596e-01,5.469515067152212751e-02,8.430501262804962481e-01,2.406743177850766635e+00,1.527043480806759661e-02,6.942504048347473145e-01,3.028899608327243476e-01,9.547648724123766195e-01,5.000000000000000240e-05
10
+ 3.019933285999398254e-01,9.551177780206882018e-01,5.673163717388598343e-02,8.369639127603817341e-01,2.399413530906429504e+00,1.471027910790956815e-02,6.941569447517395020e-01,3.115329036723915590e-01,9.565940278989387702e-01,5.000000000000000240e-05
11
+ 3.216990458954857579e-01,9.573572677069273063e-01,5.875842801928884279e-02,8.495200585745303901e-01,2.433591614166512151e+00,1.350063216294685418e-02,6.940920352935791016e-01,3.202949833204151719e-01,9.580394075942408882e-01,2.500000000000000120e-05
12
+ 3.237938459476596975e-01,9.568247991379073003e-01,5.627071898598238336e-02,8.472165610019362081e-01,2.425336344437045710e+00,1.277143365193674981e-02,6.940239071846008301e-01,3.264502469374453986e-01,9.591369993481135836e-01,2.500000000000000120e-05
13
+ 3.238792107002882448e-01,9.560063329207294514e-01,5.774346572337184930e-02,8.414383773149294310e-01,2.412805928578551029e+00,1.238875423823177294e-02,6.939673423767089844e-01,3.308619542913583955e-01,9.598927917725000869e-01,2.500000000000000120e-05
14
+ 3.206599597620702347e-01,9.540928202972318584e-01,5.840423268860757411e-02,8.374629739899399627e-01,2.384227802331127410e+00,1.193688203737030933e-02,6.939578056335449219e-01,3.341927086664225888e-01,9.604666643675140447e-01,2.500000000000000120e-05
15
+ 3.241879057377317075e-01,9.550509282205020822e-01,5.668243643471783388e-02,8.459431676912494424e-01,2.398414627994010839e+00,1.159679852697971178e-02,6.938989162445068359e-01,3.370616133181535967e-01,9.609689860314127863e-01,2.500000000000000120e-05
16
+ 3.274090426375390050e-01,9.538047907862127195e-01,5.882890954492734498e-02,8.395026159089084006e-01,2.380009463526119973e+00,1.087626258523142174e-02,6.938264966011047363e-01,3.390441351590992025e-01,9.612789191340032069e-01,1.250000000000000060e-05
17
+ 3.266298745594018449e-01,9.531561281365846794e-01,5.786502311129928383e-02,8.372581634183908772e-01,2.370586249390366440e+00,1.057763744308783116e-02,6.937884092330932617e-01,3.404189266645940570e-01,9.615116642607353104e-01,1.250000000000000060e-05
18
+ 3.272432009645968032e-01,9.523735520790914677e-01,5.759590126563685075e-02,8.398432031605835846e-01,2.359355951258224504e+00,1.041628441236315018e-02,6.938322186470031738e-01,3.419091536737948189e-01,9.616955903658975791e-01,1.250000000000000060e-05
19
+ 3.255807419210634546e-01,9.512071086194867631e-01,5.740117490775870773e-02,8.361371858623385389e-01,2.342888179552985672e+00,1.005493109991406793e-02,6.938048601150512695e-01,3.427275325615427026e-01,9.618161049645025384e-01,1.250000000000000060e-05
20
+ 3.260315090994287957e-01,9.512684697375038967e-01,5.997403778213243608e-02,8.288957385703106251e-01,2.343746581586883870e+00,9.951956874289869318e-03,6.936931610107421875e-01,3.433311674477271258e-01,9.618989983333274818e-01,1.250000000000000060e-05
21
+ 3.249040983002512983e-01,9.500949421687898688e-01,5.878458400191581557e-02,8.265363288594659297e-01,2.327477157438643030e+00,9.605016077793862572e-03,6.936856508255004883e-01,3.436332817873031242e-01,9.619345990419769787e-01,6.250000000000000300e-06
22
+ 3.255695187517096967e-01,9.499316465986847868e-01,5.797600592170489703e-02,8.279170449610905314e-01,2.325237544245208010e+00,9.480287065851862593e-03,6.936790347099304199e-01,3.441953690545042077e-01,9.619586781271860509e-01,6.250000000000000300e-06
23
+ 3.243047968373096723e-01,9.491688511518816540e-01,5.580597852971000417e-02,8.301318670729562754e-01,2.314852206236093224e+00,9.343003603446222577e-03,6.937055587768554688e-01,3.442617626297410083e-01,9.619489128942837475e-01,6.250000000000000300e-06
24
+ 3.250544555977981642e-01,9.489687673682015712e-01,5.894922958301676563e-02,8.262462971438052639e-01,2.312148661301067776e+00,9.364594131517802247e-03,6.936329603195190430e-01,3.443755436123956959e-01,9.619321524013675351e-01,6.250000000000000300e-06
25
+ 3.239834326417725396e-01,9.480251122897331850e-01,6.097054245045899906e-02,8.205525080094422385e-01,2.299510454271016968e+00,9.206688810194279052e-03,6.936240792274475098e-01,3.443998966946163476e-01,9.619017707066223055e-01,6.250000000000000300e-06
ast_1_AS20k/ast_origin_implement/test-balanced-f10-t10-pTrue-b12-lr5e-5-decoupe/train.log ADDED
@@ -0,0 +1,837 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ I am process 52222, running on zcs-cfc-eat-l-worker-0: starting (Wed Aug 27 02:40:44 2025)
2
+ now train a audio spectrogram transformer model
3
+ balanced sampler is not used
4
+ ---------------the train dataloader---------------
5
+ now using following mask: 48 freq, 192 time
6
+ now using mix-up with rate 0.500000
7
+ now process audioset
8
+ use dataset mean -4.268 and std 4.569 to normalize the input.
9
+ number of classes is 527
10
+ ---------------the evaluation dataloader---------------
11
+ now using following mask: 0 freq, 0 time
12
+ now using mix-up with rate 0.000000
13
+ now process audioset
14
+ use dataset mean -4.268 and std 4.569 to normalize the input.
15
+ number of classes is 527
16
+ ---------------AST Model Summary---------------
17
+ ImageNet pretraining: True, AudioSet pretraining: False
18
+ frequncey stride=10, time stride=10
19
+ number of patches=1212
20
+
21
+ Creating experiment directory: /opt/gpfs/home/chushu/exp/eat/ast_1_AS20k/ast_origin_implement/test-balanced-f10-t10-pTrue-b12-lr5e-5-decoupe
22
+ Now starting training for 25 epochs
23
+ running on cuda
24
+ Total parameter number is : 88.132 million
25
+ Total trainable parameter number is : 88.132 million
26
+ now training with audioset, main metrics: mAP, loss function: BCEWithLogitsLoss(), learning rate scheduler: <torch.optim.lr_scheduler.MultiStepLR object at 0x7f99dcde0df0>
27
+ The learning rate scheduler starts at 10 epoch with decay rate of 0.500 every 5 epochs
28
+ current #steps=0, #epochs=1
29
+ start training...
30
+ ---------------
31
+ 2025-08-27 02:40:45.711145
32
+ current #epochs=1, #steps=0
33
+ warm-up learning rate is 0.000000
34
+ warm-up learning rate is 0.000003
35
+ warm-up learning rate is 0.000005
36
+ Epoch: [1][100/1713] Per Sample Total Time 0.01385 Per Sample Data Time 0.00061 Per Sample DNN Time 0.01324 Train Loss 0.6809
37
+ warm-up learning rate is 0.000008
38
+ warm-up learning rate is 0.000010
39
+ Epoch: [1][200/1713] Per Sample Total Time 0.01299 Per Sample Data Time 0.00033 Per Sample DNN Time 0.01267 Train Loss 0.5411
40
+ warm-up learning rate is 0.000013
41
+ warm-up learning rate is 0.000015
42
+ Epoch: [1][300/1713] Per Sample Total Time 0.01253 Per Sample Data Time 0.00023 Per Sample DNN Time 0.01230 Train Loss 0.4430
43
+ warm-up learning rate is 0.000017
44
+ warm-up learning rate is 0.000020
45
+ Epoch: [1][400/1713] Per Sample Total Time 0.01241 Per Sample Data Time 0.00018 Per Sample DNN Time 0.01222 Train Loss 0.3676
46
+ warm-up learning rate is 0.000023
47
+ warm-up learning rate is 0.000025
48
+ Epoch: [1][500/1713] Per Sample Total Time 0.01238 Per Sample Data Time 0.00016 Per Sample DNN Time 0.01223 Train Loss 0.3109
49
+ warm-up learning rate is 0.000028
50
+ warm-up learning rate is 0.000030
51
+ Epoch: [1][600/1713] Per Sample Total Time 0.01228 Per Sample Data Time 0.00014 Per Sample DNN Time 0.01215 Train Loss 0.2684
52
+ warm-up learning rate is 0.000033
53
+ warm-up learning rate is 0.000035
54
+ Epoch: [1][700/1713] Per Sample Total Time 0.01227 Per Sample Data Time 0.00012 Per Sample DNN Time 0.01215 Train Loss 0.2360
55
+ warm-up learning rate is 0.000038
56
+ warm-up learning rate is 0.000040
57
+ Epoch: [1][800/1713] Per Sample Total Time 0.01229 Per Sample Data Time 0.00011 Per Sample DNN Time 0.01218 Train Loss 0.2108
58
+ warm-up learning rate is 0.000043
59
+ warm-up learning rate is 0.000045
60
+ Epoch: [1][900/1713] Per Sample Total Time 0.01228 Per Sample Data Time 0.00011 Per Sample DNN Time 0.01217 Train Loss 0.1908
61
+ warm-up learning rate is 0.000048
62
+ warm-up learning rate is 0.000050
63
+ Epoch: [1][1000/1713] Per Sample Total Time 0.01227 Per Sample Data Time 0.00010 Per Sample DNN Time 0.01217 Train Loss 0.1745
64
+ Epoch: [1][1100/1713] Per Sample Total Time 0.01217 Per Sample Data Time 0.00009 Per Sample DNN Time 0.01208 Train Loss 0.1610
65
+ Epoch: [1][1200/1713] Per Sample Total Time 0.01217 Per Sample Data Time 0.00009 Per Sample DNN Time 0.01208 Train Loss 0.1497
66
+ Epoch: [1][1300/1713] Per Sample Total Time 0.01218 Per Sample Data Time 0.00009 Per Sample DNN Time 0.01209 Train Loss 0.1402
67
+ Epoch: [1][1400/1713] Per Sample Total Time 0.01217 Per Sample Data Time 0.00008 Per Sample DNN Time 0.01209 Train Loss 0.1319
68
+ Epoch: [1][1500/1713] Per Sample Total Time 0.01214 Per Sample Data Time 0.00008 Per Sample DNN Time 0.01206 Train Loss 0.1248
69
+ Epoch: [1][1600/1713] Per Sample Total Time 0.01211 Per Sample Data Time 0.00008 Per Sample DNN Time 0.01203 Train Loss 0.1185
70
+ Epoch: [1][1700/1713] Per Sample Total Time 0.01205 Per Sample Data Time 0.00007 Per Sample DNN Time 0.01198 Train Loss 0.1130
71
+ start validation
72
+ mAP: 0.014222
73
+ AUC: 0.648756
74
+ Avg Precision: 0.006060
75
+ Avg Recall: 0.998429
76
+ d_prime: 0.540180
77
+ train_loss: 0.112387
78
+ valid_loss: 0.695649
79
+ validation finished
80
+ Epoch-1 lr: 5e-05
81
+ epoch 1 training time: 327.615
82
+ ---------------
83
+ 2025-08-27 02:46:13.326272
84
+ current #epochs=2, #steps=1713
85
+ Epoch: [2][87/1713] Per Sample Total Time 0.01450 Per Sample Data Time 0.00147 Per Sample DNN Time 0.01303 Train Loss 0.0242
86
+ Epoch: [2][187/1713] Per Sample Total Time 0.01221 Per Sample Data Time 0.00071 Per Sample DNN Time 0.01150 Train Loss 0.0241
87
+ Epoch: [2][287/1713] Per Sample Total Time 0.01223 Per Sample Data Time 0.00048 Per Sample DNN Time 0.01175 Train Loss 0.0241
88
+ Epoch: [2][387/1713] Per Sample Total Time 0.01234 Per Sample Data Time 0.00036 Per Sample DNN Time 0.01198 Train Loss 0.0242
89
+ Epoch: [2][487/1713] Per Sample Total Time 0.01244 Per Sample Data Time 0.00030 Per Sample DNN Time 0.01214 Train Loss 0.0241
90
+ Epoch: [2][587/1713] Per Sample Total Time 0.01185 Per Sample Data Time 0.00025 Per Sample DNN Time 0.01159 Train Loss 0.0240
91
+ Epoch: [2][687/1713] Per Sample Total Time 0.01197 Per Sample Data Time 0.00022 Per Sample DNN Time 0.01174 Train Loss 0.0240
92
+ Epoch: [2][787/1713] Per Sample Total Time 0.01209 Per Sample Data Time 0.00020 Per Sample DNN Time 0.01189 Train Loss 0.0239
93
+ Epoch: [2][887/1713] Per Sample Total Time 0.01219 Per Sample Data Time 0.00018 Per Sample DNN Time 0.01200 Train Loss 0.0239
94
+ Epoch: [2][987/1713] Per Sample Total Time 0.01225 Per Sample Data Time 0.00017 Per Sample DNN Time 0.01208 Train Loss 0.0239
95
+ Epoch: [2][1087/1713] Per Sample Total Time 0.01196 Per Sample Data Time 0.00016 Per Sample DNN Time 0.01181 Train Loss 0.0238
96
+ Epoch: [2][1187/1713] Per Sample Total Time 0.01201 Per Sample Data Time 0.00015 Per Sample DNN Time 0.01187 Train Loss 0.0238
97
+ Epoch: [2][1287/1713] Per Sample Total Time 0.01205 Per Sample Data Time 0.00014 Per Sample DNN Time 0.01190 Train Loss 0.0237
98
+ Epoch: [2][1387/1713] Per Sample Total Time 0.01206 Per Sample Data Time 0.00013 Per Sample DNN Time 0.01193 Train Loss 0.0237
99
+ Epoch: [2][1487/1713] Per Sample Total Time 0.01183 Per Sample Data Time 0.00013 Per Sample DNN Time 0.01171 Train Loss 0.0236
100
+ Epoch: [2][1587/1713] Per Sample Total Time 0.01184 Per Sample Data Time 0.00012 Per Sample DNN Time 0.01171 Train Loss 0.0236
101
+ Epoch: [2][1687/1713] Per Sample Total Time 0.01187 Per Sample Data Time 0.00012 Per Sample DNN Time 0.01175 Train Loss 0.0236
102
+ start validation
103
+ mAP: 0.053260
104
+ AUC: 0.815894
105
+ Avg Precision: 0.013064
106
+ Avg Recall: 0.970909
107
+ d_prime: 1.272551
108
+ train_loss: 0.023551
109
+ valid_loss: 0.695288
110
+ validation finished
111
+ Epoch-2 lr: 5e-05
112
+ epoch 2 training time: 324.709
113
+ ---------------
114
+ 2025-08-27 02:51:38.034980
115
+ current #epochs=3, #steps=3426
116
+ Epoch: [3][74/1713] Per Sample Total Time 0.01292 Per Sample Data Time 0.00173 Per Sample DNN Time 0.01119 Train Loss 0.0230
117
+ Epoch: [3][174/1713] Per Sample Total Time 0.01254 Per Sample Data Time 0.00077 Per Sample DNN Time 0.01177 Train Loss 0.0228
118
+ Epoch: [3][274/1713] Per Sample Total Time 0.01243 Per Sample Data Time 0.00050 Per Sample DNN Time 0.01192 Train Loss 0.0228
119
+ Epoch: [3][374/1713] Per Sample Total Time 0.01241 Per Sample Data Time 0.00038 Per Sample DNN Time 0.01203 Train Loss 0.0228
120
+ Epoch: [3][474/1713] Per Sample Total Time 0.01191 Per Sample Data Time 0.00031 Per Sample DNN Time 0.01160 Train Loss 0.0226
121
+ Epoch: [3][574/1713] Per Sample Total Time 0.01196 Per Sample Data Time 0.00026 Per Sample DNN Time 0.01169 Train Loss 0.0226
122
+ Epoch: [3][674/1713] Per Sample Total Time 0.01200 Per Sample Data Time 0.00023 Per Sample DNN Time 0.01177 Train Loss 0.0226
123
+ Epoch: [3][774/1713] Per Sample Total Time 0.01170 Per Sample Data Time 0.00021 Per Sample DNN Time 0.01149 Train Loss 0.0225
124
+ Epoch: [3][874/1713] Per Sample Total Time 0.01170 Per Sample Data Time 0.00019 Per Sample DNN Time 0.01151 Train Loss 0.0224
125
+ Epoch: [3][974/1713] Per Sample Total Time 0.01178 Per Sample Data Time 0.00017 Per Sample DNN Time 0.01160 Train Loss 0.0223
126
+ Epoch: [3][1074/1713] Per Sample Total Time 0.01184 Per Sample Data Time 0.00016 Per Sample DNN Time 0.01168 Train Loss 0.0223
127
+ Epoch: [3][1174/1713] Per Sample Total Time 0.01168 Per Sample Data Time 0.00015 Per Sample DNN Time 0.01153 Train Loss 0.0222
128
+ Epoch: [3][1274/1713] Per Sample Total Time 0.01166 Per Sample Data Time 0.00014 Per Sample DNN Time 0.01152 Train Loss 0.0221
129
+ Epoch: [3][1374/1713] Per Sample Total Time 0.01173 Per Sample Data Time 0.00013 Per Sample DNN Time 0.01160 Train Loss 0.0221
130
+ Epoch: [3][1474/1713] Per Sample Total Time 0.01174 Per Sample Data Time 0.00013 Per Sample DNN Time 0.01161 Train Loss 0.0221
131
+ Epoch: [3][1574/1713] Per Sample Total Time 0.01178 Per Sample Data Time 0.00012 Per Sample DNN Time 0.01165 Train Loss 0.0220
132
+ Epoch: [3][1674/1713] Per Sample Total Time 0.01168 Per Sample Data Time 0.00012 Per Sample DNN Time 0.01156 Train Loss 0.0220
133
+ start validation
134
+ mAP: 0.110459
135
+ AUC: 0.896862
136
+ Avg Precision: 0.067066
137
+ Avg Recall: 0.717192
138
+ d_prime: 1.787384
139
+ train_loss: 0.021946
140
+ valid_loss: 0.695048
141
+ validation finished
142
+ Epoch-3 lr: 5e-05
143
+ epoch 3 training time: 324.807
144
+ ---------------
145
+ 2025-08-27 02:57:02.842395
146
+ current #epochs=4, #steps=5139
147
+ Epoch: [4][61/1713] Per Sample Total Time 0.01467 Per Sample Data Time 0.00201 Per Sample DNN Time 0.01265 Train Loss 0.0211
148
+ Epoch: [4][161/1713] Per Sample Total Time 0.01335 Per Sample Data Time 0.00080 Per Sample DNN Time 0.01255 Train Loss 0.0209
149
+ Epoch: [4][261/1713] Per Sample Total Time 0.01297 Per Sample Data Time 0.00051 Per Sample DNN Time 0.01245 Train Loss 0.0208
150
+ Epoch: [4][361/1713] Per Sample Total Time 0.01263 Per Sample Data Time 0.00038 Per Sample DNN Time 0.01225 Train Loss 0.0209
151
+ Epoch: [4][461/1713] Per Sample Total Time 0.01237 Per Sample Data Time 0.00031 Per Sample DNN Time 0.01206 Train Loss 0.0208
152
+ Epoch: [4][561/1713] Per Sample Total Time 0.01234 Per Sample Data Time 0.00026 Per Sample DNN Time 0.01208 Train Loss 0.0207
153
+ Epoch: [4][661/1713] Per Sample Total Time 0.01232 Per Sample Data Time 0.00023 Per Sample DNN Time 0.01209 Train Loss 0.0207
154
+ Epoch: [4][761/1713] Per Sample Total Time 0.01233 Per Sample Data Time 0.00021 Per Sample DNN Time 0.01213 Train Loss 0.0207
155
+ Epoch: [4][861/1713] Per Sample Total Time 0.01227 Per Sample Data Time 0.00019 Per Sample DNN Time 0.01208 Train Loss 0.0206
156
+ Epoch: [4][961/1713] Per Sample Total Time 0.01222 Per Sample Data Time 0.00017 Per Sample DNN Time 0.01205 Train Loss 0.0205
157
+ Epoch: [4][1061/1713] Per Sample Total Time 0.01197 Per Sample Data Time 0.00016 Per Sample DNN Time 0.01181 Train Loss 0.0205
158
+ Epoch: [4][1161/1713] Per Sample Total Time 0.01199 Per Sample Data Time 0.00015 Per Sample DNN Time 0.01185 Train Loss 0.0204
159
+ Epoch: [4][1261/1713] Per Sample Total Time 0.01204 Per Sample Data Time 0.00014 Per Sample DNN Time 0.01190 Train Loss 0.0204
160
+ Epoch: [4][1361/1713] Per Sample Total Time 0.01205 Per Sample Data Time 0.00013 Per Sample DNN Time 0.01192 Train Loss 0.0203
161
+ Epoch: [4][1461/1713] Per Sample Total Time 0.01204 Per Sample Data Time 0.00013 Per Sample DNN Time 0.01192 Train Loss 0.0203
162
+ Epoch: [4][1561/1713] Per Sample Total Time 0.01199 Per Sample Data Time 0.00012 Per Sample DNN Time 0.01187 Train Loss 0.0202
163
+ Epoch: [4][1661/1713] Per Sample Total Time 0.01202 Per Sample Data Time 0.00012 Per Sample DNN Time 0.01190 Train Loss 0.0202
164
+ start validation
165
+ mAP: 0.184840
166
+ AUC: 0.931311
167
+ Avg Precision: 0.071676
168
+ Avg Recall: 0.669479
169
+ d_prime: 2.100989
170
+ train_loss: 0.020172
171
+ valid_loss: 0.694885
172
+ validation finished
173
+ Epoch-4 lr: 5e-05
174
+ epoch 4 training time: 330.929
175
+ ---------------
176
+ 2025-08-27 03:02:33.771092
177
+ current #epochs=5, #steps=6852
178
+ Epoch: [5][48/1713] Per Sample Total Time 0.01044 Per Sample Data Time 0.00242 Per Sample DNN Time 0.00802 Train Loss 0.0193
179
+ Epoch: [5][148/1713] Per Sample Total Time 0.01150 Per Sample Data Time 0.00082 Per Sample DNN Time 0.01068 Train Loss 0.0191
180
+ Epoch: [5][248/1713] Per Sample Total Time 0.01172 Per Sample Data Time 0.00051 Per Sample DNN Time 0.01121 Train Loss 0.0190
181
+ Epoch: [5][348/1713] Per Sample Total Time 0.01187 Per Sample Data Time 0.00038 Per Sample DNN Time 0.01149 Train Loss 0.0190
182
+ Epoch: [5][448/1713] Per Sample Total Time 0.01133 Per Sample Data Time 0.00030 Per Sample DNN Time 0.01103 Train Loss 0.0190
183
+ Epoch: [5][548/1713] Per Sample Total Time 0.01143 Per Sample Data Time 0.00025 Per Sample DNN Time 0.01118 Train Loss 0.0191
184
+ Epoch: [5][648/1713] Per Sample Total Time 0.01154 Per Sample Data Time 0.00022 Per Sample DNN Time 0.01132 Train Loss 0.0190
185
+ Epoch: [5][748/1713] Per Sample Total Time 0.01166 Per Sample Data Time 0.00020 Per Sample DNN Time 0.01146 Train Loss 0.0189
186
+ Epoch: [5][848/1713] Per Sample Total Time 0.01167 Per Sample Data Time 0.00018 Per Sample DNN Time 0.01149 Train Loss 0.0189
187
+ Epoch: [5][948/1713] Per Sample Total Time 0.01143 Per Sample Data Time 0.00016 Per Sample DNN Time 0.01126 Train Loss 0.0190
188
+ Epoch: [5][1048/1713] Per Sample Total Time 0.01153 Per Sample Data Time 0.00015 Per Sample DNN Time 0.01138 Train Loss 0.0189
189
+ Epoch: [5][1148/1713] Per Sample Total Time 0.01166 Per Sample Data Time 0.00014 Per Sample DNN Time 0.01152 Train Loss 0.0189
190
+ Epoch: [5][1248/1713] Per Sample Total Time 0.01169 Per Sample Data Time 0.00014 Per Sample DNN Time 0.01155 Train Loss 0.0189
191
+ Epoch: [5][1348/1713] Per Sample Total Time 0.01164 Per Sample Data Time 0.00013 Per Sample DNN Time 0.01151 Train Loss 0.0189
192
+ Epoch: [5][1448/1713] Per Sample Total Time 0.01169 Per Sample Data Time 0.00012 Per Sample DNN Time 0.01157 Train Loss 0.0188
193
+ Epoch: [5][1548/1713] Per Sample Total Time 0.01173 Per Sample Data Time 0.00012 Per Sample DNN Time 0.01161 Train Loss 0.0188
194
+ Epoch: [5][1648/1713] Per Sample Total Time 0.01176 Per Sample Data Time 0.00011 Per Sample DNN Time 0.01164 Train Loss 0.0187
195
+ start validation
196
+ mAP: 0.222446
197
+ AUC: 0.941317
198
+ Avg Precision: 0.072315
199
+ Avg Recall: 0.703509
200
+ d_prime: 2.214552
201
+ train_loss: 0.018750
202
+ valid_loss: 0.694454
203
+ validation finished
204
+ Epoch-5 lr: 5e-05
205
+ epoch 5 training time: 325.961
206
+ ---------------
207
+ 2025-08-27 03:07:59.731986
208
+ current #epochs=6, #steps=8565
209
+ Epoch: [6][35/1713] Per Sample Total Time 0.01637 Per Sample Data Time 0.00389 Per Sample DNN Time 0.01247 Train Loss 0.0174
210
+ Epoch: [6][135/1713] Per Sample Total Time 0.01327 Per Sample Data Time 0.00107 Per Sample DNN Time 0.01220 Train Loss 0.0180
211
+ Epoch: [6][235/1713] Per Sample Total Time 0.01290 Per Sample Data Time 0.00064 Per Sample DNN Time 0.01226 Train Loss 0.0179
212
+ Epoch: [6][335/1713] Per Sample Total Time 0.01226 Per Sample Data Time 0.00046 Per Sample DNN Time 0.01180 Train Loss 0.0179
213
+ Epoch: [6][435/1713] Per Sample Total Time 0.01228 Per Sample Data Time 0.00036 Per Sample DNN Time 0.01191 Train Loss 0.0179
214
+ Epoch: [6][535/1713] Per Sample Total Time 0.01234 Per Sample Data Time 0.00030 Per Sample DNN Time 0.01203 Train Loss 0.0178
215
+ Epoch: [6][635/1713] Per Sample Total Time 0.01232 Per Sample Data Time 0.00026 Per Sample DNN Time 0.01205 Train Loss 0.0178
216
+ Epoch: [6][735/1713] Per Sample Total Time 0.01214 Per Sample Data Time 0.00023 Per Sample DNN Time 0.01191 Train Loss 0.0178
217
+ Epoch: [6][835/1713] Per Sample Total Time 0.01211 Per Sample Data Time 0.00021 Per Sample DNN Time 0.01190 Train Loss 0.0178
218
+ Epoch: [6][935/1713] Per Sample Total Time 0.01213 Per Sample Data Time 0.00019 Per Sample DNN Time 0.01193 Train Loss 0.0177
219
+ Epoch: [6][1035/1713] Per Sample Total Time 0.01215 Per Sample Data Time 0.00018 Per Sample DNN Time 0.01197 Train Loss 0.0177
220
+ Epoch: [6][1135/1713] Per Sample Total Time 0.01212 Per Sample Data Time 0.00017 Per Sample DNN Time 0.01196 Train Loss 0.0177
221
+ Epoch: [6][1235/1713] Per Sample Total Time 0.01210 Per Sample Data Time 0.00016 Per Sample DNN Time 0.01195 Train Loss 0.0177
222
+ Epoch: [6][1335/1713] Per Sample Total Time 0.01211 Per Sample Data Time 0.00015 Per Sample DNN Time 0.01196 Train Loss 0.0177
223
+ Epoch: [6][1435/1713] Per Sample Total Time 0.01212 Per Sample Data Time 0.00014 Per Sample DNN Time 0.01197 Train Loss 0.0177
224
+ Epoch: [6][1535/1713] Per Sample Total Time 0.01212 Per Sample Data Time 0.00014 Per Sample DNN Time 0.01198 Train Loss 0.0177
225
+ Epoch: [6][1635/1713] Per Sample Total Time 0.01210 Per Sample Data Time 0.00013 Per Sample DNN Time 0.01197 Train Loss 0.0176
226
+ start validation
227
+ mAP: 0.254820
228
+ AUC: 0.947966
229
+ Avg Precision: 0.058709
230
+ Avg Recall: 0.772584
231
+ d_prime: 2.298721
232
+ train_loss: 0.017632
233
+ valid_loss: 0.694592
234
+ validation finished
235
+ Epoch-6 lr: 5e-05
236
+ epoch 6 training time: 334.468
237
+ ---------------
238
+ 2025-08-27 03:13:34.200040
239
+ current #epochs=7, #steps=10278
240
+ Epoch: [7][22/1713] Per Sample Total Time 0.01837 Per Sample Data Time 0.00587 Per Sample DNN Time 0.01250 Train Loss 0.0174
241
+ Epoch: [7][122/1713] Per Sample Total Time 0.01342 Per Sample Data Time 0.00113 Per Sample DNN Time 0.01229 Train Loss 0.0168
242
+ Epoch: [7][222/1713] Per Sample Total Time 0.01290 Per Sample Data Time 0.00065 Per Sample DNN Time 0.01225 Train Loss 0.0167
243
+ Epoch: [7][322/1713] Per Sample Total Time 0.01259 Per Sample Data Time 0.00046 Per Sample DNN Time 0.01213 Train Loss 0.0167
244
+ Epoch: [7][422/1713] Per Sample Total Time 0.01244 Per Sample Data Time 0.00036 Per Sample DNN Time 0.01208 Train Loss 0.0167
245
+ Epoch: [7][522/1713] Per Sample Total Time 0.01226 Per Sample Data Time 0.00030 Per Sample DNN Time 0.01196 Train Loss 0.0168
246
+ Epoch: [7][622/1713] Per Sample Total Time 0.01228 Per Sample Data Time 0.00026 Per Sample DNN Time 0.01202 Train Loss 0.0167
247
+ Epoch: [7][722/1713] Per Sample Total Time 0.01201 Per Sample Data Time 0.00023 Per Sample DNN Time 0.01178 Train Loss 0.0167
248
+ Epoch: [7][822/1713] Per Sample Total Time 0.01205 Per Sample Data Time 0.00021 Per Sample DNN Time 0.01184 Train Loss 0.0168
249
+ Epoch: [7][922/1713] Per Sample Total Time 0.01210 Per Sample Data Time 0.00019 Per Sample DNN Time 0.01191 Train Loss 0.0168
250
+ Epoch: [7][1022/1713] Per Sample Total Time 0.01213 Per Sample Data Time 0.00017 Per Sample DNN Time 0.01195 Train Loss 0.0168
251
+ Epoch: [7][1122/1713] Per Sample Total Time 0.01215 Per Sample Data Time 0.00016 Per Sample DNN Time 0.01199 Train Loss 0.0168
252
+ Epoch: [7][1222/1713] Per Sample Total Time 0.01218 Per Sample Data Time 0.00015 Per Sample DNN Time 0.01202 Train Loss 0.0167
253
+ Epoch: [7][1322/1713] Per Sample Total Time 0.01219 Per Sample Data Time 0.00015 Per Sample DNN Time 0.01205 Train Loss 0.0167
254
+ Epoch: [7][1422/1713] Per Sample Total Time 0.01221 Per Sample Data Time 0.00014 Per Sample DNN Time 0.01207 Train Loss 0.0167
255
+ Epoch: [7][1522/1713] Per Sample Total Time 0.01222 Per Sample Data Time 0.00013 Per Sample DNN Time 0.01209 Train Loss 0.0167
256
+ Epoch: [7][1622/1713] Per Sample Total Time 0.01211 Per Sample Data Time 0.00013 Per Sample DNN Time 0.01199 Train Loss 0.0167
257
+ start validation
258
+ mAP: 0.274216
259
+ AUC: 0.951568
260
+ Avg Precision: 0.057496
261
+ Avg Recall: 0.806699
262
+ d_prime: 2.347953
263
+ train_loss: 0.016700
264
+ valid_loss: 0.694371
265
+ validation finished
266
+ Epoch-7 lr: 5e-05
267
+ epoch 7 training time: 338.084
268
+ ---------------
269
+ 2025-08-27 03:19:12.284334
270
+ current #epochs=8, #steps=11991
271
+ Epoch: [8][9/1713] Per Sample Total Time 0.02563 Per Sample Data Time 0.01307 Per Sample DNN Time 0.01255 Train Loss 0.0155
272
+ Epoch: [8][109/1713] Per Sample Total Time 0.01330 Per Sample Data Time 0.00123 Per Sample DNN Time 0.01207 Train Loss 0.0156
273
+ Epoch: [8][209/1713] Per Sample Total Time 0.01231 Per Sample Data Time 0.00066 Per Sample DNN Time 0.01165 Train Loss 0.0157
274
+ Epoch: [8][309/1713] Per Sample Total Time 0.01212 Per Sample Data Time 0.00047 Per Sample DNN Time 0.01166 Train Loss 0.0158
275
+ Epoch: [8][409/1713] Per Sample Total Time 0.01208 Per Sample Data Time 0.00036 Per Sample DNN Time 0.01172 Train Loss 0.0159
276
+ Epoch: [8][509/1713] Per Sample Total Time 0.01211 Per Sample Data Time 0.00030 Per Sample DNN Time 0.01181 Train Loss 0.0159
277
+ Epoch: [8][609/1713] Per Sample Total Time 0.01189 Per Sample Data Time 0.00026 Per Sample DNN Time 0.01163 Train Loss 0.0159
278
+ Epoch: [8][709/1713] Per Sample Total Time 0.01173 Per Sample Data Time 0.00023 Per Sample DNN Time 0.01150 Train Loss 0.0159
279
+ Epoch: [8][809/1713] Per Sample Total Time 0.01175 Per Sample Data Time 0.00020 Per Sample DNN Time 0.01154 Train Loss 0.0159
280
+ Epoch: [8][909/1713] Per Sample Total Time 0.01177 Per Sample Data Time 0.00019 Per Sample DNN Time 0.01158 Train Loss 0.0160
281
+ Epoch: [8][1009/1713] Per Sample Total Time 0.01182 Per Sample Data Time 0.00017 Per Sample DNN Time 0.01165 Train Loss 0.0159
282
+ Epoch: [8][1109/1713] Per Sample Total Time 0.01170 Per Sample Data Time 0.00016 Per Sample DNN Time 0.01154 Train Loss 0.0160
283
+ Epoch: [8][1209/1713] Per Sample Total Time 0.01175 Per Sample Data Time 0.00015 Per Sample DNN Time 0.01160 Train Loss 0.0160
284
+ Epoch: [8][1309/1713] Per Sample Total Time 0.01179 Per Sample Data Time 0.00014 Per Sample DNN Time 0.01164 Train Loss 0.0160
285
+ Epoch: [8][1409/1713] Per Sample Total Time 0.01180 Per Sample Data Time 0.00014 Per Sample DNN Time 0.01167 Train Loss 0.0160
286
+ Epoch: [8][1509/1713] Per Sample Total Time 0.01171 Per Sample Data Time 0.00013 Per Sample DNN Time 0.01159 Train Loss 0.0160
287
+ Epoch: [8][1609/1713] Per Sample Total Time 0.01174 Per Sample Data Time 0.00012 Per Sample DNN Time 0.01162 Train Loss 0.0160
288
+ Epoch: [8][1709/1713] Per Sample Total Time 0.01177 Per Sample Data Time 0.00012 Per Sample DNN Time 0.01165 Train Loss 0.0160
289
+ start validation
290
+ mAP: 0.289134
291
+ AUC: 0.953989
292
+ Avg Precision: 0.057271
293
+ Avg Recall: 0.822039
294
+ d_prime: 2.382702
295
+ train_loss: 0.015969
296
+ valid_loss: 0.694260
297
+ validation finished
298
+ Epoch-8 lr: 5e-05
299
+ epoch 8 training time: 326.545
300
+ ---------------
301
+ 2025-08-27 03:24:38.829389
302
+ current #epochs=9, #steps=13704
303
+ Epoch: [9][96/1713] Per Sample Total Time 0.01018 Per Sample Data Time 0.00133 Per Sample DNN Time 0.00884 Train Loss 0.0148
304
+ Epoch: [9][196/1713] Per Sample Total Time 0.01046 Per Sample Data Time 0.00067 Per Sample DNN Time 0.00978 Train Loss 0.0150
305
+ Epoch: [9][296/1713] Per Sample Total Time 0.01110 Per Sample Data Time 0.00046 Per Sample DNN Time 0.01064 Train Loss 0.0150
306
+ Epoch: [9][396/1713] Per Sample Total Time 0.01141 Per Sample Data Time 0.00036 Per Sample DNN Time 0.01106 Train Loss 0.0150
307
+ Epoch: [9][496/1713] Per Sample Total Time 0.01171 Per Sample Data Time 0.00029 Per Sample DNN Time 0.01142 Train Loss 0.0150
308
+ Epoch: [9][596/1713] Per Sample Total Time 0.01158 Per Sample Data Time 0.00025 Per Sample DNN Time 0.01132 Train Loss 0.0151
309
+ Epoch: [9][696/1713] Per Sample Total Time 0.01148 Per Sample Data Time 0.00022 Per Sample DNN Time 0.01126 Train Loss 0.0151
310
+ Epoch: [9][796/1713] Per Sample Total Time 0.01162 Per Sample Data Time 0.00020 Per Sample DNN Time 0.01142 Train Loss 0.0152
311
+ Epoch: [9][896/1713] Per Sample Total Time 0.01169 Per Sample Data Time 0.00018 Per Sample DNN Time 0.01151 Train Loss 0.0152
312
+ Epoch: [9][996/1713] Per Sample Total Time 0.01175 Per Sample Data Time 0.00017 Per Sample DNN Time 0.01159 Train Loss 0.0152
313
+ Epoch: [9][1096/1713] Per Sample Total Time 0.01171 Per Sample Data Time 0.00016 Per Sample DNN Time 0.01155 Train Loss 0.0152
314
+ Epoch: [9][1196/1713] Per Sample Total Time 0.01167 Per Sample Data Time 0.00015 Per Sample DNN Time 0.01153 Train Loss 0.0152
315
+ Epoch: [9][1296/1713] Per Sample Total Time 0.01176 Per Sample Data Time 0.00014 Per Sample DNN Time 0.01162 Train Loss 0.0152
316
+ Epoch: [9][1396/1713] Per Sample Total Time 0.01179 Per Sample Data Time 0.00013 Per Sample DNN Time 0.01166 Train Loss 0.0152
317
+ Epoch: [9][1496/1713] Per Sample Total Time 0.01179 Per Sample Data Time 0.00013 Per Sample DNN Time 0.01166 Train Loss 0.0153
318
+ Epoch: [9][1596/1713] Per Sample Total Time 0.01184 Per Sample Data Time 0.00012 Per Sample DNN Time 0.01172 Train Loss 0.0153
319
+ Epoch: [9][1696/1713] Per Sample Total Time 0.01188 Per Sample Data Time 0.00012 Per Sample DNN Time 0.01176 Train Loss 0.0153
320
+ start validation
321
+ mAP: 0.299794
322
+ AUC: 0.955606
323
+ Avg Precision: 0.054695
324
+ Avg Recall: 0.843050
325
+ d_prime: 2.406743
326
+ train_loss: 0.015270
327
+ valid_loss: 0.694250
328
+ validation finished
329
+ Epoch-9 lr: 5e-05
330
+ epoch 9 training time: 328.892
331
+ ---------------
332
+ 2025-08-27 03:30:07.721315
333
+ current #epochs=10, #steps=15417
334
+ Epoch: [10][83/1713] Per Sample Total Time 0.01054 Per Sample Data Time 0.00211 Per Sample DNN Time 0.00843 Train Loss 0.0144
335
+ Epoch: [10][183/1713] Per Sample Total Time 0.01190 Per Sample Data Time 0.00099 Per Sample DNN Time 0.01090 Train Loss 0.0145
336
+ Epoch: [10][283/1713] Per Sample Total Time 0.01228 Per Sample Data Time 0.00066 Per Sample DNN Time 0.01162 Train Loss 0.0145
337
+ Epoch: [10][383/1713] Per Sample Total Time 0.01242 Per Sample Data Time 0.00050 Per Sample DNN Time 0.01192 Train Loss 0.0146
338
+ Epoch: [10][483/1713] Per Sample Total Time 0.01206 Per Sample Data Time 0.00040 Per Sample DNN Time 0.01166 Train Loss 0.0146
339
+ Epoch: [10][583/1713] Per Sample Total Time 0.01188 Per Sample Data Time 0.00034 Per Sample DNN Time 0.01154 Train Loss 0.0146
340
+ Epoch: [10][683/1713] Per Sample Total Time 0.01195 Per Sample Data Time 0.00030 Per Sample DNN Time 0.01165 Train Loss 0.0146
341
+ Epoch: [10][783/1713] Per Sample Total Time 0.01200 Per Sample Data Time 0.00027 Per Sample DNN Time 0.01173 Train Loss 0.0146
342
+ Epoch: [10][883/1713] Per Sample Total Time 0.01204 Per Sample Data Time 0.00024 Per Sample DNN Time 0.01180 Train Loss 0.0146
343
+ Epoch: [10][983/1713] Per Sample Total Time 0.01197 Per Sample Data Time 0.00022 Per Sample DNN Time 0.01175 Train Loss 0.0146
344
+ Epoch: [10][1083/1713] Per Sample Total Time 0.01179 Per Sample Data Time 0.00021 Per Sample DNN Time 0.01158 Train Loss 0.0146
345
+ Epoch: [10][1183/1713] Per Sample Total Time 0.01184 Per Sample Data Time 0.00019 Per Sample DNN Time 0.01165 Train Loss 0.0147
346
+ Epoch: [10][1283/1713] Per Sample Total Time 0.01190 Per Sample Data Time 0.00018 Per Sample DNN Time 0.01172 Train Loss 0.0147
347
+ Epoch: [10][1383/1713] Per Sample Total Time 0.01194 Per Sample Data Time 0.00017 Per Sample DNN Time 0.01177 Train Loss 0.0147
348
+ Epoch: [10][1483/1713] Per Sample Total Time 0.01189 Per Sample Data Time 0.00016 Per Sample DNN Time 0.01173 Train Loss 0.0147
349
+ Epoch: [10][1583/1713] Per Sample Total Time 0.01176 Per Sample Data Time 0.00015 Per Sample DNN Time 0.01161 Train Loss 0.0147
350
+ Epoch: [10][1683/1713] Per Sample Total Time 0.01179 Per Sample Data Time 0.00015 Per Sample DNN Time 0.01164 Train Loss 0.0147
351
+ start validation
352
+ mAP: 0.301993
353
+ AUC: 0.955118
354
+ Avg Precision: 0.056732
355
+ Avg Recall: 0.836964
356
+ d_prime: 2.399414
357
+ train_loss: 0.014710
358
+ valid_loss: 0.694157
359
+ validation finished
360
+ Epoch-10 lr: 2.5e-05
361
+ epoch 10 training time: 328.114
362
+ ---------------
363
+ 2025-08-27 03:35:35.835882
364
+ current #epochs=11, #steps=17130
365
+ Epoch: [11][70/1713] Per Sample Total Time 0.01434 Per Sample Data Time 0.00201 Per Sample DNN Time 0.01234 Train Loss 0.0134
366
+ Epoch: [11][170/1713] Per Sample Total Time 0.01221 Per Sample Data Time 0.00086 Per Sample DNN Time 0.01135 Train Loss 0.0134
367
+ Epoch: [11][270/1713] Per Sample Total Time 0.01147 Per Sample Data Time 0.00055 Per Sample DNN Time 0.01092 Train Loss 0.0135
368
+ Epoch: [11][370/1713] Per Sample Total Time 0.01179 Per Sample Data Time 0.00042 Per Sample DNN Time 0.01138 Train Loss 0.0135
369
+ Epoch: [11][470/1713] Per Sample Total Time 0.01185 Per Sample Data Time 0.00034 Per Sample DNN Time 0.01151 Train Loss 0.0135
370
+ Epoch: [11][570/1713] Per Sample Total Time 0.01184 Per Sample Data Time 0.00029 Per Sample DNN Time 0.01156 Train Loss 0.0136
371
+ Epoch: [11][670/1713] Per Sample Total Time 0.01180 Per Sample Data Time 0.00025 Per Sample DNN Time 0.01155 Train Loss 0.0135
372
+ Epoch: [11][770/1713] Per Sample Total Time 0.01161 Per Sample Data Time 0.00022 Per Sample DNN Time 0.01138 Train Loss 0.0135
373
+ Epoch: [11][870/1713] Per Sample Total Time 0.01172 Per Sample Data Time 0.00020 Per Sample DNN Time 0.01152 Train Loss 0.0135
374
+ Epoch: [11][970/1713] Per Sample Total Time 0.01180 Per Sample Data Time 0.00019 Per Sample DNN Time 0.01161 Train Loss 0.0136
375
+ Epoch: [11][1070/1713] Per Sample Total Time 0.01184 Per Sample Data Time 0.00017 Per Sample DNN Time 0.01167 Train Loss 0.0136
376
+ Epoch: [11][1170/1713] Per Sample Total Time 0.01190 Per Sample Data Time 0.00016 Per Sample DNN Time 0.01173 Train Loss 0.0136
377
+ Epoch: [11][1270/1713] Per Sample Total Time 0.01193 Per Sample Data Time 0.00015 Per Sample DNN Time 0.01178 Train Loss 0.0136
378
+ Epoch: [11][1370/1713] Per Sample Total Time 0.01198 Per Sample Data Time 0.00014 Per Sample DNN Time 0.01183 Train Loss 0.0135
379
+ Epoch: [11][1470/1713] Per Sample Total Time 0.01200 Per Sample Data Time 0.00014 Per Sample DNN Time 0.01187 Train Loss 0.0135
380
+ Epoch: [11][1570/1713] Per Sample Total Time 0.01200 Per Sample Data Time 0.00013 Per Sample DNN Time 0.01187 Train Loss 0.0135
381
+ Epoch: [11][1670/1713] Per Sample Total Time 0.01201 Per Sample Data Time 0.00013 Per Sample DNN Time 0.01189 Train Loss 0.0135
382
+ start validation
383
+ mAP: 0.321699
384
+ AUC: 0.957357
385
+ Avg Precision: 0.058758
386
+ Avg Recall: 0.849520
387
+ d_prime: 2.433592
388
+ train_loss: 0.013501
389
+ valid_loss: 0.694092
390
+ validation finished
391
+ Epoch-11 lr: 2.5e-05
392
+ epoch 11 training time: 334.541
393
+ ---------------
394
+ 2025-08-27 03:41:10.375904
395
+ current #epochs=12, #steps=18843
396
+ Epoch: [12][57/1713] Per Sample Total Time 0.01455 Per Sample Data Time 0.00255 Per Sample DNN Time 0.01201 Train Loss 0.0128
397
+ Epoch: [12][157/1713] Per Sample Total Time 0.01312 Per Sample Data Time 0.00097 Per Sample DNN Time 0.01215 Train Loss 0.0127
398
+ Epoch: [12][257/1713] Per Sample Total Time 0.01283 Per Sample Data Time 0.00061 Per Sample DNN Time 0.01222 Train Loss 0.0126
399
+ Epoch: [12][357/1713] Per Sample Total Time 0.01268 Per Sample Data Time 0.00045 Per Sample DNN Time 0.01223 Train Loss 0.0126
400
+ Epoch: [12][457/1713] Per Sample Total Time 0.01254 Per Sample Data Time 0.00036 Per Sample DNN Time 0.01218 Train Loss 0.0126
401
+ Epoch: [12][557/1713] Per Sample Total Time 0.01240 Per Sample Data Time 0.00031 Per Sample DNN Time 0.01209 Train Loss 0.0127
402
+ Epoch: [12][657/1713] Per Sample Total Time 0.01228 Per Sample Data Time 0.00027 Per Sample DNN Time 0.01201 Train Loss 0.0127
403
+ Epoch: [12][757/1713] Per Sample Total Time 0.01226 Per Sample Data Time 0.00024 Per Sample DNN Time 0.01203 Train Loss 0.0127
404
+ Epoch: [12][857/1713] Per Sample Total Time 0.01225 Per Sample Data Time 0.00021 Per Sample DNN Time 0.01204 Train Loss 0.0127
405
+ Epoch: [12][957/1713] Per Sample Total Time 0.01229 Per Sample Data Time 0.00020 Per Sample DNN Time 0.01209 Train Loss 0.0127
406
+ Epoch: [12][1057/1713] Per Sample Total Time 0.01232 Per Sample Data Time 0.00018 Per Sample DNN Time 0.01214 Train Loss 0.0127
407
+ Epoch: [12][1157/1713] Per Sample Total Time 0.01231 Per Sample Data Time 0.00017 Per Sample DNN Time 0.01214 Train Loss 0.0127
408
+ Epoch: [12][1257/1713] Per Sample Total Time 0.01232 Per Sample Data Time 0.00016 Per Sample DNN Time 0.01216 Train Loss 0.0128
409
+ Epoch: [12][1357/1713] Per Sample Total Time 0.01234 Per Sample Data Time 0.00015 Per Sample DNN Time 0.01219 Train Loss 0.0128
410
+ Epoch: [12][1457/1713] Per Sample Total Time 0.01235 Per Sample Data Time 0.00014 Per Sample DNN Time 0.01221 Train Loss 0.0127
411
+ Epoch: [12][1557/1713] Per Sample Total Time 0.01236 Per Sample Data Time 0.00014 Per Sample DNN Time 0.01222 Train Loss 0.0128
412
+ Epoch: [12][1657/1713] Per Sample Total Time 0.01237 Per Sample Data Time 0.00013 Per Sample DNN Time 0.01223 Train Loss 0.0128
413
+ start validation
414
+ mAP: 0.323794
415
+ AUC: 0.956825
416
+ Avg Precision: 0.056271
417
+ Avg Recall: 0.847217
418
+ d_prime: 2.425336
419
+ train_loss: 0.012771
420
+ valid_loss: 0.694024
421
+ validation finished
422
+ Epoch-12 lr: 2.5e-05
423
+ epoch 12 training time: 341.729
424
+ ---------------
425
+ 2025-08-27 03:46:52.104774
426
+ current #epochs=13, #steps=20556
427
+ Epoch: [13][44/1713] Per Sample Total Time 0.01595 Per Sample Data Time 0.00302 Per Sample DNN Time 0.01293 Train Loss 0.0122
428
+ Epoch: [13][144/1713] Per Sample Total Time 0.01372 Per Sample Data Time 0.00097 Per Sample DNN Time 0.01275 Train Loss 0.0125
429
+ Epoch: [13][244/1713] Per Sample Total Time 0.01330 Per Sample Data Time 0.00060 Per Sample DNN Time 0.01270 Train Loss 0.0125
430
+ Epoch: [13][344/1713] Per Sample Total Time 0.01301 Per Sample Data Time 0.00044 Per Sample DNN Time 0.01257 Train Loss 0.0124
431
+ Epoch: [13][444/1713] Per Sample Total Time 0.01288 Per Sample Data Time 0.00035 Per Sample DNN Time 0.01254 Train Loss 0.0124
432
+ Epoch: [13][544/1713] Per Sample Total Time 0.01282 Per Sample Data Time 0.00029 Per Sample DNN Time 0.01253 Train Loss 0.0123
433
+ Epoch: [13][644/1713] Per Sample Total Time 0.01277 Per Sample Data Time 0.00025 Per Sample DNN Time 0.01252 Train Loss 0.0123
434
+ Epoch: [13][744/1713] Per Sample Total Time 0.01275 Per Sample Data Time 0.00023 Per Sample DNN Time 0.01252 Train Loss 0.0123
435
+ Epoch: [13][844/1713] Per Sample Total Time 0.01276 Per Sample Data Time 0.00020 Per Sample DNN Time 0.01256 Train Loss 0.0124
436
+ Epoch: [13][944/1713] Per Sample Total Time 0.01273 Per Sample Data Time 0.00019 Per Sample DNN Time 0.01254 Train Loss 0.0124
437
+ Epoch: [13][1044/1713] Per Sample Total Time 0.01272 Per Sample Data Time 0.00017 Per Sample DNN Time 0.01255 Train Loss 0.0124
438
+ Epoch: [13][1144/1713] Per Sample Total Time 0.01273 Per Sample Data Time 0.00016 Per Sample DNN Time 0.01257 Train Loss 0.0124
439
+ Epoch: [13][1244/1713] Per Sample Total Time 0.01280 Per Sample Data Time 0.00015 Per Sample DNN Time 0.01265 Train Loss 0.0124
440
+ Epoch: [13][1344/1713] Per Sample Total Time 0.01280 Per Sample Data Time 0.00015 Per Sample DNN Time 0.01265 Train Loss 0.0124
441
+ Epoch: [13][1444/1713] Per Sample Total Time 0.01279 Per Sample Data Time 0.00014 Per Sample DNN Time 0.01265 Train Loss 0.0124
442
+ Epoch: [13][1544/1713] Per Sample Total Time 0.01276 Per Sample Data Time 0.00013 Per Sample DNN Time 0.01263 Train Loss 0.0124
443
+ Epoch: [13][1644/1713] Per Sample Total Time 0.01275 Per Sample Data Time 0.00013 Per Sample DNN Time 0.01262 Train Loss 0.0124
444
+ start validation
445
+ mAP: 0.323879
446
+ AUC: 0.956006
447
+ Avg Precision: 0.057743
448
+ Avg Recall: 0.841438
449
+ d_prime: 2.412806
450
+ train_loss: 0.012389
451
+ valid_loss: 0.693967
452
+ validation finished
453
+ Epoch-13 lr: 2.5e-05
454
+ epoch 13 training time: 348.570
455
+ ---------------
456
+ 2025-08-27 03:52:40.675110
457
+ current #epochs=14, #steps=22269
458
+ Epoch: [14][31/1713] Per Sample Total Time 0.01726 Per Sample Data Time 0.00422 Per Sample DNN Time 0.01303 Train Loss 0.0124
459
+ Epoch: [14][131/1713] Per Sample Total Time 0.01382 Per Sample Data Time 0.00106 Per Sample DNN Time 0.01276 Train Loss 0.0120
460
+ Epoch: [14][231/1713] Per Sample Total Time 0.01358 Per Sample Data Time 0.00062 Per Sample DNN Time 0.01296 Train Loss 0.0119
461
+ Epoch: [14][331/1713] Per Sample Total Time 0.01321 Per Sample Data Time 0.00045 Per Sample DNN Time 0.01276 Train Loss 0.0119
462
+ Epoch: [14][431/1713] Per Sample Total Time 0.01297 Per Sample Data Time 0.00036 Per Sample DNN Time 0.01261 Train Loss 0.0119
463
+ Epoch: [14][531/1713] Per Sample Total Time 0.01283 Per Sample Data Time 0.00030 Per Sample DNN Time 0.01253 Train Loss 0.0119
464
+ Epoch: [14][631/1713] Per Sample Total Time 0.01281 Per Sample Data Time 0.00026 Per Sample DNN Time 0.01255 Train Loss 0.0119
465
+ Epoch: [14][731/1713] Per Sample Total Time 0.01279 Per Sample Data Time 0.00023 Per Sample DNN Time 0.01257 Train Loss 0.0119
466
+ Epoch: [14][831/1713] Per Sample Total Time 0.01267 Per Sample Data Time 0.00021 Per Sample DNN Time 0.01247 Train Loss 0.0119
467
+ Epoch: [14][931/1713] Per Sample Total Time 0.01262 Per Sample Data Time 0.00019 Per Sample DNN Time 0.01244 Train Loss 0.0119
468
+ Epoch: [14][1031/1713] Per Sample Total Time 0.01256 Per Sample Data Time 0.00017 Per Sample DNN Time 0.01239 Train Loss 0.0119
469
+ Epoch: [14][1131/1713] Per Sample Total Time 0.01251 Per Sample Data Time 0.00016 Per Sample DNN Time 0.01235 Train Loss 0.0119
470
+ Epoch: [14][1231/1713] Per Sample Total Time 0.01249 Per Sample Data Time 0.00015 Per Sample DNN Time 0.01234 Train Loss 0.0119
471
+ Epoch: [14][1331/1713] Per Sample Total Time 0.01246 Per Sample Data Time 0.00015 Per Sample DNN Time 0.01231 Train Loss 0.0119
472
+ Epoch: [14][1431/1713] Per Sample Total Time 0.01243 Per Sample Data Time 0.00014 Per Sample DNN Time 0.01229 Train Loss 0.0119
473
+ Epoch: [14][1531/1713] Per Sample Total Time 0.01238 Per Sample Data Time 0.00013 Per Sample DNN Time 0.01225 Train Loss 0.0119
474
+ Epoch: [14][1631/1713] Per Sample Total Time 0.01237 Per Sample Data Time 0.00013 Per Sample DNN Time 0.01225 Train Loss 0.0119
475
+ start validation
476
+ mAP: 0.320660
477
+ AUC: 0.954093
478
+ Avg Precision: 0.058404
479
+ Avg Recall: 0.837463
480
+ d_prime: 2.384228
481
+ train_loss: 0.011937
482
+ valid_loss: 0.693958
483
+ validation finished
484
+ Epoch-14 lr: 2.5e-05
485
+ epoch 14 training time: 341.352
486
+ ---------------
487
+ 2025-08-27 03:58:22.027047
488
+ current #epochs=15, #steps=23982
489
+ Epoch: [15][18/1713] Per Sample Total Time 0.01914 Per Sample Data Time 0.00752 Per Sample DNN Time 0.01162 Train Loss 0.0115
490
+ Epoch: [15][118/1713] Per Sample Total Time 0.01358 Per Sample Data Time 0.00124 Per Sample DNN Time 0.01234 Train Loss 0.0115
491
+ Epoch: [15][218/1713] Per Sample Total Time 0.01286 Per Sample Data Time 0.00070 Per Sample DNN Time 0.01217 Train Loss 0.0117
492
+ Epoch: [15][318/1713] Per Sample Total Time 0.01255 Per Sample Data Time 0.00049 Per Sample DNN Time 0.01206 Train Loss 0.0117
493
+ Epoch: [15][418/1713] Per Sample Total Time 0.01241 Per Sample Data Time 0.00038 Per Sample DNN Time 0.01202 Train Loss 0.0117
494
+ Epoch: [15][518/1713] Per Sample Total Time 0.01239 Per Sample Data Time 0.00032 Per Sample DNN Time 0.01207 Train Loss 0.0116
495
+ Epoch: [15][618/1713] Per Sample Total Time 0.01241 Per Sample Data Time 0.00027 Per Sample DNN Time 0.01214 Train Loss 0.0117
496
+ Epoch: [15][718/1713] Per Sample Total Time 0.01231 Per Sample Data Time 0.00024 Per Sample DNN Time 0.01207 Train Loss 0.0117
497
+ Epoch: [15][818/1713] Per Sample Total Time 0.01219 Per Sample Data Time 0.00022 Per Sample DNN Time 0.01197 Train Loss 0.0116
498
+ Epoch: [15][918/1713] Per Sample Total Time 0.01203 Per Sample Data Time 0.00020 Per Sample DNN Time 0.01183 Train Loss 0.0116
499
+ Epoch: [15][1018/1713] Per Sample Total Time 0.01198 Per Sample Data Time 0.00018 Per Sample DNN Time 0.01180 Train Loss 0.0116
500
+ Epoch: [15][1118/1713] Per Sample Total Time 0.01200 Per Sample Data Time 0.00017 Per Sample DNN Time 0.01183 Train Loss 0.0116
501
+ Epoch: [15][1218/1713] Per Sample Total Time 0.01193 Per Sample Data Time 0.00016 Per Sample DNN Time 0.01178 Train Loss 0.0116
502
+ Epoch: [15][1318/1713] Per Sample Total Time 0.01188 Per Sample Data Time 0.00015 Per Sample DNN Time 0.01173 Train Loss 0.0116
503
+ Epoch: [15][1418/1713] Per Sample Total Time 0.01186 Per Sample Data Time 0.00014 Per Sample DNN Time 0.01172 Train Loss 0.0116
504
+ Epoch: [15][1518/1713] Per Sample Total Time 0.01187 Per Sample Data Time 0.00014 Per Sample DNN Time 0.01173 Train Loss 0.0116
505
+ Epoch: [15][1618/1713] Per Sample Total Time 0.01187 Per Sample Data Time 0.00013 Per Sample DNN Time 0.01174 Train Loss 0.0116
506
+ start validation
507
+ mAP: 0.324188
508
+ AUC: 0.955051
509
+ Avg Precision: 0.056682
510
+ Avg Recall: 0.845943
511
+ d_prime: 2.398415
512
+ train_loss: 0.011597
513
+ valid_loss: 0.693899
514
+ validation finished
515
+ Epoch-15 lr: 1.25e-05
516
+ epoch 15 training time: 331.892
517
+ ---------------
518
+ 2025-08-27 04:03:53.919434
519
+ current #epochs=16, #steps=25695
520
+ Epoch: [16][5/1713] Per Sample Total Time 0.04252 Per Sample Data Time 0.02888 Per Sample DNN Time 0.01365 Train Loss 0.0112
521
+ Epoch: [16][105/1713] Per Sample Total Time 0.01470 Per Sample Data Time 0.00168 Per Sample DNN Time 0.01302 Train Loss 0.0111
522
+ Epoch: [16][205/1713] Per Sample Total Time 0.01382 Per Sample Data Time 0.00089 Per Sample DNN Time 0.01293 Train Loss 0.0109
523
+ Epoch: [16][305/1713] Per Sample Total Time 0.01352 Per Sample Data Time 0.00062 Per Sample DNN Time 0.01291 Train Loss 0.0109
524
+ Epoch: [16][405/1713] Per Sample Total Time 0.01325 Per Sample Data Time 0.00048 Per Sample DNN Time 0.01278 Train Loss 0.0109
525
+ Epoch: [16][505/1713] Per Sample Total Time 0.01320 Per Sample Data Time 0.00039 Per Sample DNN Time 0.01281 Train Loss 0.0109
526
+ Epoch: [16][605/1713] Per Sample Total Time 0.01319 Per Sample Data Time 0.00034 Per Sample DNN Time 0.01286 Train Loss 0.0109
527
+ Epoch: [16][705/1713] Per Sample Total Time 0.01316 Per Sample Data Time 0.00030 Per Sample DNN Time 0.01287 Train Loss 0.0109
528
+ Epoch: [16][805/1713] Per Sample Total Time 0.01307 Per Sample Data Time 0.00026 Per Sample DNN Time 0.01280 Train Loss 0.0109
529
+ Epoch: [16][905/1713] Per Sample Total Time 0.01300 Per Sample Data Time 0.00024 Per Sample DNN Time 0.01276 Train Loss 0.0109
530
+ Epoch: [16][1005/1713] Per Sample Total Time 0.01295 Per Sample Data Time 0.00022 Per Sample DNN Time 0.01273 Train Loss 0.0109
531
+ Epoch: [16][1105/1713] Per Sample Total Time 0.01290 Per Sample Data Time 0.00021 Per Sample DNN Time 0.01269 Train Loss 0.0109
532
+ Epoch: [16][1205/1713] Per Sample Total Time 0.01287 Per Sample Data Time 0.00019 Per Sample DNN Time 0.01268 Train Loss 0.0109
533
+ Epoch: [16][1305/1713] Per Sample Total Time 0.01283 Per Sample Data Time 0.00018 Per Sample DNN Time 0.01264 Train Loss 0.0109
534
+ Epoch: [16][1405/1713] Per Sample Total Time 0.01281 Per Sample Data Time 0.00017 Per Sample DNN Time 0.01264 Train Loss 0.0109
535
+ Epoch: [16][1505/1713] Per Sample Total Time 0.01278 Per Sample Data Time 0.00016 Per Sample DNN Time 0.01262 Train Loss 0.0109
536
+ Epoch: [16][1605/1713] Per Sample Total Time 0.01276 Per Sample Data Time 0.00016 Per Sample DNN Time 0.01261 Train Loss 0.0109
537
+ Epoch: [16][1705/1713] Per Sample Total Time 0.01273 Per Sample Data Time 0.00015 Per Sample DNN Time 0.01258 Train Loss 0.0109
538
+ start validation
539
+ mAP: 0.327409
540
+ AUC: 0.953805
541
+ Avg Precision: 0.058829
542
+ Avg Recall: 0.839503
543
+ d_prime: 2.380009
544
+ train_loss: 0.010876
545
+ valid_loss: 0.693826
546
+ validation finished
547
+ Epoch-16 lr: 1.25e-05
548
+ epoch 16 training time: 347.769
549
+ ---------------
550
+ 2025-08-27 04:09:41.688588
551
+ current #epochs=17, #steps=27408
552
+ Epoch: [17][92/1713] Per Sample Total Time 0.01398 Per Sample Data Time 0.00155 Per Sample DNN Time 0.01243 Train Loss 0.0107
553
+ Epoch: [17][192/1713] Per Sample Total Time 0.01296 Per Sample Data Time 0.00077 Per Sample DNN Time 0.01219 Train Loss 0.0107
554
+ Epoch: [17][292/1713] Per Sample Total Time 0.01261 Per Sample Data Time 0.00052 Per Sample DNN Time 0.01208 Train Loss 0.0105
555
+ Epoch: [17][392/1713] Per Sample Total Time 0.01233 Per Sample Data Time 0.00040 Per Sample DNN Time 0.01193 Train Loss 0.0105
556
+ Epoch: [17][492/1713] Per Sample Total Time 0.01210 Per Sample Data Time 0.00033 Per Sample DNN Time 0.01177 Train Loss 0.0106
557
+ Epoch: [17][592/1713] Per Sample Total Time 0.01185 Per Sample Data Time 0.00028 Per Sample DNN Time 0.01158 Train Loss 0.0106
558
+ Epoch: [17][692/1713] Per Sample Total Time 0.01191 Per Sample Data Time 0.00024 Per Sample DNN Time 0.01167 Train Loss 0.0106
559
+ Epoch: [17][792/1713] Per Sample Total Time 0.01196 Per Sample Data Time 0.00022 Per Sample DNN Time 0.01174 Train Loss 0.0106
560
+ Epoch: [17][892/1713] Per Sample Total Time 0.01201 Per Sample Data Time 0.00020 Per Sample DNN Time 0.01181 Train Loss 0.0106
561
+ Epoch: [17][992/1713] Per Sample Total Time 0.01206 Per Sample Data Time 0.00018 Per Sample DNN Time 0.01188 Train Loss 0.0106
562
+ Epoch: [17][1092/1713] Per Sample Total Time 0.01209 Per Sample Data Time 0.00017 Per Sample DNN Time 0.01192 Train Loss 0.0106
563
+ Epoch: [17][1192/1713] Per Sample Total Time 0.01212 Per Sample Data Time 0.00016 Per Sample DNN Time 0.01196 Train Loss 0.0106
564
+ Epoch: [17][1292/1713] Per Sample Total Time 0.01213 Per Sample Data Time 0.00015 Per Sample DNN Time 0.01197 Train Loss 0.0106
565
+ Epoch: [17][1392/1713] Per Sample Total Time 0.01214 Per Sample Data Time 0.00014 Per Sample DNN Time 0.01200 Train Loss 0.0106
566
+ Epoch: [17][1492/1713] Per Sample Total Time 0.01216 Per Sample Data Time 0.00014 Per Sample DNN Time 0.01202 Train Loss 0.0106
567
+ Epoch: [17][1592/1713] Per Sample Total Time 0.01215 Per Sample Data Time 0.00013 Per Sample DNN Time 0.01202 Train Loss 0.0106
568
+ Epoch: [17][1692/1713] Per Sample Total Time 0.01215 Per Sample Data Time 0.00013 Per Sample DNN Time 0.01203 Train Loss 0.0106
569
+ start validation
570
+ mAP: 0.326630
571
+ AUC: 0.953156
572
+ Avg Precision: 0.057865
573
+ Avg Recall: 0.837258
574
+ d_prime: 2.370586
575
+ train_loss: 0.010578
576
+ valid_loss: 0.693788
577
+ validation finished
578
+ Epoch-17 lr: 1.25e-05
579
+ epoch 17 training time: 336.202
580
+ ---------------
581
+ 2025-08-27 04:15:17.890290
582
+ current #epochs=18, #steps=29121
583
+ Epoch: [18][79/1713] Per Sample Total Time 0.01361 Per Sample Data Time 0.00190 Per Sample DNN Time 0.01172 Train Loss 0.0103
584
+ Epoch: [18][179/1713] Per Sample Total Time 0.01298 Per Sample Data Time 0.00087 Per Sample DNN Time 0.01211 Train Loss 0.0104
585
+ Epoch: [18][279/1713] Per Sample Total Time 0.01268 Per Sample Data Time 0.00058 Per Sample DNN Time 0.01211 Train Loss 0.0104
586
+ Epoch: [18][379/1713] Per Sample Total Time 0.01250 Per Sample Data Time 0.00044 Per Sample DNN Time 0.01207 Train Loss 0.0104
587
+ Epoch: [18][479/1713] Per Sample Total Time 0.01249 Per Sample Data Time 0.00035 Per Sample DNN Time 0.01213 Train Loss 0.0105
588
+ Epoch: [18][579/1713] Per Sample Total Time 0.01249 Per Sample Data Time 0.00030 Per Sample DNN Time 0.01219 Train Loss 0.0105
589
+ Epoch: [18][679/1713] Per Sample Total Time 0.01250 Per Sample Data Time 0.00026 Per Sample DNN Time 0.01223 Train Loss 0.0105
590
+ Epoch: [18][779/1713] Per Sample Total Time 0.01246 Per Sample Data Time 0.00023 Per Sample DNN Time 0.01222 Train Loss 0.0105
591
+ Epoch: [18][879/1713] Per Sample Total Time 0.01245 Per Sample Data Time 0.00021 Per Sample DNN Time 0.01224 Train Loss 0.0105
592
+ Epoch: [18][979/1713] Per Sample Total Time 0.01252 Per Sample Data Time 0.00020 Per Sample DNN Time 0.01233 Train Loss 0.0104
593
+ Epoch: [18][1079/1713] Per Sample Total Time 0.01258 Per Sample Data Time 0.00018 Per Sample DNN Time 0.01240 Train Loss 0.0104
594
+ Epoch: [18][1179/1713] Per Sample Total Time 0.01259 Per Sample Data Time 0.00017 Per Sample DNN Time 0.01242 Train Loss 0.0104
595
+ Epoch: [18][1279/1713] Per Sample Total Time 0.01258 Per Sample Data Time 0.00016 Per Sample DNN Time 0.01241 Train Loss 0.0104
596
+ Epoch: [18][1379/1713] Per Sample Total Time 0.01256 Per Sample Data Time 0.00015 Per Sample DNN Time 0.01240 Train Loss 0.0104
597
+ Epoch: [18][1479/1713] Per Sample Total Time 0.01254 Per Sample Data Time 0.00015 Per Sample DNN Time 0.01239 Train Loss 0.0104
598
+ Epoch: [18][1579/1713] Per Sample Total Time 0.01252 Per Sample Data Time 0.00014 Per Sample DNN Time 0.01238 Train Loss 0.0104
599
+ Epoch: [18][1679/1713] Per Sample Total Time 0.01249 Per Sample Data Time 0.00013 Per Sample DNN Time 0.01235 Train Loss 0.0104
600
+ start validation
601
+ mAP: 0.327243
602
+ AUC: 0.952374
603
+ Avg Precision: 0.057596
604
+ Avg Recall: 0.839843
605
+ d_prime: 2.359356
606
+ train_loss: 0.010416
607
+ valid_loss: 0.693832
608
+ validation finished
609
+ Epoch-18 lr: 1.25e-05
610
+ epoch 18 training time: 343.352
611
+ ---------------
612
+ 2025-08-27 04:21:01.242578
613
+ current #epochs=19, #steps=30834
614
+ Epoch: [19][66/1713] Per Sample Total Time 0.01454 Per Sample Data Time 0.00188 Per Sample DNN Time 0.01266 Train Loss 0.0100
615
+ Epoch: [19][166/1713] Per Sample Total Time 0.01352 Per Sample Data Time 0.00078 Per Sample DNN Time 0.01274 Train Loss 0.0101
616
+ Epoch: [19][266/1713] Per Sample Total Time 0.01298 Per Sample Data Time 0.00051 Per Sample DNN Time 0.01247 Train Loss 0.0100
617
+ Epoch: [19][366/1713] Per Sample Total Time 0.01277 Per Sample Data Time 0.00038 Per Sample DNN Time 0.01239 Train Loss 0.0100
618
+ Epoch: [19][466/1713] Per Sample Total Time 0.01253 Per Sample Data Time 0.00031 Per Sample DNN Time 0.01222 Train Loss 0.0100
619
+ Epoch: [19][566/1713] Per Sample Total Time 0.01230 Per Sample Data Time 0.00026 Per Sample DNN Time 0.01204 Train Loss 0.0100
620
+ Epoch: [19][666/1713] Per Sample Total Time 0.01228 Per Sample Data Time 0.00023 Per Sample DNN Time 0.01205 Train Loss 0.0101
621
+ Epoch: [19][766/1713] Per Sample Total Time 0.01226 Per Sample Data Time 0.00020 Per Sample DNN Time 0.01206 Train Loss 0.0101
622
+ Epoch: [19][866/1713] Per Sample Total Time 0.01222 Per Sample Data Time 0.00019 Per Sample DNN Time 0.01204 Train Loss 0.0101
623
+ Epoch: [19][966/1713] Per Sample Total Time 0.01221 Per Sample Data Time 0.00017 Per Sample DNN Time 0.01204 Train Loss 0.0101
624
+ Epoch: [19][1066/1713] Per Sample Total Time 0.01220 Per Sample Data Time 0.00016 Per Sample DNN Time 0.01204 Train Loss 0.0101
625
+ Epoch: [19][1166/1713] Per Sample Total Time 0.01216 Per Sample Data Time 0.00015 Per Sample DNN Time 0.01201 Train Loss 0.0101
626
+ Epoch: [19][1266/1713] Per Sample Total Time 0.01211 Per Sample Data Time 0.00014 Per Sample DNN Time 0.01197 Train Loss 0.0101
627
+ Epoch: [19][1366/1713] Per Sample Total Time 0.01202 Per Sample Data Time 0.00013 Per Sample DNN Time 0.01189 Train Loss 0.0101
628
+ Epoch: [19][1466/1713] Per Sample Total Time 0.01205 Per Sample Data Time 0.00013 Per Sample DNN Time 0.01192 Train Loss 0.0101
629
+ Epoch: [19][1566/1713] Per Sample Total Time 0.01208 Per Sample Data Time 0.00012 Per Sample DNN Time 0.01195 Train Loss 0.0101
630
+ Epoch: [19][1666/1713] Per Sample Total Time 0.01209 Per Sample Data Time 0.00012 Per Sample DNN Time 0.01198 Train Loss 0.0101
631
+ start validation
632
+ mAP: 0.325581
633
+ AUC: 0.951207
634
+ Avg Precision: 0.057401
635
+ Avg Recall: 0.836137
636
+ d_prime: 2.342888
637
+ train_loss: 0.010055
638
+ valid_loss: 0.693805
639
+ validation finished
640
+ Epoch-19 lr: 1.25e-05
641
+ epoch 19 training time: 334.613
642
+ ---------------
643
+ 2025-08-27 04:26:35.855950
644
+ current #epochs=20, #steps=32547
645
+ Epoch: [20][53/1713] Per Sample Total Time 0.01465 Per Sample Data Time 0.00259 Per Sample DNN Time 0.01206 Train Loss 0.0100
646
+ Epoch: [20][153/1713] Per Sample Total Time 0.01233 Per Sample Data Time 0.00094 Per Sample DNN Time 0.01139 Train Loss 0.0099
647
+ Epoch: [20][253/1713] Per Sample Total Time 0.01216 Per Sample Data Time 0.00059 Per Sample DNN Time 0.01158 Train Loss 0.0097
648
+ Epoch: [20][353/1713] Per Sample Total Time 0.01224 Per Sample Data Time 0.00043 Per Sample DNN Time 0.01181 Train Loss 0.0098
649
+ Epoch: [20][453/1713] Per Sample Total Time 0.01226 Per Sample Data Time 0.00035 Per Sample DNN Time 0.01192 Train Loss 0.0098
650
+ Epoch: [20][553/1713] Per Sample Total Time 0.01227 Per Sample Data Time 0.00029 Per Sample DNN Time 0.01198 Train Loss 0.0098
651
+ Epoch: [20][653/1713] Per Sample Total Time 0.01226 Per Sample Data Time 0.00026 Per Sample DNN Time 0.01201 Train Loss 0.0098
652
+ Epoch: [20][753/1713] Per Sample Total Time 0.01230 Per Sample Data Time 0.00023 Per Sample DNN Time 0.01207 Train Loss 0.0099
653
+ Epoch: [20][853/1713] Per Sample Total Time 0.01232 Per Sample Data Time 0.00021 Per Sample DNN Time 0.01212 Train Loss 0.0099
654
+ Epoch: [20][953/1713] Per Sample Total Time 0.01232 Per Sample Data Time 0.00019 Per Sample DNN Time 0.01213 Train Loss 0.0099
655
+ Epoch: [20][1053/1713] Per Sample Total Time 0.01229 Per Sample Data Time 0.00018 Per Sample DNN Time 0.01212 Train Loss 0.0100
656
+ Epoch: [20][1153/1713] Per Sample Total Time 0.01227 Per Sample Data Time 0.00016 Per Sample DNN Time 0.01211 Train Loss 0.0100
657
+ Epoch: [20][1253/1713] Per Sample Total Time 0.01230 Per Sample Data Time 0.00016 Per Sample DNN Time 0.01214 Train Loss 0.0100
658
+ Epoch: [20][1353/1713] Per Sample Total Time 0.01231 Per Sample Data Time 0.00015 Per Sample DNN Time 0.01217 Train Loss 0.0100
659
+ Epoch: [20][1453/1713] Per Sample Total Time 0.01232 Per Sample Data Time 0.00014 Per Sample DNN Time 0.01218 Train Loss 0.0100
660
+ Epoch: [20][1553/1713] Per Sample Total Time 0.01232 Per Sample Data Time 0.00013 Per Sample DNN Time 0.01219 Train Loss 0.0100
661
+ Epoch: [20][1653/1713] Per Sample Total Time 0.01232 Per Sample Data Time 0.00013 Per Sample DNN Time 0.01219 Train Loss 0.0100
662
+ start validation
663
+ mAP: 0.326032
664
+ AUC: 0.951268
665
+ Avg Precision: 0.059974
666
+ Avg Recall: 0.828896
667
+ d_prime: 2.343747
668
+ train_loss: 0.009952
669
+ valid_loss: 0.693693
670
+ validation finished
671
+ Epoch-20 lr: 6.25e-06
672
+ epoch 20 training time: 338.954
673
+ ---------------
674
+ 2025-08-27 04:32:14.809724
675
+ current #epochs=21, #steps=34260
676
+ Epoch: [21][40/1713] Per Sample Total Time 0.01581 Per Sample Data Time 0.00377 Per Sample DNN Time 0.01204 Train Loss 0.0094
677
+ Epoch: [21][140/1713] Per Sample Total Time 0.01329 Per Sample Data Time 0.00113 Per Sample DNN Time 0.01216 Train Loss 0.0096
678
+ Epoch: [21][240/1713] Per Sample Total Time 0.01279 Per Sample Data Time 0.00068 Per Sample DNN Time 0.01211 Train Loss 0.0098
679
+ Epoch: [21][340/1713] Per Sample Total Time 0.01265 Per Sample Data Time 0.00049 Per Sample DNN Time 0.01215 Train Loss 0.0098
680
+ Epoch: [21][440/1713] Per Sample Total Time 0.01259 Per Sample Data Time 0.00039 Per Sample DNN Time 0.01220 Train Loss 0.0098
681
+ Epoch: [21][540/1713] Per Sample Total Time 0.01253 Per Sample Data Time 0.00033 Per Sample DNN Time 0.01220 Train Loss 0.0097
682
+ Epoch: [21][640/1713] Per Sample Total Time 0.01244 Per Sample Data Time 0.00028 Per Sample DNN Time 0.01216 Train Loss 0.0097
683
+ Epoch: [21][740/1713] Per Sample Total Time 0.01241 Per Sample Data Time 0.00025 Per Sample DNN Time 0.01215 Train Loss 0.0096
684
+ Epoch: [21][840/1713] Per Sample Total Time 0.01237 Per Sample Data Time 0.00023 Per Sample DNN Time 0.01214 Train Loss 0.0096
685
+ Epoch: [21][940/1713] Per Sample Total Time 0.01236 Per Sample Data Time 0.00021 Per Sample DNN Time 0.01215 Train Loss 0.0096
686
+ Epoch: [21][1040/1713] Per Sample Total Time 0.01235 Per Sample Data Time 0.00019 Per Sample DNN Time 0.01216 Train Loss 0.0096
687
+ Epoch: [21][1140/1713] Per Sample Total Time 0.01236 Per Sample Data Time 0.00018 Per Sample DNN Time 0.01218 Train Loss 0.0096
688
+ Epoch: [21][1240/1713] Per Sample Total Time 0.01236 Per Sample Data Time 0.00017 Per Sample DNN Time 0.01219 Train Loss 0.0096
689
+ Epoch: [21][1340/1713] Per Sample Total Time 0.01240 Per Sample Data Time 0.00016 Per Sample DNN Time 0.01224 Train Loss 0.0096
690
+ Epoch: [21][1440/1713] Per Sample Total Time 0.01243 Per Sample Data Time 0.00015 Per Sample DNN Time 0.01227 Train Loss 0.0096
691
+ Epoch: [21][1540/1713] Per Sample Total Time 0.01244 Per Sample Data Time 0.00015 Per Sample DNN Time 0.01229 Train Loss 0.0096
692
+ Epoch: [21][1640/1713] Per Sample Total Time 0.01245 Per Sample Data Time 0.00014 Per Sample DNN Time 0.01231 Train Loss 0.0096
693
+ start validation
694
+ mAP: 0.324904
695
+ AUC: 0.950095
696
+ Avg Precision: 0.058785
697
+ Avg Recall: 0.826536
698
+ d_prime: 2.327477
699
+ train_loss: 0.009605
700
+ valid_loss: 0.693686
701
+ validation finished
702
+ Epoch-21 lr: 6.25e-06
703
+ epoch 21 training time: 340.738
704
+ ---------------
705
+ 2025-08-27 04:37:55.547797
706
+ current #epochs=22, #steps=35973
707
+ Epoch: [22][27/1713] Per Sample Total Time 0.01741 Per Sample Data Time 0.00499 Per Sample DNN Time 0.01242 Train Loss 0.0093
708
+ Epoch: [22][127/1713] Per Sample Total Time 0.01313 Per Sample Data Time 0.00113 Per Sample DNN Time 0.01200 Train Loss 0.0092
709
+ Epoch: [22][227/1713] Per Sample Total Time 0.01251 Per Sample Data Time 0.00065 Per Sample DNN Time 0.01186 Train Loss 0.0092
710
+ Epoch: [22][327/1713] Per Sample Total Time 0.01226 Per Sample Data Time 0.00047 Per Sample DNN Time 0.01180 Train Loss 0.0092
711
+ Epoch: [22][427/1713] Per Sample Total Time 0.01219 Per Sample Data Time 0.00037 Per Sample DNN Time 0.01182 Train Loss 0.0093
712
+ Epoch: [22][527/1713] Per Sample Total Time 0.01214 Per Sample Data Time 0.00031 Per Sample DNN Time 0.01183 Train Loss 0.0093
713
+ Epoch: [22][627/1713] Per Sample Total Time 0.01209 Per Sample Data Time 0.00026 Per Sample DNN Time 0.01182 Train Loss 0.0094
714
+ Epoch: [22][727/1713] Per Sample Total Time 0.01208 Per Sample Data Time 0.00023 Per Sample DNN Time 0.01185 Train Loss 0.0094
715
+ Epoch: [22][827/1713] Per Sample Total Time 0.01209 Per Sample Data Time 0.00021 Per Sample DNN Time 0.01188 Train Loss 0.0094
716
+ Epoch: [22][927/1713] Per Sample Total Time 0.01209 Per Sample Data Time 0.00019 Per Sample DNN Time 0.01190 Train Loss 0.0094
717
+ Epoch: [22][1027/1713] Per Sample Total Time 0.01212 Per Sample Data Time 0.00018 Per Sample DNN Time 0.01195 Train Loss 0.0094
718
+ Epoch: [22][1127/1713] Per Sample Total Time 0.01209 Per Sample Data Time 0.00017 Per Sample DNN Time 0.01193 Train Loss 0.0095
719
+ Epoch: [22][1227/1713] Per Sample Total Time 0.01212 Per Sample Data Time 0.00016 Per Sample DNN Time 0.01196 Train Loss 0.0095
720
+ Epoch: [22][1327/1713] Per Sample Total Time 0.01213 Per Sample Data Time 0.00015 Per Sample DNN Time 0.01199 Train Loss 0.0095
721
+ Epoch: [22][1427/1713] Per Sample Total Time 0.01217 Per Sample Data Time 0.00014 Per Sample DNN Time 0.01203 Train Loss 0.0095
722
+ Epoch: [22][1527/1713] Per Sample Total Time 0.01219 Per Sample Data Time 0.00014 Per Sample DNN Time 0.01206 Train Loss 0.0095
723
+ Epoch: [22][1627/1713] Per Sample Total Time 0.01221 Per Sample Data Time 0.00013 Per Sample DNN Time 0.01208 Train Loss 0.0095
724
+ start validation
725
+ mAP: 0.325570
726
+ AUC: 0.949932
727
+ Avg Precision: 0.057976
728
+ Avg Recall: 0.827917
729
+ d_prime: 2.325238
730
+ train_loss: 0.009480
731
+ valid_loss: 0.693679
732
+ validation finished
733
+ Epoch-22 lr: 6.25e-06
734
+ epoch 22 training time: 336.561
735
+ ---------------
736
+ 2025-08-27 04:43:32.108769
737
+ current #epochs=23, #steps=37686
738
+ Epoch: [23][14/1713] Per Sample Total Time 0.02238 Per Sample Data Time 0.00949 Per Sample DNN Time 0.01290 Train Loss 0.0097
739
+ Epoch: [23][114/1713] Per Sample Total Time 0.01363 Per Sample Data Time 0.00127 Per Sample DNN Time 0.01235 Train Loss 0.0094
740
+ Epoch: [23][214/1713] Per Sample Total Time 0.01300 Per Sample Data Time 0.00070 Per Sample DNN Time 0.01230 Train Loss 0.0094
741
+ Epoch: [23][314/1713] Per Sample Total Time 0.01281 Per Sample Data Time 0.00049 Per Sample DNN Time 0.01232 Train Loss 0.0094
742
+ Epoch: [23][414/1713] Per Sample Total Time 0.01265 Per Sample Data Time 0.00039 Per Sample DNN Time 0.01227 Train Loss 0.0094
743
+ Epoch: [23][514/1713] Per Sample Total Time 0.01262 Per Sample Data Time 0.00032 Per Sample DNN Time 0.01230 Train Loss 0.0094
744
+ Epoch: [23][614/1713] Per Sample Total Time 0.01255 Per Sample Data Time 0.00027 Per Sample DNN Time 0.01227 Train Loss 0.0094
745
+ Epoch: [23][714/1713] Per Sample Total Time 0.01251 Per Sample Data Time 0.00024 Per Sample DNN Time 0.01227 Train Loss 0.0094
746
+ Epoch: [23][814/1713] Per Sample Total Time 0.01242 Per Sample Data Time 0.00022 Per Sample DNN Time 0.01221 Train Loss 0.0094
747
+ Epoch: [23][914/1713] Per Sample Total Time 0.01240 Per Sample Data Time 0.00020 Per Sample DNN Time 0.01220 Train Loss 0.0093
748
+ Epoch: [23][1014/1713] Per Sample Total Time 0.01235 Per Sample Data Time 0.00018 Per Sample DNN Time 0.01217 Train Loss 0.0093
749
+ Epoch: [23][1114/1713] Per Sample Total Time 0.01225 Per Sample Data Time 0.00017 Per Sample DNN Time 0.01208 Train Loss 0.0094
750
+ Epoch: [23][1214/1713] Per Sample Total Time 0.01221 Per Sample Data Time 0.00016 Per Sample DNN Time 0.01205 Train Loss 0.0094
751
+ Epoch: [23][1314/1713] Per Sample Total Time 0.01217 Per Sample Data Time 0.00015 Per Sample DNN Time 0.01202 Train Loss 0.0094
752
+ Epoch: [23][1414/1713] Per Sample Total Time 0.01214 Per Sample Data Time 0.00014 Per Sample DNN Time 0.01200 Train Loss 0.0093
753
+ Epoch: [23][1514/1713] Per Sample Total Time 0.01214 Per Sample Data Time 0.00014 Per Sample DNN Time 0.01201 Train Loss 0.0093
754
+ Epoch: [23][1614/1713] Per Sample Total Time 0.01211 Per Sample Data Time 0.00013 Per Sample DNN Time 0.01198 Train Loss 0.0093
755
+ start validation
756
+ mAP: 0.324305
757
+ AUC: 0.949169
758
+ Avg Precision: 0.055806
759
+ Avg Recall: 0.830132
760
+ d_prime: 2.314852
761
+ train_loss: 0.009343
762
+ valid_loss: 0.693706
763
+ validation finished
764
+ Epoch-23 lr: 6.25e-06
765
+ epoch 23 training time: 334.655
766
+ ---------------
767
+ 2025-08-27 04:49:06.763788
768
+ current #epochs=24, #steps=39399
769
+ Epoch: [24][1/1713] Per Sample Total Time 0.08944 Per Sample Data Time 0.07234 Per Sample DNN Time 0.01709 Train Loss 0.0076
770
+ Epoch: [24][101/1713] Per Sample Total Time 0.01414 Per Sample Data Time 0.00146 Per Sample DNN Time 0.01268 Train Loss 0.0092
771
+ Epoch: [24][201/1713] Per Sample Total Time 0.01317 Per Sample Data Time 0.00076 Per Sample DNN Time 0.01241 Train Loss 0.0092
772
+ Epoch: [24][301/1713] Per Sample Total Time 0.01286 Per Sample Data Time 0.00052 Per Sample DNN Time 0.01233 Train Loss 0.0092
773
+ Epoch: [24][401/1713] Per Sample Total Time 0.01280 Per Sample Data Time 0.00041 Per Sample DNN Time 0.01239 Train Loss 0.0092
774
+ Epoch: [24][501/1713] Per Sample Total Time 0.01272 Per Sample Data Time 0.00033 Per Sample DNN Time 0.01238 Train Loss 0.0093
775
+ Epoch: [24][601/1713] Per Sample Total Time 0.01260 Per Sample Data Time 0.00029 Per Sample DNN Time 0.01232 Train Loss 0.0093
776
+ Epoch: [24][701/1713] Per Sample Total Time 0.01256 Per Sample Data Time 0.00025 Per Sample DNN Time 0.01230 Train Loss 0.0093
777
+ Epoch: [24][801/1713] Per Sample Total Time 0.01253 Per Sample Data Time 0.00023 Per Sample DNN Time 0.01231 Train Loss 0.0093
778
+ Epoch: [24][901/1713] Per Sample Total Time 0.01248 Per Sample Data Time 0.00021 Per Sample DNN Time 0.01228 Train Loss 0.0093
779
+ Epoch: [24][1001/1713] Per Sample Total Time 0.01248 Per Sample Data Time 0.00019 Per Sample DNN Time 0.01229 Train Loss 0.0093
780
+ Epoch: [24][1101/1713] Per Sample Total Time 0.01247 Per Sample Data Time 0.00018 Per Sample DNN Time 0.01230 Train Loss 0.0093
781
+ Epoch: [24][1201/1713] Per Sample Total Time 0.01243 Per Sample Data Time 0.00017 Per Sample DNN Time 0.01226 Train Loss 0.0094
782
+ Epoch: [24][1301/1713] Per Sample Total Time 0.01242 Per Sample Data Time 0.00016 Per Sample DNN Time 0.01227 Train Loss 0.0094
783
+ Epoch: [24][1401/1713] Per Sample Total Time 0.01244 Per Sample Data Time 0.00015 Per Sample DNN Time 0.01229 Train Loss 0.0094
784
+ Epoch: [24][1501/1713] Per Sample Total Time 0.01240 Per Sample Data Time 0.00014 Per Sample DNN Time 0.01226 Train Loss 0.0094
785
+ Epoch: [24][1601/1713] Per Sample Total Time 0.01240 Per Sample Data Time 0.00014 Per Sample DNN Time 0.01227 Train Loss 0.0094
786
+ Epoch: [24][1701/1713] Per Sample Total Time 0.01238 Per Sample Data Time 0.00013 Per Sample DNN Time 0.01225 Train Loss 0.0094
787
+ start validation
788
+ mAP: 0.325054
789
+ AUC: 0.948969
790
+ Avg Precision: 0.058949
791
+ Avg Recall: 0.826246
792
+ d_prime: 2.312149
793
+ train_loss: 0.009365
794
+ valid_loss: 0.693633
795
+ validation finished
796
+ Epoch-24 lr: 6.25e-06
797
+ epoch 24 training time: 343.091
798
+ ---------------
799
+ 2025-08-27 04:54:49.854832
800
+ current #epochs=25, #steps=41112
801
+ Epoch: [25][88/1713] Per Sample Total Time 0.01340 Per Sample Data Time 0.00154 Per Sample DNN Time 0.01186 Train Loss 0.0092
802
+ Epoch: [25][188/1713] Per Sample Total Time 0.01266 Per Sample Data Time 0.00075 Per Sample DNN Time 0.01191 Train Loss 0.0092
803
+ Epoch: [25][288/1713] Per Sample Total Time 0.01242 Per Sample Data Time 0.00050 Per Sample DNN Time 0.01192 Train Loss 0.0091
804
+ Epoch: [25][388/1713] Per Sample Total Time 0.01243 Per Sample Data Time 0.00039 Per Sample DNN Time 0.01205 Train Loss 0.0092
805
+ Epoch: [25][488/1713] Per Sample Total Time 0.01243 Per Sample Data Time 0.00032 Per Sample DNN Time 0.01212 Train Loss 0.0092
806
+ Epoch: [25][588/1713] Per Sample Total Time 0.01245 Per Sample Data Time 0.00027 Per Sample DNN Time 0.01218 Train Loss 0.0091
807
+ Epoch: [25][688/1713] Per Sample Total Time 0.01238 Per Sample Data Time 0.00024 Per Sample DNN Time 0.01215 Train Loss 0.0092
808
+ Epoch: [25][788/1713] Per Sample Total Time 0.01235 Per Sample Data Time 0.00021 Per Sample DNN Time 0.01214 Train Loss 0.0092
809
+ Epoch: [25][888/1713] Per Sample Total Time 0.01234 Per Sample Data Time 0.00019 Per Sample DNN Time 0.01215 Train Loss 0.0092
810
+ Epoch: [25][988/1713] Per Sample Total Time 0.01231 Per Sample Data Time 0.00018 Per Sample DNN Time 0.01213 Train Loss 0.0092
811
+ Epoch: [25][1088/1713] Per Sample Total Time 0.01225 Per Sample Data Time 0.00017 Per Sample DNN Time 0.01208 Train Loss 0.0092
812
+ Epoch: [25][1188/1713] Per Sample Total Time 0.01219 Per Sample Data Time 0.00016 Per Sample DNN Time 0.01204 Train Loss 0.0092
813
+ Epoch: [25][1288/1713] Per Sample Total Time 0.01221 Per Sample Data Time 0.00015 Per Sample DNN Time 0.01206 Train Loss 0.0092
814
+ Epoch: [25][1388/1713] Per Sample Total Time 0.01222 Per Sample Data Time 0.00014 Per Sample DNN Time 0.01208 Train Loss 0.0092
815
+ Epoch: [25][1488/1713] Per Sample Total Time 0.01221 Per Sample Data Time 0.00013 Per Sample DNN Time 0.01208 Train Loss 0.0092
816
+ Epoch: [25][1588/1713] Per Sample Total Time 0.01221 Per Sample Data Time 0.00013 Per Sample DNN Time 0.01208 Train Loss 0.0092
817
+ Epoch: [25][1688/1713] Per Sample Total Time 0.01217 Per Sample Data Time 0.00012 Per Sample DNN Time 0.01205 Train Loss 0.0092
818
+ start validation
819
+ mAP: 0.323983
820
+ AUC: 0.948025
821
+ Avg Precision: 0.060971
822
+ Avg Recall: 0.820553
823
+ d_prime: 2.299510
824
+ train_loss: 0.009207
825
+ valid_loss: 0.693624
826
+ validation finished
827
+ Epoch-25 lr: 3.125e-06
828
+ epoch 25 training time: 338.680
829
+ ---------------Training Finished---------------
830
+ weighted averaged model results
831
+ mAP: 0.340667
832
+ AUC: 0.959997
833
+ Avg Precision: 0.058671
834
+ Avg Recall: 0.859400
835
+ d_prime: 2.475802
836
+ train_loss: 0.000000
837
+ valid_loss: 0.693624
ast_1_AS20k/ast_origin_implement/test-balanced-f10-t10-pTrue-b12-lr5e-5-decoupe/wa_result.csv ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ 3.406672498832421514e-01
2
+ 9.599974407170855928e-01
3
+ 5.867078735816431967e-02
4
+ 8.594002744825509632e-01
5
+ 2.475801985654465742e+00
pre_4_AS2M/conv_clap_1_2025-09-30_06-58-32/pretraining_AS2M.sh ADDED
@@ -0,0 +1,340 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/usr/bin/env bash
# EAT pre-training launcher for AudioSet-2M (AS2M).
#
# Usage: pretraining_AS2M.sh [WORLD_SIZE]
#   WORLD_SIZE  value for distributed_training.distributed_world_size
#               (defaults to 2 when omitted).
#
# Pick an experiment by editing train_mode / config_option below. The script
# derives a timestamped checkpoint directory, stores a copy of itself there,
# and launches fairseq_cli/hydra_train.py with the selected overrides.

# Print a message to stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# config options
train_mode=conv_clap
config_option=1  # must be a plain integer; it is matched textually below
# change world size: pass it as the first argument to this script

# shared config
SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
DATA_ROOT=/opt/gpfs/home/chushu/data/audioset/setting
checkpoint_save_dir="${SAVE_DIR_ROOT}/${train_mode}_${config_option}_$(date +"%Y-%m-%d_%H-%M-%S")"
checkpoint_restore_file="${checkpoint_save_dir}/checkpoint_last.pt"

# Absolute path and file name of this script (symlinks resolved).
script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
script_name="$(basename -- "$script_path")"

# default setting (each configuration below overrides what it needs)
model_clone_batch=4
dataset_batch_size=48
model_clap_loss=0
model_clap_loss_type="mse"  # option ce cosine l1
model_clap_loss_layer=0
average_top_k_layers=12
model_add_conv=false
model_depth=12
model_dispersive_loss=0
model_dispersive_loss_layer=0
checkpoint_keep_interval_updates=1  # TODO change this parameter if need
checkpoint_save_interval_updates=10000
model_modalities_image_conv_resolution="4, 8, 16"
model_modalities_image_conv_in_chans="1, 256, 384, 768"

# Select the experiment. A single case statement replaces the former
# if/elif chain, which was terminated by a stray `fi` before its last `elif`
# and therefore failed to parse.
case "${train_mode}:${config_option}" in
  default:0)
    task_data="${DATA_ROOT}/PRETRAIN_AS2M"
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=0
    checkpoint_keep_interval_updates=-1
    ;;
  disp:0)
    task_data="${DATA_ROOT}/PRETRAIN_AS2M"
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=4
    dataset_batch_size=96
    model_dispersive_loss=1
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
    ;;
  disp:1)
    task_data="${DATA_ROOT}/PRETRAIN_AS2M"
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=1
    dataset_batch_size=384
    model_dispersive_loss=1
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
    ;;
  disp:2)
    task_data="${DATA_ROOT}/PRETRAIN_AS2M"
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=1
    dataset_batch_size=384
    model_dispersive_loss=10.0
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
    ;;
  disp:3)
    task_data="${DATA_ROOT}/PRETRAIN_AS2M"
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=1
    dataset_batch_size=384
    model_dispersive_loss=100.0
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
    ;;
  disp:4)
    task_data="${DATA_ROOT}/PRETRAIN_AS2M"
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=1
    dataset_batch_size=384
    model_dispersive_loss=10000.0
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
    ;;
  disp:5)
    task_data="${DATA_ROOT}/PRETRAIN_AS2M"
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=1
    dataset_batch_size=384
    model_dispersive_loss=1000.0
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
    ;;
  disp:6)
    task_data="${DATA_ROOT}/PRETRAIN_AS2M"
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=4
    dataset_batch_size=96
    model_dispersive_loss=1000.0
    model_dispersive_loss_layer=10
    checkpoint_keep_interval_updates=1
    ;;
  clap:0)
    task_data="${DATA_ROOT}/PRETRAIN_AS2M_w_CLAP"
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=12
    model_add_conv=false
    ;;
  clap:1)
    task_data="${DATA_ROOT}/PRETRAIN_AS2M_w_CLAP"
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=1
    ;;
  # loss type ablation
  clap:2)
    task_data="${DATA_ROOT}/PRETRAIN_AS2M_w_CLAP"
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="ce"
    ;;
  clap:3)
    task_data="${DATA_ROOT}/PRETRAIN_AS2M_w_CLAP"
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="l1"
    ;;
  clap:4)
    task_data="${DATA_ROOT}/PRETRAIN_AS2M_w_CLAP"
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="cosine"
    ;;
  # loss layer ablation
  clap:5)
    task_data="${DATA_ROOT}/PRETRAIN_AS2M_w_CLAP"
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="mse"
    model_clap_loss_layer=10
    ;;
  clap:6)
    task_data="${DATA_ROOT}/PRETRAIN_AS2M_w_CLAP"
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="mse"
    model_clap_loss_layer=8
    ;;
  clap:7)
    task_data="${DATA_ROOT}/PRETRAIN_AS2M_w_CLAP"
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="mse"
    model_clap_loss_layer=6
    ;;
  clap:8)
    task_data="${DATA_ROOT}/PRETRAIN_AS2M_w_CLAP"
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    model_clap_loss=5.0
    dataset_batch_size=96
    average_top_k_layers=12
    model_clap_loss_type="mse"
    checkpoint_keep_interval_updates=-1
    ;;
  clap:9)
    task_data="${DATA_ROOT}/PRETRAIN_AS2M_w_CLAP"
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    model_clap_loss=0.1
    dataset_batch_size=96
    average_top_k_layers=12
    model_clap_loss_type="mse"
    checkpoint_keep_interval_updates=-1
    ;;
  ast:0)
    task_data="${DATA_ROOT}/PRETRAIN_AS2M_w_AST/mlp_head_in"
    task_load_clap_emb=true
    model_proj_type=4
    model_clone_batch=4
    model_clap_loss=1.0
    dataset_batch_size=48
    ;;
  ast:1)
    task_data="${DATA_ROOT}/PRETRAIN_AS2M_w_AST/mlp_head_in"
    task_load_clap_emb=true
    model_proj_type=4
    model_clone_batch=4
    model_clap_loss=0.001
    dataset_batch_size=48
    ;;
  ast:2)
    task_data="${DATA_ROOT}/PRETRAIN_AS2M_w_AST/mlp_head_in"
    task_load_clap_emb=true
    model_proj_type=4
    model_clone_batch=4
    model_clap_loss=0.01
    dataset_batch_size=48
    ;;
  ast:3)
    task_data="${DATA_ROOT}/PRETRAIN_AS2M_w_AST/mlp_head_out"
    task_load_clap_emb=true
    model_proj_type=6
    model_clone_batch=4
    dataset_batch_size=48
    ;;
  conv_clap:0)
    task_data="${DATA_ROOT}/PRETRAIN_AS2M_w_CLAP"
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=64  # original 48 oom on 4090 24G change distributed_world_size
    model_clap_loss=1.0
    average_top_k_layers=11  # modify with model depth
    model_add_conv=true
    model_depth=11
    checkpoint_keep_interval_updates=-1  # default 1
    checkpoint_save_interval_updates=10000
    ;;
  conv_clap:1)
    task_data="${DATA_ROOT}/PRETRAIN_AS2M_w_CLAP"
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=48  # original 48 oom on 4090 24G change distributed_world_size
    model_clap_loss=1.0
    average_top_k_layers=12  # modify with model depth
    model_add_conv=true
    model_modalities_image_conv_resolution="16,"
    model_modalities_image_conv_in_chans="1, 768"
    model_depth=12
    checkpoint_keep_interval_updates=1  # default 1
    checkpoint_save_interval_updates=10000
    ;;
  *)
    # Previously an unknown combination fell through and launched training
    # with empty task.data / model.proj_type overrides.
    die "Unknown configuration: train_mode=${train_mode} config_option=${config_option}"
    ;;
esac

# Create the run directory and keep a copy of this script next to the
# checkpoints (cp -p preserves mode and timestamps). Done only after the
# configuration has been validated so a typo does not leave an empty run dir.
mkdir -p -- "$checkpoint_save_dir" || die "cannot create ${checkpoint_save_dir}"
cp -p -- "$script_path" "$checkpoint_save_dir/$script_name" \
  || die "cannot copy ${script_path} to ${checkpoint_save_dir}"
echo "script_path: ${script_path}"
echo "checkpoint_save_dir: ${checkpoint_save_dir}"
echo "Config ${train_mode} ${config_option}"

# Every override is quoted so that each one reaches Hydra as exactly one
# argument; unquoted, values containing spaces (the conv_* lists) were split
# by the shell into stray arguments.
# NOTE(review): with `-m` Hydra may treat a bare comma-separated value as a
# sweep; confirm whether the conv_* options should use list syntax such as
# [4,8,16] instead.
# NOTE(review): the clap:0-5 and ast:* configurations never set
# task_load_source_file / task_load_mel_file, so those overrides are passed
# with an empty value -- confirm that this is intended.
train_cmd=(
  python fairseq_cli/hydra_train.py -m
  --config-dir ./EAT/config
  --config-name pretraining_AS2M
  "common.user_dir=./EAT"
  "checkpoint.save_dir=${checkpoint_save_dir}"
  "checkpoint.restore_file=${checkpoint_restore_file}"
  "distributed_training.distributed_world_size=${1:-2}"
  "dataset.num_workers=24"
  "dataset.data_buffer_size=48"
  "dataset.batch_size=${dataset_batch_size}"
  "task.data=${task_data}"
  "task.h5_format=False"
  "task.load_clap_emb=${task_load_clap_emb}"
  "+task.load_source_file=${task_load_source_file}"
  "+task.load_mel_file=${task_load_mel_file}"
  "model.proj_type=${model_proj_type}"
  "model.clone_batch=${model_clone_batch}"
  "model.clap_loss=${model_clap_loss}"
  "model.average_top_k_layers=${average_top_k_layers}"
  "+model.add_conv=${model_add_conv}"
  "+model.clap_loss_type=${model_clap_loss_type}"
  "+model.clap_loss_layer=${model_clap_loss_layer}"
  "+model.dispersive_loss=${model_dispersive_loss}"
  "+model.dispersive_loss_layer=${model_dispersive_loss_layer}"
  "model.depth=${model_depth}"
  "+model.modalities.image.conv_resolution=${model_modalities_image_conv_resolution}"
  "+model.modalities.image.conv_in_chans=${model_modalities_image_conv_in_chans}"
  "checkpoint.keep_interval_updates=${checkpoint_keep_interval_updates}"
  "checkpoint.save_interval_updates=${checkpoint_save_interval_updates}"
)
"${train_cmd[@]}"
pre_4_AS2M/conv_clap_1_2025-09-30_06-59-40/pretraining_AS2M.sh ADDED
@@ -0,0 +1,339 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ # config options
3
+ train_mode=conv_clap
4
+ config_option=1
5
+ # change world size
6
+
7
+ # shared config
8
+ SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
9
+ checkpoint_save_dir=${SAVE_DIR_ROOT}/${train_mode}_${config_option}_$(date +"%Y-%m-%d_%H-%M-%S")
10
+ checkpoint_restore_file=${checkpoint_save_dir}/checkpoint_last.pt
11
+
12
+ # 脚本自身的绝对路径与文件名(解析符号链接)
13
+ script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
14
+ script_name="$(basename -- "$script_path")"
15
+ # 创建目录并拷贝(保留权限与时间戳)
16
+ mkdir -p -- "$checkpoint_save_dir"
17
+ cp -p -- "$script_path" "$checkpoint_save_dir/$script_name"
18
+ echo "script_path: ${script_path}"
19
+ echo "checkpoint_save_dir: ${checkpoint_save_dir}"
20
+
21
+ # default setting
22
+ model_clone_batch=4
23
+ dataset_batch_size=48
24
+ model_clap_loss=0
25
+ model_clap_loss_type="mse" # option ce cosine l1
26
+ model_clap_loss_layer=0
27
+ average_top_k_layers=12
28
+ model_add_conv=false
29
+ model_depth=12
30
+ model_dispersive_loss=0
31
+ model_dispersive_loss_layer=0
32
+ checkpoint_keep_interval_updates=1 # TODO change this parameter if need
33
+ checkpoint_save_interval_updates=10000
34
+ model_modalities_image_conv_resolution="4, 8, 16"
35
+ model_modalities_image_conv_in_chans="1, 256, 384, 768"
36
+
37
+ if [[ $train_mode == "default" && ${config_option} -eq 0 ]]; then
38
+ echo "Config ${train_mode} ${config_option}"
39
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
40
+ task_load_clap_emb=false
41
+ task_load_source_file=true
42
+ task_load_mel_file=false
43
+ model_proj_type=null
44
+ model_clone_batch=4
45
+ dataset_batch_size=96
46
+ model_clap_loss=0
47
+ checkpoint_keep_interval_updates=-1
48
+ elif [[ $train_mode == "disp" && ${config_option} -eq 0 ]]; then
49
+ echo "Config ${train_mode} ${config_option}"
50
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
51
+ task_load_clap_emb=false
52
+ task_load_source_file=true
53
+ task_load_mel_file=false
54
+ model_proj_type=null
55
+ model_clone_batch=4
56
+ dataset_batch_size=96
57
+ model_dispersive_loss=1
58
+ model_dispersive_loss_layer=0
59
+ checkpoint_keep_interval_updates=1
60
+ elif [[ $train_mode == "disp" && ${config_option} -eq 1 ]]; then
61
+ echo "Config ${train_mode} ${config_option}"
62
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
63
+ task_load_clap_emb=false
64
+ task_load_source_file=true
65
+ task_load_mel_file=false
66
+ model_proj_type=null
67
+ model_clone_batch=1
68
+ dataset_batch_size=384
69
+ model_dispersive_loss=1
70
+ model_dispersive_loss_layer=0
71
+ checkpoint_keep_interval_updates=1
72
+ elif [[ $train_mode == "disp" && ${config_option} -eq 2 ]]; then
73
+ echo "Config ${train_mode} ${config_option}"
74
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
75
+ task_load_clap_emb=false
76
+ task_load_source_file=true
77
+ task_load_mel_file=false
78
+ model_proj_type=null
79
+ model_clone_batch=1
80
+ dataset_batch_size=384
81
+ model_dispersive_loss=10.0
82
+ model_dispersive_loss_layer=0
83
+ checkpoint_keep_interval_updates=1
84
+ elif [[ $train_mode == "disp" && ${config_option} -eq 3 ]]; then
85
+ echo "Config ${train_mode} ${config_option}"
86
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
87
+ task_load_clap_emb=false
88
+ task_load_source_file=true
89
+ task_load_mel_file=false
90
+ model_proj_type=null
91
+ model_clone_batch=1
92
+ dataset_batch_size=384
93
+ model_dispersive_loss=100.0
94
+ model_dispersive_loss_layer=0
95
+ checkpoint_keep_interval_updates=1
96
+ elif [[ $train_mode == "disp" && ${config_option} -eq 4 ]]; then
97
+ echo "Config ${train_mode} ${config_option}"
98
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
99
+ task_load_clap_emb=false
100
+ task_load_source_file=true
101
+ task_load_mel_file=false
102
+ model_proj_type=null
103
+ model_clone_batch=1
104
+ dataset_batch_size=384
105
+ model_dispersive_loss=10000.0
106
+ model_dispersive_loss_layer=0
107
+ checkpoint_keep_interval_updates=1
108
+ elif [[ $train_mode == "disp" && ${config_option} -eq 5 ]]; then
109
+ echo "Config ${train_mode} ${config_option}"
110
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
111
+ task_load_clap_emb=false
112
+ task_load_source_file=true
113
+ task_load_mel_file=false
114
+ model_proj_type=null
115
+ model_clone_batch=1
116
+ dataset_batch_size=384
117
+ model_dispersive_loss=1000.0
118
+ model_dispersive_loss_layer=0
119
+ checkpoint_keep_interval_updates=1
120
+ elif [[ $train_mode == "disp" && ${config_option} -eq 6 ]]; then
121
+ echo "Config ${train_mode} ${config_option}"
122
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
123
+ task_load_clap_emb=false
124
+ task_load_source_file=true
125
+ task_load_mel_file=false
126
+ model_proj_type=null
127
+ model_clone_batch=4
128
+ dataset_batch_size=96
129
+ model_dispersive_loss=1000.0
130
+ model_dispersive_loss_layer=10
131
+ checkpoint_keep_interval_updates=1
132
+ elif [[ $train_mode == "clap" && ${config_option} -eq 0 ]]; then
133
+ echo "Config ${train_mode} ${config_option}"
134
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
135
+ task_load_clap_emb=true
136
+ model_proj_type=2
137
+ model_clone_batch=4
138
+ dataset_batch_size=48
139
+ model_clap_loss=1.0
140
+ average_top_k_layers=12
141
+ model_add_conv=false
142
+ elif [[ $train_mode == "clap" && ${config_option} -eq 1 ]]; then
143
+ echo "Config ${train_mode} ${config_option}"
144
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
145
+ task_load_clap_emb=true
146
+ model_proj_type=2
147
+ model_clone_batch=4
148
+ dataset_batch_size=48
149
+ model_clap_loss=1.0
150
+ average_top_k_layers=1
151
+ # loss type ablation
152
+ elif [[ $train_mode == "clap" && ${config_option} -eq 2 ]]; then
153
+ echo "Config ${train_mode} ${config_option}"
154
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
155
+ task_load_clap_emb=true
156
+ model_proj_type=2
157
+ model_clone_batch=4
158
+ dataset_batch_size=48
159
+ model_clap_loss=1.0
160
+ average_top_k_layers=12
161
+ model_clap_loss_type="ce"
162
+ elif [[ $train_mode == "clap" && ${config_option} -eq 3 ]]; then
163
+ echo "Config ${train_mode} ${config_option}"
164
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
165
+ task_load_clap_emb=true
166
+ model_proj_type=2
167
+ model_clone_batch=4
168
+ dataset_batch_size=48
169
+ model_clap_loss=1.0
170
+ average_top_k_layers=12
171
+ model_clap_loss_type="l1"
172
+ elif [[ $train_mode == "clap" && ${config_option} -eq 4 ]]; then
173
+ echo "Config ${train_mode} ${config_option}"
174
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
175
+ task_load_clap_emb=true
176
+ model_proj_type=2
177
+ model_clone_batch=4
178
+ dataset_batch_size=96
179
+ model_clap_loss=1.0
180
+ average_top_k_layers=12
181
+ model_clap_loss_type="cosine"
182
+ # loss layer ablation
183
+ elif [[ $train_mode == "clap" && ${config_option} -eq 5 ]]; then
184
+ echo "Config ${train_mode} ${config_option}"
185
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
186
+ task_load_clap_emb=true
187
+ model_proj_type=2
188
+ model_clone_batch=4
189
+ dataset_batch_size=96
190
+ model_clap_loss=1.0
191
+ average_top_k_layers=12
192
+ model_clap_loss_type="mse"
193
+ model_clap_loss_layer=10
194
+ elif [[ $train_mode == "clap" && ${config_option} -eq 6 ]]; then
195
+ echo "Config ${train_mode} ${config_option}"
196
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
197
+ task_load_clap_emb=true
198
+ task_load_source_file=true
199
+ task_load_mel_file=false
200
+ model_proj_type=2
201
+ model_clone_batch=4
202
+ dataset_batch_size=96
203
+ model_clap_loss=1.0
204
+ average_top_k_layers=12
205
+ model_clap_loss_type="mse"
206
+ model_clap_loss_layer=8
207
+ elif [[ $train_mode == "clap" && ${config_option} -eq 7 ]]; then
208
+ echo "Config ${train_mode} ${config_option}"
209
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
210
+ task_load_clap_emb=true
211
+ task_load_source_file=true
212
+ task_load_mel_file=false
213
+ model_proj_type=2
214
+ model_clone_batch=4
215
+ dataset_batch_size=96
216
+ model_clap_loss=1.0
217
+ average_top_k_layers=12
218
+ model_clap_loss_type="mse"
219
+ model_clap_loss_layer=6
220
+ elif [[ $train_mode == "clap" && ${config_option} -eq 8 ]]; then
221
+ echo "Config ${train_mode} ${config_option}"
222
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
223
+ task_load_clap_emb=true
224
+ task_load_source_file=true
225
+ task_load_mel_file=false
226
+ model_proj_type=2
227
+ model_clone_batch=4
228
+ model_clap_loss=5.0
229
+ dataset_batch_size=96
230
+ average_top_k_layers=12
231
+ model_clap_loss_type="mse"
232
+ checkpoint_keep_interval_updates=-1
233
+ elif [[ $train_mode == "clap" && ${config_option} -eq 9 ]]; then
234
+ echo "Config ${train_mode} ${config_option}"
235
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
236
+ task_load_clap_emb=true
237
+ task_load_source_file=true
238
+ task_load_mel_file=false
239
+ model_proj_type=2
240
+ model_clone_batch=4
241
+ model_clap_loss=0.1
242
+ dataset_batch_size=96
243
+ average_top_k_layers=12
244
+ model_clap_loss_type="mse"
245
+ checkpoint_keep_interval_updates=-1
246
+ elif [[ $train_mode == "ast" && ${config_option} -eq 0 ]]; then
247
+ echo "Config ${train_mode} ${config_option}"
248
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
249
+ task_load_clap_emb=true
250
+ model_proj_type=4
251
+ model_clone_batch=4
252
+ model_clap_loss=1.0
253
+ dataset_batch_size=48
254
+ elif [[ $train_mode == "ast" && ${config_option} -eq 1 ]]; then
255
+ echo "Config ${train_mode} ${config_option}"
256
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
257
+ task_load_clap_emb=true
258
+ model_proj_type=4
259
+ model_clone_batch=4
260
+ model_clap_loss=0.001
261
+ dataset_batch_size=48
262
+ elif [[ $train_mode == "ast" && ${config_option} -eq 2 ]]; then
263
+ echo "Config ${train_mode} ${config_option}"
264
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
265
+ task_load_clap_emb=true
266
+ model_proj_type=4
267
+ model_clone_batch=4
268
+ model_clap_loss=0.01
269
+ dataset_batch_size=48
270
+ elif [[ $train_mode == "ast" && ${config_option} -eq 3 ]]; then
271
+ echo "Config ${train_mode} ${config_option}"
272
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_out
273
+ task_load_clap_emb=true
274
+ model_proj_type=6
275
+ model_clone_batch=4
276
+ dataset_batch_size=48
277
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 0 ]]; then
278
+ echo "Config ${train_mode} ${config_option}"
279
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
280
+ task_load_clap_emb=true
281
+ task_load_source_file=true
282
+ task_load_mel_file=false
283
+ model_proj_type=2
284
+ model_clone_batch=4
285
+ dataset_batch_size=64 # original 48 oom on 4090 24G change distributed_world_size
286
+ model_clap_loss=1.0
287
+ average_top_k_layers=11 # modify with model depth
288
+ model_add_conv=true
289
+ model_depth=11 #
290
+ checkpoint_keep_interval_updates=-1 # default 1
291
+ checkpoint_save_interval_updates=10000
292
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 1 ]]; then
293
+ echo "Config ${train_mode} ${config_option}"
294
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
295
+ task_load_clap_emb=true
296
+ task_load_source_file=true
297
+ task_load_mel_file=false
298
+ model_proj_type=2
299
+ model_clone_batch=4
300
+ dataset_batch_size=48 # original 48 oom on 4090 24G change distributed_world_size
301
+ model_clap_loss=1.0
302
+ average_top_k_layers=12 # modify with model depth
303
+ model_add_conv=true
304
+ model_modalities_image_conv_resolution="16,"
305
+ model_modalities_image_conv_in_chans="1, 768"
306
+ model_depth=12 #
307
+ checkpoint_keep_interval_updates=1 # default 1
308
+ checkpoint_save_interval_updates=10000
309
+ fi
310
+
311
+ python fairseq_cli/hydra_train.py -m \
312
+ --config-dir ./EAT/config \
313
+ --config-name pretraining_AS2M \
314
+ common.user_dir=./EAT \
315
+ checkpoint.save_dir=${checkpoint_save_dir} \
316
+ checkpoint.restore_file=${checkpoint_restore_file} \
317
+ distributed_training.distributed_world_size=${1:-2} \
318
+ dataset.num_workers=24 \
319
+ dataset.data_buffer_size=48 \
320
+ dataset.batch_size=${dataset_batch_size} \
321
+ task.data=${task_data} \
322
+ task.h5_format=False \
323
+ task.load_clap_emb=${task_load_clap_emb} \
324
+ +task.load_source_file=${task_load_source_file} \
325
+ +task.load_mel_file=${task_load_mel_file} \
326
+ model.proj_type=${model_proj_type} \
327
+ model.clone_batch=${model_clone_batch} \
328
+ model.clap_loss=${model_clap_loss} \
329
+ model.average_top_k_layers=${average_top_k_layers} \
330
+ +model.add_conv=${model_add_conv} \
331
+ +model.clap_loss_type=${model_clap_loss_type} \
332
+ +model.clap_loss_layer=${model_clap_loss_layer} \
333
+ +model.dispersive_loss=${model_dispersive_loss} \
334
+ +model.dispersive_loss_layer=${model_dispersive_loss_layer} \
335
+ model.depth=${model_depth} \
336
+ +model.modalities.image.conv_resolution=${model_modalities_image_conv_resolution} \
337
+ +model.modalities.image.conv_in_chans=${model_modalities_image_conv_in_chans} \
338
+ checkpoint.keep_interval_updates=${checkpoint_keep_interval_updates} \
339
+ checkpoint.save_interval_updates=${checkpoint_save_interval_updates}
pre_4_AS2M/conv_clap_1_2025-09-30_07-01-07/pretraining_AS2M.sh ADDED
@@ -0,0 +1,339 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ # config options
3
+ train_mode=conv_clap
4
+ config_option=1
5
+ # change world size
6
+
7
+ # shared config
8
+ SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
9
+ checkpoint_save_dir=${SAVE_DIR_ROOT}/${train_mode}_${config_option}_$(date +"%Y-%m-%d_%H-%M-%S")
10
+ checkpoint_restore_file=${checkpoint_save_dir}/checkpoint_last.pt
11
+
12
+ # 脚本自身的绝对路径与文件名(解析符号链接)
13
+ script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
14
+ script_name="$(basename -- "$script_path")"
15
+ # 创建目录并拷贝(保留权限与时间戳)
16
+ mkdir -p -- "$checkpoint_save_dir"
17
+ cp -p -- "$script_path" "$checkpoint_save_dir/$script_name"
18
+ echo "script_path: ${script_path}"
19
+ echo "checkpoint_save_dir: ${checkpoint_save_dir}"
20
+
21
+ # default setting
22
+ model_clone_batch=4
23
+ dataset_batch_size=48
24
+ model_clap_loss=0
25
+ model_clap_loss_type="mse" # option ce cosine l1
26
+ model_clap_loss_layer=0
27
+ average_top_k_layers=12
28
+ model_add_conv=false
29
+ model_depth=12
30
+ model_dispersive_loss=0
31
+ model_dispersive_loss_layer=0
32
+ checkpoint_keep_interval_updates=1 # TODO change this parameter if need
33
+ checkpoint_save_interval_updates=10000
34
+ model_modalities_image_conv_resolution='[4,8,16]'
35
+ model_modalities_image_conv_in_chans='[1,256,384,768]'
36
+
37
+ if [[ $train_mode == "default" && ${config_option} -eq 0 ]]; then
38
+ echo "Config ${train_mode} ${config_option}"
39
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
40
+ task_load_clap_emb=false
41
+ task_load_source_file=true
42
+ task_load_mel_file=false
43
+ model_proj_type=null
44
+ model_clone_batch=4
45
+ dataset_batch_size=96
46
+ model_clap_loss=0
47
+ checkpoint_keep_interval_updates=-1
48
+ elif [[ $train_mode == "disp" && ${config_option} -eq 0 ]]; then
49
+ echo "Config ${train_mode} ${config_option}"
50
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
51
+ task_load_clap_emb=false
52
+ task_load_source_file=true
53
+ task_load_mel_file=false
54
+ model_proj_type=null
55
+ model_clone_batch=4
56
+ dataset_batch_size=96
57
+ model_dispersive_loss=1
58
+ model_dispersive_loss_layer=0
59
+ checkpoint_keep_interval_updates=1
60
+ elif [[ $train_mode == "disp" && ${config_option} -eq 1 ]]; then
61
+ echo "Config ${train_mode} ${config_option}"
62
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
63
+ task_load_clap_emb=false
64
+ task_load_source_file=true
65
+ task_load_mel_file=false
66
+ model_proj_type=null
67
+ model_clone_batch=1
68
+ dataset_batch_size=384
69
+ model_dispersive_loss=1
70
+ model_dispersive_loss_layer=0
71
+ checkpoint_keep_interval_updates=1
72
+ elif [[ $train_mode == "disp" && ${config_option} -eq 2 ]]; then
73
+ echo "Config ${train_mode} ${config_option}"
74
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
75
+ task_load_clap_emb=false
76
+ task_load_source_file=true
77
+ task_load_mel_file=false
78
+ model_proj_type=null
79
+ model_clone_batch=1
80
+ dataset_batch_size=384
81
+ model_dispersive_loss=10.0
82
+ model_dispersive_loss_layer=0
83
+ checkpoint_keep_interval_updates=1
84
+ elif [[ $train_mode == "disp" && ${config_option} -eq 3 ]]; then
85
+ echo "Config ${train_mode} ${config_option}"
86
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
87
+ task_load_clap_emb=false
88
+ task_load_source_file=true
89
+ task_load_mel_file=false
90
+ model_proj_type=null
91
+ model_clone_batch=1
92
+ dataset_batch_size=384
93
+ model_dispersive_loss=100.0
94
+ model_dispersive_loss_layer=0
95
+ checkpoint_keep_interval_updates=1
96
+ elif [[ $train_mode == "disp" && ${config_option} -eq 4 ]]; then
97
+ echo "Config ${train_mode} ${config_option}"
98
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
99
+ task_load_clap_emb=false
100
+ task_load_source_file=true
101
+ task_load_mel_file=false
102
+ model_proj_type=null
103
+ model_clone_batch=1
104
+ dataset_batch_size=384
105
+ model_dispersive_loss=10000.0
106
+ model_dispersive_loss_layer=0
107
+ checkpoint_keep_interval_updates=1
108
+ elif [[ $train_mode == "disp" && ${config_option} -eq 5 ]]; then
109
+ echo "Config ${train_mode} ${config_option}"
110
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
111
+ task_load_clap_emb=false
112
+ task_load_source_file=true
113
+ task_load_mel_file=false
114
+ model_proj_type=null
115
+ model_clone_batch=1
116
+ dataset_batch_size=384
117
+ model_dispersive_loss=1000.0
118
+ model_dispersive_loss_layer=0
119
+ checkpoint_keep_interval_updates=1
120
+ elif [[ $train_mode == "disp" && ${config_option} -eq 6 ]]; then
121
+ echo "Config ${train_mode} ${config_option}"
122
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
123
+ task_load_clap_emb=false
124
+ task_load_source_file=true
125
+ task_load_mel_file=false
126
+ model_proj_type=null
127
+ model_clone_batch=4
128
+ dataset_batch_size=96
129
+ model_dispersive_loss=1000.0
130
+ model_dispersive_loss_layer=10
131
+ checkpoint_keep_interval_updates=1
132
+ elif [[ $train_mode == "clap" && ${config_option} -eq 0 ]]; then
133
+ echo "Config ${train_mode} ${config_option}"
134
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
135
+ task_load_clap_emb=true
136
+ model_proj_type=2
137
+ model_clone_batch=4
138
+ dataset_batch_size=48
139
+ model_clap_loss=1.0
140
+ average_top_k_layers=12
141
+ model_add_conv=false
142
+ elif [[ $train_mode == "clap" && ${config_option} -eq 1 ]]; then
143
+ echo "Config ${train_mode} ${config_option}"
144
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
145
+ task_load_clap_emb=true
146
+ model_proj_type=2
147
+ model_clone_batch=4
148
+ dataset_batch_size=48
149
+ model_clap_loss=1.0
150
+ average_top_k_layers=1
151
+ # loss type ablation
152
+ elif [[ $train_mode == "clap" && ${config_option} -eq 2 ]]; then
153
+ echo "Config ${train_mode} ${config_option}"
154
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
155
+ task_load_clap_emb=true
156
+ model_proj_type=2
157
+ model_clone_batch=4
158
+ dataset_batch_size=48
159
+ model_clap_loss=1.0
160
+ average_top_k_layers=12
161
+ model_clap_loss_type="ce"
162
+ elif [[ $train_mode == "clap" && ${config_option} -eq 3 ]]; then
163
+ echo "Config ${train_mode} ${config_option}"
164
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
165
+ task_load_clap_emb=true
166
+ model_proj_type=2
167
+ model_clone_batch=4
168
+ dataset_batch_size=48
169
+ model_clap_loss=1.0
170
+ average_top_k_layers=12
171
+ model_clap_loss_type="l1"
172
+ elif [[ $train_mode == "clap" && ${config_option} -eq 4 ]]; then
173
+ echo "Config ${train_mode} ${config_option}"
174
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
175
+ task_load_clap_emb=true
176
+ model_proj_type=2
177
+ model_clone_batch=4
178
+ dataset_batch_size=96
179
+ model_clap_loss=1.0
180
+ average_top_k_layers=12
181
+ model_clap_loss_type="cosine"
182
+ # loss layer ablation
183
+ elif [[ $train_mode == "clap" && ${config_option} -eq 5 ]]; then
184
+ echo "Config ${train_mode} ${config_option}"
185
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
186
+ task_load_clap_emb=true
187
+ model_proj_type=2
188
+ model_clone_batch=4
189
+ dataset_batch_size=96
190
+ model_clap_loss=1.0
191
+ average_top_k_layers=12
192
+ model_clap_loss_type="mse"
193
+ model_clap_loss_layer=10
194
+ elif [[ $train_mode == "clap" && ${config_option} -eq 6 ]]; then
195
+ echo "Config ${train_mode} ${config_option}"
196
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
197
+ task_load_clap_emb=true
198
+ task_load_source_file=true
199
+ task_load_mel_file=false
200
+ model_proj_type=2
201
+ model_clone_batch=4
202
+ dataset_batch_size=96
203
+ model_clap_loss=1.0
204
+ average_top_k_layers=12
205
+ model_clap_loss_type="mse"
206
+ model_clap_loss_layer=8
207
+ elif [[ $train_mode == "clap" && ${config_option} -eq 7 ]]; then
208
+ echo "Config ${train_mode} ${config_option}"
209
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
210
+ task_load_clap_emb=true
211
+ task_load_source_file=true
212
+ task_load_mel_file=false
213
+ model_proj_type=2
214
+ model_clone_batch=4
215
+ dataset_batch_size=96
216
+ model_clap_loss=1.0
217
+ average_top_k_layers=12
218
+ model_clap_loss_type="mse"
219
+ model_clap_loss_layer=6
220
+ elif [[ $train_mode == "clap" && ${config_option} -eq 8 ]]; then
221
+ echo "Config ${train_mode} ${config_option}"
222
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
223
+ task_load_clap_emb=true
224
+ task_load_source_file=true
225
+ task_load_mel_file=false
226
+ model_proj_type=2
227
+ model_clone_batch=4
228
+ model_clap_loss=5.0
229
+ dataset_batch_size=96
230
+ average_top_k_layers=12
231
+ model_clap_loss_type="mse"
232
+ checkpoint_keep_interval_updates=-1
233
+ elif [[ $train_mode == "clap" && ${config_option} -eq 9 ]]; then
234
+ echo "Config ${train_mode} ${config_option}"
235
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
236
+ task_load_clap_emb=true
237
+ task_load_source_file=true
238
+ task_load_mel_file=false
239
+ model_proj_type=2
240
+ model_clone_batch=4
241
+ model_clap_loss=0.1
242
+ dataset_batch_size=96
243
+ average_top_k_layers=12
244
+ model_clap_loss_type="mse"
245
+ checkpoint_keep_interval_updates=-1
246
+ elif [[ $train_mode == "ast" && ${config_option} -eq 0 ]]; then
247
+ echo "Config ${train_mode} ${config_option}"
248
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
249
+ task_load_clap_emb=true
250
+ model_proj_type=4
251
+ model_clone_batch=4
252
+ model_clap_loss=1.0
253
+ dataset_batch_size=48
254
+ elif [[ $train_mode == "ast" && ${config_option} -eq 1 ]]; then
255
+ echo "Config ${train_mode} ${config_option}"
256
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
257
+ task_load_clap_emb=true
258
+ model_proj_type=4
259
+ model_clone_batch=4
260
+ model_clap_loss=0.001
261
+ dataset_batch_size=48
262
+ elif [[ $train_mode == "ast" && ${config_option} -eq 2 ]]; then
263
+ echo "Config ${train_mode} ${config_option}"
264
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
265
+ task_load_clap_emb=true
266
+ model_proj_type=4
267
+ model_clone_batch=4
268
+ model_clap_loss=0.01
269
+ dataset_batch_size=48
270
+ elif [[ $train_mode == "ast" && ${config_option} -eq 3 ]]; then
271
+ echo "Config ${train_mode} ${config_option}"
272
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_out
273
+ task_load_clap_emb=true
274
+ model_proj_type=6
275
+ model_clone_batch=4
276
+ dataset_batch_size=48
277
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 0 ]]; then
278
+ echo "Config ${train_mode} ${config_option}"
279
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
280
+ task_load_clap_emb=true
281
+ task_load_source_file=true
282
+ task_load_mel_file=false
283
+ model_proj_type=2
284
+ model_clone_batch=4
285
+ dataset_batch_size=64 # original 48 oom on 4090 24G change distributed_world_size
286
+ model_clap_loss=1.0
287
+ average_top_k_layers=11 # modify with model depth
288
+ model_add_conv=true
289
+ model_depth=11 #
290
+ checkpoint_keep_interval_updates=-1 # default 1
291
+ checkpoint_save_interval_updates=10000
292
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 1 ]]; then
293
+ echo "Config ${train_mode} ${config_option}"
294
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
295
+ task_load_clap_emb=true
296
+ task_load_source_file=true
297
+ task_load_mel_file=false
298
+ model_proj_type=2
299
+ model_clone_batch=4
300
+ dataset_batch_size=48 # original 48 oom on 4090 24G change distributed_world_size
301
+ model_clap_loss=1.0
302
+ average_top_k_layers=12 # modify with model depth
303
+ model_add_conv=true
304
+ model_modalities_image_conv_resolution='[16]'
305
+ model_modalities_image_conv_in_chans='[1,768]'
306
+ model_depth=12 #
307
+ checkpoint_keep_interval_updates=1 # default 1
308
+ checkpoint_save_interval_updates=10000
309
+ fi
310
+
311
+ python fairseq_cli/hydra_train.py -m \
312
+ --config-dir ./EAT/config \
313
+ --config-name pretraining_AS2M \
314
+ common.user_dir=./EAT \
315
+ checkpoint.save_dir=${checkpoint_save_dir} \
316
+ checkpoint.restore_file=${checkpoint_restore_file} \
317
+ distributed_training.distributed_world_size=${1:-2} \
318
+ dataset.num_workers=24 \
319
+ dataset.data_buffer_size=48 \
320
+ dataset.batch_size=${dataset_batch_size} \
321
+ task.data=${task_data} \
322
+ task.h5_format=False \
323
+ task.load_clap_emb=${task_load_clap_emb} \
324
+ +task.load_source_file=${task_load_source_file} \
325
+ +task.load_mel_file=${task_load_mel_file} \
326
+ model.proj_type=${model_proj_type} \
327
+ model.clone_batch=${model_clone_batch} \
328
+ model.clap_loss=${model_clap_loss} \
329
+ model.average_top_k_layers=${average_top_k_layers} \
330
+ +model.add_conv=${model_add_conv} \
331
+ +model.clap_loss_type=${model_clap_loss_type} \
332
+ +model.clap_loss_layer=${model_clap_loss_layer} \
333
+ +model.dispersive_loss=${model_dispersive_loss} \
334
+ +model.dispersive_loss_layer=${model_dispersive_loss_layer} \
335
+ model.depth=${model_depth} \
336
+ +model.modalities.image.conv_resolution=${model_modalities_image_conv_resolution} \
337
+ +model.modalities.image.conv_in_chans=${model_modalities_image_conv_in_chans} \
338
+ checkpoint.keep_interval_updates=${checkpoint_keep_interval_updates} \
339
+ checkpoint.save_interval_updates=${checkpoint_save_interval_updates}
pre_4_AS2M/conv_clap_1_2025-09-30_07-08-58/pretraining_AS2M.sh ADDED
@@ -0,0 +1,336 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ # config options
3
+ train_mode=conv_clap
4
+ config_option=1
5
+ # change world size
6
+
7
+ # shared config
8
+ SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
9
+ checkpoint_save_dir=${SAVE_DIR_ROOT}/${train_mode}_${config_option}_$(date +"%Y-%m-%d_%H-%M-%S")
10
+ checkpoint_restore_file=${checkpoint_save_dir}/checkpoint_last.pt
11
+
12
+ # 脚本自身的绝对路径与文件名(解析符号链接)
13
+ script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
14
+ script_name="$(basename -- "$script_path")"
15
+ # 创建目录并拷贝(保留权限与时间戳)
16
+ mkdir -p -- "$checkpoint_save_dir"
17
+ cp -p -- "$script_path" "$checkpoint_save_dir/$script_name"
18
+ echo "script_path: ${script_path}"
19
+ echo "checkpoint_save_dir: ${checkpoint_save_dir}"
20
+
21
+ # default setting
22
+ model_clone_batch=4
23
+ dataset_batch_size=48
24
+ model_clap_loss=0
25
+ model_clap_loss_type="mse" # option ce cosine l1
26
+ model_clap_loss_layer=0
27
+ average_top_k_layers=12
28
+ model_add_conv=false
29
+ model_depth=12
30
+ model_dispersive_loss=0
31
+ model_dispersive_loss_layer=0
32
+ checkpoint_keep_interval_updates=1 # TODO change this parameter if need
33
+ checkpoint_save_interval_updates=10000
34
+ model_modalities_image_conv_option=0
35
+
36
+ if [[ $train_mode == "default" && ${config_option} -eq 0 ]]; then
37
+ echo "Config ${train_mode} ${config_option}"
38
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
39
+ task_load_clap_emb=false
40
+ task_load_source_file=true
41
+ task_load_mel_file=false
42
+ model_proj_type=null
43
+ model_clone_batch=4
44
+ dataset_batch_size=96
45
+ model_clap_loss=0
46
+ checkpoint_keep_interval_updates=-1
47
+ elif [[ $train_mode == "disp" && ${config_option} -eq 0 ]]; then
48
+ echo "Config ${train_mode} ${config_option}"
49
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
50
+ task_load_clap_emb=false
51
+ task_load_source_file=true
52
+ task_load_mel_file=false
53
+ model_proj_type=null
54
+ model_clone_batch=4
55
+ dataset_batch_size=96
56
+ model_dispersive_loss=1
57
+ model_dispersive_loss_layer=0
58
+ checkpoint_keep_interval_updates=1
59
+ elif [[ $train_mode == "disp" && ${config_option} -eq 1 ]]; then
60
+ echo "Config ${train_mode} ${config_option}"
61
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
62
+ task_load_clap_emb=false
63
+ task_load_source_file=true
64
+ task_load_mel_file=false
65
+ model_proj_type=null
66
+ model_clone_batch=1
67
+ dataset_batch_size=384
68
+ model_dispersive_loss=1
69
+ model_dispersive_loss_layer=0
70
+ checkpoint_keep_interval_updates=1
71
+ elif [[ $train_mode == "disp" && ${config_option} -eq 2 ]]; then
72
+ echo "Config ${train_mode} ${config_option}"
73
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
74
+ task_load_clap_emb=false
75
+ task_load_source_file=true
76
+ task_load_mel_file=false
77
+ model_proj_type=null
78
+ model_clone_batch=1
79
+ dataset_batch_size=384
80
+ model_dispersive_loss=10.0
81
+ model_dispersive_loss_layer=0
82
+ checkpoint_keep_interval_updates=1
83
+ elif [[ $train_mode == "disp" && ${config_option} -eq 3 ]]; then
84
+ echo "Config ${train_mode} ${config_option}"
85
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
86
+ task_load_clap_emb=false
87
+ task_load_source_file=true
88
+ task_load_mel_file=false
89
+ model_proj_type=null
90
+ model_clone_batch=1
91
+ dataset_batch_size=384
92
+ model_dispersive_loss=100.0
93
+ model_dispersive_loss_layer=0
94
+ checkpoint_keep_interval_updates=1
95
+ elif [[ $train_mode == "disp" && ${config_option} -eq 4 ]]; then
96
+ echo "Config ${train_mode} ${config_option}"
97
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
98
+ task_load_clap_emb=false
99
+ task_load_source_file=true
100
+ task_load_mel_file=false
101
+ model_proj_type=null
102
+ model_clone_batch=1
103
+ dataset_batch_size=384
104
+ model_dispersive_loss=10000.0
105
+ model_dispersive_loss_layer=0
106
+ checkpoint_keep_interval_updates=1
107
+ elif [[ $train_mode == "disp" && ${config_option} -eq 5 ]]; then
108
+ echo "Config ${train_mode} ${config_option}"
109
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
110
+ task_load_clap_emb=false
111
+ task_load_source_file=true
112
+ task_load_mel_file=false
113
+ model_proj_type=null
114
+ model_clone_batch=1
115
+ dataset_batch_size=384
116
+ model_dispersive_loss=1000.0
117
+ model_dispersive_loss_layer=0
118
+ checkpoint_keep_interval_updates=1
119
+ elif [[ $train_mode == "disp" && ${config_option} -eq 6 ]]; then
120
+ echo "Config ${train_mode} ${config_option}"
121
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
122
+ task_load_clap_emb=false
123
+ task_load_source_file=true
124
+ task_load_mel_file=false
125
+ model_proj_type=null
126
+ model_clone_batch=4
127
+ dataset_batch_size=96
128
+ model_dispersive_loss=1000.0
129
+ model_dispersive_loss_layer=10
130
+ checkpoint_keep_interval_updates=1
131
+ elif [[ $train_mode == "clap" && ${config_option} -eq 0 ]]; then
132
+ echo "Config ${train_mode} ${config_option}"
133
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
134
+ task_load_clap_emb=true
135
+ model_proj_type=2
136
+ model_clone_batch=4
137
+ dataset_batch_size=48
138
+ model_clap_loss=1.0
139
+ average_top_k_layers=12
140
+ model_add_conv=false
141
+ elif [[ $train_mode == "clap" && ${config_option} -eq 1 ]]; then
142
+ echo "Config ${train_mode} ${config_option}"
143
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
144
+ task_load_clap_emb=true
145
+ model_proj_type=2
146
+ model_clone_batch=4
147
+ dataset_batch_size=48
148
+ model_clap_loss=1.0
149
+ average_top_k_layers=1
150
+ # loss type ablation
151
+ elif [[ $train_mode == "clap" && ${config_option} -eq 2 ]]; then
152
+ echo "Config ${train_mode} ${config_option}"
153
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
154
+ task_load_clap_emb=true
155
+ model_proj_type=2
156
+ model_clone_batch=4
157
+ dataset_batch_size=48
158
+ model_clap_loss=1.0
159
+ average_top_k_layers=12
160
+ model_clap_loss_type="ce"
161
+ elif [[ $train_mode == "clap" && ${config_option} -eq 3 ]]; then
162
+ echo "Config ${train_mode} ${config_option}"
163
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
164
+ task_load_clap_emb=true
165
+ model_proj_type=2
166
+ model_clone_batch=4
167
+ dataset_batch_size=48
168
+ model_clap_loss=1.0
169
+ average_top_k_layers=12
170
+ model_clap_loss_type="l1"
171
+ elif [[ $train_mode == "clap" && ${config_option} -eq 4 ]]; then
172
+ echo "Config ${train_mode} ${config_option}"
173
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
174
+ task_load_clap_emb=true
175
+ model_proj_type=2
176
+ model_clone_batch=4
177
+ dataset_batch_size=96
178
+ model_clap_loss=1.0
179
+ average_top_k_layers=12
180
+ model_clap_loss_type="cosine"
181
+ # loss layer ablation
182
+ elif [[ $train_mode == "clap" && ${config_option} -eq 5 ]]; then
183
+ echo "Config ${train_mode} ${config_option}"
184
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
185
+ task_load_clap_emb=true
186
+ model_proj_type=2
187
+ model_clone_batch=4
188
+ dataset_batch_size=96
189
+ model_clap_loss=1.0
190
+ average_top_k_layers=12
191
+ model_clap_loss_type="mse"
192
+ model_clap_loss_layer=10
193
+ elif [[ $train_mode == "clap" && ${config_option} -eq 6 ]]; then
194
+ echo "Config ${train_mode} ${config_option}"
195
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
196
+ task_load_clap_emb=true
197
+ task_load_source_file=true
198
+ task_load_mel_file=false
199
+ model_proj_type=2
200
+ model_clone_batch=4
201
+ dataset_batch_size=96
202
+ model_clap_loss=1.0
203
+ average_top_k_layers=12
204
+ model_clap_loss_type="mse"
205
+ model_clap_loss_layer=8
206
+ elif [[ $train_mode == "clap" && ${config_option} -eq 7 ]]; then
207
+ echo "Config ${train_mode} ${config_option}"
208
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
209
+ task_load_clap_emb=true
210
+ task_load_source_file=true
211
+ task_load_mel_file=false
212
+ model_proj_type=2
213
+ model_clone_batch=4
214
+ dataset_batch_size=96
215
+ model_clap_loss=1.0
216
+ average_top_k_layers=12
217
+ model_clap_loss_type="mse"
218
+ model_clap_loss_layer=6
219
+ elif [[ $train_mode == "clap" && ${config_option} -eq 8 ]]; then
220
+ echo "Config ${train_mode} ${config_option}"
221
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
222
+ task_load_clap_emb=true
223
+ task_load_source_file=true
224
+ task_load_mel_file=false
225
+ model_proj_type=2
226
+ model_clone_batch=4
227
+ model_clap_loss=5.0
228
+ dataset_batch_size=96
229
+ average_top_k_layers=12
230
+ model_clap_loss_type="mse"
231
+ checkpoint_keep_interval_updates=-1
232
+ elif [[ $train_mode == "clap" && ${config_option} -eq 9 ]]; then
233
+ echo "Config ${train_mode} ${config_option}"
234
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
235
+ task_load_clap_emb=true
236
+ task_load_source_file=true
237
+ task_load_mel_file=false
238
+ model_proj_type=2
239
+ model_clone_batch=4
240
+ model_clap_loss=0.1
241
+ dataset_batch_size=96
242
+ average_top_k_layers=12
243
+ model_clap_loss_type="mse"
244
+ checkpoint_keep_interval_updates=-1
245
+ elif [[ $train_mode == "ast" && ${config_option} -eq 0 ]]; then
246
+ echo "Config ${train_mode} ${config_option}"
247
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
248
+ task_load_clap_emb=true
249
+ model_proj_type=4
250
+ model_clone_batch=4
251
+ model_clap_loss=1.0
252
+ dataset_batch_size=48
253
+ elif [[ $train_mode == "ast" && ${config_option} -eq 1 ]]; then
254
+ echo "Config ${train_mode} ${config_option}"
255
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
256
+ task_load_clap_emb=true
257
+ model_proj_type=4
258
+ model_clone_batch=4
259
+ model_clap_loss=0.001
260
+ dataset_batch_size=48
261
+ elif [[ $train_mode == "ast" && ${config_option} -eq 2 ]]; then
262
+ echo "Config ${train_mode} ${config_option}"
263
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
264
+ task_load_clap_emb=true
265
+ model_proj_type=4
266
+ model_clone_batch=4
267
+ model_clap_loss=0.01
268
+ dataset_batch_size=48
269
+ elif [[ $train_mode == "ast" && ${config_option} -eq 3 ]]; then
270
+ echo "Config ${train_mode} ${config_option}"
271
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_out
272
+ task_load_clap_emb=true
273
+ model_proj_type=6
274
+ model_clone_batch=4
275
+ dataset_batch_size=48
276
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 0 ]]; then
277
+ echo "Config ${train_mode} ${config_option}"
278
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
279
+ task_load_clap_emb=true
280
+ task_load_source_file=true
281
+ task_load_mel_file=false
282
+ model_proj_type=2
283
+ model_clone_batch=4
284
+ dataset_batch_size=64 # original 48 oom on 4090 24G change distributed_world_size
285
+ model_clap_loss=1.0
286
+ average_top_k_layers=11 # modify with model depth
287
+ model_add_conv=true
288
+ model_depth=11 #
289
+ checkpoint_keep_interval_updates=-1 # default 1
290
+ checkpoint_save_interval_updates=10000
291
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 1 ]]; then
292
+ echo "Config ${train_mode} ${config_option}"
293
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
294
+ task_load_clap_emb=true
295
+ task_load_source_file=true
296
+ task_load_mel_file=false
297
+ model_proj_type=2
298
+ model_clone_batch=4
299
+ dataset_batch_size=48 # original 48 oom on 4090 24G change distributed_world_size
300
+ model_clap_loss=1.0
301
+ average_top_k_layers=12 # modify with model depth
302
+ model_add_conv=true
303
+ model_modalities_image_conv_option=1
304
+ model_depth=12 #
305
+ checkpoint_keep_interval_updates=1 # default 1
306
+ checkpoint_save_interval_updates=10000
307
+ fi
308
+
309
+ python fairseq_cli/hydra_train.py -m \
310
+ --config-dir ./EAT/config \
311
+ --config-name pretraining_AS2M \
312
+ common.user_dir=./EAT \
313
+ checkpoint.save_dir=${checkpoint_save_dir} \
314
+ checkpoint.restore_file=${checkpoint_restore_file} \
315
+ distributed_training.distributed_world_size=${1:-2} \
316
+ dataset.num_workers=24 \
317
+ dataset.data_buffer_size=48 \
318
+ dataset.batch_size=${dataset_batch_size} \
319
+ task.data=${task_data} \
320
+ task.h5_format=False \
321
+ task.load_clap_emb=${task_load_clap_emb} \
322
+ +task.load_source_file=${task_load_source_file} \
323
+ +task.load_mel_file=${task_load_mel_file} \
324
+ model.proj_type=${model_proj_type} \
325
+ model.clone_batch=${model_clone_batch} \
326
+ model.clap_loss=${model_clap_loss} \
327
+ model.average_top_k_layers=${average_top_k_layers} \
328
+ +model.add_conv=${model_add_conv} \
329
+ +model.clap_loss_type=${model_clap_loss_type} \
330
+ +model.clap_loss_layer=${model_clap_loss_layer} \
331
+ +model.dispersive_loss=${model_dispersive_loss} \
332
+ +model.dispersive_loss_layer=${model_dispersive_loss_layer} \
333
+ model.depth=${model_depth} \
334
+ +model.modalities.image.conv_option=${model_modalities_image_conv_option} \
335
+ checkpoint.keep_interval_updates=${checkpoint_keep_interval_updates} \
336
+ checkpoint.save_interval_updates=${checkpoint_save_interval_updates}
pre_4_AS2M/conv_clap_1_2025-09-30_07-14-17/pretraining_AS2M.sh ADDED
@@ -0,0 +1,336 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ # Experiment selector: train_mode and config_option together pick one branch of the if/elif chain further down.
3
+ train_mode=conv_clap
4
+ config_option=1
5
+ # World size is not set here; the launch command reads it from the first positional argument (default 2).
6
+
7
+ # Shared config: each launch writes into its own timestamped directory under SAVE_DIR_ROOT.
8
+ SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
9
+ checkpoint_save_dir=${SAVE_DIR_ROOT}/${train_mode}_${config_option}_$(date +"%Y-%m-%d_%H-%M-%S")
10
+ checkpoint_restore_file=${checkpoint_save_dir}/checkpoint_last.pt # NOTE(review): the save dir is newly timestamped on every launch, so this file will normally not exist yet at start-up; confirm whether resuming an earlier run is intended
11
+
12
+ # Absolute path and file name of this script (symlinks resolved)
13
+ script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
14
+ script_name="$(basename -- "$script_path")"
15
+ # Create the save directory and copy this script into it (preserving mode and timestamps) as a record of the run configuration
16
+ mkdir -p -- "$checkpoint_save_dir"
17
+ cp -p -- "$script_path" "$checkpoint_save_dir/$script_name"
18
+ echo "script_path: ${script_path}"
19
+ echo "checkpoint_save_dir: ${checkpoint_save_dir}"
20
+
21
+ # Default settings; the selected branch below overrides a subset of them. NOTE(review): task_data, task_load_clap_emb, task_load_source_file, task_load_mel_file and model_proj_type get no default here, so branches that omit them (e.g. clap 0-5 and ast 0-3 never set the two task_load_*_file flags) hand an empty value to the launch command.
22
+ model_clone_batch=4
23
+ dataset_batch_size=48
24
+ model_clap_loss=0
25
+ model_clap_loss_type="mse" # other values used by the branches below: ce, cosine, l1
26
+ model_clap_loss_layer=0
27
+ average_top_k_layers=12
28
+ model_add_conv=false
29
+ model_depth=12
30
+ model_dispersive_loss=0
31
+ model_dispersive_loss_layer=0
32
+ checkpoint_keep_interval_updates=1 # TODO: change this parameter if needed (some branches set it to -1)
33
+ checkpoint_save_interval_updates=10000
34
+ model_modalities_image_conv_option=0
35
+ # Per-experiment overrides: at most one (train_mode, config_option) branch runs. NOTE(review): there is no else branch, so an unmatched combination falls through silently with task_data and model_proj_type never assigned.
36
+ if [[ $train_mode == "default" && ${config_option} -eq 0 ]]; then
37
+ echo "Config ${train_mode} ${config_option}"
38
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
39
+ task_load_clap_emb=false
40
+ task_load_source_file=true
41
+ task_load_mel_file=false
42
+ model_proj_type=null
43
+ model_clone_batch=4
44
+ dataset_batch_size=96
45
+ model_clap_loss=0
46
+ checkpoint_keep_interval_updates=-1
47
+ elif [[ $train_mode == "disp" && ${config_option} -eq 0 ]]; then
48
+ echo "Config ${train_mode} ${config_option}"
49
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
50
+ task_load_clap_emb=false
51
+ task_load_source_file=true
52
+ task_load_mel_file=false
53
+ model_proj_type=null
54
+ model_clone_batch=4
55
+ dataset_batch_size=96
56
+ model_dispersive_loss=1
57
+ model_dispersive_loss_layer=0
58
+ checkpoint_keep_interval_updates=1
59
+ elif [[ $train_mode == "disp" && ${config_option} -eq 1 ]]; then
60
+ echo "Config ${train_mode} ${config_option}"
61
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
62
+ task_load_clap_emb=false
63
+ task_load_source_file=true
64
+ task_load_mel_file=false
65
+ model_proj_type=null
66
+ model_clone_batch=1
67
+ dataset_batch_size=384
68
+ model_dispersive_loss=1
69
+ model_dispersive_loss_layer=0
70
+ checkpoint_keep_interval_updates=1
71
+ elif [[ $train_mode == "disp" && ${config_option} -eq 2 ]]; then
72
+ echo "Config ${train_mode} ${config_option}"
73
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
74
+ task_load_clap_emb=false
75
+ task_load_source_file=true
76
+ task_load_mel_file=false
77
+ model_proj_type=null
78
+ model_clone_batch=1
79
+ dataset_batch_size=384
80
+ model_dispersive_loss=10.0
81
+ model_dispersive_loss_layer=0
82
+ checkpoint_keep_interval_updates=1
83
+ elif [[ $train_mode == "disp" && ${config_option} -eq 3 ]]; then
84
+ echo "Config ${train_mode} ${config_option}"
85
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
86
+ task_load_clap_emb=false
87
+ task_load_source_file=true
88
+ task_load_mel_file=false
89
+ model_proj_type=null
90
+ model_clone_batch=1
91
+ dataset_batch_size=384
92
+ model_dispersive_loss=100.0
93
+ model_dispersive_loss_layer=0
94
+ checkpoint_keep_interval_updates=1
95
+ elif [[ $train_mode == "disp" && ${config_option} -eq 4 ]]; then
96
+ echo "Config ${train_mode} ${config_option}"
97
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
98
+ task_load_clap_emb=false
99
+ task_load_source_file=true
100
+ task_load_mel_file=false
101
+ model_proj_type=null
102
+ model_clone_batch=1
103
+ dataset_batch_size=384
104
+ model_dispersive_loss=10000.0
105
+ model_dispersive_loss_layer=0
106
+ checkpoint_keep_interval_updates=1
107
+ elif [[ $train_mode == "disp" && ${config_option} -eq 5 ]]; then
108
+ echo "Config ${train_mode} ${config_option}"
109
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
110
+ task_load_clap_emb=false
111
+ task_load_source_file=true
112
+ task_load_mel_file=false
113
+ model_proj_type=null
114
+ model_clone_batch=1
115
+ dataset_batch_size=384
116
+ model_dispersive_loss=1000.0
117
+ model_dispersive_loss_layer=0
118
+ checkpoint_keep_interval_updates=1
119
+ elif [[ $train_mode == "disp" && ${config_option} -eq 6 ]]; then
120
+ echo "Config ${train_mode} ${config_option}"
121
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
122
+ task_load_clap_emb=false
123
+ task_load_source_file=true
124
+ task_load_mel_file=false
125
+ model_proj_type=null
126
+ model_clone_batch=4
127
+ dataset_batch_size=96
128
+ model_dispersive_loss=1000.0
129
+ model_dispersive_loss_layer=10
130
+ checkpoint_keep_interval_updates=1
131
+ elif [[ $train_mode == "clap" && ${config_option} -eq 0 ]]; then
132
+ echo "Config ${train_mode} ${config_option}"
133
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
134
+ task_load_clap_emb=true
135
+ model_proj_type=2
136
+ model_clone_batch=4
137
+ dataset_batch_size=48
138
+ model_clap_loss=1.0
139
+ average_top_k_layers=12
140
+ model_add_conv=false
141
+ elif [[ $train_mode == "clap" && ${config_option} -eq 1 ]]; then
142
+ echo "Config ${train_mode} ${config_option}"
143
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
144
+ task_load_clap_emb=true
145
+ model_proj_type=2
146
+ model_clone_batch=4
147
+ dataset_batch_size=48
148
+ model_clap_loss=1.0
149
+ average_top_k_layers=1
150
+ # loss type ablation (clap 2-4: model_clap_loss_type ce / l1 / cosine)
151
+ elif [[ $train_mode == "clap" && ${config_option} -eq 2 ]]; then
152
+ echo "Config ${train_mode} ${config_option}"
153
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
154
+ task_load_clap_emb=true
155
+ model_proj_type=2
156
+ model_clone_batch=4
157
+ dataset_batch_size=48
158
+ model_clap_loss=1.0
159
+ average_top_k_layers=12
160
+ model_clap_loss_type="ce"
161
+ elif [[ $train_mode == "clap" && ${config_option} -eq 3 ]]; then
162
+ echo "Config ${train_mode} ${config_option}"
163
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
164
+ task_load_clap_emb=true
165
+ model_proj_type=2
166
+ model_clone_batch=4
167
+ dataset_batch_size=48
168
+ model_clap_loss=1.0
169
+ average_top_k_layers=12
170
+ model_clap_loss_type="l1"
171
+ elif [[ $train_mode == "clap" && ${config_option} -eq 4 ]]; then
172
+ echo "Config ${train_mode} ${config_option}"
173
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
174
+ task_load_clap_emb=true
175
+ model_proj_type=2
176
+ model_clone_batch=4
177
+ dataset_batch_size=96
178
+ model_clap_loss=1.0
179
+ average_top_k_layers=12
180
+ model_clap_loss_type="cosine"
181
+ # loss layer ablation (clap 5-7: model_clap_loss_layer 10 / 8 / 6)
182
+ elif [[ $train_mode == "clap" && ${config_option} -eq 5 ]]; then
183
+ echo "Config ${train_mode} ${config_option}"
184
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
185
+ task_load_clap_emb=true
186
+ model_proj_type=2
187
+ model_clone_batch=4
188
+ dataset_batch_size=96
189
+ model_clap_loss=1.0
190
+ average_top_k_layers=12
191
+ model_clap_loss_type="mse"
192
+ model_clap_loss_layer=10
193
+ elif [[ $train_mode == "clap" && ${config_option} -eq 6 ]]; then
194
+ echo "Config ${train_mode} ${config_option}"
195
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
196
+ task_load_clap_emb=true
197
+ task_load_source_file=true
198
+ task_load_mel_file=false
199
+ model_proj_type=2
200
+ model_clone_batch=4
201
+ dataset_batch_size=96
202
+ model_clap_loss=1.0
203
+ average_top_k_layers=12
204
+ model_clap_loss_type="mse"
205
+ model_clap_loss_layer=8
206
+ elif [[ $train_mode == "clap" && ${config_option} -eq 7 ]]; then
207
+ echo "Config ${train_mode} ${config_option}"
208
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
209
+ task_load_clap_emb=true
210
+ task_load_source_file=true
211
+ task_load_mel_file=false
212
+ model_proj_type=2
213
+ model_clone_batch=4
214
+ dataset_batch_size=96
215
+ model_clap_loss=1.0
216
+ average_top_k_layers=12
217
+ model_clap_loss_type="mse"
218
+ model_clap_loss_layer=6
219
+ elif [[ $train_mode == "clap" && ${config_option} -eq 8 ]]; then
220
+ echo "Config ${train_mode} ${config_option}"
221
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
222
+ task_load_clap_emb=true
223
+ task_load_source_file=true
224
+ task_load_mel_file=false
225
+ model_proj_type=2
226
+ model_clone_batch=4
227
+ model_clap_loss=5.0
228
+ dataset_batch_size=96
229
+ average_top_k_layers=12
230
+ model_clap_loss_type="mse"
231
+ checkpoint_keep_interval_updates=-1
232
+ elif [[ $train_mode == "clap" && ${config_option} -eq 9 ]]; then
233
+ echo "Config ${train_mode} ${config_option}"
234
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
235
+ task_load_clap_emb=true
236
+ task_load_source_file=true
237
+ task_load_mel_file=false
238
+ model_proj_type=2
239
+ model_clone_batch=4
240
+ model_clap_loss=0.1
241
+ dataset_batch_size=96
242
+ average_top_k_layers=12
243
+ model_clap_loss_type="mse"
244
+ checkpoint_keep_interval_updates=-1
245
+ elif [[ $train_mode == "ast" && ${config_option} -eq 0 ]]; then
246
+ echo "Config ${train_mode} ${config_option}"
247
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
248
+ task_load_clap_emb=true
249
+ model_proj_type=4
250
+ model_clone_batch=4
251
+ model_clap_loss=1.0
252
+ dataset_batch_size=48
253
+ elif [[ $train_mode == "ast" && ${config_option} -eq 1 ]]; then
254
+ echo "Config ${train_mode} ${config_option}"
255
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
256
+ task_load_clap_emb=true
257
+ model_proj_type=4
258
+ model_clone_batch=4
259
+ model_clap_loss=0.001
260
+ dataset_batch_size=48
261
+ elif [[ $train_mode == "ast" && ${config_option} -eq 2 ]]; then
262
+ echo "Config ${train_mode} ${config_option}"
263
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
264
+ task_load_clap_emb=true
265
+ model_proj_type=4
266
+ model_clone_batch=4
267
+ model_clap_loss=0.01
268
+ dataset_batch_size=48
269
+ elif [[ $train_mode == "ast" && ${config_option} -eq 3 ]]; then
270
+ echo "Config ${train_mode} ${config_option}"
271
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_out
272
+ task_load_clap_emb=true
273
+ model_proj_type=6
274
+ model_clone_batch=4
275
+ dataset_batch_size=48
276
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 0 ]]; then
277
+ echo "Config ${train_mode} ${config_option}"
278
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
279
+ task_load_clap_emb=true
280
+ task_load_source_file=true
281
+ task_load_mel_file=false
282
+ model_proj_type=2
283
+ model_clone_batch=4
284
+ dataset_batch_size=64 # original 48 oom on 4090 24G change distributed_world_size
285
+ model_clap_loss=1.0
286
+ average_top_k_layers=11 # change together with model_depth
287
+ model_add_conv=true
288
+ model_depth=11 #
289
+ checkpoint_keep_interval_updates=-1 # default 1
290
+ checkpoint_save_interval_updates=10000
291
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 1 ]]; then # branch selected by the train_mode/config_option values at the top of this script
292
+ echo "Config ${train_mode} ${config_option}"
293
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
294
+ task_load_clap_emb=true
295
+ task_load_source_file=true
296
+ task_load_mel_file=false
297
+ model_proj_type=2
298
+ model_clone_batch=4
299
+ dataset_batch_size=48 # original 48 oom on 4090 24G change distributed_world_size
300
+ model_clap_loss=1.0
301
+ average_top_k_layers=12 # change together with model_depth
302
+ model_add_conv=true
303
+ model_modalities_image_conv_option=1
304
+ model_depth=12 #
305
+ checkpoint_keep_interval_updates=1 # default 1
306
+ checkpoint_save_interval_updates=10000
307
+ fi
308
+ # Launch training, passing the values chosen above as Hydra command-line overrides; distributed_world_size comes from the first script argument (default 2). NOTE(review): the expansions are unquoted and some are empty for branches that never assign them; confirm how the config parser treats a `key=` override with no value.
309
+ python fairseq_cli/hydra_train.py -m \
310
+ --config-dir ./EAT/config \
311
+ --config-name pretraining_AS2M \
312
+ common.user_dir=./EAT \
313
+ checkpoint.save_dir=${checkpoint_save_dir} \
314
+ checkpoint.restore_file=${checkpoint_restore_file} \
315
+ distributed_training.distributed_world_size=${1:-2} \
316
+ dataset.num_workers=24 \
317
+ dataset.data_buffer_size=48 \
318
+ dataset.batch_size=${dataset_batch_size} \
319
+ task.data=${task_data} \
320
+ task.h5_format=False \
321
+ task.load_clap_emb=${task_load_clap_emb} \
322
+ +task.load_source_file=${task_load_source_file} \
323
+ +task.load_mel_file=${task_load_mel_file} \
324
+ model.proj_type=${model_proj_type} \
325
+ model.clone_batch=${model_clone_batch} \
326
+ model.clap_loss=${model_clap_loss} \
327
+ model.average_top_k_layers=${average_top_k_layers} \
328
+ +model.add_conv=${model_add_conv} \
329
+ +model.clap_loss_type=${model_clap_loss_type} \
330
+ +model.clap_loss_layer=${model_clap_loss_layer} \
331
+ +model.dispersive_loss=${model_dispersive_loss} \
332
+ +model.dispersive_loss_layer=${model_dispersive_loss_layer} \
333
+ model.depth=${model_depth} \
334
+ +model.modalities.image.conv_option=${model_modalities_image_conv_option} \
335
+ checkpoint.keep_interval_updates=${checkpoint_keep_interval_updates} \
336
+ checkpoint.save_interval_updates=${checkpoint_save_interval_updates}
pre_4_AS2M/conv_clap_1_2025-09-30_07-19-43/pretraining_AS2M.sh ADDED
@@ -0,0 +1,336 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ # config options
3
+ train_mode=conv_clap
4
+ config_option=1
5
+ # change world size
6
+
7
+ # shared config
8
+ SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
9
+ checkpoint_save_dir=${SAVE_DIR_ROOT}/${train_mode}_${config_option}_$(date +"%Y-%m-%d_%H-%M-%S")
10
+ checkpoint_restore_file=${checkpoint_save_dir}/checkpoint_last.pt
11
+
12
+ # 脚本自身的绝对路径与文件名(解析符号链接)
13
+ script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
14
+ script_name="$(basename -- "$script_path")"
15
+ # 创建目录并拷贝(保留权限与时间戳)
16
+ mkdir -p -- "$checkpoint_save_dir"
17
+ cp -p -- "$script_path" "$checkpoint_save_dir/$script_name"
18
+ echo "script_path: ${script_path}"
19
+ echo "checkpoint_save_dir: ${checkpoint_save_dir}"
20
+
21
+ # default setting
22
+ model_clone_batch=4
23
+ dataset_batch_size=48
24
+ model_clap_loss=0
25
+ model_clap_loss_type="mse" # option ce cosine l1
26
+ model_clap_loss_layer=0
27
+ average_top_k_layers=12
28
+ model_add_conv=false
29
+ model_depth=12
30
+ model_dispersive_loss=0
31
+ model_dispersive_loss_layer=0
32
+ checkpoint_keep_interval_updates=1 # TODO change this parameter if need
33
+ checkpoint_save_interval_updates=10000
34
+ model_modalities_image_conv_option=0
35
+
36
+ if [[ $train_mode == "default" && ${config_option} -eq 0 ]]; then
37
+ echo "Config ${train_mode} ${config_option}"
38
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
39
+ task_load_clap_emb=false
40
+ task_load_source_file=true
41
+ task_load_mel_file=false
42
+ model_proj_type=null
43
+ model_clone_batch=4
44
+ dataset_batch_size=96
45
+ model_clap_loss=0
46
+ checkpoint_keep_interval_updates=-1
47
+ elif [[ $train_mode == "disp" && ${config_option} -eq 0 ]]; then
48
+ echo "Config ${train_mode} ${config_option}"
49
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
50
+ task_load_clap_emb=false
51
+ task_load_source_file=true
52
+ task_load_mel_file=false
53
+ model_proj_type=null
54
+ model_clone_batch=4
55
+ dataset_batch_size=96
56
+ model_dispersive_loss=1
57
+ model_dispersive_loss_layer=0
58
+ checkpoint_keep_interval_updates=1
59
+ elif [[ $train_mode == "disp" && ${config_option} -eq 1 ]]; then
60
+ echo "Config ${train_mode} ${config_option}"
61
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
62
+ task_load_clap_emb=false
63
+ task_load_source_file=true
64
+ task_load_mel_file=false
65
+ model_proj_type=null
66
+ model_clone_batch=1
67
+ dataset_batch_size=384
68
+ model_dispersive_loss=1
69
+ model_dispersive_loss_layer=0
70
+ checkpoint_keep_interval_updates=1
71
+ elif [[ $train_mode == "disp" && ${config_option} -eq 2 ]]; then
72
+ echo "Config ${train_mode} ${config_option}"
73
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
74
+ task_load_clap_emb=false
75
+ task_load_source_file=true
76
+ task_load_mel_file=false
77
+ model_proj_type=null
78
+ model_clone_batch=1
79
+ dataset_batch_size=384
80
+ model_dispersive_loss=10.0
81
+ model_dispersive_loss_layer=0
82
+ checkpoint_keep_interval_updates=1
83
+ elif [[ $train_mode == "disp" && ${config_option} -eq 3 ]]; then
84
+ echo "Config ${train_mode} ${config_option}"
85
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
86
+ task_load_clap_emb=false
87
+ task_load_source_file=true
88
+ task_load_mel_file=false
89
+ model_proj_type=null
90
+ model_clone_batch=1
91
+ dataset_batch_size=384
92
+ model_dispersive_loss=100.0
93
+ model_dispersive_loss_layer=0
94
+ checkpoint_keep_interval_updates=1
95
+ elif [[ $train_mode == "disp" && ${config_option} -eq 4 ]]; then
96
+ echo "Config ${train_mode} ${config_option}"
97
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
98
+ task_load_clap_emb=false
99
+ task_load_source_file=true
100
+ task_load_mel_file=false
101
+ model_proj_type=null
102
+ model_clone_batch=1
103
+ dataset_batch_size=384
104
+ model_dispersive_loss=10000.0
105
+ model_dispersive_loss_layer=0
106
+ checkpoint_keep_interval_updates=1
107
+ elif [[ $train_mode == "disp" && ${config_option} -eq 5 ]]; then
108
+ echo "Config ${train_mode} ${config_option}"
109
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
110
+ task_load_clap_emb=false
111
+ task_load_source_file=true
112
+ task_load_mel_file=false
113
+ model_proj_type=null
114
+ model_clone_batch=1
115
+ dataset_batch_size=384
116
+ model_dispersive_loss=1000.0
117
+ model_dispersive_loss_layer=0
118
+ checkpoint_keep_interval_updates=1
119
+ elif [[ $train_mode == "disp" && ${config_option} -eq 6 ]]; then
120
+ echo "Config ${train_mode} ${config_option}"
121
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
122
+ task_load_clap_emb=false
123
+ task_load_source_file=true
124
+ task_load_mel_file=false
125
+ model_proj_type=null
126
+ model_clone_batch=4
127
+ dataset_batch_size=96
128
+ model_dispersive_loss=1000.0
129
+ model_dispersive_loss_layer=10
130
+ checkpoint_keep_interval_updates=1
131
+ elif [[ $train_mode == "clap" && ${config_option} -eq 0 ]]; then
132
+ echo "Config ${train_mode} ${config_option}"
133
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
134
+ task_load_clap_emb=true
135
+ model_proj_type=2
136
+ model_clone_batch=4
137
+ dataset_batch_size=48
138
+ model_clap_loss=1.0
139
+ average_top_k_layers=12
140
+ model_add_conv=false
141
+ elif [[ $train_mode == "clap" && ${config_option} -eq 1 ]]; then
142
+ echo "Config ${train_mode} ${config_option}"
143
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
144
+ task_load_clap_emb=true
145
+ model_proj_type=2
146
+ model_clone_batch=4
147
+ dataset_batch_size=48
148
+ model_clap_loss=1.0
149
+ average_top_k_layers=1
150
+ # loss type ablation
151
+ elif [[ $train_mode == "clap" && ${config_option} -eq 2 ]]; then
152
+ echo "Config ${train_mode} ${config_option}"
153
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
154
+ task_load_clap_emb=true
155
+ model_proj_type=2
156
+ model_clone_batch=4
157
+ dataset_batch_size=48
158
+ model_clap_loss=1.0
159
+ average_top_k_layers=12
160
+ model_clap_loss_type="ce"
161
+ elif [[ $train_mode == "clap" && ${config_option} -eq 3 ]]; then
162
+ echo "Config ${train_mode} ${config_option}"
163
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
164
+ task_load_clap_emb=true
165
+ model_proj_type=2
166
+ model_clone_batch=4
167
+ dataset_batch_size=48
168
+ model_clap_loss=1.0
169
+ average_top_k_layers=12
170
+ model_clap_loss_type="l1"
171
+ elif [[ $train_mode == "clap" && ${config_option} -eq 4 ]]; then
172
+ echo "Config ${train_mode} ${config_option}"
173
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
174
+ task_load_clap_emb=true
175
+ model_proj_type=2
176
+ model_clone_batch=4
177
+ dataset_batch_size=96
178
+ model_clap_loss=1.0
179
+ average_top_k_layers=12
180
+ model_clap_loss_type="cosine"
181
+ # loss layer ablation
182
+ elif [[ $train_mode == "clap" && ${config_option} -eq 5 ]]; then
183
+ echo "Config ${train_mode} ${config_option}"
184
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
185
+ task_load_clap_emb=true
186
+ model_proj_type=2
187
+ model_clone_batch=4
188
+ dataset_batch_size=96
189
+ model_clap_loss=1.0
190
+ average_top_k_layers=12
191
+ model_clap_loss_type="mse"
192
+ model_clap_loss_layer=10
193
+ elif [[ $train_mode == "clap" && ${config_option} -eq 6 ]]; then
194
+ echo "Config ${train_mode} ${config_option}"
195
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
196
+ task_load_clap_emb=true
197
+ task_load_source_file=true
198
+ task_load_mel_file=false
199
+ model_proj_type=2
200
+ model_clone_batch=4
201
+ dataset_batch_size=96
202
+ model_clap_loss=1.0
203
+ average_top_k_layers=12
204
+ model_clap_loss_type="mse"
205
+ model_clap_loss_layer=8
206
+ elif [[ $train_mode == "clap" && ${config_option} -eq 7 ]]; then
207
+ echo "Config ${train_mode} ${config_option}"
208
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
209
+ task_load_clap_emb=true
210
+ task_load_source_file=true
211
+ task_load_mel_file=false
212
+ model_proj_type=2
213
+ model_clone_batch=4
214
+ dataset_batch_size=96
215
+ model_clap_loss=1.0
216
+ average_top_k_layers=12
217
+ model_clap_loss_type="mse"
218
+ model_clap_loss_layer=6
219
+ elif [[ $train_mode == "clap" && ${config_option} -eq 8 ]]; then
220
+ echo "Config ${train_mode} ${config_option}"
221
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
222
+ task_load_clap_emb=true
223
+ task_load_source_file=true
224
+ task_load_mel_file=false
225
+ model_proj_type=2
226
+ model_clone_batch=4
227
+ model_clap_loss=5.0
228
+ dataset_batch_size=96
229
+ average_top_k_layers=12
230
+ model_clap_loss_type="mse"
231
+ checkpoint_keep_interval_updates=-1
232
+ elif [[ $train_mode == "clap" && ${config_option} -eq 9 ]]; then
233
+ echo "Config ${train_mode} ${config_option}"
234
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
235
+ task_load_clap_emb=true
236
+ task_load_source_file=true
237
+ task_load_mel_file=false
238
+ model_proj_type=2
239
+ model_clone_batch=4
240
+ model_clap_loss=0.1
241
+ dataset_batch_size=96
242
+ average_top_k_layers=12
243
+ model_clap_loss_type="mse"
244
+ checkpoint_keep_interval_updates=-1
245
+ elif [[ $train_mode == "ast" && ${config_option} -eq 0 ]]; then
246
+ echo "Config ${train_mode} ${config_option}"
247
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
248
+ task_load_clap_emb=true
249
+ model_proj_type=4
250
+ model_clone_batch=4
251
+ model_clap_loss=1.0
252
+ dataset_batch_size=48
253
+ elif [[ $train_mode == "ast" && ${config_option} -eq 1 ]]; then
254
+ echo "Config ${train_mode} ${config_option}"
255
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
256
+ task_load_clap_emb=true
257
+ model_proj_type=4
258
+ model_clone_batch=4
259
+ model_clap_loss=0.001
260
+ dataset_batch_size=48
261
+ elif [[ $train_mode == "ast" && ${config_option} -eq 2 ]]; then
262
+ echo "Config ${train_mode} ${config_option}"
263
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
264
+ task_load_clap_emb=true
265
+ model_proj_type=4
266
+ model_clone_batch=4
267
+ model_clap_loss=0.01
268
+ dataset_batch_size=48
269
+ elif [[ $train_mode == "ast" && ${config_option} -eq 3 ]]; then
270
+ echo "Config ${train_mode} ${config_option}"
271
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_out
272
+ task_load_clap_emb=true
273
+ model_proj_type=6
274
+ model_clone_batch=4
275
+ dataset_batch_size=48
276
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 0 ]]; then
277
+ echo "Config ${train_mode} ${config_option}"
278
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
279
+ task_load_clap_emb=true
280
+ task_load_source_file=true
281
+ task_load_mel_file=false
282
+ model_proj_type=2
283
+ model_clone_batch=4
284
+ dataset_batch_size=64 # original 48 oom on 4090 24G change distributed_world_size
285
+ model_clap_loss=1.0
286
+ average_top_k_layers=11 # modify with model depth
287
+ model_add_conv=true
288
+ model_depth=11 #
289
+ checkpoint_keep_interval_updates=-1 # default 1
290
+ checkpoint_save_interval_updates=10000
291
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 1 ]]; then
292
+ echo "Config ${train_mode} ${config_option}"
293
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
294
+ task_load_clap_emb=true
295
+ task_load_source_file=true
296
+ task_load_mel_file=false
297
+ model_proj_type=2
298
+ model_clone_batch=4
299
+ dataset_batch_size=48 # original 48 oom on 4090 24G change distributed_world_size
300
+ model_clap_loss=1.0
301
+ average_top_k_layers=12 # modify with model depth
302
+ model_add_conv=true
303
+ model_modalities_image_conv_option=1
304
+ model_depth=12 #
305
+ checkpoint_keep_interval_updates=1 # default 1
306
+ checkpoint_save_interval_updates=10000
307
+ fi
308
+
309
+ python fairseq_cli/hydra_train.py -m \
310
+ --config-dir ./EAT/config \
311
+ --config-name pretraining_AS2M \
312
+ common.user_dir=./EAT \
313
+ checkpoint.save_dir=${checkpoint_save_dir} \
314
+ checkpoint.restore_file=${checkpoint_restore_file} \
315
+ distributed_training.distributed_world_size=${1:-2} \
316
+ dataset.num_workers=24 \
317
+ dataset.data_buffer_size=48 \
318
+ dataset.batch_size=${dataset_batch_size} \
319
+ task.data=${task_data} \
320
+ task.h5_format=False \
321
+ task.load_clap_emb=${task_load_clap_emb} \
322
+ +task.load_source_file=${task_load_source_file} \
323
+ +task.load_mel_file=${task_load_mel_file} \
324
+ model.proj_type=${model_proj_type} \
325
+ model.clone_batch=${model_clone_batch} \
326
+ model.clap_loss=${model_clap_loss} \
327
+ model.average_top_k_layers=${average_top_k_layers} \
328
+ +model.add_conv=${model_add_conv} \
329
+ +model.clap_loss_type=${model_clap_loss_type} \
330
+ +model.clap_loss_layer=${model_clap_loss_layer} \
331
+ +model.dispersive_loss=${model_dispersive_loss} \
332
+ +model.dispersive_loss_layer=${model_dispersive_loss_layer} \
333
+ model.depth=${model_depth} \
334
+ +model.modalities.image.conv_option=${model_modalities_image_conv_option} \
335
+ checkpoint.keep_interval_updates=${checkpoint_keep_interval_updates} \
336
+ checkpoint.save_interval_updates=${checkpoint_save_interval_updates}
pre_4_AS2M/conv_clap_1_2025-09-30_07-25-52/pretraining_AS2M.sh ADDED
@@ -0,0 +1,336 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/usr/bin/env bash
#
# Launch EAT pre-training (Hydra config "pretraining_AS2M") through
# fairseq_cli/hydra_train.py.
#
# Usage:
#   pretraining_AS2M.sh [WORLD_SIZE]
#
#   WORLD_SIZE  value passed as distributed_training.distributed_world_size
#               (default: 2)
#
# Select the experiment by editing train_mode / config_option below. Each
# launch writes to a new time-stamped directory under SAVE_DIR_ROOT and stores
# a copy of this script there.
#
# fairseq_cli/ and EAT/ are referenced by relative path, so start the script
# from the directory that contains them.

# Abort on the first failing command, on unset variables and on pipeline errors
# so that training is never launched from a half-prepared state.
set -euo pipefail

# Print an error message to stderr and exit with status 1.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

# config options
train_mode=conv_clap
config_option=1
# World size comes from the first positional argument.
world_size=${1:-2}

# shared config
readonly SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
readonly DATA_ROOT=/opt/gpfs/home/chushu/data/audioset/setting
# NOTE(review): the directory name embeds the launch time, so restore_file
# points into a fresh directory on every launch -- confirm that resuming an
# earlier run is not expected from this script.
checkpoint_save_dir=${SAVE_DIR_ROOT}/${train_mode}_${config_option}_$(date +"%Y-%m-%d_%H-%M-%S")
checkpoint_restore_file=${checkpoint_save_dir}/checkpoint_last.pt

# default setting
model_clone_batch=4
dataset_batch_size=48
model_clap_loss=0
model_clap_loss_type="mse" # option ce cosine l1
model_clap_loss_layer=0
average_top_k_layers=12
model_add_conv=false
model_depth=12
model_dispersive_loss=0
model_dispersive_loss_layer=0
checkpoint_keep_interval_updates=1 # TODO change this parameter if need
checkpoint_save_interval_updates=10000
model_modalities_image_conv_option=0
# The two flags below stay empty unless a config sets them; the matching
# overrides are then passed with an empty value.
# NOTE(review): the clap 0-5 and ast configs never set them -- confirm that an
# empty override is what the task expects.
task_load_source_file=""
task_load_mel_file=""

# Preset: PRETRAIN_AS2M data, CLAP embeddings off, source files on, no projection.
use_plain_as2m() {
  task_data=${DATA_ROOT}/PRETRAIN_AS2M
  task_load_clap_emb=false
  task_load_source_file=true
  task_load_mel_file=false
  model_proj_type=null
}

# Preset: PRETRAIN_AS2M_w_CLAP data, CLAP embeddings on, projection type 2.
use_clap_as2m() {
  task_data=${DATA_ROOT}/PRETRAIN_AS2M_w_CLAP
  task_load_clap_emb=true
  model_proj_type=2
}

# Preset: PRETRAIN_AS2M_w_AST/<$1> data, embeddings on, projection type <$2>.
use_ast_as2m() {
  task_data=${DATA_ROOT}/PRETRAIN_AS2M_w_AST/$1
  task_load_clap_emb=true
  model_proj_type=$2
}

# Sets load_source_file=true and load_mel_file=false.
use_source_files() {
  task_load_source_file=true
  task_load_mel_file=false
}

# Each arm lists only what differs from the defaults above.
# config_option is matched literally (write 1, not 01).
case "${train_mode}:${config_option}" in
  default:0)
    use_plain_as2m
    dataset_batch_size=96
    checkpoint_keep_interval_updates=-1
    ;;

  disp:0)
    use_plain_as2m
    dataset_batch_size=96
    model_dispersive_loss=1
    ;;
  disp:1)
    use_plain_as2m
    model_clone_batch=1
    dataset_batch_size=384
    model_dispersive_loss=1
    ;;
  disp:2)
    use_plain_as2m
    model_clone_batch=1
    dataset_batch_size=384
    model_dispersive_loss=10.0
    ;;
  disp:3)
    use_plain_as2m
    model_clone_batch=1
    dataset_batch_size=384
    model_dispersive_loss=100.0
    ;;
  disp:4)
    use_plain_as2m
    model_clone_batch=1
    dataset_batch_size=384
    model_dispersive_loss=10000.0
    ;;
  disp:5)
    use_plain_as2m
    model_clone_batch=1
    dataset_batch_size=384
    model_dispersive_loss=1000.0
    ;;
  disp:6)
    use_plain_as2m
    dataset_batch_size=96
    model_dispersive_loss=1000.0
    model_dispersive_loss_layer=10
    ;;

  clap:0)
    use_clap_as2m
    model_clap_loss=1.0
    ;;
  clap:1)
    use_clap_as2m
    model_clap_loss=1.0
    average_top_k_layers=1
    ;;
  # loss type ablation
  clap:2)
    use_clap_as2m
    model_clap_loss=1.0
    model_clap_loss_type="ce"
    ;;
  clap:3)
    use_clap_as2m
    model_clap_loss=1.0
    model_clap_loss_type="l1"
    ;;
  clap:4)
    use_clap_as2m
    dataset_batch_size=96
    model_clap_loss=1.0
    model_clap_loss_type="cosine"
    ;;
  # loss layer ablation
  clap:5)
    use_clap_as2m
    dataset_batch_size=96
    model_clap_loss=1.0
    model_clap_loss_layer=10
    ;;
  clap:6)
    use_clap_as2m
    use_source_files
    dataset_batch_size=96
    model_clap_loss=1.0
    model_clap_loss_layer=8
    ;;
  clap:7)
    use_clap_as2m
    use_source_files
    dataset_batch_size=96
    model_clap_loss=1.0
    model_clap_loss_layer=6
    ;;
  clap:8)
    use_clap_as2m
    use_source_files
    dataset_batch_size=96
    model_clap_loss=5.0
    checkpoint_keep_interval_updates=-1
    ;;
  clap:9)
    use_clap_as2m
    use_source_files
    dataset_batch_size=96
    model_clap_loss=0.1
    checkpoint_keep_interval_updates=-1
    ;;

  ast:0)
    use_ast_as2m mlp_head_in 4
    model_clap_loss=1.0
    ;;
  ast:1)
    use_ast_as2m mlp_head_in 4
    model_clap_loss=0.001
    ;;
  ast:2)
    use_ast_as2m mlp_head_in 4
    model_clap_loss=0.01
    ;;
  ast:3)
    use_ast_as2m mlp_head_out 6
    ;;

  conv_clap:0)
    use_clap_as2m
    use_source_files
    dataset_batch_size=64 # original 48 oom on 4090 24G change distributed_world_size
    model_clap_loss=1.0
    average_top_k_layers=11 # modify with model depth
    model_add_conv=true
    model_depth=11
    checkpoint_keep_interval_updates=-1 # default 1
    ;;
  conv_clap:1)
    use_clap_as2m
    use_source_files
    dataset_batch_size=48 # original 48 oom on 4090 24G change distributed_world_size
    model_clap_loss=1.0
    average_top_k_layers=12 # modify with model depth
    model_add_conv=true
    model_modalities_image_conv_option=1
    model_depth=12
    ;;

  *)
    # Refuse to launch with task.data / model.proj_type left unset.
    die "unknown configuration: train_mode='${train_mode}' config_option='${config_option}'"
    ;;
esac
echo "Config ${train_mode} ${config_option}"

# Absolute path and file name of this script (symlinks resolved).
script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
script_name="$(basename -- "$script_path")"
# Create the run directory and copy the script into it
# (cp -p preserves permissions and timestamps).
mkdir -p -- "$checkpoint_save_dir"
cp -p -- "$script_path" "$checkpoint_save_dir/$script_name"
echo "script_path: ${script_path}"
echo "checkpoint_save_dir: ${checkpoint_save_dir}"

# Overrides are collected in an array so every value reaches Hydra as exactly
# one argument, whatever characters it contains.
train_args=(
  -m
  --config-dir ./EAT/config
  --config-name pretraining_AS2M
  "common.user_dir=./EAT"
  "checkpoint.save_dir=${checkpoint_save_dir}"
  "checkpoint.restore_file=${checkpoint_restore_file}"
  "distributed_training.distributed_world_size=${world_size}"
  "dataset.num_workers=24"
  "dataset.data_buffer_size=48"
  "dataset.batch_size=${dataset_batch_size}"
  "task.data=${task_data}"
  "task.h5_format=False"
  "task.load_clap_emb=${task_load_clap_emb}"
  "+task.load_source_file=${task_load_source_file}"
  "+task.load_mel_file=${task_load_mel_file}"
  "model.proj_type=${model_proj_type}"
  "model.clone_batch=${model_clone_batch}"
  "model.clap_loss=${model_clap_loss}"
  "model.average_top_k_layers=${average_top_k_layers}"
  "+model.add_conv=${model_add_conv}"
  "+model.clap_loss_type=${model_clap_loss_type}"
  "+model.clap_loss_layer=${model_clap_loss_layer}"
  "+model.dispersive_loss=${model_dispersive_loss}"
  "+model.dispersive_loss_layer=${model_dispersive_loss_layer}"
  "model.depth=${model_depth}"
  "+model.modalities.image.conv_option=${model_modalities_image_conv_option}"
  "checkpoint.keep_interval_updates=${checkpoint_keep_interval_updates}"
  "checkpoint.save_interval_updates=${checkpoint_save_interval_updates}"
)

python fairseq_cli/hydra_train.py "${train_args[@]}"
pre_4_AS2M/conv_clap_1_2025-09-30_08-31-42/pretraining_AS2M.sh ADDED
@@ -0,0 +1,418 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ # config options
3
+ train_mode=conv_clap
4
+ config_option=1
5
+ # change world size
6
+
7
+ # shared config
8
+ SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
9
+ checkpoint_save_dir=${SAVE_DIR_ROOT}/${train_mode}_${config_option}_$(date +"%Y-%m-%d_%H-%M-%S")
10
+ checkpoint_restore_file=${checkpoint_save_dir}/checkpoint_last.pt
11
+
12
+ # Absolute path and file name of this script (symlinks resolved)
13
+ script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
14
+ script_name="$(basename -- "$script_path")"
15
+ # Create the directory and copy the script into it (preserving permissions and timestamps)
16
+ mkdir -p -- "$checkpoint_save_dir"
17
+ cp -p -- "$script_path" "$checkpoint_save_dir/$script_name"
18
+ echo "script_path: ${script_path}"
19
+ echo "checkpoint_save_dir: ${checkpoint_save_dir}"
20
+
21
+ # default setting
22
+ model_clone_batch=4
23
+ dataset_batch_size=48
24
+ model_clap_loss=0
25
+ model_clap_loss_type="mse" # option ce cosine l1
26
+ model_clap_loss_layer=0
27
+ average_top_k_layers=12
28
+ model_add_conv=false
29
+ model_depth=12
30
+ model_dispersive_loss=0
31
+ model_dispersive_loss_layer=0
32
+ checkpoint_keep_interval_updates=1 # TODO change this parameter if need
33
+ checkpoint_save_interval_updates=10000
34
+ model_modalities_image_conv_option=0
35
+
36
+ if [[ $train_mode == "default" && ${config_option} -eq 0 ]]; then
37
+ echo "Config ${train_mode} ${config_option}"
38
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
39
+ task_load_clap_emb=false
40
+ task_load_source_file=true
41
+ task_load_mel_file=false
42
+ model_proj_type=null
43
+ model_clone_batch=4
44
+ dataset_batch_size=96
45
+ model_clap_loss=0
46
+ checkpoint_keep_interval_updates=-1
47
+ elif [[ $train_mode == "disp" && ${config_option} -eq 0 ]]; then
48
+ echo "Config ${train_mode} ${config_option}"
49
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
50
+ task_load_clap_emb=false
51
+ task_load_source_file=true
52
+ task_load_mel_file=false
53
+ model_proj_type=null
54
+ model_clone_batch=4
55
+ dataset_batch_size=96
56
+ model_dispersive_loss=1
57
+ model_dispersive_loss_layer=0
58
+ checkpoint_keep_interval_updates=1
59
+ elif [[ $train_mode == "disp" && ${config_option} -eq 1 ]]; then
60
+ echo "Config ${train_mode} ${config_option}"
61
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
62
+ task_load_clap_emb=false
63
+ task_load_source_file=true
64
+ task_load_mel_file=false
65
+ model_proj_type=null
66
+ model_clone_batch=1
67
+ dataset_batch_size=384
68
+ model_dispersive_loss=1
69
+ model_dispersive_loss_layer=0
70
+ checkpoint_keep_interval_updates=1
71
+ elif [[ $train_mode == "disp" && ${config_option} -eq 2 ]]; then
72
+ echo "Config ${train_mode} ${config_option}"
73
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
74
+ task_load_clap_emb=false
75
+ task_load_source_file=true
76
+ task_load_mel_file=false
77
+ model_proj_type=null
78
+ model_clone_batch=1
79
+ dataset_batch_size=384
80
+ model_dispersive_loss=10.0
81
+ model_dispersive_loss_layer=0
82
+ checkpoint_keep_interval_updates=1
83
+ elif [[ $train_mode == "disp" && ${config_option} -eq 3 ]]; then
84
+ echo "Config ${train_mode} ${config_option}"
85
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
86
+ task_load_clap_emb=false
87
+ task_load_source_file=true
88
+ task_load_mel_file=false
89
+ model_proj_type=null
90
+ model_clone_batch=1
91
+ dataset_batch_size=384
92
+ model_dispersive_loss=100.0
93
+ model_dispersive_loss_layer=0
94
+ checkpoint_keep_interval_updates=1
95
+ elif [[ $train_mode == "disp" && ${config_option} -eq 4 ]]; then
96
+ echo "Config ${train_mode} ${config_option}"
97
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
98
+ task_load_clap_emb=false
99
+ task_load_source_file=true
100
+ task_load_mel_file=false
101
+ model_proj_type=null
102
+ model_clone_batch=1
103
+ dataset_batch_size=384
104
+ model_dispersive_loss=10000.0
105
+ model_dispersive_loss_layer=0
106
+ checkpoint_keep_interval_updates=1
107
+ elif [[ $train_mode == "disp" && ${config_option} -eq 5 ]]; then
108
+ echo "Config ${train_mode} ${config_option}"
109
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
110
+ task_load_clap_emb=false
111
+ task_load_source_file=true
112
+ task_load_mel_file=false
113
+ model_proj_type=null
114
+ model_clone_batch=1
115
+ dataset_batch_size=384
116
+ model_dispersive_loss=1000.0
117
+ model_dispersive_loss_layer=0
118
+ checkpoint_keep_interval_updates=1
119
+ elif [[ $train_mode == "disp" && ${config_option} -eq 6 ]]; then
120
+ echo "Config ${train_mode} ${config_option}"
121
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
122
+ task_load_clap_emb=false
123
+ task_load_source_file=true
124
+ task_load_mel_file=false
125
+ model_proj_type=null
126
+ model_clone_batch=4
127
+ dataset_batch_size=96
128
+ model_dispersive_loss=1000.0
129
+ model_dispersive_loss_layer=10
130
+ checkpoint_keep_interval_updates=1
131
+ elif [[ $train_mode == "clap" && ${config_option} -eq 0 ]]; then
132
+ echo "Config ${train_mode} ${config_option}"
133
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
134
+ task_load_clap_emb=true
135
+ model_proj_type=2
136
+ model_clone_batch=4
137
+ dataset_batch_size=48
138
+ model_clap_loss=1.0
139
+ average_top_k_layers=12
140
+ model_add_conv=false
141
+ elif [[ $train_mode == "clap" && ${config_option} -eq 1 ]]; then
142
+ echo "Config ${train_mode} ${config_option}"
143
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
144
+ task_load_clap_emb=true
145
+ model_proj_type=2
146
+ model_clone_batch=4
147
+ dataset_batch_size=48
148
+ model_clap_loss=1.0
149
+ average_top_k_layers=1
150
+ # loss type ablation
151
+ elif [[ $train_mode == "clap" && ${config_option} -eq 2 ]]; then
152
+ echo "Config ${train_mode} ${config_option}"
153
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
154
+ task_load_clap_emb=true
155
+ model_proj_type=2
156
+ model_clone_batch=4
157
+ dataset_batch_size=48
158
+ model_clap_loss=1.0
159
+ average_top_k_layers=12
160
+ model_clap_loss_type="ce"
161
+ elif [[ $train_mode == "clap" && ${config_option} -eq 3 ]]; then
162
+ echo "Config ${train_mode} ${config_option}"
163
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
164
+ task_load_clap_emb=true
165
+ model_proj_type=2
166
+ model_clone_batch=4
167
+ dataset_batch_size=48
168
+ model_clap_loss=1.0
169
+ average_top_k_layers=12
170
+ model_clap_loss_type="l1"
171
+ elif [[ $train_mode == "clap" && ${config_option} -eq 4 ]]; then
172
+ echo "Config ${train_mode} ${config_option}"
173
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
174
+ task_load_clap_emb=true
175
+ model_proj_type=2
176
+ model_clone_batch=4
177
+ dataset_batch_size=96
178
+ model_clap_loss=1.0
179
+ average_top_k_layers=12
180
+ model_clap_loss_type="cosine"
181
+ # loss layer ablation
182
+ elif [[ $train_mode == "clap" && ${config_option} -eq 5 ]]; then
183
+ echo "Config ${train_mode} ${config_option}"
184
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
185
+ task_load_clap_emb=true
186
+ model_proj_type=2
187
+ model_clone_batch=4
188
+ dataset_batch_size=96
189
+ model_clap_loss=1.0
190
+ average_top_k_layers=12
191
+ model_clap_loss_type="mse"
192
+ model_clap_loss_layer=10
193
+ elif [[ $train_mode == "clap" && ${config_option} -eq 6 ]]; then
194
+ echo "Config ${train_mode} ${config_option}"
195
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
196
+ task_load_clap_emb=true
197
+ task_load_source_file=true
198
+ task_load_mel_file=false
199
+ model_proj_type=2
200
+ model_clone_batch=4
201
+ dataset_batch_size=96
202
+ model_clap_loss=1.0
203
+ average_top_k_layers=12
204
+ model_clap_loss_type="mse"
205
+ model_clap_loss_layer=8
206
+ elif [[ $train_mode == "clap" && ${config_option} -eq 7 ]]; then
207
+ echo "Config ${train_mode} ${config_option}"
208
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
209
+ task_load_clap_emb=true
210
+ task_load_source_file=true
211
+ task_load_mel_file=false
212
+ model_proj_type=2
213
+ model_clone_batch=4
214
+ dataset_batch_size=96
215
+ model_clap_loss=1.0
216
+ average_top_k_layers=12
217
+ model_clap_loss_type="mse"
218
+ model_clap_loss_layer=6
219
+ elif [[ $train_mode == "clap" && ${config_option} -eq 8 ]]; then
220
+ echo "Config ${train_mode} ${config_option}"
221
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
222
+ task_load_clap_emb=true
223
+ task_load_source_file=true
224
+ task_load_mel_file=false
225
+ model_proj_type=2
226
+ model_clone_batch=4
227
+ model_clap_loss=5.0
228
+ dataset_batch_size=96
229
+ average_top_k_layers=12
230
+ model_clap_loss_type="mse"
231
+ checkpoint_keep_interval_updates=-1
232
+ elif [[ $train_mode == "clap" && ${config_option} -eq 9 ]]; then
233
+ echo "Config ${train_mode} ${config_option}"
234
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
235
+ task_load_clap_emb=true
236
+ task_load_source_file=true
237
+ task_load_mel_file=false
238
+ model_proj_type=2
239
+ model_clone_batch=4
240
+ model_clap_loss=0.1
241
+ dataset_batch_size=96
242
+ average_top_k_layers=12
243
+ model_clap_loss_type="mse"
244
+ checkpoint_keep_interval_updates=-1
245
+ elif [[ $train_mode == "ast" && ${config_option} -eq 0 ]]; then
246
+ echo "Config ${train_mode} ${config_option}"
247
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
248
+ task_load_clap_emb=true
249
+ model_proj_type=4
250
+ model_clone_batch=4
251
+ model_clap_loss=1.0
252
+ dataset_batch_size=48
253
+ elif [[ $train_mode == "ast" && ${config_option} -eq 1 ]]; then
254
+ echo "Config ${train_mode} ${config_option}"
255
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
256
+ task_load_clap_emb=true
257
+ model_proj_type=4
258
+ model_clone_batch=4
259
+ model_clap_loss=0.001
260
+ dataset_batch_size=48
261
+ elif [[ $train_mode == "ast" && ${config_option} -eq 2 ]]; then
262
+ echo "Config ${train_mode} ${config_option}"
263
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
264
+ task_load_clap_emb=true
265
+ model_proj_type=4
266
+ model_clone_batch=4
267
+ model_clap_loss=0.01
268
+ dataset_batch_size=48
269
+ elif [[ $train_mode == "ast" && ${config_option} -eq 3 ]]; then
270
+ echo "Config ${train_mode} ${config_option}"
271
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_out
272
+ task_load_clap_emb=true
273
+ model_proj_type=6
274
+ model_clone_batch=4
275
+ dataset_batch_size=48
276
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 0 ]]; then
277
+ echo "Config ${train_mode} ${config_option}"
278
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
279
+ task_load_clap_emb=true
280
+ task_load_source_file=true
281
+ task_load_mel_file=false
282
+ model_proj_type=2
283
+ model_clone_batch=4
284
+ dataset_batch_size=64 # original 48 oom on 4090 24G change distributed_world_size
285
+ model_clap_loss=1.0
286
+ average_top_k_layers=11 # modify with model depth
287
+ model_add_conv=true
288
+ model_depth=11 #
289
+ checkpoint_keep_interval_updates=-1 # default 1
290
+ checkpoint_save_interval_updates=10000
291
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 1 ]]; then
292
+ echo "Config ${train_mode} ${config_option}"
293
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
294
+ task_load_clap_emb=true
295
+ task_load_source_file=true
296
+ task_load_mel_file=false
297
+ model_proj_type=2
298
+ model_clone_batch=4
299
+ dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
300
+ model_clap_loss=1.0
301
+ average_top_k_layers=12 # modify with model depth
302
+ model_add_conv=true
303
+ model_modalities_image_conv_option=1
304
+ model_depth=12 #
305
+ checkpoint_keep_interval_updates=1 # default 1
306
+ checkpoint_save_interval_updates=10000
307
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 2 ]]; then
308
+ echo "Config ${train_mode} ${config_option}"
309
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
310
+ task_load_clap_emb=true
311
+ task_load_source_file=true
312
+ task_load_mel_file=false
313
+ model_proj_type=2
314
+ model_clone_batch=4
315
+ dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
316
+ model_clap_loss=1.0
317
+ average_top_k_layers=12 # modify with model depth
318
+ model_add_conv=true
319
+ model_modalities_image_conv_option=2
320
+ model_depth=12 #
321
+ checkpoint_keep_interval_updates=1 # default 1
322
+ checkpoint_save_interval_updates=10000
323
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 3 ]]; then
324
+ echo "Config ${train_mode} ${config_option}"
325
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
326
+ task_load_clap_emb=true
327
+ task_load_source_file=true
328
+ task_load_mel_file=false
329
+ model_proj_type=2
330
+ model_clone_batch=4
331
+ dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
332
+ model_clap_loss=1.0
333
+ average_top_k_layers=12 # modify with model depth
334
+ model_add_conv=true
335
+ model_modalities_image_conv_option=3
336
+ model_depth=12 #
337
+ checkpoint_keep_interval_updates=1 # default 1
338
+ checkpoint_save_interval_updates=10000
339
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 4 ]]; then
340
+ echo "Config ${train_mode} ${config_option}"
341
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
342
+ task_load_clap_emb=true
343
+ task_load_source_file=true
344
+ task_load_mel_file=false
345
+ model_proj_type=2
346
+ model_clone_batch=4
347
+ dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
348
+ model_clap_loss=1.0
349
+ average_top_k_layers=12 # modify with model depth
350
+ model_add_conv=true
351
+ model_modalities_image_conv_option=4
352
+ model_depth=12 #
353
+ checkpoint_keep_interval_updates=1 # default 1
354
+ checkpoint_save_interval_updates=10000
355
+ fi
356
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 5 ]]; then
357
+ echo "Config ${train_mode} ${config_option}"
358
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
359
+ task_load_clap_emb=true
360
+ task_load_source_file=true
361
+ task_load_mel_file=false
362
+ model_proj_type=2
363
+ model_clone_batch=4
364
+ dataset_batch_size=48 # original 48 oom on 4090 24G change distributed_world_size
365
+ model_clap_loss=1.0
366
+ average_top_k_layers=12 # modify with model depth
367
+ model_add_conv=true
368
+ model_modalities_image_conv_option=5
369
+ model_depth=12 #
370
+ checkpoint_keep_interval_updates=1 # default 1
371
+ checkpoint_save_interval_updates=10000
372
+ fi
373
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 6 ]]; then
374
+ echo "Config ${train_mode} ${config_option}"
375
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
376
+ task_load_clap_emb=true
377
+ task_load_source_file=true
378
+ task_load_mel_file=false
379
+ model_proj_type=2
380
+ model_clone_batch=4
381
+ dataset_batch_size=48 # original 48 oom on 4090 24G change distributed_world_size
382
+ model_clap_loss=1.0
383
+ average_top_k_layers=12 # modify with model depth
384
+ model_add_conv=true
385
+ model_modalities_image_conv_option=6
386
+ model_depth=12 #
387
+ checkpoint_keep_interval_updates=1 # default 1
388
+ checkpoint_save_interval_updates=10000
389
+ fi
390
+
391
# Launch pre-training through fairseq's Hydra entry point.
#
# Inputs: the shell variables assigned earlier in this script (defaults plus
# the branch selected by train_mode/config_option) and the optional first
# positional argument, which sets the distributed world size (default 2).
# The exit status of this script is the exit status of the python process.

# Refuse to start when no configuration branch supplied a dataset location;
# otherwise the trainer would be started with an empty task.data override.
if [[ -z "${task_data:-}" ]]; then
  printf 'error: task_data is empty (train_mode=%s, config_option=%s)\n' \
    "${train_mode:-}" "${config_option:-}" >&2
  exit 1
fi

# One array element per command-line word, so values containing spaces or
# glob characters reach the trainer unchanged. Overrides starting with '+'
# use Hydra's "append new key" syntax.
train_args=(
  -m
  --config-dir ./EAT/config
  --config-name pretraining_AS2M
  "common.user_dir=./EAT"
  "checkpoint.save_dir=${checkpoint_save_dir}"
  "checkpoint.restore_file=${checkpoint_restore_file}"
  "distributed_training.distributed_world_size=${1:-2}"
  "dataset.num_workers=24"
  "dataset.data_buffer_size=48"
  "dataset.batch_size=${dataset_batch_size}"
  "task.data=${task_data}"
  "task.h5_format=False"
  "task.load_clap_emb=${task_load_clap_emb}"
  "+task.load_source_file=${task_load_source_file}"
  "+task.load_mel_file=${task_load_mel_file}"
  "model.proj_type=${model_proj_type}"
  "model.clone_batch=${model_clone_batch}"
  "model.clap_loss=${model_clap_loss}"
  "model.average_top_k_layers=${average_top_k_layers}"
  "+model.add_conv=${model_add_conv}"
  "+model.clap_loss_type=${model_clap_loss_type}"
  "+model.clap_loss_layer=${model_clap_loss_layer}"
  "+model.dispersive_loss=${model_dispersive_loss}"
  "+model.dispersive_loss_layer=${model_dispersive_loss_layer}"
  "model.depth=${model_depth}"
  "+model.modalities.image.conv_option=${model_modalities_image_conv_option}"
  "checkpoint.keep_interval_updates=${checkpoint_keep_interval_updates}"
  "checkpoint.save_interval_updates=${checkpoint_save_interval_updates}"
)

python fairseq_cli/hydra_train.py "${train_args[@]}"
pre_4_AS2M/conv_clap_1_2025-09-30_08-31-59/pretraining_AS2M.sh ADDED
@@ -0,0 +1,416 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Experiment selector: these two values choose which configuration branch
# further down in this script is applied.
train_mode=conv_clap
config_option=1
# Original author's reminder: change the world size too when switching
# configurations (presumably the distributed world size — confirm).

# Shared config: every launch gets its own timestamped output directory.
SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
checkpoint_save_dir="${SAVE_DIR_ROOT}/${train_mode}_${config_option}_$(date +"%Y-%m-%d_%H-%M-%S")"
# Because the directory name embeds the launch time, this file is not expected
# to exist yet on a fresh launch.
checkpoint_restore_file="${checkpoint_save_dir}/checkpoint_last.pt"

# Absolute path and file name of this script (symlinks resolved).
script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
script_name="$(basename -- "$script_path")"

# Create the run directory. Without it there is nowhere to write checkpoints,
# so stop here instead of starting a run that cannot save anything.
if ! mkdir -p -- "$checkpoint_save_dir"; then
  printf 'error: cannot create checkpoint_save_dir: %s\n' "$checkpoint_save_dir" >&2
  exit 1
fi

# Keep a copy of this script next to the checkpoints (permissions and
# timestamps preserved). The copy is only a record of the settings used, so a
# failure is reported but does not stop the run.
if ! cp -p -- "$script_path" "$checkpoint_save_dir/$script_name"; then
  printf 'warning: could not copy %s into %s\n' "$script_path" "$checkpoint_save_dir" >&2
fi
echo "script_path: ${script_path}"
echo "checkpoint_save_dir: ${checkpoint_save_dir}"
20
+
21
# Default settings. The configuration branch selected by
# train_mode/config_option overrides a subset of these values.
model_clone_batch=4
dataset_batch_size=48
model_clap_loss=0
model_clap_loss_type="mse" # other options: ce, cosine, l1
model_clap_loss_layer=0
average_top_k_layers=12
model_add_conv=false
model_depth=12
model_dispersive_loss=0
model_dispersive_loss_layer=0
checkpoint_keep_interval_updates=1 # TODO: change this parameter if needed
checkpoint_save_interval_updates=10000
model_modalities_image_conv_option=0

# Settings that have no default and are supplied by the configuration
# branches. They are cleared explicitly so that a value inherited from the
# caller's environment can never leak into the training command; a branch
# that does not assign one of them passes an empty value, exactly as an unset
# variable did before.
task_data=""
task_load_clap_emb=""
task_load_source_file=""
task_load_mel_file=""
model_proj_type=""
35
+
36
# Apply the settings of the experiment chosen by train_mode/config_option.
#
# Each arm assigns the dataset location, the task flags and whichever model /
# checkpoint values differ from the defaults set above. Note that the
# "clap:0".."clap:5" and "ast:*" arms do not assign task_load_source_file or
# task_load_mel_file. config_option must be written as a plain decimal number
# (e.g. 1, not 01). An unknown combination is rejected instead of silently
# continuing without any dataset settings.
case "${train_mode}:${config_option}" in
  default:0)
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=0
    checkpoint_keep_interval_updates=-1
    ;;

  # --- dispersive loss experiments ---
  disp:0)
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=4
    dataset_batch_size=96
    model_dispersive_loss=1
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
    ;;
  disp:1)
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=1
    dataset_batch_size=384
    model_dispersive_loss=1
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
    ;;
  disp:2)
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=1
    dataset_batch_size=384
    model_dispersive_loss=10.0
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
    ;;
  disp:3)
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=1
    dataset_batch_size=384
    model_dispersive_loss=100.0
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
    ;;
  disp:4)
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=1
    dataset_batch_size=384
    model_dispersive_loss=10000.0
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
    ;;
  disp:5)
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=1
    dataset_batch_size=384
    model_dispersive_loss=1000.0
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
    ;;
  disp:6)
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=4
    dataset_batch_size=96
    model_dispersive_loss=1000.0
    model_dispersive_loss_layer=10
    checkpoint_keep_interval_updates=1
    ;;

  # --- CLAP loss experiments ---
  clap:0)
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=12
    model_add_conv=false
    ;;
  clap:1)
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=1
    ;;
  # loss type ablation
  clap:2)
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="ce"
    ;;
  clap:3)
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="l1"
    ;;
  clap:4)
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="cosine"
    ;;
  # loss layer ablation
  clap:5)
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="mse"
    model_clap_loss_layer=10
    ;;
  clap:6)
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="mse"
    model_clap_loss_layer=8
    ;;
  clap:7)
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="mse"
    model_clap_loss_layer=6
    ;;
  clap:8)
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    model_clap_loss=5.0
    dataset_batch_size=96
    average_top_k_layers=12
    model_clap_loss_type="mse"
    checkpoint_keep_interval_updates=-1
    ;;
  clap:9)
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    model_clap_loss=0.1
    dataset_batch_size=96
    average_top_k_layers=12
    model_clap_loss_type="mse"
    checkpoint_keep_interval_updates=-1
    ;;

  # --- AST target experiments ---
  ast:0)
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
    task_load_clap_emb=true
    model_proj_type=4
    model_clone_batch=4
    model_clap_loss=1.0
    dataset_batch_size=48
    ;;
  ast:1)
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
    task_load_clap_emb=true
    model_proj_type=4
    model_clone_batch=4
    model_clap_loss=0.001
    dataset_batch_size=48
    ;;
  ast:2)
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
    task_load_clap_emb=true
    model_proj_type=4
    model_clone_batch=4
    model_clap_loss=0.01
    dataset_batch_size=48
    ;;
  ast:3)
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_out
    task_load_clap_emb=true
    model_proj_type=6
    model_clone_batch=4
    dataset_batch_size=48
    ;;

  # --- CLAP loss with the additional conv module ---
  conv_clap:0)
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=64 # original: 48; OOM on 4090 24G, change distributed_world_size
    model_clap_loss=1.0
    average_top_k_layers=11 # keep in step with model_depth
    model_add_conv=true
    model_depth=11
    checkpoint_keep_interval_updates=-1 # default 1
    checkpoint_save_interval_updates=10000
    ;;
  conv_clap:1)
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96 # original: 48; OOM on 4090 24G, change distributed_world_size
    model_clap_loss=1.0
    average_top_k_layers=12 # keep in step with model_depth
    model_add_conv=true
    model_modalities_image_conv_option=1
    model_depth=12
    checkpoint_keep_interval_updates=1 # default 1
    checkpoint_save_interval_updates=10000
    ;;
  conv_clap:2)
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96 # original: 48; OOM on 4090 24G, change distributed_world_size
    model_clap_loss=1.0
    average_top_k_layers=12 # keep in step with model_depth
    model_add_conv=true
    model_modalities_image_conv_option=2
    model_depth=12
    checkpoint_keep_interval_updates=1 # default 1
    checkpoint_save_interval_updates=10000
    ;;
  conv_clap:3)
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96 # original: 48; OOM on 4090 24G, change distributed_world_size
    model_clap_loss=1.0
    average_top_k_layers=12 # keep in step with model_depth
    model_add_conv=true
    model_modalities_image_conv_option=3
    model_depth=12
    checkpoint_keep_interval_updates=1 # default 1
    checkpoint_save_interval_updates=10000
    ;;
  conv_clap:4)
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96 # original: 48; OOM on 4090 24G, change distributed_world_size
    model_clap_loss=1.0
    average_top_k_layers=12 # keep in step with model_depth
    model_add_conv=true
    model_modalities_image_conv_option=4
    model_depth=12
    checkpoint_keep_interval_updates=1 # default 1
    checkpoint_save_interval_updates=10000
    ;;
  conv_clap:5)
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=48 # original: 48; OOM on 4090 24G, change distributed_world_size
    model_clap_loss=1.0
    average_top_k_layers=12 # keep in step with model_depth
    model_add_conv=true
    model_modalities_image_conv_option=5
    model_depth=12
    checkpoint_keep_interval_updates=1 # default 1
    checkpoint_save_interval_updates=10000
    ;;
  conv_clap:6)
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=48 # original: 48; OOM on 4090 24G, change distributed_world_size
    model_clap_loss=1.0
    average_top_k_layers=12 # keep in step with model_depth
    model_add_conv=true
    model_modalities_image_conv_option=6
    model_depth=12
    checkpoint_keep_interval_updates=1 # default 1
    checkpoint_save_interval_updates=10000
    ;;

  *)
    # No experiment matches: stop before a training run is started with no
    # dataset or task settings at all.
    printf 'error: unsupported train_mode/config_option: %s/%s\n' \
      "${train_mode}" "${config_option}" >&2
    exit 1
    ;;
esac
388
+
389
# Launch pre-training through fairseq's Hydra entry point.
#
# Inputs: the shell variables assigned earlier in this script (defaults plus
# the branch selected by train_mode/config_option) and the optional first
# positional argument, which sets the distributed world size (default 2).
# The exit status of this script is the exit status of the python process.

# Refuse to start when no configuration branch supplied a dataset location;
# otherwise the trainer would be started with an empty task.data override.
if [[ -z "${task_data:-}" ]]; then
  printf 'error: task_data is empty (train_mode=%s, config_option=%s)\n' \
    "${train_mode:-}" "${config_option:-}" >&2
  exit 1
fi

# One array element per command-line word, so values containing spaces or
# glob characters reach the trainer unchanged. Overrides starting with '+'
# use Hydra's "append new key" syntax.
train_args=(
  -m
  --config-dir ./EAT/config
  --config-name pretraining_AS2M
  "common.user_dir=./EAT"
  "checkpoint.save_dir=${checkpoint_save_dir}"
  "checkpoint.restore_file=${checkpoint_restore_file}"
  "distributed_training.distributed_world_size=${1:-2}"
  "dataset.num_workers=24"
  "dataset.data_buffer_size=48"
  "dataset.batch_size=${dataset_batch_size}"
  "task.data=${task_data}"
  "task.h5_format=False"
  "task.load_clap_emb=${task_load_clap_emb}"
  "+task.load_source_file=${task_load_source_file}"
  "+task.load_mel_file=${task_load_mel_file}"
  "model.proj_type=${model_proj_type}"
  "model.clone_batch=${model_clone_batch}"
  "model.clap_loss=${model_clap_loss}"
  "model.average_top_k_layers=${average_top_k_layers}"
  "+model.add_conv=${model_add_conv}"
  "+model.clap_loss_type=${model_clap_loss_type}"
  "+model.clap_loss_layer=${model_clap_loss_layer}"
  "+model.dispersive_loss=${model_dispersive_loss}"
  "+model.dispersive_loss_layer=${model_dispersive_loss_layer}"
  "model.depth=${model_depth}"
  "+model.modalities.image.conv_option=${model_modalities_image_conv_option}"
  "checkpoint.keep_interval_updates=${checkpoint_keep_interval_updates}"
  "checkpoint.save_interval_updates=${checkpoint_save_interval_updates}"
)

python fairseq_cli/hydra_train.py "${train_args[@]}"
pre_4_AS2M/conv_clap_2_2025-09-30_09-12-51/pretraining_AS2M.sh ADDED
@@ -0,0 +1,416 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Experiment selector: these two values choose which configuration branch
# further down in this script is applied.
train_mode=conv_clap
config_option=2
# Original author's reminder: change the world size too when switching
# configurations (presumably the distributed world size — confirm).

# Shared config: every launch gets its own timestamped output directory.
SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
checkpoint_save_dir="${SAVE_DIR_ROOT}/${train_mode}_${config_option}_$(date +"%Y-%m-%d_%H-%M-%S")"
# Because the directory name embeds the launch time, this file is not expected
# to exist yet on a fresh launch.
checkpoint_restore_file="${checkpoint_save_dir}/checkpoint_last.pt"

# Absolute path and file name of this script (symlinks resolved).
script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
script_name="$(basename -- "$script_path")"

# Create the run directory. Without it there is nowhere to write checkpoints,
# so stop here instead of starting a run that cannot save anything.
if ! mkdir -p -- "$checkpoint_save_dir"; then
  printf 'error: cannot create checkpoint_save_dir: %s\n' "$checkpoint_save_dir" >&2
  exit 1
fi

# Keep a copy of this script next to the checkpoints (permissions and
# timestamps preserved). The copy is only a record of the settings used, so a
# failure is reported but does not stop the run.
if ! cp -p -- "$script_path" "$checkpoint_save_dir/$script_name"; then
  printf 'warning: could not copy %s into %s\n' "$script_path" "$checkpoint_save_dir" >&2
fi
echo "script_path: ${script_path}"
echo "checkpoint_save_dir: ${checkpoint_save_dir}"
20
+
21
# Default settings. The configuration branch selected by
# train_mode/config_option overrides a subset of these values.
model_clone_batch=4
dataset_batch_size=48
model_clap_loss=0
model_clap_loss_type="mse" # other options: ce, cosine, l1
model_clap_loss_layer=0
average_top_k_layers=12
model_add_conv=false
model_depth=12
model_dispersive_loss=0
model_dispersive_loss_layer=0
checkpoint_keep_interval_updates=1 # TODO: change this parameter if needed
checkpoint_save_interval_updates=10000
model_modalities_image_conv_option=0

# Settings that have no default and are supplied by the configuration
# branches. They are cleared explicitly so that a value inherited from the
# caller's environment can never leak into the training command; a branch
# that does not assign one of them passes an empty value, exactly as an unset
# variable did before.
task_data=""
task_load_clap_emb=""
task_load_source_file=""
task_load_mel_file=""
model_proj_type=""
35
+
36
+ if [[ $train_mode == "default" && ${config_option} -eq 0 ]]; then
37
+ echo "Config ${train_mode} ${config_option}"
38
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
39
+ task_load_clap_emb=false
40
+ task_load_source_file=true
41
+ task_load_mel_file=false
42
+ model_proj_type=null
43
+ model_clone_batch=4
44
+ dataset_batch_size=96
45
+ model_clap_loss=0
46
+ checkpoint_keep_interval_updates=-1
47
+ elif [[ $train_mode == "disp" && ${config_option} -eq 0 ]]; then
48
+ echo "Config ${train_mode} ${config_option}"
49
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
50
+ task_load_clap_emb=false
51
+ task_load_source_file=true
52
+ task_load_mel_file=false
53
+ model_proj_type=null
54
+ model_clone_batch=4
55
+ dataset_batch_size=96
56
+ model_dispersive_loss=1
57
+ model_dispersive_loss_layer=0
58
+ checkpoint_keep_interval_updates=1
59
+ elif [[ $train_mode == "disp" && ${config_option} -eq 1 ]]; then
60
+ echo "Config ${train_mode} ${config_option}"
61
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
62
+ task_load_clap_emb=false
63
+ task_load_source_file=true
64
+ task_load_mel_file=false
65
+ model_proj_type=null
66
+ model_clone_batch=1
67
+ dataset_batch_size=384
68
+ model_dispersive_loss=1
69
+ model_dispersive_loss_layer=0
70
+ checkpoint_keep_interval_updates=1
71
+ elif [[ $train_mode == "disp" && ${config_option} -eq 2 ]]; then
72
+ echo "Config ${train_mode} ${config_option}"
73
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
74
+ task_load_clap_emb=false
75
+ task_load_source_file=true
76
+ task_load_mel_file=false
77
+ model_proj_type=null
78
+ model_clone_batch=1
79
+ dataset_batch_size=384
80
+ model_dispersive_loss=10.0
81
+ model_dispersive_loss_layer=0
82
+ checkpoint_keep_interval_updates=1
83
+ elif [[ $train_mode == "disp" && ${config_option} -eq 3 ]]; then
84
+ echo "Config ${train_mode} ${config_option}"
85
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
86
+ task_load_clap_emb=false
87
+ task_load_source_file=true
88
+ task_load_mel_file=false
89
+ model_proj_type=null
90
+ model_clone_batch=1
91
+ dataset_batch_size=384
92
+ model_dispersive_loss=100.0
93
+ model_dispersive_loss_layer=0
94
+ checkpoint_keep_interval_updates=1
95
+ elif [[ $train_mode == "disp" && ${config_option} -eq 4 ]]; then
96
+ echo "Config ${train_mode} ${config_option}"
97
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
98
+ task_load_clap_emb=false
99
+ task_load_source_file=true
100
+ task_load_mel_file=false
101
+ model_proj_type=null
102
+ model_clone_batch=1
103
+ dataset_batch_size=384
104
+ model_dispersive_loss=10000.0
105
+ model_dispersive_loss_layer=0
106
+ checkpoint_keep_interval_updates=1
107
+ elif [[ $train_mode == "disp" && ${config_option} -eq 5 ]]; then
108
+ echo "Config ${train_mode} ${config_option}"
109
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
110
+ task_load_clap_emb=false
111
+ task_load_source_file=true
112
+ task_load_mel_file=false
113
+ model_proj_type=null
114
+ model_clone_batch=1
115
+ dataset_batch_size=384
116
+ model_dispersive_loss=1000.0
117
+ model_dispersive_loss_layer=0
118
+ checkpoint_keep_interval_updates=1
119
+ elif [[ $train_mode == "disp" && ${config_option} -eq 6 ]]; then
120
+ echo "Config ${train_mode} ${config_option}"
121
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
122
+ task_load_clap_emb=false
123
+ task_load_source_file=true
124
+ task_load_mel_file=false
125
+ model_proj_type=null
126
+ model_clone_batch=4
127
+ dataset_batch_size=96
128
+ model_dispersive_loss=1000.0
129
+ model_dispersive_loss_layer=10
130
+ checkpoint_keep_interval_updates=1
131
+ elif [[ $train_mode == "clap" && ${config_option} -eq 0 ]]; then
132
+ echo "Config ${train_mode} ${config_option}"
133
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
134
+ task_load_clap_emb=true
135
+ model_proj_type=2
136
+ model_clone_batch=4
137
+ dataset_batch_size=48
138
+ model_clap_loss=1.0
139
+ average_top_k_layers=12
140
+ model_add_conv=false
141
+ elif [[ $train_mode == "clap" && ${config_option} -eq 1 ]]; then
142
+ echo "Config ${train_mode} ${config_option}"
143
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
144
+ task_load_clap_emb=true
145
+ model_proj_type=2
146
+ model_clone_batch=4
147
+ dataset_batch_size=48
148
+ model_clap_loss=1.0
149
+ average_top_k_layers=1
150
+ # loss type ablation
151
+ elif [[ $train_mode == "clap" && ${config_option} -eq 2 ]]; then
152
+ echo "Config ${train_mode} ${config_option}"
153
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
154
+ task_load_clap_emb=true
155
+ model_proj_type=2
156
+ model_clone_batch=4
157
+ dataset_batch_size=48
158
+ model_clap_loss=1.0
159
+ average_top_k_layers=12
160
+ model_clap_loss_type="ce"
161
+ elif [[ $train_mode == "clap" && ${config_option} -eq 3 ]]; then
162
+ echo "Config ${train_mode} ${config_option}"
163
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
164
+ task_load_clap_emb=true
165
+ model_proj_type=2
166
+ model_clone_batch=4
167
+ dataset_batch_size=48
168
+ model_clap_loss=1.0
169
+ average_top_k_layers=12
170
+ model_clap_loss_type="l1"
171
+ elif [[ $train_mode == "clap" && ${config_option} -eq 4 ]]; then
172
+ echo "Config ${train_mode} ${config_option}"
173
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
174
+ task_load_clap_emb=true
175
+ model_proj_type=2
176
+ model_clone_batch=4
177
+ dataset_batch_size=96
178
+ model_clap_loss=1.0
179
+ average_top_k_layers=12
180
+ model_clap_loss_type="cosine"
181
+ # loss layer ablation
182
+ elif [[ $train_mode == "clap" && ${config_option} -eq 5 ]]; then
183
+ echo "Config ${train_mode} ${config_option}"
184
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
185
+ task_load_clap_emb=true
186
+ model_proj_type=2
187
+ model_clone_batch=4
188
+ dataset_batch_size=96
189
+ model_clap_loss=1.0
190
+ average_top_k_layers=12
191
+ model_clap_loss_type="mse"
192
+ model_clap_loss_layer=10
193
+ elif [[ $train_mode == "clap" && ${config_option} -eq 6 ]]; then
194
+ echo "Config ${train_mode} ${config_option}"
195
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
196
+ task_load_clap_emb=true
197
+ task_load_source_file=true
198
+ task_load_mel_file=false
199
+ model_proj_type=2
200
+ model_clone_batch=4
201
+ dataset_batch_size=96
202
+ model_clap_loss=1.0
203
+ average_top_k_layers=12
204
+ model_clap_loss_type="mse"
205
+ model_clap_loss_layer=8
206
+ elif [[ $train_mode == "clap" && ${config_option} -eq 7 ]]; then
207
+ echo "Config ${train_mode} ${config_option}"
208
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
209
+ task_load_clap_emb=true
210
+ task_load_source_file=true
211
+ task_load_mel_file=false
212
+ model_proj_type=2
213
+ model_clone_batch=4
214
+ dataset_batch_size=96
215
+ model_clap_loss=1.0
216
+ average_top_k_layers=12
217
+ model_clap_loss_type="mse"
218
+ model_clap_loss_layer=6
219
+ elif [[ $train_mode == "clap" && ${config_option} -eq 8 ]]; then
220
+ echo "Config ${train_mode} ${config_option}"
221
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
222
+ task_load_clap_emb=true
223
+ task_load_source_file=true
224
+ task_load_mel_file=false
225
+ model_proj_type=2
226
+ model_clone_batch=4
227
+ model_clap_loss=5.0
228
+ dataset_batch_size=96
229
+ average_top_k_layers=12
230
+ model_clap_loss_type="mse"
231
+ checkpoint_keep_interval_updates=-1
232
+ elif [[ $train_mode == "clap" && ${config_option} -eq 9 ]]; then
233
+ echo "Config ${train_mode} ${config_option}"
234
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
235
+ task_load_clap_emb=true
236
+ task_load_source_file=true
237
+ task_load_mel_file=false
238
+ model_proj_type=2
239
+ model_clone_batch=4
240
+ model_clap_loss=0.1
241
+ dataset_batch_size=96
242
+ average_top_k_layers=12
243
+ model_clap_loss_type="mse"
244
+ checkpoint_keep_interval_updates=-1
245
+ elif [[ $train_mode == "ast" && ${config_option} -eq 0 ]]; then
246
+ echo "Config ${train_mode} ${config_option}"
247
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
248
+ task_load_clap_emb=true
249
+ model_proj_type=4
250
+ model_clone_batch=4
251
+ model_clap_loss=1.0
252
+ dataset_batch_size=48
253
+ elif [[ $train_mode == "ast" && ${config_option} -eq 1 ]]; then
254
+ echo "Config ${train_mode} ${config_option}"
255
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
256
+ task_load_clap_emb=true
257
+ model_proj_type=4
258
+ model_clone_batch=4
259
+ model_clap_loss=0.001
260
+ dataset_batch_size=48
261
+ elif [[ $train_mode == "ast" && ${config_option} -eq 2 ]]; then
262
+ echo "Config ${train_mode} ${config_option}"
263
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
264
+ task_load_clap_emb=true
265
+ model_proj_type=4
266
+ model_clone_batch=4
267
+ model_clap_loss=0.01
268
+ dataset_batch_size=48
269
+ elif [[ $train_mode == "ast" && ${config_option} -eq 3 ]]; then
270
+ echo "Config ${train_mode} ${config_option}"
271
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_out
272
+ task_load_clap_emb=true
273
+ model_proj_type=6
274
+ model_clone_batch=4
275
+ dataset_batch_size=48
276
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 0 ]]; then
277
+ echo "Config ${train_mode} ${config_option}"
278
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
279
+ task_load_clap_emb=true
280
+ task_load_source_file=true
281
+ task_load_mel_file=false
282
+ model_proj_type=2
283
+ model_clone_batch=4
284
+ dataset_batch_size=64 # original 48 oom on 4090 24G change distributed_world_size
285
+ model_clap_loss=1.0
286
+ average_top_k_layers=11 # modify with model depth
287
+ model_add_conv=true
288
+ model_depth=11 #
289
+ checkpoint_keep_interval_updates=-1 # default 1
290
+ checkpoint_save_interval_updates=10000
291
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 1 ]]; then
292
+ echo "Config ${train_mode} ${config_option}"
293
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
294
+ task_load_clap_emb=true
295
+ task_load_source_file=true
296
+ task_load_mel_file=false
297
+ model_proj_type=2
298
+ model_clone_batch=4
299
+ dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
300
+ model_clap_loss=1.0
301
+ average_top_k_layers=12 # modify with model depth
302
+ model_add_conv=true
303
+ model_modalities_image_conv_option=1
304
+ model_depth=12 #
305
+ checkpoint_keep_interval_updates=1 # default 1
306
+ checkpoint_save_interval_updates=10000
307
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 2 ]]; then
308
+ echo "Config ${train_mode} ${config_option}"
309
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
310
+ task_load_clap_emb=true
311
+ task_load_source_file=true
312
+ task_load_mel_file=false
313
+ model_proj_type=2
314
+ model_clone_batch=4
315
+ dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
316
+ model_clap_loss=1.0
317
+ average_top_k_layers=12 # modify with model depth
318
+ model_add_conv=true
319
+ model_modalities_image_conv_option=2
320
+ model_depth=12 #
321
+ checkpoint_keep_interval_updates=1 # default 1
322
+ checkpoint_save_interval_updates=10000
323
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 3 ]]; then
324
+ echo "Config ${train_mode} ${config_option}"
325
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
326
+ task_load_clap_emb=true
327
+ task_load_source_file=true
328
+ task_load_mel_file=false
329
+ model_proj_type=2
330
+ model_clone_batch=4
331
+ dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
332
+ model_clap_loss=1.0
333
+ average_top_k_layers=12 # modify with model depth
334
+ model_add_conv=true
335
+ model_modalities_image_conv_option=3
336
+ model_depth=12 #
337
+ checkpoint_keep_interval_updates=1 # default 1
338
+ checkpoint_save_interval_updates=10000
339
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 4 ]]; then
340
+ echo "Config ${train_mode} ${config_option}"
341
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
342
+ task_load_clap_emb=true
343
+ task_load_source_file=true
344
+ task_load_mel_file=false
345
+ model_proj_type=2
346
+ model_clone_batch=4
347
+ dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
348
+ model_clap_loss=1.0
349
+ average_top_k_layers=12 # modify with model depth
350
+ model_add_conv=true
351
+ model_modalities_image_conv_option=4
352
+ model_depth=12 #
353
+ checkpoint_keep_interval_updates=1 # default 1
354
+ checkpoint_save_interval_updates=10000
355
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 5 ]]; then
356
+ echo "Config ${train_mode} ${config_option}"
357
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
358
+ task_load_clap_emb=true
359
+ task_load_source_file=true
360
+ task_load_mel_file=false
361
+ model_proj_type=2
362
+ model_clone_batch=4
363
+ dataset_batch_size=48 # original 48 oom on 4090 24G change distributed_world_size
364
+ model_clap_loss=1.0
365
+ average_top_k_layers=12 # modify with model depth
366
+ model_add_conv=true
367
+ model_modalities_image_conv_option=5
368
+ model_depth=12 #
369
+ checkpoint_keep_interval_updates=1 # default 1
370
+ checkpoint_save_interval_updates=10000
371
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 6 ]]; then
372
+ echo "Config ${train_mode} ${config_option}"
373
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
374
+ task_load_clap_emb=true
375
+ task_load_source_file=true
376
+ task_load_mel_file=false
377
+ model_proj_type=2
378
+ model_clone_batch=4
379
+ dataset_batch_size=48 # original 48 oom on 4090 24G change distributed_world_size
380
+ model_clap_loss=1.0
381
+ average_top_k_layers=12 # modify with model depth
382
+ model_add_conv=true
383
+ model_modalities_image_conv_option=6
384
+ model_depth=12 #
385
+ checkpoint_keep_interval_updates=1 # default 1
386
+ checkpoint_save_interval_updates=10000
387
+ fi
388
+
# Launch the fairseq Hydra trainer in multirun mode (-m) using the values
# chosen above.
#   $1 (optional): distributed world size; defaults to 2 when not given.
# Overrides written with a leading '+' use Hydra's "append" form, i.e. they
# are presumably keys that the base pretraining_AS2M config does not declare
# -- TODO confirm against ./EAT/config.
# Every override is quoted so a value containing spaces or glob characters
# stays a single argument.
hydra_overrides=(
  "common.user_dir=./EAT"
  "checkpoint.save_dir=${checkpoint_save_dir}"
  "checkpoint.restore_file=${checkpoint_restore_file}"
  "distributed_training.distributed_world_size=${1:-2}"
  "dataset.num_workers=24"
  "dataset.data_buffer_size=48"
  "dataset.batch_size=${dataset_batch_size}"
  "task.data=${task_data}"
  "task.h5_format=False"
  "task.load_clap_emb=${task_load_clap_emb}"
  "+task.load_source_file=${task_load_source_file}"
  "+task.load_mel_file=${task_load_mel_file}"
  "model.proj_type=${model_proj_type}"
  "model.clone_batch=${model_clone_batch}"
  "model.clap_loss=${model_clap_loss}"
  "model.average_top_k_layers=${average_top_k_layers}"
  "+model.add_conv=${model_add_conv}"
  "+model.clap_loss_type=${model_clap_loss_type}"
  "+model.clap_loss_layer=${model_clap_loss_layer}"
  "+model.dispersive_loss=${model_dispersive_loss}"
  "+model.dispersive_loss_layer=${model_dispersive_loss_layer}"
  "model.depth=${model_depth}"
  "+model.modalities.image.conv_option=${model_modalities_image_conv_option}"
  "checkpoint.keep_interval_updates=${checkpoint_keep_interval_updates}"
  "checkpoint.save_interval_updates=${checkpoint_save_interval_updates}"
)

python fairseq_cli/hydra_train.py -m \
  --config-dir ./EAT/config \
  --config-name pretraining_AS2M \
  "${hydra_overrides[@]}"
pre_4_AS2M/conv_clap_4_2025-09-30_07-37-48/pretraining_AS2M.sh ADDED
@@ -0,0 +1,387 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/usr/bin/env bash
# Pre-training launcher: experiment selection, run directory and defaults.

# Experiment selector: train_mode names the preset family, config_option the
# numbered variant inside that family.
train_mode=conv_clap
config_option=4
# change world size

# Shared config: each invocation gets its own timestamped run directory.
SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
checkpoint_save_dir="${SAVE_DIR_ROOT}/${train_mode}_${config_option}_$(date +"%Y-%m-%d_%H-%M-%S")"
checkpoint_restore_file="${checkpoint_save_dir}/checkpoint_last.pt"

# Absolute path and file name of this script (symlinks resolved).
script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
script_name="$(basename -- "$script_path")"

# Create the run directory and snapshot this script into it
# (cp -p keeps permissions and timestamps).
# Without the directory there is nowhere to write checkpoints, so stop here.
if ! mkdir -p -- "$checkpoint_save_dir"; then
  printf 'error: cannot create %s\n' "$checkpoint_save_dir" >&2
  exit 1
fi
# The snapshot is only a record of the run, so a failed copy is not fatal.
if ! cp -p -- "$script_path" "$checkpoint_save_dir/$script_name"; then
  printf 'warning: could not copy %s into %s\n' \
    "$script_path" "$checkpoint_save_dir" >&2
fi
echo "script_path: ${script_path}"
echo "checkpoint_save_dir: ${checkpoint_save_dir}"

# Default settings; a preset may override any of them.
# NOTE(review): task_data, task_load_clap_emb, task_load_source_file,
# task_load_mel_file and model_proj_type have no default here, so a preset
# that omits one of them leaves it empty -- confirm that is intended.
model_clone_batch=4
dataset_batch_size=48
model_clap_loss=0
model_clap_loss_type="mse" # other options: ce, cosine, l1
model_clap_loss_layer=0
average_top_k_layers=12
model_add_conv=false
model_depth=12
model_dispersive_loss=0
model_dispersive_loss_layer=0
checkpoint_keep_interval_updates=1 # TODO: change this parameter if needed
checkpoint_save_interval_updates=10000
model_modalities_image_conv_option=0
35
+
#######################################
# Apply the hyper-parameter preset for one experiment.
#
# The original if/elif chain contained stray `fi` lines in front of the
# conv_clap 2, 3 and 4 branches, which closed the chain early and made the
# following `elif` a syntax error. A case dispatch has one arm per preset and
# no shared terminator to misplace.
#
# Globals:   writes the task_*, model_*, dataset_batch_size,
#            average_top_k_layers and checkpoint_* variables that the chosen
#            preset overrides; all others keep their current value
# Arguments: $1 - train mode (default | disp | clap | ast | conv_clap)
#            $2 - numbered variant within that mode
# Outputs:   "Config <mode> <option>" on stdout when a preset matched
# Returns:   0 if a preset matched, 1 otherwise (no variable is touched)
#######################################
select_pretrain_config() {
  local mode=$1
  local option=$2
  local data_root=/opt/gpfs/home/chushu/data/audioset/setting

  case "${mode}:${option}" in
    default:0)
      task_data="${data_root}/PRETRAIN_AS2M"
      task_load_clap_emb=false
      task_load_source_file=true
      task_load_mel_file=false
      model_proj_type=null
      model_clone_batch=4
      dataset_batch_size=96
      model_clap_loss=0
      checkpoint_keep_interval_updates=-1
      ;;
    disp:0)
      task_data="${data_root}/PRETRAIN_AS2M"
      task_load_clap_emb=false
      task_load_source_file=true
      task_load_mel_file=false
      model_proj_type=null
      model_clone_batch=4
      dataset_batch_size=96
      model_dispersive_loss=1
      model_dispersive_loss_layer=0
      checkpoint_keep_interval_updates=1
      ;;
    disp:[1-5])
      # Variants 1-5 differ only in the dispersive loss weight.
      task_data="${data_root}/PRETRAIN_AS2M"
      task_load_clap_emb=false
      task_load_source_file=true
      task_load_mel_file=false
      model_proj_type=null
      model_clone_batch=1
      dataset_batch_size=384
      case "${option}" in
        1) model_dispersive_loss=1 ;;
        2) model_dispersive_loss=10.0 ;;
        3) model_dispersive_loss=100.0 ;;
        4) model_dispersive_loss=10000.0 ;;
        5) model_dispersive_loss=1000.0 ;;
      esac
      model_dispersive_loss_layer=0
      checkpoint_keep_interval_updates=1
      ;;
    disp:6)
      task_data="${data_root}/PRETRAIN_AS2M"
      task_load_clap_emb=false
      task_load_source_file=true
      task_load_mel_file=false
      model_proj_type=null
      model_clone_batch=4
      dataset_batch_size=96
      model_dispersive_loss=1000.0
      model_dispersive_loss_layer=10
      checkpoint_keep_interval_updates=1
      ;;
    clap:0)
      task_data="${data_root}/PRETRAIN_AS2M_w_CLAP"
      task_load_clap_emb=true
      model_proj_type=2
      model_clone_batch=4
      dataset_batch_size=48
      model_clap_loss=1.0
      average_top_k_layers=12
      model_add_conv=false
      ;;
    clap:1)
      task_data="${data_root}/PRETRAIN_AS2M_w_CLAP"
      task_load_clap_emb=true
      model_proj_type=2
      model_clone_batch=4
      dataset_batch_size=48
      model_clap_loss=1.0
      average_top_k_layers=1
      ;;
    # loss type ablation
    clap:2)
      task_data="${data_root}/PRETRAIN_AS2M_w_CLAP"
      task_load_clap_emb=true
      model_proj_type=2
      model_clone_batch=4
      dataset_batch_size=48
      model_clap_loss=1.0
      average_top_k_layers=12
      model_clap_loss_type="ce"
      ;;
    clap:3)
      task_data="${data_root}/PRETRAIN_AS2M_w_CLAP"
      task_load_clap_emb=true
      model_proj_type=2
      model_clone_batch=4
      dataset_batch_size=48
      model_clap_loss=1.0
      average_top_k_layers=12
      model_clap_loss_type="l1"
      ;;
    clap:4)
      task_data="${data_root}/PRETRAIN_AS2M_w_CLAP"
      task_load_clap_emb=true
      model_proj_type=2
      model_clone_batch=4
      dataset_batch_size=96
      model_clap_loss=1.0
      average_top_k_layers=12
      model_clap_loss_type="cosine"
      ;;
    # loss layer ablation
    clap:5)
      task_data="${data_root}/PRETRAIN_AS2M_w_CLAP"
      task_load_clap_emb=true
      model_proj_type=2
      model_clone_batch=4
      dataset_batch_size=96
      model_clap_loss=1.0
      average_top_k_layers=12
      model_clap_loss_type="mse"
      model_clap_loss_layer=10
      ;;
    clap:6)
      task_data="${data_root}/PRETRAIN_AS2M_w_CLAP"
      task_load_clap_emb=true
      task_load_source_file=true
      task_load_mel_file=false
      model_proj_type=2
      model_clone_batch=4
      dataset_batch_size=96
      model_clap_loss=1.0
      average_top_k_layers=12
      model_clap_loss_type="mse"
      model_clap_loss_layer=8
      ;;
    clap:7)
      task_data="${data_root}/PRETRAIN_AS2M_w_CLAP"
      task_load_clap_emb=true
      task_load_source_file=true
      task_load_mel_file=false
      model_proj_type=2
      model_clone_batch=4
      dataset_batch_size=96
      model_clap_loss=1.0
      average_top_k_layers=12
      model_clap_loss_type="mse"
      model_clap_loss_layer=6
      ;;
    clap:8)
      task_data="${data_root}/PRETRAIN_AS2M_w_CLAP"
      task_load_clap_emb=true
      task_load_source_file=true
      task_load_mel_file=false
      model_proj_type=2
      model_clone_batch=4
      model_clap_loss=5.0
      dataset_batch_size=96
      average_top_k_layers=12
      model_clap_loss_type="mse"
      checkpoint_keep_interval_updates=-1
      ;;
    clap:9)
      task_data="${data_root}/PRETRAIN_AS2M_w_CLAP"
      task_load_clap_emb=true
      task_load_source_file=true
      task_load_mel_file=false
      model_proj_type=2
      model_clone_batch=4
      model_clap_loss=0.1
      dataset_batch_size=96
      average_top_k_layers=12
      model_clap_loss_type="mse"
      checkpoint_keep_interval_updates=-1
      ;;
    ast:0)
      task_data="${data_root}/PRETRAIN_AS2M_w_AST/mlp_head_in"
      task_load_clap_emb=true
      model_proj_type=4
      model_clone_batch=4
      model_clap_loss=1.0
      dataset_batch_size=48
      ;;
    ast:1)
      task_data="${data_root}/PRETRAIN_AS2M_w_AST/mlp_head_in"
      task_load_clap_emb=true
      model_proj_type=4
      model_clone_batch=4
      model_clap_loss=0.001
      dataset_batch_size=48
      ;;
    ast:2)
      task_data="${data_root}/PRETRAIN_AS2M_w_AST/mlp_head_in"
      task_load_clap_emb=true
      model_proj_type=4
      model_clone_batch=4
      model_clap_loss=0.01
      dataset_batch_size=48
      ;;
    ast:3)
      task_data="${data_root}/PRETRAIN_AS2M_w_AST/mlp_head_out"
      task_load_clap_emb=true
      model_proj_type=6
      model_clone_batch=4
      dataset_batch_size=48
      ;;
    conv_clap:0)
      task_data="${data_root}/PRETRAIN_AS2M_w_CLAP"
      task_load_clap_emb=true
      task_load_source_file=true
      task_load_mel_file=false
      model_proj_type=2
      model_clone_batch=4
      dataset_batch_size=64 # original 48 oom on 4090 24G change distributed_world_size
      model_clap_loss=1.0
      average_top_k_layers=11 # modify with model depth
      model_add_conv=true
      model_depth=11
      checkpoint_keep_interval_updates=-1 # default 1
      checkpoint_save_interval_updates=10000
      ;;
    conv_clap:[1-4])
      # Variants 1-4 are identical except that the conv option equals the
      # variant number.
      task_data="${data_root}/PRETRAIN_AS2M_w_CLAP"
      task_load_clap_emb=true
      task_load_source_file=true
      task_load_mel_file=false
      model_proj_type=2
      model_clone_batch=4
      dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
      model_clap_loss=1.0
      average_top_k_layers=12 # modify with model depth
      model_add_conv=true
      model_modalities_image_conv_option=${option}
      model_depth=12
      checkpoint_keep_interval_updates=1 # default 1
      checkpoint_save_interval_updates=10000
      ;;
    *)
      return 1
      ;;
  esac

  echo "Config ${mode} ${option}"
}

# As in the original chain, an unknown combination is not fatal: the values
# set earlier stay in effect. The warning only makes the miss visible.
if ! select_pretrain_config "${train_mode}" "${config_option}"; then
  printf 'warning: no preset for train_mode=%s config_option=%s\n' \
    "${train_mode}" "${config_option}" >&2
fi
359
+
# Launch the fairseq Hydra trainer in multirun mode (-m) using the values
# chosen above.
#   $1 (optional): distributed world size; defaults to 2 when not given.
# Overrides written with a leading '+' use Hydra's "append" form, i.e. they
# are presumably keys that the base pretraining_AS2M config does not declare
# -- TODO confirm against ./EAT/config.
# Every override is quoted so a value containing spaces or glob characters
# stays a single argument.
hydra_overrides=(
  "common.user_dir=./EAT"
  "checkpoint.save_dir=${checkpoint_save_dir}"
  "checkpoint.restore_file=${checkpoint_restore_file}"
  "distributed_training.distributed_world_size=${1:-2}"
  "dataset.num_workers=24"
  "dataset.data_buffer_size=48"
  "dataset.batch_size=${dataset_batch_size}"
  "task.data=${task_data}"
  "task.h5_format=False"
  "task.load_clap_emb=${task_load_clap_emb}"
  "+task.load_source_file=${task_load_source_file}"
  "+task.load_mel_file=${task_load_mel_file}"
  "model.proj_type=${model_proj_type}"
  "model.clone_batch=${model_clone_batch}"
  "model.clap_loss=${model_clap_loss}"
  "model.average_top_k_layers=${average_top_k_layers}"
  "+model.add_conv=${model_add_conv}"
  "+model.clap_loss_type=${model_clap_loss_type}"
  "+model.clap_loss_layer=${model_clap_loss_layer}"
  "+model.dispersive_loss=${model_dispersive_loss}"
  "+model.dispersive_loss_layer=${model_dispersive_loss_layer}"
  "model.depth=${model_depth}"
  "+model.modalities.image.conv_option=${model_modalities_image_conv_option}"
  "checkpoint.keep_interval_updates=${checkpoint_keep_interval_updates}"
  "checkpoint.save_interval_updates=${checkpoint_save_interval_updates}"
)

python fairseq_cli/hydra_train.py -m \
  --config-dir ./EAT/config \
  --config-name pretraining_AS2M \
  "${hydra_overrides[@]}"
pre_4_AS2M/conv_clap_4_2025-09-30_07-38-18/pretraining_AS2M.sh ADDED
@@ -0,0 +1,384 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/usr/bin/env bash
# Pre-training launcher: experiment selection, run directory and defaults.

# Experiment selector: train_mode names the preset family, config_option the
# numbered variant inside that family.
train_mode=conv_clap
config_option=4
# change world size

# Shared config: each invocation gets its own timestamped run directory.
SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
checkpoint_save_dir="${SAVE_DIR_ROOT}/${train_mode}_${config_option}_$(date +"%Y-%m-%d_%H-%M-%S")"
checkpoint_restore_file="${checkpoint_save_dir}/checkpoint_last.pt"

# Absolute path and file name of this script (symlinks resolved).
script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
script_name="$(basename -- "$script_path")"

# Create the run directory and snapshot this script into it
# (cp -p keeps permissions and timestamps).
# Without the directory there is nowhere to write checkpoints, so stop here.
if ! mkdir -p -- "$checkpoint_save_dir"; then
  printf 'error: cannot create %s\n' "$checkpoint_save_dir" >&2
  exit 1
fi
# The snapshot is only a record of the run, so a failed copy is not fatal.
if ! cp -p -- "$script_path" "$checkpoint_save_dir/$script_name"; then
  printf 'warning: could not copy %s into %s\n' \
    "$script_path" "$checkpoint_save_dir" >&2
fi
echo "script_path: ${script_path}"
echo "checkpoint_save_dir: ${checkpoint_save_dir}"

# Default settings; a preset may override any of them.
# NOTE(review): task_data, task_load_clap_emb, task_load_source_file,
# task_load_mel_file and model_proj_type have no default here, so a preset
# that omits one of them leaves it empty -- confirm that is intended.
model_clone_batch=4
dataset_batch_size=48
model_clap_loss=0
model_clap_loss_type="mse" # other options: ce, cosine, l1
model_clap_loss_layer=0
average_top_k_layers=12
model_add_conv=false
model_depth=12
model_dispersive_loss=0
model_dispersive_loss_layer=0
checkpoint_keep_interval_updates=1 # TODO: change this parameter if needed
checkpoint_save_interval_updates=10000
model_modalities_image_conv_option=0
35
+
36
+ if [[ $train_mode == "default" && ${config_option} -eq 0 ]]; then
37
+ echo "Config ${train_mode} ${config_option}"
38
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
39
+ task_load_clap_emb=false
40
+ task_load_source_file=true
41
+ task_load_mel_file=false
42
+ model_proj_type=null
43
+ model_clone_batch=4
44
+ dataset_batch_size=96
45
+ model_clap_loss=0
46
+ checkpoint_keep_interval_updates=-1
47
+ elif [[ $train_mode == "disp" && ${config_option} -eq 0 ]]; then
48
+ echo "Config ${train_mode} ${config_option}"
49
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
50
+ task_load_clap_emb=false
51
+ task_load_source_file=true
52
+ task_load_mel_file=false
53
+ model_proj_type=null
54
+ model_clone_batch=4
55
+ dataset_batch_size=96
56
+ model_dispersive_loss=1
57
+ model_dispersive_loss_layer=0
58
+ checkpoint_keep_interval_updates=1
59
+ elif [[ $train_mode == "disp" && ${config_option} -eq 1 ]]; then
60
+ echo "Config ${train_mode} ${config_option}"
61
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
62
+ task_load_clap_emb=false
63
+ task_load_source_file=true
64
+ task_load_mel_file=false
65
+ model_proj_type=null
66
+ model_clone_batch=1
67
+ dataset_batch_size=384
68
+ model_dispersive_loss=1
69
+ model_dispersive_loss_layer=0
70
+ checkpoint_keep_interval_updates=1
71
+ elif [[ $train_mode == "disp" && ${config_option} -eq 2 ]]; then
72
+ echo "Config ${train_mode} ${config_option}"
73
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
74
+ task_load_clap_emb=false
75
+ task_load_source_file=true
76
+ task_load_mel_file=false
77
+ model_proj_type=null
78
+ model_clone_batch=1
79
+ dataset_batch_size=384
80
+ model_dispersive_loss=10.0
81
+ model_dispersive_loss_layer=0
82
+ checkpoint_keep_interval_updates=1
83
+ elif [[ $train_mode == "disp" && ${config_option} -eq 3 ]]; then
84
+ echo "Config ${train_mode} ${config_option}"
85
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
86
+ task_load_clap_emb=false
87
+ task_load_source_file=true
88
+ task_load_mel_file=false
89
+ model_proj_type=null
90
+ model_clone_batch=1
91
+ dataset_batch_size=384
92
+ model_dispersive_loss=100.0
93
+ model_dispersive_loss_layer=0
94
+ checkpoint_keep_interval_updates=1
95
+ elif [[ $train_mode == "disp" && ${config_option} -eq 4 ]]; then
96
+ echo "Config ${train_mode} ${config_option}"
97
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
98
+ task_load_clap_emb=false
99
+ task_load_source_file=true
100
+ task_load_mel_file=false
101
+ model_proj_type=null
102
+ model_clone_batch=1
103
+ dataset_batch_size=384
104
+ model_dispersive_loss=10000.0
105
+ model_dispersive_loss_layer=0
106
+ checkpoint_keep_interval_updates=1
107
+ elif [[ $train_mode == "disp" && ${config_option} -eq 5 ]]; then
108
+ echo "Config ${train_mode} ${config_option}"
109
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
110
+ task_load_clap_emb=false
111
+ task_load_source_file=true
112
+ task_load_mel_file=false
113
+ model_proj_type=null
114
+ model_clone_batch=1
115
+ dataset_batch_size=384
116
+ model_dispersive_loss=1000.0
117
+ model_dispersive_loss_layer=0
118
+ checkpoint_keep_interval_updates=1
119
+ elif [[ $train_mode == "disp" && ${config_option} -eq 6 ]]; then
120
+ echo "Config ${train_mode} ${config_option}"
121
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
122
+ task_load_clap_emb=false
123
+ task_load_source_file=true
124
+ task_load_mel_file=false
125
+ model_proj_type=null
126
+ model_clone_batch=4
127
+ dataset_batch_size=96
128
+ model_dispersive_loss=1000.0
129
+ model_dispersive_loss_layer=10
130
+ checkpoint_keep_interval_updates=1
131
+ elif [[ $train_mode == "clap" && ${config_option} -eq 0 ]]; then
132
+ echo "Config ${train_mode} ${config_option}"
133
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
134
+ task_load_clap_emb=true
135
+ model_proj_type=2
136
+ model_clone_batch=4
137
+ dataset_batch_size=48
138
+ model_clap_loss=1.0
139
+ average_top_k_layers=12
140
+ model_add_conv=false
141
+ elif [[ $train_mode == "clap" && ${config_option} -eq 1 ]]; then
142
+ echo "Config ${train_mode} ${config_option}"
143
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
144
+ task_load_clap_emb=true
145
+ model_proj_type=2
146
+ model_clone_batch=4
147
+ dataset_batch_size=48
148
+ model_clap_loss=1.0
149
+ average_top_k_layers=1
150
+ # loss type ablation
151
+ elif [[ $train_mode == "clap" && ${config_option} -eq 2 ]]; then
152
+ echo "Config ${train_mode} ${config_option}"
153
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
154
+ task_load_clap_emb=true
155
+ model_proj_type=2
156
+ model_clone_batch=4
157
+ dataset_batch_size=48
158
+ model_clap_loss=1.0
159
+ average_top_k_layers=12
160
+ model_clap_loss_type="ce"
161
+ elif [[ $train_mode == "clap" && ${config_option} -eq 3 ]]; then
162
+ echo "Config ${train_mode} ${config_option}"
163
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
164
+ task_load_clap_emb=true
165
+ model_proj_type=2
166
+ model_clone_batch=4
167
+ dataset_batch_size=48
168
+ model_clap_loss=1.0
169
+ average_top_k_layers=12
170
+ model_clap_loss_type="l1"
171
+ elif [[ $train_mode == "clap" && ${config_option} -eq 4 ]]; then
172
+ echo "Config ${train_mode} ${config_option}"
173
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
174
+ task_load_clap_emb=true
175
+ model_proj_type=2
176
+ model_clone_batch=4
177
+ dataset_batch_size=96
178
+ model_clap_loss=1.0
179
+ average_top_k_layers=12
180
+ model_clap_loss_type="cosine"
181
+ # loss layer ablation
182
+ elif [[ $train_mode == "clap" && ${config_option} -eq 5 ]]; then
183
+ echo "Config ${train_mode} ${config_option}"
184
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
185
+ task_load_clap_emb=true
186
+ model_proj_type=2
187
+ model_clone_batch=4
188
+ dataset_batch_size=96
189
+ model_clap_loss=1.0
190
+ average_top_k_layers=12
191
+ model_clap_loss_type="mse"
192
+ model_clap_loss_layer=10
193
+ elif [[ $train_mode == "clap" && ${config_option} -eq 6 ]]; then
194
+ echo "Config ${train_mode} ${config_option}"
195
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
196
+ task_load_clap_emb=true
197
+ task_load_source_file=true
198
+ task_load_mel_file=false
199
+ model_proj_type=2
200
+ model_clone_batch=4
201
+ dataset_batch_size=96
202
+ model_clap_loss=1.0
203
+ average_top_k_layers=12
204
+ model_clap_loss_type="mse"
205
+ model_clap_loss_layer=8
206
+ elif [[ $train_mode == "clap" && ${config_option} -eq 7 ]]; then
207
+ echo "Config ${train_mode} ${config_option}"
208
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
209
+ task_load_clap_emb=true
210
+ task_load_source_file=true
211
+ task_load_mel_file=false
212
+ model_proj_type=2
213
+ model_clone_batch=4
214
+ dataset_batch_size=96
215
+ model_clap_loss=1.0
216
+ average_top_k_layers=12
217
+ model_clap_loss_type="mse"
218
+ model_clap_loss_layer=6
219
+ elif [[ $train_mode == "clap" && ${config_option} -eq 8 ]]; then
220
+ echo "Config ${train_mode} ${config_option}"
221
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
222
+ task_load_clap_emb=true
223
+ task_load_source_file=true
224
+ task_load_mel_file=false
225
+ model_proj_type=2
226
+ model_clone_batch=4
227
+ model_clap_loss=5.0
228
+ dataset_batch_size=96
229
+ average_top_k_layers=12
230
+ model_clap_loss_type="mse"
231
+ checkpoint_keep_interval_updates=-1
232
+ elif [[ $train_mode == "clap" && ${config_option} -eq 9 ]]; then
233
+ echo "Config ${train_mode} ${config_option}"
234
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
235
+ task_load_clap_emb=true
236
+ task_load_source_file=true
237
+ task_load_mel_file=false
238
+ model_proj_type=2
239
+ model_clone_batch=4
240
+ model_clap_loss=0.1
241
+ dataset_batch_size=96
242
+ average_top_k_layers=12
243
+ model_clap_loss_type="mse"
244
+ checkpoint_keep_interval_updates=-1
245
+ elif [[ $train_mode == "ast" && ${config_option} -eq 0 ]]; then
246
+ echo "Config ${train_mode} ${config_option}"
247
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
248
+ task_load_clap_emb=true
249
+ model_proj_type=4
250
+ model_clone_batch=4
251
+ model_clap_loss=1.0
252
+ dataset_batch_size=48
253
+ elif [[ $train_mode == "ast" && ${config_option} -eq 1 ]]; then
254
+ echo "Config ${train_mode} ${config_option}"
255
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
256
+ task_load_clap_emb=true
257
+ model_proj_type=4
258
+ model_clone_batch=4
259
+ model_clap_loss=0.001
260
+ dataset_batch_size=48
261
+ elif [[ $train_mode == "ast" && ${config_option} -eq 2 ]]; then
262
+ echo "Config ${train_mode} ${config_option}"
263
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
264
+ task_load_clap_emb=true
265
+ model_proj_type=4
266
+ model_clone_batch=4
267
+ model_clap_loss=0.01
268
+ dataset_batch_size=48
269
+ elif [[ $train_mode == "ast" && ${config_option} -eq 3 ]]; then
270
+ echo "Config ${train_mode} ${config_option}"
271
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_out
272
+ task_load_clap_emb=true
273
+ model_proj_type=6
274
+ model_clone_batch=4
275
+ dataset_batch_size=48
276
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 0 ]]; then
277
+ echo "Config ${train_mode} ${config_option}"
278
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
279
+ task_load_clap_emb=true
280
+ task_load_source_file=true
281
+ task_load_mel_file=false
282
+ model_proj_type=2
283
+ model_clone_batch=4
284
+ dataset_batch_size=64 # original 48 oom on 4090 24G change distributed_world_size
285
+ model_clap_loss=1.0
286
+ average_top_k_layers=11 # modify with model depth
287
+ model_add_conv=true
288
+ model_depth=11 #
289
+ checkpoint_keep_interval_updates=-1 # default 1
290
+ checkpoint_save_interval_updates=10000
291
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 1 ]]; then
292
+ echo "Config ${train_mode} ${config_option}"
293
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
294
+ task_load_clap_emb=true
295
+ task_load_source_file=true
296
+ task_load_mel_file=false
297
+ model_proj_type=2
298
+ model_clone_batch=4
299
+ dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
300
+ model_clap_loss=1.0
301
+ average_top_k_layers=12 # modify with model depth
302
+ model_add_conv=true
303
+ model_modalities_image_conv_option=1
304
+ model_depth=12 #
305
+ checkpoint_keep_interval_updates=1 # default 1
306
+ checkpoint_save_interval_updates=10000
307
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 2 ]]; then
308
+ echo "Config ${train_mode} ${config_option}"
309
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
310
+ task_load_clap_emb=true
311
+ task_load_source_file=true
312
+ task_load_mel_file=false
313
+ model_proj_type=2
314
+ model_clone_batch=4
315
+ dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
316
+ model_clap_loss=1.0
317
+ average_top_k_layers=12 # modify with model depth
318
+ model_add_conv=true
319
+ model_modalities_image_conv_option=2
320
+ model_depth=12 #
321
+ checkpoint_keep_interval_updates=1 # default 1
322
+ checkpoint_save_interval_updates=10000
323
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 3 ]]; then
324
+ echo "Config ${train_mode} ${config_option}"
325
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
326
+ task_load_clap_emb=true
327
+ task_load_source_file=true
328
+ task_load_mel_file=false
329
+ model_proj_type=2
330
+ model_clone_batch=4
331
+ dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
332
+ model_clap_loss=1.0
333
+ average_top_k_layers=12 # modify with model depth
334
+ model_add_conv=true
335
+ model_modalities_image_conv_option=3
336
+ model_depth=12 #
337
+ checkpoint_keep_interval_updates=1 # default 1
338
+ checkpoint_save_interval_updates=10000
339
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 4 ]]; then
340
+ echo "Config ${train_mode} ${config_option}"
341
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
342
+ task_load_clap_emb=true
343
+ task_load_source_file=true
344
+ task_load_mel_file=false
345
+ model_proj_type=2
346
+ model_clone_batch=4
347
+ dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
348
+ model_clap_loss=1.0
349
+ average_top_k_layers=12 # modify with model depth
350
+ model_add_conv=true
351
+ model_modalities_image_conv_option=4
352
+ model_depth=12 #
353
+ checkpoint_keep_interval_updates=1 # default 1
354
+ checkpoint_save_interval_updates=10000
355
+ fi
356
+
357
+ python fairseq_cli/hydra_train.py -m \
358
+ --config-dir ./EAT/config \
359
+ --config-name pretraining_AS2M \
360
+ common.user_dir=./EAT \
361
+ checkpoint.save_dir=${checkpoint_save_dir} \
362
+ checkpoint.restore_file=${checkpoint_restore_file} \
363
+ distributed_training.distributed_world_size=${1:-2} \
364
+ dataset.num_workers=24 \
365
+ dataset.data_buffer_size=48 \
366
+ dataset.batch_size=${dataset_batch_size} \
367
+ task.data=${task_data} \
368
+ task.h5_format=False \
369
+ task.load_clap_emb=${task_load_clap_emb} \
370
+ +task.load_source_file=${task_load_source_file} \
371
+ +task.load_mel_file=${task_load_mel_file} \
372
+ model.proj_type=${model_proj_type} \
373
+ model.clone_batch=${model_clone_batch} \
374
+ model.clap_loss=${model_clap_loss} \
375
+ model.average_top_k_layers=${average_top_k_layers} \
376
+ +model.add_conv=${model_add_conv} \
377
+ +model.clap_loss_type=${model_clap_loss_type} \
378
+ +model.clap_loss_layer=${model_clap_loss_layer} \
379
+ +model.dispersive_loss=${model_dispersive_loss} \
380
+ +model.dispersive_loss_layer=${model_dispersive_loss_layer} \
381
+ model.depth=${model_depth} \
382
+ +model.modalities.image.conv_option=${model_modalities_image_conv_option} \
383
+ checkpoint.keep_interval_updates=${checkpoint_keep_interval_updates} \
384
+ checkpoint.save_interval_updates=${checkpoint_save_interval_updates}
pre_4_AS2M/conv_clap_4_2025-09-30_07-42-31/pretraining_AS2M.sh ADDED
@@ -0,0 +1,384 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ # config options
3
+ train_mode=conv_clap
4
+ config_option=4
5
+ # change world size
6
+
7
+ # shared config
8
+ SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
9
+ checkpoint_save_dir=${SAVE_DIR_ROOT}/${train_mode}_${config_option}_$(date +"%Y-%m-%d_%H-%M-%S")
10
+ checkpoint_restore_file=${checkpoint_save_dir}/checkpoint_last.pt
11
+
12
+ # 脚本自身的绝对路径与文件名(解析符号链接)
13
+ script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
14
+ script_name="$(basename -- "$script_path")"
15
+ # 创建目录并拷贝(保留权限与时间戳)
16
+ mkdir -p -- "$checkpoint_save_dir"
17
+ cp -p -- "$script_path" "$checkpoint_save_dir/$script_name"
18
+ echo "script_path: ${script_path}"
19
+ echo "checkpoint_save_dir: ${checkpoint_save_dir}"
20
+
21
+ # default setting
22
+ model_clone_batch=4
23
+ dataset_batch_size=48
24
+ model_clap_loss=0
25
+ model_clap_loss_type="mse" # option ce cosine l1
26
+ model_clap_loss_layer=0
27
+ average_top_k_layers=12
28
+ model_add_conv=false
29
+ model_depth=12
30
+ model_dispersive_loss=0
31
+ model_dispersive_loss_layer=0
32
+ checkpoint_keep_interval_updates=1 # TODO change this parameter if need
33
+ checkpoint_save_interval_updates=10000
34
+ model_modalities_image_conv_option=0
35
+
36
+ if [[ $train_mode == "default" && ${config_option} -eq 0 ]]; then
37
+ echo "Config ${train_mode} ${config_option}"
38
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
39
+ task_load_clap_emb=false
40
+ task_load_source_file=true
41
+ task_load_mel_file=false
42
+ model_proj_type=null
43
+ model_clone_batch=4
44
+ dataset_batch_size=96
45
+ model_clap_loss=0
46
+ checkpoint_keep_interval_updates=-1
47
+ elif [[ $train_mode == "disp" && ${config_option} -eq 0 ]]; then
48
+ echo "Config ${train_mode} ${config_option}"
49
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
50
+ task_load_clap_emb=false
51
+ task_load_source_file=true
52
+ task_load_mel_file=false
53
+ model_proj_type=null
54
+ model_clone_batch=4
55
+ dataset_batch_size=96
56
+ model_dispersive_loss=1
57
+ model_dispersive_loss_layer=0
58
+ checkpoint_keep_interval_updates=1
59
+ elif [[ $train_mode == "disp" && ${config_option} -eq 1 ]]; then
60
+ echo "Config ${train_mode} ${config_option}"
61
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
62
+ task_load_clap_emb=false
63
+ task_load_source_file=true
64
+ task_load_mel_file=false
65
+ model_proj_type=null
66
+ model_clone_batch=1
67
+ dataset_batch_size=384
68
+ model_dispersive_loss=1
69
+ model_dispersive_loss_layer=0
70
+ checkpoint_keep_interval_updates=1
71
+ elif [[ $train_mode == "disp" && ${config_option} -eq 2 ]]; then
72
+ echo "Config ${train_mode} ${config_option}"
73
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
74
+ task_load_clap_emb=false
75
+ task_load_source_file=true
76
+ task_load_mel_file=false
77
+ model_proj_type=null
78
+ model_clone_batch=1
79
+ dataset_batch_size=384
80
+ model_dispersive_loss=10.0
81
+ model_dispersive_loss_layer=0
82
+ checkpoint_keep_interval_updates=1
83
+ elif [[ $train_mode == "disp" && ${config_option} -eq 3 ]]; then
84
+ echo "Config ${train_mode} ${config_option}"
85
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
86
+ task_load_clap_emb=false
87
+ task_load_source_file=true
88
+ task_load_mel_file=false
89
+ model_proj_type=null
90
+ model_clone_batch=1
91
+ dataset_batch_size=384
92
+ model_dispersive_loss=100.0
93
+ model_dispersive_loss_layer=0
94
+ checkpoint_keep_interval_updates=1
95
+ elif [[ $train_mode == "disp" && ${config_option} -eq 4 ]]; then
96
+ echo "Config ${train_mode} ${config_option}"
97
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
98
+ task_load_clap_emb=false
99
+ task_load_source_file=true
100
+ task_load_mel_file=false
101
+ model_proj_type=null
102
+ model_clone_batch=1
103
+ dataset_batch_size=384
104
+ model_dispersive_loss=10000.0
105
+ model_dispersive_loss_layer=0
106
+ checkpoint_keep_interval_updates=1
107
+ elif [[ $train_mode == "disp" && ${config_option} -eq 5 ]]; then
108
+ echo "Config ${train_mode} ${config_option}"
109
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
110
+ task_load_clap_emb=false
111
+ task_load_source_file=true
112
+ task_load_mel_file=false
113
+ model_proj_type=null
114
+ model_clone_batch=1
115
+ dataset_batch_size=384
116
+ model_dispersive_loss=1000.0
117
+ model_dispersive_loss_layer=0
118
+ checkpoint_keep_interval_updates=1
119
+ elif [[ $train_mode == "disp" && ${config_option} -eq 6 ]]; then
120
+ echo "Config ${train_mode} ${config_option}"
121
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
122
+ task_load_clap_emb=false
123
+ task_load_source_file=true
124
+ task_load_mel_file=false
125
+ model_proj_type=null
126
+ model_clone_batch=4
127
+ dataset_batch_size=96
128
+ model_dispersive_loss=1000.0
129
+ model_dispersive_loss_layer=10
130
+ checkpoint_keep_interval_updates=1
131
+ elif [[ $train_mode == "clap" && ${config_option} -eq 0 ]]; then
132
+ echo "Config ${train_mode} ${config_option}"
133
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
134
+ task_load_clap_emb=true
135
+ model_proj_type=2
136
+ model_clone_batch=4
137
+ dataset_batch_size=48
138
+ model_clap_loss=1.0
139
+ average_top_k_layers=12
140
+ model_add_conv=false
141
+ elif [[ $train_mode == "clap" && ${config_option} -eq 1 ]]; then
142
+ echo "Config ${train_mode} ${config_option}"
143
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
144
+ task_load_clap_emb=true
145
+ model_proj_type=2
146
+ model_clone_batch=4
147
+ dataset_batch_size=48
148
+ model_clap_loss=1.0
149
+ average_top_k_layers=1
150
+ # loss type ablation
151
+ elif [[ $train_mode == "clap" && ${config_option} -eq 2 ]]; then
152
+ echo "Config ${train_mode} ${config_option}"
153
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
154
+ task_load_clap_emb=true
155
+ model_proj_type=2
156
+ model_clone_batch=4
157
+ dataset_batch_size=48
158
+ model_clap_loss=1.0
159
+ average_top_k_layers=12
160
+ model_clap_loss_type="ce"
161
+ elif [[ $train_mode == "clap" && ${config_option} -eq 3 ]]; then
162
+ echo "Config ${train_mode} ${config_option}"
163
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
164
+ task_load_clap_emb=true
165
+ model_proj_type=2
166
+ model_clone_batch=4
167
+ dataset_batch_size=48
168
+ model_clap_loss=1.0
169
+ average_top_k_layers=12
170
+ model_clap_loss_type="l1"
171
+ elif [[ $train_mode == "clap" && ${config_option} -eq 4 ]]; then
172
+ echo "Config ${train_mode} ${config_option}"
173
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
174
+ task_load_clap_emb=true
175
+ model_proj_type=2
176
+ model_clone_batch=4
177
+ dataset_batch_size=96
178
+ model_clap_loss=1.0
179
+ average_top_k_layers=12
180
+ model_clap_loss_type="cosine"
181
+ # loss layer ablation
182
+ elif [[ $train_mode == "clap" && ${config_option} -eq 5 ]]; then
183
+ echo "Config ${train_mode} ${config_option}"
184
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
185
+ task_load_clap_emb=true
186
+ model_proj_type=2
187
+ model_clone_batch=4
188
+ dataset_batch_size=96
189
+ model_clap_loss=1.0
190
+ average_top_k_layers=12
191
+ model_clap_loss_type="mse"
192
+ model_clap_loss_layer=10
193
+ elif [[ $train_mode == "clap" && ${config_option} -eq 6 ]]; then
194
+ echo "Config ${train_mode} ${config_option}"
195
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
196
+ task_load_clap_emb=true
197
+ task_load_source_file=true
198
+ task_load_mel_file=false
199
+ model_proj_type=2
200
+ model_clone_batch=4
201
+ dataset_batch_size=96
202
+ model_clap_loss=1.0
203
+ average_top_k_layers=12
204
+ model_clap_loss_type="mse"
205
+ model_clap_loss_layer=8
206
+ elif [[ $train_mode == "clap" && ${config_option} -eq 7 ]]; then
207
+ echo "Config ${train_mode} ${config_option}"
208
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
209
+ task_load_clap_emb=true
210
+ task_load_source_file=true
211
+ task_load_mel_file=false
212
+ model_proj_type=2
213
+ model_clone_batch=4
214
+ dataset_batch_size=96
215
+ model_clap_loss=1.0
216
+ average_top_k_layers=12
217
+ model_clap_loss_type="mse"
218
+ model_clap_loss_layer=6
219
+ elif [[ $train_mode == "clap" && ${config_option} -eq 8 ]]; then
220
+ echo "Config ${train_mode} ${config_option}"
221
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
222
+ task_load_clap_emb=true
223
+ task_load_source_file=true
224
+ task_load_mel_file=false
225
+ model_proj_type=2
226
+ model_clone_batch=4
227
+ model_clap_loss=5.0
228
+ dataset_batch_size=96
229
+ average_top_k_layers=12
230
+ model_clap_loss_type="mse"
231
+ checkpoint_keep_interval_updates=-1
232
+ elif [[ $train_mode == "clap" && ${config_option} -eq 9 ]]; then
233
+ echo "Config ${train_mode} ${config_option}"
234
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
235
+ task_load_clap_emb=true
236
+ task_load_source_file=true
237
+ task_load_mel_file=false
238
+ model_proj_type=2
239
+ model_clone_batch=4
240
+ model_clap_loss=0.1
241
+ dataset_batch_size=96
242
+ average_top_k_layers=12
243
+ model_clap_loss_type="mse"
244
+ checkpoint_keep_interval_updates=-1
245
+ elif [[ $train_mode == "ast" && ${config_option} -eq 0 ]]; then
246
+ echo "Config ${train_mode} ${config_option}"
247
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
248
+ task_load_clap_emb=true
249
+ model_proj_type=4
250
+ model_clone_batch=4
251
+ model_clap_loss=1.0
252
+ dataset_batch_size=48
253
+ elif [[ $train_mode == "ast" && ${config_option} -eq 1 ]]; then
254
+ echo "Config ${train_mode} ${config_option}"
255
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
256
+ task_load_clap_emb=true
257
+ model_proj_type=4
258
+ model_clone_batch=4
259
+ model_clap_loss=0.001
260
+ dataset_batch_size=48
261
+ elif [[ $train_mode == "ast" && ${config_option} -eq 2 ]]; then
262
+ echo "Config ${train_mode} ${config_option}"
263
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
264
+ task_load_clap_emb=true
265
+ model_proj_type=4
266
+ model_clone_batch=4
267
+ model_clap_loss=0.01
268
+ dataset_batch_size=48
269
+ elif [[ $train_mode == "ast" && ${config_option} -eq 3 ]]; then
270
+ echo "Config ${train_mode} ${config_option}"
271
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_out
272
+ task_load_clap_emb=true
273
+ model_proj_type=6
274
+ model_clone_batch=4
275
+ dataset_batch_size=48
276
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 0 ]]; then
277
+ echo "Config ${train_mode} ${config_option}"
278
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
279
+ task_load_clap_emb=true
280
+ task_load_source_file=true
281
+ task_load_mel_file=false
282
+ model_proj_type=2
283
+ model_clone_batch=4
284
+ dataset_batch_size=64 # original 48 oom on 4090 24G change distributed_world_size
285
+ model_clap_loss=1.0
286
+ average_top_k_layers=11 # modify with model depth
287
+ model_add_conv=true
288
+ model_depth=11 #
289
+ checkpoint_keep_interval_updates=-1 # default 1
290
+ checkpoint_save_interval_updates=10000
291
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 1 ]]; then
292
+ echo "Config ${train_mode} ${config_option}"
293
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
294
+ task_load_clap_emb=true
295
+ task_load_source_file=true
296
+ task_load_mel_file=false
297
+ model_proj_type=2
298
+ model_clone_batch=4
299
+ dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
300
+ model_clap_loss=1.0
301
+ average_top_k_layers=12 # modify with model depth
302
+ model_add_conv=true
303
+ model_modalities_image_conv_option=1
304
+ model_depth=12 #
305
+ checkpoint_keep_interval_updates=1 # default 1
306
+ checkpoint_save_interval_updates=10000
307
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 2 ]]; then
308
+ echo "Config ${train_mode} ${config_option}"
309
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
310
+ task_load_clap_emb=true
311
+ task_load_source_file=true
312
+ task_load_mel_file=false
313
+ model_proj_type=2
314
+ model_clone_batch=4
315
+ dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
316
+ model_clap_loss=1.0
317
+ average_top_k_layers=12 # modify with model depth
318
+ model_add_conv=true
319
+ model_modalities_image_conv_option=2
320
+ model_depth=12 #
321
+ checkpoint_keep_interval_updates=1 # default 1
322
+ checkpoint_save_interval_updates=10000
323
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 3 ]]; then
324
+ echo "Config ${train_mode} ${config_option}"
325
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
326
+ task_load_clap_emb=true
327
+ task_load_source_file=true
328
+ task_load_mel_file=false
329
+ model_proj_type=2
330
+ model_clone_batch=4
331
+ dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
332
+ model_clap_loss=1.0
333
+ average_top_k_layers=12 # modify with model depth
334
+ model_add_conv=true
335
+ model_modalities_image_conv_option=3
336
+ model_depth=12 #
337
+ checkpoint_keep_interval_updates=1 # default 1
338
+ checkpoint_save_interval_updates=10000
339
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 4 ]]; then
340
+ echo "Config ${train_mode} ${config_option}"
341
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
342
+ task_load_clap_emb=true
343
+ task_load_source_file=true
344
+ task_load_mel_file=false
345
+ model_proj_type=2
346
+ model_clone_batch=4
347
+ dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
348
+ model_clap_loss=1.0
349
+ average_top_k_layers=12 # modify with model depth
350
+ model_add_conv=true
351
+ model_modalities_image_conv_option=4
352
+ model_depth=12 #
353
+ checkpoint_keep_interval_updates=1 # default 1
354
+ checkpoint_save_interval_updates=10000
355
+ fi
356
+
357
+ python fairseq_cli/hydra_train.py -m \
358
+ --config-dir ./EAT/config \
359
+ --config-name pretraining_AS2M \
360
+ common.user_dir=./EAT \
361
+ checkpoint.save_dir=${checkpoint_save_dir} \
362
+ checkpoint.restore_file=${checkpoint_restore_file} \
363
+ distributed_training.distributed_world_size=${1:-2} \
364
+ dataset.num_workers=24 \
365
+ dataset.data_buffer_size=48 \
366
+ dataset.batch_size=${dataset_batch_size} \
367
+ task.data=${task_data} \
368
+ task.h5_format=False \
369
+ task.load_clap_emb=${task_load_clap_emb} \
370
+ +task.load_source_file=${task_load_source_file} \
371
+ +task.load_mel_file=${task_load_mel_file} \
372
+ model.proj_type=${model_proj_type} \
373
+ model.clone_batch=${model_clone_batch} \
374
+ model.clap_loss=${model_clap_loss} \
375
+ model.average_top_k_layers=${average_top_k_layers} \
376
+ +model.add_conv=${model_add_conv} \
377
+ +model.clap_loss_type=${model_clap_loss_type} \
378
+ +model.clap_loss_layer=${model_clap_loss_layer} \
379
+ +model.dispersive_loss=${model_dispersive_loss} \
380
+ +model.dispersive_loss_layer=${model_dispersive_loss_layer} \
381
+ model.depth=${model_depth} \
382
+ +model.modalities.image.conv_option=${model_modalities_image_conv_option} \
383
+ checkpoint.keep_interval_updates=${checkpoint_keep_interval_updates} \
384
+ checkpoint.save_interval_updates=${checkpoint_save_interval_updates}
pre_4_AS2M/conv_clap_4_2025-09-30_07-45-39/pretraining_AS2M.sh ADDED
@@ -0,0 +1,384 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ # config options
3
+ train_mode=conv_clap
4
+ config_option=4
5
+ # change world size
6
+
7
+ # shared config
8
+ SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
9
+ checkpoint_save_dir=${SAVE_DIR_ROOT}/${train_mode}_${config_option}_$(date +"%Y-%m-%d_%H-%M-%S")
10
+ checkpoint_restore_file=${checkpoint_save_dir}/checkpoint_last.pt
11
+
12
+ # 脚本自身的绝对路径与文件名(解析符号链接)
13
+ script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
14
+ script_name="$(basename -- "$script_path")"
15
+ # 创建目录并拷贝(保留权限与时间戳)
16
+ mkdir -p -- "$checkpoint_save_dir"
17
+ cp -p -- "$script_path" "$checkpoint_save_dir/$script_name"
18
+ echo "script_path: ${script_path}"
19
+ echo "checkpoint_save_dir: ${checkpoint_save_dir}"
20
+
21
+ # default setting
22
+ model_clone_batch=4
23
+ dataset_batch_size=48
24
+ model_clap_loss=0
25
+ model_clap_loss_type="mse" # option ce cosine l1
26
+ model_clap_loss_layer=0
27
+ average_top_k_layers=12
28
+ model_add_conv=false
29
+ model_depth=12
30
+ model_dispersive_loss=0
31
+ model_dispersive_loss_layer=0
32
+ checkpoint_keep_interval_updates=1 # TODO change this parameter if need
33
+ checkpoint_save_interval_updates=10000
34
+ model_modalities_image_conv_option=0
35
+
36
+ if [[ $train_mode == "default" && ${config_option} -eq 0 ]]; then
37
+ echo "Config ${train_mode} ${config_option}"
38
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
39
+ task_load_clap_emb=false
40
+ task_load_source_file=true
41
+ task_load_mel_file=false
42
+ model_proj_type=null
43
+ model_clone_batch=4
44
+ dataset_batch_size=96
45
+ model_clap_loss=0
46
+ checkpoint_keep_interval_updates=-1
47
+ elif [[ $train_mode == "disp" && ${config_option} -eq 0 ]]; then
48
+ echo "Config ${train_mode} ${config_option}"
49
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
50
+ task_load_clap_emb=false
51
+ task_load_source_file=true
52
+ task_load_mel_file=false
53
+ model_proj_type=null
54
+ model_clone_batch=4
55
+ dataset_batch_size=96
56
+ model_dispersive_loss=1
57
+ model_dispersive_loss_layer=0
58
+ checkpoint_keep_interval_updates=1
59
+ elif [[ $train_mode == "disp" && ${config_option} -eq 1 ]]; then
60
+ echo "Config ${train_mode} ${config_option}"
61
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
62
+ task_load_clap_emb=false
63
+ task_load_source_file=true
64
+ task_load_mel_file=false
65
+ model_proj_type=null
66
+ model_clone_batch=1
67
+ dataset_batch_size=384
68
+ model_dispersive_loss=1
69
+ model_dispersive_loss_layer=0
70
+ checkpoint_keep_interval_updates=1
71
+ elif [[ $train_mode == "disp" && ${config_option} -eq 2 ]]; then
72
+ echo "Config ${train_mode} ${config_option}"
73
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
74
+ task_load_clap_emb=false
75
+ task_load_source_file=true
76
+ task_load_mel_file=false
77
+ model_proj_type=null
78
+ model_clone_batch=1
79
+ dataset_batch_size=384
80
+ model_dispersive_loss=10.0
81
+ model_dispersive_loss_layer=0
82
+ checkpoint_keep_interval_updates=1
83
+ elif [[ $train_mode == "disp" && ${config_option} -eq 3 ]]; then
84
+ echo "Config ${train_mode} ${config_option}"
85
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
86
+ task_load_clap_emb=false
87
+ task_load_source_file=true
88
+ task_load_mel_file=false
89
+ model_proj_type=null
90
+ model_clone_batch=1
91
+ dataset_batch_size=384
92
+ model_dispersive_loss=100.0
93
+ model_dispersive_loss_layer=0
94
+ checkpoint_keep_interval_updates=1
95
+ elif [[ $train_mode == "disp" && ${config_option} -eq 4 ]]; then
96
+ echo "Config ${train_mode} ${config_option}"
97
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
98
+ task_load_clap_emb=false
99
+ task_load_source_file=true
100
+ task_load_mel_file=false
101
+ model_proj_type=null
102
+ model_clone_batch=1
103
+ dataset_batch_size=384
104
+ model_dispersive_loss=10000.0
105
+ model_dispersive_loss_layer=0
106
+ checkpoint_keep_interval_updates=1
107
+ elif [[ $train_mode == "disp" && ${config_option} -eq 5 ]]; then
108
+ echo "Config ${train_mode} ${config_option}"
109
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
110
+ task_load_clap_emb=false
111
+ task_load_source_file=true
112
+ task_load_mel_file=false
113
+ model_proj_type=null
114
+ model_clone_batch=1
115
+ dataset_batch_size=384
116
+ model_dispersive_loss=1000.0
117
+ model_dispersive_loss_layer=0
118
+ checkpoint_keep_interval_updates=1
119
+ elif [[ $train_mode == "disp" && ${config_option} -eq 6 ]]; then
120
+ echo "Config ${train_mode} ${config_option}"
121
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
122
+ task_load_clap_emb=false
123
+ task_load_source_file=true
124
+ task_load_mel_file=false
125
+ model_proj_type=null
126
+ model_clone_batch=4
127
+ dataset_batch_size=96
128
+ model_dispersive_loss=1000.0
129
+ model_dispersive_loss_layer=10
130
+ checkpoint_keep_interval_updates=1
131
+ elif [[ $train_mode == "clap" && ${config_option} -eq 0 ]]; then
132
+ echo "Config ${train_mode} ${config_option}"
133
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
134
+ task_load_clap_emb=true
135
+ model_proj_type=2
136
+ model_clone_batch=4
137
+ dataset_batch_size=48
138
+ model_clap_loss=1.0
139
+ average_top_k_layers=12
140
+ model_add_conv=false
141
+ elif [[ $train_mode == "clap" && ${config_option} -eq 1 ]]; then
142
+ echo "Config ${train_mode} ${config_option}"
143
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
144
+ task_load_clap_emb=true
145
+ model_proj_type=2
146
+ model_clone_batch=4
147
+ dataset_batch_size=48
148
+ model_clap_loss=1.0
149
+ average_top_k_layers=1
150
+ # loss type ablation
151
+ elif [[ $train_mode == "clap" && ${config_option} -eq 2 ]]; then
152
+ echo "Config ${train_mode} ${config_option}"
153
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
154
+ task_load_clap_emb=true
155
+ model_proj_type=2
156
+ model_clone_batch=4
157
+ dataset_batch_size=48
158
+ model_clap_loss=1.0
159
+ average_top_k_layers=12
160
+ model_clap_loss_type="ce"
161
+ elif [[ $train_mode == "clap" && ${config_option} -eq 3 ]]; then
162
+ echo "Config ${train_mode} ${config_option}"
163
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
164
+ task_load_clap_emb=true
165
+ model_proj_type=2
166
+ model_clone_batch=4
167
+ dataset_batch_size=48
168
+ model_clap_loss=1.0
169
+ average_top_k_layers=12
170
+ model_clap_loss_type="l1"
171
+ elif [[ $train_mode == "clap" && ${config_option} -eq 4 ]]; then
172
+ echo "Config ${train_mode} ${config_option}"
173
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
174
+ task_load_clap_emb=true
175
+ model_proj_type=2
176
+ model_clone_batch=4
177
+ dataset_batch_size=96
178
+ model_clap_loss=1.0
179
+ average_top_k_layers=12
180
+ model_clap_loss_type="cosine"
181
+ # loss layer ablation
182
+ elif [[ $train_mode == "clap" && ${config_option} -eq 5 ]]; then
183
+ echo "Config ${train_mode} ${config_option}"
184
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
185
+ task_load_clap_emb=true
186
+ model_proj_type=2
187
+ model_clone_batch=4
188
+ dataset_batch_size=96
189
+ model_clap_loss=1.0
190
+ average_top_k_layers=12
191
+ model_clap_loss_type="mse"
192
+ model_clap_loss_layer=10
193
+ elif [[ $train_mode == "clap" && ${config_option} -eq 6 ]]; then
194
+ echo "Config ${train_mode} ${config_option}"
195
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
196
+ task_load_clap_emb=true
197
+ task_load_source_file=true
198
+ task_load_mel_file=false
199
+ model_proj_type=2
200
+ model_clone_batch=4
201
+ dataset_batch_size=96
202
+ model_clap_loss=1.0
203
+ average_top_k_layers=12
204
+ model_clap_loss_type="mse"
205
+ model_clap_loss_layer=8
206
+ elif [[ $train_mode == "clap" && ${config_option} -eq 7 ]]; then
207
+ echo "Config ${train_mode} ${config_option}"
208
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
209
+ task_load_clap_emb=true
210
+ task_load_source_file=true
211
+ task_load_mel_file=false
212
+ model_proj_type=2
213
+ model_clone_batch=4
214
+ dataset_batch_size=96
215
+ model_clap_loss=1.0
216
+ average_top_k_layers=12
217
+ model_clap_loss_type="mse"
218
+ model_clap_loss_layer=6
219
+ elif [[ $train_mode == "clap" && ${config_option} -eq 8 ]]; then
220
+ echo "Config ${train_mode} ${config_option}"
221
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
222
+ task_load_clap_emb=true
223
+ task_load_source_file=true
224
+ task_load_mel_file=false
225
+ model_proj_type=2
226
+ model_clone_batch=4
227
+ model_clap_loss=5.0
228
+ dataset_batch_size=96
229
+ average_top_k_layers=12
230
+ model_clap_loss_type="mse"
231
+ checkpoint_keep_interval_updates=-1
232
+ elif [[ $train_mode == "clap" && ${config_option} -eq 9 ]]; then
233
+ echo "Config ${train_mode} ${config_option}"
234
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
235
+ task_load_clap_emb=true
236
+ task_load_source_file=true
237
+ task_load_mel_file=false
238
+ model_proj_type=2
239
+ model_clone_batch=4
240
+ model_clap_loss=0.1
241
+ dataset_batch_size=96
242
+ average_top_k_layers=12
243
+ model_clap_loss_type="mse"
244
+ checkpoint_keep_interval_updates=-1
245
+ elif [[ $train_mode == "ast" && ${config_option} -eq 0 ]]; then
246
+ echo "Config ${train_mode} ${config_option}"
247
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
248
+ task_load_clap_emb=true
249
+ model_proj_type=4
250
+ model_clone_batch=4
251
+ model_clap_loss=1.0
252
+ dataset_batch_size=48
253
+ elif [[ $train_mode == "ast" && ${config_option} -eq 1 ]]; then
254
+ echo "Config ${train_mode} ${config_option}"
255
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
256
+ task_load_clap_emb=true
257
+ model_proj_type=4
258
+ model_clone_batch=4
259
+ model_clap_loss=0.001
260
+ dataset_batch_size=48
261
+ elif [[ $train_mode == "ast" && ${config_option} -eq 2 ]]; then
262
+ echo "Config ${train_mode} ${config_option}"
263
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
264
+ task_load_clap_emb=true
265
+ model_proj_type=4
266
+ model_clone_batch=4
267
+ model_clap_loss=0.01
268
+ dataset_batch_size=48
269
+ elif [[ $train_mode == "ast" && ${config_option} -eq 3 ]]; then
270
+ echo "Config ${train_mode} ${config_option}"
271
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_out
272
+ task_load_clap_emb=true
273
+ model_proj_type=6
274
+ model_clone_batch=4
275
+ dataset_batch_size=48
276
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 0 ]]; then
277
+ echo "Config ${train_mode} ${config_option}"
278
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
279
+ task_load_clap_emb=true
280
+ task_load_source_file=true
281
+ task_load_mel_file=false
282
+ model_proj_type=2
283
+ model_clone_batch=4
284
+ dataset_batch_size=64 # original 48 oom on 4090 24G change distributed_world_size
285
+ model_clap_loss=1.0
286
+ average_top_k_layers=11 # modify with model depth
287
+ model_add_conv=true
288
+ model_depth=11 #
289
+ checkpoint_keep_interval_updates=-1 # default 1
290
+ checkpoint_save_interval_updates=10000
291
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 1 ]]; then
292
+ echo "Config ${train_mode} ${config_option}"
293
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
294
+ task_load_clap_emb=true
295
+ task_load_source_file=true
296
+ task_load_mel_file=false
297
+ model_proj_type=2
298
+ model_clone_batch=4
299
+ dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
300
+ model_clap_loss=1.0
301
+ average_top_k_layers=12 # modify with model depth
302
+ model_add_conv=true
303
+ model_modalities_image_conv_option=1
304
+ model_depth=12 #
305
+ checkpoint_keep_interval_updates=1 # default 1
306
+ checkpoint_save_interval_updates=10000
307
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 2 ]]; then
308
+ echo "Config ${train_mode} ${config_option}"
309
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
310
+ task_load_clap_emb=true
311
+ task_load_source_file=true
312
+ task_load_mel_file=false
313
+ model_proj_type=2
314
+ model_clone_batch=4
315
+ dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
316
+ model_clap_loss=1.0
317
+ average_top_k_layers=12 # modify with model depth
318
+ model_add_conv=true
319
+ model_modalities_image_conv_option=2
320
+ model_depth=12 #
321
+ checkpoint_keep_interval_updates=1 # default 1
322
+ checkpoint_save_interval_updates=10000
323
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 3 ]]; then
324
+ echo "Config ${train_mode} ${config_option}"
325
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
326
+ task_load_clap_emb=true
327
+ task_load_source_file=true
328
+ task_load_mel_file=false
329
+ model_proj_type=2
330
+ model_clone_batch=4
331
+ dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
332
+ model_clap_loss=1.0
333
+ average_top_k_layers=12 # modify with model depth
334
+ model_add_conv=true
335
+ model_modalities_image_conv_option=3
336
+ model_depth=12 #
337
+ checkpoint_keep_interval_updates=1 # default 1
338
+ checkpoint_save_interval_updates=10000
339
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 4 ]]; then
340
+ echo "Config ${train_mode} ${config_option}"
341
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
342
+ task_load_clap_emb=true
343
+ task_load_source_file=true
344
+ task_load_mel_file=false
345
+ model_proj_type=2
346
+ model_clone_batch=4
347
+ dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
348
+ model_clap_loss=1.0
349
+ average_top_k_layers=12 # modify with model depth
350
+ model_add_conv=true
351
+ model_modalities_image_conv_option=4
352
+ model_depth=12 #
353
+ checkpoint_keep_interval_updates=1 # default 1
354
+ checkpoint_save_interval_updates=10000
355
+ fi
356
+
357
+ python fairseq_cli/hydra_train.py -m \
358
+ --config-dir ./EAT/config \
359
+ --config-name pretraining_AS2M \
360
+ common.user_dir=./EAT \
361
+ checkpoint.save_dir=${checkpoint_save_dir} \
362
+ checkpoint.restore_file=${checkpoint_restore_file} \
363
+ distributed_training.distributed_world_size=${1:-2} \
364
+ dataset.num_workers=24 \
365
+ dataset.data_buffer_size=48 \
366
+ dataset.batch_size=${dataset_batch_size} \
367
+ task.data=${task_data} \
368
+ task.h5_format=False \
369
+ task.load_clap_emb=${task_load_clap_emb} \
370
+ +task.load_source_file=${task_load_source_file} \
371
+ +task.load_mel_file=${task_load_mel_file} \
372
+ model.proj_type=${model_proj_type} \
373
+ model.clone_batch=${model_clone_batch} \
374
+ model.clap_loss=${model_clap_loss} \
375
+ model.average_top_k_layers=${average_top_k_layers} \
376
+ +model.add_conv=${model_add_conv} \
377
+ +model.clap_loss_type=${model_clap_loss_type} \
378
+ +model.clap_loss_layer=${model_clap_loss_layer} \
379
+ +model.dispersive_loss=${model_dispersive_loss} \
380
+ +model.dispersive_loss_layer=${model_dispersive_loss_layer} \
381
+ model.depth=${model_depth} \
382
+ +model.modalities.image.conv_option=${model_modalities_image_conv_option} \
383
+ checkpoint.keep_interval_updates=${checkpoint_keep_interval_updates} \
384
+ checkpoint.save_interval_updates=${checkpoint_save_interval_updates}
pre_4_AS2M/conv_clap_4_2025-09-30_07-49-28/pretraining_AS2M.sh ADDED
@@ -0,0 +1,384 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ # config options
3
+ train_mode=conv_clap
4
+ config_option=4
5
+ # change world size
6
+
7
+ # shared config
8
+ SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
9
+ checkpoint_save_dir=${SAVE_DIR_ROOT}/${train_mode}_${config_option}_$(date +"%Y-%m-%d_%H-%M-%S")
10
+ checkpoint_restore_file=${checkpoint_save_dir}/checkpoint_last.pt
11
+
12
+ # 脚本自身的绝对路径与文件名(解析符号链接)
13
+ script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
14
+ script_name="$(basename -- "$script_path")"
15
+ # 创建目录并拷贝(保留权限与时间戳)
16
+ mkdir -p -- "$checkpoint_save_dir"
17
+ cp -p -- "$script_path" "$checkpoint_save_dir/$script_name"
18
+ echo "script_path: ${script_path}"
19
+ echo "checkpoint_save_dir: ${checkpoint_save_dir}"
20
+
21
+ # default setting
22
+ model_clone_batch=4
23
+ dataset_batch_size=48
24
+ model_clap_loss=0
25
+ model_clap_loss_type="mse" # option ce cosine l1
26
+ model_clap_loss_layer=0
27
+ average_top_k_layers=12
28
+ model_add_conv=false
29
+ model_depth=12
30
+ model_dispersive_loss=0
31
+ model_dispersive_loss_layer=0
32
+ checkpoint_keep_interval_updates=1 # TODO change this parameter if need
33
+ checkpoint_save_interval_updates=10000
34
+ model_modalities_image_conv_option=0
35
+
36
+ if [[ $train_mode == "default" && ${config_option} -eq 0 ]]; then
37
+ echo "Config ${train_mode} ${config_option}"
38
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
39
+ task_load_clap_emb=false
40
+ task_load_source_file=true
41
+ task_load_mel_file=false
42
+ model_proj_type=null
43
+ model_clone_batch=4
44
+ dataset_batch_size=96
45
+ model_clap_loss=0
46
+ checkpoint_keep_interval_updates=-1
47
+ elif [[ $train_mode == "disp" && ${config_option} -eq 0 ]]; then
48
+ echo "Config ${train_mode} ${config_option}"
49
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
50
+ task_load_clap_emb=false
51
+ task_load_source_file=true
52
+ task_load_mel_file=false
53
+ model_proj_type=null
54
+ model_clone_batch=4
55
+ dataset_batch_size=96
56
+ model_dispersive_loss=1
57
+ model_dispersive_loss_layer=0
58
+ checkpoint_keep_interval_updates=1
59
+ elif [[ $train_mode == "disp" && ${config_option} -eq 1 ]]; then
60
+ echo "Config ${train_mode} ${config_option}"
61
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
62
+ task_load_clap_emb=false
63
+ task_load_source_file=true
64
+ task_load_mel_file=false
65
+ model_proj_type=null
66
+ model_clone_batch=1
67
+ dataset_batch_size=384
68
+ model_dispersive_loss=1
69
+ model_dispersive_loss_layer=0
70
+ checkpoint_keep_interval_updates=1
71
+ elif [[ $train_mode == "disp" && ${config_option} -eq 2 ]]; then
72
+ echo "Config ${train_mode} ${config_option}"
73
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
74
+ task_load_clap_emb=false
75
+ task_load_source_file=true
76
+ task_load_mel_file=false
77
+ model_proj_type=null
78
+ model_clone_batch=1
79
+ dataset_batch_size=384
80
+ model_dispersive_loss=10.0
81
+ model_dispersive_loss_layer=0
82
+ checkpoint_keep_interval_updates=1
83
+ elif [[ $train_mode == "disp" && ${config_option} -eq 3 ]]; then
84
+ echo "Config ${train_mode} ${config_option}"
85
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
86
+ task_load_clap_emb=false
87
+ task_load_source_file=true
88
+ task_load_mel_file=false
89
+ model_proj_type=null
90
+ model_clone_batch=1
91
+ dataset_batch_size=384
92
+ model_dispersive_loss=100.0
93
+ model_dispersive_loss_layer=0
94
+ checkpoint_keep_interval_updates=1
95
+ elif [[ $train_mode == "disp" && ${config_option} -eq 4 ]]; then
96
+ echo "Config ${train_mode} ${config_option}"
97
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
98
+ task_load_clap_emb=false
99
+ task_load_source_file=true
100
+ task_load_mel_file=false
101
+ model_proj_type=null
102
+ model_clone_batch=1
103
+ dataset_batch_size=384
104
+ model_dispersive_loss=10000.0
105
+ model_dispersive_loss_layer=0
106
+ checkpoint_keep_interval_updates=1
107
+ elif [[ $train_mode == "disp" && ${config_option} -eq 5 ]]; then
108
+ echo "Config ${train_mode} ${config_option}"
109
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
110
+ task_load_clap_emb=false
111
+ task_load_source_file=true
112
+ task_load_mel_file=false
113
+ model_proj_type=null
114
+ model_clone_batch=1
115
+ dataset_batch_size=384
116
+ model_dispersive_loss=1000.0
117
+ model_dispersive_loss_layer=0
118
+ checkpoint_keep_interval_updates=1
119
+ elif [[ $train_mode == "disp" && ${config_option} -eq 6 ]]; then
120
+ echo "Config ${train_mode} ${config_option}"
121
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
122
+ task_load_clap_emb=false
123
+ task_load_source_file=true
124
+ task_load_mel_file=false
125
+ model_proj_type=null
126
+ model_clone_batch=4
127
+ dataset_batch_size=96
128
+ model_dispersive_loss=1000.0
129
+ model_dispersive_loss_layer=10
130
+ checkpoint_keep_interval_updates=1
131
+ elif [[ $train_mode == "clap" && ${config_option} -eq 0 ]]; then
132
+ echo "Config ${train_mode} ${config_option}"
133
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
134
+ task_load_clap_emb=true
135
+ model_proj_type=2
136
+ model_clone_batch=4
137
+ dataset_batch_size=48
138
+ model_clap_loss=1.0
139
+ average_top_k_layers=12
140
+ model_add_conv=false
141
+ elif [[ $train_mode == "clap" && ${config_option} -eq 1 ]]; then
142
+ echo "Config ${train_mode} ${config_option}"
143
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
144
+ task_load_clap_emb=true
145
+ model_proj_type=2
146
+ model_clone_batch=4
147
+ dataset_batch_size=48
148
+ model_clap_loss=1.0
149
+ average_top_k_layers=1
150
+ # loss type ablation
151
+ elif [[ $train_mode == "clap" && ${config_option} -eq 2 ]]; then
152
+ echo "Config ${train_mode} ${config_option}"
153
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
154
+ task_load_clap_emb=true
155
+ model_proj_type=2
156
+ model_clone_batch=4
157
+ dataset_batch_size=48
158
+ model_clap_loss=1.0
159
+ average_top_k_layers=12
160
+ model_clap_loss_type="ce"
161
+ elif [[ $train_mode == "clap" && ${config_option} -eq 3 ]]; then
162
+ echo "Config ${train_mode} ${config_option}"
163
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
164
+ task_load_clap_emb=true
165
+ model_proj_type=2
166
+ model_clone_batch=4
167
+ dataset_batch_size=48
168
+ model_clap_loss=1.0
169
+ average_top_k_layers=12
170
+ model_clap_loss_type="l1"
171
+ elif [[ $train_mode == "clap" && ${config_option} -eq 4 ]]; then
172
+ echo "Config ${train_mode} ${config_option}"
173
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
174
+ task_load_clap_emb=true
175
+ model_proj_type=2
176
+ model_clone_batch=4
177
+ dataset_batch_size=96
178
+ model_clap_loss=1.0
179
+ average_top_k_layers=12
180
+ model_clap_loss_type="cosine"
181
+ # loss layer ablation
182
+ elif [[ $train_mode == "clap" && ${config_option} -eq 5 ]]; then
183
+ echo "Config ${train_mode} ${config_option}"
184
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
185
+ task_load_clap_emb=true
186
+ model_proj_type=2
187
+ model_clone_batch=4
188
+ dataset_batch_size=96
189
+ model_clap_loss=1.0
190
+ average_top_k_layers=12
191
+ model_clap_loss_type="mse"
192
+ model_clap_loss_layer=10
193
+ elif [[ $train_mode == "clap" && ${config_option} -eq 6 ]]; then
194
+ echo "Config ${train_mode} ${config_option}"
195
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
196
+ task_load_clap_emb=true
197
+ task_load_source_file=true
198
+ task_load_mel_file=false
199
+ model_proj_type=2
200
+ model_clone_batch=4
201
+ dataset_batch_size=96
202
+ model_clap_loss=1.0
203
+ average_top_k_layers=12
204
+ model_clap_loss_type="mse"
205
+ model_clap_loss_layer=8
206
+ elif [[ $train_mode == "clap" && ${config_option} -eq 7 ]]; then
207
+ echo "Config ${train_mode} ${config_option}"
208
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
209
+ task_load_clap_emb=true
210
+ task_load_source_file=true
211
+ task_load_mel_file=false
212
+ model_proj_type=2
213
+ model_clone_batch=4
214
+ dataset_batch_size=96
215
+ model_clap_loss=1.0
216
+ average_top_k_layers=12
217
+ model_clap_loss_type="mse"
218
+ model_clap_loss_layer=6
219
+ elif [[ $train_mode == "clap" && ${config_option} -eq 8 ]]; then
220
+ echo "Config ${train_mode} ${config_option}"
221
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
222
+ task_load_clap_emb=true
223
+ task_load_source_file=true
224
+ task_load_mel_file=false
225
+ model_proj_type=2
226
+ model_clone_batch=4
227
+ model_clap_loss=5.0
228
+ dataset_batch_size=96
229
+ average_top_k_layers=12
230
+ model_clap_loss_type="mse"
231
+ checkpoint_keep_interval_updates=-1
232
+ elif [[ $train_mode == "clap" && ${config_option} -eq 9 ]]; then
233
+ echo "Config ${train_mode} ${config_option}"
234
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
235
+ task_load_clap_emb=true
236
+ task_load_source_file=true
237
+ task_load_mel_file=false
238
+ model_proj_type=2
239
+ model_clone_batch=4
240
+ model_clap_loss=0.1
241
+ dataset_batch_size=96
242
+ average_top_k_layers=12
243
+ model_clap_loss_type="mse"
244
+ checkpoint_keep_interval_updates=-1
245
+ elif [[ $train_mode == "ast" && ${config_option} -eq 0 ]]; then
246
+ echo "Config ${train_mode} ${config_option}"
247
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
248
+ task_load_clap_emb=true
249
+ model_proj_type=4
250
+ model_clone_batch=4
251
+ model_clap_loss=1.0
252
+ dataset_batch_size=48
253
+ elif [[ $train_mode == "ast" && ${config_option} -eq 1 ]]; then
254
+ echo "Config ${train_mode} ${config_option}"
255
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
256
+ task_load_clap_emb=true
257
+ model_proj_type=4
258
+ model_clone_batch=4
259
+ model_clap_loss=0.001
260
+ dataset_batch_size=48
261
+ elif [[ $train_mode == "ast" && ${config_option} -eq 2 ]]; then
262
+ echo "Config ${train_mode} ${config_option}"
263
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
264
+ task_load_clap_emb=true
265
+ model_proj_type=4
266
+ model_clone_batch=4
267
+ model_clap_loss=0.01
268
+ dataset_batch_size=48
269
+ elif [[ $train_mode == "ast" && ${config_option} -eq 3 ]]; then
270
+ echo "Config ${train_mode} ${config_option}"
271
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_out
272
+ task_load_clap_emb=true
273
+ model_proj_type=6
274
+ model_clone_batch=4
275
+ dataset_batch_size=48
276
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 0 ]]; then
277
+ echo "Config ${train_mode} ${config_option}"
278
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
279
+ task_load_clap_emb=true
280
+ task_load_source_file=true
281
+ task_load_mel_file=false
282
+ model_proj_type=2
283
+ model_clone_batch=4
284
+ dataset_batch_size=64 # original 48 oom on 4090 24G change distributed_world_size
285
+ model_clap_loss=1.0
286
+ average_top_k_layers=11 # modify with model depth
287
+ model_add_conv=true
288
+ model_depth=11 #
289
+ checkpoint_keep_interval_updates=-1 # default 1
290
+ checkpoint_save_interval_updates=10000
291
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 1 ]]; then
292
+ echo "Config ${train_mode} ${config_option}"
293
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
294
+ task_load_clap_emb=true
295
+ task_load_source_file=true
296
+ task_load_mel_file=false
297
+ model_proj_type=2
298
+ model_clone_batch=4
299
+ dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
300
+ model_clap_loss=1.0
301
+ average_top_k_layers=12 # modify with model depth
302
+ model_add_conv=true
303
+ model_modalities_image_conv_option=1
304
+ model_depth=12 #
305
+ checkpoint_keep_interval_updates=1 # default 1
306
+ checkpoint_save_interval_updates=10000
307
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 2 ]]; then
308
+ echo "Config ${train_mode} ${config_option}"
309
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
310
+ task_load_clap_emb=true
311
+ task_load_source_file=true
312
+ task_load_mel_file=false
313
+ model_proj_type=2
314
+ model_clone_batch=4
315
+ dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
316
+ model_clap_loss=1.0
317
+ average_top_k_layers=12 # modify with model depth
318
+ model_add_conv=true
319
+ model_modalities_image_conv_option=2
320
+ model_depth=12 #
321
+ checkpoint_keep_interval_updates=1 # default 1
322
+ checkpoint_save_interval_updates=10000
323
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 3 ]]; then
324
+ echo "Config ${train_mode} ${config_option}"
325
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
326
+ task_load_clap_emb=true
327
+ task_load_source_file=true
328
+ task_load_mel_file=false
329
+ model_proj_type=2
330
+ model_clone_batch=4
331
+ dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
332
+ model_clap_loss=1.0
333
+ average_top_k_layers=12 # modify with model depth
334
+ model_add_conv=true
335
+ model_modalities_image_conv_option=3
336
+ model_depth=12 #
337
+ checkpoint_keep_interval_updates=1 # default 1
338
+ checkpoint_save_interval_updates=10000
339
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 4 ]]; then
340
+ echo "Config ${train_mode} ${config_option}"
341
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
342
+ task_load_clap_emb=true
343
+ task_load_source_file=true
344
+ task_load_mel_file=false
345
+ model_proj_type=2
346
+ model_clone_batch=4
347
+ dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
348
+ model_clap_loss=1.0
349
+ average_top_k_layers=12 # modify with model depth
350
+ model_add_conv=true
351
+ model_modalities_image_conv_option=4
352
+ model_depth=12 #
353
+ checkpoint_keep_interval_updates=1 # default 1
354
+ checkpoint_save_interval_updates=10000
355
+ fi
356
+
357
+ python fairseq_cli/hydra_train.py -m \
358
+ --config-dir ./EAT/config \
359
+ --config-name pretraining_AS2M \
360
+ common.user_dir=./EAT \
361
+ checkpoint.save_dir=${checkpoint_save_dir} \
362
+ checkpoint.restore_file=${checkpoint_restore_file} \
363
+ distributed_training.distributed_world_size=${1:-2} \
364
+ dataset.num_workers=24 \
365
+ dataset.data_buffer_size=48 \
366
+ dataset.batch_size=${dataset_batch_size} \
367
+ task.data=${task_data} \
368
+ task.h5_format=False \
369
+ task.load_clap_emb=${task_load_clap_emb} \
370
+ +task.load_source_file=${task_load_source_file} \
371
+ +task.load_mel_file=${task_load_mel_file} \
372
+ model.proj_type=${model_proj_type} \
373
+ model.clone_batch=${model_clone_batch} \
374
+ model.clap_loss=${model_clap_loss} \
375
+ model.average_top_k_layers=${average_top_k_layers} \
376
+ +model.add_conv=${model_add_conv} \
377
+ +model.clap_loss_type=${model_clap_loss_type} \
378
+ +model.clap_loss_layer=${model_clap_loss_layer} \
379
+ +model.dispersive_loss=${model_dispersive_loss} \
380
+ +model.dispersive_loss_layer=${model_dispersive_loss_layer} \
381
+ model.depth=${model_depth} \
382
+ +model.modalities.image.conv_option=${model_modalities_image_conv_option} \
383
+ checkpoint.keep_interval_updates=${checkpoint_keep_interval_updates} \
384
+ checkpoint.save_interval_updates=${checkpoint_save_interval_updates}
pre_4_AS2M/conv_clap_4_2025-09-30_07-57-18/pretraining_AS2M.sh ADDED
@@ -0,0 +1,384 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Experiment selection: train_mode and config_option together pick one of the
# configuration branches further down in this script.
train_mode=conv_clap
config_option=4
# change world size

# Shared settings: each run writes to its own timestamped directory and looks
# for checkpoint_last.pt inside that same directory.
SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
checkpoint_save_dir="${SAVE_DIR_ROOT}/${train_mode}_${config_option}_$(date +"%Y-%m-%d_%H-%M-%S")"
checkpoint_restore_file="${checkpoint_save_dir}/checkpoint_last.pt"

# Absolute path and file name of this script (symlinks resolved).
script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
script_name="$(basename -- "$script_path")"

# Create the run directory. Without it the run has nowhere to write, so a
# failure here stops the script instead of being ignored.
if ! mkdir -p -- "$checkpoint_save_dir"; then
  printf 'error: cannot create checkpoint directory: %s\n' "$checkpoint_save_dir" >&2
  exit 1
fi

# Keep a copy of this script next to the checkpoints (cp -p preserves mode and
# timestamps). The copy is only a record of the run, so a failure is reported
# but does not abort.
if ! cp -p -- "$script_path" "$checkpoint_save_dir/$script_name"; then
  printf 'warning: could not copy %s to %s\n' "$script_path" "$checkpoint_save_dir/$script_name" >&2
fi
echo "script_path: ${script_path}"
echo "checkpoint_save_dir: ${checkpoint_save_dir}"
20
+
21
# Default settings. A configuration branch below overrides only the values it
# needs; everything else reaches the launch command unchanged.
model_clone_batch=4
dataset_batch_size=48
model_clap_loss=0
model_clap_loss_type="mse" # other options: ce, cosine, l1
model_clap_loss_layer=0
average_top_k_layers=12
model_add_conv=false
model_depth=12
model_dispersive_loss=0
model_dispersive_loss_layer=0
checkpoint_keep_interval_updates=1 # TODO: change this parameter if needed
checkpoint_save_interval_updates=10000
model_modalities_image_conv_option=0

# The launch command at the end of this script always passes
# +task.load_source_file and +task.load_mel_file, but the "clap" 0-5 and
# "ast" 0-3 branches never assign them, which produced empty overrides.
# The fallbacks below are the values used by every branch that does set them.
# NOTE(review): confirm true/false is the intended fallback for those branches.
task_load_source_file=true
task_load_mel_file=false
35
+
36
# Select the experiment configuration named by train_mode and config_option.
# Each arm assigns the same variables, with the same values, as the matching
# branch of the former if/elif chain; the presets shared by several
# configurations live in the helpers below. An unknown combination now stops
# the script instead of silently launching training with unset variables.

# Common prefix of every task_data directory.
_data_root=/opt/gpfs/home/chushu/data/audioset/setting

# Preset shared by "default" and every "disp" configuration.
_use_plain_as2m() {
  task_data=${_data_root}/PRETRAIN_AS2M
  task_load_clap_emb=false
  task_load_source_file=true
  task_load_mel_file=false
  model_proj_type=null
}

# Preset shared by every "clap" and "conv_clap" configuration.
_use_clap_targets() {
  task_data=${_data_root}/PRETRAIN_AS2M_w_CLAP
  task_load_clap_emb=true
  model_proj_type=2
  model_clone_batch=4
}

# Preset shared by every "ast" configuration.
# $1 - sub-directory below PRETRAIN_AS2M_w_AST, $2 - model_proj_type
_use_ast_targets() {
  task_data=${_data_root}/PRETRAIN_AS2M_w_AST/$1
  task_load_clap_emb=true
  model_proj_type=$2
  model_clone_batch=4
  dataset_batch_size=48
}

# Loader flags that the "clap" 6-9 and all "conv_clap" configurations set.
_load_source_audio() {
  task_load_source_file=true
  task_load_mel_file=false
}

# "disp" configurations 1-5 differ only in the dispersive-loss value.
# $1 - model_dispersive_loss
_disp_clone_batch_1() {
  _use_plain_as2m
  model_clone_batch=1
  dataset_batch_size=384
  model_dispersive_loss=$1
  model_dispersive_loss_layer=0
  checkpoint_keep_interval_updates=1
}

# $(( ... )) normalises config_option the way the former "-eq" tests did
# (for example, an empty value counts as 0).
config_key="${train_mode}:$(( config_option ))"

case "${config_key}" in
  default:0)
    _use_plain_as2m
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=0
    checkpoint_keep_interval_updates=-1
    ;;
  disp:0)
    _use_plain_as2m
    model_clone_batch=4
    dataset_batch_size=96
    model_dispersive_loss=1
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
    ;;
  disp:1) _disp_clone_batch_1 1 ;;
  disp:2) _disp_clone_batch_1 10.0 ;;
  disp:3) _disp_clone_batch_1 100.0 ;;
  disp:4) _disp_clone_batch_1 10000.0 ;;
  disp:5) _disp_clone_batch_1 1000.0 ;;
  disp:6)
    _use_plain_as2m
    model_clone_batch=4
    dataset_batch_size=96
    model_dispersive_loss=1000.0
    model_dispersive_loss_layer=10
    checkpoint_keep_interval_updates=1
    ;;
  clap:0)
    _use_clap_targets
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=12
    model_add_conv=false
    ;;
  clap:1)
    _use_clap_targets
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=1
    ;;
  # Loss-type ablation.
  clap:2)
    _use_clap_targets
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="ce"
    ;;
  clap:3)
    _use_clap_targets
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="l1"
    ;;
  clap:4)
    _use_clap_targets
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="cosine"
    ;;
  # Loss-layer ablation.
  clap:5)
    _use_clap_targets
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="mse"
    model_clap_loss_layer=10
    ;;
  clap:6)
    _use_clap_targets
    _load_source_audio
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="mse"
    model_clap_loss_layer=8
    ;;
  clap:7)
    _use_clap_targets
    _load_source_audio
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="mse"
    model_clap_loss_layer=6
    ;;
  clap:8)
    _use_clap_targets
    _load_source_audio
    model_clap_loss=5.0
    dataset_batch_size=96
    average_top_k_layers=12
    model_clap_loss_type="mse"
    checkpoint_keep_interval_updates=-1
    ;;
  clap:9)
    _use_clap_targets
    _load_source_audio
    model_clap_loss=0.1
    dataset_batch_size=96
    average_top_k_layers=12
    model_clap_loss_type="mse"
    checkpoint_keep_interval_updates=-1
    ;;
  ast:0)
    _use_ast_targets mlp_head_in 4
    model_clap_loss=1.0
    ;;
  ast:1)
    _use_ast_targets mlp_head_in 4
    model_clap_loss=0.001
    ;;
  ast:2)
    _use_ast_targets mlp_head_in 4
    model_clap_loss=0.01
    ;;
  ast:3)
    # model_clap_loss keeps its default here, as in the original branch.
    _use_ast_targets mlp_head_out 6
    ;;
  conv_clap:0)
    _use_clap_targets
    _load_source_audio
    # Author's note: "original 48 oom on 4090 24G change
    # distributed_world_size" (exact meaning unclear - TODO confirm).
    dataset_batch_size=64
    model_clap_loss=1.0
    average_top_k_layers=11 # adjust together with model_depth
    model_add_conv=true
    model_depth=11
    checkpoint_keep_interval_updates=-1 # default is 1
    checkpoint_save_interval_updates=10000
    ;;
  conv_clap:[1-4])
    _use_clap_targets
    _load_source_audio
    # Author's note: "original 48 oom on 4090 24G change
    # distributed_world_size" (exact meaning unclear - TODO confirm).
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12 # adjust together with model_depth
    model_add_conv=true
    # In these four configurations the conv option equals the option number.
    model_modalities_image_conv_option=${config_key##*:}
    model_depth=12
    checkpoint_keep_interval_updates=1 # default is 1
    checkpoint_save_interval_updates=10000
    ;;
  *)
    printf 'error: no configuration for train_mode=%s config_option=%s\n' \
      "${train_mode}" "${config_option}" >&2
    exit 1
    ;;
esac

echo "Config ${train_mode} ${config_option}"
356
+
357
# Launch pre-training through fairseq's Hydra entry point.
# The optional first script argument sets
# distributed_training.distributed_world_size (default: 2).
# Every override is built as ONE quoted array element, so a value that is
# empty or contains whitespace can never be split into extra arguments.
# Overrides written with a leading "+" are kept exactly as the original
# command spelled them.
hydra_overrides=(
  "common.user_dir=./EAT"
  "checkpoint.save_dir=${checkpoint_save_dir}"
  "checkpoint.restore_file=${checkpoint_restore_file}"
  "distributed_training.distributed_world_size=${1:-2}"
  "dataset.num_workers=24"
  "dataset.data_buffer_size=48"
  "dataset.batch_size=${dataset_batch_size}"
  "task.data=${task_data}"
  "task.h5_format=False"
  "task.load_clap_emb=${task_load_clap_emb}"
  "+task.load_source_file=${task_load_source_file}"
  "+task.load_mel_file=${task_load_mel_file}"
  "model.proj_type=${model_proj_type}"
  "model.clone_batch=${model_clone_batch}"
  "model.clap_loss=${model_clap_loss}"
  "model.average_top_k_layers=${average_top_k_layers}"
  "+model.add_conv=${model_add_conv}"
  "+model.clap_loss_type=${model_clap_loss_type}"
  "+model.clap_loss_layer=${model_clap_loss_layer}"
  "+model.dispersive_loss=${model_dispersive_loss}"
  "+model.dispersive_loss_layer=${model_dispersive_loss_layer}"
  "model.depth=${model_depth}"
  "+model.modalities.image.conv_option=${model_modalities_image_conv_option}"
  "checkpoint.keep_interval_updates=${checkpoint_keep_interval_updates}"
  "checkpoint.save_interval_updates=${checkpoint_save_interval_updates}"
)

# This stays the last command, so the script exits with the trainer's status.
python fairseq_cli/hydra_train.py -m \
  --config-dir ./EAT/config \
  --config-name pretraining_AS2M \
  "${hydra_overrides[@]}"
pre_4_AS2M/conv_clap_4_2025-09-30_08-05-21/pretraining_AS2M.sh ADDED
@@ -0,0 +1,384 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Experiment selection: train_mode and config_option together pick one of the
# configuration branches further down in this script.
train_mode=conv_clap
config_option=4
# change world size

# Shared settings: each run writes to its own timestamped directory and looks
# for checkpoint_last.pt inside that same directory.
SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
checkpoint_save_dir="${SAVE_DIR_ROOT}/${train_mode}_${config_option}_$(date +"%Y-%m-%d_%H-%M-%S")"
checkpoint_restore_file="${checkpoint_save_dir}/checkpoint_last.pt"

# Absolute path and file name of this script (symlinks resolved).
script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
script_name="$(basename -- "$script_path")"

# Create the run directory. Without it the run has nowhere to write, so a
# failure here stops the script instead of being ignored.
if ! mkdir -p -- "$checkpoint_save_dir"; then
  printf 'error: cannot create checkpoint directory: %s\n' "$checkpoint_save_dir" >&2
  exit 1
fi

# Keep a copy of this script next to the checkpoints (cp -p preserves mode and
# timestamps). The copy is only a record of the run, so a failure is reported
# but does not abort.
if ! cp -p -- "$script_path" "$checkpoint_save_dir/$script_name"; then
  printf 'warning: could not copy %s to %s\n' "$script_path" "$checkpoint_save_dir/$script_name" >&2
fi
echo "script_path: ${script_path}"
echo "checkpoint_save_dir: ${checkpoint_save_dir}"
20
+
21
# Default settings. A configuration branch below overrides only the values it
# needs; everything else reaches the launch command unchanged.
model_clone_batch=4
dataset_batch_size=48
model_clap_loss=0
model_clap_loss_type="mse" # other options: ce, cosine, l1
model_clap_loss_layer=0
average_top_k_layers=12
model_add_conv=false
model_depth=12
model_dispersive_loss=0
model_dispersive_loss_layer=0
checkpoint_keep_interval_updates=1 # TODO: change this parameter if needed
checkpoint_save_interval_updates=10000
model_modalities_image_conv_option=0

# The launch command at the end of this script always passes
# +task.load_source_file and +task.load_mel_file, but the "clap" 0-5 and
# "ast" 0-3 branches never assign them, which produced empty overrides.
# The fallbacks below are the values used by every branch that does set them.
# NOTE(review): confirm true/false is the intended fallback for those branches.
task_load_source_file=true
task_load_mel_file=false
35
+
36
# Select the experiment configuration named by train_mode and config_option.
# Each arm assigns the same variables, with the same values, as the matching
# branch of the former if/elif chain; the presets shared by several
# configurations live in the helpers below. An unknown combination now stops
# the script instead of silently launching training with unset variables.

# Common prefix of every task_data directory.
_data_root=/opt/gpfs/home/chushu/data/audioset/setting

# Preset shared by "default" and every "disp" configuration.
_use_plain_as2m() {
  task_data=${_data_root}/PRETRAIN_AS2M
  task_load_clap_emb=false
  task_load_source_file=true
  task_load_mel_file=false
  model_proj_type=null
}

# Preset shared by every "clap" and "conv_clap" configuration.
_use_clap_targets() {
  task_data=${_data_root}/PRETRAIN_AS2M_w_CLAP
  task_load_clap_emb=true
  model_proj_type=2
  model_clone_batch=4
}

# Preset shared by every "ast" configuration.
# $1 - sub-directory below PRETRAIN_AS2M_w_AST, $2 - model_proj_type
_use_ast_targets() {
  task_data=${_data_root}/PRETRAIN_AS2M_w_AST/$1
  task_load_clap_emb=true
  model_proj_type=$2
  model_clone_batch=4
  dataset_batch_size=48
}

# Loader flags that the "clap" 6-9 and all "conv_clap" configurations set.
_load_source_audio() {
  task_load_source_file=true
  task_load_mel_file=false
}

# "disp" configurations 1-5 differ only in the dispersive-loss value.
# $1 - model_dispersive_loss
_disp_clone_batch_1() {
  _use_plain_as2m
  model_clone_batch=1
  dataset_batch_size=384
  model_dispersive_loss=$1
  model_dispersive_loss_layer=0
  checkpoint_keep_interval_updates=1
}

# $(( ... )) normalises config_option the way the former "-eq" tests did
# (for example, an empty value counts as 0).
config_key="${train_mode}:$(( config_option ))"

case "${config_key}" in
  default:0)
    _use_plain_as2m
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=0
    checkpoint_keep_interval_updates=-1
    ;;
  disp:0)
    _use_plain_as2m
    model_clone_batch=4
    dataset_batch_size=96
    model_dispersive_loss=1
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
    ;;
  disp:1) _disp_clone_batch_1 1 ;;
  disp:2) _disp_clone_batch_1 10.0 ;;
  disp:3) _disp_clone_batch_1 100.0 ;;
  disp:4) _disp_clone_batch_1 10000.0 ;;
  disp:5) _disp_clone_batch_1 1000.0 ;;
  disp:6)
    _use_plain_as2m
    model_clone_batch=4
    dataset_batch_size=96
    model_dispersive_loss=1000.0
    model_dispersive_loss_layer=10
    checkpoint_keep_interval_updates=1
    ;;
  clap:0)
    _use_clap_targets
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=12
    model_add_conv=false
    ;;
  clap:1)
    _use_clap_targets
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=1
    ;;
  # Loss-type ablation.
  clap:2)
    _use_clap_targets
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="ce"
    ;;
  clap:3)
    _use_clap_targets
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="l1"
    ;;
  clap:4)
    _use_clap_targets
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="cosine"
    ;;
  # Loss-layer ablation.
  clap:5)
    _use_clap_targets
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="mse"
    model_clap_loss_layer=10
    ;;
  clap:6)
    _use_clap_targets
    _load_source_audio
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="mse"
    model_clap_loss_layer=8
    ;;
  clap:7)
    _use_clap_targets
    _load_source_audio
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="mse"
    model_clap_loss_layer=6
    ;;
  clap:8)
    _use_clap_targets
    _load_source_audio
    model_clap_loss=5.0
    dataset_batch_size=96
    average_top_k_layers=12
    model_clap_loss_type="mse"
    checkpoint_keep_interval_updates=-1
    ;;
  clap:9)
    _use_clap_targets
    _load_source_audio
    model_clap_loss=0.1
    dataset_batch_size=96
    average_top_k_layers=12
    model_clap_loss_type="mse"
    checkpoint_keep_interval_updates=-1
    ;;
  ast:0)
    _use_ast_targets mlp_head_in 4
    model_clap_loss=1.0
    ;;
  ast:1)
    _use_ast_targets mlp_head_in 4
    model_clap_loss=0.001
    ;;
  ast:2)
    _use_ast_targets mlp_head_in 4
    model_clap_loss=0.01
    ;;
  ast:3)
    # model_clap_loss keeps its default here, as in the original branch.
    _use_ast_targets mlp_head_out 6
    ;;
  conv_clap:0)
    _use_clap_targets
    _load_source_audio
    # Author's note: "original 48 oom on 4090 24G change
    # distributed_world_size" (exact meaning unclear - TODO confirm).
    dataset_batch_size=64
    model_clap_loss=1.0
    average_top_k_layers=11 # adjust together with model_depth
    model_add_conv=true
    model_depth=11
    checkpoint_keep_interval_updates=-1 # default is 1
    checkpoint_save_interval_updates=10000
    ;;
  conv_clap:[1-4])
    _use_clap_targets
    _load_source_audio
    # Author's note: "original 48 oom on 4090 24G change
    # distributed_world_size" (exact meaning unclear - TODO confirm).
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12 # adjust together with model_depth
    model_add_conv=true
    # In these four configurations the conv option equals the option number.
    model_modalities_image_conv_option=${config_key##*:}
    model_depth=12
    checkpoint_keep_interval_updates=1 # default is 1
    checkpoint_save_interval_updates=10000
    ;;
  *)
    printf 'error: no configuration for train_mode=%s config_option=%s\n' \
      "${train_mode}" "${config_option}" >&2
    exit 1
    ;;
esac

echo "Config ${train_mode} ${config_option}"
356
+
357
# Launch pre-training through fairseq's Hydra entry point.
# The optional first script argument sets
# distributed_training.distributed_world_size (default: 2).
# Every override is built as ONE quoted array element, so a value that is
# empty or contains whitespace can never be split into extra arguments.
# Overrides written with a leading "+" are kept exactly as the original
# command spelled them.
hydra_overrides=(
  "common.user_dir=./EAT"
  "checkpoint.save_dir=${checkpoint_save_dir}"
  "checkpoint.restore_file=${checkpoint_restore_file}"
  "distributed_training.distributed_world_size=${1:-2}"
  "dataset.num_workers=24"
  "dataset.data_buffer_size=48"
  "dataset.batch_size=${dataset_batch_size}"
  "task.data=${task_data}"
  "task.h5_format=False"
  "task.load_clap_emb=${task_load_clap_emb}"
  "+task.load_source_file=${task_load_source_file}"
  "+task.load_mel_file=${task_load_mel_file}"
  "model.proj_type=${model_proj_type}"
  "model.clone_batch=${model_clone_batch}"
  "model.clap_loss=${model_clap_loss}"
  "model.average_top_k_layers=${average_top_k_layers}"
  "+model.add_conv=${model_add_conv}"
  "+model.clap_loss_type=${model_clap_loss_type}"
  "+model.clap_loss_layer=${model_clap_loss_layer}"
  "+model.dispersive_loss=${model_dispersive_loss}"
  "+model.dispersive_loss_layer=${model_dispersive_loss_layer}"
  "model.depth=${model_depth}"
  "+model.modalities.image.conv_option=${model_modalities_image_conv_option}"
  "checkpoint.keep_interval_updates=${checkpoint_keep_interval_updates}"
  "checkpoint.save_interval_updates=${checkpoint_save_interval_updates}"
)

# This stays the last command, so the script exits with the trainer's status.
python fairseq_cli/hydra_train.py -m \
  --config-dir ./EAT/config \
  --config-name pretraining_AS2M \
  "${hydra_overrides[@]}"
pre_4_AS2M/conv_clap_4_2025-09-30_08-13-17/pretraining_AS2M.sh ADDED
@@ -0,0 +1,384 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ # config options
3
+ train_mode=conv_clap
4
+ config_option=4
5
+ # world size: pass distributed_world_size as the first argument ($1); defaults to 2
6
+
7
+ # shared config
8
+ SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
9
+ checkpoint_save_dir=${SAVE_DIR_ROOT}/${train_mode}_${config_option}_$(date +"%Y-%m-%d_%H-%M-%S")
10
+ checkpoint_restore_file=${checkpoint_save_dir}/checkpoint_last.pt
11
+
12
+ # Absolute path and file name of this script (symlinks resolved)
13
+ script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
14
+ script_name="$(basename -- "$script_path")"
15
+ # Create the save directory and copy this script into it (preserving permissions and timestamps)
16
+ mkdir -p -- "$checkpoint_save_dir"
17
+ cp -p -- "$script_path" "$checkpoint_save_dir/$script_name"
18
+ echo "script_path: ${script_path}"
19
+ echo "checkpoint_save_dir: ${checkpoint_save_dir}"
20
+
21
+ # default setting
22
+ model_clone_batch=4
23
+ dataset_batch_size=48
24
+ model_clap_loss=0
25
+ model_clap_loss_type="mse" # option ce cosine l1
26
+ model_clap_loss_layer=0
27
+ average_top_k_layers=12
28
+ model_add_conv=false
29
+ model_depth=12
30
+ model_dispersive_loss=0
31
+ model_dispersive_loss_layer=0
32
+ checkpoint_keep_interval_updates=1 # TODO change this parameter if need
33
+ checkpoint_save_interval_updates=10000
34
+ model_modalities_image_conv_option=0
35
+
36
+ if [[ $train_mode == "default" && ${config_option} -eq 0 ]]; then
37
+ echo "Config ${train_mode} ${config_option}"
38
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
39
+ task_load_clap_emb=false
40
+ task_load_source_file=true
41
+ task_load_mel_file=false
42
+ model_proj_type=null
43
+ model_clone_batch=4
44
+ dataset_batch_size=96
45
+ model_clap_loss=0
46
+ checkpoint_keep_interval_updates=-1
47
+ elif [[ $train_mode == "disp" && ${config_option} -eq 0 ]]; then
48
+ echo "Config ${train_mode} ${config_option}"
49
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
50
+ task_load_clap_emb=false
51
+ task_load_source_file=true
52
+ task_load_mel_file=false
53
+ model_proj_type=null
54
+ model_clone_batch=4
55
+ dataset_batch_size=96
56
+ model_dispersive_loss=1
57
+ model_dispersive_loss_layer=0
58
+ checkpoint_keep_interval_updates=1
59
+ elif [[ $train_mode == "disp" && ${config_option} -eq 1 ]]; then
60
+ echo "Config ${train_mode} ${config_option}"
61
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
62
+ task_load_clap_emb=false
63
+ task_load_source_file=true
64
+ task_load_mel_file=false
65
+ model_proj_type=null
66
+ model_clone_batch=1
67
+ dataset_batch_size=384
68
+ model_dispersive_loss=1
69
+ model_dispersive_loss_layer=0
70
+ checkpoint_keep_interval_updates=1
71
+ elif [[ $train_mode == "disp" && ${config_option} -eq 2 ]]; then
72
+ echo "Config ${train_mode} ${config_option}"
73
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
74
+ task_load_clap_emb=false
75
+ task_load_source_file=true
76
+ task_load_mel_file=false
77
+ model_proj_type=null
78
+ model_clone_batch=1
79
+ dataset_batch_size=384
80
+ model_dispersive_loss=10.0
81
+ model_dispersive_loss_layer=0
82
+ checkpoint_keep_interval_updates=1
83
+ elif [[ $train_mode == "disp" && ${config_option} -eq 3 ]]; then
84
+ echo "Config ${train_mode} ${config_option}"
85
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
86
+ task_load_clap_emb=false
87
+ task_load_source_file=true
88
+ task_load_mel_file=false
89
+ model_proj_type=null
90
+ model_clone_batch=1
91
+ dataset_batch_size=384
92
+ model_dispersive_loss=100.0
93
+ model_dispersive_loss_layer=0
94
+ checkpoint_keep_interval_updates=1
95
+ elif [[ $train_mode == "disp" && ${config_option} -eq 4 ]]; then
96
+ echo "Config ${train_mode} ${config_option}"
97
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
98
+ task_load_clap_emb=false
99
+ task_load_source_file=true
100
+ task_load_mel_file=false
101
+ model_proj_type=null
102
+ model_clone_batch=1
103
+ dataset_batch_size=384
104
+ model_dispersive_loss=10000.0
105
+ model_dispersive_loss_layer=0
106
+ checkpoint_keep_interval_updates=1
107
+ elif [[ $train_mode == "disp" && ${config_option} -eq 5 ]]; then
108
+ echo "Config ${train_mode} ${config_option}"
109
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
110
+ task_load_clap_emb=false
111
+ task_load_source_file=true
112
+ task_load_mel_file=false
113
+ model_proj_type=null
114
+ model_clone_batch=1
115
+ dataset_batch_size=384
116
+ model_dispersive_loss=1000.0
117
+ model_dispersive_loss_layer=0
118
+ checkpoint_keep_interval_updates=1
119
+ elif [[ $train_mode == "disp" && ${config_option} -eq 6 ]]; then
120
+ echo "Config ${train_mode} ${config_option}"
121
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
122
+ task_load_clap_emb=false
123
+ task_load_source_file=true
124
+ task_load_mel_file=false
125
+ model_proj_type=null
126
+ model_clone_batch=4
127
+ dataset_batch_size=96
128
+ model_dispersive_loss=1000.0
129
+ model_dispersive_loss_layer=10
130
+ checkpoint_keep_interval_updates=1
131
+ elif [[ $train_mode == "clap" && ${config_option} -eq 0 ]]; then
132
+ echo "Config ${train_mode} ${config_option}"
133
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
134
+ task_load_clap_emb=true
135
+ model_proj_type=2
136
+ model_clone_batch=4
137
+ dataset_batch_size=48
138
+ model_clap_loss=1.0
139
+ average_top_k_layers=12
140
+ model_add_conv=false
141
+ elif [[ $train_mode == "clap" && ${config_option} -eq 1 ]]; then
142
+ echo "Config ${train_mode} ${config_option}"
143
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
144
+ task_load_clap_emb=true
145
+ model_proj_type=2
146
+ model_clone_batch=4
147
+ dataset_batch_size=48
148
+ model_clap_loss=1.0
149
+ average_top_k_layers=1
150
+ # loss type ablation
151
+ elif [[ $train_mode == "clap" && ${config_option} -eq 2 ]]; then
152
+ echo "Config ${train_mode} ${config_option}"
153
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
154
+ task_load_clap_emb=true
155
+ model_proj_type=2
156
+ model_clone_batch=4
157
+ dataset_batch_size=48
158
+ model_clap_loss=1.0
159
+ average_top_k_layers=12
160
+ model_clap_loss_type="ce"
161
+ elif [[ $train_mode == "clap" && ${config_option} -eq 3 ]]; then
162
+ echo "Config ${train_mode} ${config_option}"
163
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
164
+ task_load_clap_emb=true
165
+ model_proj_type=2
166
+ model_clone_batch=4
167
+ dataset_batch_size=48
168
+ model_clap_loss=1.0
169
+ average_top_k_layers=12
170
+ model_clap_loss_type="l1"
171
+ elif [[ $train_mode == "clap" && ${config_option} -eq 4 ]]; then
172
+ echo "Config ${train_mode} ${config_option}"
173
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
174
+ task_load_clap_emb=true
175
+ model_proj_type=2
176
+ model_clone_batch=4
177
+ dataset_batch_size=96
178
+ model_clap_loss=1.0
179
+ average_top_k_layers=12
180
+ model_clap_loss_type="cosine"
181
+ # loss layer ablation
182
+ elif [[ $train_mode == "clap" && ${config_option} -eq 5 ]]; then
183
+ echo "Config ${train_mode} ${config_option}"
184
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
185
+ task_load_clap_emb=true
186
+ model_proj_type=2
187
+ model_clone_batch=4
188
+ dataset_batch_size=96
189
+ model_clap_loss=1.0
190
+ average_top_k_layers=12
191
+ model_clap_loss_type="mse"
192
+ model_clap_loss_layer=10
193
+ elif [[ $train_mode == "clap" && ${config_option} -eq 6 ]]; then
194
+ echo "Config ${train_mode} ${config_option}"
195
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
196
+ task_load_clap_emb=true
197
+ task_load_source_file=true
198
+ task_load_mel_file=false
199
+ model_proj_type=2
200
+ model_clone_batch=4
201
+ dataset_batch_size=96
202
+ model_clap_loss=1.0
203
+ average_top_k_layers=12
204
+ model_clap_loss_type="mse"
205
+ model_clap_loss_layer=8
206
+ elif [[ $train_mode == "clap" && ${config_option} -eq 7 ]]; then
207
+ echo "Config ${train_mode} ${config_option}"
208
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
209
+ task_load_clap_emb=true
210
+ task_load_source_file=true
211
+ task_load_mel_file=false
212
+ model_proj_type=2
213
+ model_clone_batch=4
214
+ dataset_batch_size=96
215
+ model_clap_loss=1.0
216
+ average_top_k_layers=12
217
+ model_clap_loss_type="mse"
218
+ model_clap_loss_layer=6
219
+ elif [[ $train_mode == "clap" && ${config_option} -eq 8 ]]; then
220
+ echo "Config ${train_mode} ${config_option}"
221
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
222
+ task_load_clap_emb=true
223
+ task_load_source_file=true
224
+ task_load_mel_file=false
225
+ model_proj_type=2
226
+ model_clone_batch=4
227
+ model_clap_loss=5.0
228
+ dataset_batch_size=96
229
+ average_top_k_layers=12
230
+ model_clap_loss_type="mse"
231
+ checkpoint_keep_interval_updates=-1
232
+ elif [[ $train_mode == "clap" && ${config_option} -eq 9 ]]; then
233
+ echo "Config ${train_mode} ${config_option}"
234
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
235
+ task_load_clap_emb=true
236
+ task_load_source_file=true
237
+ task_load_mel_file=false
238
+ model_proj_type=2
239
+ model_clone_batch=4
240
+ model_clap_loss=0.1
241
+ dataset_batch_size=96
242
+ average_top_k_layers=12
243
+ model_clap_loss_type="mse"
244
+ checkpoint_keep_interval_updates=-1
245
+ elif [[ $train_mode == "ast" && ${config_option} -eq 0 ]]; then
246
+ echo "Config ${train_mode} ${config_option}"
247
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
248
+ task_load_clap_emb=true
249
+ model_proj_type=4
250
+ model_clone_batch=4
251
+ model_clap_loss=1.0
252
+ dataset_batch_size=48
253
+ elif [[ $train_mode == "ast" && ${config_option} -eq 1 ]]; then
254
+ echo "Config ${train_mode} ${config_option}"
255
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
256
+ task_load_clap_emb=true
257
+ model_proj_type=4
258
+ model_clone_batch=4
259
+ model_clap_loss=0.001
260
+ dataset_batch_size=48
261
+ elif [[ $train_mode == "ast" && ${config_option} -eq 2 ]]; then
262
+ echo "Config ${train_mode} ${config_option}"
263
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
264
+ task_load_clap_emb=true
265
+ model_proj_type=4
266
+ model_clone_batch=4
267
+ model_clap_loss=0.01
268
+ dataset_batch_size=48
269
+ elif [[ $train_mode == "ast" && ${config_option} -eq 3 ]]; then
270
+ echo "Config ${train_mode} ${config_option}"
271
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_out
272
+ task_load_clap_emb=true
273
+ model_proj_type=6
274
+ model_clone_batch=4
275
+ dataset_batch_size=48
276
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 0 ]]; then
277
+ echo "Config ${train_mode} ${config_option}"
278
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
279
+ task_load_clap_emb=true
280
+ task_load_source_file=true
281
+ task_load_mel_file=false
282
+ model_proj_type=2
283
+ model_clone_batch=4
284
+ dataset_batch_size=64 # original 48 oom on 4090 24G change distributed_world_size
285
+ model_clap_loss=1.0
286
+ average_top_k_layers=11 # modify with model depth
287
+ model_add_conv=true
288
+ model_depth=11 #
289
+ checkpoint_keep_interval_updates=-1 # default 1
290
+ checkpoint_save_interval_updates=10000
291
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 1 ]]; then
292
+ echo "Config ${train_mode} ${config_option}"
293
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
294
+ task_load_clap_emb=true
295
+ task_load_source_file=true
296
+ task_load_mel_file=false
297
+ model_proj_type=2
298
+ model_clone_batch=4
299
+ dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
300
+ model_clap_loss=1.0
301
+ average_top_k_layers=12 # modify with model depth
302
+ model_add_conv=true
303
+ model_modalities_image_conv_option=1
304
+ model_depth=12 #
305
+ checkpoint_keep_interval_updates=1 # default 1
306
+ checkpoint_save_interval_updates=10000
307
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 2 ]]; then
308
+ echo "Config ${train_mode} ${config_option}"
309
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
310
+ task_load_clap_emb=true
311
+ task_load_source_file=true
312
+ task_load_mel_file=false
313
+ model_proj_type=2
314
+ model_clone_batch=4
315
+ dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
316
+ model_clap_loss=1.0
317
+ average_top_k_layers=12 # modify with model depth
318
+ model_add_conv=true
319
+ model_modalities_image_conv_option=2
320
+ model_depth=12 #
321
+ checkpoint_keep_interval_updates=1 # default 1
322
+ checkpoint_save_interval_updates=10000
323
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 3 ]]; then
324
+ echo "Config ${train_mode} ${config_option}"
325
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
326
+ task_load_clap_emb=true
327
+ task_load_source_file=true
328
+ task_load_mel_file=false
329
+ model_proj_type=2
330
+ model_clone_batch=4
331
+ dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
332
+ model_clap_loss=1.0
333
+ average_top_k_layers=12 # modify with model depth
334
+ model_add_conv=true
335
+ model_modalities_image_conv_option=3
336
+ model_depth=12 #
337
+ checkpoint_keep_interval_updates=1 # default 1
338
+ checkpoint_save_interval_updates=10000
339
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 4 ]]; then
340
+ echo "Config ${train_mode} ${config_option}"
341
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
342
+ task_load_clap_emb=true
343
+ task_load_source_file=true
344
+ task_load_mel_file=false
345
+ model_proj_type=2
346
+ model_clone_batch=4
347
+ dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
348
+ model_clap_loss=1.0
349
+ average_top_k_layers=12 # modify with model depth
350
+ model_add_conv=true
351
+ model_modalities_image_conv_option=4
352
+ model_depth=12 #
353
+ checkpoint_keep_interval_updates=1 # default 1
354
+ checkpoint_save_interval_updates=10000
355
+ fi
356
+
357
+ python fairseq_cli/hydra_train.py -m \
358
+ --config-dir ./EAT/config \
359
+ --config-name pretraining_AS2M \
360
+ common.user_dir=./EAT \
361
+ checkpoint.save_dir=${checkpoint_save_dir} \
362
+ checkpoint.restore_file=${checkpoint_restore_file} \
363
+ distributed_training.distributed_world_size=${1:-2} \
364
+ dataset.num_workers=24 \
365
+ dataset.data_buffer_size=48 \
366
+ dataset.batch_size=${dataset_batch_size} \
367
+ task.data=${task_data} \
368
+ task.h5_format=False \
369
+ task.load_clap_emb=${task_load_clap_emb} \
370
+ +task.load_source_file=${task_load_source_file} \
371
+ +task.load_mel_file=${task_load_mel_file} \
372
+ model.proj_type=${model_proj_type} \
373
+ model.clone_batch=${model_clone_batch} \
374
+ model.clap_loss=${model_clap_loss} \
375
+ model.average_top_k_layers=${average_top_k_layers} \
376
+ +model.add_conv=${model_add_conv} \
377
+ +model.clap_loss_type=${model_clap_loss_type} \
378
+ +model.clap_loss_layer=${model_clap_loss_layer} \
379
+ +model.dispersive_loss=${model_dispersive_loss} \
380
+ +model.dispersive_loss_layer=${model_dispersive_loss_layer} \
381
+ model.depth=${model_depth} \
382
+ +model.modalities.image.conv_option=${model_modalities_image_conv_option} \
383
+ checkpoint.keep_interval_updates=${checkpoint_keep_interval_updates} \
384
+ checkpoint.save_interval_updates=${checkpoint_save_interval_updates}
pre_4_AS2M/conv_clap_4_2025-09-30_08-23-09/pretraining_AS2M.sh ADDED
@@ -0,0 +1,384 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ # config options
3
+ train_mode=conv_clap
4
+ config_option=4
5
+ # world size: pass distributed_world_size as the first argument ($1); defaults to 2
6
+
7
+ # shared config
8
+ SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
9
+ checkpoint_save_dir=${SAVE_DIR_ROOT}/${train_mode}_${config_option}_$(date +"%Y-%m-%d_%H-%M-%S")
10
+ checkpoint_restore_file=${checkpoint_save_dir}/checkpoint_last.pt
11
+
12
+ # Absolute path and file name of this script (symlinks resolved)
13
+ script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
14
+ script_name="$(basename -- "$script_path")"
15
+ # Create the save directory and copy this script into it (preserving permissions and timestamps)
16
+ mkdir -p -- "$checkpoint_save_dir"
17
+ cp -p -- "$script_path" "$checkpoint_save_dir/$script_name"
18
+ echo "script_path: ${script_path}"
19
+ echo "checkpoint_save_dir: ${checkpoint_save_dir}"
20
+
21
+ # default setting
22
+ model_clone_batch=4
23
+ dataset_batch_size=48
24
+ model_clap_loss=0
25
+ model_clap_loss_type="mse" # option ce cosine l1
26
+ model_clap_loss_layer=0
27
+ average_top_k_layers=12
28
+ model_add_conv=false
29
+ model_depth=12
30
+ model_dispersive_loss=0
31
+ model_dispersive_loss_layer=0
32
+ checkpoint_keep_interval_updates=1 # TODO change this parameter if need
33
+ checkpoint_save_interval_updates=10000
34
+ model_modalities_image_conv_option=0
35
+
36
+ if [[ $train_mode == "default" && ${config_option} -eq 0 ]]; then
37
+ echo "Config ${train_mode} ${config_option}"
38
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
39
+ task_load_clap_emb=false
40
+ task_load_source_file=true
41
+ task_load_mel_file=false
42
+ model_proj_type=null
43
+ model_clone_batch=4
44
+ dataset_batch_size=96
45
+ model_clap_loss=0
46
+ checkpoint_keep_interval_updates=-1
47
+ elif [[ $train_mode == "disp" && ${config_option} -eq 0 ]]; then
48
+ echo "Config ${train_mode} ${config_option}"
49
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
50
+ task_load_clap_emb=false
51
+ task_load_source_file=true
52
+ task_load_mel_file=false
53
+ model_proj_type=null
54
+ model_clone_batch=4
55
+ dataset_batch_size=96
56
+ model_dispersive_loss=1
57
+ model_dispersive_loss_layer=0
58
+ checkpoint_keep_interval_updates=1
59
+ elif [[ $train_mode == "disp" && ${config_option} -eq 1 ]]; then
60
+ echo "Config ${train_mode} ${config_option}"
61
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
62
+ task_load_clap_emb=false
63
+ task_load_source_file=true
64
+ task_load_mel_file=false
65
+ model_proj_type=null
66
+ model_clone_batch=1
67
+ dataset_batch_size=384
68
+ model_dispersive_loss=1
69
+ model_dispersive_loss_layer=0
70
+ checkpoint_keep_interval_updates=1
71
+ elif [[ $train_mode == "disp" && ${config_option} -eq 2 ]]; then
72
+ echo "Config ${train_mode} ${config_option}"
73
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
74
+ task_load_clap_emb=false
75
+ task_load_source_file=true
76
+ task_load_mel_file=false
77
+ model_proj_type=null
78
+ model_clone_batch=1
79
+ dataset_batch_size=384
80
+ model_dispersive_loss=10.0
81
+ model_dispersive_loss_layer=0
82
+ checkpoint_keep_interval_updates=1
83
+ elif [[ $train_mode == "disp" && ${config_option} -eq 3 ]]; then
84
+ echo "Config ${train_mode} ${config_option}"
85
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
86
+ task_load_clap_emb=false
87
+ task_load_source_file=true
88
+ task_load_mel_file=false
89
+ model_proj_type=null
90
+ model_clone_batch=1
91
+ dataset_batch_size=384
92
+ model_dispersive_loss=100.0
93
+ model_dispersive_loss_layer=0
94
+ checkpoint_keep_interval_updates=1
95
+ elif [[ $train_mode == "disp" && ${config_option} -eq 4 ]]; then
96
+ echo "Config ${train_mode} ${config_option}"
97
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
98
+ task_load_clap_emb=false
99
+ task_load_source_file=true
100
+ task_load_mel_file=false
101
+ model_proj_type=null
102
+ model_clone_batch=1
103
+ dataset_batch_size=384
104
+ model_dispersive_loss=10000.0
105
+ model_dispersive_loss_layer=0
106
+ checkpoint_keep_interval_updates=1
107
+ elif [[ $train_mode == "disp" && ${config_option} -eq 5 ]]; then
108
+ echo "Config ${train_mode} ${config_option}"
109
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
110
+ task_load_clap_emb=false
111
+ task_load_source_file=true
112
+ task_load_mel_file=false
113
+ model_proj_type=null
114
+ model_clone_batch=1
115
+ dataset_batch_size=384
116
+ model_dispersive_loss=1000.0
117
+ model_dispersive_loss_layer=0
118
+ checkpoint_keep_interval_updates=1
119
+ elif [[ $train_mode == "disp" && ${config_option} -eq 6 ]]; then
120
+ echo "Config ${train_mode} ${config_option}"
121
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
122
+ task_load_clap_emb=false
123
+ task_load_source_file=true
124
+ task_load_mel_file=false
125
+ model_proj_type=null
126
+ model_clone_batch=4
127
+ dataset_batch_size=96
128
+ model_dispersive_loss=1000.0
129
+ model_dispersive_loss_layer=10
130
+ checkpoint_keep_interval_updates=1
131
+ elif [[ $train_mode == "clap" && ${config_option} -eq 0 ]]; then
132
+ echo "Config ${train_mode} ${config_option}"
133
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
134
+ task_load_clap_emb=true
135
+ model_proj_type=2
136
+ model_clone_batch=4
137
+ dataset_batch_size=48
138
+ model_clap_loss=1.0
139
+ average_top_k_layers=12
140
+ model_add_conv=false
141
+ elif [[ $train_mode == "clap" && ${config_option} -eq 1 ]]; then
142
+ echo "Config ${train_mode} ${config_option}"
143
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
144
+ task_load_clap_emb=true
145
+ model_proj_type=2
146
+ model_clone_batch=4
147
+ dataset_batch_size=48
148
+ model_clap_loss=1.0
149
+ average_top_k_layers=1
150
+ # loss type ablation
151
+ elif [[ $train_mode == "clap" && ${config_option} -eq 2 ]]; then
152
+ echo "Config ${train_mode} ${config_option}"
153
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
154
+ task_load_clap_emb=true
155
+ model_proj_type=2
156
+ model_clone_batch=4
157
+ dataset_batch_size=48
158
+ model_clap_loss=1.0
159
+ average_top_k_layers=12
160
+ model_clap_loss_type="ce"
161
+ elif [[ $train_mode == "clap" && ${config_option} -eq 3 ]]; then
162
+ echo "Config ${train_mode} ${config_option}"
163
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
164
+ task_load_clap_emb=true
165
+ model_proj_type=2
166
+ model_clone_batch=4
167
+ dataset_batch_size=48
168
+ model_clap_loss=1.0
169
+ average_top_k_layers=12
170
+ model_clap_loss_type="l1"
171
+ elif [[ $train_mode == "clap" && ${config_option} -eq 4 ]]; then
172
+ echo "Config ${train_mode} ${config_option}"
173
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
174
+ task_load_clap_emb=true
175
+ model_proj_type=2
176
+ model_clone_batch=4
177
+ dataset_batch_size=96
178
+ model_clap_loss=1.0
179
+ average_top_k_layers=12
180
+ model_clap_loss_type="cosine"
181
+ # loss layer ablation
182
+ elif [[ $train_mode == "clap" && ${config_option} -eq 5 ]]; then
183
+ echo "Config ${train_mode} ${config_option}"
184
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
185
+ task_load_clap_emb=true
186
+ model_proj_type=2
187
+ model_clone_batch=4
188
+ dataset_batch_size=96
189
+ model_clap_loss=1.0
190
+ average_top_k_layers=12
191
+ model_clap_loss_type="mse"
192
+ model_clap_loss_layer=10
193
+ elif [[ $train_mode == "clap" && ${config_option} -eq 6 ]]; then
194
+ echo "Config ${train_mode} ${config_option}"
195
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
196
+ task_load_clap_emb=true
197
+ task_load_source_file=true
198
+ task_load_mel_file=false
199
+ model_proj_type=2
200
+ model_clone_batch=4
201
+ dataset_batch_size=96
202
+ model_clap_loss=1.0
203
+ average_top_k_layers=12
204
+ model_clap_loss_type="mse"
205
+ model_clap_loss_layer=8
206
+ elif [[ $train_mode == "clap" && ${config_option} -eq 7 ]]; then
207
+ echo "Config ${train_mode} ${config_option}"
208
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
209
+ task_load_clap_emb=true
210
+ task_load_source_file=true
211
+ task_load_mel_file=false
212
+ model_proj_type=2
213
+ model_clone_batch=4
214
+ dataset_batch_size=96
215
+ model_clap_loss=1.0
216
+ average_top_k_layers=12
217
+ model_clap_loss_type="mse"
218
+ model_clap_loss_layer=6
219
+ elif [[ $train_mode == "clap" && ${config_option} -eq 8 ]]; then
220
+ echo "Config ${train_mode} ${config_option}"
221
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
222
+ task_load_clap_emb=true
223
+ task_load_source_file=true
224
+ task_load_mel_file=false
225
+ model_proj_type=2
226
+ model_clone_batch=4
227
+ model_clap_loss=5.0
228
+ dataset_batch_size=96
229
+ average_top_k_layers=12
230
+ model_clap_loss_type="mse"
231
+ checkpoint_keep_interval_updates=-1
232
+ elif [[ $train_mode == "clap" && ${config_option} -eq 9 ]]; then
233
+ echo "Config ${train_mode} ${config_option}"
234
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
235
+ task_load_clap_emb=true
236
+ task_load_source_file=true
237
+ task_load_mel_file=false
238
+ model_proj_type=2
239
+ model_clone_batch=4
240
+ model_clap_loss=0.1
241
+ dataset_batch_size=96
242
+ average_top_k_layers=12
243
+ model_clap_loss_type="mse"
244
+ checkpoint_keep_interval_updates=-1
245
+ elif [[ $train_mode == "ast" && ${config_option} -eq 0 ]]; then
246
+ echo "Config ${train_mode} ${config_option}"
247
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
248
+ task_load_clap_emb=true
249
+ model_proj_type=4
250
+ model_clone_batch=4
251
+ model_clap_loss=1.0
252
+ dataset_batch_size=48
253
+ elif [[ $train_mode == "ast" && ${config_option} -eq 1 ]]; then
254
+ echo "Config ${train_mode} ${config_option}"
255
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
256
+ task_load_clap_emb=true
257
+ model_proj_type=4
258
+ model_clone_batch=4
259
+ model_clap_loss=0.001
260
+ dataset_batch_size=48
261
+ elif [[ $train_mode == "ast" && ${config_option} -eq 2 ]]; then
262
+ echo "Config ${train_mode} ${config_option}"
263
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
264
+ task_load_clap_emb=true
265
+ model_proj_type=4
266
+ model_clone_batch=4
267
+ model_clap_loss=0.01
268
+ dataset_batch_size=48
269
+ elif [[ $train_mode == "ast" && ${config_option} -eq 3 ]]; then
270
+ echo "Config ${train_mode} ${config_option}"
271
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_out
272
+ task_load_clap_emb=true
273
+ model_proj_type=6
274
+ model_clone_batch=4
275
+ dataset_batch_size=48
276
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 0 ]]; then
277
+ echo "Config ${train_mode} ${config_option}"
278
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
279
+ task_load_clap_emb=true
280
+ task_load_source_file=true
281
+ task_load_mel_file=false
282
+ model_proj_type=2
283
+ model_clone_batch=4
284
+ dataset_batch_size=64 # original 48 oom on 4090 24G change distributed_world_size
285
+ model_clap_loss=1.0
286
+ average_top_k_layers=11 # modify with model depth
287
+ model_add_conv=true
288
+ model_depth=11 #
289
+ checkpoint_keep_interval_updates=-1 # default 1
290
+ checkpoint_save_interval_updates=10000
291
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 1 ]]; then
292
+ echo "Config ${train_mode} ${config_option}"
293
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
294
+ task_load_clap_emb=true
295
+ task_load_source_file=true
296
+ task_load_mel_file=false
297
+ model_proj_type=2
298
+ model_clone_batch=4
299
+ dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
300
+ model_clap_loss=1.0
301
+ average_top_k_layers=12 # modify with model depth
302
+ model_add_conv=true
303
+ model_modalities_image_conv_option=1
304
+ model_depth=12 #
305
+ checkpoint_keep_interval_updates=1 # default 1
306
+ checkpoint_save_interval_updates=10000
307
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 2 ]]; then
308
+ echo "Config ${train_mode} ${config_option}"
309
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
310
+ task_load_clap_emb=true
311
+ task_load_source_file=true
312
+ task_load_mel_file=false
313
+ model_proj_type=2
314
+ model_clone_batch=4
315
+ dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
316
+ model_clap_loss=1.0
317
+ average_top_k_layers=12 # modify with model depth
318
+ model_add_conv=true
319
+ model_modalities_image_conv_option=2
320
+ model_depth=12 #
321
+ checkpoint_keep_interval_updates=1 # default 1
322
+ checkpoint_save_interval_updates=10000
323
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 3 ]]; then
324
+ echo "Config ${train_mode} ${config_option}"
325
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
326
+ task_load_clap_emb=true
327
+ task_load_source_file=true
328
+ task_load_mel_file=false
329
+ model_proj_type=2
330
+ model_clone_batch=4
331
+ dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
332
+ model_clap_loss=1.0
333
+ average_top_k_layers=12 # modify with model depth
334
+ model_add_conv=true
335
+ model_modalities_image_conv_option=3
336
+ model_depth=12 #
337
+ checkpoint_keep_interval_updates=1 # default 1
338
+ checkpoint_save_interval_updates=10000
339
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 4 ]]; then
340
+ echo "Config ${train_mode} ${config_option}"
341
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
342
+ task_load_clap_emb=true
343
+ task_load_source_file=true
344
+ task_load_mel_file=false
345
+ model_proj_type=2
346
+ model_clone_batch=4
347
+ dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
348
+ model_clap_loss=1.0
349
+ average_top_k_layers=12 # modify with model depth
350
+ model_add_conv=true
351
+ model_modalities_image_conv_option=4
352
+ model_depth=12 #
353
+ checkpoint_keep_interval_updates=1 # default 1
354
+ checkpoint_save_interval_updates=10000
355
+ fi
356
+
357
+ python fairseq_cli/hydra_train.py -m \
358
+ --config-dir ./EAT/config \
359
+ --config-name pretraining_AS2M \
360
+ common.user_dir=./EAT \
361
+ checkpoint.save_dir=${checkpoint_save_dir} \
362
+ checkpoint.restore_file=${checkpoint_restore_file} \
363
+ distributed_training.distributed_world_size=${1:-2} \
364
+ dataset.num_workers=24 \
365
+ dataset.data_buffer_size=48 \
366
+ dataset.batch_size=${dataset_batch_size} \
367
+ task.data=${task_data} \
368
+ task.h5_format=False \
369
+ task.load_clap_emb=${task_load_clap_emb} \
370
+ +task.load_source_file=${task_load_source_file} \
371
+ +task.load_mel_file=${task_load_mel_file} \
372
+ model.proj_type=${model_proj_type} \
373
+ model.clone_batch=${model_clone_batch} \
374
+ model.clap_loss=${model_clap_loss} \
375
+ model.average_top_k_layers=${average_top_k_layers} \
376
+ +model.add_conv=${model_add_conv} \
377
+ +model.clap_loss_type=${model_clap_loss_type} \
378
+ +model.clap_loss_layer=${model_clap_loss_layer} \
379
+ +model.dispersive_loss=${model_dispersive_loss} \
380
+ +model.dispersive_loss_layer=${model_dispersive_loss_layer} \
381
+ model.depth=${model_depth} \
382
+ +model.modalities.image.conv_option=${model_modalities_image_conv_option} \
383
+ checkpoint.keep_interval_updates=${checkpoint_keep_interval_updates} \
384
+ checkpoint.save_interval_updates=${checkpoint_save_interval_updates}
pre_4_AS2M/disp_0_2025-09-24_13-58-24/pretraining_AS2M.sh ADDED
@@ -0,0 +1,246 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ # config options
3
+ train_mode=disp
4
+ config_option=0
5
+ # change world size
6
+
7
+ # shared config
8
+ SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
9
+ checkpoint_save_dir=${SAVE_DIR_ROOT}/${train_mode}_${config_option}_$(date +"%Y-%m-%d_%H-%M-%S")
10
+ checkpoint_restore_file=${checkpoint_save_dir}/checkpoint_last.pt
11
+
12
+ # 脚本自身的绝对路径与文件名(解析符号链接)
13
+ script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
14
+ script_name="$(basename -- "$script_path")"
15
+ # 创建目录并拷贝(保留权限与时间戳)
16
+ mkdir -p -- "$checkpoint_save_dir"
17
+ cp -p -- "$script_path" "$checkpoint_save_dir/$script_name"
18
+ echo "script_path: ${script_path}"
19
+ echo "checkpoint_save_dir: ${checkpoint_save_dir}"
20
+
21
+ # default setting
22
+ model_clone_batch=4
23
+ dataset_batch_size=48
24
+ model_clap_loss=0
25
+ model_clap_loss_type="mse" # option ce cosine l1
26
+ model_clap_loss_layer=0
27
+ average_top_k_layers=12
28
+ model_add_conv=false
29
+ model_depth=12
30
+ model_dispersive_loss=0
31
+ model_dispersive_loss_layer=0
32
+ checkpoint_keep_interval_updates=1 # TODO change this parameter if need
33
+ checkpoint_save_interval_updates=10000
34
+
35
+ if [[ $train_mode == "default" && ${config_option} -eq 0 ]]; then
36
+ echo "Config ${train_mode} ${config_option}"
37
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
38
+ task_load_clap_emb=false
39
+ task_load_source_file=true
40
+ task_load_mel_file=false
41
+ model_proj_type=null
42
+ model_clone_batch=4
43
+ dataset_batch_size=96
44
+ model_clap_loss=0
45
+ checkpoint_keep_interval_updates=-1
46
+ elif [[ $train_mode == "disp" && ${config_option} -eq 0 ]]; then
47
+ echo "Config ${train_mode} ${config_option}"
48
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
49
+ task_load_clap_emb=false
50
+ task_load_source_file=true
51
+ task_load_mel_file=false
52
+ model_proj_type=null
53
+ model_clone_batch=4
54
+ dataset_batch_size=96
55
+ model_dispersive_loss=1
56
+ model_dispersive_loss_layer=0
57
+ checkpoint_keep_interval_updates=1
58
+ elif [[ $train_mode == "clap" && ${config_option} -eq 0 ]]; then
59
+ echo "Config ${train_mode} ${config_option}"
60
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
61
+ task_load_clap_emb=true
62
+ model_proj_type=2
63
+ model_clone_batch=4
64
+ dataset_batch_size=48
65
+ model_clap_loss=1.0
66
+ average_top_k_layers=12
67
+ model_add_conv=false
68
+ elif [[ $train_mode == "clap" && ${config_option} -eq 1 ]]; then
69
+ echo "Config ${train_mode} ${config_option}"
70
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
71
+ task_load_clap_emb=true
72
+ model_proj_type=2
73
+ model_clone_batch=4
74
+ dataset_batch_size=48
75
+ model_clap_loss=1.0
76
+ average_top_k_layers=1
77
+ # loss type ablation
78
+ elif [[ $train_mode == "clap" && ${config_option} -eq 2 ]]; then
79
+ echo "Config ${train_mode} ${config_option}"
80
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
81
+ task_load_clap_emb=true
82
+ model_proj_type=2
83
+ model_clone_batch=4
84
+ dataset_batch_size=48
85
+ model_clap_loss=1.0
86
+ average_top_k_layers=12
87
+ model_clap_loss_type="ce"
88
+ elif [[ $train_mode == "clap" && ${config_option} -eq 3 ]]; then
89
+ echo "Config ${train_mode} ${config_option}"
90
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
91
+ task_load_clap_emb=true
92
+ model_proj_type=2
93
+ model_clone_batch=4
94
+ dataset_batch_size=48
95
+ model_clap_loss=1.0
96
+ average_top_k_layers=12
97
+ model_clap_loss_type="l1"
98
+ elif [[ $train_mode == "clap" && ${config_option} -eq 4 ]]; then
99
+ echo "Config ${train_mode} ${config_option}"
100
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
101
+ task_load_clap_emb=true
102
+ model_proj_type=2
103
+ model_clone_batch=4
104
+ dataset_batch_size=96
105
+ model_clap_loss=1.0
106
+ average_top_k_layers=12
107
+ model_clap_loss_type="cosine"
108
+ # loss layer ablation
109
+ elif [[ $train_mode == "clap" && ${config_option} -eq 5 ]]; then
110
+ echo "Config ${train_mode} ${config_option}"
111
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
112
+ task_load_clap_emb=true
113
+ model_proj_type=2
114
+ model_clone_batch=4
115
+ dataset_batch_size=96
116
+ model_clap_loss=1.0
117
+ average_top_k_layers=12
118
+ model_clap_loss_type="mse"
119
+ model_clap_loss_layer=10
120
+ elif [[ $train_mode == "clap" && ${config_option} -eq 6 ]]; then
121
+ echo "Config ${train_mode} ${config_option}"
122
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
123
+ task_load_clap_emb=true
124
+ task_load_source_file=true
125
+ task_load_mel_file=false
126
+ model_proj_type=2
127
+ model_clone_batch=4
128
+ dataset_batch_size=96
129
+ model_clap_loss=1.0
130
+ average_top_k_layers=12
131
+ model_clap_loss_type="mse"
132
+ model_clap_loss_layer=8
133
+ elif [[ $train_mode == "clap" && ${config_option} -eq 7 ]]; then
134
+ echo "Config ${train_mode} ${config_option}"
135
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
136
+ task_load_clap_emb=true
137
+ task_load_source_file=true
138
+ task_load_mel_file=false
139
+ model_proj_type=2
140
+ model_clone_batch=4
141
+ dataset_batch_size=96
142
+ model_clap_loss=1.0
143
+ average_top_k_layers=12
144
+ model_clap_loss_type="mse"
145
+ model_clap_loss_layer=6
146
+ elif [[ $train_mode == "clap" && ${config_option} -eq 8 ]]; then
147
+ echo "Config ${train_mode} ${config_option}"
148
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
149
+ task_load_clap_emb=true
150
+ task_load_source_file=true
151
+ task_load_mel_file=false
152
+ model_proj_type=2
153
+ model_clone_batch=4
154
+ model_clap_loss=5.0
155
+ dataset_batch_size=96
156
+ average_top_k_layers=12
157
+ model_clap_loss_type="mse"
158
+ checkpoint_keep_interval_updates=-1
159
+ elif [[ $train_mode == "clap" && ${config_option} -eq 9 ]]; then
160
+ echo "Config ${train_mode} ${config_option}"
161
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
162
+ task_load_clap_emb=true
163
+ task_load_source_file=true
164
+ task_load_mel_file=false
165
+ model_proj_type=2
166
+ model_clone_batch=4
167
+ model_clap_loss=0.1
168
+ dataset_batch_size=96
169
+ average_top_k_layers=12
170
+ model_clap_loss_type="mse"
171
+ checkpoint_keep_interval_updates=-1
172
+ elif [[ $train_mode == "ast" && ${config_option} -eq 0 ]]; then
173
+ echo "Config ${train_mode} ${config_option}"
174
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
175
+ task_load_clap_emb=true
176
+ model_proj_type=4
177
+ model_clone_batch=4
178
+ model_clap_loss=1.0
179
+ dataset_batch_size=48
180
+ elif [[ $train_mode == "ast" && ${config_option} -eq 1 ]]; then
181
+ echo "Config ${train_mode} ${config_option}"
182
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
183
+ task_load_clap_emb=true
184
+ model_proj_type=4
185
+ model_clone_batch=4
186
+ model_clap_loss=0.001
187
+ dataset_batch_size=48
188
+ elif [[ $train_mode == "ast" && ${config_option} -eq 2 ]]; then
189
+ echo "Config ${train_mode} ${config_option}"
190
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
191
+ task_load_clap_emb=true
192
+ model_proj_type=4
193
+ model_clone_batch=4
194
+ model_clap_loss=0.01
195
+ dataset_batch_size=48
196
+ elif [[ $train_mode == "ast" && ${config_option} -eq 3 ]]; then
197
+ echo "Config ${train_mode} ${config_option}"
198
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_out
199
+ task_load_clap_emb=true
200
+ model_proj_type=6
201
+ model_clone_batch=4
202
+ dataset_batch_size=48
203
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 0 ]]; then
204
+ echo "Config ${train_mode} ${config_option}"
205
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
206
+ task_load_clap_emb=true
207
+ task_load_source_file=true
208
+ task_load_mel_file=false
209
+ model_proj_type=2
210
+ model_clone_batch=4
211
+ dataset_batch_size=64 # original 48 oom on 4090 24G change distributed_world_size
212
+ model_clap_loss=1.0
213
+ average_top_k_layers=11 # modify with model depth
214
+ model_add_conv=true
215
+ model_depth=11 #
216
+ checkpoint_keep_interval_updates=-1 # default 1
217
+ checkpoint_save_interval_updates=10000
218
+ fi
219
+
220
+ python fairseq_cli/hydra_train.py -m \
221
+ --config-dir ./EAT/config \
222
+ --config-name pretraining_AS2M \
223
+ common.user_dir=./EAT \
224
+ checkpoint.save_dir=${checkpoint_save_dir} \
225
+ checkpoint.restore_file=${checkpoint_restore_file} \
226
+ distributed_training.distributed_world_size=${1:-2} \
227
+ dataset.num_workers=24 \
228
+ dataset.data_buffer_size=48 \
229
+ dataset.batch_size=${dataset_batch_size} \
230
+ task.data=${task_data} \
231
+ task.h5_format=False \
232
+ task.load_clap_emb=${task_load_clap_emb} \
233
+ +task.load_source_file=${task_load_source_file} \
234
+ +task.load_mel_file=${task_load_mel_file} \
235
+ model.proj_type=${model_proj_type} \
236
+ model.clone_batch=${model_clone_batch} \
237
+ model.clap_loss=${model_clap_loss} \
238
+ model.average_top_k_layers=${average_top_k_layers} \
239
+ +model.add_conv=${model_add_conv} \
240
+ +model.clap_loss_type=${model_clap_loss_type} \
241
+ +model.clap_loss_layer=${model_clap_loss_layer} \
242
+ +model.dispersive_loss=${model_dispersive_loss} \
243
+ +model.dispersive_loss_layer=${model_dispersive_loss_layer} \
244
+ model.depth=${model_depth} \
245
+ checkpoint.keep_interval_updates=${checkpoint_keep_interval_updates} \
246
+ checkpoint.save_interval_updates=${checkpoint_save_interval_updates}
pre_4_AS2M/disp_0_2025-09-24_14-09-31/pretraining_AS2M.sh ADDED
@@ -0,0 +1,246 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ # config options
3
+ train_mode=disp
4
+ config_option=0
5
+ # change world size
6
+
7
+ # shared config
8
+ SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
9
+ checkpoint_save_dir=${SAVE_DIR_ROOT}/${train_mode}_${config_option}_$(date +"%Y-%m-%d_%H-%M-%S")
10
+ checkpoint_restore_file=${checkpoint_save_dir}/checkpoint_last.pt
11
+
12
+ # 脚本自身的绝对路径与文件名(解析符号链接)
13
+ script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
14
+ script_name="$(basename -- "$script_path")"
15
+ # 创建目录并拷贝(保留权限与时间戳)
16
+ mkdir -p -- "$checkpoint_save_dir"
17
+ cp -p -- "$script_path" "$checkpoint_save_dir/$script_name"
18
+ echo "script_path: ${script_path}"
19
+ echo "checkpoint_save_dir: ${checkpoint_save_dir}"
20
+
21
+ # default setting
22
+ model_clone_batch=4
23
+ dataset_batch_size=48
24
+ model_clap_loss=0
25
+ model_clap_loss_type="mse" # option ce cosine l1
26
+ model_clap_loss_layer=0
27
+ average_top_k_layers=12
28
+ model_add_conv=false
29
+ model_depth=12
30
+ model_dispersive_loss=0
31
+ model_dispersive_loss_layer=0
32
+ checkpoint_keep_interval_updates=1 # TODO change this parameter if need
33
+ checkpoint_save_interval_updates=10000
34
+
35
+ if [[ $train_mode == "default" && ${config_option} -eq 0 ]]; then
36
+ echo "Config ${train_mode} ${config_option}"
37
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
38
+ task_load_clap_emb=false
39
+ task_load_source_file=true
40
+ task_load_mel_file=false
41
+ model_proj_type=null
42
+ model_clone_batch=4
43
+ dataset_batch_size=96
44
+ model_clap_loss=0
45
+ checkpoint_keep_interval_updates=-1
46
+ elif [[ $train_mode == "disp" && ${config_option} -eq 0 ]]; then
47
+ echo "Config ${train_mode} ${config_option}"
48
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
49
+ task_load_clap_emb=false
50
+ task_load_source_file=true
51
+ task_load_mel_file=false
52
+ model_proj_type=null
53
+ model_clone_batch=4
54
+ dataset_batch_size=96
55
+ model_dispersive_loss=1
56
+ model_dispersive_loss_layer=0
57
+ checkpoint_keep_interval_updates=1
58
+ elif [[ $train_mode == "clap" && ${config_option} -eq 0 ]]; then
59
+ echo "Config ${train_mode} ${config_option}"
60
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
61
+ task_load_clap_emb=true
62
+ model_proj_type=2
63
+ model_clone_batch=4
64
+ dataset_batch_size=48
65
+ model_clap_loss=1.0
66
+ average_top_k_layers=12
67
+ model_add_conv=false
68
+ elif [[ $train_mode == "clap" && ${config_option} -eq 1 ]]; then
69
+ echo "Config ${train_mode} ${config_option}"
70
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
71
+ task_load_clap_emb=true
72
+ model_proj_type=2
73
+ model_clone_batch=4
74
+ dataset_batch_size=48
75
+ model_clap_loss=1.0
76
+ average_top_k_layers=1
77
+ # loss type ablation
78
+ elif [[ $train_mode == "clap" && ${config_option} -eq 2 ]]; then
79
+ echo "Config ${train_mode} ${config_option}"
80
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
81
+ task_load_clap_emb=true
82
+ model_proj_type=2
83
+ model_clone_batch=4
84
+ dataset_batch_size=48
85
+ model_clap_loss=1.0
86
+ average_top_k_layers=12
87
+ model_clap_loss_type="ce"
88
+ elif [[ $train_mode == "clap" && ${config_option} -eq 3 ]]; then
89
+ echo "Config ${train_mode} ${config_option}"
90
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
91
+ task_load_clap_emb=true
92
+ model_proj_type=2
93
+ model_clone_batch=4
94
+ dataset_batch_size=48
95
+ model_clap_loss=1.0
96
+ average_top_k_layers=12
97
+ model_clap_loss_type="l1"
98
+ elif [[ $train_mode == "clap" && ${config_option} -eq 4 ]]; then
99
+ echo "Config ${train_mode} ${config_option}"
100
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
101
+ task_load_clap_emb=true
102
+ model_proj_type=2
103
+ model_clone_batch=4
104
+ dataset_batch_size=96
105
+ model_clap_loss=1.0
106
+ average_top_k_layers=12
107
+ model_clap_loss_type="cosine"
108
+ # loss layer ablation
109
+ elif [[ $train_mode == "clap" && ${config_option} -eq 5 ]]; then
110
+ echo "Config ${train_mode} ${config_option}"
111
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
112
+ task_load_clap_emb=true
113
+ model_proj_type=2
114
+ model_clone_batch=4
115
+ dataset_batch_size=96
116
+ model_clap_loss=1.0
117
+ average_top_k_layers=12
118
+ model_clap_loss_type="mse"
119
+ model_clap_loss_layer=10
120
+ elif [[ $train_mode == "clap" && ${config_option} -eq 6 ]]; then
121
+ echo "Config ${train_mode} ${config_option}"
122
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
123
+ task_load_clap_emb=true
124
+ task_load_source_file=true
125
+ task_load_mel_file=false
126
+ model_proj_type=2
127
+ model_clone_batch=4
128
+ dataset_batch_size=96
129
+ model_clap_loss=1.0
130
+ average_top_k_layers=12
131
+ model_clap_loss_type="mse"
132
+ model_clap_loss_layer=8
133
+ elif [[ $train_mode == "clap" && ${config_option} -eq 7 ]]; then
134
+ echo "Config ${train_mode} ${config_option}"
135
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
136
+ task_load_clap_emb=true
137
+ task_load_source_file=true
138
+ task_load_mel_file=false
139
+ model_proj_type=2
140
+ model_clone_batch=4
141
+ dataset_batch_size=96
142
+ model_clap_loss=1.0
143
+ average_top_k_layers=12
144
+ model_clap_loss_type="mse"
145
+ model_clap_loss_layer=6
146
+ elif [[ $train_mode == "clap" && ${config_option} -eq 8 ]]; then
147
+ echo "Config ${train_mode} ${config_option}"
148
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
149
+ task_load_clap_emb=true
150
+ task_load_source_file=true
151
+ task_load_mel_file=false
152
+ model_proj_type=2
153
+ model_clone_batch=4
154
+ model_clap_loss=5.0
155
+ dataset_batch_size=96
156
+ average_top_k_layers=12
157
+ model_clap_loss_type="mse"
158
+ checkpoint_keep_interval_updates=-1
159
+ elif [[ $train_mode == "clap" && ${config_option} -eq 9 ]]; then
160
+ echo "Config ${train_mode} ${config_option}"
161
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
162
+ task_load_clap_emb=true
163
+ task_load_source_file=true
164
+ task_load_mel_file=false
165
+ model_proj_type=2
166
+ model_clone_batch=4
167
+ model_clap_loss=0.1
168
+ dataset_batch_size=96
169
+ average_top_k_layers=12
170
+ model_clap_loss_type="mse"
171
+ checkpoint_keep_interval_updates=-1
172
+ elif [[ $train_mode == "ast" && ${config_option} -eq 0 ]]; then
173
+ echo "Config ${train_mode} ${config_option}"
174
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
175
+ task_load_clap_emb=true
176
+ model_proj_type=4
177
+ model_clone_batch=4
178
+ model_clap_loss=1.0
179
+ dataset_batch_size=48
180
+ elif [[ $train_mode == "ast" && ${config_option} -eq 1 ]]; then
181
+ echo "Config ${train_mode} ${config_option}"
182
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
183
+ task_load_clap_emb=true
184
+ model_proj_type=4
185
+ model_clone_batch=4
186
+ model_clap_loss=0.001
187
+ dataset_batch_size=48
188
+ elif [[ $train_mode == "ast" && ${config_option} -eq 2 ]]; then
189
+ echo "Config ${train_mode} ${config_option}"
190
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
191
+ task_load_clap_emb=true
192
+ model_proj_type=4
193
+ model_clone_batch=4
194
+ model_clap_loss=0.01
195
+ dataset_batch_size=48
196
+ elif [[ $train_mode == "ast" && ${config_option} -eq 3 ]]; then
197
+ echo "Config ${train_mode} ${config_option}"
198
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_out
199
+ task_load_clap_emb=true
200
+ model_proj_type=6
201
+ model_clone_batch=4
202
+ dataset_batch_size=48
203
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 0 ]]; then
204
+ echo "Config ${train_mode} ${config_option}"
205
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
206
+ task_load_clap_emb=true
207
+ task_load_source_file=true
208
+ task_load_mel_file=false
209
+ model_proj_type=2
210
+ model_clone_batch=4
211
+ dataset_batch_size=64 # original 48 oom on 4090 24G change distributed_world_size
212
+ model_clap_loss=1.0
213
+ average_top_k_layers=11 # modify with model depth
214
+ model_add_conv=true
215
+ model_depth=11 #
216
+ checkpoint_keep_interval_updates=-1 # default 1
217
+ checkpoint_save_interval_updates=10000
218
+ fi
219
+
220
+ python fairseq_cli/hydra_train.py -m \
221
+ --config-dir ./EAT/config \
222
+ --config-name pretraining_AS2M \
223
+ common.user_dir=./EAT \
224
+ checkpoint.save_dir=${checkpoint_save_dir} \
225
+ checkpoint.restore_file=${checkpoint_restore_file} \
226
+ distributed_training.distributed_world_size=${1:-2} \
227
+ dataset.num_workers=24 \
228
+ dataset.data_buffer_size=48 \
229
+ dataset.batch_size=${dataset_batch_size} \
230
+ task.data=${task_data} \
231
+ task.h5_format=False \
232
+ task.load_clap_emb=${task_load_clap_emb} \
233
+ +task.load_source_file=${task_load_source_file} \
234
+ +task.load_mel_file=${task_load_mel_file} \
235
+ model.proj_type=${model_proj_type} \
236
+ model.clone_batch=${model_clone_batch} \
237
+ model.clap_loss=${model_clap_loss} \
238
+ model.average_top_k_layers=${average_top_k_layers} \
239
+ +model.add_conv=${model_add_conv} \
240
+ +model.clap_loss_type=${model_clap_loss_type} \
241
+ +model.clap_loss_layer=${model_clap_loss_layer} \
242
+ +model.dispersive_loss=${model_dispersive_loss} \
243
+ +model.dispersive_loss_layer=${model_dispersive_loss_layer} \
244
+ model.depth=${model_depth} \
245
+ checkpoint.keep_interval_updates=${checkpoint_keep_interval_updates} \
246
+ checkpoint.save_interval_updates=${checkpoint_save_interval_updates}
pre_4_AS2M/disp_0_2025-09-24_14-12-12/pretraining_AS2M.sh ADDED
@@ -0,0 +1,246 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ # config options
3
+ train_mode=disp
4
+ config_option=0
5
+ # change world size
6
+
7
+ # shared config
8
+ SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
9
+ checkpoint_save_dir=${SAVE_DIR_ROOT}/${train_mode}_${config_option}_$(date +"%Y-%m-%d_%H-%M-%S")
10
+ checkpoint_restore_file=${checkpoint_save_dir}/checkpoint_last.pt
11
+
12
+ # 脚本自身的绝对路径与文件名(解析符号链接)
13
+ script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
14
+ script_name="$(basename -- "$script_path")"
15
+ # 创建目录并拷贝(保留权限与时间戳)
16
+ mkdir -p -- "$checkpoint_save_dir"
17
+ cp -p -- "$script_path" "$checkpoint_save_dir/$script_name"
18
+ echo "script_path: ${script_path}"
19
+ echo "checkpoint_save_dir: ${checkpoint_save_dir}"
20
+
21
+ # default setting
22
+ model_clone_batch=4
23
+ dataset_batch_size=48
24
+ model_clap_loss=0
25
+ model_clap_loss_type="mse" # option ce cosine l1
26
+ model_clap_loss_layer=0
27
+ average_top_k_layers=12
28
+ model_add_conv=false
29
+ model_depth=12
30
+ model_dispersive_loss=0
31
+ model_dispersive_loss_layer=0
32
+ checkpoint_keep_interval_updates=1 # TODO change this parameter if need
33
+ checkpoint_save_interval_updates=10000
34
+
35
+ if [[ $train_mode == "default" && ${config_option} -eq 0 ]]; then
36
+ echo "Config ${train_mode} ${config_option}"
37
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
38
+ task_load_clap_emb=false
39
+ task_load_source_file=true
40
+ task_load_mel_file=false
41
+ model_proj_type=null
42
+ model_clone_batch=4
43
+ dataset_batch_size=96
44
+ model_clap_loss=0
45
+ checkpoint_keep_interval_updates=-1
46
+ elif [[ $train_mode == "disp" && ${config_option} -eq 0 ]]; then
47
+ echo "Config ${train_mode} ${config_option}"
48
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
49
+ task_load_clap_emb=false
50
+ task_load_source_file=true
51
+ task_load_mel_file=false
52
+ model_proj_type=null
53
+ model_clone_batch=4
54
+ dataset_batch_size=96
55
+ model_dispersive_loss=1
56
+ model_dispersive_loss_layer=0
57
+ checkpoint_keep_interval_updates=1
58
+ elif [[ $train_mode == "clap" && ${config_option} -eq 0 ]]; then
59
+ echo "Config ${train_mode} ${config_option}"
60
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
61
+ task_load_clap_emb=true
62
+ model_proj_type=2
63
+ model_clone_batch=4
64
+ dataset_batch_size=48
65
+ model_clap_loss=1.0
66
+ average_top_k_layers=12
67
+ model_add_conv=false
68
+ elif [[ $train_mode == "clap" && ${config_option} -eq 1 ]]; then
69
+ echo "Config ${train_mode} ${config_option}"
70
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
71
+ task_load_clap_emb=true
72
+ model_proj_type=2
73
+ model_clone_batch=4
74
+ dataset_batch_size=48
75
+ model_clap_loss=1.0
76
+ average_top_k_layers=1
77
+ # loss type ablation
78
+ elif [[ $train_mode == "clap" && ${config_option} -eq 2 ]]; then
79
+ echo "Config ${train_mode} ${config_option}"
80
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
81
+ task_load_clap_emb=true
82
+ model_proj_type=2
83
+ model_clone_batch=4
84
+ dataset_batch_size=48
85
+ model_clap_loss=1.0
86
+ average_top_k_layers=12
87
+ model_clap_loss_type="ce"
88
+ elif [[ $train_mode == "clap" && ${config_option} -eq 3 ]]; then
89
+ echo "Config ${train_mode} ${config_option}"
90
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
91
+ task_load_clap_emb=true
92
+ model_proj_type=2
93
+ model_clone_batch=4
94
+ dataset_batch_size=48
95
+ model_clap_loss=1.0
96
+ average_top_k_layers=12
97
+ model_clap_loss_type="l1"
98
+ elif [[ $train_mode == "clap" && ${config_option} -eq 4 ]]; then
99
+ echo "Config ${train_mode} ${config_option}"
100
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
101
+ task_load_clap_emb=true
102
+ model_proj_type=2
103
+ model_clone_batch=4
104
+ dataset_batch_size=96
105
+ model_clap_loss=1.0
106
+ average_top_k_layers=12
107
+ model_clap_loss_type="cosine"
108
+ # loss layer ablation
109
+ elif [[ $train_mode == "clap" && ${config_option} -eq 5 ]]; then
110
+ echo "Config ${train_mode} ${config_option}"
111
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
112
+ task_load_clap_emb=true
113
+ model_proj_type=2
114
+ model_clone_batch=4
115
+ dataset_batch_size=96
116
+ model_clap_loss=1.0
117
+ average_top_k_layers=12
118
+ model_clap_loss_type="mse"
119
+ model_clap_loss_layer=10
120
+ elif [[ $train_mode == "clap" && ${config_option} -eq 6 ]]; then
121
+ echo "Config ${train_mode} ${config_option}"
122
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
123
+ task_load_clap_emb=true
124
+ task_load_source_file=true
125
+ task_load_mel_file=false
126
+ model_proj_type=2
127
+ model_clone_batch=4
128
+ dataset_batch_size=96
129
+ model_clap_loss=1.0
130
+ average_top_k_layers=12
131
+ model_clap_loss_type="mse"
132
+ model_clap_loss_layer=8
133
+ elif [[ $train_mode == "clap" && ${config_option} -eq 7 ]]; then
134
+ echo "Config ${train_mode} ${config_option}"
135
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
136
+ task_load_clap_emb=true
137
+ task_load_source_file=true
138
+ task_load_mel_file=false
139
+ model_proj_type=2
140
+ model_clone_batch=4
141
+ dataset_batch_size=96
142
+ model_clap_loss=1.0
143
+ average_top_k_layers=12
144
+ model_clap_loss_type="mse"
145
+ model_clap_loss_layer=6
146
+ elif [[ $train_mode == "clap" && ${config_option} -eq 8 ]]; then
147
+ echo "Config ${train_mode} ${config_option}"
148
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
149
+ task_load_clap_emb=true
150
+ task_load_source_file=true
151
+ task_load_mel_file=false
152
+ model_proj_type=2
153
+ model_clone_batch=4
154
+ model_clap_loss=5.0
155
+ dataset_batch_size=96
156
+ average_top_k_layers=12
157
+ model_clap_loss_type="mse"
158
+ checkpoint_keep_interval_updates=-1
159
+ elif [[ $train_mode == "clap" && ${config_option} -eq 9 ]]; then
160
+ echo "Config ${train_mode} ${config_option}"
161
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
162
+ task_load_clap_emb=true
163
+ task_load_source_file=true
164
+ task_load_mel_file=false
165
+ model_proj_type=2
166
+ model_clone_batch=4
167
+ model_clap_loss=0.1
168
+ dataset_batch_size=96
169
+ average_top_k_layers=12
170
+ model_clap_loss_type="mse"
171
+ checkpoint_keep_interval_updates=-1
172
+ elif [[ $train_mode == "ast" && ${config_option} -eq 0 ]]; then
173
+ echo "Config ${train_mode} ${config_option}"
174
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
175
+ task_load_clap_emb=true
176
+ model_proj_type=4
177
+ model_clone_batch=4
178
+ model_clap_loss=1.0
179
+ dataset_batch_size=48
180
+ elif [[ $train_mode == "ast" && ${config_option} -eq 1 ]]; then
181
+ echo "Config ${train_mode} ${config_option}"
182
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
183
+ task_load_clap_emb=true
184
+ model_proj_type=4
185
+ model_clone_batch=4
186
+ model_clap_loss=0.001
187
+ dataset_batch_size=48
188
+ elif [[ $train_mode == "ast" && ${config_option} -eq 2 ]]; then
189
+ echo "Config ${train_mode} ${config_option}"
190
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
191
+ task_load_clap_emb=true
192
+ model_proj_type=4
193
+ model_clone_batch=4
194
+ model_clap_loss=0.01
195
+ dataset_batch_size=48
196
+ elif [[ $train_mode == "ast" && ${config_option} -eq 3 ]]; then
197
+ echo "Config ${train_mode} ${config_option}"
198
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_out
199
+ task_load_clap_emb=true
200
+ model_proj_type=6
201
+ model_clone_batch=4
202
+ dataset_batch_size=48
203
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 0 ]]; then
204
+ echo "Config ${train_mode} ${config_option}"
205
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
206
+ task_load_clap_emb=true
207
+ task_load_source_file=true
208
+ task_load_mel_file=false
209
+ model_proj_type=2
210
+ model_clone_batch=4
211
+ dataset_batch_size=64 # original 48 oom on 4090 24G change distributed_world_size
212
+ model_clap_loss=1.0
213
+ average_top_k_layers=11 # modify with model depth
214
+ model_add_conv=true
215
+ model_depth=11 #
216
+ checkpoint_keep_interval_updates=-1 # default 1
217
+ checkpoint_save_interval_updates=10000
218
+ fi
219
+
220
+ python fairseq_cli/hydra_train.py -m \
221
+ --config-dir ./EAT/config \
222
+ --config-name pretraining_AS2M \
223
+ common.user_dir=./EAT \
224
+ checkpoint.save_dir=${checkpoint_save_dir} \
225
+ checkpoint.restore_file=${checkpoint_restore_file} \
226
+ distributed_training.distributed_world_size=${1:-2} \
227
+ dataset.num_workers=24 \
228
+ dataset.data_buffer_size=48 \
229
+ dataset.batch_size=${dataset_batch_size} \
230
+ task.data=${task_data} \
231
+ task.h5_format=False \
232
+ task.load_clap_emb=${task_load_clap_emb} \
233
+ +task.load_source_file=${task_load_source_file} \
234
+ +task.load_mel_file=${task_load_mel_file} \
235
+ model.proj_type=${model_proj_type} \
236
+ model.clone_batch=${model_clone_batch} \
237
+ model.clap_loss=${model_clap_loss} \
238
+ model.average_top_k_layers=${average_top_k_layers} \
239
+ +model.add_conv=${model_add_conv} \
240
+ +model.clap_loss_type=${model_clap_loss_type} \
241
+ +model.clap_loss_layer=${model_clap_loss_layer} \
242
+ +model.dispersive_loss=${model_dispersive_loss} \
243
+ +model.dispersive_loss_layer=${model_dispersive_loss_layer} \
244
+ model.depth=${model_depth} \
245
+ checkpoint.keep_interval_updates=${checkpoint_keep_interval_updates} \
246
+ checkpoint.save_interval_updates=${checkpoint_save_interval_updates}
pre_4_AS2M/disp_0_2025-09-24_14-17-47/pretraining_AS2M.sh ADDED
@@ -0,0 +1,246 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ # config options
3
+ train_mode=disp
4
+ config_option=0
5
+ # distributed world size is taken from the first CLI argument (default: 2)
6
+
7
+ # shared config
8
+ SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
9
+ checkpoint_save_dir=${SAVE_DIR_ROOT}/${train_mode}_${config_option}_$(date +"%Y-%m-%d_%H-%M-%S")
10
+ checkpoint_restore_file=${checkpoint_save_dir}/checkpoint_last.pt
11
+
12
+ # Absolute path and file name of this script (symlinks resolved)
13
+ script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
14
+ script_name="$(basename -- "$script_path")"
15
+ # Create the save directory and copy this script into it (preserving permissions and timestamps)
16
+ mkdir -p -- "$checkpoint_save_dir"
17
+ cp -p -- "$script_path" "$checkpoint_save_dir/$script_name"
18
+ echo "script_path: ${script_path}"
19
+ echo "checkpoint_save_dir: ${checkpoint_save_dir}"
20
+
21
+ # default setting
22
+ model_clone_batch=4
23
+ dataset_batch_size=48
24
+ model_clap_loss=0
25
+ model_clap_loss_type="mse" # option ce cosine l1
26
+ model_clap_loss_layer=0
27
+ average_top_k_layers=12
28
+ model_add_conv=false
29
+ model_depth=12
30
+ model_dispersive_loss=0
31
+ model_dispersive_loss_layer=0
32
+ checkpoint_keep_interval_updates=1 # TODO change this parameter if need
33
+ checkpoint_save_interval_updates=10000
34
+
35
+ if [[ $train_mode == "default" && ${config_option} -eq 0 ]]; then
36
+ echo "Config ${train_mode} ${config_option}"
37
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
38
+ task_load_clap_emb=false
39
+ task_load_source_file=true
40
+ task_load_mel_file=false
41
+ model_proj_type=null
42
+ model_clone_batch=4
43
+ dataset_batch_size=96
44
+ model_clap_loss=0
45
+ checkpoint_keep_interval_updates=-1
46
+ elif [[ $train_mode == "disp" && ${config_option} -eq 0 ]]; then
47
+ echo "Config ${train_mode} ${config_option}"
48
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
49
+ task_load_clap_emb=false
50
+ task_load_source_file=true
51
+ task_load_mel_file=false
52
+ model_proj_type=null
53
+ model_clone_batch=4
54
+ dataset_batch_size=96
55
+ model_dispersive_loss=1
56
+ model_dispersive_loss_layer=0
57
+ checkpoint_keep_interval_updates=1
58
+ elif [[ $train_mode == "clap" && ${config_option} -eq 0 ]]; then
59
+ echo "Config ${train_mode} ${config_option}"
60
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
61
+ task_load_clap_emb=true
62
+ model_proj_type=2
63
+ model_clone_batch=4
64
+ dataset_batch_size=48
65
+ model_clap_loss=1.0
66
+ average_top_k_layers=12
67
+ model_add_conv=false
68
+ elif [[ $train_mode == "clap" && ${config_option} -eq 1 ]]; then
69
+ echo "Config ${train_mode} ${config_option}"
70
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
71
+ task_load_clap_emb=true
72
+ model_proj_type=2
73
+ model_clone_batch=4
74
+ dataset_batch_size=48
75
+ model_clap_loss=1.0
76
+ average_top_k_layers=1
77
+ # loss type ablation
78
+ elif [[ $train_mode == "clap" && ${config_option} -eq 2 ]]; then
79
+ echo "Config ${train_mode} ${config_option}"
80
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
81
+ task_load_clap_emb=true
82
+ model_proj_type=2
83
+ model_clone_batch=4
84
+ dataset_batch_size=48
85
+ model_clap_loss=1.0
86
+ average_top_k_layers=12
87
+ model_clap_loss_type="ce"
88
+ elif [[ $train_mode == "clap" && ${config_option} -eq 3 ]]; then
89
+ echo "Config ${train_mode} ${config_option}"
90
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
91
+ task_load_clap_emb=true
92
+ model_proj_type=2
93
+ model_clone_batch=4
94
+ dataset_batch_size=48
95
+ model_clap_loss=1.0
96
+ average_top_k_layers=12
97
+ model_clap_loss_type="l1"
98
+ elif [[ $train_mode == "clap" && ${config_option} -eq 4 ]]; then
99
+ echo "Config ${train_mode} ${config_option}"
100
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
101
+ task_load_clap_emb=true
102
+ model_proj_type=2
103
+ model_clone_batch=4
104
+ dataset_batch_size=96
105
+ model_clap_loss=1.0
106
+ average_top_k_layers=12
107
+ model_clap_loss_type="cosine"
108
+ # loss layer ablation
109
+ elif [[ $train_mode == "clap" && ${config_option} -eq 5 ]]; then
110
+ echo "Config ${train_mode} ${config_option}"
111
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
112
+ task_load_clap_emb=true
113
+ model_proj_type=2
114
+ model_clone_batch=4
115
+ dataset_batch_size=96
116
+ model_clap_loss=1.0
117
+ average_top_k_layers=12
118
+ model_clap_loss_type="mse"
119
+ model_clap_loss_layer=10
120
+ elif [[ $train_mode == "clap" && ${config_option} -eq 6 ]]; then
121
+ echo "Config ${train_mode} ${config_option}"
122
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
123
+ task_load_clap_emb=true
124
+ task_load_source_file=true
125
+ task_load_mel_file=false
126
+ model_proj_type=2
127
+ model_clone_batch=4
128
+ dataset_batch_size=96
129
+ model_clap_loss=1.0
130
+ average_top_k_layers=12
131
+ model_clap_loss_type="mse"
132
+ model_clap_loss_layer=8
133
+ elif [[ $train_mode == "clap" && ${config_option} -eq 7 ]]; then
134
+ echo "Config ${train_mode} ${config_option}"
135
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
136
+ task_load_clap_emb=true
137
+ task_load_source_file=true
138
+ task_load_mel_file=false
139
+ model_proj_type=2
140
+ model_clone_batch=4
141
+ dataset_batch_size=96
142
+ model_clap_loss=1.0
143
+ average_top_k_layers=12
144
+ model_clap_loss_type="mse"
145
+ model_clap_loss_layer=6
146
+ elif [[ $train_mode == "clap" && ${config_option} -eq 8 ]]; then
147
+ echo "Config ${train_mode} ${config_option}"
148
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
149
+ task_load_clap_emb=true
150
+ task_load_source_file=true
151
+ task_load_mel_file=false
152
+ model_proj_type=2
153
+ model_clone_batch=4
154
+ model_clap_loss=5.0
155
+ dataset_batch_size=96
156
+ average_top_k_layers=12
157
+ model_clap_loss_type="mse"
158
+ checkpoint_keep_interval_updates=-1
159
+ elif [[ $train_mode == "clap" && ${config_option} -eq 9 ]]; then
160
+ echo "Config ${train_mode} ${config_option}"
161
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
162
+ task_load_clap_emb=true
163
+ task_load_source_file=true
164
+ task_load_mel_file=false
165
+ model_proj_type=2
166
+ model_clone_batch=4
167
+ model_clap_loss=0.1
168
+ dataset_batch_size=96
169
+ average_top_k_layers=12
170
+ model_clap_loss_type="mse"
171
+ checkpoint_keep_interval_updates=-1
172
+ elif [[ $train_mode == "ast" && ${config_option} -eq 0 ]]; then
173
+ echo "Config ${train_mode} ${config_option}"
174
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
175
+ task_load_clap_emb=true
176
+ model_proj_type=4
177
+ model_clone_batch=4
178
+ model_clap_loss=1.0
179
+ dataset_batch_size=48
180
+ elif [[ $train_mode == "ast" && ${config_option} -eq 1 ]]; then
181
+ echo "Config ${train_mode} ${config_option}"
182
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
183
+ task_load_clap_emb=true
184
+ model_proj_type=4
185
+ model_clone_batch=4
186
+ model_clap_loss=0.001
187
+ dataset_batch_size=48
188
+ elif [[ $train_mode == "ast" && ${config_option} -eq 2 ]]; then
189
+ echo "Config ${train_mode} ${config_option}"
190
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
191
+ task_load_clap_emb=true
192
+ model_proj_type=4
193
+ model_clone_batch=4
194
+ model_clap_loss=0.01
195
+ dataset_batch_size=48
196
+ elif [[ $train_mode == "ast" && ${config_option} -eq 3 ]]; then
197
+ echo "Config ${train_mode} ${config_option}"
198
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_out
199
+ task_load_clap_emb=true
200
+ model_proj_type=6
201
+ model_clone_batch=4
202
+ dataset_batch_size=48
203
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 0 ]]; then
204
+ echo "Config ${train_mode} ${config_option}"
205
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
206
+ task_load_clap_emb=true
207
+ task_load_source_file=true
208
+ task_load_mel_file=false
209
+ model_proj_type=2
210
+ model_clone_batch=4
211
+ dataset_batch_size=64 # original 48 oom on 4090 24G change distributed_world_size
212
+ model_clap_loss=1.0
213
+ average_top_k_layers=11 # modify with model depth
214
+ model_add_conv=true
215
+ model_depth=11 #
216
+ checkpoint_keep_interval_updates=-1 # default 1
217
+ checkpoint_save_interval_updates=10000
218
+ fi
219
+
220
+ python fairseq_cli/hydra_train.py -m \
221
+ --config-dir ./EAT/config \
222
+ --config-name pretraining_AS2M \
223
+ common.user_dir=./EAT \
224
+ checkpoint.save_dir=${checkpoint_save_dir} \
225
+ checkpoint.restore_file=${checkpoint_restore_file} \
226
+ distributed_training.distributed_world_size=${1:-2} \
227
+ dataset.num_workers=24 \
228
+ dataset.data_buffer_size=48 \
229
+ dataset.batch_size=${dataset_batch_size} \
230
+ task.data=${task_data} \
231
+ task.h5_format=False \
232
+ task.load_clap_emb=${task_load_clap_emb} \
233
+ +task.load_source_file=${task_load_source_file} \
234
+ +task.load_mel_file=${task_load_mel_file} \
235
+ model.proj_type=${model_proj_type} \
236
+ model.clone_batch=${model_clone_batch} \
237
+ model.clap_loss=${model_clap_loss} \
238
+ model.average_top_k_layers=${average_top_k_layers} \
239
+ +model.add_conv=${model_add_conv} \
240
+ +model.clap_loss_type=${model_clap_loss_type} \
241
+ +model.clap_loss_layer=${model_clap_loss_layer} \
242
+ +model.dispersive_loss=${model_dispersive_loss} \
243
+ +model.dispersive_loss_layer=${model_dispersive_loss_layer} \
244
+ model.depth=${model_depth} \
245
+ checkpoint.keep_interval_updates=${checkpoint_keep_interval_updates} \
246
+ checkpoint.save_interval_updates=${checkpoint_save_interval_updates}
pre_4_AS2M/disp_1_2025-09-26_14-32-16/pretraining_AS2M.sh ADDED
@@ -0,0 +1,258 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ # config options
3
+ train_mode=disp
4
+ config_option=1
5
+ # distributed world size is taken from the first CLI argument (default: 2)
6
+
7
+ # shared config
8
+ SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
9
+ checkpoint_save_dir=${SAVE_DIR_ROOT}/${train_mode}_${config_option}_$(date +"%Y-%m-%d_%H-%M-%S")
10
+ checkpoint_restore_file=${checkpoint_save_dir}/checkpoint_last.pt
11
+
12
+ # Absolute path and file name of this script (symlinks resolved)
13
+ script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
14
+ script_name="$(basename -- "$script_path")"
15
+ # Create the save directory and copy this script into it (preserving permissions and timestamps)
16
+ mkdir -p -- "$checkpoint_save_dir"
17
+ cp -p -- "$script_path" "$checkpoint_save_dir/$script_name"
18
+ echo "script_path: ${script_path}"
19
+ echo "checkpoint_save_dir: ${checkpoint_save_dir}"
20
+
21
+ # default setting
22
+ model_clone_batch=4
23
+ dataset_batch_size=48
24
+ model_clap_loss=0
25
+ model_clap_loss_type="mse" # option ce cosine l1
26
+ model_clap_loss_layer=0
27
+ average_top_k_layers=12
28
+ model_add_conv=false
29
+ model_depth=12
30
+ model_dispersive_loss=0
31
+ model_dispersive_loss_layer=0
32
+ checkpoint_keep_interval_updates=1 # TODO change this parameter if need
33
+ checkpoint_save_interval_updates=10000
34
+
35
+ if [[ $train_mode == "default" && ${config_option} -eq 0 ]]; then
36
+ echo "Config ${train_mode} ${config_option}"
37
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
38
+ task_load_clap_emb=false
39
+ task_load_source_file=true
40
+ task_load_mel_file=false
41
+ model_proj_type=null
42
+ model_clone_batch=4
43
+ dataset_batch_size=96
44
+ model_clap_loss=0
45
+ checkpoint_keep_interval_updates=-1
46
+ elif [[ $train_mode == "disp" && ${config_option} -eq 0 ]]; then
47
+ echo "Config ${train_mode} ${config_option}"
48
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
49
+ task_load_clap_emb=false
50
+ task_load_source_file=true
51
+ task_load_mel_file=false
52
+ model_proj_type=null
53
+ model_clone_batch=1
54
+ dataset_batch_size=384
55
+ model_dispersive_loss=1
56
+ model_dispersive_loss_layer=0
57
+ checkpoint_keep_interval_updates=1
58
+ elif [[ $train_mode == "disp" && ${config_option} -eq 1 ]]; then
59
+ echo "Config ${train_mode} ${config_option}"
60
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
61
+ task_load_clap_emb=false
62
+ task_load_source_file=true
63
+ task_load_mel_file=false
64
+ model_proj_type=null
65
+ model_clone_batch=4
66
+ dataset_batch_size=96
67
+ model_dispersive_loss=1
68
+ model_dispersive_loss_layer=0
69
+ checkpoint_keep_interval_updates=1
70
+ elif [[ $train_mode == "clap" && ${config_option} -eq 0 ]]; then
71
+ echo "Config ${train_mode} ${config_option}"
72
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
73
+ task_load_clap_emb=true
74
+ model_proj_type=2
75
+ model_clone_batch=4
76
+ dataset_batch_size=48
77
+ model_clap_loss=1.0
78
+ average_top_k_layers=12
79
+ model_add_conv=false
80
+ elif [[ $train_mode == "clap" && ${config_option} -eq 1 ]]; then
81
+ echo "Config ${train_mode} ${config_option}"
82
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
83
+ task_load_clap_emb=true
84
+ model_proj_type=2
85
+ model_clone_batch=4
86
+ dataset_batch_size=48
87
+ model_clap_loss=1.0
88
+ average_top_k_layers=1
89
+ # loss type ablation
90
+ elif [[ $train_mode == "clap" && ${config_option} -eq 2 ]]; then
91
+ echo "Config ${train_mode} ${config_option}"
92
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
93
+ task_load_clap_emb=true
94
+ model_proj_type=2
95
+ model_clone_batch=4
96
+ dataset_batch_size=48
97
+ model_clap_loss=1.0
98
+ average_top_k_layers=12
99
+ model_clap_loss_type="ce"
100
+ elif [[ $train_mode == "clap" && ${config_option} -eq 3 ]]; then
101
+ echo "Config ${train_mode} ${config_option}"
102
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
103
+ task_load_clap_emb=true
104
+ model_proj_type=2
105
+ model_clone_batch=4
106
+ dataset_batch_size=48
107
+ model_clap_loss=1.0
108
+ average_top_k_layers=12
109
+ model_clap_loss_type="l1"
110
+ elif [[ $train_mode == "clap" && ${config_option} -eq 4 ]]; then
111
+ echo "Config ${train_mode} ${config_option}"
112
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
113
+ task_load_clap_emb=true
114
+ model_proj_type=2
115
+ model_clone_batch=4
116
+ dataset_batch_size=96
117
+ model_clap_loss=1.0
118
+ average_top_k_layers=12
119
+ model_clap_loss_type="cosine"
120
+ # loss layer ablation
121
+ elif [[ $train_mode == "clap" && ${config_option} -eq 5 ]]; then
122
+ echo "Config ${train_mode} ${config_option}"
123
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
124
+ task_load_clap_emb=true
125
+ model_proj_type=2
126
+ model_clone_batch=4
127
+ dataset_batch_size=96
128
+ model_clap_loss=1.0
129
+ average_top_k_layers=12
130
+ model_clap_loss_type="mse"
131
+ model_clap_loss_layer=10
132
+ elif [[ $train_mode == "clap" && ${config_option} -eq 6 ]]; then
133
+ echo "Config ${train_mode} ${config_option}"
134
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
135
+ task_load_clap_emb=true
136
+ task_load_source_file=true
137
+ task_load_mel_file=false
138
+ model_proj_type=2
139
+ model_clone_batch=4
140
+ dataset_batch_size=96
141
+ model_clap_loss=1.0
142
+ average_top_k_layers=12
143
+ model_clap_loss_type="mse"
144
+ model_clap_loss_layer=8
145
+ elif [[ $train_mode == "clap" && ${config_option} -eq 7 ]]; then
146
+ echo "Config ${train_mode} ${config_option}"
147
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
148
+ task_load_clap_emb=true
149
+ task_load_source_file=true
150
+ task_load_mel_file=false
151
+ model_proj_type=2
152
+ model_clone_batch=4
153
+ dataset_batch_size=96
154
+ model_clap_loss=1.0
155
+ average_top_k_layers=12
156
+ model_clap_loss_type="mse"
157
+ model_clap_loss_layer=6
158
+ elif [[ $train_mode == "clap" && ${config_option} -eq 8 ]]; then
159
+ echo "Config ${train_mode} ${config_option}"
160
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
161
+ task_load_clap_emb=true
162
+ task_load_source_file=true
163
+ task_load_mel_file=false
164
+ model_proj_type=2
165
+ model_clone_batch=4
166
+ model_clap_loss=5.0
167
+ dataset_batch_size=96
168
+ average_top_k_layers=12
169
+ model_clap_loss_type="mse"
170
+ checkpoint_keep_interval_updates=-1
171
+ elif [[ $train_mode == "clap" && ${config_option} -eq 9 ]]; then
172
+ echo "Config ${train_mode} ${config_option}"
173
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
174
+ task_load_clap_emb=true
175
+ task_load_source_file=true
176
+ task_load_mel_file=false
177
+ model_proj_type=2
178
+ model_clone_batch=4
179
+ model_clap_loss=0.1
180
+ dataset_batch_size=96
181
+ average_top_k_layers=12
182
+ model_clap_loss_type="mse"
183
+ checkpoint_keep_interval_updates=-1
184
+ elif [[ $train_mode == "ast" && ${config_option} -eq 0 ]]; then
185
+ echo "Config ${train_mode} ${config_option}"
186
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
187
+ task_load_clap_emb=true
188
+ model_proj_type=4
189
+ model_clone_batch=4
190
+ model_clap_loss=1.0
191
+ dataset_batch_size=48
192
+ elif [[ $train_mode == "ast" && ${config_option} -eq 1 ]]; then
193
+ echo "Config ${train_mode} ${config_option}"
194
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
195
+ task_load_clap_emb=true
196
+ model_proj_type=4
197
+ model_clone_batch=4
198
+ model_clap_loss=0.001
199
+ dataset_batch_size=48
200
+ elif [[ $train_mode == "ast" && ${config_option} -eq 2 ]]; then
201
+ echo "Config ${train_mode} ${config_option}"
202
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
203
+ task_load_clap_emb=true
204
+ model_proj_type=4
205
+ model_clone_batch=4
206
+ model_clap_loss=0.01
207
+ dataset_batch_size=48
208
+ elif [[ $train_mode == "ast" && ${config_option} -eq 3 ]]; then
209
+ echo "Config ${train_mode} ${config_option}"
210
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_out
211
+ task_load_clap_emb=true
212
+ model_proj_type=6
213
+ model_clone_batch=4
214
+ dataset_batch_size=48
215
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 0 ]]; then
216
+ echo "Config ${train_mode} ${config_option}"
217
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
218
+ task_load_clap_emb=true
219
+ task_load_source_file=true
220
+ task_load_mel_file=false
221
+ model_proj_type=2
222
+ model_clone_batch=4
223
+ dataset_batch_size=64 # original 48 oom on 4090 24G change distributed_world_size
224
+ model_clap_loss=1.0
225
+ average_top_k_layers=11 # modify with model depth
226
+ model_add_conv=true
227
+ model_depth=11 #
228
+ checkpoint_keep_interval_updates=-1 # default 1
229
+ checkpoint_save_interval_updates=10000
230
+ fi
231
+
232
+ python fairseq_cli/hydra_train.py -m \
233
+ --config-dir ./EAT/config \
234
+ --config-name pretraining_AS2M \
235
+ common.user_dir=./EAT \
236
+ checkpoint.save_dir=${checkpoint_save_dir} \
237
+ checkpoint.restore_file=${checkpoint_restore_file} \
238
+ distributed_training.distributed_world_size=${1:-2} \
239
+ dataset.num_workers=24 \
240
+ dataset.data_buffer_size=48 \
241
+ dataset.batch_size=${dataset_batch_size} \
242
+ task.data=${task_data} \
243
+ task.h5_format=False \
244
+ task.load_clap_emb=${task_load_clap_emb} \
245
+ +task.load_source_file=${task_load_source_file} \
246
+ +task.load_mel_file=${task_load_mel_file} \
247
+ model.proj_type=${model_proj_type} \
248
+ model.clone_batch=${model_clone_batch} \
249
+ model.clap_loss=${model_clap_loss} \
250
+ model.average_top_k_layers=${average_top_k_layers} \
251
+ +model.add_conv=${model_add_conv} \
252
+ +model.clap_loss_type=${model_clap_loss_type} \
253
+ +model.clap_loss_layer=${model_clap_loss_layer} \
254
+ +model.dispersive_loss=${model_dispersive_loss} \
255
+ +model.dispersive_loss_layer=${model_dispersive_loss_layer} \
256
+ model.depth=${model_depth} \
257
+ checkpoint.keep_interval_updates=${checkpoint_keep_interval_updates} \
258
+ checkpoint.save_interval_updates=${checkpoint_save_interval_updates}
pre_4_AS2M/disp_1_2025-09-26_14-33-34/pretraining_AS2M.sh ADDED
@@ -0,0 +1,258 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ # config options
3
+ train_mode=disp
4
+ config_option=1
5
+ # distributed world size is taken from the first CLI argument (default: 2)
6
+
7
+ # shared config
8
+ SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
9
+ checkpoint_save_dir=${SAVE_DIR_ROOT}/${train_mode}_${config_option}_$(date +"%Y-%m-%d_%H-%M-%S")
10
+ checkpoint_restore_file=${checkpoint_save_dir}/checkpoint_last.pt
11
+
12
+ # Absolute path and file name of this script (symlinks resolved)
13
+ script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
14
+ script_name="$(basename -- "$script_path")"
15
+ # Create the save directory and copy this script into it (preserving permissions and timestamps)
16
+ mkdir -p -- "$checkpoint_save_dir"
17
+ cp -p -- "$script_path" "$checkpoint_save_dir/$script_name"
18
+ echo "script_path: ${script_path}"
19
+ echo "checkpoint_save_dir: ${checkpoint_save_dir}"
20
+
21
+ # default setting
22
+ model_clone_batch=4
23
+ dataset_batch_size=48
24
+ model_clap_loss=0
25
+ model_clap_loss_type="mse" # option ce cosine l1
26
+ model_clap_loss_layer=0
27
+ average_top_k_layers=12
28
+ model_add_conv=false
29
+ model_depth=12
30
+ model_dispersive_loss=0
31
+ model_dispersive_loss_layer=0
32
+ checkpoint_keep_interval_updates=1 # TODO change this parameter if need
33
+ checkpoint_save_interval_updates=10000
34
+
35
+ if [[ $train_mode == "default" && ${config_option} -eq 0 ]]; then
36
+ echo "Config ${train_mode} ${config_option}"
37
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
38
+ task_load_clap_emb=false
39
+ task_load_source_file=true
40
+ task_load_mel_file=false
41
+ model_proj_type=null
42
+ model_clone_batch=4
43
+ dataset_batch_size=96
44
+ model_clap_loss=0
45
+ checkpoint_keep_interval_updates=-1
46
+ elif [[ $train_mode == "disp" && ${config_option} -eq 0 ]]; then
47
+ echo "Config ${train_mode} ${config_option}"
48
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
49
+ task_load_clap_emb=false
50
+ task_load_source_file=true
51
+ task_load_mel_file=false
52
+ model_proj_type=null
53
+ model_clone_batch=1
54
+ dataset_batch_size=384
55
+ model_dispersive_loss=1
56
+ model_dispersive_loss_layer=0
57
+ checkpoint_keep_interval_updates=1
58
+ elif [[ $train_mode == "disp" && ${config_option} -eq 1 ]]; then
59
+ echo "Config ${train_mode} ${config_option}"
60
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
61
+ task_load_clap_emb=false
62
+ task_load_source_file=true
63
+ task_load_mel_file=false
64
+ model_proj_type=null
65
+ model_clone_batch=4
66
+ dataset_batch_size=96
67
+ model_dispersive_loss=1
68
+ model_dispersive_loss_layer=0
69
+ checkpoint_keep_interval_updates=1
70
+ elif [[ $train_mode == "clap" && ${config_option} -eq 0 ]]; then
71
+ echo "Config ${train_mode} ${config_option}"
72
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
73
+ task_load_clap_emb=true
74
+ model_proj_type=2
75
+ model_clone_batch=4
76
+ dataset_batch_size=48
77
+ model_clap_loss=1.0
78
+ average_top_k_layers=12
79
+ model_add_conv=false
80
+ elif [[ $train_mode == "clap" && ${config_option} -eq 1 ]]; then
81
+ echo "Config ${train_mode} ${config_option}"
82
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
83
+ task_load_clap_emb=true
84
+ model_proj_type=2
85
+ model_clone_batch=4
86
+ dataset_batch_size=48
87
+ model_clap_loss=1.0
88
+ average_top_k_layers=1
89
+ # loss type ablation
90
+ elif [[ $train_mode == "clap" && ${config_option} -eq 2 ]]; then
91
+ echo "Config ${train_mode} ${config_option}"
92
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
93
+ task_load_clap_emb=true
94
+ model_proj_type=2
95
+ model_clone_batch=4
96
+ dataset_batch_size=48
97
+ model_clap_loss=1.0
98
+ average_top_k_layers=12
99
+ model_clap_loss_type="ce"
100
+ elif [[ $train_mode == "clap" && ${config_option} -eq 3 ]]; then
101
+ echo "Config ${train_mode} ${config_option}"
102
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
103
+ task_load_clap_emb=true
104
+ model_proj_type=2
105
+ model_clone_batch=4
106
+ dataset_batch_size=48
107
+ model_clap_loss=1.0
108
+ average_top_k_layers=12
109
+ model_clap_loss_type="l1"
110
+ elif [[ $train_mode == "clap" && ${config_option} -eq 4 ]]; then
111
+ echo "Config ${train_mode} ${config_option}"
112
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
113
+ task_load_clap_emb=true
114
+ model_proj_type=2
115
+ model_clone_batch=4
116
+ dataset_batch_size=96
117
+ model_clap_loss=1.0
118
+ average_top_k_layers=12
119
+ model_clap_loss_type="cosine"
120
+ # loss layer ablation
121
+ elif [[ $train_mode == "clap" && ${config_option} -eq 5 ]]; then
122
+ echo "Config ${train_mode} ${config_option}"
123
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
124
+ task_load_clap_emb=true
125
+ model_proj_type=2
126
+ model_clone_batch=4
127
+ dataset_batch_size=96
128
+ model_clap_loss=1.0
129
+ average_top_k_layers=12
130
+ model_clap_loss_type="mse"
131
+ model_clap_loss_layer=10
132
+ elif [[ $train_mode == "clap" && ${config_option} -eq 6 ]]; then
133
+ echo "Config ${train_mode} ${config_option}"
134
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
135
+ task_load_clap_emb=true
136
+ task_load_source_file=true
137
+ task_load_mel_file=false
138
+ model_proj_type=2
139
+ model_clone_batch=4
140
+ dataset_batch_size=96
141
+ model_clap_loss=1.0
142
+ average_top_k_layers=12
143
+ model_clap_loss_type="mse"
144
+ model_clap_loss_layer=8
145
+ elif [[ $train_mode == "clap" && ${config_option} -eq 7 ]]; then
146
+ echo "Config ${train_mode} ${config_option}"
147
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
148
+ task_load_clap_emb=true
149
+ task_load_source_file=true
150
+ task_load_mel_file=false
151
+ model_proj_type=2
152
+ model_clone_batch=4
153
+ dataset_batch_size=96
154
+ model_clap_loss=1.0
155
+ average_top_k_layers=12
156
+ model_clap_loss_type="mse"
157
+ model_clap_loss_layer=6
158
+ elif [[ $train_mode == "clap" && ${config_option} -eq 8 ]]; then
159
+ echo "Config ${train_mode} ${config_option}"
160
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
161
+ task_load_clap_emb=true
162
+ task_load_source_file=true
163
+ task_load_mel_file=false
164
+ model_proj_type=2
165
+ model_clone_batch=4
166
+ model_clap_loss=5.0
167
+ dataset_batch_size=96
168
+ average_top_k_layers=12
169
+ model_clap_loss_type="mse"
170
+ checkpoint_keep_interval_updates=-1
171
+ elif [[ $train_mode == "clap" && ${config_option} -eq 9 ]]; then
172
+ echo "Config ${train_mode} ${config_option}"
173
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
174
+ task_load_clap_emb=true
175
+ task_load_source_file=true
176
+ task_load_mel_file=false
177
+ model_proj_type=2
178
+ model_clone_batch=4
179
+ model_clap_loss=0.1
180
+ dataset_batch_size=96
181
+ average_top_k_layers=12
182
+ model_clap_loss_type="mse"
183
+ checkpoint_keep_interval_updates=-1
184
+ elif [[ $train_mode == "ast" && ${config_option} -eq 0 ]]; then
185
+ echo "Config ${train_mode} ${config_option}"
186
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
187
+ task_load_clap_emb=true
188
+ model_proj_type=4
189
+ model_clone_batch=4
190
+ model_clap_loss=1.0
191
+ dataset_batch_size=48
192
+ elif [[ $train_mode == "ast" && ${config_option} -eq 1 ]]; then
193
+ echo "Config ${train_mode} ${config_option}"
194
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
195
+ task_load_clap_emb=true
196
+ model_proj_type=4
197
+ model_clone_batch=4
198
+ model_clap_loss=0.001
199
+ dataset_batch_size=48
200
+ elif [[ $train_mode == "ast" && ${config_option} -eq 2 ]]; then
201
+ echo "Config ${train_mode} ${config_option}"
202
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
203
+ task_load_clap_emb=true
204
+ model_proj_type=4
205
+ model_clone_batch=4
206
+ model_clap_loss=0.01
207
+ dataset_batch_size=48
208
+ elif [[ $train_mode == "ast" && ${config_option} -eq 3 ]]; then
209
+ echo "Config ${train_mode} ${config_option}"
210
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_out
211
+ task_load_clap_emb=true
212
+ model_proj_type=6
213
+ model_clone_batch=4
214
+ dataset_batch_size=48
215
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 0 ]]; then
216
+ echo "Config ${train_mode} ${config_option}"
217
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
218
+ task_load_clap_emb=true
219
+ task_load_source_file=true
220
+ task_load_mel_file=false
221
+ model_proj_type=2
222
+ model_clone_batch=4
223
+ dataset_batch_size=64 # original 48 oom on 4090 24G change distributed_world_size
224
+ model_clap_loss=1.0
225
+ average_top_k_layers=11 # modify with model depth
226
+ model_add_conv=true
227
+ model_depth=11 #
228
+ checkpoint_keep_interval_updates=-1 # default 1
229
+ checkpoint_save_interval_updates=10000
230
+ fi
231
+
232
+ python fairseq_cli/hydra_train.py -m \
233
+ --config-dir ./EAT/config \
234
+ --config-name pretraining_AS2M \
235
+ common.user_dir=./EAT \
236
+ checkpoint.save_dir=${checkpoint_save_dir} \
237
+ checkpoint.restore_file=${checkpoint_restore_file} \
238
+ distributed_training.distributed_world_size=${1:-2} \
239
+ dataset.num_workers=24 \
240
+ dataset.data_buffer_size=48 \
241
+ dataset.batch_size=${dataset_batch_size} \
242
+ task.data=${task_data} \
243
+ task.h5_format=False \
244
+ task.load_clap_emb=${task_load_clap_emb} \
245
+ +task.load_source_file=${task_load_source_file} \
246
+ +task.load_mel_file=${task_load_mel_file} \
247
+ model.proj_type=${model_proj_type} \
248
+ model.clone_batch=${model_clone_batch} \
249
+ model.clap_loss=${model_clap_loss} \
250
+ model.average_top_k_layers=${average_top_k_layers} \
251
+ +model.add_conv=${model_add_conv} \
252
+ +model.clap_loss_type=${model_clap_loss_type} \
253
+ +model.clap_loss_layer=${model_clap_loss_layer} \
254
+ +model.dispersive_loss=${model_dispersive_loss} \
255
+ +model.dispersive_loss_layer=${model_dispersive_loss_layer} \
256
+ model.depth=${model_depth} \
257
+ checkpoint.keep_interval_updates=${checkpoint_keep_interval_updates} \
258
+ checkpoint.save_interval_updates=${checkpoint_save_interval_updates}
pre_4_AS2M/disp_1_2025-09-26_14-34-35/pretraining_AS2M.sh ADDED
@@ -0,0 +1,258 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ # config options
3
+ train_mode=disp
4
+ config_option=1
5
+ # change world size
6
+
7
+ # shared config
8
+ SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
9
+ checkpoint_save_dir=${SAVE_DIR_ROOT}/${train_mode}_${config_option}_$(date +"%Y-%m-%d_%H-%M-%S")
10
+ checkpoint_restore_file=${checkpoint_save_dir}/checkpoint_last.pt
11
+
12
+ # 脚本自身的绝对路径与文件名(解析符号链接)
13
+ script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
14
+ script_name="$(basename -- "$script_path")"
15
+ # 创建目录并拷贝(保留权限与时间戳)
16
+ mkdir -p -- "$checkpoint_save_dir"
17
+ cp -p -- "$script_path" "$checkpoint_save_dir/$script_name"
18
+ echo "script_path: ${script_path}"
19
+ echo "checkpoint_save_dir: ${checkpoint_save_dir}"
20
+
21
+ # default setting
22
+ model_clone_batch=4
23
+ dataset_batch_size=48
24
+ model_clap_loss=0
25
+ model_clap_loss_type="mse" # option ce cosine l1
26
+ model_clap_loss_layer=0
27
+ average_top_k_layers=12
28
+ model_add_conv=false
29
+ model_depth=12
30
+ model_dispersive_loss=0
31
+ model_dispersive_loss_layer=0
32
+ checkpoint_keep_interval_updates=1 # TODO change this parameter if need
33
+ checkpoint_save_interval_updates=10000
34
+
35
+ if [[ $train_mode == "default" && ${config_option} -eq 0 ]]; then
36
+ echo "Config ${train_mode} ${config_option}"
37
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
38
+ task_load_clap_emb=false
39
+ task_load_source_file=true
40
+ task_load_mel_file=false
41
+ model_proj_type=null
42
+ model_clone_batch=4
43
+ dataset_batch_size=96
44
+ model_clap_loss=0
45
+ checkpoint_keep_interval_updates=-1
46
+ elif [[ $train_mode == "disp" && ${config_option} -eq 0 ]]; then
47
+ echo "Config ${train_mode} ${config_option}"
48
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
49
+ task_load_clap_emb=false
50
+ task_load_source_file=true
51
+ task_load_mel_file=false
52
+ model_proj_type=null
53
+ model_clone_batch=1
54
+ dataset_batch_size=384
55
+ model_dispersive_loss=1
56
+ model_dispersive_loss_layer=0
57
+ checkpoint_keep_interval_updates=1
58
+ elif [[ $train_mode == "disp" && ${config_option} -eq 1 ]]; then
59
+ echo "Config ${train_mode} ${config_option}"
60
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
61
+ task_load_clap_emb=false
62
+ task_load_source_file=true
63
+ task_load_mel_file=false
64
+ model_proj_type=null
65
+ model_clone_batch=4
66
+ dataset_batch_size=96
67
+ model_dispersive_loss=1
68
+ model_dispersive_loss_layer=0
69
+ checkpoint_keep_interval_updates=1
70
+ elif [[ $train_mode == "clap" && ${config_option} -eq 0 ]]; then
71
+ echo "Config ${train_mode} ${config_option}"
72
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
73
+ task_load_clap_emb=true
74
+ model_proj_type=2
75
+ model_clone_batch=4
76
+ dataset_batch_size=48
77
+ model_clap_loss=1.0
78
+ average_top_k_layers=12
79
+ model_add_conv=false
80
+ elif [[ $train_mode == "clap" && ${config_option} -eq 1 ]]; then
81
+ echo "Config ${train_mode} ${config_option}"
82
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
83
+ task_load_clap_emb=true
84
+ model_proj_type=2
85
+ model_clone_batch=4
86
+ dataset_batch_size=48
87
+ model_clap_loss=1.0
88
+ average_top_k_layers=1
89
+ # loss type ablation
90
+ elif [[ $train_mode == "clap" && ${config_option} -eq 2 ]]; then
91
+ echo "Config ${train_mode} ${config_option}"
92
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
93
+ task_load_clap_emb=true
94
+ model_proj_type=2
95
+ model_clone_batch=4
96
+ dataset_batch_size=48
97
+ model_clap_loss=1.0
98
+ average_top_k_layers=12
99
+ model_clap_loss_type="ce"
100
+ elif [[ $train_mode == "clap" && ${config_option} -eq 3 ]]; then
101
+ echo "Config ${train_mode} ${config_option}"
102
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
103
+ task_load_clap_emb=true
104
+ model_proj_type=2
105
+ model_clone_batch=4
106
+ dataset_batch_size=48
107
+ model_clap_loss=1.0
108
+ average_top_k_layers=12
109
+ model_clap_loss_type="l1"
110
+ elif [[ $train_mode == "clap" && ${config_option} -eq 4 ]]; then
111
+ echo "Config ${train_mode} ${config_option}"
112
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
113
+ task_load_clap_emb=true
114
+ model_proj_type=2
115
+ model_clone_batch=4
116
+ dataset_batch_size=96
117
+ model_clap_loss=1.0
118
+ average_top_k_layers=12
119
+ model_clap_loss_type="cosine"
120
+ # loss layer ablation
121
+ elif [[ $train_mode == "clap" && ${config_option} -eq 5 ]]; then
122
+ echo "Config ${train_mode} ${config_option}"
123
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
124
+ task_load_clap_emb=true
125
+ model_proj_type=2
126
+ model_clone_batch=4
127
+ dataset_batch_size=96
128
+ model_clap_loss=1.0
129
+ average_top_k_layers=12
130
+ model_clap_loss_type="mse"
131
+ model_clap_loss_layer=10
132
+ elif [[ $train_mode == "clap" && ${config_option} -eq 6 ]]; then
133
+ echo "Config ${train_mode} ${config_option}"
134
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
135
+ task_load_clap_emb=true
136
+ task_load_source_file=true
137
+ task_load_mel_file=false
138
+ model_proj_type=2
139
+ model_clone_batch=4
140
+ dataset_batch_size=96
141
+ model_clap_loss=1.0
142
+ average_top_k_layers=12
143
+ model_clap_loss_type="mse"
144
+ model_clap_loss_layer=8
145
+ elif [[ $train_mode == "clap" && ${config_option} -eq 7 ]]; then
146
+ echo "Config ${train_mode} ${config_option}"
147
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
148
+ task_load_clap_emb=true
149
+ task_load_source_file=true
150
+ task_load_mel_file=false
151
+ model_proj_type=2
152
+ model_clone_batch=4
153
+ dataset_batch_size=96
154
+ model_clap_loss=1.0
155
+ average_top_k_layers=12
156
+ model_clap_loss_type="mse"
157
+ model_clap_loss_layer=6
158
+ elif [[ $train_mode == "clap" && ${config_option} -eq 8 ]]; then
159
+ echo "Config ${train_mode} ${config_option}"
160
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
161
+ task_load_clap_emb=true
162
+ task_load_source_file=true
163
+ task_load_mel_file=false
164
+ model_proj_type=2
165
+ model_clone_batch=4
166
+ model_clap_loss=5.0
167
+ dataset_batch_size=96
168
+ average_top_k_layers=12
169
+ model_clap_loss_type="mse"
170
+ checkpoint_keep_interval_updates=-1
171
+ elif [[ $train_mode == "clap" && ${config_option} -eq 9 ]]; then
172
+ echo "Config ${train_mode} ${config_option}"
173
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
174
+ task_load_clap_emb=true
175
+ task_load_source_file=true
176
+ task_load_mel_file=false
177
+ model_proj_type=2
178
+ model_clone_batch=4
179
+ model_clap_loss=0.1
180
+ dataset_batch_size=96
181
+ average_top_k_layers=12
182
+ model_clap_loss_type="mse"
183
+ checkpoint_keep_interval_updates=-1
184
+ elif [[ $train_mode == "ast" && ${config_option} -eq 0 ]]; then
185
+ echo "Config ${train_mode} ${config_option}"
186
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
187
+ task_load_clap_emb=true
188
+ model_proj_type=4
189
+ model_clone_batch=4
190
+ model_clap_loss=1.0
191
+ dataset_batch_size=48
192
+ elif [[ $train_mode == "ast" && ${config_option} -eq 1 ]]; then
193
+ echo "Config ${train_mode} ${config_option}"
194
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
195
+ task_load_clap_emb=true
196
+ model_proj_type=4
197
+ model_clone_batch=4
198
+ model_clap_loss=0.001
199
+ dataset_batch_size=48
200
+ elif [[ $train_mode == "ast" && ${config_option} -eq 2 ]]; then
201
+ echo "Config ${train_mode} ${config_option}"
202
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
203
+ task_load_clap_emb=true
204
+ model_proj_type=4
205
+ model_clone_batch=4
206
+ model_clap_loss=0.01
207
+ dataset_batch_size=48
208
+ elif [[ $train_mode == "ast" && ${config_option} -eq 3 ]]; then
209
+ echo "Config ${train_mode} ${config_option}"
210
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_out
211
+ task_load_clap_emb=true
212
+ model_proj_type=6
213
+ model_clone_batch=4
214
+ dataset_batch_size=48
215
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 0 ]]; then
216
+ echo "Config ${train_mode} ${config_option}"
217
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
218
+ task_load_clap_emb=true
219
+ task_load_source_file=true
220
+ task_load_mel_file=false
221
+ model_proj_type=2
222
+ model_clone_batch=4
223
+ dataset_batch_size=64 # original 48 oom on 4090 24G change distributed_world_size
224
+ model_clap_loss=1.0
225
+ average_top_k_layers=11 # modify with model depth
226
+ model_add_conv=true
227
+ model_depth=11 #
228
+ checkpoint_keep_interval_updates=-1 # default 1
229
+ checkpoint_save_interval_updates=10000
230
+ fi
231
+
232
+ python fairseq_cli/hydra_train.py -m \
233
+ --config-dir ./EAT/config \
234
+ --config-name pretraining_AS2M \
235
+ common.user_dir=./EAT \
236
+ checkpoint.save_dir=${checkpoint_save_dir} \
237
+ checkpoint.restore_file=${checkpoint_restore_file} \
238
+ distributed_training.distributed_world_size=${1:-2} \
239
+ dataset.num_workers=24 \
240
+ dataset.data_buffer_size=48 \
241
+ dataset.batch_size=${dataset_batch_size} \
242
+ task.data=${task_data} \
243
+ task.h5_format=False \
244
+ task.load_clap_emb=${task_load_clap_emb} \
245
+ +task.load_source_file=${task_load_source_file} \
246
+ +task.load_mel_file=${task_load_mel_file} \
247
+ model.proj_type=${model_proj_type} \
248
+ model.clone_batch=${model_clone_batch} \
249
+ model.clap_loss=${model_clap_loss} \
250
+ model.average_top_k_layers=${average_top_k_layers} \
251
+ +model.add_conv=${model_add_conv} \
252
+ +model.clap_loss_type=${model_clap_loss_type} \
253
+ +model.clap_loss_layer=${model_clap_loss_layer} \
254
+ +model.dispersive_loss=${model_dispersive_loss} \
255
+ +model.dispersive_loss_layer=${model_dispersive_loss_layer} \
256
+ model.depth=${model_depth} \
257
+ checkpoint.keep_interval_updates=${checkpoint_keep_interval_updates} \
258
+ checkpoint.save_interval_updates=${checkpoint_save_interval_updates}
pre_4_AS2M/disp_1_2025-09-26_14-39-04/pretraining_AS2M.sh ADDED
@@ -0,0 +1,258 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ # config options
3
+ train_mode=disp
4
+ config_option=1
5
+ # change world size
6
+
7
+ # shared config
8
+ SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
9
+ checkpoint_save_dir=${SAVE_DIR_ROOT}/${train_mode}_${config_option}_$(date +"%Y-%m-%d_%H-%M-%S")
10
+ checkpoint_restore_file=${checkpoint_save_dir}/checkpoint_last.pt
11
+
12
+ # 脚本自身的绝对路径与文件名(解析符号链接)
13
+ script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
14
+ script_name="$(basename -- "$script_path")"
15
+ # 创建目录并拷贝(保留权限与时间戳)
16
+ mkdir -p -- "$checkpoint_save_dir"
17
+ cp -p -- "$script_path" "$checkpoint_save_dir/$script_name"
18
+ echo "script_path: ${script_path}"
19
+ echo "checkpoint_save_dir: ${checkpoint_save_dir}"
20
+
21
+ # default setting
22
+ model_clone_batch=4
23
+ dataset_batch_size=48
24
+ model_clap_loss=0
25
+ model_clap_loss_type="mse" # option ce cosine l1
26
+ model_clap_loss_layer=0
27
+ average_top_k_layers=12
28
+ model_add_conv=false
29
+ model_depth=12
30
+ model_dispersive_loss=0
31
+ model_dispersive_loss_layer=0
32
+ checkpoint_keep_interval_updates=1 # TODO change this parameter if need
33
+ checkpoint_save_interval_updates=10000
34
+
35
+ if [[ $train_mode == "default" && ${config_option} -eq 0 ]]; then
36
+ echo "Config ${train_mode} ${config_option}"
37
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
38
+ task_load_clap_emb=false
39
+ task_load_source_file=true
40
+ task_load_mel_file=false
41
+ model_proj_type=null
42
+ model_clone_batch=4
43
+ dataset_batch_size=96
44
+ model_clap_loss=0
45
+ checkpoint_keep_interval_updates=-1
46
+ elif [[ $train_mode == "disp" && ${config_option} -eq 0 ]]; then
47
+ echo "Config ${train_mode} ${config_option}"
48
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
49
+ task_load_clap_emb=false
50
+ task_load_source_file=true
51
+ task_load_mel_file=false
52
+ model_proj_type=null
53
+ model_clone_batch=1
54
+ dataset_batch_size=384
55
+ model_dispersive_loss=1
56
+ model_dispersive_loss_layer=0
57
+ checkpoint_keep_interval_updates=1
58
+ elif [[ $train_mode == "disp" && ${config_option} -eq 1 ]]; then
59
+ echo "Config ${train_mode} ${config_option}"
60
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
61
+ task_load_clap_emb=false
62
+ task_load_source_file=true
63
+ task_load_mel_file=false
64
+ model_proj_type=null
65
+ model_clone_batch=4
66
+ dataset_batch_size=96
67
+ model_dispersive_loss=1
68
+ model_dispersive_loss_layer=0
69
+ checkpoint_keep_interval_updates=1
70
+ elif [[ $train_mode == "clap" && ${config_option} -eq 0 ]]; then
71
+ echo "Config ${train_mode} ${config_option}"
72
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
73
+ task_load_clap_emb=true
74
+ model_proj_type=2
75
+ model_clone_batch=4
76
+ dataset_batch_size=48
77
+ model_clap_loss=1.0
78
+ average_top_k_layers=12
79
+ model_add_conv=false
80
+ elif [[ $train_mode == "clap" && ${config_option} -eq 1 ]]; then
81
+ echo "Config ${train_mode} ${config_option}"
82
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
83
+ task_load_clap_emb=true
84
+ model_proj_type=2
85
+ model_clone_batch=4
86
+ dataset_batch_size=48
87
+ model_clap_loss=1.0
88
+ average_top_k_layers=1
89
+ # loss type ablation
90
+ elif [[ $train_mode == "clap" && ${config_option} -eq 2 ]]; then
91
+ echo "Config ${train_mode} ${config_option}"
92
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
93
+ task_load_clap_emb=true
94
+ model_proj_type=2
95
+ model_clone_batch=4
96
+ dataset_batch_size=48
97
+ model_clap_loss=1.0
98
+ average_top_k_layers=12
99
+ model_clap_loss_type="ce"
100
+ elif [[ $train_mode == "clap" && ${config_option} -eq 3 ]]; then
101
+ echo "Config ${train_mode} ${config_option}"
102
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
103
+ task_load_clap_emb=true
104
+ model_proj_type=2
105
+ model_clone_batch=4
106
+ dataset_batch_size=48
107
+ model_clap_loss=1.0
108
+ average_top_k_layers=12
109
+ model_clap_loss_type="l1"
110
+ elif [[ $train_mode == "clap" && ${config_option} -eq 4 ]]; then
111
+ echo "Config ${train_mode} ${config_option}"
112
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
113
+ task_load_clap_emb=true
114
+ model_proj_type=2
115
+ model_clone_batch=4
116
+ dataset_batch_size=96
117
+ model_clap_loss=1.0
118
+ average_top_k_layers=12
119
+ model_clap_loss_type="cosine"
120
+ # loss layer ablation
121
+ elif [[ $train_mode == "clap" && ${config_option} -eq 5 ]]; then
122
+ echo "Config ${train_mode} ${config_option}"
123
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
124
+ task_load_clap_emb=true
125
+ model_proj_type=2
126
+ model_clone_batch=4
127
+ dataset_batch_size=96
128
+ model_clap_loss=1.0
129
+ average_top_k_layers=12
130
+ model_clap_loss_type="mse"
131
+ model_clap_loss_layer=10
132
+ elif [[ $train_mode == "clap" && ${config_option} -eq 6 ]]; then
133
+ echo "Config ${train_mode} ${config_option}"
134
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
135
+ task_load_clap_emb=true
136
+ task_load_source_file=true
137
+ task_load_mel_file=false
138
+ model_proj_type=2
139
+ model_clone_batch=4
140
+ dataset_batch_size=96
141
+ model_clap_loss=1.0
142
+ average_top_k_layers=12
143
+ model_clap_loss_type="mse"
144
+ model_clap_loss_layer=8
145
+ elif [[ $train_mode == "clap" && ${config_option} -eq 7 ]]; then
146
+ echo "Config ${train_mode} ${config_option}"
147
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
148
+ task_load_clap_emb=true
149
+ task_load_source_file=true
150
+ task_load_mel_file=false
151
+ model_proj_type=2
152
+ model_clone_batch=4
153
+ dataset_batch_size=96
154
+ model_clap_loss=1.0
155
+ average_top_k_layers=12
156
+ model_clap_loss_type="mse"
157
+ model_clap_loss_layer=6
158
+ elif [[ $train_mode == "clap" && ${config_option} -eq 8 ]]; then
159
+ echo "Config ${train_mode} ${config_option}"
160
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
161
+ task_load_clap_emb=true
162
+ task_load_source_file=true
163
+ task_load_mel_file=false
164
+ model_proj_type=2
165
+ model_clone_batch=4
166
+ model_clap_loss=5.0
167
+ dataset_batch_size=96
168
+ average_top_k_layers=12
169
+ model_clap_loss_type="mse"
170
+ checkpoint_keep_interval_updates=-1
171
+ elif [[ $train_mode == "clap" && ${config_option} -eq 9 ]]; then
172
+ echo "Config ${train_mode} ${config_option}"
173
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
174
+ task_load_clap_emb=true
175
+ task_load_source_file=true
176
+ task_load_mel_file=false
177
+ model_proj_type=2
178
+ model_clone_batch=4
179
+ model_clap_loss=0.1
180
+ dataset_batch_size=96
181
+ average_top_k_layers=12
182
+ model_clap_loss_type="mse"
183
+ checkpoint_keep_interval_updates=-1
184
+ elif [[ $train_mode == "ast" && ${config_option} -eq 0 ]]; then
185
+ echo "Config ${train_mode} ${config_option}"
186
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
187
+ task_load_clap_emb=true
188
+ model_proj_type=4
189
+ model_clone_batch=4
190
+ model_clap_loss=1.0
191
+ dataset_batch_size=48
192
+ elif [[ $train_mode == "ast" && ${config_option} -eq 1 ]]; then
193
+ echo "Config ${train_mode} ${config_option}"
194
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
195
+ task_load_clap_emb=true
196
+ model_proj_type=4
197
+ model_clone_batch=4
198
+ model_clap_loss=0.001
199
+ dataset_batch_size=48
200
+ elif [[ $train_mode == "ast" && ${config_option} -eq 2 ]]; then
201
+ echo "Config ${train_mode} ${config_option}"
202
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
203
+ task_load_clap_emb=true
204
+ model_proj_type=4
205
+ model_clone_batch=4
206
+ model_clap_loss=0.01
207
+ dataset_batch_size=48
208
+ elif [[ $train_mode == "ast" && ${config_option} -eq 3 ]]; then
209
+ echo "Config ${train_mode} ${config_option}"
210
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_out
211
+ task_load_clap_emb=true
212
+ model_proj_type=6
213
+ model_clone_batch=4
214
+ dataset_batch_size=48
215
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 0 ]]; then
216
+ echo "Config ${train_mode} ${config_option}"
217
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
218
+ task_load_clap_emb=true
219
+ task_load_source_file=true
220
+ task_load_mel_file=false
221
+ model_proj_type=2
222
+ model_clone_batch=4
223
+ dataset_batch_size=64 # original 48 oom on 4090 24G change distributed_world_size
224
+ model_clap_loss=1.0
225
+ average_top_k_layers=11 # modify with model depth
226
+ model_add_conv=true
227
+ model_depth=11 #
228
+ checkpoint_keep_interval_updates=-1 # default 1
229
+ checkpoint_save_interval_updates=10000
230
+ fi
231
+
232
+ python fairseq_cli/hydra_train.py -m \
233
+ --config-dir ./EAT/config \
234
+ --config-name pretraining_AS2M \
235
+ common.user_dir=./EAT \
236
+ checkpoint.save_dir=${checkpoint_save_dir} \
237
+ checkpoint.restore_file=${checkpoint_restore_file} \
238
+ distributed_training.distributed_world_size=${1:-2} \
239
+ dataset.num_workers=24 \
240
+ dataset.data_buffer_size=48 \
241
+ dataset.batch_size=${dataset_batch_size} \
242
+ task.data=${task_data} \
243
+ task.h5_format=False \
244
+ task.load_clap_emb=${task_load_clap_emb} \
245
+ +task.load_source_file=${task_load_source_file} \
246
+ +task.load_mel_file=${task_load_mel_file} \
247
+ model.proj_type=${model_proj_type} \
248
+ model.clone_batch=${model_clone_batch} \
249
+ model.clap_loss=${model_clap_loss} \
250
+ model.average_top_k_layers=${average_top_k_layers} \
251
+ +model.add_conv=${model_add_conv} \
252
+ +model.clap_loss_type=${model_clap_loss_type} \
253
+ +model.clap_loss_layer=${model_clap_loss_layer} \
254
+ +model.dispersive_loss=${model_dispersive_loss} \
255
+ +model.dispersive_loss_layer=${model_dispersive_loss_layer} \
256
+ model.depth=${model_depth} \
257
+ checkpoint.keep_interval_updates=${checkpoint_keep_interval_updates} \
258
+ checkpoint.save_interval_updates=${checkpoint_save_interval_updates}
pre_4_AS2M/disp_1_2025-09-26_14-57-51/pretraining_AS2M.sh ADDED
@@ -0,0 +1,258 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ # config options
3
+ train_mode=disp
4
+ config_option=1
5
+ # change world size
6
+
7
+ # shared config
8
+ SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
9
+ checkpoint_save_dir=${SAVE_DIR_ROOT}/${train_mode}_${config_option}_$(date +"%Y-%m-%d_%H-%M-%S")
10
+ checkpoint_restore_file=${checkpoint_save_dir}/checkpoint_last.pt
11
+
12
+ # 脚本自身的绝对路径与文件名(解析符号链接)
13
+ script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
14
+ script_name="$(basename -- "$script_path")"
15
+ # 创建目录并拷贝(保留权限与时间戳)
16
+ mkdir -p -- "$checkpoint_save_dir"
17
+ cp -p -- "$script_path" "$checkpoint_save_dir/$script_name"
18
+ echo "script_path: ${script_path}"
19
+ echo "checkpoint_save_dir: ${checkpoint_save_dir}"
20
+
21
+ # default setting
22
+ model_clone_batch=4
23
+ dataset_batch_size=48
24
+ model_clap_loss=0
25
+ model_clap_loss_type="mse" # option ce cosine l1
26
+ model_clap_loss_layer=0
27
+ average_top_k_layers=12
28
+ model_add_conv=false
29
+ model_depth=12
30
+ model_dispersive_loss=0
31
+ model_dispersive_loss_layer=0
32
+ checkpoint_keep_interval_updates=1 # TODO change this parameter if need
33
+ checkpoint_save_interval_updates=10000
34
+
35
+ if [[ $train_mode == "default" && ${config_option} -eq 0 ]]; then
36
+ echo "Config ${train_mode} ${config_option}"
37
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
38
+ task_load_clap_emb=false
39
+ task_load_source_file=true
40
+ task_load_mel_file=false
41
+ model_proj_type=null
42
+ model_clone_batch=4
43
+ dataset_batch_size=96
44
+ model_clap_loss=0
45
+ checkpoint_keep_interval_updates=-1
46
+ elif [[ $train_mode == "disp" && ${config_option} -eq 0 ]]; then
47
+ echo "Config ${train_mode} ${config_option}"
48
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
49
+ task_load_clap_emb=false
50
+ task_load_source_file=true
51
+ task_load_mel_file=false
52
+ model_proj_type=null
53
+ model_clone_batch=4
54
+ dataset_batch_size=96
55
+ model_dispersive_loss=1
56
+ model_dispersive_loss_layer=0
57
+ checkpoint_keep_interval_updates=1
58
+ elif [[ $train_mode == "disp" && ${config_option} -eq 1 ]]; then
59
+ echo "Config ${train_mode} ${config_option}"
60
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
61
+ task_load_clap_emb=false
62
+ task_load_source_file=true
63
+ task_load_mel_file=false
64
+ model_proj_type=null
65
+ model_clone_batch=1
66
+ dataset_batch_size=384
67
+ model_dispersive_loss=1
68
+ model_dispersive_loss_layer=0
69
+ checkpoint_keep_interval_updates=1
70
+ elif [[ $train_mode == "clap" && ${config_option} -eq 0 ]]; then
71
+ echo "Config ${train_mode} ${config_option}"
72
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
73
+ task_load_clap_emb=true
74
+ model_proj_type=2
75
+ model_clone_batch=4
76
+ dataset_batch_size=48
77
+ model_clap_loss=1.0
78
+ average_top_k_layers=12
79
+ model_add_conv=false
80
+ elif [[ $train_mode == "clap" && ${config_option} -eq 1 ]]; then
81
+ echo "Config ${train_mode} ${config_option}"
82
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
83
+ task_load_clap_emb=true
84
+ model_proj_type=2
85
+ model_clone_batch=4
86
+ dataset_batch_size=48
87
+ model_clap_loss=1.0
88
+ average_top_k_layers=1
89
+ # loss type ablation
90
+ elif [[ $train_mode == "clap" && ${config_option} -eq 2 ]]; then
91
+ echo "Config ${train_mode} ${config_option}"
92
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
93
+ task_load_clap_emb=true
94
+ model_proj_type=2
95
+ model_clone_batch=4
96
+ dataset_batch_size=48
97
+ model_clap_loss=1.0
98
+ average_top_k_layers=12
99
+ model_clap_loss_type="ce"
100
+ elif [[ $train_mode == "clap" && ${config_option} -eq 3 ]]; then
101
+ echo "Config ${train_mode} ${config_option}"
102
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
103
+ task_load_clap_emb=true
104
+ model_proj_type=2
105
+ model_clone_batch=4
106
+ dataset_batch_size=48
107
+ model_clap_loss=1.0
108
+ average_top_k_layers=12
109
+ model_clap_loss_type="l1"
110
+ elif [[ $train_mode == "clap" && ${config_option} -eq 4 ]]; then
111
+ echo "Config ${train_mode} ${config_option}"
112
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
113
+ task_load_clap_emb=true
114
+ model_proj_type=2
115
+ model_clone_batch=4
116
+ dataset_batch_size=96
117
+ model_clap_loss=1.0
118
+ average_top_k_layers=12
119
+ model_clap_loss_type="cosine"
120
+ # loss layer ablation
121
+ elif [[ $train_mode == "clap" && ${config_option} -eq 5 ]]; then
122
+ echo "Config ${train_mode} ${config_option}"
123
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
124
+ task_load_clap_emb=true
125
+ model_proj_type=2
126
+ model_clone_batch=4
127
+ dataset_batch_size=96
128
+ model_clap_loss=1.0
129
+ average_top_k_layers=12
130
+ model_clap_loss_type="mse"
131
+ model_clap_loss_layer=10
132
+ elif [[ $train_mode == "clap" && ${config_option} -eq 6 ]]; then
133
+ echo "Config ${train_mode} ${config_option}"
134
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
135
+ task_load_clap_emb=true
136
+ task_load_source_file=true
137
+ task_load_mel_file=false
138
+ model_proj_type=2
139
+ model_clone_batch=4
140
+ dataset_batch_size=96
141
+ model_clap_loss=1.0
142
+ average_top_k_layers=12
143
+ model_clap_loss_type="mse"
144
+ model_clap_loss_layer=8
145
+ elif [[ $train_mode == "clap" && ${config_option} -eq 7 ]]; then
146
+ echo "Config ${train_mode} ${config_option}"
147
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
148
+ task_load_clap_emb=true
149
+ task_load_source_file=true
150
+ task_load_mel_file=false
151
+ model_proj_type=2
152
+ model_clone_batch=4
153
+ dataset_batch_size=96
154
+ model_clap_loss=1.0
155
+ average_top_k_layers=12
156
+ model_clap_loss_type="mse"
157
+ model_clap_loss_layer=6
158
+ elif [[ $train_mode == "clap" && ${config_option} -eq 8 ]]; then
159
+ echo "Config ${train_mode} ${config_option}"
160
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
161
+ task_load_clap_emb=true
162
+ task_load_source_file=true
163
+ task_load_mel_file=false
164
+ model_proj_type=2
165
+ model_clone_batch=4
166
+ model_clap_loss=5.0
167
+ dataset_batch_size=96
168
+ average_top_k_layers=12
169
+ model_clap_loss_type="mse"
170
+ checkpoint_keep_interval_updates=-1
171
+ elif [[ $train_mode == "clap" && ${config_option} -eq 9 ]]; then
172
+ echo "Config ${train_mode} ${config_option}"
173
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
174
+ task_load_clap_emb=true
175
+ task_load_source_file=true
176
+ task_load_mel_file=false
177
+ model_proj_type=2
178
+ model_clone_batch=4
179
+ model_clap_loss=0.1
180
+ dataset_batch_size=96
181
+ average_top_k_layers=12
182
+ model_clap_loss_type="mse"
183
+ checkpoint_keep_interval_updates=-1
184
+ elif [[ $train_mode == "ast" && ${config_option} -eq 0 ]]; then
185
+ echo "Config ${train_mode} ${config_option}"
186
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
187
+ task_load_clap_emb=true
188
+ model_proj_type=4
189
+ model_clone_batch=4
190
+ model_clap_loss=1.0
191
+ dataset_batch_size=48
192
+ elif [[ $train_mode == "ast" && ${config_option} -eq 1 ]]; then
193
+ echo "Config ${train_mode} ${config_option}"
194
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
195
+ task_load_clap_emb=true
196
+ model_proj_type=4
197
+ model_clone_batch=4
198
+ model_clap_loss=0.001
199
+ dataset_batch_size=48
200
+ elif [[ $train_mode == "ast" && ${config_option} -eq 2 ]]; then
201
+ echo "Config ${train_mode} ${config_option}"
202
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
203
+ task_load_clap_emb=true
204
+ model_proj_type=4
205
+ model_clone_batch=4
206
+ model_clap_loss=0.01
207
+ dataset_batch_size=48
208
+ elif [[ $train_mode == "ast" && ${config_option} -eq 3 ]]; then
209
+ echo "Config ${train_mode} ${config_option}"
210
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_out
211
+ task_load_clap_emb=true
212
+ model_proj_type=6
213
+ model_clone_batch=4
214
+ dataset_batch_size=48
215
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 0 ]]; then
216
+ echo "Config ${train_mode} ${config_option}"
217
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
218
+ task_load_clap_emb=true
219
+ task_load_source_file=true
220
+ task_load_mel_file=false
221
+ model_proj_type=2
222
+ model_clone_batch=4
223
+ dataset_batch_size=64 # original 48 oom on 4090 24G change distributed_world_size
224
+ model_clap_loss=1.0
225
+ average_top_k_layers=11 # modify with model depth
226
+ model_add_conv=true
227
+ model_depth=11 #
228
+ checkpoint_keep_interval_updates=-1 # default 1
229
+ checkpoint_save_interval_updates=10000
230
+ fi
231
+
232
+ python fairseq_cli/hydra_train.py -m \
233
+ --config-dir ./EAT/config \
234
+ --config-name pretraining_AS2M \
235
+ common.user_dir=./EAT \
236
+ checkpoint.save_dir=${checkpoint_save_dir} \
237
+ checkpoint.restore_file=${checkpoint_restore_file} \
238
+ distributed_training.distributed_world_size=${1:-2} \
239
+ dataset.num_workers=24 \
240
+ dataset.data_buffer_size=48 \
241
+ dataset.batch_size=${dataset_batch_size} \
242
+ task.data=${task_data} \
243
+ task.h5_format=False \
244
+ task.load_clap_emb=${task_load_clap_emb} \
245
+ +task.load_source_file=${task_load_source_file} \
246
+ +task.load_mel_file=${task_load_mel_file} \
247
+ model.proj_type=${model_proj_type} \
248
+ model.clone_batch=${model_clone_batch} \
249
+ model.clap_loss=${model_clap_loss} \
250
+ model.average_top_k_layers=${average_top_k_layers} \
251
+ +model.add_conv=${model_add_conv} \
252
+ +model.clap_loss_type=${model_clap_loss_type} \
253
+ +model.clap_loss_layer=${model_clap_loss_layer} \
254
+ +model.dispersive_loss=${model_dispersive_loss} \
255
+ +model.dispersive_loss_layer=${model_dispersive_loss_layer} \
256
+ model.depth=${model_depth} \
257
+ checkpoint.keep_interval_updates=${checkpoint_keep_interval_updates} \
258
+ checkpoint.save_interval_updates=${checkpoint_save_interval_updates}
pre_4_AS2M/disp_3_2025-09-27_05-57-32/pretraining_AS2M.sh ADDED
@@ -0,0 +1,282 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ # config options
3
+ train_mode=disp
4
+ config_option=3
5
+ # change world size
6
+
7
+ # shared config
8
+ SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
9
+ checkpoint_save_dir=${SAVE_DIR_ROOT}/${train_mode}_${config_option}_$(date +"%Y-%m-%d_%H-%M-%S")
10
+ checkpoint_restore_file=${checkpoint_save_dir}/checkpoint_last.pt
11
+
12
+ # 脚本自身的绝对路径与文件名(解析符号链接)
13
+ script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
14
+ script_name="$(basename -- "$script_path")"
15
+ # 创建目录并拷贝(保留权限与时间戳)
16
+ mkdir -p -- "$checkpoint_save_dir"
17
+ cp -p -- "$script_path" "$checkpoint_save_dir/$script_name"
18
+ echo "script_path: ${script_path}"
19
+ echo "checkpoint_save_dir: ${checkpoint_save_dir}"
20
+
21
+ # default setting
22
+ model_clone_batch=4
23
+ dataset_batch_size=48
24
+ model_clap_loss=0
25
+ model_clap_loss_type="mse" # option ce cosine l1
26
+ model_clap_loss_layer=0
27
+ average_top_k_layers=12
28
+ model_add_conv=false
29
+ model_depth=12
30
+ model_dispersive_loss=0
31
+ model_dispersive_loss_layer=0
32
+ checkpoint_keep_interval_updates=1 # TODO change this parameter if need
33
+ checkpoint_save_interval_updates=10000
34
+
35
+ if [[ $train_mode == "default" && ${config_option} -eq 0 ]]; then
36
+ echo "Config ${train_mode} ${config_option}"
37
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
38
+ task_load_clap_emb=false
39
+ task_load_source_file=true
40
+ task_load_mel_file=false
41
+ model_proj_type=null
42
+ model_clone_batch=4
43
+ dataset_batch_size=96
44
+ model_clap_loss=0
45
+ checkpoint_keep_interval_updates=-1
46
+ elif [[ $train_mode == "disp" && ${config_option} -eq 0 ]]; then
47
+ echo "Config ${train_mode} ${config_option}"
48
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
49
+ task_load_clap_emb=false
50
+ task_load_source_file=true
51
+ task_load_mel_file=false
52
+ model_proj_type=null
53
+ model_clone_batch=4
54
+ dataset_batch_size=96
55
+ model_dispersive_loss=1
56
+ model_dispersive_loss_layer=0
57
+ checkpoint_keep_interval_updates=1
58
+ elif [[ $train_mode == "disp" && ${config_option} -eq 1 ]]; then
59
+ echo "Config ${train_mode} ${config_option}"
60
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
61
+ task_load_clap_emb=false
62
+ task_load_source_file=true
63
+ task_load_mel_file=false
64
+ model_proj_type=null
65
+ model_clone_batch=1
66
+ dataset_batch_size=384
67
+ model_dispersive_loss=1
68
+ model_dispersive_loss_layer=0
69
+ checkpoint_keep_interval_updates=1
70
+ elif [[ $train_mode == "disp" && ${config_option} -eq 2 ]]; then
71
+ echo "Config ${train_mode} ${config_option}"
72
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
73
+ task_load_clap_emb=false
74
+ task_load_source_file=true
75
+ task_load_mel_file=false
76
+ model_proj_type=null
77
+ model_clone_batch=1
78
+ dataset_batch_size=384
79
+ model_dispersive_loss=10.0
80
+ model_dispersive_loss_layer=0
81
+ checkpoint_keep_interval_updates=1
82
+ elif [[ $train_mode == "disp" && ${config_option} -eq 3 ]]; then
83
+ echo "Config ${train_mode} ${config_option}"
84
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
85
+ task_load_clap_emb=false
86
+ task_load_source_file=true
87
+ task_load_mel_file=false
88
+ model_proj_type=null
89
+ model_clone_batch=1
90
+ dataset_batch_size=384
91
+ model_dispersive_loss=100.0
92
+ model_dispersive_loss_layer=0
93
+ checkpoint_keep_interval_updates=1
94
+ elif [[ $train_mode == "clap" && ${config_option} -eq 0 ]]; then
95
+ echo "Config ${train_mode} ${config_option}"
96
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
97
+ task_load_clap_emb=true
98
+ model_proj_type=2
99
+ model_clone_batch=4
100
+ dataset_batch_size=48
101
+ model_clap_loss=1.0
102
+ average_top_k_layers=12
103
+ model_add_conv=false
104
+ elif [[ $train_mode == "clap" && ${config_option} -eq 1 ]]; then
105
+ echo "Config ${train_mode} ${config_option}"
106
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
107
+ task_load_clap_emb=true
108
+ model_proj_type=2
109
+ model_clone_batch=4
110
+ dataset_batch_size=48
111
+ model_clap_loss=1.0
112
+ average_top_k_layers=1
113
+ # loss type ablation
114
+ elif [[ $train_mode == "clap" && ${config_option} -eq 2 ]]; then
115
+ echo "Config ${train_mode} ${config_option}"
116
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
117
+ task_load_clap_emb=true
118
+ model_proj_type=2
119
+ model_clone_batch=4
120
+ dataset_batch_size=48
121
+ model_clap_loss=1.0
122
+ average_top_k_layers=12
123
+ model_clap_loss_type="ce"
124
+ elif [[ $train_mode == "clap" && ${config_option} -eq 3 ]]; then
125
+ echo "Config ${train_mode} ${config_option}"
126
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
127
+ task_load_clap_emb=true
128
+ model_proj_type=2
129
+ model_clone_batch=4
130
+ dataset_batch_size=48
131
+ model_clap_loss=1.0
132
+ average_top_k_layers=12
133
+ model_clap_loss_type="l1"
134
+ elif [[ $train_mode == "clap" && ${config_option} -eq 4 ]]; then
135
+ echo "Config ${train_mode} ${config_option}"
136
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
137
+ task_load_clap_emb=true
138
+ model_proj_type=2
139
+ model_clone_batch=4
140
+ dataset_batch_size=96
141
+ model_clap_loss=1.0
142
+ average_top_k_layers=12
143
+ model_clap_loss_type="cosine"
144
+ # loss layer ablation
145
+ elif [[ $train_mode == "clap" && ${config_option} -eq 5 ]]; then
146
+ echo "Config ${train_mode} ${config_option}"
147
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
148
+ task_load_clap_emb=true
149
+ model_proj_type=2
150
+ model_clone_batch=4
151
+ dataset_batch_size=96
152
+ model_clap_loss=1.0
153
+ average_top_k_layers=12
154
+ model_clap_loss_type="mse"
155
+ model_clap_loss_layer=10
156
+ elif [[ $train_mode == "clap" && ${config_option} -eq 6 ]]; then
157
+ echo "Config ${train_mode} ${config_option}"
158
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
159
+ task_load_clap_emb=true
160
+ task_load_source_file=true
161
+ task_load_mel_file=false
162
+ model_proj_type=2
163
+ model_clone_batch=4
164
+ dataset_batch_size=96
165
+ model_clap_loss=1.0
166
+ average_top_k_layers=12
167
+ model_clap_loss_type="mse"
168
+ model_clap_loss_layer=8
169
+ elif [[ $train_mode == "clap" && ${config_option} -eq 7 ]]; then
170
+ echo "Config ${train_mode} ${config_option}"
171
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
172
+ task_load_clap_emb=true
173
+ task_load_source_file=true
174
+ task_load_mel_file=false
175
+ model_proj_type=2
176
+ model_clone_batch=4
177
+ dataset_batch_size=96
178
+ model_clap_loss=1.0
179
+ average_top_k_layers=12
180
+ model_clap_loss_type="mse"
181
+ model_clap_loss_layer=6
182
+ elif [[ $train_mode == "clap" && ${config_option} -eq 8 ]]; then
183
+ echo "Config ${train_mode} ${config_option}"
184
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
185
+ task_load_clap_emb=true
186
+ task_load_source_file=true
187
+ task_load_mel_file=false
188
+ model_proj_type=2
189
+ model_clone_batch=4
190
+ model_clap_loss=5.0
191
+ dataset_batch_size=96
192
+ average_top_k_layers=12
193
+ model_clap_loss_type="mse"
194
+ checkpoint_keep_interval_updates=-1
195
+ elif [[ $train_mode == "clap" && ${config_option} -eq 9 ]]; then
196
+ echo "Config ${train_mode} ${config_option}"
197
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
198
+ task_load_clap_emb=true
199
+ task_load_source_file=true
200
+ task_load_mel_file=false
201
+ model_proj_type=2
202
+ model_clone_batch=4
203
+ model_clap_loss=0.1
204
+ dataset_batch_size=96
205
+ average_top_k_layers=12
206
+ model_clap_loss_type="mse"
207
+ checkpoint_keep_interval_updates=-1
208
+ elif [[ $train_mode == "ast" && ${config_option} -eq 0 ]]; then
209
+ echo "Config ${train_mode} ${config_option}"
210
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
211
+ task_load_clap_emb=true
212
+ model_proj_type=4
213
+ model_clone_batch=4
214
+ model_clap_loss=1.0
215
+ dataset_batch_size=48
216
+ elif [[ $train_mode == "ast" && ${config_option} -eq 1 ]]; then
217
+ echo "Config ${train_mode} ${config_option}"
218
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
219
+ task_load_clap_emb=true
220
+ model_proj_type=4
221
+ model_clone_batch=4
222
+ model_clap_loss=0.001
223
+ dataset_batch_size=48
224
+ elif [[ $train_mode == "ast" && ${config_option} -eq 2 ]]; then
225
+ echo "Config ${train_mode} ${config_option}"
226
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
227
+ task_load_clap_emb=true
228
+ model_proj_type=4
229
+ model_clone_batch=4
230
+ model_clap_loss=0.01
231
+ dataset_batch_size=48
232
+ elif [[ $train_mode == "ast" && ${config_option} -eq 3 ]]; then
233
+ echo "Config ${train_mode} ${config_option}"
234
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_out
235
+ task_load_clap_emb=true
236
+ model_proj_type=6
237
+ model_clone_batch=4
238
+ dataset_batch_size=48
239
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 0 ]]; then
240
+ echo "Config ${train_mode} ${config_option}"
241
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
242
+ task_load_clap_emb=true
243
+ task_load_source_file=true
244
+ task_load_mel_file=false
245
+ model_proj_type=2
246
+ model_clone_batch=4
247
+ dataset_batch_size=64 # original 48 oom on 4090 24G change distributed_world_size
248
+ model_clap_loss=1.0
249
+ average_top_k_layers=11 # modify with model depth
250
+ model_add_conv=true
251
+ model_depth=11 #
252
+ checkpoint_keep_interval_updates=-1 # default 1
253
+ checkpoint_save_interval_updates=10000
254
+ fi
255
+
256
+ python fairseq_cli/hydra_train.py -m \
257
+ --config-dir ./EAT/config \
258
+ --config-name pretraining_AS2M \
259
+ common.user_dir=./EAT \
260
+ checkpoint.save_dir=${checkpoint_save_dir} \
261
+ checkpoint.restore_file=${checkpoint_restore_file} \
262
+ distributed_training.distributed_world_size=${1:-2} \
263
+ dataset.num_workers=24 \
264
+ dataset.data_buffer_size=48 \
265
+ dataset.batch_size=${dataset_batch_size} \
266
+ task.data=${task_data} \
267
+ task.h5_format=False \
268
+ task.load_clap_emb=${task_load_clap_emb} \
269
+ +task.load_source_file=${task_load_source_file} \
270
+ +task.load_mel_file=${task_load_mel_file} \
271
+ model.proj_type=${model_proj_type} \
272
+ model.clone_batch=${model_clone_batch} \
273
+ model.clap_loss=${model_clap_loss} \
274
+ model.average_top_k_layers=${average_top_k_layers} \
275
+ +model.add_conv=${model_add_conv} \
276
+ +model.clap_loss_type=${model_clap_loss_type} \
277
+ +model.clap_loss_layer=${model_clap_loss_layer} \
278
+ +model.dispersive_loss=${model_dispersive_loss} \
279
+ +model.dispersive_loss_layer=${model_dispersive_loss_layer} \
280
+ model.depth=${model_depth} \
281
+ checkpoint.keep_interval_updates=${checkpoint_keep_interval_updates} \
282
+ checkpoint.save_interval_updates=${checkpoint_save_interval_updates}
pre_4_AS2M/disp_4_2025-09-28_05-38-34/pretraining_AS2M.sh ADDED
@@ -0,0 +1,294 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ # config options
3
+ train_mode=disp
4
+ config_option=4
5
+ # change world size
6
+
7
+ # shared config
8
+ SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
9
+ checkpoint_save_dir=${SAVE_DIR_ROOT}/${train_mode}_${config_option}_$(date +"%Y-%m-%d_%H-%M-%S")
10
+ checkpoint_restore_file=${checkpoint_save_dir}/checkpoint_last.pt
11
+
12
+ # 脚本自身的绝对路径与文件名(解析符号链接)
13
+ script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
14
+ script_name="$(basename -- "$script_path")"
15
+ # 创建目录并拷贝(保留权限与时间戳)
16
+ mkdir -p -- "$checkpoint_save_dir"
17
+ cp -p -- "$script_path" "$checkpoint_save_dir/$script_name"
18
+ echo "script_path: ${script_path}"
19
+ echo "checkpoint_save_dir: ${checkpoint_save_dir}"
20
+
21
+ # default setting
22
+ model_clone_batch=4
23
+ dataset_batch_size=48
24
+ model_clap_loss=0
25
+ model_clap_loss_type="mse" # option ce cosine l1
26
+ model_clap_loss_layer=0
27
+ average_top_k_layers=12
28
+ model_add_conv=false
29
+ model_depth=12
30
+ model_dispersive_loss=0
31
+ model_dispersive_loss_layer=0
32
+ checkpoint_keep_interval_updates=1 # TODO change this parameter if need
33
+ checkpoint_save_interval_updates=10000
34
+
35
+ if [[ $train_mode == "default" && ${config_option} -eq 0 ]]; then
36
+ echo "Config ${train_mode} ${config_option}"
37
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
38
+ task_load_clap_emb=false
39
+ task_load_source_file=true
40
+ task_load_mel_file=false
41
+ model_proj_type=null
42
+ model_clone_batch=4
43
+ dataset_batch_size=96
44
+ model_clap_loss=0
45
+ checkpoint_keep_interval_updates=-1
46
+ elif [[ $train_mode == "disp" && ${config_option} -eq 0 ]]; then
47
+ echo "Config ${train_mode} ${config_option}"
48
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
49
+ task_load_clap_emb=false
50
+ task_load_source_file=true
51
+ task_load_mel_file=false
52
+ model_proj_type=null
53
+ model_clone_batch=4
54
+ dataset_batch_size=96
55
+ model_dispersive_loss=1
56
+ model_dispersive_loss_layer=0
57
+ checkpoint_keep_interval_updates=1
58
+ elif [[ $train_mode == "disp" && ${config_option} -eq 1 ]]; then
59
+ echo "Config ${train_mode} ${config_option}"
60
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
61
+ task_load_clap_emb=false
62
+ task_load_source_file=true
63
+ task_load_mel_file=false
64
+ model_proj_type=null
65
+ model_clone_batch=1
66
+ dataset_batch_size=384
67
+ model_dispersive_loss=1
68
+ model_dispersive_loss_layer=0
69
+ checkpoint_keep_interval_updates=1
70
+ elif [[ $train_mode == "disp" && ${config_option} -eq 2 ]]; then
71
+ echo "Config ${train_mode} ${config_option}"
72
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
73
+ task_load_clap_emb=false
74
+ task_load_source_file=true
75
+ task_load_mel_file=false
76
+ model_proj_type=null
77
+ model_clone_batch=1
78
+ dataset_batch_size=384
79
+ model_dispersive_loss=10.0
80
+ model_dispersive_loss_layer=0
81
+ checkpoint_keep_interval_updates=1
82
+ elif [[ $train_mode == "disp" && ${config_option} -eq 3 ]]; then
83
+ echo "Config ${train_mode} ${config_option}"
84
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
85
+ task_load_clap_emb=false
86
+ task_load_source_file=true
87
+ task_load_mel_file=false
88
+ model_proj_type=null
89
+ model_clone_batch=1
90
+ dataset_batch_size=384
91
+ model_dispersive_loss=100.0
92
+ model_dispersive_loss_layer=0
93
+ checkpoint_keep_interval_updates=1
94
+ elif [[ $train_mode == "disp" && ${config_option} -eq 4 ]]; then
95
+ echo "Config ${train_mode} ${config_option}"
96
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
97
+ task_load_clap_emb=false
98
+ task_load_source_file=true
99
+ task_load_mel_file=false
100
+ model_proj_type=null
101
+ model_clone_batch=1
102
+ dataset_batch_size=384
103
+ model_dispersive_loss=10000.0
104
+ model_dispersive_loss_layer=0
105
+ checkpoint_keep_interval_updates=1
106
+ elif [[ $train_mode == "clap" && ${config_option} -eq 0 ]]; then
107
+ echo "Config ${train_mode} ${config_option}"
108
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
109
+ task_load_clap_emb=true
110
+ model_proj_type=2
111
+ model_clone_batch=4
112
+ dataset_batch_size=48
113
+ model_clap_loss=1.0
114
+ average_top_k_layers=12
115
+ model_add_conv=false
116
+ elif [[ $train_mode == "clap" && ${config_option} -eq 1 ]]; then
117
+ echo "Config ${train_mode} ${config_option}"
118
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
119
+ task_load_clap_emb=true
120
+ model_proj_type=2
121
+ model_clone_batch=4
122
+ dataset_batch_size=48
123
+ model_clap_loss=1.0
124
+ average_top_k_layers=1
125
+ # loss type ablation
126
+ elif [[ $train_mode == "clap" && ${config_option} -eq 2 ]]; then
127
+ echo "Config ${train_mode} ${config_option}"
128
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
129
+ task_load_clap_emb=true
130
+ model_proj_type=2
131
+ model_clone_batch=4
132
+ dataset_batch_size=48
133
+ model_clap_loss=1.0
134
+ average_top_k_layers=12
135
+ model_clap_loss_type="ce"
136
+ elif [[ $train_mode == "clap" && ${config_option} -eq 3 ]]; then
137
+ echo "Config ${train_mode} ${config_option}"
138
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
139
+ task_load_clap_emb=true
140
+ model_proj_type=2
141
+ model_clone_batch=4
142
+ dataset_batch_size=48
143
+ model_clap_loss=1.0
144
+ average_top_k_layers=12
145
+ model_clap_loss_type="l1"
146
+ elif [[ $train_mode == "clap" && ${config_option} -eq 4 ]]; then
147
+ echo "Config ${train_mode} ${config_option}"
148
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
149
+ task_load_clap_emb=true
150
+ model_proj_type=2
151
+ model_clone_batch=4
152
+ dataset_batch_size=96
153
+ model_clap_loss=1.0
154
+ average_top_k_layers=12
155
+ model_clap_loss_type="cosine"
156
+ # loss layer ablation
157
+ elif [[ $train_mode == "clap" && ${config_option} -eq 5 ]]; then
158
+ echo "Config ${train_mode} ${config_option}"
159
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
160
+ task_load_clap_emb=true
161
+ model_proj_type=2
162
+ model_clone_batch=4
163
+ dataset_batch_size=96
164
+ model_clap_loss=1.0
165
+ average_top_k_layers=12
166
+ model_clap_loss_type="mse"
167
+ model_clap_loss_layer=10
168
+ elif [[ $train_mode == "clap" && ${config_option} -eq 6 ]]; then
169
+ echo "Config ${train_mode} ${config_option}"
170
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
171
+ task_load_clap_emb=true
172
+ task_load_source_file=true
173
+ task_load_mel_file=false
174
+ model_proj_type=2
175
+ model_clone_batch=4
176
+ dataset_batch_size=96
177
+ model_clap_loss=1.0
178
+ average_top_k_layers=12
179
+ model_clap_loss_type="mse"
180
+ model_clap_loss_layer=8
181
+ elif [[ $train_mode == "clap" && ${config_option} -eq 7 ]]; then
182
+ echo "Config ${train_mode} ${config_option}"
183
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
184
+ task_load_clap_emb=true
185
+ task_load_source_file=true
186
+ task_load_mel_file=false
187
+ model_proj_type=2
188
+ model_clone_batch=4
189
+ dataset_batch_size=96
190
+ model_clap_loss=1.0
191
+ average_top_k_layers=12
192
+ model_clap_loss_type="mse"
193
+ model_clap_loss_layer=6
194
+ elif [[ $train_mode == "clap" && ${config_option} -eq 8 ]]; then
195
+ echo "Config ${train_mode} ${config_option}"
196
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
197
+ task_load_clap_emb=true
198
+ task_load_source_file=true
199
+ task_load_mel_file=false
200
+ model_proj_type=2
201
+ model_clone_batch=4
202
+ model_clap_loss=5.0
203
+ dataset_batch_size=96
204
+ average_top_k_layers=12
205
+ model_clap_loss_type="mse"
206
+ checkpoint_keep_interval_updates=-1
207
+ elif [[ $train_mode == "clap" && ${config_option} -eq 9 ]]; then
208
+ echo "Config ${train_mode} ${config_option}"
209
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
210
+ task_load_clap_emb=true
211
+ task_load_source_file=true
212
+ task_load_mel_file=false
213
+ model_proj_type=2
214
+ model_clone_batch=4
215
+ model_clap_loss=0.1
216
+ dataset_batch_size=96
217
+ average_top_k_layers=12
218
+ model_clap_loss_type="mse"
219
+ checkpoint_keep_interval_updates=-1
220
+ elif [[ $train_mode == "ast" && ${config_option} -eq 0 ]]; then
221
+ echo "Config ${train_mode} ${config_option}"
222
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
223
+ task_load_clap_emb=true
224
+ model_proj_type=4
225
+ model_clone_batch=4
226
+ model_clap_loss=1.0
227
+ dataset_batch_size=48
228
+ elif [[ $train_mode == "ast" && ${config_option} -eq 1 ]]; then
229
+ echo "Config ${train_mode} ${config_option}"
230
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
231
+ task_load_clap_emb=true
232
+ model_proj_type=4
233
+ model_clone_batch=4
234
+ model_clap_loss=0.001
235
+ dataset_batch_size=48
236
+ elif [[ $train_mode == "ast" && ${config_option} -eq 2 ]]; then
237
+ echo "Config ${train_mode} ${config_option}"
238
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
239
+ task_load_clap_emb=true
240
+ model_proj_type=4
241
+ model_clone_batch=4
242
+ model_clap_loss=0.01
243
+ dataset_batch_size=48
244
+ elif [[ $train_mode == "ast" && ${config_option} -eq 3 ]]; then
245
+ echo "Config ${train_mode} ${config_option}"
246
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_out
247
+ task_load_clap_emb=true
248
+ model_proj_type=6
249
+ model_clone_batch=4
250
+ dataset_batch_size=48
251
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 0 ]]; then
252
+ echo "Config ${train_mode} ${config_option}"
253
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
254
+ task_load_clap_emb=true
255
+ task_load_source_file=true
256
+ task_load_mel_file=false
257
+ model_proj_type=2
258
+ model_clone_batch=4
259
+ dataset_batch_size=64 # original 48 oom on 4090 24G change distributed_world_size
260
+ model_clap_loss=1.0
261
+ average_top_k_layers=11 # modify with model depth
262
+ model_add_conv=true
263
+ model_depth=11 #
264
+ checkpoint_keep_interval_updates=-1 # default 1
265
+ checkpoint_save_interval_updates=10000
266
+ fi
267
+
268
+ python fairseq_cli/hydra_train.py -m \
269
+ --config-dir ./EAT/config \
270
+ --config-name pretraining_AS2M \
271
+ common.user_dir=./EAT \
272
+ checkpoint.save_dir=${checkpoint_save_dir} \
273
+ checkpoint.restore_file=${checkpoint_restore_file} \
274
+ distributed_training.distributed_world_size=${1:-2} \
275
+ dataset.num_workers=24 \
276
+ dataset.data_buffer_size=48 \
277
+ dataset.batch_size=${dataset_batch_size} \
278
+ task.data=${task_data} \
279
+ task.h5_format=False \
280
+ task.load_clap_emb=${task_load_clap_emb} \
281
+ +task.load_source_file=${task_load_source_file} \
282
+ +task.load_mel_file=${task_load_mel_file} \
283
+ model.proj_type=${model_proj_type} \
284
+ model.clone_batch=${model_clone_batch} \
285
+ model.clap_loss=${model_clap_loss} \
286
+ model.average_top_k_layers=${average_top_k_layers} \
287
+ +model.add_conv=${model_add_conv} \
288
+ +model.clap_loss_type=${model_clap_loss_type} \
289
+ +model.clap_loss_layer=${model_clap_loss_layer} \
290
+ +model.dispersive_loss=${model_dispersive_loss} \
291
+ +model.dispersive_loss_layer=${model_dispersive_loss_layer} \
292
+ model.depth=${model_depth} \
293
+ checkpoint.keep_interval_updates=${checkpoint_keep_interval_updates} \
294
+ checkpoint.save_interval_updates=${checkpoint_save_interval_updates}
pre_4_AS2M/disp_5_2025-09-28_06-51-25/pretraining_AS2M.sh ADDED
@@ -0,0 +1,306 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ # config options
3
+ train_mode=disp
4
+ config_option=5
5
+ # change world size
6
+
7
+ # shared config
8
+ SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
9
+ checkpoint_save_dir=${SAVE_DIR_ROOT}/${train_mode}_${config_option}_$(date +"%Y-%m-%d_%H-%M-%S")
10
+ checkpoint_restore_file=${checkpoint_save_dir}/checkpoint_last.pt
11
+
12
+ # 脚本自身的绝对路径与文件名(解析符号链接)
13
+ script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
14
+ script_name="$(basename -- "$script_path")"
15
+ # 创建目录并拷贝(保留权限与时间戳)
16
+ mkdir -p -- "$checkpoint_save_dir"
17
+ cp -p -- "$script_path" "$checkpoint_save_dir/$script_name"
18
+ echo "script_path: ${script_path}"
19
+ echo "checkpoint_save_dir: ${checkpoint_save_dir}"
20
+
21
+ # default setting
22
+ model_clone_batch=4
23
+ dataset_batch_size=48
24
+ model_clap_loss=0
25
+ model_clap_loss_type="mse" # option ce cosine l1
26
+ model_clap_loss_layer=0
27
+ average_top_k_layers=12
28
+ model_add_conv=false
29
+ model_depth=12
30
+ model_dispersive_loss=0
31
+ model_dispersive_loss_layer=0
32
+ checkpoint_keep_interval_updates=1 # TODO change this parameter if need
33
+ checkpoint_save_interval_updates=10000
34
+
35
+ if [[ $train_mode == "default" && ${config_option} -eq 0 ]]; then
36
+ echo "Config ${train_mode} ${config_option}"
37
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
38
+ task_load_clap_emb=false
39
+ task_load_source_file=true
40
+ task_load_mel_file=false
41
+ model_proj_type=null
42
+ model_clone_batch=4
43
+ dataset_batch_size=96
44
+ model_clap_loss=0
45
+ checkpoint_keep_interval_updates=-1
46
+ elif [[ $train_mode == "disp" && ${config_option} -eq 0 ]]; then
47
+ echo "Config ${train_mode} ${config_option}"
48
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
49
+ task_load_clap_emb=false
50
+ task_load_source_file=true
51
+ task_load_mel_file=false
52
+ model_proj_type=null
53
+ model_clone_batch=4
54
+ dataset_batch_size=96
55
+ model_dispersive_loss=1
56
+ model_dispersive_loss_layer=0
57
+ checkpoint_keep_interval_updates=1
58
+ elif [[ $train_mode == "disp" && ${config_option} -eq 1 ]]; then
59
+ echo "Config ${train_mode} ${config_option}"
60
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
61
+ task_load_clap_emb=false
62
+ task_load_source_file=true
63
+ task_load_mel_file=false
64
+ model_proj_type=null
65
+ model_clone_batch=1
66
+ dataset_batch_size=384
67
+ model_dispersive_loss=1
68
+ model_dispersive_loss_layer=0
69
+ checkpoint_keep_interval_updates=1
70
+ elif [[ $train_mode == "disp" && ${config_option} -eq 2 ]]; then
71
+ echo "Config ${train_mode} ${config_option}"
72
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
73
+ task_load_clap_emb=false
74
+ task_load_source_file=true
75
+ task_load_mel_file=false
76
+ model_proj_type=null
77
+ model_clone_batch=1
78
+ dataset_batch_size=384
79
+ model_dispersive_loss=10.0
80
+ model_dispersive_loss_layer=0
81
+ checkpoint_keep_interval_updates=1
82
+ elif [[ $train_mode == "disp" && ${config_option} -eq 3 ]]; then
83
+ echo "Config ${train_mode} ${config_option}"
84
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
85
+ task_load_clap_emb=false
86
+ task_load_source_file=true
87
+ task_load_mel_file=false
88
+ model_proj_type=null
89
+ model_clone_batch=1
90
+ dataset_batch_size=384
91
+ model_dispersive_loss=100.0
92
+ model_dispersive_loss_layer=0
93
+ checkpoint_keep_interval_updates=1
94
+ elif [[ $train_mode == "disp" && ${config_option} -eq 4 ]]; then
95
+ echo "Config ${train_mode} ${config_option}"
96
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
97
+ task_load_clap_emb=false
98
+ task_load_source_file=true
99
+ task_load_mel_file=false
100
+ model_proj_type=null
101
+ model_clone_batch=1
102
+ dataset_batch_size=384
103
+ model_dispersive_loss=10000.0
104
+ model_dispersive_loss_layer=0
105
+ checkpoint_keep_interval_updates=1
106
+ elif [[ $train_mode == "disp" && ${config_option} -eq 5 ]]; then
107
+ echo "Config ${train_mode} ${config_option}"
108
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
109
+ task_load_clap_emb=false
110
+ task_load_source_file=true
111
+ task_load_mel_file=false
112
+ model_proj_type=null
113
+ model_clone_batch=1
114
+ dataset_batch_size=384
115
+ model_dispersive_loss=1000.0
116
+ model_dispersive_loss_layer=0
117
+ checkpoint_keep_interval_updates=1
118
+ elif [[ $train_mode == "clap" && ${config_option} -eq 0 ]]; then
119
+ echo "Config ${train_mode} ${config_option}"
120
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
121
+ task_load_clap_emb=true
122
+ model_proj_type=2
123
+ model_clone_batch=4
124
+ dataset_batch_size=48
125
+ model_clap_loss=1.0
126
+ average_top_k_layers=12
127
+ model_add_conv=false
128
+ elif [[ $train_mode == "clap" && ${config_option} -eq 1 ]]; then
129
+ echo "Config ${train_mode} ${config_option}"
130
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
131
+ task_load_clap_emb=true
132
+ model_proj_type=2
133
+ model_clone_batch=4
134
+ dataset_batch_size=48
135
+ model_clap_loss=1.0
136
+ average_top_k_layers=1
137
+ # loss type ablation
138
+ elif [[ $train_mode == "clap" && ${config_option} -eq 2 ]]; then
139
+ echo "Config ${train_mode} ${config_option}"
140
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
141
+ task_load_clap_emb=true
142
+ model_proj_type=2
143
+ model_clone_batch=4
144
+ dataset_batch_size=48
145
+ model_clap_loss=1.0
146
+ average_top_k_layers=12
147
+ model_clap_loss_type="ce"
148
+ elif [[ $train_mode == "clap" && ${config_option} -eq 3 ]]; then
149
+ echo "Config ${train_mode} ${config_option}"
150
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
151
+ task_load_clap_emb=true
152
+ model_proj_type=2
153
+ model_clone_batch=4
154
+ dataset_batch_size=48
155
+ model_clap_loss=1.0
156
+ average_top_k_layers=12
157
+ model_clap_loss_type="l1"
158
+ elif [[ $train_mode == "clap" && ${config_option} -eq 4 ]]; then
159
+ echo "Config ${train_mode} ${config_option}"
160
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
161
+ task_load_clap_emb=true
162
+ model_proj_type=2
163
+ model_clone_batch=4
164
+ dataset_batch_size=96
165
+ model_clap_loss=1.0
166
+ average_top_k_layers=12
167
+ model_clap_loss_type="cosine"
168
+ # loss layer ablation
169
+ elif [[ $train_mode == "clap" && ${config_option} -eq 5 ]]; then
170
+ echo "Config ${train_mode} ${config_option}"
171
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
172
+ task_load_clap_emb=true
173
+ model_proj_type=2
174
+ model_clone_batch=4
175
+ dataset_batch_size=96
176
+ model_clap_loss=1.0
177
+ average_top_k_layers=12
178
+ model_clap_loss_type="mse"
179
+ model_clap_loss_layer=10
180
+ elif [[ $train_mode == "clap" && ${config_option} -eq 6 ]]; then
181
+ echo "Config ${train_mode} ${config_option}"
182
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
183
+ task_load_clap_emb=true
184
+ task_load_source_file=true
185
+ task_load_mel_file=false
186
+ model_proj_type=2
187
+ model_clone_batch=4
188
+ dataset_batch_size=96
189
+ model_clap_loss=1.0
190
+ average_top_k_layers=12
191
+ model_clap_loss_type="mse"
192
+ model_clap_loss_layer=8
193
+ elif [[ $train_mode == "clap" && ${config_option} -eq 7 ]]; then
194
+ echo "Config ${train_mode} ${config_option}"
195
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
196
+ task_load_clap_emb=true
197
+ task_load_source_file=true
198
+ task_load_mel_file=false
199
+ model_proj_type=2
200
+ model_clone_batch=4
201
+ dataset_batch_size=96
202
+ model_clap_loss=1.0
203
+ average_top_k_layers=12
204
+ model_clap_loss_type="mse"
205
+ model_clap_loss_layer=6
206
+ elif [[ $train_mode == "clap" && ${config_option} -eq 8 ]]; then
207
+ echo "Config ${train_mode} ${config_option}"
208
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
209
+ task_load_clap_emb=true
210
+ task_load_source_file=true
211
+ task_load_mel_file=false
212
+ model_proj_type=2
213
+ model_clone_batch=4
214
+ model_clap_loss=5.0
215
+ dataset_batch_size=96
216
+ average_top_k_layers=12
217
+ model_clap_loss_type="mse"
218
+ checkpoint_keep_interval_updates=-1
219
+ elif [[ $train_mode == "clap" && ${config_option} -eq 9 ]]; then
220
+ echo "Config ${train_mode} ${config_option}"
221
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
222
+ task_load_clap_emb=true
223
+ task_load_source_file=true
224
+ task_load_mel_file=false
225
+ model_proj_type=2
226
+ model_clone_batch=4
227
+ model_clap_loss=0.1
228
+ dataset_batch_size=96
229
+ average_top_k_layers=12
230
+ model_clap_loss_type="mse"
231
+ checkpoint_keep_interval_updates=-1
232
+ elif [[ $train_mode == "ast" && ${config_option} -eq 0 ]]; then
233
+ echo "Config ${train_mode} ${config_option}"
234
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
235
+ task_load_clap_emb=true
236
+ model_proj_type=4
237
+ model_clone_batch=4
238
+ model_clap_loss=1.0
239
+ dataset_batch_size=48
240
+ elif [[ $train_mode == "ast" && ${config_option} -eq 1 ]]; then
241
+ echo "Config ${train_mode} ${config_option}"
242
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
243
+ task_load_clap_emb=true
244
+ model_proj_type=4
245
+ model_clone_batch=4
246
+ model_clap_loss=0.001
247
+ dataset_batch_size=48
248
+ elif [[ $train_mode == "ast" && ${config_option} -eq 2 ]]; then
249
+ echo "Config ${train_mode} ${config_option}"
250
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
251
+ task_load_clap_emb=true
252
+ model_proj_type=4
253
+ model_clone_batch=4
254
+ model_clap_loss=0.01
255
+ dataset_batch_size=48
256
+ elif [[ $train_mode == "ast" && ${config_option} -eq 3 ]]; then
257
+ echo "Config ${train_mode} ${config_option}"
258
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_out
259
+ task_load_clap_emb=true
260
+ model_proj_type=6
261
+ model_clone_batch=4
262
+ dataset_batch_size=48
263
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 0 ]]; then
264
+ echo "Config ${train_mode} ${config_option}"
265
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
266
+ task_load_clap_emb=true
267
+ task_load_source_file=true
268
+ task_load_mel_file=false
269
+ model_proj_type=2
270
+ model_clone_batch=4
271
+ dataset_batch_size=64 # original 48 oom on 4090 24G change distributed_world_size
272
+ model_clap_loss=1.0
273
+ average_top_k_layers=11 # modify with model depth
274
+ model_add_conv=true
275
+ model_depth=11 #
276
+ checkpoint_keep_interval_updates=-1 # default 1
277
+ checkpoint_save_interval_updates=10000
278
+ fi
279
+
280
+ python fairseq_cli/hydra_train.py -m \
281
+ --config-dir ./EAT/config \
282
+ --config-name pretraining_AS2M \
283
+ common.user_dir=./EAT \
284
+ checkpoint.save_dir=${checkpoint_save_dir} \
285
+ checkpoint.restore_file=${checkpoint_restore_file} \
286
+ distributed_training.distributed_world_size=${1:-2} \
287
+ dataset.num_workers=24 \
288
+ dataset.data_buffer_size=48 \
289
+ dataset.batch_size=${dataset_batch_size} \
290
+ task.data=${task_data} \
291
+ task.h5_format=False \
292
+ task.load_clap_emb=${task_load_clap_emb} \
293
+ +task.load_source_file=${task_load_source_file} \
294
+ +task.load_mel_file=${task_load_mel_file} \
295
+ model.proj_type=${model_proj_type} \
296
+ model.clone_batch=${model_clone_batch} \
297
+ model.clap_loss=${model_clap_loss} \
298
+ model.average_top_k_layers=${average_top_k_layers} \
299
+ +model.add_conv=${model_add_conv} \
300
+ +model.clap_loss_type=${model_clap_loss_type} \
301
+ +model.clap_loss_layer=${model_clap_loss_layer} \
302
+ +model.dispersive_loss=${model_dispersive_loss} \
303
+ +model.dispersive_loss_layer=${model_dispersive_loss_layer} \
304
+ model.depth=${model_depth} \
305
+ checkpoint.keep_interval_updates=${checkpoint_keep_interval_updates} \
306
+ checkpoint.save_interval_updates=${checkpoint_save_interval_updates}
pre_4_AS2M/disp_5_2025-09-28_07-56-38/pretraining_AS2M.sh ADDED
@@ -0,0 +1,318 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ # config options
3
+ train_mode=disp
4
+ config_option=5
5
+ # change world size
6
+
7
+ # shared config
8
+ SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
9
+ checkpoint_save_dir=${SAVE_DIR_ROOT}/${train_mode}_${config_option}_$(date +"%Y-%m-%d_%H-%M-%S")
10
+ checkpoint_restore_file=${checkpoint_save_dir}/checkpoint_last.pt
11
+
12
+ # 脚本自身的绝对路径与文件名(解析符号链接)
13
+ script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
14
+ script_name="$(basename -- "$script_path")"
15
+ # 创建目录并拷贝(保留权限与时间戳)
16
+ mkdir -p -- "$checkpoint_save_dir"
17
+ cp -p -- "$script_path" "$checkpoint_save_dir/$script_name"
18
+ echo "script_path: ${script_path}"
19
+ echo "checkpoint_save_dir: ${checkpoint_save_dir}"
20
+
21
+ # default setting
22
+ model_clone_batch=4
23
+ dataset_batch_size=48
24
+ model_clap_loss=0
25
+ model_clap_loss_type="mse" # option ce cosine l1
26
+ model_clap_loss_layer=0
27
+ average_top_k_layers=12
28
+ model_add_conv=false
29
+ model_depth=12
30
+ model_dispersive_loss=0
31
+ model_dispersive_loss_layer=0
32
+ checkpoint_keep_interval_updates=1 # TODO change this parameter if need
33
+ checkpoint_save_interval_updates=10000
34
+
35
+ if [[ $train_mode == "default" && ${config_option} -eq 0 ]]; then
36
+ echo "Config ${train_mode} ${config_option}"
37
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
38
+ task_load_clap_emb=false
39
+ task_load_source_file=true
40
+ task_load_mel_file=false
41
+ model_proj_type=null
42
+ model_clone_batch=4
43
+ dataset_batch_size=96
44
+ model_clap_loss=0
45
+ checkpoint_keep_interval_updates=-1
46
+ elif [[ $train_mode == "disp" && ${config_option} -eq 0 ]]; then
47
+ echo "Config ${train_mode} ${config_option}"
48
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
49
+ task_load_clap_emb=false
50
+ task_load_source_file=true
51
+ task_load_mel_file=false
52
+ model_proj_type=null
53
+ model_clone_batch=4
54
+ dataset_batch_size=96
55
+ model_dispersive_loss=1
56
+ model_dispersive_loss_layer=0
57
+ checkpoint_keep_interval_updates=1
58
+ elif [[ $train_mode == "disp" && ${config_option} -eq 1 ]]; then
59
+ echo "Config ${train_mode} ${config_option}"
60
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
61
+ task_load_clap_emb=false
62
+ task_load_source_file=true
63
+ task_load_mel_file=false
64
+ model_proj_type=null
65
+ model_clone_batch=1
66
+ dataset_batch_size=384
67
+ model_dispersive_loss=1
68
+ model_dispersive_loss_layer=0
69
+ checkpoint_keep_interval_updates=1
70
+ elif [[ $train_mode == "disp" && ${config_option} -eq 2 ]]; then
71
+ echo "Config ${train_mode} ${config_option}"
72
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
73
+ task_load_clap_emb=false
74
+ task_load_source_file=true
75
+ task_load_mel_file=false
76
+ model_proj_type=null
77
+ model_clone_batch=1
78
+ dataset_batch_size=384
79
+ model_dispersive_loss=10.0
80
+ model_dispersive_loss_layer=0
81
+ checkpoint_keep_interval_updates=1
82
+ elif [[ $train_mode == "disp" && ${config_option} -eq 3 ]]; then
83
+ echo "Config ${train_mode} ${config_option}"
84
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
85
+ task_load_clap_emb=false
86
+ task_load_source_file=true
87
+ task_load_mel_file=false
88
+ model_proj_type=null
89
+ model_clone_batch=1
90
+ dataset_batch_size=384
91
+ model_dispersive_loss=100.0
92
+ model_dispersive_loss_layer=0
93
+ checkpoint_keep_interval_updates=1
94
+ elif [[ $train_mode == "disp" && ${config_option} -eq 4 ]]; then
95
+ echo "Config ${train_mode} ${config_option}"
96
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
97
+ task_load_clap_emb=false
98
+ task_load_source_file=true
99
+ task_load_mel_file=false
100
+ model_proj_type=null
101
+ model_clone_batch=1
102
+ dataset_batch_size=384
103
+ model_dispersive_loss=10000.0
104
+ model_dispersive_loss_layer=0
105
+ checkpoint_keep_interval_updates=1
106
+ elif [[ $train_mode == "disp" && ${config_option} -eq 5 ]]; then
107
+ echo "Config ${train_mode} ${config_option}"
108
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
109
+ task_load_clap_emb=false
110
+ task_load_source_file=true
111
+ task_load_mel_file=false
112
+ model_proj_type=null
113
+ model_clone_batch=1
114
+ dataset_batch_size=384
115
+ model_dispersive_loss=1000.0
116
+ model_dispersive_loss_layer=0
117
+ checkpoint_keep_interval_updates=1
118
+ elif [[ $train_mode == "disp" && ${config_option} -eq 6 ]]; then
119
+ echo "Config ${train_mode} ${config_option}"
120
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
121
+ task_load_clap_emb=false
122
+ task_load_source_file=true
123
+ task_load_mel_file=false
124
+ model_proj_type=null
125
+ model_clone_batch=4
126
+ dataset_batch_size=96
127
+ model_dispersive_loss=1000.0
128
+ model_dispersive_loss_layer=10
129
+ checkpoint_keep_interval_updates=1
130
+ elif [[ $train_mode == "clap" && ${config_option} -eq 0 ]]; then
131
+ echo "Config ${train_mode} ${config_option}"
132
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
133
+ task_load_clap_emb=true
134
+ model_proj_type=2
135
+ model_clone_batch=4
136
+ dataset_batch_size=48
137
+ model_clap_loss=1.0
138
+ average_top_k_layers=12
139
+ model_add_conv=false
140
+ elif [[ $train_mode == "clap" && ${config_option} -eq 1 ]]; then
141
+ echo "Config ${train_mode} ${config_option}"
142
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
143
+ task_load_clap_emb=true
144
+ model_proj_type=2
145
+ model_clone_batch=4
146
+ dataset_batch_size=48
147
+ model_clap_loss=1.0
148
+ average_top_k_layers=1
149
+ # loss type ablation
150
+ elif [[ $train_mode == "clap" && ${config_option} -eq 2 ]]; then
151
+ echo "Config ${train_mode} ${config_option}"
152
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
153
+ task_load_clap_emb=true
154
+ model_proj_type=2
155
+ model_clone_batch=4
156
+ dataset_batch_size=48
157
+ model_clap_loss=1.0
158
+ average_top_k_layers=12
159
+ model_clap_loss_type="ce"
160
+ elif [[ $train_mode == "clap" && ${config_option} -eq 3 ]]; then
161
+ echo "Config ${train_mode} ${config_option}"
162
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
163
+ task_load_clap_emb=true
164
+ model_proj_type=2
165
+ model_clone_batch=4
166
+ dataset_batch_size=48
167
+ model_clap_loss=1.0
168
+ average_top_k_layers=12
169
+ model_clap_loss_type="l1"
170
+ elif [[ $train_mode == "clap" && ${config_option} -eq 4 ]]; then
171
+ echo "Config ${train_mode} ${config_option}"
172
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
173
+ task_load_clap_emb=true
174
+ model_proj_type=2
175
+ model_clone_batch=4
176
+ dataset_batch_size=96
177
+ model_clap_loss=1.0
178
+ average_top_k_layers=12
179
+ model_clap_loss_type="cosine"
180
+ # loss layer ablation
181
+ elif [[ $train_mode == "clap" && ${config_option} -eq 5 ]]; then
182
+ echo "Config ${train_mode} ${config_option}"
183
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
184
+ task_load_clap_emb=true
185
+ model_proj_type=2
186
+ model_clone_batch=4
187
+ dataset_batch_size=96
188
+ model_clap_loss=1.0
189
+ average_top_k_layers=12
190
+ model_clap_loss_type="mse"
191
+ model_clap_loss_layer=10
192
+ elif [[ $train_mode == "clap" && ${config_option} -eq 6 ]]; then
193
+ echo "Config ${train_mode} ${config_option}"
194
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
195
+ task_load_clap_emb=true
196
+ task_load_source_file=true
197
+ task_load_mel_file=false
198
+ model_proj_type=2
199
+ model_clone_batch=4
200
+ dataset_batch_size=96
201
+ model_clap_loss=1.0
202
+ average_top_k_layers=12
203
+ model_clap_loss_type="mse"
204
+ model_clap_loss_layer=8
205
+ elif [[ $train_mode == "clap" && ${config_option} -eq 7 ]]; then
206
+ echo "Config ${train_mode} ${config_option}"
207
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
208
+ task_load_clap_emb=true
209
+ task_load_source_file=true
210
+ task_load_mel_file=false
211
+ model_proj_type=2
212
+ model_clone_batch=4
213
+ dataset_batch_size=96
214
+ model_clap_loss=1.0
215
+ average_top_k_layers=12
216
+ model_clap_loss_type="mse"
217
+ model_clap_loss_layer=6
218
+ elif [[ $train_mode == "clap" && ${config_option} -eq 8 ]]; then
219
+ echo "Config ${train_mode} ${config_option}"
220
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
221
+ task_load_clap_emb=true
222
+ task_load_source_file=true
223
+ task_load_mel_file=false
224
+ model_proj_type=2
225
+ model_clone_batch=4
226
+ model_clap_loss=5.0
227
+ dataset_batch_size=96
228
+ average_top_k_layers=12
229
+ model_clap_loss_type="mse"
230
+ checkpoint_keep_interval_updates=-1
231
+ elif [[ $train_mode == "clap" && ${config_option} -eq 9 ]]; then
232
+ echo "Config ${train_mode} ${config_option}"
233
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
234
+ task_load_clap_emb=true
235
+ task_load_source_file=true
236
+ task_load_mel_file=false
237
+ model_proj_type=2
238
+ model_clone_batch=4
239
+ model_clap_loss=0.1
240
+ dataset_batch_size=96
241
+ average_top_k_layers=12
242
+ model_clap_loss_type="mse"
243
+ checkpoint_keep_interval_updates=-1
244
+ elif [[ $train_mode == "ast" && ${config_option} -eq 0 ]]; then
245
+ echo "Config ${train_mode} ${config_option}"
246
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
247
+ task_load_clap_emb=true
248
+ model_proj_type=4
249
+ model_clone_batch=4
250
+ model_clap_loss=1.0
251
+ dataset_batch_size=48
252
+ elif [[ $train_mode == "ast" && ${config_option} -eq 1 ]]; then
253
+ echo "Config ${train_mode} ${config_option}"
254
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
255
+ task_load_clap_emb=true
256
+ model_proj_type=4
257
+ model_clone_batch=4
258
+ model_clap_loss=0.001
259
+ dataset_batch_size=48
260
+ elif [[ $train_mode == "ast" && ${config_option} -eq 2 ]]; then
261
+ echo "Config ${train_mode} ${config_option}"
262
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
263
+ task_load_clap_emb=true
264
+ model_proj_type=4
265
+ model_clone_batch=4
266
+ model_clap_loss=0.01
267
+ dataset_batch_size=48
268
+ elif [[ $train_mode == "ast" && ${config_option} -eq 3 ]]; then
269
+ echo "Config ${train_mode} ${config_option}"
270
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_out
271
+ task_load_clap_emb=true
272
+ model_proj_type=6
273
+ model_clone_batch=4
274
+ dataset_batch_size=48
275
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 0 ]]; then
276
+ echo "Config ${train_mode} ${config_option}"
277
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
278
+ task_load_clap_emb=true
279
+ task_load_source_file=true
280
+ task_load_mel_file=false
281
+ model_proj_type=2
282
+ model_clone_batch=4
283
+ dataset_batch_size=64 # original 48 oom on 4090 24G change distributed_world_size
284
+ model_clap_loss=1.0
285
+ average_top_k_layers=11 # modify with model depth
286
+ model_add_conv=true
287
+ model_depth=11 #
288
+ checkpoint_keep_interval_updates=-1 # default 1
289
+ checkpoint_save_interval_updates=10000
290
+ fi
291
+
292
+ python fairseq_cli/hydra_train.py -m \
293
+ --config-dir ./EAT/config \
294
+ --config-name pretraining_AS2M \
295
+ common.user_dir=./EAT \
296
+ checkpoint.save_dir=${checkpoint_save_dir} \
297
+ checkpoint.restore_file=${checkpoint_restore_file} \
298
+ distributed_training.distributed_world_size=${1:-2} \
299
+ dataset.num_workers=24 \
300
+ dataset.data_buffer_size=48 \
301
+ dataset.batch_size=${dataset_batch_size} \
302
+ task.data=${task_data} \
303
+ task.h5_format=False \
304
+ task.load_clap_emb=${task_load_clap_emb} \
305
+ +task.load_source_file=${task_load_source_file} \
306
+ +task.load_mel_file=${task_load_mel_file} \
307
+ model.proj_type=${model_proj_type} \
308
+ model.clone_batch=${model_clone_batch} \
309
+ model.clap_loss=${model_clap_loss} \
310
+ model.average_top_k_layers=${average_top_k_layers} \
311
+ +model.add_conv=${model_add_conv} \
312
+ +model.clap_loss_type=${model_clap_loss_type} \
313
+ +model.clap_loss_layer=${model_clap_loss_layer} \
314
+ +model.dispersive_loss=${model_dispersive_loss} \
315
+ +model.dispersive_loss_layer=${model_dispersive_loss_layer} \
316
+ model.depth=${model_depth} \
317
+ checkpoint.keep_interval_updates=${checkpoint_keep_interval_updates} \
318
+ checkpoint.save_interval_updates=${checkpoint_save_interval_updates}
pre_4_AS2M/disp_6_2025-09-28_08-28-48/pretraining_AS2M.sh ADDED
@@ -0,0 +1,318 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ # config options
3
+ train_mode=disp
4
+ config_option=6
5
+ # change world size
6
+
7
+ # shared config
8
+ SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
9
+ checkpoint_save_dir=${SAVE_DIR_ROOT}/${train_mode}_${config_option}_$(date +"%Y-%m-%d_%H-%M-%S")
10
+ checkpoint_restore_file=${checkpoint_save_dir}/checkpoint_last.pt
11
+
12
+ # 脚本自身的绝对路径与文件名(解析符号链接)
13
+ script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
14
+ script_name="$(basename -- "$script_path")"
15
+ # 创建目录并拷贝(保留权限与时间戳)
16
+ mkdir -p -- "$checkpoint_save_dir"
17
+ cp -p -- "$script_path" "$checkpoint_save_dir/$script_name"
18
+ echo "script_path: ${script_path}"
19
+ echo "checkpoint_save_dir: ${checkpoint_save_dir}"
20
+
21
+ # default setting
22
+ model_clone_batch=4
23
+ dataset_batch_size=48
24
+ model_clap_loss=0
25
+ model_clap_loss_type="mse" # option ce cosine l1
26
+ model_clap_loss_layer=0
27
+ average_top_k_layers=12
28
+ model_add_conv=false
29
+ model_depth=12
30
+ model_dispersive_loss=0
31
+ model_dispersive_loss_layer=0
32
+ checkpoint_keep_interval_updates=1 # TODO change this parameter if need
33
+ checkpoint_save_interval_updates=10000
34
+
35
+ if [[ $train_mode == "default" && ${config_option} -eq 0 ]]; then
36
+ echo "Config ${train_mode} ${config_option}"
37
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
38
+ task_load_clap_emb=false
39
+ task_load_source_file=true
40
+ task_load_mel_file=false
41
+ model_proj_type=null
42
+ model_clone_batch=4
43
+ dataset_batch_size=96
44
+ model_clap_loss=0
45
+ checkpoint_keep_interval_updates=-1
46
+ elif [[ $train_mode == "disp" && ${config_option} -eq 0 ]]; then
47
+ echo "Config ${train_mode} ${config_option}"
48
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
49
+ task_load_clap_emb=false
50
+ task_load_source_file=true
51
+ task_load_mel_file=false
52
+ model_proj_type=null
53
+ model_clone_batch=4
54
+ dataset_batch_size=96
55
+ model_dispersive_loss=1
56
+ model_dispersive_loss_layer=0
57
+ checkpoint_keep_interval_updates=1
58
+ elif [[ $train_mode == "disp" && ${config_option} -eq 1 ]]; then
59
+ echo "Config ${train_mode} ${config_option}"
60
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
61
+ task_load_clap_emb=false
62
+ task_load_source_file=true
63
+ task_load_mel_file=false
64
+ model_proj_type=null
65
+ model_clone_batch=1
66
+ dataset_batch_size=384
67
+ model_dispersive_loss=1
68
+ model_dispersive_loss_layer=0
69
+ checkpoint_keep_interval_updates=1
70
+ elif [[ $train_mode == "disp" && ${config_option} -eq 2 ]]; then
71
+ echo "Config ${train_mode} ${config_option}"
72
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
73
+ task_load_clap_emb=false
74
+ task_load_source_file=true
75
+ task_load_mel_file=false
76
+ model_proj_type=null
77
+ model_clone_batch=1
78
+ dataset_batch_size=384
79
+ model_dispersive_loss=10.0
80
+ model_dispersive_loss_layer=0
81
+ checkpoint_keep_interval_updates=1
82
+ elif [[ $train_mode == "disp" && ${config_option} -eq 3 ]]; then
83
+ echo "Config ${train_mode} ${config_option}"
84
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
85
+ task_load_clap_emb=false
86
+ task_load_source_file=true
87
+ task_load_mel_file=false
88
+ model_proj_type=null
89
+ model_clone_batch=1
90
+ dataset_batch_size=384
91
+ model_dispersive_loss=100.0
92
+ model_dispersive_loss_layer=0
93
+ checkpoint_keep_interval_updates=1
94
+ elif [[ $train_mode == "disp" && ${config_option} -eq 4 ]]; then
95
+ echo "Config ${train_mode} ${config_option}"
96
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
97
+ task_load_clap_emb=false
98
+ task_load_source_file=true
99
+ task_load_mel_file=false
100
+ model_proj_type=null
101
+ model_clone_batch=1
102
+ dataset_batch_size=384
103
+ model_dispersive_loss=10000.0
104
+ model_dispersive_loss_layer=0
105
+ checkpoint_keep_interval_updates=1
106
+ elif [[ $train_mode == "disp" && ${config_option} -eq 5 ]]; then
107
+ echo "Config ${train_mode} ${config_option}"
108
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
109
+ task_load_clap_emb=false
110
+ task_load_source_file=true
111
+ task_load_mel_file=false
112
+ model_proj_type=null
113
+ model_clone_batch=1
114
+ dataset_batch_size=384
115
+ model_dispersive_loss=1000.0
116
+ model_dispersive_loss_layer=0
117
+ checkpoint_keep_interval_updates=1
118
+ elif [[ $train_mode == "disp" && ${config_option} -eq 6 ]]; then
119
+ echo "Config ${train_mode} ${config_option}"
120
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
121
+ task_load_clap_emb=false
122
+ task_load_source_file=true
123
+ task_load_mel_file=false
124
+ model_proj_type=null
125
+ model_clone_batch=4
126
+ dataset_batch_size=96
127
+ model_dispersive_loss=1000.0
128
+ model_dispersive_loss_layer=10
129
+ checkpoint_keep_interval_updates=1
130
+ elif [[ $train_mode == "clap" && ${config_option} -eq 0 ]]; then
131
+ echo "Config ${train_mode} ${config_option}"
132
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
133
+ task_load_clap_emb=true
134
+ model_proj_type=2
135
+ model_clone_batch=4
136
+ dataset_batch_size=48
137
+ model_clap_loss=1.0
138
+ average_top_k_layers=12
139
+ model_add_conv=false
140
+ elif [[ $train_mode == "clap" && ${config_option} -eq 1 ]]; then
141
+ echo "Config ${train_mode} ${config_option}"
142
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
143
+ task_load_clap_emb=true
144
+ model_proj_type=2
145
+ model_clone_batch=4
146
+ dataset_batch_size=48
147
+ model_clap_loss=1.0
148
+ average_top_k_layers=1
149
+ # loss type ablation
150
+ elif [[ $train_mode == "clap" && ${config_option} -eq 2 ]]; then
151
+ echo "Config ${train_mode} ${config_option}"
152
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
153
+ task_load_clap_emb=true
154
+ model_proj_type=2
155
+ model_clone_batch=4
156
+ dataset_batch_size=48
157
+ model_clap_loss=1.0
158
+ average_top_k_layers=12
159
+ model_clap_loss_type="ce"
160
+ elif [[ $train_mode == "clap" && ${config_option} -eq 3 ]]; then
161
+ echo "Config ${train_mode} ${config_option}"
162
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
163
+ task_load_clap_emb=true
164
+ model_proj_type=2
165
+ model_clone_batch=4
166
+ dataset_batch_size=48
167
+ model_clap_loss=1.0
168
+ average_top_k_layers=12
169
+ model_clap_loss_type="l1"
170
+ elif [[ $train_mode == "clap" && ${config_option} -eq 4 ]]; then
171
+ echo "Config ${train_mode} ${config_option}"
172
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
173
+ task_load_clap_emb=true
174
+ model_proj_type=2
175
+ model_clone_batch=4
176
+ dataset_batch_size=96
177
+ model_clap_loss=1.0
178
+ average_top_k_layers=12
179
+ model_clap_loss_type="cosine"
180
+ # loss layer ablation
181
+ elif [[ $train_mode == "clap" && ${config_option} -eq 5 ]]; then
182
+ echo "Config ${train_mode} ${config_option}"
183
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
184
+ task_load_clap_emb=true
185
+ model_proj_type=2
186
+ model_clone_batch=4
187
+ dataset_batch_size=96
188
+ model_clap_loss=1.0
189
+ average_top_k_layers=12
190
+ model_clap_loss_type="mse"
191
+ model_clap_loss_layer=10
192
+ elif [[ $train_mode == "clap" && ${config_option} -eq 6 ]]; then
193
+ echo "Config ${train_mode} ${config_option}"
194
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
195
+ task_load_clap_emb=true
196
+ task_load_source_file=true
197
+ task_load_mel_file=false
198
+ model_proj_type=2
199
+ model_clone_batch=4
200
+ dataset_batch_size=96
201
+ model_clap_loss=1.0
202
+ average_top_k_layers=12
203
+ model_clap_loss_type="mse"
204
+ model_clap_loss_layer=8
205
+ elif [[ $train_mode == "clap" && ${config_option} -eq 7 ]]; then
206
+ echo "Config ${train_mode} ${config_option}"
207
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
208
+ task_load_clap_emb=true
209
+ task_load_source_file=true
210
+ task_load_mel_file=false
211
+ model_proj_type=2
212
+ model_clone_batch=4
213
+ dataset_batch_size=96
214
+ model_clap_loss=1.0
215
+ average_top_k_layers=12
216
+ model_clap_loss_type="mse"
217
+ model_clap_loss_layer=6
218
+ elif [[ $train_mode == "clap" && ${config_option} -eq 8 ]]; then
219
+ echo "Config ${train_mode} ${config_option}"
220
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
221
+ task_load_clap_emb=true
222
+ task_load_source_file=true
223
+ task_load_mel_file=false
224
+ model_proj_type=2
225
+ model_clone_batch=4
226
+ model_clap_loss=5.0
227
+ dataset_batch_size=96
228
+ average_top_k_layers=12
229
+ model_clap_loss_type="mse"
230
+ checkpoint_keep_interval_updates=-1
231
+ elif [[ $train_mode == "clap" && ${config_option} -eq 9 ]]; then
232
+ echo "Config ${train_mode} ${config_option}"
233
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
234
+ task_load_clap_emb=true
235
+ task_load_source_file=true
236
+ task_load_mel_file=false
237
+ model_proj_type=2
238
+ model_clone_batch=4
239
+ model_clap_loss=0.1
240
+ dataset_batch_size=96
241
+ average_top_k_layers=12
242
+ model_clap_loss_type="mse"
243
+ checkpoint_keep_interval_updates=-1
244
+ elif [[ $train_mode == "ast" && ${config_option} -eq 0 ]]; then
245
+ echo "Config ${train_mode} ${config_option}"
246
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
247
+ task_load_clap_emb=true
248
+ model_proj_type=4
249
+ model_clone_batch=4
250
+ model_clap_loss=1.0
251
+ dataset_batch_size=48
252
+ elif [[ $train_mode == "ast" && ${config_option} -eq 1 ]]; then
253
+ echo "Config ${train_mode} ${config_option}"
254
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
255
+ task_load_clap_emb=true
256
+ model_proj_type=4
257
+ model_clone_batch=4
258
+ model_clap_loss=0.001
259
+ dataset_batch_size=48
260
+ elif [[ $train_mode == "ast" && ${config_option} -eq 2 ]]; then
261
+ echo "Config ${train_mode} ${config_option}"
262
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
263
+ task_load_clap_emb=true
264
+ model_proj_type=4
265
+ model_clone_batch=4
266
+ model_clap_loss=0.01
267
+ dataset_batch_size=48
268
+ elif [[ $train_mode == "ast" && ${config_option} -eq 3 ]]; then
269
+ echo "Config ${train_mode} ${config_option}"
270
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_out
271
+ task_load_clap_emb=true
272
+ model_proj_type=6
273
+ model_clone_batch=4
274
+ dataset_batch_size=48
275
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 0 ]]; then
276
+ echo "Config ${train_mode} ${config_option}"
277
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
278
+ task_load_clap_emb=true
279
+ task_load_source_file=true
280
+ task_load_mel_file=false
281
+ model_proj_type=2
282
+ model_clone_batch=4
283
+ dataset_batch_size=64 # original 48 oom on 4090 24G change distributed_world_size
284
+ model_clap_loss=1.0
285
+ average_top_k_layers=11 # modify with model depth
286
+ model_add_conv=true
287
+ model_depth=11 #
288
+ checkpoint_keep_interval_updates=-1 # default 1
289
+ checkpoint_save_interval_updates=10000
290
+ fi
291
+
292
+ python fairseq_cli/hydra_train.py -m \
293
+ --config-dir ./EAT/config \
294
+ --config-name pretraining_AS2M \
295
+ common.user_dir=./EAT \
296
+ checkpoint.save_dir=${checkpoint_save_dir} \
297
+ checkpoint.restore_file=${checkpoint_restore_file} \
298
+ distributed_training.distributed_world_size=${1:-2} \
299
+ dataset.num_workers=24 \
300
+ dataset.data_buffer_size=48 \
301
+ dataset.batch_size=${dataset_batch_size} \
302
+ task.data=${task_data} \
303
+ task.h5_format=False \
304
+ task.load_clap_emb=${task_load_clap_emb} \
305
+ +task.load_source_file=${task_load_source_file} \
306
+ +task.load_mel_file=${task_load_mel_file} \
307
+ model.proj_type=${model_proj_type} \
308
+ model.clone_batch=${model_clone_batch} \
309
+ model.clap_loss=${model_clap_loss} \
310
+ model.average_top_k_layers=${average_top_k_layers} \
311
+ +model.add_conv=${model_add_conv} \
312
+ +model.clap_loss_type=${model_clap_loss_type} \
313
+ +model.clap_loss_layer=${model_clap_loss_layer} \
314
+ +model.dispersive_loss=${model_dispersive_loss} \
315
+ +model.dispersive_loss_layer=${model_dispersive_loss_layer} \
316
+ model.depth=${model_depth} \
317
+ checkpoint.keep_interval_updates=${checkpoint_keep_interval_updates} \
318
+ checkpoint.save_interval_updates=${checkpoint_save_interval_updates}
pre_4_AS2M/disp_6_2025-09-28_08-49-54/pretraining_AS2M.sh ADDED
@@ -0,0 +1,318 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/usr/bin/env bash
#
# Launch EAT pre-training through fairseq's Hydra entry point
# (fairseq_cli/hydra_train.py, config "pretraining_AS2M").
#
# Usage: pretraining_AS2M.sh [WORLD_SIZE]
#   WORLD_SIZE  distributed_training.distributed_world_size (default: 2)
#
# Pick an experiment by editing train_mode / config_option below. Each run
# writes to a fresh, timestamped directory under SAVE_DIR_ROOT and stores a
# copy of this script there. The relative paths (./EAT, fairseq_cli/) mean the
# script has to be started from the directory that contains them.

set -euo pipefail

die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

warn() {
  printf 'warning: %s\n' "$*" >&2
}

# config options
train_mode=disp
config_option=6
# change world size
world_size="${1:-2}"

# shared config
SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
DATA_ROOT=/opt/gpfs/home/chushu/data/audioset/setting
checkpoint_save_dir="${SAVE_DIR_ROOT}/${train_mode}_${config_option}_$(date +"%Y-%m-%d_%H-%M-%S")"
checkpoint_restore_file="${checkpoint_save_dir}/checkpoint_last.pt"

# Absolute path and file name of this script (symlinks resolved).
script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
script_name="$(basename -- "$script_path")"
# Create the run directory and copy the script into it (keeping permissions
# and timestamps). A missing run directory is fatal; a failed copy is only
# reported, as before.
mkdir -p -- "$checkpoint_save_dir" \
  || die "cannot create checkpoint directory: ${checkpoint_save_dir}"
cp -p -- "$script_path" "$checkpoint_save_dir/$script_name" \
  || warn "could not copy ${script_path} into ${checkpoint_save_dir}"
echo "script_path: ${script_path}"
echo "checkpoint_save_dir: ${checkpoint_save_dir}"

# default setting
model_clone_batch=4
dataset_batch_size=48
model_clap_loss=0
model_clap_loss_type="mse" # option ce cosine l1
model_clap_loss_layer=0
average_top_k_layers=12
model_add_conv=false
model_depth=12
model_dispersive_loss=0
model_dispersive_loss_layer=0
checkpoint_keep_interval_updates=1 # TODO change this parameter if need
checkpoint_save_interval_updates=10000
# Empty means "this preset does not set the flag"; see the command line below.
task_load_source_file=""
task_load_mel_file=""

# --- preset fragments shared by several experiments -------------------------

# PRETRAIN_AS2M data, CLAP embeddings off, proj_type=null.
preset_plain() {
  task_data="${DATA_ROOT}/PRETRAIN_AS2M"
  task_load_clap_emb=false
  task_load_source_file=true
  task_load_mel_file=false
  model_proj_type=null
}

# preset_plain with clone_batch=1 / batch_size=384 and dispersive weight $1.
preset_disp_sweep() {
  preset_plain
  model_clone_batch=1
  dataset_batch_size=384
  model_dispersive_loss=$1
}

# PRETRAIN_AS2M_w_CLAP data, CLAP embeddings on, proj_type=2, clap_loss=1.0.
preset_clap() {
  task_data="${DATA_ROOT}/PRETRAIN_AS2M_w_CLAP"
  task_load_clap_emb=true
  model_proj_type=2
  model_clap_loss=1.0
}

# PRETRAIN_AS2M_w_AST/$1 data, embeddings on, proj_type=$2.
preset_ast() {
  task_data="${DATA_ROOT}/PRETRAIN_AS2M_w_AST/$1"
  task_load_clap_emb=true
  model_proj_type=$2
}

# Source-file loading on, mel-file loading off (only some presets set this).
load_source_only() {
  task_load_source_file=true
  task_load_mel_file=false
}

# --- experiment table -------------------------------------------------------
# Values equal to the defaults above are not repeated here.
case "${train_mode}:${config_option}" in
  default:0)
    preset_plain
    dataset_batch_size=96
    checkpoint_keep_interval_updates=-1
    ;;
  disp:0)
    preset_plain
    dataset_batch_size=96
    model_dispersive_loss=1
    ;;
  disp:1) preset_disp_sweep 1 ;;
  disp:2) preset_disp_sweep 10.0 ;;
  disp:3) preset_disp_sweep 100.0 ;;
  disp:4) preset_disp_sweep 10000.0 ;;
  disp:5) preset_disp_sweep 1000.0 ;;
  disp:6)
    preset_plain
    dataset_batch_size=96
    model_dispersive_loss=1000.0
    model_dispersive_loss_layer=10
    ;;
  clap:0)
    preset_clap
    ;;
  clap:1)
    preset_clap
    average_top_k_layers=1
    ;;
  # loss type ablation
  clap:2)
    preset_clap
    model_clap_loss_type="ce"
    ;;
  clap:3)
    preset_clap
    model_clap_loss_type="l1"
    ;;
  clap:4)
    preset_clap
    dataset_batch_size=96
    model_clap_loss_type="cosine"
    ;;
  # loss layer ablation
  clap:5)
    preset_clap
    dataset_batch_size=96
    model_clap_loss_layer=10
    ;;
  clap:6)
    preset_clap
    load_source_only
    dataset_batch_size=96
    model_clap_loss_layer=8
    ;;
  clap:7)
    preset_clap
    load_source_only
    dataset_batch_size=96
    model_clap_loss_layer=6
    ;;
  clap:8)
    preset_clap
    load_source_only
    model_clap_loss=5.0
    dataset_batch_size=96
    checkpoint_keep_interval_updates=-1
    ;;
  clap:9)
    preset_clap
    load_source_only
    model_clap_loss=0.1
    dataset_batch_size=96
    checkpoint_keep_interval_updates=-1
    ;;
  ast:0)
    preset_ast mlp_head_in 4
    model_clap_loss=1.0
    ;;
  ast:1)
    preset_ast mlp_head_in 4
    model_clap_loss=0.001
    ;;
  ast:2)
    preset_ast mlp_head_in 4
    model_clap_loss=0.01
    ;;
  ast:3)
    preset_ast mlp_head_out 6
    ;;
  conv_clap:0)
    preset_clap
    load_source_only
    dataset_batch_size=64 # original 48 oom on 4090 24G change distributed_world_size
    average_top_k_layers=11 # modify with model depth
    model_add_conv=true
    model_depth=11
    checkpoint_keep_interval_updates=-1 # default 1
    ;;
  *)
    # Previously an unknown combination fell through and started training
    # with task_data / model_proj_type unset.
    die "no preset for train_mode='${train_mode}' config_option='${config_option}'"
    ;;
esac
echo "Config ${train_mode} ${config_option}"

# --- command line -----------------------------------------------------------
train_args=(
  -m
  --config-dir ./EAT/config
  --config-name pretraining_AS2M
  common.user_dir=./EAT
  "checkpoint.save_dir=${checkpoint_save_dir}"
  "checkpoint.restore_file=${checkpoint_restore_file}"
  "distributed_training.distributed_world_size=${world_size}"
  dataset.num_workers=24
  dataset.data_buffer_size=48
  "dataset.batch_size=${dataset_batch_size}"
  "task.data=${task_data}"
  task.h5_format=False
  "task.load_clap_emb=${task_load_clap_emb}"
)

# Presets clap 0-5 and ast 0-3 never set these two flags, which used to put
# an empty override (`+task.load_source_file=`) on the command line. The
# override is now left out in that case.
# NOTE(review): this relies on the task's own default being what those older
# presets intend - confirm before re-running them.
if [[ -n "${task_load_source_file}" ]]; then
  train_args+=("+task.load_source_file=${task_load_source_file}")
fi
if [[ -n "${task_load_mel_file}" ]]; then
  train_args+=("+task.load_mel_file=${task_load_mel_file}")
fi

train_args+=(
  "model.proj_type=${model_proj_type}"
  "model.clone_batch=${model_clone_batch}"
  "model.clap_loss=${model_clap_loss}"
  "model.average_top_k_layers=${average_top_k_layers}"
  "+model.add_conv=${model_add_conv}"
  "+model.clap_loss_type=${model_clap_loss_type}"
  "+model.clap_loss_layer=${model_clap_loss_layer}"
  "+model.dispersive_loss=${model_dispersive_loss}"
  "+model.dispersive_loss_layer=${model_dispersive_loss_layer}"
  "model.depth=${model_depth}"
  "checkpoint.keep_interval_updates=${checkpoint_keep_interval_updates}"
  "checkpoint.save_interval_updates=${checkpoint_save_interval_updates}"
)

python fairseq_cli/hydra_train.py "${train_args[@]}"
pre_4_AS2M/disp_6_2025-09-28_08-55-19/pretraining_AS2M.sh ADDED
@@ -0,0 +1,318 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/usr/bin/env bash
#
# Launch EAT pre-training through fairseq's Hydra entry point
# (fairseq_cli/hydra_train.py, config "pretraining_AS2M").
#
# Usage: pretraining_AS2M.sh [WORLD_SIZE]
#   WORLD_SIZE  distributed_training.distributed_world_size (default: 2)
#
# Pick an experiment by editing train_mode / config_option below. Each run
# writes to a fresh, timestamped directory under SAVE_DIR_ROOT and stores a
# copy of this script there. The relative paths (./EAT, fairseq_cli/) mean the
# script has to be started from the directory that contains them.

set -euo pipefail

die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

warn() {
  printf 'warning: %s\n' "$*" >&2
}

# config options
train_mode=disp
config_option=6
# change world size
world_size="${1:-2}"

# shared config
SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
DATA_ROOT=/opt/gpfs/home/chushu/data/audioset/setting
checkpoint_save_dir="${SAVE_DIR_ROOT}/${train_mode}_${config_option}_$(date +"%Y-%m-%d_%H-%M-%S")"
checkpoint_restore_file="${checkpoint_save_dir}/checkpoint_last.pt"

# Absolute path and file name of this script (symlinks resolved).
script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
script_name="$(basename -- "$script_path")"
# Create the run directory and copy the script into it (keeping permissions
# and timestamps). A missing run directory is fatal; a failed copy is only
# reported, as before.
mkdir -p -- "$checkpoint_save_dir" \
  || die "cannot create checkpoint directory: ${checkpoint_save_dir}"
cp -p -- "$script_path" "$checkpoint_save_dir/$script_name" \
  || warn "could not copy ${script_path} into ${checkpoint_save_dir}"
echo "script_path: ${script_path}"
echo "checkpoint_save_dir: ${checkpoint_save_dir}"

# default setting
model_clone_batch=4
dataset_batch_size=48
model_clap_loss=0
model_clap_loss_type="mse" # option ce cosine l1
model_clap_loss_layer=0
average_top_k_layers=12
model_add_conv=false
model_depth=12
model_dispersive_loss=0
model_dispersive_loss_layer=0
checkpoint_keep_interval_updates=1 # TODO change this parameter if need
checkpoint_save_interval_updates=10000
# Empty means "this preset does not set the flag"; see the command line below.
task_load_source_file=""
task_load_mel_file=""

# --- preset fragments shared by several experiments -------------------------

# PRETRAIN_AS2M data, CLAP embeddings off, proj_type=null.
preset_plain() {
  task_data="${DATA_ROOT}/PRETRAIN_AS2M"
  task_load_clap_emb=false
  task_load_source_file=true
  task_load_mel_file=false
  model_proj_type=null
}

# preset_plain with clone_batch=1 / batch_size=384 and dispersive weight $1.
preset_disp_sweep() {
  preset_plain
  model_clone_batch=1
  dataset_batch_size=384
  model_dispersive_loss=$1
}

# PRETRAIN_AS2M_w_CLAP data, CLAP embeddings on, proj_type=2, clap_loss=1.0.
preset_clap() {
  task_data="${DATA_ROOT}/PRETRAIN_AS2M_w_CLAP"
  task_load_clap_emb=true
  model_proj_type=2
  model_clap_loss=1.0
}

# PRETRAIN_AS2M_w_AST/$1 data, embeddings on, proj_type=$2.
preset_ast() {
  task_data="${DATA_ROOT}/PRETRAIN_AS2M_w_AST/$1"
  task_load_clap_emb=true
  model_proj_type=$2
}

# Source-file loading on, mel-file loading off (only some presets set this).
load_source_only() {
  task_load_source_file=true
  task_load_mel_file=false
}

# --- experiment table -------------------------------------------------------
# Values equal to the defaults above are not repeated here.
case "${train_mode}:${config_option}" in
  default:0)
    preset_plain
    dataset_batch_size=96
    checkpoint_keep_interval_updates=-1
    ;;
  disp:0)
    preset_plain
    dataset_batch_size=96
    model_dispersive_loss=1
    ;;
  disp:1) preset_disp_sweep 1 ;;
  disp:2) preset_disp_sweep 10.0 ;;
  disp:3) preset_disp_sweep 100.0 ;;
  disp:4) preset_disp_sweep 10000.0 ;;
  disp:5) preset_disp_sweep 1000.0 ;;
  disp:6)
    preset_plain
    dataset_batch_size=96
    model_dispersive_loss=1000.0
    model_dispersive_loss_layer=10
    ;;
  clap:0)
    preset_clap
    ;;
  clap:1)
    preset_clap
    average_top_k_layers=1
    ;;
  # loss type ablation
  clap:2)
    preset_clap
    model_clap_loss_type="ce"
    ;;
  clap:3)
    preset_clap
    model_clap_loss_type="l1"
    ;;
  clap:4)
    preset_clap
    dataset_batch_size=96
    model_clap_loss_type="cosine"
    ;;
  # loss layer ablation
  clap:5)
    preset_clap
    dataset_batch_size=96
    model_clap_loss_layer=10
    ;;
  clap:6)
    preset_clap
    load_source_only
    dataset_batch_size=96
    model_clap_loss_layer=8
    ;;
  clap:7)
    preset_clap
    load_source_only
    dataset_batch_size=96
    model_clap_loss_layer=6
    ;;
  clap:8)
    preset_clap
    load_source_only
    model_clap_loss=5.0
    dataset_batch_size=96
    checkpoint_keep_interval_updates=-1
    ;;
  clap:9)
    preset_clap
    load_source_only
    model_clap_loss=0.1
    dataset_batch_size=96
    checkpoint_keep_interval_updates=-1
    ;;
  ast:0)
    preset_ast mlp_head_in 4
    model_clap_loss=1.0
    ;;
  ast:1)
    preset_ast mlp_head_in 4
    model_clap_loss=0.001
    ;;
  ast:2)
    preset_ast mlp_head_in 4
    model_clap_loss=0.01
    ;;
  ast:3)
    preset_ast mlp_head_out 6
    ;;
  conv_clap:0)
    preset_clap
    load_source_only
    dataset_batch_size=64 # original 48 oom on 4090 24G change distributed_world_size
    average_top_k_layers=11 # modify with model depth
    model_add_conv=true
    model_depth=11
    checkpoint_keep_interval_updates=-1 # default 1
    ;;
  *)
    # Previously an unknown combination fell through and started training
    # with task_data / model_proj_type unset.
    die "no preset for train_mode='${train_mode}' config_option='${config_option}'"
    ;;
esac
echo "Config ${train_mode} ${config_option}"

# --- command line -----------------------------------------------------------
train_args=(
  -m
  --config-dir ./EAT/config
  --config-name pretraining_AS2M
  common.user_dir=./EAT
  "checkpoint.save_dir=${checkpoint_save_dir}"
  "checkpoint.restore_file=${checkpoint_restore_file}"
  "distributed_training.distributed_world_size=${world_size}"
  dataset.num_workers=24
  dataset.data_buffer_size=48
  "dataset.batch_size=${dataset_batch_size}"
  "task.data=${task_data}"
  task.h5_format=False
  "task.load_clap_emb=${task_load_clap_emb}"
)

# Presets clap 0-5 and ast 0-3 never set these two flags, which used to put
# an empty override (`+task.load_source_file=`) on the command line. The
# override is now left out in that case.
# NOTE(review): this relies on the task's own default being what those older
# presets intend - confirm before re-running them.
if [[ -n "${task_load_source_file}" ]]; then
  train_args+=("+task.load_source_file=${task_load_source_file}")
fi
if [[ -n "${task_load_mel_file}" ]]; then
  train_args+=("+task.load_mel_file=${task_load_mel_file}")
fi

train_args+=(
  "model.proj_type=${model_proj_type}"
  "model.clone_batch=${model_clone_batch}"
  "model.clap_loss=${model_clap_loss}"
  "model.average_top_k_layers=${average_top_k_layers}"
  "+model.add_conv=${model_add_conv}"
  "+model.clap_loss_type=${model_clap_loss_type}"
  "+model.clap_loss_layer=${model_clap_loss_layer}"
  "+model.dispersive_loss=${model_dispersive_loss}"
  "+model.dispersive_loss_layer=${model_dispersive_loss_layer}"
  "model.depth=${model_depth}"
  "checkpoint.keep_interval_updates=${checkpoint_keep_interval_updates}"
  "checkpoint.save_interval_updates=${checkpoint_save_interval_updates}"
)

python fairseq_cli/hydra_train.py "${train_args[@]}"
pre_4_AS2M/disp_6_2025-09-28_08-58-05/pretraining_AS2M.sh ADDED
@@ -0,0 +1,318 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ # config options
3
+ train_mode=disp
4
+ config_option=6
5
+ # change world size
6
+
7
+ # shared config
8
+ SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
9
+ checkpoint_save_dir=${SAVE_DIR_ROOT}/${train_mode}_${config_option}_$(date +"%Y-%m-%d_%H-%M-%S")
10
+ checkpoint_restore_file=${checkpoint_save_dir}/checkpoint_last.pt
11
+
12
+ # 脚本自身的绝对路径与文件名(解析符号链接)
13
+ script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
14
+ script_name="$(basename -- "$script_path")"
15
+ # 创建目录并拷贝(保留权限与时间戳)
16
+ mkdir -p -- "$checkpoint_save_dir"
17
+ cp -p -- "$script_path" "$checkpoint_save_dir/$script_name"
18
+ echo "script_path: ${script_path}"
19
+ echo "checkpoint_save_dir: ${checkpoint_save_dir}"
20
+
21
+ # default setting
22
+ model_clone_batch=4
23
+ dataset_batch_size=48
24
+ model_clap_loss=0
25
+ model_clap_loss_type="mse" # option ce cosine l1
26
+ model_clap_loss_layer=0
27
+ average_top_k_layers=12
28
+ model_add_conv=false
29
+ model_depth=12
30
+ model_dispersive_loss=0
31
+ model_dispersive_loss_layer=0
32
+ checkpoint_keep_interval_updates=1 # TODO change this parameter if need
33
+ checkpoint_save_interval_updates=10000
34
+
35
+ if [[ $train_mode == "default" && ${config_option} -eq 0 ]]; then
36
+ echo "Config ${train_mode} ${config_option}"
37
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
38
+ task_load_clap_emb=false
39
+ task_load_source_file=true
40
+ task_load_mel_file=false
41
+ model_proj_type=null
42
+ model_clone_batch=4
43
+ dataset_batch_size=96
44
+ model_clap_loss=0
45
+ checkpoint_keep_interval_updates=-1
46
+ elif [[ $train_mode == "disp" && ${config_option} -eq 0 ]]; then
47
+ echo "Config ${train_mode} ${config_option}"
48
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
49
+ task_load_clap_emb=false
50
+ task_load_source_file=true
51
+ task_load_mel_file=false
52
+ model_proj_type=null
53
+ model_clone_batch=4
54
+ dataset_batch_size=96
55
+ model_dispersive_loss=1
56
+ model_dispersive_loss_layer=0
57
+ checkpoint_keep_interval_updates=1
58
+ elif [[ $train_mode == "disp" && ${config_option} -eq 1 ]]; then
59
+ echo "Config ${train_mode} ${config_option}"
60
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
61
+ task_load_clap_emb=false
62
+ task_load_source_file=true
63
+ task_load_mel_file=false
64
+ model_proj_type=null
65
+ model_clone_batch=1
66
+ dataset_batch_size=384
67
+ model_dispersive_loss=1
68
+ model_dispersive_loss_layer=0
69
+ checkpoint_keep_interval_updates=1
70
+ elif [[ $train_mode == "disp" && ${config_option} -eq 2 ]]; then
71
+ echo "Config ${train_mode} ${config_option}"
72
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
73
+ task_load_clap_emb=false
74
+ task_load_source_file=true
75
+ task_load_mel_file=false
76
+ model_proj_type=null
77
+ model_clone_batch=1
78
+ dataset_batch_size=384
79
+ model_dispersive_loss=10.0
80
+ model_dispersive_loss_layer=0
81
+ checkpoint_keep_interval_updates=1
82
+ elif [[ $train_mode == "disp" && ${config_option} -eq 3 ]]; then
83
+ echo "Config ${train_mode} ${config_option}"
84
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
85
+ task_load_clap_emb=false
86
+ task_load_source_file=true
87
+ task_load_mel_file=false
88
+ model_proj_type=null
89
+ model_clone_batch=1
90
+ dataset_batch_size=384
91
+ model_dispersive_loss=100.0
92
+ model_dispersive_loss_layer=0
93
+ checkpoint_keep_interval_updates=1
94
+ elif [[ $train_mode == "disp" && ${config_option} -eq 4 ]]; then
95
+ echo "Config ${train_mode} ${config_option}"
96
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
97
+ task_load_clap_emb=false
98
+ task_load_source_file=true
99
+ task_load_mel_file=false
100
+ model_proj_type=null
101
+ model_clone_batch=1
102
+ dataset_batch_size=384
103
+ model_dispersive_loss=10000.0
104
+ model_dispersive_loss_layer=0
105
+ checkpoint_keep_interval_updates=1
106
+ elif [[ $train_mode == "disp" && ${config_option} -eq 5 ]]; then
107
+ echo "Config ${train_mode} ${config_option}"
108
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
109
+ task_load_clap_emb=false
110
+ task_load_source_file=true
111
+ task_load_mel_file=false
112
+ model_proj_type=null
113
+ model_clone_batch=1
114
+ dataset_batch_size=384
115
+ model_dispersive_loss=1000.0
116
+ model_dispersive_loss_layer=0
117
+ checkpoint_keep_interval_updates=1
118
+ elif [[ $train_mode == "disp" && ${config_option} -eq 6 ]]; then
119
+ echo "Config ${train_mode} ${config_option}"
120
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
121
+ task_load_clap_emb=false
122
+ task_load_source_file=true
123
+ task_load_mel_file=false
124
+ model_proj_type=null
125
+ model_clone_batch=4
126
+ dataset_batch_size=96
127
+ model_dispersive_loss=1000.0
128
+ model_dispersive_loss_layer=10
129
+ checkpoint_keep_interval_updates=1
130
+ elif [[ $train_mode == "clap" && ${config_option} -eq 0 ]]; then
131
+ echo "Config ${train_mode} ${config_option}"
132
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
133
+ task_load_clap_emb=true
134
+ model_proj_type=2
135
+ model_clone_batch=4
136
+ dataset_batch_size=48
137
+ model_clap_loss=1.0
138
+ average_top_k_layers=12
139
+ model_add_conv=false
140
+ elif [[ $train_mode == "clap" && ${config_option} -eq 1 ]]; then
141
+ echo "Config ${train_mode} ${config_option}"
142
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
143
+ task_load_clap_emb=true
144
+ model_proj_type=2
145
+ model_clone_batch=4
146
+ dataset_batch_size=48
147
+ model_clap_loss=1.0
148
+ average_top_k_layers=1
149
+ # loss type ablation
150
+ elif [[ $train_mode == "clap" && ${config_option} -eq 2 ]]; then
151
+ echo "Config ${train_mode} ${config_option}"
152
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
153
+ task_load_clap_emb=true
154
+ model_proj_type=2
155
+ model_clone_batch=4
156
+ dataset_batch_size=48
157
+ model_clap_loss=1.0
158
+ average_top_k_layers=12
159
+ model_clap_loss_type="ce"
160
+ elif [[ $train_mode == "clap" && ${config_option} -eq 3 ]]; then
161
+ echo "Config ${train_mode} ${config_option}"
162
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
163
+ task_load_clap_emb=true
164
+ model_proj_type=2
165
+ model_clone_batch=4
166
+ dataset_batch_size=48
167
+ model_clap_loss=1.0
168
+ average_top_k_layers=12
169
+ model_clap_loss_type="l1"
170
+ elif [[ $train_mode == "clap" && ${config_option} -eq 4 ]]; then
171
+ echo "Config ${train_mode} ${config_option}"
172
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
173
+ task_load_clap_emb=true
174
+ model_proj_type=2
175
+ model_clone_batch=4
176
+ dataset_batch_size=96
177
+ model_clap_loss=1.0
178
+ average_top_k_layers=12
179
+ model_clap_loss_type="cosine"
180
+ # loss layer ablation
181
+ elif [[ $train_mode == "clap" && ${config_option} -eq 5 ]]; then
182
+ echo "Config ${train_mode} ${config_option}"
183
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
184
+ task_load_clap_emb=true
185
+ model_proj_type=2
186
+ model_clone_batch=4
187
+ dataset_batch_size=96
188
+ model_clap_loss=1.0
189
+ average_top_k_layers=12
190
+ model_clap_loss_type="mse"
191
+ model_clap_loss_layer=10
192
+ elif [[ $train_mode == "clap" && ${config_option} -eq 6 ]]; then
193
+ echo "Config ${train_mode} ${config_option}"
194
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
195
+ task_load_clap_emb=true
196
+ task_load_source_file=true
197
+ task_load_mel_file=false
198
+ model_proj_type=2
199
+ model_clone_batch=4
200
+ dataset_batch_size=96
201
+ model_clap_loss=1.0
202
+ average_top_k_layers=12
203
+ model_clap_loss_type="mse"
204
+ model_clap_loss_layer=8
205
+ elif [[ $train_mode == "clap" && ${config_option} -eq 7 ]]; then
206
+ echo "Config ${train_mode} ${config_option}"
207
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
208
+ task_load_clap_emb=true
209
+ task_load_source_file=true
210
+ task_load_mel_file=false
211
+ model_proj_type=2
212
+ model_clone_batch=4
213
+ dataset_batch_size=96
214
+ model_clap_loss=1.0
215
+ average_top_k_layers=12
216
+ model_clap_loss_type="mse"
217
+ model_clap_loss_layer=6
218
+ elif [[ $train_mode == "clap" && ${config_option} -eq 8 ]]; then
219
+ echo "Config ${train_mode} ${config_option}"
220
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
221
+ task_load_clap_emb=true
222
+ task_load_source_file=true
223
+ task_load_mel_file=false
224
+ model_proj_type=2
225
+ model_clone_batch=4
226
+ model_clap_loss=5.0
227
+ dataset_batch_size=96
228
+ average_top_k_layers=12
229
+ model_clap_loss_type="mse"
230
+ checkpoint_keep_interval_updates=-1
231
+ elif [[ $train_mode == "clap" && ${config_option} -eq 9 ]]; then
232
+ echo "Config ${train_mode} ${config_option}"
233
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
234
+ task_load_clap_emb=true
235
+ task_load_source_file=true
236
+ task_load_mel_file=false
237
+ model_proj_type=2
238
+ model_clone_batch=4
239
+ model_clap_loss=0.1
240
+ dataset_batch_size=96
241
+ average_top_k_layers=12
242
+ model_clap_loss_type="mse"
243
+ checkpoint_keep_interval_updates=-1
244
+ elif [[ $train_mode == "ast" && ${config_option} -eq 0 ]]; then
245
+ echo "Config ${train_mode} ${config_option}"
246
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
247
+ task_load_clap_emb=true
248
+ model_proj_type=4
249
+ model_clone_batch=4
250
+ model_clap_loss=1.0
251
+ dataset_batch_size=48
252
+ elif [[ $train_mode == "ast" && ${config_option} -eq 1 ]]; then
253
+ echo "Config ${train_mode} ${config_option}"
254
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
255
+ task_load_clap_emb=true
256
+ model_proj_type=4
257
+ model_clone_batch=4
258
+ model_clap_loss=0.001
259
+ dataset_batch_size=48
260
+ elif [[ $train_mode == "ast" && ${config_option} -eq 2 ]]; then
261
+ echo "Config ${train_mode} ${config_option}"
262
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
263
+ task_load_clap_emb=true
264
+ model_proj_type=4
265
+ model_clone_batch=4
266
+ model_clap_loss=0.01
267
+ dataset_batch_size=48
268
+ elif [[ $train_mode == "ast" && ${config_option} -eq 3 ]]; then
269
+ echo "Config ${train_mode} ${config_option}"
270
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_out
271
+ task_load_clap_emb=true
272
+ model_proj_type=6
273
+ model_clone_batch=4
274
+ dataset_batch_size=48
275
+ elif [[ $train_mode == "conv_clap" && ${config_option} -eq 0 ]]; then
276
+ echo "Config ${train_mode} ${config_option}"
277
+ task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
278
+ task_load_clap_emb=true
279
+ task_load_source_file=true
280
+ task_load_mel_file=false
281
+ model_proj_type=2
282
+ model_clone_batch=4
283
+ dataset_batch_size=64 # original 48 oom on 4090 24G change distributed_world_size
284
+ model_clap_loss=1.0
285
+ average_top_k_layers=11 # modify with model depth
286
+ model_add_conv=true
287
+ model_depth=11 #
288
+ checkpoint_keep_interval_updates=-1 # default 1
289
+ checkpoint_save_interval_updates=10000
290
+ fi
291
+
292
+ # Launch fairseq's Hydra training entry point with the pretraining_AS2M
+ # config from ./EAT/config, forwarding the shell variables assigned above
+ # as command-line overrides.
+ # - $1 (optional) is used as distributed_world_size; it defaults to 2.
+ # - Overrides written as "+key=value" use Hydra's append syntax; presumably
+ # those keys are absent from the base config -- TODO confirm.
+ # Every expansion is double-quoted so that a value containing whitespace or
+ # glob characters cannot be split into extra arguments (ShellCheck SC2086).
+ python fairseq_cli/hydra_train.py -m \
293
+ --config-dir ./EAT/config \
294
+ --config-name pretraining_AS2M \
295
+ common.user_dir=./EAT \
296
+ checkpoint.save_dir="${checkpoint_save_dir}" \
297
+ checkpoint.restore_file="${checkpoint_restore_file}" \
298
+ distributed_training.distributed_world_size="${1:-2}" \
299
+ dataset.num_workers=24 \
300
+ dataset.data_buffer_size=48 \
301
+ dataset.batch_size="${dataset_batch_size}" \
302
+ task.data="${task_data}" \
303
+ task.h5_format=False \
304
+ task.load_clap_emb="${task_load_clap_emb}" \
305
+ +task.load_source_file="${task_load_source_file}" \
306
+ +task.load_mel_file="${task_load_mel_file}" \
307
+ model.proj_type="${model_proj_type}" \
308
+ model.clone_batch="${model_clone_batch}" \
309
+ model.clap_loss="${model_clap_loss}" \
310
+ model.average_top_k_layers="${average_top_k_layers}" \
311
+ +model.add_conv="${model_add_conv}" \
312
+ +model.clap_loss_type="${model_clap_loss_type}" \
313
+ +model.clap_loss_layer="${model_clap_loss_layer}" \
314
+ +model.dispersive_loss="${model_dispersive_loss}" \
315
+ +model.dispersive_loss_layer="${model_dispersive_loss_layer}" \
316
+ model.depth="${model_depth}" \
317
+ checkpoint.keep_interval_updates="${checkpoint_keep_interval_updates}" \
318
+ checkpoint.save_interval_updates="${checkpoint_save_interval_updates}"