Add files using upload-large-folder tool
Browse files- ast_1_AS20k/ast_new_audioset/checkpoint_1.pt +3 -0
- ast_1_AS20k/ast_new_audioset/checkpoint_11.pt +3 -0
- ast_1_AS20k/ast_new_audioset/checkpoint_15.pt +3 -0
- ast_1_AS20k/ast_new_audioset/checkpoint_16.pt +3 -0
- ast_1_AS20k/ast_new_audioset/checkpoint_2.pt +3 -0
- ast_1_AS20k/ast_new_audioset/checkpoint_20.pt +3 -0
- ast_1_AS20k/ast_new_audioset/checkpoint_21.pt +3 -0
- ast_1_AS20k/ast_new_audioset/checkpoint_22.pt +3 -0
- ast_1_AS20k/ast_new_audioset/checkpoint_3.pt +3 -0
- ast_1_AS20k/ast_new_audioset/checkpoint_9.pt +3 -0
- ast_1_AS20k/ast_new_audioset/checkpoint_best.pt +3 -0
- ast_1_AS20k/ast_origin_implement/test-balanced-f10-t10-pTrue-b12-lr5e-5-decoupe/result.csv +25 -0
- ast_1_AS20k/ast_origin_implement/test-balanced-f10-t10-pTrue-b12-lr5e-5-decoupe/train.log +837 -0
- ast_1_AS20k/ast_origin_implement/test-balanced-f10-t10-pTrue-b12-lr5e-5-decoupe/wa_result.csv +5 -0
- pre_4_AS2M/conv_clap_1_2025-09-30_06-58-32/pretraining_AS2M.sh +340 -0
- pre_4_AS2M/conv_clap_1_2025-09-30_06-59-40/pretraining_AS2M.sh +339 -0
- pre_4_AS2M/conv_clap_1_2025-09-30_07-01-07/pretraining_AS2M.sh +339 -0
- pre_4_AS2M/conv_clap_1_2025-09-30_07-08-58/pretraining_AS2M.sh +336 -0
- pre_4_AS2M/conv_clap_1_2025-09-30_07-14-17/pretraining_AS2M.sh +336 -0
- pre_4_AS2M/conv_clap_1_2025-09-30_07-19-43/pretraining_AS2M.sh +336 -0
- pre_4_AS2M/conv_clap_1_2025-09-30_07-25-52/pretraining_AS2M.sh +336 -0
- pre_4_AS2M/conv_clap_1_2025-09-30_08-31-42/pretraining_AS2M.sh +418 -0
- pre_4_AS2M/conv_clap_1_2025-09-30_08-31-59/pretraining_AS2M.sh +416 -0
- pre_4_AS2M/conv_clap_2_2025-09-30_09-12-51/pretraining_AS2M.sh +416 -0
- pre_4_AS2M/conv_clap_4_2025-09-30_07-37-48/pretraining_AS2M.sh +387 -0
- pre_4_AS2M/conv_clap_4_2025-09-30_07-38-18/pretraining_AS2M.sh +384 -0
- pre_4_AS2M/conv_clap_4_2025-09-30_07-42-31/pretraining_AS2M.sh +384 -0
- pre_4_AS2M/conv_clap_4_2025-09-30_07-45-39/pretraining_AS2M.sh +384 -0
- pre_4_AS2M/conv_clap_4_2025-09-30_07-49-28/pretraining_AS2M.sh +384 -0
- pre_4_AS2M/conv_clap_4_2025-09-30_07-57-18/pretraining_AS2M.sh +384 -0
- pre_4_AS2M/conv_clap_4_2025-09-30_08-05-21/pretraining_AS2M.sh +384 -0
- pre_4_AS2M/conv_clap_4_2025-09-30_08-13-17/pretraining_AS2M.sh +384 -0
- pre_4_AS2M/conv_clap_4_2025-09-30_08-23-09/pretraining_AS2M.sh +384 -0
- pre_4_AS2M/disp_0_2025-09-24_13-58-24/pretraining_AS2M.sh +246 -0
- pre_4_AS2M/disp_0_2025-09-24_14-09-31/pretraining_AS2M.sh +246 -0
- pre_4_AS2M/disp_0_2025-09-24_14-12-12/pretraining_AS2M.sh +246 -0
- pre_4_AS2M/disp_0_2025-09-24_14-17-47/pretraining_AS2M.sh +246 -0
- pre_4_AS2M/disp_1_2025-09-26_14-32-16/pretraining_AS2M.sh +258 -0
- pre_4_AS2M/disp_1_2025-09-26_14-33-34/pretraining_AS2M.sh +258 -0
- pre_4_AS2M/disp_1_2025-09-26_14-34-35/pretraining_AS2M.sh +258 -0
- pre_4_AS2M/disp_1_2025-09-26_14-39-04/pretraining_AS2M.sh +258 -0
- pre_4_AS2M/disp_1_2025-09-26_14-57-51/pretraining_AS2M.sh +258 -0
- pre_4_AS2M/disp_3_2025-09-27_05-57-32/pretraining_AS2M.sh +282 -0
- pre_4_AS2M/disp_4_2025-09-28_05-38-34/pretraining_AS2M.sh +294 -0
- pre_4_AS2M/disp_5_2025-09-28_06-51-25/pretraining_AS2M.sh +306 -0
- pre_4_AS2M/disp_5_2025-09-28_07-56-38/pretraining_AS2M.sh +318 -0
- pre_4_AS2M/disp_6_2025-09-28_08-28-48/pretraining_AS2M.sh +318 -0
- pre_4_AS2M/disp_6_2025-09-28_08-49-54/pretraining_AS2M.sh +318 -0
- pre_4_AS2M/disp_6_2025-09-28_08-55-19/pretraining_AS2M.sh +318 -0
- pre_4_AS2M/disp_6_2025-09-28_08-58-05/pretraining_AS2M.sh +318 -0
ast_1_AS20k/ast_new_audioset/checkpoint_1.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:22f4d1bdccbf34b3986cdfaf97bceaa9e82b4d3a8e011ea4e111904f294f8f6f
|
| 3 |
+
size 352586874
|
ast_1_AS20k/ast_new_audioset/checkpoint_11.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:153f81afacd4d63f575871978aeb9d4d0bfdeb0b4a77c2ce2a1564dd07608579
|
| 3 |
+
size 352587039
|
ast_1_AS20k/ast_new_audioset/checkpoint_15.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:10cb65aeca1f773590b23ce9d2d705aead1e54a46f5fdf886d06f5aac1f3da41
|
| 3 |
+
size 352587039
|
ast_1_AS20k/ast_new_audioset/checkpoint_16.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ebed3d5c168a32dadc61357b1a234ebf59651252ce1d5eb1880b6e469e82365b
|
| 3 |
+
size 352587039
|
ast_1_AS20k/ast_new_audioset/checkpoint_2.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2be2bef5d1da4dcbbcf358260ca6f88b9322fdd3308399302c897cde1413f8f7
|
| 3 |
+
size 352586874
|
ast_1_AS20k/ast_new_audioset/checkpoint_20.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c740289dd3129cdcb9e18ebbf228d47b9113f89cd5d4d0f5d6fe098e2a7ae9b6
|
| 3 |
+
size 352587039
|
ast_1_AS20k/ast_new_audioset/checkpoint_21.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:be020848a6bbca8a5eb8de7f3c10109fbc4284f5b9edb364d011ca4bddc6fa52
|
| 3 |
+
size 352587039
|
ast_1_AS20k/ast_new_audioset/checkpoint_22.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:164f6312bb7956a80a31add7003edd9363dc9d69e28c76cdfd6223061bfc74c8
|
| 3 |
+
size 352587039
|
ast_1_AS20k/ast_new_audioset/checkpoint_3.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:34a58857d6206ff01e85982306ead2139a232cba9c49c334667a1893faf46abd
|
| 3 |
+
size 352586874
|
ast_1_AS20k/ast_new_audioset/checkpoint_9.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7da4625fa3f5a8bd3adabf856aa377917ae7da7ee2f177d83ec8e66094d14ead
|
| 3 |
+
size 352586874
|
ast_1_AS20k/ast_new_audioset/checkpoint_best.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d74a92edfb701a1e7fbfba743caab8d687ab122e679115e44d99b55809aa41ce
|
| 3 |
+
size 352587369
|
ast_1_AS20k/ast_origin_implement/test-balanced-f10-t10-pTrue-b12-lr5e-5-decoupe/result.csv
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
1.422225945558316940e-02,6.487563597807765037e-01,6.059869982241350601e-03,9.984290564287322534e-01,5.401801698178747557e-01,1.123867675634848823e-01,6.956494450569152832e-01,1.422225945558316940e-02,6.487563597807765037e-01,5.000000000000000240e-05
|
| 2 |
+
5.325997280235295062e-02,8.158944676321755463e-01,1.306437393490925657e-02,9.709086275491799478e-01,1.272550891389265137e+00,2.355064642799161584e-02,6.952877044677734375e-01,5.093040520527827852e-02,8.065371915632957300e-01,5.000000000000000240e-05
|
| 3 |
+
1.104590830848553862e-01,8.968618757921559270e-01,6.706614959781720398e-02,7.171915221978218957e-01,1.787383839223240622e+00,2.194607003870671647e-02,6.950482130050659180e-01,1.073419470179194091e-01,8.839559858178680507e-01,5.000000000000000240e-05
|
| 4 |
+
1.848396644615896978e-01,9.313106471184702251e-01,7.167611495216880124e-02,6.694785396948753631e-01,2.100989061540869507e+00,2.017162883966943515e-02,6.948846578598022461e-01,1.791385835122070447e-01,9.215415158207399537e-01,5.000000000000000240e-05
|
| 5 |
+
2.224462178579142690e-01,9.413168661608459775e-01,7.231485140869021999e-02,7.035087224749546619e-01,2.214551820998618137e+00,1.874961108785041033e-02,6.944540739059448242e-01,2.185575499842252467e-01,9.345964582357189077e-01,5.000000000000000240e-05
|
| 6 |
+
2.548203316499514925e-01,9.479657052440092491e-01,5.870892769216756735e-02,7.725844616170880474e-01,2.298720947675281678e+00,1.763229149318959466e-02,6.945921778678894043e-01,2.526630268584063033e-01,9.435292535915005274e-01,5.000000000000000240e-05
|
| 7 |
+
2.742160424132608632e-01,9.515682622442890315e-01,5.749621905254873044e-02,8.066989852343421363e-01,2.347952949601005201e+00,1.670030884553481282e-02,6.943714618682861328e-01,2.735843322137676004e-01,9.486307065585155573e-01,5.000000000000000240e-05
|
| 8 |
+
2.891340671033661436e-01,9.539887857444313557e-01,5.727079787194905985e-02,8.220393549750605322e-01,2.382701716918723900e+00,1.596927918330596358e-02,6.942600607872009277e-01,2.902446482600923860e-01,9.520384123928512521e-01,5.000000000000000240e-05
|
| 9 |
+
2.997937190939304331e-01,9.556058565055307596e-01,5.469515067152212751e-02,8.430501262804962481e-01,2.406743177850766635e+00,1.527043480806759661e-02,6.942504048347473145e-01,3.028899608327243476e-01,9.547648724123766195e-01,5.000000000000000240e-05
|
| 10 |
+
3.019933285999398254e-01,9.551177780206882018e-01,5.673163717388598343e-02,8.369639127603817341e-01,2.399413530906429504e+00,1.471027910790956815e-02,6.941569447517395020e-01,3.115329036723915590e-01,9.565940278989387702e-01,5.000000000000000240e-05
|
| 11 |
+
3.216990458954857579e-01,9.573572677069273063e-01,5.875842801928884279e-02,8.495200585745303901e-01,2.433591614166512151e+00,1.350063216294685418e-02,6.940920352935791016e-01,3.202949833204151719e-01,9.580394075942408882e-01,2.500000000000000120e-05
|
| 12 |
+
3.237938459476596975e-01,9.568247991379073003e-01,5.627071898598238336e-02,8.472165610019362081e-01,2.425336344437045710e+00,1.277143365193674981e-02,6.940239071846008301e-01,3.264502469374453986e-01,9.591369993481135836e-01,2.500000000000000120e-05
|
| 13 |
+
3.238792107002882448e-01,9.560063329207294514e-01,5.774346572337184930e-02,8.414383773149294310e-01,2.412805928578551029e+00,1.238875423823177294e-02,6.939673423767089844e-01,3.308619542913583955e-01,9.598927917725000869e-01,2.500000000000000120e-05
|
| 14 |
+
3.206599597620702347e-01,9.540928202972318584e-01,5.840423268860757411e-02,8.374629739899399627e-01,2.384227802331127410e+00,1.193688203737030933e-02,6.939578056335449219e-01,3.341927086664225888e-01,9.604666643675140447e-01,2.500000000000000120e-05
|
| 15 |
+
3.241879057377317075e-01,9.550509282205020822e-01,5.668243643471783388e-02,8.459431676912494424e-01,2.398414627994010839e+00,1.159679852697971178e-02,6.938989162445068359e-01,3.370616133181535967e-01,9.609689860314127863e-01,2.500000000000000120e-05
|
| 16 |
+
3.274090426375390050e-01,9.538047907862127195e-01,5.882890954492734498e-02,8.395026159089084006e-01,2.380009463526119973e+00,1.087626258523142174e-02,6.938264966011047363e-01,3.390441351590992025e-01,9.612789191340032069e-01,1.250000000000000060e-05
|
| 17 |
+
3.266298745594018449e-01,9.531561281365846794e-01,5.786502311129928383e-02,8.372581634183908772e-01,2.370586249390366440e+00,1.057763744308783116e-02,6.937884092330932617e-01,3.404189266645940570e-01,9.615116642607353104e-01,1.250000000000000060e-05
|
| 18 |
+
3.272432009645968032e-01,9.523735520790914677e-01,5.759590126563685075e-02,8.398432031605835846e-01,2.359355951258224504e+00,1.041628441236315018e-02,6.938322186470031738e-01,3.419091536737948189e-01,9.616955903658975791e-01,1.250000000000000060e-05
|
| 19 |
+
3.255807419210634546e-01,9.512071086194867631e-01,5.740117490775870773e-02,8.361371858623385389e-01,2.342888179552985672e+00,1.005493109991406793e-02,6.938048601150512695e-01,3.427275325615427026e-01,9.618161049645025384e-01,1.250000000000000060e-05
|
| 20 |
+
3.260315090994287957e-01,9.512684697375038967e-01,5.997403778213243608e-02,8.288957385703106251e-01,2.343746581586883870e+00,9.951956874289869318e-03,6.936931610107421875e-01,3.433311674477271258e-01,9.618989983333274818e-01,1.250000000000000060e-05
|
| 21 |
+
3.249040983002512983e-01,9.500949421687898688e-01,5.878458400191581557e-02,8.265363288594659297e-01,2.327477157438643030e+00,9.605016077793862572e-03,6.936856508255004883e-01,3.436332817873031242e-01,9.619345990419769787e-01,6.250000000000000300e-06
|
| 22 |
+
3.255695187517096967e-01,9.499316465986847868e-01,5.797600592170489703e-02,8.279170449610905314e-01,2.325237544245208010e+00,9.480287065851862593e-03,6.936790347099304199e-01,3.441953690545042077e-01,9.619586781271860509e-01,6.250000000000000300e-06
|
| 23 |
+
3.243047968373096723e-01,9.491688511518816540e-01,5.580597852971000417e-02,8.301318670729562754e-01,2.314852206236093224e+00,9.343003603446222577e-03,6.937055587768554688e-01,3.442617626297410083e-01,9.619489128942837475e-01,6.250000000000000300e-06
|
| 24 |
+
3.250544555977981642e-01,9.489687673682015712e-01,5.894922958301676563e-02,8.262462971438052639e-01,2.312148661301067776e+00,9.364594131517802247e-03,6.936329603195190430e-01,3.443755436123956959e-01,9.619321524013675351e-01,6.250000000000000300e-06
|
| 25 |
+
3.239834326417725396e-01,9.480251122897331850e-01,6.097054245045899906e-02,8.205525080094422385e-01,2.299510454271016968e+00,9.206688810194279052e-03,6.936240792274475098e-01,3.443998966946163476e-01,9.619017707066223055e-01,6.250000000000000300e-06
|
ast_1_AS20k/ast_origin_implement/test-balanced-f10-t10-pTrue-b12-lr5e-5-decoupe/train.log
ADDED
|
@@ -0,0 +1,837 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
I am process 52222, running on zcs-cfc-eat-l-worker-0: starting (Wed Aug 27 02:40:44 2025)
|
| 2 |
+
now train a audio spectrogram transformer model
|
| 3 |
+
balanced sampler is not used
|
| 4 |
+
---------------the train dataloader---------------
|
| 5 |
+
now using following mask: 48 freq, 192 time
|
| 6 |
+
now using mix-up with rate 0.500000
|
| 7 |
+
now process audioset
|
| 8 |
+
use dataset mean -4.268 and std 4.569 to normalize the input.
|
| 9 |
+
number of classes is 527
|
| 10 |
+
---------------the evaluation dataloader---------------
|
| 11 |
+
now using following mask: 0 freq, 0 time
|
| 12 |
+
now using mix-up with rate 0.000000
|
| 13 |
+
now process audioset
|
| 14 |
+
use dataset mean -4.268 and std 4.569 to normalize the input.
|
| 15 |
+
number of classes is 527
|
| 16 |
+
---------------AST Model Summary---------------
|
| 17 |
+
ImageNet pretraining: True, AudioSet pretraining: False
|
| 18 |
+
frequncey stride=10, time stride=10
|
| 19 |
+
number of patches=1212
|
| 20 |
+
|
| 21 |
+
Creating experiment directory: /opt/gpfs/home/chushu/exp/eat/ast_1_AS20k/ast_origin_implement/test-balanced-f10-t10-pTrue-b12-lr5e-5-decoupe
|
| 22 |
+
Now starting training for 25 epochs
|
| 23 |
+
running on cuda
|
| 24 |
+
Total parameter number is : 88.132 million
|
| 25 |
+
Total trainable parameter number is : 88.132 million
|
| 26 |
+
now training with audioset, main metrics: mAP, loss function: BCEWithLogitsLoss(), learning rate scheduler: <torch.optim.lr_scheduler.MultiStepLR object at 0x7f99dcde0df0>
|
| 27 |
+
The learning rate scheduler starts at 10 epoch with decay rate of 0.500 every 5 epochs
|
| 28 |
+
current #steps=0, #epochs=1
|
| 29 |
+
start training...
|
| 30 |
+
---------------
|
| 31 |
+
2025-08-27 02:40:45.711145
|
| 32 |
+
current #epochs=1, #steps=0
|
| 33 |
+
warm-up learning rate is 0.000000
|
| 34 |
+
warm-up learning rate is 0.000003
|
| 35 |
+
warm-up learning rate is 0.000005
|
| 36 |
+
Epoch: [1][100/1713] Per Sample Total Time 0.01385 Per Sample Data Time 0.00061 Per Sample DNN Time 0.01324 Train Loss 0.6809
|
| 37 |
+
warm-up learning rate is 0.000008
|
| 38 |
+
warm-up learning rate is 0.000010
|
| 39 |
+
Epoch: [1][200/1713] Per Sample Total Time 0.01299 Per Sample Data Time 0.00033 Per Sample DNN Time 0.01267 Train Loss 0.5411
|
| 40 |
+
warm-up learning rate is 0.000013
|
| 41 |
+
warm-up learning rate is 0.000015
|
| 42 |
+
Epoch: [1][300/1713] Per Sample Total Time 0.01253 Per Sample Data Time 0.00023 Per Sample DNN Time 0.01230 Train Loss 0.4430
|
| 43 |
+
warm-up learning rate is 0.000017
|
| 44 |
+
warm-up learning rate is 0.000020
|
| 45 |
+
Epoch: [1][400/1713] Per Sample Total Time 0.01241 Per Sample Data Time 0.00018 Per Sample DNN Time 0.01222 Train Loss 0.3676
|
| 46 |
+
warm-up learning rate is 0.000023
|
| 47 |
+
warm-up learning rate is 0.000025
|
| 48 |
+
Epoch: [1][500/1713] Per Sample Total Time 0.01238 Per Sample Data Time 0.00016 Per Sample DNN Time 0.01223 Train Loss 0.3109
|
| 49 |
+
warm-up learning rate is 0.000028
|
| 50 |
+
warm-up learning rate is 0.000030
|
| 51 |
+
Epoch: [1][600/1713] Per Sample Total Time 0.01228 Per Sample Data Time 0.00014 Per Sample DNN Time 0.01215 Train Loss 0.2684
|
| 52 |
+
warm-up learning rate is 0.000033
|
| 53 |
+
warm-up learning rate is 0.000035
|
| 54 |
+
Epoch: [1][700/1713] Per Sample Total Time 0.01227 Per Sample Data Time 0.00012 Per Sample DNN Time 0.01215 Train Loss 0.2360
|
| 55 |
+
warm-up learning rate is 0.000038
|
| 56 |
+
warm-up learning rate is 0.000040
|
| 57 |
+
Epoch: [1][800/1713] Per Sample Total Time 0.01229 Per Sample Data Time 0.00011 Per Sample DNN Time 0.01218 Train Loss 0.2108
|
| 58 |
+
warm-up learning rate is 0.000043
|
| 59 |
+
warm-up learning rate is 0.000045
|
| 60 |
+
Epoch: [1][900/1713] Per Sample Total Time 0.01228 Per Sample Data Time 0.00011 Per Sample DNN Time 0.01217 Train Loss 0.1908
|
| 61 |
+
warm-up learning rate is 0.000048
|
| 62 |
+
warm-up learning rate is 0.000050
|
| 63 |
+
Epoch: [1][1000/1713] Per Sample Total Time 0.01227 Per Sample Data Time 0.00010 Per Sample DNN Time 0.01217 Train Loss 0.1745
|
| 64 |
+
Epoch: [1][1100/1713] Per Sample Total Time 0.01217 Per Sample Data Time 0.00009 Per Sample DNN Time 0.01208 Train Loss 0.1610
|
| 65 |
+
Epoch: [1][1200/1713] Per Sample Total Time 0.01217 Per Sample Data Time 0.00009 Per Sample DNN Time 0.01208 Train Loss 0.1497
|
| 66 |
+
Epoch: [1][1300/1713] Per Sample Total Time 0.01218 Per Sample Data Time 0.00009 Per Sample DNN Time 0.01209 Train Loss 0.1402
|
| 67 |
+
Epoch: [1][1400/1713] Per Sample Total Time 0.01217 Per Sample Data Time 0.00008 Per Sample DNN Time 0.01209 Train Loss 0.1319
|
| 68 |
+
Epoch: [1][1500/1713] Per Sample Total Time 0.01214 Per Sample Data Time 0.00008 Per Sample DNN Time 0.01206 Train Loss 0.1248
|
| 69 |
+
Epoch: [1][1600/1713] Per Sample Total Time 0.01211 Per Sample Data Time 0.00008 Per Sample DNN Time 0.01203 Train Loss 0.1185
|
| 70 |
+
Epoch: [1][1700/1713] Per Sample Total Time 0.01205 Per Sample Data Time 0.00007 Per Sample DNN Time 0.01198 Train Loss 0.1130
|
| 71 |
+
start validation
|
| 72 |
+
mAP: 0.014222
|
| 73 |
+
AUC: 0.648756
|
| 74 |
+
Avg Precision: 0.006060
|
| 75 |
+
Avg Recall: 0.998429
|
| 76 |
+
d_prime: 0.540180
|
| 77 |
+
train_loss: 0.112387
|
| 78 |
+
valid_loss: 0.695649
|
| 79 |
+
validation finished
|
| 80 |
+
Epoch-1 lr: 5e-05
|
| 81 |
+
epoch 1 training time: 327.615
|
| 82 |
+
---------------
|
| 83 |
+
2025-08-27 02:46:13.326272
|
| 84 |
+
current #epochs=2, #steps=1713
|
| 85 |
+
Epoch: [2][87/1713] Per Sample Total Time 0.01450 Per Sample Data Time 0.00147 Per Sample DNN Time 0.01303 Train Loss 0.0242
|
| 86 |
+
Epoch: [2][187/1713] Per Sample Total Time 0.01221 Per Sample Data Time 0.00071 Per Sample DNN Time 0.01150 Train Loss 0.0241
|
| 87 |
+
Epoch: [2][287/1713] Per Sample Total Time 0.01223 Per Sample Data Time 0.00048 Per Sample DNN Time 0.01175 Train Loss 0.0241
|
| 88 |
+
Epoch: [2][387/1713] Per Sample Total Time 0.01234 Per Sample Data Time 0.00036 Per Sample DNN Time 0.01198 Train Loss 0.0242
|
| 89 |
+
Epoch: [2][487/1713] Per Sample Total Time 0.01244 Per Sample Data Time 0.00030 Per Sample DNN Time 0.01214 Train Loss 0.0241
|
| 90 |
+
Epoch: [2][587/1713] Per Sample Total Time 0.01185 Per Sample Data Time 0.00025 Per Sample DNN Time 0.01159 Train Loss 0.0240
|
| 91 |
+
Epoch: [2][687/1713] Per Sample Total Time 0.01197 Per Sample Data Time 0.00022 Per Sample DNN Time 0.01174 Train Loss 0.0240
|
| 92 |
+
Epoch: [2][787/1713] Per Sample Total Time 0.01209 Per Sample Data Time 0.00020 Per Sample DNN Time 0.01189 Train Loss 0.0239
|
| 93 |
+
Epoch: [2][887/1713] Per Sample Total Time 0.01219 Per Sample Data Time 0.00018 Per Sample DNN Time 0.01200 Train Loss 0.0239
|
| 94 |
+
Epoch: [2][987/1713] Per Sample Total Time 0.01225 Per Sample Data Time 0.00017 Per Sample DNN Time 0.01208 Train Loss 0.0239
|
| 95 |
+
Epoch: [2][1087/1713] Per Sample Total Time 0.01196 Per Sample Data Time 0.00016 Per Sample DNN Time 0.01181 Train Loss 0.0238
|
| 96 |
+
Epoch: [2][1187/1713] Per Sample Total Time 0.01201 Per Sample Data Time 0.00015 Per Sample DNN Time 0.01187 Train Loss 0.0238
|
| 97 |
+
Epoch: [2][1287/1713] Per Sample Total Time 0.01205 Per Sample Data Time 0.00014 Per Sample DNN Time 0.01190 Train Loss 0.0237
|
| 98 |
+
Epoch: [2][1387/1713] Per Sample Total Time 0.01206 Per Sample Data Time 0.00013 Per Sample DNN Time 0.01193 Train Loss 0.0237
|
| 99 |
+
Epoch: [2][1487/1713] Per Sample Total Time 0.01183 Per Sample Data Time 0.00013 Per Sample DNN Time 0.01171 Train Loss 0.0236
|
| 100 |
+
Epoch: [2][1587/1713] Per Sample Total Time 0.01184 Per Sample Data Time 0.00012 Per Sample DNN Time 0.01171 Train Loss 0.0236
|
| 101 |
+
Epoch: [2][1687/1713] Per Sample Total Time 0.01187 Per Sample Data Time 0.00012 Per Sample DNN Time 0.01175 Train Loss 0.0236
|
| 102 |
+
start validation
|
| 103 |
+
mAP: 0.053260
|
| 104 |
+
AUC: 0.815894
|
| 105 |
+
Avg Precision: 0.013064
|
| 106 |
+
Avg Recall: 0.970909
|
| 107 |
+
d_prime: 1.272551
|
| 108 |
+
train_loss: 0.023551
|
| 109 |
+
valid_loss: 0.695288
|
| 110 |
+
validation finished
|
| 111 |
+
Epoch-2 lr: 5e-05
|
| 112 |
+
epoch 2 training time: 324.709
|
| 113 |
+
---------------
|
| 114 |
+
2025-08-27 02:51:38.034980
|
| 115 |
+
current #epochs=3, #steps=3426
|
| 116 |
+
Epoch: [3][74/1713] Per Sample Total Time 0.01292 Per Sample Data Time 0.00173 Per Sample DNN Time 0.01119 Train Loss 0.0230
|
| 117 |
+
Epoch: [3][174/1713] Per Sample Total Time 0.01254 Per Sample Data Time 0.00077 Per Sample DNN Time 0.01177 Train Loss 0.0228
|
| 118 |
+
Epoch: [3][274/1713] Per Sample Total Time 0.01243 Per Sample Data Time 0.00050 Per Sample DNN Time 0.01192 Train Loss 0.0228
|
| 119 |
+
Epoch: [3][374/1713] Per Sample Total Time 0.01241 Per Sample Data Time 0.00038 Per Sample DNN Time 0.01203 Train Loss 0.0228
|
| 120 |
+
Epoch: [3][474/1713] Per Sample Total Time 0.01191 Per Sample Data Time 0.00031 Per Sample DNN Time 0.01160 Train Loss 0.0226
|
| 121 |
+
Epoch: [3][574/1713] Per Sample Total Time 0.01196 Per Sample Data Time 0.00026 Per Sample DNN Time 0.01169 Train Loss 0.0226
|
| 122 |
+
Epoch: [3][674/1713] Per Sample Total Time 0.01200 Per Sample Data Time 0.00023 Per Sample DNN Time 0.01177 Train Loss 0.0226
|
| 123 |
+
Epoch: [3][774/1713] Per Sample Total Time 0.01170 Per Sample Data Time 0.00021 Per Sample DNN Time 0.01149 Train Loss 0.0225
|
| 124 |
+
Epoch: [3][874/1713] Per Sample Total Time 0.01170 Per Sample Data Time 0.00019 Per Sample DNN Time 0.01151 Train Loss 0.0224
|
| 125 |
+
Epoch: [3][974/1713] Per Sample Total Time 0.01178 Per Sample Data Time 0.00017 Per Sample DNN Time 0.01160 Train Loss 0.0223
|
| 126 |
+
Epoch: [3][1074/1713] Per Sample Total Time 0.01184 Per Sample Data Time 0.00016 Per Sample DNN Time 0.01168 Train Loss 0.0223
|
| 127 |
+
Epoch: [3][1174/1713] Per Sample Total Time 0.01168 Per Sample Data Time 0.00015 Per Sample DNN Time 0.01153 Train Loss 0.0222
|
| 128 |
+
Epoch: [3][1274/1713] Per Sample Total Time 0.01166 Per Sample Data Time 0.00014 Per Sample DNN Time 0.01152 Train Loss 0.0221
|
| 129 |
+
Epoch: [3][1374/1713] Per Sample Total Time 0.01173 Per Sample Data Time 0.00013 Per Sample DNN Time 0.01160 Train Loss 0.0221
|
| 130 |
+
Epoch: [3][1474/1713] Per Sample Total Time 0.01174 Per Sample Data Time 0.00013 Per Sample DNN Time 0.01161 Train Loss 0.0221
|
| 131 |
+
Epoch: [3][1574/1713] Per Sample Total Time 0.01178 Per Sample Data Time 0.00012 Per Sample DNN Time 0.01165 Train Loss 0.0220
|
| 132 |
+
Epoch: [3][1674/1713] Per Sample Total Time 0.01168 Per Sample Data Time 0.00012 Per Sample DNN Time 0.01156 Train Loss 0.0220
|
| 133 |
+
start validation
|
| 134 |
+
mAP: 0.110459
|
| 135 |
+
AUC: 0.896862
|
| 136 |
+
Avg Precision: 0.067066
|
| 137 |
+
Avg Recall: 0.717192
|
| 138 |
+
d_prime: 1.787384
|
| 139 |
+
train_loss: 0.021946
|
| 140 |
+
valid_loss: 0.695048
|
| 141 |
+
validation finished
|
| 142 |
+
Epoch-3 lr: 5e-05
|
| 143 |
+
epoch 3 training time: 324.807
|
| 144 |
+
---------------
|
| 145 |
+
2025-08-27 02:57:02.842395
|
| 146 |
+
current #epochs=4, #steps=5139
|
| 147 |
+
Epoch: [4][61/1713] Per Sample Total Time 0.01467 Per Sample Data Time 0.00201 Per Sample DNN Time 0.01265 Train Loss 0.0211
|
| 148 |
+
Epoch: [4][161/1713] Per Sample Total Time 0.01335 Per Sample Data Time 0.00080 Per Sample DNN Time 0.01255 Train Loss 0.0209
|
| 149 |
+
Epoch: [4][261/1713] Per Sample Total Time 0.01297 Per Sample Data Time 0.00051 Per Sample DNN Time 0.01245 Train Loss 0.0208
|
| 150 |
+
Epoch: [4][361/1713] Per Sample Total Time 0.01263 Per Sample Data Time 0.00038 Per Sample DNN Time 0.01225 Train Loss 0.0209
|
| 151 |
+
Epoch: [4][461/1713] Per Sample Total Time 0.01237 Per Sample Data Time 0.00031 Per Sample DNN Time 0.01206 Train Loss 0.0208
|
| 152 |
+
Epoch: [4][561/1713] Per Sample Total Time 0.01234 Per Sample Data Time 0.00026 Per Sample DNN Time 0.01208 Train Loss 0.0207
|
| 153 |
+
Epoch: [4][661/1713] Per Sample Total Time 0.01232 Per Sample Data Time 0.00023 Per Sample DNN Time 0.01209 Train Loss 0.0207
|
| 154 |
+
Epoch: [4][761/1713] Per Sample Total Time 0.01233 Per Sample Data Time 0.00021 Per Sample DNN Time 0.01213 Train Loss 0.0207
|
| 155 |
+
Epoch: [4][861/1713] Per Sample Total Time 0.01227 Per Sample Data Time 0.00019 Per Sample DNN Time 0.01208 Train Loss 0.0206
|
| 156 |
+
Epoch: [4][961/1713] Per Sample Total Time 0.01222 Per Sample Data Time 0.00017 Per Sample DNN Time 0.01205 Train Loss 0.0205
|
| 157 |
+
Epoch: [4][1061/1713] Per Sample Total Time 0.01197 Per Sample Data Time 0.00016 Per Sample DNN Time 0.01181 Train Loss 0.0205
|
| 158 |
+
Epoch: [4][1161/1713] Per Sample Total Time 0.01199 Per Sample Data Time 0.00015 Per Sample DNN Time 0.01185 Train Loss 0.0204
|
| 159 |
+
Epoch: [4][1261/1713] Per Sample Total Time 0.01204 Per Sample Data Time 0.00014 Per Sample DNN Time 0.01190 Train Loss 0.0204
|
| 160 |
+
Epoch: [4][1361/1713] Per Sample Total Time 0.01205 Per Sample Data Time 0.00013 Per Sample DNN Time 0.01192 Train Loss 0.0203
|
| 161 |
+
Epoch: [4][1461/1713] Per Sample Total Time 0.01204 Per Sample Data Time 0.00013 Per Sample DNN Time 0.01192 Train Loss 0.0203
|
| 162 |
+
Epoch: [4][1561/1713] Per Sample Total Time 0.01199 Per Sample Data Time 0.00012 Per Sample DNN Time 0.01187 Train Loss 0.0202
|
| 163 |
+
Epoch: [4][1661/1713] Per Sample Total Time 0.01202 Per Sample Data Time 0.00012 Per Sample DNN Time 0.01190 Train Loss 0.0202
|
| 164 |
+
start validation
|
| 165 |
+
mAP: 0.184840
|
| 166 |
+
AUC: 0.931311
|
| 167 |
+
Avg Precision: 0.071676
|
| 168 |
+
Avg Recall: 0.669479
|
| 169 |
+
d_prime: 2.100989
|
| 170 |
+
train_loss: 0.020172
|
| 171 |
+
valid_loss: 0.694885
|
| 172 |
+
validation finished
|
| 173 |
+
Epoch-4 lr: 5e-05
|
| 174 |
+
epoch 4 training time: 330.929
|
| 175 |
+
---------------
|
| 176 |
+
2025-08-27 03:02:33.771092
|
| 177 |
+
current #epochs=5, #steps=6852
|
| 178 |
+
Epoch: [5][48/1713] Per Sample Total Time 0.01044 Per Sample Data Time 0.00242 Per Sample DNN Time 0.00802 Train Loss 0.0193
|
| 179 |
+
Epoch: [5][148/1713] Per Sample Total Time 0.01150 Per Sample Data Time 0.00082 Per Sample DNN Time 0.01068 Train Loss 0.0191
|
| 180 |
+
Epoch: [5][248/1713] Per Sample Total Time 0.01172 Per Sample Data Time 0.00051 Per Sample DNN Time 0.01121 Train Loss 0.0190
|
| 181 |
+
Epoch: [5][348/1713] Per Sample Total Time 0.01187 Per Sample Data Time 0.00038 Per Sample DNN Time 0.01149 Train Loss 0.0190
|
| 182 |
+
Epoch: [5][448/1713] Per Sample Total Time 0.01133 Per Sample Data Time 0.00030 Per Sample DNN Time 0.01103 Train Loss 0.0190
|
| 183 |
+
Epoch: [5][548/1713] Per Sample Total Time 0.01143 Per Sample Data Time 0.00025 Per Sample DNN Time 0.01118 Train Loss 0.0191
|
| 184 |
+
Epoch: [5][648/1713] Per Sample Total Time 0.01154 Per Sample Data Time 0.00022 Per Sample DNN Time 0.01132 Train Loss 0.0190
|
| 185 |
+
Epoch: [5][748/1713] Per Sample Total Time 0.01166 Per Sample Data Time 0.00020 Per Sample DNN Time 0.01146 Train Loss 0.0189
|
| 186 |
+
Epoch: [5][848/1713] Per Sample Total Time 0.01167 Per Sample Data Time 0.00018 Per Sample DNN Time 0.01149 Train Loss 0.0189
|
| 187 |
+
Epoch: [5][948/1713] Per Sample Total Time 0.01143 Per Sample Data Time 0.00016 Per Sample DNN Time 0.01126 Train Loss 0.0190
|
| 188 |
+
Epoch: [5][1048/1713] Per Sample Total Time 0.01153 Per Sample Data Time 0.00015 Per Sample DNN Time 0.01138 Train Loss 0.0189
|
| 189 |
+
Epoch: [5][1148/1713] Per Sample Total Time 0.01166 Per Sample Data Time 0.00014 Per Sample DNN Time 0.01152 Train Loss 0.0189
|
| 190 |
+
Epoch: [5][1248/1713] Per Sample Total Time 0.01169 Per Sample Data Time 0.00014 Per Sample DNN Time 0.01155 Train Loss 0.0189
|
| 191 |
+
Epoch: [5][1348/1713] Per Sample Total Time 0.01164 Per Sample Data Time 0.00013 Per Sample DNN Time 0.01151 Train Loss 0.0189
|
| 192 |
+
Epoch: [5][1448/1713] Per Sample Total Time 0.01169 Per Sample Data Time 0.00012 Per Sample DNN Time 0.01157 Train Loss 0.0188
|
| 193 |
+
Epoch: [5][1548/1713] Per Sample Total Time 0.01173 Per Sample Data Time 0.00012 Per Sample DNN Time 0.01161 Train Loss 0.0188
|
| 194 |
+
Epoch: [5][1648/1713] Per Sample Total Time 0.01176 Per Sample Data Time 0.00011 Per Sample DNN Time 0.01164 Train Loss 0.0187
|
| 195 |
+
start validation
|
| 196 |
+
mAP: 0.222446
|
| 197 |
+
AUC: 0.941317
|
| 198 |
+
Avg Precision: 0.072315
|
| 199 |
+
Avg Recall: 0.703509
|
| 200 |
+
d_prime: 2.214552
|
| 201 |
+
train_loss: 0.018750
|
| 202 |
+
valid_loss: 0.694454
|
| 203 |
+
validation finished
|
| 204 |
+
Epoch-5 lr: 5e-05
|
| 205 |
+
epoch 5 training time: 325.961
|
| 206 |
+
---------------
|
| 207 |
+
2025-08-27 03:07:59.731986
|
| 208 |
+
current #epochs=6, #steps=8565
|
| 209 |
+
Epoch: [6][35/1713] Per Sample Total Time 0.01637 Per Sample Data Time 0.00389 Per Sample DNN Time 0.01247 Train Loss 0.0174
|
| 210 |
+
Epoch: [6][135/1713] Per Sample Total Time 0.01327 Per Sample Data Time 0.00107 Per Sample DNN Time 0.01220 Train Loss 0.0180
|
| 211 |
+
Epoch: [6][235/1713] Per Sample Total Time 0.01290 Per Sample Data Time 0.00064 Per Sample DNN Time 0.01226 Train Loss 0.0179
|
| 212 |
+
Epoch: [6][335/1713] Per Sample Total Time 0.01226 Per Sample Data Time 0.00046 Per Sample DNN Time 0.01180 Train Loss 0.0179
|
| 213 |
+
Epoch: [6][435/1713] Per Sample Total Time 0.01228 Per Sample Data Time 0.00036 Per Sample DNN Time 0.01191 Train Loss 0.0179
|
| 214 |
+
Epoch: [6][535/1713] Per Sample Total Time 0.01234 Per Sample Data Time 0.00030 Per Sample DNN Time 0.01203 Train Loss 0.0178
|
| 215 |
+
Epoch: [6][635/1713] Per Sample Total Time 0.01232 Per Sample Data Time 0.00026 Per Sample DNN Time 0.01205 Train Loss 0.0178
|
| 216 |
+
Epoch: [6][735/1713] Per Sample Total Time 0.01214 Per Sample Data Time 0.00023 Per Sample DNN Time 0.01191 Train Loss 0.0178
|
| 217 |
+
Epoch: [6][835/1713] Per Sample Total Time 0.01211 Per Sample Data Time 0.00021 Per Sample DNN Time 0.01190 Train Loss 0.0178
|
| 218 |
+
Epoch: [6][935/1713] Per Sample Total Time 0.01213 Per Sample Data Time 0.00019 Per Sample DNN Time 0.01193 Train Loss 0.0177
|
| 219 |
+
Epoch: [6][1035/1713] Per Sample Total Time 0.01215 Per Sample Data Time 0.00018 Per Sample DNN Time 0.01197 Train Loss 0.0177
|
| 220 |
+
Epoch: [6][1135/1713] Per Sample Total Time 0.01212 Per Sample Data Time 0.00017 Per Sample DNN Time 0.01196 Train Loss 0.0177
|
| 221 |
+
Epoch: [6][1235/1713] Per Sample Total Time 0.01210 Per Sample Data Time 0.00016 Per Sample DNN Time 0.01195 Train Loss 0.0177
|
| 222 |
+
Epoch: [6][1335/1713] Per Sample Total Time 0.01211 Per Sample Data Time 0.00015 Per Sample DNN Time 0.01196 Train Loss 0.0177
|
| 223 |
+
Epoch: [6][1435/1713] Per Sample Total Time 0.01212 Per Sample Data Time 0.00014 Per Sample DNN Time 0.01197 Train Loss 0.0177
|
| 224 |
+
Epoch: [6][1535/1713] Per Sample Total Time 0.01212 Per Sample Data Time 0.00014 Per Sample DNN Time 0.01198 Train Loss 0.0177
|
| 225 |
+
Epoch: [6][1635/1713] Per Sample Total Time 0.01210 Per Sample Data Time 0.00013 Per Sample DNN Time 0.01197 Train Loss 0.0176
|
| 226 |
+
start validation
|
| 227 |
+
mAP: 0.254820
|
| 228 |
+
AUC: 0.947966
|
| 229 |
+
Avg Precision: 0.058709
|
| 230 |
+
Avg Recall: 0.772584
|
| 231 |
+
d_prime: 2.298721
|
| 232 |
+
train_loss: 0.017632
|
| 233 |
+
valid_loss: 0.694592
|
| 234 |
+
validation finished
|
| 235 |
+
Epoch-6 lr: 5e-05
|
| 236 |
+
epoch 6 training time: 334.468
|
| 237 |
+
---------------
|
| 238 |
+
2025-08-27 03:13:34.200040
|
| 239 |
+
current #epochs=7, #steps=10278
|
| 240 |
+
Epoch: [7][22/1713] Per Sample Total Time 0.01837 Per Sample Data Time 0.00587 Per Sample DNN Time 0.01250 Train Loss 0.0174
|
| 241 |
+
Epoch: [7][122/1713] Per Sample Total Time 0.01342 Per Sample Data Time 0.00113 Per Sample DNN Time 0.01229 Train Loss 0.0168
|
| 242 |
+
Epoch: [7][222/1713] Per Sample Total Time 0.01290 Per Sample Data Time 0.00065 Per Sample DNN Time 0.01225 Train Loss 0.0167
|
| 243 |
+
Epoch: [7][322/1713] Per Sample Total Time 0.01259 Per Sample Data Time 0.00046 Per Sample DNN Time 0.01213 Train Loss 0.0167
|
| 244 |
+
Epoch: [7][422/1713] Per Sample Total Time 0.01244 Per Sample Data Time 0.00036 Per Sample DNN Time 0.01208 Train Loss 0.0167
|
| 245 |
+
Epoch: [7][522/1713] Per Sample Total Time 0.01226 Per Sample Data Time 0.00030 Per Sample DNN Time 0.01196 Train Loss 0.0168
|
| 246 |
+
Epoch: [7][622/1713] Per Sample Total Time 0.01228 Per Sample Data Time 0.00026 Per Sample DNN Time 0.01202 Train Loss 0.0167
|
| 247 |
+
Epoch: [7][722/1713] Per Sample Total Time 0.01201 Per Sample Data Time 0.00023 Per Sample DNN Time 0.01178 Train Loss 0.0167
|
| 248 |
+
Epoch: [7][822/1713] Per Sample Total Time 0.01205 Per Sample Data Time 0.00021 Per Sample DNN Time 0.01184 Train Loss 0.0168
|
| 249 |
+
Epoch: [7][922/1713] Per Sample Total Time 0.01210 Per Sample Data Time 0.00019 Per Sample DNN Time 0.01191 Train Loss 0.0168
|
| 250 |
+
Epoch: [7][1022/1713] Per Sample Total Time 0.01213 Per Sample Data Time 0.00017 Per Sample DNN Time 0.01195 Train Loss 0.0168
|
| 251 |
+
Epoch: [7][1122/1713] Per Sample Total Time 0.01215 Per Sample Data Time 0.00016 Per Sample DNN Time 0.01199 Train Loss 0.0168
|
| 252 |
+
Epoch: [7][1222/1713] Per Sample Total Time 0.01218 Per Sample Data Time 0.00015 Per Sample DNN Time 0.01202 Train Loss 0.0167
|
| 253 |
+
Epoch: [7][1322/1713] Per Sample Total Time 0.01219 Per Sample Data Time 0.00015 Per Sample DNN Time 0.01205 Train Loss 0.0167
|
| 254 |
+
Epoch: [7][1422/1713] Per Sample Total Time 0.01221 Per Sample Data Time 0.00014 Per Sample DNN Time 0.01207 Train Loss 0.0167
|
| 255 |
+
Epoch: [7][1522/1713] Per Sample Total Time 0.01222 Per Sample Data Time 0.00013 Per Sample DNN Time 0.01209 Train Loss 0.0167
|
| 256 |
+
Epoch: [7][1622/1713] Per Sample Total Time 0.01211 Per Sample Data Time 0.00013 Per Sample DNN Time 0.01199 Train Loss 0.0167
|
| 257 |
+
start validation
|
| 258 |
+
mAP: 0.274216
|
| 259 |
+
AUC: 0.951568
|
| 260 |
+
Avg Precision: 0.057496
|
| 261 |
+
Avg Recall: 0.806699
|
| 262 |
+
d_prime: 2.347953
|
| 263 |
+
train_loss: 0.016700
|
| 264 |
+
valid_loss: 0.694371
|
| 265 |
+
validation finished
|
| 266 |
+
Epoch-7 lr: 5e-05
|
| 267 |
+
epoch 7 training time: 338.084
|
| 268 |
+
---------------
|
| 269 |
+
2025-08-27 03:19:12.284334
|
| 270 |
+
current #epochs=8, #steps=11991
|
| 271 |
+
Epoch: [8][9/1713] Per Sample Total Time 0.02563 Per Sample Data Time 0.01307 Per Sample DNN Time 0.01255 Train Loss 0.0155
|
| 272 |
+
Epoch: [8][109/1713] Per Sample Total Time 0.01330 Per Sample Data Time 0.00123 Per Sample DNN Time 0.01207 Train Loss 0.0156
|
| 273 |
+
Epoch: [8][209/1713] Per Sample Total Time 0.01231 Per Sample Data Time 0.00066 Per Sample DNN Time 0.01165 Train Loss 0.0157
|
| 274 |
+
Epoch: [8][309/1713] Per Sample Total Time 0.01212 Per Sample Data Time 0.00047 Per Sample DNN Time 0.01166 Train Loss 0.0158
|
| 275 |
+
Epoch: [8][409/1713] Per Sample Total Time 0.01208 Per Sample Data Time 0.00036 Per Sample DNN Time 0.01172 Train Loss 0.0159
|
| 276 |
+
Epoch: [8][509/1713] Per Sample Total Time 0.01211 Per Sample Data Time 0.00030 Per Sample DNN Time 0.01181 Train Loss 0.0159
|
| 277 |
+
Epoch: [8][609/1713] Per Sample Total Time 0.01189 Per Sample Data Time 0.00026 Per Sample DNN Time 0.01163 Train Loss 0.0159
|
| 278 |
+
Epoch: [8][709/1713] Per Sample Total Time 0.01173 Per Sample Data Time 0.00023 Per Sample DNN Time 0.01150 Train Loss 0.0159
|
| 279 |
+
Epoch: [8][809/1713] Per Sample Total Time 0.01175 Per Sample Data Time 0.00020 Per Sample DNN Time 0.01154 Train Loss 0.0159
|
| 280 |
+
Epoch: [8][909/1713] Per Sample Total Time 0.01177 Per Sample Data Time 0.00019 Per Sample DNN Time 0.01158 Train Loss 0.0160
|
| 281 |
+
Epoch: [8][1009/1713] Per Sample Total Time 0.01182 Per Sample Data Time 0.00017 Per Sample DNN Time 0.01165 Train Loss 0.0159
|
| 282 |
+
Epoch: [8][1109/1713] Per Sample Total Time 0.01170 Per Sample Data Time 0.00016 Per Sample DNN Time 0.01154 Train Loss 0.0160
|
| 283 |
+
Epoch: [8][1209/1713] Per Sample Total Time 0.01175 Per Sample Data Time 0.00015 Per Sample DNN Time 0.01160 Train Loss 0.0160
|
| 284 |
+
Epoch: [8][1309/1713] Per Sample Total Time 0.01179 Per Sample Data Time 0.00014 Per Sample DNN Time 0.01164 Train Loss 0.0160
|
| 285 |
+
Epoch: [8][1409/1713] Per Sample Total Time 0.01180 Per Sample Data Time 0.00014 Per Sample DNN Time 0.01167 Train Loss 0.0160
|
| 286 |
+
Epoch: [8][1509/1713] Per Sample Total Time 0.01171 Per Sample Data Time 0.00013 Per Sample DNN Time 0.01159 Train Loss 0.0160
|
| 287 |
+
Epoch: [8][1609/1713] Per Sample Total Time 0.01174 Per Sample Data Time 0.00012 Per Sample DNN Time 0.01162 Train Loss 0.0160
|
| 288 |
+
Epoch: [8][1709/1713] Per Sample Total Time 0.01177 Per Sample Data Time 0.00012 Per Sample DNN Time 0.01165 Train Loss 0.0160
|
| 289 |
+
start validation
|
| 290 |
+
mAP: 0.289134
|
| 291 |
+
AUC: 0.953989
|
| 292 |
+
Avg Precision: 0.057271
|
| 293 |
+
Avg Recall: 0.822039
|
| 294 |
+
d_prime: 2.382702
|
| 295 |
+
train_loss: 0.015969
|
| 296 |
+
valid_loss: 0.694260
|
| 297 |
+
validation finished
|
| 298 |
+
Epoch-8 lr: 5e-05
|
| 299 |
+
epoch 8 training time: 326.545
|
| 300 |
+
---------------
|
| 301 |
+
2025-08-27 03:24:38.829389
|
| 302 |
+
current #epochs=9, #steps=13704
|
| 303 |
+
Epoch: [9][96/1713] Per Sample Total Time 0.01018 Per Sample Data Time 0.00133 Per Sample DNN Time 0.00884 Train Loss 0.0148
|
| 304 |
+
Epoch: [9][196/1713] Per Sample Total Time 0.01046 Per Sample Data Time 0.00067 Per Sample DNN Time 0.00978 Train Loss 0.0150
|
| 305 |
+
Epoch: [9][296/1713] Per Sample Total Time 0.01110 Per Sample Data Time 0.00046 Per Sample DNN Time 0.01064 Train Loss 0.0150
|
| 306 |
+
Epoch: [9][396/1713] Per Sample Total Time 0.01141 Per Sample Data Time 0.00036 Per Sample DNN Time 0.01106 Train Loss 0.0150
|
| 307 |
+
Epoch: [9][496/1713] Per Sample Total Time 0.01171 Per Sample Data Time 0.00029 Per Sample DNN Time 0.01142 Train Loss 0.0150
|
| 308 |
+
Epoch: [9][596/1713] Per Sample Total Time 0.01158 Per Sample Data Time 0.00025 Per Sample DNN Time 0.01132 Train Loss 0.0151
|
| 309 |
+
Epoch: [9][696/1713] Per Sample Total Time 0.01148 Per Sample Data Time 0.00022 Per Sample DNN Time 0.01126 Train Loss 0.0151
|
| 310 |
+
Epoch: [9][796/1713] Per Sample Total Time 0.01162 Per Sample Data Time 0.00020 Per Sample DNN Time 0.01142 Train Loss 0.0152
|
| 311 |
+
Epoch: [9][896/1713] Per Sample Total Time 0.01169 Per Sample Data Time 0.00018 Per Sample DNN Time 0.01151 Train Loss 0.0152
|
| 312 |
+
Epoch: [9][996/1713] Per Sample Total Time 0.01175 Per Sample Data Time 0.00017 Per Sample DNN Time 0.01159 Train Loss 0.0152
|
| 313 |
+
Epoch: [9][1096/1713] Per Sample Total Time 0.01171 Per Sample Data Time 0.00016 Per Sample DNN Time 0.01155 Train Loss 0.0152
|
| 314 |
+
Epoch: [9][1196/1713] Per Sample Total Time 0.01167 Per Sample Data Time 0.00015 Per Sample DNN Time 0.01153 Train Loss 0.0152
|
| 315 |
+
Epoch: [9][1296/1713] Per Sample Total Time 0.01176 Per Sample Data Time 0.00014 Per Sample DNN Time 0.01162 Train Loss 0.0152
|
| 316 |
+
Epoch: [9][1396/1713] Per Sample Total Time 0.01179 Per Sample Data Time 0.00013 Per Sample DNN Time 0.01166 Train Loss 0.0152
|
| 317 |
+
Epoch: [9][1496/1713] Per Sample Total Time 0.01179 Per Sample Data Time 0.00013 Per Sample DNN Time 0.01166 Train Loss 0.0153
|
| 318 |
+
Epoch: [9][1596/1713] Per Sample Total Time 0.01184 Per Sample Data Time 0.00012 Per Sample DNN Time 0.01172 Train Loss 0.0153
|
| 319 |
+
Epoch: [9][1696/1713] Per Sample Total Time 0.01188 Per Sample Data Time 0.00012 Per Sample DNN Time 0.01176 Train Loss 0.0153
|
| 320 |
+
start validation
|
| 321 |
+
mAP: 0.299794
|
| 322 |
+
AUC: 0.955606
|
| 323 |
+
Avg Precision: 0.054695
|
| 324 |
+
Avg Recall: 0.843050
|
| 325 |
+
d_prime: 2.406743
|
| 326 |
+
train_loss: 0.015270
|
| 327 |
+
valid_loss: 0.694250
|
| 328 |
+
validation finished
|
| 329 |
+
Epoch-9 lr: 5e-05
|
| 330 |
+
epoch 9 training time: 328.892
|
| 331 |
+
---------------
|
| 332 |
+
2025-08-27 03:30:07.721315
|
| 333 |
+
current #epochs=10, #steps=15417
|
| 334 |
+
Epoch: [10][83/1713] Per Sample Total Time 0.01054 Per Sample Data Time 0.00211 Per Sample DNN Time 0.00843 Train Loss 0.0144
|
| 335 |
+
Epoch: [10][183/1713] Per Sample Total Time 0.01190 Per Sample Data Time 0.00099 Per Sample DNN Time 0.01090 Train Loss 0.0145
|
| 336 |
+
Epoch: [10][283/1713] Per Sample Total Time 0.01228 Per Sample Data Time 0.00066 Per Sample DNN Time 0.01162 Train Loss 0.0145
|
| 337 |
+
Epoch: [10][383/1713] Per Sample Total Time 0.01242 Per Sample Data Time 0.00050 Per Sample DNN Time 0.01192 Train Loss 0.0146
|
| 338 |
+
Epoch: [10][483/1713] Per Sample Total Time 0.01206 Per Sample Data Time 0.00040 Per Sample DNN Time 0.01166 Train Loss 0.0146
|
| 339 |
+
Epoch: [10][583/1713] Per Sample Total Time 0.01188 Per Sample Data Time 0.00034 Per Sample DNN Time 0.01154 Train Loss 0.0146
|
| 340 |
+
Epoch: [10][683/1713] Per Sample Total Time 0.01195 Per Sample Data Time 0.00030 Per Sample DNN Time 0.01165 Train Loss 0.0146
|
| 341 |
+
Epoch: [10][783/1713] Per Sample Total Time 0.01200 Per Sample Data Time 0.00027 Per Sample DNN Time 0.01173 Train Loss 0.0146
|
| 342 |
+
Epoch: [10][883/1713] Per Sample Total Time 0.01204 Per Sample Data Time 0.00024 Per Sample DNN Time 0.01180 Train Loss 0.0146
|
| 343 |
+
Epoch: [10][983/1713] Per Sample Total Time 0.01197 Per Sample Data Time 0.00022 Per Sample DNN Time 0.01175 Train Loss 0.0146
|
| 344 |
+
Epoch: [10][1083/1713] Per Sample Total Time 0.01179 Per Sample Data Time 0.00021 Per Sample DNN Time 0.01158 Train Loss 0.0146
|
| 345 |
+
Epoch: [10][1183/1713] Per Sample Total Time 0.01184 Per Sample Data Time 0.00019 Per Sample DNN Time 0.01165 Train Loss 0.0147
|
| 346 |
+
Epoch: [10][1283/1713] Per Sample Total Time 0.01190 Per Sample Data Time 0.00018 Per Sample DNN Time 0.01172 Train Loss 0.0147
|
| 347 |
+
Epoch: [10][1383/1713] Per Sample Total Time 0.01194 Per Sample Data Time 0.00017 Per Sample DNN Time 0.01177 Train Loss 0.0147
|
| 348 |
+
Epoch: [10][1483/1713] Per Sample Total Time 0.01189 Per Sample Data Time 0.00016 Per Sample DNN Time 0.01173 Train Loss 0.0147
|
| 349 |
+
Epoch: [10][1583/1713] Per Sample Total Time 0.01176 Per Sample Data Time 0.00015 Per Sample DNN Time 0.01161 Train Loss 0.0147
|
| 350 |
+
Epoch: [10][1683/1713] Per Sample Total Time 0.01179 Per Sample Data Time 0.00015 Per Sample DNN Time 0.01164 Train Loss 0.0147
|
| 351 |
+
start validation
|
| 352 |
+
mAP: 0.301993
|
| 353 |
+
AUC: 0.955118
|
| 354 |
+
Avg Precision: 0.056732
|
| 355 |
+
Avg Recall: 0.836964
|
| 356 |
+
d_prime: 2.399414
|
| 357 |
+
train_loss: 0.014710
|
| 358 |
+
valid_loss: 0.694157
|
| 359 |
+
validation finished
|
| 360 |
+
Epoch-10 lr: 2.5e-05
|
| 361 |
+
epoch 10 training time: 328.114
|
| 362 |
+
---------------
|
| 363 |
+
2025-08-27 03:35:35.835882
|
| 364 |
+
current #epochs=11, #steps=17130
|
| 365 |
+
Epoch: [11][70/1713] Per Sample Total Time 0.01434 Per Sample Data Time 0.00201 Per Sample DNN Time 0.01234 Train Loss 0.0134
|
| 366 |
+
Epoch: [11][170/1713] Per Sample Total Time 0.01221 Per Sample Data Time 0.00086 Per Sample DNN Time 0.01135 Train Loss 0.0134
|
| 367 |
+
Epoch: [11][270/1713] Per Sample Total Time 0.01147 Per Sample Data Time 0.00055 Per Sample DNN Time 0.01092 Train Loss 0.0135
|
| 368 |
+
Epoch: [11][370/1713] Per Sample Total Time 0.01179 Per Sample Data Time 0.00042 Per Sample DNN Time 0.01138 Train Loss 0.0135
|
| 369 |
+
Epoch: [11][470/1713] Per Sample Total Time 0.01185 Per Sample Data Time 0.00034 Per Sample DNN Time 0.01151 Train Loss 0.0135
|
| 370 |
+
Epoch: [11][570/1713] Per Sample Total Time 0.01184 Per Sample Data Time 0.00029 Per Sample DNN Time 0.01156 Train Loss 0.0136
|
| 371 |
+
Epoch: [11][670/1713] Per Sample Total Time 0.01180 Per Sample Data Time 0.00025 Per Sample DNN Time 0.01155 Train Loss 0.0135
|
| 372 |
+
Epoch: [11][770/1713] Per Sample Total Time 0.01161 Per Sample Data Time 0.00022 Per Sample DNN Time 0.01138 Train Loss 0.0135
|
| 373 |
+
Epoch: [11][870/1713] Per Sample Total Time 0.01172 Per Sample Data Time 0.00020 Per Sample DNN Time 0.01152 Train Loss 0.0135
|
| 374 |
+
Epoch: [11][970/1713] Per Sample Total Time 0.01180 Per Sample Data Time 0.00019 Per Sample DNN Time 0.01161 Train Loss 0.0136
|
| 375 |
+
Epoch: [11][1070/1713] Per Sample Total Time 0.01184 Per Sample Data Time 0.00017 Per Sample DNN Time 0.01167 Train Loss 0.0136
|
| 376 |
+
Epoch: [11][1170/1713] Per Sample Total Time 0.01190 Per Sample Data Time 0.00016 Per Sample DNN Time 0.01173 Train Loss 0.0136
|
| 377 |
+
Epoch: [11][1270/1713] Per Sample Total Time 0.01193 Per Sample Data Time 0.00015 Per Sample DNN Time 0.01178 Train Loss 0.0136
|
| 378 |
+
Epoch: [11][1370/1713] Per Sample Total Time 0.01198 Per Sample Data Time 0.00014 Per Sample DNN Time 0.01183 Train Loss 0.0135
|
| 379 |
+
Epoch: [11][1470/1713] Per Sample Total Time 0.01200 Per Sample Data Time 0.00014 Per Sample DNN Time 0.01187 Train Loss 0.0135
|
| 380 |
+
Epoch: [11][1570/1713] Per Sample Total Time 0.01200 Per Sample Data Time 0.00013 Per Sample DNN Time 0.01187 Train Loss 0.0135
|
| 381 |
+
Epoch: [11][1670/1713] Per Sample Total Time 0.01201 Per Sample Data Time 0.00013 Per Sample DNN Time 0.01189 Train Loss 0.0135
|
| 382 |
+
start validation
|
| 383 |
+
mAP: 0.321699
|
| 384 |
+
AUC: 0.957357
|
| 385 |
+
Avg Precision: 0.058758
|
| 386 |
+
Avg Recall: 0.849520
|
| 387 |
+
d_prime: 2.433592
|
| 388 |
+
train_loss: 0.013501
|
| 389 |
+
valid_loss: 0.694092
|
| 390 |
+
validation finished
|
| 391 |
+
Epoch-11 lr: 2.5e-05
|
| 392 |
+
epoch 11 training time: 334.541
|
| 393 |
+
---------------
|
| 394 |
+
2025-08-27 03:41:10.375904
|
| 395 |
+
current #epochs=12, #steps=18843
|
| 396 |
+
Epoch: [12][57/1713] Per Sample Total Time 0.01455 Per Sample Data Time 0.00255 Per Sample DNN Time 0.01201 Train Loss 0.0128
|
| 397 |
+
Epoch: [12][157/1713] Per Sample Total Time 0.01312 Per Sample Data Time 0.00097 Per Sample DNN Time 0.01215 Train Loss 0.0127
|
| 398 |
+
Epoch: [12][257/1713] Per Sample Total Time 0.01283 Per Sample Data Time 0.00061 Per Sample DNN Time 0.01222 Train Loss 0.0126
|
| 399 |
+
Epoch: [12][357/1713] Per Sample Total Time 0.01268 Per Sample Data Time 0.00045 Per Sample DNN Time 0.01223 Train Loss 0.0126
|
| 400 |
+
Epoch: [12][457/1713] Per Sample Total Time 0.01254 Per Sample Data Time 0.00036 Per Sample DNN Time 0.01218 Train Loss 0.0126
|
| 401 |
+
Epoch: [12][557/1713] Per Sample Total Time 0.01240 Per Sample Data Time 0.00031 Per Sample DNN Time 0.01209 Train Loss 0.0127
|
| 402 |
+
Epoch: [12][657/1713] Per Sample Total Time 0.01228 Per Sample Data Time 0.00027 Per Sample DNN Time 0.01201 Train Loss 0.0127
|
| 403 |
+
Epoch: [12][757/1713] Per Sample Total Time 0.01226 Per Sample Data Time 0.00024 Per Sample DNN Time 0.01203 Train Loss 0.0127
|
| 404 |
+
Epoch: [12][857/1713] Per Sample Total Time 0.01225 Per Sample Data Time 0.00021 Per Sample DNN Time 0.01204 Train Loss 0.0127
|
| 405 |
+
Epoch: [12][957/1713] Per Sample Total Time 0.01229 Per Sample Data Time 0.00020 Per Sample DNN Time 0.01209 Train Loss 0.0127
|
| 406 |
+
Epoch: [12][1057/1713] Per Sample Total Time 0.01232 Per Sample Data Time 0.00018 Per Sample DNN Time 0.01214 Train Loss 0.0127
|
| 407 |
+
Epoch: [12][1157/1713] Per Sample Total Time 0.01231 Per Sample Data Time 0.00017 Per Sample DNN Time 0.01214 Train Loss 0.0127
|
| 408 |
+
Epoch: [12][1257/1713] Per Sample Total Time 0.01232 Per Sample Data Time 0.00016 Per Sample DNN Time 0.01216 Train Loss 0.0128
|
| 409 |
+
Epoch: [12][1357/1713] Per Sample Total Time 0.01234 Per Sample Data Time 0.00015 Per Sample DNN Time 0.01219 Train Loss 0.0128
|
| 410 |
+
Epoch: [12][1457/1713] Per Sample Total Time 0.01235 Per Sample Data Time 0.00014 Per Sample DNN Time 0.01221 Train Loss 0.0127
|
| 411 |
+
Epoch: [12][1557/1713] Per Sample Total Time 0.01236 Per Sample Data Time 0.00014 Per Sample DNN Time 0.01222 Train Loss 0.0128
|
| 412 |
+
Epoch: [12][1657/1713] Per Sample Total Time 0.01237 Per Sample Data Time 0.00013 Per Sample DNN Time 0.01223 Train Loss 0.0128
|
| 413 |
+
start validation
|
| 414 |
+
mAP: 0.323794
|
| 415 |
+
AUC: 0.956825
|
| 416 |
+
Avg Precision: 0.056271
|
| 417 |
+
Avg Recall: 0.847217
|
| 418 |
+
d_prime: 2.425336
|
| 419 |
+
train_loss: 0.012771
|
| 420 |
+
valid_loss: 0.694024
|
| 421 |
+
validation finished
|
| 422 |
+
Epoch-12 lr: 2.5e-05
|
| 423 |
+
epoch 12 training time: 341.729
|
| 424 |
+
---------------
|
| 425 |
+
2025-08-27 03:46:52.104774
|
| 426 |
+
current #epochs=13, #steps=20556
|
| 427 |
+
Epoch: [13][44/1713] Per Sample Total Time 0.01595 Per Sample Data Time 0.00302 Per Sample DNN Time 0.01293 Train Loss 0.0122
|
| 428 |
+
Epoch: [13][144/1713] Per Sample Total Time 0.01372 Per Sample Data Time 0.00097 Per Sample DNN Time 0.01275 Train Loss 0.0125
|
| 429 |
+
Epoch: [13][244/1713] Per Sample Total Time 0.01330 Per Sample Data Time 0.00060 Per Sample DNN Time 0.01270 Train Loss 0.0125
|
| 430 |
+
Epoch: [13][344/1713] Per Sample Total Time 0.01301 Per Sample Data Time 0.00044 Per Sample DNN Time 0.01257 Train Loss 0.0124
|
| 431 |
+
Epoch: [13][444/1713] Per Sample Total Time 0.01288 Per Sample Data Time 0.00035 Per Sample DNN Time 0.01254 Train Loss 0.0124
|
| 432 |
+
Epoch: [13][544/1713] Per Sample Total Time 0.01282 Per Sample Data Time 0.00029 Per Sample DNN Time 0.01253 Train Loss 0.0123
|
| 433 |
+
Epoch: [13][644/1713] Per Sample Total Time 0.01277 Per Sample Data Time 0.00025 Per Sample DNN Time 0.01252 Train Loss 0.0123
|
| 434 |
+
Epoch: [13][744/1713] Per Sample Total Time 0.01275 Per Sample Data Time 0.00023 Per Sample DNN Time 0.01252 Train Loss 0.0123
|
| 435 |
+
Epoch: [13][844/1713] Per Sample Total Time 0.01276 Per Sample Data Time 0.00020 Per Sample DNN Time 0.01256 Train Loss 0.0124
|
| 436 |
+
Epoch: [13][944/1713] Per Sample Total Time 0.01273 Per Sample Data Time 0.00019 Per Sample DNN Time 0.01254 Train Loss 0.0124
|
| 437 |
+
Epoch: [13][1044/1713] Per Sample Total Time 0.01272 Per Sample Data Time 0.00017 Per Sample DNN Time 0.01255 Train Loss 0.0124
|
| 438 |
+
Epoch: [13][1144/1713] Per Sample Total Time 0.01273 Per Sample Data Time 0.00016 Per Sample DNN Time 0.01257 Train Loss 0.0124
|
| 439 |
+
Epoch: [13][1244/1713] Per Sample Total Time 0.01280 Per Sample Data Time 0.00015 Per Sample DNN Time 0.01265 Train Loss 0.0124
|
| 440 |
+
Epoch: [13][1344/1713] Per Sample Total Time 0.01280 Per Sample Data Time 0.00015 Per Sample DNN Time 0.01265 Train Loss 0.0124
|
| 441 |
+
Epoch: [13][1444/1713] Per Sample Total Time 0.01279 Per Sample Data Time 0.00014 Per Sample DNN Time 0.01265 Train Loss 0.0124
|
| 442 |
+
Epoch: [13][1544/1713] Per Sample Total Time 0.01276 Per Sample Data Time 0.00013 Per Sample DNN Time 0.01263 Train Loss 0.0124
|
| 443 |
+
Epoch: [13][1644/1713] Per Sample Total Time 0.01275 Per Sample Data Time 0.00013 Per Sample DNN Time 0.01262 Train Loss 0.0124
|
| 444 |
+
start validation
|
| 445 |
+
mAP: 0.323879
|
| 446 |
+
AUC: 0.956006
|
| 447 |
+
Avg Precision: 0.057743
|
| 448 |
+
Avg Recall: 0.841438
|
| 449 |
+
d_prime: 2.412806
|
| 450 |
+
train_loss: 0.012389
|
| 451 |
+
valid_loss: 0.693967
|
| 452 |
+
validation finished
|
| 453 |
+
Epoch-13 lr: 2.5e-05
|
| 454 |
+
epoch 13 training time: 348.570
|
| 455 |
+
---------------
|
| 456 |
+
2025-08-27 03:52:40.675110
|
| 457 |
+
current #epochs=14, #steps=22269
|
| 458 |
+
Epoch: [14][31/1713] Per Sample Total Time 0.01726 Per Sample Data Time 0.00422 Per Sample DNN Time 0.01303 Train Loss 0.0124
|
| 459 |
+
Epoch: [14][131/1713] Per Sample Total Time 0.01382 Per Sample Data Time 0.00106 Per Sample DNN Time 0.01276 Train Loss 0.0120
|
| 460 |
+
Epoch: [14][231/1713] Per Sample Total Time 0.01358 Per Sample Data Time 0.00062 Per Sample DNN Time 0.01296 Train Loss 0.0119
|
| 461 |
+
Epoch: [14][331/1713] Per Sample Total Time 0.01321 Per Sample Data Time 0.00045 Per Sample DNN Time 0.01276 Train Loss 0.0119
|
| 462 |
+
Epoch: [14][431/1713] Per Sample Total Time 0.01297 Per Sample Data Time 0.00036 Per Sample DNN Time 0.01261 Train Loss 0.0119
|
| 463 |
+
Epoch: [14][531/1713] Per Sample Total Time 0.01283 Per Sample Data Time 0.00030 Per Sample DNN Time 0.01253 Train Loss 0.0119
|
| 464 |
+
Epoch: [14][631/1713] Per Sample Total Time 0.01281 Per Sample Data Time 0.00026 Per Sample DNN Time 0.01255 Train Loss 0.0119
|
| 465 |
+
Epoch: [14][731/1713] Per Sample Total Time 0.01279 Per Sample Data Time 0.00023 Per Sample DNN Time 0.01257 Train Loss 0.0119
|
| 466 |
+
Epoch: [14][831/1713] Per Sample Total Time 0.01267 Per Sample Data Time 0.00021 Per Sample DNN Time 0.01247 Train Loss 0.0119
|
| 467 |
+
Epoch: [14][931/1713] Per Sample Total Time 0.01262 Per Sample Data Time 0.00019 Per Sample DNN Time 0.01244 Train Loss 0.0119
|
| 468 |
+
Epoch: [14][1031/1713] Per Sample Total Time 0.01256 Per Sample Data Time 0.00017 Per Sample DNN Time 0.01239 Train Loss 0.0119
|
| 469 |
+
Epoch: [14][1131/1713] Per Sample Total Time 0.01251 Per Sample Data Time 0.00016 Per Sample DNN Time 0.01235 Train Loss 0.0119
|
| 470 |
+
Epoch: [14][1231/1713] Per Sample Total Time 0.01249 Per Sample Data Time 0.00015 Per Sample DNN Time 0.01234 Train Loss 0.0119
|
| 471 |
+
Epoch: [14][1331/1713] Per Sample Total Time 0.01246 Per Sample Data Time 0.00015 Per Sample DNN Time 0.01231 Train Loss 0.0119
|
| 472 |
+
Epoch: [14][1431/1713] Per Sample Total Time 0.01243 Per Sample Data Time 0.00014 Per Sample DNN Time 0.01229 Train Loss 0.0119
|
| 473 |
+
Epoch: [14][1531/1713] Per Sample Total Time 0.01238 Per Sample Data Time 0.00013 Per Sample DNN Time 0.01225 Train Loss 0.0119
|
| 474 |
+
Epoch: [14][1631/1713] Per Sample Total Time 0.01237 Per Sample Data Time 0.00013 Per Sample DNN Time 0.01225 Train Loss 0.0119
|
| 475 |
+
start validation
|
| 476 |
+
mAP: 0.320660
|
| 477 |
+
AUC: 0.954093
|
| 478 |
+
Avg Precision: 0.058404
|
| 479 |
+
Avg Recall: 0.837463
|
| 480 |
+
d_prime: 2.384228
|
| 481 |
+
train_loss: 0.011937
|
| 482 |
+
valid_loss: 0.693958
|
| 483 |
+
validation finished
|
| 484 |
+
Epoch-14 lr: 2.5e-05
|
| 485 |
+
epoch 14 training time: 341.352
|
| 486 |
+
---------------
|
| 487 |
+
2025-08-27 03:58:22.027047
|
| 488 |
+
current #epochs=15, #steps=23982
|
| 489 |
+
Epoch: [15][18/1713] Per Sample Total Time 0.01914 Per Sample Data Time 0.00752 Per Sample DNN Time 0.01162 Train Loss 0.0115
|
| 490 |
+
Epoch: [15][118/1713] Per Sample Total Time 0.01358 Per Sample Data Time 0.00124 Per Sample DNN Time 0.01234 Train Loss 0.0115
|
| 491 |
+
Epoch: [15][218/1713] Per Sample Total Time 0.01286 Per Sample Data Time 0.00070 Per Sample DNN Time 0.01217 Train Loss 0.0117
|
| 492 |
+
Epoch: [15][318/1713] Per Sample Total Time 0.01255 Per Sample Data Time 0.00049 Per Sample DNN Time 0.01206 Train Loss 0.0117
|
| 493 |
+
Epoch: [15][418/1713] Per Sample Total Time 0.01241 Per Sample Data Time 0.00038 Per Sample DNN Time 0.01202 Train Loss 0.0117
|
| 494 |
+
Epoch: [15][518/1713] Per Sample Total Time 0.01239 Per Sample Data Time 0.00032 Per Sample DNN Time 0.01207 Train Loss 0.0116
|
| 495 |
+
Epoch: [15][618/1713] Per Sample Total Time 0.01241 Per Sample Data Time 0.00027 Per Sample DNN Time 0.01214 Train Loss 0.0117
|
| 496 |
+
Epoch: [15][718/1713] Per Sample Total Time 0.01231 Per Sample Data Time 0.00024 Per Sample DNN Time 0.01207 Train Loss 0.0117
|
| 497 |
+
Epoch: [15][818/1713] Per Sample Total Time 0.01219 Per Sample Data Time 0.00022 Per Sample DNN Time 0.01197 Train Loss 0.0116
|
| 498 |
+
Epoch: [15][918/1713] Per Sample Total Time 0.01203 Per Sample Data Time 0.00020 Per Sample DNN Time 0.01183 Train Loss 0.0116
|
| 499 |
+
Epoch: [15][1018/1713] Per Sample Total Time 0.01198 Per Sample Data Time 0.00018 Per Sample DNN Time 0.01180 Train Loss 0.0116
|
| 500 |
+
Epoch: [15][1118/1713] Per Sample Total Time 0.01200 Per Sample Data Time 0.00017 Per Sample DNN Time 0.01183 Train Loss 0.0116
|
| 501 |
+
Epoch: [15][1218/1713] Per Sample Total Time 0.01193 Per Sample Data Time 0.00016 Per Sample DNN Time 0.01178 Train Loss 0.0116
|
| 502 |
+
Epoch: [15][1318/1713] Per Sample Total Time 0.01188 Per Sample Data Time 0.00015 Per Sample DNN Time 0.01173 Train Loss 0.0116
|
| 503 |
+
Epoch: [15][1418/1713] Per Sample Total Time 0.01186 Per Sample Data Time 0.00014 Per Sample DNN Time 0.01172 Train Loss 0.0116
|
| 504 |
+
Epoch: [15][1518/1713] Per Sample Total Time 0.01187 Per Sample Data Time 0.00014 Per Sample DNN Time 0.01173 Train Loss 0.0116
|
| 505 |
+
Epoch: [15][1618/1713] Per Sample Total Time 0.01187 Per Sample Data Time 0.00013 Per Sample DNN Time 0.01174 Train Loss 0.0116
|
| 506 |
+
start validation
|
| 507 |
+
mAP: 0.324188
|
| 508 |
+
AUC: 0.955051
|
| 509 |
+
Avg Precision: 0.056682
|
| 510 |
+
Avg Recall: 0.845943
|
| 511 |
+
d_prime: 2.398415
|
| 512 |
+
train_loss: 0.011597
|
| 513 |
+
valid_loss: 0.693899
|
| 514 |
+
validation finished
|
| 515 |
+
Epoch-15 lr: 1.25e-05
|
| 516 |
+
epoch 15 training time: 331.892
|
| 517 |
+
---------------
|
| 518 |
+
2025-08-27 04:03:53.919434
|
| 519 |
+
current #epochs=16, #steps=25695
|
| 520 |
+
Epoch: [16][5/1713] Per Sample Total Time 0.04252 Per Sample Data Time 0.02888 Per Sample DNN Time 0.01365 Train Loss 0.0112
|
| 521 |
+
Epoch: [16][105/1713] Per Sample Total Time 0.01470 Per Sample Data Time 0.00168 Per Sample DNN Time 0.01302 Train Loss 0.0111
|
| 522 |
+
Epoch: [16][205/1713] Per Sample Total Time 0.01382 Per Sample Data Time 0.00089 Per Sample DNN Time 0.01293 Train Loss 0.0109
|
| 523 |
+
Epoch: [16][305/1713] Per Sample Total Time 0.01352 Per Sample Data Time 0.00062 Per Sample DNN Time 0.01291 Train Loss 0.0109
|
| 524 |
+
Epoch: [16][405/1713] Per Sample Total Time 0.01325 Per Sample Data Time 0.00048 Per Sample DNN Time 0.01278 Train Loss 0.0109
|
| 525 |
+
Epoch: [16][505/1713] Per Sample Total Time 0.01320 Per Sample Data Time 0.00039 Per Sample DNN Time 0.01281 Train Loss 0.0109
|
| 526 |
+
Epoch: [16][605/1713] Per Sample Total Time 0.01319 Per Sample Data Time 0.00034 Per Sample DNN Time 0.01286 Train Loss 0.0109
|
| 527 |
+
Epoch: [16][705/1713] Per Sample Total Time 0.01316 Per Sample Data Time 0.00030 Per Sample DNN Time 0.01287 Train Loss 0.0109
|
| 528 |
+
Epoch: [16][805/1713] Per Sample Total Time 0.01307 Per Sample Data Time 0.00026 Per Sample DNN Time 0.01280 Train Loss 0.0109
|
| 529 |
+
Epoch: [16][905/1713] Per Sample Total Time 0.01300 Per Sample Data Time 0.00024 Per Sample DNN Time 0.01276 Train Loss 0.0109
|
| 530 |
+
Epoch: [16][1005/1713] Per Sample Total Time 0.01295 Per Sample Data Time 0.00022 Per Sample DNN Time 0.01273 Train Loss 0.0109
|
| 531 |
+
Epoch: [16][1105/1713] Per Sample Total Time 0.01290 Per Sample Data Time 0.00021 Per Sample DNN Time 0.01269 Train Loss 0.0109
|
| 532 |
+
Epoch: [16][1205/1713] Per Sample Total Time 0.01287 Per Sample Data Time 0.00019 Per Sample DNN Time 0.01268 Train Loss 0.0109
|
| 533 |
+
Epoch: [16][1305/1713] Per Sample Total Time 0.01283 Per Sample Data Time 0.00018 Per Sample DNN Time 0.01264 Train Loss 0.0109
|
| 534 |
+
Epoch: [16][1405/1713] Per Sample Total Time 0.01281 Per Sample Data Time 0.00017 Per Sample DNN Time 0.01264 Train Loss 0.0109
|
| 535 |
+
Epoch: [16][1505/1713] Per Sample Total Time 0.01278 Per Sample Data Time 0.00016 Per Sample DNN Time 0.01262 Train Loss 0.0109
|
| 536 |
+
Epoch: [16][1605/1713] Per Sample Total Time 0.01276 Per Sample Data Time 0.00016 Per Sample DNN Time 0.01261 Train Loss 0.0109
|
| 537 |
+
Epoch: [16][1705/1713] Per Sample Total Time 0.01273 Per Sample Data Time 0.00015 Per Sample DNN Time 0.01258 Train Loss 0.0109
|
| 538 |
+
start validation
|
| 539 |
+
mAP: 0.327409
|
| 540 |
+
AUC: 0.953805
|
| 541 |
+
Avg Precision: 0.058829
|
| 542 |
+
Avg Recall: 0.839503
|
| 543 |
+
d_prime: 2.380009
|
| 544 |
+
train_loss: 0.010876
|
| 545 |
+
valid_loss: 0.693826
|
| 546 |
+
validation finished
|
| 547 |
+
Epoch-16 lr: 1.25e-05
|
| 548 |
+
epoch 16 training time: 347.769
|
| 549 |
+
---------------
|
| 550 |
+
2025-08-27 04:09:41.688588
|
| 551 |
+
current #epochs=17, #steps=27408
|
| 552 |
+
Epoch: [17][92/1713] Per Sample Total Time 0.01398 Per Sample Data Time 0.00155 Per Sample DNN Time 0.01243 Train Loss 0.0107
|
| 553 |
+
Epoch: [17][192/1713] Per Sample Total Time 0.01296 Per Sample Data Time 0.00077 Per Sample DNN Time 0.01219 Train Loss 0.0107
|
| 554 |
+
Epoch: [17][292/1713] Per Sample Total Time 0.01261 Per Sample Data Time 0.00052 Per Sample DNN Time 0.01208 Train Loss 0.0105
|
| 555 |
+
Epoch: [17][392/1713] Per Sample Total Time 0.01233 Per Sample Data Time 0.00040 Per Sample DNN Time 0.01193 Train Loss 0.0105
|
| 556 |
+
Epoch: [17][492/1713] Per Sample Total Time 0.01210 Per Sample Data Time 0.00033 Per Sample DNN Time 0.01177 Train Loss 0.0106
|
| 557 |
+
Epoch: [17][592/1713] Per Sample Total Time 0.01185 Per Sample Data Time 0.00028 Per Sample DNN Time 0.01158 Train Loss 0.0106
|
| 558 |
+
Epoch: [17][692/1713] Per Sample Total Time 0.01191 Per Sample Data Time 0.00024 Per Sample DNN Time 0.01167 Train Loss 0.0106
|
| 559 |
+
Epoch: [17][792/1713] Per Sample Total Time 0.01196 Per Sample Data Time 0.00022 Per Sample DNN Time 0.01174 Train Loss 0.0106
|
| 560 |
+
Epoch: [17][892/1713] Per Sample Total Time 0.01201 Per Sample Data Time 0.00020 Per Sample DNN Time 0.01181 Train Loss 0.0106
|
| 561 |
+
Epoch: [17][992/1713] Per Sample Total Time 0.01206 Per Sample Data Time 0.00018 Per Sample DNN Time 0.01188 Train Loss 0.0106
|
| 562 |
+
Epoch: [17][1092/1713] Per Sample Total Time 0.01209 Per Sample Data Time 0.00017 Per Sample DNN Time 0.01192 Train Loss 0.0106
|
| 563 |
+
Epoch: [17][1192/1713] Per Sample Total Time 0.01212 Per Sample Data Time 0.00016 Per Sample DNN Time 0.01196 Train Loss 0.0106
|
| 564 |
+
Epoch: [17][1292/1713] Per Sample Total Time 0.01213 Per Sample Data Time 0.00015 Per Sample DNN Time 0.01197 Train Loss 0.0106
|
| 565 |
+
Epoch: [17][1392/1713] Per Sample Total Time 0.01214 Per Sample Data Time 0.00014 Per Sample DNN Time 0.01200 Train Loss 0.0106
|
| 566 |
+
Epoch: [17][1492/1713] Per Sample Total Time 0.01216 Per Sample Data Time 0.00014 Per Sample DNN Time 0.01202 Train Loss 0.0106
|
| 567 |
+
Epoch: [17][1592/1713] Per Sample Total Time 0.01215 Per Sample Data Time 0.00013 Per Sample DNN Time 0.01202 Train Loss 0.0106
|
| 568 |
+
Epoch: [17][1692/1713] Per Sample Total Time 0.01215 Per Sample Data Time 0.00013 Per Sample DNN Time 0.01203 Train Loss 0.0106
|
| 569 |
+
start validation
|
| 570 |
+
mAP: 0.326630
|
| 571 |
+
AUC: 0.953156
|
| 572 |
+
Avg Precision: 0.057865
|
| 573 |
+
Avg Recall: 0.837258
|
| 574 |
+
d_prime: 2.370586
|
| 575 |
+
train_loss: 0.010578
|
| 576 |
+
valid_loss: 0.693788
|
| 577 |
+
validation finished
|
| 578 |
+
Epoch-17 lr: 1.25e-05
|
| 579 |
+
epoch 17 training time: 336.202
|
| 580 |
+
---------------
|
| 581 |
+
2025-08-27 04:15:17.890290
|
| 582 |
+
current #epochs=18, #steps=29121
|
| 583 |
+
Epoch: [18][79/1713] Per Sample Total Time 0.01361 Per Sample Data Time 0.00190 Per Sample DNN Time 0.01172 Train Loss 0.0103
|
| 584 |
+
Epoch: [18][179/1713] Per Sample Total Time 0.01298 Per Sample Data Time 0.00087 Per Sample DNN Time 0.01211 Train Loss 0.0104
|
| 585 |
+
Epoch: [18][279/1713] Per Sample Total Time 0.01268 Per Sample Data Time 0.00058 Per Sample DNN Time 0.01211 Train Loss 0.0104
|
| 586 |
+
Epoch: [18][379/1713] Per Sample Total Time 0.01250 Per Sample Data Time 0.00044 Per Sample DNN Time 0.01207 Train Loss 0.0104
|
| 587 |
+
Epoch: [18][479/1713] Per Sample Total Time 0.01249 Per Sample Data Time 0.00035 Per Sample DNN Time 0.01213 Train Loss 0.0105
|
| 588 |
+
Epoch: [18][579/1713] Per Sample Total Time 0.01249 Per Sample Data Time 0.00030 Per Sample DNN Time 0.01219 Train Loss 0.0105
|
| 589 |
+
Epoch: [18][679/1713] Per Sample Total Time 0.01250 Per Sample Data Time 0.00026 Per Sample DNN Time 0.01223 Train Loss 0.0105
|
| 590 |
+
Epoch: [18][779/1713] Per Sample Total Time 0.01246 Per Sample Data Time 0.00023 Per Sample DNN Time 0.01222 Train Loss 0.0105
|
| 591 |
+
Epoch: [18][879/1713] Per Sample Total Time 0.01245 Per Sample Data Time 0.00021 Per Sample DNN Time 0.01224 Train Loss 0.0105
|
| 592 |
+
Epoch: [18][979/1713] Per Sample Total Time 0.01252 Per Sample Data Time 0.00020 Per Sample DNN Time 0.01233 Train Loss 0.0104
|
| 593 |
+
Epoch: [18][1079/1713] Per Sample Total Time 0.01258 Per Sample Data Time 0.00018 Per Sample DNN Time 0.01240 Train Loss 0.0104
|
| 594 |
+
Epoch: [18][1179/1713] Per Sample Total Time 0.01259 Per Sample Data Time 0.00017 Per Sample DNN Time 0.01242 Train Loss 0.0104
|
| 595 |
+
Epoch: [18][1279/1713] Per Sample Total Time 0.01258 Per Sample Data Time 0.00016 Per Sample DNN Time 0.01241 Train Loss 0.0104
|
| 596 |
+
Epoch: [18][1379/1713] Per Sample Total Time 0.01256 Per Sample Data Time 0.00015 Per Sample DNN Time 0.01240 Train Loss 0.0104
|
| 597 |
+
Epoch: [18][1479/1713] Per Sample Total Time 0.01254 Per Sample Data Time 0.00015 Per Sample DNN Time 0.01239 Train Loss 0.0104
|
| 598 |
+
Epoch: [18][1579/1713] Per Sample Total Time 0.01252 Per Sample Data Time 0.00014 Per Sample DNN Time 0.01238 Train Loss 0.0104
|
| 599 |
+
Epoch: [18][1679/1713] Per Sample Total Time 0.01249 Per Sample Data Time 0.00013 Per Sample DNN Time 0.01235 Train Loss 0.0104
|
| 600 |
+
start validation
|
| 601 |
+
mAP: 0.327243
|
| 602 |
+
AUC: 0.952374
|
| 603 |
+
Avg Precision: 0.057596
|
| 604 |
+
Avg Recall: 0.839843
|
| 605 |
+
d_prime: 2.359356
|
| 606 |
+
train_loss: 0.010416
|
| 607 |
+
valid_loss: 0.693832
|
| 608 |
+
validation finished
|
| 609 |
+
Epoch-18 lr: 1.25e-05
|
| 610 |
+
epoch 18 training time: 343.352
|
| 611 |
+
---------------
|
| 612 |
+
2025-08-27 04:21:01.242578
|
| 613 |
+
current #epochs=19, #steps=30834
|
| 614 |
+
Epoch: [19][66/1713] Per Sample Total Time 0.01454 Per Sample Data Time 0.00188 Per Sample DNN Time 0.01266 Train Loss 0.0100
|
| 615 |
+
Epoch: [19][166/1713] Per Sample Total Time 0.01352 Per Sample Data Time 0.00078 Per Sample DNN Time 0.01274 Train Loss 0.0101
|
| 616 |
+
Epoch: [19][266/1713] Per Sample Total Time 0.01298 Per Sample Data Time 0.00051 Per Sample DNN Time 0.01247 Train Loss 0.0100
|
| 617 |
+
Epoch: [19][366/1713] Per Sample Total Time 0.01277 Per Sample Data Time 0.00038 Per Sample DNN Time 0.01239 Train Loss 0.0100
|
| 618 |
+
Epoch: [19][466/1713] Per Sample Total Time 0.01253 Per Sample Data Time 0.00031 Per Sample DNN Time 0.01222 Train Loss 0.0100
|
| 619 |
+
Epoch: [19][566/1713] Per Sample Total Time 0.01230 Per Sample Data Time 0.00026 Per Sample DNN Time 0.01204 Train Loss 0.0100
|
| 620 |
+
Epoch: [19][666/1713] Per Sample Total Time 0.01228 Per Sample Data Time 0.00023 Per Sample DNN Time 0.01205 Train Loss 0.0101
|
| 621 |
+
Epoch: [19][766/1713] Per Sample Total Time 0.01226 Per Sample Data Time 0.00020 Per Sample DNN Time 0.01206 Train Loss 0.0101
|
| 622 |
+
Epoch: [19][866/1713] Per Sample Total Time 0.01222 Per Sample Data Time 0.00019 Per Sample DNN Time 0.01204 Train Loss 0.0101
|
| 623 |
+
Epoch: [19][966/1713] Per Sample Total Time 0.01221 Per Sample Data Time 0.00017 Per Sample DNN Time 0.01204 Train Loss 0.0101
|
| 624 |
+
Epoch: [19][1066/1713] Per Sample Total Time 0.01220 Per Sample Data Time 0.00016 Per Sample DNN Time 0.01204 Train Loss 0.0101
|
| 625 |
+
Epoch: [19][1166/1713] Per Sample Total Time 0.01216 Per Sample Data Time 0.00015 Per Sample DNN Time 0.01201 Train Loss 0.0101
|
| 626 |
+
Epoch: [19][1266/1713] Per Sample Total Time 0.01211 Per Sample Data Time 0.00014 Per Sample DNN Time 0.01197 Train Loss 0.0101
|
| 627 |
+
Epoch: [19][1366/1713] Per Sample Total Time 0.01202 Per Sample Data Time 0.00013 Per Sample DNN Time 0.01189 Train Loss 0.0101
|
| 628 |
+
Epoch: [19][1466/1713] Per Sample Total Time 0.01205 Per Sample Data Time 0.00013 Per Sample DNN Time 0.01192 Train Loss 0.0101
|
| 629 |
+
Epoch: [19][1566/1713] Per Sample Total Time 0.01208 Per Sample Data Time 0.00012 Per Sample DNN Time 0.01195 Train Loss 0.0101
|
| 630 |
+
Epoch: [19][1666/1713] Per Sample Total Time 0.01209 Per Sample Data Time 0.00012 Per Sample DNN Time 0.01198 Train Loss 0.0101
|
| 631 |
+
start validation
|
| 632 |
+
mAP: 0.325581
|
| 633 |
+
AUC: 0.951207
|
| 634 |
+
Avg Precision: 0.057401
|
| 635 |
+
Avg Recall: 0.836137
|
| 636 |
+
d_prime: 2.342888
|
| 637 |
+
train_loss: 0.010055
|
| 638 |
+
valid_loss: 0.693805
|
| 639 |
+
validation finished
|
| 640 |
+
Epoch-19 lr: 1.25e-05
|
| 641 |
+
epoch 19 training time: 334.613
|
| 642 |
+
---------------
|
| 643 |
+
2025-08-27 04:26:35.855950
|
| 644 |
+
current #epochs=20, #steps=32547
|
| 645 |
+
Epoch: [20][53/1713] Per Sample Total Time 0.01465 Per Sample Data Time 0.00259 Per Sample DNN Time 0.01206 Train Loss 0.0100
|
| 646 |
+
Epoch: [20][153/1713] Per Sample Total Time 0.01233 Per Sample Data Time 0.00094 Per Sample DNN Time 0.01139 Train Loss 0.0099
|
| 647 |
+
Epoch: [20][253/1713] Per Sample Total Time 0.01216 Per Sample Data Time 0.00059 Per Sample DNN Time 0.01158 Train Loss 0.0097
|
| 648 |
+
Epoch: [20][353/1713] Per Sample Total Time 0.01224 Per Sample Data Time 0.00043 Per Sample DNN Time 0.01181 Train Loss 0.0098
|
| 649 |
+
Epoch: [20][453/1713] Per Sample Total Time 0.01226 Per Sample Data Time 0.00035 Per Sample DNN Time 0.01192 Train Loss 0.0098
|
| 650 |
+
Epoch: [20][553/1713] Per Sample Total Time 0.01227 Per Sample Data Time 0.00029 Per Sample DNN Time 0.01198 Train Loss 0.0098
|
| 651 |
+
Epoch: [20][653/1713] Per Sample Total Time 0.01226 Per Sample Data Time 0.00026 Per Sample DNN Time 0.01201 Train Loss 0.0098
|
| 652 |
+
Epoch: [20][753/1713] Per Sample Total Time 0.01230 Per Sample Data Time 0.00023 Per Sample DNN Time 0.01207 Train Loss 0.0099
|
| 653 |
+
Epoch: [20][853/1713] Per Sample Total Time 0.01232 Per Sample Data Time 0.00021 Per Sample DNN Time 0.01212 Train Loss 0.0099
|
| 654 |
+
Epoch: [20][953/1713] Per Sample Total Time 0.01232 Per Sample Data Time 0.00019 Per Sample DNN Time 0.01213 Train Loss 0.0099
|
| 655 |
+
Epoch: [20][1053/1713] Per Sample Total Time 0.01229 Per Sample Data Time 0.00018 Per Sample DNN Time 0.01212 Train Loss 0.0100
|
| 656 |
+
Epoch: [20][1153/1713] Per Sample Total Time 0.01227 Per Sample Data Time 0.00016 Per Sample DNN Time 0.01211 Train Loss 0.0100
|
| 657 |
+
Epoch: [20][1253/1713] Per Sample Total Time 0.01230 Per Sample Data Time 0.00016 Per Sample DNN Time 0.01214 Train Loss 0.0100
|
| 658 |
+
Epoch: [20][1353/1713] Per Sample Total Time 0.01231 Per Sample Data Time 0.00015 Per Sample DNN Time 0.01217 Train Loss 0.0100
|
| 659 |
+
Epoch: [20][1453/1713] Per Sample Total Time 0.01232 Per Sample Data Time 0.00014 Per Sample DNN Time 0.01218 Train Loss 0.0100
|
| 660 |
+
Epoch: [20][1553/1713] Per Sample Total Time 0.01232 Per Sample Data Time 0.00013 Per Sample DNN Time 0.01219 Train Loss 0.0100
|
| 661 |
+
Epoch: [20][1653/1713] Per Sample Total Time 0.01232 Per Sample Data Time 0.00013 Per Sample DNN Time 0.01219 Train Loss 0.0100
|
| 662 |
+
start validation
|
| 663 |
+
mAP: 0.326032
|
| 664 |
+
AUC: 0.951268
|
| 665 |
+
Avg Precision: 0.059974
|
| 666 |
+
Avg Recall: 0.828896
|
| 667 |
+
d_prime: 2.343747
|
| 668 |
+
train_loss: 0.009952
|
| 669 |
+
valid_loss: 0.693693
|
| 670 |
+
validation finished
|
| 671 |
+
Epoch-20 lr: 6.25e-06
|
| 672 |
+
epoch 20 training time: 338.954
|
| 673 |
+
---------------
|
| 674 |
+
2025-08-27 04:32:14.809724
|
| 675 |
+
current #epochs=21, #steps=34260
|
| 676 |
+
Epoch: [21][40/1713] Per Sample Total Time 0.01581 Per Sample Data Time 0.00377 Per Sample DNN Time 0.01204 Train Loss 0.0094
|
| 677 |
+
Epoch: [21][140/1713] Per Sample Total Time 0.01329 Per Sample Data Time 0.00113 Per Sample DNN Time 0.01216 Train Loss 0.0096
|
| 678 |
+
Epoch: [21][240/1713] Per Sample Total Time 0.01279 Per Sample Data Time 0.00068 Per Sample DNN Time 0.01211 Train Loss 0.0098
|
| 679 |
+
Epoch: [21][340/1713] Per Sample Total Time 0.01265 Per Sample Data Time 0.00049 Per Sample DNN Time 0.01215 Train Loss 0.0098
|
| 680 |
+
Epoch: [21][440/1713] Per Sample Total Time 0.01259 Per Sample Data Time 0.00039 Per Sample DNN Time 0.01220 Train Loss 0.0098
|
| 681 |
+
Epoch: [21][540/1713] Per Sample Total Time 0.01253 Per Sample Data Time 0.00033 Per Sample DNN Time 0.01220 Train Loss 0.0097
|
| 682 |
+
Epoch: [21][640/1713] Per Sample Total Time 0.01244 Per Sample Data Time 0.00028 Per Sample DNN Time 0.01216 Train Loss 0.0097
|
| 683 |
+
Epoch: [21][740/1713] Per Sample Total Time 0.01241 Per Sample Data Time 0.00025 Per Sample DNN Time 0.01215 Train Loss 0.0096
|
| 684 |
+
Epoch: [21][840/1713] Per Sample Total Time 0.01237 Per Sample Data Time 0.00023 Per Sample DNN Time 0.01214 Train Loss 0.0096
|
| 685 |
+
Epoch: [21][940/1713] Per Sample Total Time 0.01236 Per Sample Data Time 0.00021 Per Sample DNN Time 0.01215 Train Loss 0.0096
|
| 686 |
+
Epoch: [21][1040/1713] Per Sample Total Time 0.01235 Per Sample Data Time 0.00019 Per Sample DNN Time 0.01216 Train Loss 0.0096
|
| 687 |
+
Epoch: [21][1140/1713] Per Sample Total Time 0.01236 Per Sample Data Time 0.00018 Per Sample DNN Time 0.01218 Train Loss 0.0096
|
| 688 |
+
Epoch: [21][1240/1713] Per Sample Total Time 0.01236 Per Sample Data Time 0.00017 Per Sample DNN Time 0.01219 Train Loss 0.0096
|
| 689 |
+
Epoch: [21][1340/1713] Per Sample Total Time 0.01240 Per Sample Data Time 0.00016 Per Sample DNN Time 0.01224 Train Loss 0.0096
|
| 690 |
+
Epoch: [21][1440/1713] Per Sample Total Time 0.01243 Per Sample Data Time 0.00015 Per Sample DNN Time 0.01227 Train Loss 0.0096
|
| 691 |
+
Epoch: [21][1540/1713] Per Sample Total Time 0.01244 Per Sample Data Time 0.00015 Per Sample DNN Time 0.01229 Train Loss 0.0096
|
| 692 |
+
Epoch: [21][1640/1713] Per Sample Total Time 0.01245 Per Sample Data Time 0.00014 Per Sample DNN Time 0.01231 Train Loss 0.0096
|
| 693 |
+
start validation
|
| 694 |
+
mAP: 0.324904
|
| 695 |
+
AUC: 0.950095
|
| 696 |
+
Avg Precision: 0.058785
|
| 697 |
+
Avg Recall: 0.826536
|
| 698 |
+
d_prime: 2.327477
|
| 699 |
+
train_loss: 0.009605
|
| 700 |
+
valid_loss: 0.693686
|
| 701 |
+
validation finished
|
| 702 |
+
Epoch-21 lr: 6.25e-06
|
| 703 |
+
epoch 21 training time: 340.738
|
| 704 |
+
---------------
|
| 705 |
+
2025-08-27 04:37:55.547797
|
| 706 |
+
current #epochs=22, #steps=35973
|
| 707 |
+
Epoch: [22][27/1713] Per Sample Total Time 0.01741 Per Sample Data Time 0.00499 Per Sample DNN Time 0.01242 Train Loss 0.0093
|
| 708 |
+
Epoch: [22][127/1713] Per Sample Total Time 0.01313 Per Sample Data Time 0.00113 Per Sample DNN Time 0.01200 Train Loss 0.0092
|
| 709 |
+
Epoch: [22][227/1713] Per Sample Total Time 0.01251 Per Sample Data Time 0.00065 Per Sample DNN Time 0.01186 Train Loss 0.0092
|
| 710 |
+
Epoch: [22][327/1713] Per Sample Total Time 0.01226 Per Sample Data Time 0.00047 Per Sample DNN Time 0.01180 Train Loss 0.0092
|
| 711 |
+
Epoch: [22][427/1713] Per Sample Total Time 0.01219 Per Sample Data Time 0.00037 Per Sample DNN Time 0.01182 Train Loss 0.0093
|
| 712 |
+
Epoch: [22][527/1713] Per Sample Total Time 0.01214 Per Sample Data Time 0.00031 Per Sample DNN Time 0.01183 Train Loss 0.0093
|
| 713 |
+
Epoch: [22][627/1713] Per Sample Total Time 0.01209 Per Sample Data Time 0.00026 Per Sample DNN Time 0.01182 Train Loss 0.0094
|
| 714 |
+
Epoch: [22][727/1713] Per Sample Total Time 0.01208 Per Sample Data Time 0.00023 Per Sample DNN Time 0.01185 Train Loss 0.0094
|
| 715 |
+
Epoch: [22][827/1713] Per Sample Total Time 0.01209 Per Sample Data Time 0.00021 Per Sample DNN Time 0.01188 Train Loss 0.0094
|
| 716 |
+
Epoch: [22][927/1713] Per Sample Total Time 0.01209 Per Sample Data Time 0.00019 Per Sample DNN Time 0.01190 Train Loss 0.0094
|
| 717 |
+
Epoch: [22][1027/1713] Per Sample Total Time 0.01212 Per Sample Data Time 0.00018 Per Sample DNN Time 0.01195 Train Loss 0.0094
|
| 718 |
+
Epoch: [22][1127/1713] Per Sample Total Time 0.01209 Per Sample Data Time 0.00017 Per Sample DNN Time 0.01193 Train Loss 0.0095
|
| 719 |
+
Epoch: [22][1227/1713] Per Sample Total Time 0.01212 Per Sample Data Time 0.00016 Per Sample DNN Time 0.01196 Train Loss 0.0095
|
| 720 |
+
Epoch: [22][1327/1713] Per Sample Total Time 0.01213 Per Sample Data Time 0.00015 Per Sample DNN Time 0.01199 Train Loss 0.0095
|
| 721 |
+
Epoch: [22][1427/1713] Per Sample Total Time 0.01217 Per Sample Data Time 0.00014 Per Sample DNN Time 0.01203 Train Loss 0.0095
|
| 722 |
+
Epoch: [22][1527/1713] Per Sample Total Time 0.01219 Per Sample Data Time 0.00014 Per Sample DNN Time 0.01206 Train Loss 0.0095
|
| 723 |
+
Epoch: [22][1627/1713] Per Sample Total Time 0.01221 Per Sample Data Time 0.00013 Per Sample DNN Time 0.01208 Train Loss 0.0095
|
| 724 |
+
start validation
|
| 725 |
+
mAP: 0.325570
|
| 726 |
+
AUC: 0.949932
|
| 727 |
+
Avg Precision: 0.057976
|
| 728 |
+
Avg Recall: 0.827917
|
| 729 |
+
d_prime: 2.325238
|
| 730 |
+
train_loss: 0.009480
|
| 731 |
+
valid_loss: 0.693679
|
| 732 |
+
validation finished
|
| 733 |
+
Epoch-22 lr: 6.25e-06
|
| 734 |
+
epoch 22 training time: 336.561
|
| 735 |
+
---------------
|
| 736 |
+
2025-08-27 04:43:32.108769
|
| 737 |
+
current #epochs=23, #steps=37686
|
| 738 |
+
Epoch: [23][14/1713] Per Sample Total Time 0.02238 Per Sample Data Time 0.00949 Per Sample DNN Time 0.01290 Train Loss 0.0097
|
| 739 |
+
Epoch: [23][114/1713] Per Sample Total Time 0.01363 Per Sample Data Time 0.00127 Per Sample DNN Time 0.01235 Train Loss 0.0094
|
| 740 |
+
Epoch: [23][214/1713] Per Sample Total Time 0.01300 Per Sample Data Time 0.00070 Per Sample DNN Time 0.01230 Train Loss 0.0094
|
| 741 |
+
Epoch: [23][314/1713] Per Sample Total Time 0.01281 Per Sample Data Time 0.00049 Per Sample DNN Time 0.01232 Train Loss 0.0094
|
| 742 |
+
Epoch: [23][414/1713] Per Sample Total Time 0.01265 Per Sample Data Time 0.00039 Per Sample DNN Time 0.01227 Train Loss 0.0094
|
| 743 |
+
Epoch: [23][514/1713] Per Sample Total Time 0.01262 Per Sample Data Time 0.00032 Per Sample DNN Time 0.01230 Train Loss 0.0094
|
| 744 |
+
Epoch: [23][614/1713] Per Sample Total Time 0.01255 Per Sample Data Time 0.00027 Per Sample DNN Time 0.01227 Train Loss 0.0094
|
| 745 |
+
Epoch: [23][714/1713] Per Sample Total Time 0.01251 Per Sample Data Time 0.00024 Per Sample DNN Time 0.01227 Train Loss 0.0094
|
| 746 |
+
Epoch: [23][814/1713] Per Sample Total Time 0.01242 Per Sample Data Time 0.00022 Per Sample DNN Time 0.01221 Train Loss 0.0094
|
| 747 |
+
Epoch: [23][914/1713] Per Sample Total Time 0.01240 Per Sample Data Time 0.00020 Per Sample DNN Time 0.01220 Train Loss 0.0093
|
| 748 |
+
Epoch: [23][1014/1713] Per Sample Total Time 0.01235 Per Sample Data Time 0.00018 Per Sample DNN Time 0.01217 Train Loss 0.0093
|
| 749 |
+
Epoch: [23][1114/1713] Per Sample Total Time 0.01225 Per Sample Data Time 0.00017 Per Sample DNN Time 0.01208 Train Loss 0.0094
|
| 750 |
+
Epoch: [23][1214/1713] Per Sample Total Time 0.01221 Per Sample Data Time 0.00016 Per Sample DNN Time 0.01205 Train Loss 0.0094
|
| 751 |
+
Epoch: [23][1314/1713] Per Sample Total Time 0.01217 Per Sample Data Time 0.00015 Per Sample DNN Time 0.01202 Train Loss 0.0094
|
| 752 |
+
Epoch: [23][1414/1713] Per Sample Total Time 0.01214 Per Sample Data Time 0.00014 Per Sample DNN Time 0.01200 Train Loss 0.0093
|
| 753 |
+
Epoch: [23][1514/1713] Per Sample Total Time 0.01214 Per Sample Data Time 0.00014 Per Sample DNN Time 0.01201 Train Loss 0.0093
|
| 754 |
+
Epoch: [23][1614/1713] Per Sample Total Time 0.01211 Per Sample Data Time 0.00013 Per Sample DNN Time 0.01198 Train Loss 0.0093
|
| 755 |
+
start validation
|
| 756 |
+
mAP: 0.324305
|
| 757 |
+
AUC: 0.949169
|
| 758 |
+
Avg Precision: 0.055806
|
| 759 |
+
Avg Recall: 0.830132
|
| 760 |
+
d_prime: 2.314852
|
| 761 |
+
train_loss: 0.009343
|
| 762 |
+
valid_loss: 0.693706
|
| 763 |
+
validation finished
|
| 764 |
+
Epoch-23 lr: 6.25e-06
|
| 765 |
+
epoch 23 training time: 334.655
|
| 766 |
+
---------------
|
| 767 |
+
2025-08-27 04:49:06.763788
|
| 768 |
+
current #epochs=24, #steps=39399
|
| 769 |
+
Epoch: [24][1/1713] Per Sample Total Time 0.08944 Per Sample Data Time 0.07234 Per Sample DNN Time 0.01709 Train Loss 0.0076
|
| 770 |
+
Epoch: [24][101/1713] Per Sample Total Time 0.01414 Per Sample Data Time 0.00146 Per Sample DNN Time 0.01268 Train Loss 0.0092
|
| 771 |
+
Epoch: [24][201/1713] Per Sample Total Time 0.01317 Per Sample Data Time 0.00076 Per Sample DNN Time 0.01241 Train Loss 0.0092
|
| 772 |
+
Epoch: [24][301/1713] Per Sample Total Time 0.01286 Per Sample Data Time 0.00052 Per Sample DNN Time 0.01233 Train Loss 0.0092
|
| 773 |
+
Epoch: [24][401/1713] Per Sample Total Time 0.01280 Per Sample Data Time 0.00041 Per Sample DNN Time 0.01239 Train Loss 0.0092
|
| 774 |
+
Epoch: [24][501/1713] Per Sample Total Time 0.01272 Per Sample Data Time 0.00033 Per Sample DNN Time 0.01238 Train Loss 0.0093
|
| 775 |
+
Epoch: [24][601/1713] Per Sample Total Time 0.01260 Per Sample Data Time 0.00029 Per Sample DNN Time 0.01232 Train Loss 0.0093
|
| 776 |
+
Epoch: [24][701/1713] Per Sample Total Time 0.01256 Per Sample Data Time 0.00025 Per Sample DNN Time 0.01230 Train Loss 0.0093
|
| 777 |
+
Epoch: [24][801/1713] Per Sample Total Time 0.01253 Per Sample Data Time 0.00023 Per Sample DNN Time 0.01231 Train Loss 0.0093
|
| 778 |
+
Epoch: [24][901/1713] Per Sample Total Time 0.01248 Per Sample Data Time 0.00021 Per Sample DNN Time 0.01228 Train Loss 0.0093
|
| 779 |
+
Epoch: [24][1001/1713] Per Sample Total Time 0.01248 Per Sample Data Time 0.00019 Per Sample DNN Time 0.01229 Train Loss 0.0093
|
| 780 |
+
Epoch: [24][1101/1713] Per Sample Total Time 0.01247 Per Sample Data Time 0.00018 Per Sample DNN Time 0.01230 Train Loss 0.0093
|
| 781 |
+
Epoch: [24][1201/1713] Per Sample Total Time 0.01243 Per Sample Data Time 0.00017 Per Sample DNN Time 0.01226 Train Loss 0.0094
|
| 782 |
+
Epoch: [24][1301/1713] Per Sample Total Time 0.01242 Per Sample Data Time 0.00016 Per Sample DNN Time 0.01227 Train Loss 0.0094
|
| 783 |
+
Epoch: [24][1401/1713] Per Sample Total Time 0.01244 Per Sample Data Time 0.00015 Per Sample DNN Time 0.01229 Train Loss 0.0094
|
| 784 |
+
Epoch: [24][1501/1713] Per Sample Total Time 0.01240 Per Sample Data Time 0.00014 Per Sample DNN Time 0.01226 Train Loss 0.0094
|
| 785 |
+
Epoch: [24][1601/1713] Per Sample Total Time 0.01240 Per Sample Data Time 0.00014 Per Sample DNN Time 0.01227 Train Loss 0.0094
|
| 786 |
+
Epoch: [24][1701/1713] Per Sample Total Time 0.01238 Per Sample Data Time 0.00013 Per Sample DNN Time 0.01225 Train Loss 0.0094
|
| 787 |
+
start validation
|
| 788 |
+
mAP: 0.325054
|
| 789 |
+
AUC: 0.948969
|
| 790 |
+
Avg Precision: 0.058949
|
| 791 |
+
Avg Recall: 0.826246
|
| 792 |
+
d_prime: 2.312149
|
| 793 |
+
train_loss: 0.009365
|
| 794 |
+
valid_loss: 0.693633
|
| 795 |
+
validation finished
|
| 796 |
+
Epoch-24 lr: 6.25e-06
|
| 797 |
+
epoch 24 training time: 343.091
|
| 798 |
+
---------------
|
| 799 |
+
2025-08-27 04:54:49.854832
|
| 800 |
+
current #epochs=25, #steps=41112
|
| 801 |
+
Epoch: [25][88/1713] Per Sample Total Time 0.01340 Per Sample Data Time 0.00154 Per Sample DNN Time 0.01186 Train Loss 0.0092
|
| 802 |
+
Epoch: [25][188/1713] Per Sample Total Time 0.01266 Per Sample Data Time 0.00075 Per Sample DNN Time 0.01191 Train Loss 0.0092
|
| 803 |
+
Epoch: [25][288/1713] Per Sample Total Time 0.01242 Per Sample Data Time 0.00050 Per Sample DNN Time 0.01192 Train Loss 0.0091
|
| 804 |
+
Epoch: [25][388/1713] Per Sample Total Time 0.01243 Per Sample Data Time 0.00039 Per Sample DNN Time 0.01205 Train Loss 0.0092
|
| 805 |
+
Epoch: [25][488/1713] Per Sample Total Time 0.01243 Per Sample Data Time 0.00032 Per Sample DNN Time 0.01212 Train Loss 0.0092
|
| 806 |
+
Epoch: [25][588/1713] Per Sample Total Time 0.01245 Per Sample Data Time 0.00027 Per Sample DNN Time 0.01218 Train Loss 0.0091
|
| 807 |
+
Epoch: [25][688/1713] Per Sample Total Time 0.01238 Per Sample Data Time 0.00024 Per Sample DNN Time 0.01215 Train Loss 0.0092
|
| 808 |
+
Epoch: [25][788/1713] Per Sample Total Time 0.01235 Per Sample Data Time 0.00021 Per Sample DNN Time 0.01214 Train Loss 0.0092
|
| 809 |
+
Epoch: [25][888/1713] Per Sample Total Time 0.01234 Per Sample Data Time 0.00019 Per Sample DNN Time 0.01215 Train Loss 0.0092
|
| 810 |
+
Epoch: [25][988/1713] Per Sample Total Time 0.01231 Per Sample Data Time 0.00018 Per Sample DNN Time 0.01213 Train Loss 0.0092
|
| 811 |
+
Epoch: [25][1088/1713] Per Sample Total Time 0.01225 Per Sample Data Time 0.00017 Per Sample DNN Time 0.01208 Train Loss 0.0092
|
| 812 |
+
Epoch: [25][1188/1713] Per Sample Total Time 0.01219 Per Sample Data Time 0.00016 Per Sample DNN Time 0.01204 Train Loss 0.0092
|
| 813 |
+
Epoch: [25][1288/1713] Per Sample Total Time 0.01221 Per Sample Data Time 0.00015 Per Sample DNN Time 0.01206 Train Loss 0.0092
|
| 814 |
+
Epoch: [25][1388/1713] Per Sample Total Time 0.01222 Per Sample Data Time 0.00014 Per Sample DNN Time 0.01208 Train Loss 0.0092
|
| 815 |
+
Epoch: [25][1488/1713] Per Sample Total Time 0.01221 Per Sample Data Time 0.00013 Per Sample DNN Time 0.01208 Train Loss 0.0092
|
| 816 |
+
Epoch: [25][1588/1713] Per Sample Total Time 0.01221 Per Sample Data Time 0.00013 Per Sample DNN Time 0.01208 Train Loss 0.0092
|
| 817 |
+
Epoch: [25][1688/1713] Per Sample Total Time 0.01217 Per Sample Data Time 0.00012 Per Sample DNN Time 0.01205 Train Loss 0.0092
|
| 818 |
+
start validation
|
| 819 |
+
mAP: 0.323983
|
| 820 |
+
AUC: 0.948025
|
| 821 |
+
Avg Precision: 0.060971
|
| 822 |
+
Avg Recall: 0.820553
|
| 823 |
+
d_prime: 2.299510
|
| 824 |
+
train_loss: 0.009207
|
| 825 |
+
valid_loss: 0.693624
|
| 826 |
+
validation finished
|
| 827 |
+
Epoch-25 lr: 3.125e-06
|
| 828 |
+
epoch 25 training time: 338.680
|
| 829 |
+
---------------Training Finished---------------
|
| 830 |
+
weighted averaged model results
|
| 831 |
+
mAP: 0.340667
|
| 832 |
+
AUC: 0.959997
|
| 833 |
+
Avg Precision: 0.058671
|
| 834 |
+
Avg Recall: 0.859400
|
| 835 |
+
d_prime: 2.475802
|
| 836 |
+
train_loss: 0.000000
|
| 837 |
+
valid_loss: 0.693624
|
ast_1_AS20k/ast_origin_implement/test-balanced-f10-t10-pTrue-b12-lr5e-5-decoupe/wa_result.csv
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
3.406672498832421514e-01
|
| 2 |
+
9.599974407170855928e-01
|
| 3 |
+
5.867078735816431967e-02
|
| 4 |
+
8.594002744825509632e-01
|
| 5 |
+
2.475801985654465742e+00
|
pre_4_AS2M/conv_clap_1_2025-09-30_06-58-32/pretraining_AS2M.sh
ADDED
|
@@ -0,0 +1,340 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# Launcher for EAT pre-training on AS2M via fairseq's hydra_train.py.
#
# Usage: pretraining_AS2M.sh [WORLD_SIZE]
#   WORLD_SIZE  value passed as distributed_training.distributed_world_size
#               (defaults to 2 when omitted).
#
# The experiment is chosen by editing the two variables below; the
# (train_mode, config_option) pair selects one branch of the if/elif chain
# further down, which overrides the shared defaults.

# config options
train_mode=conv_clap
config_option=1

# shared config
SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
# Each launch gets its own timestamped directory.
checkpoint_save_dir=${SAVE_DIR_ROOT}/${train_mode}_${config_option}_$(date +"%Y-%m-%d_%H-%M-%S")
# NOTE(review): because the directory name contains the launch time, this file
# cannot already exist when a run starts — confirm whether resuming is intended.
checkpoint_restore_file=${checkpoint_save_dir}/checkpoint_last.pt

# Absolute path and file name of this script (symlinks resolved).
script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
script_name="$(basename -- "$script_path")"
# Create the run directory and copy this script into it (preserving mode and
# timestamps) so the exact configuration is archived next to the checkpoints.
mkdir -p -- "$checkpoint_save_dir"
cp -p -- "$script_path" "$checkpoint_save_dir/$script_name"
echo "script_path: ${script_path}"
echo "checkpoint_save_dir: ${checkpoint_save_dir}"

# default setting (each branch below overrides only what it needs)
model_clone_batch=4
dataset_batch_size=48
model_clap_loss=0
model_clap_loss_type="mse" # option ce cosine l1
model_clap_loss_layer=0
average_top_k_layers=12
model_add_conv=false
model_depth=12
model_dispersive_loss=0
model_dispersive_loss_layer=0
checkpoint_keep_interval_updates=1 # TODO change this parameter if need
checkpoint_save_interval_updates=10000
model_modalities_image_conv_resolution="4, 8, 16"
model_modalities_image_conv_in_chans="1, 256, 384, 768"
# NOTE(review): task_data, task_load_clap_emb, task_load_source_file,
# task_load_mel_file and model_proj_type have no default here; several
# branches (clap 0-5, ast 0-3) leave the two task_load_*_file variables unset,
# so they are passed to Hydra as empty values — confirm that is intended.

if [[ $train_mode == "default" && ${config_option} -eq 0 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=0
    checkpoint_keep_interval_updates=-1
elif [[ $train_mode == "disp" && ${config_option} -eq 0 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=4
    dataset_batch_size=96
    model_dispersive_loss=1
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
elif [[ $train_mode == "disp" && ${config_option} -eq 1 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=1
    dataset_batch_size=384
    model_dispersive_loss=1
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
elif [[ $train_mode == "disp" && ${config_option} -eq 2 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=1
    dataset_batch_size=384
    model_dispersive_loss=10.0
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
elif [[ $train_mode == "disp" && ${config_option} -eq 3 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=1
    dataset_batch_size=384
    model_dispersive_loss=100.0
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
elif [[ $train_mode == "disp" && ${config_option} -eq 4 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=1
    dataset_batch_size=384
    model_dispersive_loss=10000.0
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
elif [[ $train_mode == "disp" && ${config_option} -eq 5 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=1
    dataset_batch_size=384
    model_dispersive_loss=1000.0
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
elif [[ $train_mode == "disp" && ${config_option} -eq 6 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=4
    dataset_batch_size=96
    model_dispersive_loss=1000.0
    model_dispersive_loss_layer=10
    checkpoint_keep_interval_updates=1
elif [[ $train_mode == "clap" && ${config_option} -eq 0 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=12
    model_add_conv=false
elif [[ $train_mode == "clap" && ${config_option} -eq 1 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=1
# loss type ablation
elif [[ $train_mode == "clap" && ${config_option} -eq 2 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="ce"
elif [[ $train_mode == "clap" && ${config_option} -eq 3 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="l1"
elif [[ $train_mode == "clap" && ${config_option} -eq 4 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="cosine"
# loss layer ablation
elif [[ $train_mode == "clap" && ${config_option} -eq 5 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="mse"
    model_clap_loss_layer=10
elif [[ $train_mode == "clap" && ${config_option} -eq 6 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="mse"
    model_clap_loss_layer=8
elif [[ $train_mode == "clap" && ${config_option} -eq 7 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="mse"
    model_clap_loss_layer=6
elif [[ $train_mode == "clap" && ${config_option} -eq 8 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    model_clap_loss=5.0
    dataset_batch_size=96
    average_top_k_layers=12
    model_clap_loss_type="mse"
    checkpoint_keep_interval_updates=-1
elif [[ $train_mode == "clap" && ${config_option} -eq 9 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    model_clap_loss=0.1
    dataset_batch_size=96
    average_top_k_layers=12
    model_clap_loss_type="mse"
    checkpoint_keep_interval_updates=-1
elif [[ $train_mode == "ast" && ${config_option} -eq 0 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
    task_load_clap_emb=true
    model_proj_type=4
    model_clone_batch=4
    model_clap_loss=1.0
    dataset_batch_size=48
elif [[ $train_mode == "ast" && ${config_option} -eq 1 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
    task_load_clap_emb=true
    model_proj_type=4
    model_clone_batch=4
    model_clap_loss=0.001
    dataset_batch_size=48
elif [[ $train_mode == "ast" && ${config_option} -eq 2 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
    task_load_clap_emb=true
    model_proj_type=4
    model_clone_batch=4
    model_clap_loss=0.01
    dataset_batch_size=48
elif [[ $train_mode == "ast" && ${config_option} -eq 3 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_out
    task_load_clap_emb=true
    model_proj_type=6
    model_clone_batch=4
    dataset_batch_size=48
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 0 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=64 # original 48 oom on 4090 24G change distributed_world_size
    model_clap_loss=1.0
    average_top_k_layers=11 # modify with model depth
    model_add_conv=true
    model_depth=11
    checkpoint_keep_interval_updates=-1 # default 1
    checkpoint_save_interval_updates=10000
# (A stray `fi` used to sit here; it terminated the chain early and made the
# following `elif` a bash syntax error, so conv_clap/1 could never be selected.)
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 1 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=48 # original 48 oom on 4090 24G change distributed_world_size
    model_clap_loss=1.0
    average_top_k_layers=12 # modify with model depth
    model_add_conv=true
    model_modalities_image_conv_resolution="16,"
    model_modalities_image_conv_in_chans="1, 768"
    model_depth=12
    checkpoint_keep_interval_updates=1 # default 1
    checkpoint_save_interval_updates=10000
else
    # Fail fast instead of launching training with unset task/model settings.
    echo "Unknown config: train_mode=${train_mode} config_option=${config_option}" >&2
    exit 1
fi

# Every override is double-quoted so the shell passes it as exactly one argv
# entry (values such as "1, 768" contain spaces and would otherwise be split).
#
# The two conv_* values are additionally wrapped in single quotes *inside* the
# override so Hydra reads them as literal strings; a bare comma-separated value
# under -m would be parsed as a sweep.
# NOTE(review): this assumes the model consumes these settings as strings
# (the trailing comma in "16," suggests a tuple literal) — confirm against the
# model code.
python fairseq_cli/hydra_train.py -m \
    --config-dir ./EAT/config \
    --config-name pretraining_AS2M \
    common.user_dir=./EAT \
    "checkpoint.save_dir=${checkpoint_save_dir}" \
    "checkpoint.restore_file=${checkpoint_restore_file}" \
    "distributed_training.distributed_world_size=${1:-2}" \
    dataset.num_workers=24 \
    dataset.data_buffer_size=48 \
    "dataset.batch_size=${dataset_batch_size}" \
    "task.data=${task_data}" \
    task.h5_format=False \
    "task.load_clap_emb=${task_load_clap_emb}" \
    "+task.load_source_file=${task_load_source_file}" \
    "+task.load_mel_file=${task_load_mel_file}" \
    "model.proj_type=${model_proj_type}" \
    "model.clone_batch=${model_clone_batch}" \
    "model.clap_loss=${model_clap_loss}" \
    "model.average_top_k_layers=${average_top_k_layers}" \
    "+model.add_conv=${model_add_conv}" \
    "+model.clap_loss_type=${model_clap_loss_type}" \
    "+model.clap_loss_layer=${model_clap_loss_layer}" \
    "+model.dispersive_loss=${model_dispersive_loss}" \
    "+model.dispersive_loss_layer=${model_dispersive_loss_layer}" \
    "model.depth=${model_depth}" \
    "+model.modalities.image.conv_resolution='${model_modalities_image_conv_resolution}'" \
    "+model.modalities.image.conv_in_chans='${model_modalities_image_conv_in_chans}'" \
    "checkpoint.keep_interval_updates=${checkpoint_keep_interval_updates}" \
    "checkpoint.save_interval_updates=${checkpoint_save_interval_updates}"
|
pre_4_AS2M/conv_clap_1_2025-09-30_06-59-40/pretraining_AS2M.sh
ADDED
|
@@ -0,0 +1,339 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
# config options
|
| 3 |
+
train_mode=conv_clap
|
| 4 |
+
config_option=1
|
| 5 |
+
# change world size
|
| 6 |
+
|
| 7 |
+
# shared config
|
| 8 |
+
SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
|
| 9 |
+
checkpoint_save_dir=${SAVE_DIR_ROOT}/${train_mode}_${config_option}_$(date +"%Y-%m-%d_%H-%M-%S")
|
| 10 |
+
checkpoint_restore_file=${checkpoint_save_dir}/checkpoint_last.pt
|
| 11 |
+
|
| 12 |
+
# 脚本自身的绝对路径与文件名(解析符号链接)
|
| 13 |
+
script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
|
| 14 |
+
script_name="$(basename -- "$script_path")"
|
| 15 |
+
# 创建目录并拷贝(保留权限与时间戳)
|
| 16 |
+
mkdir -p -- "$checkpoint_save_dir"
|
| 17 |
+
cp -p -- "$script_path" "$checkpoint_save_dir/$script_name"
|
| 18 |
+
echo "script_path: ${script_path}"
|
| 19 |
+
echo "checkpoint_save_dir: ${checkpoint_save_dir}"
|
| 20 |
+
|
| 21 |
+
# default setting
|
| 22 |
+
model_clone_batch=4
|
| 23 |
+
dataset_batch_size=48
|
| 24 |
+
model_clap_loss=0
|
| 25 |
+
model_clap_loss_type="mse" # option ce cosine l1
|
| 26 |
+
model_clap_loss_layer=0
|
| 27 |
+
average_top_k_layers=12
|
| 28 |
+
model_add_conv=false
|
| 29 |
+
model_depth=12
|
| 30 |
+
model_dispersive_loss=0
|
| 31 |
+
model_dispersive_loss_layer=0
|
| 32 |
+
checkpoint_keep_interval_updates=1 # TODO change this parameter if need
|
| 33 |
+
checkpoint_save_interval_updates=10000
|
| 34 |
+
model_modalities_image_conv_resolution="4, 8, 16"
|
| 35 |
+
model_modalities_image_conv_in_chans="1, 256, 384, 768"
|
| 36 |
+
|
| 37 |
+
# Select the experiment configuration from train_mode and config_option.
# Each branch echoes its identity and then overrides only the settings it needs.
# NOTE(review): the "clap" 0-5 and "ast" branches never assign
# task_load_source_file / task_load_mel_file; unless those are set elsewhere,
# the training command receives empty values for them - confirm this is intended.
if [[ $train_mode == "default" && ${config_option} -eq 0 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=0
    checkpoint_keep_interval_updates=-1
elif [[ $train_mode == "disp" && ${config_option} -eq 0 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=4
    dataset_batch_size=96
    model_dispersive_loss=1
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
elif [[ $train_mode == "disp" && ${config_option} -eq 1 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=1
    dataset_batch_size=384
    model_dispersive_loss=1
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
elif [[ $train_mode == "disp" && ${config_option} -eq 2 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=1
    dataset_batch_size=384
    model_dispersive_loss=10.0
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
elif [[ $train_mode == "disp" && ${config_option} -eq 3 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=1
    dataset_batch_size=384
    model_dispersive_loss=100.0
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
elif [[ $train_mode == "disp" && ${config_option} -eq 4 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=1
    dataset_batch_size=384
    model_dispersive_loss=10000.0
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
elif [[ $train_mode == "disp" && ${config_option} -eq 5 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=1
    dataset_batch_size=384
    model_dispersive_loss=1000.0
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
elif [[ $train_mode == "disp" && ${config_option} -eq 6 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=4
    dataset_batch_size=96
    model_dispersive_loss=1000.0
    model_dispersive_loss_layer=10
    checkpoint_keep_interval_updates=1
elif [[ $train_mode == "clap" && ${config_option} -eq 0 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=12
    model_add_conv=false
elif [[ $train_mode == "clap" && ${config_option} -eq 1 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=1
# loss type ablation
elif [[ $train_mode == "clap" && ${config_option} -eq 2 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="ce"
elif [[ $train_mode == "clap" && ${config_option} -eq 3 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="l1"
elif [[ $train_mode == "clap" && ${config_option} -eq 4 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="cosine"
# loss layer ablation
elif [[ $train_mode == "clap" && ${config_option} -eq 5 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="mse"
    model_clap_loss_layer=10
elif [[ $train_mode == "clap" && ${config_option} -eq 6 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="mse"
    model_clap_loss_layer=8
elif [[ $train_mode == "clap" && ${config_option} -eq 7 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="mse"
    model_clap_loss_layer=6
elif [[ $train_mode == "clap" && ${config_option} -eq 8 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    model_clap_loss=5.0
    dataset_batch_size=96
    average_top_k_layers=12
    model_clap_loss_type="mse"
    checkpoint_keep_interval_updates=-1
elif [[ $train_mode == "clap" && ${config_option} -eq 9 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    model_clap_loss=0.1
    dataset_batch_size=96
    average_top_k_layers=12
    model_clap_loss_type="mse"
    checkpoint_keep_interval_updates=-1
elif [[ $train_mode == "ast" && ${config_option} -eq 0 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
    task_load_clap_emb=true
    model_proj_type=4
    model_clone_batch=4
    model_clap_loss=1.0
    dataset_batch_size=48
elif [[ $train_mode == "ast" && ${config_option} -eq 1 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
    task_load_clap_emb=true
    model_proj_type=4
    model_clone_batch=4
    model_clap_loss=0.001
    dataset_batch_size=48
elif [[ $train_mode == "ast" && ${config_option} -eq 2 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
    task_load_clap_emb=true
    model_proj_type=4
    model_clone_batch=4
    model_clap_loss=0.01
    dataset_batch_size=48
elif [[ $train_mode == "ast" && ${config_option} -eq 3 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_out
    task_load_clap_emb=true
    model_proj_type=6
    model_clone_batch=4
    dataset_batch_size=48
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 0 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=64 # original 48 oom on 4090 24G change distributed_world_size
    model_clap_loss=1.0
    average_top_k_layers=11 # modify with model depth
    model_add_conv=true
    model_depth=11
    checkpoint_keep_interval_updates=-1 # default 1
    checkpoint_save_interval_updates=10000
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 1 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=48 # original 48 oom on 4090 24G change distributed_world_size
    model_clap_loss=1.0
    average_top_k_layers=12 # modify with model depth
    model_add_conv=true
    # Bracketed lists without spaces: the earlier "16," / "1, 768" forms were
    # not valid list values and the latter split into two arguments when
    # expanded unquoted in the training command.
    model_modalities_image_conv_resolution='[16]'
    model_modalities_image_conv_in_chans='[1,768]'
    model_depth=12
    checkpoint_keep_interval_updates=1 # default 1
    checkpoint_save_interval_updates=10000
else
    # No branch matched: stop instead of launching training with unset values.
    echo "Unknown configuration: train_mode='${train_mode}' config_option='${config_option}'" >&2
    exit 1
fi
|
| 310 |
+
|
| 311 |
+
# Launch pre-training through fairseq's Hydra entry point, passing the values
# chosen above as command-line overrides. Overrides written with a leading '+'
# use Hydra's syntax for adding a key to the config.
# The world size comes from the first positional argument and defaults to 2.
# Every expansion is quoted so that values containing spaces or glob characters
# (for example the bracketed list values) reach Hydra as exactly one argument.
python fairseq_cli/hydra_train.py -m \
    --config-dir ./EAT/config \
    --config-name pretraining_AS2M \
    common.user_dir=./EAT \
    "checkpoint.save_dir=${checkpoint_save_dir}" \
    "checkpoint.restore_file=${checkpoint_restore_file}" \
    "distributed_training.distributed_world_size=${1:-2}" \
    dataset.num_workers=24 \
    dataset.data_buffer_size=48 \
    "dataset.batch_size=${dataset_batch_size}" \
    "task.data=${task_data}" \
    task.h5_format=False \
    "task.load_clap_emb=${task_load_clap_emb}" \
    "+task.load_source_file=${task_load_source_file}" \
    "+task.load_mel_file=${task_load_mel_file}" \
    "model.proj_type=${model_proj_type}" \
    "model.clone_batch=${model_clone_batch}" \
    "model.clap_loss=${model_clap_loss}" \
    "model.average_top_k_layers=${average_top_k_layers}" \
    "+model.add_conv=${model_add_conv}" \
    "+model.clap_loss_type=${model_clap_loss_type}" \
    "+model.clap_loss_layer=${model_clap_loss_layer}" \
    "+model.dispersive_loss=${model_dispersive_loss}" \
    "+model.dispersive_loss_layer=${model_dispersive_loss_layer}" \
    "model.depth=${model_depth}" \
    "+model.modalities.image.conv_resolution=${model_modalities_image_conv_resolution}" \
    "+model.modalities.image.conv_in_chans=${model_modalities_image_conv_in_chans}" \
    "checkpoint.keep_interval_updates=${checkpoint_keep_interval_updates}" \
    "checkpoint.save_interval_updates=${checkpoint_save_interval_updates}"
|
pre_4_AS2M/conv_clap_1_2025-09-30_07-01-07/pretraining_AS2M.sh
ADDED
|
@@ -0,0 +1,339 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# Experiment selection: these two values pick the configuration branch below.
train_mode=conv_clap
config_option=1
# The world size is not set here; the training command takes it from "$1".

# Shared settings: each run gets its own timestamped checkpoint directory.
SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
checkpoint_save_dir="${SAVE_DIR_ROOT}/${train_mode}_${config_option}_$(date +'%Y-%m-%d_%H-%M-%S')"
checkpoint_restore_file="${checkpoint_save_dir}/checkpoint_last.pt"

# Absolute path of this script (symlinks resolved) and its file name.
script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
script_name="${script_path##*/}"

# Create the run directory and copy this script into it, preserving
# permissions and timestamps.
mkdir -p -- "${checkpoint_save_dir}"
cp -p -- "${script_path}" "${checkpoint_save_dir}/${script_name}"
printf 'script_path: %s\n' "${script_path}"
printf 'checkpoint_save_dir: %s\n' "${checkpoint_save_dir}"
|
| 20 |
+
|
| 21 |
+
# Baseline values; the configuration branch selected below overrides some of them.

# -- batching
dataset_batch_size=48
model_clone_batch=4

# -- model structure
model_depth=12
average_top_k_layers=12
model_add_conv=false
model_modalities_image_conv_resolution="[4,8,16]"
model_modalities_image_conv_in_chans="[1,256,384,768]"

# -- CLAP and dispersive loss settings
model_clap_loss=0
model_clap_loss_type=mse   # other values used by the branches: ce, cosine, l1
model_clap_loss_layer=0
model_dispersive_loss=0
model_dispersive_loss_layer=0

# -- checkpointing
checkpoint_save_interval_updates=10000
checkpoint_keep_interval_updates=1   # TODO: change this parameter if needed
|
| 36 |
+
|
| 37 |
+
# Select the experiment configuration from train_mode and config_option.
# Each branch echoes its identity and then overrides only the settings it needs.
# NOTE(review): the "clap" 0-5 and "ast" branches never assign
# task_load_source_file / task_load_mel_file; unless those are set elsewhere,
# the training command receives empty values for them - confirm this is intended.
if [[ $train_mode == "default" && ${config_option} -eq 0 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=0
    checkpoint_keep_interval_updates=-1
elif [[ $train_mode == "disp" && ${config_option} -eq 0 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=4
    dataset_batch_size=96
    model_dispersive_loss=1
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
elif [[ $train_mode == "disp" && ${config_option} -eq 1 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=1
    dataset_batch_size=384
    model_dispersive_loss=1
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
elif [[ $train_mode == "disp" && ${config_option} -eq 2 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=1
    dataset_batch_size=384
    model_dispersive_loss=10.0
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
elif [[ $train_mode == "disp" && ${config_option} -eq 3 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=1
    dataset_batch_size=384
    model_dispersive_loss=100.0
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
elif [[ $train_mode == "disp" && ${config_option} -eq 4 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=1
    dataset_batch_size=384
    model_dispersive_loss=10000.0
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
elif [[ $train_mode == "disp" && ${config_option} -eq 5 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=1
    dataset_batch_size=384
    model_dispersive_loss=1000.0
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
elif [[ $train_mode == "disp" && ${config_option} -eq 6 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=4
    dataset_batch_size=96
    model_dispersive_loss=1000.0
    model_dispersive_loss_layer=10
    checkpoint_keep_interval_updates=1
elif [[ $train_mode == "clap" && ${config_option} -eq 0 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=12
    model_add_conv=false
elif [[ $train_mode == "clap" && ${config_option} -eq 1 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=1
# loss type ablation
elif [[ $train_mode == "clap" && ${config_option} -eq 2 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="ce"
elif [[ $train_mode == "clap" && ${config_option} -eq 3 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="l1"
elif [[ $train_mode == "clap" && ${config_option} -eq 4 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="cosine"
# loss layer ablation
elif [[ $train_mode == "clap" && ${config_option} -eq 5 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="mse"
    model_clap_loss_layer=10
elif [[ $train_mode == "clap" && ${config_option} -eq 6 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="mse"
    model_clap_loss_layer=8
elif [[ $train_mode == "clap" && ${config_option} -eq 7 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="mse"
    model_clap_loss_layer=6
elif [[ $train_mode == "clap" && ${config_option} -eq 8 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    model_clap_loss=5.0
    dataset_batch_size=96
    average_top_k_layers=12
    model_clap_loss_type="mse"
    checkpoint_keep_interval_updates=-1
elif [[ $train_mode == "clap" && ${config_option} -eq 9 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    model_clap_loss=0.1
    dataset_batch_size=96
    average_top_k_layers=12
    model_clap_loss_type="mse"
    checkpoint_keep_interval_updates=-1
elif [[ $train_mode == "ast" && ${config_option} -eq 0 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
    task_load_clap_emb=true
    model_proj_type=4
    model_clone_batch=4
    model_clap_loss=1.0
    dataset_batch_size=48
elif [[ $train_mode == "ast" && ${config_option} -eq 1 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
    task_load_clap_emb=true
    model_proj_type=4
    model_clone_batch=4
    model_clap_loss=0.001
    dataset_batch_size=48
elif [[ $train_mode == "ast" && ${config_option} -eq 2 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
    task_load_clap_emb=true
    model_proj_type=4
    model_clone_batch=4
    model_clap_loss=0.01
    dataset_batch_size=48
elif [[ $train_mode == "ast" && ${config_option} -eq 3 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_out
    task_load_clap_emb=true
    model_proj_type=6
    model_clone_batch=4
    dataset_batch_size=48
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 0 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=64 # original 48 oom on 4090 24G change distributed_world_size
    model_clap_loss=1.0
    average_top_k_layers=11 # modify with model depth
    model_add_conv=true
    model_depth=11
    checkpoint_keep_interval_updates=-1 # default 1
    checkpoint_save_interval_updates=10000
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 1 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=48 # original 48 oom on 4090 24G change distributed_world_size
    model_clap_loss=1.0
    average_top_k_layers=12 # modify with model depth
    model_add_conv=true
    model_modalities_image_conv_resolution='[16]'
    model_modalities_image_conv_in_chans='[1,768]'
    model_depth=12
    checkpoint_keep_interval_updates=1 # default 1
    checkpoint_save_interval_updates=10000
else
    # No branch matched: stop instead of launching training with unset values.
    echo "Unknown configuration: train_mode='${train_mode}' config_option='${config_option}'" >&2
    exit 1
fi
|
| 310 |
+
|
| 311 |
+
# Launch pre-training through fairseq's Hydra entry point, passing the values
# chosen above as command-line overrides. Overrides written with a leading '+'
# use Hydra's syntax for adding a key to the config.
# The world size comes from the first positional argument and defaults to 2.
# Every expansion is quoted: the bracketed list values (e.g. [16]) are glob
# patterns when left unquoted, and any value containing whitespace would be
# split into several arguments.
python fairseq_cli/hydra_train.py -m \
    --config-dir ./EAT/config \
    --config-name pretraining_AS2M \
    common.user_dir=./EAT \
    "checkpoint.save_dir=${checkpoint_save_dir}" \
    "checkpoint.restore_file=${checkpoint_restore_file}" \
    "distributed_training.distributed_world_size=${1:-2}" \
    dataset.num_workers=24 \
    dataset.data_buffer_size=48 \
    "dataset.batch_size=${dataset_batch_size}" \
    "task.data=${task_data}" \
    task.h5_format=False \
    "task.load_clap_emb=${task_load_clap_emb}" \
    "+task.load_source_file=${task_load_source_file}" \
    "+task.load_mel_file=${task_load_mel_file}" \
    "model.proj_type=${model_proj_type}" \
    "model.clone_batch=${model_clone_batch}" \
    "model.clap_loss=${model_clap_loss}" \
    "model.average_top_k_layers=${average_top_k_layers}" \
    "+model.add_conv=${model_add_conv}" \
    "+model.clap_loss_type=${model_clap_loss_type}" \
    "+model.clap_loss_layer=${model_clap_loss_layer}" \
    "+model.dispersive_loss=${model_dispersive_loss}" \
    "+model.dispersive_loss_layer=${model_dispersive_loss_layer}" \
    "model.depth=${model_depth}" \
    "+model.modalities.image.conv_resolution=${model_modalities_image_conv_resolution}" \
    "+model.modalities.image.conv_in_chans=${model_modalities_image_conv_in_chans}" \
    "checkpoint.keep_interval_updates=${checkpoint_keep_interval_updates}" \
    "checkpoint.save_interval_updates=${checkpoint_save_interval_updates}"
|
pre_4_AS2M/conv_clap_1_2025-09-30_07-08-58/pretraining_AS2M.sh
ADDED
|
@@ -0,0 +1,336 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
# config options
|
| 3 |
+
train_mode=conv_clap
|
| 4 |
+
config_option=1
|
| 5 |
+
# world size: pass it as the first positional argument (defaults to 2)
|
| 6 |
+
|
| 7 |
+
# shared config
|
| 8 |
+
SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
|
| 9 |
+
checkpoint_save_dir=${SAVE_DIR_ROOT}/${train_mode}_${config_option}_$(date +"%Y-%m-%d_%H-%M-%S")
|
| 10 |
+
checkpoint_restore_file=${checkpoint_save_dir}/checkpoint_last.pt
|
| 11 |
+
|
| 12 |
+
# Absolute path and file name of this script (symlinks resolved)
|
| 13 |
+
script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
|
| 14 |
+
script_name="$(basename -- "$script_path")"
|
| 15 |
+
# Create the directory and copy the script into it (preserving permissions and timestamps)
|
| 16 |
+
mkdir -p -- "$checkpoint_save_dir"
|
| 17 |
+
cp -p -- "$script_path" "$checkpoint_save_dir/$script_name"
|
| 18 |
+
echo "script_path: ${script_path}"
|
| 19 |
+
echo "checkpoint_save_dir: ${checkpoint_save_dir}"
|
| 20 |
+
|
| 21 |
+
# default setting
|
| 22 |
+
model_clone_batch=4
|
| 23 |
+
dataset_batch_size=48
|
| 24 |
+
model_clap_loss=0
|
| 25 |
+
model_clap_loss_type="mse" # option ce cosine l1
|
| 26 |
+
model_clap_loss_layer=0
|
| 27 |
+
average_top_k_layers=12
|
| 28 |
+
model_add_conv=false
|
| 29 |
+
model_depth=12
|
| 30 |
+
model_dispersive_loss=0
|
| 31 |
+
model_dispersive_loss_layer=0
|
| 32 |
+
checkpoint_keep_interval_updates=1 # TODO: change this parameter if needed
|
| 33 |
+
checkpoint_save_interval_updates=10000
|
| 34 |
+
model_modalities_image_conv_option=0
|
| 35 |
+
|
| 36 |
+
if [[ $train_mode == "default" && ${config_option} -eq 0 ]]; then
|
| 37 |
+
echo "Config ${train_mode} ${config_option}"
|
| 38 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 39 |
+
task_load_clap_emb=false
|
| 40 |
+
task_load_source_file=true
|
| 41 |
+
task_load_mel_file=false
|
| 42 |
+
model_proj_type=null
|
| 43 |
+
model_clone_batch=4
|
| 44 |
+
dataset_batch_size=96
|
| 45 |
+
model_clap_loss=0
|
| 46 |
+
checkpoint_keep_interval_updates=-1
|
| 47 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 0 ]]; then
|
| 48 |
+
echo "Config ${train_mode} ${config_option}"
|
| 49 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 50 |
+
task_load_clap_emb=false
|
| 51 |
+
task_load_source_file=true
|
| 52 |
+
task_load_mel_file=false
|
| 53 |
+
model_proj_type=null
|
| 54 |
+
model_clone_batch=4
|
| 55 |
+
dataset_batch_size=96
|
| 56 |
+
model_dispersive_loss=1
|
| 57 |
+
model_dispersive_loss_layer=0
|
| 58 |
+
checkpoint_keep_interval_updates=1
|
| 59 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 1 ]]; then
|
| 60 |
+
echo "Config ${train_mode} ${config_option}"
|
| 61 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 62 |
+
task_load_clap_emb=false
|
| 63 |
+
task_load_source_file=true
|
| 64 |
+
task_load_mel_file=false
|
| 65 |
+
model_proj_type=null
|
| 66 |
+
model_clone_batch=1
|
| 67 |
+
dataset_batch_size=384
|
| 68 |
+
model_dispersive_loss=1
|
| 69 |
+
model_dispersive_loss_layer=0
|
| 70 |
+
checkpoint_keep_interval_updates=1
|
| 71 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 2 ]]; then
|
| 72 |
+
echo "Config ${train_mode} ${config_option}"
|
| 73 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 74 |
+
task_load_clap_emb=false
|
| 75 |
+
task_load_source_file=true
|
| 76 |
+
task_load_mel_file=false
|
| 77 |
+
model_proj_type=null
|
| 78 |
+
model_clone_batch=1
|
| 79 |
+
dataset_batch_size=384
|
| 80 |
+
model_dispersive_loss=10.0
|
| 81 |
+
model_dispersive_loss_layer=0
|
| 82 |
+
checkpoint_keep_interval_updates=1
|
| 83 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 3 ]]; then
|
| 84 |
+
echo "Config ${train_mode} ${config_option}"
|
| 85 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 86 |
+
task_load_clap_emb=false
|
| 87 |
+
task_load_source_file=true
|
| 88 |
+
task_load_mel_file=false
|
| 89 |
+
model_proj_type=null
|
| 90 |
+
model_clone_batch=1
|
| 91 |
+
dataset_batch_size=384
|
| 92 |
+
model_dispersive_loss=100.0
|
| 93 |
+
model_dispersive_loss_layer=0
|
| 94 |
+
checkpoint_keep_interval_updates=1
|
| 95 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 4 ]]; then
|
| 96 |
+
echo "Config ${train_mode} ${config_option}"
|
| 97 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 98 |
+
task_load_clap_emb=false
|
| 99 |
+
task_load_source_file=true
|
| 100 |
+
task_load_mel_file=false
|
| 101 |
+
model_proj_type=null
|
| 102 |
+
model_clone_batch=1
|
| 103 |
+
dataset_batch_size=384
|
| 104 |
+
model_dispersive_loss=10000.0
|
| 105 |
+
model_dispersive_loss_layer=0
|
| 106 |
+
checkpoint_keep_interval_updates=1
|
| 107 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 5 ]]; then
|
| 108 |
+
echo "Config ${train_mode} ${config_option}"
|
| 109 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 110 |
+
task_load_clap_emb=false
|
| 111 |
+
task_load_source_file=true
|
| 112 |
+
task_load_mel_file=false
|
| 113 |
+
model_proj_type=null
|
| 114 |
+
model_clone_batch=1
|
| 115 |
+
dataset_batch_size=384
|
| 116 |
+
model_dispersive_loss=1000.0
|
| 117 |
+
model_dispersive_loss_layer=0
|
| 118 |
+
checkpoint_keep_interval_updates=1
|
| 119 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 6 ]]; then
|
| 120 |
+
echo "Config ${train_mode} ${config_option}"
|
| 121 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 122 |
+
task_load_clap_emb=false
|
| 123 |
+
task_load_source_file=true
|
| 124 |
+
task_load_mel_file=false
|
| 125 |
+
model_proj_type=null
|
| 126 |
+
model_clone_batch=4
|
| 127 |
+
dataset_batch_size=96
|
| 128 |
+
model_dispersive_loss=1000.0
|
| 129 |
+
model_dispersive_loss_layer=10
|
| 130 |
+
checkpoint_keep_interval_updates=1
|
| 131 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 0 ]]; then
|
| 132 |
+
echo "Config ${train_mode} ${config_option}"
|
| 133 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 134 |
+
task_load_clap_emb=true
|
| 135 |
+
model_proj_type=2
|
| 136 |
+
model_clone_batch=4
|
| 137 |
+
dataset_batch_size=48
|
| 138 |
+
model_clap_loss=1.0
|
| 139 |
+
average_top_k_layers=12
|
| 140 |
+
model_add_conv=false
|
| 141 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 1 ]]; then
|
| 142 |
+
echo "Config ${train_mode} ${config_option}"
|
| 143 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 144 |
+
task_load_clap_emb=true
|
| 145 |
+
model_proj_type=2
|
| 146 |
+
model_clone_batch=4
|
| 147 |
+
dataset_batch_size=48
|
| 148 |
+
model_clap_loss=1.0
|
| 149 |
+
average_top_k_layers=1
|
| 150 |
+
# loss type ablation
|
| 151 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 2 ]]; then
|
| 152 |
+
echo "Config ${train_mode} ${config_option}"
|
| 153 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 154 |
+
task_load_clap_emb=true
|
| 155 |
+
model_proj_type=2
|
| 156 |
+
model_clone_batch=4
|
| 157 |
+
dataset_batch_size=48
|
| 158 |
+
model_clap_loss=1.0
|
| 159 |
+
average_top_k_layers=12
|
| 160 |
+
model_clap_loss_type="ce"
|
| 161 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 3 ]]; then
|
| 162 |
+
echo "Config ${train_mode} ${config_option}"
|
| 163 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 164 |
+
task_load_clap_emb=true
|
| 165 |
+
model_proj_type=2
|
| 166 |
+
model_clone_batch=4
|
| 167 |
+
dataset_batch_size=48
|
| 168 |
+
model_clap_loss=1.0
|
| 169 |
+
average_top_k_layers=12
|
| 170 |
+
model_clap_loss_type="l1"
|
| 171 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 4 ]]; then
|
| 172 |
+
echo "Config ${train_mode} ${config_option}"
|
| 173 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 174 |
+
task_load_clap_emb=true
|
| 175 |
+
model_proj_type=2
|
| 176 |
+
model_clone_batch=4
|
| 177 |
+
dataset_batch_size=96
|
| 178 |
+
model_clap_loss=1.0
|
| 179 |
+
average_top_k_layers=12
|
| 180 |
+
model_clap_loss_type="cosine"
|
| 181 |
+
# loss layer ablation
|
| 182 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 5 ]]; then
|
| 183 |
+
echo "Config ${train_mode} ${config_option}"
|
| 184 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 185 |
+
task_load_clap_emb=true
|
| 186 |
+
model_proj_type=2
|
| 187 |
+
model_clone_batch=4
|
| 188 |
+
dataset_batch_size=96
|
| 189 |
+
model_clap_loss=1.0
|
| 190 |
+
average_top_k_layers=12
|
| 191 |
+
model_clap_loss_type="mse"
|
| 192 |
+
model_clap_loss_layer=10
|
| 193 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 6 ]]; then
|
| 194 |
+
echo "Config ${train_mode} ${config_option}"
|
| 195 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 196 |
+
task_load_clap_emb=true
|
| 197 |
+
task_load_source_file=true
|
| 198 |
+
task_load_mel_file=false
|
| 199 |
+
model_proj_type=2
|
| 200 |
+
model_clone_batch=4
|
| 201 |
+
dataset_batch_size=96
|
| 202 |
+
model_clap_loss=1.0
|
| 203 |
+
average_top_k_layers=12
|
| 204 |
+
model_clap_loss_type="mse"
|
| 205 |
+
model_clap_loss_layer=8
|
| 206 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 7 ]]; then
|
| 207 |
+
echo "Config ${train_mode} ${config_option}"
|
| 208 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 209 |
+
task_load_clap_emb=true
|
| 210 |
+
task_load_source_file=true
|
| 211 |
+
task_load_mel_file=false
|
| 212 |
+
model_proj_type=2
|
| 213 |
+
model_clone_batch=4
|
| 214 |
+
dataset_batch_size=96
|
| 215 |
+
model_clap_loss=1.0
|
| 216 |
+
average_top_k_layers=12
|
| 217 |
+
model_clap_loss_type="mse"
|
| 218 |
+
model_clap_loss_layer=6
|
| 219 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 8 ]]; then
|
| 220 |
+
echo "Config ${train_mode} ${config_option}"
|
| 221 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 222 |
+
task_load_clap_emb=true
|
| 223 |
+
task_load_source_file=true
|
| 224 |
+
task_load_mel_file=false
|
| 225 |
+
model_proj_type=2
|
| 226 |
+
model_clone_batch=4
|
| 227 |
+
model_clap_loss=5.0
|
| 228 |
+
dataset_batch_size=96
|
| 229 |
+
average_top_k_layers=12
|
| 230 |
+
model_clap_loss_type="mse"
|
| 231 |
+
checkpoint_keep_interval_updates=-1
|
| 232 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 9 ]]; then
|
| 233 |
+
echo "Config ${train_mode} ${config_option}"
|
| 234 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 235 |
+
task_load_clap_emb=true
|
| 236 |
+
task_load_source_file=true
|
| 237 |
+
task_load_mel_file=false
|
| 238 |
+
model_proj_type=2
|
| 239 |
+
model_clone_batch=4
|
| 240 |
+
model_clap_loss=0.1
|
| 241 |
+
dataset_batch_size=96
|
| 242 |
+
average_top_k_layers=12
|
| 243 |
+
model_clap_loss_type="mse"
|
| 244 |
+
checkpoint_keep_interval_updates=-1
|
| 245 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 0 ]]; then
|
| 246 |
+
echo "Config ${train_mode} ${config_option}"
|
| 247 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
|
| 248 |
+
task_load_clap_emb=true
|
| 249 |
+
model_proj_type=4
|
| 250 |
+
model_clone_batch=4
|
| 251 |
+
model_clap_loss=1.0
|
| 252 |
+
dataset_batch_size=48
|
| 253 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 1 ]]; then
|
| 254 |
+
echo "Config ${train_mode} ${config_option}"
|
| 255 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
|
| 256 |
+
task_load_clap_emb=true
|
| 257 |
+
model_proj_type=4
|
| 258 |
+
model_clone_batch=4
|
| 259 |
+
model_clap_loss=0.001
|
| 260 |
+
dataset_batch_size=48
|
| 261 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 2 ]]; then
|
| 262 |
+
echo "Config ${train_mode} ${config_option}"
|
| 263 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
|
| 264 |
+
task_load_clap_emb=true
|
| 265 |
+
model_proj_type=4
|
| 266 |
+
model_clone_batch=4
|
| 267 |
+
model_clap_loss=0.01
|
| 268 |
+
dataset_batch_size=48
|
| 269 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 3 ]]; then
|
| 270 |
+
echo "Config ${train_mode} ${config_option}"
|
| 271 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_out
|
| 272 |
+
task_load_clap_emb=true
|
| 273 |
+
model_proj_type=6
|
| 274 |
+
model_clone_batch=4
|
| 275 |
+
dataset_batch_size=48
|
| 276 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 0 ]]; then
|
| 277 |
+
echo "Config ${train_mode} ${config_option}"
|
| 278 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 279 |
+
task_load_clap_emb=true
|
| 280 |
+
task_load_source_file=true
|
| 281 |
+
task_load_mel_file=false
|
| 282 |
+
model_proj_type=2
|
| 283 |
+
model_clone_batch=4
|
| 284 |
+
dataset_batch_size=64 # original 48 oom on 4090 24G change distributed_world_size
|
| 285 |
+
model_clap_loss=1.0
|
| 286 |
+
average_top_k_layers=11 # modify with model depth
|
| 287 |
+
model_add_conv=true
|
| 288 |
+
model_depth=11 #
|
| 289 |
+
checkpoint_keep_interval_updates=-1 # default 1
|
| 290 |
+
checkpoint_save_interval_updates=10000
|
| 291 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 1 ]]; then
|
| 292 |
+
echo "Config ${train_mode} ${config_option}"
|
| 293 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 294 |
+
task_load_clap_emb=true
|
| 295 |
+
task_load_source_file=true
|
| 296 |
+
task_load_mel_file=false
|
| 297 |
+
model_proj_type=2
|
| 298 |
+
model_clone_batch=4
|
| 299 |
+
dataset_batch_size=48 # original 48 oom on 4090 24G change distributed_world_size
|
| 300 |
+
model_clap_loss=1.0
|
| 301 |
+
average_top_k_layers=12 # modify with model depth
|
| 302 |
+
model_add_conv=true
|
| 303 |
+
model_modalities_image_conv_option=1
|
| 304 |
+
model_depth=12 #
|
| 305 |
+
checkpoint_keep_interval_updates=1 # default 1
|
| 306 |
+
checkpoint_save_interval_updates=10000
|
| 307 |
+
fi
|
| 308 |
+
|
| 309 |
+
python fairseq_cli/hydra_train.py -m \
|
| 310 |
+
--config-dir ./EAT/config \
|
| 311 |
+
--config-name pretraining_AS2M \
|
| 312 |
+
common.user_dir=./EAT \
|
| 313 |
+
checkpoint.save_dir=${checkpoint_save_dir} \
|
| 314 |
+
checkpoint.restore_file=${checkpoint_restore_file} \
|
| 315 |
+
distributed_training.distributed_world_size=${1:-2} \
|
| 316 |
+
dataset.num_workers=24 \
|
| 317 |
+
dataset.data_buffer_size=48 \
|
| 318 |
+
dataset.batch_size=${dataset_batch_size} \
|
| 319 |
+
task.data=${task_data} \
|
| 320 |
+
task.h5_format=False \
|
| 321 |
+
task.load_clap_emb=${task_load_clap_emb} \
|
| 322 |
+
+task.load_source_file=${task_load_source_file} \
|
| 323 |
+
+task.load_mel_file=${task_load_mel_file} \
|
| 324 |
+
model.proj_type=${model_proj_type} \
|
| 325 |
+
model.clone_batch=${model_clone_batch} \
|
| 326 |
+
model.clap_loss=${model_clap_loss} \
|
| 327 |
+
model.average_top_k_layers=${average_top_k_layers} \
|
| 328 |
+
+model.add_conv=${model_add_conv} \
|
| 329 |
+
+model.clap_loss_type=${model_clap_loss_type} \
|
| 330 |
+
+model.clap_loss_layer=${model_clap_loss_layer} \
|
| 331 |
+
+model.dispersive_loss=${model_dispersive_loss} \
|
| 332 |
+
+model.dispersive_loss_layer=${model_dispersive_loss_layer} \
|
| 333 |
+
model.depth=${model_depth} \
|
| 334 |
+
+model.modalities.image.conv_option=${model_modalities_image_conv_option} \
|
| 335 |
+
checkpoint.keep_interval_updates=${checkpoint_keep_interval_updates} \
|
| 336 |
+
checkpoint.save_interval_updates=${checkpoint_save_interval_updates}
|
pre_4_AS2M/conv_clap_1_2025-09-30_07-14-17/pretraining_AS2M.sh
ADDED
|
@@ -0,0 +1,336 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
# config options
|
| 3 |
+
train_mode=conv_clap
|
| 4 |
+
config_option=1
|
| 5 |
+
# world size: pass it as the first positional argument (defaults to 2)
|
| 6 |
+
|
| 7 |
+
# shared config
|
| 8 |
+
SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
|
| 9 |
+
checkpoint_save_dir=${SAVE_DIR_ROOT}/${train_mode}_${config_option}_$(date +"%Y-%m-%d_%H-%M-%S")
|
| 10 |
+
checkpoint_restore_file=${checkpoint_save_dir}/checkpoint_last.pt
|
| 11 |
+
|
| 12 |
+
# Absolute path and file name of this script (symlinks resolved)
|
| 13 |
+
script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
|
| 14 |
+
script_name="$(basename -- "$script_path")"
|
| 15 |
+
# Create the directory and copy the script into it (preserving permissions and timestamps)
|
| 16 |
+
mkdir -p -- "$checkpoint_save_dir"
|
| 17 |
+
cp -p -- "$script_path" "$checkpoint_save_dir/$script_name"
|
| 18 |
+
echo "script_path: ${script_path}"
|
| 19 |
+
echo "checkpoint_save_dir: ${checkpoint_save_dir}"
|
| 20 |
+
|
| 21 |
+
# default setting
|
| 22 |
+
model_clone_batch=4
|
| 23 |
+
dataset_batch_size=48
|
| 24 |
+
model_clap_loss=0
|
| 25 |
+
model_clap_loss_type="mse" # option ce cosine l1
|
| 26 |
+
model_clap_loss_layer=0
|
| 27 |
+
average_top_k_layers=12
|
| 28 |
+
model_add_conv=false
|
| 29 |
+
model_depth=12
|
| 30 |
+
model_dispersive_loss=0
|
| 31 |
+
model_dispersive_loss_layer=0
|
| 32 |
+
checkpoint_keep_interval_updates=1 # TODO: change this parameter if needed
|
| 33 |
+
checkpoint_save_interval_updates=10000
|
| 34 |
+
model_modalities_image_conv_option=0
|
| 35 |
+
|
| 36 |
+
if [[ $train_mode == "default" && ${config_option} -eq 0 ]]; then
|
| 37 |
+
echo "Config ${train_mode} ${config_option}"
|
| 38 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 39 |
+
task_load_clap_emb=false
|
| 40 |
+
task_load_source_file=true
|
| 41 |
+
task_load_mel_file=false
|
| 42 |
+
model_proj_type=null
|
| 43 |
+
model_clone_batch=4
|
| 44 |
+
dataset_batch_size=96
|
| 45 |
+
model_clap_loss=0
|
| 46 |
+
checkpoint_keep_interval_updates=-1
|
| 47 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 0 ]]; then
|
| 48 |
+
echo "Config ${train_mode} ${config_option}"
|
| 49 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 50 |
+
task_load_clap_emb=false
|
| 51 |
+
task_load_source_file=true
|
| 52 |
+
task_load_mel_file=false
|
| 53 |
+
model_proj_type=null
|
| 54 |
+
model_clone_batch=4
|
| 55 |
+
dataset_batch_size=96
|
| 56 |
+
model_dispersive_loss=1
|
| 57 |
+
model_dispersive_loss_layer=0
|
| 58 |
+
checkpoint_keep_interval_updates=1
|
| 59 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 1 ]]; then
|
| 60 |
+
echo "Config ${train_mode} ${config_option}"
|
| 61 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 62 |
+
task_load_clap_emb=false
|
| 63 |
+
task_load_source_file=true
|
| 64 |
+
task_load_mel_file=false
|
| 65 |
+
model_proj_type=null
|
| 66 |
+
model_clone_batch=1
|
| 67 |
+
dataset_batch_size=384
|
| 68 |
+
model_dispersive_loss=1
|
| 69 |
+
model_dispersive_loss_layer=0
|
| 70 |
+
checkpoint_keep_interval_updates=1
|
| 71 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 2 ]]; then
|
| 72 |
+
echo "Config ${train_mode} ${config_option}"
|
| 73 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 74 |
+
task_load_clap_emb=false
|
| 75 |
+
task_load_source_file=true
|
| 76 |
+
task_load_mel_file=false
|
| 77 |
+
model_proj_type=null
|
| 78 |
+
model_clone_batch=1
|
| 79 |
+
dataset_batch_size=384
|
| 80 |
+
model_dispersive_loss=10.0
|
| 81 |
+
model_dispersive_loss_layer=0
|
| 82 |
+
checkpoint_keep_interval_updates=1
|
| 83 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 3 ]]; then
|
| 84 |
+
echo "Config ${train_mode} ${config_option}"
|
| 85 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 86 |
+
task_load_clap_emb=false
|
| 87 |
+
task_load_source_file=true
|
| 88 |
+
task_load_mel_file=false
|
| 89 |
+
model_proj_type=null
|
| 90 |
+
model_clone_batch=1
|
| 91 |
+
dataset_batch_size=384
|
| 92 |
+
model_dispersive_loss=100.0
|
| 93 |
+
model_dispersive_loss_layer=0
|
| 94 |
+
checkpoint_keep_interval_updates=1
|
| 95 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 4 ]]; then
|
| 96 |
+
echo "Config ${train_mode} ${config_option}"
|
| 97 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 98 |
+
task_load_clap_emb=false
|
| 99 |
+
task_load_source_file=true
|
| 100 |
+
task_load_mel_file=false
|
| 101 |
+
model_proj_type=null
|
| 102 |
+
model_clone_batch=1
|
| 103 |
+
dataset_batch_size=384
|
| 104 |
+
model_dispersive_loss=10000.0
|
| 105 |
+
model_dispersive_loss_layer=0
|
| 106 |
+
checkpoint_keep_interval_updates=1
|
| 107 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 5 ]]; then
|
| 108 |
+
echo "Config ${train_mode} ${config_option}"
|
| 109 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 110 |
+
task_load_clap_emb=false
|
| 111 |
+
task_load_source_file=true
|
| 112 |
+
task_load_mel_file=false
|
| 113 |
+
model_proj_type=null
|
| 114 |
+
model_clone_batch=1
|
| 115 |
+
dataset_batch_size=384
|
| 116 |
+
model_dispersive_loss=1000.0
|
| 117 |
+
model_dispersive_loss_layer=0
|
| 118 |
+
checkpoint_keep_interval_updates=1
|
| 119 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 6 ]]; then
|
| 120 |
+
echo "Config ${train_mode} ${config_option}"
|
| 121 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 122 |
+
task_load_clap_emb=false
|
| 123 |
+
task_load_source_file=true
|
| 124 |
+
task_load_mel_file=false
|
| 125 |
+
model_proj_type=null
|
| 126 |
+
model_clone_batch=4
|
| 127 |
+
dataset_batch_size=96
|
| 128 |
+
model_dispersive_loss=1000.0
|
| 129 |
+
model_dispersive_loss_layer=10
|
| 130 |
+
checkpoint_keep_interval_updates=1
|
| 131 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 0 ]]; then
|
| 132 |
+
echo "Config ${train_mode} ${config_option}"
|
| 133 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 134 |
+
task_load_clap_emb=true
|
| 135 |
+
model_proj_type=2
|
| 136 |
+
model_clone_batch=4
|
| 137 |
+
dataset_batch_size=48
|
| 138 |
+
model_clap_loss=1.0
|
| 139 |
+
average_top_k_layers=12
|
| 140 |
+
model_add_conv=false
|
| 141 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 1 ]]; then
|
| 142 |
+
echo "Config ${train_mode} ${config_option}"
|
| 143 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 144 |
+
task_load_clap_emb=true
|
| 145 |
+
model_proj_type=2
|
| 146 |
+
model_clone_batch=4
|
| 147 |
+
dataset_batch_size=48
|
| 148 |
+
model_clap_loss=1.0
|
| 149 |
+
average_top_k_layers=1
|
| 150 |
+
# loss type ablation
|
| 151 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 2 ]]; then
|
| 152 |
+
echo "Config ${train_mode} ${config_option}"
|
| 153 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 154 |
+
task_load_clap_emb=true
|
| 155 |
+
model_proj_type=2
|
| 156 |
+
model_clone_batch=4
|
| 157 |
+
dataset_batch_size=48
|
| 158 |
+
model_clap_loss=1.0
|
| 159 |
+
average_top_k_layers=12
|
| 160 |
+
model_clap_loss_type="ce"
|
| 161 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 3 ]]; then
|
| 162 |
+
echo "Config ${train_mode} ${config_option}"
|
| 163 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 164 |
+
task_load_clap_emb=true
|
| 165 |
+
model_proj_type=2
|
| 166 |
+
model_clone_batch=4
|
| 167 |
+
dataset_batch_size=48
|
| 168 |
+
model_clap_loss=1.0
|
| 169 |
+
average_top_k_layers=12
|
| 170 |
+
model_clap_loss_type="l1"
|
| 171 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 4 ]]; then
|
| 172 |
+
echo "Config ${train_mode} ${config_option}"
|
| 173 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 174 |
+
task_load_clap_emb=true
|
| 175 |
+
model_proj_type=2
|
| 176 |
+
model_clone_batch=4
|
| 177 |
+
dataset_batch_size=96
|
| 178 |
+
model_clap_loss=1.0
|
| 179 |
+
average_top_k_layers=12
|
| 180 |
+
model_clap_loss_type="cosine"
|
| 181 |
+
# loss layer ablation
|
| 182 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 5 ]]; then
|
| 183 |
+
echo "Config ${train_mode} ${config_option}"
|
| 184 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 185 |
+
task_load_clap_emb=true
|
| 186 |
+
model_proj_type=2
|
| 187 |
+
model_clone_batch=4
|
| 188 |
+
dataset_batch_size=96
|
| 189 |
+
model_clap_loss=1.0
|
| 190 |
+
average_top_k_layers=12
|
| 191 |
+
model_clap_loss_type="mse"
|
| 192 |
+
model_clap_loss_layer=10
|
| 193 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 6 ]]; then
|
| 194 |
+
echo "Config ${train_mode} ${config_option}"
|
| 195 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 196 |
+
task_load_clap_emb=true
|
| 197 |
+
task_load_source_file=true
|
| 198 |
+
task_load_mel_file=false
|
| 199 |
+
model_proj_type=2
|
| 200 |
+
model_clone_batch=4
|
| 201 |
+
dataset_batch_size=96
|
| 202 |
+
model_clap_loss=1.0
|
| 203 |
+
average_top_k_layers=12
|
| 204 |
+
model_clap_loss_type="mse"
|
| 205 |
+
model_clap_loss_layer=8
|
| 206 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 7 ]]; then
|
| 207 |
+
echo "Config ${train_mode} ${config_option}"
|
| 208 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 209 |
+
task_load_clap_emb=true
|
| 210 |
+
task_load_source_file=true
|
| 211 |
+
task_load_mel_file=false
|
| 212 |
+
model_proj_type=2
|
| 213 |
+
model_clone_batch=4
|
| 214 |
+
dataset_batch_size=96
|
| 215 |
+
model_clap_loss=1.0
|
| 216 |
+
average_top_k_layers=12
|
| 217 |
+
model_clap_loss_type="mse"
|
| 218 |
+
model_clap_loss_layer=6
|
| 219 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 8 ]]; then
|
| 220 |
+
echo "Config ${train_mode} ${config_option}"
|
| 221 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 222 |
+
task_load_clap_emb=true
|
| 223 |
+
task_load_source_file=true
|
| 224 |
+
task_load_mel_file=false
|
| 225 |
+
model_proj_type=2
|
| 226 |
+
model_clone_batch=4
|
| 227 |
+
model_clap_loss=5.0
|
| 228 |
+
dataset_batch_size=96
|
| 229 |
+
average_top_k_layers=12
|
| 230 |
+
model_clap_loss_type="mse"
|
| 231 |
+
checkpoint_keep_interval_updates=-1
|
| 232 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 9 ]]; then
|
| 233 |
+
echo "Config ${train_mode} ${config_option}"
|
| 234 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 235 |
+
task_load_clap_emb=true
|
| 236 |
+
task_load_source_file=true
|
| 237 |
+
task_load_mel_file=false
|
| 238 |
+
model_proj_type=2
|
| 239 |
+
model_clone_batch=4
|
| 240 |
+
model_clap_loss=0.1
|
| 241 |
+
dataset_batch_size=96
|
| 242 |
+
average_top_k_layers=12
|
| 243 |
+
model_clap_loss_type="mse"
|
| 244 |
+
checkpoint_keep_interval_updates=-1
|
| 245 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 0 ]]; then
|
| 246 |
+
echo "Config ${train_mode} ${config_option}"
|
| 247 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
|
| 248 |
+
task_load_clap_emb=true
|
| 249 |
+
model_proj_type=4
|
| 250 |
+
model_clone_batch=4
|
| 251 |
+
model_clap_loss=1.0
|
| 252 |
+
dataset_batch_size=48
|
| 253 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 1 ]]; then
|
| 254 |
+
echo "Config ${train_mode} ${config_option}"
|
| 255 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
|
| 256 |
+
task_load_clap_emb=true
|
| 257 |
+
model_proj_type=4
|
| 258 |
+
model_clone_batch=4
|
| 259 |
+
model_clap_loss=0.001
|
| 260 |
+
dataset_batch_size=48
|
| 261 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 2 ]]; then
|
| 262 |
+
echo "Config ${train_mode} ${config_option}"
|
| 263 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
|
| 264 |
+
task_load_clap_emb=true
|
| 265 |
+
model_proj_type=4
|
| 266 |
+
model_clone_batch=4
|
| 267 |
+
model_clap_loss=0.01
|
| 268 |
+
dataset_batch_size=48
|
| 269 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 3 ]]; then
|
| 270 |
+
echo "Config ${train_mode} ${config_option}"
|
| 271 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_out
|
| 272 |
+
task_load_clap_emb=true
|
| 273 |
+
model_proj_type=6
|
| 274 |
+
model_clone_batch=4
|
| 275 |
+
dataset_batch_size=48
|
| 276 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 0 ]]; then
|
| 277 |
+
echo "Config ${train_mode} ${config_option}"
|
| 278 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 279 |
+
task_load_clap_emb=true
|
| 280 |
+
task_load_source_file=true
|
| 281 |
+
task_load_mel_file=false
|
| 282 |
+
model_proj_type=2
|
| 283 |
+
model_clone_batch=4
|
| 284 |
+
dataset_batch_size=64 # original 48 oom on 4090 24G change distributed_world_size
|
| 285 |
+
model_clap_loss=1.0
|
| 286 |
+
average_top_k_layers=11 # modify with model depth
|
| 287 |
+
model_add_conv=true
|
| 288 |
+
model_depth=11 #
|
| 289 |
+
checkpoint_keep_interval_updates=-1 # default 1
|
| 290 |
+
checkpoint_save_interval_updates=10000
|
| 291 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 1 ]]; then
|
| 292 |
+
echo "Config ${train_mode} ${config_option}"
|
| 293 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 294 |
+
task_load_clap_emb=true
|
| 295 |
+
task_load_source_file=true
|
| 296 |
+
task_load_mel_file=false
|
| 297 |
+
model_proj_type=2
|
| 298 |
+
model_clone_batch=4
|
| 299 |
+
dataset_batch_size=48 # original 48 oom on 4090 24G change distributed_world_size
|
| 300 |
+
model_clap_loss=1.0
|
| 301 |
+
average_top_k_layers=12 # modify with model depth
|
| 302 |
+
model_add_conv=true
|
| 303 |
+
model_modalities_image_conv_option=1
|
| 304 |
+
model_depth=12 #
|
| 305 |
+
checkpoint_keep_interval_updates=1 # default 1
|
| 306 |
+
checkpoint_save_interval_updates=10000
|
| 307 |
+
fi
|
| 308 |
+
|
| 309 |
+
python fairseq_cli/hydra_train.py -m \
|
| 310 |
+
--config-dir ./EAT/config \
|
| 311 |
+
--config-name pretraining_AS2M \
|
| 312 |
+
common.user_dir=./EAT \
|
| 313 |
+
checkpoint.save_dir=${checkpoint_save_dir} \
|
| 314 |
+
checkpoint.restore_file=${checkpoint_restore_file} \
|
| 315 |
+
distributed_training.distributed_world_size=${1:-2} \
|
| 316 |
+
dataset.num_workers=24 \
|
| 317 |
+
dataset.data_buffer_size=48 \
|
| 318 |
+
dataset.batch_size=${dataset_batch_size} \
|
| 319 |
+
task.data=${task_data} \
|
| 320 |
+
task.h5_format=False \
|
| 321 |
+
task.load_clap_emb=${task_load_clap_emb} \
|
| 322 |
+
+task.load_source_file=${task_load_source_file} \
|
| 323 |
+
+task.load_mel_file=${task_load_mel_file} \
|
| 324 |
+
model.proj_type=${model_proj_type} \
|
| 325 |
+
model.clone_batch=${model_clone_batch} \
|
| 326 |
+
model.clap_loss=${model_clap_loss} \
|
| 327 |
+
model.average_top_k_layers=${average_top_k_layers} \
|
| 328 |
+
+model.add_conv=${model_add_conv} \
|
| 329 |
+
+model.clap_loss_type=${model_clap_loss_type} \
|
| 330 |
+
+model.clap_loss_layer=${model_clap_loss_layer} \
|
| 331 |
+
+model.dispersive_loss=${model_dispersive_loss} \
|
| 332 |
+
+model.dispersive_loss_layer=${model_dispersive_loss_layer} \
|
| 333 |
+
model.depth=${model_depth} \
|
| 334 |
+
+model.modalities.image.conv_option=${model_modalities_image_conv_option} \
|
| 335 |
+
checkpoint.keep_interval_updates=${checkpoint_keep_interval_updates} \
|
| 336 |
+
checkpoint.save_interval_updates=${checkpoint_save_interval_updates}
|
pre_4_AS2M/conv_clap_1_2025-09-30_07-19-43/pretraining_AS2M.sh
ADDED
|
@@ -0,0 +1,336 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
# config options
|
| 3 |
+
train_mode=conv_clap
|
| 4 |
+
config_option=1
|
| 5 |
+
# change world size
|
| 6 |
+
|
| 7 |
+
# shared config
|
| 8 |
+
SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
|
| 9 |
+
checkpoint_save_dir=${SAVE_DIR_ROOT}/${train_mode}_${config_option}_$(date +"%Y-%m-%d_%H-%M-%S")
|
| 10 |
+
checkpoint_restore_file=${checkpoint_save_dir}/checkpoint_last.pt
|
| 11 |
+
|
| 12 |
+
# Absolute path and file name of this script (symlinks resolved)
|
| 13 |
+
script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
|
| 14 |
+
script_name="$(basename -- "$script_path")"
|
| 15 |
+
# Create the save directory and copy this script into it (preserving permissions and timestamps)
|
| 16 |
+
mkdir -p -- "$checkpoint_save_dir"
|
| 17 |
+
cp -p -- "$script_path" "$checkpoint_save_dir/$script_name"
|
| 18 |
+
echo "script_path: ${script_path}"
|
| 19 |
+
echo "checkpoint_save_dir: ${checkpoint_save_dir}"
|
| 20 |
+
|
| 21 |
+
# default setting
|
| 22 |
+
model_clone_batch=4
|
| 23 |
+
dataset_batch_size=48
|
| 24 |
+
model_clap_loss=0
|
| 25 |
+
model_clap_loss_type="mse" # other options: ce, cosine, l1
|
| 26 |
+
model_clap_loss_layer=0
|
| 27 |
+
average_top_k_layers=12
|
| 28 |
+
model_add_conv=false
|
| 29 |
+
model_depth=12
|
| 30 |
+
model_dispersive_loss=0
|
| 31 |
+
model_dispersive_loss_layer=0
|
| 32 |
+
checkpoint_keep_interval_updates=1 # TODO: change this parameter if needed
|
| 33 |
+
checkpoint_save_interval_updates=10000
|
| 34 |
+
model_modalities_image_conv_option=0
|
| 35 |
+
|
| 36 |
+
if [[ $train_mode == "default" && ${config_option} -eq 0 ]]; then
|
| 37 |
+
echo "Config ${train_mode} ${config_option}"
|
| 38 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 39 |
+
task_load_clap_emb=false
|
| 40 |
+
task_load_source_file=true
|
| 41 |
+
task_load_mel_file=false
|
| 42 |
+
model_proj_type=null
|
| 43 |
+
model_clone_batch=4
|
| 44 |
+
dataset_batch_size=96
|
| 45 |
+
model_clap_loss=0
|
| 46 |
+
checkpoint_keep_interval_updates=-1
|
| 47 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 0 ]]; then
|
| 48 |
+
echo "Config ${train_mode} ${config_option}"
|
| 49 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 50 |
+
task_load_clap_emb=false
|
| 51 |
+
task_load_source_file=true
|
| 52 |
+
task_load_mel_file=false
|
| 53 |
+
model_proj_type=null
|
| 54 |
+
model_clone_batch=4
|
| 55 |
+
dataset_batch_size=96
|
| 56 |
+
model_dispersive_loss=1
|
| 57 |
+
model_dispersive_loss_layer=0
|
| 58 |
+
checkpoint_keep_interval_updates=1
|
| 59 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 1 ]]; then
|
| 60 |
+
echo "Config ${train_mode} ${config_option}"
|
| 61 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 62 |
+
task_load_clap_emb=false
|
| 63 |
+
task_load_source_file=true
|
| 64 |
+
task_load_mel_file=false
|
| 65 |
+
model_proj_type=null
|
| 66 |
+
model_clone_batch=1
|
| 67 |
+
dataset_batch_size=384
|
| 68 |
+
model_dispersive_loss=1
|
| 69 |
+
model_dispersive_loss_layer=0
|
| 70 |
+
checkpoint_keep_interval_updates=1
|
| 71 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 2 ]]; then
|
| 72 |
+
echo "Config ${train_mode} ${config_option}"
|
| 73 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 74 |
+
task_load_clap_emb=false
|
| 75 |
+
task_load_source_file=true
|
| 76 |
+
task_load_mel_file=false
|
| 77 |
+
model_proj_type=null
|
| 78 |
+
model_clone_batch=1
|
| 79 |
+
dataset_batch_size=384
|
| 80 |
+
model_dispersive_loss=10.0
|
| 81 |
+
model_dispersive_loss_layer=0
|
| 82 |
+
checkpoint_keep_interval_updates=1
|
| 83 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 3 ]]; then
|
| 84 |
+
echo "Config ${train_mode} ${config_option}"
|
| 85 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 86 |
+
task_load_clap_emb=false
|
| 87 |
+
task_load_source_file=true
|
| 88 |
+
task_load_mel_file=false
|
| 89 |
+
model_proj_type=null
|
| 90 |
+
model_clone_batch=1
|
| 91 |
+
dataset_batch_size=384
|
| 92 |
+
model_dispersive_loss=100.0
|
| 93 |
+
model_dispersive_loss_layer=0
|
| 94 |
+
checkpoint_keep_interval_updates=1
|
| 95 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 4 ]]; then
|
| 96 |
+
echo "Config ${train_mode} ${config_option}"
|
| 97 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 98 |
+
task_load_clap_emb=false
|
| 99 |
+
task_load_source_file=true
|
| 100 |
+
task_load_mel_file=false
|
| 101 |
+
model_proj_type=null
|
| 102 |
+
model_clone_batch=1
|
| 103 |
+
dataset_batch_size=384
|
| 104 |
+
model_dispersive_loss=10000.0
|
| 105 |
+
model_dispersive_loss_layer=0
|
| 106 |
+
checkpoint_keep_interval_updates=1
|
| 107 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 5 ]]; then
|
| 108 |
+
echo "Config ${train_mode} ${config_option}"
|
| 109 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 110 |
+
task_load_clap_emb=false
|
| 111 |
+
task_load_source_file=true
|
| 112 |
+
task_load_mel_file=false
|
| 113 |
+
model_proj_type=null
|
| 114 |
+
model_clone_batch=1
|
| 115 |
+
dataset_batch_size=384
|
| 116 |
+
model_dispersive_loss=1000.0
|
| 117 |
+
model_dispersive_loss_layer=0
|
| 118 |
+
checkpoint_keep_interval_updates=1
|
| 119 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 6 ]]; then
|
| 120 |
+
echo "Config ${train_mode} ${config_option}"
|
| 121 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 122 |
+
task_load_clap_emb=false
|
| 123 |
+
task_load_source_file=true
|
| 124 |
+
task_load_mel_file=false
|
| 125 |
+
model_proj_type=null
|
| 126 |
+
model_clone_batch=4
|
| 127 |
+
dataset_batch_size=96
|
| 128 |
+
model_dispersive_loss=1000.0
|
| 129 |
+
model_dispersive_loss_layer=10
|
| 130 |
+
checkpoint_keep_interval_updates=1
|
| 131 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 0 ]]; then
|
| 132 |
+
echo "Config ${train_mode} ${config_option}"
|
| 133 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 134 |
+
task_load_clap_emb=true
|
| 135 |
+
model_proj_type=2
|
| 136 |
+
model_clone_batch=4
|
| 137 |
+
dataset_batch_size=48
|
| 138 |
+
model_clap_loss=1.0
|
| 139 |
+
average_top_k_layers=12
|
| 140 |
+
model_add_conv=false
|
| 141 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 1 ]]; then
|
| 142 |
+
echo "Config ${train_mode} ${config_option}"
|
| 143 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 144 |
+
task_load_clap_emb=true
|
| 145 |
+
model_proj_type=2
|
| 146 |
+
model_clone_batch=4
|
| 147 |
+
dataset_batch_size=48
|
| 148 |
+
model_clap_loss=1.0
|
| 149 |
+
average_top_k_layers=1
|
| 150 |
+
# loss type ablation
|
| 151 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 2 ]]; then
|
| 152 |
+
echo "Config ${train_mode} ${config_option}"
|
| 153 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 154 |
+
task_load_clap_emb=true
|
| 155 |
+
model_proj_type=2
|
| 156 |
+
model_clone_batch=4
|
| 157 |
+
dataset_batch_size=48
|
| 158 |
+
model_clap_loss=1.0
|
| 159 |
+
average_top_k_layers=12
|
| 160 |
+
model_clap_loss_type="ce"
|
| 161 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 3 ]]; then
|
| 162 |
+
echo "Config ${train_mode} ${config_option}"
|
| 163 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 164 |
+
task_load_clap_emb=true
|
| 165 |
+
model_proj_type=2
|
| 166 |
+
model_clone_batch=4
|
| 167 |
+
dataset_batch_size=48
|
| 168 |
+
model_clap_loss=1.0
|
| 169 |
+
average_top_k_layers=12
|
| 170 |
+
model_clap_loss_type="l1"
|
| 171 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 4 ]]; then
|
| 172 |
+
echo "Config ${train_mode} ${config_option}"
|
| 173 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 174 |
+
task_load_clap_emb=true
|
| 175 |
+
model_proj_type=2
|
| 176 |
+
model_clone_batch=4
|
| 177 |
+
dataset_batch_size=96
|
| 178 |
+
model_clap_loss=1.0
|
| 179 |
+
average_top_k_layers=12
|
| 180 |
+
model_clap_loss_type="cosine"
|
| 181 |
+
# loss layer ablation
|
| 182 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 5 ]]; then
|
| 183 |
+
echo "Config ${train_mode} ${config_option}"
|
| 184 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 185 |
+
task_load_clap_emb=true
|
| 186 |
+
model_proj_type=2
|
| 187 |
+
model_clone_batch=4
|
| 188 |
+
dataset_batch_size=96
|
| 189 |
+
model_clap_loss=1.0
|
| 190 |
+
average_top_k_layers=12
|
| 191 |
+
model_clap_loss_type="mse"
|
| 192 |
+
model_clap_loss_layer=10
|
| 193 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 6 ]]; then
|
| 194 |
+
echo "Config ${train_mode} ${config_option}"
|
| 195 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 196 |
+
task_load_clap_emb=true
|
| 197 |
+
task_load_source_file=true
|
| 198 |
+
task_load_mel_file=false
|
| 199 |
+
model_proj_type=2
|
| 200 |
+
model_clone_batch=4
|
| 201 |
+
dataset_batch_size=96
|
| 202 |
+
model_clap_loss=1.0
|
| 203 |
+
average_top_k_layers=12
|
| 204 |
+
model_clap_loss_type="mse"
|
| 205 |
+
model_clap_loss_layer=8
|
| 206 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 7 ]]; then
|
| 207 |
+
echo "Config ${train_mode} ${config_option}"
|
| 208 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 209 |
+
task_load_clap_emb=true
|
| 210 |
+
task_load_source_file=true
|
| 211 |
+
task_load_mel_file=false
|
| 212 |
+
model_proj_type=2
|
| 213 |
+
model_clone_batch=4
|
| 214 |
+
dataset_batch_size=96
|
| 215 |
+
model_clap_loss=1.0
|
| 216 |
+
average_top_k_layers=12
|
| 217 |
+
model_clap_loss_type="mse"
|
| 218 |
+
model_clap_loss_layer=6
|
| 219 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 8 ]]; then
|
| 220 |
+
echo "Config ${train_mode} ${config_option}"
|
| 221 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 222 |
+
task_load_clap_emb=true
|
| 223 |
+
task_load_source_file=true
|
| 224 |
+
task_load_mel_file=false
|
| 225 |
+
model_proj_type=2
|
| 226 |
+
model_clone_batch=4
|
| 227 |
+
model_clap_loss=5.0
|
| 228 |
+
dataset_batch_size=96
|
| 229 |
+
average_top_k_layers=12
|
| 230 |
+
model_clap_loss_type="mse"
|
| 231 |
+
checkpoint_keep_interval_updates=-1
|
| 232 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 9 ]]; then
|
| 233 |
+
echo "Config ${train_mode} ${config_option}"
|
| 234 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 235 |
+
task_load_clap_emb=true
|
| 236 |
+
task_load_source_file=true
|
| 237 |
+
task_load_mel_file=false
|
| 238 |
+
model_proj_type=2
|
| 239 |
+
model_clone_batch=4
|
| 240 |
+
model_clap_loss=0.1
|
| 241 |
+
dataset_batch_size=96
|
| 242 |
+
average_top_k_layers=12
|
| 243 |
+
model_clap_loss_type="mse"
|
| 244 |
+
checkpoint_keep_interval_updates=-1
|
| 245 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 0 ]]; then
|
| 246 |
+
echo "Config ${train_mode} ${config_option}"
|
| 247 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
|
| 248 |
+
task_load_clap_emb=true
|
| 249 |
+
model_proj_type=4
|
| 250 |
+
model_clone_batch=4
|
| 251 |
+
model_clap_loss=1.0
|
| 252 |
+
dataset_batch_size=48
|
| 253 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 1 ]]; then
|
| 254 |
+
echo "Config ${train_mode} ${config_option}"
|
| 255 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
|
| 256 |
+
task_load_clap_emb=true
|
| 257 |
+
model_proj_type=4
|
| 258 |
+
model_clone_batch=4
|
| 259 |
+
model_clap_loss=0.001
|
| 260 |
+
dataset_batch_size=48
|
| 261 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 2 ]]; then
|
| 262 |
+
echo "Config ${train_mode} ${config_option}"
|
| 263 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
|
| 264 |
+
task_load_clap_emb=true
|
| 265 |
+
model_proj_type=4
|
| 266 |
+
model_clone_batch=4
|
| 267 |
+
model_clap_loss=0.01
|
| 268 |
+
dataset_batch_size=48
|
| 269 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 3 ]]; then
|
| 270 |
+
echo "Config ${train_mode} ${config_option}"
|
| 271 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_out
|
| 272 |
+
task_load_clap_emb=true
|
| 273 |
+
model_proj_type=6
|
| 274 |
+
model_clone_batch=4
|
| 275 |
+
dataset_batch_size=48
|
| 276 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 0 ]]; then
|
| 277 |
+
echo "Config ${train_mode} ${config_option}"
|
| 278 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 279 |
+
task_load_clap_emb=true
|
| 280 |
+
task_load_source_file=true
|
| 281 |
+
task_load_mel_file=false
|
| 282 |
+
model_proj_type=2
|
| 283 |
+
model_clone_batch=4
|
| 284 |
+
dataset_batch_size=64 # original 48 oom on 4090 24G change distributed_world_size
|
| 285 |
+
model_clap_loss=1.0
|
| 286 |
+
average_top_k_layers=11 # modify with model depth
|
| 287 |
+
model_add_conv=true
|
| 288 |
+
model_depth=11 #
|
| 289 |
+
checkpoint_keep_interval_updates=-1 # default 1
|
| 290 |
+
checkpoint_save_interval_updates=10000
|
| 291 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 1 ]]; then
|
| 292 |
+
echo "Config ${train_mode} ${config_option}"
|
| 293 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 294 |
+
task_load_clap_emb=true
|
| 295 |
+
task_load_source_file=true
|
| 296 |
+
task_load_mel_file=false
|
| 297 |
+
model_proj_type=2
|
| 298 |
+
model_clone_batch=4
|
| 299 |
+
dataset_batch_size=48 # original 48 oom on 4090 24G change distributed_world_size
|
| 300 |
+
model_clap_loss=1.0
|
| 301 |
+
average_top_k_layers=12 # modify with model depth
|
| 302 |
+
model_add_conv=true
|
| 303 |
+
model_modalities_image_conv_option=1
|
| 304 |
+
model_depth=12 #
|
| 305 |
+
checkpoint_keep_interval_updates=1 # default 1
|
| 306 |
+
checkpoint_save_interval_updates=10000
|
| 307 |
+
fi
|
| 308 |
+
|
| 309 |
+
python fairseq_cli/hydra_train.py -m \
|
| 310 |
+
--config-dir ./EAT/config \
|
| 311 |
+
--config-name pretraining_AS2M \
|
| 312 |
+
common.user_dir=./EAT \
|
| 313 |
+
checkpoint.save_dir=${checkpoint_save_dir} \
|
| 314 |
+
checkpoint.restore_file=${checkpoint_restore_file} \
|
| 315 |
+
distributed_training.distributed_world_size=${1:-2} \
|
| 316 |
+
dataset.num_workers=24 \
|
| 317 |
+
dataset.data_buffer_size=48 \
|
| 318 |
+
dataset.batch_size=${dataset_batch_size} \
|
| 319 |
+
task.data=${task_data} \
|
| 320 |
+
task.h5_format=False \
|
| 321 |
+
task.load_clap_emb=${task_load_clap_emb} \
|
| 322 |
+
+task.load_source_file=${task_load_source_file} \
|
| 323 |
+
+task.load_mel_file=${task_load_mel_file} \
|
| 324 |
+
model.proj_type=${model_proj_type} \
|
| 325 |
+
model.clone_batch=${model_clone_batch} \
|
| 326 |
+
model.clap_loss=${model_clap_loss} \
|
| 327 |
+
model.average_top_k_layers=${average_top_k_layers} \
|
| 328 |
+
+model.add_conv=${model_add_conv} \
|
| 329 |
+
+model.clap_loss_type=${model_clap_loss_type} \
|
| 330 |
+
+model.clap_loss_layer=${model_clap_loss_layer} \
|
| 331 |
+
+model.dispersive_loss=${model_dispersive_loss} \
|
| 332 |
+
+model.dispersive_loss_layer=${model_dispersive_loss_layer} \
|
| 333 |
+
model.depth=${model_depth} \
|
| 334 |
+
+model.modalities.image.conv_option=${model_modalities_image_conv_option} \
|
| 335 |
+
checkpoint.keep_interval_updates=${checkpoint_keep_interval_updates} \
|
| 336 |
+
checkpoint.save_interval_updates=${checkpoint_save_interval_updates}
|
pre_4_AS2M/conv_clap_1_2025-09-30_07-25-52/pretraining_AS2M.sh
ADDED
|
@@ -0,0 +1,336 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
# config options
|
| 3 |
+
train_mode=conv_clap
|
| 4 |
+
config_option=1
|
| 5 |
+
# change world size
|
| 6 |
+
|
| 7 |
+
# shared config
|
| 8 |
+
SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
|
| 9 |
+
checkpoint_save_dir=${SAVE_DIR_ROOT}/${train_mode}_${config_option}_$(date +"%Y-%m-%d_%H-%M-%S")
|
| 10 |
+
checkpoint_restore_file=${checkpoint_save_dir}/checkpoint_last.pt
|
| 11 |
+
|
| 12 |
+
# Absolute path and file name of this script (symlinks resolved)
|
| 13 |
+
script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
|
| 14 |
+
script_name="$(basename -- "$script_path")"
|
| 15 |
+
# Create the save directory and copy this script into it (preserving permissions and timestamps)
|
| 16 |
+
mkdir -p -- "$checkpoint_save_dir"
|
| 17 |
+
cp -p -- "$script_path" "$checkpoint_save_dir/$script_name"
|
| 18 |
+
echo "script_path: ${script_path}"
|
| 19 |
+
echo "checkpoint_save_dir: ${checkpoint_save_dir}"
|
| 20 |
+
|
| 21 |
+
# default setting
|
| 22 |
+
model_clone_batch=4
|
| 23 |
+
dataset_batch_size=48
|
| 24 |
+
model_clap_loss=0
|
| 25 |
+
model_clap_loss_type="mse" # other options: ce, cosine, l1
|
| 26 |
+
model_clap_loss_layer=0
|
| 27 |
+
average_top_k_layers=12
|
| 28 |
+
model_add_conv=false
|
| 29 |
+
model_depth=12
|
| 30 |
+
model_dispersive_loss=0
|
| 31 |
+
model_dispersive_loss_layer=0
|
| 32 |
+
checkpoint_keep_interval_updates=1 # TODO: change this parameter if needed
|
| 33 |
+
checkpoint_save_interval_updates=10000
|
| 34 |
+
model_modalities_image_conv_option=0
|
| 35 |
+
|
| 36 |
+
if [[ $train_mode == "default" && ${config_option} -eq 0 ]]; then
|
| 37 |
+
echo "Config ${train_mode} ${config_option}"
|
| 38 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 39 |
+
task_load_clap_emb=false
|
| 40 |
+
task_load_source_file=true
|
| 41 |
+
task_load_mel_file=false
|
| 42 |
+
model_proj_type=null
|
| 43 |
+
model_clone_batch=4
|
| 44 |
+
dataset_batch_size=96
|
| 45 |
+
model_clap_loss=0
|
| 46 |
+
checkpoint_keep_interval_updates=-1
|
| 47 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 0 ]]; then
|
| 48 |
+
echo "Config ${train_mode} ${config_option}"
|
| 49 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 50 |
+
task_load_clap_emb=false
|
| 51 |
+
task_load_source_file=true
|
| 52 |
+
task_load_mel_file=false
|
| 53 |
+
model_proj_type=null
|
| 54 |
+
model_clone_batch=4
|
| 55 |
+
dataset_batch_size=96
|
| 56 |
+
model_dispersive_loss=1
|
| 57 |
+
model_dispersive_loss_layer=0
|
| 58 |
+
checkpoint_keep_interval_updates=1
|
| 59 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 1 ]]; then
|
| 60 |
+
echo "Config ${train_mode} ${config_option}"
|
| 61 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 62 |
+
task_load_clap_emb=false
|
| 63 |
+
task_load_source_file=true
|
| 64 |
+
task_load_mel_file=false
|
| 65 |
+
model_proj_type=null
|
| 66 |
+
model_clone_batch=1
|
| 67 |
+
dataset_batch_size=384
|
| 68 |
+
model_dispersive_loss=1
|
| 69 |
+
model_dispersive_loss_layer=0
|
| 70 |
+
checkpoint_keep_interval_updates=1
|
| 71 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 2 ]]; then
|
| 72 |
+
echo "Config ${train_mode} ${config_option}"
|
| 73 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 74 |
+
task_load_clap_emb=false
|
| 75 |
+
task_load_source_file=true
|
| 76 |
+
task_load_mel_file=false
|
| 77 |
+
model_proj_type=null
|
| 78 |
+
model_clone_batch=1
|
| 79 |
+
dataset_batch_size=384
|
| 80 |
+
model_dispersive_loss=10.0
|
| 81 |
+
model_dispersive_loss_layer=0
|
| 82 |
+
checkpoint_keep_interval_updates=1
|
| 83 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 3 ]]; then
|
| 84 |
+
echo "Config ${train_mode} ${config_option}"
|
| 85 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 86 |
+
task_load_clap_emb=false
|
| 87 |
+
task_load_source_file=true
|
| 88 |
+
task_load_mel_file=false
|
| 89 |
+
model_proj_type=null
|
| 90 |
+
model_clone_batch=1
|
| 91 |
+
dataset_batch_size=384
|
| 92 |
+
model_dispersive_loss=100.0
|
| 93 |
+
model_dispersive_loss_layer=0
|
| 94 |
+
checkpoint_keep_interval_updates=1
|
| 95 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 4 ]]; then
|
| 96 |
+
echo "Config ${train_mode} ${config_option}"
|
| 97 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 98 |
+
task_load_clap_emb=false
|
| 99 |
+
task_load_source_file=true
|
| 100 |
+
task_load_mel_file=false
|
| 101 |
+
model_proj_type=null
|
| 102 |
+
model_clone_batch=1
|
| 103 |
+
dataset_batch_size=384
|
| 104 |
+
model_dispersive_loss=10000.0
|
| 105 |
+
model_dispersive_loss_layer=0
|
| 106 |
+
checkpoint_keep_interval_updates=1
|
| 107 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 5 ]]; then
|
| 108 |
+
echo "Config ${train_mode} ${config_option}"
|
| 109 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 110 |
+
task_load_clap_emb=false
|
| 111 |
+
task_load_source_file=true
|
| 112 |
+
task_load_mel_file=false
|
| 113 |
+
model_proj_type=null
|
| 114 |
+
model_clone_batch=1
|
| 115 |
+
dataset_batch_size=384
|
| 116 |
+
model_dispersive_loss=1000.0
|
| 117 |
+
model_dispersive_loss_layer=0
|
| 118 |
+
checkpoint_keep_interval_updates=1
|
| 119 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 6 ]]; then
|
| 120 |
+
echo "Config ${train_mode} ${config_option}"
|
| 121 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 122 |
+
task_load_clap_emb=false
|
| 123 |
+
task_load_source_file=true
|
| 124 |
+
task_load_mel_file=false
|
| 125 |
+
model_proj_type=null
|
| 126 |
+
model_clone_batch=4
|
| 127 |
+
dataset_batch_size=96
|
| 128 |
+
model_dispersive_loss=1000.0
|
| 129 |
+
model_dispersive_loss_layer=10
|
| 130 |
+
checkpoint_keep_interval_updates=1
|
| 131 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 0 ]]; then
|
| 132 |
+
echo "Config ${train_mode} ${config_option}"
|
| 133 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 134 |
+
task_load_clap_emb=true
|
| 135 |
+
model_proj_type=2
|
| 136 |
+
model_clone_batch=4
|
| 137 |
+
dataset_batch_size=48
|
| 138 |
+
model_clap_loss=1.0
|
| 139 |
+
average_top_k_layers=12
|
| 140 |
+
model_add_conv=false
|
| 141 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 1 ]]; then
|
| 142 |
+
echo "Config ${train_mode} ${config_option}"
|
| 143 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 144 |
+
task_load_clap_emb=true
|
| 145 |
+
model_proj_type=2
|
| 146 |
+
model_clone_batch=4
|
| 147 |
+
dataset_batch_size=48
|
| 148 |
+
model_clap_loss=1.0
|
| 149 |
+
average_top_k_layers=1
|
| 150 |
+
# loss type ablation
|
| 151 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 2 ]]; then
|
| 152 |
+
echo "Config ${train_mode} ${config_option}"
|
| 153 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 154 |
+
task_load_clap_emb=true
|
| 155 |
+
model_proj_type=2
|
| 156 |
+
model_clone_batch=4
|
| 157 |
+
dataset_batch_size=48
|
| 158 |
+
model_clap_loss=1.0
|
| 159 |
+
average_top_k_layers=12
|
| 160 |
+
model_clap_loss_type="ce"
|
| 161 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 3 ]]; then
|
| 162 |
+
echo "Config ${train_mode} ${config_option}"
|
| 163 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 164 |
+
task_load_clap_emb=true
|
| 165 |
+
model_proj_type=2
|
| 166 |
+
model_clone_batch=4
|
| 167 |
+
dataset_batch_size=48
|
| 168 |
+
model_clap_loss=1.0
|
| 169 |
+
average_top_k_layers=12
|
| 170 |
+
model_clap_loss_type="l1"
|
| 171 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 4 ]]; then
|
| 172 |
+
echo "Config ${train_mode} ${config_option}"
|
| 173 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 174 |
+
task_load_clap_emb=true
|
| 175 |
+
model_proj_type=2
|
| 176 |
+
model_clone_batch=4
|
| 177 |
+
dataset_batch_size=96
|
| 178 |
+
model_clap_loss=1.0
|
| 179 |
+
average_top_k_layers=12
|
| 180 |
+
model_clap_loss_type="cosine"
|
| 181 |
+
# loss layer ablation
|
| 182 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 5 ]]; then
|
| 183 |
+
echo "Config ${train_mode} ${config_option}"
|
| 184 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 185 |
+
task_load_clap_emb=true
|
| 186 |
+
model_proj_type=2
|
| 187 |
+
model_clone_batch=4
|
| 188 |
+
dataset_batch_size=96
|
| 189 |
+
model_clap_loss=1.0
|
| 190 |
+
average_top_k_layers=12
|
| 191 |
+
model_clap_loss_type="mse"
|
| 192 |
+
model_clap_loss_layer=10
|
| 193 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 6 ]]; then
|
| 194 |
+
echo "Config ${train_mode} ${config_option}"
|
| 195 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 196 |
+
task_load_clap_emb=true
|
| 197 |
+
task_load_source_file=true
|
| 198 |
+
task_load_mel_file=false
|
| 199 |
+
model_proj_type=2
|
| 200 |
+
model_clone_batch=4
|
| 201 |
+
dataset_batch_size=96
|
| 202 |
+
model_clap_loss=1.0
|
| 203 |
+
average_top_k_layers=12
|
| 204 |
+
model_clap_loss_type="mse"
|
| 205 |
+
model_clap_loss_layer=8
|
| 206 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 7 ]]; then
|
| 207 |
+
echo "Config ${train_mode} ${config_option}"
|
| 208 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 209 |
+
task_load_clap_emb=true
|
| 210 |
+
task_load_source_file=true
|
| 211 |
+
task_load_mel_file=false
|
| 212 |
+
model_proj_type=2
|
| 213 |
+
model_clone_batch=4
|
| 214 |
+
dataset_batch_size=96
|
| 215 |
+
model_clap_loss=1.0
|
| 216 |
+
average_top_k_layers=12
|
| 217 |
+
model_clap_loss_type="mse"
|
| 218 |
+
model_clap_loss_layer=6
|
| 219 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 8 ]]; then
|
| 220 |
+
echo "Config ${train_mode} ${config_option}"
|
| 221 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 222 |
+
task_load_clap_emb=true
|
| 223 |
+
task_load_source_file=true
|
| 224 |
+
task_load_mel_file=false
|
| 225 |
+
model_proj_type=2
|
| 226 |
+
model_clone_batch=4
|
| 227 |
+
model_clap_loss=5.0
|
| 228 |
+
dataset_batch_size=96
|
| 229 |
+
average_top_k_layers=12
|
| 230 |
+
model_clap_loss_type="mse"
|
| 231 |
+
checkpoint_keep_interval_updates=-1
|
| 232 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 9 ]]; then
|
| 233 |
+
echo "Config ${train_mode} ${config_option}"
|
| 234 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 235 |
+
task_load_clap_emb=true
|
| 236 |
+
task_load_source_file=true
|
| 237 |
+
task_load_mel_file=false
|
| 238 |
+
model_proj_type=2
|
| 239 |
+
model_clone_batch=4
|
| 240 |
+
model_clap_loss=0.1
|
| 241 |
+
dataset_batch_size=96
|
| 242 |
+
average_top_k_layers=12
|
| 243 |
+
model_clap_loss_type="mse"
|
| 244 |
+
checkpoint_keep_interval_updates=-1
|
| 245 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 0 ]]; then
|
| 246 |
+
echo "Config ${train_mode} ${config_option}"
|
| 247 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
|
| 248 |
+
task_load_clap_emb=true
|
| 249 |
+
model_proj_type=4
|
| 250 |
+
model_clone_batch=4
|
| 251 |
+
model_clap_loss=1.0
|
| 252 |
+
dataset_batch_size=48
|
| 253 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 1 ]]; then
|
| 254 |
+
echo "Config ${train_mode} ${config_option}"
|
| 255 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
|
| 256 |
+
task_load_clap_emb=true
|
| 257 |
+
model_proj_type=4
|
| 258 |
+
model_clone_batch=4
|
| 259 |
+
model_clap_loss=0.001
|
| 260 |
+
dataset_batch_size=48
|
| 261 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 2 ]]; then
|
| 262 |
+
echo "Config ${train_mode} ${config_option}"
|
| 263 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
|
| 264 |
+
task_load_clap_emb=true
|
| 265 |
+
model_proj_type=4
|
| 266 |
+
model_clone_batch=4
|
| 267 |
+
model_clap_loss=0.01
|
| 268 |
+
dataset_batch_size=48
|
| 269 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 3 ]]; then
|
| 270 |
+
echo "Config ${train_mode} ${config_option}"
|
| 271 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_out
|
| 272 |
+
task_load_clap_emb=true
|
| 273 |
+
model_proj_type=6
|
| 274 |
+
model_clone_batch=4
|
| 275 |
+
dataset_batch_size=48
|
| 276 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 0 ]]; then
|
| 277 |
+
echo "Config ${train_mode} ${config_option}"
|
| 278 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 279 |
+
task_load_clap_emb=true
|
| 280 |
+
task_load_source_file=true
|
| 281 |
+
task_load_mel_file=false
|
| 282 |
+
model_proj_type=2
|
| 283 |
+
model_clone_batch=4
|
| 284 |
+
dataset_batch_size=64 # original 48 oom on 4090 24G change distributed_world_size
|
| 285 |
+
model_clap_loss=1.0
|
| 286 |
+
average_top_k_layers=11 # modify with model depth
|
| 287 |
+
model_add_conv=true
|
| 288 |
+
model_depth=11 #
|
| 289 |
+
checkpoint_keep_interval_updates=-1 # default 1
|
| 290 |
+
checkpoint_save_interval_updates=10000
|
| 291 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 1 ]]; then
|
| 292 |
+
echo "Config ${train_mode} ${config_option}"
|
| 293 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 294 |
+
task_load_clap_emb=true
|
| 295 |
+
task_load_source_file=true
|
| 296 |
+
task_load_mel_file=false
|
| 297 |
+
model_proj_type=2
|
| 298 |
+
model_clone_batch=4
|
| 299 |
+
dataset_batch_size=48 # original 48 oom on 4090 24G change distributed_world_size
|
| 300 |
+
model_clap_loss=1.0
|
| 301 |
+
average_top_k_layers=12 # modify with model depth
|
| 302 |
+
model_add_conv=true
|
| 303 |
+
model_modalities_image_conv_option=1
|
| 304 |
+
model_depth=12 #
|
| 305 |
+
checkpoint_keep_interval_updates=1 # default 1
|
| 306 |
+
checkpoint_save_interval_updates=10000
|
| 307 |
+
fi
|
| 308 |
+
|
| 309 |
+
python fairseq_cli/hydra_train.py -m \
|
| 310 |
+
--config-dir ./EAT/config \
|
| 311 |
+
--config-name pretraining_AS2M \
|
| 312 |
+
common.user_dir=./EAT \
|
| 313 |
+
checkpoint.save_dir=${checkpoint_save_dir} \
|
| 314 |
+
checkpoint.restore_file=${checkpoint_restore_file} \
|
| 315 |
+
distributed_training.distributed_world_size=${1:-2} \
|
| 316 |
+
dataset.num_workers=24 \
|
| 317 |
+
dataset.data_buffer_size=48 \
|
| 318 |
+
dataset.batch_size=${dataset_batch_size} \
|
| 319 |
+
task.data=${task_data} \
|
| 320 |
+
task.h5_format=False \
|
| 321 |
+
task.load_clap_emb=${task_load_clap_emb} \
|
| 322 |
+
+task.load_source_file=${task_load_source_file} \
|
| 323 |
+
+task.load_mel_file=${task_load_mel_file} \
|
| 324 |
+
model.proj_type=${model_proj_type} \
|
| 325 |
+
model.clone_batch=${model_clone_batch} \
|
| 326 |
+
model.clap_loss=${model_clap_loss} \
|
| 327 |
+
model.average_top_k_layers=${average_top_k_layers} \
|
| 328 |
+
+model.add_conv=${model_add_conv} \
|
| 329 |
+
+model.clap_loss_type=${model_clap_loss_type} \
|
| 330 |
+
+model.clap_loss_layer=${model_clap_loss_layer} \
|
| 331 |
+
+model.dispersive_loss=${model_dispersive_loss} \
|
| 332 |
+
+model.dispersive_loss_layer=${model_dispersive_loss_layer} \
|
| 333 |
+
model.depth=${model_depth} \
|
| 334 |
+
+model.modalities.image.conv_option=${model_modalities_image_conv_option} \
|
| 335 |
+
checkpoint.keep_interval_updates=${checkpoint_keep_interval_updates} \
|
| 336 |
+
checkpoint.save_interval_updates=${checkpoint_save_interval_updates}
|
pre_4_AS2M/conv_clap_1_2025-09-30_08-31-42/pretraining_AS2M.sh
ADDED
|
@@ -0,0 +1,418 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
# config options
|
| 3 |
+
train_mode=conv_clap
|
| 4 |
+
config_option=1
|
| 5 |
+
# change world size
|
| 6 |
+
|
| 7 |
+
# shared config
|
| 8 |
+
SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
|
| 9 |
+
checkpoint_save_dir=${SAVE_DIR_ROOT}/${train_mode}_${config_option}_$(date +"%Y-%m-%d_%H-%M-%S")
|
| 10 |
+
checkpoint_restore_file=${checkpoint_save_dir}/checkpoint_last.pt
|
| 11 |
+
|
| 12 |
+
# Absolute path and file name of this script (symlinks resolved)
|
| 13 |
+
script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
|
| 14 |
+
script_name="$(basename -- "$script_path")"
|
| 15 |
+
# Create the checkpoint directory and copy this script into it (preserving permissions and timestamps)
|
| 16 |
+
mkdir -p -- "$checkpoint_save_dir"
|
| 17 |
+
cp -p -- "$script_path" "$checkpoint_save_dir/$script_name"
|
| 18 |
+
echo "script_path: ${script_path}"
|
| 19 |
+
echo "checkpoint_save_dir: ${checkpoint_save_dir}"
|
| 20 |
+
|
| 21 |
+
# default setting
|
| 22 |
+
model_clone_batch=4
|
| 23 |
+
dataset_batch_size=48
|
| 24 |
+
model_clap_loss=0
|
| 25 |
+
model_clap_loss_type="mse" # option ce cosine l1
|
| 26 |
+
model_clap_loss_layer=0
|
| 27 |
+
average_top_k_layers=12
|
| 28 |
+
model_add_conv=false
|
| 29 |
+
model_depth=12
|
| 30 |
+
model_dispersive_loss=0
|
| 31 |
+
model_dispersive_loss_layer=0
|
| 32 |
+
checkpoint_keep_interval_updates=1 # TODO change this parameter if need
|
| 33 |
+
checkpoint_save_interval_updates=10000
|
| 34 |
+
model_modalities_image_conv_option=0
|
| 35 |
+
|
| 36 |
+
if [[ $train_mode == "default" && ${config_option} -eq 0 ]]; then
|
| 37 |
+
echo "Config ${train_mode} ${config_option}"
|
| 38 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 39 |
+
task_load_clap_emb=false
|
| 40 |
+
task_load_source_file=true
|
| 41 |
+
task_load_mel_file=false
|
| 42 |
+
model_proj_type=null
|
| 43 |
+
model_clone_batch=4
|
| 44 |
+
dataset_batch_size=96
|
| 45 |
+
model_clap_loss=0
|
| 46 |
+
checkpoint_keep_interval_updates=-1
|
| 47 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 0 ]]; then
|
| 48 |
+
echo "Config ${train_mode} ${config_option}"
|
| 49 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 50 |
+
task_load_clap_emb=false
|
| 51 |
+
task_load_source_file=true
|
| 52 |
+
task_load_mel_file=false
|
| 53 |
+
model_proj_type=null
|
| 54 |
+
model_clone_batch=4
|
| 55 |
+
dataset_batch_size=96
|
| 56 |
+
model_dispersive_loss=1
|
| 57 |
+
model_dispersive_loss_layer=0
|
| 58 |
+
checkpoint_keep_interval_updates=1
|
| 59 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 1 ]]; then
|
| 60 |
+
echo "Config ${train_mode} ${config_option}"
|
| 61 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 62 |
+
task_load_clap_emb=false
|
| 63 |
+
task_load_source_file=true
|
| 64 |
+
task_load_mel_file=false
|
| 65 |
+
model_proj_type=null
|
| 66 |
+
model_clone_batch=1
|
| 67 |
+
dataset_batch_size=384
|
| 68 |
+
model_dispersive_loss=1
|
| 69 |
+
model_dispersive_loss_layer=0
|
| 70 |
+
checkpoint_keep_interval_updates=1
|
| 71 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 2 ]]; then
|
| 72 |
+
echo "Config ${train_mode} ${config_option}"
|
| 73 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 74 |
+
task_load_clap_emb=false
|
| 75 |
+
task_load_source_file=true
|
| 76 |
+
task_load_mel_file=false
|
| 77 |
+
model_proj_type=null
|
| 78 |
+
model_clone_batch=1
|
| 79 |
+
dataset_batch_size=384
|
| 80 |
+
model_dispersive_loss=10.0
|
| 81 |
+
model_dispersive_loss_layer=0
|
| 82 |
+
checkpoint_keep_interval_updates=1
|
| 83 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 3 ]]; then
|
| 84 |
+
echo "Config ${train_mode} ${config_option}"
|
| 85 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 86 |
+
task_load_clap_emb=false
|
| 87 |
+
task_load_source_file=true
|
| 88 |
+
task_load_mel_file=false
|
| 89 |
+
model_proj_type=null
|
| 90 |
+
model_clone_batch=1
|
| 91 |
+
dataset_batch_size=384
|
| 92 |
+
model_dispersive_loss=100.0
|
| 93 |
+
model_dispersive_loss_layer=0
|
| 94 |
+
checkpoint_keep_interval_updates=1
|
| 95 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 4 ]]; then
|
| 96 |
+
echo "Config ${train_mode} ${config_option}"
|
| 97 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 98 |
+
task_load_clap_emb=false
|
| 99 |
+
task_load_source_file=true
|
| 100 |
+
task_load_mel_file=false
|
| 101 |
+
model_proj_type=null
|
| 102 |
+
model_clone_batch=1
|
| 103 |
+
dataset_batch_size=384
|
| 104 |
+
model_dispersive_loss=10000.0
|
| 105 |
+
model_dispersive_loss_layer=0
|
| 106 |
+
checkpoint_keep_interval_updates=1
|
| 107 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 5 ]]; then
|
| 108 |
+
echo "Config ${train_mode} ${config_option}"
|
| 109 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 110 |
+
task_load_clap_emb=false
|
| 111 |
+
task_load_source_file=true
|
| 112 |
+
task_load_mel_file=false
|
| 113 |
+
model_proj_type=null
|
| 114 |
+
model_clone_batch=1
|
| 115 |
+
dataset_batch_size=384
|
| 116 |
+
model_dispersive_loss=1000.0
|
| 117 |
+
model_dispersive_loss_layer=0
|
| 118 |
+
checkpoint_keep_interval_updates=1
|
| 119 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 6 ]]; then
|
| 120 |
+
echo "Config ${train_mode} ${config_option}"
|
| 121 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 122 |
+
task_load_clap_emb=false
|
| 123 |
+
task_load_source_file=true
|
| 124 |
+
task_load_mel_file=false
|
| 125 |
+
model_proj_type=null
|
| 126 |
+
model_clone_batch=4
|
| 127 |
+
dataset_batch_size=96
|
| 128 |
+
model_dispersive_loss=1000.0
|
| 129 |
+
model_dispersive_loss_layer=10
|
| 130 |
+
checkpoint_keep_interval_updates=1
|
| 131 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 0 ]]; then
|
| 132 |
+
echo "Config ${train_mode} ${config_option}"
|
| 133 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 134 |
+
task_load_clap_emb=true
|
| 135 |
+
model_proj_type=2
|
| 136 |
+
model_clone_batch=4
|
| 137 |
+
dataset_batch_size=48
|
| 138 |
+
model_clap_loss=1.0
|
| 139 |
+
average_top_k_layers=12
|
| 140 |
+
model_add_conv=false
|
| 141 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 1 ]]; then
|
| 142 |
+
echo "Config ${train_mode} ${config_option}"
|
| 143 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 144 |
+
task_load_clap_emb=true
|
| 145 |
+
model_proj_type=2
|
| 146 |
+
model_clone_batch=4
|
| 147 |
+
dataset_batch_size=48
|
| 148 |
+
model_clap_loss=1.0
|
| 149 |
+
average_top_k_layers=1
|
| 150 |
+
# loss type ablation
|
| 151 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 2 ]]; then
|
| 152 |
+
echo "Config ${train_mode} ${config_option}"
|
| 153 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 154 |
+
task_load_clap_emb=true
|
| 155 |
+
model_proj_type=2
|
| 156 |
+
model_clone_batch=4
|
| 157 |
+
dataset_batch_size=48
|
| 158 |
+
model_clap_loss=1.0
|
| 159 |
+
average_top_k_layers=12
|
| 160 |
+
model_clap_loss_type="ce"
|
| 161 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 3 ]]; then
|
| 162 |
+
echo "Config ${train_mode} ${config_option}"
|
| 163 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 164 |
+
task_load_clap_emb=true
|
| 165 |
+
model_proj_type=2
|
| 166 |
+
model_clone_batch=4
|
| 167 |
+
dataset_batch_size=48
|
| 168 |
+
model_clap_loss=1.0
|
| 169 |
+
average_top_k_layers=12
|
| 170 |
+
model_clap_loss_type="l1"
|
| 171 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 4 ]]; then
|
| 172 |
+
echo "Config ${train_mode} ${config_option}"
|
| 173 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 174 |
+
task_load_clap_emb=true
|
| 175 |
+
model_proj_type=2
|
| 176 |
+
model_clone_batch=4
|
| 177 |
+
dataset_batch_size=96
|
| 178 |
+
model_clap_loss=1.0
|
| 179 |
+
average_top_k_layers=12
|
| 180 |
+
model_clap_loss_type="cosine"
|
| 181 |
+
# loss layer ablation
|
| 182 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 5 ]]; then
|
| 183 |
+
echo "Config ${train_mode} ${config_option}"
|
| 184 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 185 |
+
task_load_clap_emb=true
|
| 186 |
+
model_proj_type=2
|
| 187 |
+
model_clone_batch=4
|
| 188 |
+
dataset_batch_size=96
|
| 189 |
+
model_clap_loss=1.0
|
| 190 |
+
average_top_k_layers=12
|
| 191 |
+
model_clap_loss_type="mse"
|
| 192 |
+
model_clap_loss_layer=10
|
| 193 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 6 ]]; then
|
| 194 |
+
echo "Config ${train_mode} ${config_option}"
|
| 195 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 196 |
+
task_load_clap_emb=true
|
| 197 |
+
task_load_source_file=true
|
| 198 |
+
task_load_mel_file=false
|
| 199 |
+
model_proj_type=2
|
| 200 |
+
model_clone_batch=4
|
| 201 |
+
dataset_batch_size=96
|
| 202 |
+
model_clap_loss=1.0
|
| 203 |
+
average_top_k_layers=12
|
| 204 |
+
model_clap_loss_type="mse"
|
| 205 |
+
model_clap_loss_layer=8
|
| 206 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 7 ]]; then
|
| 207 |
+
echo "Config ${train_mode} ${config_option}"
|
| 208 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 209 |
+
task_load_clap_emb=true
|
| 210 |
+
task_load_source_file=true
|
| 211 |
+
task_load_mel_file=false
|
| 212 |
+
model_proj_type=2
|
| 213 |
+
model_clone_batch=4
|
| 214 |
+
dataset_batch_size=96
|
| 215 |
+
model_clap_loss=1.0
|
| 216 |
+
average_top_k_layers=12
|
| 217 |
+
model_clap_loss_type="mse"
|
| 218 |
+
model_clap_loss_layer=6
|
| 219 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 8 ]]; then
|
| 220 |
+
echo "Config ${train_mode} ${config_option}"
|
| 221 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 222 |
+
task_load_clap_emb=true
|
| 223 |
+
task_load_source_file=true
|
| 224 |
+
task_load_mel_file=false
|
| 225 |
+
model_proj_type=2
|
| 226 |
+
model_clone_batch=4
|
| 227 |
+
model_clap_loss=5.0
|
| 228 |
+
dataset_batch_size=96
|
| 229 |
+
average_top_k_layers=12
|
| 230 |
+
model_clap_loss_type="mse"
|
| 231 |
+
checkpoint_keep_interval_updates=-1
|
| 232 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 9 ]]; then
|
| 233 |
+
echo "Config ${train_mode} ${config_option}"
|
| 234 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 235 |
+
task_load_clap_emb=true
|
| 236 |
+
task_load_source_file=true
|
| 237 |
+
task_load_mel_file=false
|
| 238 |
+
model_proj_type=2
|
| 239 |
+
model_clone_batch=4
|
| 240 |
+
model_clap_loss=0.1
|
| 241 |
+
dataset_batch_size=96
|
| 242 |
+
average_top_k_layers=12
|
| 243 |
+
model_clap_loss_type="mse"
|
| 244 |
+
checkpoint_keep_interval_updates=-1
|
| 245 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 0 ]]; then
|
| 246 |
+
echo "Config ${train_mode} ${config_option}"
|
| 247 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
|
| 248 |
+
task_load_clap_emb=true
|
| 249 |
+
model_proj_type=4
|
| 250 |
+
model_clone_batch=4
|
| 251 |
+
model_clap_loss=1.0
|
| 252 |
+
dataset_batch_size=48
|
| 253 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 1 ]]; then
|
| 254 |
+
echo "Config ${train_mode} ${config_option}"
|
| 255 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
|
| 256 |
+
task_load_clap_emb=true
|
| 257 |
+
model_proj_type=4
|
| 258 |
+
model_clone_batch=4
|
| 259 |
+
model_clap_loss=0.001
|
| 260 |
+
dataset_batch_size=48
|
| 261 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 2 ]]; then
|
| 262 |
+
echo "Config ${train_mode} ${config_option}"
|
| 263 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
|
| 264 |
+
task_load_clap_emb=true
|
| 265 |
+
model_proj_type=4
|
| 266 |
+
model_clone_batch=4
|
| 267 |
+
model_clap_loss=0.01
|
| 268 |
+
dataset_batch_size=48
|
| 269 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 3 ]]; then
|
| 270 |
+
echo "Config ${train_mode} ${config_option}"
|
| 271 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_out
|
| 272 |
+
task_load_clap_emb=true
|
| 273 |
+
model_proj_type=6
|
| 274 |
+
model_clone_batch=4
|
| 275 |
+
dataset_batch_size=48
|
| 276 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 0 ]]; then
|
| 277 |
+
echo "Config ${train_mode} ${config_option}"
|
| 278 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 279 |
+
task_load_clap_emb=true
|
| 280 |
+
task_load_source_file=true
|
| 281 |
+
task_load_mel_file=false
|
| 282 |
+
model_proj_type=2
|
| 283 |
+
model_clone_batch=4
|
| 284 |
+
dataset_batch_size=64 # original 48 oom on 4090 24G change distributed_world_size
|
| 285 |
+
model_clap_loss=1.0
|
| 286 |
+
average_top_k_layers=11 # modify with model depth
|
| 287 |
+
model_add_conv=true
|
| 288 |
+
model_depth=11 #
|
| 289 |
+
checkpoint_keep_interval_updates=-1 # default 1
|
| 290 |
+
checkpoint_save_interval_updates=10000
|
| 291 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 1 ]]; then
|
| 292 |
+
echo "Config ${train_mode} ${config_option}"
|
| 293 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 294 |
+
task_load_clap_emb=true
|
| 295 |
+
task_load_source_file=true
|
| 296 |
+
task_load_mel_file=false
|
| 297 |
+
model_proj_type=2
|
| 298 |
+
model_clone_batch=4
|
| 299 |
+
dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
|
| 300 |
+
model_clap_loss=1.0
|
| 301 |
+
average_top_k_layers=12 # modify with model depth
|
| 302 |
+
model_add_conv=true
|
| 303 |
+
model_modalities_image_conv_option=1
|
| 304 |
+
model_depth=12 #
|
| 305 |
+
checkpoint_keep_interval_updates=1 # default 1
|
| 306 |
+
checkpoint_save_interval_updates=10000
|
| 307 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 2 ]]; then
|
| 308 |
+
echo "Config ${train_mode} ${config_option}"
|
| 309 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 310 |
+
task_load_clap_emb=true
|
| 311 |
+
task_load_source_file=true
|
| 312 |
+
task_load_mel_file=false
|
| 313 |
+
model_proj_type=2
|
| 314 |
+
model_clone_batch=4
|
| 315 |
+
dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
|
| 316 |
+
model_clap_loss=1.0
|
| 317 |
+
average_top_k_layers=12 # modify with model depth
|
| 318 |
+
model_add_conv=true
|
| 319 |
+
model_modalities_image_conv_option=2
|
| 320 |
+
model_depth=12 #
|
| 321 |
+
checkpoint_keep_interval_updates=1 # default 1
|
| 322 |
+
checkpoint_save_interval_updates=10000
|
| 323 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 3 ]]; then
|
| 324 |
+
echo "Config ${train_mode} ${config_option}"
|
| 325 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 326 |
+
task_load_clap_emb=true
|
| 327 |
+
task_load_source_file=true
|
| 328 |
+
task_load_mel_file=false
|
| 329 |
+
model_proj_type=2
|
| 330 |
+
model_clone_batch=4
|
| 331 |
+
dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
|
| 332 |
+
model_clap_loss=1.0
|
| 333 |
+
average_top_k_layers=12 # modify with model depth
|
| 334 |
+
model_add_conv=true
|
| 335 |
+
model_modalities_image_conv_option=3
|
| 336 |
+
model_depth=12 #
|
| 337 |
+
checkpoint_keep_interval_updates=1 # default 1
|
| 338 |
+
checkpoint_save_interval_updates=10000
|
| 339 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 4 ]]; then
|
| 340 |
+
echo "Config ${train_mode} ${config_option}"
|
| 341 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 342 |
+
task_load_clap_emb=true
|
| 343 |
+
task_load_source_file=true
|
| 344 |
+
task_load_mel_file=false
|
| 345 |
+
model_proj_type=2
|
| 346 |
+
model_clone_batch=4
|
| 347 |
+
dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
|
| 348 |
+
model_clap_loss=1.0
|
| 349 |
+
average_top_k_layers=12 # modify with model depth
|
| 350 |
+
model_add_conv=true
|
| 351 |
+
model_modalities_image_conv_option=4
|
| 352 |
+
model_depth=12 #
|
| 353 |
+
checkpoint_keep_interval_updates=1 # default 1
|
| 354 |
+
checkpoint_save_interval_updates=10000
|
| 355 |
+
fi
|
| 356 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 5 ]]; then
|
| 357 |
+
echo "Config ${train_mode} ${config_option}"
|
| 358 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 359 |
+
task_load_clap_emb=true
|
| 360 |
+
task_load_source_file=true
|
| 361 |
+
task_load_mel_file=false
|
| 362 |
+
model_proj_type=2
|
| 363 |
+
model_clone_batch=4
|
| 364 |
+
dataset_batch_size=48 # original 48 oom on 4090 24G change distributed_world_size
|
| 365 |
+
model_clap_loss=1.0
|
| 366 |
+
average_top_k_layers=12 # modify with model depth
|
| 367 |
+
model_add_conv=true
|
| 368 |
+
model_modalities_image_conv_option=5
|
| 369 |
+
model_depth=12 #
|
| 370 |
+
checkpoint_keep_interval_updates=1 # default 1
|
| 371 |
+
checkpoint_save_interval_updates=10000
|
| 372 |
+
fi
|
| 373 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 6 ]]; then
|
| 374 |
+
echo "Config ${train_mode} ${config_option}"
|
| 375 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 376 |
+
task_load_clap_emb=true
|
| 377 |
+
task_load_source_file=true
|
| 378 |
+
task_load_mel_file=false
|
| 379 |
+
model_proj_type=2
|
| 380 |
+
model_clone_batch=4
|
| 381 |
+
dataset_batch_size=48 # original 48 oom on 4090 24G change distributed_world_size
|
| 382 |
+
model_clap_loss=1.0
|
| 383 |
+
average_top_k_layers=12 # modify with model depth
|
| 384 |
+
model_add_conv=true
|
| 385 |
+
model_modalities_image_conv_option=6
|
| 386 |
+
model_depth=12 #
|
| 387 |
+
checkpoint_keep_interval_updates=1 # default 1
|
| 388 |
+
checkpoint_save_interval_updates=10000
|
| 389 |
+
fi
|
| 390 |
+
|
| 391 |
+
python fairseq_cli/hydra_train.py -m \
|
| 392 |
+
--config-dir ./EAT/config \
|
| 393 |
+
--config-name pretraining_AS2M \
|
| 394 |
+
common.user_dir=./EAT \
|
| 395 |
+
checkpoint.save_dir=${checkpoint_save_dir} \
|
| 396 |
+
checkpoint.restore_file=${checkpoint_restore_file} \
|
| 397 |
+
distributed_training.distributed_world_size=${1:-2} \
|
| 398 |
+
dataset.num_workers=24 \
|
| 399 |
+
dataset.data_buffer_size=48 \
|
| 400 |
+
dataset.batch_size=${dataset_batch_size} \
|
| 401 |
+
task.data=${task_data} \
|
| 402 |
+
task.h5_format=False \
|
| 403 |
+
task.load_clap_emb=${task_load_clap_emb} \
|
| 404 |
+
+task.load_source_file=${task_load_source_file} \
|
| 405 |
+
+task.load_mel_file=${task_load_mel_file} \
|
| 406 |
+
model.proj_type=${model_proj_type} \
|
| 407 |
+
model.clone_batch=${model_clone_batch} \
|
| 408 |
+
model.clap_loss=${model_clap_loss} \
|
| 409 |
+
model.average_top_k_layers=${average_top_k_layers} \
|
| 410 |
+
+model.add_conv=${model_add_conv} \
|
| 411 |
+
+model.clap_loss_type=${model_clap_loss_type} \
|
| 412 |
+
+model.clap_loss_layer=${model_clap_loss_layer} \
|
| 413 |
+
+model.dispersive_loss=${model_dispersive_loss} \
|
| 414 |
+
+model.dispersive_loss_layer=${model_dispersive_loss_layer} \
|
| 415 |
+
model.depth=${model_depth} \
|
| 416 |
+
+model.modalities.image.conv_option=${model_modalities_image_conv_option} \
|
| 417 |
+
checkpoint.keep_interval_updates=${checkpoint_keep_interval_updates} \
|
| 418 |
+
checkpoint.save_interval_updates=${checkpoint_save_interval_updates}
|
pre_4_AS2M/conv_clap_1_2025-09-30_08-31-59/pretraining_AS2M.sh
ADDED
|
@@ -0,0 +1,416 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
# config options
|
| 3 |
+
train_mode=conv_clap
|
| 4 |
+
config_option=1
|
| 5 |
+
# change world size
|
| 6 |
+
|
| 7 |
+
# shared config
|
| 8 |
+
SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
|
| 9 |
+
checkpoint_save_dir=${SAVE_DIR_ROOT}/${train_mode}_${config_option}_$(date +"%Y-%m-%d_%H-%M-%S")
|
| 10 |
+
checkpoint_restore_file=${checkpoint_save_dir}/checkpoint_last.pt
|
| 11 |
+
|
| 12 |
+
# Absolute path and file name of this script (symlinks resolved)
|
| 13 |
+
script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
|
| 14 |
+
script_name="$(basename -- "$script_path")"
|
| 15 |
+
# Create the checkpoint directory and copy this script into it (preserving permissions and timestamps)
|
| 16 |
+
mkdir -p -- "$checkpoint_save_dir"
|
| 17 |
+
cp -p -- "$script_path" "$checkpoint_save_dir/$script_name"
|
| 18 |
+
echo "script_path: ${script_path}"
|
| 19 |
+
echo "checkpoint_save_dir: ${checkpoint_save_dir}"
|
| 20 |
+
|
| 21 |
+
# default setting
|
| 22 |
+
model_clone_batch=4
|
| 23 |
+
dataset_batch_size=48
|
| 24 |
+
model_clap_loss=0
|
| 25 |
+
model_clap_loss_type="mse" # option ce cosine l1
|
| 26 |
+
model_clap_loss_layer=0
|
| 27 |
+
average_top_k_layers=12
|
| 28 |
+
model_add_conv=false
|
| 29 |
+
model_depth=12
|
| 30 |
+
model_dispersive_loss=0
|
| 31 |
+
model_dispersive_loss_layer=0
|
| 32 |
+
checkpoint_keep_interval_updates=1 # TODO change this parameter if need
|
| 33 |
+
checkpoint_save_interval_updates=10000
|
| 34 |
+
model_modalities_image_conv_option=0
|
| 35 |
+
|
| 36 |
+
if [[ $train_mode == "default" && ${config_option} -eq 0 ]]; then
|
| 37 |
+
echo "Config ${train_mode} ${config_option}"
|
| 38 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 39 |
+
task_load_clap_emb=false
|
| 40 |
+
task_load_source_file=true
|
| 41 |
+
task_load_mel_file=false
|
| 42 |
+
model_proj_type=null
|
| 43 |
+
model_clone_batch=4
|
| 44 |
+
dataset_batch_size=96
|
| 45 |
+
model_clap_loss=0
|
| 46 |
+
checkpoint_keep_interval_updates=-1
|
| 47 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 0 ]]; then
|
| 48 |
+
echo "Config ${train_mode} ${config_option}"
|
| 49 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 50 |
+
task_load_clap_emb=false
|
| 51 |
+
task_load_source_file=true
|
| 52 |
+
task_load_mel_file=false
|
| 53 |
+
model_proj_type=null
|
| 54 |
+
model_clone_batch=4
|
| 55 |
+
dataset_batch_size=96
|
| 56 |
+
model_dispersive_loss=1
|
| 57 |
+
model_dispersive_loss_layer=0
|
| 58 |
+
checkpoint_keep_interval_updates=1
|
| 59 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 1 ]]; then
|
| 60 |
+
echo "Config ${train_mode} ${config_option}"
|
| 61 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 62 |
+
task_load_clap_emb=false
|
| 63 |
+
task_load_source_file=true
|
| 64 |
+
task_load_mel_file=false
|
| 65 |
+
model_proj_type=null
|
| 66 |
+
model_clone_batch=1
|
| 67 |
+
dataset_batch_size=384
|
| 68 |
+
model_dispersive_loss=1
|
| 69 |
+
model_dispersive_loss_layer=0
|
| 70 |
+
checkpoint_keep_interval_updates=1
|
| 71 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 2 ]]; then
|
| 72 |
+
echo "Config ${train_mode} ${config_option}"
|
| 73 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 74 |
+
task_load_clap_emb=false
|
| 75 |
+
task_load_source_file=true
|
| 76 |
+
task_load_mel_file=false
|
| 77 |
+
model_proj_type=null
|
| 78 |
+
model_clone_batch=1
|
| 79 |
+
dataset_batch_size=384
|
| 80 |
+
model_dispersive_loss=10.0
|
| 81 |
+
model_dispersive_loss_layer=0
|
| 82 |
+
checkpoint_keep_interval_updates=1
|
| 83 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 3 ]]; then
|
| 84 |
+
echo "Config ${train_mode} ${config_option}"
|
| 85 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 86 |
+
task_load_clap_emb=false
|
| 87 |
+
task_load_source_file=true
|
| 88 |
+
task_load_mel_file=false
|
| 89 |
+
model_proj_type=null
|
| 90 |
+
model_clone_batch=1
|
| 91 |
+
dataset_batch_size=384
|
| 92 |
+
model_dispersive_loss=100.0
|
| 93 |
+
model_dispersive_loss_layer=0
|
| 94 |
+
checkpoint_keep_interval_updates=1
|
| 95 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 4 ]]; then
|
| 96 |
+
echo "Config ${train_mode} ${config_option}"
|
| 97 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 98 |
+
task_load_clap_emb=false
|
| 99 |
+
task_load_source_file=true
|
| 100 |
+
task_load_mel_file=false
|
| 101 |
+
model_proj_type=null
|
| 102 |
+
model_clone_batch=1
|
| 103 |
+
dataset_batch_size=384
|
| 104 |
+
model_dispersive_loss=10000.0
|
| 105 |
+
model_dispersive_loss_layer=0
|
| 106 |
+
checkpoint_keep_interval_updates=1
|
| 107 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 5 ]]; then
|
| 108 |
+
echo "Config ${train_mode} ${config_option}"
|
| 109 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 110 |
+
task_load_clap_emb=false
|
| 111 |
+
task_load_source_file=true
|
| 112 |
+
task_load_mel_file=false
|
| 113 |
+
model_proj_type=null
|
| 114 |
+
model_clone_batch=1
|
| 115 |
+
dataset_batch_size=384
|
| 116 |
+
model_dispersive_loss=1000.0
|
| 117 |
+
model_dispersive_loss_layer=0
|
| 118 |
+
checkpoint_keep_interval_updates=1
|
| 119 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 6 ]]; then
|
| 120 |
+
echo "Config ${train_mode} ${config_option}"
|
| 121 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 122 |
+
task_load_clap_emb=false
|
| 123 |
+
task_load_source_file=true
|
| 124 |
+
task_load_mel_file=false
|
| 125 |
+
model_proj_type=null
|
| 126 |
+
model_clone_batch=4
|
| 127 |
+
dataset_batch_size=96
|
| 128 |
+
model_dispersive_loss=1000.0
|
| 129 |
+
model_dispersive_loss_layer=10
|
| 130 |
+
checkpoint_keep_interval_updates=1
|
| 131 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 0 ]]; then
|
| 132 |
+
echo "Config ${train_mode} ${config_option}"
|
| 133 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 134 |
+
task_load_clap_emb=true
|
| 135 |
+
model_proj_type=2
|
| 136 |
+
model_clone_batch=4
|
| 137 |
+
dataset_batch_size=48
|
| 138 |
+
model_clap_loss=1.0
|
| 139 |
+
average_top_k_layers=12
|
| 140 |
+
model_add_conv=false
|
| 141 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 1 ]]; then
|
| 142 |
+
echo "Config ${train_mode} ${config_option}"
|
| 143 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 144 |
+
task_load_clap_emb=true
|
| 145 |
+
model_proj_type=2
|
| 146 |
+
model_clone_batch=4
|
| 147 |
+
dataset_batch_size=48
|
| 148 |
+
model_clap_loss=1.0
|
| 149 |
+
average_top_k_layers=1
|
| 150 |
+
# loss type ablation
|
| 151 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 2 ]]; then
|
| 152 |
+
echo "Config ${train_mode} ${config_option}"
|
| 153 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 154 |
+
task_load_clap_emb=true
|
| 155 |
+
model_proj_type=2
|
| 156 |
+
model_clone_batch=4
|
| 157 |
+
dataset_batch_size=48
|
| 158 |
+
model_clap_loss=1.0
|
| 159 |
+
average_top_k_layers=12
|
| 160 |
+
model_clap_loss_type="ce"
|
| 161 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 3 ]]; then
|
| 162 |
+
echo "Config ${train_mode} ${config_option}"
|
| 163 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 164 |
+
task_load_clap_emb=true
|
| 165 |
+
model_proj_type=2
|
| 166 |
+
model_clone_batch=4
|
| 167 |
+
dataset_batch_size=48
|
| 168 |
+
model_clap_loss=1.0
|
| 169 |
+
average_top_k_layers=12
|
| 170 |
+
model_clap_loss_type="l1"
|
| 171 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 4 ]]; then
|
| 172 |
+
echo "Config ${train_mode} ${config_option}"
|
| 173 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 174 |
+
task_load_clap_emb=true
|
| 175 |
+
model_proj_type=2
|
| 176 |
+
model_clone_batch=4
|
| 177 |
+
dataset_batch_size=96
|
| 178 |
+
model_clap_loss=1.0
|
| 179 |
+
average_top_k_layers=12
|
| 180 |
+
model_clap_loss_type="cosine"
|
| 181 |
+
# loss layer ablation
|
| 182 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 5 ]]; then
|
| 183 |
+
echo "Config ${train_mode} ${config_option}"
|
| 184 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 185 |
+
task_load_clap_emb=true
|
| 186 |
+
model_proj_type=2
|
| 187 |
+
model_clone_batch=4
|
| 188 |
+
dataset_batch_size=96
|
| 189 |
+
model_clap_loss=1.0
|
| 190 |
+
average_top_k_layers=12
|
| 191 |
+
model_clap_loss_type="mse"
|
| 192 |
+
model_clap_loss_layer=10
|
| 193 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 6 ]]; then
|
| 194 |
+
echo "Config ${train_mode} ${config_option}"
|
| 195 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 196 |
+
task_load_clap_emb=true
|
| 197 |
+
task_load_source_file=true
|
| 198 |
+
task_load_mel_file=false
|
| 199 |
+
model_proj_type=2
|
| 200 |
+
model_clone_batch=4
|
| 201 |
+
dataset_batch_size=96
|
| 202 |
+
model_clap_loss=1.0
|
| 203 |
+
average_top_k_layers=12
|
| 204 |
+
model_clap_loss_type="mse"
|
| 205 |
+
model_clap_loss_layer=8
|
| 206 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 7 ]]; then
|
| 207 |
+
echo "Config ${train_mode} ${config_option}"
|
| 208 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 209 |
+
task_load_clap_emb=true
|
| 210 |
+
task_load_source_file=true
|
| 211 |
+
task_load_mel_file=false
|
| 212 |
+
model_proj_type=2
|
| 213 |
+
model_clone_batch=4
|
| 214 |
+
dataset_batch_size=96
|
| 215 |
+
model_clap_loss=1.0
|
| 216 |
+
average_top_k_layers=12
|
| 217 |
+
model_clap_loss_type="mse"
|
| 218 |
+
model_clap_loss_layer=6
|
| 219 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 8 ]]; then
|
| 220 |
+
echo "Config ${train_mode} ${config_option}"
|
| 221 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 222 |
+
task_load_clap_emb=true
|
| 223 |
+
task_load_source_file=true
|
| 224 |
+
task_load_mel_file=false
|
| 225 |
+
model_proj_type=2
|
| 226 |
+
model_clone_batch=4
|
| 227 |
+
model_clap_loss=5.0
|
| 228 |
+
dataset_batch_size=96
|
| 229 |
+
average_top_k_layers=12
|
| 230 |
+
model_clap_loss_type="mse"
|
| 231 |
+
checkpoint_keep_interval_updates=-1
|
| 232 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 9 ]]; then
|
| 233 |
+
echo "Config ${train_mode} ${config_option}"
|
| 234 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 235 |
+
task_load_clap_emb=true
|
| 236 |
+
task_load_source_file=true
|
| 237 |
+
task_load_mel_file=false
|
| 238 |
+
model_proj_type=2
|
| 239 |
+
model_clone_batch=4
|
| 240 |
+
model_clap_loss=0.1
|
| 241 |
+
dataset_batch_size=96
|
| 242 |
+
average_top_k_layers=12
|
| 243 |
+
model_clap_loss_type="mse"
|
| 244 |
+
checkpoint_keep_interval_updates=-1
|
| 245 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 0 ]]; then
|
| 246 |
+
echo "Config ${train_mode} ${config_option}"
|
| 247 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
|
| 248 |
+
task_load_clap_emb=true
|
| 249 |
+
model_proj_type=4
|
| 250 |
+
model_clone_batch=4
|
| 251 |
+
model_clap_loss=1.0
|
| 252 |
+
dataset_batch_size=48
|
| 253 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 1 ]]; then
|
| 254 |
+
echo "Config ${train_mode} ${config_option}"
|
| 255 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
|
| 256 |
+
task_load_clap_emb=true
|
| 257 |
+
model_proj_type=4
|
| 258 |
+
model_clone_batch=4
|
| 259 |
+
model_clap_loss=0.001
|
| 260 |
+
dataset_batch_size=48
|
| 261 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 2 ]]; then
|
| 262 |
+
echo "Config ${train_mode} ${config_option}"
|
| 263 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
|
| 264 |
+
task_load_clap_emb=true
|
| 265 |
+
model_proj_type=4
|
| 266 |
+
model_clone_batch=4
|
| 267 |
+
model_clap_loss=0.01
|
| 268 |
+
dataset_batch_size=48
|
| 269 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 3 ]]; then
|
| 270 |
+
echo "Config ${train_mode} ${config_option}"
|
| 271 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_out
|
| 272 |
+
task_load_clap_emb=true
|
| 273 |
+
model_proj_type=6
|
| 274 |
+
model_clone_batch=4
|
| 275 |
+
dataset_batch_size=48
|
| 276 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 0 ]]; then
|
| 277 |
+
echo "Config ${train_mode} ${config_option}"
|
| 278 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 279 |
+
task_load_clap_emb=true
|
| 280 |
+
task_load_source_file=true
|
| 281 |
+
task_load_mel_file=false
|
| 282 |
+
model_proj_type=2
|
| 283 |
+
model_clone_batch=4
|
| 284 |
+
dataset_batch_size=64 # original 48 oom on 4090 24G change distributed_world_size
|
| 285 |
+
model_clap_loss=1.0
|
| 286 |
+
average_top_k_layers=11 # modify with model depth
|
| 287 |
+
model_add_conv=true
|
| 288 |
+
model_depth=11 #
|
| 289 |
+
checkpoint_keep_interval_updates=-1 # default 1
|
| 290 |
+
checkpoint_save_interval_updates=10000
|
| 291 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 1 ]]; then
|
| 292 |
+
echo "Config ${train_mode} ${config_option}"
|
| 293 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 294 |
+
task_load_clap_emb=true
|
| 295 |
+
task_load_source_file=true
|
| 296 |
+
task_load_mel_file=false
|
| 297 |
+
model_proj_type=2
|
| 298 |
+
model_clone_batch=4
|
| 299 |
+
dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
|
| 300 |
+
model_clap_loss=1.0
|
| 301 |
+
average_top_k_layers=12 # modify with model depth
|
| 302 |
+
model_add_conv=true
|
| 303 |
+
model_modalities_image_conv_option=1
|
| 304 |
+
model_depth=12 #
|
| 305 |
+
checkpoint_keep_interval_updates=1 # default 1
|
| 306 |
+
checkpoint_save_interval_updates=10000
|
| 307 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 2 ]]; then
|
| 308 |
+
echo "Config ${train_mode} ${config_option}"
|
| 309 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 310 |
+
task_load_clap_emb=true
|
| 311 |
+
task_load_source_file=true
|
| 312 |
+
task_load_mel_file=false
|
| 313 |
+
model_proj_type=2
|
| 314 |
+
model_clone_batch=4
|
| 315 |
+
dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
|
| 316 |
+
model_clap_loss=1.0
|
| 317 |
+
average_top_k_layers=12 # modify with model depth
|
| 318 |
+
model_add_conv=true
|
| 319 |
+
model_modalities_image_conv_option=2
|
| 320 |
+
model_depth=12 #
|
| 321 |
+
checkpoint_keep_interval_updates=1 # default 1
|
| 322 |
+
checkpoint_save_interval_updates=10000
|
| 323 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 3 ]]; then
|
| 324 |
+
echo "Config ${train_mode} ${config_option}"
|
| 325 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 326 |
+
task_load_clap_emb=true
|
| 327 |
+
task_load_source_file=true
|
| 328 |
+
task_load_mel_file=false
|
| 329 |
+
model_proj_type=2
|
| 330 |
+
model_clone_batch=4
|
| 331 |
+
dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
|
| 332 |
+
model_clap_loss=1.0
|
| 333 |
+
average_top_k_layers=12 # modify with model depth
|
| 334 |
+
model_add_conv=true
|
| 335 |
+
model_modalities_image_conv_option=3
|
| 336 |
+
model_depth=12 #
|
| 337 |
+
checkpoint_keep_interval_updates=1 # default 1
|
| 338 |
+
checkpoint_save_interval_updates=10000
|
| 339 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 4 ]]; then
|
| 340 |
+
echo "Config ${train_mode} ${config_option}"
|
| 341 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 342 |
+
task_load_clap_emb=true
|
| 343 |
+
task_load_source_file=true
|
| 344 |
+
task_load_mel_file=false
|
| 345 |
+
model_proj_type=2
|
| 346 |
+
model_clone_batch=4
|
| 347 |
+
dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
|
| 348 |
+
model_clap_loss=1.0
|
| 349 |
+
average_top_k_layers=12 # modify with model depth
|
| 350 |
+
model_add_conv=true
|
| 351 |
+
model_modalities_image_conv_option=4
|
| 352 |
+
model_depth=12 #
|
| 353 |
+
checkpoint_keep_interval_updates=1 # default 1
|
| 354 |
+
checkpoint_save_interval_updates=10000
|
| 355 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 5 ]]; then
|
| 356 |
+
echo "Config ${train_mode} ${config_option}"
|
| 357 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 358 |
+
task_load_clap_emb=true
|
| 359 |
+
task_load_source_file=true
|
| 360 |
+
task_load_mel_file=false
|
| 361 |
+
model_proj_type=2
|
| 362 |
+
model_clone_batch=4
|
| 363 |
+
dataset_batch_size=48 # original 48 oom on 4090 24G change distributed_world_size
|
| 364 |
+
model_clap_loss=1.0
|
| 365 |
+
average_top_k_layers=12 # modify with model depth
|
| 366 |
+
model_add_conv=true
|
| 367 |
+
model_modalities_image_conv_option=5
|
| 368 |
+
model_depth=12 #
|
| 369 |
+
checkpoint_keep_interval_updates=1 # default 1
|
| 370 |
+
checkpoint_save_interval_updates=10000
|
| 371 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 6 ]]; then
|
| 372 |
+
echo "Config ${train_mode} ${config_option}"
|
| 373 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 374 |
+
task_load_clap_emb=true
|
| 375 |
+
task_load_source_file=true
|
| 376 |
+
task_load_mel_file=false
|
| 377 |
+
model_proj_type=2
|
| 378 |
+
model_clone_batch=4
|
| 379 |
+
dataset_batch_size=48 # original 48 oom on 4090 24G change distributed_world_size
|
| 380 |
+
model_clap_loss=1.0
|
| 381 |
+
average_top_k_layers=12 # modify with model depth
|
| 382 |
+
model_add_conv=true
|
| 383 |
+
model_modalities_image_conv_option=6
|
| 384 |
+
model_depth=12 #
|
| 385 |
+
checkpoint_keep_interval_updates=1 # default 1
|
| 386 |
+
checkpoint_save_interval_updates=10000
|
| 387 |
+
fi
|
| 388 |
+
|
| 389 |
+
# Launch EAT pre-training through fairseq's Hydra entry point.
#
# The overrides are collected in an array so that each one is passed as exactly
# one argument (no word splitting or globbing of the expanded values) and so
# that individual entries can carry comments.
# Overrides prefixed with '+' use Hydra's "append" syntax for keys that are
# added on top of the pretraining_AS2M config.
# NOTE(review): the "clap" 0-5 and "ast" presets above do not assign
# task_load_source_file / task_load_mel_file, and no default for them is
# visible in this part of the script, so those two overrides may expand to an
# empty value - confirm that this is intended.
train_args=(
    --config-dir ./EAT/config
    --config-name pretraining_AS2M
    common.user_dir=./EAT
    "checkpoint.save_dir=${checkpoint_save_dir}"
    "checkpoint.restore_file=${checkpoint_restore_file}"
    # First positional argument of the script selects the world size (default 2).
    "distributed_training.distributed_world_size=${1:-2}"
    dataset.num_workers=24
    dataset.data_buffer_size=48
    "dataset.batch_size=${dataset_batch_size}"
    "task.data=${task_data}"
    task.h5_format=False
    "task.load_clap_emb=${task_load_clap_emb}"
    "+task.load_source_file=${task_load_source_file}"
    "+task.load_mel_file=${task_load_mel_file}"
    "model.proj_type=${model_proj_type}"
    "model.clone_batch=${model_clone_batch}"
    "model.clap_loss=${model_clap_loss}"
    "model.average_top_k_layers=${average_top_k_layers}"
    "+model.add_conv=${model_add_conv}"
    "+model.clap_loss_type=${model_clap_loss_type}"
    "+model.clap_loss_layer=${model_clap_loss_layer}"
    "+model.dispersive_loss=${model_dispersive_loss}"
    "+model.dispersive_loss_layer=${model_dispersive_loss_layer}"
    "model.depth=${model_depth}"
    "+model.modalities.image.conv_option=${model_modalities_image_conv_option}"
    "checkpoint.keep_interval_updates=${checkpoint_keep_interval_updates}"
    "checkpoint.save_interval_updates=${checkpoint_save_interval_updates}"
)

# -m runs Hydra in multirun mode, exactly as the original command did.
python fairseq_cli/hydra_train.py -m "${train_args[@]}"
|
pre_4_AS2M/conv_clap_2_2025-09-30_09-12-51/pretraining_AS2M.sh
ADDED
|
@@ -0,0 +1,416 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
# config options
|
| 3 |
+
train_mode=conv_clap
|
| 4 |
+
config_option=2
|
| 5 |
+
# change world size
|
| 6 |
+
|
| 7 |
+
# shared config
|
| 8 |
+
SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
|
| 9 |
+
checkpoint_save_dir=${SAVE_DIR_ROOT}/${train_mode}_${config_option}_$(date +"%Y-%m-%d_%H-%M-%S")
|
| 10 |
+
checkpoint_restore_file=${checkpoint_save_dir}/checkpoint_last.pt
|
| 11 |
+
|
| 12 |
+
# Absolute path and file name of this script (symlinks resolved)
|
| 13 |
+
script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
|
| 14 |
+
script_name="$(basename -- "$script_path")"
|
| 15 |
+
# Create the directory and copy the script into it (preserving permissions and timestamps)
|
| 16 |
+
mkdir -p -- "$checkpoint_save_dir"
|
| 17 |
+
cp -p -- "$script_path" "$checkpoint_save_dir/$script_name"
|
| 18 |
+
echo "script_path: ${script_path}"
|
| 19 |
+
echo "checkpoint_save_dir: ${checkpoint_save_dir}"
|
| 20 |
+
|
| 21 |
+
# default setting
|
| 22 |
+
model_clone_batch=4
|
| 23 |
+
dataset_batch_size=48
|
| 24 |
+
model_clap_loss=0
|
| 25 |
+
model_clap_loss_type="mse" # option ce cosine l1
|
| 26 |
+
model_clap_loss_layer=0
|
| 27 |
+
average_top_k_layers=12
|
| 28 |
+
model_add_conv=false
|
| 29 |
+
model_depth=12
|
| 30 |
+
model_dispersive_loss=0
|
| 31 |
+
model_dispersive_loss_layer=0
|
| 32 |
+
checkpoint_keep_interval_updates=1 # TODO change this parameter if need
|
| 33 |
+
checkpoint_save_interval_updates=10000
|
| 34 |
+
model_modalities_image_conv_option=0
|
| 35 |
+
|
| 36 |
+
if [[ $train_mode == "default" && ${config_option} -eq 0 ]]; then
|
| 37 |
+
echo "Config ${train_mode} ${config_option}"
|
| 38 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 39 |
+
task_load_clap_emb=false
|
| 40 |
+
task_load_source_file=true
|
| 41 |
+
task_load_mel_file=false
|
| 42 |
+
model_proj_type=null
|
| 43 |
+
model_clone_batch=4
|
| 44 |
+
dataset_batch_size=96
|
| 45 |
+
model_clap_loss=0
|
| 46 |
+
checkpoint_keep_interval_updates=-1
|
| 47 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 0 ]]; then
|
| 48 |
+
echo "Config ${train_mode} ${config_option}"
|
| 49 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 50 |
+
task_load_clap_emb=false
|
| 51 |
+
task_load_source_file=true
|
| 52 |
+
task_load_mel_file=false
|
| 53 |
+
model_proj_type=null
|
| 54 |
+
model_clone_batch=4
|
| 55 |
+
dataset_batch_size=96
|
| 56 |
+
model_dispersive_loss=1
|
| 57 |
+
model_dispersive_loss_layer=0
|
| 58 |
+
checkpoint_keep_interval_updates=1
|
| 59 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 1 ]]; then
|
| 60 |
+
echo "Config ${train_mode} ${config_option}"
|
| 61 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 62 |
+
task_load_clap_emb=false
|
| 63 |
+
task_load_source_file=true
|
| 64 |
+
task_load_mel_file=false
|
| 65 |
+
model_proj_type=null
|
| 66 |
+
model_clone_batch=1
|
| 67 |
+
dataset_batch_size=384
|
| 68 |
+
model_dispersive_loss=1
|
| 69 |
+
model_dispersive_loss_layer=0
|
| 70 |
+
checkpoint_keep_interval_updates=1
|
| 71 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 2 ]]; then
|
| 72 |
+
echo "Config ${train_mode} ${config_option}"
|
| 73 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 74 |
+
task_load_clap_emb=false
|
| 75 |
+
task_load_source_file=true
|
| 76 |
+
task_load_mel_file=false
|
| 77 |
+
model_proj_type=null
|
| 78 |
+
model_clone_batch=1
|
| 79 |
+
dataset_batch_size=384
|
| 80 |
+
model_dispersive_loss=10.0
|
| 81 |
+
model_dispersive_loss_layer=0
|
| 82 |
+
checkpoint_keep_interval_updates=1
|
| 83 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 3 ]]; then
|
| 84 |
+
echo "Config ${train_mode} ${config_option}"
|
| 85 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 86 |
+
task_load_clap_emb=false
|
| 87 |
+
task_load_source_file=true
|
| 88 |
+
task_load_mel_file=false
|
| 89 |
+
model_proj_type=null
|
| 90 |
+
model_clone_batch=1
|
| 91 |
+
dataset_batch_size=384
|
| 92 |
+
model_dispersive_loss=100.0
|
| 93 |
+
model_dispersive_loss_layer=0
|
| 94 |
+
checkpoint_keep_interval_updates=1
|
| 95 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 4 ]]; then
|
| 96 |
+
echo "Config ${train_mode} ${config_option}"
|
| 97 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 98 |
+
task_load_clap_emb=false
|
| 99 |
+
task_load_source_file=true
|
| 100 |
+
task_load_mel_file=false
|
| 101 |
+
model_proj_type=null
|
| 102 |
+
model_clone_batch=1
|
| 103 |
+
dataset_batch_size=384
|
| 104 |
+
model_dispersive_loss=10000.0
|
| 105 |
+
model_dispersive_loss_layer=0
|
| 106 |
+
checkpoint_keep_interval_updates=1
|
| 107 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 5 ]]; then
|
| 108 |
+
echo "Config ${train_mode} ${config_option}"
|
| 109 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 110 |
+
task_load_clap_emb=false
|
| 111 |
+
task_load_source_file=true
|
| 112 |
+
task_load_mel_file=false
|
| 113 |
+
model_proj_type=null
|
| 114 |
+
model_clone_batch=1
|
| 115 |
+
dataset_batch_size=384
|
| 116 |
+
model_dispersive_loss=1000.0
|
| 117 |
+
model_dispersive_loss_layer=0
|
| 118 |
+
checkpoint_keep_interval_updates=1
|
| 119 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 6 ]]; then
|
| 120 |
+
echo "Config ${train_mode} ${config_option}"
|
| 121 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 122 |
+
task_load_clap_emb=false
|
| 123 |
+
task_load_source_file=true
|
| 124 |
+
task_load_mel_file=false
|
| 125 |
+
model_proj_type=null
|
| 126 |
+
model_clone_batch=4
|
| 127 |
+
dataset_batch_size=96
|
| 128 |
+
model_dispersive_loss=1000.0
|
| 129 |
+
model_dispersive_loss_layer=10
|
| 130 |
+
checkpoint_keep_interval_updates=1
|
| 131 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 0 ]]; then
|
| 132 |
+
echo "Config ${train_mode} ${config_option}"
|
| 133 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 134 |
+
task_load_clap_emb=true
|
| 135 |
+
model_proj_type=2
|
| 136 |
+
model_clone_batch=4
|
| 137 |
+
dataset_batch_size=48
|
| 138 |
+
model_clap_loss=1.0
|
| 139 |
+
average_top_k_layers=12
|
| 140 |
+
model_add_conv=false
|
| 141 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 1 ]]; then
|
| 142 |
+
echo "Config ${train_mode} ${config_option}"
|
| 143 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 144 |
+
task_load_clap_emb=true
|
| 145 |
+
model_proj_type=2
|
| 146 |
+
model_clone_batch=4
|
| 147 |
+
dataset_batch_size=48
|
| 148 |
+
model_clap_loss=1.0
|
| 149 |
+
average_top_k_layers=1
|
| 150 |
+
# loss type ablation
|
| 151 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 2 ]]; then
|
| 152 |
+
echo "Config ${train_mode} ${config_option}"
|
| 153 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 154 |
+
task_load_clap_emb=true
|
| 155 |
+
model_proj_type=2
|
| 156 |
+
model_clone_batch=4
|
| 157 |
+
dataset_batch_size=48
|
| 158 |
+
model_clap_loss=1.0
|
| 159 |
+
average_top_k_layers=12
|
| 160 |
+
model_clap_loss_type="ce"
|
| 161 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 3 ]]; then
|
| 162 |
+
echo "Config ${train_mode} ${config_option}"
|
| 163 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 164 |
+
task_load_clap_emb=true
|
| 165 |
+
model_proj_type=2
|
| 166 |
+
model_clone_batch=4
|
| 167 |
+
dataset_batch_size=48
|
| 168 |
+
model_clap_loss=1.0
|
| 169 |
+
average_top_k_layers=12
|
| 170 |
+
model_clap_loss_type="l1"
|
| 171 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 4 ]]; then
|
| 172 |
+
echo "Config ${train_mode} ${config_option}"
|
| 173 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 174 |
+
task_load_clap_emb=true
|
| 175 |
+
model_proj_type=2
|
| 176 |
+
model_clone_batch=4
|
| 177 |
+
dataset_batch_size=96
|
| 178 |
+
model_clap_loss=1.0
|
| 179 |
+
average_top_k_layers=12
|
| 180 |
+
model_clap_loss_type="cosine"
|
| 181 |
+
# loss layer ablation
|
| 182 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 5 ]]; then
|
| 183 |
+
echo "Config ${train_mode} ${config_option}"
|
| 184 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 185 |
+
task_load_clap_emb=true
|
| 186 |
+
model_proj_type=2
|
| 187 |
+
model_clone_batch=4
|
| 188 |
+
dataset_batch_size=96
|
| 189 |
+
model_clap_loss=1.0
|
| 190 |
+
average_top_k_layers=12
|
| 191 |
+
model_clap_loss_type="mse"
|
| 192 |
+
model_clap_loss_layer=10
|
| 193 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 6 ]]; then
|
| 194 |
+
echo "Config ${train_mode} ${config_option}"
|
| 195 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 196 |
+
task_load_clap_emb=true
|
| 197 |
+
task_load_source_file=true
|
| 198 |
+
task_load_mel_file=false
|
| 199 |
+
model_proj_type=2
|
| 200 |
+
model_clone_batch=4
|
| 201 |
+
dataset_batch_size=96
|
| 202 |
+
model_clap_loss=1.0
|
| 203 |
+
average_top_k_layers=12
|
| 204 |
+
model_clap_loss_type="mse"
|
| 205 |
+
model_clap_loss_layer=8
|
| 206 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 7 ]]; then
|
| 207 |
+
echo "Config ${train_mode} ${config_option}"
|
| 208 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 209 |
+
task_load_clap_emb=true
|
| 210 |
+
task_load_source_file=true
|
| 211 |
+
task_load_mel_file=false
|
| 212 |
+
model_proj_type=2
|
| 213 |
+
model_clone_batch=4
|
| 214 |
+
dataset_batch_size=96
|
| 215 |
+
model_clap_loss=1.0
|
| 216 |
+
average_top_k_layers=12
|
| 217 |
+
model_clap_loss_type="mse"
|
| 218 |
+
model_clap_loss_layer=6
|
| 219 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 8 ]]; then
|
| 220 |
+
echo "Config ${train_mode} ${config_option}"
|
| 221 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 222 |
+
task_load_clap_emb=true
|
| 223 |
+
task_load_source_file=true
|
| 224 |
+
task_load_mel_file=false
|
| 225 |
+
model_proj_type=2
|
| 226 |
+
model_clone_batch=4
|
| 227 |
+
model_clap_loss=5.0
|
| 228 |
+
dataset_batch_size=96
|
| 229 |
+
average_top_k_layers=12
|
| 230 |
+
model_clap_loss_type="mse"
|
| 231 |
+
checkpoint_keep_interval_updates=-1
|
| 232 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 9 ]]; then
|
| 233 |
+
echo "Config ${train_mode} ${config_option}"
|
| 234 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 235 |
+
task_load_clap_emb=true
|
| 236 |
+
task_load_source_file=true
|
| 237 |
+
task_load_mel_file=false
|
| 238 |
+
model_proj_type=2
|
| 239 |
+
model_clone_batch=4
|
| 240 |
+
model_clap_loss=0.1
|
| 241 |
+
dataset_batch_size=96
|
| 242 |
+
average_top_k_layers=12
|
| 243 |
+
model_clap_loss_type="mse"
|
| 244 |
+
checkpoint_keep_interval_updates=-1
|
| 245 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 0 ]]; then
|
| 246 |
+
echo "Config ${train_mode} ${config_option}"
|
| 247 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
|
| 248 |
+
task_load_clap_emb=true
|
| 249 |
+
model_proj_type=4
|
| 250 |
+
model_clone_batch=4
|
| 251 |
+
model_clap_loss=1.0
|
| 252 |
+
dataset_batch_size=48
|
| 253 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 1 ]]; then
|
| 254 |
+
echo "Config ${train_mode} ${config_option}"
|
| 255 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
|
| 256 |
+
task_load_clap_emb=true
|
| 257 |
+
model_proj_type=4
|
| 258 |
+
model_clone_batch=4
|
| 259 |
+
model_clap_loss=0.001
|
| 260 |
+
dataset_batch_size=48
|
| 261 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 2 ]]; then
|
| 262 |
+
echo "Config ${train_mode} ${config_option}"
|
| 263 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
|
| 264 |
+
task_load_clap_emb=true
|
| 265 |
+
model_proj_type=4
|
| 266 |
+
model_clone_batch=4
|
| 267 |
+
model_clap_loss=0.01
|
| 268 |
+
dataset_batch_size=48
|
| 269 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 3 ]]; then
|
| 270 |
+
echo "Config ${train_mode} ${config_option}"
|
| 271 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_out
|
| 272 |
+
task_load_clap_emb=true
|
| 273 |
+
model_proj_type=6
|
| 274 |
+
model_clone_batch=4
|
| 275 |
+
dataset_batch_size=48
|
| 276 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 0 ]]; then
|
| 277 |
+
echo "Config ${train_mode} ${config_option}"
|
| 278 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 279 |
+
task_load_clap_emb=true
|
| 280 |
+
task_load_source_file=true
|
| 281 |
+
task_load_mel_file=false
|
| 282 |
+
model_proj_type=2
|
| 283 |
+
model_clone_batch=4
|
| 284 |
+
dataset_batch_size=64 # original 48 oom on 4090 24G change distributed_world_size
|
| 285 |
+
model_clap_loss=1.0
|
| 286 |
+
average_top_k_layers=11 # modify with model depth
|
| 287 |
+
model_add_conv=true
|
| 288 |
+
model_depth=11 #
|
| 289 |
+
checkpoint_keep_interval_updates=-1 # default 1
|
| 290 |
+
checkpoint_save_interval_updates=10000
|
| 291 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 1 ]]; then
|
| 292 |
+
echo "Config ${train_mode} ${config_option}"
|
| 293 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 294 |
+
task_load_clap_emb=true
|
| 295 |
+
task_load_source_file=true
|
| 296 |
+
task_load_mel_file=false
|
| 297 |
+
model_proj_type=2
|
| 298 |
+
model_clone_batch=4
|
| 299 |
+
dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
|
| 300 |
+
model_clap_loss=1.0
|
| 301 |
+
average_top_k_layers=12 # modify with model depth
|
| 302 |
+
model_add_conv=true
|
| 303 |
+
model_modalities_image_conv_option=1
|
| 304 |
+
model_depth=12 #
|
| 305 |
+
checkpoint_keep_interval_updates=1 # default 1
|
| 306 |
+
checkpoint_save_interval_updates=10000
|
| 307 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 2 ]]; then
|
| 308 |
+
echo "Config ${train_mode} ${config_option}"
|
| 309 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 310 |
+
task_load_clap_emb=true
|
| 311 |
+
task_load_source_file=true
|
| 312 |
+
task_load_mel_file=false
|
| 313 |
+
model_proj_type=2
|
| 314 |
+
model_clone_batch=4
|
| 315 |
+
dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
|
| 316 |
+
model_clap_loss=1.0
|
| 317 |
+
average_top_k_layers=12 # modify with model depth
|
| 318 |
+
model_add_conv=true
|
| 319 |
+
model_modalities_image_conv_option=2
|
| 320 |
+
model_depth=12 #
|
| 321 |
+
checkpoint_keep_interval_updates=1 # default 1
|
| 322 |
+
checkpoint_save_interval_updates=10000
|
| 323 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 3 ]]; then
|
| 324 |
+
echo "Config ${train_mode} ${config_option}"
|
| 325 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 326 |
+
task_load_clap_emb=true
|
| 327 |
+
task_load_source_file=true
|
| 328 |
+
task_load_mel_file=false
|
| 329 |
+
model_proj_type=2
|
| 330 |
+
model_clone_batch=4
|
| 331 |
+
dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
|
| 332 |
+
model_clap_loss=1.0
|
| 333 |
+
average_top_k_layers=12 # modify with model depth
|
| 334 |
+
model_add_conv=true
|
| 335 |
+
model_modalities_image_conv_option=3
|
| 336 |
+
model_depth=12 #
|
| 337 |
+
checkpoint_keep_interval_updates=1 # default 1
|
| 338 |
+
checkpoint_save_interval_updates=10000
|
| 339 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 4 ]]; then
|
| 340 |
+
echo "Config ${train_mode} ${config_option}"
|
| 341 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 342 |
+
task_load_clap_emb=true
|
| 343 |
+
task_load_source_file=true
|
| 344 |
+
task_load_mel_file=false
|
| 345 |
+
model_proj_type=2
|
| 346 |
+
model_clone_batch=4
|
| 347 |
+
dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
|
| 348 |
+
model_clap_loss=1.0
|
| 349 |
+
average_top_k_layers=12 # modify with model depth
|
| 350 |
+
model_add_conv=true
|
| 351 |
+
model_modalities_image_conv_option=4
|
| 352 |
+
model_depth=12 #
|
| 353 |
+
checkpoint_keep_interval_updates=1 # default 1
|
| 354 |
+
checkpoint_save_interval_updates=10000
|
| 355 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 5 ]]; then
|
| 356 |
+
echo "Config ${train_mode} ${config_option}"
|
| 357 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 358 |
+
task_load_clap_emb=true
|
| 359 |
+
task_load_source_file=true
|
| 360 |
+
task_load_mel_file=false
|
| 361 |
+
model_proj_type=2
|
| 362 |
+
model_clone_batch=4
|
| 363 |
+
dataset_batch_size=48 # original 48 oom on 4090 24G change distributed_world_size
|
| 364 |
+
model_clap_loss=1.0
|
| 365 |
+
average_top_k_layers=12 # modify with model depth
|
| 366 |
+
model_add_conv=true
|
| 367 |
+
model_modalities_image_conv_option=5
|
| 368 |
+
model_depth=12 #
|
| 369 |
+
checkpoint_keep_interval_updates=1 # default 1
|
| 370 |
+
checkpoint_save_interval_updates=10000
|
| 371 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 6 ]]; then
|
| 372 |
+
echo "Config ${train_mode} ${config_option}"
|
| 373 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 374 |
+
task_load_clap_emb=true
|
| 375 |
+
task_load_source_file=true
|
| 376 |
+
task_load_mel_file=false
|
| 377 |
+
model_proj_type=2
|
| 378 |
+
model_clone_batch=4
|
| 379 |
+
dataset_batch_size=48 # original 48 oom on 4090 24G change distributed_world_size
|
| 380 |
+
model_clap_loss=1.0
|
| 381 |
+
average_top_k_layers=12 # modify with model depth
|
| 382 |
+
model_add_conv=true
|
| 383 |
+
model_modalities_image_conv_option=6
|
| 384 |
+
model_depth=12 #
|
| 385 |
+
checkpoint_keep_interval_updates=1 # default 1
|
| 386 |
+
checkpoint_save_interval_updates=10000
|
| 387 |
+
fi
|
| 388 |
+
|
| 389 |
+
# Refuse to launch when no preset matched: every preset in the if/elif chain
# above assigns task_data, task_load_clap_emb and model_proj_type, and none of
# them has a default, so an empty value means that the train_mode/config_option
# pair selected nothing.
for required_var in task_data task_load_clap_emb model_proj_type; do
    if [[ -z "${!required_var:-}" ]]; then
        echo "No preset for train_mode='${train_mode:-}' config_option='${config_option:-}': ${required_var} is unset" >&2
        exit 1
    fi
done

# Launch EAT pre-training through fairseq's Hydra entry point.
#
# The overrides are collected in an array so that each one is passed as exactly
# one argument (no word splitting or globbing of the expanded values) and so
# that individual entries can carry comments.
# Overrides prefixed with '+' use Hydra's "append" syntax for keys that are
# added on top of the pretraining_AS2M config.
# NOTE(review): the "clap" 0-5 and "ast" presets do not assign
# task_load_source_file / task_load_mel_file and the default section does not
# define them either, so for those presets the two overrides expand to an
# empty value - confirm that this is intended.
train_args=(
    --config-dir ./EAT/config
    --config-name pretraining_AS2M
    common.user_dir=./EAT
    "checkpoint.save_dir=${checkpoint_save_dir}"
    "checkpoint.restore_file=${checkpoint_restore_file}"
    # First positional argument of the script selects the world size (default 2).
    "distributed_training.distributed_world_size=${1:-2}"
    dataset.num_workers=24
    dataset.data_buffer_size=48
    "dataset.batch_size=${dataset_batch_size}"
    "task.data=${task_data}"
    task.h5_format=False
    "task.load_clap_emb=${task_load_clap_emb}"
    "+task.load_source_file=${task_load_source_file}"
    "+task.load_mel_file=${task_load_mel_file}"
    "model.proj_type=${model_proj_type}"
    "model.clone_batch=${model_clone_batch}"
    "model.clap_loss=${model_clap_loss}"
    "model.average_top_k_layers=${average_top_k_layers}"
    "+model.add_conv=${model_add_conv}"
    "+model.clap_loss_type=${model_clap_loss_type}"
    "+model.clap_loss_layer=${model_clap_loss_layer}"
    "+model.dispersive_loss=${model_dispersive_loss}"
    "+model.dispersive_loss_layer=${model_dispersive_loss_layer}"
    "model.depth=${model_depth}"
    "+model.modalities.image.conv_option=${model_modalities_image_conv_option}"
    "checkpoint.keep_interval_updates=${checkpoint_keep_interval_updates}"
    "checkpoint.save_interval_updates=${checkpoint_save_interval_updates}"
)

# -m runs Hydra in multirun mode, exactly as the original command did.
python fairseq_cli/hydra_train.py -m "${train_args[@]}"
|
pre_4_AS2M/conv_clap_4_2025-09-30_07-37-48/pretraining_AS2M.sh
ADDED
|
@@ -0,0 +1,387 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# Experiment selector: the (train_mode, config_option) pair picks one
# hyper-parameter branch in the if/elif chain further down.
train_mode=conv_clap
config_option=4
# change world size

# Shared output locations: every launch writes into its own timestamped
# directory under SAVE_DIR_ROOT.
SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
launch_stamp=$(date +"%Y-%m-%d_%H-%M-%S")
checkpoint_save_dir="${SAVE_DIR_ROOT}/${train_mode}_${config_option}_${launch_stamp}"
checkpoint_restore_file="${checkpoint_save_dir}/checkpoint_last.pt"

# Absolute path of this script (symlinks resolved) and its file name.
script_path=$(readlink -f -- "${BASH_SOURCE[0]}")
script_name=${script_path##*/}

# Create the run directory and snapshot this script into it
# (cp -p keeps permissions and timestamps).
mkdir -p -- "${checkpoint_save_dir}"
cp -p -- "${script_path}" "${checkpoint_save_dir}/${script_name}"
printf 'script_path: %s\n' "${script_path}"
printf 'checkpoint_save_dir: %s\n' "${checkpoint_save_dir}"
|
| 20 |
+
|
| 21 |
+
# Baseline hyper-parameters. The experiment branch selected below overrides
# a subset of these; anything it does not touch keeps the value set here.

# -- batching --
dataset_batch_size=48
model_clone_batch=4

# -- encoder layout --
model_depth=12
average_top_k_layers=12
model_add_conv=false
model_modalities_image_conv_option=0

# -- CLAP loss (other loss types listed by the author: ce, cosine, l1) --
model_clap_loss=0
model_clap_loss_type=mse
model_clap_loss_layer=0

# -- dispersive loss --
model_dispersive_loss=0
model_dispersive_loss_layer=0

# -- checkpointing --
checkpoint_save_interval_updates=10000
checkpoint_keep_interval_updates=1 # TODO change this parameter if need
|
| 35 |
+
|
| 36 |
+
# Select the hyper-parameter set for the (train_mode, config_option) pair
# chosen at the top of the script. Each branch echoes which configuration it
# applies and then overrides the defaults it needs.
#
# Fix: the conv_clap 1, 2 and 3 branches were each followed by a stray `fi`,
# which closed the chain early and made the next `elif` a bash syntax error,
# so conv_clap 2-4 could never be reached. The chain now has a single
# closing `fi`.
# Fix: an unknown (train_mode, config_option) pair now stops the script with
# a message instead of falling through to the launch with task_data and
# related variables unset.
if [[ $train_mode == "default" && ${config_option} -eq 0 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=0
    checkpoint_keep_interval_updates=-1
elif [[ $train_mode == "disp" && ${config_option} -eq 0 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=4
    dataset_batch_size=96
    model_dispersive_loss=1
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
elif [[ $train_mode == "disp" && ${config_option} -eq 1 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=1
    dataset_batch_size=384
    model_dispersive_loss=1
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
elif [[ $train_mode == "disp" && ${config_option} -eq 2 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=1
    dataset_batch_size=384
    model_dispersive_loss=10.0
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
elif [[ $train_mode == "disp" && ${config_option} -eq 3 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=1
    dataset_batch_size=384
    model_dispersive_loss=100.0
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
elif [[ $train_mode == "disp" && ${config_option} -eq 4 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=1
    dataset_batch_size=384
    model_dispersive_loss=10000.0
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
elif [[ $train_mode == "disp" && ${config_option} -eq 5 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=1
    dataset_batch_size=384
    model_dispersive_loss=1000.0
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
elif [[ $train_mode == "disp" && ${config_option} -eq 6 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=4
    dataset_batch_size=96
    model_dispersive_loss=1000.0
    model_dispersive_loss_layer=10
    checkpoint_keep_interval_updates=1
elif [[ $train_mode == "clap" && ${config_option} -eq 0 ]]; then
    # NOTE(review): clap 0-5 and ast 0-3 never set task_load_source_file or
    # task_load_mel_file, so the launch receives empty values for them --
    # confirm that this is intended.
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=12
    model_add_conv=false
elif [[ $train_mode == "clap" && ${config_option} -eq 1 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=1
# loss type ablation
elif [[ $train_mode == "clap" && ${config_option} -eq 2 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="ce"
elif [[ $train_mode == "clap" && ${config_option} -eq 3 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="l1"
elif [[ $train_mode == "clap" && ${config_option} -eq 4 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="cosine"
# loss layer ablation
elif [[ $train_mode == "clap" && ${config_option} -eq 5 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="mse"
    model_clap_loss_layer=10
elif [[ $train_mode == "clap" && ${config_option} -eq 6 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="mse"
    model_clap_loss_layer=8
elif [[ $train_mode == "clap" && ${config_option} -eq 7 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="mse"
    model_clap_loss_layer=6
elif [[ $train_mode == "clap" && ${config_option} -eq 8 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    model_clap_loss=5.0
    dataset_batch_size=96
    average_top_k_layers=12
    model_clap_loss_type="mse"
    checkpoint_keep_interval_updates=-1
elif [[ $train_mode == "clap" && ${config_option} -eq 9 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    model_clap_loss=0.1
    dataset_batch_size=96
    average_top_k_layers=12
    model_clap_loss_type="mse"
    checkpoint_keep_interval_updates=-1
elif [[ $train_mode == "ast" && ${config_option} -eq 0 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
    task_load_clap_emb=true
    model_proj_type=4
    model_clone_batch=4
    model_clap_loss=1.0
    dataset_batch_size=48
elif [[ $train_mode == "ast" && ${config_option} -eq 1 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
    task_load_clap_emb=true
    model_proj_type=4
    model_clone_batch=4
    model_clap_loss=0.001
    dataset_batch_size=48
elif [[ $train_mode == "ast" && ${config_option} -eq 2 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
    task_load_clap_emb=true
    model_proj_type=4
    model_clone_batch=4
    model_clap_loss=0.01
    dataset_batch_size=48
elif [[ $train_mode == "ast" && ${config_option} -eq 3 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_out
    task_load_clap_emb=true
    model_proj_type=6
    model_clone_batch=4
    dataset_batch_size=48
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 0 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=64 # original 48 oom on 4090 24G change distributed_world_size
    model_clap_loss=1.0
    average_top_k_layers=11 # modify with model depth
    model_add_conv=true
    model_depth=11
    checkpoint_keep_interval_updates=-1 # default 1
    checkpoint_save_interval_updates=10000
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 1 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
    model_clap_loss=1.0
    average_top_k_layers=12 # modify with model depth
    model_add_conv=true
    model_modalities_image_conv_option=1
    model_depth=12
    checkpoint_keep_interval_updates=1 # default 1
    checkpoint_save_interval_updates=10000
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 2 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
    model_clap_loss=1.0
    average_top_k_layers=12 # modify with model depth
    model_add_conv=true
    model_modalities_image_conv_option=2
    model_depth=12
    checkpoint_keep_interval_updates=1 # default 1
    checkpoint_save_interval_updates=10000
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 3 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
    model_clap_loss=1.0
    average_top_k_layers=12 # modify with model depth
    model_add_conv=true
    model_modalities_image_conv_option=3
    model_depth=12
    checkpoint_keep_interval_updates=1 # default 1
    checkpoint_save_interval_updates=10000
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 4 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
    model_clap_loss=1.0
    average_top_k_layers=12 # modify with model depth
    model_add_conv=true
    model_modalities_image_conv_option=4
    model_depth=12
    checkpoint_keep_interval_updates=1 # default 1
    checkpoint_save_interval_updates=10000
else
    # No branch matched: stop here rather than launch with task_data and
    # the other per-experiment variables unset.
    echo "Unknown configuration: train_mode=${train_mode} config_option=${config_option}" >&2
    exit 1
fi
|
| 359 |
+
|
| 360 |
+
# Launch fairseq pre-training through its Hydra entry point.
#
# Every override is collected in an array and expanded with "${...[@]}" so
# that each one reaches Python as exactly one argument, even if a value ever
# contains whitespace or glob characters (the original left all expansions
# unquoted).
#
# Positional argument $1 selects the distributed world size (default: 2).
# Overrides prefixed with '+' use Hydra's "add a new key" syntax; presumably
# those keys are not declared in the base pretraining_AS2M config -- confirm
# against ./EAT/config.
hydra_overrides=(
    --config-dir ./EAT/config
    --config-name pretraining_AS2M
    common.user_dir=./EAT
    "checkpoint.save_dir=${checkpoint_save_dir}"
    "checkpoint.restore_file=${checkpoint_restore_file}"
    "distributed_training.distributed_world_size=${1:-2}"
    dataset.num_workers=24
    dataset.data_buffer_size=48
    "dataset.batch_size=${dataset_batch_size}"
    "task.data=${task_data}"
    task.h5_format=False
    "task.load_clap_emb=${task_load_clap_emb}"
    "+task.load_source_file=${task_load_source_file}"
    "+task.load_mel_file=${task_load_mel_file}"
    "model.proj_type=${model_proj_type}"
    "model.clone_batch=${model_clone_batch}"
    "model.clap_loss=${model_clap_loss}"
    "model.average_top_k_layers=${average_top_k_layers}"
    "+model.add_conv=${model_add_conv}"
    "+model.clap_loss_type=${model_clap_loss_type}"
    "+model.clap_loss_layer=${model_clap_loss_layer}"
    "+model.dispersive_loss=${model_dispersive_loss}"
    "+model.dispersive_loss_layer=${model_dispersive_loss_layer}"
    "model.depth=${model_depth}"
    "+model.modalities.image.conv_option=${model_modalities_image_conv_option}"
    "checkpoint.keep_interval_updates=${checkpoint_keep_interval_updates}"
    "checkpoint.save_interval_updates=${checkpoint_save_interval_updates}"
)

python fairseq_cli/hydra_train.py -m "${hydra_overrides[@]}"
|
pre_4_AS2M/conv_clap_4_2025-09-30_07-38-18/pretraining_AS2M.sh
ADDED
|
@@ -0,0 +1,384 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# Experiment selector: the (train_mode, config_option) pair picks one
# hyper-parameter branch in the if/elif chain further down.
train_mode=conv_clap
config_option=4
# change world size

# Shared output locations: every launch writes into its own timestamped
# directory under SAVE_DIR_ROOT.
SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
launch_stamp=$(date +"%Y-%m-%d_%H-%M-%S")
checkpoint_save_dir="${SAVE_DIR_ROOT}/${train_mode}_${config_option}_${launch_stamp}"
checkpoint_restore_file="${checkpoint_save_dir}/checkpoint_last.pt"

# Absolute path of this script (symlinks resolved) and its file name.
script_path=$(readlink -f -- "${BASH_SOURCE[0]}")
script_name=${script_path##*/}

# Create the run directory and snapshot this script into it
# (cp -p keeps permissions and timestamps).
mkdir -p -- "${checkpoint_save_dir}"
cp -p -- "${script_path}" "${checkpoint_save_dir}/${script_name}"
printf 'script_path: %s\n' "${script_path}"
printf 'checkpoint_save_dir: %s\n' "${checkpoint_save_dir}"
|
| 20 |
+
|
| 21 |
+
# Baseline hyper-parameters. The experiment branch selected below overrides
# a subset of these; anything it does not touch keeps the value set here.

# -- batching --
dataset_batch_size=48
model_clone_batch=4

# -- encoder layout --
model_depth=12
average_top_k_layers=12
model_add_conv=false
model_modalities_image_conv_option=0

# -- CLAP loss (other loss types listed by the author: ce, cosine, l1) --
model_clap_loss=0
model_clap_loss_type=mse
model_clap_loss_layer=0

# -- dispersive loss --
model_dispersive_loss=0
model_dispersive_loss_layer=0

# -- checkpointing --
checkpoint_save_interval_updates=10000
checkpoint_keep_interval_updates=1 # TODO change this parameter if need
|
| 35 |
+
|
| 36 |
+
if [[ $train_mode == "default" && ${config_option} -eq 0 ]]; then
|
| 37 |
+
echo "Config ${train_mode} ${config_option}"
|
| 38 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 39 |
+
task_load_clap_emb=false
|
| 40 |
+
task_load_source_file=true
|
| 41 |
+
task_load_mel_file=false
|
| 42 |
+
model_proj_type=null
|
| 43 |
+
model_clone_batch=4
|
| 44 |
+
dataset_batch_size=96
|
| 45 |
+
model_clap_loss=0
|
| 46 |
+
checkpoint_keep_interval_updates=-1
|
| 47 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 0 ]]; then
|
| 48 |
+
echo "Config ${train_mode} ${config_option}"
|
| 49 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 50 |
+
task_load_clap_emb=false
|
| 51 |
+
task_load_source_file=true
|
| 52 |
+
task_load_mel_file=false
|
| 53 |
+
model_proj_type=null
|
| 54 |
+
model_clone_batch=4
|
| 55 |
+
dataset_batch_size=96
|
| 56 |
+
model_dispersive_loss=1
|
| 57 |
+
model_dispersive_loss_layer=0
|
| 58 |
+
checkpoint_keep_interval_updates=1
|
| 59 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 1 ]]; then
|
| 60 |
+
echo "Config ${train_mode} ${config_option}"
|
| 61 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 62 |
+
task_load_clap_emb=false
|
| 63 |
+
task_load_source_file=true
|
| 64 |
+
task_load_mel_file=false
|
| 65 |
+
model_proj_type=null
|
| 66 |
+
model_clone_batch=1
|
| 67 |
+
dataset_batch_size=384
|
| 68 |
+
model_dispersive_loss=1
|
| 69 |
+
model_dispersive_loss_layer=0
|
| 70 |
+
checkpoint_keep_interval_updates=1
|
| 71 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 2 ]]; then
|
| 72 |
+
echo "Config ${train_mode} ${config_option}"
|
| 73 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 74 |
+
task_load_clap_emb=false
|
| 75 |
+
task_load_source_file=true
|
| 76 |
+
task_load_mel_file=false
|
| 77 |
+
model_proj_type=null
|
| 78 |
+
model_clone_batch=1
|
| 79 |
+
dataset_batch_size=384
|
| 80 |
+
model_dispersive_loss=10.0
|
| 81 |
+
model_dispersive_loss_layer=0
|
| 82 |
+
checkpoint_keep_interval_updates=1
|
| 83 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 3 ]]; then
|
| 84 |
+
echo "Config ${train_mode} ${config_option}"
|
| 85 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 86 |
+
task_load_clap_emb=false
|
| 87 |
+
task_load_source_file=true
|
| 88 |
+
task_load_mel_file=false
|
| 89 |
+
model_proj_type=null
|
| 90 |
+
model_clone_batch=1
|
| 91 |
+
dataset_batch_size=384
|
| 92 |
+
model_dispersive_loss=100.0
|
| 93 |
+
model_dispersive_loss_layer=0
|
| 94 |
+
checkpoint_keep_interval_updates=1
|
| 95 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 4 ]]; then
|
| 96 |
+
echo "Config ${train_mode} ${config_option}"
|
| 97 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 98 |
+
task_load_clap_emb=false
|
| 99 |
+
task_load_source_file=true
|
| 100 |
+
task_load_mel_file=false
|
| 101 |
+
model_proj_type=null
|
| 102 |
+
model_clone_batch=1
|
| 103 |
+
dataset_batch_size=384
|
| 104 |
+
model_dispersive_loss=10000.0
|
| 105 |
+
model_dispersive_loss_layer=0
|
| 106 |
+
checkpoint_keep_interval_updates=1
|
| 107 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 5 ]]; then
|
| 108 |
+
echo "Config ${train_mode} ${config_option}"
|
| 109 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 110 |
+
task_load_clap_emb=false
|
| 111 |
+
task_load_source_file=true
|
| 112 |
+
task_load_mel_file=false
|
| 113 |
+
model_proj_type=null
|
| 114 |
+
model_clone_batch=1
|
| 115 |
+
dataset_batch_size=384
|
| 116 |
+
model_dispersive_loss=1000.0
|
| 117 |
+
model_dispersive_loss_layer=0
|
| 118 |
+
checkpoint_keep_interval_updates=1
|
| 119 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 6 ]]; then
|
| 120 |
+
echo "Config ${train_mode} ${config_option}"
|
| 121 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 122 |
+
task_load_clap_emb=false
|
| 123 |
+
task_load_source_file=true
|
| 124 |
+
task_load_mel_file=false
|
| 125 |
+
model_proj_type=null
|
| 126 |
+
model_clone_batch=4
|
| 127 |
+
dataset_batch_size=96
|
| 128 |
+
model_dispersive_loss=1000.0
|
| 129 |
+
model_dispersive_loss_layer=10
|
| 130 |
+
checkpoint_keep_interval_updates=1
|
| 131 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 0 ]]; then
|
| 132 |
+
echo "Config ${train_mode} ${config_option}"
|
| 133 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 134 |
+
task_load_clap_emb=true
|
| 135 |
+
model_proj_type=2
|
| 136 |
+
model_clone_batch=4
|
| 137 |
+
dataset_batch_size=48
|
| 138 |
+
model_clap_loss=1.0
|
| 139 |
+
average_top_k_layers=12
|
| 140 |
+
model_add_conv=false
|
| 141 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 1 ]]; then
|
| 142 |
+
echo "Config ${train_mode} ${config_option}"
|
| 143 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 144 |
+
task_load_clap_emb=true
|
| 145 |
+
model_proj_type=2
|
| 146 |
+
model_clone_batch=4
|
| 147 |
+
dataset_batch_size=48
|
| 148 |
+
model_clap_loss=1.0
|
| 149 |
+
average_top_k_layers=1
|
| 150 |
+
# loss type ablation
|
| 151 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 2 ]]; then
|
| 152 |
+
echo "Config ${train_mode} ${config_option}"
|
| 153 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 154 |
+
task_load_clap_emb=true
|
| 155 |
+
model_proj_type=2
|
| 156 |
+
model_clone_batch=4
|
| 157 |
+
dataset_batch_size=48
|
| 158 |
+
model_clap_loss=1.0
|
| 159 |
+
average_top_k_layers=12
|
| 160 |
+
model_clap_loss_type="ce"
|
| 161 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 3 ]]; then
|
| 162 |
+
echo "Config ${train_mode} ${config_option}"
|
| 163 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 164 |
+
task_load_clap_emb=true
|
| 165 |
+
model_proj_type=2
|
| 166 |
+
model_clone_batch=4
|
| 167 |
+
dataset_batch_size=48
|
| 168 |
+
model_clap_loss=1.0
|
| 169 |
+
average_top_k_layers=12
|
| 170 |
+
model_clap_loss_type="l1"
|
| 171 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 4 ]]; then
|
| 172 |
+
echo "Config ${train_mode} ${config_option}"
|
| 173 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 174 |
+
task_load_clap_emb=true
|
| 175 |
+
model_proj_type=2
|
| 176 |
+
model_clone_batch=4
|
| 177 |
+
dataset_batch_size=96
|
| 178 |
+
model_clap_loss=1.0
|
| 179 |
+
average_top_k_layers=12
|
| 180 |
+
model_clap_loss_type="cosine"
|
| 181 |
+
# loss layer ablation
|
| 182 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 5 ]]; then
|
| 183 |
+
echo "Config ${train_mode} ${config_option}"
|
| 184 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 185 |
+
task_load_clap_emb=true
|
| 186 |
+
model_proj_type=2
|
| 187 |
+
model_clone_batch=4
|
| 188 |
+
dataset_batch_size=96
|
| 189 |
+
model_clap_loss=1.0
|
| 190 |
+
average_top_k_layers=12
|
| 191 |
+
model_clap_loss_type="mse"
|
| 192 |
+
model_clap_loss_layer=10
|
| 193 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 6 ]]; then
|
| 194 |
+
echo "Config ${train_mode} ${config_option}"
|
| 195 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 196 |
+
task_load_clap_emb=true
|
| 197 |
+
task_load_source_file=true
|
| 198 |
+
task_load_mel_file=false
|
| 199 |
+
model_proj_type=2
|
| 200 |
+
model_clone_batch=4
|
| 201 |
+
dataset_batch_size=96
|
| 202 |
+
model_clap_loss=1.0
|
| 203 |
+
average_top_k_layers=12
|
| 204 |
+
model_clap_loss_type="mse"
|
| 205 |
+
model_clap_loss_layer=8
|
| 206 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 7 ]]; then
|
| 207 |
+
echo "Config ${train_mode} ${config_option}"
|
| 208 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 209 |
+
task_load_clap_emb=true
|
| 210 |
+
task_load_source_file=true
|
| 211 |
+
task_load_mel_file=false
|
| 212 |
+
model_proj_type=2
|
| 213 |
+
model_clone_batch=4
|
| 214 |
+
dataset_batch_size=96
|
| 215 |
+
model_clap_loss=1.0
|
| 216 |
+
average_top_k_layers=12
|
| 217 |
+
model_clap_loss_type="mse"
|
| 218 |
+
model_clap_loss_layer=6
|
| 219 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 8 ]]; then
|
| 220 |
+
echo "Config ${train_mode} ${config_option}"
|
| 221 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 222 |
+
task_load_clap_emb=true
|
| 223 |
+
task_load_source_file=true
|
| 224 |
+
task_load_mel_file=false
|
| 225 |
+
model_proj_type=2
|
| 226 |
+
model_clone_batch=4
|
| 227 |
+
model_clap_loss=5.0
|
| 228 |
+
dataset_batch_size=96
|
| 229 |
+
average_top_k_layers=12
|
| 230 |
+
model_clap_loss_type="mse"
|
| 231 |
+
checkpoint_keep_interval_updates=-1
|
| 232 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 9 ]]; then
|
| 233 |
+
echo "Config ${train_mode} ${config_option}"
|
| 234 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 235 |
+
task_load_clap_emb=true
|
| 236 |
+
task_load_source_file=true
|
| 237 |
+
task_load_mel_file=false
|
| 238 |
+
model_proj_type=2
|
| 239 |
+
model_clone_batch=4
|
| 240 |
+
model_clap_loss=0.1
|
| 241 |
+
dataset_batch_size=96
|
| 242 |
+
average_top_k_layers=12
|
| 243 |
+
model_clap_loss_type="mse"
|
| 244 |
+
checkpoint_keep_interval_updates=-1
|
| 245 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 0 ]]; then
|
| 246 |
+
echo "Config ${train_mode} ${config_option}"
|
| 247 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
|
| 248 |
+
task_load_clap_emb=true
|
| 249 |
+
model_proj_type=4
|
| 250 |
+
model_clone_batch=4
|
| 251 |
+
model_clap_loss=1.0
|
| 252 |
+
dataset_batch_size=48
|
| 253 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 1 ]]; then
|
| 254 |
+
echo "Config ${train_mode} ${config_option}"
|
| 255 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
|
| 256 |
+
task_load_clap_emb=true
|
| 257 |
+
model_proj_type=4
|
| 258 |
+
model_clone_batch=4
|
| 259 |
+
model_clap_loss=0.001
|
| 260 |
+
dataset_batch_size=48
|
| 261 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 2 ]]; then
|
| 262 |
+
echo "Config ${train_mode} ${config_option}"
|
| 263 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
|
| 264 |
+
task_load_clap_emb=true
|
| 265 |
+
model_proj_type=4
|
| 266 |
+
model_clone_batch=4
|
| 267 |
+
model_clap_loss=0.01
|
| 268 |
+
dataset_batch_size=48
|
| 269 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 3 ]]; then
|
| 270 |
+
echo "Config ${train_mode} ${config_option}"
|
| 271 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_out
|
| 272 |
+
task_load_clap_emb=true
|
| 273 |
+
model_proj_type=6
|
| 274 |
+
model_clone_batch=4
|
| 275 |
+
dataset_batch_size=48
|
| 276 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 0 ]]; then
|
| 277 |
+
echo "Config ${train_mode} ${config_option}"
|
| 278 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 279 |
+
task_load_clap_emb=true
|
| 280 |
+
task_load_source_file=true
|
| 281 |
+
task_load_mel_file=false
|
| 282 |
+
model_proj_type=2
|
| 283 |
+
model_clone_batch=4
|
| 284 |
+
dataset_batch_size=64 # original 48 oom on 4090 24G change distributed_world_size
|
| 285 |
+
model_clap_loss=1.0
|
| 286 |
+
average_top_k_layers=11 # modify with model depth
|
| 287 |
+
model_add_conv=true
|
| 288 |
+
model_depth=11 #
|
| 289 |
+
checkpoint_keep_interval_updates=-1 # default 1
|
| 290 |
+
checkpoint_save_interval_updates=10000
|
| 291 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 1 ]]; then
|
| 292 |
+
echo "Config ${train_mode} ${config_option}"
|
| 293 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 294 |
+
task_load_clap_emb=true
|
| 295 |
+
task_load_source_file=true
|
| 296 |
+
task_load_mel_file=false
|
| 297 |
+
model_proj_type=2
|
| 298 |
+
model_clone_batch=4
|
| 299 |
+
dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
|
| 300 |
+
model_clap_loss=1.0
|
| 301 |
+
average_top_k_layers=12 # modify with model depth
|
| 302 |
+
model_add_conv=true
|
| 303 |
+
model_modalities_image_conv_option=1
|
| 304 |
+
model_depth=12 #
|
| 305 |
+
checkpoint_keep_interval_updates=1 # default 1
|
| 306 |
+
checkpoint_save_interval_updates=10000
|
| 307 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 2 ]]; then
|
| 308 |
+
echo "Config ${train_mode} ${config_option}"
|
| 309 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 310 |
+
task_load_clap_emb=true
|
| 311 |
+
task_load_source_file=true
|
| 312 |
+
task_load_mel_file=false
|
| 313 |
+
model_proj_type=2
|
| 314 |
+
model_clone_batch=4
|
| 315 |
+
dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
|
| 316 |
+
model_clap_loss=1.0
|
| 317 |
+
average_top_k_layers=12 # modify with model depth
|
| 318 |
+
model_add_conv=true
|
| 319 |
+
model_modalities_image_conv_option=2
|
| 320 |
+
model_depth=12 #
|
| 321 |
+
checkpoint_keep_interval_updates=1 # default 1
|
| 322 |
+
checkpoint_save_interval_updates=10000
|
| 323 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 3 ]]; then
|
| 324 |
+
echo "Config ${train_mode} ${config_option}"
|
| 325 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 326 |
+
task_load_clap_emb=true
|
| 327 |
+
task_load_source_file=true
|
| 328 |
+
task_load_mel_file=false
|
| 329 |
+
model_proj_type=2
|
| 330 |
+
model_clone_batch=4
|
| 331 |
+
dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
|
| 332 |
+
model_clap_loss=1.0
|
| 333 |
+
average_top_k_layers=12 # modify with model depth
|
| 334 |
+
model_add_conv=true
|
| 335 |
+
model_modalities_image_conv_option=3
|
| 336 |
+
model_depth=12 #
|
| 337 |
+
checkpoint_keep_interval_updates=1 # default 1
|
| 338 |
+
checkpoint_save_interval_updates=10000
|
| 339 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 4 ]]; then
|
| 340 |
+
echo "Config ${train_mode} ${config_option}"
|
| 341 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 342 |
+
task_load_clap_emb=true
|
| 343 |
+
task_load_source_file=true
|
| 344 |
+
task_load_mel_file=false
|
| 345 |
+
model_proj_type=2
|
| 346 |
+
model_clone_batch=4
|
| 347 |
+
dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
|
| 348 |
+
model_clap_loss=1.0
|
| 349 |
+
average_top_k_layers=12 # modify with model depth
|
| 350 |
+
model_add_conv=true
|
| 351 |
+
model_modalities_image_conv_option=4
|
| 352 |
+
model_depth=12 #
|
| 353 |
+
checkpoint_keep_interval_updates=1 # default 1
|
| 354 |
+
checkpoint_save_interval_updates=10000
|
| 355 |
+
fi
|
| 356 |
+
|
| 357 |
+
python fairseq_cli/hydra_train.py -m \
|
| 358 |
+
--config-dir ./EAT/config \
|
| 359 |
+
--config-name pretraining_AS2M \
|
| 360 |
+
common.user_dir=./EAT \
|
| 361 |
+
checkpoint.save_dir=${checkpoint_save_dir} \
|
| 362 |
+
checkpoint.restore_file=${checkpoint_restore_file} \
|
| 363 |
+
distributed_training.distributed_world_size=${1:-2} \
|
| 364 |
+
dataset.num_workers=24 \
|
| 365 |
+
dataset.data_buffer_size=48 \
|
| 366 |
+
dataset.batch_size=${dataset_batch_size} \
|
| 367 |
+
task.data=${task_data} \
|
| 368 |
+
task.h5_format=False \
|
| 369 |
+
task.load_clap_emb=${task_load_clap_emb} \
|
| 370 |
+
+task.load_source_file=${task_load_source_file} \
|
| 371 |
+
+task.load_mel_file=${task_load_mel_file} \
|
| 372 |
+
model.proj_type=${model_proj_type} \
|
| 373 |
+
model.clone_batch=${model_clone_batch} \
|
| 374 |
+
model.clap_loss=${model_clap_loss} \
|
| 375 |
+
model.average_top_k_layers=${average_top_k_layers} \
|
| 376 |
+
+model.add_conv=${model_add_conv} \
|
| 377 |
+
+model.clap_loss_type=${model_clap_loss_type} \
|
| 378 |
+
+model.clap_loss_layer=${model_clap_loss_layer} \
|
| 379 |
+
+model.dispersive_loss=${model_dispersive_loss} \
|
| 380 |
+
+model.dispersive_loss_layer=${model_dispersive_loss_layer} \
|
| 381 |
+
model.depth=${model_depth} \
|
| 382 |
+
+model.modalities.image.conv_option=${model_modalities_image_conv_option} \
|
| 383 |
+
checkpoint.keep_interval_updates=${checkpoint_keep_interval_updates} \
|
| 384 |
+
checkpoint.save_interval_updates=${checkpoint_save_interval_updates}
|
pre_4_AS2M/conv_clap_4_2025-09-30_07-42-31/pretraining_AS2M.sh
ADDED
|
@@ -0,0 +1,384 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# Launcher for EAT AS2M pre-training.
#
# The experiment is selected by the (train_mode, config_option) pair below.
# The distributed world size is not set here: it is read from the first
# positional argument where the training command is built at the end of the
# script.

# ---- experiment selection --------------------------------------------------
train_mode="conv_clap"
config_option="4"

# ---- snapshot bookkeeping --------------------------------------------------
# Absolute path of this script (symlinks resolved) and its file name.
script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
script_name="$(basename -- "$script_path")"

# ---- shared config ---------------------------------------------------------
# Every launch gets its own directory: <root>/<mode>_<option>_<timestamp>.
SAVE_DIR_ROOT="/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M"
run_stamp="$(date +"%Y-%m-%d_%H-%M-%S")"
checkpoint_save_dir="${SAVE_DIR_ROOT}/${train_mode}_${config_option}_${run_stamp}"
# NOTE(review): the directory name embeds the launch time, so this file will
# normally not exist yet on a first launch -- confirm the trainer tolerates a
# missing restore file.
checkpoint_restore_file="${checkpoint_save_dir}/checkpoint_last.pt"

# Create the run directory and store a copy of this script next to the
# checkpoints (cp -p keeps mode and timestamps).
mkdir -p -- "$checkpoint_save_dir"
cp -p -- "$script_path" "$checkpoint_save_dir/$script_name"

echo "script_path: ${script_path}"
echo "checkpoint_save_dir: ${checkpoint_save_dir}"
|
| 20 |
+
|
| 21 |
+
# Default settings. The config branch selected further down overrides only the
# values it needs; everything else keeps the value assigned here.

# -- batching
dataset_batch_size=48
model_clone_batch=4

# -- model shape
model_depth=12
average_top_k_layers=12
model_add_conv=false
model_modalities_image_conv_option=0

# -- CLAP loss (loss types used by the configs: mse, ce, cosine, l1)
model_clap_loss=0
model_clap_loss_type=mse
model_clap_loss_layer=0

# -- dispersive loss
model_dispersive_loss=0
model_dispersive_loss_layer=0

# -- checkpointing
# TODO: change checkpoint_keep_interval_updates if needed.
checkpoint_keep_interval_updates=1
checkpoint_save_interval_updates=10000
|
| 35 |
+
|
| 36 |
+
# Select the experiment settings for the chosen (train_mode, config_option)
# pair. Each branch overrides a subset of the defaults and sets the values
# that have no default (task_data, task_load_clap_emb, model_proj_type).
#
# NOTE(review): task_load_source_file / task_load_mel_file are only set by
# some branches (not by clap 0-5 or ast 0-3); in those configs they are left
# unset -- confirm that is intended.
#
# An unknown pair aborts the script instead of launching training with
# unset values.

# ---- baseline (no auxiliary loss) -------------------------------------------
if [[ $train_mode == "default" && ${config_option} -eq 0 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=0
    checkpoint_keep_interval_updates=-1

# ---- dispersive-loss runs ---------------------------------------------------
elif [[ $train_mode == "disp" && ${config_option} -eq 0 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=4
    dataset_batch_size=96
    model_dispersive_loss=1
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
elif [[ $train_mode == "disp" && ${config_option} -eq 1 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=1
    dataset_batch_size=384
    model_dispersive_loss=1
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
elif [[ $train_mode == "disp" && ${config_option} -eq 2 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=1
    dataset_batch_size=384
    model_dispersive_loss=10.0
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
elif [[ $train_mode == "disp" && ${config_option} -eq 3 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=1
    dataset_batch_size=384
    model_dispersive_loss=100.0
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
elif [[ $train_mode == "disp" && ${config_option} -eq 4 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=1
    dataset_batch_size=384
    model_dispersive_loss=10000.0
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
elif [[ $train_mode == "disp" && ${config_option} -eq 5 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=1
    dataset_batch_size=384
    model_dispersive_loss=1000.0
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
elif [[ $train_mode == "disp" && ${config_option} -eq 6 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=4
    dataset_batch_size=96
    model_dispersive_loss=1000.0
    model_dispersive_loss_layer=10
    checkpoint_keep_interval_updates=1

# ---- CLAP-embedding runs ----------------------------------------------------
elif [[ $train_mode == "clap" && ${config_option} -eq 0 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=12
    model_add_conv=false
elif [[ $train_mode == "clap" && ${config_option} -eq 1 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=1
# loss type ablation
elif [[ $train_mode == "clap" && ${config_option} -eq 2 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="ce"
elif [[ $train_mode == "clap" && ${config_option} -eq 3 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="l1"
elif [[ $train_mode == "clap" && ${config_option} -eq 4 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="cosine"
# loss layer ablation
elif [[ $train_mode == "clap" && ${config_option} -eq 5 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="mse"
    model_clap_loss_layer=10
elif [[ $train_mode == "clap" && ${config_option} -eq 6 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="mse"
    model_clap_loss_layer=8
elif [[ $train_mode == "clap" && ${config_option} -eq 7 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="mse"
    model_clap_loss_layer=6
# CLAP loss weight variants (5.0 / 0.1); keep_interval_updates is -1 here
elif [[ $train_mode == "clap" && ${config_option} -eq 8 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    model_clap_loss=5.0
    dataset_batch_size=96
    average_top_k_layers=12
    model_clap_loss_type="mse"
    checkpoint_keep_interval_updates=-1
elif [[ $train_mode == "clap" && ${config_option} -eq 9 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    model_clap_loss=0.1
    dataset_batch_size=96
    average_top_k_layers=12
    model_clap_loss_type="mse"
    checkpoint_keep_interval_updates=-1

# ---- AST-embedding runs (data under PRETRAIN_AS2M_w_AST) --------------------
elif [[ $train_mode == "ast" && ${config_option} -eq 0 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
    task_load_clap_emb=true
    model_proj_type=4
    model_clone_batch=4
    model_clap_loss=1.0
    dataset_batch_size=48
elif [[ $train_mode == "ast" && ${config_option} -eq 1 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
    task_load_clap_emb=true
    model_proj_type=4
    model_clone_batch=4
    model_clap_loss=0.001
    dataset_batch_size=48
elif [[ $train_mode == "ast" && ${config_option} -eq 2 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
    task_load_clap_emb=true
    model_proj_type=4
    model_clone_batch=4
    model_clap_loss=0.01
    dataset_batch_size=48
elif [[ $train_mode == "ast" && ${config_option} -eq 3 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_out
    task_load_clap_emb=true
    model_proj_type=6
    model_clone_batch=4
    dataset_batch_size=48

# ---- conv + CLAP runs -------------------------------------------------------
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 0 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=64 # author's note: original 48, OOM on 4090 24G, change distributed_world_size
    model_clap_loss=1.0
    average_top_k_layers=11 # modify together with model_depth
    model_add_conv=true
    model_depth=11
    checkpoint_keep_interval_updates=-1 # default 1
    checkpoint_save_interval_updates=10000
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 1 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96 # author's note: original 48, OOM on 4090 24G, change distributed_world_size
    model_clap_loss=1.0
    average_top_k_layers=12 # modify together with model_depth
    model_add_conv=true
    model_modalities_image_conv_option=1
    model_depth=12
    checkpoint_keep_interval_updates=1 # default 1
    checkpoint_save_interval_updates=10000
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 2 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96 # author's note: original 48, OOM on 4090 24G, change distributed_world_size
    model_clap_loss=1.0
    average_top_k_layers=12 # modify together with model_depth
    model_add_conv=true
    model_modalities_image_conv_option=2
    model_depth=12
    checkpoint_keep_interval_updates=1 # default 1
    checkpoint_save_interval_updates=10000
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 3 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96 # author's note: original 48, OOM on 4090 24G, change distributed_world_size
    model_clap_loss=1.0
    average_top_k_layers=12 # modify together with model_depth
    model_add_conv=true
    model_modalities_image_conv_option=3
    model_depth=12
    checkpoint_keep_interval_updates=1 # default 1
    checkpoint_save_interval_updates=10000
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 4 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96 # author's note: original 48, OOM on 4090 24G, change distributed_world_size
    model_clap_loss=1.0
    average_top_k_layers=12 # modify together with model_depth
    model_add_conv=true
    model_modalities_image_conv_option=4
    model_depth=12
    checkpoint_keep_interval_updates=1 # default 1
    checkpoint_save_interval_updates=10000

# ---- no matching configuration ----------------------------------------------
else
    # Without this branch the script would go on to start training with
    # task_data, task_load_clap_emb and model_proj_type unset.
    echo "Unknown config: train_mode='${train_mode}' config_option='${config_option}'" >&2
    exit 1
fi
|
| 356 |
+
|
| 357 |
+
# Start training through fairseq's Hydra entry point, passing the settings
# chosen above as command-line overrides.
#
# * $1 (optional) is the distributed world size; it defaults to 2.
# * Overrides written as "+key=value" use Hydra's syntax for adding a key to
#   the config; the plain "key=value" form overrides an existing key.
# * All expansions are quoted so each override is passed as exactly one
#   argument regardless of the value's content.
# * task_data, task_load_clap_emb and model_proj_type have no default and are
#   set by every config branch; the ${var:?} guards abort the launch with a
#   message if no branch ran, instead of starting a job with empty values.
# NOTE(review): task_load_source_file and task_load_mel_file are not set by
# every config branch (e.g. clap 0-5, ast 0-3), so they can expand to an empty
# value here -- confirm what the trainer should receive in that case.
python fairseq_cli/hydra_train.py -m \
    --config-dir ./EAT/config \
    --config-name pretraining_AS2M \
    common.user_dir=./EAT \
    checkpoint.save_dir="${checkpoint_save_dir}" \
    checkpoint.restore_file="${checkpoint_restore_file}" \
    distributed_training.distributed_world_size="${1:-2}" \
    dataset.num_workers=24 \
    dataset.data_buffer_size=48 \
    dataset.batch_size="${dataset_batch_size}" \
    task.data="${task_data:?task_data is not set; check train_mode/config_option}" \
    task.h5_format=False \
    task.load_clap_emb="${task_load_clap_emb:?task_load_clap_emb is not set; check train_mode/config_option}" \
    +task.load_source_file="${task_load_source_file}" \
    +task.load_mel_file="${task_load_mel_file}" \
    model.proj_type="${model_proj_type:?model_proj_type is not set; check train_mode/config_option}" \
    model.clone_batch="${model_clone_batch}" \
    model.clap_loss="${model_clap_loss}" \
    model.average_top_k_layers="${average_top_k_layers}" \
    +model.add_conv="${model_add_conv}" \
    +model.clap_loss_type="${model_clap_loss_type}" \
    +model.clap_loss_layer="${model_clap_loss_layer}" \
    +model.dispersive_loss="${model_dispersive_loss}" \
    +model.dispersive_loss_layer="${model_dispersive_loss_layer}" \
    model.depth="${model_depth}" \
    +model.modalities.image.conv_option="${model_modalities_image_conv_option}" \
    checkpoint.keep_interval_updates="${checkpoint_keep_interval_updates}" \
    checkpoint.save_interval_updates="${checkpoint_save_interval_updates}"
|
pre_4_AS2M/conv_clap_4_2025-09-30_07-45-39/pretraining_AS2M.sh
ADDED
|
@@ -0,0 +1,384 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
# config options
|
| 3 |
+
train_mode=conv_clap
|
| 4 |
+
config_option=4
|
| 5 |
+
# change world size
|
| 6 |
+
|
| 7 |
+
# shared config
|
| 8 |
+
SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
|
| 9 |
+
checkpoint_save_dir=${SAVE_DIR_ROOT}/${train_mode}_${config_option}_$(date +"%Y-%m-%d_%H-%M-%S")
|
| 10 |
+
checkpoint_restore_file=${checkpoint_save_dir}/checkpoint_last.pt
|
| 11 |
+
|
| 12 |
+
# 脚本自身的绝对路径与文件名(解析符号链接)
|
| 13 |
+
script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
|
| 14 |
+
script_name="$(basename -- "$script_path")"
|
| 15 |
+
# 创建目录并拷贝(保留权限与时间戳)
|
| 16 |
+
mkdir -p -- "$checkpoint_save_dir"
|
| 17 |
+
cp -p -- "$script_path" "$checkpoint_save_dir/$script_name"
|
| 18 |
+
echo "script_path: ${script_path}"
|
| 19 |
+
echo "checkpoint_save_dir: ${checkpoint_save_dir}"
|
| 20 |
+
|
| 21 |
+
# default setting
|
| 22 |
+
model_clone_batch=4
|
| 23 |
+
dataset_batch_size=48
|
| 24 |
+
model_clap_loss=0
|
| 25 |
+
model_clap_loss_type="mse" # option ce cosine l1
|
| 26 |
+
model_clap_loss_layer=0
|
| 27 |
+
average_top_k_layers=12
|
| 28 |
+
model_add_conv=false
|
| 29 |
+
model_depth=12
|
| 30 |
+
model_dispersive_loss=0
|
| 31 |
+
model_dispersive_loss_layer=0
|
| 32 |
+
checkpoint_keep_interval_updates=1 # TODO change this parameter if need
|
| 33 |
+
checkpoint_save_interval_updates=10000
|
| 34 |
+
model_modalities_image_conv_option=0
|
| 35 |
+
|
| 36 |
+
if [[ $train_mode == "default" && ${config_option} -eq 0 ]]; then
|
| 37 |
+
echo "Config ${train_mode} ${config_option}"
|
| 38 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 39 |
+
task_load_clap_emb=false
|
| 40 |
+
task_load_source_file=true
|
| 41 |
+
task_load_mel_file=false
|
| 42 |
+
model_proj_type=null
|
| 43 |
+
model_clone_batch=4
|
| 44 |
+
dataset_batch_size=96
|
| 45 |
+
model_clap_loss=0
|
| 46 |
+
checkpoint_keep_interval_updates=-1
|
| 47 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 0 ]]; then
|
| 48 |
+
echo "Config ${train_mode} ${config_option}"
|
| 49 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 50 |
+
task_load_clap_emb=false
|
| 51 |
+
task_load_source_file=true
|
| 52 |
+
task_load_mel_file=false
|
| 53 |
+
model_proj_type=null
|
| 54 |
+
model_clone_batch=4
|
| 55 |
+
dataset_batch_size=96
|
| 56 |
+
model_dispersive_loss=1
|
| 57 |
+
model_dispersive_loss_layer=0
|
| 58 |
+
checkpoint_keep_interval_updates=1
|
| 59 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 1 ]]; then
|
| 60 |
+
echo "Config ${train_mode} ${config_option}"
|
| 61 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 62 |
+
task_load_clap_emb=false
|
| 63 |
+
task_load_source_file=true
|
| 64 |
+
task_load_mel_file=false
|
| 65 |
+
model_proj_type=null
|
| 66 |
+
model_clone_batch=1
|
| 67 |
+
dataset_batch_size=384
|
| 68 |
+
model_dispersive_loss=1
|
| 69 |
+
model_dispersive_loss_layer=0
|
| 70 |
+
checkpoint_keep_interval_updates=1
|
| 71 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 2 ]]; then
|
| 72 |
+
echo "Config ${train_mode} ${config_option}"
|
| 73 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 74 |
+
task_load_clap_emb=false
|
| 75 |
+
task_load_source_file=true
|
| 76 |
+
task_load_mel_file=false
|
| 77 |
+
model_proj_type=null
|
| 78 |
+
model_clone_batch=1
|
| 79 |
+
dataset_batch_size=384
|
| 80 |
+
model_dispersive_loss=10.0
|
| 81 |
+
model_dispersive_loss_layer=0
|
| 82 |
+
checkpoint_keep_interval_updates=1
|
| 83 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 3 ]]; then
|
| 84 |
+
echo "Config ${train_mode} ${config_option}"
|
| 85 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 86 |
+
task_load_clap_emb=false
|
| 87 |
+
task_load_source_file=true
|
| 88 |
+
task_load_mel_file=false
|
| 89 |
+
model_proj_type=null
|
| 90 |
+
model_clone_batch=1
|
| 91 |
+
dataset_batch_size=384
|
| 92 |
+
model_dispersive_loss=100.0
|
| 93 |
+
model_dispersive_loss_layer=0
|
| 94 |
+
checkpoint_keep_interval_updates=1
|
| 95 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 4 ]]; then
|
| 96 |
+
echo "Config ${train_mode} ${config_option}"
|
| 97 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 98 |
+
task_load_clap_emb=false
|
| 99 |
+
task_load_source_file=true
|
| 100 |
+
task_load_mel_file=false
|
| 101 |
+
model_proj_type=null
|
| 102 |
+
model_clone_batch=1
|
| 103 |
+
dataset_batch_size=384
|
| 104 |
+
model_dispersive_loss=10000.0
|
| 105 |
+
model_dispersive_loss_layer=0
|
| 106 |
+
checkpoint_keep_interval_updates=1
|
| 107 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 5 ]]; then
|
| 108 |
+
echo "Config ${train_mode} ${config_option}"
|
| 109 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 110 |
+
task_load_clap_emb=false
|
| 111 |
+
task_load_source_file=true
|
| 112 |
+
task_load_mel_file=false
|
| 113 |
+
model_proj_type=null
|
| 114 |
+
model_clone_batch=1
|
| 115 |
+
dataset_batch_size=384
|
| 116 |
+
model_dispersive_loss=1000.0
|
| 117 |
+
model_dispersive_loss_layer=0
|
| 118 |
+
checkpoint_keep_interval_updates=1
|
| 119 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 6 ]]; then
|
| 120 |
+
echo "Config ${train_mode} ${config_option}"
|
| 121 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 122 |
+
task_load_clap_emb=false
|
| 123 |
+
task_load_source_file=true
|
| 124 |
+
task_load_mel_file=false
|
| 125 |
+
model_proj_type=null
|
| 126 |
+
model_clone_batch=4
|
| 127 |
+
dataset_batch_size=96
|
| 128 |
+
model_dispersive_loss=1000.0
|
| 129 |
+
model_dispersive_loss_layer=10
|
| 130 |
+
checkpoint_keep_interval_updates=1
|
| 131 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 0 ]]; then
|
| 132 |
+
echo "Config ${train_mode} ${config_option}"
|
| 133 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 134 |
+
task_load_clap_emb=true
|
| 135 |
+
model_proj_type=2
|
| 136 |
+
model_clone_batch=4
|
| 137 |
+
dataset_batch_size=48
|
| 138 |
+
model_clap_loss=1.0
|
| 139 |
+
average_top_k_layers=12
|
| 140 |
+
model_add_conv=false
|
| 141 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 1 ]]; then
|
| 142 |
+
echo "Config ${train_mode} ${config_option}"
|
| 143 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 144 |
+
task_load_clap_emb=true
|
| 145 |
+
model_proj_type=2
|
| 146 |
+
model_clone_batch=4
|
| 147 |
+
dataset_batch_size=48
|
| 148 |
+
model_clap_loss=1.0
|
| 149 |
+
average_top_k_layers=1
|
| 150 |
+
# loss type ablation
|
| 151 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 2 ]]; then
|
| 152 |
+
echo "Config ${train_mode} ${config_option}"
|
| 153 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 154 |
+
task_load_clap_emb=true
|
| 155 |
+
model_proj_type=2
|
| 156 |
+
model_clone_batch=4
|
| 157 |
+
dataset_batch_size=48
|
| 158 |
+
model_clap_loss=1.0
|
| 159 |
+
average_top_k_layers=12
|
| 160 |
+
model_clap_loss_type="ce"
|
| 161 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 3 ]]; then
|
| 162 |
+
echo "Config ${train_mode} ${config_option}"
|
| 163 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 164 |
+
task_load_clap_emb=true
|
| 165 |
+
model_proj_type=2
|
| 166 |
+
model_clone_batch=4
|
| 167 |
+
dataset_batch_size=48
|
| 168 |
+
model_clap_loss=1.0
|
| 169 |
+
average_top_k_layers=12
|
| 170 |
+
model_clap_loss_type="l1"
|
| 171 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 4 ]]; then
|
| 172 |
+
echo "Config ${train_mode} ${config_option}"
|
| 173 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 174 |
+
task_load_clap_emb=true
|
| 175 |
+
model_proj_type=2
|
| 176 |
+
model_clone_batch=4
|
| 177 |
+
dataset_batch_size=96
|
| 178 |
+
model_clap_loss=1.0
|
| 179 |
+
average_top_k_layers=12
|
| 180 |
+
model_clap_loss_type="cosine"
|
| 181 |
+
# loss layer ablation
|
| 182 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 5 ]]; then
|
| 183 |
+
echo "Config ${train_mode} ${config_option}"
|
| 184 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 185 |
+
task_load_clap_emb=true
|
| 186 |
+
model_proj_type=2
|
| 187 |
+
model_clone_batch=4
|
| 188 |
+
dataset_batch_size=96
|
| 189 |
+
model_clap_loss=1.0
|
| 190 |
+
average_top_k_layers=12
|
| 191 |
+
model_clap_loss_type="mse"
|
| 192 |
+
model_clap_loss_layer=10
|
| 193 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 6 ]]; then
|
| 194 |
+
echo "Config ${train_mode} ${config_option}"
|
| 195 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 196 |
+
task_load_clap_emb=true
|
| 197 |
+
task_load_source_file=true
|
| 198 |
+
task_load_mel_file=false
|
| 199 |
+
model_proj_type=2
|
| 200 |
+
model_clone_batch=4
|
| 201 |
+
dataset_batch_size=96
|
| 202 |
+
model_clap_loss=1.0
|
| 203 |
+
average_top_k_layers=12
|
| 204 |
+
model_clap_loss_type="mse"
|
| 205 |
+
model_clap_loss_layer=8
|
| 206 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 7 ]]; then
|
| 207 |
+
echo "Config ${train_mode} ${config_option}"
|
| 208 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 209 |
+
task_load_clap_emb=true
|
| 210 |
+
task_load_source_file=true
|
| 211 |
+
task_load_mel_file=false
|
| 212 |
+
model_proj_type=2
|
| 213 |
+
model_clone_batch=4
|
| 214 |
+
dataset_batch_size=96
|
| 215 |
+
model_clap_loss=1.0
|
| 216 |
+
average_top_k_layers=12
|
| 217 |
+
model_clap_loss_type="mse"
|
| 218 |
+
model_clap_loss_layer=6
|
| 219 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 8 ]]; then
|
| 220 |
+
echo "Config ${train_mode} ${config_option}"
|
| 221 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 222 |
+
task_load_clap_emb=true
|
| 223 |
+
task_load_source_file=true
|
| 224 |
+
task_load_mel_file=false
|
| 225 |
+
model_proj_type=2
|
| 226 |
+
model_clone_batch=4
|
| 227 |
+
model_clap_loss=5.0
|
| 228 |
+
dataset_batch_size=96
|
| 229 |
+
average_top_k_layers=12
|
| 230 |
+
model_clap_loss_type="mse"
|
| 231 |
+
checkpoint_keep_interval_updates=-1
|
| 232 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 9 ]]; then
|
| 233 |
+
echo "Config ${train_mode} ${config_option}"
|
| 234 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 235 |
+
task_load_clap_emb=true
|
| 236 |
+
task_load_source_file=true
|
| 237 |
+
task_load_mel_file=false
|
| 238 |
+
model_proj_type=2
|
| 239 |
+
model_clone_batch=4
|
| 240 |
+
model_clap_loss=0.1
|
| 241 |
+
dataset_batch_size=96
|
| 242 |
+
average_top_k_layers=12
|
| 243 |
+
model_clap_loss_type="mse"
|
| 244 |
+
checkpoint_keep_interval_updates=-1
|
| 245 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 0 ]]; then
|
| 246 |
+
echo "Config ${train_mode} ${config_option}"
|
| 247 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
|
| 248 |
+
task_load_clap_emb=true
|
| 249 |
+
model_proj_type=4
|
| 250 |
+
model_clone_batch=4
|
| 251 |
+
model_clap_loss=1.0
|
| 252 |
+
dataset_batch_size=48
|
| 253 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 1 ]]; then
|
| 254 |
+
echo "Config ${train_mode} ${config_option}"
|
| 255 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
|
| 256 |
+
task_load_clap_emb=true
|
| 257 |
+
model_proj_type=4
|
| 258 |
+
model_clone_batch=4
|
| 259 |
+
model_clap_loss=0.001
|
| 260 |
+
dataset_batch_size=48
|
| 261 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 2 ]]; then
|
| 262 |
+
echo "Config ${train_mode} ${config_option}"
|
| 263 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
|
| 264 |
+
task_load_clap_emb=true
|
| 265 |
+
model_proj_type=4
|
| 266 |
+
model_clone_batch=4
|
| 267 |
+
model_clap_loss=0.01
|
| 268 |
+
dataset_batch_size=48
|
| 269 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 3 ]]; then
|
| 270 |
+
echo "Config ${train_mode} ${config_option}"
|
| 271 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_out
|
| 272 |
+
task_load_clap_emb=true
|
| 273 |
+
model_proj_type=6
|
| 274 |
+
model_clone_batch=4
|
| 275 |
+
dataset_batch_size=48
|
| 276 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 0 ]]; then
|
| 277 |
+
echo "Config ${train_mode} ${config_option}"
|
| 278 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 279 |
+
task_load_clap_emb=true
|
| 280 |
+
task_load_source_file=true
|
| 281 |
+
task_load_mel_file=false
|
| 282 |
+
model_proj_type=2
|
| 283 |
+
model_clone_batch=4
|
| 284 |
+
dataset_batch_size=64 # original 48 oom on 4090 24G change distributed_world_size
|
| 285 |
+
model_clap_loss=1.0
|
| 286 |
+
average_top_k_layers=11 # modify with model depth
|
| 287 |
+
model_add_conv=true
|
| 288 |
+
model_depth=11 #
|
| 289 |
+
checkpoint_keep_interval_updates=-1 # default 1
|
| 290 |
+
checkpoint_save_interval_updates=10000
|
| 291 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 1 ]]; then
|
| 292 |
+
echo "Config ${train_mode} ${config_option}"
|
| 293 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 294 |
+
task_load_clap_emb=true
|
| 295 |
+
task_load_source_file=true
|
| 296 |
+
task_load_mel_file=false
|
| 297 |
+
model_proj_type=2
|
| 298 |
+
model_clone_batch=4
|
| 299 |
+
dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
|
| 300 |
+
model_clap_loss=1.0
|
| 301 |
+
average_top_k_layers=12 # modify with model depth
|
| 302 |
+
model_add_conv=true
|
| 303 |
+
model_modalities_image_conv_option=1
|
| 304 |
+
model_depth=12 #
|
| 305 |
+
checkpoint_keep_interval_updates=1 # default 1
|
| 306 |
+
checkpoint_save_interval_updates=10000
|
| 307 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 2 ]]; then
|
| 308 |
+
echo "Config ${train_mode} ${config_option}"
|
| 309 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 310 |
+
task_load_clap_emb=true
|
| 311 |
+
task_load_source_file=true
|
| 312 |
+
task_load_mel_file=false
|
| 313 |
+
model_proj_type=2
|
| 314 |
+
model_clone_batch=4
|
| 315 |
+
dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
|
| 316 |
+
model_clap_loss=1.0
|
| 317 |
+
average_top_k_layers=12 # modify with model depth
|
| 318 |
+
model_add_conv=true
|
| 319 |
+
model_modalities_image_conv_option=2
|
| 320 |
+
model_depth=12 #
|
| 321 |
+
checkpoint_keep_interval_updates=1 # default 1
|
| 322 |
+
checkpoint_save_interval_updates=10000
|
| 323 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 3 ]]; then
|
| 324 |
+
echo "Config ${train_mode} ${config_option}"
|
| 325 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 326 |
+
task_load_clap_emb=true
|
| 327 |
+
task_load_source_file=true
|
| 328 |
+
task_load_mel_file=false
|
| 329 |
+
model_proj_type=2
|
| 330 |
+
model_clone_batch=4
|
| 331 |
+
dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
|
| 332 |
+
model_clap_loss=1.0
|
| 333 |
+
average_top_k_layers=12 # modify with model depth
|
| 334 |
+
model_add_conv=true
|
| 335 |
+
model_modalities_image_conv_option=3
|
| 336 |
+
model_depth=12 #
|
| 337 |
+
checkpoint_keep_interval_updates=1 # default 1
|
| 338 |
+
checkpoint_save_interval_updates=10000
|
| 339 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 4 ]]; then
|
| 340 |
+
echo "Config ${train_mode} ${config_option}"
|
| 341 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 342 |
+
task_load_clap_emb=true
|
| 343 |
+
task_load_source_file=true
|
| 344 |
+
task_load_mel_file=false
|
| 345 |
+
model_proj_type=2
|
| 346 |
+
model_clone_batch=4
|
| 347 |
+
dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
|
| 348 |
+
model_clap_loss=1.0
|
| 349 |
+
average_top_k_layers=12 # modify with model depth
|
| 350 |
+
model_add_conv=true
|
| 351 |
+
model_modalities_image_conv_option=4
|
| 352 |
+
model_depth=12 #
|
| 353 |
+
checkpoint_keep_interval_updates=1 # default 1
|
| 354 |
+
checkpoint_save_interval_updates=10000
|
| 355 |
+
fi
|
| 356 |
+
|
| 357 |
+
python fairseq_cli/hydra_train.py -m \
|
| 358 |
+
--config-dir ./EAT/config \
|
| 359 |
+
--config-name pretraining_AS2M \
|
| 360 |
+
common.user_dir=./EAT \
|
| 361 |
+
checkpoint.save_dir=${checkpoint_save_dir} \
|
| 362 |
+
checkpoint.restore_file=${checkpoint_restore_file} \
|
| 363 |
+
distributed_training.distributed_world_size=${1:-2} \
|
| 364 |
+
dataset.num_workers=24 \
|
| 365 |
+
dataset.data_buffer_size=48 \
|
| 366 |
+
dataset.batch_size=${dataset_batch_size} \
|
| 367 |
+
task.data=${task_data} \
|
| 368 |
+
task.h5_format=False \
|
| 369 |
+
task.load_clap_emb=${task_load_clap_emb} \
|
| 370 |
+
+task.load_source_file=${task_load_source_file} \
|
| 371 |
+
+task.load_mel_file=${task_load_mel_file} \
|
| 372 |
+
model.proj_type=${model_proj_type} \
|
| 373 |
+
model.clone_batch=${model_clone_batch} \
|
| 374 |
+
model.clap_loss=${model_clap_loss} \
|
| 375 |
+
model.average_top_k_layers=${average_top_k_layers} \
|
| 376 |
+
+model.add_conv=${model_add_conv} \
|
| 377 |
+
+model.clap_loss_type=${model_clap_loss_type} \
|
| 378 |
+
+model.clap_loss_layer=${model_clap_loss_layer} \
|
| 379 |
+
+model.dispersive_loss=${model_dispersive_loss} \
|
| 380 |
+
+model.dispersive_loss_layer=${model_dispersive_loss_layer} \
|
| 381 |
+
model.depth=${model_depth} \
|
| 382 |
+
+model.modalities.image.conv_option=${model_modalities_image_conv_option} \
|
| 383 |
+
checkpoint.keep_interval_updates=${checkpoint_keep_interval_updates} \
|
| 384 |
+
checkpoint.save_interval_updates=${checkpoint_save_interval_updates}
|
pre_4_AS2M/conv_clap_4_2025-09-30_07-49-28/pretraining_AS2M.sh
ADDED
|
@@ -0,0 +1,384 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
# config options
|
| 3 |
+
train_mode=conv_clap
|
| 4 |
+
config_option=4
|
| 5 |
+
# change world size
|
| 6 |
+
|
| 7 |
+
# shared config
|
| 8 |
+
SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
|
| 9 |
+
checkpoint_save_dir=${SAVE_DIR_ROOT}/${train_mode}_${config_option}_$(date +"%Y-%m-%d_%H-%M-%S")
|
| 10 |
+
checkpoint_restore_file=${checkpoint_save_dir}/checkpoint_last.pt
|
| 11 |
+
|
| 12 |
+
# 脚本自身的绝对路径与文件名(解析符号链接)
|
| 13 |
+
script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
|
| 14 |
+
script_name="$(basename -- "$script_path")"
|
| 15 |
+
# 创建目录并拷贝(保留权限与时间戳)
|
| 16 |
+
mkdir -p -- "$checkpoint_save_dir"
|
| 17 |
+
cp -p -- "$script_path" "$checkpoint_save_dir/$script_name"
|
| 18 |
+
echo "script_path: ${script_path}"
|
| 19 |
+
echo "checkpoint_save_dir: ${checkpoint_save_dir}"
|
| 20 |
+
|
| 21 |
+
# default setting
|
| 22 |
+
model_clone_batch=4
|
| 23 |
+
dataset_batch_size=48
|
| 24 |
+
model_clap_loss=0
|
| 25 |
+
model_clap_loss_type="mse" # option ce cosine l1
|
| 26 |
+
model_clap_loss_layer=0
|
| 27 |
+
average_top_k_layers=12
|
| 28 |
+
model_add_conv=false
|
| 29 |
+
model_depth=12
|
| 30 |
+
model_dispersive_loss=0
|
| 31 |
+
model_dispersive_loss_layer=0
|
| 32 |
+
checkpoint_keep_interval_updates=1 # TODO change this parameter if need
|
| 33 |
+
checkpoint_save_interval_updates=10000
|
| 34 |
+
model_modalities_image_conv_option=0
|
| 35 |
+
|
| 36 |
+
if [[ $train_mode == "default" && ${config_option} -eq 0 ]]; then
|
| 37 |
+
echo "Config ${train_mode} ${config_option}"
|
| 38 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 39 |
+
task_load_clap_emb=false
|
| 40 |
+
task_load_source_file=true
|
| 41 |
+
task_load_mel_file=false
|
| 42 |
+
model_proj_type=null
|
| 43 |
+
model_clone_batch=4
|
| 44 |
+
dataset_batch_size=96
|
| 45 |
+
model_clap_loss=0
|
| 46 |
+
checkpoint_keep_interval_updates=-1
|
| 47 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 0 ]]; then
|
| 48 |
+
echo "Config ${train_mode} ${config_option}"
|
| 49 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 50 |
+
task_load_clap_emb=false
|
| 51 |
+
task_load_source_file=true
|
| 52 |
+
task_load_mel_file=false
|
| 53 |
+
model_proj_type=null
|
| 54 |
+
model_clone_batch=4
|
| 55 |
+
dataset_batch_size=96
|
| 56 |
+
model_dispersive_loss=1
|
| 57 |
+
model_dispersive_loss_layer=0
|
| 58 |
+
checkpoint_keep_interval_updates=1
|
| 59 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 1 ]]; then
|
| 60 |
+
echo "Config ${train_mode} ${config_option}"
|
| 61 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 62 |
+
task_load_clap_emb=false
|
| 63 |
+
task_load_source_file=true
|
| 64 |
+
task_load_mel_file=false
|
| 65 |
+
model_proj_type=null
|
| 66 |
+
model_clone_batch=1
|
| 67 |
+
dataset_batch_size=384
|
| 68 |
+
model_dispersive_loss=1
|
| 69 |
+
model_dispersive_loss_layer=0
|
| 70 |
+
checkpoint_keep_interval_updates=1
|
| 71 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 2 ]]; then
|
| 72 |
+
echo "Config ${train_mode} ${config_option}"
|
| 73 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 74 |
+
task_load_clap_emb=false
|
| 75 |
+
task_load_source_file=true
|
| 76 |
+
task_load_mel_file=false
|
| 77 |
+
model_proj_type=null
|
| 78 |
+
model_clone_batch=1
|
| 79 |
+
dataset_batch_size=384
|
| 80 |
+
model_dispersive_loss=10.0
|
| 81 |
+
model_dispersive_loss_layer=0
|
| 82 |
+
checkpoint_keep_interval_updates=1
|
| 83 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 3 ]]; then
|
| 84 |
+
echo "Config ${train_mode} ${config_option}"
|
| 85 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 86 |
+
task_load_clap_emb=false
|
| 87 |
+
task_load_source_file=true
|
| 88 |
+
task_load_mel_file=false
|
| 89 |
+
model_proj_type=null
|
| 90 |
+
model_clone_batch=1
|
| 91 |
+
dataset_batch_size=384
|
| 92 |
+
model_dispersive_loss=100.0
|
| 93 |
+
model_dispersive_loss_layer=0
|
| 94 |
+
checkpoint_keep_interval_updates=1
|
| 95 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 4 ]]; then
|
| 96 |
+
echo "Config ${train_mode} ${config_option}"
|
| 97 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 98 |
+
task_load_clap_emb=false
|
| 99 |
+
task_load_source_file=true
|
| 100 |
+
task_load_mel_file=false
|
| 101 |
+
model_proj_type=null
|
| 102 |
+
model_clone_batch=1
|
| 103 |
+
dataset_batch_size=384
|
| 104 |
+
model_dispersive_loss=10000.0
|
| 105 |
+
model_dispersive_loss_layer=0
|
| 106 |
+
checkpoint_keep_interval_updates=1
|
| 107 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 5 ]]; then
|
| 108 |
+
echo "Config ${train_mode} ${config_option}"
|
| 109 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 110 |
+
task_load_clap_emb=false
|
| 111 |
+
task_load_source_file=true
|
| 112 |
+
task_load_mel_file=false
|
| 113 |
+
model_proj_type=null
|
| 114 |
+
model_clone_batch=1
|
| 115 |
+
dataset_batch_size=384
|
| 116 |
+
model_dispersive_loss=1000.0
|
| 117 |
+
model_dispersive_loss_layer=0
|
| 118 |
+
checkpoint_keep_interval_updates=1
|
| 119 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 6 ]]; then
|
| 120 |
+
echo "Config ${train_mode} ${config_option}"
|
| 121 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 122 |
+
task_load_clap_emb=false
|
| 123 |
+
task_load_source_file=true
|
| 124 |
+
task_load_mel_file=false
|
| 125 |
+
model_proj_type=null
|
| 126 |
+
model_clone_batch=4
|
| 127 |
+
dataset_batch_size=96
|
| 128 |
+
model_dispersive_loss=1000.0
|
| 129 |
+
model_dispersive_loss_layer=10
|
| 130 |
+
checkpoint_keep_interval_updates=1
|
| 131 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 0 ]]; then
|
| 132 |
+
echo "Config ${train_mode} ${config_option}"
|
| 133 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 134 |
+
task_load_clap_emb=true
|
| 135 |
+
model_proj_type=2
|
| 136 |
+
model_clone_batch=4
|
| 137 |
+
dataset_batch_size=48
|
| 138 |
+
model_clap_loss=1.0
|
| 139 |
+
average_top_k_layers=12
|
| 140 |
+
model_add_conv=false
|
| 141 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 1 ]]; then
|
| 142 |
+
echo "Config ${train_mode} ${config_option}"
|
| 143 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 144 |
+
task_load_clap_emb=true
|
| 145 |
+
model_proj_type=2
|
| 146 |
+
model_clone_batch=4
|
| 147 |
+
dataset_batch_size=48
|
| 148 |
+
model_clap_loss=1.0
|
| 149 |
+
average_top_k_layers=1
|
| 150 |
+
# loss type ablation
|
| 151 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 2 ]]; then
|
| 152 |
+
echo "Config ${train_mode} ${config_option}"
|
| 153 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 154 |
+
task_load_clap_emb=true
|
| 155 |
+
model_proj_type=2
|
| 156 |
+
model_clone_batch=4
|
| 157 |
+
dataset_batch_size=48
|
| 158 |
+
model_clap_loss=1.0
|
| 159 |
+
average_top_k_layers=12
|
| 160 |
+
model_clap_loss_type="ce"
|
| 161 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 3 ]]; then
|
| 162 |
+
echo "Config ${train_mode} ${config_option}"
|
| 163 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 164 |
+
task_load_clap_emb=true
|
| 165 |
+
model_proj_type=2
|
| 166 |
+
model_clone_batch=4
|
| 167 |
+
dataset_batch_size=48
|
| 168 |
+
model_clap_loss=1.0
|
| 169 |
+
average_top_k_layers=12
|
| 170 |
+
model_clap_loss_type="l1"
|
| 171 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 4 ]]; then
|
| 172 |
+
echo "Config ${train_mode} ${config_option}"
|
| 173 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 174 |
+
task_load_clap_emb=true
|
| 175 |
+
model_proj_type=2
|
| 176 |
+
model_clone_batch=4
|
| 177 |
+
dataset_batch_size=96
|
| 178 |
+
model_clap_loss=1.0
|
| 179 |
+
average_top_k_layers=12
|
| 180 |
+
model_clap_loss_type="cosine"
|
| 181 |
+
# loss layer ablation
|
| 182 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 5 ]]; then
|
| 183 |
+
echo "Config ${train_mode} ${config_option}"
|
| 184 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 185 |
+
task_load_clap_emb=true
|
| 186 |
+
model_proj_type=2
|
| 187 |
+
model_clone_batch=4
|
| 188 |
+
dataset_batch_size=96
|
| 189 |
+
model_clap_loss=1.0
|
| 190 |
+
average_top_k_layers=12
|
| 191 |
+
model_clap_loss_type="mse"
|
| 192 |
+
model_clap_loss_layer=10
|
| 193 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 6 ]]; then
|
| 194 |
+
echo "Config ${train_mode} ${config_option}"
|
| 195 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 196 |
+
task_load_clap_emb=true
|
| 197 |
+
task_load_source_file=true
|
| 198 |
+
task_load_mel_file=false
|
| 199 |
+
model_proj_type=2
|
| 200 |
+
model_clone_batch=4
|
| 201 |
+
dataset_batch_size=96
|
| 202 |
+
model_clap_loss=1.0
|
| 203 |
+
average_top_k_layers=12
|
| 204 |
+
model_clap_loss_type="mse"
|
| 205 |
+
model_clap_loss_layer=8
|
| 206 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 7 ]]; then
|
| 207 |
+
echo "Config ${train_mode} ${config_option}"
|
| 208 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 209 |
+
task_load_clap_emb=true
|
| 210 |
+
task_load_source_file=true
|
| 211 |
+
task_load_mel_file=false
|
| 212 |
+
model_proj_type=2
|
| 213 |
+
model_clone_batch=4
|
| 214 |
+
dataset_batch_size=96
|
| 215 |
+
model_clap_loss=1.0
|
| 216 |
+
average_top_k_layers=12
|
| 217 |
+
model_clap_loss_type="mse"
|
| 218 |
+
model_clap_loss_layer=6
|
| 219 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 8 ]]; then
|
| 220 |
+
echo "Config ${train_mode} ${config_option}"
|
| 221 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 222 |
+
task_load_clap_emb=true
|
| 223 |
+
task_load_source_file=true
|
| 224 |
+
task_load_mel_file=false
|
| 225 |
+
model_proj_type=2
|
| 226 |
+
model_clone_batch=4
|
| 227 |
+
model_clap_loss=5.0
|
| 228 |
+
dataset_batch_size=96
|
| 229 |
+
average_top_k_layers=12
|
| 230 |
+
model_clap_loss_type="mse"
|
| 231 |
+
checkpoint_keep_interval_updates=-1
|
| 232 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 9 ]]; then
|
| 233 |
+
echo "Config ${train_mode} ${config_option}"
|
| 234 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 235 |
+
task_load_clap_emb=true
|
| 236 |
+
task_load_source_file=true
|
| 237 |
+
task_load_mel_file=false
|
| 238 |
+
model_proj_type=2
|
| 239 |
+
model_clone_batch=4
|
| 240 |
+
model_clap_loss=0.1
|
| 241 |
+
dataset_batch_size=96
|
| 242 |
+
average_top_k_layers=12
|
| 243 |
+
model_clap_loss_type="mse"
|
| 244 |
+
checkpoint_keep_interval_updates=-1
|
| 245 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 0 ]]; then
|
| 246 |
+
echo "Config ${train_mode} ${config_option}"
|
| 247 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
|
| 248 |
+
task_load_clap_emb=true
|
| 249 |
+
model_proj_type=4
|
| 250 |
+
model_clone_batch=4
|
| 251 |
+
model_clap_loss=1.0
|
| 252 |
+
dataset_batch_size=48
|
| 253 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 1 ]]; then
|
| 254 |
+
echo "Config ${train_mode} ${config_option}"
|
| 255 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
|
| 256 |
+
task_load_clap_emb=true
|
| 257 |
+
model_proj_type=4
|
| 258 |
+
model_clone_batch=4
|
| 259 |
+
model_clap_loss=0.001
|
| 260 |
+
dataset_batch_size=48
|
| 261 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 2 ]]; then
|
| 262 |
+
echo "Config ${train_mode} ${config_option}"
|
| 263 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
|
| 264 |
+
task_load_clap_emb=true
|
| 265 |
+
model_proj_type=4
|
| 266 |
+
model_clone_batch=4
|
| 267 |
+
model_clap_loss=0.01
|
| 268 |
+
dataset_batch_size=48
|
| 269 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 3 ]]; then
|
| 270 |
+
echo "Config ${train_mode} ${config_option}"
|
| 271 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_out
|
| 272 |
+
task_load_clap_emb=true
|
| 273 |
+
model_proj_type=6
|
| 274 |
+
model_clone_batch=4
|
| 275 |
+
dataset_batch_size=48
|
| 276 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 0 ]]; then
|
| 277 |
+
echo "Config ${train_mode} ${config_option}"
|
| 278 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 279 |
+
task_load_clap_emb=true
|
| 280 |
+
task_load_source_file=true
|
| 281 |
+
task_load_mel_file=false
|
| 282 |
+
model_proj_type=2
|
| 283 |
+
model_clone_batch=4
|
| 284 |
+
dataset_batch_size=64 # original 48 oom on 4090 24G change distributed_world_size
|
| 285 |
+
model_clap_loss=1.0
|
| 286 |
+
average_top_k_layers=11 # modify with model depth
|
| 287 |
+
model_add_conv=true
|
| 288 |
+
model_depth=11 #
|
| 289 |
+
checkpoint_keep_interval_updates=-1 # default 1
|
| 290 |
+
checkpoint_save_interval_updates=10000
|
| 291 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 1 ]]; then
|
| 292 |
+
echo "Config ${train_mode} ${config_option}"
|
| 293 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 294 |
+
task_load_clap_emb=true
|
| 295 |
+
task_load_source_file=true
|
| 296 |
+
task_load_mel_file=false
|
| 297 |
+
model_proj_type=2
|
| 298 |
+
model_clone_batch=4
|
| 299 |
+
dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
|
| 300 |
+
model_clap_loss=1.0
|
| 301 |
+
average_top_k_layers=12 # modify with model depth
|
| 302 |
+
model_add_conv=true
|
| 303 |
+
model_modalities_image_conv_option=1
|
| 304 |
+
model_depth=12 #
|
| 305 |
+
checkpoint_keep_interval_updates=1 # default 1
|
| 306 |
+
checkpoint_save_interval_updates=10000
|
| 307 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 2 ]]; then
|
| 308 |
+
echo "Config ${train_mode} ${config_option}"
|
| 309 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 310 |
+
task_load_clap_emb=true
|
| 311 |
+
task_load_source_file=true
|
| 312 |
+
task_load_mel_file=false
|
| 313 |
+
model_proj_type=2
|
| 314 |
+
model_clone_batch=4
|
| 315 |
+
dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
|
| 316 |
+
model_clap_loss=1.0
|
| 317 |
+
average_top_k_layers=12 # modify with model depth
|
| 318 |
+
model_add_conv=true
|
| 319 |
+
model_modalities_image_conv_option=2
|
| 320 |
+
model_depth=12 #
|
| 321 |
+
checkpoint_keep_interval_updates=1 # default 1
|
| 322 |
+
checkpoint_save_interval_updates=10000
|
| 323 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 3 ]]; then
|
| 324 |
+
echo "Config ${train_mode} ${config_option}"
|
| 325 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 326 |
+
task_load_clap_emb=true
|
| 327 |
+
task_load_source_file=true
|
| 328 |
+
task_load_mel_file=false
|
| 329 |
+
model_proj_type=2
|
| 330 |
+
model_clone_batch=4
|
| 331 |
+
dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
|
| 332 |
+
model_clap_loss=1.0
|
| 333 |
+
average_top_k_layers=12 # modify with model depth
|
| 334 |
+
model_add_conv=true
|
| 335 |
+
model_modalities_image_conv_option=3
|
| 336 |
+
model_depth=12 #
|
| 337 |
+
checkpoint_keep_interval_updates=1 # default 1
|
| 338 |
+
checkpoint_save_interval_updates=10000
|
| 339 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 4 ]]; then
|
| 340 |
+
echo "Config ${train_mode} ${config_option}"
|
| 341 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 342 |
+
task_load_clap_emb=true
|
| 343 |
+
task_load_source_file=true
|
| 344 |
+
task_load_mel_file=false
|
| 345 |
+
model_proj_type=2
|
| 346 |
+
model_clone_batch=4
|
| 347 |
+
dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
|
| 348 |
+
model_clap_loss=1.0
|
| 349 |
+
average_top_k_layers=12 # modify with model depth
|
| 350 |
+
model_add_conv=true
|
| 351 |
+
model_modalities_image_conv_option=4
|
| 352 |
+
model_depth=12 #
|
| 353 |
+
checkpoint_keep_interval_updates=1 # default 1
|
| 354 |
+
checkpoint_save_interval_updates=10000
|
| 355 |
+
fi
|
| 356 |
+
|
| 357 |
+
# Launch pre-training through fairseq's Hydra entry point (-m is Hydra's
# multirun flag), using the pretraining_AS2M config under ./EAT/config.
# The first positional argument of this script sets the distributed world
# size (2 when omitted).
# Overrides prefixed with "+" are passed as additions to the config.
# Every expansion is quoted so a value can never be word-split or glob-expanded
# into extra command-line arguments.
# NOTE(review): several presets never assign task_load_source_file or
# task_load_mel_file; for those the matching override is passed with an empty
# value -- confirm that this is intended.
python fairseq_cli/hydra_train.py -m \
    --config-dir ./EAT/config \
    --config-name pretraining_AS2M \
    common.user_dir=./EAT \
    "checkpoint.save_dir=${checkpoint_save_dir}" \
    "checkpoint.restore_file=${checkpoint_restore_file}" \
    "distributed_training.distributed_world_size=${1:-2}" \
    dataset.num_workers=24 \
    dataset.data_buffer_size=48 \
    "dataset.batch_size=${dataset_batch_size}" \
    "task.data=${task_data}" \
    task.h5_format=False \
    "task.load_clap_emb=${task_load_clap_emb}" \
    "+task.load_source_file=${task_load_source_file}" \
    "+task.load_mel_file=${task_load_mel_file}" \
    "model.proj_type=${model_proj_type}" \
    "model.clone_batch=${model_clone_batch}" \
    "model.clap_loss=${model_clap_loss}" \
    "model.average_top_k_layers=${average_top_k_layers}" \
    "+model.add_conv=${model_add_conv}" \
    "+model.clap_loss_type=${model_clap_loss_type}" \
    "+model.clap_loss_layer=${model_clap_loss_layer}" \
    "+model.dispersive_loss=${model_dispersive_loss}" \
    "+model.dispersive_loss_layer=${model_dispersive_loss_layer}" \
    "model.depth=${model_depth}" \
    "+model.modalities.image.conv_option=${model_modalities_image_conv_option}" \
    "checkpoint.keep_interval_updates=${checkpoint_keep_interval_updates}" \
    "checkpoint.save_interval_updates=${checkpoint_save_interval_updates}"
|
pre_4_AS2M/conv_clap_4_2025-09-30_07-57-18/pretraining_AS2M.sh
ADDED
|
@@ -0,0 +1,384 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# Pre-training launcher: selects a hyper-parameter preset by
# (train_mode, config_option) and stores a copy of this script in the
# checkpoint directory of the run.

# config options
train_mode=conv_clap
config_option=4
# The world size is taken from the first positional argument of this script
# (see the python command at the bottom of the file).

# shared config
SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
# One directory per launch: the directory name embeds the launch timestamp.
checkpoint_save_dir=${SAVE_DIR_ROOT}/${train_mode}_${config_option}_$(date +"%Y-%m-%d_%H-%M-%S")
# NOTE(review): because the directory name is time-stamped, this file normally
# does not exist yet when a run is launched.
checkpoint_restore_file=${checkpoint_save_dir}/checkpoint_last.pt

# Absolute path and file name of this script (symbolic links resolved).
script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
script_name="$(basename -- "$script_path")"

# Create the run directory; abort early if that is impossible instead of
# starting a training job that has nowhere to write its checkpoints.
if ! mkdir -p -- "$checkpoint_save_dir"; then
    echo "error: cannot create checkpoint_save_dir: ${checkpoint_save_dir}" >&2
    exit 1
fi

# Snapshot the script (permissions and timestamps preserved). This is
# best-effort: a failed copy is reported but does not stop the run.
if ! cp -p -- "$script_path" "$checkpoint_save_dir/$script_name"; then
    echo "warning: could not copy ${script_path} to ${checkpoint_save_dir}" >&2
fi

echo "script_path: ${script_path}"
echo "checkpoint_save_dir: ${checkpoint_save_dir}"
|
| 20 |
+
|
| 21 |
+
# Baseline values; the preset selected by the if/elif chain below overrides
# a subset of them.

# -- checkpointing --
checkpoint_save_interval_updates=10000
checkpoint_keep_interval_updates=1 # TODO: change this parameter if needed

# -- data loading --
dataset_batch_size=48

# -- model --
model_depth=12
average_top_k_layers=12
model_clone_batch=4
model_add_conv=false
model_modalities_image_conv_option=0

# -- auxiliary losses --
model_clap_loss=0
model_clap_loss_type=mse # other options: ce, cosine, l1
model_clap_loss_layer=0
model_dispersive_loss=0
model_dispersive_loss_layer=0
|
| 35 |
+
|
| 36 |
+
# Select the hyper-parameter preset for the requested
# (train_mode, config_option) pair. Each branch overrides part of the default
# settings and defines the task_* / model_proj_type variables consumed by the
# training command. An unknown pair aborts the script: without a preset,
# task_data, task_load_clap_emb and model_proj_type would be unset and the
# training command would be launched with empty overrides.
# NOTE(review): the "clap" 0-5 and "ast" presets do not set
# task_load_source_file / task_load_mel_file -- confirm whether that is intended.
if [[ $train_mode == "default" && ${config_option} -eq 0 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=0
    checkpoint_keep_interval_updates=-1
elif [[ $train_mode == "disp" && ${config_option} -eq 0 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=4
    dataset_batch_size=96
    model_dispersive_loss=1
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
elif [[ $train_mode == "disp" && ${config_option} -eq 1 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=1
    dataset_batch_size=384
    model_dispersive_loss=1
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
elif [[ $train_mode == "disp" && ${config_option} -eq 2 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=1
    dataset_batch_size=384
    model_dispersive_loss=10.0
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
elif [[ $train_mode == "disp" && ${config_option} -eq 3 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=1
    dataset_batch_size=384
    model_dispersive_loss=100.0
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
elif [[ $train_mode == "disp" && ${config_option} -eq 4 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=1
    dataset_batch_size=384
    model_dispersive_loss=10000.0
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
elif [[ $train_mode == "disp" && ${config_option} -eq 5 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=1
    dataset_batch_size=384
    model_dispersive_loss=1000.0
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
elif [[ $train_mode == "disp" && ${config_option} -eq 6 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=4
    dataset_batch_size=96
    model_dispersive_loss=1000.0
    model_dispersive_loss_layer=10
    checkpoint_keep_interval_updates=1
elif [[ $train_mode == "clap" && ${config_option} -eq 0 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=12
    model_add_conv=false
elif [[ $train_mode == "clap" && ${config_option} -eq 1 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=1
# loss type ablation
elif [[ $train_mode == "clap" && ${config_option} -eq 2 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="ce"
elif [[ $train_mode == "clap" && ${config_option} -eq 3 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="l1"
elif [[ $train_mode == "clap" && ${config_option} -eq 4 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="cosine"
# loss layer ablation
elif [[ $train_mode == "clap" && ${config_option} -eq 5 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="mse"
    model_clap_loss_layer=10
elif [[ $train_mode == "clap" && ${config_option} -eq 6 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="mse"
    model_clap_loss_layer=8
elif [[ $train_mode == "clap" && ${config_option} -eq 7 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="mse"
    model_clap_loss_layer=6
elif [[ $train_mode == "clap" && ${config_option} -eq 8 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    model_clap_loss=5.0
    dataset_batch_size=96
    average_top_k_layers=12
    model_clap_loss_type="mse"
    checkpoint_keep_interval_updates=-1
elif [[ $train_mode == "clap" && ${config_option} -eq 9 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    model_clap_loss=0.1
    dataset_batch_size=96
    average_top_k_layers=12
    model_clap_loss_type="mse"
    checkpoint_keep_interval_updates=-1
elif [[ $train_mode == "ast" && ${config_option} -eq 0 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
    task_load_clap_emb=true
    model_proj_type=4
    model_clone_batch=4
    model_clap_loss=1.0
    dataset_batch_size=48
elif [[ $train_mode == "ast" && ${config_option} -eq 1 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
    task_load_clap_emb=true
    model_proj_type=4
    model_clone_batch=4
    model_clap_loss=0.001
    dataset_batch_size=48
elif [[ $train_mode == "ast" && ${config_option} -eq 2 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
    task_load_clap_emb=true
    model_proj_type=4
    model_clone_batch=4
    model_clap_loss=0.01
    dataset_batch_size=48
elif [[ $train_mode == "ast" && ${config_option} -eq 3 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_out
    task_load_clap_emb=true
    model_proj_type=6
    model_clone_batch=4
    dataset_batch_size=48
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 0 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=64 # original 48; OOM on 4090 24G -> change distributed_world_size
    model_clap_loss=1.0
    average_top_k_layers=11 # modify together with model_depth
    model_add_conv=true
    model_depth=11
    checkpoint_keep_interval_updates=-1 # default 1
    checkpoint_save_interval_updates=10000
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 1 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96 # original 48; OOM on 4090 24G -> change distributed_world_size
    model_clap_loss=1.0
    average_top_k_layers=12 # modify together with model_depth
    model_add_conv=true
    model_modalities_image_conv_option=1
    model_depth=12
    checkpoint_keep_interval_updates=1 # default 1
    checkpoint_save_interval_updates=10000
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 2 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96 # original 48; OOM on 4090 24G -> change distributed_world_size
    model_clap_loss=1.0
    average_top_k_layers=12 # modify together with model_depth
    model_add_conv=true
    model_modalities_image_conv_option=2
    model_depth=12
    checkpoint_keep_interval_updates=1 # default 1
    checkpoint_save_interval_updates=10000
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 3 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96 # original 48; OOM on 4090 24G -> change distributed_world_size
    model_clap_loss=1.0
    average_top_k_layers=12 # modify together with model_depth
    model_add_conv=true
    model_modalities_image_conv_option=3
    model_depth=12
    checkpoint_keep_interval_updates=1 # default 1
    checkpoint_save_interval_updates=10000
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 4 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96 # original 48; OOM on 4090 24G -> change distributed_world_size
    model_clap_loss=1.0
    average_top_k_layers=12 # modify together with model_depth
    model_add_conv=true
    model_modalities_image_conv_option=4
    model_depth=12
    checkpoint_keep_interval_updates=1 # default 1
    checkpoint_save_interval_updates=10000
else
    # No preset matched: stop here rather than launch training with unset
    # task_* / model_proj_type values.
    echo "error: unknown preset: train_mode=${train_mode} config_option=${config_option}" >&2
    exit 1
fi
|
| 356 |
+
|
| 357 |
+
# Launch pre-training through fairseq's Hydra entry point (-m is Hydra's
# multirun flag), using the pretraining_AS2M config under ./EAT/config.
# The first positional argument of this script sets the distributed world
# size (2 when omitted).
# Overrides prefixed with "+" are passed as additions to the config.
# Every expansion is quoted so a value can never be word-split or glob-expanded
# into extra command-line arguments.
# NOTE(review): several presets never assign task_load_source_file or
# task_load_mel_file; for those the matching override is passed with an empty
# value -- confirm that this is intended.
python fairseq_cli/hydra_train.py -m \
    --config-dir ./EAT/config \
    --config-name pretraining_AS2M \
    common.user_dir=./EAT \
    "checkpoint.save_dir=${checkpoint_save_dir}" \
    "checkpoint.restore_file=${checkpoint_restore_file}" \
    "distributed_training.distributed_world_size=${1:-2}" \
    dataset.num_workers=24 \
    dataset.data_buffer_size=48 \
    "dataset.batch_size=${dataset_batch_size}" \
    "task.data=${task_data}" \
    task.h5_format=False \
    "task.load_clap_emb=${task_load_clap_emb}" \
    "+task.load_source_file=${task_load_source_file}" \
    "+task.load_mel_file=${task_load_mel_file}" \
    "model.proj_type=${model_proj_type}" \
    "model.clone_batch=${model_clone_batch}" \
    "model.clap_loss=${model_clap_loss}" \
    "model.average_top_k_layers=${average_top_k_layers}" \
    "+model.add_conv=${model_add_conv}" \
    "+model.clap_loss_type=${model_clap_loss_type}" \
    "+model.clap_loss_layer=${model_clap_loss_layer}" \
    "+model.dispersive_loss=${model_dispersive_loss}" \
    "+model.dispersive_loss_layer=${model_dispersive_loss_layer}" \
    "model.depth=${model_depth}" \
    "+model.modalities.image.conv_option=${model_modalities_image_conv_option}" \
    "checkpoint.keep_interval_updates=${checkpoint_keep_interval_updates}" \
    "checkpoint.save_interval_updates=${checkpoint_save_interval_updates}"
|
pre_4_AS2M/conv_clap_4_2025-09-30_08-05-21/pretraining_AS2M.sh
ADDED
|
@@ -0,0 +1,384 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# Pre-training launcher: selects a hyper-parameter preset by
# (train_mode, config_option) and stores a copy of this script in the
# checkpoint directory of the run.

# config options
train_mode=conv_clap
config_option=4
# NOTE(review): "change world size" -- presumably the world size is meant to be
# adjusted at launch time; confirm how against the training command.

# shared config
SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
# One directory per launch: the directory name embeds the launch timestamp.
checkpoint_save_dir=${SAVE_DIR_ROOT}/${train_mode}_${config_option}_$(date +"%Y-%m-%d_%H-%M-%S")
# NOTE(review): because the directory name is time-stamped, this file normally
# does not exist yet when a run is launched.
checkpoint_restore_file=${checkpoint_save_dir}/checkpoint_last.pt

# Absolute path and file name of this script (symbolic links resolved).
script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
script_name="$(basename -- "$script_path")"

# Create the run directory; abort early if that is impossible instead of
# starting a training job that has nowhere to write its checkpoints.
if ! mkdir -p -- "$checkpoint_save_dir"; then
    echo "error: cannot create checkpoint_save_dir: ${checkpoint_save_dir}" >&2
    exit 1
fi

# Snapshot the script (permissions and timestamps preserved). This is
# best-effort: a failed copy is reported but does not stop the run.
if ! cp -p -- "$script_path" "$checkpoint_save_dir/$script_name"; then
    echo "warning: could not copy ${script_path} to ${checkpoint_save_dir}" >&2
fi

echo "script_path: ${script_path}"
echo "checkpoint_save_dir: ${checkpoint_save_dir}"
|
| 20 |
+
|
| 21 |
+
# Baseline values; the preset selected by the if/elif chain below overrides
# a subset of them.

# -- checkpointing --
checkpoint_save_interval_updates=10000
checkpoint_keep_interval_updates=1 # TODO: change this parameter if needed

# -- data loading --
dataset_batch_size=48

# -- model --
model_depth=12
average_top_k_layers=12
model_clone_batch=4
model_add_conv=false
model_modalities_image_conv_option=0

# -- auxiliary losses --
model_clap_loss=0
model_clap_loss_type=mse # other options: ce, cosine, l1
model_clap_loss_layer=0
model_dispersive_loss=0
model_dispersive_loss_layer=0
|
| 35 |
+
|
| 36 |
+
if [[ $train_mode == "default" && ${config_option} -eq 0 ]]; then
|
| 37 |
+
echo "Config ${train_mode} ${config_option}"
|
| 38 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 39 |
+
task_load_clap_emb=false
|
| 40 |
+
task_load_source_file=true
|
| 41 |
+
task_load_mel_file=false
|
| 42 |
+
model_proj_type=null
|
| 43 |
+
model_clone_batch=4
|
| 44 |
+
dataset_batch_size=96
|
| 45 |
+
model_clap_loss=0
|
| 46 |
+
checkpoint_keep_interval_updates=-1
|
| 47 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 0 ]]; then
|
| 48 |
+
echo "Config ${train_mode} ${config_option}"
|
| 49 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 50 |
+
task_load_clap_emb=false
|
| 51 |
+
task_load_source_file=true
|
| 52 |
+
task_load_mel_file=false
|
| 53 |
+
model_proj_type=null
|
| 54 |
+
model_clone_batch=4
|
| 55 |
+
dataset_batch_size=96
|
| 56 |
+
model_dispersive_loss=1
|
| 57 |
+
model_dispersive_loss_layer=0
|
| 58 |
+
checkpoint_keep_interval_updates=1
|
| 59 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 1 ]]; then
|
| 60 |
+
echo "Config ${train_mode} ${config_option}"
|
| 61 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 62 |
+
task_load_clap_emb=false
|
| 63 |
+
task_load_source_file=true
|
| 64 |
+
task_load_mel_file=false
|
| 65 |
+
model_proj_type=null
|
| 66 |
+
model_clone_batch=1
|
| 67 |
+
dataset_batch_size=384
|
| 68 |
+
model_dispersive_loss=1
|
| 69 |
+
model_dispersive_loss_layer=0
|
| 70 |
+
checkpoint_keep_interval_updates=1
|
| 71 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 2 ]]; then
|
| 72 |
+
echo "Config ${train_mode} ${config_option}"
|
| 73 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 74 |
+
task_load_clap_emb=false
|
| 75 |
+
task_load_source_file=true
|
| 76 |
+
task_load_mel_file=false
|
| 77 |
+
model_proj_type=null
|
| 78 |
+
model_clone_batch=1
|
| 79 |
+
dataset_batch_size=384
|
| 80 |
+
model_dispersive_loss=10.0
|
| 81 |
+
model_dispersive_loss_layer=0
|
| 82 |
+
checkpoint_keep_interval_updates=1
|
| 83 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 3 ]]; then
|
| 84 |
+
echo "Config ${train_mode} ${config_option}"
|
| 85 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 86 |
+
task_load_clap_emb=false
|
| 87 |
+
task_load_source_file=true
|
| 88 |
+
task_load_mel_file=false
|
| 89 |
+
model_proj_type=null
|
| 90 |
+
model_clone_batch=1
|
| 91 |
+
dataset_batch_size=384
|
| 92 |
+
model_dispersive_loss=100.0
|
| 93 |
+
model_dispersive_loss_layer=0
|
| 94 |
+
checkpoint_keep_interval_updates=1
|
| 95 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 4 ]]; then
|
| 96 |
+
echo "Config ${train_mode} ${config_option}"
|
| 97 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 98 |
+
task_load_clap_emb=false
|
| 99 |
+
task_load_source_file=true
|
| 100 |
+
task_load_mel_file=false
|
| 101 |
+
model_proj_type=null
|
| 102 |
+
model_clone_batch=1
|
| 103 |
+
dataset_batch_size=384
|
| 104 |
+
model_dispersive_loss=10000.0
|
| 105 |
+
model_dispersive_loss_layer=0
|
| 106 |
+
checkpoint_keep_interval_updates=1
|
| 107 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 5 ]]; then
|
| 108 |
+
echo "Config ${train_mode} ${config_option}"
|
| 109 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 110 |
+
task_load_clap_emb=false
|
| 111 |
+
task_load_source_file=true
|
| 112 |
+
task_load_mel_file=false
|
| 113 |
+
model_proj_type=null
|
| 114 |
+
model_clone_batch=1
|
| 115 |
+
dataset_batch_size=384
|
| 116 |
+
model_dispersive_loss=1000.0
|
| 117 |
+
model_dispersive_loss_layer=0
|
| 118 |
+
checkpoint_keep_interval_updates=1
|
| 119 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 6 ]]; then
|
| 120 |
+
echo "Config ${train_mode} ${config_option}"
|
| 121 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 122 |
+
task_load_clap_emb=false
|
| 123 |
+
task_load_source_file=true
|
| 124 |
+
task_load_mel_file=false
|
| 125 |
+
model_proj_type=null
|
| 126 |
+
model_clone_batch=4
|
| 127 |
+
dataset_batch_size=96
|
| 128 |
+
model_dispersive_loss=1000.0
|
| 129 |
+
model_dispersive_loss_layer=10
|
| 130 |
+
checkpoint_keep_interval_updates=1
|
| 131 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 0 ]]; then
|
| 132 |
+
echo "Config ${train_mode} ${config_option}"
|
| 133 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 134 |
+
task_load_clap_emb=true
|
| 135 |
+
model_proj_type=2
|
| 136 |
+
model_clone_batch=4
|
| 137 |
+
dataset_batch_size=48
|
| 138 |
+
model_clap_loss=1.0
|
| 139 |
+
average_top_k_layers=12
|
| 140 |
+
model_add_conv=false
|
| 141 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 1 ]]; then
|
| 142 |
+
echo "Config ${train_mode} ${config_option}"
|
| 143 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 144 |
+
task_load_clap_emb=true
|
| 145 |
+
model_proj_type=2
|
| 146 |
+
model_clone_batch=4
|
| 147 |
+
dataset_batch_size=48
|
| 148 |
+
model_clap_loss=1.0
|
| 149 |
+
average_top_k_layers=1
|
| 150 |
+
# loss type ablation
|
| 151 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 2 ]]; then
|
| 152 |
+
echo "Config ${train_mode} ${config_option}"
|
| 153 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 154 |
+
task_load_clap_emb=true
|
| 155 |
+
model_proj_type=2
|
| 156 |
+
model_clone_batch=4
|
| 157 |
+
dataset_batch_size=48
|
| 158 |
+
model_clap_loss=1.0
|
| 159 |
+
average_top_k_layers=12
|
| 160 |
+
model_clap_loss_type="ce"
|
| 161 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 3 ]]; then
|
| 162 |
+
echo "Config ${train_mode} ${config_option}"
|
| 163 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 164 |
+
task_load_clap_emb=true
|
| 165 |
+
model_proj_type=2
|
| 166 |
+
model_clone_batch=4
|
| 167 |
+
dataset_batch_size=48
|
| 168 |
+
model_clap_loss=1.0
|
| 169 |
+
average_top_k_layers=12
|
| 170 |
+
model_clap_loss_type="l1"
|
| 171 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 4 ]]; then
|
| 172 |
+
echo "Config ${train_mode} ${config_option}"
|
| 173 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 174 |
+
task_load_clap_emb=true
|
| 175 |
+
model_proj_type=2
|
| 176 |
+
model_clone_batch=4
|
| 177 |
+
dataset_batch_size=96
|
| 178 |
+
model_clap_loss=1.0
|
| 179 |
+
average_top_k_layers=12
|
| 180 |
+
model_clap_loss_type="cosine"
|
| 181 |
+
# loss layer ablation
|
| 182 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 5 ]]; then
|
| 183 |
+
echo "Config ${train_mode} ${config_option}"
|
| 184 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 185 |
+
task_load_clap_emb=true
|
| 186 |
+
model_proj_type=2
|
| 187 |
+
model_clone_batch=4
|
| 188 |
+
dataset_batch_size=96
|
| 189 |
+
model_clap_loss=1.0
|
| 190 |
+
average_top_k_layers=12
|
| 191 |
+
model_clap_loss_type="mse"
|
| 192 |
+
model_clap_loss_layer=10
|
| 193 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 6 ]]; then
|
| 194 |
+
echo "Config ${train_mode} ${config_option}"
|
| 195 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 196 |
+
task_load_clap_emb=true
|
| 197 |
+
task_load_source_file=true
|
| 198 |
+
task_load_mel_file=false
|
| 199 |
+
model_proj_type=2
|
| 200 |
+
model_clone_batch=4
|
| 201 |
+
dataset_batch_size=96
|
| 202 |
+
model_clap_loss=1.0
|
| 203 |
+
average_top_k_layers=12
|
| 204 |
+
model_clap_loss_type="mse"
|
| 205 |
+
model_clap_loss_layer=8
|
| 206 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 7 ]]; then
|
| 207 |
+
echo "Config ${train_mode} ${config_option}"
|
| 208 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 209 |
+
task_load_clap_emb=true
|
| 210 |
+
task_load_source_file=true
|
| 211 |
+
task_load_mel_file=false
|
| 212 |
+
model_proj_type=2
|
| 213 |
+
model_clone_batch=4
|
| 214 |
+
dataset_batch_size=96
|
| 215 |
+
model_clap_loss=1.0
|
| 216 |
+
average_top_k_layers=12
|
| 217 |
+
model_clap_loss_type="mse"
|
| 218 |
+
model_clap_loss_layer=6
|
| 219 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 8 ]]; then
|
| 220 |
+
echo "Config ${train_mode} ${config_option}"
|
| 221 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 222 |
+
task_load_clap_emb=true
|
| 223 |
+
task_load_source_file=true
|
| 224 |
+
task_load_mel_file=false
|
| 225 |
+
model_proj_type=2
|
| 226 |
+
model_clone_batch=4
|
| 227 |
+
model_clap_loss=5.0
|
| 228 |
+
dataset_batch_size=96
|
| 229 |
+
average_top_k_layers=12
|
| 230 |
+
model_clap_loss_type="mse"
|
| 231 |
+
checkpoint_keep_interval_updates=-1
|
| 232 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 9 ]]; then
|
| 233 |
+
echo "Config ${train_mode} ${config_option}"
|
| 234 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 235 |
+
task_load_clap_emb=true
|
| 236 |
+
task_load_source_file=true
|
| 237 |
+
task_load_mel_file=false
|
| 238 |
+
model_proj_type=2
|
| 239 |
+
model_clone_batch=4
|
| 240 |
+
model_clap_loss=0.1
|
| 241 |
+
dataset_batch_size=96
|
| 242 |
+
average_top_k_layers=12
|
| 243 |
+
model_clap_loss_type="mse"
|
| 244 |
+
checkpoint_keep_interval_updates=-1
|
| 245 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 0 ]]; then
|
| 246 |
+
echo "Config ${train_mode} ${config_option}"
|
| 247 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
|
| 248 |
+
task_load_clap_emb=true
|
| 249 |
+
model_proj_type=4
|
| 250 |
+
model_clone_batch=4
|
| 251 |
+
model_clap_loss=1.0
|
| 252 |
+
dataset_batch_size=48
|
| 253 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 1 ]]; then
|
| 254 |
+
echo "Config ${train_mode} ${config_option}"
|
| 255 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
|
| 256 |
+
task_load_clap_emb=true
|
| 257 |
+
model_proj_type=4
|
| 258 |
+
model_clone_batch=4
|
| 259 |
+
model_clap_loss=0.001
|
| 260 |
+
dataset_batch_size=48
|
| 261 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 2 ]]; then
|
| 262 |
+
echo "Config ${train_mode} ${config_option}"
|
| 263 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
|
| 264 |
+
task_load_clap_emb=true
|
| 265 |
+
model_proj_type=4
|
| 266 |
+
model_clone_batch=4
|
| 267 |
+
model_clap_loss=0.01
|
| 268 |
+
dataset_batch_size=48
|
| 269 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 3 ]]; then
|
| 270 |
+
echo "Config ${train_mode} ${config_option}"
|
| 271 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_out
|
| 272 |
+
task_load_clap_emb=true
|
| 273 |
+
model_proj_type=6
|
| 274 |
+
model_clone_batch=4
|
| 275 |
+
dataset_batch_size=48
|
| 276 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 0 ]]; then
|
| 277 |
+
echo "Config ${train_mode} ${config_option}"
|
| 278 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 279 |
+
task_load_clap_emb=true
|
| 280 |
+
task_load_source_file=true
|
| 281 |
+
task_load_mel_file=false
|
| 282 |
+
model_proj_type=2
|
| 283 |
+
model_clone_batch=4
|
| 284 |
+
dataset_batch_size=64 # original 48 oom on 4090 24G change distributed_world_size
|
| 285 |
+
model_clap_loss=1.0
|
| 286 |
+
average_top_k_layers=11 # modify with model depth
|
| 287 |
+
model_add_conv=true
|
| 288 |
+
model_depth=11 #
|
| 289 |
+
checkpoint_keep_interval_updates=-1 # default 1
|
| 290 |
+
checkpoint_save_interval_updates=10000
|
| 291 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 1 ]]; then
|
| 292 |
+
echo "Config ${train_mode} ${config_option}"
|
| 293 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 294 |
+
task_load_clap_emb=true
|
| 295 |
+
task_load_source_file=true
|
| 296 |
+
task_load_mel_file=false
|
| 297 |
+
model_proj_type=2
|
| 298 |
+
model_clone_batch=4
|
| 299 |
+
dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
|
| 300 |
+
model_clap_loss=1.0
|
| 301 |
+
average_top_k_layers=12 # modify with model depth
|
| 302 |
+
model_add_conv=true
|
| 303 |
+
model_modalities_image_conv_option=1
|
| 304 |
+
model_depth=12 #
|
| 305 |
+
checkpoint_keep_interval_updates=1 # default 1
|
| 306 |
+
checkpoint_save_interval_updates=10000
|
| 307 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 2 ]]; then
|
| 308 |
+
echo "Config ${train_mode} ${config_option}"
|
| 309 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 310 |
+
task_load_clap_emb=true
|
| 311 |
+
task_load_source_file=true
|
| 312 |
+
task_load_mel_file=false
|
| 313 |
+
model_proj_type=2
|
| 314 |
+
model_clone_batch=4
|
| 315 |
+
dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
|
| 316 |
+
model_clap_loss=1.0
|
| 317 |
+
average_top_k_layers=12 # modify with model depth
|
| 318 |
+
model_add_conv=true
|
| 319 |
+
model_modalities_image_conv_option=2
|
| 320 |
+
model_depth=12 #
|
| 321 |
+
checkpoint_keep_interval_updates=1 # default 1
|
| 322 |
+
checkpoint_save_interval_updates=10000
|
| 323 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 3 ]]; then
|
| 324 |
+
echo "Config ${train_mode} ${config_option}"
|
| 325 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 326 |
+
task_load_clap_emb=true
|
| 327 |
+
task_load_source_file=true
|
| 328 |
+
task_load_mel_file=false
|
| 329 |
+
model_proj_type=2
|
| 330 |
+
model_clone_batch=4
|
| 331 |
+
dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
|
| 332 |
+
model_clap_loss=1.0
|
| 333 |
+
average_top_k_layers=12 # modify with model depth
|
| 334 |
+
model_add_conv=true
|
| 335 |
+
model_modalities_image_conv_option=3
|
| 336 |
+
model_depth=12 #
|
| 337 |
+
checkpoint_keep_interval_updates=1 # default 1
|
| 338 |
+
checkpoint_save_interval_updates=10000
|
| 339 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 4 ]]; then
|
| 340 |
+
echo "Config ${train_mode} ${config_option}"
|
| 341 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 342 |
+
task_load_clap_emb=true
|
| 343 |
+
task_load_source_file=true
|
| 344 |
+
task_load_mel_file=false
|
| 345 |
+
model_proj_type=2
|
| 346 |
+
model_clone_batch=4
|
| 347 |
+
dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
|
| 348 |
+
model_clap_loss=1.0
|
| 349 |
+
average_top_k_layers=12 # modify with model depth
|
| 350 |
+
model_add_conv=true
|
| 351 |
+
model_modalities_image_conv_option=4
|
| 352 |
+
model_depth=12 #
|
| 353 |
+
checkpoint_keep_interval_updates=1 # default 1
|
| 354 |
+
checkpoint_save_interval_updates=10000
|
| 355 |
+
fi
|
| 356 |
+
|
| 357 |
+
python fairseq_cli/hydra_train.py -m \
|
| 358 |
+
--config-dir ./EAT/config \
|
| 359 |
+
--config-name pretraining_AS2M \
|
| 360 |
+
common.user_dir=./EAT \
|
| 361 |
+
checkpoint.save_dir=${checkpoint_save_dir} \
|
| 362 |
+
checkpoint.restore_file=${checkpoint_restore_file} \
|
| 363 |
+
distributed_training.distributed_world_size=${1:-2} \
|
| 364 |
+
dataset.num_workers=24 \
|
| 365 |
+
dataset.data_buffer_size=48 \
|
| 366 |
+
dataset.batch_size=${dataset_batch_size} \
|
| 367 |
+
task.data=${task_data} \
|
| 368 |
+
task.h5_format=False \
|
| 369 |
+
task.load_clap_emb=${task_load_clap_emb} \
|
| 370 |
+
+task.load_source_file=${task_load_source_file} \
|
| 371 |
+
+task.load_mel_file=${task_load_mel_file} \
|
| 372 |
+
model.proj_type=${model_proj_type} \
|
| 373 |
+
model.clone_batch=${model_clone_batch} \
|
| 374 |
+
model.clap_loss=${model_clap_loss} \
|
| 375 |
+
model.average_top_k_layers=${average_top_k_layers} \
|
| 376 |
+
+model.add_conv=${model_add_conv} \
|
| 377 |
+
+model.clap_loss_type=${model_clap_loss_type} \
|
| 378 |
+
+model.clap_loss_layer=${model_clap_loss_layer} \
|
| 379 |
+
+model.dispersive_loss=${model_dispersive_loss} \
|
| 380 |
+
+model.dispersive_loss_layer=${model_dispersive_loss_layer} \
|
| 381 |
+
model.depth=${model_depth} \
|
| 382 |
+
+model.modalities.image.conv_option=${model_modalities_image_conv_option} \
|
| 383 |
+
checkpoint.keep_interval_updates=${checkpoint_keep_interval_updates} \
|
| 384 |
+
checkpoint.save_interval_updates=${checkpoint_save_interval_updates}
|
pre_4_AS2M/conv_clap_4_2025-09-30_08-13-17/pretraining_AS2M.sh
ADDED
|
@@ -0,0 +1,384 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
# config options
|
| 3 |
+
train_mode=conv_clap
|
| 4 |
+
config_option=4
|
| 5 |
+
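# world size: pass it as the first positional argument (defaults to 2); it is forwarded to distributed_training.distributed_world_size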
|
| 6 |
+
|
| 7 |
+
# shared config
|
| 8 |
+
SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
|
| 9 |
+
checkpoint_save_dir=${SAVE_DIR_ROOT}/${train_mode}_${config_option}_$(date +"%Y-%m-%d_%H-%M-%S")
|
| 10 |
+
checkpoint_restore_file=${checkpoint_save_dir}/checkpoint_last.pt
|
| 11 |
+
|
| 12 |
+
# Absolute path and file name of this script (symlinks resolved)
|
| 13 |
+
script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
|
| 14 |
+
script_name="$(basename -- "$script_path")"
|
| 15 |
+
# Create the directory and copy this script into it (preserving permissions and timestamps)
|
| 16 |
+
mkdir -p -- "$checkpoint_save_dir"
|
| 17 |
+
cp -p -- "$script_path" "$checkpoint_save_dir/$script_name"
|
| 18 |
+
echo "script_path: ${script_path}"
|
| 19 |
+
echo "checkpoint_save_dir: ${checkpoint_save_dir}"
|
| 20 |
+
|
| 21 |
+
# default settings (overridden below by the branch matching train_mode / config_option)
|
| 22 |
+
model_clone_batch=4
|
| 23 |
+
dataset_batch_size=48
|
| 24 |
+
model_clap_loss=0
|
| 25 |
+
model_clap_loss_type="mse" # option ce cosine l1
|
| 26 |
+
model_clap_loss_layer=0
|
| 27 |
+
average_top_k_layers=12
|
| 28 |
+
model_add_conv=false
|
| 29 |
+
model_depth=12
|
| 30 |
+
model_dispersive_loss=0
|
| 31 |
+
model_dispersive_loss_layer=0
|
| 32 |
+
checkpoint_keep_interval_updates=1 # TODO change this parameter if need
|
| 33 |
+
checkpoint_save_interval_updates=10000
|
| 34 |
+
model_modalities_image_conv_option=0
|
| 35 |
+
|
| 36 |
+
if [[ $train_mode == "default" && ${config_option} -eq 0 ]]; then
|
| 37 |
+
echo "Config ${train_mode} ${config_option}"
|
| 38 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 39 |
+
task_load_clap_emb=false
|
| 40 |
+
task_load_source_file=true
|
| 41 |
+
task_load_mel_file=false
|
| 42 |
+
model_proj_type=null
|
| 43 |
+
model_clone_batch=4
|
| 44 |
+
dataset_batch_size=96
|
| 45 |
+
model_clap_loss=0
|
| 46 |
+
checkpoint_keep_interval_updates=-1
|
| 47 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 0 ]]; then
|
| 48 |
+
echo "Config ${train_mode} ${config_option}"
|
| 49 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 50 |
+
task_load_clap_emb=false
|
| 51 |
+
task_load_source_file=true
|
| 52 |
+
task_load_mel_file=false
|
| 53 |
+
model_proj_type=null
|
| 54 |
+
model_clone_batch=4
|
| 55 |
+
dataset_batch_size=96
|
| 56 |
+
model_dispersive_loss=1
|
| 57 |
+
model_dispersive_loss_layer=0
|
| 58 |
+
checkpoint_keep_interval_updates=1
|
| 59 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 1 ]]; then
|
| 60 |
+
echo "Config ${train_mode} ${config_option}"
|
| 61 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 62 |
+
task_load_clap_emb=false
|
| 63 |
+
task_load_source_file=true
|
| 64 |
+
task_load_mel_file=false
|
| 65 |
+
model_proj_type=null
|
| 66 |
+
model_clone_batch=1
|
| 67 |
+
dataset_batch_size=384
|
| 68 |
+
model_dispersive_loss=1
|
| 69 |
+
model_dispersive_loss_layer=0
|
| 70 |
+
checkpoint_keep_interval_updates=1
|
| 71 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 2 ]]; then
|
| 72 |
+
echo "Config ${train_mode} ${config_option}"
|
| 73 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 74 |
+
task_load_clap_emb=false
|
| 75 |
+
task_load_source_file=true
|
| 76 |
+
task_load_mel_file=false
|
| 77 |
+
model_proj_type=null
|
| 78 |
+
model_clone_batch=1
|
| 79 |
+
dataset_batch_size=384
|
| 80 |
+
model_dispersive_loss=10.0
|
| 81 |
+
model_dispersive_loss_layer=0
|
| 82 |
+
checkpoint_keep_interval_updates=1
|
| 83 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 3 ]]; then
|
| 84 |
+
echo "Config ${train_mode} ${config_option}"
|
| 85 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 86 |
+
task_load_clap_emb=false
|
| 87 |
+
task_load_source_file=true
|
| 88 |
+
task_load_mel_file=false
|
| 89 |
+
model_proj_type=null
|
| 90 |
+
model_clone_batch=1
|
| 91 |
+
dataset_batch_size=384
|
| 92 |
+
model_dispersive_loss=100.0
|
| 93 |
+
model_dispersive_loss_layer=0
|
| 94 |
+
checkpoint_keep_interval_updates=1
|
| 95 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 4 ]]; then
|
| 96 |
+
echo "Config ${train_mode} ${config_option}"
|
| 97 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 98 |
+
task_load_clap_emb=false
|
| 99 |
+
task_load_source_file=true
|
| 100 |
+
task_load_mel_file=false
|
| 101 |
+
model_proj_type=null
|
| 102 |
+
model_clone_batch=1
|
| 103 |
+
dataset_batch_size=384
|
| 104 |
+
model_dispersive_loss=10000.0
|
| 105 |
+
model_dispersive_loss_layer=0
|
| 106 |
+
checkpoint_keep_interval_updates=1
|
| 107 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 5 ]]; then
|
| 108 |
+
echo "Config ${train_mode} ${config_option}"
|
| 109 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 110 |
+
task_load_clap_emb=false
|
| 111 |
+
task_load_source_file=true
|
| 112 |
+
task_load_mel_file=false
|
| 113 |
+
model_proj_type=null
|
| 114 |
+
model_clone_batch=1
|
| 115 |
+
dataset_batch_size=384
|
| 116 |
+
model_dispersive_loss=1000.0
|
| 117 |
+
model_dispersive_loss_layer=0
|
| 118 |
+
checkpoint_keep_interval_updates=1
|
| 119 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 6 ]]; then
|
| 120 |
+
echo "Config ${train_mode} ${config_option}"
|
| 121 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 122 |
+
task_load_clap_emb=false
|
| 123 |
+
task_load_source_file=true
|
| 124 |
+
task_load_mel_file=false
|
| 125 |
+
model_proj_type=null
|
| 126 |
+
model_clone_batch=4
|
| 127 |
+
dataset_batch_size=96
|
| 128 |
+
model_dispersive_loss=1000.0
|
| 129 |
+
model_dispersive_loss_layer=10
|
| 130 |
+
checkpoint_keep_interval_updates=1
|
| 131 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 0 ]]; then
|
| 132 |
+
echo "Config ${train_mode} ${config_option}"
|
| 133 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 134 |
+
task_load_clap_emb=true
|
| 135 |
+
model_proj_type=2
|
| 136 |
+
model_clone_batch=4
|
| 137 |
+
dataset_batch_size=48
|
| 138 |
+
model_clap_loss=1.0
|
| 139 |
+
average_top_k_layers=12
|
| 140 |
+
model_add_conv=false
|
| 141 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 1 ]]; then
|
| 142 |
+
echo "Config ${train_mode} ${config_option}"
|
| 143 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 144 |
+
task_load_clap_emb=true
|
| 145 |
+
model_proj_type=2
|
| 146 |
+
model_clone_batch=4
|
| 147 |
+
dataset_batch_size=48
|
| 148 |
+
model_clap_loss=1.0
|
| 149 |
+
average_top_k_layers=1
|
| 150 |
+
# loss type ablation
|
| 151 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 2 ]]; then
|
| 152 |
+
echo "Config ${train_mode} ${config_option}"
|
| 153 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 154 |
+
task_load_clap_emb=true
|
| 155 |
+
model_proj_type=2
|
| 156 |
+
model_clone_batch=4
|
| 157 |
+
dataset_batch_size=48
|
| 158 |
+
model_clap_loss=1.0
|
| 159 |
+
average_top_k_layers=12
|
| 160 |
+
model_clap_loss_type="ce"
|
| 161 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 3 ]]; then
|
| 162 |
+
echo "Config ${train_mode} ${config_option}"
|
| 163 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 164 |
+
task_load_clap_emb=true
|
| 165 |
+
model_proj_type=2
|
| 166 |
+
model_clone_batch=4
|
| 167 |
+
dataset_batch_size=48
|
| 168 |
+
model_clap_loss=1.0
|
| 169 |
+
average_top_k_layers=12
|
| 170 |
+
model_clap_loss_type="l1"
|
| 171 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 4 ]]; then
|
| 172 |
+
echo "Config ${train_mode} ${config_option}"
|
| 173 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 174 |
+
task_load_clap_emb=true
|
| 175 |
+
model_proj_type=2
|
| 176 |
+
model_clone_batch=4
|
| 177 |
+
dataset_batch_size=96
|
| 178 |
+
model_clap_loss=1.0
|
| 179 |
+
average_top_k_layers=12
|
| 180 |
+
model_clap_loss_type="cosine"
|
| 181 |
+
# loss layer ablation
|
| 182 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 5 ]]; then
|
| 183 |
+
echo "Config ${train_mode} ${config_option}"
|
| 184 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 185 |
+
task_load_clap_emb=true
|
| 186 |
+
model_proj_type=2
|
| 187 |
+
model_clone_batch=4
|
| 188 |
+
dataset_batch_size=96
|
| 189 |
+
model_clap_loss=1.0
|
| 190 |
+
average_top_k_layers=12
|
| 191 |
+
model_clap_loss_type="mse"
|
| 192 |
+
model_clap_loss_layer=10
|
| 193 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 6 ]]; then
|
| 194 |
+
echo "Config ${train_mode} ${config_option}"
|
| 195 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 196 |
+
task_load_clap_emb=true
|
| 197 |
+
task_load_source_file=true
|
| 198 |
+
task_load_mel_file=false
|
| 199 |
+
model_proj_type=2
|
| 200 |
+
model_clone_batch=4
|
| 201 |
+
dataset_batch_size=96
|
| 202 |
+
model_clap_loss=1.0
|
| 203 |
+
average_top_k_layers=12
|
| 204 |
+
model_clap_loss_type="mse"
|
| 205 |
+
model_clap_loss_layer=8
|
| 206 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 7 ]]; then
|
| 207 |
+
echo "Config ${train_mode} ${config_option}"
|
| 208 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 209 |
+
task_load_clap_emb=true
|
| 210 |
+
task_load_source_file=true
|
| 211 |
+
task_load_mel_file=false
|
| 212 |
+
model_proj_type=2
|
| 213 |
+
model_clone_batch=4
|
| 214 |
+
dataset_batch_size=96
|
| 215 |
+
model_clap_loss=1.0
|
| 216 |
+
average_top_k_layers=12
|
| 217 |
+
model_clap_loss_type="mse"
|
| 218 |
+
model_clap_loss_layer=6
|
| 219 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 8 ]]; then
|
| 220 |
+
echo "Config ${train_mode} ${config_option}"
|
| 221 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 222 |
+
task_load_clap_emb=true
|
| 223 |
+
task_load_source_file=true
|
| 224 |
+
task_load_mel_file=false
|
| 225 |
+
model_proj_type=2
|
| 226 |
+
model_clone_batch=4
|
| 227 |
+
model_clap_loss=5.0
|
| 228 |
+
dataset_batch_size=96
|
| 229 |
+
average_top_k_layers=12
|
| 230 |
+
model_clap_loss_type="mse"
|
| 231 |
+
checkpoint_keep_interval_updates=-1
|
| 232 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 9 ]]; then
|
| 233 |
+
echo "Config ${train_mode} ${config_option}"
|
| 234 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 235 |
+
task_load_clap_emb=true
|
| 236 |
+
task_load_source_file=true
|
| 237 |
+
task_load_mel_file=false
|
| 238 |
+
model_proj_type=2
|
| 239 |
+
model_clone_batch=4
|
| 240 |
+
model_clap_loss=0.1
|
| 241 |
+
dataset_batch_size=96
|
| 242 |
+
average_top_k_layers=12
|
| 243 |
+
model_clap_loss_type="mse"
|
| 244 |
+
checkpoint_keep_interval_updates=-1
|
| 245 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 0 ]]; then
|
| 246 |
+
echo "Config ${train_mode} ${config_option}"
|
| 247 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
|
| 248 |
+
task_load_clap_emb=true
|
| 249 |
+
model_proj_type=4
|
| 250 |
+
model_clone_batch=4
|
| 251 |
+
model_clap_loss=1.0
|
| 252 |
+
dataset_batch_size=48
|
| 253 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 1 ]]; then
|
| 254 |
+
echo "Config ${train_mode} ${config_option}"
|
| 255 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
|
| 256 |
+
task_load_clap_emb=true
|
| 257 |
+
model_proj_type=4
|
| 258 |
+
model_clone_batch=4
|
| 259 |
+
model_clap_loss=0.001
|
| 260 |
+
dataset_batch_size=48
|
| 261 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 2 ]]; then
|
| 262 |
+
echo "Config ${train_mode} ${config_option}"
|
| 263 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
|
| 264 |
+
task_load_clap_emb=true
|
| 265 |
+
model_proj_type=4
|
| 266 |
+
model_clone_batch=4
|
| 267 |
+
model_clap_loss=0.01
|
| 268 |
+
dataset_batch_size=48
|
| 269 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 3 ]]; then
|
| 270 |
+
echo "Config ${train_mode} ${config_option}"
|
| 271 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_out
|
| 272 |
+
task_load_clap_emb=true
|
| 273 |
+
model_proj_type=6
|
| 274 |
+
model_clone_batch=4
|
| 275 |
+
dataset_batch_size=48
|
| 276 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 0 ]]; then
|
| 277 |
+
echo "Config ${train_mode} ${config_option}"
|
| 278 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 279 |
+
task_load_clap_emb=true
|
| 280 |
+
task_load_source_file=true
|
| 281 |
+
task_load_mel_file=false
|
| 282 |
+
model_proj_type=2
|
| 283 |
+
model_clone_batch=4
|
| 284 |
+
dataset_batch_size=64 # original 48 oom on 4090 24G change distributed_world_size
|
| 285 |
+
model_clap_loss=1.0
|
| 286 |
+
average_top_k_layers=11 # modify with model depth
|
| 287 |
+
model_add_conv=true
|
| 288 |
+
model_depth=11 #
|
| 289 |
+
checkpoint_keep_interval_updates=-1 # default 1
|
| 290 |
+
checkpoint_save_interval_updates=10000
|
| 291 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 1 ]]; then
|
| 292 |
+
echo "Config ${train_mode} ${config_option}"
|
| 293 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 294 |
+
task_load_clap_emb=true
|
| 295 |
+
task_load_source_file=true
|
| 296 |
+
task_load_mel_file=false
|
| 297 |
+
model_proj_type=2
|
| 298 |
+
model_clone_batch=4
|
| 299 |
+
dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
|
| 300 |
+
model_clap_loss=1.0
|
| 301 |
+
average_top_k_layers=12 # modify with model depth
|
| 302 |
+
model_add_conv=true
|
| 303 |
+
model_modalities_image_conv_option=1
|
| 304 |
+
model_depth=12 #
|
| 305 |
+
checkpoint_keep_interval_updates=1 # default 1
|
| 306 |
+
checkpoint_save_interval_updates=10000
|
| 307 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 2 ]]; then
|
| 308 |
+
echo "Config ${train_mode} ${config_option}"
|
| 309 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 310 |
+
task_load_clap_emb=true
|
| 311 |
+
task_load_source_file=true
|
| 312 |
+
task_load_mel_file=false
|
| 313 |
+
model_proj_type=2
|
| 314 |
+
model_clone_batch=4
|
| 315 |
+
dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
|
| 316 |
+
model_clap_loss=1.0
|
| 317 |
+
average_top_k_layers=12 # modify with model depth
|
| 318 |
+
model_add_conv=true
|
| 319 |
+
model_modalities_image_conv_option=2
|
| 320 |
+
model_depth=12 #
|
| 321 |
+
checkpoint_keep_interval_updates=1 # default 1
|
| 322 |
+
checkpoint_save_interval_updates=10000
|
| 323 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 3 ]]; then
|
| 324 |
+
echo "Config ${train_mode} ${config_option}"
|
| 325 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 326 |
+
task_load_clap_emb=true
|
| 327 |
+
task_load_source_file=true
|
| 328 |
+
task_load_mel_file=false
|
| 329 |
+
model_proj_type=2
|
| 330 |
+
model_clone_batch=4
|
| 331 |
+
dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
|
| 332 |
+
model_clap_loss=1.0
|
| 333 |
+
average_top_k_layers=12 # modify with model depth
|
| 334 |
+
model_add_conv=true
|
| 335 |
+
model_modalities_image_conv_option=3
|
| 336 |
+
model_depth=12 #
|
| 337 |
+
checkpoint_keep_interval_updates=1 # default 1
|
| 338 |
+
checkpoint_save_interval_updates=10000
|
| 339 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 4 ]]; then
|
| 340 |
+
echo "Config ${train_mode} ${config_option}"
|
| 341 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 342 |
+
task_load_clap_emb=true
|
| 343 |
+
task_load_source_file=true
|
| 344 |
+
task_load_mel_file=false
|
| 345 |
+
model_proj_type=2
|
| 346 |
+
model_clone_batch=4
|
| 347 |
+
dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
|
| 348 |
+
model_clap_loss=1.0
|
| 349 |
+
average_top_k_layers=12 # modify with model depth
|
| 350 |
+
model_add_conv=true
|
| 351 |
+
model_modalities_image_conv_option=4
|
| 352 |
+
model_depth=12 #
|
| 353 |
+
checkpoint_keep_interval_updates=1 # default 1
|
| 354 |
+
checkpoint_save_interval_updates=10000
|
| 355 |
+
fi
|
| 356 |
+
|
| 357 |
+
python fairseq_cli/hydra_train.py -m \
|
| 358 |
+
--config-dir ./EAT/config \
|
| 359 |
+
--config-name pretraining_AS2M \
|
| 360 |
+
common.user_dir=./EAT \
|
| 361 |
+
checkpoint.save_dir=${checkpoint_save_dir} \
|
| 362 |
+
checkpoint.restore_file=${checkpoint_restore_file} \
|
| 363 |
+
distributed_training.distributed_world_size=${1:-2} \
|
| 364 |
+
dataset.num_workers=24 \
|
| 365 |
+
dataset.data_buffer_size=48 \
|
| 366 |
+
dataset.batch_size=${dataset_batch_size} \
|
| 367 |
+
task.data=${task_data} \
|
| 368 |
+
task.h5_format=False \
|
| 369 |
+
task.load_clap_emb=${task_load_clap_emb} \
|
| 370 |
+
+task.load_source_file=${task_load_source_file} \
|
| 371 |
+
+task.load_mel_file=${task_load_mel_file} \
|
| 372 |
+
model.proj_type=${model_proj_type} \
|
| 373 |
+
model.clone_batch=${model_clone_batch} \
|
| 374 |
+
model.clap_loss=${model_clap_loss} \
|
| 375 |
+
model.average_top_k_layers=${average_top_k_layers} \
|
| 376 |
+
+model.add_conv=${model_add_conv} \
|
| 377 |
+
+model.clap_loss_type=${model_clap_loss_type} \
|
| 378 |
+
+model.clap_loss_layer=${model_clap_loss_layer} \
|
| 379 |
+
+model.dispersive_loss=${model_dispersive_loss} \
|
| 380 |
+
+model.dispersive_loss_layer=${model_dispersive_loss_layer} \
|
| 381 |
+
model.depth=${model_depth} \
|
| 382 |
+
+model.modalities.image.conv_option=${model_modalities_image_conv_option} \
|
| 383 |
+
checkpoint.keep_interval_updates=${checkpoint_keep_interval_updates} \
|
| 384 |
+
checkpoint.save_interval_updates=${checkpoint_save_interval_updates}
|
pre_4_AS2M/conv_clap_4_2025-09-30_08-23-09/pretraining_AS2M.sh
ADDED
|
@@ -0,0 +1,384 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
# config options
|
| 3 |
+
train_mode=conv_clap
|
| 4 |
+
config_option=4
|
| 5 |
+
# distributed world size is taken from the first positional argument ($1); it defaults to 2
|
| 6 |
+
|
| 7 |
+
# shared config
|
| 8 |
+
SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
|
| 9 |
+
checkpoint_save_dir=${SAVE_DIR_ROOT}/${train_mode}_${config_option}_$(date +"%Y-%m-%d_%H-%M-%S")
|
| 10 |
+
checkpoint_restore_file=${checkpoint_save_dir}/checkpoint_last.pt
|
| 11 |
+
|
| 12 |
+
# Absolute path and file name of this script (symlinks resolved)
|
| 13 |
+
script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
|
| 14 |
+
script_name="$(basename -- "$script_path")"
|
| 15 |
+
# Create the save directory and copy this script into it (preserving permissions and timestamps)
|
| 16 |
+
mkdir -p -- "$checkpoint_save_dir"
|
| 17 |
+
cp -p -- "$script_path" "$checkpoint_save_dir/$script_name"
|
| 18 |
+
echo "script_path: ${script_path}"
|
| 19 |
+
echo "checkpoint_save_dir: ${checkpoint_save_dir}"
|
| 20 |
+
|
| 21 |
+
# default setting
# Baseline values shared by every configuration; the branch selected by
# train_mode/config_option overrides only the settings it needs to change.
model_clone_batch=4
dataset_batch_size=48
model_clap_loss=0
model_clap_loss_type="mse" # option ce cosine l1
model_clap_loss_layer=0
average_top_k_layers=12
model_add_conv=false
model_depth=12
model_dispersive_loss=0
model_dispersive_loss_layer=0
checkpoint_keep_interval_updates=1 # TODO change this parameter if need
checkpoint_save_interval_updates=10000
model_modalities_image_conv_option=0
# Several branches (clap 0-5, ast 0-3) never assign the two flags below, which
# left the +task.load_source_file / +task.load_mel_file overrides empty.
# Default them to the values used by every branch that does set them.
task_load_source_file=true
task_load_mel_file=false
|
| 35 |
+
|
| 36 |
+
if [[ $train_mode == "default" && ${config_option} -eq 0 ]]; then
|
| 37 |
+
echo "Config ${train_mode} ${config_option}"
|
| 38 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 39 |
+
task_load_clap_emb=false
|
| 40 |
+
task_load_source_file=true
|
| 41 |
+
task_load_mel_file=false
|
| 42 |
+
model_proj_type=null
|
| 43 |
+
model_clone_batch=4
|
| 44 |
+
dataset_batch_size=96
|
| 45 |
+
model_clap_loss=0
|
| 46 |
+
checkpoint_keep_interval_updates=-1
|
| 47 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 0 ]]; then
|
| 48 |
+
echo "Config ${train_mode} ${config_option}"
|
| 49 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 50 |
+
task_load_clap_emb=false
|
| 51 |
+
task_load_source_file=true
|
| 52 |
+
task_load_mel_file=false
|
| 53 |
+
model_proj_type=null
|
| 54 |
+
model_clone_batch=4
|
| 55 |
+
dataset_batch_size=96
|
| 56 |
+
model_dispersive_loss=1
|
| 57 |
+
model_dispersive_loss_layer=0
|
| 58 |
+
checkpoint_keep_interval_updates=1
|
| 59 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 1 ]]; then
|
| 60 |
+
echo "Config ${train_mode} ${config_option}"
|
| 61 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 62 |
+
task_load_clap_emb=false
|
| 63 |
+
task_load_source_file=true
|
| 64 |
+
task_load_mel_file=false
|
| 65 |
+
model_proj_type=null
|
| 66 |
+
model_clone_batch=1
|
| 67 |
+
dataset_batch_size=384
|
| 68 |
+
model_dispersive_loss=1
|
| 69 |
+
model_dispersive_loss_layer=0
|
| 70 |
+
checkpoint_keep_interval_updates=1
|
| 71 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 2 ]]; then
|
| 72 |
+
echo "Config ${train_mode} ${config_option}"
|
| 73 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 74 |
+
task_load_clap_emb=false
|
| 75 |
+
task_load_source_file=true
|
| 76 |
+
task_load_mel_file=false
|
| 77 |
+
model_proj_type=null
|
| 78 |
+
model_clone_batch=1
|
| 79 |
+
dataset_batch_size=384
|
| 80 |
+
model_dispersive_loss=10.0
|
| 81 |
+
model_dispersive_loss_layer=0
|
| 82 |
+
checkpoint_keep_interval_updates=1
|
| 83 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 3 ]]; then
|
| 84 |
+
echo "Config ${train_mode} ${config_option}"
|
| 85 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 86 |
+
task_load_clap_emb=false
|
| 87 |
+
task_load_source_file=true
|
| 88 |
+
task_load_mel_file=false
|
| 89 |
+
model_proj_type=null
|
| 90 |
+
model_clone_batch=1
|
| 91 |
+
dataset_batch_size=384
|
| 92 |
+
model_dispersive_loss=100.0
|
| 93 |
+
model_dispersive_loss_layer=0
|
| 94 |
+
checkpoint_keep_interval_updates=1
|
| 95 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 4 ]]; then
|
| 96 |
+
echo "Config ${train_mode} ${config_option}"
|
| 97 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 98 |
+
task_load_clap_emb=false
|
| 99 |
+
task_load_source_file=true
|
| 100 |
+
task_load_mel_file=false
|
| 101 |
+
model_proj_type=null
|
| 102 |
+
model_clone_batch=1
|
| 103 |
+
dataset_batch_size=384
|
| 104 |
+
model_dispersive_loss=10000.0
|
| 105 |
+
model_dispersive_loss_layer=0
|
| 106 |
+
checkpoint_keep_interval_updates=1
|
| 107 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 5 ]]; then
|
| 108 |
+
echo "Config ${train_mode} ${config_option}"
|
| 109 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 110 |
+
task_load_clap_emb=false
|
| 111 |
+
task_load_source_file=true
|
| 112 |
+
task_load_mel_file=false
|
| 113 |
+
model_proj_type=null
|
| 114 |
+
model_clone_batch=1
|
| 115 |
+
dataset_batch_size=384
|
| 116 |
+
model_dispersive_loss=1000.0
|
| 117 |
+
model_dispersive_loss_layer=0
|
| 118 |
+
checkpoint_keep_interval_updates=1
|
| 119 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 6 ]]; then
|
| 120 |
+
echo "Config ${train_mode} ${config_option}"
|
| 121 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 122 |
+
task_load_clap_emb=false
|
| 123 |
+
task_load_source_file=true
|
| 124 |
+
task_load_mel_file=false
|
| 125 |
+
model_proj_type=null
|
| 126 |
+
model_clone_batch=4
|
| 127 |
+
dataset_batch_size=96
|
| 128 |
+
model_dispersive_loss=1000.0
|
| 129 |
+
model_dispersive_loss_layer=10
|
| 130 |
+
checkpoint_keep_interval_updates=1
|
| 131 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 0 ]]; then
|
| 132 |
+
echo "Config ${train_mode} ${config_option}"
|
| 133 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 134 |
+
task_load_clap_emb=true
|
| 135 |
+
model_proj_type=2
|
| 136 |
+
model_clone_batch=4
|
| 137 |
+
dataset_batch_size=48
|
| 138 |
+
model_clap_loss=1.0
|
| 139 |
+
average_top_k_layers=12
|
| 140 |
+
model_add_conv=false
|
| 141 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 1 ]]; then
|
| 142 |
+
echo "Config ${train_mode} ${config_option}"
|
| 143 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 144 |
+
task_load_clap_emb=true
|
| 145 |
+
model_proj_type=2
|
| 146 |
+
model_clone_batch=4
|
| 147 |
+
dataset_batch_size=48
|
| 148 |
+
model_clap_loss=1.0
|
| 149 |
+
average_top_k_layers=1
|
| 150 |
+
# loss type ablation
|
| 151 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 2 ]]; then
|
| 152 |
+
echo "Config ${train_mode} ${config_option}"
|
| 153 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 154 |
+
task_load_clap_emb=true
|
| 155 |
+
model_proj_type=2
|
| 156 |
+
model_clone_batch=4
|
| 157 |
+
dataset_batch_size=48
|
| 158 |
+
model_clap_loss=1.0
|
| 159 |
+
average_top_k_layers=12
|
| 160 |
+
model_clap_loss_type="ce"
|
| 161 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 3 ]]; then
|
| 162 |
+
echo "Config ${train_mode} ${config_option}"
|
| 163 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 164 |
+
task_load_clap_emb=true
|
| 165 |
+
model_proj_type=2
|
| 166 |
+
model_clone_batch=4
|
| 167 |
+
dataset_batch_size=48
|
| 168 |
+
model_clap_loss=1.0
|
| 169 |
+
average_top_k_layers=12
|
| 170 |
+
model_clap_loss_type="l1"
|
| 171 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 4 ]]; then
|
| 172 |
+
echo "Config ${train_mode} ${config_option}"
|
| 173 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 174 |
+
task_load_clap_emb=true
|
| 175 |
+
model_proj_type=2
|
| 176 |
+
model_clone_batch=4
|
| 177 |
+
dataset_batch_size=96
|
| 178 |
+
model_clap_loss=1.0
|
| 179 |
+
average_top_k_layers=12
|
| 180 |
+
model_clap_loss_type="cosine"
|
| 181 |
+
# loss layer ablation
|
| 182 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 5 ]]; then
|
| 183 |
+
echo "Config ${train_mode} ${config_option}"
|
| 184 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 185 |
+
task_load_clap_emb=true
|
| 186 |
+
model_proj_type=2
|
| 187 |
+
model_clone_batch=4
|
| 188 |
+
dataset_batch_size=96
|
| 189 |
+
model_clap_loss=1.0
|
| 190 |
+
average_top_k_layers=12
|
| 191 |
+
model_clap_loss_type="mse"
|
| 192 |
+
model_clap_loss_layer=10
|
| 193 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 6 ]]; then
|
| 194 |
+
echo "Config ${train_mode} ${config_option}"
|
| 195 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 196 |
+
task_load_clap_emb=true
|
| 197 |
+
task_load_source_file=true
|
| 198 |
+
task_load_mel_file=false
|
| 199 |
+
model_proj_type=2
|
| 200 |
+
model_clone_batch=4
|
| 201 |
+
dataset_batch_size=96
|
| 202 |
+
model_clap_loss=1.0
|
| 203 |
+
average_top_k_layers=12
|
| 204 |
+
model_clap_loss_type="mse"
|
| 205 |
+
model_clap_loss_layer=8
|
| 206 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 7 ]]; then
|
| 207 |
+
echo "Config ${train_mode} ${config_option}"
|
| 208 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 209 |
+
task_load_clap_emb=true
|
| 210 |
+
task_load_source_file=true
|
| 211 |
+
task_load_mel_file=false
|
| 212 |
+
model_proj_type=2
|
| 213 |
+
model_clone_batch=4
|
| 214 |
+
dataset_batch_size=96
|
| 215 |
+
model_clap_loss=1.0
|
| 216 |
+
average_top_k_layers=12
|
| 217 |
+
model_clap_loss_type="mse"
|
| 218 |
+
model_clap_loss_layer=6
|
| 219 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 8 ]]; then
|
| 220 |
+
echo "Config ${train_mode} ${config_option}"
|
| 221 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 222 |
+
task_load_clap_emb=true
|
| 223 |
+
task_load_source_file=true
|
| 224 |
+
task_load_mel_file=false
|
| 225 |
+
model_proj_type=2
|
| 226 |
+
model_clone_batch=4
|
| 227 |
+
model_clap_loss=5.0
|
| 228 |
+
dataset_batch_size=96
|
| 229 |
+
average_top_k_layers=12
|
| 230 |
+
model_clap_loss_type="mse"
|
| 231 |
+
checkpoint_keep_interval_updates=-1
|
| 232 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 9 ]]; then
|
| 233 |
+
echo "Config ${train_mode} ${config_option}"
|
| 234 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 235 |
+
task_load_clap_emb=true
|
| 236 |
+
task_load_source_file=true
|
| 237 |
+
task_load_mel_file=false
|
| 238 |
+
model_proj_type=2
|
| 239 |
+
model_clone_batch=4
|
| 240 |
+
model_clap_loss=0.1
|
| 241 |
+
dataset_batch_size=96
|
| 242 |
+
average_top_k_layers=12
|
| 243 |
+
model_clap_loss_type="mse"
|
| 244 |
+
checkpoint_keep_interval_updates=-1
|
| 245 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 0 ]]; then
|
| 246 |
+
echo "Config ${train_mode} ${config_option}"
|
| 247 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
|
| 248 |
+
task_load_clap_emb=true
|
| 249 |
+
model_proj_type=4
|
| 250 |
+
model_clone_batch=4
|
| 251 |
+
model_clap_loss=1.0
|
| 252 |
+
dataset_batch_size=48
|
| 253 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 1 ]]; then
|
| 254 |
+
echo "Config ${train_mode} ${config_option}"
|
| 255 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
|
| 256 |
+
task_load_clap_emb=true
|
| 257 |
+
model_proj_type=4
|
| 258 |
+
model_clone_batch=4
|
| 259 |
+
model_clap_loss=0.001
|
| 260 |
+
dataset_batch_size=48
|
| 261 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 2 ]]; then
|
| 262 |
+
echo "Config ${train_mode} ${config_option}"
|
| 263 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
|
| 264 |
+
task_load_clap_emb=true
|
| 265 |
+
model_proj_type=4
|
| 266 |
+
model_clone_batch=4
|
| 267 |
+
model_clap_loss=0.01
|
| 268 |
+
dataset_batch_size=48
|
| 269 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 3 ]]; then
|
| 270 |
+
echo "Config ${train_mode} ${config_option}"
|
| 271 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_out
|
| 272 |
+
task_load_clap_emb=true
|
| 273 |
+
model_proj_type=6
|
| 274 |
+
model_clone_batch=4
|
| 275 |
+
dataset_batch_size=48
|
| 276 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 0 ]]; then
|
| 277 |
+
echo "Config ${train_mode} ${config_option}"
|
| 278 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 279 |
+
task_load_clap_emb=true
|
| 280 |
+
task_load_source_file=true
|
| 281 |
+
task_load_mel_file=false
|
| 282 |
+
model_proj_type=2
|
| 283 |
+
model_clone_batch=4
|
| 284 |
+
dataset_batch_size=64 # original 48 oom on 4090 24G change distributed_world_size
|
| 285 |
+
model_clap_loss=1.0
|
| 286 |
+
average_top_k_layers=11 # modify with model depth
|
| 287 |
+
model_add_conv=true
|
| 288 |
+
model_depth=11 #
|
| 289 |
+
checkpoint_keep_interval_updates=-1 # default 1
|
| 290 |
+
checkpoint_save_interval_updates=10000
|
| 291 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 1 ]]; then
|
| 292 |
+
echo "Config ${train_mode} ${config_option}"
|
| 293 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 294 |
+
task_load_clap_emb=true
|
| 295 |
+
task_load_source_file=true
|
| 296 |
+
task_load_mel_file=false
|
| 297 |
+
model_proj_type=2
|
| 298 |
+
model_clone_batch=4
|
| 299 |
+
dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
|
| 300 |
+
model_clap_loss=1.0
|
| 301 |
+
average_top_k_layers=12 # modify with model depth
|
| 302 |
+
model_add_conv=true
|
| 303 |
+
model_modalities_image_conv_option=1
|
| 304 |
+
model_depth=12 #
|
| 305 |
+
checkpoint_keep_interval_updates=1 # default 1
|
| 306 |
+
checkpoint_save_interval_updates=10000
|
| 307 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 2 ]]; then
|
| 308 |
+
echo "Config ${train_mode} ${config_option}"
|
| 309 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 310 |
+
task_load_clap_emb=true
|
| 311 |
+
task_load_source_file=true
|
| 312 |
+
task_load_mel_file=false
|
| 313 |
+
model_proj_type=2
|
| 314 |
+
model_clone_batch=4
|
| 315 |
+
dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
|
| 316 |
+
model_clap_loss=1.0
|
| 317 |
+
average_top_k_layers=12 # modify with model depth
|
| 318 |
+
model_add_conv=true
|
| 319 |
+
model_modalities_image_conv_option=2
|
| 320 |
+
model_depth=12 #
|
| 321 |
+
checkpoint_keep_interval_updates=1 # default 1
|
| 322 |
+
checkpoint_save_interval_updates=10000
|
| 323 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 3 ]]; then
|
| 324 |
+
echo "Config ${train_mode} ${config_option}"
|
| 325 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 326 |
+
task_load_clap_emb=true
|
| 327 |
+
task_load_source_file=true
|
| 328 |
+
task_load_mel_file=false
|
| 329 |
+
model_proj_type=2
|
| 330 |
+
model_clone_batch=4
|
| 331 |
+
dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
|
| 332 |
+
model_clap_loss=1.0
|
| 333 |
+
average_top_k_layers=12 # modify with model depth
|
| 334 |
+
model_add_conv=true
|
| 335 |
+
model_modalities_image_conv_option=3
|
| 336 |
+
model_depth=12 #
|
| 337 |
+
checkpoint_keep_interval_updates=1 # default 1
|
| 338 |
+
checkpoint_save_interval_updates=10000
|
| 339 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 4 ]]; then
|
| 340 |
+
echo "Config ${train_mode} ${config_option}"
|
| 341 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 342 |
+
task_load_clap_emb=true
|
| 343 |
+
task_load_source_file=true
|
| 344 |
+
task_load_mel_file=false
|
| 345 |
+
model_proj_type=2
|
| 346 |
+
model_clone_batch=4
|
| 347 |
+
dataset_batch_size=96 # original 48 oom on 4090 24G change distributed_world_size
|
| 348 |
+
model_clap_loss=1.0
|
| 349 |
+
average_top_k_layers=12 # modify with model depth
|
| 350 |
+
model_add_conv=true
|
| 351 |
+
model_modalities_image_conv_option=4
|
| 352 |
+
model_depth=12 #
|
| 353 |
+
checkpoint_keep_interval_updates=1 # default 1
|
| 354 |
+
checkpoint_save_interval_updates=10000
|
| 355 |
+
fi
|
| 356 |
+
|
| 357 |
+
# Abort before launching if no (train_mode, config_option) branch matched:
# in that case the per-branch settings are unset and training would start
# with empty task.data / model.proj_type / task.load_clap_emb overrides.
: "${task_data:?no configuration matched train_mode=${train_mode} config_option=${config_option}}"
: "${task_load_clap_emb:?no configuration matched train_mode=${train_mode} config_option=${config_option}}"
: "${model_proj_type:?no configuration matched train_mode=${train_mode} config_option=${config_option}}"

# Launch pre-training via fairseq_cli/hydra_train.py with the values selected
# above passed as Hydra command-line overrides. Every expansion is quoted so a
# value containing whitespace or glob characters stays a single argument.
# The first positional argument of this script sets the distributed world
# size (default 2).
python fairseq_cli/hydra_train.py -m \
    --config-dir ./EAT/config \
    --config-name pretraining_AS2M \
    common.user_dir=./EAT \
    "checkpoint.save_dir=${checkpoint_save_dir}" \
    "checkpoint.restore_file=${checkpoint_restore_file}" \
    "distributed_training.distributed_world_size=${1:-2}" \
    dataset.num_workers=24 \
    dataset.data_buffer_size=48 \
    "dataset.batch_size=${dataset_batch_size}" \
    "task.data=${task_data}" \
    task.h5_format=False \
    "task.load_clap_emb=${task_load_clap_emb}" \
    "+task.load_source_file=${task_load_source_file}" \
    "+task.load_mel_file=${task_load_mel_file}" \
    "model.proj_type=${model_proj_type}" \
    "model.clone_batch=${model_clone_batch}" \
    "model.clap_loss=${model_clap_loss}" \
    "model.average_top_k_layers=${average_top_k_layers}" \
    "+model.add_conv=${model_add_conv}" \
    "+model.clap_loss_type=${model_clap_loss_type}" \
    "+model.clap_loss_layer=${model_clap_loss_layer}" \
    "+model.dispersive_loss=${model_dispersive_loss}" \
    "+model.dispersive_loss_layer=${model_dispersive_loss_layer}" \
    "model.depth=${model_depth}" \
    "+model.modalities.image.conv_option=${model_modalities_image_conv_option}" \
    "checkpoint.keep_interval_updates=${checkpoint_keep_interval_updates}" \
    "checkpoint.save_interval_updates=${checkpoint_save_interval_updates}"
|
pre_4_AS2M/disp_0_2025-09-24_13-58-24/pretraining_AS2M.sh
ADDED
|
@@ -0,0 +1,246 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
# config options
|
| 3 |
+
train_mode=disp
|
| 4 |
+
config_option=0
|
| 5 |
+
# distributed world size is taken from the first positional argument ($1); it defaults to 2
|
| 6 |
+
|
| 7 |
+
# shared config
|
| 8 |
+
SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
|
| 9 |
+
checkpoint_save_dir=${SAVE_DIR_ROOT}/${train_mode}_${config_option}_$(date +"%Y-%m-%d_%H-%M-%S")
|
| 10 |
+
checkpoint_restore_file=${checkpoint_save_dir}/checkpoint_last.pt
|
| 11 |
+
|
| 12 |
+
# Absolute path and file name of this script (symlinks resolved)
|
| 13 |
+
script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
|
| 14 |
+
script_name="$(basename -- "$script_path")"
|
| 15 |
+
# Create the save directory and copy this script into it (preserving permissions and timestamps)
|
| 16 |
+
mkdir -p -- "$checkpoint_save_dir"
|
| 17 |
+
cp -p -- "$script_path" "$checkpoint_save_dir/$script_name"
|
| 18 |
+
echo "script_path: ${script_path}"
|
| 19 |
+
echo "checkpoint_save_dir: ${checkpoint_save_dir}"
|
| 20 |
+
|
| 21 |
+
# default setting
# Baseline values shared by every configuration; the branch selected by
# train_mode/config_option overrides only the settings it needs to change.
model_clone_batch=4
dataset_batch_size=48
model_clap_loss=0
model_clap_loss_type="mse" # option ce cosine l1
model_clap_loss_layer=0
average_top_k_layers=12
model_add_conv=false
model_depth=12
model_dispersive_loss=0
model_dispersive_loss_layer=0
checkpoint_keep_interval_updates=1 # TODO change this parameter if need
checkpoint_save_interval_updates=10000
# Several branches (clap 0-5, ast 0-3) never assign the two flags below, which
# left the +task.load_source_file / +task.load_mel_file overrides empty.
# Default them to the values used by every branch that does set them.
task_load_source_file=true
task_load_mel_file=false
|
| 34 |
+
|
| 35 |
+
if [[ $train_mode == "default" && ${config_option} -eq 0 ]]; then
|
| 36 |
+
echo "Config ${train_mode} ${config_option}"
|
| 37 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 38 |
+
task_load_clap_emb=false
|
| 39 |
+
task_load_source_file=true
|
| 40 |
+
task_load_mel_file=false
|
| 41 |
+
model_proj_type=null
|
| 42 |
+
model_clone_batch=4
|
| 43 |
+
dataset_batch_size=96
|
| 44 |
+
model_clap_loss=0
|
| 45 |
+
checkpoint_keep_interval_updates=-1
|
| 46 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 0 ]]; then
|
| 47 |
+
echo "Config ${train_mode} ${config_option}"
|
| 48 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 49 |
+
task_load_clap_emb=false
|
| 50 |
+
task_load_source_file=true
|
| 51 |
+
task_load_mel_file=false
|
| 52 |
+
model_proj_type=null
|
| 53 |
+
model_clone_batch=4
|
| 54 |
+
dataset_batch_size=96
|
| 55 |
+
model_dispersive_loss=1
|
| 56 |
+
model_dispersive_loss_layer=0
|
| 57 |
+
checkpoint_keep_interval_updates=1
|
| 58 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 0 ]]; then
|
| 59 |
+
echo "Config ${train_mode} ${config_option}"
|
| 60 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 61 |
+
task_load_clap_emb=true
|
| 62 |
+
model_proj_type=2
|
| 63 |
+
model_clone_batch=4
|
| 64 |
+
dataset_batch_size=48
|
| 65 |
+
model_clap_loss=1.0
|
| 66 |
+
average_top_k_layers=12
|
| 67 |
+
model_add_conv=false
|
| 68 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 1 ]]; then
|
| 69 |
+
echo "Config ${train_mode} ${config_option}"
|
| 70 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 71 |
+
task_load_clap_emb=true
|
| 72 |
+
model_proj_type=2
|
| 73 |
+
model_clone_batch=4
|
| 74 |
+
dataset_batch_size=48
|
| 75 |
+
model_clap_loss=1.0
|
| 76 |
+
average_top_k_layers=1
|
| 77 |
+
# loss type ablation
|
| 78 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 2 ]]; then
|
| 79 |
+
echo "Config ${train_mode} ${config_option}"
|
| 80 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 81 |
+
task_load_clap_emb=true
|
| 82 |
+
model_proj_type=2
|
| 83 |
+
model_clone_batch=4
|
| 84 |
+
dataset_batch_size=48
|
| 85 |
+
model_clap_loss=1.0
|
| 86 |
+
average_top_k_layers=12
|
| 87 |
+
model_clap_loss_type="ce"
|
| 88 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 3 ]]; then
|
| 89 |
+
echo "Config ${train_mode} ${config_option}"
|
| 90 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 91 |
+
task_load_clap_emb=true
|
| 92 |
+
model_proj_type=2
|
| 93 |
+
model_clone_batch=4
|
| 94 |
+
dataset_batch_size=48
|
| 95 |
+
model_clap_loss=1.0
|
| 96 |
+
average_top_k_layers=12
|
| 97 |
+
model_clap_loss_type="l1"
|
| 98 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 4 ]]; then
|
| 99 |
+
echo "Config ${train_mode} ${config_option}"
|
| 100 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 101 |
+
task_load_clap_emb=true
|
| 102 |
+
model_proj_type=2
|
| 103 |
+
model_clone_batch=4
|
| 104 |
+
dataset_batch_size=96
|
| 105 |
+
model_clap_loss=1.0
|
| 106 |
+
average_top_k_layers=12
|
| 107 |
+
model_clap_loss_type="cosine"
|
| 108 |
+
# loss layer ablation
|
| 109 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 5 ]]; then
|
| 110 |
+
echo "Config ${train_mode} ${config_option}"
|
| 111 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 112 |
+
task_load_clap_emb=true
|
| 113 |
+
model_proj_type=2
|
| 114 |
+
model_clone_batch=4
|
| 115 |
+
dataset_batch_size=96
|
| 116 |
+
model_clap_loss=1.0
|
| 117 |
+
average_top_k_layers=12
|
| 118 |
+
model_clap_loss_type="mse"
|
| 119 |
+
model_clap_loss_layer=10
|
| 120 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 6 ]]; then
|
| 121 |
+
echo "Config ${train_mode} ${config_option}"
|
| 122 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 123 |
+
task_load_clap_emb=true
|
| 124 |
+
task_load_source_file=true
|
| 125 |
+
task_load_mel_file=false
|
| 126 |
+
model_proj_type=2
|
| 127 |
+
model_clone_batch=4
|
| 128 |
+
dataset_batch_size=96
|
| 129 |
+
model_clap_loss=1.0
|
| 130 |
+
average_top_k_layers=12
|
| 131 |
+
model_clap_loss_type="mse"
|
| 132 |
+
model_clap_loss_layer=8
|
| 133 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 7 ]]; then
|
| 134 |
+
echo "Config ${train_mode} ${config_option}"
|
| 135 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 136 |
+
task_load_clap_emb=true
|
| 137 |
+
task_load_source_file=true
|
| 138 |
+
task_load_mel_file=false
|
| 139 |
+
model_proj_type=2
|
| 140 |
+
model_clone_batch=4
|
| 141 |
+
dataset_batch_size=96
|
| 142 |
+
model_clap_loss=1.0
|
| 143 |
+
average_top_k_layers=12
|
| 144 |
+
model_clap_loss_type="mse"
|
| 145 |
+
model_clap_loss_layer=6
|
| 146 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 8 ]]; then
|
| 147 |
+
echo "Config ${train_mode} ${config_option}"
|
| 148 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 149 |
+
task_load_clap_emb=true
|
| 150 |
+
task_load_source_file=true
|
| 151 |
+
task_load_mel_file=false
|
| 152 |
+
model_proj_type=2
|
| 153 |
+
model_clone_batch=4
|
| 154 |
+
model_clap_loss=5.0
|
| 155 |
+
dataset_batch_size=96
|
| 156 |
+
average_top_k_layers=12
|
| 157 |
+
model_clap_loss_type="mse"
|
| 158 |
+
checkpoint_keep_interval_updates=-1
|
| 159 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 9 ]]; then
|
| 160 |
+
echo "Config ${train_mode} ${config_option}"
|
| 161 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 162 |
+
task_load_clap_emb=true
|
| 163 |
+
task_load_source_file=true
|
| 164 |
+
task_load_mel_file=false
|
| 165 |
+
model_proj_type=2
|
| 166 |
+
model_clone_batch=4
|
| 167 |
+
model_clap_loss=0.1
|
| 168 |
+
dataset_batch_size=96
|
| 169 |
+
average_top_k_layers=12
|
| 170 |
+
model_clap_loss_type="mse"
|
| 171 |
+
checkpoint_keep_interval_updates=-1
|
| 172 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 0 ]]; then
|
| 173 |
+
echo "Config ${train_mode} ${config_option}"
|
| 174 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
|
| 175 |
+
task_load_clap_emb=true
|
| 176 |
+
model_proj_type=4
|
| 177 |
+
model_clone_batch=4
|
| 178 |
+
model_clap_loss=1.0
|
| 179 |
+
dataset_batch_size=48
|
| 180 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 1 ]]; then
|
| 181 |
+
echo "Config ${train_mode} ${config_option}"
|
| 182 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
|
| 183 |
+
task_load_clap_emb=true
|
| 184 |
+
model_proj_type=4
|
| 185 |
+
model_clone_batch=4
|
| 186 |
+
model_clap_loss=0.001
|
| 187 |
+
dataset_batch_size=48
|
| 188 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 2 ]]; then
|
| 189 |
+
echo "Config ${train_mode} ${config_option}"
|
| 190 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
|
| 191 |
+
task_load_clap_emb=true
|
| 192 |
+
model_proj_type=4
|
| 193 |
+
model_clone_batch=4
|
| 194 |
+
model_clap_loss=0.01
|
| 195 |
+
dataset_batch_size=48
|
| 196 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 3 ]]; then
|
| 197 |
+
echo "Config ${train_mode} ${config_option}"
|
| 198 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_out
|
| 199 |
+
task_load_clap_emb=true
|
| 200 |
+
model_proj_type=6
|
| 201 |
+
model_clone_batch=4
|
| 202 |
+
dataset_batch_size=48
|
| 203 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 0 ]]; then
|
| 204 |
+
echo "Config ${train_mode} ${config_option}"
|
| 205 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 206 |
+
task_load_clap_emb=true
|
| 207 |
+
task_load_source_file=true
|
| 208 |
+
task_load_mel_file=false
|
| 209 |
+
model_proj_type=2
|
| 210 |
+
model_clone_batch=4
|
| 211 |
+
dataset_batch_size=64 # original 48 oom on 4090 24G change distributed_world_size
|
| 212 |
+
model_clap_loss=1.0
|
| 213 |
+
average_top_k_layers=11 # modify with model depth
|
| 214 |
+
model_add_conv=true
|
| 215 |
+
model_depth=11 #
|
| 216 |
+
checkpoint_keep_interval_updates=-1 # default 1
|
| 217 |
+
checkpoint_save_interval_updates=10000
|
| 218 |
+
fi
|
| 219 |
+
|
| 220 |
+
# Abort before launching if no (train_mode, config_option) branch matched:
# in that case the per-branch settings are unset and training would start
# with empty task.data / model.proj_type / task.load_clap_emb overrides.
: "${task_data:?no configuration matched train_mode=${train_mode} config_option=${config_option}}"
: "${task_load_clap_emb:?no configuration matched train_mode=${train_mode} config_option=${config_option}}"
: "${model_proj_type:?no configuration matched train_mode=${train_mode} config_option=${config_option}}"

# Launch pre-training via fairseq_cli/hydra_train.py with the values selected
# above passed as Hydra command-line overrides. Every expansion is quoted so a
# value containing whitespace or glob characters stays a single argument.
# The first positional argument of this script sets the distributed world
# size (default 2).
python fairseq_cli/hydra_train.py -m \
    --config-dir ./EAT/config \
    --config-name pretraining_AS2M \
    common.user_dir=./EAT \
    "checkpoint.save_dir=${checkpoint_save_dir}" \
    "checkpoint.restore_file=${checkpoint_restore_file}" \
    "distributed_training.distributed_world_size=${1:-2}" \
    dataset.num_workers=24 \
    dataset.data_buffer_size=48 \
    "dataset.batch_size=${dataset_batch_size}" \
    "task.data=${task_data}" \
    task.h5_format=False \
    "task.load_clap_emb=${task_load_clap_emb}" \
    "+task.load_source_file=${task_load_source_file}" \
    "+task.load_mel_file=${task_load_mel_file}" \
    "model.proj_type=${model_proj_type}" \
    "model.clone_batch=${model_clone_batch}" \
    "model.clap_loss=${model_clap_loss}" \
    "model.average_top_k_layers=${average_top_k_layers}" \
    "+model.add_conv=${model_add_conv}" \
    "+model.clap_loss_type=${model_clap_loss_type}" \
    "+model.clap_loss_layer=${model_clap_loss_layer}" \
    "+model.dispersive_loss=${model_dispersive_loss}" \
    "+model.dispersive_loss_layer=${model_dispersive_loss_layer}" \
    "model.depth=${model_depth}" \
    "checkpoint.keep_interval_updates=${checkpoint_keep_interval_updates}" \
    "checkpoint.save_interval_updates=${checkpoint_save_interval_updates}"
|
pre_4_AS2M/disp_0_2025-09-24_14-09-31/pretraining_AS2M.sh
ADDED
|
@@ -0,0 +1,246 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# Preset selector: this (train_mode, config_option) pair picks one branch of
# the if/elif preset chain further down in this script.
train_mode=disp
config_option=0
# change world size

# Shared config: each invocation gets its own timestamped run directory.
SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
run_stamp="$(date +"%Y-%m-%d_%H-%M-%S")"
checkpoint_save_dir="${SAVE_DIR_ROOT}/${train_mode}_${config_option}_${run_stamp}"
checkpoint_restore_file="${checkpoint_save_dir}/checkpoint_last.pt"

# Absolute path and file name of this script (symlinks resolved).
script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
script_name="${script_path##*/}"

# Create the run directory and archive a copy of this script inside it
# (cp -p keeps permissions and timestamps).
mkdir -p -- "$checkpoint_save_dir"
cp -p -- "$script_path" "${checkpoint_save_dir}/${script_name}"
printf 'script_path: %s\n' "$script_path"
printf 'checkpoint_save_dir: %s\n' "$checkpoint_save_dir"
|
| 20 |
+
|
| 21 |
+
# Default settings shared by all presets; a preset overrides only what differs.
model_clone_batch=4
dataset_batch_size=48
model_clap_loss=0
model_clap_loss_type="mse"  # other options: ce, cosine, l1
model_clap_loss_layer=0
average_top_k_layers=12
model_add_conv=false
model_depth=12
model_dispersive_loss=0
model_dispersive_loss_layer=0
checkpoint_keep_interval_updates=1  # TODO: change this parameter if needed
checkpoint_save_interval_updates=10000
# The launch command always interpolates these two task flags, but several
# presets never assign them, which would yield an empty Hydra override
# ("+task.load_source_file="). Default them to the values used by every preset
# that does set them.
task_load_source_file=true
task_load_mel_file=false
|
| 34 |
+
|
| 35 |
+
# Preset selection: each branch fills in the task/model/checkpoint variables
# for one (train_mode, config_option) pair. Variables a branch does not assign
# keep the values set before this chain.
if [[ $train_mode == "default" && ${config_option} -eq 0 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=0
    checkpoint_keep_interval_updates=-1
elif [[ $train_mode == "disp" && ${config_option} -eq 0 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=4
    dataset_batch_size=96
    model_dispersive_loss=1
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
elif [[ $train_mode == "clap" && ${config_option} -eq 0 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=12
    model_add_conv=false
elif [[ $train_mode == "clap" && ${config_option} -eq 1 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=1
# loss type ablation
elif [[ $train_mode == "clap" && ${config_option} -eq 2 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="ce"
elif [[ $train_mode == "clap" && ${config_option} -eq 3 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="l1"
elif [[ $train_mode == "clap" && ${config_option} -eq 4 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="cosine"
# loss layer ablation
elif [[ $train_mode == "clap" && ${config_option} -eq 5 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="mse"
    model_clap_loss_layer=10
elif [[ $train_mode == "clap" && ${config_option} -eq 6 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="mse"
    model_clap_loss_layer=8
elif [[ $train_mode == "clap" && ${config_option} -eq 7 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="mse"
    model_clap_loss_layer=6
elif [[ $train_mode == "clap" && ${config_option} -eq 8 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    model_clap_loss=5.0
    dataset_batch_size=96
    average_top_k_layers=12
    model_clap_loss_type="mse"
    checkpoint_keep_interval_updates=-1
elif [[ $train_mode == "clap" && ${config_option} -eq 9 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    model_clap_loss=0.1
    dataset_batch_size=96
    average_top_k_layers=12
    model_clap_loss_type="mse"
    checkpoint_keep_interval_updates=-1
elif [[ $train_mode == "ast" && ${config_option} -eq 0 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
    task_load_clap_emb=true
    model_proj_type=4
    model_clone_batch=4
    model_clap_loss=1.0
    dataset_batch_size=48
elif [[ $train_mode == "ast" && ${config_option} -eq 1 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
    task_load_clap_emb=true
    model_proj_type=4
    model_clone_batch=4
    model_clap_loss=0.001
    dataset_batch_size=48
elif [[ $train_mode == "ast" && ${config_option} -eq 2 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
    task_load_clap_emb=true
    model_proj_type=4
    model_clone_batch=4
    model_clap_loss=0.01
    dataset_batch_size=48
elif [[ $train_mode == "ast" && ${config_option} -eq 3 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_out
    task_load_clap_emb=true
    model_proj_type=6
    model_clone_batch=4
    dataset_batch_size=48
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 0 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=64  # original 48 oom on 4090 24G change distributed_world_size
    model_clap_loss=1.0
    average_top_k_layers=11  # modify with model depth
    model_add_conv=true
    model_depth=11
    checkpoint_keep_interval_updates=-1  # default 1
    checkpoint_save_interval_updates=10000
else
    # No preset matched: stop here instead of launching training with
    # task_data, model_proj_type, etc. left unset.
    echo "Unknown configuration: train_mode=${train_mode} config_option=${config_option}" >&2
    exit 1
fi
|
| 219 |
+
|
| 220 |
+
# Launch fairseq pretraining through Hydra using the variables selected above.
#   $1 (optional): distributed world size, defaults to 2.
# Overrides prefixed with '+' add keys that are not present in the base config.
# Every expansion is quoted so a value containing whitespace or glob characters
# is still passed to Hydra as exactly one argument.
python fairseq_cli/hydra_train.py -m \
    --config-dir ./EAT/config \
    --config-name pretraining_AS2M \
    common.user_dir=./EAT \
    checkpoint.save_dir="${checkpoint_save_dir}" \
    checkpoint.restore_file="${checkpoint_restore_file}" \
    distributed_training.distributed_world_size="${1:-2}" \
    dataset.num_workers=24 \
    dataset.data_buffer_size=48 \
    dataset.batch_size="${dataset_batch_size}" \
    task.data="${task_data}" \
    task.h5_format=False \
    task.load_clap_emb="${task_load_clap_emb}" \
    +task.load_source_file="${task_load_source_file}" \
    +task.load_mel_file="${task_load_mel_file}" \
    model.proj_type="${model_proj_type}" \
    model.clone_batch="${model_clone_batch}" \
    model.clap_loss="${model_clap_loss}" \
    model.average_top_k_layers="${average_top_k_layers}" \
    +model.add_conv="${model_add_conv}" \
    +model.clap_loss_type="${model_clap_loss_type}" \
    +model.clap_loss_layer="${model_clap_loss_layer}" \
    +model.dispersive_loss="${model_dispersive_loss}" \
    +model.dispersive_loss_layer="${model_dispersive_loss_layer}" \
    model.depth="${model_depth}" \
    checkpoint.keep_interval_updates="${checkpoint_keep_interval_updates}" \
    checkpoint.save_interval_updates="${checkpoint_save_interval_updates}"
|
pre_4_AS2M/disp_0_2025-09-24_14-12-12/pretraining_AS2M.sh
ADDED
|
@@ -0,0 +1,246 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# Preset selector: this (train_mode, config_option) pair picks one branch of
# the if/elif preset chain further down in this script.
train_mode=disp
config_option=0
# change world size

# Shared config: each invocation gets its own timestamped run directory.
SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
run_stamp="$(date +"%Y-%m-%d_%H-%M-%S")"
checkpoint_save_dir="${SAVE_DIR_ROOT}/${train_mode}_${config_option}_${run_stamp}"
checkpoint_restore_file="${checkpoint_save_dir}/checkpoint_last.pt"

# Absolute path and file name of this script (symlinks resolved).
script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
script_name="${script_path##*/}"

# Create the run directory and archive a copy of this script inside it
# (cp -p keeps permissions and timestamps).
mkdir -p -- "$checkpoint_save_dir"
cp -p -- "$script_path" "${checkpoint_save_dir}/${script_name}"
printf 'script_path: %s\n' "$script_path"
printf 'checkpoint_save_dir: %s\n' "$checkpoint_save_dir"
|
| 20 |
+
|
| 21 |
+
# Default settings shared by all presets; a preset overrides only what differs.
model_clone_batch=4
dataset_batch_size=48
model_clap_loss=0
model_clap_loss_type="mse"  # other options: ce, cosine, l1
model_clap_loss_layer=0
average_top_k_layers=12
model_add_conv=false
model_depth=12
model_dispersive_loss=0
model_dispersive_loss_layer=0
checkpoint_keep_interval_updates=1  # TODO: change this parameter if needed
checkpoint_save_interval_updates=10000
# The launch command always interpolates these two task flags, but several
# presets never assign them, which would yield an empty Hydra override
# ("+task.load_source_file="). Default them to the values used by every preset
# that does set them.
task_load_source_file=true
task_load_mel_file=false
|
| 34 |
+
|
| 35 |
+
# Preset selection: each branch fills in the task/model/checkpoint variables
# for one (train_mode, config_option) pair. Variables a branch does not assign
# keep the values set before this chain.
if [[ $train_mode == "default" && ${config_option} -eq 0 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=0
    checkpoint_keep_interval_updates=-1
elif [[ $train_mode == "disp" && ${config_option} -eq 0 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=4
    dataset_batch_size=96
    model_dispersive_loss=1
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
elif [[ $train_mode == "clap" && ${config_option} -eq 0 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=12
    model_add_conv=false
elif [[ $train_mode == "clap" && ${config_option} -eq 1 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=1
# loss type ablation
elif [[ $train_mode == "clap" && ${config_option} -eq 2 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="ce"
elif [[ $train_mode == "clap" && ${config_option} -eq 3 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="l1"
elif [[ $train_mode == "clap" && ${config_option} -eq 4 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="cosine"
# loss layer ablation
elif [[ $train_mode == "clap" && ${config_option} -eq 5 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="mse"
    model_clap_loss_layer=10
elif [[ $train_mode == "clap" && ${config_option} -eq 6 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="mse"
    model_clap_loss_layer=8
elif [[ $train_mode == "clap" && ${config_option} -eq 7 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="mse"
    model_clap_loss_layer=6
elif [[ $train_mode == "clap" && ${config_option} -eq 8 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    model_clap_loss=5.0
    dataset_batch_size=96
    average_top_k_layers=12
    model_clap_loss_type="mse"
    checkpoint_keep_interval_updates=-1
elif [[ $train_mode == "clap" && ${config_option} -eq 9 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    model_clap_loss=0.1
    dataset_batch_size=96
    average_top_k_layers=12
    model_clap_loss_type="mse"
    checkpoint_keep_interval_updates=-1
elif [[ $train_mode == "ast" && ${config_option} -eq 0 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
    task_load_clap_emb=true
    model_proj_type=4
    model_clone_batch=4
    model_clap_loss=1.0
    dataset_batch_size=48
elif [[ $train_mode == "ast" && ${config_option} -eq 1 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
    task_load_clap_emb=true
    model_proj_type=4
    model_clone_batch=4
    model_clap_loss=0.001
    dataset_batch_size=48
elif [[ $train_mode == "ast" && ${config_option} -eq 2 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
    task_load_clap_emb=true
    model_proj_type=4
    model_clone_batch=4
    model_clap_loss=0.01
    dataset_batch_size=48
elif [[ $train_mode == "ast" && ${config_option} -eq 3 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_out
    task_load_clap_emb=true
    model_proj_type=6
    model_clone_batch=4
    dataset_batch_size=48
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 0 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=64  # original 48 oom on 4090 24G change distributed_world_size
    model_clap_loss=1.0
    average_top_k_layers=11  # modify with model depth
    model_add_conv=true
    model_depth=11
    checkpoint_keep_interval_updates=-1  # default 1
    checkpoint_save_interval_updates=10000
else
    # No preset matched: stop here instead of launching training with
    # task_data, model_proj_type, etc. left unset.
    echo "Unknown configuration: train_mode=${train_mode} config_option=${config_option}" >&2
    exit 1
fi
|
| 219 |
+
|
| 220 |
+
# Launch fairseq pretraining through Hydra using the variables selected above.
#   $1 (optional): distributed world size, defaults to 2.
# Overrides prefixed with '+' add keys that are not present in the base config.
# Every expansion is quoted so a value containing whitespace or glob characters
# is still passed to Hydra as exactly one argument.
python fairseq_cli/hydra_train.py -m \
    --config-dir ./EAT/config \
    --config-name pretraining_AS2M \
    common.user_dir=./EAT \
    checkpoint.save_dir="${checkpoint_save_dir}" \
    checkpoint.restore_file="${checkpoint_restore_file}" \
    distributed_training.distributed_world_size="${1:-2}" \
    dataset.num_workers=24 \
    dataset.data_buffer_size=48 \
    dataset.batch_size="${dataset_batch_size}" \
    task.data="${task_data}" \
    task.h5_format=False \
    task.load_clap_emb="${task_load_clap_emb}" \
    +task.load_source_file="${task_load_source_file}" \
    +task.load_mel_file="${task_load_mel_file}" \
    model.proj_type="${model_proj_type}" \
    model.clone_batch="${model_clone_batch}" \
    model.clap_loss="${model_clap_loss}" \
    model.average_top_k_layers="${average_top_k_layers}" \
    +model.add_conv="${model_add_conv}" \
    +model.clap_loss_type="${model_clap_loss_type}" \
    +model.clap_loss_layer="${model_clap_loss_layer}" \
    +model.dispersive_loss="${model_dispersive_loss}" \
    +model.dispersive_loss_layer="${model_dispersive_loss_layer}" \
    model.depth="${model_depth}" \
    checkpoint.keep_interval_updates="${checkpoint_keep_interval_updates}" \
    checkpoint.save_interval_updates="${checkpoint_save_interval_updates}"
|
pre_4_AS2M/disp_0_2025-09-24_14-17-47/pretraining_AS2M.sh
ADDED
|
@@ -0,0 +1,246 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# Preset selector: this (train_mode, config_option) pair picks one branch of
# the if/elif preset chain further down in this script.
train_mode=disp
config_option=0
# change world size

# Shared config: each invocation gets its own timestamped run directory.
SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
run_stamp="$(date +"%Y-%m-%d_%H-%M-%S")"
checkpoint_save_dir="${SAVE_DIR_ROOT}/${train_mode}_${config_option}_${run_stamp}"
checkpoint_restore_file="${checkpoint_save_dir}/checkpoint_last.pt"

# Absolute path and file name of this script (symlinks resolved).
script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
script_name="${script_path##*/}"

# Create the run directory and archive a copy of this script inside it
# (cp -p keeps permissions and timestamps).
mkdir -p -- "$checkpoint_save_dir"
cp -p -- "$script_path" "${checkpoint_save_dir}/${script_name}"
printf 'script_path: %s\n' "$script_path"
printf 'checkpoint_save_dir: %s\n' "$checkpoint_save_dir"
|
| 20 |
+
|
| 21 |
+
# Default settings shared by all presets; a preset overrides only what differs.
model_clone_batch=4
dataset_batch_size=48
model_clap_loss=0
model_clap_loss_type="mse"  # other options: ce, cosine, l1
model_clap_loss_layer=0
average_top_k_layers=12
model_add_conv=false
model_depth=12
model_dispersive_loss=0
model_dispersive_loss_layer=0
checkpoint_keep_interval_updates=1  # TODO: change this parameter if needed
checkpoint_save_interval_updates=10000
# Several presets never assign these two task flags; if the launch command
# interpolates them unconditionally (as the other archived copies of this
# script do), that yields an empty Hydra override. Default them to the values
# used by every preset that does set them.
task_load_source_file=true
task_load_mel_file=false
|
| 34 |
+
|
| 35 |
+
if [[ $train_mode == "default" && ${config_option} -eq 0 ]]; then
|
| 36 |
+
echo "Config ${train_mode} ${config_option}"
|
| 37 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 38 |
+
task_load_clap_emb=false
|
| 39 |
+
task_load_source_file=true
|
| 40 |
+
task_load_mel_file=false
|
| 41 |
+
model_proj_type=null
|
| 42 |
+
model_clone_batch=4
|
| 43 |
+
dataset_batch_size=96
|
| 44 |
+
model_clap_loss=0
|
| 45 |
+
checkpoint_keep_interval_updates=-1
|
| 46 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 0 ]]; then
|
| 47 |
+
echo "Config ${train_mode} ${config_option}"
|
| 48 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 49 |
+
task_load_clap_emb=false
|
| 50 |
+
task_load_source_file=true
|
| 51 |
+
task_load_mel_file=false
|
| 52 |
+
model_proj_type=null
|
| 53 |
+
model_clone_batch=4
|
| 54 |
+
dataset_batch_size=96
|
| 55 |
+
model_dispersive_loss=1
|
| 56 |
+
model_dispersive_loss_layer=0
|
| 57 |
+
checkpoint_keep_interval_updates=1
|
| 58 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 0 ]]; then
|
| 59 |
+
echo "Config ${train_mode} ${config_option}"
|
| 60 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 61 |
+
task_load_clap_emb=true
|
| 62 |
+
model_proj_type=2
|
| 63 |
+
model_clone_batch=4
|
| 64 |
+
dataset_batch_size=48
|
| 65 |
+
model_clap_loss=1.0
|
| 66 |
+
average_top_k_layers=12
|
| 67 |
+
model_add_conv=false
|
| 68 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 1 ]]; then
|
| 69 |
+
echo "Config ${train_mode} ${config_option}"
|
| 70 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 71 |
+
task_load_clap_emb=true
|
| 72 |
+
model_proj_type=2
|
| 73 |
+
model_clone_batch=4
|
| 74 |
+
dataset_batch_size=48
|
| 75 |
+
model_clap_loss=1.0
|
| 76 |
+
average_top_k_layers=1
|
| 77 |
+
# loss type ablation
|
| 78 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 2 ]]; then
|
| 79 |
+
echo "Config ${train_mode} ${config_option}"
|
| 80 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 81 |
+
task_load_clap_emb=true
|
| 82 |
+
model_proj_type=2
|
| 83 |
+
model_clone_batch=4
|
| 84 |
+
dataset_batch_size=48
|
| 85 |
+
model_clap_loss=1.0
|
| 86 |
+
average_top_k_layers=12
|
| 87 |
+
model_clap_loss_type="ce"
|
| 88 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 3 ]]; then
|
| 89 |
+
echo "Config ${train_mode} ${config_option}"
|
| 90 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 91 |
+
task_load_clap_emb=true
|
| 92 |
+
model_proj_type=2
|
| 93 |
+
model_clone_batch=4
|
| 94 |
+
dataset_batch_size=48
|
| 95 |
+
model_clap_loss=1.0
|
| 96 |
+
average_top_k_layers=12
|
| 97 |
+
model_clap_loss_type="l1"
|
| 98 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 4 ]]; then
|
| 99 |
+
echo "Config ${train_mode} ${config_option}"
|
| 100 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 101 |
+
task_load_clap_emb=true
|
| 102 |
+
model_proj_type=2
|
| 103 |
+
model_clone_batch=4
|
| 104 |
+
dataset_batch_size=96
|
| 105 |
+
model_clap_loss=1.0
|
| 106 |
+
average_top_k_layers=12
|
| 107 |
+
model_clap_loss_type="cosine"
|
| 108 |
+
# loss layer ablation
|
| 109 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 5 ]]; then
|
| 110 |
+
echo "Config ${train_mode} ${config_option}"
|
| 111 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 112 |
+
task_load_clap_emb=true
|
| 113 |
+
model_proj_type=2
|
| 114 |
+
model_clone_batch=4
|
| 115 |
+
dataset_batch_size=96
|
| 116 |
+
model_clap_loss=1.0
|
| 117 |
+
average_top_k_layers=12
|
| 118 |
+
model_clap_loss_type="mse"
|
| 119 |
+
model_clap_loss_layer=10
|
| 120 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 6 ]]; then
|
| 121 |
+
echo "Config ${train_mode} ${config_option}"
|
| 122 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 123 |
+
task_load_clap_emb=true
|
| 124 |
+
task_load_source_file=true
|
| 125 |
+
task_load_mel_file=false
|
| 126 |
+
model_proj_type=2
|
| 127 |
+
model_clone_batch=4
|
| 128 |
+
dataset_batch_size=96
|
| 129 |
+
model_clap_loss=1.0
|
| 130 |
+
average_top_k_layers=12
|
| 131 |
+
model_clap_loss_type="mse"
|
| 132 |
+
model_clap_loss_layer=8
|
| 133 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 7 ]]; then
|
| 134 |
+
echo "Config ${train_mode} ${config_option}"
|
| 135 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 136 |
+
task_load_clap_emb=true
|
| 137 |
+
task_load_source_file=true
|
| 138 |
+
task_load_mel_file=false
|
| 139 |
+
model_proj_type=2
|
| 140 |
+
model_clone_batch=4
|
| 141 |
+
dataset_batch_size=96
|
| 142 |
+
model_clap_loss=1.0
|
| 143 |
+
average_top_k_layers=12
|
| 144 |
+
model_clap_loss_type="mse"
|
| 145 |
+
model_clap_loss_layer=6
|
| 146 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 8 ]]; then
|
| 147 |
+
echo "Config ${train_mode} ${config_option}"
|
| 148 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 149 |
+
task_load_clap_emb=true
|
| 150 |
+
task_load_source_file=true
|
| 151 |
+
task_load_mel_file=false
|
| 152 |
+
model_proj_type=2
|
| 153 |
+
model_clone_batch=4
|
| 154 |
+
model_clap_loss=5.0
|
| 155 |
+
dataset_batch_size=96
|
| 156 |
+
average_top_k_layers=12
|
| 157 |
+
model_clap_loss_type="mse"
|
| 158 |
+
checkpoint_keep_interval_updates=-1
|
| 159 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 9 ]]; then
|
| 160 |
+
echo "Config ${train_mode} ${config_option}"
|
| 161 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 162 |
+
task_load_clap_emb=true
|
| 163 |
+
task_load_source_file=true
|
| 164 |
+
task_load_mel_file=false
|
| 165 |
+
model_proj_type=2
|
| 166 |
+
model_clone_batch=4
|
| 167 |
+
model_clap_loss=0.1
|
| 168 |
+
dataset_batch_size=96
|
| 169 |
+
average_top_k_layers=12
|
| 170 |
+
model_clap_loss_type="mse"
|
| 171 |
+
checkpoint_keep_interval_updates=-1
|
| 172 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 0 ]]; then
|
| 173 |
+
echo "Config ${train_mode} ${config_option}"
|
| 174 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
|
| 175 |
+
task_load_clap_emb=true
|
| 176 |
+
model_proj_type=4
|
| 177 |
+
model_clone_batch=4
|
| 178 |
+
model_clap_loss=1.0
|
| 179 |
+
dataset_batch_size=48
|
| 180 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 1 ]]; then
|
| 181 |
+
echo "Config ${train_mode} ${config_option}"
|
| 182 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
|
| 183 |
+
task_load_clap_emb=true
|
| 184 |
+
model_proj_type=4
|
| 185 |
+
model_clone_batch=4
|
| 186 |
+
model_clap_loss=0.001
|
| 187 |
+
dataset_batch_size=48
|
| 188 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 2 ]]; then
|
| 189 |
+
echo "Config ${train_mode} ${config_option}"
|
| 190 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
|
| 191 |
+
task_load_clap_emb=true
|
| 192 |
+
model_proj_type=4
|
| 193 |
+
model_clone_batch=4
|
| 194 |
+
model_clap_loss=0.01
|
| 195 |
+
dataset_batch_size=48
|
| 196 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 3 ]]; then
|
| 197 |
+
echo "Config ${train_mode} ${config_option}"
|
| 198 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_out
|
| 199 |
+
task_load_clap_emb=true
|
| 200 |
+
model_proj_type=6
|
| 201 |
+
model_clone_batch=4
|
| 202 |
+
dataset_batch_size=48
|
| 203 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 0 ]]; then
|
| 204 |
+
echo "Config ${train_mode} ${config_option}"
|
| 205 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 206 |
+
task_load_clap_emb=true
|
| 207 |
+
task_load_source_file=true
|
| 208 |
+
task_load_mel_file=false
|
| 209 |
+
model_proj_type=2
|
| 210 |
+
model_clone_batch=4
|
| 211 |
+
dataset_batch_size=64 # original 48 oom on 4090 24G change distributed_world_size
|
| 212 |
+
model_clap_loss=1.0
|
| 213 |
+
average_top_k_layers=11 # modify with model depth
|
| 214 |
+
model_add_conv=true
|
| 215 |
+
model_depth=11 #
|
| 216 |
+
checkpoint_keep_interval_updates=-1 # default 1
|
| 217 |
+
checkpoint_save_interval_updates=10000
|
| 218 |
+
fi
|
| 219 |
+
|
| 220 |
+
python fairseq_cli/hydra_train.py -m \
|
| 221 |
+
--config-dir ./EAT/config \
|
| 222 |
+
--config-name pretraining_AS2M \
|
| 223 |
+
common.user_dir=./EAT \
|
| 224 |
+
checkpoint.save_dir=${checkpoint_save_dir} \
|
| 225 |
+
checkpoint.restore_file=${checkpoint_restore_file} \
|
| 226 |
+
distributed_training.distributed_world_size=${1:-2} \
|
| 227 |
+
dataset.num_workers=24 \
|
| 228 |
+
dataset.data_buffer_size=48 \
|
| 229 |
+
dataset.batch_size=${dataset_batch_size} \
|
| 230 |
+
task.data=${task_data} \
|
| 231 |
+
task.h5_format=False \
|
| 232 |
+
task.load_clap_emb=${task_load_clap_emb} \
|
| 233 |
+
+task.load_source_file=${task_load_source_file} \
|
| 234 |
+
+task.load_mel_file=${task_load_mel_file} \
|
| 235 |
+
model.proj_type=${model_proj_type} \
|
| 236 |
+
model.clone_batch=${model_clone_batch} \
|
| 237 |
+
model.clap_loss=${model_clap_loss} \
|
| 238 |
+
model.average_top_k_layers=${average_top_k_layers} \
|
| 239 |
+
+model.add_conv=${model_add_conv} \
|
| 240 |
+
+model.clap_loss_type=${model_clap_loss_type} \
|
| 241 |
+
+model.clap_loss_layer=${model_clap_loss_layer} \
|
| 242 |
+
+model.dispersive_loss=${model_dispersive_loss} \
|
| 243 |
+
+model.dispersive_loss_layer=${model_dispersive_loss_layer} \
|
| 244 |
+
model.depth=${model_depth} \
|
| 245 |
+
checkpoint.keep_interval_updates=${checkpoint_keep_interval_updates} \
|
| 246 |
+
checkpoint.save_interval_updates=${checkpoint_save_interval_updates}
|
pre_4_AS2M/disp_1_2025-09-26_14-32-16/pretraining_AS2M.sh
ADDED
|
@@ -0,0 +1,258 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
# config options
|
| 3 |
+
train_mode=disp
|
| 4 |
+
config_option=1
|
| 5 |
+
# change world size
|
| 6 |
+
|
| 7 |
+
# shared config
|
| 8 |
+
SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
|
| 9 |
+
checkpoint_save_dir=${SAVE_DIR_ROOT}/${train_mode}_${config_option}_$(date +"%Y-%m-%d_%H-%M-%S")
|
| 10 |
+
checkpoint_restore_file=${checkpoint_save_dir}/checkpoint_last.pt
|
| 11 |
+
|
| 12 |
+
# 脚本自身的绝对路径与文件名(解析符号链接)
|
| 13 |
+
script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
|
| 14 |
+
script_name="$(basename -- "$script_path")"
|
| 15 |
+
# 创建目录并拷贝(保留权限与时间戳)
|
| 16 |
+
mkdir -p -- "$checkpoint_save_dir"
|
| 17 |
+
cp -p -- "$script_path" "$checkpoint_save_dir/$script_name"
|
| 18 |
+
echo "script_path: ${script_path}"
|
| 19 |
+
echo "checkpoint_save_dir: ${checkpoint_save_dir}"
|
| 20 |
+
|
| 21 |
+
# default setting
|
| 22 |
+
model_clone_batch=4
|
| 23 |
+
dataset_batch_size=48
|
| 24 |
+
model_clap_loss=0
|
| 25 |
+
model_clap_loss_type="mse" # option ce cosine l1
|
| 26 |
+
model_clap_loss_layer=0
|
| 27 |
+
average_top_k_layers=12
|
| 28 |
+
model_add_conv=false
|
| 29 |
+
model_depth=12
|
| 30 |
+
model_dispersive_loss=0
|
| 31 |
+
model_dispersive_loss_layer=0
|
| 32 |
+
checkpoint_keep_interval_updates=1 # TODO change this parameter if need
|
| 33 |
+
checkpoint_save_interval_updates=10000
|
| 34 |
+
|
| 35 |
+
if [[ $train_mode == "default" && ${config_option} -eq 0 ]]; then
|
| 36 |
+
echo "Config ${train_mode} ${config_option}"
|
| 37 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 38 |
+
task_load_clap_emb=false
|
| 39 |
+
task_load_source_file=true
|
| 40 |
+
task_load_mel_file=false
|
| 41 |
+
model_proj_type=null
|
| 42 |
+
model_clone_batch=4
|
| 43 |
+
dataset_batch_size=96
|
| 44 |
+
model_clap_loss=0
|
| 45 |
+
checkpoint_keep_interval_updates=-1
|
| 46 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 0 ]]; then
|
| 47 |
+
echo "Config ${train_mode} ${config_option}"
|
| 48 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 49 |
+
task_load_clap_emb=false
|
| 50 |
+
task_load_source_file=true
|
| 51 |
+
task_load_mel_file=false
|
| 52 |
+
model_proj_type=null
|
| 53 |
+
model_clone_batch=1
|
| 54 |
+
dataset_batch_size=384
|
| 55 |
+
model_dispersive_loss=1
|
| 56 |
+
model_dispersive_loss_layer=0
|
| 57 |
+
checkpoint_keep_interval_updates=1
|
| 58 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 1 ]]; then
|
| 59 |
+
echo "Config ${train_mode} ${config_option}"
|
| 60 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 61 |
+
task_load_clap_emb=false
|
| 62 |
+
task_load_source_file=true
|
| 63 |
+
task_load_mel_file=false
|
| 64 |
+
model_proj_type=null
|
| 65 |
+
model_clone_batch=4
|
| 66 |
+
dataset_batch_size=96
|
| 67 |
+
model_dispersive_loss=1
|
| 68 |
+
model_dispersive_loss_layer=0
|
| 69 |
+
checkpoint_keep_interval_updates=1
|
| 70 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 0 ]]; then
|
| 71 |
+
echo "Config ${train_mode} ${config_option}"
|
| 72 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 73 |
+
task_load_clap_emb=true
|
| 74 |
+
model_proj_type=2
|
| 75 |
+
model_clone_batch=4
|
| 76 |
+
dataset_batch_size=48
|
| 77 |
+
model_clap_loss=1.0
|
| 78 |
+
average_top_k_layers=12
|
| 79 |
+
model_add_conv=false
|
| 80 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 1 ]]; then
|
| 81 |
+
echo "Config ${train_mode} ${config_option}"
|
| 82 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 83 |
+
task_load_clap_emb=true
|
| 84 |
+
model_proj_type=2
|
| 85 |
+
model_clone_batch=4
|
| 86 |
+
dataset_batch_size=48
|
| 87 |
+
model_clap_loss=1.0
|
| 88 |
+
average_top_k_layers=1
|
| 89 |
+
# loss type ablation
|
| 90 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 2 ]]; then
|
| 91 |
+
echo "Config ${train_mode} ${config_option}"
|
| 92 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 93 |
+
task_load_clap_emb=true
|
| 94 |
+
model_proj_type=2
|
| 95 |
+
model_clone_batch=4
|
| 96 |
+
dataset_batch_size=48
|
| 97 |
+
model_clap_loss=1.0
|
| 98 |
+
average_top_k_layers=12
|
| 99 |
+
model_clap_loss_type="ce"
|
| 100 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 3 ]]; then
|
| 101 |
+
echo "Config ${train_mode} ${config_option}"
|
| 102 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 103 |
+
task_load_clap_emb=true
|
| 104 |
+
model_proj_type=2
|
| 105 |
+
model_clone_batch=4
|
| 106 |
+
dataset_batch_size=48
|
| 107 |
+
model_clap_loss=1.0
|
| 108 |
+
average_top_k_layers=12
|
| 109 |
+
model_clap_loss_type="l1"
|
| 110 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 4 ]]; then
|
| 111 |
+
echo "Config ${train_mode} ${config_option}"
|
| 112 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 113 |
+
task_load_clap_emb=true
|
| 114 |
+
model_proj_type=2
|
| 115 |
+
model_clone_batch=4
|
| 116 |
+
dataset_batch_size=96
|
| 117 |
+
model_clap_loss=1.0
|
| 118 |
+
average_top_k_layers=12
|
| 119 |
+
model_clap_loss_type="cosine"
|
| 120 |
+
# loss layer ablation
|
| 121 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 5 ]]; then
|
| 122 |
+
echo "Config ${train_mode} ${config_option}"
|
| 123 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 124 |
+
task_load_clap_emb=true
|
| 125 |
+
model_proj_type=2
|
| 126 |
+
model_clone_batch=4
|
| 127 |
+
dataset_batch_size=96
|
| 128 |
+
model_clap_loss=1.0
|
| 129 |
+
average_top_k_layers=12
|
| 130 |
+
model_clap_loss_type="mse"
|
| 131 |
+
model_clap_loss_layer=10
|
| 132 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 6 ]]; then
|
| 133 |
+
echo "Config ${train_mode} ${config_option}"
|
| 134 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 135 |
+
task_load_clap_emb=true
|
| 136 |
+
task_load_source_file=true
|
| 137 |
+
task_load_mel_file=false
|
| 138 |
+
model_proj_type=2
|
| 139 |
+
model_clone_batch=4
|
| 140 |
+
dataset_batch_size=96
|
| 141 |
+
model_clap_loss=1.0
|
| 142 |
+
average_top_k_layers=12
|
| 143 |
+
model_clap_loss_type="mse"
|
| 144 |
+
model_clap_loss_layer=8
|
| 145 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 7 ]]; then
|
| 146 |
+
echo "Config ${train_mode} ${config_option}"
|
| 147 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 148 |
+
task_load_clap_emb=true
|
| 149 |
+
task_load_source_file=true
|
| 150 |
+
task_load_mel_file=false
|
| 151 |
+
model_proj_type=2
|
| 152 |
+
model_clone_batch=4
|
| 153 |
+
dataset_batch_size=96
|
| 154 |
+
model_clap_loss=1.0
|
| 155 |
+
average_top_k_layers=12
|
| 156 |
+
model_clap_loss_type="mse"
|
| 157 |
+
model_clap_loss_layer=6
|
| 158 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 8 ]]; then
|
| 159 |
+
echo "Config ${train_mode} ${config_option}"
|
| 160 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 161 |
+
task_load_clap_emb=true
|
| 162 |
+
task_load_source_file=true
|
| 163 |
+
task_load_mel_file=false
|
| 164 |
+
model_proj_type=2
|
| 165 |
+
model_clone_batch=4
|
| 166 |
+
model_clap_loss=5.0
|
| 167 |
+
dataset_batch_size=96
|
| 168 |
+
average_top_k_layers=12
|
| 169 |
+
model_clap_loss_type="mse"
|
| 170 |
+
checkpoint_keep_interval_updates=-1
|
| 171 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 9 ]]; then
|
| 172 |
+
echo "Config ${train_mode} ${config_option}"
|
| 173 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 174 |
+
task_load_clap_emb=true
|
| 175 |
+
task_load_source_file=true
|
| 176 |
+
task_load_mel_file=false
|
| 177 |
+
model_proj_type=2
|
| 178 |
+
model_clone_batch=4
|
| 179 |
+
model_clap_loss=0.1
|
| 180 |
+
dataset_batch_size=96
|
| 181 |
+
average_top_k_layers=12
|
| 182 |
+
model_clap_loss_type="mse"
|
| 183 |
+
checkpoint_keep_interval_updates=-1
|
| 184 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 0 ]]; then
|
| 185 |
+
echo "Config ${train_mode} ${config_option}"
|
| 186 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
|
| 187 |
+
task_load_clap_emb=true
|
| 188 |
+
model_proj_type=4
|
| 189 |
+
model_clone_batch=4
|
| 190 |
+
model_clap_loss=1.0
|
| 191 |
+
dataset_batch_size=48
|
| 192 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 1 ]]; then
|
| 193 |
+
echo "Config ${train_mode} ${config_option}"
|
| 194 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
|
| 195 |
+
task_load_clap_emb=true
|
| 196 |
+
model_proj_type=4
|
| 197 |
+
model_clone_batch=4
|
| 198 |
+
model_clap_loss=0.001
|
| 199 |
+
dataset_batch_size=48
|
| 200 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 2 ]]; then
|
| 201 |
+
echo "Config ${train_mode} ${config_option}"
|
| 202 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
|
| 203 |
+
task_load_clap_emb=true
|
| 204 |
+
model_proj_type=4
|
| 205 |
+
model_clone_batch=4
|
| 206 |
+
model_clap_loss=0.01
|
| 207 |
+
dataset_batch_size=48
|
| 208 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 3 ]]; then
|
| 209 |
+
echo "Config ${train_mode} ${config_option}"
|
| 210 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_out
|
| 211 |
+
task_load_clap_emb=true
|
| 212 |
+
model_proj_type=6
|
| 213 |
+
model_clone_batch=4
|
| 214 |
+
dataset_batch_size=48
|
| 215 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 0 ]]; then
|
| 216 |
+
echo "Config ${train_mode} ${config_option}"
|
| 217 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 218 |
+
task_load_clap_emb=true
|
| 219 |
+
task_load_source_file=true
|
| 220 |
+
task_load_mel_file=false
|
| 221 |
+
model_proj_type=2
|
| 222 |
+
model_clone_batch=4
|
| 223 |
+
dataset_batch_size=64 # original 48 oom on 4090 24G change distributed_world_size
|
| 224 |
+
model_clap_loss=1.0
|
| 225 |
+
average_top_k_layers=11 # modify with model depth
|
| 226 |
+
model_add_conv=true
|
| 227 |
+
model_depth=11 #
|
| 228 |
+
checkpoint_keep_interval_updates=-1 # default 1
|
| 229 |
+
checkpoint_save_interval_updates=10000
|
| 230 |
+
fi
|
| 231 |
+
|
| 232 |
+
python fairseq_cli/hydra_train.py -m \
|
| 233 |
+
--config-dir ./EAT/config \
|
| 234 |
+
--config-name pretraining_AS2M \
|
| 235 |
+
common.user_dir=./EAT \
|
| 236 |
+
checkpoint.save_dir=${checkpoint_save_dir} \
|
| 237 |
+
checkpoint.restore_file=${checkpoint_restore_file} \
|
| 238 |
+
distributed_training.distributed_world_size=${1:-2} \
|
| 239 |
+
dataset.num_workers=24 \
|
| 240 |
+
dataset.data_buffer_size=48 \
|
| 241 |
+
dataset.batch_size=${dataset_batch_size} \
|
| 242 |
+
task.data=${task_data} \
|
| 243 |
+
task.h5_format=False \
|
| 244 |
+
task.load_clap_emb=${task_load_clap_emb} \
|
| 245 |
+
+task.load_source_file=${task_load_source_file} \
|
| 246 |
+
+task.load_mel_file=${task_load_mel_file} \
|
| 247 |
+
model.proj_type=${model_proj_type} \
|
| 248 |
+
model.clone_batch=${model_clone_batch} \
|
| 249 |
+
model.clap_loss=${model_clap_loss} \
|
| 250 |
+
model.average_top_k_layers=${average_top_k_layers} \
|
| 251 |
+
+model.add_conv=${model_add_conv} \
|
| 252 |
+
+model.clap_loss_type=${model_clap_loss_type} \
|
| 253 |
+
+model.clap_loss_layer=${model_clap_loss_layer} \
|
| 254 |
+
+model.dispersive_loss=${model_dispersive_loss} \
|
| 255 |
+
+model.dispersive_loss_layer=${model_dispersive_loss_layer} \
|
| 256 |
+
model.depth=${model_depth} \
|
| 257 |
+
checkpoint.keep_interval_updates=${checkpoint_keep_interval_updates} \
|
| 258 |
+
checkpoint.save_interval_updates=${checkpoint_save_interval_updates}
|
pre_4_AS2M/disp_1_2025-09-26_14-33-34/pretraining_AS2M.sh
ADDED
|
@@ -0,0 +1,258 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
# config options
|
| 3 |
+
train_mode=disp
|
| 4 |
+
config_option=1
|
| 5 |
+
# change world size
|
| 6 |
+
|
| 7 |
+
# shared config
|
| 8 |
+
SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
|
| 9 |
+
checkpoint_save_dir=${SAVE_DIR_ROOT}/${train_mode}_${config_option}_$(date +"%Y-%m-%d_%H-%M-%S")
|
| 10 |
+
checkpoint_restore_file=${checkpoint_save_dir}/checkpoint_last.pt
|
| 11 |
+
|
| 12 |
+
# 脚本自身的绝对路径与文件名(解析符号链接)
|
| 13 |
+
script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
|
| 14 |
+
script_name="$(basename -- "$script_path")"
|
| 15 |
+
# 创建目录并拷贝(保留权限与时间戳)
|
| 16 |
+
mkdir -p -- "$checkpoint_save_dir"
|
| 17 |
+
cp -p -- "$script_path" "$checkpoint_save_dir/$script_name"
|
| 18 |
+
echo "script_path: ${script_path}"
|
| 19 |
+
echo "checkpoint_save_dir: ${checkpoint_save_dir}"
|
| 20 |
+
|
| 21 |
+
# default setting
|
| 22 |
+
model_clone_batch=4
|
| 23 |
+
dataset_batch_size=48
|
| 24 |
+
model_clap_loss=0
|
| 25 |
+
model_clap_loss_type="mse" # option ce cosine l1
|
| 26 |
+
model_clap_loss_layer=0
|
| 27 |
+
average_top_k_layers=12
|
| 28 |
+
model_add_conv=false
|
| 29 |
+
model_depth=12
|
| 30 |
+
model_dispersive_loss=0
|
| 31 |
+
model_dispersive_loss_layer=0
|
| 32 |
+
checkpoint_keep_interval_updates=1 # TODO change this parameter if need
|
| 33 |
+
checkpoint_save_interval_updates=10000
|
| 34 |
+
|
| 35 |
+
if [[ $train_mode == "default" && ${config_option} -eq 0 ]]; then
|
| 36 |
+
echo "Config ${train_mode} ${config_option}"
|
| 37 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 38 |
+
task_load_clap_emb=false
|
| 39 |
+
task_load_source_file=true
|
| 40 |
+
task_load_mel_file=false
|
| 41 |
+
model_proj_type=null
|
| 42 |
+
model_clone_batch=4
|
| 43 |
+
dataset_batch_size=96
|
| 44 |
+
model_clap_loss=0
|
| 45 |
+
checkpoint_keep_interval_updates=-1
|
| 46 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 0 ]]; then
|
| 47 |
+
echo "Config ${train_mode} ${config_option}"
|
| 48 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 49 |
+
task_load_clap_emb=false
|
| 50 |
+
task_load_source_file=true
|
| 51 |
+
task_load_mel_file=false
|
| 52 |
+
model_proj_type=null
|
| 53 |
+
model_clone_batch=1
|
| 54 |
+
dataset_batch_size=384
|
| 55 |
+
model_dispersive_loss=1
|
| 56 |
+
model_dispersive_loss_layer=0
|
| 57 |
+
checkpoint_keep_interval_updates=1
|
| 58 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 1 ]]; then
|
| 59 |
+
echo "Config ${train_mode} ${config_option}"
|
| 60 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 61 |
+
task_load_clap_emb=false
|
| 62 |
+
task_load_source_file=true
|
| 63 |
+
task_load_mel_file=false
|
| 64 |
+
model_proj_type=null
|
| 65 |
+
model_clone_batch=4
|
| 66 |
+
dataset_batch_size=96
|
| 67 |
+
model_dispersive_loss=1
|
| 68 |
+
model_dispersive_loss_layer=0
|
| 69 |
+
checkpoint_keep_interval_updates=1
|
| 70 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 0 ]]; then
|
| 71 |
+
echo "Config ${train_mode} ${config_option}"
|
| 72 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 73 |
+
task_load_clap_emb=true
|
| 74 |
+
model_proj_type=2
|
| 75 |
+
model_clone_batch=4
|
| 76 |
+
dataset_batch_size=48
|
| 77 |
+
model_clap_loss=1.0
|
| 78 |
+
average_top_k_layers=12
|
| 79 |
+
model_add_conv=false
|
| 80 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 1 ]]; then
|
| 81 |
+
echo "Config ${train_mode} ${config_option}"
|
| 82 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 83 |
+
task_load_clap_emb=true
|
| 84 |
+
model_proj_type=2
|
| 85 |
+
model_clone_batch=4
|
| 86 |
+
dataset_batch_size=48
|
| 87 |
+
model_clap_loss=1.0
|
| 88 |
+
average_top_k_layers=1
|
| 89 |
+
# loss type ablation
|
| 90 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 2 ]]; then
|
| 91 |
+
echo "Config ${train_mode} ${config_option}"
|
| 92 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 93 |
+
task_load_clap_emb=true
|
| 94 |
+
model_proj_type=2
|
| 95 |
+
model_clone_batch=4
|
| 96 |
+
dataset_batch_size=48
|
| 97 |
+
model_clap_loss=1.0
|
| 98 |
+
average_top_k_layers=12
|
| 99 |
+
model_clap_loss_type="ce"
|
| 100 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 3 ]]; then
|
| 101 |
+
echo "Config ${train_mode} ${config_option}"
|
| 102 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 103 |
+
task_load_clap_emb=true
|
| 104 |
+
model_proj_type=2
|
| 105 |
+
model_clone_batch=4
|
| 106 |
+
dataset_batch_size=48
|
| 107 |
+
model_clap_loss=1.0
|
| 108 |
+
average_top_k_layers=12
|
| 109 |
+
model_clap_loss_type="l1"
|
| 110 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 4 ]]; then
|
| 111 |
+
echo "Config ${train_mode} ${config_option}"
|
| 112 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 113 |
+
task_load_clap_emb=true
|
| 114 |
+
model_proj_type=2
|
| 115 |
+
model_clone_batch=4
|
| 116 |
+
dataset_batch_size=96
|
| 117 |
+
model_clap_loss=1.0
|
| 118 |
+
average_top_k_layers=12
|
| 119 |
+
model_clap_loss_type="cosine"
|
| 120 |
+
# loss layer ablation
|
| 121 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 5 ]]; then
|
| 122 |
+
echo "Config ${train_mode} ${config_option}"
|
| 123 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 124 |
+
task_load_clap_emb=true
|
| 125 |
+
model_proj_type=2
|
| 126 |
+
model_clone_batch=4
|
| 127 |
+
dataset_batch_size=96
|
| 128 |
+
model_clap_loss=1.0
|
| 129 |
+
average_top_k_layers=12
|
| 130 |
+
model_clap_loss_type="mse"
|
| 131 |
+
model_clap_loss_layer=10
|
| 132 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 6 ]]; then
|
| 133 |
+
echo "Config ${train_mode} ${config_option}"
|
| 134 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 135 |
+
task_load_clap_emb=true
|
| 136 |
+
task_load_source_file=true
|
| 137 |
+
task_load_mel_file=false
|
| 138 |
+
model_proj_type=2
|
| 139 |
+
model_clone_batch=4
|
| 140 |
+
dataset_batch_size=96
|
| 141 |
+
model_clap_loss=1.0
|
| 142 |
+
average_top_k_layers=12
|
| 143 |
+
model_clap_loss_type="mse"
|
| 144 |
+
model_clap_loss_layer=8
|
| 145 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 7 ]]; then
|
| 146 |
+
echo "Config ${train_mode} ${config_option}"
|
| 147 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 148 |
+
task_load_clap_emb=true
|
| 149 |
+
task_load_source_file=true
|
| 150 |
+
task_load_mel_file=false
|
| 151 |
+
model_proj_type=2
|
| 152 |
+
model_clone_batch=4
|
| 153 |
+
dataset_batch_size=96
|
| 154 |
+
model_clap_loss=1.0
|
| 155 |
+
average_top_k_layers=12
|
| 156 |
+
model_clap_loss_type="mse"
|
| 157 |
+
model_clap_loss_layer=6
|
| 158 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 8 ]]; then
|
| 159 |
+
echo "Config ${train_mode} ${config_option}"
|
| 160 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 161 |
+
task_load_clap_emb=true
|
| 162 |
+
task_load_source_file=true
|
| 163 |
+
task_load_mel_file=false
|
| 164 |
+
model_proj_type=2
|
| 165 |
+
model_clone_batch=4
|
| 166 |
+
model_clap_loss=5.0
|
| 167 |
+
dataset_batch_size=96
|
| 168 |
+
average_top_k_layers=12
|
| 169 |
+
model_clap_loss_type="mse"
|
| 170 |
+
checkpoint_keep_interval_updates=-1
|
| 171 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 9 ]]; then
|
| 172 |
+
echo "Config ${train_mode} ${config_option}"
|
| 173 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 174 |
+
task_load_clap_emb=true
|
| 175 |
+
task_load_source_file=true
|
| 176 |
+
task_load_mel_file=false
|
| 177 |
+
model_proj_type=2
|
| 178 |
+
model_clone_batch=4
|
| 179 |
+
model_clap_loss=0.1
|
| 180 |
+
dataset_batch_size=96
|
| 181 |
+
average_top_k_layers=12
|
| 182 |
+
model_clap_loss_type="mse"
|
| 183 |
+
checkpoint_keep_interval_updates=-1
|
| 184 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 0 ]]; then
|
| 185 |
+
echo "Config ${train_mode} ${config_option}"
|
| 186 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
|
| 187 |
+
task_load_clap_emb=true
|
| 188 |
+
model_proj_type=4
|
| 189 |
+
model_clone_batch=4
|
| 190 |
+
model_clap_loss=1.0
|
| 191 |
+
dataset_batch_size=48
|
| 192 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 1 ]]; then
|
| 193 |
+
echo "Config ${train_mode} ${config_option}"
|
| 194 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
|
| 195 |
+
task_load_clap_emb=true
|
| 196 |
+
model_proj_type=4
|
| 197 |
+
model_clone_batch=4
|
| 198 |
+
model_clap_loss=0.001
|
| 199 |
+
dataset_batch_size=48
|
| 200 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 2 ]]; then
|
| 201 |
+
echo "Config ${train_mode} ${config_option}"
|
| 202 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
|
| 203 |
+
task_load_clap_emb=true
|
| 204 |
+
model_proj_type=4
|
| 205 |
+
model_clone_batch=4
|
| 206 |
+
model_clap_loss=0.01
|
| 207 |
+
dataset_batch_size=48
|
| 208 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 3 ]]; then
|
| 209 |
+
echo "Config ${train_mode} ${config_option}"
|
| 210 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_out
|
| 211 |
+
task_load_clap_emb=true
|
| 212 |
+
model_proj_type=6
|
| 213 |
+
model_clone_batch=4
|
| 214 |
+
dataset_batch_size=48
|
| 215 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 0 ]]; then
|
| 216 |
+
echo "Config ${train_mode} ${config_option}"
|
| 217 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 218 |
+
task_load_clap_emb=true
|
| 219 |
+
task_load_source_file=true
|
| 220 |
+
task_load_mel_file=false
|
| 221 |
+
model_proj_type=2
|
| 222 |
+
model_clone_batch=4
|
| 223 |
+
dataset_batch_size=64 # original 48 oom on 4090 24G change distributed_world_size
|
| 224 |
+
model_clap_loss=1.0
|
| 225 |
+
average_top_k_layers=11 # modify with model depth
|
| 226 |
+
model_add_conv=true
|
| 227 |
+
model_depth=11 #
|
| 228 |
+
checkpoint_keep_interval_updates=-1 # default 1
|
| 229 |
+
checkpoint_save_interval_updates=10000
|
| 230 |
+
fi
|
| 231 |
+
|
| 232 |
+
python fairseq_cli/hydra_train.py -m \
|
| 233 |
+
--config-dir ./EAT/config \
|
| 234 |
+
--config-name pretraining_AS2M \
|
| 235 |
+
common.user_dir=./EAT \
|
| 236 |
+
checkpoint.save_dir=${checkpoint_save_dir} \
|
| 237 |
+
checkpoint.restore_file=${checkpoint_restore_file} \
|
| 238 |
+
distributed_training.distributed_world_size=${1:-2} \
|
| 239 |
+
dataset.num_workers=24 \
|
| 240 |
+
dataset.data_buffer_size=48 \
|
| 241 |
+
dataset.batch_size=${dataset_batch_size} \
|
| 242 |
+
task.data=${task_data} \
|
| 243 |
+
task.h5_format=False \
|
| 244 |
+
task.load_clap_emb=${task_load_clap_emb} \
|
| 245 |
+
+task.load_source_file=${task_load_source_file} \
|
| 246 |
+
+task.load_mel_file=${task_load_mel_file} \
|
| 247 |
+
model.proj_type=${model_proj_type} \
|
| 248 |
+
model.clone_batch=${model_clone_batch} \
|
| 249 |
+
model.clap_loss=${model_clap_loss} \
|
| 250 |
+
model.average_top_k_layers=${average_top_k_layers} \
|
| 251 |
+
+model.add_conv=${model_add_conv} \
|
| 252 |
+
+model.clap_loss_type=${model_clap_loss_type} \
|
| 253 |
+
+model.clap_loss_layer=${model_clap_loss_layer} \
|
| 254 |
+
+model.dispersive_loss=${model_dispersive_loss} \
|
| 255 |
+
+model.dispersive_loss_layer=${model_dispersive_loss_layer} \
|
| 256 |
+
model.depth=${model_depth} \
|
| 257 |
+
checkpoint.keep_interval_updates=${checkpoint_keep_interval_updates} \
|
| 258 |
+
checkpoint.save_interval_updates=${checkpoint_save_interval_updates}
|
pre_4_AS2M/disp_1_2025-09-26_14-34-35/pretraining_AS2M.sh
ADDED
|
@@ -0,0 +1,258 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# Launcher for EAT pre-training on AS2M.
#
# The two values below select which experiment configuration is applied by the
# if/elif chain further down in this script.
# config options
train_mode=disp
config_option=1
# change world size: pass it as the first positional argument of this script

# shared config
SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
# Each launch gets its own run directory, named <mode>_<option>_<launch timestamp>.
checkpoint_save_dir=${SAVE_DIR_ROOT}/${train_mode}_${config_option}_$(date +"%Y-%m-%d_%H-%M-%S")
# NOTE(review): the restore path lives inside the freshly timestamped run
# directory, so it normally does not exist yet on launch - confirm that
# resuming an earlier run is not expected from this script.
checkpoint_restore_file=${checkpoint_save_dir}/checkpoint_last.pt

# Absolute path and file name of this script (symbolic links resolved).
script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
script_name="$(basename -- "$script_path")"
# Create the run directory and copy this script into it (preserving mode and
# timestamps) so the exact settings of the run are archived with its checkpoints.
if ! mkdir -p -- "$checkpoint_save_dir"; then
    # Without the run directory neither the snapshot nor checkpoints can be written.
    echo "Failed to create checkpoint_save_dir: ${checkpoint_save_dir}" >&2
    exit 1
fi
cp -p -- "$script_path" "$checkpoint_save_dir/$script_name"
echo "script_path: ${script_path}"
echo "checkpoint_save_dir: ${checkpoint_save_dir}"
|
| 20 |
+
|
| 21 |
+
# Baseline values; the configuration branch selected below may override any of them.

# -- batching --
dataset_batch_size=48
model_clone_batch=4

# -- model size --
model_depth=12
average_top_k_layers=12
model_add_conv=false

# -- auxiliary CLAP loss (other loss types used by the branches: ce, cosine, l1) --
model_clap_loss=0
model_clap_loss_type=mse
model_clap_loss_layer=0

# -- dispersive loss --
model_dispersive_loss=0
model_dispersive_loss_layer=0

# -- checkpointing --
# TODO: adjust checkpoint_keep_interval_updates if needed
checkpoint_save_interval_updates=10000
checkpoint_keep_interval_updates=1
|
| 34 |
+
|
| 35 |
+
# Apply the experiment configuration selected by train_mode / config_option.
# Each branch overrides a subset of the defaults assigned above and sets the
# dataset-specific variables (task_data, task_load_clap_emb, model_proj_type).
#
# NOTE(review): the "clap" 0-5 and "ast" 0-3 branches never assign
# task_load_source_file / task_load_mel_file, although both are forwarded to
# the trainer - confirm that leaving them empty is intended.
#
# An unknown combination aborts the script instead of silently launching a run
# with task_data and the other per-branch variables unset.
if [[ $train_mode == "default" && ${config_option} -eq 0 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=0
    checkpoint_keep_interval_updates=-1
elif [[ $train_mode == "disp" && ${config_option} -eq 0 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=1
    dataset_batch_size=384
    model_dispersive_loss=1
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
elif [[ $train_mode == "disp" && ${config_option} -eq 1 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=4
    dataset_batch_size=96
    model_dispersive_loss=1
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
elif [[ $train_mode == "clap" && ${config_option} -eq 0 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=12
    model_add_conv=false
elif [[ $train_mode == "clap" && ${config_option} -eq 1 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=1
# loss type ablation
elif [[ $train_mode == "clap" && ${config_option} -eq 2 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="ce"
elif [[ $train_mode == "clap" && ${config_option} -eq 3 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="l1"
elif [[ $train_mode == "clap" && ${config_option} -eq 4 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="cosine"
# loss layer ablation
elif [[ $train_mode == "clap" && ${config_option} -eq 5 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="mse"
    model_clap_loss_layer=10
elif [[ $train_mode == "clap" && ${config_option} -eq 6 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="mse"
    model_clap_loss_layer=8
elif [[ $train_mode == "clap" && ${config_option} -eq 7 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="mse"
    model_clap_loss_layer=6
# CLAP loss weight variants (5.0 and 0.1)
elif [[ $train_mode == "clap" && ${config_option} -eq 8 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    model_clap_loss=5.0
    dataset_batch_size=96
    average_top_k_layers=12
    model_clap_loss_type="mse"
    checkpoint_keep_interval_updates=-1
elif [[ $train_mode == "clap" && ${config_option} -eq 9 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    model_clap_loss=0.1
    dataset_batch_size=96
    average_top_k_layers=12
    model_clap_loss_type="mse"
    checkpoint_keep_interval_updates=-1
elif [[ $train_mode == "ast" && ${config_option} -eq 0 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
    task_load_clap_emb=true
    model_proj_type=4
    model_clone_batch=4
    model_clap_loss=1.0
    dataset_batch_size=48
elif [[ $train_mode == "ast" && ${config_option} -eq 1 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
    task_load_clap_emb=true
    model_proj_type=4
    model_clone_batch=4
    model_clap_loss=0.001
    dataset_batch_size=48
elif [[ $train_mode == "ast" && ${config_option} -eq 2 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
    task_load_clap_emb=true
    model_proj_type=4
    model_clone_batch=4
    model_clap_loss=0.01
    dataset_batch_size=48
elif [[ $train_mode == "ast" && ${config_option} -eq 3 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_out
    task_load_clap_emb=true
    model_proj_type=6
    model_clone_batch=4
    # NOTE(review): model_clap_loss is not set here, so it keeps whatever value
    # was assigned before this chain - confirm this is intended.
    dataset_batch_size=48
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 0 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=64 # original 48 oom on 4090 24G change distributed_world_size
    model_clap_loss=1.0
    average_top_k_layers=11 # modify with model depth
    model_add_conv=true
    model_depth=11 #
    checkpoint_keep_interval_updates=-1 # default 1
    checkpoint_save_interval_updates=10000
else
    # No branch matched: stop here rather than start training with unset variables.
    echo "Unknown configuration: train_mode=${train_mode} config_option=${config_option}" >&2
    exit 1
fi
|
| 231 |
+
|
| 232 |
+
# Launch EAT pre-training through fairseq's Hydra entry point.
#
# Every override below is built from a shell variable assigned earlier in this
# script. The optional first positional argument selects
# distributed_training.distributed_world_size (default: 2).
#
# Abort before starting the trainer when no configuration branch populated the
# dataset path; otherwise Hydra would receive an empty `task.data=` override.
: "${task_data:?task_data is not set - check the selected train_mode/config_option}"

# Expansions are quoted so that a value containing whitespace or glob
# characters is passed to Hydra as exactly one override argument.
python fairseq_cli/hydra_train.py -m \
    --config-dir ./EAT/config \
    --config-name pretraining_AS2M \
    common.user_dir=./EAT \
    "checkpoint.save_dir=${checkpoint_save_dir}" \
    "checkpoint.restore_file=${checkpoint_restore_file}" \
    "distributed_training.distributed_world_size=${1:-2}" \
    dataset.num_workers=24 \
    dataset.data_buffer_size=48 \
    "dataset.batch_size=${dataset_batch_size}" \
    "task.data=${task_data}" \
    task.h5_format=False \
    "task.load_clap_emb=${task_load_clap_emb}" \
    "+task.load_source_file=${task_load_source_file}" \
    "+task.load_mel_file=${task_load_mel_file}" \
    "model.proj_type=${model_proj_type}" \
    "model.clone_batch=${model_clone_batch}" \
    "model.clap_loss=${model_clap_loss}" \
    "model.average_top_k_layers=${average_top_k_layers}" \
    "+model.add_conv=${model_add_conv}" \
    "+model.clap_loss_type=${model_clap_loss_type}" \
    "+model.clap_loss_layer=${model_clap_loss_layer}" \
    "+model.dispersive_loss=${model_dispersive_loss}" \
    "+model.dispersive_loss_layer=${model_dispersive_loss_layer}" \
    "model.depth=${model_depth}" \
    "checkpoint.keep_interval_updates=${checkpoint_keep_interval_updates}" \
    "checkpoint.save_interval_updates=${checkpoint_save_interval_updates}"
|
pre_4_AS2M/disp_1_2025-09-26_14-39-04/pretraining_AS2M.sh
ADDED
|
@@ -0,0 +1,258 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# Launcher for EAT pre-training on AS2M.
#
# The two values below select which experiment configuration is applied by the
# if/elif chain further down in this script.
# config options
train_mode=disp
config_option=1
# change world size: pass it as the first positional argument of this script

# shared config
SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
# Each launch gets its own run directory, named <mode>_<option>_<launch timestamp>.
checkpoint_save_dir=${SAVE_DIR_ROOT}/${train_mode}_${config_option}_$(date +"%Y-%m-%d_%H-%M-%S")
# NOTE(review): the restore path lives inside the freshly timestamped run
# directory, so it normally does not exist yet on launch - confirm that
# resuming an earlier run is not expected from this script.
checkpoint_restore_file=${checkpoint_save_dir}/checkpoint_last.pt

# Absolute path and file name of this script (symbolic links resolved).
script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
script_name="$(basename -- "$script_path")"
# Create the run directory and copy this script into it (preserving mode and
# timestamps) so the exact settings of the run are archived with its checkpoints.
if ! mkdir -p -- "$checkpoint_save_dir"; then
    # Without the run directory neither the snapshot nor checkpoints can be written.
    echo "Failed to create checkpoint_save_dir: ${checkpoint_save_dir}" >&2
    exit 1
fi
cp -p -- "$script_path" "$checkpoint_save_dir/$script_name"
echo "script_path: ${script_path}"
echo "checkpoint_save_dir: ${checkpoint_save_dir}"
|
| 20 |
+
|
| 21 |
+
# Baseline values; the configuration branch selected below may override any of them.

# -- batching --
dataset_batch_size=48
model_clone_batch=4

# -- model size --
model_depth=12
average_top_k_layers=12
model_add_conv=false

# -- auxiliary CLAP loss (other loss types used by the branches: ce, cosine, l1) --
model_clap_loss=0
model_clap_loss_type=mse
model_clap_loss_layer=0

# -- dispersive loss --
model_dispersive_loss=0
model_dispersive_loss_layer=0

# -- checkpointing --
# TODO: adjust checkpoint_keep_interval_updates if needed
checkpoint_save_interval_updates=10000
checkpoint_keep_interval_updates=1
|
| 34 |
+
|
| 35 |
+
# Apply the experiment configuration selected by train_mode / config_option.
# Each branch overrides a subset of the defaults assigned above and sets the
# dataset-specific variables (task_data, task_load_clap_emb, model_proj_type).
#
# NOTE(review): the "clap" 0-5 and "ast" 0-3 branches never assign
# task_load_source_file / task_load_mel_file, although both are forwarded to
# the trainer - confirm that leaving them empty is intended.
#
# An unknown combination aborts the script instead of silently launching a run
# with task_data and the other per-branch variables unset.
if [[ $train_mode == "default" && ${config_option} -eq 0 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=0
    checkpoint_keep_interval_updates=-1
elif [[ $train_mode == "disp" && ${config_option} -eq 0 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=1
    dataset_batch_size=384
    model_dispersive_loss=1
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
elif [[ $train_mode == "disp" && ${config_option} -eq 1 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=4
    dataset_batch_size=96
    model_dispersive_loss=1
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
elif [[ $train_mode == "clap" && ${config_option} -eq 0 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=12
    model_add_conv=false
elif [[ $train_mode == "clap" && ${config_option} -eq 1 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=1
# loss type ablation
elif [[ $train_mode == "clap" && ${config_option} -eq 2 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="ce"
elif [[ $train_mode == "clap" && ${config_option} -eq 3 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="l1"
elif [[ $train_mode == "clap" && ${config_option} -eq 4 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="cosine"
# loss layer ablation
elif [[ $train_mode == "clap" && ${config_option} -eq 5 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="mse"
    model_clap_loss_layer=10
elif [[ $train_mode == "clap" && ${config_option} -eq 6 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="mse"
    model_clap_loss_layer=8
elif [[ $train_mode == "clap" && ${config_option} -eq 7 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="mse"
    model_clap_loss_layer=6
# CLAP loss weight variants (5.0 and 0.1)
elif [[ $train_mode == "clap" && ${config_option} -eq 8 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    model_clap_loss=5.0
    dataset_batch_size=96
    average_top_k_layers=12
    model_clap_loss_type="mse"
    checkpoint_keep_interval_updates=-1
elif [[ $train_mode == "clap" && ${config_option} -eq 9 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    model_clap_loss=0.1
    dataset_batch_size=96
    average_top_k_layers=12
    model_clap_loss_type="mse"
    checkpoint_keep_interval_updates=-1
elif [[ $train_mode == "ast" && ${config_option} -eq 0 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
    task_load_clap_emb=true
    model_proj_type=4
    model_clone_batch=4
    model_clap_loss=1.0
    dataset_batch_size=48
elif [[ $train_mode == "ast" && ${config_option} -eq 1 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
    task_load_clap_emb=true
    model_proj_type=4
    model_clone_batch=4
    model_clap_loss=0.001
    dataset_batch_size=48
elif [[ $train_mode == "ast" && ${config_option} -eq 2 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
    task_load_clap_emb=true
    model_proj_type=4
    model_clone_batch=4
    model_clap_loss=0.01
    dataset_batch_size=48
elif [[ $train_mode == "ast" && ${config_option} -eq 3 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_out
    task_load_clap_emb=true
    model_proj_type=6
    model_clone_batch=4
    # NOTE(review): model_clap_loss is not set here, so it keeps whatever value
    # was assigned before this chain - confirm this is intended.
    dataset_batch_size=48
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 0 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=64 # original 48 oom on 4090 24G change distributed_world_size
    model_clap_loss=1.0
    average_top_k_layers=11 # modify with model depth
    model_add_conv=true
    model_depth=11 #
    checkpoint_keep_interval_updates=-1 # default 1
    checkpoint_save_interval_updates=10000
else
    # No branch matched: stop here rather than start training with unset variables.
    echo "Unknown configuration: train_mode=${train_mode} config_option=${config_option}" >&2
    exit 1
fi
|
| 231 |
+
|
| 232 |
+
# Launch EAT pre-training through fairseq's Hydra entry point.
#
# Every override below is built from a shell variable assigned earlier in this
# script. The optional first positional argument selects
# distributed_training.distributed_world_size (default: 2).
#
# Abort before starting the trainer when no configuration branch populated the
# dataset path; otherwise Hydra would receive an empty `task.data=` override.
: "${task_data:?task_data is not set - check the selected train_mode/config_option}"

# Expansions are quoted so that a value containing whitespace or glob
# characters is passed to Hydra as exactly one override argument.
python fairseq_cli/hydra_train.py -m \
    --config-dir ./EAT/config \
    --config-name pretraining_AS2M \
    common.user_dir=./EAT \
    "checkpoint.save_dir=${checkpoint_save_dir}" \
    "checkpoint.restore_file=${checkpoint_restore_file}" \
    "distributed_training.distributed_world_size=${1:-2}" \
    dataset.num_workers=24 \
    dataset.data_buffer_size=48 \
    "dataset.batch_size=${dataset_batch_size}" \
    "task.data=${task_data}" \
    task.h5_format=False \
    "task.load_clap_emb=${task_load_clap_emb}" \
    "+task.load_source_file=${task_load_source_file}" \
    "+task.load_mel_file=${task_load_mel_file}" \
    "model.proj_type=${model_proj_type}" \
    "model.clone_batch=${model_clone_batch}" \
    "model.clap_loss=${model_clap_loss}" \
    "model.average_top_k_layers=${average_top_k_layers}" \
    "+model.add_conv=${model_add_conv}" \
    "+model.clap_loss_type=${model_clap_loss_type}" \
    "+model.clap_loss_layer=${model_clap_loss_layer}" \
    "+model.dispersive_loss=${model_dispersive_loss}" \
    "+model.dispersive_loss_layer=${model_dispersive_loss_layer}" \
    "model.depth=${model_depth}" \
    "checkpoint.keep_interval_updates=${checkpoint_keep_interval_updates}" \
    "checkpoint.save_interval_updates=${checkpoint_save_interval_updates}"
|
pre_4_AS2M/disp_1_2025-09-26_14-57-51/pretraining_AS2M.sh
ADDED
|
@@ -0,0 +1,258 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# Launcher for EAT pre-training on AS2M.
#
# The two values below select which experiment configuration is applied by the
# if/elif chain further down in this script.
# config options
train_mode=disp
config_option=1
# change world size

# shared config
SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
# Each launch gets its own run directory, named <mode>_<option>_<launch timestamp>.
checkpoint_save_dir=${SAVE_DIR_ROOT}/${train_mode}_${config_option}_$(date +"%Y-%m-%d_%H-%M-%S")
# NOTE(review): the restore path lives inside the freshly timestamped run
# directory, so it normally does not exist yet on launch - confirm that
# resuming an earlier run is not expected from this script.
checkpoint_restore_file=${checkpoint_save_dir}/checkpoint_last.pt

# Absolute path and file name of this script (symbolic links resolved).
script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
script_name="$(basename -- "$script_path")"
# Create the run directory and copy this script into it (preserving mode and
# timestamps) so the exact settings of the run are archived with its checkpoints.
if ! mkdir -p -- "$checkpoint_save_dir"; then
    # Without the run directory neither the snapshot nor checkpoints can be written.
    echo "Failed to create checkpoint_save_dir: ${checkpoint_save_dir}" >&2
    exit 1
fi
cp -p -- "$script_path" "$checkpoint_save_dir/$script_name"
echo "script_path: ${script_path}"
echo "checkpoint_save_dir: ${checkpoint_save_dir}"
|
| 20 |
+
|
| 21 |
+
# Baseline values; the configuration branch selected below may override any of them.

# -- batching --
dataset_batch_size=48
model_clone_batch=4

# -- model size --
model_depth=12
average_top_k_layers=12
model_add_conv=false

# -- auxiliary CLAP loss (other loss types used by the branches: ce, cosine, l1) --
model_clap_loss=0
model_clap_loss_type=mse
model_clap_loss_layer=0

# -- dispersive loss --
model_dispersive_loss=0
model_dispersive_loss_layer=0

# -- checkpointing --
# TODO: adjust checkpoint_keep_interval_updates if needed
checkpoint_save_interval_updates=10000
checkpoint_keep_interval_updates=1
|
| 34 |
+
|
| 35 |
+
if [[ $train_mode == "default" && ${config_option} -eq 0 ]]; then
|
| 36 |
+
echo "Config ${train_mode} ${config_option}"
|
| 37 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 38 |
+
task_load_clap_emb=false
|
| 39 |
+
task_load_source_file=true
|
| 40 |
+
task_load_mel_file=false
|
| 41 |
+
model_proj_type=null
|
| 42 |
+
model_clone_batch=4
|
| 43 |
+
dataset_batch_size=96
|
| 44 |
+
model_clap_loss=0
|
| 45 |
+
checkpoint_keep_interval_updates=-1
|
| 46 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 0 ]]; then
|
| 47 |
+
echo "Config ${train_mode} ${config_option}"
|
| 48 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 49 |
+
task_load_clap_emb=false
|
| 50 |
+
task_load_source_file=true
|
| 51 |
+
task_load_mel_file=false
|
| 52 |
+
model_proj_type=null
|
| 53 |
+
model_clone_batch=4
|
| 54 |
+
dataset_batch_size=96
|
| 55 |
+
model_dispersive_loss=1
|
| 56 |
+
model_dispersive_loss_layer=0
|
| 57 |
+
checkpoint_keep_interval_updates=1
|
| 58 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 1 ]]; then
|
| 59 |
+
echo "Config ${train_mode} ${config_option}"
|
| 60 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 61 |
+
task_load_clap_emb=false
|
| 62 |
+
task_load_source_file=true
|
| 63 |
+
task_load_mel_file=false
|
| 64 |
+
model_proj_type=null
|
| 65 |
+
model_clone_batch=1
|
| 66 |
+
dataset_batch_size=384
|
| 67 |
+
model_dispersive_loss=1
|
| 68 |
+
model_dispersive_loss_layer=0
|
| 69 |
+
checkpoint_keep_interval_updates=1
|
| 70 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 0 ]]; then
|
| 71 |
+
echo "Config ${train_mode} ${config_option}"
|
| 72 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 73 |
+
task_load_clap_emb=true
|
| 74 |
+
model_proj_type=2
|
| 75 |
+
model_clone_batch=4
|
| 76 |
+
dataset_batch_size=48
|
| 77 |
+
model_clap_loss=1.0
|
| 78 |
+
average_top_k_layers=12
|
| 79 |
+
model_add_conv=false
|
| 80 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 1 ]]; then
|
| 81 |
+
echo "Config ${train_mode} ${config_option}"
|
| 82 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 83 |
+
task_load_clap_emb=true
|
| 84 |
+
model_proj_type=2
|
| 85 |
+
model_clone_batch=4
|
| 86 |
+
dataset_batch_size=48
|
| 87 |
+
model_clap_loss=1.0
|
| 88 |
+
average_top_k_layers=1
|
| 89 |
+
# loss type ablation
|
| 90 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 2 ]]; then
|
| 91 |
+
echo "Config ${train_mode} ${config_option}"
|
| 92 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 93 |
+
task_load_clap_emb=true
|
| 94 |
+
model_proj_type=2
|
| 95 |
+
model_clone_batch=4
|
| 96 |
+
dataset_batch_size=48
|
| 97 |
+
model_clap_loss=1.0
|
| 98 |
+
average_top_k_layers=12
|
| 99 |
+
model_clap_loss_type="ce"
|
| 100 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 3 ]]; then
|
| 101 |
+
echo "Config ${train_mode} ${config_option}"
|
| 102 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 103 |
+
task_load_clap_emb=true
|
| 104 |
+
model_proj_type=2
|
| 105 |
+
model_clone_batch=4
|
| 106 |
+
dataset_batch_size=48
|
| 107 |
+
model_clap_loss=1.0
|
| 108 |
+
average_top_k_layers=12
|
| 109 |
+
model_clap_loss_type="l1"
|
| 110 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 4 ]]; then
|
| 111 |
+
echo "Config ${train_mode} ${config_option}"
|
| 112 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 113 |
+
task_load_clap_emb=true
|
| 114 |
+
model_proj_type=2
|
| 115 |
+
model_clone_batch=4
|
| 116 |
+
dataset_batch_size=96
|
| 117 |
+
model_clap_loss=1.0
|
| 118 |
+
average_top_k_layers=12
|
| 119 |
+
model_clap_loss_type="cosine"
|
| 120 |
+
# loss layer ablation
|
| 121 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 5 ]]; then
|
| 122 |
+
echo "Config ${train_mode} ${config_option}"
|
| 123 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 124 |
+
task_load_clap_emb=true
|
| 125 |
+
model_proj_type=2
|
| 126 |
+
model_clone_batch=4
|
| 127 |
+
dataset_batch_size=96
|
| 128 |
+
model_clap_loss=1.0
|
| 129 |
+
average_top_k_layers=12
|
| 130 |
+
model_clap_loss_type="mse"
|
| 131 |
+
model_clap_loss_layer=10
|
| 132 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 6 ]]; then
|
| 133 |
+
echo "Config ${train_mode} ${config_option}"
|
| 134 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 135 |
+
task_load_clap_emb=true
|
| 136 |
+
task_load_source_file=true
|
| 137 |
+
task_load_mel_file=false
|
| 138 |
+
model_proj_type=2
|
| 139 |
+
model_clone_batch=4
|
| 140 |
+
dataset_batch_size=96
|
| 141 |
+
model_clap_loss=1.0
|
| 142 |
+
average_top_k_layers=12
|
| 143 |
+
model_clap_loss_type="mse"
|
| 144 |
+
model_clap_loss_layer=8
|
| 145 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 7 ]]; then
|
| 146 |
+
echo "Config ${train_mode} ${config_option}"
|
| 147 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 148 |
+
task_load_clap_emb=true
|
| 149 |
+
task_load_source_file=true
|
| 150 |
+
task_load_mel_file=false
|
| 151 |
+
model_proj_type=2
|
| 152 |
+
model_clone_batch=4
|
| 153 |
+
dataset_batch_size=96
|
| 154 |
+
model_clap_loss=1.0
|
| 155 |
+
average_top_k_layers=12
|
| 156 |
+
model_clap_loss_type="mse"
|
| 157 |
+
model_clap_loss_layer=6
|
| 158 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 8 ]]; then
|
| 159 |
+
echo "Config ${train_mode} ${config_option}"
|
| 160 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 161 |
+
task_load_clap_emb=true
|
| 162 |
+
task_load_source_file=true
|
| 163 |
+
task_load_mel_file=false
|
| 164 |
+
model_proj_type=2
|
| 165 |
+
model_clone_batch=4
|
| 166 |
+
model_clap_loss=5.0
|
| 167 |
+
dataset_batch_size=96
|
| 168 |
+
average_top_k_layers=12
|
| 169 |
+
model_clap_loss_type="mse"
|
| 170 |
+
checkpoint_keep_interval_updates=-1
|
| 171 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 9 ]]; then
|
| 172 |
+
echo "Config ${train_mode} ${config_option}"
|
| 173 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 174 |
+
task_load_clap_emb=true
|
| 175 |
+
task_load_source_file=true
|
| 176 |
+
task_load_mel_file=false
|
| 177 |
+
model_proj_type=2
|
| 178 |
+
model_clone_batch=4
|
| 179 |
+
model_clap_loss=0.1
|
| 180 |
+
dataset_batch_size=96
|
| 181 |
+
average_top_k_layers=12
|
| 182 |
+
model_clap_loss_type="mse"
|
| 183 |
+
checkpoint_keep_interval_updates=-1
|
| 184 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 0 ]]; then
|
| 185 |
+
echo "Config ${train_mode} ${config_option}"
|
| 186 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
|
| 187 |
+
task_load_clap_emb=true
|
| 188 |
+
model_proj_type=4
|
| 189 |
+
model_clone_batch=4
|
| 190 |
+
model_clap_loss=1.0
|
| 191 |
+
dataset_batch_size=48
|
| 192 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 1 ]]; then
|
| 193 |
+
echo "Config ${train_mode} ${config_option}"
|
| 194 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
|
| 195 |
+
task_load_clap_emb=true
|
| 196 |
+
model_proj_type=4
|
| 197 |
+
model_clone_batch=4
|
| 198 |
+
model_clap_loss=0.001
|
| 199 |
+
dataset_batch_size=48
|
| 200 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 2 ]]; then
|
| 201 |
+
echo "Config ${train_mode} ${config_option}"
|
| 202 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
|
| 203 |
+
task_load_clap_emb=true
|
| 204 |
+
model_proj_type=4
|
| 205 |
+
model_clone_batch=4
|
| 206 |
+
model_clap_loss=0.01
|
| 207 |
+
dataset_batch_size=48
|
| 208 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 3 ]]; then
|
| 209 |
+
echo "Config ${train_mode} ${config_option}"
|
| 210 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_out
|
| 211 |
+
task_load_clap_emb=true
|
| 212 |
+
model_proj_type=6
|
| 213 |
+
model_clone_batch=4
|
| 214 |
+
dataset_batch_size=48
|
| 215 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 0 ]]; then
|
| 216 |
+
echo "Config ${train_mode} ${config_option}"
|
| 217 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 218 |
+
task_load_clap_emb=true
|
| 219 |
+
task_load_source_file=true
|
| 220 |
+
task_load_mel_file=false
|
| 221 |
+
model_proj_type=2
|
| 222 |
+
model_clone_batch=4
|
| 223 |
+
dataset_batch_size=64 # original 48 oom on 4090 24G change distributed_world_size
|
| 224 |
+
model_clap_loss=1.0
|
| 225 |
+
average_top_k_layers=11 # modify with model depth
|
| 226 |
+
model_add_conv=true
|
| 227 |
+
model_depth=11 #
|
| 228 |
+
checkpoint_keep_interval_updates=-1 # default 1
|
| 229 |
+
checkpoint_save_interval_updates=10000
|
| 230 |
+
fi
|
| 231 |
+
|
| 232 |
+
# Launch pre-training through fairseq's Hydra entry point, forwarding the
# shell variables prepared above as Hydra command-line overrides.
#
# Positional argument:
#   $1  value for distributed_training.distributed_world_size (defaults to 2).
#
# task_data, task_load_clap_emb and model_proj_type have no default earlier in
# this script; they are only assigned inside the train_mode/config_option
# preset branches. If no branch matched they would expand to nothing and the
# run would start with empty overrides, so stop here with a clear message.
: "${task_data:?task_data is not set - no preset matched train_mode/config_option}"
: "${task_load_clap_emb:?task_load_clap_emb is not set - no preset matched train_mode/config_option}"
: "${model_proj_type:?model_proj_type is not set - no preset matched train_mode/config_option}"

# NOTE(review): task_load_source_file and task_load_mel_file are assigned only
# by some presets; for the others they are forwarded as empty values exactly
# as before. Confirm what the task expects in that case.
#
# Every override is quoted so a value containing whitespace or glob
# characters cannot be split into several arguments.
python fairseq_cli/hydra_train.py -m \
    --config-dir ./EAT/config \
    --config-name pretraining_AS2M \
    common.user_dir=./EAT \
    checkpoint.save_dir="${checkpoint_save_dir}" \
    checkpoint.restore_file="${checkpoint_restore_file}" \
    distributed_training.distributed_world_size="${1:-2}" \
    dataset.num_workers=24 \
    dataset.data_buffer_size=48 \
    dataset.batch_size="${dataset_batch_size}" \
    task.data="${task_data}" \
    task.h5_format=False \
    task.load_clap_emb="${task_load_clap_emb}" \
    +task.load_source_file="${task_load_source_file}" \
    +task.load_mel_file="${task_load_mel_file}" \
    model.proj_type="${model_proj_type}" \
    model.clone_batch="${model_clone_batch}" \
    model.clap_loss="${model_clap_loss}" \
    model.average_top_k_layers="${average_top_k_layers}" \
    +model.add_conv="${model_add_conv}" \
    +model.clap_loss_type="${model_clap_loss_type}" \
    +model.clap_loss_layer="${model_clap_loss_layer}" \
    +model.dispersive_loss="${model_dispersive_loss}" \
    +model.dispersive_loss_layer="${model_dispersive_loss_layer}" \
    model.depth="${model_depth}" \
    checkpoint.keep_interval_updates="${checkpoint_keep_interval_updates}" \
    checkpoint.save_interval_updates="${checkpoint_save_interval_updates}"
|
pre_4_AS2M/disp_3_2025-09-27_05-57-32/pretraining_AS2M.sh
ADDED
|
@@ -0,0 +1,282 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# ---------------------------------------------------------------------------
# Run selection: which preset (train_mode + config_option) this script runs.
# The world size is taken from the first command-line argument at launch.
# ---------------------------------------------------------------------------
train_mode=disp
config_option=3

# ---------------------------------------------------------------------------
# Output locations shared by every preset. Each invocation gets its own
# timestamped directory under SAVE_DIR_ROOT.
# ---------------------------------------------------------------------------
SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
checkpoint_save_dir="${SAVE_DIR_ROOT}/${train_mode}_${config_option}_$(date +"%Y-%m-%d_%H-%M-%S")"
checkpoint_restore_file="${checkpoint_save_dir}/checkpoint_last.pt"

# Absolute path and file name of this script (symlinks resolved).
script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
script_name="$(basename -- "${script_path}")"

# Create the run directory and keep a copy of this script next to the
# checkpoints (cp -p preserves mode and timestamps).
mkdir -p -- "${checkpoint_save_dir}"
cp -p -- "${script_path}" "${checkpoint_save_dir}/${script_name}"
printf '%s\n' "script_path: ${script_path}"
printf '%s\n' "checkpoint_save_dir: ${checkpoint_save_dir}"

# ---------------------------------------------------------------------------
# Default hyper-parameters; individual presets override these below.
# ---------------------------------------------------------------------------
# Batching
model_clone_batch=4
dataset_batch_size=48

# Model shape
model_depth=12
average_top_k_layers=12
model_add_conv=false

# CLAP loss (type options noted by the author: mse, ce, cosine, l1)
model_clap_loss=0
model_clap_loss_type="mse"
model_clap_loss_layer=0

# Dispersive loss
model_dispersive_loss=0
model_dispersive_loss_layer=0

# Checkpointing
checkpoint_keep_interval_updates=1   # TODO: change this parameter if needed
checkpoint_save_interval_updates=10000
|
| 34 |
+
|
| 35 |
+
# Select the hyper-parameter preset for this run from train_mode and
# config_option. Each preset overrides only the variables it lists; anything
# not listed keeps the value it had before this block.
#
# The key is matched as a string ("mode:option"), so config_option must be
# written as a plain decimal number such as 3.
#
# NOTE(review): the presets clap:0-5 and ast:0-3 do not assign
# task_load_source_file / task_load_mel_file; confirm that is intentional.
#
# An unknown combination aborts the script instead of silently continuing
# with task_data, task_load_clap_emb and model_proj_type unset.
audioset_setting_root=/opt/gpfs/home/chushu/data/audioset/setting

case "${train_mode}:${config_option}" in
    default:0)
        task_data=${audioset_setting_root}/PRETRAIN_AS2M
        task_load_clap_emb=false
        task_load_source_file=true
        task_load_mel_file=false
        model_proj_type=null
        model_clone_batch=4
        dataset_batch_size=96
        model_clap_loss=0
        checkpoint_keep_interval_updates=-1
        ;;

    # --- dispersive loss presets -------------------------------------------
    disp:0)
        task_data=${audioset_setting_root}/PRETRAIN_AS2M
        task_load_clap_emb=false
        task_load_source_file=true
        task_load_mel_file=false
        model_proj_type=null
        model_clone_batch=4
        dataset_batch_size=96
        model_dispersive_loss=1
        model_dispersive_loss_layer=0
        checkpoint_keep_interval_updates=1
        ;;
    disp:1)
        task_data=${audioset_setting_root}/PRETRAIN_AS2M
        task_load_clap_emb=false
        task_load_source_file=true
        task_load_mel_file=false
        model_proj_type=null
        model_clone_batch=1
        dataset_batch_size=384
        model_dispersive_loss=1
        model_dispersive_loss_layer=0
        checkpoint_keep_interval_updates=1
        ;;
    disp:2)
        task_data=${audioset_setting_root}/PRETRAIN_AS2M
        task_load_clap_emb=false
        task_load_source_file=true
        task_load_mel_file=false
        model_proj_type=null
        model_clone_batch=1
        dataset_batch_size=384
        model_dispersive_loss=10.0
        model_dispersive_loss_layer=0
        checkpoint_keep_interval_updates=1
        ;;
    disp:3)
        task_data=${audioset_setting_root}/PRETRAIN_AS2M
        task_load_clap_emb=false
        task_load_source_file=true
        task_load_mel_file=false
        model_proj_type=null
        model_clone_batch=1
        dataset_batch_size=384
        model_dispersive_loss=100.0
        model_dispersive_loss_layer=0
        checkpoint_keep_interval_updates=1
        ;;

    # --- CLAP presets --------------------------------------------------------
    clap:0)
        task_data=${audioset_setting_root}/PRETRAIN_AS2M_w_CLAP
        task_load_clap_emb=true
        model_proj_type=2
        model_clone_batch=4
        dataset_batch_size=48
        model_clap_loss=1.0
        average_top_k_layers=12
        model_add_conv=false
        ;;
    clap:1)
        task_data=${audioset_setting_root}/PRETRAIN_AS2M_w_CLAP
        task_load_clap_emb=true
        model_proj_type=2
        model_clone_batch=4
        dataset_batch_size=48
        model_clap_loss=1.0
        average_top_k_layers=1
        ;;

    # loss type ablation
    clap:2)
        task_data=${audioset_setting_root}/PRETRAIN_AS2M_w_CLAP
        task_load_clap_emb=true
        model_proj_type=2
        model_clone_batch=4
        dataset_batch_size=48
        model_clap_loss=1.0
        average_top_k_layers=12
        model_clap_loss_type="ce"
        ;;
    clap:3)
        task_data=${audioset_setting_root}/PRETRAIN_AS2M_w_CLAP
        task_load_clap_emb=true
        model_proj_type=2
        model_clone_batch=4
        dataset_batch_size=48
        model_clap_loss=1.0
        average_top_k_layers=12
        model_clap_loss_type="l1"
        ;;
    clap:4)
        task_data=${audioset_setting_root}/PRETRAIN_AS2M_w_CLAP
        task_load_clap_emb=true
        model_proj_type=2
        model_clone_batch=4
        dataset_batch_size=96
        model_clap_loss=1.0
        average_top_k_layers=12
        model_clap_loss_type="cosine"
        ;;

    # loss layer ablation
    clap:5)
        task_data=${audioset_setting_root}/PRETRAIN_AS2M_w_CLAP
        task_load_clap_emb=true
        model_proj_type=2
        model_clone_batch=4
        dataset_batch_size=96
        model_clap_loss=1.0
        average_top_k_layers=12
        model_clap_loss_type="mse"
        model_clap_loss_layer=10
        ;;
    clap:6)
        task_data=${audioset_setting_root}/PRETRAIN_AS2M_w_CLAP
        task_load_clap_emb=true
        task_load_source_file=true
        task_load_mel_file=false
        model_proj_type=2
        model_clone_batch=4
        dataset_batch_size=96
        model_clap_loss=1.0
        average_top_k_layers=12
        model_clap_loss_type="mse"
        model_clap_loss_layer=8
        ;;
    clap:7)
        task_data=${audioset_setting_root}/PRETRAIN_AS2M_w_CLAP
        task_load_clap_emb=true
        task_load_source_file=true
        task_load_mel_file=false
        model_proj_type=2
        model_clone_batch=4
        dataset_batch_size=96
        model_clap_loss=1.0
        average_top_k_layers=12
        model_clap_loss_type="mse"
        model_clap_loss_layer=6
        ;;
    clap:8)
        task_data=${audioset_setting_root}/PRETRAIN_AS2M_w_CLAP
        task_load_clap_emb=true
        task_load_source_file=true
        task_load_mel_file=false
        model_proj_type=2
        model_clone_batch=4
        model_clap_loss=5.0
        dataset_batch_size=96
        average_top_k_layers=12
        model_clap_loss_type="mse"
        checkpoint_keep_interval_updates=-1
        ;;
    clap:9)
        task_data=${audioset_setting_root}/PRETRAIN_AS2M_w_CLAP
        task_load_clap_emb=true
        task_load_source_file=true
        task_load_mel_file=false
        model_proj_type=2
        model_clone_batch=4
        model_clap_loss=0.1
        dataset_batch_size=96
        average_top_k_layers=12
        model_clap_loss_type="mse"
        checkpoint_keep_interval_updates=-1
        ;;

    # --- AST presets ---------------------------------------------------------
    ast:0)
        task_data=${audioset_setting_root}/PRETRAIN_AS2M_w_AST/mlp_head_in
        task_load_clap_emb=true
        model_proj_type=4
        model_clone_batch=4
        model_clap_loss=1.0
        dataset_batch_size=48
        ;;
    ast:1)
        task_data=${audioset_setting_root}/PRETRAIN_AS2M_w_AST/mlp_head_in
        task_load_clap_emb=true
        model_proj_type=4
        model_clone_batch=4
        model_clap_loss=0.001
        dataset_batch_size=48
        ;;
    ast:2)
        task_data=${audioset_setting_root}/PRETRAIN_AS2M_w_AST/mlp_head_in
        task_load_clap_emb=true
        model_proj_type=4
        model_clone_batch=4
        model_clap_loss=0.01
        dataset_batch_size=48
        ;;
    ast:3)
        task_data=${audioset_setting_root}/PRETRAIN_AS2M_w_AST/mlp_head_out
        task_load_clap_emb=true
        model_proj_type=6
        model_clone_batch=4
        dataset_batch_size=48
        ;;

    # --- CLAP with the add_conv option enabled -------------------------------
    conv_clap:0)
        task_data=${audioset_setting_root}/PRETRAIN_AS2M_w_CLAP
        task_load_clap_emb=true
        task_load_source_file=true
        task_load_mel_file=false
        model_proj_type=2
        model_clone_batch=4
        # Author's note: original 48 OOM on 4090 24G; change distributed_world_size.
        dataset_batch_size=64
        model_clap_loss=1.0
        average_top_k_layers=11   # modify together with model_depth
        model_add_conv=true
        model_depth=11
        checkpoint_keep_interval_updates=-1   # default is 1
        checkpoint_save_interval_updates=10000
        ;;

    *)
        echo "Unknown config: train_mode=${train_mode} config_option=${config_option}" >&2
        exit 1
        ;;
esac

# Reached only when a preset matched (the fallback branch exits above).
echo "Config ${train_mode} ${config_option}"
|
| 255 |
+
|
| 256 |
+
# Launch pre-training through fairseq's Hydra entry point, forwarding the
# shell variables prepared above as Hydra command-line overrides.
#
# Positional argument:
#   $1  value for distributed_training.distributed_world_size (defaults to 2).
#
# task_data, task_load_clap_emb and model_proj_type have no default earlier in
# this script; they are only assigned inside the train_mode/config_option
# preset branches. If no branch matched they would expand to nothing and the
# run would start with empty overrides, so stop here with a clear message.
: "${task_data:?task_data is not set - no preset matched train_mode/config_option}"
: "${task_load_clap_emb:?task_load_clap_emb is not set - no preset matched train_mode/config_option}"
: "${model_proj_type:?model_proj_type is not set - no preset matched train_mode/config_option}"

# NOTE(review): task_load_source_file and task_load_mel_file are assigned only
# by some presets; for the others they are forwarded as empty values exactly
# as before. Confirm what the task expects in that case.
#
# Every override is quoted so a value containing whitespace or glob
# characters cannot be split into several arguments.
python fairseq_cli/hydra_train.py -m \
    --config-dir ./EAT/config \
    --config-name pretraining_AS2M \
    common.user_dir=./EAT \
    checkpoint.save_dir="${checkpoint_save_dir}" \
    checkpoint.restore_file="${checkpoint_restore_file}" \
    distributed_training.distributed_world_size="${1:-2}" \
    dataset.num_workers=24 \
    dataset.data_buffer_size=48 \
    dataset.batch_size="${dataset_batch_size}" \
    task.data="${task_data}" \
    task.h5_format=False \
    task.load_clap_emb="${task_load_clap_emb}" \
    +task.load_source_file="${task_load_source_file}" \
    +task.load_mel_file="${task_load_mel_file}" \
    model.proj_type="${model_proj_type}" \
    model.clone_batch="${model_clone_batch}" \
    model.clap_loss="${model_clap_loss}" \
    model.average_top_k_layers="${average_top_k_layers}" \
    +model.add_conv="${model_add_conv}" \
    +model.clap_loss_type="${model_clap_loss_type}" \
    +model.clap_loss_layer="${model_clap_loss_layer}" \
    +model.dispersive_loss="${model_dispersive_loss}" \
    +model.dispersive_loss_layer="${model_dispersive_loss_layer}" \
    model.depth="${model_depth}" \
    checkpoint.keep_interval_updates="${checkpoint_keep_interval_updates}" \
    checkpoint.save_interval_updates="${checkpoint_save_interval_updates}"
|
pre_4_AS2M/disp_4_2025-09-28_05-38-34/pretraining_AS2M.sh
ADDED
|
@@ -0,0 +1,294 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# ---------------------------------------------------------------------------
# Run selection: which preset (train_mode + config_option) this script runs.
# The world size is taken from the first command-line argument at launch.
# ---------------------------------------------------------------------------
train_mode=disp
config_option=4

# ---------------------------------------------------------------------------
# Output locations shared by every preset. Each invocation gets its own
# timestamped directory under SAVE_DIR_ROOT.
# ---------------------------------------------------------------------------
SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
checkpoint_save_dir="${SAVE_DIR_ROOT}/${train_mode}_${config_option}_$(date +"%Y-%m-%d_%H-%M-%S")"
checkpoint_restore_file="${checkpoint_save_dir}/checkpoint_last.pt"

# Absolute path and file name of this script (symlinks resolved).
script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
script_name="$(basename -- "${script_path}")"

# Create the run directory and keep a copy of this script next to the
# checkpoints (cp -p preserves mode and timestamps).
mkdir -p -- "${checkpoint_save_dir}"
cp -p -- "${script_path}" "${checkpoint_save_dir}/${script_name}"
printf '%s\n' "script_path: ${script_path}"
printf '%s\n' "checkpoint_save_dir: ${checkpoint_save_dir}"

# ---------------------------------------------------------------------------
# Default hyper-parameters; individual presets override these below.
# ---------------------------------------------------------------------------
# Batching
model_clone_batch=4
dataset_batch_size=48

# Model shape
model_depth=12
average_top_k_layers=12
model_add_conv=false

# CLAP loss (type options noted by the author: mse, ce, cosine, l1)
model_clap_loss=0
model_clap_loss_type="mse"
model_clap_loss_layer=0

# Dispersive loss
model_dispersive_loss=0
model_dispersive_loss_layer=0

# Checkpointing
checkpoint_keep_interval_updates=1   # TODO: change this parameter if needed
checkpoint_save_interval_updates=10000
|
| 34 |
+
|
| 35 |
+
if [[ $train_mode == "default" && ${config_option} -eq 0 ]]; then
|
| 36 |
+
echo "Config ${train_mode} ${config_option}"
|
| 37 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 38 |
+
task_load_clap_emb=false
|
| 39 |
+
task_load_source_file=true
|
| 40 |
+
task_load_mel_file=false
|
| 41 |
+
model_proj_type=null
|
| 42 |
+
model_clone_batch=4
|
| 43 |
+
dataset_batch_size=96
|
| 44 |
+
model_clap_loss=0
|
| 45 |
+
checkpoint_keep_interval_updates=-1
|
| 46 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 0 ]]; then
|
| 47 |
+
echo "Config ${train_mode} ${config_option}"
|
| 48 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 49 |
+
task_load_clap_emb=false
|
| 50 |
+
task_load_source_file=true
|
| 51 |
+
task_load_mel_file=false
|
| 52 |
+
model_proj_type=null
|
| 53 |
+
model_clone_batch=4
|
| 54 |
+
dataset_batch_size=96
|
| 55 |
+
model_dispersive_loss=1
|
| 56 |
+
model_dispersive_loss_layer=0
|
| 57 |
+
checkpoint_keep_interval_updates=1
|
| 58 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 1 ]]; then
|
| 59 |
+
echo "Config ${train_mode} ${config_option}"
|
| 60 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 61 |
+
task_load_clap_emb=false
|
| 62 |
+
task_load_source_file=true
|
| 63 |
+
task_load_mel_file=false
|
| 64 |
+
model_proj_type=null
|
| 65 |
+
model_clone_batch=1
|
| 66 |
+
dataset_batch_size=384
|
| 67 |
+
model_dispersive_loss=1
|
| 68 |
+
model_dispersive_loss_layer=0
|
| 69 |
+
checkpoint_keep_interval_updates=1
|
| 70 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 2 ]]; then
|
| 71 |
+
echo "Config ${train_mode} ${config_option}"
|
| 72 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 73 |
+
task_load_clap_emb=false
|
| 74 |
+
task_load_source_file=true
|
| 75 |
+
task_load_mel_file=false
|
| 76 |
+
model_proj_type=null
|
| 77 |
+
model_clone_batch=1
|
| 78 |
+
dataset_batch_size=384
|
| 79 |
+
model_dispersive_loss=10.0
|
| 80 |
+
model_dispersive_loss_layer=0
|
| 81 |
+
checkpoint_keep_interval_updates=1
|
| 82 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 3 ]]; then
|
| 83 |
+
echo "Config ${train_mode} ${config_option}"
|
| 84 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 85 |
+
task_load_clap_emb=false
|
| 86 |
+
task_load_source_file=true
|
| 87 |
+
task_load_mel_file=false
|
| 88 |
+
model_proj_type=null
|
| 89 |
+
model_clone_batch=1
|
| 90 |
+
dataset_batch_size=384
|
| 91 |
+
model_dispersive_loss=100.0
|
| 92 |
+
model_dispersive_loss_layer=0
|
| 93 |
+
checkpoint_keep_interval_updates=1
|
| 94 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 4 ]]; then
|
| 95 |
+
echo "Config ${train_mode} ${config_option}"
|
| 96 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 97 |
+
task_load_clap_emb=false
|
| 98 |
+
task_load_source_file=true
|
| 99 |
+
task_load_mel_file=false
|
| 100 |
+
model_proj_type=null
|
| 101 |
+
model_clone_batch=1
|
| 102 |
+
dataset_batch_size=384
|
| 103 |
+
model_dispersive_loss=10000.0
|
| 104 |
+
model_dispersive_loss_layer=0
|
| 105 |
+
checkpoint_keep_interval_updates=1
|
| 106 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 0 ]]; then
|
| 107 |
+
echo "Config ${train_mode} ${config_option}"
|
| 108 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 109 |
+
task_load_clap_emb=true
|
| 110 |
+
model_proj_type=2
|
| 111 |
+
model_clone_batch=4
|
| 112 |
+
dataset_batch_size=48
|
| 113 |
+
model_clap_loss=1.0
|
| 114 |
+
average_top_k_layers=12
|
| 115 |
+
model_add_conv=false
|
| 116 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 1 ]]; then
|
| 117 |
+
echo "Config ${train_mode} ${config_option}"
|
| 118 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 119 |
+
task_load_clap_emb=true
|
| 120 |
+
model_proj_type=2
|
| 121 |
+
model_clone_batch=4
|
| 122 |
+
dataset_batch_size=48
|
| 123 |
+
model_clap_loss=1.0
|
| 124 |
+
average_top_k_layers=1
|
| 125 |
+
# loss type ablation
|
| 126 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 2 ]]; then
|
| 127 |
+
echo "Config ${train_mode} ${config_option}"
|
| 128 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 129 |
+
task_load_clap_emb=true
|
| 130 |
+
model_proj_type=2
|
| 131 |
+
model_clone_batch=4
|
| 132 |
+
dataset_batch_size=48
|
| 133 |
+
model_clap_loss=1.0
|
| 134 |
+
average_top_k_layers=12
|
| 135 |
+
model_clap_loss_type="ce"
|
| 136 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 3 ]]; then
|
| 137 |
+
echo "Config ${train_mode} ${config_option}"
|
| 138 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 139 |
+
task_load_clap_emb=true
|
| 140 |
+
model_proj_type=2
|
| 141 |
+
model_clone_batch=4
|
| 142 |
+
dataset_batch_size=48
|
| 143 |
+
model_clap_loss=1.0
|
| 144 |
+
average_top_k_layers=12
|
| 145 |
+
model_clap_loss_type="l1"
|
| 146 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 4 ]]; then
|
| 147 |
+
echo "Config ${train_mode} ${config_option}"
|
| 148 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 149 |
+
task_load_clap_emb=true
|
| 150 |
+
model_proj_type=2
|
| 151 |
+
model_clone_batch=4
|
| 152 |
+
dataset_batch_size=96
|
| 153 |
+
model_clap_loss=1.0
|
| 154 |
+
average_top_k_layers=12
|
| 155 |
+
model_clap_loss_type="cosine"
|
| 156 |
+
# loss layer ablation
|
| 157 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 5 ]]; then
|
| 158 |
+
echo "Config ${train_mode} ${config_option}"
|
| 159 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 160 |
+
task_load_clap_emb=true
|
| 161 |
+
model_proj_type=2
|
| 162 |
+
model_clone_batch=4
|
| 163 |
+
dataset_batch_size=96
|
| 164 |
+
model_clap_loss=1.0
|
| 165 |
+
average_top_k_layers=12
|
| 166 |
+
model_clap_loss_type="mse"
|
| 167 |
+
model_clap_loss_layer=10
|
| 168 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 6 ]]; then
|
| 169 |
+
echo "Config ${train_mode} ${config_option}"
|
| 170 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 171 |
+
task_load_clap_emb=true
|
| 172 |
+
task_load_source_file=true
|
| 173 |
+
task_load_mel_file=false
|
| 174 |
+
model_proj_type=2
|
| 175 |
+
model_clone_batch=4
|
| 176 |
+
dataset_batch_size=96
|
| 177 |
+
model_clap_loss=1.0
|
| 178 |
+
average_top_k_layers=12
|
| 179 |
+
model_clap_loss_type="mse"
|
| 180 |
+
model_clap_loss_layer=8
|
| 181 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 7 ]]; then
|
| 182 |
+
echo "Config ${train_mode} ${config_option}"
|
| 183 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 184 |
+
task_load_clap_emb=true
|
| 185 |
+
task_load_source_file=true
|
| 186 |
+
task_load_mel_file=false
|
| 187 |
+
model_proj_type=2
|
| 188 |
+
model_clone_batch=4
|
| 189 |
+
dataset_batch_size=96
|
| 190 |
+
model_clap_loss=1.0
|
| 191 |
+
average_top_k_layers=12
|
| 192 |
+
model_clap_loss_type="mse"
|
| 193 |
+
model_clap_loss_layer=6
|
| 194 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 8 ]]; then
|
| 195 |
+
echo "Config ${train_mode} ${config_option}"
|
| 196 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 197 |
+
task_load_clap_emb=true
|
| 198 |
+
task_load_source_file=true
|
| 199 |
+
task_load_mel_file=false
|
| 200 |
+
model_proj_type=2
|
| 201 |
+
model_clone_batch=4
|
| 202 |
+
model_clap_loss=5.0
|
| 203 |
+
dataset_batch_size=96
|
| 204 |
+
average_top_k_layers=12
|
| 205 |
+
model_clap_loss_type="mse"
|
| 206 |
+
checkpoint_keep_interval_updates=-1
|
| 207 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 9 ]]; then
|
| 208 |
+
echo "Config ${train_mode} ${config_option}"
|
| 209 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 210 |
+
task_load_clap_emb=true
|
| 211 |
+
task_load_source_file=true
|
| 212 |
+
task_load_mel_file=false
|
| 213 |
+
model_proj_type=2
|
| 214 |
+
model_clone_batch=4
|
| 215 |
+
model_clap_loss=0.1
|
| 216 |
+
dataset_batch_size=96
|
| 217 |
+
average_top_k_layers=12
|
| 218 |
+
model_clap_loss_type="mse"
|
| 219 |
+
checkpoint_keep_interval_updates=-1
|
| 220 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 0 ]]; then
|
| 221 |
+
echo "Config ${train_mode} ${config_option}"
|
| 222 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
|
| 223 |
+
task_load_clap_emb=true
|
| 224 |
+
model_proj_type=4
|
| 225 |
+
model_clone_batch=4
|
| 226 |
+
model_clap_loss=1.0
|
| 227 |
+
dataset_batch_size=48
|
| 228 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 1 ]]; then
|
| 229 |
+
echo "Config ${train_mode} ${config_option}"
|
| 230 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
|
| 231 |
+
task_load_clap_emb=true
|
| 232 |
+
model_proj_type=4
|
| 233 |
+
model_clone_batch=4
|
| 234 |
+
model_clap_loss=0.001
|
| 235 |
+
dataset_batch_size=48
|
| 236 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 2 ]]; then
|
| 237 |
+
echo "Config ${train_mode} ${config_option}"
|
| 238 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
|
| 239 |
+
task_load_clap_emb=true
|
| 240 |
+
model_proj_type=4
|
| 241 |
+
model_clone_batch=4
|
| 242 |
+
model_clap_loss=0.01
|
| 243 |
+
dataset_batch_size=48
|
| 244 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 3 ]]; then
|
| 245 |
+
echo "Config ${train_mode} ${config_option}"
|
| 246 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_out
|
| 247 |
+
task_load_clap_emb=true
|
| 248 |
+
model_proj_type=6
|
| 249 |
+
model_clone_batch=4
|
| 250 |
+
dataset_batch_size=48
|
| 251 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 0 ]]; then
|
| 252 |
+
echo "Config ${train_mode} ${config_option}"
|
| 253 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 254 |
+
task_load_clap_emb=true
|
| 255 |
+
task_load_source_file=true
|
| 256 |
+
task_load_mel_file=false
|
| 257 |
+
model_proj_type=2
|
| 258 |
+
model_clone_batch=4
|
| 259 |
+
dataset_batch_size=64 # original 48 oom on 4090 24G change distributed_world_size
|
| 260 |
+
model_clap_loss=1.0
|
| 261 |
+
average_top_k_layers=11 # modify with model depth
|
| 262 |
+
model_add_conv=true
|
| 263 |
+
model_depth=11 #
|
| 264 |
+
checkpoint_keep_interval_updates=-1 # default 1
|
| 265 |
+
checkpoint_save_interval_updates=10000
|
| 266 |
+
fi
|
| 267 |
+
|
| 268 |
+
python fairseq_cli/hydra_train.py -m \
|
| 269 |
+
--config-dir ./EAT/config \
|
| 270 |
+
--config-name pretraining_AS2M \
|
| 271 |
+
common.user_dir=./EAT \
|
| 272 |
+
checkpoint.save_dir=${checkpoint_save_dir} \
|
| 273 |
+
checkpoint.restore_file=${checkpoint_restore_file} \
|
| 274 |
+
distributed_training.distributed_world_size=${1:-2} \
|
| 275 |
+
dataset.num_workers=24 \
|
| 276 |
+
dataset.data_buffer_size=48 \
|
| 277 |
+
dataset.batch_size=${dataset_batch_size} \
|
| 278 |
+
task.data=${task_data} \
|
| 279 |
+
task.h5_format=False \
|
| 280 |
+
task.load_clap_emb=${task_load_clap_emb} \
|
| 281 |
+
+task.load_source_file=${task_load_source_file} \
|
| 282 |
+
+task.load_mel_file=${task_load_mel_file} \
|
| 283 |
+
model.proj_type=${model_proj_type} \
|
| 284 |
+
model.clone_batch=${model_clone_batch} \
|
| 285 |
+
model.clap_loss=${model_clap_loss} \
|
| 286 |
+
model.average_top_k_layers=${average_top_k_layers} \
|
| 287 |
+
+model.add_conv=${model_add_conv} \
|
| 288 |
+
+model.clap_loss_type=${model_clap_loss_type} \
|
| 289 |
+
+model.clap_loss_layer=${model_clap_loss_layer} \
|
| 290 |
+
+model.dispersive_loss=${model_dispersive_loss} \
|
| 291 |
+
+model.dispersive_loss_layer=${model_dispersive_loss_layer} \
|
| 292 |
+
model.depth=${model_depth} \
|
| 293 |
+
checkpoint.keep_interval_updates=${checkpoint_keep_interval_updates} \
|
| 294 |
+
checkpoint.save_interval_updates=${checkpoint_save_interval_updates}
|
pre_4_AS2M/disp_5_2025-09-28_06-51-25/pretraining_AS2M.sh
ADDED
|
@@ -0,0 +1,306 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# Launch EAT pre-training on AudioSet-2M through fairseq's Hydra entry point.
#
# Usage: pretraining_AS2M.sh [WORLD_SIZE]
#   WORLD_SIZE  value passed as distributed_training.distributed_world_size
#               (defaults to 2 when omitted).
#
# The experiment is selected by editing train_mode / config_option below.
# Each (train_mode, config_option) pair maps to one branch of the if/elif
# chain, which overrides the shared defaults before training is launched.

# config options
train_mode=disp
config_option=5
# change world size: pass it as the first positional argument
world_size="${1:-2}"

# shared config
SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
# A fresh, timestamped output directory is used for every launch.
checkpoint_save_dir=${SAVE_DIR_ROOT}/${train_mode}_${config_option}_$(date +"%Y-%m-%d_%H-%M-%S")
# NOTE(review): the save dir is new on every launch, so this file does not
# exist yet when the script starts; resuming an earlier run presumably
# requires pointing this at that run's directory by hand.
checkpoint_restore_file=${checkpoint_save_dir}/checkpoint_last.pt

# Absolute path and file name of this script (symlinks resolved).
script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
script_name="$(basename -- "$script_path")"
# Create the output directory and archive a copy of this script in it
# (preserving mode and timestamps) so the run's settings stay on record.
mkdir -p -- "$checkpoint_save_dir"
cp -p -- "$script_path" "$checkpoint_save_dir/$script_name"
echo "script_path: ${script_path}"
echo "checkpoint_save_dir: ${checkpoint_save_dir}"

# default setting
model_clone_batch=4
dataset_batch_size=48
model_clap_loss=0
model_clap_loss_type="mse" # option ce cosine l1
model_clap_loss_layer=0
average_top_k_layers=12
model_add_conv=false
model_depth=12
model_dispersive_loss=0
model_dispersive_loss_layer=0
checkpoint_keep_interval_updates=1 # TODO change this parameter if need
checkpoint_save_interval_updates=10000
# Deliberately left empty: several branches below never set these two, and
# an empty value means "do not pass the override at all" (see the launch
# section) instead of emitting "+task.load_source_file=" with no value.
task_load_source_file=
task_load_mel_file=

if [[ $train_mode == "default" && ${config_option} -eq 0 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=0
    checkpoint_keep_interval_updates=-1
elif [[ $train_mode == "disp" && ${config_option} -eq 0 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=4
    dataset_batch_size=96
    model_dispersive_loss=1
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
elif [[ $train_mode == "disp" && ${config_option} -eq 1 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=1
    dataset_batch_size=384
    model_dispersive_loss=1
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
elif [[ $train_mode == "disp" && ${config_option} -eq 2 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=1
    dataset_batch_size=384
    model_dispersive_loss=10.0
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
elif [[ $train_mode == "disp" && ${config_option} -eq 3 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=1
    dataset_batch_size=384
    model_dispersive_loss=100.0
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
elif [[ $train_mode == "disp" && ${config_option} -eq 4 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=1
    dataset_batch_size=384
    model_dispersive_loss=10000.0
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
elif [[ $train_mode == "disp" && ${config_option} -eq 5 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=1
    dataset_batch_size=384
    model_dispersive_loss=1000.0
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
elif [[ $train_mode == "clap" && ${config_option} -eq 0 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=12
    model_add_conv=false
elif [[ $train_mode == "clap" && ${config_option} -eq 1 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=1
# loss type ablation
elif [[ $train_mode == "clap" && ${config_option} -eq 2 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="ce"
elif [[ $train_mode == "clap" && ${config_option} -eq 3 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="l1"
elif [[ $train_mode == "clap" && ${config_option} -eq 4 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="cosine"
# loss layer ablation
elif [[ $train_mode == "clap" && ${config_option} -eq 5 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="mse"
    model_clap_loss_layer=10
elif [[ $train_mode == "clap" && ${config_option} -eq 6 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="mse"
    model_clap_loss_layer=8
elif [[ $train_mode == "clap" && ${config_option} -eq 7 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="mse"
    model_clap_loss_layer=6
elif [[ $train_mode == "clap" && ${config_option} -eq 8 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    model_clap_loss=5.0
    dataset_batch_size=96
    average_top_k_layers=12
    model_clap_loss_type="mse"
    checkpoint_keep_interval_updates=-1
elif [[ $train_mode == "clap" && ${config_option} -eq 9 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    model_clap_loss=0.1
    dataset_batch_size=96
    average_top_k_layers=12
    model_clap_loss_type="mse"
    checkpoint_keep_interval_updates=-1
elif [[ $train_mode == "ast" && ${config_option} -eq 0 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
    task_load_clap_emb=true
    model_proj_type=4
    model_clone_batch=4
    model_clap_loss=1.0
    dataset_batch_size=48
elif [[ $train_mode == "ast" && ${config_option} -eq 1 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
    task_load_clap_emb=true
    model_proj_type=4
    model_clone_batch=4
    model_clap_loss=0.001
    dataset_batch_size=48
elif [[ $train_mode == "ast" && ${config_option} -eq 2 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
    task_load_clap_emb=true
    model_proj_type=4
    model_clone_batch=4
    model_clap_loss=0.01
    dataset_batch_size=48
elif [[ $train_mode == "ast" && ${config_option} -eq 3 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_out
    task_load_clap_emb=true
    model_proj_type=6
    model_clone_batch=4
    dataset_batch_size=48
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 0 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=64 # original 48 oom on 4090 24G change distributed_world_size
    model_clap_loss=1.0
    average_top_k_layers=11 # modify with model depth
    model_add_conv=true
    model_depth=11 #
    checkpoint_keep_interval_updates=-1 # default 1
    checkpoint_save_interval_updates=10000
else
    # Without this guard an unknown selection would launch training with
    # task_data / task_load_clap_emb / model_proj_type unset.
    echo "Unknown config: train_mode=${train_mode} config_option=${config_option}" >&2
    exit 1
fi

# Assemble the Hydra overrides in an array so every value is passed as a
# single, safely quoted argument and optional overrides can be skipped.
hydra_overrides=(
    "common.user_dir=./EAT"
    "checkpoint.save_dir=${checkpoint_save_dir}"
    "checkpoint.restore_file=${checkpoint_restore_file}"
    "distributed_training.distributed_world_size=${world_size}"
    "dataset.num_workers=24"
    "dataset.data_buffer_size=48"
    "dataset.batch_size=${dataset_batch_size}"
    "task.data=${task_data}"
    "task.h5_format=False"
    "task.load_clap_emb=${task_load_clap_emb}"
)
# Only forward the file-loading switches when the selected branch set them.
if [[ -n "${task_load_source_file}" ]]; then
    hydra_overrides+=("+task.load_source_file=${task_load_source_file}")
fi
if [[ -n "${task_load_mel_file}" ]]; then
    hydra_overrides+=("+task.load_mel_file=${task_load_mel_file}")
fi
hydra_overrides+=(
    "model.proj_type=${model_proj_type}"
    "model.clone_batch=${model_clone_batch}"
    "model.clap_loss=${model_clap_loss}"
    "model.average_top_k_layers=${average_top_k_layers}"
    "+model.add_conv=${model_add_conv}"
    "+model.clap_loss_type=${model_clap_loss_type}"
    "+model.clap_loss_layer=${model_clap_loss_layer}"
    "+model.dispersive_loss=${model_dispersive_loss}"
    "+model.dispersive_loss_layer=${model_dispersive_loss_layer}"
    "model.depth=${model_depth}"
    "checkpoint.keep_interval_updates=${checkpoint_keep_interval_updates}"
    "checkpoint.save_interval_updates=${checkpoint_save_interval_updates}"
)

python fairseq_cli/hydra_train.py -m \
    --config-dir ./EAT/config \
    --config-name pretraining_AS2M \
    "${hydra_overrides[@]}"
|
pre_4_AS2M/disp_5_2025-09-28_07-56-38/pretraining_AS2M.sh
ADDED
|
@@ -0,0 +1,318 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
# config options
|
| 3 |
+
train_mode=disp
|
| 4 |
+
config_option=5
|
| 5 |
+
# change world size
|
| 6 |
+
|
| 7 |
+
# shared config
|
| 8 |
+
SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
|
| 9 |
+
checkpoint_save_dir=${SAVE_DIR_ROOT}/${train_mode}_${config_option}_$(date +"%Y-%m-%d_%H-%M-%S")
|
| 10 |
+
checkpoint_restore_file=${checkpoint_save_dir}/checkpoint_last.pt
|
| 11 |
+
|
| 12 |
+
# 脚本自身的绝对路径与文件名(解析符号链接)
|
| 13 |
+
script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
|
| 14 |
+
script_name="$(basename -- "$script_path")"
|
| 15 |
+
# 创建目录并拷贝(保留权限与时间戳)
|
| 16 |
+
mkdir -p -- "$checkpoint_save_dir"
|
| 17 |
+
cp -p -- "$script_path" "$checkpoint_save_dir/$script_name"
|
| 18 |
+
echo "script_path: ${script_path}"
|
| 19 |
+
echo "checkpoint_save_dir: ${checkpoint_save_dir}"
|
| 20 |
+
|
| 21 |
+
# default setting
|
| 22 |
+
model_clone_batch=4
|
| 23 |
+
dataset_batch_size=48
|
| 24 |
+
model_clap_loss=0
|
| 25 |
+
model_clap_loss_type="mse" # option ce cosine l1
|
| 26 |
+
model_clap_loss_layer=0
|
| 27 |
+
average_top_k_layers=12
|
| 28 |
+
model_add_conv=false
|
| 29 |
+
model_depth=12
|
| 30 |
+
model_dispersive_loss=0
|
| 31 |
+
model_dispersive_loss_layer=0
|
| 32 |
+
checkpoint_keep_interval_updates=1 # TODO change this parameter if need
|
| 33 |
+
checkpoint_save_interval_updates=10000
|
| 34 |
+
|
| 35 |
+
if [[ $train_mode == "default" && ${config_option} -eq 0 ]]; then
|
| 36 |
+
echo "Config ${train_mode} ${config_option}"
|
| 37 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 38 |
+
task_load_clap_emb=false
|
| 39 |
+
task_load_source_file=true
|
| 40 |
+
task_load_mel_file=false
|
| 41 |
+
model_proj_type=null
|
| 42 |
+
model_clone_batch=4
|
| 43 |
+
dataset_batch_size=96
|
| 44 |
+
model_clap_loss=0
|
| 45 |
+
checkpoint_keep_interval_updates=-1
|
| 46 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 0 ]]; then
|
| 47 |
+
echo "Config ${train_mode} ${config_option}"
|
| 48 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 49 |
+
task_load_clap_emb=false
|
| 50 |
+
task_load_source_file=true
|
| 51 |
+
task_load_mel_file=false
|
| 52 |
+
model_proj_type=null
|
| 53 |
+
model_clone_batch=4
|
| 54 |
+
dataset_batch_size=96
|
| 55 |
+
model_dispersive_loss=1
|
| 56 |
+
model_dispersive_loss_layer=0
|
| 57 |
+
checkpoint_keep_interval_updates=1
|
| 58 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 1 ]]; then
|
| 59 |
+
echo "Config ${train_mode} ${config_option}"
|
| 60 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 61 |
+
task_load_clap_emb=false
|
| 62 |
+
task_load_source_file=true
|
| 63 |
+
task_load_mel_file=false
|
| 64 |
+
model_proj_type=null
|
| 65 |
+
model_clone_batch=1
|
| 66 |
+
dataset_batch_size=384
|
| 67 |
+
model_dispersive_loss=1
|
| 68 |
+
model_dispersive_loss_layer=0
|
| 69 |
+
checkpoint_keep_interval_updates=1
|
| 70 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 2 ]]; then
|
| 71 |
+
echo "Config ${train_mode} ${config_option}"
|
| 72 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 73 |
+
task_load_clap_emb=false
|
| 74 |
+
task_load_source_file=true
|
| 75 |
+
task_load_mel_file=false
|
| 76 |
+
model_proj_type=null
|
| 77 |
+
model_clone_batch=1
|
| 78 |
+
dataset_batch_size=384
|
| 79 |
+
model_dispersive_loss=10.0
|
| 80 |
+
model_dispersive_loss_layer=0
|
| 81 |
+
checkpoint_keep_interval_updates=1
|
| 82 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 3 ]]; then
|
| 83 |
+
echo "Config ${train_mode} ${config_option}"
|
| 84 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 85 |
+
task_load_clap_emb=false
|
| 86 |
+
task_load_source_file=true
|
| 87 |
+
task_load_mel_file=false
|
| 88 |
+
model_proj_type=null
|
| 89 |
+
model_clone_batch=1
|
| 90 |
+
dataset_batch_size=384
|
| 91 |
+
model_dispersive_loss=100.0
|
| 92 |
+
model_dispersive_loss_layer=0
|
| 93 |
+
checkpoint_keep_interval_updates=1
|
| 94 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 4 ]]; then
|
| 95 |
+
echo "Config ${train_mode} ${config_option}"
|
| 96 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 97 |
+
task_load_clap_emb=false
|
| 98 |
+
task_load_source_file=true
|
| 99 |
+
task_load_mel_file=false
|
| 100 |
+
model_proj_type=null
|
| 101 |
+
model_clone_batch=1
|
| 102 |
+
dataset_batch_size=384
|
| 103 |
+
model_dispersive_loss=10000.0
|
| 104 |
+
model_dispersive_loss_layer=0
|
| 105 |
+
checkpoint_keep_interval_updates=1
|
| 106 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 5 ]]; then
|
| 107 |
+
echo "Config ${train_mode} ${config_option}"
|
| 108 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 109 |
+
task_load_clap_emb=false
|
| 110 |
+
task_load_source_file=true
|
| 111 |
+
task_load_mel_file=false
|
| 112 |
+
model_proj_type=null
|
| 113 |
+
model_clone_batch=1
|
| 114 |
+
dataset_batch_size=384
|
| 115 |
+
model_dispersive_loss=1000.0
|
| 116 |
+
model_dispersive_loss_layer=0
|
| 117 |
+
checkpoint_keep_interval_updates=1
|
| 118 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 6 ]]; then
|
| 119 |
+
echo "Config ${train_mode} ${config_option}"
|
| 120 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 121 |
+
task_load_clap_emb=false
|
| 122 |
+
task_load_source_file=true
|
| 123 |
+
task_load_mel_file=false
|
| 124 |
+
model_proj_type=null
|
| 125 |
+
model_clone_batch=4
|
| 126 |
+
dataset_batch_size=96
|
| 127 |
+
model_dispersive_loss=1000.0
|
| 128 |
+
model_dispersive_loss_layer=10
|
| 129 |
+
checkpoint_keep_interval_updates=1
|
| 130 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 0 ]]; then
|
| 131 |
+
echo "Config ${train_mode} ${config_option}"
|
| 132 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 133 |
+
task_load_clap_emb=true
|
| 134 |
+
model_proj_type=2
|
| 135 |
+
model_clone_batch=4
|
| 136 |
+
dataset_batch_size=48
|
| 137 |
+
model_clap_loss=1.0
|
| 138 |
+
average_top_k_layers=12
|
| 139 |
+
model_add_conv=false
|
| 140 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 1 ]]; then
|
| 141 |
+
echo "Config ${train_mode} ${config_option}"
|
| 142 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 143 |
+
task_load_clap_emb=true
|
| 144 |
+
model_proj_type=2
|
| 145 |
+
model_clone_batch=4
|
| 146 |
+
dataset_batch_size=48
|
| 147 |
+
model_clap_loss=1.0
|
| 148 |
+
average_top_k_layers=1
|
| 149 |
+
# loss type ablation
|
| 150 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 2 ]]; then
|
| 151 |
+
echo "Config ${train_mode} ${config_option}"
|
| 152 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 153 |
+
task_load_clap_emb=true
|
| 154 |
+
model_proj_type=2
|
| 155 |
+
model_clone_batch=4
|
| 156 |
+
dataset_batch_size=48
|
| 157 |
+
model_clap_loss=1.0
|
| 158 |
+
average_top_k_layers=12
|
| 159 |
+
model_clap_loss_type="ce"
|
| 160 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 3 ]]; then
|
| 161 |
+
echo "Config ${train_mode} ${config_option}"
|
| 162 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 163 |
+
task_load_clap_emb=true
|
| 164 |
+
model_proj_type=2
|
| 165 |
+
model_clone_batch=4
|
| 166 |
+
dataset_batch_size=48
|
| 167 |
+
model_clap_loss=1.0
|
| 168 |
+
average_top_k_layers=12
|
| 169 |
+
model_clap_loss_type="l1"
|
| 170 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 4 ]]; then
|
| 171 |
+
echo "Config ${train_mode} ${config_option}"
|
| 172 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 173 |
+
task_load_clap_emb=true
|
| 174 |
+
model_proj_type=2
|
| 175 |
+
model_clone_batch=4
|
| 176 |
+
dataset_batch_size=96
|
| 177 |
+
model_clap_loss=1.0
|
| 178 |
+
average_top_k_layers=12
|
| 179 |
+
model_clap_loss_type="cosine"
|
| 180 |
+
# loss layer ablation
|
| 181 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 5 ]]; then
|
| 182 |
+
echo "Config ${train_mode} ${config_option}"
|
| 183 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 184 |
+
task_load_clap_emb=true
|
| 185 |
+
model_proj_type=2
|
| 186 |
+
model_clone_batch=4
|
| 187 |
+
dataset_batch_size=96
|
| 188 |
+
model_clap_loss=1.0
|
| 189 |
+
average_top_k_layers=12
|
| 190 |
+
model_clap_loss_type="mse"
|
| 191 |
+
model_clap_loss_layer=10
|
| 192 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 6 ]]; then
|
| 193 |
+
echo "Config ${train_mode} ${config_option}"
|
| 194 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 195 |
+
task_load_clap_emb=true
|
| 196 |
+
task_load_source_file=true
|
| 197 |
+
task_load_mel_file=false
|
| 198 |
+
model_proj_type=2
|
| 199 |
+
model_clone_batch=4
|
| 200 |
+
dataset_batch_size=96
|
| 201 |
+
model_clap_loss=1.0
|
| 202 |
+
average_top_k_layers=12
|
| 203 |
+
model_clap_loss_type="mse"
|
| 204 |
+
model_clap_loss_layer=8
|
| 205 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 7 ]]; then
|
| 206 |
+
echo "Config ${train_mode} ${config_option}"
|
| 207 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 208 |
+
task_load_clap_emb=true
|
| 209 |
+
task_load_source_file=true
|
| 210 |
+
task_load_mel_file=false
|
| 211 |
+
model_proj_type=2
|
| 212 |
+
model_clone_batch=4
|
| 213 |
+
dataset_batch_size=96
|
| 214 |
+
model_clap_loss=1.0
|
| 215 |
+
average_top_k_layers=12
|
| 216 |
+
model_clap_loss_type="mse"
|
| 217 |
+
model_clap_loss_layer=6
|
| 218 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 8 ]]; then
|
| 219 |
+
echo "Config ${train_mode} ${config_option}"
|
| 220 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 221 |
+
task_load_clap_emb=true
|
| 222 |
+
task_load_source_file=true
|
| 223 |
+
task_load_mel_file=false
|
| 224 |
+
model_proj_type=2
|
| 225 |
+
model_clone_batch=4
|
| 226 |
+
model_clap_loss=5.0
|
| 227 |
+
dataset_batch_size=96
|
| 228 |
+
average_top_k_layers=12
|
| 229 |
+
model_clap_loss_type="mse"
|
| 230 |
+
checkpoint_keep_interval_updates=-1
|
| 231 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 9 ]]; then
|
| 232 |
+
echo "Config ${train_mode} ${config_option}"
|
| 233 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 234 |
+
task_load_clap_emb=true
|
| 235 |
+
task_load_source_file=true
|
| 236 |
+
task_load_mel_file=false
|
| 237 |
+
model_proj_type=2
|
| 238 |
+
model_clone_batch=4
|
| 239 |
+
model_clap_loss=0.1
|
| 240 |
+
dataset_batch_size=96
|
| 241 |
+
average_top_k_layers=12
|
| 242 |
+
model_clap_loss_type="mse"
|
| 243 |
+
checkpoint_keep_interval_updates=-1
|
| 244 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 0 ]]; then
|
| 245 |
+
echo "Config ${train_mode} ${config_option}"
|
| 246 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
|
| 247 |
+
task_load_clap_emb=true
|
| 248 |
+
model_proj_type=4
|
| 249 |
+
model_clone_batch=4
|
| 250 |
+
model_clap_loss=1.0
|
| 251 |
+
dataset_batch_size=48
|
| 252 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 1 ]]; then
|
| 253 |
+
echo "Config ${train_mode} ${config_option}"
|
| 254 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
|
| 255 |
+
task_load_clap_emb=true
|
| 256 |
+
model_proj_type=4
|
| 257 |
+
model_clone_batch=4
|
| 258 |
+
model_clap_loss=0.001
|
| 259 |
+
dataset_batch_size=48
|
| 260 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 2 ]]; then
|
| 261 |
+
echo "Config ${train_mode} ${config_option}"
|
| 262 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
|
| 263 |
+
task_load_clap_emb=true
|
| 264 |
+
model_proj_type=4
|
| 265 |
+
model_clone_batch=4
|
| 266 |
+
model_clap_loss=0.01
|
| 267 |
+
dataset_batch_size=48
|
| 268 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 3 ]]; then
|
| 269 |
+
echo "Config ${train_mode} ${config_option}"
|
| 270 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_out
|
| 271 |
+
task_load_clap_emb=true
|
| 272 |
+
model_proj_type=6
|
| 273 |
+
model_clone_batch=4
|
| 274 |
+
dataset_batch_size=48
|
| 275 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 0 ]]; then
|
| 276 |
+
echo "Config ${train_mode} ${config_option}"
|
| 277 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 278 |
+
task_load_clap_emb=true
|
| 279 |
+
task_load_source_file=true
|
| 280 |
+
task_load_mel_file=false
|
| 281 |
+
model_proj_type=2
|
| 282 |
+
model_clone_batch=4
|
| 283 |
+
dataset_batch_size=64 # original 48 oom on 4090 24G change distributed_world_size
|
| 284 |
+
model_clap_loss=1.0
|
| 285 |
+
average_top_k_layers=11 # modify with model depth
|
| 286 |
+
model_add_conv=true
|
| 287 |
+
model_depth=11 #
|
| 288 |
+
checkpoint_keep_interval_updates=-1 # default 1
|
| 289 |
+
checkpoint_save_interval_updates=10000
|
| 290 |
+
fi
|
| 291 |
+
|
| 292 |
+
# Launch the fairseq Hydra trainer with the EAT `pretraining_AS2M` config,
# forwarding the shell variables selected above as command-line overrides.
#   * The first positional argument of this script is the distributed world
#     size (defaults to 2 when omitted).
#   * Overrides prefixed with '+' use Hydra's "append" syntax, i.e. they add a
#     key that is not already present in the composed config.
# Every expansion is quoted so that a value containing whitespace or glob
# characters is still passed as a single, literal override.
python fairseq_cli/hydra_train.py -m \
    --config-dir ./EAT/config \
    --config-name pretraining_AS2M \
    common.user_dir=./EAT \
    checkpoint.save_dir="${checkpoint_save_dir}" \
    checkpoint.restore_file="${checkpoint_restore_file}" \
    distributed_training.distributed_world_size="${1:-2}" \
    dataset.num_workers=24 \
    dataset.data_buffer_size=48 \
    dataset.batch_size="${dataset_batch_size}" \
    task.data="${task_data}" \
    task.h5_format=False \
    task.load_clap_emb="${task_load_clap_emb}" \
    +task.load_source_file="${task_load_source_file}" \
    +task.load_mel_file="${task_load_mel_file}" \
    model.proj_type="${model_proj_type}" \
    model.clone_batch="${model_clone_batch}" \
    model.clap_loss="${model_clap_loss}" \
    model.average_top_k_layers="${average_top_k_layers}" \
    +model.add_conv="${model_add_conv}" \
    +model.clap_loss_type="${model_clap_loss_type}" \
    +model.clap_loss_layer="${model_clap_loss_layer}" \
    +model.dispersive_loss="${model_dispersive_loss}" \
    +model.dispersive_loss_layer="${model_dispersive_loss_layer}" \
    model.depth="${model_depth}" \
    checkpoint.keep_interval_updates="${checkpoint_keep_interval_updates}" \
    checkpoint.save_interval_updates="${checkpoint_save_interval_updates}"
|
pre_4_AS2M/disp_6_2025-09-28_08-28-48/pretraining_AS2M.sh
ADDED
|
@@ -0,0 +1,318 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# Launcher for EAT pre-training on AS2M.
#
# Usage: pretraining_AS2M.sh [WORLD_SIZE]
#   WORLD_SIZE  forwarded to the trainer as
#               distributed_training.distributed_world_size (default: 2).

# Experiment selector: the (train_mode, config_option) pair picks exactly one
# branch of the if/elif configuration chain further down in this script.
train_mode=disp
config_option=6
# The world size is not configured here; pass it as the first positional
# argument when invoking this script.

# shared config
SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
# Each invocation writes into its own directory, named after the selected
# config and the launch time.
checkpoint_save_dir="${SAVE_DIR_ROOT}/${train_mode}_${config_option}_$(date +"%Y-%m-%d_%H-%M-%S")"
# NOTE(review): the directory name embeds the launch time, so this file does
# not exist yet on a fresh launch -- confirm that this matches the intended
# resume behaviour.
checkpoint_restore_file="${checkpoint_save_dir}/checkpoint_last.pt"

# Absolute path and file name of this script (symlinks resolved).
script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
script_name="$(basename -- "$script_path")"

# Create the output directory and snapshot this script into it
# (cp -p preserves mode and timestamps) so the run can be reproduced later.
if ! mkdir -p -- "$checkpoint_save_dir"; then
    # Without the output directory no checkpoint can be written; stop early
    # instead of launching the trainer.
    echo "error: cannot create checkpoint_save_dir: ${checkpoint_save_dir}" >&2
    exit 1
fi
# The snapshot is a convenience only, so a failed copy is reported but not fatal.
cp -p -- "$script_path" "$checkpoint_save_dir/$script_name" \
    || echo "warning: could not copy ${script_path} into ${checkpoint_save_dir}" >&2
echo "script_path: ${script_path}"
echo "checkpoint_save_dir: ${checkpoint_save_dir}"
|
| 20 |
+
|
| 21 |
+
# Default settings. The selected config branch below overrides a subset of
# these; every variable is later forwarded to the trainer as a command-line
# override.
model_clone_batch=4
dataset_batch_size=48
model_clap_loss=0
model_clap_loss_type="mse" # other options: ce, cosine, l1
model_clap_loss_layer=0
average_top_k_layers=12
model_add_conv=false
model_depth=12
model_dispersive_loss=0
model_dispersive_loss_layer=0
checkpoint_keep_interval_updates=1 # TODO: change this parameter if needed
checkpoint_save_interval_updates=10000
# The two loader flags below are always passed to the trainer, but several
# config branches (clap 0-5, ast 0-3) never assign them, which used to produce
# empty overrides. Default them to the values used by every branch that does
# assign them.
# NOTE(review): confirm these are the intended values for the older branches.
task_load_source_file=true
task_load_mel_file=false
|
| 34 |
+
|
| 35 |
+
# Select the experiment configuration.
# Exactly one branch matches the (train_mode, config_option) pair chosen at the
# top of the script; it overrides the defaults set above. An unknown pair is a
# hard error, so the trainer is never launched with empty overrides.
if [[ $train_mode == "default" && ${config_option} -eq 0 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=0
    checkpoint_keep_interval_updates=-1

# --- dispersive-loss runs ---
elif [[ $train_mode == "disp" && ${config_option} -eq 0 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=4
    dataset_batch_size=96
    model_dispersive_loss=1
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
elif [[ $train_mode == "disp" && ${config_option} -eq 1 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=1
    dataset_batch_size=384
    model_dispersive_loss=1
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
elif [[ $train_mode == "disp" && ${config_option} -eq 2 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=1
    dataset_batch_size=384
    model_dispersive_loss=10.0
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
elif [[ $train_mode == "disp" && ${config_option} -eq 3 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=1
    dataset_batch_size=384
    model_dispersive_loss=100.0
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
elif [[ $train_mode == "disp" && ${config_option} -eq 4 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=1
    dataset_batch_size=384
    model_dispersive_loss=10000.0
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
elif [[ $train_mode == "disp" && ${config_option} -eq 5 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=1
    dataset_batch_size=384
    model_dispersive_loss=1000.0
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
elif [[ $train_mode == "disp" && ${config_option} -eq 6 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=4
    dataset_batch_size=96
    model_dispersive_loss=1000.0
    model_dispersive_loss_layer=10
    checkpoint_keep_interval_updates=1

# --- CLAP-embedding runs ---
elif [[ $train_mode == "clap" && ${config_option} -eq 0 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=12
    model_add_conv=false
elif [[ $train_mode == "clap" && ${config_option} -eq 1 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=1
# loss type ablation
elif [[ $train_mode == "clap" && ${config_option} -eq 2 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="ce"
elif [[ $train_mode == "clap" && ${config_option} -eq 3 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="l1"
elif [[ $train_mode == "clap" && ${config_option} -eq 4 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="cosine"
# loss layer ablation
elif [[ $train_mode == "clap" && ${config_option} -eq 5 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="mse"
    model_clap_loss_layer=10
elif [[ $train_mode == "clap" && ${config_option} -eq 6 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="mse"
    model_clap_loss_layer=8
elif [[ $train_mode == "clap" && ${config_option} -eq 7 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="mse"
    model_clap_loss_layer=6
elif [[ $train_mode == "clap" && ${config_option} -eq 8 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    model_clap_loss=5.0
    dataset_batch_size=96
    average_top_k_layers=12
    model_clap_loss_type="mse"
    checkpoint_keep_interval_updates=-1
elif [[ $train_mode == "clap" && ${config_option} -eq 9 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    model_clap_loss=0.1
    dataset_batch_size=96
    average_top_k_layers=12
    model_clap_loss_type="mse"
    checkpoint_keep_interval_updates=-1

# --- AST-embedding runs ---
elif [[ $train_mode == "ast" && ${config_option} -eq 0 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
    task_load_clap_emb=true
    model_proj_type=4
    model_clone_batch=4
    model_clap_loss=1.0
    dataset_batch_size=48
elif [[ $train_mode == "ast" && ${config_option} -eq 1 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
    task_load_clap_emb=true
    model_proj_type=4
    model_clone_batch=4
    model_clap_loss=0.001
    dataset_batch_size=48
elif [[ $train_mode == "ast" && ${config_option} -eq 2 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
    task_load_clap_emb=true
    model_proj_type=4
    model_clone_batch=4
    model_clap_loss=0.01
    dataset_batch_size=48
elif [[ $train_mode == "ast" && ${config_option} -eq 3 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_out
    task_load_clap_emb=true
    model_proj_type=6
    model_clone_batch=4
    dataset_batch_size=48

# --- CLAP run with the extra conv option enabled ---
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 0 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=64 # original 48 oom on 4090 24G; change distributed_world_size
    model_clap_loss=1.0
    average_top_k_layers=11 # keep in sync with model_depth
    model_add_conv=true
    model_depth=11
    checkpoint_keep_interval_updates=-1 # default 1
    checkpoint_save_interval_updates=10000
else
    # No branch matched: task_data, task_load_clap_emb and model_proj_type would
    # be empty, so refuse to continue instead of launching a broken run.
    echo "error: no config defined for train_mode='${train_mode}' config_option='${config_option}'" >&2
    exit 1
fi
|
| 291 |
+
|
| 292 |
+
# Launch the fairseq Hydra trainer with the EAT `pretraining_AS2M` config,
# forwarding the shell variables selected above as command-line overrides.
#   * The first positional argument of this script is the distributed world
#     size (defaults to 2 when omitted).
#   * Overrides prefixed with '+' use Hydra's "append" syntax, i.e. they add a
#     key that is not already present in the composed config.
# Every expansion is quoted so that a value containing whitespace or glob
# characters is still passed as a single, literal override.
python fairseq_cli/hydra_train.py -m \
    --config-dir ./EAT/config \
    --config-name pretraining_AS2M \
    common.user_dir=./EAT \
    checkpoint.save_dir="${checkpoint_save_dir}" \
    checkpoint.restore_file="${checkpoint_restore_file}" \
    distributed_training.distributed_world_size="${1:-2}" \
    dataset.num_workers=24 \
    dataset.data_buffer_size=48 \
    dataset.batch_size="${dataset_batch_size}" \
    task.data="${task_data}" \
    task.h5_format=False \
    task.load_clap_emb="${task_load_clap_emb}" \
    +task.load_source_file="${task_load_source_file}" \
    +task.load_mel_file="${task_load_mel_file}" \
    model.proj_type="${model_proj_type}" \
    model.clone_batch="${model_clone_batch}" \
    model.clap_loss="${model_clap_loss}" \
    model.average_top_k_layers="${average_top_k_layers}" \
    +model.add_conv="${model_add_conv}" \
    +model.clap_loss_type="${model_clap_loss_type}" \
    +model.clap_loss_layer="${model_clap_loss_layer}" \
    +model.dispersive_loss="${model_dispersive_loss}" \
    +model.dispersive_loss_layer="${model_dispersive_loss_layer}" \
    model.depth="${model_depth}" \
    checkpoint.keep_interval_updates="${checkpoint_keep_interval_updates}" \
    checkpoint.save_interval_updates="${checkpoint_save_interval_updates}"
|
pre_4_AS2M/disp_6_2025-09-28_08-49-54/pretraining_AS2M.sh
ADDED
|
@@ -0,0 +1,318 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# Launcher for EAT pre-training on AS2M.
#
# Usage: pretraining_AS2M.sh [WORLD_SIZE]
#   WORLD_SIZE  presumably forwarded to the trainer as the distributed world
#               size -- the launch command is not shown in this excerpt.

# Experiment selector: the (train_mode, config_option) pair picks exactly one
# branch of the if/elif configuration chain further down in this script.
train_mode=disp
config_option=6
# The world size is not configured in this section of the script.

# shared config
SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
# Each invocation writes into its own directory, named after the selected
# config and the launch time.
checkpoint_save_dir="${SAVE_DIR_ROOT}/${train_mode}_${config_option}_$(date +"%Y-%m-%d_%H-%M-%S")"
# NOTE(review): the directory name embeds the launch time, so this file does
# not exist yet on a fresh launch -- confirm that this matches the intended
# resume behaviour.
checkpoint_restore_file="${checkpoint_save_dir}/checkpoint_last.pt"

# Absolute path and file name of this script (symlinks resolved).
script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
script_name="$(basename -- "$script_path")"

# Create the output directory and snapshot this script into it
# (cp -p preserves mode and timestamps) so the run can be reproduced later.
if ! mkdir -p -- "$checkpoint_save_dir"; then
    # Without the output directory no checkpoint can be written; stop early
    # instead of continuing with the rest of the script.
    echo "error: cannot create checkpoint_save_dir: ${checkpoint_save_dir}" >&2
    exit 1
fi
# The snapshot is a convenience only, so a failed copy is reported but not fatal.
cp -p -- "$script_path" "$checkpoint_save_dir/$script_name" \
    || echo "warning: could not copy ${script_path} into ${checkpoint_save_dir}" >&2
echo "script_path: ${script_path}"
echo "checkpoint_save_dir: ${checkpoint_save_dir}"
|
| 20 |
+
|
| 21 |
+
# Default settings. The selected config branch below overrides a subset of
# these values.
model_clone_batch=4
dataset_batch_size=48
model_clap_loss=0
model_clap_loss_type="mse" # other options: ce, cosine, l1
model_clap_loss_layer=0
average_top_k_layers=12
model_add_conv=false
model_depth=12
model_dispersive_loss=0
model_dispersive_loss_layer=0
checkpoint_keep_interval_updates=1 # TODO: change this parameter if needed
checkpoint_save_interval_updates=10000
# Several config branches below (e.g. clap 0-4) never assign the two loader
# flags, which would leave them empty. Default them to the values used by
# every branch that does assign them.
# NOTE(review): confirm these are the intended values for the older branches.
task_load_source_file=true
task_load_mel_file=false
|
| 34 |
+
|
| 35 |
+
if [[ $train_mode == "default" && ${config_option} -eq 0 ]]; then
|
| 36 |
+
echo "Config ${train_mode} ${config_option}"
|
| 37 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 38 |
+
task_load_clap_emb=false
|
| 39 |
+
task_load_source_file=true
|
| 40 |
+
task_load_mel_file=false
|
| 41 |
+
model_proj_type=null
|
| 42 |
+
model_clone_batch=4
|
| 43 |
+
dataset_batch_size=96
|
| 44 |
+
model_clap_loss=0
|
| 45 |
+
checkpoint_keep_interval_updates=-1
|
| 46 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 0 ]]; then
|
| 47 |
+
echo "Config ${train_mode} ${config_option}"
|
| 48 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 49 |
+
task_load_clap_emb=false
|
| 50 |
+
task_load_source_file=true
|
| 51 |
+
task_load_mel_file=false
|
| 52 |
+
model_proj_type=null
|
| 53 |
+
model_clone_batch=4
|
| 54 |
+
dataset_batch_size=96
|
| 55 |
+
model_dispersive_loss=1
|
| 56 |
+
model_dispersive_loss_layer=0
|
| 57 |
+
checkpoint_keep_interval_updates=1
|
| 58 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 1 ]]; then
|
| 59 |
+
echo "Config ${train_mode} ${config_option}"
|
| 60 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 61 |
+
task_load_clap_emb=false
|
| 62 |
+
task_load_source_file=true
|
| 63 |
+
task_load_mel_file=false
|
| 64 |
+
model_proj_type=null
|
| 65 |
+
model_clone_batch=1
|
| 66 |
+
dataset_batch_size=384
|
| 67 |
+
model_dispersive_loss=1
|
| 68 |
+
model_dispersive_loss_layer=0
|
| 69 |
+
checkpoint_keep_interval_updates=1
|
| 70 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 2 ]]; then
|
| 71 |
+
echo "Config ${train_mode} ${config_option}"
|
| 72 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 73 |
+
task_load_clap_emb=false
|
| 74 |
+
task_load_source_file=true
|
| 75 |
+
task_load_mel_file=false
|
| 76 |
+
model_proj_type=null
|
| 77 |
+
model_clone_batch=1
|
| 78 |
+
dataset_batch_size=384
|
| 79 |
+
model_dispersive_loss=10.0
|
| 80 |
+
model_dispersive_loss_layer=0
|
| 81 |
+
checkpoint_keep_interval_updates=1
|
| 82 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 3 ]]; then
|
| 83 |
+
echo "Config ${train_mode} ${config_option}"
|
| 84 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 85 |
+
task_load_clap_emb=false
|
| 86 |
+
task_load_source_file=true
|
| 87 |
+
task_load_mel_file=false
|
| 88 |
+
model_proj_type=null
|
| 89 |
+
model_clone_batch=1
|
| 90 |
+
dataset_batch_size=384
|
| 91 |
+
model_dispersive_loss=100.0
|
| 92 |
+
model_dispersive_loss_layer=0
|
| 93 |
+
checkpoint_keep_interval_updates=1
|
| 94 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 4 ]]; then
|
| 95 |
+
echo "Config ${train_mode} ${config_option}"
|
| 96 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 97 |
+
task_load_clap_emb=false
|
| 98 |
+
task_load_source_file=true
|
| 99 |
+
task_load_mel_file=false
|
| 100 |
+
model_proj_type=null
|
| 101 |
+
model_clone_batch=1
|
| 102 |
+
dataset_batch_size=384
|
| 103 |
+
model_dispersive_loss=10000.0
|
| 104 |
+
model_dispersive_loss_layer=0
|
| 105 |
+
checkpoint_keep_interval_updates=1
|
| 106 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 5 ]]; then
|
| 107 |
+
echo "Config ${train_mode} ${config_option}"
|
| 108 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 109 |
+
task_load_clap_emb=false
|
| 110 |
+
task_load_source_file=true
|
| 111 |
+
task_load_mel_file=false
|
| 112 |
+
model_proj_type=null
|
| 113 |
+
model_clone_batch=1
|
| 114 |
+
dataset_batch_size=384
|
| 115 |
+
model_dispersive_loss=1000.0
|
| 116 |
+
model_dispersive_loss_layer=0
|
| 117 |
+
checkpoint_keep_interval_updates=1
|
| 118 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 6 ]]; then
|
| 119 |
+
echo "Config ${train_mode} ${config_option}"
|
| 120 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 121 |
+
task_load_clap_emb=false
|
| 122 |
+
task_load_source_file=true
|
| 123 |
+
task_load_mel_file=false
|
| 124 |
+
model_proj_type=null
|
| 125 |
+
model_clone_batch=4
|
| 126 |
+
dataset_batch_size=96
|
| 127 |
+
model_dispersive_loss=1000.0
|
| 128 |
+
model_dispersive_loss_layer=10
|
| 129 |
+
checkpoint_keep_interval_updates=1
|
| 130 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 0 ]]; then
|
| 131 |
+
echo "Config ${train_mode} ${config_option}"
|
| 132 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 133 |
+
task_load_clap_emb=true
|
| 134 |
+
model_proj_type=2
|
| 135 |
+
model_clone_batch=4
|
| 136 |
+
dataset_batch_size=48
|
| 137 |
+
model_clap_loss=1.0
|
| 138 |
+
average_top_k_layers=12
|
| 139 |
+
model_add_conv=false
|
| 140 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 1 ]]; then
|
| 141 |
+
echo "Config ${train_mode} ${config_option}"
|
| 142 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 143 |
+
task_load_clap_emb=true
|
| 144 |
+
model_proj_type=2
|
| 145 |
+
model_clone_batch=4
|
| 146 |
+
dataset_batch_size=48
|
| 147 |
+
model_clap_loss=1.0
|
| 148 |
+
average_top_k_layers=1
|
| 149 |
+
# loss type ablation
|
| 150 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 2 ]]; then
|
| 151 |
+
echo "Config ${train_mode} ${config_option}"
|
| 152 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 153 |
+
task_load_clap_emb=true
|
| 154 |
+
model_proj_type=2
|
| 155 |
+
model_clone_batch=4
|
| 156 |
+
dataset_batch_size=48
|
| 157 |
+
model_clap_loss=1.0
|
| 158 |
+
average_top_k_layers=12
|
| 159 |
+
model_clap_loss_type="ce"
|
| 160 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 3 ]]; then
|
| 161 |
+
echo "Config ${train_mode} ${config_option}"
|
| 162 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 163 |
+
task_load_clap_emb=true
|
| 164 |
+
model_proj_type=2
|
| 165 |
+
model_clone_batch=4
|
| 166 |
+
dataset_batch_size=48
|
| 167 |
+
model_clap_loss=1.0
|
| 168 |
+
average_top_k_layers=12
|
| 169 |
+
model_clap_loss_type="l1"
|
| 170 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 4 ]]; then
|
| 171 |
+
echo "Config ${train_mode} ${config_option}"
|
| 172 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 173 |
+
task_load_clap_emb=true
|
| 174 |
+
model_proj_type=2
|
| 175 |
+
model_clone_batch=4
|
| 176 |
+
dataset_batch_size=96
|
| 177 |
+
model_clap_loss=1.0
|
| 178 |
+
average_top_k_layers=12
|
| 179 |
+
model_clap_loss_type="cosine"
|
| 180 |
+
# loss layer ablation
|
| 181 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 5 ]]; then
|
| 182 |
+
echo "Config ${train_mode} ${config_option}"
|
| 183 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 184 |
+
task_load_clap_emb=true
|
| 185 |
+
model_proj_type=2
|
| 186 |
+
model_clone_batch=4
|
| 187 |
+
dataset_batch_size=96
|
| 188 |
+
model_clap_loss=1.0
|
| 189 |
+
average_top_k_layers=12
|
| 190 |
+
model_clap_loss_type="mse"
|
| 191 |
+
model_clap_loss_layer=10
|
| 192 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 6 ]]; then
|
| 193 |
+
echo "Config ${train_mode} ${config_option}"
|
| 194 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 195 |
+
task_load_clap_emb=true
|
| 196 |
+
task_load_source_file=true
|
| 197 |
+
task_load_mel_file=false
|
| 198 |
+
model_proj_type=2
|
| 199 |
+
model_clone_batch=4
|
| 200 |
+
dataset_batch_size=96
|
| 201 |
+
model_clap_loss=1.0
|
| 202 |
+
average_top_k_layers=12
|
| 203 |
+
model_clap_loss_type="mse"
|
| 204 |
+
model_clap_loss_layer=8
|
| 205 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 7 ]]; then
|
| 206 |
+
echo "Config ${train_mode} ${config_option}"
|
| 207 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 208 |
+
task_load_clap_emb=true
|
| 209 |
+
task_load_source_file=true
|
| 210 |
+
task_load_mel_file=false
|
| 211 |
+
model_proj_type=2
|
| 212 |
+
model_clone_batch=4
|
| 213 |
+
dataset_batch_size=96
|
| 214 |
+
model_clap_loss=1.0
|
| 215 |
+
average_top_k_layers=12
|
| 216 |
+
model_clap_loss_type="mse"
|
| 217 |
+
model_clap_loss_layer=6
|
| 218 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 8 ]]; then
|
| 219 |
+
echo "Config ${train_mode} ${config_option}"
|
| 220 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 221 |
+
task_load_clap_emb=true
|
| 222 |
+
task_load_source_file=true
|
| 223 |
+
task_load_mel_file=false
|
| 224 |
+
model_proj_type=2
|
| 225 |
+
model_clone_batch=4
|
| 226 |
+
model_clap_loss=5.0
|
| 227 |
+
dataset_batch_size=96
|
| 228 |
+
average_top_k_layers=12
|
| 229 |
+
model_clap_loss_type="mse"
|
| 230 |
+
checkpoint_keep_interval_updates=-1
|
| 231 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 9 ]]; then
|
| 232 |
+
echo "Config ${train_mode} ${config_option}"
|
| 233 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 234 |
+
task_load_clap_emb=true
|
| 235 |
+
task_load_source_file=true
|
| 236 |
+
task_load_mel_file=false
|
| 237 |
+
model_proj_type=2
|
| 238 |
+
model_clone_batch=4
|
| 239 |
+
model_clap_loss=0.1
|
| 240 |
+
dataset_batch_size=96
|
| 241 |
+
average_top_k_layers=12
|
| 242 |
+
model_clap_loss_type="mse"
|
| 243 |
+
checkpoint_keep_interval_updates=-1
|
| 244 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 0 ]]; then
|
| 245 |
+
echo "Config ${train_mode} ${config_option}"
|
| 246 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
|
| 247 |
+
task_load_clap_emb=true
|
| 248 |
+
model_proj_type=4
|
| 249 |
+
model_clone_batch=4
|
| 250 |
+
model_clap_loss=1.0
|
| 251 |
+
dataset_batch_size=48
|
| 252 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 1 ]]; then
|
| 253 |
+
echo "Config ${train_mode} ${config_option}"
|
| 254 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
|
| 255 |
+
task_load_clap_emb=true
|
| 256 |
+
model_proj_type=4
|
| 257 |
+
model_clone_batch=4
|
| 258 |
+
model_clap_loss=0.001
|
| 259 |
+
dataset_batch_size=48
|
| 260 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 2 ]]; then
|
| 261 |
+
echo "Config ${train_mode} ${config_option}"
|
| 262 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
|
| 263 |
+
task_load_clap_emb=true
|
| 264 |
+
model_proj_type=4
|
| 265 |
+
model_clone_batch=4
|
| 266 |
+
model_clap_loss=0.01
|
| 267 |
+
dataset_batch_size=48
|
| 268 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 3 ]]; then
|
| 269 |
+
echo "Config ${train_mode} ${config_option}"
|
| 270 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_out
|
| 271 |
+
task_load_clap_emb=true
|
| 272 |
+
model_proj_type=6
|
| 273 |
+
model_clone_batch=4
|
| 274 |
+
dataset_batch_size=48
|
| 275 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 0 ]]; then
|
| 276 |
+
echo "Config ${train_mode} ${config_option}"
|
| 277 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 278 |
+
task_load_clap_emb=true
|
| 279 |
+
task_load_source_file=true
|
| 280 |
+
task_load_mel_file=false
|
| 281 |
+
model_proj_type=2
|
| 282 |
+
model_clone_batch=4
|
| 283 |
+
dataset_batch_size=64 # original 48 oom on 4090 24G change distributed_world_size
|
| 284 |
+
model_clap_loss=1.0
|
| 285 |
+
average_top_k_layers=11 # modify with model depth
|
| 286 |
+
model_add_conv=true
|
| 287 |
+
model_depth=11 #
|
| 288 |
+
checkpoint_keep_interval_updates=-1 # default 1
|
| 289 |
+
checkpoint_save_interval_updates=10000
|
| 290 |
+
fi
|
| 291 |
+
|
| 292 |
+
python fairseq_cli/hydra_train.py -m \
|
| 293 |
+
--config-dir ./EAT/config \
|
| 294 |
+
--config-name pretraining_AS2M \
|
| 295 |
+
common.user_dir=./EAT \
|
| 296 |
+
checkpoint.save_dir=${checkpoint_save_dir} \
|
| 297 |
+
checkpoint.restore_file=${checkpoint_restore_file} \
|
| 298 |
+
distributed_training.distributed_world_size=${1:-2} \
|
| 299 |
+
dataset.num_workers=24 \
|
| 300 |
+
dataset.data_buffer_size=48 \
|
| 301 |
+
dataset.batch_size=${dataset_batch_size} \
|
| 302 |
+
task.data=${task_data} \
|
| 303 |
+
task.h5_format=False \
|
| 304 |
+
task.load_clap_emb=${task_load_clap_emb} \
|
| 305 |
+
+task.load_source_file=${task_load_source_file} \
|
| 306 |
+
+task.load_mel_file=${task_load_mel_file} \
|
| 307 |
+
model.proj_type=${model_proj_type} \
|
| 308 |
+
model.clone_batch=${model_clone_batch} \
|
| 309 |
+
model.clap_loss=${model_clap_loss} \
|
| 310 |
+
model.average_top_k_layers=${average_top_k_layers} \
|
| 311 |
+
+model.add_conv=${model_add_conv} \
|
| 312 |
+
+model.clap_loss_type=${model_clap_loss_type} \
|
| 313 |
+
+model.clap_loss_layer=${model_clap_loss_layer} \
|
| 314 |
+
+model.dispersive_loss=${model_dispersive_loss} \
|
| 315 |
+
+model.dispersive_loss_layer=${model_dispersive_loss_layer} \
|
| 316 |
+
model.depth=${model_depth} \
|
| 317 |
+
checkpoint.keep_interval_updates=${checkpoint_keep_interval_updates} \
|
| 318 |
+
checkpoint.save_interval_updates=${checkpoint_save_interval_updates}
|
pre_4_AS2M/disp_6_2025-09-28_08-55-19/pretraining_AS2M.sh
ADDED
|
@@ -0,0 +1,318 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# Launcher for a fairseq/hydra pre-training run (config "pretraining_AS2M").
# This first section selects the experiment, derives a fresh timestamped
# output directory, and archives a copy of this script inside it.

# ---- experiment selector -------------------------------------------------
# The (train_mode, config_option) pair picks one branch of the configuration
# chain further down in this script.
train_mode=disp
config_option=6
# change world size
# (the world size is read from the first positional argument at launch time)

# ---- shared config -------------------------------------------------------
SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
# One directory per invocation: <mode>_<option>_<YYYY-mm-dd_HH-MM-SS>.
run_stamp="$(date +"%Y-%m-%d_%H-%M-%S")"
checkpoint_save_dir="${SAVE_DIR_ROOT}/${train_mode}_${config_option}_${run_stamp}"
# NOTE(review): the directory name embeds the current time, so this path
# points into a directory that is new on every launch.
checkpoint_restore_file="${checkpoint_save_dir}/checkpoint_last.pt"

# Absolute path and file name of this script (symbolic links resolved).
script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
script_name="${script_path##*/}"

# Create the run directory and copy the script into it
# (cp -p preserves permissions and timestamps).
mkdir -p -- "$checkpoint_save_dir"
cp -p -- "$script_path" "$checkpoint_save_dir/$script_name"

echo "script_path: ${script_path}"
echo "checkpoint_save_dir: ${checkpoint_save_dir}"
|
| 20 |
+
|
| 21 |
+
# Default settings.
# Baseline values for every tunable that the launch command forwards as a
# hydra override; the configuration branch selected below may replace any of
# them.

# -- model --
model_depth=12
model_add_conv=false
model_clone_batch=4
average_top_k_layers=12

# -- CLAP loss (forwarded as model.clap_loss / clap_loss_type / clap_loss_layer) --
model_clap_loss=0
model_clap_loss_type=mse  # other options: ce, cosine, l1
model_clap_loss_layer=0

# -- dispersive loss (forwarded as model.dispersive_loss / dispersive_loss_layer) --
model_dispersive_loss=0
model_dispersive_loss_layer=0

# -- data --
dataset_batch_size=48

# -- checkpointing --
checkpoint_save_interval_updates=10000
checkpoint_keep_interval_updates=1  # TODO: change this parameter if needed
|
| 34 |
+
|
| 35 |
+
# Select the experiment configuration.
# Each branch is keyed by (train_mode, config_option) and overrides a subset of
# the default settings; anything a branch does not assign keeps its earlier
# value. An unknown combination aborts instead of launching a run with the
# dataset path and projection type left unset.
#
# NOTE(review): the branches "clap" 0-5 and "ast" 0-3 do not assign
# task_load_source_file / task_load_mel_file. Unless those are inherited from
# the environment, the launch command passes them as empty overrides - confirm
# that this is intended.
if [[ $train_mode == "default" && ${config_option} -eq 0 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=0
    checkpoint_keep_interval_updates=-1

# ---- dispersive-loss runs ------------------------------------------------
elif [[ $train_mode == "disp" && ${config_option} -eq 0 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=4
    dataset_batch_size=96
    model_dispersive_loss=1
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
elif [[ $train_mode == "disp" && ${config_option} -eq 1 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=1
    dataset_batch_size=384
    model_dispersive_loss=1
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
elif [[ $train_mode == "disp" && ${config_option} -eq 2 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=1
    dataset_batch_size=384
    model_dispersive_loss=10.0
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
elif [[ $train_mode == "disp" && ${config_option} -eq 3 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=1
    dataset_batch_size=384
    model_dispersive_loss=100.0
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
elif [[ $train_mode == "disp" && ${config_option} -eq 4 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=1
    dataset_batch_size=384
    model_dispersive_loss=10000.0
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
elif [[ $train_mode == "disp" && ${config_option} -eq 5 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=1
    dataset_batch_size=384
    model_dispersive_loss=1000.0
    model_dispersive_loss_layer=0
    checkpoint_keep_interval_updates=1
elif [[ $train_mode == "disp" && ${config_option} -eq 6 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
    task_load_clap_emb=false
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=null
    model_clone_batch=4
    dataset_batch_size=96
    model_dispersive_loss=1000.0
    model_dispersive_loss_layer=10
    checkpoint_keep_interval_updates=1

# ---- CLAP-embedding runs -------------------------------------------------
elif [[ $train_mode == "clap" && ${config_option} -eq 0 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=12
    model_add_conv=false
elif [[ $train_mode == "clap" && ${config_option} -eq 1 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=1
# loss type ablation
elif [[ $train_mode == "clap" && ${config_option} -eq 2 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="ce"
elif [[ $train_mode == "clap" && ${config_option} -eq 3 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=48
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="l1"
elif [[ $train_mode == "clap" && ${config_option} -eq 4 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="cosine"
# loss layer ablation
elif [[ $train_mode == "clap" && ${config_option} -eq 5 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="mse"
    model_clap_loss_layer=10
elif [[ $train_mode == "clap" && ${config_option} -eq 6 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="mse"
    model_clap_loss_layer=8
elif [[ $train_mode == "clap" && ${config_option} -eq 7 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=96
    model_clap_loss=1.0
    average_top_k_layers=12
    model_clap_loss_type="mse"
    model_clap_loss_layer=6
# loss weight variants (5.0 and 0.1); these keep every interval checkpoint
elif [[ $train_mode == "clap" && ${config_option} -eq 8 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    model_clap_loss=5.0
    dataset_batch_size=96
    average_top_k_layers=12
    model_clap_loss_type="mse"
    checkpoint_keep_interval_updates=-1
elif [[ $train_mode == "clap" && ${config_option} -eq 9 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    model_clap_loss=0.1
    dataset_batch_size=96
    average_top_k_layers=12
    model_clap_loss_type="mse"
    checkpoint_keep_interval_updates=-1

# ---- AST-target runs (data under PRETRAIN_AS2M_w_AST) ---------------------
elif [[ $train_mode == "ast" && ${config_option} -eq 0 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
    task_load_clap_emb=true
    model_proj_type=4
    model_clone_batch=4
    model_clap_loss=1.0
    dataset_batch_size=48
elif [[ $train_mode == "ast" && ${config_option} -eq 1 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
    task_load_clap_emb=true
    model_proj_type=4
    model_clone_batch=4
    model_clap_loss=0.001
    dataset_batch_size=48
elif [[ $train_mode == "ast" && ${config_option} -eq 2 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
    task_load_clap_emb=true
    model_proj_type=4
    model_clone_batch=4
    model_clap_loss=0.01
    dataset_batch_size=48
elif [[ $train_mode == "ast" && ${config_option} -eq 3 ]]; then
    # NOTE(review): this branch does not assign model_clap_loss, so the
    # default set earlier in the script stays in effect - confirm intended.
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_out
    task_load_clap_emb=true
    model_proj_type=6
    model_clone_batch=4
    dataset_batch_size=48

# ---- CLAP + conv front-end run -------------------------------------------
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 0 ]]; then
    echo "Config ${train_mode} ${config_option}"
    task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
    task_load_clap_emb=true
    task_load_source_file=true
    task_load_mel_file=false
    model_proj_type=2
    model_clone_batch=4
    dataset_batch_size=64 # original 48 oom on 4090 24G change distributed_world_size
    model_clap_loss=1.0
    average_top_k_layers=11 # modify with model depth
    model_add_conv=true
    model_depth=11
    checkpoint_keep_interval_updates=-1 # default 1
    checkpoint_save_interval_updates=10000

else
    # No branch matched: stop here rather than start training with the
    # dataset path and projection type unset.
    echo "ERROR: unsupported configuration: train_mode='${train_mode}' config_option='${config_option}'" >&2
    exit 1
fi
|
| 291 |
+
|
| 292 |
+
# Launch pre-training through fairseq's hydra entry point.
#
# Positional arguments:
#   $1  value for distributed_training.distributed_world_size (default: 2)
#
# Every override is passed as a single quoted word so that a value containing
# whitespace or glob characters cannot be split or expanded by the shell.
# Overrides prefixed with "+" are written exactly as in the original command.

# These three values are assigned by every configuration branch; if one is
# empty, no branch ran, so refuse to launch a run with a blank dataset path.
for required_var in task_data task_load_clap_emb model_proj_type; do
    if [[ -z "${!required_var:-}" ]]; then
        echo "ERROR: ${required_var} is not set; check train_mode/config_option" >&2
        exit 1
    fi
done

hydra_overrides=(
    "common.user_dir=./EAT"
    "checkpoint.save_dir=${checkpoint_save_dir}"
    "checkpoint.restore_file=${checkpoint_restore_file}"
    "distributed_training.distributed_world_size=${1:-2}"
    "dataset.num_workers=24"
    "dataset.data_buffer_size=48"
    "dataset.batch_size=${dataset_batch_size}"
    "task.data=${task_data}"
    "task.h5_format=False"
    "task.load_clap_emb=${task_load_clap_emb}"
    # NOTE(review): the next two values are not assigned by every
    # configuration branch and may therefore expand to an empty override.
    "+task.load_source_file=${task_load_source_file}"
    "+task.load_mel_file=${task_load_mel_file}"
    "model.proj_type=${model_proj_type}"
    "model.clone_batch=${model_clone_batch}"
    "model.clap_loss=${model_clap_loss}"
    "model.average_top_k_layers=${average_top_k_layers}"
    "+model.add_conv=${model_add_conv}"
    "+model.clap_loss_type=${model_clap_loss_type}"
    "+model.clap_loss_layer=${model_clap_loss_layer}"
    "+model.dispersive_loss=${model_dispersive_loss}"
    "+model.dispersive_loss_layer=${model_dispersive_loss_layer}"
    "model.depth=${model_depth}"
    "checkpoint.keep_interval_updates=${checkpoint_keep_interval_updates}"
    "checkpoint.save_interval_updates=${checkpoint_save_interval_updates}"
)

python fairseq_cli/hydra_train.py -m \
    --config-dir ./EAT/config \
    --config-name pretraining_AS2M \
    "${hydra_overrides[@]}"
|
pre_4_AS2M/disp_6_2025-09-28_08-58-05/pretraining_AS2M.sh
ADDED
|
@@ -0,0 +1,318 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
# config options
|
| 3 |
+
train_mode=disp
|
| 4 |
+
config_option=6
|
| 5 |
+
# change world size
|
| 6 |
+
|
| 7 |
+
# shared config
|
| 8 |
+
SAVE_DIR_ROOT=/opt/gpfs/home/chushu/exp/eat/pre_4_AS2M
|
| 9 |
+
checkpoint_save_dir=${SAVE_DIR_ROOT}/${train_mode}_${config_option}_$(date +"%Y-%m-%d_%H-%M-%S")
|
| 10 |
+
checkpoint_restore_file=${checkpoint_save_dir}/checkpoint_last.pt
|
| 11 |
+
|
| 12 |
+
# 脚本自身的绝对路径与文件名(解析符号链接)
|
| 13 |
+
script_path="$(readlink -f -- "${BASH_SOURCE[0]}")"
|
| 14 |
+
script_name="$(basename -- "$script_path")"
|
| 15 |
+
# 创建目录并拷贝(保留权限与时间戳)
|
| 16 |
+
mkdir -p -- "$checkpoint_save_dir"
|
| 17 |
+
cp -p -- "$script_path" "$checkpoint_save_dir/$script_name"
|
| 18 |
+
echo "script_path: ${script_path}"
|
| 19 |
+
echo "checkpoint_save_dir: ${checkpoint_save_dir}"
|
| 20 |
+
|
| 21 |
+
# default setting
|
| 22 |
+
model_clone_batch=4
|
| 23 |
+
dataset_batch_size=48
|
| 24 |
+
model_clap_loss=0
|
| 25 |
+
model_clap_loss_type="mse" # option ce cosine l1
|
| 26 |
+
model_clap_loss_layer=0
|
| 27 |
+
average_top_k_layers=12
|
| 28 |
+
model_add_conv=false
|
| 29 |
+
model_depth=12
|
| 30 |
+
model_dispersive_loss=0
|
| 31 |
+
model_dispersive_loss_layer=0
|
| 32 |
+
checkpoint_keep_interval_updates=1 # TODO change this parameter if need
|
| 33 |
+
checkpoint_save_interval_updates=10000
|
| 34 |
+
|
| 35 |
+
if [[ $train_mode == "default" && ${config_option} -eq 0 ]]; then
|
| 36 |
+
echo "Config ${train_mode} ${config_option}"
|
| 37 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 38 |
+
task_load_clap_emb=false
|
| 39 |
+
task_load_source_file=true
|
| 40 |
+
task_load_mel_file=false
|
| 41 |
+
model_proj_type=null
|
| 42 |
+
model_clone_batch=4
|
| 43 |
+
dataset_batch_size=96
|
| 44 |
+
model_clap_loss=0
|
| 45 |
+
checkpoint_keep_interval_updates=-1
|
| 46 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 0 ]]; then
|
| 47 |
+
echo "Config ${train_mode} ${config_option}"
|
| 48 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 49 |
+
task_load_clap_emb=false
|
| 50 |
+
task_load_source_file=true
|
| 51 |
+
task_load_mel_file=false
|
| 52 |
+
model_proj_type=null
|
| 53 |
+
model_clone_batch=4
|
| 54 |
+
dataset_batch_size=96
|
| 55 |
+
model_dispersive_loss=1
|
| 56 |
+
model_dispersive_loss_layer=0
|
| 57 |
+
checkpoint_keep_interval_updates=1
|
| 58 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 1 ]]; then
|
| 59 |
+
echo "Config ${train_mode} ${config_option}"
|
| 60 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 61 |
+
task_load_clap_emb=false
|
| 62 |
+
task_load_source_file=true
|
| 63 |
+
task_load_mel_file=false
|
| 64 |
+
model_proj_type=null
|
| 65 |
+
model_clone_batch=1
|
| 66 |
+
dataset_batch_size=384
|
| 67 |
+
model_dispersive_loss=1
|
| 68 |
+
model_dispersive_loss_layer=0
|
| 69 |
+
checkpoint_keep_interval_updates=1
|
| 70 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 2 ]]; then
|
| 71 |
+
echo "Config ${train_mode} ${config_option}"
|
| 72 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 73 |
+
task_load_clap_emb=false
|
| 74 |
+
task_load_source_file=true
|
| 75 |
+
task_load_mel_file=false
|
| 76 |
+
model_proj_type=null
|
| 77 |
+
model_clone_batch=1
|
| 78 |
+
dataset_batch_size=384
|
| 79 |
+
model_dispersive_loss=10.0
|
| 80 |
+
model_dispersive_loss_layer=0
|
| 81 |
+
checkpoint_keep_interval_updates=1
|
| 82 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 3 ]]; then
|
| 83 |
+
echo "Config ${train_mode} ${config_option}"
|
| 84 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 85 |
+
task_load_clap_emb=false
|
| 86 |
+
task_load_source_file=true
|
| 87 |
+
task_load_mel_file=false
|
| 88 |
+
model_proj_type=null
|
| 89 |
+
model_clone_batch=1
|
| 90 |
+
dataset_batch_size=384
|
| 91 |
+
model_dispersive_loss=100.0
|
| 92 |
+
model_dispersive_loss_layer=0
|
| 93 |
+
checkpoint_keep_interval_updates=1
|
| 94 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 4 ]]; then
|
| 95 |
+
echo "Config ${train_mode} ${config_option}"
|
| 96 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 97 |
+
task_load_clap_emb=false
|
| 98 |
+
task_load_source_file=true
|
| 99 |
+
task_load_mel_file=false
|
| 100 |
+
model_proj_type=null
|
| 101 |
+
model_clone_batch=1
|
| 102 |
+
dataset_batch_size=384
|
| 103 |
+
model_dispersive_loss=10000.0
|
| 104 |
+
model_dispersive_loss_layer=0
|
| 105 |
+
checkpoint_keep_interval_updates=1
|
| 106 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 5 ]]; then
|
| 107 |
+
echo "Config ${train_mode} ${config_option}"
|
| 108 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 109 |
+
task_load_clap_emb=false
|
| 110 |
+
task_load_source_file=true
|
| 111 |
+
task_load_mel_file=false
|
| 112 |
+
model_proj_type=null
|
| 113 |
+
model_clone_batch=1
|
| 114 |
+
dataset_batch_size=384
|
| 115 |
+
model_dispersive_loss=1000.0
|
| 116 |
+
model_dispersive_loss_layer=0
|
| 117 |
+
checkpoint_keep_interval_updates=1
|
| 118 |
+
elif [[ $train_mode == "disp" && ${config_option} -eq 6 ]]; then
|
| 119 |
+
echo "Config ${train_mode} ${config_option}"
|
| 120 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M
|
| 121 |
+
task_load_clap_emb=false
|
| 122 |
+
task_load_source_file=true
|
| 123 |
+
task_load_mel_file=false
|
| 124 |
+
model_proj_type=null
|
| 125 |
+
model_clone_batch=4
|
| 126 |
+
dataset_batch_size=96
|
| 127 |
+
model_dispersive_loss=1000.0
|
| 128 |
+
model_dispersive_loss_layer=10
|
| 129 |
+
checkpoint_keep_interval_updates=1
|
| 130 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 0 ]]; then
|
| 131 |
+
echo "Config ${train_mode} ${config_option}"
|
| 132 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 133 |
+
task_load_clap_emb=true
|
| 134 |
+
model_proj_type=2
|
| 135 |
+
model_clone_batch=4
|
| 136 |
+
dataset_batch_size=48
|
| 137 |
+
model_clap_loss=1.0
|
| 138 |
+
average_top_k_layers=12
|
| 139 |
+
model_add_conv=false
|
| 140 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 1 ]]; then
|
| 141 |
+
echo "Config ${train_mode} ${config_option}"
|
| 142 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 143 |
+
task_load_clap_emb=true
|
| 144 |
+
model_proj_type=2
|
| 145 |
+
model_clone_batch=4
|
| 146 |
+
dataset_batch_size=48
|
| 147 |
+
model_clap_loss=1.0
|
| 148 |
+
average_top_k_layers=1
|
| 149 |
+
# loss type ablation
|
| 150 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 2 ]]; then
|
| 151 |
+
echo "Config ${train_mode} ${config_option}"
|
| 152 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 153 |
+
task_load_clap_emb=true
|
| 154 |
+
model_proj_type=2
|
| 155 |
+
model_clone_batch=4
|
| 156 |
+
dataset_batch_size=48
|
| 157 |
+
model_clap_loss=1.0
|
| 158 |
+
average_top_k_layers=12
|
| 159 |
+
model_clap_loss_type="ce"
|
| 160 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 3 ]]; then
|
| 161 |
+
echo "Config ${train_mode} ${config_option}"
|
| 162 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 163 |
+
task_load_clap_emb=true
|
| 164 |
+
model_proj_type=2
|
| 165 |
+
model_clone_batch=4
|
| 166 |
+
dataset_batch_size=48
|
| 167 |
+
model_clap_loss=1.0
|
| 168 |
+
average_top_k_layers=12
|
| 169 |
+
model_clap_loss_type="l1"
|
| 170 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 4 ]]; then
|
| 171 |
+
echo "Config ${train_mode} ${config_option}"
|
| 172 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 173 |
+
task_load_clap_emb=true
|
| 174 |
+
model_proj_type=2
|
| 175 |
+
model_clone_batch=4
|
| 176 |
+
dataset_batch_size=96
|
| 177 |
+
model_clap_loss=1.0
|
| 178 |
+
average_top_k_layers=12
|
| 179 |
+
model_clap_loss_type="cosine"
|
| 180 |
+
# loss layer ablation
|
| 181 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 5 ]]; then
|
| 182 |
+
echo "Config ${train_mode} ${config_option}"
|
| 183 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 184 |
+
task_load_clap_emb=true
|
| 185 |
+
model_proj_type=2
|
| 186 |
+
model_clone_batch=4
|
| 187 |
+
dataset_batch_size=96
|
| 188 |
+
model_clap_loss=1.0
|
| 189 |
+
average_top_k_layers=12
|
| 190 |
+
model_clap_loss_type="mse"
|
| 191 |
+
model_clap_loss_layer=10
|
| 192 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 6 ]]; then
|
| 193 |
+
echo "Config ${train_mode} ${config_option}"
|
| 194 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 195 |
+
task_load_clap_emb=true
|
| 196 |
+
task_load_source_file=true
|
| 197 |
+
task_load_mel_file=false
|
| 198 |
+
model_proj_type=2
|
| 199 |
+
model_clone_batch=4
|
| 200 |
+
dataset_batch_size=96
|
| 201 |
+
model_clap_loss=1.0
|
| 202 |
+
average_top_k_layers=12
|
| 203 |
+
model_clap_loss_type="mse"
|
| 204 |
+
model_clap_loss_layer=8
|
| 205 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 7 ]]; then
|
| 206 |
+
echo "Config ${train_mode} ${config_option}"
|
| 207 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 208 |
+
task_load_clap_emb=true
|
| 209 |
+
task_load_source_file=true
|
| 210 |
+
task_load_mel_file=false
|
| 211 |
+
model_proj_type=2
|
| 212 |
+
model_clone_batch=4
|
| 213 |
+
dataset_batch_size=96
|
| 214 |
+
model_clap_loss=1.0
|
| 215 |
+
average_top_k_layers=12
|
| 216 |
+
model_clap_loss_type="mse"
|
| 217 |
+
model_clap_loss_layer=6
|
| 218 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 8 ]]; then
|
| 219 |
+
echo "Config ${train_mode} ${config_option}"
|
| 220 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 221 |
+
task_load_clap_emb=true
|
| 222 |
+
task_load_source_file=true
|
| 223 |
+
task_load_mel_file=false
|
| 224 |
+
model_proj_type=2
|
| 225 |
+
model_clone_batch=4
|
| 226 |
+
model_clap_loss=5.0
|
| 227 |
+
dataset_batch_size=96
|
| 228 |
+
average_top_k_layers=12
|
| 229 |
+
model_clap_loss_type="mse"
|
| 230 |
+
checkpoint_keep_interval_updates=-1
|
| 231 |
+
elif [[ $train_mode == "clap" && ${config_option} -eq 9 ]]; then
|
| 232 |
+
echo "Config ${train_mode} ${config_option}"
|
| 233 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 234 |
+
task_load_clap_emb=true
|
| 235 |
+
task_load_source_file=true
|
| 236 |
+
task_load_mel_file=false
|
| 237 |
+
model_proj_type=2
|
| 238 |
+
model_clone_batch=4
|
| 239 |
+
model_clap_loss=0.1
|
| 240 |
+
dataset_batch_size=96
|
| 241 |
+
average_top_k_layers=12
|
| 242 |
+
model_clap_loss_type="mse"
|
| 243 |
+
checkpoint_keep_interval_updates=-1
|
| 244 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 0 ]]; then
|
| 245 |
+
echo "Config ${train_mode} ${config_option}"
|
| 246 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
|
| 247 |
+
task_load_clap_emb=true
|
| 248 |
+
model_proj_type=4
|
| 249 |
+
model_clone_batch=4
|
| 250 |
+
model_clap_loss=1.0
|
| 251 |
+
dataset_batch_size=48
|
| 252 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 1 ]]; then
|
| 253 |
+
echo "Config ${train_mode} ${config_option}"
|
| 254 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
|
| 255 |
+
task_load_clap_emb=true
|
| 256 |
+
model_proj_type=4
|
| 257 |
+
model_clone_batch=4
|
| 258 |
+
model_clap_loss=0.001
|
| 259 |
+
dataset_batch_size=48
|
| 260 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 2 ]]; then
|
| 261 |
+
echo "Config ${train_mode} ${config_option}"
|
| 262 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_in
|
| 263 |
+
task_load_clap_emb=true
|
| 264 |
+
model_proj_type=4
|
| 265 |
+
model_clone_batch=4
|
| 266 |
+
model_clap_loss=0.01
|
| 267 |
+
dataset_batch_size=48
|
| 268 |
+
elif [[ $train_mode == "ast" && ${config_option} -eq 3 ]]; then
|
| 269 |
+
echo "Config ${train_mode} ${config_option}"
|
| 270 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_AST/mlp_head_out
|
| 271 |
+
task_load_clap_emb=true
|
| 272 |
+
model_proj_type=6
|
| 273 |
+
model_clone_batch=4
|
| 274 |
+
dataset_batch_size=48
|
| 275 |
+
elif [[ $train_mode == "conv_clap" && ${config_option} -eq 0 ]]; then
|
| 276 |
+
echo "Config ${train_mode} ${config_option}"
|
| 277 |
+
task_data=/opt/gpfs/home/chushu/data/audioset/setting/PRETRAIN_AS2M_w_CLAP
|
| 278 |
+
task_load_clap_emb=true
|
| 279 |
+
task_load_source_file=true
|
| 280 |
+
task_load_mel_file=false
|
| 281 |
+
model_proj_type=2
|
| 282 |
+
model_clone_batch=4
|
| 283 |
+
dataset_batch_size=64 # original 48 oom on 4090 24G change distributed_world_size
|
| 284 |
+
model_clap_loss=1.0
|
| 285 |
+
average_top_k_layers=11 # modify with model depth
|
| 286 |
+
model_add_conv=true
|
| 287 |
+
model_depth=11 #
|
| 288 |
+
checkpoint_keep_interval_updates=-1 # default 1
|
| 289 |
+
checkpoint_save_interval_updates=10000
|
| 290 |
+
fi
|
| 291 |
+
|
| 292 |
+
python fairseq_cli/hydra_train.py -m \
|
| 293 |
+
--config-dir ./EAT/config \
|
| 294 |
+
--config-name pretraining_AS2M \
|
| 295 |
+
common.user_dir=./EAT \
|
| 296 |
+
checkpoint.save_dir=${checkpoint_save_dir} \
|
| 297 |
+
checkpoint.restore_file=${checkpoint_restore_file} \
|
| 298 |
+
distributed_training.distributed_world_size=${1:-2} \
|
| 299 |
+
dataset.num_workers=24 \
|
| 300 |
+
dataset.data_buffer_size=48 \
|
| 301 |
+
dataset.batch_size=${dataset_batch_size} \
|
| 302 |
+
task.data=${task_data} \
|
| 303 |
+
task.h5_format=False \
|
| 304 |
+
task.load_clap_emb=${task_load_clap_emb} \
|
| 305 |
+
+task.load_source_file=${task_load_source_file} \
|
| 306 |
+
+task.load_mel_file=${task_load_mel_file} \
|
| 307 |
+
model.proj_type=${model_proj_type} \
|
| 308 |
+
model.clone_batch=${model_clone_batch} \
|
| 309 |
+
model.clap_loss=${model_clap_loss} \
|
| 310 |
+
model.average_top_k_layers=${average_top_k_layers} \
|
| 311 |
+
+model.add_conv=${model_add_conv} \
|
| 312 |
+
+model.clap_loss_type=${model_clap_loss_type} \
|
| 313 |
+
+model.clap_loss_layer=${model_clap_loss_layer} \
|
| 314 |
+
+model.dispersive_loss=${model_dispersive_loss} \
|
| 315 |
+
+model.dispersive_loss_layer=${model_dispersive_loss_layer} \
|
| 316 |
+
model.depth=${model_depth} \
|
| 317 |
+
checkpoint.keep_interval_updates=${checkpoint_keep_interval_updates} \
|
| 318 |
+
checkpoint.save_interval_updates=${checkpoint_save_interval_updates}
|