# bs_roformer_voc_hyperacev2.py
# Added in the commit "Upload folder using huggingface_hub" (new file, hunk @@ -0,0 +1,55 @@).
import torch

# Width of every band, in order.  24 bands of 2, 12 of 4, 8 of 12, 8 of 24,
# 8 of 48, then one of 128 and one of 129: 62 bands whose widths sum to 1025
# (the value stored under 'dim_freqs_in' below).
_FREQS_PER_BANDS = (
    (2,) * 24 + (4,) * 12 + (12,) * 8 + (24,) * 8 + (48,) * 8 + (128, 129)
)


def _build_mask_filter_bank(band_widths: tuple) -> torch.Tensor:
    """Return a boolean (num_bands, sum(band_widths)) band-membership mask.

    Row ``i`` is True on the contiguous run of ``band_widths[i]`` columns that
    starts where band ``i - 1`` ended (band 0 starts at column 0) and False
    everywhere else, so every column is True in exactly one row.
    """
    bank = torch.zeros((len(band_widths), sum(band_widths)), dtype=torch.bool)
    start = 0
    for row, width in enumerate(band_widths):
        bank[row, start:start + width] = True
        start += width
    return bank


# Configuration dictionary with four sections: 'audio', 'inference', 'model'
# and 'training'.  Keys, key order and values are kept exactly as published;
# only 'mask_filter_bank' is now derived from the band widths instead of being
# spelled out row by row (the resulting tensor is identical).
# NOTE(review): how each key is consumed is defined by the loader/model code,
# which is not part of this file -- confirm there before changing any value.
modelConfiguration = {
    'audio': {
        'chunk_size': 960000,
        'dim_f': 1024,
        'dim_t': 801,
        'hop_length': 441,
        'min_mean_abs': 0.0001,
        'n_fft': 2048,
        'num_channels': 2,
        'sample_rate': 44100,
    },
    'inference': {'batch_size': 2, 'dim_t': 1876, 'num_overlap': 4},
    'model': {
        'attn_dropout': 0.0,
        'depth': 12,
        'dim': 256,
        # Equals sum(_FREQS_PER_BANDS); presumably stft_n_fft // 2 + 1.
        'dim_freqs_in': 1025,
        'dim_head': 64,
        'ff_dropout': 0.0,
        'flash_attn': True,
        'freq_transformer_depth': 1,
        'freqs_per_bands': _FREQS_PER_BANDS,
        'heads': 8,
        'linear_transformer_depth': 0,
        'mask_estimator_depth': 1,
        # Boolean tensor of shape (62, 1025): one row per band.
        'mask_filter_bank': _build_mask_filter_bank(_FREQS_PER_BANDS),
        'mlp_expansion_factor': 4,
        'multi_stft_hop_size': 147,
        'multi_stft_normalized': False,
        'multi_stft_resolution_loss_weight': 1.0,
        'multi_stft_resolutions_window_sizes': (4096, 2048, 1024, 512, 256),
        'num_stems': 1,
        'skip_connection': False,
        'stereo': True,
        'stft_hop_length': 512,
        'stft_n_fft': 2048,
        'stft_normalized': False,
        'stft_win_length': 1024,
        'time_transformer_depth': 1,
        'use_torch_checkpoint': True,
    },
    'training': {
        'batch_size': 1,
        'coarse_loss_clip': True,
        'ema_momentum': 0.999,
        'grad_clip': 0,
        'gradient_accumulation_steps': 1,
        'instruments': ['vocals', 'instrument'],
        'lr': 1e-05,
        'num_epochs': 1000,
        'num_steps': 1000,
        'optimizer': 'adam',
        'other_fix': False,
        'patience': 5,
        'q': 0.95,
        'reduce_factor': 0.9,
        'target_instrument': 'vocals',
        'use_amp': True,
    },
}