hunterhogan committed on
Commit
ee04a7a
·
verified ·
1 Parent(s): c4b1496

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. bs_roformer_voc_hyperacev2.py +55 -52
bs_roformer_voc_hyperacev2.py CHANGED
@@ -1,55 +1,58 @@
1
  import torch
2
 
3
  modelConfiguration = { 'audio': { 'chunk_size': 960000,
4
- 'dim_f': 1024,
5
- 'dim_t': 801,
6
- 'hop_length': 441,
7
- 'min_mean_abs': 0.0001,
8
- 'n_fft': 2048,
9
- 'num_channels': 2,
10
- 'sample_rate': 44100},
11
- 'inference': {'batch_size': 2, 'dim_t': 1876, 'num_overlap': 4},
12
- 'model': { 'attn_dropout': 0.0,
13
- 'depth': 12,
14
- 'dim': 256,
15
- 'dim_freqs_in': 1025,
16
- 'dim_head': 64,
17
- 'ff_dropout': 0.0,
18
- 'flash_attn': True,
19
- 'freq_transformer_depth': 1,
20
- 'freqs_per_bands': ( 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 12, 12, 12,
21
- 12, 12, 12, 12, 12, 24, 24, 24, 24, 24, 24, 24, 24, 48, 48, 48, 48, 48, 48, 48, 48, 128, 129),
22
- 'heads': 8,
23
- 'linear_transformer_depth': 0,
24
- 'mask_estimator_depth': 1,
25
- 'mask_filter_bank': torch.tensor(dtype=torch.bool, data=[[1,1]+[0]*1023,[0,0,1,1]+[0]*1021,[0]*4+[1,1]+[0]*1019,[0]*6+[1,1]+[0]*1017,[0]*8+[1,1]+[0]*1015,[0]*10+[1,1]+[0]*1013,[0]*12+[1,1]+[0]*1011,[0]*14+[1,1]+[0]*1009,[0]*16+[1,1]+[0]*1007,[0]*18+[1,1]+[0]*1005,[0]*20+[1,1]+[0]*1003,[0]*22+[1,1]+[0]*1001,[0]*24+[1,1]+[0]*999,[0]*26+[1,1]+[0]*997,[0]*28+[1,1]+[0]*995,[0]*30+[1,1]+[0]*993,[0]*32+[1,1]+[0]*991,[0]*34+[1,1]+[0]*989,[0]*36+[1,1]+[0]*987,[0]*38+[1,1]+[0]*985,[0]*40+[1,1]+[0]*983,[0]*42+[1,1]+[0]*981,[0]*44+[1,1]+[0]*979,[0]*46+[1,1]+[0]*977,[0]*48+[1]*4+[0]*973,[0]*52+[1]*4+[0]*969,[0]*56+[1]*4+[0]*965,[0]*60+[1]*4+[0]*961,[0]*64+[1]*4+[0]*957,[0]*68+[1]*4+[0]*953,[0]*72+[1]*4+[0]*949,[0]*76+[1]*4+[0]*945,[0]*80+[1]*4+[0]*941,[0]*84+[1]*4+[0]*937,[0]*88+[1]*4+[0]*933,[0]*92+[1]*4+[0]*929,[0]*96+[1]*12+[0]*917,[0]*108+[1]*12+[0]*905,[0]*120+[1]*12+[0]*893,[0]*132+[1]*12+[0]*881,[0]*144+[1]*12+[0]*869,[0]*156+[1]*12+[0]*857,[0]*168+[1]*12+[0]*845,[0]*180+[1]*12+[0]*833,[0]*192+[1]*24+[0]*809,[0]*216+[1]*24+[0]*785,[0]*240+[1]*24+[0]*761,[0]*264+[1]*24+[0]*737,[0]*288+[1]*24+[0]*713,[0]*312+[1]*24+[0]*689,[0]*336+[1]*24+[0]*665,[0]*360+[1]*24+[0]*641,[0]*384+[1]*48+[0]*593,[0]*432+[1]*48+[0]*545,[0]*480+[1]*48+[0]*497,[0]*528+[1]*48+[0]*449,[0]*576+[1]*48+[0]*401,[0]*624+[1]*48+[0]*353,[0]*672+[1]*48+[0]*305,[0]*720+[1]*48+[0]*257,[0]*768+[1]*128+[0]*129,[0]*896+[1]*129]),
26
- 'mlp_expansion_factor': 4,
27
- 'multi_stft_hop_size': 147,
28
- 'multi_stft_normalized': False,
29
- 'multi_stft_resolution_loss_weight': 1.0,
30
- 'multi_stft_resolutions_window_sizes': (4096, 2048, 1024, 512, 256),
31
- 'num_stems': 1,
32
- 'skip_connection': False,
33
- 'stereo': True,
34
- 'stft_hop_length': 512,
35
- 'stft_n_fft': 2048,
36
- 'stft_normalized': False,
37
- 'stft_win_length': 1024,
38
- 'time_transformer_depth': 1,
39
- 'use_torch_checkpoint': True},
40
- 'training': { 'batch_size': 1,
41
- 'coarse_loss_clip': True,
42
- 'ema_momentum': 0.999,
43
- 'grad_clip': 0,
44
- 'gradient_accumulation_steps': 1,
45
- 'instruments': ['vocals', 'instrument'],
46
- 'lr': 1e-05,
47
- 'num_epochs': 1000,
48
- 'num_steps': 1000,
49
- 'optimizer': 'adam',
50
- 'other_fix': False,
51
- 'patience': 5,
52
- 'q': 0.95,
53
- 'reduce_factor': 0.9,
54
- 'target_instrument': 'vocals',
55
- 'use_amp': True}}
 
 
 
 
1
  import torch
2
 
3
  modelConfiguration = { 'audio': { 'chunk_size': 960000,
4
+ 'dim_f': 1024,
5
+ 'dim_t': 801,
6
+ 'hop_length': 441,
7
+ 'min_mean_abs': 0.0001,
8
+ 'n_fft': 2048,
9
+ 'num_channels': 2,
10
+ 'sample_rate': 44100},
11
+ 'inference': {'batch_size': 2, 'dim_t': 1876, 'num_overlap': 4},
12
+ 'model': { 'attn_dropout': 0.0,
13
+ 'depth': 12,
14
+ 'dim': 256,
15
+ 'dim_freqs_in': 1025,
16
+ 'dim_head': 64,
17
+ 'ff_dropout': 0.0,
18
+ 'flash_attn': True,
19
+ 'freq_transformer_depth': 1,
20
+ 'freqs_per_bands': ( 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 12, 12, 12,
21
+ 12, 12, 12, 12, 12, 24, 24, 24, 24, 24, 24, 24, 24, 48, 48, 48, 48, 48, 48, 48, 48, 128, 129),
22
+ 'final_norm': True,
23
+ 'heads': 8,
24
+ 'linear_transformer_depth': 0,
25
+ 'mask_estimator_depth': 1,
26
+ 'mask_filter_bank': torch.tensor(dtype=torch.bool, data=[[1,1]+[0]*1023,[0,0,1,1]+[0]*1021,[0]*4+[1,1]+[0]*1019,[0]*6+[1,1]+[0]*1017,[0]*8+[1,1]+[0]*1015,[0]*10+[1,1]+[0]*1013,[0]*12+[1,1]+[0]*1011,[0]*14+[1,1]+[0]*1009,[0]*16+[1,1]+[0]*1007,[0]*18+[1,1]+[0]*1005,[0]*20+[1,1]+[0]*1003,[0]*22+[1,1]+[0]*1001,[0]*24+[1,1]+[0]*999,[0]*26+[1,1]+[0]*997,[0]*28+[1,1]+[0]*995,[0]*30+[1,1]+[0]*993,[0]*32+[1,1]+[0]*991,[0]*34+[1,1]+[0]*989,[0]*36+[1,1]+[0]*987,[0]*38+[1,1]+[0]*985,[0]*40+[1,1]+[0]*983,[0]*42+[1,1]+[0]*981,[0]*44+[1,1]+[0]*979,[0]*46+[1,1]+[0]*977,[0]*48+[1]*4+[0]*973,[0]*52+[1]*4+[0]*969,[0]*56+[1]*4+[0]*965,[0]*60+[1]*4+[0]*961,[0]*64+[1]*4+[0]*957,[0]*68+[1]*4+[0]*953,[0]*72+[1]*4+[0]*949,[0]*76+[1]*4+[0]*945,[0]*80+[1]*4+[0]*941,[0]*84+[1]*4+[0]*937,[0]*88+[1]*4+[0]*933,[0]*92+[1]*4+[0]*929,[0]*96+[1]*12+[0]*917,[0]*108+[1]*12+[0]*905,[0]*120+[1]*12+[0]*893,[0]*132+[1]*12+[0]*881,[0]*144+[1]*12+[0]*869,[0]*156+[1]*12+[0]*857,[0]*168+[1]*12+[0]*845,[0]*180+[1]*12+[0]*833,[0]*192+[1]*24+[0]*809,[0]*216+[1]*24+[0]*785,[0]*240+[1]*24+[0]*761,[0]*264+[1]*24+[0]*737,[0]*288+[1]*24+[0]*713,[0]*312+[1]*24+[0]*689,[0]*336+[1]*24+[0]*665,[0]*360+[1]*24+[0]*641,[0]*384+[1]*48+[0]*593,[0]*432+[1]*48+[0]*545,[0]*480+[1]*48+[0]*497,[0]*528+[1]*48+[0]*449,[0]*576+[1]*48+[0]*401,[0]*624+[1]*48+[0]*353,[0]*672+[1]*48+[0]*305,[0]*720+[1]*48+[0]*257,[0]*768+[1]*128+[0]*129,[0]*896+[1]*129]),
27
+ 'mlp_expansion_factor': 4,
28
+ 'multi_stft_hop_size': 147,
29
+ 'multi_stft_normalized': False,
30
+ 'multi_stft_resolution_loss_weight': 1.0,
31
+ 'multi_stft_resolutions_window_sizes': (4096, 2048, 1024, 512, 256),
32
+ 'norm_output': False,
33
+ 'num_stems': 1,
34
+ 'skip_connection': False,
35
+ 'stereo': True,
36
+ 'stft_hop_length': 512,
37
+ 'stft_n_fft': 2048,
38
+ 'stft_normalized': False,
39
+ 'stft_win_length': 1024,
40
+ 'time_transformer_depth': 1,
41
+ 'use_torch_checkpoint': True,
42
+ 'zero_dc': True},
43
+ 'training': { 'batch_size': 1,
44
+ 'coarse_loss_clip': True,
45
+ 'ema_momentum': 0.999,
46
+ 'grad_clip': 0,
47
+ 'gradient_accumulation_steps': 1,
48
+ 'instruments': ['vocals', 'instrument'],
49
+ 'lr': 1e-05,
50
+ 'num_epochs': 1000,
51
+ 'num_steps': 1000,
52
+ 'optimizer': 'adam',
53
+ 'other_fix': False,
54
+ 'patience': 5,
55
+ 'q': 0.95,
56
+ 'reduce_factor': 0.9,
57
+ 'target_instrument': 'vocals',
58
+ 'use_amp': True}}