hunterhogan committed on
Commit
6fcb60a
·
verified ·
1 Parent(s): 727e1b0

Upload folder using huggingface_hub

Browse files
bs_roformer_voc_hyperacev2.py CHANGED
@@ -1,6 +1,6 @@
1
  import torch
2
 
3
- modelConfiguration = { 'audio': { 'chunk_size': 960000,
4
  'dim_f': 1024,
5
  'dim_t': 801,
6
  'hop_length': 441,
 
1
  import torch
2
 
3
+ modelConfiguration = { 'audio': { 'chunk_size': 2 ** 20,
4
  'dim_f': 1024,
5
  'dim_t': 801,
6
  'hop_length': 441,
config_deux_becruily.py CHANGED
@@ -1,48 +1,48 @@
1
- modelConfiguration = { 'audio': { 'chunk_size': 573300,
2
- 'dim_f': 1024,
3
- 'dim_t': 256,
4
- 'hop_length': 441,
5
- 'min_mean_abs': 0.0,
6
- 'n_fft': 2048,
7
- 'num_channels': 2,
8
- 'sample_rate': 44100},
9
- 'inference': {'batch_size': 4, 'dim_t': 1101, 'num_overlap': 2},
10
- 'model': { 'attn_dropout': 0,
11
- 'depth': 12,
12
- 'dim': 256,
13
- 'dim_freqs_in': 1025,
14
- 'dim_head': 64,
15
- 'ff_dropout': 0,
16
- 'flash_attn': True,
17
- 'freq_transformer_depth': 1,
18
- 'heads': 8,
19
- 'mask_estimator_depth': 2,
20
- 'multi_stft_hop_size': 147,
21
- 'multi_stft_normalized': False,
22
- 'multi_stft_resolution_loss_weight': 1.0,
23
- 'multi_stft_resolutions_window_sizes': (4096, 2048, 1024, 512, 256),
24
- 'num_bands': 60,
25
- 'num_stems': 2,
26
- 'sample_rate': 44100,
27
- 'stereo': True,
28
- 'stft_hop_length': 441,
29
- 'stft_n_fft': 2048,
30
- 'stft_normalized': False,
31
- 'stft_win_length': 882,
32
- 'time_transformer_depth': 1},
33
- 'training': { 'batch_size': 1,
34
- 'coarse_loss_clip': False,
35
- 'ema_momentum': 0.999,
36
- 'grad_clip': 0,
37
- 'gradient_accumulation_steps': 1,
38
- 'instruments': ['vocals', 'instrum'],
39
- 'lr': 0.0001,
40
- 'num_epochs': 1000,
41
- 'num_steps': 1000,
42
- 'optimizer': 'adamw',
43
- 'other_fix': False,
44
- 'patience': 2,
45
- 'q': 0.95,
46
- 'reduce_factor': 0.95,
47
- 'target_instrument': None,
48
- 'use_amp': True}}
 
1
+ modelConfiguration = { 'audio': { 'chunk_size': 2 ** 20,
2
+ 'dim_f': 1024,
3
+ 'dim_t': 256,
4
+ 'hop_length': 441,
5
+ 'min_mean_abs': 0.0,
6
+ 'n_fft': 2048,
7
+ 'num_channels': 2,
8
+ 'sample_rate': 44100},
9
+ 'inference': {'batch_size': 4, 'dim_t': 1101, 'num_overlap': 2},
10
+ 'model': { 'attn_dropout': 0,
11
+ 'depth': 12,
12
+ 'dim': 256,
13
+ 'dim_freqs_in': 1025,
14
+ 'dim_head': 64,
15
+ 'ff_dropout': 0,
16
+ 'flash_attn': True,
17
+ 'freq_transformer_depth': 1,
18
+ 'heads': 8,
19
+ 'mask_estimator_depth': 2,
20
+ 'multi_stft_hop_size': 147,
21
+ 'multi_stft_normalized': False,
22
+ 'multi_stft_resolution_loss_weight': 1.0,
23
+ 'multi_stft_resolutions_window_sizes': (4096, 2048, 1024, 512, 256),
24
+ 'num_bands': 60,
25
+ 'num_stems': 2,
26
+ 'sample_rate': 44100,
27
+ 'stereo': True,
28
+ 'stft_hop_length': 512,
29
+ 'stft_n_fft': 2048,
30
+ 'stft_normalized': False,
31
+ 'stft_win_length': 1024,
32
+ 'time_transformer_depth': 1},
33
+ 'training': { 'batch_size': 1,
34
+ 'coarse_loss_clip': False,
35
+ 'ema_momentum': 0.999,
36
+ 'grad_clip': 0,
37
+ 'gradient_accumulation_steps': 1,
38
+ 'instruments': ['vocals', 'instrum'],
39
+ 'lr': 0.0001,
40
+ 'num_epochs': 1000,
41
+ 'num_steps': 1000,
42
+ 'optimizer': 'adamw',
43
+ 'other_fix': False,
44
+ 'patience': 2,
45
+ 'q': 0.95,
46
+ 'reduce_factor': 0.95,
47
+ 'target_instrument': None,
48
+ 'use_amp': True}}