hunterhogan
/

hunterFormsBS

Model card Files and versions

xet

Community

hunterhogan committed 8 days ago

Commit

ee04a7a

verified ·

1 Parent(s): c4b1496

Upload folder using huggingface_hub

Browse files

Files changed (1) hide show

bs_roformer_voc_hyperacev2.py +55 -52

bs_roformer_voc_hyperacev2.py CHANGED Viewed

@@ -1,55 +1,58 @@
 import torch
 modelConfiguration = {   'audio': {   'chunk_size': 960000,
-                 'dim_f': 1024,
-                 'dim_t': 801,
-                 'hop_length': 441,
-                 'min_mean_abs': 0.0001,
-                 'n_fft': 2048,
-                 'num_channels': 2,
-                 'sample_rate': 44100},
-    'inference': {'batch_size': 2, 'dim_t': 1876, 'num_overlap': 4},
-    'model': {   'attn_dropout': 0.0,
-                 'depth': 12,
-                 'dim': 256,
-                 'dim_freqs_in': 1025,
-                 'dim_head': 64,
-                 'ff_dropout': 0.0,
-                 'flash_attn': True,
-                 'freq_transformer_depth': 1,
-                 'freqs_per_bands': (   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 12, 12, 12,
-                                        12, 12, 12, 12, 12, 24, 24, 24, 24, 24, 24, 24, 24, 48, 48, 48, 48, 48, 48, 48, 48, 128, 129),
-                 'heads': 8,
-                 'linear_transformer_depth': 0,
-                 'mask_estimator_depth': 1,
-                 'mask_filter_bank': torch.tensor(dtype=torch.bool, data=[[1,1]+[0]*1023,[0,0,1,1]+[0]*1021,[0]*4+[1,1]+[0]*1019,[0]*6+[1,1]+[0]*1017,[0]*8+[1,1]+[0]*1015,[0]*10+[1,1]+[0]*1013,[0]*12+[1,1]+[0]*1011,[0]*14+[1,1]+[0]*1009,[0]*16+[1,1]+[0]*1007,[0]*18+[1,1]+[0]*1005,[0]*20+[1,1]+[0]*1003,[0]*22+[1,1]+[0]*1001,[0]*24+[1,1]+[0]*999,[0]*26+[1,1]+[0]*997,[0]*28+[1,1]+[0]*995,[0]*30+[1,1]+[0]*993,[0]*32+[1,1]+[0]*991,[0]*34+[1,1]+[0]*989,[0]*36+[1,1]+[0]*987,[0]*38+[1,1]+[0]*985,[0]*40+[1,1]+[0]*983,[0]*42+[1,1]+[0]*981,[0]*44+[1,1]+[0]*979,[0]*46+[1,1]+[0]*977,[0]*48+[1]*4+[0]*973,[0]*52+[1]*4+[0]*969,[0]*56+[1]*4+[0]*965,[0]*60+[1]*4+[0]*961,[0]*64+[1]*4+[0]*957,[0]*68+[1]*4+[0]*953,[0]*72+[1]*4+[0]*949,[0]*76+[1]*4+[0]*945,[0]*80+[1]*4+[0]*941,[0]*84+[1]*4+[0]*937,[0]*88+[1]*4+[0]*933,[0]*92+[1]*4+[0]*929,[0]*96+[1]*12+[0]*917,[0]*108+[1]*12+[0]*905,[0]*120+[1]*12+[0]*893,[0]*132+[1]*12+[0]*881,[0]*144+[1]*12+[0]*869,[0]*156+[1]*12+[0]*857,[0]*168+[1]*12+[0]*845,[0]*180+[1]*12+[0]*833,[0]*192+[1]*24+[0]*809,[0]*216+[1]*24+[0]*785,[0]*240+[1]*24+[0]*761,[0]*264+[1]*24+[0]*737,[0]*288+[1]*24+[0]*713,[0]*312+[1]*24+[0]*689,[0]*336+[1]*24+[0]*665,[0]*360+[1]*24+[0]*641,[0]*384+[1]*48+[0]*593,[0]*432+[1]*48+[0]*545,[0]*480+[1]*48+[0]*497,[0]*528+[1]*48+[0]*449,[0]*576+[1]*48+[0]*401,[0]*624+[1]*48+[0]*353,[0]*672+[1]*48+[0]*305,[0]*720+[1]*48+[0]*257,[0]*768+[1]*128+[0]*129,[0]*896+[1]*129]),
-                 'mlp_expansion_factor': 4,
-                 'multi_stft_hop_size': 147,
-                 'multi_stft_normalized': False,
-                 'multi_stft_resolution_loss_weight': 1.0,
-                 'multi_stft_resolutions_window_sizes': (4096, 2048, 1024, 512, 256),
-                 'num_stems': 1,
-                 'skip_connection': False,
-                 'stereo': True,
-                 'stft_hop_length': 512,
-                 'stft_n_fft': 2048,
-                 'stft_normalized': False,
-                 'stft_win_length': 1024,
-                 'time_transformer_depth': 1,
-                 'use_torch_checkpoint': True},
-    'training': {   'batch_size': 1,
-                    'coarse_loss_clip': True,
-                    'ema_momentum': 0.999,
-                    'grad_clip': 0,
-                    'gradient_accumulation_steps': 1,
-                    'instruments': ['vocals', 'instrument'],
-                    'lr': 1e-05,
-                    'num_epochs': 1000,
-                    'num_steps': 1000,
-                    'optimizer': 'adam',
-                    'other_fix': False,
-                    'patience': 5,
-                    'q': 0.95,
-                    'reduce_factor': 0.9,
-                    'target_instrument': 'vocals',
-                    'use_amp': True}}

 import torch
 modelConfiguration = {   'audio': {   'chunk_size': 960000,
+				'dim_f': 1024,
+				'dim_t': 801,
+				'hop_length': 441,
+				'min_mean_abs': 0.0001,
+				'n_fft': 2048,
+				'num_channels': 2,
+				'sample_rate': 44100},
+	'inference': {'batch_size': 2, 'dim_t': 1876, 'num_overlap': 4},
+	'model': {   'attn_dropout': 0.0,
+				'depth': 12,
+				'dim': 256,
+				'dim_freqs_in': 1025,
+				'dim_head': 64,
+				'ff_dropout': 0.0,
+				'flash_attn': True,
+				'freq_transformer_depth': 1,
+				'freqs_per_bands': (   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 12, 12, 12,
+										12, 12, 12, 12, 12, 24, 24, 24, 24, 24, 24, 24, 24, 48, 48, 48, 48, 48, 48, 48, 48, 128, 129),
+				'final_norm': True,
+				'heads': 8,
+				'linear_transformer_depth': 0,
+				'mask_estimator_depth': 1,
+				'mask_filter_bank': torch.tensor(dtype=torch.bool, data=[[1,1]+[0]*1023,[0,0,1,1]+[0]*1021,[0]*4+[1,1]+[0]*1019,[0]*6+[1,1]+[0]*1017,[0]*8+[1,1]+[0]*1015,[0]*10+[1,1]+[0]*1013,[0]*12+[1,1]+[0]*1011,[0]*14+[1,1]+[0]*1009,[0]*16+[1,1]+[0]*1007,[0]*18+[1,1]+[0]*1005,[0]*20+[1,1]+[0]*1003,[0]*22+[1,1]+[0]*1001,[0]*24+[1,1]+[0]*999,[0]*26+[1,1]+[0]*997,[0]*28+[1,1]+[0]*995,[0]*30+[1,1]+[0]*993,[0]*32+[1,1]+[0]*991,[0]*34+[1,1]+[0]*989,[0]*36+[1,1]+[0]*987,[0]*38+[1,1]+[0]*985,[0]*40+[1,1]+[0]*983,[0]*42+[1,1]+[0]*981,[0]*44+[1,1]+[0]*979,[0]*46+[1,1]+[0]*977,[0]*48+[1]*4+[0]*973,[0]*52+[1]*4+[0]*969,[0]*56+[1]*4+[0]*965,[0]*60+[1]*4+[0]*961,[0]*64+[1]*4+[0]*957,[0]*68+[1]*4+[0]*953,[0]*72+[1]*4+[0]*949,[0]*76+[1]*4+[0]*945,[0]*80+[1]*4+[0]*941,[0]*84+[1]*4+[0]*937,[0]*88+[1]*4+[0]*933,[0]*92+[1]*4+[0]*929,[0]*96+[1]*12+[0]*917,[0]*108+[1]*12+[0]*905,[0]*120+[1]*12+[0]*893,[0]*132+[1]*12+[0]*881,[0]*144+[1]*12+[0]*869,[0]*156+[1]*12+[0]*857,[0]*168+[1]*12+[0]*845,[0]*180+[1]*12+[0]*833,[0]*192+[1]*24+[0]*809,[0]*216+[1]*24+[0]*785,[0]*240+[1]*24+[0]*761,[0]*264+[1]*24+[0]*737,[0]*288+[1]*24+[0]*713,[0]*312+[1]*24+[0]*689,[0]*336+[1]*24+[0]*665,[0]*360+[1]*24+[0]*641,[0]*384+[1]*48+[0]*593,[0]*432+[1]*48+[0]*545,[0]*480+[1]*48+[0]*497,[0]*528+[1]*48+[0]*449,[0]*576+[1]*48+[0]*401,[0]*624+[1]*48+[0]*353,[0]*672+[1]*48+[0]*305,[0]*720+[1]*48+[0]*257,[0]*768+[1]*128+[0]*129,[0]*896+[1]*129]),
+				'mlp_expansion_factor': 4,
+				'multi_stft_hop_size': 147,
+				'multi_stft_normalized': False,
+				'multi_stft_resolution_loss_weight': 1.0,
+				'multi_stft_resolutions_window_sizes': (4096, 2048, 1024, 512, 256),
+                'norm_output': False,
+				'num_stems': 1,
+				'skip_connection': False,
+				'stereo': True,
+				'stft_hop_length': 512,
+				'stft_n_fft': 2048,
+				'stft_normalized': False,
+				'stft_win_length': 1024,
+				'time_transformer_depth': 1,
+				'use_torch_checkpoint': True,
+                'zero_dc': True},
+	'training': {   'batch_size': 1,
+					'coarse_loss_clip': True,
+					'ema_momentum': 0.999,
+					'grad_clip': 0,
+					'gradient_accumulation_steps': 1,
+					'instruments': ['vocals', 'instrument'],
+					'lr': 1e-05,
+					'num_epochs': 1000,
+					'num_steps': 1000,
+					'optimizer': 'adam',
+					'other_fix': False,
+					'patience': 5,
+					'q': 0.95,
+					'reduce_factor': 0.9,
+					'target_instrument': 'vocals',
+					'use_amp': True}}