hunterFormsBS / bs_roformer_revive2.py
hunterhogan's picture
Upload folder using huggingface_hub
157b122 verified
# Static configuration mapping with four sections: "audio", "inference",
# "model" and "training". Every value is a plain literal; nothing is read
# from the environment or computed from other modules.
modelConfiguration = {
    # Arithmetic relationships visible in the numbers below:
    #   chunk_size == hop_length * (dim_t - 1)  (441 * 1100 == 485100)
    #   n_fft // 2 + 1 == 1025, the same value as model["dim_freqs_in"]
    "audio": {
        "chunk_size": 485100,
        "dim_f": 1024,
        "dim_t": 1101,
        "hop_length": 441,
        "min_mean_abs": 0.0,
        "n_fft": 2048,
        "num_channels": 2,
        "sample_rate": 44100,
    },
    "inference": {
        "batch_size": 4,
        "dim_t": 1101,
        "num_overlap": 2,
    },
    "model": {
        "attn_dropout": 0.0,
        "depth": 12,
        "dim": 512,
        "dim_freqs_in": 1025,
        "dim_head": 64,
        "ff_dropout": 0.0,
        "flash_attn": True,
        "freq_transformer_depth": 1,
        # 62 band widths written run-length style; they add up to 1025,
        # i.e. the "dim_freqs_in" value above.
        "freqs_per_bands": (
            (2,) * 24
            + (4,) * 12
            + (12,) * 8
            + (24,) * 8
            + (48,) * 8
            + (128, 129)
        ),
        "heads": 8,
        "linear_transformer_depth": 0,
        "mask_estimator_depth": 1,
        "multi_stft_hop_size": 147,
        "multi_stft_normalized": False,
        "multi_stft_resolution_loss_weight": 1.0,
        "multi_stft_resolutions_window_sizes": (4096, 2048, 1024, 512, 256),
        "num_stems": 1,
        "stereo": True,
        # The stft_* hop and FFT sizes repeat the values in the "audio" section.
        "stft_hop_length": 441,
        "stft_n_fft": 2048,
        "stft_normalized": False,
        "stft_win_length": 882,
        "time_transformer_depth": 1,
    },
    "training": {
        # All augmentation switches are off in this configuration.
        "augmentation": False,
        "augmentation_loudness": False,
        "augmentation_loudness_max": 0,
        "augmentation_loudness_min": 0,
        "augmentation_loudness_type": 1,
        "augmentation_mix": False,
        "augmentation_type": None,
        "batch_size": 1,
        "coarse_loss_clip": False,
        "ema_momentum": 0.999,
        "grad_clip": 0,
        "gradient_accumulation_steps": 1,
        "instruments": ["vocals", "other"],
        "lr": 1e-5,
        "num_epochs": 1000,
        "num_steps": 1000,
        "optimizer": "adam",
        "other_fix": True,
        "patience": 2,
        "q": 0.95,
        "reduce_factor": 0.95,
        "target_instrument": "vocals",
        "use_amp": True,
        "use_mp3_compress": False,
    },
}