| modelConfiguration = { 'audio': { 'chunk_size': 485100, | |
| 'dim_f': 1024, | |
| 'dim_t': 1101, | |
| 'hop_length': 441, | |
| 'min_mean_abs': 0.0, | |
| 'n_fft': 2048, | |
| 'num_channels': 2, | |
| 'sample_rate': 44100}, | |
| 'inference': {'batch_size': 4, 'dim_t': 1101, 'num_overlap': 2}, | |
| 'model': { 'attn_dropout': 0.0, | |
| 'depth': 12, | |
| 'dim': 512, | |
| 'dim_freqs_in': 1025, | |
| 'dim_head': 64, | |
| 'ff_dropout': 0.0, | |
| 'flash_attn': True, | |
| 'freq_transformer_depth': 1, | |
| 'freqs_per_bands': ( 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 12, 12, 12, | |
| 12, 12, 12, 12, 12, 24, 24, 24, 24, 24, 24, 24, 24, 48, 48, 48, 48, 48, 48, 48, 48, 128, 129), | |
| 'heads': 8, | |
| 'linear_transformer_depth': 0, | |
| 'mask_estimator_depth': 1, | |
| 'multi_stft_hop_size': 147, | |
| 'multi_stft_normalized': False, | |
| 'multi_stft_resolution_loss_weight': 1.0, | |
| 'multi_stft_resolutions_window_sizes': (4096, 2048, 1024, 512, 256), | |
| 'num_stems': 1, | |
| 'stereo': True, | |
| 'stft_hop_length': 441, | |
| 'stft_n_fft': 2048, | |
| 'stft_normalized': False, | |
| 'stft_win_length': 882, | |
| 'time_transformer_depth': 1}, | |
| 'training': { 'augmentation': False, | |
| 'augmentation_loudness': False, | |
| 'augmentation_loudness_max': 0, | |
| 'augmentation_loudness_min': 0, | |
| 'augmentation_loudness_type': 1, | |
| 'augmentation_mix': False, | |
| 'augmentation_type': None, | |
| 'batch_size': 1, | |
| 'coarse_loss_clip': False, | |
| 'ema_momentum': 0.999, | |
| 'grad_clip': 0, | |
| 'gradient_accumulation_steps': 1, | |
| 'instruments': ['vocals', 'other'], | |
| 'lr': 1e-05, | |
| 'num_epochs': 1000, | |
| 'num_steps': 1000, | |
| 'optimizer': 'adam', | |
| 'other_fix': True, | |
| 'patience': 2, | |
| 'q': 0.95, | |
| 'reduce_factor': 0.95, | |
| 'target_instrument': 'vocals', | |
| 'use_amp': True, | |
| 'use_mp3_compress': False}} | |