# Nested configuration mapping with four sections: "audio", "inference",
# "model" and "training". Values are plain Python literals; key order within
# each section is kept stable because dicts preserve insertion order.
modelConfiguration = {
    "audio": {
        # Numerically, chunk_size == hop_length * (dim_t - 1) == 441 * 1100.
        "chunk_size": 485100,
        "dim_f": 1024,
        "dim_t": 1101,
        "hop_length": 441,
        "min_mean_abs": 0.0,
        "n_fft": 2048,
        "num_channels": 2,
        "sample_rate": 44100,
    },
    "inference": {
        "batch_size": 4,
        "dim_t": 1101,
        "num_overlap": 2,
    },
    "model": {
        "attn_dropout": 0.0,
        "depth": 12,
        "dim": 512,
        # Numerically equal to stft_n_fft // 2 + 1.
        "dim_freqs_in": 1025,
        "dim_head": 64,
        "ff_dropout": 0.0,
        "flash_attn": True,
        "freq_transformer_depth": 1,
        # 62 entries in total; they sum to 1025, matching dim_freqs_in.
        "freqs_per_bands": (
            (2,) * 24
            + (4,) * 12
            + (12,) * 8
            + (24,) * 8
            + (48,) * 8
            + (128, 129)
        ),
        "heads": 8,
        "linear_transformer_depth": 0,
        "mask_estimator_depth": 1,
        "multi_stft_hop_size": 147,
        "multi_stft_normalized": False,
        "multi_stft_resolution_loss_weight": 1.0,
        "multi_stft_resolutions_window_sizes": (4096, 2048, 1024, 512, 256),
        "num_stems": 1,
        "stereo": True,
        "stft_hop_length": 441,
        "stft_n_fft": 2048,
        "stft_normalized": False,
        # NOTE(review): 882 == 2 * stft_hop_length, which is smaller than
        # stft_n_fft; presumably intentional -- confirm against the model code.
        "stft_win_length": 882,
        "time_transformer_depth": 1,
    },
    "training": {
        "augmentation": False,
        "augmentation_loudness": False,
        "augmentation_loudness_max": 0,
        "augmentation_loudness_min": 0,
        "augmentation_loudness_type": 1,
        "augmentation_mix": False,
        "augmentation_type": None,
        "batch_size": 1,
        "coarse_loss_clip": False,
        "ema_momentum": 0.999,
        "grad_clip": 0,
        "gradient_accumulation_steps": 1,
        "instruments": ["vocals", "other"],
        "lr": 1e-5,
        "num_epochs": 1000,
        "num_steps": 1000,
        "optimizer": "adam",
        "other_fix": True,
        "patience": 2,
        "q": 0.95,
        "reduce_factor": 0.95,
        "target_instrument": "vocals",
        "use_amp": True,
        "use_mp3_compress": False,
    },
}