miercolesv committed
Commit 3cd03cd · verified · 1 Parent(s): 92aa29d

Add files using upload-large-folder tool

Files changed (50)
  1. Bandit-v2/.gitattributes +35 -0
  2. Bandit-v2/config_dnr_bandit_v2_mus64.yaml +77 -0
  3. MVSEP-MDX23-music-separation-model/.gitignore +160 -0
  4. MVSEP-MDX23-music-separation-model/README.md +99 -0
  5. MVSEP-MDX23-music-separation-model/demucs3/demucs.py +447 -0
  6. MVSEP-MDX23-music-separation-model/demucs3/hdemucs.py +782 -0
  7. MVSEP-MDX23-music-separation-model/demucs3/htdemucs.py +648 -0
  8. MVSEP-MDX23-music-separation-model/demucs3/spec.py +41 -0
  9. MVSEP-MDX23-music-separation-model/demucs3/states.py +148 -0
  10. MVSEP-MDX23-music-separation-model/demucs3/transformer.py +839 -0
  11. MVSEP-MDX23-music-separation-model/demucs3/utils.py +141 -0
  12. MVSEP-MDX23-music-separation-model/gui.py +411 -0
  13. MVSEP-MDX23-music-separation-model/images/MVSep-Window.png +0 -0
  14. MVSEP-MDX23-music-separation-model/inference.py +914 -0
  15. MVSEP-MDX23-music-separation-model/models/.gitkeep +1 -0
  16. MVSEP-MDX23-music-separation-model/requirements.txt +11 -0
  17. MVSEP-MDX23-music-separation-model/web-ui.py +172 -0
  18. Politrees/UVR_resources/.gitattributes +190 -0
  19. Politrees/UVR_resources/README.md +18 -0
  20. Politrees/UVR_resources/models/Apollo/apollo_edm_big_by_essid.yaml +114 -0
  21. Politrees/UVR_resources/models/Apollo/apollo_edm_by_essid.yaml +114 -0
  22. Politrees/UVR_resources/models/MDX23C/config_dereverb_mdx23c.yaml +135 -0
  23. Politrees/UVR_resources/models/MDX23C/config_drumsep_mdx23c.yaml +87 -0
  24. Politrees/UVR_resources/models/MDX23C/config_mdx23c_similarity.yaml +47 -0
  25. Politrees/UVR_resources/models/MDX23C/model_2_stem_061321.yaml +36 -0
  26. Politrees/UVR_resources/models/MDX23C/model_2_stem_full_band_8k.yaml +43 -0
  27. Politrees/UVR_resources/models/Roformer/BandSplit/config_BandSplit-Roformer_FNO_by-Unwa.yaml +136 -0
  28. Politrees/UVR_resources/models/Roformer/BandSplit/config_BandSplit-Roformer_Karaoke_Frazer_by-becruily.yaml +129 -0
  29. Politrees/UVR_resources/models/Roformer/BandSplit/config_BandSplit-Roformer_Resurrection_Instrumental_by-Unwa.yaml +138 -0
  30. Politrees/UVR_resources/models/Roformer/BandSplit/config_BandSplit-Roformer_Resurrection_Vocals_by-Unwa.yaml +138 -0
  31. Politrees/UVR_resources/models/Roformer/BandSplit/config_BandSplit-Roformer_SW_by-jarredou.yaml +197 -0
  32. Politrees/UVR_resources/models/Roformer/BandSplit/config_BandSplit_Roformer_4stems_FT_by_SYH99999.yaml +196 -0
  33. Politrees/UVR_resources/models/Roformer/BandSplit/config_bs_roformer_chorus_male_female.yaml +125 -0
  34. Politrees/UVR_resources/models/Roformer/BandSplit/config_bs_roformer_deverb_8_384dim_10depth.yaml +137 -0
  35. Politrees/UVR_resources/models/Roformer/BandSplit/config_bs_roformer_ep_317_sdr_12.9755.yaml +133 -0
  36. Politrees/UVR_resources/models/Roformer/BandSplit/config_bs_roformer_ep_368_sdr_12.9628.yaml +133 -0
  37. Politrees/UVR_resources/models/Roformer/BandSplit/config_bs_roformer_ep_937_sdr_10.5309.yaml +138 -0
  38. Politrees/UVR_resources/models/Roformer/BandSplit/config_bs_roformer_inst_exp_vrl.yaml +124 -0
  39. Politrees/UVR_resources/models/Roformer/BandSplit/config_bs_roformer_revive_by_unwa.yaml +134 -0
  40. Politrees/UVR_resources/models/Roformer/MelBand/config_MelBand-Roformer_BVE_by-Gonza.yaml +75 -0
  41. Politrees/UVR_resources/models/Roformer/MelBand/config_MelBand-Roformer_Duality_v1_by-Aname.yaml +72 -0
  42. Politrees/UVR_resources/models/Roformer/MelBand/config_MelBand-Roformer_Karaoke_Fusion_Total_by-Gonza.yaml +75 -0
  43. Politrees/UVR_resources/models/Roformer/MelBand/config_MelBand-Roformer_Karaoke_Fusion_by-Gonza.yaml +83 -0
  44. Politrees/UVR_resources/models/Roformer/MelBand/config_MelBand-Roformer_Karaoke_Fusion_v2_by-Gonza.yaml +83 -0
  45. Politrees/UVR_resources/models/Roformer/MelBand/config_MelBand_Roformer_4stems_FT_Large_by_SYH99999.yaml +69 -0
  46. Politrees/UVR_resources/models/Roformer/MelBand/config_MelBand_Roformer_4stems_Large_v1_by_Aname.yaml +167 -0
  47. Politrees/UVR_resources/models/SCnet/config_musdb18_scnet.yaml +83 -0
  48. Politrees/UVR_resources/models/SCnet/config_musdb18_scnet_large.yaml +88 -0
  49. Politrees/UVR_resources/models/SCnet/config_musdb18_scnet_large_starrytong.yaml +88 -0
  50. Politrees/UVR_resources/models/SCnet/config_musdb18_scnet_xl.yaml +207 -0
Bandit-v2/.gitattributes ADDED
@@ -0,0 +1,35 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
Bandit-v2/config_dnr_bandit_v2_mus64.yaml ADDED
@@ -0,0 +1,77 @@
1
+ cls: Bandit
2
+
3
+ audio:
4
+ chunk_size: 384000
5
+ num_channels: 2
6
+ sample_rate: 48000
7
+ min_mean_abs: 0.000
8
+
9
+ kwargs:
10
+ in_channels: 1
11
+ stems: ['speech', 'music', 'sfx']
12
+ band_type: musical
13
+ n_bands: 64
14
+ normalize_channel_independently: false
15
+ treat_channel_as_feature: true
16
+ n_sqm_modules: 8
17
+ emb_dim: 128
18
+ rnn_dim: 256
19
+ bidirectional: true
20
+ rnn_type: "GRU"
21
+ mlp_dim: 512
22
+ hidden_activation: "Tanh"
23
+ hidden_activation_kwargs: null
24
+ complex_mask: true
25
+ use_freq_weights: true
26
+ n_fft: 2048
27
+ win_length: 2048
28
+ hop_length: 512
29
+ window_fn: "hann_window"
30
+ wkwargs: null
31
+ power: null
32
+ center: true
33
+ normalized: true
34
+ pad_mode: "reflect"
35
+ onesided: true
36
+
37
+ training:
38
+ batch_size: 4
39
+ gradient_accumulation_steps: 4
40
+ grad_clip: 0
41
+ instruments:
42
+ - speech
43
+ - music
44
+ - sfx
45
+ lr: 9.0e-05
46
+ patience: 2
47
+ reduce_factor: 0.95
48
+ target_instrument: null
49
+ num_epochs: 1000
50
+ num_steps: 1000
51
+ q: 0.95
52
+ coarse_loss_clip: true
53
+ ema_momentum: 0.999
54
+ optimizer: adam
55
+ other_fix: true # needed to check on the multisong dataset whether 'other' is actually instrumental
56
+ use_amp: true # enable or disable mixed precision (float16) - usually it should be true
57
+
58
+ augmentations:
59
+ enable: true # enable or disable all augmentations (to quickly turn them all off if needed)
60
+ loudness: true # randomly change the loudness of each stem in the range (loudness_min; loudness_max)
61
+ loudness_min: 0.5
62
+ loudness_max: 1.5
63
+ mixup: true # mix several stems of the same type with some probability (only works for dataset types: 1, 2, 3)
64
+ mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
65
+ - 0.2
66
+ - 0.02
67
+ mixup_loudness_min: 0.5
68
+ mixup_loudness_max: 1.5
69
+ all:
70
+ channel_shuffle: 0.5 # Set 0 or lower to disable
71
+ random_inverse: 0.1 # reverse the track in time (a lower probability works better)
72
+ random_polarity: 0.5 # polarity change (multiply waveform by -1)
73
+
74
+ inference:
75
+ batch_size: 8
76
+ dim_t: 256
77
+ num_overlap: 4
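For reference, here is a minimal sketch of how a config like this could be loaded and inspected in Python. This is only an assumption about usage: the file path is illustrative, and the `!!python/tuple` tag on `mixup_probs` requires PyYAML's unsafe loader.

```
# Sketch only: load the Bandit-v2 config and inspect a few fields.
# Assumes PyYAML is installed; yaml.unsafe_load is needed because the
# config uses the !!python/tuple tag for augmentations.mixup_probs.
import yaml

with open("Bandit-v2/config_dnr_bandit_v2_mus64.yaml") as f:
    cfg = yaml.unsafe_load(f)

print(cfg["cls"])                   # "Bandit"
print(cfg["kwargs"]["stems"])       # ['speech', 'music', 'sfx']
print(cfg["audio"]["sample_rate"])  # 48000

# Hypothetical: a training script might build the model roughly like this
# (the actual Bandit constructor is not part of this commit's excerpt).
# model = Bandit(**cfg["kwargs"])
```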
MVSEP-MDX23-music-separation-model/.gitignore ADDED
@@ -0,0 +1,160 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/#use-with-ide
110
+ .pdm.toml
111
+
112
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113
+ __pypackages__/
114
+
115
+ # Celery stuff
116
+ celerybeat-schedule
117
+ celerybeat.pid
118
+
119
+ # SageMath parsed files
120
+ *.sage.py
121
+
122
+ # Environments
123
+ .env
124
+ .venv
125
+ env/
126
+ venv/
127
+ ENV/
128
+ env.bak/
129
+ venv.bak/
130
+
131
+ # Spyder project settings
132
+ .spyderproject
133
+ .spyproject
134
+
135
+ # Rope project settings
136
+ .ropeproject
137
+
138
+ # mkdocs documentation
139
+ /site
140
+
141
+ # mypy
142
+ .mypy_cache/
143
+ .dmypy.json
144
+ dmypy.json
145
+
146
+ # Pyre type checker
147
+ .pyre/
148
+
149
+ # pytype static type analyzer
150
+ .pytype/
151
+
152
+ # Cython debug symbols
153
+ cython_debug/
154
+
155
+ # PyCharm
156
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
159
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160
+ #.idea/
MVSEP-MDX23-music-separation-model/README.md ADDED
@@ -0,0 +1,99 @@
1
+ # MVSEP-MDX23-music-separation-model
2
+ Model for the [Sound demixing challenge 2023: Music Demixing Track - MDX'23](https://www.aicrowd.com/challenges/sound-demixing-challenge-2023). The model separates music into 4 stems: "bass", "drums", "vocals" and "other". It won 3rd place in the challenge (Leaderboard C).
3
+
4
+ The model is based on the [Demucs4](https://github.com/facebookresearch/demucs) and [MDX](https://github.com/kuielab/mdx-net) neural net architectures and uses some MDX weights from the [Ultimate Vocal Remover](https://github.com/Anjok07/ultimatevocalremovergui) project (thanks to [Kimberley Jensen](https://github.com/KimberleyJensen) for the great high-quality vocal models). Thanks to [@Ma5onic](https://github.com/Ma5onic) for the web UI and for help with dataset augmentation techniques. Brought to you by [MVSep.com](https://mvsep.com).
5
+ ## Usage
6
+
7
+ ```
8
+ python inference.py --input_audio mixture1.wav mixture2.wav --output_folder ./results/
9
+ ```
10
+
11
+ With this command, the audio files "mixture1.wav" and "mixture2.wav" will be processed and the results will be stored in the `./results/` folder in WAV format.
12
+
13
+ ### All available keys
14
+ * `--input_audio` - input audio location. You can provide multiple files at once. **Required**
15
+ * `--output_folder` - output audio folder. **Required**
16
+ * `--cpu` - choose CPU instead of GPU for processing. Can be very slow.
17
+ * `--overlap_large` - overlap of split audio chunks for light models. Closer to 1.0 is slower but gives better quality. Default: 0.6.
18
+ * `--overlap_small` - overlap of split audio chunks for heavy models. Closer to 1.0 is slower but gives better quality. Default: 0.5.
19
+ * `--single_onnx` - use only a single ONNX model for vocals. Can be useful if you don't have enough GPU memory.
20
+ * `--chunk_size` - chunk size for ONNX models. Set lower to reduce GPU memory consumption. Default: 1000000.
21
+ * `--large_gpu` - store all models on the GPU for faster processing of multiple audio files. Requires at least 11 GB of free GPU memory.
22
+ * `--use_kim_model_1` - use the first version of the Kim model (as it was in the contest).
23
+ * `--only_vocals` - only create the vocals and instrumental stems; skip bass, drums and other. Processing will be faster.
24
+
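The flags above can be combined freely. As a small, illustrative sketch (the file names and chunk size below are placeholders, not recommendations), the same call can also be driven from Python:

```
# Illustrative only: build and run the inference command from Python.
import subprocess

cmd = [
    "python", "inference.py",
    "--input_audio", "mixture1.wav", "mixture2.wav",
    "--output_folder", "./results/",
    "--only_vocals",            # skip bass/drums/other for speed
    "--chunk_size", "200000",   # smaller chunks use less GPU memory
]
subprocess.run(cmd, check=True)
```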
25
+ ### Notes
26
+ * If you don't have enough GPU memory you can use the CPU (`--cpu`), but it will be slow. Additionally you can use a single ONNX model (`--single_onnx`), which decreases quality a little bit. Reducing the chunk size can also help (`--chunk_size 200000`).
27
+ * The current revision of the code requires less GPU memory but processes multiple files more slowly. If you want the old, fast method, use the `--large_gpu` argument. It requires > 11 GB of GPU memory but works faster.
28
+ * There is a [Google Colab version](https://colab.research.google.com/github/jarredou/MVSEP-MDX23-Colab_v2/blob/main/MVSep-MDX23-Colab.ipynb) of this code.
29
+
30
+ ## Quality comparison
31
+
32
+ Quality comparison with the best separation models, performed on the [MultiSong Dataset](https://mvsep.com/quality_checker/leaderboard2.php?sort=bass).
33
+
34
+ | Algorithm | SDR bass | SDR drums | SDR other | SDR vocals | SDR instrumental |
35
+ | ------------- |:---------:|:----------:|:----------:|:----------:|:------------------:|
36
+ | MVSEP MDX23 | 12.5034 | 11.6870 | 6.5378 | 9.5138 | 15.8213 |
37
+ | Demucs HT 4 | 12.1006 | 11.3037 | 5.7728 | 8.3555 | 13.9902 |
38
+ | Demucs 3 | 10.6947 | 10.2744 | 5.3580 | 8.1335 | 14.4409 |
39
+ | MDX B | --- | ---- | --- | 8.5118 | 14.8192 |
40
+
41
+ * Note: SDR = signal-to-distortion ratio; larger is better.
42
+
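For reference, a minimal sketch of how a global SDR value can be computed (the leaderboard's exact evaluation may window and average the metric differently):

```
# Sketch: signal-to-distortion ratio in dB for one stem.
import numpy as np

def sdr(reference: np.ndarray, estimate: np.ndarray, eps: float = 1e-8) -> float:
    num = np.sum(reference ** 2)
    den = np.sum((reference - estimate) ** 2)
    return 10.0 * np.log10((num + eps) / (den + eps))

rng = np.random.default_rng(0)
ref = rng.standard_normal(48_000)
est = ref + 0.1 * rng.standard_normal(48_000)  # slightly noisy estimate
print(round(sdr(ref, est), 1))                 # roughly 20 dB
```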
43
+ ## GUI
44
+
45
+ ![GUI Window](https://github.com/ZFTurbo/MVSEP-MDX23-music-separation-model/blob/main/images/MVSep-Window.png)
46
+
47
+ * Script for GUI (based on PyQt5): [gui.py](gui.py).
48
+ * You can download the [standalone program for Windows here](https://github.com/ZFTurbo/MVSEP-MDX23-music-separation-model/releases/download/v1.0.1/MVSep-MDX23_v1.0.1.zip) (~730 MB). Unzip the archive and double-click `run.bat` to start the program. On the first run it will download PyTorch with CUDA support (~2.8 GB) and some neural net models.
49
+ * The program will download all needed neural net models from the internet on the first run.
50
+ * GUI supports Drag & Drop of multiple files.
51
+ * Progress bar available.
52
+
53
+ ## Web Interface
54
+ Executing `web-ui.py` with Python will start the web interface locally on `localhost` (127.0.0.1).
55
+ You'll see which port it is running on in the terminal output.
56
+
57
+ ![image](https://github.com/Ma5onic/MVSEP-MDX23-music-separation-model/assets/18509613/ae7130a5-60a4-4095-abbd-5290e84dcf7c)
58
+
59
+ * Browser-based user interface
60
+ * The program will download all needed neural net models from the internet on the first run.
61
+ * Supports drag & drop for audio upload (single file)
62
+
63
+ ![Web-UI Window](https://github.com/ZFTurbo/MVSEP-MDX23-music-separation-model/assets/18509613/4872f6aa-5896-44e9-8885-eaee1de3f4ee)
64
+
65
+
66
+ ## Changes
67
+
68
+ ### v1.0.1
69
+ * Settings in the GUI were updated; you can now control all available options.
70
+ * The Kim vocal model was updated from version 1 to version 2; you can still use version 1 via the `--use_kim_model_1` parameter.
71
+ * Added the option to generate only the vocals/instrumental pair if you don't need the bass, drums and other stems. Use the `--only_vocals` parameter.
72
+ * The standalone program was updated and is smaller now. The GUI will download torch/CUDA on the first run.
73
+
74
+ ## Citation
75
+
76
+ * [arxiv paper](https://arxiv.org/abs/2305.07489)
77
+
78
+ ```
79
+ @misc{solovyev2023benchmarks,
80
+ title={Benchmarks and leaderboards for sound demixing tasks},
81
+ author={Roman Solovyev and Alexander Stempkovskiy and Tatiana Habruseva},
82
+ year={2023},
83
+ eprint={2305.07489},
84
+ archivePrefix={arXiv},
85
+ primaryClass={cs.SD}
86
+ }
87
+
88
+ @article{fabbro2024sound,
89
+ title={The Sound Demixing Challenge 2023-Music Demixing Track.},
90
+ author={Fabbro, G., Uhlich, S., Lai, C.-H., Choi, W., Martínez-Ramírez, M., Liao, W., Gadelha, I., Ramos, G., Hsu, E., Rodrigues, H., Stöter, F.-R.,
91
+ Défossez, A., Luo, Y., Yu, J., Chakraborty, D., Mohanty, S., Solovyev, R., Stempkovskiy, A., Habruseva, T., Goswami, N., Harada, T., Kim, M.,
92
+ Lee, J. H., Dong, Y., Zhang, X., Liu, J., & Mitsufuji, Y},
93
+ journal={Trans. Int. Soc. Music. Inf. Retr.},
94
+ volume={7},
95
+ number={1},
96
+ pages={63--84},
97
+ year={2024}
98
+ }
99
+ ```
MVSEP-MDX23-music-separation-model/demucs3/demucs.py ADDED
@@ -0,0 +1,447 @@
1
+ # Copyright (c) Meta, Inc. and its affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import math
8
+ import typing as tp
9
+
10
+ import julius
11
+ import torch
12
+ from torch import nn
13
+ from torch.nn import functional as F
14
+
15
+ from .states import capture_init
16
+ from .utils import center_trim, unfold
17
+ from .transformer import LayerScale
18
+
19
+
20
+ class BLSTM(nn.Module):
21
+ """
22
+ BiLSTM with same hidden units as input dim.
23
+ If `max_steps` is not None, the input will be split into overlapping
24
+ chunks and the LSTM applied separately on each chunk.
25
+ """
26
+ def __init__(self, dim, layers=1, max_steps=None, skip=False):
27
+ super().__init__()
28
+ assert max_steps is None or max_steps % 4 == 0
29
+ self.max_steps = max_steps
30
+ self.lstm = nn.LSTM(bidirectional=True, num_layers=layers, hidden_size=dim, input_size=dim)
31
+ self.linear = nn.Linear(2 * dim, dim)
32
+ self.skip = skip
33
+
34
+ def forward(self, x):
35
+ B, C, T = x.shape
36
+ y = x
37
+ framed = False
38
+ if self.max_steps is not None and T > self.max_steps:
39
+ width = self.max_steps
40
+ stride = width // 2
41
+ frames = unfold(x, width, stride)
42
+ nframes = frames.shape[2]
43
+ framed = True
44
+ x = frames.permute(0, 2, 1, 3).reshape(-1, C, width)
45
+
46
+ x = x.permute(2, 0, 1)
47
+
48
+ x = self.lstm(x)[0]
49
+ x = self.linear(x)
50
+ x = x.permute(1, 2, 0)
51
+ if framed:
52
+ out = []
53
+ frames = x.reshape(B, -1, C, width)
54
+ limit = stride // 2
55
+ for k in range(nframes):
56
+ if k == 0:
57
+ out.append(frames[:, k, :, :-limit])
58
+ elif k == nframes - 1:
59
+ out.append(frames[:, k, :, limit:])
60
+ else:
61
+ out.append(frames[:, k, :, limit:-limit])
62
+ out = torch.cat(out, -1)
63
+ out = out[..., :T]
64
+ x = out
65
+ if self.skip:
66
+ x = x + y
67
+ return x
68
+
69
+
70
+ def rescale_conv(conv, reference):
71
+ """Rescale initial weight scale. It is unclear why it helps but it certainly does.
72
+ """
73
+ std = conv.weight.std().detach()
74
+ scale = (std / reference)**0.5
75
+ conv.weight.data /= scale
76
+ if conv.bias is not None:
77
+ conv.bias.data /= scale
78
+
79
+
80
+ def rescale_module(module, reference):
81
+ for sub in module.modules():
82
+ if isinstance(sub, (nn.Conv1d, nn.ConvTranspose1d, nn.Conv2d, nn.ConvTranspose2d)):
83
+ rescale_conv(sub, reference)
84
+
85
+
86
+ class DConv(nn.Module):
87
+ """
88
+ New residual branches in each encoder layer.
89
+ This alternates dilated convolutions, potentially with LSTMs and attention.
90
+ Also, before entering each residual branch, the dimension is projected onto a smaller subspace,
91
+ e.g. of dim `channels // compress`.
92
+ """
93
+ def __init__(self, channels: int, compress: float = 4, depth: int = 2, init: float = 1e-4,
94
+ norm=True, attn=False, heads=4, ndecay=4, lstm=False, gelu=True,
95
+ kernel=3, dilate=True):
96
+ """
97
+ Args:
98
+ channels: input/output channels for residual branch.
99
+ compress: amount of channel compression inside the branch.
100
+ depth: number of layers in the residual branch. Each layer has its own
101
+ projection, and potentially LSTM and attention.
102
+ init: initial scale for LayerNorm.
103
+ norm: use GroupNorm.
104
+ attn: use LocalAttention.
105
+ heads: number of heads for the LocalAttention.
106
+ ndecay: number of decay controls in the LocalAttention.
107
+ lstm: use LSTM.
108
+ gelu: Use GELU activation.
109
+ kernel: kernel size for the (dilated) convolutions.
110
+ dilate: if true, use dilation, increasing with the depth.
111
+ """
112
+
113
+ super().__init__()
114
+ assert kernel % 2 == 1
115
+ self.channels = channels
116
+ self.compress = compress
117
+ self.depth = abs(depth)
118
+ dilate = depth > 0
119
+
120
+ norm_fn: tp.Callable[[int], nn.Module]
121
+ norm_fn = lambda d: nn.Identity() # noqa
122
+ if norm:
123
+ norm_fn = lambda d: nn.GroupNorm(1, d) # noqa
124
+
125
+ hidden = int(channels / compress)
126
+
127
+ act: tp.Type[nn.Module]
128
+ if gelu:
129
+ act = nn.GELU
130
+ else:
131
+ act = nn.ReLU
132
+
133
+ self.layers = nn.ModuleList([])
134
+ for d in range(self.depth):
135
+ dilation = 2 ** d if dilate else 1
136
+ padding = dilation * (kernel // 2)
137
+ mods = [
138
+ nn.Conv1d(channels, hidden, kernel, dilation=dilation, padding=padding),
139
+ norm_fn(hidden), act(),
140
+ nn.Conv1d(hidden, 2 * channels, 1),
141
+ norm_fn(2 * channels), nn.GLU(1),
142
+ LayerScale(channels, init),
143
+ ]
144
+ if attn:
145
+ mods.insert(3, LocalState(hidden, heads=heads, ndecay=ndecay))
146
+ if lstm:
147
+ mods.insert(3, BLSTM(hidden, layers=2, max_steps=200, skip=True))
148
+ layer = nn.Sequential(*mods)
149
+ self.layers.append(layer)
150
+
151
+ def forward(self, x):
152
+ for layer in self.layers:
153
+ x = x + layer(x)
154
+ return x
155
+
156
+
157
+ class LocalState(nn.Module):
158
+ """Local state allows attention based only on the data (no positional embedding),
159
+ while setting a constraint on the time window (e.g. a decaying penalty term).
160
+
161
+ Also a failed experiment at providing some frequency-based attention.
162
+ """
163
+ def __init__(self, channels: int, heads: int = 4, nfreqs: int = 0, ndecay: int = 4):
164
+ super().__init__()
165
+ assert channels % heads == 0, (channels, heads)
166
+ self.heads = heads
167
+ self.nfreqs = nfreqs
168
+ self.ndecay = ndecay
169
+ self.content = nn.Conv1d(channels, channels, 1)
170
+ self.query = nn.Conv1d(channels, channels, 1)
171
+ self.key = nn.Conv1d(channels, channels, 1)
172
+ if nfreqs:
173
+ self.query_freqs = nn.Conv1d(channels, heads * nfreqs, 1)
174
+ if ndecay:
175
+ self.query_decay = nn.Conv1d(channels, heads * ndecay, 1)
176
+ # Initialize decay close to zero (there is a sigmoid), for maximum initial window.
177
+ self.query_decay.weight.data *= 0.01
178
+ assert self.query_decay.bias is not None # stupid type checker
179
+ self.query_decay.bias.data[:] = -2
180
+ self.proj = nn.Conv1d(channels + heads * nfreqs, channels, 1)
181
+
182
+ def forward(self, x):
183
+ B, C, T = x.shape
184
+ heads = self.heads
185
+ indexes = torch.arange(T, device=x.device, dtype=x.dtype)
186
+ # left index are keys, right index are queries
187
+ delta = indexes[:, None] - indexes[None, :]
188
+
189
+ queries = self.query(x).view(B, heads, -1, T)
190
+ keys = self.key(x).view(B, heads, -1, T)
191
+ # t are keys, s are queries
192
+ dots = torch.einsum("bhct,bhcs->bhts", keys, queries)
193
+ dots /= keys.shape[2]**0.5
194
+ if self.nfreqs:
195
+ periods = torch.arange(1, self.nfreqs + 1, device=x.device, dtype=x.dtype)
196
+ freq_kernel = torch.cos(2 * math.pi * delta / periods.view(-1, 1, 1))
197
+ freq_q = self.query_freqs(x).view(B, heads, -1, T) / self.nfreqs ** 0.5
198
+ dots += torch.einsum("fts,bhfs->bhts", freq_kernel, freq_q)
199
+ if self.ndecay:
200
+ decays = torch.arange(1, self.ndecay + 1, device=x.device, dtype=x.dtype)
201
+ decay_q = self.query_decay(x).view(B, heads, -1, T)
202
+ decay_q = torch.sigmoid(decay_q) / 2
203
+ decay_kernel = - decays.view(-1, 1, 1) * delta.abs() / self.ndecay**0.5
204
+ dots += torch.einsum("fts,bhfs->bhts", decay_kernel, decay_q)
205
+
206
+ # Kill self reference.
207
+ dots.masked_fill_(torch.eye(T, device=dots.device, dtype=torch.bool), -100)
208
+ weights = torch.softmax(dots, dim=2)
209
+
210
+ content = self.content(x).view(B, heads, -1, T)
211
+ result = torch.einsum("bhts,bhct->bhcs", weights, content)
212
+ if self.nfreqs:
213
+ time_sig = torch.einsum("bhts,fts->bhfs", weights, freq_kernel)
214
+ result = torch.cat([result, time_sig], 2)
215
+ result = result.reshape(B, -1, T)
216
+ return x + self.proj(result)
217
+
218
+
219
+ class Demucs(nn.Module):
220
+ @capture_init
221
+ def __init__(self,
222
+ sources,
223
+ # Channels
224
+ audio_channels=2,
225
+ channels=64,
226
+ growth=2.,
227
+ # Main structure
228
+ depth=6,
229
+ rewrite=True,
230
+ lstm_layers=0,
231
+ # Convolutions
232
+ kernel_size=8,
233
+ stride=4,
234
+ context=1,
235
+ # Activations
236
+ gelu=True,
237
+ glu=True,
238
+ # Normalization
239
+ norm_starts=4,
240
+ norm_groups=4,
241
+ # DConv residual branch
242
+ dconv_mode=1,
243
+ dconv_depth=2,
244
+ dconv_comp=4,
245
+ dconv_attn=4,
246
+ dconv_lstm=4,
247
+ dconv_init=1e-4,
248
+ # Pre/post processing
249
+ normalize=True,
250
+ resample=True,
251
+ # Weight init
252
+ rescale=0.1,
253
+ # Metadata
254
+ samplerate=44100,
255
+ segment=4 * 10):
256
+ """
257
+ Args:
258
+ sources (list[str]): list of source names
259
+ audio_channels (int): stereo or mono
260
+ channels (int): first convolution channels
261
+ depth (int): number of encoder/decoder layers
262
+ growth (float): multiply (resp divide) number of channels by that
263
+ for each layer of the encoder (resp decoder)
264
+ depth (int): number of layers in the encoder and in the decoder.
265
+ rewrite (bool): add 1x1 convolution to each layer.
266
+ lstm_layers (int): number of lstm layers, 0 = no lstm. Deactivated
267
+ by default, as this is now replaced by the smaller and faster LSTMs
268
+ in the DConv branches.
269
+ kernel_size (int): kernel size for convolutions
270
+ stride (int): stride for convolutions
271
+ context (int): kernel size of the convolution in the
272
+ decoder before the transposed convolution. If > 1,
273
+ will provide some context from neighboring time steps.
274
+ gelu: use GELU activation function.
275
+ glu (bool): use glu instead of ReLU for the 1x1 rewrite conv.
276
+ norm_starts: layer at which group norm starts being used.
277
+ decoder layers are numbered in reverse order.
278
+ norm_groups: number of groups for group norm.
279
+ dconv_mode: if 1: dconv in encoder only, 2: decoder only, 3: both.
280
+ dconv_depth: depth of residual DConv branch.
281
+ dconv_comp: compression of DConv branch.
282
+ dconv_attn: adds attention layers in DConv branch starting at this layer.
283
+ dconv_lstm: adds a LSTM layer in DConv branch starting at this layer.
284
+ dconv_init: initial scale for the DConv branch LayerScale.
285
+ normalize (bool): normalizes the input audio on the fly, and scales back
286
+ the output by the same amount.
287
+ resample (bool): upsample x2 the input and downsample /2 the output.
288
+ rescale (int): rescale initial weights of convolutions
289
+ to get their standard deviation closer to `rescale`.
290
+ samplerate (int): stored as meta information for easing
291
+ future evaluations of the model.
292
+ segment (float): duration of the chunks of audio to ideally evaluate the model on.
293
+ This is used by `demucs.apply.apply_model`.
294
+ """
295
+
296
+ super().__init__()
297
+ self.audio_channels = audio_channels
298
+ self.sources = sources
299
+ self.kernel_size = kernel_size
300
+ self.context = context
301
+ self.stride = stride
302
+ self.depth = depth
303
+ self.resample = resample
304
+ self.channels = channels
305
+ self.normalize = normalize
306
+ self.samplerate = samplerate
307
+ self.segment = segment
308
+ self.encoder = nn.ModuleList()
309
+ self.decoder = nn.ModuleList()
310
+ self.skip_scales = nn.ModuleList()
311
+
312
+ if glu:
313
+ activation = nn.GLU(dim=1)
314
+ ch_scale = 2
315
+ else:
316
+ activation = nn.ReLU()
317
+ ch_scale = 1
318
+ if gelu:
319
+ act2 = nn.GELU
320
+ else:
321
+ act2 = nn.ReLU
322
+
323
+ in_channels = audio_channels
324
+ padding = 0
325
+ for index in range(depth):
326
+ norm_fn = lambda d: nn.Identity() # noqa
327
+ if index >= norm_starts:
328
+ norm_fn = lambda d: nn.GroupNorm(norm_groups, d) # noqa
329
+
330
+ encode = []
331
+ encode += [
332
+ nn.Conv1d(in_channels, channels, kernel_size, stride),
333
+ norm_fn(channels),
334
+ act2(),
335
+ ]
336
+ attn = index >= dconv_attn
337
+ lstm = index >= dconv_lstm
338
+ if dconv_mode & 1:
339
+ encode += [DConv(channels, depth=dconv_depth, init=dconv_init,
340
+ compress=dconv_comp, attn=attn, lstm=lstm)]
341
+ if rewrite:
342
+ encode += [
343
+ nn.Conv1d(channels, ch_scale * channels, 1),
344
+ norm_fn(ch_scale * channels), activation]
345
+ self.encoder.append(nn.Sequential(*encode))
346
+
347
+ decode = []
348
+ if index > 0:
349
+ out_channels = in_channels
350
+ else:
351
+ out_channels = len(self.sources) * audio_channels
352
+ if rewrite:
353
+ decode += [
354
+ nn.Conv1d(channels, ch_scale * channels, 2 * context + 1, padding=context),
355
+ norm_fn(ch_scale * channels), activation]
356
+ if dconv_mode & 2:
357
+ decode += [DConv(channels, depth=dconv_depth, init=dconv_init,
358
+ compress=dconv_comp, attn=attn, lstm=lstm)]
359
+ decode += [nn.ConvTranspose1d(channels, out_channels,
360
+ kernel_size, stride, padding=padding)]
361
+ if index > 0:
362
+ decode += [norm_fn(out_channels), act2()]
363
+ self.decoder.insert(0, nn.Sequential(*decode))
364
+ in_channels = channels
365
+ channels = int(growth * channels)
366
+
367
+ channels = in_channels
368
+ if lstm_layers:
369
+ self.lstm = BLSTM(channels, lstm_layers)
370
+ else:
371
+ self.lstm = None
372
+
373
+ if rescale:
374
+ rescale_module(self, reference=rescale)
375
+
376
+ def valid_length(self, length):
377
+ """
378
+ Return the nearest valid length to use with the model so that
379
+ there are no time steps left over in a convolution, e.g. for all
380
+ layers, size of the input - kernel_size % stride = 0.
381
+
382
+ Note that inputs are automatically padded if necessary to ensure that the output
383
+ has the same length as the input.
384
+ """
385
+ if self.resample:
386
+ length *= 2
387
+
388
+ for _ in range(self.depth):
389
+ length = math.ceil((length - self.kernel_size) / self.stride) + 1
390
+ length = max(1, length)
391
+
392
+ for idx in range(self.depth):
393
+ length = (length - 1) * self.stride + self.kernel_size
394
+
395
+ if self.resample:
396
+ length = math.ceil(length / 2)
397
+ return int(length)
398
+
399
+ def forward(self, mix):
400
+ x = mix
401
+ length = x.shape[-1]
402
+
403
+ if self.normalize:
404
+ mono = mix.mean(dim=1, keepdim=True)
405
+ mean = mono.mean(dim=-1, keepdim=True)
406
+ std = mono.std(dim=-1, keepdim=True)
407
+ x = (x - mean) / (1e-5 + std)
408
+ else:
409
+ mean = 0
410
+ std = 1
411
+
412
+ delta = self.valid_length(length) - length
413
+ x = F.pad(x, (delta // 2, delta - delta // 2))
414
+
415
+ if self.resample:
416
+ x = julius.resample_frac(x, 1, 2)
417
+
418
+ saved = []
419
+ for encode in self.encoder:
420
+ x = encode(x)
421
+ saved.append(x)
422
+
423
+ if self.lstm:
424
+ x = self.lstm(x)
425
+
426
+ for decode in self.decoder:
427
+ skip = saved.pop(-1)
428
+ skip = center_trim(skip, x)
429
+ x = decode(x + skip)
430
+
431
+ if self.resample:
432
+ x = julius.resample_frac(x, 2, 1)
433
+ x = x * std + mean
434
+ x = center_trim(x, length)
435
+ x = x.view(x.size(0), len(self.sources), self.audio_channels, x.size(-1))
436
+ return x
437
+
438
+ def load_state_dict(self, state, strict=True):
439
+ # fix a mismatch with previous generation Demucs models.
440
+ for idx in range(self.depth):
441
+ for a in ['encoder', 'decoder']:
442
+ for b in ['bias', 'weight']:
443
+ new = f'{a}.{idx}.3.{b}'
444
+ old = f'{a}.{idx}.2.{b}'
445
+ if old in state and new not in state:
446
+ state[new] = state.pop(old)
447
+ super().load_state_dict(state, strict=strict)
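A minimal usage sketch for the `Demucs` module defined above (assuming the `demucs3` package directory is importable and its dependencies such as `torch` and `julius` are installed; the source list and input length are arbitrary):

```
# Sketch: run an (untrained) Demucs model on random stereo audio.
import torch
from demucs3.demucs import Demucs

model = Demucs(sources=["drums", "bass", "other", "vocals"], audio_channels=2)
mix = torch.randn(1, 2, 44100 * 4)   # (batch, channels, samples)
with torch.no_grad():
    out = model(mix)
print(out.shape)                      # torch.Size([1, 4, 2, 176400])
```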
MVSEP-MDX23-music-separation-model/demucs3/hdemucs.py ADDED
@@ -0,0 +1,782 @@
1
+ # Copyright (c) Meta, Inc. and its affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ """
7
+ This code contains the spectrogram and Hybrid version of Demucs.
8
+ """
9
+ from copy import deepcopy
10
+ import math
11
+ import typing as tp
12
+
13
+ from openunmix.filtering import wiener
14
+ import torch
15
+ from torch import nn
16
+ from torch.nn import functional as F
17
+
18
+ from .demucs import DConv, rescale_module
19
+ from .states import capture_init
20
+ from .spec import spectro, ispectro
21
+
22
+
23
+ def pad1d(x: torch.Tensor, paddings: tp.Tuple[int, int], mode: str = 'constant', value: float = 0.):
24
+ """Tiny wrapper around F.pad, just to allow for reflect padding on small input.
25
+ If this is the case, we insert extra 0 padding to the right before the reflection happens."""
26
+ x0 = x
27
+ length = x.shape[-1]
28
+ padding_left, padding_right = paddings
29
+ if mode == 'reflect':
30
+ max_pad = max(padding_left, padding_right)
31
+ if length <= max_pad:
32
+ extra_pad = max_pad - length + 1
33
+ extra_pad_right = min(padding_right, extra_pad)
34
+ extra_pad_left = extra_pad - extra_pad_right
35
+ paddings = (padding_left - extra_pad_left, padding_right - extra_pad_right)
36
+ x = F.pad(x, (extra_pad_left, extra_pad_right))
37
+ out = F.pad(x, paddings, mode, value)
38
+ assert out.shape[-1] == length + padding_left + padding_right
39
+ assert (out[..., padding_left: padding_left + length] == x0).all()
40
+ return out
41
+
42
+
43
+ class ScaledEmbedding(nn.Module):
44
+ """
45
+ Boost learning rate for embeddings (with `scale`).
46
+ Also, can make embeddings continuous with `smooth`.
47
+ """
48
+ def __init__(self, num_embeddings: int, embedding_dim: int,
49
+ scale: float = 10., smooth=False):
50
+ super().__init__()
51
+ self.embedding = nn.Embedding(num_embeddings, embedding_dim)
52
+ if smooth:
53
+ weight = torch.cumsum(self.embedding.weight.data, dim=0)
54
+ # when summing gaussians, the overall scale rises as sqrt(n), so we normalize by that.
55
+ weight = weight / torch.arange(1, num_embeddings + 1).to(weight).sqrt()[:, None]
56
+ self.embedding.weight.data[:] = weight
57
+ self.embedding.weight.data /= scale
58
+ self.scale = scale
59
+
60
+ @property
61
+ def weight(self):
62
+ return self.embedding.weight * self.scale
63
+
64
+ def forward(self, x):
65
+ out = self.embedding(x) * self.scale
66
+ return out
67
+
68
+
69
+ class HEncLayer(nn.Module):
70
+ def __init__(self, chin, chout, kernel_size=8, stride=4, norm_groups=1, empty=False,
71
+ freq=True, dconv=True, norm=True, context=0, dconv_kw={}, pad=True,
72
+ rewrite=True):
73
+ """Encoder layer. This is used by both the time and the frequency branch.
74
+
75
+ Args:
76
+ chin: number of input channels.
77
+ chout: number of output channels.
78
+ norm_groups: number of groups for group norm.
79
+ empty: used to make a layer with just the first conv. this is used
80
+ before merging the time and freq. branches.
81
+ freq: this is acting on frequencies.
82
+ dconv: insert DConv residual branches.
83
+ norm: use GroupNorm.
84
+ context: context size for the 1x1 conv.
85
+ dconv_kw: list of kwargs for the DConv class.
86
+ pad: pad the input. Padding is done so that the output size is
87
+ always the input size / stride.
88
+ rewrite: add 1x1 conv at the end of the layer.
89
+ """
90
+ super().__init__()
91
+ norm_fn = lambda d: nn.Identity() # noqa
92
+ if norm:
93
+ norm_fn = lambda d: nn.GroupNorm(norm_groups, d) # noqa
94
+ if pad:
95
+ pad = kernel_size // 4
96
+ else:
97
+ pad = 0
98
+ klass = nn.Conv1d
99
+ self.freq = freq
100
+ self.kernel_size = kernel_size
101
+ self.stride = stride
102
+ self.empty = empty
103
+ self.norm = norm
104
+ self.pad = pad
105
+ if freq:
106
+ kernel_size = [kernel_size, 1]
107
+ stride = [stride, 1]
108
+ pad = [pad, 0]
109
+ klass = nn.Conv2d
110
+ self.conv = klass(chin, chout, kernel_size, stride, pad)
111
+ if self.empty:
112
+ return
113
+ self.norm1 = norm_fn(chout)
114
+ self.rewrite = None
115
+ if rewrite:
116
+ self.rewrite = klass(chout, 2 * chout, 1 + 2 * context, 1, context)
117
+ self.norm2 = norm_fn(2 * chout)
118
+
119
+ self.dconv = None
120
+ if dconv:
121
+ self.dconv = DConv(chout, **dconv_kw)
122
+
123
+ def forward(self, x, inject=None):
124
+ """
125
+ `inject` is used to inject the result from the time branch into the frequency branch,
126
+ when both have the same stride.
127
+ """
128
+ if not self.freq and x.dim() == 4:
129
+ B, C, Fr, T = x.shape
130
+ x = x.view(B, -1, T)
131
+
132
+ if not self.freq:
133
+ le = x.shape[-1]
134
+ if not le % self.stride == 0:
135
+ x = F.pad(x, (0, self.stride - (le % self.stride)))
136
+ y = self.conv(x)
137
+ if self.empty:
138
+ return y
139
+ if inject is not None:
140
+ assert inject.shape[-1] == y.shape[-1], (inject.shape, y.shape)
141
+ if inject.dim() == 3 and y.dim() == 4:
142
+ inject = inject[:, :, None]
143
+ y = y + inject
144
+ y = F.gelu(self.norm1(y))
145
+ if self.dconv:
146
+ if self.freq:
147
+ B, C, Fr, T = y.shape
148
+ y = y.permute(0, 2, 1, 3).reshape(-1, C, T)
149
+ y = self.dconv(y)
150
+ if self.freq:
151
+ y = y.view(B, Fr, C, T).permute(0, 2, 1, 3)
152
+ if self.rewrite:
153
+ z = self.norm2(self.rewrite(y))
154
+ z = F.glu(z, dim=1)
155
+ else:
156
+ z = y
157
+ return z
158
+
159
+
160
+ class MultiWrap(nn.Module):
161
+ """
162
+ Takes one layer and replicates it N times. Each replica will act
163
+ on a frequency band. All is done so that if the N replicas have the same weights,
164
+ then this is exactly equivalent to applying the original module on all frequencies.
165
+
166
+ This is a bit over-engineered to avoid edge artifacts when splitting
167
+ the frequency bands, but it is possible the naive implementation would work as well...
168
+ """
169
+ def __init__(self, layer, split_ratios):
170
+ """
171
+ Args:
172
+ layer: module to clone, must be either HEncLayer or HDecLayer.
173
+ split_ratios: list of float indicating which ratio to keep for each band.
174
+ """
175
+ super().__init__()
176
+ self.split_ratios = split_ratios
177
+ self.layers = nn.ModuleList()
178
+ self.conv = isinstance(layer, HEncLayer)
179
+ assert not layer.norm
180
+ assert layer.freq
181
+ assert layer.pad
182
+ if not self.conv:
183
+ assert not layer.context_freq
184
+ for k in range(len(split_ratios) + 1):
185
+ lay = deepcopy(layer)
186
+ if self.conv:
187
+ lay.conv.padding = (0, 0)
188
+ else:
189
+ lay.pad = False
190
+ for m in lay.modules():
191
+ if hasattr(m, 'reset_parameters'):
192
+ m.reset_parameters()
193
+ self.layers.append(lay)
194
+
195
+ def forward(self, x, skip=None, length=None):
196
+ B, C, Fr, T = x.shape
197
+
198
+ ratios = list(self.split_ratios) + [1]
199
+ start = 0
200
+ outs = []
201
+ for ratio, layer in zip(ratios, self.layers):
202
+ if self.conv:
203
+ pad = layer.kernel_size // 4
204
+ if ratio == 1:
205
+ limit = Fr
206
+ frames = -1
207
+ else:
208
+ limit = int(round(Fr * ratio))
209
+ le = limit - start
210
+ if start == 0:
211
+ le += pad
212
+ frames = round((le - layer.kernel_size) / layer.stride + 1)
213
+ limit = start + (frames - 1) * layer.stride + layer.kernel_size
214
+ if start == 0:
215
+ limit -= pad
216
+ assert limit - start > 0, (limit, start)
217
+ assert limit <= Fr, (limit, Fr)
218
+ y = x[:, :, start:limit, :]
219
+ if start == 0:
220
+ y = F.pad(y, (0, 0, pad, 0))
221
+ if ratio == 1:
222
+ y = F.pad(y, (0, 0, 0, pad))
223
+ outs.append(layer(y))
224
+ start = limit - layer.kernel_size + layer.stride
225
+ else:
226
+ if ratio == 1:
227
+ limit = Fr
228
+ else:
229
+ limit = int(round(Fr * ratio))
230
+ last = layer.last
231
+ layer.last = True
232
+
233
+ y = x[:, :, start:limit]
234
+ s = skip[:, :, start:limit]
235
+ out, _ = layer(y, s, None)
236
+ if outs:
237
+ outs[-1][:, :, -layer.stride:] += (
238
+ out[:, :, :layer.stride] - layer.conv_tr.bias.view(1, -1, 1, 1))
239
+ out = out[:, :, layer.stride:]
240
+ if ratio == 1:
241
+ out = out[:, :, :-layer.stride // 2, :]
242
+ if start == 0:
243
+ out = out[:, :, layer.stride // 2:, :]
244
+ outs.append(out)
245
+ layer.last = last
246
+ start = limit
247
+ out = torch.cat(outs, dim=2)
248
+ if not self.conv and not last:
249
+ out = F.gelu(out)
250
+ if self.conv:
251
+ return out
252
+ else:
253
+ return out, None
254
+
255
+
256
+ class HDecLayer(nn.Module):
257
+ def __init__(self, chin, chout, last=False, kernel_size=8, stride=4, norm_groups=1, empty=False,
258
+ freq=True, dconv=True, norm=True, context=1, dconv_kw={}, pad=True,
259
+ context_freq=True, rewrite=True):
260
+ """
261
+ Same as HEncLayer but for decoder. See `HEncLayer` for documentation.
262
+ """
263
+ super().__init__()
264
+ norm_fn = lambda d: nn.Identity() # noqa
265
+ if norm:
266
+ norm_fn = lambda d: nn.GroupNorm(norm_groups, d) # noqa
267
+ if pad:
268
+ pad = kernel_size // 4
269
+ else:
270
+ pad = 0
271
+ self.pad = pad
272
+ self.last = last
273
+ self.freq = freq
274
+ self.chin = chin
275
+ self.empty = empty
276
+ self.stride = stride
277
+ self.kernel_size = kernel_size
278
+ self.norm = norm
279
+ self.context_freq = context_freq
280
+ klass = nn.Conv1d
281
+ klass_tr = nn.ConvTranspose1d
282
+ if freq:
283
+ kernel_size = [kernel_size, 1]
284
+ stride = [stride, 1]
285
+ klass = nn.Conv2d
286
+ klass_tr = nn.ConvTranspose2d
287
+ self.conv_tr = klass_tr(chin, chout, kernel_size, stride)
288
+ self.norm2 = norm_fn(chout)
289
+ if self.empty:
290
+ return
291
+ self.rewrite = None
292
+ if rewrite:
293
+ if context_freq:
294
+ self.rewrite = klass(chin, 2 * chin, 1 + 2 * context, 1, context)
295
+ else:
296
+ self.rewrite = klass(chin, 2 * chin, [1, 1 + 2 * context], 1,
297
+ [0, context])
298
+ self.norm1 = norm_fn(2 * chin)
299
+
300
+ self.dconv = None
301
+ if dconv:
302
+ self.dconv = DConv(chin, **dconv_kw)
303
+
304
+ def forward(self, x, skip, length):
305
+ if self.freq and x.dim() == 3:
306
+ B, C, T = x.shape
307
+ x = x.view(B, self.chin, -1, T)
308
+
309
+ if not self.empty:
310
+ x = x + skip
311
+
312
+ if self.rewrite:
313
+ y = F.glu(self.norm1(self.rewrite(x)), dim=1)
314
+ else:
315
+ y = x
316
+ if self.dconv:
317
+ if self.freq:
318
+ B, C, Fr, T = y.shape
319
+ y = y.permute(0, 2, 1, 3).reshape(-1, C, T)
320
+ y = self.dconv(y)
321
+ if self.freq:
322
+ y = y.view(B, Fr, C, T).permute(0, 2, 1, 3)
323
+ else:
324
+ y = x
325
+ assert skip is None
326
+ z = self.norm2(self.conv_tr(y))
327
+ if self.freq:
328
+ if self.pad:
329
+ z = z[..., self.pad:-self.pad, :]
330
+ else:
331
+ z = z[..., self.pad:self.pad + length]
332
+ assert z.shape[-1] == length, (z.shape[-1], length)
333
+ if not self.last:
334
+ z = F.gelu(z)
335
+ return z, y
336
+
337
+
338
+ class HDemucs(nn.Module):
339
+ """
340
+ Spectrogram and hybrid Demucs model.
341
+ The spectrogram model has the same structure as Demucs, except the first few layers are over the
342
+ frequency axis, until there is only 1 frequency, and then it moves to time convolutions.
343
+ Frequency layers can still access information across time steps thanks to the DConv residual.
344
+
345
+ Hybrid models have a parallel time branch. At some layer, the time branch has the same stride
346
+ as the frequency branch and then the two are combined. The opposite happens in the decoder.
347
+
348
+ Models can either use naive iSTFT from masking, Wiener filtering ([Uhlich et al. 2017]),
349
+ or complex as channels (CaC) [Choi et al. 2020]. Wiener filtering is based on
350
+ Open Unmix implementation [Stoter et al. 2019].
351
+
352
+ The loss is always in the temporal domain, obtained by backpropagating through the above
353
+ output methods and the iSTFT. This allows defining hybrid models nicely. However, this somewhat breaks
354
+ Wiener filtering, as doing more iterations at test time will change the spectrogram
355
+ contribution, without changing the one from the waveform, which will lead to worse performance.
356
+ I tried using the residual option in the OpenUnmix Wiener implementation, but it didn't improve results.
357
+ CaC on the other hand provides similar performance for hybrid, and works naturally with
358
+ hybrid models.
359
+
360
+ This model also uses frequency embeddings to improve the efficiency of convolutions
361
+ over the freq. axis, following [Isik et al. 2020] (https://arxiv.org/pdf/2008.04470.pdf).
362
+
363
+ Unlike classic Demucs, there is no resampling here, and normalization is always applied.
364
+ """
365
+ @capture_init
366
+ def __init__(self,
367
+ sources,
368
+ # Channels
369
+ audio_channels=2,
370
+ channels=48,
371
+ channels_time=None,
372
+ growth=2,
373
+ # STFT
374
+ nfft=4096,
375
+ wiener_iters=0,
376
+ end_iters=0,
377
+ wiener_residual=False,
378
+ cac=True,
379
+ # Main structure
380
+ depth=6,
381
+ rewrite=True,
382
+ hybrid=True,
383
+ hybrid_old=False,
384
+ # Frequency branch
385
+ multi_freqs=None,
386
+ multi_freqs_depth=2,
387
+ freq_emb=0.2,
388
+ emb_scale=10,
389
+ emb_smooth=True,
390
+ # Convolutions
391
+ kernel_size=8,
392
+ time_stride=2,
393
+ stride=4,
394
+ context=1,
395
+ context_enc=0,
396
+ # Normalization
397
+ norm_starts=4,
398
+ norm_groups=4,
399
+ # DConv residual branch
400
+ dconv_mode=1,
401
+ dconv_depth=2,
402
+ dconv_comp=4,
403
+ dconv_attn=4,
404
+ dconv_lstm=4,
405
+ dconv_init=1e-4,
406
+ # Weight init
407
+ rescale=0.1,
408
+ # Metadata
409
+ samplerate=44100,
410
+ segment=4 * 10):
411
+ """
412
+ Args:
413
+ sources (list[str]): list of source names.
414
+ audio_channels (int): input/output audio channels.
415
+ channels (int): initial number of hidden channels.
416
+ channels_time: if not None, use a different `channels` value for the time branch.
417
+ growth: increase the number of hidden channels by this factor at each layer.
418
+ nfft: number of fft bins. Note that changing this requires careful computation of
419
+ various shape parameters and will not work out of the box for hybrid models.
420
+ wiener_iters: when using Wiener filtering, number of iterations at test time.
421
+ end_iters: same but at train time. For a hybrid model, must be equal to `wiener_iters`.
422
+ wiener_residual: add residual source before wiener filtering.
423
+ cac: uses complex as channels, i.e. complex numbers are 2 channels each
424
+ in input and output. no further processing is done before ISTFT.
425
+ depth (int): number of layers in the encoder and in the decoder.
426
+ rewrite (bool): add 1x1 convolution to each layer.
427
+ hybrid (bool): make a hybrid time/frequency domain, otherwise frequency only.
428
+ hybrid_old: some models trained for MDX had a padding bug. This replicates
429
+ this bug to avoid retraining them.
430
+ multi_freqs: list of frequency ratios for splitting frequency bands with `MultiWrap`.
431
+ multi_freqs_depth: how many layers to wrap with `MultiWrap`. Only the outermost
432
+ layers will be wrapped.
433
+ freq_emb: add frequency embedding after the first frequency layer if > 0,
434
+ the actual value controls the weight of the embedding.
435
+ emb_scale: equivalent to scaling the embedding learning rate
436
+ emb_smooth: initialize the embedding with a smooth one (with respect to frequencies).
437
+ kernel_size: kernel_size for encoder and decoder layers.
438
+ stride: stride for encoder and decoder layers.
439
+ time_stride: stride for the final time layer, after the merge.
440
+ context: context for 1x1 conv in the decoder.
441
+ context_enc: context for 1x1 conv in the encoder.
442
+ norm_starts: layer at which group norm starts being used.
443
+ decoder layers are numbered in reverse order.
444
+ norm_groups: number of groups for group norm.
445
+ dconv_mode: if 1: dconv in encoder only, 2: decoder only, 3: both.
446
+ dconv_depth: depth of residual DConv branch.
447
+ dconv_comp: compression of DConv branch.
448
+ dconv_attn: adds attention layers in DConv branch starting at this layer.
449
+ dconv_lstm: adds a LSTM layer in DConv branch starting at this layer.
450
+ dconv_init: initial scale for the DConv branch LayerScale.
451
+ rescale: weight rescaling trick
452
+
453
+ """
454
+ super().__init__()
455
+ self.cac = cac
456
+ self.wiener_residual = wiener_residual
457
+ self.audio_channels = audio_channels
458
+ self.sources = sources
459
+ self.kernel_size = kernel_size
460
+ self.context = context
461
+ self.stride = stride
462
+ self.depth = depth
463
+ self.channels = channels
464
+ self.samplerate = samplerate
465
+ self.segment = segment
466
+
467
+ self.nfft = nfft
468
+ self.hop_length = nfft // 4
469
+ self.wiener_iters = wiener_iters
470
+ self.end_iters = end_iters
471
+ self.freq_emb = None
472
+ self.hybrid = hybrid
473
+ self.hybrid_old = hybrid_old
474
+ if hybrid_old:
475
+ assert hybrid, "hybrid_old must come with hybrid=True"
476
+ if hybrid:
477
+ assert wiener_iters == end_iters
478
+
479
+ self.encoder = nn.ModuleList()
480
+ self.decoder = nn.ModuleList()
481
+
482
+ if hybrid:
483
+ self.tencoder = nn.ModuleList()
484
+ self.tdecoder = nn.ModuleList()
485
+
486
+ chin = audio_channels
487
+ chin_z = chin # number of channels for the freq branch
488
+ if self.cac:
489
+ chin_z *= 2
490
+ chout = channels_time or channels
491
+ chout_z = channels
492
+ freqs = nfft // 2
493
+
494
+ for index in range(depth):
495
+ lstm = index >= dconv_lstm
496
+ attn = index >= dconv_attn
497
+ norm = index >= norm_starts
498
+ freq = freqs > 1
499
+ stri = stride
500
+ ker = kernel_size
501
+ if not freq:
502
+ assert freqs == 1
503
+ ker = time_stride * 2
504
+ stri = time_stride
505
+
506
+ pad = True
507
+ last_freq = False
508
+ if freq and freqs <= kernel_size:
509
+ ker = freqs
510
+ pad = False
511
+ last_freq = True
512
+
513
+ kw = {
514
+ 'kernel_size': ker,
515
+ 'stride': stri,
516
+ 'freq': freq,
517
+ 'pad': pad,
518
+ 'norm': norm,
519
+ 'rewrite': rewrite,
520
+ 'norm_groups': norm_groups,
521
+ 'dconv_kw': {
522
+ 'lstm': lstm,
523
+ 'attn': attn,
524
+ 'depth': dconv_depth,
525
+ 'compress': dconv_comp,
526
+ 'init': dconv_init,
527
+ 'gelu': True,
528
+ }
529
+ }
530
+ kwt = dict(kw)
531
+ kwt['freq'] = 0
532
+ kwt['kernel_size'] = kernel_size
533
+ kwt['stride'] = stride
534
+ kwt['pad'] = True
535
+ kw_dec = dict(kw)
536
+ multi = False
537
+ if multi_freqs and index < multi_freqs_depth:
538
+ multi = True
539
+ kw_dec['context_freq'] = False
540
+
541
+ if last_freq:
542
+ chout_z = max(chout, chout_z)
543
+ chout = chout_z
544
+
545
+ enc = HEncLayer(chin_z, chout_z,
546
+ dconv=dconv_mode & 1, context=context_enc, **kw)
547
+ if hybrid and freq:
548
+ tenc = HEncLayer(chin, chout, dconv=dconv_mode & 1, context=context_enc,
549
+ empty=last_freq, **kwt)
550
+ self.tencoder.append(tenc)
551
+
552
+ if multi:
553
+ enc = MultiWrap(enc, multi_freqs)
554
+ self.encoder.append(enc)
555
+ if index == 0:
556
+ chin = self.audio_channels * len(self.sources)
557
+ chin_z = chin
558
+ if self.cac:
559
+ chin_z *= 2
560
+ dec = HDecLayer(chout_z, chin_z, dconv=dconv_mode & 2,
561
+ last=index == 0, context=context, **kw_dec)
562
+ if multi:
563
+ dec = MultiWrap(dec, multi_freqs)
564
+ if hybrid and freq:
565
+ tdec = HDecLayer(chout, chin, dconv=dconv_mode & 2, empty=last_freq,
566
+ last=index == 0, context=context, **kwt)
567
+ self.tdecoder.insert(0, tdec)
568
+ self.decoder.insert(0, dec)
569
+
570
+ chin = chout
571
+ chin_z = chout_z
572
+ chout = int(growth * chout)
573
+ chout_z = int(growth * chout_z)
574
+ if freq:
575
+ if freqs <= kernel_size:
576
+ freqs = 1
577
+ else:
578
+ freqs //= stride
579
+ if index == 0 and freq_emb:
580
+ self.freq_emb = ScaledEmbedding(
581
+ freqs, chin_z, smooth=emb_smooth, scale=emb_scale)
582
+ self.freq_emb_scale = freq_emb
583
+
584
+ if rescale:
585
+ rescale_module(self, reference=rescale)
586
+
587
+ def _spec(self, x):
588
+ hl = self.hop_length
589
+ nfft = self.nfft
590
+ x0 = x # noqa
591
+
592
+ if self.hybrid:
593
+ # We re-pad the signal in order to keep the property
594
+ # that the size of the output is exactly the size of the input
595
+ # divided by the stride (here hop_length), when divisible.
596
+ # This is achieved by padding by 1/4th of the kernel size (here nfft),
597
+ # which is not supported by torch.stft.
598
+ # Having all convolution operations follow this convention allows us to easily
599
+ # align the time and frequency branches later on.
600
+ assert hl == nfft // 4
601
+ le = int(math.ceil(x.shape[-1] / hl))
602
+ pad = hl // 2 * 3
603
+ if not self.hybrid_old:
604
+ x = pad1d(x, (pad, pad + le * hl - x.shape[-1]), mode='reflect')
605
+ else:
606
+ x = pad1d(x, (pad, pad + le * hl - x.shape[-1]))
607
+
608
+ z = spectro(x, nfft, hl)[..., :-1, :]
609
+ if self.hybrid:
610
+ assert z.shape[-1] == le + 4, (z.shape, x.shape, le)
611
+ z = z[..., 2:2+le]
612
+ return z
613
+
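+ # A hedged illustration of the invariant described above, assuming the hybrid
+ # branch and the default nfft=4096 (so hop_length=1024); the constructor call
+ # is illustrative only:
+ #
+ # >>> m = HDemucs(['drums', 'bass', 'other', 'vocals'])
+ # >>> wav = torch.randn(1, 2, 10 * 1024)   # length divisible by hop_length
+ # >>> m._spec(wav).shape[-1]               # frames == length / hop_length
+ # 10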
614
+ def _ispec(self, z, length=None, scale=0):
615
+ hl = self.hop_length // (4 ** scale)
616
+ z = F.pad(z, (0, 0, 0, 1))
617
+ if self.hybrid:
618
+ z = F.pad(z, (2, 2))
619
+ pad = hl // 2 * 3
620
+ if not self.hybrid_old:
621
+ le = hl * int(math.ceil(length / hl)) + 2 * pad
622
+ else:
623
+ le = hl * int(math.ceil(length / hl))
624
+ x = ispectro(z, hl, length=le)
625
+ if not self.hybrid_old:
626
+ x = x[..., pad:pad + length]
627
+ else:
628
+ x = x[..., :length]
629
+ else:
630
+ x = ispectro(z, hl, length)
631
+ return x
632
+
633
+ def _magnitude(self, z):
634
+ # return the magnitude of the spectrogram, except when cac is True,
635
+ # in which case we just move the complex dimension to the channel one.
636
+ if self.cac:
637
+ B, C, Fr, T = z.shape
638
+ m = torch.view_as_real(z).permute(0, 1, 4, 2, 3)
639
+ m = m.reshape(B, C * 2, Fr, T)
640
+ else:
641
+ m = z.abs()
642
+ return m
643
+
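+ # Minimal sketch of the complex-as-channels (CaC) layout used above: a complex
+ # spectrogram (B, C, Fr, T) becomes a real tensor (B, 2*C, Fr, T), with real
+ # and imaginary parts interleaved along the channel axis.
+ #
+ # >>> z = torch.view_as_complex(torch.randn(1, 2, 2048, 10, 2))
+ # >>> torch.view_as_real(z).permute(0, 1, 4, 2, 3).reshape(1, 4, 2048, 10).shape
+ # torch.Size([1, 4, 2048, 10])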
644
+ def _mask(self, z, m):
645
+ # Apply masking given the mixture spectrogram `z` and the estimated mask `m`.
646
+ # If `cac` is True, `m` is actually a full spectrogram and `z` is ignored.
647
+ niters = self.wiener_iters
648
+ if self.cac:
649
+ B, S, C, Fr, T = m.shape
650
+ out = m.view(B, S, -1, 2, Fr, T).permute(0, 1, 2, 4, 5, 3)
651
+ out = torch.view_as_complex(out.contiguous())
652
+ return out
653
+ if self.training:
654
+ niters = self.end_iters
655
+ if niters < 0:
656
+ z = z[:, None]
657
+ return z / (1e-8 + z.abs()) * m
658
+ else:
659
+ return self._wiener(m, z, niters)
660
+
661
+ def _wiener(self, mag_out, mix_stft, niters):
662
+ # apply wiener filtering from OpenUnmix.
663
+ init = mix_stft.dtype
664
+ wiener_win_len = 300
665
+ residual = self.wiener_residual
666
+
667
+ B, S, C, Fq, T = mag_out.shape
668
+ mag_out = mag_out.permute(0, 4, 3, 2, 1)
669
+ mix_stft = torch.view_as_real(mix_stft.permute(0, 3, 2, 1))
670
+
671
+ outs = []
672
+ for sample in range(B):
673
+ pos = 0
674
+ out = []
675
+ for pos in range(0, T, wiener_win_len):
676
+ frame = slice(pos, pos + wiener_win_len)
677
+ z_out = wiener(
678
+ mag_out[sample, frame], mix_stft[sample, frame], niters,
679
+ residual=residual)
680
+ out.append(z_out.transpose(-1, -2))
681
+ outs.append(torch.cat(out, dim=0))
682
+ out = torch.view_as_complex(torch.stack(outs, 0))
683
+ out = out.permute(0, 4, 3, 2, 1).contiguous()
684
+ if residual:
685
+ out = out[:, :-1]
686
+ assert list(out.shape) == [B, S, C, Fq, T]
687
+ return out.to(init)
688
+
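+ # Note: the loop above applies Wiener filtering in windows of
+ # wiener_win_len=300 frames to bound memory; e.g. T=700 is processed as the
+ # frame slices [0:300], [300:600] and [600:700], then concatenated along time.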
689
+ def forward(self, mix):
690
+ x = mix
691
+ length = x.shape[-1]
692
+
693
+ z = self._spec(mix)
694
+ mag = self._magnitude(z)
695
+ x = mag
696
+
697
+ B, C, Fq, T = x.shape
698
+
699
+ # unlike previous Demucs, we always normalize because it is easier.
700
+ mean = x.mean(dim=(1, 2, 3), keepdim=True)
701
+ std = x.std(dim=(1, 2, 3), keepdim=True)
702
+ x = (x - mean) / (1e-5 + std)
703
+ # x will be the freq. branch input.
704
+
705
+ if self.hybrid:
706
+ # Prepare the time branch input.
707
+ xt = mix
708
+ meant = xt.mean(dim=(1, 2), keepdim=True)
709
+ stdt = xt.std(dim=(1, 2), keepdim=True)
710
+ xt = (xt - meant) / (1e-5 + stdt)
711
+
712
+ # okay, this is a giant mess I know...
713
+ saved = [] # skip connections, freq.
714
+ saved_t = [] # skip connections, time.
715
+ lengths = [] # saved lengths to properly remove padding, freq branch.
716
+ lengths_t = [] # saved lengths for time branch.
717
+ for idx, encode in enumerate(self.encoder):
718
+ lengths.append(x.shape[-1])
719
+ inject = None
720
+ if self.hybrid and idx < len(self.tencoder):
721
+ # we have not yet merged branches.
722
+ lengths_t.append(xt.shape[-1])
723
+ tenc = self.tencoder[idx]
724
+ xt = tenc(xt)
725
+ if not tenc.empty:
726
+ # save for skip connection
727
+ saved_t.append(xt)
728
+ else:
729
+ # tenc contains just the first conv., so that now time and freq.
730
+ # branches have the same shape and can be merged.
731
+ inject = xt
732
+ x = encode(x, inject)
733
+ if idx == 0 and self.freq_emb is not None:
734
+ # add frequency embedding to allow for non-equivariant convolutions
735
+ # over the frequency axis.
736
+ frs = torch.arange(x.shape[-2], device=x.device)
737
+ emb = self.freq_emb(frs).t()[None, :, :, None].expand_as(x)
738
+ x = x + self.freq_emb_scale * emb
739
+
740
+ saved.append(x)
741
+
742
+ x = torch.zeros_like(x)
743
+ if self.hybrid:
744
+ xt = torch.zeros_like(x)
745
+ # initialize everything to zero (signal will go through u-net skips).
746
+
747
+ for idx, decode in enumerate(self.decoder):
748
+ skip = saved.pop(-1)
749
+ x, pre = decode(x, skip, lengths.pop(-1))
750
+ # `pre` contains the output just before final transposed convolution,
751
+ # which is used when the freq. and time branch separate.
752
+
753
+ if self.hybrid:
754
+ offset = self.depth - len(self.tdecoder)
755
+ if self.hybrid and idx >= offset:
756
+ tdec = self.tdecoder[idx - offset]
757
+ length_t = lengths_t.pop(-1)
758
+ if tdec.empty:
759
+ assert pre.shape[2] == 1, pre.shape
760
+ pre = pre[:, :, 0]
761
+ xt, _ = tdec(pre, None, length_t)
762
+ else:
763
+ skip = saved_t.pop(-1)
764
+ xt, _ = tdec(xt, skip, length_t)
765
+
766
+ # Let's make sure we used all stored skip connections.
767
+ assert len(saved) == 0
768
+ assert len(lengths_t) == 0
769
+ assert len(saved_t) == 0
770
+
771
+ S = len(self.sources)
772
+ x = x.view(B, S, -1, Fq, T)
773
+ x = x * std[:, None] + mean[:, None]
774
+
775
+ zout = self._mask(z, x)
776
+ x = self._ispec(zout, length)
777
+
778
+ if self.hybrid:
779
+ xt = xt.view(B, S, -1, length)
780
+ xt = xt * stdt[:, None] + meant[:, None]
781
+ x = xt + x
782
+ return x
MVSEP-MDX23-music-separation-model/demucs3/htdemucs.py ADDED
@@ -0,0 +1,648 @@
1
+ # Copyright (c) Meta, Inc. and its affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ # First author is Simon Rouard.
7
+ """
8
+ This code contains the spectrogram and Hybrid version of Demucs.
9
+ """
10
+ import math
11
+
12
+ from openunmix.filtering import wiener
13
+ import torch
14
+ from torch import nn
15
+ from torch.nn import functional as F
16
+ from fractions import Fraction
17
+ from einops import rearrange
18
+
19
+ from .transformer import CrossTransformerEncoder
20
+
21
+ from .demucs import rescale_module
22
+ from .states import capture_init
23
+ from .spec import spectro, ispectro
24
+ from .hdemucs import pad1d, ScaledEmbedding, HEncLayer, MultiWrap, HDecLayer
25
+
26
+
27
+ class HTDemucs(nn.Module):
28
+ """
29
+ Spectrogram and hybrid Demucs model.
30
+ The spectrogram model has the same structure as Demucs, except the first few layers are over the
31
+ frequency axis, until there is only 1 frequency, and then it moves to time convolutions.
32
+ Frequency layers can still access information across time steps thanks to the DConv residual.
33
+
34
+ Hybrid models have a parallel time branch. At some layer, the time branch has the same stride
35
+ as the frequency branch and then the two are combined. The opposite happens in the decoder.
36
+
37
+ Models can either use naive iSTFT from masking, Wiener filtering ([Uhlich et al. 2017]),
38
+ or complex as channels (CaC) [Choi et al. 2020]. Wiener filtering is based on
39
+ Open Unmix implementation [Stoter et al. 2019].
40
+
41
+ The loss is always computed in the temporal domain, by backpropagating through the above
42
+ output methods and iSTFT. This allows us to define hybrid models nicely. However, it slightly
43
+ breaks Wiener filtering, as doing more iterations at test time will change the spectrogram
44
+ contribution, without changing the one from the waveform, which leads to worse performance.
45
+ I tried using the residual option in the OpenUnmix Wiener implementation, but it didn't help.
46
+ CaC, on the other hand, provides similar performance and works naturally with
47
+ hybrid models.
48
+
49
+ This model also uses frequency embeddings to improve the efficiency of convolutions
50
+ over the freq. axis, following [Isik et al. 2020] (https://arxiv.org/pdf/2008.04470.pdf).
51
+
52
+ Unlike classic Demucs, there is no resampling here, and normalization is always applied.
53
+ """
54
+
55
+ @capture_init
56
+ def __init__(
57
+ self,
58
+ sources,
59
+ # Channels
60
+ audio_channels=2,
61
+ channels=48,
62
+ channels_time=None,
63
+ growth=2,
64
+ # STFT
65
+ nfft=4096,
66
+ wiener_iters=0,
67
+ end_iters=0,
68
+ wiener_residual=False,
69
+ cac=True,
70
+ # Main structure
71
+ depth=4,
72
+ rewrite=True,
73
+ # Frequency branch
74
+ multi_freqs=None,
75
+ multi_freqs_depth=3,
76
+ freq_emb=0.2,
77
+ emb_scale=10,
78
+ emb_smooth=True,
79
+ # Convolutions
80
+ kernel_size=8,
81
+ time_stride=2,
82
+ stride=4,
83
+ context=1,
84
+ context_enc=0,
85
+ # Normalization
86
+ norm_starts=4,
87
+ norm_groups=4,
88
+ # DConv residual branch
89
+ dconv_mode=1,
90
+ dconv_depth=2,
91
+ dconv_comp=8,
92
+ dconv_init=1e-3,
93
+ # Before the Transformer
94
+ bottom_channels=0,
95
+ # Transformer
96
+ t_layers=5,
97
+ t_emb="sin",
98
+ t_hidden_scale=4.0,
99
+ t_heads=8,
100
+ t_dropout=0.0,
101
+ t_max_positions=10000,
102
+ t_norm_in=True,
103
+ t_norm_in_group=False,
104
+ t_group_norm=False,
105
+ t_norm_first=True,
106
+ t_norm_out=True,
107
+ t_max_period=10000.0,
108
+ t_weight_decay=0.0,
109
+ t_lr=None,
110
+ t_layer_scale=True,
111
+ t_gelu=True,
112
+ t_weight_pos_embed=1.0,
113
+ t_sin_random_shift=0,
114
+ t_cape_mean_normalize=True,
115
+ t_cape_augment=True,
116
+ t_cape_glob_loc_scale=[5000.0, 1.0, 1.4],
117
+ t_sparse_self_attn=False,
118
+ t_sparse_cross_attn=False,
119
+ t_mask_type="diag",
120
+ t_mask_random_seed=42,
121
+ t_sparse_attn_window=500,
122
+ t_global_window=100,
123
+ t_sparsity=0.95,
124
+ t_auto_sparsity=False,
125
+ # ------ Particular parameters
126
+ t_cross_first=False,
127
+ # Weight init
128
+ rescale=0.1,
129
+ # Metadata
130
+ samplerate=44100,
131
+ segment=10,
132
+ use_train_segment=True,
133
+ ):
134
+ """
135
+ Args:
136
+ sources (list[str]): list of source names.
137
+ audio_channels (int): input/output audio channels.
138
+ channels (int): initial number of hidden channels.
139
+ channels_time: if not None, use a different `channels` value for the time branch.
140
+ growth: increase the number of hidden channels by this factor at each layer.
141
+ nfft: number of fft bins. Note that changing this requires careful computation of
142
+ various shape parameters and will not work out of the box for hybrid models.
143
+ wiener_iters: when using Wiener filtering, number of iterations at test time.
144
+ end_iters: same but at train time. For a hybrid model, must be equal to `wiener_iters`.
145
+ wiener_residual: add residual source before wiener filtering.
146
+ cac: uses complex as channels, i.e. complex numbers are 2 channels each
147
+ in input and output. No further processing is done before iSTFT.
148
+ depth (int): number of layers in the encoder and in the decoder.
149
+ rewrite (bool): add 1x1 convolution to each layer.
150
+ multi_freqs: list of frequency ratios for splitting frequency bands with `MultiWrap`.
151
+ multi_freqs_depth: how many layers to wrap with `MultiWrap`. Only the outermost
152
+ layers will be wrapped.
153
+ freq_emb: add frequency embedding after the first frequency layer if > 0,
154
+ the actual value controls the weight of the embedding.
155
+ emb_scale: equivalent to scaling the embedding learning rate
156
+ emb_smooth: initialize the embedding with a smooth one (with respect to frequencies).
157
+ kernel_size: kernel_size for encoder and decoder layers.
158
+ stride: stride for encoder and decoder layers.
159
+ time_stride: stride for the final time layer, after the merge.
160
+ context: context for 1x1 conv in the decoder.
161
+ context_enc: context for 1x1 conv in the encoder.
162
+ norm_starts: layer at which group norm starts being used.
163
+ decoder layers are numbered in reverse order.
164
+ norm_groups: number of groups for group norm.
165
+ dconv_mode: if 1: dconv in encoder only, 2: decoder only, 3: both.
166
+ dconv_depth: depth of residual DConv branch.
167
+ dconv_comp: compression of DConv branch.
168
+ dconv_attn: adds attention layers in DConv branch starting at this layer.
169
+ dconv_lstm: adds an LSTM layer in DConv branch starting at this layer.
170
+ dconv_init: initial scale for the DConv branch LayerScale.
171
+ bottom_channels: if >0 it adds a linear layer (1x1 Conv) before and after the
172
+ transformer in order to change the number of channels
173
+ t_layers: number of layers in each branch (waveform and spec) of the transformer
174
+ t_emb: "sin", "cape" or "scaled"
175
+ t_hidden_scale: the hidden scale of the Feedforward parts of the transformer
176
+ for instance if C = 384 (the number of channels in the transformer) and
177
+ t_hidden_scale = 4.0 then the intermediate layer of the FFN has dimension
178
+ 384 * 4 = 1536
179
+ t_heads: number of heads for the transformer
180
+ t_dropout: dropout in the transformer
181
+ t_max_positions: max_positions for the "scaled" positional embedding, only
182
+ useful if t_emb="scaled"
183
+ t_norm_in: (bool) norm before adding the positional embedding and entering the
184
+ transformer layers
185
+ t_norm_in_group: (bool) if True while t_norm_in=True, the norm is on all the
186
+ timesteps (GroupNorm with group=1)
187
+ t_group_norm: (bool) if True, the norms of the Encoder Layers are on all the
188
+ timesteps (GroupNorm with group=1)
189
+ t_norm_first: (bool) if True the norm is before the attention and before the FFN
190
+ t_norm_out: (bool) if True, there is a GroupNorm (group=1) at the end of each layer
191
+ t_max_period: (float) denominator in the sinusoidal embedding expression
192
+ t_weight_decay: (float) weight decay for the transformer
193
+ t_lr: (float) specific learning rate for the transformer
194
+ t_layer_scale: (bool) Layer Scale for the transformer
195
+ t_gelu: (bool) activations of the transformer are GELU if True, ReLU otherwise
196
+ t_weight_pos_embed: (float) weighting of the positional embedding
197
+ t_cape_mean_normalize: (bool) if t_emb="cape", normalisation of positional embeddings
198
+ see: https://arxiv.org/abs/2106.03143
199
+ t_cape_augment: (bool) if t_emb="cape", must be True during training and False
200
+ during inference, see: https://arxiv.org/abs/2106.03143
201
+ t_cape_glob_loc_scale: (list of 3 floats) if t_emb="cape", CAPE parameters
202
+ see: https://arxiv.org/abs/2106.03143
203
+ t_sparse_self_attn: (bool) if True, the self attentions are sparse
204
+ t_sparse_cross_attn: (bool) if True, the cross-attentions are sparse (don't use it
205
+ unless you designed really specific masks)
206
+ t_mask_type: (str) can be "diag", "jmask", "random", "global" or any combination
207
+ with '_' between: i.e. "diag_jmask_random" (note that this is permutation
208
+ invariant i.e. "diag_jmask_random" is equivalent to "jmask_random_diag")
209
+ t_mask_random_seed: (int) if "random" is in t_mask_type, controls the seed
210
+ that generated the random part of the mask
211
+ t_sparse_attn_window: (int) if "diag" is in t_mask_type, for a query (i), and
212
+ a key (j), the mask is True if |i-j| <= t_sparse_attn_window
213
+ t_global_window: (int) if "global" is in t_mask_type, mask[:t_global_window, :]
214
+ and mask[:, :t_global_window] will be True
215
+ t_sparsity: (float) if "random" is in t_mask_type, t_sparsity is the sparsity
216
+ level of the random part of the mask.
217
+ t_cross_first: (bool) if True cross attention is the first layer of the
218
+ transformer (False seems to be better)
219
+ rescale: weight rescaling trick
220
+ use_train_segment: (bool) if True, the segment length used during
221
+ training is also used during inference.
222
+ """
223
+ super().__init__()
224
+ self.cac = cac
225
+ self.wiener_residual = wiener_residual
226
+ self.audio_channels = audio_channels
227
+ self.sources = sources
228
+ self.kernel_size = kernel_size
229
+ self.context = context
230
+ self.stride = stride
231
+ self.depth = depth
232
+ self.bottom_channels = bottom_channels
233
+ self.channels = channels
234
+ self.samplerate = samplerate
235
+ self.segment = segment
236
+ self.use_train_segment = use_train_segment
237
+ self.nfft = nfft
238
+ self.hop_length = nfft // 4
239
+ self.wiener_iters = wiener_iters
240
+ self.end_iters = end_iters
241
+ self.freq_emb = None
242
+ assert wiener_iters == end_iters
243
+
244
+ self.encoder = nn.ModuleList()
245
+ self.decoder = nn.ModuleList()
246
+
247
+ self.tencoder = nn.ModuleList()
248
+ self.tdecoder = nn.ModuleList()
249
+
250
+ chin = audio_channels
251
+ chin_z = chin # number of channels for the freq branch
252
+ if self.cac:
253
+ chin_z *= 2
254
+ chout = channels_time or channels
255
+ chout_z = channels
256
+ freqs = nfft // 2
257
+
258
+ for index in range(depth):
259
+ norm = index >= norm_starts
260
+ freq = freqs > 1
261
+ stri = stride
262
+ ker = kernel_size
263
+ if not freq:
264
+ assert freqs == 1
265
+ ker = time_stride * 2
266
+ stri = time_stride
267
+
268
+ pad = True
269
+ last_freq = False
270
+ if freq and freqs <= kernel_size:
271
+ ker = freqs
272
+ pad = False
273
+ last_freq = True
274
+
275
+ kw = {
276
+ "kernel_size": ker,
277
+ "stride": stri,
278
+ "freq": freq,
279
+ "pad": pad,
280
+ "norm": norm,
281
+ "rewrite": rewrite,
282
+ "norm_groups": norm_groups,
283
+ "dconv_kw": {
284
+ "depth": dconv_depth,
285
+ "compress": dconv_comp,
286
+ "init": dconv_init,
287
+ "gelu": True,
288
+ },
289
+ }
290
+ kwt = dict(kw)
291
+ kwt["freq"] = 0
292
+ kwt["kernel_size"] = kernel_size
293
+ kwt["stride"] = stride
294
+ kwt["pad"] = True
295
+ kw_dec = dict(kw)
296
+ multi = False
297
+ if multi_freqs and index < multi_freqs_depth:
298
+ multi = True
299
+ kw_dec["context_freq"] = False
300
+
301
+ if last_freq:
302
+ chout_z = max(chout, chout_z)
303
+ chout = chout_z
304
+
305
+ enc = HEncLayer(
306
+ chin_z, chout_z, dconv=dconv_mode & 1, context=context_enc, **kw
307
+ )
308
+ if freq:
309
+ tenc = HEncLayer(
310
+ chin,
311
+ chout,
312
+ dconv=dconv_mode & 1,
313
+ context=context_enc,
314
+ empty=last_freq,
315
+ **kwt
316
+ )
317
+ self.tencoder.append(tenc)
318
+
319
+ if multi:
320
+ enc = MultiWrap(enc, multi_freqs)
321
+ self.encoder.append(enc)
322
+ if index == 0:
323
+ chin = self.audio_channels * len(self.sources)
324
+ chin_z = chin
325
+ if self.cac:
326
+ chin_z *= 2
327
+ dec = HDecLayer(
328
+ chout_z,
329
+ chin_z,
330
+ dconv=dconv_mode & 2,
331
+ last=index == 0,
332
+ context=context,
333
+ **kw_dec
334
+ )
335
+ if multi:
336
+ dec = MultiWrap(dec, multi_freqs)
337
+ if freq:
338
+ tdec = HDecLayer(
339
+ chout,
340
+ chin,
341
+ dconv=dconv_mode & 2,
342
+ empty=last_freq,
343
+ last=index == 0,
344
+ context=context,
345
+ **kwt
346
+ )
347
+ self.tdecoder.insert(0, tdec)
348
+ self.decoder.insert(0, dec)
349
+
350
+ chin = chout
351
+ chin_z = chout_z
352
+ chout = int(growth * chout)
353
+ chout_z = int(growth * chout_z)
354
+ if freq:
355
+ if freqs <= kernel_size:
356
+ freqs = 1
357
+ else:
358
+ freqs //= stride
359
+ if index == 0 and freq_emb:
360
+ self.freq_emb = ScaledEmbedding(
361
+ freqs, chin_z, smooth=emb_smooth, scale=emb_scale
362
+ )
363
+ self.freq_emb_scale = freq_emb
364
+
365
+ if rescale:
366
+ rescale_module(self, reference=rescale)
367
+
368
+ transformer_channels = channels * growth ** (depth - 1)
369
+ if bottom_channels:
370
+ self.channel_upsampler = nn.Conv1d(transformer_channels, bottom_channels, 1)
371
+ self.channel_downsampler = nn.Conv1d(
372
+ bottom_channels, transformer_channels, 1
373
+ )
374
+ self.channel_upsampler_t = nn.Conv1d(
375
+ transformer_channels, bottom_channels, 1
376
+ )
377
+ self.channel_downsampler_t = nn.Conv1d(
378
+ bottom_channels, transformer_channels, 1
379
+ )
380
+
381
+ transformer_channels = bottom_channels
382
+
383
+ if t_layers > 0:
384
+ self.crosstransformer = CrossTransformerEncoder(
385
+ dim=transformer_channels,
386
+ emb=t_emb,
387
+ hidden_scale=t_hidden_scale,
388
+ num_heads=t_heads,
389
+ num_layers=t_layers,
390
+ cross_first=t_cross_first,
391
+ dropout=t_dropout,
392
+ max_positions=t_max_positions,
393
+ norm_in=t_norm_in,
394
+ norm_in_group=t_norm_in_group,
395
+ group_norm=t_group_norm,
396
+ norm_first=t_norm_first,
397
+ norm_out=t_norm_out,
398
+ max_period=t_max_period,
399
+ weight_decay=t_weight_decay,
400
+ lr=t_lr,
401
+ layer_scale=t_layer_scale,
402
+ gelu=t_gelu,
403
+ sin_random_shift=t_sin_random_shift,
404
+ weight_pos_embed=t_weight_pos_embed,
405
+ cape_mean_normalize=t_cape_mean_normalize,
406
+ cape_augment=t_cape_augment,
407
+ cape_glob_loc_scale=t_cape_glob_loc_scale,
408
+ sparse_self_attn=t_sparse_self_attn,
409
+ sparse_cross_attn=t_sparse_cross_attn,
410
+ mask_type=t_mask_type,
411
+ mask_random_seed=t_mask_random_seed,
412
+ sparse_attn_window=t_sparse_attn_window,
413
+ global_window=t_global_window,
414
+ sparsity=t_sparsity,
415
+ auto_sparsity=t_auto_sparsity,
416
+ )
417
+ else:
418
+ self.crosstransformer = None
419
+
420
+ def _spec(self, x):
421
+ hl = self.hop_length
422
+ nfft = self.nfft
423
+ x0 = x # noqa
424
+
425
+ # We re-pad the signal in order to keep the property
426
+ # that the size of the output is exactly the size of the input
427
+ # divided by the stride (here hop_length), when divisible.
428
+ # This is achieved by padding by 1/4th of the kernel size (here nfft),
429
+ # which is not supported by torch.stft.
430
+ # Having all convolution operations follow this convention allows us to easily
431
+ # align the time and frequency branches later on.
432
+ assert hl == nfft // 4
433
+ le = int(math.ceil(x.shape[-1] / hl))
434
+ pad = hl // 2 * 3
435
+ x = pad1d(x, (pad, pad + le * hl - x.shape[-1]), mode="reflect")
436
+
437
+ z = spectro(x, nfft, hl)[..., :-1, :]
438
+ assert z.shape[-1] == le + 4, (z.shape, x.shape, le)
439
+ z = z[..., 2: 2 + le]
440
+ return z
441
+
442
+ def _ispec(self, z, length=None, scale=0):
443
+ hl = self.hop_length // (4**scale)
444
+ z = F.pad(z, (0, 0, 0, 1))
445
+ z = F.pad(z, (2, 2))
446
+ pad = hl // 2 * 3
447
+ le = hl * int(math.ceil(length / hl)) + 2 * pad
448
+ x = ispectro(z, hl, length=le)
449
+ x = x[..., pad: pad + length]
450
+ return x
451
+
452
+ def _magnitude(self, z):
453
+ # return the magnitude of the spectrogram, except when cac is True,
454
+ # in which case we just move the complex dimension to the channel one.
455
+ if self.cac:
456
+ B, C, Fr, T = z.shape
457
+ m = torch.view_as_real(z).permute(0, 1, 4, 2, 3)
458
+ m = m.reshape(B, C * 2, Fr, T)
459
+ else:
460
+ m = z.abs()
461
+ return m
462
+
463
+ def _mask(self, z, m):
464
+ # Apply masking given the mixture spectrogram `z` and the estimated mask `m`.
465
+ # If `cac` is True, `m` is actually a full spectrogram and `z` is ignored.
466
+ niters = self.wiener_iters
467
+ if self.cac:
468
+ B, S, C, Fr, T = m.shape
469
+ out = m.view(B, S, -1, 2, Fr, T).permute(0, 1, 2, 4, 5, 3)
470
+ out = torch.view_as_complex(out.contiguous())
471
+ return out
472
+ if self.training:
473
+ niters = self.end_iters
474
+ if niters < 0:
475
+ z = z[:, None]
476
+ return z / (1e-8 + z.abs()) * m
477
+ else:
478
+ return self._wiener(m, z, niters)
479
+
480
+ def _wiener(self, mag_out, mix_stft, niters):
481
+ # apply wiener filtering from OpenUnmix.
482
+ init = mix_stft.dtype
483
+ wiener_win_len = 300
484
+ residual = self.wiener_residual
485
+
486
+ B, S, C, Fq, T = mag_out.shape
487
+ mag_out = mag_out.permute(0, 4, 3, 2, 1)
488
+ mix_stft = torch.view_as_real(mix_stft.permute(0, 3, 2, 1))
489
+
490
+ outs = []
491
+ for sample in range(B):
492
+ pos = 0
493
+ out = []
494
+ for pos in range(0, T, wiener_win_len):
495
+ frame = slice(pos, pos + wiener_win_len)
496
+ z_out = wiener(
497
+ mag_out[sample, frame],
498
+ mix_stft[sample, frame],
499
+ niters,
500
+ residual=residual,
501
+ )
502
+ out.append(z_out.transpose(-1, -2))
503
+ outs.append(torch.cat(out, dim=0))
504
+ out = torch.view_as_complex(torch.stack(outs, 0))
505
+ out = out.permute(0, 4, 3, 2, 1).contiguous()
506
+ if residual:
507
+ out = out[:, :-1]
508
+ assert list(out.shape) == [B, S, C, Fq, T]
509
+ return out.to(init)
510
+
511
+ def valid_length(self, length: int):
512
+ """
513
+ Return a length that is appropriate for evaluation.
514
+ In our case, always return the training length, unless
515
+ it is smaller than the given length, in which case this
516
+ raises an error.
517
+ """
518
+ if not self.use_train_segment:
519
+ return length
520
+ training_length = int(self.segment * self.samplerate)
521
+ if training_length < length:
522
+ raise ValueError(
523
+ f"Given length {length} is longer than "
524
+ f"training length {training_length}")
525
+ return training_length
526
+
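+ # Hedged usage sketch with the defaults segment=10 and samplerate=44100 (the
+ # constructor call is illustrative only):
+ #
+ # >>> m = HTDemucs(['drums', 'bass', 'other', 'vocals'])
+ # >>> m.valid_length(44100)    # anything up to 441000 maps to 441000
+ # 441000
+ # >>> m.valid_length(500000)   # longer than the training segment: ValueError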
527
+ def forward(self, mix):
528
+ length = mix.shape[-1]
529
+ length_pre_pad = None
530
+ if self.use_train_segment:
531
+ if self.training:
532
+ self.segment = Fraction(mix.shape[-1], self.samplerate)
533
+ else:
534
+ training_length = int(self.segment * self.samplerate)
535
+ if mix.shape[-1] < training_length:
536
+ length_pre_pad = mix.shape[-1]
537
+ mix = F.pad(mix, (0, training_length - length_pre_pad))
538
+ z = self._spec(mix)
539
+ mag = self._magnitude(z)
540
+ x = mag
541
+
542
+ B, C, Fq, T = x.shape
543
+
544
+ # unlike previous Demucs, we always normalize because it is easier.
545
+ mean = x.mean(dim=(1, 2, 3), keepdim=True)
546
+ std = x.std(dim=(1, 2, 3), keepdim=True)
547
+ x = (x - mean) / (1e-5 + std)
548
+ # x will be the freq. branch input.
549
+
550
+ # Prepare the time branch input.
551
+ xt = mix
552
+ meant = xt.mean(dim=(1, 2), keepdim=True)
553
+ stdt = xt.std(dim=(1, 2), keepdim=True)
554
+ xt = (xt - meant) / (1e-5 + stdt)
555
+
556
+ # okay, this is a giant mess I know...
557
+ saved = [] # skip connections, freq.
558
+ saved_t = [] # skip connections, time.
559
+ lengths = [] # saved lengths to properly remove padding, freq branch.
560
+ lengths_t = [] # saved lengths for time branch.
561
+ for idx, encode in enumerate(self.encoder):
562
+ lengths.append(x.shape[-1])
563
+ inject = None
564
+ if idx < len(self.tencoder):
565
+ # we have not yet merged branches.
566
+ lengths_t.append(xt.shape[-1])
567
+ tenc = self.tencoder[idx]
568
+ xt = tenc(xt)
569
+ if not tenc.empty:
570
+ # save for skip connection
571
+ saved_t.append(xt)
572
+ else:
573
+ # tenc contains just the first conv., so that now time and freq.
574
+ # branches have the same shape and can be merged.
575
+ inject = xt
576
+ x = encode(x, inject)
577
+ if idx == 0 and self.freq_emb is not None:
578
+ # add frequency embedding to allow for non-equivariant convolutions
579
+ # over the frequency axis.
580
+ frs = torch.arange(x.shape[-2], device=x.device)
581
+ emb = self.freq_emb(frs).t()[None, :, :, None].expand_as(x)
582
+ x = x + self.freq_emb_scale * emb
583
+
584
+ saved.append(x)
585
+ if self.crosstransformer:
586
+ if self.bottom_channels:
587
+ b, c, f, t = x.shape
588
+ x = rearrange(x, "b c f t-> b c (f t)")
589
+ x = self.channel_upsampler(x)
590
+ x = rearrange(x, "b c (f t)-> b c f t", f=f)
591
+ xt = self.channel_upsampler_t(xt)
592
+
593
+ x, xt = self.crosstransformer(x, xt)
594
+
595
+ if self.bottom_channels:
596
+ x = rearrange(x, "b c f t-> b c (f t)")
597
+ x = self.channel_downsampler(x)
598
+ x = rearrange(x, "b c (f t)-> b c f t", f=f)
599
+ xt = self.channel_downsampler_t(xt)
600
+
601
+ for idx, decode in enumerate(self.decoder):
602
+ skip = saved.pop(-1)
603
+ x, pre = decode(x, skip, lengths.pop(-1))
604
+ # `pre` contains the output just before final transposed convolution,
605
+ # which is used when the freq. and time branch separate.
606
+
607
+ offset = self.depth - len(self.tdecoder)
608
+ if idx >= offset:
609
+ tdec = self.tdecoder[idx - offset]
610
+ length_t = lengths_t.pop(-1)
611
+ if tdec.empty:
612
+ assert pre.shape[2] == 1, pre.shape
613
+ pre = pre[:, :, 0]
614
+ xt, _ = tdec(pre, None, length_t)
615
+ else:
616
+ skip = saved_t.pop(-1)
617
+ xt, _ = tdec(xt, skip, length_t)
618
+
619
+ # Let's make sure we used all stored skip connections.
620
+ assert len(saved) == 0
621
+ assert len(lengths_t) == 0
622
+ assert len(saved_t) == 0
623
+
624
+ S = len(self.sources)
625
+ x = x.view(B, S, -1, Fq, T)
626
+ x = x * std[:, None] + mean[:, None]
627
+
628
+ zout = self._mask(z, x)
629
+ if self.use_train_segment:
630
+ if self.training:
631
+ x = self._ispec(zout, length)
632
+ else:
633
+ x = self._ispec(zout, training_length)
634
+ else:
635
+ x = self._ispec(zout, length)
636
+
637
+ if self.use_train_segment:
638
+ if self.training:
639
+ xt = xt.view(B, S, -1, length)
640
+ else:
641
+ xt = xt.view(B, S, -1, training_length)
642
+ else:
643
+ xt = xt.view(B, S, -1, length)
644
+ xt = xt * stdt[:, None] + meant[:, None]
645
+ x = xt + x
646
+ if length_pre_pad:
647
+ x = x[..., :length_pre_pad]
648
+ return x
MVSEP-MDX23-music-separation-model/demucs3/spec.py ADDED
@@ -0,0 +1,41 @@
1
+ # Copyright (c) Meta, Inc. and its affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ """Conveniance wrapper to perform STFT and iSTFT"""
7
+
8
+ import torch as th
9
+
10
+
11
+ def spectro(x, n_fft=512, hop_length=None, pad=0):
12
+ *other, length = x.shape
13
+ x = x.reshape(-1, length)
14
+ z = th.stft(x,
15
+ n_fft * (1 + pad),
16
+ hop_length or n_fft // 4,
17
+ window=th.hann_window(n_fft).to(x),
18
+ win_length=n_fft,
19
+ normalized=True,
20
+ center=True,
21
+ return_complex=True,
22
+ pad_mode='reflect')
23
+ _, freqs, frame = z.shape
24
+ return z.view(*other, freqs, frame)
25
+
26
+
27
+ def ispectro(z, hop_length=None, length=None, pad=0):
28
+ *other, freqs, frames = z.shape
29
+ n_fft = 2 * freqs - 2
30
+ z = z.view(-1, freqs, frames)
31
+ win_length = n_fft // (1 + pad)
32
+ x = th.istft(z,
33
+ n_fft,
34
+ hop_length,
35
+ window=th.hann_window(win_length).to(z.real),
36
+ win_length=win_length,
37
+ normalized=True,
38
+ length=length,
39
+ center=True)
40
+ _, length = x.shape
41
+ return x.view(*other, length)
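+
+ # A hedged roundtrip sketch for the two wrappers above (shapes only; the
+ # reconstruction matches the input up to numerical precision):
+ #
+ # >>> x = th.randn(2, 4096)
+ # >>> z = spectro(x, n_fft=512)          # (2, 257, 33): 512 // 2 + 1 bins
+ # >>> ispectro(z, length=4096).shape
+ # torch.Size([2, 4096])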
MVSEP-MDX23-music-separation-model/demucs3/states.py ADDED
@@ -0,0 +1,148 @@
1
+ # Copyright (c) Meta, Inc. and its affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ """
7
+ Utilities to save and load models.
8
+ """
9
+ from contextlib import contextmanager
10
+
11
+ import functools
12
+ import hashlib
13
+ import inspect
14
+ import io
15
+ from pathlib import Path
16
+ import warnings
17
+
18
+ from omegaconf import OmegaConf
19
+ from diffq import DiffQuantizer, UniformQuantizer, restore_quantized_state
20
+ import torch
21
+
22
+
23
+ def get_quantizer(model, args, optimizer=None):
24
+ """Return the quantizer given the XP quantization args."""
25
+ quantizer = None
26
+ if args.diffq:
27
+ quantizer = DiffQuantizer(
28
+ model, min_size=args.min_size, group_size=args.group_size)
29
+ if optimizer is not None:
30
+ quantizer.setup_optimizer(optimizer)
31
+ elif args.qat:
32
+ quantizer = UniformQuantizer(
33
+ model, bits=args.qat, min_size=args.min_size)
34
+ return quantizer
35
+
36
+
37
+ def load_model(path_or_package, strict=False):
38
+ """Load a model from the given serialized model, either given as a dict (already loaded)
39
+ or a path to a file on disk."""
40
+ if isinstance(path_or_package, dict):
41
+ package = path_or_package
42
+ elif isinstance(path_or_package, (str, Path)):
43
+ with warnings.catch_warnings():
44
+ warnings.simplefilter("ignore")
45
+ path = path_or_package
46
+ package = torch.load(path, 'cpu')
47
+ else:
48
+ raise ValueError(f"Invalid type for {path_or_package}.")
49
+
50
+ klass = package["klass"]
51
+ args = package["args"]
52
+ kwargs = package["kwargs"]
53
+
54
+ if strict:
55
+ model = klass(*args, **kwargs)
56
+ else:
57
+ sig = inspect.signature(klass)
58
+ for key in list(kwargs):
59
+ if key not in sig.parameters:
60
+ warnings.warn("Dropping inexistant parameter " + key)
61
+ del kwargs[key]
62
+ model = klass(*args, **kwargs)
63
+
64
+ state = package["state"]
65
+
66
+ set_state(model, state)
67
+ return model
68
+
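+ # Hedged usage note: a serialized package is a dict with "klass", "args",
+ # "kwargs" and "state"; non-strict loading drops kwargs that the current class
+ # signature no longer accepts, with a warning per dropped key. The path below
+ # is illustrative only:
+ #
+ # >>> model = load_model('models/htdemucs.th')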
69
+
70
+ def get_state(model, quantizer, half=False):
71
+ """Get the state from a model, potentially with quantization applied.
72
+ If `half` is True, the model is stored in half precision, which shouldn't impact performance
73
+ but halves the state size."""
74
+ if quantizer is None:
75
+ dtype = torch.half if half else None
76
+ state = {k: p.data.to(device='cpu', dtype=dtype) for k, p in model.state_dict().items()}
77
+ else:
78
+ state = quantizer.get_quantized_state()
79
+ state['__quantized'] = True
80
+ return state
81
+
82
+
83
+ def set_state(model, state, quantizer=None):
84
+ """Set the state on a given model."""
85
+ if state.get('__quantized'):
86
+ if quantizer is not None:
87
+ quantizer.restore_quantized_state(model, state['quantized'])
88
+ else:
89
+ restore_quantized_state(model, state)
90
+ else:
91
+ model.load_state_dict(state)
92
+ return state
93
+
94
+
95
+ def save_with_checksum(content, path):
96
+ """Save the given value on disk, along with a sha256 hash.
97
+ Should be used with the output of either `serialize_model` or `get_state`."""
98
+ buf = io.BytesIO()
99
+ torch.save(content, buf)
100
+ sig = hashlib.sha256(buf.getvalue()).hexdigest()[:8]
101
+
102
+ path = path.parent / (path.stem + "-" + sig + path.suffix)
103
+ path.write_bytes(buf.getvalue())
104
+
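+ # e.g. with path=Path('out/model.th') this writes 'out/model-<sig>.th', where
+ # <sig> is the first 8 hex chars of the sha256 of the serialized bytes.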
105
+
106
+ def serialize_model(model, training_args, quantizer=None, half=True):
107
+ args, kwargs = model._init_args_kwargs
108
+ klass = model.__class__
109
+
110
+ state = get_state(model, quantizer, half)
111
+ return {
112
+ 'klass': klass,
113
+ 'args': args,
114
+ 'kwargs': kwargs,
115
+ 'state': state,
116
+ 'training_args': OmegaConf.to_container(training_args, resolve=True),
117
+ }
118
+
119
+
120
+ def copy_state(state):
121
+ return {k: v.cpu().clone() for k, v in state.items()}
122
+
123
+
124
+ @contextmanager
125
+ def swap_state(model, state):
126
+ """
127
+ Context manager that swaps the state of a model, e.g:
128
+
129
+ # model is in old state
130
+ with swap_state(model, new_state):
131
+ # model in new state
132
+ # model back to old state
133
+ """
134
+ old_state = copy_state(model.state_dict())
135
+ model.load_state_dict(state, strict=False)
136
+ try:
137
+ yield
138
+ finally:
139
+ model.load_state_dict(old_state)
140
+
141
+
142
+ def capture_init(init):
143
+ @functools.wraps(init)
144
+ def __init__(self, *args, **kwargs):
145
+ self._init_args_kwargs = (args, kwargs)
146
+ init(self, *args, **kwargs)
147
+
148
+ return __init__
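+
+
+ # Hedged sketch of how capture_init pairs with serialize_model: decorating
+ # __init__ records the exact constructor call so it can be replayed on load.
+ #
+ # >>> class Toy(torch.nn.Module):
+ # ...     @capture_init
+ # ...     def __init__(self, channels=4):
+ # ...         super().__init__()
+ # >>> Toy(channels=8)._init_args_kwargs
+ # ((), {'channels': 8})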
MVSEP-MDX23-music-separation-model/demucs3/transformer.py ADDED
@@ -0,0 +1,839 @@
1
+ # Copyright (c) 2019-present, Meta, Inc.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ # First author is Simon Rouard.
7
+
8
+ import random
9
+ import typing as tp
10
+
11
+ import torch
12
+ import torch.nn as nn
13
+ import torch.nn.functional as F
14
+ import numpy as np
15
+ import math
16
+ from einops import rearrange
17
+
18
+
19
+ def create_sin_embedding(
20
+ length: int, dim: int, shift: int = 0, device="cpu", max_period=10000
21
+ ):
22
+ # We aim for TBC format
23
+ assert dim % 2 == 0
24
+ pos = shift + torch.arange(length, device=device).view(-1, 1, 1)
25
+ half_dim = dim // 2
26
+ adim = torch.arange(dim // 2, device=device).view(1, 1, -1)
27
+ phase = pos / (max_period ** (adim / (half_dim - 1)))
28
+ return torch.cat(
29
+ [
30
+ torch.cos(phase),
31
+ torch.sin(phase),
32
+ ],
33
+ dim=-1,
34
+ )
35
+
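+ # Hedged shape check: the embedding is returned in (T, B, C) layout, with the
+ # first half of the channels holding cos terms and the second half sin terms.
+ #
+ # >>> create_sin_embedding(length=100, dim=64).shape
+ # torch.Size([100, 1, 64])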
36
+
37
+ def create_2d_sin_embedding(d_model, height, width, device="cpu", max_period=10000):
38
+ """
39
+ :param d_model: dimension of the model
40
+ :param height: height of the positions
41
+ :param width: width of the positions
42
+ :return: d_model*height*width position matrix
43
+ """
44
+ if d_model % 4 != 0:
45
+ raise ValueError(
46
+ "Cannot use sin/cos positional encoding with "
47
+ "odd dimension (got dim={:d})".format(d_model)
48
+ )
49
+ pe = torch.zeros(d_model, height, width)
50
+ # Each dimension use half of d_model
51
+ d_model = int(d_model / 2)
52
+ div_term = torch.exp(
53
+ torch.arange(0.0, d_model, 2) * -(math.log(max_period) / d_model)
54
+ )
55
+ pos_w = torch.arange(0.0, width).unsqueeze(1)
56
+ pos_h = torch.arange(0.0, height).unsqueeze(1)
57
+ pe[0:d_model:2, :, :] = (
58
+ torch.sin(pos_w * div_term).transpose(0, 1).unsqueeze(1).repeat(1, height, 1)
59
+ )
60
+ pe[1:d_model:2, :, :] = (
61
+ torch.cos(pos_w * div_term).transpose(0, 1).unsqueeze(1).repeat(1, height, 1)
62
+ )
63
+ pe[d_model::2, :, :] = (
64
+ torch.sin(pos_h * div_term).transpose(0, 1).unsqueeze(2).repeat(1, 1, width)
65
+ )
66
+ pe[d_model + 1:: 2, :, :] = (
67
+ torch.cos(pos_h * div_term).transpose(0, 1).unsqueeze(2).repeat(1, 1, width)
68
+ )
69
+
70
+ return pe[None, :].to(device)
71
+
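+ # Hedged shape check for the 2D embedding above:
+ #
+ # >>> create_2d_sin_embedding(64, 8, 10).shape
+ # torch.Size([1, 64, 8, 10])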
72
+
73
+ def create_sin_embedding_cape(
74
+ length: int,
75
+ dim: int,
76
+ batch_size: int,
77
+ mean_normalize: bool,
78
+ augment: bool, # True during training
79
+ max_global_shift: float = 0.0, # delta max
80
+ max_local_shift: float = 0.0, # epsilon max
81
+ max_scale: float = 1.0,
82
+ device: str = "cpu",
83
+ max_period: float = 10000.0,
84
+ ):
85
+ # We aim for TBC format
86
+ assert dim % 2 == 0
87
+ pos = 1.0 * torch.arange(length).view(-1, 1, 1) # (length, 1, 1)
88
+ pos = pos.repeat(1, batch_size, 1) # (length, batch_size, 1)
89
+ if mean_normalize:
90
+ pos -= torch.nanmean(pos, dim=0, keepdim=True)
91
+
92
+ if augment:
93
+ delta = np.random.uniform(
94
+ -max_global_shift, +max_global_shift, size=[1, batch_size, 1]
95
+ )
96
+ delta_local = np.random.uniform(
97
+ -max_local_shift, +max_local_shift, size=[length, batch_size, 1]
98
+ )
99
+ log_lambdas = np.random.uniform(
100
+ -np.log(max_scale), +np.log(max_scale), size=[1, batch_size, 1]
101
+ )
102
+ pos = (pos + delta + delta_local) * np.exp(log_lambdas)
103
+
104
+ pos = pos.to(device)
105
+
106
+ half_dim = dim // 2
107
+ adim = torch.arange(dim // 2, device=device).view(1, 1, -1)
108
+ phase = pos / (max_period ** (adim / (half_dim - 1)))
109
+ return torch.cat(
110
+ [
111
+ torch.cos(phase),
112
+ torch.sin(phase),
113
+ ],
114
+ dim=-1,
115
+ ).float()
116
+
117
+
118
+ def get_causal_mask(length):
119
+ pos = torch.arange(length)
120
+ return pos > pos[:, None]
121
+
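+ # e.g. get_causal_mask(3): entry (i, j) is True when j > i, i.e. the masked
+ # positions lie strictly in the future.
+ #
+ # >>> get_causal_mask(3)
+ # tensor([[False,  True,  True],
+ #         [False, False,  True],
+ #         [False, False, False]])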
122
+
123
+ def get_elementary_mask(
124
+ T1,
125
+ T2,
126
+ mask_type,
127
+ sparse_attn_window,
128
+ global_window,
129
+ mask_random_seed,
130
+ sparsity,
131
+ device,
132
+ ):
133
+ """
134
+ When the input of the Decoder has length T1 and the output length T2,
135
+ the mask matrix has shape (T2, T1).
136
+ """
137
+ assert mask_type in ["diag", "jmask", "random", "global"]
138
+
139
+ if mask_type == "global":
140
+ mask = torch.zeros(T2, T1, dtype=torch.bool)
141
+ mask[:, :global_window] = True
142
+ line_window = int(global_window * T2 / T1)
143
+ mask[:line_window, :] = True
144
+
145
+ if mask_type == "diag":
146
+
147
+ mask = torch.zeros(T2, T1, dtype=torch.bool)
148
+ rows = torch.arange(T2)[:, None]
149
+ cols = (
150
+ (T1 / T2 * rows + torch.arange(-sparse_attn_window, sparse_attn_window + 1))
151
+ .long()
152
+ .clamp(0, T1 - 1)
153
+ )
154
+ mask.scatter_(1, cols, torch.ones(1, dtype=torch.bool).expand_as(cols))
155
+
156
+ elif mask_type == "jmask":
157
+ mask = torch.zeros(T2 + 2, T1 + 2, dtype=torch.bool)
158
+ rows = torch.arange(T2 + 2)[:, None]
159
+ t = torch.arange(0, int((2 * T1) ** 0.5 + 1))
160
+ t = (t * (t + 1) / 2).int()
161
+ t = torch.cat([-t.flip(0)[:-1], t])
162
+ cols = (T1 / T2 * rows + t).long().clamp(0, T1 + 1)
163
+ mask.scatter_(1, cols, torch.ones(1, dtype=torch.bool).expand_as(cols))
164
+ mask = mask[1:-1, 1:-1]
165
+
166
+ elif mask_type == "random":
167
+ gene = torch.Generator(device=device)
168
+ gene.manual_seed(mask_random_seed)
169
+ mask = (
170
+ torch.rand(T1 * T2, generator=gene, device=device).reshape(T2, T1)
171
+ > sparsity
172
+ )
173
+
174
+ mask = mask.to(device)
175
+ return mask
176
+
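+ # Recap of the conventions above: row t2 of the (T2, T1) matrix lists which
+ # input positions the output position t2 may attend to. "diag" keeps a band of
+ # width 2*sparse_attn_window+1 around the rescaled position T1/T2 * t2, and
+ # "global" keeps the first global_window input columns plus a proportional
+ # number of leading output rows.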
177
+
178
+ def get_mask(
179
+ T1,
180
+ T2,
181
+ mask_type,
182
+ sparse_attn_window,
183
+ global_window,
184
+ mask_random_seed,
185
+ sparsity,
186
+ device,
187
+ ):
188
+ """
189
+ Return a SparseCSRTensor mask that is a combination of elementary masks
190
+ mask_type can be a combination of multiple masks: for instance "diag_jmask_random"
191
+ """
192
+ from xformers.sparse import SparseCSRTensor
193
+ # create a list
194
+ mask_types = mask_type.split("_")
195
+
196
+ all_masks = [
197
+ get_elementary_mask(
198
+ T1,
199
+ T2,
200
+ mask,
201
+ sparse_attn_window,
202
+ global_window,
203
+ mask_random_seed,
204
+ sparsity,
205
+ device,
206
+ )
207
+ for mask in mask_types
208
+ ]
209
+
210
+ final_mask = torch.stack(all_masks).sum(axis=0) > 0
211
+
212
+ return SparseCSRTensor.from_dense(final_mask[None])
213
+
214
+
215
+ class ScaledEmbedding(nn.Module):
216
+ def __init__(
217
+ self,
218
+ num_embeddings: int,
219
+ embedding_dim: int,
220
+ scale: float = 1.0,
221
+ boost: float = 3.0,
222
+ ):
223
+ super().__init__()
224
+ self.embedding = nn.Embedding(num_embeddings, embedding_dim)
225
+ self.embedding.weight.data *= scale / boost
226
+ self.boost = boost
227
+
228
+ @property
229
+ def weight(self):
230
+ return self.embedding.weight * self.boost
231
+
232
+ def forward(self, x):
233
+ return self.embedding(x) * self.boost
234
+
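+ # Note on the boost trick above: weights are stored divided by `boost` and
+ # multiplied back on read, which under plain SGD effectively scales this
+ # embedding's learning rate by boost**2 relative to the rest of the model.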
235
+
236
+ class LayerScale(nn.Module):
237
+ """Layer scale from [Touvron et al 2021] (https://arxiv.org/pdf/2103.17239.pdf).
238
+ This diagonally rescales residual outputs close to 0 initially; the scale is then learnt.
239
+ """
240
+
241
+ def __init__(self, channels: int, init: float = 0, channel_last=False):
242
+ """
243
+ channel_last = False corresponds to (B, C, T) tensors
244
+ channel_last = True corresponds to (T, B, C) tensors
245
+ """
246
+ super().__init__()
247
+ self.channel_last = channel_last
248
+ self.scale = nn.Parameter(torch.zeros(channels, requires_grad=True))
249
+ self.scale.data[:] = init
250
+
251
+ def forward(self, x):
252
+ if self.channel_last:
253
+ return self.scale * x
254
+ else:
255
+ return self.scale[:, None] * x
256
+
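+ # Minimal sketch: with a small init the residual branch is multiplied by a
+ # learnable per-channel gain that starts near zero.
+ #
+ # >>> ls = LayerScale(3, init=1e-4, channel_last=True)
+ # >>> float(ls(torch.ones(1, 2, 3)).max())   # ~1e-4, up to float32 rounding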
257
+
258
+ class MyGroupNorm(nn.GroupNorm):
259
+ def __init__(self, *args, **kwargs):
260
+ super().__init__(*args, **kwargs)
261
+
262
+ def forward(self, x):
263
+ """
264
+ x: (B, T, C)
265
+ if num_groups=1: Normalisation on all T and C together for each B
266
+ """
267
+ x = x.transpose(1, 2)
268
+ return super().forward(x).transpose(1, 2)
269
+
270
+
271
+ class MyTransformerEncoderLayer(nn.TransformerEncoderLayer):
272
+ def __init__(
273
+ self,
274
+ d_model,
275
+ nhead,
276
+ dim_feedforward=2048,
277
+ dropout=0.1,
278
+ activation=F.relu,
279
+ group_norm=0,
280
+ norm_first=False,
281
+ norm_out=False,
282
+ layer_norm_eps=1e-5,
283
+ layer_scale=False,
284
+ init_values=1e-4,
285
+ device=None,
286
+ dtype=None,
287
+ sparse=False,
288
+ mask_type="diag",
289
+ mask_random_seed=42,
290
+ sparse_attn_window=500,
291
+ global_window=50,
292
+ auto_sparsity=False,
293
+ sparsity=0.95,
294
+ batch_first=False,
295
+ ):
296
+ factory_kwargs = {"device": device, "dtype": dtype}
297
+ super().__init__(
298
+ d_model=d_model,
299
+ nhead=nhead,
300
+ dim_feedforward=dim_feedforward,
301
+ dropout=dropout,
302
+ activation=activation,
303
+ layer_norm_eps=layer_norm_eps,
304
+ batch_first=batch_first,
305
+ norm_first=norm_first,
306
+ device=device,
307
+ dtype=dtype,
308
+ )
309
+ self.sparse = sparse
310
+ self.auto_sparsity = auto_sparsity
311
+ if sparse:
312
+ if not auto_sparsity:
313
+ self.mask_type = mask_type
314
+ self.sparse_attn_window = sparse_attn_window
315
+ self.global_window = global_window
316
+ self.sparsity = sparsity
317
+ if group_norm:
318
+ self.norm1 = MyGroupNorm(int(group_norm), d_model, eps=layer_norm_eps, **factory_kwargs)
319
+ self.norm2 = MyGroupNorm(int(group_norm), d_model, eps=layer_norm_eps, **factory_kwargs)
320
+
321
+ self.norm_out = None
322
+ if self.norm_first & norm_out:
323
+ self.norm_out = MyGroupNorm(num_groups=int(norm_out), num_channels=d_model)
324
+ self.gamma_1 = (
325
+ LayerScale(d_model, init_values, True) if layer_scale else nn.Identity()
326
+ )
327
+ self.gamma_2 = (
328
+ LayerScale(d_model, init_values, True) if layer_scale else nn.Identity()
329
+ )
330
+
331
+ if sparse:
332
+ self.self_attn = MultiheadAttention(
333
+ d_model, nhead, dropout=dropout, batch_first=batch_first,
334
+ auto_sparsity=sparsity if auto_sparsity else 0,
335
+ )
336
+ self.__setattr__("src_mask", torch.zeros(1, 1))
337
+ self.mask_random_seed = mask_random_seed
338
+
339
+ def forward(self, src, src_mask=None, src_key_padding_mask=None):
340
+ """
341
+ if batch_first = False, src shape is (T, B, C)
342
+ the case where batch_first=True is not covered
343
+ """
344
+ device = src.device
345
+ x = src
346
+ T, B, C = x.shape
347
+ if self.sparse and not self.auto_sparsity:
348
+ assert src_mask is None
349
+ src_mask = self.src_mask
350
+ if src_mask.shape[-1] != T:
351
+ src_mask = get_mask(
352
+ T,
353
+ T,
354
+ self.mask_type,
355
+ self.sparse_attn_window,
356
+ self.global_window,
357
+ self.mask_random_seed,
358
+ self.sparsity,
359
+ device,
360
+ )
361
+ self.__setattr__("src_mask", src_mask)
362
+
363
+ if self.norm_first:
364
+ x = x + self.gamma_1(
365
+ self._sa_block(self.norm1(x), src_mask, src_key_padding_mask)
366
+ )
367
+ x = x + self.gamma_2(self._ff_block(self.norm2(x)))
368
+
369
+ if self.norm_out:
370
+ x = self.norm_out(x)
371
+ else:
372
+ x = self.norm1(
373
+ x + self.gamma_1(self._sa_block(x, src_mask, src_key_padding_mask))
374
+ )
375
+ x = self.norm2(x + self.gamma_2(self._ff_block(x)))
376
+
377
+ return x
378
+
379
+
380
+ class CrossTransformerEncoderLayer(nn.Module):
381
+ def __init__(
382
+ self,
383
+ d_model: int,
384
+ nhead: int,
385
+ dim_feedforward: int = 2048,
386
+ dropout: float = 0.1,
387
+ activation=F.relu,
388
+ layer_norm_eps: float = 1e-5,
389
+ layer_scale: bool = False,
390
+ init_values: float = 1e-4,
391
+ norm_first: bool = False,
392
+ group_norm: bool = False,
393
+ norm_out: bool = False,
394
+ sparse=False,
395
+ mask_type="diag",
396
+ mask_random_seed=42,
397
+ sparse_attn_window=500,
398
+ global_window=50,
399
+ sparsity=0.95,
400
+ auto_sparsity=None,
401
+ device=None,
402
+ dtype=None,
403
+ batch_first=False,
404
+ ):
405
+ factory_kwargs = {"device": device, "dtype": dtype}
406
+ super().__init__()
407
+
408
+ self.sparse = sparse
409
+ self.auto_sparsity = auto_sparsity
410
+ if sparse:
411
+ if not auto_sparsity:
412
+ self.mask_type = mask_type
413
+ self.sparse_attn_window = sparse_attn_window
414
+ self.global_window = global_window
415
+ self.sparsity = sparsity
416
+
417
+ self.cross_attn: nn.Module
418
+ self.cross_attn = nn.MultiheadAttention(
419
+ d_model, nhead, dropout=dropout, batch_first=batch_first)
420
+ # Implementation of Feedforward model
421
+ self.linear1 = nn.Linear(d_model, dim_feedforward, **factory_kwargs)
422
+ self.dropout = nn.Dropout(dropout)
423
+ self.linear2 = nn.Linear(dim_feedforward, d_model, **factory_kwargs)
424
+
425
+ self.norm_first = norm_first
426
+ self.norm1: nn.Module
427
+ self.norm2: nn.Module
428
+ self.norm3: nn.Module
429
+ if group_norm:
430
+ self.norm1 = MyGroupNorm(int(group_norm), d_model, eps=layer_norm_eps, **factory_kwargs)
431
+ self.norm2 = MyGroupNorm(int(group_norm), d_model, eps=layer_norm_eps, **factory_kwargs)
432
+ self.norm3 = MyGroupNorm(int(group_norm), d_model, eps=layer_norm_eps, **factory_kwargs)
433
+ else:
434
+ self.norm1 = nn.LayerNorm(d_model, eps=layer_norm_eps, **factory_kwargs)
435
+ self.norm2 = nn.LayerNorm(d_model, eps=layer_norm_eps, **factory_kwargs)
436
+ self.norm3 = nn.LayerNorm(d_model, eps=layer_norm_eps, **factory_kwargs)
437
+
438
+ self.norm_out = None
439
+ if self.norm_first & norm_out:
440
+ self.norm_out = MyGroupNorm(num_groups=int(norm_out), num_channels=d_model)
441
+
442
+ self.gamma_1 = (
443
+ LayerScale(d_model, init_values, True) if layer_scale else nn.Identity()
444
+ )
445
+ self.gamma_2 = (
446
+ LayerScale(d_model, init_values, True) if layer_scale else nn.Identity()
447
+ )
448
+
449
+ self.dropout1 = nn.Dropout(dropout)
450
+ self.dropout2 = nn.Dropout(dropout)
451
+
452
+ # Legacy string support for activation function.
453
+ if isinstance(activation, str):
454
+ self.activation = self._get_activation_fn(activation)
455
+ else:
456
+ self.activation = activation
457
+
458
+ if sparse:
459
+ self.cross_attn = MultiheadAttention(
460
+ d_model, nhead, dropout=dropout, batch_first=batch_first,
461
+ auto_sparsity=sparsity if auto_sparsity else 0)
462
+ if not auto_sparsity:
463
+ self.__setattr__("mask", torch.zeros(1, 1))
464
+ self.mask_random_seed = mask_random_seed
465
+
466
+ def forward(self, q, k, mask=None):
467
+ """
468
+ Args:
469
+ q: tensor of shape (T, B, C)
470
+ k: tensor of shape (S, B, C)
471
+ mask: tensor of shape (T, S)
472
+
473
+ """
474
+ device = q.device
475
+ T, B, C = q.shape
476
+ S, B, C = k.shape
477
+ if self.sparse and not self.auto_sparsity:
478
+ assert mask is None
479
+ mask = self.mask
480
+ if mask.shape[-1] != S or mask.shape[-2] != T:
481
+ mask = get_mask(
482
+ S,
483
+ T,
484
+ self.mask_type,
485
+ self.sparse_attn_window,
486
+ self.global_window,
487
+ self.mask_random_seed,
488
+ self.sparsity,
489
+ device,
490
+ )
491
+ self.__setattr__("mask", mask)
492
+
493
+ if self.norm_first:
494
+ x = q + self.gamma_1(self._ca_block(self.norm1(q), self.norm2(k), mask))
495
+ x = x + self.gamma_2(self._ff_block(self.norm3(x)))
496
+ if self.norm_out:
497
+ x = self.norm_out(x)
498
+ else:
499
+ x = self.norm1(q + self.gamma_1(self._ca_block(q, k, mask)))
500
+ x = self.norm2(x + self.gamma_2(self._ff_block(x)))
501
+
502
+ return x
503
+
504
+ # cross-attention block
505
+ def _ca_block(self, q, k, attn_mask=None):
506
+ x = self.cross_attn(q, k, k, attn_mask=attn_mask, need_weights=False)[0]
507
+ return self.dropout1(x)
508
+
509
+ # feed forward block
510
+ def _ff_block(self, x):
511
+ x = self.linear2(self.dropout(self.activation(self.linear1(x))))
512
+ return self.dropout2(x)
513
+
514
+ def _get_activation_fn(self, activation):
515
+ if activation == "relu":
516
+ return F.relu
517
+ elif activation == "gelu":
518
+ return F.gelu
519
+
520
+ raise RuntimeError("activation should be relu/gelu, not {}".format(activation))
521
+
522
+
523
+ # ----------------- MULTI-BLOCKS MODELS: -----------------------
524
+
525
+
526
+ class CrossTransformerEncoder(nn.Module):
527
+ def __init__(
528
+ self,
529
+ dim: int,
530
+ emb: str = "sin",
531
+ hidden_scale: float = 4.0,
532
+ num_heads: int = 8,
533
+ num_layers: int = 6,
534
+ cross_first: bool = False,
535
+ dropout: float = 0.0,
536
+ max_positions: int = 1000,
537
+ norm_in: bool = True,
538
+ norm_in_group: bool = False,
539
+ group_norm: int = False,
540
+ norm_first: bool = False,
541
+ norm_out: bool = False,
542
+ max_period: float = 10000.0,
543
+ weight_decay: float = 0.0,
544
+ lr: tp.Optional[float] = None,
545
+ layer_scale: bool = False,
546
+ gelu: bool = True,
547
+ sin_random_shift: int = 0,
548
+ weight_pos_embed: float = 1.0,
549
+ cape_mean_normalize: bool = True,
550
+ cape_augment: bool = True,
551
+ cape_glob_loc_scale: list = [5000.0, 1.0, 1.4],
552
+ sparse_self_attn: bool = False,
553
+ sparse_cross_attn: bool = False,
554
+ mask_type: str = "diag",
555
+ mask_random_seed: int = 42,
556
+ sparse_attn_window: int = 500,
557
+ global_window: int = 50,
558
+ auto_sparsity: bool = False,
559
+ sparsity: float = 0.95,
560
+ ):
561
+ super().__init__()
562
+ """
563
+ """
564
+ assert dim % num_heads == 0
565
+
566
+ hidden_dim = int(dim * hidden_scale)
567
+
568
+ self.num_layers = num_layers
569
+ # classic parity = 1 means that if idx%2 == 1 there is a
570
+ # classical encoder else there is a cross encoder
571
+ self.classic_parity = 1 if cross_first else 0
572
+ self.emb = emb
573
+ self.max_period = max_period
574
+ self.weight_decay = weight_decay
575
+ self.weight_pos_embed = weight_pos_embed
576
+ self.sin_random_shift = sin_random_shift
577
+ if emb == "cape":
578
+ self.cape_mean_normalize = cape_mean_normalize
579
+ self.cape_augment = cape_augment
580
+ self.cape_glob_loc_scale = cape_glob_loc_scale
581
+ if emb == "scaled":
582
+ self.position_embeddings = ScaledEmbedding(max_positions, dim, scale=0.2)
583
+
584
+ self.lr = lr
585
+
586
+ activation: tp.Any = F.gelu if gelu else F.relu
587
+
588
+ self.norm_in: nn.Module
589
+ self.norm_in_t: nn.Module
590
+ if norm_in:
591
+ self.norm_in = nn.LayerNorm(dim)
592
+ self.norm_in_t = nn.LayerNorm(dim)
593
+ elif norm_in_group:
594
+ self.norm_in = MyGroupNorm(int(norm_in_group), dim)
595
+ self.norm_in_t = MyGroupNorm(int(norm_in_group), dim)
596
+ else:
597
+ self.norm_in = nn.Identity()
598
+ self.norm_in_t = nn.Identity()
599
+
600
+ # spectrogram layers
601
+ self.layers = nn.ModuleList()
602
+ # temporal layers
603
+ self.layers_t = nn.ModuleList()
604
+
605
+ kwargs_common = {
606
+ "d_model": dim,
607
+ "nhead": num_heads,
608
+ "dim_feedforward": hidden_dim,
609
+ "dropout": dropout,
610
+ "activation": activation,
611
+ "group_norm": group_norm,
612
+ "norm_first": norm_first,
613
+ "norm_out": norm_out,
614
+ "layer_scale": layer_scale,
615
+ "mask_type": mask_type,
616
+ "mask_random_seed": mask_random_seed,
617
+ "sparse_attn_window": sparse_attn_window,
618
+ "global_window": global_window,
619
+ "sparsity": sparsity,
620
+ "auto_sparsity": auto_sparsity,
621
+ "batch_first": True,
622
+ }
623
+
624
+ kwargs_classic_encoder = dict(kwargs_common)
625
+ kwargs_classic_encoder.update({
626
+ "sparse": sparse_self_attn,
627
+ })
628
+ kwargs_cross_encoder = dict(kwargs_common)
629
+ kwargs_cross_encoder.update({
630
+ "sparse": sparse_cross_attn,
631
+ })
632
+
633
+ for idx in range(num_layers):
634
+ if idx % 2 == self.classic_parity:
635
+
636
+ self.layers.append(MyTransformerEncoderLayer(**kwargs_classic_encoder))
637
+ self.layers_t.append(
638
+ MyTransformerEncoderLayer(**kwargs_classic_encoder)
639
+ )
640
+
641
+ else:
642
+ self.layers.append(CrossTransformerEncoderLayer(**kwargs_cross_encoder))
643
+
644
+ self.layers_t.append(
645
+ CrossTransformerEncoderLayer(**kwargs_cross_encoder)
646
+ )
647
+
648
+ def forward(self, x, xt):
649
+ B, C, Fr, T1 = x.shape
650
+ pos_emb_2d = create_2d_sin_embedding(
651
+ C, Fr, T1, x.device, self.max_period
652
+ ) # (1, C, Fr, T1)
653
+ pos_emb_2d = rearrange(pos_emb_2d, "b c fr t1 -> b (t1 fr) c")
654
+ x = rearrange(x, "b c fr t1 -> b (t1 fr) c")
655
+ x = self.norm_in(x)
656
+ x = x + self.weight_pos_embed * pos_emb_2d
657
+
658
+ B, C, T2 = xt.shape
659
+ xt = rearrange(xt, "b c t2 -> b t2 c") # now T2, B, C
660
+ pos_emb = self._get_pos_embedding(T2, B, C, x.device)
661
+ pos_emb = rearrange(pos_emb, "t2 b c -> b t2 c")
662
+ xt = self.norm_in_t(xt)
663
+ xt = xt + self.weight_pos_embed * pos_emb
664
+
665
+ for idx in range(self.num_layers):
666
+ if idx % 2 == self.classic_parity:
667
+ x = self.layers[idx](x)
668
+ xt = self.layers_t[idx](xt)
669
+ else:
670
+ old_x = x
671
+ x = self.layers[idx](x, xt)
672
+ xt = self.layers_t[idx](xt, old_x)
673
+
674
+ x = rearrange(x, "b (t1 fr) c -> b c fr t1", t1=T1)
675
+ xt = rearrange(xt, "b t2 c -> b c t2")
676
+ return x, xt
677
+
678
+ def _get_pos_embedding(self, T, B, C, device):
679
+ if self.emb == "sin":
680
+ shift = random.randrange(self.sin_random_shift + 1)
681
+ pos_emb = create_sin_embedding(
682
+ T, C, shift=shift, device=device, max_period=self.max_period
683
+ )
684
+ elif self.emb == "cape":
685
+ if self.training:
686
+ pos_emb = create_sin_embedding_cape(
687
+ T,
688
+ C,
689
+ B,
690
+ device=device,
691
+ max_period=self.max_period,
692
+ mean_normalize=self.cape_mean_normalize,
693
+ augment=self.cape_augment,
694
+ max_global_shift=self.cape_glob_loc_scale[0],
695
+ max_local_shift=self.cape_glob_loc_scale[1],
696
+ max_scale=self.cape_glob_loc_scale[2],
697
+ )
698
+ else:
699
+ pos_emb = create_sin_embedding_cape(
700
+ T,
701
+ C,
702
+ B,
703
+ device=device,
704
+ max_period=self.max_period,
705
+ mean_normalize=self.cape_mean_normalize,
706
+ augment=False,
707
+ )
708
+
709
+ elif self.emb == "scaled":
710
+ pos = torch.arange(T, device=device)
711
+ pos_emb = self.position_embeddings(pos)[:, None]
712
+
713
+ return pos_emb
714
+
715
+ def make_optim_group(self):
716
+ group = {"params": list(self.parameters()), "weight_decay": self.weight_decay}
717
+ if self.lr is not None:
718
+ group["lr"] = self.lr
719
+ return group
720
+
721
+
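A minimal usage sketch of the encoder defined above (editorial note, not part of the diff). It assumes the class is importable from this repo's `demucs3.transformer` module and only illustrates the expected tensor shapes of the spectral and temporal branches:

import torch
from demucs3.transformer import CrossTransformerEncoder  # import path assumed from this repo layout

enc = CrossTransformerEncoder(dim=64, num_heads=8, num_layers=2)
x = torch.randn(2, 64, 16, 10)   # spectral branch: (B, C, Fr, T1)
xt = torch.randn(2, 64, 160)     # temporal branch: (B, C, T2)
y, yt = enc(x, xt)               # layer 0: per-branch self-attention, layer 1: cross-attention
assert y.shape == x.shape and yt.shape == xt.shape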
722
+ # Attention Modules
723
+
724
+
725
+ class MultiheadAttention(nn.Module):
726
+ def __init__(
727
+ self,
728
+ embed_dim,
729
+ num_heads,
730
+ dropout=0.0,
731
+ bias=True,
732
+ add_bias_kv=False,
733
+ add_zero_attn=False,
734
+ kdim=None,
735
+ vdim=None,
736
+ batch_first=False,
737
+ auto_sparsity=None,
738
+ ):
739
+ super().__init__()
740
+ assert auto_sparsity is not None, "sanity check"
741
+ self.num_heads = num_heads
742
+ self.q = torch.nn.Linear(embed_dim, embed_dim, bias=bias)
743
+ self.k = torch.nn.Linear(embed_dim, embed_dim, bias=bias)
744
+ self.v = torch.nn.Linear(embed_dim, embed_dim, bias=bias)
745
+ self.attn_drop = torch.nn.Dropout(dropout)
746
+ self.proj = torch.nn.Linear(embed_dim, embed_dim, bias)
747
+ self.proj_drop = torch.nn.Dropout(dropout)
748
+ self.batch_first = batch_first
749
+ self.auto_sparsity = auto_sparsity
750
+
751
+ def forward(
752
+ self,
753
+ query,
754
+ key,
755
+ value,
756
+ key_padding_mask=None,
757
+ need_weights=True,
758
+ attn_mask=None,
759
+ average_attn_weights=True,
760
+ ):
761
+
762
+ if not self.batch_first: # N, B, C
763
+ query = query.permute(1, 0, 2) # B, N_q, C
764
+ key = key.permute(1, 0, 2) # B, N_k, C
765
+ value = value.permute(1, 0, 2) # B, N_k, C
766
+ B, N_q, C = query.shape
767
+ B, N_k, C = key.shape
768
+
769
+ q = (
770
+ self.q(query)
771
+ .reshape(B, N_q, self.num_heads, C // self.num_heads)
772
+ .permute(0, 2, 1, 3)
773
+ )
774
+ q = q.flatten(0, 1)
775
+ k = (
776
+ self.k(key)
777
+ .reshape(B, N_k, self.num_heads, C // self.num_heads)
778
+ .permute(0, 2, 1, 3)
779
+ )
780
+ k = k.flatten(0, 1)
781
+ v = (
782
+ self.v(value)
783
+ .reshape(B, N_k, self.num_heads, C // self.num_heads)
784
+ .permute(0, 2, 1, 3)
785
+ )
786
+ v = v.flatten(0, 1)
787
+
788
+ if self.auto_sparsity:
789
+ assert attn_mask is None
790
+ x = dynamic_sparse_attention(q, k, v, sparsity=self.auto_sparsity)
791
+ else:
792
+ x = scaled_dot_product_attention(q, k, v, attn_mask, dropout=self.attn_drop)
793
+ x = x.reshape(B, self.num_heads, N_q, C // self.num_heads)
794
+
795
+ x = x.transpose(1, 2).reshape(B, N_q, C)
796
+ x = self.proj(x)
797
+ x = self.proj_drop(x)
798
+ if not self.batch_first:
799
+ x = x.permute(1, 0, 2)
800
+ return x, None
801
+
802
+
803
+ def scaled_query_key_softmax(q, k, att_mask):
804
+ from xformers.ops import masked_matmul
805
+ q = q / (k.size(-1)) ** 0.5
806
+ att = masked_matmul(q, k.transpose(-2, -1), att_mask)
807
+ att = torch.nn.functional.softmax(att, -1)
808
+ return att
809
+
810
+
811
+ def scaled_dot_product_attention(q, k, v, att_mask, dropout):
812
+ att = scaled_query_key_softmax(q, k, att_mask=att_mask)
813
+ att = dropout(att)
814
+ y = att @ v
815
+ return y
816
+
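For reference, the same computation without the xformers dependency (an editorial sketch, not the repo's code path): the scaling, masked matmul and softmax above reduce to familiar dense attention when no mask is used.

import torch

def dense_scaled_dot_product_attention(q, k, v, dropout=torch.nn.Identity()):
    # q, k, v: (batch * heads, tokens, head_dim)
    att = torch.softmax((q / k.size(-1) ** 0.5) @ k.transpose(-2, -1), dim=-1)
    return dropout(att) @ v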
817
+
818
+ def _compute_buckets(x, R):
819
+ qq = torch.einsum('btf,bfhi->bhti', x, R)
820
+ qq = torch.cat([qq, -qq], dim=-1)
821
+ buckets = qq.argmax(dim=-1)
822
+
823
+ return buckets.permute(0, 2, 1).byte().contiguous()
824
+
825
+
826
+ def dynamic_sparse_attention(query, key, value, sparsity, infer_sparsity=True, attn_bias=None):
827
+ # assert False, "The code for the custom sparse kernel is not ready for release yet."
828
+ from xformers.ops import find_locations, sparse_memory_efficient_attention
829
+ n_hashes = 32
830
+ proj_size = 4
831
+ query, key, value = [x.contiguous() for x in [query, key, value]]
832
+ with torch.no_grad():
833
+ R = torch.randn(1, query.shape[-1], n_hashes, proj_size // 2, device=query.device)
834
+ bucket_query = _compute_buckets(query, R)
835
+ bucket_key = _compute_buckets(key, R)
836
+ row_offsets, column_indices = find_locations(
837
+ bucket_query, bucket_key, sparsity, infer_sparsity)
838
+ return sparse_memory_efficient_attention(
839
+ query, key, value, row_offsets, column_indices, attn_bias)
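The bucketing above is a locality-sensitive hash: each token is projected with the random matrix R, `[qq, -qq]` doubles the projections so the argmax picks one of the signed directions, and query/key tokens that share a bucket become candidate pairs for the sparse kernel. A shape-only sketch (editorial; the sparse kernel itself comes from xformers and is not reproduced here):

import torch
x = torch.randn(2, 10, 16)                 # (batch * heads, tokens, features)
R = torch.randn(1, 16, 32, 2)              # (1, features, n_hashes, proj_size // 2)
qq = torch.einsum('btf,bfhi->bhti', x, R)
buckets = torch.cat([qq, -qq], dim=-1).argmax(dim=-1)
print(buckets.shape)                        # torch.Size([2, 32, 10]); values in [0, 3]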
MVSEP-MDX23-music-separation-model/demucs3/utils.py ADDED
@@ -0,0 +1,141 @@
1
+ # Copyright (c) Meta, Inc. and its affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from collections import defaultdict
8
+ from contextlib import contextmanager
9
+ import math
10
+ import os
11
+ import tempfile
12
+ import typing as tp
13
+
14
+ import torch
15
+ from torch.nn import functional as F
16
+ from torch.utils.data import Subset
17
+
18
+
19
+ def unfold(a, kernel_size, stride):
20
+ """Given input of size [*OT, T], output Tensor of size [*OT, F, K]
21
+ with K the kernel size, by extracting frames with the given stride.
22
+
23
+ This will pad the input so that `F = ceil(T / K)`.
24
+
25
+ see https://github.com/pytorch/pytorch/issues/60466
26
+ """
27
+ *shape, length = a.shape
28
+ n_frames = math.ceil(length / stride)
29
+ tgt_length = (n_frames - 1) * stride + kernel_size
30
+ a = F.pad(a, (0, tgt_length - length))
31
+ strides = list(a.stride())
32
+ assert strides[-1] == 1, 'data should be contiguous'
33
+ strides = strides[:-1] + [stride, 1]
34
+ return a.as_strided([*shape, n_frames, kernel_size], strides)
35
+
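A quick usage sketch (editorial), assuming `unfold` is importable from `demucs3.utils`: a length-10 signal with kernel_size = stride = 4 is zero-padded to 12 samples and viewed as ceil(10 / 4) = 3 frames.

import torch
from demucs3.utils import unfold  # import path assumed

a = torch.arange(10.)
frames = unfold(a, kernel_size=4, stride=4)
print(frames.shape)    # torch.Size([3, 4])
print(frames[2])       # tensor([8., 9., 0., 0.]) -- tail frame is zero-padded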
36
+
37
+ def center_trim(tensor: torch.Tensor, reference: tp.Union[torch.Tensor, int]):
38
+ """
39
+ Center trim `tensor` with respect to `reference`, along the last dimension.
40
+ `reference` can also be a number, representing the length to trim to.
41
+ If the size difference != 0 mod 2, the extra sample is removed on the right side.
42
+ """
43
+ ref_size: int
44
+ if isinstance(reference, torch.Tensor):
45
+ ref_size = reference.size(-1)
46
+ else:
47
+ ref_size = reference
48
+ delta = tensor.size(-1) - ref_size
49
+ if delta < 0:
50
+ raise ValueError("tensor must be larger than reference. " f"Delta is {delta}.")
51
+ if delta:
52
+ tensor = tensor[..., delta // 2:-(delta - delta // 2)]
53
+ return tensor
54
+
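Example of the trimming rule (editorial sketch): with a length-10 tensor and a reference length of 7, delta = 3, so one sample is dropped on the left and two on the right, matching the docstring.

import torch
from demucs3.utils import center_trim  # import path assumed

x = torch.arange(10.)
print(center_trim(x, 7))   # tensor([1., 2., 3., 4., 5., 6., 7.])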
55
+
56
+ def pull_metric(history: tp.List[dict], name: str):
57
+ out = []
58
+ for metrics in history:
59
+ metric = metrics
60
+ for part in name.split("."):
61
+ metric = metric[part]
62
+ out.append(metric)
63
+ return out
64
+
65
+
66
+ def EMA(beta: float = 1):
67
+ """
68
+ Exponential Moving Average callback.
69
+ Returns a single function that can be called repeatedly to update the EMA
70
+ with a dict of metrics. The callback will return
71
+ the new averaged dict of metrics.
72
+
73
+ Note that for `beta=1`, this is just plain averaging.
74
+ """
75
+ fix: tp.Dict[str, float] = defaultdict(float)
76
+ total: tp.Dict[str, float] = defaultdict(float)
77
+
78
+ def _update(metrics: dict, weight: float = 1) -> dict:
79
+ nonlocal total, fix
80
+ for key, value in metrics.items():
81
+ total[key] = total[key] * beta + weight * float(value)
82
+ fix[key] = fix[key] * beta + weight
83
+ return {key: tot / fix[key] for key, tot in total.items()}
84
+ return _update
85
+
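A small check of the `beta=1` case mentioned in the docstring (editorial sketch): with beta = 1 the callback is a plain running average of the metrics it has seen.

from demucs3.utils import EMA  # import path assumed

ema = EMA(beta=1)
ema({'loss': 1.0})           # -> {'loss': 1.0}
print(ema({'loss': 3.0}))    # -> {'loss': 2.0}, the plain average of 1.0 and 3.0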
86
+
87
+ def sizeof_fmt(num: float, suffix: str = 'B'):
88
+ """
89
+ Given `num` bytes, return human readable size.
90
+ Taken from https://stackoverflow.com/a/1094933
91
+ """
92
+ for unit in ['', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi']:
93
+ if abs(num) < 1024.0:
94
+ return "%3.1f%s%s" % (num, unit, suffix)
95
+ num /= 1024.0
96
+ return "%.1f%s%s" % (num, 'Yi', suffix)
97
+
98
+
99
+ @contextmanager
100
+ def temp_filenames(count: int, delete=True):
101
+ names = []
102
+ try:
103
+ for _ in range(count):
104
+ names.append(tempfile.NamedTemporaryFile(delete=False).name)
105
+ yield names
106
+ finally:
107
+ if delete:
108
+ for name in names:
109
+ os.unlink(name)
110
+
111
+
112
+ def random_subset(dataset, max_samples: int, seed: int = 42):
113
+ if max_samples >= len(dataset):
114
+ return dataset
115
+
116
+ generator = torch.Generator().manual_seed(seed)
117
+ perm = torch.randperm(len(dataset), generator=generator)
118
+ return Subset(dataset, perm[:max_samples].tolist())
119
+
120
+
121
+ class DummyPoolExecutor:
122
+ class DummyResult:
123
+ def __init__(self, func, *args, **kwargs):
124
+ self.func = func
125
+ self.args = args
126
+ self.kwargs = kwargs
127
+
128
+ def result(self):
129
+ return self.func(*self.args, **self.kwargs)
130
+
131
+ def __init__(self, workers=0):
132
+ pass
133
+
134
+ def submit(self, func, *args, **kwargs):
135
+ return DummyPoolExecutor.DummyResult(func, *args, **kwargs)
136
+
137
+ def __enter__(self):
138
+ return self
139
+
140
+ def __exit__(self, exc_type, exc_value, exc_tb):
141
+ return
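The class above mirrors the small part of the `concurrent.futures` executor API that the rest of the code needs, but runs everything lazily in the caller's thread. Editorial usage sketch:

from demucs3.utils import DummyPoolExecutor  # import path assumed

with DummyPoolExecutor() as pool:
    fut = pool.submit(pow, 2, 10)
    print(fut.result())   # 1024, computed only when .result() is called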
MVSEP-MDX23-music-separation-model/gui.py ADDED
@@ -0,0 +1,411 @@
1
+ # coding: utf-8
2
+ __author__ = 'Roman Solovyev (ZFTurbo), IPPM RAS'
3
+
4
+ if __name__ == '__main__':
5
+ import os
6
+
7
+ gpu_use = "0"
8
+ print('GPU use: {}'.format(gpu_use))
9
+ os.environ["CUDA_VISIBLE_DEVICES"] = "{}".format(gpu_use)
10
+
11
+ import time
12
+ import os
13
+ import numpy as np
14
+ from PyQt5.QtCore import *
15
+ from PyQt5 import QtCore
16
+ from PyQt5.QtWidgets import *
17
+ from PyQt5.QtGui import *
18
+ import sys
19
+ from inference import predict_with_model, __VERSION__
20
+ import torch
21
+
22
+
23
+ root = dict()
24
+
25
+
26
+ class Worker(QObject):
27
+ finished = pyqtSignal()
28
+ progress = pyqtSignal(int)
29
+
30
+ def __init__(self, options):
31
+ super().__init__()
32
+ self.options = options
33
+
34
+ def run(self):
35
+ global root
36
+ # Pass the bound update_progress method itself (not its result) as the progress callback
37
+ self.options['update_percent_func'] = self.update_progress
38
+ predict_with_model(self.options)
39
+ root['button_start'].setDisabled(False)
40
+ root['button_finish'].setDisabled(True)
41
+ root['start_proc'] = False
42
+ self.finished.emit()
43
+
44
+ def update_progress(self, percent):
45
+ self.progress.emit(percent)
46
+
47
+
48
+ class Ui_Dialog(object):
49
+ def setupUi(self, Dialog):
50
+ global root
51
+
52
+ Dialog.setObjectName("Settings")
53
+ Dialog.resize(370, 320)
54
+
55
+ self.checkbox_cpu = QCheckBox("Use CPU instead of GPU?", Dialog)
56
+ self.checkbox_cpu.move(30, 10)
57
+ self.checkbox_cpu.resize(320, 40)
58
+ if root['cpu']:
59
+ self.checkbox_cpu.setChecked(True)
60
+
61
+ self.checkbox_single_onnx = QCheckBox("Use single ONNX?", Dialog)
62
+ self.checkbox_single_onnx.move(30, 40)
63
+ self.checkbox_single_onnx.resize(320, 40)
64
+ if root['single_onnx']:
65
+ self.checkbox_single_onnx.setChecked(True)
66
+
67
+ self.checkbox_large_gpu = QCheckBox("Use large GPU?", Dialog)
68
+ self.checkbox_large_gpu.move(30, 70)
69
+ self.checkbox_large_gpu.resize(320, 40)
70
+ if root['large_gpu']:
71
+ self.checkbox_large_gpu.setChecked(True)
72
+
73
+ self.checkbox_kim_1 = QCheckBox("Use old Kim Vocal model?", Dialog)
74
+ self.checkbox_kim_1.move(30, 100)
75
+ self.checkbox_kim_1.resize(320, 40)
76
+ if root['use_kim_model_1']:
77
+ self.checkbox_kim_1.setChecked(True)
78
+
79
+ self.checkbox_only_vocals = QCheckBox("Generate only vocals/instrumental?", Dialog)
80
+ self.checkbox_only_vocals.move(30, 130)
81
+ self.checkbox_only_vocals.resize(320, 40)
82
+ if root['only_vocals']:
83
+ self.checkbox_only_vocals.setChecked(True)
84
+
85
+ self.chunk_size_label = QLabel(Dialog)
86
+ self.chunk_size_label.setText('Chunk size')
87
+ self.chunk_size_label.move(30, 160)
88
+ self.chunk_size_label.resize(320, 40)
89
+
90
+ self.chunk_size_valid = QIntValidator(bottom=100000, top=10000000)
91
+ self.chunk_size = QLineEdit(Dialog)
92
+ self.chunk_size.setFixedWidth(140)
93
+ self.chunk_size.move(130, 170)
94
+ self.chunk_size.setValidator(self.chunk_size_valid)
95
+ self.chunk_size.setText(str(root['chunk_size']))
96
+
97
+ self.overlap_large_label = QLabel(Dialog)
98
+ self.overlap_large_label.setText('Overlap large')
99
+ self.overlap_large_label.move(30, 190)
100
+ self.overlap_large_label.resize(320, 40)
101
+
102
+ self.overlap_large_valid = QDoubleValidator(bottom=0.001, top=0.999, decimals=10)
103
+ self.overlap_large_valid.setNotation(QDoubleValidator.Notation.StandardNotation)
104
+ self.overlap_large = QLineEdit(Dialog)
105
+ self.overlap_large.setFixedWidth(140)
106
+ self.overlap_large.move(130, 200)
107
+ self.overlap_large.setValidator(self.overlap_large_valid)
108
+ self.overlap_large.setText(str(root['overlap_large']))
109
+
110
+ self.overlap_small_label = QLabel(Dialog)
111
+ self.overlap_small_label.setText('Overlap small')
112
+ self.overlap_small_label.move(30, 220)
113
+ self.overlap_small_label.resize(320, 40)
114
+
115
+ self.overlap_small_valid = QDoubleValidator(0.001, 0.999, 10)
116
+ self.overlap_small_valid.setNotation(QDoubleValidator.Notation.StandardNotation)
117
+ self.overlap_small = QLineEdit(Dialog)
118
+ self.overlap_small.setFixedWidth(140)
119
+ self.overlap_small.move(130, 230)
120
+ self.overlap_small.setValidator(self.overlap_small_valid)
121
+ self.overlap_small.setText(str(root['overlap_small']))
122
+
123
+ self.pushButton_save = QPushButton(Dialog)
124
+ self.pushButton_save.setObjectName("pushButton_save")
125
+ self.pushButton_save.move(30, 280)
126
+ self.pushButton_save.resize(150, 35)
127
+
128
+ self.pushButton_cancel = QPushButton(Dialog)
129
+ self.pushButton_cancel.setObjectName("pushButton_cancel")
130
+ self.pushButton_cancel.move(190, 280)
131
+ self.pushButton_cancel.resize(150, 35)
132
+
133
+ self.retranslateUi(Dialog)
134
+ QtCore.QMetaObject.connectSlotsByName(Dialog)
135
+ self.Dialog = Dialog
136
+
137
+ # connect the two functions
138
+ self.pushButton_save.clicked.connect(self.return_save)
139
+ self.pushButton_cancel.clicked.connect(self.return_cancel)
140
+
141
+ def retranslateUi(self, Dialog):
142
+ _translate = QtCore.QCoreApplication.translate
143
+ Dialog.setWindowTitle(_translate("Settings", "Settings"))
144
+ self.pushButton_cancel.setText(_translate("Settings", "Cancel"))
145
+ self.pushButton_save.setText(_translate("Settings", "Save settings"))
146
+
147
+ def return_save(self):
148
+ global root
149
+ # print("save")
150
+ root['cpu'] = self.checkbox_cpu.isChecked()
151
+ root['single_onnx'] = self.checkbox_single_onnx.isChecked()
152
+ root['large_gpu'] = self.checkbox_large_gpu.isChecked()
153
+ root['use_kim_model_1'] = self.checkbox_kim_1.isChecked()
154
+ root['only_vocals'] = self.checkbox_only_vocals.isChecked()
155
+
156
+ chunk_size_text = self.chunk_size.text()
157
+ state = self.chunk_size_valid.validate(chunk_size_text, 0)
158
+ if state[0] == QValidator.State.Acceptable:
159
+ root['chunk_size'] = chunk_size_text
160
+
161
+ overlap_large_text = self.overlap_large.text()
162
+ # Locale issue: QDoubleValidator may expect a comma instead of a dot, so validation is skipped here
163
+ if 0:
164
+ state = self.overlap_large_valid.validate(overlap_large_text, 0)
165
+ if state[0] == QValidator.State.Acceptable:
166
+ root['overlap_large'] = float(overlap_large_text)
167
+ else:
168
+ root['overlap_large'] = float(overlap_large_text)
169
+
170
+ overlap_small_text = self.overlap_small.text()
171
+ if 0:
172
+ state = self.overlap_small_valid.validate(overlap_small_text, 0)
173
+ if state[0] == QValidator.State.Acceptable:
174
+ root['overlap_small'] = float(overlap_small_text)
175
+ else:
176
+ root['overlap_small'] = float(overlap_small_text)
177
+
178
+ self.Dialog.close()
179
+
180
+ def return_cancel(self):
181
+ global root
182
+ # print("cancel")
183
+ self.Dialog.close()
184
+
185
+
186
+ class MyWidget(QWidget):
187
+ def __init__(self):
188
+ super().__init__()
189
+ self.initUI()
190
+
191
+ def initUI(self):
192
+ self.resize(560, 360)
193
+ self.move(300, 300)
194
+ self.setWindowTitle('MVSEP music separation model')
195
+ self.setAcceptDrops(True)
196
+
197
+ def dragEnterEvent(self, event):
198
+ if event.mimeData().hasUrls():
199
+ event.accept()
200
+ else:
201
+ event.ignore()
202
+
203
+ def dropEvent(self, event):
204
+ global root
205
+ files = [u.toLocalFile() for u in event.mimeData().urls()]
206
+ txt = ''
207
+ root['input_files'] = []
208
+ for f in files:
209
+ root['input_files'].append(f)
210
+ txt += f + '\n'
211
+ root['input_files_list_text_area'].insertPlainText(txt)
212
+ root['progress_bar'].setValue(0)
213
+
214
+ def execute_long_task(self):
215
+ global root
216
+
217
+ if len(root['input_files']) == 0:
218
+ QMessageBox.about(root['w'], "Error", "No input files specified!")
219
+ return
220
+
221
+ root['progress_bar'].show()
222
+ root['button_start'].setDisabled(True)
223
+ root['button_finish'].setDisabled(False)
224
+ root['start_proc'] = True
225
+
226
+ options = {
227
+ 'input_audio': root['input_files'],
228
+ 'output_folder': root['output_folder'],
229
+ 'cpu': root['cpu'],
230
+ 'single_onnx': root['single_onnx'],
231
+ 'large_gpu': root['large_gpu'],
232
+ 'chunk_size': root['chunk_size'],
233
+ 'overlap_large': root['overlap_large'],
234
+ 'overlap_small': root['overlap_small'],
235
+ 'use_kim_model_1': root['use_kim_model_1'],
236
+ 'only_vocals': root['only_vocals'],
237
+ }
238
+
239
+ self.update_progress(0)
240
+ self.thread = QThread()
241
+ self.worker = Worker(options)
242
+ self.worker.moveToThread(self.thread)
243
+
244
+ self.thread.started.connect(self.worker.run)
245
+ self.worker.finished.connect(self.thread.quit)
246
+ self.worker.finished.connect(self.worker.deleteLater)
247
+ self.thread.finished.connect(self.thread.deleteLater)
248
+ self.worker.progress.connect(self.update_progress)
249
+
250
+ self.thread.start()
251
+
252
+ def stop_separation(self):
253
+ global root
254
+ self.thread.terminate()
255
+ root['button_start'].setDisabled(False)
256
+ root['button_finish'].setDisabled(True)
257
+ root['start_proc'] = False
258
+ root['progress_bar'].hide()
259
+
260
+ def update_progress(self, progress):
261
+ global root
262
+ root['progress_bar'].setValue(progress)
263
+
264
+ def open_settings(self):
265
+ global root
266
+ dialog = QDialog()
267
+ dialog.ui = Ui_Dialog()
268
+ dialog.ui.setupUi(dialog)
269
+ dialog.exec_()
270
+
271
+
272
+ def dialog_select_input_files():
273
+ global root
274
+ files, _ = QFileDialog.getOpenFileNames(
275
+ None,
276
+ "QFileDialog.getOpenFileNames()",
277
+ "",
278
+ "All Files (*);;Audio Files (*.wav, *.mp3, *.flac)",
279
+ )
280
+ if files:
281
+ txt = ''
282
+ root['input_files'] = []
283
+ for f in files:
284
+ root['input_files'].append(f)
285
+ txt += f + '\n'
286
+ root['input_files_list_text_area'].insertPlainText(txt)
287
+ root['progress_bar'].setValue(0)
288
+ return files
289
+
290
+
291
+ def dialog_select_output_folder():
292
+ global root
293
+ foldername = QFileDialog.getExistingDirectory(
294
+ None,
295
+ "Select Directory"
296
+ )
297
+ root['output_folder'] = foldername + '/'
298
+ root['output_folder_line_edit'].setText(root['output_folder'])
299
+ return foldername
300
+
301
+
302
+ def create_dialog():
303
+ global root
304
+ app = QApplication(sys.argv)
305
+
306
+ w = MyWidget()
307
+
308
+ root['input_files'] = []
309
+ root['output_folder'] = os.path.dirname(os.path.abspath(__file__)) + '/results/'
310
+ root['cpu'] = False
311
+ root['large_gpu'] = False
312
+ root['single_onnx'] = False
313
+ root['chunk_size'] = 1000000
314
+ root['overlap_large'] = 0.6
315
+ root['overlap_small'] = 0.5
316
+ root['use_kim_model_1'] = False
317
+ root['only_vocals'] = False
318
+
319
+ t = torch.cuda.get_device_properties(0).total_memory / (1024 * 1024 * 1024) if torch.cuda.is_available() else 0  # 0 GB when no CUDA device is present
320
+ if t > 11.5:
321
+ print('You have enough GPU memory ({:.2f} GB), so we set fast GPU mode. You can change in settings!'.format(t))
322
+ root['large_gpu'] = True
323
+ root['single_onnx'] = False
324
+ elif t < 8:
325
+ root['large_gpu'] = False
326
+ root['single_onnx'] = True
327
+ root['chunk_size'] = 500000
328
+
329
+ button_select_input_files = QPushButton(w)
330
+ button_select_input_files.setText("Input audio files")
331
+ button_select_input_files.clicked.connect(dialog_select_input_files)
332
+ button_select_input_files.setFixedHeight(35)
333
+ button_select_input_files.setFixedWidth(150)
334
+ button_select_input_files.move(30, 20)
335
+
336
+ input_files_list_text_area = QTextEdit(w)
337
+ input_files_list_text_area.setReadOnly(True)
338
+ input_files_list_text_area.setLineWrapMode(QTextEdit.NoWrap)
339
+ font = input_files_list_text_area.font()
340
+ font.setFamily("Courier")
341
+ font.setPointSize(10)
342
+ input_files_list_text_area.move(30, 60)
343
+ input_files_list_text_area.resize(500, 100)
344
+
345
+ button_select_output_folder = QPushButton(w)
346
+ button_select_output_folder.setText("Output folder")
347
+ button_select_output_folder.setFixedHeight(35)
348
+ button_select_output_folder.setFixedWidth(150)
349
+ button_select_output_folder.clicked.connect(dialog_select_output_folder)
350
+ button_select_output_folder.move(30, 180)
351
+
352
+ output_folder_line_edit = QLineEdit(w)
353
+ output_folder_line_edit.setReadOnly(True)
354
+ font = output_folder_line_edit.font()
355
+ font.setFamily("Courier")
356
+ font.setPointSize(10)
357
+ output_folder_line_edit.move(30, 220)
358
+ output_folder_line_edit.setFixedWidth(500)
359
+ output_folder_line_edit.setText(root['output_folder'])
360
+
361
+ progress_bar = QProgressBar(w)
362
+ # progress_bar.move(30, 310)
363
+ progress_bar.setValue(0)
364
+ progress_bar.setGeometry(30, 310, 500, 35)
365
+ progress_bar.setAlignment(QtCore.Qt.AlignCenter)
366
+ progress_bar.hide()
367
+ root['progress_bar'] = progress_bar
368
+
369
+ button_start = QPushButton('Start separation', w)
370
+ button_start.clicked.connect(w.execute_long_task)
371
+ button_start.setFixedHeight(35)
372
+ button_start.setFixedWidth(150)
373
+ button_start.move(30, 270)
374
+
375
+ button_finish = QPushButton('Stop separation', w)
376
+ button_finish.clicked.connect(w.stop_separation)
377
+ button_finish.setFixedHeight(35)
378
+ button_finish.setFixedWidth(150)
379
+ button_finish.move(200, 270)
380
+ button_finish.setDisabled(True)
381
+
382
+ button_settings = QPushButton('⚙', w)
383
+ button_settings.clicked.connect(w.open_settings)
384
+ button_settings.setFixedHeight(35)
385
+ button_settings.setFixedWidth(35)
386
+ button_settings.move(495, 270)
387
+ button_settings.setDisabled(False)
388
+
389
+ mvsep_link = QLabel(w)
390
+ mvsep_link.setOpenExternalLinks(True)
391
+ font = mvsep_link.font()
392
+ font.setFamily("Courier")
393
+ font.setPointSize(10)
394
+ mvsep_link.move(415, 30)
395
+ mvsep_link.setText('Powered by <a href="https://mvsep.com">MVSep.com</a>')
396
+
397
+ root['w'] = w
398
+ root['input_files_list_text_area'] = input_files_list_text_area
399
+ root['output_folder_line_edit'] = output_folder_line_edit
400
+ root['button_start'] = button_start
401
+ root['button_finish'] = button_finish
402
+ root['button_settings'] = button_settings
403
+
404
+ # w.showMaximized()
405
+ w.show()
406
+ sys.exit(app.exec_())
407
+
408
+
409
+ if __name__ == '__main__':
410
+ print('Version: {}'.format(__VERSION__))
411
+ create_dialog()
MVSEP-MDX23-music-separation-model/images/MVSep-Window.png ADDED
MVSEP-MDX23-music-separation-model/inference.py ADDED
@@ -0,0 +1,914 @@
1
+ # coding: utf-8
2
+ __author__ = 'https://github.com/ZFTurbo/'
3
+
4
+ if __name__ == '__main__':
5
+ import os
6
+
7
+ gpu_use = "0"
8
+ print('GPU use: {}'.format(gpu_use))
9
+ os.environ["CUDA_VISIBLE_DEVICES"] = "{}".format(gpu_use)
10
+
11
+
12
+ import numpy as np
13
+ import torch
14
+ import torch.nn as nn
15
+ import os
16
+ import argparse
17
+ import soundfile as sf
18
+
19
+ from demucs.states import load_model
20
+ from demucs import pretrained
21
+ from demucs.apply import apply_model
22
+ import onnxruntime as ort
23
+ from time import time
24
+ import librosa
25
+ import hashlib
26
+
27
+
28
+ __VERSION__ = '1.0.1'
29
+
30
+
31
+ class Conv_TDF_net_trim_model(nn.Module):
32
+ def __init__(self, device, target_name, L, n_fft, hop=1024):
33
+
34
+ super(Conv_TDF_net_trim_model, self).__init__()
35
+
36
+ self.dim_c = 4
37
+ self.dim_f, self.dim_t = 3072, 256
38
+ self.n_fft = n_fft
39
+ self.hop = hop
40
+ self.n_bins = self.n_fft // 2 + 1
41
+ self.chunk_size = hop * (self.dim_t - 1)
42
+ self.window = torch.hann_window(window_length=self.n_fft, periodic=True).to(device)
43
+ self.target_name = target_name
44
+
45
+ out_c = self.dim_c * 4 if target_name == '*' else self.dim_c
46
+ self.freq_pad = torch.zeros([1, out_c, self.n_bins - self.dim_f, self.dim_t]).to(device)
47
+
48
+ self.n = L // 2
49
+
50
+ def stft(self, x):
51
+ x = x.reshape([-1, self.chunk_size])
52
+ x = torch.stft(x, n_fft=self.n_fft, hop_length=self.hop, window=self.window, center=True, return_complex=True)
53
+ x = torch.view_as_real(x)
54
+ x = x.permute([0, 3, 1, 2])
55
+ x = x.reshape([-1, 2, 2, self.n_bins, self.dim_t]).reshape([-1, self.dim_c, self.n_bins, self.dim_t])
56
+ return x[:, :, :self.dim_f]
57
+
58
+ def istft(self, x, freq_pad=None):
59
+ freq_pad = self.freq_pad.repeat([x.shape[0], 1, 1, 1]) if freq_pad is None else freq_pad
60
+ x = torch.cat([x, freq_pad], -2)
61
+ x = x.reshape([-1, 2, 2, self.n_bins, self.dim_t]).reshape([-1, 2, self.n_bins, self.dim_t])
62
+ x = x.permute([0, 2, 3, 1])
63
+ x = x.contiguous()
64
+ x = torch.view_as_complex(x)
65
+ x = torch.istft(x, n_fft=self.n_fft, hop_length=self.hop, window=self.window, center=True)
66
+ return x.reshape([-1, 2, self.chunk_size])
67
+
68
+ def forward(self, x):
69
+ x = self.first_conv(x)
70
+ x = x.transpose(-1, -2)
71
+
72
+ ds_outputs = []
73
+ for i in range(self.n):
74
+ x = self.ds_dense[i](x)
75
+ ds_outputs.append(x)
76
+ x = self.ds[i](x)
77
+
78
+ x = self.mid_dense(x)
79
+ for i in range(self.n):
80
+ x = self.us[i](x)
81
+ x *= ds_outputs[-i - 1]
82
+ x = self.us_dense[i](x)
83
+
84
+ x = x.transpose(-1, -2)
85
+ x = self.final_conv(x)
86
+ return x
87
+
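A sanity check of the framing constants used above (editorial sketch): `chunk_size = hop * (dim_t - 1)` is chosen so that a centered STFT of one chunk yields exactly `dim_t` frames and `n_fft // 2 + 1` frequency bins.

import torch
hop, n_fft, dim_t = 1024, 7680, 256
chunk = hop * (dim_t - 1)
spec = torch.stft(torch.randn(2, chunk), n_fft=n_fft, hop_length=hop,
                  window=torch.hann_window(n_fft), center=True, return_complex=True)
print(spec.shape)   # torch.Size([2, 3841, 256]) == (2, n_fft // 2 + 1, dim_t)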
88
+
89
+ def get_models(name, device, load=True, vocals_model_type=0):
90
+ if vocals_model_type == 2:
91
+ model_vocals = Conv_TDF_net_trim_model(
92
+ device=device,
93
+ target_name='vocals',
94
+ L=11,
95
+ n_fft=7680
96
+ )
97
+ elif vocals_model_type == 3:
98
+ model_vocals = Conv_TDF_net_trim_model(
99
+ device=device,
100
+ target_name='vocals',
101
+ L=11,
102
+ n_fft=6144
103
+ )
104
+
105
+ return [model_vocals]
106
+
107
+
108
+ def demix_base(mix, device, models, infer_session):
109
+ start_time = time()
110
+ sources = []
111
+ n_sample = mix.shape[1]
112
+ for model in models:
113
+ trim = model.n_fft // 2
114
+ gen_size = model.chunk_size - 2 * trim
115
+ pad = gen_size - n_sample % gen_size
116
+ mix_p = np.concatenate(
117
+ (
118
+ np.zeros((2, trim)),
119
+ mix,
120
+ np.zeros((2, pad)),
121
+ np.zeros((2, trim))
122
+ ), 1
123
+ )
124
+
125
+ mix_waves = []
126
+ i = 0
127
+ while i < n_sample + pad:
128
+ waves = np.array(mix_p[:, i:i + model.chunk_size])
129
+ mix_waves.append(waves)
130
+ i += gen_size
131
+ mix_waves = np.array(mix_waves) # Convert the list to a single numpy.ndarray
132
+ mix_waves = torch.tensor(mix_waves, dtype=torch.float32).to(device)
133
+
134
+ with torch.no_grad():
135
+ _ort = infer_session
136
+ stft_res = model.stft(mix_waves)
137
+ res = _ort.run(None, {'input': stft_res.cpu().numpy()})[0]
138
+ ten = torch.tensor(res).to(device) # Move result tensor to device
139
+ tar_waves = model.istft(ten) # This operation is performed on the GPU
140
+ tar_waves = tar_waves.cpu() # Move the result back to CPU only after all computations
141
+ tar_signal = tar_waves[:, :, trim:-trim].transpose(0, 1).reshape(2, -1).numpy()[:, :-pad]
142
+
143
+ sources.append(tar_signal)
144
+ # print('Time demix base: {:.2f} sec'.format(time() - start_time))
145
+ return np.array(sources)
146
+
147
+
148
+ def demix_full(mix, device, chunk_size, models, infer_session, overlap=0.75):
149
+ start_time = time()
150
+
151
+ step = int(chunk_size * (1 - overlap))
152
+ # print('Initial shape: {} Chunk size: {} Step: {} Device: {}'.format(mix.shape, chunk_size, step, device))
153
+ result = np.zeros((1, 2, mix.shape[-1]), dtype=np.float32)
154
+ divider = np.zeros((1, 2, mix.shape[-1]), dtype=np.float32)
155
+
156
+ total = 0
157
+ for i in range(0, mix.shape[-1], step):
158
+ total += 1
159
+
160
+ start = i
161
+ end = min(i + chunk_size, mix.shape[-1])
162
+ # print('Chunk: {} Start: {} End: {}'.format(total, start, end))
163
+ mix_part = mix[:, start:end]
164
+ sources = demix_base(mix_part, device, models, infer_session)
165
+ # print(sources.shape)
166
+ result[..., start:end] += sources
167
+ divider[..., start:end] += 1
168
+ sources = result / divider
169
+ # print('Final shape: {} Overall time: {:.2f}'.format(sources.shape, time() - start_time))
170
+ return sources
171
+
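The loop above is a standard overlap-add average: chunks start every `chunk_size * (1 - overlap)` samples, each chunk's output is accumulated in `result`, the per-sample hit count in `divider`, and their ratio is the final estimate. A toy check with a passthrough "model" (editorial sketch) reconstructs the input exactly:

import numpy as np

mix = np.random.randn(2, 1000).astype(np.float32)
chunk_size, overlap = 300, 0.75
step = int(chunk_size * (1 - overlap))
result = np.zeros_like(mix)
divider = np.zeros_like(mix)
for start in range(0, mix.shape[-1], step):
    end = min(start + chunk_size, mix.shape[-1])
    result[..., start:end] += mix[..., start:end]   # identity "model" for illustration
    divider[..., start:end] += 1
assert np.allclose(result / divider, mix)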
172
+
173
+ class EnsembleDemucsMDXMusicSeparationModel:
174
+ def __init__(self, options):
175
+ """
176
+ options - user options
177
+ """
178
+ # print(options)
179
+
180
+ if torch.cuda.is_available():
181
+ device = 'cuda:0'
182
+ else:
183
+ device = 'cpu'
184
+ if 'cpu' in options:
185
+ if options['cpu']:
186
+ device = 'cpu'
187
+ print('Use device: {}'.format(device))
188
+ self.single_onnx = False
189
+ if 'single_onnx' in options:
190
+ if options['single_onnx']:
191
+ self.single_onnx = True
192
+ print('Use single vocal ONNX')
193
+
194
+ self.kim_model_1 = False
195
+ if 'use_kim_model_1' in options:
196
+ if options['use_kim_model_1']:
197
+ self.kim_model_1 = True
198
+ if self.kim_model_1:
199
+ print('Use Kim model 1')
200
+ else:
201
+ print('Use Kim model 2')
202
+
203
+ self.overlap_large = float(options['overlap_large'])
204
+ self.overlap_small = float(options['overlap_small'])
205
+ if self.overlap_large > 0.99:
206
+ self.overlap_large = 0.99
207
+ if self.overlap_large < 0.0:
208
+ self.overlap_large = 0.0
209
+ if self.overlap_small > 0.99:
210
+ self.overlap_small = 0.99
211
+ if self.overlap_small < 0.0:
212
+ self.overlap_small = 0.0
213
+
214
+ model_folder = os.path.dirname(os.path.realpath(__file__)) + '/models/'
215
+ remote_url = 'https://dl.fbaipublicfiles.com/demucs/hybrid_transformer/04573f0d-f3cf25b2.th'
216
+ model_path = model_folder + '04573f0d-f3cf25b2.th'
217
+ if not os.path.isfile(model_path):
218
+ torch.hub.download_url_to_file(remote_url, model_folder + '04573f0d-f3cf25b2.th')
219
+ model_vocals = load_model(model_path)
220
+ model_vocals.to(device)
221
+ self.model_vocals_only = model_vocals
222
+
223
+ self.models = []
224
+ self.weights_vocals = np.array([10, 1, 8, 9])
225
+ self.weights_bass = np.array([19, 4, 5, 8])
226
+ self.weights_drums = np.array([18, 2, 4, 9])
227
+ self.weights_other = np.array([14, 2, 5, 10])
228
+
229
+ model1 = pretrained.get_model('htdemucs_ft')
230
+ model1.to(device)
231
+ self.models.append(model1)
232
+
233
+ model2 = pretrained.get_model('htdemucs')
234
+ model2.to(device)
235
+ self.models.append(model2)
236
+
237
+ model3 = pretrained.get_model('htdemucs_6s')
238
+ model3.to(device)
239
+ self.models.append(model3)
240
+
241
+ model4 = pretrained.get_model('hdemucs_mmi')
242
+ model4.to(device)
243
+ self.models.append(model4)
244
+
245
+ if 0:
246
+ for model in self.models:
247
+ print(model.sources)
248
+ '''
249
+ ['drums', 'bass', 'other', 'vocals']
250
+ ['drums', 'bass', 'other', 'vocals']
251
+ ['drums', 'bass', 'other', 'vocals', 'guitar', 'piano']
252
+ ['drums', 'bass', 'other', 'vocals']
253
+ '''
254
+
255
+ if device == 'cpu':
256
+ chunk_size = 200000000
257
+ providers = ["CPUExecutionProvider"]
258
+ else:
259
+ chunk_size = 1000000
260
+ providers = ["CUDAExecutionProvider"]
261
+ if 'chunk_size' in options:
262
+ chunk_size = int(options['chunk_size'])
263
+
264
+ # MDX-B model 1 initialization
265
+ self.chunk_size = chunk_size
266
+ self.mdx_models1 = get_models('tdf_extra', load=False, device=device, vocals_model_type=2)
267
+ if self.kim_model_1:
268
+ model_path_onnx1 = model_folder + 'Kim_Vocal_1.onnx'
269
+ remote_url_onnx1 = 'https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/Kim_Vocal_1.onnx'
270
+ else:
271
+ model_path_onnx1 = model_folder + 'Kim_Vocal_2.onnx'
272
+ remote_url_onnx1 = 'https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/Kim_Vocal_2.onnx'
273
+ if not os.path.isfile(model_path_onnx1):
274
+ torch.hub.download_url_to_file(remote_url_onnx1, model_path_onnx1)
275
+ print('Model path: {}'.format(model_path_onnx1))
276
+ print('Device: {} Chunk size: {}'.format(device, chunk_size))
277
+ self.infer_session1 = ort.InferenceSession(
278
+ model_path_onnx1,
279
+ providers=providers,
280
+ provider_options=[{"device_id": 0}],
281
+ )
282
+
283
+ if self.single_onnx is False:
284
+ # MDX-B model 2 initialization
285
+ self.chunk_size = chunk_size
286
+ self.mdx_models2 = get_models('tdf_extra', load=False, device=device, vocals_model_type=2)
287
+ root_path = os.path.dirname(os.path.realpath(__file__)) + '/'
288
+ model_path_onnx2 = model_folder + 'Kim_Inst.onnx'
289
+ remote_url_onnx2 = 'https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/Kim_Inst.onnx'
290
+ if not os.path.isfile(model_path_onnx2):
291
+ torch.hub.download_url_to_file(remote_url_onnx2, model_path_onnx2)
292
+ print('Model path: {}'.format(model_path_onnx2))
293
+ print('Device: {} Chunk size: {}'.format(device, chunk_size))
294
+ self.infer_session2 = ort.InferenceSession(
295
+ model_path_onnx2,
296
+ providers=providers,
297
+ provider_options=[{"device_id": 0}],
298
+ )
299
+
300
+ self.device = device
301
+ pass
302
+
303
+ @property
304
+ def instruments(self):
305
+ """ DO NOT CHANGE """
306
+ return ['bass', 'drums', 'other', 'vocals']
307
+
308
+ def raise_aicrowd_error(self, msg):
309
+ """ Will be used by the evaluator to provide logs, DO NOT CHANGE """
310
+ raise NameError(msg)
311
+
312
+ def separate_music_file(
313
+ self,
314
+ mixed_sound_array,
315
+ sample_rate,
316
+ update_percent_func=None,
317
+ current_file_number=0,
318
+ total_files=0,
319
+ only_vocals=False,
320
+ ):
321
+ """
322
+ Implements the sound separation for a single sound file
323
+ Inputs: Outputs from soundfile.read('mixture.wav')
324
+ mixed_sound_array
325
+ sample_rate
326
+
327
+ Outputs:
328
+ separated_music_arrays: Dictionary numpy array of each separated instrument
329
+ output_sample_rates: Dictionary of sample rates, one per separated stem
330
+ """
331
+
332
+ # print('Update percent func: {}'.format(update_percent_func))
333
+
334
+ separated_music_arrays = {}
335
+ output_sample_rates = {}
336
+
337
+ audio = np.expand_dims(mixed_sound_array.T, axis=0)
338
+ audio = torch.from_numpy(audio).type('torch.FloatTensor').to(self.device)
339
+
340
+ overlap_large = self.overlap_large
341
+ overlap_small = self.overlap_small
342
+
343
+ # Get Demucs vocals estimate (polarity-inversion TTA: average of model(x) and -model(-x))
344
+ model = self.model_vocals_only
345
+ shifts = 1
346
+ overlap = overlap_large
347
+ vocals_demucs = 0.5 * apply_model(model, audio, shifts=shifts, overlap=overlap)[0][3].cpu().numpy()
348
+
349
+ if update_percent_func is not None:
350
+ val = 100 * (current_file_number + 0.10) / total_files
351
+ update_percent_func(int(val))
352
+
353
+ vocals_demucs += 0.5 * -apply_model(model, -audio, shifts=shifts, overlap=overlap)[0][3].cpu().numpy()
354
+
355
+ if update_percent_func is not None:
356
+ val = 100 * (current_file_number + 0.20) / total_files
357
+ update_percent_func(int(val))
358
+
359
+ overlap = overlap_large
360
+ sources1 = demix_full(
361
+ mixed_sound_array.T,
362
+ self.device,
363
+ self.chunk_size,
364
+ self.mdx_models1,
365
+ self.infer_session1,
366
+ overlap=overlap
367
+ )[0]
368
+
369
+ vocals_mdxb1 = sources1
370
+
371
+ if update_percent_func is not None:
372
+ val = 100 * (current_file_number + 0.30) / total_files
373
+ update_percent_func(int(val))
374
+
375
+ if self.single_onnx is False:
376
+ sources2 = -demix_full(
377
+ -mixed_sound_array.T,
378
+ self.device,
379
+ self.chunk_size,
380
+ self.mdx_models2,
381
+ self.infer_session2,
382
+ overlap=overlap
383
+ )[0]
384
+
385
+ # this model predicts the instrumental, so subtract it from the mix to recover vocals
386
+ instrum_mdxb2 = sources2
387
+ vocals_mdxb2 = mixed_sound_array.T - instrum_mdxb2
388
+
389
+ if update_percent_func is not None:
390
+ val = 100 * (current_file_number + 0.40) / total_files
391
+ update_percent_func(int(val))
392
+
393
+ # Ensemble vocals for MDX and Demucs
394
+ if self.single_onnx is False:
395
+ weights = np.array([12, 8, 3])
396
+ vocals = (weights[0] * vocals_mdxb1.T + weights[1] * vocals_mdxb2.T + weights[2] * vocals_demucs.T) / weights.sum()
397
+ else:
398
+ weights = np.array([6, 1])
399
+ vocals = (weights[0] * vocals_mdxb1.T + weights[1] * vocals_demucs.T) / weights.sum()
400
+
401
+ # vocals
402
+ separated_music_arrays['vocals'] = vocals
403
+ output_sample_rates['vocals'] = sample_rate
404
+
405
+ if not only_vocals:
406
+ # Generate instrumental
407
+ instrum = mixed_sound_array - vocals
408
+
409
+ audio = np.expand_dims(instrum.T, axis=0)
410
+ audio = torch.from_numpy(audio).type('torch.FloatTensor').to(self.device)
411
+
412
+ all_outs = []
413
+ for i, model in enumerate(self.models):
414
+ if i == 0:
415
+ overlap = overlap_small
416
+ elif i > 0:
417
+ overlap = overlap_large
418
+ out = 0.5 * apply_model(model, audio, shifts=shifts, overlap=overlap)[0].cpu().numpy() \
419
+ + 0.5 * -apply_model(model, -audio, shifts=shifts, overlap=overlap)[0].cpu().numpy()
420
+
421
+ if update_percent_func is not None:
422
+ val = 100 * (current_file_number + 0.50 + i * 0.10) / total_files
423
+ update_percent_func(int(val))
424
+
425
+ if i == 2:
426
+ # ['drums', 'bass', 'other', 'vocals', 'guitar', 'piano']
427
+ out[2] = out[2] + out[4] + out[5]
428
+ out = out[:4]
429
+
430
+ out[0] = self.weights_drums[i] * out[0]
431
+ out[1] = self.weights_bass[i] * out[1]
432
+ out[2] = self.weights_other[i] * out[2]
433
+ out[3] = self.weights_vocals[i] * out[3]
434
+
435
+ all_outs.append(out)
436
+ out = np.array(all_outs).sum(axis=0)
437
+ out[0] = out[0] / self.weights_drums.sum()
438
+ out[1] = out[1] / self.weights_bass.sum()
439
+ out[2] = out[2] / self.weights_other.sum()
440
+ out[3] = out[3] / self.weights_vocals.sum()
441
+
442
+ # other
443
+ res = mixed_sound_array - vocals - out[0].T - out[1].T
444
+ res = np.clip(res, -1, 1)
445
+ separated_music_arrays['other'] = (2 * res + out[2].T) / 3.0
446
+ output_sample_rates['other'] = sample_rate
447
+
448
+ # drums
449
+ res = mixed_sound_array - vocals - out[1].T - out[2].T
450
+ res = np.clip(res, -1, 1)
451
+ separated_music_arrays['drums'] = (res + 2 * out[0].T.copy()) / 3.0
452
+ output_sample_rates['drums'] = sample_rate
453
+
454
+ # bass
455
+ res = mixed_sound_array - vocals - out[0].T - out[2].T
456
+ res = np.clip(res, -1, 1)
457
+ separated_music_arrays['bass'] = (res + 2 * out[1].T) / 3.0
458
+ output_sample_rates['bass'] = sample_rate
459
+
460
+ bass = separated_music_arrays['bass']
461
+ drums = separated_music_arrays['drums']
462
+ other = separated_music_arrays['other']
463
+
464
+ separated_music_arrays['other'] = mixed_sound_array - vocals - bass - drums
465
+ separated_music_arrays['drums'] = mixed_sound_array - vocals - bass - other
466
+ separated_music_arrays['bass'] = mixed_sound_array - vocals - drums - other
467
+
468
+ if update_percent_func is not None:
469
+ val = 100 * (current_file_number + 0.95) / total_files
470
+ update_percent_func(int(val))
471
+
472
+ return separated_music_arrays, output_sample_rates
473
+
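The vocal ensembling inside `separate_music_file` above is a plain weighted average of the three estimates (MDX Kim vocal model, inverted Kim instrumental model, and Demucs) with weights 12 / 8 / 3. Editorial numeric sketch:

import numpy as np

weights = np.array([12, 8, 3])
vocals_mdxb1, vocals_mdxb2, vocals_demucs = np.array([0.2]), np.array([0.1]), np.array([0.4])
vocals = (weights[0] * vocals_mdxb1 + weights[1] * vocals_mdxb2 + weights[2] * vocals_demucs) / weights.sum()
print(vocals)   # [0.19130435] == (12*0.2 + 8*0.1 + 3*0.4) / 23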
474
+
475
+ class EnsembleDemucsMDXMusicSeparationModelLowGPU:
476
+ def __init__(self, options):
477
+ """
478
+ options - user options
479
+ """
480
+ # print(options)
481
+
482
+ if torch.cuda.is_available():
483
+ device = 'cuda:0'
484
+ else:
485
+ device = 'cpu'
486
+ if 'cpu' in options:
487
+ if options['cpu']:
488
+ device = 'cpu'
489
+ print('Use device: {}'.format(device))
490
+ self.single_onnx = False
491
+ if 'single_onnx' in options:
492
+ if options['single_onnx']:
493
+ self.single_onnx = True
494
+ print('Use single vocal ONNX')
495
+
496
+ self.kim_model_1 = False
497
+ if 'use_kim_model_1' in options:
498
+ if options['use_kim_model_1']:
499
+ self.kim_model_1 = True
500
+ if self.kim_model_1:
501
+ print('Use Kim model 1')
502
+ else:
503
+ print('Use Kim model 2')
504
+
505
+ self.overlap_large = float(options['overlap_large'])
506
+ self.overlap_small = float(options['overlap_small'])
507
+ if self.overlap_large > 0.99:
508
+ self.overlap_large = 0.99
509
+ if self.overlap_large < 0.0:
510
+ self.overlap_large = 0.0
511
+ if self.overlap_small > 0.99:
512
+ self.overlap_small = 0.99
513
+ if self.overlap_small < 0.0:
514
+ self.overlap_small = 0.0
515
+
516
+ self.weights_vocals = np.array([10, 1, 8, 9])
517
+ self.weights_bass = np.array([19, 4, 5, 8])
518
+ self.weights_drums = np.array([18, 2, 4, 9])
519
+ self.weights_other = np.array([14, 2, 5, 10])
520
+
521
+ if device == 'cpu':
522
+ chunk_size = 200000000
523
+ self.providers = ["CPUExecutionProvider"]
524
+ else:
525
+ chunk_size = 1000000
526
+ self.providers = ["CUDAExecutionProvider"]
527
+ if 'chunk_size' in options:
528
+ chunk_size = int(options['chunk_size'])
529
+ self.chunk_size = chunk_size
530
+ self.device = device
531
+ pass
532
+
533
+ @property
534
+ def instruments(self):
535
+ """ DO NOT CHANGE """
536
+ return ['bass', 'drums', 'other', 'vocals']
537
+
538
+ def raise_aicrowd_error(self, msg):
539
+ """ Will be used by the evaluator to provide logs, DO NOT CHANGE """
540
+ raise NameError(msg)
541
+
542
+ def separate_music_file(
543
+ self,
544
+ mixed_sound_array,
545
+ sample_rate,
546
+ update_percent_func=None,
547
+ current_file_number=0,
548
+ total_files=0,
549
+ only_vocals=False
550
+ ):
551
+ """
552
+ Implements the sound separation for a single sound file
553
+ Inputs: Outputs from soundfile.read('mixture.wav')
554
+ mixed_sound_array
555
+ sample_rate
556
+
557
+ Outputs:
558
+ separated_music_arrays: Dictionary numpy array of each separated instrument
559
+ output_sample_rates: Dictionary of sample rates, one per separated stem
560
+ """
561
+
562
+ # print('Update percent func: {}'.format(update_percent_func))
563
+
564
+ separated_music_arrays = {}
565
+ output_sample_rates = {}
566
+
567
+ audio = np.expand_dims(mixed_sound_array.T, axis=0)
568
+ audio = torch.from_numpy(audio).type('torch.FloatTensor').to(self.device)
569
+
570
+ overlap_large = self.overlap_large
571
+ overlap_small = self.overlap_small
572
+
573
+ # Get Demucs vocals estimate (polarity-inversion TTA)
574
+ model_folder = os.path.dirname(os.path.realpath(__file__)) + '/models/'
575
+ remote_url = 'https://dl.fbaipublicfiles.com/demucs/hybrid_transformer/04573f0d-f3cf25b2.th'
576
+ model_path = model_folder + '04573f0d-f3cf25b2.th'
577
+ if not os.path.isfile(model_path):
578
+ torch.hub.download_url_to_file(remote_url, model_folder + '04573f0d-f3cf25b2.th')
579
+ model_vocals = load_model(model_path)
580
+ model_vocals.to(self.device)
581
+ shifts = 1
582
+ overlap = overlap_large
583
+ vocals_demucs = 0.5 * apply_model(model_vocals, audio, shifts=shifts, overlap=overlap)[0][3].cpu().numpy()
584
+
585
+ if update_percent_func is not None:
586
+ val = 100 * (current_file_number + 0.10) / total_files
587
+ update_percent_func(int(val))
588
+
589
+ vocals_demucs += 0.5 * -apply_model(model_vocals, -audio, shifts=shifts, overlap=overlap)[0][3].cpu().numpy()
590
+ model_vocals = model_vocals.cpu()
591
+ del model_vocals
592
+
593
+ if update_percent_func is not None:
594
+ val = 100 * (current_file_number + 0.20) / total_files
595
+ update_percent_func(int(val))
596
+
597
+ # MDX-B model 1 initialization
598
+ mdx_models1 = get_models('tdf_extra', load=False, device=self.device, vocals_model_type=2)
599
+ if self.kim_model_1:
600
+ model_path_onnx1 = model_folder + 'Kim_Vocal_1.onnx'
601
+ remote_url_onnx1 = 'https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/Kim_Vocal_1.onnx'
602
+ else:
603
+ model_path_onnx1 = model_folder + 'Kim_Vocal_2.onnx'
604
+ remote_url_onnx1 = 'https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/Kim_Vocal_2.onnx'
605
+ if not os.path.isfile(model_path_onnx1):
606
+ torch.hub.download_url_to_file(remote_url_onnx1, model_path_onnx1)
607
+ print('Model path: {}'.format(model_path_onnx1))
608
+ print('Device: {} Chunk size: {}'.format(self.device, self.chunk_size))
609
+ infer_session1 = ort.InferenceSession(
610
+ model_path_onnx1,
611
+ providers=self.providers,
612
+ provider_options=[{"device_id": 0}],
613
+ )
614
+ overlap = overlap_large
615
+ sources1 = demix_full(
616
+ mixed_sound_array.T,
617
+ self.device,
618
+ self.chunk_size,
619
+ mdx_models1,
620
+ infer_session1,
621
+ overlap=overlap
622
+ )[0]
623
+ vocals_mdxb1 = sources1
624
+ del infer_session1
625
+ del mdx_models1
626
+
627
+ if update_percent_func is not None:
628
+ val = 100 * (current_file_number + 0.30) / total_files
629
+ update_percent_func(int(val))
630
+
631
+ if self.single_onnx is False:
632
+ # MDX-B model 2 initialization
633
+ mdx_models2 = get_models('tdf_extra', load=False, device=self.device, vocals_model_type=2)
634
+ root_path = os.path.dirname(os.path.realpath(__file__)) + '/'
635
+ model_path_onnx2 = model_folder + 'Kim_Inst.onnx'
636
+ remote_url_onnx2 = 'https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/Kim_Inst.onnx'
637
+ if not os.path.isfile(model_path_onnx2):
638
+ torch.hub.download_url_to_file(remote_url_onnx2, model_path_onnx2)
639
+ print('Model path: {}'.format(model_path_onnx2))
640
+ print('Device: {} Chunk size: {}'.format(self.device, self.chunk_size))
641
+ infer_session2 = ort.InferenceSession(
642
+ model_path_onnx2,
643
+ providers=self.providers,
644
+ provider_options=[{"device_id": 0}],
645
+ )
646
+
647
+ overlap = overlap_large
648
+ sources2 = -demix_full(
649
+ -mixed_sound_array.T,
650
+ self.device,
651
+ self.chunk_size,
652
+ mdx_models2,
653
+ infer_session2,
654
+ overlap=overlap
655
+ )[0]
656
+
657
+ # this model predicts the instrumental, so subtract it from the mix to recover vocals
658
+ instrum_mdxb2 = sources2
659
+ vocals_mdxb2 = mixed_sound_array.T - instrum_mdxb2
660
+ del infer_session2
661
+ del mdx_models2
662
+
663
+ if update_percent_func is not None:
664
+ val = 100 * (current_file_number + 0.40) / total_files
665
+ update_percent_func(int(val))
666
+
667
+ # Ensemble vocals for MDX and Demucs
668
+ if self.single_onnx is False:
669
+ weights = np.array([12, 8, 3])
670
+ vocals = (weights[0] * vocals_mdxb1.T + weights[1] * vocals_mdxb2.T + weights[2] * vocals_demucs.T) / weights.sum()
671
+ else:
672
+ weights = np.array([6, 1])
673
+ vocals = (weights[0] * vocals_mdxb1.T + weights[1] * vocals_demucs.T) / weights.sum()
674
+
675
+ # Generate instrumental
676
+ instrum = mixed_sound_array - vocals
677
+
678
+ audio = np.expand_dims(instrum.T, axis=0)
679
+ audio = torch.from_numpy(audio).type('torch.FloatTensor').to(self.device)
680
+
681
+ all_outs = []
682
+
683
+ i = 0
684
+ overlap = overlap_small
685
+ model = pretrained.get_model('htdemucs_ft')
686
+ model.to(self.device)
687
+ out = 0.5 * apply_model(model, audio, shifts=shifts, overlap=overlap)[0].cpu().numpy() \
688
+ + 0.5 * -apply_model(model, -audio, shifts=shifts, overlap=overlap)[0].cpu().numpy()
689
+
690
+ if update_percent_func is not None:
691
+ val = 100 * (current_file_number + 0.50 + i * 0.10) / total_files
692
+ update_percent_func(int(val))
693
+
694
+ out[0] = self.weights_drums[i] * out[0]
695
+ out[1] = self.weights_bass[i] * out[1]
696
+ out[2] = self.weights_other[i] * out[2]
697
+ out[3] = self.weights_vocals[i] * out[3]
698
+ all_outs.append(out)
699
+ model = model.cpu()
700
+ del model
701
+
702
+ i = 1
703
+ overlap = overlap_large
704
+ model = pretrained.get_model('htdemucs')
705
+ model.to(self.device)
706
+ out = 0.5 * apply_model(model, audio, shifts=shifts, overlap=overlap)[0].cpu().numpy() \
707
+ + 0.5 * -apply_model(model, -audio, shifts=shifts, overlap=overlap)[0].cpu().numpy()
708
+
709
+ if update_percent_func is not None:
710
+ val = 100 * (current_file_number + 0.50 + i * 0.10) / total_files
711
+ update_percent_func(int(val))
712
+
713
+ out[0] = self.weights_drums[i] * out[0]
714
+ out[1] = self.weights_bass[i] * out[1]
715
+ out[2] = self.weights_other[i] * out[2]
716
+ out[3] = self.weights_vocals[i] * out[3]
717
+ all_outs.append(out)
718
+ model = model.cpu()
719
+ del model
720
+
721
+ i = 2
722
+ overlap = overlap_large
723
+ model = pretrained.get_model('htdemucs_6s')
724
+ model.to(self.device)
725
+ out = 0.5 * apply_model(model, audio, shifts=shifts, overlap=overlap)[0].cpu().numpy() \
726
+ + 0.5 * -apply_model(model, -audio, shifts=shifts, overlap=overlap)[0].cpu().numpy()
727
+
728
+ if update_percent_func is not None:
729
+ val = 100 * (current_file_number + 0.50 + i * 0.10) / total_files
730
+ update_percent_func(int(val))
731
+
732
+ # 6-stem model: fold guitar and piano back into 'other' before weighting
733
+ out[2] = out[2] + out[4] + out[5]
734
+ out = out[:4]
735
+ out[0] = self.weights_drums[i] * out[0]
736
+ out[1] = self.weights_bass[i] * out[1]
737
+ out[2] = self.weights_other[i] * out[2]
738
+ out[3] = self.weights_vocals[i] * out[3]
739
+ all_outs.append(out)
740
+ model = model.cpu()
741
+ del model
742
+
743
+ i = 3
744
+ model = pretrained.get_model('hdemucs_mmi')
745
+ model.to(self.device)
746
+ out = 0.5 * apply_model(model, audio, shifts=shifts, overlap=overlap)[0].cpu().numpy() \
747
+ + 0.5 * -apply_model(model, -audio, shifts=shifts, overlap=overlap)[0].cpu().numpy()
748
+
749
+ if update_percent_func is not None:
750
+ val = 100 * (current_file_number + 0.50 + i * 0.10) / total_files
751
+ update_percent_func(int(val))
752
+
753
+ out[0] = self.weights_drums[i] * out[0]
754
+ out[1] = self.weights_bass[i] * out[1]
755
+ out[2] = self.weights_other[i] * out[2]
756
+ out[3] = self.weights_vocals[i] * out[3]
757
+ all_outs.append(out)
758
+ model = model.cpu()
759
+ del model
760
+
761
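+ # Combine the four Demucs variants: per-stem weighted sum, then normalize by each stem's total weight.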
+ out = np.array(all_outs).sum(axis=0)
762
+ out[0] = out[0] / self.weights_drums.sum()
763
+ out[1] = out[1] / self.weights_bass.sum()
764
+ out[2] = out[2] / self.weights_other.sum()
765
+ out[3] = out[3] / self.weights_vocals.sum()
766
+
767
+ # vocals
768
+ separated_music_arrays['vocals'] = vocals
769
+ output_sample_rates['vocals'] = sample_rate
770
+
771
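+ # Blend each Demucs stem with the residual of the mixture (mixture minus vocals minus the two other stems) to recover energy the direct estimates may have missed.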
+ # other
772
+ res = mixed_sound_array - vocals - out[0].T - out[1].T
773
+ res = np.clip(res, -1, 1)
774
+ separated_music_arrays['other'] = (2 * res + out[2].T) / 3.0
775
+ output_sample_rates['other'] = sample_rate
776
+
777
+ # drums
778
+ res = mixed_sound_array - vocals - out[1].T - out[2].T
779
+ res = np.clip(res, -1, 1)
780
+ separated_music_arrays['drums'] = (res + 2 * out[0].T.copy()) / 3.0
781
+ output_sample_rates['drums'] = sample_rate
782
+
783
+ # bass
784
+ res = mixed_sound_array - vocals - out[0].T - out[2].T
785
+ res = np.clip(res, -1, 1)
786
+ separated_music_arrays['bass'] = (res + 2 * out[1].T) / 3.0
787
+ output_sample_rates['bass'] = sample_rate
788
+
789
+ bass = separated_music_arrays['bass']
790
+ drums = separated_music_arrays['drums']
791
+ other = separated_music_arrays['other']
792
+
793
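+ # Final pass: re-derive each stem from the mixture by subtracting the vocals and the other two blended stems.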
+ separated_music_arrays['other'] = mixed_sound_array - vocals - bass - drums
794
+ separated_music_arrays['drums'] = mixed_sound_array - vocals - bass - other
795
+ separated_music_arrays['bass'] = mixed_sound_array - vocals - drums - other
796
+
797
+ if update_percent_func is not None:
798
+ val = 100 * (current_file_number + 0.95) / total_files
799
+ update_percent_func(int(val))
800
+
801
+ return separated_music_arrays, output_sample_rates
802
+
803
+
804
+ def predict_with_model(options):
805
+ for input_audio in options['input_audio']:
806
+ if not os.path.isfile(input_audio):
807
+ print('Error. No such file: {}. Please check the path!'.format(input_audio))
808
+ return
809
+ output_folder = options['output_folder']
810
+ if not os.path.isdir(output_folder):
811
+ os.makedirs(output_folder, exist_ok=True)  # also creates missing parent directories
812
+
813
+ only_vocals = False
814
+ if 'only_vocals' in options:
815
+ if options['only_vocals'] is True:
816
+ print('Generating only vocals and instrumental')
817
+ only_vocals = True
818
+
819
+ model = None
820
+ if 'large_gpu' in options:
821
+ if options['large_gpu'] is True:
822
+ print('Using the fast, large-GPU-memory version of the code')
823
+ model = EnsembleDemucsMDXMusicSeparationModel(options)
824
+ if model is None:
825
+ print('Using the low-GPU-memory version of the code')
826
+ model = EnsembleDemucsMDXMusicSeparationModelLowGPU(options)
827
+
828
+ update_percent_func = None
829
+ if 'update_percent_func' in options:
830
+ update_percent_func = options['update_percent_func']
831
+
832
+ for i, input_audio in enumerate(options['input_audio']):
833
+ print('Processing: {}'.format(input_audio))
834
+ audio, sr = librosa.load(input_audio, mono=False, sr=44100)
835
+ if len(audio.shape) == 1:
836
+ audio = np.stack([audio, audio], axis=0)
837
+ print("Input audio: {} Sample rate: {}".format(audio.shape, sr))
838
+ result, sample_rates = model.separate_music_file(
839
+ audio.T,
840
+ sr,
841
+ update_percent_func,
842
+ i,
843
+ len(options['input_audio']),
844
+ only_vocals,
845
+ )
846
+ all_instrum = model.instruments
847
+ if only_vocals:
848
+ all_instrum = ['vocals']
849
+ for instrum in all_instrum:
850
+ output_name = os.path.splitext(os.path.basename(input_audio))[0] + '_{}.wav'.format(instrum)
851
+ sf.write(output_folder + '/' + output_name, result[instrum], sample_rates[instrum], subtype='FLOAT')
852
+ print('File created: {}'.format(output_folder + '/' + output_name))
853
+
854
+ # instrumental part 1
855
+ inst = audio.T - result['vocals']
856
+ output_name = os.path.splitext(os.path.basename(input_audio))[0] + '_{}.wav'.format('instrum')
857
+ sf.write(output_folder + '/' + output_name, inst, sr, subtype='FLOAT')
858
+ print('File created: {}'.format(output_folder + '/' + output_name))
859
+
860
+ if not only_vocals:
861
+ # instrumental part 2
862
+ inst2 = result['bass'] + result['drums'] + result['other']
863
+ output_name = os.path.splitext(os.path.basename(input_audio))[0] + '_{}.wav'.format('instrum2')
864
+ sf.write(output_folder + '/' + output_name, inst2, sr, subtype='FLOAT')
865
+ print('File created: {}'.format(output_folder + '/' + output_name))
866
+
867
+ if update_percent_func is not None:
868
+ val = 100
869
+ update_percent_func(int(val))
870
+
871
+
872
+ def md5(fname):
873
+ hash_md5 = hashlib.md5()
874
+ with open(fname, "rb") as f:
875
+ for chunk in iter(lambda: f.read(4096), b""):
876
+ hash_md5.update(chunk)
877
+ return hash_md5.hexdigest()
878
+
879
+
880
+ if __name__ == '__main__':
881
+ start_time = time()
882
+
883
+ print("Version: {}".format(__VERSION__))
884
+ m = argparse.ArgumentParser()
885
+ m.add_argument("--input_audio", "-i", nargs='+', type=str, help="Input audio location. You can provide multiple files at once", required=True)
886
+ m.add_argument("--output_folder", "-r", type=str, help="Output audio folder", required=True)
887
+ m.add_argument("--cpu", action='store_true', help="Choose CPU instead of GPU for processing. Can be very slow.")
888
+ m.add_argument("--overlap_large", "-ol", type=float, help="Overlap of splited audio for light models. Closer to 1.0 - slower", required=False, default=0.6)
889
+ m.add_argument("--overlap_small", "-os", type=float, help="Overlap of splited audio for heavy models. Closer to 1.0 - slower", required=False, default=0.5)
890
+ m.add_argument("--single_onnx", action='store_true', help="Only use single ONNX model for vocals. Can be useful if you have not enough GPU memory.")
891
+ m.add_argument("--chunk_size", "-cz", type=int, help="Chunk size for ONNX models. Set lower to reduce GPU memory consumption. Default: 1000000", required=False, default=1000000)
892
+ m.add_argument("--large_gpu", action='store_true', help="It will store all models on GPU for faster processing of multiple audio files. Requires 11 and more GB of free GPU memory.")
893
+ m.add_argument("--use_kim_model_1", action='store_true', help="Use first version of Kim model (as it was on contest).")
894
+ m.add_argument("--only_vocals", action='store_true', help="Only create vocals and instrumental. Skip bass, drums, other")
895
+
896
+ options = m.parse_args().__dict__
897
+ print("Options: ".format(options))
898
+ for el in options:
899
+ print('{}: {}'.format(el, options[el]))
900
+ predict_with_model(options)
901
+ print('Time: {:.0f} sec'.format(time() - start_time))
902
+ print('Presented by https://mvsep.com')
903
+
904
+
905
+ """
906
+ Example:
907
+ python inference.py
908
+ --input_audio mixture.wav mixture1.wav
909
+ --output_folder ./results/
910
+ --cpu
911
+ --overlap_large 0.25
912
+ --overlap_small 0.25
913
+ --chunk_size 500000
914
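+ 
+ Programmatic use (a minimal sketch; the option keys mirror the CLI flags above):
+ 
+ from inference import predict_with_model
+ options = {
+ 'input_audio': ['mixture.wav'],
+ 'output_folder': './results/',
+ 'cpu': False,
+ 'single_onnx': False,
+ 'overlap_large': 0.6,
+ 'overlap_small': 0.5,
+ 'chunk_size': 1000000,
+ 'large_gpu': True,
+ 'use_kim_model_1': False,
+ 'only_vocals': False,
+ }
+ predict_with_model(options)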
+ """
MVSEP-MDX23-music-separation-model/models/.gitkeep ADDED
@@ -0,0 +1 @@
1
+
MVSEP-MDX23-music-separation-model/requirements.txt ADDED
@@ -0,0 +1,11 @@
1
+ numpy
2
+ soundfile
3
+ scipy
4
+ torch<2.6
5
+ tqdm
6
+ librosa
7
+ demucs
8
+ onnxruntime-gpu
9
+ PyQt5
10
+ gradio==3.27.0
11
+ matplotlib
MVSEP-MDX23-music-separation-model/web-ui.py ADDED
@@ -0,0 +1,172 @@
1
+ # This script is licensed under the "GNU Affero General Public License v3.0"
2
+ # See https://choosealicense.com/licenses/agpl-3.0/ for details.
3
+ import os
4
+ import time
5
+ import numpy as np
6
+ import tempfile
7
+ from scipy.io import wavfile
8
+ import gradio as gr
9
+ from inference import EnsembleDemucsMDXMusicSeparationModel, predict_with_model
10
+ import torch
11
+ import librosa
12
+ import librosa.display
13
+ import matplotlib.pyplot as plt
14
+ import asyncio
15
+
16
+ # prevent the connection from being closed after inference (Windows error)
17
+ if os.name == 'nt':
18
+ # Change event loop policy to SelectorEventLoop instead of the default ProactorEventLoop
19
+ asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
20
+
21
+
22
+ if torch.cuda.is_available():
23
+ print("CUDA is available!")
24
+ else:
25
+ print("CUDA is not available.")
26
+
27
+ def check_file_readiness(filepath):
28
+ # Wait until the file size has been unchanged for 5 consecutive checks (0.5 s apart),
29
+ # which indicates the file has been fully written and is ready
30
+ num_same_size_checks = 0
31
+ last_size = -1
32
+ while num_same_size_checks < 5:
33
+ current_size = os.path.getsize(filepath)
34
+ if current_size == last_size:
35
+ num_same_size_checks += 1
36
+ else:
37
+ num_same_size_checks = 0
38
+ last_size = current_size
39
+ time.sleep(0.5)
40
+ return True
41
+
42
+ def generate_spectrogram(audio_file_path):
43
+ y, sr = librosa.load(audio_file_path)
44
+ plt.figure(figsize=(10, 4))
45
+ S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, fmax=8000)
46
+ librosa.display.specshow(librosa.power_to_db(S, ref=np.max),
47
+ y_axis='mel', fmax=8000, x_axis='time')  # keep fmax consistent with the computed spectrogram
48
+ plt.colorbar(format='%+2.0f dB')
49
+ plt.title('Mel spectrogram')
50
+ plt.tight_layout()
51
+ image_path = tempfile.mktemp('.png')
52
+ plt.savefig(image_path)
53
+ plt.close()
54
+ return image_path
55
+
56
+ def generate_spectrograms(audio_files):
57
+ output_spectrograms = []
58
+ for audio_file in audio_files:
59
+ output_spectrograms.append(generate_spectrogram(audio_file))
60
+ return tuple(output_spectrograms)
61
+
62
+ def separate_music_file_wrapper(input_audio, use_cpu, use_single_onnx, large_overlap, small_overlap, chunk_size, use_large_gpu):
63
+ print(f"type(input_audio): {type(input_audio)}, input_audio: {input_audio[:10]}") # truncate printout
64
+ sample_rate, audio_data = input_audio
65
+ output_file = "input_audio.wav"
66
+ if isinstance(audio_data, np.ndarray):
67
+ audio_data = audio_data.astype(np.int16)
68
+ wavfile.write(output_file, sample_rate, audio_data)
69
+
70
+
71
+ input_files = [output_file]
72
+
73
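+ # these option keys mirror the CLI flags defined in inference.py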
+ options = {
74
+ 'input_audio': input_files,
75
+ 'output_folder': 'results',
76
+ 'cpu': use_cpu,
77
+ 'single_onnx': use_single_onnx,
78
+ 'overlap_large': large_overlap,
79
+ 'overlap_small': small_overlap,
80
+ 'chunk_size': chunk_size,
81
+ 'large_gpu': use_large_gpu,
82
+ }
83
+
84
+ print(f'use_cpu: {use_cpu}, use_large_gpu: {use_large_gpu}')
85
+ predict_with_model(options)
86
+
87
+ # Clear GPU cache once the separation finishes
88
+ if torch.cuda.is_available():
89
+ torch.cuda.empty_cache()
90
+
91
+ output_files = {}
92
+ for f in input_files:
93
+ audio_file_name = os.path.splitext(os.path.basename(f))[0]
94
+ output_files["vocals"] = os.path.join(options['output_folder'], audio_file_name + "_vocals.wav")
95
+ output_files["instrumental"] = os.path.join(options['output_folder'], audio_file_name + "_instrum.wav")
96
+ output_files["instrumental2"] = os.path.join(options['output_folder'], audio_file_name + "_instrum2.wav") # For the second instrumental output
97
+ output_files["bass"] = os.path.join(options['output_folder'], audio_file_name + "_bass.wav")
98
+ output_files["drums"] = os.path.join(options['output_folder'], audio_file_name + "_drums.wav")
99
+ output_files["other"] = os.path.join(options['output_folder'], audio_file_name + "_other.wav")
100
+
101
+ # Check the readiness of the files
102
+ output_files_ready = []
103
+ for k, v in output_files.items():
104
+ if os.path.exists(v) and check_file_readiness(v):
105
+ output_files_ready.append(v)
106
+ else:
107
+ empty_data = np.zeros((44100, 2)) # 2 channels, 1 second of silence at 44100Hz
108
+ empty_file = tempfile.mktemp('.wav')
109
+ wavfile.write(empty_file, 44100, empty_data.astype(np.int16))  # cast to int16 to write a standard PCM WAV
110
+ output_files_ready.append(empty_file)
111
+
112
+ # Generate spectrograms right after separating the audio
113
+ output_spectrograms = generate_spectrograms(output_files_ready)
114
+
115
+ #print(len(output_files_ready)) # should print 6
116
+ #print(len(output_spectrograms)) # should print 6
117
+ return tuple(output_files_ready) + output_spectrograms
118
+
119
+ separation_description = """
120
+ # MVSEP-MDX23 Web-UI
121
+ Web-UI created by [Ma5onic](https://github.com/Ma5onic)
122
+
123
+ Models created by [KimberleyJensen](https://github.com/KimberleyJensen) & [Alexandre Défossez](https://github.com/adefossez)
124
+
125
+ Separation method created by [ZFTurbo](https://github.com/ZFTurbo) and [MVSep.com](https://mvsep.com/)
126
+ - Further reading: [Benchmarks and leaderboards for sound demixing tasks - arxiv paper](https://arxiv.org/abs/2305.07489)
127
+
128
+ ## Options:
129
+ - **Use CPU Only:** Select this if you do not have enough GPU memory. It will be slower.
130
+ - **Use Single ONNX:** Select this to use a single ONNX model. It will decrease quality but can help with GPU memory usage.
131
+ - **Large Overlap:** The overlap for large chunks. Adjust as needed.
132
+ - **Small Overlap:** The overlap for small chunks. Adjust as needed.
133
+ - **Chunk Size:** The size of chunks to be processed at a time. Reduce this if facing memory issues.
134
+ - **Use Fast Large GPU Version:** Select this for best separation results (requires > 11 GB of GPU memory).
135
+ """
136
+
137
+ theme = gr.themes.Base(
138
+ primary_hue="cyan",
139
+ secondary_hue="cyan",
140
+ )
141
+
142
+ with gr.Blocks(theme=theme, title="MVSEP MDX23 music separation model") as demo:
143
+ gr.Markdown(separation_description)
144
+ input_audio = gr.Audio(label="Upload Audio", interactive=True)
145
+ use_cpu = gr.Checkbox(label="Use CPU Only", value=False)
146
+ use_single_onnx = gr.Checkbox(label="Use Single ONNX", value=False)
147
+ large_overlap = gr.Number(label="Large Overlap", value=0.6)
148
+ small_overlap = gr.Number(label="Small Overlap", value=0.5)
149
+ chunk_size = gr.Number(label="Chunk Size", value=1000000)
150
+ use_large_gpu = gr.Checkbox(label="Large GPU Version (Generates better results but requires more than 11 GB of GPU RAM)", value=True)
151
+ process_button = gr.Button("Process Audio", variant="primary")
152
+
153
+ vocals = gr.Audio(label="Vocals")
154
+ vocals_spectrogram = gr.Image(label="Vocals Spectrogram")
155
+ instrumental = gr.Audio(label="Instrumental")
156
+ instrumental_spectrogram = gr.Image(label="Instrumental Spectrogram")
157
+ instrumental2 = gr.Audio(label="Instrumental 2")
158
+ instrumental2_spectrogram = gr.Image(label="Instrumental 2 Spectrogram")
159
+ bass = gr.Audio(label="Bass")
160
+ bass_spectrogram = gr.Image(label="Bass Spectrogram")
161
+ drums = gr.Audio(label="Drums")
162
+ drums_spectrogram = gr.Image(label="Drums Spectrogram")
163
+ other = gr.Audio(label="Other")
164
+ other_spectrogram = gr.Image(label="Other Spectrogram")
165
+
166
+ process_button.click(
167
+ separate_music_file_wrapper,
168
+ inputs=[input_audio, use_cpu, use_single_onnx, large_overlap, small_overlap, chunk_size, use_large_gpu],
169
+ outputs=[vocals, instrumental, instrumental2, bass, drums, other, vocals_spectrogram, instrumental_spectrogram, instrumental2_spectrogram, bass_spectrogram, drums_spectrogram, other_spectrogram],
170
+ )
171
+
172
+ demo.queue().launch(debug=True, share=False)
Politrees/UVR_resources/.gitattributes ADDED
@@ -0,0 +1,190 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ Demucs_models/04573f0d-f3cf25b2.th filter=lfs diff=lfs merge=lfs -text
37
+ Demucs_models/0d19c1c6-0f06f20e.th filter=lfs diff=lfs merge=lfs -text
38
+ Demucs_models/14fc6a69-a89dd0ee.th filter=lfs diff=lfs merge=lfs -text
39
+ Demucs_models/1ef250f1-592467ce.th filter=lfs diff=lfs merge=lfs -text
40
+ Demucs_models/305bc58f-18378783.th filter=lfs diff=lfs merge=lfs -text
41
+ Demucs_models/42e558d4-196e0e1b.th filter=lfs diff=lfs merge=lfs -text
42
+ Demucs_models/464b36d7-e5a9386e.th filter=lfs diff=lfs merge=lfs -text
43
+ Demucs_models/5c90dfd2-34c22ccb.th filter=lfs diff=lfs merge=lfs -text
44
+ Demucs_models/5d2d6c55-db83574e.th filter=lfs diff=lfs merge=lfs -text
45
+ Demucs_models/6b9c2ca1-3fd82607.th filter=lfs diff=lfs merge=lfs -text
46
+ Demucs_models/75fc33f5-1941ce65.th filter=lfs diff=lfs merge=lfs -text
47
+ Demucs_models/7d865c68-3d5dd56b.th filter=lfs diff=lfs merge=lfs -text
48
+ Demucs_models/7ecf8ec1-70f50cc9.th filter=lfs diff=lfs merge=lfs -text
49
+ Demucs_models/7fd6ef75-a905dd85.th filter=lfs diff=lfs merge=lfs -text
50
+ Demucs_models/83fc094f-4a16d450.th filter=lfs diff=lfs merge=lfs -text
51
+ Demucs_models/902315c2-b39ce9c9.th filter=lfs diff=lfs merge=lfs -text
52
+ Demucs_models/92cfc3b6-ef3bcb9c.th filter=lfs diff=lfs merge=lfs -text
53
+ Demucs_models/955717e8-8726e21a.th filter=lfs diff=lfs merge=lfs -text
54
+ Demucs_models/9a6b4851-03af0aa6.th filter=lfs diff=lfs merge=lfs -text
55
+ Demucs_models/a1d90b5c-ae9d2452.th filter=lfs diff=lfs merge=lfs -text
56
+ Demucs_models/b72baf4e-8778635e.th filter=lfs diff=lfs merge=lfs -text
57
+ Demucs_models/c511e2ab-fe698775.th filter=lfs diff=lfs merge=lfs -text
58
+ Demucs_models/cfa93e08-61801ae1.th filter=lfs diff=lfs merge=lfs -text
59
+ Demucs_models/d12395a8-e57c48e6.th filter=lfs diff=lfs merge=lfs -text
60
+ Demucs_models/demucs_extra-3646af93.th filter=lfs diff=lfs merge=lfs -text
61
+ Demucs_models/demucs_extra.th filter=lfs diff=lfs merge=lfs -text
62
+ Demucs_models/demucs_unittest-09ebc15f.th filter=lfs diff=lfs merge=lfs -text
63
+ Demucs_models/demucs-e07c671f.th filter=lfs diff=lfs merge=lfs -text
64
+ Demucs_models/demucs.th filter=lfs diff=lfs merge=lfs -text
65
+ Demucs_models/demucs48_hq-28a1282c.th filter=lfs diff=lfs merge=lfs -text
66
+ Demucs_models/e51eebcc-c1b80bdd.th filter=lfs diff=lfs merge=lfs -text
67
+ Demucs_models/ebf34a2db.th filter=lfs diff=lfs merge=lfs -text
68
+ Demucs_models/f7e0c4bc-ba3fe64a.th filter=lfs diff=lfs merge=lfs -text
69
+ Demucs_models/fa0cb7f9-100d8bf4.th filter=lfs diff=lfs merge=lfs -text
70
+ Demucs_models/light_extra.th filter=lfs diff=lfs merge=lfs -text
71
+ Demucs_models/light.th filter=lfs diff=lfs merge=lfs -text
72
+ Demucs_models/tasnet_extra-df3777b2.th filter=lfs diff=lfs merge=lfs -text
73
+ Demucs_models/tasnet_extra.th filter=lfs diff=lfs merge=lfs -text
74
+ Demucs_models/tasnet-beb46fac.th filter=lfs diff=lfs merge=lfs -text
75
+ Demucs_models/tasnet.th filter=lfs diff=lfs merge=lfs -text
76
+ impulse/VS8F-1/100-Reverb.wav filter=lfs diff=lfs merge=lfs -text
77
+ impulse/VS8F-1/101-LargeHall.wav filter=lfs diff=lfs merge=lfs -text
78
+ impulse/VS8F-1/102-SmallHall.wav filter=lfs diff=lfs merge=lfs -text
79
+ impulse/VS8F-1/103-Strings.wav filter=lfs diff=lfs merge=lfs -text
80
+ impulse/VS8F-1/104-PianoHall.wav filter=lfs diff=lfs merge=lfs -text
81
+ impulse/VS8F-1/105-OrchRoom.wav filter=lfs diff=lfs merge=lfs -text
82
+ impulse/VS8F-1/106-VocalRoom.wav filter=lfs diff=lfs merge=lfs -text
83
+ impulse/VS8F-1/107-MediumRm.wav filter=lfs diff=lfs merge=lfs -text
84
+ impulse/VS8F-1/108-LargeRoom.wav filter=lfs diff=lfs merge=lfs -text
85
+ impulse/VS8F-1/109-CoolPlate.wav filter=lfs diff=lfs merge=lfs -text
86
+ impulse/VS8F-1/110-ShortPlt.wav filter=lfs diff=lfs merge=lfs -text
87
+ impulse/VS8F-1/111-VocalPlt.wav filter=lfs diff=lfs merge=lfs -text
88
+ impulse/VS8F-1/112-SoftAmb.wav filter=lfs diff=lfs merge=lfs -text
89
+ impulse/VS8F-1/113-RoomAmb.wav filter=lfs diff=lfs merge=lfs -text
90
+ impulse/VS8F-1/114-Cathedral.wav filter=lfs diff=lfs merge=lfs -text
91
+ impulse/VS8F-1/115-LongCave.wav filter=lfs diff=lfs merge=lfs -text
92
+ impulse/VS8F-1/116-GarageDr.wav filter=lfs diff=lfs merge=lfs -text
93
+ impulse/VS8F-1/117-RockKick.wav filter=lfs diff=lfs merge=lfs -text
94
+ impulse/VS8F-1/118-RockSnare.wav filter=lfs diff=lfs merge=lfs -text
95
+ impulse/VS8F-2/200-R1_Reverb1.wav filter=lfs diff=lfs merge=lfs -text
96
+ impulse/VS8F-2/201-R1_LargeHall.wav filter=lfs diff=lfs merge=lfs -text
97
+ impulse/VS8F-2/202-R1_SmallHall.wav filter=lfs diff=lfs merge=lfs -text
98
+ impulse/VS8F-2/203-R1_Strings.wav filter=lfs diff=lfs merge=lfs -text
99
+ impulse/VS8F-2/204-R1_PianoHall.wav filter=lfs diff=lfs merge=lfs -text
100
+ impulse/VS8F-2/205-R1_OrchRoom.wav filter=lfs diff=lfs merge=lfs -text
101
+ impulse/VS8F-2/206-R1_VocalRoom.wav filter=lfs diff=lfs merge=lfs -text
102
+ impulse/VS8F-2/207-R1_MediumRm.wav filter=lfs diff=lfs merge=lfs -text
103
+ impulse/VS8F-2/208-R1_LargeRoom.wav filter=lfs diff=lfs merge=lfs -text
104
+ impulse/VS8F-2/209-R1_CoolPlate.wav filter=lfs diff=lfs merge=lfs -text
105
+ impulse/VS8F-2/210-R1_ShortPlt.wav filter=lfs diff=lfs merge=lfs -text
106
+ impulse/VS8F-2/211-R1_VocalPlt.wav filter=lfs diff=lfs merge=lfs -text
107
+ impulse/VS8F-2/212-R1_SoftAmb.wav filter=lfs diff=lfs merge=lfs -text
108
+ impulse/VS8F-2/213-R1_RoomAmb.wav filter=lfs diff=lfs merge=lfs -text
109
+ impulse/VS8F-2/214-R1_Cathedral.wav filter=lfs diff=lfs merge=lfs -text
110
+ impulse/VS8F-2/215-R1_LongCave.wav filter=lfs diff=lfs merge=lfs -text
111
+ impulse/VS8F-2/216-R1_GarageDr.wav filter=lfs diff=lfs merge=lfs -text
112
+ impulse/VS8F-2/217-R1_RockKick.wav filter=lfs diff=lfs merge=lfs -text
113
+ impulse/VS8F-2/218-R1_RockSnare.wav filter=lfs diff=lfs merge=lfs -text
114
+ impulse/VS8F-2/220-R2_Reverb2.wav filter=lfs diff=lfs merge=lfs -text
115
+ impulse/VS8F-2/221-R2_LargeHall.wav filter=lfs diff=lfs merge=lfs -text
116
+ impulse/VS8F-2/222-R2_SmallHall.wav filter=lfs diff=lfs merge=lfs -text
117
+ impulse/VS8F-2/223-R2_Strings.wav filter=lfs diff=lfs merge=lfs -text
118
+ impulse/VS8F-2/224-R2_PianoHall.wav filter=lfs diff=lfs merge=lfs -text
119
+ impulse/VS8F-2/225-R2_OrchRoom.wav filter=lfs diff=lfs merge=lfs -text
120
+ impulse/VS8F-2/226-R2_VocalRoom.wav filter=lfs diff=lfs merge=lfs -text
121
+ impulse/VS8F-2/227-R2_MediumRm.wav filter=lfs diff=lfs merge=lfs -text
122
+ impulse/VS8F-2/228-R2_LargeRoom.wav filter=lfs diff=lfs merge=lfs -text
123
+ impulse/VS8F-2/229-R2_CoolPlate.wav filter=lfs diff=lfs merge=lfs -text
124
+ impulse/VS8F-2/230-R2_ShortPlt.wav filter=lfs diff=lfs merge=lfs -text
125
+ impulse/VS8F-2/231-R2_VocalPlt.wav filter=lfs diff=lfs merge=lfs -text
126
+ impulse/VS8F-2/232-R2_SoftAmb.wav filter=lfs diff=lfs merge=lfs -text
127
+ impulse/VS8F-2/233-R2_RoomAmb.wav filter=lfs diff=lfs merge=lfs -text
128
+ impulse/VS8F-2/234-R2_Cathedral.wav filter=lfs diff=lfs merge=lfs -text
129
+ impulse/VS8F-2/235-R2_LongCave.wav filter=lfs diff=lfs merge=lfs -text
130
+ impulse/VS8F-2/236-R2_GarageDr.wav filter=lfs diff=lfs merge=lfs -text
131
+ impulse/VS8F-2/237-R2_RockKick.wav filter=lfs diff=lfs merge=lfs -text
132
+ impulse/VS8F-2/238-R2_RockSnare.wav filter=lfs diff=lfs merge=lfs -text
133
+ impulse/VS8F-3/301-LargeHall.wav filter=lfs diff=lfs merge=lfs -text
134
+ impulse/VS8F-3/302-SmallHall.wav filter=lfs diff=lfs merge=lfs -text
135
+ impulse/VS8F-3/303-Strings.wav filter=lfs diff=lfs merge=lfs -text
136
+ impulse/VS8F-3/304-PianoHall.wav filter=lfs diff=lfs merge=lfs -text
137
+ impulse/VS8F-3/305-OrchRoom.wav filter=lfs diff=lfs merge=lfs -text
138
+ impulse/VS8F-3/306-VocalRoom.wav filter=lfs diff=lfs merge=lfs -text
139
+ impulse/VS8F-3/307-MediumRm.wav filter=lfs diff=lfs merge=lfs -text
140
+ impulse/VS8F-3/308-LargeRoom.wav filter=lfs diff=lfs merge=lfs -text
141
+ impulse/VS8F-3/309-CoolPlate.wav filter=lfs diff=lfs merge=lfs -text
142
+ impulse/VS8F-3/310-ShortPlt.wav filter=lfs diff=lfs merge=lfs -text
143
+ impulse/VS8F-3/311-VocalPlt.wav filter=lfs diff=lfs merge=lfs -text
144
+ impulse/VS8F-3/312-SoftAmb.wav filter=lfs diff=lfs merge=lfs -text
145
+ impulse/VS8F-3/313-RoomAmb.wav filter=lfs diff=lfs merge=lfs -text
146
+ impulse/VS8F-3/314-Cathedral.wav filter=lfs diff=lfs merge=lfs -text
147
+ impulse/VS8F-3/315-LongCave.wav filter=lfs diff=lfs merge=lfs -text
148
+ impulse/VS8F-3/316-GarageDr.wav filter=lfs diff=lfs merge=lfs -text
149
+ impulse/VS8F-3/317-RockKick.wav filter=lfs diff=lfs merge=lfs -text
150
+ impulse/VS8F-3/318-RockSnare.wav filter=lfs diff=lfs merge=lfs -text
151
+ models/Demucs/Demucs_v1/demucs_extra.th filter=lfs diff=lfs merge=lfs -text
152
+ models/Demucs/Demucs_v1/demucs.th filter=lfs diff=lfs merge=lfs -text
153
+ models/Demucs/Demucs_v1/light_extra.th filter=lfs diff=lfs merge=lfs -text
154
+ models/Demucs/Demucs_v1/light.th filter=lfs diff=lfs merge=lfs -text
155
+ models/Demucs/Demucs_v1/tasnet_extra.th filter=lfs diff=lfs merge=lfs -text
156
+ models/Demucs/Demucs_v1/tasnet.th filter=lfs diff=lfs merge=lfs -text
157
+ models/Demucs/Demucs_v2/demucs_extra-3646af93.th filter=lfs diff=lfs merge=lfs -text
158
+ models/Demucs/Demucs_v2/demucs_unittest-09ebc15f.th filter=lfs diff=lfs merge=lfs -text
159
+ models/Demucs/Demucs_v2/demucs-e07c671f.th filter=lfs diff=lfs merge=lfs -text
160
+ models/Demucs/Demucs_v2/demucs48_hq-28a1282c.th filter=lfs diff=lfs merge=lfs -text
161
+ models/Demucs/Demucs_v2/tasnet_extra-df3777b2.th filter=lfs diff=lfs merge=lfs -text
162
+ models/Demucs/Demucs_v2/tasnet-beb46fac.th filter=lfs diff=lfs merge=lfs -text
163
+ models/Demucs/Demucs_v3/0d19c1c6-0f06f20e.th filter=lfs diff=lfs merge=lfs -text
164
+ models/Demucs/Demucs_v3/14fc6a69-a89dd0ee.th filter=lfs diff=lfs merge=lfs -text
165
+ models/Demucs/Demucs_v3/1ef250f1-592467ce.th filter=lfs diff=lfs merge=lfs -text
166
+ models/Demucs/Demucs_v3/305bc58f-18378783.th filter=lfs diff=lfs merge=lfs -text
167
+ models/Demucs/Demucs_v3/42e558d4-196e0e1b.th filter=lfs diff=lfs merge=lfs -text
168
+ models/Demucs/Demucs_v3/464b36d7-e5a9386e.th filter=lfs diff=lfs merge=lfs -text
169
+ models/Demucs/Demucs_v3/5d2d6c55-db83574e.th filter=lfs diff=lfs merge=lfs -text
170
+ models/Demucs/Demucs_v3/6b9c2ca1-3fd82607.th filter=lfs diff=lfs merge=lfs -text
171
+ models/Demucs/Demucs_v3/7d865c68-3d5dd56b.th filter=lfs diff=lfs merge=lfs -text
172
+ models/Demucs/Demucs_v3/7ecf8ec1-70f50cc9.th filter=lfs diff=lfs merge=lfs -text
173
+ models/Demucs/Demucs_v3/7fd6ef75-a905dd85.th filter=lfs diff=lfs merge=lfs -text
174
+ models/Demucs/Demucs_v3/83fc094f-4a16d450.th filter=lfs diff=lfs merge=lfs -text
175
+ models/Demucs/Demucs_v3/902315c2-b39ce9c9.th filter=lfs diff=lfs merge=lfs -text
176
+ models/Demucs/Demucs_v3/9a6b4851-03af0aa6.th filter=lfs diff=lfs merge=lfs -text
177
+ models/Demucs/Demucs_v3/a1d90b5c-ae9d2452.th filter=lfs diff=lfs merge=lfs -text
178
+ models/Demucs/Demucs_v3/b72baf4e-8778635e.th filter=lfs diff=lfs merge=lfs -text
179
+ models/Demucs/Demucs_v3/c511e2ab-fe698775.th filter=lfs diff=lfs merge=lfs -text
180
+ models/Demucs/Demucs_v3/cfa93e08-61801ae1.th filter=lfs diff=lfs merge=lfs -text
181
+ models/Demucs/Demucs_v3/e51eebcc-c1b80bdd.th filter=lfs diff=lfs merge=lfs -text
182
+ models/Demucs/Demucs_v3/ebf34a2db.th filter=lfs diff=lfs merge=lfs -text
183
+ models/Demucs/Demucs_v3/fa0cb7f9-100d8bf4.th filter=lfs diff=lfs merge=lfs -text
184
+ models/Demucs/Demucs_v4/04573f0d-f3cf25b2.th filter=lfs diff=lfs merge=lfs -text
185
+ models/Demucs/Demucs_v4/5c90dfd2-34c22ccb.th filter=lfs diff=lfs merge=lfs -text
186
+ models/Demucs/Demucs_v4/75fc33f5-1941ce65.th filter=lfs diff=lfs merge=lfs -text
187
+ models/Demucs/Demucs_v4/92cfc3b6-ef3bcb9c.th filter=lfs diff=lfs merge=lfs -text
188
+ models/Demucs/Demucs_v4/955717e8-8726e21a.th filter=lfs diff=lfs merge=lfs -text
189
+ models/Demucs/Demucs_v4/d12395a8-e57c48e6.th filter=lfs diff=lfs merge=lfs -text
190
+ models/Demucs/Demucs_v4/f7e0c4bc-ba3fe64a.th filter=lfs diff=lfs merge=lfs -text
Politrees/UVR_resources/README.md ADDED
@@ -0,0 +1,18 @@
1
+ ---
2
+ license: mit
3
+ tags:
4
+ - uvr
5
+ - uvr5
6
+ - ultimatevocalremover
7
+ - demucs
8
+ - vr-arch
9
+ - mdx-net
10
+ - mdx23c
11
+ - roformer
12
+ - scnet
13
+ - bandit
14
+ ---
15
+
16
+ <div align="center">
17
+ <h1><big><big><big>Made for <a href="https://github.com/Politrees/UVR_resources">UVR_resources on GitHub</a></big></big></big></h1>
18
+ </div>
Politrees/UVR_resources/models/Apollo/apollo_edm_big_by_essid.yaml ADDED
@@ -0,0 +1,114 @@
1
+ exp:
2
+ dir: ./exps # directory to save the experiment
3
+ name: bluearchive # name of the experiment
4
+
5
+ datas:
6
+ _target_: look2hear.datas.DataModule
7
+ dataset_type: 1 # 1 or 2. see README for more details
8
+ sr: 44100 # sample rate
9
+ segments: 4 # cropped audio in seconds. chunksize = sr * segments
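+ # e.g. with sr=44100 and segments=4, chunksize = 44100 * 4 = 176400 samples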
10
+ num_steps: 1000 # number of samples to be used for training in one epoch.
11
+ batch_size: 1 # batch size
12
+ num_workers: 0 # number of workers for data loading
13
+ pin_memory: true # pin memory for data loading
14
+
15
+ stems:
16
+ original: original # key for the original audio files, don't change it
17
+ codec: codec # key for the codec audio files, don't change it
18
+
19
+ train:
20
+ dir: # dataset where the training audio files are stored
21
+ - output # list of directories
22
+ original_format: wav # the format of the original audio files
23
+ codec_format: mp3 # the format of the codec audio files
24
+
25
+ valid:
26
+ dir: # dataset where the validation audio files are stored
27
+ - output_v # list of directories
28
+ original_format: wav # the format of the original audio files
29
+ codec_format: mp3 # the format of the codec audio files
30
+
31
+ model:
32
+ _target_: look2hear.models.apollo.Apollo
33
+ sr: 44100 # sample rate
34
+ win: 20 # window size in milliseconds
35
+ feature_dim: 256 # feature dimension
36
+ layer: 6 # number of layers
37
+
38
+ discriminator:
39
+ _target_: look2hear.discriminators.frequencydis.MultiFrequencyDiscriminator
40
+ nch: 2
41
+ window: [32, 64, 128, 256, 512, 1024, 2048]
42
+
43
+ optimizer_g:
44
+ _target_: torch.optim.AdamW
45
+ lr: 0.001
46
+ weight_decay: 0.01
47
+
48
+ optimizer_d:
49
+ _target_: torch.optim.AdamW
50
+ lr: 0.0001
51
+ weight_decay: 0.01
52
+ betas: [0.5, 0.99]
53
+
54
+ scheduler_g:
55
+ _target_: torch.optim.lr_scheduler.StepLR
56
+ step_size: 2
57
+ gamma: 0.98
58
+
59
+ scheduler_d:
60
+ _target_: torch.optim.lr_scheduler.StepLR
61
+ step_size: 2
62
+ gamma: 0.98
63
+
64
+ loss_g:
65
+ _target_: look2hear.losses.gan_losses.MultiFrequencyGenLoss
66
+ eps: 1e-8
67
+
68
+ loss_d:
69
+ _target_: look2hear.losses.gan_losses.MultiFrequencyDisLoss
70
+ eps: 1e-8
71
+
72
+ metrics:
73
+ _target_: look2hear.losses.MultiSrcNegSDR
74
+ sdr_type: sisdr # metric for validation, one of [snr, sisdr, sdsdr]
75
+
76
+ system:
77
+ _target_: look2hear.system.audio_litmodule.AudioLightningModule
78
+
79
+ # comment out the early_stopping block below if you do not want early stopping
80
+ early_stopping:
81
+ _target_: pytorch_lightning.callbacks.EarlyStopping
82
+ monitor: val_loss # metric to monitor
83
+ patience: 50 # number of epochs with no improvement after which training will be stopped
84
+ mode: min
85
+ verbose: true
86
+
87
+ checkpoint:
88
+ _target_: pytorch_lightning.callbacks.ModelCheckpoint
89
+ dirpath: ${exp.dir}/${exp.name}/checkpoints
90
+ monitor: val_loss # metric to monitor
91
+ mode: min
92
+ verbose: true
93
+ save_top_k: 10 # number of best models to save
94
+ save_last: true # save the last checkpoint
95
+ filename: '{epoch}-{val_loss:.4f}'
96
+
97
+ logger:
98
+ _target_: pytorch_lightning.loggers.WandbLogger
99
+ name: ${exp.name}
100
+ save_dir: ${exp.dir}/${exp.name}/logs
101
+ offline: false # if true, the logs will not be uploaded to wandb
102
+ project: Audio-Restoration
103
+
104
+ trainer:
105
+ _target_: pytorch_lightning.Trainer
106
+ devices: [0] # number of GPUs to use
107
+ max_epochs: 1000 # max number of epochs
108
+ sync_batchnorm: true
109
+ default_root_dir: ${exp.dir}/${exp.name}/
110
+ accelerator: cuda
111
+ limit_train_batches: 1.0
112
+ fast_dev_run: false
113
+ precision: bf16 # [16, bf16, 32, 64]
114
+ enable_model_summary: true
Politrees/UVR_resources/models/Apollo/apollo_edm_by_essid.yaml ADDED
@@ -0,0 +1,114 @@
1
+ exp:
2
+ dir: ./exps # directory to save the experiment
3
+ name: bluearchive # name of the experiment
4
+
5
+ datas:
6
+ _target_: look2hear.datas.DataModule
7
+ dataset_type: 1 # 1 or 2. see README for more details
8
+ sr: 44100 # sample rate
9
+ segments: 4 # cropped audio in seconds. chunksize = sr * segments
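+ # e.g. with sr=44100 and segments=4, chunksize = 44100 * 4 = 176400 samples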
10
+ num_steps: 1000 # number of samples to be used for training in one epoch.
11
+ batch_size: 1 # batch size
12
+ num_workers: 0 # number of workers for data loading
13
+ pin_memory: true # pin memory for data loading
14
+
15
+ stems:
16
+ original: original # key for the original audio files, don't change it
17
+ codec: codec # key for the codec audio files, don't change it
18
+
19
+ train:
20
+ dir: # dataset where the training audio files are stored
21
+ - output # list of directories
22
+ original_format: wav # the format of the original audio files
23
+ codec_format: mp3 # the format of the codec audio files
24
+
25
+ valid:
26
+ dir: # dataset where the validation audio files are stored
27
+ - output_v # list of directories
28
+ original_format: wav # the format of the original audio files
29
+ codec_format: mp3 # the format of the codec audio files
30
+
31
+ model:
32
+ _target_: look2hear.models.apollo.Apollo
33
+ sr: 44100 # sample rate
34
+ win: 20 # window size in milliseconds
35
+ feature_dim: 128 # feature dimension
36
+ layer: 6 # number of layers
37
+
38
+ discriminator:
39
+ _target_: look2hear.discriminators.frequencydis.MultiFrequencyDiscriminator
40
+ nch: 2
41
+ window: [32, 64, 128, 256, 512, 1024, 2048]
42
+
43
+ optimizer_g:
44
+ _target_: torch.optim.AdamW
45
+ lr: 0.001
46
+ weight_decay: 0.01
47
+
48
+ optimizer_d:
49
+ _target_: torch.optim.AdamW
50
+ lr: 0.0001
51
+ weight_decay: 0.01
52
+ betas: [0.5, 0.99]
53
+
54
+ scheduler_g:
55
+ _target_: torch.optim.lr_scheduler.StepLR
56
+ step_size: 2
57
+ gamma: 0.98
58
+
59
+ scheduler_d:
60
+ _target_: torch.optim.lr_scheduler.StepLR
61
+ step_size: 2
62
+ gamma: 0.98
63
+
64
+ loss_g:
65
+ _target_: look2hear.losses.gan_losses.MultiFrequencyGenLoss
66
+ eps: 1e-8
67
+
68
+ loss_d:
69
+ _target_: look2hear.losses.gan_losses.MultiFrequencyDisLoss
70
+ eps: 1e-8
71
+
72
+ metrics:
73
+ _target_: look2hear.losses.MultiSrcNegSDR
74
+ sdr_type: sisdr # metric for validation, one of [snr, sisdr, sdsdr]
75
+
76
+ system:
77
+ _target_: look2hear.system.audio_litmodule.AudioLightningModule
78
+
79
+ # comment out the early_stopping block below if you do not want early stopping
80
+ early_stopping:
81
+ _target_: pytorch_lightning.callbacks.EarlyStopping
82
+ monitor: val_loss # metric to monitor
83
+ patience: 50 # number of epochs with no improvement after which training will be stopped
84
+ mode: min
85
+ verbose: true
86
+
87
+ checkpoint:
88
+ _target_: pytorch_lightning.callbacks.ModelCheckpoint
89
+ dirpath: ${exp.dir}/${exp.name}/checkpoints
90
+ monitor: val_loss # metric to monitor
91
+ mode: min
92
+ verbose: true
93
+ save_top_k: 10 # number of best models to save
94
+ save_last: true # save the last checkpoint
95
+ filename: '{epoch}-{val_loss:.4f}'
96
+
97
+ logger:
98
+ _target_: pytorch_lightning.loggers.WandbLogger
99
+ name: ${exp.name}
100
+ save_dir: ${exp.dir}/${exp.name}/logs
101
+ offline: false # if true, the logs will not be uploaded to wandb
102
+ project: Audio-Restoration
103
+
104
+ trainer:
105
+ _target_: pytorch_lightning.Trainer
106
+ devices: [0] # number of GPUs to use
107
+ max_epochs: 1000 # max number of epochs
108
+ sync_batchnorm: true
109
+ default_root_dir: ${exp.dir}/${exp.name}/
110
+ accelerator: cuda
111
+ limit_train_batches: 1.0
112
+ fast_dev_run: false
113
+ precision: bf16 # [16, bf16, 32, 64]
114
+ enable_model_summary: true
Politrees/UVR_resources/models/MDX23C/config_dereverb_mdx23c.yaml ADDED
@@ -0,0 +1,135 @@
1
+ audio:
2
+ chunk_size: 261120
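+ # note: 261120 = hop_length * (dim_t - 1) = 1024 * 255, i.e. dim_t STFT frames (~5.9 s at 44.1 kHz)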
3
+ dim_f: 4096
4
+ dim_t: 256
5
+ hop_length: 1024
6
+ n_fft: 8192
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.001
10
+
11
+ model:
12
+ act: gelu
13
+ bottleneck_factor: 4
14
+ growth: 128
15
+ norm: InstanceNorm
16
+ num_blocks_per_scale: 2
17
+ num_channels: 128
18
+ num_scales: 5
19
+ num_subbands: 4
20
+ scale:
21
+ - 2
22
+ - 2
23
+
24
+ training:
25
+ batch_size: 2
26
+ gradient_accumulation_steps: 1
27
+ grad_clip: 0
28
+ instruments:
29
+ - dry
30
+ - No dry
31
+ lr: 1.0e-06
32
+ patience: 4
33
+ reduce_factor: 0.93
34
+ target_instrument: null
35
+ num_epochs: 40
36
+ num_steps: 1000
37
+ q: 0.95
38
+ coarse_loss_clip: true
39
+ ema_momentum: 0.999
40
+ optimizer: adamw
41
+ read_metadata_procs: 8 # Number of processes to use during metadata reading for dataset. Can speed up metadata generation
42
+ other_fix: false # used when validating on the multisong dataset to check whether "other" is actually the instrumental
43
+ use_amp: true # enable mixed precision (float16); usually this should stay true
44
+
45
+ augmentations:
46
+ enable: false # enable or disable all augmentations (to fast disable if needed)
47
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
48
+ loudness_min: 0.5
49
+ loudness_max: 1.5
50
+ mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
51
+ mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
52
+ - 0.2
53
+ - 0.02
54
+ mixup_loudness_min: 0.5
55
+ mixup_loudness_max: 1.5
56
+
57
+ all:
58
+ channel_shuffle: 0.5 # Set 0 or lower to disable
59
+ random_inverse: 0.05 # inverse track (better lower probability)
60
+ random_polarity: 0.5 # polarity change (multiply waveform to -1)
61
+
62
+ # pedalboard chorus block
63
+ pedalboard_chorus: 0.001
64
+ pedalboard_chorus_rate_hz_min: 1.0
65
+ pedalboard_chorus_rate_hz_max: 7.0
66
+ pedalboard_chorus_depth_min: 0.25
67
+ pedalboard_chorus_depth_max: 0.95
68
+ pedalboard_chorus_centre_delay_ms_min: 3
69
+ pedalboard_chorus_centre_delay_ms_max: 10
70
+ pedalboard_chorus_feedback_min: 0.0
71
+ pedalboard_chorus_feedback_max: 0.01
72
+ pedalboard_chorus_mix_min: 0.1
73
+ pedalboard_chorus_mix_max: 0.9
74
+
75
+ # pedalboard phazer block
76
+ pedalboard_phazer: 0.001
77
+ pedalboard_phazer_rate_hz_min: 1.0
78
+ pedalboard_phazer_rate_hz_max: 10.0
79
+ pedalboard_phazer_depth_min: 0.25
80
+ pedalboard_phazer_depth_max: 0.95
81
+ pedalboard_phazer_centre_frequency_hz_min: 200
82
+ pedalboard_phazer_centre_frequency_hz_max: 12000
83
+ pedalboard_phazer_feedback_min: 0.0
84
+ pedalboard_phazer_feedback_max: 0.5
85
+ pedalboard_phazer_mix_min: 0.1
86
+ pedalboard_phazer_mix_max: 0.9
87
+
88
+ # pedalboard pitch shift block
89
+ pedalboard_pitch_shift: 0.01
90
+ pedalboard_pitch_shift_semitones_min: -7
91
+ pedalboard_pitch_shift_semitones_max: 7
92
+
93
+ # pedalboard resample block
94
+ pedalboard_resample: 0.001
95
+ pedalboard_resample_target_sample_rate_min: 4000
96
+ pedalboard_resample_target_sample_rate_max: 44100
97
+
98
+ mp3_compression_min_bitrate: 32
99
+ mp3_compression_max_bitrate: 320
100
+ mp3_compression_backend: "lameenc"
101
+
102
+ dry:
103
+ # pedalboard distortion block
104
+ pedalboard_distortion: 0.001
105
+ pedalboard_distortion_drive_db_min: 1.0
106
+ pedalboard_distortion_drive_db_max: 25.0
107
+
108
+ tanh_distortion: 0.05
109
+ tanh_distortion_min: 0.1
110
+ tanh_distortion_max: 0.7
111
+ # pedalboard bitcrash block
112
+ pedalboard_bitcrash: 0.005
113
+ pedalboard_bitcrash_bit_depth_min: 4
114
+ pedalboard_bitcrash_bit_depth_max: 16
115
+
116
+ seven_band_parametric_eq: 0.24
117
+ seven_band_parametric_eq_min_gain_db: -9
118
+ seven_band_parametric_eq_max_gain_db: 9
119
+
120
+ gaussian_noise: 0.005
121
+ gaussian_noise_min_amplitude: 0.001
122
+ gaussian_noise_max_amplitude: 0.01
123
+
124
+ time_stretch: 0.01
125
+ time_stretch_min_rate: 0.8
126
+ time_stretch_max_rate: 1.25
127
+ other:
128
+ seven_band_parametric_eq: 0.24
129
+ seven_band_parametric_eq_min_gain_db: -9
130
+ seven_band_parametric_eq_max_gain_db: 9
131
+
132
+ inference:
133
+ batch_size: 2
134
+ dim_t: 256
135
+ num_overlap: 4
Politrees/UVR_resources/models/MDX23C/config_drumsep_mdx23c.yaml ADDED
@@ -0,0 +1,87 @@
1
+ audio:
2
+ chunk_size: 130560
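+ # note: 130560 = hop_length * (dim_t - 1) = 512 * 255 (~3 s at 44.1 kHz)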
3
+ dim_f: 1024
4
+ dim_t: 256
5
+ hop_length: 512
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.001
10
+
11
+ model:
12
+ act: gelu
13
+ bottleneck_factor: 4
14
+ growth: 128
15
+ norm: InstanceNorm
16
+ num_blocks_per_scale: 2
17
+ num_channels: 128
18
+ num_scales: 5
19
+ num_subbands: 4
20
+ scale:
21
+ - 2
22
+ - 2
23
+
24
+ training:
25
+ batch_size: 12
26
+ gradient_accumulation_steps: 1
27
+ grad_clip: 0
28
+ instruments:
29
+ - kick
30
+ - snare
31
+ - toms
32
+ - hh
33
+ - ride
34
+ - crash
35
+ lr: 9.0e-05
36
+ patience: 30
37
+ reduce_factor: 0.95
38
+ target_instrument: null
39
+ num_epochs: 1000
40
+ num_steps: 1268
41
+ q: 0.95
42
+ coarse_loss_clip: true
43
+ ema_momentum: 0.999
44
+ optimizer: adam
45
+ other_fix: false # used when validating on the multisong dataset to check whether "other" is actually the instrumental
46
+ use_amp: true # enable mixed precision (float16); usually this should stay true
47
+
48
+ augmentations:
49
+ enable: true # enable or disable all augmentations (to fast disable if needed)
50
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
51
+ loudness_min: 0.5
52
+ loudness_max: 1.5
53
+ mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
54
+ mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
55
+ - 0.2
56
+ - 0.02
57
+ mixup_loudness_min: 0.5
58
+ mixup_loudness_max: 1.5
59
+
60
+ # apply mp3 compression to mixture only (emulate downloading mp3 from internet)
61
+ mp3_compression_on_mixture: 0.0
62
+ mp3_compression_on_mixture_bitrate_min: 32
63
+ mp3_compression_on_mixture_bitrate_max: 320
64
+ mp3_compression_on_mixture_backend: "lameenc"
65
+
66
+ all:
67
+ channel_shuffle: 0.5 # Set 0 or lower to disable
68
+ random_inverse: 0.01 # inverse track (better lower probability)
69
+ random_polarity: 0.5 # polarity change (multiply waveform to -1)
70
+ mp3_compression: 0.0
71
+ mp3_compression_min_bitrate: 32
72
+ mp3_compression_max_bitrate: 320
73
+ mp3_compression_backend: "lameenc"
74
+ pitch_shift: 0.1
75
+ pitch_shift_min_semitones: -3
76
+ pitch_shift_max_semitones: 3
77
+ seven_band_parametric_eq: 0.5
78
+ seven_band_parametric_eq_min_gain_db: -6
79
+ seven_band_parametric_eq_max_gain_db: 6
80
+ tanh_distortion: 0.2
81
+ tanh_distortion_min: 0.1
82
+ tanh_distortion_max: 0.5
83
+
84
+ inference:
85
+ batch_size: 1
86
+ dim_t: 256
87
+ num_overlap: 4
Politrees/UVR_resources/models/MDX23C/config_mdx23c_similarity.yaml ADDED
@@ -0,0 +1,47 @@
1
+ audio:
2
+ chunk_size: 130560
3
+ dim_f: 1024
4
+ dim_t: 256
5
+ hop_length: 512
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.001
10
+
11
+ model:
12
+ act: gelu
13
+ bottleneck_factor: 4
14
+ growth: 128
15
+ norm: InstanceNorm
16
+ num_blocks_per_scale: 2
17
+ num_channels: 128
18
+ num_scales: 5
19
+ num_subbands: 4
20
+ scale:
21
+ - 2
22
+ - 2
23
+
24
+ training:
25
+ batch_size: 2
26
+ gradient_accumulation_steps: 3
27
+ grad_clip: 0
28
+ instruments:
29
+ - Similarity
30
+ - Difference
31
+ lr: 1.0
32
+ patience: 15
33
+ reduce_factor: 0.95
34
+ target_instrument: Similarity
35
+ num_epochs: 1000
36
+ num_steps: 2235
37
+ q: 0.95
38
+ coarse_loss_clip: true
39
+ ema_momentum: 0.999
40
+ optimizer: prodigy
41
+ other_fix: false # used when validating on the multisong dataset to check whether "other" is actually the instrumental
42
+ use_amp: true # enable mixed precision (float16); usually this should stay true
43
+
44
+ inference:
45
+ batch_size: 8
46
+ dim_t: 256
47
+ num_overlap: 8
Politrees/UVR_resources/models/MDX23C/model_2_stem_061321.yaml ADDED
@@ -0,0 +1,36 @@
1
+ audio:
2
+ chunk_size: 260096
3
+ dim_f: 4096
4
+ dim_t: 256
5
+ hop_length: 2048
6
+ n_fft: 12288
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.001
10
+ model:
11
+ act: gelu
12
+ bottleneck_factor: 4
13
+ growth: 64
14
+ norm: InstanceNorm
15
+ num_blocks_per_scale: 2
16
+ num_channels: 128
17
+ num_scales: 5
18
+ num_subbands: 4
19
+ scale:
20
+ - 2
21
+ - 2
22
+ name: epoch_10.ckpt
23
+ training:
24
+ batch_size: 16
25
+ grad_clip: 0
26
+ instruments:
27
+ - Vocals
28
+ - Instrumental
29
+ lr: 5.0e-05
30
+ target_instrument: null
31
+ num_epochs: 100
32
+ num_steps: 1000
33
+ inference:
34
+ batch_size: 1
35
+ dim_t: 256
36
+ num_overlap: 8
Politrees/UVR_resources/models/MDX23C/model_2_stem_full_band_8k.yaml ADDED
@@ -0,0 +1,43 @@
1
+ audio:
2
+ chunk_size: 261120
3
+ dim_f: 4096
4
+ dim_t: 256
5
+ hop_length: 1024
6
+ n_fft: 8192
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.001
10
+ model:
11
+ act: gelu
12
+ bottleneck_factor: 4
13
+ growth: 128
14
+ norm: InstanceNorm
15
+ num_blocks_per_scale: 2
16
+ num_channels: 128
17
+ num_scales: 5
18
+ num_subbands: 4
19
+ scale:
20
+ - 2
21
+ - 2
22
+ training:
23
+ batch_size: 6
24
+ grad_clip: 0
25
+ instruments:
26
+ - Vocals
27
+ - Instrumental
28
+ lr: 1.0e-05
29
+ patience: 2
30
+ reduce_factor: 0.95
31
+ target_instrument: null
32
+ num_epochs: 1000
33
+ num_steps: 1000
34
+ augmentation: 1
35
+ augmentation_type: simple1
36
+ augmentation_mix: true
37
+ q: 0.95
38
+ coarse_loss_clip: true
39
+ ema_momentum: 0.999
40
+ inference:
41
+ batch_size: 1
42
+ dim_t: 256
43
+ num_overlap: 8
Politrees/UVR_resources/models/Roformer/BandSplit/config_BandSplit-Roformer_FNO_by-Unwa.yaml ADDED
@@ -0,0 +1,136 @@
1
+ audio:
2
+ chunk_size: 749259
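+ # note: 749259 = 441 * 1699, i.e. 1700 frames at the model's stft_hop_length of 441 (~17 s at 44.1 kHz)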
3
+ dim_f: 1024
4
+ dim_t: 1700 # not used here; the model section defines its own STFT parameters
5
+ hop_length: 441 # not used here; the model section defines its own STFT parameters
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.000
10
+
11
+ model:
12
+ dim: 256
13
+ depth: 12
14
+ stereo: true
15
+ num_stems: 1
16
+ time_transformer_depth: 1
17
+ freq_transformer_depth: 1
18
+ linear_transformer_depth: 0
19
+ freqs_per_bands: !!python/tuple
20
+ - 2
21
+ - 2
22
+ - 2
23
+ - 2
24
+ - 2
25
+ - 2
26
+ - 2
27
+ - 2
28
+ - 2
29
+ - 2
30
+ - 2
31
+ - 2
32
+ - 2
33
+ - 2
34
+ - 2
35
+ - 2
36
+ - 2
37
+ - 2
38
+ - 2
39
+ - 2
40
+ - 2
41
+ - 2
42
+ - 2
43
+ - 2
44
+ - 4
45
+ - 4
46
+ - 4
47
+ - 4
48
+ - 4
49
+ - 4
50
+ - 4
51
+ - 4
52
+ - 4
53
+ - 4
54
+ - 4
55
+ - 4
56
+ - 12
57
+ - 12
58
+ - 12
59
+ - 12
60
+ - 12
61
+ - 12
62
+ - 12
63
+ - 12
64
+ - 24
65
+ - 24
66
+ - 24
67
+ - 24
68
+ - 24
69
+ - 24
70
+ - 24
71
+ - 24
72
+ - 48
73
+ - 48
74
+ - 48
75
+ - 48
76
+ - 48
77
+ - 48
78
+ - 48
79
+ - 48
80
+ - 128
81
+ - 129
82
+ dim_head: 64
83
+ heads: 8
84
+ attn_dropout: 0.
85
+ ff_dropout: 0.
86
+ flash_attn: true
87
+ dim_freqs_in: 1025
88
+ stft_n_fft: 2048
89
+ stft_hop_length: 441
90
+ stft_win_length: 2048
91
+ stft_normalized: false
92
+ mask_estimator_depth: 2
93
+ multi_stft_resolution_loss_weight: 1.0
94
+ multi_stft_resolutions_window_sizes: !!python/tuple
95
+ - 4096
96
+ - 2048
97
+ - 1024
98
+ - 512
99
+ - 256
100
+ multi_stft_hop_size: 147
101
+ multi_stft_normalized: False
102
+ mlp_expansion_factor: 4
103
+
104
+ training:
105
+ batch_size: 2
106
+ gradient_accumulation_steps: 1
107
+ grad_clip: 0
108
+ instruments: ['vocals', 'other']
109
+ patience: 3
110
+ reduce_factor: 0.95
111
+ target_instrument: other
112
+ num_epochs: 1000
113
+ num_steps: 1000
114
+ augmentation: false # enable augmentations by audiomentations and pedalboard
115
+ augmentation_type: simple1
116
+ use_mp3_compress: false # Deprecated
117
+ augmentation_mix: true # Mix several stems of the same type with some probability
118
+ augmentation_loudness: true # randomly change loudness of each stem
119
+ augmentation_loudness_type: 1 # Type 1 or 2
120
+ augmentation_loudness_min: 0.5
121
+ augmentation_loudness_max: 1.5
122
+ q: 0.95
123
+ coarse_loss_clip: true
124
+ ema_momentum: 0.999
125
+ # optimizer: prodigy
126
+ optimizer: adam
127
+ # lr: 1.0
128
+ lr: 1.0e-5
129
+ other_fix: false # used when validating on the multisong dataset to check whether "other" is actually the instrumental
130
+ use_amp: true # enable mixed precision (float16); usually this should stay true
131
+
132
+ inference:
133
+ batch_size: 2
134
+ dim_t: 1700
135
+ num_overlap: 2
136
+ normalize: false
Politrees/UVR_resources/models/Roformer/BandSplit/config_BandSplit-Roformer_Karaoke_Frazer_by-becruily.yaml ADDED
@@ -0,0 +1,129 @@
1
+ audio:
2
+ chunk_size: 882000
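+ # note: 882000 samples = exactly 20 s at 44.1 kHz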
3
+ dim_f: 1024
4
+ dim_t: 801
5
+ hop_length: 441
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.000
10
+
11
+ model:
12
+ dim: 256
13
+ depth: 12
14
+ stereo: true
15
+ num_stems: 1
16
+ time_transformer_depth: 1
17
+ freq_transformer_depth: 1
18
+ linear_transformer_depth: 0
19
+ freqs_per_bands: !!python/tuple
20
+ - 2
21
+ - 2
22
+ - 2
23
+ - 2
24
+ - 2
25
+ - 2
26
+ - 2
27
+ - 2
28
+ - 2
29
+ - 2
30
+ - 2
31
+ - 2
32
+ - 2
33
+ - 2
34
+ - 2
35
+ - 2
36
+ - 2
37
+ - 2
38
+ - 2
39
+ - 2
40
+ - 2
41
+ - 2
42
+ - 2
43
+ - 2
44
+ - 4
45
+ - 4
46
+ - 4
47
+ - 4
48
+ - 4
49
+ - 4
50
+ - 4
51
+ - 4
52
+ - 4
53
+ - 4
54
+ - 4
55
+ - 4
56
+ - 12
57
+ - 12
58
+ - 12
59
+ - 12
60
+ - 12
61
+ - 12
62
+ - 12
63
+ - 12
64
+ - 24
65
+ - 24
66
+ - 24
67
+ - 24
68
+ - 24
69
+ - 24
70
+ - 24
71
+ - 24
72
+ - 48
73
+ - 48
74
+ - 48
75
+ - 48
76
+ - 48
77
+ - 48
78
+ - 48
79
+ - 48
80
+ - 128
81
+ - 129
82
+ dim_head: 64
83
+ heads: 8
84
+ attn_dropout: 0
85
+ ff_dropout: 0
86
+ flash_attn: true
87
+ dim_freqs_in: 1025
88
+ stft_n_fft: 2048
89
+ stft_hop_length: 512
90
+ stft_win_length: 2048
91
+ stft_normalized: false
92
+ mask_estimator_depth: 2
93
+ multi_stft_resolution_loss_weight: 1.0
94
+ multi_stft_resolutions_window_sizes: !!python/tuple
95
+ - 4096
96
+ - 2048
97
+ - 1024
98
+ - 512
99
+ - 256
100
+ multi_stft_hop_size: 147
101
+ multi_stft_normalized: false
102
+ mlp_expansion_factor: 4
103
+
104
+ training:
105
+ batch_size: 1
106
+ gradient_accumulation_steps: 1
107
+ grad_clip: 0
108
+ instruments:
109
+ - Vocals
110
+ - Instrumental
111
+ patience: 2
112
+ reduce_factor: 0.95
113
+ target_instrument: Vocals
114
+ num_epochs: 1000
115
+ num_steps: 1000
116
+ q: 0.95
117
+ coarse_loss_clip: true
118
+ ema_momentum: 0.999
119
+ # optimizer: prodigy
120
+ optimizer: adam
121
+ lr: 1.0e-5
122
+ other_fix: false # used when validating on the multisong dataset to check whether "other" is actually the instrumental
123
+ use_amp: true # enable mixed precision (float16); usually this should stay true
124
+
125
+ inference:
126
+ batch_size: 2
127
+ dim_t: 2001
128
+ num_overlap: 4
129
+ normalize: false
Politrees/UVR_resources/models/Roformer/BandSplit/config_BandSplit-Roformer_Resurrection_Instrumental_by-Unwa.yaml ADDED
@@ -0,0 +1,138 @@
1
+ audio:
2
+ chunk_size: 749259
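+ # note: 749259 = 441 * 1699, i.e. 1700 frames at the model's stft_hop_length of 441 (~17 s at 44.1 kHz)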
3
+ dim_f: 1024
4
+ dim_t: 1700 # not used here; the model section defines its own STFT parameters
5
+ hop_length: 441 # not used here; the model section defines its own STFT parameters
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.000
10
+
11
+ model:
12
+ dim: 256
13
+ depth: 12
14
+ stereo: true
15
+ num_stems: 1
16
+ time_transformer_depth: 1
17
+ freq_transformer_depth: 1
18
+ linear_transformer_depth: 0
19
+ freqs_per_bands: !!python/tuple
20
+ - 2
21
+ - 2
22
+ - 2
23
+ - 2
24
+ - 2
25
+ - 2
26
+ - 2
27
+ - 2
28
+ - 2
29
+ - 2
30
+ - 2
31
+ - 2
32
+ - 2
33
+ - 2
34
+ - 2
35
+ - 2
36
+ - 2
37
+ - 2
38
+ - 2
39
+ - 2
40
+ - 2
41
+ - 2
42
+ - 2
43
+ - 2
44
+ - 4
45
+ - 4
46
+ - 4
47
+ - 4
48
+ - 4
49
+ - 4
50
+ - 4
51
+ - 4
52
+ - 4
53
+ - 4
54
+ - 4
55
+ - 4
56
+ - 12
57
+ - 12
58
+ - 12
59
+ - 12
60
+ - 12
61
+ - 12
62
+ - 12
63
+ - 12
64
+ - 24
65
+ - 24
66
+ - 24
67
+ - 24
68
+ - 24
69
+ - 24
70
+ - 24
71
+ - 24
72
+ - 48
73
+ - 48
74
+ - 48
75
+ - 48
76
+ - 48
77
+ - 48
78
+ - 48
79
+ - 48
80
+ - 128
81
+ - 129
82
+ dim_head: 64
83
+ heads: 8
84
+ attn_dropout: 0.
85
+ ff_dropout: 0.
86
+ flash_attn: true
87
+ dim_freqs_in: 1025
88
+ stft_n_fft: 2048
89
+ stft_hop_length: 441
90
+ stft_win_length: 2048
91
+ stft_normalized: false
92
+ mask_estimator_depth: 2
93
+ multi_stft_resolution_loss_weight: 1.0
94
+ multi_stft_resolutions_window_sizes: !!python/tuple
95
+ - 4096
96
+ - 2048
97
+ - 1024
98
+ - 512
99
+ - 256
100
+ multi_stft_hop_size: 147
101
+ multi_stft_normalized: False
102
+ mlp_expansion_factor: 4
103
+ use_torch_checkpoint: False # it allows to greatly reduce GPU memory consumption during training (not fully tested)
104
+ skip_connection: False # Enable skip connection between transformer blocks - can solve problem with gradients and probably faster training
105
+
106
+ training:
107
+ batch_size: 2
108
+ gradient_accumulation_steps: 1
109
+ grad_clip: 0
110
+ instruments: ['vocals', 'other']
111
+ patience: 3
112
+ reduce_factor: 0.95
113
+ target_instrument: other
114
+ num_epochs: 1000
115
+ num_steps: 1000
116
+ augmentation: false # enable augmentations by audiomentations and pedalboard
117
+ augmentation_type: simple1
118
+ use_mp3_compress: false # Deprecated
119
+ augmentation_mix: true # Mix several stems of the same type with some probability
120
+ augmentation_loudness: true # randomly change loudness of each stem
121
+ augmentation_loudness_type: 1 # Type 1 or 2
122
+ augmentation_loudness_min: 0.5
123
+ augmentation_loudness_max: 1.5
124
+ q: 0.95
125
+ coarse_loss_clip: true
126
+ ema_momentum: 0.999
127
+ # optimizer: prodigy
128
+ optimizer: adam
129
+ # lr: 1.0
130
+ lr: 1.0e-5
131
+ other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
132
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
133
+
134
+ inference:
135
+ batch_size: 2
136
+ dim_t: 1700
137
+ num_overlap: 2
138
+ normalize: false
Politrees/UVR_resources/models/Roformer/BandSplit/config_BandSplit-Roformer_Resurrection_Vocals_by-Unwa.yaml ADDED
@@ -0,0 +1,138 @@
1
+ audio:
2
+ chunk_size: 785920
3
+ dim_f: 1024
4
+ dim_t: 1536 # don't work (use in model)
5
+ hop_length: 441 # don't work (use in model)
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.000
10
+
11
+ model:
12
+ dim: 256
13
+ depth: 12
14
+ stereo: true
15
+ num_stems: 1
16
+ time_transformer_depth: 1
17
+ freq_transformer_depth: 1
18
+ linear_transformer_depth: 0
19
+ freqs_per_bands: !!python/tuple
20
+ - 2
21
+ - 2
22
+ - 2
23
+ - 2
24
+ - 2
25
+ - 2
26
+ - 2
27
+ - 2
28
+ - 2
29
+ - 2
30
+ - 2
31
+ - 2
32
+ - 2
33
+ - 2
34
+ - 2
35
+ - 2
36
+ - 2
37
+ - 2
38
+ - 2
39
+ - 2
40
+ - 2
41
+ - 2
42
+ - 2
43
+ - 2
44
+ - 4
45
+ - 4
46
+ - 4
47
+ - 4
48
+ - 4
49
+ - 4
50
+ - 4
51
+ - 4
52
+ - 4
53
+ - 4
54
+ - 4
55
+ - 4
56
+ - 12
57
+ - 12
58
+ - 12
59
+ - 12
60
+ - 12
61
+ - 12
62
+ - 12
63
+ - 12
64
+ - 24
65
+ - 24
66
+ - 24
67
+ - 24
68
+ - 24
69
+ - 24
70
+ - 24
71
+ - 24
72
+ - 48
73
+ - 48
74
+ - 48
75
+ - 48
76
+ - 48
77
+ - 48
78
+ - 48
79
+ - 48
80
+ - 128
81
+ - 129
82
+ dim_head: 64
83
+ heads: 8
84
+ attn_dropout: 0.
85
+ ff_dropout: 0.
86
+ flash_attn: true
87
+ dim_freqs_in: 1025
88
+ stft_n_fft: 2048
89
+ stft_hop_length: 512
90
+ stft_win_length: 2048
91
+ stft_normalized: false
92
+ mask_estimator_depth: 2
93
+ multi_stft_resolution_loss_weight: 1.0
94
+ multi_stft_resolutions_window_sizes: !!python/tuple
95
+ - 4096
96
+ - 2048
97
+ - 1024
98
+ - 512
99
+ - 256
100
+ multi_stft_hop_size: 147
101
+ multi_stft_normalized: False
102
+ mlp_expansion_factor: 4
103
+ use_torch_checkpoint: False # it allows to greatly reduce GPU memory consumption during training (not fully tested)
104
+ skip_connection: False # Enable skip connection between transformer blocks - can solve problem with gradients and probably faster training
105
+
106
+ training:
107
+ batch_size: 2
108
+ gradient_accumulation_steps: 1
109
+ grad_clip: 0
110
+ instruments: ['vocals', 'other']
111
+ patience: 3
112
+ reduce_factor: 0.95
113
+ target_instrument: vocals
114
+ num_epochs: 1000
115
+ num_steps: 1000
116
+ augmentation: false # enable augmentations by audiomentations and pedalboard
117
+ augmentation_type: simple1
118
+ use_mp3_compress: false # Deprecated
119
+ augmentation_mix: true # Mix several stems of the same type with some probability
120
+ augmentation_loudness: true # randomly change loudness of each stem
121
+ augmentation_loudness_type: 1 # Type 1 or 2
122
+ augmentation_loudness_min: 0.5
123
+ augmentation_loudness_max: 1.5
124
+ q: 0.95
125
+ coarse_loss_clip: true
126
+ ema_momentum: 0.999
127
+ # optimizer: prodigy
128
+ optimizer: adam
129
+ # lr: 1.0
130
+ lr: 1.0e-5
131
+ other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
132
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
133
+
134
+ inference:
135
+ batch_size: 2
136
+ dim_t: 1536
137
+ num_overlap: 2
138
+ normalize: false
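A quick sanity check on how `chunk_size`, `stft_hop_length`, and `dim_t` relate in the two Resurrection configs above; this is back-of-the-envelope arithmetic for illustration, not code from the repository:

```python
# Frames produced by a centered STFT: floor(chunk_size / hop) + 1.
def stft_frames(chunk_size: int, hop: int) -> int:
    return chunk_size // hop + 1

# Instrumental config: chunk_size 749259, stft_hop_length 441 -> 1700 frames (dim_t: 1700).
print(stft_frames(749259, 441))  # 1700
# Vocals config: chunk_size 785920, stft_hop_length 512 -> 1536 frames (dim_t: 1536).
print(stft_frames(785920, 512))  # 1536
```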
Politrees/UVR_resources/models/Roformer/BandSplit/config_BandSplit-Roformer_SW_by-jarredou.yaml ADDED
@@ -0,0 +1,197 @@
1
+ audio:
2
+ chunk_size: 588800 #882000
3
+ dim_f: 1024
4
+ dim_t: 801 # don't work (use in model)
5
+ hop_length: 441 # don't work (use in model)
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.000
10
+
11
+ model:
12
+ dim: 256
13
+ depth: 12
14
+ stereo: true
15
+ num_stems: 6
16
+ time_transformer_depth: 1
17
+ freq_transformer_depth: 1
18
+ linear_transformer_depth: 0
19
+ freqs_per_bands: !!python/tuple
20
+ - 2
21
+ - 2
22
+ - 2
23
+ - 2
24
+ - 2
25
+ - 2
26
+ - 2
27
+ - 2
28
+ - 2
29
+ - 2
30
+ - 2
31
+ - 2
32
+ - 2
33
+ - 2
34
+ - 2
35
+ - 2
36
+ - 2
37
+ - 2
38
+ - 2
39
+ - 2
40
+ - 2
41
+ - 2
42
+ - 2
43
+ - 2
44
+ - 4
45
+ - 4
46
+ - 4
47
+ - 4
48
+ - 4
49
+ - 4
50
+ - 4
51
+ - 4
52
+ - 4
53
+ - 4
54
+ - 4
55
+ - 4
56
+ - 12
57
+ - 12
58
+ - 12
59
+ - 12
60
+ - 12
61
+ - 12
62
+ - 12
63
+ - 12
64
+ - 24
65
+ - 24
66
+ - 24
67
+ - 24
68
+ - 24
69
+ - 24
70
+ - 24
71
+ - 24
72
+ - 48
73
+ - 48
74
+ - 48
75
+ - 48
76
+ - 48
77
+ - 48
78
+ - 48
79
+ - 48
80
+ - 128
81
+ - 129
82
+ dim_head: 64
83
+ heads: 8
84
+ attn_dropout: 0.1
85
+ ff_dropout: 0.1
86
+ flash_attn: true
87
+ dim_freqs_in: 1025
88
+ stft_n_fft: 2048
89
+ stft_hop_length: 512
90
+ stft_win_length: 2048
91
+ stft_normalized: false
92
+ mask_estimator_depth: 2
93
+ multi_stft_resolution_loss_weight: 1.0
94
+ multi_stft_resolutions_window_sizes: !!python/tuple
95
+ - 4096
96
+ - 2048
97
+ - 1024
98
+ - 512
99
+ - 256
100
+ multi_stft_hop_size: 147
101
+ multi_stft_normalized: False
102
+ mlp_expansion_factor: 4
103
+ use_torch_checkpoint: False # it allows to greatly reduce GPU memory consumption during training (not fully tested)
104
+ skip_connection: False # Enable skip connection between transformer blocks - can solve problem with gradients and probably faster training
105
+
106
+ training:
107
+ batch_size: 2
108
+ gradient_accumulation_steps: 1
109
+ grad_clip: 0
110
+ instruments: ['bass', 'drums', 'other', 'vocals', 'guitar', 'piano']
111
+ patience: 3
112
+ reduce_factor: 0.95
113
+ target_instrument: null
114
+ num_epochs: 1000
115
+ num_steps: 1000
116
+ augmentation: false # enable augmentations by audiomentations and pedalboard
117
+ augmentation_type: simple1
118
+ use_mp3_compress: false # Deprecated
119
+ augmentation_mix: true # Mix several stems of the same type with some probability
120
+ augmentation_loudness: true # randomly change loudness of each stem
121
+ augmentation_loudness_type: 1 # Type 1 or 2
122
+ augmentation_loudness_min: 0.5
123
+ augmentation_loudness_max: 1.5
124
+ q: 0.95
125
+ coarse_loss_clip: true
126
+ ema_momentum: 0.999
127
+ # optimizer: prodigy
128
+ optimizer: adam
129
+ # lr: 1.0
130
+ lr: 1.0e-5
131
+ other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
132
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
133
+
134
+ augmentations:
135
+ enable: true # enable or disable all augmentations (to fast disable if needed)
136
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
137
+ loudness_min: 0.5
138
+ loudness_max: 1.5
139
+ mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
140
+ mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
141
+ - 0.2
142
+ - 0.02
143
+ mixup_loudness_min: 0.5
144
+ mixup_loudness_max: 1.5
145
+
146
+ all:
147
+ channel_shuffle: 0.5 # Set 0 or lower to disable
148
+ random_inverse: 0.1 # inverse track (better lower probability)
149
+ random_polarity: 0.5 # polarity change (multiply waveform to -1)
150
+
151
+ vocals:
152
+ pitch_shift: 0.1
153
+ pitch_shift_min_semitones: -5
154
+ pitch_shift_max_semitones: 5
155
+ seven_band_parametric_eq: 0.1
156
+ seven_band_parametric_eq_min_gain_db: -9
157
+ seven_band_parametric_eq_max_gain_db: 9
158
+ tanh_distortion: 0.1
159
+ tanh_distortion_min: 0.1
160
+ tanh_distortion_max: 0.7
161
+ bass:
162
+ pitch_shift: 0.1
163
+ pitch_shift_min_semitones: -2
164
+ pitch_shift_max_semitones: 2
165
+ seven_band_parametric_eq: 0.1
166
+ seven_band_parametric_eq_min_gain_db: -3
167
+ seven_band_parametric_eq_max_gain_db: 6
168
+ tanh_distortion: 0.1
169
+ tanh_distortion_min: 0.1
170
+ tanh_distortion_max: 0.5
171
+ drums:
172
+ pitch_shift: 0.1
173
+ pitch_shift_min_semitones: -5
174
+ pitch_shift_max_semitones: 5
175
+ seven_band_parametric_eq: 0.1
176
+ seven_band_parametric_eq_min_gain_db: -9
177
+ seven_band_parametric_eq_max_gain_db: 9
178
+ tanh_distortion: 0.1
179
+ tanh_distortion_min: 0.1
180
+ tanh_distortion_max: 0.6
181
+ other:
182
+ pitch_shift: 0.1
183
+ pitch_shift_min_semitones: -4
184
+ pitch_shift_max_semitones: 4
185
+ gaussian_noise: 0.1
186
+ gaussian_noise_min_amplitude: 0.001
187
+ gaussian_noise_max_amplitude: 0.015
188
+ time_stretch: 0.1
189
+ time_stretch_min_rate: 0.8
190
+ time_stretch_max_rate: 1.25
191
+
192
+
193
+ inference:
194
+ batch_size: 1
195
+ dim_t: 1101
196
+ num_overlap: 2
197
+ normalize: false
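The `freqs_per_bands` tuple repeated throughout these BandSplit configs is expected to cover every STFT bin exactly once (`dim_freqs_in: 1025` with `stft_n_fft: 2048`). A small check, written here only for illustration:

```python
# 24 bands of 2 bins, 12 of 4, 8 of 12, 8 of 24, 8 of 48, then 128 and 129.
freqs_per_bands = (2,) * 24 + (4,) * 12 + (12,) * 8 + (24,) * 8 + (48,) * 8 + (128, 129)

stft_n_fft = 2048
assert sum(freqs_per_bands) == stft_n_fft // 2 + 1 == 1025  # dim_freqs_in in the configs
print(len(freqs_per_bands), "bands covering", sum(freqs_per_bands), "bins")  # 62 bands, 1025 bins
```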
Politrees/UVR_resources/models/Roformer/BandSplit/config_BandSplit_Roformer_4stems_FT_by_SYH99999.yaml ADDED
@@ -0,0 +1,196 @@
1
+ audio:
2
+ chunk_size: 485100
3
+ dim_f: 1024
4
+ dim_t: 801 # don't work (use in model)
5
+ hop_length: 441 # don't work (use in model)
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.000
10
+
11
+ model:
12
+ dim: 384
13
+ depth: 8
14
+ stereo: true
15
+ num_stems: 4
16
+ time_transformer_depth: 1
17
+ freq_transformer_depth: 1
18
+ linear_transformer_depth: 0
19
+ freqs_per_bands: !!python/tuple
20
+ - 2
21
+ - 2
22
+ - 2
23
+ - 2
24
+ - 2
25
+ - 2
26
+ - 2
27
+ - 2
28
+ - 2
29
+ - 2
30
+ - 2
31
+ - 2
32
+ - 2
33
+ - 2
34
+ - 2
35
+ - 2
36
+ - 2
37
+ - 2
38
+ - 2
39
+ - 2
40
+ - 2
41
+ - 2
42
+ - 2
43
+ - 2
44
+ - 4
45
+ - 4
46
+ - 4
47
+ - 4
48
+ - 4
49
+ - 4
50
+ - 4
51
+ - 4
52
+ - 4
53
+ - 4
54
+ - 4
55
+ - 4
56
+ - 12
57
+ - 12
58
+ - 12
59
+ - 12
60
+ - 12
61
+ - 12
62
+ - 12
63
+ - 12
64
+ - 24
65
+ - 24
66
+ - 24
67
+ - 24
68
+ - 24
69
+ - 24
70
+ - 24
71
+ - 24
72
+ - 48
73
+ - 48
74
+ - 48
75
+ - 48
76
+ - 48
77
+ - 48
78
+ - 48
79
+ - 48
80
+ - 128
81
+ - 129
82
+ dim_head: 64
83
+ heads: 8
84
+ attn_dropout: 0.1
85
+ ff_dropout: 0.1
86
+ flash_attn: true
87
+ dim_freqs_in: 1025
88
+ stft_n_fft: 2048
89
+ stft_hop_length: 441
90
+ stft_win_length: 2048
91
+ stft_normalized: false
92
+ mask_estimator_depth: 2
93
+ multi_stft_resolution_loss_weight: 1.0
94
+ multi_stft_resolutions_window_sizes: !!python/tuple
95
+ - 4096
96
+ - 2048
97
+ - 1024
98
+ - 512
99
+ - 256
100
+ multi_stft_hop_size: 147
101
+ multi_stft_normalized: False
102
+ mlp_expansion_factor: 2
103
+ use_torch_checkpoint: False # it allows to greatly reduce GPU memory consumption during training (not fully tested)
104
+ skip_connection: False # Enable skip connection between transformer blocks - can solve problem with gradients and probably faster training
105
+
106
+ training:
107
+ batch_size: 2
108
+ gradient_accumulation_steps: 1
109
+ grad_clip: 0
110
+ instruments: ['drums', 'bass', 'other', 'vocals']
111
+ patience: 3
112
+ reduce_factor: 0.95
113
+ target_instrument: null
114
+ num_epochs: 1000
115
+ num_steps: 1000
116
+ augmentation: false # enable augmentations by audiomentations and pedalboard
117
+ augmentation_type: simple1
118
+ use_mp3_compress: false # Deprecated
119
+ augmentation_mix: true # Mix several stems of the same type with some probability
120
+ augmentation_loudness: true # randomly change loudness of each stem
121
+ augmentation_loudness_type: 1 # Type 1 or 2
122
+ augmentation_loudness_min: 0.5
123
+ augmentation_loudness_max: 1.5
124
+ q: 0.95
125
+ coarse_loss_clip: true
126
+ ema_momentum: 0.999
127
+ # optimizer: prodigy
128
+ optimizer: adam
129
+ # lr: 1.0
130
+ lr: 1.0e-5
131
+ other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
132
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
133
+
134
+ augmentations:
135
+ enable: true # enable or disable all augmentations (to fast disable if needed)
136
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
137
+ loudness_min: 0.5
138
+ loudness_max: 1.5
139
+ mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
140
+ mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
141
+ - 0.2
142
+ - 0.02
143
+ mixup_loudness_min: 0.5
144
+ mixup_loudness_max: 1.5
145
+
146
+ all:
147
+ channel_shuffle: 0.5 # Set 0 or lower to disable
148
+ random_inverse: 0.1 # inverse track (better lower probability)
149
+ random_polarity: 0.5 # polarity change (multiply waveform to -1)
150
+
151
+ vocals:
152
+ pitch_shift: 0.1
153
+ pitch_shift_min_semitones: -5
154
+ pitch_shift_max_semitones: 5
155
+ seven_band_parametric_eq: 0.1
156
+ seven_band_parametric_eq_min_gain_db: -9
157
+ seven_band_parametric_eq_max_gain_db: 9
158
+ tanh_distortion: 0.1
159
+ tanh_distortion_min: 0.1
160
+ tanh_distortion_max: 0.7
161
+ bass:
162
+ pitch_shift: 0.1
163
+ pitch_shift_min_semitones: -2
164
+ pitch_shift_max_semitones: 2
165
+ seven_band_parametric_eq: 0.1
166
+ seven_band_parametric_eq_min_gain_db: -3
167
+ seven_band_parametric_eq_max_gain_db: 6
168
+ tanh_distortion: 0.1
169
+ tanh_distortion_min: 0.1
170
+ tanh_distortion_max: 0.5
171
+ drums:
172
+ pitch_shift: 0.1
173
+ pitch_shift_min_semitones: -5
174
+ pitch_shift_max_semitones: 5
175
+ seven_band_parametric_eq: 0.1
176
+ seven_band_parametric_eq_min_gain_db: -9
177
+ seven_band_parametric_eq_max_gain_db: 9
178
+ tanh_distortion: 0.1
179
+ tanh_distortion_min: 0.1
180
+ tanh_distortion_max: 0.6
181
+ other:
182
+ pitch_shift: 0.1
183
+ pitch_shift_min_semitones: -4
184
+ pitch_shift_max_semitones: 4
185
+ gaussian_noise: 0.1
186
+ gaussian_noise_min_amplitude: 0.001
187
+ gaussian_noise_max_amplitude: 0.015
188
+ time_stretch: 0.1
189
+ time_stretch_min_rate: 0.8
190
+ time_stretch_max_rate: 1.25
191
+
192
+
193
+ inference:
194
+ batch_size: 2
195
+ dim_t: 2048
196
+ num_overlap: 4
Politrees/UVR_resources/models/Roformer/BandSplit/config_bs_roformer_chorus_male_female.yaml ADDED
@@ -0,0 +1,125 @@
1
+ audio:
2
+ chunk_size: 352800
3
+ dim_f: 1024
4
+ dim_t: 801 # don't work (use in model)
5
+ hop_length: 441 # don't work (use in model)
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.000
10
+
11
+ model:
12
+ dim: 384
13
+ depth: 8
14
+ stereo: true
15
+ num_stems: 2
16
+ time_transformer_depth: 1
17
+ freq_transformer_depth: 1
18
+ freqs_per_bands: !!python/tuple
19
+ - 2
20
+ - 2
21
+ - 2
22
+ - 2
23
+ - 2
24
+ - 2
25
+ - 2
26
+ - 2
27
+ - 2
28
+ - 2
29
+ - 2
30
+ - 2
31
+ - 2
32
+ - 2
33
+ - 2
34
+ - 2
35
+ - 2
36
+ - 2
37
+ - 2
38
+ - 2
39
+ - 2
40
+ - 2
41
+ - 2
42
+ - 2
43
+ - 4
44
+ - 4
45
+ - 4
46
+ - 4
47
+ - 4
48
+ - 4
49
+ - 4
50
+ - 4
51
+ - 4
52
+ - 4
53
+ - 4
54
+ - 4
55
+ - 12
56
+ - 12
57
+ - 12
58
+ - 12
59
+ - 12
60
+ - 12
61
+ - 12
62
+ - 12
63
+ - 24
64
+ - 24
65
+ - 24
66
+ - 24
67
+ - 24
68
+ - 24
69
+ - 24
70
+ - 24
71
+ - 48
72
+ - 48
73
+ - 48
74
+ - 48
75
+ - 48
76
+ - 48
77
+ - 48
78
+ - 48
79
+ - 128
80
+ - 129
81
+ dim_head: 64
82
+ heads: 8
83
+ attn_dropout: 0.0
84
+ ff_dropout: 0.0
85
+ flash_attn: true
86
+ dim_freqs_in: 1025
87
+ stft_n_fft: 2048
88
+ stft_hop_length: 441
89
+ stft_win_length: 2048
90
+ stft_normalized: false
91
+ mask_estimator_depth: 2
92
+ multi_stft_resolution_loss_weight: 1.0
93
+ multi_stft_resolutions_window_sizes: !!python/tuple
94
+ - 4096
95
+ - 2048
96
+ - 1024
97
+ - 512
98
+ - 256
99
+ multi_stft_hop_size: 147
100
+ multi_stft_normalized: False
101
+
102
+ training:
103
+ batch_size: 1
104
+ gradient_accumulation_steps: 1
105
+ grad_clip: 0
106
+ instruments:
107
+ - male
108
+ - female
109
+ lr: 1.0e-05
110
+ patience: 2
111
+ reduce_factor: 0.95
112
+ target_instrument: null
113
+ num_epochs: 1000
114
+ num_steps: 1000
115
+ q: 0.95
116
+ coarse_loss_clip: true
117
+ ema_momentum: 0.999
118
+ optimizer: adam
119
+ other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental
120
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
121
+
122
+ inference:
123
+ batch_size: 1
124
+ dim_t: 801
125
+ num_overlap: 2
Politrees/UVR_resources/models/Roformer/BandSplit/config_bs_roformer_deverb_8_384dim_10depth.yaml ADDED
@@ -0,0 +1,137 @@
1
+ audio:
2
+ chunk_size: 352768
3
+ dim_f: 1024
4
+ dim_t: 801
5
+ hop_length: 441
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.001
10
+
11
+ model:
12
+ dim: 384
13
+ depth: 10
14
+ stereo: true
15
+ num_stems: 1
16
+ time_transformer_depth: 1
17
+ freq_transformer_depth: 1
18
+ freqs_per_bands: !!python/tuple
19
+ - 2
20
+ - 2
21
+ - 2
22
+ - 2
23
+ - 2
24
+ - 2
25
+ - 2
26
+ - 2
27
+ - 2
28
+ - 2
29
+ - 2
30
+ - 2
31
+ - 2
32
+ - 2
33
+ - 2
34
+ - 2
35
+ - 2
36
+ - 2
37
+ - 2
38
+ - 2
39
+ - 2
40
+ - 2
41
+ - 2
42
+ - 2
43
+ - 4
44
+ - 4
45
+ - 4
46
+ - 4
47
+ - 4
48
+ - 4
49
+ - 4
50
+ - 4
51
+ - 4
52
+ - 4
53
+ - 4
54
+ - 4
55
+ - 12
56
+ - 12
57
+ - 12
58
+ - 12
59
+ - 12
60
+ - 12
61
+ - 12
62
+ - 12
63
+ - 24
64
+ - 24
65
+ - 24
66
+ - 24
67
+ - 24
68
+ - 24
69
+ - 24
70
+ - 24
71
+ - 48
72
+ - 48
73
+ - 48
74
+ - 48
75
+ - 48
76
+ - 48
77
+ - 48
78
+ - 48
79
+ - 128
80
+ - 129
81
+ dim_head: 64
82
+ heads: 8
83
+ attn_dropout: 0.1
84
+ ff_dropout: 0.1
85
+ flash_attn: true
86
+ dim_freqs_in: 1025
87
+ stft_n_fft: 2048
88
+ stft_hop_length: 441
89
+ stft_win_length: 2048
90
+ stft_normalized: false
91
+ mask_estimator_depth: 2
92
+ multi_stft_resolution_loss_weight: 1.0
93
+ multi_stft_resolutions_window_sizes: !!python/tuple
94
+ - 4096
95
+ - 2048
96
+ - 1024
97
+ - 512
98
+ - 256
99
+ multi_stft_hop_size: 147
100
+ multi_stft_normalized: False
101
+
102
+ training:
103
+ batch_size: 1
104
+ gradient_accumulation_steps: 1
105
+ grad_clip: 0
106
+ instruments:
107
+ - noreverb
108
+ - reverb
109
+ lr: 5.0e-05
110
+ patience: 2
111
+ reduce_factor: 0.95
112
+ target_instrument: noreverb
113
+ num_epochs: 1000
114
+ num_steps: 1000
115
+ q: 0.95
116
+ coarse_loss_clip: true
117
+ ema_momentum: 0.999
118
+ optimizer: adam
119
+ other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
120
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
121
+
122
+ augmentations:
123
+ enable: true # enable or disable all augmentations (to fast disable if needed)
124
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
125
+ loudness_min: 0.5
126
+ loudness_max: 1.5
127
+ mixup: false # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
128
+ mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
129
+ - 0.2
130
+ - 0.02
131
+ mixup_loudness_min: 0.5
132
+ mixup_loudness_max: 1.5
133
+
134
+ inference:
135
+ batch_size: 4
136
+ dim_t: 801
137
+ num_overlap: 4
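With `num_stems: 1` and `target_instrument: noreverb`, the dereverb model above predicts only the dry signal; the `reverb` stem listed under `instruments`, if needed, is usually recovered by subtracting the estimate from the input mixture. A hedged sketch of that post-processing step (the tensor names are illustrative):

```python
import torch

def split_reverb(mixture: torch.Tensor, noreverb_est: torch.Tensor) -> dict:
    """mixture, noreverb_est: (channels, samples) waveforms at 44.1 kHz."""
    # The second stem is simply the residual of the single-target prediction.
    reverb_est = mixture - noreverb_est
    return {"noreverb": noreverb_est, "reverb": reverb_est}
```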
Politrees/UVR_resources/models/Roformer/BandSplit/config_bs_roformer_ep_317_sdr_12.9755.yaml ADDED
@@ -0,0 +1,133 @@
1
+ audio:
2
+ chunk_size: 352800
3
+ dim_f: 1024
4
+ dim_t: 801 # don't work (use in model)
5
+ hop_length: 441 # don't work (use in model)
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.001
10
+
11
+ model:
12
+ dim: 512
13
+ depth: 12
14
+ stereo: true
15
+ num_stems: 1
16
+ time_transformer_depth: 1
17
+ freq_transformer_depth: 1
18
+ freqs_per_bands: !!python/tuple
19
+ - 2
20
+ - 2
21
+ - 2
22
+ - 2
23
+ - 2
24
+ - 2
25
+ - 2
26
+ - 2
27
+ - 2
28
+ - 2
29
+ - 2
30
+ - 2
31
+ - 2
32
+ - 2
33
+ - 2
34
+ - 2
35
+ - 2
36
+ - 2
37
+ - 2
38
+ - 2
39
+ - 2
40
+ - 2
41
+ - 2
42
+ - 2
43
+ - 4
44
+ - 4
45
+ - 4
46
+ - 4
47
+ - 4
48
+ - 4
49
+ - 4
50
+ - 4
51
+ - 4
52
+ - 4
53
+ - 4
54
+ - 4
55
+ - 12
56
+ - 12
57
+ - 12
58
+ - 12
59
+ - 12
60
+ - 12
61
+ - 12
62
+ - 12
63
+ - 24
64
+ - 24
65
+ - 24
66
+ - 24
67
+ - 24
68
+ - 24
69
+ - 24
70
+ - 24
71
+ - 48
72
+ - 48
73
+ - 48
74
+ - 48
75
+ - 48
76
+ - 48
77
+ - 48
78
+ - 48
79
+ - 128
80
+ - 129
81
+ dim_head: 64
82
+ heads: 8
83
+ attn_dropout: 0.1
84
+ ff_dropout: 0.1
85
+ flash_attn: true
86
+ dim_freqs_in: 1025
87
+ stft_n_fft: 2048
88
+ stft_hop_length: 441
89
+ stft_win_length: 2048
90
+ stft_normalized: false
91
+ mask_estimator_depth: 2
92
+ multi_stft_resolution_loss_weight: 1.0
93
+ multi_stft_resolutions_window_sizes: !!python/tuple
94
+ - 4096
95
+ - 2048
96
+ - 1024
97
+ - 512
98
+ - 256
99
+ multi_stft_hop_size: 147
100
+ multi_stft_normalized: False
101
+
102
+ training:
103
+ batch_size: 16
104
+ gradient_accumulation_steps: 1
105
+ grad_clip: 0
106
+ instruments:
107
+ - Vocals
108
+ - Instrumental
109
+ lr: 5.0e-05
110
+ patience: 2
111
+ reduce_factor: 0.95
112
+ target_instrument: Vocals
113
+ num_epochs: 1000
114
+ num_steps: 1000
115
+ augmentation: false # enable augmentations by audiomentations and pedalboard
116
+ augmentation_type: simple1
117
+ use_mp3_compress: false # Deprecated
118
+ augmentation_mix: true # Mix several stems of the same type with some probability
119
+ augmentation_loudness: true # randomly change loudness of each stem
120
+ augmentation_loudness_type: 1 # Type 1 or 2
121
+ augmentation_loudness_min: 0.5
122
+ augmentation_loudness_max: 1.5
123
+ q: 0.95
124
+ coarse_loss_clip: true
125
+ ema_momentum: 0.999
126
+ optimizer: adam
127
+ other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
128
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
129
+
130
+ inference:
131
+ batch_size: 1
132
+ dim_t: 801
133
+ num_overlap: 4
Politrees/UVR_resources/models/Roformer/BandSplit/config_bs_roformer_ep_368_sdr_12.9628.yaml ADDED
@@ -0,0 +1,133 @@
1
+ audio:
2
+ chunk_size: 352800
3
+ dim_f: 1024
4
+ dim_t: 801 # don't work (use in model)
5
+ hop_length: 441 # don't work (use in model)
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.001
10
+
11
+ model:
12
+ dim: 512
13
+ depth: 12
14
+ stereo: true
15
+ num_stems: 1
16
+ time_transformer_depth: 1
17
+ freq_transformer_depth: 1
18
+ freqs_per_bands: !!python/tuple
19
+ - 2
20
+ - 2
21
+ - 2
22
+ - 2
23
+ - 2
24
+ - 2
25
+ - 2
26
+ - 2
27
+ - 2
28
+ - 2
29
+ - 2
30
+ - 2
31
+ - 2
32
+ - 2
33
+ - 2
34
+ - 2
35
+ - 2
36
+ - 2
37
+ - 2
38
+ - 2
39
+ - 2
40
+ - 2
41
+ - 2
42
+ - 2
43
+ - 4
44
+ - 4
45
+ - 4
46
+ - 4
47
+ - 4
48
+ - 4
49
+ - 4
50
+ - 4
51
+ - 4
52
+ - 4
53
+ - 4
54
+ - 4
55
+ - 12
56
+ - 12
57
+ - 12
58
+ - 12
59
+ - 12
60
+ - 12
61
+ - 12
62
+ - 12
63
+ - 24
64
+ - 24
65
+ - 24
66
+ - 24
67
+ - 24
68
+ - 24
69
+ - 24
70
+ - 24
71
+ - 48
72
+ - 48
73
+ - 48
74
+ - 48
75
+ - 48
76
+ - 48
77
+ - 48
78
+ - 48
79
+ - 128
80
+ - 129
81
+ dim_head: 64
82
+ heads: 8
83
+ attn_dropout: 0.1
84
+ ff_dropout: 0.1
85
+ flash_attn: true
86
+ dim_freqs_in: 1025
87
+ stft_n_fft: 2048
88
+ stft_hop_length: 441
89
+ stft_win_length: 2048
90
+ stft_normalized: false
91
+ mask_estimator_depth: 2
92
+ multi_stft_resolution_loss_weight: 1.0
93
+ multi_stft_resolutions_window_sizes: !!python/tuple
94
+ - 4096
95
+ - 2048
96
+ - 1024
97
+ - 512
98
+ - 256
99
+ multi_stft_hop_size: 147
100
+ multi_stft_normalized: False
101
+
102
+ training:
103
+ batch_size: 16
104
+ gradient_accumulation_steps: 1
105
+ grad_clip: 0
106
+ instruments:
107
+ - Vocals
108
+ - Instrumental
109
+ lr: 5.0e-05
110
+ patience: 2
111
+ reduce_factor: 0.95
112
+ target_instrument: Vocals
113
+ num_epochs: 1000
114
+ num_steps: 1000
115
+ augmentation: false # enable augmentations by audiomentations and pedalboard
116
+ augmentation_type: simple1
117
+ use_mp3_compress: false # Deprecated
118
+ augmentation_mix: true # Mix several stems of the same type with some probability
119
+ augmentation_loudness: true # randomly change loudness of each stem
120
+ augmentation_loudness_type: 1 # Type 1 or 2
121
+ augmentation_loudness_min: 0.5
122
+ augmentation_loudness_max: 1.5
123
+ q: 0.95
124
+ coarse_loss_clip: true
125
+ ema_momentum: 0.999
126
+ optimizer: adam
127
+ other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
128
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
129
+
130
+ inference:
131
+ batch_size: 1
132
+ dim_t: 801
133
+ num_overlap: 4
Politrees/UVR_resources/models/Roformer/BandSplit/config_bs_roformer_ep_937_sdr_10.5309.yaml ADDED
@@ -0,0 +1,138 @@
1
+ audio:
2
+ chunk_size: 131584
3
+ dim_f: 1024
4
+ dim_t: 256
5
+ hop_length: 512
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.001
10
+
11
+ model:
12
+ dim: 384
13
+ depth: 12
14
+ stereo: true
15
+ num_stems: 1
16
+ time_transformer_depth: 1
17
+ freq_transformer_depth: 1
18
+ linear_transformer_depth: 0
19
+ freqs_per_bands: !!python/tuple
20
+ - 2
21
+ - 2
22
+ - 2
23
+ - 2
24
+ - 2
25
+ - 2
26
+ - 2
27
+ - 2
28
+ - 2
29
+ - 2
30
+ - 2
31
+ - 2
32
+ - 2
33
+ - 2
34
+ - 2
35
+ - 2
36
+ - 2
37
+ - 2
38
+ - 2
39
+ - 2
40
+ - 2
41
+ - 2
42
+ - 2
43
+ - 2
44
+ - 4
45
+ - 4
46
+ - 4
47
+ - 4
48
+ - 4
49
+ - 4
50
+ - 4
51
+ - 4
52
+ - 4
53
+ - 4
54
+ - 4
55
+ - 4
56
+ - 12
57
+ - 12
58
+ - 12
59
+ - 12
60
+ - 12
61
+ - 12
62
+ - 12
63
+ - 12
64
+ - 24
65
+ - 24
66
+ - 24
67
+ - 24
68
+ - 24
69
+ - 24
70
+ - 24
71
+ - 24
72
+ - 48
73
+ - 48
74
+ - 48
75
+ - 48
76
+ - 48
77
+ - 48
78
+ - 48
79
+ - 48
80
+ - 128
81
+ - 129
82
+ dim_head: 64
83
+ heads: 8
84
+ attn_dropout: 0.1
85
+ ff_dropout: 0.1
86
+ flash_attn: true
87
+ dim_freqs_in: 1025
88
+ stft_n_fft: 2048
89
+ stft_hop_length: 512
90
+ stft_win_length: 2048
91
+ stft_normalized: false
92
+ mask_estimator_depth: 2
93
+ multi_stft_resolution_loss_weight: 1.0
94
+ multi_stft_resolutions_window_sizes: !!python/tuple
95
+ - 4096
96
+ - 2048
97
+ - 1024
98
+ - 512
99
+ - 256
100
+ multi_stft_hop_size: 147
101
+ multi_stft_normalized: False
102
+
103
+ training:
104
+ batch_size: 4
105
+ gradient_accumulation_steps: 1
106
+ grad_clip: 0
107
+ instruments:
108
+ - No Drum-Bass
109
+ - Drum-Bass
110
+ lr: 5.0e-05
111
+ patience: 2
112
+ reduce_factor: 0.95
113
+ target_instrument: No Drum-Bass
114
+ num_epochs: 1000
115
+ num_steps: 1000
116
+ q: 0.95
117
+ coarse_loss_clip: true
118
+ ema_momentum: 0.999
119
+ optimizer: adam
120
+ other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
121
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
122
+
123
+ augmentations:
124
+ enable: true # enable or disable all augmentations (to fast disable if needed)
125
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
126
+ loudness_min: 0.5
127
+ loudness_max: 1.5
128
+ mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
129
+ mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
130
+ - 0.2
131
+ - 0.02
132
+ mixup_loudness_min: 0.5
133
+ mixup_loudness_max: 1.5
134
+
135
+ inference:
136
+ batch_size: 1
137
+ dim_t: 512
138
+ num_overlap: 4
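`inference.num_overlap` controls how densely fixed-size chunks are tiled across a long track. A simplified overlap-add sketch of that idea, using this config's `chunk_size` of 131584; rectangular averaging only, and `model` is assumed to map a chunk to a same-length estimate, so this is not the repository's actual inference loop:

```python
import torch

def separate(model, mixture: torch.Tensor, chunk_size: int = 131584, num_overlap: int = 4) -> torch.Tensor:
    """mixture: (channels, samples). Tiles chunks with hop = chunk_size // num_overlap and averages overlaps."""
    hop = chunk_size // num_overlap
    out = torch.zeros_like(mixture)
    counts = torch.zeros(mixture.shape[-1])
    for start in range(0, mixture.shape[-1], hop):
        chunk = mixture[..., start:start + chunk_size]
        pad = chunk_size - chunk.shape[-1]
        if pad > 0:
            chunk = torch.nn.functional.pad(chunk, (0, pad))
        with torch.no_grad():
            est = model(chunk.unsqueeze(0)).squeeze(0)  # assumed (channels, chunk_size) output
        est = est[..., :chunk_size - pad]
        out[..., start:start + est.shape[-1]] += est
        counts[start:start + est.shape[-1]] += 1
    return out / counts.clamp(min=1)
```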
Politrees/UVR_resources/models/Roformer/BandSplit/config_bs_roformer_inst_exp_vrl.yaml ADDED
@@ -0,0 +1,124 @@
1
+ audio:
2
+ chunk_size: 485100 #352800 #485100
3
+ dim_f: 1024
4
+ dim_t: 801
5
+ hop_length: 441
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.000
10
+
11
+ model:
12
+ dim: 384
13
+ depth: 12
14
+ stereo: true
15
+ num_stems: 1
16
+ time_transformer_depth: 1
17
+ freq_transformer_depth: 1
18
+ freqs_per_bands: !!python/tuple
19
+ - 2
20
+ - 2
21
+ - 2
22
+ - 2
23
+ - 2
24
+ - 2
25
+ - 2
26
+ - 2
27
+ - 2
28
+ - 2
29
+ - 2
30
+ - 2
31
+ - 2
32
+ - 2
33
+ - 2
34
+ - 2
35
+ - 2
36
+ - 2
37
+ - 2
38
+ - 2
39
+ - 2
40
+ - 2
41
+ - 2
42
+ - 2
43
+ - 4
44
+ - 4
45
+ - 4
46
+ - 4
47
+ - 4
48
+ - 4
49
+ - 4
50
+ - 4
51
+ - 4
52
+ - 4
53
+ - 4
54
+ - 4
55
+ - 12
56
+ - 12
57
+ - 12
58
+ - 12
59
+ - 12
60
+ - 12
61
+ - 12
62
+ - 12
63
+ - 24
64
+ - 24
65
+ - 24
66
+ - 24
67
+ - 24
68
+ - 24
69
+ - 24
70
+ - 24
71
+ - 48
72
+ - 48
73
+ - 48
74
+ - 48
75
+ - 48
76
+ - 48
77
+ - 48
78
+ - 48
79
+ - 128
80
+ - 129
81
+ dim_head: 64
82
+ heads: 8
83
+ attn_dropout: 0
84
+ ff_dropout: 0
85
+ flash_attn: true
86
+ dim_freqs_in: 1025
87
+ stft_n_fft: 2048
88
+ stft_hop_length: 441
89
+ stft_win_length: 2048
90
+ stft_normalized: false
91
+ mask_estimator_depth: 2
92
+ multi_stft_resolution_loss_weight: 1.0
93
+ multi_stft_resolutions_window_sizes: !!python/tuple
94
+ - 4096
95
+ - 2048
96
+ - 1024
97
+ - 512
98
+ - 256
99
+ multi_stft_hop_size: 147
100
+ multi_stft_normalized: False
101
+ training:
102
+ batch_size: 1
103
+ gradient_accumulation_steps: 1
104
+ grad_clip: 0
105
+ instruments:
106
+ - Vocals
107
+ - Instrumental
108
+ lr: 1.0e-04
109
+ patience: 2
110
+ reduce_factor: 0.95
111
+ target_instrument: Instrumental
112
+ num_epochs: 1
113
+ num_steps: 1000
114
+ q: 0.95
115
+ coarse_loss_clip: true
116
+ ema_momentum: 0.999
117
+ optimizer: adamw
118
+ other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental
119
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
120
+
121
+ inference:
122
+ batch_size: 1
123
+ dim_t: 1101
124
+ num_overlap: 2
Politrees/UVR_resources/models/Roformer/BandSplit/config_bs_roformer_revive_by_unwa.yaml ADDED
@@ -0,0 +1,134 @@
1
+ audio:
2
+ chunk_size: 485100 #352800 #485100
3
+ dim_f: 1024
4
+ dim_t: 1101
5
+ hop_length: 441
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.
10
+
11
+ model:
12
+ dim: 512
13
+ depth: 12
14
+ stereo: true
15
+ num_stems: 1
16
+ time_transformer_depth: 1
17
+ freq_transformer_depth: 1
18
+ linear_transformer_depth: 0
19
+ freqs_per_bands: !!python/tuple
20
+ - 2
21
+ - 2
22
+ - 2
23
+ - 2
24
+ - 2
25
+ - 2
26
+ - 2
27
+ - 2
28
+ - 2
29
+ - 2
30
+ - 2
31
+ - 2
32
+ - 2
33
+ - 2
34
+ - 2
35
+ - 2
36
+ - 2
37
+ - 2
38
+ - 2
39
+ - 2
40
+ - 2
41
+ - 2
42
+ - 2
43
+ - 2
44
+ - 4
45
+ - 4
46
+ - 4
47
+ - 4
48
+ - 4
49
+ - 4
50
+ - 4
51
+ - 4
52
+ - 4
53
+ - 4
54
+ - 4
55
+ - 4
56
+ - 12
57
+ - 12
58
+ - 12
59
+ - 12
60
+ - 12
61
+ - 12
62
+ - 12
63
+ - 12
64
+ - 24
65
+ - 24
66
+ - 24
67
+ - 24
68
+ - 24
69
+ - 24
70
+ - 24
71
+ - 24
72
+ - 48
73
+ - 48
74
+ - 48
75
+ - 48
76
+ - 48
77
+ - 48
78
+ - 48
79
+ - 48
80
+ - 128
81
+ - 129
82
+ dim_head: 64
83
+ heads: 8
84
+ attn_dropout: 0.
85
+ ff_dropout: 0.
86
+ flash_attn: true
87
+ dim_freqs_in: 1025
88
+ stft_n_fft: 2048
89
+ stft_hop_length: 441
90
+ stft_win_length: 2048
91
+ stft_normalized: false
92
+ mask_estimator_depth: 2
93
+ multi_stft_resolution_loss_weight: 1.0
94
+ multi_stft_resolutions_window_sizes: !!python/tuple
95
+ - 4096
96
+ - 2048
97
+ - 1024
98
+ - 512
99
+ - 256
100
+ multi_stft_hop_size: 147
101
+ multi_stft_normalized: False
102
+
103
+ training:
104
+ batch_size: 1
105
+ gradient_accumulation_steps: 1
106
+ grad_clip: 0
107
+ instruments:
108
+ - vocals
109
+ - other
110
+ lr: 1.0e-05
111
+ patience: 2
112
+ reduce_factor: 0.95
113
+ target_instrument: vocals
114
+ num_epochs: 1000
115
+ num_steps: 1000
116
+ augmentation: false # enable augmentations by audiomentations and pedalboard
117
+ augmentation_type: null
118
+ use_mp3_compress: false # Deprecated
119
+ augmentation_mix: false # Mix several stems of the same type with some probability
120
+ augmentation_loudness: false # randomly change loudness of each stem
121
+ augmentation_loudness_type: 1 # Type 1 or 2
122
+ augmentation_loudness_min: 0
123
+ augmentation_loudness_max: 0
124
+ q: 0.95
125
+ coarse_loss_clip: false
126
+ ema_momentum: 0.999
127
+ optimizer: adam
128
+ other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental
129
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
130
+
131
+ inference:
132
+ batch_size: 2
133
+ dim_t: 1101
134
+ num_overlap: 2
Politrees/UVR_resources/models/Roformer/MelBand/config_MelBand-Roformer_BVE_by-Gonza.yaml ADDED
@@ -0,0 +1,75 @@
1
+ audio:
2
+ chunk_size: 485100
3
+ dim_f: 1024
4
+ dim_t: 256
5
+ hop_length: 411
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.000
10
+
11
+ model:
12
+ dim: 384
13
+ depth: 6
14
+ stereo: true
15
+ num_stems: 1
16
+ time_transformer_depth: 1
17
+ freq_transformer_depth: 1
18
+ num_bands: 60
19
+ dim_head: 64
20
+ heads: 8
21
+ attn_dropout: 0
22
+ ff_dropout: 0
23
+ flash_attn: true
24
+ dim_freqs_in: 1025
25
+ sample_rate: 44100 # needed for mel filter bank from librosa
26
+ stft_n_fft: 2048
27
+ stft_hop_length: 441
28
+ stft_win_length: 2048
29
+ stft_normalized: false
30
+ mask_estimator_depth: 2
31
+ multi_stft_resolution_loss_weight: 1.0
32
+ multi_stft_resolutions_window_sizes: !!python/tuple
33
+ - 4096
34
+ - 2048
35
+ - 1024
36
+ - 512
37
+ - 256
38
+ multi_stft_hop_size: 147
39
+ multi_stft_normalized: true
40
+
41
+ training:
42
+ batch_size: 2
43
+ gradient_accumulation_steps: 1
44
+ grad_clip: 0
45
+ instruments:
46
+ - Lead
47
+ - Back
48
+ lr: 5.0e-05
49
+ patience: 2
50
+ reduce_factor: 0.95
51
+ target_instrument: Lead
52
+ num_epochs: 1000
53
+ num_steps: 1000
54
+ augmentation: false # enable augmentations by audiomentations and pedalboard
55
+ augmentation_type:
56
+ use_mp3_compress: false # Deprecated
57
+ augmentation_mix: false # Mix several stems of the same type with some probability
58
+ augmentation_loudness: false # randomly change loudness of each stem
59
+ augmentation_loudness_type: 1 # Type 1 or 2
60
+ augmentation_loudness_min: 0
61
+ augmentation_loudness_max: 0
62
+ q: 0.95
63
+ coarse_loss_clip: true
64
+ ema_momentum: 0.999
65
+ optimizer: adam
66
+ other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
67
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
68
+
69
+ augmentations:
70
+ enable: false
71
+
72
+ inference:
73
+ batch_size: 1
74
+ dim_t: 1101
75
+ num_overlap: 8
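Unlike the BandSplit configs, the MelBand configs do not enumerate bins per band; the `num_bands: 60` grouping is derived from a mel filter bank, which is what the `sample_rate ... needed for mel filter bank from librosa` comment refers to. A rough sketch of how such a bank could be built, assuming librosa is available; the exact band-edge logic inside the model may differ:

```python
import librosa
import numpy as np

sr, n_fft, num_bands = 44100, 2048, 60
# (num_bands, n_fft // 2 + 1) filter bank over the 1025 STFT bins.
mel_fb = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=num_bands)
print(mel_fb.shape)  # (60, 1025)

# Bins each band actually touches (non-zero filter weights).
band_bins = [np.flatnonzero(row) for row in mel_fb]
print(len(band_bins[0]), len(band_bins[-1]))  # low bands are narrow, high bands are wide
```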
Politrees/UVR_resources/models/Roformer/MelBand/config_MelBand-Roformer_Duality_v1_by-Aname.yaml ADDED
@@ -0,0 +1,72 @@
1
+ audio:
2
+ chunk_size: 661500
3
+ dim_f: 1024
4
+ dim_t: 1101
5
+ hop_length: 441
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.000
10
+
11
+ model:
12
+ dim: 384
13
+ depth: 6
14
+ stereo: true
15
+ num_stems: 1
16
+ time_transformer_depth: 1
17
+ freq_transformer_depth: 1
18
+ num_bands: 60
19
+ dim_head: 64
20
+ heads: 8
21
+ attn_dropout: 0
22
+ ff_dropout: 0
23
+ flash_attn: True
24
+ dim_freqs_in: 1025
25
+ sample_rate: 44100 # needed for mel filter bank from librosa
26
+ stft_n_fft: 2048
27
+ stft_hop_length: 441
28
+ stft_win_length: 2048
29
+ stft_normalized: False
30
+ mask_estimator_depth: 2
31
+ multi_stft_resolution_loss_weight: 1.0
32
+ multi_stft_resolutions_window_sizes: !!python/tuple
33
+ - 4096
34
+ - 2048
35
+ - 1024
36
+ - 512
37
+ - 256
38
+ multi_stft_hop_size: 147
39
+ multi_stft_normalized: False
40
+
41
+ training:
42
+ batch_size: 4
43
+ gradient_accumulation_steps: 1
44
+ grad_clip: 0
45
+ instruments:
46
+ - vocals
47
+ - other
48
+ lr: 1.0e-05
49
+ patience: 2
50
+ reduce_factor: 0.95
51
+ target_instrument: vocals
52
+ num_epochs: 1000
53
+ num_steps: 1000
54
+ augmentation: false # enable augmentations by audiomentations and pedalboard
55
+ augmentation_type: null
56
+ use_mp3_compress: false # Deprecated
57
+ augmentation_mix: false # Mix several stems of the same type with some probability
58
+ augmentation_loudness: false # randomly change loudness of each stem
59
+ augmentation_loudness_type: 1 # Type 1 or 2
60
+ augmentation_loudness_min: 0
61
+ augmentation_loudness_max: 0
62
+ q: 0.95
63
+ coarse_loss_clip: false
64
+ ema_momentum: 0.999
65
+ optimizer: adam
66
+ other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental
67
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
68
+
69
+ inference:
70
+ batch_size: 4
71
+ dim_t: 1101
72
+ num_overlap: 4
Politrees/UVR_resources/models/Roformer/MelBand/config_MelBand-Roformer_Karaoke_Fusion_Total_by-Gonza.yaml ADDED
@@ -0,0 +1,75 @@
1
+ audio:
2
+ chunk_size: 485100
3
+ dim_f: 1024
4
+ dim_t: 256
5
+ hop_length: 411
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.000
10
+
11
+ model:
12
+ dim: 384
13
+ depth: 6
14
+ stereo: true
15
+ num_stems: 1
16
+ time_transformer_depth: 1
17
+ freq_transformer_depth: 1
18
+ num_bands: 60
19
+ dim_head: 64
20
+ heads: 8
21
+ attn_dropout: 0
22
+ ff_dropout: 0
23
+ flash_attn: true
24
+ dim_freqs_in: 1025
25
+ sample_rate: 44100 # needed for mel filter bank from librosa
26
+ stft_n_fft: 2048
27
+ stft_hop_length: 441
28
+ stft_win_length: 2048
29
+ stft_normalized: false
30
+ mask_estimator_depth: 2
31
+ multi_stft_resolution_loss_weight: 1.0
32
+ multi_stft_resolutions_window_sizes: !!python/tuple
33
+ - 4096
34
+ - 2048
35
+ - 1024
36
+ - 512
37
+ - 256
38
+ multi_stft_hop_size: 147
39
+ multi_stft_normalized: true
40
+
41
+ training:
42
+ batch_size: 2
43
+ gradient_accumulation_steps: 1
44
+ grad_clip: 0
45
+ instruments:
46
+ - Vocals
47
+ - Instrumental
48
+ lr: 1.0
49
+ patience: 2
50
+ reduce_factor: 0.95
51
+ target_instrument: Vocals
52
+ num_epochs: 100
53
+ num_steps: 200
54
+ augmentation: false # enable augmentations by audiomentations and pedalboard
55
+ augmentation_type:
56
+ use_mp3_compress: false # Deprecated
57
+ augmentation_mix: false # Mix several stems of the same type with some probability
58
+ augmentation_loudness: false # randomly change loudness of each stem
59
+ augmentation_loudness_type: 1 # Type 1 or 2
60
+ augmentation_loudness_min: 0
61
+ augmentation_loudness_max: 0
62
+ q: 0.95
63
+ coarse_loss_clip: true
64
+ ema_momentum: 0.999
65
+ optimizer: prodigy
66
+ other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
67
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
68
+
69
+ augmentations:
70
+ enable: false
71
+
72
+ inference:
73
+ batch_size: 1
74
+ dim_t: 1101
75
+ num_overlap: 8
Politrees/UVR_resources/models/Roformer/MelBand/config_MelBand-Roformer_Karaoke_Fusion_by-Gonza.yaml ADDED
@@ -0,0 +1,83 @@
1
+ audio:
2
+ chunk_size: 485100
3
+ dim_f: 1024
4
+ dim_t: 256
5
+ hop_length: 411
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.000
10
+
11
+ model:
12
+ dim: 384
13
+ depth: 6
14
+ stereo: true
15
+ num_stems: 1
16
+ time_transformer_depth: 1
17
+ freq_transformer_depth: 1
18
+ num_bands: 60
19
+ dim_head: 64
20
+ heads: 8
21
+ attn_dropout: 0
22
+ ff_dropout: 0
23
+ flash_attn: true
24
+ dim_freqs_in: 1025
25
+ sample_rate: 44100 # needed for mel filter bank from librosa
26
+ stft_n_fft: 2048
27
+ stft_hop_length: 441
28
+ stft_win_length: 2048
29
+ stft_normalized: false
30
+ mask_estimator_depth: 2
31
+ multi_stft_resolution_loss_weight: 1.0
32
+ multi_stft_resolutions_window_sizes: !!python/tuple
33
+ - 4096
34
+ - 2048
35
+ - 1024
36
+ - 512
37
+ - 256
38
+ multi_stft_hop_size: 147
39
+ multi_stft_normalized: true
40
+
41
+ training:
42
+ batch_size: 2
43
+ gradient_accumulation_steps: 1
44
+ grad_clip: 0
45
+ instruments:
46
+ - Vocals
47
+ - Instrumental
48
+ lr: 1.0
49
+ patience: 2
50
+ reduce_factor: 0.95
51
+ target_instrument: Vocals
52
+ num_epochs: 150
53
+ num_steps: 100
54
+ augmentation: false # enable augmentations by audiomentations and pedalboard
55
+ augmentation_type:
56
+ use_mp3_compress: false # Deprecated
57
+ augmentation_mix: false # Mix several stems of the same type with some probability
58
+ augmentation_loudness: false # randomly change loudness of each stem
59
+ augmentation_loudness_type: 1 # Type 1 or 2
60
+ augmentation_loudness_min: 0
61
+ augmentation_loudness_max: 0
62
+ q: 0.95
63
+ coarse_loss_clip: true
64
+ ema_momentum: 0.999
65
+ optimizer: prodigy
66
+ other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
67
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
68
+
69
+ augmentations:
70
+ enable: false
71
+
72
+ lora:
73
+ r: 4
74
+ lora_alpha: 8 # alpha / rank > 1
75
+ lora_dropout: 0.01
76
+ merge_weights: True
77
+ fan_in_fan_out: False
78
+ enable_lora: [True]
79
+
80
+ inference:
81
+ batch_size: 1
82
+ dim_t: 1101
83
+ num_overlap: 8
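The `lora` block in the config above mirrors the keyword arguments of Microsoft's `loralib` layers. A minimal sketch of wrapping a single projection with those settings; the layer sizes and the use of `loralib` here are assumptions for illustration, not a statement about how this checkpoint was trained:

```python
import loralib as lora

# r=4, alpha=8 keeps alpha / rank = 2 (> 1, as the comment in the config suggests).
adapted_proj = lora.MergedLinear(
    in_features=384,          # assumed: matches the model `dim` in this config
    out_features=384,
    r=4,
    lora_alpha=8,
    lora_dropout=0.01,
    enable_lora=[True],
    fan_in_fan_out=False,
    merge_weights=True,
)

# Typical LoRA fine-tuning setup: freeze everything except the low-rank adapters.
lora.mark_only_lora_as_trainable(adapted_proj)
```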
Politrees/UVR_resources/models/Roformer/MelBand/config_MelBand-Roformer_Karaoke_Fusion_v2_by-Gonza.yaml ADDED
@@ -0,0 +1,83 @@
1
+ audio:
2
+ chunk_size: 485100
3
+ dim_f: 1024
4
+ dim_t: 256
5
+ hop_length: 411
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.000
10
+
11
+ model:
12
+ dim: 384
13
+ depth: 6
14
+ stereo: true
15
+ num_stems: 1
16
+ time_transformer_depth: 1
17
+ freq_transformer_depth: 1
18
+ num_bands: 60
19
+ dim_head: 64
20
+ heads: 8
21
+ attn_dropout: 0
22
+ ff_dropout: 0
23
+ flash_attn: true
24
+ dim_freqs_in: 1025
25
+ sample_rate: 44100 # needed for mel filter bank from librosa
26
+ stft_n_fft: 2048
27
+ stft_hop_length: 441
28
+ stft_win_length: 2048
29
+ stft_normalized: false
30
+ mask_estimator_depth: 2
31
+ multi_stft_resolution_loss_weight: 1.0
32
+ multi_stft_resolutions_window_sizes: !!python/tuple
33
+ - 4096
34
+ - 2048
35
+ - 1024
36
+ - 512
37
+ - 256
38
+ multi_stft_hop_size: 147
39
+ multi_stft_normalized: true
40
+
41
+ training:
42
+ batch_size: 2
43
+ gradient_accumulation_steps: 1
44
+ grad_clip: 0
45
+ instruments:
46
+ - Vocals
47
+ - Instrumental
48
+ lr: 1.0
49
+ patience: 2
50
+ reduce_factor: 0.95
51
+ target_instrument: Vocals
52
+ num_epochs: 100
53
+ num_steps: 100
54
+ augmentation: false # enable augmentations by audiomentations and pedalboard
55
+ augmentation_type:
56
+ use_mp3_compress: false # Deprecated
57
+ augmentation_mix: false # Mix several stems of the same type with some probability
58
+ augmentation_loudness: false # randomly change loudness of each stem
59
+ augmentation_loudness_type: 1 # Type 1 or 2
60
+ augmentation_loudness_min: 0
61
+ augmentation_loudness_max: 0
62
+ q: 0.95
63
+ coarse_loss_clip: true
64
+ ema_momentum: 0.999
65
+ optimizer: prodigy
66
+ other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
67
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
68
+
69
+ lora:
70
+ r: 8
71
+ lora_alpha: 16 # alpha / rank > 1
72
+ lora_dropout: 0.05
73
+ merge_weights: False
74
+ fan_in_fan_out: False
75
+ enable_lora: [True]
76
+
77
+ augmentations:
78
+ enable: false
79
+
80
+ inference:
81
+ batch_size: 1
82
+ dim_t: 1101
83
+ num_overlap: 8
Politrees/UVR_resources/models/Roformer/MelBand/config_MelBand_Roformer_4stems_FT_Large_by_SYH99999.yaml ADDED
@@ -0,0 +1,69 @@
1
+ audio:
2
+ chunk_size: 485100
3
+ dim_f: 1024
4
+ dim_t: 1101
5
+ hop_length: 882
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.000
10
+
11
+ model:
12
+ dim: 384
13
+ depth: 8
14
+ stereo: true
15
+ num_stems: 4
16
+ linear_transformer_depth: 0
17
+ time_transformer_depth: 1
18
+ freq_transformer_depth: 1
19
+ num_bands: 60
20
+ dim_head: 64
21
+ heads: 8
22
+ attn_dropout: 0.0
23
+ ff_dropout: 0.0
24
+ flash_attn: true
25
+ dim_freqs_in: 2049
26
+ sample_rate: 44100 # needed for mel filter bank from librosa
27
+ stft_n_fft: 4096
28
+ stft_hop_length: 882
29
+ stft_win_length: 4096
30
+ stft_normalized: False
31
+ mask_estimator_depth: 2
32
+ multi_stft_resolution_loss_weight: 1.0
33
+ multi_stft_resolutions_window_sizes: !!python/tuple
34
+ - 4096
35
+ - 2048
36
+ - 1024
37
+ - 512
38
+ - 256
39
+ multi_stft_hop_size: 147
40
+ multi_stft_normalized: False
41
+ mlp_expansion_factor: 4
42
+ use_torch_checkpoint: False # it allows to greatly reduce GPU memory consumption during training (not fully tested)
43
+ skip_connection: True # Enable skip connection between transformer blocks - can solve problem with gradients and probably faster training
44
+
45
+ training:
46
+ batch_size: 1
47
+ gradient_accumulation_steps: 1
48
+ grad_clip: 0
49
+ instruments: ['drums', 'bass', 'other', 'vocals']
50
+ lr: 1.0
51
+ patience: 3
52
+ reduce_factor: 0.95
53
+ target_instrument: null
54
+ num_epochs: 1000
55
+ num_steps: 1000
56
+ q: 0.95
57
+ coarse_loss_clip: false
58
+ ema_momentum: 0.999
59
+ optimizer: prodigy
60
+ read_metadata_procs: 8 # Number of processes to use during metadata reading for dataset. Can speed up metadata generation
61
+ normalize: false
62
+ other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
63
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
64
+
65
+ inference:
66
+ batch_size: 4
67
+ dim_t: 1101
68
+ num_overlap: 4
69
+ normalize: false
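`optimizer: prodigy` together with `lr: 1.0` follows the Prodigy optimizer's convention of estimating the effective step size itself, so the nominal learning rate stays at 1. A hedged sketch, assuming the `prodigyopt` package is the implementation in use and using a stand-in module instead of the actual Roformer:

```python
import torch
from prodigyopt import Prodigy  # assumption: the `prodigyopt` package provides this optimizer

model = torch.nn.Linear(10, 10)  # stand-in for the Mel-Band Roformer
# lr is left at 1.0; Prodigy adapts the effective step size on the fly.
optimizer = Prodigy(model.parameters(), lr=1.0, weight_decay=0.0)

loss = model(torch.randn(4, 10)).pow(2).mean()
loss.backward()
optimizer.step()
optimizer.zero_grad()
```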
Politrees/UVR_resources/models/Roformer/MelBand/config_MelBand_Roformer_4stems_Large_v1_by_Aname.yaml ADDED
@@ -0,0 +1,167 @@
1
+ audio:
2
+ chunk_size: 661500
3
+ dim_f: 1024
4
+ dim_t: 1101
5
+ hop_length: 882
6
+ n_fft: 4096
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.0001
10
+
11
+ model:
12
+ dim: 384
13
+ depth: 8
14
+ stereo: true
15
+ num_stems: 4
16
+ linear_transformer_depth: 0
17
+ time_transformer_depth: 1
18
+ freq_transformer_depth: 1
19
+ num_bands: 60
20
+ dim_head: 64
21
+ heads: 8
22
+ attn_dropout: 0.0
23
+ ff_dropout: 0.0
24
+ flash_attn: true
25
+ dim_freqs_in: 2049
26
+ sample_rate: 44100 # needed for mel filter bank from librosa
27
+ stft_n_fft: 4096
28
+ stft_hop_length: 882
29
+ stft_win_length: 4096
30
+ stft_normalized: False
31
+ mask_estimator_depth: 2
32
+ multi_stft_resolution_loss_weight: 1.0
33
+ multi_stft_resolutions_window_sizes: !!python/tuple
34
+ - 4096
35
+ - 2048
36
+ - 1024
37
+ - 512
38
+ - 256
39
+ multi_stft_hop_size: 147
40
+ multi_stft_normalized: False
41
+ mlp_expansion_factor: 4
42
+ use_torch_checkpoint: False # it allows to greatly reduce GPU memory consumption during training (not fully tested)
43
+ skip_connection: True # Enable skip connection between transformer blocks - can solve problem with gradients and probably faster training
44
+
45
+ training:
46
+ batch_size: 1
47
+ gradient_accumulation_steps: 4
48
+ grad_clip: 0
49
+ instruments: ['drums', 'bass', 'other', 'vocals']
50
+ lr: 2.0e-05
51
+ patience: 2
52
+ reduce_factor: 0.95
53
+ target_instrument: null
54
+ num_epochs: 1000
55
+ num_steps: 300
56
+ q: 0.95
57
+ coarse_loss_clip: false
58
+ ema_momentum: 0.999
59
+ optimizer: adamw
60
+ read_metadata_procs: 8 # Number of processes to use during metadata reading for dataset. Can speed up metadata generation
61
+ normalize: false
62
+ other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
63
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
64
+
65
+ augmentations:
66
+ enable: false # enable or disable all augmentations (to fast disable if needed)
67
+ loudness: false # randomly change loudness of each stem on the range (loudness_min; loudness_max)
68
+ loudness_min: 0.5
69
+ loudness_max: 1.5
70
+ mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
71
+ mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
72
+ - 0.2
73
+ - 0.02
74
+ - 0.002
75
+ mixup_loudness_min: 0.5
76
+ mixup_loudness_max: 1.5
77
+
78
+ # apply mp3 compression to mixture only (emulate downloading mp3 from internet)
79
+ mp3_compression_on_mixture: 0.01
80
+ mp3_compression_on_mixture_bitrate_min: 32
81
+ mp3_compression_on_mixture_bitrate_max: 320
82
+ mp3_compression_on_mixture_backend: "lameenc"
83
+
84
+ all:
85
+ channel_shuffle: 0.5 # Set 0 or lower to disable
86
+ random_inverse: 0.01 # inverse track (better lower probability)
87
+ random_polarity: 0.5 # polarity change (multiply waveform to -1)
88
+
89
+ vocals:
90
+ pitch_shift: 1.0
91
+ pitch_shift_min_semitones: -12
92
+ pitch_shift_max_semitones: 12
93
+ seven_band_parametric_eq: 0.5
94
+ seven_band_parametric_eq_min_gain_db: -80
95
+ seven_band_parametric_eq_max_gain_db: 9
96
+ tanh_distortion: 0.5
97
+ tanh_distortion_min: 0.1
98
+ tanh_distortion_max: 1
99
+ time_stretch: 1.0
100
+ time_stretch_min_rate: 0.5
101
+ time_stretch_max_rate: 2
102
+ bass:
103
+ pitch_shift: 1.0
104
+ pitch_shift_min_semitones: -6
105
+ pitch_shift_max_semitones: 6
106
+ seven_band_parametric_eq: 0.4
107
+ seven_band_parametric_eq_min_gain_db: -32
108
+ seven_band_parametric_eq_max_gain_db: 6
109
+ tanh_distortion: 1.0
110
+ tanh_distortion_min: 0.1
111
+ tanh_distortion_max: 0.5
112
+ time_stretch: 1.0
113
+ time_stretch_min_rate: 0.5
114
+ time_stretch_max_rate: 1.5
115
+ drums:
116
+ pitch_shift: 0.1
117
+ pitch_shift_min_semitones: -6
118
+ pitch_shift_max_semitones: 6
119
+ seven_band_parametric_eq: 0.5
120
+ seven_band_parametric_eq_min_gain_db: -24
121
+ seven_band_parametric_eq_max_gain_db: 12
122
+ tanh_distortion: 0.3
123
+ tanh_distortion_min: 0.1
124
+ tanh_distortion_max: 0.6
125
+ time_stretch: 1.0
126
+ time_stretch_min_rate: 0.333
127
+ time_stretch_max_rate: 1.5
128
+ other:
129
+ pitch_shift: 1.0
130
+ pitch_shift_min_semitones: -12
131
+ pitch_shift_max_semitones: 12
132
+ gaussian_noise: 0.4
133
+ gaussian_noise_min_amplitude: 0.001
134
+ gaussian_noise_max_amplitude: 0.15
135
+ time_stretch: 0.01
136
+ time_stretch_min_rate: 0.25
137
+ time_stretch_max_rate: 1.5
138
+
139
+ inference:
140
+ batch_size: 1
141
+ dim_t: 256
142
+ num_overlap: 4
143
+ normalize: false
144
+
145
+ loss_multistft:
146
+ fft_sizes:
147
+ - 1024
148
+ - 2048
149
+ - 4096
150
+ hop_sizes:
151
+ - 147
152
+ - 256
153
+ - 512
154
+ win_lengths:
155
+ - 1024
156
+ - 2048
157
+ - 4096
158
+ window: "hann_window"
159
+ scale: "mel"
160
+ n_bins: 128
161
+ sample_rate: 44100
162
+ perceptual_weighting: true
163
+ w_sc: 16.0
164
+ w_log_mag: 16.0
165
+ w_lin_mag: 16.0
166
+ w_phs: 0.0
167
+ mag_distance: "L1"
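The `loss_multistft` keys in the config above line up with the constructor arguments of `auraloss.freq.MultiResolutionSTFTLoss`. A sketch of instantiating it directly from that block; the assumption that auraloss is the backing library is mine, not stated in the file:

```python
import auraloss

stft_loss = auraloss.freq.MultiResolutionSTFTLoss(
    fft_sizes=[1024, 2048, 4096],
    hop_sizes=[147, 256, 512],
    win_lengths=[1024, 2048, 4096],
    window="hann_window",
    scale="mel",
    n_bins=128,
    sample_rate=44100,
    perceptual_weighting=True,
    w_sc=16.0,
    w_log_mag=16.0,
    w_lin_mag=16.0,
    w_phs=0.0,
    mag_distance="L1",
)
# est, ref: (batch, channels, samples) waveforms at 44.1 kHz; loss = stft_loss(est, ref)
```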
Politrees/UVR_resources/models/SCnet/config_musdb18_scnet.yaml ADDED
@@ -0,0 +1,83 @@
1
+ audio:
2
+ chunk_size: 485100 # 44100 * 11
3
+ num_channels: 2
4
+ sample_rate: 44100
5
+ min_mean_abs: 0.000
6
+
7
+ model:
8
+ sources:
9
+ - drums
10
+ - bass
11
+ - other
12
+ - vocals
13
+ audio_channels: 2
14
+ dims:
15
+ - 4
16
+ - 32
17
+ - 64
18
+ - 128
19
+ nfft: 4096
20
+ hop_size: 1024
21
+ win_size: 4096
22
+ normalized: True
23
+ band_SR:
24
+ - 0.175
25
+ - 0.392
26
+ - 0.433
27
+ band_stride:
28
+ - 1
29
+ - 4
30
+ - 16
31
+ band_kernel:
32
+ - 3
33
+ - 4
34
+ - 16
35
+ conv_depths:
36
+ - 3
37
+ - 2
38
+ - 1
39
+ compress: 4
40
+ conv_kernel: 3
41
+ num_dplayer: 6
42
+ expand: 1
43
+
+ training:
+   batch_size: 10
+   gradient_accumulation_steps: 1
+   grad_clip: 0
+   instruments:
+     - Drums
+     - Bass
+     - Other
+     - Vocals
+   lr: 5.0e-04
+   patience: 2
+   reduce_factor: 0.95
+   target_instrument: null
+   num_epochs: 1000
+   num_steps: 1000
+   q: 0.95
+   coarse_loss_clip: true
+   ema_momentum: 0.999
+   optimizer: adam
+   other_fix: false # needed when validating on the multisong dataset, to check whether "other" is actually instrumental
+   use_amp: true # enable or disable mixed-precision (float16) training - usually this should be true
+
+ augmentations:
+   enable: true # enable or disable all augmentations (to quickly switch them off if needed)
+   loudness: true # randomly change the loudness of each stem within the range (loudness_min; loudness_max)
+   loudness_min: 0.5
+   loudness_max: 1.5
+   mixup: true # mix several stems of the same type with some probability (only works for dataset types 1, 2, 3)
+   mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
+     - 0.2
+     - 0.02
+   mixup_loudness_min: 0.5
+   mixup_loudness_max: 1.5
+
+ inference:
+   batch_size: 8
+   dim_t: 256
+   num_overlap: 4
+   normalize: true
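One loading caveat for all of these configs: !!python/tuple is a PyYAML-specific tag that yaml.safe_load refuses, so the file has to be read with a loader that constructs Python objects. A sketch, assuming PyYAML and a locally downloaded copy of the file:

```python
# These configs use the PyYAML-specific !!python/tuple tag, which
# yaml.safe_load rejects with a ConstructorError. Only use an unsafe
# loader on config files you trust.
import yaml

with open("config_musdb18_scnet.yaml") as f:  # assumes a local download
    cfg = yaml.load(f, Loader=yaml.UnsafeLoader)

print(type(cfg["augmentations"]["mixup_probs"]))  # <class 'tuple'>
print(cfg["audio"]["chunk_size"])                 # 485100
```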
Politrees/UVR_resources/models/SCnet/config_musdb18_scnet_large.yaml ADDED
@@ -0,0 +1,88 @@
+ audio:
+   chunk_size: 485100 # 44100 * 11
+   num_channels: 2
+   sample_rate: 44100
+   min_mean_abs: 0.000
+
+ model:
+   sources:
+     - drums
+     - bass
+     - other
+     - vocals
+   audio_channels: 2
+   dims:
+     - 4
+     - 64
+     - 128
+     - 256
+   nfft: 4096
+   hop_size: 1024
+   win_size: 4096
+   normalized: True
+   band_SR:
+     - 0.225
+     - 0.372
+     - 0.403
+   band_stride:
+     - 1
+     - 4
+     - 16
+   band_kernel:
+     - 3
+     - 4
+     - 16
+   conv_depths:
+     - 3
+     - 2
+     - 1
+   compress: 4
+   conv_kernel: 3
+   num_dplayer: 6
+   expand: 1
+
+ training:
+   batch_size: 6
+   gradient_accumulation_steps: 1
+   grad_clip: 0
+   instruments:
+     - Drums
+     - Bass
+     - Other
+     - Vocals
+   # lr: 1.0e-04
+   lr: 1.0
+   patience: 2
+   reduce_factor: 0.95
+   target_instrument: null
+   num_epochs: 1000
+   num_steps: 1000
+   q: 0.95
+   coarse_loss_clip: true
+   ema_momentum: 0.999
+   optimizer: prodigy
+   other_fix: false # needed when validating on the multisong dataset, to check whether "other" is actually instrumental
+   use_amp: true # enable or disable mixed-precision (float16) training - usually this should be true
+
+ augmentations:
+   enable: true # enable or disable all augmentations (to quickly switch them off if needed)
+   loudness: true # randomly change the loudness of each stem within the range (loudness_min; loudness_max)
+   loudness_min: 0.5
+   loudness_max: 1.5
+   mixup: true # mix several stems of the same type with some probability (only works for dataset types 1, 2, 3)
+   mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
+     - 0.2
+     - 0.02
+   mixup_loudness_min: 0.5
+   mixup_loudness_max: 1.5
+   all:
+     channel_shuffle: 0.5 # set to 0 or lower to disable
+     random_inverse: 0.1 # reverse the track in time (a lower probability is better)
+     random_polarity: 0.5 # polarity change (multiply the waveform by -1)
+
+ inference:
+   batch_size: 8
+   dim_t: 256
+   num_overlap: 4
+   normalize: false
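Note the optimizer change relative to the base config: prodigy with lr: 1.0 (the commented-out 1.0e-04 is the Adam-style value). Prodigy adapts the effective step size itself, so lr acts as a multiplier and is normally left at 1.0. A sketch, assuming the prodigyopt package and a placeholder model:

```python
# Prodigy is used with lr=1.0 because it estimates the step size on its own;
# lr is a multiplier. Assumes `pip install prodigyopt`.
import torch
from prodigyopt import Prodigy

model = torch.nn.Linear(10, 4)  # stand-in for the SCNet model
opt = Prodigy(model.parameters(), lr=1.0)

loss = model(torch.randn(2, 10)).pow(2).mean()
loss.backward()
opt.step()
opt.zero_grad()
```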
Politrees/UVR_resources/models/SCnet/config_musdb18_scnet_large_starrytong.yaml ADDED
@@ -0,0 +1,88 @@
+ audio:
+   chunk_size: 485100 # 44100 * 11
+   num_channels: 2
+   sample_rate: 44100
+   min_mean_abs: 0.000
+
+ model:
+   sources:
+     - drums
+     - bass
+     - other
+     - vocals
+   audio_channels: 2
+   dims:
+     - 4
+     - 64
+     - 128
+     - 256
+   nfft: 4096
+   hop_size: 1024
+   win_size: 4096
+   normalized: True
+   band_SR:
+     - 0.225
+     - 0.372
+     - 0.403
+   band_stride:
+     - 1
+     - 4
+     - 16
+   band_kernel:
+     - 3
+     - 4
+     - 16
+   conv_depths:
+     - 3
+     - 2
+     - 1
+   compress: 4
+   conv_kernel: 3
+   num_dplayer: 6
+   expand: 1
+
+ training:
+   batch_size: 6
+   gradient_accumulation_steps: 1
+   grad_clip: 0
+   instruments:
+     - Drums
+     - Bass
+     - Other
+     - Vocals
+   # lr: 1.0e-04
+   lr: 1.0
+   patience: 2
+   reduce_factor: 0.95
+   target_instrument: null
+   num_epochs: 1000
+   num_steps: 1000
+   q: 0.95
+   coarse_loss_clip: true
+   ema_momentum: 0.999
+   optimizer: prodigy
+   other_fix: false # needed when validating on the multisong dataset, to check whether "other" is actually instrumental
+   use_amp: true # enable or disable mixed-precision (float16) training - usually this should be true
+
+ augmentations:
+   enable: true # enable or disable all augmentations (to quickly switch them off if needed)
+   loudness: true # randomly change the loudness of each stem within the range (loudness_min; loudness_max)
+   loudness_min: 0.5
+   loudness_max: 1.5
+   mixup: true # mix several stems of the same type with some probability (only works for dataset types 1, 2, 3)
+   mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
+     - 0.2
+     - 0.02
+   mixup_loudness_min: 0.5
+   mixup_loudness_max: 1.5
+   all:
+     channel_shuffle: 0.5 # set to 0 or lower to disable
+     random_inverse: 0.1 # reverse the track in time (a lower probability is better)
+     random_polarity: 0.5 # polarity change (multiply the waveform by -1)
+
+ inference:
+   batch_size: 8
+   dim_t: 256
+   num_overlap: 4
+   normalize: true
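The all: block applies cheap waveform-level augmentations to every stem. A rough numpy sketch of the three operations with the probabilities above (a hypothetical helper, not this repository's code; for instance, the real code may shuffle channels jointly across stems):

```python
# Illustrative implementation of the "all" augmentations above.
import numpy as np

def augment_all(stem: np.ndarray, rng: np.random.Generator) -> np.ndarray:
    """stem: float32 array of shape (channels, samples)."""
    if rng.random() < 0.5:   # channel_shuffle
        stem = stem[rng.permutation(stem.shape[0])]
    if rng.random() < 0.1:   # random_inverse: reverse in time
        stem = stem[:, ::-1]
    if rng.random() < 0.5:   # random_polarity: flip the sign
        stem = -stem
    return stem

out = augment_all(np.random.randn(2, 44100).astype(np.float32),
                  np.random.default_rng(0))
```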
Politrees/UVR_resources/models/SCnet/config_musdb18_scnet_xl.yaml ADDED
@@ -0,0 +1,207 @@
+ audio:
+   chunk_size: 485100 # 44100 * 11
+   num_channels: 2
+   sample_rate: 44100
+   min_mean_abs: 0.000
+
+ model:
+   sources:
+     - drums
+     - bass
+     - other
+     - vocals
+   audio_channels: 2
+   dims:
+     - 4
+     - 64
+     - 128
+     - 256
+   nfft: 4096
+   hop_size: 1024
+   win_size: 4096
+   normalized: True
+   band_SR:
+     - 0.230
+     - 0.370
+     - 0.400
+   band_stride:
+     - 1
+     - 4
+     - 16
+   band_kernel:
+     - 3
+     - 4
+     - 16
+   conv_depths:
+     - 3
+     - 2
+     - 1
+   compress: 4
+   conv_kernel: 3
+   num_dplayer: 8
+   expand: 1
+
+ training:
+   batch_size: 4
+   gradient_accumulation_steps: 1
+   grad_clip: 0
+   instruments:
+     - Drums
+     - Bass
+     - Other
+     - Vocals
+   patience: 2
+   reduce_factor: 0.95
+   target_instrument: null
+   num_epochs: 1000
+   num_steps: 1000
+   q: 0.95
+   coarse_loss_clip: true
+   ema_momentum: 0.999
+   # optimizer: prodigy
+   optimizer: adam
+   lr: 1.0e-05
+   # lr: 1.0
+   normalize: false # normalize the model input (use the same setting for inference!)
+   other_fix: false # needed when validating on the multisong dataset, to check whether "other" is actually instrumental
+   use_amp: true # enable or disable mixed-precision (float16) training - usually this should be true
+
+ augmentations:
+   enable: false # enable or disable all augmentations (to quickly switch them off if needed)
+   loudness: true # randomly change the loudness of each stem within the range (loudness_min; loudness_max)
+   loudness_min: 0.5
+   loudness_max: 1.5
+   mixup: true # mix several stems of the same type with some probability (only works for dataset types 1, 2, 3)
+   mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
+     - 0.2
+     - 0.02
+   mixup_loudness_min: 0.5
+   mixup_loudness_max: 1.5
+
+   # apply mp3 compression to the mixture only (emulates an mp3 downloaded from the internet)
+   mp3_compression_on_mixture: 0.01
+   mp3_compression_on_mixture_bitrate_min: 32
+   mp3_compression_on_mixture_bitrate_max: 320
+   mp3_compression_on_mixture_backend: "lameenc"
+
+   all:
+     channel_shuffle: 0.5 # set to 0 or lower to disable
+     random_inverse: 0.1 # reverse the track in time (a lower probability is better)
+     random_polarity: 0.5 # polarity change (multiply the waveform by -1)
+
+     mp3_compression: 0.01
+     mp3_compression_min_bitrate: 32
+     mp3_compression_max_bitrate: 320
+     mp3_compression_backend: "lameenc"
+
+     # pedalboard reverb block
+     pedalboard_reverb: 0.01
+     pedalboard_reverb_room_size_min: 0.1
+     pedalboard_reverb_room_size_max: 0.9
+     pedalboard_reverb_damping_min: 0.1
+     pedalboard_reverb_damping_max: 0.9
+     pedalboard_reverb_wet_level_min: 0.1
+     pedalboard_reverb_wet_level_max: 0.9
+     pedalboard_reverb_dry_level_min: 0.1
+     pedalboard_reverb_dry_level_max: 0.9
+     pedalboard_reverb_width_min: 0.9
+     pedalboard_reverb_width_max: 1.0
+
+     # pedalboard chorus block
+     pedalboard_chorus: 0.01
+     pedalboard_chorus_rate_hz_min: 1.0
+     pedalboard_chorus_rate_hz_max: 7.0
+     pedalboard_chorus_depth_min: 0.25
+     pedalboard_chorus_depth_max: 0.95
+     pedalboard_chorus_centre_delay_ms_min: 3
+     pedalboard_chorus_centre_delay_ms_max: 10
+     pedalboard_chorus_feedback_min: 0.0
+     pedalboard_chorus_feedback_max: 0.5
+     pedalboard_chorus_mix_min: 0.1
+     pedalboard_chorus_mix_max: 0.9
+
+     # pedalboard phaser block
+     pedalboard_phazer: 0.01
+     pedalboard_phazer_rate_hz_min: 1.0
+     pedalboard_phazer_rate_hz_max: 10.0
+     pedalboard_phazer_depth_min: 0.25
+     pedalboard_phazer_depth_max: 0.95
+     pedalboard_phazer_centre_frequency_hz_min: 200
+     pedalboard_phazer_centre_frequency_hz_max: 12000
+     pedalboard_phazer_feedback_min: 0.0
+     pedalboard_phazer_feedback_max: 0.5
+     pedalboard_phazer_mix_min: 0.1
+     pedalboard_phazer_mix_max: 0.9
+
+     # pedalboard distortion block
+     pedalboard_distortion: 0.01
+     pedalboard_distortion_drive_db_min: 1.0
+     pedalboard_distortion_drive_db_max: 25.0
+
+     # pedalboard pitch shift block
+     pedalboard_pitch_shift: 0.01
+     pedalboard_pitch_shift_semitones_min: -7
+     pedalboard_pitch_shift_semitones_max: 7
+
+     # pedalboard resample block
+     pedalboard_resample: 0.01
+     pedalboard_resample_target_sample_rate_min: 4000
+     pedalboard_resample_target_sample_rate_max: 44100
+
+     # pedalboard bitcrush block
+     pedalboard_bitcrash: 0.01
+     pedalboard_bitcrash_bit_depth_min: 4
+     pedalboard_bitcrash_bit_depth_max: 16
+
+     # pedalboard mp3 compressor block
+     pedalboard_mp3_compressor: 0.01
+     pedalboard_mp3_compressor_pedalboard_mp3_compressor_min: 0
+     pedalboard_mp3_compressor_pedalboard_mp3_compressor_max: 9.999
+
+   vocals:
+     pitch_shift: 0.1
+     pitch_shift_min_semitones: -5
+     pitch_shift_max_semitones: 5
+     seven_band_parametric_eq: 0.25
+     seven_band_parametric_eq_min_gain_db: -9
+     seven_band_parametric_eq_max_gain_db: 9
+     tanh_distortion: 0.1
+     tanh_distortion_min: 0.1
+     tanh_distortion_max: 0.7
+   bass:
+     pitch_shift: 0.1
+     pitch_shift_min_semitones: -2
+     pitch_shift_max_semitones: 2
+     seven_band_parametric_eq: 0.25
+     seven_band_parametric_eq_min_gain_db: -3
+     seven_band_parametric_eq_max_gain_db: 6
+     tanh_distortion: 0.2
+     tanh_distortion_min: 0.1
+     tanh_distortion_max: 0.5
+   drums:
+     pitch_shift: 0.33
+     pitch_shift_min_semitones: -5
+     pitch_shift_max_semitones: 5
+     seven_band_parametric_eq: 0.25
+     seven_band_parametric_eq_min_gain_db: -9
+     seven_band_parametric_eq_max_gain_db: 9
+     tanh_distortion: 0.33
+     tanh_distortion_min: 0.1
+     tanh_distortion_max: 0.6
+   other:
+     pitch_shift: 0.1
+     pitch_shift_min_semitones: -4
+     pitch_shift_max_semitones: 4
+     gaussian_noise: 0.1
+     gaussian_noise_min_amplitude: 0.001
+     gaussian_noise_max_amplitude: 0.015
+     time_stretch: 0.01
+     time_stretch_min_rate: 0.8
+     time_stretch_max_rate: 1.25
+
+ inference:
+   batch_size: 4
+   dim_t: 256
+   num_overlap: 4
+   normalize: false
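The pedalboard_* keys map onto effect plugins from Spotify's pedalboard package (Reverb, Chorus, Phaser, Distortion, PitchShift, Resample, Bitcrush, MP3Compressor), each applied with the stated probability and parameters drawn from the min/max bounds. An illustrative sketch for two of the blocks, assuming pedalboard and uniform parameter sampling (both assumptions; the training code that consumes this config is not part of this diff):

```python
# Illustrative: randomly apply pedalboard effects following the config's
# probability/min/max convention. Uniform sampling is an assumption.
import random
import numpy as np
from pedalboard import Pedalboard, Reverb, Distortion

def maybe_effects(audio: np.ndarray, sr: int = 44100) -> np.ndarray:
    chain = []
    if random.random() < 0.01:  # pedalboard_reverb: 0.01
        chain.append(Reverb(
            room_size=random.uniform(0.1, 0.9),
            damping=random.uniform(0.1, 0.9),
            wet_level=random.uniform(0.1, 0.9),
            dry_level=random.uniform(0.1, 0.9),
            width=random.uniform(0.9, 1.0),
        ))
    if random.random() < 0.01:  # pedalboard_distortion: 0.01
        chain.append(Distortion(drive_db=random.uniform(1.0, 25.0)))
    if not chain:
        return audio
    return Pedalboard(chain)(audio, sr)

processed = maybe_effects(np.random.randn(2, 44100).astype(np.float32))
```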