Spaces:
Runtime error
Runtime error
update README
Browse files- README.md +64 -4
- mel.png +0 -0
- notebooks/test-mel.ipynb +2 -2
- notebooks/test-model.ipynb +5 -5
- src/train_unconditional.py +6 -3
README.md
CHANGED
|
@@ -1,16 +1,76 @@
|
|
| 1 |
# audio-diffusion
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
```bash
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
```
|
|
|
|
|
|
|
|
|
|
| 5 |
```bash
|
| 6 |
python src/audio_to_images.py \
|
| 7 |
--resolution 256 \
|
| 8 |
--input_dir path-to-audio-files \
|
| 9 |
-
--output_dir data-256
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
```
|
|
|
|
|
|
|
|
|
|
| 11 |
```bash
|
| 12 |
-
accelerate launch
|
| 13 |
-
|
|
|
|
| 14 |
--resolution 256 \
|
| 15 |
--output_dir ddpm-ema-audio-256 \
|
| 16 |
--train_batch_size 16 \
|
|
|
|
| 1 |
# audio-diffusion
|
| 2 |
+
|
| 3 |
+
### Apply [Denoising Diffusion Probabilistic Models](https://arxiv.org/abs/2006.11239) using the new Hugging Face [diffusers](https://github.com/huggingface/diffusers) package to synthesize music instead of images.
|
| 4 |
+
|
| 5 |
+
---
|
| 6 |
+
|
| 7 |
+

|
| 8 |
+
|
| 9 |
+
Audio can be represented as images by transforming to a [mel spectrogram](https://en.wikipedia.org/wiki/Mel-frequency_cepstrum), such as the one shown above. The class `Mel` in `mel.py` can convert a slice of audio into a mel spectrogram of `x_res` x `y_res` and vice-versa. The higher the resolution, the less audio information will be lost. You can see how this works in the `test-mel.ipynb` notebook.
|
| 10 |
+
|
| 11 |
+
A DDPM model is trained on a set of mel spectrograms that have been generated from a directory of audio files. It is then used to synthesize similar mel spectrograms, which are then converted back into audio. See the `test-model.ipynb` notebook for an example.
|
| 12 |
+
|
| 13 |
+
## Generate Mel spectrogram dataset from directory of audio files
|
| 14 |
+
### Training can be run with Mel spectrograms of resolution 64x64 on a single commercial grade GPU (e.g. RTX 2080 Ti). The `hop_length` should be set to 1024 for better results.
|
| 15 |
+
|
| 16 |
```bash
|
| 17 |
+
python src/audio_to_images.py \
|
| 18 |
+
--resolution 64 \
|
| 19 |
+
--hop_length 1024 \
|
| 20 |
+
--input_dir path-to-audio-files \
|
| 21 |
+
--output_dir data-64
|
| 22 |
```
|
| 23 |
+
|
| 24 |
+
### Generate dataset of 256x256 Mel spectrograms and push to hub (you will need to be authenticated with `huggingface-cli login`).
|
| 25 |
+
|
| 26 |
```bash
|
| 27 |
python src/audio_to_images.py \
|
| 28 |
--resolution 256 \
|
| 29 |
--input_dir path-to-audio-files \
|
| 30 |
+
--output_dir data-256 \
|
| 31 |
+
--push_to_hub teticio/audio-diffusion-256
|
| 32 |
+
```
|
| 33 |
+
## Train model
|
| 34 |
+
### Run training on local machine.
|
| 35 |
+
|
| 36 |
+
```bash
|
| 37 |
+
accelerate launch --config_file accelerate_local.yaml \
|
| 38 |
+
src/train_unconditional.py \
|
| 39 |
+
--dataset_name data-64 \
|
| 40 |
+
--resolution 64 \
|
| 41 |
+
--hop_length 1024 \
|
| 42 |
+
--output_dir ddpm-ema-audio-64 \
|
| 43 |
+
--train_batch_size 16 \
|
| 44 |
+
--num_epochs 100 \
|
| 45 |
+
--gradient_accumulation_steps 1 \
|
| 46 |
+
--learning_rate 1e-4 \
|
| 47 |
+
--lr_warmup_steps 500 \
|
| 48 |
+
--mixed_precision no
|
| 49 |
+
```
|
| 50 |
+
|
| 51 |
+
### Run training on local machine with `batch_size` of 1 and `gradient_accumulation_steps` 16 to compensate, so that 256x256 resolution model fits on commercial grade GPU.
|
| 52 |
+
|
| 53 |
+
```bash
|
| 54 |
+
accelerate launch --config_file accelerate_local.yaml \
|
| 55 |
+
src/train_unconditional.py \
|
| 56 |
+
--dataset_name teticio/audio-diffusion-256 \
|
| 57 |
+
--resolution 256 \
|
| 58 |
+
--output_dir ddpm-ema-audio-256 \
|
| 59 |
+
--num_epochs 100 \
|
| 60 |
+
--train_batch_size 1 \
|
| 61 |
+
--eval_batch_size 1 \
|
| 62 |
+
--gradient_accumulation_steps 16 \
|
| 63 |
+
--learning_rate 1e-4 \
|
| 64 |
+
--lr_warmup_steps 500 \
|
| 65 |
+
--mixed_precision no
|
| 66 |
```
|
| 67 |
+
|
| 68 |
+
### Run training on SageMaker.
|
| 69 |
+
|
| 70 |
```bash
|
| 71 |
+
accelerate launch --config_file accelerate_sagemaker.yaml \
|
| 72 |
+
src/train_unconditional.py \
|
| 73 |
+
--dataset_name teticio/audio-diffusion-256 \
|
| 74 |
--resolution 256 \
|
| 75 |
--output_dir ddpm-ema-audio-256 \
|
| 76 |
--train_batch_size 16 \
|
mel.png
ADDED
|
notebooks/test-mel.ipynb
CHANGED
|
@@ -49,7 +49,7 @@
|
|
| 49 |
"id": "b2178c3f",
|
| 50 |
"metadata": {},
|
| 51 |
"source": [
|
| 52 |
-
"### Transform slice of audio to
|
| 53 |
]
|
| 54 |
},
|
| 55 |
{
|
|
@@ -120,7 +120,7 @@
|
|
| 120 |
"id": "fe112fef",
|
| 121 |
"metadata": {},
|
| 122 |
"source": [
|
| 123 |
-
"### Transform
|
| 124 |
]
|
| 125 |
},
|
| 126 |
{
|
|
|
|
| 49 |
"id": "b2178c3f",
|
| 50 |
"metadata": {},
|
| 51 |
"source": [
|
| 52 |
+
"### Transform slice of audio to mel spectrogram"
|
| 53 |
]
|
| 54 |
},
|
| 55 |
{
|
|
|
|
| 120 |
"id": "fe112fef",
|
| 121 |
"metadata": {},
|
| 122 |
"source": [
|
| 123 |
+
"### Transform mel spectrogram back to audio"
|
| 124 |
]
|
| 125 |
},
|
| 126 |
{
|
notebooks/test-model.ipynb
CHANGED
|
@@ -42,7 +42,7 @@
|
|
| 42 |
"id": "011fb5a1",
|
| 43 |
"metadata": {},
|
| 44 |
"source": [
|
| 45 |
-
"### Run model inference to generate
|
| 46 |
]
|
| 47 |
},
|
| 48 |
{
|
|
@@ -76,7 +76,7 @@
|
|
| 76 |
{
|
| 77 |
"cell_type": "code",
|
| 78 |
"execution_count": 6,
|
| 79 |
-
"id": "
|
| 80 |
"metadata": {},
|
| 81 |
"outputs": [
|
| 82 |
{
|
|
@@ -101,7 +101,7 @@
|
|
| 101 |
"id": "7230c280",
|
| 102 |
"metadata": {},
|
| 103 |
"source": [
|
| 104 |
-
"### Transform
|
| 105 |
]
|
| 106 |
},
|
| 107 |
{
|
|
@@ -155,7 +155,7 @@
|
|
| 155 |
{
|
| 156 |
"cell_type": "code",
|
| 157 |
"execution_count": 8,
|
| 158 |
-
"id": "
|
| 159 |
"metadata": {},
|
| 160 |
"outputs": [
|
| 161 |
{
|
|
@@ -208,7 +208,7 @@
|
|
| 208 |
{
|
| 209 |
"cell_type": "code",
|
| 210 |
"execution_count": null,
|
| 211 |
-
"id": "
|
| 212 |
"metadata": {},
|
| 213 |
"outputs": [],
|
| 214 |
"source": []
|
|
|
|
| 42 |
"id": "011fb5a1",
|
| 43 |
"metadata": {},
|
| 44 |
"source": [
|
| 45 |
+
"### Run model inference to generate mel spectrogram"
|
| 46 |
]
|
| 47 |
},
|
| 48 |
{
|
|
|
|
| 76 |
{
|
| 77 |
"cell_type": "code",
|
| 78 |
"execution_count": 6,
|
| 79 |
+
"id": "75db4b7c",
|
| 80 |
"metadata": {},
|
| 81 |
"outputs": [
|
| 82 |
{
|
|
|
|
| 101 |
"id": "7230c280",
|
| 102 |
"metadata": {},
|
| 103 |
"source": [
|
| 104 |
+
"### Transform mel spectrogram to audio"
|
| 105 |
]
|
| 106 |
},
|
| 107 |
{
|
|
|
|
| 155 |
{
|
| 156 |
"cell_type": "code",
|
| 157 |
"execution_count": 8,
|
| 158 |
+
"id": "b9023846",
|
| 159 |
"metadata": {},
|
| 160 |
"outputs": [
|
| 161 |
{
|
|
|
|
| 208 |
{
|
| 209 |
"cell_type": "code",
|
| 210 |
"execution_count": null,
|
| 211 |
+
"id": "acf96aba",
|
| 212 |
"metadata": {},
|
| 213 |
"outputs": [],
|
| 214 |
"source": []
|
src/train_unconditional.py
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
|
|
|
|
|
| 1 |
import argparse
|
| 2 |
import os
|
| 3 |
|
|
@@ -30,7 +32,8 @@ logger = get_logger(__name__)
|
|
| 30 |
|
| 31 |
|
| 32 |
def main(args):
|
| 33 |
-
|
|
|
|
| 34 |
accelerator = Accelerator(
|
| 35 |
mixed_precision=args.mixed_precision,
|
| 36 |
log_with="tensorboard",
|
|
@@ -122,7 +125,7 @@ def main(args):
|
|
| 122 |
)
|
| 123 |
|
| 124 |
ema_model = EMAModel(
|
| 125 |
-
model,
|
| 126 |
inv_gamma=args.ema_inv_gamma,
|
| 127 |
power=args.ema_power,
|
| 128 |
max_value=args.ema_max_decay,
|
|
@@ -234,7 +237,7 @@ def main(args):
|
|
| 234 |
blocking=False,
|
| 235 |
)
|
| 236 |
else:
|
| 237 |
-
pipeline.save_pretrained(
|
| 238 |
accelerator.wait_for_everyone()
|
| 239 |
|
| 240 |
accelerator.end_training()
|
|
|
|
| 1 |
+
# based on https://github.com/huggingface/diffusers/blob/main/examples/train_unconditional.py
|
| 2 |
+
|
| 3 |
import argparse
|
| 4 |
import os
|
| 5 |
|
|
|
|
| 32 |
|
| 33 |
|
| 34 |
def main(args):
|
| 35 |
+
output_dir = os.environ.get("SM_MODEL_DIR", None) or args.output_dir
|
| 36 |
+
logging_dir = os.path.join(output_dir, args.logging_dir)
|
| 37 |
accelerator = Accelerator(
|
| 38 |
mixed_precision=args.mixed_precision,
|
| 39 |
log_with="tensorboard",
|
|
|
|
| 125 |
)
|
| 126 |
|
| 127 |
ema_model = EMAModel(
|
| 128 |
+
getattr(model, "module", model),
|
| 129 |
inv_gamma=args.ema_inv_gamma,
|
| 130 |
power=args.ema_power,
|
| 131 |
max_value=args.ema_max_decay,
|
|
|
|
| 237 |
blocking=False,
|
| 238 |
)
|
| 239 |
else:
|
| 240 |
+
pipeline.save_pretrained(output_dir)
|
| 241 |
accelerator.wait_for_everyone()
|
| 242 |
|
| 243 |
accelerator.end_training()
|