ZheqiDAI commited on Oct 18, 2024

Commit

070e26e

1 Parent(s): f40e29b

Initial commit

Browse files

Files changed (39) hide show

LICENSE +21 -0
README copy.md +141 -0
config/train.yaml +14 -0
dataset/gtzan_test.h5 +3 -0
dataset/gtzan_train.h5 +3 -0
diffusion/__init__.py +46 -0
diffusion/diffusion_utils.py +88 -0
diffusion/gaussian_diffusion.py +873 -0
diffusion/respace.py +129 -0
diffusion/timestep_sampler.py +150 -0
gtzan-ck/model_epoch_20000.pt +3 -0
gtzan-test.csv +101 -0
models.py +375 -0
requirement.txt +26 -0
sample.py +135 -0
sample/gn/blues_blues.00000.mp3 +0 -0
sample/gn/blues_blues.00001.mp3 +0 -0
sample/gn/blues_blues.00002.mp3 +0 -0
sample/gn/blues_blues.00003.mp3 +0 -0
sample/gn/blues_blues.00004.mp3 +0 -0
sample/gn/blues_blues.00005.mp3 +0 -0
sample/gn/blues_blues.00006.mp3 +0 -0
sample/gn/blues_blues.00007.mp3 +0 -0
sample/gn/blues_blues.00008.mp3 +0 -0
sample/gn/blues_blues.00009.mp3 +0 -0
sample/gt/blues_blues.00000.mp3 +0 -0
sample/gt/blues_blues.00001.mp3 +0 -0
sample/gt/blues_blues.00002.mp3 +0 -0
sample/gt/blues_blues.00003.mp3 +0 -0
sample/gt/blues_blues.00004.mp3 +0 -0
sample/gt/blues_blues.00005.mp3 +0 -0
sample/gt/blues_blues.00006.mp3 +0 -0
sample/gt/blues_blues.00007.mp3 +0 -0
sample/gt/blues_blues.00008.mp3 +0 -0
sample/gt/blues_blues.00009.mp3 +0 -0
tools/bigvgan_v2_22khz_80band_256x +1 -0
tools/gtzan2h5.py +160 -0
tools/gtzan_split.py +98 -0
train.py +114 -0

LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2024 AudioFans
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

README copy.md ADDED Viewed

	@@ -0,0 +1,141 @@

+# Musimple: Text2Music with DiT Made Simple
+## Introduction
+This repository provides a simple and clear implementation of a **Text-to-Music Generation** pipeline using a **DiT (Diffusion Transformer)** model. The codebase includes key components such as **model training**, **inference**, and **evaluation**. We use the **GTZAN dataset** as an example to demonstrate a minimal, working pipeline for text-conditioned music generation.
+The repository is designed to be easy to use and customize, making it simple to reproduce our results on a single **NVIDIA RTX 4090 GPU**. Additionally, the code is structured to be flexible, allowing you to modify it for your own tasks and datasets.
+We plan to continue maintaining and improving this repository with new features, model improvements, and extended documentation in the future.
+## Features
+- **Text-to-Music Generation**: Generate music directly from text descriptions using a DiT model.
+- **GTZAN Example**: A simple pipeline using the GTZAN dataset to demonstrate the workflow.
+- **End-to-End Pipeline**: Includes model training, inference, and evaluation with support for generating audio files.
+- **Customizable**: Easy to modify and extend for different datasets or use cases.
+- **Single GPU Training**: Optimized for training on a single RTX 4090 GPU but adaptable to different hardware setups.
+## Requirements
+Before using the code, ensure that the following dependencies are installed:
+- Python >= 3.9
+- CUDA (if available)
+- Required Python libraries from `requirement.txt`
+You can install the dependencies using:
+```bash
+conda create -n musimple python=3.9
+conda activate musimple
+pip install -r requirement.txt
+```
+## Data Preprocessing
+To begin with, you will need to download the **GTZAN dataset**. Once downloaded, you can use the `gtzan_split.py` script located in the `tools` directory to split the dataset into training and testing sets. Run the following command from inside `tools`:
+```bash
+python gtzan_split.py --root_dir /path/to/gtzan/genres --output_dir /path/to/output/directory
+```
+Next, convert the audio files into HDF5 format using the `gtzan2h5.py` script (also in `tools`):
+```bash
+python gtzan2h5.py --root_dir /path/to/audio/files --output_h5_file /path/to/output.h5 --config_path bigvgan_v2_22khz_80band_256x/config.json --sr 22050
+```
+### Preprocessed Data
+If this process seems cumbersome, don’t worry! **We have already preprocessed the dataset**, and you can find it in the **musimple/dataset** directory. You can download and use this data directly to skip the preprocessing steps.
+### Data Breakdown
+In this preprocessing stage, there are two main parts:
+1. **Text to Latent Transformation**: We use a Sentence Transformer to convert text labels into latent representations.
+2. **Audio to Mel Spectrogram**: The original audio files are converted into mel spectrograms.
+Both the latent representations and mel spectrograms are stored in an HDF5 file, making them easily accessible during training and inference.
+## Training
+To begin training, simply navigate to the `Musimple` directory and run the following command:
+```bash
+cd Musimple
+python train.py
+```
+### Configurable Parameters
+All training-related parameters can be adjusted in the configuration file located at:
+```
+./config/train.yaml
+```
+This allows you to easily modify aspects like the learning rate, batch size, number of epochs, and more to suit your hardware or dataset requirements.
+We also provide a **pre-trained checkpoint** trained for two days on a single **NVIDIA RTX 4090**. You can use this checkpoint for inference or fine-tuning. The key training parameters for this checkpoint are as follows:
+- `batch_size`: 48
+- `mel_frames`: 800
+- `lr`: 0.0001
+- `num_epochs`: 100000
+- `sample_interval`: 250
+- `h5_file_path`: './dataset/gtzan_train.h5'
+- `device`: 'cuda:4'
+- `input_size`: [80, 800]
+- `patch_size`: 8
+- `in_channels`: 1
+- `hidden_size`: 384
+- `depth`: 12
+- `num_heads`: 6
+- `checkpoint_dir`: 'gtzan-ck'
+You can modify the model architecture and parameters in the `train.yaml` configuration file to compare your models against ours. We will continue to release more checkpoints and models in future updates.
+## Inference
+Once you have a trained model, you can use it for inference by running the following command:
+```bash
+python sample.py --checkpoint ./gtzan-ck/model_epoch_20000.pt \
+                 --h5_file ./dataset/gtzan_test.h5 \
+                 --output_gt_dir ./sample/gt \
+                 --output_gen_dir ./sample/gn \
+                 --segment_length 800 \
+                 --sample_rate 22050
+```
+You can also try running inference using our pre-trained model to familiarize yourself with the inference process. We have saved some inference results in the sample folder as a demo. However, due to the limited size of our model, the generated results are not of the highest quality and are intended as simple examples to guide further evaluation.
+## Evaluation
+For the evaluation phase, we highly recommend creating a new environment and using the evaluation library available at [Generated Music Evaluation](https://github.com/HarlandZZC/generated_music_evaluation). This repository provides detailed instructions on setting up the environment and how to use the evaluation tools. New features and functionality will be added to this library over time.
+Once you have set up the environment following the instructions from the evaluation repository, you can run the following script to evaluate your generated music:
+```bash
+python eval.py \
+    --ref_path  ../sample/gt \
+    --gen_path ../sample/gn \
+    --id2text_csv_path ../gtzan-test.csv \
+    --output_path ./output \
+    --device_id 0 \
+    --batch_size 32 \
+    --original_sample_rate 24000 \
+    --fad_sample_rate 16000 \
+    --kl_sample_rate 16000 \
+    --clap_sample_rate 48000 \
+    --run_fad 1 \
+    --run_kl 1 \
+    --run_clap 1
+```
+This script evaluates the generated music against reference music, producing evaluation metrics such as CLAP, KL, and FAD scores.
+## To-Do
+The following features and improvements are planned for future updates:
+- **EMA Model**: Implement Exponential Moving Average (EMA) for model weights to stabilize training and improve final generation quality.
+- **Long-Term Music Fine-tuning**: Explore fine-tuning the model to generate longer-term music with more coherent structures.
+- **VAE Integration**: Integrate a Variational Autoencoder (VAE) to improve latent space representations and potentially enhance generation diversity.
+- **T5-based Text Conditioning**: Add T5 to enhance text conditioning, improving the control and accuracy of the text-to-music generation process.

config/train.yaml ADDED Viewed

	@@ -0,0 +1,14 @@

+batch_size: 48
+mel_frames: 800
+lr: 0.0001
+num_epochs: 100000
+sample_interval: 250
+h5_file_path: './dataset/gtzan_train.h5'
+device: 'cuda:4'
+input_size: [80, 800]
+patch_size: 8
+in_channels: 1
+hidden_size: 384
+depth: 12
+num_heads: 6
+checkpoint_dir: 'gtzan-ck'

dataset/gtzan_test.h5 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1f9c40a6548fcd65c8bf4296968e1bf8289ba422e9fdfacd6745d4c9dfc86082
+size 90507648

dataset/gtzan_train.h5 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:682e8998af88b14af1132d3fafc916f30fcfe21d4e91743fa2e91828667b9d6d
+size 813506352

diffusion/__init__.py ADDED Viewed

	@@ -0,0 +1,46 @@

+# Modified from OpenAI's diffusion repos
+#     GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
+#     ADM:   https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
+#     IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
+from . import gaussian_diffusion as gd
+from .respace import SpacedDiffusion, space_timesteps
def create_diffusion(
    timestep_respacing,
    noise_schedule="linear",
    use_kl=False,
    sigma_small=False,
    predict_xstart=False,
    learn_sigma=True,
    rescale_learned_sigmas=False,
    diffusion_steps=1000
):
    """
    Build a SpacedDiffusion from a handful of high-level switches.

    :param timestep_respacing: forwarded to space_timesteps(); None or ""
                               is replaced by [diffusion_steps].
    :param noise_schedule: name passed to gd.get_named_beta_schedule().
    :param use_kl: if truthy, use LossType.RESCALED_KL (takes precedence
                   over rescale_learned_sigmas).
    :param sigma_small: with learn_sigma falsy, pick FIXED_SMALL instead of
                        FIXED_LARGE as the variance type.
    :param predict_xstart: if truthy the model mean type is START_X,
                           otherwise EPSILON.
    :param learn_sigma: if truthy the variance type is LEARNED_RANGE.
    :param rescale_learned_sigmas: with use_kl falsy, pick RESCALED_MSE
                                   instead of MSE.
    :param diffusion_steps: number of betas requested from the schedule.
    :return: the configured SpacedDiffusion instance.
    """
    betas = gd.get_named_beta_schedule(noise_schedule, diffusion_steps)

    # Loss selection: KL wins, then rescaled MSE, then plain MSE.
    if use_kl:
        loss_type = gd.LossType.RESCALED_KL
    else:
        loss_type = (
            gd.LossType.RESCALED_MSE if rescale_learned_sigmas else gd.LossType.MSE
        )

    if timestep_respacing is None or timestep_respacing == "":
        timestep_respacing = [diffusion_steps]

    mean_type = (
        gd.ModelMeanType.START_X if predict_xstart else gd.ModelMeanType.EPSILON
    )

    # A learned variance overrides the fixed small/large choice.
    if learn_sigma:
        var_type = gd.ModelVarType.LEARNED_RANGE
    elif sigma_small:
        var_type = gd.ModelVarType.FIXED_SMALL
    else:
        var_type = gd.ModelVarType.FIXED_LARGE

    return SpacedDiffusion(
        use_timesteps=space_timesteps(diffusion_steps, timestep_respacing),
        betas=betas,
        model_mean_type=mean_type,
        model_var_type=var_type,
        loss_type=loss_type
    )

diffusion/diffusion_utils.py ADDED Viewed

	@@ -0,0 +1,88 @@

+# Modified from OpenAI's diffusion repos
+#     GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
+#     ADM:   https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
+#     IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
+import torch as th
+import numpy as np
def normal_kl(mean1, logvar1, mean2, logvar2):
    """
    Elementwise KL divergence between two diagonal Gaussians.

    Inputs broadcast against each other, so a batch may be compared with a
    scalar. At least one of the four arguments must be a Tensor; non-Tensor
    log-variances are converted to match that Tensor.
    """
    reference = next(
        (arg for arg in (mean1, logvar1, mean2, logvar2) if isinstance(arg, th.Tensor)),
        None,
    )
    assert reference is not None, "at least one argument must be a Tensor"

    def _as_tensor(value):
        # th.exp() needs a Tensor; broadcasting alone will not convert scalars.
        if isinstance(value, th.Tensor):
            return value
        return th.tensor(value).to(reference)

    logvar1 = _as_tensor(logvar1)
    logvar2 = _as_tensor(logvar2)

    squared_gap = (mean1 - mean2) ** 2
    return 0.5 * (
        -1.0
        + logvar2
        - logvar1
        + th.exp(logvar1 - logvar2)
        + squared_gap * th.exp(-logvar2)
    )
def approx_standard_normal_cdf(x):
    """
    Tanh-based fast approximation to the standard normal CDF, evaluated
    elementwise on the Tensor x.
    """
    cubic_term = x + 0.044715 * th.pow(x, 3)
    return 0.5 * (1.0 + th.tanh(np.sqrt(2.0 / np.pi) * cubic_term))
def continuous_gaussian_log_likelihood(x, *, means, log_scales):
    """
    Log-density of the standardised targets under a unit Gaussian.

    The targets are centred by `means`, divided by exp(log_scales), and the
    result is scored with a standard Normal; no separate log_scales term is
    added to the returned value.

    :param x: the targets.
    :param means: the Gaussian mean Tensor.
    :param log_scales: the Gaussian log stddev Tensor.
    :return: a tensor like x of log probabilities (in nats).
    """
    unit_gaussian = th.distributions.Normal(th.zeros_like(x), th.ones_like(x))
    standardised = (x - means) * th.exp(-log_scales)
    return unit_gaussian.log_prob(standardised)
def discretized_gaussian_log_likelihood(x, *, means, log_scales):
    """
    Log-likelihood of a Gaussian discretised onto bins of half-width 1/255.

    :param x: the target images. It is assumed that this was uint8 values,
              rescaled to the range [-1, 1].
    :param means: the Gaussian mean Tensor.
    :param log_scales: the Gaussian log stddev Tensor.
    :return: a tensor like x of log probabilities (in nats).
    """
    assert x.shape == means.shape == log_scales.shape
    offset = x - means
    precision = th.exp(-log_scales)

    # CDF evaluated at the upper and lower edge of each target's bin.
    upper_cdf = approx_standard_normal_cdf(precision * (offset + 1.0 / 255.0))
    lower_cdf = approx_standard_normal_cdf(precision * (offset - 1.0 / 255.0))

    # Every log argument is floored at 1e-12 so the log stays finite.
    log_below_upper = th.log(upper_cdf.clamp(min=1e-12))
    log_above_lower = th.log((1.0 - lower_cdf).clamp(min=1e-12))
    log_inside_bin = th.log((upper_cdf - lower_cdf).clamp(min=1e-12))

    # The extreme bins absorb the whole tail on their open side.
    log_probs = th.where(
        x < -0.999,
        log_below_upper,
        th.where(x > 0.999, log_above_lower, log_inside_bin),
    )
    assert log_probs.shape == x.shape
    return log_probs

diffusion/gaussian_diffusion.py ADDED Viewed

	@@ -0,0 +1,873 @@

+# Modified from OpenAI's diffusion repos
+#     GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
+#     ADM:   https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
+#     IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
+import math
+import numpy as np
+import torch as th
+import enum
+from .diffusion_utils import discretized_gaussian_log_likelihood, normal_kl
def mean_flat(tensor):
    """
    Average over every dimension except the first (batch) one.
    """
    non_batch_dims = list(range(1, len(tensor.shape)))
    return tensor.mean(dim=non_batch_dims)
class ModelMeanType(enum.Enum):
    """
    The quantity the model's output is interpreted as.
    """

    # Values come from enum.auto() in declaration order; keep the order stable.
    PREVIOUS_X = enum.auto()  # output is x_{t-1}
    START_X = enum.auto()  # output is x_0
    EPSILON = enum.auto()  # output is the noise epsilon
class ModelVarType(enum.Enum):
    """
    Where the variance used for the model's output comes from.

    LEARNED_RANGE lets the model predict a value between FIXED_SMALL and
    FIXED_LARGE, which makes its job easier.
    """

    # Values come from enum.auto() in declaration order; keep the order stable.
    LEARNED = enum.auto()
    FIXED_SMALL = enum.auto()
    FIXED_LARGE = enum.auto()
    LEARNED_RANGE = enum.auto()
class LossType(enum.Enum):
    """
    Training objective selector.
    """

    # raw MSE loss (and KL when learning variances)
    MSE = enum.auto()
    # raw MSE loss (with RESCALED_KL when learning variances)
    RESCALED_MSE = enum.auto()
    # the variational lower-bound
    KL = enum.auto()
    # like KL, but rescaled to estimate the full VLB
    RESCALED_KL = enum.auto()

    def is_vb(self):
        """Return True for the two variational-bound members (KL, RESCALED_KL)."""
        return self in (LossType.KL, LossType.RESCALED_KL)
+def _warmup_beta(beta_start, beta_end, num_diffusion_timesteps, warmup_frac):
+    betas = beta_end * np.ones(num_diffusion_timesteps, dtype=np.float64)
+    warmup_time = int(num_diffusion_timesteps * warmup_frac)
+    betas[:warmup_time] = np.linspace(beta_start, beta_end, warmup_time, dtype=np.float64)
+    return betas
def get_beta_schedule(beta_schedule, *, beta_start, beta_end, num_diffusion_timesteps):
    """
    Deprecated entry point for building a beta schedule by name.

    Prefer get_named_beta_schedule() for the newer library of schedules.
    Recognised names: "quad", "linear", "warmup10", "warmup50", "const" and
    "jsd"; anything else raises NotImplementedError.
    """
    steps = num_diffusion_timesteps

    # Each entry builds its schedule lazily, so only the selected one runs.
    builders = (
        (
            "quad",
            lambda: np.linspace(
                beta_start ** 0.5, beta_end ** 0.5, steps, dtype=np.float64
            )
            ** 2,
        ),
        ("linear", lambda: np.linspace(beta_start, beta_end, steps, dtype=np.float64)),
        ("warmup10", lambda: _warmup_beta(beta_start, beta_end, steps, 0.1)),
        ("warmup50", lambda: _warmup_beta(beta_start, beta_end, steps, 0.5)),
        ("const", lambda: beta_end * np.ones(steps, dtype=np.float64)),
        # 1/T, 1/(T-1), 1/(T-2), ..., 1
        ("jsd", lambda: 1.0 / np.linspace(steps, 1, steps, dtype=np.float64)),
    )
    build = next((fn for name, fn in builders if beta_schedule == name), None)
    if build is None:
        raise NotImplementedError(beta_schedule)

    betas = build()
    assert betas.shape == (num_diffusion_timesteps,)
    return betas
def get_named_beta_schedule(schedule_name, num_diffusion_timesteps):
    """
    Look up one of the pre-defined beta schedules by name.

    The library consists of schedules that remain similar in the limit of
    num_diffusion_timesteps. Schedules may be added, but existing ones should
    not be removed or changed, to maintain backwards compatibility.

    Supported names are "linear" and "squaredcos_cap_v2"; any other name
    raises NotImplementedError.
    """
    if schedule_name == "linear":
        # Linear schedule from Ho et al, rescaled so it works for any number
        # of diffusion steps.
        scale = 1000 / num_diffusion_timesteps
        return get_beta_schedule(
            "linear",
            beta_start=scale * 0.0001,
            beta_end=scale * 0.02,
            num_diffusion_timesteps=num_diffusion_timesteps,
        )

    if schedule_name == "squaredcos_cap_v2":

        def cosine_alpha_bar(t):
            return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2

        return betas_for_alpha_bar(num_diffusion_timesteps, cosine_alpha_bar)

    raise NotImplementedError(f"unknown beta schedule: {schedule_name}")
def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999):
    """
    Discretise a continuous alpha_bar(t) curve into a beta schedule.

    alpha_bar defines the cumulative product of (1-beta) over t in [0, 1].

    :param num_diffusion_timesteps: the number of betas to produce.
    :param alpha_bar: a callable taking t from 0 to 1 and returning the
                      cumulative product of (1-beta) up to that part of the
                      diffusion process.
    :param max_beta: upper cap applied to every beta; values below 1
                     prevent singularities.
    :return: a numpy array with one beta per step.
    """
    steps = num_diffusion_timesteps
    return np.array(
        [
            min(1 - alpha_bar((i + 1) / steps) / alpha_bar(i / steps), max_beta)
            for i in range(steps)
        ]
    )
+class GaussianDiffusion:
+    """
+    Utilities for training and sampling diffusion models.
+    Original ported from this codebase:
+    https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/diffusion_utils_2.py#L42
+    :param betas: a 1-D numpy array of betas for each diffusion timestep,
+                  starting at T and going to 1.
+    """
+    def __init__(
+        self,
+        *,
+        betas,
+        model_mean_type,
+        model_var_type,
+        loss_type
+    ):
+        self.model_mean_type = model_mean_type
+        self.model_var_type = model_var_type
+        self.loss_type = loss_type
+        # Use float64 for accuracy.
+        betas = np.array(betas, dtype=np.float64)
+        self.betas = betas
+        assert len(betas.shape) == 1, "betas must be 1-D"
+        assert (betas > 0).all() and (betas <= 1).all()
+        self.num_timesteps = int(betas.shape[0])
+        alphas = 1.0 - betas
+        self.alphas_cumprod = np.cumprod(alphas, axis=0)
+        self.alphas_cumprod_prev = np.append(1.0, self.alphas_cumprod[:-1])
+        self.alphas_cumprod_next = np.append(self.alphas_cumprod[1:], 0.0)
+        assert self.alphas_cumprod_prev.shape == (self.num_timesteps,)
+        # calculations for diffusion q(x_t | x_{t-1}) and others
+        self.sqrt_alphas_cumprod = np.sqrt(self.alphas_cumprod)
+        self.sqrt_one_minus_alphas_cumprod = np.sqrt(1.0 - self.alphas_cumprod)
+        self.log_one_minus_alphas_cumprod = np.log(1.0 - self.alphas_cumprod)
+        self.sqrt_recip_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod)
+        self.sqrt_recipm1_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod - 1)
+        # calculations for posterior q(x_{t-1} | x_t, x_0)
+        self.posterior_variance = (
+            betas * (1.0 - self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
+        )
+        # below: log calculation clipped because the posterior variance is 0 at the beginning of the diffusion chain
+        self.posterior_log_variance_clipped = np.log(
+            np.append(self.posterior_variance[1], self.posterior_variance[1:])
+        ) if len(self.posterior_variance) > 1 else np.array([])
+        self.posterior_mean_coef1 = (
+            betas * np.sqrt(self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
+        )
+        self.posterior_mean_coef2 = (
+            (1.0 - self.alphas_cumprod_prev) * np.sqrt(alphas) / (1.0 - self.alphas_cumprod)
+        )
+    def q_mean_variance(self, x_start, t):
+        """
+        Get the distribution q(x_t | x_0).
+        :param x_start: the [N x C x ...] tensor of noiseless inputs.
+        :param t: the number of diffusion steps (minus 1). Here, 0 means one step.
+        :return: A tuple (mean, variance, log_variance), all of x_start's shape.
+        """
+        mean = _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
+        variance = _extract_into_tensor(1.0 - self.alphas_cumprod, t, x_start.shape)
+        log_variance = _extract_into_tensor(self.log_one_minus_alphas_cumprod, t, x_start.shape)
+        return mean, variance, log_variance
+    def q_sample(self, x_start, t, noise=None):
+        """
+        Diffuse the data for a given number of diffusion steps.
+        In other words, sample from q(x_t | x_0).
+        :param x_start: the initial data batch.
+        :param t: the number of diffusion steps (minus 1). Here, 0 means one step.
+        :param noise: if specified, the split-out normal noise.
+        :return: A noisy version of x_start.
+        """
+        if noise is None:
+            noise = th.randn_like(x_start)
+        assert noise.shape == x_start.shape
+        return (
+            _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
+            + _extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise
+        )
+    def q_posterior_mean_variance(self, x_start, x_t, t):
+        """
+        Compute the mean and variance of the diffusion posterior:
+            q(x_{t-1} | x_t, x_0)
+        """
+        assert x_start.shape == x_t.shape
+        posterior_mean = (
+            _extract_into_tensor(self.posterior_mean_coef1, t, x_t.shape) * x_start
+            + _extract_into_tensor(self.posterior_mean_coef2, t, x_t.shape) * x_t
+        )
+        posterior_variance = _extract_into_tensor(self.posterior_variance, t, x_t.shape)
+        posterior_log_variance_clipped = _extract_into_tensor(
+            self.posterior_log_variance_clipped, t, x_t.shape
+        )
+        assert (
+            posterior_mean.shape[0]
+            == posterior_variance.shape[0]
+            == posterior_log_variance_clipped.shape[0]
+            == x_start.shape[0]
+        )
+        return posterior_mean, posterior_variance, posterior_log_variance_clipped
+    def p_mean_variance(self, model, x, t, clip_denoised=True, denoised_fn=None, model_kwargs=None):
+        """
+        Apply the model to get p(x_{t-1} | x_t), as well as a prediction of
+        the initial x, x_0.
+        :param model: the model, which takes a signal and a batch of timesteps
+                      as input.
+        :param x: the [N x C x ...] tensor at time t.
+        :param t: a 1-D Tensor of timesteps.
+        :param clip_denoised: if True, clip the denoised signal into [-1, 1].
+        :param denoised_fn: if not None, a function which applies to the
+            x_start prediction before it is used to sample. Applies before
+            clip_denoised.
+        :param model_kwargs: if not None, a dict of extra keyword arguments to
+            pass to the model. This can be used for conditioning.
+        :return: a dict with the following keys:
+                 - 'mean': the model mean output.
+                 - 'variance': the model variance output.
+                 - 'log_variance': the log of 'variance'.
+                 - 'pred_xstart': the prediction for x_0.
+        """
+        if model_kwargs is None:
+            model_kwargs = {}
+        B, C = x.shape[:2]
+        assert t.shape == (B,)
+        model_output = model(x, t, **model_kwargs)
+        if isinstance(model_output, tuple):
+            model_output, extra = model_output
+        else:
+            extra = None
+        if self.model_var_type in [ModelVarType.LEARNED, ModelVarType.LEARNED_RANGE]:
+            assert model_output.shape == (B, C * 2, *x.shape[2:])
+            model_output, model_var_values = th.split(model_output, C, dim=1)
+            min_log = _extract_into_tensor(self.posterior_log_variance_clipped, t, x.shape)
+            max_log = _extract_into_tensor(np.log(self.betas), t, x.shape)
+            # The model_var_values is [-1, 1] for [min_var, max_var].
+            frac = (model_var_values + 1) / 2
+            model_log_variance = frac * max_log + (1 - frac) * min_log
+            model_variance = th.exp(model_log_variance)
+        else:
+            model_variance, model_log_variance = {
+                # for fixedlarge, we set the initial (log-)variance like so
+                # to get a better decoder log likelihood.
+                ModelVarType.FIXED_LARGE: (
+                    np.append(self.posterior_variance[1], self.betas[1:]),
+                    np.log(np.append(self.posterior_variance[1], self.betas[1:])),
+                ),
+                ModelVarType.FIXED_SMALL: (
+                    self.posterior_variance,
+                    self.posterior_log_variance_clipped,
+                ),
+            }[self.model_var_type]
+            model_variance = _extract_into_tensor(model_variance, t, x.shape)
+            model_log_variance = _extract_into_tensor(model_log_variance, t, x.shape)
+        def process_xstart(x):
+            if denoised_fn is not None:
+                x = denoised_fn(x)
+            if clip_denoised:
+                return x.clamp(-1, 1)
+            return x
+        if self.model_mean_type == ModelMeanType.START_X:
+            pred_xstart = process_xstart(model_output)
+        else:
+            pred_xstart = process_xstart(
+                self._predict_xstart_from_eps(x_t=x, t=t, eps=model_output)
+            )
+        model_mean, _, _ = self.q_posterior_mean_variance(x_start=pred_xstart, x_t=x, t=t)
+        assert model_mean.shape == model_log_variance.shape == pred_xstart.shape == x.shape
+        return {
+            "mean": model_mean,
+            "variance": model_variance,
+            "log_variance": model_log_variance,
+            "pred_xstart": pred_xstart,
+            "extra": extra,
+        }
+    def _predict_xstart_from_eps(self, x_t, t, eps):
+        assert x_t.shape == eps.shape
+        return (
+            _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t
+            - _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * eps
+        )
+    def _predict_eps_from_xstart(self, x_t, t, pred_xstart):
+        return (
+            _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t - pred_xstart
+        ) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape)
+    def condition_mean(self, cond_fn, p_mean_var, x, t, model_kwargs=None):
+        """
+        Compute the mean for the previous step, given a function cond_fn that
+        computes the gradient of a conditional log probability with respect to
+        x. In particular, cond_fn computes grad(log(p(y|x))), and we want to
+        condition on y.
+        This uses the conditioning strategy from Sohl-Dickstein et al. (2015).
+        """
+        gradient = cond_fn(x, t, **model_kwargs)
+        new_mean = p_mean_var["mean"].float() + p_mean_var["variance"] * gradient.float()
+        return new_mean
+    def condition_score(self, cond_fn, p_mean_var, x, t, model_kwargs=None):
+        """
+        Compute what the p_mean_variance output would have been, should the
+        model's score function be conditioned by cond_fn.
+        See condition_mean() for details on cond_fn.
+        Unlike condition_mean(), this instead uses the conditioning strategy
+        from Song et al (2020).
+        """
+        alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape)
+        eps = self._predict_eps_from_xstart(x, t, p_mean_var["pred_xstart"])
+        eps = eps - (1 - alpha_bar).sqrt() * cond_fn(x, t, **model_kwargs)
+        out = p_mean_var.copy()
+        out["pred_xstart"] = self._predict_xstart_from_eps(x, t, eps)
+        out["mean"], _, _ = self.q_posterior_mean_variance(x_start=out["pred_xstart"], x_t=x, t=t)
+        return out
+    def p_sample(
+        self,
+        model,
+        x,
+        t,
+        clip_denoised=True,
+        denoised_fn=None,
+        cond_fn=None,
+        model_kwargs=None,
+    ):
+        """
+        Sample x_{t-1} from the model at the given timestep.
+        :param model: the model to sample from.
+        :param x: the current tensor at x_t.
+        :param t: the value of t, starting at 0 for the first diffusion step.
+        :param clip_denoised: if True, clip the x_start prediction to [-1, 1].
+        :param denoised_fn: if not None, a function which applies to the
+            x_start prediction before it is used to sample.
+        :param cond_fn: if not None, this is a gradient function that acts
+                        similarly to the model. It is only used to shift
+                        the predicted mean via condition_mean().
+        :param model_kwargs: if not None, a dict of extra keyword arguments to
+            pass to the model. This can be used for conditioning.
+        :return: a dict containing the following keys:
+                 - 'sample': a random sample from the model.
+                 - 'pred_xstart': a prediction of x_0.
+        """
+        out = self.p_mean_variance(
+            model,
+            x,
+            t,
+            clip_denoised=clip_denoised,
+            denoised_fn=denoised_fn,
+            model_kwargs=model_kwargs,
+        )
+        # The noise is drawn before cond_fn runs, so guidance does not change
+        # the position of this draw in the random stream.
+        noise = th.randn_like(x)
+        # Reshaped to [N, 1, ..., 1] so the mask broadcasts against x.
+        nonzero_mask = (
+            (t != 0).float().view(-1, *([1] * (len(x.shape) - 1)))
+        )  # no noise when t == 0
+        if cond_fn is not None:
+            out["mean"] = self.condition_mean(cond_fn, out, x, t, model_kwargs=model_kwargs)
+        # exp(0.5 * log_variance) is the standard deviation.
+        sample = out["mean"] + nonzero_mask * th.exp(0.5 * out["log_variance"]) * noise
+        return {"sample": sample, "pred_xstart": out["pred_xstart"]}
+    def p_sample_loop(
+        self,
+        model,
+        shape,
+        noise=None,
+        clip_denoised=True,
+        denoised_fn=None,
+        cond_fn=None,
+        model_kwargs=None,
+        device=None,
+        progress=False,
+    ):
+        """
+        Generate samples from the model.
+        :param model: the model module.
+        :param shape: the shape of the samples, (N, C, H, W).
+        :param noise: if specified, the noise from the encoder to sample.
+                      Should be of the same shape as `shape`.
+        :param clip_denoised: if True, clip x_start predictions to [-1, 1].
+        :param denoised_fn: if not None, a function which applies to the
+            x_start prediction before it is used to sample.
+        :param cond_fn: if not None, this is a gradient function that acts
+                        similarly to the model.
+        :param model_kwargs: if not None, a dict of extra keyword arguments to
+            pass to the model. This can be used for conditioning.
+        :param device: if specified, the device to create the samples on.
+                       If not specified, use a model parameter's device.
+        :param progress: if True, show a tqdm progress bar.
+        :return: a non-differentiable batch of samples.
+        """
+        final = None
+        for sample in self.p_sample_loop_progressive(
+            model,
+            shape,
+            noise=noise,
+            clip_denoised=clip_denoised,
+            denoised_fn=denoised_fn,
+            cond_fn=cond_fn,
+            model_kwargs=model_kwargs,
+            device=device,
+            progress=progress,
+        ):
+            final = sample
+        return final["sample"]
+    def p_sample_loop_progressive(
+        self,
+        model,
+        shape,
+        noise=None,
+        clip_denoised=True,
+        denoised_fn=None,
+        cond_fn=None,
+        model_kwargs=None,
+        device=None,
+        progress=False,
+    ):
+        """
+        Generate samples from the model and yield intermediate samples from
+        each timestep of diffusion.
+        Arguments are the same as p_sample_loop().
+        Returns a generator over dicts, where each dict is the return value of
+        p_sample().
+        """
+        if device is None:
+            device = next(model.parameters()).device
+        assert isinstance(shape, (tuple, list))
+        if noise is not None:
+            img = noise
+        else:
+            img = th.randn(*shape, device=device)
+        indices = list(range(self.num_timesteps))[::-1]
+        if progress:
+            # Lazy import so that we don't depend on tqdm.
+            from tqdm.auto import tqdm
+            indices = tqdm(indices)
+        for i in indices:
+            t = th.tensor([i] * shape[0], device=device)
+            with th.no_grad():
+                out = self.p_sample(
+                    model,
+                    img,
+                    t,
+                    clip_denoised=clip_denoised,
+                    denoised_fn=denoised_fn,
+                    cond_fn=cond_fn,
+                    model_kwargs=model_kwargs,
+                )
+                yield out
+                img = out["sample"]
+    def ddim_sample(
+        self,
+        model,
+        x,
+        t,
+        clip_denoised=True,
+        denoised_fn=None,
+        cond_fn=None,
+        model_kwargs=None,
+        eta=0.0,
+    ):
+        """
+        Sample x_{t-1} from the model using DDIM.
+        Same usage as p_sample().
+        :param eta: multiplier on the per-step noise scale sigma; with the
+                    default of 0.0 sigma is zero and no noise is added.
+        :return: a dict with keys 'sample' and 'pred_xstart'.
+        """
+        out = self.p_mean_variance(
+            model,
+            x,
+            t,
+            clip_denoised=clip_denoised,
+            denoised_fn=denoised_fn,
+            model_kwargs=model_kwargs,
+        )
+        # Guidance, when requested, rewrites pred_xstart/mean via the score.
+        if cond_fn is not None:
+            out = self.condition_score(cond_fn, out, x, t, model_kwargs=model_kwargs)
+        # Usually our model outputs epsilon, but we re-derive it
+        # in case we used x_start or x_prev prediction.
+        eps = self._predict_eps_from_xstart(x, t, out["pred_xstart"])
+        alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape)
+        alpha_bar_prev = _extract_into_tensor(self.alphas_cumprod_prev, t, x.shape)
+        sigma = (
+            eta
+            * th.sqrt((1 - alpha_bar_prev) / (1 - alpha_bar))
+            * th.sqrt(1 - alpha_bar / alpha_bar_prev)
+        )
+        # Equation 12.
+        noise = th.randn_like(x)
+        mean_pred = (
+            out["pred_xstart"] * th.sqrt(alpha_bar_prev)
+            + th.sqrt(1 - alpha_bar_prev - sigma ** 2) * eps
+        )
+        # Reshaped to [N, 1, ..., 1] so the mask broadcasts against x.
+        nonzero_mask = (
+            (t != 0).float().view(-1, *([1] * (len(x.shape) - 1)))
+        )  # no noise when t == 0
+        sample = mean_pred + nonzero_mask * sigma * noise
+        return {"sample": sample, "pred_xstart": out["pred_xstart"]}
+    def ddim_reverse_sample(
+        self,
+        model,
+        x,
+        t,
+        clip_denoised=True,
+        denoised_fn=None,
+        cond_fn=None,
+        model_kwargs=None,
+        eta=0.0,
+    ):
+        """
+        Sample x_{t+1} from the model using DDIM reverse ODE.
+        Arguments mirror ddim_sample(); eta must be 0.0 (asserted below), so
+        the step is deterministic and no noise is drawn.
+        :return: a dict with keys 'sample' (the x_{t+1} estimate) and
+                 'pred_xstart'.
+        """
+        assert eta == 0.0, "Reverse ODE only for deterministic path"
+        out = self.p_mean_variance(
+            model,
+            x,
+            t,
+            clip_denoised=clip_denoised,
+            denoised_fn=denoised_fn,
+            model_kwargs=model_kwargs,
+        )
+        if cond_fn is not None:
+            out = self.condition_score(cond_fn, out, x, t, model_kwargs=model_kwargs)
+        # Usually our model outputs epsilon, but we re-derive it
+        # in case we used x_start or x_prev prediction.
+        eps = (
+            _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x.shape) * x
+            - out["pred_xstart"]
+        ) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x.shape)
+        # Cumulative alpha looked up from the "next" table at index t.
+        alpha_bar_next = _extract_into_tensor(self.alphas_cumprod_next, t, x.shape)
+        # Equation 12. reversed
+        mean_pred = out["pred_xstart"] * th.sqrt(alpha_bar_next) + th.sqrt(1 - alpha_bar_next) * eps
+        return {"sample": mean_pred, "pred_xstart": out["pred_xstart"]}
+    def ddim_sample_loop(
+        self,
+        model,
+        shape,
+        noise=None,
+        clip_denoised=True,
+        denoised_fn=None,
+        cond_fn=None,
+        model_kwargs=None,
+        device=None,
+        progress=False,
+        eta=0.0,
+    ):
+        """
+        Generate samples from the model using DDIM.
+        Same usage as p_sample_loop().
+        """
+        final = None
+        for sample in self.ddim_sample_loop_progressive(
+            model,
+            shape,
+            noise=noise,
+            clip_denoised=clip_denoised,
+            denoised_fn=denoised_fn,
+            cond_fn=cond_fn,
+            model_kwargs=model_kwargs,
+            device=device,
+            progress=progress,
+            eta=eta,
+        ):
+            final = sample
+        return final["sample"]
+    def ddim_sample_loop_progressive(
+        self,
+        model,
+        shape,
+        noise=None,
+        clip_denoised=True,
+        denoised_fn=None,
+        cond_fn=None,
+        model_kwargs=None,
+        device=None,
+        progress=False,
+        eta=0.0,
+    ):
+        """
+        Use DDIM to sample from the model and yield intermediate samples from
+        each timestep of DDIM.
+        Same usage as p_sample_loop_progressive().
+        """
+        if device is None:
+            device = next(model.parameters()).device
+        assert isinstance(shape, (tuple, list))
+        if noise is not None:
+            img = noise
+        else:
+            img = th.randn(*shape, device=device)
+        indices = list(range(self.num_timesteps))[::-1]
+        if progress:
+            # Lazy import so that we don't depend on tqdm.
+            from tqdm.auto import tqdm
+            indices = tqdm(indices)
+        for i in indices:
+            t = th.tensor([i] * shape[0], device=device)
+            with th.no_grad():
+                out = self.ddim_sample(
+                    model,
+                    img,
+                    t,
+                    clip_denoised=clip_denoised,
+                    denoised_fn=denoised_fn,
+                    cond_fn=cond_fn,
+                    model_kwargs=model_kwargs,
+                    eta=eta,
+                )
+                yield out
+                img = out["sample"]
+    def _vb_terms_bpd(
+            self, model, x_start, x_t, t, clip_denoised=True, model_kwargs=None
+    ):
+        """
+        Get a term for the variational lower-bound.
+        The resulting units are bits (rather than nats, as one might expect).
+        This allows for comparison to other papers.
+        :param model: the model passed through to p_mean_variance().
+        :param x_start: the clean inputs x_0.
+        :param x_t: the noised inputs at timestep t.
+        :param t: a batch of timestep indices.
+        :param clip_denoised: forwarded to p_mean_variance().
+        :param model_kwargs: forwarded to p_mean_variance().
+        :return: a dict with the following keys:
+                 - 'output': a shape [N] tensor of NLLs or KLs.
+                 - 'pred_xstart': the x_0 predictions.
+        """
+        true_mean, _, true_log_variance_clipped = self.q_posterior_mean_variance(
+            x_start=x_start, x_t=x_t, t=t
+        )
+        out = self.p_mean_variance(
+            model, x_t, t, clip_denoised=clip_denoised, model_kwargs=model_kwargs
+        )
+        kl = normal_kl(
+            true_mean, true_log_variance_clipped, out["mean"], out["log_variance"]
+        )
+        # Dividing by ln(2) converts nats to bits.
+        kl = mean_flat(kl) / np.log(2.0)
+        decoder_nll = -discretized_gaussian_log_likelihood(
+            x_start, means=out["mean"], log_scales=0.5 * out["log_variance"]
+        )
+        assert decoder_nll.shape == x_start.shape
+        decoder_nll = mean_flat(decoder_nll) / np.log(2.0)
+        # At the first timestep return the decoder NLL,
+        # otherwise return KL(q(x_{t-1}|x_t,x_0) || p(x_{t-1}|x_t))
+        # Both terms are computed for every element; th.where picks per element.
+        output = th.where((t == 0), decoder_nll, kl)
+        return {"output": output, "pred_xstart": out["pred_xstart"]}
+    def training_losses(self, model, x_start, t, model_kwargs=None, noise=None):
+        """
+        Compute training losses for a single timestep.
+        :param model: the model to evaluate loss on.
+        :param x_start: the [N x C x ...] tensor of inputs.
+        :param t: a batch of timestep indices.
+        :param model_kwargs: if not None, a dict of extra keyword arguments to
+            pass to the model. This can be used for conditioning.
+        :param noise: if specified, the specific Gaussian noise to try to remove.
+        :return: a dict with the key "loss" containing a tensor of shape [N].
+                 Some mean or variance settings may also have other keys
+                 ("mse", and "vb" when the variance is learned).
+        :raises NotImplementedError: if self.loss_type is not a KL or MSE variant.
+        """
+        if model_kwargs is None:
+            model_kwargs = {}
+        if noise is None:
+            noise = th.randn_like(x_start)
+        # Forward-diffuse the clean input to timestep t using the chosen noise.
+        x_t = self.q_sample(x_start, t, noise=noise)
+        terms = {}
+        if self.loss_type == LossType.KL or self.loss_type == LossType.RESCALED_KL:
+            terms["loss"] = self._vb_terms_bpd(
+                model=model,
+                x_start=x_start,
+                x_t=x_t,
+                t=t,
+                clip_denoised=False,
+                model_kwargs=model_kwargs,
+            )["output"]
+            if self.loss_type == LossType.RESCALED_KL:
+                terms["loss"] *= self.num_timesteps
+        elif self.loss_type == LossType.MSE or self.loss_type == LossType.RESCALED_MSE:
+            model_output = model(x_t, t, **model_kwargs)
+            if self.model_var_type in [
+                ModelVarType.LEARNED,
+                ModelVarType.LEARNED_RANGE,
+            ]:
+                # With a learned variance the model emits 2*C channels:
+                # the first C are the mean prediction, the last C the variance values.
+                B, C = x_t.shape[:2]
+                assert model_output.shape == (B, C * 2, *x_t.shape[2:])
+                model_output, model_var_values = th.split(model_output, C, dim=1)
+                # Learn the variance using the variational bound, but don't let
+                # it affect our mean prediction.
+                frozen_out = th.cat([model_output.detach(), model_var_values], dim=1)
+                # The lambda replays the already-computed output instead of
+                # running the model a second time.
+                terms["vb"] = self._vb_terms_bpd(
+                    model=lambda *args, r=frozen_out: r,
+                    x_start=x_start,
+                    x_t=x_t,
+                    t=t,
+                    clip_denoised=False,
+                )["output"]
+                if self.loss_type == LossType.RESCALED_MSE:
+                    # Divide by 1000 for equivalence with initial implementation.
+                    # Without a factor of 1/1000, the VB term hurts the MSE term.
+                    terms["vb"] *= self.num_timesteps / 1000.0
+            # All three candidate targets are evaluated when this dict is built;
+            # only the entry matching self.model_mean_type is kept.
+            target = {
+                ModelMeanType.PREVIOUS_X: self.q_posterior_mean_variance(
+                    x_start=x_start, x_t=x_t, t=t
+                )[0],
+                ModelMeanType.START_X: x_start,
+                ModelMeanType.EPSILON: noise,
+            }[self.model_mean_type]
+            assert model_output.shape == target.shape == x_start.shape
+            terms["mse"] = mean_flat((target - model_output) ** 2)
+            if "vb" in terms:
+                terms["loss"] = terms["mse"] + terms["vb"]
+            else:
+                terms["loss"] = terms["mse"]
+        else:
+            raise NotImplementedError(self.loss_type)
+        return terms
+    def _prior_bpd(self, x_start):
+        """
+        Get the prior KL term for the variational lower-bound, measured in
+        bits-per-dim.
+        This term can't be optimized, as it only depends on the encoder.
+        :param x_start: the [N x C x ...] tensor of inputs.
+        :return: a batch of [N] KL values (in bits), one per batch element.
+        """
+        batch_size = x_start.shape[0]
+        t = th.tensor([self.num_timesteps - 1] * batch_size, device=x_start.device)
+        qt_mean, _, qt_log_variance = self.q_mean_variance(x_start, t)
+        kl_prior = normal_kl(
+            mean1=qt_mean, logvar1=qt_log_variance, mean2=0.0, logvar2=0.0
+        )
+        return mean_flat(kl_prior) / np.log(2.0)
+    def calc_bpd_loop(self, model, x_start, clip_denoised=True, model_kwargs=None):
+        """
+        Compute the entire variational lower-bound, measured in bits-per-dim,
+        as well as other related quantities.
+        :param model: the model to evaluate loss on.
+        :param x_start: the [N x C x ...] tensor of inputs.
+        :param clip_denoised: if True, clip denoised samples.
+        :param model_kwargs: if not None, a dict of extra keyword arguments to
+            pass to the model. This can be used for conditioning.
+        :return: a dict containing the following keys:
+                 - total_bpd: the total variational lower-bound, per batch element.
+                 - prior_bpd: the prior term in the lower-bound.
+                 - vb: an [N x T] tensor of terms in the lower-bound.
+                 - xstart_mse: an [N x T] tensor of x_0 MSEs for each timestep.
+                 - mse: an [N x T] tensor of epsilon MSEs for each timestep.
+        """
+        device = x_start.device
+        batch_size = x_start.shape[0]
+        vb = []
+        xstart_mse = []
+        mse = []
+        for t in list(range(self.num_timesteps))[::-1]:
+            t_batch = th.tensor([t] * batch_size, device=device)
+            noise = th.randn_like(x_start)
+            x_t = self.q_sample(x_start=x_start, t=t_batch, noise=noise)
+            # Calculate VLB term at the current timestep
+            with th.no_grad():
+                out = self._vb_terms_bpd(
+                    model,
+                    x_start=x_start,
+                    x_t=x_t,
+                    t=t_batch,
+                    clip_denoised=clip_denoised,
+                    model_kwargs=model_kwargs,
+                )
+            vb.append(out["output"])
+            xstart_mse.append(mean_flat((out["pred_xstart"] - x_start) ** 2))
+            eps = self._predict_eps_from_xstart(x_t, t_batch, out["pred_xstart"])
+            mse.append(mean_flat((eps - noise) ** 2))
+        vb = th.stack(vb, dim=1)
+        xstart_mse = th.stack(xstart_mse, dim=1)
+        mse = th.stack(mse, dim=1)
+        prior_bpd = self._prior_bpd(x_start)
+        total_bpd = vb.sum(dim=1) + prior_bpd
+        return {
+            "total_bpd": total_bpd,
+            "prior_bpd": prior_bpd,
+            "vb": vb,
+            "xstart_mse": xstart_mse,
+            "mse": mse,
+        }
+def _extract_into_tensor(arr, timesteps, broadcast_shape):
+    """
+    Extract values from a 1-D numpy array for a batch of indices.
+    :param arr: the 1-D numpy array.
+    :param timesteps: a tensor of indices into the array to extract.
+    :param broadcast_shape: a larger shape of K dimensions with the batch
+                            dimension equal to the length of timesteps.
+    :return: a tensor of shape [batch_size, 1, ...] where the shape has K dims.
+    """
+    res = th.from_numpy(arr).to(device=timesteps.device)[timesteps].float()
+    while len(res.shape) < len(broadcast_shape):
+        res = res[..., None]
+    return res + th.zeros(broadcast_shape, device=timesteps.device)

diffusion/respace.py ADDED Viewed

	@@ -0,0 +1,129 @@

+# Modified from OpenAI's diffusion repos
+#     GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
+#     ADM:   https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
+#     IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
+import numpy as np
+import torch as th
+from .gaussian_diffusion import GaussianDiffusion
+def space_timesteps(num_timesteps, section_counts):
+    """
+    Create a list of timesteps to use from an original diffusion process,
+    given the number of timesteps we want to take from equally-sized portions
+    of the original process.
+    For example, if there's 300 timesteps and the section counts are [10,15,20]
+    then the first 100 timesteps are strided to be 10 timesteps, the second 100
+    are strided to be 15 timesteps, and the final 100 are strided to be 20.
+    If the stride is a string starting with "ddim", then the fixed striding
+    from the DDIM paper is used, and only one section is allowed.
+    :param num_timesteps: the number of diffusion steps in the original
+                          process to divide up.
+    :param section_counts: either a list of numbers, or a string containing
+                           comma-separated numbers, indicating the step count
+                           per section. As a special case, use "ddimN" where N
+                           is a number of steps to use the striding from the
+                           DDIM paper.
+    :return: a set of diffusion steps from the original process to use.
+    """
+    if isinstance(section_counts, str):
+        if section_counts.startswith("ddim"):
+            desired_count = int(section_counts[len("ddim") :])
+            for i in range(1, num_timesteps):
+                if len(range(0, num_timesteps, i)) == desired_count:
+                    return set(range(0, num_timesteps, i))
+            raise ValueError(
+                f"cannot create exactly {num_timesteps} steps with an integer stride"
+            )
+        section_counts = [int(x) for x in section_counts.split(",")]
+    size_per = num_timesteps // len(section_counts)
+    extra = num_timesteps % len(section_counts)
+    start_idx = 0
+    all_steps = []
+    for i, section_count in enumerate(section_counts):
+        size = size_per + (1 if i < extra else 0)
+        if size < section_count:
+            raise ValueError(
+                f"cannot divide section of {size} steps into {section_count}"
+            )
+        if section_count <= 1:
+            frac_stride = 1
+        else:
+            frac_stride = (size - 1) / (section_count - 1)
+        cur_idx = 0.0
+        taken_steps = []
+        for _ in range(section_count):
+            taken_steps.append(start_idx + round(cur_idx))
+            cur_idx += frac_stride
+        all_steps += taken_steps
+        start_idx += size
+    return set(all_steps)
+class SpacedDiffusion(GaussianDiffusion):
+    """
+    A diffusion process which can skip steps in a base diffusion process.
+    :param use_timesteps: a collection (sequence or set) of timesteps from the
+                          original diffusion process to retain.
+    :param kwargs: the kwargs to create the base diffusion process.
+    """
+    def __init__(self, use_timesteps, **kwargs):
+        self.use_timesteps = set(use_timesteps)
+        self.timestep_map = []
+        self.original_num_steps = len(kwargs["betas"])
+        base_diffusion = GaussianDiffusion(**kwargs)  # pylint: disable=missing-kwoa
+        last_alpha_cumprod = 1.0
+        new_betas = []
+        for i, alpha_cumprod in enumerate(base_diffusion.alphas_cumprod):
+            if i in self.use_timesteps:
+                new_betas.append(1 - alpha_cumprod / last_alpha_cumprod)
+                last_alpha_cumprod = alpha_cumprod
+                self.timestep_map.append(i)
+        kwargs["betas"] = np.array(new_betas)
+        super().__init__(**kwargs)
+    def p_mean_variance(
+        self, model, *args, **kwargs
+    ):  # pylint: disable=signature-differs
+        return super().p_mean_variance(self._wrap_model(model), *args, **kwargs)
+    def training_losses(
+        self, model, *args, **kwargs
+    ):  # pylint: disable=signature-differs
+        return super().training_losses(self._wrap_model(model), *args, **kwargs)
+    def condition_mean(self, cond_fn, *args, **kwargs):
+        return super().condition_mean(self._wrap_model(cond_fn), *args, **kwargs)
+    def condition_score(self, cond_fn, *args, **kwargs):
+        return super().condition_score(self._wrap_model(cond_fn), *args, **kwargs)
+    def _wrap_model(self, model):
+        if isinstance(model, _WrappedModel):
+            return model
+        return _WrappedModel(
+            model, self.timestep_map, self.original_num_steps
+        )
+    def _scale_timesteps(self, t):
+        # Scaling is done by the wrapped model.
+        return t
+class _WrappedModel:
+    def __init__(self, model, timestep_map, original_num_steps):
+        self.model = model
+        self.timestep_map = timestep_map
+        # self.rescale_timesteps = rescale_timesteps
+        self.original_num_steps = original_num_steps
+    def __call__(self, x, ts, **kwargs):
+        map_tensor = th.tensor(self.timestep_map, device=ts.device, dtype=ts.dtype)
+        new_ts = map_tensor[ts]
+        # if self.rescale_timesteps:
+        #     new_ts = new_ts.float() * (1000.0 / self.original_num_steps)
+        return self.model(x, new_ts, **kwargs)

diffusion/timestep_sampler.py ADDED Viewed

	@@ -0,0 +1,150 @@

+# Modified from OpenAI's diffusion repos
+#     GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
+#     ADM:   https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
+#     IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
+from abc import ABC, abstractmethod
+import numpy as np
+import torch as th
+import torch.distributed as dist
+def create_named_schedule_sampler(name, diffusion):
+    """
+    Create a ScheduleSampler from a library of pre-defined samplers.
+    :param name: the name of the sampler.
+    :param diffusion: the diffusion object to sample for.
+    """
+    if name == "uniform":
+        return UniformSampler(diffusion)
+    elif name == "loss-second-moment":
+        return LossSecondMomentResampler(diffusion)
+    else:
+        raise NotImplementedError(f"unknown schedule sampler: {name}")
+class ScheduleSampler(ABC):
+    """
+    A distribution over timesteps in the diffusion process, intended to reduce
+    variance of the objective.
+    By default, samplers perform unbiased importance sampling, in which the
+    objective's mean is unchanged.
+    However, subclasses may override sample() to change how the resampled
+    terms are reweighted, allowing for actual changes in the objective.
+    """
+    @abstractmethod
+    def weights(self):
+        """
+        Get a numpy array of weights, one per diffusion step.
+        The weights needn't be normalized, but must be positive.
+        """
+    def sample(self, batch_size, device):
+        """
+        Importance-sample timesteps for a batch.
+        :param batch_size: the number of timesteps.
+        :param device: the torch device to save to.
+        :return: a tuple (timesteps, weights):
+                 - timesteps: a tensor of timestep indices.
+                 - weights: a tensor of weights to scale the resulting losses.
+        """
+        w = self.weights()
+        p = w / np.sum(w)
+        indices_np = np.random.choice(len(p), size=(batch_size,), p=p)
+        indices = th.from_numpy(indices_np).long().to(device)
+        weights_np = 1 / (len(p) * p[indices_np])
+        weights = th.from_numpy(weights_np).float().to(device)
+        return indices, weights
+class UniformSampler(ScheduleSampler):
+    def __init__(self, diffusion):
+        self.diffusion = diffusion
+        self._weights = np.ones([diffusion.num_timesteps])
+    def weights(self):
+        return self._weights
+class LossAwareSampler(ScheduleSampler):
+    # Base class for samplers whose weights are updated from observed losses.
+    def update_with_local_losses(self, local_ts, local_losses):
+        """
+        Update the reweighting using losses from a model.
+        Call this method from each rank with a batch of timesteps and the
+        corresponding losses for each of those timesteps.
+        This method will perform synchronization to make sure all of the ranks
+        maintain the exact same reweighting.
+        :param local_ts: an integer Tensor of timesteps.
+        :param local_losses: a 1D Tensor of losses.
+        """
+        # First exchange every rank's batch size.
+        batch_sizes = [
+            th.tensor([0], dtype=th.int32, device=local_ts.device)
+            for _ in range(dist.get_world_size())
+        ]
+        dist.all_gather(
+            batch_sizes,
+            th.tensor([len(local_ts)], dtype=th.int32, device=local_ts.device),
+        )
+        # Pad all_gather batches to be the maximum batch size.
+        batch_sizes = [x.item() for x in batch_sizes]
+        max_bs = max(batch_sizes)
+        # .to(tensor) makes each receive buffer match that tensor's dtype and device.
+        timestep_batches = [th.zeros(max_bs).to(local_ts) for bs in batch_sizes]
+        loss_batches = [th.zeros(max_bs).to(local_losses) for bs in batch_sizes]
+        # NOTE(review): receive buffers are sized max_bs, but local_ts and
+        # local_losses are sent unpadded; confirm all ranks use the same batch size.
+        dist.all_gather(timestep_batches, local_ts)
+        dist.all_gather(loss_batches, local_losses)
+        # Drop the padding by keeping only the first `bs` entries from each rank.
+        timesteps = [
+            x.item() for y, bs in zip(timestep_batches, batch_sizes) for x in y[:bs]
+        ]
+        losses = [x.item() for y, bs in zip(loss_batches, batch_sizes) for x in y[:bs]]
+        self.update_with_all_losses(timesteps, losses)
+    @abstractmethod
+    def update_with_all_losses(self, ts, losses):
+        """
+        Update the reweighting using losses from a model.
+        Sub-classes should override this method to update the reweighting
+        using losses from the model.
+        This method directly updates the reweighting without synchronizing
+        between workers. It is called by update_with_local_losses from all
+        ranks with identical arguments. Thus, it should have deterministic
+        behavior to maintain state across workers.
+        :param ts: a list of int timesteps.
+        :param losses: a list of float losses, one per timestep.
+        """
+class LossSecondMomentResampler(LossAwareSampler):
+    def __init__(self, diffusion, history_per_term=10, uniform_prob=0.001):
+        self.diffusion = diffusion
+        self.history_per_term = history_per_term
+        self.uniform_prob = uniform_prob
+        self._loss_history = np.zeros(
+            [diffusion.num_timesteps, history_per_term], dtype=np.float64
+        )
+        self._loss_counts = np.zeros([diffusion.num_timesteps], dtype=np.int)
+    def weights(self):
+        if not self._warmed_up():
+            return np.ones([self.diffusion.num_timesteps], dtype=np.float64)
+        weights = np.sqrt(np.mean(self._loss_history ** 2, axis=-1))
+        weights /= np.sum(weights)
+        weights *= 1 - self.uniform_prob
+        weights += self.uniform_prob / len(weights)
+        return weights
+    def update_with_all_losses(self, ts, losses):
+        for t, loss in zip(ts, losses):
+            if self._loss_counts[t] == self.history_per_term:
+                # Shift out the oldest loss term.
+                self._loss_history[t, :-1] = self._loss_history[t, 1:]
+                self._loss_history[t, -1] = loss
+            else:
+                self._loss_history[t, self._loss_counts[t]] = loss
+                self._loss_counts[t] += 1
+    def _warmed_up(self):
+        return (self._loss_counts == self.history_per_term).all()

gtzan-ck/model_epoch_20000.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:47347904d66464d7c77044b00ec00c6c24ce4a034df87f8c3f735564b2a328cb
+size 392135773

gtzan-test.csv ADDED Viewed

	@@ -0,0 +1,101 @@

+ids,descri
+classical_classical.00019,classical
+rock_rock.00092,rock
+reggae_reggae.00083,reggae
+reggae_reggae.00087,reggae
+country_country.00020,country
+reggae_reggae.00080,reggae
+metal_metal.00061,metal
+jazz_jazz.00058,jazz
+disco_disco.00033,disco
+metal_metal.00068,metal
+rock_rock.00096,rock
+pop_pop.00070,pop
+blues_blues.00001,blues
+jazz_jazz.00050,jazz
+country_country.00022,country
+rock_rock.00095,rock
+metal_metal.00066,metal
+disco_disco.00032,disco
+pop_pop.00071,pop
+blues_blues.00006,blues
+disco_disco.00039,disco
+reggae_reggae.00081,reggae
+reggae_reggae.00088,reggae
+hiphop_hiphop.00046,hiphop
+country_country.00023,country
+hiphop_hiphop.00040,hiphop
+classical_classical.00012,classical
+reggae_reggae.00084,reggae
+reggae_reggae.00085,reggae
+hiphop_hiphop.00043,hiphop
+jazz_jazz.00052,jazz
+blues_blues.00004,blues
+disco_disco.00037,disco
+hiphop_hiphop.00047,hiphop
+pop_pop.00076,pop
+classical_classical.00014,classical
+rock_rock.00090,rock
+classical_classical.00013,classical
+blues_blues.00002,blues
+rock_rock.00098,rock
+hiphop_hiphop.00044,hiphop
+rock_rock.00099,rock
+metal_metal.00065,metal
+metal_metal.00062,metal
+blues_blues.00007,blues
+pop_pop.00073,pop
+jazz_jazz.00053,jazz
+country_country.00024,country
+pop_pop.00078,pop
+blues_blues.00000,blues
+jazz_jazz.00055,jazz
+blues_blues.00003,blues
+hiphop_hiphop.00041,hiphop
+hiphop_hiphop.00048,hiphop
+pop_pop.00077,pop
+metal_metal.00067,metal
+reggae_reggae.00089,reggae
+jazz_jazz.00056,jazz
+hiphop_hiphop.00049,hiphop
+disco_disco.00038,disco
+jazz_jazz.00057,jazz
+reggae_reggae.00082,reggae
+rock_rock.00091,rock
+metal_metal.00060,metal
+country_country.00028,country
+pop_pop.00075,pop
+rock_rock.00094,rock
+classical_classical.00010,classical
+rock_rock.00097,rock
+jazz_jazz.00051,jazz
+country_country.00025,country
+country_country.00029,country
+country_country.00027,country
+pop_pop.00072,pop
+metal_metal.00063,metal
+classical_classical.00011,classical
+blues_blues.00008,blues
+classical_classical.00018,classical
+pop_pop.00079,pop
+jazz_jazz.00059,jazz
+disco_disco.00034,disco
+country_country.00021,country
+hiphop_hiphop.00045,hiphop
+reggae_reggae.00086,reggae
+metal_metal.00069,metal
+classical_classical.00016,classical
+classical_classical.00015,classical
+disco_disco.00036,disco
+blues_blues.00009,blues
+country_country.00026,country
+jazz_jazz.00054,jazz
+disco_disco.00035,disco
+pop_pop.00074,pop
+rock_rock.00093,rock
+hiphop_hiphop.00042,hiphop
+disco_disco.00031,disco
+blues_blues.00005,blues
+disco_disco.00030,disco
+classical_classical.00017,classical
+metal_metal.00064,metal

models.py ADDED Viewed

	@@ -0,0 +1,375 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+# --------------------------------------------------------
+# References:
+# GLIDE: https://github.com/openai/glide-text2im
+# MAE: https://github.com/facebookresearch/mae/blob/main/models_mae.py
+# --------------------------------------------------------
+import torch
+import torch.nn as nn
+import numpy as np
+import math
+from timm.models.vision_transformer import PatchEmbed, Attention, Mlp
+def modulate(x, shift, scale):
+    return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
+#################################################################################
+#               Embedding Layers for Timesteps and Class Labels                 #
+#################################################################################
+class TimestepEmbedder(nn.Module):
+    """
+    Embeds scalar timesteps into vector representations.
+    """
+    def __init__(self, hidden_size, frequency_embedding_size=256):
+        super().__init__()
+        self.mlp = nn.Sequential(
+            nn.Linear(frequency_embedding_size, hidden_size, bias=True),
+            nn.SiLU(),
+            nn.Linear(hidden_size, hidden_size, bias=True),
+        )
+        self.frequency_embedding_size = frequency_embedding_size
+    @staticmethod
+    def timestep_embedding(t, dim, max_period=10000):
+        """
+        Create sinusoidal timestep embeddings.
+        :param t: a 1-D Tensor of N indices, one per batch element.
+                          These may be fractional.
+        :param dim: the dimension of the output.
+        :param max_period: controls the minimum frequency of the embeddings.
+        :return: an (N, D) Tensor of positional embeddings.
+        """
+        # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
+        half = dim // 2
+        freqs = torch.exp(
+            -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
+        ).to(device=t.device)
+        args = t[:, None].float() * freqs[None]
+        embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
+        if dim % 2:
+            embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
+        return embedding
+    def forward(self, t):
+        t_freq = self.timestep_embedding(t, self.frequency_embedding_size)
+        t_emb = self.mlp(t_freq)
+        return t_emb
+class LabelEmbedder(nn.Module):
+    """
+    Embeds class labels into vector representations. Also handles label dropout for classifier-free guidance.
+    """
+    def __init__(self, num_classes, hidden_size, dropout_prob):
+        super().__init__()
+        use_cfg_embedding = dropout_prob > 0
+        self.embedding_table = nn.Embedding(num_classes + use_cfg_embedding, hidden_size)
+        self.num_classes = num_classes
+        self.dropout_prob = dropout_prob
+    def token_drop(self, labels, force_drop_ids=None):
+        """
+        Drops labels to enable classifier-free guidance.
+        """
+        if force_drop_ids is None:
+            drop_ids = torch.rand(labels.shape[0], device=labels.device) < self.dropout_prob
+        else:
+            drop_ids = force_drop_ids == 1
+        labels = torch.where(drop_ids, self.num_classes, labels)
+        return labels
+    def forward(self, labels, train, force_drop_ids=None):
+        use_dropout = self.dropout_prob > 0
+        if (train and use_dropout) or (force_drop_ids is not None):
+            labels = self.token_drop(labels, force_drop_ids)
+        embeddings = self.embedding_table(labels)
+        return embeddings
+#################################################################################
+#                                 Core DiT Model                                #
+#################################################################################
+class DiTBlock(nn.Module):
+    """
+    A DiT block with adaptive layer norm zero (adaLN-Zero) conditioning.
+    """
+    def __init__(self, hidden_size, num_heads, mlp_ratio=4.0, **block_kwargs):
+        super().__init__()
+        self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
+        self.attn = Attention(hidden_size, num_heads=num_heads, qkv_bias=True, **block_kwargs)
+        self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
+        mlp_hidden_dim = int(hidden_size * mlp_ratio)
+        approx_gelu = lambda: nn.GELU(approximate="tanh")
+        self.mlp = Mlp(in_features=hidden_size, hidden_features=mlp_hidden_dim, act_layer=approx_gelu, drop=0)
+        self.adaLN_modulation = nn.Sequential(
+            nn.SiLU(),
+            nn.Linear(hidden_size, 6 * hidden_size, bias=True)
+        )
+    def forward(self, x, c):
+        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(c).chunk(6, dim=1)
+        x = x + gate_msa.unsqueeze(1) * self.attn(modulate(self.norm1(x), shift_msa, scale_msa))
+        x = x + gate_mlp.unsqueeze(1) * self.mlp(modulate(self.norm2(x), shift_mlp, scale_mlp))
+        return x
+class FinalLayer(nn.Module):
+    """
+    The final layer of DiT.
+    """
+    def __init__(self, hidden_size, patch_size, out_channels):
+        super().__init__()
+        self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
+        self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True)
+        self.adaLN_modulation = nn.Sequential(
+            nn.SiLU(),
+            nn.Linear(hidden_size, 2 * hidden_size, bias=True)
+        )
+    def forward(self, x, c):
+        shift, scale = self.adaLN_modulation(c).chunk(2, dim=1)
+        x = modulate(self.norm_final(x), shift, scale)
+        x = self.linear(x)
+        return x
+class DiT(nn.Module):
+    """
+    Diffusion model with a Transformer backbone.
+    """
+    def __init__(
+        self,
+        input_size=32,
+        patch_size=2,
+        in_channels=4,
+        hidden_size=1152,
+        depth=28,
+        num_heads=16,
+        mlp_ratio=4.0,
+        #num_classes=1000,
+        learn_sigma=True,
+    ):
+        super().__init__()
+        self.learn_sigma = learn_sigma
+        self.in_channels = in_channels
+        self.out_channels = in_channels * 2 if learn_sigma else in_channels
+        self.patch_size = patch_size
+        self.num_heads = num_heads
+        #self.x_embedder = PatchEmbed(input_size, patch_size, in_channels, hidden_size, bias=True)
+        self.x_embedder = PatchEmbed(input_size, patch_size, in_channels, hidden_size, bias=True)
+        self.t_embedder = TimestepEmbedder(hidden_size)
+        #self.y_embedder = LabelEmbedder(num_classes, hidden_size, class_dropout_prob)
+        num_patches = self.x_embedder.num_patches
+        # Will use fixed sin-cos embedding:
+        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches, hidden_size), requires_grad=False)
+        self.blocks = nn.ModuleList([
+            DiTBlock(hidden_size, num_heads, mlp_ratio=mlp_ratio) for _ in range(depth)
+        ])
+        self.final_layer = FinalLayer(hidden_size, patch_size, self.out_channels)
+        self.initialize_weights()
+    def initialize_weights(self):
+        # Initialize transformer layers:
+        def _basic_init(module):
+            if isinstance(module, nn.Linear):
+                torch.nn.init.xavier_uniform_(module.weight)
+                if module.bias is not None:
+                    nn.init.constant_(module.bias, 0)
+        self.apply(_basic_init)
+        # Initialize (and freeze) pos_embed by sin-cos embedding:
+        #pos_embed = get_2d_sincos_pos_embed(self.pos_embed.shape[-1], int(self.x_embedder.num_patches ** 0.5))
+        pos_embed = get_2d_sincos_pos_embed(self.pos_embed.shape[-1], self.x_embedder.grid_size)
+        self.pos_embed.data.copy_(torch.from_numpy(pos_embed).float().unsqueeze(0))
+        # Initialize patch_embed like nn.Linear (instead of nn.Conv2d):
+        w = self.x_embedder.proj.weight.data
+        nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
+        nn.init.constant_(self.x_embedder.proj.bias, 0)
+        # Initialize label embedding table:
+        #nn.init.normal_(self.y_embedder.embedding_table.weight, std=0.02)
+        # Initialize timestep embedding MLP:
+        nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
+        nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)
+        # Zero-out adaLN modulation layers in DiT blocks:
+        for block in self.blocks:
+            nn.init.constant_(block.adaLN_modulation[-1].weight, 0)
+            nn.init.constant_(block.adaLN_modulation[-1].bias, 0)
+        # Zero-out output layers:
+        nn.init.constant_(self.final_layer.adaLN_modulation[-1].weight, 0)
+        nn.init.constant_(self.final_layer.adaLN_modulation[-1].bias, 0)
+        nn.init.constant_(self.final_layer.linear.weight, 0)
+        nn.init.constant_(self.final_layer.linear.bias, 0)
+    def unpatchify(self, x):
+        """
+        x: (N, T, patch_size**2 * C)
+        imgs: (N, H, W, C)
+        """
+        c = self.out_channels
+        p = self.x_embedder.patch_size[0]
+        #h = w = int(x.shape[1] ** 0.5)
+        h = int(self.x_embedder.grid_size[0])
+        w = int(self.x_embedder.grid_size[1])
+        #assert h * w == x.shape[1]
+        x = x.reshape(shape=(x.shape[0], h, w, p, p, c))
+        x = torch.einsum('nhwpqc->nchpwq', x)
+        #imgs = x.reshape(shape=(x.shape[0], c, h * p, h * p))
+        imgs = x.reshape(shape=(x.shape[0], c, h * p, w * p))
+        return imgs
+    def forward(self, x, t, y):
+        """
+        Forward pass of DiT.
+        x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images)
+        t: (N,) tensor of diffusion timesteps
+        y: (N,) tensor of class labels
+        """
+        x = self.x_embedder(x) + self.pos_embed  # (N, T, D), where T = H * W / patch_size ** 2
+        t = self.t_embedder(t)                   # (N, D)
+        #y = self.y_embedder(y, self.training)    # (N, D)
+        y = y.squeeze(dim=1)
+        c = t + y                                # (N, D)
+        for block in self.blocks:
+            x = block(x, c)                      # (N, T, D)
+        x = self.final_layer(x, c)                # (N, T, patch_size ** 2 * out_channels)
+        x = self.unpatchify(x)                   # (N, out_channels, H, W)
+        return x
+    def forward_with_cfg(self, x, t, y, cfg_scale):
+        """
+        Forward pass of DiT, but also batches the unconditional forward pass for classifier-free guidance.
+        """
+        # https://github.com/openai/glide-text2im/blob/main/notebooks/text2im.ipynb
+        half = x[: len(x) // 2]
+        combined = torch.cat([half, half], dim=0)
+        model_out = self.forward(combined, t, y)
+        # For exact reproducibility reasons, we apply classifier-free guidance on only
+        # three channels by default. The standard approach to cfg applies it to all channels.
+        # This can be done by uncommenting the following line and commenting-out the line following that.
+        # eps, rest = model_out[:, :self.in_channels], model_out[:, self.in_channels:]
+        eps, rest = model_out[:, :3], model_out[:, 3:]
+        cond_eps, uncond_eps = torch.split(eps, len(eps) // 2, dim=0)
+        half_eps = uncond_eps + cfg_scale * (cond_eps - uncond_eps)
+        eps = torch.cat([half_eps, half_eps], dim=0)
+        return torch.cat([eps, rest], dim=1)
+#################################################################################
+#                   Sine/Cosine Positional Embedding Functions                  #
+#################################################################################
+# https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py
+def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0):
+    """
+    grid_size: int of the grid height and width
+    return:
+    pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
+    """
+    grid_h = np.arange(grid_size[0], dtype=np.float32)
+    grid_w = np.arange(grid_size[1], dtype=np.float32)
+    grid = np.meshgrid(grid_w, grid_h)  # here w goes first
+    grid = np.stack(grid, axis=0)
+    grid = grid.reshape([2, 1, grid_size[0], grid_size[1]])
+    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
+    if cls_token and extra_tokens > 0:
+        pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0)
+    return pos_embed
+def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
+    assert embed_dim % 2 == 0
+    # use half of dimensions to encode grid_h
+    emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0])  # (H*W, D/2)
+    emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1])  # (H*W, D/2)
+    emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D)
+    return emb
+def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
+    """
+    embed_dim: output dimension for each position
+    pos: a list of positions to be encoded: size (M,)
+    out: (M, D)
+    """
+    assert embed_dim % 2 == 0
+    omega = np.arange(embed_dim // 2, dtype=np.float64)
+    omega /= embed_dim / 2.
+    omega = 1. / 10000**omega  # (D/2,)
+    pos = pos.reshape(-1)  # (M,)
+    out = np.einsum('m,d->md', pos, omega)  # (M, D/2), outer product
+    emb_sin = np.sin(out) # (M, D/2)
+    emb_cos = np.cos(out) # (M, D/2)
+    emb = np.concatenate([emb_sin, emb_cos], axis=1)  # (M, D)
+    return emb
+#################################################################################
+#                                   DiT Configs                                  #
+#################################################################################
+# Factory functions named DiT_<size>_<patch_size>. Each fixes depth, hidden_size,
+# patch_size and num_heads and forwards every other keyword argument to DiT.
+def DiT_XL_2(**kwargs):
+    return DiT(depth=28, hidden_size=1152, patch_size=2, num_heads=16, **kwargs)
+def DiT_XL_4(**kwargs):
+    return DiT(depth=28, hidden_size=1152, patch_size=4, num_heads=16, **kwargs)
+def DiT_XL_8(**kwargs):
+    return DiT(depth=28, hidden_size=1152, patch_size=8, num_heads=16, **kwargs)
+def DiT_L_2(**kwargs):
+    return DiT(depth=24, hidden_size=1024, patch_size=2, num_heads=16, **kwargs)
+def DiT_L_4(**kwargs):
+    return DiT(depth=24, hidden_size=1024, patch_size=4, num_heads=16, **kwargs)
+def DiT_L_8(**kwargs):
+    return DiT(depth=24, hidden_size=1024, patch_size=8, num_heads=16, **kwargs)
+def DiT_B_2(**kwargs):
+    return DiT(depth=12, hidden_size=768, patch_size=2, num_heads=12, **kwargs)
+def DiT_B_4(**kwargs):
+    return DiT(depth=12, hidden_size=768, patch_size=4, num_heads=12, **kwargs)
+def DiT_B_8(**kwargs):
+    return DiT(depth=12, hidden_size=768, patch_size=8, num_heads=12, **kwargs)
+def DiT_S_2(**kwargs):
+    return DiT(depth=12, hidden_size=384, patch_size=2, num_heads=6, **kwargs)
+def DiT_S_4(**kwargs):
+    return DiT(depth=12, hidden_size=384, patch_size=4, num_heads=6, **kwargs)
+def DiT_S_8(**kwargs):
+    return DiT(depth=12, hidden_size=384, patch_size=8, num_heads=6, **kwargs)
+# Registry mapping the 'DiT-<size>/<patch_size>' names to the factories above.
+DiT_models = {
+    'DiT-XL/2': DiT_XL_2,  'DiT-XL/4': DiT_XL_4,  'DiT-XL/8': DiT_XL_8,
+    'DiT-L/2':  DiT_L_2,   'DiT-L/4':  DiT_L_4,   'DiT-L/8':  DiT_L_8,
+    'DiT-B/2':  DiT_B_2,   'DiT-B/4':  DiT_B_4,   'DiT-B/8':  DiT_B_8,
+    'DiT-S/2':  DiT_S_2,   'DiT-S/4':  DiT_S_4,   'DiT-S/8':  DiT_S_8,
+}

requirement.txt ADDED Viewed

	@@ -0,0 +1,26 @@

+absl-py==2.1.0
+aiohttp==3.10.0
+attrs==23.2.0
+audioread==3.0.1
+cffi==1.16.0
+datasets==2.20.0
+einops==0.8.0
+fsspec==2024.5.0
+GitPython==3.1.43
+h5py==3.11.0
+huggingface-hub==0.24.5
+joblib==1.4.2
+librosa==0.10.2.post1
+numpy==1.26.4
+pandas==2.2.2
+pydub==0.25.1
+scipy==1.13.1
+sentence-transformers==3.1.0
+six==1.16.0
+soundfile==0.12.1
+timm==0.9.2
+tqdm==4.66.4
+torch==2.0.0
+torchmetrics==1.4.1
+transformers==4.43.3
+tensorboard

sample.py ADDED Viewed

	@@ -0,0 +1,135 @@

+import os
+import torch
+import h5py
+import random
+import numpy as np
+import soundfile as sf
+from models import DiT
+from diffusion import create_diffusion
+from tqdm import tqdm
+import sys
+sys.path.append('./tools/bigvgan_v2_22khz_80band_256x')
+from bigvgan import BigVGAN
+from torch import nn
+import torch.nn.functional as F
+import argparse
+device = 'cuda:1' if torch.cuda.is_available() else 'cpu'
+class MelToAudio_bigvgan(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.vocoder = BigVGAN.from_pretrained('/home/zheqid/workspace/music_dit/bigvgan_v2_22khz_80band_256x', use_cuda_kernel=False)
+        self.vocoder.remove_weight_norm()
+    def __call__(self, z):
+        x = self.mel_to_audio(z)
+        return x
+    def mel_to_audio(self, x):
+        with torch.no_grad():
+            self.vocoder.eval()
+            y = self.vocoder(x[:, :, :])
+            y = y.squeeze(0)
+        return y
+# Module-level vocoder instance, created when the module is loaded; main() passes it to save_audio.
+vocoder = MelToAudio_bigvgan().to(device)
+def load_trained_model(checkpoint_path):
+    model = DiT(
+        input_size=(80, 800),
+        patch_size=8,
+        in_channels=1,
+        hidden_size=384,
+        depth=12,
+        num_heads=6,
+    )
+    model.to(device)
+    checkpoint = torch.load(checkpoint_path)
+    model.load_state_dict(checkpoint['model_state_dict'])
+    model.eval()
+    return model
+def load_all_meta_and_mel_from_h5(h5_file):
+    with h5py.File(h5_file, 'r') as f:
+        keys = list(f.keys())
+        for key in keys:
+            meta_latent = torch.FloatTensor(f[key]['meta'][:]).to(device)
+            mel = torch.FloatTensor(f[key]['mel'][:]).to(device)
+            yield key, meta_latent, mel
+def extract_random_mel_segment(mel, segment_length=800):
+    total_length = mel.shape[2]
+    if total_length > segment_length:
+        start = np.random.randint(0, total_length - segment_length)
+        mel_segment = mel[:, :, start:start + segment_length]
+    else:
+        padding = segment_length - total_length
+        mel_segment = F.pad(mel, (0, padding), mode='constant', value=0)
+    mel_segment = (mel_segment + 10) / 20
+    return mel_segment
+def infer_and_generate_audio(model, diffusion, meta_latent):
+    latent_size = (80, 800)
+    z = torch.randn(1, 1, latent_size[0], latent_size[1], device=device)
+    model_kwargs = dict(y=meta_latent)
+    with torch.no_grad():
+        samples = diffusion.p_sample_loop(
+            model.forward, z.shape, z, clip_denoised=False, model_kwargs=model_kwargs, progress=True, device=device
+        )
+    return samples
+def save_audio(mel, vocoder, output_path, sample_rate=24000):
+    with torch.no_grad():
+        if mel.dim() == 4 and mel.shape[1] == 1:
+            mel = mel[0, 0, :, :]
+        elif mel.dim() == 3 and mel.shape[0] == 1:
+            mel = mel[0]
+        else:
+            raise ValueError(f"Unexpected mel shape: {mel.shape}")
+        mel = mel.unsqueeze(0)
+        wav = vocoder(mel * 20 - 10).cpu().numpy()
+    sf.write(output_path, wav[0], samplerate=sample_rate)
+    print(f"Saved audio to: {output_path}")
+def main():
+    parser = argparse.ArgumentParser(description='Generate audio using DiT and BigVGAN')
+    parser.add_argument('--checkpoint', type=str, required=True, help='Path to model checkpoint')
+    parser.add_argument('--h5_file', type=str, required=True, help='Path to input H5 file')
+    parser.add_argument('--output_gt_dir', type=str, required=True, help='Directory to save ground truth audio')
+    parser.add_argument('--output_gen_dir', type=str, required=True, help='Directory to save generated audio')
+    parser.add_argument('--segment_length', type=int, default=800, help='Segment length for mel slices (default: 800)')
+    parser.add_argument('--sample_rate', type=int, default=22050, help='Sample rate for output audio (default: 24000)')
+    args = parser.parse_args()
+    model = load_trained_model(args.checkpoint)
+    diffusion = create_diffusion(timestep_respacing="")
+    for i, (key, meta_latent, mel) in enumerate(tqdm(load_all_meta_and_mel_from_h5(args.h5_file))):
+        mel_segment = extract_random_mel_segment(mel, segment_length=args.segment_length)
+        ground_truth_wav_path = os.path.join(args.output_gt_dir, f"{key}.wav")
+        save_audio(mel_segment, vocoder, ground_truth_wav_path, sample_rate=args.sample_rate)
+        generated_mel = infer_and_generate_audio(model, diffusion, meta_latent)
+        output_wav_path = os.path.join(args.output_gen_dir, f"{key}.wav")
+        save_audio(generated_mel, vocoder, output_wav_path, sample_rate=args.sample_rate)
+if __name__ == "__main__":
+    main()
+### how to use
+'''
+python sample.py --checkpoint ./gtzan-ck/model_epoch_20000.pt \
+                      --h5_file ./dataset/gtzan_test.h5 \
+                      --output_gt_dir ./sample/gt \
+                      --output_gen_dir ./sample/gn \
+                      --segment_length 800 \
+                      --sample_rate 22050
+'''

sample/gn/blues_blues.00000.mp3 ADDED Viewed

Binary file (58.2 kB). View file

sample/gn/blues_blues.00001.mp3 ADDED Viewed

Binary file (57.9 kB). View file

sample/gn/blues_blues.00002.mp3 ADDED Viewed

Binary file (56.7 kB). View file

sample/gn/blues_blues.00003.mp3 ADDED Viewed

Binary file (55.7 kB). View file

sample/gn/blues_blues.00004.mp3 ADDED Viewed

Binary file (53.4 kB). View file

sample/gn/blues_blues.00005.mp3 ADDED Viewed

Binary file (60.2 kB). View file

sample/gn/blues_blues.00006.mp3 ADDED Viewed

Binary file (53.8 kB). View file

sample/gn/blues_blues.00007.mp3 ADDED Viewed

Binary file (55 kB). View file

sample/gn/blues_blues.00008.mp3 ADDED Viewed

Binary file (54.8 kB). View file

sample/gn/blues_blues.00009.mp3 ADDED Viewed

Binary file (54.1 kB). View file

sample/gt/blues_blues.00000.mp3 ADDED Viewed

Binary file (54.8 kB). View file

sample/gt/blues_blues.00001.mp3 ADDED Viewed

Binary file (55.4 kB). View file

sample/gt/blues_blues.00002.mp3 ADDED Viewed

Binary file (63.9 kB). View file

sample/gt/blues_blues.00003.mp3 ADDED Viewed

Binary file (57.2 kB). View file

sample/gt/blues_blues.00004.mp3 ADDED Viewed

Binary file (59.8 kB). View file

sample/gt/blues_blues.00005.mp3 ADDED Viewed

Binary file (58.2 kB). View file

sample/gt/blues_blues.00006.mp3 ADDED Viewed

Binary file (60.4 kB). View file

sample/gt/blues_blues.00007.mp3 ADDED Viewed

Binary file (59.6 kB). View file

sample/gt/blues_blues.00008.mp3 ADDED Viewed

Binary file (56.6 kB). View file

sample/gt/blues_blues.00009.mp3 ADDED Viewed

Binary file (52.6 kB). View file

tools/bigvgan_v2_22khz_80band_256x ADDED Viewed

	@@ -0,0 +1 @@


1	+ Subproject commit 633ff708ed5b74903e86ff1298cf4a98e921c513

tools/gtzan2h5.py ADDED Viewed

	@@ -0,0 +1,160 @@

+import os
+import torch
+import h5py
+import random
+import numpy as np
+from tqdm import tqdm
+from sentence_transformers import SentenceTransformer
+import librosa
+from bigvgan_v2_22khz_80band_256x.meldataset import get_mel_spectrogram
+from types import SimpleNamespace
+from torch import nn
+from einops import rearrange
+import json
+import argparse
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+# Load SentenceTransformer model
+sentence_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
+class AudioToMel_bigvgan(nn.Module):
+    def __init__(self, config_path):
+        super().__init__()
+        # Load configuration file
+        with open(config_path, 'r') as f:
+            self.h = json.load(f, object_hook=lambda d: SimpleNamespace(**d))
+    def __call__(self, audio):
+        x = self.audio_to_mel(audio)  # Extract mel spectrogram
+        return x
+    def audio_to_mel(self, audio):
+        # Convert to mono channel
+        audio = audio[:, 0, :]  # Assuming input is (b, c, t), take first channel
+        audio = torch.tensor(audio)
+        # Extract mel spectrogram
+        x = get_mel_spectrogram(
+            wav=audio[:, :],
+            h=self.h
+        )  # Shape: (b, f, t)
+        return x
+# Initialize BigVGAN Mel extraction model
+audio_to_mel_model = None  # Placeholder, will be initialized later
+def extract_mel_features(audio_path, sr=24000):
+    """
+    Extract Mel features using BigVGAN model, with normalization.
+    :param audio_path: Path to the audio file
+    :param sr: Sampling rate (default 24000)
+    :return: Mel spectrogram
+    """
+    # Load and normalize audio
+    wav, _ = librosa.load(audio_path, sr=sr)
+    max_val = np.max(np.abs(wav))
+    if max_val > 1.0:
+        wav = wav / max_val
+    wav_tensor = torch.FloatTensor(wav).unsqueeze(0).unsqueeze(0).to(device)  # Shape: (1, 1, T)
+    # Extract Mel spectrogram
+    mel_spectrogram = audio_to_mel_model(wav_tensor).cpu().numpy()
+    return mel_spectrogram
+def get_embedding_from_folder_name(folder_name):
+    """
+    Convert folder name into embedding using SentenceTransformer.
+    :param folder_name: Name of the folder
+    :return: Corresponding embedding
+    """
+    try:
+        embedding = sentence_model.encode([folder_name])
+        return embedding
+    except Exception as e:
+        print(f"Error encoding label for {folder_name}: {e}")
+        return None
+def process_single_file(file_info):
+    """
+    Process a single audio file and return its key, mel features, and meta embedding.
+    :param file_info: (root_dir, audio_path) tuple
+    :return: (key, mel_features, embedding)
+    """
+    root_dir, audio_path = file_info
+    try:
+        # Get file and folder names
+        file_name_with_ext = os.path.basename(audio_path)
+        folder_name = os.path.basename(os.path.dirname(audio_path))
+        # Extract Mel features
+        mel_features = extract_mel_features(audio_path)
+        # Get embedding from folder name
+        embedding = get_embedding_from_folder_name(folder_name)
+        if embedding is None:
+            return None, None, None
+        key = os.path.relpath(audio_path, root_dir).replace('/', '_').replace('\\', '_')
+        return key, mel_features, embedding
+    except Exception as e:
+        print(f"Error processing {audio_path}: {e}")
+        return None, None, None
+def process_and_save_files(audio_files, output_h5_file):
+    """
+    Process audio files and save Mel features and meta embeddings to an HDF5 file.
+    :param audio_files: List of audio file paths
+    :param output_h5_file: Path to the HDF5 output file
+    """
+    with h5py.File(output_h5_file, 'w') as h5f:
+        for file_info in tqdm(audio_files, desc="Processing audio files"):
+            key, mel_features, embedding = process_single_file(file_info)
+            if key is not None and mel_features is not None and embedding is not None:
+                group = h5f.create_group(key)
+                group.create_dataset('mel', data=mel_features)
+                group.create_dataset('meta', data=embedding)
+def process_audio_files(root_dir, output_h5_file):
+    """
+    Walk through a directory and process all audio files, saving them to an HDF5 file.
+    :param root_dir: Root directory containing audio files
+    :param output_h5_file: Path to the HDF5 output file
+    """
+    audio_files = []
+    for subdir, _, files in os.walk(root_dir):
+        for file in files:
+            if file.endswith('.wav') or file.endswith('.mp3') or file.endswith('.flac'):
+                audio_path = os.path.join(subdir, file)
+                audio_files.append((root_dir, audio_path))
+    random.shuffle(audio_files)
+    print(f"Processing {len(audio_files)} files...")
+    process_and_save_files(audio_files, output_h5_file)
+if __name__ == "__main__":
+    # Argument parser for command line arguments
+    parser = argparse.ArgumentParser(description="Process audio files and extract mel features.")
+    parser.add_argument('--root_dir', type=str, required=True, help='Root directory of the audio files.')
+    parser.add_argument('--output_h5_file', type=str, required=True, help='Output HDF5 file path.')
+    parser.add_argument('--config_path', type=str, required=True, help='Path to the BigVGAN config.json file.')
+    parser.add_argument('--sr', type=int, default=22050, help='Sampling rate (default: 24000).')
+    args = parser.parse_args()
+    # Initialize the BigVGAN Mel extraction model
+    audio_to_mel_model = AudioToMel_bigvgan(args.config_path).to(device)
+    # Process audio files
+    process_audio_files(args.root_dir, args.output_h5_file)
+    print(f"Processing completed. H5 file saved at: {args.output_h5_file}")
+### how to use
+# python gtzan2h5.py --root_dir /path/to/audio/files --output_h5_file /path/to/output.h5 --config_path bigvgan_v2_22khz_80band_256x/config.json --sr 22050

tools/gtzan_split.py ADDED Viewed

	@@ -0,0 +1,98 @@

+import os
+import re
+import argparse
+from pydub import AudioSegment
+class GTZAN:
+    def __init__(self, root_dir, output_dir, labels):
+        """
+        Args:
+            root_dir (str): Root directory of the dataset.
+            output_dir (str): Output directory to save converted MP3 files.
+            labels (list): List of genres in the dataset.
+        """
+        self.root_dir = root_dir
+        self.output_dir = output_dir
+        self.labels = labels
+        # Create output directory structure for MP3 files
+        self.create_output_dirs()
+    def create_output_dirs(self):
+        """Create directories to store train and test audio files"""
+        for split in ['train', 'test']:
+            for genre in self.labels:
+                genre_dir = os.path.join(self.output_dir, split, genre)
+                os.makedirs(genre_dir, exist_ok=True)
+    def split_train_test(self, audio_names, test_fold):
+        """
+        Split the dataset into train and test sets based on test_fold.
+        E.g., test_ids = [30, 31, 32, ..., 39].
+        """
+        test_audio_names = []
+        train_audio_names = []
+        test_ids = range(test_fold * 10, (test_fold + 1) * 10)
+        for audio_name in audio_names:
+            # Extract the numeric ID from the audio file name
+            audio_id = int(re.search(r'\d+', audio_name).group())
+            if audio_id in test_ids:
+                test_audio_names.append(audio_name)
+            else:
+                train_audio_names.append(audio_name)
+        return train_audio_names, test_audio_names
+    def convert_and_save(self, file_path, target_path):
+        """Convert AU format to MP3 and save to target path"""
+        audio = AudioSegment.from_file(file_path, format="au")
+        audio.export(target_path, format="mp3")
+        print(f"Converted and saved {target_path}")
+    def process_genre(self, genre, test_fold):
+        """Process a single genre, split the dataset, and convert formats"""
+        genre_path = os.path.join(self.root_dir, genre)
+        audio_files = os.listdir(genre_path)
+        # Split the dataset
+        train_files, test_files = self.split_train_test(audio_files, test_fold)
+        # Process training set
+        for audio_name in train_files:
+            file_path = os.path.join(genre_path, audio_name)
+            target_path = os.path.join(self.output_dir, 'train', genre, audio_name.replace('.au', '.mp3'))
+            self.convert_and_save(file_path, target_path)
+        # Process test set
+        for audio_name in test_files:
+            file_path = os.path.join(genre_path, audio_name)
+            target_path = os.path.join(self.output_dir, 'test', genre, audio_name.replace('.au', '.mp3'))
+            self.convert_and_save(file_path, target_path)
+    def process_dataset(self):
+        """Process the entire GTZAN dataset and split it into train and test sets"""
+        for idx, genre in enumerate(self.labels):
+            print(f"Processing genre: {genre}...")
+            test_fold = idx % 10  # Each genre has a different test_fold
+            self.process_genre(genre, test_fold)
+if __name__ == "__main__":
+    # Define argument parser
+    parser = argparse.ArgumentParser(description="GTZAN Dataset Converter")
+    parser.add_argument('--root_dir', type=str, required=True, help='Root directory of the GTZAN dataset')
+    parser.add_argument('--output_dir', type=str, required=True, help='Directory to save the converted MP3 files')
+    args = parser.parse_args()
+    # Example genre labels in the GTZAN dataset
+    labels = ["blues", "classical", "country", "disco", "hiphop", "jazz", "metal", "pop", "reggae", "rock"]
+    # Initialize the GTZAN processor
+    gtzan = GTZAN(args.root_dir, args.output_dir, labels)
+    gtzan.process_dataset()
+### How to use
+# python tools/gtzan_split.py --root_dir /path/to/gtzan/genres --output_dir /path/to/output/directory

train.py ADDED Viewed

	@@ -0,0 +1,114 @@

+import os
+import h5py
+import torch
+import random
+import yaml
+import torch.nn.functional as F
+from torch.utils.data import Dataset, DataLoader
+from tqdm import tqdm
+from diffusion import create_diffusion
+from models import DiT
+import torch.optim as optim
+from torch.utils.tensorboard import SummaryWriter  # TensorBoard
+# Load hyperparameters from YAML file
+with open('config/train.yaml', 'r') as file:
+    config = yaml.safe_load(file)
+# Create TensorBoard writer
+writer = SummaryWriter()
+class MelMetaDataset(Dataset):
+    def __init__(self, h5_file, mel_frames):
+        self.h5_file = h5_file
+        self.mel_frames = mel_frames
+        with h5py.File(h5_file, 'r') as f:
+            self.keys = list(f.keys())
+    def __len__(self):
+        return len(self.keys)
+    def pad_mel(self, mel_segment, total_frames):
+        if total_frames < self.mel_frames:
+            padding_frames = self.mel_frames - total_frames
+            mel_segment = F.pad(mel_segment, (0, padding_frames), mode='constant', value=0)
+        return mel_segment
+    def __getitem__(self, idx):
+        key = self.keys[idx]
+        with h5py.File(self.h5_file, 'r') as f:
+            mel = torch.FloatTensor(f[key]['mel'][:])
+            meta_latent = torch.FloatTensor(f[key]['meta'][:])
+        total_frames = mel.shape[2]
+        if total_frames > self.mel_frames:
+            start_frame = random.randint(0, total_frames - self.mel_frames)
+            mel_segment = mel[:, :, start_frame:start_frame + self.mel_frames]
+        else:
+            mel_segment = self.pad_mel(mel, total_frames)
+        mel_segment = (mel_segment + 10) / 20
+        return mel_segment, meta_latent
+# Dataset & DataLoader
+dataset = MelMetaDataset(config['h5_file_path'], mel_frames=config['mel_frames'])
+dataloader = DataLoader(dataset, batch_size=config['batch_size'], shuffle=True)
+# Model and optimizer
+device = config['device'] if torch.cuda.is_available() else "cpu"
+model = DiT(
+    input_size=tuple(config['input_size']),
+    patch_size=config['patch_size'],
+    in_channels=config['in_channels'],
+    hidden_size=config['hidden_size'],
+    depth=config['depth'],
+    num_heads=config['num_heads'],
+)
+model.to(device)
+# Create diffusion model
+diffusion = create_diffusion(timestep_respacing="")
+# Optimizer
+optimizer = optim.AdamW(model.parameters(), lr=config['lr'])
+# Create directory to save model checkpoints
+os.makedirs(config['checkpoint_dir'], exist_ok=True)
+# Training function
+def train_model(model, dataloader, optimizer, diffusion, num_epochs, sample_interval):
+    model.train()
+    for epoch in range(num_epochs):
+        total_loss = 0.0
+        for step, (mel_segment, meta_latent) in enumerate(tqdm(dataloader, desc=f"Epoch {epoch + 1}/{num_epochs}")):
+            mel_segment = mel_segment.to(device)
+            meta_latent = meta_latent.to(device)
+            t = torch.randint(0, diffusion.num_timesteps, (mel_segment.shape[0],), device=device)
+            model_kwargs = dict(y=meta_latent)
+            loss_dict = diffusion.training_losses(model, mel_segment, t, model_kwargs)
+            loss = loss_dict["loss"].mean()
+            optimizer.zero_grad()
+            loss.backward()
+            optimizer.step()
+            total_loss += loss.item()
+        avg_loss = total_loss / len(dataloader)
+        print(f"Epoch {epoch + 1}/{num_epochs}: Average Loss: {avg_loss:.4f}")
+        writer.add_scalar('Loss/epoch', avg_loss, epoch + 1)
+        if (epoch + 1) % sample_interval == 0:
+            checkpoint = {
+                'epoch': epoch + 1,
+                'model_state_dict': model.state_dict(),
+                'optimizer_state_dict': optimizer.state_dict(),
+            }
+            checkpoint_path = f"{config['checkpoint_dir']}/model_epoch_{epoch + 1}.pt"
+            torch.save(checkpoint, checkpoint_path)
+            print(f"Model checkpoint saved at epoch {epoch + 1}")
+# Start training
+train_model(model, dataloader, optimizer, diffusion, num_epochs=config['num_epochs'], sample_interval=config['sample_interval'])
+# Close TensorBoard writer
+writer.close()