# NOTE: the lines below ("Spaces: Runtime error") are an extraction/log
# artifact from the hosting page, not part of the module; kept as a comment.
from typing import Iterable, Optional, Tuple

import numpy as np
import torch
from librosa.beat import beat_track
from PIL import Image
from tqdm.auto import tqdm

# from diffusers import AudioDiffusionPipeline
from .pipeline_audio_diffusion import AudioDiffusionPipeline
| VERSION = "1.5.2" | |
class AudioDiffusion:
    """Thin convenience wrapper around :class:`AudioDiffusionPipeline`.

    Loads a pretrained pipeline once and exposes single-sample generation
    helpers that unwrap the pipeline's batched return values.
    """

    def __init__(
        self,
        model_id: str = "teticio/audio-diffusion-256",
        # NOTE(review): this default is evaluated at import time, so CUDA is
        # probed when the module loads, not when the instance is created.
        cuda: bool = torch.cuda.is_available(),
        progress_bar: Iterable = tqdm,
    ):
        """Class for generating audio using De-noising Diffusion Probabilistic Models.

        Args:
            model_id (String): name of model (local directory or Hugging Face Hub)
            cuda (bool): use CUDA?
            progress_bar (iterable): iterable callback for progress updates or None
        """
        self.model_id = model_id
        self.pipe = AudioDiffusionPipeline.from_pretrained(self.model_id)
        if cuda:
            self.pipe.to("cuda")
        # Fall back to an identity pass-through so callers can always wrap
        # iterables in self.progress_bar even when progress display is off.
        self.progress_bar = progress_bar or (lambda _: _)

    def generate_spectrogram_and_audio(
        self,
        steps: Optional[int] = None,
        generator: Optional[torch.Generator] = None,
        step_generator: Optional[torch.Generator] = None,
        eta: float = 0,
        noise: Optional[torch.Tensor] = None,
        encoding: Optional[torch.Tensor] = None,
    ) -> Tuple[Image.Image, Tuple[int, np.ndarray]]:
        """Generate random mel spectrogram and convert to audio.

        Args:
            steps (int): number of de-noising steps (defaults to 50 for DDIM, 1000 for DDPM)
            generator (torch.Generator): random number generator or None
            step_generator (torch.Generator): random number generator used to de-noise or None
            eta (float): parameter between 0 and 1 used with DDIM scheduler
            noise (torch.Tensor): noisy image or None
            encoding (`torch.Tensor`): for UNet2DConditionModel shape (batch_size, seq_length, cross_attention_dim)

        Returns:
            PIL Image: mel spectrogram
            (int, np.ndarray): sample rate and raw audio
        """
        images, (sample_rate, audios) = self.pipe(
            batch_size=1,
            steps=steps,
            generator=generator,
            step_generator=step_generator,
            eta=eta,
            noise=noise,
            encoding=encoding,
            return_dict=False,
        )
        # batch_size is 1, so unwrap the single image / audio from the batch.
        return images[0], (sample_rate, audios[0])

    def generate_spectrogram_and_audio_from_audio(
        self,
        audio_file: Optional[str] = None,
        raw_audio: Optional[np.ndarray] = None,
        slice: int = 0,  # noqa: A002 — shadows builtin, kept for caller compatibility
        start_step: int = 0,
        steps: Optional[int] = None,
        generator: Optional[torch.Generator] = None,
        mask_start_secs: float = 0,
        mask_end_secs: float = 0,
        step_generator: Optional[torch.Generator] = None,
        eta: float = 0,
        encoding: Optional[torch.Tensor] = None,
        noise: Optional[torch.Tensor] = None,
    ) -> Tuple[Image.Image, Tuple[int, np.ndarray]]:
        """Generate random mel spectrogram from audio input and convert to audio.

        Args:
            audio_file (str): must be a file on disk due to Librosa limitation or
            raw_audio (np.ndarray): audio as numpy array
            slice (int): slice number of audio to convert
            start_step (int): step to start from
            steps (int): number of de-noising steps (defaults to 50 for DDIM, 1000 for DDPM)
            generator (torch.Generator): random number generator or None
            mask_start_secs (float): number of seconds of audio to mask (not generate) at start
            mask_end_secs (float): number of seconds of audio to mask (not generate) at end
            step_generator (torch.Generator): random number generator used to de-noise or None
            eta (float): parameter between 0 and 1 used with DDIM scheduler
            encoding (`torch.Tensor`): for UNet2DConditionModel shape (batch_size, seq_length, cross_attention_dim)
            noise (torch.Tensor): noisy image or None

        Returns:
            PIL Image: mel spectrogram
            (int, np.ndarray): sample rate and raw audio
        """
        images, (sample_rate, audios) = self.pipe(
            batch_size=1,
            audio_file=audio_file,
            raw_audio=raw_audio,
            slice=slice,
            start_step=start_step,
            steps=steps,
            generator=generator,
            mask_start_secs=mask_start_secs,
            mask_end_secs=mask_end_secs,
            step_generator=step_generator,
            eta=eta,
            noise=noise,
            encoding=encoding,
            return_dict=False,
        )
        # batch_size is 1, so unwrap the single image / audio from the batch.
        return images[0], (sample_rate, audios[0])
def loop_it(audio: np.ndarray, sample_rate: int, loops: int = 12) -> Optional[np.ndarray]:
    """Loop audio by tiling a whole number of detected 4-beat bars.

    Args:
        audio (np.ndarray): audio as numpy array
        sample_rate (int): sample rate of audio
        loops (int): number of times to loop

    Returns:
        np.ndarray: looped raw audio, or None if fewer than one full
            4-beat bar of beats was detected
    """
    _, beats = beat_track(y=audio, sr=sample_rate, units="samples")
    # Round the beat count down to a whole number of 4-beat bars so the
    # tiled segment starts and ends on a bar boundary.
    beats_in_bar = (len(beats) - 1) // 4 * 4
    if beats_in_bar > 0:
        return np.tile(audio[beats[0] : beats[beats_in_bar]], loops)
    return None