| import contextlib |
| import importlib |
|
|
| from inspect import isfunction |
| import os |
| import soundfile as sf |
| import time |
| import wave |
|
|
| import urllib.request |
| import progressbar |
|
|
# Directory used for cached checkpoint files: the value of the
# ``AUDIOLDM_CACHE_DIR`` environment variable when it is set, otherwise
# ``~/.cache/audioldm``.
CACHE_DIR = os.environ.get(
    "AUDIOLDM_CACHE_DIR",
    os.path.join(os.path.expanduser("~"), ".cache/audioldm"),
)
|
|
def get_duration(fname):
    """Return the duration, in seconds, of the WAV file ``fname``.

    Args:
        fname: Path (or file object) accepted by ``wave.open``.

    Returns:
        The number of frames divided by the frame rate, as a float.
    """
    # The reader returned by ``wave.open`` closes itself on leaving the block.
    with wave.open(fname, "r") as wav_reader:
        n_frames = wav_reader.getnframes()
        frame_rate = float(wav_reader.getframerate())
        return n_frames / frame_rate
| |
def get_bit_depth(fname):
    """Return the bit depth (bits per sample) of the WAV file ``fname``.

    Args:
        fname: Path (or file object) accepted by ``wave.open``.

    Returns:
        The sample width reported by the file, in bytes, multiplied by 8.
    """
    # The reader returned by ``wave.open`` closes itself on leaving the block.
    with wave.open(fname, "r") as wav_reader:
        bytes_per_sample = wav_reader.getsampwidth()
        return bytes_per_sample * 8
| |
def get_time():
    """Return the current local time as a ``DD_MM_YYYY_HH_MM_SS`` string."""
    timestamp_format = "%d_%m_%Y_%H_%M_%S"
    return time.strftime(timestamp_format, time.localtime())
|
|
def seed_everything(seed):
    """Seed the random number generators and make cuDNN deterministic.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and every CUDA
    device), exports ``PYTHONHASHSEED`` and configures cuDNN so that repeated
    runs with the same seed select the same algorithms.

    Args:
        seed: Integer seed applied to every generator.
    """
    # Imported inside the function, as in the original, so that importing
    # this module does not itself load NumPy or PyTorch.
    import random

    import numpy as np
    import torch

    random.seed(seed)
    # Python reads PYTHONHASHSEED at interpreter start-up, so this only takes
    # effect in interpreters launched after this call (e.g. subprocesses).
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one; this is silently
    # ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode auto-tunes the convolution algorithm and may pick a
    # different one on each run, which breaks reproducibility; it must be off.
    torch.backends.cudnn.benchmark = False
|
|
|
|
def save_wave(waveform, savepath, name="outwav", samplerate=16000):
    """Write every item of a batch of waveforms to ``savepath`` as WAV files.

    Item ``i`` is written to ``<savepath>/<stem>_<i>.wav``. ``<stem>`` is the
    base name of the item's entry in ``name``; when that entry contains
    ``".wav"`` the base name is cut at its first ``"."``.

    Args:
        waveform: Batch indexed as ``waveform[i, 0]``; ``waveform.shape[0]``
            is the number of items to write.
        savepath: Output directory. It is created when it does not exist.
        name: Either one name shared by all items or a list holding one name
            per item.
        samplerate: Sample rate handed to ``soundfile.write``. Defaults to
            16000, the value that was previously hard-coded.
    """
    n_items = waveform.shape[0]
    if not isinstance(name, list):
        name = [name] * n_items

    # Writing into a missing directory would fail, so create it up front.
    # An empty ``savepath`` means the current directory and needs no creation.
    if savepath:
        os.makedirs(savepath, exist_ok=True)

    for i in range(n_items):
        stem = os.path.basename(name[i])
        if ".wav" in name[i]:
            # Drop the extension (and anything after the first dot).
            stem = stem.split(".")[0]
        path = os.path.join(savepath, "%s_%s.wav" % (stem, i))
        print("Save audio to %s" % path)
        sf.write(path, waveform[i, 0], samplerate=samplerate)
|
|
|
|
def exists(x):
    """Return ``True`` when ``x`` is anything other than ``None``."""
    if x is None:
        return False
    return True
|
|
|
|
def default(val, d):
    """Return ``val`` unless it is ``None``, otherwise fall back to ``d``.

    Args:
        val: Preferred value.
        d: Fallback. When it is a plain Python function (as judged by
            ``inspect.isfunction``) it is called and its result is returned;
            any other object is returned as-is.
    """
    if not exists(val):
        if isfunction(d):
            return d()
        return d
    return val
|
|
|
|
def count_params(model, verbose=False):
    """Return the total number of elements across ``model.parameters()``.

    Args:
        model: Object exposing ``parameters()``, whose items expose
            ``numel()``.
        verbose: When true, also print the count in millions together with
            the model's class name.
    """
    n_params = 0
    for param in model.parameters():
        n_params += param.numel()

    if verbose:
        print(f"{model.__class__.__name__} has {n_params * 1.e-6:.2f} M params.")
    return n_params
|
|
|
|
def get_obj_from_str(string, reload=False):
    """Resolve a dotted path such as ``"package.module.Name"`` to an object.

    Args:
        string: Dotted path; everything before the last ``"."`` is imported
            as a module and the remainder is looked up on it.
        reload: When true, the module is reloaded before the lookup.

    Returns:
        The attribute named by the last path component.
    """
    module_path, attr_name = string.rsplit(".", 1)
    if reload:
        importlib.reload(importlib.import_module(module_path))
    module_obj = importlib.import_module(module_path, package=None)
    return getattr(module_obj, attr_name)
|
|
|
|
def instantiate_from_config(config):
    """Instantiate the object described by ``config``.

    Args:
        config: Mapping with a ``"target"`` dotted path and optional
            ``"params"`` keyword arguments, or one of the marker strings
            ``"__is_first_stage__"`` / ``"__is_unconditional__"``.

    Returns:
        ``target(**params)``, or ``None`` for either marker string.

    Raises:
        KeyError: If ``config`` has no ``"target"`` and is not a marker.
    """
    if "target" in config:
        target_factory = get_obj_from_str(config["target"])
        kwargs = config.get("params", {})
        return target_factory(**kwargs)

    # Marker strings stand for "nothing to build here".
    if config == "__is_first_stage__" or config == "__is_unconditional__":
        return None
    raise KeyError("Expected key `target` to instantiate.")
|
|
|
|
def default_audioldm_config(model_name="audioldm-s-full"):
    """Build the default AudioLDM configuration dictionary for ``model_name``.

    A fresh dictionary is created on every call. Two substrings of
    ``model_name`` adjust the defaults:

    * ``"-l-"``: U-Net ``model_channels`` becomes 256 and
      ``num_head_channels`` becomes 64.
    * ``"-m-"`` (checked only when ``"-l-"`` is absent): U-Net
      ``model_channels`` becomes 192 and the conditioning stage gains
      ``"amodel": "HTSAT-base"``.

    Args:
        model_name: Name of the model variant.

    Returns:
        The nested configuration dictionary.
    """
    preprocessing = {
        "audio": {"sampling_rate": 16000, "max_wav_value": 32768},
        "stft": {"filter_length": 1024, "hop_length": 160, "win_length": 1024},
        "mel": {
            "n_mel_channels": 64,
            "mel_fmin": 0,
            "mel_fmax": 8000,
            "freqm": 0,
            "timem": 0,
            "blur": False,
            "mean": -4.63,
            "std": 2.74,
            "target_length": 1024,
        },
    }

    unet_params = {
        "image_size": 64,
        "extra_film_condition_dim": 512,
        "extra_film_use_concat": True,
        "in_channels": 8,
        "out_channels": 8,
        "model_channels": 128,
        "attention_resolutions": [8, 4, 2],
        "num_res_blocks": 2,
        "channel_mult": [1, 2, 3, 5],
        "num_head_channels": 32,
        "use_spatial_transformer": True,
    }

    first_stage_config = {
        "base_learning_rate": 4.5e-05,
        "target": "audioldm.variational_autoencoder.autoencoder.AutoencoderKL",
        "params": {
            "monitor": "val/rec_loss",
            "image_key": "fbank",
            "subband": 1,
            "embed_dim": 8,
            "time_shuffle": 1,
            "ddconfig": {
                "double_z": True,
                "z_channels": 8,
                "resolution": 256,
                "downsample_time": False,
                "in_channels": 1,
                "out_ch": 1,
                "ch": 128,
                "ch_mult": [1, 2, 4],
                "num_res_blocks": 2,
                "attn_resolutions": [],
                "dropout": 0.0,
            },
        },
    }

    cond_stage_params = {
        "key": "waveform",
        "sampling_rate": 16000,
        "embed_mode": "audio",
        "unconditional_prob": 0.1,
    }

    basic_config = {
        "wave_file_save_path": "./output",
        "id": {
            "version": "v1",
            "name": "default",
            "root": "/mnt/fast/nobackup/users/hl01486/projects/general_audio_generation/AudioLDM-python/config/default/latent_diffusion.yaml",
        },
        "preprocessing": preprocessing,
        "model": {
            "device": "cuda",
            "target": "audioldm.pipline.LatentDiffusion",
            "params": {
                "base_learning_rate": 5e-06,
                "linear_start": 0.0015,
                "linear_end": 0.0195,
                "num_timesteps_cond": 1,
                "log_every_t": 200,
                "timesteps": 1000,
                "first_stage_key": "fbank",
                "cond_stage_key": "waveform",
                "latent_t_size": 256,
                "latent_f_size": 16,
                "channels": 8,
                "cond_stage_trainable": True,
                "conditioning_key": "film",
                "monitor": "val/loss_simple_ema",
                "scale_by_std": True,
                "unet_config": {
                    "target": "audioldm.latent_diffusion.openaimodel.UNetModel",
                    "params": unet_params,
                },
                "first_stage_config": first_stage_config,
                "cond_stage_config": {
                    "target": "audioldm.clap.encoders.CLAPAudioEmbeddingClassifierFreev2",
                    "params": cond_stage_params,
                },
            },
        },
    }

    # ``unet_params`` and ``cond_stage_params`` are the very objects stored
    # inside ``basic_config``, so these overrides show up in the result.
    if "-l-" in model_name:
        unet_params["model_channels"] = 256
        unet_params["num_head_channels"] = 64
    elif "-m-" in model_name:
        unet_params["model_channels"] = 192
        cond_stage_params["amodel"] = "HTSAT-base"

    return basic_config
| |
def get_metadata():
    """Return the download metadata for every supported checkpoint.

    Returns:
        A dict keyed by model name. Each value has a ``"path"`` entry
        (``<CACHE_DIR>/<model name>.ckpt``) and a ``"url"`` entry (the
        download address).
    """
    checkpoint_urls = {
        "audioldm-s-full": "https://zenodo.org/record/7600541/files/audioldm-s-full?download=1",
        "audioldm-l-full": "https://zenodo.org/record/7698295/files/audioldm-full-l.ckpt?download=1",
        "audioldm-s-full-v2": "https://zenodo.org/record/7698295/files/audioldm-full-s-v2.ckpt?download=1",
        "audioldm-m-text-ft": "https://zenodo.org/record/7813012/files/audioldm-m-text-ft.ckpt?download=1",
        "audioldm-s-text-ft": "https://zenodo.org/record/7813012/files/audioldm-s-text-ft.ckpt?download=1",
        "audioldm-m-full": "https://zenodo.org/record/7813012/files/audioldm-m-full.ckpt?download=1",
    }
    # Every checkpoint is cached under its own model name plus ``.ckpt``.
    return {
        model_name: {
            "path": os.path.join(CACHE_DIR, model_name + ".ckpt"),
            "url": url,
        }
        for model_name, url in checkpoint_urls.items()
    }
| |
class MyProgressBar():
    """Callable progress reporter taking ``(block_num, block_size, total_size)``.

    An instance is passed to ``urllib.request.urlretrieve`` as its progress
    callback.
    """

    def __init__(self):
        # The underlying bar is created on the first callback, because the
        # total size is only known then.
        self.pbar = None

    def __call__(self, block_num, block_size, total_size):
        """Advance the bar to ``block_num * block_size`` or finish it.

        Args:
            block_num: Number of blocks transferred so far.
            block_size: Size of one block.
            total_size: Total size of the transfer.
        """
        if not self.pbar:
            self.pbar = progressbar.ProgressBar(maxval=total_size)
            self.pbar.start()

        bytes_done = block_num * block_size
        if bytes_done >= total_size:
            self.pbar.finish()
        else:
            self.pbar.update(bytes_done)
| |
def download_checkpoint(checkpoint_name="audioldm-s-full"):
    """Download the checkpoint ``checkpoint_name`` unless it is already cached.

    The checkpoint is looked up in ``get_metadata()``. It is downloaded when
    the cached file is missing or smaller than ``2 * 10**9`` bytes.

    Args:
        checkpoint_name: Key of the checkpoint in ``get_metadata()``.

    Raises:
        KeyError: If ``checkpoint_name`` is not a supported model name.
    """
    meta = get_metadata()
    if checkpoint_name not in meta:
        # Fail with the hint itself; previously the hint was printed and the
        # lookup below then raised a bare KeyError.
        raise KeyError(
            "The model name you provided is not supported. "
            "Please use one of the following: %s" % list(meta)
        )

    ckpt_path = meta[checkpoint_name]["path"]
    ckpt_url = meta[checkpoint_name]["url"]

    # A cached file below the size threshold is treated as incomplete.
    if os.path.exists(ckpt_path) and os.path.getsize(ckpt_path) >= 2 * 10**9:
        return

    ckpt_dir = os.path.dirname(ckpt_path)
    os.makedirs(ckpt_dir, exist_ok=True)
    print(f"Downloading the main structure of {checkpoint_name} into {ckpt_dir}")

    # Download next to the final location and move into place only once the
    # transfer finished, so an interrupted download can never be mistaken for
    # a complete checkpoint by the size check above.
    partial_path = ckpt_path + ".part"
    urllib.request.urlretrieve(ckpt_url, partial_path, MyProgressBar())
    os.replace(partial_path, ckpt_path)

    print(
        "Weights downloaded in: {} Size: {}".format(
            ckpt_path,
            os.path.getsize(ckpt_path),
        )
    )
| |