Spaces:
Runtime error
Runtime error
import os
import subprocess
import sys
from pathlib import Path


def _run_setup_command(args):
    """Run one start-up command and report, but tolerate, a failure.

    Args:
        args: The command as a list of strings (program followed by its
            arguments). It is executed without a shell.

    Returns:
        True when the command exited with status 0, False otherwise
        (including when the program could not be started at all).
    """
    try:
        completed = subprocess.run(args)
    except OSError as err:
        # E.g. the executable is not on PATH. Keep going, as the setup steps
        # are best-effort, but make the failure visible in the logs.
        print(f'Setup command {args!r} could not be started: {err}', file=sys.stderr)
        return False
    if completed.returncode != 0:
        print(
            f'Setup command {args!r} exited with status {completed.returncode}',
            file=sys.stderr,
        )
        return False
    return True


# The demo imports modules from the upstream repository, which is fetched at
# start-up. Skip the clone when a checkout is already present (e.g. after a
# restart): `git clone` refuses to write into an existing non-empty directory.
if not Path('SpecVQGAN', '.git').exists():
    _run_setup_command(['git', 'clone', 'https://github.com/v-iashin/SpecVQGAN'])

# Install the runtime dependencies into the interpreter running this script.
_run_setup_command([
    sys.executable, '-m', 'pip', 'install',
    'pytorch-lightning==1.2.10',
    'omegaconf==2.0.6',
    'streamlit==0.80',
    'matplotlib==3.4.1',
    'albumentations==0.5.2',
    'SoundFile',
    'torch',
    'torchvision',
    'librosa',
    'gdown',
])

# These imports must stay below the install step above.
import soundfile
import torch
import gradio as gr

# Make the cloned repository importable before pulling in its modules.
sys.path.append('./SpecVQGAN')
from feature_extraction.demo_utils import (calculate_codebook_bitrate,
                                           extract_melspectrogram,
                                           get_audio_file_bitrate,
                                           get_duration,
                                           load_neural_audio_codec)
from sample_visualization import tensor_to_plt
from torch.utils.data.dataloader import default_collate

# From here on, relative paths (e.g. `log_dir` below) resolve inside the checkout.
os.chdir("SpecVQGAN")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Fetch an asset from Google Drive into the current working directory.
# NOTE(review): this file does not show which asset the id refers to
# (presumably the example clip or a checkpoint) - confirm.
_run_setup_command(['gdown', 'https://drive.google.com/uc?id=1KGof44Sx4yIn4Hohpp9-VVTh2zGucKeY'])

model_name = '2021-05-19T22-16-54_vggsound_codebook'
log_dir = './logs'
# loading the models might take a few minutes
config, model, vocoder = load_neural_audio_codec(model_name, log_dir, device)
def inference(audio):
    """Compress an audio clip with the SpecVQGAN codec and reconstruct it.

    The clip is converted to a mel-spectrogram, encoded and decoded by the
    module-level ``model``, and both the original and the reconstructed
    spectrograms are turned back into waveforms by the module-level
    ``vocoder`` and written to WAV files in the current working directory.

    Args:
        audio: The input clip, either as a path (``str`` / ``os.PathLike``)
            or as a file-like object that exposes its path as ``.name``.

    Returns:
        A 4-tuple ``(orig_wav_path, rec_wav_path, orig_plot, rec_plot)``:
        the path of the vocoded original spectrogram, the path of the
        vocoded reconstruction, and the ``tensor_to_plt`` results for the
        original and reconstructed spectrograms.

    Raises:
        ValueError: If ``audio`` is ``None`` (nothing was uploaded).
    """
    # Select an Audio
    if audio is None:
        raise ValueError('No audio was provided.')
    if isinstance(audio, (str, os.PathLike)):
        input_wav = os.fspath(audio)
    else:
        input_wav = audio.name

    # Spectrogram Extraction
    model_sr = config.data.params.sample_rate
    duration = get_duration(input_wav)
    spec = extract_melspectrogram(input_wav, sr=model_sr, duration=duration)
    print(f'Audio Duration: {duration} seconds')
    print('Original Spectrogram Shape:', spec.shape)

    # Prepare Input: collate the single spectrogram into a batch of one and
    # expose it under the key that `model.get_input` is asked to read.
    batch = default_collate([{'input': spec}])
    batch['image'] = batch['input'].to(device)
    x = model.get_input(batch, 'image')

    # Inference only, so no autograd bookkeeping is needed.
    with torch.no_grad():
        quant_z, _, info = model.encode(x)
        xrec = model.decode(quant_z)

    print('Compressed representation (it is all you need to recover the audio):')
    n_rows, n_cols = quant_z.shape[-2:]
    print(info[2].reshape(n_rows, n_cols))

    # Calculate Bitrate
    bitrate = calculate_codebook_bitrate(duration, quant_z, model.quantize.n_e)
    # Kept from the original code although the value is unused below.
    orig_bitrate = get_audio_file_bitrate(input_wav)

    # Save and Display: drop the batch dimension added by the collate step.
    x = x.squeeze(0)
    xrec = xrec.squeeze(0)

    # specs are in [-1, 1], making them in [0, 1]
    with torch.no_grad():
        wav_x = vocoder((x + 1) / 2).squeeze().detach().cpu().numpy()
        wav_xrec = vocoder((xrec + 1) / 2).squeeze().detach().cpu().numpy()

    # Save paths
    x_save_path = 'vocoded_orig_spec.wav'
    xrec_save_path = f'specvqgan_{bitrate:.2f}kbps.wav'

    # Save
    soundfile.write(x_save_path, wav_x, model_sr, 'PCM_16')
    soundfile.write(xrec_save_path, wav_xrec, model_sr, 'PCM_16')

    return (
        x_save_path,
        xrec_save_path,
        tensor_to_plt(x, flip_dims=(2,)),
        tensor_to_plt(xrec, flip_dims=(2,)),
    )
# Text shown on the demo page.
title = "SpecVQGAN Neural Audio Codec"
description = "Gradio demo for Spectrogram VQGAN as a Neural Audio Codec. To use it, simply add your audio, or click one of the examples to load them. Read more at the links below."
article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2110.08791' target='_blank'>Taming Visually Guided Sound Generation</a> | <a href='https://github.com/v-iashin/SpecVQGAN' target='_blank'>Github Repo</a></p>"

# One example row holding a value for the single audio input.
examples = [['example.wav']]

# The clip handed to `inference`.
input_audio = gr.Audio(type="file", label="Input Audio")

# Output components, in the order `inference` returns its values:
# two audio files followed by two spectrogram plots.
output_components = [
    gr.Audio(type="file", label="Original audio"),
    gr.Audio(type="file", label="Reconstructed audio"),
    gr.Plot(label="Original Spectrogram:"),
    gr.Plot(label="Reconstructed Spectrogram:"),
]

demo = gr.Interface(
    fn=inference,
    inputs=input_audio,
    outputs=output_components,
    title=title,
    description=description,
    article=article,
    enable_queue=True,
    examples=examples,
    cache_examples=True,
)
demo.launch(debug=True)