| import gradio as gr | |
| from diffusers import AudioLDMPipeline | |
| # The recommended "fast" scheduler: | |
| #from diffusers import DPMSolverMultistepScheduler | |
| # The Default AudioLDM scheduler: | |
| #from diffusers import DDIMScheduler | |
| #from diffusers import DDPMScheduler | |
| #from diffusers import DEISMultistepScheduler | |
| #from diffusers import DPMSolverSinglestepScheduler | |
| #from diffusers import HeunDiscreteScheduler | |
| from diffusers import KDPM2DiscreteScheduler | |
| #from diffusers import KDPM2AncestralDiscreteScheduler | |
| #from diffusers import LMSDiscreteScheduler | |
| #from diffusers import PNDMScheduler | |
| #from diffusers import EulerDiscreteScheduler | |
| #from diffusers import EulerAncestralDiscreteScheduler | |
| #from diffusers import UniPCMultistepScheduler | |
| from transformers import AutoProcessor, ClapModel | |
| import torch | |
| # import scipy | |
| device="cpu" | |
| repo_id = "cvssp/audioldm-s-full-v2" | |
| pipe = AudioLDMPipeline.from_pretrained(repo_id, torch_dtype=torch.float32) | |
| #pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) | |
| #pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) | |
| #pipe.scheduler = DDPMScheduler.from_config(pipe.scheduler.config) | |
| #pipe.scheduler = DEISMultistepScheduler.from_config(pipe.scheduler.config) | |
| #pipe.scheduler = DPMSolverSinglestepScheduler.from_config(pipe.scheduler.config) | |
| #pipe.scheduler = HeunDiscreteScheduler.from_config(pipe.scheduler.config) | |
| pipe.scheduler = KDPM2DiscreteScheduler.from_config(pipe.scheduler.config) | |
| #pipe.scheduler = KDPM2AncestralDiscreteScheduler.from_config(pipe.scheduler.config) | |
| #pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config) | |
| #pipe.scheduler = PNDMScheduler.from_config(pipe.scheduler.config) | |
| #pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config) | |
| #pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config) | |
| #pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config) | |
| pipe = pipe.to(device) | |
| clap_model = ClapModel.from_pretrained("sanchit-gandhi/clap-htsat-unfused-m-full").to(device) | |
| processor = AutoProcessor.from_pretrained("sanchit-gandhi/clap-htsat-unfused-m-full", sampling_rate=16000) | |
| generator = torch.Generator(device) | |
def texttoaudio(prompt, neg_prompt, seed, inf_steps, guidance_scale, n_candidates):
    """Generate a 5-second audio clip from a text prompt with AudioLDM.

    Parameters mirror the Gradio inputs in order: ``prompt`` and
    ``neg_prompt`` are free text; ``seed``, ``inf_steps``,
    ``guidance_scale`` and ``n_candidates`` arrive as numbers.

    Returns:
        A ``(sample_rate, waveform)`` tuple (16 kHz) suitable for a
        Gradio "audio" output component.

    Raises:
        gr.Error: if the prompt is missing or empty.
    """
    # Bug fix: a blank text box yields "" (not None), which the old
    # `prompt is None` check let through to the pipeline. `not prompt`
    # rejects both None and the empty string.
    if not prompt:
        raise gr.Error("Please provide a text input.")
    waveforms = pipe(
        prompt,
        negative_prompt=neg_prompt,
        num_inference_steps=int(inf_steps),
        guidance_scale=guidance_scale,
        audio_length_in_s=5.0,
        # Re-seed the shared generator on every call so results are reproducible.
        generator=generator.manual_seed(int(seed)),
        num_waveforms_per_prompt=int(n_candidates) if n_candidates else 1,
    )["audios"]
    # With several candidates, let CLAP pick the waveform closest to the
    # prompt; with a single candidate just return it.
    if waveforms.shape[0] > 1:
        waveform = score_waveforms(prompt, waveforms)
    else:
        waveform = waveforms[0]
    return (16000, waveform)
def score_waveforms(text, waveforms):
    """Rank candidate waveforms against *text* with CLAP; return the best one."""
    batch = processor(text=text, audios=list(waveforms), return_tensors="pt", padding=True, sampling_rate=16000)
    batch = {name: tensor.to(device) for name, tensor in batch.items()}
    with torch.no_grad():
        # Audio-text similarity scores: one logit per candidate waveform.
        similarity = clap_model(**batch).logits_per_text
    # Softmax turns the logits into probabilities; argmax picks the most
    # probable (i.e. best-matching) candidate.
    best_index = similarity.softmax(dim=-1).argmax()
    return waveforms[best_index]
# Six inputs map positionally onto texttoaudio's parameters:
# prompt, neg_prompt, seed, inf_steps, guidance_scale, n_candidates.
demo_inputs = ["text", "text", "number", "number", "number", "number"]
iface = gr.Interface(
    fn=texttoaudio,
    title="AudioLDM Testing Playground",
    inputs=demo_inputs,
    outputs="audio",
)
iface.launch()