import torch
import gradio as gr
from diffusers import AudioLDMPipeline
from transformers import AutoProcessor, ClapModel

# make Space compatible with CPU duplicates
if torch.cuda.is_available():
    device = "cuda"
    torch_dtype = torch.float16
else:
    device = "cpu"
    torch_dtype = torch.float32
# load the diffusers pipeline
repo_id = "cvssp/audioldm-m-full"
pipe = AudioLDMPipeline.from_pretrained(repo_id, torch_dtype=torch_dtype).to(device)
# torch.compile (PyTorch 2.0+) speeds up the repeated UNet forward passes
pipe.unet = torch.compile(pipe.unet)

# CLAP model (only required for automatic scoring of the candidate waveforms)
clap_model = ClapModel.from_pretrained("sanchit-gandhi/clap-htsat-unfused-m-full").to(device)
processor = AutoProcessor.from_pretrained("sanchit-gandhi/clap-htsat-unfused-m-full")

generator = torch.Generator(device)
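
# CLAP embeds text and audio in a shared space, so the text-audio similarity
# logits can be used to re-rank the generated candidates against the prompt.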
def score_waveforms(text, waveforms):
    inputs = processor(text=text, audios=list(waveforms), return_tensors="pt", padding=True)
    inputs = {key: inputs[key].to(device) for key in inputs}
    with torch.no_grad():
        logits_per_text = clap_model(**inputs).logits_per_text  # audio-text similarity scores
        probs = logits_per_text.softmax(dim=-1)  # softmax over candidates gives label probabilities
        most_probable = torch.argmax(probs)  # index of the most likely audio waveform
    waveform = waveforms[most_probable]
    return waveform
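
# Generate one or more candidate waveforms for the prompt; when several are
# requested, return the candidate that CLAP scores closest to the input text.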
def text_to_music(text_input, negative_prompt, seed, duration, guidance_scale, n_candidates):
    waveforms = pipe(
        text_input,
        audio_length_in_s=duration,
        guidance_scale=guidance_scale,
        num_inference_steps=100,
        negative_prompt=negative_prompt,
        num_waveforms_per_prompt=int(n_candidates) if n_candidates else 1,
        generator=generator.manual_seed(int(seed)),
    )["audios"]
    if waveforms.shape[0] > 1:
        waveform = score_waveforms(text_input, waveforms)
    else:
        waveform = waveforms[0]
    # the pipeline already returns NumPy audio at 16 kHz; Gradio's "numpy"
    # audio type expects a (sample_rate, data) tuple
    return 16000, waveform
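
# Gradio front end: prompt, negative prompt, and sampling controls in;
# the best-scoring waveform out as playable audio.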
iface = gr.Interface(
    fn=text_to_music,
    inputs=[
        gr.Textbox(label="Input text", value="A hammer is hitting a wooden surface"),
        gr.Textbox(label="Negative prompt", value="low quality, average quality"),
        gr.Number(label="Seed", value=45),
        gr.Slider(label="Duration (seconds)", minimum=2.5, maximum=10.0, value=5.0, step=0.1),
        gr.Slider(label="Guidance scale", minimum=0.0, maximum=4.0, value=2.5, step=0.1),
        gr.Slider(label="Number of waveforms to generate", minimum=1, maximum=3, value=3, step=1),
    ],
    outputs=gr.Audio(label="Generated Audio", type="numpy"),
    title="Text to Music",
    description="Convert text into music using a pre-trained AudioLDM model.",
    theme="default",
)

iface.launch()
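
# Quick smoke test without the UI (hypothetical example values mirroring the
# interface defaults):
#   sample_rate, audio = text_to_music(
#       "A hammer is hitting a wooden surface", "low quality, average quality",
#       seed=45, duration=5.0, guidance_scale=2.5, n_candidates=1,
#   )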