# music_generation / app_audioldm.py
# Hugging Face Space file header (scraped): uploaded by IsraelRM,
# commit message "Update app_audioldm.py", commit 65438c6 (verified).
import gradio as gr
import torch
import numpy as np
import spaces
from diffusers import AudioLDM2Pipeline
from transformers import GPT2LMHeadModel, GPT2Tokenizer
# Module-level singleton for the AudioLDM2 pipeline; populated lazily by
# _load_model() so the heavy model download only happens on first use.
_pipe = None
def _load_model():
    """Lazily build and cache the AudioLDM2 music pipeline.

    Returns the module-level singleton ``_pipe``, creating it on first
    call. Weights are fetched from the Hugging Face Hub, so the first
    invocation is slow and requires network access.
    """
    global _pipe
    if _pipe is None:
        # Choose device: GPU with fp16 when available, else CPU with fp32.
        device = "cuda" if torch.cuda.is_available() else "cpu"
        torch_dtype = torch.float16 if device == "cuda" else torch.float32
        # Load the GPT-2 tokenizer and language model that will be injected
        # into the pipeline as its prompt language model.
        tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
        language_model = GPT2LMHeadModel.from_pretrained("gpt2").to(device, dtype=torch_dtype)
        # Load the AudioLDM2 music pipeline.
        # NOTE(review): AudioLDM2Pipeline's `tokenizer` component is normally
        # the CLAP tokenizer, not GPT-2's — confirm overriding it with the
        # GPT-2 tokenizer is intentional.
        _pipe = AudioLDM2Pipeline.from_pretrained(
            "cvssp/audioldm2-music",
            torch_dtype=torch_dtype,
            language_model=language_model,  # pass the replacement language model
            tokenizer=tokenizer
        ).to(device)
    return _pipe
def _build_style_prompt(instruments, voice, mood, genre, tempo, bpm):
if instruments:
if len(instruments) == 1:
inst_txt = instruments[0]
else:
inst_txt = ", ".join(instruments[:-1]) + " and " + instruments[-1]
else:
inst_txt = "various instruments"
prompt = (
f"A {mood} {genre} song in {tempo} tempo at {int(bpm)} BPM, "
f"featuring {inst_txt}"
)
if voice and voice != "none":
prompt += f", sung by a {voice} voice"
return prompt
def _build_full_prompt(style_prompt, lyrics):
"""
AudioLDM2 no tiene un slot separado para lyrics, pero entiende
descripciones largas que incluyen texto de canciones.
Se concatenan al prompt de estilo con un separador claro.
"""
if not lyrics.strip():
return style_prompt
# Truncar lyrics para no saturar el tokenizer (límite ~200 tokens aprox)
lyrics_trimmed = lyrics.strip()[:600]
return f"{style_prompt}. Song lyrics: {lyrics_trimmed}"
# ZeroGPU: request a GPU for at most 180 s per call on Hugging Face Spaces.
@spaces.GPU(duration=180)
def generate_music(
    instruments,
    voice,
    mood,
    genre,
    tempo,
    bpm,
    lyrics,
    duration,
    guidance_scale,
    num_steps,
    negative_prompt,
):
    """Generate a music clip from the UI selections and lyrics.

    Builds a single text prompt from the style controls plus lyrics and
    runs the AudioLDM2 pipeline once.

    Returns:
        A ``(sample_rate, waveform)`` tuple suitable for
        ``gr.Audio(type="numpy")``; the rate is hard-coded to 16000 Hz
        — presumably AudioLDM2's native output rate, TODO confirm.
    """
    pipe = _load_model()
    style_prompt = _build_style_prompt(instruments, voice, mood, genre, tempo, bpm)
    full_prompt = _build_full_prompt(style_prompt, lyrics)
    print(f"[AudioLDM2] Prompt: {full_prompt}")
    # Flatten to a single-line str; newlines come from the lyrics textbox.
    full_prompt = str(full_prompt)
    full_prompt = full_prompt.replace("\n", " ")
    result = pipe(
        prompt=full_prompt,
        negative_prompt=negative_prompt or None,  # empty textbox -> no negative prompt
        audio_length_in_s=float(duration),
        guidance_scale=guidance_scale,
        num_inference_steps=int(num_steps),
        num_waveforms_per_prompt=1,
    )
    audio = result.audios[0]  # (samples,) numpy float32
    return (16000, audio)
# Choice lists backing the Gradio controls in crear_tab3().

# Genre dropdown options (the dropdown also allows custom values).
_GENRES = [
    "pop", "rock", "jazz", "classical", "electronic", "folk", "metal",
    "hip hop", "r&b", "soul", "blues", "country", "reggae", "ska",
    "house", "techno", "trance", "dubstep", "drum and bass",
    "ambient", "lofi", "synthwave", "electro", "idm",
    "indie", "indie rock", "alternative", "grunge", "punk",
    "heavy metal", "black metal", "death metal", "thrash metal",
    "orchestral", "film score", "soundtrack", "bossa nova", "samba",
    "flamenco", "celtic", "afrobeat", "k-pop", "city pop",
    "experimental", "new age",
]
# Mood/emotion dropdown options.
_MOODS = [
    "happy", "sad", "romantic", "energetic", "calm", "melancholic",
    "dark", "epic", "mysterious", "peaceful", "angry",
]
# Tempo-feel dropdown options (qualitative; numeric BPM is a separate field).
_TEMPOS = ["slow", "moderate", "fast", "upbeat", "relaxed", "driving", "laid-back"]
# Voice dropdown options; "none" produces an instrumental prompt.
_VOICES = ["none", "male", "female", "choir", "opera singer", "rap vocals"]
# Instrument checkbox-group options.
_INSTRUMENTS = [
    "piano", "guitar", "electric guitar", "bass guitar",
    "drums", "synthesizer", "violin", "cello", "flute",
    "saxophone", "trumpet", "organ", "harp",
]
# Default lyrics shown in the textbox so users see the expected format.
_EXAMPLE_LYRICS = """\
[Verse 1]
Midnight drips on mirrored stone,
Neon whispers, all alone.
Rain keeps time on empty streets,
Where past and future softly meet.
[Chorus]
She walks like smoke through circuits wide,
A neon ghost I cannot hide.
Reflections lost in silver rain,
I call her name — she won't remain.
[Bridge]
Static snow in every glance,
Trapped inside a cyber trance.
"""
def crear_tab3():
    """Build and return the Gradio Blocks UI for the AudioLDM2 tab.

    Lays out style controls (instruments, voice, mood, genre, tempo, BPM),
    a lyrics textbox, generation parameters, and wires the generate button
    to ``generate_music``. Returns the ``gr.Blocks`` instance so the caller
    can mount it (e.g. inside a TabbedInterface).
    """
    with gr.Blocks(title="AudioLDM2 — Music + Lyrics", theme="Nymbo/Nymbo_Theme") as tab3:
        # Banner image; assumes banners/generator_banner.png exists in the
        # Space repo — TODO confirm.
        gr.Image(
            value="banners/generator_banner.png",
            show_label=False,
            container=False
        )
        gr.Markdown(
            "# AudioLDM2 — Música con Lyrics Embebidas\n"
            "Generación de música con letras usando **AudioLDM2 Music** "
            "(`cvssp/audioldm2-music`). El estilo (tags) y las lyrics se combinan "
            "en un único prompt enriquecido que el modelo procesa con CLAP + T5."
        )
        # Row 1: musical style controls (left) and lyrics textbox (right).
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### Estilo musical")
                instruments = gr.CheckboxGroup(
                    choices=_INSTRUMENTS,
                    value=["synthesizer", "drums", "bass guitar"],
                    label="Instrumentos",
                )
                voice = gr.Dropdown(
                    choices=_VOICES,
                    value="female",
                    label="Voz del cantante",
                )
                mood = gr.Dropdown(
                    choices=_MOODS,
                    value="energetic",
                    label="Mood / Emoción",
                )
                genre = gr.Dropdown(
                    choices=_GENRES,
                    value="synthwave",
                    label="Género",
                    allow_custom_value=True,  # user may type genres not in the list
                )
                with gr.Row():
                    tempo = gr.Dropdown(
                        choices=_TEMPOS,
                        value="fast",
                        label="Tempo",
                    )
                    bpm = gr.Number(value=130, label="BPM", minimum=40, maximum=240)
            with gr.Column(scale=1):
                gr.Markdown("### Lyrics")
                lyrics = gr.Textbox(
                    label="Letra de la canción",
                    lines=12,
                    value=_EXAMPLE_LYRICS,
                    placeholder="[Verse 1]\nTu letra aquí...\n\n[Chorus]\n...",
                )
        # Row 2: generation parameters (left) and button + audio output (right).
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### Parámetros de generación")
                duration = gr.Slider(
                    minimum=5, maximum=30, value=10, step=1,
                    label="Duración (segundos)",
                )
                guidance_scale = gr.Slider(
                    minimum=1.0, maximum=10.0, value=3.5, step=0.5,
                    label="Guidance Scale",
                )
                num_steps = gr.Slider(
                    minimum=10, maximum=200, value=50, step=10,
                    label="Pasos de inferencia",
                )
                negative_prompt = gr.Textbox(
                    label="Negative Prompt",
                    value="low quality, noise, distorted, muffled, speech, talking",
                    placeholder="Qué evitar en la generación",
                )
            with gr.Column(scale=1):
                generate_btn = gr.Button(
                    "Generar música con lyrics", variant="primary", size="lg"
                )
                # type="numpy" matches generate_music's (rate, waveform) return.
                output_audio = gr.Audio(label="Música generada", type="numpy")
        # Wire the button; input order must match generate_music's signature.
        generate_btn.click(
            fn=generate_music,
            inputs=[
                instruments, voice, mood, genre, tempo, bpm,
                lyrics,
                duration, guidance_scale, num_steps, negative_prompt,
            ],
            outputs=output_audio,
        )
    return tab3