import gradio as gr
import torch
import numpy as np
import spaces
from diffusers import AudioLDM2Pipeline
from transformers import GPT2LMHeadModel, GPT2Tokenizer

_pipe = None


def _load_model():
    global _pipe
    if _pipe is None:
        # Pick the device and a matching dtype (fp16 only makes sense on GPU)
        device = "cuda" if torch.cuda.is_available() else "cpu"
        torch_dtype = torch.float16 if device == "cuda" else torch.float32
        # Load the tokenizer and the matching language model
        tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
        language_model = GPT2LMHeadModel.from_pretrained("gpt2").to(device, dtype=torch_dtype)
        # Load the AudioLDM2 pipeline
        _pipe = AudioLDM2Pipeline.from_pretrained(
            "cvssp/audioldm2-music",
            torch_dtype=torch_dtype,
            language_model=language_model,  # pass the correct language model
            tokenizer=tokenizer,
        ).to(device)
    return _pipe
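
# Sanity-check sketch (hypothetical, not part of the app): the pipeline is
# cached in the module-level _pipe, so repeated calls reuse the same object.
#
#   pipe = _load_model()
#   assert _load_model() is pipe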


def _build_style_prompt(instruments, voice, mood, genre, tempo, bpm):
    # Join the selected instruments into natural language ("a, b and c")
    if instruments:
        if len(instruments) == 1:
            inst_txt = instruments[0]
        else:
            inst_txt = ", ".join(instruments[:-1]) + " and " + instruments[-1]
    else:
        inst_txt = "various instruments"
    prompt = (
        f"A {mood} {genre} song in {tempo} tempo at {int(bpm)} BPM, "
        f"featuring {inst_txt}"
    )
    if voice and voice != "none":
        prompt += f", sung by a {voice} voice"
    return prompt
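
# Illustrative example of the style prompt this helper builds:
#
#   _build_style_prompt(["piano", "drums"], "female", "calm", "jazz", "slow", 80)
#   -> "A calm jazz song in slow tempo at 80 BPM, featuring piano and drums,
#      sung by a female voice"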


def _build_full_prompt(style_prompt, lyrics):
    """
    AudioLDM2 has no separate slot for lyrics, but it understands long
    descriptions that include song text, so the lyrics are concatenated
    onto the style prompt with a clear separator.
    """
    if not lyrics.strip():
        return style_prompt
    # Truncate the lyrics so they don't saturate the tokenizer (~200-token limit)
    lyrics_trimmed = lyrics.strip()[:600]
    return f"{style_prompt}. Song lyrics: {lyrics_trimmed}"
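
# Illustrative example of the combined prompt:
#
#   _build_full_prompt("A calm jazz song", "[Verse 1] Hello")
#   -> "A calm jazz song. Song lyrics: [Verse 1] Hello"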


@spaces.GPU  # request a GPU on ZeroGPU Spaces; `spaces` is otherwise unused
def generate_music(
    instruments,
    voice,
    mood,
    genre,
    tempo,
    bpm,
    lyrics,
    duration,
    guidance_scale,
    num_steps,
    negative_prompt,
):
    pipe = _load_model()
    style_prompt = _build_style_prompt(instruments, voice, mood, genre, tempo, bpm)
    full_prompt = _build_full_prompt(style_prompt, lyrics)
    # Lyrics are multi-line; flatten newlines so the prompt is a single line
    full_prompt = str(full_prompt).replace("\n", " ")
    print(f"[AudioLDM2] Prompt: {full_prompt}")
    result = pipe(
        prompt=full_prompt,
        negative_prompt=negative_prompt or None,
        audio_length_in_s=float(duration),
        guidance_scale=guidance_scale,
        num_inference_steps=int(num_steps),
        num_waveforms_per_prompt=1,
    )
    audio = result.audios[0]  # (samples,) numpy float32
    return (16000, audio)  # AudioLDM2 generates 16 kHz audio
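
# A hypothetical standalone call, outside Gradio (all argument values are
# illustrative; soundfile is an assumed extra dependency used only to save):
#
#   import soundfile as sf
#   sr, wav = generate_music(
#       ["piano"], "none", "calm", "ambient", "slow", 70,
#       "", 10, 3.5, 50, "low quality, noise",
#   )
#   sf.write("out.wav", wav, sr)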


_GENRES = [
    "pop", "rock", "jazz", "classical", "electronic", "folk", "metal",
    "hip hop", "r&b", "soul", "blues", "country", "reggae", "ska",
    "house", "techno", "trance", "dubstep", "drum and bass",
    "ambient", "lofi", "synthwave", "electro", "idm",
    "indie", "indie rock", "alternative", "grunge", "punk",
    "heavy metal", "black metal", "death metal", "thrash metal",
    "orchestral", "film score", "soundtrack", "bossa nova", "samba",
    "flamenco", "celtic", "afrobeat", "k-pop", "city pop",
    "experimental", "new age",
]

_MOODS = [
    "happy", "sad", "romantic", "energetic", "calm", "melancholic",
    "dark", "epic", "mysterious", "peaceful", "angry",
]

_TEMPOS = ["slow", "moderate", "fast", "upbeat", "relaxed", "driving", "laid-back"]

_VOICES = ["none", "male", "female", "choir", "opera singer", "rap vocals"]

_INSTRUMENTS = [
    "piano", "guitar", "electric guitar", "bass guitar",
    "drums", "synthesizer", "violin", "cello", "flute",
    "saxophone", "trumpet", "organ", "harp",
]

_EXAMPLE_LYRICS = """\
[Verse 1]
Midnight drips on mirrored stone,
Neon whispers, all alone.
Rain keeps time on empty streets,
Where past and future softly meet.

[Chorus]
She walks like smoke through circuits wide,
A neon ghost I cannot hide.
Reflections lost in silver rain,
I call her name — she won't remain.

[Bridge]
Static snow in every glance,
Trapped inside a cyber trance.
"""


def crear_tab3():
    with gr.Blocks(title="AudioLDM2 — Music + Lyrics", theme="Nymbo/Nymbo_Theme") as tab3:
        gr.Image(
            value="banners/generator_banner.png",
            show_label=False,
            container=False,
        )
        gr.Markdown(
            "# AudioLDM2 — Music with Embedded Lyrics\n"
            "Music generation with lyrics using **AudioLDM2 Music** "
            "(`cvssp/audioldm2-music`). The style tags and the lyrics are combined "
            "into a single enriched prompt that the model processes with CLAP + T5."
        )
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### Musical style")
                instruments = gr.CheckboxGroup(
                    choices=_INSTRUMENTS,
                    value=["synthesizer", "drums", "bass guitar"],
                    label="Instruments",
                )
                voice = gr.Dropdown(
                    choices=_VOICES,
                    value="female",
                    label="Singing voice",
                )
                mood = gr.Dropdown(
                    choices=_MOODS,
                    value="energetic",
                    label="Mood / Emotion",
                )
                genre = gr.Dropdown(
                    choices=_GENRES,
                    value="synthwave",
                    label="Genre",
                    allow_custom_value=True,
                )
                with gr.Row():
                    tempo = gr.Dropdown(
                        choices=_TEMPOS,
                        value="fast",
                        label="Tempo",
                    )
                    bpm = gr.Number(value=130, label="BPM", minimum=40, maximum=240)
            with gr.Column(scale=1):
                gr.Markdown("### Lyrics")
                lyrics = gr.Textbox(
                    label="Song lyrics",
                    lines=12,
                    value=_EXAMPLE_LYRICS,
                    placeholder="[Verse 1]\nYour lyrics here...\n\n[Chorus]\n...",
                )
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### Generation parameters")
                duration = gr.Slider(
                    minimum=5, maximum=30, value=10, step=1,
                    label="Duration (seconds)",
                )
                guidance_scale = gr.Slider(
                    minimum=1.0, maximum=10.0, value=3.5, step=0.5,
                    label="Guidance Scale",
                )
                num_steps = gr.Slider(
                    minimum=10, maximum=200, value=50, step=10,
                    label="Inference steps",
                )
                negative_prompt = gr.Textbox(
                    label="Negative Prompt",
                    value="low quality, noise, distorted, muffled, speech, talking",
                    placeholder="What to avoid in the generation",
                )
            with gr.Column(scale=1):
                generate_btn = gr.Button(
                    "Generate music with lyrics", variant="primary", size="lg"
                )
                output_audio = gr.Audio(label="Generated music", type="numpy")
        generate_btn.click(
            fn=generate_music,
            inputs=[
                instruments, voice, mood, genre, tempo, bpm,
                lyrics,
                duration, guidance_scale, num_steps, negative_prompt,
            ],
            outputs=output_audio,
        )
    return tab3
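

# A minimal entry point, assuming this module is also meant to run standalone
# (in the larger app, crear_tab3 presumably builds one tab among several):
if __name__ == "__main__":
    crear_tab3().launch()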