# music_generation / app_audioldm.py
# Hugging Face Space file header (scraped): uploaded by IsraelRM,
# commit message "Update app_audioldm.py", commit 65438c6 (verified).
import gradio as gr
import torch
import numpy as np
import spaces
from diffusers import AudioLDM2Pipeline
from transformers import GPT2LMHeadModel, GPT2Tokenizer
# Module-level singleton for the AudioLDM2 pipeline; populated lazily by
# _load_model() so the heavy model download only happens on first use.
_pipe = None
def _load_model():
    """Lazily build and cache the AudioLDM2 music pipeline.

    Returns the module-level singleton ``_pipe``, creating it on first
    call. Weights are fetched from the Hugging Face Hub, so the first
    invocation is slow and requires network access.
    """
    global _pipe
    if _pipe is None:
        # Choose device: GPU with fp16 when available, else CPU with fp32.
        device = "cuda" if torch.cuda.is_available() else "cpu"
        torch_dtype = torch.float16 if device == "cuda" else torch.float32
        # Load the GPT-2 tokenizer and language model that will be injected
        # into the pipeline as its prompt language model.
        tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
        language_model = GPT2LMHeadModel.from_pretrained("gpt2").to(device, dtype=torch_dtype)
        # Load the AudioLDM2 music pipeline.
        # NOTE(review): AudioLDM2Pipeline's `tokenizer` component is normally
        # the CLAP tokenizer, not GPT-2's — confirm overriding it with the
        # GPT-2 tokenizer is intentional.
        _pipe = AudioLDM2Pipeline.from_pretrained(
            "cvssp/audioldm2-music",
            torch_dtype=torch_dtype,
            language_model=language_model,  # pass the replacement language model
            tokenizer=tokenizer
        ).to(device)
    return _pipe
def _build_style_prompt(instruments, voice, mood, genre, tempo, bpm):
if instruments:
if len(instruments) == 1:
inst_txt = instruments[0]
else:
inst_txt = ", ".join(instruments[:-1]) + " and " + instruments[-1]
else:
inst_txt = "various instruments"
prompt = (
f"A {mood} {genre} song in {tempo} tempo at {int(bpm)} BPM, "
f"featuring {inst_txt}"
)
if voice and voice != "none":
prompt += f", sung by a {voice} voice"
return prompt
def _build_full_prompt(style_prompt, lyrics):
"""
AudioLDM2 no tiene un slot separado para lyrics, pero entiende
descripciones largas que incluyen texto de canciones.
Se concatenan al prompt de estilo con un separador claro.
"""
if not lyrics.strip():
return style_prompt
# Truncar lyrics para no saturar el tokenizer (límite ~200 tokens aprox)
lyrics_trimmed = lyrics.strip()[:600]
return f"{style_prompt}. Song lyrics: {lyrics_trimmed}"
# ZeroGPU: request a GPU for at most 180 s per call on Hugging Face Spaces.
@spaces.GPU(duration=180)
def generate_music(
    instruments,
    voice,
    mood,
    genre,
    tempo,
    bpm,
    lyrics,
    duration,
    guidance_scale,
    num_steps,
    negative_prompt,
):
    """Generate a music clip from the UI selections and lyrics.

    Builds a single text prompt from the style controls plus lyrics and
    runs the AudioLDM2 pipeline once.

    Returns:
        A ``(sample_rate, waveform)`` tuple suitable for
        ``gr.Audio(type="numpy")``; the rate is hard-coded to 16000 Hz
        — presumably AudioLDM2's native output rate, TODO confirm.
    """
    pipe = _load_model()
    style_prompt = _build_style_prompt(instruments, voice, mood, genre, tempo, bpm)
    full_prompt = _build_full_prompt(style_prompt, lyrics)
    print(f"[AudioLDM2] Prompt: {full_prompt}")
    # Flatten to a single-line str; newlines come from the lyrics textbox.
    full_prompt = str(full_prompt)
    full_prompt = full_prompt.replace("\n", " ")
    result = pipe(
        prompt=full_prompt,
        negative_prompt=negative_prompt or None,  # empty textbox -> no negative prompt
        audio_length_in_s=float(duration),
        guidance_scale=guidance_scale,
        num_inference_steps=int(num_steps),
        num_waveforms_per_prompt=1,
    )
    audio = result.audios[0]  # (samples,) numpy float32
    return (16000, audio)
# Choice lists backing the Gradio controls in crear_tab3().

# Genre dropdown options (the dropdown also allows custom values).
_GENRES = [
    "pop", "rock", "jazz", "classical", "electronic", "folk", "metal",
    "hip hop", "r&b", "soul", "blues", "country", "reggae", "ska",
    "house", "techno", "trance", "dubstep", "drum and bass",
    "ambient", "lofi", "synthwave", "electro", "idm",
    "indie", "indie rock", "alternative", "grunge", "punk",
    "heavy metal", "black metal", "death metal", "thrash metal",
    "orchestral", "film score", "soundtrack", "bossa nova", "samba",
    "flamenco", "celtic", "afrobeat", "k-pop", "city pop",
    "experimental", "new age",
]
# Mood/emotion dropdown options.
_MOODS = [
    "happy", "sad", "romantic", "energetic", "calm", "melancholic",
    "dark", "epic", "mysterious", "peaceful", "angry",
]
# Tempo-feel dropdown options (qualitative; numeric BPM is a separate field).
_TEMPOS = ["slow", "moderate", "fast", "upbeat", "relaxed", "driving", "laid-back"]
# Voice dropdown options; "none" produces an instrumental prompt.
_VOICES = ["none", "male", "female", "choir", "opera singer", "rap vocals"]
# Instrument checkbox-group options.
_INSTRUMENTS = [
    "piano", "guitar", "electric guitar", "bass guitar",
    "drums", "synthesizer", "violin", "cello", "flute",
    "saxophone", "trumpet", "organ", "harp",
]
# Default lyrics shown in the textbox so users see the expected format.
_EXAMPLE_LYRICS = """\
[Verse 1]
Midnight drips on mirrored stone,
Neon whispers, all alone.
Rain keeps time on empty streets,
Where past and future softly meet.
[Chorus]
She walks like smoke through circuits wide,
A neon ghost I cannot hide.
Reflections lost in silver rain,
I call her name — she won't remain.
[Bridge]
Static snow in every glance,
Trapped inside a cyber trance.
"""
def crear_tab3():
    """Build and return the Gradio Blocks UI for the AudioLDM2 tab.

    Lays out style controls (instruments, voice, mood, genre, tempo, BPM),
    a lyrics textbox, generation parameters, and wires the generate button
    to ``generate_music``. Returns the ``gr.Blocks`` instance so the caller
    can mount it (e.g. inside a TabbedInterface).
    """
    with gr.Blocks(title="AudioLDM2 — Music + Lyrics", theme="Nymbo/Nymbo_Theme") as tab3:
        # Banner image; assumes banners/generator_banner.png exists in the
        # Space repo — TODO confirm.
        gr.Image(
            value="banners/generator_banner.png",
            show_label=False,
            container=False
        )
        gr.Markdown(
            "# AudioLDM2 — Música con Lyrics Embebidas\n"
            "Generación de música con letras usando **AudioLDM2 Music** "
            "(`cvssp/audioldm2-music`). El estilo (tags) y las lyrics se combinan "
            "en un único prompt enriquecido que el modelo procesa con CLAP + T5."
        )
        # Row 1: musical style controls (left) and lyrics textbox (right).
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### Estilo musical")
                instruments = gr.CheckboxGroup(
                    choices=_INSTRUMENTS,
                    value=["synthesizer", "drums", "bass guitar"],
                    label="Instrumentos",
                )
                voice = gr.Dropdown(
                    choices=_VOICES,
                    value="female",
                    label="Voz del cantante",
                )
                mood = gr.Dropdown(
                    choices=_MOODS,
                    value="energetic",
                    label="Mood / Emoción",
                )
                genre = gr.Dropdown(
                    choices=_GENRES,
                    value="synthwave",
                    label="Género",
                    allow_custom_value=True,  # user may type genres not in the list
                )
                with gr.Row():
                    tempo = gr.Dropdown(
                        choices=_TEMPOS,
                        value="fast",
                        label="Tempo",
                    )
                    bpm = gr.Number(value=130, label="BPM", minimum=40, maximum=240)
            with gr.Column(scale=1):
                gr.Markdown("### Lyrics")
                lyrics = gr.Textbox(
                    label="Letra de la canción",
                    lines=12,
                    value=_EXAMPLE_LYRICS,
                    placeholder="[Verse 1]\nTu letra aquí...\n\n[Chorus]\n...",
                )
        # Row 2: generation parameters (left) and button + audio output (right).
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### Parámetros de generación")
                duration = gr.Slider(
                    minimum=5, maximum=30, value=10, step=1,
                    label="Duración (segundos)",
                )
                guidance_scale = gr.Slider(
                    minimum=1.0, maximum=10.0, value=3.5, step=0.5,
                    label="Guidance Scale",
                )
                num_steps = gr.Slider(
                    minimum=10, maximum=200, value=50, step=10,
                    label="Pasos de inferencia",
                )
                negative_prompt = gr.Textbox(
                    label="Negative Prompt",
                    value="low quality, noise, distorted, muffled, speech, talking",
                    placeholder="Qué evitar en la generación",
                )
            with gr.Column(scale=1):
                generate_btn = gr.Button(
                    "Generar música con lyrics", variant="primary", size="lg"
                )
                # type="numpy" matches generate_music's (rate, waveform) return.
                output_audio = gr.Audio(label="Música generada", type="numpy")
        # Wire the button; input order must match generate_music's signature.
        generate_btn.click(
            fn=generate_music,
            inputs=[
                instruments, voice, mood, genre, tempo, bpm,
                lyrics,
                duration, guidance_scale, num_steps, negative_prompt,
            ],
            outputs=output_audio,
        )
    return tab3