IsraelRM committed on
Commit
10d6080
·
verified ·
1 Parent(s): 2a6b66c

Upload apps

Browse files
Files changed (3) hide show
  1. app.py +24 -0
  2. app_audioldm.py +219 -0
  3. app_musicldm.py +184 -0
app.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import gradio as gr
3
+
4
# Hugging Face token.
# The Space stores the secret under MI_TOKEN_HF. It is re-exported under
# HF_TOKEN, the variable the huggingface_hub client reads for authentication.
# The previous name, HF_HUB_TOKEN, is still exported for backward
# compatibility with anything that may have relied on it.
mi_token = os.environ.get("MI_TOKEN_HF")
if not mi_token:
    # Fail at startup rather than at the first authenticated download.
    raise ValueError("No se encontró la variable de entorno MI_TOKEN_HF")
os.environ["HF_HUB_TOKEN"] = mi_token
os.environ["HF_TOKEN"] = mi_token

# Tab modules are imported only after the token has been exported, keeping
# the original ordering of this script.
from app_heartmula import crear_tab1
from app_musicldm import crear_tab2
from app_audioldm import crear_tab3

# One tab per model; each crear_tabN() builds its own UI inside the
# currently open gr.Tab context.
with gr.Blocks(theme="Nymbo/Nymbo_Theme") as principal:
    with gr.Tabs():
        with gr.Tab("HeartMuLa"):
            tab1 = crear_tab1()
        with gr.Tab("MusicLDM — Estilo & Tags"):
            tab2 = crear_tab2()
        with gr.Tab("AudioLDM2 — Music + Lyrics"):
            tab3 = crear_tab3()

principal.launch()
app_audioldm.py ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ import numpy as np
4
+ import spaces
5
+ from diffusers import AudioLDM2Pipeline
6
+
7
+ _pipe = None
8
+
9
+
10
def _load_model():
    """Return the shared AudioLDM2 pipeline, creating it on first use.

    The pipeline is cached in the module-level ``_pipe`` so later calls reuse
    it. When ``torch.cuda.is_available()`` is true the weights are loaded as
    float16 and moved to ``"cuda"``; otherwise float32 on ``"cpu"``.
    """
    global _pipe
    if _pipe is not None:
        return _pipe

    use_cuda = torch.cuda.is_available()
    target_device = "cuda" if use_cuda else "cpu"
    weights_dtype = torch.float16 if use_cuda else torch.float32

    pipeline = AudioLDM2Pipeline.from_pretrained(
        "cvssp/audioldm2-music",
        torch_dtype=weights_dtype,
    )
    # Only cache once the pipeline has been moved to its device successfully.
    _pipe = pipeline.to(target_device)
    return _pipe
19
+
20
+
21
+ def _build_style_prompt(instruments, voice, mood, genre, tempo, bpm):
22
+ if instruments:
23
+ if len(instruments) == 1:
24
+ inst_txt = instruments[0]
25
+ else:
26
+ inst_txt = ", ".join(instruments[:-1]) + " and " + instruments[-1]
27
+ else:
28
+ inst_txt = "various instruments"
29
+
30
+ prompt = (
31
+ f"A {mood} {genre} song in {tempo} tempo at {int(bpm)} BPM, "
32
+ f"featuring {inst_txt}"
33
+ )
34
+ if voice and voice != "none":
35
+ prompt += f", sung by a {voice} voice"
36
+ return prompt
37
+
38
+
39
+ def _build_full_prompt(style_prompt, lyrics):
40
+ """
41
+ AudioLDM2 no tiene un slot separado para lyrics, pero entiende
42
+ descripciones largas que incluyen texto de canciones.
43
+ Se concatenan al prompt de estilo con un separador claro.
44
+ """
45
+ if not lyrics.strip():
46
+ return style_prompt
47
+ # Truncar lyrics para no saturar el tokenizer (límite ~200 tokens aprox)
48
+ lyrics_trimmed = lyrics.strip()[:600]
49
+ return f"{style_prompt}. Song lyrics: {lyrics_trimmed}"
50
+
51
+
52
@spaces.GPU(duration=180)
def generate_music(
    instruments,
    voice,
    mood,
    genre,
    tempo,
    bpm,
    lyrics,
    duration,
    guidance_scale,
    num_steps,
    negative_prompt,
):
    """Generate one audio clip with AudioLDM2 from style tags and lyrics.

    The style fields are turned into a prompt by ``_build_style_prompt``,
    the lyrics are appended by ``_build_full_prompt``, and the resulting text
    is logged to stdout before being passed to the pipeline.

    Returns:
        A ``(sample_rate, waveform)`` tuple with a fixed sample rate of
        16000 and the first entry of the pipeline's ``audios`` output.
    """
    # The pipeline is obtained first, before any prompt text is built.
    pipeline = _load_model()

    text_prompt = _build_full_prompt(
        _build_style_prompt(instruments, voice, mood, genre, tempo, bpm),
        lyrics,
    )
    print(f"[AudioLDM2] Prompt: {text_prompt}")

    call_kwargs = {
        # An empty negative prompt is passed as None rather than "".
        "negative_prompt": negative_prompt if negative_prompt else None,
        "audio_length_in_s": float(duration),
        "guidance_scale": guidance_scale,
        "num_inference_steps": int(num_steps),
        "num_waveforms_per_prompt": 1,
    }
    output = pipeline(text_prompt, **call_kwargs)

    # The original note describes this as a (samples,) float32 NumPy array.
    waveform = output.audios[0]
    return 16000, waveform
82
+
83
+
84
+ _GENRES = [
85
+ "pop", "rock", "jazz", "classical", "electronic", "folk", "metal",
86
+ "hip hop", "r&b", "soul", "blues", "country", "reggae", "ska",
87
+ "house", "techno", "trance", "dubstep", "drum and bass",
88
+ "ambient", "lofi", "synthwave", "electro", "idm",
89
+ "indie", "indie rock", "alternative", "grunge", "punk",
90
+ "heavy metal", "black metal", "death metal", "thrash metal",
91
+ "orchestral", "film score", "soundtrack", "bossa nova", "samba",
92
+ "flamenco", "celtic", "afrobeat", "k-pop", "city pop",
93
+ "experimental", "new age",
94
+ ]
95
+
96
+ _MOODS = [
97
+ "happy", "sad", "romantic", "energetic", "calm", "melancholic",
98
+ "dark", "epic", "mysterious", "peaceful", "angry",
99
+ ]
100
+
101
+ _TEMPOS = ["slow", "moderate", "fast", "upbeat", "relaxed", "driving", "laid-back"]
102
+
103
+ _VOICES = ["none", "male", "female", "choir", "opera singer", "rap vocals"]
104
+
105
+ _INSTRUMENTS = [
106
+ "piano", "guitar", "electric guitar", "bass guitar",
107
+ "drums", "synthesizer", "violin", "cello", "flute",
108
+ "saxophone", "trumpet", "organ", "harp",
109
+ ]
110
+
111
+ _EXAMPLE_LYRICS = """\
112
+ [Verse 1]
113
+ Midnight drips on mirrored stone,
114
+ Neon whispers, all alone.
115
+ Rain keeps time on empty streets,
116
+ Where past and future softly meet.
117
+
118
+ [Chorus]
119
+ She walks like smoke through circuits wide,
120
+ A neon ghost I cannot hide.
121
+ Reflections lost in silver rain,
122
+ I call her name — she won't remain.
123
+
124
+ [Bridge]
125
+ Static snow in every glance,
126
+ Trapped inside a cyber trance.
127
+ """
128
+
129
+
130
def crear_tab3():
    """Build the AudioLDM2 "music + lyrics" UI and return its ``gr.Blocks``.

    Layout: an intro text, a first row with the style controls next to the
    lyrics box, and a second row with the generation parameters next to the
    generate button and the audio output. The button is wired to
    ``generate_music``.

    NOTE(review): the container nesting below was reconstructed from a
    flattened listing; confirm it against the deployed layout.
    """
    intro_text = (
        "# AudioLDM2 — Música con Lyrics Embebidas\n"
        "Generación de música con letras usando **AudioLDM2 Music** "
        "(`cvssp/audioldm2-music`). El estilo (tags) y las lyrics se combinan "
        "en un único prompt enriquecido que el modelo procesa con CLAP + T5."
    )

    with gr.Blocks(title="AudioLDM2 — Music + Lyrics", theme="Nymbo/Nymbo_Theme") as tab3:
        gr.Markdown(intro_text)

        # Row 1: musical style (left) and lyrics (right).
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### Estilo musical")
                instrument_box = gr.CheckboxGroup(
                    label="Instrumentos",
                    choices=_INSTRUMENTS,
                    value=["synthesizer", "drums", "bass guitar"],
                )
                voice_menu = gr.Dropdown(
                    label="Voz del cantante",
                    choices=_VOICES,
                    value="female",
                )
                mood_menu = gr.Dropdown(
                    label="Mood / Emoción",
                    choices=_MOODS,
                    value="energetic",
                )
                genre_menu = gr.Dropdown(
                    label="Género",
                    choices=_GENRES,
                    value="synthwave",
                    allow_custom_value=True,
                )
                with gr.Row():
                    tempo_menu = gr.Dropdown(
                        label="Tempo",
                        choices=_TEMPOS,
                        value="fast",
                    )
                    bpm_field = gr.Number(value=130, label="BPM", minimum=40, maximum=240)

            with gr.Column(scale=1):
                gr.Markdown("### Lyrics")
                lyrics_box = gr.Textbox(
                    label="Letra de la canción",
                    lines=12,
                    value=_EXAMPLE_LYRICS,
                    placeholder="[Verse 1]\nTu letra aquí...\n\n[Chorus]\n...",
                )

        # Row 2: generation parameters (left) and trigger + result (right).
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### Parámetros de generación")
                duration_slider = gr.Slider(
                    label="Duración (segundos)",
                    minimum=5, maximum=30, value=10, step=1,
                )
                guidance_slider = gr.Slider(
                    label="Guidance Scale",
                    minimum=1.0, maximum=10.0, value=3.5, step=0.5,
                )
                steps_slider = gr.Slider(
                    label="Pasos de inferencia",
                    minimum=10, maximum=200, value=50, step=10,
                )
                negative_box = gr.Textbox(
                    label="Negative Prompt",
                    value="low quality, noise, distorted, muffled, speech, talking",
                    placeholder="Qué evitar en la generación",
                )

            with gr.Column(scale=1):
                generate_btn = gr.Button(
                    "Generar música con lyrics", variant="primary", size="lg"
                )
                output_audio = gr.Audio(label="Música generada", type="numpy")

        # Order must match the positional parameters of generate_music.
        form_inputs = [
            instrument_box, voice_menu, mood_menu, genre_menu, tempo_menu, bpm_field,
            lyrics_box,
            duration_slider, guidance_slider, steps_slider, negative_box,
        ]
        generate_btn.click(fn=generate_music, inputs=form_inputs, outputs=output_audio)

    return tab3
app_musicldm.py ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ import numpy as np
4
+ import spaces
5
+ from diffusers import MusicLDMPipeline
6
+
7
+ _pipe = None
8
+
9
+
10
def _load_model():
    """Return the shared MusicLDM pipeline, creating it on first use.

    The pipeline is cached in the module-level ``_pipe`` so later calls reuse
    it. When ``torch.cuda.is_available()`` is true the weights are loaded as
    float16 and moved to ``"cuda"``; otherwise float32 on ``"cpu"``.
    """
    global _pipe
    if _pipe is not None:
        return _pipe

    use_cuda = torch.cuda.is_available()
    target_device = "cuda" if use_cuda else "cpu"
    weights_dtype = torch.float16 if use_cuda else torch.float32

    pipeline = MusicLDMPipeline.from_pretrained(
        "ucsd-reach/musicldm",
        torch_dtype=weights_dtype,
    )
    # Only cache once the pipeline has been moved to its device successfully.
    _pipe = pipeline.to(target_device)
    return _pipe
19
+
20
+
21
+ def _build_prompt(instruments, voice, mood, genre, tempo, bpm, lyrics=""):
22
+ if instruments:
23
+ if len(instruments) == 1:
24
+ inst_txt = instruments[0]
25
+ else:
26
+ inst_txt = ", ".join(instruments[:-1]) + " and " + instruments[-1]
27
+ else:
28
+ inst_txt = "various instruments"
29
+
30
+ prompt = (
31
+ f"A {mood} {genre} song in {tempo} tempo at {int(bpm)} BPM, "
32
+ f"featuring {inst_txt}"
33
+ )
34
+ if voice and voice != "none":
35
+ prompt += f", sung by a {voice} voice"
36
+ if lyrics and lyrics.strip():
37
+ prompt += f". Song lyrics: {lyrics.strip()[:600]}"
38
+ return prompt
39
+
40
+
41
@spaces.GPU(duration=130)
def generate_music(
    instruments,
    voice,
    mood,
    genre,
    tempo,
    bpm,
    lyrics,
    duration,
    guidance_scale,
    num_steps,
    negative_prompt,
):
    """Generate one audio clip with MusicLDM from style tags and lyrics.

    The form values are turned into a prompt by ``_build_prompt``; the prompt
    is logged to stdout before being passed to the pipeline.

    Returns:
        A ``(sample_rate, waveform)`` tuple with a fixed sample rate of
        16000 and the first entry of the pipeline's ``audios`` output.
    """
    # The pipeline is obtained first, before any prompt text is built.
    pipeline = _load_model()
    text_prompt = _build_prompt(instruments, voice, mood, genre, tempo, bpm, lyrics)
    print(f"[MusicLDM] Prompt: {text_prompt}")

    call_kwargs = {
        # An empty negative prompt is passed as None rather than "".
        "negative_prompt": negative_prompt if negative_prompt else None,
        "audio_length_in_s": float(duration),
        "guidance_scale": guidance_scale,
        "num_inference_steps": int(num_steps),
    }
    output = pipeline(text_prompt, **call_kwargs)

    # The original note describes this as a (samples,) float32 NumPy array.
    waveform = output.audios[0]
    return 16000, waveform
68
+
69
+
70
+ _GENRES = [
71
+ "pop", "rock", "jazz", "classical", "electronic", "folk", "metal",
72
+ "hip hop", "r&b", "soul", "blues", "country", "reggae", "ska",
73
+ "house", "techno", "trance", "dubstep", "drum and bass",
74
+ "ambient", "lofi", "synthwave", "electro", "idm",
75
+ "indie", "indie rock", "alternative", "grunge", "punk",
76
+ "heavy metal", "black metal", "death metal", "thrash metal",
77
+ "orchestral", "film score", "soundtrack", "bossa nova", "samba",
78
+ "flamenco", "celtic", "afrobeat", "k-pop", "city pop",
79
+ "experimental", "new age",
80
+ ]
81
+
82
+ _MOODS = [
83
+ "happy", "sad", "romantic", "energetic", "calm", "melancholic",
84
+ "dark", "epic", "mysterious", "peaceful", "angry",
85
+ ]
86
+
87
+ _TEMPOS = ["slow", "moderate", "fast", "upbeat", "relaxed", "driving", "laid-back"]
88
+
89
+ _VOICES = ["none", "male", "female", "choir", "opera singer", "rap vocals"]
90
+
91
+ _INSTRUMENTS = [
92
+ "piano", "guitar", "electric guitar", "bass guitar",
93
+ "drums", "synthesizer", "violin", "cello", "flute",
94
+ "saxophone", "trumpet", "organ", "harp",
95
+ ]
96
+
97
+
98
def crear_tab2():
    """Build the MusicLDM UI and return its ``gr.Blocks`` container.

    The form collects style tags, optional lyrics and generation parameters;
    the button is wired to ``generate_music`` and the result is shown in an
    audio player.

    NOTE(review): the container nesting below was reconstructed from a
    flattened listing. In particular, placing the parameters, button and
    audio output inside the second column is an assumption; confirm it
    against the deployed layout.
    """
    intro_text = (
        "# MusicLDM — Latent Diffusion Music Generator\n"
        "Generación de música desde tags de estilo usando **MusicLDM** "
        "(`ucsd-reach/musicldm`). Los tags se convierten en un prompt "
        "estructurado en lenguaje natural para que el modelo los entienda correctamente."
    )

    with gr.Blocks(title="MusicLDM", theme="Nymbo/Nymbo_Theme") as tab2:
        gr.Markdown(intro_text)

        with gr.Row():
            # Left column: musical style.
            with gr.Column(scale=1):
                gr.Markdown("### Estilo musical")
                instrument_box = gr.CheckboxGroup(
                    label="Instrumentos",
                    choices=_INSTRUMENTS,
                    value=["synthesizer", "drums", "bass guitar"],
                )
                voice_menu = gr.Dropdown(
                    label="Voz del cantante",
                    choices=_VOICES,
                    value="female",
                )
                mood_menu = gr.Dropdown(
                    label="Mood / Emoción",
                    choices=_MOODS,
                    value="energetic",
                )
                genre_menu = gr.Dropdown(
                    label="Género",
                    choices=_GENRES,
                    value="synthwave",
                    allow_custom_value=True,
                )
                with gr.Row():
                    tempo_menu = gr.Dropdown(
                        label="Tempo",
                        choices=_TEMPOS,
                        value="fast",
                    )
                    bpm_field = gr.Number(value=130, label="BPM", minimum=40, maximum=240)

            # Right column: lyrics, generation parameters, trigger and result.
            with gr.Column(scale=1):
                gr.Markdown("### Lyrics (opcional)")
                lyrics_box = gr.Textbox(
                    label="Letra de la canción",
                    lines=8,
                    placeholder="[Verse 1]\nTu letra aquí...\n\n[Chorus]\n...",
                )

                gr.Markdown("### Parámetros de generación")

                duration_slider = gr.Slider(
                    label="Duración (segundos)",
                    minimum=5, maximum=30, value=10, step=1,
                )
                guidance_slider = gr.Slider(
                    label="Guidance Scale",
                    info="Mayor = más fiel al prompt",
                    minimum=1.0, maximum=10.0, value=3.5, step=0.5,
                )
                steps_slider = gr.Slider(
                    label="Pasos de inferencia",
                    info="Más pasos = mejor calidad, más lento",
                    minimum=10, maximum=200, value=50, step=10,
                )
                negative_box = gr.Textbox(
                    label="Negative Prompt",
                    value="low quality, noise, distorted, muffled",
                    placeholder="Qué evitar en la generación",
                )

                generate_btn = gr.Button("Generar música", variant="primary", size="lg")
                output_audio = gr.Audio(label="Música generada", type="numpy")

        # Order must match the positional parameters of generate_music.
        form_inputs = [
            instrument_box, voice_menu, mood_menu, genre_menu, tempo_menu, bpm_field,
            lyrics_box,
            duration_slider, guidance_slider, steps_slider, negative_box,
        ]
        generate_btn.click(fn=generate_music, inputs=form_inputs, outputs=output_audio)

    return tab2