OnyxMunk commited on
Commit
93e00ec
·
verified ·
1 Parent(s): 8a29941

Upload folder using huggingface_hub

Browse files
demos/__pycache__/musicgen_app.cpython-313.pyc ADDED
Binary file (27 kB). View file
 
demos/app_minimal.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+
def greet(name):
    """Return a simple greeting string for *name*."""
    greeting = "Hello " + name
    return greeting
# Minimal text-in/text-out Gradio wrapper around `greet`.
iface = gr.Interface(fn=greet, inputs="text", outputs="text")
# Start the web server (blocking call).
iface.launch()
demos/app_simple.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Simplified MusicGen Gradio App for Hugging Face Spaces
4
+ Uses Docker for reliable deployment with pre-built dependencies
5
+ """
6
+
7
+ import gradio as gr
8
+ import torch
9
+ from audiocraft.models import MusicGen
10
+ import numpy as np
11
+ import logging
12
+
# Configure root logging so progress messages show up in the Space logs.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize model
# Loaded once at import time; the small checkpoint keeps Space startup fast.
logger.info("Loading MusicGen model...")
model = MusicGen.get_pretrained('facebook/musicgen-small')  # Using small model for faster loading
logger.info("Model loaded successfully!")
20
+
def generate_music(prompt, duration=10, temperature=1.0, cfg_coeff=3.0):
    """
    Generate music from text prompt

    Args:
        prompt: Text description of the music
        duration: Length in seconds (max 30)
        temperature: Sampling temperature (higher = more random)
        cfg_coeff: Classifier-free guidance coefficient (higher = follow prompt more)

    Returns:
        Tuple of (sample_rate, samples) as gradio's numpy Audio component
        expects, with samples transposed to (time, channels).

    Raises:
        gr.Error: if generation fails for any reason.
    """
    try:
        logger.info(f"Generating music: '{prompt}' ({duration}s)")

        # BUGFIX: MusicGen's API spells the guidance parameter `cfg_coef`
        # (single "f", as used in demos/musicgen_app.py); passing `cfg_coeff`
        # raised "TypeError: unexpected keyword argument" on every request.
        model.set_generation_params(
            duration=min(duration, 30),  # hard cap to keep requests bounded
            temperature=temperature,
            cfg_coef=cfg_coeff
        )

        # Generate one clip for the single prompt.
        wav = model.generate([prompt])

        # Convert (channels, time) tensor to a numpy array for Gradio.
        audio_data = wav[0].cpu().numpy()
        sample_rate = model.sample_rate

        logger.info(f"Generated audio: shape={audio_data.shape}, sr={sample_rate}")

        return (sample_rate, audio_data.T)

    except Exception as e:
        logger.error(f"Generation error: {e}")
        raise gr.Error(f"Failed to generate music: {str(e)}")
54
+
# Example prompts
# Each row matches the interface inputs: [prompt, duration, temperature, cfg].
examples = [
    ["upbeat electronic dance music with a catchy melody", 10, 1.0, 3.0],
    ["relaxing piano melody with soft strings", 10, 1.0, 3.0],
    ["energetic rock music with electric guitar", 10, 1.0, 3.0],
    ["lo-fi hip hop beats for studying", 10, 1.0, 3.0],
    ["epic orchestral soundtrack with dramatic strings", 10, 1.0, 3.0],
]
63
+
# Build Gradio interface
with gr.Blocks(title="MusicGen - AI Music Generator") as demo:
    gr.Markdown("# 🎵 MusicGen - AI Music Generator")
    gr.Markdown("Generate music from text descriptions using Meta's MusicGen AI model.")

    with gr.Row():
        # Left column: prompt plus the three sampling controls.
        with gr.Column():
            prompt_input = gr.Textbox(
                label="Music Description",
                placeholder="Describe the music you want to generate...",
                lines=3
            )

            with gr.Row():
                duration_input = gr.Slider(
                    minimum=5,
                    maximum=30,
                    value=10,
                    step=1,
                    label="Duration (seconds)"
                )

                temperature_input = gr.Slider(
                    minimum=0.1,
                    maximum=1.5,
                    value=1.0,
                    step=0.1,
                    label="Temperature (creativity)"
                )

                cfg_input = gr.Slider(
                    minimum=1.0,
                    maximum=10.0,
                    value=3.0,
                    step=0.5,
                    label="CFG Scale (prompt following)"
                )

            generate_btn = gr.Button("🎵 Generate Music", variant="primary")

        # Right column: the generated audio player.
        with gr.Column():
            audio_output = gr.Audio(
                label="Generated Music",
                type="numpy"
            )

    # cache_examples=False: examples run the model on click rather than
    # being pre-rendered at startup.
    gr.Examples(
        examples=examples,
        inputs=[prompt_input, duration_input, temperature_input, cfg_input],
        outputs=audio_output,
        fn=generate_music,
        cache_examples=False,
    )

    generate_btn.click(
        fn=generate_music,
        inputs=[prompt_input, duration_input, temperature_input, cfg_input],
        outputs=audio_output
    )

if __name__ == "__main__":
    logger.info("Launching Gradio interface...")
    # 0.0.0.0:7860 is the host/port binding Hugging Face Spaces expects.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False
    )
demos/musicgen_app.py ADDED
@@ -0,0 +1,529 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # Updated to account for UI changes from https://github.com/rkfg/audiocraft/blob/long/app.py
8
+ # also released under the MIT license.
9
+
10
+ import argparse
11
+ from concurrent.futures import ProcessPoolExecutor
12
+ import logging
13
+ import os
14
+ from pathlib import Path
15
+ import subprocess as sp
16
+ import sys
17
+ from tempfile import NamedTemporaryFile
18
+ import time
19
+ import typing as tp
20
+ import warnings
21
+
22
+ from einops import rearrange
23
+ import torch
24
+ import gradio as gr
25
+
26
+ from audiocraft.data.audio_utils import convert_audio
27
+ from audiocraft.data.audio import audio_write
28
+ from audiocraft.models.encodec import InterleaveStereoCompressionModel
29
+ from audiocraft.models import MusicGen, MultiBandDiffusion
30
+
31
+
MODEL = None  # Last used model
SPACE_ID = os.environ.get('SPACE_ID', '')
# The public facebook Spaces run the batched UI (fixed model, fixed duration).
IS_BATCHED = "facebook/MusicGen" in SPACE_ID or 'musicgen-internal/musicgen_dev' in SPACE_ID
MAX_BATCH_SIZE = 12
BATCHED_DURATION = 15
INTERRUPTING = False  # set by the UI "Interrupt" button, polled during generation
MBD = None  # MultiBandDiffusion decoder, loaded lazily by load_diffusion()
# We have to wrap subprocess call to clean a bit the log when using gr.make_waveform
_old_call = sp.call
42
+
43
+
def _call_nostderr(*args, **kwargs):
    """Drop-in replacement for ``subprocess.call`` that silences child output.

    Avoids ffmpeg vomiting on the logs when gr.make_waveform shells out.
    """
    kwargs['stderr'] = sp.DEVNULL
    kwargs['stdout'] = sp.DEVNULL
    # BUGFIX: propagate the exit code. The original wrapper discarded it, so
    # any caller checking subprocess.call()'s return value always got None.
    return _old_call(*args, **kwargs)
49
+
50
+
# Route all subprocess.call invocations through the silenced wrapper.
sp.call = _call_nostderr
# Preallocating the pool of processes.
# Used to render waveform videos concurrently with generation; entered once
# for the process lifetime and never exited (acceptable for a demo server).
pool = ProcessPoolExecutor(4)
pool.__enter__()
55
+
56
+
def interrupt():
    """Signal the in-flight generation to abort at its next progress callback."""
    global INTERRUPTING
    INTERRUPTING = True
60
+
61
+
class FileCleaner:
    """Tracks generated files and deletes them once they exceed a lifetime.

    Entries are stored oldest-first, so cleanup only ever needs to look at
    the head of the list.
    """

    def __init__(self, file_lifetime: float = 3600):
        self.file_lifetime = file_lifetime
        self.files = []  # list of (timestamp_added, Path), chronological

    def add(self, path: tp.Union[str, Path]):
        """Register *path* for delayed deletion, purging expired files first."""
        self._cleanup()
        self.files.append((time.time(), Path(path)))

    def _cleanup(self):
        # Pop expired entries from the front until we hit a still-fresh one.
        now = time.time()
        while self.files:
            added_at, path = self.files[0]
            if now - added_at <= self.file_lifetime:
                break
            if path.exists():
                path.unlink()
            self.files.pop(0)
80
+
81
+
# Single shared cleaner; generated wav/mp4 files live for one hour (default).
file_cleaner = FileCleaner()
83
+
84
+
def make_waveform(*args, **kwargs):
    """Render a waveform video via ``gr.make_waveform``, silencing warnings.

    Also prints how long the render took, since it runs in the process pool.
    """
    start = time.time()
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        video = gr.make_waveform(*args, **kwargs)
    print("Make a video took", time.time() - start)
    return video
93
+
94
+
def load_model(version='facebook/musicgen-melody'):
    """Ensure the global MODEL holds the MusicGen checkpoint named *version*.

    Reuses the already-loaded model when the requested version matches.
    """
    global MODEL
    print("Loading model", version)
    if MODEL is not None and MODEL.name == version:
        return
    del MODEL
    # Clear the global first so a crash during loading does not leave a
    # half-freed model bound to MODEL.
    MODEL = None
    MODEL = MusicGen.get_pretrained(version)
102
+
103
+
def load_diffusion():
    """Lazily instantiate the MultiBandDiffusion decoder into the global MBD."""
    global MBD
    if MBD is not None:
        return
    print("loading MBD")
    MBD = MultiBandDiffusion.get_mbd_musicgen()
109
+
110
+
def _do_predictions(texts, melodies, duration, progress=False, gradio_progress=None, **gen_kwargs):
    """Run one generation batch and return (video_paths, wav_paths).

    texts: list of prompt strings.
    melodies: list parallel to texts; each entry is None or a
        (sample_rate, numpy samples) pair as produced by gr.Audio.
        # NOTE(review): assumes gradio's numpy audio layout — confirm.
    duration: requested clip length in seconds.
    gen_kwargs: forwarded to MODEL.set_generation_params.

    Relies on module globals: MODEL (must be loaded), USE_DIFFUSION (set by
    the UI entry points), MBD (when USE_DIFFUSION), pool, file_cleaner.
    """
    MODEL.set_generation_params(duration=duration, **gen_kwargs)
    print("new batch", len(texts), texts, [None if m is None else (m[0], m[1].shape) for m in melodies])
    be = time.time()
    processed_melodies = []
    target_sr = 32000
    target_ac = 1  # conditioning melody is converted to mono at 32 kHz
    for melody in melodies:
        if melody is None:
            processed_melodies.append(None)
        else:
            # (sr, samples) -> float tensor on the model device, channels-first.
            sr, melody = melody[0], torch.from_numpy(melody[1]).to(MODEL.device).float().t()
            if melody.dim() == 1:
                melody = melody[None]
            # Trim the reference melody to the requested duration.
            melody = melody[..., :int(sr * duration)]
            melody = convert_audio(melody, sr, target_sr, target_ac)
            processed_melodies.append(melody)

    try:
        if any(m is not None for m in processed_melodies):
            # Melody-conditioned path; return_tokens so MBD can re-decode.
            outputs = MODEL.generate_with_chroma(
                descriptions=texts,
                melody_wavs=processed_melodies,
                melody_sample_rate=target_sr,
                progress=progress,
                return_tokens=USE_DIFFUSION
            )
        else:
            outputs = MODEL.generate(texts, progress=progress, return_tokens=USE_DIFFUSION)
    except RuntimeError as e:
        # Surface model errors (e.g. CUDA OOM) to the UI instead of a 500.
        raise gr.Error("Error while generating " + e.args[0])
    if USE_DIFFUSION:
        if gradio_progress is not None:
            gradio_progress(1, desc='Running MultiBandDiffusion...')
        tokens = outputs[1]
        if isinstance(MODEL.compression_model, InterleaveStereoCompressionModel):
            # Stereo models interleave channels; MBD decodes each side separately.
            left, right = MODEL.compression_model.get_left_right_codes(tokens)
            tokens = torch.cat([left, right])
        outputs_diffusion = MBD.tokens_to_wav(tokens)
        if isinstance(MODEL.compression_model, InterleaveStereoCompressionModel):
            assert outputs_diffusion.shape[1] == 1  # output is mono
            # Re-interleave the two mono decodes back into stereo clips.
            outputs_diffusion = rearrange(outputs_diffusion, '(s b) c t -> b (s c) t', s=2)
        # GAN-decoded clips first, then their MBD counterparts.
        outputs = torch.cat([outputs[0], outputs_diffusion], dim=0)
    outputs = outputs.detach().cpu().float()
    pending_videos = []
    out_wavs = []
    for output in outputs:
        with NamedTemporaryFile("wb", suffix=".wav", delete=False) as file:
            audio_write(
                file.name, output, MODEL.sample_rate, strategy="loudness",
                loudness_headroom_db=16, loudness_compressor=True, add_suffix=False)
            # Waveform videos render in the process pool while we keep writing.
            pending_videos.append(pool.submit(make_waveform, file.name))
            out_wavs.append(file.name)
            file_cleaner.add(file.name)
    out_videos = [pending_video.result() for pending_video in pending_videos]
    for video in out_videos:
        file_cleaner.add(video)
    print("batch finished", len(texts), time.time() - be)
    print("Tempfiles currently stored: ", len(file_cleaner.files))
    return out_videos, out_wavs
171
+
172
+
def predict_batched(texts, melodies):
    """Batched entry point for the public Space: fixed model, fixed duration.

    Prompts are truncated to 512 characters before generation.
    """
    max_text_length = 512
    truncated = [text[:max_text_length] for text in texts]
    load_model('facebook/musicgen-stereo-melody')
    return _do_predictions(truncated, melodies, BATCHED_DURATION)
179
+
180
+
def predict_full(model, model_path, decoder, text, melody, duration, topk, topp, temperature, cfg_coef, progress=gr.Progress()):
    """Full-UI entry point: validate inputs, load the model, run one generation.

    Returns (video, wav, mbd_video, mbd_wav); the last two are None unless the
    MultiBand_Diffusion decoder is selected.
    Raises gr.Error for invalid inputs, bad model paths, or interruption.
    """
    global INTERRUPTING
    global USE_DIFFUSION
    INTERRUPTING = False
    progress(0, desc="Loading model...")
    model_path = model_path.strip()
    if model_path:
        if not Path(model_path).exists():
            raise gr.Error(f"Model path {model_path} doesn't exist.")
        if not Path(model_path).is_dir():
            raise gr.Error(f"Model path {model_path} must be a folder containing "
                           "state_dict.bin and compression_state_dict_.bin.")
        # A custom path overrides the selected pretrained model name.
        model = model_path
    if temperature < 0:
        raise gr.Error("Temperature must be >= 0.")
    if topk < 0:
        raise gr.Error("Topk must be non-negative.")
    if topp < 0:
        raise gr.Error("Topp must be non-negative.")

    topk = int(topk)  # gr.Number delivers floats; MusicGen expects an int
    if decoder == "MultiBand_Diffusion":
        USE_DIFFUSION = True
        progress(0, desc="Loading diffusion model...")
        load_diffusion()
    else:
        USE_DIFFUSION = False
    load_model(model)

    max_generated = 0

    def _progress(generated, to_generate):
        # Progress callback invoked by MusicGen; monotonic via max_generated.
        nonlocal max_generated
        max_generated = max(generated, max_generated)
        progress((min(max_generated, to_generate), to_generate))
        # Abort mid-generation when the Interrupt button was pressed.
        if INTERRUPTING:
            raise gr.Error("Interrupted.")
    MODEL.set_custom_progress_callback(_progress)

    videos, wavs = _do_predictions(
        [text], [melody], duration, progress=True,
        top_k=topk, top_p=topp, temperature=temperature, cfg_coef=cfg_coef,
        gradio_progress=progress)
    if USE_DIFFUSION:
        # Index 0 is the GAN decode, index 1 the MBD decode of the same tokens.
        return videos[0], wavs[0], videos[1], wavs[1]
    return videos[0], wavs[0], None, None
227
+
228
+
def toggle_audio_src(choice):
    """Switch the melody widget between microphone capture and file upload."""
    if choice != "mic":
        return gr.update(sources=["upload"], value=None, label="File")
    return gr.update(sources=["microphone"], value=None, label="Microphone")
234
+
235
+
def toggle_diffusion(choice):
    """Show or hide the two MBD output widgets based on the decoder choice."""
    visible = choice == "MultiBand_Diffusion"
    return [gr.update(visible=visible)] * 2
241
+
242
+
def ui_full(launch_kwargs):
    """Build and launch the full demo UI: model selection, custom model paths,
    long durations, sampling controls, and the optional MBD decoder.

    launch_kwargs: forwarded to gradio's ``launch`` (host, port, auth, ...).
    Blocking call.
    """
    with gr.Blocks() as interface:
        gr.Markdown(
            """
            # MusicGen
            This is your private demo for [MusicGen](https://github.com/facebookresearch/audiocraft),
            a simple and controllable model for music generation
            presented at: ["Simple and Controllable Music Generation"](https://huggingface.co/papers/2306.05284)
            """
        )
        with gr.Row():
            with gr.Column():
                with gr.Row():
                    text = gr.Text(label="Input Text", interactive=True)
                    with gr.Column():
                        radio = gr.Radio(["file", "mic"], value="file",
                                         label="Condition on a melody (optional) File or Mic")
                        melody = gr.Audio(sources=["upload"], type="numpy", label="File",
                                          interactive=True, elem_id="melody-input")
                with gr.Row():
                    submit = gr.Button("Submit")
                    # Adapted from https://github.com/rkfg/audiocraft/blob/long/app.py, MIT license.
                    _ = gr.Button("Interrupt").click(fn=interrupt, queue=False)
                with gr.Row():
                    model = gr.Radio(["facebook/musicgen-melody", "facebook/musicgen-medium", "facebook/musicgen-small",
                                      "facebook/musicgen-large", "facebook/musicgen-melody-large",
                                      "facebook/musicgen-stereo-small", "facebook/musicgen-stereo-medium",
                                      "facebook/musicgen-stereo-melody", "facebook/musicgen-stereo-large",
                                      "facebook/musicgen-stereo-melody-large"],
                                     label="Model", value="facebook/musicgen-stereo-melody", interactive=True)
                    model_path = gr.Text(label="Model Path (custom models)")
                with gr.Row():
                    decoder = gr.Radio(["Default", "MultiBand_Diffusion"],
                                       label="Decoder", value="Default", interactive=True)
                with gr.Row():
                    duration = gr.Slider(minimum=1, maximum=120, value=10, label="Duration", interactive=True)
                with gr.Row():
                    topk = gr.Number(label="Top-k", value=250, interactive=True)
                    topp = gr.Number(label="Top-p", value=0, interactive=True)
                    temperature = gr.Number(label="Temperature", value=1.0, interactive=True)
                    cfg_coef = gr.Number(label="Classifier Free Guidance", value=3.0, interactive=True)
            with gr.Column():
                output = gr.Video(label="Generated Music")
                audio_output = gr.Audio(label="Generated Music (wav)", type='filepath')
                diffusion_output = gr.Video(label="MultiBand Diffusion Decoder")
                audio_diffusion = gr.Audio(label="MultiBand Diffusion Decoder (wav)", type='filepath')
        # First toggle the MBD widgets' visibility, then run the generation.
        submit.click(toggle_diffusion, decoder, [diffusion_output, audio_diffusion], queue=False,
                     show_progress=False).then(predict_full, inputs=[model, model_path, decoder, text, melody, duration, topk, topp,
                                                                     temperature, cfg_coef],
                                               outputs=[output, audio_output, diffusion_output, audio_diffusion])
        radio.change(toggle_audio_src, radio, [melody], queue=False, show_progress=False)

        gr.Examples(
            fn=predict_full,
            examples=[
                [
                    "An 80s driving pop song with heavy drums and synth pads in the background",
                    "./assets/bach.mp3",
                    "facebook/musicgen-stereo-melody",
                    "Default"
                ],
                [
                    "A cheerful country song with acoustic guitars",
                    "./assets/bolero_ravel.mp3",
                    "facebook/musicgen-stereo-melody",
                    "Default"
                ],
                [
                    "90s rock song with electric guitar and heavy drums",
                    None,
                    "facebook/musicgen-stereo-medium",
                    "Default"
                ],
                [
                    "a light and cheerly EDM track, with syncopated drums, aery pads, and strong emotions",
                    "./assets/bach.mp3",
                    "facebook/musicgen-stereo-melody",
                    "Default"
                ],
                [
                    "lofi slow bpm electro chill with organic samples",
                    None,
                    "facebook/musicgen-stereo-medium",
                    "Default"
                ],
                [
                    "Punk rock with loud drum and power guitar",
                    None,
                    "facebook/musicgen-stereo-medium",
                    "MultiBand_Diffusion"
                ],
            ],
            inputs=[text, melody, model, decoder],
            outputs=[output]
        )
        gr.Markdown(
            """
            ### More details

            The model will generate a short music extract based on the description you provided.
            The model can generate up to 30 seconds of audio in one pass.

            The model was trained with description from a stock music catalog, descriptions that will work best
            should include some level of details on the instruments present, along with some intended use case
            (e.g. adding "perfect for a commercial" can somehow help).

            Using one of the `melody` model (e.g. `musicgen-melody-*`), you can optionally provide a reference audio
            from which a broad melody will be extracted.
            The model will then try to follow both the description and melody provided.
            For best results, the melody should be 30 seconds long (I know, the samples we provide are not...)

            It is now possible to extend the generation by feeding back the end of the previous chunk of audio.
            This can take a long time, and the model might lose consistency. The model might also
            decide at arbitrary positions that the song ends.

            **WARNING:** Choosing long durations will take a long time to generate (2min might take ~10min).
            An overlap of 12 seconds is kept with the previously generated chunk, and 18 "new" seconds
            are generated each time.

            We present 10 model variations:
            1. facebook/musicgen-melody -- a music generation model capable of generating music condition
                on text and melody inputs. **Note**, you can also use text only.
            2. facebook/musicgen-small -- a 300M transformer decoder conditioned on text only.
            3. facebook/musicgen-medium -- a 1.5B transformer decoder conditioned on text only.
            4. facebook/musicgen-large -- a 3.3B transformer decoder conditioned on text only.
            5. facebook/musicgen-melody-large -- a 3.3B transformer decoder conditioned on and melody.
            6. facebook/musicgen-stereo-*: same as the previous models but fine tuned to output stereo audio.

            We also present two way of decoding the audio tokens
            1. Use the default GAN based compression model. It can suffer from artifacts especially
                for crashes, snares etc.
            2. Use [MultiBand Diffusion](https://arxiv.org/abs/2308.02560). Should improve the audio quality,
                at an extra computational cost. When this is selected, we provide both the GAN based decoded
                audio, and the one obtained with MBD.

            See [github.com/facebookresearch/audiocraft](https://github.com/facebookresearch/audiocraft/blob/main/docs/MUSICGEN.md)
            for more details.
            """
        )

        interface.queue().launch(**launch_kwargs)
384
+
385
+
def ui_batched(launch_kwargs):
    """Build and launch the batched public-Space UI (prompt + optional melody
    only; fixed stereo-melody model and 15 s duration via predict_batched).

    launch_kwargs: forwarded to gradio's ``launch``. Blocking call.
    """
    with gr.Blocks() as demo:
        gr.Markdown(
            """
            # MusicGen

            This is the demo for [MusicGen](https://github.com/facebookresearch/audiocraft/blob/main/docs/MUSICGEN.md),
            a simple and controllable model for music generation
            presented at: ["Simple and Controllable Music Generation"](https://huggingface.co/papers/2306.05284).
            <br/>
            <a href="https://huggingface.co/spaces/facebook/MusicGen?duplicate=true"
                style="display: inline-block;margin-top: .5em;margin-right: .25em;" target="_blank">
                <img style="margin-bottom: 0em;display: inline;margin-top: -.25em;"
                    src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
            for longer sequences, more control and no queue.</p>
            """
        )
        with gr.Row():
            with gr.Column():
                with gr.Row():
                    text = gr.Text(label="Describe your music", lines=2, interactive=True)
                    with gr.Column():
                        radio = gr.Radio(["file", "mic"], value="file",
                                         label="Condition on a melody (optional) File or Mic")
                        melody = gr.Audio(sources=["upload"], type="numpy", label="File",
                                          interactive=True, elem_id="melody-input")
                with gr.Row():
                    submit = gr.Button("Generate")
            with gr.Column():
                output = gr.Video(label="Generated Music")
                audio_output = gr.Audio(label="Generated Music (wav)", type='filepath')
        # batch=True lets gradio group concurrent requests into one model call.
        submit.click(predict_batched, inputs=[text, melody],
                     outputs=[output, audio_output], batch=True, max_batch_size=MAX_BATCH_SIZE)
        radio.change(toggle_audio_src, radio, [melody], queue=False, show_progress=False)
        gr.Examples(
            fn=predict_batched,
            examples=[
                [
                    "An 80s driving pop song with heavy drums and synth pads in the background",
                    "./assets/bach.mp3",
                ],
                [
                    "A cheerful country song with acoustic guitars",
                    "./assets/bolero_ravel.mp3",
                ],
                [
                    "90s rock song with electric guitar and heavy drums",
                    None,
                ],
                [
                    "a light and cheerly EDM track, with syncopated drums, aery pads, and strong emotions bpm: 130",
                    "./assets/bach.mp3",
                ],
                [
                    "lofi slow bpm electro chill with organic samples",
                    None,
                ],
            ],
            inputs=[text, melody],
            outputs=[output]
        )
        gr.Markdown("""
        ### More details

        The model will generate 15 seconds of audio based on the description you provided.
        The model was trained with description from a stock music catalog, descriptions that will work best
        should include some level of details on the instruments present, along with some intended use case
        (e.g. adding "perfect for a commercial" can somehow help).

        You can optionally provide a reference audio from which a broad melody will be extracted.
        The model will then try to follow both the description and melody provided.
        For best results, the melody should be 30 seconds long (I know, the samples we provide are not...)

        You can access more control (longer generation, more models etc.) by clicking
        the <a href="https://huggingface.co/spaces/facebook/MusicGen?duplicate=true"
            style="display: inline-block;margin-top: .5em;margin-right: .25em;" target="_blank">
            <img style="margin-bottom: 0em;display: inline;margin-top: -.25em;"
                src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
        (you will then need a paid GPU from HuggingFace).
        If you have a GPU, you can run the gradio demo locally (click the link to our repo below for more info).
        Finally, you can get a GPU for free from Google
        and run the demo in [a Google Colab.](https://ai.honu.io/red/musicgen-colab).

        See [github.com/facebookresearch/audiocraft](https://github.com/facebookresearch/audiocraft/blob/main/docs/MUSICGEN.md)
        for more details. All samples are generated with the `stereo-melody` model.
        """)

        demo.queue(max_size=8 * 4).launch(**launch_kwargs)
474
+
475
+
if __name__ == "__main__":
    # Command-line options for local runs; Spaces deployments use defaults.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--listen',
        type=str,
        # Bind on all interfaces when running inside a HF Space container.
        default='0.0.0.0' if 'SPACE_ID' in os.environ else '127.0.0.1',
        help='IP to listen on for connections to Gradio',
    )
    parser.add_argument(
        '--username', type=str, default='', help='Username for authentication'
    )
    parser.add_argument(
        '--password', type=str, default='', help='Password for authentication'
    )
    parser.add_argument(
        '--server_port',
        type=int,
        default=0,
        help='Port to run the server listener on',
    )
    parser.add_argument(
        '--inbrowser', action='store_true', help='Open in browser'
    )
    parser.add_argument(
        '--share', action='store_true', help='Share the gradio UI'
    )

    args = parser.parse_args()

    launch_kwargs = {}
    launch_kwargs['server_name'] = args.listen

    # auth is only enabled when BOTH username and password were supplied.
    if args.username and args.password:
        launch_kwargs['auth'] = (args.username, args.password)
    if args.server_port:
        launch_kwargs['server_port'] = args.server_port
    if args.inbrowser:
        launch_kwargs['inbrowser'] = args.inbrowser
    if args.share:
        launch_kwargs['share'] = True

    # Hugging Face Spaces require share=True in certain configurations
    # if 'SPACE_ID' in os.environ:
    #     launch_kwargs['share'] = True

    logging.basicConfig(level=logging.INFO, stream=sys.stderr)

    # Show the interface
    if IS_BATCHED:
        # NOTE(review): `global` at module level is a legal no-op; this simply
        # initializes the USE_DIFFUSION flag read by _do_predictions. In the
        # non-batched path the flag is set by predict_full before use.
        global USE_DIFFUSION
        USE_DIFFUSION = False
        ui_batched(launch_kwargs)
    else:
        ui_full(launch_kwargs)