Image-to-MusicGen

Paused

App Files Files Community

fffiloni committed on Jun 9, 2023

Commit

d94e8fe

1 Parent(s): c7e6202

Update app.py

Browse files

Files changed (1) hide show

app.py +12 -40

app.py CHANGED Viewed

@@ -16,13 +16,16 @@ from audiocraft.data.audio import audio_write
 MODEL = None
 def load_model(version):
     """Announce the requested version on stdout, then fetch that pretrained MusicGen model."""
     print("Loading model", version)
     pretrained = MusicGen.get_pretrained(version)
     return pretrained
-def predict(model, text, melody, duration, topk, topp, temperature, cfg_coef):
     global MODEL
     topk = int(topk)
     if MODEL is None or MODEL.name != model:
@@ -57,8 +60,8 @@ def predict(model, text, melody, duration, topk, topp, temperature, cfg_coef):
     output = output.detach().cpu().float()[0]
     with NamedTemporaryFile("wb", suffix=".wav", delete=False) as file:
         audio_write(file.name, output, MODEL.sample_rate, strategy="loudness", add_suffix=False)
-        waveform_video = gr.make_waveform(file.name)
-    return waveform_video
 with gr.Blocks() as demo:
@@ -77,7 +80,7 @@ with gr.Blocks() as demo:
     with gr.Row():
         with gr.Column():
             with gr.Row():
-                text = gr.Text(label="Input Text", interactive=True)
                 melody = gr.Audio(source="upload", type="numpy", label="Melody Condition (optional)", interactive=True)
             with gr.Row():
                 submit = gr.Button("Submit")
@@ -90,46 +93,15 @@ with gr.Blocks() as demo:
                 topp = gr.Number(label="Top-p", value=0, interactive=True)
                 temperature = gr.Number(label="Temperature", value=1.0, interactive=True)
                 cfg_coef = gr.Number(label="Classifier Free Guidance", value=3.0, interactive=True)
-        with gr.Column():
-            output = gr.Video(label="Generated Music")
-    submit.click(predict, inputs=[model, text, melody, duration, topk, topp, temperature, cfg_coef], outputs=[output])
-    gr.Examples(
-        fn=predict,
-        examples=[
-            [
-                "An 80s driving pop song with heavy drums and synth pads in the background",
-                "./assets/bach.mp3",
-                "melody"
-            ],
-            [
-                "A cheerful country song with acoustic guitars",
-                "./assets/bolero_ravel.mp3",
-                "melody"
-            ],
-            [
-                "90s rock song with electric guitar and heavy drums",
-                None,
-                "medium"
-            ],
-            [
-                "a light and cheerly EDM track, with syncopated drums, aery pads, and strong emotions",
-                "./assets/bach.mp3",
-                "melody"
-            ],
-            [
-                "lofi slow bpm electro chill with organic samples",
-                None,
-                "medium",
-            ],
-        ],
-        inputs=[text, melody, model],
-        outputs=[output]
-    )
     gr.Markdown(
         """
         ### More details
-        The model will generate a short music extract based on the description you provided.
         You can generate up to 30 seconds of audio.
         We present 4 model variations:

 MODEL = None
+img_to_text = gr.Blocks.load(name="spaces/fffiloni/CLIP-Interrogator-2")
 def load_model(version):
     """Announce the requested version on stdout, then fetch that pretrained MusicGen model."""
     print("Loading model", version)
     pretrained = MusicGen.get_pretrained(version)
     return pretrained
+def predict(model, uploaded_image, melody, duration, topk, topp, temperature, cfg_coef):
+    text = img_to_text(uploaded_image, 'best', 4, fn_index=1)[0]
     global MODEL
     topk = int(topk)
     if MODEL is None or MODEL.name != model:
     output = output.detach().cpu().float()[0]
     with NamedTemporaryFile("wb", suffix=".wav", delete=False) as file:
         audio_write(file.name, output, MODEL.sample_rate, strategy="loudness", add_suffix=False)
+        #waveform_video = gr.make_waveform(file.name)
+    return file.name
 with gr.Blocks() as demo:
     with gr.Row():
         with gr.Column():
             with gr.Row():
+                uploaded_image = gr.Image(label="Input Image", interactive=True, source="upload", type="filepath")
                 melody = gr.Audio(source="upload", type="numpy", label="Melody Condition (optional)", interactive=True)
             with gr.Row():
                 submit = gr.Button("Submit")
                 topp = gr.Number(label="Top-p", value=0, interactive=True)
                 temperature = gr.Number(label="Temperature", value=1.0, interactive=True)
                 cfg_coef = gr.Number(label="Classifier Free Guidance", value=3.0, interactive=True)
+        with gr.Column():
+            output = gr.Audio(label="Generated Music")
+    submit.click(predict, inputs=[model, uploaded_image, melody, duration, topk, topp, temperature, cfg_coef], outputs=[output])
     gr.Markdown(
         """
         ### More details
+        The model will generate a short music extract based on the image you provided.
         You can generate up to 30 seconds of audio.
         We present 4 model variations: