Spaces:

fffiloni
/

Music-To-Image

Paused

App Files Files Community

fffiloni committed on Aug 7, 2023

Commit

4bc845a

1 Parent(s): d18abca

Added optional lyrics step

Browse files

Files changed (1) hide show

app.py +54 -33

app.py CHANGED Viewed

@@ -7,6 +7,8 @@ lpmc_client = gr.load("seungheondoh/LP-Music-Caps-demo", src="spaces")
 from gradio_client import Client
 client = Client("https://fffiloni-test-llama-api.hf.space/", hf_token=hf_token)
 from compel import Compel, ReturnedEmbeddingsType
 from diffusers import DiffusionPipeline
@@ -60,42 +62,58 @@ def solo_xd(prompt):
     images = pipe(prompt=prompt).images[0]
     return images
-def infer(audio_file):
     truncated_audio = cut_audio(audio_file, "trunc_audio.mp3")
     cap_result = lpmc_client(
     				truncated_audio,	# str (filepath or URL to file) in 'audio_path' Audio component
     				api_name="predict"
     )
-    print(cap_result)
-    #summarize_q = f"""
-    #I'll give you a list of music descriptions. Create a summary reflecting the musical ambiance.
-    #Do not processs each segment, but provide a summary for the whole instead.
-    #Here's the list:
-    #{cap_result}
-    #"""
-    #summary_result = client.predict(
-    #				summarize_q,	# str in 'Message' Textbox component
-    #				api_name="/chat_1"
-    #)
-    #print(f"SUMMARY: {summary_result}")
-    llama_q = f"""
-    I'll give you a music description, from i want you to provide an illustrative image description that would fit well with the music.
-    Do not processs each segment or song, but provide a summary for the whole instead.
-    Answer with only one image description. Never do lists. Maximum 77 tokens.
-    Here's the music description :
-    {cap_result}
-    """
     result = client.predict(
     				llama_q,	# str in 'Message' Textbox component
     				api_name="/predict"
@@ -105,8 +123,10 @@ def infer(audio_file):
     print(f"Llama2 result: {result}")
-    # ———
     prompt = result
     conditioning, pooled = compel(prompt)
     images = pipe(prompt_embeds=conditioning, pooled_prompt_embeds=pooled).images[0]
@@ -142,21 +162,22 @@ with gr.Blocks(css=css) as demo:
                 </p>
             </div>""")
         audio_input = gr.Audio(label="Music input", type="filepath", source="upload")
         infer_btn = gr.Button("Generate Image from Music")
         #lpmc_cap = gr.Textbox(label="Lp Music Caps caption")
         llama_trans_cap = gr.Textbox(label="Llama translation", visible=False)
         img_result = gr.Image(label="Image Result")
-        tryagain_btn = gr.Button("Try again ?", visible=False)
-        gr.Examples(examples=[["./examples/electronic.mp3"],["./examples/folk.wav"], ["./examples/orchestra.wav"]],
                     fn=infer,
-                    inputs=[audio_input],
                     outputs=[img_result, llama_trans_cap, tryagain_btn],
                     cache_examples=True
                    )
     #infer_btn.click(fn=infer, inputs=[audio_input], outputs=[lpmc_cap, llama_trans_cap, img_result])
-    infer_btn.click(fn=infer, inputs=[audio_input], outputs=[img_result, llama_trans_cap, tryagain_btn])
     tryagain_btn.click(fn=solo_xd, inputs=[llama_trans_cap], outputs=[img_result])
 demo.queue(max_size=20).launch()

 from gradio_client import Client
 client = Client("https://fffiloni-test-llama-api.hf.space/", hf_token=hf_token)
+lyrics_client = Client("https://fffiloni-music-to-lyrics.hf.space/")
 from compel import Compel, ReturnedEmbeddingsType
 from diffusers import DiffusionPipeline
     images = pipe(prompt=prompt).images[0]
     return images
+def infer(audio_file, has_lyrics):
+    print("NEW INFERENCE ...")
     truncated_audio = cut_audio(audio_file, "trunc_audio.mp3")
+    print("Calling LP Music Caps...")
     cap_result = lpmc_client(
     				truncated_audio,	# str (filepath or URL to file) in 'audio_path' Audio component
     				api_name="predict"
     )
+    print(f"MUSIC DESC: {cap_result}")
+    if has_lyrics == "Yes" :
+        print("""———
+        Getting Lyrics ...
+        """)
+        lyrics_result = lyrics_client.predict(
+        				audio_file,	# str (filepath or URL to file) in 'Song input' Audio component
+        				fn_index=0
+        )
+        print(f"LYRICS: {lyrics_result}")
+        llama_q = f"""
+        I'll give you a music description + the lyrics of the song.
+        Give me an image description that would fit well with the music description, reflecting the lyrics too.
+        Be creative, do not do list, just an image description as required. Try to think about human characters first.
+        Your image description must fit well for a stable diffusion prompt.
+        Here's the music description :
+        « {cap_result} »
+        And here are the lyrics :
+        « {lyrics_result} »
+        """
+    elif has_lyrics == "No" :
+        llama_q = f"""
+        I'll give you a music description.
+        Give me an image description that would fit well with the music description.
+        Be creative, do not do list, just an image description as required. Try to think about human characters first.
+        Your image description must fit well for a stable diffusion prompt.
+        Here's the music description :
+        « {cap_result} »
+        """
+    print("""———
+    Calling Llama2 ...
+    """)
     result = client.predict(
     				llama_q,	# str in 'Message' Textbox component
     				api_name="/predict"
     print(f"Llama2 result: {result}")
+    # ———
+    print("""———
+    Calling SD-XL ...
+    """)
     prompt = result
     conditioning, pooled = compel(prompt)
     images = pipe(prompt_embeds=conditioning, pooled_prompt_embeds=pooled).images[0]
                 </p>
             </div>""")
         audio_input = gr.Audio(label="Music input", type="filepath", source="upload")
+        has_lyrics = gr.Radio(label="Does your audio has lyrics ?", choices=["Yes", "No"], value="No", info="If yes, the image should reflect the lyrics, but be aware that because we add a step (getting lyrics), inference will take more time.")
         infer_btn = gr.Button("Generate Image from Music")
         #lpmc_cap = gr.Textbox(label="Lp Music Caps caption")
         llama_trans_cap = gr.Textbox(label="Llama translation", visible=False)
         img_result = gr.Image(label="Image Result")
+        tryagain_btn = gr.Button("Try another image ?", visible=False)
+        gr.Examples(examples=[["./examples/electronic.mp3", "No"],["./examples/folk.wav", "No"], ["./examples/orchestra.wav", "No"]],
                     fn=infer,
+                    inputs=[audio_input, has_lyrics],
                     outputs=[img_result, llama_trans_cap, tryagain_btn],
                     cache_examples=True
                    )
     #infer_btn.click(fn=infer, inputs=[audio_input], outputs=[lpmc_cap, llama_trans_cap, img_result])
+    infer_btn.click(fn=infer, inputs=[audio_input, has_lyrics], outputs=[img_result, llama_trans_cap, tryagain_btn])
     tryagain_btn.click(fn=solo_xd, inputs=[llama_trans_cap], outputs=[img_result])
 demo.queue(max_size=20).launch()