Commit 53fe2ef
Parent(s): e1d8b7d
streamlit app
- app.py +141 -0
- requirements.txt +5 -0
app.py
ADDED
@@ -0,0 +1,141 @@
+import streamlit as st
+from transformers import AutoProcessor, BlipForConditionalGeneration, pipeline, AutoModelForCausalLM, AutoTokenizer
+from PIL import Image as PILImage
+import scipy.io.wavfile as wavfile
+import os
+import uuid
+
+# Set page config at the very beginning (it must be the first Streamlit call)
+st.set_page_config(page_title="Image to Music", layout="wide")
+
+# Load all models once and cache them across reruns
+@st.cache_resource
+def load_models():
+    model_id = "Salesforce/blip-image-captioning-large"
+    processor = AutoProcessor.from_pretrained(model_id)
+    blip_model = BlipForConditionalGeneration.from_pretrained(model_id)
+    synthesiser = pipeline("text-to-audio", model="facebook/musicgen-small")
+    phi_model = AutoModelForCausalLM.from_pretrained(
+        "microsoft/Phi-3.5-mini-instruct",
+        device_map="auto",
+        torch_dtype="auto",
+        trust_remote_code=True
+    )
+    phi_tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3.5-mini-instruct")
+    return processor, blip_model, synthesiser, phi_model, phi_tokenizer
+
+processor, blip_model, synthesiser, phi_model, phi_tokenizer = load_models()
+
+# Not cached: with the image argument excluded from hashing (a leading
+# underscore), st.cache_data would use a constant cache key and return the
+# first image's caption for every subsequent upload.
+def image_to_text(image: PILImage.Image):
+    try:
+        # Prepare the image for the model
+        inputs = processor(images=image, return_tensors="pt")
+
+        # Generate caption
+        output = blip_model.generate(**inputs, max_new_tokens=100)
+
+        # Decode the output
+        caption = processor.decode(output[0], skip_special_tokens=True)
+
+        return caption
+    except Exception as e:
+        return f"Error in image_to_text: {str(e)}"
+
+@st.cache_data
+def refine_prompt(caption: str):
+    try:
+        messages = [
+            {"role": "system", "content": "You are a helpful AI assistant for generating music prompts."},
+            {"role": "user", "content": f"Generate a detailed music prompt based on this scene: {caption}. Consider elements like tempo, instrumentation, genre, and emotions."}
+        ]
+        pipe = pipeline(
+            "text-generation",
+            model=phi_model,
+            tokenizer=phi_tokenizer,
+        )
+        generation_args = {
+            "max_new_tokens": 500,
+            "return_full_text": False,
+            "temperature": 0.7,
+            "do_sample": True,
+        }
+        output = pipe(messages, **generation_args)
+        refined_prompt = output[0]['generated_text']
+        return refined_prompt
+    except Exception as e:
+        return f"Error in refine_prompt: {str(e)}"
+
+def text_to_music(response: str):
+    try:
+        music = synthesiser(response, forward_params={"do_sample": True})
+        # Unique file name per run so concurrent sessions don't clobber each other
+        output_path = f"musicgen_out_{uuid.uuid4()}.wav"
+        wavfile.write(output_path, rate=music["sampling_rate"], data=music["audio"])
+        return output_path
+    except Exception as e:
+        return f"Error in text_to_music: {str(e)}"
+
+def cleanup_old_files():
+    # Remove WAV files left over from previous runs
+    for file in os.listdir():
+        if file.startswith("musicgen_out_") and file.endswith(".wav"):
+            os.remove(file)
+
+def main():
+    st.title("Image to Music")
+    st.write("""
+Generate music inspired by an image.
+
+This app turns the inspiration drawn from an image into music by chaining multiple AI models.
+
+## How It Works
+
+1. **Image to Text Description**
+   - Salesforce BLIP converts the image into a caption.
+2. **Text to Refined Music Prompt**
+   - Microsoft Phi-3.5-mini-instruct generates a detailed music prompt from the caption.
+3. **Music Prompt to Music**
+   - Facebook MusicGen generates music from the refined prompt.
+
+## Steps
+
+1. **Image -> [ Salesforce BLIP ] -> Caption**
+2. **Caption -> [ Microsoft Phi-3.5-mini-instruct ] -> Refined Music Prompt**
+3. **Refined Music Prompt -> [ Facebook MusicGen ] -> Music**
+
+Let's turn your visual inspirations into beautiful melodies!
+
+**Please note:** music generation may take several minutes, because several
+large AI models run behind the scenes to create unique music from your image.
+Thank you for your patience!
+""")
+
+    uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])
+
+    if uploaded_file is not None:
+        image = PILImage.open(uploaded_file)
+        st.image(image, caption="Uploaded Image", use_column_width=True)
+
+        if st.button("Generate Music"):
+            with st.spinner("Processing image..."):
+                caption = image_to_text(image)
+                st.text_area("Generated Caption", caption, height=100)
+
+            with st.spinner("Refining music prompt..."):
+                refined_prompt = refine_prompt(caption)
+                st.text_area("Refined Music Prompt", refined_prompt, height=150)
+
+            # Clear WAVs from earlier runs *before* generating, so the file
+            # written below is still on disk for playback
+            cleanup_old_files()
+
+            with st.spinner("Generating music..."):
+                music_file = text_to_music(refined_prompt)
+
+            st.audio(music_file)
+
+if __name__ == "__main__":
+    main()
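The three-stage chain in app.py can also be smoke-tested outside Streamlit before deploying. Below is a minimal sketch of such a standalone script (not part of the commit): the model IDs match the diff above, while `example.jpg` and `smoke_test.wav` are placeholder file names, and the models are heavy enough that a GPU is strongly advisable.

```python
# smoke_test.py - illustrative sketch, not part of this commit
from PIL import Image
from transformers import (AutoModelForCausalLM, AutoProcessor, AutoTokenizer,
                          BlipForConditionalGeneration, pipeline)
import scipy.io.wavfile as wavfile

# Stage 1: image -> caption with BLIP
processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
blip = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
image = Image.open("example.jpg")  # placeholder input image
inputs = processor(images=image, return_tensors="pt")
caption = processor.decode(blip.generate(**inputs, max_new_tokens=100)[0],
                           skip_special_tokens=True)
print("Caption:", caption)

# Stage 2: caption -> refined music prompt with Phi-3.5-mini-instruct
phi = pipeline(
    "text-generation",
    model=AutoModelForCausalLM.from_pretrained(
        "microsoft/Phi-3.5-mini-instruct",
        device_map="auto", torch_dtype="auto", trust_remote_code=True),
    tokenizer=AutoTokenizer.from_pretrained("microsoft/Phi-3.5-mini-instruct"),
)
messages = [{"role": "user",
             "content": f"Generate a detailed music prompt based on this scene: {caption}. "
                        "Consider elements like tempo, instrumentation, genre, and emotions."}]
prompt = phi(messages, max_new_tokens=500, do_sample=True,
             return_full_text=False)[0]["generated_text"]
print("Prompt:", prompt)

# Stage 3: prompt -> audio with MusicGen, written out as a WAV file
synth = pipeline("text-to-audio", model="facebook/musicgen-small")
music = synth(prompt, forward_params={"do_sample": True})
wavfile.write("smoke_test.wav",  # placeholder output name
              rate=music["sampling_rate"], data=music["audio"])
```

Running the stages sequentially like this also makes it easy to see which of the three models dominates latency before wiring them into the app.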
requirements.txt
ADDED
@@ -0,0 +1,5 @@
+scipy
+torch
+torchvision
+transformers
+accelerate
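A note on dependencies: `streamlit` itself is not pinned here, since the Spaces Streamlit SDK provides it at runtime. For a local run you would likely install it alongside the listed packages, e.g. `pip install -r requirements.txt streamlit`, then start the app with `streamlit run app.py`.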