Spaces:
Runtime error
Runtime error
| import streamlit as st | |
| from transformers import AutoProcessor, BlipForConditionalGeneration, pipeline, AutoModelForCausalLM, AutoTokenizer | |
| from PIL import Image as PILImage | |
| import scipy.io.wavfile as wavfile | |
| import os | |
| import uuid | |
# Set page config at the very beginning — Streamlit requires set_page_config
# to be the first st.* command executed in the script.
st.set_page_config(page_title="Image to Music", layout="wide")
# Load models outside of functions so every part of the app shares one copy.
# Streamlit re-runs this entire script on every UI interaction, so without
# caching all four models would be reloaded from disk on each rerun;
# @st.cache_resource makes the load happen exactly once per server process.
@st.cache_resource
def load_models():
    """Load every model the app needs and return them as a tuple.

    Returns:
        tuple: (processor, blip_model, synthesiser, phi_model, phi_tokenizer)
            - processor / blip_model: Salesforce BLIP image-captioning pair.
            - synthesiser: facebook/musicgen-small text-to-audio pipeline.
            - phi_model / phi_tokenizer: microsoft/Phi-3.5-mini-instruct,
              used to refine the caption into a music prompt.
    """
    model_id = "Salesforce/blip-image-captioning-large"
    processor = AutoProcessor.from_pretrained(model_id)
    blip_model = BlipForConditionalGeneration.from_pretrained(model_id)
    synthesiser = pipeline("text-to-audio", model="facebook/musicgen-small")
    phi_model = AutoModelForCausalLM.from_pretrained(
        "microsoft/Phi-3.5-mini-instruct",
        device_map="auto",
        torch_dtype="auto",
        trust_remote_code=True,
    )
    phi_tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3.5-mini-instruct")
    return processor, blip_model, synthesiser, phi_model, phi_tokenizer


processor, blip_model, synthesiser, phi_model, phi_tokenizer = load_models()
def image_to_text(_image: PILImage.Image) -> str:
    """Caption an image with the BLIP captioning model.

    Args:
        _image: PIL image to describe.

    Returns:
        The generated caption, or an "Error in image_to_text: ..." string on
        failure (callers inspect the string rather than catching exceptions).
    """
    try:
        # Prepare the image for the model (pixel tensors).
        inputs = processor(images=_image, return_tensors="pt")
        # Generate caption token ids; cap length to avoid runaway generation.
        output = blip_model.generate(**inputs, max_new_tokens=100)
        # Decode token ids back to text, dropping special tokens (BOS/EOS).
        caption = processor.decode(output[0], skip_special_tokens=True)
        return caption
    except Exception as e:
        # Surface the failure as a string so the Streamlit UI can display it.
        return f"Error in image_to_text: {str(e)}"
@st.cache_resource
def _get_phi_pipeline():
    """Build the Phi text-generation pipeline once and reuse it across calls.

    The original code constructed a fresh pipeline inside refine_prompt on
    every invocation, which is expensive; caching it here avoids that.
    """
    return pipeline(
        "text-generation",
        model=phi_model,
        tokenizer=phi_tokenizer,
    )


def refine_prompt(caption: str) -> str:
    """Turn an image caption into a detailed music-generation prompt.

    Args:
        caption: Scene description produced by the BLIP captioner.

    Returns:
        The refined music prompt, or an "Error in refine_prompt: ..." string
        on failure (callers inspect the string rather than catching).
    """
    try:
        messages = [
            {"role": "system", "content": "You are a helpful AI assistant for generating music prompts."},
            {"role": "user", "content": f"Generate a detailed music prompt based on this scene: {caption}. Consider elements like tempo, instrumentation, genre, and emotions."}
        ]
        pipe = _get_phi_pipeline()
        generation_args = {
            "max_new_tokens": 500,
            "return_full_text": False,  # only return the newly generated text
            "temperature": 0.7,
            "do_sample": True,
        }
        output = pipe(messages, **generation_args)
        refined_prompt = output[0]['generated_text']
        return refined_prompt
    except Exception as e:
        return f"Error in refine_prompt: {str(e)}"
def text_to_music(response: str):
    """Generate a WAV file from a text prompt using MusicGen.

    Args:
        response: Text prompt describing the desired music.

    Returns:
        Path of the freshly written .wav file (unique per call), or an
        "Error in text_to_music: ..." string on failure.
    """
    try:
        generated = synthesiser(response, forward_params={"do_sample": True})
        # Unique filename so concurrent sessions never clobber each other.
        out_name = f"musicgen_out_{uuid.uuid4()}.wav"
        wavfile.write(out_name, rate=generated["sampling_rate"], data=generated["audio"])
        return out_name
    except Exception as e:
        return f"Error in text_to_music: {str(e)}"
def cleanup_old_files():
    """Delete generated audio files (musicgen_out_*.wav) from the working dir."""
    stale = [
        name for name in os.listdir()
        if name.startswith("musicgen_out_") and name.endswith(".wav")
    ]
    for name in stale:
        os.remove(name)
def main():
    """Render the Streamlit UI and run image -> caption -> prompt -> music."""
    st.title("Image to Music")
    st.write("""
    Generate music inspired by an image.
    This project enables the creation of music based on the inspiration drawn from an image, leveraging multiple AI technologies.
    ## How It Works
    1. **Image to Text Description**
    - Use Salesforce BLIP to convert the image into a caption.
    2. **Text to Refined Music Prompt**
    - Use Microsoft Phi-3.5-mini to generate a detailed music prompt based on the caption.
    3. **Music Prompt to Music**
    - Use Facebook MusicGen to generate music from the refined prompt.
    ## Steps
    1. **Image -> [ Salesforce BLIP ] -> Caption**
    2. **Caption -> [ Microsoft Phi-3.5-mini ] -> Refined Music Prompt**
    3. **Refined Music Prompt -> [ Facebook MusicGen ] -> Music**
    Let's turn your visual inspirations into beautiful melodies!
    **Please Note:**
    The music generation process may take several minutes to complete.
    This is due to the complex AI models working behind the scenes to create unique music based on your image.
    Thank you for your patience! """)
    uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])
    if uploaded_file is not None:
        image = PILImage.open(uploaded_file)
        st.image(image, caption="Uploaded Image", use_column_width=True)
        if st.button("Generate Music"):
            # Delete leftovers from previous runs *before* generating. The
            # original cleaned up after st.audio(), and its glob matches the
            # file just created — deleting the track out from under the player.
            cleanup_old_files()
            with st.spinner("Processing image..."):
                caption = image_to_text(image)
            st.text_area("Generated Caption", caption, height=100)
            with st.spinner("Refining music prompt..."):
                refined_prompt = refine_prompt(caption)
            st.text_area("Refined Music Prompt", refined_prompt, height=150)
            with st.spinner("Generating music..."):
                music_file = text_to_music(refined_prompt)
            # text_to_music reports failure as an "Error in ..." string;
            # show it instead of handing a non-path to st.audio (which would
            # raise on a missing file).
            if music_file.startswith("Error in text_to_music"):
                st.error(music_file)
            else:
                st.audio(music_file)


if __name__ == "__main__":
    main()