Spaces:

napstablook911
/

ImageToSound

Running

App Files Community

napstablook911 committed on Jun 23, 2025

Commit

8e93ddc

verified ·

1 Parent(s): 463aeec

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +17 -16

src/streamlit_app.py CHANGED Viewed

@@ -35,15 +35,16 @@ def load_blip_model():
         st.error(f"Failed to load BLIP model: {e}")
         return None
-@st.cache_resource(show_spinner="Loading Audio Generation Model (Stable Audio Open 1.0)...")
 def load_stable_audio_model():
     """
-    Loads the Stable Audio Open 1.0 pipeline using Hugging Face diffusers.
     The pipeline is cached to prevent reloading on every Streamlit rerun.
     """
     try:
         audio_pipeline = StableAudioPipeline.from_pretrained(
-            "stabilityai/stable-audio-open-1.0",
             torch_dtype=TORCH_DTYPE
         ).to(DEVICE)
         return audio_pipeline
@@ -90,7 +91,7 @@ uploaded_file = st.file_uploader("Choose a landscape image...", type=["jpg", "jp
 if uploaded_file is not None:
     st.session_state.image_uploaded = True
     image = Image.open(uploaded_file).convert("RGB") # Ensure image is in RGB format
-    st.image(image, caption="Uploaded Image", use_container_width=True)
     # Button to trigger the generation pipeline
     if st.button("Generate Soundscape"):
@@ -108,9 +109,9 @@ if uploaded_file is not None:
                 # Generate caption
                 # The BLIP pipeline expects a PIL Image object directly
                 caption_results = captioner(image)
-                # Extract the generated text from the pipeline's output [7]
                 generated_caption = caption_results[0]['generated_text']
                 # Optional: Enhance prompt for soundscape generation
                 # This helps guide the audio model towards environmental sounds
                 soundscape_prompt = f"A soundscape of {generated_caption}"
@@ -122,18 +123,18 @@ if uploaded_file is not None:
                     st.session_state.image_uploaded = False # Reset to allow re-upload
                     st.stop()
-                # Generate audio with optimized parameters for speed [8, 9]
                 # num_inference_steps: Lower for faster generation, higher for better quality
-                # audio_end_in_s: Shorter audio for faster generation
-                # negative_prompt: Helps improve perceived quality [9]
                 audio_output = audio_pipeline(
                     prompt=soundscape_prompt,
-                    num_inference_steps=50,  # Tuned for faster generation [9]
-                    audio_end_in_s=10.0,     # 10 seconds audio length [8]
-                    negative_prompt="low quality, average quality, distorted" # [9]
                 )
-                # Extract the NumPy array and sample rate [10]
                 audio_numpy_array = audio_output.audios
                 sample_rate = audio_pipeline.config.sampling_rate
@@ -143,15 +144,15 @@ if uploaded_file is not None:
                 st.success("Soundscape generated successfully!")
             except Exception as e:
-                st.error(f"An error occurred during generation: {e}") # [11]
                 st.session_state.audio_bytes = None # Clear any partial audio
                 st.session_state.image_uploaded = False # Reset to allow re-upload
-                st.exception(e) # Display full traceback for debugging [11]
 # Display generated soundscape if available in session state
 if st.session_state.audio_bytes:
     st.subheader("Generated Soundscape:")
-    st.audio(st.session_state.audio_bytes, format='audio/wav') # [6, 12]
     st.markdown("You can download the audio using the controls above.")
 # Reset button for new image upload

         st.error(f"Failed to load BLIP model: {e}")
         return None
+@st.cache_resource(show_spinner="Loading Audio Generation Model (Stable Audio Open Small)...")
 def load_stable_audio_model():
     """
+    Loads the Stable Audio Open Small pipeline using Hugging Face diffusers.
     The pipeline is cached to prevent reloading on every Streamlit rerun.
     """
     try:
+        # Changed model to stabilityai/stable-audio-open-small
         audio_pipeline = StableAudioPipeline.from_pretrained(
+            "stabilityai/stable-audio-open-small", # <--- MODEL CHANGED HERE
             torch_dtype=TORCH_DTYPE
         ).to(DEVICE)
         return audio_pipeline
 if uploaded_file is not None:
     st.session_state.image_uploaded = True
     image = Image.open(uploaded_file).convert("RGB") # Ensure image is in RGB format
+    st.image(image, caption="Uploaded Image", use_container_width=True) # use_container_width replaces the deprecated use_column_width
     # Button to trigger the generation pipeline
     if st.button("Generate Soundscape"):
                 # Generate caption
                 # The BLIP pipeline expects a PIL Image object directly
                 caption_results = captioner(image)
+                # Extract the generated text from the pipeline's output
                 generated_caption = caption_results[0]['generated_text']
                 # Optional: Enhance prompt for soundscape generation
                 # This helps guide the audio model towards environmental sounds
                 soundscape_prompt = f"A soundscape of {generated_caption}"
                     st.session_state.image_uploaded = False # Reset to allow re-upload
                     st.stop()
+                # Generate audio with parameters chosen for speed
                 # num_inference_steps: Lower for faster generation, higher for better quality
+                # audio_end_in_s: Shorter audio for faster generation (max 11s for stable-audio-open-small)
+                # negative_prompt: Helps improve perceived quality
                 audio_output = audio_pipeline(
                     prompt=soundscape_prompt,
+                    num_inference_steps=50,  # Tuned for faster generation [8]
+                    audio_end_in_s=10.0,     # 10 seconds audio length (within 11s limit for small model) [10, 11, 12]
+                    negative_prompt="low quality, average quality, distorted" # [8]
                 )
+                # Extract the NumPy array and sample rate
                 audio_numpy_array = audio_output.audios
                 sample_rate = audio_pipeline.config.sampling_rate
                 st.success("Soundscape generated successfully!")
             except Exception as e:
+                st.error(f"An error occurred during generation: {e}") #
                 st.session_state.audio_bytes = None # Clear any partial audio
                 st.session_state.image_uploaded = False # Reset to allow re-upload
+                st.exception(e) # Display full traceback for debugging
 # Display generated soundscape if available in session state
 if st.session_state.audio_bytes:
     st.subheader("Generated Soundscape:")
+    st.audio(st.session_state.audio_bytes, format='audio/wav') #
     st.markdown("You can download the audio using the controls above.")
 # Reset button for new image upload