Spaces:
Running
Running
Update src/streamlit_app.py
Browse files- src/streamlit_app.py +17 -16
src/streamlit_app.py
CHANGED
|
@@ -35,15 +35,16 @@ def load_blip_model():
|
|
| 35 |
st.error(f"Failed to load BLIP model: {e}")
|
| 36 |
return None
|
| 37 |
|
| 38 |
-
@st.cache_resource(show_spinner="Loading Audio Generation Model (Stable Audio Open
|
| 39 |
def load_stable_audio_model():
|
| 40 |
"""
|
| 41 |
-
Loads the Stable Audio Open
|
| 42 |
The pipeline is cached to prevent reloading on every Streamlit rerun.
|
| 43 |
"""
|
| 44 |
try:
|
|
|
|
| 45 |
audio_pipeline = StableAudioPipeline.from_pretrained(
|
| 46 |
-
"stabilityai/stable-audio-open-
|
| 47 |
torch_dtype=TORCH_DTYPE
|
| 48 |
).to(DEVICE)
|
| 49 |
return audio_pipeline
|
|
@@ -90,7 +91,7 @@ uploaded_file = st.file_uploader("Choose a landscape image...", type=["jpg", "jp
|
|
| 90 |
if uploaded_file is not None:
|
| 91 |
st.session_state.image_uploaded = True
|
| 92 |
image = Image.open(uploaded_file).convert("RGB") # Ensure image is in RGB format
|
| 93 |
-
st.image(image, caption="Uploaded Image", use_container_width=True)
|
| 94 |
|
| 95 |
# Button to trigger the generation pipeline
|
| 96 |
if st.button("Generate Soundscape"):
|
|
@@ -108,9 +109,9 @@ if uploaded_file is not None:
|
|
| 108 |
# Generate caption
|
| 109 |
# The BLIP pipeline expects a PIL Image object directly
|
| 110 |
caption_results = captioner(image)
|
| 111 |
-
# Extract the generated text from the pipeline's output
|
| 112 |
generated_caption = caption_results[0]['generated_text']
|
| 113 |
-
|
| 114 |
# Optional: Enhance prompt for soundscape generation
|
| 115 |
# This helps guide the audio model towards environmental sounds
|
| 116 |
soundscape_prompt = f"A soundscape of {generated_caption}"
|
|
@@ -122,18 +123,18 @@ if uploaded_file is not None:
|
|
| 122 |
st.session_state.image_uploaded = False # Reset to allow re-upload
|
| 123 |
st.stop()
|
| 124 |
|
| 125 |
-
# Generate audio with optimized parameters for speed [
|
| 126 |
# num_inference_steps: Lower for faster generation, higher for better quality
|
| 127 |
-
# audio_end_in_s: Shorter audio for faster generation
|
| 128 |
-
# negative_prompt: Helps improve perceived quality [
|
| 129 |
audio_output = audio_pipeline(
|
| 130 |
prompt=soundscape_prompt,
|
| 131 |
-
num_inference_steps=50, # Tuned for faster generation [
|
| 132 |
-
audio_end_in_s=10.0, # 10 seconds audio length [
|
| 133 |
-
negative_prompt="low quality, average quality, distorted" # [
|
| 134 |
)
|
| 135 |
|
| 136 |
-
# Extract the NumPy array and sample rate [
|
| 137 |
audio_numpy_array = audio_output.audios
|
| 138 |
sample_rate = audio_pipeline.config.sampling_rate
|
| 139 |
|
|
@@ -143,15 +144,15 @@ if uploaded_file is not None:
|
|
| 143 |
st.success("Soundscape generated successfully!")
|
| 144 |
|
| 145 |
except Exception as e:
|
| 146 |
-
st.error(f"An error occurred during generation: {e}") #
|
| 147 |
st.session_state.audio_bytes = None # Clear any partial audio
|
| 148 |
st.session_state.image_uploaded = False # Reset to allow re-upload
|
| 149 |
-
st.exception(e) # Display full traceback for debugging
|
| 150 |
|
| 151 |
# Display generated soundscape if available in session state
|
| 152 |
if st.session_state.audio_bytes:
|
| 153 |
st.subheader("Generated Soundscape:")
|
| 154 |
-
st.audio(st.session_state.audio_bytes, format='audio/wav') #
|
| 155 |
st.markdown("You can download the audio using the controls above.")
|
| 156 |
|
| 157 |
# Reset button for new image upload
|
|
|
|
| 35 |
st.error(f"Failed to load BLIP model: {e}")
|
| 36 |
return None
|
| 37 |
|
| 38 |
+
@st.cache_resource(show_spinner="Loading Audio Generation Model (Stable Audio Open Small)...")
|
| 39 |
def load_stable_audio_model():
|
| 40 |
"""
|
| 41 |
+
Loads the Stable Audio Open Small pipeline using Hugging Face diffusers.
|
| 42 |
The pipeline is cached to prevent reloading on every Streamlit rerun.
|
| 43 |
"""
|
| 44 |
try:
|
| 45 |
+
# Changed model to stabilityai/stable-audio-open-small
|
| 46 |
audio_pipeline = StableAudioPipeline.from_pretrained(
|
| 47 |
+
"stabilityai/stable-audio-open-small", # <--- MODEL CHANGED HERE
|
| 48 |
torch_dtype=TORCH_DTYPE
|
| 49 |
).to(DEVICE)
|
| 50 |
return audio_pipeline
|
|
|
|
| 91 |
if uploaded_file is not None:
|
| 92 |
st.session_state.image_uploaded = True
|
| 93 |
image = Image.open(uploaded_file).convert("RGB") # Ensure image is in RGB format
|
| 94 |
+
st.image(image, caption="Uploaded Image", use_container_width=True) # Updated deprecated parameter [6]
|
| 95 |
|
| 96 |
# Button to trigger the generation pipeline
|
| 97 |
if st.button("Generate Soundscape"):
|
|
|
|
| 109 |
# Generate caption
|
| 110 |
# The BLIP pipeline expects a PIL Image object directly
|
| 111 |
caption_results = captioner(image)
|
| 112 |
+
# Extract the generated text from the pipeline's output
|
| 113 |
generated_caption = caption_results[0]['generated_text']
|
| 114 |
+
|
| 115 |
# Optional: Enhance prompt for soundscape generation
|
| 116 |
# This helps guide the audio model towards environmental sounds
|
| 117 |
soundscape_prompt = f"A soundscape of {generated_caption}"
|
|
|
|
| 123 |
st.session_state.image_uploaded = False # Reset to allow re-upload
|
| 124 |
st.stop()
|
| 125 |
|
| 126 |
+
# Generate audio with optimized parameters for speed [7, 8]
|
| 127 |
# num_inference_steps: Lower for faster generation, higher for better quality
|
| 128 |
+
# audio_end_in_s: Shorter audio for faster generation (max 11s for stable-audio-open-small) [10, 11, 12]
|
| 129 |
+
# negative_prompt: Helps improve perceived quality [8]
|
| 130 |
audio_output = audio_pipeline(
|
| 131 |
prompt=soundscape_prompt,
|
| 132 |
+
num_inference_steps=50, # Tuned for faster generation [8]
|
| 133 |
+
audio_end_in_s=10.0, # 10 seconds audio length (within 11s limit for small model) [10, 11, 12]
|
| 134 |
+
negative_prompt="low quality, average quality, distorted" # [8]
|
| 135 |
)
|
| 136 |
|
| 137 |
+
# Extract the NumPy array and sample rate [9]
|
| 138 |
audio_numpy_array = audio_output.audios
|
| 139 |
sample_rate = audio_pipeline.config.sampling_rate
|
| 140 |
|
|
|
|
| 144 |
st.success("Soundscape generated successfully!")
|
| 145 |
|
| 146 |
except Exception as e:
|
| 147 |
+
st.error(f"An error occurred during generation: {e}") #
|
| 148 |
st.session_state.audio_bytes = None # Clear any partial audio
|
| 149 |
st.session_state.image_uploaded = False # Reset to allow re-upload
|
| 150 |
+
st.exception(e) # Display full traceback for debugging
|
| 151 |
|
| 152 |
# Display generated soundscape if available in session state
|
| 153 |
if st.session_state.audio_bytes:
|
| 154 |
st.subheader("Generated Soundscape:")
|
| 155 |
+
st.audio(st.session_state.audio_bytes, format='audio/wav') #
|
| 156 |
st.markdown("You can download the audio using the controls above.")
|
| 157 |
|
| 158 |
# Reset button for new image upload
|