garyuzair committed on
Commit
f60e24b
Β·
verified Β·
1 Parent(s): 367b87b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +331 -437
app.py CHANGED
@@ -1,487 +1,381 @@
1
  import streamlit as st
2
- import torch
3
- import numpy as np
4
- from transformers import AutoProcessor, BlipForConditionalGeneration, MusicgenForConditionalGeneration
5
- import imageio
6
  from PIL import Image
7
- import soundfile as sf
 
 
8
  import os
9
  import tempfile
10
- import subprocess
11
- from pydub import AudioSegment, effects
12
- import moviepy.editor as mpy
13
- import time
14
- from concurrent.futures import ThreadPoolExecutor
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
- # Set page configuration
17
- st.set_page_config(page_title="🎬 AI SoundFX Studio", layout="wide", initial_sidebar_state="expanded")
 
 
 
18
 
19
- # Enhanced CSS for better UI
 
 
 
20
  st.markdown("""
21
- <style>
22
- .reportview-container {
23
- background: #1a1a1a;
24
- color: white;
25
- }
26
- .sidebar .sidebar-content {
27
- width: 350px;
28
- }
29
- .stProgress > div > div {
30
- background-color: #4CAF50;
31
- }
32
- .stButton>button {
33
- background-color: #4CAF50;
34
- color: white;
35
- }
36
- </style>
37
- """, unsafe_allow_html=True)
38
 
39
- # Optional scene detection
40
- scene_detect_available = True
41
- try:
42
- from scenedetect import VideoManager, SceneManager
43
- from scenedetect.detectors import ContentDetector
44
- except ImportError:
45
- scene_detect_available = False
 
46
 
47
- # Model Management
48
- @st.cache_resource
49
- def load_blip_model():
50
- """Load BLIP model with optimized settings"""
51
  try:
52
- processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
53
- model = BlipForConditionalGeneration.from_pretrained(
54
- "Salesforce/blip-image-captioning-base",
55
- torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
56
- low_mem=True
57
- ).to("cuda" if torch.cuda.is_available() else "cpu")
 
58
  return processor, model
59
  except Exception as e:
60
- st.error(f"BLIP model load error: {str(e)}")
 
61
  return None, None
62
 
63
- @st.cache_resource
64
- def load_musicgen_model(model_name="facebook/musicgen-medium"):
65
- """Load MusicGen model with optimized settings"""
66
  try:
67
- model = MusicgenForConditionalGeneration.from_pretrained(
68
- model_name,
69
- torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
70
- low_cpu_mem_usage=True
71
- ).to("cuda" if torch.cuda.is_available() else "cpu")
72
- processor = AutoProcessor.from_pretrained(model_name)
 
 
 
73
  return processor, model
74
  except Exception as e:
75
- st.error(f"MusicGen model load error: {str(e)}")
 
76
  return None, None
77
 
78
- def extract_frames(video_path, num_frames, method="uniform", segment_start=0, segment_end=None):
79
- """Optimized frame extraction with smart sampling"""
 
80
  try:
81
- video = imageio.get_reader(video_path, "ffmpeg")
82
- meta = video.get_meta_data()
83
- fps = meta['fps']
84
- total_frames = int(meta['duration'] * fps)
85
-
86
- # Optimize frame count based on duration
87
- actual_num_frames = min(num_frames, int(total_frames / 5) or 5)
88
-
89
- if segment_end is None:
90
- segment_end = total_frames / fps
91
-
92
- start_frame = int(segment_start * fps)
93
- end_frame = int(segment_end * fps)
94
- total_segment_frames = end_frame - start_frame
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
 
96
- # Smart frame selection
97
- if method == "scene" and scene_detect_available:
98
  try:
99
- video_manager = VideoManager([video_path])
100
- scene_manager = SceneManager()
101
- scene_manager.add_detector(ContentDetector(threshold=30))
102
- video_manager.set_downscale_factor(2)
103
- video_manager.start()
104
- scene_manager.detect_scenes(frame_source=video_manager)
105
- scene_list = scene_manager.get_scene_list()
106
- segment_scenes = [scene for scene in scene_list
107
- if segment_start <= scene[0].get_seconds() < segment_end]
108
-
109
- frames = []
110
- for scene in segment_scenes[:actual_num_frames]:
111
- frame = video_manager.get_frame(scene[0].get_frames())
112
- if frame is not None:
113
- frames.append(Image.fromarray(frame))
114
- video_manager.release()
115
-
116
- # Fill remaining frames if needed
117
- if len(frames) < actual_num_frames and total_segment_frames > 0:
118
- remaining = actual_num_frames - len(frames)
119
- step = max(1, total_segment_frames // (remaining + 1))
120
- for i in range(1, remaining + 1):
121
- frame_idx = start_frame + i * step
122
- if frame_idx < end_frame:
123
- frames.append(Image.fromarray(video.get_data(frame_idx)))
124
- return frames[:actual_num_frames]
125
- except Exception as e:
126
- st.warning(f"Scene detection failed: {e}. Using uniform extraction.")
127
-
128
- # Uniform extraction with numpy optimization
129
- frame_indices = np.linspace(start_frame, end_frame, actual_num_frames, endpoint=False).astype(int)
130
- frames = []
131
- for idx in frame_indices:
132
- if idx < total_frames:
133
- frame = video.get_data(idx)
134
- frames.append(Image.fromarray(frame))
135
- return frames[:actual_num_frames]
136
-
137
  except Exception as e:
138
- st.error(f"Frame extraction error: {str(e)}")
 
139
  return []
 
 
 
140
 
141
- def generate_captions_parallel(frames, processor, model):
142
- """Parallel caption generation with error handling"""
143
- def process_frame(frame):
144
- try:
145
- inputs = processor(images=frame, return_tensors="pt").to(model.device)
146
- out = model.generate(**inputs, max_length=25, num_beams=3)
147
- return processor.decode(out[0], skip_special_tokens=True)
148
- except Exception as e:
149
- st.warning(f"Captioning error: {str(e)}")
150
- return ""
151
-
152
- with ThreadPoolExecutor() as executor:
153
- return list(executor.map(process_frame, frames))
154
-
155
- def enhance_prompt(descriptions, mood="default"):
156
- """Advanced prompt engineering with pattern recognition"""
157
- if not descriptions:
158
- return f"{mood} ambient sound with subtle effects"
159
 
160
- combined = ". ".join(set(desc.lower() for desc in descriptions))
 
 
161
 
162
- # Enhanced pattern matching dictionary
163
- pattern_map = {
164
- ('walk', 'run'): "crisp footsteps on varied surfaces with immersive movement sounds",
165
- ('car', 'drive', 'vehicle'): "roaring engine, tire screeches, and dynamic road noise with spatial positioning",
166
- ('talk', 'person', 'people', 'conversation'): "lively voices, crowd murmur, and spatial chatter with natural reverb",
167
- ('wind', 'tree', 'forest', 'nature'): "rustling leaves, gentle wind gusts, and natural ambiance with atmospheric depth",
168
- ('crash', 'fall', 'impact'): "intense crash impact, debris scattering, and sharp transient effects with dynamic range",
169
- ('water', 'ocean', 'sea'): "realistic water movement, wave dynamics, and aquatic ambiance",
170
- ('fire', 'explosion'): "realistic fire crackling, explosions, and heat distortion audio",
171
- ('space', 'sci-fi'): "futuristic ambient textures, synth effects, and spatial audio design"
172
- }
173
 
174
- # Advanced pattern matching logic
175
- matched_patterns = []
176
- for keywords, effect in pattern_map.items():
177
- if any(keyword in combined for keyword in keywords):
178
- matched_patterns.append(effect)
179
 
180
- if matched_patterns:
181
- return f"{mood} {combined}, {'; '.join(matched_patterns)}, cinematic sound design with spatial audio"
182
- return f"{mood} {combined}, rich ambient soundscape with professional effects, 4K audio resolution"
 
 
 
 
 
183
 
184
- def generate_audio(prompt, processor, model, duration, sample_rate=44100):
185
- """Optimized audio generation with smart parameters"""
186
  try:
187
- inputs = processor(text=[prompt], padding=True, return_tensors="pt").to(model.device)
188
-
189
- audio_values = model.generate(
190
- **inputs,
191
- max_new_tokens=int(256 * (duration / 8)),
192
- num_beams=3,
193
- early_stopping=True,
194
- do_sample=True,
195
- temperature=0.85,
196
- guidance_scale=5.0,
197
- top_k=80,
198
- top_p=0.85
199
- )
200
 
201
- audio_array = audio_values[0].cpu().numpy()
202
- audio_array = np.tanh(audio_array) # Faster than clip + max normalization
203
- return audio_array
204
-
205
- except Exception as e:
206
- st.error(f"Audio generation error: {str(e)}")
207
- return np.zeros(int(duration * sample_rate))
208
 
209
- def apply_audio_effects(audio_path, settings):
210
- """Enhanced audio effects processing"""
211
- try:
212
- sound = AudioSegment.from_wav(audio_path)
213
-
214
- # Reverb
215
- if settings['reverb_ms'] > 0:
216
- sound = sound.overlay(sound - 15, position=settings['reverb_ms'])
217
-
218
- # Echo
219
- if settings['echo_ms'] > 0:
220
- echo = sound - 15
221
- sound = sound.overlay(echo, position=settings['echo_ms'])
222
 
223
- # Filters
224
- if settings['highpass'] > 0:
225
- sound = sound.high_pass_filter(settings['highpass'])
226
- if settings['lowpass'] < 20000:
227
- sound = sound.low_pass_filter(settings['lowpass'])
228
-
229
- # Dynamic processing
230
- if settings['compress']:
231
- sound = effects.compress_dynamic_range(sound)
232
-
233
- # Stereo imaging
234
- sound = sound.pan(settings['stereo_pan'])
235
- sound = effects.normalize(sound)
236
-
237
- processed_path = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
238
- sound.export(processed_path, format="wav")
239
- return processed_path
240
 
 
 
 
 
 
 
241
  except Exception as e:
242
- st.error(f"Audio effects error: {str(e)}")
243
- return audio_path
 
 
 
 
 
 
 
 
 
 
244
 
245
- def sync_audio_video(video_path, audio_path, output_path, mix_original=False,
246
- original_volume=0.5, generated_volume=0.5):
247
- """Enhanced video/audio synchronization"""
248
  try:
249
- if mix_original:
250
- video_clip = mpy.VideoFileClip(video_path)
251
- if video_clip.audio:
252
- original_audio_seg = AudioSegment.from_file(video_path, format="mp4")
253
- generated_audio_seg = AudioSegment.from_wav(audio_path)
254
-
255
- # Volume adjustment
256
- original_audio_seg = original_audio_seg - (20 * (1 - original_volume))
257
- generated_audio_seg = generated_audio_seg - (20 * (1 - generated_volume))
258
-
259
- mixed_audio = original_audio_seg.overlay(generated_audio_seg)
260
- mixed_path = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
261
- mixed_audio.export(mixed_path, format="wav")
262
- audio_path = mixed_path
263
- else:
264
- st.warning("No original audio found. Using generated audio only.")
265
-
266
- # FFmpeg command with hardware acceleration
267
- cmd = [
268
- 'ffmpeg',
269
- '-i', video_path,
270
- '-i', audio_path,
271
- '-c:v', 'copy',
272
- '-c:a', 'aac',
273
- '-map', '0:v:0',
274
- '-map', '1:a:0',
275
- '-shortest',
276
- '-y',
277
- '-preset', 'ultrafast',
278
- '-vsync', '2',
279
- output_path
280
- ]
281
- subprocess.run(cmd, check=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
282
 
283
- except Exception as e:
284
- st.error(f"Sync error: {str(e)}")
285
 
286
- def unload_model(model):
287
- """Memory management utility"""
288
- if torch.cuda.is_available():
289
- model.to("cpu")
290
- torch.cuda.empty_cache()
291
 
292
- def main():
293
- st.title("🎬 AI SoundFX Studio")
294
- st.markdown("### Create immersive soundscapes from video with optimized AI processing")
295
-
296
- # Initialize session state
297
- if 'processing_time' not in st.session_state:
298
- st.session_state.processing_time = 0
299
-
300
- # User Guide
301
- with st.expander("πŸ“– User Guide & Tips"):
302
- st.markdown("""
303
- **How to Use:**
304
- 1. Upload a video file (MP4, MOV, AVI)
305
- 2. Choose between Automatic (AI-generated) or Manual sound description
306
- 3. Adjust settings in the sidebar:
307
- - Model size (small/medium/large)
308
- - Frame analysis parameters
309
- - Audio effects customization
310
- 4. Click "Generate Sound Effects"
311
- 5. Download the enhanced video
312
-
313
- **Optimization Tips:**
314
- - Use "small" model for quick previews
315
- - Enable "Scene Detection" for better context
316
- - Adjust audio effects for custom sound design
317
- - Use "Mix with Original Audio" for balanced results
318
- """)
319
-
320
- # Sidebar Settings
321
- with st.sidebar:
322
- st.header("βš™οΈ Processing Settings")
323
-
324
- # Processing Mode
325
- prompt_mode = st.selectbox("Prompt Generation", ["Automatic", "Manual"])
326
-
327
- # Model Selection
328
- model_size = st.selectbox("Model Size", ["small", "medium", "large"], index=1,
329
- help="Larger models = better quality but slower processing")
330
-
331
- # Audio Mixing
332
- mix_original = st.checkbox("Mix with Original Audio", value=False)
333
- col1, col2 = st.columns(2)
334
- with col1:
335
- original_vol = st.slider("Original Volume", 0.0, 1.0, 0.5) if mix_original else 0.5
336
- with col2:
337
- generated_vol = st.slider("Generated Volume", 0.0, 1.0, 0.5) if mix_original else 0.5
338
-
339
- # Frame Analysis
340
- st.subheader("πŸŽ₯ Frame Analysis")
341
- num_frames = st.slider("Frames to Analyze", 3, 10, 5,
342
- help="More frames improve accuracy but increase processing time")
343
- frame_method = st.selectbox("Frame Extraction",
344
- ["Uniform", "Scene"] if scene_detect_available else ["Uniform"],
345
- help="Scene detection provides better contextual analysis")
346
 
347
- # Audio Effects
348
- st.subheader("πŸŽ›οΈ Audio Effects")
349
- effects_settings = {
350
- 'reverb_ms': st.slider("Reverb (ms)", 0, 500, 50),
351
- 'echo_ms': st.slider("Echo (ms)", 0, 1000, 100),
352
- 'highpass': st.slider("High-pass Filter (Hz)", 0, 3000, 50),
353
- 'lowpass': st.slider("Low-pass Filter (Hz)", 5000, 20000, 12000),
354
- 'compress': st.checkbox("Dynamic Compression", value=True),
355
- 'stereo_pan': st.slider("Stereo Pan (-1 left, 1 right)", -1.0, 1.0, 0.0)
356
- }
357
 
358
- # Performance Presets
359
- st.subheader("⚑ Performance")
360
- quality_preset = st.selectbox("Quality Preset", ["Fast", "Balanced", "High Quality"])
361
- presets = {
362
- "Fast": {"num_frames": 3, "model_size": "small"},
363
- "Balanced": {"num_frames": 5, "model_size": "medium"},
364
- "High Quality": {"num_frames": 8, "model_size": "large"}
365
- }
366
- if quality_preset != "Balanced":
367
- num_frames = presets[quality_preset]["num_frames"]
368
- model_size = presets[quality_preset]["model_size"]
369
 
370
- st.info("Processing time estimate: 2-5 minutes (varies by settings)")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
371
 
372
- # Main Content Area
373
- uploaded_file = st.file_uploader("Upload Video File", type=["mp4", "mov", "avi"])
 
374
 
375
- if uploaded_file:
376
- # Create temporary files
377
- with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tmp:
378
- tmp.write(uploaded_file.read())
379
- video_path = tmp.name
380
-
381
- # Video Preview
382
- st.video(video_path)
383
-
384
- # Get video duration
385
- video_clip = mpy.VideoFileClip(video_path)
386
- duration = video_clip.duration
387
- video_clip.close()
388
-
389
- # Prompt Generation
390
- if prompt_mode == "Automatic":
391
- with st.spinner("Analyzing video content..."):
392
- blip_processor, blip_model = load_blip_model()
393
- if not blip_processor or not blip_model:
394
- st.error("Failed to load BLIP model")
395
- return
396
-
397
- frames = extract_frames(video_path, num_frames, frame_method)
398
- if not frames:
399
- st.error("No frames extracted. Try a different video or settings.")
400
- return
401
-
402
- # Display analyzed frames
403
- cols = st.columns(len(frames))
404
- for col, frame in zip(cols, frames):
405
- with col:
406
- st.image(frame, use_column_width=True)
407
-
408
- descriptions = generate_captions_parallel(frames, blip_processor, blip_model)
409
- unload_model(blip_model)
410
-
411
- # Mood selection
412
- mood = st.selectbox("Sound Mood", [
413
- "default", "dramatic", "ambient", "action", "sci-fi", "horror", "comedy"
414
- ], help="Select the overall atmosphere for the sound design")
415
-
416
- # Enhanced prompt with AI suggestions
417
- text_prompt = enhance_prompt(descriptions, mood)
418
- st.subheader("Generated Prompt")
419
- text_prompt = st.text_area("Edit Prompt", text_prompt, height=150)
420
- st.markdown("*Suggested modifications: Add specific instrument types, intensity levels, or emotional cues*")
421
- else:
422
- st.subheader("Enter Sound Description")
423
- text_prompt = st.text_area("Describe the desired sound effects",
424
- "E.g., 'Cinematic trailer music with thunderous impacts and soaring strings'",
425
- height=150)
426
-
427
- # Generation Button
428
- if st.button("πŸ”Š Generate Sound Effects", key="generate", use_container_width=True):
429
- start_time = time.time()
430
-
431
- # Progress tracking
432
- progress_bar = st.progress(0)
433
- status_text = st.empty()
434
- status_text.text("Loading models...")
435
-
436
- # Load MusicGen model
437
- musicgen_processor, musicgen_model = load_musicgen_model(f"facebook/musicgen-{model_size}")
438
- if not musicgen_processor or not musicgen_model:
439
- st.error("Failed to load MusicGen model")
440
- return
441
- progress_bar.progress(20)
442
-
443
- # Audio Generation
444
- status_text.text("Generating audio...")
445
- audio_array = generate_audio(text_prompt, musicgen_processor, musicgen_model, duration)
446
- unload_model(musicgen_model)
447
-
448
- temp_audio = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
449
- sf.write(temp_audio, audio_array, 44100)
450
- progress_bar.progress(50)
451
-
452
- # Apply Effects
453
- status_text.text("Applying audio effects...")
454
- processed_audio = apply_audio_effects(temp_audio, effects_settings)
455
- progress_bar.progress(75)
456
-
457
- # Sync with Video
458
- status_text.text("Syncing with video...")
459
- output_video = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
460
- sync_audio_video(video_path, processed_audio, output_video, mix_original, original_vol, generated_vol)
461
- progress_bar.progress(100)
462
-
463
- # Finalize
464
- status_text.text("Processing complete!")
465
- st.success("βœ… Sound effects applied successfully!")
466
-
467
- # Display result
468
- st.video(output_video)
469
 
470
- # Download button
471
- with open(output_video, "rb") as f:
472
- st.download_button("πŸ“₯ Download Enhanced Video",
473
- f, "enhanced_video.mp4", "video/mp4",
474
- use_container_width=True)
 
 
 
 
475
 
476
- # Timing info
477
- processing_time = time.time() - start_time
478
- st.session_state.processing_time = processing_time
479
- st.info(f"⏱️ Processing time: {processing_time:.1f} seconds")
 
 
 
 
 
480
 
481
- # Cleanup
482
- for file in [video_path, temp_audio, processed_audio, output_video]:
483
- if os.path.exists(file):
484
- os.remove(file)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
485
 
486
- if __name__ == "__main__":
487
- main()
 
1
  import streamlit as st
 
 
 
 
2
  from PIL import Image
3
+ import numpy as np
4
+ import torch
5
+ import gc
6
  import os
7
  import tempfile
8
+ import math
9
+ import imageio
10
+ import traceback
11
+ import scipy.io.wavfile # For saving WAV files
12
+
13
# --- Attempt to import moviepy for video processing ---
# Guarded import: moviepy pulls in ffmpeg at import time and can fail for many
# reasons on a fresh host, so any failure downgrades to an in-app warning
# instead of crashing the whole Streamlit app.
try:
    import moviepy.editor as mpy
    MOVIEPY_AVAILABLE = True
except Exception as e:  # Catch any exception during moviepy import
    MOVIEPY_AVAILABLE = False
    st.warning(
        f"MoviePy library could not be loaded (Error: {e}). "
        "Video syncing features will be disabled. "
        "If running locally, ensure MoviePy and its dependency ffmpeg are correctly installed."
    )
    print(f"MoviePy load error: {e}")
25
+
26
+
27
# --- Model Configuration ---
# Hugging Face model ids; both are the smallest variants to fit a CPU-only tier.
IMAGE_CAPTION_MODEL = "Salesforce/blip-image-captioning-base"
AUDIO_GEN_MODEL = "facebook/musicgen-small"

# --- Constants ---
DEFAULT_NUM_FRAMES = 2          # frames sampled for captioning by default
DEFAULT_AUDIO_DURATION_S = 7    # Slightly increased default
MAX_FRAMES_TO_SHOW_UI = 2       # Reducing for smaller UI footprint
DEVICE = torch.device("cpu")    # Explicitly use CPU
36
 
37
+ # --- Page Setup ---
38
+ st.set_page_config(page_title="AI Video Sound Designer (HF Space)", layout="wide", page_icon="🎬")
39
+
40
+ st.title("🎬 AI Video Sound Designer")
41
  st.markdown("""
42
+ Upload a short video (MP4, MOV, AVI). The tool will:
43
+ 1. Extract frames.
44
+ 2. Analyze frames with **BLIP** to generate sound ideas.
45
+ 3. Synthesize audio with **MusicGen** based on these ideas.
46
+ 4. Optionally, combine the new audio with your video.
47
+ ---
48
+ **Important:** This app runs on CPU. **Audio generation can be very slow (several minutes for a few seconds of audio).** Please be patient!
49
+ """)
 
 
 
 
 
 
 
 
 
50
 
51
+ # --- Utility Functions ---
52
+ def clear_memory(model_obj=None, processor_obj=None):
53
+ if model_obj:
54
+ del model_obj
55
+ if processor_obj:
56
+ del processor_obj
57
+ gc.collect()
58
+ print("Memory cleared.")
59
 
60
@st.cache_resource(show_spinner="Loading Image Analysis Model...")
def load_image_caption_model_and_processor():
    """Load and cache the BLIP captioning processor and model on DEVICE (CPU).

    Returns:
        (processor, model) on success, or (None, None) after reporting the
        failure through ``st.error``.
    """
    try:
        from transformers import BlipProcessor, BlipForConditionalGeneration

        print(f"Loading Image Captioning Model: {IMAGE_CAPTION_MODEL} to {DEVICE}")
        blip_processor = BlipProcessor.from_pretrained(IMAGE_CAPTION_MODEL)
        # Plain from_pretrained + .to(DEVICE); no low-memory loading flags.
        blip_model = BlipForConditionalGeneration.from_pretrained(IMAGE_CAPTION_MODEL)
        blip_model = blip_model.to(DEVICE)
        blip_model.eval()  # inference only
        st.toast("Image Analysis model (BLIP) loaded!", icon="🖼️")
        return blip_processor, blip_model
    except Exception as e:
        st.error(f"Error loading BLIP model ({IMAGE_CAPTION_MODEL}): {e}")
        st.error(traceback.format_exc())
        return None, None
75
 
76
@st.cache_resource(show_spinner="Loading Audio Generation Model (can be slow)...")
def load_audio_gen_model_and_processor():
    """Load and cache the MusicGen processor and model on DEVICE (CPU).

    Returns:
        (processor, model) on success, or (None, None) after reporting the
        failure through ``st.error``.
    """
    try:
        from transformers import AutoProcessor, MusicgenForConditionalGeneration
        print(f"Loading Audio Generation Model: {AUDIO_GEN_MODEL} to {DEVICE}")
        processor = AutoProcessor.from_pretrained(AUDIO_GEN_MODEL)
        # Standard loading for MusicGen on CPU.
        # `low_cpu_mem_usage` could be used here if accelerate is properly configured,
        # but for simplicity and robustness on free tier, direct .to(DEVICE) is safer.
        model = MusicgenForConditionalGeneration.from_pretrained(AUDIO_GEN_MODEL).to(DEVICE)
        model.eval()  # Set to evaluation mode
        st.toast("Audio Generation model (MusicGen) loaded! (CPU generation will be slow)", icon="🎶")
        return processor, model
    except Exception as e:
        st.error(f"Error loading MusicGen model ({AUDIO_GEN_MODEL}): {e}")
        st.error(traceback.format_exc())
        return None, None
93
 
94
def extract_frames_from_video(video_path, num_frames_to_extract):
    """Sample up to ``num_frames_to_extract`` evenly spaced RGB PIL frames.

    Args:
        video_path: Path to a video file readable by the imageio ffmpeg plugin.
        num_frames_to_extract: Desired number of frames (capped by video length).

    Returns:
        list[PIL.Image.Image]; [] on any failure (reported via st.error).

    Fixes vs. previous version:
      * ``imageio.core.fetching.NeedDownloadError`` was removed from modern
        imageio; referencing it inside an ``except`` clause raises
        AttributeError *while handling* the original error. We now catch
        ``OSError`` (the usual ffmpeg-backend failure) directly.
      * The reader is closed exactly once, in ``finally`` — the redundant
        in-``try`` close calls are gone.
    """
    frames = []
    reader = None
    try:
        reader = imageio.get_reader(video_path, "ffmpeg")
        total_frames_in_video = 0
        try:  # Prefer an exact frame count when the backend can provide one.
            total_frames_in_video = reader.count_frames()
        except Exception:
            # Fall back to duration * fps from metadata.
            meta_data = reader.get_meta_data()
            duration = meta_data.get('duration')
            fps = meta_data.get('fps', 25)  # Default FPS if not found
            if duration and fps:
                total_frames_in_video = int(duration * fps)

        if not total_frames_in_video or total_frames_in_video < 1:
            # Fallback: stream frames sequentially when length is unknown.
            print("Video length unknown or zero, attempting to read initial frames.")
            temp_frames = []
            for i, frame_data in enumerate(reader):
                temp_frames.append(Image.fromarray(frame_data).convert("RGB"))
                if len(temp_frames) >= num_frames_to_extract * 2:  # Read a bit more
                    break
            if not temp_frames:
                st.error("Could not extract any frames. Video might be empty or corrupted.")
                return []
            # Select evenly spaced frames from what was read.
            indices = np.linspace(0, len(temp_frames) - 1, num_frames_to_extract, dtype=int, endpoint=True)
            return [temp_frames[i] for i in indices]

        # Frame count is known: sample evenly across the whole video.
        num_to_sample = min(num_frames_to_extract, total_frames_in_video)
        indices = np.linspace(0, total_frames_in_video - 1, num_to_sample, dtype=int, endpoint=True)

        for i in indices:
            try:
                frame_data = reader.get_data(i)
                frames.append(Image.fromarray(frame_data).convert("RGB"))
            except Exception as frame_e:
                # A single bad frame should not abort the whole extraction.
                st.warning(f"Skipping problematic frame at index {i}: {frame_e}")
        return frames
    except OSError as e_ffmpeg:
        # Typically a missing or incompatible ffmpeg backend.
        st.error(f"FFmpeg error during frame extraction: {e_ffmpeg}. Ensure ffmpeg is available.")
        return []
    except Exception as e:
        st.error(f"Could not extract frames: {e}")
        st.error(traceback.format_exc())
        return []
    finally:
        # Single authoritative close for every exit path.
        if reader:
            reader.close()
148
 
149
def generate_sound_prompt_from_frames(frames, caption_proc, caption_mod):
    """Caption each frame with BLIP and fold the captions into one MusicGen prompt.

    Args:
        frames: list of PIL images (may be empty).
        caption_proc: BLIP processor.
        caption_mod: BLIP captioning model (already on DEVICE).

    Returns:
        A text prompt string; generic fallback prompts are returned when there
        are no frames or no usable captions.
    """
    if not frames: return "ambient background noise"

    descriptions = []
    # BLIP doesn't need a complex instruction, it captions directly.
    # We ask for "sound-producing elements" in post-processing of the descriptions.

    progress_bar = st.progress(0.0, text="Analyzing frames for sound ideas...")
    for i, frame in enumerate(frames):
        try:
            inputs = caption_proc(images=frame, return_tensors="pt").to(DEVICE)
            generated_ids = caption_mod.generate(**inputs, max_new_tokens=40)  # Shorter captions
            description = caption_proc.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
            if description: descriptions.append(description)
            progress_bar.progress((i + 1) / len(frames), text=f"Frame {i+1}/{len(frames)} analyzed.")
        except Exception as e:
            # Best-effort: a frame that fails to caption is simply skipped.
            st.warning(f"Could not get description for a frame: {e}")
    progress_bar.empty()

    if not descriptions: return "general ambiance, subtle environmental sounds"

    # dict.fromkeys removes duplicates while preserving first-seen order.
    unique_descs = list(dict.fromkeys(descriptions))
    combined_prompt = ". ".join(unique_descs)
    # Refine for MusicGen
    final_prompt = f"Soundscape for a scene with: {combined_prompt}. Emphasize distinct sounds and overall mood."
    # Limit prompt length for MusicGen
    if len(final_prompt) > 300:  # Arbitrary limit for conciseness
        final_prompt = final_prompt[:300] + "..."
    return final_prompt
178
 
179
def generate_audio_from_prompt(prompt, duration_s, audio_proc, audio_mod, guidance, temp):
    """Synthesize roughly ``duration_s`` seconds of audio from ``prompt``.

    Args:
        prompt: Text description fed to MusicGen.
        duration_s: Target duration in seconds (token budget is capped at 1500).
        audio_proc / audio_mod: MusicGen processor and model (on DEVICE).
        guidance: Classifier-free guidance scale.
        temp: Sampling temperature.

    Returns:
        (audio_array, sampling_rate) with the waveform peak-normalized to 0.9,
        or (None, None) on failure (after reporting via st.error).

    Fix vs. previous version: the Encodec config attribute for the token rate
    is ``frame_rate`` (tokens of audio per second); the previously used
    ``token_per_second`` does not exist and raised AttributeError.
    """
    try:
        inputs = audio_proc(text=[prompt], return_tensors="pt", padding=True).to(DEVICE)

        # getattr with a 50/s fallback (the standard Encodec rate for the
        # 32 kHz MusicGen checkpoints) guards against config-layout changes.
        tokens_per_second = getattr(audio_mod.config.audio_encoder, "frame_rate", 50)
        max_new_tokens = min(int(duration_s * tokens_per_second), 1500)  # Cap for stability

        with st.spinner(f"Synthesizing {duration_s}s audio (CPU: This is the SLOW part!)... Please wait patiently."):
            audio_values = audio_mod.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=True,  # Sampling is important for diversity
                guidance_scale=guidance,
                temperature=temp,
            )

        # audio_values is (batch, channels, samples); take the first mono track.
        audio_array = audio_values[0, 0].cpu().numpy()
        sampling_rate = audio_mod.config.audio_encoder.sampling_rate

        # Peak-normalize with headroom; emit silence if output is near-silent.
        peak = np.abs(audio_array).max()
        if peak > 1e-5:  # Avoid division by zero or near-zero
            audio_array = (audio_array / peak) * 0.9
        else:
            audio_array = np.zeros_like(audio_array)
        return audio_array, sampling_rate
    except Exception as e:
        st.error(f"Error generating audio: {e}")
        st.error(traceback.format_exc())
        return None, None
208
+
209
def combine_audio_video(video_path, audio_arr, sr, mix_orig):
    """Mux the generated audio (optionally mixed with the original) into the video.

    Args:
        video_path: Path to the source video file.
        audio_arr: 1-D float waveform (numpy array) of the generated audio.
        sr: Sampling rate of ``audio_arr``.
        mix_orig: If True and the video has audio, overlay original (at 0.6x)
            with generated (at 0.9x) instead of replacing it.

    Returns:
        Path to the written MP4 on success; None on failure or when MoviePy
        is unavailable (errors are reported via st.error).
    """
    if not MOVIEPY_AVAILABLE:
        st.error("MoviePy is unavailable. Cannot combine audio and video.")
        return None

    out_vid_path = None
    tmp_audio_path = None
    # Pre-declare every clip so the finally block can close whichever were created.
    vid_clip = gen_audio_clip = final_aud = comp_clip = None
    try:
        # Write the waveform to a temp WAV that MoviePy can open as a clip.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_audio_f:
            scipy.io.wavfile.write(tmp_audio_f.name, sr, audio_arr.astype(np.float32))  # Ensure float32 for some moviepy versions
            tmp_audio_path = tmp_audio_f.name

        vid_clip = mpy.VideoFileClip(video_path)
        gen_audio_clip = mpy.AudioFileClip(tmp_audio_path)

        # Loop the generated audio if shorter than the video, then trim exactly.
        target_duration = vid_clip.duration
        if gen_audio_clip.duration < target_duration:
            gen_audio_clip = gen_audio_clip.fx(mpy.afx.audio_loop, duration=target_duration)
        gen_audio_clip = gen_audio_clip.subclip(0, target_duration)

        if mix_orig and vid_clip.audio:
            orig_audio = vid_clip.audio.volumex(0.6)  # Lower original audio
            gen_audio_clip = gen_audio_clip.volumex(0.9)  # Slightly boost generated
            final_aud = mpy.CompositeAudioClip([orig_audio, gen_audio_clip]).set_duration(target_duration)
        else:
            final_aud = gen_audio_clip.set_duration(target_duration)

        comp_clip = vid_clip.set_audio(final_aud)

        # Only the temp file *name* is needed; write_videofile reopens it.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tmp_out_vid_f:
            out_vid_path = tmp_out_vid_f.name

        comp_clip.write_videofile(
            out_vid_path, codec="libx264", audio_codec="aac",
            threads=max(1, os.cpu_count() // 2), logger=None  # Be less verbose
        )
        return out_vid_path
    except Exception as e:
        st.error(f"Error combining audio/video: {e}")
        st.error(traceback.format_exc())
        return None
    finally:
        # Close all clips
        if vid_clip: vid_clip.close()
        if gen_audio_clip: gen_audio_clip.close()
        # final_aud is usually a derivative, not needing separate close if others are.
        if comp_clip: comp_clip.close()
        if tmp_audio_path and os.path.exists(tmp_audio_path): os.remove(tmp_audio_path)
259
+
260
# --- Sidebar for Settings ---
# Widget values below feed the main pipeline: frame count for BLIP analysis,
# audio duration and sampling parameters for MusicGen, and the mix toggle.
with st.sidebar:
    st.header("⚙️ Settings")
    num_frames_analysis = st.slider("Frames to Analyze", 1, 4, DEFAULT_NUM_FRAMES, 1,
                                    help="Fewer frames = faster analysis.")
    audio_duration = st.slider("Target Audio Duration (s)", 3, 15, DEFAULT_AUDIO_DURATION_S, 1,  # Max 15s for CPU
                               help="Shorter = MUCH faster on CPU. MusicGen is slow.")

    st.subheader("MusicGen Parameters")
    guidance = st.slider("Guidance Scale", 1.0, 7.0, 3.0, 0.5)
    temperature = st.slider("Temperature", 0.5, 1.5, 1.0, 0.1)

    # Mixing requires MoviePy; default to False so the name is always bound.
    mix_audio = False
    if MOVIEPY_AVAILABLE:
        st.subheader("Video Output")
        mix_audio = st.checkbox("Mix with original video audio", value=False)
276
+
277
+ # --- Main Application ---
278
+ uploaded_file = st.file_uploader("πŸ“€ Upload Video (MP4, MOV, AVI - short clips best):", type=["mp4", "mov", "avi"])
279
+
280
+ if 'generated_audio_path_sess' not in st.session_state:
281
+ st.session_state.generated_audio_path_sess = None
282
+ if 'output_video_path_sess' not in st.session_state:
283
+ st.session_state.output_video_path_sess = None
284
+
285
if uploaded_file is not None:
    # Preview the raw upload before any processing.
    st.video(uploaded_file)

    if st.button("✨ Generate Sound Design!", type="primary", use_container_width=True):
        # --- Clear previous results from session state and disk ---
        # Remove stale temp files from an earlier run; failures here are
        # non-fatal (best-effort cleanup), so they are only printed.
        for key in ['generated_audio_path_sess', 'output_video_path_sess']:
            if st.session_state.get(key) and os.path.exists(st.session_state[key]):
                try: os.remove(st.session_state[key])
                except Exception as e_rem: print(f"Error removing old temp file: {e_rem}")
            st.session_state[key] = None
        clear_memory() # General memory clear

        temp_video_path = None
        try:
            # Persist the upload to disk: downstream frame extraction and
            # moviepy need a real file path, not the uploader's BytesIO.
            with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(uploaded_file.name)[1]) as tmp_v:
                tmp_v.write(uploaded_file.read()) # Use read() for BytesIO from uploader
                temp_video_path = tmp_v.name

            # === Stage 1: Frame Extraction ===
            st.markdown("--- \n### 1. Extracting Frames")
            frames = extract_frames_from_video(temp_video_path, num_frames_analysis)
            # st.stop() halts the script run entirely on failure.
            if not frames: st.error("Frame extraction failed. Cannot continue."); st.stop()
            st.success(f"Extracted {len(frames)} frames.")
            if frames:
                # Show at most MAX_FRAMES_TO_SHOW_UI thumbnails, one per column.
                cols = st.columns(min(len(frames), MAX_FRAMES_TO_SHOW_UI))
                for i, frame_img in enumerate(frames[:len(cols)]):
                    cols[i].image(frame_img, caption=f"Frame {i+1}", use_column_width=True)

            # === Stage 2: Image Captioning (Sound Prompt Generation) ===
            st.markdown("--- \n### 2. Analyzing Frames for Sound Ideas (BLIP)")
            cap_proc, cap_model = load_image_caption_model_and_processor()
            sound_prompt = "ambient environmental sounds" # Default
            if cap_proc and cap_model:
                sound_prompt = generate_sound_prompt_from_frames(frames, cap_proc, cap_model)
                clear_memory(cap_model, cap_proc) # Unload BLIP
            else: st.error("BLIP model failed to load. Using default sound prompt.")
            st.info(f"✍️ **Sound Prompt for MusicGen:** {sound_prompt}")

            # === Stage 3: Audio Generation ===
            st.markdown("--- \n### 3. Synthesizing Audio (MusicGen)")
            st.warning("🎧 This step is very slow on CPU. Your patience is appreciated!")
            aud_proc, aud_model = load_audio_gen_model_and_processor()
            gen_aud_arr, s_r = None, None
            if aud_proc and aud_model:
                gen_aud_arr, s_r = generate_audio_from_prompt(sound_prompt, audio_duration, aud_proc, aud_model, guidance, temperature)
                clear_memory(aud_model, aud_proc) # Unload MusicGen
            else: st.error("MusicGen model failed to load. Cannot generate audio.")

            if gen_aud_arr is not None and s_r is not None:
                st.success("Audio successfully generated!")
                st.audio(gen_aud_arr, sample_rate=s_r)
                # Write the waveform to a temp .wav (float32) and remember the
                # path in session state so a later rerun can re-offer it.
                with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_a_out:
                    scipy.io.wavfile.write(tmp_a_out.name, s_r, gen_aud_arr.astype(np.float32))
                    st.session_state.generated_audio_path_sess = tmp_a_out.name
                with open(st.session_state.generated_audio_path_sess, "rb") as f_aud:
                    st.download_button("πŸ“₯ Download Audio Only (.wav)", f_aud, "generated_sound.wav", "audio/wav")

                # === Stage 4: (Optional) Video and Audio Syncing ===
                if MOVIEPY_AVAILABLE:
                    st.markdown("--- \n### 4. Combining Audio with Video")
                    with st.spinner("Processing video with new audio... (also can be slow)"):
                        out_vid_p = combine_audio_video(temp_video_path, gen_aud_arr, s_r, mix_audio)
                    if out_vid_p and os.path.exists(out_vid_p):
                        st.success("Video processing complete!")
                        st.video(out_vid_p)
                        st.session_state.output_video_path_sess = out_vid_p
                        with open(out_vid_p, "rb") as f_vid:
                            st.download_button("🎬 Download Video with New Sound (.mp4)", f_vid, "video_with_sound.mp4", "video/mp4")
                    # NOTE(review): MOVIEPY_AVAILABLE is already True inside this
                    # branch, so this elif is equivalent to a plain else here.
                    elif MOVIEPY_AVAILABLE: st.error("Failed to combine audio and video.")
            else: st.error("Audio generation failed. Video syncing skipped.")
        except Exception as e_main:
            st.error(f"An unexpected error occurred in main processing: {e_main}")
            st.error(traceback.format_exc())
        finally:
            # Always delete the temp copy of the upload and free model memory.
            if temp_video_path and os.path.exists(temp_video_path): os.remove(temp_video_path)
            clear_memory() # Final general clear

    # Show download buttons for files from a previous successful run in the same session
    elif st.session_state.generated_audio_path_sess and os.path.exists(st.session_state.generated_audio_path_sess):
        st.markdown("---")
        st.write("Previously generated audio available:")
        st.audio(st.session_state.generated_audio_path_sess)
        with open(st.session_state.generated_audio_path_sess, "rb") as f_aud_prev:
            st.download_button("πŸ“₯ Download Previous Audio (.wav)", f_aud_prev, "generated_sound_prev.wav", "audio/wav", key="prev_aud_dl")

        if st.session_state.output_video_path_sess and os.path.exists(st.session_state.output_video_path_sess) and MOVIEPY_AVAILABLE:
            st.markdown("---") # This might appear even if audio only was generated, so careful with flow
            st.write("Previously generated video available:")
            st.video(st.session_state.output_video_path_sess)
            with open(st.session_state.output_video_path_sess, "rb") as f_vid_prev:
                st.download_button("🎬 Download Previous Video (.mp4)", f_vid_prev, "video_with_sound_prev.mp4", "video/mp4", key="prev_vid_dl")

else:
    # No upload yet: prompt the user.
    st.info("☝️ Upload a video to begin.")
379
 
380
+ st.markdown("---")
381
+ st.caption("Built for Hugging Face Spaces (CPU). Patience is key for generation times!")