Spaces:

garyuzair
/

Video-To-SoundFX

Running

App Files Files Community

garyuzair committed on May 7, 2025

Commit

4a69d6b

verified ·

1 Parent(s): 5e03617

Update app.py

Browse files

Files changed (1) hide show

app.py +401 -209

app.py CHANGED Viewed

@@ -1,8 +1,8 @@
 import streamlit as st
 import torch
 from transformers import AutoProcessor, BlipForConditionalGeneration, MusicgenForConditionalGeneration
 import imageio
-import numpy as np
 from PIL import Image
 import soundfile as sf
 import os
@@ -10,6 +10,31 @@ import tempfile
 import subprocess
 from pydub import AudioSegment, effects
 import moviepy.editor as mpy
 # Optional scene detection
 scene_detect_available = True
@@ -19,273 +44,440 @@ try:
 except ImportError:
     scene_detect_available = False
-# Set page configuration
-st.set_page_config(page_title="Video Sound Effect Generator", layout="centered")
-# Load BLIP model for captioning
 @st.cache_resource
 def load_blip_model():
-    processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
-    model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
-    if torch.cuda.is_available():
-        model = model.half().to("cuda")
-    return processor, model
-# Load MusicGen model
 @st.cache_resource
 def load_musicgen_model(model_name="facebook/musicgen-medium"):
-    processor = AutoProcessor.from_pretrained(model_name)
-    model = MusicgenForConditionalGeneration.from_pretrained(model_name)
-    if torch.cuda.is_available():
-        model = model.half().to("cuda")
-    return processor, model
-# Extract frames efficiently
 def extract_frames(video_path, num_frames, method="uniform", segment_start=0, segment_end=None):
-    video = imageio.get_reader(video_path, "ffmpeg")
-    meta = video.get_meta_data()
-    fps = meta['fps']
-    total_frames = int(meta['duration'] * fps)
-    if segment_end is None:
-        segment_end = total_frames / fps
-    start_frame = int(segment_start * fps)
-    end_frame = int(segment_end * fps)
-    total_segment_frames = end_frame - start_frame
-    if method == "scene" and scene_detect_available:
         try:
-            video_manager = VideoManager([video_path])
-            scene_manager = SceneManager()
-            scene_manager.add_detector(ContentDetector(threshold=30))
-            video_manager.set_downscale_factor(2)  # Optimize for speed
-            video_manager.start()
-            scene_manager.detect_scenes(frame_source=video_manager)
-            scene_list = scene_manager.get_scene_list()
-            segment_scenes = [scene for scene in scene_list if scene[0].get_seconds() >= segment_start and scene[0].get_seconds() < segment_end]
-            frames = []
-            for scene in segment_scenes[:num_frames]:
-                frame = video_manager.get_frame(scene[0].get_frames())
-                if frame is not None:
-                    frames.append(Image.fromarray(frame))
-            video_manager.release()
-            if len(frames) < num_frames and total_segment_frames > 0:
-                remaining = num_frames - len(frames)
-                step = total_segment_frames // (remaining + 1)
-                for i in range(1, remaining + 1):
-                    frame_idx = start_frame + i * step
-                    if frame_idx < end_frame:
-                        frames.append(Image.fromarray(video.get_data(frame_idx)))
-            return frames[:num_frames]
         except Exception as e:
-            st.warning(f"Scene detection failed: {e}. Using uniform extraction.")
-    # Uniform extraction
-    step = max(1, total_segment_frames // num_frames)
-    frame_indices = [start_frame + i * step for i in range(num_frames) if start_frame + i * step < end_frame]
-    frames = [Image.fromarray(video.get_data(idx)) for idx in frame_indices]
-    return frames[:num_frames]
-# Generate captions
-def generate_captions(frames, processor, model):
-    descriptions = []
-    for frame in frames:
-        inputs = processor(images=frame, return_tensors="pt")
-        if torch.cuda.is_available():
-            inputs = {k: v.to("cuda") for k, v in inputs.items()}
-        out = model.generate(**inputs, max_length=30)
-        description = processor.decode(out[0], skip_special_tokens=True)
-        descriptions.append(description)
-    return descriptions
-# Enhance prompts
 def enhance_prompt(descriptions, mood="default"):
     if not descriptions:
         return f"{mood} ambient sound with subtle effects"
-    combined = ". ".join(descriptions).lower()
-    base_prompts = {
-        "walk|run": "crisp footsteps on varied surfaces, immersive movement sounds",
-        "car|drive": "roaring engine, tire screeches, dynamic road noise",
-        "talk|person": "lively voices, crowd murmur, spatial chatter",
-        "wind|tree|forest": "rustling leaves, gentle wind gusts, natural ambiance",
-        "crash|fall": "intense crash impact, debris scattering, sharp effects"
     }
-    for pattern, effect in base_prompts.items():
-        if any(word in combined for word in pattern.split("|")):
-            return f"{mood} {combined}, {effect}"
-    return f"{mood} {combined}, rich ambient soundscape with engaging effects"
-# Generate audio
 def generate_audio(prompt, processor, model, duration, sample_rate=44100):
-    inputs = processor(text=[prompt], padding=True, return_tensors="pt")
-    if torch.cuda.is_available():
-        inputs = {k: v.to("cuda") for k, v in inputs.items()}
-    audio_values = model.generate(
-        **inputs,
-        max_new_tokens=int(512 * (duration / 8)),  # Optimized for speed
-        do_sample=True,
-        guidance_scale=7.0,
-        top_k=120,
-        top_p=0.9
-    )
-    audio_array = audio_values[0].cpu().numpy()
-    audio_array = audio_array / np.max(np.abs(audio_array)) * 0.95
-    audio_array = np.clip(audio_array, -1.0, 1.0)
-    return audio_array
-# Apply audio effects
 def apply_audio_effects(audio_path, settings):
-    sound = AudioSegment.from_wav(audio_path)
-    if settings['reverb_ms'] > 0:
-        sound = sound + AudioSegment.silent(duration=settings['reverb_ms']) - 10
-    if settings['echo_ms'] > 0:
-        echo = sound - 15
-        sound = sound.overlay(echo, position=settings['echo_ms'])
-    if settings['highpass'] > 0:
-        sound = sound.high_pass_filter(settings['highpass'])
-    if settings['lowpass'] < 20000:
-        sound = sound.low_pass_filter(settings['lowpass'])
-    if settings['compress']:
-        sound = effects.compress_dynamic_range(sound)
-    sound = sound.pan(settings['stereo_pan'])
-    sound = effects.normalize(sound)
-    processed_path = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
-    sound.export(processed_path, format="wav")
-    return processed_path
-# Sync audio with video
-def sync_audio_video(video_path, audio_path, output_path, mix_original=False, original_volume=0.5, generated_volume=0.5):
-    if mix_original:
-        video_clip = mpy.VideoFileClip(video_path)
-        if video_clip.audio:
-            original_audio_seg = AudioSegment.from_file(video_path, format="mp4")
-            generated_audio_seg = AudioSegment.from_wav(audio_path)
-            original_audio_seg = original_audio_seg - (20 * (1 - original_volume))
-            generated_audio_seg = generated_audio_seg - (20 * (1 - generated_volume))
-            mixed_audio = original_audio_seg.overlay(generated_audio_seg)
-            mixed_path = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
-            mixed_audio.export(mixed_path, format="wav")
-            audio_path = mixed_path
-        else:
-            st.warning("No original audio found. Using generated audio only.")
-    cmd = [
-        'ffmpeg',
-        '-i', video_path,
-        '-i', audio_path,
-        '-c:v', 'copy',
-        '-c:a', 'aac',
-        '-map', '0:v:0',
-        '-map', '1:a:0',
-        '-shortest',
-        '-y',
-        output_path
-    ]
-    subprocess.run(cmd, check=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
-# Main application
-def main():
-    st.title("🎬 Video Sound Effect Generator")
-    st.markdown("Upload a video to create immersive, video-specific sound effects with AI.")
     # User Guide
-    with st.expander("📖 User Guide"):
         st.markdown("""
         **How to Use:**
-        1. **Upload a Video**: Choose an MP4, MOV, or AVI file.
-        2. **Select Prompt Mode**:
-           - **Automatic**: Analyzes video frames to generate sound prompts.
-           - **Manual**: Enter your own sound description.
-        3. **Configure Settings**: Adjust frame analysis, audio effects, and model size in the sidebar.
-        4. **Generate**: Click "Generate Sound Effects" to process the video.
-        5. **Download**: Save the enhanced video with sound effects.
-        **Tips**:
-        - Use at least 5 frames for better sound relevance.
-        - Scene-based frame extraction (if available) improves accuracy.
-        - Adjust audio effects for a customized sound experience.
         """)
     # Sidebar Settings
     with st.sidebar:
-        st.header("⚙️ Settings")
-        prompt_mode = st.selectbox("Prompt Mode", ["Automatic", "Manual"])
-        model_size = st.selectbox("Model Size", ["small", "medium", "large"], index=1)
         mix_original = st.checkbox("Mix with Original Audio", value=False)
-        original_volume, generated_volume = 0.5, 0.5
-        if mix_original:
-            original_volume = st.slider("Original Audio Volume", 0.0, 1.0, 0.5)
-            generated_volume = st.slider("Generated Audio Volume", 0.0, 1.0, 0.5)
-        st.subheader("Frame Analysis")
-        num_frames = st.slider("Frames to Analyze", 5, 10, 5, help="More frames improve sound relevance but increase processing time")
-        frame_method = st.selectbox("Frame Extraction Method", ["Uniform", "Scene"] if scene_detect_available else ["Uniform"])
-        st.subheader("Audio Effects")
         effects_settings = {
-            'reverb_ms': st.slider("Reverb (ms)", 0, 500, 100),
-            'echo_ms': st.slider("Echo (ms)", 0, 1000, 200),
-            'highpass': st.slider("High-pass Filter (Hz)", 0, 3000, 100),
-            'lowpass': st.slider("Low-pass Filter (Hz)", 5000, 20000, 15000),
             'compress': st.checkbox("Dynamic Compression", value=True),
             'stereo_pan': st.slider("Stereo Pan (-1 left, 1 right)", -1.0, 1.0, 0.0)
         }
-    # Main Content
-    uploaded_file = st.file_uploader("Upload Video", type=["mp4", "mov", "avi"])
     if uploaded_file:
         with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tmp:
             tmp.write(uploaded_file.read())
             video_path = tmp.name
         st.video(video_path)
         video_clip = mpy.VideoFileClip(video_path)
         duration = video_clip.duration
         video_clip.close()
         if prompt_mode == "Automatic":
-            with st.spinner("Analyzing frames..."):
                 blip_processor, blip_model = load_blip_model()
                 frames = extract_frames(video_path, num_frames, frame_method)
                 if not frames:
                     st.error("No frames extracted. Try a different video or settings.")
                     return
-                descriptions = generate_captions(frames, blip_processor, blip_model)
-                mood = st.selectbox("Sound Mood", ["default", "dramatic", "ambient", "action"])
                 text_prompt = enhance_prompt(descriptions, mood)
-                text_prompt = st.text_area("Edit Prompt", text_prompt, height=100)
         else:
-            text_prompt = st.text_area("Enter Sound Description", "Describe the desired sound effects", height=100)
-        if st.button("Generate Sound Effects", key="generate"):
-            progress = st.progress(0)
-            status = st.empty()
-            status.text("Loading model...")
             musicgen_processor, musicgen_model = load_musicgen_model(f"facebook/musicgen-{model_size}")
-            progress.progress(20)
-            status.text("Generating audio...")
             audio_array = generate_audio(text_prompt, musicgen_processor, musicgen_model, duration)
             temp_audio = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
             sf.write(temp_audio, audio_array, 44100)
-            progress.progress(50)
-            status.text("Applying audio effects...")
             processed_audio = apply_audio_effects(temp_audio, effects_settings)
-            progress.progress(75)
-            status.text("Syncing with video...")
             output_video = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
-            sync_audio_video(video_path, processed_audio, output_video, mix_original, original_volume, generated_volume)
-            progress.progress(100)
-            status.text("Done!")
-            st.success("Sound effects applied!")
             st.video(output_video)
             with open(output_video, "rb") as f:
-                st.download_button("Download Enhanced Video", f, "enhanced_video.mp4", "video/mp4")
             # Cleanup
             for file in [video_path, temp_audio, processed_audio, output_video]:
                 if os.path.exists(file):

 import streamlit as st
 import torch
+import numpy as np
 from transformers import AutoProcessor, BlipForConditionalGeneration, MusicgenForConditionalGeneration
 import imageio
 from PIL import Image
 import soundfile as sf
 import os
 import subprocess
 from pydub import AudioSegment, effects
 import moviepy.editor as mpy
+import time
+from concurrent.futures import ThreadPoolExecutor
+# Set page configuration
+st.set_page_config(page_title="🎬 AI SoundFX Studio", layout="wide", initial_sidebar_state="expanded")
+# Enhanced CSS for better UI
+st.markdown("""
+<style>
+    .reportview-container {
+        background: #1a1a1a;
+        color: white;
+    }
+    .sidebar .sidebar-content {
+        width: 350px;
+    }
+    .stProgress > div > div {
+        background-color: #4CAF50;
+    }
+    .stButton>button {
+        background-color: #4CAF50;
+        color: white;
+    }
+</style>
+""", unsafe_allow_html=True)
 # Optional scene detection
 scene_detect_available = True
 except ImportError:
     scene_detect_available = False
+# Model Management
 @st.cache_resource
 def load_blip_model():
+    """Load BLIP model with optimized settings"""
+    try:
+        processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
+        model = BlipForConditionalGeneration.from_pretrained(
+            "Salesforce/blip-image-captioning-base",
+            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+            low_mem=True
+        ).to("cuda" if torch.cuda.is_available() else "cpu")
+        return processor, model
+    except Exception as e:
+        st.error(f"BLIP model load error: {str(e)}")
+        return None, None
 @st.cache_resource
 def load_musicgen_model(model_name="facebook/musicgen-medium"):
+    """Load MusicGen model with optimized settings"""
+    try:
+        model = MusicgenForConditionalGeneration.from_pretrained(
+            model_name,
+            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+            low_cpu_mem_usage=True
+        ).to("cuda" if torch.cuda.is_available() else "cpu")
+        processor = AutoProcessor.from_pretrained(model_name)
+        return processor, model
+    except Exception as e:
+        st.error(f"MusicGen model load error: {str(e)}")
+        return None, None
 def extract_frames(video_path, num_frames, method="uniform", segment_start=0, segment_end=None):
+    """Optimized frame extraction with smart sampling"""
+    try:
+        video = imageio.get_reader(video_path, "ffmpeg")
+        meta = video.get_meta_data()
+        fps = meta['fps']
+        total_frames = int(meta['duration'] * fps)
+        # Optimize frame count based on duration
+        actual_num_frames = min(num_frames, int(total_frames / 5) or 5)
+        if segment_end is None:
+            segment_end = total_frames / fps
+        start_frame = int(segment_start * fps)
+        end_frame = int(segment_end * fps)
+        total_segment_frames = end_frame - start_frame
+        # Smart frame selection
+        if method == "scene" and scene_detect_available:
+            try:
+                video_manager = VideoManager([video_path])
+                scene_manager = SceneManager()
+                scene_manager.add_detector(ContentDetector(threshold=30))
+                video_manager.set_downscale_factor(2)
+                video_manager.start()
+                scene_manager.detect_scenes(frame_source=video_manager)
+                scene_list = scene_manager.get_scene_list()
+                segment_scenes = [scene for scene in scene_list
+                                if segment_start <= scene[0].get_seconds() < segment_end]
+                frames = []
+                for scene in segment_scenes[:actual_num_frames]:
+                    frame = video_manager.get_frame(scene[0].get_frames())
+                    if frame is not None:
+                        frames.append(Image.fromarray(frame))
+                video_manager.release()
+                # Fill remaining frames if needed
+                if len(frames) < actual_num_frames and total_segment_frames > 0:
+                    remaining = actual_num_frames - len(frames)
+                    step = max(1, total_segment_frames // (remaining + 1))
+                    for i in range(1, remaining + 1):
+                        frame_idx = start_frame + i * step
+                        if frame_idx < end_frame:
+                            frames.append(Image.fromarray(video.get_data(frame_idx)))
+                return frames[:actual_num_frames]
+            except Exception as e:
+                st.warning(f"Scene detection failed: {e}. Using uniform extraction.")
+        # Uniform extraction with numpy optimization
+        frame_indices = np.linspace(start_frame, end_frame, actual_num_frames, endpoint=False).astype(int)
+        frames = []
+        for idx in frame_indices:
+            if idx < total_frames:
+                frame = video.get_data(idx)
+                frames.append(Image.fromarray(frame))
+        return frames[:actual_num_frames]
+    except Exception as e:
+        st.error(f"Frame extraction error: {str(e)}")
+        return []
def generate_captions_parallel(frames, processor, model):
    """Caption each frame with the given processor/model using a thread pool.

    Args:
        frames: Iterable of images accepted by ``processor(images=...)``.
        processor: Callable image processor that also provides ``decode``.
        model: Captioning model exposing ``device``, ``dtype`` and
            ``generate``.

    Returns:
        A list of caption strings in the same order as ``frames``. A frame
        whose captioning failed contributes ``""`` and a warning is shown for
        it.
    """
    def process_frame(frame):
        # Returns (caption, error). Errors are handed back instead of being
        # reported here because this runs on a worker thread, and Streamlit
        # calls are only made from the calling thread below.
        try:
            # Cast to the model dtype as well as the device: the model may be
            # loaded in float16, while the processor yields float32 tensors.
            inputs = processor(images=frame, return_tensors="pt").to(model.device, model.dtype)
            out = model.generate(**inputs, max_length=25, num_beams=3)
            return processor.decode(out[0], skip_special_tokens=True), None
        except Exception as e:
            return "", e

    frames = list(frames)
    if not frames:
        return []
    with ThreadPoolExecutor() as executor:
        results = list(executor.map(process_frame, frames))
    for _, error in results:
        if error is not None:
            st.warning(f"Captioning error: {str(error)}")
    return [caption for caption, _ in results]
 def enhance_prompt(descriptions, mood="default"):
+    """Advanced prompt engineering with pattern recognition"""
     if not descriptions:
         return f"{mood} ambient sound with subtle effects"
+    combined = ". ".join(set(desc.lower() for desc in descriptions))
+    # Enhanced pattern matching dictionary
+    pattern_map = {
+        ('walk', 'run'): "crisp footsteps on varied surfaces with immersive movement sounds",
+        ('car', 'drive', 'vehicle'): "roaring engine, tire screeches, and dynamic road noise with spatial positioning",
+        ('talk', 'person', 'people', 'conversation'): "lively voices, crowd murmur, and spatial chatter with natural reverb",
+        ('wind', 'tree', 'forest', 'nature'): "rustling leaves, gentle wind gusts, and natural ambiance with atmospheric depth",
+        ('crash', 'fall', 'impact'): "intense crash impact, debris scattering, and sharp transient effects with dynamic range",
+        ('water', 'ocean', 'sea'): "realistic water movement, wave dynamics, and aquatic ambiance",
+        ('fire', 'explosion'): "realistic fire crackling, explosions, and heat distortion audio",
+        ('space', 'sci-fi'): "futuristic ambient textures, synth effects, and spatial audio design"
     }
+    # Advanced pattern matching logic
+    matched_patterns = []
+    for keywords, effect in pattern_map.items():
+        if any(keyword in combined for keyword in keywords):
+            matched_patterns.append(effect)
+    if matched_patterns:
+        return f"{mood} {combined}, {'; '.join(matched_patterns)}, cinematic sound design with spatial audio"
+    return f"{mood} {combined}, rich ambient soundscape with professional effects, 4K audio resolution"
 def generate_audio(prompt, processor, model, duration, sample_rate=44100):
+    """Optimized audio generation with smart parameters"""
+    try:
+        inputs = processor(text=[prompt], padding=True, return_tensors="pt").to(model.device)
+        audio_values = model.generate(
+            **inputs,
+            max_new_tokens=int(256 * (duration / 8)),
+            num_beams=3,
+            early_stopping=True,
+            do_sample=True,
+            temperature=0.85,
+            guidance_scale=5.0,
+            top_k=80,
+            top_p=0.85
+        )
+        audio_array = audio_values[0].cpu().numpy()
+        audio_array = np.tanh(audio_array)  # Faster than clip + max normalization
+        return audio_array
+    except Exception as e:
+        st.error(f"Audio generation error: {str(e)}")
+        return np.zeros(int(duration * sample_rate))
 def apply_audio_effects(audio_path, settings):
+    """Enhanced audio effects processing"""
+    try:
+        sound = AudioSegment.from_wav(audio_path)
+        # Reverb
+        if settings['reverb_ms'] > 0:
+            sound = sound.overlay(sound - 15, position=settings['reverb_ms'])
+        # Echo
+        if settings['echo_ms'] > 0:
+            echo = sound - 15
+            sound = sound.overlay(echo, position=settings['echo_ms'])
+        # Filters
+        if settings['highpass'] > 0:
+            sound = sound.high_pass_filter(settings['highpass'])
+        if settings['lowpass'] < 20000:
+            sound = sound.low_pass_filter(settings['lowpass'])
+        # Dynamic processing
+        if settings['compress']:
+            sound = effects.compress_dynamic_range(sound)
+        # Stereo imaging
+        sound = sound.pan(settings['stereo_pan'])
+        sound = effects.normalize(sound)
+        processed_path = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
+        sound.export(processed_path, format="wav")
+        return processed_path
+    except Exception as e:
+        st.error(f"Audio effects error: {str(e)}")
+        return audio_path
def sync_audio_video(video_path, audio_path, output_path, mix_original=False,
                    original_volume=0.5, generated_volume=0.5):
    """Mux an audio track onto a video with ffmpeg, optionally mixing in the
    video's own audio first.

    Args:
        video_path: Path of the source video.
        audio_path: Path of the generated WAV file.
        output_path: Path the muxed video is written to (overwritten).
        mix_original: When true and the video has an audio track, overlay the
            generated audio on the original audio before muxing.
        original_volume: 0..1 level of the original audio. Applied as an
            attenuation of ``20 * (1 - volume)`` dB, so 0 is -20 dB rather
            than silence.
        generated_volume: 0..1 level of the generated audio, same scale.

    Returns:
        None. Failures are reported with ``st.error`` rather than raised.
    """
    mixed_path = None
    try:
        if mix_original:
            # Open the clip only to check for an audio track, and close it so
            # it is not left open for the rest of the call.
            video_clip = mpy.VideoFileClip(video_path)
            try:
                has_audio = video_clip.audio is not None
            finally:
                video_clip.close()
            if has_audio:
                # No forced format: uploads may be MOV/AVI saved under an
                # .mp4 name, so let ffmpeg detect the container itself.
                original_audio_seg = AudioSegment.from_file(video_path)
                generated_audio_seg = AudioSegment.from_wav(audio_path)
                # Volume adjustment
                original_audio_seg = original_audio_seg - (20 * (1 - original_volume))
                generated_audio_seg = generated_audio_seg - (20 * (1 - generated_volume))
                mixed_audio = original_audio_seg.overlay(generated_audio_seg)
                with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as mixed_file:
                    mixed_path = mixed_file.name
                mixed_audio.export(mixed_path, format="wav")
                audio_path = mixed_path
            else:
                st.warning("No original audio found. Using generated audio only.")
        # Copy the video stream untouched and encode only the audio to AAC.
        # '-preset' / '-vsync' were dropped: they configure video encoding and
        # frame sync, and nothing is re-encoded under '-c:v copy'.
        cmd = [
            'ffmpeg',
            '-i', video_path,
            '-i', audio_path,
            '-c:v', 'copy',
            '-c:a', 'aac',
            '-map', '0:v:0',
            '-map', '1:a:0',
            '-shortest',
            '-y',
            output_path
        ]
        subprocess.run(cmd, check=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
    except subprocess.CalledProcessError as e:
        # Show the tail of ffmpeg's stderr; the exception text alone only
        # contains the exit status.
        stderr_text = e.stderr.decode(errors="replace").strip() if e.stderr else ""
        detail = "\n".join(stderr_text.splitlines()[-5:]) or str(e)
        st.error(f"Sync error: {detail}")
    except Exception as e:
        st.error(f"Sync error: {str(e)}")
    finally:
        # The mixed WAV is an intermediate file known only to this function.
        if mixed_path and os.path.exists(mixed_path):
            os.remove(mixed_path)
def unload_model(model):
    """Move ``model`` to the CPU and release cached CUDA memory.

    Does nothing when CUDA is not available.

    NOTE(review): the visible call sites pass models returned by the
    ``st.cache_resource`` loaders; moving such a model to the CPU presumably
    changes the cached object for later reruns as well - confirm that is
    intended.
    """
    if not torch.cuda.is_available():
        return
    model.to("cpu")
    torch.cuda.empty_cache()
+def main():
+    st.title("🎬 AI SoundFX Studio")
+    st.markdown("### Create immersive soundscapes from video with optimized AI processing")
+    # Initialize session state
+    if 'processing_time' not in st.session_state:
+        st.session_state.processing_time = 0
     # User Guide
+    with st.expander("📖 User Guide & Tips"):
         st.markdown("""
         **How to Use:**
+        1. Upload a video file (MP4, MOV, AVI)
+        2. Choose between Automatic (AI-generated) or Manual sound description
+        3. Adjust settings in the sidebar:
+           - Model size (small/medium/large)
+           - Frame analysis parameters
+           - Audio effects customization
+        4. Click "Generate Sound Effects"
+        5. Download the enhanced video
+        **Optimization Tips:**
+        - Use "small" model for quick previews
+        - Enable "Scene Detection" for better context
+        - Adjust audio effects for custom sound design
+        - Use "Mix with Original Audio" for balanced results
         """)
     # Sidebar Settings
     with st.sidebar:
+        st.header("⚙️ Processing Settings")
+        # Processing Mode
+        prompt_mode = st.selectbox("Prompt Generation", ["Automatic", "Manual"])
+        # Model Selection
+        model_size = st.selectbox("Model Size", ["small", "medium", "large"], index=1,
+                                 help="Larger models = better quality but slower processing")
+        # Audio Mixing
         mix_original = st.checkbox("Mix with Original Audio", value=False)
+        col1, col2 = st.columns(2)
+        with col1:
+            original_vol = st.slider("Original Volume", 0.0, 1.0, 0.5) if mix_original else 0.5
+        with col2:
+            generated_vol = st.slider("Generated Volume", 0.0, 1.0, 0.5) if mix_original else 0.5
+        # Frame Analysis
+        st.subheader("🎥 Frame Analysis")
+        num_frames = st.slider("Frames to Analyze", 3, 10, 5,
+                              help="More frames improve accuracy but increase processing time")
+        frame_method = st.selectbox("Frame Extraction",
+                                  ["Uniform", "Scene"] if scene_detect_available else ["Uniform"],
+                                  help="Scene detection provides better contextual analysis")
+        # Audio Effects
+        st.subheader("🎛️ Audio Effects")
         effects_settings = {
+            'reverb_ms': st.slider("Reverb (ms)", 0, 500, 50),
+            'echo_ms': st.slider("Echo (ms)", 0, 1000, 100),
+            'highpass': st.slider("High-pass Filter (Hz)", 0, 3000, 50),
+            'lowpass': st.slider("Low-pass Filter (Hz)", 5000, 20000, 12000),
             'compress': st.checkbox("Dynamic Compression", value=True),
             'stereo_pan': st.slider("Stereo Pan (-1 left, 1 right)", -1.0, 1.0, 0.0)
         }
+        # Performance Presets
+        st.subheader("⚡ Performance")
+        quality_preset = st.selectbox("Quality Preset", ["Fast", "Balanced", "High Quality"])
+        presets = {
+            "Fast": {"num_frames": 3, "model_size": "small"},
+            "Balanced": {"num_frames": 5, "model_size": "medium"},
+            "High Quality": {"num_frames": 8, "model_size": "large"}
+        }
+        if quality_preset != "Balanced":
+            num_frames = presets[quality_preset]["num_frames"]
+            model_size = presets[quality_preset]["model_size"]
+        st.info("Processing time estimate: 2-5 minutes (varies by settings)")
+    # Main Content Area
+    uploaded_file = st.file_uploader("Upload Video File", type=["mp4", "mov", "avi"])
     if uploaded_file:
+        # Create temporary files
         with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tmp:
             tmp.write(uploaded_file.read())
             video_path = tmp.name
+        # Video Preview
         st.video(video_path)
+        # Get video duration
         video_clip = mpy.VideoFileClip(video_path)
         duration = video_clip.duration
         video_clip.close()
+        # Prompt Generation
         if prompt_mode == "Automatic":
+            with st.spinner("Analyzing video content..."):
                 blip_processor, blip_model = load_blip_model()
+                if not blip_processor or not blip_model:
+                    st.error("Failed to load BLIP model")
+                    return
                 frames = extract_frames(video_path, num_frames, frame_method)
                 if not frames:
                     st.error("No frames extracted. Try a different video or settings.")
                     return
+                # Display analyzed frames
+                cols = st.columns(len(frames))
+                for col, frame in zip(cols, frames):
+                    with col:
+                        st.image(frame, use_column_width=True)
+                descriptions = generate_captions_parallel(frames, blip_processor, blip_model)
+                unload_model(blip_model)
+                # Mood selection
+                mood = st.selectbox("Sound Mood", [
+                    "default", "dramatic", "ambient", "action", "sci-fi", "horror", "comedy"
+                ], help="Select the overall atmosphere for the sound design")
+                # Enhanced prompt with AI suggestions
                 text_prompt = enhance_prompt(descriptions, mood)
+                st.subheader("Generated Prompt")
+                text_prompt = st.text_area("Edit Prompt", text_prompt, height=150)
+                st.markdown("*Suggested modifications: Add specific instrument types, intensity levels, or emotional cues*")
         else:
+            st.subheader("Enter Sound Description")
+            text_prompt = st.text_area("Describe the desired sound effects",
+                                     "E.g., 'Cinematic trailer music with thunderous impacts and soaring strings'",
+                                     height=150)
+        # Generation Button
+        if st.button("🔊 Generate Sound Effects", key="generate", use_container_width=True):
+            start_time = time.time()
+            # Progress tracking
+            progress_bar = st.progress(0)
+            status_text = st.empty()
+            status_text.text("Loading models...")
+            # Load MusicGen model
             musicgen_processor, musicgen_model = load_musicgen_model(f"facebook/musicgen-{model_size}")
+            if not musicgen_processor or not musicgen_model:
+                st.error("Failed to load MusicGen model")
+                return
+            progress_bar.progress(20)
+            # Audio Generation
+            status_text.text("Generating audio...")
             audio_array = generate_audio(text_prompt, musicgen_processor, musicgen_model, duration)
+            unload_model(musicgen_model)
             temp_audio = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
             sf.write(temp_audio, audio_array, 44100)
+            progress_bar.progress(50)
+            # Apply Effects
+            status_text.text("Applying audio effects...")
             processed_audio = apply_audio_effects(temp_audio, effects_settings)
+            progress_bar.progress(75)
+            # Sync with Video
+            status_text.text("Syncing with video...")
             output_video = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
+            sync_audio_video(video_path, processed_audio, output_video, mix_original, original_vol, generated_vol)
+            progress_bar.progress(100)
+            # Finalize
+            status_text.text("Processing complete!")
+            st.success("✅ Sound effects applied successfully!")
+            # Display result
             st.video(output_video)
+            # Download button
             with open(output_video, "rb") as f:
+                st.download_button("📥 Download Enhanced Video",
+                                 f, "enhanced_video.mp4", "video/mp4",
+                                 use_container_width=True)
+            # Timing info
+            processing_time = time.time() - start_time
+            st.session_state.processing_time = processing_time
+            st.info(f"⏱️ Processing time: {processing_time:.1f} seconds")
             # Cleanup
             for file in [video_path, temp_audio, processed_audio, output_video]:
                 if os.path.exists(file):