Update app.py

app.py CHANGED
@@ -1,5 +1,6 @@
 import streamlit as st
-import imageio
+# import imageio # Replaced by cv2
+import cv2 # For faster frame extraction
 import numpy as np
 from PIL import Image
 from transformers import AutoProcessor, BlipForConditionalGeneration, MusicgenForConditionalGeneration
@@ -18,6 +19,7 @@ except ModuleNotFoundError:
 # --- Constants ---
 TEMP_DIR = tempfile.gettempdir()
 DEFAULT_SAMPLING_RATE = 32000 # MusicGen default
+BLIP_PROCESS_SIZE = 384 # Resize frames to this size for BLIP

 # --- Model Loading (Cached) ---
 @st.cache_resource
@@ -41,39 +43,48 @@ def load_musicgen_model():
     return processor, model

 # --- Core Functions ---
-def extract_frames_from_video(video_path, num_frames_to_extract):
-    """Extracts a specified number of frames evenly from a video."""
+def extract_frames_from_video_cv2(video_path, num_frames_to_extract, target_size=BLIP_PROCESS_SIZE):
+    """Extracts a specified number of frames evenly from a video using OpenCV, and resizes them."""
     frames = []
-    try:
-        video_reader = imageio.get_reader(video_path)
-        total_frames = video_reader.count_frames()
-        if total_frames == 0:
-            st.warning("Video appears to have 0 frames. Please check the video file.")
-            return []
-        step = max(1, total_frames // num_frames_to_extract)
-        extracted_count = 0
-        for i in range(0, total_frames, step):
-            if extracted_count >= num_frames_to_extract:
-                break
-            frame_data = video_reader.get_data(i)
-            frames.append(Image.fromarray(frame_data))
-            extracted_count += 1
-        video_reader.close()
-    except Exception as e:
-        st.error(f"Error extracting frames: {e}")
-        return []
+    cap = cv2.VideoCapture(video_path)
+    if not cap.isOpened():
+        st.error("Error: Could not open video file.")
+        return []
+
+    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+    if total_frames == 0:
+        st.warning("Video appears to have 0 frames. Please check the video file.")
+        cap.release()
+        return []
+
+    step = max(1, total_frames // num_frames_to_extract)
+    extracted_count = 0
+    for i in range(0, total_frames, step):
+        if extracted_count >= num_frames_to_extract:
+            break
+        cap.set(cv2.CAP_PROP_POS_FRAMES, i)
+        ret, frame = cap.read()
+        if ret:
+            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+            pil_image = Image.fromarray(frame_rgb)
+            # Resize for BLIP
+            pil_image_resized = pil_image.resize((target_size, target_size), Image.Resampling.LANCZOS)
+            frames.append(pil_image_resized)
+            extracted_count += 1
+        else:
+            # Could mean end of video or read error
+            break
+
+    cap.release()
+    if not frames and num_frames_to_extract > 0:
+        st.warning(f"Could not extract any frames. Tried to extract {num_frames_to_extract} frames with step {step} from {total_frames} total frames.")
     return frames

 def generate_enhanced_prompt(base_description, context_style="cinematic"):
-    """
-    Generates a more detailed and evocative sound-specific prompt from a BLIP caption.
-    """
     base = base_description.lower().strip().replace("a photo of ", "").replace("an image of ", "")
-
-    # Keywords for actions, objects, environments, and sound qualities
     action_sounds = {
         "walking": "footsteps, rhythmic, on {surface}", "running": "rapid footsteps, heavy breathing, on {surface}",
         "driving": "engine rumble, tire sounds on {surface}", "talking": "clear voices, conversational tone",
@@ -96,142 +107,67 @@ def generate_enhanced_prompt(base_description, context_style="cinematic"):
         "office": "office hum, keyboard typing, distant chatter",
         "street": "city street sounds, traffic, distant sirens, pedestrian chatter",
         "forest": "forest ambience, rustling leaves, distant bird calls, twigs snapping",
-        "beach": "ocean waves crashing, seagulls, gentle wind",
-        "cave": "echoing drips, damp air, low rumble",
-        "space": "eerie silence, low hum, occasional electronic beep",
-        "underwater": "muffled sounds, bubbling, deep water pressure"
+        "beach": "ocean waves crashing, seagulls, gentle wind", "cave": "echoing drips, damp air, low rumble",
+        "space": "eerie silence, low hum, occasional electronic beep", "underwater": "muffled sounds, bubbling, deep water pressure"
     }
     sound_qualities = {
-        "cinematic": "high quality, clear, immersive, dynamic range",
-        "realistic": "natural, authentic, detailed",
-        "cartoon": "exaggerated, playful, boings, zips",
-        "ominous": "low rumble, dissonant, suspenseful",
+        "cinematic": "high quality, clear, immersive, dynamic range", "realistic": "natural, authentic, detailed",
+        "cartoon": "exaggerated, playful, boings, zips", "ominous": "low rumble, dissonant, suspenseful",
         "peaceful": "gentle, calming, serene"
     }
-
-    found_elements = []
-    prompt_parts = [f"A {context_style} soundscape of:"]
-
-    # Prioritize actions
+    found_elements, prompt_parts = [], [f"A {context_style} soundscape of:"]
     for action_keyword, sound_desc in action_sounds.items():
         if action_keyword in base:
-            surface = "generic surface"
-            if "grass" in base: surface = "grass"
-            elif "wood" in base or "floor" in base: surface = "wooden floor"
-            elif "concrete" in base or "pavement" in base: surface = "concrete"
-            elif "water" in base: surface = "water"
-            prompt_parts.append(sound_desc.format(surface=surface))
-            found_elements.append(action_keyword)
-            break # Often one main action is enough focus
-
-    # Add objects
+            surface = "generic surface"; ("grass" in base and (surface := "grass")) or (("wood" in base or "floor" in base) and (surface := "wooden floor")) or (("concrete" in base or "pavement" in base) and (surface := "concrete")) or (("water" in base) and (surface := "water"))
+            prompt_parts.append(sound_desc.format(surface=surface)); found_elements.append(action_keyword); break
     for obj_keyword, sound_desc in object_sounds.items():
         if obj_keyword in base and obj_keyword not in found_elements:
-            prompt_parts.append(sound_desc)
-            found_elements.append(obj_keyword)
-        # Limit to 1-2 dominant objects to avoid overly complex prompts
-        if len(found_elements) > (1 if any(action_keyword in found_elements for action_keyword in action_sounds) else 2):
-            break
-
-    # Add environment
+            prompt_parts.append(sound_desc); found_elements.append(obj_keyword)
+        if len(found_elements) > (1 if any(ak in found_elements for ak in action_sounds) else 2): break
     added_env = False
     for env_keyword, sound_desc in environment_ambience.items():
-        if env_keyword in base:
-            prompt_parts.append(f"environment: {sound_desc}")
-            added_env = True
-            break
-
-    if not found_elements and not added_env: # if nothing specific found
-        prompt_parts.append(f"subtle ambient sound related to '{base}'")
-
-    # Add general quality
-    if context_style in sound_qualities:
-        prompt_parts.append(sound_qualities[context_style])
-    else:
-        prompt_parts.append(sound_qualities["cinematic"]) # default
-
+        if env_keyword in base: prompt_parts.append(f"environment: {sound_desc}"); added_env = True; break
+    if not found_elements and not added_env: prompt_parts.append(f"subtle ambient sound related to '{base}'")
+    prompt_parts.append(sound_qualities.get(context_style, sound_qualities["cinematic"]))
     return ", ".join(prompt_parts) + "."


 def generate_audio(text_prompt, negative_prompt, duration_s, guidance_scale, top_k, top_p, device):
-    """Generates audio using MusicGen model."""
     musicgen_processor, musicgen_model = load_musicgen_model()
-
-    inputs = musicgen_processor(
-        text=[text_prompt],
-        negative_prompt=[negative_prompt] if negative_prompt else None,
-        padding=True,
-        return_tensors="pt",
-    ).to(device)
-
-    # max_new_tokens: 1 token approx 0.02 seconds at 50Hz. So, duration_s * 50.
-    # MusicGen generates around 5-8s with 256 tokens, but let's be more direct.
-    # The model has a fixed sampling rate (e.g., 32kHz for musicgen-small).
-    # The number of tokens directly influences duration.
-    # model.config.audio_encoder.frame_rate seems to be 50 (tokens per second of audio)
+    inputs = musicgen_processor(text=[text_prompt], negative_prompt=[negative_prompt] if negative_prompt else None, padding=True, return_tensors="pt").to(device)
     max_new_tokens = int(duration_s * musicgen_model.config.audio_encoder.frame_rate)
-    if max_new_tokens > 1500: # MusicGen small
-        st.warning(f"Requested duration ({duration_s}s) is long. Capping to ~30s for musicgen-small.")
+    if max_new_tokens > 1500: # MusicGen small context limit
+        st.warning(f"Requested SFX duration ({duration_s}s) is long. Capping to ~30s for musicgen-small to ensure stability.")
         max_new_tokens = 1500

-    audio_values = musicgen_model.generate(
-        **inputs,
-        max_new_tokens=max_new_tokens,
-        do_sample=True,
-        guidance_scale=guidance_scale,
-        top_k=top_k,
-        top_p=top_p
-    )
+    audio_values = musicgen_model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=True, guidance_scale=guidance_scale, top_k=top_k, top_p=top_p)
     audio_array = audio_values[0].cpu().numpy()
-
-    if audio_array.ndim > 1:
-        audio_array = np.mean(audio_array, axis=0) # Convert to mono by averaging channels
-
-    audio_array = audio_array / np.max(np.abs(audio_array)) * 0.9 # Normalize
+    if audio_array.ndim > 1: audio_array = np.mean(audio_array, axis=0)
+    audio_array = audio_array / (np.max(np.abs(audio_array)) + 1e-6) * 0.9 # Normalize, avoid div by zero
     audio_array = np.clip(audio_array, -1.0, 1.0)
-
     return audio_array, musicgen_model.config.audio_encoder.sampling_rate

-def sync_audio_to_video(video_path, audio_path, output_path, mix_original, original_vol, sfx_vol):
-    """Synchronizes generated audio with the video, mixing or replacing original audio."""
+def sync_audio_to_video(video_path, audio_path, output_path, mix_original, original_vol, sfx_vol, encoding_preset):
     video_clip = mpy.VideoFileClip(video_path)
     generated_audio_clip = mpy.AudioFileClip(audio_path)
     video_duration = video_clip.duration
-
-    # Adjust generated audio length to match video duration
-    if generated_audio_clip.duration < video_duration:
-        # Loop the audio if it's shorter
-        num_loops = int(np.ceil(video_duration / generated_audio_clip.duration))
-        looped_clips = [generated_audio_clip] * num_loops
-        final_generated_audio = mpy.concatenate_audioclips(looped_clips).subclip(0, video_duration)
-    else:
-        # Trim the audio if it's longer
-        final_generated_audio = generated_audio_clip.subclip(0, video_duration)
-
-    # Apply SFX volume
+    final_generated_audio = generated_audio_clip.subclip(0, video_duration) if generated_audio_clip.duration >= video_duration else mpy.concatenate_audioclips([generated_audio_clip] * int(np.ceil(video_duration / generated_audio_clip.duration))).subclip(0, video_duration)
     final_generated_audio = final_generated_audio.volumex(sfx_vol)
-
-    # Mix or replace audio
     if mix_original and video_clip.audio:
         original_audio = video_clip.audio.volumex(original_vol)
         composite_audio = mpy.CompositeAudioClip([original_audio, final_generated_audio])
         final_video = video_clip.set_audio(composite_audio)
     else:
         final_video = video_clip.set_audio(final_generated_audio)
-
-    # Write the final video file
-    final_video.write_videofile(
-        output_path,
-        codec="libx264",
-        audio_codec="aac",
-        threads=os.cpu_count() or 2, # Use multiple threads for encoding
-        logger='bar' # Show moviepy progress bar
-    )
+
+    # Use more threads for encoding, and the selected preset
+    num_threads = os.cpu_count() or 2
+    st.write(f"MoviePy encoding with preset: '{encoding_preset}', threads: {num_threads}")
+
+    final_video.write_videofile(output_path, codec="libx264", audio_codec="aac", preset=encoding_preset,
+                                bitrate="4000k", # Reduced bitrate slightly for faster presets
+                                audio_bitrate="192k", threads=num_threads, logger='bar')
+    # Close clips to free resources
     video_clip.close()
     generated_audio_clip.close()
     if 'original_audio' in locals(): original_audio.close()
@@ -242,221 +178,127 @@ def sync_audio_to_video(video_path, audio_path, output_path, mix_original, origi

 # --- Streamlit UI ---
 st.set_page_config(layout="wide", page_title="Video To SoundFX Generator")
-st.title("Video To SoundFX Generator ...")
-st.markdown("Upload an MP4 video ...")
+st.title("⚡ Speedy Video To SoundFX Generator 🎶")
+st.markdown("Upload an MP4 video. This tool generates sound effects and syncs them. **Optimized for speed!**")
+st.markdown("> For *truly* fast performance, especially the AI parts, ensure your Hugging Face Space is running on **GPU hardware**.")

 # --- Sidebar for Settings ---
 with st.sidebar:
     st.header("⚙️ Generation Settings")

     num_frames_to_analyze = st.slider(
-        "Number of Frames to Analyze", ...,
-        help="..."
+        "Number of Frames to Analyze (Fewer = Faster)", 1, 5, 2, # Reduced max and default
+        help=f"Fewer frames speed up analysis. Frames are resized to {BLIP_PROCESS_SIZE}x{BLIP_PROCESS_SIZE} before analysis."
     )

-    prompt_style = st.selectbox(
-        "Sound Style for Prompt",
-        ["cinematic", "realistic", "cartoon", "ominous", "peaceful"],
-        index=0
-    )
+    video_encoding_preset = st.selectbox(
+        "Video Encoding Speed (Faster = Lower Quality/Larger File)",
+        ('ultrafast', 'superfast', 'veryfast', 'faster', 'fast', 'medium'),
+        index=1, # Default to 'superfast'
+        help="Controls video encoding speed. 'ultrafast' is quickest but may reduce quality or increase file size. 'medium' is balanced."
+    )
+
+    prompt_style = st.selectbox("Sound Style for Prompt",
+                                ["cinematic", "realistic", "cartoon", "ominous", "peaceful"], index=0)

     st.subheader("Audio Mixing")
     mix_original_audio = st.checkbox("Mix with Original Video Audio", value=True)
-    original_audio_volume = st.slider(
-        "Original Audio Volume", ...,
-        disabled=not mix_original_audio,
-        help="Volume of the video's original audio when mixed."
-    )
-    sfx_audio_volume = st.slider(
-        "Generated SFX Volume", 0.0, 1.0, 0.8,
-        help="Volume of the generated sound effect."
-    )
+    original_audio_volume = st.slider("Original Audio Volume", 0.0, 1.0, 0.4, disabled=not mix_original_audio)
+    sfx_audio_volume = st.slider("Generated SFX Volume", 0.0, 1.0, 0.9)

-    with st.expander("Advanced MusicGen Settings"):
-        generated_sfx_duration = st.slider(
-            "Target SFX Clip Duration (s)", ...
-        )
-        guidance_scale = st.slider(
-            "Guidance Scale (CFG)", 1.0, 10.0, 3.0,
-            help="Higher values make the audio follow the prompt more closely, but can reduce diversity."
-        )
-        top_k = st.slider(
-            "Top-K Sampling", 0, 250, 50,
-            help="Filters to the K most likely next tokens. 0 means no filtering."
-        )
-        top_p = st.slider(
-            "Top-P (Nucleus) Sampling", 0.0, 1.0, 0.95,
-            help="Selects tokens from the smallest set whose cumulative probability exceeds P."
-        )
-        negative_prompt_text = st.text_input(
-            "Negative Prompt (Optional)",
-            placeholder="e.g., low quality, noisy, muffled, music, speech",
-            help="Describe sounds to avoid."
-        )
+    with st.expander("Advanced MusicGen Settings (Impacts Speed & Quality)"):
+        generated_sfx_duration = st.slider("Target SFX Clip Duration (s) (Shorter = Faster)", 3, 20, 8, # Reduced max & default
+                                           help="Shorter base SFX clips generate faster. Max ~30s for musicgen-small.")
+        guidance_scale = st.slider("Guidance Scale (CFG)", 1.0, 7.0, 3.0)
+        top_k = st.slider("Top-K Sampling", 0, 250, 50)
+        top_p = st.slider("Top-P (Nucleus) Sampling", 0.0, 1.0, 0.95)
+        negative_prompt_text = st.text_input("Negative Prompt (Optional)", placeholder="e.g., low quality, noisy, music")

 # --- Main Area for Upload and Results ---
-uploaded_file = st.file_uploader("📤 Upload an MP4 Video", type=["mp4"])
+uploaded_file = st.file_uploader("📤 Upload an MP4 Video (Shorter videos process faster!)", type=["mp4"])

 if uploaded_file:
-    # Determine device
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    st.write(f"Using device: {device}")
-    if device.type == 'cuda':
-        st.write(f"GPU: {torch.cuda.get_device_name(0)}")
+    st.write(f"Using device: `{device}`. {'GPU detected: ' + torch.cuda.get_device_name(0) if device.type == 'cuda' else 'Warning: CPU processing will be slow for AI models.'}")

-    # Temporary file paths
     temp_video_path = os.path.join(TEMP_DIR, f"temp_video_{uploaded_file.name}")
     temp_audio_path = os.path.join(TEMP_DIR, "temp_generated_audio.wav")
     output_video_path = os.path.join(TEMP_DIR, f"output_{uploaded_file.name}")

     try:
-        with open(temp_video_path, "wb") as f:
-            f.write(uploaded_file.getbuffer())
-
-        progress_bar = st.progress(0)
-        status_text = st.empty()
-
-        # 1. Extract Frames
-        status_text.info("⏳ Step 1/5: Extracting frames from video...")
-        frames = extract_frames_from_video(temp_video_path, num_frames_to_analyze)
-        if not frames:
-            st.error("Could not extract frames. Please try another video or check settings.")
-            st.stop()
+        with open(temp_video_path, "wb") as f: f.write(uploaded_file.getbuffer())
+        progress_bar = st.progress(0); status_text = st.empty()
+
+        status_text.info("⏳ 1/5: Extracting & resizing frames...")
+        frames = extract_frames_from_video_cv2(temp_video_path, num_frames_to_analyze, BLIP_PROCESS_SIZE)
+        if not frames: st.error("Could not extract frames."); st.stop()
         progress_bar.progress(20)

-        # 2. Analyze Frames
-        status_text.info("⏳ Step 2/5: Analyzing frames and generating descriptions...")
+        status_text.info("⏳ 2/5: Analyzing frames (BLIP)...")
         blip_processor, blip_model = load_blip_model()
         descriptions = []
+        raw_blip_captions = [] # For display
         for i, frame_pil in enumerate(frames):
             inputs = blip_processor(images=frame_pil, return_tensors="pt")
-            if device.type == 'cuda':
-                inputs = {k: v.to(device) for k, v in inputs.items()}
-
-            with torch.no_grad(): # Important for inference
-                out = blip_model.generate(**inputs, max_new_tokens=50) # Increased max_new_tokens for more detail
-
+            if device.type == 'cuda': inputs = {k: v.to(device) for k, v in inputs.items()}
+            with torch.no_grad(): out = blip_model.generate(**inputs, max_new_tokens=30) # Shorter BLIP captions
             base_desc = blip_processor.decode(out[0], skip_special_tokens=True)
+            raw_blip_captions.append(base_desc)
             enhanced_desc = generate_enhanced_prompt(base_desc, prompt_style)
             descriptions.append(enhanced_desc)
-            status_text.info(f"⏳ Step 2/5: Analyzed frame {i+1}/{len(frames)} ...")
-
-        combined_description = " Then, ".join(descriptions)
-        progress_bar.progress(40)
-
-        st.subheader("Generated Sound Prompt")
-        st.markdown(f"Based on video analysis, the following prompt was generated for the sound effect. **You can edit it below before generating the audio.**")
-
-        # Allow user to edit the prompt
-        editable_prompt = st.text_area(
-            "Sound Effect Prompt:",
-            value=combined_description,
-            height=150,
-            help="Edit this prompt to fine-tune the sound generation."
-        )
-
-        if st.button("✨ Generate Sound & Sync Video"):
-            if not editable_prompt.strip():
-                st.error("Prompt cannot be empty!")
-            else:
-                # 3. Generate Sound Effect
-                status_text.info(f"⏳ Step 3/5: Generating sound effect for: '{editable_prompt[:100]}...'")
-                audio_array, sample_rate = generate_audio(
-                    editable_prompt,
-                    negative_prompt_text,
-                    generated_sfx_duration,
-                    guidance_scale,
-                    top_k,
-                    top_p,
-                    device
-                )
-                sf.write(temp_audio_path, audio_array, sample_rate)
-                progress_bar.progress(60)
+            status_text.info(f"⏳ 2/5: Frame {i+1}/{len(frames)} analyzed.")
+        combined_description = " Then, ".join(descriptions); progress_bar.progress(40)

-                # 4. Sync Audio with Video
-                status_text.info("⏳ Step 4/5: ...")
-                # Display moviepy progress within Streamlit
-                with st.spinner("MoviePy is processing the video... This can take a while for longer videos."):
-                    sync_audio_to_video(
-                        temp_video_path,
-                        temp_audio_path,
-                        output_video_path,
-                        mix_original_audio,
-                        original_audio_volume,
-                        sfx_audio_volume
-                    )
-                progress_bar.progress(90)
+        st.subheader("Generated Sound Prompt (Editable)")
+        editable_prompt = st.text_area("Sound Effect Prompt:", value=combined_description, height=100)

-                ...
-                st.write(f"**Base SFX Duration:** {generated_sfx_duration}s (looped/trimmed to video length)")
-                st.write(f"**MusicGen Sampling Rate:** {sample_rate} Hz")
+        if st.button("✨ Generate Sound & Sync Video (FAST MODE)", type="primary"):
+            if not editable_prompt.strip(): st.error("Prompt cannot be empty!"); st.stop()
+
+            status_text.info(f"⏳ 3/5: Generating sound (MusicGen)...")
+            audio_array, sample_rate = generate_audio(editable_prompt, negative_prompt_text, generated_sfx_duration, guidance_scale, top_k, top_p, device)
+            sf.write(temp_audio_path, audio_array, sample_rate); progress_bar.progress(60)
+
+            status_text.info(f"⏳ 4/5: Syncing audio with video (MoviePy @ {video_encoding_preset})...")
+            with st.spinner(f"MoviePy is encoding (preset: {video_encoding_preset})... This is the slowest part for CPU users."):
+                sync_audio_to_video(temp_video_path, temp_audio_path, output_video_path, mix_original_audio, original_audio_volume, sfx_audio_volume, video_encoding_preset)
+            progress_bar.progress(90)
+
+            status_text.success("✅ 5/5: Processing Complete!")
+            st.subheader("Your Sound-Enhanced Video:")
+            try:
+                with open(output_video_path, 'rb') as vf: video_bytes = vf.read()
+                st.video(video_bytes)
+                st.download_button("📥 Download Synced Video", video_bytes, f"sfx_synced_{uploaded_file.name}", "video/mp4")
+            except FileNotFoundError: st.error("Output video file not found.")
+            except Exception as e: st.error(f"Error displaying video: {e}")
+            progress_bar.progress(100)
+
+            with st.expander("Generation Details", expanded=False):
+                st.write("**Original BLIP Captions (on resized frames):**")
+                for i, cap in enumerate(raw_blip_captions): st.markdown(f"- Frame {i+1}: `{cap}`")
+                st.write(f"**Final Prompt Used for MusicGen:** `{editable_prompt}`")
+                if negative_prompt_text: st.write(f"**Negative Prompt Used:** `{negative_prompt_text}`")
+                st.write(f"**Base SFX Duration:** {generated_sfx_duration}s")
+                st.write(f"**MusicGen Sampling Rate:** {sample_rate} Hz")
+                st.write(f"**Video Encoding Preset:** {video_encoding_preset}")

     except Exception as e:
         st.error(f"An unexpected error occurred: {e}")
-        st.error("Troubleshooting ...")
-        st.markdown("- Try a shorter or smaller resolution video.")
-        st.markdown("- Reduce the 'Number of Frames to Analyze'.")
-        st.markdown("- Ensure your Hugging Face Space has enough resources (CPU/RAM, GPU if applicable).")
-        st.markdown("- Check the console logs in your Hugging Face Space for more detailed errors.")
+        st.error("Troubleshooting: Try a shorter video, fewer frames, or 'ultrafast' encoding. Ensure GPU hardware on Spaces for AI speed.")
         import traceback
         st.code(traceback.format_exc())
-
     finally:
-        # Clean up temporary files
-        for path in [temp_video_path, temp_audio_path, output_video_path]:
-            if os.path.exists(path):
-                try:
-                    os.remove(path)
-                except Exception as e:
-                    st.warning(f"Could not remove temporary file {path}: {e}")
+        for p in [temp_video_path, temp_audio_path, output_video_path]:
+            if os.path.exists(p):
+                try: os.remove(p)
+                except Exception as e: st.warning(f"Could not remove temp file {p}: {e}")
 else:
-    st.info("Welcome! Upload an MP4 ...")
+    st.info("Welcome! Upload an MP4 to get started. Adjust settings in the sidebar for speed/quality.")
     st.markdown("""
-    **How it works:**
-    1. ...
-    2. ...
-    3. ...
-    4. ...
-    5. The prompt is fed into MusicGen (a powerful AI sound generator) to create a sound effect.
-    6. The generated sound is then automatically synced with your original video.
-    7. You can download the final video with the new sound!
-
-    **Tips for best results:**
-    - Use videos with clear actions or distinct environments.
-    - Experiment with the 'Number of Frames to Analyze' and 'Sound Style'.
-    - Edit the generated prompt to be more specific or to change the mood.
+    **How it's faster:**
+    1. **OpenCV & Frame Resizing:** Faster frame grabbing, smaller frames for quicker AI analysis (BLIP).
+    2. **Encoding Presets:** Choose 'ultrafast' or 'superfast' for quicker video output (MoviePy).
+    3. **Optimized Defaults:** Fewer frames analyzed & shorter SFX clip by default.
+    4. **GPU Strongly Recommended:** For AI models (BLIP, MusicGen), GPU hardware on Hugging Face Spaces is key for true speed.
    """)
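The heart of this commit is the switch from imageio's sequential reader to seek-based sampling with OpenCV. Below is a minimal standalone sketch of the same technique, useful for checking extraction behavior outside Streamlit; the sample_frames helper and the sample.mp4 path are illustrative, not part of the commit.

import cv2
from PIL import Image

def sample_frames(path, n, size=384):
    """Evenly sample up to n frames, resized to size x size, mirroring the app's approach."""
    cap = cv2.VideoCapture(path)
    if not cap.isOpened():
        raise IOError(f"Could not open {path}")
    total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    step = max(1, total // n)
    frames = []
    for i in range(0, total, step):
        if len(frames) >= n:
            break
        cap.set(cv2.CAP_PROP_POS_FRAMES, i)  # seek: only the sampled frames get decoded
        ok, bgr = cap.read()
        if not ok:
            break  # end of stream or read error
        rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)  # OpenCV yields BGR; PIL expects RGB
        frames.append(Image.fromarray(rgb).resize((size, size), Image.Resampling.LANCZOS))
    cap.release()
    return frames

if __name__ == "__main__":
    print(len(sample_frames("sample.mp4", 2)))  # expect 2 for any multi-frame video

Seeking with CAP_PROP_POS_FRAMES decodes only the sampled frames, which is why this beats reading every frame on long videos. Note that CAP_PROP_FRAME_COUNT and seeking can be approximate for some codecs; the app tolerates this by breaking out of the loop on failed reads.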