Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,6 +1,5 @@
|
|
| 1 |
import streamlit as st
|
| 2 |
-
|
| 3 |
-
import cv2 # For faster frame extraction
|
| 4 |
import numpy as np
|
| 5 |
from PIL import Image
|
| 6 |
from transformers import AutoProcessor, BlipForConditionalGeneration, MusicgenForConditionalGeneration
|
|
@@ -13,292 +12,216 @@ import tempfile
|
|
| 13 |
try:
|
| 14 |
import moviepy.editor as mpy
|
| 15 |
except ModuleNotFoundError:
|
| 16 |
-
st.error("The 'moviepy' library is not installed. Please ensure 'moviepy==1.0.3'
|
| 17 |
st.stop()
|
| 18 |
|
| 19 |
-
#
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
BLIP_PROCESS_SIZE = 384 # Resize frames to this size for BLIP
|
| 23 |
|
| 24 |
-
#
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
st.write("Loading BLIP model for image captioning...")
|
| 28 |
-
processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
|
| 29 |
-
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
|
| 30 |
-
if torch.cuda.is_available():
|
| 31 |
-
model = model.half().to("cuda")
|
| 32 |
-
st.write("BLIP model loaded.")
|
| 33 |
-
return processor, model
|
| 34 |
|
| 35 |
-
|
| 36 |
-
def
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
|
| 40 |
-
if torch.cuda.is_available():
|
| 41 |
-
model = model.half().to("cuda")
|
| 42 |
-
st.write("MusicGen model loaded.")
|
| 43 |
-
return processor, model
|
| 44 |
-
|
| 45 |
-
# --- Core Functions ---
|
| 46 |
-
def extract_frames_from_video_cv2(video_path, num_frames_to_extract, target_size=BLIP_PROCESS_SIZE):
|
| 47 |
-
"""Extracts a specified number of frames evenly from a video using OpenCV, and resizes them."""
|
| 48 |
-
frames = []
|
| 49 |
-
cap = cv2.VideoCapture(video_path)
|
| 50 |
-
if not cap.isOpened():
|
| 51 |
-
st.error("Error: Could not open video file.")
|
| 52 |
-
return []
|
| 53 |
-
|
| 54 |
-
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
|
| 55 |
-
if total_frames == 0:
|
| 56 |
-
st.warning("Video appears to have 0 frames. Please check the video file.")
|
| 57 |
-
cap.release()
|
| 58 |
-
return []
|
| 59 |
-
|
| 60 |
-
step = max(1, total_frames // num_frames_to_extract)
|
| 61 |
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
pil_image = Image.fromarray(frame_rgb)
|
| 71 |
-
|
| 72 |
-
# Resize for BLIP
|
| 73 |
-
pil_image_resized = pil_image.resize((target_size, target_size), Image.Resampling.LANCZOS)
|
| 74 |
-
frames.append(pil_image_resized)
|
| 75 |
-
extracted_count += 1
|
| 76 |
-
else:
|
| 77 |
-
# Could mean end of video or read error
|
| 78 |
-
break
|
| 79 |
-
|
| 80 |
-
cap.release()
|
| 81 |
-
if not frames and num_frames_to_extract > 0:
|
| 82 |
-
st.warning(f"Could not extract any frames. Tried to extract {num_frames_to_extract} frames with step {step} from {total_frames} total frames.")
|
| 83 |
-
return frames
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
def generate_enhanced_prompt(base_description, context_style="cinematic"):
|
| 87 |
-
base = base_description.lower().strip().replace("a photo of ", "").replace("an image of ", "")
|
| 88 |
-
action_sounds = {
|
| 89 |
-
"walking": "footsteps, rhythmic, on {surface}", "running": "rapid footsteps, heavy breathing, on {surface}",
|
| 90 |
-
"driving": "engine rumble, tire sounds on {surface}", "talking": "clear voices, conversational tone",
|
| 91 |
-
"shouting": "loud voice, urgent tone", "whispering": "soft whisper, subtle breath sounds",
|
| 92 |
-
"crashing": "loud impact, debris scattering, metallic clang", "falling": "thud, impact sound, rustle of clothing",
|
| 93 |
-
"flying": "whooshing sound, wind noise", "swimming": "water splashing, rhythmic strokes",
|
| 94 |
-
"exploding": "deep boom, crackling, debris sounds", "door opening": "creak of a door, latch click",
|
| 95 |
-
"door closing": "solid thud of a door, latch click"
|
| 96 |
-
}
|
| 97 |
-
object_sounds = {
|
| 98 |
-
"person": "human presence, subtle movements, breathing", "dog": "dog barking, panting, collar jingle",
|
| 99 |
-
"cat": "cat meow, purring, soft paw steps", "car": "engine hum, tire whine, occasional horn",
|
| 100 |
-
"truck": "heavy engine drone, air brakes", "motorcycle": "motorcycle engine revving, exhaust pop",
|
| 101 |
-
"train": "train horn, rhythmic clatter on tracks", "plane": "jet engine whine, distant rumble",
|
| 102 |
-
"bird": "birdsong, flapping wings", "water": "flowing water, lapping waves, drips",
|
| 103 |
-
"wind": "wind howling, gentle breeze through leaves", "fire": "crackling fire, burning wood"
|
| 104 |
}
|
| 105 |
-
|
| 106 |
-
"
|
| 107 |
-
"
|
| 108 |
-
"
|
| 109 |
-
"
|
| 110 |
-
"
|
| 111 |
-
"space": "eerie silence, low hum, occasional electronic beep", "underwater": "muffled sounds, bubbling, deep water pressure"
|
| 112 |
}
|
| 113 |
-
|
| 114 |
-
"
|
| 115 |
-
"
|
| 116 |
-
"
|
|
|
|
| 117 |
}
|
| 118 |
-
found_elements, prompt_parts = [], [f"A {context_style} soundscape of:"]
|
| 119 |
-
for action_keyword, sound_desc in action_sounds.items():
|
| 120 |
-
if action_keyword in base:
|
| 121 |
-
surface = "generic surface"; ("grass" in base and (surface := "grass")) or (("wood" in base or "floor" in base) and (surface := "wooden floor")) or (("concrete" in base or "pavement" in base) and (surface := "concrete")) or (("water" in base) and (surface := "water"))
|
| 122 |
-
prompt_parts.append(sound_desc.format(surface=surface)); found_elements.append(action_keyword); break
|
| 123 |
-
for obj_keyword, sound_desc in object_sounds.items():
|
| 124 |
-
if obj_keyword in base and obj_keyword not in found_elements:
|
| 125 |
-
prompt_parts.append(sound_desc); found_elements.append(obj_keyword)
|
| 126 |
-
if len(found_elements) > (1 if any(ak in found_elements for ak in action_sounds) else 2): break
|
| 127 |
-
added_env = False
|
| 128 |
-
for env_keyword, sound_desc in environment_ambience.items():
|
| 129 |
-
if env_keyword in base: prompt_parts.append(f"environment: {sound_desc}"); added_env = True; break
|
| 130 |
-
if not found_elements and not added_env: prompt_parts.append(f"subtle ambient sound related to '{base}'")
|
| 131 |
-
prompt_parts.append(sound_qualities.get(context_style, sound_qualities["cinematic"]))
|
| 132 |
-
return ", ".join(prompt_parts) + "."
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
def generate_audio(text_prompt, negative_prompt, duration_s, guidance_scale, top_k, top_p, device):
|
| 136 |
-
musicgen_processor, musicgen_model = load_musicgen_model()
|
| 137 |
-
inputs = musicgen_processor(text=[text_prompt], negative_prompt=[negative_prompt] if negative_prompt else None, padding=True, return_tensors="pt").to(device)
|
| 138 |
-
max_new_tokens = int(duration_s * musicgen_model.config.audio_encoder.frame_rate)
|
| 139 |
-
if max_new_tokens > 1500: # MusicGen small context limit
|
| 140 |
-
st.warning(f"Requested SFX duration ({duration_s}s) is long. Capping to ~30s for musicgen-small to ensure stability.")
|
| 141 |
-
max_new_tokens = 1500
|
| 142 |
-
|
| 143 |
-
audio_values = musicgen_model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=True, guidance_scale=guidance_scale, top_k=top_k, top_p=top_p)
|
| 144 |
-
audio_array = audio_values[0].cpu().numpy()
|
| 145 |
-
if audio_array.ndim > 1: audio_array = np.mean(audio_array, axis=0)
|
| 146 |
-
audio_array = audio_array / (np.max(np.abs(audio_array)) + 1e-6) * 0.9 # Normalize, avoid div by zero
|
| 147 |
-
audio_array = np.clip(audio_array, -1.0, 1.0)
|
| 148 |
-
return audio_array, musicgen_model.config.audio_encoder.sampling_rate
|
| 149 |
-
|
| 150 |
-
def sync_audio_to_video(video_path, audio_path, output_path, mix_original, original_vol, sfx_vol, encoding_preset):
|
| 151 |
-
video_clip = mpy.VideoFileClip(video_path)
|
| 152 |
-
generated_audio_clip = mpy.AudioFileClip(audio_path)
|
| 153 |
-
video_duration = video_clip.duration
|
| 154 |
-
final_generated_audio = generated_audio_clip.subclip(0, video_duration) if generated_audio_clip.duration >= video_duration else mpy.concatenate_audioclips([generated_audio_clip] * int(np.ceil(video_duration / generated_audio_clip.duration))).subclip(0, video_duration)
|
| 155 |
-
final_generated_audio = final_generated_audio.volumex(sfx_vol)
|
| 156 |
-
if mix_original and video_clip.audio:
|
| 157 |
-
original_audio = video_clip.audio.volumex(original_vol)
|
| 158 |
-
composite_audio = mpy.CompositeAudioClip([original_audio, final_generated_audio])
|
| 159 |
-
final_video = video_clip.set_audio(composite_audio)
|
| 160 |
-
else:
|
| 161 |
-
final_video = video_clip.set_audio(final_generated_audio)
|
| 162 |
-
|
| 163 |
-
# Use more threads for encoding, and the selected preset
|
| 164 |
-
num_threads = os.cpu_count() or 2
|
| 165 |
-
st.write(f"MoviePy encoding with preset: '{encoding_preset}', threads: {num_threads}")
|
| 166 |
-
|
| 167 |
-
final_video.write_videofile(output_path, codec="libx264", audio_codec="aac", preset=encoding_preset,
|
| 168 |
-
bitrate="4000k", # Reduced bitrate slightly for faster presets
|
| 169 |
-
audio_bitrate="192k", threads=num_threads, logger='bar')
|
| 170 |
-
# Close clips to free resources
|
| 171 |
-
video_clip.close()
|
| 172 |
-
generated_audio_clip.close()
|
| 173 |
-
if 'original_audio' in locals(): original_audio.close()
|
| 174 |
-
final_generated_audio.close()
|
| 175 |
-
if 'composite_audio' in locals(): composite_audio.close()
|
| 176 |
-
final_video.close()
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
# --- Streamlit UI ---
|
| 180 |
-
st.set_page_config(layout="wide", page_title="Video To SoundFX Generator")
|
| 181 |
-
st.title("⚡ Speedy Video To SoundFX Generator 🎶")
|
| 182 |
-
st.markdown("Upload an MP4 video. This tool generates sound effects and syncs them. **Optimized for speed!**")
|
| 183 |
-
st.markdown("> For *truly* fast performance, especially the AI parts, ensure your Hugging Face Space is running on **GPU hardware**.")
|
| 184 |
|
| 185 |
-
#
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
video_encoding_preset = st.selectbox(
|
| 195 |
-
"Video Encoding Speed (Faster = Lower Quality/Larger File)",
|
| 196 |
-
('ultrafast', 'superfast', 'veryfast', 'faster', 'fast', 'medium'),
|
| 197 |
-
index=1, # Default to 'superfast'
|
| 198 |
-
help="Controls video encoding speed. 'ultrafast' is quickest but may reduce quality or increase file size. 'medium' is balanced."
|
| 199 |
-
)
|
| 200 |
-
|
| 201 |
-
prompt_style = st.selectbox("Sound Style for Prompt",
|
| 202 |
-
["cinematic", "realistic", "cartoon", "ominous", "peaceful"], index=0)
|
| 203 |
-
|
| 204 |
-
st.subheader("Audio Mixing")
|
| 205 |
-
mix_original_audio = st.checkbox("Mix with Original Video Audio", value=True)
|
| 206 |
-
original_audio_volume = st.slider("Original Audio Volume", 0.0, 1.0, 0.4, disabled=not mix_original_audio)
|
| 207 |
-
sfx_audio_volume = st.slider("Generated SFX Volume", 0.0, 1.0, 0.9)
|
| 208 |
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
top_p = st.slider("Top-P (Nucleus) Sampling", 0.0, 1.0, 0.95)
|
| 215 |
-
negative_prompt_text = st.text_input("Negative Prompt (Optional)", placeholder="e.g., low quality, noisy, music")
|
| 216 |
|
| 217 |
-
#
|
| 218 |
-
|
|
|
|
|
|
|
|
|
|
| 219 |
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
st.write(f"Using device: `{device}`. {'GPU detected: ' + torch.cuda.get_device_name(0) if device.type == 'cuda' else 'Warning: CPU processing will be slow for AI models.'}")
|
| 223 |
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
output_video_path = os.path.join(TEMP_DIR, f"output_{uploaded_file.name}")
|
| 227 |
|
|
|
|
| 228 |
try:
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 235 |
progress_bar.progress(20)
|
| 236 |
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
base_desc = blip_processor.decode(out[0], skip_special_tokens=True)
|
| 246 |
-
raw_blip_captions.append(base_desc)
|
| 247 |
-
enhanced_desc = generate_enhanced_prompt(base_desc, prompt_style)
|
| 248 |
-
descriptions.append(enhanced_desc)
|
| 249 |
-
status_text.info(f"⏳ 2/5: Frame {i+1}/{len(frames)} analyzed.")
|
| 250 |
-
combined_description = " Then, ".join(descriptions); progress_bar.progress(40)
|
| 251 |
-
|
| 252 |
-
st.subheader("📝 Generated Sound Prompt (Editable)")
|
| 253 |
-
editable_prompt = st.text_area("Sound Effect Prompt:", value=combined_description, height=100)
|
| 254 |
|
| 255 |
-
|
| 256 |
-
if not editable_prompt.strip(): st.error("Prompt cannot be empty!"); st.stop()
|
| 257 |
-
|
| 258 |
-
status_text.info(f"⏳ 3/5: Generating sound (MusicGen)...")
|
| 259 |
-
audio_array, sample_rate = generate_audio(editable_prompt, negative_prompt_text, generated_sfx_duration, guidance_scale, top_k, top_p, device)
|
| 260 |
-
sf.write(temp_audio_path, audio_array, sample_rate); progress_bar.progress(60)
|
| 261 |
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 276 |
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 285 |
|
| 286 |
except Exception as e:
|
| 287 |
-
st.error(f"An
|
| 288 |
-
st.
|
| 289 |
-
|
| 290 |
-
st.code(traceback.format_exc())
|
| 291 |
finally:
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
else:
|
| 297 |
-
st.info("👋 Welcome! Upload an MP4 to get started. Adjust settings in the sidebar for speed/quality.")
|
| 298 |
-
st.markdown("""
|
| 299 |
-
**How it's faster:**
|
| 300 |
-
1. **OpenCV & Frame Resizing:** Faster frame grabbing, smaller frames for quicker AI analysis (BLIP).
|
| 301 |
-
2. **Encoding Presets:** Choose 'ultrafast' or 'superfast' for quicker video output (MoviePy).
|
| 302 |
-
3. **Optimized Defaults:** Fewer frames analyzed & shorter SFX clip by default.
|
| 303 |
-
4. **GPU Strongly Recommended:** For AI models (BLIP, MusicGen), GPU hardware on Hugging Face Spaces is key for true speed.
|
| 304 |
-
""")
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
+
import imageio
|
|
|
|
| 3 |
import numpy as np
|
| 4 |
from PIL import Image
|
| 5 |
from transformers import AutoProcessor, BlipForConditionalGeneration, MusicgenForConditionalGeneration
|
|
|
|
| 12 |
try:
|
| 13 |
import moviepy.editor as mpy
|
| 14 |
except ModuleNotFoundError:
|
| 15 |
+
st.error("The 'moviepy' library is not installed. Please ensure 'moviepy==1.0.3' is in requirements.txt and installed.")
|
| 16 |
st.stop()
|
| 17 |
|
| 18 |
+
# Set page title and instructions
|
| 19 |
+
st.title("Story Video Sound Effect Sync Generator")
|
| 20 |
+
st.write("Upload an MP4 video to auto-generate and sync a high-quality sound effect.")
|
|
|
|
| 21 |
|
| 22 |
+
# User-configurable settings
|
| 23 |
+
num_frames_to_extract = st.slider("Number of frames to analyze", 1, 3, 1, help="Fewer frames = faster processing")
|
| 24 |
+
mix_original_audio = st.checkbox("Mix with original audio", value=False, help="Blend sound effect with video’s original sound")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
+
# Enhanced prompt generation function
|
| 27 |
+
def enhance_prompt(base_description):
|
| 28 |
+
"""Generate a detailed, sound-specific prompt from BLIP caption."""
|
| 29 |
+
base = base_description.lower().strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
|
| 31 |
+
# Define action, object, and environment keywords
|
| 32 |
+
actions = {
|
| 33 |
+
"walk": "crisp footsteps on a wooden floor",
|
| 34 |
+
"run": "rapid footsteps and heavy breathing",
|
| 35 |
+
"drive": "engine roar and tires screeching",
|
| 36 |
+
"talk": "soft voices and background murmur",
|
| 37 |
+
"crash": "loud crash and debris scattering",
|
| 38 |
+
"fall": "thud of impact and rustling debris"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
}
|
| 40 |
+
objects = {
|
| 41 |
+
"person": "human activity with subtle breathing",
|
| 42 |
+
"dog": "playful barks and pawsteps",
|
| 43 |
+
"car": "mechanical hum and tire friction",
|
| 44 |
+
"tree": "rustling leaves in a breeze",
|
| 45 |
+
"forest": "gentle wind and distant bird calls"
|
|
|
|
| 46 |
}
|
| 47 |
+
environments = {
|
| 48 |
+
"room": "echoing footsteps and muffled sounds",
|
| 49 |
+
"street": "distant traffic and urban hum",
|
| 50 |
+
"forest": "wind through trees and twigs snapping",
|
| 51 |
+
"outside": "open air with faint wind"
|
| 52 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
|
| 54 |
+
# Extract key elements from the caption
|
| 55 |
+
sound_description = ""
|
| 56 |
+
for action, sound in actions.items():
|
| 57 |
+
if action in base:
|
| 58 |
+
sound_description = sound
|
| 59 |
+
break
|
| 60 |
+
if not sound_description: # Default to subtle ambient if no action
|
| 61 |
+
sound_description = "subtle ambient hum"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
|
| 63 |
+
# Add object-specific sounds
|
| 64 |
+
for obj, sound in objects.items():
|
| 65 |
+
if obj in base:
|
| 66 |
+
sound_description += f" and {sound}"
|
| 67 |
+
break
|
|
|
|
|
|
|
| 68 |
|
| 69 |
+
# Add environment if detected
|
| 70 |
+
for env, sound in environments.items():
|
| 71 |
+
if env in base:
|
| 72 |
+
sound_description += f" in a {env} with {sound}"
|
| 73 |
+
break
|
| 74 |
|
| 75 |
+
# Construct final prompt
|
| 76 |
+
return f"{base} with {sound_description}"
|
|
|
|
| 77 |
|
| 78 |
+
# File uploader for video
|
| 79 |
+
uploaded_file = st.file_uploader("Upload an MP4 video (high resolution)", type=["mp4"])
|
|
|
|
| 80 |
|
| 81 |
+
if uploaded_file is not None:
|
| 82 |
try:
|
| 83 |
+
# Temporary video file
|
| 84 |
+
with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as temp_video:
|
| 85 |
+
temp_video.write(uploaded_file.getbuffer())
|
| 86 |
+
temp_video_path = temp_video.name
|
| 87 |
+
|
| 88 |
+
# Progress bar setup
|
| 89 |
+
progress_bar = st.progress(0)
|
| 90 |
+
status_text = st.empty()
|
| 91 |
+
|
| 92 |
+
# Extract frames
|
| 93 |
+
status_text.text("Extracting frames...")
|
| 94 |
+
video = imageio.get_reader(temp_video_path, "ffmpeg")
|
| 95 |
+
total_frames = len(list(video.iter_data()))
|
| 96 |
+
step = max(1, total_frames // num_frames_to_extract)
|
| 97 |
+
frames = [
|
| 98 |
+
Image.fromarray(video.get_data(i))
|
| 99 |
+
for i in range(0, min(total_frames, num_frames_to_extract * step), step)
|
| 100 |
+
][:num_frames_to_extract]
|
| 101 |
progress_bar.progress(20)
|
| 102 |
|
| 103 |
+
# Load BLIP model
|
| 104 |
+
@st.cache_resource
def load_blip_model():
    """Load and cache the BLIP captioning processor/model pair.

    Cached by Streamlit so the checkpoint is downloaded only once per
    process. When CUDA is available the model is moved to the GPU in
    half precision; otherwise it stays on CPU in full precision.
    """
    checkpoint = "Salesforce/blip-image-captioning-base"
    blip_processor = AutoProcessor.from_pretrained(checkpoint)
    blip_model = BlipForConditionalGeneration.from_pretrained(checkpoint)
    if torch.cuda.is_available():
        blip_model = blip_model.half().to("cuda")
    return blip_processor, blip_model
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
|
| 112 |
+
processor, model = load_blip_model()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
|
| 114 |
+
# Generate and enhance text descriptions
|
| 115 |
+
status_text.text("Analyzing frames...")
|
| 116 |
+
descriptions = []
|
| 117 |
+
for i, frame in enumerate(frames):
|
| 118 |
+
inputs = processor(images=frame, return_tensors="pt")
|
| 119 |
+
if torch.cuda.is_available():
|
| 120 |
+
inputs = {k: v.to("cuda") for k, v in inputs.items()}
|
| 121 |
+
out = model.generate(**inputs)
|
| 122 |
+
base_description = processor.decode(out[0], skip_special_tokens=True)
|
| 123 |
+
enhanced_description = enhance_prompt(base_description)
|
| 124 |
+
descriptions.append(enhanced_description)
|
| 125 |
+
progress_bar.progress(20 + int(30 * (i + 1) / len(frames)))
|
| 126 |
+
|
| 127 |
+
text_prompt = ". ".join(descriptions)
|
| 128 |
+
st.write("Enhanced text prompt:", text_prompt)
|
| 129 |
+
|
| 130 |
+
# Load MusicGen model
|
| 131 |
+
@st.cache_resource
def load_musicgen_model():
    """Load and cache the MusicGen (small) processor/model pair.

    Cached by Streamlit so the checkpoint is downloaded only once per
    process. When CUDA is available the model is moved to the GPU in
    half precision; otherwise it stays on CPU in full precision.
    """
    checkpoint = "facebook/musicgen-small"
    mg_processor = AutoProcessor.from_pretrained(checkpoint)
    mg_model = MusicgenForConditionalGeneration.from_pretrained(checkpoint)
    if torch.cuda.is_available():
        mg_model = mg_model.half().to("cuda")
    return mg_processor, mg_model
|
| 138 |
+
|
| 139 |
+
musicgen_processor, musicgen_model = load_musicgen_model()
|
| 140 |
+
|
| 141 |
+
# Generate sound effect (~8 seconds)
|
| 142 |
+
status_text.text("Generating sound effect...")
|
| 143 |
+
inputs = musicgen_processor(
|
| 144 |
+
text=[text_prompt],
|
| 145 |
+
padding=True,
|
| 146 |
+
return_tensors="pt",
|
| 147 |
+
)
|
| 148 |
+
if torch.cuda.is_available():
|
| 149 |
+
inputs = {k: v.to("cuda") for k, v in inputs.items()}
|
| 150 |
+
audio_values = musicgen_model.generate(
|
| 151 |
+
**inputs,
|
| 152 |
+
max_new_tokens=256,
|
| 153 |
+
do_sample=True,
|
| 154 |
+
guidance_scale=3.0,
|
| 155 |
+
top_k=50,
|
| 156 |
+
top_p=0.95
|
| 157 |
+
)
|
| 158 |
+
audio_array = audio_values[0].cpu().numpy()
|
| 159 |
+
if audio_array.ndim > 1:
|
| 160 |
+
audio_array = audio_array.flatten()
|
| 161 |
+
audio_array = audio_array / np.max(np.abs(audio_array)) * 0.9
|
| 162 |
+
audio_array = np.clip(audio_array, -1.0, 1.0)
|
| 163 |
+
sample_rate = 32000
|
| 164 |
+
progress_bar.progress(60)
|
| 165 |
+
|
| 166 |
+
# Save temporary audio
|
| 167 |
+
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
|
| 168 |
+
sf.write(temp_audio.name, audio_array, sample_rate)
|
| 169 |
+
temp_audio_path = temp_audio.name
|
| 170 |
+
|
| 171 |
+
# Synchronize with video using mpy
|
| 172 |
+
status_text.text("Syncing audio with video...")
|
| 173 |
+
video_clip = mpy.VideoFileClip(temp_video_path)
|
| 174 |
+
video_duration = video_clip.duration
|
| 175 |
+
audio_clip = mpy.AudioFileClip(temp_audio_path)
|
| 176 |
+
|
| 177 |
+
# Adjust audio length
|
| 178 |
+
if audio_clip.duration < video_duration:
|
| 179 |
+
loops_needed = int(np.ceil(video_duration / audio_clip.duration))
|
| 180 |
+
audio_clip = mpy.concatenate_audioclips([audio_clip] * loops_needed).subclip(0, video_duration)
|
| 181 |
+
else:
|
| 182 |
+
audio_clip = audio_clip.subclip(0, video_duration)
|
| 183 |
|
| 184 |
+
# Mix or replace audio
|
| 185 |
+
if mix_original_audio and video_clip.audio:
|
| 186 |
+
final_audio = video_clip.audio.volumex(0.5) + audio_clip.volumex(0.5)
|
| 187 |
+
else:
|
| 188 |
+
final_audio = audio_clip
|
| 189 |
+
|
| 190 |
+
# Set audio to video
|
| 191 |
+
final_video = video_clip.set_audio(final_audio)
|
| 192 |
+
|
| 193 |
+
# Save final video with high quality
|
| 194 |
+
output_path = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
|
| 195 |
+
final_video.write_videofile(
|
| 196 |
+
output_path,
|
| 197 |
+
codec="libx264",
|
| 198 |
+
audio_codec="aac",
|
| 199 |
+
preset="medium", # Better quality than ultrafast
|
| 200 |
+
bitrate="8000k", # Higher bitrate for video quality
|
| 201 |
+
audio_bitrate="192k", # Good audio quality
|
| 202 |
+
temp_audiofile="temp-audio.m4a",
|
| 203 |
+
remove_temp=True
|
| 204 |
+
)
|
| 205 |
+
progress_bar.progress(90)
|
| 206 |
+
|
| 207 |
+
# Provide playback and download
|
| 208 |
+
status_text.text("Done!")
|
| 209 |
+
st.video(output_path)
|
| 210 |
+
with open(output_path, "rb") as video_file:
|
| 211 |
+
st.download_button(
|
| 212 |
+
label="Download Synced Video",
|
| 213 |
+
data=video_file,
|
| 214 |
+
file_name="synced_story_video.mp4",
|
| 215 |
+
mime="video/mp4"
|
| 216 |
+
)
|
| 217 |
+
progress_bar.progress(100)
|
| 218 |
|
| 219 |
except Exception as e:
|
| 220 |
+
st.error(f"An error occurred: {str(e)}")
|
| 221 |
+
st.write("Try reducing frames or uploading a smaller video.")
|
| 222 |
+
|
|
|
|
| 223 |
finally:
|
| 224 |
+
# Clean up
|
| 225 |
+
for path in [temp_video_path, temp_audio_path, output_path]:
|
| 226 |
+
if 'path' in locals() and os.path.exists(path):
|
| 227 |
+
os.remove(path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|