garyuzair committed on
Commit
64b2c99
·
verified ·
1 Parent(s): da54eb1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +117 -91
app.py CHANGED
@@ -8,8 +8,9 @@ import soundfile as sf
8
  import os
9
  import tempfile
10
  import subprocess
11
- from pydub import AudioSegment, effects
12
  import moviepy.editor as mpy
 
13
 
14
  # Optional scene detection
15
  scene_detect_available = True
@@ -22,7 +23,34 @@ except ImportError:
22
  # Set page configuration
23
  st.set_page_config(page_title="Video Sound Effect Generator", layout="centered")
24
 
25
- # Load BLIP model for captioning
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  @st.cache_resource
27
  def load_blip_model():
28
  processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
@@ -33,68 +61,58 @@ def load_blip_model():
33
 
34
  # Load MusicGen model
35
  @st.cache_resource
36
- def load_musicgen_model(model_name="facebook/musicgen-medium"):
37
  processor = AutoProcessor.from_pretrained(model_name)
38
  model = MusicgenForConditionalGeneration.from_pretrained(model_name)
39
  if torch.cuda.is_available():
40
- model = model.half().to("cuda")
41
  return processor, model
42
 
43
- # Extract frames efficiently
44
- def extract_frames(video_path, num_frames, method="uniform", segment_start=0, segment_end=None):
45
  video = imageio.get_reader(video_path, "ffmpeg")
46
  meta = video.get_meta_data()
47
  fps = meta['fps']
48
  total_frames = int(meta['duration'] * fps)
49
 
50
- if segment_end is None:
51
- segment_end = total_frames / fps
52
-
53
- start_frame = int(segment_start * fps)
54
- end_frame = int(segment_end * fps)
55
- total_segment_frames = end_frame - start_frame
56
-
57
  if method == "scene" and scene_detect_available:
58
  try:
59
  video_manager = VideoManager([video_path])
60
  scene_manager = SceneManager()
61
- scene_manager.add_detector(ContentDetector(threshold=30))
62
- video_manager.set_downscale_factor(2) # Optimize for speed
63
  video_manager.start()
64
  scene_manager.detect_scenes(frame_source=video_manager)
65
  scene_list = scene_manager.get_scene_list()
66
- segment_scenes = [scene for scene in scene_list if scene[0].get_seconds() >= segment_start and scene[0].get_seconds() < segment_end]
67
- frames = []
68
- for scene in segment_scenes[:num_frames]:
69
- frame = video_manager.get_frame(scene[0].get_frames())
70
- if frame is not None:
71
- frames.append(Image.fromarray(frame))
72
  video_manager.release()
73
- if len(frames) < num_frames and total_segment_frames > 0:
74
- remaining = num_frames - len(frames)
75
- step = total_segment_frames // (remaining + 1)
76
- for i in range(1, remaining + 1):
77
- frame_idx = start_frame + i * step
78
- if frame_idx < end_frame:
79
- frames.append(Image.fromarray(video.get_data(frame_idx)))
80
- return frames[:num_frames]
81
- except Exception as e:
82
- st.warning(f"Scene detection failed: {e}. Using uniform extraction.")
83
-
84
- # Uniform extraction
85
- step = max(1, total_segment_frames // num_frames)
86
- frame_indices = [start_frame + i * step for i in range(num_frames) if start_frame + i * step < end_frame]
87
- frames = [Image.fromarray(video.get_data(idx)) for idx in frame_indices]
88
- return frames[:num_frames]
89
 
90
  # Generate captions
91
- def generate_captions(frames, processor, model):
 
 
92
  descriptions = []
93
  for frame in frames:
94
  inputs = processor(images=frame, return_tensors="pt")
95
  if torch.cuda.is_available():
96
  inputs = {k: v.to("cuda") for k, v in inputs.items()}
97
- out = model.generate(**inputs, max_length=30)
98
  description = processor.decode(out[0], skip_special_tokens=True)
99
  descriptions.append(description)
100
  return descriptions
@@ -102,19 +120,19 @@ def generate_captions(frames, processor, model):
102
  # Enhance prompts
103
  def enhance_prompt(descriptions, mood="default"):
104
  if not descriptions:
105
- return f"{mood} ambient sound with subtle effects"
106
  combined = ". ".join(descriptions).lower()
107
  base_prompts = {
108
- "walk|run": "crisp footsteps on varied surfaces, immersive movement sounds",
109
- "car|drive": "roaring engine, tire screeches, dynamic road noise",
110
- "talk|person": "lively voices, crowd murmur, spatial chatter",
111
- "wind|tree|forest": "rustling leaves, gentle wind gusts, natural ambiance",
112
- "crash|fall": "intense crash impact, debris scattering, sharp effects"
113
  }
114
  for pattern, effect in base_prompts.items():
115
  if any(word in combined for word in pattern.split("|")):
116
- return f"{mood} {combined}, {effect}"
117
- return f"{mood} {combined}, rich ambient soundscape with engaging effects"
118
 
119
  # Generate audio
120
  def generate_audio(prompt, processor, model, duration, sample_rate=44100):
@@ -123,14 +141,15 @@ def generate_audio(prompt, processor, model, duration, sample_rate=44100):
123
  inputs = {k: v.to("cuda") for k, v in inputs.items()}
124
  audio_values = model.generate(
125
  **inputs,
126
- max_new_tokens=int(512 * (duration / 8)), # Optimized for speed
127
  do_sample=True,
128
- guidance_scale=7.0,
129
- top_k=120,
130
- top_p=0.9
 
131
  )
132
  audio_array = audio_values[0].cpu().numpy()
133
- audio_array = audio_array / np.max(np.abs(audio_array)) * 0.95
134
  audio_array = np.clip(audio_array, -1.0, 1.0)
135
  return audio_array
136
 
@@ -138,18 +157,18 @@ def generate_audio(prompt, processor, model, duration, sample_rate=44100):
138
  def apply_audio_effects(audio_path, settings):
139
  sound = AudioSegment.from_wav(audio_path)
140
  if settings['reverb_ms'] > 0:
141
- sound = sound + AudioSegment.silent(duration=settings['reverb_ms']) - 10
142
  if settings['echo_ms'] > 0:
143
- echo = sound - 15
144
  sound = sound.overlay(echo, position=settings['echo_ms'])
145
  if settings['highpass'] > 0:
146
  sound = sound.high_pass_filter(settings['highpass'])
147
  if settings['lowpass'] < 20000:
148
  sound = sound.low_pass_filter(settings['lowpass'])
149
  if settings['compress']:
150
- sound = effects.compress_dynamic_range(sound)
151
  sound = sound.pan(settings['stereo_pan'])
152
- sound = effects.normalize(sound)
153
  processed_path = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
154
  sound.export(processed_path, format="wav")
155
  return processed_path
@@ -169,6 +188,7 @@ def sync_audio_video(video_path, audio_path, output_path, mix_original=False, or
169
  audio_path = mixed_path
170
  else:
171
  st.warning("No original audio found. Using generated audio only.")
 
172
 
173
  cmd = [
174
  'ffmpeg',
@@ -187,76 +207,80 @@ def sync_audio_video(video_path, audio_path, output_path, mix_original=False, or
187
  # Main application
188
  def main():
189
  st.title("🎬 Video Sound Effect Generator")
190
- st.markdown("Upload a video to create immersive, video-specific sound effects with AI.")
191
 
192
  # User Guide
193
- with st.expander("📖 User Guide"):
194
  st.markdown("""
195
- **How to Use:**
196
- 1. **Upload a Video**: Choose an MP4, MOV, or AVI file.
197
- 2. **Select Prompt Mode**:
198
- - **Automatic**: Analyzes video frames to generate sound prompts.
199
- - **Manual**: Enter your own sound description.
200
- 3. **Configure Settings**: Adjust frame analysis, audio effects, and model size in the sidebar.
201
- 4. **Generate**: Click "Generate Sound Effects" to process the video.
202
- 5. **Download**: Save the enhanced video with sound effects.
203
 
204
  **Tips**:
205
- - Use at least 5 frames for better sound relevance.
206
- - Scene-based frame extraction (if available) improves accuracy.
207
- - Adjust audio effects for a customized sound experience.
208
  """)
209
 
210
  # Sidebar Settings
211
  with st.sidebar:
212
  st.header("⚙️ Settings")
213
- prompt_mode = st.selectbox("Prompt Mode", ["Automatic", "Manual"])
214
- model_size = st.selectbox("Model Size", ["small", "medium", "large"], index=1)
215
- mix_original = st.checkbox("Mix with Original Audio", value=False)
216
  original_volume, generated_volume = 0.5, 0.5
217
  if mix_original:
218
- original_volume = st.slider("Original Audio Volume", 0.0, 1.0, 0.5)
219
- generated_volume = st.slider("Generated Audio Volume", 0.0, 1.0, 0.5)
220
 
221
  st.subheader("Frame Analysis")
222
- num_frames = st.slider("Frames to Analyze", 5, 10, 5, help="More frames improve sound relevance but increase processing time")
223
- frame_method = st.selectbox("Frame Extraction Method", ["Uniform", "Scene"] if scene_detect_available else ["Uniform"])
224
 
225
  st.subheader("Audio Effects")
226
  effects_settings = {
227
- 'reverb_ms': st.slider("Reverb (ms)", 0, 500, 100),
228
- 'echo_ms': st.slider("Echo (ms)", 0, 1000, 200),
229
- 'highpass': st.slider("High-pass Filter (Hz)", 0, 3000, 100),
230
- 'lowpass': st.slider("Low-pass Filter (Hz)", 5000, 20000, 15000),
231
- 'compress': st.checkbox("Dynamic Compression", value=True),
232
- 'stereo_pan': st.slider("Stereo Pan (-1 left, 1 right)", -1.0, 1.0, 0.0)
233
  }
234
 
235
  # Main Content
236
- uploaded_file = st.file_uploader("Upload Video", type=["mp4", "mov", "avi"])
237
  if uploaded_file:
238
  with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tmp:
239
  tmp.write(uploaded_file.read())
240
  video_path = tmp.name
 
241
  st.video(video_path)
 
242
 
243
  video_clip = mpy.VideoFileClip(video_path)
244
  duration = video_clip.duration
245
  video_clip.close()
 
 
246
 
247
  if prompt_mode == "Automatic":
248
  with st.spinner("Analyzing frames..."):
249
  blip_processor, blip_model = load_blip_model()
250
  frames = extract_frames(video_path, num_frames, frame_method)
251
  if not frames:
252
- st.error("No frames extracted. Try a different video or settings.")
253
  return
254
- descriptions = generate_captions(frames, blip_processor, blip_model)
255
- mood = st.selectbox("Sound Mood", ["default", "dramatic", "ambient", "action"])
 
 
256
  text_prompt = enhance_prompt(descriptions, mood)
257
- text_prompt = st.text_area("Edit Prompt", text_prompt, height=100)
258
  else:
259
- text_prompt = st.text_area("Enter Sound Description", "Describe the desired sound effects", height=100)
260
 
261
  if st.button("Generate Sound Effects", key="generate"):
262
  progress = st.progress(0)
@@ -271,20 +295,22 @@ def main():
271
  sf.write(temp_audio, audio_array, 44100)
272
  progress.progress(50)
273
 
274
- status.text("Applying audio effects...")
275
  processed_audio = apply_audio_effects(temp_audio, effects_settings)
276
  progress.progress(75)
277
 
278
- status.text("Syncing with video...")
279
  output_video = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
280
  sync_audio_video(video_path, processed_audio, output_video, mix_original, original_volume, generated_volume)
281
  progress.progress(100)
282
  status.text("Done!")
283
 
284
  st.success("Sound effects applied!")
 
285
  st.video(output_video)
 
286
  with open(output_video, "rb") as f:
287
- st.download_button("Download Enhanced Video", f, "enhanced_video.mp4", "video/mp4")
288
 
289
  # Cleanup
290
  for file in [video_path, temp_audio, processed_audio, output_video]:
 
8
  import os
9
  import tempfile
10
  import subprocess
11
+ from pydub import AudioSegment
12
  import moviepy.editor as mpy
13
+ from functools import lru_cache
14
 
15
  # Optional scene detection
16
  scene_detect_available = True
 
23
  # Set page configuration
24
  st.set_page_config(page_title="Video Sound Effect Generator", layout="centered")
25
 
26
+ # CSS for compact video preview
27
+ st.markdown("""
28
+ <style>
29
+ .video-container {
30
+ max-width: 640px;
31
+ margin: auto;
32
+ overflow: hidden;
33
+ border-radius: 8px;
34
+ box-shadow: 0 4px 8px rgba(0,0,0,0.1);
35
+ }
36
+ video {
37
+ width: 100%;
38
+ height: auto;
39
+ display: block;
40
+ }
41
+ .stButton>button {
42
+ background-color: #007bff;
43
+ color: white;
44
+ border-radius: 5px;
45
+ padding: 10px 20px;
46
+ }
47
+ .stButton>button:hover {
48
+ background-color: #0056b3;
49
+ }
50
+ </style>
51
+ """, unsafe_allow_html=True)
52
+
53
+ # Load BLIP model
54
  @st.cache_resource
55
  def load_blip_model():
56
  processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
 
61
 
62
  # Load MusicGen model
63
@st.cache_resource
def load_musicgen_model(model_name="facebook/musicgen-small"):
    """Load and cache the MusicGen processor/model pair.

    Args:
        model_name: Hugging Face model identifier (defaults to the small,
            faster variant).

    Returns:
        Tuple of (processor, model); the model is moved to CUDA when a GPU
        is available.
    """
    audio_processor = AutoProcessor.from_pretrained(model_name)
    audio_model = MusicgenForConditionalGeneration.from_pretrained(model_name)
    use_gpu = torch.cuda.is_available()
    if use_gpu:
        audio_model = audio_model.to("cuda")
    return audio_processor, audio_model
70
 
71
# Optimized frame extraction
def extract_frames(video_path, num_frames, method="uniform"):
    """Extract up to ``num_frames`` downscaled PIL images from a video.

    Args:
        video_path: Path to the video file.
        num_frames: Maximum number of frames to return.
        method: "scene" to pick frames at detected scene starts (when
            PySceneDetect is available), otherwise uniform sampling.

    Returns:
        List of PIL Images resized to 320x180 (possibly fewer than
        ``num_frames`` if reads fail or the video is very short).
    """
    video = imageio.get_reader(video_path, "ffmpeg")
    meta = video.get_meta_data()
    fps = meta['fps']
    total_frames = int(meta['duration'] * fps)

    def _uniform_indices(count):
        # max(1, ...) guards against a zero step for very short videos,
        # which would make range() raise ValueError.
        if total_frames <= 0 or count <= 0:
            return []
        step = max(1, total_frames // count)
        return list(range(0, total_frames, step))[:count]

    frame_indices = []
    if method == "scene" and scene_detect_available:
        video_manager = None
        try:
            video_manager = VideoManager([video_path])
            scene_manager = SceneManager()
            scene_manager.add_detector(ContentDetector(threshold=25))
            video_manager.set_downscale_factor(4)  # Aggressive downscaling for speed
            video_manager.start()
            scene_manager.detect_scenes(frame_source=video_manager)
            scene_list = scene_manager.get_scene_list()
            frame_indices = [scene[0].get_frames() for scene in scene_list[:num_frames]]
            if len(frame_indices) < num_frames:
                # Top up with uniformly spaced indices; step is clamped to >= 1.
                remaining = num_frames - len(frame_indices)
                step = max(1, total_frames // (remaining + 1))
                frame_indices.extend(range(step, total_frames, step)[:remaining])
        except Exception:
            frame_indices = _uniform_indices(num_frames)
        finally:
            # Release even when scene detection raises, so ffmpeg handles
            # are not leaked.
            if video_manager is not None:
                video_manager.release()
    else:
        frame_indices = _uniform_indices(num_frames)

    frames = []
    for idx in frame_indices[:num_frames]:
        try:
            # Downscale frames to speed up BLIP captioning.
            frames.append(Image.fromarray(video.get_data(idx)).resize((320, 180)))
        except (IndexError, OSError, RuntimeError):
            # Skip unreadable/out-of-range frames instead of aborting
            # (narrowed from a bare except, which also swallowed
            # KeyboardInterrupt).
            continue
    video.close()
    return frames
105
 
106
  # Generate captions
107
@lru_cache(maxsize=100)
def generate_captions(frames_tuple, processor, model):
    """Caption each frame with the BLIP model.

    ``frames_tuple`` is a hashable tuple of (mode, size, raw_bytes) triples
    (rather than PIL Images) so results can be memoized via ``lru_cache``.

    Returns:
        List of caption strings, one per frame.
    """
    descriptions = []
    for mode, size, raw in frames_tuple:
        image = Image.frombytes(mode, size, raw)
        inputs = processor(images=image, return_tensors="pt")
        if torch.cuda.is_available():
            inputs = {k: v.to("cuda") for k, v in inputs.items()}
        # Short max_length keeps caption generation quick.
        out = model.generate(**inputs, max_length=20, num_beams=3)
        caption = processor.decode(out[0], skip_special_tokens=True)
        descriptions.append(caption)
    return descriptions
 
120
  # Enhance prompts
121
def enhance_prompt(descriptions, mood="default"):
    """Turn frame captions into a MusicGen text prompt.

    Falls back to a generic cinematic prompt when no captions are given;
    otherwise appends a keyword-matched effect phrase to the combined,
    lowercased caption text.
    """
    if not descriptions:
        return f"{mood} cinematic ambient sound with dynamic effects"
    combined = ". ".join(descriptions).lower()
    keyword_effects = (
        ("walk|run", "crisp footsteps on diverse surfaces, vivid movement sounds"),
        ("car|drive", "powerful engine roar, tire screeches, immersive road noise"),
        ("talk|person", "rich voices, layered crowd chatter, spatial depth"),
        ("wind|tree|forest", "whistling wind, rustling foliage, natural resonance"),
        ("crash|fall", "sharp crash impact, debris scatter, intense bursts"),
    )
    for keywords, effect in keyword_effects:
        # Substring match against any of the "|"-separated trigger words.
        if any(term in combined for term in keywords.split("|")):
            return f"{mood} {combined}, {effect}, high-fidelity cinematic quality"
    return f"{mood} {combined}, vibrant ambient soundscape with compelling effects, high-fidelity cinematic quality"
136
 
137
  # Generate audio
138
  def generate_audio(prompt, processor, model, duration, sample_rate=44100):
 
141
  inputs = {k: v.to("cuda") for k, v in inputs.items()}
142
  audio_values = model.generate(
143
  **inputs,
144
+ max_new_tokens=int(256 * (duration / 6)), # Optimized token scaling
145
  do_sample=True,
146
+ guidance_scale=8.0,
147
+ top_k=150,
148
+ top_p=0.85,
149
+ num_beams=2 # Beam search for quality
150
  )
151
  audio_array = audio_values[0].cpu().numpy()
152
+ audio_array = audio_array / np.max(np.abs(audio_array)) * 0.98
153
  audio_array = np.clip(audio_array, -1.0, 1.0)
154
  return audio_array
155
 
 
157
def apply_audio_effects(audio_path, settings):
    """Post-process a generated WAV file with the user's effect settings.

    NOTE(review): the "reverb" step appends trailing silence and cuts gain
    rather than applying a true reverb, and "compression" is simulated with
    a flat gain reduction — confirm this is intentional.

    Args:
        audio_path: Path to the input WAV file.
        settings: Dict with keys 'reverb_ms', 'echo_ms', 'highpass',
            'lowpass', 'compress', 'stereo_pan'.

    Returns:
        Path to the processed WAV written to a temporary file.
    """
    segment = AudioSegment.from_wav(audio_path)
    if settings['reverb_ms'] > 0:
        # append(..., crossfade=0) is pydub's "+" concatenation;
        # apply_gain(-8) is its "- 8" (a dB cut over the whole segment).
        tail = AudioSegment.silent(duration=settings['reverb_ms'])
        segment = segment.append(tail, crossfade=0).apply_gain(-8)
    if settings['echo_ms'] > 0:
        delayed_copy = segment.apply_gain(-12)
        segment = segment.overlay(delayed_copy, position=settings['echo_ms'])
    if settings['highpass'] > 0:
        segment = segment.high_pass_filter(settings['highpass'])
    if settings['lowpass'] < 20000:
        segment = segment.low_pass_filter(settings['lowpass'])
    if settings['compress']:
        segment = segment.apply_gain(-6)  # Simulated compression: flat -6 dB
    segment = segment.pan(settings['stereo_pan'])
    segment = segment.apply_gain(2)  # Slight volume boost
    processed_path = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
    segment.export(processed_path, format="wav")
    return processed_path
 
188
  audio_path = mixed_path
189
  else:
190
  st.warning("No original audio found. Using generated audio only.")
191
+ video_clip.close()
192
 
193
  cmd = [
194
  'ffmpeg',
 
207
  # Main application
208
  def main():
209
  st.title("🎬 Video Sound Effect Generator")
210
+ st.markdown("Create high-quality, cinematic sound effects for your videos with AI.")
211
 
212
  # User Guide
213
+ with st.expander("📖 How to Use"):
214
  st.markdown("""
215
+ 1. **Upload Video**: Select an MP4, MOV, or AVI file (keep under 1 minute for best performance).
216
+ 2. **Choose Mode**:
217
+ - **Automatic**: AI analyzes video frames to create sound prompts.
218
+ - **Manual**: Write your own sound description.
219
+ 3. **Adjust Settings**: Use the sidebar to tweak frame analysis, audio effects, and model size.
220
+ 4. **Generate**: Click "Generate" to process and download the enhanced video.
 
 
221
 
222
  **Tips**:
223
+ - 5+ frames ensure accurate sound effects.
224
+ - Scene extraction (if available) enhances relevance.
225
+ - Experiment with audio effects for a polished result.
226
  """)
227
 
228
  # Sidebar Settings
229
  with st.sidebar:
230
  st.header("⚙️ Settings")
231
+ prompt_mode = st.selectbox("Mode", ["Automatic", "Manual"], help="Automatic uses AI to analyze video; Manual lets you describe the sound.")
232
+ model_size = st.selectbox("Model Size", ["small", "medium"], index=0, help="Small is faster; Medium is higher quality.")
233
+ mix_original = st.checkbox("Mix Original Audio", help="Blend with video's audio if available.")
234
  original_volume, generated_volume = 0.5, 0.5
235
  if mix_original:
236
+ original_volume = st.slider("Original Volume", 0.0, 1.0, 0.5)
237
+ generated_volume = st.slider("Generated Volume", 0.0, 1.0, 0.5)
238
 
239
  st.subheader("Frame Analysis")
240
+ num_frames = st.slider("Frames to Analyze", 5, 8, 5, help="More frames improve sound accuracy but slow processing.")
241
+ frame_method = st.selectbox("Extraction Method", ["Uniform", "Scene"] if scene_detect_available else ["Uniform"], help="Scene is more accurate but slower.")
242
 
243
  st.subheader("Audio Effects")
244
  effects_settings = {
245
+ 'reverb_ms': st.slider("Reverb (ms)", 0, 300, 50, help="Adds depth to sound."),
246
+ 'echo_ms': st.slider("Echo (ms)", 0, 500, 100, help="Creates repeating sound effects."),
247
+ 'highpass': st.slider("High-pass Filter (Hz)", 0, 2000, 50, help="Removes low frequencies."),
248
+ 'lowpass': st.slider("Low-pass Filter (Hz)", 5000, 20000, 18000, help="Removes high frequencies."),
249
+ 'compress': st.checkbox("Compression", value=True, help="Balances audio dynamics."),
250
+ 'stereo_pan': st.slider("Stereo Pan", -1.0, 1.0, 0.0, help="-1 is left, 1 is right.")
251
  }
252
 
253
  # Main Content
254
+ uploaded_file = st.file_uploader("Upload Video", type=["mp4", "mov", "avi"], help="Max 1 minute recommended.")
255
  if uploaded_file:
256
  with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tmp:
257
  tmp.write(uploaded_file.read())
258
  video_path = tmp.name
259
+ st.markdown('<div class="video-container">', unsafe_allow_html=True)
260
  st.video(video_path)
261
+ st.markdown('</div>', unsafe_allow_html=True)
262
 
263
  video_clip = mpy.VideoFileClip(video_path)
264
  duration = video_clip.duration
265
  video_clip.close()
266
+ if duration > 60:
267
+ st.warning("Videos over 1 minute may slow processing. Consider trimming.")
268
 
269
  if prompt_mode == "Automatic":
270
  with st.spinner("Analyzing frames..."):
271
  blip_processor, blip_model = load_blip_model()
272
  frames = extract_frames(video_path, num_frames, frame_method)
273
  if not frames:
274
+ st.error("Failed to extract frames. Try another video or method.")
275
  return
276
+ # Convert frames to tuple for caching
277
+ frames_tuple = tuple((frame.mode, frame.size, frame.rgb) for frame in frames)
278
+ descriptions = generate_captions(frames_tuple, blip_processor, blip_model)
279
+ mood = st.selectbox("Sound Mood", ["default", "dramatic", "ambient", "action"], help="Sets the tone of sound effects.")
280
  text_prompt = enhance_prompt(descriptions, mood)
281
+ text_prompt = st.text_area("Edit Prompt", text_prompt, height=80)
282
  else:
283
+ text_prompt = st.text_area("Sound Description", "E.g., 'intense action with explosions'", height=80)
284
 
285
  if st.button("Generate Sound Effects", key="generate"):
286
  progress = st.progress(0)
 
295
  sf.write(temp_audio, audio_array, 44100)
296
  progress.progress(50)
297
 
298
+ status.text("Applying effects...")
299
  processed_audio = apply_audio_effects(temp_audio, effects_settings)
300
  progress.progress(75)
301
 
302
+ status.text("Syncing video...")
303
  output_video = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
304
  sync_audio_video(video_path, processed_audio, output_video, mix_original, original_volume, generated_volume)
305
  progress.progress(100)
306
  status.text("Done!")
307
 
308
  st.success("Sound effects applied!")
309
+ st.markdown('<div class="video-container">', unsafe_allow_html=True)
310
  st.video(output_video)
311
+ st.markdown('</div>', unsafe_allow_html=True)
312
  with open(output_video, "rb") as f:
313
+ st.download_button("Download Video", f, "enhanced_video.mp4", "video/mp4")
314
 
315
  # Cleanup
316
  for file in [video_path, temp_audio, processed_audio, output_video]: