garyuzair committed on
Commit
64b2c99
·
verified ·
1 Parent(s): da54eb1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +117 -91
app.py CHANGED
@@ -8,8 +8,9 @@ import soundfile as sf
8
  import os
9
  import tempfile
10
  import subprocess
11
- from pydub import AudioSegment, effects
12
  import moviepy.editor as mpy
 
13
 
14
  # Optional scene detection
15
  scene_detect_available = True
@@ -22,7 +23,34 @@ except ImportError:
22
  # Set page configuration
23
  st.set_page_config(page_title="Video Sound Effect Generator", layout="centered")
24
 
25
- # Load BLIP model for captioning
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  @st.cache_resource
27
  def load_blip_model():
28
  processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
@@ -33,68 +61,58 @@ def load_blip_model():
33
 
34
  # Load MusicGen model
35
  @st.cache_resource
36
- def load_musicgen_model(model_name="facebook/musicgen-medium"):
37
  processor = AutoProcessor.from_pretrained(model_name)
38
  model = MusicgenForConditionalGeneration.from_pretrained(model_name)
39
  if torch.cuda.is_available():
40
- model = model.half().to("cuda")
41
  return processor, model
42
 
43
- # Extract frames efficiently
44
- def extract_frames(video_path, num_frames, method="uniform", segment_start=0, segment_end=None):
45
  video = imageio.get_reader(video_path, "ffmpeg")
46
  meta = video.get_meta_data()
47
  fps = meta['fps']
48
  total_frames = int(meta['duration'] * fps)
49
 
50
- if segment_end is None:
51
- segment_end = total_frames / fps
52
-
53
- start_frame = int(segment_start * fps)
54
- end_frame = int(segment_end * fps)
55
- total_segment_frames = end_frame - start_frame
56
-
57
  if method == "scene" and scene_detect_available:
58
  try:
59
  video_manager = VideoManager([video_path])
60
  scene_manager = SceneManager()
61
- scene_manager.add_detector(ContentDetector(threshold=30))
62
- video_manager.set_downscale_factor(2) # Optimize for speed
63
  video_manager.start()
64
  scene_manager.detect_scenes(frame_source=video_manager)
65
  scene_list = scene_manager.get_scene_list()
66
- segment_scenes = [scene for scene in scene_list if scene[0].get_seconds() >= segment_start and scene[0].get_seconds() < segment_end]
67
- frames = []
68
- for scene in segment_scenes[:num_frames]:
69
- frame = video_manager.get_frame(scene[0].get_frames())
70
- if frame is not None:
71
- frames.append(Image.fromarray(frame))
72
  video_manager.release()
73
- if len(frames) < num_frames and total_segment_frames > 0:
74
- remaining = num_frames - len(frames)
75
- step = total_segment_frames // (remaining + 1)
76
- for i in range(1, remaining + 1):
77
- frame_idx = start_frame + i * step
78
- if frame_idx < end_frame:
79
- frames.append(Image.fromarray(video.get_data(frame_idx)))
80
- return frames[:num_frames]
81
- except Exception as e:
82
- st.warning(f"Scene detection failed: {e}. Using uniform extraction.")
83
-
84
- # Uniform extraction
85
- step = max(1, total_segment_frames // num_frames)
86
- frame_indices = [start_frame + i * step for i in range(num_frames) if start_frame + i * step < end_frame]
87
- frames = [Image.fromarray(video.get_data(idx)) for idx in frame_indices]
88
- return frames[:num_frames]
89
 
90
  # Generate captions
91
- def generate_captions(frames, processor, model):
 
 
92
  descriptions = []
93
  for frame in frames:
94
  inputs = processor(images=frame, return_tensors="pt")
95
  if torch.cuda.is_available():
96
  inputs = {k: v.to("cuda") for k, v in inputs.items()}
97
- out = model.generate(**inputs, max_length=30)
98
  description = processor.decode(out[0], skip_special_tokens=True)
99
  descriptions.append(description)
100
  return descriptions
@@ -102,19 +120,19 @@ def generate_captions(frames, processor, model):
102
  # Enhance prompts
103
  def enhance_prompt(descriptions, mood="default"):
104
  if not descriptions:
105
- return f"{mood} ambient sound with subtle effects"
106
  combined = ". ".join(descriptions).lower()
107
  base_prompts = {
108
- "walk|run": "crisp footsteps on varied surfaces, immersive movement sounds",
109
- "car|drive": "roaring engine, tire screeches, dynamic road noise",
110
- "talk|person": "lively voices, crowd murmur, spatial chatter",
111
- "wind|tree|forest": "rustling leaves, gentle wind gusts, natural ambiance",
112
- "crash|fall": "intense crash impact, debris scattering, sharp effects"
113
  }
114
  for pattern, effect in base_prompts.items():
115
  if any(word in combined for word in pattern.split("|")):
116
- return f"{mood} {combined}, {effect}"
117
- return f"{mood} {combined}, rich ambient soundscape with engaging effects"
118
 
119
  # Generate audio
120
  def generate_audio(prompt, processor, model, duration, sample_rate=44100):
@@ -123,14 +141,15 @@ def generate_audio(prompt, processor, model, duration, sample_rate=44100):
123
  inputs = {k: v.to("cuda") for k, v in inputs.items()}
124
  audio_values = model.generate(
125
  **inputs,
126
- max_new_tokens=int(512 * (duration / 8)), # Optimized for speed
127
  do_sample=True,
128
- guidance_scale=7.0,
129
- top_k=120,
130
- top_p=0.9
 
131
  )
132
  audio_array = audio_values[0].cpu().numpy()
133
- audio_array = audio_array / np.max(np.abs(audio_array)) * 0.95
134
  audio_array = np.clip(audio_array, -1.0, 1.0)
135
  return audio_array
136
 
@@ -138,18 +157,18 @@ def generate_audio(prompt, processor, model, duration, sample_rate=44100):
138
  def apply_audio_effects(audio_path, settings):
139
  sound = AudioSegment.from_wav(audio_path)
140
  if settings['reverb_ms'] > 0:
141
- sound = sound + AudioSegment.silent(duration=settings['reverb_ms']) - 10
142
  if settings['echo_ms'] > 0:
143
- echo = sound - 15
144
  sound = sound.overlay(echo, position=settings['echo_ms'])
145
  if settings['highpass'] > 0:
146
  sound = sound.high_pass_filter(settings['highpass'])
147
  if settings['lowpass'] < 20000:
148
  sound = sound.low_pass_filter(settings['lowpass'])
149
  if settings['compress']:
150
- sound = effects.compress_dynamic_range(sound)
151
  sound = sound.pan(settings['stereo_pan'])
152
- sound = effects.normalize(sound)
153
  processed_path = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
154
  sound.export(processed_path, format="wav")
155
  return processed_path
@@ -169,6 +188,7 @@ def sync_audio_video(video_path, audio_path, output_path, mix_original=False, or
169
  audio_path = mixed_path
170
  else:
171
  st.warning("No original audio found. Using generated audio only.")
 
172
 
173
  cmd = [
174
  'ffmpeg',
@@ -187,76 +207,80 @@ def sync_audio_video(video_path, audio_path, output_path, mix_original=False, or
187
  # Main application
188
  def main():
189
  st.title("🎬 Video Sound Effect Generator")
190
- st.markdown("Upload a video to create immersive, video-specific sound effects with AI.")
191
 
192
  # User Guide
193
- with st.expander("📖 User Guide"):
194
  st.markdown("""
195
- **How to Use:**
196
- 1. **Upload a Video**: Choose an MP4, MOV, or AVI file.
197
- 2. **Select Prompt Mode**:
198
- - **Automatic**: Analyzes video frames to generate sound prompts.
199
- - **Manual**: Enter your own sound description.
200
- 3. **Configure Settings**: Adjust frame analysis, audio effects, and model size in the sidebar.
201
- 4. **Generate**: Click "Generate Sound Effects" to process the video.
202
- 5. **Download**: Save the enhanced video with sound effects.
203
 
204
  **Tips**:
205
- - Use at least 5 frames for better sound relevance.
206
- - Scene-based frame extraction (if available) improves accuracy.
207
- - Adjust audio effects for a customized sound experience.
208
  """)
209
 
210
  # Sidebar Settings
211
  with st.sidebar:
212
  st.header("⚙️ Settings")
213
- prompt_mode = st.selectbox("Prompt Mode", ["Automatic", "Manual"])
214
- model_size = st.selectbox("Model Size", ["small", "medium", "large"], index=1)
215
- mix_original = st.checkbox("Mix with Original Audio", value=False)
216
  original_volume, generated_volume = 0.5, 0.5
217
  if mix_original:
218
- original_volume = st.slider("Original Audio Volume", 0.0, 1.0, 0.5)
219
- generated_volume = st.slider("Generated Audio Volume", 0.0, 1.0, 0.5)
220
 
221
  st.subheader("Frame Analysis")
222
- num_frames = st.slider("Frames to Analyze", 5, 10, 5, help="More frames improve sound relevance but increase processing time")
223
- frame_method = st.selectbox("Frame Extraction Method", ["Uniform", "Scene"] if scene_detect_available else ["Uniform"])
224
 
225
  st.subheader("Audio Effects")
226
  effects_settings = {
227
- 'reverb_ms': st.slider("Reverb (ms)", 0, 500, 100),
228
- 'echo_ms': st.slider("Echo (ms)", 0, 1000, 200),
229
- 'highpass': st.slider("High-pass Filter (Hz)", 0, 3000, 100),
230
- 'lowpass': st.slider("Low-pass Filter (Hz)", 5000, 20000, 15000),
231
- 'compress': st.checkbox("Dynamic Compression", value=True),
232
- 'stereo_pan': st.slider("Stereo Pan (-1 left, 1 right)", -1.0, 1.0, 0.0)
233
  }
234
 
235
  # Main Content
236
- uploaded_file = st.file_uploader("Upload Video", type=["mp4", "mov", "avi"])
237
  if uploaded_file:
238
  with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tmp:
239
  tmp.write(uploaded_file.read())
240
  video_path = tmp.name
 
241
  st.video(video_path)
 
242
 
243
  video_clip = mpy.VideoFileClip(video_path)
244
  duration = video_clip.duration
245
  video_clip.close()
 
 
246
 
247
  if prompt_mode == "Automatic":
248
  with st.spinner("Analyzing frames..."):
249
  blip_processor, blip_model = load_blip_model()
250
  frames = extract_frames(video_path, num_frames, frame_method)
251
  if not frames:
252
- st.error("No frames extracted. Try a different video or settings.")
253
  return
254
- descriptions = generate_captions(frames, blip_processor, blip_model)
255
- mood = st.selectbox("Sound Mood", ["default", "dramatic", "ambient", "action"])
 
 
256
  text_prompt = enhance_prompt(descriptions, mood)
257
- text_prompt = st.text_area("Edit Prompt", text_prompt, height=100)
258
  else:
259
- text_prompt = st.text_area("Enter Sound Description", "Describe the desired sound effects", height=100)
260
 
261
  if st.button("Generate Sound Effects", key="generate"):
262
  progress = st.progress(0)
@@ -271,20 +295,22 @@ def main():
271
  sf.write(temp_audio, audio_array, 44100)
272
  progress.progress(50)
273
 
274
- status.text("Applying audio effects...")
275
  processed_audio = apply_audio_effects(temp_audio, effects_settings)
276
  progress.progress(75)
277
 
278
- status.text("Syncing with video...")
279
  output_video = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
280
  sync_audio_video(video_path, processed_audio, output_video, mix_original, original_volume, generated_volume)
281
  progress.progress(100)
282
  status.text("Done!")
283
 
284
  st.success("Sound effects applied!")
 
285
  st.video(output_video)
 
286
  with open(output_video, "rb") as f:
287
- st.download_button("Download Enhanced Video", f, "enhanced_video.mp4", "video/mp4")
288
 
289
  # Cleanup
290
  for file in [video_path, temp_audio, processed_audio, output_video]:
 
8
  import os
9
  import tempfile
10
  import subprocess
11
+ from pydub import AudioSegment
12
  import moviepy.editor as mpy
13
+ from functools import lru_cache
14
 
15
  # Optional scene detection
16
  scene_detect_available = True
 
23
  # Set page configuration
24
  st.set_page_config(page_title="Video Sound Effect Generator", layout="centered")
25
 
26
+ # CSS for compact video preview
27
+ st.markdown("""
28
+ <style>
29
+ .video-container {
30
+ max-width: 640px;
31
+ margin: auto;
32
+ overflow: hidden;
33
+ border-radius: 8px;
34
+ box-shadow: 0 4px 8px rgba(0,0,0,0.1);
35
+ }
36
+ video {
37
+ width: 100%;
38
+ height: auto;
39
+ display: block;
40
+ }
41
+ .stButton>button {
42
+ background-color: #007bff;
43
+ color: white;
44
+ border-radius: 5px;
45
+ padding: 10px 20px;
46
+ }
47
+ .stButton>button:hover {
48
+ background-color: #0056b3;
49
+ }
50
+ </style>
51
+ """, unsafe_allow_html=True)
52
+
53
+ # Load BLIP model
54
  @st.cache_resource
55
  def load_blip_model():
56
  processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
 
61
 
62
  # Load MusicGen model
63
@st.cache_resource
def load_musicgen_model(model_name="facebook/musicgen-small"):
    """Load and cache the MusicGen processor/model pair.

    Args:
        model_name: Hugging Face model identifier (defaults to the small,
            faster variant).

    Returns:
        Tuple of (processor, model); the model is moved to CUDA when a GPU
        is available.
    """
    audio_processor = AutoProcessor.from_pretrained(model_name)
    audio_model = MusicgenForConditionalGeneration.from_pretrained(model_name)
    use_gpu = torch.cuda.is_available()
    if use_gpu:
        audio_model = audio_model.to("cuda")
    return audio_processor, audio_model
70
 
71
# Optimized frame extraction
def extract_frames(video_path, num_frames, method="uniform"):
    """Extract up to ``num_frames`` downscaled PIL images from a video.

    Args:
        video_path: Path to the video file.
        num_frames: Maximum number of frames to return.
        method: "scene" to pick frames at detected scene starts (when
            PySceneDetect is available), otherwise uniform sampling.

    Returns:
        List of PIL Images resized to 320x180 (possibly fewer than
        ``num_frames`` if reads fail or the video is very short).
    """
    video = imageio.get_reader(video_path, "ffmpeg")
    meta = video.get_meta_data()
    fps = meta['fps']
    total_frames = int(meta['duration'] * fps)

    def _uniform_indices(count):
        # max(1, ...) guards against a zero step for very short videos,
        # which would make range() raise ValueError.
        if total_frames <= 0 or count <= 0:
            return []
        step = max(1, total_frames // count)
        return list(range(0, total_frames, step))[:count]

    frame_indices = []
    if method == "scene" and scene_detect_available:
        video_manager = None
        try:
            video_manager = VideoManager([video_path])
            scene_manager = SceneManager()
            scene_manager.add_detector(ContentDetector(threshold=25))
            video_manager.set_downscale_factor(4)  # Aggressive downscaling for speed
            video_manager.start()
            scene_manager.detect_scenes(frame_source=video_manager)
            scene_list = scene_manager.get_scene_list()
            frame_indices = [scene[0].get_frames() for scene in scene_list[:num_frames]]
            if len(frame_indices) < num_frames:
                # Top up with uniformly spaced indices; step is clamped to >= 1.
                remaining = num_frames - len(frame_indices)
                step = max(1, total_frames // (remaining + 1))
                frame_indices.extend(range(step, total_frames, step)[:remaining])
        except Exception:
            frame_indices = _uniform_indices(num_frames)
        finally:
            # Release even when scene detection raises, so ffmpeg handles
            # are not leaked.
            if video_manager is not None:
                video_manager.release()
    else:
        frame_indices = _uniform_indices(num_frames)

    frames = []
    for idx in frame_indices[:num_frames]:
        try:
            # Downscale frames to speed up BLIP captioning.
            frames.append(Image.fromarray(video.get_data(idx)).resize((320, 180)))
        except (IndexError, OSError, RuntimeError):
            # Skip unreadable/out-of-range frames instead of aborting
            # (narrowed from a bare except, which also swallowed
            # KeyboardInterrupt).
            continue
    video.close()
    return frames
105
 
106
  # Generate captions
107
@lru_cache(maxsize=100)
def generate_captions(frames_tuple, processor, model):
    """Caption each frame with the BLIP model.

    ``frames_tuple`` is a hashable tuple of (mode, size, raw_bytes) triples
    (rather than PIL Images) so results can be memoized via ``lru_cache``.

    Returns:
        List of caption strings, one per frame.
    """
    descriptions = []
    for mode, size, raw in frames_tuple:
        image = Image.frombytes(mode, size, raw)
        inputs = processor(images=image, return_tensors="pt")
        if torch.cuda.is_available():
            inputs = {k: v.to("cuda") for k, v in inputs.items()}
        # Short max_length keeps caption generation quick.
        out = model.generate(**inputs, max_length=20, num_beams=3)
        caption = processor.decode(out[0], skip_special_tokens=True)
        descriptions.append(caption)
    return descriptions
 
120
  # Enhance prompts
121
def enhance_prompt(descriptions, mood="default"):
    """Turn frame captions into a MusicGen text prompt.

    Falls back to a generic cinematic prompt when no captions are given;
    otherwise appends a keyword-matched effect phrase to the combined,
    lowercased caption text.
    """
    if not descriptions:
        return f"{mood} cinematic ambient sound with dynamic effects"
    combined = ". ".join(descriptions).lower()
    keyword_effects = (
        ("walk|run", "crisp footsteps on diverse surfaces, vivid movement sounds"),
        ("car|drive", "powerful engine roar, tire screeches, immersive road noise"),
        ("talk|person", "rich voices, layered crowd chatter, spatial depth"),
        ("wind|tree|forest", "whistling wind, rustling foliage, natural resonance"),
        ("crash|fall", "sharp crash impact, debris scatter, intense bursts"),
    )
    for keywords, effect in keyword_effects:
        # Substring match against any of the "|"-separated trigger words.
        if any(term in combined for term in keywords.split("|")):
            return f"{mood} {combined}, {effect}, high-fidelity cinematic quality"
    return f"{mood} {combined}, vibrant ambient soundscape with compelling effects, high-fidelity cinematic quality"
136
 
137
  # Generate audio
138
  def generate_audio(prompt, processor, model, duration, sample_rate=44100):
 
141
  inputs = {k: v.to("cuda") for k, v in inputs.items()}
142
  audio_values = model.generate(
143
  **inputs,
144
+ max_new_tokens=int(256 * (duration / 6)), # Optimized token scaling
145
  do_sample=True,
146
+ guidance_scale=8.0,
147
+ top_k=150,
148
+ top_p=0.85,
149
+ num_beams=2 # Beam search for quality
150
  )
151
  audio_array = audio_values[0].cpu().numpy()
152
+ audio_array = audio_array / np.max(np.abs(audio_array)) * 0.98
153
  audio_array = np.clip(audio_array, -1.0, 1.0)
154
  return audio_array
155
 
 
157
def apply_audio_effects(audio_path, settings):
    """Post-process a generated WAV file with the user's effect settings.

    NOTE(review): the "reverb" step appends trailing silence and cuts gain
    rather than applying a true reverb, and "compression" is simulated with
    a flat gain reduction — confirm this is intentional.

    Args:
        audio_path: Path to the input WAV file.
        settings: Dict with keys 'reverb_ms', 'echo_ms', 'highpass',
            'lowpass', 'compress', 'stereo_pan'.

    Returns:
        Path to the processed WAV written to a temporary file.
    """
    segment = AudioSegment.from_wav(audio_path)
    if settings['reverb_ms'] > 0:
        # append(..., crossfade=0) is pydub's "+" concatenation;
        # apply_gain(-8) is its "- 8" (a dB cut over the whole segment).
        tail = AudioSegment.silent(duration=settings['reverb_ms'])
        segment = segment.append(tail, crossfade=0).apply_gain(-8)
    if settings['echo_ms'] > 0:
        delayed_copy = segment.apply_gain(-12)
        segment = segment.overlay(delayed_copy, position=settings['echo_ms'])
    if settings['highpass'] > 0:
        segment = segment.high_pass_filter(settings['highpass'])
    if settings['lowpass'] < 20000:
        segment = segment.low_pass_filter(settings['lowpass'])
    if settings['compress']:
        segment = segment.apply_gain(-6)  # Simulated compression: flat -6 dB
    segment = segment.pan(settings['stereo_pan'])
    segment = segment.apply_gain(2)  # Slight volume boost
    processed_path = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
    segment.export(processed_path, format="wav")
    return processed_path
 
188
  audio_path = mixed_path
189
  else:
190
  st.warning("No original audio found. Using generated audio only.")
191
+ video_clip.close()
192
 
193
  cmd = [
194
  'ffmpeg',
 
207
  # Main application
208
  def main():
209
  st.title("🎬 Video Sound Effect Generator")
210
+ st.markdown("Create high-quality, cinematic sound effects for your videos with AI.")
211
 
212
  # User Guide
213
+ with st.expander("📖 How to Use"):
214
  st.markdown("""
215
+ 1. **Upload Video**: Select an MP4, MOV, or AVI file (keep under 1 minute for best performance).
216
+ 2. **Choose Mode**:
217
+ - **Automatic**: AI analyzes video frames to create sound prompts.
218
+ - **Manual**: Write your own sound description.
219
+ 3. **Adjust Settings**: Use the sidebar to tweak frame analysis, audio effects, and model size.
220
+ 4. **Generate**: Click "Generate" to process and download the enhanced video.
 
 
221
 
222
  **Tips**:
223
+ - 5+ frames ensure accurate sound effects.
224
+ - Scene extraction (if available) enhances relevance.
225
+ - Experiment with audio effects for a polished result.
226
  """)
227
 
228
  # Sidebar Settings
229
  with st.sidebar:
230
  st.header("⚙️ Settings")
231
+ prompt_mode = st.selectbox("Mode", ["Automatic", "Manual"], help="Automatic uses AI to analyze video; Manual lets you describe the sound.")
232
+ model_size = st.selectbox("Model Size", ["small", "medium"], index=0, help="Small is faster; Medium is higher quality.")
233
+ mix_original = st.checkbox("Mix Original Audio", help="Blend with video's audio if available.")
234
  original_volume, generated_volume = 0.5, 0.5
235
  if mix_original:
236
+ original_volume = st.slider("Original Volume", 0.0, 1.0, 0.5)
237
+ generated_volume = st.slider("Generated Volume", 0.0, 1.0, 0.5)
238
 
239
  st.subheader("Frame Analysis")
240
+ num_frames = st.slider("Frames to Analyze", 5, 8, 5, help="More frames improve sound accuracy but slow processing.")
241
+ frame_method = st.selectbox("Extraction Method", ["Uniform", "Scene"] if scene_detect_available else ["Uniform"], help="Scene is more accurate but slower.")
242
 
243
  st.subheader("Audio Effects")
244
  effects_settings = {
245
+ 'reverb_ms': st.slider("Reverb (ms)", 0, 300, 50, help="Adds depth to sound."),
246
+ 'echo_ms': st.slider("Echo (ms)", 0, 500, 100, help="Creates repeating sound effects."),
247
+ 'highpass': st.slider("High-pass Filter (Hz)", 0, 2000, 50, help="Removes low frequencies."),
248
+ 'lowpass': st.slider("Low-pass Filter (Hz)", 5000, 20000, 18000, help="Removes high frequencies."),
249
+ 'compress': st.checkbox("Compression", value=True, help="Balances audio dynamics."),
250
+ 'stereo_pan': st.slider("Stereo Pan", -1.0, 1.0, 0.0, help="-1 is left, 1 is right.")
251
  }
252
 
253
  # Main Content
254
+ uploaded_file = st.file_uploader("Upload Video", type=["mp4", "mov", "avi"], help="Max 1 minute recommended.")
255
  if uploaded_file:
256
  with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tmp:
257
  tmp.write(uploaded_file.read())
258
  video_path = tmp.name
259
+ st.markdown('<div class="video-container">', unsafe_allow_html=True)
260
  st.video(video_path)
261
+ st.markdown('</div>', unsafe_allow_html=True)
262
 
263
  video_clip = mpy.VideoFileClip(video_path)
264
  duration = video_clip.duration
265
  video_clip.close()
266
+ if duration > 60:
267
+ st.warning("Videos over 1 minute may slow processing. Consider trimming.")
268
 
269
  if prompt_mode == "Automatic":
270
  with st.spinner("Analyzing frames..."):
271
  blip_processor, blip_model = load_blip_model()
272
  frames = extract_frames(video_path, num_frames, frame_method)
273
  if not frames:
274
+ st.error("Failed to extract frames. Try another video or method.")
275
  return
276
+ # Convert frames to tuple for caching
277
+ frames_tuple = tuple((frame.mode, frame.size, frame.rgb) for frame in frames)
278
+ descriptions = generate_captions(frames_tuple, blip_processor, blip_model)
279
+ mood = st.selectbox("Sound Mood", ["default", "dramatic", "ambient", "action"], help="Sets the tone of sound effects.")
280
  text_prompt = enhance_prompt(descriptions, mood)
281
+ text_prompt = st.text_area("Edit Prompt", text_prompt, height=80)
282
  else:
283
+ text_prompt = st.text_area("Sound Description", "E.g., 'intense action with explosions'", height=80)
284
 
285
  if st.button("Generate Sound Effects", key="generate"):
286
  progress = st.progress(0)
 
295
  sf.write(temp_audio, audio_array, 44100)
296
  progress.progress(50)
297
 
298
+ status.text("Applying effects...")
299
  processed_audio = apply_audio_effects(temp_audio, effects_settings)
300
  progress.progress(75)
301
 
302
+ status.text("Syncing video...")
303
  output_video = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
304
  sync_audio_video(video_path, processed_audio, output_video, mix_original, original_volume, generated_volume)
305
  progress.progress(100)
306
  status.text("Done!")
307
 
308
  st.success("Sound effects applied!")
309
+ st.markdown('<div class="video-container">', unsafe_allow_html=True)
310
  st.video(output_video)
311
+ st.markdown('</div>', unsafe_allow_html=True)
312
  with open(output_video, "rb") as f:
313
+ st.download_button("Download Video", f, "enhanced_video.mp4", "video/mp4")
314
 
315
  # Cleanup
316
  for file in [video_path, temp_audio, processed_audio, output_video]: