garyuzair committed on
Commit
5763c8a
·
verified ·
1 Parent(s): 71b6093

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +146 -304
app.py CHANGED
@@ -1,5 +1,6 @@
1
  import streamlit as st
2
- import imageio
 
3
  import numpy as np
4
  from PIL import Image
5
  from transformers import AutoProcessor, BlipForConditionalGeneration, MusicgenForConditionalGeneration
@@ -18,6 +19,7 @@ except ModuleNotFoundError:
18
  # --- Constants ---
19
  TEMP_DIR = tempfile.gettempdir()
20
  DEFAULT_SAMPLING_RATE = 32000 # MusicGen default
 
21
 
22
  # --- Model Loading (Cached) ---
23
  @st.cache_resource
@@ -41,39 +43,48 @@ def load_musicgen_model():
41
  return processor, model
42
 
43
  # --- Core Functions ---
44
- def extract_frames_from_video(video_path, num_frames_to_extract):
45
- """Extracts a specified number of frames evenly from a video."""
46
  frames = []
47
- try:
48
- video_reader = imageio.get_reader(video_path, "ffmpeg")
49
- total_frames = video_reader.count_frames() if hasattr(video_reader, 'count_frames') else len(list(video_reader.iter_data())) # more robust way to get frame count
50
-
51
- if total_frames == 0:
52
- st.warning("Video appears to have 0 frames. Please check the video file.")
53
- return []
54
 
55
- step = max(1, total_frames // num_frames_to_extract)
56
-
57
- extracted_count = 0
58
- for i in range(0, total_frames, step):
59
- if extracted_count >= num_frames_to_extract:
60
- break
61
- frame_data = video_reader.get_data(i)
62
- frames.append(Image.fromarray(frame_data))
63
- extracted_count += 1
64
- video_reader.close()
65
- except Exception as e:
66
- st.error(f"Error extracting frames: {e}")
67
  return []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
  return frames
69
 
 
70
  def generate_enhanced_prompt(base_description, context_style="cinematic"):
71
- """
72
- Generates a more detailed and evocative sound-specific prompt from a BLIP caption.
73
- """
74
  base = base_description.lower().strip().replace("a photo of ", "").replace("an image of ", "")
75
-
76
- # Keywords for actions, objects, environments, and sound qualities
77
  action_sounds = {
78
  "walking": "footsteps, rhythmic, on {surface}", "running": "rapid footsteps, heavy breathing, on {surface}",
79
  "driving": "engine rumble, tire sounds on {surface}", "talking": "clear voices, conversational tone",
@@ -96,142 +107,67 @@ def generate_enhanced_prompt(base_description, context_style="cinematic"):
96
  "office": "office hum, keyboard typing, distant chatter",
97
  "street": "city street sounds, traffic, distant sirens, pedestrian chatter",
98
  "forest": "forest ambience, rustling leaves, distant bird calls, twigs snapping",
99
- "beach": "ocean waves crashing, seagulls, gentle wind",
100
- "cave": "echoing drips, damp air, low rumble",
101
- "space": "eerie silence, low hum, occasional electronic beep",
102
- "underwater": "muffled sounds, bubbling, deep water pressure"
103
  }
104
  sound_qualities = {
105
- "cinematic": "high quality, clear, immersive, dynamic range",
106
- "realistic": "natural, authentic, detailed",
107
- "cartoon": "exaggerated, playful, boings, zips",
108
- "ominous": "low rumble, dissonant, suspenseful",
109
  "peaceful": "gentle, calming, serene"
110
  }
111
-
112
- found_elements = []
113
- prompt_parts = [f"A {context_style} soundscape of:"]
114
-
115
- # Prioritize actions
116
  for action_keyword, sound_desc in action_sounds.items():
117
  if action_keyword in base:
118
- # Basic surface detection
119
- surface = "generic surface"
120
- if "grass" in base: surface = "grass"
121
- elif "wood" in base or "floor" in base: surface = "wooden floor"
122
- elif "concrete" in base or "pavement" in base: surface = "concrete"
123
- elif "water" in base: surface = "water"
124
- prompt_parts.append(sound_desc.format(surface=surface))
125
- found_elements.append(action_keyword)
126
- break # Often one main action is enough focus
127
-
128
- # Add objects
129
  for obj_keyword, sound_desc in object_sounds.items():
130
  if obj_keyword in base and obj_keyword not in found_elements:
131
- prompt_parts.append(sound_desc)
132
- found_elements.append(obj_keyword)
133
- # Limit to 1-2 dominant objects to avoid overly complex prompts
134
- if len(found_elements) > (1 if any(action_keyword in found_elements for action_keyword in action_sounds) else 2):
135
- break
136
-
137
- # Add environment
138
  added_env = False
139
  for env_keyword, sound_desc in environment_ambience.items():
140
- if env_keyword in base:
141
- prompt_parts.append(f"environment: {sound_desc}")
142
- added_env = True
143
- break
144
-
145
- if not found_elements and not added_env: # if nothing specific found
146
- prompt_parts.append(f"subtle ambient sound related to '{base}'")
147
-
148
- # Add general quality
149
- if context_style in sound_qualities:
150
- prompt_parts.append(sound_qualities[context_style])
151
- else:
152
- prompt_parts.append(sound_qualities["cinematic"]) # default
153
-
154
  return ", ".join(prompt_parts) + "."
155
 
156
 
157
  def generate_audio(text_prompt, negative_prompt, duration_s, guidance_scale, top_k, top_p, device):
158
- """Generates audio using MusicGen model."""
159
  musicgen_processor, musicgen_model = load_musicgen_model()
160
-
161
- inputs = musicgen_processor(
162
- text=[text_prompt],
163
- negative_prompt=[negative_prompt] if negative_prompt else None,
164
- padding=True,
165
- return_tensors="pt",
166
- ).to(device)
167
-
168
- # max_new_tokens: 1 token approx 0.02 seconds at 50Hz. So, duration_s * 50.
169
- # MusicGen generates around 5-8s with 256 tokens, but let's be more direct.
170
- # The model has a fixed sampling rate (e.g., 32kHz for musicgen-small).
171
- # The number of tokens directly influences duration.
172
- # model.config.audio_encoder.frame_rate seems to be 50 (tokens per second of audio)
173
  max_new_tokens = int(duration_s * musicgen_model.config.audio_encoder.frame_rate)
174
- if max_new_tokens > 1500: # MusicGen small has context limit around 30s (1500 tokens)
175
- st.warning(f"Requested duration ({duration_s}s) is long. Capping to ~30s for musicgen-small.")
176
  max_new_tokens = 1500
177
 
178
- audio_values = musicgen_model.generate(
179
- **inputs,
180
- max_new_tokens=max_new_tokens,
181
- do_sample=True,
182
- guidance_scale=guidance_scale,
183
- top_k=top_k,
184
- top_p=top_p
185
- )
186
  audio_array = audio_values[0].cpu().numpy()
187
-
188
- # Normalize and ensure single channel for broader compatibility
189
- if audio_array.ndim > 1:
190
- audio_array = np.mean(audio_array, axis=0) # Convert to mono by averaging channels
191
-
192
- audio_array = audio_array / np.max(np.abs(audio_array)) * 0.9 # Normalize
193
  audio_array = np.clip(audio_array, -1.0, 1.0)
194
-
195
  return audio_array, musicgen_model.config.audio_encoder.sampling_rate
196
 
197
- def sync_audio_to_video(video_path, audio_path, output_path, mix_original, original_vol, sfx_vol):
198
- """Synchronizes generated audio with the video, mixing or replacing original audio."""
199
  video_clip = mpy.VideoFileClip(video_path)
200
  generated_audio_clip = mpy.AudioFileClip(audio_path)
201
  video_duration = video_clip.duration
202
-
203
- # Adjust generated audio length to match video duration
204
- if generated_audio_clip.duration < video_duration:
205
- # Loop the audio if it's shorter
206
- num_loops = int(np.ceil(video_duration / generated_audio_clip.duration))
207
- looped_clips = [generated_audio_clip] * num_loops
208
- final_generated_audio = mpy.concatenate_audioclips(looped_clips).subclip(0, video_duration)
209
- else:
210
- # Trim the audio if it's longer
211
- final_generated_audio = generated_audio_clip.subclip(0, video_duration)
212
-
213
- # Apply SFX volume
214
  final_generated_audio = final_generated_audio.volumex(sfx_vol)
215
-
216
- # Mix or replace audio
217
  if mix_original and video_clip.audio:
218
  original_audio = video_clip.audio.volumex(original_vol)
219
  composite_audio = mpy.CompositeAudioClip([original_audio, final_generated_audio])
220
  final_video = video_clip.set_audio(composite_audio)
221
  else:
222
  final_video = video_clip.set_audio(final_generated_audio)
223
-
224
- # Write final video
225
- final_video.write_videofile(
226
- output_path,
227
- codec="libx264", # Good quality and widely compatible
228
- audio_codec="aac", # Good quality audio codec
229
- preset="medium", # Balance between speed and quality (faster: 'ultrafast', 'superfast')
230
- bitrate="5000k", # Decent video bitrate for web
231
- audio_bitrate="192k", # Good audio quality
232
- threads=os.cpu_count() or 2, # Use multiple threads for encoding
233
- logger='bar' # Show moviepy progress bar
234
- )
235
  video_clip.close()
236
  generated_audio_clip.close()
237
  if 'original_audio' in locals(): original_audio.close()
@@ -242,221 +178,127 @@ def sync_audio_to_video(video_path, audio_path, output_path, mix_original, origi
242
 
243
  # --- Streamlit UI ---
244
  st.set_page_config(layout="wide", page_title="Video To SoundFX Generator")
245
- st.title("🎬 Video To SoundFX Generator 🎢")
246
- st.markdown("Upload an MP4 video, and this tool will automatically generate relevant sound effects and sync them to your video.")
 
247
 
248
  # --- Sidebar for Settings ---
249
  with st.sidebar:
250
  st.header("βš™οΈ Generation Settings")
251
 
252
  num_frames_to_analyze = st.slider(
253
- "Number of Frames to Analyze from Video", 1, 10, 3,
254
- help="More frames give more context but take longer. Frames are picked evenly throughout the video."
255
  )
256
 
257
- prompt_style = st.selectbox(
258
- "Sound Style for Prompt",
259
- ["cinematic", "realistic", "cartoon", "ominous", "peaceful"],
260
- index=0, help="Influences the adjectives added to the sound prompt."
 
261
  )
 
 
 
262
 
263
  st.subheader("Audio Mixing")
264
  mix_original_audio = st.checkbox("Mix with Original Video Audio", value=True)
265
- original_audio_volume = st.slider(
266
- "Original Audio Volume", 0.0, 1.0, 0.5,
267
- disabled=not mix_original_audio,
268
- help="Volume of the video's original audio when mixed."
269
- )
270
- sfx_audio_volume = st.slider(
271
- "Generated SFX Volume", 0.0, 1.0, 0.8,
272
- help="Volume of the generated sound effect."
273
- )
274
 
275
- with st.expander("Advanced MusicGen Settings"):
276
- # Simplified duration - MusicGen has its own duration mechanism via max_new_tokens
277
- # We will primarily make one sound effect and loop/trim it.
278
- # However, we can set a target for the initial generation.
279
- generated_sfx_duration = st.slider(
280
- "Target SFX Clip Duration (s)", 5, 30, 10,
281
- help="Duration of the base sound effect clip before looping/trimming. Max ~30s for musicgen-small."
282
- )
283
- guidance_scale = st.slider(
284
- "Guidance Scale (CFG)", 1.0, 10.0, 3.0,
285
- help="Higher values make the audio follow the prompt more closely, but can reduce diversity."
286
- )
287
- top_k = st.slider(
288
- "Top-K Sampling", 0, 250, 50,
289
- help="Filters to the K most likely next tokens. 0 means no filtering."
290
- )
291
- top_p = st.slider(
292
- "Top-P (Nucleus) Sampling", 0.0, 1.0, 0.95,
293
- help="Selects tokens from the smallest set whose cumulative probability exceeds P."
294
- )
295
- negative_prompt_text = st.text_input(
296
- "Negative Prompt (Optional)",
297
- placeholder="e.g., low quality, noisy, muffled, music, speech",
298
- help="Describe sounds to avoid."
299
- )
300
 
301
  # --- Main Area for Upload and Results ---
302
- uploaded_file = st.file_uploader("πŸ“€ Upload an MP4 Video", type=["mp4"])
303
 
304
  if uploaded_file:
305
- # Determine device
306
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
307
- st.write(f"Using device: {device}")
308
- if device.type == 'cuda':
309
- st.write(f"GPU: {torch.cuda.get_device_name(0)}")
310
 
311
- # Temporary file paths
312
  temp_video_path = os.path.join(TEMP_DIR, f"temp_video_{uploaded_file.name}")
313
  temp_audio_path = os.path.join(TEMP_DIR, "temp_generated_audio.wav")
314
  output_video_path = os.path.join(TEMP_DIR, f"output_{uploaded_file.name}")
315
 
316
  try:
317
- with open(temp_video_path, "wb") as f:
318
- f.write(uploaded_file.getbuffer())
319
 
320
- progress_bar = st.progress(0)
321
- status_text = st.empty()
322
-
323
- # 1. Extract Frames
324
- status_text.info("⏳ Step 1/5: Extracting frames from video...")
325
- frames = extract_frames_from_video(temp_video_path, num_frames_to_analyze)
326
- if not frames:
327
- st.error("Could not extract frames. Please try another video or check settings.")
328
- st.stop()
329
  progress_bar.progress(20)
330
 
331
- # 2. Generate Descriptions
332
- status_text.info("⏳ Step 2/5: Analyzing frames and generating descriptions...")
333
  blip_processor, blip_model = load_blip_model()
334
  descriptions = []
 
335
  for i, frame_pil in enumerate(frames):
336
  inputs = blip_processor(images=frame_pil, return_tensors="pt")
337
- if device.type == 'cuda':
338
- inputs = {k: v.to(device) for k, v in inputs.items()}
339
-
340
- with torch.no_grad(): # Important for inference
341
- out = blip_model.generate(**inputs, max_new_tokens=50) # Increased max_new_tokens for more detail
342
-
343
  base_desc = blip_processor.decode(out[0], skip_special_tokens=True)
 
344
  enhanced_desc = generate_enhanced_prompt(base_desc, prompt_style)
345
  descriptions.append(enhanced_desc)
346
- status_text.info(f"⏳ Step 2/5: Analyzing frames... Frame {i+1}/{len(frames)}: {base_desc[:50]}...")
347
-
348
- combined_description = " Then, ".join(descriptions)
349
- progress_bar.progress(40)
350
-
351
- st.subheader("πŸ“ Generated Sound Prompt")
352
- st.markdown(f"Based on video analysis, the following prompt was generated for the sound effect. **You can edit it below before generating the audio.**")
353
-
354
- # Allow user to edit the prompt
355
- editable_prompt = st.text_area(
356
- "Sound Effect Prompt:",
357
- value=combined_description,
358
- height=150,
359
- help="Edit this prompt to fine-tune the sound generation."
360
- )
361
-
362
- if st.button("✨ Generate Sound & Sync Video"):
363
- if not editable_prompt.strip():
364
- st.error("Prompt cannot be empty!")
365
- else:
366
- # 3. Generate Sound Effect
367
- status_text.info(f"⏳ Step 3/5: Generating sound effect for: '{editable_prompt[:100]}...'")
368
- audio_array, sample_rate = generate_audio(
369
- editable_prompt,
370
- negative_prompt_text,
371
- generated_sfx_duration,
372
- guidance_scale,
373
- top_k,
374
- top_p,
375
- device
376
- )
377
- sf.write(temp_audio_path, audio_array, sample_rate)
378
- progress_bar.progress(60)
379
 
380
- # 4. Synchronize Audio with Video
381
- status_text.info("⏳ Step 4/5: Syncing audio with video...")
382
- # Display moviepy progress within Streamlit
383
- with st.spinner("MoviePy is processing the video... This can take a while for longer videos."):
384
- sync_audio_to_video(
385
- temp_video_path,
386
- temp_audio_path,
387
- output_video_path,
388
- mix_original_audio,
389
- original_audio_volume,
390
- sfx_audio_volume
391
- )
392
- progress_bar.progress(90)
393
 
394
- # 5. Display Results
395
- status_text.success("βœ… Step 5/5: Processing Complete!")
396
- st.subheader("πŸŽ‰ Your Sound-Enhanced Video:")
397
-
398
- try:
399
- video_file = open(output_video_path, 'rb')
400
- video_bytes = video_file.read()
401
- st.video(video_bytes)
402
- video_file.close() # Close it after reading bytes
403
-
404
- st.download_button(
405
- label="πŸ“₯ Download Synced Video",
406
- data=video_bytes, # Use bytes directly
407
- file_name=f"sfx_synced_{uploaded_file.name}",
408
- mime="video/mp4"
409
- )
410
- except FileNotFoundError:
411
- st.error("Output video file not found. Something went wrong during processing.")
412
- except Exception as e:
413
- st.error(f"Error displaying or preparing download for video: {e}")
414
-
415
- progress_bar.progress(100)
416
-
417
- with st.expander("Generation Details", expanded=False):
418
- st.write("**Original BLIP Captions:**")
419
- for i, desc_pair in enumerate(zip([blip_processor.decode(blip_model.generate(**blip_processor(images=f, return_tensors="pt").to(device), max_new_tokens=50)[0], skip_special_tokens=True) for f in frames], descriptions)):
420
- st.markdown(f"- Frame {i+1} Raw: `{desc_pair[0]}`\n- Frame {i+1} Enhanced: `{desc_pair[1]}`")
421
- st.write(f"**Final Prompt Used for MusicGen:** `{editable_prompt}`")
422
- if negative_prompt_text:
423
- st.write(f"**Negative Prompt Used:** `{negative_prompt_text}`")
424
- st.write(f"**Base SFX Duration:** {generated_sfx_duration}s (looped/trimmed to video length)")
425
- st.write(f"**MusicGen Sampling Rate:** {sample_rate} Hz")
426
 
427
  except Exception as e:
428
  st.error(f"An unexpected error occurred: {e}")
429
- st.error("Troubleshooting tips:")
430
- st.markdown("- Try a shorter or smaller resolution video.")
431
- st.markdown("- Reduce the 'Number of Frames to Analyze'.")
432
- st.markdown("- Ensure your Hugging Face Space has enough resources (CPU/RAM, GPU if applicable).")
433
- st.markdown("- Check the console logs in your Hugging Face Space for more detailed errors.")
434
  import traceback
435
  st.code(traceback.format_exc())
436
-
437
-
438
  finally:
439
- # Clean up temporary files
440
- for path in [temp_video_path, temp_audio_path, output_video_path]:
441
- if os.path.exists(path):
442
- try:
443
- os.remove(path)
444
- except Exception as e:
445
- st.warning(f"Could not remove temporary file {path}: {e}")
446
  else:
447
- st.info("πŸ‘‹ Welcome! Upload an MP4 video to get started.")
448
  st.markdown("""
449
- **How it works:**
450
- 1. You upload an MP4 video.
451
- 2. The system analyzes a few frames from your video to understand its content (using BLIP image captioning).
452
- 3. It generates a descriptive prompt based on these frames, tailored for sound effects.
453
- 4. You can review and **edit this prompt**!
454
- 5. The prompt is fed into MusicGen (a powerful AI sound generator) to create a sound effect.
455
- 6. The generated sound is then automatically synced with your original video.
456
- 7. You can download the final video with the new sound!
457
-
458
- **Tips for best results:**
459
- - Use videos with clear actions or distinct environments.
460
- - Experiment with the 'Number of Frames to Analyze' and 'Sound Style'.
461
- - Edit the generated prompt to be more specific or to change the mood.
462
  """)
 
1
  import streamlit as st
2
+ # import imageio # Replaced by cv2
3
+ import cv2 # For faster frame extraction
4
  import numpy as np
5
  from PIL import Image
6
  from transformers import AutoProcessor, BlipForConditionalGeneration, MusicgenForConditionalGeneration
 
19
  # --- Constants ---
20
  TEMP_DIR = tempfile.gettempdir()
21
  DEFAULT_SAMPLING_RATE = 32000 # MusicGen default
22
+ BLIP_PROCESS_SIZE = 384 # Resize frames to this size for BLIP
23
 
24
  # --- Model Loading (Cached) ---
25
  @st.cache_resource
 
43
  return processor, model
44
 
45
  # --- Core Functions ---
46
+ def extract_frames_from_video_cv2(video_path, num_frames_to_extract, target_size=BLIP_PROCESS_SIZE):
47
+ """Extracts a specified number of frames evenly from a video using OpenCV, and resizes them."""
48
  frames = []
49
+ cap = cv2.VideoCapture(video_path)
50
+ if not cap.isOpened():
51
+ st.error("Error: Could not open video file.")
52
+ return []
 
 
 
53
 
54
+ total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
55
+ if total_frames == 0:
56
+ st.warning("Video appears to have 0 frames. Please check the video file.")
57
+ cap.release()
 
 
 
 
 
 
 
 
58
  return []
59
+
60
+ step = max(1, total_frames // num_frames_to_extract)
61
+
62
+ extracted_count = 0
63
+ for i in range(0, total_frames, step):
64
+ if extracted_count >= num_frames_to_extract:
65
+ break
66
+ cap.set(cv2.CAP_PROP_POS_FRAMES, i)
67
+ ret, frame = cap.read()
68
+ if ret:
69
+ frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
70
+ pil_image = Image.fromarray(frame_rgb)
71
+
72
+ # Resize for BLIP
73
+ pil_image_resized = pil_image.resize((target_size, target_size), Image.Resampling.LANCZOS)
74
+ frames.append(pil_image_resized)
75
+ extracted_count += 1
76
+ else:
77
+ # Could mean end of video or read error
78
+ break
79
+
80
+ cap.release()
81
+ if not frames and num_frames_to_extract > 0:
82
+ st.warning(f"Could not extract any frames. Tried to extract {num_frames_to_extract} frames with step {step} from {total_frames} total frames.")
83
  return frames
84
 
85
+
86
  def generate_enhanced_prompt(base_description, context_style="cinematic"):
 
 
 
87
  base = base_description.lower().strip().replace("a photo of ", "").replace("an image of ", "")
 
 
88
  action_sounds = {
89
  "walking": "footsteps, rhythmic, on {surface}", "running": "rapid footsteps, heavy breathing, on {surface}",
90
  "driving": "engine rumble, tire sounds on {surface}", "talking": "clear voices, conversational tone",
 
107
  "office": "office hum, keyboard typing, distant chatter",
108
  "street": "city street sounds, traffic, distant sirens, pedestrian chatter",
109
  "forest": "forest ambience, rustling leaves, distant bird calls, twigs snapping",
110
+ "beach": "ocean waves crashing, seagulls, gentle wind", "cave": "echoing drips, damp air, low rumble",
111
+ "space": "eerie silence, low hum, occasional electronic beep", "underwater": "muffled sounds, bubbling, deep water pressure"
 
 
112
  }
113
  sound_qualities = {
114
+ "cinematic": "high quality, clear, immersive, dynamic range", "realistic": "natural, authentic, detailed",
115
+ "cartoon": "exaggerated, playful, boings, zips", "ominous": "low rumble, dissonant, suspenseful",
 
 
116
  "peaceful": "gentle, calming, serene"
117
  }
118
+ found_elements, prompt_parts = [], [f"A {context_style} soundscape of:"]
 
 
 
 
119
  for action_keyword, sound_desc in action_sounds.items():
120
  if action_keyword in base:
121
+ surface = "generic surface"; ("grass" in base and (surface := "grass")) or (("wood" in base or "floor" in base) and (surface := "wooden floor")) or (("concrete" in base or "pavement" in base) and (surface := "concrete")) or (("water" in base) and (surface := "water"))
122
+ prompt_parts.append(sound_desc.format(surface=surface)); found_elements.append(action_keyword); break
 
 
 
 
 
 
 
 
 
123
  for obj_keyword, sound_desc in object_sounds.items():
124
  if obj_keyword in base and obj_keyword not in found_elements:
125
+ prompt_parts.append(sound_desc); found_elements.append(obj_keyword)
126
+ if len(found_elements) > (1 if any(ak in found_elements for ak in action_sounds) else 2): break
 
 
 
 
 
127
  added_env = False
128
  for env_keyword, sound_desc in environment_ambience.items():
129
+ if env_keyword in base: prompt_parts.append(f"environment: {sound_desc}"); added_env = True; break
130
+ if not found_elements and not added_env: prompt_parts.append(f"subtle ambient sound related to '{base}'")
131
+ prompt_parts.append(sound_qualities.get(context_style, sound_qualities["cinematic"]))
 
 
 
 
 
 
 
 
 
 
 
132
  return ", ".join(prompt_parts) + "."
133
 
134
 
135
  def generate_audio(text_prompt, negative_prompt, duration_s, guidance_scale, top_k, top_p, device):
 
136
  musicgen_processor, musicgen_model = load_musicgen_model()
137
+ inputs = musicgen_processor(text=[text_prompt], negative_prompt=[negative_prompt] if negative_prompt else None, padding=True, return_tensors="pt").to(device)
 
 
 
 
 
 
 
 
 
 
 
 
138
  max_new_tokens = int(duration_s * musicgen_model.config.audio_encoder.frame_rate)
139
+ if max_new_tokens > 1500: # MusicGen small context limit
140
+ st.warning(f"Requested SFX duration ({duration_s}s) is long. Capping to ~30s for musicgen-small to ensure stability.")
141
  max_new_tokens = 1500
142
 
143
+ audio_values = musicgen_model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=True, guidance_scale=guidance_scale, top_k=top_k, top_p=top_p)
 
 
 
 
 
 
 
144
  audio_array = audio_values[0].cpu().numpy()
145
+ if audio_array.ndim > 1: audio_array = np.mean(audio_array, axis=0)
146
+ audio_array = audio_array / (np.max(np.abs(audio_array)) + 1e-6) * 0.9 # Normalize, avoid div by zero
 
 
 
 
147
  audio_array = np.clip(audio_array, -1.0, 1.0)
 
148
  return audio_array, musicgen_model.config.audio_encoder.sampling_rate
149
 
150
+ def sync_audio_to_video(video_path, audio_path, output_path, mix_original, original_vol, sfx_vol, encoding_preset):
 
151
  video_clip = mpy.VideoFileClip(video_path)
152
  generated_audio_clip = mpy.AudioFileClip(audio_path)
153
  video_duration = video_clip.duration
154
+ final_generated_audio = generated_audio_clip.subclip(0, video_duration) if generated_audio_clip.duration >= video_duration else mpy.concatenate_audioclips([generated_audio_clip] * int(np.ceil(video_duration / generated_audio_clip.duration))).subclip(0, video_duration)
 
 
 
 
 
 
 
 
 
 
 
155
  final_generated_audio = final_generated_audio.volumex(sfx_vol)
 
 
156
  if mix_original and video_clip.audio:
157
  original_audio = video_clip.audio.volumex(original_vol)
158
  composite_audio = mpy.CompositeAudioClip([original_audio, final_generated_audio])
159
  final_video = video_clip.set_audio(composite_audio)
160
  else:
161
  final_video = video_clip.set_audio(final_generated_audio)
162
+
163
+ # Use more threads for encoding, and the selected preset
164
+ num_threads = os.cpu_count() or 2
165
+ st.write(f"MoviePy encoding with preset: '{encoding_preset}', threads: {num_threads}")
166
+
167
+ final_video.write_videofile(output_path, codec="libx264", audio_codec="aac", preset=encoding_preset,
168
+ bitrate="4000k", # Reduced bitrate slightly for faster presets
169
+ audio_bitrate="192k", threads=num_threads, logger='bar')
170
+ # Close clips to free resources
 
 
 
171
  video_clip.close()
172
  generated_audio_clip.close()
173
  if 'original_audio' in locals(): original_audio.close()
 
178
 
179
  # --- Streamlit UI ---
180
  st.set_page_config(layout="wide", page_title="Video To SoundFX Generator")
181
+ st.title("⚑ Speedy Video To SoundFX Generator 🎢")
182
+ st.markdown("Upload an MP4 video. This tool generates sound effects and syncs them. **Optimized for speed!**")
183
+ st.markdown("> For *truly* fast performance, especially the AI parts, ensure your Hugging Face Space is running on **GPU hardware**.")
184
 
185
  # --- Sidebar for Settings ---
186
  with st.sidebar:
187
  st.header("βš™οΈ Generation Settings")
188
 
189
  num_frames_to_analyze = st.slider(
190
+ "Number of Frames to Analyze (Fewer = Faster)", 1, 5, 2, # Reduced max and default
191
+ help=f"Fewer frames speed up analysis. Frames are resized to {BLIP_PROCESS_SIZE}x{BLIP_PROCESS_SIZE} before analysis."
192
  )
193
 
194
+ video_encoding_preset = st.selectbox(
195
+ "Video Encoding Speed (Faster = Lower Quality/Larger File)",
196
+ ('ultrafast', 'superfast', 'veryfast', 'faster', 'fast', 'medium'),
197
+ index=1, # Default to 'superfast'
198
+ help="Controls video encoding speed. 'ultrafast' is quickest but may reduce quality or increase file size. 'medium' is balanced."
199
  )
200
+
201
+ prompt_style = st.selectbox("Sound Style for Prompt",
202
+ ["cinematic", "realistic", "cartoon", "ominous", "peaceful"], index=0)
203
 
204
  st.subheader("Audio Mixing")
205
  mix_original_audio = st.checkbox("Mix with Original Video Audio", value=True)
206
+ original_audio_volume = st.slider("Original Audio Volume", 0.0, 1.0, 0.4, disabled=not mix_original_audio)
207
+ sfx_audio_volume = st.slider("Generated SFX Volume", 0.0, 1.0, 0.9)
 
 
 
 
 
 
 
208
 
209
+ with st.expander("Advanced MusicGen Settings (Impacts Speed & Quality)"):
210
+ generated_sfx_duration = st.slider("Target SFX Clip Duration (s) (Shorter = Faster)", 3, 20, 8, # Reduced max & default
211
+ help="Shorter base SFX clips generate faster. Max ~30s for musicgen-small.")
212
+ guidance_scale = st.slider("Guidance Scale (CFG)", 1.0, 7.0, 3.0)
213
+ top_k = st.slider("Top-K Sampling", 0, 250, 50)
214
+ top_p = st.slider("Top-P (Nucleus) Sampling", 0.0, 1.0, 0.95)
215
+ negative_prompt_text = st.text_input("Negative Prompt (Optional)", placeholder="e.g., low quality, noisy, music")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
216
 
217
  # --- Main Area for Upload and Results ---
218
+ uploaded_file = st.file_uploader("πŸ“€ Upload an MP4 Video (Shorter videos process faster!)", type=["mp4"])
219
 
220
  if uploaded_file:
 
221
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
222
+ st.write(f"Using device: `{device}`. {'GPU detected: ' + torch.cuda.get_device_name(0) if device.type == 'cuda' else 'Warning: CPU processing will be slow for AI models.'}")
 
 
223
 
 
224
  temp_video_path = os.path.join(TEMP_DIR, f"temp_video_{uploaded_file.name}")
225
  temp_audio_path = os.path.join(TEMP_DIR, "temp_generated_audio.wav")
226
  output_video_path = os.path.join(TEMP_DIR, f"output_{uploaded_file.name}")
227
 
228
  try:
229
+ with open(temp_video_path, "wb") as f: f.write(uploaded_file.getbuffer())
230
+ progress_bar = st.progress(0); status_text = st.empty()
231
 
232
+ status_text.info("⏳ 1/5: Extracting & resizing frames...")
233
+ frames = extract_frames_from_video_cv2(temp_video_path, num_frames_to_analyze, BLIP_PROCESS_SIZE)
234
+ if not frames: st.error("Could not extract frames."); st.stop()
 
 
 
 
 
 
235
  progress_bar.progress(20)
236
 
237
+ status_text.info("⏳ 2/5: Analyzing frames (BLIP)...")
 
238
  blip_processor, blip_model = load_blip_model()
239
  descriptions = []
240
+ raw_blip_captions = [] # For display
241
  for i, frame_pil in enumerate(frames):
242
  inputs = blip_processor(images=frame_pil, return_tensors="pt")
243
+ if device.type == 'cuda': inputs = {k: v.to(device) for k, v in inputs.items()}
244
+ with torch.no_grad(): out = blip_model.generate(**inputs, max_new_tokens=30) # Shorter BLIP captions
 
 
 
 
245
  base_desc = blip_processor.decode(out[0], skip_special_tokens=True)
246
+ raw_blip_captions.append(base_desc)
247
  enhanced_desc = generate_enhanced_prompt(base_desc, prompt_style)
248
  descriptions.append(enhanced_desc)
249
+ status_text.info(f"⏳ 2/5: Frame {i+1}/{len(frames)} analyzed.")
250
+ combined_description = " Then, ".join(descriptions); progress_bar.progress(40)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
251
 
252
+ st.subheader("πŸ“ Generated Sound Prompt (Editable)")
253
+ editable_prompt = st.text_area("Sound Effect Prompt:", value=combined_description, height=100)
 
 
 
 
 
 
 
 
 
 
 
254
 
255
+ if st.button("✨ Generate Sound & Sync Video (FAST MODE)", type="primary"):
256
+ if not editable_prompt.strip(): st.error("Prompt cannot be empty!"); st.stop()
257
+
258
+ status_text.info(f"⏳ 3/5: Generating sound (MusicGen)...")
259
+ audio_array, sample_rate = generate_audio(editable_prompt, negative_prompt_text, generated_sfx_duration, guidance_scale, top_k, top_p, device)
260
+ sf.write(temp_audio_path, audio_array, sample_rate); progress_bar.progress(60)
261
+
262
+ status_text.info(f"⏳ 4/5: Syncing audio with video (MoviePy @ {video_encoding_preset})...")
263
+ with st.spinner(f"MoviePy is encoding (preset: {video_encoding_preset})... This is the slowest part for CPU users."):
264
+ sync_audio_to_video(temp_video_path, temp_audio_path, output_video_path, mix_original_audio, original_audio_volume, sfx_audio_volume, video_encoding_preset)
265
+ progress_bar.progress(90)
266
+
267
+ status_text.success("βœ… 5/5: Processing Complete!")
268
+ st.subheader("πŸŽ‰ Your Sound-Enhanced Video:")
269
+ try:
270
+ with open(output_video_path, 'rb') as vf: video_bytes = vf.read()
271
+ st.video(video_bytes)
272
+ st.download_button("πŸ“₯ Download Synced Video", video_bytes, f"sfx_synced_{uploaded_file.name}", "video/mp4")
273
+ except FileNotFoundError: st.error("Output video file not found.")
274
+ except Exception as e: st.error(f"Error displaying video: {e}")
275
+ progress_bar.progress(100)
276
+
277
+ with st.expander("Generation Details", expanded=False):
278
+ st.write("**Original BLIP Captions (on resized frames):**")
279
+ for i, cap in enumerate(raw_blip_captions): st.markdown(f"- Frame {i+1}: `{cap}`")
280
+ st.write(f"**Final Prompt Used for MusicGen:** `{editable_prompt}`")
281
+ if negative_prompt_text: st.write(f"**Negative Prompt Used:** `{negative_prompt_text}`")
282
+ st.write(f"**Base SFX Duration:** {generated_sfx_duration}s")
283
+ st.write(f"**MusicGen Sampling Rate:** {sample_rate} Hz")
284
+ st.write(f"**Video Encoding Preset:** {video_encoding_preset}")
 
 
285
 
286
  except Exception as e:
287
  st.error(f"An unexpected error occurred: {e}")
288
+ st.error("Troubleshooting: Try a shorter video, fewer frames, or 'ultrafast' encoding. Ensure GPU hardware on Spaces for AI speed.")
 
 
 
 
289
  import traceback
290
  st.code(traceback.format_exc())
 
 
291
  finally:
292
+ for p in [temp_video_path, temp_audio_path, output_video_path]:
293
+ if os.path.exists(p):
294
+ try: os.remove(p)
295
+ except Exception as e: st.warning(f"Could not remove temp file {p}: {e}")
 
 
 
296
  else:
297
+ st.info("πŸ‘‹ Welcome! Upload an MP4 to get started. Adjust settings in the sidebar for speed/quality.")
298
  st.markdown("""
299
+ **How it's faster:**
300
+ 1. **OpenCV & Frame Resizing:** Faster frame grabbing, smaller frames for quicker AI analysis (BLIP).
301
+ 2. **Encoding Presets:** Choose 'ultrafast' or 'superfast' for quicker video output (MoviePy).
302
+ 3. **Optimized Defaults:** Fewer frames analyzed & shorter SFX clip by default.
303
+ 4. **GPU Strongly Recommended:** For AI models (BLIP, MusicGen), GPU hardware on Hugging Face Spaces is key for true speed.
 
 
 
 
 
 
 
 
304
  """)