garyuzair commited on
Commit
3cdf79d
·
verified ·
1 Parent(s): 10506b0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +191 -268
app.py CHANGED
@@ -1,6 +1,5 @@
1
  import streamlit as st
2
- # import imageio # Replaced by cv2
3
- import cv2 # For faster frame extraction
4
  import numpy as np
5
  from PIL import Image
6
  from transformers import AutoProcessor, BlipForConditionalGeneration, MusicgenForConditionalGeneration
@@ -13,292 +12,216 @@ import tempfile
13
  try:
14
  import moviepy.editor as mpy
15
  except ModuleNotFoundError:
16
- st.error("The 'moviepy' library is not installed. Please ensure 'moviepy==1.0.3' (or a compatible version) is in your requirements.txt and the environment is rebuilt.")
17
  st.stop()
18
 
19
# --- Constants ---
TEMP_DIR = tempfile.gettempdir()  # scratch directory for uploaded/generated media files
DEFAULT_SAMPLING_RATE = 32000 # MusicGen default output sampling rate in Hz
BLIP_PROCESS_SIZE = 384 # Resize frames to this square size (px) before BLIP captioning
23
 
24
- # --- Model Loading (Cached) ---
25
@st.cache_resource
def load_blip_model():
    """Load and cache the BLIP captioning processor/model pair.

    Moves the model to GPU in fp16 when CUDA is available. Cached by
    Streamlit so the download/initialization happens once per session.
    """
    st.write("Loading BLIP model for image captioning...")
    blip_processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
    blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
    if torch.cuda.is_available():
        blip_model = blip_model.half().to("cuda")
    st.write("BLIP model loaded.")
    return blip_processor, blip_model
34
 
35
@st.cache_resource
def load_musicgen_model():
    """Load and cache the MusicGen-small processor/model pair.

    Uses fp16 on GPU when CUDA is available; cached across Streamlit reruns.
    """
    st.write("Loading MusicGen model for sound generation...")
    mg_processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
    mg_model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
    if torch.cuda.is_available():
        mg_model = mg_model.half().to("cuda")
    st.write("MusicGen model loaded.")
    return mg_processor, mg_model
44
-
45
- # --- Core Functions ---
46
def extract_frames_from_video_cv2(video_path, num_frames_to_extract, target_size=BLIP_PROCESS_SIZE):
    """Extract up to `num_frames_to_extract` evenly spaced frames from a video.

    Frames are converted BGR->RGB and resized to a target_size square PIL
    image (the input size BLIP expects).

    Returns a (possibly empty) list of PIL.Image objects. Emits Streamlit
    errors/warnings instead of raising on unreadable videos.
    """
    # Guard: a non-positive request would make the `//` below raise
    # ZeroDivisionError (the slider normally enforces >= 1, but be safe).
    if num_frames_to_extract <= 0:
        return []

    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        st.error("Error: Could not open video file.")
        return []

    frames = []
    try:
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if total_frames == 0:
            st.warning("Video appears to have 0 frames. Please check the video file.")
            return []

        step = max(1, total_frames // num_frames_to_extract)
        for i in range(0, total_frames, step):
            if len(frames) >= num_frames_to_extract:
                break
            # Seek to the target frame. NOTE(review): CAP_PROP_FRAME_COUNT can
            # be approximate for some codecs, hence the read-failure fallback.
            cap.set(cv2.CAP_PROP_POS_FRAMES, i)
            ret, frame = cap.read()
            if not ret:
                break  # end of stream or decode error
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            pil_image = Image.fromarray(frame_rgb).resize(
                (target_size, target_size), Image.Resampling.LANCZOS
            )
            frames.append(pil_image)
    finally:
        # Always release the capture handle, even if a read/convert raises
        # (the original leaked it on any exception inside the loop).
        cap.release()

    if not frames and num_frames_to_extract > 0:
        st.warning(f"Could not extract any frames. Tried to extract {num_frames_to_extract} frames with step {step} from {total_frames} total frames.")
    return frames
84
-
85
-
86
def _detect_surface(base):
    """Best-effort guess of the walking/driving surface named in a caption."""
    if "grass" in base:
        return "grass"
    if "wood" in base or "floor" in base:
        return "wooden floor"
    if "concrete" in base or "pavement" in base:
        return "concrete"
    if "water" in base:
        return "water"
    return "generic surface"


def generate_enhanced_prompt(base_description, context_style="cinematic"):
    """Turn a raw BLIP caption into a richer sound-design prompt for MusicGen.

    Matches keywords in the caption against action/object/environment tables
    (first match wins per table, in declaration order), appends a style
    quality descriptor, and joins everything into one comma-separated prompt.

    Args:
        base_description: caption text from BLIP, e.g. "a photo of a dog running".
        context_style: one of the `sound_qualities` keys; unknown values fall
            back to "cinematic".

    Returns:
        A single prompt string ending with ".".
    """
    base = base_description.lower().strip().replace("a photo of ", "").replace("an image of ", "")
    action_sounds = {
        "walking": "footsteps, rhythmic, on {surface}", "running": "rapid footsteps, heavy breathing, on {surface}",
        "driving": "engine rumble, tire sounds on {surface}", "talking": "clear voices, conversational tone",
        "shouting": "loud voice, urgent tone", "whispering": "soft whisper, subtle breath sounds",
        "crashing": "loud impact, debris scattering, metallic clang", "falling": "thud, impact sound, rustle of clothing",
        "flying": "whooshing sound, wind noise", "swimming": "water splashing, rhythmic strokes",
        "exploding": "deep boom, crackling, debris sounds", "door opening": "creak of a door, latch click",
        "door closing": "solid thud of a door, latch click"
    }
    object_sounds = {
        "person": "human presence, subtle movements, breathing", "dog": "dog barking, panting, collar jingle",
        "cat": "cat meow, purring, soft paw steps", "car": "engine hum, tire whine, occasional horn",
        "truck": "heavy engine drone, air brakes", "motorcycle": "motorcycle engine revving, exhaust pop",
        "train": "train horn, rhythmic clatter on tracks", "plane": "jet engine whine, distant rumble",
        "bird": "birdsong, flapping wings", "water": "flowing water, lapping waves, drips",
        "wind": "wind howling, gentle breeze through leaves", "fire": "crackling fire, burning wood"
    }
    environment_ambience = {
        "room": "indoor ambience, slight room tone, muffled distant sounds",
        "office": "office hum, keyboard typing, distant chatter",
        "street": "city street sounds, traffic, distant sirens, pedestrian chatter",
        "forest": "forest ambience, rustling leaves, distant bird calls, twigs snapping",
        "beach": "ocean waves crashing, seagulls, gentle wind", "cave": "echoing drips, damp air, low rumble",
        "space": "eerie silence, low hum, occasional electronic beep", "underwater": "muffled sounds, bubbling, deep water pressure"
    }
    sound_qualities = {
        "cinematic": "high quality, clear, immersive, dynamic range", "realistic": "natural, authentic, detailed",
        "cartoon": "exaggerated, playful, boings, zips", "ominous": "low rumble, dissonant, suspenseful",
        "peaceful": "gentle, calming, serene"
    }

    found_elements, prompt_parts = [], [f"A {context_style} soundscape of:"]

    # At most one action; the surface placeholder is filled from the caption.
    # (Rewritten from a walrus/boolean-chain one-liner — behavior unchanged.)
    for action_keyword, sound_desc in action_sounds.items():
        if action_keyword in base:
            prompt_parts.append(sound_desc.format(surface=_detect_surface(base)))
            found_elements.append(action_keyword)
            break

    # Fewer objects are allowed when an action already matched. The limit is
    # loop-invariant, so it is computed once instead of per iteration.
    max_elements = 1 if found_elements else 2
    for obj_keyword, sound_desc in object_sounds.items():
        if obj_keyword in base and obj_keyword not in found_elements:
            prompt_parts.append(sound_desc)
            found_elements.append(obj_keyword)
            if len(found_elements) > max_elements:
                break

    added_env = False
    for env_keyword, sound_desc in environment_ambience.items():
        if env_keyword in base:
            prompt_parts.append(f"environment: {sound_desc}")
            added_env = True
            break

    # Fallback so the prompt is never just the style header + quality tag.
    if not found_elements and not added_env:
        prompt_parts.append(f"subtle ambient sound related to '{base}'")

    prompt_parts.append(sound_qualities.get(context_style, sound_qualities["cinematic"]))
    return ", ".join(prompt_parts) + "."
133
-
134
-
135
def generate_audio(text_prompt, negative_prompt, duration_s, guidance_scale, top_k, top_p, device):
    """Generate a mono audio clip from a text prompt with MusicGen-small.

    Returns (audio_array, sampling_rate): a 1-D float numpy array normalized
    to roughly [-0.9, 0.9], and the model's output sampling rate in Hz.
    """
    musicgen_processor, musicgen_model = load_musicgen_model()
    # NOTE(review): forwarding `negative_prompt` assumes the MusicGen processor
    # accepts this kwarg — confirm against the installed transformers version.
    inputs = musicgen_processor(text=[text_prompt], negative_prompt=[negative_prompt] if negative_prompt else None, padding=True, return_tensors="pt").to(device)
    # Token budget: the model emits `frame_rate` tokens per second of audio.
    max_new_tokens = int(duration_s * musicgen_model.config.audio_encoder.frame_rate)
    if max_new_tokens > 1500: # MusicGen small context limit
        st.warning(f"Requested SFX duration ({duration_s}s) is long. Capping to ~30s for musicgen-small to ensure stability.")
        max_new_tokens = 1500

    audio_values = musicgen_model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=True, guidance_scale=guidance_scale, top_k=top_k, top_p=top_p)
    audio_array = audio_values[0].cpu().numpy()
    # Down-mix multi-channel output to mono by averaging channels.
    if audio_array.ndim > 1: audio_array = np.mean(audio_array, axis=0)
    audio_array = audio_array / (np.max(np.abs(audio_array)) + 1e-6) * 0.9 # Normalize, avoid div by zero
    audio_array = np.clip(audio_array, -1.0, 1.0)
    return audio_array, musicgen_model.config.audio_encoder.sampling_rate
149
-
150
def sync_audio_to_video(video_path, audio_path, output_path, mix_original, original_vol, sfx_vol, encoding_preset):
    """Mux the generated SFX track onto the video and re-encode to `output_path`.

    The generated audio is trimmed (or looped then trimmed) to the video's
    duration, optionally mixed with the original soundtrack, and encoded with
    the requested x264 preset. All MoviePy clips are closed even on failure
    (the original leaked ffmpeg readers when `write_videofile` raised).
    """
    video_clip = mpy.VideoFileClip(video_path)
    generated_audio_clip = mpy.AudioFileClip(audio_path)
    # Pre-declare so cleanup in `finally` never hits an unbound name.
    final_generated_audio = original_audio = composite_audio = final_video = None
    try:
        video_duration = video_clip.duration
        if generated_audio_clip.duration >= video_duration:
            final_generated_audio = generated_audio_clip.subclip(0, video_duration)
        else:
            # Loop the clip enough times to cover the video, then trim.
            loops = int(np.ceil(video_duration / generated_audio_clip.duration))
            final_generated_audio = mpy.concatenate_audioclips(
                [generated_audio_clip] * loops
            ).subclip(0, video_duration)
        final_generated_audio = final_generated_audio.volumex(sfx_vol)

        if mix_original and video_clip.audio:
            original_audio = video_clip.audio.volumex(original_vol)
            composite_audio = mpy.CompositeAudioClip([original_audio, final_generated_audio])
            final_video = video_clip.set_audio(composite_audio)
        else:
            final_video = video_clip.set_audio(final_generated_audio)

        # Use more threads for encoding, and the selected preset
        num_threads = os.cpu_count() or 2
        st.write(f"MoviePy encoding with preset: '{encoding_preset}', threads: {num_threads}")

        final_video.write_videofile(output_path, codec="libx264", audio_codec="aac", preset=encoding_preset,
                                    bitrate="4000k", # Reduced bitrate slightly for faster presets
                                    audio_bitrate="192k", threads=num_threads, logger='bar')
    finally:
        # Best-effort close of every clip we created, derived clips first,
        # without masking any primary exception from the encode.
        for clip in (final_video, composite_audio, original_audio,
                     final_generated_audio, generated_audio_clip, video_clip):
            if clip is not None:
                try:
                    clip.close()
                except Exception:
                    pass
177
-
178
-
179
# --- Streamlit UI ---
# Page chrome: wide layout, title and speed-focused usage notes.
st.set_page_config(layout="wide", page_title="Video To SoundFX Generator")
st.title("⚡ Speedy Video To SoundFX Generator 🎶")
st.markdown("Upload an MP4 video. This tool generates sound effects and syncs them. **Optimized for speed!**")
st.markdown("> For *truly* fast performance, especially the AI parts, ensure your Hugging Face Space is running on **GPU hardware**.")

# --- Sidebar for Settings ---
# All knobs read later by the main processing block below.
with st.sidebar:
    st.header("⚙️ Generation Settings")

    num_frames_to_analyze = st.slider(
        "Number of Frames to Analyze (Fewer = Faster)", 1, 5, 2, # Reduced max and default
        help=f"Fewer frames speed up analysis. Frames are resized to {BLIP_PROCESS_SIZE}x{BLIP_PROCESS_SIZE} before analysis."
    )

    video_encoding_preset = st.selectbox(
        "Video Encoding Speed (Faster = Lower Quality/Larger File)",
        ('ultrafast', 'superfast', 'veryfast', 'faster', 'fast', 'medium'),
        index=1, # Default to 'superfast'
        help="Controls video encoding speed. 'ultrafast' is quickest but may reduce quality or increase file size. 'medium' is balanced."
    )

    # Style key into generate_enhanced_prompt's sound_qualities table.
    prompt_style = st.selectbox("Sound Style for Prompt",
                                ["cinematic", "realistic", "cartoon", "ominous", "peaceful"], index=0)

    st.subheader("Audio Mixing")
    mix_original_audio = st.checkbox("Mix with Original Video Audio", value=True)
    original_audio_volume = st.slider("Original Audio Volume", 0.0, 1.0, 0.4, disabled=not mix_original_audio)
    sfx_audio_volume = st.slider("Generated SFX Volume", 0.0, 1.0, 0.9)

    # Sampling parameters forwarded verbatim to MusicGen's generate().
    with st.expander("Advanced MusicGen Settings (Impacts Speed & Quality)"):
        generated_sfx_duration = st.slider("Target SFX Clip Duration (s) (Shorter = Faster)", 3, 20, 8, # Reduced max & default
                                           help="Shorter base SFX clips generate faster. Max ~30s for musicgen-small.")
        guidance_scale = st.slider("Guidance Scale (CFG)", 1.0, 7.0, 3.0)
        top_k = st.slider("Top-K Sampling", 0, 250, 50)
        top_p = st.slider("Top-P (Nucleus) Sampling", 0.0, 1.0, 0.95)
        negative_prompt_text = st.text_input("Negative Prompt (Optional)", placeholder="e.g., low quality, noisy, music")

# --- Main Area for Upload and Results ---
uploaded_file = st.file_uploader("📤 Upload an MP4 Video (Shorter videos process faster!)", type=["mp4"])
 
 
 
219
 
220
if uploaded_file:
    # Pick compute device once; all model calls below use it.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    st.write(f"Using device: `{device}`. {'GPU detected: ' + torch.cuda.get_device_name(0) if device.type == 'cuda' else 'Warning: CPU processing will be slow for AI models.'}")

    # Fixed temp-file locations for this upload (cleaned up in `finally`).
    temp_video_path = os.path.join(TEMP_DIR, f"temp_video_{uploaded_file.name}")
    temp_audio_path = os.path.join(TEMP_DIR, "temp_generated_audio.wav")
    output_video_path = os.path.join(TEMP_DIR, f"output_{uploaded_file.name}")

    try:
        with open(temp_video_path, "wb") as f: f.write(uploaded_file.getbuffer())
        progress_bar = st.progress(0); status_text = st.empty()

        # Stage 1: sample frames from the uploaded video for captioning.
        status_text.info("⏳ 1/5: Extracting & resizing frames...")
        frames = extract_frames_from_video_cv2(temp_video_path, num_frames_to_analyze, BLIP_PROCESS_SIZE)
        if not frames: st.error("Could not extract frames."); st.stop()
        progress_bar.progress(20)

        # Stage 2: caption each frame with BLIP and expand captions into
        # sound-design prompts.
        status_text.info("⏳ 2/5: Analyzing frames (BLIP)...")
        blip_processor, blip_model = load_blip_model()
        descriptions = []
        raw_blip_captions = [] # For display
        for i, frame_pil in enumerate(frames):
            inputs = blip_processor(images=frame_pil, return_tensors="pt")
            if device.type == 'cuda': inputs = {k: v.to(device) for k, v in inputs.items()}
            with torch.no_grad(): out = blip_model.generate(**inputs, max_new_tokens=30) # Shorter BLIP captions
            base_desc = blip_processor.decode(out[0], skip_special_tokens=True)
            raw_blip_captions.append(base_desc)
            enhanced_desc = generate_enhanced_prompt(base_desc, prompt_style)
            descriptions.append(enhanced_desc)
            status_text.info(f"⏳ 2/5: Frame {i+1}/{len(frames)} analyzed.")
        combined_description = " Then, ".join(descriptions); progress_bar.progress(40)

        # Let the user tweak the prompt before audio generation.
        st.subheader("📝 Generated Sound Prompt (Editable)")
        editable_prompt = st.text_area("Sound Effect Prompt:", value=combined_description, height=100)

        if st.button("✨ Generate Sound & Sync Video (FAST MODE)", type="primary"):
            if not editable_prompt.strip(): st.error("Prompt cannot be empty!"); st.stop()

            # Stage 3: text -> audio with MusicGen.
            status_text.info(f"⏳ 3/5: Generating sound (MusicGen)...")
            audio_array, sample_rate = generate_audio(editable_prompt, negative_prompt_text, generated_sfx_duration, guidance_scale, top_k, top_p, device)
            sf.write(temp_audio_path, audio_array, sample_rate); progress_bar.progress(60)

            # Stage 4: mux the generated audio onto the video.
            status_text.info(f"⏳ 4/5: Syncing audio with video (MoviePy @ {video_encoding_preset})...")
            with st.spinner(f"MoviePy is encoding (preset: {video_encoding_preset})... This is the slowest part for CPU users."):
                sync_audio_to_video(temp_video_path, temp_audio_path, output_video_path, mix_original_audio, original_audio_volume, sfx_audio_volume, video_encoding_preset)
            progress_bar.progress(90)

            # Stage 5: playback + download of the result.
            status_text.success("✅ 5/5: Processing Complete!")
            st.subheader("🎉 Your Sound-Enhanced Video:")
            try:
                with open(output_video_path, 'rb') as vf: video_bytes = vf.read()
                st.video(video_bytes)
                st.download_button("📥 Download Synced Video", video_bytes, f"sfx_synced_{uploaded_file.name}", "video/mp4")
            except FileNotFoundError: st.error("Output video file not found.")
            except Exception as e: st.error(f"Error displaying video: {e}")
            progress_bar.progress(100)

            # Debug/transparency panel: intermediate captions and settings.
            with st.expander("Generation Details", expanded=False):
                st.write("**Original BLIP Captions (on resized frames):**")
                for i, cap in enumerate(raw_blip_captions): st.markdown(f"- Frame {i+1}: `{cap}`")
                st.write(f"**Final Prompt Used for MusicGen:** `{editable_prompt}`")
                if negative_prompt_text: st.write(f"**Negative Prompt Used:** `{negative_prompt_text}`")
                st.write(f"**Base SFX Duration:** {generated_sfx_duration}s")
                st.write(f"**MusicGen Sampling Rate:** {sample_rate} Hz")
                st.write(f"**Video Encoding Preset:** {video_encoding_preset}")

    except Exception as e:
        st.error(f"An unexpected error occurred: {e}")
        st.error("Troubleshooting: Try a shorter video, fewer frames, or 'ultrafast' encoding. Ensure GPU hardware on Spaces for AI speed.")
        import traceback
        st.code(traceback.format_exc())
    finally:
        # Best-effort cleanup of all temp files created above.
        for p in [temp_video_path, temp_audio_path, output_video_path]:
            if os.path.exists(p):
                try: os.remove(p)
                except Exception as e: st.warning(f"Could not remove temp file {p}: {e}")
else:
    st.info("👋 Welcome! Upload an MP4 to get started. Adjust settings in the sidebar for speed/quality.")
    st.markdown("""
**How it's faster:**
1. **OpenCV & Frame Resizing:** Faster frame grabbing, smaller frames for quicker AI analysis (BLIP).
2. **Encoding Presets:** Choose 'ultrafast' or 'superfast' for quicker video output (MoviePy).
3. **Optimized Defaults:** Fewer frames analyzed & shorter SFX clip by default.
4. **GPU Strongly Recommended:** For AI models (BLIP, MusicGen), GPU hardware on Hugging Face Spaces is key for true speed.
""")
 
1
  import streamlit as st
2
+ import imageio
 
3
  import numpy as np
4
  from PIL import Image
5
  from transformers import AutoProcessor, BlipForConditionalGeneration, MusicgenForConditionalGeneration
 
12
  try:
13
  import moviepy.editor as mpy
14
  except ModuleNotFoundError:
15
+ st.error("The 'moviepy' library is not installed. Please ensure 'moviepy==1.0.3' is in requirements.txt and installed.")
16
  st.stop()
17
 
18
# Set page title and instructions
st.title("Story Video Sound Effect Sync Generator")
st.write("Upload an MP4 video to auto-generate and sync a high-quality sound effect.")

# User-configurable settings
# `num_frames_to_extract` bounds the captioning work; `mix_original_audio`
# is read by the sync step below.
num_frames_to_extract = st.slider("Number of frames to analyze", 1, 3, 1, help="Fewer frames = faster processing")
mix_original_audio = st.checkbox("Mix with original audio", value=False, help="Blend sound effect with video’s original sound")
 
 
 
 
 
 
 
25
 
26
# Enhanced prompt generation function
def enhance_prompt(base_description):
    """Expand a raw BLIP caption into a richer, sound-oriented prompt.

    Keyword tables map the first matching action, object, and environment
    found in the caption to a sound phrase; the pieces are appended to the
    lower-cased caption in that order.
    """
    caption = base_description.lower().strip()

    # Keyword -> sound-phrase tables (first match per table wins, in order).
    action_sounds = {
        "walk": "crisp footsteps on a wooden floor",
        "run": "rapid footsteps and heavy breathing",
        "drive": "engine roar and tires screeching",
        "talk": "soft voices and background murmur",
        "crash": "loud crash and debris scattering",
        "fall": "thud of impact and rustling debris"
    }
    object_sounds = {
        "person": "human activity with subtle breathing",
        "dog": "playful barks and pawsteps",
        "car": "mechanical hum and tire friction",
        "tree": "rustling leaves in a breeze",
        "forest": "gentle wind and distant bird calls"
    }
    ambient_sounds = {
        "room": "echoing footsteps and muffled sounds",
        "street": "distant traffic and urban hum",
        "forest": "wind through trees and twigs snapping",
        "outside": "open air with faint wind"
    }

    # Action sound, or a quiet default when no action keyword appears.
    matched_action = next(
        (sfx for keyword, sfx in action_sounds.items() if keyword in caption), None
    )
    description = matched_action if matched_action is not None else "subtle ambient hum"

    # Append the first matching object sound, if any.
    matched_object = next(
        (sfx for keyword, sfx in object_sounds.items() if keyword in caption), None
    )
    if matched_object is not None:
        description += f" and {matched_object}"

    # Append the first matching environment ambience, if any.
    matched_env = next(
        ((keyword, sfx) for keyword, sfx in ambient_sounds.items() if keyword in caption),
        None,
    )
    if matched_env is not None:
        env_name, env_sfx = matched_env
        description += f" in a {env_name} with {env_sfx}"

    return f"{caption} with {description}"
 
77
 
78
# File uploader for video (processing below runs only once a file is present)
uploaded_file = st.file_uploader("Upload an MP4 video (high resolution)", type=["mp4"])
 
80
 
81
if uploaded_file is not None:
    # Pre-declare temp paths so the `finally` cleanup is safe even when an
    # exception fires before a given file has been created. (The original
    # built `[temp_video_path, temp_audio_path, output_path]` in `finally`,
    # which raised NameError on any early failure.)
    temp_video_path = temp_audio_path = output_path = None
    try:
        # Persist the upload to a temporary .mp4 so ffmpeg/imageio can read it.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as temp_video:
            temp_video.write(uploaded_file.getbuffer())
            temp_video_path = temp_video.name

        # Progress bar setup
        progress_bar = st.progress(0)
        status_text = st.empty()

        # Extract a few evenly spaced frames for captioning.
        status_text.text("Extracting frames...")
        video = imageio.get_reader(temp_video_path, "ffmpeg")
        try:
            # count_frames() avoids decoding the entire stream just to count
            # it, which is what len(list(video.iter_data())) did.
            total_frames = video.count_frames()
            step = max(1, total_frames // num_frames_to_extract)
            frames = [
                Image.fromarray(video.get_data(i))
                for i in range(0, min(total_frames, num_frames_to_extract * step), step)
            ][:num_frames_to_extract]
        finally:
            video.close()  # release the ffmpeg reader even if extraction fails
        progress_bar.progress(20)

        # Load BLIP model (cached by Streamlit across reruns)
        @st.cache_resource
        def load_blip_model():
            processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
            model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
            if torch.cuda.is_available():
                model = model.half().to("cuda")
            return processor, model

        processor, model = load_blip_model()

        # Generate and enhance text descriptions
        status_text.text("Analyzing frames...")
        descriptions = []
        for i, frame in enumerate(frames):
            inputs = processor(images=frame, return_tensors="pt")
            if torch.cuda.is_available():
                inputs = {k: v.to("cuda") for k, v in inputs.items()}
            out = model.generate(**inputs)
            base_description = processor.decode(out[0], skip_special_tokens=True)
            enhanced_description = enhance_prompt(base_description)
            descriptions.append(enhanced_description)
            progress_bar.progress(20 + int(30 * (i + 1) / len(frames)))

        text_prompt = ". ".join(descriptions)
        st.write("Enhanced text prompt:", text_prompt)

        # Load MusicGen model (cached)
        @st.cache_resource
        def load_musicgen_model():
            processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
            model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
            if torch.cuda.is_available():
                model = model.half().to("cuda")
            return processor, model

        musicgen_processor, musicgen_model = load_musicgen_model()

        # Generate sound effect (~8 seconds)
        status_text.text("Generating sound effect...")
        inputs = musicgen_processor(
            text=[text_prompt],
            padding=True,
            return_tensors="pt",
        )
        if torch.cuda.is_available():
            inputs = {k: v.to("cuda") for k, v in inputs.items()}
        audio_values = musicgen_model.generate(
            **inputs,
            max_new_tokens=256,
            do_sample=True,
            guidance_scale=3.0,
            top_k=50,
            top_p=0.95
        )
        audio_array = audio_values[0].cpu().numpy()
        if audio_array.ndim > 1:
            # Down-mix channels to mono by averaging; flatten() interleaved
            # channel samples and garbled the audio.
            audio_array = np.mean(audio_array, axis=0)
        # Normalize to ~90% full scale; epsilon guards an all-silent clip
        # against division by zero.
        audio_array = audio_array / (np.max(np.abs(audio_array)) + 1e-6) * 0.9
        audio_array = np.clip(audio_array, -1.0, 1.0)
        sample_rate = 32000  # MusicGen-small output rate (Hz)
        progress_bar.progress(60)

        # Save temporary audio
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
            sf.write(temp_audio.name, audio_array, sample_rate)
            temp_audio_path = temp_audio.name

        # Synchronize with video using mpy
        status_text.text("Syncing audio with video...")
        video_clip = mpy.VideoFileClip(temp_video_path)
        video_duration = video_clip.duration
        audio_clip = mpy.AudioFileClip(temp_audio_path)

        # Adjust audio length: loop short clips to cover the video, then trim.
        if audio_clip.duration < video_duration:
            loops_needed = int(np.ceil(video_duration / audio_clip.duration))
            audio_clip = mpy.concatenate_audioclips([audio_clip] * loops_needed).subclip(0, video_duration)
        else:
            audio_clip = audio_clip.subclip(0, video_duration)

        # Mix or replace audio
        if mix_original_audio and video_clip.audio:
            # NOTE(review): `+` on moviepy audio clips may concatenate rather
            # than mix depending on version — CompositeAudioClip is the
            # documented mixer; verify before relying on this branch.
            final_audio = video_clip.audio.volumex(0.5) + audio_clip.volumex(0.5)
        else:
            final_audio = audio_clip

        # Set audio to video
        final_video = video_clip.set_audio(final_audio)

        # Save final video with high quality
        output_path = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
        final_video.write_videofile(
            output_path,
            codec="libx264",
            audio_codec="aac",
            preset="medium", # Better quality than ultrafast
            bitrate="8000k", # Higher bitrate for video quality
            audio_bitrate="192k", # Good audio quality
            temp_audiofile="temp-audio.m4a",
            remove_temp=True
        )
        progress_bar.progress(90)

        # Provide playback and download
        status_text.text("Done!")
        st.video(output_path)
        with open(output_path, "rb") as video_file:
            st.download_button(
                label="Download Synced Video",
                data=video_file,
                file_name="synced_story_video.mp4",
                mime="video/mp4"
            )
        progress_bar.progress(100)

    except Exception as e:
        st.error(f"An error occurred: {str(e)}")
        st.write("Try reducing frames or uploading a smaller video.")

    finally:
        # Clean up only the temp files that were actually created.
        for path in (temp_video_path, temp_audio_path, output_path):
            if path and os.path.exists(path):
                os.remove(path)