Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,6 +1,5 @@
|
|
| 1 |
import streamlit as st
|
| 2 |
-
|
| 3 |
-
import cv2 # For faster frame extraction
|
| 4 |
import numpy as np
|
| 5 |
from PIL import Image
|
| 6 |
from transformers import AutoProcessor, BlipForConditionalGeneration, MusicgenForConditionalGeneration
|
|
@@ -13,292 +12,216 @@ import tempfile
|
|
| 13 |
try:
|
| 14 |
import moviepy.editor as mpy
|
| 15 |
except ModuleNotFoundError:
|
| 16 |
-
st.error("The 'moviepy' library is not installed. Please ensure 'moviepy==1.0.3'
|
| 17 |
st.stop()
|
| 18 |
|
| 19 |
-
#
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
BLIP_PROCESS_SIZE = 384 # Resize frames to this size for BLIP
|
| 23 |
|
| 24 |
-
#
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
st.write("Loading BLIP model for image captioning...")
|
| 28 |
-
processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
|
| 29 |
-
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
|
| 30 |
-
if torch.cuda.is_available():
|
| 31 |
-
model = model.half().to("cuda")
|
| 32 |
-
st.write("BLIP model loaded.")
|
| 33 |
-
return processor, model
|
| 34 |
|
| 35 |
-
|
| 36 |
-
def
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
|
| 40 |
-
if torch.cuda.is_available():
|
| 41 |
-
model = model.half().to("cuda")
|
| 42 |
-
st.write("MusicGen model loaded.")
|
| 43 |
-
return processor, model
|
| 44 |
-
|
| 45 |
-
# --- Core Functions ---
|
| 46 |
-
def extract_frames_from_video_cv2(video_path, num_frames_to_extract, target_size=BLIP_PROCESS_SIZE):
|
| 47 |
-
"""Extracts a specified number of frames evenly from a video using OpenCV, and resizes them."""
|
| 48 |
-
frames = []
|
| 49 |
-
cap = cv2.VideoCapture(video_path)
|
| 50 |
-
if not cap.isOpened():
|
| 51 |
-
st.error("Error: Could not open video file.")
|
| 52 |
-
return []
|
| 53 |
-
|
| 54 |
-
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
|
| 55 |
-
if total_frames == 0:
|
| 56 |
-
st.warning("Video appears to have 0 frames. Please check the video file.")
|
| 57 |
-
cap.release()
|
| 58 |
-
return []
|
| 59 |
-
|
| 60 |
-
step = max(1, total_frames // num_frames_to_extract)
|
| 61 |
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
pil_image = Image.fromarray(frame_rgb)
|
| 71 |
-
|
| 72 |
-
# Resize for BLIP
|
| 73 |
-
pil_image_resized = pil_image.resize((target_size, target_size), Image.Resampling.LANCZOS)
|
| 74 |
-
frames.append(pil_image_resized)
|
| 75 |
-
extracted_count += 1
|
| 76 |
-
else:
|
| 77 |
-
# Could mean end of video or read error
|
| 78 |
-
break
|
| 79 |
-
|
| 80 |
-
cap.release()
|
| 81 |
-
if not frames and num_frames_to_extract > 0:
|
| 82 |
-
st.warning(f"Could not extract any frames. Tried to extract {num_frames_to_extract} frames with step {step} from {total_frames} total frames.")
|
| 83 |
-
return frames
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
def generate_enhanced_prompt(base_description, context_style="cinematic"):
|
| 87 |
-
base = base_description.lower().strip().replace("a photo of ", "").replace("an image of ", "")
|
| 88 |
-
action_sounds = {
|
| 89 |
-
"walking": "footsteps, rhythmic, on {surface}", "running": "rapid footsteps, heavy breathing, on {surface}",
|
| 90 |
-
"driving": "engine rumble, tire sounds on {surface}", "talking": "clear voices, conversational tone",
|
| 91 |
-
"shouting": "loud voice, urgent tone", "whispering": "soft whisper, subtle breath sounds",
|
| 92 |
-
"crashing": "loud impact, debris scattering, metallic clang", "falling": "thud, impact sound, rustle of clothing",
|
| 93 |
-
"flying": "whooshing sound, wind noise", "swimming": "water splashing, rhythmic strokes",
|
| 94 |
-
"exploding": "deep boom, crackling, debris sounds", "door opening": "creak of a door, latch click",
|
| 95 |
-
"door closing": "solid thud of a door, latch click"
|
| 96 |
-
}
|
| 97 |
-
object_sounds = {
|
| 98 |
-
"person": "human presence, subtle movements, breathing", "dog": "dog barking, panting, collar jingle",
|
| 99 |
-
"cat": "cat meow, purring, soft paw steps", "car": "engine hum, tire whine, occasional horn",
|
| 100 |
-
"truck": "heavy engine drone, air brakes", "motorcycle": "motorcycle engine revving, exhaust pop",
|
| 101 |
-
"train": "train horn, rhythmic clatter on tracks", "plane": "jet engine whine, distant rumble",
|
| 102 |
-
"bird": "birdsong, flapping wings", "water": "flowing water, lapping waves, drips",
|
| 103 |
-
"wind": "wind howling, gentle breeze through leaves", "fire": "crackling fire, burning wood"
|
| 104 |
}
|
| 105 |
-
|
| 106 |
-
"
|
| 107 |
-
"
|
| 108 |
-
"
|
| 109 |
-
"
|
| 110 |
-
"
|
| 111 |
-
"space": "eerie silence, low hum, occasional electronic beep", "underwater": "muffled sounds, bubbling, deep water pressure"
|
| 112 |
}
|
| 113 |
-
|
| 114 |
-
"
|
| 115 |
-
"
|
| 116 |
-
"
|
|
|
|
| 117 |
}
|
| 118 |
-
found_elements, prompt_parts = [], [f"A {context_style} soundscape of:"]
|
| 119 |
-
for action_keyword, sound_desc in action_sounds.items():
|
| 120 |
-
if action_keyword in base:
|
| 121 |
-
surface = "generic surface"; ("grass" in base and (surface := "grass")) or (("wood" in base or "floor" in base) and (surface := "wooden floor")) or (("concrete" in base or "pavement" in base) and (surface := "concrete")) or (("water" in base) and (surface := "water"))
|
| 122 |
-
prompt_parts.append(sound_desc.format(surface=surface)); found_elements.append(action_keyword); break
|
| 123 |
-
for obj_keyword, sound_desc in object_sounds.items():
|
| 124 |
-
if obj_keyword in base and obj_keyword not in found_elements:
|
| 125 |
-
prompt_parts.append(sound_desc); found_elements.append(obj_keyword)
|
| 126 |
-
if len(found_elements) > (1 if any(ak in found_elements for ak in action_sounds) else 2): break
|
| 127 |
-
added_env = False
|
| 128 |
-
for env_keyword, sound_desc in environment_ambience.items():
|
| 129 |
-
if env_keyword in base: prompt_parts.append(f"environment: {sound_desc}"); added_env = True; break
|
| 130 |
-
if not found_elements and not added_env: prompt_parts.append(f"subtle ambient sound related to '{base}'")
|
| 131 |
-
prompt_parts.append(sound_qualities.get(context_style, sound_qualities["cinematic"]))
|
| 132 |
-
return ", ".join(prompt_parts) + "."
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
def generate_audio(text_prompt, negative_prompt, duration_s, guidance_scale, top_k, top_p, device):
|
| 136 |
-
musicgen_processor, musicgen_model = load_musicgen_model()
|
| 137 |
-
inputs = musicgen_processor(text=[text_prompt], negative_prompt=[negative_prompt] if negative_prompt else None, padding=True, return_tensors="pt").to(device)
|
| 138 |
-
max_new_tokens = int(duration_s * musicgen_model.config.audio_encoder.frame_rate)
|
| 139 |
-
if max_new_tokens > 1500: # MusicGen small context limit
|
| 140 |
-
st.warning(f"Requested SFX duration ({duration_s}s) is long. Capping to ~30s for musicgen-small to ensure stability.")
|
| 141 |
-
max_new_tokens = 1500
|
| 142 |
-
|
| 143 |
-
audio_values = musicgen_model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=True, guidance_scale=guidance_scale, top_k=top_k, top_p=top_p)
|
| 144 |
-
audio_array = audio_values[0].cpu().numpy()
|
| 145 |
-
if audio_array.ndim > 1: audio_array = np.mean(audio_array, axis=0)
|
| 146 |
-
audio_array = audio_array / (np.max(np.abs(audio_array)) + 1e-6) * 0.9 # Normalize, avoid div by zero
|
| 147 |
-
audio_array = np.clip(audio_array, -1.0, 1.0)
|
| 148 |
-
return audio_array, musicgen_model.config.audio_encoder.sampling_rate
|
| 149 |
-
|
| 150 |
-
def sync_audio_to_video(video_path, audio_path, output_path, mix_original, original_vol, sfx_vol, encoding_preset):
|
| 151 |
-
video_clip = mpy.VideoFileClip(video_path)
|
| 152 |
-
generated_audio_clip = mpy.AudioFileClip(audio_path)
|
| 153 |
-
video_duration = video_clip.duration
|
| 154 |
-
final_generated_audio = generated_audio_clip.subclip(0, video_duration) if generated_audio_clip.duration >= video_duration else mpy.concatenate_audioclips([generated_audio_clip] * int(np.ceil(video_duration / generated_audio_clip.duration))).subclip(0, video_duration)
|
| 155 |
-
final_generated_audio = final_generated_audio.volumex(sfx_vol)
|
| 156 |
-
if mix_original and video_clip.audio:
|
| 157 |
-
original_audio = video_clip.audio.volumex(original_vol)
|
| 158 |
-
composite_audio = mpy.CompositeAudioClip([original_audio, final_generated_audio])
|
| 159 |
-
final_video = video_clip.set_audio(composite_audio)
|
| 160 |
-
else:
|
| 161 |
-
final_video = video_clip.set_audio(final_generated_audio)
|
| 162 |
-
|
| 163 |
-
# Use more threads for encoding, and the selected preset
|
| 164 |
-
num_threads = os.cpu_count() or 2
|
| 165 |
-
st.write(f"MoviePy encoding with preset: '{encoding_preset}', threads: {num_threads}")
|
| 166 |
-
|
| 167 |
-
final_video.write_videofile(output_path, codec="libx264", audio_codec="aac", preset=encoding_preset,
|
| 168 |
-
bitrate="4000k", # Reduced bitrate slightly for faster presets
|
| 169 |
-
audio_bitrate="192k", threads=num_threads, logger='bar')
|
| 170 |
-
# Close clips to free resources
|
| 171 |
-
video_clip.close()
|
| 172 |
-
generated_audio_clip.close()
|
| 173 |
-
if 'original_audio' in locals(): original_audio.close()
|
| 174 |
-
final_generated_audio.close()
|
| 175 |
-
if 'composite_audio' in locals(): composite_audio.close()
|
| 176 |
-
final_video.close()
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
# --- Streamlit UI ---
|
| 180 |
-
st.set_page_config(layout="wide", page_title="Video To SoundFX Generator")
|
| 181 |
-
st.title("⚡ Speedy Video To SoundFX Generator 🎶")
|
| 182 |
-
st.markdown("Upload an MP4 video. This tool generates sound effects and syncs them. **Optimized for speed!**")
|
| 183 |
-
st.markdown("> For *truly* fast performance, especially the AI parts, ensure your Hugging Face Space is running on **GPU hardware**.")
|
| 184 |
|
| 185 |
-
#
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
video_encoding_preset = st.selectbox(
|
| 195 |
-
"Video Encoding Speed (Faster = Lower Quality/Larger File)",
|
| 196 |
-
('ultrafast', 'superfast', 'veryfast', 'faster', 'fast', 'medium'),
|
| 197 |
-
index=1, # Default to 'superfast'
|
| 198 |
-
help="Controls video encoding speed. 'ultrafast' is quickest but may reduce quality or increase file size. 'medium' is balanced."
|
| 199 |
-
)
|
| 200 |
-
|
| 201 |
-
prompt_style = st.selectbox("Sound Style for Prompt",
|
| 202 |
-
["cinematic", "realistic", "cartoon", "ominous", "peaceful"], index=0)
|
| 203 |
-
|
| 204 |
-
st.subheader("Audio Mixing")
|
| 205 |
-
mix_original_audio = st.checkbox("Mix with Original Video Audio", value=True)
|
| 206 |
-
original_audio_volume = st.slider("Original Audio Volume", 0.0, 1.0, 0.4, disabled=not mix_original_audio)
|
| 207 |
-
sfx_audio_volume = st.slider("Generated SFX Volume", 0.0, 1.0, 0.9)
|
| 208 |
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
top_p = st.slider("Top-P (Nucleus) Sampling", 0.0, 1.0, 0.95)
|
| 215 |
-
negative_prompt_text = st.text_input("Negative Prompt (Optional)", placeholder="e.g., low quality, noisy, music")
|
| 216 |
|
| 217 |
-
#
|
| 218 |
-
|
|
|
|
|
|
|
|
|
|
| 219 |
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
st.write(f"Using device: `{device}`. {'GPU detected: ' + torch.cuda.get_device_name(0) if device.type == 'cuda' else 'Warning: CPU processing will be slow for AI models.'}")
|
| 223 |
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
output_video_path = os.path.join(TEMP_DIR, f"output_{uploaded_file.name}")
|
| 227 |
|
|
|
|
| 228 |
try:
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 235 |
progress_bar.progress(20)
|
| 236 |
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
base_desc = blip_processor.decode(out[0], skip_special_tokens=True)
|
| 246 |
-
raw_blip_captions.append(base_desc)
|
| 247 |
-
enhanced_desc = generate_enhanced_prompt(base_desc, prompt_style)
|
| 248 |
-
descriptions.append(enhanced_desc)
|
| 249 |
-
status_text.info(f"⏳ 2/5: Frame {i+1}/{len(frames)} analyzed.")
|
| 250 |
-
combined_description = " Then, ".join(descriptions); progress_bar.progress(40)
|
| 251 |
-
|
| 252 |
-
st.subheader("📝 Generated Sound Prompt (Editable)")
|
| 253 |
-
editable_prompt = st.text_area("Sound Effect Prompt:", value=combined_description, height=100)
|
| 254 |
|
| 255 |
-
|
| 256 |
-
if not editable_prompt.strip(): st.error("Prompt cannot be empty!"); st.stop()
|
| 257 |
-
|
| 258 |
-
status_text.info(f"⏳ 3/5: Generating sound (MusicGen)...")
|
| 259 |
-
audio_array, sample_rate = generate_audio(editable_prompt, negative_prompt_text, generated_sfx_duration, guidance_scale, top_k, top_p, device)
|
| 260 |
-
sf.write(temp_audio_path, audio_array, sample_rate); progress_bar.progress(60)
|
| 261 |
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 276 |
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 285 |
|
| 286 |
except Exception as e:
|
| 287 |
-
st.error(f"An
|
| 288 |
-
st.
|
| 289 |
-
|
| 290 |
-
st.code(traceback.format_exc())
|
| 291 |
finally:
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
else:
|
| 297 |
-
st.info("👋 Welcome! Upload an MP4 to get started. Adjust settings in the sidebar for speed/quality.")
|
| 298 |
-
st.markdown("""
|
| 299 |
-
**How it's faster:**
|
| 300 |
-
1. **OpenCV & Frame Resizing:** Faster frame grabbing, smaller frames for quicker AI analysis (BLIP).
|
| 301 |
-
2. **Encoding Presets:** Choose 'ultrafast' or 'superfast' for quicker video output (MoviePy).
|
| 302 |
-
3. **Optimized Defaults:** Fewer frames analyzed & shorter SFX clip by default.
|
| 303 |
-
4. **GPU Strongly Recommended:** For AI models (BLIP, MusicGen), GPU hardware on Hugging Face Spaces is key for true speed.
|
| 304 |
-
""")
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
+
import imageio
|
|
|
|
| 3 |
import numpy as np
|
| 4 |
from PIL import Image
|
| 5 |
from transformers import AutoProcessor, BlipForConditionalGeneration, MusicgenForConditionalGeneration
|
|
|
|
| 12 |
try:
|
| 13 |
import moviepy.editor as mpy
|
| 14 |
except ModuleNotFoundError:
|
| 15 |
+
st.error("The 'moviepy' library is not installed. Please ensure 'moviepy==1.0.3' is in requirements.txt and installed.")
|
| 16 |
st.stop()
|
| 17 |
|
| 18 |
+
# Set page title and instructions
|
| 19 |
+
st.title("Story Video Sound Effect Sync Generator")
|
| 20 |
+
st.write("Upload an MP4 video to auto-generate and sync a high-quality sound effect.")
|
|
|
|
| 21 |
|
| 22 |
+
# User-configurable settings
|
| 23 |
+
num_frames_to_extract = st.slider("Number of frames to analyze", 1, 3, 1, help="Fewer frames = faster processing")
|
| 24 |
+
mix_original_audio = st.checkbox("Mix with original audio", value=False, help="Blend sound effect with video’s original sound")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
+
# Enhanced prompt generation function
|
| 27 |
+
def enhance_prompt(base_description):
|
| 28 |
+
"""Generate a detailed, sound-specific prompt from BLIP caption."""
|
| 29 |
+
base = base_description.lower().strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
|
| 31 |
+
# Define action, object, and environment keywords
|
| 32 |
+
actions = {
|
| 33 |
+
"walk": "crisp footsteps on a wooden floor",
|
| 34 |
+
"run": "rapid footsteps and heavy breathing",
|
| 35 |
+
"drive": "engine roar and tires screeching",
|
| 36 |
+
"talk": "soft voices and background murmur",
|
| 37 |
+
"crash": "loud crash and debris scattering",
|
| 38 |
+
"fall": "thud of impact and rustling debris"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
}
|
| 40 |
+
objects = {
|
| 41 |
+
"person": "human activity with subtle breathing",
|
| 42 |
+
"dog": "playful barks and pawsteps",
|
| 43 |
+
"car": "mechanical hum and tire friction",
|
| 44 |
+
"tree": "rustling leaves in a breeze",
|
| 45 |
+
"forest": "gentle wind and distant bird calls"
|
|
|
|
| 46 |
}
|
| 47 |
+
environments = {
|
| 48 |
+
"room": "echoing footsteps and muffled sounds",
|
| 49 |
+
"street": "distant traffic and urban hum",
|
| 50 |
+
"forest": "wind through trees and twigs snapping",
|
| 51 |
+
"outside": "open air with faint wind"
|
| 52 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
|
| 54 |
+
# Extract key elements from the caption
|
| 55 |
+
sound_description = ""
|
| 56 |
+
for action, sound in actions.items():
|
| 57 |
+
if action in base:
|
| 58 |
+
sound_description = sound
|
| 59 |
+
break
|
| 60 |
+
if not sound_description: # Default to subtle ambient if no action
|
| 61 |
+
sound_description = "subtle ambient hum"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
|
| 63 |
+
# Add object-specific sounds
|
| 64 |
+
for obj, sound in objects.items():
|
| 65 |
+
if obj in base:
|
| 66 |
+
sound_description += f" and {sound}"
|
| 67 |
+
break
|
|
|
|
|
|
|
| 68 |
|
| 69 |
+
# Add environment if detected
|
| 70 |
+
for env, sound in environments.items():
|
| 71 |
+
if env in base:
|
| 72 |
+
sound_description += f" in a {env} with {sound}"
|
| 73 |
+
break
|
| 74 |
|
| 75 |
+
# Construct final prompt
|
| 76 |
+
return f"{base} with {sound_description}"
|
|
|
|
| 77 |
|
| 78 |
+
# File uploader for video
|
| 79 |
+
uploaded_file = st.file_uploader("Upload an MP4 video (high resolution)", type=["mp4"])
|
|
|
|
| 80 |
|
| 81 |
+
if uploaded_file is not None:
|
| 82 |
try:
|
| 83 |
+
# Temporary video file
|
| 84 |
+
with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as temp_video:
|
| 85 |
+
temp_video.write(uploaded_file.getbuffer())
|
| 86 |
+
temp_video_path = temp_video.name
|
| 87 |
+
|
| 88 |
+
# Progress bar setup
|
| 89 |
+
progress_bar = st.progress(0)
|
| 90 |
+
status_text = st.empty()
|
| 91 |
+
|
| 92 |
+
# Extract frames
|
| 93 |
+
status_text.text("Extracting frames...")
|
| 94 |
+
video = imageio.get_reader(temp_video_path, "ffmpeg")
|
| 95 |
+
total_frames = len(list(video.iter_data()))
|
| 96 |
+
step = max(1, total_frames // num_frames_to_extract)
|
| 97 |
+
frames = [
|
| 98 |
+
Image.fromarray(video.get_data(i))
|
| 99 |
+
for i in range(0, min(total_frames, num_frames_to_extract * step), step)
|
| 100 |
+
][:num_frames_to_extract]
|
| 101 |
progress_bar.progress(20)
|
| 102 |
|
| 103 |
+
# Load BLIP model
|
| 104 |
+
@st.cache_resource
def load_blip_model():
    """Load and cache the BLIP captioning processor/model pair.

    Cached by Streamlit so the checkpoint is downloaded only once per
    process. When CUDA is available the model is moved to the GPU in
    half precision; otherwise it stays on CPU in full precision.
    """
    checkpoint = "Salesforce/blip-image-captioning-base"
    blip_processor = AutoProcessor.from_pretrained(checkpoint)
    blip_model = BlipForConditionalGeneration.from_pretrained(checkpoint)
    if torch.cuda.is_available():
        blip_model = blip_model.half().to("cuda")
    return blip_processor, blip_model
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
|
| 112 |
+
processor, model = load_blip_model()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
|
| 114 |
+
# Generate and enhance text descriptions
|
| 115 |
+
status_text.text("Analyzing frames...")
|
| 116 |
+
descriptions = []
|
| 117 |
+
for i, frame in enumerate(frames):
|
| 118 |
+
inputs = processor(images=frame, return_tensors="pt")
|
| 119 |
+
if torch.cuda.is_available():
|
| 120 |
+
inputs = {k: v.to("cuda") for k, v in inputs.items()}
|
| 121 |
+
out = model.generate(**inputs)
|
| 122 |
+
base_description = processor.decode(out[0], skip_special_tokens=True)
|
| 123 |
+
enhanced_description = enhance_prompt(base_description)
|
| 124 |
+
descriptions.append(enhanced_description)
|
| 125 |
+
progress_bar.progress(20 + int(30 * (i + 1) / len(frames)))
|
| 126 |
+
|
| 127 |
+
text_prompt = ". ".join(descriptions)
|
| 128 |
+
st.write("Enhanced text prompt:", text_prompt)
|
| 129 |
+
|
| 130 |
+
# Load MusicGen model
|
| 131 |
+
@st.cache_resource
def load_musicgen_model():
    """Load and cache the MusicGen (small) processor/model pair.

    Cached by Streamlit so the checkpoint is downloaded only once per
    process. When CUDA is available the model is moved to the GPU in
    half precision; otherwise it stays on CPU in full precision.
    """
    checkpoint = "facebook/musicgen-small"
    mg_processor = AutoProcessor.from_pretrained(checkpoint)
    mg_model = MusicgenForConditionalGeneration.from_pretrained(checkpoint)
    if torch.cuda.is_available():
        mg_model = mg_model.half().to("cuda")
    return mg_processor, mg_model
|
| 138 |
+
|
| 139 |
+
musicgen_processor, musicgen_model = load_musicgen_model()
|
| 140 |
+
|
| 141 |
+
# Generate sound effect (~8 seconds)
|
| 142 |
+
status_text.text("Generating sound effect...")
|
| 143 |
+
inputs = musicgen_processor(
|
| 144 |
+
text=[text_prompt],
|
| 145 |
+
padding=True,
|
| 146 |
+
return_tensors="pt",
|
| 147 |
+
)
|
| 148 |
+
if torch.cuda.is_available():
|
| 149 |
+
inputs = {k: v.to("cuda") for k, v in inputs.items()}
|
| 150 |
+
audio_values = musicgen_model.generate(
|
| 151 |
+
**inputs,
|
| 152 |
+
max_new_tokens=256,
|
| 153 |
+
do_sample=True,
|
| 154 |
+
guidance_scale=3.0,
|
| 155 |
+
top_k=50,
|
| 156 |
+
top_p=0.95
|
| 157 |
+
)
|
| 158 |
+
audio_array = audio_values[0].cpu().numpy()
|
| 159 |
+
if audio_array.ndim > 1:
|
| 160 |
+
audio_array = audio_array.flatten()
|
| 161 |
+
audio_array = audio_array / np.max(np.abs(audio_array)) * 0.9
|
| 162 |
+
audio_array = np.clip(audio_array, -1.0, 1.0)
|
| 163 |
+
sample_rate = 32000
|
| 164 |
+
progress_bar.progress(60)
|
| 165 |
+
|
| 166 |
+
# Save temporary audio
|
| 167 |
+
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
|
| 168 |
+
sf.write(temp_audio.name, audio_array, sample_rate)
|
| 169 |
+
temp_audio_path = temp_audio.name
|
| 170 |
+
|
| 171 |
+
# Synchronize with video using mpy
|
| 172 |
+
status_text.text("Syncing audio with video...")
|
| 173 |
+
video_clip = mpy.VideoFileClip(temp_video_path)
|
| 174 |
+
video_duration = video_clip.duration
|
| 175 |
+
audio_clip = mpy.AudioFileClip(temp_audio_path)
|
| 176 |
+
|
| 177 |
+
# Adjust audio length
|
| 178 |
+
if audio_clip.duration < video_duration:
|
| 179 |
+
loops_needed = int(np.ceil(video_duration / audio_clip.duration))
|
| 180 |
+
audio_clip = mpy.concatenate_audioclips([audio_clip] * loops_needed).subclip(0, video_duration)
|
| 181 |
+
else:
|
| 182 |
+
audio_clip = audio_clip.subclip(0, video_duration)
|
| 183 |
|
| 184 |
+
# Mix or replace audio
|
| 185 |
+
if mix_original_audio and video_clip.audio:
|
| 186 |
+
final_audio = video_clip.audio.volumex(0.5) + audio_clip.volumex(0.5)
|
| 187 |
+
else:
|
| 188 |
+
final_audio = audio_clip
|
| 189 |
+
|
| 190 |
+
# Set audio to video
|
| 191 |
+
final_video = video_clip.set_audio(final_audio)
|
| 192 |
+
|
| 193 |
+
# Save final video with high quality
|
| 194 |
+
output_path = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
|
| 195 |
+
final_video.write_videofile(
|
| 196 |
+
output_path,
|
| 197 |
+
codec="libx264",
|
| 198 |
+
audio_codec="aac",
|
| 199 |
+
preset="medium", # Better quality than ultrafast
|
| 200 |
+
bitrate="8000k", # Higher bitrate for video quality
|
| 201 |
+
audio_bitrate="192k", # Good audio quality
|
| 202 |
+
temp_audiofile="temp-audio.m4a",
|
| 203 |
+
remove_temp=True
|
| 204 |
+
)
|
| 205 |
+
progress_bar.progress(90)
|
| 206 |
+
|
| 207 |
+
# Provide playback and download
|
| 208 |
+
status_text.text("Done!")
|
| 209 |
+
st.video(output_path)
|
| 210 |
+
with open(output_path, "rb") as video_file:
|
| 211 |
+
st.download_button(
|
| 212 |
+
label="Download Synced Video",
|
| 213 |
+
data=video_file,
|
| 214 |
+
file_name="synced_story_video.mp4",
|
| 215 |
+
mime="video/mp4"
|
| 216 |
+
)
|
| 217 |
+
progress_bar.progress(100)
|
| 218 |
|
| 219 |
except Exception as e:
|
| 220 |
+
st.error(f"An error occurred: {str(e)}")
|
| 221 |
+
st.write("Try reducing frames or uploading a smaller video.")
|
| 222 |
+
|
|
|
|
| 223 |
finally:
|
| 224 |
+
# Clean up
|
| 225 |
+
for path in [temp_video_path, temp_audio_path, output_path]:
|
| 226 |
+
if 'path' in locals() and os.path.exists(path):
|
| 227 |
+
os.remove(path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|