garyuzair committed on
Commit
4a69d6b
·
verified ·
1 Parent(s): 5e03617

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +401 -209
app.py CHANGED
@@ -1,8 +1,8 @@
1
  import streamlit as st
2
  import torch
 
3
  from transformers import AutoProcessor, BlipForConditionalGeneration, MusicgenForConditionalGeneration
4
  import imageio
5
- import numpy as np
6
  from PIL import Image
7
  import soundfile as sf
8
  import os
@@ -10,6 +10,31 @@ import tempfile
10
  import subprocess
11
  from pydub import AudioSegment, effects
12
  import moviepy.editor as mpy
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
  # Optional scene detection
15
  scene_detect_available = True
@@ -19,273 +44,440 @@ try:
19
  except ImportError:
20
  scene_detect_available = False
21
 
22
- # Set page configuration
23
- st.set_page_config(page_title="Video Sound Effect Generator", layout="centered")
24
-
25
- # Load BLIP model for captioning
26
@st.cache_resource
def load_blip_model():
    """Load the BLIP image-captioning processor and model (cached by Streamlit).

    When CUDA is available the model is converted to half precision and moved
    to the GPU; otherwise it is returned as loaded.

    Returns:
        tuple: ``(processor, model)``.
    """
    checkpoint = "Salesforce/blip-image-captioning-base"
    processor = AutoProcessor.from_pretrained(checkpoint)
    model = BlipForConditionalGeneration.from_pretrained(checkpoint)
    if not torch.cuda.is_available():
        return processor, model
    return processor, model.half().to("cuda")
 
 
 
 
 
 
 
33
 
34
- # Load MusicGen model
35
@st.cache_resource
def load_musicgen_model(model_name="facebook/musicgen-medium"):
    """Load a MusicGen processor and model (cached by Streamlit per model name).

    Args:
        model_name: Hugging Face checkpoint identifier to load.

    Returns:
        tuple: ``(processor, model)``; the model is in half precision on the
        GPU when CUDA is available.
    """
    processor = AutoProcessor.from_pretrained(model_name)
    model = MusicgenForConditionalGeneration.from_pretrained(model_name)
    if not torch.cuda.is_available():
        return processor, model
    return processor, model.half().to("cuda")
 
 
 
 
 
 
 
42
 
43
- # Extract frames efficiently
44
def extract_frames(video_path, num_frames, method="uniform", segment_start=0, segment_end=None):
    """Extract up to ``num_frames`` PIL images from a segment of a video.

    Args:
        video_path: Path to a video readable by imageio's ffmpeg plugin.
        num_frames: Maximum number of frames to return.
        method: ``"uniform"`` or ``"scene"`` (case-insensitive, so the UI
            labels "Uniform"/"Scene" work as-is). Scene mode is only used
            when ``scene_detect_available`` is true.
        segment_start: Segment start in seconds.
        segment_end: Segment end in seconds; defaults to the video duration.

    Returns:
        list: PIL images, possibly fewer than ``num_frames``.
    """
    video = imageio.get_reader(video_path, "ffmpeg")
    try:
        meta = video.get_meta_data()
        fps = meta['fps']
        total_frames = int(meta['duration'] * fps)

        if segment_end is None:
            segment_end = total_frames / fps

        start_frame = int(segment_start * fps)
        end_frame = int(segment_end * fps)
        total_segment_frames = end_frame - start_frame
        # Nothing to sample; also avoids dividing by zero below.
        if num_frames <= 0 or total_segment_frames <= 0:
            return []

        def frame_at(index):
            # The frame count is estimated from duration * fps, so an index
            # near the end may not exist; skip it instead of failing.
            try:
                return Image.fromarray(video.get_data(index))
            except IndexError:
                return None

        if str(method).lower() == "scene" and scene_detect_available:
            video_manager = None
            try:
                video_manager = VideoManager([video_path])
                scene_manager = SceneManager()
                scene_manager.add_detector(ContentDetector(threshold=30))
                video_manager.set_downscale_factor(2)  # Optimize for speed
                video_manager.start()
                scene_manager.detect_scenes(frame_source=video_manager)
                scene_list = scene_manager.get_scene_list()
                segment_scenes = [
                    scene for scene in scene_list
                    if segment_start <= scene[0].get_seconds() < segment_end
                ]
                frames = []
                for scene in segment_scenes[:num_frames]:
                    # Read the scene's first frame through the imageio reader
                    # (random access by frame number).
                    frame = frame_at(scene[0].get_frames())
                    if frame is not None:
                        frames.append(frame)
                if len(frames) < num_frames:
                    # Top up with evenly spaced frames from the segment.
                    remaining = num_frames - len(frames)
                    step = max(1, total_segment_frames // (remaining + 1))
                    for i in range(1, remaining + 1):
                        frame_idx = start_frame + i * step
                        if frame_idx < end_frame:
                            frame = frame_at(frame_idx)
                            if frame is not None:
                                frames.append(frame)
                return frames[:num_frames]
            except Exception as e:
                st.warning(f"Scene detection failed: {e}. Using uniform extraction.")
            finally:
                if video_manager is not None:
                    video_manager.release()

        # Uniform extraction
        step = max(1, total_segment_frames // num_frames)
        frame_indices = [
            start_frame + i * step
            for i in range(num_frames)
            if start_frame + i * step < end_frame
        ]
        frames = [frame for frame in map(frame_at, frame_indices) if frame is not None]
        return frames[:num_frames]
    finally:
        # Always release the ffmpeg reader (the original never closed it).
        video.close()
89
 
90
- # Generate captions
91
def generate_captions(frames, processor, model):
    """Caption each frame with the BLIP model.

    Args:
        frames: Iterable of images accepted by ``processor(images=...)``.
        processor: BLIP processor (callable, with ``decode``).
        model: BLIP model exposing ``device``, ``dtype`` and ``generate``.

    Returns:
        list[str]: One caption per frame, in input order.
    """
    descriptions = []
    for frame in frames:
        # Move inputs to the model's device AND cast floating-point tensors to
        # its dtype; a half-precision model rejects float32 pixel values.
        inputs = processor(images=frame, return_tensors="pt").to(model.device, model.dtype)
        out = model.generate(**inputs, max_length=30)
        descriptions.append(processor.decode(out[0], skip_special_tokens=True))
    return descriptions
101
-
102
- # Enhance prompts
103
def enhance_prompt(descriptions, mood="default"):
    """Build a sound-generation prompt from frame captions.

    The captions are joined and lower-cased; the first keyword group (in the
    order listed below) with any keyword found as a substring decides which
    effect text is appended.

    Args:
        descriptions: List of caption strings.
        mood: Word(s) placed at the start of the prompt.

    Returns:
        str: The prompt text.
    """
    if not descriptions:
        return f"{mood} ambient sound with subtle effects"

    scene_text = ". ".join(descriptions).lower()
    keyword_effects = (
        (("walk", "run"), "crisp footsteps on varied surfaces, immersive movement sounds"),
        (("car", "drive"), "roaring engine, tire screeches, dynamic road noise"),
        (("talk", "person"), "lively voices, crowd murmur, spatial chatter"),
        (("wind", "tree", "forest"), "rustling leaves, gentle wind gusts, natural ambiance"),
        (("crash", "fall"), "intense crash impact, debris scattering, sharp effects"),
    )
    for keywords, effect in keyword_effects:
        for keyword in keywords:
            if keyword in scene_text:
                return f"{mood} {scene_text}, {effect}"
    return f"{mood} {scene_text}, rich ambient soundscape with engaging effects"
 
 
 
 
 
 
118
 
119
- # Generate audio
120
def generate_audio(prompt, processor, model, duration, sample_rate=44100):
    """Generate a mono waveform for ``prompt`` with MusicGen.

    Args:
        prompt: Text description of the desired audio.
        processor: MusicGen processor.
        model: MusicGen model exposing ``device`` and ``generate``.
        duration: Target length in seconds (scales the token budget).
        sample_rate: Sample rate of the returned waveform. The model's native
            rate is read from ``model.config.audio_encoder.sampling_rate``
            (32 kHz assumed if absent) and the output is resampled to match.

    Returns:
        numpy.ndarray: 1-D float32 samples, peak-normalised to 0.95.
    """
    inputs = processor(text=[prompt], padding=True, return_tensors="pt").to(model.device)
    audio_values = model.generate(
        **inputs,
        max_new_tokens=max(1, int(512 * (duration / 8))),  # Optimized for speed
        do_sample=True,
        guidance_scale=7.0,
        top_k=120,
        top_p=0.9
    )
    # generate() returns (batch, channels, samples); take the single mono
    # channel so the result is 1-D, and use float32 so it can be written to WAV.
    audio_array = audio_values[0, 0].detach().float().cpu().numpy()

    # Guard the peak normalisation: an all-zero result would divide by zero
    # and fill the output with NaN.
    peak = float(np.max(np.abs(audio_array))) if audio_array.size else 0.0
    if peak > 0:
        audio_array = audio_array / peak * 0.95
    audio_array = np.clip(audio_array, -1.0, 1.0)

    # Resample (linear interpolation) to the requested rate so playback speed
    # and length are correct when the caller writes at ``sample_rate``.
    codec_config = getattr(getattr(model, "config", None), "audio_encoder", None)
    native_rate = getattr(codec_config, "sampling_rate", 32000)
    if native_rate != sample_rate and audio_array.size > 1:
        target_len = max(1, int(round(audio_array.size * sample_rate / native_rate)))
        src_t = np.arange(audio_array.size) / native_rate
        dst_t = np.arange(target_len) / sample_rate
        audio_array = np.interp(dst_t, src_t, audio_array)
    return audio_array.astype(np.float32)
 
 
 
 
 
 
 
 
136
 
137
- # Apply audio effects
138
def apply_audio_effects(audio_path, settings):
    """Apply the configured pydub effects to a WAV file.

    Args:
        audio_path: Path to the input WAV file.
        settings: Dict with keys ``reverb_ms``, ``echo_ms``, ``highpass``,
            ``lowpass``, ``compress`` and ``stereo_pan``.

    Returns:
        str: Path of a new temporary WAV file holding the processed audio.
        The caller is responsible for deleting it.
    """
    sound = AudioSegment.from_wav(audio_path)
    if settings['reverb_ms'] > 0:
        # Evaluated left to right: append silence, then lower the gain by 10 dB.
        sound = sound + AudioSegment.silent(duration=settings['reverb_ms']) - 10
    if settings['echo_ms'] > 0:
        echo = sound - 15
        sound = sound.overlay(echo, position=settings['echo_ms'])
    if settings['highpass'] > 0:
        sound = sound.high_pass_filter(settings['highpass'])
    if settings['lowpass'] < 20000:
        sound = sound.low_pass_filter(settings['lowpass'])
    if settings['compress']:
        sound = effects.compress_dynamic_range(sound)
    sound = sound.pan(settings['stereo_pan'])
    sound = effects.normalize(sound)
    # Reserve a temp path and close the handle before exporting, so no file
    # descriptor is left open.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        processed_path = tmp.name
    sound.export(processed_path, format="wav")
    return processed_path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
 
157
- # Sync audio with video
158
def sync_audio_video(video_path, audio_path, output_path, mix_original=False, original_volume=0.5, generated_volume=0.5):
    """Mux an audio track onto a video with ffmpeg, optionally mixing in the original audio.

    Args:
        video_path: Source video file.
        audio_path: Generated WAV file.
        output_path: Destination video file (overwritten).
        mix_original: When true and the video has audio, overlay the generated
            audio on the original track before muxing.
        original_volume: 0..1; the original track is attenuated by
            ``20 * (1 - original_volume)`` dB.
        generated_volume: 0..1; same attenuation rule for the generated track.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    mixed_path = None
    try:
        if mix_original:
            video_clip = mpy.VideoFileClip(video_path)
            try:
                has_audio = bool(video_clip.audio)
            finally:
                # Release the reader opened just to probe for an audio track.
                video_clip.close()
            if has_audio:
                original_audio_seg = AudioSegment.from_file(video_path, format="mp4")
                generated_audio_seg = AudioSegment.from_wav(audio_path)
                original_audio_seg = original_audio_seg - (20 * (1 - original_volume))
                generated_audio_seg = generated_audio_seg - (20 * (1 - generated_volume))
                mixed_audio = original_audio_seg.overlay(generated_audio_seg)
                with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
                    mixed_path = tmp.name
                mixed_audio.export(mixed_path, format="wav")
                audio_path = mixed_path
            else:
                st.warning("No original audio found. Using generated audio only.")

        cmd = [
            'ffmpeg',
            '-i', video_path,
            '-i', audio_path,
            '-c:v', 'copy',
            '-c:a', 'aac',
            '-map', '0:v:0',
            '-map', '1:a:0',
            '-shortest',
            '-y',
            output_path
        ]
        subprocess.run(cmd, check=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
    finally:
        # The intermediate mix is only needed while ffmpeg runs.
        if mixed_path is not None and os.path.exists(mixed_path):
            os.remove(mixed_path)
 
 
 
 
 
 
 
 
 
 
 
186
 
187
- # Main application
188
- def main():
189
- st.title("🎬 Video Sound Effect Generator")
190
- st.markdown("Upload a video to create immersive, video-specific sound effects with AI.")
 
191
 
 
 
 
 
 
 
 
 
192
  # User Guide
193
- with st.expander("πŸ“– User Guide"):
194
  st.markdown("""
195
  **How to Use:**
196
- 1. **Upload a Video**: Choose an MP4, MOV, or AVI file.
197
- 2. **Select Prompt Mode**:
198
- - **Automatic**: Analyzes video frames to generate sound prompts.
199
- - **Manual**: Enter your own sound description.
200
- 3. **Configure Settings**: Adjust frame analysis, audio effects, and model size in the sidebar.
201
- 4. **Generate**: Click "Generate Sound Effects" to process the video.
202
- 5. **Download**: Save the enhanced video with sound effects.
 
203
 
204
- **Tips**:
205
- - Use at least 5 frames for better sound relevance.
206
- - Scene-based frame extraction (if available) improves accuracy.
207
- - Adjust audio effects for a customized sound experience.
 
208
  """)
209
-
210
  # Sidebar Settings
211
  with st.sidebar:
212
- st.header("βš™οΈ Settings")
213
- prompt_mode = st.selectbox("Prompt Mode", ["Automatic", "Manual"])
214
- model_size = st.selectbox("Model Size", ["small", "medium", "large"], index=1)
 
 
 
 
 
 
 
215
  mix_original = st.checkbox("Mix with Original Audio", value=False)
216
- original_volume, generated_volume = 0.5, 0.5
217
- if mix_original:
218
- original_volume = st.slider("Original Audio Volume", 0.0, 1.0, 0.5)
219
- generated_volume = st.slider("Generated Audio Volume", 0.0, 1.0, 0.5)
 
220
 
221
- st.subheader("Frame Analysis")
222
- num_frames = st.slider("Frames to Analyze", 5, 10, 5, help="More frames improve sound relevance but increase processing time")
223
- frame_method = st.selectbox("Frame Extraction Method", ["Uniform", "Scene"] if scene_detect_available else ["Uniform"])
224
-
225
- st.subheader("Audio Effects")
 
 
 
 
 
226
  effects_settings = {
227
- 'reverb_ms': st.slider("Reverb (ms)", 0, 500, 100),
228
- 'echo_ms': st.slider("Echo (ms)", 0, 1000, 200),
229
- 'highpass': st.slider("High-pass Filter (Hz)", 0, 3000, 100),
230
- 'lowpass': st.slider("Low-pass Filter (Hz)", 5000, 20000, 15000),
231
  'compress': st.checkbox("Dynamic Compression", value=True),
232
  'stereo_pan': st.slider("Stereo Pan (-1 left, 1 right)", -1.0, 1.0, 0.0)
233
  }
234
-
235
- # Main Content
236
- uploaded_file = st.file_uploader("Upload Video", type=["mp4", "mov", "avi"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
237
  if uploaded_file:
 
238
  with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tmp:
239
  tmp.write(uploaded_file.read())
240
  video_path = tmp.name
 
 
241
  st.video(video_path)
242
-
 
243
  video_clip = mpy.VideoFileClip(video_path)
244
  duration = video_clip.duration
245
  video_clip.close()
246
-
 
247
  if prompt_mode == "Automatic":
248
- with st.spinner("Analyzing frames..."):
249
  blip_processor, blip_model = load_blip_model()
 
 
 
 
250
  frames = extract_frames(video_path, num_frames, frame_method)
251
  if not frames:
252
  st.error("No frames extracted. Try a different video or settings.")
253
  return
254
- descriptions = generate_captions(frames, blip_processor, blip_model)
255
- mood = st.selectbox("Sound Mood", ["default", "dramatic", "ambient", "action"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
256
  text_prompt = enhance_prompt(descriptions, mood)
257
- text_prompt = st.text_area("Edit Prompt", text_prompt, height=100)
 
 
258
  else:
259
- text_prompt = st.text_area("Enter Sound Description", "Describe the desired sound effects", height=100)
260
-
261
- if st.button("Generate Sound Effects", key="generate"):
262
- progress = st.progress(0)
263
- status = st.empty()
264
- status.text("Loading model...")
 
 
 
 
 
 
 
 
 
265
  musicgen_processor, musicgen_model = load_musicgen_model(f"facebook/musicgen-{model_size}")
266
- progress.progress(20)
267
-
268
- status.text("Generating audio...")
 
 
 
 
269
  audio_array = generate_audio(text_prompt, musicgen_processor, musicgen_model, duration)
 
 
270
  temp_audio = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
271
  sf.write(temp_audio, audio_array, 44100)
272
- progress.progress(50)
273
-
274
- status.text("Applying audio effects...")
 
275
  processed_audio = apply_audio_effects(temp_audio, effects_settings)
276
- progress.progress(75)
277
-
278
- status.text("Syncing with video...")
 
279
  output_video = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
280
- sync_audio_video(video_path, processed_audio, output_video, mix_original, original_volume, generated_volume)
281
- progress.progress(100)
282
- status.text("Done!")
283
-
284
- st.success("Sound effects applied!")
 
 
 
285
  st.video(output_video)
 
 
286
  with open(output_video, "rb") as f:
287
- st.download_button("Download Enhanced Video", f, "enhanced_video.mp4", "video/mp4")
288
-
 
 
 
 
 
 
 
289
  # Cleanup
290
  for file in [video_path, temp_audio, processed_audio, output_video]:
291
  if os.path.exists(file):
 
1
  import streamlit as st
2
  import torch
3
+ import numpy as np
4
  from transformers import AutoProcessor, BlipForConditionalGeneration, MusicgenForConditionalGeneration
5
  import imageio
 
6
  from PIL import Image
7
  import soundfile as sf
8
  import os
 
10
  import subprocess
11
  from pydub import AudioSegment, effects
12
  import moviepy.editor as mpy
13
+ import time
14
+ from concurrent.futures import ThreadPoolExecutor
15
+
16
+ # Set page configuration
17
+ st.set_page_config(page_title="🎬 AI SoundFX Studio", layout="wide", initial_sidebar_state="expanded")
18
+
19
+ # Enhanced CSS for better UI
20
+ st.markdown("""
21
+ <style>
22
+ .reportview-container {
23
+ background: #1a1a1a;
24
+ color: white;
25
+ }
26
+ .sidebar .sidebar-content {
27
+ width: 350px;
28
+ }
29
+ .stProgress > div > div {
30
+ background-color: #4CAF50;
31
+ }
32
+ .stButton>button {
33
+ background-color: #4CAF50;
34
+ color: white;
35
+ }
36
+ </style>
37
+ """, unsafe_allow_html=True)
38
 
39
  # Optional scene detection
40
  scene_detect_available = True
 
44
  except ImportError:
45
  scene_detect_available = False
46
 
47
+ # Model Management
 
 
 
48
@st.cache_resource
def _load_blip_model_cached():
    """Load BLIP once per process; raises on failure so a failure is never cached."""
    use_cuda = torch.cuda.is_available()
    processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
    model = BlipForConditionalGeneration.from_pretrained(
        "Salesforce/blip-image-captioning-base",
        torch_dtype=torch.float16 if use_cuda else torch.float32,
        # ``low_mem`` is not a from_pretrained option; use the same flag as
        # the MusicGen loader.
        low_cpu_mem_usage=True,
    ).to("cuda" if use_cuda else "cpu")
    return processor, model


def load_blip_model():
    """Load the BLIP captioning processor and model.

    Successful loads are cached by Streamlit. Errors are reported with
    ``st.error`` and are not cached, so a later rerun can retry.

    Returns:
        tuple: ``(processor, model)``, or ``(None, None)`` if loading failed.
    """
    try:
        return _load_blip_model_cached()
    except Exception as e:
        st.error(f"BLIP model load error: {str(e)}")
        return None, None
62
 
 
63
@st.cache_resource
def _load_musicgen_model_cached(model_name):
    """Load MusicGen once per model name; raises on failure so a failure is never cached."""
    use_cuda = torch.cuda.is_available()
    model = MusicgenForConditionalGeneration.from_pretrained(
        model_name,
        torch_dtype=torch.float16 if use_cuda else torch.float32,
        low_cpu_mem_usage=True,
    ).to("cuda" if use_cuda else "cpu")
    processor = AutoProcessor.from_pretrained(model_name)
    return processor, model


def load_musicgen_model(model_name="facebook/musicgen-medium"):
    """Load a MusicGen processor and model.

    Successful loads are cached by Streamlit per ``model_name``. Errors are
    reported with ``st.error`` and are not cached, so a later rerun can retry.

    Args:
        model_name: Hugging Face checkpoint identifier.

    Returns:
        tuple: ``(processor, model)``, or ``(None, None)`` if loading failed.
    """
    try:
        return _load_musicgen_model_cached(model_name)
    except Exception as e:
        st.error(f"MusicGen model load error: {str(e)}")
        return None, None
77
 
 
78
def extract_frames(video_path, num_frames, method="uniform", segment_start=0, segment_end=None):
    """Extract a handful of representative PIL frames from a video segment.

    Args:
        video_path: Path to a video readable by imageio's ffmpeg plugin.
        num_frames: Requested number of frames; capped at one fifth of the
            estimated frame count (with a floor of 5 for very short clips).
        method: ``"uniform"`` or ``"scene"`` (case-insensitive, so the UI
            labels "Uniform"/"Scene" work as-is). Scene mode is only used
            when ``scene_detect_available`` is true.
        segment_start: Segment start in seconds.
        segment_end: Segment end in seconds; defaults to the video duration.

    Returns:
        list: PIL images (possibly fewer than requested); ``[]`` on error,
        after reporting it with ``st.error``.
    """
    video = None
    try:
        video = imageio.get_reader(video_path, "ffmpeg")
        meta = video.get_meta_data()
        fps = meta['fps']
        total_frames = int(meta['duration'] * fps)

        # Optimize frame count based on duration
        actual_num_frames = min(num_frames, int(total_frames / 5) or 5)

        if segment_end is None:
            segment_end = total_frames / fps

        start_frame = int(segment_start * fps)
        # Never sample past the (estimated) end of the video.
        end_frame = min(int(segment_end * fps), total_frames)
        total_segment_frames = end_frame - start_frame
        if actual_num_frames <= 0 or total_segment_frames <= 0:
            return []

        def frame_at(index):
            # total_frames is an estimate, so an index near the end may not
            # exist; skip that frame rather than abort the whole extraction.
            try:
                return Image.fromarray(video.get_data(int(index)))
            except IndexError:
                return None

        if str(method).lower() == "scene" and scene_detect_available:
            video_manager = None
            try:
                video_manager = VideoManager([video_path])
                scene_manager = SceneManager()
                scene_manager.add_detector(ContentDetector(threshold=30))
                video_manager.set_downscale_factor(2)
                video_manager.start()
                scene_manager.detect_scenes(frame_source=video_manager)
                scene_list = scene_manager.get_scene_list()
                segment_scenes = [
                    scene for scene in scene_list
                    if segment_start <= scene[0].get_seconds() < segment_end
                ]

                frames = []
                for scene in segment_scenes[:actual_num_frames]:
                    # Read the scene's first frame through the imageio reader
                    # (random access by frame number).
                    frame = frame_at(scene[0].get_frames())
                    if frame is not None:
                        frames.append(frame)

                # Fill remaining frames if needed
                if len(frames) < actual_num_frames:
                    remaining = actual_num_frames - len(frames)
                    step = max(1, total_segment_frames // (remaining + 1))
                    for i in range(1, remaining + 1):
                        frame_idx = start_frame + i * step
                        if frame_idx < end_frame:
                            frame = frame_at(frame_idx)
                            if frame is not None:
                                frames.append(frame)
                return frames[:actual_num_frames]
            except Exception as e:
                st.warning(f"Scene detection failed: {e}. Using uniform extraction.")
            finally:
                if video_manager is not None:
                    video_manager.release()

        # Uniform extraction: evenly spaced indices across the segment.
        frame_indices = np.linspace(start_frame, end_frame, actual_num_frames, endpoint=False).astype(int)
        frames = [frame for frame in map(frame_at, frame_indices) if frame is not None]
        return frames[:actual_num_frames]

    except Exception as e:
        st.error(f"Frame extraction error: {str(e)}")
        return []
    finally:
        # Always release the ffmpeg reader (the original never closed it).
        if video is not None:
            video.close()
140
+
141
def generate_captions_parallel(frames, processor, model):
    """Caption frames concurrently with the BLIP model.

    Args:
        frames: List of images accepted by ``processor(images=...)``.
        processor: BLIP processor (callable, with ``decode``).
        model: BLIP model exposing ``device``, ``dtype`` and ``generate``.

    Returns:
        list[str]: One caption per frame, in input order; a frame whose
        captioning failed yields ``""`` and a Streamlit warning.
    """
    def process_frame(frame):
        # Runs in a worker thread: return the error instead of calling
        # Streamlit here, since workers have no script-run context.
        try:
            # Cast floating-point inputs to the model dtype as well as moving
            # them; a float16 model rejects float32 pixel values.
            inputs = processor(images=frame, return_tensors="pt").to(model.device, model.dtype)
            out = model.generate(**inputs, max_length=25, num_beams=3)
            return processor.decode(out[0], skip_special_tokens=True), None
        except Exception as e:
            return "", e

    if not frames:
        return []

    with ThreadPoolExecutor() as executor:
        results = list(executor.map(process_frame, frames))

    captions = []
    for caption, error in results:
        if error is not None:
            st.warning(f"Captioning error: {str(error)}")
        captions.append(caption)
    return captions
 
 
 
154
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
def enhance_prompt(descriptions, mood="default"):
    """Turn frame captions into a sound-design prompt.

    Captions are lower-cased, stripped, de-duplicated in first-seen order
    (so the prompt is deterministic) and joined. Every keyword group with a
    keyword found as a substring of the joined text contributes its effect
    description.

    Args:
        descriptions: List of caption strings; empty or blank entries are ignored.
        mood: Word(s) placed at the start of the prompt.

    Returns:
        str: The prompt text. If there is no usable caption, a generic
        ambient prompt for ``mood``.
    """
    fallback = f"{mood} ambient sound with subtle effects"
    if not descriptions:
        return fallback

    captions = [desc.strip().lower() for desc in descriptions if desc and desc.strip()]
    # dict.fromkeys de-duplicates while keeping order; a set would shuffle
    # the captions differently on every run.
    unique_captions = list(dict.fromkeys(captions))
    if not unique_captions:
        return fallback
    combined = ". ".join(unique_captions)

    # Enhanced pattern matching dictionary
    pattern_map = {
        ('walk', 'run'): "crisp footsteps on varied surfaces with immersive movement sounds",
        ('car', 'drive', 'vehicle'): "roaring engine, tire screeches, and dynamic road noise with spatial positioning",
        ('talk', 'person', 'people', 'conversation'): "lively voices, crowd murmur, and spatial chatter with natural reverb",
        ('wind', 'tree', 'forest', 'nature'): "rustling leaves, gentle wind gusts, and natural ambiance with atmospheric depth",
        ('crash', 'fall', 'impact'): "intense crash impact, debris scattering, and sharp transient effects with dynamic range",
        ('water', 'ocean', 'sea'): "realistic water movement, wave dynamics, and aquatic ambiance",
        ('fire', 'explosion'): "realistic fire crackling, explosions, and heat distortion audio",
        ('space', 'sci-fi'): "futuristic ambient textures, synth effects, and spatial audio design",
    }

    matched_patterns = [
        effect
        for keywords, effect in pattern_map.items()
        if any(keyword in combined for keyword in keywords)
    ]

    if matched_patterns:
        return f"{mood} {combined}, {'; '.join(matched_patterns)}, cinematic sound design with spatial audio"
    return f"{mood} {combined}, rich ambient soundscape with professional effects, 4K audio resolution"
183
 
 
184
def generate_audio(prompt, processor, model, duration, sample_rate=44100):
    """Generate a mono waveform for ``prompt`` with MusicGen.

    Args:
        prompt: Text description of the desired audio.
        processor: MusicGen processor.
        model: MusicGen model exposing ``device``, ``generate`` and, ideally,
            ``config.audio_encoder.frame_rate`` / ``.sampling_rate``
            (50 Hz and 32 kHz are assumed when absent).
        duration: Target length in seconds.
        sample_rate: Sample rate of the returned waveform; the model output
            is resampled from its native rate to this rate.

    Returns:
        numpy.ndarray: 1-D float32 samples in (-1, 1). On failure the error
        is reported with ``st.error`` and silence of ``duration`` seconds is
        returned.
    """
    try:
        inputs = processor(text=[prompt], padding=True, return_tensors="pt").to(model.device)

        codec_config = getattr(getattr(model, "config", None), "audio_encoder", None)
        frame_rate = getattr(codec_config, "frame_rate", 50)
        native_rate = getattr(codec_config, "sampling_rate", 32000)

        audio_values = model.generate(
            **inputs,
            # One token per codec frame, so this covers the full duration.
            max_new_tokens=max(1, int(np.ceil(duration * frame_rate))),
            # MusicGen only supports greedy or sampling generation, so
            # num_beams / early_stopping must not be passed.
            do_sample=True,
            temperature=0.85,
            guidance_scale=5.0,
            top_k=80,
            top_p=0.85
        )

        # (batch, channels, samples) -> 1-D mono; float32 so a half-precision
        # model's output can be written to WAV.
        audio_array = audio_values[0, 0].detach().float().cpu().numpy()
        audio_array = np.tanh(audio_array)  # soft limiter keeps samples inside (-1, 1)

        # Resample (linear interpolation) so playback speed and length are
        # correct when the caller writes the file at ``sample_rate``.
        if native_rate != sample_rate and audio_array.size > 1:
            target_len = max(1, int(round(audio_array.size * sample_rate / native_rate)))
            src_t = np.arange(audio_array.size) / native_rate
            dst_t = np.arange(target_len) / sample_rate
            audio_array = np.interp(dst_t, src_t, audio_array)
        return audio_array.astype(np.float32)

    except Exception as e:
        st.error(f"Audio generation error: {str(e)}")
        return np.zeros(max(1, int(duration * sample_rate)), dtype=np.float32)
208
 
 
209
def apply_audio_effects(audio_path, settings):
    """Apply the configured pydub effects to a WAV file.

    Args:
        audio_path: Path to the input WAV file.
        settings: Dict with keys ``reverb_ms``, ``echo_ms``, ``highpass``,
            ``lowpass``, ``compress`` and ``stereo_pan``.

    Returns:
        str: Path of a new temporary WAV file with the processed audio, or
        ``audio_path`` unchanged if processing failed (the error is reported
        with ``st.error``).
    """
    try:
        sound = AudioSegment.from_wav(audio_path)

        # "Reverb": a single copy, 15 dB quieter, overlaid after reverb_ms.
        if settings['reverb_ms'] > 0:
            sound = sound.overlay(sound - 15, position=settings['reverb_ms'])

        # Echo: same technique, applied to the already-processed signal.
        if settings['echo_ms'] > 0:
            echo = sound - 15
            sound = sound.overlay(echo, position=settings['echo_ms'])

        # Filters
        if settings['highpass'] > 0:
            sound = sound.high_pass_filter(settings['highpass'])
        if settings['lowpass'] < 20000:
            sound = sound.low_pass_filter(settings['lowpass'])

        # Dynamic processing
        if settings['compress']:
            sound = effects.compress_dynamic_range(sound)

        # Stereo imaging
        sound = sound.pan(settings['stereo_pan'])
        sound = effects.normalize(sound)

        # Reserve a temp path and close the handle before exporting, so no
        # file descriptor is left open.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
            processed_path = tmp.name
        sound.export(processed_path, format="wav")
        return processed_path

    except Exception as e:
        st.error(f"Audio effects error: {str(e)}")
        return audio_path
244
 
245
def sync_audio_video(video_path, audio_path, output_path, mix_original=False,
                     original_volume=0.5, generated_volume=0.5):
    """Mux an audio track onto a video with ffmpeg, optionally mixing in the original audio.

    Args:
        video_path: Source video file.
        audio_path: Generated WAV file.
        output_path: Destination video file (overwritten).
        mix_original: When true and the video has audio, overlay the generated
            audio on the original track before muxing.
        original_volume: 0..1; the original track is attenuated by
            ``20 * (1 - original_volume)`` dB.
        generated_volume: 0..1; same attenuation rule for the generated track.

    Returns:
        bool: ``True`` if ffmpeg wrote the output, ``False`` if an error
        occurred (reported with ``st.error``).
    """
    mixed_path = None
    try:
        if mix_original:
            video_clip = mpy.VideoFileClip(video_path)
            try:
                has_audio = bool(video_clip.audio)
            finally:
                # Release the reader opened just to probe for an audio track.
                video_clip.close()

            if has_audio:
                original_audio_seg = AudioSegment.from_file(video_path, format="mp4")
                generated_audio_seg = AudioSegment.from_wav(audio_path)

                # Volume adjustment
                original_audio_seg = original_audio_seg - (20 * (1 - original_volume))
                generated_audio_seg = generated_audio_seg - (20 * (1 - generated_volume))

                mixed_audio = original_audio_seg.overlay(generated_audio_seg)
                with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
                    mixed_path = tmp.name
                mixed_audio.export(mixed_path, format="wav")
                audio_path = mixed_path
            else:
                st.warning("No original audio found. Using generated audio only.")

        # The video stream is copied without re-encoding; only the audio is
        # encoded (AAC). No hardware acceleration is involved.
        cmd = [
            'ffmpeg',
            '-i', video_path,
            '-i', audio_path,
            '-c:v', 'copy',
            '-c:a', 'aac',
            '-map', '0:v:0',
            '-map', '1:a:0',
            '-shortest',
            '-y',
            '-preset', 'ultrafast',
            '-vsync', '2',
            output_path
        ]
        subprocess.run(cmd, check=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
        return True

    except subprocess.CalledProcessError as e:
        # Show the tail of ffmpeg's stderr; the exception text alone only
        # contains the exit status.
        stderr_text = e.stderr.decode(errors="replace").strip() if e.stderr else ""
        detail = "\n".join(stderr_text.splitlines()[-5:]) or str(e)
        st.error(f"Sync error: {detail}")
        return False
    except Exception as e:
        st.error(f"Sync error: {str(e)}")
        return False
    finally:
        # The intermediate mix is only needed while ffmpeg runs.
        if mixed_path is not None and os.path.exists(mixed_path):
            os.remove(mixed_path)
285
 
286
def unload_model(model):
    """Move ``model`` to the CPU and release cached CUDA memory.

    Does nothing when CUDA is not available.
    """
    if not torch.cuda.is_available():
        return
    model.to("cpu")
    torch.cuda.empty_cache()
291
 
292
+ def main():
293
+ st.title("🎬 AI SoundFX Studio")
294
+ st.markdown("### Create immersive soundscapes from video with optimized AI processing")
295
+
296
+ # Initialize session state
297
+ if 'processing_time' not in st.session_state:
298
+ st.session_state.processing_time = 0
299
+
300
  # User Guide
301
+ with st.expander("πŸ“– User Guide & Tips"):
302
  st.markdown("""
303
  **How to Use:**
304
+ 1. Upload a video file (MP4, MOV, AVI)
305
+ 2. Choose between Automatic (AI-generated) or Manual sound description
306
+ 3. Adjust settings in the sidebar:
307
+ - Model size (small/medium/large)
308
+ - Frame analysis parameters
309
+ - Audio effects customization
310
+ 4. Click "Generate Sound Effects"
311
+ 5. Download the enhanced video
312
 
313
+ **Optimization Tips:**
314
+ - Use "small" model for quick previews
315
+ - Enable "Scene Detection" for better context
316
+ - Adjust audio effects for custom sound design
317
+ - Use "Mix with Original Audio" for balanced results
318
  """)
319
+
320
  # Sidebar Settings
321
  with st.sidebar:
322
+ st.header("βš™οΈ Processing Settings")
323
+
324
+ # Processing Mode
325
+ prompt_mode = st.selectbox("Prompt Generation", ["Automatic", "Manual"])
326
+
327
+ # Model Selection
328
+ model_size = st.selectbox("Model Size", ["small", "medium", "large"], index=1,
329
+ help="Larger models = better quality but slower processing")
330
+
331
+ # Audio Mixing
332
  mix_original = st.checkbox("Mix with Original Audio", value=False)
333
+ col1, col2 = st.columns(2)
334
+ with col1:
335
+ original_vol = st.slider("Original Volume", 0.0, 1.0, 0.5) if mix_original else 0.5
336
+ with col2:
337
+ generated_vol = st.slider("Generated Volume", 0.0, 1.0, 0.5) if mix_original else 0.5
338
 
339
+ # Frame Analysis
340
+ st.subheader("πŸŽ₯ Frame Analysis")
341
+ num_frames = st.slider("Frames to Analyze", 3, 10, 5,
342
+ help="More frames improve accuracy but increase processing time")
343
+ frame_method = st.selectbox("Frame Extraction",
344
+ ["Uniform", "Scene"] if scene_detect_available else ["Uniform"],
345
+ help="Scene detection provides better contextual analysis")
346
+
347
+ # Audio Effects
348
+ st.subheader("πŸŽ›οΈ Audio Effects")
349
  effects_settings = {
350
+ 'reverb_ms': st.slider("Reverb (ms)", 0, 500, 50),
351
+ 'echo_ms': st.slider("Echo (ms)", 0, 1000, 100),
352
+ 'highpass': st.slider("High-pass Filter (Hz)", 0, 3000, 50),
353
+ 'lowpass': st.slider("Low-pass Filter (Hz)", 5000, 20000, 12000),
354
  'compress': st.checkbox("Dynamic Compression", value=True),
355
  'stereo_pan': st.slider("Stereo Pan (-1 left, 1 right)", -1.0, 1.0, 0.0)
356
  }
357
+
358
+ # Performance Presets
359
+ st.subheader("⚑ Performance")
360
+ quality_preset = st.selectbox("Quality Preset", ["Fast", "Balanced", "High Quality"])
361
+ presets = {
362
+ "Fast": {"num_frames": 3, "model_size": "small"},
363
+ "Balanced": {"num_frames": 5, "model_size": "medium"},
364
+ "High Quality": {"num_frames": 8, "model_size": "large"}
365
+ }
366
+ if quality_preset != "Balanced":
367
+ num_frames = presets[quality_preset]["num_frames"]
368
+ model_size = presets[quality_preset]["model_size"]
369
+
370
+ st.info("Processing time estimate: 2-5 minutes (varies by settings)")
371
+
372
+ # Main Content Area
373
+ uploaded_file = st.file_uploader("Upload Video File", type=["mp4", "mov", "avi"])
374
+
375
  if uploaded_file:
376
+ # Create temporary files
377
  with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tmp:
378
  tmp.write(uploaded_file.read())
379
  video_path = tmp.name
380
+
381
+ # Video Preview
382
  st.video(video_path)
383
+
384
+ # Get video duration
385
  video_clip = mpy.VideoFileClip(video_path)
386
  duration = video_clip.duration
387
  video_clip.close()
388
+
389
+ # Prompt Generation
390
  if prompt_mode == "Automatic":
391
+ with st.spinner("Analyzing video content..."):
392
  blip_processor, blip_model = load_blip_model()
393
+ if not blip_processor or not blip_model:
394
+ st.error("Failed to load BLIP model")
395
+ return
396
+
397
  frames = extract_frames(video_path, num_frames, frame_method)
398
  if not frames:
399
  st.error("No frames extracted. Try a different video or settings.")
400
  return
401
+
402
+ # Display analyzed frames
403
+ cols = st.columns(len(frames))
404
+ for col, frame in zip(cols, frames):
405
+ with col:
406
+ st.image(frame, use_column_width=True)
407
+
408
+ descriptions = generate_captions_parallel(frames, blip_processor, blip_model)
409
+ unload_model(blip_model)
410
+
411
+ # Mood selection
412
+ mood = st.selectbox("Sound Mood", [
413
+ "default", "dramatic", "ambient", "action", "sci-fi", "horror", "comedy"
414
+ ], help="Select the overall atmosphere for the sound design")
415
+
416
+ # Enhanced prompt with AI suggestions
417
  text_prompt = enhance_prompt(descriptions, mood)
418
+ st.subheader("Generated Prompt")
419
+ text_prompt = st.text_area("Edit Prompt", text_prompt, height=150)
420
+ st.markdown("*Suggested modifications: Add specific instrument types, intensity levels, or emotional cues*")
421
  else:
422
+ st.subheader("Enter Sound Description")
423
+ text_prompt = st.text_area("Describe the desired sound effects",
424
+ "E.g., 'Cinematic trailer music with thunderous impacts and soaring strings'",
425
+ height=150)
426
+
427
+ # Generation Button
428
+ if st.button("πŸ”Š Generate Sound Effects", key="generate", use_container_width=True):
429
+ start_time = time.time()
430
+
431
+ # Progress tracking
432
+ progress_bar = st.progress(0)
433
+ status_text = st.empty()
434
+ status_text.text("Loading models...")
435
+
436
+ # Load MusicGen model
437
  musicgen_processor, musicgen_model = load_musicgen_model(f"facebook/musicgen-{model_size}")
438
+ if not musicgen_processor or not musicgen_model:
439
+ st.error("Failed to load MusicGen model")
440
+ return
441
+ progress_bar.progress(20)
442
+
443
+ # Audio Generation
444
+ status_text.text("Generating audio...")
445
  audio_array = generate_audio(text_prompt, musicgen_processor, musicgen_model, duration)
446
+ unload_model(musicgen_model)
447
+
448
  temp_audio = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
449
  sf.write(temp_audio, audio_array, 44100)
450
+ progress_bar.progress(50)
451
+
452
+ # Apply Effects
453
+ status_text.text("Applying audio effects...")
454
  processed_audio = apply_audio_effects(temp_audio, effects_settings)
455
+ progress_bar.progress(75)
456
+
457
+ # Sync with Video
458
+ status_text.text("Syncing with video...")
459
  output_video = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
460
+ sync_audio_video(video_path, processed_audio, output_video, mix_original, original_vol, generated_vol)
461
+ progress_bar.progress(100)
462
+
463
+ # Finalize
464
+ status_text.text("Processing complete!")
465
+ st.success("βœ… Sound effects applied successfully!")
466
+
467
+ # Display result
468
  st.video(output_video)
469
+
470
+ # Download button
471
  with open(output_video, "rb") as f:
472
+ st.download_button("πŸ“₯ Download Enhanced Video",
473
+ f, "enhanced_video.mp4", "video/mp4",
474
+ use_container_width=True)
475
+
476
+ # Timing info
477
+ processing_time = time.time() - start_time
478
+ st.session_state.processing_time = processing_time
479
+ st.info(f"⏱️ Processing time: {processing_time:.1f} seconds")
480
+
481
  # Cleanup
482
  for file in [video_path, temp_audio, processed_audio, output_video]:
483
  if os.path.exists(file):