garyuzair committed on
Commit
ba26b0a
·
verified ·
1 Parent(s): 320cecf

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +189 -0
app.py ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import imageio
3
+ import numpy as np
4
+ from PIL import Image
5
+ from transformers import pipeline
6
+ import soundfile as sf
7
+ import torch
8
+ import os
9
+ import tempfile
10
+ import math
11
+ import gc # Garbage collector
12
+
13
+ # Try importing moviepy, with fallback
14
+ try:
15
+ import moviepy.editor as mpy
16
+ except ModuleNotFoundError:
17
+ st.error("The 'moviepy' library is not installed. Please install it (`pip install moviepy`) 🚨")
18
+ st.stop()
19
+ except OSError as e:
20
+ st.error(f"Error initializing moviepy: {e} 🚨")
21
+ st.warning("Ensure ffmpeg is installed and accessible in your PATH.")
22
+
23
+ # --- Constants & Defaults ---
24
+ MODEL_MUSICGEN = "facebook/musicgen-small"
25
+ MODEL_MOONDREAM = "vikhyatk/moondream2"
26
+ DEFAULT_AUDIO_DURATION_S = 15
27
+ DEFAULT_FRAMES = 5
28
+ DEFAULT_GUIDANCE = 5.0
29
+ DEFAULT_TEMPERATURE = 0.8
30
+ MAX_FRAMES_TO_SHOW = 5
31
+
32
+ # --- Page Config ---
33
+ st.set_page_config(
34
+ page_title="AI Video Sound Designer (Moondream2)",
35
+ page_icon="🎬",
36
+ layout="wide"
37
+ )
38
+
39
+ # --- Cached Loaders ---
40
@st.cache_resource
def load_moondream2():
    """Build (once per session) the Moondream2 image-captioning pipeline.

    Returns a transformers ``pipeline`` for the "image-text-to-text" task,
    placed on GPU 0 when CUDA is available, otherwise on CPU.
    """
    target_device = 0 if torch.cuda.is_available() else -1
    vision_pipe = pipeline(
        "image-text-to-text",
        model=MODEL_MOONDREAM,
        trust_remote_code=True,
        device=target_device,
    )
    return vision_pipe
48
+
49
@st.cache_resource
def load_musicgen_model():
    """Load (once per session) the MusicGen processor/model pair.

    Moves the model to CUDA in float16 when a GPU is present.
    Returns ``(processor, model)``.
    """
    from transformers import AutoProcessor, MusicgenForConditionalGeneration

    text_processor = AutoProcessor.from_pretrained(MODEL_MUSICGEN)
    musicgen = MusicgenForConditionalGeneration.from_pretrained(MODEL_MUSICGEN)
    if torch.cuda.is_available():
        musicgen = musicgen.half().to("cuda")
    return text_processor, musicgen
57
+
58
+ # --- Utilities ---
59
def clear_gpu_memory():
    """Release cached CUDA memory (when a GPU exists) and run the garbage collector."""
    cuda_present = torch.cuda.is_available()
    if cuda_present:
        torch.cuda.empty_cache()
    gc.collect()
63
+
64
+ # --- Frame Extraction ---
65
+ def extract_frames(video_path, num_frames):
66
+ frames = []
67
+ reader = imageio.get_reader(video_path, "ffmpeg")
68
+ meta = reader.get_meta_data()
69
+ fps = meta.get('fps', 24)
70
+ duration = meta.get('duration', 5)
71
+ total = int(fps * duration)
72
+ indices = np.linspace(0, total - 1, num_frames, dtype=int)
73
+ for i in indices:
74
+ frames.append(Image.fromarray(reader.get_data(i)).convert("RGB"))
75
+ reader.close()
76
+ return frames
77
+
78
+ # --- Sound Prompt Generation ---
79
+ def generate_sound_prompt(frames, pipe):
80
+ instruction = (
81
+ "Describe only the sounds implied by this image: ambient noise, textures, "
82
+ "actions producing sound, atmosphere. Be concise but evocative."
83
+ )
84
+ descriptions = []
85
+ for frame in frames:
86
+ out = pipe(image=frame, text=instruction)
87
+ text = out[0].get('generated_text', '').strip()
88
+ if text:
89
+ descriptions.append(text)
90
+ combined = "; ".join(dict.fromkeys(descriptions))
91
+ return combined or "ambient background noise"
92
+
93
+ # --- Audio Generation ---
94
+ def generate_audio(prompt, duration, processor, model, guidance, temp):
95
+ device = next(model.parameters()).device
96
+ inputs = processor(text=[prompt], return_tensors="pt", padding=True).to(device)
97
+ inputs = {k: (v.to(model.dtype) if v.dtype.is_floating_point else v) for k, v in inputs.items()}
98
+ tokens_per_sec = 50
99
+ max_tokens = min(int(duration * tokens_per_sec), 1500)
100
+ with torch.inference_mode():
101
+ audio_tensor = model.generate(
102
+ **inputs,
103
+ max_new_tokens=max_tokens,
104
+ do_sample=True,
105
+ guidance_scale=guidance,
106
+ temperature=temp,
107
+ pad_token_id=model.config.eos_token_id
108
+ )
109
+ arr = audio_tensor[0].cpu().float().numpy()
110
+ peak = np.max(np.abs(arr)) or 1e-6
111
+ arr = np.clip(arr / peak * 0.9, -1.0, 1.0)
112
+ clear_gpu_memory()
113
+ return arr, model.config.audio_encoder.sampling_rate
114
+
115
+ # --- Sync Audio/Video ---
116
+ def sync_audio_video(video_path, audio_arr, sr, mix):
117
+ tmp_wav = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
118
+ sf.write(tmp_wav.name, audio_arr, sr)
119
+ video = mpy.VideoFileClip(video_path)
120
+ sound = mpy.AudioFileClip(tmp_wav.name)
121
+ # Loop or trim to match
122
+ if sound.duration < video.duration:
123
+ loops = math.ceil(video.duration / sound.duration)
124
+ sound = mpy.concatenate_audioclips([sound] * loops).subclip(0, video.duration)
125
+ else:
126
+ sound = sound.subclip(0, video.duration)
127
+ if mix and video.audio:
128
+ sound = mpy.CompositeAudioClip([video.audio.volumex(0.5), sound.volumex(0.5)])
129
+ final = video.set_audio(sound)
130
+ out = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4').name
131
+ final.write_videofile(out, codec='libx264', audio_codec='aac', threads=os.cpu_count())
132
+ video.close(); sound.close(); final.close()
133
+ os.remove(tmp_wav.name)
134
+ clear_gpu_memory()
135
+ return out
136
+
137
# --- UI ---
st.title("🎬 AI Video Sound Designer (Moondream2)")
st.markdown("Upload an MP4 video to generate immersive sound effects or download standalone audio.")

uploaded = st.file_uploader("Upload MP4 Video", type=['mp4'])
# Sidebar Settings
with st.sidebar:
    st.header("Settings")
    n_frames = st.slider("Frames to analyze", 1, 10, DEFAULT_FRAMES)
    duration = st.slider("Audio Duration (s)", 5, 30, DEFAULT_AUDIO_DURATION_S)
    mix = st.checkbox("Mix with original audio", False)
    gs = st.slider("Guidance Scale", 1.0, 10.0, DEFAULT_GUIDANCE, 0.5)
    temp = st.slider("Temperature", 0.1, 2.0, DEFAULT_TEMPERATURE, 0.1)

if uploaded:
    with st.spinner("Extracting frames and analyzing visuals..."):
        # Persist the upload to disk and close the handle so ffmpeg can
        # open the path (required on Windows).
        tmp_vid = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4')
        tmp_vid.write(uploaded.getbuffer())
        tmp_vid.close()
        frames = extract_frames(tmp_vid.name, n_frames)
    if not frames:
        # Guard: st.columns(0) raises, and there is nothing to caption.
        st.error("Could not read any frames from the uploaded video. 🚨")
        os.remove(tmp_vid.name)
        st.stop()
    st.subheader("Sample Frames")
    for col, img in zip(st.columns(min(len(frames), MAX_FRAMES_TO_SHOW)), frames):
        col.image(img, use_column_width=True)

    # Sequential: first Moondream
    moondream = load_moondream2()
    prompt = generate_sound_prompt(frames, moondream)
    # NOTE(review): `del` only drops the local name; st.cache_resource still
    # holds the pipeline, so this mainly helps the collector between steps.
    del moondream
    clear_gpu_memory()
    st.info(f"🧠 Sound Prompt: {prompt}")

    # Then MusicGen
    with st.spinner("Synthesizing audio..."):
        proc, model = load_musicgen_model()
        audio_arr, sr = generate_audio(prompt, duration, proc, model, gs, temp)

    # Playback & Downloads
    st.subheader("Generated Sound Effect")
    # The sample rate must be passed by keyword: st.audio's second
    # positional parameter is `format`, not the sample rate.
    st.audio(audio_arr, sample_rate=sr)
    # Save to a temp file for the download button, then clean it up.
    wav_tmp = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
    wav_tmp.close()
    sf.write(wav_tmp.name, audio_arr, sr)
    with open(wav_tmp.name, 'rb') as f:
        st.download_button("Download Audio Only", f, file_name='sound_effect.wav')
    os.remove(wav_tmp.name)

    # Video sync
    with st.spinner("Syncing audio with video..."):
        out_video = sync_audio_video(tmp_vid.name, audio_arr, sr, mix)
    st.video(out_video)
    with open(out_video, 'rb') as vf:
        st.download_button("Download Video with Sound", vf, file_name='sound_designed.mp4')
    # Cleanup
    os.remove(tmp_vid.name)
else:
    st.info("Upload a video above to get started.")