Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -97,23 +97,31 @@ def extract_frames(video_path, num_frames, method="uniform"):
|
|
| 97 |
frames = []
|
| 98 |
for idx in frame_indices[:num_frames]:
|
| 99 |
try:
|
| 100 |
-
|
|
|
|
| 101 |
except:
|
| 102 |
continue
|
| 103 |
video.close()
|
| 104 |
return frames
|
| 105 |
|
| 106 |
-
#
|
| 107 |
@lru_cache(maxsize=100)
|
| 108 |
-
def
|
| 109 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
descriptions = []
|
| 111 |
for frame in frames:
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
description = processor.decode(out[0], skip_special_tokens=True)
|
| 117 |
descriptions.append(description)
|
| 118 |
return descriptions
|
| 119 |
|
|
@@ -273,9 +281,8 @@ def main():
|
|
| 273 |
if not frames:
|
| 274 |
st.error("Failed to extract frames. Try another video or method.")
|
| 275 |
return
|
| 276 |
-
#
|
| 277 |
-
|
| 278 |
-
descriptions = generate_captions(frames_tuple, blip_processor, blip_model)
|
| 279 |
mood = st.selectbox("Sound Mood", ["default", "dramatic", "ambient", "action"], help="Sets the tone of sound effects.")
|
| 280 |
text_prompt = enhance_prompt(descriptions, mood)
|
| 281 |
text_prompt = st.text_area("Edit Prompt", text_prompt, height=80)
|
|
|
|
| 97 |
frames = []
|
| 98 |
for idx in frame_indices[:num_frames]:
|
| 99 |
try:
|
| 100 |
+
frame = Image.fromarray(video.get_data(idx)).resize((320, 180), Image.BILINEAR) # Downscale frames
|
| 101 |
+
frames.append(frame)
|
| 102 |
except:
|
| 103 |
continue
|
| 104 |
video.close()
|
| 105 |
return frames
|
| 106 |
|
| 107 |
+
# Cached helper: captions one frame, keyed by its raw pixel bytes, so
# identical/repeated frames only go through the model once.
@lru_cache(maxsize=100)
def get_caption_for_frame(frame_bytes, mode, size, processor, model):
    """Generate a caption for a single video frame.

    Args:
        frame_bytes: Raw pixel data from ``Image.tobytes()`` — used (with
            mode/size) as the hashable LRU cache key.
        mode: PIL image mode string (e.g. "RGB").
        size: (width, height) tuple matching ``frame_bytes``.
        processor: BLIP processor (image preprocessing + token decoding).
        model: BLIP captioning model.

    Returns:
        The decoded caption string for the frame.
    """
    # Rebuild the PIL image from its serialized form (needed for hashability).
    frame = Image.frombytes(mode, size, frame_bytes)
    inputs = processor(images=frame, return_tensors="pt")
    if torch.cuda.is_available():
        # NOTE(review): assumes the model was moved to CUDA elsewhere — confirm.
        inputs = {k: v.to("cuda") for k, v in inputs.items()}
    # Pure inference: disable autograd to avoid building a gradient graph
    # (saves memory and time; the original ran generate() with grad enabled).
    with torch.no_grad():
        out = model.generate(**inputs, max_length=20, num_beams=3)
    return processor.decode(out[0], skip_special_tokens=True)
|
| 116 |
+
|
| 117 |
+
# Caption a sequence of frames via the per-frame LRU-cached helper.
def generate_captions(frames, processor, model):
    """Return one caption string per frame, in order.

    Each PIL frame is serialized to (bytes, mode, size) so the cached
    helper can be keyed on hashable values.
    """
    return [
        get_caption_for_frame(img.tobytes(), img.mode, img.size, processor, model)
        for img in frames
    ]
|
| 127 |
|
|
|
|
| 281 |
if not frames:
|
| 282 |
st.error("Failed to extract frames. Try another video or method.")
|
| 283 |
return
|
| 284 |
+
# Generate captions using the updated function
|
| 285 |
+
descriptions = generate_captions(frames, blip_processor, blip_model)
|
|
|
|
| 286 |
mood = st.selectbox("Sound Mood", ["default", "dramatic", "ambient", "action"], help="Sets the tone of sound effects.")
|
| 287 |
text_prompt = enhance_prompt(descriptions, mood)
|
| 288 |
text_prompt = st.text_area("Edit Prompt", text_prompt, height=80)
|