Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -97,23 +97,31 @@ def extract_frames(video_path, num_frames, method="uniform"):
|
|
| 97 |
frames = []
|
| 98 |
for idx in frame_indices[:num_frames]:
|
| 99 |
try:
|
| 100 |
-
|
|
|
|
| 101 |
except:
|
| 102 |
continue
|
| 103 |
video.close()
|
| 104 |
return frames
|
| 105 |
|
| 106 |
-
#
|
| 107 |
@lru_cache(maxsize=100)
|
| 108 |
-
def
|
| 109 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
descriptions = []
|
| 111 |
for frame in frames:
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
description = processor.decode(out[0], skip_special_tokens=True)
|
| 117 |
descriptions.append(description)
|
| 118 |
return descriptions
|
| 119 |
|
|
@@ -273,9 +281,8 @@ def main():
|
|
| 273 |
if not frames:
|
| 274 |
st.error("Failed to extract frames. Try another video or method.")
|
| 275 |
return
|
| 276 |
-
#
|
| 277 |
-
|
| 278 |
-
descriptions = generate_captions(frames_tuple, blip_processor, blip_model)
|
| 279 |
mood = st.selectbox("Sound Mood", ["default", "dramatic", "ambient", "action"], help="Sets the tone of sound effects.")
|
| 280 |
text_prompt = enhance_prompt(descriptions, mood)
|
| 281 |
text_prompt = st.text_area("Edit Prompt", text_prompt, height=80)
|
|
|
|
| 97 |
frames = []
|
| 98 |
for idx in frame_indices[:num_frames]:
|
| 99 |
try:
|
| 100 |
+
frame = Image.fromarray(video.get_data(idx)).resize((320, 180), Image.BILINEAR) # Downscale frames
|
| 101 |
+
frames.append(frame)
|
| 102 |
except:
|
| 103 |
continue
|
| 104 |
video.close()
|
| 105 |
return frames
|
| 106 |
|
| 107 |
+
# Cached helper: captions one frame, keyed by its raw pixel bytes, so
# identical/repeated frames only go through the model once.
@lru_cache(maxsize=100)
def get_caption_for_frame(frame_bytes, mode, size, processor, model):
    """Generate a caption for a single video frame.

    Args:
        frame_bytes: Raw pixel data from ``Image.tobytes()`` — used (with
            mode/size) as the hashable LRU cache key.
        mode: PIL image mode string (e.g. "RGB").
        size: (width, height) tuple matching ``frame_bytes``.
        processor: BLIP processor (image preprocessing + token decoding).
        model: BLIP captioning model.

    Returns:
        The decoded caption string for the frame.
    """
    # Rebuild the PIL image from its serialized form (needed for hashability).
    frame = Image.frombytes(mode, size, frame_bytes)
    inputs = processor(images=frame, return_tensors="pt")
    if torch.cuda.is_available():
        # NOTE(review): assumes the model was moved to CUDA elsewhere — confirm.
        inputs = {k: v.to("cuda") for k, v in inputs.items()}
    # Pure inference: disable autograd to avoid building a gradient graph
    # (saves memory and time; the original ran generate() with grad enabled).
    with torch.no_grad():
        out = model.generate(**inputs, max_length=20, num_beams=3)
    return processor.decode(out[0], skip_special_tokens=True)
|
| 116 |
+
|
| 117 |
+
# Caption a sequence of frames via the per-frame LRU-cached helper.
def generate_captions(frames, processor, model):
    """Return one caption string per frame, in order.

    Each PIL frame is serialized to (bytes, mode, size) so the cached
    helper can be keyed on hashable values.
    """
    return [
        get_caption_for_frame(img.tobytes(), img.mode, img.size, processor, model)
        for img in frames
    ]
|
| 127 |
|
|
|
|
| 281 |
if not frames:
|
| 282 |
st.error("Failed to extract frames. Try another video or method.")
|
| 283 |
return
|
| 284 |
+
# Generate captions using the updated function
|
| 285 |
+
descriptions = generate_captions(frames, blip_processor, blip_model)
|
|
|
|
| 286 |
mood = st.selectbox("Sound Mood", ["default", "dramatic", "ambient", "action"], help="Sets the tone of sound effects.")
|
| 287 |
text_prompt = enhance_prompt(descriptions, mood)
|
| 288 |
text_prompt = st.text_area("Edit Prompt", text_prompt, height=80)
|