garyuzair committed on
Commit
3c52655
·
verified ·
1 Parent(s): 64b2c99

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -12
app.py CHANGED
@@ -97,23 +97,31 @@ def extract_frames(video_path, num_frames, method="uniform"):
97
  frames = []
98
  for idx in frame_indices[:num_frames]:
99
  try:
100
- frames.append(Image.fromarray(video.get_data(idx)).resize((320, 180))) # Downscale frames
 
101
  except:
102
  continue
103
  video.close()
104
  return frames
105
 
106
- # Generate captions
107
  @lru_cache(maxsize=100)
108
- def generate_captions(frames_tuple, processor, model):
109
- frames = [Image.frombytes(frame[0], frame[1], frame[2]) for frame in frames_tuple]
 
 
 
 
 
 
 
 
110
  descriptions = []
111
  for frame in frames:
112
- inputs = processor(images=frame, return_tensors="pt")
113
- if torch.cuda.is_available():
114
- inputs = {k: v.to("cuda") for k, v in inputs.items()}
115
- out = model.generate(**inputs, max_length=20, num_beams=3) # Faster with beam search
116
- description = processor.decode(out[0], skip_special_tokens=True)
117
  descriptions.append(description)
118
  return descriptions
119
 
@@ -273,9 +281,8 @@ def main():
273
  if not frames:
274
  st.error("Failed to extract frames. Try another video or method.")
275
  return
276
- # Convert frames to tuple for caching
277
- frames_tuple = tuple((frame.mode, frame.size, frame.rgb) for frame in frames)
278
- descriptions = generate_captions(frames_tuple, blip_processor, blip_model)
279
  mood = st.selectbox("Sound Mood", ["default", "dramatic", "ambient", "action"], help="Sets the tone of sound effects.")
280
  text_prompt = enhance_prompt(descriptions, mood)
281
  text_prompt = st.text_area("Edit Prompt", text_prompt, height=80)
 
97
  frames = []
98
  for idx in frame_indices[:num_frames]:
99
  try:
100
+ frame = Image.fromarray(video.get_data(idx)).resize((320, 180), Image.BILINEAR) # Downscale frames
101
+ frames.append(frame)
102
  except:
103
  continue
104
  video.close()
105
  return frames
106
 
107
+ # Cached helper function to generate caption for a single frame
108
  @lru_cache(maxsize=100)
109
def get_caption_for_frame(frame_bytes, mode, size, processor, model):
    """Generate a text caption for one frame given as raw pixel bytes.

    The frame is passed as ``(frame_bytes, mode, size)`` instead of an image
    object so that the arguments are hashable and the ``lru_cache`` decorator
    applied to this function can reuse captions for identical frames.

    Args:
        frame_bytes: Raw pixel data, as produced by ``frame.tobytes()``.
        mode: Image mode string of the frame (``frame.mode``).
        size: Size tuple of the frame (``frame.size``).
        processor: Object that converts the image into model inputs (called
            with ``return_tensors="pt"``) and decodes generated token ids.
        model: Captioning model exposing ``generate``.

    Returns:
        The decoded caption string with special tokens stripped.
    """
    frame = Image.frombytes(mode, size, frame_bytes)
    inputs = processor(images=frame, return_tensors="pt")

    # Send the inputs to the device the model actually lives on. Keying this
    # on torch.cuda.is_available() alone breaks when CUDA is present but the
    # model was left on the CPU (device-mismatch error inside generate()).
    device = getattr(model, "device", None)
    if device is None:
        # Model does not report a device: keep the previous behaviour.
        device = "cuda" if torch.cuda.is_available() else "cpu"
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    # max_length=20 caps the caption length; beam search uses 3 beams.
    out = model.generate(**inputs, max_length=20, num_beams=3)
    return processor.decode(out[0], skip_special_tokens=True)
116
+
117
+ # Generate captions using cached helper
118
def generate_captions(frames, processor, model):
    """Return one caption per frame, in the same order as ``frames``.

    Each frame is broken down into its raw bytes, mode and size before being
    handed to ``get_caption_for_frame``, which takes those values in place of
    the image object itself.

    Args:
        frames: Iterable of image objects exposing ``tobytes()``, ``mode``
            and ``size``.
        processor: Processor forwarded unchanged to ``get_caption_for_frame``.
        model: Model forwarded unchanged to ``get_caption_for_frame``.

    Returns:
        A list of caption strings, one for each input frame.
    """
    return [
        get_caption_for_frame(img.tobytes(), img.mode, img.size, processor, model)
        for img in frames
    ]
127
 
 
281
  if not frames:
282
  st.error("Failed to extract frames. Try another video or method.")
283
  return
284
+ # Generate captions using the updated function
285
+ descriptions = generate_captions(frames, blip_processor, blip_model)
 
286
  mood = st.selectbox("Sound Mood", ["default", "dramatic", "ambient", "action"], help="Sets the tone of sound effects.")
287
  text_prompt = enhance_prompt(descriptions, mood)
288
  text_prompt = st.text_area("Edit Prompt", text_prompt, height=80)