Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
|
@@ -20,19 +20,51 @@ model = AutoModelForCausalLM.from_pretrained("ManishThota/SparrowVQE",
|
|
| 20 |
tokenizer = AutoTokenizer.from_pretrained("ManishThota/SparrowVQE", trust_remote_code=True)
|
| 21 |
|
| 22 |
|
| 23 |
-
def process_video(video_bytes):
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
|
| 37 |
def predict_answer(image, video, question, max_tokens=100):
|
| 38 |
|
|
@@ -53,7 +85,7 @@ def predict_answer(image, video, question, max_tokens=100):
|
|
| 53 |
|
| 54 |
elif video:
|
| 55 |
# Process as a video
|
| 56 |
-
frames =
|
| 57 |
answers = []
|
| 58 |
for frame in frames:
|
| 59 |
frame = Image.open(frame).convert("RGB")
|
|
|
|
| 20 |
tokenizer = AutoTokenizer.from_pretrained("ManishThota/SparrowVQE", trust_remote_code=True)
|
| 21 |
|
| 22 |
|
| 23 |
+
# def process_video(video_bytes):
|
| 24 |
+
# """Extracts frames from the video, 1 per second."""
|
| 25 |
+
# video = cv2.VideoCapture(io.BytesIO(video_bytes))
|
| 26 |
+
# fps = video.get(cv2.CAP_PROP_FPS)
|
| 27 |
+
# frames = []
|
| 28 |
+
# success, frame = video.read()
|
| 29 |
+
# while success:
|
| 30 |
+
# frames.append(frame)
|
| 31 |
+
# for _ in range(int(fps)): # Skip fps frames
|
| 32 |
+
# success, frame = video.read()
|
| 33 |
+
# video.release()
|
| 34 |
+
# return frames[:4] # Return the first 4 frames
|
| 35 |
|
| 36 |
+
def video_to_frames(video_path, *, max_frames=None):
    """Decode a video file and return its frames as PNG-encoded byte strings.

    Frames are read sequentially from the start of the video. Each frame that
    encodes successfully is appended to the result; frames that fail to
    encode are skipped.

    Args:
        video_path: Path (or other source accepted by ``cv2.VideoCapture``)
            of the video to read.
        max_frames: Optional upper bound on the number of frames returned.
            ``None`` (the default) reads until the stream ends.

    Returns:
        A list of ``bytes`` objects, one PNG image per frame, in stream
        order. The list is empty when the video cannot be opened. Each
        element is raw PNG data, not a file handle; wrap it in
        ``io.BytesIO`` before passing it to an API that expects a path or
        a file object.
    """
    frames_png = []

    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            print("Error opening video file")
            return frames_png

        # The capture stays open until release(), so the loop is bounded
        # only by end-of-stream and the optional frame limit.
        while max_frames is None or len(frames_png) < max_frames:
            ret, frame = cap.read()
            if not ret:
                print("Can't receive frame (stream end?). Exiting ...")
                break

            is_success, buffer = cv2.imencode(".png", frame)
            if is_success:
                # imencode already returns a NumPy array; serialize it
                # directly instead of copying it into a new array first.
                frames_png.append(buffer.tobytes())
    finally:
        # Always free the capture handle, even if decoding/encoding raises.
        cap.release()

    return frames_png
|
| 68 |
|
| 69 |
def predict_answer(image, video, question, max_tokens=100):
|
| 70 |
|
|
|
|
| 85 |
|
| 86 |
elif video:
|
| 87 |
# Process as a video
|
| 88 |
+
frames = video_to_frames(video)
|
| 89 |
answers = []
|
| 90 |
for frame in frames:
|
| 91 |
frame = Image.open(frame).convert("RGB")
|