Spaces: Runtime error
Commit a6c8793 · Parent(s): a23243f
Update app.py

app.py CHANGED
@@ -5,6 +5,10 @@ from transformers import AutoProcessor, AutoModel
 from PIL import Image
 from decord import VideoReader, cpu
 
+# Use GPU if available
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+torch.backends.cudnn.benchmark = True
+
 def sample_uniform_frame_indices(clip_len, seg_len):
     if seg_len < clip_len:
         repeat_factor = np.ceil(clip_len / seg_len).astype(int)
@@ -45,21 +49,28 @@ def model_interface(uploaded_video, model_choice, activity):
         "microsoft/xclip-base-patch32-16-frames": 16,
         "microsoft/xclip-base-patch32": 8
     }.get(model_choice, 32)
+
     indices = sample_uniform_frame_indices(clip_len, seg_len=len(VideoReader(uploaded_video)))
     video = read_video_decord(uploaded_video, indices)
     concatenated_image = concatenate_frames(video, clip_len)
 
-    # Appending "other" to the list of activities
     activities_list = [activity, "other"]
-
-
+
+    processor = AutoProcessor.from_pretrained(model_choice).to(device)
+    model = AutoModel.from_pretrained(model_choice).to(device)
+
+    # Convert the list of frames to a single numpy array for efficient conversion to a tensor
+    video_np_array = np.array(video)
+
     inputs = processor(
         text=activities_list,
-        videos=
+        videos=video_np_array,
        return_tensors="pt",
         padding=True,
     )
 
+    inputs = {k: v.to(device) for k, v in inputs.items()}
+
     with torch.no_grad():
         outputs = model(**inputs)
 
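
As committed, this revision is a likely cause of the Space's "Runtime error" status: AutoProcessor.from_pretrained() returns a processor object, not a torch.nn.Module, so chaining .to(device) onto it raises AttributeError the first time model_interface runs. Below is a minimal corrected sketch of the loading-and-inference portion, keeping the names used in the diff; run_xclip is a stand-in name, the surrounding Gradio wiring is elided, and passing the frames as a list follows the documented XCLIPProcessor call signature rather than the bare 4-D array used in the commit.

import numpy as np
import torch
from transformers import AutoModel, AutoProcessor

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def run_xclip(video, activity, model_choice):
    # Only the model is a torch module; the processor has no .to() method.
    processor = AutoProcessor.from_pretrained(model_choice)
    model = AutoModel.from_pretrained(model_choice).to(device)

    activities_list = [activity, "other"]
    inputs = processor(
        text=activities_list,
        videos=list(np.asarray(video)),  # a list of H x W x C frames for one clip
        return_tensors="pt",
        padding=True,
    )
    # Move the tokenized text and pixel values to the same device as the model.
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
    return outputs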
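
A separate consequence of this commit is that from_pretrained() now runs inside model_interface, so the checkpoint is re-deserialized on every request. One common fix is to cache the loaded pair per checkpoint name; this is a sketch, not part of the commit, and get_model_and_processor is an invented name.

from functools import lru_cache

import torch
from transformers import AutoModel, AutoProcessor

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

@lru_cache(maxsize=3)  # one cached entry per XCLIP checkpoint the app offers
def get_model_and_processor(model_choice):
    # Load once per checkpoint; later calls reuse the cached pair.
    processor = AutoProcessor.from_pretrained(model_choice)
    model = AutoModel.from_pretrained(model_choice).to(device).eval()
    return processor, model

model_interface would then start with processor, model = get_model_and_processor(model_choice) and skip the per-call loads.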
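
For context, the diff shows only the first lines of sample_uniform_frame_indices, and the app's full body is not visible here. A generic uniform-sampling sketch consistent with the visible lines would look like the following; everything past the repeat_factor line is an assumption, not the app's code.

import numpy as np

def sample_uniform_frame_indices(clip_len, seg_len):
    # Sketch only: the real body past the first three lines is not shown in the diff.
    if seg_len < clip_len:
        # Short video: tile the available frame indices until clip_len is reached.
        repeat_factor = np.ceil(clip_len / seg_len).astype(int)
        indices = np.tile(np.arange(seg_len), repeat_factor)[:clip_len]
    else:
        # Long enough video: take clip_len evenly spaced frame indices.
        indices = np.linspace(0, seg_len - 1, num=clip_len).astype(int)
    return indices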