# VSL-VideoMAE / app.py
import gradio as gr
from mediapipe.python.solutions import holistic
from torchvision.transforms.v2 import Compose, Lambda, Normalize
from transformers import VideoMAEForVideoClassification, VideoMAEImageProcessor
from utils import get_predictions, preprocess
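# `preprocess` and `get_predictions` are project-local helpers defined in
# utils.py of this Space.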
title = '''
'''
cite_markdown = '''
'''
description = '''
'''
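# Each example pairs a sample video with its Vietnamese gloss; only the path
# column is passed to the interface below, since it has a single video input.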
examples = [
    ['samples/000_con_cho.mp4', 'Con chó'],
    ['samples/001_con_meo.mp4', 'Con mèo'],
    ['samples/005_con_tho.mp4', 'Con thỏ'],
    ['samples/006_con_trau.mp4', 'Con trâu'],
    ['samples/007_con_bo.mp4', 'Con bò'],
    ['samples/008_con_de.mp4', 'Con dê'],
    ['samples/009_con_heo.mp4', 'Con heo'],
    ['samples/010_mau_den.mp4', 'Màu đen'],
    ['samples/021_qua_man.mp4', 'Quả mận'],
    ['samples/022_qua_dua.mp4', 'Quả dứa'],
    ['samples/023_qua_dao.mp4', 'Quả đào'],
    ['samples/029_qua_dua.mp4', 'Quả dưa'],
    ['samples/031_me.mp4', 'Mẹ'],
    ['samples/032_con_trai.mp4', 'Con trai'],
    ['samples/033_con_gai.mp4', 'Con gái'],
    ['samples/035_chong.mp4', 'Chồng'],
    ['samples/044_mach.mp4', 'Mách'],
    ['samples/051_chay.mp4', 'Chạy'],
    ['samples/054_mua.mp4', 'Múa'],
    ['samples/055_nau.mp4', 'Nấu'],
    ['samples/057_nham_lan.mp4', 'Nhầm lẫn'],
    ['samples/059_cam_trai.mp4', 'Cắm trại'],
    ['samples/060_cung_cap.mp4', 'Cung cấp'],
    ['samples/062_bat_buoc.mp4', 'Bắt buộc'],
    ['samples/064_mua_ban.mp4', 'Mua bán'],
    ['samples/066_khong_nen.mp4', 'Không nên'],
    ['samples/067_khong_can.mp4', 'Không cần'],
    ['samples/069_khong_nghe_loi.mp4', 'Không nghe lời'],
    ['samples/073_ngot.mp4', 'Ngọt'],
    ['samples/079_chat.mp4', 'Chật'],
    ['samples/080_hep.mp4', 'Hẹp'],
    ['samples/081_rong.mp4', 'Rộng'],
    ['samples/082_dai.mp4', 'Dài'],
    ['samples/085_om.mp4', 'Ốm'],
    ['samples/086_map.mp4', 'Mập'],
    ['samples/087_ngoan.mp4', 'Ngoan'],
    ['samples/089_khoe.mp4', 'Khoẻ'],
    ['samples/091_dau.mp4', 'Đau'],
    ['samples/095_tot_bung.mp4', 'Tốt bụng'],
    ['samples/097_thu_vi.mp4', 'Thú vị'],
]
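# Load the pretrained VideoMAE sign-language classifier and its image
# processor from the Hugging Face Hub.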
device = 'cpu'
model_name = 'VieSignLang/videomae_skeleton_v1.0'
image_processor = VideoMAEImageProcessor.from_pretrained(model_name)
model = VideoMAEForVideoClassification.from_pretrained(model_name)
model = model.eval().to(device)
mean = image_processor.image_mean
std = image_processor.image_std
# The processor reports either a square `shortest_edge` or an explicit
# height/width pair, depending on the checkpoint.
if 'shortest_edge' in image_processor.size:
    model_input_height = model_input_width = image_processor.size['shortest_edge']
else:
    model_input_height = image_processor.size['height']
    model_input_width = image_processor.size['width']
# Scale pixel values to [0, 1] and normalize with the processor's statistics.
transform = Compose(
    [
        Lambda(lambda x: x / 255.0),
        Normalize(mean=mean, std=std),
    ]
)
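# Minimal sanity check for the transform (assumes `preprocess` in utils.py
# yields a uint8 clip tensor of shape (T, C, H, W); the shape is an
# assumption, not confirmed by this file):
#   import torch
#   clip = torch.randint(0, 256, (16, 3, 224, 224), dtype=torch.uint8)
#   normalized = transform(clip)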
def inference(
    video: str,
    progress: gr.Progress = gr.Progress(),
) -> str:
    """Classify a Vietnamese sign-language video and return ranked labels."""
    progress(0, desc='Preprocessing video')
    # Use Holistic as a context manager so its resources are released
    # after each request.
    with holistic.Holistic(
        static_image_mode=False,
        model_complexity=2,
        enable_segmentation=True,
        refine_face_landmarks=True,
    ) as keypoints_detector:
        inputs = preprocess(
            model_num_frames=model.config.num_frames,
            keypoints_detector=keypoints_detector,
            source=video,
            model_input_height=model_input_height,
            model_input_width=model_input_width,
            device=device,
            transform=transform,
        )
    progress(1 / 2, desc='Getting predictions')
    predictions = get_predictions(inputs=inputs, model=model)
    output_message = ''
    for i, prediction in enumerate(predictions, start=1):
        output_message += f'{i}. {prediction["label"]} ({prediction["score"]})\n'
    progress(1, desc='Completed')
    return output_message.strip()
iface = gr.Interface(
    fn=inference,
    inputs='video',
    outputs='text',
    examples=[[path] for path, _gloss in examples],
    title=title,
    description=description,
    # `cite_markdown` was previously unused; render it below the interface.
    article=cite_markdown,
)
iface.launch()