Spaces:
Build error
Build error
File size: 3,908 Bytes
3b64f55 5fee850 3b64f55 175bb4b 3b64f55 0189988 3b64f55 47c18ca 3b64f55 5fee850 47c18ca 5fee850 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 |
import gradio as gr
from mediapipe.python.solutions import holistic
from torchvision.transforms.v2 import Compose, Lambda, Normalize
from transformers import VideoMAEForVideoClassification, VideoMAEImageProcessor
from utils import get_predictions, preprocess
# Text blocks for the Gradio page. Each one currently holds only a single
# newline character.
title = """
"""

# NOTE(review): not referenced anywhere in the visible part of this file.
cite_markdown = """
"""

description = """
"""
# Example rows offered in the UI: each entry is [video path, label text].
examples = [
    ["samples/000_con_cho.mp4", "Con chó"],
    ["samples/001_con_meo.mp4", "Con mèo"],
    ["samples/005_con_tho.mp4", "Con thỏ"],
    ["samples/006_con_trau.mp4", "Con trâu"],
    ["samples/007_con_bo.mp4", "Con bò"],
    ["samples/008_con_de.mp4", "Con dê"],
    ["samples/009_con_heo.mp4", "Con heo"],
    ["samples/010_mau_den.mp4", "Màu đen"],
    ["samples/021_qua_man.mp4", "Quả mận"],
    ["samples/022_qua_dua.mp4", "Quả dứa"],
    ["samples/023_qua_dao.mp4", "Quả đào"],
    ["samples/029_qua_dua.mp4", "Quả dưa"],
    ["samples/031_me.mp4", "Mẹ"],
    ["samples/032_con_trai.mp4", "Con trai"],
    ["samples/033_con_gai.mp4", "Con gái"],
    ["samples/035_chong.mp4", "Chồng"],
    ["samples/044_mach.mp4", "Mách"],
    ["samples/051_chay.mp4", "Chạy"],
    ["samples/054_mua.mp4", "Múa"],
    ["samples/055_nau.mp4", "Nấu"],
    ["samples/057_nham_lan.mp4", "Nhầm lẫn"],
    ["samples/059_cam_trai.mp4", "Cắm trại"],
    ["samples/060_cung_cap.mp4", "Cung cấp"],
    ["samples/062_bat_buoc.mp4", "Bắt buộc"],
    ["samples/064_mua_ban.mp4", "Mua bán"],
    ["samples/066_khong_nen.mp4", "Không nên"],
    ["samples/067_khong_can.mp4", "Không cần"],
    ["samples/069_khong_nghe_loi.mp4", "Không nghe lời"],
    ["samples/073_ngot.mp4", "Ngọt"],
    ["samples/079_chat.mp4", "Chật"],
    ["samples/080_hep.mp4", "Hẹp"],
    ["samples/081_rong.mp4", "Rộng"],
    ["samples/082_dai.mp4", "Dài"],
    ["samples/085_om.mp4", "Ốm"],
    ["samples/086_map.mp4", "Mập"],
    ["samples/087_ngoan.mp4", "Ngoan"],
    ["samples/089_khoe.mp4", "Khoẻ"],
    ["samples/091_dau.mp4", "Đau"],
    ["samples/095_tot_bung.mp4", "Tốt bụng"],
    ["samples/097_thu_vi.mp4", "Thú vị"],
]
# Device the model is moved to and that is handed to `preprocess`.
device = 'cpu'
model_name = 'VieSignLang/videomae_skeleton_v1.0'

# Load the image processor and the classifier from the same checkpoint; the
# model is switched to eval mode before being moved to `device`.
image_processor = VideoMAEImageProcessor.from_pretrained(model_name)
model = (
    VideoMAEForVideoClassification.from_pretrained(model_name)
    .eval()
    .to(device)
)

# Normalisation statistics come from the processor configuration.
mean, std = image_processor.image_mean, image_processor.image_std

# The processor's size is either a single 'shortest_edge' value (used for both
# dimensions) or explicit 'height' / 'width' entries.
_processor_size = image_processor.size
if 'shortest_edge' not in _processor_size:
    model_input_height = _processor_size['height']
    model_input_width = _processor_size['width']
else:
    model_input_height = model_input_width = _processor_size['shortest_edge']


def _scale_to_unit_range(x):
    """Divide the input by 255.0."""
    return x / 255.0


# Transform handed to `preprocess`: scale by 1/255, then normalise with the
# processor's mean and std.
transform = Compose(
    [
        Lambda(_scale_to_unit_range),
        Normalize(mean=mean, std=std),
    ]
)
def inference(
    video: str,
    progress: gr.Progress = gr.Progress(),
) -> str:
    """Run the sign classifier on a video and format the predictions as text.

    Args:
        video: The video input received from the interface; forwarded to
            ``preprocess`` as ``source``.
        progress: Gradio progress tracker used to report the current stage.

    Returns:
        One line per prediction in the form ``"<index>. <label> (<score>)"``,
        joined with newlines and stripped of surrounding whitespace. Indices
        start at 0. An empty string is returned when there are no predictions.
    """
    progress(0, desc='Preprocessing video')

    # Create the detector in a ``with`` block so it is closed once
    # preprocessing finishes, even if preprocessing raises. Previously a new
    # detector was created on every call and never released.
    with holistic.Holistic(
        static_image_mode=False,
        model_complexity=2,
        enable_segmentation=True,
        refine_face_landmarks=True,
    ) as keypoints_detector:
        inputs = preprocess(
            model_num_frames=model.config.num_frames,
            keypoints_detector=keypoints_detector,
            source=video,
            model_input_height=model_input_height,
            model_input_width=model_input_width,
            device=device,
            transform=transform,
        )

    progress(1 / 2, desc='Getting predictions')
    predictions = get_predictions(inputs=inputs, model=model)

    output_message = '\n'.join(
        f'{i}. {prediction["label"]} ({prediction["score"]})'
        for i, prediction in enumerate(predictions)
    ).strip()

    # Report full completion; the final update previously stayed at 1/2.
    progress(1.0, desc='Completed')
    return output_message
# Build the web UI around `inference`: a video input, a text output, and the
# example rows defined above.
iface = gr.Interface(
    title=title,
    description=description,
    fn=inference,
    inputs='video',
    outputs='text',
    examples=examples,
)

# Start serving the interface.
iface.launch()
|