File size: 3,908 Bytes
3b64f55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5fee850
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3b64f55
 
175bb4b
3b64f55
 
0189988
 
 
 
 
 
 
3b64f55
 
 
 
 
 
 
 
 
 
 
47c18ca
3b64f55
 
 
 
 
 
 
 
 
 
5fee850
 
47c18ca
5fee850
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import gradio as gr
from mediapipe.python.solutions import holistic
from torchvision.transforms.v2 import Compose, Lambda, Normalize
from transformers import VideoMAEForVideoClassification, VideoMAEImageProcessor
from utils import get_predictions, preprocess


# UI copy shown by the Gradio interface (currently blank placeholders).
title = '''

'''

cite_markdown = '''

'''

description = '''

'''

# (video path, ground-truth gloss) pairs used as clickable demo samples.
_SAMPLES = (
    ('samples/000_con_cho.mp4', 'Con chó'),
    ('samples/001_con_meo.mp4', 'Con mèo'),
    ('samples/005_con_tho.mp4', 'Con thỏ'),
    ('samples/006_con_trau.mp4', 'Con trâu'),
    ('samples/007_con_bo.mp4', 'Con bò'),
    ('samples/008_con_de.mp4', 'Con dê'),
    ('samples/009_con_heo.mp4', 'Con heo'),
    ('samples/010_mau_den.mp4', 'Màu đen'),
    ('samples/021_qua_man.mp4', 'Quả mận'),
    ('samples/022_qua_dua.mp4', 'Quả dứa'),
    ('samples/023_qua_dao.mp4', 'Quả đào'),
    ('samples/029_qua_dua.mp4', 'Quả dưa'),
    ('samples/031_me.mp4', 'Mẹ'),
    ('samples/032_con_trai.mp4', 'Con trai'),
    ('samples/033_con_gai.mp4', 'Con gái'),
    ('samples/035_chong.mp4', 'Chồng'),
    ('samples/044_mach.mp4', 'Mách'),
    ('samples/051_chay.mp4', 'Chạy'),
    ('samples/054_mua.mp4', 'Múa'),
    ('samples/055_nau.mp4', 'Nấu'),
    ('samples/057_nham_lan.mp4', 'Nhầm lẫn'),
    ('samples/059_cam_trai.mp4', 'Cắm trại'),
    ('samples/060_cung_cap.mp4', 'Cung cấp'),
    ('samples/062_bat_buoc.mp4', 'Bắt buộc'),
    ('samples/064_mua_ban.mp4', 'Mua bán'),
    ('samples/066_khong_nen.mp4', 'Không nên'),
    ('samples/067_khong_can.mp4', 'Không cần'),
    ('samples/069_khong_nghe_loi.mp4', 'Không nghe lời'),
    ('samples/073_ngot.mp4', 'Ngọt'),
    ('samples/079_chat.mp4', 'Chật'),
    ('samples/080_hep.mp4', 'Hẹp'),
    ('samples/081_rong.mp4', 'Rộng'),
    ('samples/082_dai.mp4', 'Dài'),
    ('samples/085_om.mp4', 'Ốm'),
    ('samples/086_map.mp4', 'Mập'),
    ('samples/087_ngoan.mp4', 'Ngoan'),
    ('samples/089_khoe.mp4', 'Khoẻ'),
    ('samples/091_dau.mp4', 'Đau'),
    ('samples/095_tot_bung.mp4', 'Tốt bụng'),
    ('samples/097_thu_vi.mp4', 'Thú vị'),
)

# gr.Interface expects each example as a list of input values.
examples = [[path, label] for path, label in _SAMPLES]


# Inference device and pretrained checkpoint for the skeleton-based
# VideoMAE sign-language classifier.
device = 'cpu'
model_name = 'VieSignLang/videomae_skeleton_v1.0'

image_processor = VideoMAEImageProcessor.from_pretrained(model_name)
model = VideoMAEForVideoClassification.from_pretrained(model_name).eval().to(device)

# Normalization statistics the pretrained backbone was trained with.
mean = image_processor.image_mean
std = image_processor.image_std

# Resolve the spatial resolution the model expects: a square crop when the
# processor only specifies a shortest edge, explicit height/width otherwise.
size_config = image_processor.size
if 'shortest_edge' in size_config:
    model_input_height = model_input_width = size_config['shortest_edge']
else:
    model_input_height = size_config['height']
    model_input_width = size_config['width']

# Frame pipeline: scale pixel values to [0, 1], then normalize per channel.
transform = Compose([
    Lambda(lambda frames: frames / 255.0),
    Normalize(mean=mean, std=std),
])


def inference(
    video: str,
    progress: gr.Progress = gr.Progress(),
) -> str:
    """Run sign-language classification on an uploaded video.

    Args:
        video: Path to the input video file provided by Gradio.
        progress: Gradio progress tracker (injected by Gradio at call time).

    Returns:
        A newline-separated string of ranked predictions, one
        ``"<rank>. <label> (<score>)"`` entry per line.
    """
    progress(0, desc='Preprocessing video')
    # Use the context manager so MediaPipe's native resources are released
    # after every request instead of leaking one detector per call.
    with holistic.Holistic(
        static_image_mode=False,
        model_complexity=2,
        enable_segmentation=True,
        refine_face_landmarks=True,
    ) as keypoints_detector:
        inputs = preprocess(
            model_num_frames=model.config.num_frames,
            keypoints_detector=keypoints_detector,
            source=video,
            model_input_height=model_input_height,
            model_input_width=model_input_width,
            device=device,
            transform=transform,
        )

    progress(1/2, desc='Getting predictions')
    predictions = get_predictions(inputs=inputs, model=model)
    output_message = ''
    for i, prediction in enumerate(predictions):
        output_message += f'{i}. {prediction["label"]} ({prediction["score"]})\n'
    output_message = output_message.strip()

    # Fixed: previously reported 1/2 here, leaving the bar stuck at 50%.
    progress(1, desc='Completed')

    return output_message


# Wire the inference function into a Gradio UI: a video-upload input, a
# plain-text output for the ranked predictions, and the sample videos
# defined above as one-click examples.
iface = gr.Interface(
    fn=inference,
    inputs='video',
    outputs='text',
    examples=examples,
    title=title,
    description=description,
)
# Start the web server; blocks until the app is stopped.
iface.launch()