ASL-MoViNet-T5-translator

Sleeping

App Files Community

deanna-emery committed on Dec 4, 2023

Commit

79a2238

1 Parent(s): 4470668

initial commit

Browse files

Files changed (9) hide show

.gitignore +5 -0
app.py +94 -0
models +1 -0
videos/videos_accident2.mp4 +0 -0
videos/videos_all.mp4 +0 -0
videos/videos_before.mp4 +0 -0
videos/videos_blue.mp4 +0 -0
videos/videos_no.mp4 +0 -0
videos/videos_white.mp4 +0 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,5 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+*.DS_Store

app.py ADDED Viewed

	@@ -0,0 +1,94 @@

+import cv2
+import numpy as np
+import gradio as gr
+import os
+# Switch the working directory to models/ for the rest of the script: every
+# relative path used afterwards (the MoViNet checkpoint below, the example
+# video paths further down) is resolved against it.
+# NOTE(review): presumably this is also what makes the `official.*` imports
+# below resolvable -- confirm.
+os.chdir('models')
+import tensorflow as tf, tf_keras
+import tensorflow_hub as hub
+from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM
+# NOTE(review): movinet / movinet_model_modified are never referenced by name
+# in the visible code; presumably they are imported so the saved model's
+# custom classes are available to load_model -- confirm before removing.
+from official.projects.movinet.modeling import movinet
+from official.projects.movinet.modeling import movinet_model_a2_modified as movinet_model_modified
+# Load the MoViNet model from its checkpoint directory and mark it
+# non-trainable; translate() reads its 'vid_embedding' output.
+movinet_path = 'movinet_checkpoints_a2_epoch9'
+movinet_model = tf_keras.models.load_model(movinet_path)
+movinet_model.trainable = False
+# Tokenizer and seq2seq model used by translate() to generate and decode the
+# output tokens; the T5 model is marked non-trainable as well.
+tokenizer = AutoTokenizer.from_pretrained("t5-base")
+t5_model = TFAutoModelForSeq2SeqLM.from_pretrained("deanna-emery/t5_word_epoch12_1203")
+t5_model.trainable = False
+def crop_center_square(frame):
+    """Crop ``frame`` to its largest centered square.
+
+    ``frame`` is an array whose first two axes are (height, width); any
+    trailing axes (e.g. color channels) are kept unchanged.  The longer of
+    the two axes is trimmed on both sides so the result is ``side x side``
+    with ``side = min(height, width)``.  When the excess is odd, the extra
+    row/column is dropped from the far (bottom/right) side.
+
+    Portrait frames (height > width) are cropped as well, so the result is
+    always square and a later resize to a square target does not distort
+    the aspect ratio.
+    """
+    height, width = frame.shape[0:2]
+    side = min(height, width)
+    # Integer offsets of the square inside the frame; zero along the
+    # shorter axis, so that axis is kept whole.
+    top = (height - side) // 2
+    left = (width - side) // 2
+    return frame[top:top + side, left:left + side]
+def preprocess(filename, max_frames=0, resize=(224,224)):
+    """Decode a video file into a single-item batch of RGB frames.
+
+    Each decoded frame is center-cropped with ``crop_center_square``,
+    resized with ``cv2.resize(frame, resize)``, and has its first three
+    channels reversed (BGR -> RGB).  The stacked frames are divided by
+    255.0 and given a leading batch axis of size 1.
+
+    Args:
+        filename: Path of the video to read.
+        max_frames: Stop after this many frames.  The default of 0 never
+            matches the frame count, so the whole video is read.
+        resize: Target size passed to ``cv2.resize``.
+
+    Returns:
+        A NumPy array with axes (1, frame, row, column, channel).
+
+    Raises:
+        ValueError: If no frame could be read from ``filename`` (missing,
+            unreadable, or empty video).
+    """
+    video_capture = cv2.VideoCapture(filename)
+    frames = []
+    try:
+        while video_capture.isOpened():
+            ret, frame = video_capture.read()
+            if not ret:
+                break
+            frame = crop_center_square(frame)
+            frame = cv2.resize(frame, resize)
+            # OpenCV decodes frames as BGR; reverse the channel order to RGB.
+            frame = frame[:, :, [2, 1, 0]]
+            frames.append(frame)
+            if len(frames) == max_frames:
+                break
+    finally:
+        # Always release the capture handle, even if decoding failed.
+        video_capture.release()
+    if not frames:
+        # Fail here with a clear message rather than handing an empty
+        # (1, 0) array to the model.
+        raise ValueError(f"No frames could be read from video: {filename!r}")
+    video = np.array(frames) / 255.0
+    return np.expand_dims(video, axis=0)
+def translate(video_file):
+    """Run the MoViNet -> T5 pipeline on ``video_file``.
+
+    The video is preprocessed at 224x224 with no frame limit, passed through
+    ``movinet_model`` to obtain its ``'vid_embedding'`` output, and that
+    embedding is fed to ``t5_model.generate`` as ``inputs_embeds``.  The
+    generated tokens are decoded with ``tokenizer.batch_decode`` (special
+    tokens skipped).
+
+    Returns:
+        A dict with the single key ``"translation"`` holding the decoded
+        output.
+    """
+    clip = preprocess(video_file, max_frames=0, resize=(224,224))
+    vid_embedding = movinet_model(clip)['vid_embedding']
+    # Sampling settings for generation, kept together in one place.
+    generation_options = dict(
+        max_new_tokens=128,
+        temperature=0.1,
+        no_repeat_ngram_size=2,
+        do_sample=True,
+        top_k=80,
+        top_p=0.90,
+    )
+    token_ids = t5_model.generate(inputs_embeds=vid_embedding, **generation_options)
+    decoded = tokenizer.batch_decode(token_ids, skip_special_tokens=True)
+    # Return dict {label:pred}
+    # NOTE(review): batch_decode presumably returns a list of strings, so the
+    # value here is a list rather than a single string or score -- confirm
+    # the Label output component renders it as intended.
+    return {"translation": decoded}
+# Gradio App config
+title = "ASL Translation (MoViNet + T5)"
+# Example clips offered in the UI, one single-element row per example.
+# NOTE(review): these paths are relative to the working directory, which
+# os.chdir('models') changed at the top of the file, and the repository
+# appears to store the clips as videos/videos_<name>.mp4 -- confirm that
+# these paths actually resolve to the example videos.
+examples = [
+    ['videos/no.mp4'],
+    ['videos/all.mp4'],
+    ['videos/before.mp4'],
+    ['videos/blue.mp4'],
+    ['videos/white.mp4'],
+    ['videos/accident2.mp4'],
+]
+# Gradio App interface
+# Use the top-level gr.Video / gr.Label components: the gr.inputs.* and
+# gr.outputs.* namespaces are deprecated in Gradio 3 and removed in Gradio 4.
+gr.Interface(
+    fn=translate,
+    inputs=[gr.Video(label="Video (*.mp4)")],
+    outputs=[gr.Label(label='Translation')],
+    allow_flagging="never",
+    title=title,
+    examples=examples,
+).launch()

models ADDED Viewed

	@@ -0,0 +1 @@


1	+ Subproject commit 882b879da2dfd2dc75795feb681575ed4320cf33

videos/videos_accident2.mp4 ADDED Viewed

Binary file (42.9 kB). View file

videos/videos_all.mp4 ADDED Viewed

Binary file (91 kB). View file

videos/videos_before.mp4 ADDED Viewed

Binary file (25.4 kB). View file

videos/videos_blue.mp4 ADDED Viewed

Binary file (718 kB). View file

videos/videos_no.mp4 ADDED Viewed

Binary file (235 kB). View file

videos/videos_white.mp4 ADDED Viewed

Binary file (302 kB). View file