Spaces:

kylemin
/

EK100MIR

Paused

App Files Files Community

gina9726 committed on Mar 24, 2024

Commit

81071ed

verified ·

1 Parent(s): 5f18375

Update app.py

Browse files

Files changed (1) hide show

app.py +6 -8

app.py CHANGED Viewed

@@ -23,10 +23,8 @@ def load_v2t_samples(data_root):
 def load_t2v_samples(data_root):
     sample_text = ['cut the sausage', 'stir vegetables into salmon', 'rinse cutting board']
     idx2sid = {0: 2119, 1: 1730, 2: 1276}
     return sample_text, idx2sid
 def format_pred(pred, gt):
     tp = '[color=green]{}[/color]'
     fp = '[color=red]{}[/color]'
@@ -57,10 +55,10 @@ def main():
     def predict_t2v(idx):
         sid = idx2sid_t2v[idx]
-        zeroshot_video, gt_video = lavila.predict_t2v(idx, sid)
         egovpa_video, gt_video = egovpa.predict_t2v(idx, sid)
-        return gt_video, zeroshot_video, egovpa_video
     with gr.Blocks() as demo:
         with gr.Tab("Video-to-text retrieval"):
@@ -97,12 +95,12 @@ def main():
                     text = gr.Text(label="text query")
                 with gr.Column():
                     idx = gr.Number(label="Idx", visible=False)
-                    zeroshot = gr.Textbox(label="LaViLa (zero-shot) prediction")
                     #zeroshot = gr.Gallery(label="LaViLa (zero-shot) prediction", columns=[3], rows=[1], object_fit="contain", height="auto")
-                    ours = gr.Textbox(label="Ego-VPA prediction")
-                    #ours = gr.Gallery(label="Ego-VPA prediction", columns=[3], rows=[1], object_fit="contain", height="auto")
             btn = gr.Button("Predict", variant="primary")
-            btn.click(predict_t2v, inputs=[idx], outputs=[label, zeroshot, ours])
             gr.Examples(examples=[[i, x] for i, x in enumerate(t2v_samples)], inputs=[idx, text])

 def load_t2v_samples(data_root):
     """Return the example text queries and the index-to-sample-id mapping.

     ``data_root`` is accepted but not used by this function.

     Returns:
         A ``(queries, idx_to_sid)`` tuple: the list of example query strings
         and a dict mapping each example index to a sample id.
     """
     idx_to_sid = {0: 2119, 1: 1730, 2: 1276}
     queries = [
         'cut the sausage',
         'stir vegetables into salmon',
         'rinse cutting board',
     ]
     return queries, idx_to_sid
 def format_pred(pred, gt):
     tp = '[color=green]{}[/color]'
     fp = '[color=red]{}[/color]'
     def predict_t2v(idx):
         """Return GIF paths for the Ego-VPA text-to-video retrieval results.

         Args:
             idx: Index of the selected example query; used as the key into
                 ``idx2sid_t2v``.

         Returns:
             A list of ``{data_root}/video/gif/<item>.gif`` path strings, one
             per item in the first value returned by ``egovpa.predict_t2v``.
         """
         sid = idx2sid_t2v[idx]
         # The second returned value is unpacked but not used here.
         retrieved, _gt_video = egovpa.predict_t2v(idx, sid)
         # Iterate over the retrieved items themselves; `ego_video` is never
         # bound in this function, so referencing it would raise NameError.
         return [f'{data_root}/video/gif/{x}.gif' for x in retrieved]
     with gr.Blocks() as demo:
         with gr.Tab("Video-to-text retrieval"):
                     text = gr.Text(label="text query")
                 with gr.Column():
                     idx = gr.Number(label="Idx", visible=False)
+                    #zeroshot = gr.Textbox(label="LaViLa (zero-shot) prediction")
                     #zeroshot = gr.Gallery(label="LaViLa (zero-shot) prediction", columns=[3], rows=[1], object_fit="contain", height="auto")
+                    #ours = gr.Textbox(label="Ego-VPA prediction")
+                    ours = gr.Gallery(label="Ego-VPA prediction", columns=[3], rows=[1], object_fit="contain", height="auto")
             btn = gr.Button("Predict", variant="primary")
+            btn.click(predict_t2v, inputs=[idx], outputs=[ours])
             gr.Examples(examples=[[i, x] for i, x in enumerate(t2v_samples)], inputs=[idx, text])