Update app.py
Browse files
app.py
CHANGED
|
@@ -23,10 +23,8 @@ def load_v2t_samples(data_root):
|
|
| 23 |
def load_t2v_samples(data_root):
|
| 24 |
sample_text = ['cut the sausage', 'stir vegetables into salmon', 'rinse cutting board']
|
| 25 |
idx2sid = {0: 2119, 1: 1730, 2: 1276}
|
| 26 |
-
|
| 27 |
return sample_text, idx2sid
|
| 28 |
|
| 29 |
-
|
| 30 |
def format_pred(pred, gt):
|
| 31 |
tp = '[color=green]{}[/color]'
|
| 32 |
fp = '[color=red]{}[/color]'
|
|
@@ -57,10 +55,10 @@ def main():
|
|
| 57 |
|
| 58 |
def predict_t2v(idx):
|
| 59 |
sid = idx2sid_t2v[idx]
|
| 60 |
-
zeroshot_video, gt_video = lavila.predict_t2v(idx, sid)
|
| 61 |
egovpa_video, gt_video = egovpa.predict_t2v(idx, sid)
|
|
|
|
| 62 |
|
| 63 |
-
return
|
| 64 |
|
| 65 |
with gr.Blocks() as demo:
|
| 66 |
with gr.Tab("Video-to-text retrieval"):
|
|
@@ -97,12 +95,12 @@ def main():
|
|
| 97 |
text = gr.Text(label="text query")
|
| 98 |
with gr.Column():
|
| 99 |
idx = gr.Number(label="Idx", visible=False)
|
| 100 |
-
zeroshot = gr.Textbox(label="LaViLa (zero-shot) prediction")
|
| 101 |
#zeroshot = gr.Gallery(label="LaViLa (zero-shot) prediction", columns=[3], rows=[1], object_fit="contain", height="auto")
|
| 102 |
-
ours = gr.Textbox(label="Ego-VPA prediction")
|
| 103 |
-
|
| 104 |
btn = gr.Button("Predict", variant="primary")
|
| 105 |
-
btn.click(predict_t2v, inputs=[idx], outputs=[
|
| 106 |
gr.Examples(examples=[[i, x] for i, x in enumerate(t2v_samples)], inputs=[idx, text])
|
| 107 |
|
| 108 |
|
|
|
|
| 23 |
def load_t2v_samples(data_root):
    """Return the built-in text-to-video demo queries and their index map.

    Args:
        data_root: Accepted for signature compatibility; not read here.

    Returns:
        A ``(sample_text, idx2sid)`` tuple: the list of query strings and a
        dict mapping each query's list position to its ``sid`` value.
    """
    queries = [
        'cut the sausage',
        'stir vegetables into salmon',
        'rinse cutting board',
    ]
    # One sid per query, in the same order as `queries`.
    sids = (2119, 1730, 1276)
    position_to_sid = dict(enumerate(sids))
    return queries, position_to_sid
|
| 27 |
|
|
|
|
| 28 |
def format_pred(pred, gt):
|
| 29 |
tp = '[color=green]{}[/color]'
|
| 30 |
fp = '[color=red]{}[/color]'
|
|
|
|
| 55 |
|
| 56 |
def predict_t2v(idx):
    """Return GIF paths for the Ego-VPA text-to-video prediction of one sample.

    Args:
        idx: Key into ``idx2sid_t2v`` selecting the example query.

    Returns:
        A list of ``{data_root}/video/gif/<item>.gif`` path strings, one per
        item in the prediction returned by ``egovpa.predict_t2v``.

    Raises:
        KeyError: If ``idx`` is not a key of ``idx2sid_t2v``.
    """
    sid = idx2sid_t2v[idx]
    # The second element of the result is not displayed by this handler.
    predicted, _gt_video = egovpa.predict_t2v(idx, sid)
    # Iterate over the prediction that was just returned; the previous code
    # looped over `ego_video`, a name that is never bound in the visible code.
    # NOTE(review): assumes each predicted item is a GIF file stem under
    # data_root/video/gif - confirm against egovpa.predict_t2v.
    return [f'{data_root}/video/gif/{x}.gif' for x in predicted]
|
| 62 |
|
| 63 |
with gr.Blocks() as demo:
|
| 64 |
with gr.Tab("Video-to-text retrieval"):
|
|
|
|
| 95 |
text = gr.Text(label="text query")
|
| 96 |
with gr.Column():
|
| 97 |
idx = gr.Number(label="Idx", visible=False)
|
| 98 |
+
#zeroshot = gr.Textbox(label="LaViLa (zero-shot) prediction")
|
| 99 |
#zeroshot = gr.Gallery(label="LaViLa (zero-shot) prediction", columns=[3], rows=[1], object_fit="contain", height="auto")
|
| 100 |
+
#ours = gr.Textbox(label="Ego-VPA prediction")
|
| 101 |
+
ours = gr.Gallery(label="Ego-VPA prediction", columns=[3], rows=[1], object_fit="contain", height="auto")
|
| 102 |
btn = gr.Button("Predict", variant="primary")
|
| 103 |
+
btn.click(predict_t2v, inputs=[idx], outputs=[ours])
|
| 104 |
gr.Examples(examples=[[i, x] for i, x in enumerate(t2v_samples)], inputs=[idx, text])
|
| 105 |
|
| 106 |
|