Spaces:
Runtime error
Runtime error
update UI
Browse files- .gitignore +2 -1
- app.py +43 -17
- run_on_video/run.py +1 -0
.gitignore
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
*.mp4
|
| 2 |
*.MP4
|
| 3 |
*.mov
|
| 4 |
-
*.MOV
|
|
|
|
|
|
| 1 |
*.mp4
|
| 2 |
*.MP4
|
| 3 |
*.mov
|
| 4 |
+
*.MOV
|
| 5 |
+
testing_data
|
app.py
CHANGED
|
@@ -2,14 +2,16 @@ import gradio as gr
|
|
| 2 |
from run_on_video.run import MomentDETRPredictor
|
| 3 |
from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip
|
| 4 |
import torch
|
|
|
|
| 5 |
|
| 6 |
DESCRIPTION = """
|
| 7 |
_This Space demonstrates model [QVHighlights: Detecting Moments and Highlights in Videos via Natural Language Queries](https://arxiv.org/abs/2107.09609), NeurIPS 2021, by [Jie Lei](http://www.cs.unc.edu/~jielei/), [Tamara L. Berg](http://tamaraberg.com/), [Mohit Bansal](http://www.cs.unc.edu/~mbansal/)_
|
| 8 |
"""
|
| 9 |
|
|
|
|
|
|
|
| 10 |
ckpt_path = "run_on_video/moment_detr_ckpt/model_best.ckpt"
|
| 11 |
clip_model_name_or_path = "ViT-B/32"
|
| 12 |
-
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 13 |
|
| 14 |
moment_detr_predictor = MomentDETRPredictor(
|
| 15 |
ckpt_path=ckpt_path,
|
|
@@ -22,11 +24,14 @@ def trim_video(video_path, start, end, output_file='result.mp4'):
|
|
| 22 |
return output_file
|
| 23 |
|
| 24 |
def display_prediction(result):
|
| 25 |
-
return f'###
|
| 26 |
|
| 27 |
with gr.Blocks(theme=gr.themes.Default()) as demo:
|
| 28 |
output_videos = gr.State(None)
|
|
|
|
| 29 |
moment_prediction = gr.State(None)
|
|
|
|
|
|
|
| 30 |
gr.HTML("""<h2 align="center"> 🎞️ Highlight Detection with MomentDETR </h2>""")
|
| 31 |
gr.Markdown(DESCRIPTION)
|
| 32 |
with gr.Column():
|
|
@@ -37,8 +42,14 @@ with gr.Blocks(theme=gr.themes.Default()) as demo:
|
|
| 37 |
input_video = gr.Video(label="Please input mp4", height=400)
|
| 38 |
with gr.Blocks():
|
| 39 |
with gr.Column():
|
| 40 |
-
gr.HTML("""<h3 align="center">
|
| 41 |
playable_video = gr.Video(height=400)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
with gr.Row():
|
| 43 |
with gr.Column():
|
| 44 |
retrieval_text = gr.Textbox(
|
|
@@ -50,24 +61,27 @@ with gr.Blocks(theme=gr.themes.Default()) as demo:
|
|
| 50 |
with gr.Column():
|
| 51 |
radio_button = gr.Radio(
|
| 52 |
choices=[i+1 for i in range(10)],
|
| 53 |
-
label="
|
| 54 |
value=1
|
| 55 |
)
|
| 56 |
-
display_score = gr.Markdown("### Moment Score: ")
|
|
|
|
| 57 |
|
| 58 |
-
def update_video_player(radio_value, output_videos, moment_prediction):
|
| 59 |
if output_videos is None or moment_prediction is None:
|
| 60 |
-
return [None, None]
|
| 61 |
return {
|
| 62 |
-
playable_video: output_videos[radio_value-1],
|
| 63 |
-
|
|
|
|
|
|
|
| 64 |
}
|
| 65 |
|
| 66 |
def submit_video(input_video, retrieval_text):
|
| 67 |
print(f'== video path: {input_video}')
|
| 68 |
print(f'== retrieval_text: {retrieval_text}')
|
| 69 |
if input_video is None:
|
| 70 |
-
return [None, None, None, None, 1]
|
| 71 |
if retrieval_text is None:
|
| 72 |
retrieval_text = ''
|
| 73 |
predictions, video_frames = moment_detr_predictor.localize_moment(
|
|
@@ -75,32 +89,44 @@ with gr.Blocks(theme=gr.themes.Default()) as demo:
|
|
| 75 |
query_list=[retrieval_text]
|
| 76 |
)
|
| 77 |
predictions = predictions[0]['pred_relevant_windows']
|
| 78 |
-
pred_windows = [[pred[0], pred[1]]for pred in predictions]
|
| 79 |
output_files = [ trim_video(
|
| 80 |
video_path=input_video,
|
| 81 |
-
start=
|
| 82 |
-
end=
|
| 83 |
output_file=f'{i}.mp4'
|
| 84 |
) for i in range(10)]
|
| 85 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
return {
|
| 87 |
output_videos: output_files,
|
|
|
|
| 88 |
moment_prediction: predictions,
|
|
|
|
| 89 |
playable_video: output_files[0],
|
|
|
|
| 90 |
display_score: display_prediction(predictions[0]),
|
|
|
|
| 91 |
radio_button: 1
|
| 92 |
}
|
| 93 |
|
| 94 |
radio_button.change(
|
| 95 |
fn=update_video_player,
|
| 96 |
-
inputs=[radio_button, output_videos, moment_prediction],
|
| 97 |
-
outputs=[playable_video, display_score]
|
| 98 |
)
|
| 99 |
|
| 100 |
submit.click(
|
| 101 |
fn=submit_video,
|
| 102 |
inputs=[input_video, retrieval_text],
|
| 103 |
-
outputs=[output_videos, moment_prediction, playable_video, display_score, radio_button]
|
| 104 |
)
|
| 105 |
|
| 106 |
demo.launch()
|
|
|
|
| 2 |
from run_on_video.run import MomentDETRPredictor
|
| 3 |
from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip
|
| 4 |
import torch
|
| 5 |
+
from lbhd.infer import lbhd_predict
|
| 6 |
|
| 7 |
DESCRIPTION = """
|
| 8 |
_This Space demonstrates model [QVHighlights: Detecting Moments and Highlights in Videos via Natural Language Queries](https://arxiv.org/abs/2107.09609), NeurIPS 2021, by [Jie Lei](http://www.cs.unc.edu/~jielei/), [Tamara L. Berg](http://tamaraberg.com/), [Mohit Bansal](http://www.cs.unc.edu/~mbansal/)_
|
| 9 |
"""
|
| 10 |
|
| 11 |
+
device = 'cuda' if torch.cuda.is_available() else 'cpu'
|
| 12 |
+
|
| 13 |
ckpt_path = "run_on_video/moment_detr_ckpt/model_best.ckpt"
|
| 14 |
clip_model_name_or_path = "ViT-B/32"
|
|
|
|
| 15 |
|
| 16 |
moment_detr_predictor = MomentDETRPredictor(
|
| 17 |
ckpt_path=ckpt_path,
|
|
|
|
| 24 |
return output_file
|
| 25 |
|
| 26 |
def display_prediction(result):
    """Format one predicted window as the Markdown heading shown under a player.

    Args:
        result: Indexable whose items 0, 1 and 2 are rendered as start time,
            end time and score, each with two decimals. May be ``None`` or
            empty when there is nothing to show.

    Returns:
        The formatted heading, or the placeholder heading the score widgets
        are created with when ``result`` is ``None`` or empty.
    """
    # Fall back to the widgets' initial text instead of raising on a missing
    # prediction, so an empty result list does not break the UI callback.
    if result is None or len(result) == 0:
        return '### Start time, End time, Score'
    return f'### Start time: {result[0]:.2f}, End time: {result[1]:.2f}, Score: {result[2]:.2f}'
|
| 28 |
|
| 29 |
with gr.Blocks(theme=gr.themes.Default()) as demo:
|
| 30 |
output_videos = gr.State(None)
|
| 31 |
+
output_lbhd_videos = gr.State(None)
|
| 32 |
moment_prediction = gr.State(None)
|
| 33 |
+
our_prediction = gr.State(None)
|
| 34 |
+
|
| 35 |
gr.HTML("""<h2 align="center"> 🎞️ Highlight Detection with MomentDETR </h2>""")
|
| 36 |
gr.Markdown(DESCRIPTION)
|
| 37 |
with gr.Column():
|
|
|
|
| 42 |
input_video = gr.Video(label="Please input mp4", height=400)
|
| 43 |
with gr.Blocks():
|
| 44 |
with gr.Column():
|
| 45 |
+
gr.HTML("""<h3 align="center"> MomentDETR Result </h3>""")
|
| 46 |
playable_video = gr.Video(height=400)
|
| 47 |
+
display_score = gr.Markdown("### Start time, End time, Score")
|
| 48 |
+
with gr.Blocks():
|
| 49 |
+
with gr.Column():
|
| 50 |
+
gr.HTML("""<h3 align="center"> Ours Result </h3>""")
|
| 51 |
+
our_result_video = gr.Video(height=400)
|
| 52 |
+
display_clip_score = gr.Markdown("### Start time, End time, Score")
|
| 53 |
with gr.Row():
|
| 54 |
with gr.Column():
|
| 55 |
retrieval_text = gr.Textbox(
|
|
|
|
| 61 |
with gr.Column():
|
| 62 |
radio_button = gr.Radio(
|
| 63 |
choices=[i+1 for i in range(10)],
|
| 64 |
+
label="Top 10",
|
| 65 |
value=1
|
| 66 |
)
|
| 67 |
+
# display_score = gr.Markdown("### Moment Score: ")
|
| 68 |
+
|
| 69 |
|
| 70 |
+
def update_video_player(radio_value, output_videos, output_lbhd_videos, moment_prediction, our_prediction):
    """Refresh both video players and both score lines for the selected rank.

    Args:
        radio_value: 1-based rank chosen on the radio button.
        output_videos: Clip paths for the MomentDETR results, or ``None``.
        output_lbhd_videos: Clip paths for the "Ours" results, or ``None``.
        moment_prediction: Entries rendered by ``display_prediction`` for
            the MomentDETR player, or ``None``.
        our_prediction: Entries rendered by ``display_prediction`` for the
            "Ours" player, or ``None``.

    Returns:
        A list of four ``None`` values when there is nothing to show yet;
        otherwise a dict mapping the four output components to their new
        values. A side with no data gets ``None``.
    """

    def _pick(items, index):
        # Clamp against the sequence's OWN length so a side with fewer
        # results than the selected rank shows its last entry, and an
        # empty or missing sequence yields None instead of raising.
        if items is None or len(items) == 0:
            return None
        return items[min(index, len(items) - 1)]

    if radio_value is None or output_videos is None or moment_prediction is None:
        return [None, None, None, None]

    index = radio_value - 1
    moment_window = _pick(moment_prediction, index)
    our_window = _pick(our_prediction, index)
    return {
        playable_video: _pick(output_videos, index),
        our_result_video: _pick(output_lbhd_videos, index),
        display_score: None if moment_window is None else display_prediction(moment_window),
        display_clip_score: None if our_window is None else display_prediction(our_window),
    }
|
| 79 |
|
| 80 |
def submit_video(input_video, retrieval_text):
    """Run both predictors on the uploaded video and cut one clip per result.

    Args:
        input_video: Path of the uploaded video, or ``None`` when nothing
            was uploaded.
        retrieval_text: Query text; ``None`` is treated as an empty query.

    Returns:
        When ``input_video`` is ``None``: a nine-item list that clears every
        output and resets the radio button to 1. Otherwise a dict mapping
        the state holders, both players, both score lines and the radio
        button to their new values.
    """
    print(f'== video path: {input_video}')
    print(f'== retrieval_text: {retrieval_text}')
    if input_video is None:
        return [None, None, None, None, None, None, None, None, 1]
    if retrieval_text is None:
        retrieval_text = ''

    # NOTE(review): the first argument line of this call is hidden between
    # diff hunks; it is reconstructed from localize_moment(video_path,
    # query_list) -- confirm against the real file.
    predictions, _ = moment_detr_predictor.localize_moment(
        video_path=input_video,
        query_list=[retrieval_text]
    )
    predictions = predictions[0]['pred_relevant_windows']

    # The radio button offers ranks 1..10, so cut at most ten clips per
    # model, but never index past the results a model actually returned.
    max_clips = 10
    output_files = [
        trim_video(
            video_path=input_video,
            start=window[0],
            end=window[1],
            output_file=f'{i}.mp4'
        )
        for i, window in enumerate(predictions[:max_clips])
    ]

    lbhd_predictions = lbhd_predict(input_video)
    print(f'== lbhd_predictions: {lbhd_predictions}')
    output_files_lbhd = [
        trim_video(
            video_path=input_video,
            start=window[0],
            end=window[1],
            output_file=f'{i}_lbhd.mp4'
        )
        for i, window in enumerate(lbhd_predictions[:max_clips])
    ]

    # A model that produced no clips leaves its player and score line empty
    # instead of raising on [0].
    return {
        output_videos: output_files,
        output_lbhd_videos: output_files_lbhd,
        moment_prediction: predictions,
        our_prediction: lbhd_predictions,
        playable_video: output_files[0] if output_files else None,
        our_result_video: output_files_lbhd[0] if output_files_lbhd else None,
        display_score: display_prediction(predictions[0]) if output_files else None,
        display_clip_score: display_prediction(lbhd_predictions[0]) if output_files_lbhd else None,
        radio_button: 1
    }
|
| 119 |
|
| 120 |
radio_button.change(
|
| 121 |
fn=update_video_player,
|
| 122 |
+
inputs=[radio_button, output_videos, output_lbhd_videos, moment_prediction, our_prediction],
|
| 123 |
+
outputs=[playable_video, our_result_video, display_score, display_clip_score]
|
| 124 |
)
|
| 125 |
|
| 126 |
submit.click(
|
| 127 |
fn=submit_video,
|
| 128 |
inputs=[input_video, retrieval_text],
|
| 129 |
+
outputs=[output_videos, output_lbhd_videos, moment_prediction, our_prediction, playable_video, our_result_video, display_score, display_clip_score, radio_button]
|
| 130 |
)
|
| 131 |
|
| 132 |
demo.launch()
|
run_on_video/run.py
CHANGED
|
@@ -25,6 +25,7 @@ class MomentDETRPredictor:
|
|
| 25 |
)
|
| 26 |
print("Loading trained Moment-DETR model...")
|
| 27 |
self.model = build_inference_model(ckpt_path).to(self.device)
|
|
|
|
| 28 |
|
| 29 |
@torch.no_grad()
|
| 30 |
def localize_moment(self, video_path, query_list):
|
|
|
|
| 25 |
)
|
| 26 |
print("Loading trained Moment-DETR model...")
|
| 27 |
self.model = build_inference_model(ckpt_path).to(self.device)
|
| 28 |
+
self.model.eval()
|
| 29 |
|
| 30 |
@torch.no_grad()
|
| 31 |
def localize_moment(self, video_path, query_list):
|