Spaces:

AnsenH
/

Highlight_Detection_with_MomentDETR

Runtime error

App Files Files Community

AnsenH committed on Aug 30, 2023

Commit

84805b3

1 Parent(s): 24615d9

update UI

Browse files

Files changed (3) hide show

.gitignore +2 -1
app.py +43 -17
run_on_video/run.py +1 -0

.gitignore CHANGED Viewed

@@ -1,4 +1,5 @@
 *.mp4
 *.MP4
 *.mov
-*.MOV

 *.mp4
 *.MP4
 *.mov
+*.MOV
+testing_data

app.py CHANGED Viewed

@@ -2,14 +2,16 @@ import gradio as gr
 from run_on_video.run import MomentDETRPredictor
 from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip
 import torch
 DESCRIPTION = """
 _This Space demonstrates model [QVHighlights: Detecting Moments and Highlights in Videos via Natural Language Queries](https://arxiv.org/abs/2107.09609), NeurIPS 2021, by [Jie Lei](http://www.cs.unc.edu/~jielei/), [Tamara L. Berg](http://tamaraberg.com/), [Mohit Bansal](http://www.cs.unc.edu/~mbansal/)_
 """
 ckpt_path = "run_on_video/moment_detr_ckpt/model_best.ckpt"
 clip_model_name_or_path = "ViT-B/32"
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 moment_detr_predictor = MomentDETRPredictor(
     ckpt_path=ckpt_path,
@@ -22,11 +24,14 @@ def trim_video(video_path, start, end, output_file='result.mp4'):
     return output_file
 def display_prediction(result):
-    return f'### Moment  Start time:  {result[0]},   End time:  {result[1]},  Score:  {result[2]}'
 with gr.Blocks(theme=gr.themes.Default()) as demo:
     output_videos = gr.State(None)
     moment_prediction = gr.State(None)
     gr.HTML("""<h2 align="center"> 🎞️  Highlight Detection with MomentDETR </h2>""")
     gr.Markdown(DESCRIPTION)
     with gr.Column():
@@ -37,8 +42,14 @@ with gr.Blocks(theme=gr.themes.Default()) as demo:
                     input_video = gr.Video(label="Please input mp4", height=400)
             with gr.Blocks():
                 with gr.Column():
-                    gr.HTML("""<h3 align="center"> Highlight Videos </h3>""")
                     playable_video = gr.Video(height=400)
         with gr.Row():
             with gr.Column():
                 retrieval_text = gr.Textbox(
@@ -50,24 +61,27 @@ with gr.Blocks(theme=gr.themes.Default()) as demo:
             with gr.Column():
                 radio_button = gr.Radio(
                     choices=[i+1 for i in range(10)],
-                    label="Moments",
                     value=1
                 )
-                display_score = gr.Markdown("### Moment Score: ")
-        def update_video_player(radio_value, output_videos, moment_prediction):
             if output_videos is None or moment_prediction is None:
-                return [None, None]
             return {
-                playable_video: output_videos[radio_value-1],
-                display_score: display_prediction(moment_prediction[radio_value-1])
             }
     def submit_video(input_video, retrieval_text):
         print(f'== video path: {input_video}')
         print(f'== retrieval_text: {retrieval_text}')
         if input_video is None:
-            return [None, None, None, None, 1]
         if retrieval_text is None:
             retrieval_text = ''
         predictions, video_frames = moment_detr_predictor.localize_moment(
@@ -75,32 +89,44 @@ with gr.Blocks(theme=gr.themes.Default()) as demo:
             query_list=[retrieval_text]
         )
         predictions = predictions[0]['pred_relevant_windows']
-        pred_windows = [[pred[0], pred[1]]for pred in predictions]
         output_files = [ trim_video(
             video_path=input_video,
-            start=pred_windows[i][0],
-            end=pred_windows[i][1],
             output_file=f'{i}.mp4'
         ) for i in range(10)]
         return {
             output_videos: output_files,
             moment_prediction: predictions,
             playable_video:  output_files[0],
             display_score: display_prediction(predictions[0]),
             radio_button: 1
         }
     radio_button.change(
         fn=update_video_player,
-        inputs=[radio_button, output_videos, moment_prediction],
-        outputs=[playable_video, display_score]
     )
     submit.click(
         fn=submit_video,
         inputs=[input_video, retrieval_text],
-        outputs=[output_videos, moment_prediction, playable_video, display_score, radio_button]
     )
 demo.launch()

 from run_on_video.run import MomentDETRPredictor
 from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip
 import torch
+from lbhd.infer import lbhd_predict
 DESCRIPTION = """
 _This Space demonstrates model [QVHighlights: Detecting Moments and Highlights in Videos via Natural Language Queries](https://arxiv.org/abs/2107.09609), NeurIPS 2021, by [Jie Lei](http://www.cs.unc.edu/~jielei/), [Tamara L. Berg](http://tamaraberg.com/), [Mohit Bansal](http://www.cs.unc.edu/~mbansal/)_
 """
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
 ckpt_path = "run_on_video/moment_detr_ckpt/model_best.ckpt"
 clip_model_name_or_path = "ViT-B/32"
 moment_detr_predictor = MomentDETRPredictor(
     ckpt_path=ckpt_path,
     return output_file
 def display_prediction(result):
+    return f'### Start time:  {result[0]:.2f},   End time:  {result[1]:.2f},  Score:  {result[2]:.2f}'
 with gr.Blocks(theme=gr.themes.Default()) as demo:
     output_videos = gr.State(None)
+    output_lbhd_videos = gr.State(None)
     moment_prediction = gr.State(None)
+    our_prediction = gr.State(None)
     gr.HTML("""<h2 align="center"> 🎞️  Highlight Detection with MomentDETR </h2>""")
     gr.Markdown(DESCRIPTION)
     with gr.Column():
                     input_video = gr.Video(label="Please input mp4", height=400)
             with gr.Blocks():
                 with gr.Column():
+                    gr.HTML("""<h3 align="center"> MomentDETR Result </h3>""")
                     playable_video = gr.Video(height=400)
+                    display_score = gr.Markdown("### Start time, End time, Score")
+            with gr.Blocks():
+                with gr.Column():
+                    gr.HTML("""<h3 align="center"> Ours Result </h3>""")
+                    our_result_video = gr.Video(height=400)
+                    display_clip_score = gr.Markdown("### Start time, End time, Score")
         with gr.Row():
             with gr.Column():
                 retrieval_text = gr.Textbox(
             with gr.Column():
                 radio_button = gr.Radio(
                     choices=[i+1 for i in range(10)],
+                    label="Top 10",
                     value=1
                 )
+                # display_score = gr.Markdown("### Moment Score: ")
+        def update_video_player(radio_value, output_videos, output_lbhd_videos, moment_prediction, our_prediction):
             if output_videos is None or moment_prediction is None:
+                return [None, None, None, None]
             return {
+                playable_video: output_videos[radio_value-1],
+                our_result_video: output_lbhd_videos[min(radio_value-1, len(output_lbhd_videos)-1)],
+                display_score: display_prediction(moment_prediction[radio_value-1]),
+                display_clip_score: display_prediction(our_prediction[min(radio_value-1, len(output_lbhd_videos)-1)])
             }
     def submit_video(input_video, retrieval_text):
         print(f'== video path: {input_video}')
         print(f'== retrieval_text: {retrieval_text}')
         if input_video is None:
+            return [None, None, None, None, None, None, None, None, 1]
         if retrieval_text is None:
             retrieval_text = ''
         predictions, video_frames = moment_detr_predictor.localize_moment(
             query_list=[retrieval_text]
         )
         predictions = predictions[0]['pred_relevant_windows']
         output_files = [ trim_video(
             video_path=input_video,
+            start=predictions[i][0],
+            end=predictions[i][1],
             output_file=f'{i}.mp4'
         ) for i in range(10)]
+        lbhd_predictions = lbhd_predict(input_video)
+        print(f'== lbhd_predictions: {lbhd_predictions}')
+        output_files_lbhd = [ trim_video(
+            video_path=input_video,
+            start=lbhd_predictions[i][0],
+            end=lbhd_predictions[i][1],
+            output_file=f'{i}_lbhd.mp4'
+        ) for i in range(10)]
         return {
             output_videos: output_files,
+            output_lbhd_videos: output_files_lbhd,
             moment_prediction: predictions,
+            our_prediction: lbhd_predictions,
             playable_video:  output_files[0],
+            our_result_video: output_files_lbhd[0],
             display_score: display_prediction(predictions[0]),
+            display_clip_score: display_prediction(lbhd_predictions[0]),
             radio_button: 1
         }
     radio_button.change(
         fn=update_video_player,
+        inputs=[radio_button, output_videos, output_lbhd_videos, moment_prediction, our_prediction],
+        outputs=[playable_video, our_result_video, display_score, display_clip_score]
     )
     submit.click(
         fn=submit_video,
         inputs=[input_video, retrieval_text],
+        outputs=[output_videos, output_lbhd_videos, moment_prediction, our_prediction, playable_video, our_result_video, display_score, display_clip_score, radio_button]
     )
 demo.launch()

run_on_video/run.py CHANGED Viewed

@@ -25,6 +25,7 @@ class MomentDETRPredictor:
         )
         print("Loading trained Moment-DETR model...")
         self.model = build_inference_model(ckpt_path).to(self.device)
     @torch.no_grad()
     def localize_moment(self, video_path, query_list):

         )
         print("Loading trained Moment-DETR model...")
         self.model = build_inference_model(ckpt_path).to(self.device)
+        self.model.eval()
     @torch.no_grad()
     def localize_moment(self, video_path, query_list):