Spaces:

yulu2
/

FoundationMotion

Running on Zero

App Files Files Community

yulu2 committed 21 days ago

Commit

b4d2fca

verified ·

1 Parent(s): d7ff42a

Update app.py

Browse files

Files changed (1) hide show

app.py +88 -15

app.py CHANGED Viewed

@@ -16,6 +16,22 @@ MAX_FRAMES = 48
 MAX_NEW_TOKENS = 128
 TEMPERATURE = 1.0
 # ========== Load Model & Processor ==========
 model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     MODEL_ID,
@@ -81,17 +97,40 @@ def build_messages(frames: List[Image.Image], question: str, fps: float = 1.0):
     ]
     return messages
 # ========== Inference ==========
 @spaces.GPU
 @torch.inference_mode()
-def answer(video, question):
-    if video is None:
-        return "Please upload a video first."
     if not question or question.strip() == "":
         question = "Describe this video in detail."
     # Extract frames from video
-    frames = extract_video_frames(video, max_frames=MAX_FRAMES)
     if not frames:
         return "Error: Unable to extract frames from video."
@@ -150,35 +189,69 @@ with gr.Blocks(title="Video Q&A with Qwen2.5-VL-7B") as demo:
     gr.Markdown(
         """
         # FoundationMotion: Auto-Labeling and Reasoning about Spatial Movement in Videos
-        Upload a video, ask a question, and get an answer!
         """
     )
     with gr.Row():
         with gr.Column(scale=1):
-            video = gr.Video(label="Upload Video (mp4, mov, webm)", height=400)
         with gr.Column(scale=1):
             question = gr.Textbox(
                 label="Your Question",
-                placeholder="e.g., What is happening in this video?",
                 lines=2,
             )
             ask_btn = gr.Button("Ask", variant="primary")
             output = gr.Textbox(label="Answer", lines=10, show_copy_button=True)
-    gr.Examples(
-        examples=[
-            ["What is happening in this video?"],
-            ["Describe the main objects and actions in this video."],
-            ["Summarize this video in a few sentences."],
-        ],
-        inputs=[question],
     )
     ask_btn.click(
         fn=answer,
-        inputs=[video, question],
         outputs=[output],
     )

 MAX_NEW_TOKENS = 128
 TEMPERATURE = 1.0
+# ========== Video Examples Configuration ==========
+VIDEO_EXAMPLES = {
+    "1_raw.mp4": {
+        "path": "videos/1_raw.mp4",
+        "questions": ["What's happening in this video?", "Which hand holds the pen?"]
+    },
+    "4_raw.mp4": {
+        "path": "videos/4_raw.mp4",
+        "questions": ["What's happening in this video?", "What is the main action in the video?"]
+    },
+    "6_raw.mp4": {
+        "path": "videos/6_raw.mp4",
+        "questions": ["What's happening in this video?", "What's the right hand doing?"]
+    },
+}
 # ========== Load Model & Processor ==========
 model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     MODEL_ID,
     ]
     return messages
+# ========== Helper Functions ==========
+def update_video_display(video_name):
+    """Update video display and example questions when video is selected"""
+    if video_name is None:
+        return None, ""
+    video_info = VIDEO_EXAMPLES[video_name]
+    video_path = video_info["path"]
+    example_questions = "\n".join([f"• {q}" for q in video_info["questions"]])
+    return video_path, example_questions
+def fill_question(video_name, question_idx):
+    """Fill the question textbox with selected example question"""
+    if video_name is None:
+        return ""
+    questions = VIDEO_EXAMPLES[video_name]["questions"]
+    if 0 <= question_idx < len(questions):
+        return questions[question_idx]
+    return ""
 # ========== Inference ==========
 @spaces.GPU
 @torch.inference_mode()
+def answer(video_name, question):
+    if video_name is None:
+        return "Please select a video first."
     if not question or question.strip() == "":
         question = "Describe this video in detail."
+    video_path = VIDEO_EXAMPLES[video_name]["path"]
     # Extract frames from video
+    frames = extract_video_frames(video_path, max_frames=MAX_FRAMES)
     if not frames:
         return "Error: Unable to extract frames from video."
     gr.Markdown(
         """
         # FoundationMotion: Auto-Labeling and Reasoning about Spatial Movement in Videos
+        Select a video, ask a question, and get an answer!
         """
     )
     with gr.Row():
         with gr.Column(scale=1):
+            # Video selector dropdown
+            video_selector = gr.Dropdown(
+                choices=list(VIDEO_EXAMPLES.keys()),
+                label="Select a Video",
+                value=None,
+                interactive=True,
+            )
+            # Video display (read-only)
+            video_display = gr.Video(
+                label="Video Preview",
+                height=400,
+                interactive=False,
+            )
         with gr.Column(scale=1):
+            # Example questions display
+            example_questions_display = gr.Textbox(
+                label="Example Questions (click buttons below to use)",
+                lines=3,
+                interactive=False,
+            )
+            # Buttons for quick question selection
+            with gr.Row():
+                q1_btn = gr.Button("Use Question 1", size="sm")
+                q2_btn = gr.Button("Use Question 2", size="sm")
             question = gr.Textbox(
                 label="Your Question",
+                placeholder="Type your question or click an example button above",
                 lines=2,
             )
             ask_btn = gr.Button("Ask", variant="primary")
             output = gr.Textbox(label="Answer", lines=10, show_copy_button=True)
+    # Event handlers
+    video_selector.change(
+        fn=update_video_display,
+        inputs=[video_selector],
+        outputs=[video_display, example_questions_display],
+    )
+    q1_btn.click(
+        fn=lambda v: fill_question(v, 0),
+        inputs=[video_selector],
+        outputs=[question],
+    )
+    q2_btn.click(
+        fn=lambda v: fill_question(v, 1),
+        inputs=[video_selector],
+        outputs=[question],
     )
     ask_btn.click(
         fn=answer,
+        inputs=[video_selector, question],
         outputs=[output],
     )