yulu2 committed
Commit b4d2fca · verified · 1 Parent(s): d7ff42a

Update app.py

Files changed (1):
  1. app.py +88 -15
app.py CHANGED
@@ -16,6 +16,22 @@ MAX_FRAMES = 48
 MAX_NEW_TOKENS = 128
 TEMPERATURE = 1.0
 
+# ========== Video Examples Configuration ==========
+VIDEO_EXAMPLES = {
+    "1_raw.mp4": {
+        "path": "videos/1_raw.mp4",
+        "questions": ["What's happening in this video?", "Which hand holds the pen?"]
+    },
+    "4_raw.mp4": {
+        "path": "videos/4_raw.mp4",
+        "questions": ["What's happening in this video?", "What is the main action in the video?"]
+    },
+    "6_raw.mp4": {
+        "path": "videos/6_raw.mp4",
+        "questions": ["What's happening in this video?", "What's the right hand doing?"]
+    },
+}
+
 # ========== Load Model & Processor ==========
 model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     MODEL_ID,
@@ -81,17 +97,40 @@ def build_messages(frames: List[Image.Image], question: str, fps: float = 1.0):
     ]
     return messages
 
+# ========== Helper Functions ==========
+def update_video_display(video_name):
+    """Update video display and example questions when video is selected"""
+    if video_name is None:
+        return None, ""
+
+    video_info = VIDEO_EXAMPLES[video_name]
+    video_path = video_info["path"]
+    example_questions = "\n".join([f"• {q}" for q in video_info["questions"]])
+
+    return video_path, example_questions
+
+def fill_question(video_name, question_idx):
+    """Fill the question textbox with selected example question"""
+    if video_name is None:
+        return ""
+    questions = VIDEO_EXAMPLES[video_name]["questions"]
+    if 0 <= question_idx < len(questions):
+        return questions[question_idx]
+    return ""
+
 # ========== Inference ==========
 @spaces.GPU
 @torch.inference_mode()
-def answer(video, question):
-    if video is None:
-        return "Please upload a video first."
+def answer(video_name, question):
+    if video_name is None:
+        return "Please select a video first."
     if not question or question.strip() == "":
         question = "Describe this video in detail."
 
+    video_path = VIDEO_EXAMPLES[video_name]["path"]
+
     # Extract frames from video
-    frames = extract_video_frames(video, max_frames=MAX_FRAMES)
+    frames = extract_video_frames(video_path, max_frames=MAX_FRAMES)
     if not frames:
         return "Error: Unable to extract frames from video."
 
@@ -150,35 +189,69 @@ with gr.Blocks(title="Video Q&A with Qwen2.5-VL-7B") as demo:
     gr.Markdown(
         """
         # FoundationMotion: Auto-Labeling and Reasoning about Spatial Movement in Videos
-        Upload a video, ask a question, and get an answer!
+        Select a video, ask a question, and get an answer!
         """
     )
 
     with gr.Row():
         with gr.Column(scale=1):
-            video = gr.Video(label="Upload Video (mp4, mov, webm)", height=400)
+            # Video selector dropdown
+            video_selector = gr.Dropdown(
+                choices=list(VIDEO_EXAMPLES.keys()),
+                label="Select a Video",
+                value=None,
+                interactive=True,
+            )
+            # Video display (read-only)
+            video_display = gr.Video(
+                label="Video Preview",
+                height=400,
+                interactive=False,
+            )
 
         with gr.Column(scale=1):
+            # Example questions display
+            example_questions_display = gr.Textbox(
+                label="Example Questions (click buttons below to use)",
+                lines=3,
+                interactive=False,
+            )
+
+            # Buttons for quick question selection
+            with gr.Row():
+                q1_btn = gr.Button("Use Question 1", size="sm")
+                q2_btn = gr.Button("Use Question 2", size="sm")
+
             question = gr.Textbox(
                 label="Your Question",
-                placeholder="e.g., What is happening in this video?",
+                placeholder="Type your question or click an example button above",
                 lines=2,
             )
             ask_btn = gr.Button("Ask", variant="primary")
             output = gr.Textbox(label="Answer", lines=10, show_copy_button=True)
 
-    gr.Examples(
-        examples=[
-            ["What is happening in this video?"],
-            ["Describe the main objects and actions in this video."],
-            ["Summarize this video in a few sentences."],
-        ],
-        inputs=[question],
+    # Event handlers
+    video_selector.change(
+        fn=update_video_display,
+        inputs=[video_selector],
+        outputs=[video_display, example_questions_display],
+    )
+
+    q1_btn.click(
+        fn=lambda v: fill_question(v, 0),
+        inputs=[video_selector],
+        outputs=[question],
+    )
+
+    q2_btn.click(
+        fn=lambda v: fill_question(v, 1),
+        inputs=[video_selector],
+        outputs=[question],
    )
 
    ask_btn.click(
        fn=answer,
-        inputs=[video, question],
+        inputs=[video_selector, question],
        outputs=[output],
    )
 
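
Note: the updated answer path still calls extract_video_frames, which app.py defines outside the hunks shown in this commit. Below is a minimal sketch of what such a helper might look like, assuming OpenCV (cv2) decoding and uniform frame sampling; the actual implementation in app.py may differ.

# Hypothetical sketch only; app.py's real extract_video_frames is not shown in this diff.
# Assumes OpenCV decoding and uniform sampling of up to max_frames frames.
from typing import List

import cv2
from PIL import Image

def extract_video_frames(video_path: str, max_frames: int = 48) -> List[Image.Image]:
    """Uniformly sample up to max_frames RGB frames from a video file."""
    cap = cv2.VideoCapture(video_path)
    total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    if total <= 0:
        cap.release()
        return []
    step = max(total // max_frames, 1)  # spacing between sampled frame indices
    wanted = set(range(0, total, step))
    frames: List[Image.Image] = []
    idx = 0
    while len(frames) < max_frames:
        ok, frame = cap.read()
        if not ok:
            break
        if idx in wanted:
            # OpenCV decodes frames as BGR; the processor expects RGB PIL images.
            frames.append(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))
        idx += 1
    cap.release()
    return frames

One design consequence visible in the diff: answer now receives the dropdown key (a plain string) and resolves the file path through VIDEO_EXAMPLES itself, so the gr.Video preview can stay interactive=False and questions can only be asked about the bundled example clips.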