John Ho committed on
Commit
ff0b093
·
1 Parent(s): 4361fd1

added control for fps and max tokens

Browse files
Files changed (1) hide show
  1. app.py +19 -4
app.py CHANGED
@@ -129,8 +129,8 @@ def inference(
129
  video_path: str,
130
  prompt: str = "Describe the camera motion in this video.",
131
  model_name: str = "qwen2.5-vl-7b-instruct",
132
- # use_flash_attention: bool = True,
133
- # apply_quantization: bool = True,
134
  ):
135
  # default processor
136
  # processor, model = PROCESSOR, MODEL
@@ -142,7 +142,7 @@ def inference(
142
  processor = PROCESSORS[model_name]
143
 
144
  # The model is trained on 8.0 FPS which we recommend for optimal inference
145
- fps = get_fps_ffmpeg(video_path)
146
  logger.info(f"{os.path.basename(video_path)} FPS: {fps}")
147
  messages = [
148
  {
@@ -180,7 +180,7 @@ def inference(
180
  inputs = inputs.to("cuda")
181
 
182
  # Inference
183
- generated_ids = model.generate(**inputs, max_new_tokens=128)
184
  generated_ids_trimmed = [
185
  out_ids[len(in_ids) :]
186
  for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
@@ -199,6 +199,21 @@ demo = gr.Interface(
199
  gr.Video(label="Input Video"),
200
  gr.Textbox(label="Prompt", value="Describe the camera motion in this video."),
201
  gr.Dropdown(label="Model", choices=list(MODEL_ZOO.keys())),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
202
  # gr.Checkbox(label="Use Flash Attention", value=False),
203
  # gr.Checkbox(label="Apply Quantization", value=True),
204
  ],
 
129
  video_path: str,
130
  prompt: str = "Describe the camera motion in this video.",
131
  model_name: str = "qwen2.5-vl-7b-instruct",
132
+ custom_fps: int = 8,
133
+ max_tokens: int = 256,
134
  ):
135
  # default processor
136
  # processor, model = PROCESSOR, MODEL
 
142
  processor = PROCESSORS[model_name]
143
 
144
  # The model is trained on 8.0 FPS which we recommend for optimal inference
145
+ fps = custom_fps if custom_fps else get_fps_ffmpeg(video_path)
146
  logger.info(f"{os.path.basename(video_path)} FPS: {fps}")
147
  messages = [
148
  {
 
180
  inputs = inputs.to("cuda")
181
 
182
  # Inference
183
+ generated_ids = model.generate(**inputs, max_new_tokens=max_tokens)
184
  generated_ids_trimmed = [
185
  out_ids[len(in_ids) :]
186
  for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
 
199
  gr.Video(label="Input Video"),
200
  gr.Textbox(label="Prompt", value="Describe the camera motion in this video."),
201
  gr.Dropdown(label="Model", choices=list(MODEL_ZOO.keys())),
202
+ gr.Number(
203
+ label="FPS",
204
+ info="inference sampling rate (Qwen2.5VL is trained on videos with 8 fps); a value of 0 means the FPS of the input video will be used",
205
+ value=8,
206
+ minimum=0,
207
+ step=1,
208
+ ),
209
+ gr.slider(
210
+ label="Max Tokens",
211
+ info="maximum number of tokens to generate",
212
+ value=128,
213
+ minimum=32,
214
+ maximum=512,
215
+ step=32,
216
+ ),
217
  # gr.Checkbox(label="Use Flash Attention", value=False),
218
  # gr.Checkbox(label="Apply Quantization", value=True),
219
  ],