Spaces:
Running
on
Zero
Running
on
Zero
John Ho
committed on
Commit
·
ff0b093
1
Parent(s):
4361fd1
added control for fps and max tokens
Browse files
app.py
CHANGED
|
@@ -129,8 +129,8 @@ def inference(
|
|
| 129 |
video_path: str,
|
| 130 |
prompt: str = "Describe the camera motion in this video.",
|
| 131 |
model_name: str = "qwen2.5-vl-7b-instruct",
|
| 132 |
-
|
| 133 |
-
|
| 134 |
):
|
| 135 |
# default processor
|
| 136 |
# processor, model = PROCESSOR, MODEL
|
|
@@ -142,7 +142,7 @@ def inference(
|
|
| 142 |
processor = PROCESSORS[model_name]
|
| 143 |
|
| 144 |
# The model is trained on 8.0 FPS which we recommend for optimal inference
|
| 145 |
-
fps = get_fps_ffmpeg(video_path)
|
| 146 |
logger.info(f"{os.path.basename(video_path)} FPS: {fps}")
|
| 147 |
messages = [
|
| 148 |
{
|
|
@@ -180,7 +180,7 @@ def inference(
|
|
| 180 |
inputs = inputs.to("cuda")
|
| 181 |
|
| 182 |
# Inference
|
| 183 |
-
generated_ids = model.generate(**inputs, max_new_tokens=
|
| 184 |
generated_ids_trimmed = [
|
| 185 |
out_ids[len(in_ids) :]
|
| 186 |
for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
|
|
@@ -199,6 +199,21 @@ demo = gr.Interface(
|
|
| 199 |
gr.Video(label="Input Video"),
|
| 200 |
gr.Textbox(label="Prompt", value="Describe the camera motion in this video."),
|
| 201 |
gr.Dropdown(label="Model", choices=list(MODEL_ZOO.keys())),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 202 |
# gr.Checkbox(label="Use Flash Attention", value=False),
|
| 203 |
# gr.Checkbox(label="Apply Quantization", value=True),
|
| 204 |
],
|
|
|
|
| 129 |
video_path: str,
|
| 130 |
prompt: str = "Describe the camera motion in this video.",
|
| 131 |
model_name: str = "qwen2.5-vl-7b-instruct",
|
| 132 |
+
custom_fps: int = 8,
|
| 133 |
+
max_tokens: int = 256,
|
| 134 |
):
|
| 135 |
# default processor
|
| 136 |
# processor, model = PROCESSOR, MODEL
|
|
|
|
| 142 |
processor = PROCESSORS[model_name]
|
| 143 |
|
| 144 |
# The model is trained on 8.0 FPS which we recommend for optimal inference
|
| 145 |
+
fps = custom_fps if custom_fps else get_fps_ffmpeg(video_path)
|
| 146 |
logger.info(f"{os.path.basename(video_path)} FPS: {fps}")
|
| 147 |
messages = [
|
| 148 |
{
|
|
|
|
| 180 |
inputs = inputs.to("cuda")
|
| 181 |
|
| 182 |
# Inference
|
| 183 |
+
generated_ids = model.generate(**inputs, max_new_tokens=max_tokens)
|
| 184 |
generated_ids_trimmed = [
|
| 185 |
out_ids[len(in_ids) :]
|
| 186 |
for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
|
|
|
|
| 199 |
gr.Video(label="Input Video"),
|
| 200 |
gr.Textbox(label="Prompt", value="Describe the camera motion in this video."),
|
| 201 |
gr.Dropdown(label="Model", choices=list(MODEL_ZOO.keys())),
|
| 202 |
+
gr.Number(
|
| 203 |
+
label="FPS",
|
| 204 |
+
info="inference sampling rate (Qwen2.5VL is trained on videos with 8 fps); a value of 0 means the FPS of the input video will be used",
|
| 205 |
+
value=8,
|
| 206 |
+
minimum=0,
|
| 207 |
+
step=1,
|
| 208 |
+
),
|
| 209 |
+
gr.Slider(
|
| 210 |
+
label="Max Tokens",
|
| 211 |
+
info="maximum number of tokens to generate",
|
| 212 |
+
value=128,
|
| 213 |
+
minimum=32,
|
| 214 |
+
maximum=512,
|
| 215 |
+
step=32,
|
| 216 |
+
),
|
| 217 |
# gr.Checkbox(label="Use Flash Attention", value=False),
|
| 218 |
# gr.Checkbox(label="Apply Quantization", value=True),
|
| 219 |
],
|