Video-Text-to-Text
Transformers
English
video
video-question-answering
multimodal
vision-language
qwen3-vl
inference-time
frame-selection
clip
Instructions to use commandeaw/DW-KhotTaeVL-2B-QueryFrames with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use commandeaw/DW-KhotTaeVL-2B-QueryFrames with Transformers:
# Load model directly
from transformers import AutoModel
model = AutoModel.from_pretrained("commandeaw/DW-KhotTaeVL-2B-QueryFrames", dtype="auto")
- Notebooks
- Google Colab
- Kaggle
example_usage: rename Wild-mode → MCQ-mode terminology
Browse files
- example_usage.py +5 -4
example_usage.py
CHANGED
|
@@ -20,7 +20,7 @@ def main() -> None:
|
|
| 20 |
n_candidates=32,
|
| 21 |
)
|
| 22 |
|
| 23 |
-
#
|
| 24 |
result = fv.answer_mcq(
|
| 25 |
video_path="example.mp4",
|
| 26 |
question="What does the chef do after pouring the oil into the pot?",
|
|
@@ -31,14 +31,15 @@ def main() -> None:
|
|
| 31 |
"Adds salt to the pot",
|
| 32 |
],
|
| 33 |
)
|
| 34 |
-
print("[
|
| 35 |
print(f" pred : {result['pred']}")
|
| 36 |
print(f" raw output : {result['raw']!r}")
|
| 37 |
print(f" frames used : {result['frames_used']}")
|
| 38 |
print(f" CLIP latency : {result['latency_clip_s']} s")
|
| 39 |
print(f" GEN latency : {result['latency_gen_s']} s")
|
| 40 |
|
| 41 |
-
# Task-aware
|
|
|
|
| 42 |
result2 = fv.answer_mcq(
|
| 43 |
video_path="example.mp4",
|
| 44 |
question="What is happening to the cabbage in the frying pan?",
|
|
@@ -50,7 +51,7 @@ def main() -> None:
|
|
| 50 |
],
|
| 51 |
task_type="Object Reasoning", # → uniform-fallback path
|
| 52 |
)
|
| 53 |
-
print("\n[
|
| 54 |
print(f" pred : {result2['pred']}")
|
| 55 |
print(f" frames used : {result2['frames_used']}") # 'uniform_fallback'
|
| 56 |
|
|
|
|
| 20 |
n_candidates=32,
|
| 21 |
)
|
| 22 |
|
| 23 |
+
# MCQ mode (no task_type) — default.
|
| 24 |
result = fv.answer_mcq(
|
| 25 |
video_path="example.mp4",
|
| 26 |
question="What does the chef do after pouring the oil into the pot?",
|
|
|
|
| 31 |
"Adds salt to the pot",
|
| 32 |
],
|
| 33 |
)
|
| 34 |
+
print("[MCQ mode (no task_type)]")
|
| 35 |
print(f" pred : {result['pred']}")
|
| 36 |
print(f" raw output : {result['raw']!r}")
|
| 37 |
print(f" frames used : {result['frames_used']}")
|
| 38 |
print(f" CLIP latency : {result['latency_clip_s']} s")
|
| 39 |
print(f" GEN latency : {result['latency_gen_s']} s")
|
| 40 |
|
| 41 |
+
# Task-aware MCQ mode (when a task taxonomy is supplied, e.g. by
|
| 42 |
+
# Video-MME or by an operational workflow).
|
| 43 |
result2 = fv.answer_mcq(
|
| 44 |
video_path="example.mp4",
|
| 45 |
question="What is happening to the cabbage in the frying pan?",
|
|
|
|
| 51 |
],
|
| 52 |
task_type="Object Reasoning", # → uniform-fallback path
|
| 53 |
)
|
| 54 |
+
print("\n[Task-aware MCQ mode]")
|
| 55 |
print(f" pred : {result2['pred']}")
|
| 56 |
print(f" frames used : {result2['frames_used']}") # 'uniform_fallback'
|
| 57 |
|