Video-Text-to-Text
Transformers
English
video
video-question-answering
multimodal
vision-language
qwen3-vl
inference-time
frame-selection
clip
Instructions to use commandeaw/DW-KhotTaeVL-2B-QueryFrames with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use commandeaw/DW-KhotTaeVL-2B-QueryFrames with Transformers:
# Load model directly
from transformers import AutoModel
model = AutoModel.from_pretrained("commandeaw/DW-KhotTaeVL-2B-QueryFrames", dtype="auto")
- Notebooks
- Google Colab
- Kaggle
eval_videomme: rename --mode wild → mcq (wild kept as deprecated alias); update docstrings to use MCQ-mode terminology
Browse files
- eval_videomme.py +20 -13
eval_videomme.py
CHANGED
|
@@ -1,24 +1,27 @@
|
|
| 1 |
"""Standalone Video-MME mini eval for DW-KhotTaeVL-2B-QueryFrames.
|
| 2 |
|
| 3 |
-
This script reproduces the
|
| 4 |
-
model card. It is fully self-contained — only
|
| 5 |
-
`dw_queryframes.py` module shipped in this same
|
| 6 |
-
publicly-available datasets / models from Hugging Face.
|
| 7 |
|
| 8 |
Usage::
|
| 9 |
|
| 10 |
pip install torch transformers pillow decord huggingface_hub pandas pyarrow
|
| 11 |
|
| 12 |
-
#
|
| 13 |
-
python eval_videomme.py --mode wild --n-questions 50
|
| 14 |
|
| 15 |
# Stock baseline (uniform 8 frames; matches the stock numbers
|
| 16 |
# in the model card)
|
| 17 |
python eval_videomme.py --mode stock-uniform --n-questions 50
|
| 18 |
|
| 19 |
-
For
|
| 20 |
-
|
| 21 |
-
modes above then combine via ``build_hybrid.py``.
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
Outputs JSON with ``summary`` + ``results`` keys.
|
| 24 |
"""
|
|
@@ -134,9 +137,10 @@ def main() -> int:
|
|
| 134 |
ap = argparse.ArgumentParser()
|
| 135 |
ap.add_argument("--base", default="Qwen/Qwen3-VL-2B-Instruct")
|
| 136 |
ap.add_argument("--clip-model", default="openai/clip-vit-large-patch14")
|
| 137 |
-
ap.add_argument("--mode", choices=["wild", "stock-uniform"],
|
| 138 |
-
default="wild",
|
| 139 |
-
help="'
|
|
|
|
| 140 |
"'stock-uniform' = stock baseline (uniform 8 frames)")
|
| 141 |
ap.add_argument("--tag", default="")
|
| 142 |
ap.add_argument("--n-questions", type=int, default=50)
|
|
@@ -148,6 +152,9 @@ def main() -> int:
|
|
| 148 |
help="output JSON path (auto-named if omitted)")
|
| 149 |
ap.add_argument("--chunks", nargs="+", default=DEFAULT_CHUNKS)
|
| 150 |
args = ap.parse_args()
|
|
|
|
|
|
|
|
|
|
| 151 |
|
| 152 |
pq_path, zip_paths = download_assets(args.chunks)
|
| 153 |
video_dir = unzip_chunks(zip_paths)
|
|
@@ -171,7 +178,7 @@ def main() -> int:
|
|
| 171 |
for i, row in df.iterrows():
|
| 172 |
video_path = video_dir / f"{row['videoID']}.mp4"
|
| 173 |
|
| 174 |
-
#
|
| 175 |
# Stock-uniform = pass a known no-frame-gain task name to force
|
| 176 |
# the uniform-fallback path (matches stock 8f
|
| 177 |
# baseline behavior).
|
|
|
|
| 1 |
"""Standalone Video-MME mini eval for DW-KhotTaeVL-2B-QueryFrames.
|
| 2 |
|
| 3 |
+
This script reproduces the MCQ-mode (no task_type) QA-frame numbers
|
| 4 |
+
reported in the model card. It is fully self-contained — only
|
| 5 |
+
depends on the `dw_queryframes.py` module shipped in this same
|
| 6 |
+
directory plus publicly-available datasets / models from Hugging Face.
|
| 7 |
|
| 8 |
Usage::
|
| 9 |
|
| 10 |
pip install torch transformers pillow decord huggingface_hub pandas pyarrow
|
| 11 |
|
| 12 |
+
# MCQ mode (query-aware frame selection, no task_type)
|
| 13 |
+
python eval_videomme.py --mode mcq --n-questions 50
|
| 14 |
|
| 15 |
# Stock baseline (uniform 8 frames; matches the stock numbers
|
| 16 |
# in the model card)
|
| 17 |
python eval_videomme.py --mode stock-uniform --n-questions 50
|
| 18 |
|
| 19 |
+
For task-aware MCQ mode (uses Video-MME's own task_type label to
|
| 20 |
+
route Object/Temporal Reasoning questions to uniform sampling),
|
| 21 |
+
run both modes above then combine via ``build_hybrid.py``.
|
| 22 |
+
|
| 23 |
+
The legacy CLI value ``--mode wild`` is accepted as a deprecated
|
| 24 |
+
alias for ``--mode mcq``.
|
| 25 |
|
| 26 |
Outputs JSON with ``summary`` + ``results`` keys.
|
| 27 |
"""
|
|
|
|
| 137 |
ap = argparse.ArgumentParser()
|
| 138 |
ap.add_argument("--base", default="Qwen/Qwen3-VL-2B-Instruct")
|
| 139 |
ap.add_argument("--clip-model", default="openai/clip-vit-large-patch14")
|
| 140 |
+
ap.add_argument("--mode", choices=["mcq", "wild", "stock-uniform"],
|
| 141 |
+
default="mcq",
|
| 142 |
+
help="'mcq' = query-aware MCQ mode (default); "
|
| 143 |
+
"'wild' = deprecated alias for 'mcq'; "
|
| 144 |
"'stock-uniform' = stock baseline (uniform 8 frames)")
|
| 145 |
ap.add_argument("--tag", default="")
|
| 146 |
ap.add_argument("--n-questions", type=int, default=50)
|
|
|
|
| 152 |
help="output JSON path (auto-named if omitted)")
|
| 153 |
ap.add_argument("--chunks", nargs="+", default=DEFAULT_CHUNKS)
|
| 154 |
args = ap.parse_args()
|
| 155 |
+
# Legacy alias: 'wild' → 'mcq' (deprecated).
|
| 156 |
+
if args.mode == "wild":
|
| 157 |
+
args.mode = "mcq"
|
| 158 |
|
| 159 |
pq_path, zip_paths = download_assets(args.chunks)
|
| 160 |
video_dir = unzip_chunks(zip_paths)
|
|
|
|
| 178 |
for i, row in df.iterrows():
|
| 179 |
video_path = video_dir / f"{row['videoID']}.mp4"
|
| 180 |
|
| 181 |
+
# MCQ mode = query-aware (task_type=None lets QA path run).
|
| 182 |
# Stock-uniform = pass a known no-frame-gain task name to force
|
| 183 |
# the uniform-fallback path (matches stock 8f
|
| 184 |
# baseline behavior).
|