Video-Text-to-Text
Transformers
English
video
video-question-answering
multimodal
vision-language
qwen3-vl
inference-time
frame-selection
clip
Instructions to use commandeaw/DW-KhotTaeVL-2B-QueryFrames with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use commandeaw/DW-KhotTaeVL-2B-QueryFrames with Transformers:
# Load model directly from transformers import AutoModel model = AutoModel.from_pretrained("commandeaw/DW-KhotTaeVL-2B-QueryFrames", dtype="auto") - Notebooks
- Google Colab
- Kaggle
Add --start-idx for chunked evaluation (MPS resilience)
Browse filesApple MPS can corrupt accelerator state on long Qwen3-VL inference
runs (~800+ questions), producing AcceleratorError. Add --start-idx K
so callers can skip the first K filtered questions, enabling a
chunked-runner pattern: each chunk is a fresh Python process with
clean MPS state.
Also wrap the per-question inference in try/except: on
AcceleratorError or any other inference failure, save partial results
and exit gracefully so an outer chunked-runner can resume from
start-idx + completed_count.
- eval_videomme.py +44 -12
eval_videomme.py
CHANGED
|
@@ -103,13 +103,24 @@ def unzip_chunks(zip_paths: list[Path]) -> Path:
|
|
| 103 |
return video_dir
|
| 104 |
|
| 105 |
|
| 106 |
-
def load_questions(pq_path: Path, video_dir: Path, limit: int
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
df = pd.read_parquet(pq_path)
|
| 108 |
ids = {p.stem for p in video_dir.glob("*.mp4")}
|
| 109 |
df = df[df["videoID"].isin(ids)].reset_index(drop=True)
|
|
|
|
|
|
|
|
|
|
| 110 |
if limit > 0 and len(df) > limit:
|
| 111 |
df = df.iloc[:limit].copy()
|
| 112 |
-
print(f"[eval] using {len(df)} questions"
|
|
|
|
| 113 |
return df
|
| 114 |
|
| 115 |
|
|
@@ -143,7 +154,12 @@ def main() -> int:
|
|
| 143 |
"'wild' = deprecated alias for 'mcq'; "
|
| 144 |
"'stock-uniform' = stock baseline (uniform 8 frames)")
|
| 145 |
ap.add_argument("--tag", default="")
|
| 146 |
-
ap.add_argument("--n-questions", type=int, default=50
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 147 |
ap.add_argument("--n-frames", type=int, default=8)
|
| 148 |
ap.add_argument("--n-candidates", type=int, default=32)
|
| 149 |
ap.add_argument("--max-pixels", type=int, default=262144)
|
|
@@ -158,7 +174,8 @@ def main() -> int:
|
|
| 158 |
|
| 159 |
pq_path, zip_paths = download_assets(args.chunks)
|
| 160 |
video_dir = unzip_chunks(zip_paths)
|
| 161 |
-
df = load_questions(pq_path, video_dir, args.n_questions
|
|
|
|
| 162 |
|
| 163 |
os.environ.setdefault("PYTORCH_ENABLE_MPS_FALLBACK", "1")
|
| 164 |
|
|
@@ -176,6 +193,8 @@ def main() -> int:
|
|
| 176 |
correct = 0
|
| 177 |
t0 = time.time()
|
| 178 |
for i, row in df.iterrows():
|
|
|
|
|
|
|
| 179 |
video_path = video_dir / f"{row['videoID']}.mp4"
|
| 180 |
|
| 181 |
# MCQ mode = query-aware (task_type=None lets QA path run).
|
|
@@ -183,17 +202,27 @@ def main() -> int:
|
|
| 183 |
# the uniform-fallback path (matches stock 8f
|
| 184 |
# baseline behavior).
|
| 185 |
forced_uniform = (args.mode == "stock-uniform")
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 192 |
gold = row["answer"].strip().upper()
|
| 193 |
ok = out["pred"] == gold
|
| 194 |
correct += int(ok)
|
| 195 |
results.append({
|
| 196 |
-
"index":
|
| 197 |
"videoID": row["videoID"],
|
| 198 |
"task_type": row.get("task_type", ""),
|
| 199 |
"gold": gold,
|
|
@@ -205,7 +234,8 @@ def main() -> int:
|
|
| 205 |
"correct": ok,
|
| 206 |
})
|
| 207 |
run = correct / (i + 1)
|
| 208 |
-
print(f"[eval] [{
|
|
|
|
| 209 |
f"acc_so_far={run:.3f} clip={out['latency_clip_s']}s "
|
| 210 |
f"gen={out['latency_gen_s']}s", flush=True)
|
| 211 |
|
|
@@ -216,6 +246,8 @@ def main() -> int:
|
|
| 216 |
"clip_model": args.clip_model,
|
| 217 |
"mode": args.mode,
|
| 218 |
"tag": args.tag,
|
|
|
|
|
|
|
| 219 |
"n_questions": n,
|
| 220 |
"n_frames": args.n_frames,
|
| 221 |
"n_candidates": args.n_candidates,
|
|
|
|
| 103 |
return video_dir
|
| 104 |
|
| 105 |
|
| 106 |
+
def load_questions(pq_path: Path, video_dir: Path, limit: int,
|
| 107 |
+
start_idx: int = 0) -> pd.DataFrame:
|
| 108 |
+
"""Load questions filtered to videos on disk.
|
| 109 |
+
|
| 110 |
+
``start_idx`` skips the first N rows after the videoID filter, which
|
| 111 |
+
is useful for chunked / resumable evaluation when the underlying
|
| 112 |
+
accelerator (e.g. Apple MPS) corrupts state on long runs.
|
| 113 |
+
"""
|
| 114 |
df = pd.read_parquet(pq_path)
|
| 115 |
ids = {p.stem for p in video_dir.glob("*.mp4")}
|
| 116 |
df = df[df["videoID"].isin(ids)].reset_index(drop=True)
|
| 117 |
+
total_avail = len(df)
|
| 118 |
+
if start_idx > 0:
|
| 119 |
+
df = df.iloc[start_idx:].reset_index(drop=True)
|
| 120 |
if limit > 0 and len(df) > limit:
|
| 121 |
df = df.iloc[:limit].copy()
|
| 122 |
+
print(f"[eval] using {len(df)} questions "
|
| 123 |
+
f"(start_idx={start_idx}, total_available={total_avail})")
|
| 124 |
return df
|
| 125 |
|
| 126 |
|
|
|
|
| 154 |
"'wild' = deprecated alias for 'mcq'; "
|
| 155 |
"'stock-uniform' = stock baseline (uniform 8 frames)")
|
| 156 |
ap.add_argument("--tag", default="")
|
| 157 |
+
ap.add_argument("--n-questions", type=int, default=50,
|
| 158 |
+
help="number of questions to score in this run (after start-idx)")
|
| 159 |
+
ap.add_argument("--start-idx", type=int, default=0,
|
| 160 |
+
help="skip the first N filtered questions; useful for "
|
| 161 |
+
"chunked / resumable evaluation when the accelerator "
|
| 162 |
+
"(e.g. Apple MPS) corrupts state on long runs")
|
| 163 |
ap.add_argument("--n-frames", type=int, default=8)
|
| 164 |
ap.add_argument("--n-candidates", type=int, default=32)
|
| 165 |
ap.add_argument("--max-pixels", type=int, default=262144)
|
|
|
|
| 174 |
|
| 175 |
pq_path, zip_paths = download_assets(args.chunks)
|
| 176 |
video_dir = unzip_chunks(zip_paths)
|
| 177 |
+
df = load_questions(pq_path, video_dir, args.n_questions,
|
| 178 |
+
start_idx=args.start_idx)
|
| 179 |
|
| 180 |
os.environ.setdefault("PYTORCH_ENABLE_MPS_FALLBACK", "1")
|
| 181 |
|
|
|
|
| 193 |
correct = 0
|
| 194 |
t0 = time.time()
|
| 195 |
for i, row in df.iterrows():
|
| 196 |
+
# Absolute index into the full filtered df (so chunks have unique idx).
|
| 197 |
+
abs_idx = int(i) + args.start_idx
|
| 198 |
video_path = video_dir / f"{row['videoID']}.mp4"
|
| 199 |
|
| 200 |
# MCQ mode = query-aware (task_type=None lets QA path run).
|
|
|
|
| 202 |
# the uniform-fallback path (matches stock 8f
|
| 203 |
# baseline behavior).
|
| 204 |
forced_uniform = (args.mode == "stock-uniform")
|
| 205 |
+
try:
|
| 206 |
+
out = fv.answer_mcq(
|
| 207 |
+
video_path=video_path,
|
| 208 |
+
question=row["question"],
|
| 209 |
+
options=list(row["options"]),
|
| 210 |
+
task_type=("Object Reasoning" if forced_uniform else None),
|
| 211 |
+
)
|
| 212 |
+
except Exception as e:
|
| 213 |
+
# MPS / accelerator state corruption sometimes triggers
|
| 214 |
+
# mid-run on long inference. Save what we have and exit so
|
| 215 |
+
# an outer chunked-runner can pick up from start-idx + i.
|
| 216 |
+
print(f"[eval] FATAL at q {abs_idx}: {type(e).__name__}: {e}",
|
| 217 |
+
flush=True)
|
| 218 |
+
print(f"[eval] saving partial results ({len(results)}) "
|
| 219 |
+
f"and exiting so caller can resume.", flush=True)
|
| 220 |
+
break
|
| 221 |
gold = row["answer"].strip().upper()
|
| 222 |
ok = out["pred"] == gold
|
| 223 |
correct += int(ok)
|
| 224 |
results.append({
|
| 225 |
+
"index": abs_idx,
|
| 226 |
"videoID": row["videoID"],
|
| 227 |
"task_type": row.get("task_type", ""),
|
| 228 |
"gold": gold,
|
|
|
|
| 234 |
"correct": ok,
|
| 235 |
})
|
| 236 |
run = correct / (i + 1)
|
| 237 |
+
print(f"[eval] [{abs_idx+1}/{args.start_idx + len(df)}] "
|
| 238 |
+
f"gold={gold} pred={out['pred']} "
|
| 239 |
f"acc_so_far={run:.3f} clip={out['latency_clip_s']}s "
|
| 240 |
f"gen={out['latency_gen_s']}s", flush=True)
|
| 241 |
|
|
|
|
| 246 |
"clip_model": args.clip_model,
|
| 247 |
"mode": args.mode,
|
| 248 |
"tag": args.tag,
|
| 249 |
+
"start_idx": args.start_idx,
|
| 250 |
+
"n_questions_attempted": len(df),
|
| 251 |
"n_questions": n,
|
| 252 |
"n_frames": args.n_frames,
|
| 253 |
"n_candidates": args.n_candidates,
|