Add --start-idx for chunked evaluation (MPS resilience)

Apple MPS can corrupt accelerator state on long Qwen3-VL inference
runs (roughly 800 or more questions), raising AcceleratorError. Add
--start-idx K so callers can skip the first K filtered questions
(i.e. questions whose video is present on disk), enabling a
chunked-runner pattern: each chunk runs in a fresh Python process
with clean MPS state.

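Also wrap the per-question inference call in try/except: on
AcceleratorError or any other exception raised during inference, stop
at the first failure, save the partial results, and exit gracefully so
that an outer chunked runner can resume from
start-idx + completed_count, where completed_count is the number of
questions scored before the failure.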

Files changed (1) hide show

eval_videomme.py +44 -12

eval_videomme.py CHANGED Viewed

@@ -103,13 +103,24 @@ def unzip_chunks(zip_paths: list[Path]) -> Path:
     return video_dir
-def load_questions(pq_path: Path, video_dir: Path, limit: int) -> pd.DataFrame:
     df = pd.read_parquet(pq_path)
     ids = {p.stem for p in video_dir.glob("*.mp4")}
     df = df[df["videoID"].isin(ids)].reset_index(drop=True)
     if limit > 0 and len(df) > limit:
         df = df.iloc[:limit].copy()
-    print(f"[eval] using {len(df)} questions")
     return df
@@ -143,7 +154,12 @@ def main() -> int:
                          "'wild' = deprecated alias for 'mcq'; "
                          "'stock-uniform' = stock baseline (uniform 8 frames)")
     ap.add_argument("--tag", default="")
-    ap.add_argument("--n-questions", type=int, default=50)
     ap.add_argument("--n-frames", type=int, default=8)
     ap.add_argument("--n-candidates", type=int, default=32)
     ap.add_argument("--max-pixels", type=int, default=262144)
@@ -158,7 +174,8 @@ def main() -> int:
     pq_path, zip_paths = download_assets(args.chunks)
     video_dir = unzip_chunks(zip_paths)
-    df = load_questions(pq_path, video_dir, args.n_questions)
     os.environ.setdefault("PYTORCH_ENABLE_MPS_FALLBACK", "1")
@@ -176,6 +193,8 @@ def main() -> int:
     correct = 0
     t0 = time.time()
     for i, row in df.iterrows():
         video_path = video_dir / f"{row['videoID']}.mp4"
         # MCQ mode = query-aware (task_type=None lets QA path run).
@@ -183,17 +202,27 @@ def main() -> int:
         #                 the uniform-fallback path (matches stock 8f
         #                 baseline behavior).
         forced_uniform = (args.mode == "stock-uniform")
-        out = fv.answer_mcq(
-            video_path=video_path,
-            question=row["question"],
-            options=list(row["options"]),
-            task_type=("Object Reasoning" if forced_uniform else None),
-        )
         gold = row["answer"].strip().upper()
         ok = out["pred"] == gold
         correct += int(ok)
         results.append({
-            "index": int(i),
             "videoID": row["videoID"],
             "task_type": row.get("task_type", ""),
             "gold": gold,
@@ -205,7 +234,8 @@ def main() -> int:
             "correct": ok,
         })
         run = correct / (i + 1)
-        print(f"[eval] [{i+1}/{len(df)}] gold={gold} pred={out['pred']} "
               f"acc_so_far={run:.3f} clip={out['latency_clip_s']}s "
               f"gen={out['latency_gen_s']}s", flush=True)
@@ -216,6 +246,8 @@ def main() -> int:
         "clip_model": args.clip_model,
         "mode": args.mode,
         "tag": args.tag,
         "n_questions": n,
         "n_frames": args.n_frames,
         "n_candidates": args.n_candidates,

     return video_dir
+def load_questions(pq_path: Path, video_dir: Path, limit: int,
+                   start_idx: int = 0) -> pd.DataFrame:
+    """Load questions filtered to videos on disk.
+    ``start_idx`` skips the first N rows after the videoID filter, which
+    is useful for chunked / resumable evaluation when the underlying
+    accelerator (e.g. Apple MPS) corrupts state on long runs.
+    """
     df = pd.read_parquet(pq_path)
     ids = {p.stem for p in video_dir.glob("*.mp4")}
     df = df[df["videoID"].isin(ids)].reset_index(drop=True)
+    total_avail = len(df)
+    if start_idx > 0:
+        df = df.iloc[start_idx:].reset_index(drop=True)
     if limit > 0 and len(df) > limit:
         df = df.iloc[:limit].copy()
+    print(f"[eval] using {len(df)} questions "
+          f"(start_idx={start_idx}, total_available={total_avail})")
     return df
                          "'wild' = deprecated alias for 'mcq'; "
                          "'stock-uniform' = stock baseline (uniform 8 frames)")
     ap.add_argument("--tag", default="")
+    ap.add_argument("--n-questions", type=int, default=50,
+                    help="number of questions to score in this run (after start-idx)")
+    ap.add_argument("--start-idx", type=int, default=0,
+                    help="skip the first N filtered questions; useful for "
+                         "chunked / resumable evaluation when the accelerator "
+                         "(e.g. Apple MPS) corrupts state on long runs")
     ap.add_argument("--n-frames", type=int, default=8)
     ap.add_argument("--n-candidates", type=int, default=32)
     ap.add_argument("--max-pixels", type=int, default=262144)
     pq_path, zip_paths = download_assets(args.chunks)
     video_dir = unzip_chunks(zip_paths)
+    df = load_questions(pq_path, video_dir, args.n_questions,
+                        start_idx=args.start_idx)
     os.environ.setdefault("PYTORCH_ENABLE_MPS_FALLBACK", "1")
     correct = 0
     t0 = time.time()
     for i, row in df.iterrows():
+        # Absolute index into the full filtered df (so chunks have unique idx).
+        abs_idx = int(i) + args.start_idx
         video_path = video_dir / f"{row['videoID']}.mp4"
         # MCQ mode = query-aware (task_type=None lets QA path run).
         #                 the uniform-fallback path (matches stock 8f
         #                 baseline behavior).
         forced_uniform = (args.mode == "stock-uniform")
+        try:
+            out = fv.answer_mcq(
+                video_path=video_path,
+                question=row["question"],
+                options=list(row["options"]),
+                task_type=("Object Reasoning" if forced_uniform else None),
+            )
+        except Exception as e:
+            # MPS / accelerator state corruption sometimes triggers
+            # mid-run on long inference. Save what we have and exit so
+            # an outer chunked-runner can pick up from start-idx + i.
+            print(f"[eval] FATAL at q {abs_idx}: {type(e).__name__}: {e}",
+                  flush=True)
+            print(f"[eval] saving partial results ({len(results)}) "
+                  f"and exiting so caller can resume.", flush=True)
+            break
         gold = row["answer"].strip().upper()
         ok = out["pred"] == gold
         correct += int(ok)
         results.append({
+            "index": abs_idx,
             "videoID": row["videoID"],
             "task_type": row.get("task_type", ""),
             "gold": gold,
             "correct": ok,
         })
         run = correct / (i + 1)
+        print(f"[eval] [{abs_idx+1}/{args.start_idx + len(df)}] "
+              f"gold={gold} pred={out['pred']} "
               f"acc_so_far={run:.3f} clip={out['latency_clip_s']}s "
               f"gen={out['latency_gen_s']}s", flush=True)
         "clip_model": args.clip_model,
         "mode": args.mode,
         "tag": args.tag,
+        "start_idx": args.start_idx,
+        "n_questions_attempted": len(df),
         "n_questions": n,
         "n_frames": args.n_frames,
         "n_candidates": args.n_candidates,