commandeaw commited on
Commit
50d1d87
·
verified ·
1 Parent(s): 7df1847

Add --start-idx for chunked evaluation (MPS resilience)

Browse files

Apple MPS can corrupt accelerator state on long Qwen3-VL inference
runs (~800+ questions), producing AcceleratorError. Add --start-idx K
so callers can skip the first K filtered questions, enabling a
chunked-runner pattern: each chunk is a fresh Python process with
clean MPS state.

Also wrap the per-question inference in try/except: on
AcceleratorError or any other inference failure, save partial results
and exit gracefully so an outer chunked-runner can resume from
start-idx + completed_count.

Files changed (1) hide show
  1. eval_videomme.py +44 -12
eval_videomme.py CHANGED
@@ -103,13 +103,24 @@ def unzip_chunks(zip_paths: list[Path]) -> Path:
103
  return video_dir
104
 
105
 
106
- def load_questions(pq_path: Path, video_dir: Path, limit: int) -> pd.DataFrame:
 
 
 
 
 
 
 
107
  df = pd.read_parquet(pq_path)
108
  ids = {p.stem for p in video_dir.glob("*.mp4")}
109
  df = df[df["videoID"].isin(ids)].reset_index(drop=True)
 
 
 
110
  if limit > 0 and len(df) > limit:
111
  df = df.iloc[:limit].copy()
112
- print(f"[eval] using {len(df)} questions")
 
113
  return df
114
 
115
 
@@ -143,7 +154,12 @@ def main() -> int:
143
  "'wild' = deprecated alias for 'mcq'; "
144
  "'stock-uniform' = stock baseline (uniform 8 frames)")
145
  ap.add_argument("--tag", default="")
146
- ap.add_argument("--n-questions", type=int, default=50)
 
 
 
 
 
147
  ap.add_argument("--n-frames", type=int, default=8)
148
  ap.add_argument("--n-candidates", type=int, default=32)
149
  ap.add_argument("--max-pixels", type=int, default=262144)
@@ -158,7 +174,8 @@ def main() -> int:
158
 
159
  pq_path, zip_paths = download_assets(args.chunks)
160
  video_dir = unzip_chunks(zip_paths)
161
- df = load_questions(pq_path, video_dir, args.n_questions)
 
162
 
163
  os.environ.setdefault("PYTORCH_ENABLE_MPS_FALLBACK", "1")
164
 
@@ -176,6 +193,8 @@ def main() -> int:
176
  correct = 0
177
  t0 = time.time()
178
  for i, row in df.iterrows():
 
 
179
  video_path = video_dir / f"{row['videoID']}.mp4"
180
 
181
  # MCQ mode = query-aware (task_type=None lets QA path run).
@@ -183,17 +202,27 @@ def main() -> int:
183
  # the uniform-fallback path (matches stock 8f
184
  # baseline behavior).
185
  forced_uniform = (args.mode == "stock-uniform")
186
- out = fv.answer_mcq(
187
- video_path=video_path,
188
- question=row["question"],
189
- options=list(row["options"]),
190
- task_type=("Object Reasoning" if forced_uniform else None),
191
- )
 
 
 
 
 
 
 
 
 
 
192
  gold = row["answer"].strip().upper()
193
  ok = out["pred"] == gold
194
  correct += int(ok)
195
  results.append({
196
- "index": int(i),
197
  "videoID": row["videoID"],
198
  "task_type": row.get("task_type", ""),
199
  "gold": gold,
@@ -205,7 +234,8 @@ def main() -> int:
205
  "correct": ok,
206
  })
207
  run = correct / (i + 1)
208
- print(f"[eval] [{i+1}/{len(df)}] gold={gold} pred={out['pred']} "
 
209
  f"acc_so_far={run:.3f} clip={out['latency_clip_s']}s "
210
  f"gen={out['latency_gen_s']}s", flush=True)
211
 
@@ -216,6 +246,8 @@ def main() -> int:
216
  "clip_model": args.clip_model,
217
  "mode": args.mode,
218
  "tag": args.tag,
 
 
219
  "n_questions": n,
220
  "n_frames": args.n_frames,
221
  "n_candidates": args.n_candidates,
 
103
  return video_dir
104
 
105
 
106
+ def load_questions(pq_path: Path, video_dir: Path, limit: int,
107
+ start_idx: int = 0) -> pd.DataFrame:
108
+ """Load questions filtered to videos on disk.
109
+
110
+ ``start_idx`` skips the first N rows after the videoID filter, which
111
+ is useful for chunked / resumable evaluation when the underlying
112
+ accelerator (e.g. Apple MPS) corrupts state on long runs.
113
+ """
114
  df = pd.read_parquet(pq_path)
115
  ids = {p.stem for p in video_dir.glob("*.mp4")}
116
  df = df[df["videoID"].isin(ids)].reset_index(drop=True)
117
+ total_avail = len(df)
118
+ if start_idx > 0:
119
+ df = df.iloc[start_idx:].reset_index(drop=True)
120
  if limit > 0 and len(df) > limit:
121
  df = df.iloc[:limit].copy()
122
+ print(f"[eval] using {len(df)} questions "
123
+ f"(start_idx={start_idx}, total_available={total_avail})")
124
  return df
125
 
126
 
 
154
  "'wild' = deprecated alias for 'mcq'; "
155
  "'stock-uniform' = stock baseline (uniform 8 frames)")
156
  ap.add_argument("--tag", default="")
157
+ ap.add_argument("--n-questions", type=int, default=50,
158
+ help="number of questions to score in this run (after start-idx)")
159
+ ap.add_argument("--start-idx", type=int, default=0,
160
+ help="skip the first N filtered questions; useful for "
161
+ "chunked / resumable evaluation when the accelerator "
162
+ "(e.g. Apple MPS) corrupts state on long runs")
163
  ap.add_argument("--n-frames", type=int, default=8)
164
  ap.add_argument("--n-candidates", type=int, default=32)
165
  ap.add_argument("--max-pixels", type=int, default=262144)
 
174
 
175
  pq_path, zip_paths = download_assets(args.chunks)
176
  video_dir = unzip_chunks(zip_paths)
177
+ df = load_questions(pq_path, video_dir, args.n_questions,
178
+ start_idx=args.start_idx)
179
 
180
  os.environ.setdefault("PYTORCH_ENABLE_MPS_FALLBACK", "1")
181
 
 
193
  correct = 0
194
  t0 = time.time()
195
  for i, row in df.iterrows():
196
+ # Absolute index into the full filtered df (so chunks have unique idx).
197
+ abs_idx = int(i) + args.start_idx
198
  video_path = video_dir / f"{row['videoID']}.mp4"
199
 
200
  # MCQ mode = query-aware (task_type=None lets QA path run).
 
202
  # the uniform-fallback path (matches stock 8f
203
  # baseline behavior).
204
  forced_uniform = (args.mode == "stock-uniform")
205
+ try:
206
+ out = fv.answer_mcq(
207
+ video_path=video_path,
208
+ question=row["question"],
209
+ options=list(row["options"]),
210
+ task_type=("Object Reasoning" if forced_uniform else None),
211
+ )
212
+ except Exception as e:
213
+ # MPS / accelerator state corruption sometimes triggers
214
+ # mid-run on long inference. Save what we have and exit so
215
+ # an outer chunked-runner can pick up from start-idx + i.
216
+ print(f"[eval] FATAL at q {abs_idx}: {type(e).__name__}: {e}",
217
+ flush=True)
218
+ print(f"[eval] saving partial results ({len(results)}) "
219
+ f"and exiting so caller can resume.", flush=True)
220
+ break
221
  gold = row["answer"].strip().upper()
222
  ok = out["pred"] == gold
223
  correct += int(ok)
224
  results.append({
225
+ "index": abs_idx,
226
  "videoID": row["videoID"],
227
  "task_type": row.get("task_type", ""),
228
  "gold": gold,
 
234
  "correct": ok,
235
  })
236
  run = correct / (i + 1)
237
+ print(f"[eval] [{abs_idx+1}/{args.start_idx + len(df)}] "
238
+ f"gold={gold} pred={out['pred']} "
239
  f"acc_so_far={run:.3f} clip={out['latency_clip_s']}s "
240
  f"gen={out['latency_gen_s']}s", flush=True)
241
 
 
246
  "clip_model": args.clip_model,
247
  "mode": args.mode,
248
  "tag": args.tag,
249
+ "start_idx": args.start_idx,
250
+ "n_questions_attempted": len(df),
251
  "n_questions": n,
252
  "n_frames": args.n_frames,
253
  "n_candidates": args.n_candidates,