#!/usr/bin/env python3
"""
Select 2 good examples per task (16 total) from the test set,
copy frames, and build examples.json for the Gradio demo.

Run once locally before uploading to HF Space.
"""

import json
import os
import random
import shutil
from pathlib import Path

# Fixed seed so any random choice made by this script is reproducible.
random.seed(42)

# Input files (absolute paths on the machine the script is run on).
TEST_DATA = "/root/code/qa_instances/qwen3vl_data_test_03_17.json"
INFER_DATA = "/root/code/LlamaFactory/medical_finetune/eval/results/qwen3_5vl_sft_inference_final_0321_0911.json"

# Output directory, created next to this script.
OUTPUT_DIR = Path(__file__).parent / "examples"

MAX_FRAMES = 16  # subsample frames for demo (keep it lightweight)

# Display name of each demo task -> identifiers grouped under that task.
# NOTE(review): presumably these identifiers match a task field in the
# test-set records; confirm against the code that consumes this mapping.
TASK_GROUPS = {
    "Temporal Action Localization": ["tal"],
    "Spatiotemporal Grounding": ["stg"],
    "Dense Captioning": ["dense_captioning_gpt", "dense_captioning_gemini"],
    "Next Action Prediction": ["next_action"],
    "Video Summary": ["video_summary_gpt", "video_summary_gemini"],
    "Region Caption": ["region_caption_gpt", "region_caption_gemini"],
    "CVS Assessment": ["cvs_assessment"],
    "Skill Assessment": ["skill_assessment"],
}

EXAMPLES_PER_TASK = 2


def subsample_frames(frame_paths: list[str], max_frames: int) -> list[str]:
    """Uniformly subsample at most ``max_frames`` entries from ``frame_paths``.

    Args:
        frame_paths: Frame paths in their original order.
        max_frames: Upper bound on the number of frames to keep.

    Returns:
        An empty list when ``max_frames`` is zero or negative. The input list
        itself (not a copy) when it already has at most ``max_frames``
        entries. Otherwise a new list of exactly ``max_frames`` entries taken
        at evenly spaced positions, always starting with the first frame.
    """
    # Guard first: a zero budget would otherwise divide by zero below.
    if max_frames <= 0:
        return []

    total = len(frame_paths)
    if total <= max_frames:
        return frame_paths

    # Exact integer form of floor(i * total / max_frames); avoids float
    # rounding. The largest index is (max_frames - 1) * total // max_frames,
    # which is always < total.
    return [frame_paths[i * total // max_frames] for i in range(max_frames)]


# NOTE(review): `extract_qa` is cut off in the text this module was restored
# from, so the fragment is preserved verbatim below (still inert, as it was)
# instead of being reconstructed by guesswork.
# def extract_qa(conversations): question, answer = "", "" for msg in conversations: if msg.get("from") in ("human", "user"): question = msg.get("value", "").replace("