[ { "id": 0, "task": "dense_captioning_gemini", "data_source": "AVOS", "question": "You are an expert in interpreting surgical videos. This video clip depicts an open surgery. Construct a compact summary of all occurrences of ['cutting', 'tying', 'suturing']. For each, provide [start, end] and a one-sentence caption.", "frames": [ "live_examples/live_00/frame_000.jpg", "live_examples/live_00/frame_001.jpg", "live_examples/live_00/frame_002.jpg", "live_examples/live_00/frame_003.jpg", "live_examples/live_00/frame_004.jpg", "live_examples/live_00/frame_005.jpg", "live_examples/live_00/frame_006.jpg", "live_examples/live_00/frame_007.jpg", "live_examples/live_00/frame_008.jpg", "live_examples/live_00/frame_009.jpg", "live_examples/live_00/frame_010.jpg", "live_examples/live_00/frame_011.jpg", "live_examples/live_00/frame_012.jpg", "live_examples/live_00/frame_013.jpg", "live_examples/live_00/frame_014.jpg", "live_examples/live_00/frame_015.jpg", "live_examples/live_00/frame_016.jpg", "live_examples/live_00/frame_017.jpg", "live_examples/live_00/frame_018.jpg", "live_examples/live_00/frame_019.jpg", "live_examples/live_00/frame_020.jpg", "live_examples/live_00/frame_021.jpg", "live_examples/live_00/frame_022.jpg", "live_examples/live_00/frame_023.jpg", "live_examples/live_00/frame_024.jpg", "live_examples/live_00/frame_025.jpg", "live_examples/live_00/frame_026.jpg", "live_examples/live_00/frame_027.jpg", "live_examples/live_00/frame_028.jpg", "live_examples/live_00/frame_029.jpg", "live_examples/live_00/frame_030.jpg", "live_examples/live_00/frame_031.jpg", "live_examples/live_00/frame_032.jpg", "live_examples/live_00/frame_033.jpg", "live_examples/live_00/frame_034.jpg", "live_examples/live_00/frame_035.jpg", "live_examples/live_00/frame_036.jpg", "live_examples/live_00/frame_037.jpg", "live_examples/live_00/frame_038.jpg", "live_examples/live_00/frame_039.jpg", "live_examples/live_00/frame_040.jpg", "live_examples/live_00/frame_041.jpg", "live_examples/live_00/frame_042.jpg", "live_examples/live_00/frame_043.jpg", "live_examples/live_00/frame_044.jpg", "live_examples/live_00/frame_045.jpg", "live_examples/live_00/frame_046.jpg", "live_examples/live_00/frame_047.jpg", "live_examples/live_00/frame_048.jpg", "live_examples/live_00/frame_049.jpg", "live_examples/live_00/frame_050.jpg", "live_examples/live_00/frame_051.jpg", "live_examples/live_00/frame_052.jpg" ], "n_frames": 53, "test_idx": 261 }, { "id": 1, "task": "dense_captioning_gpt", "data_source": "CholecT50", "question": "You are an expert in operative video analysis. The following video contains endoscopic footage of a laparoscopic cholecystectomy, where the gallbladder is removed using small incisions and specialized instruments. Detect and caption only clinically relevant actions from ['preparation', 'carlot-triangle-dissection', 'clipping-and-cutting', 'gallbladder-dissection', 'gallbladder-packaging', 'cleaning-and-coagulation', 'gallbladder-extraction']. Return start/end times with a brief, objective description.", "frames": [ "live_examples/live_01/frame_000.jpg", "live_examples/live_01/frame_001.jpg", "live_examples/live_01/frame_002.jpg", "live_examples/live_01/frame_003.jpg", "live_examples/live_01/frame_004.jpg", "live_examples/live_01/frame_005.jpg", "live_examples/live_01/frame_006.jpg", "live_examples/live_01/frame_007.jpg", "live_examples/live_01/frame_008.jpg", "live_examples/live_01/frame_009.jpg", "live_examples/live_01/frame_010.jpg", "live_examples/live_01/frame_011.jpg", "live_examples/live_01/frame_012.jpg", "live_examples/live_01/frame_013.jpg", "live_examples/live_01/frame_014.jpg", "live_examples/live_01/frame_015.jpg", "live_examples/live_01/frame_016.jpg", "live_examples/live_01/frame_017.jpg", "live_examples/live_01/frame_018.jpg", "live_examples/live_01/frame_019.jpg", "live_examples/live_01/frame_020.jpg", "live_examples/live_01/frame_021.jpg", "live_examples/live_01/frame_022.jpg", "live_examples/live_01/frame_023.jpg", "live_examples/live_01/frame_024.jpg", "live_examples/live_01/frame_025.jpg", "live_examples/live_01/frame_026.jpg", "live_examples/live_01/frame_027.jpg", "live_examples/live_01/frame_028.jpg", "live_examples/live_01/frame_029.jpg", "live_examples/live_01/frame_030.jpg", "live_examples/live_01/frame_031.jpg", "live_examples/live_01/frame_032.jpg", "live_examples/live_01/frame_033.jpg", "live_examples/live_01/frame_034.jpg", "live_examples/live_01/frame_035.jpg", "live_examples/live_01/frame_036.jpg", "live_examples/live_01/frame_037.jpg", "live_examples/live_01/frame_038.jpg", "live_examples/live_01/frame_039.jpg", "live_examples/live_01/frame_040.jpg", "live_examples/live_01/frame_041.jpg", "live_examples/live_01/frame_042.jpg", "live_examples/live_01/frame_043.jpg", "live_examples/live_01/frame_044.jpg", "live_examples/live_01/frame_045.jpg", "live_examples/live_01/frame_046.jpg", "live_examples/live_01/frame_047.jpg", "live_examples/live_01/frame_048.jpg", "live_examples/live_01/frame_049.jpg", "live_examples/live_01/frame_050.jpg", "live_examples/live_01/frame_051.jpg", "live_examples/live_01/frame_052.jpg" ], "n_frames": 53, "test_idx": 1371 }, { "id": 2, "task": "cvs_assessment", "data_source": "Cholec80_CVS", "question": "You are an expert surgical video evaluator. This Cholec80-CVS recording is for assessing Strasberg\u2019s three Critical View of Safety criteria. Scoring the CVS requires evaluating three components: (1) Are there exactly two structures entering the gallbladder? (2) Is the cystic plate clearly seen? (3) Is the hepatocystic triangle free of tissue? Use a 0,1,2 scale for each.", "frames": [ "live_examples/live_02/frame_000.jpg", "live_examples/live_02/frame_001.jpg", "live_examples/live_02/frame_002.jpg", "live_examples/live_02/frame_003.jpg", "live_examples/live_02/frame_004.jpg", "live_examples/live_02/frame_005.jpg", "live_examples/live_02/frame_006.jpg", "live_examples/live_02/frame_007.jpg", "live_examples/live_02/frame_008.jpg", "live_examples/live_02/frame_009.jpg", "live_examples/live_02/frame_010.jpg", "live_examples/live_02/frame_011.jpg", "live_examples/live_02/frame_012.jpg", "live_examples/live_02/frame_013.jpg", "live_examples/live_02/frame_014.jpg", "live_examples/live_02/frame_015.jpg", "live_examples/live_02/frame_016.jpg", "live_examples/live_02/frame_017.jpg", "live_examples/live_02/frame_018.jpg", "live_examples/live_02/frame_019.jpg", "live_examples/live_02/frame_020.jpg", "live_examples/live_02/frame_021.jpg", "live_examples/live_02/frame_022.jpg", "live_examples/live_02/frame_023.jpg", "live_examples/live_02/frame_024.jpg", "live_examples/live_02/frame_025.jpg", "live_examples/live_02/frame_026.jpg", "live_examples/live_02/frame_027.jpg", "live_examples/live_02/frame_028.jpg", "live_examples/live_02/frame_029.jpg", "live_examples/live_02/frame_030.jpg", "live_examples/live_02/frame_031.jpg", "live_examples/live_02/frame_032.jpg", "live_examples/live_02/frame_033.jpg", "live_examples/live_02/frame_034.jpg", "live_examples/live_02/frame_035.jpg", "live_examples/live_02/frame_036.jpg", "live_examples/live_02/frame_037.jpg", "live_examples/live_02/frame_038.jpg", "live_examples/live_02/frame_039.jpg", "live_examples/live_02/frame_040.jpg", "live_examples/live_02/frame_041.jpg", "live_examples/live_02/frame_042.jpg", "live_examples/live_02/frame_043.jpg", "live_examples/live_02/frame_044.jpg", "live_examples/live_02/frame_045.jpg", "live_examples/live_02/frame_046.jpg", "live_examples/live_02/frame_047.jpg", "live_examples/live_02/frame_048.jpg", "live_examples/live_02/frame_049.jpg", "live_examples/live_02/frame_050.jpg", "live_examples/live_02/frame_051.jpg", "live_examples/live_02/frame_052.jpg", "live_examples/live_02/frame_053.jpg", "live_examples/live_02/frame_054.jpg", "live_examples/live_02/frame_055.jpg", "live_examples/live_02/frame_056.jpg", "live_examples/live_02/frame_057.jpg", "live_examples/live_02/frame_058.jpg", "live_examples/live_02/frame_059.jpg" ], "n_frames": 60, "test_idx": 494 } ]