"""Dense Video Captioning evaluation using LLM judge + temporal F1. LLM judge uses IoU-matched segment pairs (matching original Qwen2.5-VL/llm_judge/): - Match predicted segments to GT segments at IoU thresholds (0.3, 0.5, 0.7) - Only judge matched pairs individually (not concatenated) - Average across matched pairs, then across thresholds Temporal F1 algorithm matches Qwen2.5-VL/my_eval/eval_dvc.py exactly: - process_raw_output() + flatten_overlapping_segments() for parsing - Frame-based coordinates (multiply by FPS) - Many-to-many threshold matching across IoU (0.3, 0.5, 0.7) - F1 = 2 * mean_precision * mean_recall / (mean_precision + mean_recall) """ import json import os import re import sys import time import numpy as np from collections import defaultdict from concurrent.futures import ThreadPoolExecutor, as_completed from threading import Lock from eval_caption_llm_judge import ( call_llm_judge_api, BEST5_ASPECTS, OPENAI_AVAILABLE, compute_semantic_similarity_fallback ) # ============================================================================= # Ported from Qwen2.5-VL/my_eval_old/eval_dvc.py - exact same algorithms # ============================================================================= def zs_parse_multi_segment_annotations(raw_text: str): """Parse raw multiline string with multiple timestamped captions per line.""" all_segments = [] lines = raw_text.strip().split('\n') for line in lines: matches = re.findall( r"(?:\*\*Start Time:\*\*|Start\s*\(?Time\)?|Time\s*Range:|Time\s*Interval:|^|\n)\s*(\d+\.?\d*)\s*[-–]\s*(\d+\.?\d*)\s*seconds?.*?(?:\*\*Description:\*\*|-)\s*(.+?)(?=\n\d|$)", line, flags=re.DOTALL ) for start, end, caption in matches: all_segments.append({ "start": float(start), "end": float(end), "caption": caption.strip().rstrip('.') }) return all_segments def process_raw_output(raw_descriptions: str): """Process raw frame-wise descriptions into structured segments.""" pattern = r"(\d+(?:\.\d+)?)-(\d+(?:\.\d+)?)\s+seconds?:\s+(.*?)(?=\n\d+(?:\.\d+)?-\d+(?:\.\d+)?\s+seconds?:|\Z)" matches = re.findall(pattern, raw_descriptions, re.DOTALL) segments = [] for start, end, desc in matches: segments.append({ "start": float(start), "end": float(end), "caption": desc.strip().replace("\n", " ") }) # Remove duplicate (start, end) segments seen = set() unique_segments = [] for seg in segments: key = (seg["start"], seg["end"]) if key not in seen: seen.add(key) unique_segments.append(seg) if not unique_segments: unique_segments = zs_parse_multi_segment_annotations(raw_descriptions) return unique_segments def check_for_overlaps(segments): """Check a list of temporal segments for any overlaps.""" sorted_segs = sorted(segments, key=lambda x: (x['start'], x['end'])) overlaps = [] for i in range(len(sorted_segs) - 1): seg1 = sorted_segs[i] seg2 = sorted_segs[i + 1] if seg2["start"] < seg1["end"]: overlaps.append((seg1, seg2)) return overlaps def flatten_overlapping_segments(segments, caption_strategy="longest"): """Split overlapping segments into non-overlapping intervals.""" time_points = sorted(set([s["start"] for s in segments] + [s["end"] for s in segments])) result = [] for i in range(len(time_points) - 1): start = time_points[i] end = time_points[i + 1] overlapping = [] for s in segments: if s["start"] < end and s["end"] > start: overlapping.append(s) if not overlapping: continue if caption_strategy == "longest": selected = max(overlapping, key=lambda x: x["end"] - x["start"]) elif caption_strategy == "first": selected = overlapping[0] else: raise ValueError("Unsupported strategy") result.append({ "start": start, "end": end, "caption": selected["caption"] }) return result def iou(interval_1, interval_2): """Compute IoU between two intervals - matches old eval exactly.""" start_1, end_1 = min(*interval_1), max(*interval_1) start_2, end_2 = min(*interval_2), max(*interval_2) intersection = max(0, min(end_1, end_2) - max(start_1, start_2)) union = min( max(end_1, end_2) - min(start_1, start_2), end_1 - start_1 + end_2 - start_2) result = float(intersection) / (union + 1e-8) return result def evaluate_detections(predicted_segments, gt_segments, splits, iou_thresholds=(0.3, 0.5, 0.7, 0.9)): """Compute P/R between predicted and ground truth segments. Many-to-many matching: any pred-gt pair exceeding threshold counts as covered. """ best_recall = [] best_precision = [] predicted_shape = predicted_segments.shape[0] for split in set(splits): metrics = {} for threshold in iou_thresholds: metrics[str(threshold)] = { 'gt_covered': set(), 'pred_covered': set(), } split_idx = np.where(splits == split)[0] split_gt_segments = np.array([gt_segments[idx] for idx in split_idx]) gt_shape = split_gt_segments.shape[0] for idx_g, gt_segment in enumerate(split_gt_segments): for idx_p, segment in enumerate(predicted_segments): sample_iou = iou(segment, gt_segment) for threshold in iou_thresholds: if sample_iou > threshold: metrics[str(threshold)]['pred_covered'].add(idx_p) metrics[str(threshold)]['gt_covered'].add(idx_g) for threshold, m in metrics.items(): pred_covered = m['pred_covered'] gt_covered = m['gt_covered'] m['precision'] = float(len(pred_covered)) / max(float(predicted_shape), 1.0) m['recall'] = float(len(gt_covered)) / float(gt_shape) precision = [m['precision'] for m in metrics.values()] recall = [m['recall'] for m in metrics.values()] if best_precision: best_precision = [max(precision[i], best_precision[i]) for i in range(len(precision))] best_recall = [max(recall[i], best_recall[i]) for i in range(len(recall))] else: best_precision, best_recall = precision, recall return best_precision, best_recall def compute_temporal_f1_single(predicted_segments, gt_segments, splits, iou_thresholds=(0.3, 0.5, 0.7)): """Compute temporal F1 for a single sample using the old eval algorithm. Returns dict with Precision_Mean, Recall_Mean, F1_Score. """ if predicted_segments.shape[0] == 0 or gt_segments.shape[0] == 0: return {'Precision_Mean': 0.0, 'Recall_Mean': 0.0, 'F1_Score': 0.0} detection_precision, detection_recall = evaluate_detections( predicted_segments, gt_segments, splits, iou_thresholds ) mean_precision = sum(detection_precision) / len(detection_precision) mean_recall = sum(detection_recall) / len(detection_recall) f1 = 2 * mean_recall * mean_precision / (mean_recall + mean_precision) \ if (mean_recall + mean_precision) > 0 else 0.0 return { 'Precision_Mean': float(mean_precision), 'Recall_Mean': float(mean_recall), 'F1_Score': float(f1), } # ============================================================================= # Dataset grouping and evaluation (LlamaFactory specific) # ============================================================================= def group_records_by_dataset(data): """Group DVC records by dataset for per-dataset evaluation.""" dataset_groups = defaultdict(list) for key, record in data.items(): qa_type = record.get('qa_type', '') # Match any dense_captioning variant (dense_captioning, dense_captioning_gpt, dense_captioning_gemini, dc) if not any(x in qa_type.lower() for x in ['dense_captioning', 'dense_caption', 'dc']): continue # Check data_source first (leaderboard format), then fall back to dataset/dataset_name dataset = record.get('data_source', record.get('dataset', record.get('dataset_name', record.get('metadata', {}).get('dataset', 'Unknown')))) video_id = record.get('video_id', record.get('metadata', {}).get('video_id', '')) if dataset == 'Unknown' and video_id: video_id_lower = str(video_id).lower() if len(video_id) == 11 and any(c.isalpha() for c in video_id): dataset = "AVOS" elif "_part" in video_id_lower: dataset = "CoPESD" elif "video" in video_id_lower: dataset = "CholecT50" dataset_groups[dataset].append(record) return dict(dataset_groups) def _extract_gt_segments(record): """Extract ground truth segments from struc_info, matching Qwen2.5-VL logic.""" struc_info = record.get('struc_info', []) if isinstance(struc_info, list) and len(struc_info) > 0: if isinstance(struc_info[0], list): # Format: [[{segments...}]] gnd = struc_info[0] elif isinstance(struc_info[0], dict) and 'dc_segments' in struc_info[0]: # NurViD format: [{'dc_segments': [...]}] gnd = struc_info[0]['dc_segments'] else: # Format: [{segments...}] gnd = struc_info else: gnd = struc_info return gnd DVC_IOU_THRESHOLDS = [0.3, 0.5, 0.7] DVC_MAX_WORKERS = 20 # Thread-safe progress counter for DVC LLM judge _dvc_progress_lock = Lock() _dvc_completed = 0 _dvc_total = 0 def _segment_iou(seg1, seg2): """Compute IoU for two temporal segments (dicts with 'start' and 'end').""" intersection = max(0, min(seg1['end'], seg2['end']) - max(seg1['start'], seg2['start'])) union = (seg1['end'] - seg1['start']) + (seg2['end'] - seg2['start']) - intersection return intersection / union if union > 0 else 0.0 def _match_captions_at_threshold(pred_segments, gt_segments, threshold): """Match predicted to ground truth segments at a specific IoU threshold. Returns list of (pred_caption, gt_caption) pairs. """ matched_pairs = [] for pred_seg in pred_segments: best_iou = 0.0 best_gt_caption = None for gt_seg in gt_segments: current_iou = _segment_iou(pred_seg, gt_seg) if current_iou >= threshold and current_iou > best_iou: best_iou = current_iou best_gt_caption = gt_seg['caption'] if best_gt_caption is not None: matched_pairs.append((pred_seg['caption'], best_gt_caption)) return matched_pairs def _evaluate_dvc_caption_iou_matched(records, api_key): """Evaluate DVC captions using IoU-matched segment pairs + LLM judge. Matches the original Qwen2.5-VL/llm_judge/ approach: 1. Parse pred and GT into segments 2. Match at IoU thresholds (0.3, 0.5, 0.7) 3. Judge each matched pair individually 4. Average across pairs, then across thresholds """ global _dvc_completed, _dvc_total # Phase 1: Match all samples at all thresholds print(f" Phase 1: Matching segments at IoU thresholds {DVC_IOU_THRESHOLDS}...") all_matched = [] for record in records: pred_text = record.get('answer', '') gt_text = record.get('gnd', '') pred_segments = process_raw_output(pred_text) gt_segments = _extract_gt_segments(record) if not isinstance(gt_segments, list): continue # Ensure gt_segments are dicts with caption gt_segs = [g for g in gt_segments if isinstance(g, dict) and 'start' in g and 'end' in g and 'caption' in g] if not pred_segments or not gt_segs: continue matched_pairs = {} for threshold in DVC_IOU_THRESHOLDS: pairs = _match_captions_at_threshold(pred_segments, gt_segs, threshold) matched_pairs[threshold] = pairs all_matched.append(matched_pairs) total_pairs = sum(sum(len(pairs) for pairs in m.values()) for m in all_matched) print(f" ✓ Matched {len(all_matched)} samples, {total_pairs} total pairs across all thresholds") if total_pairs == 0: return 0.0, 'llm_judge_iou_matched', 0.0 # Phase 2: Evaluate all matched pairs in parallel _dvc_total = total_pairs _dvc_completed = 0 print(f" Phase 2: Evaluating {total_pairs} pairs with LLM Judge ({DVC_MAX_WORKERS} workers)...") # Collect all tasks: (sample_idx, threshold, pred_caption, gt_caption) tasks = [] for sample_idx, matched_pairs in enumerate(all_matched): for threshold in DVC_IOU_THRESHOLDS: for pred_cap, gt_cap in matched_pairs[threshold]: tasks.append((sample_idx, threshold, pred_cap, gt_cap)) # Store results per threshold threshold_scores = {t: {aspect: [] for aspect in BEST5_ASPECTS} for t in DVC_IOU_THRESHOLDS} api_successes = 0 def _judge_pair(pred_cap, gt_cap): global _dvc_completed result = call_llm_judge_api(pred_cap, gt_cap, 'dense_captioning', api_key) with _dvc_progress_lock: _dvc_completed += 1 if _dvc_completed % 50 == 0: print(f" Progress: {_dvc_completed}/{_dvc_total} API calls completed") return result with ThreadPoolExecutor(max_workers=DVC_MAX_WORKERS) as executor: future_to_task = { executor.submit(_judge_pair, pred_cap, gt_cap): (sample_idx, threshold) for sample_idx, threshold, pred_cap, gt_cap in tasks } for future in as_completed(future_to_task): _, threshold = future_to_task[future] try: result = future.result() if result.get('api_success', False): for aspect in BEST5_ASPECTS: threshold_scores[threshold][aspect].append(result[aspect]) api_successes += 1 except Exception as e: print(f" ⚠ Error: {e}") # Phase 3: Aggregate — average per threshold, then across thresholds per_threshold_avg = {} for threshold in DVC_IOU_THRESHOLDS: aspect_avgs = {} for aspect in BEST5_ASPECTS: scores = threshold_scores[threshold][aspect] aspect_avgs[aspect] = np.mean(scores) if scores else 0.0 valid = [v for v in aspect_avgs.values() if v > 0] per_threshold_avg[threshold] = np.mean(valid) if valid else 0.0 # Overall: average across thresholds valid_thresholds = [v for v in per_threshold_avg.values() if v > 0] overall_score = np.mean(valid_thresholds) if valid_thresholds else 0.0 success_rate = api_successes / total_pairs if total_pairs > 0 else 0.0 print(f" ✓ LLM Judge completed: {api_successes}/{total_pairs} successful") for t in DVC_IOU_THRESHOLDS: print(f" IoU@{t}: {per_threshold_avg[t]:.3f}") print(f" Overall (threshold-averaged): {overall_score:.3f}") return overall_score, 'llm_judge_iou_matched', success_rate def evaluate_dataset_dvc(dataset_name, records, skip_llm_judge=False): """Evaluate DVC for a specific dataset using caption quality + temporal F1.""" print(f"\nEvaluating {dataset_name} ({len(records)} records)...") # Step 1: Evaluate caption quality using IoU-matched LLM judge if skip_llm_judge: print(f" Skipping LLM judge caption evaluation (--skip-llm-judge flag)") caption_score = 0.0 caption_method = 'skipped' else: api_key = os.getenv('OPENAI_API_KEY') if api_key and OPENAI_AVAILABLE: caption_score, caption_method, _ = _evaluate_dvc_caption_iou_matched(records, api_key) else: print(f" ⚠ No API key, using semantic similarity fallback") import tempfile temp_data = {str(i): record for i, record in enumerate(records)} with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f: json.dump(temp_data, f) temp_file = f.name try: caption_score = compute_semantic_similarity_fallback(temp_data, 'dense_captioning') caption_method = 'semantic_similarity' finally: os.unlink(temp_file) # Step 2: Compute temporal F1 matching Qwen2.5-VL algorithm exactly all_f1_scores = [] all_precision_scores = [] all_recall_scores = [] for record in records: # Get FPS fps = record.get('fps', record.get('metadata', {}).get('fps', 1.0)) if isinstance(fps, str): fps = float(fps) # Parse predicted segments using process_raw_output (same as Qwen2.5-VL) raw_answer = record.get('answer', '') processed_answer = process_raw_output(raw_answer) overlaps = check_for_overlaps(processed_answer) if overlaps: processed_answer = flatten_overlapping_segments(processed_answer, caption_strategy="longest") # Get ground truth segments gnd = _extract_gt_segments(record) # Convert both to frame-based coordinates (multiply by fps, cast to int) # IMPORTANT: require 'caption' field to match Qwen2.5-VL's prepare_eval_arrays gt_segments = [] if isinstance(gnd, list): for g in gnd: if isinstance(g, dict) and 'start' in g and 'end' in g and 'caption' in g: gt_segments.append([int(float(g['start']) * fps), int(float(g['end']) * fps)]) pred_segments = [] if isinstance(processed_answer, list): for p in processed_answer: if isinstance(p, dict) and 'start' in p and 'end' in p and 'caption' in p: pred_segments.append([int(p['start'] * fps), int(p['end'] * fps)]) # Compute F1 using many-to-many matching across IoU thresholds (0.3, 0.5, 0.7) if pred_segments and gt_segments: pred_np = np.array(pred_segments) gt_np = np.array(gt_segments) splits = np.ones(len(gt_segments), dtype=int) result = compute_temporal_f1_single(pred_np, gt_np, splits, iou_thresholds=(0.3, 0.5, 0.7)) all_f1_scores.append(result['F1_Score']) all_precision_scores.append(result['Precision_Mean']) all_recall_scores.append(result['Recall_Mean']) # Aggregate scores avg_f1 = np.mean(all_f1_scores) if all_f1_scores else 0.0 avg_precision = np.mean(all_precision_scores) if all_precision_scores else 0.0 avg_recall = np.mean(all_recall_scores) if all_recall_scores else 0.0 return { 'overall': { 'caption_score': caption_score, 'caption_method': caption_method, 'temporal_f1': avg_f1, 'temporal_precision': avg_precision, 'temporal_recall': avg_recall, 'count': len(records), 'f1_samples': len(all_f1_scores) } } def main(): """Main evaluation function for DVC.""" if len(sys.argv) < 2: print("Usage: python eval_dvc.py [--skip-llm-judge]") print("Example: python eval_dvc.py results/model_results.json") print("Example: python eval_dvc.py results/model_results.json --skip-llm-judge") sys.exit(1) output_file = sys.argv[1] skip_llm_judge = '--skip-llm-judge' in sys.argv print(f"Loading results from: {output_file}") if skip_llm_judge: print(" --skip-llm-judge flag detected: Skipping caption evaluation, computing temporal F1 only") with open(output_file, "r") as f: infer_output = json.load(f) dataset_records = group_records_by_dataset(infer_output) print(f"\nFound datasets: {list(dataset_records.keys())}") for dataset, records in dataset_records.items(): print(f" {dataset}: {len(records)} DVC records") if not any(dataset_records.values()): print("No DVC records found!") return {} all_results = {} for dataset_name, records in dataset_records.items(): if records: results = evaluate_dataset_dvc(dataset_name, records, skip_llm_judge=skip_llm_judge) all_results[dataset_name] = results print(f"\n{'='*80}") print("DENSE VIDEO CAPTIONING EVALUATION SUMMARY") print(f"{'='*80}") all_caption_scores = [] all_f1_scores = [] for dataset_name, results in all_results.items(): if results: print(f"\n{dataset_name}:") for key, metrics in results.items(): if isinstance(metrics, dict): print(f" Caption Score ({metrics.get('caption_method', 'unknown')}): {metrics.get('caption_score', 0):.4f}") print(f" Temporal F1: {metrics.get('temporal_f1', 0):.4f}") print(f" Temporal Precision: {metrics.get('temporal_precision', 0):.4f}") print(f" Temporal Recall: {metrics.get('temporal_recall', 0):.4f}") print(f" Total samples: {metrics.get('count', 0)}") print(f" F1 computed on: {metrics.get('f1_samples', 0)} samples") all_caption_scores.append(metrics.get('caption_score', 0)) all_f1_scores.append(metrics.get('temporal_f1', 0)) return { 'per_dataset': all_results, 'caption_score': np.mean(all_caption_scores) if all_caption_scores else 0.0, 'temporal_f1': np.mean(all_f1_scores) if all_f1_scores else 0.0, 'method': all_results[list(all_results.keys())[0]]['overall'].get('caption_method', 'unknown') if all_results else 'unknown' } if __name__ == "__main__": main()