| """Dense Video Captioning evaluation using LLM judge + temporal F1. |
| |
| LLM judge uses IoU-matched segment pairs (matching original Qwen2.5-VL/llm_judge/): |
| - Match predicted segments to GT segments at IoU thresholds (0.3, 0.5, 0.7) |
| - Only judge matched pairs individually (not concatenated) |
| - Average across matched pairs, then across thresholds |
| |
| Temporal F1 algorithm matches Qwen2.5-VL/my_eval/eval_dvc.py exactly: |
| - process_raw_output() + flatten_overlapping_segments() for parsing |
| - Frame-based coordinates (multiply by FPS) |
| - Many-to-many threshold matching across IoU (0.3, 0.5, 0.7) |
| - F1 = 2 * mean_precision * mean_recall / (mean_precision + mean_recall) |
| """ |
|
|
| import json |
| import os |
| import re |
| import sys |
| import time |
| import numpy as np |
| from collections import defaultdict |
| from concurrent.futures import ThreadPoolExecutor, as_completed |
| from threading import Lock |
| from eval_caption_llm_judge import ( |
| call_llm_judge_api, BEST5_ASPECTS, OPENAI_AVAILABLE, |
| compute_semantic_similarity_fallback |
| ) |
|
|
|
|
| |
| |
| |
|
|
def zs_parse_multi_segment_annotations(raw_text: str):
    """Extract timestamped captions from loosely formatted, line-oriented text.

    The input is stripped and split on newlines; each line is scanned on its
    own for "<start> - <end> second(s) ... - <caption>" style entries (a few
    markdown/label prefixes such as "**Start Time:**" or "Time Range:" are
    also accepted by the pattern).

    Args:
        raw_text: Raw multi-line string to parse.

    Returns:
        A list of dicts with float "start" and "end" values and a "caption"
        that has surrounding whitespace and trailing periods removed.
    """
    segment_pattern = re.compile(
        r"(?:\*\*Start Time:\*\*|Start\s*\(?Time\)?|Time\s*Range:|Time\s*Interval:|^|\n)\s*(\d+\.?\d*)\s*[-–]\s*(\d+\.?\d*)\s*seconds?.*?(?:\*\*Description:\*\*|-)\s*(.+?)(?=\n\d|$)",
        re.DOTALL,
    )
    return [
        {
            "start": float(begin),
            "end": float(finish),
            "caption": text.strip().rstrip('.'),
        }
        for row in raw_text.strip().split('\n')
        for begin, finish, text in segment_pattern.findall(row)
    ]
|
|
|
|
def process_raw_output(raw_descriptions: str):
    """Parse a model's raw dense-caption text into structured segments.

    The primary format is one or more "<start>-<end> seconds: <caption>"
    entries; a caption runs until the next such entry or the end of the text,
    and embedded newlines are replaced by spaces. Entries repeating an
    already seen (start, end) pair are dropped, keeping the first occurrence.
    When the primary format yields nothing, the text is handed to
    zs_parse_multi_segment_annotations() as a looser fallback.

    Args:
        raw_descriptions: Raw model output. A non-string value (for example
            None coming from a JSON null) or blank text yields an empty list
            instead of raising.

    Returns:
        A list of dicts with float "start" / "end" and a "caption" string.
    """
    # Guard against missing/None answers: re.findall would raise TypeError.
    if not isinstance(raw_descriptions, str) or not raw_descriptions.strip():
        return []

    pattern = r"(\d+(?:\.\d+)?)-(\d+(?:\.\d+)?)\s+seconds?:\s+(.*?)(?=\n\d+(?:\.\d+)?-\d+(?:\.\d+)?\s+seconds?:|\Z)"

    seen = set()
    unique_segments = []
    for start, end, desc in re.findall(pattern, raw_descriptions, re.DOTALL):
        key = (float(start), float(end))
        if key in seen:
            # Keep only the first caption for a repeated time range.
            continue
        seen.add(key)
        unique_segments.append({
            "start": key[0],
            "end": key[1],
            "caption": desc.strip().replace("\n", " "),
        })

    if not unique_segments:
        unique_segments = zs_parse_multi_segment_annotations(raw_descriptions)

    return unique_segments
|
|
|
|
def check_for_overlaps(segments):
    """Find neighbouring segments that overlap in time.

    Segments are ordered by (start, end) and each one is compared with its
    immediate successor only; a pair is reported when the successor starts
    strictly before the predecessor ends.

    Args:
        segments: Iterable of dicts carrying 'start' and 'end'.

    Returns:
        A list of (earlier_segment, later_segment) tuples; empty when no
        adjacent pair overlaps.
    """
    ordered = sorted(segments, key=lambda seg: (seg['start'], seg['end']))
    return [
        (earlier, later)
        for earlier, later in zip(ordered, ordered[1:])
        if later["start"] < earlier["end"]
    ]
|
|
|
|
def flatten_overlapping_segments(segments, caption_strategy="longest"):
    """Cut possibly overlapping segments into disjoint consecutive intervals.

    Every distinct start/end value becomes a boundary. For each pair of
    consecutive boundaries the segments intersecting that interval are
    collected; intervals that no segment covers are skipped.

    Args:
        segments: List of dicts with "start", "end" and "caption".
        caption_strategy: "longest" takes the caption of the covering segment
            with the greatest duration; "first" takes the covering segment
            that appears first in ``segments``.

    Returns:
        A list of {"start", "end", "caption"} dicts in boundary order.

    Raises:
        ValueError: If ``caption_strategy`` is unknown and at least one
            covered interval has to be captioned.
    """
    starts = [seg["start"] for seg in segments]
    ends = [seg["end"] for seg in segments]
    boundaries = sorted(set(starts + ends))

    flattened = []
    for left, right in zip(boundaries, boundaries[1:]):
        active = [
            seg for seg in segments
            if seg["start"] < right and seg["end"] > left
        ]
        if not active:
            continue
        if caption_strategy == "longest":
            chosen = max(active, key=lambda seg: seg["end"] - seg["start"])
        elif caption_strategy == "first":
            chosen = active[0]
        else:
            raise ValueError("Unsupported strategy")
        flattened.append({
            "start": left,
            "end": right,
            "caption": chosen["caption"],
        })
    return flattened
|
|
|
|
def iou(interval_1, interval_2):
    """Return the temporal intersection-over-union of two intervals.

    Each interval is a pair of endpoints given in either order. The union is
    the smaller of the enclosing span and the sum of both lengths, and a
    1e-8 term is added to the denominator so degenerate intervals do not
    divide by zero.

    Args:
        interval_1: Two endpoints of the first interval.
        interval_2: Two endpoints of the second interval.

    Returns:
        The IoU as a float.
    """
    lo_a, hi_a = min(*interval_1), max(*interval_1)
    lo_b, hi_b = min(*interval_2), max(*interval_2)

    overlap = max(0, min(hi_a, hi_b) - max(lo_a, lo_b))
    enclosing_span = max(hi_a, hi_b) - min(lo_a, lo_b)
    summed_lengths = hi_a - lo_a + hi_b - lo_b
    return float(overlap) / (min(enclosing_span, summed_lengths) + 1e-8)
|
|
|
|
def evaluate_detections(predicted_segments, gt_segments, splits,
                        iou_thresholds=(0.3, 0.5, 0.7, 0.9)):
    """Compute per-threshold precision/recall of predicted vs. GT segments.

    Matching is many-to-many: for every (prediction, ground truth) pair whose
    IoU is strictly greater than a threshold, both the prediction and the
    ground-truth segment count as covered at that threshold.

    Args:
        predicted_segments: Array of predicted [start, end] rows; its
            ``shape[0]`` is used as the number of predictions.
        gt_segments: Indexable collection of ground-truth [start, end] rows.
        splits: One split label per ground-truth segment. Any array-like is
            accepted; it is converted with ``np.asarray`` so that a plain
            list selects its segments correctly.
        iou_thresholds: IoU thresholds to evaluate.

    Returns:
        ``(best_precision, best_recall)``: two lists with one value per
        threshold, each the element-wise maximum over all splits. Both are
        empty lists when ``splits`` is empty.
    """
    # With a plain list, `splits == split` is a scalar False, so no ground
    # truth would be selected and the recall below would divide by zero.
    splits = np.asarray(splits)
    predicted_count = predicted_segments.shape[0]

    best_precision = []
    best_recall = []

    for split in np.unique(splits):
        split_idx = np.where(splits == split)[0]
        split_gt_segments = np.array([gt_segments[idx] for idx in split_idx])
        gt_count = split_gt_segments.shape[0]

        covered = {
            str(threshold): {'gt_covered': set(), 'pred_covered': set()}
            for threshold in iou_thresholds
        }
        for idx_g, gt_segment in enumerate(split_gt_segments):
            for idx_p, segment in enumerate(predicted_segments):
                sample_iou = iou(segment, gt_segment)
                for threshold in iou_thresholds:
                    if sample_iou > threshold:
                        covered[str(threshold)]['pred_covered'].add(idx_p)
                        covered[str(threshold)]['gt_covered'].add(idx_g)

        precision = [
            float(len(m['pred_covered'])) / max(float(predicted_count), 1.0)
            for m in covered.values()
        ]
        recall = [
            float(len(m['gt_covered'])) / float(gt_count)
            for m in covered.values()
        ]

        if best_precision:
            best_precision = [max(new, old) for new, old in zip(precision, best_precision)]
            best_recall = [max(new, old) for new, old in zip(recall, best_recall)]
        else:
            best_precision, best_recall = precision, recall

    return best_precision, best_recall
|
|
|
|
def compute_temporal_f1_single(predicted_segments, gt_segments, splits,
                               iou_thresholds=(0.3, 0.5, 0.7)):
    """Score one sample's temporal localisation with threshold-averaged F1.

    Precision and recall are obtained per IoU threshold from
    evaluate_detections(), averaged over the thresholds, and combined into a
    single F1 from those two means.

    Args:
        predicted_segments: Array of predicted [start, end] rows.
        gt_segments: Array of ground-truth [start, end] rows.
        splits: Split label per ground-truth segment.
        iou_thresholds: IoU thresholds to average over.

    Returns:
        Dict with float 'Precision_Mean', 'Recall_Mean' and 'F1_Score'; all
        three are 0.0 when either side has no segments.
    """
    if predicted_segments.shape[0] == 0 or gt_segments.shape[0] == 0:
        return {'Precision_Mean': 0.0, 'Recall_Mean': 0.0, 'F1_Score': 0.0}

    precisions, recalls = evaluate_detections(
        predicted_segments, gt_segments, splits, iou_thresholds
    )
    mean_precision = sum(precisions) / len(precisions)
    mean_recall = sum(recalls) / len(recalls)

    denominator = mean_recall + mean_precision
    if denominator > 0:
        f1 = 2 * mean_recall * mean_precision / denominator
    else:
        f1 = 0.0

    return {
        'Precision_Mean': float(mean_precision),
        'Recall_Mean': float(mean_recall),
        'F1_Score': float(f1),
    }
|
|
|
|
| |
| |
| |
|
|
def group_records_by_dataset(data):
    """Group dense-captioning records by the dataset they belong to.

    Only records whose 'qa_type' (case-insensitive) contains
    'dense_captioning', 'dense_caption' or 'dc' are kept. The dataset name is
    taken from the first present key among 'data_source', 'dataset',
    'dataset_name', then from ``metadata['dataset']``, else 'Unknown'. For
    'Unknown' records with a video id, the name is guessed from the id:
    11 characters including a letter -> "AVOS", contains "_part" ->
    "CoPESD", contains "video" -> "CholecT50".

    Args:
        data: Mapping of arbitrary keys to record dicts.

    Returns:
        A plain dict mapping dataset name to the list of its records.
    """
    dvc_markers = ('dense_captioning', 'dense_caption', 'dc')
    dataset_groups = defaultdict(list)

    for record in data.values():
        # Tolerate an explicit null qa_type instead of failing on .lower().
        qa_type = str(record.get('qa_type') or '').lower()
        if not any(marker in qa_type for marker in dvc_markers):
            continue

        # Tolerate "metadata": null (or a non-dict) instead of failing on .get().
        metadata = record.get('metadata')
        if not isinstance(metadata, dict):
            metadata = {}

        for field in ('data_source', 'dataset', 'dataset_name'):
            if field in record:
                dataset = record[field]
                break
        else:
            dataset = metadata.get('dataset', 'Unknown')

        video_id = record.get('video_id', metadata.get('video_id', ''))

        if dataset == 'Unknown' and video_id:
            # Stringify once: ids may be numeric, and len()/iteration on an
            # int would raise TypeError.
            video_id_str = str(video_id)
            video_id_lower = video_id_str.lower()
            if len(video_id_str) == 11 and any(c.isalpha() for c in video_id_str):
                dataset = "AVOS"
            elif "_part" in video_id_lower:
                dataset = "CoPESD"
            elif "video" in video_id_lower:
                dataset = "CholecT50"

        dataset_groups[dataset].append(record)

    return dict(dataset_groups)
|
|
|
|
def _extract_gt_segments(record):
    """Pull the ground-truth segment list out of a record's 'struc_info'.

    Three layouts are recognised for a non-empty list:
      * first element is a list            -> that inner list is returned
      * first element is a dict holding
        'dc_segments'                      -> the 'dc_segments' value
      * anything else                      -> the list itself
    A missing key yields [], and a non-list or empty value is returned
    unchanged.
    """
    struc_info = record.get('struc_info', [])

    if not isinstance(struc_info, list) or not struc_info:
        return struc_info

    head = struc_info[0]
    if isinstance(head, list):
        return head
    if isinstance(head, dict) and 'dc_segments' in head:
        return head['dc_segments']
    return struc_info
|
|
|
|
# IoU thresholds at which predicted segments are paired with ground truth
# before the matched captions are sent to the LLM judge.
DVC_IOU_THRESHOLDS = [0.3, 0.5, 0.7]

# Size of the thread pool used for the LLM-judge calls.
DVC_MAX_WORKERS = 20

# Progress counters for the judge phase. They are updated from worker
# threads while holding the lock below.
_dvc_progress_lock = Lock()
_dvc_completed = _dvc_total = 0
|
|
|
|
def _segment_iou(seg1, seg2):
    """Return the IoU of two temporal segments given as 'start'/'end' dicts.

    The result is 0.0 when the union has no positive length.
    """
    overlap_start = max(seg1['start'], seg2['start'])
    overlap_end = min(seg1['end'], seg2['end'])
    intersection = max(0, overlap_end - overlap_start)

    union = (seg1['end'] - seg1['start']) + (seg2['end'] - seg2['start']) - intersection
    if union > 0:
        return intersection / union
    return 0.0
|
|
|
|
def _match_captions_at_threshold(pred_segments, gt_segments, threshold):
    """Pair each predicted segment with its best-overlapping GT segment.

    For every prediction the ground-truth segment with the highest IoU is
    chosen, provided that IoU is at least ``threshold`` and greater than
    zero; on equal IoU the earlier ground-truth segment wins. Predictions
    without such a partner are left out. A ground-truth segment may be
    paired with several predictions.

    Args:
        pred_segments: Dicts with 'start', 'end' and 'caption'.
        gt_segments: Dicts with 'start', 'end' and 'caption'.
        threshold: Minimum IoU for a pair to count as a match.

    Returns:
        List of (pred_caption, gt_caption) tuples in prediction order.
    """
    pairs = []
    for prediction in pred_segments:
        top_iou = 0.0
        top_caption = None
        for reference in gt_segments:
            overlap = _segment_iou(prediction, reference)
            if overlap >= threshold and overlap > top_iou:
                top_iou, top_caption = overlap, reference['caption']
        if top_caption is None:
            continue
        pairs.append((prediction['caption'], top_caption))
    return pairs
|
|
|
|
def _evaluate_dvc_caption_iou_matched(records, api_key):
    """Evaluate DVC captions using IoU-matched segment pairs + LLM judge.

    Steps:
      1. Parse each record's 'answer' into predicted segments and read its
         ground-truth segments from 'struc_info'.
      2. Match predictions to ground truth at every threshold in
         DVC_IOU_THRESHOLDS.
      3. Send every matched (prediction, ground truth) caption pair to the
         LLM judge, using a thread pool of DVC_MAX_WORKERS workers.
      4. Average each aspect over the pairs of a threshold, average the
         aspects whose mean is above zero, then average the thresholds whose
         score is above zero.

    Args:
        records: Iterable of record dicts with 'answer' and 'struc_info'.
        api_key: API key forwarded to call_llm_judge_api().

    Returns:
        ``(overall_score, 'llm_judge_iou_matched', success_rate)`` where
        ``success_rate`` is the fraction of judged pairs whose result
        reported 'api_success'. ``(0.0, 'llm_judge_iou_matched', 0.0)`` is
        returned when nothing could be matched.
    """
    global _dvc_completed, _dvc_total

    print(f" Phase 1: Matching segments at IoU thresholds {DVC_IOU_THRESHOLDS}...")
    all_matched = []

    for record in records:
        pred_segments = process_raw_output(record.get('answer', ''))
        gt_segments = _extract_gt_segments(record)

        if not isinstance(gt_segments, list):
            continue

        # Coerce GT timestamps with float(), as the temporal-F1 path does, so
        # that timestamps stored as strings do not break the IoU arithmetic.
        gt_segs = [
            {'start': float(g['start']), 'end': float(g['end']), 'caption': g['caption']}
            for g in gt_segments
            if isinstance(g, dict) and 'start' in g and 'end' in g and 'caption' in g
        ]

        if not pred_segments or not gt_segs:
            continue

        all_matched.append({
            threshold: _match_captions_at_threshold(pred_segments, gt_segs, threshold)
            for threshold in DVC_IOU_THRESHOLDS
        })

    total_pairs = sum(sum(len(pairs) for pairs in m.values()) for m in all_matched)
    print(f" ✓ Matched {len(all_matched)} samples, {total_pairs} total pairs across all thresholds")

    if total_pairs == 0:
        return 0.0, 'llm_judge_iou_matched', 0.0

    # Reset the shared progress counters for this run.
    _dvc_total = total_pairs
    _dvc_completed = 0

    print(f" Phase 2: Evaluating {total_pairs} pairs with LLM Judge ({DVC_MAX_WORKERS} workers)...")

    # One task per matched pair; a pair matched at several thresholds is
    # judged once per threshold.
    tasks = [
        (threshold, pred_cap, gt_cap)
        for matched_pairs in all_matched
        for threshold in DVC_IOU_THRESHOLDS
        for pred_cap, gt_cap in matched_pairs[threshold]
    ]

    threshold_scores = {t: {aspect: [] for aspect in BEST5_ASPECTS} for t in DVC_IOU_THRESHOLDS}
    api_successes = 0

    def _judge_pair(pred_cap, gt_cap):
        global _dvc_completed
        result = call_llm_judge_api(pred_cap, gt_cap, 'dense_captioning', api_key)
        with _dvc_progress_lock:
            _dvc_completed += 1
            if _dvc_completed % 50 == 0:
                print(f" Progress: {_dvc_completed}/{_dvc_total} API calls completed")
        return result

    with ThreadPoolExecutor(max_workers=DVC_MAX_WORKERS) as executor:
        future_to_threshold = {
            executor.submit(_judge_pair, pred_cap, gt_cap): threshold
            for threshold, pred_cap, gt_cap in tasks
        }

        for future in as_completed(future_to_threshold):
            threshold = future_to_threshold[future]
            try:
                result = future.result()
                if result.get('api_success', False):
                    # Read every aspect before storing any of them, so a
                    # result missing one aspect cannot leave the per-aspect
                    # lists with different lengths.
                    aspect_values = [result[aspect] for aspect in BEST5_ASPECTS]
                    for aspect, value in zip(BEST5_ASPECTS, aspect_values):
                        threshold_scores[threshold][aspect].append(value)
                    api_successes += 1
            except Exception as e:
                # Best effort: one failed pair must not abort the whole run.
                print(f" ⚠ Error: {e}")

    # Per threshold: mean of each aspect, then mean over aspects with a
    # positive mean (aspects without usable scores are left out).
    per_threshold_avg = {}
    for threshold in DVC_IOU_THRESHOLDS:
        aspect_avgs = {}
        for aspect in BEST5_ASPECTS:
            scores = threshold_scores[threshold][aspect]
            aspect_avgs[aspect] = np.mean(scores) if scores else 0.0
        valid = [v for v in aspect_avgs.values() if v > 0]
        per_threshold_avg[threshold] = np.mean(valid) if valid else 0.0

    # Overall: mean over thresholds with a positive score.
    valid_thresholds = [v for v in per_threshold_avg.values() if v > 0]
    overall_score = np.mean(valid_thresholds) if valid_thresholds else 0.0
    success_rate = api_successes / total_pairs if total_pairs > 0 else 0.0

    print(f" ✓ LLM Judge completed: {api_successes}/{total_pairs} successful")
    for t in DVC_IOU_THRESHOLDS:
        print(f" IoU@{t}: {per_threshold_avg[t]:.3f}")
    print(f" Overall (threshold-averaged): {overall_score:.3f}")

    return overall_score, 'llm_judge_iou_matched', success_rate
|
|
|
|
def evaluate_dataset_dvc(dataset_name, records, skip_llm_judge=False):
    """Evaluate DVC for a specific dataset using caption quality + temporal F1.

    Caption quality comes from the IoU-matched LLM judge when an
    OPENAI_API_KEY is set and the OpenAI client is available, otherwise from
    the semantic-similarity fallback; it is skipped entirely (score 0.0,
    method 'skipped') when ``skip_llm_judge`` is true.

    Temporal F1 is computed per record on frame coordinates (seconds
    multiplied by the record's fps, truncated to int) at IoU thresholds
    0.3 / 0.5 / 0.7 and averaged over the records that have both predicted
    and ground-truth segments.

    Args:
        dataset_name: Name used in progress output.
        records: List of record dicts for this dataset.
        skip_llm_judge: When true, only temporal metrics are computed.

    Returns:
        ``{'overall': {...}}`` with 'caption_score', 'caption_method',
        'temporal_f1', 'temporal_precision', 'temporal_recall', 'count'
        (number of records) and 'f1_samples' (records contributing to F1).
    """
    print(f"\nEvaluating {dataset_name} ({len(records)} records)...")

    # --- Caption quality -------------------------------------------------
    if skip_llm_judge:
        print(f" Skipping LLM judge caption evaluation (--skip-llm-judge flag)")
        caption_score = 0.0
        caption_method = 'skipped'
    else:
        api_key = os.getenv('OPENAI_API_KEY')
        if api_key and OPENAI_AVAILABLE:
            caption_score, caption_method, _ = _evaluate_dvc_caption_iou_matched(records, api_key)
        else:
            print(f" ⚠ No API key, using semantic similarity fallback")
            # The fallback takes the records as a dict keyed by index; no
            # file on disk is involved.
            keyed_records = {str(i): record for i, record in enumerate(records)}
            caption_score = compute_semantic_similarity_fallback(keyed_records, 'dense_captioning')
            caption_method = 'semantic_similarity'

    # --- Temporal F1 -----------------------------------------------------
    all_f1_scores = []
    all_precision_scores = []
    all_recall_scores = []

    for record in records:
        # fps may live at the top level or under metadata, may be a string,
        # and may be null; fall back to 1.0 when it is absent or null.
        metadata = record.get('metadata')
        if not isinstance(metadata, dict):
            metadata = {}
        fps = record.get('fps', metadata.get('fps', 1.0))
        if fps is None:
            fps = 1.0
        elif isinstance(fps, str):
            fps = float(fps)

        processed_answer = process_raw_output(record.get('answer', ''))
        if check_for_overlaps(processed_answer):
            processed_answer = flatten_overlapping_segments(processed_answer, caption_strategy="longest")

        gnd = _extract_gt_segments(record)

        gt_segments = []
        if isinstance(gnd, list):
            gt_segments = [
                [int(float(g['start']) * fps), int(float(g['end']) * fps)]
                for g in gnd
                if isinstance(g, dict) and 'start' in g and 'end' in g and 'caption' in g
            ]

        pred_segments = []
        if isinstance(processed_answer, list):
            pred_segments = [
                [int(p['start'] * fps), int(p['end'] * fps)]
                for p in processed_answer
                if isinstance(p, dict) and 'start' in p and 'end' in p and 'caption' in p
            ]

        # Records lacking either side do not contribute to the averages.
        if not (pred_segments and gt_segments):
            continue

        result = compute_temporal_f1_single(
            np.array(pred_segments),
            np.array(gt_segments),
            np.ones(len(gt_segments), dtype=int),  # single split for all GT
            iou_thresholds=(0.3, 0.5, 0.7),
        )
        all_f1_scores.append(result['F1_Score'])
        all_precision_scores.append(result['Precision_Mean'])
        all_recall_scores.append(result['Recall_Mean'])

    avg_f1 = np.mean(all_f1_scores) if all_f1_scores else 0.0
    avg_precision = np.mean(all_precision_scores) if all_precision_scores else 0.0
    avg_recall = np.mean(all_recall_scores) if all_recall_scores else 0.0

    return {
        'overall': {
            'caption_score': caption_score,
            'caption_method': caption_method,
            'temporal_f1': avg_f1,
            'temporal_precision': avg_precision,
            'temporal_recall': avg_recall,
            'count': len(records),
            'f1_samples': len(all_f1_scores)
        }
    }
|
|
|
|
def main():
    """Main evaluation function for DVC.

    Command line: ``<results_json_file> [--skip-llm-judge]``. The results
    file is the first argument that does not start with ``--``, so the flag
    may appear before or after it. The file is loaded as UTF-8 JSON, its
    records are grouped by dataset, each dataset is evaluated, and a summary
    is printed.

    Returns:
        ``{}`` when the file holds no DVC records; otherwise a dict with
        'per_dataset' (results per dataset), 'caption_score' and
        'temporal_f1' (means over datasets) and 'method' (caption method of
        the first evaluated dataset).

    Raises:
        SystemExit: With status 1 when no results file is given.
    """
    positional_args = [arg for arg in sys.argv[1:] if not arg.startswith('--')]
    if not positional_args:
        print("Usage: python eval_dvc.py <results_json_file> [--skip-llm-judge]")
        print("Example: python eval_dvc.py results/model_results.json")
        print("Example: python eval_dvc.py results/model_results.json --skip-llm-judge")
        sys.exit(1)

    output_file = positional_args[0]
    skip_llm_judge = '--skip-llm-judge' in sys.argv

    print(f"Loading results from: {output_file}")
    if skip_llm_judge:
        print(" --skip-llm-judge flag detected: Skipping caption evaluation, computing temporal F1 only")

    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(output_file, "r", encoding="utf-8") as f:
        infer_output = json.load(f)

    dataset_records = group_records_by_dataset(infer_output)

    print(f"\nFound datasets: {list(dataset_records.keys())}")
    for dataset, records in dataset_records.items():
        print(f" {dataset}: {len(records)} DVC records")

    if not any(dataset_records.values()):
        print("No DVC records found!")
        return {}

    all_results = {}
    for dataset_name, records in dataset_records.items():
        if records:
            all_results[dataset_name] = evaluate_dataset_dvc(
                dataset_name, records, skip_llm_judge=skip_llm_judge
            )

    print(f"\n{'='*80}")
    print("DENSE VIDEO CAPTIONING EVALUATION SUMMARY")
    print(f"{'='*80}")

    all_caption_scores = []
    all_f1_scores = []

    for dataset_name, results in all_results.items():
        if not results:
            continue
        print(f"\n{dataset_name}:")
        for metrics in results.values():
            if not isinstance(metrics, dict):
                continue
            print(f" Caption Score ({metrics.get('caption_method', 'unknown')}): {metrics.get('caption_score', 0):.4f}")
            print(f" Temporal F1: {metrics.get('temporal_f1', 0):.4f}")
            print(f" Temporal Precision: {metrics.get('temporal_precision', 0):.4f}")
            print(f" Temporal Recall: {metrics.get('temporal_recall', 0):.4f}")
            print(f" Total samples: {metrics.get('count', 0)}")
            print(f" F1 computed on: {metrics.get('f1_samples', 0)} samples")

            all_caption_scores.append(metrics.get('caption_score', 0))
            all_f1_scores.append(metrics.get('temporal_f1', 0))

    if all_results:
        first_result = next(iter(all_results.values()))
        method = first_result['overall'].get('caption_method', 'unknown')
    else:
        method = 'unknown'

    return {
        'per_dataset': all_results,
        'caption_score': np.mean(all_caption_scores) if all_caption_scores else 0.0,
        'temporal_f1': np.mean(all_f1_scores) if all_f1_scores else 0.0,
        'method': method
    }


if __name__ == "__main__":
    main()
|
|