Spaces:

UII-AI
/

MedVidBench-Leaderboard

Running

MedVidBench-Leaderboard / evaluation /eval_dvc.py

MedGRPO Team

fix issues

e2b1040 about 2 months ago

22.1 kB

	"""Dense Video Captioning evaluation using LLM judge + temporal F1.

	LLM judge uses IoU-matched segment pairs (matching original Qwen2.5-VL/llm_judge/):
	- Match predicted segments to GT segments at IoU thresholds (0.3, 0.5, 0.7)
	- Only judge matched pairs individually (not concatenated)
	- Average across matched pairs, then across thresholds

	Temporal F1 algorithm matches Qwen2.5-VL/my_eval/eval_dvc.py exactly:
	- process_raw_output() + flatten_overlapping_segments() for parsing
	- Frame-based coordinates (multiply by FPS)
	- Many-to-many threshold matching across IoU (0.3, 0.5, 0.7)
	- F1 = 2 * mean_precision * mean_recall / (mean_precision + mean_recall)
	"""

	import json
	import os
	import re
	import sys
	import time
	import numpy as np
	from collections import defaultdict
	from concurrent.futures import ThreadPoolExecutor, as_completed
	from threading import Lock
	from eval_caption_llm_judge import (
	call_llm_judge_api, BEST5_ASPECTS, OPENAI_AVAILABLE,
	compute_semantic_similarity_fallback
	)


	# =============================================================================
	# Ported from Qwen2.5-VL/my_eval_old/eval_dvc.py - exact same algorithms
	# =============================================================================

	def zs_parse_multi_segment_annotations(raw_text: str):
	    """Parse free-form timestamped captions, scanning the text line by line.

	    Fallback parser for output that does not follow the strict
	    ``<start>-<end> seconds: <caption>`` layout. Each line is searched for
	    ``<start> - <end> second(s) ... - <caption>``, optionally introduced by
	    "Start Time", "Time Range:" or "Time Interval:"; "**Description:**" is
	    accepted in place of the dash that precedes the caption.

	    Args:
	        raw_text: Raw model output, possibly spanning several lines.

	    Returns:
	        List of ``{"start": float, "end": float, "caption": str}`` dicts in
	        order of appearance. Captions are stripped of surrounding whitespace
	        and of trailing periods. Empty list when nothing matches or when
	        ``raw_text`` is not a string.
	    """
	    if not isinstance(raw_text, str):
	        return []

	    # NOTE(review): the previous pattern had its alternation bars escaped
	    # ("\|") and its "*" quantifiers missing, so it could never match and this
	    # fallback always returned []. The pattern below reconstructs the apparent
	    # intent -- confirm against the upstream Qwen2.5-VL evaluation script.
	    pattern = re.compile(
	        r"(?:\*\*Start Time:\*\*|Start\s*Time:?|Time\s*Range:|Time\s*Interval:|^|\n)"
	        r"\s*(\d+\.?\d*)\s*[-–]\s*(\d+\.?\d*)\s*seconds?.*?"
	        r"(?:\*\*Description:\*\*|-)\s*(.+?)(?=\n\d|$)",
	        flags=re.DOTALL,
	    )

	    all_segments = []
	    for line in raw_text.strip().split('\n'):
	        for start, end, caption in pattern.findall(line):
	            all_segments.append({
	                "start": float(start),
	                "end": float(end),
	                "caption": caption.strip().rstrip('.'),
	            })
	    return all_segments


	def process_raw_output(raw_descriptions: str):
	    """Parse ``<start>-<end> seconds: <caption>`` text into segment dicts.

	    A caption runs until the next ``<start>-<end> seconds:`` header or the end
	    of the text, so it may span several lines; embedded newlines are replaced
	    by spaces. Only the first segment for each ``(start, end)`` pair is kept.
	    When nothing matches the strict layout, the looser
	    ``zs_parse_multi_segment_annotations`` parser is tried instead.

	    Args:
	        raw_descriptions: Raw model output.

	    Returns:
	        List of ``{"start": float, "end": float, "caption": str}`` dicts in
	        order of appearance. Empty list for non-string input.
	    """
	    if not isinstance(raw_descriptions, str):
	        return []

	    # The lookahead must be a real alternation ("next header" OR end of text).
	    # With the bar escaped it demanded a literal "|" and never matched.
	    pattern = (
	        r"(\d+(?:\.\d+)?)-(\d+(?:\.\d+)?)\s+seconds?:\s+(.*?)"
	        r"(?=\n\d+(?:\.\d+)?-\d+(?:\.\d+)?\s+seconds?:|\Z)"
	    )

	    # Remove duplicate (start, end) segments, keeping the first occurrence.
	    seen = set()
	    unique_segments = []
	    for start, end, desc in re.findall(pattern, raw_descriptions, re.DOTALL):
	        key = (float(start), float(end))
	        if key in seen:
	            continue
	        seen.add(key)
	        unique_segments.append({
	            "start": key[0],
	            "end": key[1],
	            "caption": desc.strip().replace("\n", " "),
	        })

	    if not unique_segments:
	        unique_segments = zs_parse_multi_segment_annotations(raw_descriptions)

	    return unique_segments


	def check_for_overlaps(segments):
	    """Return neighbouring segment pairs that overlap in time.

	    Segments are ordered by ``(start, end)`` and each one is compared with its
	    immediate successor only; a pair overlaps when the later segment starts
	    strictly before the earlier one ends.

	    Returns:
	        List of ``(earlier, later)`` tuples; empty when nothing overlaps.
	    """
	    ordered = sorted(segments, key=lambda seg: (seg['start'], seg['end']))
	    return [
	        (earlier, later)
	        for earlier, later in zip(ordered, ordered[1:])
	        if later["start"] < earlier["end"]
	    ]


	def flatten_overlapping_segments(segments, caption_strategy="longest"):
	    """Cut a set of segments into consecutive, non-overlapping intervals.

	    Every distinct start/end time becomes a boundary. Each interval between
	    two neighbouring boundaries that is covered by at least one input segment
	    is emitted with the caption of one covering segment.

	    Args:
	        segments: Dicts with "start", "end" and "caption".
	        caption_strategy: "longest" takes the caption of the longest covering
	            segment; "first" takes the covering segment that comes first in
	            ``segments``.

	    Returns:
	        List of ``{"start", "end", "caption"}`` dicts ordered by time.

	    Raises:
	        ValueError: Unknown ``caption_strategy`` (raised only once a covered
	            interval is reached).
	    """
	    boundaries = sorted({seg[edge] for seg in segments for edge in ("start", "end")})

	    flattened = []
	    for left, right in zip(boundaries, boundaries[1:]):
	        covering = [
	            seg for seg in segments
	            if seg["start"] < right and seg["end"] > left
	        ]
	        if not covering:
	            # Gap between segments: nothing to emit for this interval.
	            continue

	        if caption_strategy == "longest":
	            chosen = max(covering, key=lambda seg: seg["end"] - seg["start"])
	        elif caption_strategy == "first":
	            chosen = covering[0]
	        else:
	            raise ValueError("Unsupported strategy")

	        flattened.append({"start": left, "end": right, "caption": chosen["caption"]})
	    return flattened


	def iou(interval_1, interval_2):
	    """Return the temporal IoU of two intervals.

	    Each interval is a pair of endpoints in either order. The union is taken
	    as the smaller of the enclosing span and the summed lengths, and a 1e-8
	    term in the denominator keeps the division defined for empty intervals.
	    """
	    lo_a, hi_a = min(interval_1), max(interval_1)
	    lo_b, hi_b = min(interval_2), max(interval_2)

	    overlap = max(0, min(hi_a, hi_b) - max(lo_a, lo_b))
	    enclosing_span = max(hi_a, hi_b) - min(lo_a, lo_b)
	    summed_lengths = hi_a - lo_a + hi_b - lo_b
	    union = min(enclosing_span, summed_lengths)

	    return float(overlap) / (union + 1e-8)


	def evaluate_detections(predicted_segments, gt_segments, splits,
	                        iou_thresholds=(0.3, 0.5, 0.7, 0.9)):
	    """Compute per-threshold precision and recall of predicted segments.

	    Matching is many-to-many: a prediction (or ground-truth segment) counts as
	    covered at a threshold as soon as any pair involving it has an IoU
	    strictly above that threshold.

	    Args:
	        predicted_segments: Array of predicted intervals; ``.shape[0]`` is
	            used as the number of predictions.
	        gt_segments: Ground-truth intervals, indexable by position.
	        splits: Array assigning each ground-truth segment to a split; it is
	            compared element-wise with each split value.
	        iou_thresholds: IoU thresholds to evaluate.

	    Returns:
	        ``(precision, recall)``: two lists with one value per threshold. When
	        there are several splits, the element-wise maximum over splits is
	        returned. Both lists are empty when ``splits`` is empty.
	    """
	    num_predictions = predicted_segments.shape[0]
	    best_precision = []
	    best_recall = []

	    for split in set(splits):
	        member_idx = np.where(splits == split)[0]
	        gt_in_split = np.array([gt_segments[i] for i in member_idx])
	        num_gt = gt_in_split.shape[0]

	        # One pair of "covered" index sets per threshold, keyed by str(threshold).
	        covered = {
	            str(threshold): {'gt_covered': set(), 'pred_covered': set()}
	            for threshold in iou_thresholds
	        }

	        for gt_idx, gt_segment in enumerate(gt_in_split):
	            for pred_idx, segment in enumerate(predicted_segments):
	                overlap = iou(segment, gt_segment)
	                for threshold in iou_thresholds:
	                    if overlap > threshold:
	                        hit = covered[str(threshold)]
	                        hit['pred_covered'].add(pred_idx)
	                        hit['gt_covered'].add(gt_idx)

	        precision = [
	            float(len(hit['pred_covered'])) / max(float(num_predictions), 1.0)
	            for hit in covered.values()
	        ]
	        recall = [
	            float(len(hit['gt_covered'])) / float(num_gt)
	            for hit in covered.values()
	        ]

	        if best_precision:
	            best_precision = [max(new, old) for new, old in zip(precision, best_precision)]
	            best_recall = [max(new, old) for new, old in zip(recall, best_recall)]
	        else:
	            best_precision, best_recall = precision, recall

	    return best_precision, best_recall


	def compute_temporal_f1_single(predicted_segments, gt_segments, splits,
	                               iou_thresholds=(0.3, 0.5, 0.7)):
	    """Compute threshold-averaged temporal precision, recall and F1 for one sample.

	    Precision and recall from ``evaluate_detections`` are averaged over the
	    IoU thresholds, and F1 is the harmonic mean of those two averages.

	    Returns:
	        Dict with float values under 'Precision_Mean', 'Recall_Mean' and
	        'F1_Score'. All three are 0.0 when either segment array is empty.
	    """
	    no_predictions = predicted_segments.shape[0] == 0
	    if no_predictions or gt_segments.shape[0] == 0:
	        return {'Precision_Mean': 0.0, 'Recall_Mean': 0.0, 'F1_Score': 0.0}

	    precisions, recalls = evaluate_detections(
	        predicted_segments, gt_segments, splits, iou_thresholds
	    )
	    mean_precision = sum(precisions) / len(precisions)
	    mean_recall = sum(recalls) / len(recalls)

	    total = mean_recall + mean_precision
	    if total > 0:
	        f1 = 2 * mean_recall * mean_precision / total
	    else:
	        f1 = 0.0

	    return {
	        'Precision_Mean': float(mean_precision),
	        'Recall_Mean': float(mean_recall),
	        'F1_Score': float(f1),
	    }


	# =============================================================================
	# Dataset grouping and evaluation (LlamaFactory specific)
	# =============================================================================

	def group_records_by_dataset(data):
	    """Group dense-video-captioning records by the dataset they belong to.

	    A record is kept when its 'qa_type' contains 'dense_captioning',
	    'dense_caption' or 'dc' (case-insensitive). The dataset name is the first
	    of record['data_source'], record['dataset'], record['dataset_name'] and
	    record['metadata']['dataset'] that is present, else 'Unknown'. For
	    'Unknown' records the name is guessed from the video id.

	    Args:
	        data: Mapping of record key -> record dict.

	    Returns:
	        Plain dict mapping dataset name -> list of records.
	    """
	    dataset_groups = defaultdict(list)

	    for record in data.values():
	        # A null or non-string qa_type simply does not match.
	        qa_type = str(record.get('qa_type') or '').lower()
	        if not any(tag in qa_type for tag in ('dense_captioning', 'dense_caption', 'dc')):
	            continue

	        metadata = record.get('metadata')
	        if not isinstance(metadata, dict):
	            metadata = {}

	        # Check data_source first (leaderboard format), then fall back to
	        # dataset / dataset_name / metadata.dataset.
	        for field in ('data_source', 'dataset', 'dataset_name'):
	            if field in record:
	                dataset = record[field]
	                break
	        else:
	            dataset = metadata.get('dataset', 'Unknown')
	        video_id = record.get('video_id', metadata.get('video_id', ''))

	        if dataset == 'Unknown' and video_id:
	            # Ids may arrive as numbers; run every check on the string form
	            # so len() and character iteration cannot raise TypeError.
	            video_id_text = str(video_id)
	            video_id_lower = video_id_text.lower()
	            if len(video_id_text) == 11 and any(c.isalpha() for c in video_id_text):
	                dataset = "AVOS"
	            elif "_part" in video_id_lower:
	                dataset = "CoPESD"
	            elif "video" in video_id_lower:
	                dataset = "CholecT50"

	        dataset_groups[dataset].append(record)

	    return dict(dataset_groups)


	def _extract_gt_segments(record):
	"""Extract ground truth segments from struc_info, matching Qwen2.5-VL logic."""
	struc_info = record.get('struc_info', [])

	if isinstance(struc_info, list) and len(struc_info) > 0:
	if isinstance(struc_info[0], list):
	# Format: [[{segments...}]]
	gnd = struc_info[0]
	elif isinstance(struc_info[0], dict) and 'dc_segments' in struc_info[0]:
	# NurViD format: [{'dc_segments': [...]}]
	gnd = struc_info[0]['dc_segments']
	else:
	# Format: [{segments...}]
	gnd = struc_info
	else:
	gnd = struc_info

	return gnd


	# IoU thresholds at which predicted segments are matched to ground-truth
	# segments before the matched caption pairs are sent to the LLM judge.
	DVC_IOU_THRESHOLDS = [0.3, 0.5, 0.7]
	# Passed as max_workers to the thread pool that issues the LLM judge calls.
	DVC_MAX_WORKERS = 20

	# Thread-safe progress counter for DVC LLM judge
	# (_dvc_completed is incremented under _dvc_progress_lock by worker threads;
	# both counters are reset at the start of each judge run).
	_dvc_progress_lock = Lock()
	_dvc_completed = 0
	_dvc_total = 0


	def _segment_iou(seg1, seg2):
	"""Compute IoU for two temporal segments (dicts with 'start' and 'end')."""
	intersection = max(0, min(seg1['end'], seg2['end']) - max(seg1['start'], seg2['start']))
	union = (seg1['end'] - seg1['start']) + (seg2['end'] - seg2['start']) - intersection
	return intersection / union if union > 0 else 0.0


	def _match_captions_at_threshold(pred_segments, gt_segments, threshold):
	"""Match predicted to ground truth segments at a specific IoU threshold.

	Returns list of (pred_caption, gt_caption) pairs.
	"""
	matched_pairs = []
	for pred_seg in pred_segments:
	best_iou = 0.0
	best_gt_caption = None
	for gt_seg in gt_segments:
	current_iou = _segment_iou(pred_seg, gt_seg)
	if current_iou >= threshold and current_iou > best_iou:
	best_iou = current_iou
	best_gt_caption = gt_seg['caption']
	if best_gt_caption is not None:
	matched_pairs.append((pred_seg['caption'], best_gt_caption))
	return matched_pairs


	def _evaluate_dvc_caption_iou_matched(records, api_key):
	    """Score DVC captions by judging IoU-matched caption pairs with the LLM judge.

	    Steps:
	    1. Parse each record's 'answer' into predicted segments and read its
	       ground-truth segments from 'struc_info'.
	    2. At every threshold in DVC_IOU_THRESHOLDS, pair each prediction with its
	       best-overlapping ground-truth segment.
	    3. Judge every pair individually, in parallel.
	    4. Average each aspect per threshold, average the non-zero aspect means
	       into a threshold score, then average the non-zero threshold scores.

	    Args:
	        records: Record dicts of one dataset.
	        api_key: Key forwarded to ``call_llm_judge_api``.

	    Returns:
	        ``(overall_score, 'llm_judge_iou_matched', success_rate)`` where
	        ``success_rate`` is the fraction of pairs whose judge call succeeded.
	        ``(0.0, 'llm_judge_iou_matched', 0.0)`` when no pair was matched.

	    Side effects:
	        Prints progress and resets the module-level progress counters
	        ``_dvc_completed`` / ``_dvc_total``.
	    """
	    global _dvc_completed, _dvc_total

	    # Phase 1: Match all samples at all thresholds
	    print(f" Phase 1: Matching segments at IoU thresholds {DVC_IOU_THRESHOLDS}...")
	    all_matched = []

	    for record in records:
	        # A null answer is treated like a missing one.
	        pred_segments = process_raw_output(record.get('answer') or '')
	        gt_segments = _extract_gt_segments(record)

	        if not isinstance(gt_segments, list):
	            continue

	        # Ensure gt_segments are dicts with caption
	        gt_segs = [
	            g for g in gt_segments
	            if isinstance(g, dict) and 'start' in g and 'end' in g and 'caption' in g
	        ]

	        if not pred_segments or not gt_segs:
	            continue

	        all_matched.append({
	            threshold: _match_captions_at_threshold(pred_segments, gt_segs, threshold)
	            for threshold in DVC_IOU_THRESHOLDS
	        })

	    total_pairs = sum(sum(len(pairs) for pairs in m.values()) for m in all_matched)
	    print(f" ✓ Matched {len(all_matched)} samples, {total_pairs} total pairs across all thresholds")

	    if total_pairs == 0:
	        return 0.0, 'llm_judge_iou_matched', 0.0

	    # Phase 2: Evaluate all matched pairs in parallel
	    _dvc_total = total_pairs
	    _dvc_completed = 0

	    print(f" Phase 2: Evaluating {total_pairs} pairs with LLM Judge ({DVC_MAX_WORKERS} workers)...")

	    # Collect all tasks: (threshold, pred_caption, gt_caption)
	    tasks = [
	        (threshold, pred_cap, gt_cap)
	        for matched_pairs in all_matched
	        for threshold in DVC_IOU_THRESHOLDS
	        for pred_cap, gt_cap in matched_pairs[threshold]
	    ]

	    # Store results per threshold
	    threshold_scores = {t: {aspect: [] for aspect in BEST5_ASPECTS} for t in DVC_IOU_THRESHOLDS}
	    api_successes = 0

	    def _judge_pair(pred_cap, gt_cap):
	        global _dvc_completed
	        result = call_llm_judge_api(pred_cap, gt_cap, 'dense_captioning', api_key)
	        with _dvc_progress_lock:
	            _dvc_completed += 1
	            if _dvc_completed % 50 == 0:
	                print(f" Progress: {_dvc_completed}/{_dvc_total} API calls completed")
	        return result

	    with ThreadPoolExecutor(max_workers=DVC_MAX_WORKERS) as executor:
	        future_to_threshold = {
	            executor.submit(_judge_pair, pred_cap, gt_cap): threshold
	            for threshold, pred_cap, gt_cap in tasks
	        }

	        for future in as_completed(future_to_threshold):
	            threshold = future_to_threshold[future]
	            try:
	                result = future.result()
	                if not result.get('api_success', False):
	                    continue
	                # Read every aspect before storing any, so a malformed
	                # response cannot leave the per-aspect lists with
	                # different lengths.
	                aspect_values = [result[aspect] for aspect in BEST5_ASPECTS]
	            except Exception as e:
	                # Best effort: one failed pair must not abort the whole run.
	                print(f" ⚠ Error: {e}")
	                continue

	            for aspect, value in zip(BEST5_ASPECTS, aspect_values):
	                threshold_scores[threshold][aspect].append(value)
	            api_successes += 1

	    # Phase 3: Aggregate — average per threshold, then across thresholds
	    per_threshold_avg = {}
	    for threshold in DVC_IOU_THRESHOLDS:
	        aspect_avgs = {}
	        for aspect in BEST5_ASPECTS:
	            scores = threshold_scores[threshold][aspect]
	            aspect_avgs[aspect] = np.mean(scores) if scores else 0.0
	        # Aspects whose mean is zero are left out of the threshold score.
	        valid = [v for v in aspect_avgs.values() if v > 0]
	        per_threshold_avg[threshold] = np.mean(valid) if valid else 0.0

	    # Overall: average across thresholds (zero-score thresholds are left out)
	    valid_thresholds = [v for v in per_threshold_avg.values() if v > 0]
	    overall_score = np.mean(valid_thresholds) if valid_thresholds else 0.0
	    success_rate = api_successes / total_pairs if total_pairs > 0 else 0.0

	    print(f" ✓ LLM Judge completed: {api_successes}/{total_pairs} successful")
	    for t in DVC_IOU_THRESHOLDS:
	        print(f" IoU@{t}: {per_threshold_avg[t]:.3f}")
	    print(f" Overall (threshold-averaged): {overall_score:.3f}")

	    return overall_score, 'llm_judge_iou_matched', success_rate

	def _dvc_frame_segments(record):
	    """Return (pred, gt) segment lists of one record as [start, end] frame pairs."""
	    metadata = record.get('metadata')
	    if not isinstance(metadata, dict):
	        metadata = {}

	    # Get FPS; an explicit null is treated like a missing value.
	    fps = record.get('fps', metadata.get('fps', 1.0))
	    if fps is None:
	        fps = 1.0
	    elif isinstance(fps, str):
	        fps = float(fps)

	    # Parse predicted segments using process_raw_output (same as Qwen2.5-VL)
	    processed_answer = process_raw_output(record.get('answer') or '')
	    if check_for_overlaps(processed_answer):
	        processed_answer = flatten_overlapping_segments(processed_answer, caption_strategy="longest")

	    # Get ground truth segments
	    gnd = _extract_gt_segments(record)

	    def _is_segment(item):
	        # IMPORTANT: require 'caption' field to match Qwen2.5-VL's prepare_eval_arrays
	        return isinstance(item, dict) and 'start' in item and 'end' in item and 'caption' in item

	    # Convert both to frame-based coordinates (multiply by fps, cast to int)
	    gt_segments = []
	    if isinstance(gnd, list):
	        gt_segments = [
	            [int(float(g['start']) * fps), int(float(g['end']) * fps)]
	            for g in gnd if _is_segment(g)
	        ]

	    pred_segments = []
	    if isinstance(processed_answer, list):
	        pred_segments = [
	            [int(p['start'] * fps), int(p['end'] * fps)]
	            for p in processed_answer if _is_segment(p)
	        ]

	    return pred_segments, gt_segments


	def evaluate_dataset_dvc(dataset_name, records, skip_llm_judge=False):
	    """Evaluate DVC for one dataset: caption quality plus temporal F1.

	    Caption quality comes from the IoU-matched LLM judge when an
	    OPENAI_API_KEY is set and the OpenAI client is available, otherwise from
	    the semantic-similarity fallback; it is skipped entirely when
	    ``skip_llm_judge`` is true. Temporal precision/recall/F1 are computed per
	    record in frame coordinates and averaged over the records that have both
	    predicted and ground-truth segments.

	    Args:
	        dataset_name: Name used in the progress output.
	        records: Record dicts of this dataset.
	        skip_llm_judge: When true, report caption_score 0.0 / method 'skipped'.

	    Returns:
	        ``{'overall': {...}}`` with keys 'caption_score', 'caption_method',
	        'temporal_f1', 'temporal_precision', 'temporal_recall', 'count'
	        (number of records) and 'f1_samples' (records that contributed to F1).
	    """
	    print(f"\nEvaluating {dataset_name} ({len(records)} records)...")

	    # Step 1: Evaluate caption quality using IoU-matched LLM judge
	    if skip_llm_judge:
	        print(" Skipping LLM judge caption evaluation (--skip-llm-judge flag)")
	        caption_score = 0.0
	        caption_method = 'skipped'
	    else:
	        api_key = os.getenv('OPENAI_API_KEY')
	        if api_key and OPENAI_AVAILABLE:
	            caption_score, caption_method, _ = _evaluate_dvc_caption_iou_matched(records, api_key)
	        else:
	            print(" ⚠ No API key, using semantic similarity fallback")
	            # The fallback is given the in-memory mapping directly, so no
	            # temporary JSON file needs to be written and deleted.
	            indexed_records = {str(i): record for i, record in enumerate(records)}
	            caption_score = compute_semantic_similarity_fallback(indexed_records, 'dense_captioning')
	            caption_method = 'semantic_similarity'

	    # Step 2: Compute temporal F1 matching Qwen2.5-VL algorithm exactly
	    all_f1_scores = []
	    all_precision_scores = []
	    all_recall_scores = []

	    for record in records:
	        pred_segments, gt_segments = _dvc_frame_segments(record)

	        # Records without predictions or without ground truth are left out
	        # of the averages rather than counted as zero.
	        if not pred_segments or not gt_segments:
	            continue

	        # Compute F1 using many-to-many matching across IoU thresholds (0.3, 0.5, 0.7)
	        result = compute_temporal_f1_single(
	            np.array(pred_segments),
	            np.array(gt_segments),
	            np.ones(len(gt_segments), dtype=int),
	            iou_thresholds=(0.3, 0.5, 0.7),
	        )
	        all_f1_scores.append(result['F1_Score'])
	        all_precision_scores.append(result['Precision_Mean'])
	        all_recall_scores.append(result['Recall_Mean'])

	    # Aggregate scores
	    avg_f1 = np.mean(all_f1_scores) if all_f1_scores else 0.0
	    avg_precision = np.mean(all_precision_scores) if all_precision_scores else 0.0
	    avg_recall = np.mean(all_recall_scores) if all_recall_scores else 0.0

	    return {
	        'overall': {
	            'caption_score': caption_score,
	            'caption_method': caption_method,
	            'temporal_f1': avg_f1,
	            'temporal_precision': avg_precision,
	            'temporal_recall': avg_recall,
	            'count': len(records),
	            'f1_samples': len(all_f1_scores),
	        }
	    }


	def main():
	    """Run the DVC evaluation on a results JSON file given on the command line.

	    Usage: ``python eval_dvc.py <results_json_file> [--skip-llm-judge]``.
	    Exits with status 1 when no file argument is given.

	    Returns:
	        ``{}`` when the file holds no DVC records; otherwise a dict with
	        'per_dataset' (results per dataset), 'caption_score' and
	        'temporal_f1' (means over datasets) and 'method' (caption method of
	        the first dataset).
	    """
	    if len(sys.argv) < 2:
	        print("Usage: python eval_dvc.py <results_json_file> [--skip-llm-judge]")
	        print("Example: python eval_dvc.py results/model_results.json")
	        print("Example: python eval_dvc.py results/model_results.json --skip-llm-judge")
	        sys.exit(1)

	    output_file = sys.argv[1]
	    skip_llm_judge = '--skip-llm-judge' in sys.argv

	    print(f"Loading results from: {output_file}")
	    if skip_llm_judge:
	        print(" --skip-llm-judge flag detected: Skipping caption evaluation, computing temporal F1 only")

	    # JSON is UTF-8 by specification; do not depend on the locale encoding.
	    with open(output_file, "r", encoding="utf-8") as f:
	        infer_output = json.load(f)

	    dataset_records = group_records_by_dataset(infer_output)

	    print(f"\nFound datasets: {list(dataset_records.keys())}")
	    for dataset, records in dataset_records.items():
	        print(f" {dataset}: {len(records)} DVC records")

	    if not any(dataset_records.values()):
	        print("No DVC records found!")
	        return {}

	    all_results = {}
	    for dataset_name, records in dataset_records.items():
	        if records:
	            all_results[dataset_name] = evaluate_dataset_dvc(
	                dataset_name, records, skip_llm_judge=skip_llm_judge
	            )

	    print(f"\n{'='*80}")
	    print("DENSE VIDEO CAPTIONING EVALUATION SUMMARY")
	    print(f"{'='*80}")

	    all_caption_scores = []
	    all_f1_scores = []

	    for dataset_name, results in all_results.items():
	        if not results:
	            continue
	        print(f"\n{dataset_name}:")
	        for metrics in results.values():
	            if not isinstance(metrics, dict):
	                continue
	            print(f" Caption Score ({metrics.get('caption_method', 'unknown')}): {metrics.get('caption_score', 0):.4f}")
	            print(f" Temporal F1: {metrics.get('temporal_f1', 0):.4f}")
	            print(f" Temporal Precision: {metrics.get('temporal_precision', 0):.4f}")
	            print(f" Temporal Recall: {metrics.get('temporal_recall', 0):.4f}")
	            print(f" Total samples: {metrics.get('count', 0)}")
	            print(f" F1 computed on: {metrics.get('f1_samples', 0)} samples")

	            all_caption_scores.append(metrics.get('caption_score', 0))
	            all_f1_scores.append(metrics.get('temporal_f1', 0))

	    if all_results:
	        first_result = next(iter(all_results.values()))
	        method = first_result['overall'].get('caption_method', 'unknown')
	    else:
	        method = 'unknown'

	    return {
	        'per_dataset': all_results,
	        'caption_score': np.mean(all_caption_scores) if all_caption_scores else 0.0,
	        'temporal_f1': np.mean(all_f1_scores) if all_f1_scores else 0.0,
	        'method': method,
	    }


	# Run the evaluation only when executed as a script; the dict returned by
	# main() is not used here.
	if __name__ == "__main__":
	    main()