Spaces:

andytaylor-smg
/

cfb40

Sleeping

App Files Files Community

cfb40 / scripts /test_full_video_evaluation.py

andytaylor-smg

moving stuff all around

6c65498 3 months ago

raw

history blame contribute delete

11.6 kB

	#!/usr/bin/env python3
	"""
	Full video evaluation using template-based play clock reading.

	This test:
	1. Loads pre-built digit templates
	2. Runs play detection on the full video using template matching
	3. Compares detected plays against v3 baseline
	4. Reports accuracy metrics

	The goal is to verify that template-based clock reading matches or exceeds
	OCR-based detection quality while being significantly faster.

	Usage:
	cd /Users/andytaylor/Documents/Personal/cfb40
	source .venv/bin/activate
	python scripts/test_full_video_evaluation.py
	"""

	import json
	import logging
	import sys
	import time
	from pathlib import Path

	from pipeline.play_detector import DetectionConfig, PlayDetector
	from setup import DigitTemplateLibrary
	from detection import TrackTimeouts

	# Configure root logging once at import time: INFO level, with a timestamp
	# and the level name prefixed to every message.
	logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
	logger = logging.getLogger(__name__)

	# Configuration
	# All paths below are relative, so the script must be launched from the
	# directory they resolve against (see the Usage section of the module docstring).
	VIDEO_PATH = "full_videos/OSU vs Tenn 12.21.24.mkv"
	TEMPLATE_PATH = "output/OSU_vs_Tenn_12_21_24_template.png"
	PLAYCLOCK_CONFIG_PATH = "output/OSU_vs_Tenn_12_21_24_playclock_config.json"
	# Optional input: the timeout tracker is only created when this file exists.
	TIMEOUT_CONFIG_PATH = "output/OSU_vs_Tenn_12_21_24_timeout_config.json"
	DIGIT_TEMPLATE_PATH = "output/debug/digit_templates"
	V3_BASELINE_PATH = "output/benchmarks/v3_special_plays_baseline.json"

	# Scorebug region (from previous sessions)
	# NOTE(review): presumably (x, y, width, height) in pixels - confirm against
	# the detector's set_fixed_region() contract.
	SCOREBUG_REGION = (128, 975, 1669, 46)

	# Minimum play duration filter (same as main pipeline)
	# Detected plays whose duration is below this value are dropped before scoring.
	# NOTE(review): presumably seconds, like the play start/end times - confirm.
	MIN_PLAY_DURATION = 3.0


	def load_v3_baseline():
	    """Read the v3 baseline JSON file and return its list of plays.

	    The file at ``V3_BASELINE_PATH`` is decoded as UTF-8 JSON. The value
	    stored under the top-level ``"plays"`` key is returned; an empty list
	    is returned when that key is absent.
	    """
	    baseline_text = Path(V3_BASELINE_PATH).read_text(encoding="utf-8")
	    return json.loads(baseline_text).get("plays", [])


	def find_matching_play(detected_play: dict, baseline_plays: list, tolerance: float = 5.0):
	    """
	    Look up the baseline play that corresponds to a detected play.

	    Baseline plays are scanned in list order and the first one whose
	    ``start_time`` differs from the detected play's ``start_time`` by at
	    most ``tolerance`` is returned. A missing ``start_time`` key is
	    treated as 0 on either side.

	    Args:
	        detected_play: Detected play dict with start_time
	        baseline_plays: List of baseline play dicts
	        tolerance: Maximum allowed start-time difference in seconds

	    Returns:
	        The first baseline play within tolerance, or None if there is none
	    """
	    wanted_start = detected_play.get("start_time", 0)
	    in_tolerance = (
	        candidate
	        for candidate in baseline_plays
	        if abs(wanted_start - candidate.get("start_time", 0)) <= tolerance
	    )
	    # next() with a default yields the first hit in list order, or None.
	    return next(in_tolerance, None)


	def compare_results(detected_plays: list, baseline_plays: list, tolerance: float = 5.0):
	    """
	    Compare detected plays against baseline plays by start time.

	    Detected plays are processed in list order. Each one claims the nearest
	    baseline play that is still unclaimed and whose ``start_time`` lies
	    within ``tolerance`` of its own (ties go to the earlier baseline entry).
	    A baseline play can be claimed only once, so a detection whose
	    in-tolerance baseline plays are all taken counts as a false positive.

	    Matching is tracked by list index rather than by dict equality, so two
	    baseline entries with identical contents are still distinct plays, and
	    a detection is not rejected merely because the first in-tolerance
	    baseline play was already claimed while another one is still free.

	    Args:
	        detected_plays: List of detected play dicts (``start_time`` read,
	            defaulting to 0 when absent)
	        baseline_plays: List of baseline play dicts (same convention)
	        tolerance: Maximum allowed start-time difference in seconds

	    Returns dict with:
	        - true_positives: List of {"detected": ..., "baseline": ...} pairs
	        - false_positives: Plays detected but not matched to the baseline
	        - false_negatives: Baseline plays that no detection claimed
	        - counts: {"tp", "fp", "fn"}
	        - metrics: {"precision", "recall", "f1"}, each 0.0 when undefined
	    """
	    baseline_starts = [bp.get("start_time", 0) for bp in baseline_plays]
	    claimed = set()
	    true_positives = []
	    false_positives = []

	    for detected in detected_plays:
	        detected_start = detected.get("start_time", 0)
	        # Only baseline plays nobody has claimed yet are eligible.
	        candidates = [
	            idx
	            for idx, start in enumerate(baseline_starts)
	            if idx not in claimed and abs(detected_start - start) <= tolerance
	        ]
	        if not candidates:
	            # Either nothing is close enough, or every close baseline play
	            # was already claimed (duplicate detection) - both are FPs.
	            false_positives.append(detected)
	            continue

	        best = min(candidates, key=lambda idx: abs(detected_start - baseline_starts[idx]))
	        claimed.add(best)
	        true_positives.append({"detected": detected, "baseline": baseline_plays[best]})

	    # Baseline plays that no detection claimed are the misses.
	    false_negatives = [bp for idx, bp in enumerate(baseline_plays) if idx not in claimed]

	    tp = len(true_positives)
	    fp = len(false_positives)
	    fn = len(false_negatives)

	    # Guard each ratio against an empty denominator.
	    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
	    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
	    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0

	    return {
	        "true_positives": true_positives,
	        "false_positives": false_positives,
	        "false_negatives": false_negatives,
	        "counts": {"tp": tp, "fp": fp, "fn": fn},
	        "metrics": {"precision": precision, "recall": recall, "f1": f1},
	    }


	def run_full_video_evaluation():
	    """Run full video evaluation with template-based clock reading.

	    Steps, in order:
	    1. Check that the required input files exist (the timeout config is
	       optional and only used when present).
	    2. Load the v3 baseline plays and the digit template library.
	    3. Build a ``PlayDetector`` with template matching enabled, pin the
	       scorebug region to ``SCOREBUG_REGION`` and run it over the whole video.
	    4. Drop detected plays shorter than ``MIN_PLAY_DURATION`` and score the
	       rest against the baseline with ``compare_results``.
	    5. Log the metrics, up to ten missed and ten extra plays, and write a
	       JSON report to ``output/benchmarks/template_matching_evaluation.json``.

	    Returns:
	        True when recall >= 0.95 and precision >= 0.90. False otherwise,
	        including when a required input file is missing or the digit
	        templates fail to load.
	    """
	    # Pass thresholds, kept in one place so the check and the log agree.
	    min_recall = 0.95
	    min_precision = 0.90
	    # Cap on how many missed / extra plays are listed individually.
	    max_listed = 10

	    banner = "=" * 70
	    logger.info(banner)
	    logger.info("FULL VIDEO EVALUATION: Template-Based Play Clock Reading")
	    logger.info(banner)

	    # Check files exist; TIMEOUT_CONFIG_PATH is deliberately absent here
	    # because the timeout tracker is optional.
	    required_inputs = [
	        (VIDEO_PATH, "Video"),
	        (TEMPLATE_PATH, "Scorebug template"),
	        (PLAYCLOCK_CONFIG_PATH, "Play clock config"),
	        (DIGIT_TEMPLATE_PATH, "Digit templates"),
	        (V3_BASELINE_PATH, "V3 baseline"),
	    ]
	    for path, name in required_inputs:
	        if not Path(path).exists():
	            logger.error("%s not found: %s", name, path)
	            return False

	    # Load v3 baseline
	    logger.info("\n[Step 1] Loading v3 baseline...")
	    baseline_plays = load_v3_baseline()
	    logger.info("V3 baseline plays: %d", len(baseline_plays))

	    # Check digit templates
	    logger.info("\n[Step 2] Loading digit templates...")
	    template_library = DigitTemplateLibrary()
	    if not template_library.load(DIGIT_TEMPLATE_PATH):
	        logger.error("Failed to load digit templates from %s", DIGIT_TEMPLATE_PATH)
	        return False

	    coverage = template_library.get_coverage_status()
	    logger.info("Template coverage: %d/%d", coverage["total_have"], coverage["total_needed"])
	    logger.info(" Ones (center): %s", coverage["ones_center_have"])
	    logger.info(" Ones (right): %s", coverage["ones_right_have"])
	    logger.info(" Tens (left): %s", coverage["tens_have"])
	    logger.info(" Blank: %s", "YES" if coverage["has_blank"] else "NO")

	    # Create detection config with template path
	    logger.info("\n[Step 3] Setting up detection pipeline...")
	    detection_config = DetectionConfig(
	        video_path=VIDEO_PATH,
	        template_path=TEMPLATE_PATH,
	        clock_region_config_path=PLAYCLOCK_CONFIG_PATH,
	        start_time=0.0,
	        end_time=None,  # Full video
	        frame_interval=0.5,
	        use_template_matching=True,
	        digit_template_path=DIGIT_TEMPLATE_PATH,
	    )

	    # Initialize timeout tracker if config exists
	    timeout_tracker = None
	    if Path(TIMEOUT_CONFIG_PATH).exists():
	        timeout_tracker = TrackTimeouts(config_path=TIMEOUT_CONFIG_PATH)
	        logger.info("Timeout tracker initialized")

	    # Initialize detector
	    detector = PlayDetector(detection_config, timeout_tracker=timeout_tracker)

	    # Set fixed scorebug region
	    detector.scorebug_detector.set_fixed_region(SCOREBUG_REGION)
	    logger.info("Scorebug region set: %s", SCOREBUG_REGION)

	    # Run detection, timing only the detect() call itself.
	    logger.info("\n[Step 4] Running detection on full video...")
	    logger.info("This may take several minutes...")
	    detect_started = time.time()
	    result = detector.detect()
	    elapsed = time.time() - detect_started
	    logger.info("Detection complete in %.1f seconds (%.1f minutes)", elapsed, elapsed / 60)

	    # Filter short plays. When a play has no "duration" key, fall back to
	    # end_time - start_time.
	    detected_plays = [
	        play
	        for play in result.plays
	        if play.get("duration", play.get("end_time", 0) - play.get("start_time", 0)) >= MIN_PLAY_DURATION
	    ]
	    logger.info("Detected plays (after filtering): %d", len(detected_plays))

	    # Compare against baseline
	    logger.info("\n[Step 5] Comparing against v3 baseline...")
	    comparison = compare_results(detected_plays, baseline_plays)
	    counts = comparison["counts"]
	    metrics = comparison["metrics"]

	    # Print results
	    logger.info("\n" + banner)
	    logger.info("EVALUATION RESULTS")
	    logger.info(banner)
	    logger.info("V3 Baseline plays: %d", len(baseline_plays))
	    logger.info("Detected plays: %d", len(detected_plays))
	    logger.info("")
	    logger.info("True Positives (matched): %d", counts["tp"])
	    logger.info("False Positives (extra): %d", counts["fp"])
	    logger.info("False Negatives (missed): %d", counts["fn"])
	    logger.info("")
	    logger.info("Precision: %.1f%% (detected plays that match baseline)", metrics["precision"] * 100)
	    logger.info("Recall: %.1f%% (baseline plays that were detected)", metrics["recall"] * 100)
	    logger.info("F1 Score: %.1f%%", metrics["f1"] * 100)
	    logger.info("")
	    logger.info("Total processing time: %.1f seconds (%.1f minutes)", elapsed, elapsed / 60)

	    # Show timing breakdown
	    if result.timing:
	        logger.info("\nTiming breakdown:")
	        for key, value in result.timing.items():
	            logger.info(" %s: %.1fs", key, value)

	    # List the first few missed plays, then the first few extra detections.
	    # Both sections share the same layout, so one loop handles both.
	    for heading, plays in (
	        ("\n--- MISSED PLAYS (False Negatives) ---", comparison["false_negatives"]),
	        ("\n--- EXTRA DETECTIONS (False Positives) ---", comparison["false_positives"]),
	    ):
	        if not plays:
	            continue
	        logger.info(heading)
	        for rank, play in enumerate(plays[:max_listed], start=1):
	            start = play.get("start_time", 0)
	            logger.info(" %d. t=%d:%05.2f (%.1fs)", rank, int(start // 60), start % 60, start)
	        if len(plays) > max_listed:
	            logger.info(" ... and %d more", len(plays) - max_listed)

	    # Check if specifically testing for the missed play at 1:52:06 (6726s)
	    target_time = 6726.0  # 1:52:06
	    target_play = next(
	        (play for play in detected_plays if abs(play.get("start_time", 0) - target_time) <= 10),
	        None,
	    )
	    if target_play is not None:
	        logger.info("\n* MILESTONE: Play near 1:52:06 (6726s) WAS DETECTED! *")
	        logger.info(" Start time: %.1fs", target_play.get("start_time", 0))
	    else:
	        logger.info("\n* WARNING: Play near 1:52:06 (6726s) was NOT detected *")

	    # Save results
	    output_path = Path("output/benchmarks/template_matching_evaluation.json")
	    output_path.parent.mkdir(parents=True, exist_ok=True)

	    results_data = {
	        "video": VIDEO_PATH,
	        "method": "template_matching",
	        "baseline_plays": len(baseline_plays),
	        "detected_plays": len(detected_plays),
	        "counts": counts,
	        "metrics": metrics,
	        "elapsed_seconds": elapsed,
	        "timing": result.timing,
	        "plays": detected_plays,
	        "false_negatives": [{"start_time": p.get("start_time")} for p in comparison["false_negatives"]],
	        "false_positives": [{"start_time": p.get("start_time")} for p in comparison["false_positives"]],
	    }

	    with open(output_path, "w", encoding="utf-8") as f:
	        json.dump(results_data, f, indent=2)
	    logger.info("\nResults saved to: %s", output_path)

	    # Pass criteria
	    passed = metrics["recall"] >= min_recall and metrics["precision"] >= min_precision
	    if passed:
	        # The thresholds are passed as args on purpose: logging only applies
	        # %-formatting when args are supplied, so a bare "95%%" would be
	        # emitted literally with both percent signs.
	        logger.info(
	            "\nEVALUATION: PASSED (recall >= %.0f%%, precision >= %.0f%%)",
	            min_recall * 100,
	            min_precision * 100,
	        )
	    else:
	        logger.info(
	            "\nEVALUATION: NEEDS REVIEW (recall=%.1f%%, precision=%.1f%%)",
	            metrics["recall"] * 100,
	            metrics["precision"] * 100,
	        )

	    return passed


	if __name__ == "__main__":
	    # Exit status mirrors the evaluation verdict: 0 when it passed, 1 otherwise.
	    raise SystemExit(0 if run_full_video_evaluation() else 1)