cfb40 / scripts /test_full_video_evaluation.py
andytaylor-smg's picture
moving stuff all around
6c65498
#!/usr/bin/env python3
"""
Full video evaluation using template-based play clock reading.
This test:
1. Loads pre-built digit templates
2. Runs play detection on the full video using template matching
3. Compares detected plays against v3 baseline
4. Reports accuracy metrics
The goal is to verify that template-based clock reading matches or exceeds
OCR-based detection quality while being significantly faster.
Usage:
cd /Users/andytaylor/Documents/Personal/cfb40
source .venv/bin/activate
python scripts/test_full_video_evaluation.py
"""
import json
import logging
import sys
import time
from pathlib import Path
from pipeline.play_detector import DetectionConfig, PlayDetector
from setup import DigitTemplateLibrary
from detection import TrackTimeouts
# Module-wide logging setup: INFO level, each record formatted as
# "<timestamp> - <level> - <message>".
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

# Configuration
# All paths below are relative; presumably they are resolved against the
# repository root, so run the script from there (TODO confirm).
VIDEO_PATH = "full_videos/OSU vs Tenn 12.21.24.mkv"
TEMPLATE_PATH = "output/OSU_vs_Tenn_12_21_24_template.png"
PLAYCLOCK_CONFIG_PATH = "output/OSU_vs_Tenn_12_21_24_playclock_config.json"
# Optional: the timeout tracker is only created when this file exists.
TIMEOUT_CONFIG_PATH = "output/OSU_vs_Tenn_12_21_24_timeout_config.json"
DIGIT_TEMPLATE_PATH = "output/debug/digit_templates"
V3_BASELINE_PATH = "output/benchmarks/v3_special_plays_baseline.json"

# Scorebug region (from previous sessions)
# NOTE(review): presumably (x, y, width, height) in pixels; confirm against
# the scorebug detector's set_fixed_region().
SCOREBUG_REGION = (128, 975, 1669, 46)

# Minimum play duration filter (same as main pipeline)
# Detected plays shorter than this many seconds are dropped before comparison.
MIN_PLAY_DURATION = 3.0
def load_v3_baseline():
    """Return the plays recorded in the v3 baseline file.

    Reads ``V3_BASELINE_PATH`` as UTF-8 encoded JSON and returns the value
    stored under its top-level "plays" key, or an empty list when that key
    is absent.
    """
    baseline_text = Path(V3_BASELINE_PATH).read_text(encoding="utf-8")
    baseline = json.loads(baseline_text)
    return baseline.get("plays", [])
def find_matching_play(detected_play: dict, baseline_plays: list, tolerance: float = 5.0):
    """
    Find the baseline play that best matches a detected play.

    A baseline play is a candidate when its start time is within ``tolerance``
    seconds of the detected play's start time. Among all candidates the one
    with the smallest start-time difference is returned, so the result does
    not depend on the order of ``baseline_plays`` (ties go to the earliest
    entry in the list).

    Args:
        detected_play: Detected play dict with start_time (missing -> 0)
        baseline_plays: List of baseline play dicts with start_time (missing -> 0)
        tolerance: Time tolerance in seconds (inclusive)

    Returns:
        Closest matching baseline play, or None when no baseline play lies
        within the tolerance (including when ``baseline_plays`` is empty)
    """
    detected_start = detected_play.get("start_time", 0)

    best_match = None
    best_delta = None
    for baseline in baseline_plays:
        delta = abs(detected_start - baseline.get("start_time", 0))
        if delta > tolerance:
            continue
        # Strict "<" keeps the first candidate on ties.
        if best_delta is None or delta < best_delta:
            best_match = baseline
            best_delta = delta
    return best_match
def compare_results(detected_plays: list, baseline_plays: list, tolerance: float = 5.0):
    """
    Compare detected plays against baseline.

    Detected plays are processed in order. Each one is paired with the closest
    baseline play that has not been matched yet and whose start time is within
    ``tolerance`` seconds. Baseline plays are tracked by list position, so two
    baseline entries with identical contents are still treated as distinct
    plays, and a detection is only a false positive when no unmatched baseline
    play remains within the tolerance.

    Args:
        detected_plays: List of detected play dicts with start_time (missing -> 0)
        baseline_plays: List of baseline play dicts with start_time (missing -> 0)
        tolerance: Time tolerance in seconds (inclusive)

    Returns dict with:
    - true_positives: List of {"detected": ..., "baseline": ...} pairs
    - false_positives: Plays detected but not in baseline
    - false_negatives: Plays in baseline but not detected
    - counts: {"tp", "fp", "fn"}
    - metrics: {"precision", "recall", "f1"} (each 0 when undefined)
    """
    matched_baseline = set()
    true_positives = []
    false_positives = []

    # Find matches for detected plays
    for detected in detected_plays:
        detected_start = detected.get("start_time", 0)
        best_idx = None
        best_delta = None
        for idx, baseline in enumerate(baseline_plays):
            if idx in matched_baseline:
                # Each baseline play may be claimed by one detection only.
                continue
            delta = abs(detected_start - baseline.get("start_time", 0))
            if delta <= tolerance and (best_delta is None or delta < best_delta):
                best_idx = idx
                best_delta = delta

        if best_idx is None:
            # Nothing unmatched within tolerance (includes duplicate detections).
            false_positives.append(detected)
            continue

        matched_baseline.add(best_idx)
        true_positives.append({"detected": detected, "baseline": baseline_plays[best_idx]})

    # Find unmatched baseline plays (false negatives)
    false_negatives = [bp for i, bp in enumerate(baseline_plays) if i not in matched_baseline]

    # Calculate metrics
    tp = len(true_positives)
    fp = len(false_positives)
    fn = len(false_negatives)
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

    return {
        "true_positives": true_positives,
        "false_positives": false_positives,
        "false_negatives": false_negatives,
        "counts": {"tp": tp, "fp": fp, "fn": fn},
        "metrics": {"precision": precision, "recall": recall, "f1": f1},
    }
def _log_play_times(header: str, plays: list, limit: int = 10) -> None:
    """Log ``header`` followed by the start times of at most ``limit`` plays; no-op for an empty list."""
    if not plays:
        return
    logger.info(header)
    for position, play in enumerate(plays[:limit], start=1):
        start = play.get("start_time", 0)
        minutes = int(start // 60)
        seconds = start % 60
        logger.info(" %d. t=%d:%05.2f (%.1fs)", position, minutes, seconds, start)
    if len(plays) > limit:
        logger.info(" ... and %d more", len(plays) - limit)


def run_full_video_evaluation():
    """
    Run full video evaluation with template-based clock reading.

    Steps:
    1. Verify the required input files exist and load the v3 baseline plays.
    2. Load the digit template library and log its coverage.
    3. Build the detection pipeline (with a timeout tracker when its config
       file exists) and pin the scorebug region.
    4. Run detection over the whole video and drop plays shorter than
       MIN_PLAY_DURATION.
    5. Compare against the baseline, log the metrics, and write a JSON report
       to output/benchmarks/template_matching_evaluation.json.

    Returns:
        True when recall >= 0.95 and precision >= 0.90. False when those
        thresholds are not met, when a required input file is missing, or
        when the digit templates fail to load.
    """
    logger.info("=" * 70)
    logger.info("FULL VIDEO EVALUATION: Template-Based Play Clock Reading")
    logger.info("=" * 70)

    # Check files exist (the timeout config is optional and handled below)
    required_inputs = [
        (VIDEO_PATH, "Video"),
        (TEMPLATE_PATH, "Scorebug template"),
        (PLAYCLOCK_CONFIG_PATH, "Play clock config"),
        (DIGIT_TEMPLATE_PATH, "Digit templates"),
        (V3_BASELINE_PATH, "V3 baseline"),
    ]
    for path, name in required_inputs:
        if not Path(path).exists():
            logger.error("%s not found: %s", name, path)
            return False

    # Load v3 baseline
    logger.info("\n[Step 1] Loading v3 baseline...")
    baseline_plays = load_v3_baseline()
    logger.info("V3 baseline plays: %d", len(baseline_plays))

    # Check digit templates
    logger.info("\n[Step 2] Loading digit templates...")
    template_library = DigitTemplateLibrary()
    if not template_library.load(DIGIT_TEMPLATE_PATH):
        logger.error("Failed to load digit templates from %s", DIGIT_TEMPLATE_PATH)
        return False
    coverage = template_library.get_coverage_status()
    logger.info("Template coverage: %d/%d", coverage["total_have"], coverage["total_needed"])
    logger.info(" Ones (center): %s", coverage["ones_center_have"])
    logger.info(" Ones (right): %s", coverage["ones_right_have"])
    logger.info(" Tens (left): %s", coverage["tens_have"])
    logger.info(" Blank: %s", "YES" if coverage["has_blank"] else "NO")

    # Create detection config with template path
    logger.info("\n[Step 3] Setting up detection pipeline...")
    detection_config = DetectionConfig(
        video_path=VIDEO_PATH,
        template_path=TEMPLATE_PATH,
        clock_region_config_path=PLAYCLOCK_CONFIG_PATH,
        start_time=0.0,
        end_time=None,  # Full video
        frame_interval=0.5,
        use_template_matching=True,
        digit_template_path=DIGIT_TEMPLATE_PATH,
    )

    # Initialize timeout tracker if config exists
    timeout_tracker = None
    if Path(TIMEOUT_CONFIG_PATH).exists():
        timeout_tracker = TrackTimeouts(config_path=TIMEOUT_CONFIG_PATH)
        logger.info("Timeout tracker initialized")

    # Initialize detector
    detector = PlayDetector(detection_config, timeout_tracker=timeout_tracker)

    # Set fixed scorebug region
    detector.scorebug_detector.set_fixed_region(SCOREBUG_REGION)
    logger.info("Scorebug region set: %s", SCOREBUG_REGION)

    # Run detection
    logger.info("\n[Step 4] Running detection on full video...")
    logger.info("This may take several minutes...")
    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments during a multi-minute run.
    detection_started = time.perf_counter()
    result = detector.detect()
    elapsed = time.perf_counter() - detection_started
    logger.info("Detection complete in %.1f seconds (%.1f minutes)", elapsed, elapsed / 60)

    # Filter short plays; fall back to end_time - start_time when a play
    # carries no explicit "duration".
    detected_plays = [
        play
        for play in result.plays
        if play.get("duration", play.get("end_time", 0) - play.get("start_time", 0)) >= MIN_PLAY_DURATION
    ]
    logger.info("Detected plays (after filtering): %d", len(detected_plays))

    # Compare against baseline
    logger.info("\n[Step 5] Comparing against v3 baseline...")
    comparison = compare_results(detected_plays, baseline_plays)

    # Print results
    counts = comparison["counts"]
    metrics = comparison["metrics"]
    logger.info("\n" + "=" * 70)
    logger.info("EVALUATION RESULTS")
    logger.info("=" * 70)
    logger.info("V3 Baseline plays: %d", len(baseline_plays))
    logger.info("Detected plays: %d", len(detected_plays))
    logger.info("")
    logger.info("True Positives (matched): %d", counts["tp"])
    logger.info("False Positives (extra): %d", counts["fp"])
    logger.info("False Negatives (missed): %d", counts["fn"])
    logger.info("")
    logger.info("Precision: %.1f%% (detected plays that match baseline)", metrics["precision"] * 100)
    logger.info("Recall: %.1f%% (baseline plays that were detected)", metrics["recall"] * 100)
    logger.info("F1 Score: %.1f%%", metrics["f1"] * 100)
    logger.info("")
    logger.info("Total processing time: %.1f seconds (%.1f minutes)", elapsed, elapsed / 60)

    # Show timing breakdown
    if result.timing:
        logger.info("\nTiming breakdown:")
        for key, value in result.timing.items():
            logger.info(" %s: %.1fs", key, value)

    # List missed plays and extra detections (first 10 of each)
    _log_play_times("\n--- MISSED PLAYS (False Negatives) ---", comparison["false_negatives"])
    _log_play_times("\n--- EXTRA DETECTIONS (False Positives) ---", comparison["false_positives"])

    # Check if specifically testing for the missed play at 1:52:06 (6726s)
    target_time = 6726.0  # 1:52:06
    target_play = next(
        (play for play in detected_plays if abs(play.get("start_time", 0) - target_time) <= 10),
        None,
    )
    if target_play is not None:
        logger.info("\n*** MILESTONE: Play near 1:52:06 (6726s) WAS DETECTED! ***")
        logger.info(" Start time: %.1fs", target_play.get("start_time", 0))
    else:
        logger.info("\n*** WARNING: Play near 1:52:06 (6726s) was NOT detected ***")

    # Save results
    output_path = Path("output/benchmarks/template_matching_evaluation.json")
    output_path.parent.mkdir(parents=True, exist_ok=True)
    results_data = {
        "video": VIDEO_PATH,
        "method": "template_matching",
        "baseline_plays": len(baseline_plays),
        "detected_plays": len(detected_plays),
        "counts": counts,
        "metrics": metrics,
        "elapsed_seconds": elapsed,
        "timing": result.timing,
        "plays": detected_plays,
        "false_negatives": [{"start_time": p.get("start_time")} for p in comparison["false_negatives"]],
        "false_positives": [{"start_time": p.get("start_time")} for p in comparison["false_positives"]],
    }
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(results_data, f, indent=2)
    logger.info("\nResults saved to: %s", output_path)

    # Pass criteria
    passed = metrics["recall"] >= 0.95 and metrics["precision"] >= 0.90
    if passed:
        # No format args are passed here, so logging does not apply
        # %-formatting; a literal "%%" would be emitted verbatim.
        logger.info("\nEVALUATION: PASSED (recall >= 95%, precision >= 90%)")
    else:
        logger.info(
            "\nEVALUATION: NEEDS REVIEW (recall=%.1f%%, precision=%.1f%%)",
            metrics["recall"] * 100,
            metrics["precision"] * 100,
        )
    return passed
if __name__ == "__main__":
    # Process exit status: 0 when the evaluation returns a truthy result,
    # 1 otherwise.
    exit_code = 0 if run_full_video_evaluation() else 1
    sys.exit(exit_code)