# smashfix-v1 / src/evaluate_video.py
# (hosting-page residue preserved as a comment: uploader "uncertainrods",
#  branch "v1-try-deploy", commit 0d0412d)
#!/usr/bin/env python3
"""
Evaluate a single video file with KSI metrics and natural language coaching.
Supports both file paths and real-time webcam input.
"""
# --- DETERMINISM FIXES (MUST BE BEFORE TF IMPORT) ---
import os
import sys

# Check for GPU flag early (before TF imports): device visibility must be set
# via environment variables before TensorFlow is imported, so this cannot wait
# for argparse (which runs after all imports).
_use_gpu = '--gpu' in sys.argv
if not _use_gpu:
    # Force CPU mode for deterministic predictions
    os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
    os.environ['MEDIAPIPE_DISABLE_GPU'] = '1'
    print("πŸ”’ Running in CPU mode for deterministic predictions (use --gpu to enable GPU)")
# Determinism-related TF switches, applied regardless of device.
os.environ['TF_DETERMINISTIC_OPS'] = '1'
os.environ['TF_CUDNN_DETERMINISTIC'] = '1'
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'

import argparse
import yaml
import cv2
import numpy as np
from collections import deque
import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.applications.mobilenet_v2 import preprocess_input

# Disable GPU visibility in TF if CPU mode (double check)
if not _use_gpu:
    tf.config.set_visible_devices([], 'GPU')

import mediapipe as mp
from ksi_v2 import EnhancedKSI, ShotPhase
from features import HybridFeatureExtractor
from utils import normalize_pose, resolve_crop_config_for_video, should_skip_crop

# Natural-language coaching is optional; degrade gracefully when the module
# is not installed (NLP_AVAILABLE gates report generation below).
try:
    from natural_language_coach import generate_coaching_report
    NLP_AVAILABLE = True
except ImportError:
    NLP_AVAILABLE = False
def load_params():
    """Read the pipeline configuration from params.yaml in the working directory."""
    with open("params.yaml") as cfg_file:
        return yaml.safe_load(cfg_file)
def load_expert_templates(params):
    """Load the expert reference templates named by the config.

    Args:
        params: config dict with params['expert_pipeline']['output_path']
            pointing at an .npz archive of expert landmark templates.

    Returns:
        The NpzFile opened with allow_pickle=True.

    Raises:
        FileNotFoundError: when the archive does not exist.
    """
    path = params['expert_pipeline']['output_path']
    if os.path.exists(path):
        return np.load(path, allow_pickle=True)
    raise FileNotFoundError(f"Templates not found at {path}")
def _smooth_signal(signal, window_size=5):
"""Apply exponential moving average for smoothing noisy signals."""
if len(signal) == 0:
return signal
alpha = 2.0 / (window_size + 1)
smoothed = [signal[0]]
for val in signal[1:]:
smoothed.append(alpha * val + (1 - alpha) * smoothed[-1])
return np.array(smoothed)
def find_contact_moment(all_landmarks, seq_len):
    """
    Locate the window and frame of racket-ball contact via peak arm acceleration.

    For every window, the per-frame speeds of the right wrist (16), elbow (14)
    and shoulder (12) are combined with distal-weighted coefficients, smoothed
    with an EMA (span 3), and the window whose smoothed-velocity changes have
    the largest L2 norm is selected as the contact window.

    Args:
        all_landmarks: iterable of (T, 33, 3) landmark arrays, one per window.
        seq_len: window length (kept for interface compatibility; unused).

    Returns:
        (contact_window_idx, contact_frame): index of the peak-acceleration
        window, and the frame of maximum smoothed velocity inside it.
    """
    best_accel = 0
    best_window = 0
    best_frame = 0
    # (joint index, weight): wrist dominates, shoulder stabilizes.
    joint_weights = ((16, 0.5), (14, 0.3), (12, 0.2))

    for idx, window_lm in enumerate(all_landmarks):
        # Weighted sum of per-frame joint speeds (xy plane only), length T-1.
        composite = sum(
            weight * np.linalg.norm(np.diff(window_lm[:, joint, :2], axis=0), axis=1)
            for joint, weight in joint_weights
        )
        if len(composite) == 0:
            continue
        # Inline EMA smoothing; alpha = 2 / (3 + 1) = 0.5 (window_size=3).
        smoothed = [composite[0]]
        for speed in composite[1:]:
            smoothed.append(0.5 * speed + 0.5 * smoothed[-1])
        smoothed = np.array(smoothed)
        if len(smoothed) > 1:
            accel = np.linalg.norm(np.diff(smoothed))
            if accel > best_accel:
                best_accel = accel
                best_window = idx
                # Peak smoothed velocity marks the contact frame in-window.
                best_frame = np.argmax(smoothed)
    return best_window, best_frame
def predict_shot_type_at_contact(all_windows, all_landmarks, model, classes, cnn_dim, pipeline_type='hybrid'):
    """
    Predict shot type from the window containing the contact moment (highest
    arm acceleration), rather than a consensus across all windows.

    Args:
        all_windows: list of (T, D) fused feature windows.
        all_landmarks: list of (T, 33, 3) landmark arrays, one per window.
        model: trained Keras classifier.
        classes: ordered class names matching the model's output units.
        cnn_dim: trailing CNN feature width inside D.
        pipeline_type: kept for interface compatibility; unused here.

    Returns:
        predictions: All predictions for reference
        best_prediction: The prediction at contact moment
        best_class: Shot class at contact
        contact_info: Dict with contact window and frame info
    """
    # Find contact moment (on the raw, un-resampled landmarks)
    seq_len = all_windows[0].shape[0] if all_windows else 40
    contact_window_idx, contact_frame = find_contact_moment(all_landmarks, seq_len)

    features = np.array(all_windows)  # (N, T, D)

    # Resolve the model's expected sequence length from its input signature.
    # BUG FIX: shape[1] can be None for variable-length models; the original
    # called int(None) before its None check and raised TypeError.
    expected_seq_len = None
    if model.inputs:
        dim = model.inputs[0].shape[1]
        if dim is not None:
            expected_seq_len = int(dim)
    if expected_seq_len is not None and expected_seq_len != seq_len:
        # Downsample by stride then trim, preserving temporal coverage.
        stride = max(1, seq_len // expected_seq_len)
        features = features[:, ::stride, :][:, :expected_seq_len, :]
        # Keep landmark windows aligned with the downsampled features.
        all_landmarks = [lm[::stride][:expected_seq_len] for lm in all_landmarks]
        seq_len = expected_seq_len

    model_inputs = _prepare_model_inputs(model, x_fused=features, cnn_dim=cnn_dim)
    all_probs = model.predict(model_inputs, verbose=0)

    all_predictions = []
    for i, probs in enumerate(all_probs):
        pred_idx = np.argmax(probs)
        predicted_class = classes[pred_idx]
        confidence = float(probs[pred_idx])
        all_predictions.append({
            'window': i,
            'class': predicted_class,
            'confidence': confidence,
            'all_scores': {classes[j]: float(probs[j]) for j in range(len(classes))}
        })

    # The prediction for the window where contact was detected.
    best_prediction = all_predictions[contact_window_idx]
    best_class = best_prediction['class']
    contact_info = {
        'contact_window': contact_window_idx,
        'contact_frame': contact_frame,
        'total_windows': len(all_windows),
        'seq_len': seq_len
    }
    return all_predictions, best_prediction, best_class, contact_info
def extract_features_from_video(video_source, extractor, params, pipeline_type='hybrid'):
    """
    Extract features and landmarks from video using sliding window (like realtime_hybrid).
    Uses image-space landmarks (pose_landmarks) so units match expert templates.
    Skips low-quality windows to avoid zeroed KSI.

    Args:
        video_source: File path or webcam index (0, 1, etc)
        extractor: Feature extractor (HybridFeatureExtractor)
        params: Configuration dict
        pipeline_type: 'hybrid' or 'pose' (interface compatibility; the
            hybrid config section is always read here)

    Returns:
        all_windows: List of fused feature windows
        all_landmarks: List of (T, 33, 3) landmarks per window
        frame_count: Total frames processed

    Raises:
        RuntimeError: if the source cannot be opened, no windows were
            extracted, or every window was filtered out for low pose quality.

    Note: removed unreachable leftover code after the return statement that
    referenced undefined names (features / raw_landmarks).
    """
    # Open video or webcam (an all-digit string is treated as a device index)
    if isinstance(video_source, str) and video_source.isdigit():
        cap = cv2.VideoCapture(int(video_source))
        is_webcam = True  # currently informational only
    else:
        cap = cv2.VideoCapture(video_source)
        is_webcam = False
    if not cap.isOpened():
        raise RuntimeError(f"Cannot open video source: {video_source}")

    cfg = params['hybrid_pipeline']
    seq_len = cfg['sequence_length']
    cnn_dim = cfg['cnn_feature_dim']

    # Correctly resolve crop config (base config merged with per-video overrides)
    base_crop = cfg.get('crop_config', {})
    overrides = params.get('crop_overrides', {})
    crop_cfg = resolve_crop_config_for_video(video_source, base_crop, overrides)
    roi_cfg = cfg.get("cnn_roi") or {}

    # Sliding windows over the stream; once full, every new frame yields a window.
    window = deque(maxlen=seq_len)
    landmark_window = deque(maxlen=seq_len)
    valid_mask_window = deque(maxlen=seq_len)

    all_windows = []
    all_landmarks = []
    all_valid_ratios = []
    last_pose = None  # last successfully detected, normalized pose vector (99,)
    last_box = None   # last ROI box, reused when pose detection drops out
    frame_count = 0

    print(f"πŸ“Ή Processing video from: {video_source}")
    print(f" Sequence length: {seq_len} | CNN features: {cnn_dim}")

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # Crop if needed
        if crop_cfg:
            frame = _apply_crop(frame, crop_cfg)

        # Pose detection using extractor's MediaPipe (expects RGB input)
        res = extractor.pose.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        frame_count += 1

        # Extract pose landmarks (image-space) so scale matches templates
        if res.pose_landmarks:
            lm = np.array(
                [[l.x, l.y, l.z] for l in res.pose_landmarks.landmark],
                dtype=np.float32,
            )
            pose_flat = normalize_pose(lm).astype(np.float32).flatten()
            last_pose = pose_flat
            landmark_window.append(lm)
            valid_mask_window.append(1)
        else:
            # Reuse last good pose; if none, mark invalid
            zeros_pose = np.zeros(99, dtype=np.float32)
            pose_flat = last_pose if last_pose is not None else zeros_pose
            landmark_window.append(np.zeros((33, 3), dtype=np.float32))
            valid_mask_window.append(0)

        # Extract CNN features from a pose-centered ROI of the frame
        h, w = frame.shape[:2]
        box = extractor._compute_pose_roi_box(
            getattr(res, "pose_landmarks", None),
            w,
            h,
            roi_cfg,
            last_box=last_box,
        )
        last_box = box if box is not None else last_box
        roi_frame = extractor._crop_with_box(frame, box)
        img_size = cfg.get("cnn_input_size", 224)
        img = cv2.resize(roi_frame, (img_size, img_size))
        # BGR -> RGB channel flip, then MobileNetV2 preprocessing
        img = preprocess_input(np.expand_dims(img[..., ::-1], axis=0))
        cnn_feat = extractor.rgb_model.predict(img, verbose=0)[0].astype(np.float32)

        # Fuse pose and CNN features
        fused = np.concatenate([pose_flat, cnn_feat], axis=0)
        window.append(fused)

        # When window is full, save it (deque drops the oldest frame next loop)
        if len(window) == seq_len:
            valid_ratio = sum(valid_mask_window) / float(seq_len)
            all_windows.append(np.array(list(window)))
            all_landmarks.append(np.array(list(landmark_window)))
            all_valid_ratios.append(valid_ratio)

    cap.release()
    extractor.pose.close()

    if not all_windows:
        raise RuntimeError("No valid windows extracted from video")

    # Filter out low-quality windows (too many missing poses or NaNs)
    filtered_windows = []
    filtered_landmarks = []
    for win, lm, ratio in zip(all_windows, all_landmarks, all_valid_ratios):
        if ratio < 0.7:  # require at least 70% frames with pose
            continue
        if not np.isfinite(win).all() or not np.isfinite(lm).all():
            continue
        if np.allclose(lm, 0):  # avoid all-zero landmark windows
            continue
        filtered_windows.append(win)
        filtered_landmarks.append(lm)

    if not filtered_windows:
        raise RuntimeError("All windows were filtered out due to low pose quality; try a clearer video")

    print(f" Extracted {frame_count} frames into {len(filtered_windows)} valid windows (from {len(all_windows)} total)")
    print(f" Window shape: {filtered_windows[0].shape}")
    print(f" Landmarks shape: {filtered_landmarks[0].shape}")

    return filtered_windows, filtered_landmarks, frame_count
def _apply_crop(frame, crop_cfg):
"""Apply crop to frame"""
if crop_cfg is None:
return frame
h, w = frame.shape[:2]
start_row = int(h * float(crop_cfg.get("top", 0.0)))
end_row = h - int(h * float(crop_cfg.get("bottom", 0.0)))
start_col = int(w * float(crop_cfg.get("left", 0.0)))
end_col = w - int(w * float(crop_cfg.get("right", 0.0)))
cropped = frame[start_row:end_row, start_col:end_col]
return cropped if cropped.size else frame
def _prepare_model_inputs(model, x_fused, cnn_dim):
"""Prepare inputs for model (handles different input signatures like realtime_hybrid)."""
if x_fused.ndim != 3:
raise ValueError(f"Expected x_fused shape (N, T, D), got {x_fused.shape}")
fused_dim = int(x_fused.shape[-1])
x_cnn = x_fused[..., -cnn_dim:] if cnn_dim > 0 else x_fused[..., :0]
x_pose = x_fused[..., :-cnn_dim] if cnn_dim > 0 else x_fused
# For dual-input models (CNN + Pose), return both inputs in the correct order
if len(model.inputs) == 2:
# Typically: [cnn_input, pose_input] or [pose_input, cnn_input]
# Check which input expects which features based on shape
input_shapes = [int(inp.shape[-1]) for inp in model.inputs]
result = []
for expected_dim in input_shapes:
if expected_dim == cnn_dim:
result.append(x_cnn)
elif expected_dim == (fused_dim - cnn_dim):
result.append(x_pose)
else:
raise ValueError(
f"Model expects input dim {expected_dim}, but available are CNN({cnn_dim}) or Pose({fused_dim - cnn_dim}). "
f"(fused_dim={fused_dim})"
)
return result
# Single input model: try to match the expected dimension
if len(model.inputs) == 1:
expected = int(model.inputs[0].shape[-1])
candidates = {
int(x_cnn.shape[-1]): x_cnn,
int(x_pose.shape[-1]): x_pose,
int(x_fused.shape[-1]): x_fused,
}
if expected in candidates:
return [candidates[expected]]
return [x_fused]
# Multiple inputs: try to match each dimension
expected_dims = []
for inp in model.inputs:
try:
expected_dims.append(int(inp.shape[-1]))
except Exception:
expected_dims.append(None)
prepared = []
for d in expected_dims:
if d is None:
prepared.append(x_fused)
continue
if d == cnn_dim:
prepared.append(x_cnn)
elif d == (fused_dim - cnn_dim):
prepared.append(x_pose)
else:
raise ValueError(
f"Model expects input dim {d}, but available are CNN({cnn_dim}) or Pose({fused_dim - cnn_dim}). "
f"(fused_dim={fused_dim})"
)
return prepared
def predict_shot_type(all_windows, model, classes, cnn_dim, pipeline_type='hybrid'):
    """
    Classify every window and return the consensus (most frequent) class.

    Args:
        all_windows: list of (T, D) fused feature windows.
        model: trained Keras classifier.
        classes: ordered class names matching the model's output units.
        cnn_dim: trailing CNN feature width inside D.
        pipeline_type: kept for interface compatibility; unused here.

    Returns:
        (predictions, best_class): per-window dicts with keys 'class',
        'confidence' and 'all_scores', plus the most common class name.
    """
    from collections import Counter

    batch = np.array(all_windows)  # (N, T, D)
    probs_per_window = model.predict(
        _prepare_model_inputs(model, x_fused=batch, cnn_dim=cnn_dim), verbose=0
    )

    predictions = []
    winning_indices = []
    for probs in probs_per_window:
        top = np.argmax(probs)
        winning_indices.append(top)
        predictions.append({
            'class': classes[top],
            'confidence': float(probs[top]),
            'all_scores': {classes[j]: float(probs[j]) for j in range(len(classes))}
        })

    # Consensus = class predicted most often across windows (ties resolve to
    # the first-encountered class, matching Counter.most_common semantics).
    consensus_idx = Counter(winning_indices).most_common(1)[0][0]
    return predictions, classes[consensus_idx]
def evaluate_video(
    video_source,
    model_path,
    pipeline_type='hybrid',
    nlp_skill_level='intermediate',
    generate_report=True
):
    """
    Main evaluation function for single video.

    Pipeline: load model/classes -> extract sliding windows -> contact-based
    shot prediction -> KSI scoring against expert templates -> optional
    coaching report written under coaching_reports/.

    Args:
        video_source: File path or webcam index (0, 1, etc)
        model_path: Path to trained model
        pipeline_type: 'hybrid' or 'pose'
        nlp_skill_level: Skill level for coaching ('beginner', 'intermediate', 'advanced', 'expert')
        generate_report: Whether to generate coaching report
    """
    params = load_params()
    cfg = params[f'{pipeline_type}_pipeline']
    # KSI component weights; fall back to defaults when 'ksi' missing in params.
    ksi_cfg = params.get('ksi', {'weights': {'pose': 0.5, 'velocity': 0.3, 'acceleration': 0.2}})

    # Load model
    print(f"\nπŸ”„ Loading model: {model_path}")
    model = load_model(model_path)

    # Class names = sorted subdirectories of the training data folder
    # (presumably the same ordering used at training time — verify).
    data_path = cfg['data_path']
    classes = sorted([d for d in os.listdir(data_path) if os.path.isdir(os.path.join(data_path, d))])
    print(f"πŸ“‹ Classes: {classes}")

    # Feature extractor
    mp_config = params['mediapipe']
    extractor = HybridFeatureExtractor(
        mp_config=mp_config,
        cnn_dim=cfg['cnn_feature_dim'],
        cnn_input_size=cfg['cnn_input_size'],
        rsn_weights_path=cfg.get('rsn_pretrained_weights'),
    )

    # Get sequence parameters
    # NOTE(review): seq_len/stride are read but unused below;
    # extract_features_from_video re-reads them from params itself.
    seq_len = cfg['sequence_length']
    stride = cfg['stride']

    # Extract features from video
    print(f"\n{'='*70}")
    print(f"EXTRACTING FEATURES FROM VIDEO")
    print(f"{'='*70}")
    all_windows, all_landmarks, frame_count = extract_features_from_video(
        video_source, extractor, params, pipeline_type
    )

    # Predict shot type
    print(f"\n{'='*70}")
    print(f"PREDICTING SHOT TYPE (ALL WINDOWS)")
    print(f"{'='*70}")
    all_predictions, best_prediction, best_class, contact_info = predict_shot_type_at_contact(
        all_windows, all_landmarks, model, classes, cfg['cnn_feature_dim'], pipeline_type
    )
    print(f"\nπŸ“Š Total predictions: {len(all_predictions)}")
    print(f"{'─'*70}")

    # Group by class and show statistics
    from collections import Counter
    pred_classes = [p['class'] for p in all_predictions]
    class_counts = Counter(pred_classes)
    print(f"\n🎯 PREDICTION SUMMARY (all windows):")
    for shot_class in sorted(class_counts.keys()):
        count = class_counts[shot_class]
        percentage = 100 * count / len(all_predictions)
        confs = [p['confidence'] for p in all_predictions if p['class'] == shot_class]
        avg_conf = np.mean(confs)
        print(f" {shot_class:20s}: {count:3d} predictions ({percentage:5.1f}%) | Avg confidence: {avg_conf:.2%}")

    # Show contact-based prediction
    print(f"\n{'─'*70}")
    print(f"⚑ CONTACT-BASED PREDICTION (MOST RELIABLE):")
    print(f"{'─'*70}")
    print(f" Contact occurs at: Window {contact_info['contact_window']} (frame {contact_info['contact_frame']}/{contact_info['seq_len']})")
    print(f"\n 🎯 Predicted at contact: {best_prediction['class']}")
    print(f" Confidence: {best_prediction['confidence']:.2%}")
    print(f" All scores at contact:")
    for cls, score in sorted(best_prediction['all_scores'].items(), key=lambda x: x[1], reverse=True):
        print(f" {cls:20s}: {score:.2%}")

    # Show first 10 detailed predictions
    print(f"\n{'─'*70}")
    print(f"πŸ“‹ DETAILED PREDICTIONS (first 10 windows):")
    print(f"{'─'*70}")
    for i, pred in enumerate(all_predictions[:10]):
        marker = " ⚑ CONTACT" if i == contact_info['contact_window'] else ""
        print(f"\n Window {i+1:2d}: {pred['class']:20s} ({pred['confidence']:.2%}){marker}")
        sorted_scores = sorted(pred['all_scores'].items(), key=lambda x: x[1], reverse=True)
        for cls, score in sorted_scores[:3]:
            print(f" {cls:20s}: {score:.2%}")
    if len(all_predictions) > 10:
        print(f"\n ... and {len(all_predictions) - 10} more predictions")

    # Calculate KSI
    print(f"\n{'='*70}")
    print(f"CALCULATING KSI METRICS")
    print(f"{'='*70}")

    # KSI and coaching are optional; predictions were already printed above.
    try:
        templates = load_expert_templates(params)
    except FileNotFoundError:
        print("⚠️ Expert templates not found (data/expert_templates.npz).")
        print(" Skipping KSI metrics and coaching report.")
        print(" Run 'dvc repro generate_templates' to output templates.")
        return

    ksi_calc = EnhancedKSI()

    # Get expert template for consensus class
    template_key = best_class
    if template_key not in templates.files:
        # Fall back to the first stored variant of this shot class.
        template_key = f'{best_class}_variant1'
        if template_key not in templates.files:
            print(f"⚠️ Template not found for {best_class}")
            return

    expert_template = templates[template_key]
    # Templates may be stored flat as (T, 99); reshape to (T, 33, 3) landmarks.
    if expert_template.ndim == 2 and expert_template.shape[1] == 99:
        expert_lm = expert_template.reshape(-1, 33, 3)
    else:
        expert_lm = expert_template

    # Calculate KSI for each window and average
    ksi_scores = []
    for i, user_lm in enumerate(all_landmarks):
        result = ksi_calc.calculate(
            expert_landmarks=expert_lm,
            user_landmarks=user_lm,
            weights=ksi_cfg['weights'],
        )
        ksi_scores.append(result)

    # Use average KSI result
    avg_ksi_total = np.mean([r.ksi_total for r in ksi_scores])
    avg_ksi_weighted = np.mean([r.ksi_weighted for r in ksi_scores])

    # Prefer contact window if valid, otherwise highest KSI
    contact_idx = contact_info['contact_window'] if ksi_scores else 0
    if ksi_scores and np.isfinite(ksi_scores[contact_idx].ksi_total) and ksi_scores[contact_idx].ksi_total > 0:
        result = ksi_scores[contact_idx]
        chosen_idx = contact_idx
        chosen_reason = "contact window"
    else:
        # Contact window invalid (non-finite or zero KSI): fall back to best window.
        best_idx = int(np.argmax([r.ksi_total for r in ksi_scores])) if ksi_scores else 0
        result = ksi_scores[best_idx]
        chosen_idx = best_idx
        chosen_reason = "highest KSI"

    print(f"πŸ“Š KSI Analysis ({len(ksi_scores)} windows):")
    print(f" Average KSI Total: {avg_ksi_total:.3f}")
    print(f" Average KSI Weighted: {avg_ksi_weighted:.3f}")
    print(f" Using window #{chosen_idx + 1} ({chosen_reason}) for report")
    print(f"\n Selected KSI Score: {result.ksi_total:.3f}")
    print(f" KSI Weighted: {result.ksi_weighted:.3f}")
    print(f" Phase scores: {result.phase_scores}")
    print(f" Component scores: {result.components}")

    # Generate coaching report
    if generate_report and NLP_AVAILABLE:
        print(f"\n{'='*70}")
        print(f"GENERATING COACHING REPORT")
        print(f"{'='*70}")
        os.makedirs("coaching_reports", exist_ok=True)
        report = generate_coaching_report(
            ksi_result=result,
            shot_type_str=best_class,
            skill_level_str=nlp_skill_level,
            output_format='text',
            simplified=True
        )
        report_filename = f"coaching_reports/{best_class}_video_ksi{result.ksi_total:.3f}_report.txt"
        with open(report_filename, 'w') as f:
            f.write(report)
        print(f"βœ… Report saved: {report_filename}")
        print(f"\n{'='*70}")
        print("πŸ“„ COACHING REPORT PREVIEW")
        print(f"{'='*70}")
        print(report)

    print(f"\n{'='*70}")
    print("✨ EVALUATION COMPLETE")
    print(f"{'='*70}")
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Evaluate a single video with KSI metrics and coaching")
    parser.add_argument("video", type=str, help="Video file path or webcam index (0, 1, etc)")
    parser.add_argument("--type", choices=['pose', 'hybrid'], default='hybrid', help="Pipeline type (default: hybrid)")
    parser.add_argument("--model", type=str, default="models/tcn_hybrid_tuned.h5", help="Model path")
    parser.add_argument("--skill", type=str, default='intermediate',
                        choices=['beginner', 'intermediate', 'advanced', 'expert'],
                        help="Skill level for coaching (default: intermediate)")
    parser.add_argument("--no-report", action='store_true', help="Skip coaching report generation")
    # NOTE: --gpu is actually consumed at import time (sys.argv check at the
    # top of this file) to set device env vars before TensorFlow loads; it is
    # declared here so argparse accepts it and documents it in --help.
    parser.add_argument("--gpu", action='store_true', help="Use GPU for inference (faster but less deterministic)")
    args = parser.parse_args()

    evaluate_video(
        video_source=args.video,
        model_path=args.model,
        pipeline_type=args.type,
        nlp_skill_level=args.skill,
        generate_report=not args.no_report
    )