# smashfix-v1 / src/evaluate.py
# uncertainrods's picture
# v1-try-deploy
# 0d0412d
"""
Model Evaluation Pipeline with KSI v2.0 Metrics
================================================
Comprehensive evaluation script for trained badminton shot classification models.
Implements enhanced Kinematic Similarity Index (KSI) v2.0 for biomechanical
analysis and optional natural language coaching feedback generation.
Key Features:
- Classification accuracy and confusion matrix analysis
- KSI v2.0 biomechanical comparison against expert templates
- Phase-aware scoring (preparation, loading, contact, follow-through)
- Per-joint error analysis with confidence intervals
- Velocity and acceleration derivative metrics
- Optional NLP coaching report generation
- Full MLflow experiment tracking integration
Evaluation Metrics:
1. Classification Metrics
- Test accuracy, precision, recall, F1
- Confusion matrix visualization
2. KSI v2.0 Metrics
- Total KSI score (0-100, higher = better match)
- Component breakdown (pose, velocity, acceleration)
- Phase-specific scores
- Per-joint error analysis
- Ranking hinge score for class separation
Pipeline Position:
train_pose.py / train_hybrid.py → [evaluate.py] → reports/
Loads trained model and test data, performs comprehensive evaluation,
and logs all metrics to MLflow for experiment tracking.
Dependencies:
External: tensorflow, sklearn, matplotlib, seaborn, mlflow, numpy, yaml
Internal: ksi_v2.EnhancedKSI, mlflow_utils, natural_language_coach
Configuration (params.yaml):
pose_pipeline / hybrid_pipeline:
model_path: Path to trained model
data_path: Path to evaluation data
ksi:
weights: Component weighting for KSI calculation
Usage:
python evaluate.py pose # Evaluate pose model
python evaluate.py hybrid --nlp # Evaluate hybrid with NLP feedback
Author: IPD Research Team
Version: 2.0.0
"""
"""
# --- DETERMINISM FIXES (MUST BE BEFORE TF IMPORT) ---
import os
import sys
# Check for GPU flag early (before TF imports)
_use_gpu = '--gpu' in sys.argv
if not _use_gpu:
# Force CPU mode for deterministic predictions
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
os.environ['MEDIAPIPE_DISABLE_GPU'] = '1'
print("🔒 Running in CPU mode for deterministic predictions (use --gpu to enable GPU)")
os.environ['TF_DETERMINISTIC_OPS'] = '1'
os.environ['TF_CUDNN_DETERMINISTIC'] = '1'
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
"""
# --- DETERMINISM FIXES (MUST BE BEFORE TF IMPORT) ---
import os
import sys
# Check for GPU flag early (before TF imports)
_use_gpu = '--gpu' in sys.argv
if not _use_gpu:
# Force CPU mode for deterministic predictions
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
os.environ['MEDIAPIPE_DISABLE_GPU'] = '1'
print("🔒 Running in CPU mode for deterministic predictions (use --gpu to enable GPU)")
os.environ['TF_DETERMINISTIC_OPS'] = '1'
os.environ['TF_CUDNN_DETERMINISTIC'] = '1'
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
import argparse
import os
import yaml
import numpy as np
import json
import mlflow
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow.keras.models import load_model
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from typing import Dict
from ksi_v2 import EnhancedKSI, ShotPhase
from mlflow_utils import MLflowRunManager
try:
from natural_language_coach import generate_coaching_report
NLP_AVAILABLE = True
except ImportError:
NLP_AVAILABLE = False
print("⚠️ natural_language_coach not available. NLP feedback will be skipped.")
def evaluate(pipeline_type: str, model_path: str = None, generate_nlp_feedback: bool = False,
             nlp_skill_level: str = 'intermediate', max_nlp_samples: int = 5, auto_run_name: bool = False,
             data_path: str = None):
    """
    Evaluate a trained shot-classification model with enhanced KSI v2 metrics.

    Loads the held-out split of the preprocessed dataset, computes
    classification accuracy/loss and a confusion matrix, compares user motion
    against expert templates via KSI v2 (phase scores, confidence intervals,
    ranking hinge, component breakdowns), logs everything to MLflow, writes
    plots and a DVC metrics JSON under dvclive/, and optionally emits
    natural-language coaching reports for the top correctly-predicted samples.

    Args:
        pipeline_type: 'pose' or 'hybrid'
        model_path: Optional path to model file (overrides params.yaml)
        generate_nlp_feedback: Whether to generate natural language coaching reports
        nlp_skill_level: Skill level for NLP coach ('beginner', 'intermediate', 'advanced', 'expert')
        max_nlp_samples: Maximum number of samples to generate detailed feedback for
        auto_run_name: If True, auto-generate MLflow run name without prompting
        data_path: Optional path to evaluation data (overrides params.yaml)

    Returns:
        None. Side effects: MLflow params/metrics/artifacts, files under
        dvclive/ and coaching_reports/, console progress output. Returns
        early (after printing a message) when the data path is missing or
        contains no samples.
    """
    with open("params.yaml") as f:
        params = yaml.safe_load(f)
    cfg = params[f'{pipeline_type}_pipeline']
    # Fall back to the documented default component weighting if params.yaml
    # has no 'ksi' section.
    ksi_cfg = params.get('ksi', {'weights': {'pose': 0.5, 'velocity': 0.3, 'acceleration': 0.2}})
    # Use provided model path or default from config
    if model_path:
        cfg['model_path'] = model_path
        run_suffix = f" (custom: {os.path.basename(model_path)})"
    else:
        run_suffix = ""
    # Use provided data path or default from config
    if data_path:
        cfg['data_path'] = data_path
    # Setup MLflow with interactive run manager
    exp_name = "Pose_LSTM_Experiment" if pipeline_type == 'pose' else "Hybrid_TCN_Experiment"
    run_manager = MLflowRunManager(exp_name)
    with run_manager.start_interactive_run(
        default_description=f"Evaluation of {pipeline_type} pipeline with KSI v2.0 metrics{run_suffix}",
        auto_name=auto_run_name
    ):
        # Log evaluation configuration
        mlflow.log_param("evaluation.model_path", cfg['model_path'])
        mlflow.log_param("evaluation.data_path", cfg['data_path'])
        mlflow.log_param("evaluation.pipeline_type", pipeline_type)
        # Display what we're evaluating on
        print(f"\n{'='*70}")
        print(f"📊 EVALUATION CONFIGURATION")
        print(f"{'='*70}")
        print(f"Pipeline: {pipeline_type}")
        print(f"Model: {cfg['model_path']}")
        print(f"Data: {cfg['data_path']}")
        print(f"{'='*70}\n")
        # 1. Load Data -- one sub-directory per class, samples stored as .npz
        X, y = [], []
        raw_landmarks = []  # per-sample raw landmarks for KSI (None when absent)
        if not os.path.exists(cfg['data_path']):
            print(f"Data path {cfg['data_path']} not found")
            return
        classes = sorted(os.listdir(cfg['data_path']))
        for i, cls in enumerate(classes):
            path = os.path.join(cfg['data_path'], cls)
            if not os.path.isdir(path):
                continue
            for f in os.listdir(path):
                if f.endswith('.npz'):
                    data = np.load(os.path.join(path, f))
                    X.append(data['features'])
                    y.append(i)
                    # Older preprocessed data lacks 'raw_landmarks'; keep a
                    # placeholder so indices stay aligned with X/y.
                    if 'raw_landmarks' in data:
                        raw_landmarks.append(data['raw_landmarks'])
                    else:
                        raw_landmarks.append(None)
        if not X:
            print("No data loaded")
            return
        X, y_cat = np.array(X), to_categorical(y, len(classes))
        # Split data and raw landmarks together so test indices stay aligned.
        # NOTE(review): random_state=42 presumably matches the training split
        # so X_test is truly held out -- confirm against train_pose/train_hybrid.
        if raw_landmarks and any(lm is not None for lm in raw_landmarks):
            _, X_test, _, y_test, _, raw_test = train_test_split(
                X, y_cat, raw_landmarks, test_size=0.2, stratify=y, random_state=42
            )
            has_raw_landmarks = True
        else:
            _, X_test, _, y_test = train_test_split(X, y_cat, test_size=0.2, stratify=y, random_state=42)
            raw_test = None
            has_raw_landmarks = False
        # 2. Load Model & Predict
        model = load_model(cfg['model_path'])
        if pipeline_type == 'hybrid':
            # Hybrid features concatenate [pose | CNN] along the last axis;
            # the model takes the two streams as separate inputs.
            cnn_dim = cfg['cnn_feature_dim']
            X_pose, X_cnn = X_test[..., :-cnn_dim], X_test[..., -cnn_dim:]
            loss, acc = model.evaluate([X_cnn, X_pose], y_test, verbose=0)
            y_pred = np.argmax(model.predict([X_cnn, X_pose]), axis=1)
            sample_pose = X_pose
        else:
            loss, acc = model.evaluate(X_test, y_test, verbose=0)
            y_pred = np.argmax(model.predict(X_test), axis=1)
            sample_pose = X_test
        # 3. Enhanced KSI v2 Calculation with contact-centered windowing.
        # KSI works with the hybrid pipeline only when raw landmarks exist.
        template_path = params['expert_pipeline']['output_path']
        ksi_results = {
            'avg_ksi_total': 0.0,
            'avg_ksi_weighted': 0.0,
            'avg_confidence_ci_width': 0.0,
            'avg_uncertainty_scalar': 0.0,
            'phase_scores': {},
            'component_scores': {},
            'reliable_count': 0,
            'total_samples': 0
        }
        # Check if we can do KSI evaluation
        can_do_ksi = os.path.exists(template_path) and (
            pipeline_type == 'pose' or (pipeline_type == 'hybrid' and has_raw_landmarks)
        )
        if can_do_ksi:
            print(f"Expert templates found at {template_path}. Computing KSI metrics...")
            templates = np.load(template_path, allow_pickle=True)
            # Check template format - detect if raw landmarks or enhanced features
            sample_template_key = next((k for k in templates.files if not k.startswith('_')), None)
            if sample_template_key is None:
                print(f"⚠ No valid templates found in {template_path}")
                can_do_ksi = False
            else:
                sample_template = templates[sample_template_key]
                # Raw landmarks: (T, 33, 3) or (T, 99) flattened
                # Enhanced features: (T, 32)
                template_is_raw_landmarks = (
                    sample_template.ndim == 3 and sample_template.shape[1:] == (33, 3)
                ) or (
                    sample_template.ndim == 2 and sample_template.shape[1] == 99
                )
                template_is_enhanced = sample_template.ndim == 2 and sample_template.shape[1] == 32
                if template_is_enhanced:
                    print(f"⚠ Templates are in 32-feature format (KSI v2 features), not raw landmarks.")
                    print(f" KSI calculation requires raw landmarks (T, 33, 3). Regenerate templates with raw landmarks.")
                    can_do_ksi = False
                elif not template_is_raw_landmarks:
                    print(f"⚠ Unknown template format: shape={sample_template.shape}. Expected (T, 33, 3) or (T, 99).")
                    can_do_ksi = False
                else:
                    print(f" Template format: {'(T, 33, 3)' if sample_template.ndim == 3 else '(T, 99)'} - OK")
            # Re-check: format validation above may have disabled KSI.
            if can_do_ksi:
                # Initialize enhanced KSI calculator
                ksi_calc = EnhancedKSI(
                    fps=params.get('fps', 30.0),
                    contact_window_pre_frames=18,
                    contact_window_post_frames=18,
                    bootstrap_min=50,
                    bootstrap_max=200,
                    ranking_margin=0.05
                )
                ksi_totals = []
                ksi_weighted_list = []
                ci_widths = []
                uncertainty_scalars = []
                phase_score_accumulator = {p.value: [] for p in ShotPhase}
                component_accumulator = {'pose': [], 'velocity': [], 'acceleration': [], 'jerk': []}
                reliable_count = 0
                # Store individual results for NLP feedback
                individual_results = [] if generate_nlp_feedback else None
                # Sample for speed (evaluate up to 50 items)
                n_samples = min(50, len(sample_pose))
                evaluated_count = 0  # Track actually evaluated samples
                skipped_templates = set()  # Track missing templates to report once
                for i in range(n_samples):
                    cls = classes[np.argmax(y_test[i])]
                    # Try main template, then variants, prioritizing main
                    template_key = None
                    if cls in templates:
                        template_key = cls
                    elif f'{cls}_variant1' in templates:
                        # If only variants exist, use variant1 (best quality)
                        template_key = f'{cls}_variant1'
                    else:
                        # Try any key containing the class name
                        for key in templates.files:
                            if cls in key and not key.startswith('_'):
                                template_key = key
                                break
                    if template_key is None:
                        skipped_templates.add(cls)
                        continue
                    # Get user landmarks - use raw_landmarks if available (hybrid), else reshape pose features
                    if pipeline_type == 'hybrid' and raw_test is not None and raw_test[i] is not None:
                        user_lm = raw_test[i]  # Already (T, 33, 3)
                    else:
                        # Pose pipeline: reshape from flattened features (T, 99) -> (T, 33, 3)
                        try:
                            user_lm = sample_pose[i].reshape(-1, 33, 3)
                        except ValueError:
                            print(f"⚠ Cannot reshape pose features to landmarks: {sample_pose[i].shape} -> (T, 33, 3)")
                            continue
                    # Load expert template and reshape if needed
                    expert_template = templates[template_key]
                    try:
                        if expert_template.ndim == 3 and expert_template.shape[1:] == (33, 3):
                            expert_lm = expert_template  # Already (T, 33, 3)
                        elif expert_template.ndim == 2 and expert_template.shape[1] == 99:
                            expert_lm = expert_template.reshape(-1, 33, 3)  # (T, 99) -> (T, 33, 3)
                        else:
                            print(f"⚠ Cannot convert template '{template_key}' shape {expert_template.shape} to landmarks")
                            continue
                    except ValueError as e:
                        print(f"⚠ Template reshape failed for '{template_key}': {e}")
                        continue
                    # Calculate enhanced KSI
                    result = ksi_calc.calculate(
                        expert_landmarks=expert_lm,
                        user_landmarks=user_lm,
                        weights=ksi_cfg['weights'],
                        baseline_ksi=None  # Could pass previous user score for ranking hinge
                    )
                    ksi_totals.append(result.ksi_total)
                    ksi_weighted_list.append(result.ksi_weighted)
                    # Confidence metrics
                    if result.confidence:
                        ci_width = result.confidence.get('ci_95_upper', 0) - result.confidence.get('ci_95_lower', 0)
                        ci_widths.append(ci_width)
                        uncertainty_scalars.append(result.confidence.get('uncertainty_scalar', 0))
                        if result.confidence.get('reliable', False):
                            reliable_count += 1
                    # Phase scores
                    for phase, score in result.phase_scores.items():
                        if phase in phase_score_accumulator:
                            phase_score_accumulator[phase].append(score)
                    # Component scores
                    for comp in ['pose', 'velocity', 'acceleration', 'jerk']:
                        if comp in result.components:
                            component_accumulator[comp].append(result.components[comp])
                    # Store individual result for NLP feedback (only if prediction is correct)
                    if generate_nlp_feedback:
                        predicted_cls = classes[y_pred[i]] if i < len(y_pred) else None
                        if predicted_cls == cls:  # Only store correctly predicted samples
                            individual_results.append({
                                'sample_idx': i,
                                'class': cls,
                                'predicted_class': predicted_cls,
                                'ksi_result': result,
                                'ksi_total': result.ksi_total
                            })
                    evaluated_count += 1
                # Report skipped templates
                if skipped_templates:
                    print(f" ⚠ Missing templates for classes: {sorted(skipped_templates)}")
                # Aggregate results
                ksi_results['avg_ksi_total'] = float(np.mean(ksi_totals)) if ksi_totals else 0.0
                ksi_results['avg_ksi_weighted'] = float(np.mean(ksi_weighted_list)) if ksi_weighted_list else 0.0
                ksi_results['avg_confidence_ci_width'] = float(np.mean(ci_widths)) if ci_widths else 0.0
                ksi_results['avg_uncertainty_scalar'] = float(np.mean(uncertainty_scalars)) if uncertainty_scalars else 0.0
                ksi_results['reliable_count'] = reliable_count
                ksi_results['total_samples'] = evaluated_count  # Use actual evaluated count, not attempted
                for phase, scores in phase_score_accumulator.items():
                    if scores:
                        ksi_results['phase_scores'][phase] = float(np.mean(scores))
                for comp, scores in component_accumulator.items():
                    if scores:
                        ksi_results['component_scores'][comp] = float(np.mean(scores))
                # Generate natural language coaching reports
                if generate_nlp_feedback and individual_results and NLP_AVAILABLE:
                    print(f"\n{'='*70}")
                    print(f"GENERATING NATURAL LANGUAGE COACHING FEEDBACK")
                    print(f"{'='*70}")
                    print(f" Found {len(individual_results)} correctly predicted samples")
                    # DEBUG: Show KSI scores distribution
                    ksi_scores = [r['ksi_total'] for r in individual_results]
                    print(f" KSI scores range: {min(ksi_scores):.3f} - {max(ksi_scores):.3f}")
                    os.makedirs("coaching_reports", exist_ok=True)
                    # FIX: honor max_nlp_samples -- report on the top-N samples
                    # by KSI score. Previously only the single best sample was
                    # reported and the max_nlp_samples argument (and the
                    # --nlp-samples CLI flag) was silently ignored.
                    individual_results.sort(key=lambda x: x['ksi_total'], reverse=True)
                    samples_to_generate = individual_results[:max(1, max_nlp_samples)]
                    for sample in samples_to_generate:
                        cls = sample['class']
                        ksi_result = sample['ksi_result']
                        print(f"\n📝 Generating coaching report for: {cls} (KSI: {ksi_result.ksi_total:.3f})")
                        try:
                            # Generate simplified coaching report
                            report = generate_coaching_report(
                                ksi_result=ksi_result,
                                shot_type_str=cls,
                                skill_level_str=nlp_skill_level,
                                user_name=None,
                                output_format='text',
                                simplified=True  # Remove weekly plan, shorten output
                            )
                            # Save report
                            report_filename = f"coaching_reports/{cls}_ksi{ksi_result.ksi_total:.3f}_report.txt"
                            with open(report_filename, 'w') as f:
                                f.write(report)
                            print(f" ✅ Saved: {report_filename}")
                            # Also save JSON version
                            json_report = generate_coaching_report(
                                ksi_result=ksi_result,
                                shot_type_str=cls,
                                skill_level_str=nlp_skill_level,
                                output_format='json',
                                simplified=True
                            )
                            json_filename = f"coaching_reports/{cls}_ksi{ksi_result.ksi_total:.3f}_report.json"
                            with open(json_filename, 'w') as f:
                                f.write(json_report)
                            # Log to MLflow
                            mlflow.log_artifact(report_filename)
                            print(f" 📊 Logged to MLflow")
                        except Exception as e:
                            # Best-effort: a failed report must not abort evaluation.
                            print(f" ⚠️ Failed to generate report: {e}")
                            import traceback
                            traceback.print_exc()
                    print(f"\n✅ Generated {len(samples_to_generate)} coaching report(s) in coaching_reports/")
                    print(f"{'='*70}\n")
        elif pipeline_type == 'hybrid' and not has_raw_landmarks:
            print(f"⚠️ Raw landmarks not found in hybrid data. Run preprocessing again to enable KSI evaluation.")
            print(f" (Old data format detected - missing 'raw_landmarks' in .npz files)")
        else:
            print(f"⚠ Expert templates not found at {template_path}. Skipping KSI metrics.")
        # 4. Logging to MLflow
        mlflow.log_metric("test_accuracy", acc)
        mlflow.log_metric("test_loss", loss)
        mlflow.log_metric("ksi_total", ksi_results['avg_ksi_total'])
        mlflow.log_metric("ksi_weighted", ksi_results['avg_ksi_weighted'])
        mlflow.log_metric("ksi_ci_width", ksi_results['avg_confidence_ci_width'])
        mlflow.log_metric("ksi_uncertainty", ksi_results['avg_uncertainty_scalar'])
        mlflow.log_metric("ksi_reliable_ratio",
                          ksi_results['reliable_count'] / max(1, ksi_results['total_samples']))
        # Log phase scores
        for phase, score in ksi_results['phase_scores'].items():
            mlflow.log_metric(f"phase_{phase}", score)
        # Log component scores
        for comp, score in ksi_results['component_scores'].items():
            mlflow.log_metric(f"ksi_{comp}", score)
        # 5. Confusion Matrix
        cm = confusion_matrix(np.argmax(y_test, axis=1), y_pred)
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', xticklabels=classes, yticklabels=classes, cmap='Blues')
        plt.title(f"Confusion Matrix - {pipeline_type.upper()}")
        os.makedirs("dvclive", exist_ok=True)
        # Use production filename for tuned models, pipeline-specific otherwise
        cm_filename = "production_confusion_matrix.png" if "tuned" in cfg['model_path'] else f"{pipeline_type}_confusion_matrix.png"
        cm_path = os.path.join("dvclive", cm_filename)
        plt.savefig(cm_path)
        plt.close()
        mlflow.log_artifact(cm_path)
        # 6. KSI Analysis Plot (only when KSI metrics were computed)
        if ksi_results['phase_scores'] or ksi_results['component_scores']:
            fig, axes = plt.subplots(1, 2, figsize=(12, 5))
            # Phase scores
            if ksi_results['phase_scores']:
                phases = list(ksi_results['phase_scores'].keys())
                p_scores = list(ksi_results['phase_scores'].values())
                axes[0].barh(phases, p_scores, color=['gray', 'blue', 'orange', 'red', 'green'][:len(phases)])
                axes[0].set_xlim([0, 1])
                axes[0].set_xlabel('Average Score')
                axes[0].set_title('Phase Scores (Avg)')
            # Component scores
            if ksi_results['component_scores']:
                comps = list(ksi_results['component_scores'].keys())
                c_scores = list(ksi_results['component_scores'].values())
                axes[1].bar(comps, c_scores, color=['steelblue', 'coral', 'seagreen', 'orchid'][:len(comps)])
                axes[1].set_ylim([0, 1])
                axes[1].set_ylabel('Average Score')
                axes[1].set_title('Component Scores (Avg)')
            plt.suptitle(f"KSI v2 Analysis - {pipeline_type.upper()} | "
                         f"KSI: {ksi_results['avg_ksi_total']:.3f} ± {ksi_results['avg_confidence_ci_width']:.3f}")
            plt.tight_layout()
            ksi_plot_path = f"dvclive/{pipeline_type}_ksi_analysis.png"
            plt.savefig(ksi_plot_path)
            plt.close()
            mlflow.log_artifact(ksi_plot_path)
        # 7. Save Metrics for DVC
        metrics = {
            "accuracy": float(acc),
            "loss": float(loss),
            "ksi_total": ksi_results['avg_ksi_total'],
            "ksi_weighted": ksi_results['avg_ksi_weighted'],
            "ksi_ci_width": ksi_results['avg_confidence_ci_width'],
            "ksi_uncertainty": ksi_results['avg_uncertainty_scalar'],
            "ksi_reliable_ratio": ksi_results['reliable_count'] / max(1, ksi_results['total_samples']),
            "phase_scores": ksi_results['phase_scores'],
            "component_scores": ksi_results['component_scores']
        }
        # Determine metrics filename based on model path
        metrics_filename = "production_metrics.json" if "tuned" in cfg['model_path'] else f"{pipeline_type}_metrics.json"
        metrics_path = os.path.join("dvclive", metrics_filename)
        with open(metrics_path, "w") as f:
            json.dump(metrics, f, indent=2)
        # Console summary
        print(f"\n{'='*60}")
        print(f"{pipeline_type.upper()} EVALUATION (KSI v2)")
        print(f"{'='*60}")
        print(f"Accuracy: {acc:.4f} | Loss: {loss:.4f}")
        print(f"KSI Total: {ksi_results['avg_ksi_total']:.4f}")
        print(f"KSI Weighted: {ksi_results['avg_ksi_weighted']:.4f}")
        print(f"CI Width: {ksi_results['avg_confidence_ci_width']:.4f}")
        print(f"Uncertainty: {ksi_results['avg_uncertainty_scalar']:.4f}")
        print(f"Reliable: {ksi_results['reliable_count']}/{ksi_results['total_samples']}")
        print(f"Phase scores: {ksi_results['phase_scores']}")
        print(f"Component scores: {ksi_results['component_scores']}")
        print(f"{'='*60}\n")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--type", choices=['pose', 'hybrid'], required=True)
parser.add_argument("--model", type=str, help="Optional: custom model path (overrides params.yaml)")
parser.add_argument("--data", type=str, help="Optional: custom data path for evaluation (overrides params.yaml)")
parser.add_argument("--nlp", action='store_true', help="Generate natural language coaching reports")
parser.add_argument("--nlp-skill", type=str, default='intermediate',
choices=['beginner', 'intermediate', 'advanced', 'expert'],
help="Skill level for natural language feedback (default: intermediate)")
parser.add_argument("--nlp-samples", type=int, default=5,
help="Max number of samples to generate detailed feedback for (default: 5)")
parser.add_argument("--auto-name", action='store_true',
help="Auto-generate MLflow run name without prompting")
parser.add_argument("--gpu", action='store_true', help="Use GPU for inference (faster but less deterministic)")
args = parser.parse_args()
evaluate(args.type, args.model, args.nlp, args.nlp_skill, args.nlp_samples, args.auto_name, args.data)