""" Model Evaluation Pipeline with KSI v2.0 Metrics ================================================ Comprehensive evaluation script for trained badminton shot classification models. Implements enhanced Kinematic Similarity Index (KSI) v2.0 for biomechanical analysis and optional natural language coaching feedback generation. Key Features: - Classification accuracy and confusion matrix analysis - KSI v2.0 biomechanical comparison against expert templates - Phase-aware scoring (preparation, loading, contact, follow-through) - Per-joint error analysis with confidence intervals - Velocity and acceleration derivative metrics - Optional NLP coaching report generation - Full MLflow experiment tracking integration Evaluation Metrics: 1. Classification Metrics - Test accuracy, precision, recall, F1 - Confusion matrix visualization 2. KSI v2.0 Metrics - Total KSI score (0-100, higher = better match) - Component breakdown (pose, velocity, acceleration) - Phase-specific scores - Per-joint error analysis - Ranking hinge score for class separation Pipeline Position: train_pose.py / train_hybrid.py → [evaluate.py] → reports/ Loads trained model and test data, performs comprehensive evaluation, and logs all metrics to MLflow for experiment tracking. 
Dependencies: External: tensorflow, sklearn, matplotlib, seaborn, mlflow, numpy, yaml Internal: ksi_v2.EnhancedKSI, mlflow_utils, natural_language_coach Configuration (params.yaml): pose_pipeline / hybrid_pipeline: model_path: Path to trained model data_path: Path to evaluation data ksi: weights: Component weighting for KSI calculation Usage: python evaluate.py pose # Evaluate pose model python evaluate.py hybrid --nlp # Evaluate hybrid with NLP feedback Author: IPD Research Team Version: 2.0.0 """ """ # --- DETERMINISM FIXES (MUST BE BEFORE TF IMPORT) --- import os import sys # Check for GPU flag early (before TF imports) _use_gpu = '--gpu' in sys.argv if not _use_gpu: # Force CPU mode for deterministic predictions os.environ['CUDA_VISIBLE_DEVICES'] = '-1' os.environ['MEDIAPIPE_DISABLE_GPU'] = '1' print("šŸ”’ Running in CPU mode for deterministic predictions (use --gpu to enable GPU)") os.environ['TF_DETERMINISTIC_OPS'] = '1' os.environ['TF_CUDNN_DETERMINISTIC'] = '1' os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0' """ # --- DETERMINISM FIXES (MUST BE BEFORE TF IMPORT) --- import os import sys # Check for GPU flag early (before TF imports) _use_gpu = '--gpu' in sys.argv if not _use_gpu: # Force CPU mode for deterministic predictions os.environ['CUDA_VISIBLE_DEVICES'] = '-1' os.environ['MEDIAPIPE_DISABLE_GPU'] = '1' print("šŸ”’ Running in CPU mode for deterministic predictions (use --gpu to enable GPU)") os.environ['TF_DETERMINISTIC_OPS'] = '1' os.environ['TF_CUDNN_DETERMINISTIC'] = '1' os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0' import argparse import os import yaml import numpy as np import json import mlflow import matplotlib.pyplot as plt import seaborn as sns from tensorflow.keras.models import load_model from tensorflow.keras.utils import to_categorical from sklearn.model_selection import train_test_split from sklearn.metrics import confusion_matrix from typing import Dict from ksi_v2 import EnhancedKSI, ShotPhase from mlflow_utils import MLflowRunManager try: from 
try:
    from natural_language_coach import generate_coaching_report
    NLP_AVAILABLE = True
except ImportError:
    # NLP coaching is optional: degrade gracefully when the module is absent.
    NLP_AVAILABLE = False
    print("āš ļø natural_language_coach not available. NLP feedback will be skipped.")


def evaluate(pipeline_type: str, model_path: str = None, generate_nlp_feedback: bool = False,
             nlp_skill_level: str = 'intermediate', max_nlp_samples: int = 5,
             auto_run_name: bool = False, data_path: str = None) -> None:
    """
    Evaluate model with enhanced KSI v2 metrics.

    Logs phase scores, confidence intervals, ranking hinge, and component
    breakdowns to MLflow, writes a confusion-matrix image and a metrics JSON
    under ``dvclive/``, and optionally emits coaching reports under
    ``coaching_reports/``.

    Args:
        pipeline_type: 'pose' or 'hybrid'
        model_path: Optional path to model file (overrides params.yaml)
        generate_nlp_feedback: Whether to generate natural language coaching reports
        nlp_skill_level: Skill level for NLP coach ('beginner', 'intermediate', 'advanced', 'expert')
        max_nlp_samples: Maximum number of samples to generate detailed feedback for.
            NOTE(review): currently unused — only the single best-KSI sample gets a report.
        auto_run_name: If True, auto-generate MLflow run name without prompting
        data_path: Optional path to evaluation data (overrides params.yaml)
    """
    with open("params.yaml") as f:
        params = yaml.safe_load(f)
    cfg = params[f'{pipeline_type}_pipeline']
    # Fallback weights are used only when params.yaml has no 'ksi' section.
    ksi_cfg = params.get('ksi', {'weights': {'pose': 0.5, 'velocity': 0.3, 'acceleration': 0.2}})

    # Use provided model path or default from config
    if model_path:
        cfg['model_path'] = model_path
        run_suffix = f" (custom: {os.path.basename(model_path)})"
    else:
        run_suffix = ""

    # Use provided data path or default from config
    if data_path:
        cfg['data_path'] = data_path

    # Setup MLflow with interactive run manager
    exp_name = "Pose_LSTM_Experiment" if pipeline_type == 'pose' else "Hybrid_TCN_Experiment"
    run_manager = MLflowRunManager(exp_name)

    with run_manager.start_interactive_run(
        default_description=f"Evaluation of {pipeline_type} pipeline with KSI v2.0 metrics{run_suffix}",
        auto_name=auto_run_name
    ):
        # Log evaluation configuration
        mlflow.log_param("evaluation.model_path", cfg['model_path'])
        mlflow.log_param("evaluation.data_path", cfg['data_path'])
        mlflow.log_param("evaluation.pipeline_type", pipeline_type)

        # Display what we're evaluating on
        print(f"\n{'='*70}")
        print(f"šŸ“Š EVALUATION CONFIGURATION")
        print(f"{'='*70}")
        print(f"Pipeline: {pipeline_type}")
        print(f"Model: {cfg['model_path']}")
        print(f"Data: {cfg['data_path']}")
        print(f"{'='*70}\n")

        # 1. Load Data
        # Directory layout assumed: data_path/<class_name>/*.npz — TODO confirm.
        X, y = [], []
        raw_landmarks = []  # NEW: store raw landmarks for KSI
        if not os.path.exists(cfg['data_path']):
            print(f"Data path {cfg['data_path']} not found")
            return
        # NOTE(review): non-directory entries are skipped below but still occupy
        # a class index here (and appear in confusion-matrix tick labels).
        classes = sorted(os.listdir(cfg['data_path']))
        for i, cls in enumerate(classes):
            path = os.path.join(cfg['data_path'], cls)
            if not os.path.isdir(path):
                continue
            for f in os.listdir(path):
                if f.endswith('.npz'):
                    data = np.load(os.path.join(path, f))
                    X.append(data['features'])
                    y.append(i)
                    # NEW: load raw landmarks if available (for hybrid pipeline KSI)
                    if 'raw_landmarks' in data:
                        raw_landmarks.append(data['raw_landmarks'])
                    else:
                        raw_landmarks.append(None)
        if not X:
            print("No data loaded")
            return
        X, y_cat = np.array(X), to_categorical(y, len(classes))

        # Split data and raw landmarks together so test indices stay aligned
        # across X_test / y_test / raw_test (same random_state as training split).
        if raw_landmarks and any(lm is not None for lm in raw_landmarks):
            _, X_test, _, y_test, _, raw_test = train_test_split(
                X, y_cat, raw_landmarks, test_size=0.2, stratify=y, random_state=42
            )
            has_raw_landmarks = True
        else:
            _, X_test, _, y_test = train_test_split(X, y_cat, test_size=0.2, stratify=y, random_state=42)
            raw_test = None
            has_raw_landmarks = False

        # 2. Load Model & Predict
        model = load_model(cfg['model_path'])
        if pipeline_type == 'hybrid':
            # Hybrid features are [pose | cnn] concatenated on the last axis;
            # the model takes (cnn, pose) as two inputs.
            cnn_dim = cfg['cnn_feature_dim']
            X_pose, X_cnn = X_test[..., :-cnn_dim], X_test[..., -cnn_dim:]
            loss, acc = model.evaluate([X_cnn, X_pose], y_test, verbose=0)
            y_pred = np.argmax(model.predict([X_cnn, X_pose]), axis=1)
            sample_pose = X_pose
        else:
            loss, acc = model.evaluate(X_test, y_test, verbose=0)
            y_pred = np.argmax(model.predict(X_test), axis=1)
            sample_pose = X_test

        # 3. Enhanced KSI v2 Calculation with contact-centered windowing
        # NEW: KSI now works with hybrid pipeline if raw_landmarks are available
        template_path = params['expert_pipeline']['output_path']
        # Defaults reported/logged when KSI cannot be computed.
        ksi_results = {
            'avg_ksi_total': 0.0,
            'avg_ksi_weighted': 0.0,
            'avg_confidence_ci_width': 0.0,
            'avg_uncertainty_scalar': 0.0,
            'phase_scores': {},
            'component_scores': {},
            'reliable_count': 0,
            'total_samples': 0
        }

        # Check if we can do KSI evaluation
        can_do_ksi = os.path.exists(template_path) and (
            pipeline_type == 'pose' or (pipeline_type == 'hybrid' and has_raw_landmarks)
        )

        if can_do_ksi:
            print(f"Expert templates found at {template_path}. Computing KSI metrics...")
            templates = np.load(template_path, allow_pickle=True)

            # Check template format - detect if raw landmarks or enhanced features
            # (keys starting with '_' are treated as metadata, not templates).
            sample_template_key = next((k for k in templates.files if not k.startswith('_')), None)
            if sample_template_key is None:
                print(f"⚠ No valid templates found in {template_path}")
                can_do_ksi = False
            else:
                sample_template = templates[sample_template_key]
                # Raw landmarks: (T, 33, 3) or (T, 99) flattened
                # Enhanced features: (T, 32)
                template_is_raw_landmarks = (
                    sample_template.ndim == 3 and sample_template.shape[1:] == (33, 3)
                ) or (
                    sample_template.ndim == 2 and sample_template.shape[1] == 99
                )
                template_is_enhanced = sample_template.ndim == 2 and sample_template.shape[1] == 32

                if template_is_enhanced:
                    print(f"⚠ Templates are in 32-feature format (KSI v2 features), not raw landmarks.")
                    print(f" KSI calculation requires raw landmarks (T, 33, 3). Regenerate templates with raw landmarks.")
                    can_do_ksi = False
                elif not template_is_raw_landmarks:
                    print(f"⚠ Unknown template format: shape={sample_template.shape}. Expected (T, 33, 3) or (T, 99).")
                    can_do_ksi = False
                else:
                    print(f" Template format: {'(T, 33, 3)' if sample_template.ndim == 3 else '(T, 99)'} - OK")

        # Re-checked: the format probe above may have cleared the flag.
        if can_do_ksi:
            # Initialize enhanced KSI calculator
            ksi_calc = EnhancedKSI(
                fps=params.get('fps', 30.0),
                contact_window_pre_frames=18,
                contact_window_post_frames=18,
                bootstrap_min=50,
                bootstrap_max=200,
                ranking_margin=0.05
            )

            # Per-sample accumulators, averaged after the loop.
            ksi_totals = []
            ksi_weighted_list = []
            ci_widths = []
            uncertainty_scalars = []
            phase_score_accumulator = {p.value: [] for p in ShotPhase}
            component_accumulator = {'pose': [], 'velocity': [], 'acceleration': [], 'jerk': []}
            reliable_count = 0

            # Store individual results for NLP feedback
            individual_results = [] if generate_nlp_feedback else None

            # Sample for speed (evaluate up to 50 items)
            n_samples = min(50, len(sample_pose))
            evaluated_count = 0  # Track actually evaluated samples
            skipped_templates = set()  # Track missing templates to report once

            for i in range(n_samples):
                cls = classes[np.argmax(y_test[i])]

                # NEW: Try main template, then variants, prioritizing main
                template_key = None
                if cls in templates:
                    template_key = cls
                elif f'{cls}_variant1' in templates:
                    # If only variants exist, use variant1 (best quality)
                    template_key = f'{cls}_variant1'
                else:
                    # Try any key containing the class name
                    for key in templates.files:
                        if cls in key and not key.startswith('_'):
                            template_key = key
                            break

                if template_key is None:
                    if cls not in skipped_templates:
                        skipped_templates.add(cls)
                    continue

                # Get user landmarks - use raw_landmarks if available (hybrid), else reshape pose features
                if pipeline_type == 'hybrid' and raw_test is not None and raw_test[i] is not None:
                    user_lm = raw_test[i]  # Already (T, 33, 3)
                else:
                    # Pose pipeline: reshape from flattened features (T, 99) -> (T, 33, 3)
                    # Assumes pose features really are flattened landmark triplets — TODO confirm.
                    try:
                        user_lm = sample_pose[i].reshape(-1, 33, 3)
                    except ValueError as e:
                        print(f"⚠ Cannot reshape pose features to landmarks: {sample_pose[i].shape} -> (T, 33, 3)")
                        continue

                # Load expert template and reshape if needed
                expert_template = templates[template_key]
                try:
                    if expert_template.ndim == 3 and expert_template.shape[1:] == (33, 3):
                        expert_lm = expert_template  # Already (T, 33, 3)
                    elif expert_template.ndim == 2 and expert_template.shape[1] == 99:
                        expert_lm = expert_template.reshape(-1, 33, 3)  # (T, 99) -> (T, 33, 3)
                    else:
                        print(f"⚠ Cannot convert template '{template_key}' shape {expert_template.shape} to landmarks")
                        continue
                except ValueError as e:
                    print(f"⚠ Template reshape failed for '{template_key}': {e}")
                    continue

                # Calculate enhanced KSI
                result = ksi_calc.calculate(
                    expert_landmarks=expert_lm,
                    user_landmarks=user_lm,
                    weights=ksi_cfg['weights'],
                    baseline_ksi=None  # Could pass previous user score for ranking hinge
                )

                ksi_totals.append(result.ksi_total)
                ksi_weighted_list.append(result.ksi_weighted)

                # Confidence metrics
                if result.confidence:
                    ci_width = result.confidence.get('ci_95_upper', 0) - result.confidence.get('ci_95_lower', 0)
                    ci_widths.append(ci_width)
                    uncertainty_scalars.append(result.confidence.get('uncertainty_scalar', 0))
                    if result.confidence.get('reliable', False):
                        reliable_count += 1

                # Phase scores
                for phase, score in result.phase_scores.items():
                    if phase in phase_score_accumulator:
                        phase_score_accumulator[phase].append(score)

                # Component scores
                for comp in ['pose', 'velocity', 'acceleration', 'jerk']:
                    if comp in result.components:
                        component_accumulator[comp].append(result.components[comp])

                # Store individual result for NLP feedback (only if prediction is correct)
                if generate_nlp_feedback:
                    predicted_cls = classes[y_pred[i]] if i < len(y_pred) else None
                    if predicted_cls == cls:  # Only store correctly predicted samples
                        individual_results.append({
                            'sample_idx': i,
                            'class': cls,
                            'predicted_class': predicted_cls,
                            'ksi_result': result,
                            'ksi_total': result.ksi_total
                        })

                evaluated_count += 1

            # Report skipped templates
            if skipped_templates:
                print(f" ⚠ Missing templates for classes: {sorted(skipped_templates)}")

            # Aggregate results (guard against empty lists → 0.0, not NaN)
            ksi_results['avg_ksi_total'] = float(np.mean(ksi_totals)) if ksi_totals else 0.0
            ksi_results['avg_ksi_weighted'] = float(np.mean(ksi_weighted_list)) if ksi_weighted_list else 0.0
            ksi_results['avg_confidence_ci_width'] = float(np.mean(ci_widths)) if ci_widths else 0.0
            ksi_results['avg_uncertainty_scalar'] = float(np.mean(uncertainty_scalars)) if uncertainty_scalars else 0.0
            ksi_results['reliable_count'] = reliable_count
            ksi_results['total_samples'] = evaluated_count  # Use actual evaluated count, not attempted

            for phase, scores in phase_score_accumulator.items():
                if scores:
                    ksi_results['phase_scores'][phase] = float(np.mean(scores))
            for comp, scores in component_accumulator.items():
                if scores:
                    ksi_results['component_scores'][comp] = float(np.mean(scores))

            # Generate natural language coaching reports
            if generate_nlp_feedback and individual_results and NLP_AVAILABLE:
                print(f"\n{'='*70}")
                print(f"GENERATING NATURAL LANGUAGE COACHING FEEDBACK")
                print(f"{'='*70}")
                print(f" Found {len(individual_results)} correctly predicted samples")

                # DEBUG: Show KSI scores distribution
                ksi_scores = [r['ksi_total'] for r in individual_results]
                print(f" KSI scores range: {min(ksi_scores):.3f} - {max(ksi_scores):.3f}")

                os.makedirs("coaching_reports", exist_ok=True)

                # Pick best sample (highest KSI score) - this gives most interesting feedback
                # NOTE(review): max_nlp_samples is ignored here — exactly one report is produced.
                individual_results.sort(key=lambda x: x['ksi_total'], reverse=True)
                best_sample = individual_results[0]
                samples_to_generate = [best_sample]

                sample = samples_to_generate[0]
                cls = sample['class']
                ksi_result = sample['ksi_result']
                sample_idx = sample['sample_idx']

                print(f"\nšŸ“ Generating coaching report for: {cls} (KSI: {ksi_result.ksi_total:.3f})")
                try:
                    # Generate simplified coaching report
                    report = generate_coaching_report(
                        ksi_result=ksi_result,
                        shot_type_str=cls,
                        skill_level_str=nlp_skill_level,
                        user_name=None,
                        output_format='text',
                        simplified=True  # Remove weekly plan, shorten output
                    )

                    # Save report
                    report_filename = f"coaching_reports/{cls}_ksi{ksi_result.ksi_total:.3f}_report.txt"
                    with open(report_filename, 'w') as f:
                        f.write(report)
                    print(f" āœ… Saved: {report_filename}")

                    # Also save JSON version
                    # NOTE(review): only the text report is logged to MLflow below;
                    # the JSON file stays local — confirm whether that is intended.
                    json_report = generate_coaching_report(
                        ksi_result=ksi_result,
                        shot_type_str=cls,
                        skill_level_str=nlp_skill_level,
                        output_format='json',
                        simplified=True
                    )
                    json_filename = f"coaching_reports/{cls}_ksi{ksi_result.ksi_total:.3f}_report.json"
                    with open(json_filename, 'w') as f:
                        f.write(json_report)

                    # Log to MLflow
                    mlflow.log_artifact(report_filename)
                    print(f" šŸ“Š Logged to MLflow")

                except Exception as e:
                    print(f" āš ļø Failed to generate report: {e}")
                    import traceback
                    traceback.print_exc()

                print(f"\nāœ… Generated coaching report in coaching_reports/")
                print(f"{'='*70}\n")

        elif pipeline_type == 'hybrid' and not has_raw_landmarks:
            print(f"āš ļø Raw landmarks not found in hybrid data. Run preprocessing again to enable KSI evaluation.")
            print(f" (Old data format detected - missing 'raw_landmarks' in .npz files)")
        else:
            print(f"⚠ Expert templates not found at {template_path}. Skipping KSI metrics.")

        # 4. Logging to MLflow
        mlflow.log_metric("test_accuracy", acc)
        mlflow.log_metric("test_loss", loss)
        mlflow.log_metric("ksi_total", ksi_results['avg_ksi_total'])
        mlflow.log_metric("ksi_weighted", ksi_results['avg_ksi_weighted'])
        mlflow.log_metric("ksi_ci_width", ksi_results['avg_confidence_ci_width'])
        mlflow.log_metric("ksi_uncertainty", ksi_results['avg_uncertainty_scalar'])
        # max(1, ...) avoids division by zero when no samples were evaluated.
        mlflow.log_metric("ksi_reliable_ratio", ksi_results['reliable_count'] / max(1, ksi_results['total_samples']))

        # Log phase scores
        for phase, score in ksi_results['phase_scores'].items():
            mlflow.log_metric(f"phase_{phase}", score)
        # Log component scores
        for comp, score in ksi_results['component_scores'].items():
            mlflow.log_metric(f"ksi_{comp}", score)

        # 5. Confusion Matrix
        cm = confusion_matrix(np.argmax(y_test, axis=1), y_pred)
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', xticklabels=classes, yticklabels=classes, cmap='Blues')
        plt.title(f"Confusion Matrix - {pipeline_type.upper()}")
        os.makedirs("dvclive", exist_ok=True)
        # Use production filename for tuned models, pipeline-specific otherwise
        cm_filename = "production_confusion_matrix.png" if "tuned" in cfg['model_path'] else f"{pipeline_type}_confusion_matrix.png"
        cm_path = os.path.join("dvclive", cm_filename)
        plt.savefig(cm_path)
        plt.close()
        mlflow.log_artifact(cm_path)

        # 6. KSI Analysis Plot (only when at least one KSI breakdown exists)
        if ksi_results['phase_scores'] or ksi_results['component_scores']:
            fig, axes = plt.subplots(1, 2, figsize=(12, 5))
            # Phase scores
            if ksi_results['phase_scores']:
                phases = list(ksi_results['phase_scores'].keys())
                p_scores = list(ksi_results['phase_scores'].values())
                axes[0].barh(phases, p_scores, color=['gray', 'blue', 'orange', 'red', 'green'][:len(phases)])
                axes[0].set_xlim([0, 1])
                axes[0].set_xlabel('Average Score')
                axes[0].set_title('Phase Scores (Avg)')
            # Component scores
            if ksi_results['component_scores']:
                comps = list(ksi_results['component_scores'].keys())
                c_scores = list(ksi_results['component_scores'].values())
                axes[1].bar(comps, c_scores, color=['steelblue', 'coral', 'seagreen', 'orchid'][:len(comps)])
                axes[1].set_ylim([0, 1])
                axes[1].set_ylabel('Average Score')
                axes[1].set_title('Component Scores (Avg)')
            plt.suptitle(f"KSI v2 Analysis - {pipeline_type.upper()} | "
                         f"KSI: {ksi_results['avg_ksi_total']:.3f} ± {ksi_results['avg_confidence_ci_width']:.3f}")
            plt.tight_layout()
            ksi_plot_path = f"dvclive/{pipeline_type}_ksi_analysis.png"
            plt.savefig(ksi_plot_path)
            plt.close()
            mlflow.log_artifact(ksi_plot_path)

        # 7. Save Metrics for DVC
        metrics = {
            "accuracy": float(acc),
            "loss": float(loss),
            "ksi_total": ksi_results['avg_ksi_total'],
            "ksi_weighted": ksi_results['avg_ksi_weighted'],
            "ksi_ci_width": ksi_results['avg_confidence_ci_width'],
            "ksi_uncertainty": ksi_results['avg_uncertainty_scalar'],
            "ksi_reliable_ratio": ksi_results['reliable_count'] / max(1, ksi_results['total_samples']),
            "phase_scores": ksi_results['phase_scores'],
            "component_scores": ksi_results['component_scores']
        }
        # Determine metrics filename based on model path
        metrics_filename = "production_metrics.json" if "tuned" in cfg['model_path'] else f"{pipeline_type}_metrics.json"
        metrics_path = os.path.join("dvclive", metrics_filename)
        with open(metrics_path, "w") as f:
            json.dump(metrics, f, indent=2)

        # Console summary of the full evaluation.
        print(f"\n{'='*60}")
        print(f"{pipeline_type.upper()} EVALUATION (KSI v2)")
        print(f"{'='*60}")
        print(f"Accuracy: {acc:.4f} | Loss: {loss:.4f}")
        print(f"KSI Total: {ksi_results['avg_ksi_total']:.4f}")
        print(f"KSI Weighted: {ksi_results['avg_ksi_weighted']:.4f}")
        print(f"CI Width: {ksi_results['avg_confidence_ci_width']:.4f}")
        print(f"Uncertainty: {ksi_results['avg_uncertainty_scalar']:.4f}")
        print(f"Reliable: {ksi_results['reliable_count']}/{ksi_results['total_samples']}")
        print(f"Phase scores: {ksi_results['phase_scores']}")
        print(f"Component scores: {ksi_results['component_scores']}")
        print(f"{'='*60}\n")


if __name__ == "__main__":
    # CLI entry point; --gpu is also sniffed pre-import at module top for TF determinism.
    parser = argparse.ArgumentParser()
    parser.add_argument("--type", choices=['pose', 'hybrid'], required=True)
    parser.add_argument("--model", type=str, help="Optional: custom model path (overrides params.yaml)")
    parser.add_argument("--data", type=str, help="Optional: custom data path for evaluation (overrides params.yaml)")
    parser.add_argument("--nlp", action='store_true', help="Generate natural language coaching reports")
    parser.add_argument("--nlp-skill", type=str, default='intermediate',
                        choices=['beginner', 'intermediate', 'advanced', 'expert'],
                        help="Skill level for natural language feedback (default: intermediate)")
    parser.add_argument("--nlp-samples", type=int, default=5,
                        help="Max number of samples to generate detailed feedback for (default: 5)")
    parser.add_argument("--auto-name", action='store_true', help="Auto-generate MLflow run name without prompting")
    parser.add_argument("--gpu", action='store_true', help="Use GPU for inference (faster but less deterministic)")
    args = parser.parse_args()
    evaluate(args.type, args.model, args.nlp, args.nlp_skill, args.nlp_samples, args.auto_name, args.data)