# smashfix-v1 / src/evaluate.py
# uncertainrods's picture
# v1-try-deploy
# 0d0412d
"""
Model Evaluation Pipeline with KSI v2.0 Metrics
================================================
Comprehensive evaluation script for trained badminton shot classification models.
Implements enhanced Kinematic Similarity Index (KSI) v2.0 for biomechanical
analysis and optional natural language coaching feedback generation.
Key Features:
- Classification accuracy and confusion matrix analysis
- KSI v2.0 biomechanical comparison against expert templates
- Phase-aware scoring (preparation, loading, contact, follow-through)
- Per-joint error analysis with confidence intervals
- Velocity and acceleration derivative metrics
- Optional NLP coaching report generation
- Full MLflow experiment tracking integration
Evaluation Metrics:
1. Classification Metrics
- Test accuracy, precision, recall, F1
- Confusion matrix visualization
2. KSI v2.0 Metrics
- Total KSI score (0-100, higher = better match)
- Component breakdown (pose, velocity, acceleration)
- Phase-specific scores
- Per-joint error analysis
- Ranking hinge score for class separation
Pipeline Position:
train_pose.py / train_hybrid.py → [evaluate.py] → reports/
Loads trained model and test data, performs comprehensive evaluation,
and logs all metrics to MLflow for experiment tracking.
Dependencies:
External: tensorflow, sklearn, matplotlib, seaborn, mlflow, numpy, yaml
Internal: ksi_v2.EnhancedKSI, mlflow_utils, natural_language_coach
Configuration (params.yaml):
pose_pipeline / hybrid_pipeline:
model_path: Path to trained model
data_path: Path to evaluation data
ksi:
weights: Component weighting for KSI calculation
Usage:
python evaluate.py pose # Evaluate pose model
python evaluate.py hybrid --nlp # Evaluate hybrid with NLP feedback
Author: IPD Research Team
Version: 2.0.0
"""
"""
# --- DETERMINISM FIXES (MUST BE BEFORE TF IMPORT) ---
import os
import sys
# Check for GPU flag early (before TF imports)
_use_gpu = '--gpu' in sys.argv
if not _use_gpu:
# Force CPU mode for deterministic predictions
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
os.environ['MEDIAPIPE_DISABLE_GPU'] = '1'
print("🔒 Running in CPU mode for deterministic predictions (use --gpu to enable GPU)")
os.environ['TF_DETERMINISTIC_OPS'] = '1'
os.environ['TF_CUDNN_DETERMINISTIC'] = '1'
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
"""
# --- DETERMINISM FIXES (MUST BE BEFORE TF IMPORT) ---
import os
import sys
# Check for GPU flag early (before TF imports)
_use_gpu = '--gpu' in sys.argv
if not _use_gpu:
# Force CPU mode for deterministic predictions
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
os.environ['MEDIAPIPE_DISABLE_GPU'] = '1'
print("🔒 Running in CPU mode for deterministic predictions (use --gpu to enable GPU)")
os.environ['TF_DETERMINISTIC_OPS'] = '1'
os.environ['TF_CUDNN_DETERMINISTIC'] = '1'
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
import argparse
import os
import yaml
import numpy as np
import json
import mlflow
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow.keras.models import load_model
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from typing import Dict
from ksi_v2 import EnhancedKSI, ShotPhase
from mlflow_utils import MLflowRunManager
try:
from natural_language_coach import generate_coaching_report
NLP_AVAILABLE = True
except ImportError:
NLP_AVAILABLE = False
print("⚠️ natural_language_coach not available. NLP feedback will be skipped.")
def evaluate(pipeline_type: str, model_path: str = None, generate_nlp_feedback: bool = False,
             nlp_skill_level: str = 'intermediate', max_nlp_samples: int = 5, auto_run_name: bool = False,
             data_path: str = None):
    """
    Evaluate a trained shot-classification model with enhanced KSI v2 metrics.

    Loads the held-out split of the preprocessed dataset, computes
    classification accuracy/loss and a confusion matrix, compares user motion
    against expert templates via KSI v2 (phase scores, confidence intervals,
    ranking hinge, component breakdowns), logs everything to MLflow, writes
    plots and a DVC metrics JSON under dvclive/, and optionally emits
    natural-language coaching reports for the top correctly-predicted samples.

    Args:
        pipeline_type: 'pose' or 'hybrid'
        model_path: Optional path to model file (overrides params.yaml)
        generate_nlp_feedback: Whether to generate natural language coaching reports
        nlp_skill_level: Skill level for NLP coach ('beginner', 'intermediate', 'advanced', 'expert')
        max_nlp_samples: Maximum number of samples to generate detailed feedback for
        auto_run_name: If True, auto-generate MLflow run name without prompting
        data_path: Optional path to evaluation data (overrides params.yaml)

    Returns:
        None. Side effects: MLflow params/metrics/artifacts, files under
        dvclive/ and coaching_reports/, console progress output. Returns
        early (after printing a message) when the data path is missing or
        contains no samples.
    """
    with open("params.yaml") as f:
        params = yaml.safe_load(f)
    cfg = params[f'{pipeline_type}_pipeline']
    # Fall back to the documented default component weighting if params.yaml
    # has no 'ksi' section.
    ksi_cfg = params.get('ksi', {'weights': {'pose': 0.5, 'velocity': 0.3, 'acceleration': 0.2}})
    # Use provided model path or default from config
    if model_path:
        cfg['model_path'] = model_path
        run_suffix = f" (custom: {os.path.basename(model_path)})"
    else:
        run_suffix = ""
    # Use provided data path or default from config
    if data_path:
        cfg['data_path'] = data_path
    # Setup MLflow with interactive run manager
    exp_name = "Pose_LSTM_Experiment" if pipeline_type == 'pose' else "Hybrid_TCN_Experiment"
    run_manager = MLflowRunManager(exp_name)
    with run_manager.start_interactive_run(
        default_description=f"Evaluation of {pipeline_type} pipeline with KSI v2.0 metrics{run_suffix}",
        auto_name=auto_run_name
    ):
        # Log evaluation configuration
        mlflow.log_param("evaluation.model_path", cfg['model_path'])
        mlflow.log_param("evaluation.data_path", cfg['data_path'])
        mlflow.log_param("evaluation.pipeline_type", pipeline_type)
        # Display what we're evaluating on
        print(f"\n{'='*70}")
        print(f"📊 EVALUATION CONFIGURATION")
        print(f"{'='*70}")
        print(f"Pipeline: {pipeline_type}")
        print(f"Model: {cfg['model_path']}")
        print(f"Data: {cfg['data_path']}")
        print(f"{'='*70}\n")
        # 1. Load Data -- one sub-directory per class, samples stored as .npz
        X, y = [], []
        raw_landmarks = []  # per-sample raw landmarks for KSI (None when absent)
        if not os.path.exists(cfg['data_path']):
            print(f"Data path {cfg['data_path']} not found")
            return
        classes = sorted(os.listdir(cfg['data_path']))
        for i, cls in enumerate(classes):
            path = os.path.join(cfg['data_path'], cls)
            if not os.path.isdir(path):
                continue
            for f in os.listdir(path):
                if f.endswith('.npz'):
                    data = np.load(os.path.join(path, f))
                    X.append(data['features'])
                    y.append(i)
                    # Older preprocessed data lacks 'raw_landmarks'; keep a
                    # placeholder so indices stay aligned with X/y.
                    if 'raw_landmarks' in data:
                        raw_landmarks.append(data['raw_landmarks'])
                    else:
                        raw_landmarks.append(None)
        if not X:
            print("No data loaded")
            return
        X, y_cat = np.array(X), to_categorical(y, len(classes))
        # Split data and raw landmarks together so test indices stay aligned.
        # NOTE(review): random_state=42 presumably matches the training split
        # so X_test is truly held out -- confirm against train_pose/train_hybrid.
        if raw_landmarks and any(lm is not None for lm in raw_landmarks):
            _, X_test, _, y_test, _, raw_test = train_test_split(
                X, y_cat, raw_landmarks, test_size=0.2, stratify=y, random_state=42
            )
            has_raw_landmarks = True
        else:
            _, X_test, _, y_test = train_test_split(X, y_cat, test_size=0.2, stratify=y, random_state=42)
            raw_test = None
            has_raw_landmarks = False
        # 2. Load Model & Predict
        model = load_model(cfg['model_path'])
        if pipeline_type == 'hybrid':
            # Hybrid features concatenate [pose | CNN] along the last axis;
            # the model takes the two streams as separate inputs.
            cnn_dim = cfg['cnn_feature_dim']
            X_pose, X_cnn = X_test[..., :-cnn_dim], X_test[..., -cnn_dim:]
            loss, acc = model.evaluate([X_cnn, X_pose], y_test, verbose=0)
            y_pred = np.argmax(model.predict([X_cnn, X_pose]), axis=1)
            sample_pose = X_pose
        else:
            loss, acc = model.evaluate(X_test, y_test, verbose=0)
            y_pred = np.argmax(model.predict(X_test), axis=1)
            sample_pose = X_test
        # 3. Enhanced KSI v2 Calculation with contact-centered windowing.
        # KSI works with the hybrid pipeline only when raw landmarks exist.
        template_path = params['expert_pipeline']['output_path']
        ksi_results = {
            'avg_ksi_total': 0.0,
            'avg_ksi_weighted': 0.0,
            'avg_confidence_ci_width': 0.0,
            'avg_uncertainty_scalar': 0.0,
            'phase_scores': {},
            'component_scores': {},
            'reliable_count': 0,
            'total_samples': 0
        }
        # Check if we can do KSI evaluation
        can_do_ksi = os.path.exists(template_path) and (
            pipeline_type == 'pose' or (pipeline_type == 'hybrid' and has_raw_landmarks)
        )
        if can_do_ksi:
            print(f"Expert templates found at {template_path}. Computing KSI metrics...")
            templates = np.load(template_path, allow_pickle=True)
            # Check template format - detect if raw landmarks or enhanced features
            sample_template_key = next((k for k in templates.files if not k.startswith('_')), None)
            if sample_template_key is None:
                print(f"⚠ No valid templates found in {template_path}")
                can_do_ksi = False
            else:
                sample_template = templates[sample_template_key]
                # Raw landmarks: (T, 33, 3) or (T, 99) flattened
                # Enhanced features: (T, 32)
                template_is_raw_landmarks = (
                    sample_template.ndim == 3 and sample_template.shape[1:] == (33, 3)
                ) or (
                    sample_template.ndim == 2 and sample_template.shape[1] == 99
                )
                template_is_enhanced = sample_template.ndim == 2 and sample_template.shape[1] == 32
                if template_is_enhanced:
                    print(f"⚠ Templates are in 32-feature format (KSI v2 features), not raw landmarks.")
                    print(f" KSI calculation requires raw landmarks (T, 33, 3). Regenerate templates with raw landmarks.")
                    can_do_ksi = False
                elif not template_is_raw_landmarks:
                    print(f"⚠ Unknown template format: shape={sample_template.shape}. Expected (T, 33, 3) or (T, 99).")
                    can_do_ksi = False
                else:
                    print(f" Template format: {'(T, 33, 3)' if sample_template.ndim == 3 else '(T, 99)'} - OK")
            # Re-check: format validation above may have disabled KSI.
            if can_do_ksi:
                # Initialize enhanced KSI calculator
                ksi_calc = EnhancedKSI(
                    fps=params.get('fps', 30.0),
                    contact_window_pre_frames=18,
                    contact_window_post_frames=18,
                    bootstrap_min=50,
                    bootstrap_max=200,
                    ranking_margin=0.05
                )
                ksi_totals = []
                ksi_weighted_list = []
                ci_widths = []
                uncertainty_scalars = []
                phase_score_accumulator = {p.value: [] for p in ShotPhase}
                component_accumulator = {'pose': [], 'velocity': [], 'acceleration': [], 'jerk': []}
                reliable_count = 0
                # Store individual results for NLP feedback
                individual_results = [] if generate_nlp_feedback else None
                # Sample for speed (evaluate up to 50 items)
                n_samples = min(50, len(sample_pose))
                evaluated_count = 0  # Track actually evaluated samples
                skipped_templates = set()  # Track missing templates to report once
                for i in range(n_samples):
                    cls = classes[np.argmax(y_test[i])]
                    # Try main template, then variants, prioritizing main
                    template_key = None
                    if cls in templates:
                        template_key = cls
                    elif f'{cls}_variant1' in templates:
                        # If only variants exist, use variant1 (best quality)
                        template_key = f'{cls}_variant1'
                    else:
                        # Try any key containing the class name
                        for key in templates.files:
                            if cls in key and not key.startswith('_'):
                                template_key = key
                                break
                    if template_key is None:
                        skipped_templates.add(cls)
                        continue
                    # Get user landmarks - use raw_landmarks if available (hybrid), else reshape pose features
                    if pipeline_type == 'hybrid' and raw_test is not None and raw_test[i] is not None:
                        user_lm = raw_test[i]  # Already (T, 33, 3)
                    else:
                        # Pose pipeline: reshape from flattened features (T, 99) -> (T, 33, 3)
                        try:
                            user_lm = sample_pose[i].reshape(-1, 33, 3)
                        except ValueError:
                            print(f"⚠ Cannot reshape pose features to landmarks: {sample_pose[i].shape} -> (T, 33, 3)")
                            continue
                    # Load expert template and reshape if needed
                    expert_template = templates[template_key]
                    try:
                        if expert_template.ndim == 3 and expert_template.shape[1:] == (33, 3):
                            expert_lm = expert_template  # Already (T, 33, 3)
                        elif expert_template.ndim == 2 and expert_template.shape[1] == 99:
                            expert_lm = expert_template.reshape(-1, 33, 3)  # (T, 99) -> (T, 33, 3)
                        else:
                            print(f"⚠ Cannot convert template '{template_key}' shape {expert_template.shape} to landmarks")
                            continue
                    except ValueError as e:
                        print(f"⚠ Template reshape failed for '{template_key}': {e}")
                        continue
                    # Calculate enhanced KSI
                    result = ksi_calc.calculate(
                        expert_landmarks=expert_lm,
                        user_landmarks=user_lm,
                        weights=ksi_cfg['weights'],
                        baseline_ksi=None  # Could pass previous user score for ranking hinge
                    )
                    ksi_totals.append(result.ksi_total)
                    ksi_weighted_list.append(result.ksi_weighted)
                    # Confidence metrics
                    if result.confidence:
                        ci_width = result.confidence.get('ci_95_upper', 0) - result.confidence.get('ci_95_lower', 0)
                        ci_widths.append(ci_width)
                        uncertainty_scalars.append(result.confidence.get('uncertainty_scalar', 0))
                        if result.confidence.get('reliable', False):
                            reliable_count += 1
                    # Phase scores
                    for phase, score in result.phase_scores.items():
                        if phase in phase_score_accumulator:
                            phase_score_accumulator[phase].append(score)
                    # Component scores
                    for comp in ['pose', 'velocity', 'acceleration', 'jerk']:
                        if comp in result.components:
                            component_accumulator[comp].append(result.components[comp])
                    # Store individual result for NLP feedback (only if prediction is correct)
                    if generate_nlp_feedback:
                        predicted_cls = classes[y_pred[i]] if i < len(y_pred) else None
                        if predicted_cls == cls:  # Only store correctly predicted samples
                            individual_results.append({
                                'sample_idx': i,
                                'class': cls,
                                'predicted_class': predicted_cls,
                                'ksi_result': result,
                                'ksi_total': result.ksi_total
                            })
                    evaluated_count += 1
                # Report skipped templates
                if skipped_templates:
                    print(f" ⚠ Missing templates for classes: {sorted(skipped_templates)}")
                # Aggregate results
                ksi_results['avg_ksi_total'] = float(np.mean(ksi_totals)) if ksi_totals else 0.0
                ksi_results['avg_ksi_weighted'] = float(np.mean(ksi_weighted_list)) if ksi_weighted_list else 0.0
                ksi_results['avg_confidence_ci_width'] = float(np.mean(ci_widths)) if ci_widths else 0.0
                ksi_results['avg_uncertainty_scalar'] = float(np.mean(uncertainty_scalars)) if uncertainty_scalars else 0.0
                ksi_results['reliable_count'] = reliable_count
                ksi_results['total_samples'] = evaluated_count  # Use actual evaluated count, not attempted
                for phase, scores in phase_score_accumulator.items():
                    if scores:
                        ksi_results['phase_scores'][phase] = float(np.mean(scores))
                for comp, scores in component_accumulator.items():
                    if scores:
                        ksi_results['component_scores'][comp] = float(np.mean(scores))
                # Generate natural language coaching reports
                if generate_nlp_feedback and individual_results and NLP_AVAILABLE:
                    print(f"\n{'='*70}")
                    print(f"GENERATING NATURAL LANGUAGE COACHING FEEDBACK")
                    print(f"{'='*70}")
                    print(f" Found {len(individual_results)} correctly predicted samples")
                    # DEBUG: Show KSI scores distribution
                    ksi_scores = [r['ksi_total'] for r in individual_results]
                    print(f" KSI scores range: {min(ksi_scores):.3f} - {max(ksi_scores):.3f}")
                    os.makedirs("coaching_reports", exist_ok=True)
                    # FIX: honor max_nlp_samples -- report on the top-N samples
                    # by KSI score. Previously only the single best sample was
                    # reported and the max_nlp_samples argument (and the
                    # --nlp-samples CLI flag) was silently ignored.
                    individual_results.sort(key=lambda x: x['ksi_total'], reverse=True)
                    samples_to_generate = individual_results[:max(1, max_nlp_samples)]
                    for sample in samples_to_generate:
                        cls = sample['class']
                        ksi_result = sample['ksi_result']
                        print(f"\n📝 Generating coaching report for: {cls} (KSI: {ksi_result.ksi_total:.3f})")
                        try:
                            # Generate simplified coaching report
                            report = generate_coaching_report(
                                ksi_result=ksi_result,
                                shot_type_str=cls,
                                skill_level_str=nlp_skill_level,
                                user_name=None,
                                output_format='text',
                                simplified=True  # Remove weekly plan, shorten output
                            )
                            # Save report
                            report_filename = f"coaching_reports/{cls}_ksi{ksi_result.ksi_total:.3f}_report.txt"
                            with open(report_filename, 'w') as f:
                                f.write(report)
                            print(f" ✅ Saved: {report_filename}")
                            # Also save JSON version
                            json_report = generate_coaching_report(
                                ksi_result=ksi_result,
                                shot_type_str=cls,
                                skill_level_str=nlp_skill_level,
                                output_format='json',
                                simplified=True
                            )
                            json_filename = f"coaching_reports/{cls}_ksi{ksi_result.ksi_total:.3f}_report.json"
                            with open(json_filename, 'w') as f:
                                f.write(json_report)
                            # Log to MLflow
                            mlflow.log_artifact(report_filename)
                            print(f" 📊 Logged to MLflow")
                        except Exception as e:
                            # Best-effort: a failed report must not abort evaluation.
                            print(f" ⚠️ Failed to generate report: {e}")
                            import traceback
                            traceback.print_exc()
                    print(f"\n✅ Generated {len(samples_to_generate)} coaching report(s) in coaching_reports/")
                    print(f"{'='*70}\n")
        elif pipeline_type == 'hybrid' and not has_raw_landmarks:
            print(f"⚠️ Raw landmarks not found in hybrid data. Run preprocessing again to enable KSI evaluation.")
            print(f" (Old data format detected - missing 'raw_landmarks' in .npz files)")
        else:
            print(f"⚠ Expert templates not found at {template_path}. Skipping KSI metrics.")
        # 4. Logging to MLflow
        mlflow.log_metric("test_accuracy", acc)
        mlflow.log_metric("test_loss", loss)
        mlflow.log_metric("ksi_total", ksi_results['avg_ksi_total'])
        mlflow.log_metric("ksi_weighted", ksi_results['avg_ksi_weighted'])
        mlflow.log_metric("ksi_ci_width", ksi_results['avg_confidence_ci_width'])
        mlflow.log_metric("ksi_uncertainty", ksi_results['avg_uncertainty_scalar'])
        mlflow.log_metric("ksi_reliable_ratio",
                          ksi_results['reliable_count'] / max(1, ksi_results['total_samples']))
        # Log phase scores
        for phase, score in ksi_results['phase_scores'].items():
            mlflow.log_metric(f"phase_{phase}", score)
        # Log component scores
        for comp, score in ksi_results['component_scores'].items():
            mlflow.log_metric(f"ksi_{comp}", score)
        # 5. Confusion Matrix
        cm = confusion_matrix(np.argmax(y_test, axis=1), y_pred)
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', xticklabels=classes, yticklabels=classes, cmap='Blues')
        plt.title(f"Confusion Matrix - {pipeline_type.upper()}")
        os.makedirs("dvclive", exist_ok=True)
        # Use production filename for tuned models, pipeline-specific otherwise
        cm_filename = "production_confusion_matrix.png" if "tuned" in cfg['model_path'] else f"{pipeline_type}_confusion_matrix.png"
        cm_path = os.path.join("dvclive", cm_filename)
        plt.savefig(cm_path)
        plt.close()
        mlflow.log_artifact(cm_path)
        # 6. KSI Analysis Plot (only when KSI metrics were computed)
        if ksi_results['phase_scores'] or ksi_results['component_scores']:
            fig, axes = plt.subplots(1, 2, figsize=(12, 5))
            # Phase scores
            if ksi_results['phase_scores']:
                phases = list(ksi_results['phase_scores'].keys())
                p_scores = list(ksi_results['phase_scores'].values())
                axes[0].barh(phases, p_scores, color=['gray', 'blue', 'orange', 'red', 'green'][:len(phases)])
                axes[0].set_xlim([0, 1])
                axes[0].set_xlabel('Average Score')
                axes[0].set_title('Phase Scores (Avg)')
            # Component scores
            if ksi_results['component_scores']:
                comps = list(ksi_results['component_scores'].keys())
                c_scores = list(ksi_results['component_scores'].values())
                axes[1].bar(comps, c_scores, color=['steelblue', 'coral', 'seagreen', 'orchid'][:len(comps)])
                axes[1].set_ylim([0, 1])
                axes[1].set_ylabel('Average Score')
                axes[1].set_title('Component Scores (Avg)')
            plt.suptitle(f"KSI v2 Analysis - {pipeline_type.upper()} | "
                         f"KSI: {ksi_results['avg_ksi_total']:.3f} ± {ksi_results['avg_confidence_ci_width']:.3f}")
            plt.tight_layout()
            ksi_plot_path = f"dvclive/{pipeline_type}_ksi_analysis.png"
            plt.savefig(ksi_plot_path)
            plt.close()
            mlflow.log_artifact(ksi_plot_path)
        # 7. Save Metrics for DVC
        metrics = {
            "accuracy": float(acc),
            "loss": float(loss),
            "ksi_total": ksi_results['avg_ksi_total'],
            "ksi_weighted": ksi_results['avg_ksi_weighted'],
            "ksi_ci_width": ksi_results['avg_confidence_ci_width'],
            "ksi_uncertainty": ksi_results['avg_uncertainty_scalar'],
            "ksi_reliable_ratio": ksi_results['reliable_count'] / max(1, ksi_results['total_samples']),
            "phase_scores": ksi_results['phase_scores'],
            "component_scores": ksi_results['component_scores']
        }
        # Determine metrics filename based on model path
        metrics_filename = "production_metrics.json" if "tuned" in cfg['model_path'] else f"{pipeline_type}_metrics.json"
        metrics_path = os.path.join("dvclive", metrics_filename)
        with open(metrics_path, "w") as f:
            json.dump(metrics, f, indent=2)
        # Console summary
        print(f"\n{'='*60}")
        print(f"{pipeline_type.upper()} EVALUATION (KSI v2)")
        print(f"{'='*60}")
        print(f"Accuracy: {acc:.4f} | Loss: {loss:.4f}")
        print(f"KSI Total: {ksi_results['avg_ksi_total']:.4f}")
        print(f"KSI Weighted: {ksi_results['avg_ksi_weighted']:.4f}")
        print(f"CI Width: {ksi_results['avg_confidence_ci_width']:.4f}")
        print(f"Uncertainty: {ksi_results['avg_uncertainty_scalar']:.4f}")
        print(f"Reliable: {ksi_results['reliable_count']}/{ksi_results['total_samples']}")
        print(f"Phase scores: {ksi_results['phase_scores']}")
        print(f"Component scores: {ksi_results['component_scores']}")
        print(f"{'='*60}\n")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--type", choices=['pose', 'hybrid'], required=True)
parser.add_argument("--model", type=str, help="Optional: custom model path (overrides params.yaml)")
parser.add_argument("--data", type=str, help="Optional: custom data path for evaluation (overrides params.yaml)")
parser.add_argument("--nlp", action='store_true', help="Generate natural language coaching reports")
parser.add_argument("--nlp-skill", type=str, default='intermediate',
choices=['beginner', 'intermediate', 'advanced', 'expert'],
help="Skill level for natural language feedback (default: intermediate)")
parser.add_argument("--nlp-samples", type=int, default=5,
help="Max number of samples to generate detailed feedback for (default: 5)")
parser.add_argument("--auto-name", action='store_true',
help="Auto-generate MLflow run name without prompting")
parser.add_argument("--gpu", action='store_true', help="Use GPU for inference (faster but less deterministic)")
args = parser.parse_args()
evaluate(args.type, args.model, args.nlp, args.nlp_skill, args.nlp_samples, args.auto_name, args.data)