Spaces:

uncertainrods
/

smashfix-v1

Sleeping

File size: 27,604 Bytes

0d0412d

"""
Model Evaluation Pipeline with KSI v2.0 Metrics
================================================

Comprehensive evaluation script for trained badminton shot classification models.
Implements enhanced Kinematic Similarity Index (KSI) v2.0 for biomechanical
analysis and optional natural language coaching feedback generation.

Key Features:
    - Classification accuracy and confusion matrix analysis
    - KSI v2.0 biomechanical comparison against expert templates
    - Phase-aware scoring (preparation, loading, contact, follow-through)
    - Per-joint error analysis with confidence intervals
    - Velocity and acceleration derivative metrics
    - Optional NLP coaching report generation
    - Full MLflow experiment tracking integration

Evaluation Metrics:
    1. Classification Metrics
       - Test accuracy and loss (precision, recall and F1 are not computed by this script)
       - Confusion matrix visualization
       
    2. KSI v2.0 Metrics
       - Total KSI score (0-100, higher = better match)
       - Component breakdown (pose, velocity, acceleration)
       - Phase-specific scores
       - Per-joint error analysis
       - Ranking hinge score for class separation

Pipeline Position:
    train_pose.py / train_hybrid.py → [evaluate.py] → reports/
    
    Loads trained model and test data, performs comprehensive evaluation,
    and logs all metrics to MLflow for experiment tracking.

Dependencies:
    External: tensorflow, sklearn, matplotlib, seaborn, mlflow, numpy, yaml
    Internal: ksi_v2.EnhancedKSI, mlflow_utils, natural_language_coach

Configuration (params.yaml):
    pose_pipeline / hybrid_pipeline:
        model_path: Path to trained model
        data_path: Path to evaluation data
    ksi:
        weights: Component weighting for KSI calculation

Usage:
    python evaluate.py --type pose          # Evaluate pose model
    python evaluate.py --type hybrid --nlp  # Evaluate hybrid with NLP feedback

Author: IPD Research Team
Version: 2.0.0
"""

"""
# --- DETERMINISM FIXES (MUST BE BEFORE TF IMPORT) ---
import os
import sys

# Check for GPU flag early (before TF imports)
_use_gpu = '--gpu' in sys.argv

if not _use_gpu:
    # Force CPU mode for deterministic predictions
    os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
    os.environ['MEDIAPIPE_DISABLE_GPU'] = '1'
    print("🔒 Running in CPU mode for deterministic predictions (use --gpu to enable GPU)")

os.environ['TF_DETERMINISTIC_OPS'] = '1'
os.environ['TF_CUDNN_DETERMINISTIC'] = '1'
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'

"""
# --- DETERMINISM FIXES (MUST BE BEFORE TF IMPORT) ---
import os
import sys

# Check for GPU flag early (before TF imports)
_use_gpu = '--gpu' in sys.argv

if not _use_gpu:
    # Force CPU mode for deterministic predictions
    os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
    os.environ['MEDIAPIPE_DISABLE_GPU'] = '1'
    print("🔒 Running in CPU mode for deterministic predictions (use --gpu to enable GPU)")

os.environ['TF_DETERMINISTIC_OPS'] = '1'
os.environ['TF_CUDNN_DETERMINISTIC'] = '1'
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'

import argparse
import os
import yaml
import numpy as np
import json
import mlflow
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow.keras.models import load_model
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from typing import Dict
from ksi_v2 import EnhancedKSI, ShotPhase
from mlflow_utils import MLflowRunManager

# Optional dependency: the natural-language coach is only needed when coaching
# reports are requested. NLP_AVAILABLE records whether the import succeeded so
# evaluate() can skip report generation instead of failing.
try:
    from natural_language_coach import generate_coaching_report
    NLP_AVAILABLE = True
except ImportError:
    # Module missing: keep the script usable and warn once at import time.
    NLP_AVAILABLE = False
    print("⚠️  natural_language_coach not available. NLP feedback will be skipped.")


def _load_evaluation_samples(data_path: str):
    """Load every ``.npz`` sample stored in the entries of ``data_path``.

    The sorted directory listing of ``data_path`` is the class list; the
    position of an entry in that list is the integer label of its samples.

    Returns:
        Tuple ``(classes, features, labels, raw_landmarks)``. The last three
        lists are aligned per sample; ``raw_landmarks[i]`` is ``None`` when the
        archive contains no ``'raw_landmarks'`` array.
    """
    # NOTE(review): non-directory entries are skipped below but still occupy a
    # slot in `classes` (and therefore a label index). Left unchanged because
    # the labels presumably have to match the training script - confirm.
    classes = sorted(os.listdir(data_path))
    features, labels, raw_landmarks = [], [], []
    for class_idx, cls in enumerate(classes):
        class_dir = os.path.join(data_path, cls)
        if not os.path.isdir(class_dir):
            continue
        # NOTE(review): file order is whatever os.listdir returns; it is not
        # sorted here because the seeded split in evaluate() depends on sample
        # order and presumably must match the split used in training - confirm.
        for fname in os.listdir(class_dir):
            if not fname.endswith('.npz'):
                continue
            # Context manager closes the archive; the arrays read inside the
            # block are already materialised in memory.
            with np.load(os.path.join(class_dir, fname)) as data:
                features.append(data['features'])
                labels.append(class_idx)
                # Raw landmarks are optional (used for hybrid pipeline KSI).
                raw_landmarks.append(data['raw_landmarks'] if 'raw_landmarks' in data else None)
    return classes, features, labels, raw_landmarks


def _find_template_key(cls: str, templates):
    """Return the template key to use for class ``cls``, or ``None`` if there is none.

    Preference order: the exact class name, then ``'<cls>_variant1'``, then the
    first non-underscore-prefixed key that contains the class name.
    """
    if cls in templates:
        return cls
    variant_key = f'{cls}_variant1'
    if variant_key in templates:
        return variant_key
    return next(
        (key for key in templates.files if cls in key and not key.startswith('_')),
        None,
    )


def evaluate(pipeline_type: str, model_path: str = None, generate_nlp_feedback: bool = False,
             nlp_skill_level: str = 'intermediate', max_nlp_samples: int = 5, auto_run_name: bool = False,
             data_path: str = None):
    """
    Evaluate a trained model and log classification and KSI v2 metrics.

    Reads ``params.yaml`` from the current working directory, loads the
    ``.npz`` samples, evaluates the model on a seeded 20% stratified split and,
    when usable expert templates are available, computes KSI metrics for up to
    50 test samples. Metrics and plots are logged to MLflow and written to
    ``dvclive/``; an optional coaching report is written to
    ``coaching_reports/``.

    Args:
        pipeline_type: 'pose' or 'hybrid'; selects the ``<type>_pipeline``
            section of params.yaml and the MLflow experiment name.
        model_path: Optional path to model file (overrides params.yaml).
        generate_nlp_feedback: Whether to generate a natural language coaching
            report (requires the optional ``natural_language_coach`` module).
        nlp_skill_level: Skill level passed to the coach
            ('beginner', 'intermediate', 'advanced', 'expert').
        max_nlp_samples: Accepted for interface compatibility but currently
            unused: a report is generated only for the single correctly
            predicted sample with the highest KSI score.
        auto_run_name: If True, auto-generate MLflow run name without prompting.
        data_path: Optional path to evaluation data (overrides params.yaml).

    Returns:
        None. The function returns early (after printing a message) when the
        data path does not exist or contains no samples.
    """
    with open("params.yaml", encoding="utf-8") as f:
        params = yaml.safe_load(f)
    cfg = params[f'{pipeline_type}_pipeline']
    ksi_cfg = params.get('ksi', {'weights': {'pose': 0.5, 'velocity': 0.3, 'acceleration': 0.2}})

    # Use provided model path or default from config
    if model_path:
        cfg['model_path'] = model_path
        run_suffix = f" (custom: {os.path.basename(model_path)})"
    else:
        run_suffix = ""

    # Use provided data path or default from config
    if data_path:
        cfg['data_path'] = data_path

    # Setup MLflow with interactive run manager
    exp_name = "Pose_LSTM_Experiment" if pipeline_type == 'pose' else "Hybrid_TCN_Experiment"
    run_manager = MLflowRunManager(exp_name)

    with run_manager.start_interactive_run(
        default_description=f"Evaluation of {pipeline_type} pipeline with KSI v2.0 metrics{run_suffix}",
        auto_name=auto_run_name
    ):
        # Log evaluation configuration
        mlflow.log_param("evaluation.model_path", cfg['model_path'])
        mlflow.log_param("evaluation.data_path", cfg['data_path'])
        mlflow.log_param("evaluation.pipeline_type", pipeline_type)

        # Display what we're evaluating on
        print(f"\n{'='*70}")
        print(f"📊 EVALUATION CONFIGURATION")
        print(f"{'='*70}")
        print(f"Pipeline: {pipeline_type}")
        print(f"Model: {cfg['model_path']}")
        print(f"Data: {cfg['data_path']}")
        print(f"{'='*70}\n")

        # 1. Load Data
        if not os.path.exists(cfg['data_path']):
            print(f"Data path {cfg['data_path']} not found")
            return
        classes, X, y, raw_landmarks = _load_evaluation_samples(cfg['data_path'])

        if not X:
            print("No data loaded")
            return
        X, y_cat = np.array(X), to_categorical(y, len(classes))

        # Split data and raw landmarks together so indices stay aligned
        if raw_landmarks and any(lm is not None for lm in raw_landmarks):
            _, X_test, _, y_test, _, raw_test = train_test_split(
                X, y_cat, raw_landmarks, test_size=0.2, stratify=y, random_state=42
            )
            has_raw_landmarks = True
        else:
            _, X_test, _, y_test = train_test_split(X, y_cat, test_size=0.2, stratify=y, random_state=42)
            raw_test = None
            has_raw_landmarks = False

        # 2. Load Model & Predict
        model = load_model(cfg['model_path'])

        if pipeline_type == 'hybrid':
            # The trailing `cnn_feature_dim` columns are CNN features, the rest pose features.
            cnn_dim = cfg['cnn_feature_dim']
            X_pose, X_cnn = X_test[..., :-cnn_dim], X_test[..., -cnn_dim:]
            loss, acc = model.evaluate([X_cnn, X_pose], y_test, verbose=0)
            y_pred = np.argmax(model.predict([X_cnn, X_pose]), axis=1)
            sample_pose = X_pose
        else:
            loss, acc = model.evaluate(X_test, y_test, verbose=0)
            y_pred = np.argmax(model.predict(X_test), axis=1)
            sample_pose = X_test

        # 3. Enhanced KSI v2 Calculation with contact-centered windowing
        # KSI works with the hybrid pipeline only if raw_landmarks are available
        template_path = params['expert_pipeline']['output_path']
        ksi_results = {
            'avg_ksi_total': 0.0,
            'avg_ksi_weighted': 0.0,
            'avg_confidence_ci_width': 0.0,
            'avg_uncertainty_scalar': 0.0,
            'phase_scores': {},
            'component_scores': {},
            'reliable_count': 0,
            'total_samples': 0
        }

        # Check if we can do KSI evaluation
        can_do_ksi = os.path.exists(template_path) and (
            pipeline_type == 'pose' or (pipeline_type == 'hybrid' and has_raw_landmarks)
        )
        # True when the template file exists but its content cannot be used, so
        # the final skip message does not wrongly claim the file is missing.
        templates_rejected = False
        templates = None

        if can_do_ksi:
            print(f"Expert templates found at {template_path}. Computing KSI metrics...")
            # SECURITY NOTE: allow_pickle=True can execute arbitrary code when
            # object arrays are unpickled; only load template files from a trusted source.
            templates = np.load(template_path, allow_pickle=True)

            # Check template format - detect if raw landmarks or enhanced features
            sample_template_key = next((k for k in templates.files if not k.startswith('_')), None)
            if sample_template_key is None:
                print(f"⚠ No valid templates found in {template_path}")
                can_do_ksi = False
                templates_rejected = True
            else:
                sample_template = templates[sample_template_key]
                # Raw landmarks: (T, 33, 3) or (T, 99) flattened
                # Enhanced features: (T, 32)
                template_is_raw_landmarks = (
                    sample_template.ndim == 3 and sample_template.shape[1:] == (33, 3)
                ) or (
                    sample_template.ndim == 2 and sample_template.shape[1] == 99
                )
                template_is_enhanced = sample_template.ndim == 2 and sample_template.shape[1] == 32

                if template_is_enhanced:
                    print(f"⚠ Templates are in 32-feature format (KSI v2 features), not raw landmarks.")
                    print(f"   KSI calculation requires raw landmarks (T, 33, 3). Regenerate templates with raw landmarks.")
                    can_do_ksi = False
                    templates_rejected = True
                elif not template_is_raw_landmarks:
                    print(f"⚠ Unknown template format: shape={sample_template.shape}. Expected (T, 33, 3) or (T, 99).")
                    can_do_ksi = False
                    templates_rejected = True
                else:
                    print(f"   Template format: {'(T, 33, 3)' if sample_template.ndim == 3 else '(T, 99)'} - OK")

        if can_do_ksi:
            # Initialize enhanced KSI calculator
            ksi_calc = EnhancedKSI(
                fps=params.get('fps', 30.0),
                contact_window_pre_frames=18,
                contact_window_post_frames=18,
                bootstrap_min=50,
                bootstrap_max=200,
                ranking_margin=0.05
            )

            ksi_totals = []
            ksi_weighted_list = []
            ci_widths = []
            uncertainty_scalars = []
            phase_score_accumulator = {p.value: [] for p in ShotPhase}
            component_accumulator = {'pose': [], 'velocity': [], 'acceleration': [], 'jerk': []}
            reliable_count = 0

            # Store individual results for NLP feedback
            individual_results = [] if generate_nlp_feedback else None

            # Sample for speed (evaluate up to 50 items)
            n_samples = min(50, len(sample_pose))
            evaluated_count = 0  # Samples that actually produced a KSI result
            skipped_templates = set()  # Classes without a template, reported once

            for i in range(n_samples):
                cls = classes[np.argmax(y_test[i])]

                template_key = _find_template_key(cls, templates)
                if template_key is None:
                    skipped_templates.add(cls)
                    continue

                # Get user landmarks - use raw_landmarks if available (hybrid), else reshape pose features
                if pipeline_type == 'hybrid' and raw_test is not None and raw_test[i] is not None:
                    user_lm = raw_test[i]  # Already (T, 33, 3)
                else:
                    # Pose pipeline: reshape from flattened features (T, 99) -> (T, 33, 3)
                    try:
                        user_lm = sample_pose[i].reshape(-1, 33, 3)
                    except ValueError:
                        print(f"⚠ Cannot reshape pose features to landmarks: {sample_pose[i].shape} -> (T, 33, 3)")
                        continue

                # Load expert template and reshape if needed
                expert_template = templates[template_key]
                try:
                    if expert_template.ndim == 3 and expert_template.shape[1:] == (33, 3):
                        expert_lm = expert_template  # Already (T, 33, 3)
                    elif expert_template.ndim == 2 and expert_template.shape[1] == 99:
                        expert_lm = expert_template.reshape(-1, 33, 3)  # (T, 99) -> (T, 33, 3)
                    else:
                        print(f"⚠ Cannot convert template '{template_key}' shape {expert_template.shape} to landmarks")
                        continue
                except ValueError as e:
                    print(f"⚠ Template reshape failed for '{template_key}': {e}")
                    continue

                # Calculate enhanced KSI
                result = ksi_calc.calculate(
                    expert_landmarks=expert_lm,
                    user_landmarks=user_lm,
                    weights=ksi_cfg['weights'],
                    baseline_ksi=None  # Could pass previous user score for ranking hinge
                )

                ksi_totals.append(result.ksi_total)
                ksi_weighted_list.append(result.ksi_weighted)

                # Confidence metrics
                if result.confidence:
                    ci_width = result.confidence.get('ci_95_upper', 0) - result.confidence.get('ci_95_lower', 0)
                    ci_widths.append(ci_width)
                    uncertainty_scalars.append(result.confidence.get('uncertainty_scalar', 0))
                    if result.confidence.get('reliable', False):
                        reliable_count += 1

                # Phase scores
                for phase, score in result.phase_scores.items():
                    if phase in phase_score_accumulator:
                        phase_score_accumulator[phase].append(score)

                # Component scores
                for comp in ['pose', 'velocity', 'acceleration', 'jerk']:
                    if comp in result.components:
                        component_accumulator[comp].append(result.components[comp])

                # Store individual result for NLP feedback (only if prediction is correct)
                if generate_nlp_feedback:
                    predicted_cls = classes[y_pred[i]] if i < len(y_pred) else None
                    if predicted_cls == cls:
                        individual_results.append({
                            'sample_idx': i,
                            'class': cls,
                            'predicted_class': predicted_cls,
                            'ksi_result': result,
                            'ksi_total': result.ksi_total
                        })

                evaluated_count += 1

            # Report skipped templates
            if skipped_templates:
                print(f"   ⚠ Missing templates for classes: {sorted(skipped_templates)}")

            # Aggregate results
            ksi_results['avg_ksi_total'] = float(np.mean(ksi_totals)) if ksi_totals else 0.0
            ksi_results['avg_ksi_weighted'] = float(np.mean(ksi_weighted_list)) if ksi_weighted_list else 0.0
            ksi_results['avg_confidence_ci_width'] = float(np.mean(ci_widths)) if ci_widths else 0.0
            ksi_results['avg_uncertainty_scalar'] = float(np.mean(uncertainty_scalars)) if uncertainty_scalars else 0.0
            ksi_results['reliable_count'] = reliable_count
            ksi_results['total_samples'] = evaluated_count  # Use actual evaluated count, not attempted

            for phase, scores in phase_score_accumulator.items():
                if scores:
                    ksi_results['phase_scores'][phase] = float(np.mean(scores))

            for comp, scores in component_accumulator.items():
                if scores:
                    ksi_results['component_scores'][comp] = float(np.mean(scores))

            # Generate natural language coaching report
            if generate_nlp_feedback and individual_results and NLP_AVAILABLE:
                print(f"\n{'='*70}")
                print(f"GENERATING NATURAL LANGUAGE COACHING FEEDBACK")
                print(f"{'='*70}")
                print(f"   Found {len(individual_results)} correctly predicted samples")

                # DEBUG: Show KSI scores distribution
                ksi_scores = [r['ksi_total'] for r in individual_results]
                print(f"   KSI scores range: {min(ksi_scores):.3f} - {max(ksi_scores):.3f}")

                os.makedirs("coaching_reports", exist_ok=True)

                # Only the best sample (highest KSI score) gets a report;
                # max_nlp_samples is not consulted.
                individual_results.sort(key=lambda x: x['ksi_total'], reverse=True)
                sample = individual_results[0]
                cls = sample['class']
                ksi_result = sample['ksi_result']

                print(f"\n📝 Generating coaching report for: {cls} (KSI: {ksi_result.ksi_total:.3f})")

                try:
                    # Generate simplified coaching report
                    report = generate_coaching_report(
                        ksi_result=ksi_result,
                        shot_type_str=cls,
                        skill_level_str=nlp_skill_level,
                        user_name=None,
                        output_format='text',
                        simplified=True  # Remove weekly plan, shorten output
                    )

                    # Save report; explicit UTF-8 so non-ASCII text in the report
                    # does not depend on the platform's default encoding.
                    report_filename = f"coaching_reports/{cls}_ksi{ksi_result.ksi_total:.3f}_report.txt"
                    with open(report_filename, 'w', encoding='utf-8') as f:
                        f.write(report)

                    print(f"   ✅ Saved: {report_filename}")

                    # Also save JSON version
                    json_report = generate_coaching_report(
                        ksi_result=ksi_result,
                        shot_type_str=cls,
                        skill_level_str=nlp_skill_level,
                        output_format='json',
                        simplified=True
                    )
                    json_filename = f"coaching_reports/{cls}_ksi{ksi_result.ksi_total:.3f}_report.json"
                    with open(json_filename, 'w', encoding='utf-8') as f:
                        f.write(json_report)

                    # Log to MLflow
                    mlflow.log_artifact(report_filename)
                    print(f"   📊 Logged to MLflow")

                except Exception as e:
                    # Best effort: a failed report must not abort the evaluation run.
                    print(f"   ⚠️ Failed to generate report: {e}")
                    import traceback
                    traceback.print_exc()
                else:
                    # Announce success only when nothing above raised.
                    print(f"\n✅ Generated coaching report in coaching_reports/")

                print(f"{'='*70}\n")
        elif templates_rejected:
            # The specific format problem was already printed above.
            print(f"   Skipping KSI metrics: expert templates at {template_path} are not usable.")
        elif pipeline_type == 'hybrid' and not has_raw_landmarks:
            print(f"⚠️  Raw landmarks not found in hybrid data. Run preprocessing again to enable KSI evaluation.")
            print(f"    (Old data format detected - missing 'raw_landmarks' in .npz files)")
        else:
            print(f"⚠ Expert templates not found at {template_path}. Skipping KSI metrics.")

        # Share of evaluated samples whose confidence was flagged reliable
        # (max(1, ...) avoids a division by zero when nothing was evaluated).
        reliable_ratio = ksi_results['reliable_count'] / max(1, ksi_results['total_samples'])

        # 4. Logging to MLflow
        mlflow.log_metric("test_accuracy", acc)
        mlflow.log_metric("test_loss", loss)
        mlflow.log_metric("ksi_total", ksi_results['avg_ksi_total'])
        mlflow.log_metric("ksi_weighted", ksi_results['avg_ksi_weighted'])
        mlflow.log_metric("ksi_ci_width", ksi_results['avg_confidence_ci_width'])
        mlflow.log_metric("ksi_uncertainty", ksi_results['avg_uncertainty_scalar'])
        mlflow.log_metric("ksi_reliable_ratio", reliable_ratio)

        # Log phase scores
        for phase, score in ksi_results['phase_scores'].items():
            mlflow.log_metric(f"phase_{phase}", score)

        # Log component scores
        for comp, score in ksi_results['component_scores'].items():
            mlflow.log_metric(f"ksi_{comp}", score)

        # 5. Confusion Matrix
        # Explicit labels keep the matrix the same size as `classes`, so the
        # tick labels stay aligned even if a class index never occurs.
        cm = confusion_matrix(
            np.argmax(y_test, axis=1), y_pred, labels=list(range(len(classes)))
        )
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', xticklabels=classes, yticklabels=classes, cmap='Blues')
        plt.title(f"Confusion Matrix - {pipeline_type.upper()}")

        os.makedirs("dvclive", exist_ok=True)
        # Use production filename for tuned models, pipeline-specific otherwise
        cm_filename = "production_confusion_matrix.png" if "tuned" in cfg['model_path'] else f"{pipeline_type}_confusion_matrix.png"
        cm_path = os.path.join("dvclive", cm_filename)
        plt.savefig(cm_path)
        plt.close()
        mlflow.log_artifact(cm_path)

        # 6. KSI Analysis Plot
        if ksi_results['phase_scores'] or ksi_results['component_scores']:
            fig, axes = plt.subplots(1, 2, figsize=(12, 5))

            # Phase scores
            if ksi_results['phase_scores']:
                phases = list(ksi_results['phase_scores'].keys())
                p_scores = list(ksi_results['phase_scores'].values())
                axes[0].barh(phases, p_scores, color=['gray', 'blue', 'orange', 'red', 'green'][:len(phases)])
                axes[0].set_xlim([0, 1])
                axes[0].set_xlabel('Average Score')
                axes[0].set_title('Phase Scores (Avg)')

            # Component scores
            if ksi_results['component_scores']:
                comps = list(ksi_results['component_scores'].keys())
                c_scores = list(ksi_results['component_scores'].values())
                axes[1].bar(comps, c_scores, color=['steelblue', 'coral', 'seagreen', 'orchid'][:len(comps)])
                axes[1].set_ylim([0, 1])
                axes[1].set_ylabel('Average Score')
                axes[1].set_title('Component Scores (Avg)')

            plt.suptitle(f"KSI v2 Analysis - {pipeline_type.upper()} | "
                         f"KSI: {ksi_results['avg_ksi_total']:.3f} ± {ksi_results['avg_confidence_ci_width']:.3f}")
            plt.tight_layout()
            ksi_plot_path = f"dvclive/{pipeline_type}_ksi_analysis.png"
            plt.savefig(ksi_plot_path)
            plt.close()
            mlflow.log_artifact(ksi_plot_path)

        # 7. Save Metrics for DVC
        metrics = {
            "accuracy": float(acc),
            "loss": float(loss),
            "ksi_total": ksi_results['avg_ksi_total'],
            "ksi_weighted": ksi_results['avg_ksi_weighted'],
            "ksi_ci_width": ksi_results['avg_confidence_ci_width'],
            "ksi_uncertainty": ksi_results['avg_uncertainty_scalar'],
            "ksi_reliable_ratio": reliable_ratio,
            "phase_scores": ksi_results['phase_scores'],
            "component_scores": ksi_results['component_scores']
        }

        # Determine metrics filename based on model path
        metrics_filename = "production_metrics.json" if "tuned" in cfg['model_path'] else f"{pipeline_type}_metrics.json"
        metrics_path = os.path.join("dvclive", metrics_filename)

        with open(metrics_path, "w") as f:
            json.dump(metrics, f, indent=2)

        print(f"\n{'='*60}")
        print(f"{pipeline_type.upper()} EVALUATION (KSI v2)")
        print(f"{'='*60}")
        print(f"Accuracy: {acc:.4f} | Loss: {loss:.4f}")
        print(f"KSI Total: {ksi_results['avg_ksi_total']:.4f}")
        print(f"KSI Weighted: {ksi_results['avg_ksi_weighted']:.4f}")
        print(f"CI Width: {ksi_results['avg_confidence_ci_width']:.4f}")
        print(f"Uncertainty: {ksi_results['avg_uncertainty_scalar']:.4f}")
        print(f"Reliable: {ksi_results['reliable_count']}/{ksi_results['total_samples']}")
        print(f"Phase scores: {ksi_results['phase_scores']}")
        print(f"Component scores: {ksi_results['component_scores']}")
        print(f"{'='*60}\n")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--type", choices=['pose', 'hybrid'], required=True)
    parser.add_argument("--model", type=str, help="Optional: custom model path (overrides params.yaml)")
    parser.add_argument("--data", type=str, help="Optional: custom data path for evaluation (overrides params.yaml)")
    parser.add_argument("--nlp", action='store_true', help="Generate natural language coaching reports")
    parser.add_argument("--nlp-skill", type=str, default='intermediate', 
                        choices=['beginner', 'intermediate', 'advanced', 'expert'],
                        help="Skill level for natural language feedback (default: intermediate)")
    parser.add_argument("--nlp-samples", type=int, default=5,
                        help="Max number of samples to generate detailed feedback for (default: 5)")
    parser.add_argument("--auto-name", action='store_true', 
                        help="Auto-generate MLflow run name without prompting")
    parser.add_argument("--gpu", action='store_true', help="Use GPU for inference (faster but less deterministic)")
    args = parser.parse_args()
    evaluate(args.type, args.model, args.nlp, args.nlp_skill, args.nlp_samples, args.auto_name, args.data)