File size: 15,642 Bytes
225af6a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
"""
Train-Test Validation Suite - Deepchecks validation for train-test consistency

This module implements comprehensive train-test validation checks using Deepchecks
to ensure consistency and proper splitting between training and test datasets.

Checks included:
- Train-Test Feature Drift: Detects distribution changes between train and test
- Train-Test Label Drift: Checks if label distribution differs
- Train-Test Samples Mix: Validates no data leakage
- Whole Dataset Drift: Overall distribution comparison
- Feature Label Correlation Change: Checks if correlations change
- New Label: Detects labels in test not present in train
- New Category: Detects new categorical values in test
- String Mismatch Comparison: Compares string inconsistencies
- Date Train Test Leakage Duplicates: Checks for temporal leakage
- Date Train Test Leakage Overlap: Validates proper temporal split
"""

import numpy as np
import pandas as pd
import json
from pathlib import Path
from deepchecks.tabular import Dataset
from deepchecks.tabular.suites import train_test_validation

from hopcroft_skill_classification_tool_competition.config import PROCESSED_DATA_DIR


def load_train_test_data(use_cleaned=True):
    """
    Load the TF-IDF train/test feature and label arrays from disk.

    Args:
        use_cleaned: If True (the default), load the cleaned variant of the
            data — the files carrying a '_clean' suffix.

    Returns:
        tuple: (X_train, y_train, X_test, y_test) as numpy arrays
    """
    tfidf_dir = PROCESSED_DATA_DIR / "tfidf"

    # The cleaned and original variants differ only by a file-name suffix.
    suffix = "_clean" if use_cleaned else ""
    data_type = "cleaned" if use_cleaned else "original"

    X_train = np.load(tfidf_dir / f"features_tfidf{suffix}.npy")
    y_train = np.load(tfidf_dir / f"labels_tfidf{suffix}.npy")
    X_test = np.load(tfidf_dir / f"X_test{suffix}.npy")
    y_test = np.load(tfidf_dir / f"Y_test{suffix}.npy")

    print(f"Loaded {data_type} data:")
    print(f"Training set shape: X={X_train.shape}, y={y_train.shape}")
    print(f"Test set shape: X={X_test.shape}, y={y_test.shape}")

    return X_train, y_train, X_test, y_test


def create_deepchecks_dataset(X, y, dataset_name="dataset"):
    """
    Wrap numpy feature/label arrays in a Deepchecks Dataset.

    Args:
        X: Feature matrix (2D numpy array)
        y: Labels (numpy array) - can be multi-label (2D) or single-label (1D)
        dataset_name: Name identifier for the dataset (currently unused by
            the Dataset constructor; kept for caller compatibility)

    Returns:
        Dataset: Deepchecks Dataset object with a single 'label' column
    """
    # Build a DataFrame with synthetic column names so Deepchecks can
    # display per-feature results.
    columns = [f"feature_{i}" for i in range(X.shape[1])]
    frame = pd.DataFrame(X, columns=columns)

    if y.ndim > 1 and y.shape[1] > 1:
        # Multi-label case: Deepchecks expects a single label column, so
        # collapse each row to the index of its maximum value (for 0/1
        # indicator rows this is the first active label).
        frame['label'] = np.argmax(y, axis=1)
        print(f"Note: Converted multi-label ({y.shape[1]} labels) to single-label for Deepchecks")
    else:
        frame['label'] = y

    # All features are numeric TF-IDF values, hence no categorical columns.
    return Dataset(frame, label='label', cat_features=[])


def run_train_test_validation_suite(save_output=True, use_cleaned=True):
    """
    Execute the full Deepchecks Train-Test Validation Suite.

    The suite bundles checks for feature drift, label drift, sample mixing
    (leakage), whole-dataset drift, feature-label correlation changes, new
    labels/categories, string mismatches, and date/index leakage.

    Args:
        save_output: If True, write a JSON summary of the check results to
            reports/deepchecks/.
        use_cleaned: If True, validate the cleaned data instead of the
            original data.

    Returns:
        SuiteResult: Results from the train-test validation suite
    """
    banner = "=" * 80
    data_type = "CLEANED" if use_cleaned else "ORIGINAL"
    print(banner)
    print(f"TRAIN-TEST VALIDATION SUITE - {data_type} DATA")
    print(banner)

    # Load arrays and wrap them for Deepchecks.
    X_train, y_train, X_test, y_test = load_train_test_data(use_cleaned=use_cleaned)
    train_dataset = create_deepchecks_dataset(X_train, y_train, "training")
    test_dataset = create_deepchecks_dataset(X_test, y_test, "test")

    print("\nRunning Train-Test Validation checks...")
    result = train_test_validation().run(train_dataset, test_dataset)

    print("\nTrain-Test Validation Suite completed!")
    print(f"Total checks: {len(result.results)}")

    if save_output:
        output_dir = Path("reports/deepchecks")
        output_dir.mkdir(parents=True, exist_ok=True)

        # The suffix distinguishes reports for cleaned vs. original data.
        suffix = "_clean" if use_cleaned else "_original"
        json_path = output_dir / f"train_test_validation_suite_results{suffix}.json"

        # Serialize each check result defensively — some Deepchecks result
        # objects lack passed_conditions()/display attributes.
        checks = [
            {
                "check_name": cr.get_header(),
                "passed": cr.passed_conditions() if hasattr(cr, 'passed_conditions') else None,
                "display": str(cr.display) if hasattr(cr, 'display') else None,
            }
            for cr in result.results
        ]
        payload = {
            "suite_name": "Train-Test Validation Suite",
            "total_checks": len(result.results),
            "timestamp": pd.Timestamp.now().isoformat(),
            "checks": checks,
        }

        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(payload, f, indent=2, ensure_ascii=False)
        print(f"JSON results saved to: {json_path}")

    return result


def run_custom_train_test_checks(save_output=True, use_cleaned=True):
    """
    Run a hand-picked set of train-test validation checks for the
    SkillScope dataset (NLP/TF-IDF features, multi-label classification).

    Args:
        save_output: Accepted for API symmetry with the suite runner;
            results are currently only returned in memory — nothing is
            written to disk by this function.
        use_cleaned: If True, use cleaned data instead of original

    Returns:
        dict: Check results keyed by 'feature_drift', 'label_drift',
            'samples_mix', 'dataset_drift', and 'correlation_change'
    """
    from deepchecks.tabular.checks import (
        TrainTestFeatureDrift,
        TrainTestLabelDrift,
        TrainTestSamplesMix,
        WholeDatasetDrift,
        FeatureLabelCorrelationChange,
    )

    data_type = "CLEANED" if use_cleaned else "ORIGINAL"
    banner = "=" * 80
    print(banner)
    print(f"CUSTOM TRAIN-TEST VALIDATION CHECKS - {data_type} DATA")
    print(banner)

    X_train, y_train, X_test, y_test = load_train_test_data(use_cleaned=use_cleaned)
    train_dataset = create_deepchecks_dataset(X_train, y_train, "training")
    test_dataset = create_deepchecks_dataset(X_test, y_test, "test")

    # (progress message, result key, check class) — each check is run
    # against the same train/test dataset pair.
    planned_checks = [
        ("\n1. Checking for feature drift between train and test...",
         'feature_drift', TrainTestFeatureDrift),
        ("2. Checking for label drift between train and test...",
         'label_drift', TrainTestLabelDrift),
        ("3. Checking for data leakage (samples appearing in both sets)...",
         'samples_mix', TrainTestSamplesMix),
        ("4. Checking overall dataset drift...",
         'dataset_drift', WholeDatasetDrift),
        ("5. Checking for changes in feature-label correlation...",
         'correlation_change', FeatureLabelCorrelationChange),
    ]

    results = {}
    for message, key, check_cls in planned_checks:
        print(message)
        results[key] = check_cls().run(train_dataset, test_dataset)

    # NewLabel (would flag test labels absent from train) is unavailable in
    # the installed Deepchecks version, so step 6 is intentionally skipped.
    print("6. Skipping NewLabel check (not available in this Deepchecks version)")

    print("\nAll custom train-test checks completed!")

    return results


def compare_distributions(use_cleaned=True):
    """
    Print a statistical comparison of the train and test sets: sample sizes,
    feature dimensions, label coverage, mean/std, and sparsity.

    Args:
        use_cleaned: If True, compare cleaned data instead of original
    """
    data_type = "CLEANED" if use_cleaned else "ORIGINAL"
    bar = "=" * 80
    print(bar)
    print(f"TRAIN-TEST DISTRIBUTION COMPARISON - {data_type} DATA")
    print(bar)

    X_train, y_train, X_test, y_test = load_train_test_data(use_cleaned=use_cleaned)

    n_train, n_test = X_train.shape[0], X_test.shape[0]

    print("\n1. SAMPLE SIZES:")
    print(f"   Training: {n_train} samples")
    print(f"   Test: {n_test} samples")
    print(f"   Train/Test ratio: {n_train/n_test:.2f}")

    print("\n2. FEATURE DIMENSIONS:")
    print(f"   Training features: {X_train.shape[1]}")
    print(f"   Test features: {X_test.shape[1]}")
    if X_train.shape[1] == X_test.shape[1]:
        print("   ✓ Feature dimensions match")
    else:
        print("  WARNING: Feature dimensions don't match!")

    print("\n3. LABEL DISTRIBUTION:")
    train_labels = set(np.unique(y_train))
    test_labels = set(np.unique(y_test))

    print(f"   Training unique labels: {len(train_labels)}")
    print(f"   Test unique labels: {len(test_labels)}")

    # Labels seen in test but never during training are a red flag.
    new_labels = test_labels - train_labels
    if new_labels:
        print(f"   WARNING: {len(new_labels)} new labels in test set: {new_labels}")
    else:
        print("   No new labels in test set")

    # Train-only labels are merely informational.
    missing_labels = train_labels - test_labels
    if missing_labels:
        print(f"   INFO: {len(missing_labels)} labels only in train set")

    print("\n4. FEATURE STATISTICS COMPARISON:")
    train_mean, train_std = X_train.mean(), X_train.std()
    test_mean, test_std = X_test.mean(), X_test.std()
    print(f"   Train - Mean: {train_mean:.4f}, Std: {train_std:.4f}")
    print(f"   Test  - Mean: {test_mean:.4f}, Std: {test_std:.4f}")

    mean_diff = abs(train_mean - test_mean)
    std_diff = abs(train_std - test_std)

    print(f"   Mean difference: {mean_diff:.4f}")
    print(f"   Std difference: {std_diff:.4f}")

    if mean_diff > 0.1 or std_diff > 0.1:
        print("   WARNING: Significant statistical differences detected!")
    else:
        print("   Statistical distributions are similar")

    print("\n5. SPARSITY COMPARISON:")
    # Percentage of exact-zero entries — TF-IDF matrices are typically sparse.
    train_sparsity = (X_train == 0).sum() / X_train.size * 100
    test_sparsity = (X_test == 0).sum() / X_test.size * 100
    sparsity_gap = abs(train_sparsity - test_sparsity)
    print(f"   Training sparsity: {train_sparsity:.2f}%")
    print(f"   Test sparsity: {test_sparsity:.2f}%")
    print(f"   Sparsity difference: {sparsity_gap:.2f}%")

    if sparsity_gap > 5:
        print("  WARNING: Significant sparsity difference!")
    else:
        print("   Sparsity levels are similar")


def validate_split_quality(use_cleaned=True):
    """
    Validate the quality of the train-test split.

    Checks that the test-set ratio is within the commonly recommended range
    and that the label distributions of train and test are statistically
    similar, via a chi-square goodness-of-fit test over the labels common
    to both sets.

    Args:
        use_cleaned: If True, validate cleaned data instead of original
    """
    data_type = "CLEANED" if use_cleaned else "ORIGINAL"
    print("="*80)
    print(f"TRAIN-TEST SPLIT QUALITY VALIDATION - {data_type} DATA")
    print("="*80)

    X_train, y_train, X_test, y_test = load_train_test_data(use_cleaned=use_cleaned)

    total_samples = X_train.shape[0] + X_test.shape[0]
    test_ratio = X_test.shape[0] / total_samples

    print(f"\nTotal samples: {total_samples}")
    print(f"Test set ratio: {test_ratio:.2%}")

    # Validate test set size (typically 20-30%)
    if 0.15 <= test_ratio <= 0.35:
        print(" Test set size is within recommended range (15-35%)")
    else:
        print(" WARNING: Test set size is outside recommended range")

    # Check label distribution similarity
    from scipy.stats import chisquare

    # Restrict the comparison to labels present in BOTH sets.
    common_labels = np.intersect1d(np.unique(y_train), np.unique(y_test))

    if len(common_labels) > 0:
        train_dist = np.array([np.sum(y_train == label) for label in common_labels])
        test_dist = np.array([np.sum(y_test == label) for label in common_labels])

        # scipy.stats.chisquare requires sum(f_obs) == sum(f_exp). Normalize
        # the test counts over the common labels only, then scale to the
        # observed train total so the sums match exactly. (The previous
        # scaling by len(y_train) raised ValueError whenever either split
        # contained labels absent from the other.)
        test_props = test_dist / test_dist.sum()
        expected = test_props * train_dist.sum()
        chi_stat, p_value = chisquare(train_dist, expected)

        print(f"\nLabel distribution similarity (chi-square test):")
        print(f"  Chi-square statistic: {chi_stat:.4f}")
        print(f"  P-value: {p_value:.4f}")

        if p_value > 0.05:
            print("  Label distributions are statistically similar (p > 0.05)")
        else:
            print("   WARNING: Label distributions differ significantly (p <= 0.05)")
    else:
        print(" WARNING: No common labels between train and test sets!")


if __name__ == "__main__":
    import sys

    # Cleaned data is the default; pass --original (or -o) to test old data.
    use_cleaned = '--original' not in sys.argv and '-o' not in sys.argv

    if use_cleaned:
        print("Testing CLEANED data (from data_cleaning.py) - DEFAULT\n")
    else:
        print("Testing ORIGINAL data\n")
        print("Note: Using --original flag to test old data\n")

    # 1) Statistical comparison of the two splits.
    compare_distributions(use_cleaned=use_cleaned)

    # 2) Split-quality validation (ratio + label distribution test).
    print("\n")
    validate_split_quality(use_cleaned=use_cleaned)

    # 3) Full Deepchecks Train-Test Validation Suite (JSON report saved).
    print("\n")
    suite_result = run_train_test_validation_suite(save_output=True, use_cleaned=use_cleaned)

    # 4) Hand-picked custom checks.
    print("\n")
    custom_results = run_custom_train_test_checks(save_output=True, use_cleaned=use_cleaned)

    print("\n" + "="*80)
    print("TRAIN-TEST VALIDATION COMPLETED")
    print("="*80)
    print("\nCheck the reports in the 'reports/deepchecks' directory")