"""
Train-Test Validation Suite - Deepchecks validation for train-test consistency

This module implements comprehensive train-test validation checks using
Deepchecks to ensure consistency and proper splitting between training and
test datasets.

Checks included:
- Train-Test Feature Drift: Detects distribution changes between train and test
- Train-Test Label Drift: Checks if label distribution differs
- Train-Test Samples Mix: Validates no data leakage
- Whole Dataset Drift: Overall distribution comparison
- Feature Label Correlation Change: Checks if correlations change
- New Label: Detects labels in test not present in train
- New Category: Detects new categorical values in test
- String Mismatch Comparison: Compares string inconsistencies
- Date Train Test Leakage Duplicates: Checks for temporal leakage
- Date Train Test Leakage Overlap: Validates proper temporal split
"""

import numpy as np
import pandas as pd
import json
from pathlib import Path

from deepchecks.tabular import Dataset
from deepchecks.tabular.suites import train_test_validation

from hopcroft_skill_classification_tool_competition.config import PROCESSED_DATA_DIR


def load_train_test_data(use_cleaned=True):
    """
    Load training and test datasets from processed data directory.

    Args:
        use_cleaned: If True, load cleaned data (with '_clean' suffix) - DEFAULT

    Returns:
        tuple: (X_train, y_train, X_test, y_test)
    """
    tfidf_dir = PROCESSED_DATA_DIR / "tfidf"

    # Choose file names based on cleaned flag
    if use_cleaned:
        train_features = tfidf_dir / "features_tfidf_clean.npy"
        train_labels = tfidf_dir / "labels_tfidf_clean.npy"
        test_features = tfidf_dir / "X_test_clean.npy"
        test_labels = tfidf_dir / "Y_test_clean.npy"
        data_type = "cleaned"
    else:
        train_features = tfidf_dir / "features_tfidf.npy"
        train_labels = tfidf_dir / "labels_tfidf.npy"
        test_features = tfidf_dir / "X_test.npy"
        test_labels = tfidf_dir / "Y_test.npy"
        data_type = "original"

    # Load features and labels
    X_train = np.load(train_features)
    y_train = np.load(train_labels)
    X_test = np.load(test_features)
    y_test = np.load(test_labels)

    print(f"Loaded {data_type} data:")
    print(f"Training set shape: X={X_train.shape}, y={y_train.shape}")
    print(f"Test set shape: X={X_test.shape}, y={y_test.shape}")

    return X_train, y_train, X_test, y_test


def create_deepchecks_dataset(X, y, dataset_name="dataset"):
    """
    Create a Deepchecks Dataset object from numpy arrays.

    Args:
        X: Feature matrix (numpy array)
        y: Labels (numpy array) - can be multi-label (2D) or single-label (1D)
        dataset_name: Name identifier for the dataset (currently unused;
            kept for interface compatibility)

    Returns:
        Dataset: Deepchecks Dataset object
    """
    # Convert to DataFrame for better visualization
    # Create feature names
    feature_names = [f"feature_{i}" for i in range(X.shape[1])]

    # Create DataFrame
    df = pd.DataFrame(X, columns=feature_names)

    # Handle multi-label case: convert to single label by taking argmax or
    # first active label
    if len(y.shape) > 1 and y.shape[1] > 1:
        # Multi-label: convert to single label (first active label or most confident)
        # For binary multi-label, take the index of first 1
        y_single = np.argmax(y, axis=1)  # Get the index of maximum value
        df['label'] = y_single
        print(f"Note: Converted multi-label ({y.shape[1]} labels) to single-label for Deepchecks")
    else:
        df['label'] = y

    # Create Deepchecks Dataset; all TF-IDF features are numeric, hence no
    # categorical features.
    ds = Dataset(df, label='label', cat_features=[])

    return ds


def run_train_test_validation_suite(save_output=True, use_cleaned=True):
    """
    Run the complete Train-Test Validation Suite.

    This suite performs comprehensive checks including:
    - Train Test Feature Drift: Detects significant distribution changes in features
    - Train Test Label Drift: Checks if label distribution is consistent
    - Train Test Samples Mix: Validates no samples appear in both sets
    - Whole Dataset Drift: Overall dataset distribution comparison
    - Feature Label Correlation Change: Detects changes in feature-label relationships
    - New Label: Identifies labels in test that don't exist in train
    - New Category: Finds new categorical values in test set
    - String Mismatch Comparison: Compares string format consistency
    - Date Train Test Leakage: Checks for temporal data leakage
    - Index Train Test Leakage: Validates proper index separation

    Args:
        save_output: Whether to save the JSON report
        use_cleaned: If True, use cleaned data instead of original

    Returns:
        SuiteResult: Results from the train-test validation suite
    """
    data_type = "CLEANED" if use_cleaned else "ORIGINAL"
    print("="*80)
    print(f"TRAIN-TEST VALIDATION SUITE - {data_type} DATA")
    print("="*80)

    # Load data
    X_train, y_train, X_test, y_test = load_train_test_data(use_cleaned=use_cleaned)

    # Create Deepchecks datasets
    train_dataset = create_deepchecks_dataset(X_train, y_train, "training")
    test_dataset = create_deepchecks_dataset(X_test, y_test, "test")

    # Run the Train-Test Validation Suite
    print("\nRunning Train-Test Validation checks...")
    suite = train_test_validation()
    result = suite.run(train_dataset, test_dataset)

    # Display results
    print("\nTrain-Test Validation Suite completed!")
    print(f"Total checks: {len(result.results)}")

    # Save output
    if save_output:
        output_dir = Path("reports/deepchecks")
        output_dir.mkdir(parents=True, exist_ok=True)

        # Save JSON report with appropriate suffix
        suffix = "_clean" if use_cleaned else "_original"
        json_path = output_dir / f"train_test_validation_suite_results{suffix}.json"

        json_results = {
            "suite_name": "Train-Test Validation Suite",
            "total_checks": len(result.results),
            "timestamp": pd.Timestamp.now().isoformat(),
            "checks": []
        }

        for check_result in result.results:
            # Not every result type exposes conditions/display; guard with
            # hasattr so a single odd check doesn't break report generation.
            check_data = {
                "check_name": check_result.get_header(),
                "passed": check_result.passed_conditions() if hasattr(check_result, 'passed_conditions') else None,
                "display": str(check_result.display) if hasattr(check_result, 'display') else None
            }
            json_results["checks"].append(check_data)

        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(json_results, f, indent=2, ensure_ascii=False)

        print(f"JSON results saved to: {json_path}")

    return result


def run_custom_train_test_checks(save_output=True, use_cleaned=True):
    """
    Run custom train-test validation checks tailored for the SkillScope dataset.

    These checks are specifically designed for NLP/Text features and
    multi-label classification tasks.

    Args:
        save_output: Whether to save the HTML report (currently unused; results
            are only kept in memory)
        use_cleaned: If True, use cleaned data instead of original

    Returns:
        dict: Dictionary containing check results
    """
    from deepchecks.tabular.checks import (
        TrainTestFeatureDrift,
        TrainTestLabelDrift,
        TrainTestSamplesMix,
        WholeDatasetDrift,
        FeatureLabelCorrelationChange,
    )

    data_type = "CLEANED" if use_cleaned else "ORIGINAL"
    print("="*80)
    print(f"CUSTOM TRAIN-TEST VALIDATION CHECKS - {data_type} DATA")
    print("="*80)

    # Load data
    X_train, y_train, X_test, y_test = load_train_test_data(use_cleaned=use_cleaned)

    train_dataset = create_deepchecks_dataset(X_train, y_train, "training")
    test_dataset = create_deepchecks_dataset(X_test, y_test, "test")

    results = {}

    # Check 1: Feature Drift
    print("\n1. Checking for feature drift between train and test...")
    feature_drift_check = TrainTestFeatureDrift()
    results['feature_drift'] = feature_drift_check.run(train_dataset, test_dataset)

    # Check 2: Label Drift
    print("2. Checking for label drift between train and test...")
    label_drift_check = TrainTestLabelDrift()
    results['label_drift'] = label_drift_check.run(train_dataset, test_dataset)

    # Check 3: Samples Mix (Data Leakage)
    print("3. Checking for data leakage (samples appearing in both sets)...")
    samples_mix_check = TrainTestSamplesMix()
    results['samples_mix'] = samples_mix_check.run(train_dataset, test_dataset)

    # Check 4: Whole Dataset Drift
    print("4. Checking overall dataset drift...")
    dataset_drift_check = WholeDatasetDrift()
    results['dataset_drift'] = dataset_drift_check.run(train_dataset, test_dataset)

    # Check 5: Feature-Label Correlation Change
    print("5. Checking for changes in feature-label correlation...")
    correlation_change_check = FeatureLabelCorrelationChange()
    results['correlation_change'] = correlation_change_check.run(train_dataset, test_dataset)

    # Note: NewLabel check not available in this version of Deepchecks
    # Check 6 would verify new labels in test set not present in train
    print("6. Skipping NewLabel check (not available in this Deepchecks version)")

    print("\nAll custom train-test checks completed!")
    # Results are available in memory for further processing if needed
    return results


def compare_distributions(use_cleaned=True):
    """
    Compare statistical distributions between train and test sets.

    Args:
        use_cleaned: If True, compare cleaned data instead of original
    """
    data_type = "CLEANED" if use_cleaned else "ORIGINAL"
    print("="*80)
    print(f"TRAIN-TEST DISTRIBUTION COMPARISON - {data_type} DATA")
    print("="*80)

    X_train, y_train, X_test, y_test = load_train_test_data(use_cleaned=use_cleaned)

    print("\n1. SAMPLE SIZES:")
    print(f" Training: {X_train.shape[0]} samples")
    print(f" Test: {X_test.shape[0]} samples")
    print(f" Train/Test ratio: {X_train.shape[0]/X_test.shape[0]:.2f}")

    print("\n2. FEATURE DIMENSIONS:")
    print(f" Training features: {X_train.shape[1]}")
    print(f" Test features: {X_test.shape[1]}")
    if X_train.shape[1] != X_test.shape[1]:
        print(" WARNING: Feature dimensions don't match!")
    else:
        print(" ✓ Feature dimensions match")

    print("\n3. LABEL DISTRIBUTION:")
    train_unique, train_counts = np.unique(y_train, return_counts=True)
    test_unique, test_counts = np.unique(y_test, return_counts=True)
    print(f" Training unique labels: {len(train_unique)}")
    print(f" Test unique labels: {len(test_unique)}")

    # Check for labels in test not in train
    new_labels = set(test_unique) - set(train_unique)
    if new_labels:
        print(f" WARNING: {len(new_labels)} new labels in test set: {new_labels}")
    else:
        print(" No new labels in test set")

    # Check for labels in train not in test
    missing_labels = set(train_unique) - set(test_unique)
    if missing_labels:
        print(f" INFO: {len(missing_labels)} labels only in train set")

    print("\n4. FEATURE STATISTICS COMPARISON:")
    print(f" Train - Mean: {X_train.mean():.4f}, Std: {X_train.std():.4f}")
    print(f" Test - Mean: {X_test.mean():.4f}, Std: {X_test.std():.4f}")

    mean_diff = abs(X_train.mean() - X_test.mean())
    std_diff = abs(X_train.std() - X_test.std())
    print(f" Mean difference: {mean_diff:.4f}")
    print(f" Std difference: {std_diff:.4f}")
    if mean_diff > 0.1 or std_diff > 0.1:
        print(" WARNING: Significant statistical differences detected!")
    else:
        print(" Statistical distributions are similar")

    print("\n5. SPARSITY COMPARISON:")
    train_sparsity = (X_train == 0).sum() / X_train.size * 100
    test_sparsity = (X_test == 0).sum() / X_test.size * 100
    print(f" Training sparsity: {train_sparsity:.2f}%")
    print(f" Test sparsity: {test_sparsity:.2f}%")
    print(f" Sparsity difference: {abs(train_sparsity - test_sparsity):.2f}%")
    if abs(train_sparsity - test_sparsity) > 5:
        print(" WARNING: Significant sparsity difference!")
    else:
        print(" Sparsity levels are similar")


def validate_split_quality(use_cleaned=True):
    """
    Validate the quality of the train-test split.

    Args:
        use_cleaned: If True, validate cleaned data instead of original
    """
    data_type = "CLEANED" if use_cleaned else "ORIGINAL"
    print("="*80)
    print(f"TRAIN-TEST SPLIT QUALITY VALIDATION - {data_type} DATA")
    print("="*80)

    X_train, y_train, X_test, y_test = load_train_test_data(use_cleaned=use_cleaned)

    total_samples = X_train.shape[0] + X_test.shape[0]
    test_ratio = X_test.shape[0] / total_samples

    print(f"\nTotal samples: {total_samples}")
    print(f"Test set ratio: {test_ratio:.2%}")

    # Validate test set size (typically 20-30%)
    if 0.15 <= test_ratio <= 0.35:
        print(" Test set size is within recommended range (15-35%)")
    else:
        print(" WARNING: Test set size is outside recommended range")

    # Check label distribution similarity
    from scipy.stats import chisquare

    # Get common labels
    common_labels = np.intersect1d(np.unique(y_train), np.unique(y_test))

    if len(common_labels) > 0:
        train_dist = [np.sum(y_train == label) for label in common_labels]
        test_dist = [np.sum(y_test == label) for label in common_labels]

        observed = np.array(train_dist)
        test_counts = np.array(test_dist)

        # Chi-square test.
        # scipy.stats.chisquare requires sum(observed) == sum(expected);
        # scale the test counts so the expected frequencies total exactly
        # the observed total (robust to multi-label 2D arrays and to labels
        # present in only one split).
        expected = test_counts / test_counts.sum() * observed.sum()
        chi_stat, p_value = chisquare(observed, expected)

        print(f"\nLabel distribution similarity (chi-square test):")
        print(f" Chi-square statistic: {chi_stat:.4f}")
        print(f" P-value: {p_value:.4f}")
        if p_value > 0.05:
            print(" Label distributions are statistically similar (p > 0.05)")
        else:
            print(" WARNING: Label distributions differ significantly (p <= 0.05)")
    else:
        print(" WARNING: No common labels between train and test sets!")


if __name__ == "__main__":
    import sys

    # By default use cleaned data, unless --original flag is specified
    use_cleaned = not ('--original' in sys.argv or '-o' in sys.argv)

    if use_cleaned:
        print("Testing CLEANED data (from data_cleaning.py) - DEFAULT\n")
    else:
        print("Testing ORIGINAL data\n")
        print("Note: Using --original flag to test old data\n")

    # Compare distributions
    compare_distributions(use_cleaned=use_cleaned)

    # Validate split quality
    print("\n")
    validate_split_quality(use_cleaned=use_cleaned)

    # Run the full Train-Test Validation Suite
    print("\n")
    suite_result = run_train_test_validation_suite(save_output=True, use_cleaned=use_cleaned)

    # Run custom train-test checks
    print("\n")
    custom_results = run_custom_train_test_checks(save_output=True, use_cleaned=use_cleaned)

    print("\n" + "="*80)
    print("TRAIN-TEST VALIDATION COMPLETED")
    print("="*80)
    print("\nCheck the reports in the 'reports/deepchecks' directory")