| """ |
| Train-Test Validation Suite - Deepchecks validation for train-test consistency |
| |
| This module implements comprehensive train-test validation checks using Deepchecks |
| to ensure consistency and proper splitting between training and test datasets. |
| |
| Checks included: |
| - Train-Test Feature Drift: Detects distribution changes between train and test |
| - Train-Test Label Drift: Checks if label distribution differs |
| - Train-Test Samples Mix: Validates no data leakage |
| - Whole Dataset Drift: Overall distribution comparison |
| - Feature Label Correlation Change: Checks if correlations change |
| - New Label: Detects labels in test not present in train |
| - New Category: Detects new categorical values in test |
| - String Mismatch Comparison: Compares string inconsistencies |
| - Date Train Test Leakage Duplicates: Checks for temporal leakage |
| - Date Train Test Leakage Overlap: Validates proper temporal split |
| """ |
|
|
| import numpy as np |
| import pandas as pd |
| import json |
| from pathlib import Path |
| from deepchecks.tabular import Dataset |
| from deepchecks.tabular.suites import train_test_validation |
|
|
| from hopcroft_skill_classification_tool_competition.config import PROCESSED_DATA_DIR |
|
|
|
|
def load_train_test_data(use_cleaned=True):
    """
    Load the TF-IDF train/test arrays from the processed data directory.

    Args:
        use_cleaned: If True (default), load the cleaned variants of the
            arrays (filenames carrying the '_clean' suffix); otherwise
            load the original arrays.

    Returns:
        tuple: (X_train, y_train, X_test, y_test) numpy arrays.
    """
    tfidf_dir = PROCESSED_DATA_DIR / "tfidf"

    # Cleaned files differ from the originals only by a '_clean' suffix,
    # so a single suffix drives all four paths.
    suffix = "_clean" if use_cleaned else ""
    data_type = "cleaned" if use_cleaned else "original"

    X_train = np.load(tfidf_dir / f"features_tfidf{suffix}.npy")
    y_train = np.load(tfidf_dir / f"labels_tfidf{suffix}.npy")
    X_test = np.load(tfidf_dir / f"X_test{suffix}.npy")
    y_test = np.load(tfidf_dir / f"Y_test{suffix}.npy")

    print(f"Loaded {data_type} data:")
    print(f"Training set shape: X={X_train.shape}, y={y_train.shape}")
    print(f"Test set shape: X={X_test.shape}, y={y_test.shape}")

    return X_train, y_train, X_test, y_test
|
|
|
|
def create_deepchecks_dataset(X, y, dataset_name="dataset"):
    """
    Wrap feature and label arrays in a Deepchecks tabular Dataset.

    Args:
        X: 2D feature matrix (numpy array).
        y: Labels — either 1D (single-label) or a 2D matrix; a 2D matrix
            with more than one column is collapsed to one class index per
            row via argmax.
        dataset_name: Name identifier for the dataset.
            NOTE(review): currently not forwarded to Deepchecks — kept
            for API compatibility only.

    Returns:
        Dataset: Deepchecks Dataset with a 'label' column and no
            categorical features declared.
    """
    columns = [f"feature_{i}" for i in range(X.shape[1])]
    frame = pd.DataFrame(X, columns=columns)

    if len(y.shape) > 1 and y.shape[1] > 1:
        # Deepchecks tabular checks expect a single label column, so the
        # label matrix is collapsed to the index of its maximum entry.
        frame['label'] = np.argmax(y, axis=1)
        print(f"Note: Converted multi-label ({y.shape[1]} labels) to single-label for Deepchecks")
    else:
        frame['label'] = y

    return Dataset(frame, label='label', cat_features=[])
|
|
|
|
def _save_suite_results_json(result, use_cleaned):
    """
    Serialize a Deepchecks SuiteResult to a JSON summary file.

    Writes reports/deepchecks/train_test_validation_suite_results{suffix}.json
    (creating the directory if needed) and prints the output path.

    Args:
        result: Deepchecks SuiteResult from a completed suite run.
        use_cleaned: Selects the '_clean' vs '_original' filename suffix.
    """
    output_dir = Path("reports/deepchecks")
    output_dir.mkdir(parents=True, exist_ok=True)

    suffix = "_clean" if use_cleaned else "_original"
    json_path = output_dir / f"train_test_validation_suite_results{suffix}.json"
    json_results = {
        "suite_name": "Train-Test Validation Suite",
        "total_checks": len(result.results),
        "timestamp": pd.Timestamp.now().isoformat(),
        "checks": []
    }

    for check_result in result.results:
        check_data = {
            "check_name": check_result.get_header(),
            # Guard with hasattr: some Deepchecks result types (e.g. failures)
            # do not expose these attributes.
            "passed": check_result.passed_conditions() if hasattr(check_result, 'passed_conditions') else None,
            "display": str(check_result.display) if hasattr(check_result, 'display') else None
        }
        json_results["checks"].append(check_data)

    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(json_results, f, indent=2, ensure_ascii=False)
    print(f"JSON results saved to: {json_path}")


def run_train_test_validation_suite(save_output=True, use_cleaned=True):
    """
    Run the complete Train-Test Validation Suite.

    This suite performs comprehensive checks including:
    - Train Test Feature Drift: Detects significant distribution changes in features
    - Train Test Label Drift: Checks if label distribution is consistent
    - Train Test Samples Mix: Validates no samples appear in both sets
    - Whole Dataset Drift: Overall dataset distribution comparison
    - Feature Label Correlation Change: Detects changes in feature-label relationships
    - New Label: Identifies labels in test that don't exist in train
    - New Category: Finds new categorical values in test set
    - String Mismatch Comparison: Compares string format consistency
    - Date Train Test Leakage: Checks for temporal data leakage
    - Index Train Test Leakage: Validates proper index separation

    Args:
        save_output: Whether to save a JSON summary of the results under
            'reports/deepchecks' (no HTML report is written).
        use_cleaned: If True, use cleaned data instead of original

    Returns:
        SuiteResult: Results from the train-test validation suite
    """
    data_type = "CLEANED" if use_cleaned else "ORIGINAL"
    print("="*80)
    print(f"TRAIN-TEST VALIDATION SUITE - {data_type} DATA")
    print("="*80)

    X_train, y_train, X_test, y_test = load_train_test_data(use_cleaned=use_cleaned)

    train_dataset = create_deepchecks_dataset(X_train, y_train, "training")
    test_dataset = create_deepchecks_dataset(X_test, y_test, "test")

    print("\nRunning Train-Test Validation checks...")
    suite = train_test_validation()
    result = suite.run(train_dataset, test_dataset)

    print("\nTrain-Test Validation Suite completed!")
    print(f"Total checks: {len(result.results)}")

    if save_output:
        _save_suite_results_json(result, use_cleaned)

    return result
|
|
|
|
def run_custom_train_test_checks(save_output=True, use_cleaned=True):
    """
    Run custom train-test validation checks tailored for the SkillScope dataset.

    These checks are specifically designed for NLP/Text features and
    multi-label classification tasks.

    Args:
        save_output: Whether to save the HTML report.
            NOTE(review): currently unused — this function writes no report.
        use_cleaned: If True, use cleaned data instead of original

    Returns:
        dict: Mapping of check key to its Deepchecks check result.
    """
    from deepchecks.tabular.checks import (
        TrainTestFeatureDrift,
        TrainTestLabelDrift,
        TrainTestSamplesMix,
        WholeDatasetDrift,
        FeatureLabelCorrelationChange,
    )

    data_type = "CLEANED" if use_cleaned else "ORIGINAL"
    print("="*80)
    print(f"CUSTOM TRAIN-TEST VALIDATION CHECKS - {data_type} DATA")
    print("="*80)

    X_train, y_train, X_test, y_test = load_train_test_data(use_cleaned=use_cleaned)
    train_dataset = create_deepchecks_dataset(X_train, y_train, "training")
    test_dataset = create_deepchecks_dataset(X_test, y_test, "test")

    # (progress message, result key, check class) — executed in order so
    # the numbered console output stays identical.
    check_plan = [
        ("\n1. Checking for feature drift between train and test...",
         'feature_drift', TrainTestFeatureDrift),
        ("2. Checking for label drift between train and test...",
         'label_drift', TrainTestLabelDrift),
        ("3. Checking for data leakage (samples appearing in both sets)...",
         'samples_mix', TrainTestSamplesMix),
        ("4. Checking overall dataset drift...",
         'dataset_drift', WholeDatasetDrift),
        ("5. Checking for changes in feature-label correlation...",
         'correlation_change', FeatureLabelCorrelationChange),
    ]

    results = {}
    for message, key, check_cls in check_plan:
        print(message)
        results[key] = check_cls().run(train_dataset, test_dataset)

    print("6. Skipping NewLabel check (not available in this Deepchecks version)")

    print("\nAll custom train-test checks completed!")

    return results
|
|
|
|
def compare_distributions(use_cleaned=True):
    """
    Print a statistical comparison of the train and test sets.

    Covers sample counts, feature dimensionality, label coverage,
    global mean/std of the feature matrices, and sparsity, warning
    when the two splits diverge noticeably.

    Args:
        use_cleaned: If True, compare cleaned data instead of original
    """
    data_type = "CLEANED" if use_cleaned else "ORIGINAL"
    print("="*80)
    print(f"TRAIN-TEST DISTRIBUTION COMPARISON - {data_type} DATA")
    print("="*80)

    X_train, y_train, X_test, y_test = load_train_test_data(use_cleaned=use_cleaned)

    # --- 1. Sample counts and split ratio ---
    print("\n1. SAMPLE SIZES:")
    print(f" Training: {X_train.shape[0]} samples")
    print(f" Test: {X_test.shape[0]} samples")
    print(f" Train/Test ratio: {X_train.shape[0]/X_test.shape[0]:.2f}")

    # --- 2. Feature dimensionality must agree between splits ---
    print("\n2. FEATURE DIMENSIONS:")
    print(f" Training features: {X_train.shape[1]}")
    print(f" Test features: {X_test.shape[1]}")
    if X_train.shape[1] == X_test.shape[1]:
        print(" ✓ Feature dimensions match")
    else:
        print(" WARNING: Feature dimensions don't match!")

    # --- 3. Label coverage: which labels appear where ---
    print("\n3. LABEL DISTRIBUTION:")
    labels_in_train = set(np.unique(y_train))
    labels_in_test = set(np.unique(y_test))

    print(f" Training unique labels: {len(labels_in_train)}")
    print(f" Test unique labels: {len(labels_in_test)}")

    new_labels = labels_in_test - labels_in_train
    if new_labels:
        print(f" WARNING: {len(new_labels)} new labels in test set: {new_labels}")
    else:
        print(" No new labels in test set")

    missing_labels = labels_in_train - labels_in_test
    if missing_labels:
        print(f" INFO: {len(missing_labels)} labels only in train set")

    # --- 4. Global feature statistics (whole-matrix mean/std) ---
    print("\n4. FEATURE STATISTICS COMPARISON:")
    print(f" Train - Mean: {X_train.mean():.4f}, Std: {X_train.std():.4f}")
    print(f" Test - Mean: {X_test.mean():.4f}, Std: {X_test.std():.4f}")

    mean_diff = abs(X_train.mean() - X_test.mean())
    std_diff = abs(X_train.std() - X_test.std())

    print(f" Mean difference: {mean_diff:.4f}")
    print(f" Std difference: {std_diff:.4f}")

    if mean_diff > 0.1 or std_diff > 0.1:
        print(" WARNING: Significant statistical differences detected!")
    else:
        print(" Statistical distributions are similar")

    # --- 5. Sparsity: share of exact zeros in each matrix ---
    print("\n5. SPARSITY COMPARISON:")
    train_sparsity = (X_train == 0).sum() / X_train.size * 100
    test_sparsity = (X_test == 0).sum() / X_test.size * 100
    print(f" Training sparsity: {train_sparsity:.2f}%")
    print(f" Test sparsity: {test_sparsity:.2f}%")
    print(f" Sparsity difference: {abs(train_sparsity - test_sparsity):.2f}%")

    if abs(train_sparsity - test_sparsity) > 5:
        print(" WARNING: Significant sparsity difference!")
    else:
        print(" Sparsity levels are similar")
|
|
|
|
def validate_split_quality(use_cleaned=True):
    """
    Validate the quality of the train-test split.

    Checks that the test-set size falls within a conventional hold-out
    range (15-35% of all samples) and that the label distributions of
    train and test are statistically similar, using a chi-square
    goodness-of-fit test restricted to the labels both sets share.

    Args:
        use_cleaned: If True, validate cleaned data instead of original
    """
    data_type = "CLEANED" if use_cleaned else "ORIGINAL"
    print("="*80)
    print(f"TRAIN-TEST SPLIT QUALITY VALIDATION - {data_type} DATA")
    print("="*80)

    X_train, y_train, X_test, y_test = load_train_test_data(use_cleaned=use_cleaned)

    total_samples = X_train.shape[0] + X_test.shape[0]
    test_ratio = X_test.shape[0] / total_samples

    print(f"\nTotal samples: {total_samples}")
    print(f"Test set ratio: {test_ratio:.2%}")

    # A 15-35% hold-out is a common rule of thumb.
    if 0.15 <= test_ratio <= 0.35:
        print(" Test set size is within recommended range (15-35%)")
    else:
        print(" WARNING: Test set size is outside recommended range")

    from scipy.stats import chisquare

    # Only labels present in BOTH sets can be compared.
    common_labels = np.intersect1d(np.unique(y_train), np.unique(y_test))

    if len(common_labels) > 0:
        train_counts = np.array(
            [np.sum(y_train == label) for label in common_labels], dtype=float)
        test_counts = np.array(
            [np.sum(y_test == label) for label in common_labels], dtype=float)

        # Expected frequencies: the test-set proportions (over the common
        # labels only) rescaled to the observed train total.
        # BUG FIX: the previous code scaled by len(y_train), which does not
        # equal sum(train_counts) whenever either set has labels outside
        # the common set (or when y is a 2D multi-label matrix). scipy's
        # chisquare requires sum(f_obs) == sum(f_exp) and rejects or
        # biases the statistic otherwise.
        expected = (test_counts / test_counts.sum()) * train_counts.sum()
        chi_stat, p_value = chisquare(train_counts, expected)

        print(f"\nLabel distribution similarity (chi-square test):")
        print(f" Chi-square statistic: {chi_stat:.4f}")
        print(f" P-value: {p_value:.4f}")

        if p_value > 0.05:
            print(" Label distributions are statistically similar (p > 0.05)")
        else:
            print(" WARNING: Label distributions differ significantly (p <= 0.05)")
    else:
        print(" WARNING: No common labels between train and test sets!")
|
|
|
|
if __name__ == "__main__":
    import sys

    # Cleaned data is the default; pass --original (or -o) to validate
    # the pre-cleaning arrays instead.
    use_cleaned = '--original' not in sys.argv and '-o' not in sys.argv

    if use_cleaned:
        print("Testing CLEANED data (from data_cleaning.py) - DEFAULT\n")
    else:
        print("Testing ORIGINAL data\n")
        print("Note: Using --original flag to test old data\n")

    # Run every validation stage in sequence against the same split.
    compare_distributions(use_cleaned=use_cleaned)

    print("\n")
    validate_split_quality(use_cleaned=use_cleaned)

    print("\n")
    suite_result = run_train_test_validation_suite(save_output=True, use_cleaned=use_cleaned)

    print("\n")
    custom_results = run_custom_train_test_checks(save_output=True, use_cleaned=use_cleaned)

    print("\n" + "="*80)
    print("TRAIN-TEST VALIDATION COMPLETED")
    print("="*80)
    print("\nCheck the reports in the 'reports/deepchecks' directory")
|