# NOTE: removed non-code page residue ("Spaces: / Sleeping / Sleeping")
# left over from the web export that produced this file.
"""
Train-Test Validation Suite - Deepchecks validation for train-test consistency

This module implements comprehensive train-test validation checks using Deepchecks
to ensure consistency and proper splitting between training and test datasets.

Checks included:
- Train-Test Feature Drift: Detects distribution changes between train and test
- Train-Test Label Drift: Checks if label distribution differs
- Train-Test Samples Mix: Validates no data leakage
- Whole Dataset Drift: Overall distribution comparison
- Feature Label Correlation Change: Checks if correlations change
- New Label: Detects labels in test not present in train
- New Category: Detects new categorical values in test
- String Mismatch Comparison: Compares string inconsistencies
- Date Train Test Leakage Duplicates: Checks for temporal leakage
- Date Train Test Leakage Overlap: Validates proper temporal split
"""
import json
from pathlib import Path

import numpy as np
import pandas as pd
from deepchecks.tabular import Dataset
from deepchecks.tabular.suites import train_test_validation

from hopcroft_skill_classification_tool_competition.config import PROCESSED_DATA_DIR
def load_train_test_data(use_cleaned=True):
    """
    Load training and test datasets from the processed data directory.

    Args:
        use_cleaned: If True (default), load the cleaned variants of the
            .npy files (names carrying a '_clean' suffix).

    Returns:
        tuple: (X_train, y_train, X_test, y_test) numpy arrays.
    """
    tfidf_dir = PROCESSED_DATA_DIR / "tfidf"

    # The cleaned and original file sets differ only by a '_clean' suffix.
    suffix = "_clean" if use_cleaned else ""
    data_type = "cleaned" if use_cleaned else "original"

    X_train = np.load(tfidf_dir / f"features_tfidf{suffix}.npy")
    y_train = np.load(tfidf_dir / f"labels_tfidf{suffix}.npy")
    X_test = np.load(tfidf_dir / f"X_test{suffix}.npy")
    y_test = np.load(tfidf_dir / f"Y_test{suffix}.npy")

    print(f"Loaded {data_type} data:")
    print(f"Training set shape: X={X_train.shape}, y={y_train.shape}")
    print(f"Test set shape: X={X_test.shape}, y={y_test.shape}")
    return X_train, y_train, X_test, y_test
def create_deepchecks_dataset(X, y, dataset_name="dataset"):
    """
    Wrap numpy feature/label arrays in a Deepchecks Dataset.

    Args:
        X: Feature matrix (2D numpy array).
        y: Labels (numpy array) - multi-label (2D) or single-label (1D).
        dataset_name: Name identifier for the dataset.
            NOTE(review): currently not passed to the Dataset constructor;
            kept for interface compatibility - confirm whether it should be.

    Returns:
        Dataset: Deepchecks Dataset object with the labels in a 'label' column.
    """
    # DataFrame with synthetic column names gives nicer check displays.
    frame = pd.DataFrame(X, columns=[f"feature_{i}" for i in range(X.shape[1])])

    if y.ndim > 1 and y.shape[1] > 1:
        # Multi-label matrix: collapse each row to the column index of its
        # maximum value (for 0/1 indicators, the first active label), since
        # Deepchecks expects a single label column.
        frame['label'] = np.argmax(y, axis=1)
        print(f"Note: Converted multi-label ({y.shape[1]} labels) to single-label for Deepchecks")
    else:
        frame['label'] = y

    return Dataset(frame, label='label', cat_features=[])
def run_train_test_validation_suite(save_output=True, use_cleaned=True):
    """
    Run the complete Deepchecks Train-Test Validation Suite.

    The suite bundles checks such as feature drift, label drift, samples
    mix (leakage), whole-dataset drift, feature-label correlation change,
    new labels/categories, string mismatch comparison, and date/index
    train-test leakage.

    Args:
        save_output: Whether to write a JSON summary of the results to
            'reports/deepchecks'.
        use_cleaned: If True, use cleaned data instead of original.

    Returns:
        SuiteResult: Results from the train-test validation suite.
    """
    data_type = "CLEANED" if use_cleaned else "ORIGINAL"
    banner = "=" * 80
    print(banner)
    print(f"TRAIN-TEST VALIDATION SUITE - {data_type} DATA")
    print(banner)

    # Build the two Deepchecks datasets from the processed arrays.
    X_train, y_train, X_test, y_test = load_train_test_data(use_cleaned=use_cleaned)
    train_ds = create_deepchecks_dataset(X_train, y_train, "training")
    test_ds = create_deepchecks_dataset(X_test, y_test, "test")

    print("\nRunning Train-Test Validation checks...")
    result = train_test_validation().run(train_ds, test_ds)

    print("\nTrain-Test Validation Suite completed!")
    print(f"Total checks: {len(result.results)}")

    if save_output:
        output_dir = Path("reports/deepchecks")
        output_dir.mkdir(parents=True, exist_ok=True)

        # File name distinguishes cleaned from original runs.
        suffix = "_clean" if use_cleaned else "_original"
        json_path = output_dir / f"train_test_validation_suite_results{suffix}.json"

        # Serialize a lightweight summary of every check result.
        payload = {
            "suite_name": "Train-Test Validation Suite",
            "total_checks": len(result.results),
            "timestamp": pd.Timestamp.now().isoformat(),
            "checks": [
                {
                    "check_name": check.get_header(),
                    "passed": check.passed_conditions() if hasattr(check, 'passed_conditions') else None,
                    "display": str(check.display) if hasattr(check, 'display') else None,
                }
                for check in result.results
            ],
        }
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(payload, f, indent=2, ensure_ascii=False)
        print(f"JSON results saved to: {json_path}")

    return result
def run_custom_train_test_checks(save_output=True, use_cleaned=True):
    """
    Run individual train-test validation checks tailored for the SkillScope
    dataset (NLP/TF-IDF features, multi-label classification).

    Args:
        save_output: Whether to save the HTML report.
            NOTE(review): currently unused here - results are only returned
            in memory; confirm whether report saving should be implemented.
        use_cleaned: If True, use cleaned data instead of original.

    Returns:
        dict: Mapping of check key to its Deepchecks check result.
    """
    from deepchecks.tabular.checks import (
        TrainTestFeatureDrift,
        TrainTestLabelDrift,
        TrainTestSamplesMix,
        WholeDatasetDrift,
        FeatureLabelCorrelationChange,
    )

    data_type = "CLEANED" if use_cleaned else "ORIGINAL"
    banner = "=" * 80
    print(banner)
    print(f"CUSTOM TRAIN-TEST VALIDATION CHECKS - {data_type} DATA")
    print(banner)

    X_train, y_train, X_test, y_test = load_train_test_data(use_cleaned=use_cleaned)
    train_ds = create_deepchecks_dataset(X_train, y_train, "training")
    test_ds = create_deepchecks_dataset(X_test, y_test, "test")

    # (progress message, check class, result key) for each check, in order.
    planned_checks = [
        ("\n1. Checking for feature drift between train and test...",
         TrainTestFeatureDrift, 'feature_drift'),
        ("2. Checking for label drift between train and test...",
         TrainTestLabelDrift, 'label_drift'),
        ("3. Checking for data leakage (samples appearing in both sets)...",
         TrainTestSamplesMix, 'samples_mix'),
        ("4. Checking overall dataset drift...",
         WholeDatasetDrift, 'dataset_drift'),
        ("5. Checking for changes in feature-label correlation...",
         FeatureLabelCorrelationChange, 'correlation_change'),
    ]

    results = {}
    for message, check_cls, key in planned_checks:
        print(message)
        results[key] = check_cls().run(train_ds, test_ds)

    # NewLabel is unavailable in the installed Deepchecks version, so the
    # sixth check (new labels in test absent from train) is skipped.
    print("6. Skipping NewLabel check (not available in this Deepchecks version)")

    print("\nAll custom train-test checks completed!")
    # Results are available in memory for further processing if needed
    return results
def compare_distributions(use_cleaned=True):
    """
    Print a statistical comparison of the train and test splits.

    Reports sample sizes, feature dimensions, label coverage, basic
    feature statistics, and sparsity, warning on large discrepancies.

    Args:
        use_cleaned: If True, compare cleaned data instead of original.
    """
    data_type = "CLEANED" if use_cleaned else "ORIGINAL"
    banner = "=" * 80
    print(banner)
    print(f"TRAIN-TEST DISTRIBUTION COMPARISON - {data_type} DATA")
    print(banner)

    X_train, y_train, X_test, y_test = load_train_test_data(use_cleaned=use_cleaned)

    print("\n1. SAMPLE SIZES:")
    print(f" Training: {X_train.shape[0]} samples")
    print(f" Test: {X_test.shape[0]} samples")
    print(f" Train/Test ratio: {X_train.shape[0]/X_test.shape[0]:.2f}")

    print("\n2. FEATURE DIMENSIONS:")
    print(f" Training features: {X_train.shape[1]}")
    print(f" Test features: {X_test.shape[1]}")
    if X_train.shape[1] == X_test.shape[1]:
        print(" ✓ Feature dimensions match")
    else:
        print(" WARNING: Feature dimensions don't match!")

    print("\n3. LABEL DISTRIBUTION:")
    train_labels = set(np.unique(y_train))
    test_labels = set(np.unique(y_test))
    print(f" Training unique labels: {len(train_labels)}")
    print(f" Test unique labels: {len(test_labels)}")

    # Labels appearing only in the test set indicate a problematic split.
    new_labels = test_labels - train_labels
    if new_labels:
        print(f" WARNING: {len(new_labels)} new labels in test set: {new_labels}")
    else:
        print(" No new labels in test set")

    # Labels seen only during training are informational, not fatal.
    missing_labels = train_labels - test_labels
    if missing_labels:
        print(f" INFO: {len(missing_labels)} labels only in train set")

    print("\n4. FEATURE STATISTICS COMPARISON:")
    train_mean, train_std = X_train.mean(), X_train.std()
    test_mean, test_std = X_test.mean(), X_test.std()
    print(f" Train - Mean: {train_mean:.4f}, Std: {train_std:.4f}")
    print(f" Test - Mean: {test_mean:.4f}, Std: {test_std:.4f}")
    mean_diff = abs(train_mean - test_mean)
    std_diff = abs(train_std - test_std)
    print(f" Mean difference: {mean_diff:.4f}")
    print(f" Std difference: {std_diff:.4f}")
    if mean_diff > 0.1 or std_diff > 0.1:
        print(" WARNING: Significant statistical differences detected!")
    else:
        print(" Statistical distributions are similar")

    print("\n5. SPARSITY COMPARISON:")
    train_sparsity = (X_train == 0).sum() / X_train.size * 100
    test_sparsity = (X_test == 0).sum() / X_test.size * 100
    sparsity_gap = abs(train_sparsity - test_sparsity)
    print(f" Training sparsity: {train_sparsity:.2f}%")
    print(f" Test sparsity: {test_sparsity:.2f}%")
    print(f" Sparsity difference: {sparsity_gap:.2f}%")
    if sparsity_gap > 5:
        print(" WARNING: Significant sparsity difference!")
    else:
        print(" Sparsity levels are similar")
def validate_split_quality(use_cleaned=True):
    """
    Validate the quality of the train-test split.

    Checks that the test-set proportion lies in the commonly recommended
    range, then compares the label distributions of the two splits using a
    chi-square goodness-of-fit test restricted to labels present in both.

    Args:
        use_cleaned: If True, validate cleaned data instead of original.
    """
    data_type = "CLEANED" if use_cleaned else "ORIGINAL"
    print("="*80)
    print(f"TRAIN-TEST SPLIT QUALITY VALIDATION - {data_type} DATA")
    print("="*80)
    X_train, y_train, X_test, y_test = load_train_test_data(use_cleaned=use_cleaned)

    total_samples = X_train.shape[0] + X_test.shape[0]
    test_ratio = X_test.shape[0] / total_samples
    print(f"\nTotal samples: {total_samples}")
    print(f"Test set ratio: {test_ratio:.2%}")

    # Validate test set size (typically 20-30%)
    if 0.15 <= test_ratio <= 0.35:
        print(" Test set size is within recommended range (15-35%)")
    else:
        print(" WARNING: Test set size is outside recommended range")

    # Check label distribution similarity on labels common to both splits
    from scipy.stats import chisquare

    common_labels = np.intersect1d(np.unique(y_train), np.unique(y_test))
    if len(common_labels) > 0:
        train_dist = np.array([np.sum(y_train == label) for label in common_labels])
        test_dist = np.array([np.sum(y_test == label) for label in common_labels])
        # scipy.stats.chisquare requires sum(f_obs) == sum(f_exp); the old
        # code scaled the test proportions by len(y_train), which mismatches
        # the observed total (and makes scipy raise ValueError) whenever a
        # label is absent from one split or the labels are multi-hot 2D.
        # Scale the test-set proportions to the observed train total instead.
        test_props = test_dist / test_dist.sum()
        expected = test_props * train_dist.sum()
        chi_stat, p_value = chisquare(train_dist, expected)
        print(f"\nLabel distribution similarity (chi-square test):")
        print(f" Chi-square statistic: {chi_stat:.4f}")
        print(f" P-value: {p_value:.4f}")
        if p_value > 0.05:
            print(" Label distributions are statistically similar (p > 0.05)")
        else:
            print(" WARNING: Label distributions differ significantly (p <= 0.05)")
    else:
        print(" WARNING: No common labels between train and test sets!")
if __name__ == "__main__":
    import sys

    # Cleaned data is the default; the --original / -o flag selects the
    # pre-cleaning files instead.
    use_cleaned = '--original' not in sys.argv and '-o' not in sys.argv
    if use_cleaned:
        print("Testing CLEANED data (from data_cleaning.py) - DEFAULT\n")
    else:
        print("Testing ORIGINAL data\n")
        print("Note: Using --original flag to test old data\n")

    # Run every validation stage in order, separated by blank lines.
    compare_distributions(use_cleaned=use_cleaned)
    print("\n")
    validate_split_quality(use_cleaned=use_cleaned)
    print("\n")
    suite_result = run_train_test_validation_suite(save_output=True, use_cleaned=use_cleaned)
    print("\n")
    custom_results = run_custom_train_test_checks(save_output=True, use_cleaned=use_cleaned)

    print("\n" + "="*80)
    print("TRAIN-TEST VALIDATION COMPLETED")
    print("="*80)
    print("\nCheck the reports in the 'reports/deepchecks' directory")