|
|
""" |
|
|
Train-Test Validation Suite - Deepchecks validation for train-test consistency |
|
|
|
|
|
This module implements comprehensive train-test validation checks using Deepchecks |
|
|
to ensure consistency and proper splitting between training and test datasets. |
|
|
|
|
|
Checks included: |
|
|
- Train-Test Feature Drift: Detects distribution changes between train and test |
|
|
- Train-Test Label Drift: Checks if label distribution differs |
|
|
- Train-Test Samples Mix: Validates no data leakage |
|
|
- Whole Dataset Drift: Overall distribution comparison |
|
|
- Feature Label Correlation Change: Checks if correlations change |
|
|
- New Label: Detects labels in test not present in train |
|
|
- New Category: Detects new categorical values in test |
|
|
- String Mismatch Comparison: Compares string inconsistencies |
|
|
- Date Train Test Leakage Duplicates: Checks for temporal leakage |
|
|
- Date Train Test Leakage Overlap: Validates proper temporal split |
|
|
""" |
|
|
|
|
|
import numpy as np |
|
|
import pandas as pd |
|
|
import json |
|
|
from pathlib import Path |
|
|
from deepchecks.tabular import Dataset |
|
|
from deepchecks.tabular.suites import train_test_validation |
|
|
|
|
|
from hopcroft_skill_classification_tool_competition.config import PROCESSED_DATA_DIR |
|
|
|
|
|
|
|
|
def load_train_test_data(use_cleaned=True):
    """
    Load training and test datasets from the processed data directory.

    Args:
        use_cleaned: If True, load cleaned data (files with the '_clean'
            suffix) - DEFAULT. Otherwise load the original files.

    Returns:
        tuple: (X_train, y_train, X_test, y_test) as numpy arrays.

    Raises:
        FileNotFoundError: If any expected .npy file is missing, with a
            message naming the missing path (clearer than np.load's error).
    """
    tfidf_dir = PROCESSED_DATA_DIR / "tfidf"

    # The cleaned and original variants differ only by a '_clean' suffix,
    # so build the four paths from one template instead of two branches.
    suffix = "_clean" if use_cleaned else ""
    data_type = "cleaned" if use_cleaned else "original"

    train_features = tfidf_dir / f"features_tfidf{suffix}.npy"
    train_labels = tfidf_dir / f"labels_tfidf{suffix}.npy"
    test_features = tfidf_dir / f"X_test{suffix}.npy"
    test_labels = tfidf_dir / f"Y_test{suffix}.npy"

    # Fail early with an explicit message rather than an opaque load error.
    for path in (train_features, train_labels, test_features, test_labels):
        if not path.exists():
            raise FileNotFoundError(f"Expected data file not found: {path}")

    X_train = np.load(train_features)
    y_train = np.load(train_labels)
    X_test = np.load(test_features)
    y_test = np.load(test_labels)

    print(f"Loaded {data_type} data:")
    print(f"Training set shape: X={X_train.shape}, y={y_train.shape}")
    print(f"Test set shape: X={X_test.shape}, y={y_test.shape}")

    return X_train, y_train, X_test, y_test
|
|
|
|
|
|
|
|
def create_deepchecks_dataset(X, y, dataset_name="dataset"):
    """
    Build a Deepchecks Dataset from raw numpy feature/label arrays.

    Args:
        X: Feature matrix (numpy array, samples x features).
        y: Labels (numpy array) - either multi-label / one-hot (2D) or
            already single-label (1D).
        dataset_name: Name identifier for the dataset (not forwarded to
            the Dataset constructor in this implementation).

    Returns:
        Dataset: Deepchecks Dataset with all feature columns treated as
            numeric (cat_features is empty).
    """
    n_features = X.shape[1]
    frame = pd.DataFrame(X, columns=[f"feature_{i}" for i in range(n_features)])

    # Deepchecks tabular checks expect a single label column, so a 2D
    # multi-label matrix is collapsed to the index of its largest entry.
    is_multilabel = y.ndim > 1 and y.shape[1] > 1
    if is_multilabel:
        frame['label'] = np.argmax(y, axis=1)
        print(f"Note: Converted multi-label ({y.shape[1]} labels) to single-label for Deepchecks")
    else:
        frame['label'] = y

    return Dataset(frame, label='label', cat_features=[])
|
|
|
|
|
|
|
|
def run_train_test_validation_suite(save_output=True, use_cleaned=True):
    """
    Run the complete Deepchecks Train-Test Validation Suite.

    The built-in suite bundles checks such as train-test feature drift,
    label drift, samples mix (leakage), whole-dataset drift, feature-label
    correlation change, new labels/categories, string mismatch comparison
    and date/index leakage (the exact set depends on the installed
    Deepchecks version).

    Args:
        save_output: Whether to save a JSON summary of the results under
            reports/deepchecks/. NOTE: a JSON file is written, not an
            HTML report (the previous docstring was wrong about this).
        use_cleaned: If True, use cleaned data instead of original.

    Returns:
        SuiteResult: Results from the train-test validation suite.
    """
    data_type = "CLEANED" if use_cleaned else "ORIGINAL"
    print("="*80)
    print(f"TRAIN-TEST VALIDATION SUITE - {data_type} DATA")
    print("="*80)

    X_train, y_train, X_test, y_test = load_train_test_data(use_cleaned=use_cleaned)

    train_dataset = create_deepchecks_dataset(X_train, y_train, "training")
    test_dataset = create_deepchecks_dataset(X_test, y_test, "test")

    print("\nRunning Train-Test Validation checks...")
    suite = train_test_validation()
    result = suite.run(train_dataset, test_dataset)

    print("\nTrain-Test Validation Suite completed!")
    print(f"Total checks: {len(result.results)}")

    if save_output:
        output_dir = Path("reports/deepchecks")
        output_dir.mkdir(parents=True, exist_ok=True)

        suffix = "_clean" if use_cleaned else "_original"
        json_path = output_dir / f"train_test_validation_suite_results{suffix}.json"
        json_results = {
            "suite_name": "Train-Test Validation Suite",
            "total_checks": len(result.results),
            "timestamp": pd.Timestamp.now().isoformat(),
            "checks": []
        }

        for check_result in result.results:
            # Individual CheckResult APIs vary between Deepchecks
            # versions, so optional attributes are guarded with hasattr.
            check_data = {
                "check_name": check_result.get_header(),
                "passed": check_result.passed_conditions() if hasattr(check_result, 'passed_conditions') else None,
                "display": str(check_result.display) if hasattr(check_result, 'display') else None
            }
            json_results["checks"].append(check_data)

        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(json_results, f, indent=2, ensure_ascii=False)
        print(f"JSON results saved to: {json_path}")

    return result
|
|
|
|
|
|
|
|
def run_custom_train_test_checks(save_output=True, use_cleaned=True):
    """
    Run a hand-picked set of train-test validation checks for the
    SkillScope dataset (NLP/TF-IDF features, multi-label task).

    Args:
        save_output: Accepted for interface symmetry with the other suite
            runners, but currently unused - no report is written here.
        use_cleaned: If True, use cleaned data instead of original.

    Returns:
        dict: Mapping of check key to the corresponding CheckResult.
    """
    # Imported lazily: these check classes exist in this Deepchecks
    # version but are deprecated/renamed in newer releases.
    from deepchecks.tabular.checks import (
        TrainTestFeatureDrift,
        TrainTestLabelDrift,
        TrainTestSamplesMix,
        WholeDatasetDrift,
        FeatureLabelCorrelationChange,
    )

    data_type = "CLEANED" if use_cleaned else "ORIGINAL"
    print("="*80)
    print(f"CUSTOM TRAIN-TEST VALIDATION CHECKS - {data_type} DATA")
    print("="*80)

    X_train, y_train, X_test, y_test = load_train_test_data(use_cleaned=use_cleaned)
    train_dataset = create_deepchecks_dataset(X_train, y_train, "training")
    test_dataset = create_deepchecks_dataset(X_test, y_test, "test")

    # Each entry: (results key, progress message, check instance).
    planned_checks = [
        ('feature_drift',
         "\n1. Checking for feature drift between train and test...",
         TrainTestFeatureDrift()),
        ('label_drift',
         "2. Checking for label drift between train and test...",
         TrainTestLabelDrift()),
        ('samples_mix',
         "3. Checking for data leakage (samples appearing in both sets)...",
         TrainTestSamplesMix()),
        ('dataset_drift',
         "4. Checking overall dataset drift...",
         WholeDatasetDrift()),
        ('correlation_change',
         "5. Checking for changes in feature-label correlation...",
         FeatureLabelCorrelationChange()),
    ]

    results = {}
    for key, message, check in planned_checks:
        print(message)
        results[key] = check.run(train_dataset, test_dataset)

    print("6. Skipping NewLabel check (not available in this Deepchecks version)")

    print("\nAll custom train-test checks completed!")

    return results
|
|
|
|
|
|
|
|
def compare_distributions(use_cleaned=True):
    """
    Print a statistical comparison of the train and test splits.

    Covers sample sizes, feature dimensionality, label-set overlap,
    global feature mean/std and sparsity. Output goes to stdout only;
    nothing is returned or saved.

    Args:
        use_cleaned: If True, compare cleaned data instead of original.
    """
    data_type = "CLEANED" if use_cleaned else "ORIGINAL"
    print("="*80)
    print(f"TRAIN-TEST DISTRIBUTION COMPARISON - {data_type} DATA")
    print("="*80)

    X_train, y_train, X_test, y_test = load_train_test_data(use_cleaned=use_cleaned)

    print("\n1. SAMPLE SIZES:")
    print(f" Training: {X_train.shape[0]} samples")
    print(f" Test: {X_test.shape[0]} samples")
    print(f" Train/Test ratio: {X_train.shape[0]/X_test.shape[0]:.2f}")

    print("\n2. FEATURE DIMENSIONS:")
    print(f" Training features: {X_train.shape[1]}")
    print(f" Test features: {X_test.shape[1]}")
    if X_train.shape[1] == X_test.shape[1]:
        print(" ✓ Feature dimensions match")
    else:
        print(" WARNING: Feature dimensions don't match!")

    print("\n3. LABEL DISTRIBUTION:")
    # NOTE(review): for a 2D multi-label y, np.unique sees the element
    # values (likely 0/1), not per-row label sets - confirm upstream shape.
    train_unique, train_counts = np.unique(y_train, return_counts=True)
    test_unique, test_counts = np.unique(y_test, return_counts=True)

    print(f" Training unique labels: {len(train_unique)}")
    print(f" Test unique labels: {len(test_unique)}")

    train_label_set = set(train_unique)
    test_label_set = set(test_unique)

    # Labels seen only in the test set indicate a problematic split.
    new_labels = test_label_set - train_label_set
    if new_labels:
        print(f" WARNING: {len(new_labels)} new labels in test set: {new_labels}")
    else:
        print(" No new labels in test set")

    # Labels seen only in training are informational, not a hard failure.
    missing_labels = train_label_set - test_label_set
    if missing_labels:
        print(f" INFO: {len(missing_labels)} labels only in train set")

    print("\n4. FEATURE STATISTICS COMPARISON:")
    print(f" Train - Mean: {X_train.mean():.4f}, Std: {X_train.std():.4f}")
    print(f" Test - Mean: {X_test.mean():.4f}, Std: {X_test.std():.4f}")

    mean_diff = abs(X_train.mean() - X_test.mean())
    std_diff = abs(X_train.std() - X_test.std())

    print(f" Mean difference: {mean_diff:.4f}")
    print(f" Std difference: {std_diff:.4f}")

    # 0.1 is a heuristic alert threshold on the global statistics.
    if mean_diff > 0.1 or std_diff > 0.1:
        print(" WARNING: Significant statistical differences detected!")
    else:
        print(" Statistical distributions are similar")

    print("\n5. SPARSITY COMPARISON:")
    train_sparsity = (X_train == 0).sum() / X_train.size * 100
    test_sparsity = (X_test == 0).sum() / X_test.size * 100
    sparsity_gap = abs(train_sparsity - test_sparsity)
    print(f" Training sparsity: {train_sparsity:.2f}%")
    print(f" Test sparsity: {test_sparsity:.2f}%")
    print(f" Sparsity difference: {sparsity_gap:.2f}%")

    # More than 5 percentage points of sparsity gap is flagged.
    if sparsity_gap > 5:
        print(" WARNING: Significant sparsity difference!")
    else:
        print(" Sparsity levels are similar")
|
|
def validate_split_quality(use_cleaned=True):
    """
    Validate the quality of the train-test split.

    Checks the test-set size ratio against a 15-35% guideline, then runs
    a chi-square goodness-of-fit test comparing the train-set label
    frequencies against the proportions observed in the test set.

    Args:
        use_cleaned: If True, validate cleaned data instead of original.
    """
    data_type = "CLEANED" if use_cleaned else "ORIGINAL"
    print("="*80)
    print(f"TRAIN-TEST SPLIT QUALITY VALIDATION - {data_type} DATA")
    print("="*80)

    X_train, y_train, X_test, y_test = load_train_test_data(use_cleaned=use_cleaned)

    total_samples = X_train.shape[0] + X_test.shape[0]
    test_ratio = X_test.shape[0] / total_samples

    print(f"\nTotal samples: {total_samples}")
    print(f"Test set ratio: {test_ratio:.2%}")

    if 0.15 <= test_ratio <= 0.35:
        print(" Test set size is within recommended range (15-35%)")
    else:
        print(" WARNING: Test set size is outside recommended range")

    from scipy.stats import chisquare

    # NOTE(review): for a 2D multi-label y, np.unique / np.sum operate on
    # element values (0/1), so the distributions compared below are over
    # label *values*, not per-row classes - confirm upstream shape.
    common_labels = np.intersect1d(np.unique(y_train), np.unique(y_test))

    if len(common_labels) > 0:
        train_dist = np.array([np.sum(y_train == label) for label in common_labels])
        test_dist = np.array([np.sum(y_test == label) for label in common_labels])

        # Normalize over the common labels only (not len(y_test)): for 2D
        # arrays np.sum counts elements rather than rows, and labels
        # outside the intersection would otherwise skew the proportions.
        test_props = test_dist / test_dist.sum()

        # BUGFIX: expected counts must sum to the observed total or
        # scipy.stats.chisquare raises ValueError. The previous code used
        # test_props * len(y_train), whose sum generally differs from
        # sum(train_dist). Scale test-set proportions to the train total.
        expected = test_props * train_dist.sum()
        chi_stat, p_value = chisquare(train_dist, expected)

        print(f"\nLabel distribution similarity (chi-square test):")
        print(f" Chi-square statistic: {chi_stat:.4f}")
        print(f" P-value: {p_value:.4f}")

        if p_value > 0.05:
            print(" Label distributions are statistically similar (p > 0.05)")
        else:
            print(" WARNING: Label distributions differ significantly (p <= 0.05)")
    else:
        print(" WARNING: No common labels between train and test sets!")
|
|
|
|
|
if __name__ == "__main__":
    import sys

    # Cleaned data is the default; '--original' / '-o' selects the old files.
    wants_original = any(flag in sys.argv for flag in ('--original', '-o'))
    use_cleaned = not wants_original

    if use_cleaned:
        print("Testing CLEANED data (from data_cleaning.py) - DEFAULT\n")
    else:
        print("Testing ORIGINAL data\n")
        print("Note: Using --original flag to test old data\n")

    # Lightweight statistical comparisons first, then the Deepchecks suites.
    compare_distributions(use_cleaned=use_cleaned)

    print("\n")
    validate_split_quality(use_cleaned=use_cleaned)

    print("\n")
    suite_result = run_train_test_validation_suite(save_output=True, use_cleaned=use_cleaned)

    print("\n")
    custom_results = run_custom_train_test_checks(save_output=True, use_cleaned=use_cleaned)

    banner = "="*80
    print("\n" + banner)
    print("TRAIN-TEST VALIDATION COMPLETED")
    print(banner)
    print("\nCheck the reports in the 'reports/deepchecks' directory")
|