File size: 15,642 Bytes
225af6a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 |
"""
Train-Test Validation Suite - Deepchecks validation for train-test consistency
This module implements comprehensive train-test validation checks using Deepchecks
to ensure consistency and proper splitting between training and test datasets.
Checks included:
- Train-Test Feature Drift: Detects distribution changes between train and test
- Train-Test Label Drift: Checks if label distribution differs
- Train-Test Samples Mix: Validates no data leakage
- Whole Dataset Drift: Overall distribution comparison
- Feature Label Correlation Change: Checks if correlations change
- New Label: Detects labels in test not present in train
- New Category: Detects new categorical values in test
- String Mismatch Comparison: Compares string inconsistencies
- Date Train Test Leakage Duplicates: Checks for temporal leakage
- Date Train Test Leakage Overlap: Validates proper temporal split
"""
import numpy as np
import pandas as pd
import json
from pathlib import Path
from deepchecks.tabular import Dataset
from deepchecks.tabular.suites import train_test_validation
from hopcroft_skill_classification_tool_competition.config import PROCESSED_DATA_DIR
def load_train_test_data(use_cleaned=True):
    """
    Load the train and test feature/label arrays from the processed data dir.

    Args:
        use_cleaned: If True (default), load the '_clean'-suffixed arrays
            produced by the cleaning step; otherwise load the original arrays.

    Returns:
        tuple: (X_train, y_train, X_test, y_test) as numpy arrays
    """
    tfidf_dir = PROCESSED_DATA_DIR / "tfidf"
    # Select the variant once; the four file names always come in this order:
    # train features, train labels, test features, test labels.
    if use_cleaned:
        data_type = "cleaned"
        file_names = (
            "features_tfidf_clean.npy",
            "labels_tfidf_clean.npy",
            "X_test_clean.npy",
            "Y_test_clean.npy",
        )
    else:
        data_type = "original"
        file_names = (
            "features_tfidf.npy",
            "labels_tfidf.npy",
            "X_test.npy",
            "Y_test.npy",
        )
    X_train, y_train, X_test, y_test = (
        np.load(tfidf_dir / name) for name in file_names
    )
    print(f"Loaded {data_type} data:")
    print(f"Training set shape: X={X_train.shape}, y={y_train.shape}")
    print(f"Test set shape: X={X_test.shape}, y={y_test.shape}")
    return X_train, y_train, X_test, y_test
def create_deepchecks_dataset(X, y, dataset_name="dataset"):
    """
    Wrap numpy feature/label arrays in a Deepchecks tabular Dataset.

    Args:
        X: Feature matrix (2D numpy array)
        y: Labels — either 1D (single-label) or 2D (multi-label indicator)
        dataset_name: Identifier kept for interface symmetry
            (not consumed by the Dataset constructor)

    Returns:
        Dataset: Deepchecks Dataset with a single 'label' column
    """
    # Synthetic column names feature_0..feature_{d-1} for readable reports.
    columns = [f"feature_{idx}" for idx in range(X.shape[1])]
    frame = pd.DataFrame(X, columns=columns)
    if y.ndim > 1 and y.shape[1] > 1:
        # Multi-label indicator matrix: collapse each row to the index of its
        # maximum entry (for 0/1 rows, argmax returns the first active label).
        frame['label'] = np.argmax(y, axis=1)
        print(f"Note: Converted multi-label ({y.shape[1]} labels) to single-label for Deepchecks")
    else:
        frame['label'] = y
    # All TF-IDF columns are numeric, so no categorical features are declared.
    return Dataset(frame, label='label', cat_features=[])
def run_train_test_validation_suite(save_output=True, use_cleaned=True):
    """
    Run Deepchecks' built-in Train-Test Validation Suite end to end.

    The suite bundles feature/label drift, train-test samples mix (leakage),
    whole-dataset drift, feature-label correlation change, new label/category
    detection, string mismatch comparison, and date/index leakage checks.

    Args:
        save_output: Whether to persist a JSON summary of the check results
        use_cleaned: If True, use cleaned data instead of original

    Returns:
        SuiteResult: Results from the train-test validation suite
    """
    variant = "CLEANED" if use_cleaned else "ORIGINAL"
    bar = "=" * 80
    print(bar)
    print(f"TRAIN-TEST VALIDATION SUITE - {variant} DATA")
    print(bar)
    # Load arrays and wrap them as Deepchecks datasets.
    X_train, y_train, X_test, y_test = load_train_test_data(use_cleaned=use_cleaned)
    train_ds = create_deepchecks_dataset(X_train, y_train, "training")
    test_ds = create_deepchecks_dataset(X_test, y_test, "test")
    print("\nRunning Train-Test Validation checks...")
    result = train_test_validation().run(train_ds, test_ds)
    print("\nTrain-Test Validation Suite completed!")
    print(f"Total checks: {len(result.results)}")
    if save_output:
        output_dir = Path("reports/deepchecks")
        output_dir.mkdir(parents=True, exist_ok=True)
        # Suffix distinguishes reports produced from cleaned vs original data.
        suffix = "_clean" if use_cleaned else "_original"
        json_path = output_dir / f"train_test_validation_suite_results{suffix}.json"
        summary = {
            "suite_name": "Train-Test Validation Suite",
            "total_checks": len(result.results),
            "timestamp": pd.Timestamp.now().isoformat(),
            # hasattr guards keep this robust across Deepchecks versions where
            # a result object may lack passed_conditions() or display.
            "checks": [
                {
                    "check_name": cr.get_header(),
                    "passed": cr.passed_conditions() if hasattr(cr, 'passed_conditions') else None,
                    "display": str(cr.display) if hasattr(cr, 'display') else None,
                }
                for cr in result.results
            ],
        }
        with open(json_path, 'w', encoding='utf-8') as fh:
            json.dump(summary, fh, indent=2, ensure_ascii=False)
        print(f"JSON results saved to: {json_path}")
    return result
def run_custom_train_test_checks(save_output=True, use_cleaned=True):
    """
    Run a hand-picked set of train-test validation checks one by one.

    Tailored to the NLP/TF-IDF multi-label setup: drift on features and
    labels, sample leakage, whole-dataset drift, and feature-label
    correlation change.

    Args:
        save_output: Accepted for interface symmetry with the suite runner;
            this function currently persists nothing itself
        use_cleaned: If True, use cleaned data instead of original

    Returns:
        dict: Mapping of check key -> Deepchecks CheckResult
    """
    from deepchecks.tabular.checks import (
        TrainTestFeatureDrift,
        TrainTestLabelDrift,
        TrainTestSamplesMix,
        WholeDatasetDrift,
        FeatureLabelCorrelationChange,
    )
    variant = "CLEANED" if use_cleaned else "ORIGINAL"
    bar = "=" * 80
    print(bar)
    print(f"CUSTOM TRAIN-TEST VALIDATION CHECKS - {variant} DATA")
    print(bar)
    X_train, y_train, X_test, y_test = load_train_test_data(use_cleaned=use_cleaned)
    train_ds = create_deepchecks_dataset(X_train, y_train, "training")
    test_ds = create_deepchecks_dataset(X_test, y_test, "test")
    # (progress message, result key, check class) — executed in this order.
    check_plan = (
        ("\n1. Checking for feature drift between train and test...",
         'feature_drift', TrainTestFeatureDrift),
        ("2. Checking for label drift between train and test...",
         'label_drift', TrainTestLabelDrift),
        ("3. Checking for data leakage (samples appearing in both sets)...",
         'samples_mix', TrainTestSamplesMix),
        ("4. Checking overall dataset drift...",
         'dataset_drift', WholeDatasetDrift),
        ("5. Checking for changes in feature-label correlation...",
         'correlation_change', FeatureLabelCorrelationChange),
    )
    results = {}
    for message, key, check_cls in check_plan:
        print(message)
        results[key] = check_cls().run(train_ds, test_ds)
    # NewLabel is not available in this Deepchecks version, so step 6 is
    # announced but intentionally skipped.
    print("6. Skipping NewLabel check (not available in this Deepchecks version)")
    print("\nAll custom train-test checks completed!")
    # Results stay in memory for callers that want further processing.
    return results
def compare_distributions(use_cleaned=True):
    """
    Print a side-by-side statistical comparison of the train and test sets.

    Covers sample sizes, feature dimensionality, label overlap, global
    mean/std of the feature matrices, and sparsity (percentage of zeros).

    Args:
        use_cleaned: If True, compare cleaned data instead of original
    """
    variant = "CLEANED" if use_cleaned else "ORIGINAL"
    bar = "=" * 80
    print(bar)
    print(f"TRAIN-TEST DISTRIBUTION COMPARISON - {variant} DATA")
    print(bar)
    X_train, y_train, X_test, y_test = load_train_test_data(use_cleaned=use_cleaned)

    print("\n1. SAMPLE SIZES:")
    print(f" Training: {X_train.shape[0]} samples")
    print(f" Test: {X_test.shape[0]} samples")
    print(f" Train/Test ratio: {X_train.shape[0]/X_test.shape[0]:.2f}")

    print("\n2. FEATURE DIMENSIONS:")
    print(f" Training features: {X_train.shape[1]}")
    print(f" Test features: {X_test.shape[1]}")
    if X_train.shape[1] == X_test.shape[1]:
        print(" ✓ Feature dimensions match")
    else:
        print(" WARNING: Feature dimensions don't match!")

    print("\n3. LABEL DISTRIBUTION:")
    train_unique = np.unique(y_train)
    test_unique = np.unique(y_test)
    print(f" Training unique labels: {len(train_unique)}")
    print(f" Test unique labels: {len(test_unique)}")
    # Labels present in test but never seen during training are a red flag.
    new_labels = set(test_unique) - set(train_unique)
    if new_labels:
        print(f" WARNING: {len(new_labels)} new labels in test set: {new_labels}")
    else:
        print(" No new labels in test set")
    # Train-only labels are merely informational (model just never gets tested on them).
    missing_labels = set(train_unique) - set(test_unique)
    if missing_labels:
        print(f" INFO: {len(missing_labels)} labels only in train set")

    print("\n4. FEATURE STATISTICS COMPARISON:")
    print(f" Train - Mean: {X_train.mean():.4f}, Std: {X_train.std():.4f}")
    print(f" Test - Mean: {X_test.mean():.4f}, Std: {X_test.std():.4f}")
    mean_gap = abs(X_train.mean() - X_test.mean())
    std_gap = abs(X_train.std() - X_test.std())
    print(f" Mean difference: {mean_gap:.4f}")
    print(f" Std difference: {std_gap:.4f}")
    # 0.1 is a heuristic threshold for "noticeably different" global stats.
    if mean_gap > 0.1 or std_gap > 0.1:
        print(" WARNING: Significant statistical differences detected!")
    else:
        print(" Statistical distributions are similar")

    print("\n5. SPARSITY COMPARISON:")
    train_sparsity = (X_train == 0).sum() / X_train.size * 100
    test_sparsity = (X_test == 0).sum() / X_test.size * 100
    sparsity_gap = abs(train_sparsity - test_sparsity)
    print(f" Training sparsity: {train_sparsity:.2f}%")
    print(f" Test sparsity: {test_sparsity:.2f}%")
    print(f" Sparsity difference: {sparsity_gap:.2f}%")
    # More than 5 percentage points of sparsity gap suggests different preprocessing.
    if sparsity_gap > 5:
        print(" WARNING: Significant sparsity difference!")
    else:
        print(" Sparsity levels are similar")
def validate_split_quality(use_cleaned=True):
    """
    Validate the quality of the train-test split.

    Checks the test-set ratio against the common 15-35% guideline, then runs
    a chi-square goodness-of-fit test comparing the train label counts to the
    test label distribution over the labels both sets share.

    Args:
        use_cleaned: If True, validate cleaned data instead of original
    """
    data_type = "CLEANED" if use_cleaned else "ORIGINAL"
    print("="*80)
    print(f"TRAIN-TEST SPLIT QUALITY VALIDATION - {data_type} DATA")
    print("="*80)
    X_train, y_train, X_test, y_test = load_train_test_data(use_cleaned=use_cleaned)
    total_samples = X_train.shape[0] + X_test.shape[0]
    test_ratio = X_test.shape[0] / total_samples
    print(f"\nTotal samples: {total_samples}")
    print(f"Test set ratio: {test_ratio:.2%}")
    # Validate test set size (typically 20-30%)
    if 0.15 <= test_ratio <= 0.35:
        print(" Test set size is within recommended range (15-35%)")
    else:
        print(" WARNING: Test set size is outside recommended range")
    # Check label distribution similarity
    from scipy.stats import chisquare
    # Only labels present in BOTH sets can be compared.
    common_labels = np.intersect1d(np.unique(y_train), np.unique(y_test))
    if len(common_labels) > 0:
        train_dist = np.array([np.sum(y_train == label) for label in common_labels])
        test_dist = np.array([np.sum(y_test == label) for label in common_labels])
        # BUGFIX: chisquare requires sum(observed) == sum(expected). The old
        # code scaled test proportions (normalized by len(y_test)) up to
        # len(y_train); whenever either split had labels outside the common
        # set (or labels were multi-dimensional), the totals disagreed and
        # scipy either raised or produced a meaningless statistic. Normalize
        # both over the common-label totals instead.
        test_props = test_dist / test_dist.sum()
        expected = test_props * train_dist.sum()
        chi_stat, p_value = chisquare(train_dist, expected)
        print(f"\nLabel distribution similarity (chi-square test):")
        print(f" Chi-square statistic: {chi_stat:.4f}")
        print(f" P-value: {p_value:.4f}")
        if p_value > 0.05:
            print(" Label distributions are statistically similar (p > 0.05)")
        else:
            print(" WARNING: Label distributions differ significantly (p <= 0.05)")
    else:
        print(" WARNING: No common labels between train and test sets!")
if __name__ == "__main__":
    import sys

    # Cleaned data is the default; pass --original (or -o) to validate the
    # pre-cleaning arrays instead.
    use_cleaned = set(sys.argv).isdisjoint({'--original', '-o'})
    if use_cleaned:
        print("Testing CLEANED data (from data_cleaning.py) - DEFAULT\n")
    else:
        print("Testing ORIGINAL data\n")
        print("Note: Using --original flag to test old data\n")

    # 1) Quick statistical comparison of the two splits.
    compare_distributions(use_cleaned=use_cleaned)
    # 2) Split-ratio and label-distribution sanity checks.
    print("\n")
    validate_split_quality(use_cleaned=use_cleaned)
    # 3) Full Deepchecks built-in suite (writes a JSON summary).
    print("\n")
    suite_result = run_train_test_validation_suite(save_output=True, use_cleaned=use_cleaned)
    # 4) Hand-picked individual checks.
    print("\n")
    custom_results = run_custom_train_test_checks(save_output=True, use_cleaned=use_cleaned)

    print("\n" + "=" * 80)
    print("TRAIN-TEST VALIDATION COMPLETED")
    print("=" * 80)
    print("\nCheck the reports in the 'reports/deepchecks' directory")
|