# tests/deepchecks/test_train_test_validation.py
"""
Train-Test Validation Suite - Deepchecks validation for train-test consistency
This module implements comprehensive train-test validation checks using Deepchecks
to ensure consistency and proper splitting between training and test datasets.
Checks included:
- Train-Test Feature Drift: Detects distribution changes between train and test
- Train-Test Label Drift: Checks if label distribution differs
- Train-Test Samples Mix: Validates no data leakage
- Whole Dataset Drift: Overall distribution comparison
- Feature Label Correlation Change: Checks if correlations change
- New Label: Detects labels in test not present in train
- New Category: Detects new categorical values in test
- String Mismatch Comparison: Compares string inconsistencies
- Date Train Test Leakage Duplicates: Checks for temporal leakage
- Date Train Test Leakage Overlap: Validates proper temporal split
"""
import numpy as np
import pandas as pd
import json
from pathlib import Path
from deepchecks.tabular import Dataset
from deepchecks.tabular.suites import train_test_validation
from hopcroft_skill_classification_tool_competition.config import PROCESSED_DATA_DIR
def load_train_test_data(use_cleaned=True):
    """
    Load the train/test feature and label arrays from the processed data dir.

    Args:
        use_cleaned: If True (default), load the cleaned variants of the
            .npy files (those carrying a '_clean' suffix).

    Returns:
        tuple: (X_train, y_train, X_test, y_test) as numpy arrays.
    """
    tfidf_dir = PROCESSED_DATA_DIR / "tfidf"
    # The cleaning pipeline writes the same four files with a '_clean' suffix.
    suffix = "_clean" if use_cleaned else ""
    data_type = "cleaned" if use_cleaned else "original"
    X_train = np.load(tfidf_dir / f"features_tfidf{suffix}.npy")
    y_train = np.load(tfidf_dir / f"labels_tfidf{suffix}.npy")
    X_test = np.load(tfidf_dir / f"X_test{suffix}.npy")
    y_test = np.load(tfidf_dir / f"Y_test{suffix}.npy")
    print(f"Loaded {data_type} data:")
    print(f"Training set shape: X={X_train.shape}, y={y_train.shape}")
    print(f"Test set shape: X={X_test.shape}, y={y_test.shape}")
    return X_train, y_train, X_test, y_test
def create_deepchecks_dataset(X, y, dataset_name="dataset"):
    """
    Wrap numpy feature/label arrays in a Deepchecks Dataset.

    Args:
        X: 2D feature matrix (numpy array).
        y: Labels array; either 1D (single-label) or 2D (multi-label
            indicator matrix).
        dataset_name: Identifier for the dataset.
            NOTE(review): currently unused — it is never passed to the
            Dataset constructor; kept for interface compatibility.

    Returns:
        Dataset: Deepchecks Dataset with the features plus a 'label' column.
    """
    # Generated column names give the checks something readable to display.
    df = pd.DataFrame(X, columns=[f"feature_{i}" for i in range(X.shape[1])])
    if len(y.shape) > 1 and y.shape[1] > 1:
        # Multi-label matrix: collapse each row to a single class index via
        # argmax (for a binary indicator row that is the first active label).
        df['label'] = np.argmax(y, axis=1)
        print(f"Note: Converted multi-label ({y.shape[1]} labels) to single-label for Deepchecks")
    else:
        df['label'] = y
    return Dataset(df, label='label', cat_features=[])
def run_train_test_validation_suite(save_output=True, use_cleaned=True):
    """
    Execute the built-in Deepchecks Train-Test Validation Suite.

    The suite bundles feature drift, label drift, samples mix (leakage),
    whole-dataset drift, feature-label correlation change, new label/category
    detection, string mismatch comparison, and date/index leakage checks.

    Args:
        save_output: If True, write a JSON summary of all check results to
            reports/deepchecks/.
        use_cleaned: If True, run against the cleaned data files.

    Returns:
        SuiteResult: Result object returned by the Deepchecks suite.
    """
    data_type = "CLEANED" if use_cleaned else "ORIGINAL"
    print("="*80)
    print(f"TRAIN-TEST VALIDATION SUITE - {data_type} DATA")
    print("="*80)
    # Load arrays and wrap them for Deepchecks.
    X_train, y_train, X_test, y_test = load_train_test_data(use_cleaned=use_cleaned)
    train_ds = create_deepchecks_dataset(X_train, y_train, "training")
    test_ds = create_deepchecks_dataset(X_test, y_test, "test")
    print("\nRunning Train-Test Validation checks...")
    result = train_test_validation().run(train_ds, test_ds)
    print("\nTrain-Test Validation Suite completed!")
    print(f"Total checks: {len(result.results)}")
    if save_output:
        output_dir = Path("reports/deepchecks")
        output_dir.mkdir(parents=True, exist_ok=True)
        # Suffix distinguishes reports produced from cleaned vs. original data.
        suffix = "_clean" if use_cleaned else "_original"
        json_path = output_dir / f"train_test_validation_suite_results{suffix}.json"
        json_results = {
            "suite_name": "Train-Test Validation Suite",
            "total_checks": len(result.results),
            "timestamp": pd.Timestamp.now().isoformat(),
            "checks": [
                {
                    "check_name": cr.get_header(),
                    # These attributes vary across Deepchecks versions, hence
                    # the hasattr guards.
                    "passed": cr.passed_conditions() if hasattr(cr, 'passed_conditions') else None,
                    "display": str(cr.display) if hasattr(cr, 'display') else None,
                }
                for cr in result.results
            ],
        }
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(json_results, f, indent=2, ensure_ascii=False)
        print(f"JSON results saved to: {json_path}")
    return result
def run_custom_train_test_checks(save_output=True, use_cleaned=True):
    """
    Run a hand-picked set of train-test checks for the SkillScope dataset
    (NLP/TF-IDF features, multi-label classification target).

    NOTE(review): `save_output` is accepted for interface symmetry with
    run_train_test_validation_suite but nothing is written to disk here;
    results are only returned in memory.

    Args:
        save_output: Currently unused.
        use_cleaned: If True, run against the cleaned data files.

    Returns:
        dict: Mapping of check key -> Deepchecks CheckResult.
    """
    from deepchecks.tabular.checks import (
        TrainTestFeatureDrift,
        TrainTestLabelDrift,
        TrainTestSamplesMix,
        WholeDatasetDrift,
        FeatureLabelCorrelationChange,
    )
    data_type = "CLEANED" if use_cleaned else "ORIGINAL"
    print("="*80)
    print(f"CUSTOM TRAIN-TEST VALIDATION CHECKS - {data_type} DATA")
    print("="*80)
    # Load arrays and wrap them for Deepchecks.
    X_train, y_train, X_test, y_test = load_train_test_data(use_cleaned=use_cleaned)
    train_ds = create_deepchecks_dataset(X_train, y_train, "training")
    test_ds = create_deepchecks_dataset(X_test, y_test, "test")
    # (progress message, result key, check class) triples, executed in order.
    check_plan = [
        ("\n1. Checking for feature drift between train and test...",
         'feature_drift', TrainTestFeatureDrift),
        ("2. Checking for label drift between train and test...",
         'label_drift', TrainTestLabelDrift),
        ("3. Checking for data leakage (samples appearing in both sets)...",
         'samples_mix', TrainTestSamplesMix),
        ("4. Checking overall dataset drift...",
         'dataset_drift', WholeDatasetDrift),
        ("5. Checking for changes in feature-label correlation...",
         'correlation_change', FeatureLabelCorrelationChange),
    ]
    results = {}
    for message, key, check_cls in check_plan:
        print(message)
        results[key] = check_cls().run(train_ds, test_ds)
    # NewLabel is not available in the pinned Deepchecks version, so the
    # sixth check is intentionally skipped.
    print("6. Skipping NewLabel check (not available in this Deepchecks version)")
    print("\nAll custom train-test checks completed!")
    return results
def compare_distributions(use_cleaned=True):
    """
    Print a side-by-side statistical comparison of the train and test sets.

    Sections: sample sizes, feature dimensionality, label coverage,
    elementwise feature statistics, and sparsity.

    Args:
        use_cleaned: If True, compare the cleaned data files.
    """
    data_type = "CLEANED" if use_cleaned else "ORIGINAL"
    print("="*80)
    print(f"TRAIN-TEST DISTRIBUTION COMPARISON - {data_type} DATA")
    print("="*80)
    X_train, y_train, X_test, y_test = load_train_test_data(use_cleaned=use_cleaned)

    print("\n1. SAMPLE SIZES:")
    print(f" Training: {X_train.shape[0]} samples")
    print(f" Test: {X_test.shape[0]} samples")
    print(f" Train/Test ratio: {X_train.shape[0]/X_test.shape[0]:.2f}")

    print("\n2. FEATURE DIMENSIONS:")
    print(f" Training features: {X_train.shape[1]}")
    print(f" Test features: {X_test.shape[1]}")
    if X_train.shape[1] == X_test.shape[1]:
        print(" ✓ Feature dimensions match")
    else:
        print(" WARNING: Feature dimensions don't match!")

    print("\n3. LABEL DISTRIBUTION:")
    # NOTE: for a 2D multi-label y this flattens to the unique cell values.
    train_labels = set(np.unique(y_train))
    test_labels = set(np.unique(y_test))
    print(f" Training unique labels: {len(train_labels)}")
    print(f" Test unique labels: {len(test_labels)}")
    unseen = test_labels - train_labels
    if unseen:
        print(f" WARNING: {len(unseen)} new labels in test set: {unseen}")
    else:
        print(" No new labels in test set")
    train_only = train_labels - test_labels
    if train_only:
        print(f" INFO: {len(train_only)} labels only in train set")

    print("\n4. FEATURE STATISTICS COMPARISON:")
    print(f" Train - Mean: {X_train.mean():.4f}, Std: {X_train.std():.4f}")
    print(f" Test - Mean: {X_test.mean():.4f}, Std: {X_test.std():.4f}")
    mean_diff = abs(X_train.mean() - X_test.mean())
    std_diff = abs(X_train.std() - X_test.std())
    print(f" Mean difference: {mean_diff:.4f}")
    print(f" Std difference: {std_diff:.4f}")
    if mean_diff > 0.1 or std_diff > 0.1:
        print(" WARNING: Significant statistical differences detected!")
    else:
        print(" Statistical distributions are similar")

    print("\n5. SPARSITY COMPARISON:")
    # Percentage of exactly-zero entries in each feature matrix.
    train_sparsity = (X_train == 0).sum() / X_train.size * 100
    test_sparsity = (X_test == 0).sum() / X_test.size * 100
    print(f" Training sparsity: {train_sparsity:.2f}%")
    print(f" Test sparsity: {test_sparsity:.2f}%")
    print(f" Sparsity difference: {abs(train_sparsity - test_sparsity):.2f}%")
    if abs(train_sparsity - test_sparsity) > 5:
        print(" WARNING: Significant sparsity difference!")
    else:
        print(" Sparsity levels are similar")
def validate_split_quality(use_cleaned=True):
    """
    Validate the quality of the train-test split.

    Checks that the test-set ratio falls within a recommended range and
    compares the label distributions of the two sets with a chi-square
    goodness-of-fit test over the labels common to both sets.

    Args:
        use_cleaned: If True, validate cleaned data instead of original.
    """
    data_type = "CLEANED" if use_cleaned else "ORIGINAL"
    print("="*80)
    print(f"TRAIN-TEST SPLIT QUALITY VALIDATION - {data_type} DATA")
    print("="*80)
    X_train, y_train, X_test, y_test = load_train_test_data(use_cleaned=use_cleaned)
    total_samples = X_train.shape[0] + X_test.shape[0]
    test_ratio = X_test.shape[0] / total_samples
    print(f"\nTotal samples: {total_samples}")
    print(f"Test set ratio: {test_ratio:.2%}")
    # Validate test set size (typically 20-30%)
    if 0.15 <= test_ratio <= 0.35:
        print(" Test set size is within recommended range (15-35%)")
    else:
        print(" WARNING: Test set size is outside recommended range")
    # Check label distribution similarity
    from scipy.stats import chisquare
    # Only labels present in both sets are compared; labels unique to one
    # set are surfaced separately by compare_distributions().
    common_labels = np.intersect1d(np.unique(y_train), np.unique(y_test))
    if len(common_labels) > 0:
        observed = np.array(
            [np.sum(y_train == label) for label in common_labels], dtype=float)
        test_counts = np.array(
            [np.sum(y_test == label) for label in common_labels], dtype=float)
        # BUG FIX: scipy.stats.chisquare requires sum(f_obs) == sum(f_exp).
        # The previous code built `expected` as test proportions (computed
        # over len(y_test)) scaled by len(y_train); whenever any label is
        # missing from the common set the two sums diverge and scipy raises
        # a ValueError (or the statistic is invalid on older versions).
        # Rescale the test counts to the observed total instead, which is
        # the standard way to test whether train counts follow the test
        # distribution.
        expected = test_counts / test_counts.sum() * observed.sum()
        chi_stat, p_value = chisquare(observed, expected)
        print("\nLabel distribution similarity (chi-square test):")
        print(f" Chi-square statistic: {chi_stat:.4f}")
        print(f" P-value: {p_value:.4f}")
        if p_value > 0.05:
            print(" Label distributions are statistically similar (p > 0.05)")
        else:
            print(" WARNING: Label distributions differ significantly (p <= 0.05)")
    else:
        print(" WARNING: No common labels between train and test sets!")
if __name__ == "__main__":
    import sys

    # Cleaned data is the default; pass --original (or -o) to test old data.
    use_cleaned = '--original' not in sys.argv and '-o' not in sys.argv
    if use_cleaned:
        print("Testing CLEANED data (from data_cleaning.py) - DEFAULT\n")
    else:
        print("Testing ORIGINAL data\n")
        print("Note: Using --original flag to test old data\n")

    # 1) statistical comparison, 2) split quality, 3) full Deepchecks suite,
    # 4) custom checks.
    compare_distributions(use_cleaned=use_cleaned)
    print("\n")
    validate_split_quality(use_cleaned=use_cleaned)
    print("\n")
    suite_result = run_train_test_validation_suite(save_output=True, use_cleaned=use_cleaned)
    print("\n")
    custom_results = run_custom_train_test_checks(save_output=True, use_cleaned=use_cleaned)

    print("\n" + "="*80)
    print("TRAIN-TEST VALIDATION COMPLETED")
    print("="*80)
    print("\nCheck the reports in the 'reports/deepchecks' directory")