| """ |
| Data Integrity Suite - Deepchecks validation for dataset integrity |
| |
| This module implements comprehensive data integrity checks using Deepchecks |
| to validate the quality and consistency of the training and test datasets. |
| |
| Checks included: |
| - Data duplicates detection |
| - Missing values analysis |
| - Feature-label correlation |
| - Feature-feature correlation |
| - Data type consistency |
| - Outlier detection |
| - Class imbalance analysis |
| """ |
|
|
| import numpy as np |
| import pandas as pd |
| import json |
| from pathlib import Path |
| from deepchecks.tabular import Dataset |
| from deepchecks.tabular.suites import data_integrity |
|
|
| from hopcroft_skill_classification_tool_competition.config import PROCESSED_DATA_DIR |
|
|
|
|
def load_data(use_cleaned=True):
    """
    Load the TF-IDF train/test splits from the processed data directory.

    Args:
        use_cleaned: If True (default), load the '_clean'-suffixed files
            produced by the data-cleaning step; otherwise load the
            original artifacts.

    Returns:
        tuple: (X_train, y_train, X_test, y_test) numpy arrays.
    """
    tfidf_dir = PROCESSED_DATA_DIR / "tfidf"

    # Cleaned artifacts use the original file names plus a '_clean' suffix,
    # so a single suffix variable selects the whole file set.
    suffix = "_clean" if use_cleaned else ""
    data_type = "cleaned" if use_cleaned else "original"

    X_train = np.load(tfidf_dir / f"features_tfidf{suffix}.npy")
    y_train = np.load(tfidf_dir / f"labels_tfidf{suffix}.npy")
    X_test = np.load(tfidf_dir / f"X_test{suffix}.npy")
    y_test = np.load(tfidf_dir / f"Y_test{suffix}.npy")

    print(f"Loaded {data_type} data:")
    print(f"Training set shape: X={X_train.shape}, y={y_train.shape}")
    print(f"Test set shape: X={X_test.shape}, y={y_test.shape}")

    return X_train, y_train, X_test, y_test
|
|
|
|
def create_deepchecks_dataset(X, y, dataset_name="dataset"):
    """
    Create a Deepchecks Dataset object from numpy arrays.

    Multi-label targets (a 2D indicator/one-hot matrix) are collapsed to a
    single label per sample via argmax, because the Deepchecks tabular
    checks expect a single label column.

    Args:
        X: Feature matrix of shape (n_samples, n_features).
        y: Labels - 1D array of class ids, or 2D multi-label matrix.
        dataset_name: Name identifier for the dataset, used in the
            informational message below. (Previously accepted but unused.)

    Returns:
        Dataset: Deepchecks Dataset with columns feature_0..feature_{n-1}
        and a 'label' column.
    """
    feature_names = [f"feature_{i}" for i in range(X.shape[1])]
    df = pd.DataFrame(X, columns=feature_names)

    if y.ndim > 1 and y.shape[1] > 1:
        # Collapse the indicator matrix to the index of the dominant label.
        df['label'] = np.argmax(y, axis=1)
        print(f"Note: [{dataset_name}] Converted multi-label ({y.shape[1]} labels) to single-label for Deepchecks")
    else:
        df['label'] = y

    # TF-IDF features are all numeric, so there are no categorical features.
    ds = Dataset(df, label='label', cat_features=[])

    return ds
|
|
|
|
def run_data_integrity_suite(save_output=True, use_cleaned=True):
    """
    Run the complete Deepchecks Data Integrity Suite on the training data.

    The built-in suite bundles checks including:
    - Data Duplicates: Identifies duplicate samples
    - String Mismatch: Checks for string inconsistencies
    - Mixed Nulls: Detects various null representations
    - Mixed Data Types: Validates consistent data types
    - String Length Out Of Bounds: Checks string length anomalies
    - Is Single Value: Identifies features with only one value
    - Special Characters: Detects special characters in data
    - Class Imbalance: Analyzes label distribution
    - Outlier Sample Detection: Identifies outlier samples
    - Feature Label Correlation: Checks correlation between features and labels

    Args:
        save_output: If True, write a JSON summary of the results to
            reports/deepchecks/. (No HTML report is produced; the previous
            docstring was inaccurate on this point.)
        use_cleaned: If True, use cleaned data instead of original.

    Returns:
        SuiteResult: Results from the data integrity suite.
    """
    data_type = "CLEANED" if use_cleaned else "ORIGINAL"
    print("="*80)
    print(f"DATA INTEGRITY SUITE - {data_type} TRAINING DATA")
    print("="*80)

    # Only the training split is validated here; the test split is unused.
    X_train, y_train, _, _ = load_data(use_cleaned=use_cleaned)
    train_dataset = create_deepchecks_dataset(X_train, y_train, "training")

    print("\nRunning Data Integrity checks...")
    suite = data_integrity()
    result = suite.run(train_dataset)

    print("\nData Integrity Suite completed!")
    print(f"Total checks: {len(result.results)}")

    if save_output:
        output_dir = Path("reports/deepchecks")
        output_dir.mkdir(parents=True, exist_ok=True)

        suffix = "_clean" if use_cleaned else "_original"
        json_path = output_dir / f"data_integrity_suite_results{suffix}.json"
        json_results = {
            "suite_name": "Data Integrity Suite",
            "total_checks": len(result.results),
            "timestamp": pd.Timestamp.now().isoformat(),
            "checks": []
        }

        for check_result in result.results:
            # Not every result type exposes conditions/display; fall back
            # to None rather than failing the whole export.
            check_data = {
                "check_name": check_result.get_header(),
                "passed": check_result.passed_conditions() if hasattr(check_result, 'passed_conditions') else None,
                "display": str(check_result.display) if hasattr(check_result, 'display') else None
            }
            json_results["checks"].append(check_data)

        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(json_results, f, indent=2, ensure_ascii=False)
        print(f"JSON results saved to: {json_path}")

    return result
|
|
|
|
def run_custom_integrity_checks(save_output=True, use_cleaned=True):
    """
    Run custom integrity checks tailored for the SkillScope dataset.

    These checks are specifically designed for NLP/text features and
    multi-label classification tasks.

    Args:
        save_output: If True, write a JSON summary of the check results to
            reports/deepchecks/. (Previously this parameter was accepted
            but silently ignored.)
        use_cleaned: If True, use cleaned data instead of original.

    Returns:
        dict: Mapping of check key -> CheckResult, or None for a check
        that failed or timed out.
    """
    from deepchecks.tabular.checks import (
        DataDuplicates,
        MixedNulls,
        IsSingleValue,
        ClassImbalance,
        OutlierSampleDetection,
        FeatureLabelCorrelation,
    )

    data_type = "CLEANED" if use_cleaned else "ORIGINAL"
    print("="*80)
    print(f"CUSTOM DATA INTEGRITY CHECKS - {data_type} DATA")
    print("="*80)

    X_train, y_train, _, _ = load_data(use_cleaned=use_cleaned)
    train_dataset = create_deepchecks_dataset(X_train, y_train, "training")

    results = {}

    print("\n1. Checking for duplicate samples...")
    results['duplicates'] = DataDuplicates().run(train_dataset)

    print("2. Checking for mixed null values...")
    results['nulls'] = MixedNulls().run(train_dataset)

    print("3. Checking for single-value features...")
    results['single_value'] = IsSingleValue().run(train_dataset)

    print("4. Checking class distribution...")
    results['class_imbalance'] = ClassImbalance().run(train_dataset)

    print("5. Detecting outlier samples (this may take a while)...")
    try:
        # OutlierSampleDetection natively supports a timeout (in seconds).
        outlier_check = OutlierSampleDetection(timeout=300)
        results['outliers'] = outlier_check.run(train_dataset)
    except Exception as e:
        print(f"  Warning: Outlier detection failed or timed out: {str(e)}")
        results['outliers'] = None

    print("6. Analyzing feature-label correlation (using sample of features)...")
    try:
        # NOTE(review): FeatureLabelCorrelation does not accept a `timeout`
        # kwarg in the deepchecks tabular API; passing one raised at
        # construction and the except below silently skipped this check on
        # every run. Dropped the kwarg so the check actually executes —
        # confirm against the installed deepchecks version.
        correlation_check = FeatureLabelCorrelation(n_top_columns=100)
        results['correlation'] = correlation_check.run(train_dataset)
    except Exception as e:
        print(f"  Warning: Correlation check failed or timed out: {str(e)}")
        results['correlation'] = None

    print("\nAll custom checks completed!")

    if save_output:
        # Persist a JSON summary, mirroring run_data_integrity_suite.
        output_dir = Path("reports/deepchecks")
        output_dir.mkdir(parents=True, exist_ok=True)

        suffix = "_clean" if use_cleaned else "_original"
        json_path = output_dir / f"custom_integrity_checks_results{suffix}.json"
        summary = {
            "suite_name": "Custom Integrity Checks",
            "timestamp": pd.Timestamp.now().isoformat(),
            "checks": {}
        }
        for key, check_result in results.items():
            if check_result is None:
                summary["checks"][key] = {"status": "failed or timed out"}
            else:
                summary["checks"][key] = {
                    "check_name": check_result.get_header(),
                    "passed": check_result.passed_conditions() if hasattr(check_result, 'passed_conditions') else None,
                }

        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(summary, f, indent=2, ensure_ascii=False)
        print(f"JSON results saved to: {json_path}")

    return results
|
|
|
|
def analyze_data_statistics(use_cleaned=True):
    """
    Print detailed descriptive statistics for the train/test splits.

    Args:
        use_cleaned: If True, analyze cleaned data instead of original.
    """
    banner = "=" * 80
    data_type = "CLEANED" if use_cleaned else "ORIGINAL"
    print(banner)
    print(f"DATASET STATISTICS - {data_type} DATA")
    print(banner)

    X_train, y_train, X_test, y_test = load_data(use_cleaned=use_cleaned)

    n_train, n_features = X_train.shape
    print("\nTraining set:")
    print(f"  - Samples: {n_train}")
    print(f"  - Features: {n_features}")
    print(f"  - Unique labels: {len(np.unique(y_train))}")
    print("  - Label distribution:")
    # Show at most the first ten labels and summarize the remainder.
    labels, freqs = np.unique(y_train, return_counts=True)
    total = len(y_train)
    for label, count in zip(labels[:10], freqs[:10]):
        print(f"      Label {label}: {count} samples ({count/total*100:.2f}%)")
    if len(labels) > 10:
        print(f"    ... and {len(labels)-10} more labels")

    print("\nTest set:")
    print(f"  - Samples: {X_test.shape[0]}")
    print(f"  - Features: {X_test.shape[1]}")
    print(f"  - Unique labels: {len(np.unique(y_test))}")

    print("\nFeature statistics:")
    print(f"  - Mean feature value: {X_train.mean():.4f}")
    print(f"  - Std feature value: {X_train.std():.4f}")
    print(f"  - Min feature value: {X_train.min():.4f}")
    print(f"  - Max feature value: {X_train.max():.4f}")
    zero_fraction = (X_train == 0).sum() / X_train.size
    print(f"  - Sparsity: {zero_fraction * 100:.2f}%")
|
|
|
|
if __name__ == "__main__":
    import sys

    # --original / -o selects the pre-cleaning data files; cleaned data
    # is the default.
    wants_original = ('--original' in sys.argv) or ('-o' in sys.argv)
    use_cleaned = not wants_original

    if use_cleaned:
        print("Testing CLEANED data (from data_cleaning.py) - DEFAULT\n")
    else:
        print("Testing ORIGINAL data\n")
        print("Note: Using --original flag to test old data\n")

    # 1. Descriptive statistics for both splits.
    analyze_data_statistics(use_cleaned=use_cleaned)

    # 2. Built-in Deepchecks data-integrity suite.
    print("\n")
    suite_result = run_data_integrity_suite(save_output=True, use_cleaned=use_cleaned)

    # 3. Hand-picked checks for this dataset.
    print("\n")
    custom_results = run_custom_integrity_checks(save_output=True, use_cleaned=use_cleaned)

    banner = "=" * 80
    print("\n" + banner)
    print("DATA INTEGRITY VALIDATION COMPLETED")
    print(banner)
    print("\nCheck the reports in the 'reports/deepchecks' directory")
|