""" Data Integrity Suite - Deepchecks validation for dataset integrity This module implements comprehensive data integrity checks using Deepchecks to validate the quality and consistency of the training and test datasets. Checks included: - Data duplicates detection - Missing values analysis - Feature-label correlation - Feature-feature correlation - Data type consistency - Outlier detection - Class imbalance analysis """ import numpy as np import pandas as pd import json from pathlib import Path from deepchecks.tabular import Dataset from deepchecks.tabular.suites import data_integrity from hopcroft_skill_classification_tool_competition.config import PROCESSED_DATA_DIR def load_data(use_cleaned=True): """ Load training and test datasets from processed data directory. Args: use_cleaned: If True, load cleaned data (with '_clean' suffix) - DEFAULT Returns: tuple: (X_train, y_train, X_test, y_test) """ tfidf_dir = PROCESSED_DATA_DIR / "tfidf" # Choose file names based on cleaned flag if use_cleaned: train_features = tfidf_dir / "features_tfidf_clean.npy" train_labels = tfidf_dir / "labels_tfidf_clean.npy" test_features = tfidf_dir / "X_test_clean.npy" test_labels = tfidf_dir / "Y_test_clean.npy" data_type = "cleaned" else: train_features = tfidf_dir / "features_tfidf.npy" train_labels = tfidf_dir / "labels_tfidf.npy" test_features = tfidf_dir / "X_test.npy" test_labels = tfidf_dir / "Y_test.npy" data_type = "original" # Load features and labels X_train = np.load(train_features) y_train = np.load(train_labels) X_test = np.load(test_features) y_test = np.load(test_labels) print(f"Loaded {data_type} data:") print(f"Training set shape: X={X_train.shape}, y={y_train.shape}") print(f"Test set shape: X={X_test.shape}, y={y_test.shape}") return X_train, y_train, X_test, y_test def create_deepchecks_dataset(X, y, dataset_name="dataset"): """ Create a Deepchecks Dataset object from numpy arrays. Args: X: Feature matrix (numpy array) y: Labels (numpy array) - can be multi-label (2D) or single-label (1D) dataset_name: Name identifier for the dataset Returns: Dataset: Deepchecks Dataset object """ # Convert to DataFrame for better visualization # Create feature names feature_names = [f"feature_{i}" for i in range(X.shape[1])] # Create DataFrame df = pd.DataFrame(X, columns=feature_names) # Handle multi-label case: convert to single label by taking argmax or first active label if len(y.shape) > 1 and y.shape[1] > 1: # Multi-label: convert to single label (first active label or most confident) # For binary multi-label, take the index of first 1 y_single = np.argmax(y, axis=1) # Get the index of maximum value df['label'] = y_single print(f"Note: Converted multi-label ({y.shape[1]} labels) to single-label for Deepchecks") else: df['label'] = y # Create Deepchecks Dataset ds = Dataset(df, label='label', cat_features=[]) return ds def run_data_integrity_suite(save_output=True, use_cleaned=True): """ Run the complete Data Integrity Suite on training data. This suite performs comprehensive checks including: - Data Duplicates: Identifies duplicate samples - String Mismatch: Checks for string inconsistencies - Mixed Nulls: Detects various null representations - Mixed Data Types: Validates consistent data types - String Length Out Of Bounds: Checks string length anomalies - Is Single Value: Identifies features with only one value - Special Characters: Detects special characters in data - Class Imbalance: Analyzes label distribution - Outlier Sample Detection: Identifies outlier samples - Feature Label Correlation: Checks correlation between features and labels Args: save_output: Whether to save the HTML report use_cleaned: If True, use cleaned data instead of original Returns: SuiteResult: Results from the data integrity suite """ data_type = "CLEANED" if use_cleaned else "ORIGINAL" print("="*80) print(f"DATA INTEGRITY SUITE - {data_type} TRAINING DATA") print("="*80) # Load data X_train, y_train, _, _ = load_data(use_cleaned=use_cleaned) # Create Deepchecks dataset train_dataset = create_deepchecks_dataset(X_train, y_train, "training") # Run the Data Integrity Suite print("\nRunning Data Integrity checks...") suite = data_integrity() result = suite.run(train_dataset) # Display results print("\nData Integrity Suite completed!") print(f"Total checks: {len(result.results)}") # Save output if save_output: output_dir = Path("reports/deepchecks") output_dir.mkdir(parents=True, exist_ok=True) # Save JSON report with appropriate suffix suffix = "_clean" if use_cleaned else "_original" json_path = output_dir / f"data_integrity_suite_results{suffix}.json" json_results = { "suite_name": "Data Integrity Suite", "total_checks": len(result.results), "timestamp": pd.Timestamp.now().isoformat(), "checks": [] } for check_result in result.results: check_data = { "check_name": check_result.get_header(), "passed": check_result.passed_conditions() if hasattr(check_result, 'passed_conditions') else None, "display": str(check_result.display) if hasattr(check_result, 'display') else None } json_results["checks"].append(check_data) with open(json_path, 'w', encoding='utf-8') as f: json.dump(json_results, f, indent=2, ensure_ascii=False) print(f"JSON results saved to: {json_path}") return result def run_custom_integrity_checks(save_output=True, use_cleaned=True): """ Run custom integrity checks tailored for the SkillScope dataset. These checks are specifically designed for NLP/Text features and multi-label classification tasks. Args: save_output: Whether to save the HTML report use_cleaned: If True, use cleaned data instead of original Returns: dict: Dictionary containing check results """ from deepchecks.tabular.checks import ( DataDuplicates, MixedNulls, IsSingleValue, ClassImbalance, OutlierSampleDetection, FeatureLabelCorrelation, ) data_type = "CLEANED" if use_cleaned else "ORIGINAL" print("="*80) print(f"CUSTOM DATA INTEGRITY CHECKS - {data_type} DATA") print("="*80) # Load data X_train, y_train, _, _ = load_data(use_cleaned=use_cleaned) train_dataset = create_deepchecks_dataset(X_train, y_train, "training") results = {} # Check 1: Data Duplicates print("\n1. Checking for duplicate samples...") duplicates_check = DataDuplicates() results['duplicates'] = duplicates_check.run(train_dataset) # Check 2: Mixed Nulls print("2. Checking for mixed null values...") nulls_check = MixedNulls() results['nulls'] = nulls_check.run(train_dataset) # Check 3: Single Value Features print("3. Checking for single-value features...") single_value_check = IsSingleValue() results['single_value'] = single_value_check.run(train_dataset) # Check 4: Class Imbalance print("4. Checking class distribution...") imbalance_check = ClassImbalance() results['class_imbalance'] = imbalance_check.run(train_dataset) # Check 5: Outlier Detection (with increased timeout) print("5. Detecting outlier samples (this may take a while)...") try: outlier_check = OutlierSampleDetection(timeout=300) # 5 minutes timeout results['outliers'] = outlier_check.run(train_dataset) except Exception as e: print(f" Warning: Outlier detection failed or timed out: {str(e)}") results['outliers'] = None # Check 6: Feature-Label Correlation (with sample subset for speed) print("6. Analyzing feature-label correlation (using sample of features)...") try: # Use only top 100 features for correlation to speed up correlation_check = FeatureLabelCorrelation(n_top_columns=100, timeout=300) results['correlation'] = correlation_check.run(train_dataset) except Exception as e: print(f" Warning: Correlation check failed or timed out: {str(e)}") results['correlation'] = None print("\nAll custom checks completed!") # Results are available in memory for further processing if needed return results def analyze_data_statistics(use_cleaned=True): """ Print detailed statistics about the dataset. Args: use_cleaned: If True, analyze cleaned data instead of original """ data_type = "CLEANED" if use_cleaned else "ORIGINAL" print("="*80) print(f"DATASET STATISTICS - {data_type} DATA") print("="*80) X_train, y_train, X_test, y_test = load_data(use_cleaned=use_cleaned) print(f"\nTraining set:") print(f" - Samples: {X_train.shape[0]}") print(f" - Features: {X_train.shape[1]}") print(f" - Unique labels: {len(np.unique(y_train))}") print(f" - Label distribution:") unique, counts = np.unique(y_train, return_counts=True) for label, count in zip(unique[:10], counts[:10]): # Show first 10 print(f" Label {label}: {count} samples ({count/len(y_train)*100:.2f}%)") if len(unique) > 10: print(f" ... and {len(unique)-10} more labels") print(f"\nTest set:") print(f" - Samples: {X_test.shape[0]}") print(f" - Features: {X_test.shape[1]}") print(f" - Unique labels: {len(np.unique(y_test))}") print(f"\nFeature statistics:") print(f" - Mean feature value: {X_train.mean():.4f}") print(f" - Std feature value: {X_train.std():.4f}") print(f" - Min feature value: {X_train.min():.4f}") print(f" - Max feature value: {X_train.max():.4f}") print(f" - Sparsity: {(X_train == 0).sum() / X_train.size * 100:.2f}%") if __name__ == "__main__": import sys # By default use cleaned data, unless --original flag is specified use_cleaned = not ('--original' in sys.argv or '-o' in sys.argv) if use_cleaned: print("Testing CLEANED data (from data_cleaning.py) - DEFAULT\n") else: print("Testing ORIGINAL data\n") print("Note: Using --original flag to test old data\n") # Print dataset statistics analyze_data_statistics(use_cleaned=use_cleaned) # Run the full Data Integrity Suite print("\n") suite_result = run_data_integrity_suite(save_output=True, use_cleaned=use_cleaned) # Run custom integrity checks print("\n") custom_results = run_custom_integrity_checks(save_output=True, use_cleaned=use_cleaned) print("\n" + "="*80) print("DATA INTEGRITY VALIDATION COMPLETED") print("="*80) print("\nCheck the reports in the 'reports/deepchecks' directory")