File size: 11,392 Bytes
225af6a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 |
"""
Data Integrity Suite - Deepchecks validation for dataset integrity
This module implements comprehensive data integrity checks using Deepchecks
to validate the quality and consistency of the training and test datasets.
Checks included:
- Data duplicates detection
- Missing values analysis
- Feature-label correlation
- Feature-feature correlation
- Data type consistency
- Outlier detection
- Class imbalance analysis
"""
import numpy as np
import pandas as pd
import json
from pathlib import Path
from deepchecks.tabular import Dataset
from deepchecks.tabular.suites import data_integrity
from hopcroft_skill_classification_tool_competition.config import PROCESSED_DATA_DIR
def load_data(use_cleaned=True):
    """Load the TF-IDF train/test arrays from the processed data directory.

    Args:
        use_cleaned: When True (default), read the '_clean'-suffixed files
            produced by the cleaning step; otherwise read the original files.

    Returns:
        tuple: (X_train, y_train, X_test, y_test) numpy arrays.
    """
    tfidf_dir = PROCESSED_DATA_DIR / "tfidf"
    # Select the file-name set matching the requested data variant.
    if use_cleaned:
        data_type = "cleaned"
        file_names = (
            "features_tfidf_clean.npy",
            "labels_tfidf_clean.npy",
            "X_test_clean.npy",
            "Y_test_clean.npy",
        )
    else:
        data_type = "original"
        file_names = (
            "features_tfidf.npy",
            "labels_tfidf.npy",
            "X_test.npy",
            "Y_test.npy",
        )
    X_train, y_train, X_test, y_test = (
        np.load(tfidf_dir / name) for name in file_names
    )
    print(f"Loaded {data_type} data:")
    print(f"Training set shape: X={X_train.shape}, y={y_train.shape}")
    print(f"Test set shape: X={X_test.shape}, y={y_test.shape}")
    return X_train, y_train, X_test, y_test
def create_deepchecks_dataset(X, y, dataset_name="dataset"):
    """Wrap numpy feature/label arrays in a Deepchecks ``Dataset``.

    Args:
        X: 2-D feature matrix (numpy array).
        y: Labels; 1-D for single-label, or a 2-D indicator matrix for
            multi-label (collapsed via argmax below).
        dataset_name: Name identifier for the dataset (currently not
            forwarded to Deepchecks -- kept for interface compatibility).

    Returns:
        Dataset: Deepchecks Dataset whose last column is 'label'.
    """
    # Synthetic column names make the DataFrame self-describing.
    column_names = [f"feature_{i}" for i in range(X.shape[1])]
    df = pd.DataFrame(X, columns=column_names)
    if y.ndim > 1 and y.shape[1] > 1:
        # Deepchecks needs a single label column: collapse the multi-label
        # indicator matrix to the index of the largest entry in each row.
        df['label'] = np.argmax(y, axis=1)
        print(f"Note: Converted multi-label ({y.shape[1]} labels) to single-label for Deepchecks")
    else:
        df['label'] = y
    # No categorical features: every column is a numeric TF-IDF weight.
    return Dataset(df, label='label', cat_features=[])
def run_data_integrity_suite(save_output=True, use_cleaned=True):
    """
    Run the complete Data Integrity Suite on training data.

    This suite performs comprehensive checks including:
    - Data Duplicates: Identifies duplicate samples
    - String Mismatch: Checks for string inconsistencies
    - Mixed Nulls: Detects various null representations
    - Mixed Data Types: Validates consistent data types
    - String Length Out Of Bounds: Checks string length anomalies
    - Is Single Value: Identifies features with only one value
    - Special Characters: Detects special characters in data
    - Class Imbalance: Analyzes label distribution
    - Outlier Sample Detection: Identifies outlier samples
    - Feature Label Correlation: Checks correlation between features and labels

    Args:
        save_output: Whether to save a JSON summary of the results under
            ``reports/deepchecks``. (Note: only a JSON file is written;
            no HTML report is produced by this function.)
        use_cleaned: If True, use cleaned data instead of original

    Returns:
        SuiteResult: Results from the data integrity suite
    """
    data_type = "CLEANED" if use_cleaned else "ORIGINAL"
    print("="*80)
    print(f"DATA INTEGRITY SUITE - {data_type} TRAINING DATA")
    print("="*80)
    # Only the training split is validated here; the test split is ignored.
    X_train, y_train, _, _ = load_data(use_cleaned=use_cleaned)
    train_dataset = create_deepchecks_dataset(X_train, y_train, "training")
    print("\nRunning Data Integrity checks...")
    suite = data_integrity()
    result = suite.run(train_dataset)
    print("\nData Integrity Suite completed!")
    print(f"Total checks: {len(result.results)}")
    if save_output:
        output_dir = Path("reports/deepchecks")
        output_dir.mkdir(parents=True, exist_ok=True)
        # Encode the data variant in the file name so cleaned/original
        # runs do not overwrite each other.
        suffix = "_clean" if use_cleaned else "_original"
        json_path = output_dir / f"data_integrity_suite_results{suffix}.json"
        json_results = {
            "suite_name": "Data Integrity Suite",
            "total_checks": len(result.results),
            "timestamp": pd.Timestamp.now().isoformat(),
            "checks": []
        }
        for check_result in result.results:
            # hasattr guards keep serialization robust across deepchecks
            # versions, where individual CheckResult attributes may differ.
            check_data = {
                "check_name": check_result.get_header(),
                "passed": check_result.passed_conditions() if hasattr(check_result, 'passed_conditions') else None,
                "display": str(check_result.display) if hasattr(check_result, 'display') else None
            }
            json_results["checks"].append(check_data)
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(json_results, f, indent=2, ensure_ascii=False)
        print(f"JSON results saved to: {json_path}")
    return result
def run_custom_integrity_checks(save_output=True, use_cleaned=True):
    """
    Run custom integrity checks tailored for the SkillScope dataset.

    These checks are specifically designed for NLP/Text features and
    multi-label classification tasks.

    Args:
        save_output: Kept for interface symmetry with the other suite
            runners, but currently UNUSED -- this function writes no
            report file; results are only returned in memory.
        use_cleaned: If True, use cleaned data instead of original

    Returns:
        dict: Check-name -> result mapping. Entries are None for checks
            that raised or timed out.
    """
    # Local import keeps the deepchecks check classes off the module
    # import path until they are actually needed.
    from deepchecks.tabular.checks import (
        DataDuplicates,
        MixedNulls,
        IsSingleValue,
        ClassImbalance,
        OutlierSampleDetection,
        FeatureLabelCorrelation,
    )
    data_type = "CLEANED" if use_cleaned else "ORIGINAL"
    print("="*80)
    print(f"CUSTOM DATA INTEGRITY CHECKS - {data_type} DATA")
    print("="*80)
    # Only the training split is validated here.
    X_train, y_train, _, _ = load_data(use_cleaned=use_cleaned)
    train_dataset = create_deepchecks_dataset(X_train, y_train, "training")
    results = {}
    # Check 1: Data Duplicates
    print("\n1. Checking for duplicate samples...")
    duplicates_check = DataDuplicates()
    results['duplicates'] = duplicates_check.run(train_dataset)
    # Check 2: Mixed Nulls
    print("2. Checking for mixed null values...")
    nulls_check = MixedNulls()
    results['nulls'] = nulls_check.run(train_dataset)
    # Check 3: Single Value Features
    print("3. Checking for single-value features...")
    single_value_check = IsSingleValue()
    results['single_value'] = single_value_check.run(train_dataset)
    # Check 4: Class Imbalance
    print("4. Checking class distribution...")
    imbalance_check = ClassImbalance()
    results['class_imbalance'] = imbalance_check.run(train_dataset)
    # Check 5: Outlier Detection -- slow on wide TF-IDF matrices, so both
    # construction and execution are guarded (an unsupported kwarg raising
    # TypeError is also caught here).
    print("5. Detecting outlier samples (this may take a while)...")
    try:
        outlier_check = OutlierSampleDetection(timeout=300)  # 5 minutes timeout
        results['outliers'] = outlier_check.run(train_dataset)
    except Exception as e:
        print(f" Warning: Outlier detection failed or timed out: {str(e)}")
        results['outliers'] = None
    # Check 6: Feature-Label Correlation, restricted to the top columns
    # purely for speed on the high-dimensional TF-IDF feature space.
    print("6. Analyzing feature-label correlation (using sample of features)...")
    try:
        correlation_check = FeatureLabelCorrelation(n_top_columns=100, timeout=300)
        results['correlation'] = correlation_check.run(train_dataset)
    except Exception as e:
        print(f" Warning: Correlation check failed or timed out: {str(e)}")
        results['correlation'] = None
    print("\nAll custom checks completed!")
    # Results stay in memory for further processing by the caller.
    return results
def analyze_data_statistics(use_cleaned=True):
    """Print summary statistics for the selected train/test arrays.

    Args:
        use_cleaned: If True, analyze the cleaned data; otherwise the
            original data.
    """
    data_type = "CLEANED" if use_cleaned else "ORIGINAL"
    banner = "=" * 80
    print(banner)
    print(f"DATASET STATISTICS - {data_type} DATA")
    print(banner)
    X_train, y_train, X_test, y_test = load_data(use_cleaned=use_cleaned)
    n_train = len(y_train)
    print("\nTraining set:")
    print(f" - Samples: {X_train.shape[0]}")
    print(f" - Features: {X_train.shape[1]}")
    print(f" - Unique labels: {len(np.unique(y_train))}")
    print(" - Label distribution:")
    labels, frequencies = np.unique(y_train, return_counts=True)
    # Show at most the first 10 label frequencies.
    for label, count in zip(labels[:10], frequencies[:10]):
        print(f" Label {label}: {count} samples ({count/n_train*100:.2f}%)")
    if len(labels) > 10:
        print(f" ... and {len(labels)-10} more labels")
    print("\nTest set:")
    print(f" - Samples: {X_test.shape[0]}")
    print(f" - Features: {X_test.shape[1]}")
    print(f" - Unique labels: {len(np.unique(y_test))}")
    print("\nFeature statistics:")
    print(f" - Mean feature value: {X_train.mean():.4f}")
    print(f" - Std feature value: {X_train.std():.4f}")
    print(f" - Min feature value: {X_train.min():.4f}")
    print(f" - Max feature value: {X_train.max():.4f}")
    # Fraction of exactly-zero entries, typical for sparse TF-IDF data.
    print(f" - Sparsity: {(X_train == 0).sum() / X_train.size * 100:.2f}%")
if __name__ == "__main__":
    import sys

    # Cleaned data is the default; pass --original (or -o) for the old files.
    use_cleaned = '--original' not in sys.argv and '-o' not in sys.argv
    if use_cleaned:
        print("Testing CLEANED data (from data_cleaning.py) - DEFAULT\n")
    else:
        print("Testing ORIGINAL data\n")
        print("Note: Using --original flag to test old data\n")
    # 1) Dataset statistics.
    analyze_data_statistics(use_cleaned=use_cleaned)
    # 2) Full Deepchecks data-integrity suite.
    print("\n")
    suite_result = run_data_integrity_suite(save_output=True, use_cleaned=use_cleaned)
    # 3) Custom per-check validation.
    print("\n")
    custom_results = run_custom_integrity_checks(save_output=True, use_cleaned=use_cleaned)
    banner = "=" * 80
    print("\n" + banner)
    print("DATA INTEGRITY VALIDATION COMPLETED")
    print(banner)
    print("\nCheck the reports in the 'reports/deepchecks' directory")
|