|
|
""" |
|
|
Data Integrity Suite - Deepchecks validation for dataset integrity |
|
|
|
|
|
This module implements comprehensive data integrity checks using Deepchecks |
|
|
to validate the quality and consistency of the training and test datasets. |
|
|
|
|
|
Checks included: |
|
|
- Data duplicates detection |
|
|
- Missing values analysis |
|
|
- Feature-label correlation |
|
|
- Feature-feature correlation |
|
|
- Data type consistency |
|
|
- Outlier detection |
|
|
- Class imbalance analysis |
|
|
""" |
|
|
|
|
|
import numpy as np |
|
|
import pandas as pd |
|
|
import json |
|
|
from pathlib import Path |
|
|
from deepchecks.tabular import Dataset |
|
|
from deepchecks.tabular.suites import data_integrity |
|
|
|
|
|
from hopcroft_skill_classification_tool_competition.config import PROCESSED_DATA_DIR |
|
|
|
|
|
|
|
|
def load_data(use_cleaned=True):
    """
    Load training and test datasets from the processed data directory.

    Args:
        use_cleaned: If True, load cleaned data (files with '_clean' suffix) - DEFAULT

    Returns:
        tuple: (X_train, y_train, X_test, y_test) as numpy arrays
    """
    tfidf_dir = PROCESSED_DATA_DIR / "tfidf"

    # Cleaned artifacts carry a "_clean" filename suffix; originals have none.
    suffix = "_clean" if use_cleaned else ""
    data_type = "cleaned" if use_cleaned else "original"

    X_train = np.load(tfidf_dir / f"features_tfidf{suffix}.npy")
    y_train = np.load(tfidf_dir / f"labels_tfidf{suffix}.npy")
    X_test = np.load(tfidf_dir / f"X_test{suffix}.npy")
    y_test = np.load(tfidf_dir / f"Y_test{suffix}.npy")

    print(f"Loaded {data_type} data:")
    print(f"Training set shape: X={X_train.shape}, y={y_train.shape}")
    print(f"Test set shape: X={X_test.shape}, y={y_test.shape}")

    return X_train, y_train, X_test, y_test
|
|
|
|
|
|
|
|
def create_deepchecks_dataset(X, y, dataset_name="dataset"):
    """
    Create a Deepchecks Dataset object from numpy arrays.

    Multi-label targets (a 2D `y` with more than one column) are collapsed
    to a single label per sample via argmax, because Deepchecks tabular
    checks expect one label column. NOTE: when several columns are active
    for a sample, argmax keeps only the first (lowest-index) one.

    Args:
        X: Feature matrix (numpy array), shape (n_samples, n_features)
        y: Labels (numpy array) - can be multi-label (2D) or single-label (1D)
        dataset_name: Name identifier for the dataset, shown in Deepchecks output

    Returns:
        Dataset: Deepchecks Dataset object with a 'label' column
    """
    # Synthetic column names: TF-IDF features have no intrinsic names.
    feature_names = [f"feature_{i}" for i in range(X.shape[1])]

    df = pd.DataFrame(X, columns=feature_names)

    if len(y.shape) > 1 and y.shape[1] > 1:
        # Collapse one-hot / multi-label matrix to a single class index.
        y_single = np.argmax(y, axis=1)
        df['label'] = y_single
        print(f"Note: Converted multi-label ({y.shape[1]} labels) to single-label for Deepchecks")
    else:
        df['label'] = y

    # Bug fix: dataset_name was accepted but never used; pass it through so
    # Deepchecks reports identify which dataset the results belong to.
    # All features are continuous TF-IDF scores, hence cat_features=[].
    ds = Dataset(df, label='label', cat_features=[], dataset_name=dataset_name)

    return ds
|
|
|
|
|
|
|
|
def run_data_integrity_suite(save_output=True, use_cleaned=True):
    """
    Run the complete Data Integrity Suite on training data.

    This suite performs comprehensive checks including:
    - Data Duplicates: Identifies duplicate samples
    - String Mismatch: Checks for string inconsistencies
    - Mixed Nulls: Detects various null representations
    - Mixed Data Types: Validates consistent data types
    - String Length Out Of Bounds: Checks string length anomalies
    - Is Single Value: Identifies features with only one value
    - Special Characters: Detects special characters in data
    - Class Imbalance: Analyzes label distribution
    - Outlier Sample Detection: Identifies outlier samples
    - Feature Label Correlation: Checks correlation between features and labels

    Args:
        save_output: Whether to save a JSON summary of the results under
            reports/deepchecks (note: a JSON file is written, not HTML)
        use_cleaned: If True, use cleaned data instead of original

    Returns:
        SuiteResult: Results from the data integrity suite
    """
    data_type = "CLEANED" if use_cleaned else "ORIGINAL"
    print("="*80)
    print(f"DATA INTEGRITY SUITE - {data_type} TRAINING DATA")
    print("="*80)

    # Only the training split is validated here; the test split is unused.
    X_train, y_train, _, _ = load_data(use_cleaned=use_cleaned)

    train_dataset = create_deepchecks_dataset(X_train, y_train, "training")

    print("\nRunning Data Integrity checks...")
    suite = data_integrity()
    result = suite.run(train_dataset)

    print("\nData Integrity Suite completed!")
    print(f"Total checks: {len(result.results)}")

    if save_output:
        output_dir = Path("reports/deepchecks")
        output_dir.mkdir(parents=True, exist_ok=True)

        # File name encodes which data variant was validated.
        suffix = "_clean" if use_cleaned else "_original"
        json_path = output_dir / f"data_integrity_suite_results{suffix}.json"
        json_results = {
            "suite_name": "Data Integrity Suite",
            "total_checks": len(result.results),
            "timestamp": pd.Timestamp.now().isoformat(),
            "checks": []
        }

        # Flatten each CheckResult into a JSON-serializable summary.
        # hasattr guards cover deepchecks result objects that lack these
        # attributes (e.g. failed checks) — such fields are stored as None.
        for check_result in result.results:
            check_data = {
                "check_name": check_result.get_header(),
                "passed": check_result.passed_conditions() if hasattr(check_result, 'passed_conditions') else None,
                "display": str(check_result.display) if hasattr(check_result, 'display') else None
            }
            json_results["checks"].append(check_data)

        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(json_results, f, indent=2, ensure_ascii=False)
        print(f"JSON results saved to: {json_path}")

    return result
|
|
|
|
|
|
|
|
def run_custom_integrity_checks(save_output=True, use_cleaned=True):
    """
    Run custom integrity checks tailored for the SkillScope dataset.

    These checks are specifically designed for NLP/Text features and
    multi-label classification tasks.

    Args:
        save_output: Whether to save the HTML report.
            NOTE(review): this parameter is currently unused — nothing is
            saved by this function; confirm whether saving was intended.
        use_cleaned: If True, use cleaned data instead of original

    Returns:
        dict: Dictionary containing check results, keyed by check name
            ('duplicates', 'nulls', 'single_value', 'class_imbalance',
            'outliers', 'correlation'); values are deepchecks CheckResult
            objects, or None for checks that failed/timed out.
    """
    # Imported locally so the module can be loaded without pulling in the
    # individual check classes unless this function is actually called.
    from deepchecks.tabular.checks import (
        DataDuplicates,
        MixedNulls,
        IsSingleValue,
        ClassImbalance,
        OutlierSampleDetection,
        FeatureLabelCorrelation,
    )

    data_type = "CLEANED" if use_cleaned else "ORIGINAL"
    print("="*80)
    print(f"CUSTOM DATA INTEGRITY CHECKS - {data_type} DATA")
    print("="*80)

    # Only the training split is checked; test split is discarded.
    X_train, y_train, _, _ = load_data(use_cleaned=use_cleaned)
    train_dataset = create_deepchecks_dataset(X_train, y_train, "training")

    results = {}

    print("\n1. Checking for duplicate samples...")
    duplicates_check = DataDuplicates()
    results['duplicates'] = duplicates_check.run(train_dataset)

    print("2. Checking for mixed null values...")
    nulls_check = MixedNulls()
    results['nulls'] = nulls_check.run(train_dataset)

    print("3. Checking for single-value features...")
    single_value_check = IsSingleValue()
    results['single_value'] = single_value_check.run(train_dataset)

    print("4. Checking class distribution...")
    imbalance_check = ClassImbalance()
    results['class_imbalance'] = imbalance_check.run(train_dataset)

    # Outlier detection is expensive on high-dimensional TF-IDF data, so it
    # is wrapped in a try/except and given a timeout (presumably seconds —
    # TODO confirm against the deepchecks OutlierSampleDetection API).
    print("5. Detecting outlier samples (this may take a while)...")
    try:
        outlier_check = OutlierSampleDetection(timeout=300)
        results['outliers'] = outlier_check.run(train_dataset)
    except Exception as e:
        print(f"  Warning: Outlier detection failed or timed out: {str(e)}")
        results['outliers'] = None

    # Correlation is likewise bounded: only the top 100 columns are reported
    # to keep runtime manageable on thousands of TF-IDF features.
    print("6. Analyzing feature-label correlation (using sample of features)...")
    try:
        correlation_check = FeatureLabelCorrelation(n_top_columns=100, timeout=300)
        results['correlation'] = correlation_check.run(train_dataset)
    except Exception as e:
        print(f"  Warning: Correlation check failed or timed out: {str(e)}")
        results['correlation'] = None

    print("\nAll custom checks completed!")

    return results
|
|
|
|
|
|
|
|
def analyze_data_statistics(use_cleaned=True):
    """
    Print detailed statistics about the dataset to stdout.

    Covers sample/feature counts, label distribution (top 10 labels),
    and basic feature-value statistics including sparsity.

    Args:
        use_cleaned: If True, analyze cleaned data instead of original
    """
    data_type = "CLEANED" if use_cleaned else "ORIGINAL"
    banner = "=" * 80
    print(banner)
    print(f"DATASET STATISTICS - {data_type} DATA")
    print(banner)

    X_train, y_train, X_test, y_test = load_data(use_cleaned=use_cleaned)

    print("\nTraining set:")
    print(f" - Samples: {X_train.shape[0]}")
    print(f" - Features: {X_train.shape[1]}")
    print(f" - Unique labels: {len(np.unique(y_train))}")
    print(" - Label distribution:")
    unique, counts = np.unique(y_train, return_counts=True)
    # Only the ten most-frequent-by-order labels are listed to keep output short.
    for label, count in zip(unique[:10], counts[:10]):
        print(f" Label {label}: {count} samples ({count/len(y_train)*100:.2f}%)")
    remaining = len(unique) - 10
    if remaining > 0:
        print(f" ... and {remaining} more labels")

    print("\nTest set:")
    print(f" - Samples: {X_test.shape[0]}")
    print(f" - Features: {X_test.shape[1]}")
    print(f" - Unique labels: {len(np.unique(y_test))}")

    print("\nFeature statistics:")
    print(f" - Mean feature value: {X_train.mean():.4f}")
    print(f" - Std feature value: {X_train.std():.4f}")
    print(f" - Min feature value: {X_train.min():.4f}")
    print(f" - Max feature value: {X_train.max():.4f}")
    print(f" - Sparsity: {(X_train == 0).sum() / X_train.size * 100:.2f}%")
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    import sys

    # Cleaned data is the default; --original / -o switches to the raw files.
    use_cleaned = '--original' not in sys.argv and '-o' not in sys.argv

    if use_cleaned:
        print("Testing CLEANED data (from data_cleaning.py) - DEFAULT\n")
    else:
        print("Testing ORIGINAL data\n")
        print("Note: Using --original flag to test old data\n")

    # 1) Descriptive statistics of the chosen dataset.
    analyze_data_statistics(use_cleaned=use_cleaned)

    # 2) Full Deepchecks data-integrity suite (JSON summary saved to disk).
    print("\n")
    suite_result = run_data_integrity_suite(save_output=True, use_cleaned=use_cleaned)

    # 3) Hand-picked checks tailored to the TF-IDF / multi-label setup.
    print("\n")
    custom_results = run_custom_integrity_checks(save_output=True, use_cleaned=use_cleaned)

    print("\n" + "="*80)
    print("DATA INTEGRITY VALIDATION COMPLETED")
    print("="*80)
    print("\nCheck the reports in the 'reports/deepchecks' directory")
|
|
|