"""
Validation script for cleaned data.
This script runs Deepchecks validation on the cleaned dataset to verify that:
1. No duplicates remain
2. No label conflicts exist
3. No data leakage between train and test
4. All data quality issues are resolved
Run this after data_cleaning.py to confirm data quality.
"""
import json
from pathlib import Path

import numpy as np
import pandas as pd
from deepchecks.tabular import Dataset
from deepchecks.tabular.suites import data_integrity, train_test_validation

from hopcroft_skill_classification_tool_competition.config import PROCESSED_DATA_DIR


def load_cleaned_data():
    """Load cleaned train and test datasets."""
    tfidf_dir = PROCESSED_DATA_DIR / "tfidf"
    X_train = np.load(tfidf_dir / "features_tfidf_clean.npy")
    y_train = np.load(tfidf_dir / "labels_tfidf_clean.npy")
    X_test = np.load(tfidf_dir / "X_test_clean.npy")
    y_test = np.load(tfidf_dir / "Y_test_clean.npy")

    print("Loaded cleaned data:")
    print(f"  Train: {X_train.shape[0]:,} samples x {X_train.shape[1]:,} features")
    print(f"  Test: {X_test.shape[0]:,} samples x {X_test.shape[1]:,} features")
    print(f"  Labels: {y_train.shape[1]} labels")

    return X_train, y_train, X_test, y_test


def create_deepchecks_dataset(X, y, dataset_name="dataset"):
    """Create a Deepchecks Dataset from numpy arrays."""
    feature_names = [f"feature_{i}" for i in range(X.shape[1])]
    df = pd.DataFrame(X, columns=feature_names)

    # Deepchecks tabular checks expect a single label column, so collapse a
    # multi-label (one-hot / multi-hot) matrix to one class index per row.
    if y.ndim > 1 and y.shape[1] > 1:
        df['label'] = np.argmax(y, axis=1)
    else:
        df['label'] = y
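    # Illustration of the collapse above (hypothetical values):
    #   y = [[0, 1, 0], [1, 0, 0]]  ->  label column [1, 0]
    # Note that argmax keeps only the lowest-index positive label when a row
    # has several; a simplification for Deepchecks' single-label checks.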
    # Forward the name so it appears in the suite reports.
    ds = Dataset(df, label='label', cat_features=[], dataset_name=dataset_name)
    return ds


def run_validation():
    """Run full Deepchecks validation on cleaned data."""
    print("=" * 80)
    print("DEEPCHECKS VALIDATION - CLEANED DATA")
    print("=" * 80)

    # Load cleaned data
    X_train, y_train, X_test, y_test = load_cleaned_data()
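
    # Added guard (not in the original flow): train and test are assumed to
    # share one TF-IDF vocabulary, so their feature dimensions must agree
    # before the cross-dataset checks are meaningful.
    assert X_train.shape[1] == X_test.shape[1], "Train/test feature dims differ"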

    # Create Deepchecks datasets
    train_dataset = create_deepchecks_dataset(X_train, y_train, "training_clean")
    test_dataset = create_deepchecks_dataset(X_test, y_test, "test_clean")

    # Run Data Integrity Suite
    print("\n" + "=" * 80)
    print("RUNNING DATA INTEGRITY SUITE")
    print("=" * 80)
    integrity_suite = data_integrity()
    integrity_result = integrity_suite.run(train_dataset)

    # Run Train-Test Validation Suite
    print("\n" + "=" * 80)
    print("RUNNING TRAIN-TEST VALIDATION SUITE")
    print("=" * 80)
    validation_suite = train_test_validation()
    validation_result = validation_suite.run(train_dataset, test_dataset)

    # Prepare the output directory for reports
    output_dir = Path("reports/deepchecks")
    output_dir.mkdir(parents=True, exist_ok=True)
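
    # A SuiteResult's .results list mixes CheckResult objects (checks that ran)
    # with CheckFailure objects (checks that raised); only CheckResult exposes
    # passed_conditions(), hence the hasattr guards below.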
# Save JSON results
import json
# Count passed/failed checks (handle CheckFailure objects)
integrity_passed = sum(1 for r in integrity_result.results if hasattr(r, 'passed_conditions') and r.passed_conditions())
integrity_total = len(integrity_result.results)
validation_passed = sum(1 for r in validation_result.results if hasattr(r, 'passed_conditions') and r.passed_conditions())
validation_total = len(validation_result.results)

    # Save data integrity results as JSON
    integrity_json = {
        "suite_name": "Data Integrity Suite (Cleaned)",
        "total_checks": integrity_total,
        "timestamp": pd.Timestamp.now().isoformat(),
        "passed": integrity_passed,
        "failed": integrity_total - integrity_passed,
    }
    with open(output_dir / "data_integrity_clean.json", 'w') as f:
        json.dump(integrity_json, f, indent=2)

    # Save train-test validation results as JSON
    validation_json = {
        "suite_name": "Train-Test Validation Suite (Cleaned)",
        "total_checks": validation_total,
        "timestamp": pd.Timestamp.now().isoformat(),
        "passed": validation_passed,
        "failed": validation_total - validation_passed,
    }
    with open(output_dir / "train_test_validation_clean.json", 'w') as f:
        json.dump(validation_json, f, indent=2)
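
    # Also persist the full interactive reports; save_as_html is part of the
    # deepchecks SuiteResult API, though the output file names here are our
    # own choice, not from the original script.
    integrity_result.save_as_html(str(output_dir / "data_integrity_clean.html"))
    validation_result.save_as_html(str(output_dir / "train_test_validation_clean.html"))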
print("\n" + "="*80)
print("VALIDATION RESULTS")
print("="*80)
print(f"\nData Integrity Suite:")
print(f" Passed: {integrity_passed}/{integrity_total}")
print(f"\nTrain-Test Validation Suite:")
print(f" Passed: {validation_passed}/{validation_total}")

    # Scan failed checks for the critical issue classes this script targets
    critical_issues = []
    for result in integrity_result.results:
        if hasattr(result, 'passed_conditions') and not result.passed_conditions():
            check_name = result.get_header()
            if "Duplicate" in check_name or "Conflict" in check_name:
                critical_issues.append(f"Data Integrity: {check_name}")
    for result in validation_result.results:
        if hasattr(result, 'passed_conditions') and not result.passed_conditions():
            check_name = result.get_header()
            if "Mix" in check_name or "Leakage" in check_name:
                critical_issues.append(f"Train-Test: {check_name}")

    if critical_issues:
        print("\nCRITICAL ISSUES REMAINING:")
        for issue in critical_issues:
            print(f"  - {issue}")
    else:
        print("\nNO CRITICAL ISSUES DETECTED!")
        print("  No duplicates")
        print("  No label conflicts")
        print("  No data leakage")
        print("  Data is ready for training")

    print(f"\nReports saved to: {output_dir}")
    print("=" * 80)

    return integrity_result, validation_result
if __name__ == "__main__":
run_validation()