|
|
""" |
|
|
Validation script for cleaned data. |
|
|
|
|
|
This script runs Deepchecks validation on the cleaned dataset to verify that: |
|
|
1. No duplicates remain |
|
|
2. No label conflicts exist |
|
|
3. No data leakage between train and test |
|
|
4. All data quality issues are resolved |
|
|
|
|
|
Run this after data_cleaning.py to confirm data quality. |
|
|
""" |
|
|
|
|
|
import numpy as np |
|
|
import pandas as pd |
|
|
from pathlib import Path |
|
|
from deepchecks.tabular import Dataset |
|
|
from deepchecks.tabular.suites import data_integrity, train_test_validation |
|
|
|
|
|
from hopcroft_skill_classification_tool_competition.config import PROCESSED_DATA_DIR |
|
|
|
|
|
|
|
|
def load_cleaned_data():
    """Load the cleaned train/test feature matrices and label arrays.

    Reads the ``.npy`` artifacts produced by data_cleaning.py from the
    processed-data ``tfidf`` directory and prints a short shape summary.

    Returns:
        tuple: ``(X_train, y_train, X_test, y_test)`` as numpy arrays.
    """
    data_dir = PROCESSED_DATA_DIR / "tfidf"

    # NOTE: the test-label artifact is stored with a capital 'Y' on disk.
    filenames = (
        "features_tfidf_clean.npy",
        "labels_tfidf_clean.npy",
        "X_test_clean.npy",
        "Y_test_clean.npy",
    )
    X_train, y_train, X_test, y_test = (
        np.load(data_dir / name) for name in filenames
    )

    print("Loaded cleaned data:")
    print(f" Train: {X_train.shape[0]:,} samples x {X_train.shape[1]:,} features")
    print(f" Test: {X_test.shape[0]:,} samples x {X_test.shape[1]:,} features")
    print(f" Labels: {y_train.shape[1]} labels")

    return X_train, y_train, X_test, y_test
|
|
|
|
|
|
|
|
def create_deepchecks_dataset(X, y, dataset_name="dataset"):
    """Create a Deepchecks tabular ``Dataset`` from numpy arrays.

    Args:
        X: 2-D feature matrix of shape ``(n_samples, n_features)``.
        y: label array; either a 1-D array of class ids or a 2-D
            one-hot/multilabel matrix, which is collapsed to one class
            id per row via argmax.
        dataset_name: kept for backward compatibility with existing
            callers; currently unused because the ``Dataset``
            constructor as called here takes no name argument.

    Returns:
        deepchecks.tabular.Dataset with all columns treated as numeric
        (``cat_features=[]``) and a single ``'label'`` column.
    """
    feature_names = [f"feature_{i}" for i in range(X.shape[1])]
    df = pd.DataFrame(X, columns=feature_names)

    # Collapse a multilabel/one-hot matrix to a single class id per row so
    # the tabular suites (which expect one label column) can run.
    # NOTE(review): argmax discards multi-label information — fine for
    # integrity checks, not for training metrics.
    if y.ndim > 1 and y.shape[1] > 1:
        df['label'] = np.argmax(y, axis=1)
    else:
        df['label'] = y

    return Dataset(df, label='label', cat_features=[])
|
|
|
|
|
|
|
|
def _count_passed(suite_result):
    """Return ``(passed, total)`` condition counts for a Deepchecks SuiteResult."""
    results = suite_result.results
    passed = sum(
        1 for r in results
        if hasattr(r, 'passed_conditions') and r.passed_conditions()
    )
    return passed, len(results)


def _save_summary(path, suite_name, passed, total):
    """Write a small JSON summary of one suite run to *path*."""
    import json

    summary = {
        "suite_name": suite_name,
        "total_checks": total,
        "timestamp": pd.Timestamp.now().isoformat(),
        "passed": passed,
        "failed": total - passed,
    }
    with open(path, 'w') as f:
        json.dump(summary, f, indent=2)


def _failed_critical_checks(suite_result, keywords, prefix):
    """Return ``"{prefix}: {header}"`` for each failed check matching *keywords*."""
    issues = []
    for result in suite_result.results:
        if hasattr(result, 'passed_conditions') and not result.passed_conditions():
            check_name = result.get_header()
            if any(keyword in check_name for keyword in keywords):
                issues.append(f"{prefix}: {check_name}")
    return issues


def run_validation():
    """Run full Deepchecks validation on cleaned data.

    Loads the cleaned arrays, runs the data-integrity suite on the train
    set and the train-test validation suite across both sets, writes one
    JSON summary per suite to ``reports/deepchecks``, and prints whether
    any critical (duplicate / conflict / leakage) checks still fail.

    Returns:
        tuple: ``(integrity_result, validation_result)`` SuiteResults.
    """
    print("="*80)
    print("DEEPCHECKS VALIDATION - CLEANED DATA")
    print("="*80)

    X_train, y_train, X_test, y_test = load_cleaned_data()

    train_dataset = create_deepchecks_dataset(X_train, y_train, "training_clean")
    test_dataset = create_deepchecks_dataset(X_test, y_test, "test_clean")

    print("\n" + "="*80)
    print("RUNNING DATA INTEGRITY SUITE")
    print("="*80)
    integrity_result = data_integrity().run(train_dataset)

    print("\n" + "="*80)
    print("RUNNING TRAIN-TEST VALIDATION SUITE")
    print("="*80)
    validation_result = train_test_validation().run(train_dataset, test_dataset)

    output_dir = Path("reports/deepchecks")
    output_dir.mkdir(parents=True, exist_ok=True)

    integrity_passed, integrity_total = _count_passed(integrity_result)
    validation_passed, validation_total = _count_passed(validation_result)

    _save_summary(
        output_dir / "data_integrity_clean.json",
        "Data Integrity Suite (Cleaned)",
        integrity_passed, integrity_total,
    )
    _save_summary(
        output_dir / "train_test_validation_clean.json",
        "Train-Test Validation Suite (Cleaned)",
        validation_passed, validation_total,
    )

    print("\n" + "="*80)
    print("VALIDATION RESULTS")
    print("="*80)

    print("\nData Integrity Suite:")
    print(f" Passed: {integrity_passed}/{integrity_total}")

    print("\nTrain-Test Validation Suite:")
    print(f" Passed: {validation_passed}/{validation_total}")

    # A failed check only counts as critical when its header indicates
    # duplicates, label conflicts, or train/test sample mixing/leakage.
    critical_issues = (
        _failed_critical_checks(integrity_result, ("Duplicate", "Conflict"), "Data Integrity")
        + _failed_critical_checks(validation_result, ("Mix", "Leakage"), "Train-Test")
    )

    if critical_issues:
        print("\nCRITICAL ISSUES REMAINING:")
        for issue in critical_issues:
            print(f" - {issue}")
    else:
        print("\nNO CRITICAL ISSUES DETECTED!")
        print(" No duplicates")
        print(" No label conflicts")
        print(" No data leakage")
        print(" Data is ready for training")

    print(f"\nReports saved to: {output_dir}")
    print("="*80)

    return integrity_result, validation_result
|
|
|
|
|
|
|
|
# Allow running this module directly as a post-cleaning validation script.
if __name__ == "__main__":
    run_validation()
|
|
|