# NOTE: stray VCS/web artifact (author, commit message, commit hash) pasted
# above the module docstring; kept as comments so the file parses.
# DaCrow13
# Deploy to HF Spaces (Clean)
# 39d224b
"""
Validation script for cleaned data.
This script runs Deepchecks validation on the cleaned dataset to verify that:
1. No duplicates remain
2. No label conflicts exist
3. No data leakage between train and test
4. All data quality issues are resolved
Run this after data_cleaning.py to confirm data quality.
"""
import json
from pathlib import Path

import numpy as np
import pandas as pd
from deepchecks.tabular import Dataset
from deepchecks.tabular.suites import data_integrity, train_test_validation

from hopcroft_skill_classification_tool_competition.config import PROCESSED_DATA_DIR
def load_cleaned_data():
"""Load cleaned train and test datasets."""
tfidf_dir = PROCESSED_DATA_DIR / "tfidf"
X_train = np.load(tfidf_dir / "features_tfidf_clean.npy")
y_train = np.load(tfidf_dir / "labels_tfidf_clean.npy")
X_test = np.load(tfidf_dir / "X_test_clean.npy")
y_test = np.load(tfidf_dir / "Y_test_clean.npy")
print(f"Loaded cleaned data:")
print(f" Train: {X_train.shape[0]:,} samples x {X_train.shape[1]:,} features")
print(f" Test: {X_test.shape[0]:,} samples x {X_test.shape[1]:,} features")
print(f" Labels: {y_train.shape[1]} labels")
return X_train, y_train, X_test, y_test
def create_deepchecks_dataset(X, y, dataset_name="dataset"):
"""Create Deepchecks Dataset from numpy arrays."""
feature_names = [f"feature_{i}" for i in range(X.shape[1])]
df = pd.DataFrame(X, columns=feature_names)
# Convert multi-label to single label for Deepchecks
if len(y.shape) > 1 and y.shape[1] > 1:
y_single = np.argmax(y, axis=1)
df['label'] = y_single
else:
df['label'] = y
ds = Dataset(df, label='label', cat_features=[])
return ds
def run_validation():
"""Run full Deepchecks validation on cleaned data."""
print("="*80)
print("DEEPCHECKS VALIDATION - CLEANED DATA")
print("="*80)
# Load cleaned data
X_train, y_train, X_test, y_test = load_cleaned_data()
# Create Deepchecks datasets
train_dataset = create_deepchecks_dataset(X_train, y_train, "training_clean")
test_dataset = create_deepchecks_dataset(X_test, y_test, "test_clean")
# Run Data Integrity Suite
print("\n" + "="*80)
print("RUNNING DATA INTEGRITY SUITE")
print("="*80)
integrity_suite = data_integrity()
integrity_result = integrity_suite.run(train_dataset)
# Run Train-Test Validation Suite
print("\n" + "="*80)
print("RUNNING TRAIN-TEST VALIDATION SUITE")
print("="*80)
validation_suite = train_test_validation()
validation_result = validation_suite.run(train_dataset, test_dataset)
# Save reports
output_dir = Path("reports/deepchecks")
output_dir.mkdir(parents=True, exist_ok=True)
# Save JSON results
import json
# Count passed/failed checks (handle CheckFailure objects)
integrity_passed = sum(1 for r in integrity_result.results if hasattr(r, 'passed_conditions') and r.passed_conditions())
integrity_total = len(integrity_result.results)
validation_passed = sum(1 for r in validation_result.results if hasattr(r, 'passed_conditions') and r.passed_conditions())
validation_total = len(validation_result.results)
# Save data integrity results as JSON
integrity_json = {
"suite_name": "Data Integrity Suite (Cleaned)",
"total_checks": len(integrity_result.results),
"timestamp": pd.Timestamp.now().isoformat(),
"passed": integrity_passed,
"failed": integrity_total - integrity_passed
}
with open(output_dir / "data_integrity_clean.json", 'w') as f:
json.dump(integrity_json, f, indent=2)
# Save train-test validation results as JSON
validation_json = {
"suite_name": "Train-Test Validation Suite (Cleaned)",
"total_checks": len(validation_result.results),
"timestamp": pd.Timestamp.now().isoformat(),
"passed": validation_passed,
"failed": validation_total - validation_passed
}
with open(output_dir / "train_test_validation_clean.json", 'w') as f:
json.dump(validation_json, f, indent=2)
print("\n" + "="*80)
print("VALIDATION RESULTS")
print("="*80)
print(f"\nData Integrity Suite:")
print(f" Passed: {integrity_passed}/{integrity_total}")
print(f"\nTrain-Test Validation Suite:")
print(f" Passed: {validation_passed}/{validation_total}")
# Check critical issues
critical_issues = []
for result in integrity_result.results:
if hasattr(result, 'passed_conditions') and not result.passed_conditions():
check_name = result.get_header()
if "Duplicate" in check_name or "Conflict" in check_name:
critical_issues.append(f"Data Integrity: {check_name}")
for result in validation_result.results:
if hasattr(result, 'passed_conditions') and not result.passed_conditions():
check_name = result.get_header()
if "Mix" in check_name or "Leakage" in check_name:
critical_issues.append(f"Train-Test: {check_name}")
if critical_issues:
print(f"\nCRITICAL ISSUES REMAINING:")
for issue in critical_issues:
print(f" - {issue}")
else:
print(f"\nNO CRITICAL ISSUES DETECTED!")
print(f" No duplicates")
print(f" No label conflicts")
print(f" No data leakage")
print(f" Data is ready for training")
print(f"\nReports saved to: {output_dir}")
print("="*80)
return integrity_result, validation_result
if __name__ == "__main__":
run_validation()