"""Validation script for cleaned data.

This script runs Deepchecks validation on the cleaned dataset to verify that:

1. No duplicates remain
2. No label conflicts exist
3. No data leakage between train and test
4. All data quality issues are resolved

Run this after data_cleaning.py to confirm data quality.
"""

import json
from pathlib import Path

import numpy as np
import pandas as pd
from deepchecks.tabular import Dataset
from deepchecks.tabular.suites import data_integrity, train_test_validation

from hopcroft_skill_classification_tool_competition.config import PROCESSED_DATA_DIR

_SEPARATOR = "=" * 80


def load_cleaned_data():
    """Load cleaned train and test datasets.

    Reads four ``.npy`` files from ``PROCESSED_DATA_DIR / "tfidf"`` and prints
    a short summary of their shapes.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test)`` of numpy arrays.
    """
    tfidf_dir = PROCESSED_DATA_DIR / "tfidf"

    X_train = np.load(tfidf_dir / "features_tfidf_clean.npy")
    y_train = np.load(tfidf_dir / "labels_tfidf_clean.npy")
    X_test = np.load(tfidf_dir / "X_test_clean.npy")
    y_test = np.load(tfidf_dir / "Y_test_clean.npy")

    # A 1-D label array has no second axis; report it as a single label
    # instead of raising IndexError (create_deepchecks_dataset accepts both).
    n_labels = y_train.shape[1] if y_train.ndim > 1 else 1

    print("Loaded cleaned data:")
    print(f" Train: {X_train.shape[0]:,} samples x {X_train.shape[1]:,} features")
    print(f" Test: {X_test.shape[0]:,} samples x {X_test.shape[1]:,} features")
    print(f" Labels: {n_labels} labels")

    return X_train, y_train, X_test, y_test


def create_deepchecks_dataset(X, y, dataset_name="dataset"):
    """Create Deepchecks Dataset from numpy arrays.

    Args:
        X: 2-D feature array; columns are named ``feature_0`` .. ``feature_{n-1}``.
        y: Label array. A 2-D array with more than one column is collapsed to a
            single label per row with ``argmax`` (index of the first maximum),
            because the Dataset is built with a single ``label`` column.
        dataset_name: Accepted for call-site readability; it is not forwarded
            to Deepchecks.

    Returns:
        A ``deepchecks.tabular.Dataset`` with a ``label`` column and no
        categorical features.
    """
    feature_names = [f"feature_{i}" for i in range(X.shape[1])]
    df = pd.DataFrame(X, columns=feature_names)

    # Convert multi-label to single label for Deepchecks
    if y.ndim > 1 and y.shape[1] > 1:
        df["label"] = np.argmax(y, axis=1)
    else:
        # Flatten so that an (n, 1) array is stored as a plain 1-D column.
        df["label"] = np.ravel(y)

    return Dataset(df, label="label", cat_features=[])


def _did_run(result):
    """Return True for a completed check result, False for a failure object."""
    # Results of checks that raised do not expose ``passed_conditions``.
    return hasattr(result, "passed_conditions")


def _count_passed(results):
    """Count results that ran and whose conditions all passed."""
    return sum(1 for r in results if _did_run(r) and r.passed_conditions())


def _header_of(result):
    """Return a display name for a check result or check failure."""
    getter = getattr(result, "get_header", None)
    if callable(getter):
        return getter()
    # Fallback for result objects without get_header().
    return str(getattr(result, "header", None) or type(result).__name__)


def _write_summary(path, suite_name, total, passed):
    """Write a pass/fail summary for one suite to *path* as JSON."""
    summary = {
        "suite_name": suite_name,
        "total_checks": total,
        "timestamp": pd.Timestamp.now().isoformat(),
        "passed": passed,
        "failed": total - passed,
    }
    with open(path, "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=2)


def _collect_critical(results, prefix, keywords):
    """Return messages for non-passing checks whose header matches *keywords*.

    A check that did not run at all is treated as non-passing: if the
    duplicate/conflict/leakage check crashed, the data has not been verified
    and must not be reported as clean.
    """
    issues = []
    for result in results:
        ran = _did_run(result)
        if ran and result.passed_conditions():
            continue
        header = _header_of(result)
        if not any(keyword in header for keyword in keywords):
            continue
        suffix = "" if ran else " (check did not run)"
        issues.append(f"{prefix}: {header}{suffix}")
    return issues


def run_validation():
    """Run full Deepchecks validation on cleaned data.

    Loads the cleaned arrays, runs the data-integrity suite on the training
    set and the train-test validation suite on both sets, writes a JSON
    summary per suite under ``reports/deepchecks``, and prints the outcome.

    Returns:
        Tuple ``(integrity_result, validation_result)`` of suite results.
    """
    print(_SEPARATOR)
    print("DEEPCHECKS VALIDATION - CLEANED DATA")
    print(_SEPARATOR)

    # Load cleaned data
    X_train, y_train, X_test, y_test = load_cleaned_data()

    # Create Deepchecks datasets
    train_dataset = create_deepchecks_dataset(X_train, y_train, "training_clean")
    test_dataset = create_deepchecks_dataset(X_test, y_test, "test_clean")

    # Run Data Integrity Suite
    print("\n" + _SEPARATOR)
    print("RUNNING DATA INTEGRITY SUITE")
    print(_SEPARATOR)
    integrity_result = data_integrity().run(train_dataset)

    # Run Train-Test Validation Suite
    print("\n" + _SEPARATOR)
    print("RUNNING TRAIN-TEST VALIDATION SUITE")
    print(_SEPARATOR)
    validation_result = train_test_validation().run(train_dataset, test_dataset)

    # Save reports
    output_dir = Path("reports/deepchecks")
    output_dir.mkdir(parents=True, exist_ok=True)

    # Count passed/failed checks; checks that did not run count as failed.
    integrity_total = len(integrity_result.results)
    integrity_passed = _count_passed(integrity_result.results)
    validation_total = len(validation_result.results)
    validation_passed = _count_passed(validation_result.results)

    _write_summary(
        output_dir / "data_integrity_clean.json",
        "Data Integrity Suite (Cleaned)",
        integrity_total,
        integrity_passed,
    )
    _write_summary(
        output_dir / "train_test_validation_clean.json",
        "Train-Test Validation Suite (Cleaned)",
        validation_total,
        validation_passed,
    )

    print("\n" + _SEPARATOR)
    print("VALIDATION RESULTS")
    print(_SEPARATOR)
    print("\nData Integrity Suite:")
    print(f" Passed: {integrity_passed}/{integrity_total}")
    print("\nTrain-Test Validation Suite:")
    print(f" Passed: {validation_passed}/{validation_total}")

    # Check critical issues
    critical_issues = _collect_critical(
        integrity_result.results, "Data Integrity", ("Duplicate", "Conflict")
    )
    critical_issues += _collect_critical(
        validation_result.results, "Train-Test", ("Mix", "Leakage")
    )

    if critical_issues:
        print("\nCRITICAL ISSUES REMAINING:")
        for issue in critical_issues:
            print(f" - {issue}")
    else:
        print("\nNO CRITICAL ISSUES DETECTED!")
        print(" No duplicates")
        print(" No label conflicts")
        print(" No data leakage")
        print(" Data is ready for training")

    print(f"\nReports saved to: {output_dir}")
    print(_SEPARATOR)

    return integrity_result, validation_result


if __name__ == "__main__":
    run_validation()