# NOTE: non-source extraction artifacts removed from the top of this file.
"""
Validation script for cleaned data.

This script runs Deepchecks validation on the cleaned dataset to verify that:
1. No duplicates remain
2. No label conflicts exist
3. No data leakage between train and test
4. All data quality issues are resolved

Run this after data_cleaning.py to confirm data quality.
"""
import json
from pathlib import Path

import numpy as np
import pandas as pd
from deepchecks.tabular import Dataset
from deepchecks.tabular.suites import data_integrity, train_test_validation

from hopcroft_skill_classification_tool_competition.config import PROCESSED_DATA_DIR
def load_cleaned_data():
    """Read the cleaned TF-IDF train/test arrays from disk and echo their shapes.

    Returns:
        Tuple of (X_train, y_train, X_test, y_test) numpy arrays loaded from
        the ``tfidf`` subdirectory of ``PROCESSED_DATA_DIR``.
    """
    base = PROCESSED_DATA_DIR / "tfidf"
    train_features = np.load(base / "features_tfidf_clean.npy")
    train_labels = np.load(base / "labels_tfidf_clean.npy")
    test_features = np.load(base / "X_test_clean.npy")
    test_labels = np.load(base / "Y_test_clean.npy")
    print("Loaded cleaned data:")
    print(f" Train: {train_features.shape[0]:,} samples x {train_features.shape[1]:,} features")
    print(f" Test: {test_features.shape[0]:,} samples x {test_features.shape[1]:,} features")
    # NOTE(review): assumes the label array is 2-D (multi-label) — confirm upstream.
    print(f" Labels: {train_labels.shape[1]} labels")
    return train_features, train_labels, test_features, test_labels
def create_deepchecks_dataset(X, y, dataset_name="dataset"):
    """Build a Deepchecks ``Dataset`` from feature/label numpy arrays.

    Args:
        X: 2-D feature array, shape (n_samples, n_features).
        y: label array; either 1-D, or 2-D one-hot/multi-label with shape
            (n_samples, n_labels).
        dataset_name: currently unused; kept for backward compatibility with
            existing callers that pass a name positionally.

    Returns:
        A ``deepchecks.tabular.Dataset`` whose frame holds the features plus a
        single ``'label'`` column, with no categorical features declared.
    """
    feature_names = [f"feature_{i}" for i in range(X.shape[1])]
    df = pd.DataFrame(X, columns=feature_names)
    # Deepchecks expects a single label column: collapse multi-label one-hot
    # rows to the index of their dominant class.
    if y.ndim > 1 and y.shape[1] > 1:
        df['label'] = np.argmax(y, axis=1)
    else:
        # np.ravel flattens an (n, 1) column vector; assigning the raw 2-D
        # array to a column raises in recent pandas versions. 1-D input passes
        # through unchanged.
        df['label'] = np.ravel(y)
    return Dataset(df, label='label', cat_features=[])
def _passed_count(results):
    """Count results whose conditions all passed (CheckFailure objects lack
    ``passed_conditions`` and are counted as not passed)."""
    return sum(1 for r in results if hasattr(r, 'passed_conditions') and r.passed_conditions())


def _failed_headers(results, keywords):
    """Return headers of failed checks whose name contains any of *keywords*."""
    headers = []
    for result in results:
        if hasattr(result, 'passed_conditions') and not result.passed_conditions():
            name = result.get_header()
            if any(keyword in name for keyword in keywords):
                headers.append(name)
    return headers


def _write_summary(path, suite_name, total, passed):
    """Write a small JSON pass/fail summary for one suite run to *path*."""
    summary = {
        "suite_name": suite_name,
        "total_checks": total,
        "timestamp": pd.Timestamp.now().isoformat(),
        "passed": passed,
        "failed": total - passed,
    }
    with open(path, 'w') as f:
        json.dump(summary, f, indent=2)


def run_validation():
    """Run full Deepchecks validation on cleaned data.

    Loads the cleaned train/test arrays, runs the data-integrity suite on the
    training set and the train-test validation suite across both, writes JSON
    summaries under ``reports/deepchecks``, and prints whether any critical
    issues (duplicates, conflicts, leakage) remain.

    Returns:
        Tuple of (integrity_result, validation_result) Deepchecks suite results.
    """
    print("="*80)
    print("DEEPCHECKS VALIDATION - CLEANED DATA")
    print("="*80)

    X_train, y_train, X_test, y_test = load_cleaned_data()

    train_dataset = create_deepchecks_dataset(X_train, y_train, "training_clean")
    test_dataset = create_deepchecks_dataset(X_test, y_test, "test_clean")

    # Data Integrity Suite (training data only).
    print("\n" + "="*80)
    print("RUNNING DATA INTEGRITY SUITE")
    print("="*80)
    integrity_result = data_integrity().run(train_dataset)

    # Train-Test Validation Suite (cross-split checks, incl. leakage).
    print("\n" + "="*80)
    print("RUNNING TRAIN-TEST VALIDATION SUITE")
    print("="*80)
    validation_result = train_test_validation().run(train_dataset, test_dataset)

    output_dir = Path("reports/deepchecks")
    output_dir.mkdir(parents=True, exist_ok=True)

    integrity_passed = _passed_count(integrity_result.results)
    integrity_total = len(integrity_result.results)
    validation_passed = _passed_count(validation_result.results)
    validation_total = len(validation_result.results)

    _write_summary(output_dir / "data_integrity_clean.json",
                   "Data Integrity Suite (Cleaned)",
                   integrity_total, integrity_passed)
    _write_summary(output_dir / "train_test_validation_clean.json",
                   "Train-Test Validation Suite (Cleaned)",
                   validation_total, validation_passed)

    print("\n" + "="*80)
    print("VALIDATION RESULTS")
    print("="*80)
    print("\nData Integrity Suite:")
    print(f" Passed: {integrity_passed}/{integrity_total}")
    print("\nTrain-Test Validation Suite:")
    print(f" Passed: {validation_passed}/{validation_total}")

    # A failure only counts as critical when its check name matches the issue
    # classes this pipeline was cleaned for.
    critical_issues = [f"Data Integrity: {name}"
                       for name in _failed_headers(integrity_result.results,
                                                   ("Duplicate", "Conflict"))]
    critical_issues += [f"Train-Test: {name}"
                        for name in _failed_headers(validation_result.results,
                                                    ("Mix", "Leakage"))]

    if critical_issues:
        print("\nCRITICAL ISSUES REMAINING:")
        for issue in critical_issues:
            print(f" - {issue}")
    else:
        print("\nNO CRITICAL ISSUES DETECTED!")
        print(" No duplicates")
        print(" No label conflicts")
        print(" No data leakage")
        print(" Data is ready for training")

    print(f"\nReports saved to: {output_dir}")
    print("="*80)
    return integrity_result, validation_result
# Script entry point: run the full validation pass when executed directly.
if __name__ == "__main__":
    run_validation()