| | """ |
| | Validation script for cleaned data. |
| | |
| | This script runs Deepchecks validation on the cleaned dataset to verify that: |
| | 1. No duplicates remain |
| | 2. No label conflicts exist |
| | 3. No data leakage between train and test |
| | 4. All data quality issues are resolved |
| | |
| | Run this after data_cleaning.py to confirm data quality. |
| | """ |
| |
|
import json
from pathlib import Path

import numpy as np
import pandas as pd
from deepchecks.tabular import Dataset
from deepchecks.tabular.suites import data_integrity, train_test_validation

from hopcroft_skill_classification_tool_competition.config import PROCESSED_DATA_DIR
| |
|
| |
|
def load_cleaned_data():
    """Load the cleaned TF-IDF train/test arrays from disk.

    Reads four ``.npy`` files from ``PROCESSED_DATA_DIR / "tfidf"`` and
    prints a short shape summary.

    Returns:
        tuple: ``(X_train, y_train, X_test, y_test)`` numpy arrays.
    """
    tfidf_dir = PROCESSED_DATA_DIR / "tfidf"

    X_train = np.load(tfidf_dir / "features_tfidf_clean.npy")
    y_train = np.load(tfidf_dir / "labels_tfidf_clean.npy")
    X_test = np.load(tfidf_dir / "X_test_clean.npy")
    y_test = np.load(tfidf_dir / "Y_test_clean.npy")

    # Plain string: no interpolation needed here (was a placeholder-less f-string).
    print("Loaded cleaned data:")
    print(f"  Train: {X_train.shape[0]:,} samples x {X_train.shape[1]:,} features")
    print(f"  Test: {X_test.shape[0]:,} samples x {X_test.shape[1]:,} features")
    # Guard against 1-D label arrays, which would make shape[1] raise IndexError.
    n_labels = y_train.shape[1] if y_train.ndim > 1 else 1
    print(f"  Labels: {n_labels} labels")

    return X_train, y_train, X_test, y_test
| |
|
| |
|
def create_deepchecks_dataset(X, y, dataset_name="dataset"):
    """Wrap numpy features/labels in a Deepchecks ``Dataset``.

    Args:
        X: 2-D feature array of shape (n_samples, n_features).
        y: Label array. A 2-D one-hot / multilabel matrix (more than one
            column) is collapsed to a single class index via argmax.
        dataset_name: Accepted for backward compatibility with existing
            callers but currently unused — the Deepchecks ``Dataset`` built
            here is not given a name.

    Returns:
        deepchecks.tabular.Dataset with a ``'label'`` column and no
        categorical features declared.
    """
    feature_names = [f"feature_{i}" for i in range(X.shape[1])]
    df = pd.DataFrame(X, columns=feature_names)

    if y.ndim > 1 and y.shape[1] > 1:
        # Multilabel / one-hot matrix: collapse to the argmax class index so
        # Deepchecks sees a single-label classification target.
        df['label'] = np.argmax(y, axis=1)
    else:
        # ravel() fixes the (n, 1) column-vector case: pandas rejects a 2-D
        # array in column assignment ("Data must be 1-dimensional").
        df['label'] = np.asarray(y).ravel()

    return Dataset(df, label='label', cat_features=[])
| |
|
| |
|
def _count_passed(results):
    """Count check results whose conditions all passed."""
    return sum(
        1 for r in results
        if hasattr(r, 'passed_conditions') and r.passed_conditions()
    )


def _write_summary(path, suite_name, results):
    """Write a small JSON pass/fail summary for one suite run to *path*."""
    passed = _count_passed(results)
    summary = {
        "suite_name": suite_name,
        "total_checks": len(results),
        "timestamp": pd.Timestamp.now().isoformat(),
        "passed": passed,
        "failed": len(results) - passed,
    }
    with open(path, 'w') as f:
        json.dump(summary, f, indent=2)


def _collect_critical(results, keywords, prefix):
    """Return '<prefix>: <header>' entries for failed checks matching *keywords*."""
    issues = []
    for result in results:
        if hasattr(result, 'passed_conditions') and not result.passed_conditions():
            check_name = result.get_header()
            if any(keyword in check_name for keyword in keywords):
                issues.append(f"{prefix}: {check_name}")
    return issues


def run_validation():
    """Run full Deepchecks validation on cleaned data.

    Loads the cleaned arrays, runs the ``data_integrity`` suite on the train
    set and the ``train_test_validation`` suite on the train/test pair,
    writes JSON pass/fail summaries under ``reports/deepchecks``, and prints
    a console report highlighting any remaining critical issues.

    Returns:
        tuple: ``(integrity_result, validation_result)`` Deepchecks suite
        result objects.
    """
    print("="*80)
    print("DEEPCHECKS VALIDATION - CLEANED DATA")
    print("="*80)

    X_train, y_train, X_test, y_test = load_cleaned_data()

    train_dataset = create_deepchecks_dataset(X_train, y_train, "training_clean")
    test_dataset = create_deepchecks_dataset(X_test, y_test, "test_clean")

    print("\n" + "="*80)
    print("RUNNING DATA INTEGRITY SUITE")
    print("="*80)
    integrity_result = data_integrity().run(train_dataset)

    print("\n" + "="*80)
    print("RUNNING TRAIN-TEST VALIDATION SUITE")
    print("="*80)
    validation_result = train_test_validation().run(train_dataset, test_dataset)

    output_dir = Path("reports/deepchecks")
    output_dir.mkdir(parents=True, exist_ok=True)

    # One summary file per suite; the construction was previously duplicated inline.
    _write_summary(
        output_dir / "data_integrity_clean.json",
        "Data Integrity Suite (Cleaned)",
        integrity_result.results,
    )
    _write_summary(
        output_dir / "train_test_validation_clean.json",
        "Train-Test Validation Suite (Cleaned)",
        validation_result.results,
    )

    integrity_passed = _count_passed(integrity_result.results)
    integrity_total = len(integrity_result.results)
    validation_passed = _count_passed(validation_result.results)
    validation_total = len(validation_result.results)

    print("\n" + "="*80)
    print("VALIDATION RESULTS")
    print("="*80)

    print("\nData Integrity Suite:")
    print(f"  Passed: {integrity_passed}/{integrity_total}")

    print("\nTrain-Test Validation Suite:")
    print(f"  Passed: {validation_passed}/{validation_total}")

    # Duplicates/conflicts and train-test mixing/leakage are the blockers
    # this script exists to verify were removed by the cleaning step.
    critical_issues = _collect_critical(
        integrity_result.results, ("Duplicate", "Conflict"), "Data Integrity"
    )
    critical_issues += _collect_critical(
        validation_result.results, ("Mix", "Leakage"), "Train-Test"
    )

    if critical_issues:
        print("\nCRITICAL ISSUES REMAINING:")
        for issue in critical_issues:
            print(f"  - {issue}")
    else:
        print("\nNO CRITICAL ISSUES DETECTED!")
        print("  No duplicates")
        print("  No label conflicts")
        print("  No data leakage")
        print("  Data is ready for training")

    print(f"\nReports saved to: {output_dir}")
    print("="*80)

    return integrity_result, validation_result
| |
|
| |
|
# Script entry point: run the full validation when executed directly
# (no side effects on import).
if __name__ == "__main__":
    run_validation()
| |
|