"""
Validation script for cleaned data.

This script runs Deepchecks validation on the cleaned dataset to verify that:
1. No duplicates remain
2. No label conflicts exist
3. No data leakage between train and test
4. All data quality issues are resolved

Run this after data_cleaning.py to confirm data quality.
"""

import json
from pathlib import Path

import numpy as np
import pandas as pd
from deepchecks.tabular import Dataset
from deepchecks.tabular.suites import data_integrity, train_test_validation

from hopcroft_skill_classification_tool_competition.config import PROCESSED_DATA_DIR


def load_cleaned_data():
    """Load cleaned train and test datasets."""
    tfidf_dir = PROCESSED_DATA_DIR / "tfidf"
    
    X_train = np.load(tfidf_dir / "features_tfidf_clean.npy")
    y_train = np.load(tfidf_dir / "labels_tfidf_clean.npy")
    X_test = np.load(tfidf_dir / "X_test_clean.npy")
    y_test = np.load(tfidf_dir / "Y_test_clean.npy")
    
    print(f"Loaded cleaned data:")
    print(f"  Train: {X_train.shape[0]:,} samples x {X_train.shape[1]:,} features")
    print(f"  Test:  {X_test.shape[0]:,} samples x {X_test.shape[1]:,} features")
    print(f"  Labels: {y_train.shape[1]} labels")
    
    return X_train, y_train, X_test, y_test


def create_deepchecks_dataset(X, y, dataset_name="dataset"):
    """Create a Deepchecks Dataset from numpy feature and label arrays."""
    feature_names = [f"feature_{i}" for i in range(X.shape[1])]
    df = pd.DataFrame(X, columns=feature_names)

    # The tabular suites expect a single label column, so reduce a
    # multi-label matrix to one label per row via argmax.
    if y.ndim > 1 and y.shape[1] > 1:
        df['label'] = np.argmax(y, axis=1)
    else:
        df['label'] = y

    return Dataset(df, label='label', cat_features=[], dataset_name=dataset_name)
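
# Note: a multi-label row such as y = [0, 1, 1] is reduced by the argmax above
# to the single label 1 (the index of the first maximum). Co-occurring labels
# are discarded, so the suites validate a simplified single-label view of the
# data rather than the full label matrix.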


def run_validation():
    """Run full Deepchecks validation on cleaned data."""
    print("="*80)
    print("DEEPCHECKS VALIDATION - CLEANED DATA")
    print("="*80)
    
    # Load cleaned data
    X_train, y_train, X_test, y_test = load_cleaned_data()
    
    # Create Deepchecks datasets
    train_dataset = create_deepchecks_dataset(X_train, y_train, "training_clean")
    test_dataset = create_deepchecks_dataset(X_test, y_test, "test_clean")
    
    # Run Data Integrity Suite
    print("\n" + "="*80)
    print("RUNNING DATA INTEGRITY SUITE")
    print("="*80)
    integrity_suite = data_integrity()
    integrity_result = integrity_suite.run(train_dataset)
    
    # Run Train-Test Validation Suite
    print("\n" + "="*80)
    print("RUNNING TRAIN-TEST VALIDATION SUITE")
    print("="*80)
    validation_suite = train_test_validation()
    validation_result = validation_suite.run(train_dataset, test_dataset)
    
    # Save JSON summary reports (json is imported at module level)
    output_dir = Path("reports/deepchecks")
    output_dir.mkdir(parents=True, exist_ok=True)

    # Count passed/failed checks (handle CheckFailure objects)
    integrity_passed = sum(1 for r in integrity_result.results if hasattr(r, 'passed_conditions') and r.passed_conditions())
    integrity_total = len(integrity_result.results)
    
    validation_passed = sum(1 for r in validation_result.results if hasattr(r, 'passed_conditions') and r.passed_conditions())
    validation_total = len(validation_result.results)
    
    # Save data integrity results as JSON
    integrity_json = {
        "suite_name": "Data Integrity Suite (Cleaned)",
        "total_checks": integrity_total,
        "timestamp": pd.Timestamp.now().isoformat(),
        "passed": integrity_passed,
        "failed": integrity_total - integrity_passed
    }
    with open(output_dir / "data_integrity_clean.json", 'w') as f:
        json.dump(integrity_json, f, indent=2)
    
    # Save train-test validation results as JSON
    validation_json = {
        "suite_name": "Train-Test Validation Suite (Cleaned)",
        "total_checks": validation_total,
        "timestamp": pd.Timestamp.now().isoformat(),
        "passed": validation_passed,
        "failed": validation_total - validation_passed
    }
    with open(output_dir / "train_test_validation_clean.json", 'w') as f:
        json.dump(validation_json, f, indent=2)
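
    # The JSON files above are lightweight pass/fail summaries. Full interactive
    # reports can also be saved from the same suite results, e.g. (output file
    # names here are illustrative):
    #     integrity_result.save_as_html(str(output_dir / "data_integrity_clean.html"))
    #     validation_result.save_as_html(str(output_dir / "train_test_validation_clean.html"))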
    
    print("\n" + "="*80)
    print("VALIDATION RESULTS")
    print("="*80)
    
    print(f"\nData Integrity Suite:")
    print(f"   Passed: {integrity_passed}/{integrity_total}")
    
    print(f"\nTrain-Test Validation Suite:")
    print(f"   Passed: {validation_passed}/{validation_total}")
    
    # Flag failed checks that correspond to the issues data_cleaning.py was
    # meant to fix. Matching on header substrings is a heuristic tied to the
    # Deepchecks check names (e.g. "Data Duplicates", "Train Test Samples Mix").
    critical_issues = []
    
    for result in integrity_result.results:
        if hasattr(result, 'passed_conditions') and not result.passed_conditions():
            check_name = result.get_header()
            if "Duplicate" in check_name or "Conflict" in check_name:
                critical_issues.append(f"Data Integrity: {check_name}")
    
    for result in validation_result.results:
        if hasattr(result, 'passed_conditions') and not result.passed_conditions():
            check_name = result.get_header()
            if "Mix" in check_name or "Leakage" in check_name:
                critical_issues.append(f"Train-Test: {check_name}")
    
    if critical_issues:
        print("\nCRITICAL ISSUES REMAINING:")
        for issue in critical_issues:
            print(f"   - {issue}")
    else:
        print("\nNO CRITICAL ISSUES DETECTED!")
        print("   No duplicates")
        print("   No label conflicts")
        print("   No data leakage")
        print("   Data is ready for training")
    
    print(f"\nReports saved to: {output_dir}")
    print("="*80)
    
    return integrity_result, validation_result


if __name__ == "__main__":
    run_validation()