File size: 11,392 Bytes
225af6a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
"""
Data Integrity Suite - Deepchecks validation for dataset integrity

This module implements comprehensive data integrity checks using Deepchecks
to validate the quality and consistency of the training and test datasets.

Checks included:
- Data duplicates detection
- Missing values analysis
- Feature-label correlation
- Feature-feature correlation
- Data type consistency
- Outlier detection
- Class imbalance analysis
"""

import numpy as np
import pandas as pd
import json
from pathlib import Path
from deepchecks.tabular import Dataset
from deepchecks.tabular.suites import data_integrity

from hopcroft_skill_classification_tool_competition.config import PROCESSED_DATA_DIR


def load_data(use_cleaned=True):
    """
    Load training and test datasets from the processed data directory.

    Args:
        use_cleaned: If True (default), load the cleaned variants of the
            feature/label files (those carrying the '_clean' suffix);
            otherwise load the original, pre-cleaning artifacts.

    Returns:
        tuple: (X_train, y_train, X_test, y_test) as numpy arrays.
    """
    tfidf_dir = PROCESSED_DATA_DIR / "tfidf"

    # The cleaned and original file sets differ only by a '_clean' suffix,
    # so derive all four paths from a single flag-driven suffix.
    suffix = "_clean" if use_cleaned else ""
    data_type = "cleaned" if use_cleaned else "original"

    X_train = np.load(tfidf_dir / f"features_tfidf{suffix}.npy")
    y_train = np.load(tfidf_dir / f"labels_tfidf{suffix}.npy")
    X_test = np.load(tfidf_dir / f"X_test{suffix}.npy")
    y_test = np.load(tfidf_dir / f"Y_test{suffix}.npy")

    print(f"Loaded {data_type} data:")
    print(f"Training set shape: X={X_train.shape}, y={y_train.shape}")
    print(f"Test set shape: X={X_test.shape}, y={y_test.shape}")

    return X_train, y_train, X_test, y_test


def create_deepchecks_dataset(X, y, dataset_name="dataset"):
    """
    Create a Deepchecks Dataset object from numpy arrays.

    Args:
        X: Feature matrix (numpy array of shape (n_samples, n_features))
        y: Labels (numpy array) - can be multi-label (2D) or single-label (1D)
        dataset_name: Name identifier for the dataset, shown in Deepchecks
            reports (e.g. "training", "test")

    Returns:
        Dataset: Deepchecks Dataset object
    """
    # Convert to DataFrame for better visualization; synthetic column names
    # let Deepchecks reference individual features in its reports.
    feature_names = [f"feature_{i}" for i in range(X.shape[1])]
    df = pd.DataFrame(X, columns=feature_names)

    # Deepchecks tabular checks expect a single label column. For a
    # multi-label (2D) target, collapse each row to the index of its maximum
    # value; for binary indicator rows this is the first/strongest active label.
    if y.ndim > 1 and y.shape[1] > 1:
        df['label'] = np.argmax(y, axis=1)
        print(f"Note: Converted multi-label ({y.shape[1]} labels) to single-label for Deepchecks")
    else:
        df['label'] = y

    # Fix: `dataset_name` was previously accepted but ignored; forward it so
    # reports can distinguish the datasets being checked.
    ds = Dataset(df, label='label', cat_features=[], dataset_name=dataset_name)

    return ds


def run_data_integrity_suite(save_output=True, use_cleaned=True):
    """
    Run the complete Data Integrity Suite on training data.

    The suite bundles comprehensive checks, including:
    - Data Duplicates: Identifies duplicate samples
    - String Mismatch: Checks for string inconsistencies
    - Mixed Nulls: Detects various null representations
    - Mixed Data Types: Validates consistent data types
    - String Length Out Of Bounds: Checks string length anomalies
    - Is Single Value: Identifies features with only one value
    - Special Characters: Detects special characters in data
    - Class Imbalance: Analyzes label distribution
    - Outlier Sample Detection: Identifies outlier samples
    - Feature Label Correlation: Checks correlation between features and labels

    Args:
        save_output: Whether to save a JSON summary of the results under
            'reports/deepchecks'.
        use_cleaned: If True, use cleaned data instead of original.

    Returns:
        SuiteResult: Results from the data integrity suite
    """
    data_type = "CLEANED" if use_cleaned else "ORIGINAL"
    banner = "=" * 80
    print(banner)
    print(f"DATA INTEGRITY SUITE - {data_type} TRAINING DATA")
    print(banner)

    # Only the training split is validated here; the test split is ignored.
    X_train, y_train, _, _ = load_data(use_cleaned=use_cleaned)
    train_dataset = create_deepchecks_dataset(X_train, y_train, "training")

    print("\nRunning Data Integrity checks...")
    result = data_integrity().run(train_dataset)

    print("\nData Integrity Suite completed!")
    print(f"Total checks: {len(result.results)}")

    if save_output:
        output_dir = Path("reports/deepchecks")
        output_dir.mkdir(parents=True, exist_ok=True)

        # File name encodes which data variant was validated.
        suffix = "_clean" if use_cleaned else "_original"
        json_path = output_dir / f"data_integrity_suite_results{suffix}.json"

        # Summarize each check result; not every result type exposes
        # passed_conditions/display, hence the hasattr guards.
        checks = [
            {
                "check_name": cr.get_header(),
                "passed": cr.passed_conditions() if hasattr(cr, 'passed_conditions') else None,
                "display": str(cr.display) if hasattr(cr, 'display') else None,
            }
            for cr in result.results
        ]
        payload = {
            "suite_name": "Data Integrity Suite",
            "total_checks": len(result.results),
            "timestamp": pd.Timestamp.now().isoformat(),
            "checks": checks,
        }

        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(payload, f, indent=2, ensure_ascii=False)
        print(f"JSON results saved to: {json_path}")

    return result


def run_custom_integrity_checks(save_output=True, use_cleaned=True):
    """
    Run custom integrity checks tailored for the SkillScope dataset.

    These checks are specifically designed for NLP/Text features and
    multi-label classification tasks. Six checks run in sequence on the
    training split; the slow ones (outliers, correlation) are wrapped in
    try/except so a failure or timeout does not abort the remaining work.

    Args:
        save_output: Whether to save the HTML report.
            NOTE(review): currently unused — results are only returned in
            memory; confirm whether report saving should be implemented here.
        use_cleaned: If True, use cleaned data instead of original.

    Returns:
        dict: Check results keyed by 'duplicates', 'nulls', 'single_value',
            'class_imbalance', 'outliers', and 'correlation'. The last two
            are None if their check failed or timed out.
    """
    # Imported lazily so the module can be loaded without pulling in all
    # individual check classes unless this function is actually called.
    from deepchecks.tabular.checks import (
        DataDuplicates,
        MixedNulls,
        IsSingleValue,
        ClassImbalance,
        OutlierSampleDetection,
        FeatureLabelCorrelation,
    )
    
    data_type = "CLEANED" if use_cleaned else "ORIGINAL"
    print("="*80)
    print(f"CUSTOM DATA INTEGRITY CHECKS - {data_type} DATA")
    print("="*80)
    
    # Only the training split is checked; the test split is discarded.
    X_train, y_train, _, _ = load_data(use_cleaned=use_cleaned)
    train_dataset = create_deepchecks_dataset(X_train, y_train, "training")
    
    results = {}
    
    # Check 1: Data Duplicates — flags identical samples.
    print("\n1. Checking for duplicate samples...")
    duplicates_check = DataDuplicates()
    results['duplicates'] = duplicates_check.run(train_dataset)
    
    # Check 2: Mixed Nulls — detects inconsistent null representations.
    print("2. Checking for mixed null values...")
    nulls_check = MixedNulls()
    results['nulls'] = nulls_check.run(train_dataset)
    
    # Check 3: Single Value Features — features carrying no information.
    print("3. Checking for single-value features...")
    single_value_check = IsSingleValue()
    results['single_value'] = single_value_check.run(train_dataset)
    
    # Check 4: Class Imbalance — label distribution skew.
    print("4. Checking class distribution...")
    imbalance_check = ClassImbalance()
    results['class_imbalance'] = imbalance_check.run(train_dataset)
    
    # Check 5: Outlier Detection (with increased timeout). Guarded because
    # it is expensive on wide TF-IDF matrices and may raise/time out.
    print("5. Detecting outlier samples (this may take a while)...")
    try:
        outlier_check = OutlierSampleDetection(timeout=300)  # 5 minutes timeout
        results['outliers'] = outlier_check.run(train_dataset)
    except Exception as e:
        print(f"   Warning: Outlier detection failed or timed out: {str(e)}")
        results['outliers'] = None
    
    # Check 6: Feature-Label Correlation, restricted to a subset of columns
    # for speed. NOTE(review): the kwargs (n_top_columns, timeout) are
    # version-dependent in deepchecks — verify against the installed version.
    print("6. Analyzing feature-label correlation (using sample of features)...")
    try:
        # Use only top 100 features for correlation to speed up
        correlation_check = FeatureLabelCorrelation(n_top_columns=100, timeout=300)
        results['correlation'] = correlation_check.run(train_dataset)
    except Exception as e:
        print(f"   Warning: Correlation check failed or timed out: {str(e)}")
        results['correlation'] = None
    
    print("\nAll custom checks completed!")
    
    # Results are available in memory for further processing if needed
    
    return results


def analyze_data_statistics(use_cleaned=True):
    """
    Print detailed statistics about the dataset.

    Reports sample/feature counts for both splits, the training label
    distribution (first 10 labels), and summary statistics of the
    training feature matrix, including its sparsity.

    Args:
        use_cleaned: If True, analyze cleaned data instead of original.
    """
    data_type = "CLEANED" if use_cleaned else "ORIGINAL"
    banner = "=" * 80
    print(banner)
    print(f"DATASET STATISTICS - {data_type} DATA")
    print(banner)

    X_train, y_train, X_test, y_test = load_data(use_cleaned=use_cleaned)

    print("\nTraining set:")
    print(f"  - Samples: {X_train.shape[0]}")
    print(f"  - Features: {X_train.shape[1]}")
    print(f"  - Unique labels: {len(np.unique(y_train))}")
    print("  - Label distribution:")

    labels, counts = np.unique(y_train, return_counts=True)
    total = len(y_train)
    # Only the first 10 labels are listed to keep the output compact.
    for lbl, cnt in zip(labels[:10], counts[:10]):
        print(f"    Label {lbl}: {cnt} samples ({cnt/total*100:.2f}%)")
    if len(labels) > 10:
        print(f"    ... and {len(labels)-10} more labels")

    print("\nTest set:")
    print(f"  - Samples: {X_test.shape[0]}")
    print(f"  - Features: {X_test.shape[1]}")
    print(f"  - Unique labels: {len(np.unique(y_test))}")

    print("\nFeature statistics:")
    print(f"  - Mean feature value: {X_train.mean():.4f}")
    print(f"  - Std feature value: {X_train.std():.4f}")
    print(f"  - Min feature value: {X_train.min():.4f}")
    print(f"  - Max feature value: {X_train.max():.4f}")
    print(f"  - Sparsity: {(X_train == 0).sum() / X_train.size * 100:.2f}%")


if __name__ == "__main__":
    import sys

    # Cleaned data is the default; pass --original (or -o) to validate the
    # pre-cleaning artifacts instead.
    flags = set(sys.argv)
    use_cleaned = not ({'--original', '-o'} & flags)

    if use_cleaned:
        print("Testing CLEANED data (from data_cleaning.py) - DEFAULT\n")
    else:
        print("Testing ORIGINAL data\n")
        print("Note: Using --original flag to test old data\n")

    # Step 1: descriptive statistics of the chosen dataset variant.
    analyze_data_statistics(use_cleaned=use_cleaned)

    # Step 2: the full Deepchecks Data Integrity Suite.
    print("\n")
    suite_result = run_data_integrity_suite(save_output=True, use_cleaned=use_cleaned)

    # Step 3: the hand-picked custom checks.
    print("\n")
    custom_results = run_custom_integrity_checks(save_output=True, use_cleaned=use_cleaned)

    banner = "=" * 80
    print("\n" + banner)
    print("DATA INTEGRITY VALIDATION COMPLETED")
    print(banner)
    print("\nCheck the reports in the 'reports/deepchecks' directory")