File size: 15,642 Bytes
225af6a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
"""
Train-Test Validation Suite - Deepchecks validation for train-test consistency

This module implements comprehensive train-test validation checks using Deepchecks
to ensure consistency and proper splitting between training and test datasets.

Checks included:
- Train-Test Feature Drift: Detects distribution changes between train and test
- Train-Test Label Drift: Checks if label distribution differs
- Train-Test Samples Mix: Validates no data leakage
- Whole Dataset Drift: Overall distribution comparison
- Feature Label Correlation Change: Checks if correlations change
- New Label: Detects labels in test not present in train
- New Category: Detects new categorical values in test
- String Mismatch Comparison: Compares string inconsistencies
- Date Train Test Leakage Duplicates: Checks for temporal leakage
- Date Train Test Leakage Overlap: Validates proper temporal split
"""

import numpy as np
import pandas as pd
import json
from pathlib import Path
from deepchecks.tabular import Dataset
from deepchecks.tabular.suites import train_test_validation

from hopcroft_skill_classification_tool_competition.config import PROCESSED_DATA_DIR


def load_train_test_data(use_cleaned=True):
    """
    Load the TF-IDF train/test feature and label arrays from disk.

    Args:
        use_cleaned: If True (the default), load the cleaned variant of the
            data — the files carrying a '_clean' suffix.

    Returns:
        tuple: (X_train, y_train, X_test, y_test) as numpy arrays
    """
    tfidf_dir = PROCESSED_DATA_DIR / "tfidf"

    # The cleaned and original variants differ only by a file-name suffix.
    suffix = "_clean" if use_cleaned else ""
    data_type = "cleaned" if use_cleaned else "original"

    X_train = np.load(tfidf_dir / f"features_tfidf{suffix}.npy")
    y_train = np.load(tfidf_dir / f"labels_tfidf{suffix}.npy")
    X_test = np.load(tfidf_dir / f"X_test{suffix}.npy")
    y_test = np.load(tfidf_dir / f"Y_test{suffix}.npy")

    print(f"Loaded {data_type} data:")
    print(f"Training set shape: X={X_train.shape}, y={y_train.shape}")
    print(f"Test set shape: X={X_test.shape}, y={y_test.shape}")

    return X_train, y_train, X_test, y_test


def create_deepchecks_dataset(X, y, dataset_name="dataset"):
    """
    Wrap numpy feature/label arrays in a Deepchecks Dataset.

    Args:
        X: Feature matrix (2D numpy array)
        y: Labels (numpy array) - can be multi-label (2D) or single-label (1D)
        dataset_name: Name identifier for the dataset (currently unused by
            the Dataset constructor; kept for caller compatibility)

    Returns:
        Dataset: Deepchecks Dataset object with a single 'label' column
    """
    # Build a DataFrame with synthetic column names so Deepchecks can
    # display per-feature results.
    columns = [f"feature_{i}" for i in range(X.shape[1])]
    frame = pd.DataFrame(X, columns=columns)

    if y.ndim > 1 and y.shape[1] > 1:
        # Multi-label case: Deepchecks expects a single label column, so
        # collapse each row to the index of its maximum value (for 0/1
        # indicator rows this is the first active label).
        frame['label'] = np.argmax(y, axis=1)
        print(f"Note: Converted multi-label ({y.shape[1]} labels) to single-label for Deepchecks")
    else:
        frame['label'] = y

    # All features are numeric TF-IDF values, hence no categorical columns.
    return Dataset(frame, label='label', cat_features=[])


def run_train_test_validation_suite(save_output=True, use_cleaned=True):
    """
    Execute the full Deepchecks Train-Test Validation Suite.

    The suite bundles checks for feature drift, label drift, sample mixing
    (leakage), whole-dataset drift, feature-label correlation changes, new
    labels/categories, string mismatches, and date/index leakage.

    Args:
        save_output: If True, write a JSON summary of the check results to
            reports/deepchecks/.
        use_cleaned: If True, validate the cleaned data instead of the
            original data.

    Returns:
        SuiteResult: Results from the train-test validation suite
    """
    banner = "=" * 80
    data_type = "CLEANED" if use_cleaned else "ORIGINAL"
    print(banner)
    print(f"TRAIN-TEST VALIDATION SUITE - {data_type} DATA")
    print(banner)

    # Load arrays and wrap them for Deepchecks.
    X_train, y_train, X_test, y_test = load_train_test_data(use_cleaned=use_cleaned)
    train_dataset = create_deepchecks_dataset(X_train, y_train, "training")
    test_dataset = create_deepchecks_dataset(X_test, y_test, "test")

    print("\nRunning Train-Test Validation checks...")
    result = train_test_validation().run(train_dataset, test_dataset)

    print("\nTrain-Test Validation Suite completed!")
    print(f"Total checks: {len(result.results)}")

    if save_output:
        output_dir = Path("reports/deepchecks")
        output_dir.mkdir(parents=True, exist_ok=True)

        # The suffix distinguishes reports for cleaned vs. original data.
        suffix = "_clean" if use_cleaned else "_original"
        json_path = output_dir / f"train_test_validation_suite_results{suffix}.json"

        # Serialize each check result defensively — some Deepchecks result
        # objects lack passed_conditions()/display attributes.
        checks = [
            {
                "check_name": cr.get_header(),
                "passed": cr.passed_conditions() if hasattr(cr, 'passed_conditions') else None,
                "display": str(cr.display) if hasattr(cr, 'display') else None,
            }
            for cr in result.results
        ]
        payload = {
            "suite_name": "Train-Test Validation Suite",
            "total_checks": len(result.results),
            "timestamp": pd.Timestamp.now().isoformat(),
            "checks": checks,
        }

        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(payload, f, indent=2, ensure_ascii=False)
        print(f"JSON results saved to: {json_path}")

    return result


def run_custom_train_test_checks(save_output=True, use_cleaned=True):
    """
    Run a hand-picked set of train-test validation checks for the
    SkillScope dataset (NLP/TF-IDF features, multi-label classification).

    Args:
        save_output: Accepted for API symmetry with the suite runner;
            results are currently only returned in memory — nothing is
            written to disk by this function.
        use_cleaned: If True, use cleaned data instead of original

    Returns:
        dict: Check results keyed by 'feature_drift', 'label_drift',
            'samples_mix', 'dataset_drift', and 'correlation_change'
    """
    from deepchecks.tabular.checks import (
        TrainTestFeatureDrift,
        TrainTestLabelDrift,
        TrainTestSamplesMix,
        WholeDatasetDrift,
        FeatureLabelCorrelationChange,
    )

    data_type = "CLEANED" if use_cleaned else "ORIGINAL"
    banner = "=" * 80
    print(banner)
    print(f"CUSTOM TRAIN-TEST VALIDATION CHECKS - {data_type} DATA")
    print(banner)

    X_train, y_train, X_test, y_test = load_train_test_data(use_cleaned=use_cleaned)
    train_dataset = create_deepchecks_dataset(X_train, y_train, "training")
    test_dataset = create_deepchecks_dataset(X_test, y_test, "test")

    # (progress message, result key, check class) — each check is run
    # against the same train/test dataset pair.
    planned_checks = [
        ("\n1. Checking for feature drift between train and test...",
         'feature_drift', TrainTestFeatureDrift),
        ("2. Checking for label drift between train and test...",
         'label_drift', TrainTestLabelDrift),
        ("3. Checking for data leakage (samples appearing in both sets)...",
         'samples_mix', TrainTestSamplesMix),
        ("4. Checking overall dataset drift...",
         'dataset_drift', WholeDatasetDrift),
        ("5. Checking for changes in feature-label correlation...",
         'correlation_change', FeatureLabelCorrelationChange),
    ]

    results = {}
    for message, key, check_cls in planned_checks:
        print(message)
        results[key] = check_cls().run(train_dataset, test_dataset)

    # NewLabel (would flag test labels absent from train) is unavailable in
    # the installed Deepchecks version, so step 6 is intentionally skipped.
    print("6. Skipping NewLabel check (not available in this Deepchecks version)")

    print("\nAll custom train-test checks completed!")

    return results


def compare_distributions(use_cleaned=True):
    """
    Print a statistical comparison of the train and test sets: sample sizes,
    feature dimensions, label coverage, mean/std, and sparsity.

    Args:
        use_cleaned: If True, compare cleaned data instead of original
    """
    data_type = "CLEANED" if use_cleaned else "ORIGINAL"
    bar = "=" * 80
    print(bar)
    print(f"TRAIN-TEST DISTRIBUTION COMPARISON - {data_type} DATA")
    print(bar)

    X_train, y_train, X_test, y_test = load_train_test_data(use_cleaned=use_cleaned)

    n_train, n_test = X_train.shape[0], X_test.shape[0]

    print("\n1. SAMPLE SIZES:")
    print(f"   Training: {n_train} samples")
    print(f"   Test: {n_test} samples")
    print(f"   Train/Test ratio: {n_train/n_test:.2f}")

    print("\n2. FEATURE DIMENSIONS:")
    print(f"   Training features: {X_train.shape[1]}")
    print(f"   Test features: {X_test.shape[1]}")
    if X_train.shape[1] == X_test.shape[1]:
        print("   ✓ Feature dimensions match")
    else:
        print("  WARNING: Feature dimensions don't match!")

    print("\n3. LABEL DISTRIBUTION:")
    train_labels = set(np.unique(y_train))
    test_labels = set(np.unique(y_test))

    print(f"   Training unique labels: {len(train_labels)}")
    print(f"   Test unique labels: {len(test_labels)}")

    # Labels seen in test but never during training are a red flag.
    new_labels = test_labels - train_labels
    if new_labels:
        print(f"   WARNING: {len(new_labels)} new labels in test set: {new_labels}")
    else:
        print("   No new labels in test set")

    # Train-only labels are merely informational.
    missing_labels = train_labels - test_labels
    if missing_labels:
        print(f"   INFO: {len(missing_labels)} labels only in train set")

    print("\n4. FEATURE STATISTICS COMPARISON:")
    train_mean, train_std = X_train.mean(), X_train.std()
    test_mean, test_std = X_test.mean(), X_test.std()
    print(f"   Train - Mean: {train_mean:.4f}, Std: {train_std:.4f}")
    print(f"   Test  - Mean: {test_mean:.4f}, Std: {test_std:.4f}")

    mean_diff = abs(train_mean - test_mean)
    std_diff = abs(train_std - test_std)

    print(f"   Mean difference: {mean_diff:.4f}")
    print(f"   Std difference: {std_diff:.4f}")

    if mean_diff > 0.1 or std_diff > 0.1:
        print("   WARNING: Significant statistical differences detected!")
    else:
        print("   Statistical distributions are similar")

    print("\n5. SPARSITY COMPARISON:")
    # Percentage of exact-zero entries — TF-IDF matrices are typically sparse.
    train_sparsity = (X_train == 0).sum() / X_train.size * 100
    test_sparsity = (X_test == 0).sum() / X_test.size * 100
    sparsity_gap = abs(train_sparsity - test_sparsity)
    print(f"   Training sparsity: {train_sparsity:.2f}%")
    print(f"   Test sparsity: {test_sparsity:.2f}%")
    print(f"   Sparsity difference: {sparsity_gap:.2f}%")

    if sparsity_gap > 5:
        print("  WARNING: Significant sparsity difference!")
    else:
        print("   Sparsity levels are similar")


def validate_split_quality(use_cleaned=True):
    """
    Validate the quality of the train-test split.

    Checks that the test-set ratio is within the commonly recommended range
    and that the label distributions of train and test are statistically
    similar, via a chi-square goodness-of-fit test over the labels common
    to both sets.

    Args:
        use_cleaned: If True, validate cleaned data instead of original
    """
    data_type = "CLEANED" if use_cleaned else "ORIGINAL"
    print("="*80)
    print(f"TRAIN-TEST SPLIT QUALITY VALIDATION - {data_type} DATA")
    print("="*80)

    X_train, y_train, X_test, y_test = load_train_test_data(use_cleaned=use_cleaned)

    total_samples = X_train.shape[0] + X_test.shape[0]
    test_ratio = X_test.shape[0] / total_samples

    print(f"\nTotal samples: {total_samples}")
    print(f"Test set ratio: {test_ratio:.2%}")

    # Validate test set size (typically 20-30%)
    if 0.15 <= test_ratio <= 0.35:
        print(" Test set size is within recommended range (15-35%)")
    else:
        print(" WARNING: Test set size is outside recommended range")

    # Check label distribution similarity
    from scipy.stats import chisquare

    # Restrict the comparison to labels present in BOTH sets.
    common_labels = np.intersect1d(np.unique(y_train), np.unique(y_test))

    if len(common_labels) > 0:
        train_dist = np.array([np.sum(y_train == label) for label in common_labels])
        test_dist = np.array([np.sum(y_test == label) for label in common_labels])

        # scipy.stats.chisquare requires sum(f_obs) == sum(f_exp). Normalize
        # the test counts over the common labels only, then scale to the
        # observed train total so the sums match exactly. (The previous
        # scaling by len(y_train) raised ValueError whenever either split
        # contained labels absent from the other.)
        test_props = test_dist / test_dist.sum()
        expected = test_props * train_dist.sum()
        chi_stat, p_value = chisquare(train_dist, expected)

        print(f"\nLabel distribution similarity (chi-square test):")
        print(f"  Chi-square statistic: {chi_stat:.4f}")
        print(f"  P-value: {p_value:.4f}")

        if p_value > 0.05:
            print("  Label distributions are statistically similar (p > 0.05)")
        else:
            print("   WARNING: Label distributions differ significantly (p <= 0.05)")
    else:
        print(" WARNING: No common labels between train and test sets!")


if __name__ == "__main__":
    import sys

    # Cleaned data is the default; pass --original (or -o) to test old data.
    use_cleaned = '--original' not in sys.argv and '-o' not in sys.argv

    if use_cleaned:
        print("Testing CLEANED data (from data_cleaning.py) - DEFAULT\n")
    else:
        print("Testing ORIGINAL data\n")
        print("Note: Using --original flag to test old data\n")

    # 1) Statistical comparison of the two splits.
    compare_distributions(use_cleaned=use_cleaned)

    # 2) Split-quality validation (ratio + label distribution test).
    print("\n")
    validate_split_quality(use_cleaned=use_cleaned)

    # 3) Full Deepchecks Train-Test Validation Suite (JSON report saved).
    print("\n")
    suite_result = run_train_test_validation_suite(save_output=True, use_cleaned=use_cleaned)

    # 4) Hand-picked custom checks.
    print("\n")
    custom_results = run_custom_train_test_checks(save_output=True, use_cleaned=use_cleaned)

    print("\n" + "="*80)
    print("TRAIN-TEST VALIDATION COMPLETED")
    print("="*80)
    print("\nCheck the reports in the 'reports/deepchecks' directory")