File size: 6,434 Bytes
225af6a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
"""
Comprehensive Deepchecks Test Runner - Original vs Cleaned Data Comparison

This script runs all Deepchecks tests on both original and cleaned data,
allowing for direct comparison of data quality improvements after cleaning.

Usage:
    python tests/deepchecks/run_all_tests_comparison.py

Output:
    - Generates reports for both original and cleaned data
    - Creates comparison summary
    - Saves all results in reports/deepchecks/
"""

import sys
from pathlib import Path

# Add project root to path so the sibling test modules and the project
# package import below resolve regardless of the current working directory.
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))

from test_data_integrity import (
    run_data_integrity_suite,
    run_custom_integrity_checks,
    analyze_data_statistics
)
from test_train_test_validation import (
    run_train_test_validation_suite,
    run_custom_train_test_checks,
    compare_distributions,
    validate_split_quality
)


def print_section_header(title):
    """Display *title* framed above and below by an 80-character '=' rule."""
    rule = "=" * 80
    print("\n" + rule)
    print(f"  {title}")
    print(rule + "\n")


def run_all_tests_for_data(use_cleaned=False):
    """
    Execute the full Deepchecks test battery against one dataset variant.

    Args:
        use_cleaned: If True, test cleaned data; otherwise test original data

    Returns:
        dict mapping suite names to the result objects returned by the four
        suite/custom-check runners (the statistics and distribution steps
        only print to stdout and contribute nothing to the return value).
    """
    label = "CLEANED" if use_cleaned else "ORIGINAL"

    banner = "#" * 80
    print("\n" + banner)
    print(f"#  RUNNING ALL DEEPCHECKS TESTS - {label} DATA")
    print(banner)

    # Steps run in a fixed numbered order so the console output and the
    # saved reports line up between the original and cleaned passes.
    print_section_header(f"1. DATASET STATISTICS - {label}")
    analyze_data_statistics(use_cleaned=use_cleaned)

    print_section_header(f"2. DATA INTEGRITY SUITE - {label}")
    integrity = run_data_integrity_suite(save_output=True, use_cleaned=use_cleaned)

    print_section_header(f"3. CUSTOM INTEGRITY CHECKS - {label}")
    custom_integrity = run_custom_integrity_checks(save_output=True, use_cleaned=use_cleaned)

    print_section_header(f"4. TRAIN-TEST DISTRIBUTION COMPARISON - {label}")
    compare_distributions(use_cleaned=use_cleaned)

    print_section_header(f"5. TRAIN-TEST SPLIT QUALITY - {label}")
    validate_split_quality(use_cleaned=use_cleaned)

    print_section_header(f"6. TRAIN-TEST VALIDATION SUITE - {label}")
    validation = run_train_test_validation_suite(save_output=True, use_cleaned=use_cleaned)

    print_section_header(f"7. CUSTOM TRAIN-TEST CHECKS - {label}")
    custom_validation = run_custom_train_test_checks(save_output=True, use_cleaned=use_cleaned)

    return {
        "integrity_suite": integrity,
        "custom_integrity": custom_integrity,
        "validation_suite": validation,
        "custom_validation": custom_validation,
    }


def main():
    """Run the full test battery on original data, and on cleaned data when present.

    Falls back to original-data-only mode (with a warning and next-step hints)
    when the cleaned .npy artifacts produced by data_cleaning.py are missing.
    """
    banner = "*" * 80
    print("\n" + banner)
    print("*  DEEPCHECKS COMPREHENSIVE TEST SUITE")
    print("*  Original vs Cleaned Data Comparison")
    print(banner)

    print("\nThis script will run all Deepchecks tests on both:")
    print("  1. Original data (before cleaning)")
    print("  2. Cleaned data (after data_cleaning.py)")
    print("\nThis allows direct comparison of data quality improvements.\n")

    # Cleaned data is optional: probe for all four artifacts before deciding
    # which mode to run in. Import is local to keep module import side-effect free.
    from hopcroft_skill_classification_tool_competition.config import PROCESSED_DATA_DIR
    tfidf_dir = PROCESSED_DATA_DIR / "tfidf"

    required = (
        "features_tfidf_clean.npy",
        "labels_tfidf_clean.npy",
        "X_test_clean.npy",
        "Y_test_clean.npy",
    )
    have_cleaned = all((tfidf_dir / fname).exists() for fname in required)

    rule = "=" * 80

    if have_cleaned:
        print("✓ Cleaned data files found")
        print("  Running tests on both original and cleaned data...\n")

        # Original first, then cleaned, so reports pair up in order.
        run_all_tests_for_data(use_cleaned=False)
        run_all_tests_for_data(use_cleaned=True)

        print("\n" + rule)
        print("  COMPARISON SUMMARY")
        print(rule)
        print("\nOriginal vs Cleaned Data:")
        print("  - Original data tests saved with '_original' suffix")
        print("  - Cleaned data tests saved with '_clean' suffix")
        print("\nExpected improvements in cleaned data:")
        print("  ✓ No duplicates (0.0%)")
        print("  ✓ No label conflicts (0.0%)")
        print("  ✓ No data leakage (0 samples)")
        print("  ✓ Proper stratified split (80/20)")
        print("  ✓ Clean train/test separation")
    else:
        print("⚠️  WARNING: Cleaned data files not found!")
        print("   Please run data_cleaning.py first:")
        print("   python -m hopcroft_skill_classification_tool_competition.data_cleaning")
        print("\n   Continuing with original data only...\n")

        print_section_header("TESTING ORIGINAL DATA ONLY")
        run_all_tests_for_data(use_cleaned=False)

    # Final summary: always lists the original-data reports; cleaned-data
    # reports and next steps depend on which mode ran above.
    print("\n" + rule)
    print("  ALL TESTS COMPLETED")
    print(rule)
    print("\nReports saved in: reports/deepchecks/")
    print("\nFiles generated:")
    print("  Original data:")
    print("    - data_integrity_suite_results_original.json")
    print("    - train_test_validation_suite_results_original.json")

    if have_cleaned:
        print("\n  Cleaned data:")
        print("    - data_integrity_suite_results_clean.json")
        print("    - train_test_validation_suite_results_clean.json")

    print("\nNext steps:")
    if have_cleaned:
        print("  1. Review JSON reports in reports/deepchecks/")
        print("  2. Compare original vs cleaned results")
        print("  3. Use cleaned data for model training")
    else:
        print("  1. Run data_cleaning.py to generate cleaned data")
        print("  2. Re-run this script to compare original vs cleaned")

    print("\n" + rule + "\n")

if __name__ == "__main__":
    # Entry point when executed as a script (no effect on import).
    main()