File size: 6,434 Bytes
225af6a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 |
"""
Comprehensive Deepchecks Test Runner - Original vs Cleaned Data Comparison
This script runs all Deepchecks tests on both original and cleaned data,
allowing for direct comparison of data quality improvements after cleaning.
Usage:
python tests/deepchecks/run_all_tests_comparison.py
Output:
- Generates reports for both original and cleaned data
- Creates comparison summary
- Saves all results in reports/deepchecks/
"""
import sys
from pathlib import Path
# Add project root to path
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))
from test_data_integrity import (
run_data_integrity_suite,
run_custom_integrity_checks,
analyze_data_statistics
)
from test_train_test_validation import (
run_train_test_validation_suite,
run_custom_train_test_checks,
compare_distributions,
validate_split_quality
)
def print_section_header(title):
    """Print *title* framed above and below by an 80-character '=' rule."""
    rule = "=" * 80
    print(f"\n{rule}")
    print(f" {title}")
    print(f"{rule}\n")
def run_all_tests_for_data(use_cleaned=False):
    """
    Run every Deepchecks test stage against one data variant.

    Args:
        use_cleaned: If True, test cleaned data; otherwise test original data

    Returns:
        dict mapping stage name ("integrity_suite", "custom_integrity",
        "validation_suite", "custom_validation") to that stage's result.
    """
    label = "CLEANED" if use_cleaned else "ORIGINAL"
    banner = "#" * 80
    print(f"\n{banner}")
    print(f"# RUNNING ALL DEEPCHECKS TESTS - {label} DATA")
    print(banner)

    # Stage 1: dataset-level statistics (printed only, no stored result).
    print_section_header(f"1. DATASET STATISTICS - {label}")
    analyze_data_statistics(use_cleaned=use_cleaned)

    # Stage 2: full Deepchecks data-integrity suite (result persisted to disk).
    print_section_header(f"2. DATA INTEGRITY SUITE - {label}")
    integrity = run_data_integrity_suite(save_output=True, use_cleaned=use_cleaned)

    # Stage 3: project-specific integrity checks.
    print_section_header(f"3. CUSTOM INTEGRITY CHECKS - {label}")
    custom_integrity = run_custom_integrity_checks(save_output=True, use_cleaned=use_cleaned)

    # Stages 4-5: train/test distribution comparison and split-quality
    # diagnostics (printed only, no stored result).
    print_section_header(f"4. TRAIN-TEST DISTRIBUTION COMPARISON - {label}")
    compare_distributions(use_cleaned=use_cleaned)

    print_section_header(f"5. TRAIN-TEST SPLIT QUALITY - {label}")
    validate_split_quality(use_cleaned=use_cleaned)

    # Stage 6: full Deepchecks train-test validation suite (persisted).
    print_section_header(f"6. TRAIN-TEST VALIDATION SUITE - {label}")
    validation = run_train_test_validation_suite(save_output=True, use_cleaned=use_cleaned)

    # Stage 7: project-specific train/test checks.
    print_section_header(f"7. CUSTOM TRAIN-TEST CHECKS - {label}")
    custom_validation = run_custom_train_test_checks(save_output=True, use_cleaned=use_cleaned)

    return {
        "integrity_suite": integrity,
        "custom_integrity": custom_integrity,
        "validation_suite": validation,
        "custom_validation": custom_validation
    }
def main():
    """Main function to run all tests and generate comparison."""
    # Top-level banner for the whole comparison run.
    print("\n" + "*"*80)
    print("* DEEPCHECKS COMPREHENSIVE TEST SUITE")
    print("* Original vs Cleaned Data Comparison")
    print("*"*80)
    print("\nThis script will run all Deepchecks tests on both:")
    print(" 1. Original data (before cleaning)")
    print(" 2. Cleaned data (after data_cleaning.py)")
    print("\nThis allows direct comparison of data quality improvements.\n")
    # Check if cleaned data exists. Imported here (function scope) rather than
    # at module top — presumably to keep the heavy package import off the
    # module-load path; TODO confirm against project convention.
    from hopcroft_skill_classification_tool_competition.config import PROCESSED_DATA_DIR
    tfidf_dir = PROCESSED_DATA_DIR / "tfidf"
    # All four cleaned artifacts (train features/labels and test
    # features/labels) must be present for the two-way comparison to run.
    cleaned_files_exist = all([
        (tfidf_dir / "features_tfidf_clean.npy").exists(),
        (tfidf_dir / "labels_tfidf_clean.npy").exists(),
        (tfidf_dir / "X_test_clean.npy").exists(),
        (tfidf_dir / "Y_test_clean.npy").exists()
    ])
    if not cleaned_files_exist:
        # Degraded mode: cleaning has not been run yet, so only the original
        # data can be tested. Tell the user how to produce the cleaned files.
        print("⚠️ WARNING: Cleaned data files not found!")
        print(" Please run data_cleaning.py first:")
        print(" python -m hopcroft_skill_classification_tool_competition.data_cleaning")
        print("\n Continuing with original data only...\n")
        # Run tests only on original data
        print_section_header("TESTING ORIGINAL DATA ONLY")
        # NOTE(review): the returned results dict is never consumed below;
        # kept for parity with the two-way branch.
        original_results = run_all_tests_for_data(use_cleaned=False)
    else:
        # Run tests on both original and cleaned data
        print("✓ Cleaned data files found")
        print(" Running tests on both original and cleaned data...\n")
        # Test original data
        original_results = run_all_tests_for_data(use_cleaned=False)
        # Test cleaned data
        cleaned_results = run_all_tests_for_data(use_cleaned=True)
        # Print comparison summary — only meaningful when both variants ran.
        print("\n" + "="*80)
        print(" COMPARISON SUMMARY")
        print("="*80)
        print("\nOriginal vs Cleaned Data:")
        print(" - Original data tests saved with '_original' suffix")
        print(" - Cleaned data tests saved with '_clean' suffix")
        print("\nExpected improvements in cleaned data:")
        print(" ✓ No duplicates (0.0%)")
        print(" ✓ No label conflicts (0.0%)")
        print(" ✓ No data leakage (0 samples)")
        print(" ✓ Proper stratified split (80/20)")
        print(" ✓ Clean train/test separation")
    # Final summary — printed regardless of which branch ran above.
    print("\n" + "="*80)
    print(" ALL TESTS COMPLETED")
    print("="*80)
    print("\nReports saved in: reports/deepchecks/")
    print("\nFiles generated:")
    print(" Original data:")
    print(" - data_integrity_suite_results_original.json")
    print(" - train_test_validation_suite_results_original.json")
    if cleaned_files_exist:
        print("\n Cleaned data:")
        print(" - data_integrity_suite_results_clean.json")
        print(" - train_test_validation_suite_results_clean.json")
    # Next steps depend on whether cleaning has been performed yet.
    print("\nNext steps:")
    if not cleaned_files_exist:
        print(" 1. Run data_cleaning.py to generate cleaned data")
        print(" 2. Re-run this script to compare original vs cleaned")
    else:
        print(" 1. Review JSON reports in reports/deepchecks/")
        print(" 2. Compare original vs cleaned results")
        print(" 3. Use cleaned data for model training")
    print("\n" + "="*80 + "\n")
if __name__ == "__main__":
main()
|