""" Comprehensive Deepchecks Test Runner - Original vs Cleaned Data Comparison This script runs all Deepchecks tests on both original and cleaned data, allowing for direct comparison of data quality improvements after cleaning. Usage: python tests/deepchecks/run_all_tests_comparison.py Output: - Generates reports for both original and cleaned data - Creates comparison summary - Saves all results in reports/deepchecks/ """ import sys from pathlib import Path # Add project root to path project_root = Path(__file__).parent.parent.parent sys.path.insert(0, str(project_root)) from test_data_integrity import ( run_data_integrity_suite, run_custom_integrity_checks, analyze_data_statistics ) from test_train_test_validation import ( run_train_test_validation_suite, run_custom_train_test_checks, compare_distributions, validate_split_quality ) def print_section_header(title): """Print a formatted section header.""" print("\n" + "="*80) print(f" {title}") print("="*80 + "\n") def run_all_tests_for_data(use_cleaned=False): """ Run all Deepchecks tests for either original or cleaned data. Args: use_cleaned: If True, test cleaned data; otherwise test original data """ data_type = "CLEANED" if use_cleaned else "ORIGINAL" print("\n" + "#"*80) print(f"# RUNNING ALL DEEPCHECKS TESTS - {data_type} DATA") print("#"*80) # 1. Dataset Statistics print_section_header(f"1. DATASET STATISTICS - {data_type}") analyze_data_statistics(use_cleaned=use_cleaned) # 2. Data Integrity Suite print_section_header(f"2. DATA INTEGRITY SUITE - {data_type}") integrity_result = run_data_integrity_suite(save_output=True, use_cleaned=use_cleaned) # 3. Custom Integrity Checks print_section_header(f"3. CUSTOM INTEGRITY CHECKS - {data_type}") custom_integrity_results = run_custom_integrity_checks(save_output=True, use_cleaned=use_cleaned) # 4. Distribution Comparison print_section_header(f"4. TRAIN-TEST DISTRIBUTION COMPARISON - {data_type}") compare_distributions(use_cleaned=use_cleaned) # 5. Split Quality Validation print_section_header(f"5. TRAIN-TEST SPLIT QUALITY - {data_type}") validate_split_quality(use_cleaned=use_cleaned) # 6. Train-Test Validation Suite print_section_header(f"6. TRAIN-TEST VALIDATION SUITE - {data_type}") validation_result = run_train_test_validation_suite(save_output=True, use_cleaned=use_cleaned) # 7. Custom Train-Test Checks print_section_header(f"7. CUSTOM TRAIN-TEST CHECKS - {data_type}") custom_validation_results = run_custom_train_test_checks(save_output=True, use_cleaned=use_cleaned) return { "integrity_suite": integrity_result, "custom_integrity": custom_integrity_results, "validation_suite": validation_result, "custom_validation": custom_validation_results } def main(): """Main function to run all tests and generate comparison.""" print("\n" + "*"*80) print("* DEEPCHECKS COMPREHENSIVE TEST SUITE") print("* Original vs Cleaned Data Comparison") print("*"*80) print("\nThis script will run all Deepchecks tests on both:") print(" 1. Original data (before cleaning)") print(" 2. 

def main():
    """Main function to run all tests and generate comparison."""
    print("\n" + "*"*80)
    print("* DEEPCHECKS COMPREHENSIVE TEST SUITE")
    print("* Original vs Cleaned Data Comparison")
    print("*"*80)

    print("\nThis script will run all Deepchecks tests on both:")
    print(" 1. Original data (before cleaning)")
    print(" 2. Cleaned data (after data_cleaning.py)")
    print("\nThis allows direct comparison of data quality improvements.\n")

    # Check if cleaned data exists
    from hopcroft_skill_classification_tool_competition.config import PROCESSED_DATA_DIR

    tfidf_dir = PROCESSED_DATA_DIR / "tfidf"
    cleaned_files_exist = all([
        (tfidf_dir / "features_tfidf_clean.npy").exists(),
        (tfidf_dir / "labels_tfidf_clean.npy").exists(),
        (tfidf_dir / "X_test_clean.npy").exists(),
        (tfidf_dir / "Y_test_clean.npy").exists()
    ])

    if not cleaned_files_exist:
        print("⚠️ WARNING: Cleaned data files not found!")
        print(" Please run data_cleaning.py first:")
        print(" python -m hopcroft_skill_classification_tool_competition.data_cleaning")
        print("\n Continuing with original data only...\n")

        # Run tests only on original data
        print_section_header("TESTING ORIGINAL DATA ONLY")
        original_results = run_all_tests_for_data(use_cleaned=False)
    else:
        # Run tests on both original and cleaned data
        print("✓ Cleaned data files found")
        print(" Running tests on both original and cleaned data...\n")

        # Test original data
        original_results = run_all_tests_for_data(use_cleaned=False)

        # Test cleaned data
        cleaned_results = run_all_tests_for_data(use_cleaned=True)

        # Print comparison summary
        print("\n" + "="*80)
        print(" COMPARISON SUMMARY")
        print("="*80)
        print("\nOriginal vs Cleaned Data:")
        print(" - Original data tests saved with '_original' suffix")
        print(" - Cleaned data tests saved with '_clean' suffix")
        print("\nExpected improvements in cleaned data:")
        print(" ✓ No duplicates (0.0%)")
        print(" ✓ No label conflicts (0.0%)")
        print(" ✓ No data leakage (0 samples)")
        print(" ✓ Proper stratified split (80/20)")
        print(" ✓ Clean train/test separation")

    # Final summary
    print("\n" + "="*80)
    print(" ALL TESTS COMPLETED")
    print("="*80)
    print("\nReports saved in: reports/deepchecks/")
    print("\nFiles generated:")
    print(" Original data:")
    print(" - data_integrity_suite_results_original.json")
    print(" - train_test_validation_suite_results_original.json")

    if cleaned_files_exist:
        print("\n Cleaned data:")
        print(" - data_integrity_suite_results_clean.json")
        print(" - train_test_validation_suite_results_clean.json")

    print("\nNext steps:")
    if not cleaned_files_exist:
        print(" 1. Run data_cleaning.py to generate cleaned data")
        print(" 2. Re-run this script to compare original vs cleaned")
    else:
        print(" 1. Review JSON reports in reports/deepchecks/")
        print(" 2. Compare original vs cleaned results")
        print(" 3. Use cleaned data for model training")

    print("\n" + "="*80 + "\n")


if __name__ == "__main__":
    main()
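
# A possible follow-up (kept as a comment because the exact JSON schema is
# produced by the test modules and is assumed here, not verified): after a
# full run, each original/clean report pair in reports/deepchecks/ can be
# diffed with the standard library, e.g.
#
#   import json
#   reports = Path("reports/deepchecks")
#   orig = json.loads((reports / "data_integrity_suite_results_original.json").read_text())
#   clean = json.loads((reports / "data_integrity_suite_results_clean.json").read_text())
#   for key in orig.keys() & clean.keys():
#       if orig[key] != clean[key]:
#           print(f"{key}: {orig[key]!r} -> {clean[key]!r}")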