Spaces:
Sleeping
Sleeping
"""
Comprehensive Deepchecks Test Runner - Original vs Cleaned Data Comparison

This script runs all Deepchecks tests on both original and cleaned data,
allowing for direct comparison of data quality improvements after cleaning.

Usage:
    python tests/deepchecks/run_all_tests_comparison.py

Output:
    - Generates reports for both original and cleaned data
    - Creates comparison summary
    - Saves all results in reports/deepchecks/
"""
| import sys | |
| from pathlib import Path | |
| # Add project root to path | |
| project_root = Path(__file__).parent.parent.parent | |
| sys.path.insert(0, str(project_root)) | |
| from test_data_integrity import ( | |
| run_data_integrity_suite, | |
| run_custom_integrity_checks, | |
| analyze_data_statistics | |
| ) | |
| from test_train_test_validation import ( | |
| run_train_test_validation_suite, | |
| run_custom_train_test_checks, | |
| compare_distributions, | |
| validate_split_quality | |
| ) | |
def print_section_header(title):
    """Print ``title`` framed by two 80-character ``=`` rules.

    A blank line is emitted before the top rule and after the bottom rule.

    Args:
        title: Text shown (after one leading space) between the rules.
    """
    rule = "=" * 80
    # One print call; sep="\n" yields the same three-line layout as three
    # consecutive prints, with the extra leading/trailing newline preserved.
    print(f"\n{rule}", f" {title}", f"{rule}\n", sep="\n")
def run_all_tests_for_data(use_cleaned=False):
    """
    Run the seven Deepchecks steps, in order, against one dataset variant.

    Each step is announced with a numbered section header. Steps 1, 4 and 5
    are called for their output only (their return values are discarded);
    the results of steps 2, 3, 6 and 7 are collected and returned.

    Args:
        use_cleaned: If True, test cleaned data; otherwise test original data.

    Returns:
        dict with the keys ``"integrity_suite"``, ``"custom_integrity"``,
        ``"validation_suite"`` and ``"custom_validation"``, each holding
        whatever the corresponding runner returned.
    """
    label = "CLEANED" if use_cleaned else "ORIGINAL"
    hash_rule = "#" * 80

    print("\n" + hash_rule)
    print(f"# RUNNING ALL DEEPCHECKS TESTS - {label} DATA")
    print(hash_rule)

    # Keyword arguments shared by every runner that persists a report.
    saving_kwargs = {"save_output": True, "use_cleaned": use_cleaned}
    collected = {}

    # 1. Dataset Statistics
    print_section_header(f"1. DATASET STATISTICS - {label}")
    analyze_data_statistics(use_cleaned=use_cleaned)

    # 2. Data Integrity Suite
    print_section_header(f"2. DATA INTEGRITY SUITE - {label}")
    collected["integrity_suite"] = run_data_integrity_suite(**saving_kwargs)

    # 3. Custom Integrity Checks
    print_section_header(f"3. CUSTOM INTEGRITY CHECKS - {label}")
    collected["custom_integrity"] = run_custom_integrity_checks(**saving_kwargs)

    # 4. Distribution Comparison
    print_section_header(f"4. TRAIN-TEST DISTRIBUTION COMPARISON - {label}")
    compare_distributions(use_cleaned=use_cleaned)

    # 5. Split Quality Validation
    print_section_header(f"5. TRAIN-TEST SPLIT QUALITY - {label}")
    validate_split_quality(use_cleaned=use_cleaned)

    # 6. Train-Test Validation Suite
    print_section_header(f"6. TRAIN-TEST VALIDATION SUITE - {label}")
    collected["validation_suite"] = run_train_test_validation_suite(**saving_kwargs)

    # 7. Custom Train-Test Checks
    print_section_header(f"7. CUSTOM TRAIN-TEST CHECKS - {label}")
    collected["custom_validation"] = run_custom_train_test_checks(**saving_kwargs)

    return collected
def main():
    """Run the Deepchecks steps on the original data and, if present, the cleaned data.

    The original data is always tested. The cleaned data is tested as well
    only when all four ``*_clean.npy`` files exist in the ``tfidf``
    sub-directory of ``PROCESSED_DATA_DIR``; otherwise a warning naming the
    command that generates them is printed and only the original data is
    tested. A closing summary lists the report files and suggested next steps.
    """
    star_rule = "*" * 80
    rule = "=" * 80

    for line in (
        "\n" + star_rule,
        "* DEEPCHECKS COMPREHENSIVE TEST SUITE",
        "* Original vs Cleaned Data Comparison",
        star_rule,
        "\nThis script will run all Deepchecks tests on both:",
        " 1. Original data (before cleaning)",
        " 2. Cleaned data (after data_cleaning.py)",
        "\nThis allows direct comparison of data quality improvements.\n",
    ):
        print(line)

    # Check if cleaned data exists (project import kept local to main()).
    from hopcroft_skill_classification_tool_competition.config import PROCESSED_DATA_DIR

    tfidf_dir = PROCESSED_DATA_DIR / "tfidf"
    required_clean_files = (
        "features_tfidf_clean.npy",
        "labels_tfidf_clean.npy",
        "X_test_clean.npy",
        "Y_test_clean.npy",
    )
    cleaned_files_exist = all(
        [(tfidf_dir / file_name).exists() for file_name in required_clean_files]
    )

    if cleaned_files_exist:
        # Run tests on both original and cleaned data.
        print("✓ Cleaned data files found")
        print(" Running tests on both original and cleaned data...\n")

        # The returned result dicts are not consumed here; the runs are made
        # for their printed output and saved reports.
        run_all_tests_for_data(use_cleaned=False)
        run_all_tests_for_data(use_cleaned=True)

        # NOTE(review): indentation was lost in the text this block was
        # recovered from; the comparison summary is assumed to belong to the
        # both-datasets branch - confirm against the original file.
        for line in (
            "\n" + rule,
            " COMPARISON SUMMARY",
            rule,
            "\nOriginal vs Cleaned Data:",
            " - Original data tests saved with '_original' suffix",
            " - Cleaned data tests saved with '_clean' suffix",
            "\nExpected improvements in cleaned data:",
            " ✓ No duplicates (0.0%)",
            " ✓ No label conflicts (0.0%)",
            " ✓ No data leakage (0 samples)",
            " ✓ Proper stratified split (80/20)",
            " ✓ Clean train/test separation",
        ):
            print(line)
    else:
        for line in (
            "⚠️ WARNING: Cleaned data files not found!",
            " Please run data_cleaning.py first:",
            " python -m hopcroft_skill_classification_tool_competition.data_cleaning",
            "\n Continuing with original data only...\n",
        ):
            print(line)

        # Run tests only on original data.
        print_section_header("TESTING ORIGINAL DATA ONLY")
        run_all_tests_for_data(use_cleaned=False)

    # Final summary
    closing_lines = [
        "\n" + rule,
        " ALL TESTS COMPLETED",
        rule,
        "\nReports saved in: reports/deepchecks/",
        "\nFiles generated:",
        " Original data:",
        " - data_integrity_suite_results_original.json",
        " - train_test_validation_suite_results_original.json",
    ]
    if cleaned_files_exist:
        closing_lines += [
            "\n Cleaned data:",
            " - data_integrity_suite_results_clean.json",
            " - train_test_validation_suite_results_clean.json",
        ]

    closing_lines.append("\nNext steps:")
    if cleaned_files_exist:
        closing_lines += [
            " 1. Review JSON reports in reports/deepchecks/",
            " 2. Compare original vs cleaned results",
            " 3. Use cleaned data for model training",
        ]
    else:
        closing_lines += [
            " 1. Run data_cleaning.py to generate cleaned data",
            " 2. Re-run this script to compare original vs cleaned",
        ]
    closing_lines.append("\n" + rule + "\n")

    for line in closing_lines:
        print(line)


# Run the comparison only when executed as a script, not when imported.
if __name__ == "__main__":
    main()