|
|
""" |
|
|
Comprehensive Deepchecks Test Runner - Original vs Cleaned Data Comparison |
|
|
|
|
|
This script runs all Deepchecks tests on both original and cleaned data, |
|
|
allowing for direct comparison of data quality improvements after cleaning. |
|
|
|
|
|
Usage: |
|
|
python tests/deepchecks/run_all_tests_comparison.py |
|
|
|
|
|
Output: |
|
|
- Generates reports for both original and cleaned data |
|
|
- Creates comparison summary |
|
|
- Saves all results in reports/deepchecks/ |
|
|
""" |
|
|
|
|
|
import sys |
|
|
from pathlib import Path |
|
|
|
|
|
|
|
|
project_root = Path(__file__).parent.parent.parent |
|
|
sys.path.insert(0, str(project_root)) |
|
|
|
|
|
from test_data_integrity import ( |
|
|
run_data_integrity_suite, |
|
|
run_custom_integrity_checks, |
|
|
analyze_data_statistics |
|
|
) |
|
|
from test_train_test_validation import ( |
|
|
run_train_test_validation_suite, |
|
|
run_custom_train_test_checks, |
|
|
compare_distributions, |
|
|
validate_split_quality |
|
|
) |
|
|
|
|
|
|
|
|
def print_section_header(title):
    """Print *title* framed between two 80-character '=' rule lines."""
    rule = "=" * 80
    print(f"\n{rule}")
    print(f" {title}")
    print(f"{rule}\n")
|
|
|
|
|
|
|
|
def run_all_tests_for_data(use_cleaned=False):
    """Execute every Deepchecks test against one dataset variant.

    Args:
        use_cleaned: If True, test cleaned data; otherwise test original data

    Returns:
        dict with keys "integrity_suite", "custom_integrity",
        "validation_suite" and "custom_validation", each holding the
        object returned by the corresponding test helper.
    """
    label = "CLEANED" if use_cleaned else "ORIGINAL"

    banner = "#" * 80
    print("\n" + banner)
    print(f"# RUNNING ALL DEEPCHECKS TESTS - {label} DATA")
    print(banner)

    # 1. Dataset statistics — printed only, nothing is returned.
    print_section_header(f"1. DATASET STATISTICS - {label}")
    analyze_data_statistics(use_cleaned=use_cleaned)

    # 2. Full Deepchecks data-integrity suite; result is handed back to the caller.
    print_section_header(f"2. DATA INTEGRITY SUITE - {label}")
    integrity = run_data_integrity_suite(save_output=True, use_cleaned=use_cleaned)

    # 3. Project-specific integrity checks.
    print_section_header(f"3. CUSTOM INTEGRITY CHECKS - {label}")
    custom_integrity = run_custom_integrity_checks(save_output=True, use_cleaned=use_cleaned)

    # 4. Train/test distribution comparison — printed only.
    print_section_header(f"4. TRAIN-TEST DISTRIBUTION COMPARISON - {label}")
    compare_distributions(use_cleaned=use_cleaned)

    # 5. Split-quality diagnostics — printed only.
    print_section_header(f"5. TRAIN-TEST SPLIT QUALITY - {label}")
    validate_split_quality(use_cleaned=use_cleaned)

    # 6. Full train-test validation suite.
    print_section_header(f"6. TRAIN-TEST VALIDATION SUITE - {label}")
    validation = run_train_test_validation_suite(save_output=True, use_cleaned=use_cleaned)

    # 7. Project-specific train-test checks.
    print_section_header(f"7. CUSTOM TRAIN-TEST CHECKS - {label}")
    custom_validation = run_custom_train_test_checks(save_output=True, use_cleaned=use_cleaned)

    return {
        "integrity_suite": integrity,
        "custom_integrity": custom_integrity,
        "validation_suite": validation,
        "custom_validation": custom_validation,
    }
|
|
|
|
|
|
|
|
def main():
    """Drive the full test run: original data always, cleaned data when its files exist."""
    stars = "*" * 80
    print("\n" + stars)
    print("* DEEPCHECKS COMPREHENSIVE TEST SUITE")
    print("* Original vs Cleaned Data Comparison")
    print(stars)

    print("\nThis script will run all Deepchecks tests on both:")
    print(" 1. Original data (before cleaning)")
    print(" 2. Cleaned data (after data_cleaning.py)")
    print("\nThis allows direct comparison of data quality improvements.\n")

    # Local import so the project config is only loaded when the script runs.
    from hopcroft_skill_classification_tool_competition.config import PROCESSED_DATA_DIR

    tfidf_dir = PROCESSED_DATA_DIR / "tfidf"

    # All four cleaned artifacts produced by data_cleaning.py must exist
    # before the original-vs-cleaned comparison can be run.
    required_files = (
        "features_tfidf_clean.npy",
        "labels_tfidf_clean.npy",
        "X_test_clean.npy",
        "Y_test_clean.npy",
    )
    cleaned_files_exist = all((tfidf_dir / name).exists() for name in required_files)

    if not cleaned_files_exist:
        print("⚠️ WARNING: Cleaned data files not found!")
        print(" Please run data_cleaning.py first:")
        print(" python -m hopcroft_skill_classification_tool_competition.data_cleaning")
        print("\n Continuing with original data only...\n")

        print_section_header("TESTING ORIGINAL DATA ONLY")
        original_results = run_all_tests_for_data(use_cleaned=False)
    else:
        print("✓ Cleaned data files found")
        print(" Running tests on both original and cleaned data...\n")

        original_results = run_all_tests_for_data(use_cleaned=False)
        cleaned_results = run_all_tests_for_data(use_cleaned=True)

        rule = "=" * 80
        print("\n" + rule)
        print(" COMPARISON SUMMARY")
        print(rule)
        print("\nOriginal vs Cleaned Data:")
        print(" - Original data tests saved with '_original' suffix")
        print(" - Cleaned data tests saved with '_clean' suffix")
        print("\nExpected improvements in cleaned data:")
        print(" ✓ No duplicates (0.0%)")
        print(" ✓ No label conflicts (0.0%)")
        print(" ✓ No data leakage (0 samples)")
        print(" ✓ Proper stratified split (80/20)")
        print(" ✓ Clean train/test separation")

    rule = "=" * 80
    print("\n" + rule)
    print(" ALL TESTS COMPLETED")
    print(rule)
    print("\nReports saved in: reports/deepchecks/")
    print("\nFiles generated:")
    print(" Original data:")
    print(" - data_integrity_suite_results_original.json")
    print(" - train_test_validation_suite_results_original.json")

    if cleaned_files_exist:
        print("\n Cleaned data:")
        print(" - data_integrity_suite_results_clean.json")
        print(" - train_test_validation_suite_results_clean.json")

    print("\nNext steps:")
    if not cleaned_files_exist:
        print(" 1. Run data_cleaning.py to generate cleaned data")
        print(" 2. Re-run this script to compare original vs cleaned")
    else:
        print(" 1. Review JSON reports in reports/deepchecks/")
        print(" 2. Compare original vs cleaned results")
        print(" 3. Use cleaned data for model training")

    print("\n" + rule + "\n")
|
|
|
|
|
|
|
|
if __name__ == "__main__": |
|
|
main() |
|
|
|