"""
Comprehensive Deepchecks Test Runner - Original vs Cleaned Data Comparison
This script runs all Deepchecks tests on both original and cleaned data,
allowing for direct comparison of data quality improvements after cleaning.
Usage:
python tests/deepchecks/run_all_tests_comparison.py
Output:
- Generates reports for both original and cleaned data
- Creates comparison summary
- Saves all results in reports/deepchecks/
"""
import sys
from pathlib import Path
# Add project root to path
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))
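# Defensive addition (not strictly required for the documented invocation,
# which already puts this directory on sys.path): also add the script's own
# directory so the sibling test modules below resolve when this file is
# imported as a module rather than executed directly.
sys.path.insert(0, str(Path(__file__).parent))
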
from test_data_integrity import (
    run_data_integrity_suite,
    run_custom_integrity_checks,
    analyze_data_statistics,
)
from test_train_test_validation import (
    run_train_test_validation_suite,
    run_custom_train_test_checks,
    compare_distributions,
    validate_split_quality,
)


def print_section_header(title):
    """Print a formatted section header."""
    print("\n" + "="*80)
    print(f" {title}")
    print("="*80 + "\n")


def run_all_tests_for_data(use_cleaned=False):
    """
    Run all Deepchecks tests for either original or cleaned data.

    Args:
        use_cleaned: If True, test cleaned data; otherwise test original data
    """
    data_type = "CLEANED" if use_cleaned else "ORIGINAL"
    print("\n" + "#"*80)
    print(f"# RUNNING ALL DEEPCHECKS TESTS - {data_type} DATA")
    print("#"*80)

    # 1. Dataset Statistics
    print_section_header(f"1. DATASET STATISTICS - {data_type}")
    analyze_data_statistics(use_cleaned=use_cleaned)

    # 2. Data Integrity Suite
    print_section_header(f"2. DATA INTEGRITY SUITE - {data_type}")
    integrity_result = run_data_integrity_suite(save_output=True, use_cleaned=use_cleaned)

    # 3. Custom Integrity Checks
    print_section_header(f"3. CUSTOM INTEGRITY CHECKS - {data_type}")
    custom_integrity_results = run_custom_integrity_checks(save_output=True, use_cleaned=use_cleaned)

    # 4. Distribution Comparison
    print_section_header(f"4. TRAIN-TEST DISTRIBUTION COMPARISON - {data_type}")
    compare_distributions(use_cleaned=use_cleaned)

    # 5. Split Quality Validation
    print_section_header(f"5. TRAIN-TEST SPLIT QUALITY - {data_type}")
    validate_split_quality(use_cleaned=use_cleaned)

    # 6. Train-Test Validation Suite
    print_section_header(f"6. TRAIN-TEST VALIDATION SUITE - {data_type}")
    validation_result = run_train_test_validation_suite(save_output=True, use_cleaned=use_cleaned)

    # 7. Custom Train-Test Checks
    print_section_header(f"7. CUSTOM TRAIN-TEST CHECKS - {data_type}")
    custom_validation_results = run_custom_train_test_checks(save_output=True, use_cleaned=use_cleaned)

    return {
        "integrity_suite": integrity_result,
        "custom_integrity": custom_integrity_results,
        "validation_suite": validation_result,
        "custom_validation": custom_validation_results,
    }
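

# Optional helper (a sketch, not part of the original flow): summarize suite
# outcomes, assuming the suite runners above return deepchecks SuiteResult
# objects (which expose get_passed_checks / get_not_passed_checks /
# get_not_ran_checks) or None. Adapt the attribute access if they return
# custom wrappers instead.
def summarize_suite_result(name, result):
    """Print a one-line pass/fail summary for a Deepchecks SuiteResult."""
    if result is None:
        print(f" {name}: no result returned")
        return
    passed = len(result.get_passed_checks())      # conditions satisfied
    failed = len(result.get_not_passed_checks())  # conditions failed
    not_ran = len(result.get_not_ran_checks())    # checks that errored out
    print(f" {name}: {passed} passed, {failed} failed, {not_ran} not run")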


def main():
    """Main function to run all tests and generate comparison."""
    print("\n" + "*"*80)
    print("* DEEPCHECKS COMPREHENSIVE TEST SUITE")
    print("* Original vs Cleaned Data Comparison")
    print("*"*80)
    print("\nThis script will run all Deepchecks tests on both:")
    print(" 1. Original data (before cleaning)")
    print(" 2. Cleaned data (after data_cleaning.py)")
    print("\nThis allows direct comparison of data quality improvements.\n")

    # Check if cleaned data exists
    from hopcroft_skill_classification_tool_competition.config import PROCESSED_DATA_DIR

    tfidf_dir = PROCESSED_DATA_DIR / "tfidf"
    cleaned_files_exist = all([
        (tfidf_dir / "features_tfidf_clean.npy").exists(),
        (tfidf_dir / "labels_tfidf_clean.npy").exists(),
        (tfidf_dir / "X_test_clean.npy").exists(),
        (tfidf_dir / "Y_test_clean.npy").exists(),
    ])

    if not cleaned_files_exist:
        print("⚠️ WARNING: Cleaned data files not found!")
        print(" Please run data_cleaning.py first:")
        print(" python -m hopcroft_skill_classification_tool_competition.data_cleaning")
        print("\n Continuing with original data only...\n")

        # Run tests only on original data
        print_section_header("TESTING ORIGINAL DATA ONLY")
        original_results = run_all_tests_for_data(use_cleaned=False)
    else:
        # Run tests on both original and cleaned data
        print("✓ Cleaned data files found")
        print(" Running tests on both original and cleaned data...\n")

        # Test original data
        original_results = run_all_tests_for_data(use_cleaned=False)

        # Test cleaned data
        cleaned_results = run_all_tests_for_data(use_cleaned=True)
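
        # Sketch using the optional helper above: surface suite-level
        # pass/fail deltas between the two runs (assumes the runners
        # return deepchecks SuiteResult objects, as noted at the helper).
        print("\nSuite-level results:")
        summarize_suite_result("Original integrity suite", original_results["integrity_suite"])
        summarize_suite_result("Cleaned integrity suite", cleaned_results["integrity_suite"])
        summarize_suite_result("Original validation suite", original_results["validation_suite"])
        summarize_suite_result("Cleaned validation suite", cleaned_results["validation_suite"])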

        # Print comparison summary
        print("\n" + "="*80)
        print(" COMPARISON SUMMARY")
        print("="*80)
        print("\nOriginal vs Cleaned Data:")
        print(" - Original data tests saved with '_original' suffix")
        print(" - Cleaned data tests saved with '_clean' suffix")
        print("\nExpected improvements in cleaned data:")
        print(" ✓ No duplicates (0.0%)")
        print(" ✓ No label conflicts (0.0%)")
        print(" ✓ No data leakage (0 samples)")
        print(" ✓ Proper stratified split (80/20)")
        print(" ✓ Clean train/test separation")

    # Final summary
    print("\n" + "="*80)
    print(" ALL TESTS COMPLETED")
    print("="*80)
    print("\nReports saved in: reports/deepchecks/")
    print("\nFiles generated:")
    print(" Original data:")
    print(" - data_integrity_suite_results_original.json")
    print(" - train_test_validation_suite_results_original.json")
    if cleaned_files_exist:
        print("\n Cleaned data:")
        print(" - data_integrity_suite_results_clean.json")
        print(" - train_test_validation_suite_results_clean.json")
    print("\nNext steps:")
    if not cleaned_files_exist:
        print(" 1. Run data_cleaning.py to generate cleaned data")
        print(" 2. Re-run this script to compare original vs cleaned")
    else:
        print(" 1. Review JSON reports in reports/deepchecks/")
        print(" 2. Compare original vs cleaned results")
        print(" 3. Use cleaned data for model training")
    print("\n" + "="*80 + "\n")


if __name__ == "__main__":
    main()