# DaCrow13
# Deploy to HF Spaces (Clean)
# 225af6a
"""
Run all Deepchecks validation suites
This script executes all data validation checks:
1. Data Integrity Suite - validates training data quality
2. Train-Test Validation Suite - ensures proper train/test split
Usage:
    python tests/run_all_deepchecks.py [--original]

Pass --original to validate the original data instead of the cleaned data.
"""
import sys
from pathlib import Path
# Add project root to path
project_root = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(project_root))
from tests.test_data_integrity import (
run_data_integrity_suite,
run_custom_integrity_checks,
analyze_data_statistics
)
from tests.test_train_test_validation import (
run_train_test_validation_suite,
run_custom_train_test_checks,
compare_distributions,
validate_split_quality
)
def _run_phase(label, steps):
    """
    Run the steps of one validation phase in order.

    Args:
        label: Phase name used in the status messages (e.g. ``"Phase 1"``).
        steps: Zero-argument callables executed sequentially. A blank
            separator is printed between consecutive steps.

    Returns:
        True when every step finished; False as soon as one of them raised.
    """
    import traceback

    try:
        for index, step in enumerate(steps):
            if index:
                print("\n")
            step()
    except Exception as exc:
        # Top-level boundary of the script: report and signal failure instead
        # of crashing, but keep the exception type and the traceback so the
        # failing check can actually be located.
        print(f"\nError in {label}: {type(exc).__name__}: {exc}")
        traceback.print_exc()
        return False
    print(f"\n{label} completed successfully!")
    return True


def main(argv=None):
    """
    Run all Deepchecks validation suites and generate reports.

    Args:
        argv: Optional list of command-line arguments (without the program
            name). ``None`` (the default) makes argparse read
            ``sys.argv[1:]``, which is what the script entry point relies on.
            Passing a list lets other code call ``main`` without touching
            ``sys.argv``.

    Returns:
        True when both phases ran to completion; False as soon as a phase
        raised, in which case the remaining phase and the summary are skipped.

    Raises:
        SystemExit: Raised by argparse for unknown arguments or ``--help``.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Run Deepchecks validation suites')
    parser.add_argument('--original', action='store_true',
                        help='Use original data instead of cleaned data')
    args = parser.parse_args(argv)
    use_cleaned = not args.original

    rule = "=" * 80
    print(rule)
    print(" DEEPCHECKS VALIDATION - COMPLETE SUITE")
    print(rule)
    print(f"\nUsing {'CLEANED' if use_cleaned else 'ORIGINAL'} data")
    print("Reports will be saved in: reports/deepchecks/")
    print("\n" + rule)

    # Phase 1: Data Integrity Checks
    print("\nPHASE 1: DATA INTEGRITY VALIDATION")
    print(rule)
    integrity_steps = [
        # Dataset statistics
        lambda: analyze_data_statistics(use_cleaned=use_cleaned),
        # Full integrity suite
        lambda: run_data_integrity_suite(save_output=True, use_cleaned=use_cleaned),
        # Custom integrity checks
        lambda: run_custom_integrity_checks(save_output=True, use_cleaned=use_cleaned),
    ]
    if not _run_phase("Phase 1", integrity_steps):
        return False

    # Phase 2: Train-Test Validation
    print("\n\nPHASE 2: TRAIN-TEST VALIDATION")
    print(rule)
    train_test_steps = [
        # Distribution comparison
        lambda: compare_distributions(use_cleaned=use_cleaned),
        # Split quality validation
        lambda: validate_split_quality(use_cleaned=use_cleaned),
        # Full train-test suite
        lambda: run_train_test_validation_suite(save_output=True, use_cleaned=use_cleaned),
        # Custom train-test checks
        lambda: run_custom_train_test_checks(save_output=True, use_cleaned=use_cleaned),
    ]
    if not _run_phase("Phase 2", train_test_steps):
        return False

    # Summary. The report paths below are printed as-is; this function does
    # not check that the files were actually written.
    print("\n\n" + rule)
    print(" VALIDATION SUMMARY")
    print(rule)
    print("\nAll Deepchecks validation suites completed successfully!")
    print("\nGenerated Reports:")
    print(" - reports/deepchecks/data_integrity_suite_results.json")
    print(" - reports/deepchecks/train_test_validation_suite_results.json")
    print(" - reports/deepchecks/validation_summary.json")
    print("\nNext Steps:")
    print(" 1. Review the JSON reports for check results")
    print(" 2. Examine any warnings or failed checks")
    print(" 3. Address data quality issues if found")
    print(" 4. Document findings in your project documentation")
    print("\n" + rule)
    return True
if __name__ == "__main__":
    # Map main()'s boolean outcome onto the process exit status:
    # 0 when every validation phase completed, 1 otherwise.
    sys.exit(0 if main() else 1)