File size: 11,392 Bytes
225af6a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 |
"""
Data Integrity Suite - Deepchecks validation for dataset integrity
This module implements comprehensive data integrity checks using Deepchecks
to validate the quality and consistency of the training and test datasets.
Checks included:
- Data duplicates detection
- Missing values analysis
- Feature-label correlation
- Feature-feature correlation
- Data type consistency
- Outlier detection
- Class imbalance analysis
"""
import numpy as np
import pandas as pd
import json
from pathlib import Path
from deepchecks.tabular import Dataset
from deepchecks.tabular.suites import data_integrity
from hopcroft_skill_classification_tool_competition.config import PROCESSED_DATA_DIR
def load_data(use_cleaned=True):
    """Load the TF-IDF train/test arrays from the processed data directory.

    Args:
        use_cleaned: When True (default), read the '_clean'-suffixed files
            produced by the cleaning step; otherwise read the original files.

    Returns:
        tuple: (X_train, y_train, X_test, y_test) numpy arrays.
    """
    tfidf_dir = PROCESSED_DATA_DIR / "tfidf"
    # Select the file-name set matching the requested data variant.
    if use_cleaned:
        data_type = "cleaned"
        file_names = (
            "features_tfidf_clean.npy",
            "labels_tfidf_clean.npy",
            "X_test_clean.npy",
            "Y_test_clean.npy",
        )
    else:
        data_type = "original"
        file_names = (
            "features_tfidf.npy",
            "labels_tfidf.npy",
            "X_test.npy",
            "Y_test.npy",
        )
    X_train, y_train, X_test, y_test = (
        np.load(tfidf_dir / name) for name in file_names
    )
    print(f"Loaded {data_type} data:")
    print(f"Training set shape: X={X_train.shape}, y={y_train.shape}")
    print(f"Test set shape: X={X_test.shape}, y={y_test.shape}")
    return X_train, y_train, X_test, y_test
def create_deepchecks_dataset(X, y, dataset_name="dataset"):
    """Wrap numpy feature/label arrays in a Deepchecks ``Dataset``.

    Args:
        X: 2-D feature matrix (numpy array).
        y: Labels; 1-D for single-label, or a 2-D indicator matrix for
            multi-label (collapsed via argmax below).
        dataset_name: Name identifier for the dataset (currently not
            forwarded to Deepchecks -- kept for interface compatibility).

    Returns:
        Dataset: Deepchecks Dataset whose last column is 'label'.
    """
    # Synthetic column names make the DataFrame self-describing.
    column_names = [f"feature_{i}" for i in range(X.shape[1])]
    df = pd.DataFrame(X, columns=column_names)
    if y.ndim > 1 and y.shape[1] > 1:
        # Deepchecks needs a single label column: collapse the multi-label
        # indicator matrix to the index of the largest entry in each row.
        df['label'] = np.argmax(y, axis=1)
        print(f"Note: Converted multi-label ({y.shape[1]} labels) to single-label for Deepchecks")
    else:
        df['label'] = y
    # No categorical features: every column is a numeric TF-IDF weight.
    return Dataset(df, label='label', cat_features=[])
def run_data_integrity_suite(save_output=True, use_cleaned=True):
    """
    Run the complete Data Integrity Suite on training data.

    This suite performs comprehensive checks including:
    - Data Duplicates: Identifies duplicate samples
    - String Mismatch: Checks for string inconsistencies
    - Mixed Nulls: Detects various null representations
    - Mixed Data Types: Validates consistent data types
    - String Length Out Of Bounds: Checks string length anomalies
    - Is Single Value: Identifies features with only one value
    - Special Characters: Detects special characters in data
    - Class Imbalance: Analyzes label distribution
    - Outlier Sample Detection: Identifies outlier samples
    - Feature Label Correlation: Checks correlation between features and labels

    Args:
        save_output: Whether to save a JSON summary of the results under
            ``reports/deepchecks``. (Note: only a JSON file is written;
            no HTML report is produced by this function.)
        use_cleaned: If True, use cleaned data instead of original

    Returns:
        SuiteResult: Results from the data integrity suite
    """
    data_type = "CLEANED" if use_cleaned else "ORIGINAL"
    print("="*80)
    print(f"DATA INTEGRITY SUITE - {data_type} TRAINING DATA")
    print("="*80)
    # Only the training split is validated here; the test split is ignored.
    X_train, y_train, _, _ = load_data(use_cleaned=use_cleaned)
    train_dataset = create_deepchecks_dataset(X_train, y_train, "training")
    print("\nRunning Data Integrity checks...")
    suite = data_integrity()
    result = suite.run(train_dataset)
    print("\nData Integrity Suite completed!")
    print(f"Total checks: {len(result.results)}")
    if save_output:
        output_dir = Path("reports/deepchecks")
        output_dir.mkdir(parents=True, exist_ok=True)
        # Encode the data variant in the file name so cleaned/original
        # runs do not overwrite each other.
        suffix = "_clean" if use_cleaned else "_original"
        json_path = output_dir / f"data_integrity_suite_results{suffix}.json"
        json_results = {
            "suite_name": "Data Integrity Suite",
            "total_checks": len(result.results),
            "timestamp": pd.Timestamp.now().isoformat(),
            "checks": []
        }
        for check_result in result.results:
            # hasattr guards keep serialization robust across deepchecks
            # versions, where individual CheckResult attributes may differ.
            check_data = {
                "check_name": check_result.get_header(),
                "passed": check_result.passed_conditions() if hasattr(check_result, 'passed_conditions') else None,
                "display": str(check_result.display) if hasattr(check_result, 'display') else None
            }
            json_results["checks"].append(check_data)
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(json_results, f, indent=2, ensure_ascii=False)
        print(f"JSON results saved to: {json_path}")
    return result
def run_custom_integrity_checks(save_output=True, use_cleaned=True):
    """
    Run custom integrity checks tailored for the SkillScope dataset.

    These checks are specifically designed for NLP/Text features and
    multi-label classification tasks.

    Args:
        save_output: Kept for interface symmetry with the other suite
            runners, but currently UNUSED -- this function writes no
            report file; results are only returned in memory.
        use_cleaned: If True, use cleaned data instead of original

    Returns:
        dict: Check-name -> result mapping. Entries are None for checks
            that raised or timed out.
    """
    # Local import keeps the deepchecks check classes off the module
    # import path until they are actually needed.
    from deepchecks.tabular.checks import (
        DataDuplicates,
        MixedNulls,
        IsSingleValue,
        ClassImbalance,
        OutlierSampleDetection,
        FeatureLabelCorrelation,
    )
    data_type = "CLEANED" if use_cleaned else "ORIGINAL"
    print("="*80)
    print(f"CUSTOM DATA INTEGRITY CHECKS - {data_type} DATA")
    print("="*80)
    # Only the training split is validated here.
    X_train, y_train, _, _ = load_data(use_cleaned=use_cleaned)
    train_dataset = create_deepchecks_dataset(X_train, y_train, "training")
    results = {}
    # Check 1: Data Duplicates
    print("\n1. Checking for duplicate samples...")
    duplicates_check = DataDuplicates()
    results['duplicates'] = duplicates_check.run(train_dataset)
    # Check 2: Mixed Nulls
    print("2. Checking for mixed null values...")
    nulls_check = MixedNulls()
    results['nulls'] = nulls_check.run(train_dataset)
    # Check 3: Single Value Features
    print("3. Checking for single-value features...")
    single_value_check = IsSingleValue()
    results['single_value'] = single_value_check.run(train_dataset)
    # Check 4: Class Imbalance
    print("4. Checking class distribution...")
    imbalance_check = ClassImbalance()
    results['class_imbalance'] = imbalance_check.run(train_dataset)
    # Check 5: Outlier Detection -- slow on wide TF-IDF matrices, so both
    # construction and execution are guarded (an unsupported kwarg raising
    # TypeError is also caught here).
    print("5. Detecting outlier samples (this may take a while)...")
    try:
        outlier_check = OutlierSampleDetection(timeout=300)  # 5 minutes timeout
        results['outliers'] = outlier_check.run(train_dataset)
    except Exception as e:
        print(f" Warning: Outlier detection failed or timed out: {str(e)}")
        results['outliers'] = None
    # Check 6: Feature-Label Correlation, restricted to the top columns
    # purely for speed on the high-dimensional TF-IDF feature space.
    print("6. Analyzing feature-label correlation (using sample of features)...")
    try:
        correlation_check = FeatureLabelCorrelation(n_top_columns=100, timeout=300)
        results['correlation'] = correlation_check.run(train_dataset)
    except Exception as e:
        print(f" Warning: Correlation check failed or timed out: {str(e)}")
        results['correlation'] = None
    print("\nAll custom checks completed!")
    # Results stay in memory for further processing by the caller.
    return results
def analyze_data_statistics(use_cleaned=True):
    """Print summary statistics for the selected train/test arrays.

    Args:
        use_cleaned: If True, analyze the cleaned data; otherwise the
            original data.
    """
    data_type = "CLEANED" if use_cleaned else "ORIGINAL"
    banner = "=" * 80
    print(banner)
    print(f"DATASET STATISTICS - {data_type} DATA")
    print(banner)
    X_train, y_train, X_test, y_test = load_data(use_cleaned=use_cleaned)
    n_train = len(y_train)
    print("\nTraining set:")
    print(f" - Samples: {X_train.shape[0]}")
    print(f" - Features: {X_train.shape[1]}")
    print(f" - Unique labels: {len(np.unique(y_train))}")
    print(" - Label distribution:")
    labels, frequencies = np.unique(y_train, return_counts=True)
    # Show at most the first 10 label frequencies.
    for label, count in zip(labels[:10], frequencies[:10]):
        print(f" Label {label}: {count} samples ({count/n_train*100:.2f}%)")
    if len(labels) > 10:
        print(f" ... and {len(labels)-10} more labels")
    print("\nTest set:")
    print(f" - Samples: {X_test.shape[0]}")
    print(f" - Features: {X_test.shape[1]}")
    print(f" - Unique labels: {len(np.unique(y_test))}")
    print("\nFeature statistics:")
    print(f" - Mean feature value: {X_train.mean():.4f}")
    print(f" - Std feature value: {X_train.std():.4f}")
    print(f" - Min feature value: {X_train.min():.4f}")
    print(f" - Max feature value: {X_train.max():.4f}")
    # Fraction of exactly-zero entries, typical for sparse TF-IDF data.
    print(f" - Sparsity: {(X_train == 0).sum() / X_train.size * 100:.2f}%")
if __name__ == "__main__":
    import sys

    # Cleaned data is the default; pass --original (or -o) for the old files.
    use_cleaned = '--original' not in sys.argv and '-o' not in sys.argv
    if use_cleaned:
        print("Testing CLEANED data (from data_cleaning.py) - DEFAULT\n")
    else:
        print("Testing ORIGINAL data\n")
        print("Note: Using --original flag to test old data\n")
    # 1) Dataset statistics.
    analyze_data_statistics(use_cleaned=use_cleaned)
    # 2) Full Deepchecks data-integrity suite.
    print("\n")
    suite_result = run_data_integrity_suite(save_output=True, use_cleaned=use_cleaned)
    # 3) Custom per-check validation.
    print("\n")
    custom_results = run_custom_integrity_checks(save_output=True, use_cleaned=use_cleaned)
    banner = "=" * 80
    print("\n" + banner)
    print("DATA INTEGRITY VALIDATION COMPLETED")
    print(banner)
    print("\nCheck the reports in the 'reports/deepchecks' directory")
|