File size: 11,760 Bytes
225af6a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ea5abff
 
225af6a
 
 
 
 
 
 
 
 
 
 
ea5abff
 
225af6a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ea5abff
 
 
 
 
225af6a
 
 
 
 
 
ea5abff
 
 
225af6a
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
"""
Integration tests for the feature extraction pipeline.

Tests the combined functionality of dataset loading, text processing,
and feature extraction working together.
"""
import pytest
import numpy as np
import pandas as pd
import tempfile
import sqlite3
from pathlib import Path

from hopcroft_skill_classification_tool_competition.features import (
    load_data_from_db,
    create_feature_dataset,
    extract_tfidf_features,
    prepare_labels,
    get_text_columns,
    get_label_columns,
)


@pytest.mark.integration
class TestFeatureExtractionPipeline:
    """Integration tests for complete feature extraction pipeline."""

    def test_full_pipeline_from_dataframe_to_features(self, sample_dataframe):
        """Test complete pipeline from DataFrame to features and labels."""
        tfidf_matrix, vectorizer = extract_tfidf_features(
            sample_dataframe, max_features=50
        )
        label_frame = prepare_labels(sample_dataframe)

        # Rows must stay aligned across features, labels and the input frame.
        assert tfidf_matrix.shape[0] == len(label_frame)
        assert tfidf_matrix.shape[0] == len(sample_dataframe)

        # Container types produced by the pipeline.
        assert isinstance(tfidf_matrix, np.ndarray)
        assert isinstance(label_frame, pd.DataFrame)

        # Neither features nor labels may contain NaN/Inf or null values.
        assert not np.isnan(tfidf_matrix).any()
        assert not np.isinf(tfidf_matrix).any()
        assert not label_frame.isnull().any().any()

    def test_pipeline_with_database_to_features(self, temp_db):
        """Test pipeline from database loading to feature extraction."""
        loaded = load_data_from_db(temp_db)
        tfidf_matrix, vectorizer = extract_tfidf_features(loaded, max_features=50)
        label_frame = prepare_labels(loaded)

        # Every stage must preserve the sample count end to end.
        n_rows = len(loaded)
        assert tfidf_matrix.shape[0] == n_rows
        assert label_frame.shape[0] == n_rows
        assert tfidf_matrix.shape[0] == label_frame.shape[0]

    def test_create_feature_dataset_integration(self, temp_db):
        """Test the complete create_feature_dataset function."""
        X, y, feature_names, label_names = create_feature_dataset(
            db_path=temp_db,
            save_processed=False
        )

        # Output container types.
        assert isinstance(X, np.ndarray)
        assert isinstance(y, pd.DataFrame)
        assert isinstance(feature_names, np.ndarray)  # sklearn returns ndarray
        assert isinstance(label_names, list)

        # Feature/label shapes must agree with the reported names.
        assert X.shape[0] == y.shape[0]
        assert X.shape[1] == len(feature_names)
        assert y.shape[1] == len(label_names)

    def test_pipeline_preserves_sample_count(self, sample_dataframe):
        """Test that no samples are lost during pipeline."""
        expected_rows = len(sample_dataframe)

        tfidf_matrix, _ = extract_tfidf_features(sample_dataframe, max_features=50)
        label_frame = prepare_labels(sample_dataframe)

        assert tfidf_matrix.shape[0] == expected_rows
        assert label_frame.shape[0] == expected_rows

    def test_pipeline_with_various_text_lengths(self):
        """Test pipeline with documents of varying lengths."""
        frame = pd.DataFrame({
            'issue text': [
                'short',
                'This is a medium length text with several words',
                'This is a very long text ' * 50,  # Very long
            ],
            'issue description': ['desc1', 'desc2', 'desc3'],
            'Label1': [1, 0, 1],
            'Label2': [0, 1, 1],
        })

        tfidf_matrix, _ = extract_tfidf_features(frame, max_features=50)
        label_frame = prepare_labels(frame)

        # No document is dropped regardless of its length.
        assert tfidf_matrix.shape[0] == 3
        assert label_frame.shape[0] == 3

        # At least one non-zero TF-IDF weight must be produced.
        assert not np.all(tfidf_matrix == 0)


@pytest.mark.integration
class TestDataFlowConsistency:
    """Integration tests for data consistency through the pipeline."""

    def test_text_cleaning_affects_features(self, sample_dataframe):
        """Test that text cleaning impacts feature extraction."""
        noisy = sample_dataframe.copy()
        noisy['issue text'] = [
            "Bug https://example.com with <b>HTML</b>",
            "Feature with ```code block```",
            "Update with   extra   spaces",
            "Test with 😀 emoji",
            "Normal clean text",
        ]
        # min_df=1 / max_df=1.0 keep the vocabulary non-empty on tiny fixtures.
        noisy_features, _ = extract_tfidf_features(
            noisy, max_features=50, min_df=1, max_df=1.0
        )

        tidy = sample_dataframe.copy()
        tidy['issue text'] = [
            "Bug with HTML",
            "Feature with",
            "Update with extra spaces",
            "Test with emoji",
            "Normal clean text",
        ]
        tidy_features, _ = extract_tfidf_features(
            tidy, max_features=50, min_df=1, max_df=1.0
        )

        # Cleaning is applied to both inputs, so the shapes should agree,
        # though individual terms may differ due to stemming.
        assert noisy_features.shape == tidy_features.shape

    def test_label_binarization_consistency(self):
        """Test that label binarization is consistent."""
        frame = pd.DataFrame({
            'issue text': ['text1', 'text2', 'text3'],
            'issue description': ['desc1', 'desc2', 'desc3'],
            'Label1': [0, 5, 10],  # Different counts
            'Label2': [1, 0, 100],
        })

        binarized = prepare_labels(frame)

        # Counts collapse to a strict {0, 1} domain.
        assert set(binarized.values.flatten()) <= {0, 1}

        # Spot-check each cell: any positive count becomes 1, zero stays 0.
        expected = {
            (0, 'Label1'): 0, (1, 'Label1'): 1, (2, 'Label1'): 1,
            (0, 'Label2'): 1, (1, 'Label2'): 0, (2, 'Label2'): 1,
        }
        for (row, col), value in expected.items():
            assert binarized.loc[row, col] == value

    def test_feature_label_alignment(self, sample_dataframe):
        """Test that features and labels remain aligned."""
        tfidf_matrix, _ = extract_tfidf_features(sample_dataframe, max_features=50)
        binarized = prepare_labels(sample_dataframe)

        # Row by row: every sample carries both features and labels.
        for row in range(len(sample_dataframe)):
            assert tfidf_matrix[row].shape[0] > 0  # Has features
            assert binarized.iloc[row].shape[0] > 0  # Has labels


@pytest.mark.integration
@pytest.mark.slow
class TestLargeDatasetHandling:
    """Integration tests with larger datasets (marked as slow)."""

    def test_pipeline_with_large_dataset(self):
        """Test pipeline with a larger number of samples.

        Uses a seeded generator so the synthetic labels are reproducible
        across runs (unseeded np.random made the fixture non-deterministic).
        """
        n_samples = 1000
        rng = np.random.default_rng(0)  # seeded: keeps the test deterministic
        df = pd.DataFrame({
            'issue text': [f'Issue number {i} with some text' for i in range(n_samples)],
            'issue description': [f'Description for issue {i}' for i in range(n_samples)],
            'Label1': rng.integers(0, 2, n_samples),
            'Label2': rng.integers(0, 2, n_samples),
            'Label3': rng.integers(0, 2, n_samples),
        })

        features, _ = extract_tfidf_features(df, max_features=500)
        labels = prepare_labels(df)

        # No samples lost; vocabulary size capped by max_features.
        assert features.shape[0] == n_samples
        assert labels.shape[0] == n_samples
        assert features.shape[1] <= 500

    def test_pipeline_with_many_labels(self):
        """Test pipeline with many label columns.

        Uses a seeded generator so the synthetic labels are reproducible.
        """
        n_labels = 50
        rng = np.random.default_rng(0)
        df = pd.DataFrame({
            'issue text': ['text1', 'text2', 'text3'],
            'issue description': ['desc1', 'desc2', 'desc3'],
        })

        # Add many label columns
        for i in range(n_labels):
            df[f'Label_{i}'] = rng.integers(0, 2, 3)

        labels = prepare_labels(df)

        # Every label column survives preparation and is strictly binary.
        assert labels.shape[1] == n_labels
        assert set(labels.values.flatten()).issubset({0, 1})


@pytest.mark.integration
class TestSaveAndLoadIntegration:
    """Integration tests for saving and loading processed data."""

    def test_save_and_load_features(self, temp_db):
        """Test saving features and labels then loading them back.

        The original version entered a pytest.MonkeyPatch context but never
        patched anything, while calling create_feature_dataset with
        save_processed=True — so the pipeline wrote to the REAL processed-data
        directory during tests. Fixed by saving manually into a temporary
        directory and disabling the pipeline's own persistence. The redundant
        local re-import of create_feature_dataset (already imported at module
        level) and the unused load_processed_data import are also removed.
        """
        with tempfile.TemporaryDirectory() as tmpdir:
            tfidf_dir = Path(tmpdir) / "tfidf"
            tfidf_dir.mkdir(parents=True)

            # save_processed=False: persistence is done manually below so the
            # real data directory is never touched.
            features_orig, labels_orig, _, _ = create_feature_dataset(
                db_path=temp_db,
                save_processed=False
            )

            # Round-trip through np.save / np.load inside the temp dir.
            np.save(tfidf_dir / "features_tfidf.npy", features_orig)
            np.save(tfidf_dir / "labels_tfidf.npy", labels_orig.values)

            features_loaded = np.load(tfidf_dir / "features_tfidf.npy")
            labels_loaded = np.load(tfidf_dir / "labels_tfidf.npy")

            # Loaded arrays must match the originals exactly.
            np.testing.assert_array_equal(features_orig, features_loaded)
            np.testing.assert_array_equal(labels_orig.values, labels_loaded)


@pytest.mark.integration
class TestErrorHandlingInPipeline:
    """Integration tests for error handling throughout pipeline."""

    def test_pipeline_with_missing_columns(self):
        """Test pipeline behavior with missing expected columns."""
        frame = pd.DataFrame({
            'wrong_col_1': ['text1', 'text2'],
            'wrong_col_2': ['desc1', 'desc2'],
            'Label1': [1, 0],
        })

        # Column detection should come back empty rather than crash.
        assert len(get_text_columns(frame)) == 0

        # Extraction itself must fail loudly when no text columns exist
        # (even explicit column specification would not be meaningful here).
        with pytest.raises(ValueError, match="No text columns found"):
            extract_tfidf_features(frame)

    def test_pipeline_with_all_nan_text(self):
        """Test pipeline with all NaN text values raises appropriate error.

        TF-IDF cannot build a vocabulary from empty/NaN documents,
        so it should raise a ValueError with a descriptive message.
        """
        frame = pd.DataFrame({
            'issue text': [None, None, None],
            'issue description': [None, None, None],
            'Label1': [1, 0, 1],
        })

        # Empty vocabulary is surfaced as a ValueError, not silent output.
        with pytest.raises(ValueError, match="empty vocabulary"):
            extract_tfidf_features(frame, max_features=50)

    def test_pipeline_with_empty_labels(self):
        """Test pipeline when no labels are present."""
        frame = pd.DataFrame({
            'issue text': ['text1', 'text2'],
            'issue description': ['desc1', 'desc2'],
            # No label columns
        })

        # With no Label* columns, detection yields an empty collection.
        assert len(get_label_columns(frame)) == 0