"""
Integration tests for the feature extraction pipeline.

Tests the combined functionality of dataset loading, text processing,
and feature extraction working together.
"""
import sqlite3
import tempfile
from pathlib import Path

import numpy as np
import pandas as pd
import pytest

from hopcroft_skill_classification_tool_competition.features import (
    load_data_from_db,
    create_feature_dataset,
    extract_tfidf_features,
    prepare_labels,
    get_text_columns,
    get_label_columns,
)


@pytest.mark.integration
class TestFeatureExtractionPipeline:
    """Integration tests for complete feature extraction pipeline."""

    def test_full_pipeline_from_dataframe_to_features(self, sample_dataframe):
        """Test complete pipeline from DataFrame to features and labels."""
        # Extract features
        features, _ = extract_tfidf_features(sample_dataframe, max_features=50)

        # Prepare labels
        labels = prepare_labels(sample_dataframe)

        # Verify alignment
        assert features.shape[0] == len(labels)
        assert features.shape[0] == len(sample_dataframe)

        # Verify data types
        assert isinstance(features, np.ndarray)
        assert isinstance(labels, pd.DataFrame)

        # Verify no NaN or Inf values
        assert not np.any(np.isnan(features))
        assert not np.any(np.isinf(features))
        assert not labels.isnull().any().any()

    def test_pipeline_with_database_to_features(self, temp_db):
        """Test pipeline from database loading to feature extraction."""
        # Load from database
        df = load_data_from_db(temp_db)

        # Extract features
        features, _ = extract_tfidf_features(df, max_features=50)

        # Prepare labels
        labels = prepare_labels(df)

        # Verify complete pipeline
        assert features.shape[0] == len(df)
        assert labels.shape[0] == len(df)
        assert features.shape[0] == labels.shape[0]

    def test_create_feature_dataset_integration(self, temp_db):
        """Test the complete create_feature_dataset function."""
        features, labels, feature_names, label_names = create_feature_dataset(
            db_path=temp_db,
            save_processed=False,
        )

        # Verify outputs
        assert isinstance(features, np.ndarray)
        assert isinstance(labels, pd.DataFrame)
        assert isinstance(feature_names, np.ndarray)  # sklearn returns ndarray
        assert isinstance(label_names, list)

        # Verify shapes match
        assert features.shape[0] == labels.shape[0]
        assert features.shape[1] == len(feature_names)
        assert labels.shape[1] == len(label_names)

    def test_pipeline_preserves_sample_count(self, sample_dataframe):
        """Test that no samples are lost during pipeline."""
        initial_count = len(sample_dataframe)

        features, _ = extract_tfidf_features(sample_dataframe, max_features=50)
        labels = prepare_labels(sample_dataframe)

        assert features.shape[0] == initial_count
        assert labels.shape[0] == initial_count

    def test_pipeline_with_various_text_lengths(self):
        """Test pipeline with documents of varying lengths."""
        df = pd.DataFrame({
            'issue text': [
                'short',
                'This is a medium length text with several words',
                'This is a very long text ' * 50,  # Very long
            ],
            'issue description': ['desc1', 'desc2', 'desc3'],
            'Label1': [1, 0, 1],
            'Label2': [0, 1, 1],
        })

        features, _ = extract_tfidf_features(df, max_features=50)
        labels = prepare_labels(df)

        # All documents should be processed
        assert features.shape[0] == 3
        assert labels.shape[0] == 3

        # Features should have reasonable values
        assert not np.all(features == 0)


@pytest.mark.integration
class TestDataFlowConsistency:
    """Integration tests for data consistency through the pipeline."""

    def test_text_cleaning_affects_features(self, sample_dataframe):
        """Test that text cleaning impacts feature extraction."""
        # Add dirty text
        dirty_df = sample_dataframe.copy()
        dirty_df['issue text'] = [
            "Bug https://example.com with HTML",
            "Feature with ```code block```",
            "Update with extra spaces",
            "Test with 😀 emoji",
            "Normal clean text",
        ]

        # Use min_df=1 and max_df=1.0 for small test datasets to avoid
        # empty vocabulary
        features_dirty, _ = extract_tfidf_features(
            dirty_df, max_features=50, min_df=1, max_df=1.0
        )

        # Clean version
        clean_df = sample_dataframe.copy()
        clean_df['issue text'] = [
            "Bug with HTML",
            "Feature with",
            "Update with extra spaces",
            "Test with emoji",
            "Normal clean text",
        ]

        # Use min_df=1 and max_df=1.0 for small test datasets
        features_clean, _ = extract_tfidf_features(
            clean_df, max_features=50, min_df=1, max_df=1.0
        )

        # Features should be similar (cleaning is applied to both)
        # But not necessarily identical due to stemming
        assert features_dirty.shape == features_clean.shape

    def test_label_binarization_consistency(self):
        """Test that label binarization is consistent."""
        df = pd.DataFrame({
            'issue text': ['text1', 'text2', 'text3'],
            'issue description': ['desc1', 'desc2', 'desc3'],
            'Label1': [0, 5, 10],  # Different counts
            'Label2': [1, 0, 100],
        })

        labels = prepare_labels(df)

        # All values should be 0 or 1
        assert set(labels.values.flatten()).issubset({0, 1})

        # Specific checks: any positive count must binarize to 1
        assert labels.loc[0, 'Label1'] == 0
        assert labels.loc[1, 'Label1'] == 1
        assert labels.loc[2, 'Label1'] == 1
        assert labels.loc[0, 'Label2'] == 1
        assert labels.loc[1, 'Label2'] == 0
        assert labels.loc[2, 'Label2'] == 1

    def test_feature_label_alignment(self, sample_dataframe):
        """Test that features and labels remain aligned."""
        features, _ = extract_tfidf_features(sample_dataframe, max_features=50)
        labels = prepare_labels(sample_dataframe)

        # Row counts must agree before rows are compared pairwise; otherwise
        # zip() below would silently truncate to the shorter input.
        n_rows = len(sample_dataframe)
        assert features.shape[0] == n_rows
        assert labels.shape[0] == n_rows

        # Each row should correspond to the same sample
        for row_features, (_, row_labels) in zip(features, labels.iterrows()):
            assert row_features.shape[0] > 0  # Has features
            assert row_labels.shape[0] > 0  # Has labels


@pytest.mark.integration
@pytest.mark.slow
class TestLargeDatasetHandling:
    """Integration tests with larger datasets (marked as slow)."""

    def test_pipeline_with_large_dataset(self):
        """Test pipeline with a larger number of samples."""
        # Seeded generator keeps the random labels reproducible across runs
        rng = np.random.default_rng(42)

        # Create larger dataset
        n_samples = 1000
        df = pd.DataFrame({
            'issue text': [
                f'Issue number {i} with some text' for i in range(n_samples)
            ],
            'issue description': [
                f'Description for issue {i}' for i in range(n_samples)
            ],
            'Label1': rng.integers(0, 2, n_samples),
            'Label2': rng.integers(0, 2, n_samples),
            'Label3': rng.integers(0, 2, n_samples),
        })

        features, _ = extract_tfidf_features(df, max_features=500)
        labels = prepare_labels(df)

        assert features.shape[0] == n_samples
        assert labels.shape[0] == n_samples
        assert features.shape[1] <= 500

    def test_pipeline_with_many_labels(self):
        """Test pipeline with many label columns."""
        # Seeded generator keeps the random labels reproducible across runs
        rng = np.random.default_rng(42)

        n_labels = 50
        df = pd.DataFrame({
            'issue text': ['text1', 'text2', 'text3'],
            'issue description': ['desc1', 'desc2', 'desc3'],
        })

        # Add many label columns
        for i in range(n_labels):
            df[f'Label_{i}'] = rng.integers(0, 2, 3)

        labels = prepare_labels(df)

        assert labels.shape[1] == n_labels
        assert set(labels.values.flatten()).issubset({0, 1})


@pytest.mark.integration
class TestSaveAndLoadIntegration:
    """Integration tests for saving and loading processed data."""

    def test_save_and_load_features(self, temp_db):
        """Test saving features and labels then loading them back.

        The arrays are written to and read from a temporary directory with
        NumPy directly. ``save_processed=False`` is passed so that
        ``create_feature_dataset`` is not asked to persist anything itself
        and the test only touches its own temporary directory.
        """
        with tempfile.TemporaryDirectory() as tmpdir:
            tfidf_dir = Path(tmpdir) / "tfidf"
            tfidf_dir.mkdir(parents=True)

            # Create the dataset without persisting it
            features_orig, labels_orig, _, _ = create_feature_dataset(
                db_path=temp_db,
                save_processed=False,
            )

            # Save manually into the isolated temporary directory
            features_path = tfidf_dir / "features_tfidf.npy"
            labels_path = tfidf_dir / "labels_tfidf.npy"
            np.save(features_path, features_orig)
            np.save(labels_path, labels_orig.values)

            # Load back
            features_loaded = np.load(features_path)
            labels_loaded = np.load(labels_path)

            # Verify they match
            np.testing.assert_array_equal(features_orig, features_loaded)
            np.testing.assert_array_equal(labels_orig.values, labels_loaded)


@pytest.mark.integration
class TestErrorHandlingInPipeline:
    """Integration tests for error handling throughout pipeline."""

    def test_pipeline_with_missing_columns(self):
        """Test pipeline behavior with missing expected columns."""
        df = pd.DataFrame({
            'wrong_col_1': ['text1', 'text2'],
            'wrong_col_2': ['desc1', 'desc2'],
            'Label1': [1, 0],
        })

        # Should handle missing text columns gracefully
        text_cols = get_text_columns(df)
        assert len(text_cols) == 0

        # With no text columns detected, feature extraction is expected to
        # fail loudly rather than produce meaningless features
        with pytest.raises(ValueError, match="No text columns found"):
            extract_tfidf_features(df)

    def test_pipeline_with_all_nan_text(self):
        """Test pipeline with all NaN text values raises appropriate error.

        TF-IDF cannot build a vocabulary from empty/NaN documents,
        so it should raise a ValueError with a descriptive message.
        """
        df = pd.DataFrame({
            'issue text': [None, None, None],
            'issue description': [None, None, None],
            'Label1': [1, 0, 1],
        })

        # TF-IDF should raise ValueError for empty vocabulary
        with pytest.raises(ValueError, match="empty vocabulary"):
            extract_tfidf_features(df, max_features=50)

    def test_pipeline_with_empty_labels(self):
        """Test pipeline when no labels are present."""
        df = pd.DataFrame({
            'issue text': ['text1', 'text2'],
            'issue description': ['desc1', 'desc2'],
            # No label columns
        })

        label_cols = get_label_columns(df)

        # Should return empty list
        assert len(label_cols) == 0