Spaces:
Sleeping
Sleeping
| """ | |
| Integration tests for the feature extraction pipeline. | |
| Tests the combined functionality of dataset loading, text processing, | |
| and feature extraction working together. | |
| """ | |
| import pytest | |
| import numpy as np | |
| import pandas as pd | |
| import tempfile | |
| import sqlite3 | |
| from pathlib import Path | |
| from hopcroft_skill_classification_tool_competition.features import ( | |
| load_data_from_db, | |
| create_feature_dataset, | |
| extract_tfidf_features, | |
| prepare_labels, | |
| get_text_columns, | |
| get_label_columns, | |
| ) | |
class TestFeatureExtractionPipeline:
    """Integration tests for complete feature extraction pipeline."""

    def test_full_pipeline_from_dataframe_to_features(self, sample_dataframe):
        """Test complete pipeline from DataFrame to features and labels."""
        feature_matrix, _vectorizer = extract_tfidf_features(sample_dataframe, max_features=50)
        label_frame = prepare_labels(sample_dataframe)

        # Features and labels must stay row-aligned with the input samples.
        assert feature_matrix.shape[0] == len(label_frame)
        assert feature_matrix.shape[0] == len(sample_dataframe)

        # Expected container types.
        assert isinstance(feature_matrix, np.ndarray)
        assert isinstance(label_frame, pd.DataFrame)

        # Pipeline output must be clean: no NaN/Inf features, no null labels.
        assert np.isfinite(feature_matrix).all()
        assert not label_frame.isnull().any().any()

    def test_pipeline_with_database_to_features(self, temp_db):
        """Test pipeline from database loading to feature extraction."""
        frame = load_data_from_db(temp_db)
        feature_matrix, _vectorizer = extract_tfidf_features(frame, max_features=50)
        label_frame = prepare_labels(frame)

        # Every stage must agree on the number of samples.
        sample_count = len(frame)
        assert feature_matrix.shape[0] == sample_count
        assert label_frame.shape[0] == sample_count
        assert feature_matrix.shape[0] == label_frame.shape[0]

    def test_create_feature_dataset_integration(self, temp_db):
        """Test the complete create_feature_dataset function."""
        features, labels, feature_names, label_names = create_feature_dataset(
            db_path=temp_db,
            save_processed=False,
        )

        # Output container types.
        assert isinstance(features, np.ndarray)
        assert isinstance(labels, pd.DataFrame)
        assert isinstance(feature_names, np.ndarray)  # sklearn returns ndarray
        assert isinstance(label_names, list)

        # Dimensions must be mutually consistent.
        assert features.shape[0] == labels.shape[0]
        assert features.shape[1] == len(feature_names)
        assert labels.shape[1] == len(label_names)

    def test_pipeline_preserves_sample_count(self, sample_dataframe):
        """Test that no samples are lost during pipeline."""
        expected_count = len(sample_dataframe)

        feature_matrix, _ = extract_tfidf_features(sample_dataframe, max_features=50)
        label_frame = prepare_labels(sample_dataframe)

        assert feature_matrix.shape[0] == expected_count
        assert label_frame.shape[0] == expected_count

    def test_pipeline_with_various_text_lengths(self):
        """Test pipeline with documents of varying lengths."""
        frame = pd.DataFrame({
            'issue text': [
                'short',
                'This is a medium length text with several words',
                'This is a very long text ' * 50,  # Very long
            ],
            'issue description': ['desc1', 'desc2', 'desc3'],
            'Label1': [1, 0, 1],
            'Label2': [0, 1, 1],
        })

        feature_matrix, _ = extract_tfidf_features(frame, max_features=50)
        label_frame = prepare_labels(frame)

        # Every document, regardless of length, should survive the pipeline.
        assert feature_matrix.shape[0] == 3
        assert label_frame.shape[0] == 3
        # At least some non-zero feature weights should be produced.
        assert not np.all(feature_matrix == 0)
class TestDataFlowConsistency:
    """Integration tests for data consistency through the pipeline."""

    def test_text_cleaning_affects_features(self, sample_dataframe):
        """Test that text cleaning impacts feature extraction."""
        # Variant with noisy text (URLs, HTML, code fences, emoji).
        noisy = sample_dataframe.copy()
        noisy['issue text'] = [
            "Bug https://example.com with <b>HTML</b>",
            "Feature with ```code block```",
            "Update with extra spaces",
            "Test with 😀 emoji",
            "Normal clean text",
        ]
        noisy_features, _ = extract_tfidf_features(noisy, max_features=50)

        # Variant with pre-cleaned text.
        tidy = sample_dataframe.copy()
        tidy['issue text'] = [
            "Bug with HTML",
            "Feature with",
            "Update with extra spaces",
            "Test with emoji",
            "Normal clean text",
        ]
        tidy_features, _ = extract_tfidf_features(tidy, max_features=50)

        # Cleaning is applied to both variants, so the matrices should at
        # least agree in shape; values may differ (e.g. due to stemming).
        assert noisy_features.shape == tidy_features.shape

    def test_label_binarization_consistency(self):
        """Test that label binarization is consistent."""
        frame = pd.DataFrame({
            'issue text': ['text1', 'text2', 'text3'],
            'issue description': ['desc1', 'desc2', 'desc3'],
            'Label1': [0, 5, 10],  # Different counts
            'Label2': [1, 0, 100],
        })
        labels = prepare_labels(frame)

        # Binarization must map every value into {0, 1}.
        assert set(labels.values.flatten()) <= {0, 1}

        # Spot-check cells: zero stays 0, any positive count becomes 1.
        expected_cells = {
            ('Label1', 0): 0, ('Label1', 1): 1, ('Label1', 2): 1,
            ('Label2', 0): 1, ('Label2', 1): 0, ('Label2', 2): 1,
        }
        for (column, row), expected in expected_cells.items():
            assert labels.loc[row, column] == expected

    def test_feature_label_alignment(self, sample_dataframe):
        """Test that features and labels remain aligned."""
        feature_matrix, _ = extract_tfidf_features(sample_dataframe, max_features=50)
        label_frame = prepare_labels(sample_dataframe)

        # Row-by-row, each sample must carry both features and labels.
        for row in range(len(sample_dataframe)):
            assert feature_matrix[row].shape[0] > 0  # Has features
            assert label_frame.iloc[row].shape[0] > 0  # Has labels
# NOTE(review): register the "slow" marker in pytest.ini/pyproject to avoid
# PytestUnknownMarkWarning; the class docstring already promised this marker.
@pytest.mark.slow
class TestLargeDatasetHandling:
    """Integration tests with larger datasets (marked as slow)."""

    def test_pipeline_with_large_dataset(self):
        """Test pipeline with a larger number of samples.

        Verifies that sample counts are preserved end-to-end and that the
        TF-IDF vocabulary honors the ``max_features`` cap on 1000 documents.
        """
        n_samples = 1000
        # Seeded RNG so the generated labels are reproducible across runs.
        rng = np.random.default_rng(42)
        df = pd.DataFrame({
            'issue text': [f'Issue number {i} with some text' for i in range(n_samples)],
            'issue description': [f'Description for issue {i}' for i in range(n_samples)],
            'Label1': rng.integers(0, 2, n_samples),
            'Label2': rng.integers(0, 2, n_samples),
            'Label3': rng.integers(0, 2, n_samples),
        })

        features, _ = extract_tfidf_features(df, max_features=500)
        labels = prepare_labels(df)

        assert features.shape[0] == n_samples
        assert labels.shape[0] == n_samples
        # Vocabulary size must not exceed the requested cap.
        assert features.shape[1] <= 500

    def test_pipeline_with_many_labels(self):
        """Test pipeline with many label columns."""
        n_labels = 50
        # Seeded RNG so the generated labels are reproducible across runs.
        rng = np.random.default_rng(42)
        df = pd.DataFrame({
            'issue text': ['text1', 'text2', 'text3'],
            'issue description': ['desc1', 'desc2', 'desc3'],
        })
        # Add many label columns
        for i in range(n_labels):
            df[f'Label_{i}'] = rng.integers(0, 2, 3)

        labels = prepare_labels(df)

        assert labels.shape[1] == n_labels
        # All binarized values must be 0 or 1.
        assert set(labels.values.flatten()).issubset({0, 1})
class TestSaveAndLoadIntegration:
    """Integration tests for saving and loading processed data."""

    def test_save_and_load_features(self, temp_db):
        """Test saving features and labels then loading them back.

        Builds the dataset with ``save_processed=False`` and persists the
        arrays manually into a temporary directory, so the test never writes
        into the real processed-data directory. (The previous version opened
        a ``pytest.MonkeyPatch`` context without patching anything while
        passing ``save_processed=True``, which silently wrote output into
        the library's real PROCESSED_DATA_DIR as a side effect.)
        """
        with tempfile.TemporaryDirectory() as tmpdir:
            tfidf_dir = Path(tmpdir) / "tfidf"
            tfidf_dir.mkdir(parents=True)

            # Build features/labels without letting the library persist them.
            features_orig, labels_orig, _, _ = create_feature_dataset(
                db_path=temp_db,
                save_processed=False,
            )

            # Persist manually into the isolated temporary directory.
            np.save(tfidf_dir / "features_tfidf.npy", features_orig)
            np.save(tfidf_dir / "labels_tfidf.npy", labels_orig.values)

            # Load back and verify a lossless round trip.
            features_loaded = np.load(tfidf_dir / "features_tfidf.npy")
            labels_loaded = np.load(tfidf_dir / "labels_tfidf.npy")
            np.testing.assert_array_equal(features_orig, features_loaded)
            np.testing.assert_array_equal(labels_orig.values, labels_loaded)
class TestErrorHandlingInPipeline:
    """Integration tests for error handling throughout pipeline."""

    def test_pipeline_with_missing_columns(self):
        """Test pipeline behavior with missing expected columns."""
        frame = pd.DataFrame({
            'wrong_col_1': ['text1', 'text2'],
            'wrong_col_2': ['desc1', 'desc2'],
            'Label1': [1, 0],
        })

        # Column detection should find nothing rather than crash.
        assert len(get_text_columns(frame)) == 0

        # Feature extraction must fail loudly when no text columns exist.
        with pytest.raises(ValueError, match="No text columns found"):
            extract_tfidf_features(frame)

    def test_pipeline_with_all_nan_text(self):
        """Test pipeline with all NaN text values."""
        frame = pd.DataFrame({
            'issue text': [None, None, None],
            'issue description': [None, None, None],
            'Label1': [1, 0, 1],
        })

        # NaN text must not crash the extractor; rows may end up all-zero.
        features, _ = extract_tfidf_features(frame, max_features=50)
        assert features.shape[0] == 3
        assert not np.any(np.isnan(features))

    def test_pipeline_with_empty_labels(self):
        """Test pipeline when no labels are present."""
        frame = pd.DataFrame({
            'issue text': ['text1', 'text2'],
            'issue description': ['desc1', 'desc2'],
            # No label columns
        })

        # With no label columns, detection should return an empty list.
        assert len(get_label_columns(frame)) == 0