File size: 11,760 Bytes
225af6a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ea5abff
 
225af6a
 
 
 
 
 
 
 
 
 
 
ea5abff
 
225af6a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ea5abff
 
 
 
 
225af6a
 
 
 
 
 
ea5abff
 
 
225af6a
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
"""
Integration tests for the feature extraction pipeline.

Tests the combined functionality of dataset loading, text processing,
and feature extraction working together.
"""
import pytest
import numpy as np
import pandas as pd
import tempfile
import sqlite3
from pathlib import Path

from hopcroft_skill_classification_tool_competition.features import (
    load_data_from_db,
    create_feature_dataset,
    extract_tfidf_features,
    prepare_labels,
    get_text_columns,
    get_label_columns,
)


@pytest.mark.integration
class TestFeatureExtractionPipeline:
    """Integration tests for complete feature extraction pipeline."""

    def test_full_pipeline_from_dataframe_to_features(self, sample_dataframe):
        """Test complete pipeline from DataFrame to features and labels."""
        tfidf_matrix, vectorizer = extract_tfidf_features(
            sample_dataframe, max_features=50
        )
        label_frame = prepare_labels(sample_dataframe)

        # Rows must stay aligned across features, labels and the input frame.
        assert tfidf_matrix.shape[0] == len(label_frame)
        assert tfidf_matrix.shape[0] == len(sample_dataframe)

        # Container types produced by the pipeline.
        assert isinstance(tfidf_matrix, np.ndarray)
        assert isinstance(label_frame, pd.DataFrame)

        # Neither features nor labels may contain NaN/Inf or null values.
        assert not np.isnan(tfidf_matrix).any()
        assert not np.isinf(tfidf_matrix).any()
        assert not label_frame.isnull().any().any()

    def test_pipeline_with_database_to_features(self, temp_db):
        """Test pipeline from database loading to feature extraction."""
        loaded = load_data_from_db(temp_db)
        tfidf_matrix, vectorizer = extract_tfidf_features(loaded, max_features=50)
        label_frame = prepare_labels(loaded)

        # Every stage must preserve the sample count end to end.
        n_rows = len(loaded)
        assert tfidf_matrix.shape[0] == n_rows
        assert label_frame.shape[0] == n_rows
        assert tfidf_matrix.shape[0] == label_frame.shape[0]

    def test_create_feature_dataset_integration(self, temp_db):
        """Test the complete create_feature_dataset function."""
        X, y, feature_names, label_names = create_feature_dataset(
            db_path=temp_db,
            save_processed=False
        )

        # Output container types.
        assert isinstance(X, np.ndarray)
        assert isinstance(y, pd.DataFrame)
        assert isinstance(feature_names, np.ndarray)  # sklearn returns ndarray
        assert isinstance(label_names, list)

        # Feature/label shapes must agree with the reported names.
        assert X.shape[0] == y.shape[0]
        assert X.shape[1] == len(feature_names)
        assert y.shape[1] == len(label_names)

    def test_pipeline_preserves_sample_count(self, sample_dataframe):
        """Test that no samples are lost during pipeline."""
        expected_rows = len(sample_dataframe)

        tfidf_matrix, _ = extract_tfidf_features(sample_dataframe, max_features=50)
        label_frame = prepare_labels(sample_dataframe)

        assert tfidf_matrix.shape[0] == expected_rows
        assert label_frame.shape[0] == expected_rows

    def test_pipeline_with_various_text_lengths(self):
        """Test pipeline with documents of varying lengths."""
        frame = pd.DataFrame({
            'issue text': [
                'short',
                'This is a medium length text with several words',
                'This is a very long text ' * 50,  # Very long
            ],
            'issue description': ['desc1', 'desc2', 'desc3'],
            'Label1': [1, 0, 1],
            'Label2': [0, 1, 1],
        })

        tfidf_matrix, _ = extract_tfidf_features(frame, max_features=50)
        label_frame = prepare_labels(frame)

        # No document is dropped regardless of its length.
        assert tfidf_matrix.shape[0] == 3
        assert label_frame.shape[0] == 3

        # At least one non-zero TF-IDF weight must be produced.
        assert not np.all(tfidf_matrix == 0)


@pytest.mark.integration
class TestDataFlowConsistency:
    """Integration tests for data consistency through the pipeline."""

    def test_text_cleaning_affects_features(self, sample_dataframe):
        """Test that text cleaning impacts feature extraction."""
        noisy = sample_dataframe.copy()
        noisy['issue text'] = [
            "Bug https://example.com with <b>HTML</b>",
            "Feature with ```code block```",
            "Update with   extra   spaces",
            "Test with 😀 emoji",
            "Normal clean text",
        ]
        # min_df=1 / max_df=1.0 keep the vocabulary non-empty on tiny fixtures.
        noisy_features, _ = extract_tfidf_features(
            noisy, max_features=50, min_df=1, max_df=1.0
        )

        tidy = sample_dataframe.copy()
        tidy['issue text'] = [
            "Bug with HTML",
            "Feature with",
            "Update with extra spaces",
            "Test with emoji",
            "Normal clean text",
        ]
        tidy_features, _ = extract_tfidf_features(
            tidy, max_features=50, min_df=1, max_df=1.0
        )

        # Cleaning is applied to both inputs, so the shapes should agree,
        # though individual terms may differ due to stemming.
        assert noisy_features.shape == tidy_features.shape

    def test_label_binarization_consistency(self):
        """Test that label binarization is consistent."""
        frame = pd.DataFrame({
            'issue text': ['text1', 'text2', 'text3'],
            'issue description': ['desc1', 'desc2', 'desc3'],
            'Label1': [0, 5, 10],  # Different counts
            'Label2': [1, 0, 100],
        })

        binarized = prepare_labels(frame)

        # Counts collapse to a strict {0, 1} domain.
        assert set(binarized.values.flatten()) <= {0, 1}

        # Spot-check each cell: any positive count becomes 1, zero stays 0.
        expected = {
            (0, 'Label1'): 0, (1, 'Label1'): 1, (2, 'Label1'): 1,
            (0, 'Label2'): 1, (1, 'Label2'): 0, (2, 'Label2'): 1,
        }
        for (row, col), value in expected.items():
            assert binarized.loc[row, col] == value

    def test_feature_label_alignment(self, sample_dataframe):
        """Test that features and labels remain aligned."""
        tfidf_matrix, _ = extract_tfidf_features(sample_dataframe, max_features=50)
        binarized = prepare_labels(sample_dataframe)

        # Row by row: every sample carries both features and labels.
        for row in range(len(sample_dataframe)):
            assert tfidf_matrix[row].shape[0] > 0  # Has features
            assert binarized.iloc[row].shape[0] > 0  # Has labels


@pytest.mark.integration
@pytest.mark.slow
class TestLargeDatasetHandling:
    """Integration tests with larger datasets (marked as slow)."""

    def test_pipeline_with_large_dataset(self):
        """Test pipeline with a larger number of samples.

        Uses a seeded generator so the synthetic labels are reproducible
        across runs (unseeded np.random made the fixture non-deterministic).
        """
        n_samples = 1000
        rng = np.random.default_rng(0)  # seeded: keeps the test deterministic
        df = pd.DataFrame({
            'issue text': [f'Issue number {i} with some text' for i in range(n_samples)],
            'issue description': [f'Description for issue {i}' for i in range(n_samples)],
            'Label1': rng.integers(0, 2, n_samples),
            'Label2': rng.integers(0, 2, n_samples),
            'Label3': rng.integers(0, 2, n_samples),
        })

        features, _ = extract_tfidf_features(df, max_features=500)
        labels = prepare_labels(df)

        # No samples lost; vocabulary size capped by max_features.
        assert features.shape[0] == n_samples
        assert labels.shape[0] == n_samples
        assert features.shape[1] <= 500

    def test_pipeline_with_many_labels(self):
        """Test pipeline with many label columns.

        Uses a seeded generator so the synthetic labels are reproducible.
        """
        n_labels = 50
        rng = np.random.default_rng(0)
        df = pd.DataFrame({
            'issue text': ['text1', 'text2', 'text3'],
            'issue description': ['desc1', 'desc2', 'desc3'],
        })

        # Add many label columns
        for i in range(n_labels):
            df[f'Label_{i}'] = rng.integers(0, 2, 3)

        labels = prepare_labels(df)

        # Every label column survives preparation and is strictly binary.
        assert labels.shape[1] == n_labels
        assert set(labels.values.flatten()).issubset({0, 1})


@pytest.mark.integration
class TestSaveAndLoadIntegration:
    """Integration tests for saving and loading processed data."""

    def test_save_and_load_features(self, temp_db):
        """Test saving features and labels then loading them back.

        The original version entered a pytest.MonkeyPatch context but never
        patched anything, while calling create_feature_dataset with
        save_processed=True — so the pipeline wrote to the REAL processed-data
        directory during tests. Fixed by saving manually into a temporary
        directory and disabling the pipeline's own persistence. The redundant
        local re-import of create_feature_dataset (already imported at module
        level) and the unused load_processed_data import are also removed.
        """
        with tempfile.TemporaryDirectory() as tmpdir:
            tfidf_dir = Path(tmpdir) / "tfidf"
            tfidf_dir.mkdir(parents=True)

            # save_processed=False: persistence is done manually below so the
            # real data directory is never touched.
            features_orig, labels_orig, _, _ = create_feature_dataset(
                db_path=temp_db,
                save_processed=False
            )

            # Round-trip through np.save / np.load inside the temp dir.
            np.save(tfidf_dir / "features_tfidf.npy", features_orig)
            np.save(tfidf_dir / "labels_tfidf.npy", labels_orig.values)

            features_loaded = np.load(tfidf_dir / "features_tfidf.npy")
            labels_loaded = np.load(tfidf_dir / "labels_tfidf.npy")

            # Loaded arrays must match the originals exactly.
            np.testing.assert_array_equal(features_orig, features_loaded)
            np.testing.assert_array_equal(labels_orig.values, labels_loaded)


@pytest.mark.integration
class TestErrorHandlingInPipeline:
    """Integration tests for error handling throughout pipeline."""

    def test_pipeline_with_missing_columns(self):
        """Test pipeline behavior with missing expected columns."""
        frame = pd.DataFrame({
            'wrong_col_1': ['text1', 'text2'],
            'wrong_col_2': ['desc1', 'desc2'],
            'Label1': [1, 0],
        })

        # Column detection should come back empty rather than crash.
        assert len(get_text_columns(frame)) == 0

        # Extraction itself must fail loudly when no text columns exist
        # (even explicit column specification would not be meaningful here).
        with pytest.raises(ValueError, match="No text columns found"):
            extract_tfidf_features(frame)

    def test_pipeline_with_all_nan_text(self):
        """Test pipeline with all NaN text values raises appropriate error.

        TF-IDF cannot build a vocabulary from empty/NaN documents,
        so it should raise a ValueError with a descriptive message.
        """
        frame = pd.DataFrame({
            'issue text': [None, None, None],
            'issue description': [None, None, None],
            'Label1': [1, 0, 1],
        })

        # Empty vocabulary is surfaced as a ValueError, not silent output.
        with pytest.raises(ValueError, match="empty vocabulary"):
            extract_tfidf_features(frame, max_features=50)

    def test_pipeline_with_empty_labels(self):
        """Test pipeline when no labels are present."""
        frame = pd.DataFrame({
            'issue text': ['text1', 'text2'],
            'issue description': ['desc1', 'desc2'],
            # No label columns
        })

        # With no Label* columns, detection yields an empty collection.
        assert len(get_label_columns(frame)) == 0