File size: 16,919 Bytes
225af6a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
"""
Unit tests for features.py module.

Tests individual functions for text cleaning, feature extraction,
and label preparation.
"""
import pytest
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

from hopcroft_skill_classification_tool_competition.features import (
    clean_github_text,
    get_text_columns,
    get_label_columns,
    combine_text_fields,
    extract_tfidf_features,
    prepare_labels,
    get_dataset_info,
    load_data_from_db,
)


@pytest.mark.unit
class TestTextCleaning:
    """Unit tests for text cleaning functionality."""

    def test_clean_github_text_removes_urls(self):
        """Test that URLs are removed from text."""
        text = "Fixed bug https://github.com/repo/issues/123 in authentication"
        cleaned = clean_github_text(text)

        assert "https://" not in cleaned
        assert "github.com" not in cleaned
        assert "fix" in cleaned.lower()  # Stemmed version of "fixed"
        assert "authent" in cleaned.lower()  # Stemmed version

    def test_clean_github_text_removes_html(self):
        """Test that HTML tags are removed."""
        text = "Added <b>bold</b> feature with <i>italic</i> text"
        cleaned = clean_github_text(text)

        assert "<b>" not in cleaned
        assert "<i>" not in cleaned
        assert "bold" in cleaned.lower()
        # After stemming, "italic" becomes "ital"
        assert "ital" in cleaned.lower()

    def test_clean_github_text_removes_code_blocks(self):
        """Test that markdown code blocks are removed."""
        text = """Fixed bug in code:
        ```python
        def foo():
            pass
        ```
        """
        cleaned = clean_github_text(text)

        assert "```" not in cleaned
        assert "python" not in cleaned
        assert "def" not in cleaned
        assert "fix" in cleaned.lower()

    def test_clean_github_text_removes_inline_code(self):
        """Test that inline code markers are removed."""
        text = "Updated `getUserById()` method implementation"
        cleaned = clean_github_text(text)

        assert "`" not in cleaned
        assert "method" in cleaned.lower()

    def test_clean_github_text_normalizes_whitespace(self):
        """Test that extra whitespace is normalized."""
        text = "Fixed    multiple   spaces   and\n\n\nnewlines"
        cleaned = clean_github_text(text)

        assert "    " not in cleaned
        assert "\n\n" not in cleaned
        # BUG FIX: the previous assertion compared cleaned.split() against a
        # filtered copy of itself, which can never fail because str.split()
        # with no argument already discards empty strings.  Assert the actual
        # contract instead: no run of two-or-more spaces survives cleaning.
        assert "  " not in cleaned

    @pytest.mark.parametrize("text,expected_empty", [
        ("", True),
        (None, True),
        ("   ", True),
        ("\n\n", True),
        ("a", False),
    ])
    def test_clean_github_text_empty_inputs(self, text, expected_empty):
        """Test handling of empty or null inputs."""
        cleaned = clean_github_text(text)
        assert isinstance(cleaned, str)

        if expected_empty:
            assert cleaned == "" or cleaned.isspace()
        else:
            assert len(cleaned) > 0

    def test_clean_github_text_applies_stemming(self):
        """Test that stemming is applied to words."""
        text = "running walked swimming"
        cleaned = clean_github_text(text)

        # Porter stemmer should convert to stems
        assert "run" in cleaned.lower()  # running -> run
        assert "walk" in cleaned.lower()  # walked -> walk
        assert "swim" in cleaned.lower()  # swimming -> swim

    def test_clean_github_text_removes_emojis(self):
        """Test that emojis and non-ASCII characters are removed."""
        text = "Fixed bug 😀 with special chars"
        cleaned = clean_github_text(text)

        # Should only contain ASCII
        assert cleaned.isascii()
        assert "fix" in cleaned.lower()


@pytest.mark.unit
class TestColumnIdentification:
    """Unit tests for column identification functions."""

    def test_get_text_columns_identifies_correctly(self, sample_dataframe):
        """Test that text columns are correctly identified."""
        found = get_text_columns(sample_dataframe)

        assert len(found) == 2
        for expected in ('issue text', 'issue description'):
            assert expected in found

    def test_get_text_columns_handles_missing_columns(self):
        """Test handling when text columns are missing."""
        frame = pd.DataFrame({'col1': [1, 2, 3], 'col2': ['a', 'b', 'c']})
        found = get_text_columns(frame)

        # Still a list, just an empty one: no standard text columns exist.
        assert isinstance(found, list)
        assert len(found) == 0

    def test_get_label_columns_identifies_correctly(self, sample_dataframe):
        """Test that label columns are correctly identified."""
        found = get_label_columns(sample_dataframe)

        # Metadata and free-text columns must be excluded...
        for excluded in ('Repo Name', 'PR #', 'issue text', 'issue description'):
            assert excluded not in found

        # ...while genuine label columns are kept.
        for label in ('Language', 'Data Structure', 'Testing'):
            assert label in found

    def test_get_label_columns_only_numeric(self, sample_dataframe):
        """Test that only numeric columns are identified as labels."""
        found = get_label_columns(sample_dataframe)

        # Every identified label column must carry a numeric dtype.
        assert all(
            pd.api.types.is_numeric_dtype(sample_dataframe[col])
            for col in found
        )


@pytest.mark.unit
class TestTextCombination:
    """Unit tests for text combination functionality."""

    def test_combine_text_fields_combines_correctly(self, sample_dataframe):
        """Test that multiple text fields are combined."""
        cols = ['issue text', 'issue description']
        result = combine_text_fields(sample_dataframe, cols)

        assert isinstance(result, pd.Series)
        assert len(result) == len(sample_dataframe)

        # Every combined entry is a non-empty string holding content from
        # both columns (already stemmed by the cleaning step).
        for entry in result:
            assert isinstance(entry, str)
            assert len(entry) > 0

    def test_combine_text_fields_applies_cleaning(self, sample_dataframe):
        """Test that cleaning is applied during combination."""
        # Inject text containing a URL the cleaner must strip out.
        sample_dataframe['issue text'] = [
            "Fixed https://example.com bug",
            "Added feature",
            "Updated docs",
            "Refactored code",
            "Improved tests"
        ]

        result = combine_text_fields(sample_dataframe, ['issue text'])

        # No trace of the URL may survive in any combined entry.
        assert all("https://" not in entry for entry in result)
        assert all("example.com" not in entry for entry in result)

    def test_combine_text_fields_handles_nulls(self):
        """Test handling of null values in text fields."""
        frame = pd.DataFrame({
            'text1': ['hello', None, 'world'],
            'text2': [None, 'foo', 'bar']
        })

        result = combine_text_fields(frame, ['text1', 'text2'])

        assert len(result) == 3
        # Nulls are tolerated gracefully: every entry is still a plain string.
        assert all(isinstance(entry, str) for entry in result)


@pytest.mark.unit
class TestTfidfExtraction:
    """Unit tests for TF-IDF feature extraction."""

    def test_extract_tfidf_features_returns_correct_shape(self, sample_dataframe):
        """Test that TF-IDF extraction returns correct shape."""
        matrix, vec = extract_tfidf_features(sample_dataframe, max_features=50)

        n_docs, n_terms = matrix.shape
        assert n_docs == len(sample_dataframe)
        assert n_terms <= 50  # vocabulary may be smaller than the cap
        assert isinstance(vec, TfidfVectorizer)

    def test_extract_tfidf_features_returns_numpy_array(self, sample_dataframe):
        """Test that features are returned as numpy array."""
        matrix, _ = extract_tfidf_features(sample_dataframe)

        assert isinstance(matrix, np.ndarray)
        assert matrix.dtype in (np.float64, np.float32)

    @pytest.mark.parametrize("max_features", [10, 50, 100, None])
    def test_extract_tfidf_features_respects_max_features(
        self, sample_dataframe, max_features
    ):
        """Test that max_features parameter is respected."""
        matrix, _ = extract_tfidf_features(
            sample_dataframe,
            max_features=max_features
        )

        if max_features is None:
            return  # no cap requested; nothing to verify
        assert matrix.shape[1] <= max_features

    @pytest.mark.parametrize("ngram_range", [(1, 1), (1, 2), (1, 3)])
    def test_extract_tfidf_features_ngram_range(
        self, sample_dataframe, ngram_range
    ):
        """Test different n-gram ranges."""
        matrix, vec = extract_tfidf_features(
            sample_dataframe,
            ngram_range=ngram_range,
            max_features=50
        )

        assert matrix.shape[0] == len(sample_dataframe)
        vocab = vec.get_feature_names_out()

        if ngram_range[1] > 1:
            # Multi-word terms contain a space; a tiny vocabulary may
            # legitimately contain none.
            multiword = [term for term in vocab if ' ' in term]
            assert len(multiword) > 0 or len(vocab) < 50

    def test_extract_tfidf_features_handles_empty_text(self):
        """Test handling of documents with empty text."""
        frame = pd.DataFrame({
            'issue text': ['', 'valid text', '   '],
            'issue description': ['desc', '', 'another desc']
        })

        matrix, vec = extract_tfidf_features(frame, max_features=50)

        # Empty documents must not break extraction or yield NaN/inf values.
        assert matrix.shape[0] == 3
        assert np.isfinite(matrix).all()


@pytest.mark.unit
class TestLabelPreparation:
    """Unit tests for label preparation."""

    def test_prepare_labels_returns_binary(self, sample_dataframe):
        """Test that labels are converted to binary format."""
        result = prepare_labels(sample_dataframe)

        # Every value in the label matrix must be either 0 or 1.
        assert set(np.unique(result.values)) <= {0, 1}

    def test_prepare_labels_correct_shape(self, sample_dataframe):
        """Test that label matrix has correct shape."""
        expected_cols = get_label_columns(sample_dataframe)
        result = prepare_labels(sample_dataframe)

        rows, cols = result.shape
        assert rows == len(sample_dataframe)
        assert cols == len(expected_cols)

    def test_prepare_labels_converts_counts_to_binary(self):
        """Test that label counts > 0 are converted to 1."""
        frame = pd.DataFrame({
            'Repo Name': ['repo1', 'repo2'],
            'issue text': ['text1', 'text2'],
            'Label1': [0, 5],  # 5 should become 1
            'Label2': [3, 0],  # 3 should become 1
            'Label3': [0, 0],
        })

        result = prepare_labels(frame)

        assert result.loc[0, 'Label1'] == 0
        assert result.loc[0, 'Label2'] == 1
        assert result.loc[1, 'Label1'] == 1
        assert result.loc[1, 'Label2'] == 0

    def test_prepare_labels_preserves_column_names(self, sample_dataframe):
        """Test that label column names are preserved."""
        expected_cols = get_label_columns(sample_dataframe)
        result = prepare_labels(sample_dataframe)

        assert list(result.columns) == expected_cols


@pytest.mark.unit
class TestDatasetInfo:
    """Unit tests for dataset information extraction."""

    def test_get_dataset_info_returns_dict(self, sample_dataframe):
        """Test that dataset info returns a dictionary."""
        assert isinstance(get_dataset_info(sample_dataframe), dict)

    def test_get_dataset_info_contains_required_keys(self, sample_dataframe):
        """Test that all required keys are present."""
        summary = get_dataset_info(sample_dataframe)

        expected = (
            'total_issues', 'total_columns', 'text_columns',
            'num_text_columns', 'label_columns', 'num_labels',
            'avg_labels_per_issue', 'median_labels_per_issue'
        )
        # Collect anything absent so a failure names every missing key at once.
        missing = [key for key in expected if key not in summary]
        assert not missing

    def test_get_dataset_info_correct_counts(self, sample_dataframe):
        """Test that counts are calculated correctly."""
        summary = get_dataset_info(sample_dataframe)

        assert summary['total_issues'] == len(sample_dataframe)
        assert summary['total_columns'] == len(sample_dataframe.columns)
        assert summary['num_text_columns'] == 2  # issue text and description

    def test_get_dataset_info_label_statistics(self, sample_dataframe):
        """Test label statistics are reasonable."""
        summary = get_dataset_info(sample_dataframe)

        # Per-issue stats are non-negative, and the mean cannot exceed the
        # total number of label columns.
        assert 0 <= summary['avg_labels_per_issue'] <= summary['num_labels']
        assert summary['median_labels_per_issue'] >= 0


@pytest.mark.unit
@pytest.mark.requires_data
class TestDatabaseLoading:
    """Unit tests for database loading (requires temp DB)."""

    def test_load_data_from_db_returns_dataframe(self, temp_db):
        """Test that loading from DB returns a DataFrame."""
        loaded = load_data_from_db(temp_db)

        assert isinstance(loaded, pd.DataFrame)
        assert len(loaded) > 0

    def test_load_data_from_db_contains_expected_columns(self, temp_db):
        """Test that loaded data has expected columns."""
        loaded = load_data_from_db(temp_db)

        for col in ('issue text', 'issue description', 'Repo Name', 'PR #'):
            assert col in loaded.columns

    def test_load_data_from_db_nonexistent_file(self):
        """Test handling of nonexistent database file."""
        from pathlib import Path

        # Implementation may surface FileNotFoundError or an sqlite3 error,
        # so only the broad contract (some exception) is pinned here.
        with pytest.raises(Exception):
            load_data_from_db(Path("/nonexistent/path/to/db.db"))


@pytest.mark.unit
class TestEdgeCases:
    """Unit tests for edge cases and error handling."""

    def test_extract_tfidf_with_single_document(self):
        """Test TF-IDF extraction with only one document."""
        frame = pd.DataFrame({
            'issue text': ['Single document for testing'],
            'issue description': ['Description'],
            'Label1': [1]
        })

        # A lone document needs min_df=1 (and a relaxed max_df) so its terms
        # are not filtered out of the vocabulary.
        matrix, vec = extract_tfidf_features(
            frame,
            max_features=50,
            min_df=1,
            max_df=1.0
        )

        assert matrix.shape[0] == 1
        assert matrix.shape[1] > 0

    def test_extract_tfidf_with_identical_documents(self):
        """Test TF-IDF with identical documents."""
        frame = pd.DataFrame({
            'issue text': ['Same text'] * 3,
            'issue description': ['Same description'] * 3,
            'Label1': [1, 0, 1]
        })

        # Every term appears in 100% of the documents, so max_df must be 1.0
        # (and min_df=1) or the entire vocabulary would be discarded.
        matrix, _ = extract_tfidf_features(
            frame,
            max_features=50,
            min_df=1,
            max_df=1.0
        )

        # All rows produced, and at least one non-zero TF-IDF weight exists.
        assert matrix.shape[0] == 3
        assert matrix.any()

    def test_prepare_labels_with_all_zeros(self):
        """Test label preparation when a label has all zeros."""
        frame = pd.DataFrame({
            'issue text': ['text1', 'text2'],
            'Label1': [0, 0],  # All zeros
            'Label2': [1, 1],
        })

        result = prepare_labels(frame)

        assert result['Label1'].sum() == 0
        assert result['Label2'].sum() == 2

    def test_clean_text_with_only_special_characters(self):
        """Test cleaning text that contains only special characters."""
        cleaned = clean_github_text("!@#$%^&*()")

        # Must not raise; the result may be empty but is always a string.
        assert isinstance(cleaned, str)