File size: 16,919 Bytes
225af6a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 |
"""
Unit tests for features.py module.
Tests individual functions for text cleaning, feature extraction,
and label preparation.
"""
import pytest
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from hopcroft_skill_classification_tool_competition.features import (
clean_github_text,
get_text_columns,
get_label_columns,
combine_text_fields,
extract_tfidf_features,
prepare_labels,
get_dataset_info,
load_data_from_db,
)
@pytest.mark.unit
class TestTextCleaning:
    """Unit tests for clean_github_text.

    Covers URL/HTML/markdown-code stripping, whitespace normalization,
    Porter stemming, and removal of non-ASCII characters.
    """

    def test_clean_github_text_removes_urls(self):
        """Test that URLs are removed from text."""
        text = "Fixed bug https://github.com/repo/issues/123 in authentication"
        cleaned = clean_github_text(text)
        assert "https://" not in cleaned
        assert "github.com" not in cleaned
        assert "fix" in cleaned.lower()  # Stemmed version of "fixed"
        assert "authent" in cleaned.lower()  # Stemmed version

    def test_clean_github_text_removes_html(self):
        """Test that HTML tags are removed."""
        text = "Added <b>bold</b> feature with <i>italic</i> text"
        cleaned = clean_github_text(text)
        assert "<b>" not in cleaned
        assert "<i>" not in cleaned
        assert "bold" in cleaned.lower()
        # After stemming, "italic" becomes "ital"
        assert "ital" in cleaned.lower()

    def test_clean_github_text_removes_code_blocks(self):
        """Test that markdown code blocks are removed."""
        text = """Fixed bug in code:
        ```python
        def foo():
            pass
        ```
        """
        cleaned = clean_github_text(text)
        assert "```" not in cleaned
        assert "python" not in cleaned
        assert "def" not in cleaned
        assert "fix" in cleaned.lower()

    def test_clean_github_text_removes_inline_code(self):
        """Test that inline code markers are removed."""
        text = "Updated `getUserById()` method implementation"
        cleaned = clean_github_text(text)
        assert "`" not in cleaned
        assert "method" in cleaned.lower()

    def test_clean_github_text_normalizes_whitespace(self):
        """Test that runs of whitespace collapse to single spaces."""
        text = "Fixed  multiple   spaces and\n\n\nnewlines"
        cleaned = clean_github_text(text)
        assert "  " not in cleaned
        assert "\n\n" not in cleaned
        # The cleaned text must already be fully normalized: re-splitting
        # and re-joining on single spaces is a no-op (this also rules out
        # leading/trailing whitespace).  The previous assertion compared
        # the word list against itself and could never fail.
        assert cleaned == " ".join(cleaned.split())

    @pytest.mark.parametrize("text,expected_empty", [
        ("", True),
        (None, True),
        (" ", True),
        ("\n\n", True),
        ("a", False),
    ])
    def test_clean_github_text_empty_inputs(self, text, expected_empty):
        """Test handling of empty or null inputs."""
        cleaned = clean_github_text(text)
        assert isinstance(cleaned, str)
        if expected_empty:
            assert cleaned == "" or cleaned.isspace()
        else:
            assert len(cleaned) > 0

    def test_clean_github_text_applies_stemming(self):
        """Test that Porter stemming is applied to words."""
        text = "running walked swimming"
        cleaned = clean_github_text(text)
        assert "run" in cleaned.lower()   # running -> run
        assert "walk" in cleaned.lower()  # walked -> walk
        assert "swim" in cleaned.lower()  # swimming -> swim

    def test_clean_github_text_removes_emojis(self):
        """Test that emojis and non-ASCII characters are removed."""
        text = "Fixed bug 😀 with special chars"
        cleaned = clean_github_text(text)
        # Should only contain ASCII after cleaning
        assert cleaned.isascii()
        assert "fix" in cleaned.lower()
@pytest.mark.unit
class TestColumnIdentification:
    """Unit tests for the column-identification helpers."""

    def test_get_text_columns_identifies_correctly(self, sample_dataframe):
        """Both standard text columns are found, and nothing else."""
        found = get_text_columns(sample_dataframe)
        assert len(found) == 2
        assert 'issue text' in found
        assert 'issue description' in found

    def test_get_text_columns_handles_missing_columns(self):
        """A frame without the standard text columns yields an empty list."""
        frame = pd.DataFrame({'col1': [1, 2, 3], 'col2': ['a', 'b', 'c']})
        found = get_text_columns(frame)
        assert isinstance(found, list)
        assert found == []  # no standard text columns present

    def test_get_label_columns_identifies_correctly(self, sample_dataframe):
        """Metadata/text columns are excluded; skill columns are included."""
        found = get_label_columns(sample_dataframe)
        # Metadata and free-text columns must never be treated as labels.
        for excluded in ('Repo Name', 'PR #', 'issue text', 'issue description'):
            assert excluded not in found
        # Known skill columns from the fixture must all be present.
        for expected in ('Language', 'Data Structure', 'Testing'):
            assert expected in found

    def test_get_label_columns_only_numeric(self, sample_dataframe):
        """Every identified label column holds numeric data."""
        assert all(
            pd.api.types.is_numeric_dtype(sample_dataframe[col])
            for col in get_label_columns(sample_dataframe)
        )
@pytest.mark.unit
class TestTextCombination:
    """Unit tests for combine_text_fields."""

    def test_combine_text_fields_combines_correctly(self, sample_dataframe):
        """Combining text columns yields one non-empty string per row."""
        text_cols = ['issue text', 'issue description']
        combined = combine_text_fields(sample_dataframe, text_cols)
        assert isinstance(combined, pd.Series)
        assert len(combined) == len(sample_dataframe)
        # Each entry is a cleaned/stemmed string built from both columns.
        # (Plain iteration: the enumerate index was never used.)
        for text in combined:
            assert isinstance(text, str)
            assert len(text) > 0

    def test_combine_text_fields_applies_cleaning(self, sample_dataframe):
        """Cleaning (e.g. URL removal) is applied during combination."""
        # Inject dirty text containing a URL into the fixture.
        sample_dataframe['issue text'] = [
            "Fixed https://example.com bug",
            "Added feature",
            "Updated docs",
            "Refactored code",
            "Improved tests",
        ]
        combined = combine_text_fields(sample_dataframe, ['issue text'])
        # URLs should be removed from every combined entry.
        for text in combined:
            assert "https://" not in text
            assert "example.com" not in text

    def test_combine_text_fields_handles_nulls(self):
        """Null values in text fields are handled without raising."""
        frame = pd.DataFrame({
            'text1': ['hello', None, 'world'],
            'text2': [None, 'foo', 'bar'],
        })
        combined = combine_text_fields(frame, ['text1', 'text2'])
        assert len(combined) == 3
        # Nulls must be coerced to strings, never propagated as NaN/None.
        for text in combined:
            assert isinstance(text, str)
@pytest.mark.unit
class TestTfidfExtraction:
    """Unit tests for TF-IDF feature extraction."""

    def test_extract_tfidf_features_returns_correct_shape(self, sample_dataframe):
        """Row count matches the frame; columns are capped by max_features."""
        matrix, vectorizer = extract_tfidf_features(
            sample_dataframe,
            max_features=50,
        )
        n_rows, n_cols = matrix.shape
        assert n_rows == len(sample_dataframe)
        assert n_cols <= 50  # vocabulary may be smaller than the cap
        assert isinstance(vectorizer, TfidfVectorizer)

    def test_extract_tfidf_features_returns_numpy_array(self, sample_dataframe):
        """Features come back as a dense float numpy array."""
        matrix, _ = extract_tfidf_features(sample_dataframe)
        assert isinstance(matrix, np.ndarray)
        assert matrix.dtype in (np.float64, np.float32)

    @pytest.mark.parametrize("max_features", [10, 50, 100, None])
    def test_extract_tfidf_features_respects_max_features(
        self, sample_dataframe, max_features
    ):
        """The feature count never exceeds an explicit max_features cap."""
        matrix, _ = extract_tfidf_features(
            sample_dataframe,
            max_features=max_features,
        )
        if max_features is not None:
            assert matrix.shape[1] <= max_features

    @pytest.mark.parametrize("ngram_range", [(1, 1), (1, 2), (1, 3)])
    def test_extract_tfidf_features_ngram_range(
        self, sample_dataframe, ngram_range
    ):
        """Different n-gram ranges produce valid vocabularies."""
        matrix, vectorizer = extract_tfidf_features(
            sample_dataframe,
            ngram_range=ngram_range,
            max_features=50,
        )
        assert matrix.shape[0] == len(sample_dataframe)
        terms = vectorizer.get_feature_names_out()
        if ngram_range[1] > 1:
            # Multi-word terms contain a space; a tiny vocabulary may
            # legitimately contain none at all.
            multiword = [term for term in terms if ' ' in term]
            assert multiword or len(terms) < 50

    def test_extract_tfidf_features_handles_empty_text(self):
        """Documents with empty text must not produce NaN/inf features."""
        frame = pd.DataFrame({
            'issue text': ['', 'valid text', ' '],
            'issue description': ['desc', '', 'another desc'],
        })
        matrix, vectorizer = extract_tfidf_features(frame, max_features=50)
        assert matrix.shape[0] == 3
        # isfinite rules out both NaN and +/-inf in one pass.
        assert np.isfinite(matrix).all()
@pytest.mark.unit
class TestLabelPreparation:
    """Unit tests for label preparation."""

    def test_prepare_labels_returns_binary(self, sample_dataframe):
        """Prepared labels hold only 0/1 values."""
        labels = prepare_labels(sample_dataframe)
        assert set(np.unique(labels.values)) <= {0, 1}

    def test_prepare_labels_correct_shape(self, sample_dataframe):
        """One row per issue, one column per identified label."""
        expected_cols = get_label_columns(sample_dataframe)
        labels = prepare_labels(sample_dataframe)
        assert labels.shape == (len(sample_dataframe), len(expected_cols))

    def test_prepare_labels_converts_counts_to_binary(self):
        """Any positive count collapses to 1; zeros stay 0."""
        frame = pd.DataFrame({
            'Repo Name': ['repo1', 'repo2'],
            'issue text': ['text1', 'text2'],
            'Label1': [0, 5],  # 5 should become 1
            'Label2': [3, 0],  # 3 should become 1
            'Label3': [0, 0],
        })
        binarized = prepare_labels(frame)
        assert binarized.loc[0, 'Label1'] == 0
        assert binarized.loc[0, 'Label2'] == 1
        assert binarized.loc[1, 'Label1'] == 1
        assert binarized.loc[1, 'Label2'] == 0

    def test_prepare_labels_preserves_column_names(self, sample_dataframe):
        """Column names survive the binarization unchanged."""
        expected = get_label_columns(sample_dataframe)
        assert list(prepare_labels(sample_dataframe).columns) == expected
@pytest.mark.unit
class TestDatasetInfo:
    """Unit tests for dataset information extraction."""

    def test_get_dataset_info_returns_dict(self, sample_dataframe):
        """get_dataset_info produces a plain dictionary."""
        assert isinstance(get_dataset_info(sample_dataframe), dict)

    def test_get_dataset_info_contains_required_keys(self, sample_dataframe):
        """Every expected summary key is present in the result."""
        info = get_dataset_info(sample_dataframe)
        expected_keys = {
            'total_issues', 'total_columns', 'text_columns',
            'num_text_columns', 'label_columns', 'num_labels',
            'avg_labels_per_issue', 'median_labels_per_issue',
        }
        assert not expected_keys - info.keys()  # no key may be missing

    def test_get_dataset_info_correct_counts(self, sample_dataframe):
        """Basic counts reflect the frame's dimensions."""
        info = get_dataset_info(sample_dataframe)
        n_rows, n_cols = sample_dataframe.shape
        assert info['total_issues'] == n_rows
        assert info['total_columns'] == n_cols
        # The fixture carries exactly 'issue text' and 'issue description'.
        assert info['num_text_columns'] == 2

    def test_get_dataset_info_label_statistics(self, sample_dataframe):
        """Per-issue label statistics fall in a sensible range."""
        info = get_dataset_info(sample_dataframe)
        assert 0 <= info['avg_labels_per_issue'] <= info['num_labels']
        assert info['median_labels_per_issue'] >= 0
@pytest.mark.unit
@pytest.mark.requires_data
class TestDatabaseLoading:
    """Unit tests for database loading (requires temp DB)."""

    def test_load_data_from_db_returns_dataframe(self, temp_db):
        """Loading yields a non-empty DataFrame."""
        loaded = load_data_from_db(temp_db)
        assert isinstance(loaded, pd.DataFrame)
        assert len(loaded) > 0

    def test_load_data_from_db_contains_expected_columns(self, temp_db):
        """Loaded data carries the text and metadata columns."""
        loaded = load_data_from_db(temp_db)
        for expected in ('issue text', 'issue description', 'Repo Name', 'PR #'):
            assert expected in loaded.columns

    def test_load_data_from_db_nonexistent_file(self):
        """A missing database path raises rather than returning data."""
        from pathlib import Path
        missing = Path("/nonexistent/path/to/db.db")
        # Deliberately broad: could be FileNotFoundError or an sqlite3 error.
        with pytest.raises(Exception):
            load_data_from_db(missing)
@pytest.mark.unit
class TestEdgeCases:
    """Unit tests for edge cases and error handling."""

    def test_extract_tfidf_with_single_document(self):
        """TF-IDF works on a corpus of exactly one document."""
        frame = pd.DataFrame({
            'issue text': ['Single document for testing'],
            'issue description': ['Description'],
            'Label1': [1],
        })
        # min_df must be 1: a single document cannot satisfy higher counts.
        matrix, vectorizer = extract_tfidf_features(
            frame,
            max_features=50,
            min_df=1,
            max_df=1.0,
        )
        n_docs, n_terms = matrix.shape
        assert n_docs == 1
        assert n_terms > 0

    def test_extract_tfidf_with_identical_documents(self):
        """TF-IDF tolerates a corpus where all documents are identical."""
        frame = pd.DataFrame({
            'issue text': ['Same text'] * 3,
            'issue description': ['Same description'] * 3,
            'Label1': [1, 0, 1],
        })
        # Every term appears in 100% of documents, so max_df must be 1.0
        # (and min_df=1) or the whole vocabulary would be pruned away.
        matrix, _ = extract_tfidf_features(
            frame,
            max_features=50,
            min_df=1,
            max_df=1.0,
        )
        assert matrix.shape[0] == 3
        # At least one feature value must be non-zero somewhere.
        assert matrix.any()

    def test_prepare_labels_with_all_zeros(self):
        """An all-zero label column stays all zero after binarization."""
        frame = pd.DataFrame({
            'issue text': ['text1', 'text2'],
            'Label1': [0, 0],  # never assigned
            'Label2': [1, 1],
        })
        binarized = prepare_labels(frame)
        assert binarized['Label1'].sum() == 0
        assert binarized['Label2'].sum() == 2

    def test_clean_text_with_only_special_characters(self):
        """Punctuation-only input is handled without raising."""
        result = clean_github_text("!@#$%^&*()")
        # May come back empty; only the return type is guaranteed.
        assert isinstance(result, str)
|