"""
Unit tests for features.py module.
Tests individual functions for text cleaning, feature extraction,
and label preparation.
"""
import pytest
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from hopcroft_skill_classification_tool_competition.features import (
clean_github_text,
get_text_columns,
get_label_columns,
combine_text_fields,
extract_tfidf_features,
prepare_labels,
get_dataset_info,
load_data_from_db,
)
@pytest.mark.unit
class TestTextCleaning:
    """Unit tests for text cleaning functionality."""

    def test_clean_github_text_removes_urls(self):
        """Test that URLs are removed from text."""
        text = "Fixed bug https://github.com/repo/issues/123 in authentication"
        cleaned = clean_github_text(text)
        assert "https://" not in cleaned
        assert "github.com" not in cleaned
        assert "fix" in cleaned.lower()  # Stemmed version of "fixed"
        assert "authent" in cleaned.lower()  # Stemmed version

    def test_clean_github_text_removes_html(self):
        """Test that HTML tags are removed."""
        # FIX: the fixture had lost its HTML tags and the assertions read
        # `"" not in cleaned` -- the empty string is a substring of every
        # string, so this test could never pass. Restore real tags and
        # assert that those tags are stripped.
        text = "Added <b>bold</b> feature with <i>italic</i> text"
        cleaned = clean_github_text(text)
        assert "<b>" not in cleaned
        assert "<i>" not in cleaned
        assert "bold" in cleaned.lower()
        # After stemming, "italic" becomes "ital"
        assert "ital" in cleaned.lower()

    def test_clean_github_text_removes_code_blocks(self):
        """Test that markdown code blocks are removed."""
        text = """Fixed bug in code:
        ```python
        def foo():
            pass
        ```
        """
        cleaned = clean_github_text(text)
        assert "```" not in cleaned
        assert "python" not in cleaned
        assert "def" not in cleaned
        assert "fix" in cleaned.lower()

    def test_clean_github_text_removes_inline_code(self):
        """Test that inline code markers are removed."""
        text = "Updated `getUserById()` method implementation"
        cleaned = clean_github_text(text)
        assert "`" not in cleaned
        assert "method" in cleaned.lower()

    def test_clean_github_text_normalizes_whitespace(self):
        """Test that extra whitespace is normalized."""
        # FIX: the assertion previously checked `" " not in cleaned`
        # (a single space), which fails for ANY multi-word result. The
        # intent is that runs of whitespace collapse to single spaces,
        # so use a literal with real space runs and check that no
        # double space survives.
        text = "Fixed   multiple    spaces and\n\n\nnewlines"
        cleaned = clean_github_text(text)
        assert "  " not in cleaned
        assert "\n\n" not in cleaned
        # Should be single spaces
        words = cleaned.split()
        assert len(words) == len([w for w in words if w])  # No empty strings

    @pytest.mark.parametrize("text,expected_empty", [
        ("", True),
        (None, True),
        ("   ", True),
        ("\n\n", True),
        ("a", False),
    ])
    def test_clean_github_text_empty_inputs(self, text, expected_empty):
        """Test handling of empty or null inputs."""
        cleaned = clean_github_text(text)
        assert isinstance(cleaned, str)
        if expected_empty:
            assert cleaned == "" or cleaned.isspace()
        else:
            assert len(cleaned) > 0

    def test_clean_github_text_applies_stemming(self):
        """Test that stemming is applied to words."""
        text = "running walked swimming"
        cleaned = clean_github_text(text)
        # Porter stemmer should convert to stems
        assert "run" in cleaned.lower()  # running -> run
        assert "walk" in cleaned.lower()  # walked -> walk
        assert "swim" in cleaned.lower()  # swimming -> swim

    def test_clean_github_text_removes_emojis(self):
        """Test that emojis and non-ASCII characters are removed."""
        text = "Fixed bug 😀 with special chars"
        cleaned = clean_github_text(text)
        # Should only contain ASCII
        assert cleaned.isascii()
        assert "fix" in cleaned.lower()
@pytest.mark.unit
class TestColumnIdentification:
    """Unit tests for column identification functions."""

    def test_get_text_columns_identifies_correctly(self, sample_dataframe):
        """The two standard text columns are found, and nothing else."""
        found = get_text_columns(sample_dataframe)
        assert 'issue text' in found
        assert 'issue description' in found
        assert len(found) == 2

    def test_get_text_columns_handles_missing_columns(self):
        """A frame without the standard text columns yields an empty list."""
        frame = pd.DataFrame({'col1': [1, 2, 3], 'col2': ['a', 'b', 'c']})
        found = get_text_columns(frame)
        assert isinstance(found, list)
        assert len(found) == 0  # No standard text columns found

    def test_get_label_columns_identifies_correctly(self, sample_dataframe):
        """Metadata/text columns are excluded; skill columns are included."""
        found = get_label_columns(sample_dataframe)
        # Metadata and free-text columns must never be labels
        for excluded in ('Repo Name', 'PR #', 'issue text', 'issue description'):
            assert excluded not in found
        # Known skill label columns must be picked up
        for included in ('Language', 'Data Structure', 'Testing'):
            assert included in found

    def test_get_label_columns_only_numeric(self, sample_dataframe):
        """Every identified label column must hold numeric data."""
        assert all(
            pd.api.types.is_numeric_dtype(sample_dataframe[col])
            for col in get_label_columns(sample_dataframe)
        )
@pytest.mark.unit
class TestTextCombination:
    """Unit tests for text combination functionality."""

    def test_combine_text_fields_combines_correctly(self, sample_dataframe):
        """Combining two text columns yields one non-empty string per row."""
        merged = combine_text_fields(
            sample_dataframe, ['issue text', 'issue description']
        )
        assert len(merged) == len(sample_dataframe)
        assert isinstance(merged, pd.Series)
        # Every row is a string carrying (stemmed) content from both columns
        for entry in merged:
            assert isinstance(entry, str)
            assert len(entry) > 0

    def test_combine_text_fields_applies_cleaning(self, sample_dataframe):
        """URLs present in the raw text must not survive combination."""
        # Inject dirty text containing a URL
        sample_dataframe['issue text'] = [
            "Fixed https://example.com bug",
            "Added feature",
            "Updated docs",
            "Refactored code",
            "Improved tests"
        ]
        merged = combine_text_fields(sample_dataframe, ['issue text'])
        for entry in merged:
            assert "https://" not in entry
            assert "example.com" not in entry

    def test_combine_text_fields_handles_nulls(self):
        """None values in either column are tolerated and still yield strings."""
        frame = pd.DataFrame({
            'text1': ['hello', None, 'world'],
            'text2': [None, 'foo', 'bar']
        })
        merged = combine_text_fields(frame, ['text1', 'text2'])
        assert len(merged) == 3
        # No error raised, and nulls are handled gracefully
        assert all(isinstance(entry, str) for entry in merged)
@pytest.mark.unit
class TestTfidfExtraction:
    """Unit tests for TF-IDF feature extraction."""

    def test_extract_tfidf_features_returns_correct_shape(self, sample_dataframe):
        """Row count matches the frame; columns are capped by max_features."""
        matrix, fitted = extract_tfidf_features(
            sample_dataframe,
            max_features=50
        )
        assert matrix.shape[0] == len(sample_dataframe)
        assert matrix.shape[1] <= 50  # vocabulary may be smaller than the cap
        assert isinstance(fitted, TfidfVectorizer)

    def test_extract_tfidf_features_returns_numpy_array(self, sample_dataframe):
        """Features come back as a float numpy array."""
        matrix, _ = extract_tfidf_features(sample_dataframe)
        assert isinstance(matrix, np.ndarray)
        assert matrix.dtype in (np.float64, np.float32)

    @pytest.mark.parametrize("max_features", [10, 50, 100, None])
    def test_extract_tfidf_features_respects_max_features(
        self, sample_dataframe, max_features
    ):
        """An explicit max_features cap is never exceeded."""
        matrix, _ = extract_tfidf_features(
            sample_dataframe,
            max_features=max_features
        )
        if max_features is not None:
            assert matrix.shape[1] <= max_features

    @pytest.mark.parametrize("ngram_range", [(1, 1), (1, 2), (1, 3)])
    def test_extract_tfidf_features_ngram_range(
        self, sample_dataframe, ngram_range
    ):
        """Requesting n-grams beyond unigrams should yield multi-word terms."""
        matrix, fitted = extract_tfidf_features(
            sample_dataframe,
            ngram_range=ngram_range,
            max_features=50
        )
        assert matrix.shape[0] == len(sample_dataframe)
        terms = fitted.get_feature_names_out()
        if ngram_range[1] > 1:
            # Multi-word terms contain a space; a tiny vocabulary may
            # legitimately contain none
            multiword = [term for term in terms if ' ' in term]
            assert len(multiword) > 0 or len(terms) < 50

    def test_extract_tfidf_features_handles_empty_text(self):
        """Blank documents must not raise or produce NaN/inf features."""
        frame = pd.DataFrame({
            'issue text': ['', 'valid text', '   '],
            'issue description': ['desc', '', 'another desc']
        })
        matrix, fitted = extract_tfidf_features(frame, max_features=50)
        assert matrix.shape[0] == 3
        assert not np.any(np.isnan(matrix))
        assert not np.any(np.isinf(matrix))
@pytest.mark.unit
class TestLabelPreparation:
    """Unit tests for label preparation."""

    def test_prepare_labels_returns_binary(self, sample_dataframe):
        """Prepared labels contain only the values 0 and 1."""
        binary = prepare_labels(sample_dataframe)
        assert set(np.unique(binary.values)).issubset({0, 1})

    def test_prepare_labels_correct_shape(self, sample_dataframe):
        """The label matrix is (rows, number of label columns)."""
        expected_cols = get_label_columns(sample_dataframe)
        binary = prepare_labels(sample_dataframe)
        assert binary.shape[0] == len(sample_dataframe)
        assert binary.shape[1] == len(expected_cols)

    def test_prepare_labels_converts_counts_to_binary(self):
        """Any positive count maps to 1; zeros stay 0."""
        frame = pd.DataFrame({
            'Repo Name': ['repo1', 'repo2'],
            'issue text': ['text1', 'text2'],
            'Label1': [0, 5],  # 5 should become 1
            'Label2': [3, 0],  # 3 should become 1
            'Label3': [0, 0],
        })
        binary = prepare_labels(frame)
        assert binary.loc[0, 'Label1'] == 0
        assert binary.loc[0, 'Label2'] == 1
        assert binary.loc[1, 'Label1'] == 1
        assert binary.loc[1, 'Label2'] == 0

    def test_prepare_labels_preserves_column_names(self, sample_dataframe):
        """Label column names survive binarization unchanged and in order."""
        expected_cols = get_label_columns(sample_dataframe)
        binary = prepare_labels(sample_dataframe)
        assert list(binary.columns) == expected_cols
@pytest.mark.unit
class TestDatasetInfo:
    """Unit tests for dataset information extraction."""

    def test_get_dataset_info_returns_dict(self, sample_dataframe):
        """The summary is a plain dictionary."""
        assert isinstance(get_dataset_info(sample_dataframe), dict)

    def test_get_dataset_info_contains_required_keys(self, sample_dataframe):
        """Every documented summary key is present."""
        summary = get_dataset_info(sample_dataframe)
        for key in (
            'total_issues', 'total_columns', 'text_columns',
            'num_text_columns', 'label_columns', 'num_labels',
            'avg_labels_per_issue', 'median_labels_per_issue',
        ):
            assert key in summary

    def test_get_dataset_info_correct_counts(self, sample_dataframe):
        """Row, column, and text-column counts match the frame."""
        summary = get_dataset_info(sample_dataframe)
        assert summary['total_issues'] == len(sample_dataframe)
        assert summary['total_columns'] == len(sample_dataframe.columns)
        assert summary['num_text_columns'] == 2  # issue text and description

    def test_get_dataset_info_label_statistics(self, sample_dataframe):
        """Label statistics are non-negative and bounded by the label count."""
        summary = get_dataset_info(sample_dataframe)
        assert summary['avg_labels_per_issue'] >= 0
        assert summary['median_labels_per_issue'] >= 0
        assert summary['avg_labels_per_issue'] <= summary['num_labels']
@pytest.mark.unit
@pytest.mark.requires_data
class TestDatabaseLoading:
    """Unit tests for database loading (requires temp DB)."""

    def test_load_data_from_db_returns_dataframe(self, temp_db):
        """Loading a populated database yields a non-empty DataFrame."""
        loaded = load_data_from_db(temp_db)
        assert isinstance(loaded, pd.DataFrame)
        assert len(loaded) > 0

    def test_load_data_from_db_contains_expected_columns(self, temp_db):
        """The canonical text and metadata columns are all present."""
        loaded = load_data_from_db(temp_db)
        for column in ('issue text', 'issue description', 'Repo Name', 'PR #'):
            assert column in loaded.columns

    def test_load_data_from_db_nonexistent_file(self):
        """A missing database path raises rather than returning silently."""
        from pathlib import Path
        # Could be FileNotFoundError or an sqlite3 error, so accept any
        with pytest.raises(Exception):
            load_data_from_db(Path("/nonexistent/path/to/db.db"))
@pytest.mark.unit
class TestEdgeCases:
    """Unit tests for edge cases and error handling."""

    def test_extract_tfidf_with_single_document(self):
        """A one-row corpus still yields a non-empty feature row."""
        frame = pd.DataFrame({
            'issue text': ['Single document for testing'],
            'issue description': ['Description'],
            'Label1': [1]
        })
        # min_df=1 is required when the corpus holds a single document
        matrix, fitted = extract_tfidf_features(
            frame,
            max_features=50,
            min_df=1,
            max_df=1.0
        )
        assert matrix.shape[0] == 1
        assert matrix.shape[1] > 0

    def test_extract_tfidf_with_identical_documents(self):
        """Identical documents are handled without zeroing every feature."""
        frame = pd.DataFrame({
            'issue text': ['Same text'] * 3,
            'issue description': ['Same description'] * 3,
            'Label1': [1, 0, 1]
        })
        # max_df=1.0: every term appears in 100% of the (identical) docs,
        # so the default document-frequency cutoff would drop them all;
        # min_df=1 likewise keeps terms that appear everywhere
        matrix, _ = extract_tfidf_features(
            frame,
            max_features=50,
            min_df=1,
            max_df=1.0
        )
        # Rows should be similar (not necessarily identical) and non-trivial
        assert matrix.shape[0] == 3
        assert not np.all(matrix == 0)

    def test_prepare_labels_with_all_zeros(self):
        """An all-zero label column stays all-zero after binarization."""
        frame = pd.DataFrame({
            'issue text': ['text1', 'text2'],
            'Label1': [0, 0],  # All zeros
            'Label2': [1, 1],
        })
        binary = prepare_labels(frame)
        assert binary['Label1'].sum() == 0
        assert binary['Label2'].sum() == 2

    def test_clean_text_with_only_special_characters(self):
        """Punctuation-only input is cleaned without raising."""
        cleaned = clean_github_text("!@#$%^&*()")
        # Graceful handling: result may be empty or ASCII-only residue
        assert isinstance(cleaned, str)