# NOTE(review): the three lines below were GitHub UI metadata accidentally pasted
# into the file (author "DaCrow13", commit message "Deploy to HF Spaces (Clean)",
# commit hash 225af6a). Left as a comment so the module stays valid Python.
"""
Unit tests for features.py module.
Tests individual functions for text cleaning, feature extraction,
and label preparation.
"""
import pytest
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from hopcroft_skill_classification_tool_competition.features import (
clean_github_text,
get_text_columns,
get_label_columns,
combine_text_fields,
extract_tfidf_features,
prepare_labels,
get_dataset_info,
load_data_from_db,
)
@pytest.mark.unit
class TestTextCleaning:
    """Unit tests for text cleaning functionality."""

    def test_clean_github_text_removes_urls(self):
        """Test that URLs are removed from text."""
        text = "Fixed bug https://github.com/repo/issues/123 in authentication"
        cleaned = clean_github_text(text)
        assert "https://" not in cleaned
        assert "github.com" not in cleaned
        assert "fix" in cleaned.lower()  # Stemmed version of "fixed"
        assert "authent" in cleaned.lower()  # Stemmed version

    def test_clean_github_text_removes_html(self):
        """Test that HTML tags are removed."""
        text = "Added <b>bold</b> feature with <i>italic</i> text"
        cleaned = clean_github_text(text)
        assert "<b>" not in cleaned
        assert "<i>" not in cleaned
        assert "bold" in cleaned.lower()
        # After stemming, "italic" becomes "ital"
        assert "ital" in cleaned.lower()

    def test_clean_github_text_removes_code_blocks(self):
        """Test that markdown code blocks are removed."""
        text = """Fixed bug in code:
        ```python
        def foo():
            pass
        ```
        """
        cleaned = clean_github_text(text)
        assert "```" not in cleaned
        assert "python" not in cleaned
        assert "def" not in cleaned
        assert "fix" in cleaned.lower()

    def test_clean_github_text_removes_inline_code(self):
        """Test that inline code markers are removed."""
        text = "Updated `getUserById()` method implementation"
        cleaned = clean_github_text(text)
        assert "`" not in cleaned
        assert "method" in cleaned.lower()

    def test_clean_github_text_normalizes_whitespace(self):
        """Test that extra whitespace is normalized."""
        # Input deliberately contains runs of spaces and blank lines so the
        # normalization path is actually exercised.
        text = "Fixed   multiple    spaces and\n\n\nnewlines"
        cleaned = clean_github_text(text)
        # No double spaces and no blank lines may survive normalization.
        # (A single-space membership check would fail any multi-word result.)
        assert "  " not in cleaned
        assert "\n\n" not in cleaned
        # str.split() with no arguments never yields empty strings, so the old
        # len(words) == len([w for w in words if w]) assertion was vacuous.
        # Require instead that cleaning kept at least one token.
        words = cleaned.split()
        assert words

    @pytest.mark.parametrize("text,expected_empty", [
        ("", True),
        (None, True),
        (" ", True),
        ("\n\n", True),
        ("a", False),
    ])
    def test_clean_github_text_empty_inputs(self, text, expected_empty):
        """Test handling of empty or null inputs."""
        cleaned = clean_github_text(text)
        assert isinstance(cleaned, str)
        if expected_empty:
            assert cleaned == "" or cleaned.isspace()
        else:
            assert len(cleaned) > 0

    def test_clean_github_text_applies_stemming(self):
        """Test that stemming is applied to words."""
        text = "running walked swimming"
        cleaned = clean_github_text(text)
        # Porter stemmer should convert to stems
        assert "run" in cleaned.lower()  # running -> run
        assert "walk" in cleaned.lower()  # walked -> walk
        assert "swim" in cleaned.lower()  # swimming -> swim

    def test_clean_github_text_removes_emojis(self):
        """Test that emojis and non-ASCII characters are removed."""
        text = "Fixed bug 😀 with special chars"
        cleaned = clean_github_text(text)
        # Should only contain ASCII
        assert cleaned.isascii()
        assert "fix" in cleaned.lower()
@pytest.mark.unit
class TestColumnIdentification:
    """Unit tests for the text/label column discovery helpers."""

    def test_get_text_columns_identifies_correctly(self, sample_dataframe):
        """Both standard free-text columns are found, and nothing else."""
        found = get_text_columns(sample_dataframe)
        assert len(found) == 2
        assert 'issue text' in found
        assert 'issue description' in found

    def test_get_text_columns_handles_missing_columns(self):
        """A frame without the standard text columns yields an empty list."""
        frame = pd.DataFrame({'col1': [1, 2, 3], 'col2': ['a', 'b', 'c']})
        found = get_text_columns(frame)
        assert isinstance(found, list)
        assert len(found) == 0  # No standard text columns found

    def test_get_label_columns_identifies_correctly(self, sample_dataframe):
        """Metadata/text columns are excluded; skill columns are kept."""
        found = get_label_columns(sample_dataframe)
        for excluded in ('Repo Name', 'PR #', 'issue text', 'issue description'):
            assert excluded not in found
        for included in ('Language', 'Data Structure', 'Testing'):
            assert included in found

    def test_get_label_columns_only_numeric(self, sample_dataframe):
        """Every discovered label column holds a numeric dtype."""
        assert all(
            pd.api.types.is_numeric_dtype(sample_dataframe[column])
            for column in get_label_columns(sample_dataframe)
        )
@pytest.mark.unit
class TestTextCombination:
    """Unit tests for text combination functionality."""

    def test_combine_text_fields_combines_correctly(self, sample_dataframe):
        """Combining two text columns yields one non-empty string per row."""
        merged = combine_text_fields(
            sample_dataframe, ['issue text', 'issue description']
        )
        assert isinstance(merged, pd.Series)
        assert len(merged) == len(sample_dataframe)
        for entry in merged:
            assert isinstance(entry, str)
            # Content from both columns (stemmed) should be present.
            assert len(entry) > 0

    def test_combine_text_fields_applies_cleaning(self, sample_dataframe):
        """URLs in the raw text are stripped during combination."""
        # Inject dirty text so cleaning has something to remove.
        sample_dataframe['issue text'] = [
            "Fixed https://example.com bug",
            "Added feature",
            "Updated docs",
            "Refactored code",
            "Improved tests"
        ]
        merged = combine_text_fields(sample_dataframe, ['issue text'])
        for entry in merged:
            assert "https://" not in entry
            assert "example.com" not in entry

    def test_combine_text_fields_handles_nulls(self):
        """Null cells are tolerated and still produce string output."""
        frame = pd.DataFrame({
            'text1': ['hello', None, 'world'],
            'text2': [None, 'foo', 'bar']
        })
        merged = combine_text_fields(frame, ['text1', 'text2'])
        assert len(merged) == 3
        # No exception above, and every row came back as a string.
        assert all(isinstance(entry, str) for entry in merged)
@pytest.mark.unit
class TestTfidfExtraction:
    """Unit tests for TF-IDF feature extraction."""

    def test_extract_tfidf_features_returns_correct_shape(self, sample_dataframe):
        """Row count matches the frame; column count is capped by max_features."""
        matrix, vectorizer = extract_tfidf_features(
            sample_dataframe, max_features=50
        )
        rows, cols = matrix.shape
        assert rows == len(sample_dataframe)
        assert cols <= 50  # May be less if vocabulary is small
        assert isinstance(vectorizer, TfidfVectorizer)

    def test_extract_tfidf_features_returns_numpy_array(self, sample_dataframe):
        """The feature matrix is a float numpy array."""
        matrix, _ = extract_tfidf_features(sample_dataframe)
        assert isinstance(matrix, np.ndarray)
        assert matrix.dtype in (np.float64, np.float32)

    @pytest.mark.parametrize("max_features", [10, 50, 100, None])
    def test_extract_tfidf_features_respects_max_features(
        self, sample_dataframe, max_features
    ):
        """An explicit max_features cap is never exceeded."""
        matrix, _ = extract_tfidf_features(
            sample_dataframe, max_features=max_features
        )
        if max_features is not None:
            assert matrix.shape[1] <= max_features

    @pytest.mark.parametrize("ngram_range", [(1, 1), (1, 2), (1, 3)])
    def test_extract_tfidf_features_ngram_range(
        self, sample_dataframe, ngram_range
    ):
        """Different n-gram ranges still produce one row per document."""
        matrix, vectorizer = extract_tfidf_features(
            sample_dataframe, ngram_range=ngram_range, max_features=50
        )
        assert matrix.shape[0] == len(sample_dataframe)
        terms = vectorizer.get_feature_names_out()
        if ngram_range[1] > 1:
            # Multi-word terms contain a space; a tiny vocabulary may
            # legitimately contain none.
            multiword = [term for term in terms if ' ' in term]
            assert len(multiword) > 0 or len(terms) < 50

    def test_extract_tfidf_features_handles_empty_text(self):
        """Empty/blank documents neither raise nor produce NaN/inf features."""
        frame = pd.DataFrame({
            'issue text': ['', 'valid text', ' '],
            'issue description': ['desc', '', 'another desc']
        })
        matrix, vectorizer = extract_tfidf_features(frame, max_features=50)
        assert matrix.shape[0] == 3
        # Every feature value must be finite (no NaN, no +/-inf).
        assert np.isfinite(matrix).all()
@pytest.mark.unit
class TestLabelPreparation:
    """Unit tests for label preparation."""

    def test_prepare_labels_returns_binary(self, sample_dataframe):
        """Every value in the prepared label matrix is 0 or 1."""
        prepared = prepare_labels(sample_dataframe)
        assert set(np.unique(prepared.values)) <= {0, 1}

    def test_prepare_labels_correct_shape(self, sample_dataframe):
        """Shape is (rows in frame, number of discovered label columns)."""
        expected_columns = get_label_columns(sample_dataframe)
        prepared = prepare_labels(sample_dataframe)
        assert prepared.shape == (len(sample_dataframe), len(expected_columns))

    def test_prepare_labels_converts_counts_to_binary(self):
        """Counts above zero collapse to 1; zeros stay 0."""
        frame = pd.DataFrame({
            'Repo Name': ['repo1', 'repo2'],
            'issue text': ['text1', 'text2'],
            'Label1': [0, 5],  # 5 should become 1
            'Label2': [3, 0],  # 3 should become 1
            'Label3': [0, 0],
        })
        prepared = prepare_labels(frame)
        assert prepared.loc[0, 'Label1'] == 0
        assert prepared.loc[0, 'Label2'] == 1
        assert prepared.loc[1, 'Label1'] == 1
        assert prepared.loc[1, 'Label2'] == 0

    def test_prepare_labels_preserves_column_names(self, sample_dataframe):
        """Column names of the label matrix match the discovered labels."""
        prepared = prepare_labels(sample_dataframe)
        assert list(prepared.columns) == get_label_columns(sample_dataframe)
@pytest.mark.unit
class TestDatasetInfo:
    """Unit tests for dataset information extraction."""

    # Keys every summary dict is expected to expose.
    REQUIRED_KEYS = (
        'total_issues', 'total_columns', 'text_columns',
        'num_text_columns', 'label_columns', 'num_labels',
        'avg_labels_per_issue', 'median_labels_per_issue',
    )

    def test_get_dataset_info_returns_dict(self, sample_dataframe):
        """The summary comes back as a plain dictionary."""
        assert isinstance(get_dataset_info(sample_dataframe), dict)

    def test_get_dataset_info_contains_required_keys(self, sample_dataframe):
        """Every expected summary key is present."""
        summary = get_dataset_info(sample_dataframe)
        missing = [key for key in self.REQUIRED_KEYS if key not in summary]
        assert not missing

    def test_get_dataset_info_correct_counts(self, sample_dataframe):
        """Issue and column counts mirror the frame's dimensions."""
        summary = get_dataset_info(sample_dataframe)
        assert summary['total_issues'] == len(sample_dataframe)
        assert summary['total_columns'] == len(sample_dataframe.columns)
        assert summary['num_text_columns'] == 2  # issue text and description

    def test_get_dataset_info_label_statistics(self, sample_dataframe):
        """Per-issue label statistics fall within a sane range."""
        summary = get_dataset_info(sample_dataframe)
        assert 0 <= summary['avg_labels_per_issue'] <= summary['num_labels']
        assert summary['median_labels_per_issue'] >= 0
@pytest.mark.unit
@pytest.mark.requires_data
class TestDatabaseLoading:
    """Unit tests for database loading (requires temp DB)."""

    def test_load_data_from_db_returns_dataframe(self, temp_db):
        """Loading the temp DB yields a non-empty DataFrame."""
        loaded = load_data_from_db(temp_db)
        assert isinstance(loaded, pd.DataFrame)
        assert len(loaded) > 0

    def test_load_data_from_db_contains_expected_columns(self, temp_db):
        """The loaded frame exposes the expected text and metadata columns."""
        loaded = load_data_from_db(temp_db)
        for column in ('issue text', 'issue description', 'Repo Name', 'PR #'):
            assert column in loaded.columns

    def test_load_data_from_db_nonexistent_file(self):
        """A missing database path raises rather than failing silently."""
        from pathlib import Path
        missing = Path("/nonexistent/path/to/db.db")
        with pytest.raises(Exception):  # Could be FileNotFoundError or sqlite3 error
            load_data_from_db(missing)
@pytest.mark.unit
class TestEdgeCases:
    """Unit tests for edge cases and error handling."""

    def test_extract_tfidf_with_single_document(self):
        """A one-document corpus still yields a non-empty feature row."""
        frame = pd.DataFrame({
            'issue text': ['Single document for testing'],
            'issue description': ['Description'],
            'Label1': [1]
        })
        # min_df=1 is required for a single document.
        matrix, _ = extract_tfidf_features(
            frame,
            max_features=50,
            min_df=1,
            max_df=1.0
        )
        rows, cols = matrix.shape
        assert rows == 1
        assert cols > 0

    def test_extract_tfidf_with_identical_documents(self):
        """Identical documents vectorize without collapsing to all zeros."""
        frame = pd.DataFrame({
            'issue text': ['Same text'] * 3,
            'issue description': ['Same description'] * 3,
            'Label1': [1, 0, 1]
        })
        # max_df=1.0 keeps terms appearing in 100% of docs (all docs are
        # identical); min_df=1 keeps terms even when every doc contains them.
        matrix, _ = extract_tfidf_features(
            frame,
            max_features=50,
            min_df=1,
            max_df=1.0
        )
        assert matrix.shape[0] == 3
        # At least one nonzero feature must survive.
        assert matrix.any()

    def test_prepare_labels_with_all_zeros(self):
        """A label column of all zeros survives preparation as all zeros."""
        frame = pd.DataFrame({
            'issue text': ['text1', 'text2'],
            'Label1': [0, 0],  # All zeros
            'Label2': [1, 1],
        })
        prepared = prepare_labels(frame)
        assert prepared['Label1'].sum() == 0
        assert prepared['Label2'].sum() == 2

    def test_clean_text_with_only_special_characters(self):
        """Punctuation-only input is handled without raising."""
        result = clean_github_text("!@#$%^&*()")
        # May be empty or contain only ASCII equivalents.
        assert isinstance(result, str)