|
|
""" |
|
|
Unit tests for features.py module. |
|
|
|
|
|
Tests individual functions for text cleaning, feature extraction, |
|
|
and label preparation. |
|
|
""" |
|
|
import pytest |
|
|
import numpy as np |
|
|
import pandas as pd |
|
|
from sklearn.feature_extraction.text import TfidfVectorizer |
|
|
|
|
|
from hopcroft_skill_classification_tool_competition.features import ( |
|
|
clean_github_text, |
|
|
get_text_columns, |
|
|
get_label_columns, |
|
|
combine_text_fields, |
|
|
extract_tfidf_features, |
|
|
prepare_labels, |
|
|
get_dataset_info, |
|
|
load_data_from_db, |
|
|
) |
|
|
|
|
|
|
|
|
@pytest.mark.unit
class TestTextCleaning:
    """Unit tests for the clean_github_text text-cleaning function."""

    def test_clean_github_text_removes_urls(self):
        """URLs are stripped while surrounding (stemmed) words survive."""
        text = "Fixed bug https://github.com/repo/issues/123 in authentication"
        cleaned = clean_github_text(text)

        assert "https://" not in cleaned
        assert "github.com" not in cleaned
        # Stem-prefix checks: the cleaner stems words ("fixed" -> "fix").
        assert "fix" in cleaned.lower()
        assert "authent" in cleaned.lower()

    def test_clean_github_text_removes_html(self):
        """HTML tags are removed but their inner text is kept."""
        text = "Added <b>bold</b> feature with <i>italic</i> text"
        cleaned = clean_github_text(text)

        assert "<b>" not in cleaned
        assert "<i>" not in cleaned
        assert "bold" in cleaned.lower()
        assert "ital" in cleaned.lower()

    def test_clean_github_text_removes_code_blocks(self):
        """Fenced markdown code blocks are dropped entirely."""
        text = """Fixed bug in code:
```python
def foo():
    pass
```
"""
        cleaned = clean_github_text(text)

        assert "```" not in cleaned
        assert "python" not in cleaned
        assert "def" not in cleaned
        assert "fix" in cleaned.lower()

    def test_clean_github_text_removes_inline_code(self):
        """Backtick inline-code markers are removed."""
        text = "Updated `getUserById()` method implementation"
        cleaned = clean_github_text(text)

        assert "`" not in cleaned
        assert "method" in cleaned.lower()

    def test_clean_github_text_normalizes_whitespace(self):
        """Runs of spaces and newlines collapse to single separators."""
        # Input deliberately contains repeated spaces and blank lines.
        text = "Fixed  multiple   spaces and\n\n\nnewlines"
        cleaned = clean_github_text(text)

        # Fixed: the original asserted `" " not in cleaned`, which can never
        # hold for multi-word output; the intent is "no *double* spaces".
        assert "  " not in cleaned
        assert "\n\n" not in cleaned

        # str.split() never yields empty tokens, so every word is non-empty.
        words = cleaned.split()
        assert all(words)

    @pytest.mark.parametrize("text,expected_empty", [
        ("", True),
        (None, True),
        ("   ", True),
        ("\n\n", True),
        ("a", False),
    ])
    def test_clean_github_text_empty_inputs(self, text, expected_empty):
        """Empty or null inputs always yield a string (possibly empty)."""
        cleaned = clean_github_text(text)
        assert isinstance(cleaned, str)

        if expected_empty:
            assert cleaned == "" or cleaned.isspace()
        else:
            assert len(cleaned) > 0

    def test_clean_github_text_applies_stemming(self):
        """Inflected words are reduced to their stems."""
        text = "running walked swimming"
        cleaned = clean_github_text(text)

        assert "run" in cleaned.lower()
        assert "walk" in cleaned.lower()
        assert "swim" in cleaned.lower()

    def test_clean_github_text_removes_emojis(self):
        """Emojis and other non-ASCII characters are stripped."""
        text = "Fixed bug 😀 with special chars"
        cleaned = clean_github_text(text)

        assert cleaned.isascii()
        assert "fix" in cleaned.lower()
|
|
|
|
|
|
|
|
@pytest.mark.unit
class TestColumnIdentification:
    """Unit tests for get_text_columns and get_label_columns."""

    def test_get_text_columns_identifies_correctly(self, sample_dataframe):
        """Exactly the two known text columns are detected."""
        found = get_text_columns(sample_dataframe)

        assert len(found) == 2
        assert 'issue text' in found
        assert 'issue description' in found

    def test_get_text_columns_handles_missing_columns(self):
        """A frame with no recognized text columns yields an empty list."""
        frame = pd.DataFrame({'col1': [1, 2, 3], 'col2': ['a', 'b', 'c']})
        found = get_text_columns(frame)

        assert isinstance(found, list)
        assert not found

    def test_get_label_columns_identifies_correctly(self, sample_dataframe):
        """Metadata/text columns are excluded; skill columns are included."""
        found = get_label_columns(sample_dataframe)

        # Non-label columns must be filtered out.
        for excluded in ('Repo Name', 'PR #', 'issue text', 'issue description'):
            assert excluded not in found

        # Known skill labels must be present.
        for included in ('Language', 'Data Structure', 'Testing'):
            assert included in found

    def test_get_label_columns_only_numeric(self, sample_dataframe):
        """Every detected label column holds numeric data."""
        assert all(
            pd.api.types.is_numeric_dtype(sample_dataframe[col])
            for col in get_label_columns(sample_dataframe)
        )
|
|
|
|
|
|
|
|
@pytest.mark.unit
class TestTextCombination:
    """Unit tests for combine_text_fields."""

    def test_combine_text_fields_combines_correctly(self, sample_dataframe):
        """Combining two text columns yields one non-empty string per row."""
        combined = combine_text_fields(
            sample_dataframe, ['issue text', 'issue description']
        )

        assert isinstance(combined, pd.Series)
        assert len(combined) == len(sample_dataframe)

        for entry in combined:
            assert isinstance(entry, str)
            assert len(entry) > 0

    def test_combine_text_fields_applies_cleaning(self, sample_dataframe):
        """URL removal (part of cleaning) happens during combination."""
        sample_dataframe['issue text'] = [
            "Fixed https://example.com bug",
            "Added feature",
            "Updated docs",
            "Refactored code",
            "Improved tests",
        ]

        combined = combine_text_fields(sample_dataframe, ['issue text'])

        for entry in combined:
            assert "https://" not in entry
            assert "example.com" not in entry

    def test_combine_text_fields_handles_nulls(self):
        """None values are tolerated and still produce string output."""
        frame = pd.DataFrame({
            'text1': ['hello', None, 'world'],
            'text2': [None, 'foo', 'bar'],
        })

        combined = combine_text_fields(frame, ['text1', 'text2'])

        assert len(combined) == 3
        assert all(isinstance(entry, str) for entry in combined)
|
|
|
|
|
|
|
|
@pytest.mark.unit
class TestTfidfExtraction:
    """Unit tests for extract_tfidf_features."""

    def test_extract_tfidf_features_returns_correct_shape(self, sample_dataframe):
        """One row per document, at most max_features columns."""
        matrix, vec = extract_tfidf_features(sample_dataframe, max_features=50)

        assert isinstance(vec, TfidfVectorizer)
        assert matrix.shape[0] == len(sample_dataframe)
        assert matrix.shape[1] <= 50

    def test_extract_tfidf_features_returns_numpy_array(self, sample_dataframe):
        """Features come back as a dense float numpy array."""
        matrix, _ = extract_tfidf_features(sample_dataframe)

        assert isinstance(matrix, np.ndarray)
        assert matrix.dtype in (np.float64, np.float32)

    @pytest.mark.parametrize("max_features", [10, 50, 100, None])
    def test_extract_tfidf_features_respects_max_features(
        self, sample_dataframe, max_features
    ):
        """The vocabulary never exceeds the requested cap."""
        matrix, _ = extract_tfidf_features(
            sample_dataframe, max_features=max_features
        )

        if max_features is not None:
            assert matrix.shape[1] <= max_features

    @pytest.mark.parametrize("ngram_range", [(1, 1), (1, 2), (1, 3)])
    def test_extract_tfidf_features_ngram_range(
        self, sample_dataframe, ngram_range
    ):
        """N-gram settings are honored by the fitted vectorizer."""
        matrix, vec = extract_tfidf_features(
            sample_dataframe,
            ngram_range=ngram_range,
            max_features=50,
        )

        assert matrix.shape[0] == len(sample_dataframe)
        vocab = vec.get_feature_names_out()

        if ngram_range[1] > 1:
            # Multi-word terms contain a space; their absence is acceptable
            # only when the vocabulary is too small to have produced any.
            multiword = [term for term in vocab if ' ' in term]
            assert multiword or len(vocab) < 50

    def test_extract_tfidf_features_handles_empty_text(self):
        """Blank documents produce finite (non-NaN, non-inf) feature rows."""
        frame = pd.DataFrame({
            'issue text': ['', 'valid text', '   '],
            'issue description': ['desc', '', 'another desc'],
        })

        matrix, _ = extract_tfidf_features(frame, max_features=50)

        assert matrix.shape[0] == 3
        assert np.isfinite(matrix).all()
|
|
|
|
|
|
|
|
@pytest.mark.unit
class TestLabelPreparation:
    """Unit tests for prepare_labels."""

    def test_prepare_labels_returns_binary(self, sample_dataframe):
        """Every prepared label value is either 0 or 1."""
        prepared = prepare_labels(sample_dataframe)

        assert set(np.unique(prepared.values)) <= {0, 1}

    def test_prepare_labels_correct_shape(self, sample_dataframe):
        """Output has one row per issue and one column per label."""
        expected_cols = get_label_columns(sample_dataframe)
        prepared = prepare_labels(sample_dataframe)

        assert prepared.shape[0] == len(sample_dataframe)
        assert prepared.shape[1] == len(expected_cols)

    def test_prepare_labels_converts_counts_to_binary(self):
        """Any positive label count maps to 1; zero stays 0."""
        frame = pd.DataFrame({
            'Repo Name': ['repo1', 'repo2'],
            'issue text': ['text1', 'text2'],
            'Label1': [0, 5],
            'Label2': [3, 0],
            'Label3': [0, 0],
        })

        prepared = prepare_labels(frame)

        assert prepared.loc[0, 'Label1'] == 0
        assert prepared.loc[0, 'Label2'] == 1
        assert prepared.loc[1, 'Label1'] == 1
        assert prepared.loc[1, 'Label2'] == 0

    def test_prepare_labels_preserves_column_names(self, sample_dataframe):
        """Column names survive binarization untouched."""
        expected = get_label_columns(sample_dataframe)

        assert list(prepare_labels(sample_dataframe).columns) == expected
|
|
|
|
|
|
|
|
@pytest.mark.unit
class TestDatasetInfo:
    """Unit tests for get_dataset_info."""

    def test_get_dataset_info_returns_dict(self, sample_dataframe):
        """The summary is returned as a plain dictionary."""
        assert isinstance(get_dataset_info(sample_dataframe), dict)

    def test_get_dataset_info_contains_required_keys(self, sample_dataframe):
        """All documented summary keys are present."""
        summary = get_dataset_info(sample_dataframe)

        missing = {
            'total_issues', 'total_columns', 'text_columns',
            'num_text_columns', 'label_columns', 'num_labels',
            'avg_labels_per_issue', 'median_labels_per_issue',
        } - summary.keys()
        assert not missing

    def test_get_dataset_info_correct_counts(self, sample_dataframe):
        """Row, column, and text-column counts match the frame."""
        summary = get_dataset_info(sample_dataframe)

        assert summary['total_issues'] == len(sample_dataframe)
        assert summary['total_columns'] == len(sample_dataframe.columns)
        assert summary['num_text_columns'] == 2

    def test_get_dataset_info_label_statistics(self, sample_dataframe):
        """Label statistics are non-negative and bounded by the label count."""
        summary = get_dataset_info(sample_dataframe)

        assert 0 <= summary['avg_labels_per_issue'] <= summary['num_labels']
        assert summary['median_labels_per_issue'] >= 0
|
|
|
|
|
|
|
|
@pytest.mark.unit
@pytest.mark.requires_data
class TestDatabaseLoading:
    """Unit tests for load_data_from_db (backed by the temp_db fixture)."""

    def test_load_data_from_db_returns_dataframe(self, temp_db):
        """Loading yields a non-empty DataFrame."""
        loaded = load_data_from_db(temp_db)

        assert isinstance(loaded, pd.DataFrame)
        assert not loaded.empty

    def test_load_data_from_db_contains_expected_columns(self, temp_db):
        """The canonical issue/metadata columns are all present."""
        loaded = load_data_from_db(temp_db)

        for expected in ('issue text', 'issue description', 'Repo Name', 'PR #'):
            assert expected in loaded.columns

    def test_load_data_from_db_nonexistent_file(self):
        """A missing database path raises rather than returning junk."""
        from pathlib import Path

        with pytest.raises(Exception):
            load_data_from_db(Path("/nonexistent/path/to/db.db"))
|
|
|
|
|
|
|
|
@pytest.mark.unit
class TestEdgeCases:
    """Unit tests for edge cases and error handling."""

    def test_extract_tfidf_with_single_document(self):
        """A one-row corpus still yields a non-empty feature row."""
        frame = pd.DataFrame({
            'issue text': ['Single document for testing'],
            'issue description': ['Description'],
            'Label1': [1],
        })

        # min_df/max_df relaxed so a single document is a valid corpus.
        matrix, _ = extract_tfidf_features(
            frame,
            max_features=50,
            min_df=1,
            max_df=1.0,
        )

        assert matrix.shape[0] == 1
        assert matrix.shape[1] > 0

    def test_extract_tfidf_with_identical_documents(self):
        """Duplicate documents do not collapse to all-zero features."""
        frame = pd.DataFrame({
            'issue text': ['Same text'] * 3,
            'issue description': ['Same description'] * 3,
            'Label1': [1, 0, 1],
        })

        matrix, _ = extract_tfidf_features(
            frame,
            max_features=50,
            min_df=1,
            max_df=1.0,
        )

        assert matrix.shape[0] == 3
        assert matrix.any()

    def test_prepare_labels_with_all_zeros(self):
        """A label column that is all zeros stays all zeros."""
        frame = pd.DataFrame({
            'issue text': ['text1', 'text2'],
            'Label1': [0, 0],
            'Label2': [1, 1],
        })

        prepared = prepare_labels(frame)

        assert prepared['Label1'].sum() == 0
        assert prepared['Label2'].sum() == 2

    def test_clean_text_with_only_special_characters(self):
        """Pure punctuation input still returns a string (possibly empty)."""
        cleaned = clean_github_text("!@#$%^&*()")

        assert isinstance(cleaned, str)
|
|
|