Spaces:
Sleeping
Sleeping
| """ | |
| Unit tests for features.py module. | |
| Tests individual functions for text cleaning, feature extraction, | |
| and label preparation. | |
| """ | |
| import pytest | |
| import numpy as np | |
| import pandas as pd | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from hopcroft_skill_classification_tool_competition.features import ( | |
| clean_github_text, | |
| get_text_columns, | |
| get_label_columns, | |
| combine_text_fields, | |
| extract_tfidf_features, | |
| prepare_labels, | |
| get_dataset_info, | |
| load_data_from_db, | |
| ) | |
class TestTextCleaning:
    """Unit tests for clean_github_text: URL/HTML/code stripping, whitespace
    normalization, Porter stemming, and non-ASCII removal."""

    def test_clean_github_text_removes_urls(self):
        """URLs are stripped while the surrounding words survive (stemmed)."""
        text = "Fixed bug https://github.com/repo/issues/123 in authentication"
        cleaned = clean_github_text(text)
        assert "https://" not in cleaned
        assert "github.com" not in cleaned
        assert "fix" in cleaned.lower()  # Stemmed version of "fixed"
        assert "authent" in cleaned.lower()  # Stemmed version

    def test_clean_github_text_removes_html(self):
        """HTML tags are removed but their inner text is kept."""
        text = "Added <b>bold</b> feature with <i>italic</i> text"
        cleaned = clean_github_text(text)
        assert "<b>" not in cleaned
        assert "<i>" not in cleaned
        assert "bold" in cleaned.lower()
        # After stemming, "italic" becomes "ital"
        assert "ital" in cleaned.lower()

    def test_clean_github_text_removes_code_blocks(self):
        """Fenced markdown code blocks are removed entirely."""
        text = """Fixed bug in code:
```python
def foo():
    pass
```
"""
        cleaned = clean_github_text(text)
        assert "```" not in cleaned
        assert "python" not in cleaned
        assert "def" not in cleaned
        assert "fix" in cleaned.lower()

    def test_clean_github_text_removes_inline_code(self):
        """Backtick inline-code markers are removed."""
        text = "Updated `getUserById()` method implementation"
        cleaned = clean_github_text(text)
        assert "`" not in cleaned
        assert "method" in cleaned.lower()

    def test_clean_github_text_normalizes_whitespace(self):
        """Runs of spaces and newlines collapse to single spaces."""
        text = "Fixed    multiple   spaces and\n\n\nnewlines"
        cleaned = clean_github_text(text)
        # BUG FIX: the check must target a DOUBLE space ("  "); asserting a
        # single space is absent would fail for any multi-word result.
        assert "  " not in cleaned
        assert "\n\n" not in cleaned
        # Should be single spaces
        words = cleaned.split()
        assert len(words) == len([w for w in words if w])  # No empty strings

    # BUG FIX: this test took `text`/`expected_empty` arguments without a
    # parametrize decorator, so pytest would error with "fixture 'text' not
    # found" instead of running the test.
    @pytest.mark.parametrize(
        "text,expected_empty",
        [
            ("", True),
            (None, True),
            ("   ", True),
            ("Fixed a real bug", False),
        ],
    )
    def test_clean_github_text_empty_inputs(self, text, expected_empty):
        """Test handling of empty or null inputs."""
        cleaned = clean_github_text(text)
        assert isinstance(cleaned, str)
        if expected_empty:
            assert cleaned == "" or cleaned.isspace()
        else:
            assert len(cleaned) > 0

    def test_clean_github_text_applies_stemming(self):
        """Porter stemming reduces inflected words to their stems."""
        text = "running walked swimming"
        cleaned = clean_github_text(text)
        assert "run" in cleaned.lower()  # running -> run
        assert "walk" in cleaned.lower()  # walked -> walk
        assert "swim" in cleaned.lower()  # swimming -> swim

    def test_clean_github_text_removes_emojis(self):
        """Emojis and other non-ASCII characters are removed."""
        text = "Fixed bug 😀 with special chars"
        cleaned = clean_github_text(text)
        # Should only contain ASCII
        assert cleaned.isascii()
        assert "fix" in cleaned.lower()
class TestColumnIdentification:
    """Unit tests covering detection of text columns and label columns."""

    def test_get_text_columns_identifies_correctly(self, sample_dataframe):
        """Exactly the two standard text columns are detected."""
        found = get_text_columns(sample_dataframe)
        assert {'issue text', 'issue description'}.issubset(found)
        assert len(found) == 2

    def test_get_text_columns_handles_missing_columns(self):
        """A frame without recognized text columns yields an empty list."""
        frame = pd.DataFrame({'col1': [1, 2, 3], 'col2': ['a', 'b', 'c']})
        found = get_text_columns(frame)
        assert isinstance(found, list)
        assert not found  # no standard text columns present

    def test_get_label_columns_identifies_correctly(self, sample_dataframe):
        """Metadata and text columns are excluded; label columns included."""
        labels = get_label_columns(sample_dataframe)
        for excluded in ('Repo Name', 'PR #', 'issue text', 'issue description'):
            assert excluded not in labels
        for included in ('Language', 'Data Structure', 'Testing'):
            assert included in labels

    def test_get_label_columns_only_numeric(self, sample_dataframe):
        """Every detected label column must hold a numeric dtype."""
        labels = get_label_columns(sample_dataframe)
        assert all(
            pd.api.types.is_numeric_dtype(sample_dataframe[name])
            for name in labels
        )
class TestTextCombination:
    """Unit tests for merging multiple text columns into a single Series."""

    def test_combine_text_fields_combines_correctly(self, sample_dataframe):
        """The combined output is a string Series, one row per input row."""
        merged = combine_text_fields(
            sample_dataframe, ['issue text', 'issue description']
        )
        assert isinstance(merged, pd.Series)
        assert len(merged) == len(sample_dataframe)
        for entry in merged:
            assert isinstance(entry, str)
            # Content from both columns (stemmed) should be present.
            assert len(entry) > 0

    def test_combine_text_fields_applies_cleaning(self, sample_dataframe):
        """URLs present in the raw text must not survive combination."""
        sample_dataframe['issue text'] = [
            "Fixed https://example.com bug",
            "Added feature",
            "Updated docs",
            "Refactored code",
            "Improved tests",
        ]
        merged = combine_text_fields(sample_dataframe, ['issue text'])
        assert all("https://" not in row for row in merged)
        assert all("example.com" not in row for row in merged)

    def test_combine_text_fields_handles_nulls(self):
        """Null values in either column are tolerated without raising."""
        frame = pd.DataFrame({
            'text1': ['hello', None, 'world'],
            'text2': [None, 'foo', 'bar'],
        })
        merged = combine_text_fields(frame, ['text1', 'text2'])
        assert len(merged) == 3
        # Every entry must be a plain string, never None/NaN.
        assert all(isinstance(row, str) for row in merged)
class TestTfidfExtraction:
    """Unit tests for TF-IDF feature extraction."""

    def test_extract_tfidf_features_returns_correct_shape(self, sample_dataframe):
        """The feature matrix has one row per document, <= max_features cols."""
        features, vectorizer = extract_tfidf_features(
            sample_dataframe,
            max_features=50
        )
        assert features.shape[0] == len(sample_dataframe)
        assert features.shape[1] <= 50  # May be less if vocabulary is small
        assert isinstance(vectorizer, TfidfVectorizer)

    def test_extract_tfidf_features_returns_numpy_array(self, sample_dataframe):
        """Features come back as a dense float numpy array."""
        features, _ = extract_tfidf_features(sample_dataframe)
        assert isinstance(features, np.ndarray)
        assert features.dtype == np.float64 or features.dtype == np.float32

    # BUG FIX: `max_features` arrived as an undeclared argument, so pytest
    # reported "fixture 'max_features' not found"; a parametrize decorator
    # is required to supply it.
    @pytest.mark.parametrize("max_features", [10, 50, 100, None])
    def test_extract_tfidf_features_respects_max_features(
        self, sample_dataframe, max_features
    ):
        """Test that max_features parameter is respected."""
        features, _ = extract_tfidf_features(
            sample_dataframe,
            max_features=max_features
        )
        if max_features is not None:
            assert features.shape[1] <= max_features

    # BUG FIX: `ngram_range` likewise needed a parametrize decorator.
    @pytest.mark.parametrize("ngram_range", [(1, 1), (1, 2), (2, 2)])
    def test_extract_tfidf_features_ngram_range(
        self, sample_dataframe, ngram_range
    ):
        """Test different n-gram ranges."""
        features, vectorizer = extract_tfidf_features(
            sample_dataframe,
            ngram_range=ngram_range,
            max_features=50
        )
        assert features.shape[0] == len(sample_dataframe)
        vocab = vectorizer.get_feature_names_out()
        # Check that n-grams are present if range includes them
        if ngram_range[1] > 1:
            # Should have some bigrams (words with a space in them)
            bigrams = [term for term in vocab if ' ' in term]
            assert len(bigrams) > 0 or len(vocab) < 50  # May not have bigrams if vocab is small

    def test_extract_tfidf_features_handles_empty_text(self):
        """Documents with empty text yield finite, NaN-free features."""
        df = pd.DataFrame({
            'issue text': ['', 'valid text', ' '],
            'issue description': ['desc', '', 'another desc']
        })
        features, vectorizer = extract_tfidf_features(df, max_features=50)
        # Should not raise error
        assert features.shape[0] == 3
        assert not np.any(np.isnan(features))
        assert not np.any(np.isinf(features))
class TestLabelPreparation:
    """Unit tests for converting label count columns to a binary matrix."""

    def test_prepare_labels_returns_binary(self, sample_dataframe):
        """Only 0 and 1 may appear in the prepared label matrix."""
        binary = prepare_labels(sample_dataframe)
        distinct = np.unique(binary.values)
        assert set(distinct).issubset({0, 1})

    def test_prepare_labels_correct_shape(self, sample_dataframe):
        """Shape is (num_rows, num_label_columns)."""
        expected_cols = get_label_columns(sample_dataframe)
        binary = prepare_labels(sample_dataframe)
        assert binary.shape == (len(sample_dataframe), len(expected_cols))

    def test_prepare_labels_converts_counts_to_binary(self):
        """Any positive count collapses to 1; zero stays 0."""
        frame = pd.DataFrame({
            'Repo Name': ['repo1', 'repo2'],
            'issue text': ['text1', 'text2'],
            'Label1': [0, 5],  # 5 should become 1
            'Label2': [3, 0],  # 3 should become 1
            'Label3': [0, 0],
        })
        binary = prepare_labels(frame)
        assert binary.loc[0, 'Label1'] == 0
        assert binary.loc[0, 'Label2'] == 1
        assert binary.loc[1, 'Label1'] == 1
        assert binary.loc[1, 'Label2'] == 0

    def test_prepare_labels_preserves_column_names(self, sample_dataframe):
        """Output columns match get_label_columns order exactly."""
        expected_cols = get_label_columns(sample_dataframe)
        binary = prepare_labels(sample_dataframe)
        assert list(binary.columns) == expected_cols
class TestDatasetInfo:
    """Unit tests for the get_dataset_info summary dictionary."""

    def test_get_dataset_info_returns_dict(self, sample_dataframe):
        """The summary is a plain dictionary."""
        assert isinstance(get_dataset_info(sample_dataframe), dict)

    def test_get_dataset_info_contains_required_keys(self, sample_dataframe):
        """Every documented summary key must be present."""
        summary = get_dataset_info(sample_dataframe)
        expected = {
            'total_issues', 'total_columns', 'text_columns',
            'num_text_columns', 'label_columns', 'num_labels',
            'avg_labels_per_issue', 'median_labels_per_issue',
        }
        assert expected.issubset(summary)

    def test_get_dataset_info_correct_counts(self, sample_dataframe):
        """Row, column, and text-column counts mirror the frame."""
        summary = get_dataset_info(sample_dataframe)
        rows, cols = sample_dataframe.shape
        assert summary['total_issues'] == rows
        assert summary['total_columns'] == cols
        assert summary['num_text_columns'] == 2  # issue text and description

    def test_get_dataset_info_label_statistics(self, sample_dataframe):
        """Label averages/medians are non-negative; avg bounded by num_labels."""
        summary = get_dataset_info(sample_dataframe)
        assert summary['avg_labels_per_issue'] >= 0
        assert summary['median_labels_per_issue'] >= 0
        assert summary['avg_labels_per_issue'] <= summary['num_labels']
class TestDatabaseLoading:
    """Unit tests for loading issue data from a temporary database fixture."""

    def test_load_data_from_db_returns_dataframe(self, temp_db):
        """Loading yields a non-empty DataFrame."""
        loaded = load_data_from_db(temp_db)
        assert isinstance(loaded, pd.DataFrame)
        assert len(loaded) > 0

    def test_load_data_from_db_contains_expected_columns(self, temp_db):
        """All expected metadata and text columns are present."""
        loaded = load_data_from_db(temp_db)
        for column in ('issue text', 'issue description', 'Repo Name', 'PR #'):
            assert column in loaded.columns

    def test_load_data_from_db_nonexistent_file(self):
        """A missing database path raises some exception."""
        from pathlib import Path

        missing = Path("/nonexistent/path/to/db.db")
        # Could be FileNotFoundError or an sqlite3 error, hence Exception.
        with pytest.raises(Exception):
            load_data_from_db(missing)
class TestEdgeCases:
    """Unit tests for edge cases and error handling."""

    def test_extract_tfidf_with_single_document(self):
        """TF-IDF extraction works on a corpus of exactly one document."""
        frame = pd.DataFrame({
            'issue text': ['Single document for testing'],
            'issue description': ['Description'],
            'Label1': [1],
        })
        # min_df=1 is required for a one-document corpus.
        matrix, fitted = extract_tfidf_features(
            frame,
            max_features=50,
            min_df=1,
            max_df=1.0,
        )
        assert matrix.shape[0] == 1
        assert matrix.shape[1] > 0

    def test_extract_tfidf_with_identical_documents(self):
        """Identical documents still produce a non-zero feature matrix."""
        frame = pd.DataFrame({
            'issue text': ['Same text'] * 3,
            'issue description': ['Same description'] * 3,
            'Label1': [1, 0, 1],
        })
        # max_df=1.0 keeps terms appearing in 100% of documents, and
        # min_df=1 keeps terms even though every document is identical.
        matrix, _ = extract_tfidf_features(
            frame,
            max_features=50,
            min_df=1,
            max_df=1.0,
        )
        # All documents should have similar (but not necessarily identical)
        # features.
        assert matrix.shape[0] == 3
        assert not np.all(matrix == 0)

    def test_prepare_labels_with_all_zeros(self):
        """A label column of all zeros stays all zeros after binarization."""
        frame = pd.DataFrame({
            'issue text': ['text1', 'text2'],
            'Label1': [0, 0],  # all zeros
            'Label2': [1, 1],
        })
        binary = prepare_labels(frame)
        assert binary['Label1'].sum() == 0
        assert binary['Label2'].sum() == 2

    def test_clean_text_with_only_special_characters(self):
        """Pure punctuation input is handled gracefully (result may be empty)."""
        cleaned = clean_github_text("!@#$%^&*()")
        assert isinstance(cleaned, str)