"""
Unit tests for features.py module.
Tests individual functions for text cleaning, feature extraction,
and label preparation.
"""
import pytest
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from hopcroft_skill_classification_tool_competition.features import (
clean_github_text,
get_text_columns,
get_label_columns,
combine_text_fields,
extract_tfidf_features,
prepare_labels,
get_dataset_info,
load_data_from_db,
)
@pytest.mark.unit
class TestTextCleaning:
    """Unit tests for text cleaning functionality."""

    def test_clean_github_text_removes_urls(self):
        """Test that URLs are removed from text."""
        text = "Fixed bug https://github.com/repo/issues/123 in authentication"
        cleaned = clean_github_text(text)
        assert "https://" not in cleaned
        assert "github.com" not in cleaned
        assert "fix" in cleaned.lower()  # Stemmed version of "fixed"
        assert "authent" in cleaned.lower()  # Stemmed version

    def test_clean_github_text_removes_html(self):
        """Test that HTML tags are removed."""
        # FIX: the fixture had lost its HTML tags and the assertions read
        # `"" not in cleaned` -- the empty string is a substring of every
        # string, so this test could never pass. Restore real tags and
        # assert that those tags are stripped.
        text = "Added <b>bold</b> feature with <i>italic</i> text"
        cleaned = clean_github_text(text)
        assert "<b>" not in cleaned
        assert "<i>" not in cleaned
        assert "bold" in cleaned.lower()
        # After stemming, "italic" becomes "ital"
        assert "ital" in cleaned.lower()

    def test_clean_github_text_removes_code_blocks(self):
        """Test that markdown code blocks are removed."""
        text = """Fixed bug in code:
        ```python
        def foo():
            pass
        ```
        """
        cleaned = clean_github_text(text)
        assert "```" not in cleaned
        assert "python" not in cleaned
        assert "def" not in cleaned
        assert "fix" in cleaned.lower()

    def test_clean_github_text_removes_inline_code(self):
        """Test that inline code markers are removed."""
        text = "Updated `getUserById()` method implementation"
        cleaned = clean_github_text(text)
        assert "`" not in cleaned
        assert "method" in cleaned.lower()

    def test_clean_github_text_normalizes_whitespace(self):
        """Test that extra whitespace is normalized."""
        # FIX: the assertion previously checked `" " not in cleaned`
        # (a single space), which fails for ANY multi-word result. The
        # intent is that runs of whitespace collapse to single spaces,
        # so use a literal with real space runs and check that no
        # double space survives.
        text = "Fixed   multiple    spaces and\n\n\nnewlines"
        cleaned = clean_github_text(text)
        assert "  " not in cleaned
        assert "\n\n" not in cleaned
        # Should be single spaces
        words = cleaned.split()
        assert len(words) == len([w for w in words if w])  # No empty strings

    @pytest.mark.parametrize("text,expected_empty", [
        ("", True),
        (None, True),
        ("   ", True),
        ("\n\n", True),
        ("a", False),
    ])
    def test_clean_github_text_empty_inputs(self, text, expected_empty):
        """Test handling of empty or null inputs."""
        cleaned = clean_github_text(text)
        assert isinstance(cleaned, str)
        if expected_empty:
            assert cleaned == "" or cleaned.isspace()
        else:
            assert len(cleaned) > 0

    def test_clean_github_text_applies_stemming(self):
        """Test that stemming is applied to words."""
        text = "running walked swimming"
        cleaned = clean_github_text(text)
        # Porter stemmer should convert to stems
        assert "run" in cleaned.lower()  # running -> run
        assert "walk" in cleaned.lower()  # walked -> walk
        assert "swim" in cleaned.lower()  # swimming -> swim

    def test_clean_github_text_removes_emojis(self):
        """Test that emojis and non-ASCII characters are removed."""
        text = "Fixed bug 😀 with special chars"
        cleaned = clean_github_text(text)
        # Should only contain ASCII
        assert cleaned.isascii()
        assert "fix" in cleaned.lower()
@pytest.mark.unit
class TestColumnIdentification:
    """Unit tests for column identification functions."""

    def test_get_text_columns_identifies_correctly(self, sample_dataframe):
        """The two standard text columns are found, and nothing else."""
        found = get_text_columns(sample_dataframe)
        assert 'issue text' in found
        assert 'issue description' in found
        assert len(found) == 2

    def test_get_text_columns_handles_missing_columns(self):
        """A frame without the standard text columns yields an empty list."""
        frame = pd.DataFrame({'col1': [1, 2, 3], 'col2': ['a', 'b', 'c']})
        found = get_text_columns(frame)
        assert isinstance(found, list)
        assert len(found) == 0  # No standard text columns found

    def test_get_label_columns_identifies_correctly(self, sample_dataframe):
        """Metadata/text columns are excluded; skill columns are included."""
        found = get_label_columns(sample_dataframe)
        # Metadata and free-text columns must never be labels
        for excluded in ('Repo Name', 'PR #', 'issue text', 'issue description'):
            assert excluded not in found
        # Known skill label columns must be picked up
        for included in ('Language', 'Data Structure', 'Testing'):
            assert included in found

    def test_get_label_columns_only_numeric(self, sample_dataframe):
        """Every identified label column must hold numeric data."""
        assert all(
            pd.api.types.is_numeric_dtype(sample_dataframe[col])
            for col in get_label_columns(sample_dataframe)
        )
@pytest.mark.unit
class TestTextCombination:
    """Unit tests for text combination functionality."""

    def test_combine_text_fields_combines_correctly(self, sample_dataframe):
        """Combining two text columns yields one non-empty string per row."""
        merged = combine_text_fields(
            sample_dataframe, ['issue text', 'issue description']
        )
        assert len(merged) == len(sample_dataframe)
        assert isinstance(merged, pd.Series)
        # Every row is a string carrying (stemmed) content from both columns
        for entry in merged:
            assert isinstance(entry, str)
            assert len(entry) > 0

    def test_combine_text_fields_applies_cleaning(self, sample_dataframe):
        """URLs present in the raw text must not survive combination."""
        # Inject dirty text containing a URL
        sample_dataframe['issue text'] = [
            "Fixed https://example.com bug",
            "Added feature",
            "Updated docs",
            "Refactored code",
            "Improved tests"
        ]
        merged = combine_text_fields(sample_dataframe, ['issue text'])
        for entry in merged:
            assert "https://" not in entry
            assert "example.com" not in entry

    def test_combine_text_fields_handles_nulls(self):
        """None values in either column are tolerated and still yield strings."""
        frame = pd.DataFrame({
            'text1': ['hello', None, 'world'],
            'text2': [None, 'foo', 'bar']
        })
        merged = combine_text_fields(frame, ['text1', 'text2'])
        assert len(merged) == 3
        # No error raised, and nulls are handled gracefully
        assert all(isinstance(entry, str) for entry in merged)
@pytest.mark.unit
class TestTfidfExtraction:
    """Unit tests for TF-IDF feature extraction."""

    def test_extract_tfidf_features_returns_correct_shape(self, sample_dataframe):
        """Row count matches the frame; columns are capped by max_features."""
        matrix, fitted = extract_tfidf_features(
            sample_dataframe,
            max_features=50
        )
        assert matrix.shape[0] == len(sample_dataframe)
        assert matrix.shape[1] <= 50  # vocabulary may be smaller than the cap
        assert isinstance(fitted, TfidfVectorizer)

    def test_extract_tfidf_features_returns_numpy_array(self, sample_dataframe):
        """Features come back as a float numpy array."""
        matrix, _ = extract_tfidf_features(sample_dataframe)
        assert isinstance(matrix, np.ndarray)
        assert matrix.dtype in (np.float64, np.float32)

    @pytest.mark.parametrize("max_features", [10, 50, 100, None])
    def test_extract_tfidf_features_respects_max_features(
        self, sample_dataframe, max_features
    ):
        """An explicit max_features cap is never exceeded."""
        matrix, _ = extract_tfidf_features(
            sample_dataframe,
            max_features=max_features
        )
        if max_features is not None:
            assert matrix.shape[1] <= max_features

    @pytest.mark.parametrize("ngram_range", [(1, 1), (1, 2), (1, 3)])
    def test_extract_tfidf_features_ngram_range(
        self, sample_dataframe, ngram_range
    ):
        """Requesting n-grams beyond unigrams should yield multi-word terms."""
        matrix, fitted = extract_tfidf_features(
            sample_dataframe,
            ngram_range=ngram_range,
            max_features=50
        )
        assert matrix.shape[0] == len(sample_dataframe)
        terms = fitted.get_feature_names_out()
        if ngram_range[1] > 1:
            # Multi-word terms contain a space; a tiny vocabulary may
            # legitimately contain none
            multiword = [term for term in terms if ' ' in term]
            assert len(multiword) > 0 or len(terms) < 50

    def test_extract_tfidf_features_handles_empty_text(self):
        """Blank documents must not raise or produce NaN/inf features."""
        frame = pd.DataFrame({
            'issue text': ['', 'valid text', '   '],
            'issue description': ['desc', '', 'another desc']
        })
        matrix, fitted = extract_tfidf_features(frame, max_features=50)
        assert matrix.shape[0] == 3
        assert not np.any(np.isnan(matrix))
        assert not np.any(np.isinf(matrix))
@pytest.mark.unit
class TestLabelPreparation:
    """Unit tests for label preparation."""

    def test_prepare_labels_returns_binary(self, sample_dataframe):
        """Prepared labels contain only the values 0 and 1."""
        binary = prepare_labels(sample_dataframe)
        assert set(np.unique(binary.values)).issubset({0, 1})

    def test_prepare_labels_correct_shape(self, sample_dataframe):
        """The label matrix is (rows, number of label columns)."""
        expected_cols = get_label_columns(sample_dataframe)
        binary = prepare_labels(sample_dataframe)
        assert binary.shape[0] == len(sample_dataframe)
        assert binary.shape[1] == len(expected_cols)

    def test_prepare_labels_converts_counts_to_binary(self):
        """Any positive count maps to 1; zeros stay 0."""
        frame = pd.DataFrame({
            'Repo Name': ['repo1', 'repo2'],
            'issue text': ['text1', 'text2'],
            'Label1': [0, 5],  # 5 should become 1
            'Label2': [3, 0],  # 3 should become 1
            'Label3': [0, 0],
        })
        binary = prepare_labels(frame)
        assert binary.loc[0, 'Label1'] == 0
        assert binary.loc[0, 'Label2'] == 1
        assert binary.loc[1, 'Label1'] == 1
        assert binary.loc[1, 'Label2'] == 0

    def test_prepare_labels_preserves_column_names(self, sample_dataframe):
        """Label column names survive binarization unchanged and in order."""
        expected_cols = get_label_columns(sample_dataframe)
        binary = prepare_labels(sample_dataframe)
        assert list(binary.columns) == expected_cols
@pytest.mark.unit
class TestDatasetInfo:
    """Unit tests for dataset information extraction."""

    def test_get_dataset_info_returns_dict(self, sample_dataframe):
        """The summary is a plain dictionary."""
        assert isinstance(get_dataset_info(sample_dataframe), dict)

    def test_get_dataset_info_contains_required_keys(self, sample_dataframe):
        """Every documented summary key is present."""
        summary = get_dataset_info(sample_dataframe)
        for key in (
            'total_issues', 'total_columns', 'text_columns',
            'num_text_columns', 'label_columns', 'num_labels',
            'avg_labels_per_issue', 'median_labels_per_issue',
        ):
            assert key in summary

    def test_get_dataset_info_correct_counts(self, sample_dataframe):
        """Row, column, and text-column counts match the frame."""
        summary = get_dataset_info(sample_dataframe)
        assert summary['total_issues'] == len(sample_dataframe)
        assert summary['total_columns'] == len(sample_dataframe.columns)
        assert summary['num_text_columns'] == 2  # issue text and description

    def test_get_dataset_info_label_statistics(self, sample_dataframe):
        """Label statistics are non-negative and bounded by the label count."""
        summary = get_dataset_info(sample_dataframe)
        assert summary['avg_labels_per_issue'] >= 0
        assert summary['median_labels_per_issue'] >= 0
        assert summary['avg_labels_per_issue'] <= summary['num_labels']
@pytest.mark.unit
@pytest.mark.requires_data
class TestDatabaseLoading:
    """Unit tests for database loading (requires temp DB)."""

    def test_load_data_from_db_returns_dataframe(self, temp_db):
        """Loading a populated database yields a non-empty DataFrame."""
        loaded = load_data_from_db(temp_db)
        assert isinstance(loaded, pd.DataFrame)
        assert len(loaded) > 0

    def test_load_data_from_db_contains_expected_columns(self, temp_db):
        """The canonical text and metadata columns are all present."""
        loaded = load_data_from_db(temp_db)
        for column in ('issue text', 'issue description', 'Repo Name', 'PR #'):
            assert column in loaded.columns

    def test_load_data_from_db_nonexistent_file(self):
        """A missing database path raises rather than returning silently."""
        from pathlib import Path
        # Could be FileNotFoundError or an sqlite3 error, so accept any
        with pytest.raises(Exception):
            load_data_from_db(Path("/nonexistent/path/to/db.db"))
@pytest.mark.unit
class TestEdgeCases:
    """Unit tests for edge cases and error handling."""

    def test_extract_tfidf_with_single_document(self):
        """A one-row corpus still yields a non-empty feature row."""
        frame = pd.DataFrame({
            'issue text': ['Single document for testing'],
            'issue description': ['Description'],
            'Label1': [1]
        })
        # min_df=1 is required when the corpus holds a single document
        matrix, fitted = extract_tfidf_features(
            frame,
            max_features=50,
            min_df=1,
            max_df=1.0
        )
        assert matrix.shape[0] == 1
        assert matrix.shape[1] > 0

    def test_extract_tfidf_with_identical_documents(self):
        """Identical documents are handled without zeroing every feature."""
        frame = pd.DataFrame({
            'issue text': ['Same text'] * 3,
            'issue description': ['Same description'] * 3,
            'Label1': [1, 0, 1]
        })
        # max_df=1.0: every term appears in 100% of the (identical) docs,
        # so the default document-frequency cutoff would drop them all;
        # min_df=1 likewise keeps terms that appear everywhere
        matrix, _ = extract_tfidf_features(
            frame,
            max_features=50,
            min_df=1,
            max_df=1.0
        )
        # Rows should be similar (not necessarily identical) and non-trivial
        assert matrix.shape[0] == 3
        assert not np.all(matrix == 0)

    def test_prepare_labels_with_all_zeros(self):
        """An all-zero label column stays all-zero after binarization."""
        frame = pd.DataFrame({
            'issue text': ['text1', 'text2'],
            'Label1': [0, 0],  # All zeros
            'Label2': [1, 1],
        })
        binary = prepare_labels(frame)
        assert binary['Label1'].sum() == 0
        assert binary['Label2'].sum() == 2

    def test_clean_text_with_only_special_characters(self):
        """Punctuation-only input is cleaned without raising."""
        cleaned = clean_github_text("!@#$%^&*()")
        # Graceful handling: result may be empty or ASCII-only residue
        assert isinstance(cleaned, str)