File size: 16,919 Bytes
225af6a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
"""
Unit tests for features.py module.

Tests individual functions for text cleaning, feature extraction,
and label preparation.
"""
import pytest
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

from hopcroft_skill_classification_tool_competition.features import (
    clean_github_text,
    get_text_columns,
    get_label_columns,
    combine_text_fields,
    extract_tfidf_features,
    prepare_labels,
    get_dataset_info,
    load_data_from_db,
)


@pytest.mark.unit
class TestTextCleaning:
    """Unit tests for text cleaning functionality."""

    def test_clean_github_text_removes_urls(self):
        """Test that URLs are removed from text."""
        text = "Fixed bug https://github.com/repo/issues/123 in authentication"
        cleaned = clean_github_text(text)

        assert "https://" not in cleaned
        assert "github.com" not in cleaned
        assert "fix" in cleaned.lower()  # Stemmed version of "fixed"
        assert "authent" in cleaned.lower()  # Stemmed version

    def test_clean_github_text_removes_html(self):
        """Test that HTML tags are removed."""
        text = "Added <b>bold</b> feature with <i>italic</i> text"
        cleaned = clean_github_text(text)

        assert "<b>" not in cleaned
        assert "<i>" not in cleaned
        assert "bold" in cleaned.lower()
        # After stemming, "italic" becomes "ital"
        assert "ital" in cleaned.lower()

    def test_clean_github_text_removes_code_blocks(self):
        """Test that markdown code blocks are removed."""
        text = """Fixed bug in code:
        ```python
        def foo():
            pass
        ```
        """
        cleaned = clean_github_text(text)

        assert "```" not in cleaned
        assert "python" not in cleaned
        assert "def" not in cleaned
        assert "fix" in cleaned.lower()

    def test_clean_github_text_removes_inline_code(self):
        """Test that inline code markers are removed."""
        text = "Updated `getUserById()` method implementation"
        cleaned = clean_github_text(text)

        assert "`" not in cleaned
        assert "method" in cleaned.lower()

    def test_clean_github_text_normalizes_whitespace(self):
        """Test that extra whitespace is normalized."""
        text = "Fixed    multiple   spaces   and\n\n\nnewlines"
        cleaned = clean_github_text(text)

        assert "    " not in cleaned
        assert "\n\n" not in cleaned
        # BUG FIX: the previous assertion compared cleaned.split() against a
        # filtered copy of itself, which can never fail because str.split()
        # with no argument already discards empty strings.  Assert the actual
        # contract instead: no run of two-or-more spaces survives cleaning.
        assert "  " not in cleaned

    @pytest.mark.parametrize("text,expected_empty", [
        ("", True),
        (None, True),
        ("   ", True),
        ("\n\n", True),
        ("a", False),
    ])
    def test_clean_github_text_empty_inputs(self, text, expected_empty):
        """Test handling of empty or null inputs."""
        cleaned = clean_github_text(text)
        assert isinstance(cleaned, str)

        if expected_empty:
            assert cleaned == "" or cleaned.isspace()
        else:
            assert len(cleaned) > 0

    def test_clean_github_text_applies_stemming(self):
        """Test that stemming is applied to words."""
        text = "running walked swimming"
        cleaned = clean_github_text(text)

        # Porter stemmer should convert to stems
        assert "run" in cleaned.lower()  # running -> run
        assert "walk" in cleaned.lower()  # walked -> walk
        assert "swim" in cleaned.lower()  # swimming -> swim

    def test_clean_github_text_removes_emojis(self):
        """Test that emojis and non-ASCII characters are removed."""
        text = "Fixed bug 😀 with special chars"
        cleaned = clean_github_text(text)

        # Should only contain ASCII
        assert cleaned.isascii()
        assert "fix" in cleaned.lower()


@pytest.mark.unit
class TestColumnIdentification:
    """Unit tests for column identification functions."""

    def test_get_text_columns_identifies_correctly(self, sample_dataframe):
        """Test that text columns are correctly identified."""
        found = get_text_columns(sample_dataframe)

        assert len(found) == 2
        for expected in ('issue text', 'issue description'):
            assert expected in found

    def test_get_text_columns_handles_missing_columns(self):
        """Test handling when text columns are missing."""
        frame = pd.DataFrame({'col1': [1, 2, 3], 'col2': ['a', 'b', 'c']})
        found = get_text_columns(frame)

        # Still a list, just an empty one: no standard text columns exist.
        assert isinstance(found, list)
        assert len(found) == 0

    def test_get_label_columns_identifies_correctly(self, sample_dataframe):
        """Test that label columns are correctly identified."""
        found = get_label_columns(sample_dataframe)

        # Metadata and free-text columns must be excluded...
        for excluded in ('Repo Name', 'PR #', 'issue text', 'issue description'):
            assert excluded not in found

        # ...while genuine label columns are kept.
        for label in ('Language', 'Data Structure', 'Testing'):
            assert label in found

    def test_get_label_columns_only_numeric(self, sample_dataframe):
        """Test that only numeric columns are identified as labels."""
        found = get_label_columns(sample_dataframe)

        # Every identified label column must carry a numeric dtype.
        assert all(
            pd.api.types.is_numeric_dtype(sample_dataframe[col])
            for col in found
        )


@pytest.mark.unit
class TestTextCombination:
    """Unit tests for text combination functionality."""

    def test_combine_text_fields_combines_correctly(self, sample_dataframe):
        """Test that multiple text fields are combined."""
        cols = ['issue text', 'issue description']
        result = combine_text_fields(sample_dataframe, cols)

        assert isinstance(result, pd.Series)
        assert len(result) == len(sample_dataframe)

        # Every combined entry is a non-empty string holding content from
        # both columns (already stemmed by the cleaning step).
        for entry in result:
            assert isinstance(entry, str)
            assert len(entry) > 0

    def test_combine_text_fields_applies_cleaning(self, sample_dataframe):
        """Test that cleaning is applied during combination."""
        # Inject text containing a URL the cleaner must strip out.
        sample_dataframe['issue text'] = [
            "Fixed https://example.com bug",
            "Added feature",
            "Updated docs",
            "Refactored code",
            "Improved tests"
        ]

        result = combine_text_fields(sample_dataframe, ['issue text'])

        # No trace of the URL may survive in any combined entry.
        assert all("https://" not in entry for entry in result)
        assert all("example.com" not in entry for entry in result)

    def test_combine_text_fields_handles_nulls(self):
        """Test handling of null values in text fields."""
        frame = pd.DataFrame({
            'text1': ['hello', None, 'world'],
            'text2': [None, 'foo', 'bar']
        })

        result = combine_text_fields(frame, ['text1', 'text2'])

        assert len(result) == 3
        # Nulls are tolerated gracefully: every entry is still a plain string.
        assert all(isinstance(entry, str) for entry in result)


@pytest.mark.unit
class TestTfidfExtraction:
    """Unit tests for TF-IDF feature extraction."""

    def test_extract_tfidf_features_returns_correct_shape(self, sample_dataframe):
        """Test that TF-IDF extraction returns correct shape."""
        matrix, vec = extract_tfidf_features(sample_dataframe, max_features=50)

        n_docs, n_terms = matrix.shape
        assert n_docs == len(sample_dataframe)
        assert n_terms <= 50  # vocabulary may be smaller than the cap
        assert isinstance(vec, TfidfVectorizer)

    def test_extract_tfidf_features_returns_numpy_array(self, sample_dataframe):
        """Test that features are returned as numpy array."""
        matrix, _ = extract_tfidf_features(sample_dataframe)

        assert isinstance(matrix, np.ndarray)
        assert matrix.dtype in (np.float64, np.float32)

    @pytest.mark.parametrize("max_features", [10, 50, 100, None])
    def test_extract_tfidf_features_respects_max_features(
        self, sample_dataframe, max_features
    ):
        """Test that max_features parameter is respected."""
        matrix, _ = extract_tfidf_features(
            sample_dataframe,
            max_features=max_features
        )

        if max_features is None:
            return  # no cap requested; nothing to verify
        assert matrix.shape[1] <= max_features

    @pytest.mark.parametrize("ngram_range", [(1, 1), (1, 2), (1, 3)])
    def test_extract_tfidf_features_ngram_range(
        self, sample_dataframe, ngram_range
    ):
        """Test different n-gram ranges."""
        matrix, vec = extract_tfidf_features(
            sample_dataframe,
            ngram_range=ngram_range,
            max_features=50
        )

        assert matrix.shape[0] == len(sample_dataframe)
        vocab = vec.get_feature_names_out()

        if ngram_range[1] > 1:
            # Multi-word terms contain a space; a tiny vocabulary may
            # legitimately contain none.
            multiword = [term for term in vocab if ' ' in term]
            assert len(multiword) > 0 or len(vocab) < 50

    def test_extract_tfidf_features_handles_empty_text(self):
        """Test handling of documents with empty text."""
        frame = pd.DataFrame({
            'issue text': ['', 'valid text', '   '],
            'issue description': ['desc', '', 'another desc']
        })

        matrix, vec = extract_tfidf_features(frame, max_features=50)

        # Empty documents must not break extraction or yield NaN/inf values.
        assert matrix.shape[0] == 3
        assert np.isfinite(matrix).all()


@pytest.mark.unit
class TestLabelPreparation:
    """Unit tests for label preparation."""

    def test_prepare_labels_returns_binary(self, sample_dataframe):
        """Test that labels are converted to binary format."""
        result = prepare_labels(sample_dataframe)

        # Every value in the label matrix must be either 0 or 1.
        assert set(np.unique(result.values)) <= {0, 1}

    def test_prepare_labels_correct_shape(self, sample_dataframe):
        """Test that label matrix has correct shape."""
        expected_cols = get_label_columns(sample_dataframe)
        result = prepare_labels(sample_dataframe)

        rows, cols = result.shape
        assert rows == len(sample_dataframe)
        assert cols == len(expected_cols)

    def test_prepare_labels_converts_counts_to_binary(self):
        """Test that label counts > 0 are converted to 1."""
        frame = pd.DataFrame({
            'Repo Name': ['repo1', 'repo2'],
            'issue text': ['text1', 'text2'],
            'Label1': [0, 5],  # 5 should become 1
            'Label2': [3, 0],  # 3 should become 1
            'Label3': [0, 0],
        })

        result = prepare_labels(frame)

        assert result.loc[0, 'Label1'] == 0
        assert result.loc[0, 'Label2'] == 1
        assert result.loc[1, 'Label1'] == 1
        assert result.loc[1, 'Label2'] == 0

    def test_prepare_labels_preserves_column_names(self, sample_dataframe):
        """Test that label column names are preserved."""
        expected_cols = get_label_columns(sample_dataframe)
        result = prepare_labels(sample_dataframe)

        assert list(result.columns) == expected_cols


@pytest.mark.unit
class TestDatasetInfo:
    """Unit tests for dataset information extraction."""

    def test_get_dataset_info_returns_dict(self, sample_dataframe):
        """Test that dataset info returns a dictionary."""
        assert isinstance(get_dataset_info(sample_dataframe), dict)

    def test_get_dataset_info_contains_required_keys(self, sample_dataframe):
        """Test that all required keys are present."""
        summary = get_dataset_info(sample_dataframe)

        expected = (
            'total_issues', 'total_columns', 'text_columns',
            'num_text_columns', 'label_columns', 'num_labels',
            'avg_labels_per_issue', 'median_labels_per_issue'
        )
        # Collect anything absent so a failure names every missing key at once.
        missing = [key for key in expected if key not in summary]
        assert not missing

    def test_get_dataset_info_correct_counts(self, sample_dataframe):
        """Test that counts are calculated correctly."""
        summary = get_dataset_info(sample_dataframe)

        assert summary['total_issues'] == len(sample_dataframe)
        assert summary['total_columns'] == len(sample_dataframe.columns)
        assert summary['num_text_columns'] == 2  # issue text and description

    def test_get_dataset_info_label_statistics(self, sample_dataframe):
        """Test label statistics are reasonable."""
        summary = get_dataset_info(sample_dataframe)

        # Per-issue stats are non-negative, and the mean cannot exceed the
        # total number of label columns.
        assert 0 <= summary['avg_labels_per_issue'] <= summary['num_labels']
        assert summary['median_labels_per_issue'] >= 0


@pytest.mark.unit
@pytest.mark.requires_data
class TestDatabaseLoading:
    """Unit tests for database loading (requires temp DB)."""

    def test_load_data_from_db_returns_dataframe(self, temp_db):
        """Test that loading from DB returns a DataFrame."""
        loaded = load_data_from_db(temp_db)

        assert isinstance(loaded, pd.DataFrame)
        assert len(loaded) > 0

    def test_load_data_from_db_contains_expected_columns(self, temp_db):
        """Test that loaded data has expected columns."""
        loaded = load_data_from_db(temp_db)

        for col in ('issue text', 'issue description', 'Repo Name', 'PR #'):
            assert col in loaded.columns

    def test_load_data_from_db_nonexistent_file(self):
        """Test handling of nonexistent database file."""
        from pathlib import Path

        # Implementation may surface FileNotFoundError or an sqlite3 error,
        # so only the broad contract (some exception) is pinned here.
        with pytest.raises(Exception):
            load_data_from_db(Path("/nonexistent/path/to/db.db"))


@pytest.mark.unit
class TestEdgeCases:
    """Unit tests for edge cases and error handling."""

    def test_extract_tfidf_with_single_document(self):
        """Test TF-IDF extraction with only one document."""
        frame = pd.DataFrame({
            'issue text': ['Single document for testing'],
            'issue description': ['Description'],
            'Label1': [1]
        })

        # A lone document needs min_df=1 (and a relaxed max_df) so its terms
        # are not filtered out of the vocabulary.
        matrix, vec = extract_tfidf_features(
            frame,
            max_features=50,
            min_df=1,
            max_df=1.0
        )

        assert matrix.shape[0] == 1
        assert matrix.shape[1] > 0

    def test_extract_tfidf_with_identical_documents(self):
        """Test TF-IDF with identical documents."""
        frame = pd.DataFrame({
            'issue text': ['Same text'] * 3,
            'issue description': ['Same description'] * 3,
            'Label1': [1, 0, 1]
        })

        # Every term appears in 100% of the documents, so max_df must be 1.0
        # (and min_df=1) or the entire vocabulary would be discarded.
        matrix, _ = extract_tfidf_features(
            frame,
            max_features=50,
            min_df=1,
            max_df=1.0
        )

        # All rows produced, and at least one non-zero TF-IDF weight exists.
        assert matrix.shape[0] == 3
        assert matrix.any()

    def test_prepare_labels_with_all_zeros(self):
        """Test label preparation when a label has all zeros."""
        frame = pd.DataFrame({
            'issue text': ['text1', 'text2'],
            'Label1': [0, 0],  # All zeros
            'Label2': [1, 1],
        })

        result = prepare_labels(frame)

        assert result['Label1'].sum() == 0
        assert result['Label2'].sum() == 2

    def test_clean_text_with_only_special_characters(self):
        """Test cleaning text that contains only special characters."""
        cleaned = clean_github_text("!@#$%^&*()")

        # Must not raise; the result may be empty but is always a string.
        assert isinstance(cleaned, str)