Spaces:

dima806
/

developer_salary_prediction

Running

File size: 8,673 Bytes

"""Tests for src/preprocessing.py - Feature engineering utilities."""

import numpy as np
import pandas as pd

from src.preprocessing import (
    normalize_other_categories,
    prepare_features,
    reduce_cardinality,
)


class TestNormalizeOtherCategories:
    """Checks how normalize_other_categories() treats 'Other'-style labels."""

    def test_replaces_other_please_specify(self):
        """The 'Other (please specify):' label collapses to plain 'Other'."""
        raw = pd.Series(["Other (please specify):", "Developer, back-end"])
        cleaned = normalize_other_categories(raw)
        first, second = cleaned.iloc[0], cleaned.iloc[1]
        assert first == "Other"
        assert second == "Developer, back-end"

    def test_replaces_other_colon(self):
        """The 'Other:' label collapses to plain 'Other'."""
        raw = pd.Series(["Other:", "Software Development"])
        cleaned = normalize_other_categories(raw)
        head = cleaned.iloc[0]
        assert head == "Other"

    def test_leaves_non_other_unchanged(self):
        """Labels that are not an 'Other' variant pass through untouched."""
        untouched = ["Developer, back-end", "Software Development", "India"]
        cleaned = normalize_other_categories(pd.Series(untouched))
        assert [*cleaned] == untouched

    def test_preserves_exact_other(self):
        """A label that is already exactly 'Other' stays 'Other'."""
        cleaned = normalize_other_categories(pd.Series(["Other"]))
        assert cleaned.iloc[0] == "Other"


class TestReduceCardinality:
    """Checks how reduce_cardinality() buckets categories by frequency."""

    def test_groups_rare_categories(self):
        """Categories below the frequency threshold end up as 'Other'."""
        # One dominant category followed by three singletons.
        rare_labels = [f"Rare{i}" for i in (1, 2, 3)]
        skewed = pd.Series(["Common"] * 100 + rare_labels)
        reduced = reduce_cardinality(skewed, max_categories=5, min_frequency=10)
        remaining = reduced.values
        assert "Common" in remaining
        assert "Rare1" not in remaining
        other_count = reduced.eq("Other").sum()
        assert other_count == 3

    def test_keeps_frequent_categories(self):
        """Categories at or above the frequency threshold survive as-is."""
        counts = {"A": 100, "B": 80, "C": 60}
        labels = [name for name, n in counts.items() for _ in range(n)]
        reduced = reduce_cardinality(
            pd.Series(labels), max_categories=5, min_frequency=50
        )
        assert set(reduced.unique()) == set(counts)

    def test_uses_config_defaults_when_no_args(self):
        """Omitting max_categories/min_frequency still groups the rare label."""
        skewed = pd.Series(["Common"] * 200 + ["Rare"] * 2)
        # No explicit thresholds: the function's own defaults apply
        # (described by the original test as coming from config).
        reduced = reduce_cardinality(skewed)
        remaining = reduced.values
        assert "Rare" not in remaining
        assert "Common" in remaining


class TestPrepareFeatures:
    """Tests for prepare_features().

    Most tests start from the same single-row record; ``_frame`` builds it so
    each test only spells out the columns it actually cares about.
    """

    # Numeric input columns; every other key of _BASE_ROW holds a string label.
    _NUMERIC_COLS = ("YearsCode", "WorkExp")

    # Baseline record, in the column order shared by the tests in this class.
    # Never mutated: _frame() always works on a copy.
    _BASE_ROW = {
        "Country": "India",
        "YearsCode": 5.0,
        "WorkExp": 3.0,
        "EdLevel": "Other",
        "DevType": "Developer, back-end",
        "Industry": "Software Development",
        "Age": "25-34 years old",
        "ICorPM": "Individual contributor",
        "OrgSize": "20 to 99 employees",
        "Employment": "Employed",
    }

    @classmethod
    def _frame(cls, rename=None, **overrides):
        """Build a one-row input DataFrame derived from ``_BASE_ROW``.

        Args:
            rename: Optional mapping of baseline column name to replacement
                name. The renamed column keeps its original position.
            **overrides: Column name -> scalar value that replaces the
                baseline value for that column.

        Returns:
            A new single-row ``pd.DataFrame``; columns follow the order of
            ``_BASE_ROW`` (unknown override keys are appended at the end).
        """
        row = {**cls._BASE_ROW, **overrides}
        rename = rename or {}
        return pd.DataFrame(
            {rename.get(col, col): [value] for col, value in row.items()}
        )

    def test_returns_dataframe_with_numeric_columns(self):
        """Output is a DataFrame with numeric YearsCode and WorkExp columns."""
        result = prepare_features(self._frame())
        assert isinstance(result, pd.DataFrame)
        for col in self._NUMERIC_COLS:
            assert col in result.columns
            # Presence alone would not prove the "numeric" part of the claim.
            assert pd.api.types.is_numeric_dtype(result[col])

    def test_fills_missing_numeric_with_zero(self):
        """Missing numeric values are filled with 0."""
        df = self._frame(YearsCode=np.nan, WorkExp=np.nan)
        result = prepare_features(df)
        assert result["YearsCode"].iloc[0] == 0.0
        assert result["WorkExp"].iloc[0] == 0.0

    def test_one_hot_encodes_categorical_columns(self):
        """Categorical columns are one-hot encoded."""
        # Two rows with differing labels, so this fixture is spelled out in
        # full instead of going through the single-row helper.
        df = pd.DataFrame(
            {
                "Country": ["India", "Germany"],
                "YearsCode": [5.0, 10.0],
                "WorkExp": [3.0, 8.0],
                "EdLevel": ["Other", "Other"],
                "DevType": ["Developer, back-end", "Developer, front-end"],
                "Industry": ["Software Development", "Healthcare"],
                "Age": ["25-34 years old", "35-44 years old"],
                "ICorPM": ["Individual contributor", "People manager"],
                "OrgSize": ["20 to 99 employees", "100 to 499 employees"],
                "Employment": ["Employed", "Employed"],
            }
        )
        result = prepare_features(df)
        # Any column whose name contains "_" (other than the numeric inputs)
        # is counted as a one-hot column.
        encoded_cols = [
            c for c in result.columns if "_" in c and c not in self._NUMERIC_COLS
        ]
        assert encoded_cols

    def test_renames_legacy_years_code_pro_column(self):
        """Legacy YearsCodePro column is renamed to YearsCode."""
        df = self._frame(rename={"YearsCode": "YearsCodePro"})
        result = prepare_features(df)
        assert "YearsCode" in result.columns
        assert "YearsCodePro" not in result.columns

    def test_fills_missing_categorical_with_unknown(self):
        """Missing categorical values are filled with 'Unknown'."""
        categorical = [c for c in self._BASE_ROW if c not in self._NUMERIC_COLS]
        df = self._frame(**{col: None for col in categorical})
        result = prepare_features(df)
        # Categoricals filled with "Unknown" → one-hot encodes "Unknown"
        unknown_cols = [c for c in result.columns if "Unknown" in c]
        assert unknown_cols

    def test_different_inputs_produce_different_encodings(self):
        """Different categorical values produce distinct one-hot encodings."""
        enc_usa = prepare_features(self._frame(Country="United States of America"))
        enc_deu = prepare_features(self._frame(Country="Germany"))

        assert not enc_usa.equals(enc_deu), (
            "USA and Germany inputs produced identical encodings — "
            "categorical features are not being encoded"
        )

    def test_does_not_modify_original(self):
        """prepare_features does not modify the input DataFrame."""
        df = self._frame()
        # Snapshot the whole frame: checking a single cell would miss in-place
        # renames, fills or dtype changes on any other column.
        snapshot = df.copy(deep=True)
        prepare_features(df)
        pd.testing.assert_frame_equal(df, snapshot)