# developer_salary_prediction/tests/test_preprocessing.py
# dima806's picture
# Upload 39 files
# eeeaee6 verified
"""Tests for src/preprocessing.py - Feature engineering utilities."""
import numpy as np
import pandas as pd
from src.preprocessing import (
normalize_other_categories,
prepare_features,
reduce_cardinality,
)
class TestNormalizeOtherCategories:
    """Checks for normalize_other_categories()."""

    def test_replaces_other_please_specify(self):
        """The long 'Other (please specify):' label comes back as 'Other'."""
        raw = pd.Series(["Other (please specify):", "Developer, back-end"])

        cleaned = normalize_other_categories(raw)

        # First entry is rewritten; the ordinary label next to it is untouched.
        assert cleaned.iloc[0] == "Other"
        assert cleaned.iloc[1] == "Developer, back-end"

    def test_replaces_other_colon(self):
        """The short 'Other:' label comes back as 'Other'."""
        raw = pd.Series(["Other:", "Software Development"])

        cleaned = normalize_other_categories(raw)

        assert cleaned.iloc[0] == "Other"

    def test_leaves_non_other_unchanged(self):
        """Labels that are not an 'Other' variant pass through as they are."""
        expected = ["Developer, back-end", "Software Development", "India"]

        cleaned = normalize_other_categories(pd.Series(expected))

        assert list(cleaned) == expected

    def test_preserves_exact_other(self):
        """A value that already reads exactly 'Other' stays 'Other'."""
        cleaned = normalize_other_categories(pd.Series(["Other"]))

        assert cleaned.iloc[0] == "Other"
class TestReduceCardinality:
    """Checks for reduce_cardinality()."""

    def test_groups_rare_categories(self):
        """Rare labels are folded into a single 'Other' bucket."""
        # One dominant label (100 rows) followed by three one-off labels.
        labels = pd.Series(["Common"] * 100 + ["Rare1", "Rare2", "Rare3"])

        reduced = reduce_cardinality(labels, max_categories=5, min_frequency=10)

        assert "Common" in reduced.values
        assert "Rare1" not in reduced.values
        # All three one-off rows are expected to land in "Other".
        other_rows = (reduced == "Other").sum()
        assert other_rows == 3

    def test_keeps_frequent_categories(self):
        """Labels that occur often enough are left as they are."""
        labels = pd.Series(["A"] * 100 + ["B"] * 80 + ["C"] * 60)

        reduced = reduce_cardinality(labels, max_categories=5, min_frequency=50)

        surviving = set(reduced.unique())
        assert surviving == {"A", "B", "C"}

    def test_uses_config_defaults_when_no_args(self):
        """Calling without thresholds falls back to the configured defaults."""
        labels = pd.Series(["Common"] * 200 + ["Rare"] * 2)

        # Deliberately omit max_categories / min_frequency.
        reduced = reduce_cardinality(labels)

        # With the config defaults, "Rare" is expected to be absorbed into
        # "Other" while "Common" survives.
        assert "Rare" not in reduced.values
        assert "Common" in reduced.values
class TestPrepareFeatures:
    """Tests for prepare_features()."""

    # Columns the tests expect to come back as plain numeric features.
    _NUMERIC_COLUMNS = ("YearsCode", "WorkExp")

    @staticmethod
    def _single_row(**overrides):
        """Build a one-row input frame, replacing the columns in *overrides*.

        Every test feeds prepare_features() the same ten columns; keeping the
        baseline record in one place lets each test spell out only the values
        it actually cares about.

        Args:
            **overrides: Column name -> scalar value replacing the baseline.

        Returns:
            A single-row ``pd.DataFrame`` in the baseline column order.

        Raises:
            KeyError: If an override names a column that is not in the
                baseline record (guards against silent typos in tests).
        """
        record = {
            "Country": "India",
            "YearsCode": 5.0,
            "WorkExp": 3.0,
            "EdLevel": "Other",
            "DevType": "Developer, back-end",
            "Industry": "Software Development",
            "Age": "25-34 years old",
            "ICorPM": "Individual contributor",
            "OrgSize": "20 to 99 employees",
            "Employment": "Employed",
        }
        unknown = sorted(set(overrides) - set(record))
        if unknown:
            raise KeyError(f"Unknown fixture column(s): {unknown}")
        # update() keeps each existing key at its original position.
        record.update(overrides)
        return pd.DataFrame({name: [value] for name, value in record.items()})

    def test_returns_dataframe_with_numeric_columns(self):
        """Output is a DataFrame with YearsCode and WorkExp as numeric columns."""
        result = prepare_features(self._single_row())

        assert isinstance(result, pd.DataFrame)
        for column in self._NUMERIC_COLUMNS:
            assert column in result.columns
            # Presence alone does not show the column is numeric.
            assert pd.api.types.is_numeric_dtype(result[column])

    def test_fills_missing_numeric_with_zero(self):
        """Missing numeric values are filled with 0."""
        df = self._single_row(YearsCode=np.nan, WorkExp=np.nan)

        result = prepare_features(df)

        assert result["YearsCode"].iloc[0] == 0.0
        assert result["WorkExp"].iloc[0] == 0.0

    def test_one_hot_encodes_categorical_columns(self):
        """Categorical columns are one-hot encoded."""
        # Two rows with differing categories, so this frame is spelled out
        # in full instead of using the single-row helper.
        df = pd.DataFrame(
            {
                "Country": ["India", "Germany"],
                "YearsCode": [5.0, 10.0],
                "WorkExp": [3.0, 8.0],
                "EdLevel": ["Other", "Other"],
                "DevType": ["Developer, back-end", "Developer, front-end"],
                "Industry": ["Software Development", "Healthcare"],
                "Age": ["25-34 years old", "35-44 years old"],
                "ICorPM": ["Individual contributor", "People manager"],
                "OrgSize": ["20 to 99 employees", "100 to 499 employees"],
                "Employment": ["Employed", "Employed"],
            }
        )

        result = prepare_features(df)

        # Encoded columns are recognised by the "_" in their name; the
        # numeric passthrough columns are excluded explicitly.
        encoded_cols = [
            c
            for c in result.columns
            if "_" in c and c not in self._NUMERIC_COLUMNS
        ]
        assert len(encoded_cols) > 0

    def test_renames_legacy_years_code_pro_column(self):
        """Legacy YearsCodePro column is renamed to YearsCode."""
        # Same baseline row, but carrying the value under the legacy name.
        df = self._single_row().rename(columns={"YearsCode": "YearsCodePro"})

        result = prepare_features(df)

        assert "YearsCode" in result.columns
        assert "YearsCodePro" not in result.columns

    def test_fills_missing_categorical_with_unknown(self):
        """Missing categorical values are filled with 'Unknown'."""
        categorical = (
            "Country",
            "EdLevel",
            "DevType",
            "Industry",
            "Age",
            "ICorPM",
            "OrgSize",
            "Employment",
        )
        # Every categorical column is None; numeric columns keep their values.
        df = self._single_row(**{name: None for name in categorical})

        result = prepare_features(df)

        # Categoricals filled with "Unknown" -> one-hot encodes "Unknown"
        unknown_cols = [c for c in result.columns if "Unknown" in c]
        assert len(unknown_cols) > 0

    def test_different_inputs_produce_different_encodings(self):
        """Different categorical values produce distinct one-hot encodings."""
        enc_usa = prepare_features(
            self._single_row(Country="United States of America")
        )
        enc_deu = prepare_features(self._single_row(Country="Germany"))

        assert not enc_usa.equals(enc_deu), (
            "USA and Germany inputs produced identical encodings — "
            "categorical features are not being encoded"
        )

    def test_does_not_modify_original(self):
        """prepare_features does not modify the input DataFrame."""
        # Include missing values so an in-place fill on the caller's frame
        # would show up; a fully populated row gives it nothing to change.
        df = self._single_row(WorkExp=np.nan, Industry=None)
        snapshot = df.copy(deep=True)

        prepare_features(df)

        # Compare the whole frame (values, dtypes, column names), not just a
        # single cell, so any in-place fill/rename/added column is detected.
        pd.testing.assert_frame_equal(df, snapshot)