"""Tests for src/preprocessing.py - Feature engineering utilities."""
import numpy as np
import pandas as pd
from src.preprocessing import (
normalize_other_categories,
prepare_features,
reduce_cardinality,
)
class TestNormalizeOtherCategories:
    """Behavioural checks for normalize_other_categories()."""

    def test_replaces_other_please_specify(self):
        """The label 'Other (please specify):' comes back as plain 'Other'."""
        labels = ["Other (please specify):", "Developer, back-end"]
        cleaned = normalize_other_categories(pd.Series(labels))
        assert cleaned.iloc[0] == "Other"
        # The neighbouring, unrelated label must pass through untouched.
        assert cleaned.iloc[1] == "Developer, back-end"

    def test_replaces_other_colon(self):
        """The label 'Other:' comes back as plain 'Other'."""
        labels = ["Other:", "Software Development"]
        cleaned = normalize_other_categories(pd.Series(labels))
        assert cleaned.iloc[0] == "Other"

    def test_leaves_non_other_unchanged(self):
        """Labels that are not an 'Other' variant survive verbatim."""
        expected = ["Developer, back-end", "Software Development", "India"]
        cleaned = normalize_other_categories(pd.Series(expected))
        assert cleaned.tolist() == expected

    def test_preserves_exact_other(self):
        """A value that is already exactly 'Other' is returned as 'Other'."""
        cleaned = normalize_other_categories(pd.Series(["Other"]))
        assert cleaned.iloc[0] == "Other"
class TestReduceCardinality:
    """Behavioural checks for reduce_cardinality()."""

    def test_groups_rare_categories(self):
        """Infrequent categories end up under the 'Other' label."""
        # 100 rows of one dominant label followed by three singletons.
        singletons = ["Rare1", "Rare2", "Rare3"]
        data = pd.Series(["Common"] * 100 + singletons)
        reduced = reduce_cardinality(data, max_categories=5, min_frequency=10)
        assert "Common" in reduced.values
        assert "Rare1" not in reduced.values
        other_rows = (reduced == "Other").sum()
        assert other_rows == 3

    def test_keeps_frequent_categories(self):
        """Categories that are all frequent remain as they are."""
        data = pd.Series(["A"] * 100 + ["B"] * 80 + ["C"] * 60)
        reduced = reduce_cardinality(data, max_categories=5, min_frequency=50)
        surviving = set(reduced.unique())
        assert surviving == {"A", "B", "C"}

    def test_uses_config_defaults_when_no_args(self):
        """Calling with only the series relies on the configured defaults."""
        data = pd.Series(["Common"] * 200 + ["Rare"] * 2)
        # Deliberately omit max_categories / min_frequency.
        reduced = reduce_cardinality(data)
        # With the config defaults, "Rare" is expected to be grouped away
        # while the dominant label is retained.
        assert "Rare" not in reduced.values
        assert "Common" in reduced.values
class TestPrepareFeatures:
    """Tests for prepare_features()."""

    @staticmethod
    def _make_frame(**overrides):
        """Build the one-row input frame shared by most tests in this class.

        Args:
            **overrides: Column name -> single-element list. Each entry
                replaces the default column of the same name; column order
                is unchanged for names that already exist.

        Returns:
            A ``pd.DataFrame`` with one row and the ten input columns used
            throughout these tests.
        """
        columns = {
            "Country": ["India"],
            "YearsCode": [5.0],
            "WorkExp": [3.0],
            "EdLevel": ["Other"],
            "DevType": ["Developer, back-end"],
            "Industry": ["Software Development"],
            "Age": ["25-34 years old"],
            "ICorPM": ["Individual contributor"],
            "OrgSize": ["20 to 99 employees"],
            "Employment": ["Employed"],
        }
        columns.update(overrides)
        return pd.DataFrame(columns)

    def test_returns_dataframe_with_numeric_columns(self):
        """Output contains YearsCode and WorkExp as numeric columns."""
        result = prepare_features(self._make_frame())
        for column in ("YearsCode", "WorkExp"):
            assert column in result.columns
            # The docstring promises numeric columns, so check the dtype
            # as well as mere presence.
            assert pd.api.types.is_numeric_dtype(result[column])

    def test_fills_missing_numeric_with_zero(self):
        """Missing numeric values are filled with 0."""
        df = self._make_frame(YearsCode=[np.nan], WorkExp=[np.nan])
        result = prepare_features(df)
        assert result["YearsCode"].iloc[0] == 0.0
        assert result["WorkExp"].iloc[0] == 0.0

    def test_one_hot_encodes_categorical_columns(self):
        """Categorical columns are one-hot encoded."""
        # Two rows with differing categories; spelled out in full because
        # every column needs two values.
        df = pd.DataFrame(
            {
                "Country": ["India", "Germany"],
                "YearsCode": [5.0, 10.0],
                "WorkExp": [3.0, 8.0],
                "EdLevel": ["Other", "Other"],
                "DevType": ["Developer, back-end", "Developer, front-end"],
                "Industry": ["Software Development", "Healthcare"],
                "Age": ["25-34 years old", "35-44 years old"],
                "ICorPM": ["Individual contributor", "People manager"],
                "OrgSize": ["20 to 99 employees", "100 to 499 employees"],
                "Employment": ["Employed", "Employed"],
            }
        )
        result = prepare_features(df)
        # Should have one-hot columns for categorical features; the test
        # recognises them by an underscore in the column name.
        numeric_cols = ("YearsCode", "WorkExp")
        categorical_cols = [
            c for c in result.columns if "_" in c and c not in numeric_cols
        ]
        assert len(categorical_cols) > 0

    def test_renames_legacy_years_code_pro_column(self):
        """Legacy YearsCodePro column is renamed to YearsCode."""
        # Present the data under the legacy column name only.
        df = self._make_frame().rename(columns={"YearsCode": "YearsCodePro"})
        result = prepare_features(df)
        assert "YearsCode" in result.columns
        assert "YearsCodePro" not in result.columns

    def test_fills_missing_categorical_with_unknown(self):
        """Missing categorical values are filled with 'Unknown'."""
        categorical = (
            "Country",
            "EdLevel",
            "DevType",
            "Industry",
            "Age",
            "ICorPM",
            "OrgSize",
            "Employment",
        )
        df = self._make_frame(**{name: [None] for name in categorical})
        result = prepare_features(df)
        # Categoricals filled with "Unknown" → one-hot encodes "Unknown"
        unknown_cols = [c for c in result.columns if "Unknown" in c]
        assert len(unknown_cols) > 0

    def test_different_inputs_produce_different_encodings(self):
        """Different categorical values produce distinct one-hot encodings."""
        df_usa = self._make_frame(Country=["United States of America"])
        df_deu = self._make_frame(Country=["Germany"])
        enc_usa = prepare_features(df_usa)
        enc_deu = prepare_features(df_deu)
        assert not enc_usa.equals(enc_deu), (
            "USA and Germany inputs produced identical encodings — "
            "categorical features are not being encoded"
        )

    def test_does_not_modify_original(self):
        """prepare_features does not modify the input DataFrame."""
        df = self._make_frame()
        # Snapshot the whole frame so any in-place change (values, dtypes,
        # column names) is detected, not just a change to one cell.
        snapshot = df.copy(deep=True)
        prepare_features(df)
        pd.testing.assert_frame_equal(df, snapshot)
|