File size: 8,673 Bytes
a32e584
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9d508e3
2cc5253
9d508e3
 
 
 
 
 
 
 
a32e584
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1a584f9
eeeaee6
a32e584
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1a584f9
eeeaee6
a32e584
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1a584f9
eeeaee6
a32e584
 
 
 
2cc5253
a32e584
2cc5253
a32e584
 
 
9d508e3
 
 
 
 
 
 
 
 
 
 
 
1a584f9
eeeaee6
9d508e3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1a584f9
eeeaee6
9d508e3
 
 
eeeaee6
9d508e3
 
 
2cc5253
 
 
 
 
 
 
 
 
 
 
eeeaee6
2cc5253
 
 
 
 
 
 
 
 
 
 
 
a32e584
 
 
 
 
 
 
 
 
 
 
 
1a584f9
eeeaee6
a32e584
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
"""Tests for src/preprocessing.py - Feature engineering utilities."""

import numpy as np
import pandas as pd

from src.preprocessing import (
    normalize_other_categories,
    prepare_features,
    reduce_cardinality,
)


class TestNormalizeOtherCategories:
    """Behavioural checks for normalize_other_categories()."""

    def test_replaces_other_please_specify(self):
        """The verbose 'Other (please specify):' label collapses to 'Other'."""
        raw = pd.Series(["Other (please specify):", "Developer, back-end"])
        normalized = normalize_other_categories(raw)
        # First entry is rewritten, second entry passes through untouched.
        assert normalized.iloc[0] == "Other"
        assert normalized.iloc[1] == "Developer, back-end"

    def test_replaces_other_colon(self):
        """The short 'Other:' label collapses to 'Other'."""
        normalized = normalize_other_categories(
            pd.Series(["Other:", "Software Development"])
        )
        assert normalized.iloc[0] == "Other"

    def test_leaves_non_other_unchanged(self):
        """Labels that are not an 'Other' variant come back exactly as given."""
        expected = ["Developer, back-end", "Software Development", "India"]
        normalized = normalize_other_categories(pd.Series(expected))
        assert normalized.tolist() == expected

    def test_preserves_exact_other(self):
        """A plain 'Other' label is already normalized and stays 'Other'."""
        normalized = normalize_other_categories(pd.Series(["Other"]))
        assert normalized.iloc[0] == "Other"


class TestReduceCardinality:
    """Behavioural checks for reduce_cardinality()."""

    def test_groups_rare_categories(self):
        """Categories below the frequency threshold are folded into 'Other'."""
        # One dominant label (100 rows) plus three singletons.
        rare_labels = ["Rare1", "Rare2", "Rare3"]
        data = pd.Series(["Common"] * 100 + rare_labels)
        reduced = reduce_cardinality(data, max_categories=5, min_frequency=10)
        assert "Common" in reduced.values
        assert "Rare1" not in reduced.values
        # Each of the three singletons must have been relabelled.
        assert reduced.eq("Other").sum() == 3

    def test_keeps_frequent_categories(self):
        """Categories above the frequency threshold survive unchanged."""
        data = pd.Series(["A"] * 100 + ["B"] * 80 + ["C"] * 60)
        reduced = reduce_cardinality(data, max_categories=5, min_frequency=50)
        surviving = set(reduced.unique())
        assert surviving == {"A", "B", "C"}

    def test_uses_config_defaults_when_no_args(self):
        """Omitting max_categories / min_frequency still groups rare labels."""
        data = pd.Series(["Common"] * 200 + ["Rare"] * 2)
        # No thresholds are passed, so the function's own defaults apply.
        reduced = reduce_cardinality(data)
        # The two-row label is expected to be dropped; the dominant one kept.
        assert "Rare" not in reduced.values
        assert "Common" in reduced.values


class TestPrepareFeatures:
    """Tests for prepare_features().

    Most tests start from the same single-row survey record, so
    ``_survey_frame`` builds it once and each test only spells out the
    columns it actually cares about.
    """

    @staticmethod
    def _survey_frame(**overrides):
        """Return a fresh one-row survey DataFrame.

        Args:
            **overrides: Column name mapped to a list of values. An existing
                column is replaced in place (keeping its position); an
                unknown column name is appended after the defaults.

        Returns:
            A new DataFrame built from the default row plus ``overrides``.
        """
        columns = {
            "Country": ["India"],
            "YearsCode": [5.0],
            "WorkExp": [3.0],
            "EdLevel": ["Other"],
            "DevType": ["Developer, back-end"],
            "Industry": ["Software Development"],
            "Age": ["25-34 years old"],
            "ICorPM": ["Individual contributor"],
            "OrgSize": ["20 to 99 employees"],
            "Employment": ["Employed"],
        }
        columns.update(overrides)
        return pd.DataFrame(columns)

    def test_returns_dataframe_with_numeric_columns(self):
        """Output is a DataFrame with YearsCode and WorkExp as numeric columns."""
        result = prepare_features(self._survey_frame())
        assert isinstance(result, pd.DataFrame)
        for column in ("YearsCode", "WorkExp"):
            assert column in result.columns
            # Presence alone is not enough: the column must stay numeric.
            assert pd.api.types.is_numeric_dtype(result[column])

    def test_fills_missing_numeric_with_zero(self):
        """Missing numeric values are filled with 0."""
        df = self._survey_frame(YearsCode=[np.nan], WorkExp=[np.nan])
        result = prepare_features(df)
        assert result["YearsCode"].iloc[0] == 0.0
        assert result["WorkExp"].iloc[0] == 0.0

    def test_one_hot_encodes_categorical_columns(self):
        """Categorical columns are one-hot encoded."""
        # Two rows with differing categories so encoded columns can appear.
        df = pd.DataFrame(
            {
                "Country": ["India", "Germany"],
                "YearsCode": [5.0, 10.0],
                "WorkExp": [3.0, 8.0],
                "EdLevel": ["Other", "Other"],
                "DevType": ["Developer, back-end", "Developer, front-end"],
                "Industry": ["Software Development", "Healthcare"],
                "Age": ["25-34 years old", "35-44 years old"],
                "ICorPM": ["Individual contributor", "People manager"],
                "OrgSize": ["20 to 99 employees", "100 to 499 employees"],
                "Employment": ["Employed", "Employed"],
            }
        )
        result = prepare_features(df)
        # One-hot columns are recognised by the "_" in their name; the plain
        # numeric columns are excluded explicitly.
        numeric_cols = ("YearsCode", "WorkExp")
        categorical_cols = [
            c for c in result.columns if "_" in c and c not in numeric_cols
        ]
        assert len(categorical_cols) > 0

    def test_renames_legacy_years_code_pro_column(self):
        """Legacy YearsCodePro column is renamed to YearsCode."""
        # Same default row, but carrying the value under the legacy name.
        df = self._survey_frame().rename(columns={"YearsCode": "YearsCodePro"})
        result = prepare_features(df)
        assert "YearsCode" in result.columns
        assert "YearsCodePro" not in result.columns

    def test_fills_missing_categorical_with_unknown(self):
        """Missing categorical values are filled with 'Unknown'."""
        categorical = (
            "Country",
            "EdLevel",
            "DevType",
            "Industry",
            "Age",
            "ICorPM",
            "OrgSize",
            "Employment",
        )
        df = self._survey_frame(**{name: [None] for name in categorical})
        result = prepare_features(df)
        # Categoricals filled with "Unknown" → one-hot encodes "Unknown"
        unknown_cols = [c for c in result.columns if "Unknown" in c]
        assert len(unknown_cols) > 0

    def test_different_inputs_produce_different_encodings(self):
        """Different categorical values produce distinct one-hot encodings."""
        # The two frames differ only in Country.
        df_usa = self._survey_frame(Country=["United States of America"])
        df_deu = self._survey_frame(Country=["Germany"])

        enc_usa = prepare_features(df_usa)
        enc_deu = prepare_features(df_deu)

        assert not enc_usa.equals(enc_deu), (
            "USA and Germany inputs produced identical encodings — "
            "categorical features are not being encoded"
        )

    def test_does_not_modify_original(self):
        """prepare_features does not modify the input DataFrame."""
        df = self._survey_frame()
        # Snapshot the whole frame so a change to any column, value, or
        # dtype is detected, not just a change to one cell.
        snapshot = df.copy(deep=True)
        prepare_features(df)
        pd.testing.assert_frame_equal(df, snapshot)