Spaces:
Sleeping
Sleeping
| """Tests for src/preprocessing.py - Feature engineering utilities.""" | |
| import numpy as np | |
| import pandas as pd | |
| from src.preprocessing import ( | |
| normalize_other_categories, | |
| prepare_features, | |
| reduce_cardinality, | |
| ) | |
class TestNormalizeOtherCategories:
    """Checks for normalize_other_categories()."""

    def test_replaces_other_please_specify(self):
        """The 'Other (please specify):' label comes back as 'Other'."""
        raw = pd.Series(["Other (please specify):", "Developer, back-end"])

        normalized = normalize_other_categories(raw)

        # The verbose "Other" label is collapsed; its neighbour is untouched.
        assert normalized.iloc[0] == "Other"
        assert normalized.iloc[1] == "Developer, back-end"

    def test_replaces_other_colon(self):
        """The 'Other:' label comes back as 'Other'."""
        raw = pd.Series(["Other:", "Software Development"])

        normalized = normalize_other_categories(raw)

        assert normalized.iloc[0] == "Other"

    def test_leaves_non_other_unchanged(self):
        """Values that are not an 'Other' variant pass through as-is."""
        expected = ["Developer, back-end", "Software Development", "India"]

        normalized = normalize_other_categories(pd.Series(expected))

        assert list(normalized) == expected

    def test_preserves_exact_other(self):
        """A value that is already exactly 'Other' stays 'Other'."""
        normalized = normalize_other_categories(pd.Series(["Other"]))

        assert normalized.iloc[0] == "Other"
class TestReduceCardinality:
    """Checks for reduce_cardinality()."""

    def test_groups_rare_categories(self):
        """Categories below the frequency threshold collapse into 'Other'."""
        # One dominant label (100 rows) plus three labels seen once each.
        rare_labels = ["Rare1", "Rare2", "Rare3"]
        column = pd.Series(["Common"] * 100 + rare_labels)

        reduced = reduce_cardinality(column, max_categories=5, min_frequency=10)

        assert "Common" in reduced.values
        assert "Rare1" not in reduced.values
        # Each of the three singleton rows is relabelled.
        other_count = (reduced == "Other").sum()
        assert other_count == 3

    def test_keeps_frequent_categories(self):
        """Categories above the frequency threshold survive unchanged."""
        column = pd.Series(["A"] * 100 + ["B"] * 80 + ["C"] * 60)

        reduced = reduce_cardinality(column, max_categories=5, min_frequency=50)

        surviving = set(reduced.unique())
        assert surviving == {"A", "B", "C"}

    def test_uses_config_defaults_when_no_args(self):
        """Calling with only the series still groups a rare category."""
        column = pd.Series(["Common"] * 200 + ["Rare"] * 2)

        # No max_categories / min_frequency passed: the function must supply
        # its own thresholds.
        reduced = reduce_cardinality(column)

        # With those thresholds the 2-row "Rare" label is expected to be
        # grouped away while the 200-row "Common" label remains.
        assert "Rare" not in reduced.values
        assert "Common" in reduced.values
class TestPrepareFeatures:
    """Tests for prepare_features()."""

    # Names of the categorical columns in the shared one-row fixture.
    _CATEGORICAL_COLUMNS = (
        "Country",
        "EdLevel",
        "DevType",
        "Industry",
        "Age",
        "ICorPM",
        "OrgSize",
        "Employment",
    )

    @staticmethod
    def _single_row_frame(**overrides):
        """Build the one-row input frame shared by most tests.

        Args:
            **overrides: Column name -> replacement value for that column.

        Returns:
            A one-row DataFrame whose column order is always the same,
            regardless of which columns were overridden.

        Raises:
            KeyError: If an override names a column the fixture does not have
                (guards against silently ignored typos).
        """
        row = {
            "Country": "India",
            "YearsCode": 5.0,
            "WorkExp": 3.0,
            "EdLevel": "Other",
            "DevType": "Developer, back-end",
            "Industry": "Software Development",
            "Age": "25-34 years old",
            "ICorPM": "Individual contributor",
            "OrgSize": "20 to 99 employees",
            "Employment": "Employed",
        }
        unexpected = overrides.keys() - row.keys()
        if unexpected:
            raise KeyError(f"unknown fixture columns: {sorted(unexpected)}")
        # update() keeps the original key order, so column order is stable.
        row.update(overrides)
        return pd.DataFrame({column: [value] for column, value in row.items()})

    def test_returns_dataframe_with_numeric_columns(self):
        """Output contains YearsCode and WorkExp as numeric columns."""
        result = prepare_features(self._single_row_frame())

        for column in ("YearsCode", "WorkExp"):
            assert column in result.columns
            # Presence alone does not show the column is numeric.
            assert pd.api.types.is_numeric_dtype(result[column])

    def test_fills_missing_numeric_with_zero(self):
        """Missing numeric values are filled with 0."""
        df = self._single_row_frame(YearsCode=np.nan, WorkExp=np.nan)

        result = prepare_features(df)

        assert result["YearsCode"].iloc[0] == 0.0
        assert result["WorkExp"].iloc[0] == 0.0

    def test_one_hot_encodes_categorical_columns(self):
        """Categorical columns are one-hot encoded."""
        df = pd.DataFrame(
            {
                "Country": ["India", "Germany"],
                "YearsCode": [5.0, 10.0],
                "WorkExp": [3.0, 8.0],
                "EdLevel": ["Other", "Other"],
                "DevType": ["Developer, back-end", "Developer, front-end"],
                "Industry": ["Software Development", "Healthcare"],
                "Age": ["25-34 years old", "35-44 years old"],
                "ICorPM": ["Individual contributor", "People manager"],
                "OrgSize": ["20 to 99 employees", "100 to 499 employees"],
                "Employment": ["Employed", "Employed"],
            }
        )

        result = prepare_features(df)

        # One-hot columns are recognised by the "_" in their name; the two
        # numeric passthrough columns are excluded explicitly.
        numeric_cols = ("YearsCode", "WorkExp")
        categorical_cols = [
            c for c in result.columns if "_" in c and c not in numeric_cols
        ]
        assert len(categorical_cols) > 0

    def test_renames_legacy_years_code_pro_column(self):
        """Legacy YearsCodePro column is renamed to YearsCode."""
        # Same fixture row, but carrying the legacy column name on input.
        df = self._single_row_frame().rename(
            columns={"YearsCode": "YearsCodePro"}
        )

        result = prepare_features(df)

        assert "YearsCode" in result.columns
        assert "YearsCodePro" not in result.columns

    def test_fills_missing_categorical_with_unknown(self):
        """Missing categorical values are filled with 'Unknown'."""
        all_missing = dict.fromkeys(self._CATEGORICAL_COLUMNS, None)
        df = self._single_row_frame(**all_missing)

        result = prepare_features(df)

        # Categoricals filled with "Unknown" → one-hot encodes "Unknown"
        unknown_cols = [c for c in result.columns if "Unknown" in c]
        assert len(unknown_cols) > 0

    def test_different_inputs_produce_different_encodings(self):
        """Different categorical values produce distinct one-hot encodings."""
        df_usa = self._single_row_frame(Country="United States of America")
        df_deu = self._single_row_frame(Country="Germany")

        enc_usa = prepare_features(df_usa)
        enc_deu = prepare_features(df_deu)

        assert not enc_usa.equals(enc_deu), (
            "USA and Germany inputs produced identical encodings — "
            "categorical features are not being encoded"
        )

    def test_does_not_modify_original(self):
        """prepare_features does not modify the input DataFrame."""
        df = self._single_row_frame()
        snapshot = df.copy(deep=True)

        prepare_features(df)

        # Compare the whole frame (values, dtypes, column names and order),
        # not just a single cell, so any in-place change is detected.
        pd.testing.assert_frame_equal(df, snapshot)