File size: 6,417 Bytes
f1718f3
 
 
 
 
 
 
 
 
 
 
 
a32e584
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f1718f3
a32e584
f1718f3
 
 
 
 
 
 
 
 
 
 
 
 
 
a32e584
 
f1718f3
 
a32e584
f1718f3
a32e584
 
 
 
f1718f3
 
 
 
 
 
 
 
 
 
a32e584
 
f1718f3
 
 
 
 
 
 
 
 
 
1a584f9
eeeaee6
f1718f3
 
 
 
 
 
 
 
 
 
 
a32e584
f1718f3
 
 
 
 
 
 
1a584f9
 
 
 
 
 
 
 
eeeaee6
1a584f9
 
a32e584
 
 
 
 
 
1a584f9
f1718f3
a32e584
f1718f3
55cdb7e
a32e584
 
 
 
55cdb7e
f1718f3
 
 
07d23c4
f1718f3
 
 
55cdb7e
07d23c4
a32e584
1a584f9
eeeaee6
f1718f3
 
 
 
 
 
a32e584
 
 
 
 
 
 
 
 
1a584f9
eeeaee6
a32e584
f1718f3
 
 
 
 
 
 
a32e584
 
 
f1718f3
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
"""Data preprocessing utilities for consistent feature engineering."""

from pathlib import Path
import pandas as pd
import yaml

# Load configuration once at module level so every helper shares a single
# parsed copy. Explicit encoding avoids locale-dependent decoding (e.g.
# cp1252 on Windows) if the YAML ever contains non-ASCII text.
_config_path = Path("config/model_parameters.yaml")
with open(_config_path, "r", encoding="utf-8") as f:
    _config = yaml.safe_load(f)


def _get_other_category() -> str:
    """Return the canonical 'Other' category label from the module config."""
    cardinality_cfg = _config["features"]["cardinality"]
    return cardinality_cfg.get("other_category", "Other")


def normalize_other_categories(series: pd.Series) -> pd.Series:
    """
    Collapse every variant of 'Other' into the canonical category name.

    Values such as 'Other (please specify):' or 'Other:' are rewritten
    to the standard 'Other' label taken from the module config.
    """
    canonical = _get_other_category()
    # Anchored pattern: only strings *starting* with the word "Other" are
    # rewritten; \b prevents matching e.g. "Otherwise".
    mapping = {r"^Other\b.*$": canonical}
    return series.replace(to_replace=mapping, regex=True)


def reduce_cardinality(
    series: pd.Series, max_categories: int = None, min_frequency: int = None
) -> pd.Series:
    """
    Reduce cardinality by grouping rare categories into 'Other'.

    Args:
        series: Pandas Series with categorical values
        max_categories: Maximum number of categories to keep
                       (default: from config)
        min_frequency: Minimum occurrences for a category to be kept
                      (default: from config)

    Returns:
        Series with rare categories replaced by 'Other'
    """
    other_name = _get_other_category()

    # Use config defaults if not provided (hoist the nested lookup once).
    cardinality_cfg = _config["features"]["cardinality"]
    if max_categories is None:
        max_categories = cardinality_cfg["max_categories"]
    if min_frequency is None:
        min_frequency = cardinality_cfg["min_frequency"]

    # Normalize "Other" variants before counting frequencies so that all
    # variants pool into a single bucket.
    series = normalize_other_categories(series)

    # Keep only categories that meet both criteria:
    # 1. In top max_categories by frequency
    # 2. Have at least min_frequency occurrences
    value_counts = series.value_counts()
    top_categories = value_counts.head(max_categories)
    kept_categories = top_categories[top_categories >= min_frequency].index

    # Vectorized replacement: isin/where is O(n) with hashed lookup, versus
    # the O(n*k) per-element list scan of apply(lambda x: x in list).
    # NaN values fail isin and therefore also map to 'Other', matching the
    # previous apply-based behavior.
    return series.where(series.isin(kept_categories), other_name)


def prepare_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Apply consistent feature transformations for both training and inference.

    This function ensures that the same preprocessing steps are applied
    during training and inference, preventing data leakage and inconsistencies.

    Args:
        df: DataFrame with columns: Country, YearsCode, WorkExp, EdLevel,
            DevType, Industry, Age, ICorPM, OrgSize, Employment.
            NOTE: During training, cardinality reduction should be applied to df
            BEFORE calling this function. During inference, valid_categories.yaml
            ensures only valid (already-reduced) categories are used.

    Returns:
        DataFrame with one-hot encoded features ready for model input

    Note:
        - Fills missing values with defaults (0 for numeric, "Unknown" for categorical)
        - Normalizes Unicode apostrophes to regular apostrophes
        - Applies one-hot encoding with drop_first=True to avoid multicollinearity
        - Column names in output will be like: YearsCode, WorkExp, Country_X, EdLevel_Y, DevType_Z, Industry_W, Age_V, ICorPM_U
        - Does NOT apply cardinality reduction (must be done before calling this)
    """
    # Create a copy to avoid modifying the original
    df_processed = df.copy()

    _categorical_cols = [
        "Country",
        "EdLevel",
        "DevType",
        "Industry",
        "Age",
        "ICorPM",
        "OrgSize",
        "Employment",
    ]

    # Single pass per categorical column:
    # 1. Normalize Unicode apostrophes (\u2019 -> ') for consistency.
    # 2. Collapse "Other (please specify):"-style variants to "Other".
    for col in _categorical_cols:
        if col in df_processed.columns:
            cleaned = df_processed[col].str.replace("\u2019", "'", regex=False)
            df_processed[col] = normalize_other_categories(cleaned)

    # Handle legacy column name (YearsCodePro -> YearsCode)
    if (
        "YearsCodePro" in df_processed.columns
        and "YearsCode" not in df_processed.columns
    ):
        df_processed = df_processed.rename(columns={"YearsCodePro": "YearsCode"})

    # Fill missing values in one call: 0 for numeric, "Unknown" for categorical
    fill_defaults = {"YearsCode": 0, "WorkExp": 0}
    fill_defaults.update({col: "Unknown" for col in _categorical_cols})
    df_processed = df_processed.fillna(fill_defaults)

    # NOTE: Cardinality reduction is NOT applied here
    # It should be applied during training BEFORE calling this function
    # During inference, valid_categories.yaml ensures only valid values are used

    # Select only the features we need
    feature_cols = [
        "Country",
        "YearsCode",
        "WorkExp",
        "EdLevel",
        "DevType",
        "Industry",
        "Age",
        "ICorPM",
        "OrgSize",
        "Employment",
    ]
    df_features = df_processed[feature_cols]

    # Apply one-hot encoding for categorical variables
    # For inference (single rows), we need drop_first=False to create columns
    # The reindex in infer.py will align with training columns
    # For training (many rows), we use the config value
    is_inference = len(df_features) == 1
    drop_first = (
        False if is_inference else _config["features"]["encoding"]["drop_first"]
    )
    return pd.get_dummies(df_features, drop_first=drop_first)