"""Data preprocessing utilities for consistent feature engineering."""
from pathlib import Path
from typing import Optional

import pandas as pd
import yaml
# Load configuration once at module level so every helper shares one parsed
# copy. NOTE: this performs file I/O at import time; importing this module
# fails if config/model_parameters.yaml is absent relative to the CWD.
_config_path = Path("config/model_parameters.yaml")
# Explicit UTF-8: YAML config may contain non-ASCII category names, and the
# platform default encoding is not guaranteed to be UTF-8.
with open(_config_path, "r", encoding="utf-8") as f:
    _config = yaml.safe_load(f)
def _get_other_category() -> str:
    """Return the canonical 'Other' category label from the loaded config.

    Falls back to the literal string "Other" when the config does not
    define features.cardinality.other_category.
    """
    cardinality_cfg = _config["features"]["cardinality"]
    return cardinality_cfg.get("other_category", "Other")
def normalize_other_categories(series: pd.Series) -> pd.Series:
    """Collapse 'Other ...' variants onto the canonical 'Other' label.

    Any value that starts with the word "Other" (e.g.
    'Other (please specify):', 'Other:') is replaced by the standard
    category name taken from the config. Returns a new Series; the
    input is not modified.
    """
    canonical = _get_other_category()
    # \b keeps words like "Otherwise" intact: the boundary only matches
    # when "Other" is a complete leading word.
    pattern = r"^Other\b.*$"
    return series.replace(to_replace=pattern, value=canonical, regex=True)
def reduce_cardinality(
    series: pd.Series,
    max_categories: Optional[int] = None,
    min_frequency: Optional[int] = None,
) -> pd.Series:
    """
    Reduce cardinality by grouping rare categories into 'Other'.

    Args:
        series: Pandas Series with categorical values
        max_categories: Maximum number of categories to keep
            (default: from config)
        min_frequency: Minimum occurrences for a category to be kept
            (default: from config)

    Returns:
        Series with rare categories (and missing values, which can never
        match a kept category) replaced by the canonical 'Other' name.
    """
    other_name = _get_other_category()

    # Use config defaults if not provided
    if max_categories is None:
        max_categories = _config["features"]["cardinality"]["max_categories"]
    if min_frequency is None:
        min_frequency = _config["features"]["cardinality"]["min_frequency"]

    # Normalize "Other" variants before counting frequencies, so all
    # variants pool into a single frequency bucket.
    series = normalize_other_categories(series)

    # Keep only categories that meet both criteria:
    # 1. In top max_categories by frequency
    # 2. Have at least min_frequency occurrences
    value_counts = series.value_counts()
    top_categories = value_counts.head(max_categories)
    # A set gives O(1) membership; the original list scan was O(k) per row.
    kept_categories = set(top_categories[top_categories >= min_frequency].index)

    # Vectorized replacement: values not in the kept set (including NaN,
    # which value_counts drops and so is never kept) become 'Other' —
    # identical to the original element-wise apply.
    return series.where(series.isin(kept_categories), other_name)
def prepare_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Apply consistent feature transformations for both training and inference.

    This function ensures that the same preprocessing steps are applied
    during training and inference, preventing data leakage and inconsistencies.

    Args:
        df: DataFrame with columns: Country, YearsCode, WorkExp, EdLevel,
            DevType, Industry, Age, ICorPM, OrgSize, Employment.
            NOTE: During training, cardinality reduction should be applied to df
            BEFORE calling this function. During inference, valid_categories.yaml
            ensures only valid (already-reduced) categories are used.

    Returns:
        DataFrame with one-hot encoded features ready for model input

    Note:
        - Fills missing values with defaults (0 for numeric, "Unknown" for
          categorical); a missing required column raises KeyError.
        - Normalizes Unicode apostrophes to regular apostrophes
        - Applies one-hot encoding with drop_first=True to avoid
          multicollinearity (training only; see below)
        - Column names in output will be like: YearsCode, WorkExp, Country_X,
          EdLevel_Y, DevType_Z, Industry_W, Age_V, ICorPM_U
        - Does NOT apply cardinality reduction (must be done before calling this)
    """
    # Create a copy to avoid modifying the original
    df_processed = df.copy()

    categorical_cols = [
        "Country",
        "EdLevel",
        "DevType",
        "Industry",
        "Age",
        "ICorPM",
        "OrgSize",
        "Employment",
    ]

    # Single cleanup pass per categorical column (the original looped twice):
    # 1. normalize Unicode apostrophes (\u2019 -> ') for consistency
    # 2. collapse "Other (please specify):" etc. to the canonical 'Other'
    for col in categorical_cols:
        if col in df_processed.columns:
            cleaned = df_processed[col].str.replace("\u2019", "'", regex=False)
            df_processed[col] = normalize_other_categories(cleaned)

    # Handle legacy column name (YearsCodePro -> YearsCode)
    if (
        "YearsCodePro" in df_processed.columns
        and "YearsCode" not in df_processed.columns
    ):
        df_processed = df_processed.rename(columns={"YearsCodePro": "YearsCode"})

    # Fill missing values with defaults: 0 for numeric, "Unknown" for
    # categorical. Per-column access (not DataFrame.fillna with a dict)
    # deliberately raises KeyError when a required column is absent.
    fill_defaults = {"YearsCode": 0, "WorkExp": 0}
    fill_defaults.update({col: "Unknown" for col in categorical_cols})
    for col, default in fill_defaults.items():
        df_processed[col] = df_processed[col].fillna(default)

    # NOTE: Cardinality reduction is NOT applied here
    # It should be applied during training BEFORE calling this function
    # During inference, valid_categories.yaml ensures only valid values are used

    # Select only the features we need
    feature_cols = [
        "Country",
        "YearsCode",
        "WorkExp",
        "EdLevel",
        "DevType",
        "Industry",
        "Age",
        "ICorPM",
        "OrgSize",
        "Employment",
    ]
    df_features = df_processed[feature_cols]

    # Apply one-hot encoding for categorical variables.
    # A single-row frame is assumed to be inference: use drop_first=False so
    # every category column is created (infer.py reindexes against training
    # columns). Multi-row (training) input follows the config value.
    is_inference = len(df_features) == 1
    drop_first = (
        False if is_inference else _config["features"]["encoding"]["drop_first"]
    )
    return pd.get_dummies(df_features, drop_first=drop_first)