# developer_salary_prediction/src/preprocessing.py
"""Data preprocessing utilities for consistent feature engineering."""
from pathlib import Path
import pandas as pd
import yaml
# Read the model configuration a single time, when the module is first
# imported, so every helper below shares the same parsed settings.
_config_path = Path("config/model_parameters.yaml")
with _config_path.open("r") as f:
    _config = yaml.safe_load(f)
def _get_other_category() -> str:
    """Return the canonical 'Other' category label from the config.

    Falls back to the literal string "Other" when the config file does
    not define ``features.cardinality.other_category``.
    """
    cardinality_cfg = _config["features"]["cardinality"]
    return cardinality_cfg.get("other_category", "Other")
def normalize_other_categories(series: pd.Series) -> pd.Series:
    """Collapse 'Other ...' variants into the canonical 'Other' label.

    Values such as 'Other (please specify):' or 'Other:' are all mapped
    to the standard category name taken from the config. The word
    boundary in the pattern keeps words like 'Others' untouched.
    """
    canonical = _get_other_category()
    pattern = r"^Other\b.*$"
    return series.replace(to_replace=pattern, value=canonical, regex=True)
def reduce_cardinality(
    series: pd.Series,
    max_categories: "int | None" = None,
    min_frequency: "int | None" = None,
) -> pd.Series:
    """
    Reduce cardinality by grouping rare categories into 'Other'.

    Args:
        series: Pandas Series with categorical values
        max_categories: Maximum number of categories to keep
                        (default: from config)
        min_frequency: Minimum occurrences for a category to be kept
                       (default: from config)

    Returns:
        Series with rare categories replaced by 'Other'
    """
    other_name = _get_other_category()

    # Fall back to the configured defaults when the caller omits limits.
    cardinality_cfg = _config["features"]["cardinality"]
    if max_categories is None:
        max_categories = cardinality_cfg["max_categories"]
    if min_frequency is None:
        min_frequency = cardinality_cfg["min_frequency"]

    # Normalize "Other" variants BEFORE counting so their frequencies pool
    # into a single category instead of being counted separately.
    series = normalize_other_categories(series)

    value_counts = series.value_counts()

    # Keep a category only when it satisfies BOTH criteria:
    # 1. Among the top `max_categories` by frequency
    # 2. Occurs at least `min_frequency` times
    top_categories = value_counts.head(max_categories)
    kept_categories = set(top_categories[top_categories >= min_frequency].index)

    # Vectorized replacement: `isin` + `where` is O(n) with set-based
    # lookups, replacing the original per-row Python lambda that did an
    # O(k) list-membership test for every element. NaN values are not in
    # the kept set, so they also map to 'Other' (same as before).
    return series.where(series.isin(kept_categories), other_name)
def prepare_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Apply consistent feature transformations for both training and inference.

    This function ensures the same preprocessing steps run during training
    and inference, preventing data leakage and inconsistencies.

    Args:
        df: DataFrame with columns: Country, YearsCode, WorkExp, EdLevel,
            DevType, Industry, Age, ICorPM, OrgSize, Employment.
            NOTE: During training, cardinality reduction should be applied
            to df BEFORE calling this function. During inference,
            valid_categories.yaml ensures only valid (already-reduced)
            categories are used.

    Returns:
        DataFrame with one-hot encoded features ready for model input

    Note:
        - Fills missing values with defaults (0 for numeric, "Unknown"
          for categorical)
        - Normalizes Unicode apostrophes to regular apostrophes
        - Applies one-hot encoding (drop_first per config for training;
          drop_first=False for single-row inference so every category
          column is created, then aligned downstream by reindex)
        - Output columns look like: YearsCode, WorkExp, Country_X,
          EdLevel_Y, DevType_Z, Industry_W, Age_V, ICorPM_U
        - Does NOT apply cardinality reduction (must be done beforehand)
    """
    # Work on a copy so the caller's frame is never mutated.
    out = df.copy()

    categorical_cols = [
        "Country",
        "EdLevel",
        "DevType",
        "Industry",
        "Age",
        "ICorPM",
        "OrgSize",
        "Employment",
    ]

    # Per-column text normalization (columns are independent, so both
    # steps can run together per column):
    #   1. Replace the Unicode right single quote (\u2019) with a plain
    #      apostrophe for consistency.
    #   2. Collapse "Other" variants, e.g. "Other (please specify):"
    #      -> "Other".
    for col in categorical_cols:
        if col not in out.columns:
            continue
        out[col] = out[col].str.replace("\u2019", "'", regex=False)
        out[col] = normalize_other_categories(out[col])

    # Accept the legacy column name (YearsCodePro -> YearsCode).
    if "YearsCodePro" in out.columns and "YearsCode" not in out.columns:
        out = out.rename(columns={"YearsCodePro": "YearsCode"})

    # Fill missing values: 0 for numeric columns, "Unknown" for the
    # categorical ones. Required columns that are absent raise KeyError.
    fill_defaults = {"YearsCode": 0, "WorkExp": 0}
    fill_defaults.update({col: "Unknown" for col in categorical_cols})
    for col, default in fill_defaults.items():
        out[col] = out[col].fillna(default)

    # NOTE: Cardinality reduction is NOT applied here.
    # It should be applied during training BEFORE calling this function.
    # During inference, valid_categories.yaml ensures only valid values
    # are used.

    # Select features in the fixed order expected by the model.
    feature_cols = ["Country", "YearsCode", "WorkExp"] + categorical_cols[1:]
    features = out[feature_cols]

    # One-hot encode the categorical variables.
    # Single-row input is treated as inference: drop_first=False so every
    # category column exists; infer.py's reindex aligns with training
    # columns. Multi-row input (training) uses the configured setting.
    single_row = len(features) == 1
    use_drop_first = (
        False if single_row else _config["features"]["encoding"]["drop_first"]
    )
    return pd.get_dummies(features, drop_first=use_drop_first)