"""Data preprocessing utilities for consistent feature engineering."""
from pathlib import Path
from typing import Optional

import pandas as pd
import yaml
# Load configuration once at module level so every helper shares one parsed
# copy. NOTE: this performs file I/O at import time; importing this module
# fails if config/model_parameters.yaml is absent relative to the CWD.
_config_path = Path("config/model_parameters.yaml")
# Explicit UTF-8: YAML config may contain non-ASCII category names, and the
# platform default encoding is not guaranteed to be UTF-8.
with open(_config_path, "r", encoding="utf-8") as f:
    _config = yaml.safe_load(f)
def _get_other_category() -> str:
    """Return the canonical 'Other' category label from the loaded config.

    Falls back to the literal string "Other" when the config does not
    define features.cardinality.other_category.
    """
    cardinality_cfg = _config["features"]["cardinality"]
    return cardinality_cfg.get("other_category", "Other")
def normalize_other_categories(series: pd.Series) -> pd.Series:
    """Collapse 'Other ...' variants onto the canonical 'Other' label.

    Any value that starts with the word "Other" (e.g.
    'Other (please specify):', 'Other:') is replaced by the standard
    category name taken from the config. Returns a new Series; the
    input is not modified.
    """
    canonical = _get_other_category()
    # \b keeps words like "Otherwise" intact: the boundary only matches
    # when "Other" is a complete leading word.
    pattern = r"^Other\b.*$"
    return series.replace(to_replace=pattern, value=canonical, regex=True)
def reduce_cardinality(
    series: pd.Series,
    max_categories: Optional[int] = None,
    min_frequency: Optional[int] = None,
) -> pd.Series:
    """
    Reduce cardinality by grouping rare categories into 'Other'.

    Args:
        series: Pandas Series with categorical values
        max_categories: Maximum number of categories to keep
            (default: from config)
        min_frequency: Minimum occurrences for a category to be kept
            (default: from config)

    Returns:
        Series with rare categories (and missing values, which can never
        match a kept category) replaced by the canonical 'Other' name.
    """
    other_name = _get_other_category()

    # Use config defaults if not provided
    if max_categories is None:
        max_categories = _config["features"]["cardinality"]["max_categories"]
    if min_frequency is None:
        min_frequency = _config["features"]["cardinality"]["min_frequency"]

    # Normalize "Other" variants before counting frequencies, so all
    # variants pool into a single frequency bucket.
    series = normalize_other_categories(series)

    # Keep only categories that meet both criteria:
    # 1. In top max_categories by frequency
    # 2. Have at least min_frequency occurrences
    value_counts = series.value_counts()
    top_categories = value_counts.head(max_categories)
    # A set gives O(1) membership; the original list scan was O(k) per row.
    kept_categories = set(top_categories[top_categories >= min_frequency].index)

    # Vectorized replacement: values not in the kept set (including NaN,
    # which value_counts drops and so is never kept) become 'Other' —
    # identical to the original element-wise apply.
    return series.where(series.isin(kept_categories), other_name)
def prepare_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Apply consistent feature transformations for both training and inference.

    This function ensures that the same preprocessing steps are applied
    during training and inference, preventing data leakage and inconsistencies.

    Args:
        df: DataFrame with columns: Country, YearsCode, WorkExp, EdLevel,
            DevType, Industry, Age, ICorPM, OrgSize, Employment.
            NOTE: During training, cardinality reduction should be applied to df
            BEFORE calling this function. During inference, valid_categories.yaml
            ensures only valid (already-reduced) categories are used.

    Returns:
        DataFrame with one-hot encoded features ready for model input

    Note:
        - Fills missing values with defaults (0 for numeric, "Unknown" for
          categorical); a missing required column raises KeyError.
        - Normalizes Unicode apostrophes to regular apostrophes
        - Applies one-hot encoding with drop_first=True to avoid
          multicollinearity (training only; see below)
        - Column names in output will be like: YearsCode, WorkExp, Country_X,
          EdLevel_Y, DevType_Z, Industry_W, Age_V, ICorPM_U
        - Does NOT apply cardinality reduction (must be done before calling this)
    """
    # Create a copy to avoid modifying the original
    df_processed = df.copy()

    categorical_cols = [
        "Country",
        "EdLevel",
        "DevType",
        "Industry",
        "Age",
        "ICorPM",
        "OrgSize",
        "Employment",
    ]

    # Single cleanup pass per categorical column (the original looped twice):
    # 1. normalize Unicode apostrophes (\u2019 -> ') for consistency
    # 2. collapse "Other (please specify):" etc. to the canonical 'Other'
    for col in categorical_cols:
        if col in df_processed.columns:
            cleaned = df_processed[col].str.replace("\u2019", "'", regex=False)
            df_processed[col] = normalize_other_categories(cleaned)

    # Handle legacy column name (YearsCodePro -> YearsCode)
    if (
        "YearsCodePro" in df_processed.columns
        and "YearsCode" not in df_processed.columns
    ):
        df_processed = df_processed.rename(columns={"YearsCodePro": "YearsCode"})

    # Fill missing values with defaults: 0 for numeric, "Unknown" for
    # categorical. Per-column access (not DataFrame.fillna with a dict)
    # deliberately raises KeyError when a required column is absent.
    fill_defaults = {"YearsCode": 0, "WorkExp": 0}
    fill_defaults.update({col: "Unknown" for col in categorical_cols})
    for col, default in fill_defaults.items():
        df_processed[col] = df_processed[col].fillna(default)

    # NOTE: Cardinality reduction is NOT applied here
    # It should be applied during training BEFORE calling this function
    # During inference, valid_categories.yaml ensures only valid values are used

    # Select only the features we need
    feature_cols = [
        "Country",
        "YearsCode",
        "WorkExp",
        "EdLevel",
        "DevType",
        "Industry",
        "Age",
        "ICorPM",
        "OrgSize",
        "Employment",
    ]
    df_features = df_processed[feature_cols]

    # Apply one-hot encoding for categorical variables.
    # A single-row frame is assumed to be inference: use drop_first=False so
    # every category column is created (infer.py reindexes against training
    # columns). Multi-row (training) input follows the config value.
    is_inference = len(df_features) == 1
    drop_first = (
        False if is_inference else _config["features"]["encoding"]["drop_first"]
    )
    return pd.get_dummies(df_features, drop_first=drop_first)