# developer_salary_prediction/src/preprocessing.py
"""Data preprocessing utilities for consistent feature engineering."""
from pathlib import Path
import pandas as pd
import yaml
# Read the model configuration a single time, when the module is first
# imported, so every helper below shares the same parsed settings.
_config_path = Path("config/model_parameters.yaml")
with _config_path.open("r") as f:
    _config = yaml.safe_load(f)
def _get_other_category() -> str:
    """Return the canonical 'Other' category label from the config.

    Falls back to the literal string "Other" when the config file does
    not define ``features.cardinality.other_category``.
    """
    cardinality_cfg = _config["features"]["cardinality"]
    return cardinality_cfg.get("other_category", "Other")
def normalize_other_categories(series: pd.Series) -> pd.Series:
    """Collapse 'Other ...' variants into the canonical 'Other' label.

    Values such as 'Other (please specify):' or 'Other:' are all mapped
    to the standard category name taken from the config. The word
    boundary in the pattern keeps words like 'Others' untouched.
    """
    canonical = _get_other_category()
    pattern = r"^Other\b.*$"
    return series.replace(to_replace=pattern, value=canonical, regex=True)
def reduce_cardinality(
    series: pd.Series,
    max_categories: "int | None" = None,
    min_frequency: "int | None" = None,
) -> pd.Series:
    """
    Reduce cardinality by grouping rare categories into 'Other'.

    Args:
        series: Pandas Series with categorical values
        max_categories: Maximum number of categories to keep
                        (default: from config)
        min_frequency: Minimum occurrences for a category to be kept
                       (default: from config)

    Returns:
        Series with rare categories replaced by 'Other'
    """
    other_name = _get_other_category()

    # Fall back to the configured defaults when the caller omits limits.
    cardinality_cfg = _config["features"]["cardinality"]
    if max_categories is None:
        max_categories = cardinality_cfg["max_categories"]
    if min_frequency is None:
        min_frequency = cardinality_cfg["min_frequency"]

    # Normalize "Other" variants BEFORE counting so their frequencies pool
    # into a single category instead of being counted separately.
    series = normalize_other_categories(series)

    value_counts = series.value_counts()

    # Keep a category only when it satisfies BOTH criteria:
    # 1. Among the top `max_categories` by frequency
    # 2. Occurs at least `min_frequency` times
    top_categories = value_counts.head(max_categories)
    kept_categories = set(top_categories[top_categories >= min_frequency].index)

    # Vectorized replacement: `isin` + `where` is O(n) with set-based
    # lookups, replacing the original per-row Python lambda that did an
    # O(k) list-membership test for every element. NaN values are not in
    # the kept set, so they also map to 'Other' (same as before).
    return series.where(series.isin(kept_categories), other_name)
def prepare_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Apply consistent feature transformations for both training and inference.

    This function ensures the same preprocessing steps run during training
    and inference, preventing data leakage and inconsistencies.

    Args:
        df: DataFrame with columns: Country, YearsCode, WorkExp, EdLevel,
            DevType, Industry, Age, ICorPM, OrgSize, Employment.
            NOTE: During training, cardinality reduction should be applied
            to df BEFORE calling this function. During inference,
            valid_categories.yaml ensures only valid (already-reduced)
            categories are used.

    Returns:
        DataFrame with one-hot encoded features ready for model input

    Note:
        - Fills missing values with defaults (0 for numeric, "Unknown"
          for categorical)
        - Normalizes Unicode apostrophes to regular apostrophes
        - Applies one-hot encoding (drop_first per config for training;
          drop_first=False for single-row inference so every category
          column is created, then aligned downstream by reindex)
        - Output columns look like: YearsCode, WorkExp, Country_X,
          EdLevel_Y, DevType_Z, Industry_W, Age_V, ICorPM_U
        - Does NOT apply cardinality reduction (must be done beforehand)
    """
    # Work on a copy so the caller's frame is never mutated.
    out = df.copy()

    categorical_cols = [
        "Country",
        "EdLevel",
        "DevType",
        "Industry",
        "Age",
        "ICorPM",
        "OrgSize",
        "Employment",
    ]

    # Per-column text normalization (columns are independent, so both
    # steps can run together per column):
    #   1. Replace the Unicode right single quote (\u2019) with a plain
    #      apostrophe for consistency.
    #   2. Collapse "Other" variants, e.g. "Other (please specify):"
    #      -> "Other".
    for col in categorical_cols:
        if col not in out.columns:
            continue
        out[col] = out[col].str.replace("\u2019", "'", regex=False)
        out[col] = normalize_other_categories(out[col])

    # Accept the legacy column name (YearsCodePro -> YearsCode).
    if "YearsCodePro" in out.columns and "YearsCode" not in out.columns:
        out = out.rename(columns={"YearsCodePro": "YearsCode"})

    # Fill missing values: 0 for numeric columns, "Unknown" for the
    # categorical ones. Required columns that are absent raise KeyError.
    fill_defaults = {"YearsCode": 0, "WorkExp": 0}
    fill_defaults.update({col: "Unknown" for col in categorical_cols})
    for col, default in fill_defaults.items():
        out[col] = out[col].fillna(default)

    # NOTE: Cardinality reduction is NOT applied here.
    # It should be applied during training BEFORE calling this function.
    # During inference, valid_categories.yaml ensures only valid values
    # are used.

    # Select features in the fixed order expected by the model.
    feature_cols = ["Country", "YearsCode", "WorkExp"] + categorical_cols[1:]
    features = out[feature_cols]

    # One-hot encode the categorical variables.
    # Single-row input is treated as inference: drop_first=False so every
    # category column exists; infer.py's reindex aligns with training
    # columns. Multi-row input (training) uses the configured setting.
    single_row = len(features) == 1
    use_drop_first = (
        False if single_row else _config["features"]["encoding"]["drop_first"]
    )
    return pd.get_dummies(features, drop_first=use_drop_first)