Spaces:

dima806
/

developer_salary_prediction

Sleeping

App Files Files Community

developer_salary_prediction / tests /test_preprocessing.py

dima806

Upload 39 files

eeeaee6 verified 18 days ago

raw

history blame contribute delete

8.67 kB

	"""Tests for src/preprocessing.py - Feature engineering utilities."""

	import numpy as np
	import pandas as pd

	from src.preprocessing import (
	normalize_other_categories,
	prepare_features,
	reduce_cardinality,
	)


	class TestNormalizeOtherCategories:
	    """Checks on how normalize_other_categories() rewrites 'Other'-style labels."""

	    def test_replaces_other_please_specify(self):
	        """The 'Other (please specify):' label comes back as plain 'Other'."""
	        raw = pd.Series(["Other (please specify):", "Developer, back-end"])
	        cleaned = normalize_other_categories(raw)
	        first, second = cleaned.iloc[0], cleaned.iloc[1]
	        assert first == "Other"
	        # The neighbouring, unrelated label must pass through untouched.
	        assert second == "Developer, back-end"

	    def test_replaces_other_colon(self):
	        """The 'Other:' label comes back as plain 'Other'."""
	        cleaned = normalize_other_categories(
	            pd.Series(["Other:", "Software Development"])
	        )
	        assert cleaned.iloc[0] == "Other"

	    def test_leaves_non_other_unchanged(self):
	        """Labels that are not an 'Other' variant come back exactly as given."""
	        expected = ["Developer, back-end", "Software Development", "India"]
	        cleaned = normalize_other_categories(pd.Series(expected))
	        assert list(cleaned) == expected

	    def test_preserves_exact_other(self):
	        """A label that is already exactly 'Other' stays 'Other'."""
	        cleaned = normalize_other_categories(pd.Series(["Other"]))
	        assert cleaned.iloc[0] == "Other"


	class TestReduceCardinality:
	    """Checks on how reduce_cardinality() groups categories into 'Other'."""

	    def test_groups_rare_categories(self):
	        """Categories below the frequency threshold end up as 'Other'."""
	        # One dominant label (100 rows) plus three labels seen once each.
	        rare_labels = ["Rare1", "Rare2", "Rare3"]
	        column = pd.Series(["Common"] * 100 + rare_labels)
	        reduced = reduce_cardinality(column, max_categories=5, min_frequency=10)
	        assert "Common" in reduced.values
	        assert "Rare1" not in reduced.values
	        # Exactly the three singleton rows were relabelled.
	        other_count = (reduced == "Other").sum()
	        assert other_count == 3

	    def test_keeps_frequent_categories(self):
	        """Categories at or above the frequency threshold are left as they are."""
	        column = pd.Series(["A"] * 100 + ["B"] * 80 + ["C"] * 60)
	        reduced = reduce_cardinality(column, max_categories=5, min_frequency=50)
	        surviving = set(reduced.unique())
	        assert surviving == {"A", "B", "C"}

	    def test_uses_config_defaults_when_no_args(self):
	        """Calling with only the series relies on the function's own defaults."""
	        column = pd.Series(["Common"] * 200 + ["Rare"] * 2)
	        # Deliberately omit max_categories / min_frequency.
	        reduced = reduce_cardinality(column)
	        # With the defaults in effect, the two-row label is expected to be
	        # absorbed while the dominant label survives.
	        assert "Rare" not in reduced.values
	        assert "Common" in reduced.values


	class TestPrepareFeatures:
	    """Tests for prepare_features().

	    Most tests start from the same single-respondent input; it is defined
	    once in ``_BASE_ROW`` and materialised through ``_frame`` so each test
	    only spells out the columns it actually varies.
	    """

	    # Baseline respondent shared by the single-row tests. Key order is the
	    # column order of the resulting DataFrame.
	    _BASE_ROW = {
	        "Country": "India",
	        "YearsCode": 5.0,
	        "WorkExp": 3.0,
	        "EdLevel": "Other",
	        "DevType": "Developer, back-end",
	        "Industry": "Software Development",
	        "Age": "25-34 years old",
	        "ICorPM": "Individual contributor",
	        "OrgSize": "20 to 99 employees",
	        "Employment": "Employed",
	    }

	    # Columns the tests treat as numeric; every other baseline column is
	    # treated as categorical.
	    _NUMERIC_COLS = ("YearsCode", "WorkExp")

	    @classmethod
	    def _frame(cls, **overrides):
	        """Return a one-row DataFrame of the baseline respondent.

	        Args:
	            **overrides: Column values replacing the baseline ones. An
	                overridden column keeps its baseline position.

	        Returns:
	            A fresh ``pd.DataFrame`` with one row; safe to mutate.
	        """
	        row = {**cls._BASE_ROW, **overrides}
	        return pd.DataFrame({column: [value] for column, value in row.items()})

	    def test_returns_dataframe_with_numeric_columns(self):
	        """Output contains YearsCode and WorkExp as numeric columns."""
	        result = prepare_features(self._frame())
	        for column in self._NUMERIC_COLS:
	            assert column in result.columns
	            # Presence alone does not show the column is numeric.
	            assert pd.api.types.is_numeric_dtype(result[column])

	    def test_fills_missing_numeric_with_zero(self):
	        """Missing numeric values are filled with 0."""
	        df = self._frame(YearsCode=np.nan, WorkExp=np.nan)
	        result = prepare_features(df)
	        assert result["YearsCode"].iloc[0] == 0.0
	        assert result["WorkExp"].iloc[0] == 0.0

	    def test_one_hot_encodes_categorical_columns(self):
	        """Categorical columns are one-hot encoded."""
	        # Two rows so that most categorical columns carry two distinct values.
	        df = pd.DataFrame(
	            {
	                "Country": ["India", "Germany"],
	                "YearsCode": [5.0, 10.0],
	                "WorkExp": [3.0, 8.0],
	                "EdLevel": ["Other", "Other"],
	                "DevType": ["Developer, back-end", "Developer, front-end"],
	                "Industry": ["Software Development", "Healthcare"],
	                "Age": ["25-34 years old", "35-44 years old"],
	                "ICorPM": ["Individual contributor", "People manager"],
	                "OrgSize": ["20 to 99 employees", "100 to 499 employees"],
	                "Employment": ["Employed", "Employed"],
	            }
	        )
	        result = prepare_features(df)
	        # Encoded columns are recognised here by an underscore in the name;
	        # the numeric passthrough columns are excluded explicitly.
	        numeric_cols = self._NUMERIC_COLS
	        categorical_cols = [
	            c for c in result.columns if "_" in c and c not in numeric_cols
	        ]
	        assert len(categorical_cols) > 0

	    def test_renames_legacy_years_code_pro_column(self):
	        """Legacy YearsCodePro column is renamed to YearsCode."""
	        # Swap the key in place so the legacy column keeps YearsCode's position.
	        legacy_row = {
	            ("YearsCodePro" if column == "YearsCode" else column): [value]
	            for column, value in self._BASE_ROW.items()
	        }
	        result = prepare_features(pd.DataFrame(legacy_row))
	        assert "YearsCode" in result.columns
	        assert "YearsCodePro" not in result.columns

	    def test_fills_missing_categorical_with_unknown(self):
	        """Missing categorical values are filled with 'Unknown'."""
	        missing = {
	            column: None
	            for column in self._BASE_ROW
	            if column not in self._NUMERIC_COLS
	        }
	        result = prepare_features(self._frame(**missing))
	        # Categoricals filled with "Unknown" should surface as encoded
	        # columns whose name contains "Unknown".
	        unknown_cols = [c for c in result.columns if "Unknown" in c]
	        assert len(unknown_cols) > 0

	    def test_different_inputs_produce_different_encodings(self):
	        """Different categorical values produce distinct one-hot encodings."""
	        enc_usa = prepare_features(self._frame(Country="United States of America"))
	        enc_deu = prepare_features(self._frame(Country="Germany"))

	        assert not enc_usa.equals(enc_deu), (
	            "USA and Germany inputs produced identical encodings — "
	            "categorical features are not being encoded"
	        )

	    def test_does_not_modify_original(self):
	        """prepare_features does not modify the input DataFrame."""
	        df = self._frame()
	        snapshot = df.copy(deep=True)
	        prepare_features(df)
	        # Compare the whole frame (values, dtypes, columns, index) rather
	        # than one cell, so any in-place change is caught.
	        pd.testing.assert_frame_equal(df, snapshot)