Spaces:

dima806
/

developer_salary_prediction

Sleeping

App Files Files Community

developer_salary_prediction / diagnose_encoding.py

dima806

Upload 32 files

a32e584 verified 30 days ago

raw

history blame contribute delete

2.36 kB

	"""Diagnose why categorical features aren't affecting predictions."""

	from src.preprocessing import prepare_features
	import pandas as pd

	# Create two inputs that differ ONLY in Country
	input1 = pd.DataFrame(
	{
	"Country": ["United States of America"],
	"YearsCode": [5.0],
	"EdLevel": ["Bachelor's degree (B.A., B.S., B.Eng., etc.)"],
	"DevType": ["Developer, full-stack"],
	}
	)

	input2 = pd.DataFrame(
	{
	"Country": ["Germany"], # Different!
	"YearsCode": [5.0],
	"EdLevel": ["Bachelor's degree (B.A., B.S., B.Eng., etc.)"],
	"DevType": ["Developer, full-stack"],
	}
	)

	print("=" * 70)
	print("ENCODING DIAGNOSIS")
	print("=" * 70)

	# Process features
	features1 = prepare_features(input1)
	features2 = prepare_features(input2)

	print("\nInput 1 (USA):")
	print(f" Shape: {features1.shape}")
	print(f" Columns: {list(features1.columns)}")
	non_zero1 = [col for col in features1.columns if features1[col].iloc[0] != 0]
	print(f" Non-zero features ({len(non_zero1)}): {non_zero1}")

	print("\nInput 2 (Germany):")
	print(f" Shape: {features2.shape}")
	non_zero2 = [col for col in features2.columns if features2[col].iloc[0] != 0]
	print(f" Non-zero features ({len(non_zero2)}): {non_zero2}")

	print(f"\nAre encoded features identical? {features1.equals(features2)}")

	if features1.equals(features2):
	print("\n❌ PROBLEM: Different countries produce IDENTICAL encodings!")
	print(" This explains why categorical features don't affect predictions.")
	else:
	print("\n✅ Encodings are different - categorical features should work.")

	# Check what happens with Country specifically
	print("\n" + "=" * 70)
	print("COUNTRY ENCODING CHECK")
	print("=" * 70)

	# Test just Country encoding
	test_countries = ["United States of America", "Germany", "India"]
	for country in test_countries:
	test_df = pd.DataFrame(
	{
	"Country": [country],
	"YearsCode": [5.0],
	"EdLevel": ["Bachelor's degree (B.A., B.S., B.Eng., etc.)"],
	"DevType": ["Developer, full-stack"],
	}
	)
	encoded = prepare_features(test_df)
	country_cols = [col for col in encoded.columns if col.startswith("Country_")]
	non_zero_countries = [col for col in country_cols if encoded[col].iloc[0] != 0]
	print(f"{country:40s} -> {non_zero_countries}")