developer_salary_prediction / diagnose_encoding.py
dima806's picture
Upload 32 files
a32e584 verified
"""Diagnose why categorical features aren't affecting predictions."""
from src.preprocessing import prepare_features
import pandas as pd
# Build two single-row inputs from one shared template. Every column is
# the same in both frames except "Country", so any difference seen after
# encoding can be attributed to the country value alone.
input1, input2 = (
    pd.DataFrame(
        {
            "Country": [country_name],  # the only field that varies
            "YearsCode": [5.0],
            "EdLevel": ["Bachelor's degree (B.A., B.S., B.Eng., etc.)"],
            "DevType": ["Developer, full-stack"],
        }
    )
    for country_name in ("United States of America", "Germany")
)
# Encode both inputs and compare the results: if two rows that differ only
# in "Country" come out identical after preprocessing, the country value
# is not reaching the encoded feature matrix.
print("=" * 70, "ENCODING DIAGNOSIS", "=" * 70, sep="\n")

# Run each single-row frame through the project's preprocessing step.
features1 = prepare_features(input1)
features2 = prepare_features(input2)

print(
    "\nInput 1 (USA):",
    f" Shape: {features1.shape}",
    f" Columns: {list(features1.columns)}",
    sep="\n",
)
# A feature is listed when its value in the first row is not equal to 0.
non_zero1 = [name for name in features1.columns if features1[name].iloc[0] != 0]
print(f" Non-zero features ({len(non_zero1)}): {non_zero1}")

print("\nInput 2 (Germany):", f" Shape: {features2.shape}", sep="\n")
non_zero2 = [name for name in features2.columns if features2[name].iloc[0] != 0]
print(f" Non-zero features ({len(non_zero2)}): {non_zero2}")

# Final verdict: the two encoded frames should NOT be equal.
print(f"\nAre encoded features identical? {features1.equals(features2)}")
if not features1.equals(features2):
    print("\n✅ Encodings are different - categorical features should work.")
else:
    print("\n❌ PROBLEM: Different countries produce IDENTICAL encodings!")
    print(" This explains why categorical features don't affect predictions.")
# Second pass: encode several countries one at a time and report which
# columns prefixed "Country_" are non-zero for each of them (presumably the
# per-country indicator columns produced by prepare_features -- confirm
# against src.preprocessing).
print("\n" + "=" * 70, "COUNTRY ENCODING CHECK", "=" * 70, sep="\n")

test_countries = ["United States of America", "Germany", "India"]
for country in test_countries:
    # Fixed developer profile; only the country changes between iterations.
    single_row = {
        "Country": [country],
        "YearsCode": [5.0],
        "EdLevel": ["Bachelor's degree (B.A., B.S., B.Eng., etc.)"],
        "DevType": ["Developer, full-stack"],
    }
    encoded = prepare_features(pd.DataFrame(single_row))

    # Keep only the "Country_" columns whose first-row value is not 0.
    non_zero_countries = [
        col
        for col in encoded.columns
        if col.startswith("Country_") and encoded[col].iloc[0] != 0
    ]
    print(f"{country:40s} -> {non_zero_countries}")