"""Diagnose why categorical features aren't affecting predictions."""
from src.preprocessing import prepare_features
import pandas as pd
# Horizontal rule printed above and below each section title.
RULE = "=" * 70


def build_input(country: str) -> pd.DataFrame:
    """Return a one-row raw input frame for *country*.

    Every field except ``Country`` is held at the same fixed value, so two
    frames built by this function differ ONLY in Country. Any difference in
    their encodings can therefore be attributed to the country alone.
    """
    return pd.DataFrame(
        {
            "Country": [country],
            "YearsCode": [5.0],
            "EdLevel": ["Bachelor's degree (B.A., B.S., B.Eng., etc.)"],
            "DevType": ["Developer, full-stack"],
        }
    )


def non_zero_columns(frame: pd.DataFrame, columns=None) -> list:
    """Return the column labels whose first-row value in *frame* is not 0.

    Args:
        frame: Frame to inspect; only its first row is looked at.
        columns: Optional iterable of column labels to restrict the check
            to. When ``None`` every column of *frame* is checked.

    Returns:
        The matching labels, in the order they were examined.
    """
    candidates = frame.columns if columns is None else columns
    return [col for col in candidates if frame[col].iloc[0] != 0]


def main() -> None:
    """Print the encoding diagnosis, then the per-country encoding check."""
    print(RULE)
    print("ENCODING DIAGNOSIS")
    print(RULE)

    # Process two inputs that differ ONLY in Country.
    # NOTE(review): prepare_features is project code; it is used here as if it
    # returns a DataFrame of encoded features - confirm against its definition.
    features1 = prepare_features(build_input("United States of America"))
    features2 = prepare_features(build_input("Germany"))  # Different!

    print("\nInput 1 (USA):")
    print(f" Shape: {features1.shape}")
    print(f" Columns: {list(features1.columns)}")
    non_zero1 = non_zero_columns(features1)
    print(f" Non-zero features ({len(non_zero1)}): {non_zero1}")

    print("\nInput 2 (Germany):")
    print(f" Shape: {features2.shape}")
    non_zero2 = non_zero_columns(features2)
    print(f" Non-zero features ({len(non_zero2)}): {non_zero2}")

    # Compare once so the printed answer and the branch taken always agree.
    identical = features1.equals(features2)
    print(f"\nAre encoded features identical? {identical}")
    if identical:
        print("\n❌ PROBLEM: Different countries produce IDENTICAL encodings!")
        print(" This explains why categorical features don't affect predictions.")
    else:
        print("\n✅ Encodings are different - categorical features should work.")

    # Check what happens with Country specifically.
    print("\n" + RULE)
    print("COUNTRY ENCODING CHECK")
    print(RULE)
    for country in ("United States of America", "Germany", "India"):
        encoded = prepare_features(build_input(country))
        # Only columns named "Country_*" are inspected here (presumably the
        # one-hot columns produced for Country - verify in prepare_features).
        country_cols = [col for col in encoded.columns if col.startswith("Country_")]
        active = non_zero_columns(encoded, country_cols)
        print(f"{country:40s} -> {active}")


if __name__ == "__main__":
    main()
|