File size: 2,355 Bytes
55cdb7e
 
 
 
 
 
a32e584
 
 
 
 
 
 
 
55cdb7e
a32e584
 
 
 
 
 
 
 
55cdb7e
 
 
 
 
 
 
 
 
a32e584
55cdb7e
 
 
 
 
a32e584
55cdb7e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a32e584
55cdb7e
a32e584
 
 
 
 
 
 
 
55cdb7e
a32e584
55cdb7e
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
"""Diagnose why categorical features aren't affecting predictions."""

from src.preprocessing import prepare_features
import pandas as pd

# Build two single-row inputs that are identical except for Country.
# The fields both rows share are written once and unpacked into each frame.
_shared_fields = {
    "YearsCode": [5.0],
    "EdLevel": ["Bachelor's degree (B.A., B.S., B.Eng., etc.)"],
    "DevType": ["Developer, full-stack"],
}

# "Country" is listed first so the column order stays
# Country, YearsCode, EdLevel, DevType.
input1 = pd.DataFrame(
    {"Country": ["United States of America"], **_shared_fields}
)
input2 = pd.DataFrame(
    {"Country": ["Germany"], **_shared_fields}  # the only differing value
)

def _non_zero_columns(frame):
    """Return the column labels of *frame* whose first-row value is not 0."""
    return [label for label in frame.columns if frame[label].iloc[0] != 0]


banner = "=" * 70
print(banner)
print("ENCODING DIAGNOSIS")
print(banner)

# Run both raw inputs through the project's feature preparation.
features1 = prepare_features(input1)
features2 = prepare_features(input2)

# Report for the first input; the full column list is printed only here.
print("\nInput 1 (USA):")
print(f"  Shape: {features1.shape}")
print(f"  Columns: {list(features1.columns)}")
non_zero1 = _non_zero_columns(features1)
print(f"  Non-zero features ({len(non_zero1)}): {non_zero1}")

# Report for the second input.
print("\nInput 2 (Germany):")
print(f"  Shape: {features2.shape}")
non_zero2 = _non_zero_columns(features2)
print(f"  Non-zero features ({len(non_zero2)}): {non_zero2}")

# Compare the two prepared frames once and reuse the verdict below.
encodings_match = features1.equals(features2)
print(f"\nAre encoded features identical? {encodings_match}")

if not encodings_match:
    print("\n✅ Encodings are different - categorical features should work.")
else:
    print("\n❌ PROBLEM: Different countries produce IDENTICAL encodings!")
    print("   This explains why categorical features don't affect predictions.")

# Second pass: for a few countries, list which "Country_"-prefixed columns
# of the prepared features are non-zero in the first row.
separator = "=" * 70
print("\n" + separator)
print("COUNTRY ENCODING CHECK")
print(separator)

# Every probe row shares these values; only Country changes per iteration.
fixed_fields = {
    "YearsCode": [5.0],
    "EdLevel": ["Bachelor's degree (B.A., B.S., B.Eng., etc.)"],
    "DevType": ["Developer, full-stack"],
}

for country in ("United States of America", "Germany", "India"):
    probe = pd.DataFrame({"Country": [country], **fixed_fields})
    prepared = prepare_features(probe)

    # First narrow to the Country_* columns, then keep the non-zero ones.
    country_columns = [
        label for label in prepared.columns if label.startswith("Country_")
    ]
    active_columns = [
        label for label in country_columns if prepared[label].iloc[0] != 0
    ]
    print(f"{country:40s} -> {active_columns}")