Spaces:
Sleeping
Sleeping
"""Diagnose why categorical features aren't affecting predictions.

Feeds single-row inputs that differ only in ``Country`` through
``prepare_features`` and reports whether the resulting feature frames differ.
Identical frames would mean the country value is lost during preprocessing.
"""

import pandas as pd

from src.preprocessing import prepare_features

# Width-70 rule used to frame each section header in the report.
_SEPARATOR = "=" * 70

# Countries probed individually in the second section of the report.
_TEST_COUNTRIES = ("United States of America", "Germany", "India")


def _make_input(country):
    """Return a one-row raw input frame in which only ``Country`` varies.

    All other fields are held constant so that any difference in the
    processed output can be attributed to the country value alone.
    """
    return pd.DataFrame(
        {
            "Country": [country],
            "YearsCode": [5.0],
            "EdLevel": ["Bachelor's degree (B.A., B.S., B.Eng., etc.)"],
            "DevType": ["Developer, full-stack"],
        }
    )


def _non_zero_columns(features, columns=None):
    """Return the names of columns whose first-row value is not zero.

    Args:
        features: Frame returned by ``prepare_features``; only row 0 is read.
        columns: Optional subset of column names to inspect. Defaults to
            every column of ``features``.

    Returns:
        A list of column names, in the order they were inspected.
    """
    if columns is None:
        columns = features.columns
    return [col for col in columns if features[col].iloc[0] != 0]


def main():
    """Print the encoding diagnosis report to stdout."""
    # Two inputs that differ ONLY in Country.
    input1 = _make_input("United States of America")
    input2 = _make_input("Germany")

    print(_SEPARATOR)
    print("ENCODING DIAGNOSIS")
    print(_SEPARATOR)

    # Process features.
    features1 = prepare_features(input1)
    features2 = prepare_features(input2)

    print("\nInput 1 (USA):")
    print(f" Shape: {features1.shape}")
    print(f" Columns: {list(features1.columns)}")
    non_zero1 = _non_zero_columns(features1)
    print(f" Non-zero features ({len(non_zero1)}): {non_zero1}")

    print("\nInput 2 (Germany):")
    print(f" Shape: {features2.shape}")
    non_zero2 = _non_zero_columns(features2)
    print(f" Non-zero features ({len(non_zero2)}): {non_zero2}")

    # Compare once and reuse the result for both the report line and the verdict.
    identical = features1.equals(features2)
    print(f"\nAre encoded features identical? {identical}")
    if identical:
        print("\n❌ PROBLEM: Different countries produce IDENTICAL encodings!")
        print(" This explains why categorical features don't affect predictions.")
    else:
        print("\n✅ Encodings are different - categorical features should work.")

    # Check what happens with Country specifically.
    print("\n" + _SEPARATOR)
    print("COUNTRY ENCODING CHECK")
    print(_SEPARATOR)

    for country in _TEST_COUNTRIES:
        encoded = prepare_features(_make_input(country))
        # NOTE(review): presumably prepare_features emits the country encoding
        # as columns prefixed "Country_" - confirm against src.preprocessing.
        country_cols = [col for col in encoded.columns if col.startswith("Country_")]
        non_zero_countries = _non_zero_columns(encoded, country_cols)
        print(f"{country:40s} -> {non_zero_countries}")


if __name__ == "__main__":
    main()