Spaces:

dima806
/

developer_salary_prediction

Running

App Files Files Community

dima806 committed on Feb 13

Commit

07d23c4

verified ·

1 Parent(s): 2121037

Upload 22 files

Browse files

Files changed (10) hide show

Claude.md +5 -2
app.py +23 -0
config/valid_categories.yaml +8 -0
example_inference.py +16 -0
models/model.pkl +2 -2
src/infer.py +9 -0
src/preprocessing.py +6 -4
src/schema.py +8 -0
src/train.py +32 -3
test_feature_impact.py +124 -11

Claude.md CHANGED Viewed

@@ -77,7 +77,8 @@ input_data = SalaryInput(
     years_code=5.0,
     education_level="Bachelor's degree",
     dev_type="Developer, back-end",
-    industry="Software Development"
 )
 salary = predict_salary(input_data)
 ```
@@ -123,6 +124,7 @@ The dataset must include these columns:
 - `EdLevel` - Education level
 - `DevType` - Developer type
 - `Industry` - Industry the developer works in
 - `ConvertedCompYearly` - Annual salary (target variable)
 ### Model Expectations
@@ -157,7 +159,8 @@ test_input = SalaryInput(
     years_code=3.0,
     education_level="Bachelor's degree",
     dev_type="Developer, back-end",
-    industry="Software Development"
 )
 print(predict_salary(test_input))
 ```

     years_code=5.0,
     education_level="Bachelor's degree",
     dev_type="Developer, back-end",
+    industry="Software Development",
+    age="25-34 years old"
 )
 salary = predict_salary(input_data)
 ```
 - `EdLevel` - Education level
 - `DevType` - Developer type
 - `Industry` - Industry the developer works in
+- `Age` - Developer's age range
 - `ConvertedCompYearly` - Annual salary (target variable)
 ### Model Expectations
     years_code=3.0,
     education_level="Bachelor's degree",
     dev_type="Developer, back-end",
+    industry="Software Development",
+    age="25-34 years old"
 )
 print(predict_salary(test_input))
 ```

app.py CHANGED Viewed

@@ -27,9 +27,11 @@ with st.sidebar:
         Developer Survey data to predict annual salaries based on:
         - Country
         - Total years of coding experience (including education)
         - Education level
         - Developer type
         - Industry
         """
     )
     st.info("💡 Tip: Results are estimates based on survey averages.")
@@ -40,6 +42,7 @@ with st.sidebar:
     st.write(f"**Education Levels:** {len(valid_categories['EdLevel'])} available")
     st.write(f"**Developer Types:** {len(valid_categories['DevType'])} available")
     st.write(f"**Industries:** {len(valid_categories['Industry'])} available")
     st.caption("Only values from the training data are shown in the dropdowns.")
 # Main input form
@@ -52,12 +55,14 @@ valid_countries = valid_categories["Country"]
 valid_education_levels = valid_categories["EdLevel"]
 valid_dev_types = valid_categories["DevType"]
 valid_industries = valid_categories["Industry"]
 # Set default values (if available)
 default_country = "United States of America" if "United States of America" in valid_countries else valid_countries[0]
 default_education = "Bachelor's degree (B.A., B.S., B.Eng., etc.)" if "Bachelor's degree (B.A., B.S., B.Eng., etc.)" in valid_education_levels else valid_education_levels[0]
 default_dev_type = "Developer, back-end" if "Developer, back-end" in valid_dev_types else valid_dev_types[0]
 default_industry = "Software Development" if "Software Development" in valid_industries else valid_industries[0]
 with col1:
     country = st.selectbox(
@@ -76,6 +81,15 @@ with col1:
         help="Including any education, how many years have you been coding in total?",
     )
 with col2:
     education = st.selectbox(
         "Education Level",
@@ -98,6 +112,13 @@ industry = st.selectbox(
     help="Industry the developer works in (only industries from training data)",
 )
 # Prediction button
 if st.button("🔮 Predict Salary", type="primary", use_container_width=True):
     try:
@@ -105,9 +126,11 @@ if st.button("🔮 Predict Salary", type="primary", use_container_width=True):
         input_data = SalaryInput(
             country=country,
             years_code=years,
             education_level=education,
             dev_type=dev_type,
             industry=industry,
         )
         # Make prediction

         Developer Survey data to predict annual salaries based on:
         - Country
         - Total years of coding experience (including education)
+        - Years of professional work experience
         - Education level
         - Developer type
         - Industry
+        - Age
         """
     )
     st.info("💡 Tip: Results are estimates based on survey averages.")
     st.write(f"**Education Levels:** {len(valid_categories['EdLevel'])} available")
     st.write(f"**Developer Types:** {len(valid_categories['DevType'])} available")
     st.write(f"**Industries:** {len(valid_categories['Industry'])} available")
+    st.write(f"**Age Ranges:** {len(valid_categories['Age'])} available")
     st.caption("Only values from the training data are shown in the dropdowns.")
 # Main input form
 valid_education_levels = valid_categories["EdLevel"]
 valid_dev_types = valid_categories["DevType"]
 valid_industries = valid_categories["Industry"]
+valid_ages = valid_categories["Age"]
 # Set default values (if available)
 default_country = "United States of America" if "United States of America" in valid_countries else valid_countries[0]
 default_education = "Bachelor's degree (B.A., B.S., B.Eng., etc.)" if "Bachelor's degree (B.A., B.S., B.Eng., etc.)" in valid_education_levels else valid_education_levels[0]
 default_dev_type = "Developer, back-end" if "Developer, back-end" in valid_dev_types else valid_dev_types[0]
 default_industry = "Software Development" if "Software Development" in valid_industries else valid_industries[0]
+default_age = "25-34 years old" if "25-34 years old" in valid_ages else valid_ages[0]
 with col1:
     country = st.selectbox(
         help="Including any education, how many years have you been coding in total?",
     )
+    work_exp = st.number_input(
+        "Years of Professional Work Experience",
+        min_value=0,
+        max_value=50,
+        value=5,
+        step=1,
+        help="How many years of professional work experience do you have?",
+    )
 with col2:
     education = st.selectbox(
         "Education Level",
     help="Industry the developer works in (only industries from training data)",
 )
+age = st.selectbox(
+    "Age",
+    options=valid_ages,
+    index=valid_ages.index(default_age),
+    help="Developer's age range",
+)
 # Prediction button
 if st.button("🔮 Predict Salary", type="primary", use_container_width=True):
     try:
         input_data = SalaryInput(
             country=country,
             years_code=years,
+            work_exp=work_exp,
             education_level=education,
             dev_type=dev_type,
             industry=industry,
+            age=age,
         )
         # Make prediction

config/valid_categories.yaml CHANGED Viewed

@@ -69,3 +69,11 @@ Industry:
 - Retail and Consumer Services
 - Software Development
 - Transportation, or Supply Chain

 - Retail and Consumer Services
 - Software Development
 - Transportation, or Supply Chain
+Age:
+- 18-24 years old
+- 25-34 years old
+- 35-44 years old
+- 45-54 years old
+- 55-64 years old
+- 65 years or older
+- Other

example_inference.py CHANGED Viewed

@@ -18,16 +18,20 @@ def main():
     input_data_1 = SalaryInput(
         country="United States of America",
         years_code=5.0,
         education_level="Bachelor's degree (B.A., B.S., B.Eng., etc.)",
         dev_type="Developer, full-stack",
         industry="Software Development",
     )
     print(f"Country: {input_data_1.country}")
     print(f"Years of Coding (Total): {input_data_1.years_code}")
     print(f"Education Level: {input_data_1.education_level}")
     print(f"Developer Type: {input_data_1.dev_type}")
     print(f"Industry: {input_data_1.industry}")
     salary_1 = predict_salary(input_data_1)
     print(f"💰 Predicted Salary: ${salary_1:,.2f} USD/year")
@@ -39,16 +43,20 @@ def main():
     input_data_2 = SalaryInput(
         country="United States of America",
         years_code=2.0,
         education_level="Master's degree (M.A., M.S., M.Eng., MBA, etc.)",
         dev_type="Developer, front-end",
         industry="Fintech",
     )
     print(f"Country: {input_data_2.country}")
     print(f"Years of Coding (Total): {input_data_2.years_code}")
     print(f"Education Level: {input_data_2.education_level}")
     print(f"Developer Type: {input_data_2.dev_type}")
     print(f"Industry: {input_data_2.industry}")
     salary_2 = predict_salary(input_data_2)
     print(f"💰 Predicted Salary: ${salary_2:,.2f} USD/year")
@@ -60,16 +68,20 @@ def main():
     input_data_3 = SalaryInput(
         country="United States of America",
         years_code=10.0,
         education_level="Master's degree (M.A., M.S., M.Eng., MBA, etc.)",
         dev_type="Engineering manager",
         industry="Banking/Financial Services",
     )
     print(f"Country: {input_data_3.country}")
     print(f"Years of Coding (Total): {input_data_3.years_code}")
     print(f"Education Level: {input_data_3.education_level}")
     print(f"Developer Type: {input_data_3.dev_type}")
     print(f"Industry: {input_data_3.industry}")
     salary_3 = predict_salary(input_data_3)
     print(f"💰 Predicted Salary: ${salary_3:,.2f} USD/year")
@@ -81,16 +93,20 @@ def main():
     input_data_4 = SalaryInput(
         country="Germany",
         years_code=5.0,
         education_level="Bachelor's degree (B.A., B.S., B.Eng., etc.)",
         dev_type="Developer, back-end",
         industry="Manufacturing",
     )
     print(f"Country: {input_data_4.country}")
     print(f"Years of Coding (Total): {input_data_4.years_code}")
     print(f"Education Level: {input_data_4.education_level}")
     print(f"Developer Type: {input_data_4.dev_type}")
     print(f"Industry: {input_data_4.industry}")
     salary_4 = predict_salary(input_data_4)
     print(f"💰 Predicted Salary: ${salary_4:,.2f} USD/year")

     input_data_1 = SalaryInput(
         country="United States of America",
         years_code=5.0,
+        work_exp=3.0,
         education_level="Bachelor's degree (B.A., B.S., B.Eng., etc.)",
         dev_type="Developer, full-stack",
         industry="Software Development",
+        age="25-34 years old",
     )
     print(f"Country: {input_data_1.country}")
     print(f"Years of Coding (Total): {input_data_1.years_code}")
+    print(f"Work Experience: {input_data_1.work_exp}")
     print(f"Education Level: {input_data_1.education_level}")
     print(f"Developer Type: {input_data_1.dev_type}")
     print(f"Industry: {input_data_1.industry}")
+    print(f"Age: {input_data_1.age}")
     salary_1 = predict_salary(input_data_1)
     print(f"💰 Predicted Salary: ${salary_1:,.2f} USD/year")
     input_data_2 = SalaryInput(
         country="United States of America",
         years_code=2.0,
+        work_exp=1.0,
         education_level="Master's degree (M.A., M.S., M.Eng., MBA, etc.)",
         dev_type="Developer, front-end",
         industry="Fintech",
+        age="18-24 years old",
     )
     print(f"Country: {input_data_2.country}")
     print(f"Years of Coding (Total): {input_data_2.years_code}")
+    print(f"Work Experience: {input_data_2.work_exp}")
     print(f"Education Level: {input_data_2.education_level}")
     print(f"Developer Type: {input_data_2.dev_type}")
     print(f"Industry: {input_data_2.industry}")
+    print(f"Age: {input_data_2.age}")
     salary_2 = predict_salary(input_data_2)
     print(f"💰 Predicted Salary: ${salary_2:,.2f} USD/year")
     input_data_3 = SalaryInput(
         country="United States of America",
         years_code=10.0,
+        work_exp=8.0,
         education_level="Master's degree (M.A., M.S., M.Eng., MBA, etc.)",
         dev_type="Engineering manager",
         industry="Banking/Financial Services",
+        age="35-44 years old",
     )
     print(f"Country: {input_data_3.country}")
     print(f"Years of Coding (Total): {input_data_3.years_code}")
+    print(f"Work Experience: {input_data_3.work_exp}")
     print(f"Education Level: {input_data_3.education_level}")
     print(f"Developer Type: {input_data_3.dev_type}")
     print(f"Industry: {input_data_3.industry}")
+    print(f"Age: {input_data_3.age}")
     salary_3 = predict_salary(input_data_3)
     print(f"💰 Predicted Salary: ${salary_3:,.2f} USD/year")
     input_data_4 = SalaryInput(
         country="Germany",
         years_code=5.0,
+        work_exp=3.0,
         education_level="Bachelor's degree (B.A., B.S., B.Eng., etc.)",
         dev_type="Developer, back-end",
         industry="Manufacturing",
+        age="25-34 years old",
     )
     print(f"Country: {input_data_4.country}")
     print(f"Years of Coding (Total): {input_data_4.years_code}")
+    print(f"Work Experience: {input_data_4.work_exp}")
     print(f"Education Level: {input_data_4.education_level}")
     print(f"Developer Type: {input_data_4.dev_type}")
     print(f"Industry: {input_data_4.industry}")
+    print(f"Age: {input_data_4.age}")
     salary_4 = predict_salary(input_data_4)
     print(f"💰 Predicted Salary: ${salary_4:,.2f} USD/year")

models/model.pkl CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5165f22311d0eb6809380cf4fa5a749b59f0d8e81903462fe7c2c882e09e916f
-size 3192752

 version https://git-lfs.github.com/spec/v1
+oid sha256:efa053c467c2c1ea33a65f04aedc32fb7a6c47658238f26d88cd0a10986c0c98
+size 3985657

src/infer.py CHANGED Viewed

@@ -99,14 +99,23 @@ def predict_salary(data: SalaryInput) -> float:
             f"Check config/valid_categories.yaml for all valid values."
         )
     # Create a DataFrame with the input data
     input_df = pd.DataFrame(
         {
             "Country": [data.country],
             "YearsCode": [data.years_code],
             "EdLevel": [data.education_level],
             "DevType": [data.dev_type],
             "Industry": [data.industry],
         }
     )

             f"Check config/valid_categories.yaml for all valid values."
         )
+    if data.age not in valid_categories["Age"]:
+        raise ValueError(
+            f"Invalid age: '{data.age}'. "
+            f"Must be one of {len(valid_categories['Age'])} valid age ranges. "
+            f"Check config/valid_categories.yaml for all valid values."
+        )
     # Create a DataFrame with the input data
     input_df = pd.DataFrame(
         {
             "Country": [data.country],
             "YearsCode": [data.years_code],
+            "WorkExp": [data.work_exp],
             "EdLevel": [data.education_level],
             "DevType": [data.dev_type],
             "Industry": [data.industry],
+            "Age": [data.age],
         }
     )

src/preprocessing.py CHANGED Viewed

@@ -55,7 +55,7 @@ def prepare_features(df: pd.DataFrame) -> pd.DataFrame:
     during training and inference, preventing data leakage and inconsistencies.
     Args:
-        df: DataFrame with columns: Country, YearsCode, EdLevel, DevType, Industry
             NOTE: During training, cardinality reduction should be applied to df
             BEFORE calling this function. During inference, valid_categories.yaml
             ensures only valid (already-reduced) categories are used.
@@ -67,7 +67,7 @@ def prepare_features(df: pd.DataFrame) -> pd.DataFrame:
         - Fills missing values with defaults (0 for numeric, "Unknown" for categorical)
         - Normalizes Unicode apostrophes to regular apostrophes
         - Applies one-hot encoding with drop_first=True to avoid multicollinearity
-        - Column names in output will be like: YearsCode, Country_X, EdLevel_Y, DevType_Z, Industry_W
         - Does NOT apply cardinality reduction (must be done before calling this)
     """
     # Create a copy to avoid modifying the original
@@ -75,7 +75,7 @@ def prepare_features(df: pd.DataFrame) -> pd.DataFrame:
     # Normalize Unicode apostrophes to regular apostrophes for consistency
     # This handles cases where data has a curly apostrophe \u2019 (’) instead of a straight apostrophe (')
-    for col in ["Country", "EdLevel", "DevType", "Industry"]:
         if col in df_processed.columns:
             df_processed[col] = df_processed[col].str.replace('\u2019', "'", regex=False)
@@ -85,17 +85,19 @@ def prepare_features(df: pd.DataFrame) -> pd.DataFrame:
     # Fill missing values with defaults
     df_processed["YearsCode"] = df_processed["YearsCode"].fillna(0)
     df_processed["Country"] = df_processed["Country"].fillna("Unknown")
     df_processed["EdLevel"] = df_processed["EdLevel"].fillna("Unknown")
     df_processed["DevType"] = df_processed["DevType"].fillna("Unknown")
     df_processed["Industry"] = df_processed["Industry"].fillna("Unknown")
     # NOTE: Cardinality reduction is NOT applied here
     # It should be applied during training BEFORE calling this function
     # During inference, valid_categories.yaml ensures only valid values are used
     # Select only the features we need
-    feature_cols = ["Country", "YearsCode", "EdLevel", "DevType", "Industry"]
     df_features = df_processed[feature_cols]
     # Apply one-hot encoding for categorical variables

     during training and inference, preventing data leakage and inconsistencies.
     Args:
+        df: DataFrame with columns: Country, YearsCode, WorkExp, EdLevel, DevType, Industry, Age
             NOTE: During training, cardinality reduction should be applied to df
             BEFORE calling this function. During inference, valid_categories.yaml
             ensures only valid (already-reduced) categories are used.
         - Fills missing values with defaults (0 for numeric, "Unknown" for categorical)
         - Normalizes Unicode apostrophes to regular apostrophes
         - Applies one-hot encoding with drop_first=True to avoid multicollinearity
+        - Column names in output will be like: YearsCode, WorkExp, Country_X, EdLevel_Y, DevType_Z, Industry_W, Age_V
         - Does NOT apply cardinality reduction (must be done before calling this)
     """
     # Create a copy to avoid modifying the original
     # Normalize Unicode apostrophes to regular apostrophes for consistency
     # This handles cases where data has a curly apostrophe \u2019 (’) instead of a straight apostrophe (')
+    for col in ["Country", "EdLevel", "DevType", "Industry", "Age"]:
         if col in df_processed.columns:
             df_processed[col] = df_processed[col].str.replace('\u2019', "'", regex=False)
     # Fill missing values with defaults
     df_processed["YearsCode"] = df_processed["YearsCode"].fillna(0)
+    df_processed["WorkExp"] = df_processed["WorkExp"].fillna(0)
     df_processed["Country"] = df_processed["Country"].fillna("Unknown")
     df_processed["EdLevel"] = df_processed["EdLevel"].fillna("Unknown")
     df_processed["DevType"] = df_processed["DevType"].fillna("Unknown")
     df_processed["Industry"] = df_processed["Industry"].fillna("Unknown")
+    df_processed["Age"] = df_processed["Age"].fillna("Unknown")
     # NOTE: Cardinality reduction is NOT applied here
     # It should be applied during training BEFORE calling this function
     # During inference, valid_categories.yaml ensures only valid values are used
     # Select only the features we need
+    feature_cols = ["Country", "YearsCode", "WorkExp", "EdLevel", "DevType", "Industry", "Age"]
     df_features = df_processed[feature_cols]
     # Apply one-hot encoding for categorical variables

src/schema.py CHANGED Viewed

@@ -12,9 +12,15 @@ class SalaryInput(BaseModel):
         ge=0,
         description="Including any education, how many years have you been coding in total?",
     )
     education_level: str = Field(..., description="Education level")
     dev_type: str = Field(..., description="Developer type")
     industry: str = Field(..., description="Industry the developer works in")
     class Config:
         """Pydantic configuration."""
@@ -23,8 +29,10 @@ class SalaryInput(BaseModel):
             "example": {
                 "country": "United States",
                 "years_code": 5.0,
                 "education_level": "Bachelor's degree",
                 "dev_type": "Developer, back-end",
                 "industry": "Software Development",
             }
         }

         ge=0,
         description="Including any education, how many years have you been coding in total?",
     )
+    work_exp: float = Field(
+        ...,
+        ge=0,
+        description="How many years of professional work experience do you have?",
+    )
     education_level: str = Field(..., description="Education level")
     dev_type: str = Field(..., description="Developer type")
     industry: str = Field(..., description="Industry the developer works in")
+    age: str = Field(..., description="Developer's age range")
     class Config:
         """Pydantic configuration."""
             "example": {
                 "country": "United States",
                 "years_code": 5.0,
+                "work_exp": 3.0,
                 "education_level": "Bachelor's degree",
                 "dev_type": "Developer, back-end",
                 "industry": "Software Development",
+                "age": "25-34 years old",
             }
         }

src/train.py CHANGED Viewed

@@ -32,7 +32,7 @@ def main():
     # Load only required columns to save memory
     df = pd.read_csv(
         data_path,
-        usecols=["Country", "YearsCode", "EdLevel", "DevType", "Industry",
                  "Currency", "CompTotal", "ConvertedCompYearly"],
     )
@@ -67,12 +67,14 @@ def main():
     df_copy["EdLevel"] = df_copy["EdLevel"].str.replace('\u2019', "'", regex=False)
     df_copy["DevType"] = df_copy["DevType"].str.replace('\u2019', "'", regex=False)
     df_copy["Industry"] = df_copy["Industry"].str.replace('\u2019', "'", regex=False)
     # Apply cardinality reduction
     df_copy["Country"] = reduce_cardinality(df_copy["Country"])
     df_copy["EdLevel"] = reduce_cardinality(df_copy["EdLevel"])
     df_copy["DevType"] = reduce_cardinality(df_copy["DevType"])
     df_copy["Industry"] = reduce_cardinality(df_copy["Industry"])
     # Apply cardinality reduction to the actual training data as well
     # (prepare_features no longer does this internally)
@@ -80,6 +82,7 @@ def main():
     df["EdLevel"] = reduce_cardinality(df["EdLevel"])
     df["DevType"] = reduce_cardinality(df["DevType"])
     df["Industry"] = reduce_cardinality(df["Industry"])
     # Now apply full feature transformations for model training
     X = prepare_features(df)
@@ -91,19 +94,21 @@ def main():
     edlevel_values = df_copy["EdLevel"].dropna().unique().tolist()
     devtype_values = df_copy["DevType"].dropna().unique().tolist()
     industry_values = df_copy["Industry"].dropna().unique().tolist()
     valid_categories = {
         "Country": sorted(country_values),
         "EdLevel": sorted(edlevel_values),
         "DevType": sorted(devtype_values),
         "Industry": sorted(industry_values),
     }
     valid_categories_path = Path("config/valid_categories.yaml")
     with open(valid_categories_path, "w") as f:
         yaml.dump(valid_categories, f, default_flow_style=False, sort_keys=False)
-    print(f"\nSaved {len(valid_categories['Country'])} valid countries, {len(valid_categories['EdLevel'])} valid education levels, {len(valid_categories['DevType'])} valid developer types, and {len(valid_categories['Industry'])} valid industries to {valid_categories_path}")
     # Compute currency conversion rates per country
     # Use the original data with Currency and CompTotal columns
@@ -181,6 +186,12 @@ def main():
     for industry, count in top_industry.items():
         print(f"  - {industry}: {count:,} ({count/len(df)*100:.1f}%)")
     # Show YearsCode statistics
     print("\n💼 Years of Coding Experience:")
     print(f"  - Min: {df['YearsCode'].min():.1f}")
@@ -190,6 +201,15 @@ def main():
     print(f"  - 25th percentile: {df['YearsCode'].quantile(0.25):.1f}")
     print(f"  - 75th percentile: {df['YearsCode'].quantile(0.75):.1f}")
     # Show most common one-hot encoded features (by frequency)
     # Separate analysis for each categorical feature
@@ -231,12 +251,21 @@ def main():
         industry_name = feature.replace('Industry_', '')
         print(f"  {i:2d}. {industry_name:45s} - {count:6.0f} occurrences ({percentage:5.1f}%)")
     print(f"\n📊 Total one-hot encoded features: {len(X.columns)}")
-    print("   - Numeric: 1 (YearsCode)")
     print(f"   - Country: {len(country_features)}")
     print(f"   - Education: {len(edlevel_features)}")
     print(f"   - DevType: {len(devtype_features)}")
     print(f"   - Industry: {len(industry_features)}")
     print("=" * 60 + "\n")

     # Load only required columns to save memory
     df = pd.read_csv(
         data_path,
+        usecols=["Country", "YearsCode", "WorkExp", "EdLevel", "DevType", "Industry", "Age",
                  "Currency", "CompTotal", "ConvertedCompYearly"],
     )
     df_copy["EdLevel"] = df_copy["EdLevel"].str.replace('\u2019', "'", regex=False)
     df_copy["DevType"] = df_copy["DevType"].str.replace('\u2019', "'", regex=False)
     df_copy["Industry"] = df_copy["Industry"].str.replace('\u2019', "'", regex=False)
+    df_copy["Age"] = df_copy["Age"].str.replace('\u2019', "'", regex=False)
     # Apply cardinality reduction
     df_copy["Country"] = reduce_cardinality(df_copy["Country"])
     df_copy["EdLevel"] = reduce_cardinality(df_copy["EdLevel"])
     df_copy["DevType"] = reduce_cardinality(df_copy["DevType"])
     df_copy["Industry"] = reduce_cardinality(df_copy["Industry"])
+    df_copy["Age"] = reduce_cardinality(df_copy["Age"])
     # Apply cardinality reduction to the actual training data as well
     # (prepare_features no longer does this internally)
     df["EdLevel"] = reduce_cardinality(df["EdLevel"])
     df["DevType"] = reduce_cardinality(df["DevType"])
     df["Industry"] = reduce_cardinality(df["Industry"])
+    df["Age"] = reduce_cardinality(df["Age"])
     # Now apply full feature transformations for model training
     X = prepare_features(df)
     edlevel_values = df_copy["EdLevel"].dropna().unique().tolist()
     devtype_values = df_copy["DevType"].dropna().unique().tolist()
     industry_values = df_copy["Industry"].dropna().unique().tolist()
+    age_values = df_copy["Age"].dropna().unique().tolist()
     valid_categories = {
         "Country": sorted(country_values),
         "EdLevel": sorted(edlevel_values),
         "DevType": sorted(devtype_values),
         "Industry": sorted(industry_values),
+        "Age": sorted(age_values),
     }
     valid_categories_path = Path("config/valid_categories.yaml")
     with open(valid_categories_path, "w") as f:
         yaml.dump(valid_categories, f, default_flow_style=False, sort_keys=False)
+    print(f"\nSaved {len(valid_categories['Country'])} valid countries, {len(valid_categories['EdLevel'])} valid education levels, {len(valid_categories['DevType'])} valid developer types, {len(valid_categories['Industry'])} valid industries, and {len(valid_categories['Age'])} valid age ranges to {valid_categories_path}")
     # Compute currency conversion rates per country
     # Use the original data with Currency and CompTotal columns
     for industry, count in top_industry.items():
         print(f"  - {industry}: {count:,} ({count/len(df)*100:.1f}%)")
+    # Show age distribution
+    print("\n🎂 Age Distribution:")
+    top_age = df["Age"].value_counts().head(10)
+    for age, count in top_age.items():
+        print(f"  - {age}: {count:,} ({count/len(df)*100:.1f}%)")
     # Show YearsCode statistics
     print("\n💼 Years of Coding Experience:")
     print(f"  - Min: {df['YearsCode'].min():.1f}")
     print(f"  - 25th percentile: {df['YearsCode'].quantile(0.25):.1f}")
     print(f"  - 75th percentile: {df['YearsCode'].quantile(0.75):.1f}")
+    # Show WorkExp statistics
+    print("\n💼 Years of Professional Work Experience:")
+    print(f"  - Min: {df['WorkExp'].min():.1f}")
+    print(f"  - Max: {df['WorkExp'].max():.1f}")
+    print(f"  - Mean: {df['WorkExp'].mean():.1f}")
+    print(f"  - Median: {df['WorkExp'].median():.1f}")
+    print(f"  - 25th percentile: {df['WorkExp'].quantile(0.25):.1f}")
+    print(f"  - 75th percentile: {df['WorkExp'].quantile(0.75):.1f}")
     # Show most common one-hot encoded features (by frequency)
     # Separate analysis for each categorical feature
         industry_name = feature.replace('Industry_', '')
         print(f"  {i:2d}. {industry_name:45s} - {count:6.0f} occurrences ({percentage:5.1f}%)")
+    # Age features
+    print("\n🎂 Top 10 Age Features (most common):")
+    age_features = categorical_features[categorical_features.index.str.startswith('Age_')]
+    for i, (feature, count) in enumerate(age_features.head(10).items(), 1):
+        percentage = (count / len(X)) * 100
+        age_name = feature.replace('Age_', '')
+        print(f"  {i:2d}. {age_name:45s} - {count:6.0f} occurrences ({percentage:5.1f}%)")
     print(f"\n📊 Total one-hot encoded features: {len(X.columns)}")
+    print("   - Numeric: 2 (YearsCode, WorkExp)")
     print(f"   - Country: {len(country_features)}")
     print(f"   - Education: {len(edlevel_features)}")
     print(f"   - DevType: {len(devtype_features)}")
     print(f"   - Industry: {len(industry_features)}")
+    print(f"   - Age: {len(age_features)}")
     print("=" * 60 + "\n")

test_feature_impact.py CHANGED Viewed

@@ -12,9 +12,11 @@ def test_years_experience_impact():
     base_input = {
         "country": "United States of America",
         "education_level": "Bachelor's degree (B.A., B.S., B.Eng., etc.)",
         "dev_type": "Developer, full-stack",
         "industry": "Software Development",
     }
     # Test with different years of experience
@@ -45,9 +47,11 @@ def test_country_impact():
     base_input = {
         "years_code": 5.0,
         "education_level": "Bachelor's degree (B.A., B.S., B.Eng., etc.)",
         "dev_type": "Developer, full-stack",
         "industry": "Software Development",
     }
     # Test with different countries (select diverse ones)
@@ -93,8 +97,10 @@ def test_education_impact():
     base_input = {
         "country": "United States of America",
         "years_code": 5.0,
         "dev_type": "Developer, full-stack",
         "industry": "Software Development",
     }
     # Test with different education levels
@@ -141,8 +147,10 @@ def test_devtype_impact():
     base_input = {
         "country": "United States of America",
         "years_code": 5.0,
         "education_level": "Bachelor's degree (B.A., B.S., B.Eng., etc.)",
         "industry": "Software Development",
     }
     # Test with different developer types (using actual values from trained model)
@@ -189,8 +197,10 @@ def test_industry_impact():
     base_input = {
         "country": "United States of America",
         "years_code": 5.0,
         "education_level": "Bachelor's degree (B.A., B.S., B.Eng., etc.)",
         "dev_type": "Developer, full-stack",
     }
     # Test with different industries (using actual values from trained model)
@@ -228,40 +238,132 @@ def test_industry_impact():
         return False
 def test_combined_features():
     """Test that combining different features produces expected variations."""
     print("\n" + "=" * 70)
-    print("TEST 6: Combined Feature Variations")
     print("=" * 70)
     # Create diverse combinations (using actual values from trained model)
     test_cases = [
-        ("India", 2, "Bachelor's degree (B.A., B.S., B.Eng., etc.)", "Developer, back-end", "Software Development"),
-        ("Germany", 5, "Master's degree (M.A., M.S., M.Eng., MBA, etc.)", "Developer, full-stack", "Manufacturing"),
-        ("United States of America", 10, "Master's degree (M.A., M.S., M.Eng., MBA, etc.)", "Engineering manager", "Fintech"),
-        ("Poland", 15, "Bachelor's degree (B.A., B.S., B.Eng., etc.)", "Developer, front-end", "Healthcare"),
-        ("Brazil", 5, "Some college/university study without earning a degree", "DevOps engineer or professional", "Government"),
     ]
     predictions = []
-    for country, years, education, devtype, industry in test_cases:
         # Skip if not in valid categories
         if (country not in valid_categories["Country"]
                 or education not in valid_categories["EdLevel"]
                 or devtype not in valid_categories["DevType"]
-                or industry not in valid_categories["Industry"]):
             continue
         input_data = SalaryInput(
             country=country,
             years_code=years,
             education_level=education,
             dev_type=devtype,
             industry=industry,
         )
         salary = predict_salary(input_data)
         predictions.append(salary)
-        print(f"  {country[:15]:15s} | {years:2d}y | {education[:25]:25s} | {devtype[:25]:25s} | {industry[:20]:20s} -> ${salary:,.2f}")
     # Check if predictions are different
     unique_predictions = len(set(predictions))
@@ -289,13 +391,15 @@ def print_feature_analysis():
     edlevel_features = [f for f in feature_columns if f.startswith('EdLevel_')]
     devtype_features = [f for f in feature_columns if f.startswith('DevType_')]
     industry_features = [f for f in feature_columns if f.startswith('Industry_')]
-    numeric_features = [f for f in feature_columns if not f.startswith(('Country_', 'EdLevel_', 'DevType_', 'Industry_'))]
     print(f"  - Numeric features: {len(numeric_features)} -> {numeric_features}")
     print(f"  - Country features: {len(country_features)}")
     print(f"  - Education features: {len(edlevel_features)}")
     print(f"  - DevType features: {len(devtype_features)}")
     print(f"  - Industry features: {len(industry_features)}")
     if len(country_features) > 0:
         print(f"\nSample country features:")
@@ -317,6 +421,11 @@ def print_feature_analysis():
         for feat in industry_features[:5]:
             print(f"    - {feat}")
     # Check if there are any features at all
     if len(country_features) == 0:
         print("\n⚠️  WARNING: No country features found!")
@@ -326,6 +435,8 @@ def print_feature_analysis():
         print("\n⚠️  WARNING: No developer type features found!")
     if len(industry_features) == 0:
         print("\n⚠️  WARNING: No industry features found!")
 def main():
@@ -340,11 +451,13 @@ def main():
     # Run all tests
     results = {
-        "Years of Experience": test_years_experience_impact(),
         "Country": test_country_impact(),
         "Education Level": test_education_impact(),
         "Developer Type": test_devtype_impact(),
         "Industry": test_industry_impact(),
         "Combined Features": test_combined_features(),
     }

     base_input = {
         "country": "United States of America",
+        "work_exp": 3.0,
         "education_level": "Bachelor's degree (B.A., B.S., B.Eng., etc.)",
         "dev_type": "Developer, full-stack",
         "industry": "Software Development",
+        "age": "25-34 years old",
     }
     # Test with different years of experience
     base_input = {
         "years_code": 5.0,
+        "work_exp": 3.0,
         "education_level": "Bachelor's degree (B.A., B.S., B.Eng., etc.)",
         "dev_type": "Developer, full-stack",
         "industry": "Software Development",
+        "age": "25-34 years old",
     }
     # Test with different countries (select diverse ones)
     base_input = {
         "country": "United States of America",
         "years_code": 5.0,
+        "work_exp": 3.0,
         "dev_type": "Developer, full-stack",
         "industry": "Software Development",
+        "age": "25-34 years old",
     }
     # Test with different education levels
     base_input = {
         "country": "United States of America",
         "years_code": 5.0,
+        "work_exp": 3.0,
         "education_level": "Bachelor's degree (B.A., B.S., B.Eng., etc.)",
         "industry": "Software Development",
+        "age": "25-34 years old",
     }
     # Test with different developer types (using actual values from trained model)
     base_input = {
         "country": "United States of America",
         "years_code": 5.0,
+        "work_exp": 3.0,
         "education_level": "Bachelor's degree (B.A., B.S., B.Eng., etc.)",
         "dev_type": "Developer, full-stack",
+        "age": "25-34 years old",
     }
     # Test with different industries (using actual values from trained model)
         return False
+def test_age_impact():
+    """Check that varying only the age bracket changes the predicted salary.
+    All other inputs are held fixed. Candidate age brackets that are not
+    present in ``valid_categories["Age"]`` are dropped before predicting.
+    Returns:
+        bool: True only when at least two age brackets were evaluated and
+        every prediction is distinct; False otherwise.
+    """
+    print("\n" + "=" * 70)
+    print("TEST 6: Age Impact")
+    print("=" * 70)
+    base_input = {
+        "country": "United States of America",
+        "years_code": 5.0,
+        "work_exp": 3.0,
+        "education_level": "Bachelor's degree (B.A., B.S., B.Eng., etc.)",
+        "dev_type": "Developer, full-stack",
+        "industry": "Software Development",
+    }
+    # Test with different age ranges (using actual values from trained model)
+    test_ages = [
+        "18-24 years old",
+        "25-34 years old",
+        "35-44 years old",
+        "45-54 years old",
+        "55-64 years old",
+    ]
+    # Filter to only ages that exist in valid categories
+    test_ages = [a for a in test_ages if a in valid_categories["Age"]]
+    predictions = []
+    for age in test_ages:
+        input_data = SalaryInput(**base_input, age=age)
+        salary = predict_salary(input_data)
+        predictions.append(salary)
+        print(f"  Age: {age[:50]:50s} -> Salary: ${salary:,.2f}")
+    # With fewer than two predictions the uniqueness check below would pass
+    # vacuously (0 == 0 or 1 == 1) without comparing anything, so report the
+    # run as a failure instead of a misleading PASS.
+    if len(predictions) < 2:
+        print(f"\n❌ FAIL: Only {len(predictions)} valid age categories available")
+        print("   Need at least 2 age categories to compare predictions")
+        return False
+    # Check if predictions are different
+    unique_predictions = len(set(predictions))
+    if unique_predictions == len(predictions):
+        print(f"\n✅ PASS: All {len(predictions)} predictions are different")
+        return True
+    elif unique_predictions == 1:
+        print(f"\n❌ FAIL: All predictions are IDENTICAL (${predictions[0]:,.2f})")
+        print("   This indicates the model is NOT using age as a feature!")
+        return False
+    else:
+        print(f"\n⚠️  PARTIAL: Only {unique_predictions}/{len(predictions)} unique predictions")
+        print("   Duplicate salaries found - possible feature issue")
+        return False
+def test_work_exp_impact():
+    """Verify that the predicted salary responds to years of work experience.
+    Every other input stays fixed while ``work_exp`` is swept over several
+    values, and one prediction is printed per value.
+    Returns:
+        bool: True if every prediction is distinct, False otherwise.
+    """
+    rule = "=" * 70
+    print("\n" + rule)
+    print("TEST 7: Work Experience Impact")
+    print(rule)
+    # Everything except work_exp is held constant across the sweep
+    fixed_inputs = dict(
+        country="United States of America",
+        years_code=10.0,
+        education_level="Bachelor's degree (B.A., B.S., B.Eng., etc.)",
+        dev_type="Developer, full-stack",
+        industry="Software Development",
+        age="25-34 years old",
+    )
+    salaries = []
+    for years_worked in (0, 1, 3, 5, 10, 20):
+        salary = predict_salary(SalaryInput(**fixed_inputs, work_exp=years_worked))
+        salaries.append(salary)
+        print(f"  Work Exp: {years_worked:2d} -> Salary: ${salary:,.2f}")
+    # Compare the number of distinct predictions with the number of runs
+    total = len(salaries)
+    distinct = len(set(salaries))
+    if distinct == total:
+        print(f"\n✅ PASS: All {total} predictions are different")
+        return True
+    if distinct == 1:
+        print(f"\n❌ FAIL: All predictions are IDENTICAL (${salaries[0]:,.2f})")
+        print("   This indicates the model is NOT using work experience as a feature!")
+        return False
+    print(f"\n⚠️  PARTIAL: Only {distinct}/{total} unique predictions")
+    print(f"   Duplicate salaries found - possible feature issue")
+    return False
 def test_combined_features():
     """Test that combining different features produces expected variations."""
     print("\n" + "=" * 70)
+    print("TEST 8: Combined Feature Variations")
     print("=" * 70)
     # Create diverse combinations (using actual values from trained model)
     test_cases = [
+        ("India", 2, 1, "Bachelor's degree (B.A., B.S., B.Eng., etc.)", "Developer, back-end", "Software Development", "18-24 years old"),
+        ("Germany", 5, 3, "Master's degree (M.A., M.S., M.Eng., MBA, etc.)", "Developer, full-stack", "Manufacturing", "25-34 years old"),
+        ("United States of America", 10, 8, "Master's degree (M.A., M.S., M.Eng., MBA, etc.)", "Engineering manager", "Fintech", "35-44 years old"),
+        ("Poland", 15, 12, "Bachelor's degree (B.A., B.S., B.Eng., etc.)", "Developer, front-end", "Healthcare", "45-54 years old"),
+        ("Brazil", 5, 3, "Some college/university study without earning a degree", "DevOps engineer or professional", "Government", "25-34 years old"),
     ]
     predictions = []
+    for country, years, work_exp, education, devtype, industry, age in test_cases:
         # Skip if not in valid categories
         if (country not in valid_categories["Country"]
                 or education not in valid_categories["EdLevel"]
                 or devtype not in valid_categories["DevType"]
+                or industry not in valid_categories["Industry"]
+                or age not in valid_categories["Age"]):
             continue
         input_data = SalaryInput(
             country=country,
             years_code=years,
+            work_exp=work_exp,
             education_level=education,
             dev_type=devtype,
             industry=industry,
+            age=age,
         )
         salary = predict_salary(input_data)
         predictions.append(salary)
+        print(f"  {country[:15]:15s} | {years:2d}y | {work_exp:2d}w | {education[:25]:25s} | {devtype[:25]:25s} | {industry[:20]:20s} | {age[:15]:15s} -> ${salary:,.2f}")
     # Check if predictions are different
     unique_predictions = len(set(predictions))
     edlevel_features = [f for f in feature_columns if f.startswith('EdLevel_')]
     devtype_features = [f for f in feature_columns if f.startswith('DevType_')]
     industry_features = [f for f in feature_columns if f.startswith('Industry_')]
+    age_features = [f for f in feature_columns if f.startswith('Age_')]
+    numeric_features = [f for f in feature_columns if not f.startswith(('Country_', 'EdLevel_', 'DevType_', 'Industry_', 'Age_'))]
     print(f"  - Numeric features: {len(numeric_features)} -> {numeric_features}")
     print(f"  - Country features: {len(country_features)}")
     print(f"  - Education features: {len(edlevel_features)}")
     print(f"  - DevType features: {len(devtype_features)}")
     print(f"  - Industry features: {len(industry_features)}")
+    print(f"  - Age features: {len(age_features)}")
     if len(country_features) > 0:
         print(f"\nSample country features:")
         for feat in industry_features[:5]:
             print(f"    - {feat}")
+    if len(age_features) > 0:
+        print(f"\nSample age features:")
+        for feat in age_features[:5]:
+            print(f"    - {feat}")
     # Check if there are any features at all
     if len(country_features) == 0:
         print("\n⚠️  WARNING: No country features found!")
         print("\n⚠️  WARNING: No developer type features found!")
     if len(industry_features) == 0:
         print("\n⚠️  WARNING: No industry features found!")
+    if len(age_features) == 0:
+        print("\n⚠️  WARNING: No age features found!")
 def main():
     # Run all tests
     results = {
+        "Years of Coding": test_years_experience_impact(),
         "Country": test_country_impact(),
         "Education Level": test_education_impact(),
         "Developer Type": test_devtype_impact(),
         "Industry": test_industry_impact(),
+        "Age": test_age_impact(),
+        "Work Experience": test_work_exp_impact(),
         "Combined Features": test_combined_features(),
     }