dima806 committed on
Commit
07d23c4
·
verified ·
1 Parent(s): 2121037

Upload 22 files

Browse files
Claude.md CHANGED
@@ -77,7 +77,8 @@ input_data = SalaryInput(
77
  years_code=5.0,
78
  education_level="Bachelor's degree",
79
  dev_type="Developer, back-end",
80
- industry="Software Development"
 
81
  )
82
  salary = predict_salary(input_data)
83
  ```
@@ -123,6 +124,7 @@ The dataset must include these columns:
123
  - `EdLevel` - Education level
124
  - `DevType` - Developer type
125
  - `Industry` - Industry the developer works in
 
126
  - `ConvertedCompYearly` - Annual salary (target variable)
127
 
128
  ### Model Expectations
@@ -157,7 +159,8 @@ test_input = SalaryInput(
157
  years_code=3.0,
158
  education_level="Bachelor's degree",
159
  dev_type="Developer, back-end",
160
- industry="Software Development"
 
161
  )
162
  print(predict_salary(test_input))
163
  ```
 
77
  years_code=5.0,
78
  education_level="Bachelor's degree",
79
  dev_type="Developer, back-end",
80
+ industry="Software Development",
81
+ age="25-34 years old"
82
  )
83
  salary = predict_salary(input_data)
84
  ```
 
124
  - `EdLevel` - Education level
125
  - `DevType` - Developer type
126
  - `Industry` - Industry the developer works in
127
+ - `Age` - Developer's age range
128
  - `ConvertedCompYearly` - Annual salary (target variable)
129
 
130
  ### Model Expectations
 
159
  years_code=3.0,
160
  education_level="Bachelor's degree",
161
  dev_type="Developer, back-end",
162
+ industry="Software Development",
163
+ age="25-34 years old"
164
  )
165
  print(predict_salary(test_input))
166
  ```
app.py CHANGED
@@ -27,9 +27,11 @@ with st.sidebar:
27
  Developer Survey data to predict annual salaries based on:
28
  - Country
29
  - Total years of coding experience (including education)
 
30
  - Education level
31
  - Developer type
32
  - Industry
 
33
  """
34
  )
35
  st.info("💡 Tip: Results are estimates based on survey averages.")
@@ -40,6 +42,7 @@ with st.sidebar:
40
  st.write(f"**Education Levels:** {len(valid_categories['EdLevel'])} available")
41
  st.write(f"**Developer Types:** {len(valid_categories['DevType'])} available")
42
  st.write(f"**Industries:** {len(valid_categories['Industry'])} available")
 
43
  st.caption("Only values from the training data are shown in the dropdowns.")
44
 
45
  # Main input form
@@ -52,12 +55,14 @@ valid_countries = valid_categories["Country"]
52
  valid_education_levels = valid_categories["EdLevel"]
53
  valid_dev_types = valid_categories["DevType"]
54
  valid_industries = valid_categories["Industry"]
 
55
 
56
  # Set default values (if available)
57
  default_country = "United States of America" if "United States of America" in valid_countries else valid_countries[0]
58
  default_education = "Bachelor's degree (B.A., B.S., B.Eng., etc.)" if "Bachelor's degree (B.A., B.S., B.Eng., etc.)" in valid_education_levels else valid_education_levels[0]
59
  default_dev_type = "Developer, back-end" if "Developer, back-end" in valid_dev_types else valid_dev_types[0]
60
  default_industry = "Software Development" if "Software Development" in valid_industries else valid_industries[0]
 
61
 
62
  with col1:
63
  country = st.selectbox(
@@ -76,6 +81,15 @@ with col1:
76
  help="Including any education, how many years have you been coding in total?",
77
  )
78
 
 
 
 
 
 
 
 
 
 
79
  with col2:
80
  education = st.selectbox(
81
  "Education Level",
@@ -98,6 +112,13 @@ industry = st.selectbox(
98
  help="Industry the developer works in (only industries from training data)",
99
  )
100
 
 
 
 
 
 
 
 
101
  # Prediction button
102
  if st.button("🔮 Predict Salary", type="primary", use_container_width=True):
103
  try:
@@ -105,9 +126,11 @@ if st.button("🔮 Predict Salary", type="primary", use_container_width=True):
105
  input_data = SalaryInput(
106
  country=country,
107
  years_code=years,
 
108
  education_level=education,
109
  dev_type=dev_type,
110
  industry=industry,
 
111
  )
112
 
113
  # Make prediction
 
27
  Developer Survey data to predict annual salaries based on:
28
  - Country
29
  - Total years of coding experience (including education)
30
+ - Years of professional work experience
31
  - Education level
32
  - Developer type
33
  - Industry
34
+ - Age
35
  """
36
  )
37
  st.info("💡 Tip: Results are estimates based on survey averages.")
 
42
  st.write(f"**Education Levels:** {len(valid_categories['EdLevel'])} available")
43
  st.write(f"**Developer Types:** {len(valid_categories['DevType'])} available")
44
  st.write(f"**Industries:** {len(valid_categories['Industry'])} available")
45
+ st.write(f"**Age Ranges:** {len(valid_categories['Age'])} available")
46
  st.caption("Only values from the training data are shown in the dropdowns.")
47
 
48
  # Main input form
 
55
  valid_education_levels = valid_categories["EdLevel"]
56
  valid_dev_types = valid_categories["DevType"]
57
  valid_industries = valid_categories["Industry"]
58
+ valid_ages = valid_categories["Age"]
59
 
60
  # Set default values (if available)
61
  default_country = "United States of America" if "United States of America" in valid_countries else valid_countries[0]
62
  default_education = "Bachelor's degree (B.A., B.S., B.Eng., etc.)" if "Bachelor's degree (B.A., B.S., B.Eng., etc.)" in valid_education_levels else valid_education_levels[0]
63
  default_dev_type = "Developer, back-end" if "Developer, back-end" in valid_dev_types else valid_dev_types[0]
64
  default_industry = "Software Development" if "Software Development" in valid_industries else valid_industries[0]
65
+ default_age = "25-34 years old" if "25-34 years old" in valid_ages else valid_ages[0]
66
 
67
  with col1:
68
  country = st.selectbox(
 
81
  help="Including any education, how many years have you been coding in total?",
82
  )
83
 
84
+ work_exp = st.number_input(
85
+ "Years of Professional Work Experience",
86
+ min_value=0,
87
+ max_value=50,
88
+ value=5,
89
+ step=1,
90
+ help="How many years of professional work experience do you have?",
91
+ )
92
+
93
  with col2:
94
  education = st.selectbox(
95
  "Education Level",
 
112
  help="Industry the developer works in (only industries from training data)",
113
  )
114
 
115
+ age = st.selectbox(
116
+ "Age",
117
+ options=valid_ages,
118
+ index=valid_ages.index(default_age),
119
+ help="Developer's age range",
120
+ )
121
+
122
  # Prediction button
123
  if st.button("🔮 Predict Salary", type="primary", use_container_width=True):
124
  try:
 
126
  input_data = SalaryInput(
127
  country=country,
128
  years_code=years,
129
+ work_exp=work_exp,
130
  education_level=education,
131
  dev_type=dev_type,
132
  industry=industry,
133
+ age=age,
134
  )
135
 
136
  # Make prediction
config/valid_categories.yaml CHANGED
@@ -69,3 +69,11 @@ Industry:
69
  - Retail and Consumer Services
70
  - Software Development
71
  - Transportation, or Supply Chain
 
 
 
 
 
 
 
 
 
69
  - Retail and Consumer Services
70
  - Software Development
71
  - Transportation, or Supply Chain
72
+ Age:
73
+ - 18-24 years old
74
+ - 25-34 years old
75
+ - 35-44 years old
76
+ - 45-54 years old
77
+ - 55-64 years old
78
+ - 65 years or older
79
+ - Other
example_inference.py CHANGED
@@ -18,16 +18,20 @@ def main():
18
  input_data_1 = SalaryInput(
19
  country="United States of America",
20
  years_code=5.0,
 
21
  education_level="Bachelor's degree (B.A., B.S., B.Eng., etc.)",
22
  dev_type="Developer, full-stack",
23
  industry="Software Development",
 
24
  )
25
 
26
  print(f"Country: {input_data_1.country}")
27
  print(f"Years of Coding (Total): {input_data_1.years_code}")
 
28
  print(f"Education Level: {input_data_1.education_level}")
29
  print(f"Developer Type: {input_data_1.dev_type}")
30
  print(f"Industry: {input_data_1.industry}")
 
31
 
32
  salary_1 = predict_salary(input_data_1)
33
  print(f"💰 Predicted Salary: ${salary_1:,.2f} USD/year")
@@ -39,16 +43,20 @@ def main():
39
  input_data_2 = SalaryInput(
40
  country="United States of America",
41
  years_code=2.0,
 
42
  education_level="Master's degree (M.A., M.S., M.Eng., MBA, etc.)",
43
  dev_type="Developer, front-end",
44
  industry="Fintech",
 
45
  )
46
 
47
  print(f"Country: {input_data_2.country}")
48
  print(f"Years of Coding (Total): {input_data_2.years_code}")
 
49
  print(f"Education Level: {input_data_2.education_level}")
50
  print(f"Developer Type: {input_data_2.dev_type}")
51
  print(f"Industry: {input_data_2.industry}")
 
52
 
53
  salary_2 = predict_salary(input_data_2)
54
  print(f"💰 Predicted Salary: ${salary_2:,.2f} USD/year")
@@ -60,16 +68,20 @@ def main():
60
  input_data_3 = SalaryInput(
61
  country="United States of America",
62
  years_code=10.0,
 
63
  education_level="Master's degree (M.A., M.S., M.Eng., MBA, etc.)",
64
  dev_type="Engineering manager",
65
  industry="Banking/Financial Services",
 
66
  )
67
 
68
  print(f"Country: {input_data_3.country}")
69
  print(f"Years of Coding (Total): {input_data_3.years_code}")
 
70
  print(f"Education Level: {input_data_3.education_level}")
71
  print(f"Developer Type: {input_data_3.dev_type}")
72
  print(f"Industry: {input_data_3.industry}")
 
73
 
74
  salary_3 = predict_salary(input_data_3)
75
  print(f"💰 Predicted Salary: ${salary_3:,.2f} USD/year")
@@ -81,16 +93,20 @@ def main():
81
  input_data_4 = SalaryInput(
82
  country="Germany",
83
  years_code=5.0,
 
84
  education_level="Bachelor's degree (B.A., B.S., B.Eng., etc.)",
85
  dev_type="Developer, back-end",
86
  industry="Manufacturing",
 
87
  )
88
 
89
  print(f"Country: {input_data_4.country}")
90
  print(f"Years of Coding (Total): {input_data_4.years_code}")
 
91
  print(f"Education Level: {input_data_4.education_level}")
92
  print(f"Developer Type: {input_data_4.dev_type}")
93
  print(f"Industry: {input_data_4.industry}")
 
94
 
95
  salary_4 = predict_salary(input_data_4)
96
  print(f"💰 Predicted Salary: ${salary_4:,.2f} USD/year")
 
18
  input_data_1 = SalaryInput(
19
  country="United States of America",
20
  years_code=5.0,
21
+ work_exp=3.0,
22
  education_level="Bachelor's degree (B.A., B.S., B.Eng., etc.)",
23
  dev_type="Developer, full-stack",
24
  industry="Software Development",
25
+ age="25-34 years old",
26
  )
27
 
28
  print(f"Country: {input_data_1.country}")
29
  print(f"Years of Coding (Total): {input_data_1.years_code}")
30
+ print(f"Work Experience: {input_data_1.work_exp}")
31
  print(f"Education Level: {input_data_1.education_level}")
32
  print(f"Developer Type: {input_data_1.dev_type}")
33
  print(f"Industry: {input_data_1.industry}")
34
+ print(f"Age: {input_data_1.age}")
35
 
36
  salary_1 = predict_salary(input_data_1)
37
  print(f"💰 Predicted Salary: ${salary_1:,.2f} USD/year")
 
43
  input_data_2 = SalaryInput(
44
  country="United States of America",
45
  years_code=2.0,
46
+ work_exp=1.0,
47
  education_level="Master's degree (M.A., M.S., M.Eng., MBA, etc.)",
48
  dev_type="Developer, front-end",
49
  industry="Fintech",
50
+ age="18-24 years old",
51
  )
52
 
53
  print(f"Country: {input_data_2.country}")
54
  print(f"Years of Coding (Total): {input_data_2.years_code}")
55
+ print(f"Work Experience: {input_data_2.work_exp}")
56
  print(f"Education Level: {input_data_2.education_level}")
57
  print(f"Developer Type: {input_data_2.dev_type}")
58
  print(f"Industry: {input_data_2.industry}")
59
+ print(f"Age: {input_data_2.age}")
60
 
61
  salary_2 = predict_salary(input_data_2)
62
  print(f"💰 Predicted Salary: ${salary_2:,.2f} USD/year")
 
68
  input_data_3 = SalaryInput(
69
  country="United States of America",
70
  years_code=10.0,
71
+ work_exp=8.0,
72
  education_level="Master's degree (M.A., M.S., M.Eng., MBA, etc.)",
73
  dev_type="Engineering manager",
74
  industry="Banking/Financial Services",
75
+ age="35-44 years old",
76
  )
77
 
78
  print(f"Country: {input_data_3.country}")
79
  print(f"Years of Coding (Total): {input_data_3.years_code}")
80
+ print(f"Work Experience: {input_data_3.work_exp}")
81
  print(f"Education Level: {input_data_3.education_level}")
82
  print(f"Developer Type: {input_data_3.dev_type}")
83
  print(f"Industry: {input_data_3.industry}")
84
+ print(f"Age: {input_data_3.age}")
85
 
86
  salary_3 = predict_salary(input_data_3)
87
  print(f"💰 Predicted Salary: ${salary_3:,.2f} USD/year")
 
93
  input_data_4 = SalaryInput(
94
  country="Germany",
95
  years_code=5.0,
96
+ work_exp=3.0,
97
  education_level="Bachelor's degree (B.A., B.S., B.Eng., etc.)",
98
  dev_type="Developer, back-end",
99
  industry="Manufacturing",
100
+ age="25-34 years old",
101
  )
102
 
103
  print(f"Country: {input_data_4.country}")
104
  print(f"Years of Coding (Total): {input_data_4.years_code}")
105
+ print(f"Work Experience: {input_data_4.work_exp}")
106
  print(f"Education Level: {input_data_4.education_level}")
107
  print(f"Developer Type: {input_data_4.dev_type}")
108
  print(f"Industry: {input_data_4.industry}")
109
+ print(f"Age: {input_data_4.age}")
110
 
111
  salary_4 = predict_salary(input_data_4)
112
  print(f"💰 Predicted Salary: ${salary_4:,.2f} USD/year")
models/model.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5165f22311d0eb6809380cf4fa5a749b59f0d8e81903462fe7c2c882e09e916f
3
- size 3192752
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:efa053c467c2c1ea33a65f04aedc32fb7a6c47658238f26d88cd0a10986c0c98
3
+ size 3985657
src/infer.py CHANGED
@@ -99,14 +99,23 @@ def predict_salary(data: SalaryInput) -> float:
99
  f"Check config/valid_categories.yaml for all valid values."
100
  )
101
 
 
 
 
 
 
 
 
102
  # Create a DataFrame with the input data
103
  input_df = pd.DataFrame(
104
  {
105
  "Country": [data.country],
106
  "YearsCode": [data.years_code],
 
107
  "EdLevel": [data.education_level],
108
  "DevType": [data.dev_type],
109
  "Industry": [data.industry],
 
110
  }
111
  )
112
 
 
99
  f"Check config/valid_categories.yaml for all valid values."
100
  )
101
 
102
+ if data.age not in valid_categories["Age"]:
103
+ raise ValueError(
104
+ f"Invalid age: '{data.age}'. "
105
+ f"Must be one of {len(valid_categories['Age'])} valid age ranges. "
106
+ f"Check config/valid_categories.yaml for all valid values."
107
+ )
108
+
109
  # Create a DataFrame with the input data
110
  input_df = pd.DataFrame(
111
  {
112
  "Country": [data.country],
113
  "YearsCode": [data.years_code],
114
+ "WorkExp": [data.work_exp],
115
  "EdLevel": [data.education_level],
116
  "DevType": [data.dev_type],
117
  "Industry": [data.industry],
118
+ "Age": [data.age],
119
  }
120
  )
121
 
src/preprocessing.py CHANGED
@@ -55,7 +55,7 @@ def prepare_features(df: pd.DataFrame) -> pd.DataFrame:
55
  during training and inference, preventing data leakage and inconsistencies.
56
 
57
  Args:
58
- df: DataFrame with columns: Country, YearsCode, EdLevel, DevType, Industry
59
  NOTE: During training, cardinality reduction should be applied to df
60
  BEFORE calling this function. During inference, valid_categories.yaml
61
  ensures only valid (already-reduced) categories are used.
@@ -67,7 +67,7 @@ def prepare_features(df: pd.DataFrame) -> pd.DataFrame:
67
  - Fills missing values with defaults (0 for numeric, "Unknown" for categorical)
68
  - Normalizes Unicode apostrophes to regular apostrophes
69
  - Applies one-hot encoding with drop_first=True to avoid multicollinearity
70
- - Column names in output will be like: YearsCode, Country_X, EdLevel_Y, DevType_Z, Industry_W
71
  - Does NOT apply cardinality reduction (must be done before calling this)
72
  """
73
  # Create a copy to avoid modifying the original
@@ -75,7 +75,7 @@ def prepare_features(df: pd.DataFrame) -> pd.DataFrame:
75
 
76
  # Normalize Unicode apostrophes to regular apostrophes for consistency
77
  # This handles cases where data has \u2019 (’) instead of '
78
- for col in ["Country", "EdLevel", "DevType", "Industry"]:
79
  if col in df_processed.columns:
80
  df_processed[col] = df_processed[col].str.replace('\u2019', "'", regex=False)
81
 
@@ -85,17 +85,19 @@ def prepare_features(df: pd.DataFrame) -> pd.DataFrame:
85
 
86
  # Fill missing values with defaults
87
  df_processed["YearsCode"] = df_processed["YearsCode"].fillna(0)
 
88
  df_processed["Country"] = df_processed["Country"].fillna("Unknown")
89
  df_processed["EdLevel"] = df_processed["EdLevel"].fillna("Unknown")
90
  df_processed["DevType"] = df_processed["DevType"].fillna("Unknown")
91
  df_processed["Industry"] = df_processed["Industry"].fillna("Unknown")
 
92
 
93
  # NOTE: Cardinality reduction is NOT applied here
94
  # It should be applied during training BEFORE calling this function
95
  # During inference, valid_categories.yaml ensures only valid values are used
96
 
97
  # Select only the features we need
98
- feature_cols = ["Country", "YearsCode", "EdLevel", "DevType", "Industry"]
99
  df_features = df_processed[feature_cols]
100
 
101
  # Apply one-hot encoding for categorical variables
 
55
  during training and inference, preventing data leakage and inconsistencies.
56
 
57
  Args:
58
+ df: DataFrame with columns: Country, YearsCode, WorkExp, EdLevel, DevType, Industry, Age
59
  NOTE: During training, cardinality reduction should be applied to df
60
  BEFORE calling this function. During inference, valid_categories.yaml
61
  ensures only valid (already-reduced) categories are used.
 
67
  - Fills missing values with defaults (0 for numeric, "Unknown" for categorical)
68
  - Normalizes Unicode apostrophes to regular apostrophes
69
  - Applies one-hot encoding with drop_first=True to avoid multicollinearity
70
+ - Column names in output will be like: YearsCode, WorkExp, Country_X, EdLevel_Y, DevType_Z, Industry_W, Age_V
71
  - Does NOT apply cardinality reduction (must be done before calling this)
72
  """
73
  # Create a copy to avoid modifying the original
 
75
 
76
  # Normalize Unicode apostrophes to regular apostrophes for consistency
77
  # This handles cases where data has \u2019 (’) instead of '
78
+ for col in ["Country", "EdLevel", "DevType", "Industry", "Age"]:
79
  if col in df_processed.columns:
80
  df_processed[col] = df_processed[col].str.replace('\u2019', "'", regex=False)
81
 
 
85
 
86
  # Fill missing values with defaults
87
  df_processed["YearsCode"] = df_processed["YearsCode"].fillna(0)
88
+ df_processed["WorkExp"] = df_processed["WorkExp"].fillna(0)
89
  df_processed["Country"] = df_processed["Country"].fillna("Unknown")
90
  df_processed["EdLevel"] = df_processed["EdLevel"].fillna("Unknown")
91
  df_processed["DevType"] = df_processed["DevType"].fillna("Unknown")
92
  df_processed["Industry"] = df_processed["Industry"].fillna("Unknown")
93
+ df_processed["Age"] = df_processed["Age"].fillna("Unknown")
94
 
95
  # NOTE: Cardinality reduction is NOT applied here
96
  # It should be applied during training BEFORE calling this function
97
  # During inference, valid_categories.yaml ensures only valid values are used
98
 
99
  # Select only the features we need
100
+ feature_cols = ["Country", "YearsCode", "WorkExp", "EdLevel", "DevType", "Industry", "Age"]
101
  df_features = df_processed[feature_cols]
102
 
103
  # Apply one-hot encoding for categorical variables
src/schema.py CHANGED
@@ -12,9 +12,15 @@ class SalaryInput(BaseModel):
12
  ge=0,
13
  description="Including any education, how many years have you been coding in total?",
14
  )
 
 
 
 
 
15
  education_level: str = Field(..., description="Education level")
16
  dev_type: str = Field(..., description="Developer type")
17
  industry: str = Field(..., description="Industry the developer works in")
 
18
 
19
  class Config:
20
  """Pydantic configuration."""
@@ -23,8 +29,10 @@ class SalaryInput(BaseModel):
23
  "example": {
24
  "country": "United States",
25
  "years_code": 5.0,
 
26
  "education_level": "Bachelor's degree",
27
  "dev_type": "Developer, back-end",
28
  "industry": "Software Development",
 
29
  }
30
  }
 
12
  ge=0,
13
  description="Including any education, how many years have you been coding in total?",
14
  )
15
+ work_exp: float = Field(
16
+ ...,
17
+ ge=0,
18
+ description="How many years of professional work experience do you have?",
19
+ )
20
  education_level: str = Field(..., description="Education level")
21
  dev_type: str = Field(..., description="Developer type")
22
  industry: str = Field(..., description="Industry the developer works in")
23
+ age: str = Field(..., description="Developer's age range")
24
 
25
  class Config:
26
  """Pydantic configuration."""
 
29
  "example": {
30
  "country": "United States",
31
  "years_code": 5.0,
32
+ "work_exp": 3.0,
33
  "education_level": "Bachelor's degree",
34
  "dev_type": "Developer, back-end",
35
  "industry": "Software Development",
36
+ "age": "25-34 years old",
37
  }
38
  }
src/train.py CHANGED
@@ -32,7 +32,7 @@ def main():
32
  # Load only required columns to save memory
33
  df = pd.read_csv(
34
  data_path,
35
- usecols=["Country", "YearsCode", "EdLevel", "DevType", "Industry",
36
  "Currency", "CompTotal", "ConvertedCompYearly"],
37
  )
38
 
@@ -67,12 +67,14 @@ def main():
67
  df_copy["EdLevel"] = df_copy["EdLevel"].str.replace('\u2019', "'", regex=False)
68
  df_copy["DevType"] = df_copy["DevType"].str.replace('\u2019', "'", regex=False)
69
  df_copy["Industry"] = df_copy["Industry"].str.replace('\u2019', "'", regex=False)
 
70
 
71
  # Apply cardinality reduction
72
  df_copy["Country"] = reduce_cardinality(df_copy["Country"])
73
  df_copy["EdLevel"] = reduce_cardinality(df_copy["EdLevel"])
74
  df_copy["DevType"] = reduce_cardinality(df_copy["DevType"])
75
  df_copy["Industry"] = reduce_cardinality(df_copy["Industry"])
 
76
 
77
  # Apply cardinality reduction to the actual training data as well
78
  # (prepare_features no longer does this internally)
@@ -80,6 +82,7 @@ def main():
80
  df["EdLevel"] = reduce_cardinality(df["EdLevel"])
81
  df["DevType"] = reduce_cardinality(df["DevType"])
82
  df["Industry"] = reduce_cardinality(df["Industry"])
 
83
 
84
  # Now apply full feature transformations for model training
85
  X = prepare_features(df)
@@ -91,19 +94,21 @@ def main():
91
  edlevel_values = df_copy["EdLevel"].dropna().unique().tolist()
92
  devtype_values = df_copy["DevType"].dropna().unique().tolist()
93
  industry_values = df_copy["Industry"].dropna().unique().tolist()
 
94
 
95
  valid_categories = {
96
  "Country": sorted(country_values),
97
  "EdLevel": sorted(edlevel_values),
98
  "DevType": sorted(devtype_values),
99
  "Industry": sorted(industry_values),
 
100
  }
101
 
102
  valid_categories_path = Path("config/valid_categories.yaml")
103
  with open(valid_categories_path, "w") as f:
104
  yaml.dump(valid_categories, f, default_flow_style=False, sort_keys=False)
105
 
106
- print(f"\nSaved {len(valid_categories['Country'])} valid countries, {len(valid_categories['EdLevel'])} valid education levels, {len(valid_categories['DevType'])} valid developer types, and {len(valid_categories['Industry'])} valid industries to {valid_categories_path}")
107
 
108
  # Compute currency conversion rates per country
109
  # Use the original data with Currency and CompTotal columns
@@ -181,6 +186,12 @@ def main():
181
  for industry, count in top_industry.items():
182
  print(f" - {industry}: {count:,} ({count/len(df)*100:.1f}%)")
183
 
 
 
 
 
 
 
184
  # Show YearsCode statistics
185
  print("\n💼 Years of Coding Experience:")
186
  print(f" - Min: {df['YearsCode'].min():.1f}")
@@ -190,6 +201,15 @@ def main():
190
  print(f" - 25th percentile: {df['YearsCode'].quantile(0.25):.1f}")
191
  print(f" - 75th percentile: {df['YearsCode'].quantile(0.75):.1f}")
192
 
 
 
 
 
 
 
 
 
 
193
  # Show most common one-hot encoded features (by frequency)
194
  # Separate analysis for each categorical feature
195
 
@@ -231,12 +251,21 @@ def main():
231
  industry_name = feature.replace('Industry_', '')
232
  print(f" {i:2d}. {industry_name:45s} - {count:6.0f} occurrences ({percentage:5.1f}%)")
233
 
 
 
 
 
 
 
 
 
234
  print(f"\n📊 Total one-hot encoded features: {len(X.columns)}")
235
- print(" - Numeric: 1 (YearsCode)")
236
  print(f" - Country: {len(country_features)}")
237
  print(f" - Education: {len(edlevel_features)}")
238
  print(f" - DevType: {len(devtype_features)}")
239
  print(f" - Industry: {len(industry_features)}")
 
240
 
241
  print("=" * 60 + "\n")
242
 
 
32
  # Load only required columns to save memory
33
  df = pd.read_csv(
34
  data_path,
35
+ usecols=["Country", "YearsCode", "WorkExp", "EdLevel", "DevType", "Industry", "Age",
36
  "Currency", "CompTotal", "ConvertedCompYearly"],
37
  )
38
 
 
67
  df_copy["EdLevel"] = df_copy["EdLevel"].str.replace('\u2019', "'", regex=False)
68
  df_copy["DevType"] = df_copy["DevType"].str.replace('\u2019', "'", regex=False)
69
  df_copy["Industry"] = df_copy["Industry"].str.replace('\u2019', "'", regex=False)
70
+ df_copy["Age"] = df_copy["Age"].str.replace('\u2019', "'", regex=False)
71
 
72
  # Apply cardinality reduction
73
  df_copy["Country"] = reduce_cardinality(df_copy["Country"])
74
  df_copy["EdLevel"] = reduce_cardinality(df_copy["EdLevel"])
75
  df_copy["DevType"] = reduce_cardinality(df_copy["DevType"])
76
  df_copy["Industry"] = reduce_cardinality(df_copy["Industry"])
77
+ df_copy["Age"] = reduce_cardinality(df_copy["Age"])
78
 
79
  # Apply cardinality reduction to the actual training data as well
80
  # (prepare_features no longer does this internally)
 
82
  df["EdLevel"] = reduce_cardinality(df["EdLevel"])
83
  df["DevType"] = reduce_cardinality(df["DevType"])
84
  df["Industry"] = reduce_cardinality(df["Industry"])
85
+ df["Age"] = reduce_cardinality(df["Age"])
86
 
87
  # Now apply full feature transformations for model training
88
  X = prepare_features(df)
 
94
  edlevel_values = df_copy["EdLevel"].dropna().unique().tolist()
95
  devtype_values = df_copy["DevType"].dropna().unique().tolist()
96
  industry_values = df_copy["Industry"].dropna().unique().tolist()
97
+ age_values = df_copy["Age"].dropna().unique().tolist()
98
 
99
  valid_categories = {
100
  "Country": sorted(country_values),
101
  "EdLevel": sorted(edlevel_values),
102
  "DevType": sorted(devtype_values),
103
  "Industry": sorted(industry_values),
104
+ "Age": sorted(age_values),
105
  }
106
 
107
  valid_categories_path = Path("config/valid_categories.yaml")
108
  with open(valid_categories_path, "w") as f:
109
  yaml.dump(valid_categories, f, default_flow_style=False, sort_keys=False)
110
 
111
+ print(f"\nSaved {len(valid_categories['Country'])} valid countries, {len(valid_categories['EdLevel'])} valid education levels, {len(valid_categories['DevType'])} valid developer types, {len(valid_categories['Industry'])} valid industries, and {len(valid_categories['Age'])} valid age ranges to {valid_categories_path}")
112
 
113
  # Compute currency conversion rates per country
114
  # Use the original data with Currency and CompTotal columns
 
186
  for industry, count in top_industry.items():
187
  print(f" - {industry}: {count:,} ({count/len(df)*100:.1f}%)")
188
 
189
+ # Show age distribution
190
+ print("\n🎂 Age Distribution:")
191
+ top_age = df["Age"].value_counts().head(10)
192
+ for age, count in top_age.items():
193
+ print(f" - {age}: {count:,} ({count/len(df)*100:.1f}%)")
194
+
195
  # Show YearsCode statistics
196
  print("\n💼 Years of Coding Experience:")
197
  print(f" - Min: {df['YearsCode'].min():.1f}")
 
201
  print(f" - 25th percentile: {df['YearsCode'].quantile(0.25):.1f}")
202
  print(f" - 75th percentile: {df['YearsCode'].quantile(0.75):.1f}")
203
 
204
+ # Show WorkExp statistics
205
+ print("\n💼 Years of Professional Work Experience:")
206
+ print(f" - Min: {df['WorkExp'].min():.1f}")
207
+ print(f" - Max: {df['WorkExp'].max():.1f}")
208
+ print(f" - Mean: {df['WorkExp'].mean():.1f}")
209
+ print(f" - Median: {df['WorkExp'].median():.1f}")
210
+ print(f" - 25th percentile: {df['WorkExp'].quantile(0.25):.1f}")
211
+ print(f" - 75th percentile: {df['WorkExp'].quantile(0.75):.1f}")
212
+
213
  # Show most common one-hot encoded features (by frequency)
214
  # Separate analysis for each categorical feature
215
 
 
251
  industry_name = feature.replace('Industry_', '')
252
  print(f" {i:2d}. {industry_name:45s} - {count:6.0f} occurrences ({percentage:5.1f}%)")
253
 
254
+ # Age features
255
+ print("\n🎂 Top 10 Age Features (most common):")
256
+ age_features = categorical_features[categorical_features.index.str.startswith('Age_')]
257
+ for i, (feature, count) in enumerate(age_features.head(10).items(), 1):
258
+ percentage = (count / len(X)) * 100
259
+ age_name = feature.replace('Age_', '')
260
+ print(f" {i:2d}. {age_name:45s} - {count:6.0f} occurrences ({percentage:5.1f}%)")
261
+
262
  print(f"\n📊 Total one-hot encoded features: {len(X.columns)}")
263
+ print(" - Numeric: 2 (YearsCode, WorkExp)")
264
  print(f" - Country: {len(country_features)}")
265
  print(f" - Education: {len(edlevel_features)}")
266
  print(f" - DevType: {len(devtype_features)}")
267
  print(f" - Industry: {len(industry_features)}")
268
+ print(f" - Age: {len(age_features)}")
269
 
270
  print("=" * 60 + "\n")
271
 
test_feature_impact.py CHANGED
@@ -12,9 +12,11 @@ def test_years_experience_impact():
12
 
13
  base_input = {
14
  "country": "United States of America",
 
15
  "education_level": "Bachelor's degree (B.A., B.S., B.Eng., etc.)",
16
  "dev_type": "Developer, full-stack",
17
  "industry": "Software Development",
 
18
  }
19
 
20
  # Test with different years of experience
@@ -45,9 +47,11 @@ def test_country_impact():
45
 
46
  base_input = {
47
  "years_code": 5.0,
 
48
  "education_level": "Bachelor's degree (B.A., B.S., B.Eng., etc.)",
49
  "dev_type": "Developer, full-stack",
50
  "industry": "Software Development",
 
51
  }
52
 
53
  # Test with different countries (select diverse ones)
@@ -93,8 +97,10 @@ def test_education_impact():
93
  base_input = {
94
  "country": "United States of America",
95
  "years_code": 5.0,
 
96
  "dev_type": "Developer, full-stack",
97
  "industry": "Software Development",
 
98
  }
99
 
100
  # Test with different education levels
@@ -141,8 +147,10 @@ def test_devtype_impact():
141
  base_input = {
142
  "country": "United States of America",
143
  "years_code": 5.0,
 
144
  "education_level": "Bachelor's degree (B.A., B.S., B.Eng., etc.)",
145
  "industry": "Software Development",
 
146
  }
147
 
148
  # Test with different developer types (using actual values from trained model)
@@ -189,8 +197,10 @@ def test_industry_impact():
189
  base_input = {
190
  "country": "United States of America",
191
  "years_code": 5.0,
 
192
  "education_level": "Bachelor's degree (B.A., B.S., B.Eng., etc.)",
193
  "dev_type": "Developer, full-stack",
 
194
  }
195
 
196
  # Test with different industries (using actual values from trained model)
@@ -228,40 +238,132 @@ def test_industry_impact():
228
  return False
229
 
230
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
231
  def test_combined_features():
232
  """Test that combining different features produces expected variations."""
233
  print("\n" + "=" * 70)
234
- print("TEST 6: Combined Feature Variations")
235
  print("=" * 70)
236
 
237
  # Create diverse combinations (using actual values from trained model)
238
  test_cases = [
239
- ("India", 2, "Bachelor's degree (B.A., B.S., B.Eng., etc.)", "Developer, back-end", "Software Development"),
240
- ("Germany", 5, "Master's degree (M.A., M.S., M.Eng., MBA, etc.)", "Developer, full-stack", "Manufacturing"),
241
- ("United States of America", 10, "Master's degree (M.A., M.S., M.Eng., MBA, etc.)", "Engineering manager", "Fintech"),
242
- ("Poland", 15, "Bachelor's degree (B.A., B.S., B.Eng., etc.)", "Developer, front-end", "Healthcare"),
243
- ("Brazil", 5, "Some college/university study without earning a degree", "DevOps engineer or professional", "Government"),
244
  ]
245
 
246
  predictions = []
247
- for country, years, education, devtype, industry in test_cases:
248
  # Skip if not in valid categories
249
  if (country not in valid_categories["Country"]
250
  or education not in valid_categories["EdLevel"]
251
  or devtype not in valid_categories["DevType"]
252
- or industry not in valid_categories["Industry"]):
 
253
  continue
254
 
255
  input_data = SalaryInput(
256
  country=country,
257
  years_code=years,
 
258
  education_level=education,
259
  dev_type=devtype,
260
  industry=industry,
 
261
  )
262
  salary = predict_salary(input_data)
263
  predictions.append(salary)
264
- print(f" {country[:15]:15s} | {years:2d}y | {education[:25]:25s} | {devtype[:25]:25s} | {industry[:20]:20s} -> ${salary:,.2f}")
265
 
266
  # Check if predictions are different
267
  unique_predictions = len(set(predictions))
@@ -289,13 +391,15 @@ def print_feature_analysis():
289
  edlevel_features = [f for f in feature_columns if f.startswith('EdLevel_')]
290
  devtype_features = [f for f in feature_columns if f.startswith('DevType_')]
291
  industry_features = [f for f in feature_columns if f.startswith('Industry_')]
292
- numeric_features = [f for f in feature_columns if not f.startswith(('Country_', 'EdLevel_', 'DevType_', 'Industry_'))]
 
293
 
294
  print(f" - Numeric features: {len(numeric_features)} -> {numeric_features}")
295
  print(f" - Country features: {len(country_features)}")
296
  print(f" - Education features: {len(edlevel_features)}")
297
  print(f" - DevType features: {len(devtype_features)}")
298
  print(f" - Industry features: {len(industry_features)}")
 
299
 
300
  if len(country_features) > 0:
301
  print(f"\nSample country features:")
@@ -317,6 +421,11 @@ def print_feature_analysis():
317
  for feat in industry_features[:5]:
318
  print(f" - {feat}")
319
 
 
 
 
 
 
320
  # Check if there are any features at all
321
  if len(country_features) == 0:
322
  print("\n⚠️ WARNING: No country features found!")
@@ -326,6 +435,8 @@ def print_feature_analysis():
326
  print("\n⚠️ WARNING: No developer type features found!")
327
  if len(industry_features) == 0:
328
  print("\n⚠️ WARNING: No industry features found!")
 
 
329
 
330
 
331
  def main():
@@ -340,11 +451,13 @@ def main():
340
 
341
  # Run all tests
342
  results = {
343
- "Years of Experience": test_years_experience_impact(),
344
  "Country": test_country_impact(),
345
  "Education Level": test_education_impact(),
346
  "Developer Type": test_devtype_impact(),
347
  "Industry": test_industry_impact(),
 
 
348
  "Combined Features": test_combined_features(),
349
  }
350
 
 
12
 
13
  base_input = {
14
  "country": "United States of America",
15
+ "work_exp": 3.0,
16
  "education_level": "Bachelor's degree (B.A., B.S., B.Eng., etc.)",
17
  "dev_type": "Developer, full-stack",
18
  "industry": "Software Development",
19
+ "age": "25-34 years old",
20
  }
21
 
22
  # Test with different years of experience
 
47
 
48
  base_input = {
49
  "years_code": 5.0,
50
+ "work_exp": 3.0,
51
  "education_level": "Bachelor's degree (B.A., B.S., B.Eng., etc.)",
52
  "dev_type": "Developer, full-stack",
53
  "industry": "Software Development",
54
+ "age": "25-34 years old",
55
  }
56
 
57
  # Test with different countries (select diverse ones)
 
97
  base_input = {
98
  "country": "United States of America",
99
  "years_code": 5.0,
100
+ "work_exp": 3.0,
101
  "dev_type": "Developer, full-stack",
102
  "industry": "Software Development",
103
+ "age": "25-34 years old",
104
  }
105
 
106
  # Test with different education levels
 
147
  base_input = {
148
  "country": "United States of America",
149
  "years_code": 5.0,
150
+ "work_exp": 3.0,
151
  "education_level": "Bachelor's degree (B.A., B.S., B.Eng., etc.)",
152
  "industry": "Software Development",
153
+ "age": "25-34 years old",
154
  }
155
 
156
  # Test with different developer types (using actual values from trained model)
 
197
  base_input = {
198
  "country": "United States of America",
199
  "years_code": 5.0,
200
+ "work_exp": 3.0,
201
  "education_level": "Bachelor's degree (B.A., B.S., B.Eng., etc.)",
202
  "dev_type": "Developer, full-stack",
203
+ "age": "25-34 years old",
204
  }
205
 
206
  # Test with different industries (using actual values from trained model)
 
238
  return False
239
 
240
 
241
def test_age_impact():
    """Test that changing the age range changes the predicted salary.

    Every field except ``age`` is held constant. One prediction is made for
    each candidate age bracket that is present in ``valid_categories["Age"]``,
    each result is printed, and the predictions are compared with each other.

    Returns:
        bool: True only when at least two age brackets could be tested and
        every prediction is distinct. False when too few brackets are
        available to compare, when all predictions are identical, or when
        some predictions are duplicated.
    """
    print("\n" + "=" * 70)
    print("TEST 6: Age Impact")
    print("=" * 70)

    base_input = {
        "country": "United States of America",
        "years_code": 5.0,
        "work_exp": 3.0,
        "education_level": "Bachelor's degree (B.A., B.S., B.Eng., etc.)",
        "dev_type": "Developer, full-stack",
        "industry": "Software Development",
    }

    # Test with different age ranges (using actual values from trained model)
    candidate_ages = [
        "18-24 years old",
        "25-34 years old",
        "35-44 years old",
        "45-54 years old",
        "55-64 years old",
    ]

    # Filter to only ages that exist in valid categories
    test_ages = [a for a in candidate_ages if a in valid_categories["Age"]]

    # A comparison needs at least two inputs. With zero or one prediction the
    # "all different" check below is trivially true and would report a PASS
    # without having verified anything.
    if len(test_ages) < 2:
        print(
            f"\n❌ FAIL: Only {len(test_ages)} valid age range(s) available"
            " - need at least 2 to compare predictions"
        )
        return False

    predictions = []
    for age in test_ages:
        input_data = SalaryInput(**base_input, age=age)
        salary = predict_salary(input_data)
        predictions.append(salary)
        print(f" Age: {age[:50]:50s} -> Salary: ${salary:,.2f}")

    # Check if predictions are different
    unique_predictions = len(set(predictions))
    if unique_predictions == len(predictions):
        print(f"\n✅ PASS: All {len(predictions)} predictions are different")
        return True
    elif unique_predictions == 1:
        print(f"\n❌ FAIL: All predictions are IDENTICAL (${predictions[0]:,.2f})")
        print(" This indicates the model is NOT using age as a feature!")
        return False
    else:
        print(f"\n⚠️ PARTIAL: Only {unique_predictions}/{len(predictions)} unique predictions")
        print(" Duplicate salaries found - possible feature issue")
        return False
288
+
289
+
290
def test_work_exp_impact():
    """Check that the prediction varies with years of work experience.

    All other input fields are fixed. ``work_exp`` is swept over a fixed set
    of values, each prediction is printed, and the results are compared.

    Returns:
        bool: True when every prediction is distinct; False when all
        predictions are identical or when only some of them differ.
    """
    banner = "=" * 70
    print("\n" + banner)
    print("TEST 7: Work Experience Impact")
    print(banner)

    # Everything except work experience stays constant across the sweep.
    fixed_fields = dict(
        country="United States of America",
        years_code=10.0,
        education_level="Bachelor's degree (B.A., B.S., B.Eng., etc.)",
        dev_type="Developer, full-stack",
        industry="Software Development",
        age="25-34 years old",
    )

    # Years of work experience to try
    predictions = []
    for work_exp in (0, 1, 3, 5, 10, 20):
        salary = predict_salary(SalaryInput(**fixed_fields, work_exp=work_exp))
        predictions.append(salary)
        print(f" Work Exp: {work_exp:2d} -> Salary: ${salary:,.2f}")

    # Compare the number of distinct predictions with the number made
    total = len(predictions)
    distinct = len(set(predictions))

    if distinct == total:
        print(f"\n✅ PASS: All {total} predictions are different")
        return True

    if distinct == 1:
        print(f"\n❌ FAIL: All predictions are IDENTICAL (${predictions[0]:,.2f})")
        print(" This indicates the model is NOT using work experience as a feature!")
    else:
        print(f"\n⚠️ PARTIAL: Only {distinct}/{total} unique predictions")
        print(" Duplicate salaries found - possible feature issue")
    return False
328
+
329
+
330
  def test_combined_features():
331
  """Test that combining different features produces expected variations."""
332
  print("\n" + "=" * 70)
333
+ print("TEST 8: Combined Feature Variations")
334
  print("=" * 70)
335
 
336
  # Create diverse combinations (using actual values from trained model)
337
  test_cases = [
338
+ ("India", 2, 1, "Bachelor's degree (B.A., B.S., B.Eng., etc.)", "Developer, back-end", "Software Development", "18-24 years old"),
339
+ ("Germany", 5, 3, "Master's degree (M.A., M.S., M.Eng., MBA, etc.)", "Developer, full-stack", "Manufacturing", "25-34 years old"),
340
+ ("United States of America", 10, 8, "Master's degree (M.A., M.S., M.Eng., MBA, etc.)", "Engineering manager", "Fintech", "35-44 years old"),
341
+ ("Poland", 15, 12, "Bachelor's degree (B.A., B.S., B.Eng., etc.)", "Developer, front-end", "Healthcare", "45-54 years old"),
342
+ ("Brazil", 5, 3, "Some college/university study without earning a degree", "DevOps engineer or professional", "Government", "25-34 years old"),
343
  ]
344
 
345
  predictions = []
346
+ for country, years, work_exp, education, devtype, industry, age in test_cases:
347
  # Skip if not in valid categories
348
  if (country not in valid_categories["Country"]
349
  or education not in valid_categories["EdLevel"]
350
  or devtype not in valid_categories["DevType"]
351
+ or industry not in valid_categories["Industry"]
352
+ or age not in valid_categories["Age"]):
353
  continue
354
 
355
  input_data = SalaryInput(
356
  country=country,
357
  years_code=years,
358
+ work_exp=work_exp,
359
  education_level=education,
360
  dev_type=devtype,
361
  industry=industry,
362
+ age=age,
363
  )
364
  salary = predict_salary(input_data)
365
  predictions.append(salary)
366
+ print(f" {country[:15]:15s} | {years:2d}y | {work_exp:2d}w | {education[:25]:25s} | {devtype[:25]:25s} | {industry[:20]:20s} | {age[:15]:15s} -> ${salary:,.2f}")
367
 
368
  # Check if predictions are different
369
  unique_predictions = len(set(predictions))
 
391
  edlevel_features = [f for f in feature_columns if f.startswith('EdLevel_')]
392
  devtype_features = [f for f in feature_columns if f.startswith('DevType_')]
393
  industry_features = [f for f in feature_columns if f.startswith('Industry_')]
394
+ age_features = [f for f in feature_columns if f.startswith('Age_')]
395
+ numeric_features = [f for f in feature_columns if not f.startswith(('Country_', 'EdLevel_', 'DevType_', 'Industry_', 'Age_'))]
396
 
397
  print(f" - Numeric features: {len(numeric_features)} -> {numeric_features}")
398
  print(f" - Country features: {len(country_features)}")
399
  print(f" - Education features: {len(edlevel_features)}")
400
  print(f" - DevType features: {len(devtype_features)}")
401
  print(f" - Industry features: {len(industry_features)}")
402
+ print(f" - Age features: {len(age_features)}")
403
 
404
  if len(country_features) > 0:
405
  print(f"\nSample country features:")
 
421
  for feat in industry_features[:5]:
422
  print(f" - {feat}")
423
 
424
+ if len(age_features) > 0:
425
+ print(f"\nSample age features:")
426
+ for feat in age_features[:5]:
427
+ print(f" - {feat}")
428
+
429
  # Check if there are any features at all
430
  if len(country_features) == 0:
431
  print("\n⚠️ WARNING: No country features found!")
 
435
  print("\n⚠️ WARNING: No developer type features found!")
436
  if len(industry_features) == 0:
437
  print("\n⚠️ WARNING: No industry features found!")
438
+ if len(age_features) == 0:
439
+ print("\n⚠️ WARNING: No age features found!")
440
 
441
 
442
  def main():
 
451
 
452
  # Run all tests
453
  results = {
454
+ "Years of Coding": test_years_experience_impact(),
455
  "Country": test_country_impact(),
456
  "Education Level": test_education_impact(),
457
  "Developer Type": test_devtype_impact(),
458
  "Industry": test_industry_impact(),
459
+ "Age": test_age_impact(),
460
+ "Work Experience": test_work_exp_impact(),
461
  "Combined Features": test_combined_features(),
462
  }
463