Upload 22 files
Browse files
- Claude.md +5 -2
- app.py +23 -0
- config/valid_categories.yaml +8 -0
- example_inference.py +16 -0
- models/model.pkl +2 -2
- src/infer.py +9 -0
- src/preprocessing.py +6 -4
- src/schema.py +8 -0
- src/train.py +32 -3
- test_feature_impact.py +124 -11
Claude.md
CHANGED
|
@@ -77,7 +77,8 @@ input_data = SalaryInput(
|
|
| 77 |
years_code=5.0,
|
| 78 |
education_level="Bachelor's degree",
|
| 79 |
dev_type="Developer, back-end",
|
| 80 |
-
industry="Software Development"
|
|
|
|
| 81 |
)
|
| 82 |
salary = predict_salary(input_data)
|
| 83 |
```
|
|
@@ -123,6 +124,7 @@ The dataset must include these columns:
|
|
| 123 |
- `EdLevel` - Education level
|
| 124 |
- `DevType` - Developer type
|
| 125 |
- `Industry` - Industry the developer works in
|
|
|
|
| 126 |
- `ConvertedCompYearly` - Annual salary (target variable)
|
| 127 |
|
| 128 |
### Model Expectations
|
|
@@ -157,7 +159,8 @@ test_input = SalaryInput(
|
|
| 157 |
years_code=3.0,
|
| 158 |
education_level="Bachelor's degree",
|
| 159 |
dev_type="Developer, back-end",
|
| 160 |
-
industry="Software Development"
|
|
|
|
| 161 |
)
|
| 162 |
print(predict_salary(test_input))
|
| 163 |
```
|
|
|
|
| 77 |
years_code=5.0,
|
| 78 |
education_level="Bachelor's degree",
|
| 79 |
dev_type="Developer, back-end",
|
| 80 |
+
industry="Software Development",
|
| 81 |
+
age="25-34 years old"
|
| 82 |
)
|
| 83 |
salary = predict_salary(input_data)
|
| 84 |
```
|
|
|
|
| 124 |
- `EdLevel` - Education level
|
| 125 |
- `DevType` - Developer type
|
| 126 |
- `Industry` - Industry the developer works in
|
| 127 |
+
- `Age` - Developer's age range
|
| 128 |
- `ConvertedCompYearly` - Annual salary (target variable)
|
| 129 |
|
| 130 |
### Model Expectations
|
|
|
|
| 159 |
years_code=3.0,
|
| 160 |
education_level="Bachelor's degree",
|
| 161 |
dev_type="Developer, back-end",
|
| 162 |
+
industry="Software Development",
|
| 163 |
+
age="25-34 years old"
|
| 164 |
)
|
| 165 |
print(predict_salary(test_input))
|
| 166 |
```
|
app.py
CHANGED
|
@@ -27,9 +27,11 @@ with st.sidebar:
|
|
| 27 |
Developer Survey data to predict annual salaries based on:
|
| 28 |
- Country
|
| 29 |
- Total years of coding experience (including education)
|
|
|
|
| 30 |
- Education level
|
| 31 |
- Developer type
|
| 32 |
- Industry
|
|
|
|
| 33 |
"""
|
| 34 |
)
|
| 35 |
st.info("💡 Tip: Results are estimates based on survey averages.")
|
|
@@ -40,6 +42,7 @@ with st.sidebar:
|
|
| 40 |
st.write(f"**Education Levels:** {len(valid_categories['EdLevel'])} available")
|
| 41 |
st.write(f"**Developer Types:** {len(valid_categories['DevType'])} available")
|
| 42 |
st.write(f"**Industries:** {len(valid_categories['Industry'])} available")
|
|
|
|
| 43 |
st.caption("Only values from the training data are shown in the dropdowns.")
|
| 44 |
|
| 45 |
# Main input form
|
|
@@ -52,12 +55,14 @@ valid_countries = valid_categories["Country"]
|
|
| 52 |
valid_education_levels = valid_categories["EdLevel"]
|
| 53 |
valid_dev_types = valid_categories["DevType"]
|
| 54 |
valid_industries = valid_categories["Industry"]
|
|
|
|
| 55 |
|
| 56 |
# Set default values (if available)
|
| 57 |
default_country = "United States of America" if "United States of America" in valid_countries else valid_countries[0]
|
| 58 |
default_education = "Bachelor's degree (B.A., B.S., B.Eng., etc.)" if "Bachelor's degree (B.A., B.S., B.Eng., etc.)" in valid_education_levels else valid_education_levels[0]
|
| 59 |
default_dev_type = "Developer, back-end" if "Developer, back-end" in valid_dev_types else valid_dev_types[0]
|
| 60 |
default_industry = "Software Development" if "Software Development" in valid_industries else valid_industries[0]
|
|
|
|
| 61 |
|
| 62 |
with col1:
|
| 63 |
country = st.selectbox(
|
|
@@ -76,6 +81,15 @@ with col1:
|
|
| 76 |
help="Including any education, how many years have you been coding in total?",
|
| 77 |
)
|
| 78 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
with col2:
|
| 80 |
education = st.selectbox(
|
| 81 |
"Education Level",
|
|
@@ -98,6 +112,13 @@ industry = st.selectbox(
|
|
| 98 |
help="Industry the developer works in (only industries from training data)",
|
| 99 |
)
|
| 100 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
# Prediction button
|
| 102 |
if st.button("🔮 Predict Salary", type="primary", use_container_width=True):
|
| 103 |
try:
|
|
@@ -105,9 +126,11 @@ if st.button("🔮 Predict Salary", type="primary", use_container_width=True):
|
|
| 105 |
input_data = SalaryInput(
|
| 106 |
country=country,
|
| 107 |
years_code=years,
|
|
|
|
| 108 |
education_level=education,
|
| 109 |
dev_type=dev_type,
|
| 110 |
industry=industry,
|
|
|
|
| 111 |
)
|
| 112 |
|
| 113 |
# Make prediction
|
|
|
|
| 27 |
Developer Survey data to predict annual salaries based on:
|
| 28 |
- Country
|
| 29 |
- Total years of coding experience (including education)
|
| 30 |
+
- Years of professional work experience
|
| 31 |
- Education level
|
| 32 |
- Developer type
|
| 33 |
- Industry
|
| 34 |
+
- Age
|
| 35 |
"""
|
| 36 |
)
|
| 37 |
st.info("💡 Tip: Results are estimates based on survey averages.")
|
|
|
|
| 42 |
st.write(f"**Education Levels:** {len(valid_categories['EdLevel'])} available")
|
| 43 |
st.write(f"**Developer Types:** {len(valid_categories['DevType'])} available")
|
| 44 |
st.write(f"**Industries:** {len(valid_categories['Industry'])} available")
|
| 45 |
+
st.write(f"**Age Ranges:** {len(valid_categories['Age'])} available")
|
| 46 |
st.caption("Only values from the training data are shown in the dropdowns.")
|
| 47 |
|
| 48 |
# Main input form
|
|
|
|
| 55 |
valid_education_levels = valid_categories["EdLevel"]
|
| 56 |
valid_dev_types = valid_categories["DevType"]
|
| 57 |
valid_industries = valid_categories["Industry"]
|
| 58 |
+
valid_ages = valid_categories["Age"]
|
| 59 |
|
| 60 |
# Set default values (if available)
|
| 61 |
default_country = "United States of America" if "United States of America" in valid_countries else valid_countries[0]
|
| 62 |
default_education = "Bachelor's degree (B.A., B.S., B.Eng., etc.)" if "Bachelor's degree (B.A., B.S., B.Eng., etc.)" in valid_education_levels else valid_education_levels[0]
|
| 63 |
default_dev_type = "Developer, back-end" if "Developer, back-end" in valid_dev_types else valid_dev_types[0]
|
| 64 |
default_industry = "Software Development" if "Software Development" in valid_industries else valid_industries[0]
|
| 65 |
+
default_age = "25-34 years old" if "25-34 years old" in valid_ages else valid_ages[0]
|
| 66 |
|
| 67 |
with col1:
|
| 68 |
country = st.selectbox(
|
|
|
|
| 81 |
help="Including any education, how many years have you been coding in total?",
|
| 82 |
)
|
| 83 |
|
| 84 |
+
work_exp = st.number_input(
|
| 85 |
+
"Years of Professional Work Experience",
|
| 86 |
+
min_value=0,
|
| 87 |
+
max_value=50,
|
| 88 |
+
value=5,
|
| 89 |
+
step=1,
|
| 90 |
+
help="How many years of professional work experience do you have?",
|
| 91 |
+
)
|
| 92 |
+
|
| 93 |
with col2:
|
| 94 |
education = st.selectbox(
|
| 95 |
"Education Level",
|
|
|
|
| 112 |
help="Industry the developer works in (only industries from training data)",
|
| 113 |
)
|
| 114 |
|
| 115 |
+
age = st.selectbox(
|
| 116 |
+
"Age",
|
| 117 |
+
options=valid_ages,
|
| 118 |
+
index=valid_ages.index(default_age),
|
| 119 |
+
help="Developer's age range",
|
| 120 |
+
)
|
| 121 |
+
|
| 122 |
# Prediction button
|
| 123 |
if st.button("🔮 Predict Salary", type="primary", use_container_width=True):
|
| 124 |
try:
|
|
|
|
| 126 |
input_data = SalaryInput(
|
| 127 |
country=country,
|
| 128 |
years_code=years,
|
| 129 |
+
work_exp=work_exp,
|
| 130 |
education_level=education,
|
| 131 |
dev_type=dev_type,
|
| 132 |
industry=industry,
|
| 133 |
+
age=age,
|
| 134 |
)
|
| 135 |
|
| 136 |
# Make prediction
|
config/valid_categories.yaml
CHANGED
|
@@ -69,3 +69,11 @@ Industry:
|
|
| 69 |
- Retail and Consumer Services
|
| 70 |
- Software Development
|
| 71 |
- Transportation, or Supply Chain
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
- Retail and Consumer Services
|
| 70 |
- Software Development
|
| 71 |
- Transportation, or Supply Chain
|
| 72 |
+
Age:
|
| 73 |
+
- 18-24 years old
|
| 74 |
+
- 25-34 years old
|
| 75 |
+
- 35-44 years old
|
| 76 |
+
- 45-54 years old
|
| 77 |
+
- 55-64 years old
|
| 78 |
+
- 65 years or older
|
| 79 |
+
- Other
|
example_inference.py
CHANGED
|
@@ -18,16 +18,20 @@ def main():
|
|
| 18 |
input_data_1 = SalaryInput(
|
| 19 |
country="United States of America",
|
| 20 |
years_code=5.0,
|
|
|
|
| 21 |
education_level="Bachelor's degree (B.A., B.S., B.Eng., etc.)",
|
| 22 |
dev_type="Developer, full-stack",
|
| 23 |
industry="Software Development",
|
|
|
|
| 24 |
)
|
| 25 |
|
| 26 |
print(f"Country: {input_data_1.country}")
|
| 27 |
print(f"Years of Coding (Total): {input_data_1.years_code}")
|
|
|
|
| 28 |
print(f"Education Level: {input_data_1.education_level}")
|
| 29 |
print(f"Developer Type: {input_data_1.dev_type}")
|
| 30 |
print(f"Industry: {input_data_1.industry}")
|
|
|
|
| 31 |
|
| 32 |
salary_1 = predict_salary(input_data_1)
|
| 33 |
print(f"💰 Predicted Salary: ${salary_1:,.2f} USD/year")
|
|
@@ -39,16 +43,20 @@ def main():
|
|
| 39 |
input_data_2 = SalaryInput(
|
| 40 |
country="United States of America",
|
| 41 |
years_code=2.0,
|
|
|
|
| 42 |
education_level="Master's degree (M.A., M.S., M.Eng., MBA, etc.)",
|
| 43 |
dev_type="Developer, front-end",
|
| 44 |
industry="Fintech",
|
|
|
|
| 45 |
)
|
| 46 |
|
| 47 |
print(f"Country: {input_data_2.country}")
|
| 48 |
print(f"Years of Coding (Total): {input_data_2.years_code}")
|
|
|
|
| 49 |
print(f"Education Level: {input_data_2.education_level}")
|
| 50 |
print(f"Developer Type: {input_data_2.dev_type}")
|
| 51 |
print(f"Industry: {input_data_2.industry}")
|
|
|
|
| 52 |
|
| 53 |
salary_2 = predict_salary(input_data_2)
|
| 54 |
print(f"💰 Predicted Salary: ${salary_2:,.2f} USD/year")
|
|
@@ -60,16 +68,20 @@ def main():
|
|
| 60 |
input_data_3 = SalaryInput(
|
| 61 |
country="United States of America",
|
| 62 |
years_code=10.0,
|
|
|
|
| 63 |
education_level="Master's degree (M.A., M.S., M.Eng., MBA, etc.)",
|
| 64 |
dev_type="Engineering manager",
|
| 65 |
industry="Banking/Financial Services",
|
|
|
|
| 66 |
)
|
| 67 |
|
| 68 |
print(f"Country: {input_data_3.country}")
|
| 69 |
print(f"Years of Coding (Total): {input_data_3.years_code}")
|
|
|
|
| 70 |
print(f"Education Level: {input_data_3.education_level}")
|
| 71 |
print(f"Developer Type: {input_data_3.dev_type}")
|
| 72 |
print(f"Industry: {input_data_3.industry}")
|
|
|
|
| 73 |
|
| 74 |
salary_3 = predict_salary(input_data_3)
|
| 75 |
print(f"💰 Predicted Salary: ${salary_3:,.2f} USD/year")
|
|
@@ -81,16 +93,20 @@ def main():
|
|
| 81 |
input_data_4 = SalaryInput(
|
| 82 |
country="Germany",
|
| 83 |
years_code=5.0,
|
|
|
|
| 84 |
education_level="Bachelor's degree (B.A., B.S., B.Eng., etc.)",
|
| 85 |
dev_type="Developer, back-end",
|
| 86 |
industry="Manufacturing",
|
|
|
|
| 87 |
)
|
| 88 |
|
| 89 |
print(f"Country: {input_data_4.country}")
|
| 90 |
print(f"Years of Coding (Total): {input_data_4.years_code}")
|
|
|
|
| 91 |
print(f"Education Level: {input_data_4.education_level}")
|
| 92 |
print(f"Developer Type: {input_data_4.dev_type}")
|
| 93 |
print(f"Industry: {input_data_4.industry}")
|
|
|
|
| 94 |
|
| 95 |
salary_4 = predict_salary(input_data_4)
|
| 96 |
print(f"💰 Predicted Salary: ${salary_4:,.2f} USD/year")
|
|
|
|
| 18 |
input_data_1 = SalaryInput(
|
| 19 |
country="United States of America",
|
| 20 |
years_code=5.0,
|
| 21 |
+
work_exp=3.0,
|
| 22 |
education_level="Bachelor's degree (B.A., B.S., B.Eng., etc.)",
|
| 23 |
dev_type="Developer, full-stack",
|
| 24 |
industry="Software Development",
|
| 25 |
+
age="25-34 years old",
|
| 26 |
)
|
| 27 |
|
| 28 |
print(f"Country: {input_data_1.country}")
|
| 29 |
print(f"Years of Coding (Total): {input_data_1.years_code}")
|
| 30 |
+
print(f"Work Experience: {input_data_1.work_exp}")
|
| 31 |
print(f"Education Level: {input_data_1.education_level}")
|
| 32 |
print(f"Developer Type: {input_data_1.dev_type}")
|
| 33 |
print(f"Industry: {input_data_1.industry}")
|
| 34 |
+
print(f"Age: {input_data_1.age}")
|
| 35 |
|
| 36 |
salary_1 = predict_salary(input_data_1)
|
| 37 |
print(f"💰 Predicted Salary: ${salary_1:,.2f} USD/year")
|
|
|
|
| 43 |
input_data_2 = SalaryInput(
|
| 44 |
country="United States of America",
|
| 45 |
years_code=2.0,
|
| 46 |
+
work_exp=1.0,
|
| 47 |
education_level="Master's degree (M.A., M.S., M.Eng., MBA, etc.)",
|
| 48 |
dev_type="Developer, front-end",
|
| 49 |
industry="Fintech",
|
| 50 |
+
age="18-24 years old",
|
| 51 |
)
|
| 52 |
|
| 53 |
print(f"Country: {input_data_2.country}")
|
| 54 |
print(f"Years of Coding (Total): {input_data_2.years_code}")
|
| 55 |
+
print(f"Work Experience: {input_data_2.work_exp}")
|
| 56 |
print(f"Education Level: {input_data_2.education_level}")
|
| 57 |
print(f"Developer Type: {input_data_2.dev_type}")
|
| 58 |
print(f"Industry: {input_data_2.industry}")
|
| 59 |
+
print(f"Age: {input_data_2.age}")
|
| 60 |
|
| 61 |
salary_2 = predict_salary(input_data_2)
|
| 62 |
print(f"💰 Predicted Salary: ${salary_2:,.2f} USD/year")
|
|
|
|
| 68 |
input_data_3 = SalaryInput(
|
| 69 |
country="United States of America",
|
| 70 |
years_code=10.0,
|
| 71 |
+
work_exp=8.0,
|
| 72 |
education_level="Master's degree (M.A., M.S., M.Eng., MBA, etc.)",
|
| 73 |
dev_type="Engineering manager",
|
| 74 |
industry="Banking/Financial Services",
|
| 75 |
+
age="35-44 years old",
|
| 76 |
)
|
| 77 |
|
| 78 |
print(f"Country: {input_data_3.country}")
|
| 79 |
print(f"Years of Coding (Total): {input_data_3.years_code}")
|
| 80 |
+
print(f"Work Experience: {input_data_3.work_exp}")
|
| 81 |
print(f"Education Level: {input_data_3.education_level}")
|
| 82 |
print(f"Developer Type: {input_data_3.dev_type}")
|
| 83 |
print(f"Industry: {input_data_3.industry}")
|
| 84 |
+
print(f"Age: {input_data_3.age}")
|
| 85 |
|
| 86 |
salary_3 = predict_salary(input_data_3)
|
| 87 |
print(f"💰 Predicted Salary: ${salary_3:,.2f} USD/year")
|
|
|
|
| 93 |
input_data_4 = SalaryInput(
|
| 94 |
country="Germany",
|
| 95 |
years_code=5.0,
|
| 96 |
+
work_exp=3.0,
|
| 97 |
education_level="Bachelor's degree (B.A., B.S., B.Eng., etc.)",
|
| 98 |
dev_type="Developer, back-end",
|
| 99 |
industry="Manufacturing",
|
| 100 |
+
age="25-34 years old",
|
| 101 |
)
|
| 102 |
|
| 103 |
print(f"Country: {input_data_4.country}")
|
| 104 |
print(f"Years of Coding (Total): {input_data_4.years_code}")
|
| 105 |
+
print(f"Work Experience: {input_data_4.work_exp}")
|
| 106 |
print(f"Education Level: {input_data_4.education_level}")
|
| 107 |
print(f"Developer Type: {input_data_4.dev_type}")
|
| 108 |
print(f"Industry: {input_data_4.industry}")
|
| 109 |
+
print(f"Age: {input_data_4.age}")
|
| 110 |
|
| 111 |
salary_4 = predict_salary(input_data_4)
|
| 112 |
print(f"💰 Predicted Salary: ${salary_4:,.2f} USD/year")
|
models/model.pkl
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:efa053c467c2c1ea33a65f04aedc32fb7a6c47658238f26d88cd0a10986c0c98
|
| 3 |
+
size 3985657
|
src/infer.py
CHANGED
|
@@ -99,14 +99,23 @@ def predict_salary(data: SalaryInput) -> float:
|
|
| 99 |
f"Check config/valid_categories.yaml for all valid values."
|
| 100 |
)
|
| 101 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
# Create a DataFrame with the input data
|
| 103 |
input_df = pd.DataFrame(
|
| 104 |
{
|
| 105 |
"Country": [data.country],
|
| 106 |
"YearsCode": [data.years_code],
|
|
|
|
| 107 |
"EdLevel": [data.education_level],
|
| 108 |
"DevType": [data.dev_type],
|
| 109 |
"Industry": [data.industry],
|
|
|
|
| 110 |
}
|
| 111 |
)
|
| 112 |
|
|
|
|
| 99 |
f"Check config/valid_categories.yaml for all valid values."
|
| 100 |
)
|
| 101 |
|
| 102 |
+
if data.age not in valid_categories["Age"]:
|
| 103 |
+
raise ValueError(
|
| 104 |
+
f"Invalid age: '{data.age}'. "
|
| 105 |
+
f"Must be one of {len(valid_categories['Age'])} valid age ranges. "
|
| 106 |
+
f"Check config/valid_categories.yaml for all valid values."
|
| 107 |
+
)
|
| 108 |
+
|
| 109 |
# Create a DataFrame with the input data
|
| 110 |
input_df = pd.DataFrame(
|
| 111 |
{
|
| 112 |
"Country": [data.country],
|
| 113 |
"YearsCode": [data.years_code],
|
| 114 |
+
"WorkExp": [data.work_exp],
|
| 115 |
"EdLevel": [data.education_level],
|
| 116 |
"DevType": [data.dev_type],
|
| 117 |
"Industry": [data.industry],
|
| 118 |
+
"Age": [data.age],
|
| 119 |
}
|
| 120 |
)
|
| 121 |
|
src/preprocessing.py
CHANGED
|
@@ -55,7 +55,7 @@ def prepare_features(df: pd.DataFrame) -> pd.DataFrame:
|
|
| 55 |
during training and inference, preventing data leakage and inconsistencies.
|
| 56 |
|
| 57 |
Args:
|
| 58 |
-
df: DataFrame with columns: Country, YearsCode, EdLevel, DevType, Industry
|
| 59 |
NOTE: During training, cardinality reduction should be applied to df
|
| 60 |
BEFORE calling this function. During inference, valid_categories.yaml
|
| 61 |
ensures only valid (already-reduced) categories are used.
|
|
@@ -67,7 +67,7 @@ def prepare_features(df: pd.DataFrame) -> pd.DataFrame:
|
|
| 67 |
- Fills missing values with defaults (0 for numeric, "Unknown" for categorical)
|
| 68 |
- Normalizes Unicode apostrophes to regular apostrophes
|
| 69 |
- Applies one-hot encoding with drop_first=True to avoid multicollinearity
|
| 70 |
-
- Column names in output will be like: YearsCode, Country_X, EdLevel_Y, DevType_Z, Industry_W
|
| 71 |
- Does NOT apply cardinality reduction (must be done before calling this)
|
| 72 |
"""
|
| 73 |
# Create a copy to avoid modifying the original
|
|
@@ -75,7 +75,7 @@ def prepare_features(df: pd.DataFrame) -> pd.DataFrame:
|
|
| 75 |
|
| 76 |
# Normalize Unicode apostrophes to regular apostrophes for consistency
|
| 77 |
# This handles cases where data has \u2019 (') instead of '
|
| 78 |
-
for col in ["Country", "EdLevel", "DevType", "Industry"]:
|
| 79 |
if col in df_processed.columns:
|
| 80 |
df_processed[col] = df_processed[col].str.replace('\u2019', "'", regex=False)
|
| 81 |
|
|
@@ -85,17 +85,19 @@ def prepare_features(df: pd.DataFrame) -> pd.DataFrame:
|
|
| 85 |
|
| 86 |
# Fill missing values with defaults
|
| 87 |
df_processed["YearsCode"] = df_processed["YearsCode"].fillna(0)
|
|
|
|
| 88 |
df_processed["Country"] = df_processed["Country"].fillna("Unknown")
|
| 89 |
df_processed["EdLevel"] = df_processed["EdLevel"].fillna("Unknown")
|
| 90 |
df_processed["DevType"] = df_processed["DevType"].fillna("Unknown")
|
| 91 |
df_processed["Industry"] = df_processed["Industry"].fillna("Unknown")
|
|
|
|
| 92 |
|
| 93 |
# NOTE: Cardinality reduction is NOT applied here
|
| 94 |
# It should be applied during training BEFORE calling this function
|
| 95 |
# During inference, valid_categories.yaml ensures only valid values are used
|
| 96 |
|
| 97 |
# Select only the features we need
|
| 98 |
-
feature_cols = ["Country", "YearsCode", "EdLevel", "DevType", "Industry"]
|
| 99 |
df_features = df_processed[feature_cols]
|
| 100 |
|
| 101 |
# Apply one-hot encoding for categorical variables
|
|
|
|
| 55 |
during training and inference, preventing data leakage and inconsistencies.
|
| 56 |
|
| 57 |
Args:
|
| 58 |
+
df: DataFrame with columns: Country, YearsCode, WorkExp, EdLevel, DevType, Industry, Age
|
| 59 |
NOTE: During training, cardinality reduction should be applied to df
|
| 60 |
BEFORE calling this function. During inference, valid_categories.yaml
|
| 61 |
ensures only valid (already-reduced) categories are used.
|
|
|
|
| 67 |
- Fills missing values with defaults (0 for numeric, "Unknown" for categorical)
|
| 68 |
- Normalizes Unicode apostrophes to regular apostrophes
|
| 69 |
- Applies one-hot encoding with drop_first=True to avoid multicollinearity
|
| 70 |
+
- Column names in output will be like: YearsCode, WorkExp, Country_X, EdLevel_Y, DevType_Z, Industry_W, Age_V
|
| 71 |
- Does NOT apply cardinality reduction (must be done before calling this)
|
| 72 |
"""
|
| 73 |
# Create a copy to avoid modifying the original
|
|
|
|
| 75 |
|
| 76 |
# Normalize Unicode apostrophes to regular apostrophes for consistency
|
| 77 |
# This handles cases where data has \u2019 (') instead of '
|
| 78 |
+
for col in ["Country", "EdLevel", "DevType", "Industry", "Age"]:
|
| 79 |
if col in df_processed.columns:
|
| 80 |
df_processed[col] = df_processed[col].str.replace('\u2019', "'", regex=False)
|
| 81 |
|
|
|
|
| 85 |
|
| 86 |
# Fill missing values with defaults
|
| 87 |
df_processed["YearsCode"] = df_processed["YearsCode"].fillna(0)
|
| 88 |
+
df_processed["WorkExp"] = df_processed["WorkExp"].fillna(0)
|
| 89 |
df_processed["Country"] = df_processed["Country"].fillna("Unknown")
|
| 90 |
df_processed["EdLevel"] = df_processed["EdLevel"].fillna("Unknown")
|
| 91 |
df_processed["DevType"] = df_processed["DevType"].fillna("Unknown")
|
| 92 |
df_processed["Industry"] = df_processed["Industry"].fillna("Unknown")
|
| 93 |
+
df_processed["Age"] = df_processed["Age"].fillna("Unknown")
|
| 94 |
|
| 95 |
# NOTE: Cardinality reduction is NOT applied here
|
| 96 |
# It should be applied during training BEFORE calling this function
|
| 97 |
# During inference, valid_categories.yaml ensures only valid values are used
|
| 98 |
|
| 99 |
# Select only the features we need
|
| 100 |
+
feature_cols = ["Country", "YearsCode", "WorkExp", "EdLevel", "DevType", "Industry", "Age"]
|
| 101 |
df_features = df_processed[feature_cols]
|
| 102 |
|
| 103 |
# Apply one-hot encoding for categorical variables
|
src/schema.py
CHANGED
|
@@ -12,9 +12,15 @@ class SalaryInput(BaseModel):
|
|
| 12 |
ge=0,
|
| 13 |
description="Including any education, how many years have you been coding in total?",
|
| 14 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
education_level: str = Field(..., description="Education level")
|
| 16 |
dev_type: str = Field(..., description="Developer type")
|
| 17 |
industry: str = Field(..., description="Industry the developer works in")
|
|
|
|
| 18 |
|
| 19 |
class Config:
|
| 20 |
"""Pydantic configuration."""
|
|
@@ -23,8 +29,10 @@ class SalaryInput(BaseModel):
|
|
| 23 |
"example": {
|
| 24 |
"country": "United States",
|
| 25 |
"years_code": 5.0,
|
|
|
|
| 26 |
"education_level": "Bachelor's degree",
|
| 27 |
"dev_type": "Developer, back-end",
|
| 28 |
"industry": "Software Development",
|
|
|
|
| 29 |
}
|
| 30 |
}
|
|
|
|
| 12 |
ge=0,
|
| 13 |
description="Including any education, how many years have you been coding in total?",
|
| 14 |
)
|
| 15 |
+
work_exp: float = Field(
|
| 16 |
+
...,
|
| 17 |
+
ge=0,
|
| 18 |
+
description="How many years of professional work experience do you have?",
|
| 19 |
+
)
|
| 20 |
education_level: str = Field(..., description="Education level")
|
| 21 |
dev_type: str = Field(..., description="Developer type")
|
| 22 |
industry: str = Field(..., description="Industry the developer works in")
|
| 23 |
+
age: str = Field(..., description="Developer's age range")
|
| 24 |
|
| 25 |
class Config:
|
| 26 |
"""Pydantic configuration."""
|
|
|
|
| 29 |
"example": {
|
| 30 |
"country": "United States",
|
| 31 |
"years_code": 5.0,
|
| 32 |
+
"work_exp": 3.0,
|
| 33 |
"education_level": "Bachelor's degree",
|
| 34 |
"dev_type": "Developer, back-end",
|
| 35 |
"industry": "Software Development",
|
| 36 |
+
"age": "25-34 years old",
|
| 37 |
}
|
| 38 |
}
|
src/train.py
CHANGED
|
@@ -32,7 +32,7 @@ def main():
|
|
| 32 |
# Load only required columns to save memory
|
| 33 |
df = pd.read_csv(
|
| 34 |
data_path,
|
| 35 |
-
usecols=["Country", "YearsCode", "EdLevel", "DevType", "Industry",
|
| 36 |
"Currency", "CompTotal", "ConvertedCompYearly"],
|
| 37 |
)
|
| 38 |
|
|
@@ -67,12 +67,14 @@ def main():
|
|
| 67 |
df_copy["EdLevel"] = df_copy["EdLevel"].str.replace('\u2019', "'", regex=False)
|
| 68 |
df_copy["DevType"] = df_copy["DevType"].str.replace('\u2019', "'", regex=False)
|
| 69 |
df_copy["Industry"] = df_copy["Industry"].str.replace('\u2019', "'", regex=False)
|
|
|
|
| 70 |
|
| 71 |
# Apply cardinality reduction
|
| 72 |
df_copy["Country"] = reduce_cardinality(df_copy["Country"])
|
| 73 |
df_copy["EdLevel"] = reduce_cardinality(df_copy["EdLevel"])
|
| 74 |
df_copy["DevType"] = reduce_cardinality(df_copy["DevType"])
|
| 75 |
df_copy["Industry"] = reduce_cardinality(df_copy["Industry"])
|
|
|
|
| 76 |
|
| 77 |
# Apply cardinality reduction to the actual training data as well
|
| 78 |
# (prepare_features no longer does this internally)
|
|
@@ -80,6 +82,7 @@ def main():
|
|
| 80 |
df["EdLevel"] = reduce_cardinality(df["EdLevel"])
|
| 81 |
df["DevType"] = reduce_cardinality(df["DevType"])
|
| 82 |
df["Industry"] = reduce_cardinality(df["Industry"])
|
|
|
|
| 83 |
|
| 84 |
# Now apply full feature transformations for model training
|
| 85 |
X = prepare_features(df)
|
|
@@ -91,19 +94,21 @@ def main():
|
|
| 91 |
edlevel_values = df_copy["EdLevel"].dropna().unique().tolist()
|
| 92 |
devtype_values = df_copy["DevType"].dropna().unique().tolist()
|
| 93 |
industry_values = df_copy["Industry"].dropna().unique().tolist()
|
|
|
|
| 94 |
|
| 95 |
valid_categories = {
|
| 96 |
"Country": sorted(country_values),
|
| 97 |
"EdLevel": sorted(edlevel_values),
|
| 98 |
"DevType": sorted(devtype_values),
|
| 99 |
"Industry": sorted(industry_values),
|
|
|
|
| 100 |
}
|
| 101 |
|
| 102 |
valid_categories_path = Path("config/valid_categories.yaml")
|
| 103 |
with open(valid_categories_path, "w") as f:
|
| 104 |
yaml.dump(valid_categories, f, default_flow_style=False, sort_keys=False)
|
| 105 |
|
| 106 |
-
print(f"\nSaved {len(valid_categories['Country'])} valid countries, {len(valid_categories['EdLevel'])} valid education levels, {len(valid_categories['DevType'])} valid developer types, and {len(valid_categories['Industry'])} valid industries to {valid_categories_path}")
|
| 107 |
|
| 108 |
# Compute currency conversion rates per country
|
| 109 |
# Use the original data with Currency and CompTotal columns
|
|
@@ -181,6 +186,12 @@ def main():
|
|
| 181 |
for industry, count in top_industry.items():
|
| 182 |
print(f" - {industry}: {count:,} ({count/len(df)*100:.1f}%)")
|
| 183 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 184 |
# Show YearsCode statistics
|
| 185 |
print("\n💼 Years of Coding Experience:")
|
| 186 |
print(f" - Min: {df['YearsCode'].min():.1f}")
|
|
@@ -190,6 +201,15 @@ def main():
|
|
| 190 |
print(f" - 25th percentile: {df['YearsCode'].quantile(0.25):.1f}")
|
| 191 |
print(f" - 75th percentile: {df['YearsCode'].quantile(0.75):.1f}")
|
| 192 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 193 |
# Show most common one-hot encoded features (by frequency)
|
| 194 |
# Separate analysis for each categorical feature
|
| 195 |
|
|
@@ -231,12 +251,21 @@ def main():
|
|
| 231 |
industry_name = feature.replace('Industry_', '')
|
| 232 |
print(f" {i:2d}. {industry_name:45s} - {count:6.0f} occurrences ({percentage:5.1f}%)")
|
| 233 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 234 |
print(f"\n📊 Total one-hot encoded features: {len(X.columns)}")
|
| 235 |
-
print(" - Numeric: 1 (YearsCode)")
|
| 236 |
print(f" - Country: {len(country_features)}")
|
| 237 |
print(f" - Education: {len(edlevel_features)}")
|
| 238 |
print(f" - DevType: {len(devtype_features)}")
|
| 239 |
print(f" - Industry: {len(industry_features)}")
|
|
|
|
| 240 |
|
| 241 |
print("=" * 60 + "\n")
|
| 242 |
|
|
|
|
| 32 |
# Load only required columns to save memory
|
| 33 |
df = pd.read_csv(
|
| 34 |
data_path,
|
| 35 |
+
usecols=["Country", "YearsCode", "WorkExp", "EdLevel", "DevType", "Industry", "Age",
|
| 36 |
"Currency", "CompTotal", "ConvertedCompYearly"],
|
| 37 |
)
|
| 38 |
|
|
|
|
| 67 |
df_copy["EdLevel"] = df_copy["EdLevel"].str.replace('\u2019', "'", regex=False)
|
| 68 |
df_copy["DevType"] = df_copy["DevType"].str.replace('\u2019', "'", regex=False)
|
| 69 |
df_copy["Industry"] = df_copy["Industry"].str.replace('\u2019', "'", regex=False)
|
| 70 |
+
df_copy["Age"] = df_copy["Age"].str.replace('\u2019', "'", regex=False)
|
| 71 |
|
| 72 |
# Apply cardinality reduction
|
| 73 |
df_copy["Country"] = reduce_cardinality(df_copy["Country"])
|
| 74 |
df_copy["EdLevel"] = reduce_cardinality(df_copy["EdLevel"])
|
| 75 |
df_copy["DevType"] = reduce_cardinality(df_copy["DevType"])
|
| 76 |
df_copy["Industry"] = reduce_cardinality(df_copy["Industry"])
|
| 77 |
+
df_copy["Age"] = reduce_cardinality(df_copy["Age"])
|
| 78 |
|
| 79 |
# Apply cardinality reduction to the actual training data as well
|
| 80 |
# (prepare_features no longer does this internally)
|
|
|
|
| 82 |
df["EdLevel"] = reduce_cardinality(df["EdLevel"])
|
| 83 |
df["DevType"] = reduce_cardinality(df["DevType"])
|
| 84 |
df["Industry"] = reduce_cardinality(df["Industry"])
|
| 85 |
+
df["Age"] = reduce_cardinality(df["Age"])
|
| 86 |
|
| 87 |
# Now apply full feature transformations for model training
|
| 88 |
X = prepare_features(df)
|
|
|
|
| 94 |
edlevel_values = df_copy["EdLevel"].dropna().unique().tolist()
|
| 95 |
devtype_values = df_copy["DevType"].dropna().unique().tolist()
|
| 96 |
industry_values = df_copy["Industry"].dropna().unique().tolist()
|
| 97 |
+
age_values = df_copy["Age"].dropna().unique().tolist()
|
| 98 |
|
| 99 |
valid_categories = {
|
| 100 |
"Country": sorted(country_values),
|
| 101 |
"EdLevel": sorted(edlevel_values),
|
| 102 |
"DevType": sorted(devtype_values),
|
| 103 |
"Industry": sorted(industry_values),
|
| 104 |
+
"Age": sorted(age_values),
|
| 105 |
}
|
| 106 |
|
| 107 |
valid_categories_path = Path("config/valid_categories.yaml")
|
| 108 |
with open(valid_categories_path, "w") as f:
|
| 109 |
yaml.dump(valid_categories, f, default_flow_style=False, sort_keys=False)
|
| 110 |
|
| 111 |
+
print(f"\nSaved {len(valid_categories['Country'])} valid countries, {len(valid_categories['EdLevel'])} valid education levels, {len(valid_categories['DevType'])} valid developer types, {len(valid_categories['Industry'])} valid industries, and {len(valid_categories['Age'])} valid age ranges to {valid_categories_path}")
|
| 112 |
|
| 113 |
# Compute currency conversion rates per country
|
| 114 |
# Use the original data with Currency and CompTotal columns
|
|
|
|
| 186 |
for industry, count in top_industry.items():
|
| 187 |
print(f" - {industry}: {count:,} ({count/len(df)*100:.1f}%)")
|
| 188 |
|
| 189 |
+
# Show age distribution
|
| 190 |
+
print("\n🎂 Age Distribution:")
|
| 191 |
+
top_age = df["Age"].value_counts().head(10)
|
| 192 |
+
for age, count in top_age.items():
|
| 193 |
+
print(f" - {age}: {count:,} ({count/len(df)*100:.1f}%)")
|
| 194 |
+
|
| 195 |
# Show YearsCode statistics
|
| 196 |
print("\n💼 Years of Coding Experience:")
|
| 197 |
print(f" - Min: {df['YearsCode'].min():.1f}")
|
|
|
|
| 201 |
print(f" - 25th percentile: {df['YearsCode'].quantile(0.25):.1f}")
|
| 202 |
print(f" - 75th percentile: {df['YearsCode'].quantile(0.75):.1f}")
|
| 203 |
|
| 204 |
+
# Show WorkExp statistics
|
| 205 |
+
print("\n💼 Years of Professional Work Experience:")
|
| 206 |
+
print(f" - Min: {df['WorkExp'].min():.1f}")
|
| 207 |
+
print(f" - Max: {df['WorkExp'].max():.1f}")
|
| 208 |
+
print(f" - Mean: {df['WorkExp'].mean():.1f}")
|
| 209 |
+
print(f" - Median: {df['WorkExp'].median():.1f}")
|
| 210 |
+
print(f" - 25th percentile: {df['WorkExp'].quantile(0.25):.1f}")
|
| 211 |
+
print(f" - 75th percentile: {df['WorkExp'].quantile(0.75):.1f}")
|
| 212 |
+
|
| 213 |
# Show most common one-hot encoded features (by frequency)
|
| 214 |
# Separate analysis for each categorical feature
|
| 215 |
|
|
|
|
| 251 |
industry_name = feature.replace('Industry_', '')
|
| 252 |
print(f" {i:2d}. {industry_name:45s} - {count:6.0f} occurrences ({percentage:5.1f}%)")
|
| 253 |
|
| 254 |
+
# Age features
|
| 255 |
+
print("\n🎂 Top 10 Age Features (most common):")
|
| 256 |
+
age_features = categorical_features[categorical_features.index.str.startswith('Age_')]
|
| 257 |
+
for i, (feature, count) in enumerate(age_features.head(10).items(), 1):
|
| 258 |
+
percentage = (count / len(X)) * 100
|
| 259 |
+
age_name = feature.replace('Age_', '')
|
| 260 |
+
print(f" {i:2d}. {age_name:45s} - {count:6.0f} occurrences ({percentage:5.1f}%)")
|
| 261 |
+
|
| 262 |
print(f"\n📊 Total one-hot encoded features: {len(X.columns)}")
|
| 263 |
+
print(" - Numeric: 2 (YearsCode, WorkExp)")
|
| 264 |
print(f" - Country: {len(country_features)}")
|
| 265 |
print(f" - Education: {len(edlevel_features)}")
|
| 266 |
print(f" - DevType: {len(devtype_features)}")
|
| 267 |
print(f" - Industry: {len(industry_features)}")
|
| 268 |
+
print(f" - Age: {len(age_features)}")
|
| 269 |
|
| 270 |
print("=" * 60 + "\n")
|
| 271 |
|
test_feature_impact.py
CHANGED
|
@@ -12,9 +12,11 @@ def test_years_experience_impact():
|
|
| 12 |
|
| 13 |
base_input = {
|
| 14 |
"country": "United States of America",
|
|
|
|
| 15 |
"education_level": "Bachelor's degree (B.A., B.S., B.Eng., etc.)",
|
| 16 |
"dev_type": "Developer, full-stack",
|
| 17 |
"industry": "Software Development",
|
|
|
|
| 18 |
}
|
| 19 |
|
| 20 |
# Test with different years of experience
|
|
@@ -45,9 +47,11 @@ def test_country_impact():
|
|
| 45 |
|
| 46 |
base_input = {
|
| 47 |
"years_code": 5.0,
|
|
|
|
| 48 |
"education_level": "Bachelor's degree (B.A., B.S., B.Eng., etc.)",
|
| 49 |
"dev_type": "Developer, full-stack",
|
| 50 |
"industry": "Software Development",
|
|
|
|
| 51 |
}
|
| 52 |
|
| 53 |
# Test with different countries (select diverse ones)
|
|
@@ -93,8 +97,10 @@ def test_education_impact():
|
|
| 93 |
base_input = {
|
| 94 |
"country": "United States of America",
|
| 95 |
"years_code": 5.0,
|
|
|
|
| 96 |
"dev_type": "Developer, full-stack",
|
| 97 |
"industry": "Software Development",
|
|
|
|
| 98 |
}
|
| 99 |
|
| 100 |
# Test with different education levels
|
|
@@ -141,8 +147,10 @@ def test_devtype_impact():
|
|
| 141 |
base_input = {
|
| 142 |
"country": "United States of America",
|
| 143 |
"years_code": 5.0,
|
|
|
|
| 144 |
"education_level": "Bachelor's degree (B.A., B.S., B.Eng., etc.)",
|
| 145 |
"industry": "Software Development",
|
|
|
|
| 146 |
}
|
| 147 |
|
| 148 |
# Test with different developer types (using actual values from trained model)
|
|
@@ -189,8 +197,10 @@ def test_industry_impact():
|
|
| 189 |
base_input = {
|
| 190 |
"country": "United States of America",
|
| 191 |
"years_code": 5.0,
|
|
|
|
| 192 |
"education_level": "Bachelor's degree (B.A., B.S., B.Eng., etc.)",
|
| 193 |
"dev_type": "Developer, full-stack",
|
|
|
|
| 194 |
}
|
| 195 |
|
| 196 |
# Test with different industries (using actual values from trained model)
|
|
@@ -228,40 +238,132 @@ def test_industry_impact():
|
|
| 228 |
return False
|
| 229 |
|
| 230 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 231 |
def test_combined_features():
|
| 232 |
"""Test that combining different features produces expected variations."""
|
| 233 |
print("\n" + "=" * 70)
|
| 234 |
-
print("TEST
|
| 235 |
print("=" * 70)
|
| 236 |
|
| 237 |
# Create diverse combinations (using actual values from trained model)
|
| 238 |
test_cases = [
|
| 239 |
-
("India", 2, "Bachelor's degree (B.A., B.S., B.Eng., etc.)", "Developer, back-end", "Software Development"),
|
| 240 |
-
("Germany", 5, "Master's degree (M.A., M.S., M.Eng., MBA, etc.)", "Developer, full-stack", "Manufacturing"),
|
| 241 |
-
("United States of America", 10, "Master's degree (M.A., M.S., M.Eng., MBA, etc.)", "Engineering manager", "Fintech"),
|
| 242 |
-
("Poland", 15, "Bachelor's degree (B.A., B.S., B.Eng., etc.)", "Developer, front-end", "Healthcare"),
|
| 243 |
-
("Brazil", 5, "Some college/university study without earning a degree", "DevOps engineer or professional", "Government"),
|
| 244 |
]
|
| 245 |
|
| 246 |
predictions = []
|
| 247 |
-
for country, years, education, devtype, industry in test_cases:
|
| 248 |
# Skip if not in valid categories
|
| 249 |
if (country not in valid_categories["Country"]
|
| 250 |
or education not in valid_categories["EdLevel"]
|
| 251 |
or devtype not in valid_categories["DevType"]
|
| 252 |
-
or industry not in valid_categories["Industry"]
|
|
|
|
| 253 |
continue
|
| 254 |
|
| 255 |
input_data = SalaryInput(
|
| 256 |
country=country,
|
| 257 |
years_code=years,
|
|
|
|
| 258 |
education_level=education,
|
| 259 |
dev_type=devtype,
|
| 260 |
industry=industry,
|
|
|
|
| 261 |
)
|
| 262 |
salary = predict_salary(input_data)
|
| 263 |
predictions.append(salary)
|
| 264 |
-
print(f" {country[:15]:15s} | {years:2d}y | {education[:25]:25s} | {devtype[:25]:25s} | {industry[:20]:20s} -> ${salary:,.2f}")
|
| 265 |
|
| 266 |
# Check if predictions are different
|
| 267 |
unique_predictions = len(set(predictions))
|
|
@@ -289,13 +391,15 @@ def print_feature_analysis():
|
|
| 289 |
edlevel_features = [f for f in feature_columns if f.startswith('EdLevel_')]
|
| 290 |
devtype_features = [f for f in feature_columns if f.startswith('DevType_')]
|
| 291 |
industry_features = [f for f in feature_columns if f.startswith('Industry_')]
|
| 292 |
-
|
|
|
|
| 293 |
|
| 294 |
print(f" - Numeric features: {len(numeric_features)} -> {numeric_features}")
|
| 295 |
print(f" - Country features: {len(country_features)}")
|
| 296 |
print(f" - Education features: {len(edlevel_features)}")
|
| 297 |
print(f" - DevType features: {len(devtype_features)}")
|
| 298 |
print(f" - Industry features: {len(industry_features)}")
|
|
|
|
| 299 |
|
| 300 |
if len(country_features) > 0:
|
| 301 |
print(f"\nSample country features:")
|
|
@@ -317,6 +421,11 @@ def print_feature_analysis():
|
|
| 317 |
for feat in industry_features[:5]:
|
| 318 |
print(f" - {feat}")
|
| 319 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 320 |
# Check if there are any features at all
|
| 321 |
if len(country_features) == 0:
|
| 322 |
print("\n⚠️ WARNING: No country features found!")
|
|
@@ -326,6 +435,8 @@ def print_feature_analysis():
|
|
| 326 |
print("\n⚠️ WARNING: No developer type features found!")
|
| 327 |
if len(industry_features) == 0:
|
| 328 |
print("\n⚠️ WARNING: No industry features found!")
|
|
|
|
|
|
|
| 329 |
|
| 330 |
|
| 331 |
def main():
|
|
@@ -340,11 +451,13 @@ def main():
|
|
| 340 |
|
| 341 |
# Run all tests
|
| 342 |
results = {
|
| 343 |
-
"Years of
|
| 344 |
"Country": test_country_impact(),
|
| 345 |
"Education Level": test_education_impact(),
|
| 346 |
"Developer Type": test_devtype_impact(),
|
| 347 |
"Industry": test_industry_impact(),
|
|
|
|
|
|
|
| 348 |
"Combined Features": test_combined_features(),
|
| 349 |
}
|
| 350 |
|
|
|
|
| 12 |
|
| 13 |
base_input = {
|
| 14 |
"country": "United States of America",
|
| 15 |
+
"work_exp": 3.0,
|
| 16 |
"education_level": "Bachelor's degree (B.A., B.S., B.Eng., etc.)",
|
| 17 |
"dev_type": "Developer, full-stack",
|
| 18 |
"industry": "Software Development",
|
| 19 |
+
"age": "25-34 years old",
|
| 20 |
}
|
| 21 |
|
| 22 |
# Test with different years of experience
|
|
|
|
| 47 |
|
| 48 |
base_input = {
|
| 49 |
"years_code": 5.0,
|
| 50 |
+
"work_exp": 3.0,
|
| 51 |
"education_level": "Bachelor's degree (B.A., B.S., B.Eng., etc.)",
|
| 52 |
"dev_type": "Developer, full-stack",
|
| 53 |
"industry": "Software Development",
|
| 54 |
+
"age": "25-34 years old",
|
| 55 |
}
|
| 56 |
|
| 57 |
# Test with different countries (select diverse ones)
|
|
|
|
| 97 |
base_input = {
|
| 98 |
"country": "United States of America",
|
| 99 |
"years_code": 5.0,
|
| 100 |
+
"work_exp": 3.0,
|
| 101 |
"dev_type": "Developer, full-stack",
|
| 102 |
"industry": "Software Development",
|
| 103 |
+
"age": "25-34 years old",
|
| 104 |
}
|
| 105 |
|
| 106 |
# Test with different education levels
|
|
|
|
| 147 |
base_input = {
|
| 148 |
"country": "United States of America",
|
| 149 |
"years_code": 5.0,
|
| 150 |
+
"work_exp": 3.0,
|
| 151 |
"education_level": "Bachelor's degree (B.A., B.S., B.Eng., etc.)",
|
| 152 |
"industry": "Software Development",
|
| 153 |
+
"age": "25-34 years old",
|
| 154 |
}
|
| 155 |
|
| 156 |
# Test with different developer types (using actual values from trained model)
|
|
|
|
| 197 |
base_input = {
|
| 198 |
"country": "United States of America",
|
| 199 |
"years_code": 5.0,
|
| 200 |
+
"work_exp": 3.0,
|
| 201 |
"education_level": "Bachelor's degree (B.A., B.S., B.Eng., etc.)",
|
| 202 |
"dev_type": "Developer, full-stack",
|
| 203 |
+
"age": "25-34 years old",
|
| 204 |
}
|
| 205 |
|
| 206 |
# Test with different industries (using actual values from trained model)
|
|
|
|
| 238 |
return False
|
| 239 |
|
| 240 |
|
| 241 |
+
def test_age_impact():
    """Test that changing age changes prediction.

    Every other input field is held constant while the age bracket is
    varied over the candidate brackets that are present in
    ``valid_categories["Age"]``; one prediction is made per bracket.

    Returns:
        bool: True only when at least two brackets were evaluated and every
        prediction is distinct. False when the predictions are all identical,
        when some are duplicated, or when fewer than two brackets were
        available (nothing can be compared in that case).
    """
    print("\n" + "=" * 70)
    print("TEST 6: Age Impact")
    print("=" * 70)

    base_input = {
        "country": "United States of America",
        "years_code": 5.0,
        "work_exp": 3.0,
        "education_level": "Bachelor's degree (B.A., B.S., B.Eng., etc.)",
        "dev_type": "Developer, full-stack",
        "industry": "Software Development",
    }

    # Test with different age ranges (using actual values from trained model)
    test_ages = [
        "18-24 years old",
        "25-34 years old",
        "35-44 years old",
        "45-54 years old",
        "55-64 years old",
    ]

    # Filter to only ages that exist in valid categories
    test_ages = [a for a in test_ages if a in valid_categories["Age"]]

    predictions = []
    for age in test_ages:
        input_data = SalaryInput(**base_input, age=age)
        salary = predict_salary(input_data)
        predictions.append(salary)
        print(f" Age: {age[:50]:50s} -> Salary: ${salary:,.2f}")

    # With zero or one prediction the uniqueness check below is trivially
    # satisfied, which would report a PASS without comparing anything.
    if len(predictions) < 2:
        print(f"\n⚠️ INCONCLUSIVE: Only {len(predictions)} valid age categories available - need at least 2 to compare")
        return False

    # Check if predictions are different
    unique_predictions = len(set(predictions))
    if unique_predictions == len(predictions):
        print(f"\n✅ PASS: All {len(predictions)} predictions are different")
        return True
    elif unique_predictions == 1:
        print(f"\n❌ FAIL: All predictions are IDENTICAL (${predictions[0]:,.2f})")
        print(" This indicates the model is NOT using age as a feature!")
        return False
    else:
        print(f"\n⚠️ PARTIAL: Only {unique_predictions}/{len(predictions)} unique predictions")
        print(" Duplicate salaries found - possible feature issue")
        return False
|
| 288 |
+
|
| 289 |
+
|
| 290 |
+
def test_work_exp_impact():
    """Check that the predicted salary responds to years of work experience.

    All other input fields stay fixed while ``work_exp`` takes each of a
    fixed set of values; one prediction is made and printed per value.

    Returns:
        bool: True when every prediction is distinct, False when they are
        all identical or when only some of them differ.
    """
    rule = "=" * 70
    print("\n" + rule)
    print("TEST 7: Work Experience Impact")
    print(rule)

    # Fields held constant for every prediction in this test.
    fixed_fields = dict(
        country="United States of America",
        years_code=10.0,
        education_level="Bachelor's degree (B.A., B.S., B.Eng., etc.)",
        dev_type="Developer, full-stack",
        industry="Software Development",
        age="25-34 years old",
    )

    salaries = []
    for work_exp in (0, 1, 3, 5, 10, 20):
        salary = predict_salary(SalaryInput(**fixed_fields, work_exp=work_exp))
        salaries.append(salary)
        print(f" Work Exp: {work_exp:2d} -> Salary: ${salary:,.2f}")

    total = len(salaries)
    distinct = len(set(salaries))

    # Success path first; both remaining outcomes count as failures.
    if distinct == total:
        print(f"\n✅ PASS: All {total} predictions are different")
        return True

    if distinct == 1:
        print(f"\n❌ FAIL: All predictions are IDENTICAL (${salaries[0]:,.2f})")
        print(" This indicates the model is NOT using work experience as a feature!")
    else:
        print(f"\n⚠️ PARTIAL: Only {distinct}/{total} unique predictions")
        print(" Duplicate salaries found - possible feature issue")
    return False
|
| 328 |
+
|
| 329 |
+
|
| 330 |
def test_combined_features():
|
| 331 |
"""Test that combining different features produces expected variations."""
|
| 332 |
print("\n" + "=" * 70)
|
| 333 |
+
print("TEST 8: Combined Feature Variations")
|
| 334 |
print("=" * 70)
|
| 335 |
|
| 336 |
# Create diverse combinations (using actual values from trained model)
|
| 337 |
test_cases = [
|
| 338 |
+
("India", 2, 1, "Bachelor's degree (B.A., B.S., B.Eng., etc.)", "Developer, back-end", "Software Development", "18-24 years old"),
|
| 339 |
+
("Germany", 5, 3, "Master's degree (M.A., M.S., M.Eng., MBA, etc.)", "Developer, full-stack", "Manufacturing", "25-34 years old"),
|
| 340 |
+
("United States of America", 10, 8, "Master's degree (M.A., M.S., M.Eng., MBA, etc.)", "Engineering manager", "Fintech", "35-44 years old"),
|
| 341 |
+
("Poland", 15, 12, "Bachelor's degree (B.A., B.S., B.Eng., etc.)", "Developer, front-end", "Healthcare", "45-54 years old"),
|
| 342 |
+
("Brazil", 5, 3, "Some college/university study without earning a degree", "DevOps engineer or professional", "Government", "25-34 years old"),
|
| 343 |
]
|
| 344 |
|
| 345 |
predictions = []
|
| 346 |
+
for country, years, work_exp, education, devtype, industry, age in test_cases:
|
| 347 |
# Skip if not in valid categories
|
| 348 |
if (country not in valid_categories["Country"]
|
| 349 |
or education not in valid_categories["EdLevel"]
|
| 350 |
or devtype not in valid_categories["DevType"]
|
| 351 |
+
or industry not in valid_categories["Industry"]
|
| 352 |
+
or age not in valid_categories["Age"]):
|
| 353 |
continue
|
| 354 |
|
| 355 |
input_data = SalaryInput(
|
| 356 |
country=country,
|
| 357 |
years_code=years,
|
| 358 |
+
work_exp=work_exp,
|
| 359 |
education_level=education,
|
| 360 |
dev_type=devtype,
|
| 361 |
industry=industry,
|
| 362 |
+
age=age,
|
| 363 |
)
|
| 364 |
salary = predict_salary(input_data)
|
| 365 |
predictions.append(salary)
|
| 366 |
+
print(f" {country[:15]:15s} | {years:2d}y | {work_exp:2d}w | {education[:25]:25s} | {devtype[:25]:25s} | {industry[:20]:20s} | {age[:15]:15s} -> ${salary:,.2f}")
|
| 367 |
|
| 368 |
# Check if predictions are different
|
| 369 |
unique_predictions = len(set(predictions))
|
|
|
|
| 391 |
edlevel_features = [f for f in feature_columns if f.startswith('EdLevel_')]
|
| 392 |
devtype_features = [f for f in feature_columns if f.startswith('DevType_')]
|
| 393 |
industry_features = [f for f in feature_columns if f.startswith('Industry_')]
|
| 394 |
+
age_features = [f for f in feature_columns if f.startswith('Age_')]
|
| 395 |
+
numeric_features = [f for f in feature_columns if not f.startswith(('Country_', 'EdLevel_', 'DevType_', 'Industry_', 'Age_'))]
|
| 396 |
|
| 397 |
print(f" - Numeric features: {len(numeric_features)} -> {numeric_features}")
|
| 398 |
print(f" - Country features: {len(country_features)}")
|
| 399 |
print(f" - Education features: {len(edlevel_features)}")
|
| 400 |
print(f" - DevType features: {len(devtype_features)}")
|
| 401 |
print(f" - Industry features: {len(industry_features)}")
|
| 402 |
+
print(f" - Age features: {len(age_features)}")
|
| 403 |
|
| 404 |
if len(country_features) > 0:
|
| 405 |
print(f"\nSample country features:")
|
|
|
|
| 421 |
for feat in industry_features[:5]:
|
| 422 |
print(f" - {feat}")
|
| 423 |
|
| 424 |
+
if len(age_features) > 0:
|
| 425 |
+
print(f"\nSample age features:")
|
| 426 |
+
for feat in age_features[:5]:
|
| 427 |
+
print(f" - {feat}")
|
| 428 |
+
|
| 429 |
# Check if there are any features at all
|
| 430 |
if len(country_features) == 0:
|
| 431 |
print("\n⚠️ WARNING: No country features found!")
|
|
|
|
| 435 |
print("\n⚠️ WARNING: No developer type features found!")
|
| 436 |
if len(industry_features) == 0:
|
| 437 |
print("\n⚠️ WARNING: No industry features found!")
|
| 438 |
+
if len(age_features) == 0:
|
| 439 |
+
print("\n⚠️ WARNING: No age features found!")
|
| 440 |
|
| 441 |
|
| 442 |
def main():
|
|
|
|
| 451 |
|
| 452 |
# Run all tests
|
| 453 |
results = {
|
| 454 |
+
"Years of Coding": test_years_experience_impact(),
|
| 455 |
"Country": test_country_impact(),
|
| 456 |
"Education Level": test_education_impact(),
|
| 457 |
"Developer Type": test_devtype_impact(),
|
| 458 |
"Industry": test_industry_impact(),
|
| 459 |
+
"Age": test_age_impact(),
|
| 460 |
+
"Work Experience": test_work_exp_impact(),
|
| 461 |
"Combined Features": test_combined_features(),
|
| 462 |
}
|
| 463 |
|