import streamlit as st
import pandas as pd
import numpy as np
import pickle
import shap
import matplotlib.pyplot as plt
import seaborn as sns
from streamlit_shap import st_shap
# Page configuration. Must run before any other Streamlit element is emitted.
st.set_page_config(
    page_title="EU Developer Salary Predictor",
    # Repaired from a mis-encoded string; page_icon needs a real emoji.
    page_icon="💰",
    layout="wide",
    initial_sidebar_state="expanded",
)

# Custom CSS for better styling.
# NOTE(review): the stylesheet body is empty in this copy of the file; the
# call is kept so the hook is still in place once the CSS is restored.
st.markdown("""
""", unsafe_allow_html=True)
# Load model and info
@st.cache_resource
def load_model():
    """Unpickle the prediction pipeline and its metadata.

    Reads ``salary_model.pkl`` first and ``model_info.pkl`` second, both
    relative to the current working directory.

    Returns:
        tuple: ``(model, info)`` - the two unpickled objects, in that order.

    Raises:
        Whatever ``open`` or ``pickle.load`` raise (e.g. a missing file).
    """
    # NOTE(review): pickle runs arbitrary code while loading; these artifacts
    # must only ever come from a trusted build.
    artifacts = []
    for artifact_path in ('salary_model.pkl', 'model_info.pkl'):
        with open(artifact_path, 'rb') as handle:
            artifacts.append(pickle.load(handle))
    model, info = artifacts
    return model, info
# Load the artifacts once at start-up; without them nothing else on the page
# can work, so report the problem and halt the script run.
try:
    model_pipeline, model_info = load_model()
except Exception as e:
    st.error(f"❌ Error loading model: {e}")
    st.stop()
# Choices offered by the categorical input widgets.

# Only countries using EUR or commonly reporting in EUR.
COUNTRY_OPTIONS = [
    'Austria', 'Belgium', 'France',
    'Germany', 'Ireland', 'Italy',
    'Netherlands', 'Portugal', 'Spain',
]

# Education levels, listed from primary school up to professional degrees,
# with a catch-all entry last.
ED_LEVEL_OPTIONS = [
    "Primary/elementary school",
    "Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)",
    "Some college/university study without earning a degree",
    "Associate degree (A.A., A.S., etc.)",
    "Bachelor's degree (B.A., B.S., B.Eng., etc.)",
    "Master's degree (M.A., M.S., M.Eng., MBA, etc.)",
    "Professional degree (JD, MD, Ph.D, Ed.D, etc.)",
    "Something else",
]

# Job roles.
DEV_TYPE_OPTIONS = [
    'Developer, back-end',
    'Developer, full-stack',
    'Developer, front-end',
    'Engineering manager',
    'Developer, desktop or enterprise applications',
    'Developer, mobile',
    'DevOps specialist',
    'Data scientist or machine learning specialist',
    'Data or business analyst',
    'System administrator',
    'Developer, QA or test',
    'Product manager',
    'Other',
]

# Organisation head-count bands, smallest first; every label ends in
# " employees", so only the range part is spelled out.
ORG_SIZE_OPTIONS = [
    f'{headcount} employees'
    for headcount in (
        '2 to 9',
        '10 to 19',
        '20 to 99',
        '100 to 499',
        '500 to 999',
        '1,000 to 4,999',
        '5,000 to 9,999',
        '10,000 or more',
    )
]

# Work arrangements.
REMOTE_WORK_OPTIONS = [
    'Hybrid (some remote, some in-person)',
    'Fully remote',
    'In-person',
]
# ============================================================================
# SIDEBAR - INPUT FORM
# ============================================================================
# Collects the nine profile fields and the "predict" trigger. The widget
# return values are plain module-level names read further down the script.
with st.sidebar:
    st.title("🎯 Developer Profile")
    st.markdown("---")

    # Personal Information
    st.subheader("👤 Personal")
    # The widget returns the numeric code (1-5); the label is display-only.
    age_group_labels = {
        1: "18-24 years",
        2: "25-34 years",
        3: "35-44 years",
        4: "45-54 years",
        5: "55+ years",
    }
    age_group = st.selectbox(
        "Age Group",
        options=[1, 2, 3, 4, 5],
        format_func=lambda code: age_group_labels[code],
        key="age",
    )
    years_code_pro = st.slider(
        "Years of Experience",
        min_value=0, max_value=40, value=5,
        key="years",
    )
    country = st.selectbox(
        "Country",
        options=COUNTRY_OPTIONS,
        key="country",
    )
    st.markdown("---")

    # Professional Information
    st.subheader("💼 Professional")
    dev_type = st.selectbox(
        "Developer Type",
        options=DEV_TYPE_OPTIONS,
        key="dev_type",
    )
    ed_level = st.selectbox(
        "Education Level",
        options=ED_LEVEL_OPTIONS,
        key="ed_level",
    )
    org_size = st.selectbox(
        "Organization Size",
        options=ORG_SIZE_OPTIONS,
        key="org_size",
    )
    remote_work = st.selectbox(
        "Work Arrangement",
        options=REMOTE_WORK_OPTIONS,
        key="remote",
    )
    st.markdown("---")

    # Additional Information
    st.subheader("⚙️ Additional")
    so_account = st.checkbox(
        "Stack Overflow Account",
        value=True,
        key="so",
    )
    ai_select = st.checkbox(
        "Uses AI Tools",
        value=True,
        key="ai",
    )
    st.markdown("---")

    # Predict button in sidebar; True only on the rerun caused by the click.
    predict_button = st.button(
        "🔮 Predict Salary",
        type="primary",
        use_container_width=True,
        key="predict_btn",
    )
    st.markdown("---")

    # MODEL DETAILS - kept at the bottom of the sidebar
    st.subheader("📊 Model Details")
    st.markdown("""
- **Data Source**: Stack Overflow 2024 Survey
- **Sample**: 7,000+ European developers
- **Algorithm**: Optimized Random Forest
- **Accuracy**: RMSE ~€18,600
- **Last Updated**: 2025
""")
# ============================================================================
# MAIN CONTENT AREA
# ============================================================================
# Header
# The title text spans several lines, so it needs a triple-quoted literal.
# NOTE(review): both calls pass unsafe_allow_html=True but carry no markup in
# this copy of the file - presumably the HTML wrappers were stripped; confirm.
st.markdown("""
💰 European Developer Salary Predictor
""", unsafe_allow_html=True)
st.markdown('', unsafe_allow_html=True)
# Handle prediction: runs only on the rerun triggered by the sidebar button.
if predict_button:
    # One-row frame whose column names are the feature names fed to the pipeline.
    input_data = pd.DataFrame({
        'age_group': [age_group],
        'years_code_pro': [years_code_pro],
        'remote_work': [remote_work],
        'ed_level': [ed_level],
        'dev_type': [dev_type],
        'org_size': [org_size],
        'country': [country],
        'so_account': [so_account],
        'ai_select': [ai_select],
    })

    # Make prediction (single row in, single value out).
    prediction = model_pipeline.predict(input_data)[0]

    # Store in session state so the results survive later reruns
    # (e.g. interacting with widgets in the result tabs).
    st.session_state['current_input'] = input_data
    st.session_state['current_prediction'] = prediction
    st.session_state['has_prediction'] = True

    # A what-if comparison saved for a previous profile would otherwise be
    # shown against this new baseline; require a fresh "compare" click.
    st.session_state['comparison_active'] = False
# Show results if a prediction exists: headline figure, per-period breakdown,
# the three analysis tabs, and the SHAP explanation at the top of tab 1.
if st.session_state.get('has_prediction', False):
    prediction = st.session_state['current_prediction']

    # Display main prediction
    st.markdown(f"""
Predicted Annual Salary
€{prediction:,.0f}
""", unsafe_allow_html=True)

    # Breakdown metrics: the annual figure divided by periods per year.
    # 2080 is presumably 52 weeks x 40 hours.
    salary_breakdown = [
        ("💰 Annual", 1),
        ("📅 Monthly", 12),
        ("📆 Weekly", 52),
        ("⏰ Hourly", 2080),
    ]
    for column, (label, periods_per_year) in zip(st.columns(4), salary_breakdown):
        with column:
            st.metric(label, f"€{prediction / periods_per_year:,.0f}")
    st.markdown("---")

    # Tabs for detailed analysis
    tab1, tab2, tab3 = st.tabs(["📊 Model Insights", "🔄 What-If Analysis", "ℹ️ About Prediction"])

    # TAB 1: MODEL INSIGHTS
    with tab1:
        st.header("🧠 Understanding Your Prediction")
        st.write("See which factors had the biggest impact on your predicted salary.")
        input_data = st.session_state['current_input']

        # Transform input with the pipeline's own preprocessing step so the
        # explainer sees exactly what the regressor sees.
        preprocessor = model_pipeline.named_steps['preprocessor']
        model = model_pipeline.named_steps['regressor']
        X_transformed = preprocessor.transform(input_data)
        # The preprocessor may hand back a scipy sparse matrix; the row
        # indexing below needs a dense array.
        if hasattr(X_transformed, 'toarray'):
            X_transformed = X_transformed.toarray()
        feature_names = list(preprocessor.get_feature_names_out())

        # Create SHAP explainer
        with st.spinner("Calculating feature impacts..."):
            explainer = shap.TreeExplainer(model)
            shap_values = explainer.shap_values(X_transformed)
        # expected_value may be a scalar or an array; ravel handles both.
        expected_value = float(np.ravel(explainer.expected_value)[0])

        st.subheader("🎯 Feature Impact Visualization")
        st.write(f"**Base salary** (average): €{expected_value:,.0f}")
        st.write("Features in **red** increase your salary. Features in **blue** decrease it.")

        # Force plot for the single input row
        st_shap(shap.force_plot(
            expected_value,
            shap_values[0],
            X_transformed[0],
            feature_names=feature_names,
        ))
        st.markdown("---")

        # Feature contribution layout: wide chart column, narrow table column.
        col1, col2 = st.columns([2, 1])
        with col1:
            st.subheader("📊 Top Contributing Factors")
# Create feature mapping WITHOUT emojis for chart
def clean_feature_name(feature):
    """Convert a technical feature name to a user-friendly label WITHOUT emojis.

    Args:
        feature: Feature name such as ``'cat__country_Germany'`` or
            ``'num__years_code_pro'``.

    Returns:
        str: A display label, e.g. ``'Country: Germany'`` or
        ``'Years of Experience'``. Names that match no known pattern have
        underscores replaced by spaces and are title-cased.
    """
    # Strip one leading transformer prefix. Anchoring at the start keeps a
    # category value that merely contains one of these tokens intact.
    for transformer_prefix in ('cat__', 'num__', 'remainder__'):
        if feature.startswith(transformer_prefix):
            feature = feature[len(transformer_prefix):]
            break

    # Simple one-to-one mappings
    simple_map = {
        'years_code_pro': 'Years of Experience',
        'age_group': 'Age Group',
        'so_account': 'Stack Overflow Account',
        'ai_select': 'Uses AI Tools',
    }
    if feature in simple_map:
        return simple_map[feature]

    # Categorical variables: "<column>_<category value>". The column name
    # must be a true prefix; a substring test would mislabel a category whose
    # own text happens to contain e.g. "country_".
    category_labels = {
        'country_': 'Country: ',
        'remote_work_': 'Work: ',
        'dev_type_': 'Role: ',
        'org_size_': 'Company Size: ',
        'ed_level_': 'Education: ',
    }
    for column_prefix, label in category_labels.items():
        if feature.startswith(column_prefix):
            category_value = feature[len(column_prefix):]
            return label + category_value.replace('_', ' ')

    # Fallback
    return feature.replace('_', ' ').title()
# Create emoji version for the table only
def clean_feature_name_with_emoji(feature):
    """Convert a technical feature name to a user-friendly label WITH an emoji.

    Builds the plain label with ``clean_feature_name`` and prefixes the emoji
    of the first marker text found in it. Labels matching no marker are
    returned without an emoji.

    Args:
        feature: Technical feature name.

    Returns:
        str: The display label, emoji-prefixed when a marker matches.
    """
    base_name = clean_feature_name(feature)

    # Checked in order; the first marker contained in the label wins.
    emoji_by_marker = (
        ('Years of Experience', '⏱️ '),
        ('Age Group', '👤 '),
        ('Country:', '🌍 '),
        ('Work:', '🏠 '),
        ('Role:', '💻 '),
        ('Company Size:', '🏢 '),
        ('Education:', '🎓 '),
        ('Stack Overflow', '📚 '),
        ('AI Tools', '🤖 '),
    )
    for marker, emoji in emoji_by_marker:
        if marker in base_name:
            return emoji + base_name
    return base_name
# Bar chart and table of the ten largest SHAP contributions. Written as its
# own guarded statement; it re-enters the `col1` / `col2` containers.
if st.session_state.get('has_prediction', False):
    from matplotlib.patches import Patch

    increase_color = '#10b981'
    decrease_color = '#ef4444'

    with col1:
        shap_df = pd.DataFrame({
            'Feature': feature_names,
            'SHAP Value': shap_values[0],
            'Impact': ['⬆️ Increases' if x > 0 else '⬇️ Decreases' for x in shap_values[0]],
        })
        # Rank by magnitude so strong negative effects are kept too.
        shap_df['Abs SHAP'] = shap_df['SHAP Value'].abs()
        shap_df = shap_df.sort_values('Abs SHAP', ascending=False).head(10)

        # Clean feature names - NO emojis for chart, WITH emojis for table
        shap_df['Feature_Clean'] = shap_df['Feature'].apply(clean_feature_name)
        shap_df['Feature_Clean_Emoji'] = shap_df['Feature'].apply(clean_feature_name_with_emoji)

        # Create visualization
        fig, ax = plt.subplots(figsize=(10, 6))
        colors = [increase_color if x > 0 else decrease_color for x in shap_df['SHAP Value']]
        ax.barh(range(len(shap_df)), shap_df['SHAP Value'], color=colors, alpha=0.85, height=0.7)

        # Value labels: inside bars longer than 30% of the longest bar,
        # otherwise just outside the bar end.
        max_abs_value = shap_df['Abs SHAP'].max()
        for row, value in enumerate(shap_df['SHAP Value']):
            abs_value = abs(value)
            if abs_value > max_abs_value * 0.3:
                # Inside the bar
                x_pos = value / 2
                color = 'white'
                ha = 'center'
            else:
                # Outside the bar, offset by 5% of the longest bar
                offset = max_abs_value * 0.05
                x_pos = value + (offset if value > 0 else -offset)
                color = increase_color if value > 0 else decrease_color
                ha = 'left' if value > 0 else 'right'
            ax.text(x_pos, row, f'€{abs_value:,.0f}',
                    ha=ha, va='center',
                    fontweight='bold', fontsize=10,
                    color=color)

        # Axis labels use the cleaned names WITHOUT emojis
        ax.set_yticks(range(len(shap_df)))
        ax.set_yticklabels(shap_df['Feature_Clean'], fontsize=10)
        ax.set_xlabel('Impact on Salary (EUR)', fontsize=11, fontweight='bold')
        ax.set_title('How Different Factors Affect Your Salary',
                     fontsize=13, fontweight='bold', pad=20)

        # Zero line
        ax.axvline(x=0, color='#64748b', linestyle='-', linewidth=2, alpha=0.5)

        # Legend
        legend_elements = [
            Patch(facecolor=increase_color, alpha=0.85, label='Increases Salary'),
            Patch(facecolor=decrease_color, alpha=0.85, label='Decreases Salary'),
        ]
        ax.legend(handles=legend_elements, loc='upper right', frameon=True,
                  fancybox=True, shadow=True, fontsize=10)

        # Styling
        ax.grid(axis='x', alpha=0.2, linestyle='--')
        ax.set_facecolor('#f8fafc')
        fig.patch.set_facecolor('white')
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)
        plt.tight_layout()
        st.pyplot(fig)
        # The script reruns on every interaction; close the figure so
        # pyplot does not keep one more open figure per rerun.
        plt.close(fig)

        # Explanation box
        st.info("""
**💡 How to read this chart:**
- **Green bars** pointing right → These factors *increase* your salary
- **Red bars** pointing left ← These factors *decrease* your salary
- **Longer bars** = Bigger impact on your predicted salary
- **Why do I see other countries/categories I didn't select?** The chart shows the top 10 most impactful features for your prediction. When you see a **red bar** for a category you *didn't* select (like other countries), it means "not having this characteristic lowers your salary compared to having it." For example, if you see "Country: Germany" with a **red bar showing €1,059**, it means being from Germany would have added €1,059 to your salary compared to your current country.
""")

    with col2:
        st.subheader("📋 Impact Details")
        # Table uses the cleaned names WITH emojis and signed euro amounts.
        display_df = shap_df[['Feature_Clean_Emoji', 'SHAP Value']].copy()
        display_df.columns = ['Factor', 'Impact Amount']
        display_df['Impact Amount'] = display_df['Impact Amount'].apply(
            lambda x: f"+€{x:,.0f}" if x > 0 else f"-€{abs(x):,.0f}"
        )
        display_df = display_df.reset_index(drop=True)
        st.dataframe(
            display_df,
            use_container_width=True,
            hide_index=True,
        )
# TAB 2: WHAT-IF ANALYSIS
# Lets the user change one factor of the stored profile and compares the new
# prediction with the original. Guarded on its own because `tab2` only exists
# once a prediction has been made.
if st.session_state.get('has_prediction', False):
    with tab2:
        st.header("🔄 What-If Scenario Analysis")
        st.write("Explore how changing different factors affects your predicted salary.")
        original_input = st.session_state['current_input'].copy()
        original_prediction = st.session_state['current_prediction']
        col1, col2 = st.columns([1, 2])

        with col1:
            st.subheader("🎛️ Modify Factor")
            # Key order is the order shown in the selector.
            factor_labels = {
                'years_code_pro': '⏱️ Years of Experience',
                'country': '🌍 Country',
                'remote_work': '🏠 Work Arrangement',
                'dev_type': '💻 Developer Type',
                'org_size': '🏢 Organization Size',
                'ed_level': '🎓 Education Level',
            }
            feature_to_change = st.selectbox(
                "Select factor to modify",
                options=list(factor_labels),
                format_func=lambda name: factor_labels[name],
            )

            # Categorical factors: (widget label, choices, widget key).
            categorical_choices = {
                'country': ("New country", COUNTRY_OPTIONS, "what_if_country"),
                'remote_work': ("New work arrangement", REMOTE_WORK_OPTIONS, "what_if_remote"),
                'dev_type': ("New developer type", DEV_TYPE_OPTIONS, "what_if_dev"),
                'org_size': ("New org size", ORG_SIZE_OPTIONS, "what_if_org"),
                'ed_level': ("New education level", ED_LEVEL_OPTIONS, "what_if_ed"),
            }

            modified_input = original_input.copy()
            if feature_to_change == 'years_code_pro':
                new_value = st.slider(
                    "New years of experience",
                    min_value=0, max_value=40,
                    value=int(original_input[feature_to_change].values[0]),
                    key="what_if_years",
                )
            else:
                widget_label, choices, widget_key = categorical_choices[feature_to_change]
                new_value = st.selectbox(widget_label, choices, key=widget_key)
            modified_input[feature_to_change] = new_value

            # The scenario is only stored on click, so the comparison stays
            # put while the user keeps adjusting the widgets.
            if st.button("🔍 Compare Scenarios", use_container_width=True):
                st.session_state['comparison_active'] = True
                st.session_state['modified_input'] = modified_input

        with col2:
            if st.session_state.get('comparison_active', False):
                modified_input = st.session_state['modified_input']
                modified_prediction = model_pipeline.predict(modified_input)[0]
                difference = modified_prediction - original_prediction
                # Guard the ratio: a zero baseline would raise / yield inf.
                if original_prediction:
                    percent_change = (difference / original_prediction) * 100
                else:
                    percent_change = 0.0

                st.subheader("📊 Comparison Results")

                # Visual comparison
                fig, ax = plt.subplots(figsize=(10, 5))
                scenarios = ['Current\nProfile', 'Modified\nProfile']
                salaries = [original_prediction, modified_prediction]
                colors = ['#3498db', '#e74c3c' if difference < 0 else '#2ecc71']
                bars = ax.bar(scenarios, salaries, color=colors, alpha=0.7, width=0.6)

                # Value labels on top of the bars
                for bar, salary in zip(bars, salaries):
                    height = bar.get_height()
                    ax.text(bar.get_x() + bar.get_width() / 2., height,
                            f'€{salary:,.0f}',
                            ha='center', va='bottom', fontweight='bold', fontsize=12)
                ax.set_ylabel('Annual Salary (EUR)', fontweight='bold', fontsize=11)
                ax.set_title('Salary Comparison', fontweight='bold', fontsize=13, pad=20)
                ax.grid(axis='y', alpha=0.3)
                plt.tight_layout()
                st.pyplot(fig)
                # Avoid leaving one more open pyplot figure per rerun.
                plt.close(fig)

                # Summary metrics
                col_a, col_b, col_c = st.columns(3)
                with col_a:
                    st.metric("Current Salary", f"€{original_prediction:,.0f}")
                with col_b:
                    st.metric("Modified Salary", f"€{modified_prediction:,.0f}")
                with col_c:
                    st.metric("Difference", f"€{abs(difference):,.0f}",
                              f"{percent_change:+.1f}%")

                # Interpretation. The wording already states the direction,
                # so the "decrease" branch prints magnitudes only.
                if difference > 0:
                    st.success(f"✅ This change would **increase** your salary by €{difference:,.0f} ({percent_change:.1f}%)")
                elif difference < 0:
                    st.error(f"⚠️ This change would **decrease** your salary by €{abs(difference):,.0f} ({abs(percent_change):.1f}%)")
                else:
                    st.info("➡️ This change has **no significant impact** on salary")
            else:
                st.info("👈 Select a factor to modify and click 'Compare Scenarios' to see the impact")
# TAB 3: ABOUT PREDICTION
# Static model facts plus a summary of the profile the prediction was made
# for. Guarded on its own because `tab3` only exists once a prediction has
# been made.
if st.session_state.get('has_prediction', False):
    with tab3:
        st.header("ℹ️ About This Prediction")
        col1, col2 = st.columns(2)

        with col1:
            st.subheader("📊 Model Information")
            st.markdown("""
- **Algorithm**: Random Forest Regressor (Optimized)
- **Training Data**: Stack Overflow 2024 Developer Survey
- **Sample Size**: 7,000+ European developers
- **Model Accuracy**: RMSE ≈ €18,600
- **Features Used**: 9 key factors
""")
            st.subheader("🎯 Prediction Confidence")
            st.info("This model performs best for developers with 0-20 years of experience in Eurozone countries. Average prediction error: ±€18,600")

        with col2:
            st.subheader("📋 Your Profile Summary")
            # Read the stored input row, not the live sidebar widgets: the
            # sidebar may have been edited since the prediction was made, and
            # this table must describe the profile behind the shown figure.
            profile = st.session_state['current_input'].iloc[0]
            age_labels = {1: "18-24", 2: "25-34", 3: "35-44", 4: "45-54", 5: "55+"}
            education = profile['ed_level']
            if len(education) > 30:
                # Keep long education labels from stretching the table.
                education = education[:30] + "..."
            profile_data = {
                'Factor': ['Age Group', 'Experience', 'Country', 'Developer Type',
                           'Education', 'Org Size', 'Work Arrangement', 'SO Account', 'Uses AI'],
                'Value': [
                    age_labels[int(profile['age_group'])],
                    f"{int(profile['years_code_pro'])} years",
                    profile['country'],
                    profile['dev_type'],
                    education,
                    profile['org_size'],
                    profile['remote_work'],
                    "Yes" if profile['so_account'] else "No",
                    "Yes" if profile['ai_select'] else "No",
                ],
            }
            st.dataframe(
                pd.DataFrame(profile_data),
                use_container_width=True,
                hide_index=True,
            )

        st.markdown("---")
        st.warning("""
**⚠️ Important Disclaimer**: This prediction is an **estimate** based on historical survey data.
Actual salaries can vary significantly based on:
- Specific technical skills and expertise
- Company size, stage, and funding
- Individual negotiation and performance
- Local market conditions and demand
- Benefits, equity, and other compensation
Use this tool as a **reference point**, not a definitive salary expectation.
""")
# Getting-started instructions, shown only while no prediction has been made.
if not st.session_state.get("has_prediction", False):
    st.markdown("---")
    st.info("👈 **Get Started**: Fill in your profile in the sidebar and click **'Predict Salary'** to see your results!")

    # Three side-by-side steps: (heading, instruction).
    onboarding_steps = [
        ("### 🎯 Step 1", "Enter personal information (age, experience, country)"),
        ("### 💼 Step 2", "Add professional details (role, education, company)"),
        ("### 🔮 Step 3", "Click **'Predict Salary'** to see your estimate!"),
    ]
    for column, (heading, instruction) in zip(st.columns(3), onboarding_steps):
        with column:
            st.markdown(heading)
            st.write(instruction)