# Streamlit app: predicts annual salaries for European software developers
# from a pickled scikit-learn pipeline and explains each prediction with SHAP.
#
# NOTE: comments are used instead of a module docstring so nothing can be
# rendered before st.set_page_config() runs.
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import shap
import streamlit as st
from matplotlib.patches import Patch
from streamlit_shap import st_shap

# Page configuration
st.set_page_config(
    page_title="EU Developer Salary Predictor",
    page_icon="💰",
    layout="wide",
    initial_sidebar_state="expanded",
)

# Custom CSS for better styling
# NOTE(review): the stylesheet body is blank in this copy of the file --
# confirm whether a <style> block was lost and restore it if so.
st.markdown(""" """, unsafe_allow_html=True)


# Load model and info
@st.cache_resource
def load_model():
    """Unpickle the fitted pipeline and its metadata from the working directory.

    Returns:
        tuple: ``(model, info)`` read from ``salary_model.pkl`` and
        ``model_info.pkl`` respectively.

    Raises:
        OSError: if either file cannot be opened.
        pickle.UnpicklingError: if either file is not a valid pickle.
    """
    # SECURITY: pickle.load runs arbitrary code embedded in the file. Only
    # deploy artifacts produced by a trusted training run.
    with open('salary_model.pkl', 'rb') as f:
        model = pickle.load(f)
    with open('model_info.pkl', 'rb') as f:
        info = pickle.load(f)
    return model, info


# Top-level boundary: any load failure is shown to the user and the run stops.
try:
    model_pipeline, model_info = load_model()
except Exception as e:
    st.error(f"❌ Error loading model: {e}")
    st.stop()

# Feature options - Only countries using EUR or commonly reporting in EUR
COUNTRY_OPTIONS = [
    'Austria', 'Belgium', 'France', 'Germany', 'Ireland',
    'Italy', 'Netherlands', 'Portugal', 'Spain',
]

ED_LEVEL_OPTIONS = [
    "Primary/elementary school",
    "Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)",
    "Some college/university study without earning a degree",
    "Associate degree (A.A., A.S., etc.)",
    "Bachelor's degree (B.A., B.S., B.Eng., etc.)",
    "Master's degree (M.A., M.S., M.Eng., MBA, etc.)",
    "Professional degree (JD, MD, Ph.D, Ed.D, etc.)",
    "Something else",
]

DEV_TYPE_OPTIONS = [
    'Developer, back-end',
    'Developer, full-stack',
    'Developer, front-end',
    'Engineering manager',
    'Developer, desktop or enterprise applications',
    'Developer, mobile',
    'DevOps specialist',
    'Data scientist or machine learning specialist',
    'Data or business analyst',
    'System administrator',
    'Developer, QA or test',
    'Product manager',
    'Other',
]

ORG_SIZE_OPTIONS = [
    '2 to 9 employees',
    '10 to 19 employees',
    '20 to 99 employees',
    '100 to 499 employees',
    '500 to 999 employees',
    '1,000 to 4,999 employees',
    '5,000 to 9,999 employees',
    '10,000 or more employees',
]

REMOTE_WORK_OPTIONS = [
    'Hybrid (some remote, some in-person)',
    'Fully remote',
    'In-person',
]

# Age-group codes fed to the model -> short display label.
AGE_GROUP_LABELS = {1: "18-24", 2: "25-34", 3: "35-44", 4: "45-54", 5: "55+"}

# What-if factors, in menu order, with their display labels.
WHAT_IF_LABELS = {
    'years_code_pro': '⏱️ Years of Experience',
    'country': '🌍 Country',
    'remote_work': '🏠 Work Arrangement',
    'dev_type': '💻 Developer Type',
    'org_size': '🏢 Organization Size',
    'ed_level': '🎓 Education Level',
}

# Categorical what-if factors -> (widget label, options, widget key).
WHAT_IF_CHOICES = {
    'country': ("New country", COUNTRY_OPTIONS, "what_if_country"),
    'remote_work': ("New work arrangement", REMOTE_WORK_OPTIONS, "what_if_remote"),
    'dev_type': ("New developer type", DEV_TYPE_OPTIONS, "what_if_dev"),
    'org_size': ("New org size", ORG_SIZE_OPTIONS, "what_if_org"),
    'ed_level': ("New education level", ED_LEVEL_OPTIONS, "what_if_ed"),
}

# Prefixes stripped from transformed feature names before display.
TRANSFORMER_PREFIXES = ('cat__', 'num__', 'remainder__')

# Transformed names that map straight to a display label.
SIMPLE_FEATURE_LABELS = {
    'years_code_pro': 'Years of Experience',
    'age_group': 'Age Group',
    'so_account': 'Stack Overflow Account',
    'ai_select': 'Uses AI Tools',
}

# One-hot column prefix -> display label prefix.
CATEGORICAL_FEATURE_LABELS = {
    'country_': 'Country: ',
    'remote_work_': 'Work: ',
    'dev_type_': 'Role: ',
    'org_size_': 'Company Size: ',
    'ed_level_': 'Education: ',
}

# (substring of the cleaned label, emoji prefix); first match wins.
FEATURE_EMOJIS = (
    ('Years of Experience', '⏱️ '),
    ('Age Group', '👤 '),
    ('Country:', '🌍 '),
    ('Work:', '🏠 '),
    ('Role:', '💻 '),
    ('Company Size:', '🏢 '),
    ('Education:', '🎓 '),
    ('Stack Overflow', '📚 '),
    ('AI Tools', '🤖 '),
)

POSITIVE_COLOR = '#10b981'
NEGATIVE_COLOR = '#ef4444'

# Long markdown bodies live at column 0 so indentation never leaks into them.
MODEL_DETAILS_MD = """
- **Data Source**: Stack Overflow 2024 Survey
- **Sample**: 7,000+ European developers
- **Algorithm**: Optimized Random Forest
- **Accuracy**: RMSE ~€18,600
- **Last Updated**: 2025
"""

CHART_HELP_MD = """
**💡 How to read this chart:**
- **Green bars** pointing right → These factors *increase* your salary
- **Red bars** pointing left → These factors *decrease* your salary
- **Longer bars** = Bigger impact on your predicted salary
- **Why do I see other countries/categories I didn't select?** The chart shows the top 10 most impactful features for your prediction. When you see a **red bar** for a category you *didn't* select (like other countries), it means "not having this characteristic lowers your salary compared to having it." For example, if you see "Country: Germany" with a **red bar showing €1,059**, it means being from Germany would have added €1,059 to your salary compared to your current country.
"""

MODEL_INFO_MD = """
- **Algorithm**: Random Forest Regressor (Optimized)
- **Training Data**: Stack Overflow 2024 Developer Survey
- **Sample Size**: 7,000+ European developers
- **Model Accuracy**: RMSE ≈ €18,600
- **Features Used**: 9 key factors
"""

DISCLAIMER_MD = """
**⚠️ Important Disclaimer**: This prediction is an **estimate** based on historical survey data. Actual salaries can vary significantly based on:

- Specific technical skills and expertise
- Company size, stage, and funding
- Individual negotiation and performance
- Local market conditions and demand
- Benefits, equity, and other compensation

Use this tool as a **reference point**, not a definitive salary expectation.
"""


def clean_feature_name(feature):
    """Convert a transformed feature name to a user-friendly label WITHOUT emojis."""
    # Drop the leading transformer prefix, if any.
    for transformer_prefix in TRANSFORMER_PREFIXES:
        if feature.startswith(transformer_prefix):
            feature = feature[len(transformer_prefix):]
            break

    if feature in SIMPLE_FEATURE_LABELS:
        return SIMPLE_FEATURE_LABELS[feature]

    # One-hot columns: "<column>_<category>" -> "<Label>: <category>".
    for prefix, label in CATEGORICAL_FEATURE_LABELS.items():
        if feature.startswith(prefix):
            return label + feature[len(prefix):].replace('_', ' ')

    # Fallback for anything unrecognised.
    return feature.replace('_', ' ').title()


def clean_feature_name_with_emoji(feature):
    """Convert a transformed feature name to a user-friendly label WITH an emoji."""
    base_name = clean_feature_name(feature)
    for marker, emoji in FEATURE_EMOJIS:
        if marker in base_name:
            return emoji + base_name
    return base_name


def top_shap_factors(feature_names, shap_row, limit=10):
    """Return the ``limit`` features with the largest absolute SHAP value.

    Args:
        feature_names: transformed feature names, aligned with ``shap_row``.
        shap_row: SHAP values for a single prediction.
        limit: number of rows to keep.

    Returns:
        pandas.DataFrame sorted by descending absolute SHAP value, with raw and
        display-ready feature-name columns.
    """
    shap_df = pd.DataFrame({
        'Feature': feature_names,
        'SHAP Value': shap_row,
        'Impact': ['⬆️ Increases' if x > 0 else '⬇️ Decreases' for x in shap_row],
    })
    shap_df['Abs SHAP'] = shap_df['SHAP Value'].abs()
    shap_df = shap_df.sort_values('Abs SHAP', ascending=False).head(limit)

    # Plain names for the chart, emoji names for the table.
    shap_df['Feature_Clean'] = shap_df['Feature'].apply(clean_feature_name)
    shap_df['Feature_Clean_Emoji'] = shap_df['Feature'].apply(clean_feature_name_with_emoji)
    return shap_df


def plot_shap_bars(shap_df):
    """Build the horizontal bar chart of SHAP contributions and return the figure."""
    fig, ax = plt.subplots(figsize=(10, 6))
    values = shap_df['SHAP Value']
    positions = range(len(shap_df))

    colors = [POSITIVE_COLOR if x > 0 else NEGATIVE_COLOR for x in values]
    ax.barh(positions, values, color=colors, alpha=0.85, height=0.7)

    # Value labels: inside bars longer than 30% of the longest, outside otherwise.
    max_abs_value = shap_df['Abs SHAP'].max()
    offset = max_abs_value * 0.05  # 5% of max value as offset
    for i, value in enumerate(values):
        abs_value = abs(value)
        if abs_value > max_abs_value * 0.3:
            x_pos = value / 2
            color = 'white'
            ha = 'center'
        else:
            x_pos = value + (offset if value > 0 else -offset)
            color = POSITIVE_COLOR if value > 0 else NEGATIVE_COLOR
            ha = 'left' if value > 0 else 'right'
        ax.text(x_pos, i, f'€{abs_value:,.0f}', ha=ha, va='center',
                fontweight='bold', fontsize=10, color=color)

    # Tick labels use the emoji-free names.
    ax.set_yticks(positions)
    ax.set_yticklabels(shap_df['Feature_Clean'], fontsize=10)
    ax.set_xlabel('Impact on Salary (EUR)', fontsize=11, fontweight='bold')
    ax.set_title('How Different Factors Affect Your Salary',
                 fontsize=13, fontweight='bold', pad=20)

    # Zero line
    ax.axvline(x=0, color='#64748b', linestyle='-', linewidth=2, alpha=0.5)

    legend_elements = [
        Patch(facecolor=POSITIVE_COLOR, alpha=0.85, label='Increases Salary'),
        Patch(facecolor=NEGATIVE_COLOR, alpha=0.85, label='Decreases Salary'),
    ]
    ax.legend(handles=legend_elements, loc='upper right', frameon=True,
              fancybox=True, shadow=True, fontsize=10)

    # Styling
    ax.grid(axis='x', alpha=0.2, linestyle='--')
    ax.set_facecolor('#f8fafc')
    fig.patch.set_facecolor('white')
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    fig.tight_layout()
    return fig


def plot_salary_comparison(original_prediction, modified_prediction):
    """Build the two-bar current-vs-modified salary chart and return the figure."""
    difference = modified_prediction - original_prediction
    fig, ax = plt.subplots(figsize=(10, 5))

    scenarios = ['Current\nProfile', 'Modified\nProfile']
    salaries = [original_prediction, modified_prediction]
    colors = ['#3498db', '#e74c3c' if difference < 0 else '#2ecc71']
    bars = ax.bar(scenarios, salaries, color=colors, alpha=0.7, width=0.6)

    # Value labels on top of the bars
    for bar, salary in zip(bars, salaries):
        ax.text(bar.get_x() + bar.get_width() / 2., bar.get_height(),
                f'€{salary:,.0f}', ha='center', va='bottom',
                fontweight='bold', fontsize=12)

    ax.set_ylabel('Annual Salary (EUR)', fontweight='bold', fontsize=11)
    ax.set_title('Salary Comparison', fontweight='bold', fontsize=13, pad=20)
    ax.grid(axis='y', alpha=0.3)
    fig.tight_layout()
    return fig


def show_figure(fig):
    """Render ``fig`` and close it so figures do not pile up across reruns."""
    st.pyplot(fig)
    plt.close(fig)


# ============================================================================
# SIDEBAR - INPUT FORM
# ============================================================================
def render_sidebar():
    """Draw the profile form in the sidebar.

    Returns:
        tuple: ``(profile, predict_clicked)``. ``profile`` is a dict keyed by
        the model's input column names, in the column order used for
        prediction; ``predict_clicked`` is the state of the predict button.
    """
    with st.sidebar:
        st.title("🎯 Developer Profile")
        st.markdown("---")

        # Personal Information
        st.subheader("👤 Personal")
        age_group = st.selectbox(
            "Age Group",
            options=list(AGE_GROUP_LABELS),
            format_func=lambda x: f"{AGE_GROUP_LABELS[x]} years",
            key="age",
        )
        years_code_pro = st.slider(
            "Years of Experience",
            min_value=0,
            max_value=40,
            value=5,
            key="years",
        )
        country = st.selectbox("Country", options=COUNTRY_OPTIONS, key="country")
        st.markdown("---")

        # Professional Information
        st.subheader("💼 Professional")
        dev_type = st.selectbox("Developer Type", options=DEV_TYPE_OPTIONS, key="dev_type")
        ed_level = st.selectbox("Education Level", options=ED_LEVEL_OPTIONS, key="ed_level")
        org_size = st.selectbox("Organization Size", options=ORG_SIZE_OPTIONS, key="org_size")
        remote_work = st.selectbox("Work Arrangement", options=REMOTE_WORK_OPTIONS, key="remote")
        st.markdown("---")

        # Additional Information
        st.subheader("⚙️ Additional")
        so_account = st.checkbox("Stack Overflow Account", value=True, key="so")
        ai_select = st.checkbox("Uses AI Tools", value=True, key="ai")
        st.markdown("---")

        # Predict button in sidebar
        predict_clicked = st.button(
            "🔮 Predict Salary",
            type="primary",
            use_container_width=True,
            key="predict_btn",
        )
        st.markdown("---")

        # MODEL DETAILS - at the bottom of the sidebar
        st.subheader("📊 Model Details")
        st.markdown(MODEL_DETAILS_MD)

    profile = {
        'age_group': age_group,
        'years_code_pro': years_code_pro,
        'remote_work': remote_work,
        'ed_level': ed_level,
        'dev_type': dev_type,
        'org_size': org_size,
        'country': country,
        'so_account': so_account,
        'ai_select': ai_select,
    }
    return profile, predict_clicked


# ============================================================================
# RESULT TABS
# ============================================================================
def render_insights_tab(pipeline, input_data):
    """Tab 1: SHAP force plot, top-factor chart and table for ``input_data``."""
    st.header("🧠 Understanding Your Prediction")
    st.write("See which factors had the biggest impact on your predicted salary.")

    # Transform input the same way the pipeline does before the regressor.
    preprocessor = pipeline.named_steps['preprocessor']
    model = pipeline.named_steps['regressor']
    X_transformed = preprocessor.transform(input_data)
    feature_names = list(preprocessor.get_feature_names_out())

    with st.spinner("Calculating feature impacts..."):
        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(X_transformed)

    # expected_value is either a scalar or a one-element array.
    if isinstance(explainer.expected_value, np.ndarray):
        expected_value = float(explainer.expected_value[0])
    else:
        expected_value = float(explainer.expected_value)

    st.subheader("🎯 Feature Impact Visualization")
    st.write(f"**Base salary** (average): €{expected_value:,.0f}")
    st.write("Features in **red** increase your salary. Features in **blue** decrease it.")

    # Force plot
    st_shap(shap.force_plot(
        expected_value,
        shap_values[0],
        X_transformed[0],
        feature_names=feature_names,
    ))
    st.markdown("---")

    shap_df = top_shap_factors(feature_names, shap_values[0])
    chart_col, table_col = st.columns([2, 1])

    with chart_col:
        st.subheader("📈 Top Contributing Factors")
        show_figure(plot_shap_bars(shap_df))
        st.info(CHART_HELP_MD)

    with table_col:
        st.subheader("📋 Impact Details")
        display_df = shap_df[['Feature_Clean_Emoji', 'SHAP Value']].copy()
        display_df.columns = ['Factor', 'Impact Amount']
        display_df['Impact Amount'] = display_df['Impact Amount'].apply(
            lambda x: f"+€{x:,.0f}" if x > 0 else f"-€{abs(x):,.0f}"
        )
        display_df = display_df.reset_index(drop=True)
        st.dataframe(display_df, use_container_width=True, hide_index=True)


def render_what_if_tab(pipeline, original_input, original_prediction):
    """Tab 2: change one factor and compare the new prediction to the stored one."""
    st.header("🔄 What-If Scenario Analysis")
    st.write("Explore how changing different factors affects your predicted salary.")

    control_col, result_col = st.columns([1, 2])

    with control_col:
        st.subheader("🎛️ Modify Factor")
        feature_to_change = st.selectbox(
            "Select factor to modify",
            options=list(WHAT_IF_LABELS),
            format_func=lambda x: WHAT_IF_LABELS[x],
        )

        modified_input = original_input.copy()
        if feature_to_change == 'years_code_pro':
            new_value = st.slider(
                "New years of experience",
                min_value=0,
                max_value=40,
                value=int(original_input[feature_to_change].values[0]),
                key="what_if_years",
            )
        else:
            label, options, widget_key = WHAT_IF_CHOICES[feature_to_change]
            new_value = st.selectbox(label, options, key=widget_key)
        modified_input[feature_to_change] = new_value

        # The comparison is only (re)computed from what was stored on click.
        if st.button("🔄 Compare Scenarios", use_container_width=True):
            st.session_state['comparison_active'] = True
            st.session_state['modified_input'] = modified_input

    with result_col:
        if not st.session_state.get('comparison_active', False):
            st.info("👈 Select a factor to modify and click 'Compare Scenarios' to see the impact")
            return

        modified_prediction = pipeline.predict(st.session_state['modified_input'])[0]
        difference = modified_prediction - original_prediction
        # Guard the ratio: a zero baseline has no meaningful percent change.
        if original_prediction:
            percent_change = (difference / original_prediction) * 100
        else:
            percent_change = 0.0

        st.subheader("📊 Comparison Results")
        show_figure(plot_salary_comparison(original_prediction, modified_prediction))

        # Summary metrics
        col_a, col_b, col_c = st.columns(3)
        with col_a:
            st.metric("Current Salary", f"€{original_prediction:,.0f}")
        with col_b:
            st.metric("Modified Salary", f"€{modified_prediction:,.0f}")
        with col_c:
            st.metric("Difference", f"€{abs(difference):,.0f}", f"{percent_change:+.1f}%")

        # Interpretation
        if difference > 0:
            st.success(f"✅ This change would **increase** your salary by €{difference:,.0f} ({percent_change:.1f}%)")
        elif difference < 0:
            st.error(f"⚠️ This change would **decrease** your salary by €{abs(difference):,.0f} ({percent_change:.1f}%)")
        else:
            st.info("➡️ This change has **no significant impact** on salary")


def render_about_tab(input_data):
    """Tab 3: model facts, profile summary and disclaimer.

    The summary is built from ``input_data`` (the stored frame the prediction
    was made from) rather than the live sidebar widgets, so it always matches
    the prediction on screen.
    """
    st.header("ℹ️ About This Prediction")
    info_col, profile_col = st.columns(2)

    with info_col:
        st.subheader("📊 Model Information")
        st.markdown(MODEL_INFO_MD)
        st.subheader("🎯 Prediction Confidence")
        st.info("This model performs best for developers with 0-20 years of experience in Eurozone countries. Average prediction error: ±€18,600")

    with profile_col:
        st.subheader("📋 Your Profile Summary")
        row = input_data.iloc[0]
        ed_level = row['ed_level']
        profile_data = {
            'Factor': ['Age Group', 'Experience', 'Country', 'Developer Type',
                       'Education', 'Org Size', 'Work Arrangement', 'SO Account', 'Uses AI'],
            'Value': [
                AGE_GROUP_LABELS[int(row['age_group'])],
                f"{int(row['years_code_pro'])} years",
                row['country'],
                row['dev_type'],
                ed_level[:30] + "..." if len(ed_level) > 30 else ed_level,
                row['org_size'],
                row['remote_work'],
                "Yes" if row['so_account'] else "No",
                "Yes" if row['ai_select'] else "No",
            ],
        }
        st.dataframe(pd.DataFrame(profile_data), use_container_width=True, hide_index=True)

    st.markdown("---")
    st.warning(DISCLAIMER_MD)


def render_results(pipeline):
    """Show the stored prediction, its breakdown and the three analysis tabs."""
    prediction = st.session_state['current_prediction']
    input_data = st.session_state['current_input']

    # Display main prediction
    # NOTE(review): any HTML wrapper around this text is absent in this copy.
    st.markdown(f"""
Predicted Annual Salary
€{prediction:,.0f}
""", unsafe_allow_html=True)

    # Breakdown metrics (2080 = 52 weeks x 40 hours)
    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.metric("💰 Annual", f"€{prediction:,.0f}")
    with col2:
        st.metric("📅 Monthly", f"€{prediction/12:,.0f}")
    with col3:
        st.metric("📆 Weekly", f"€{prediction/52:,.0f}")
    with col4:
        st.metric("⏰ Hourly", f"€{prediction/2080:,.0f}")
    st.markdown("---")

    # Tabs for detailed analysis
    tab1, tab2, tab3 = st.tabs(["📊 Model Insights", "🔄 What-If Analysis", "ℹ️ About Prediction"])
    with tab1:
        render_insights_tab(pipeline, input_data)
    with tab2:
        render_what_if_tab(pipeline, input_data.copy(), prediction)
    with tab3:
        render_about_tab(input_data)


def render_getting_started():
    """Show the three-step instructions displayed before the first prediction."""
    st.markdown("---")
    st.info("👈 **Get Started**: Fill in your profile in the sidebar and click **'Predict Salary'** to see your results!")
    col1, col2, col3 = st.columns(3)
    with col1:
        st.markdown("### 🎯 Step 1")
        st.write("Enter personal information (age, experience, country)")
    with col2:
        st.markdown("### 💼 Step 2")
        st.write("Add professional details (role, education, company)")
    with col3:
        st.markdown("### 🔮 Step 3")
        st.write("Click **'Predict Salary'** to see your estimate!")


# ============================================================================
# MAIN CONTENT AREA
# ============================================================================
profile, predict_button = render_sidebar()

# Header
# NOTE(review): any HTML wrappers around these two strings are absent in this
# copy of the file; only the visible text is reproduced.
st.markdown('\n💰 European Developer Salary Predictor\n', unsafe_allow_html=True)
st.markdown('\nSalary estimation for European software developers\n', unsafe_allow_html=True)

# Handle prediction
if predict_button:
    input_data = pd.DataFrame({column: [value] for column, value in profile.items()})
    prediction = model_pipeline.predict(input_data)[0]

    # Persist so results survive reruns triggered by other widgets.
    st.session_state['current_input'] = input_data
    st.session_state['current_prediction'] = prediction
    st.session_state['has_prediction'] = True
    # A stored what-if comparison was built from the previous profile.
    st.session_state['comparison_active'] = False

# Show results if a prediction exists, otherwise the instructions
if st.session_state.get('has_prediction', False):
    render_results(model_pipeline)
else:
    render_getting_started()