# Hugging Face Space: Peter512/developer-salary-predictor (status: Sleeping)
# File size: 26,799 Bytes

import streamlit as st
import pandas as pd
import numpy as np
import pickle
import shap
import matplotlib.pyplot as plt
import seaborn as sns
from streamlit_shap import st_shap

# Page configuration: this is the first Streamlit call made by the script.
_page_settings = {
    "page_title": "EU Developer Salary Predictor",
    "page_icon": "💰",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**_page_settings)

# Custom CSS for better styling.
# The stylesheet is injected as a raw <style> element through st.markdown
# (unsafe_allow_html=True lets the markup through). It defines the classes
# referenced by the HTML snippets emitted later in this script
# (main-header, sub-header, prediction-box, prediction-label,
# prediction-value) plus metric-card, a sidebar background rule and the
# colours/spacing of the tab strip.
st.markdown("""
    <style>
    .main-header {
        font-size: 2.8rem;
        font-weight: bold;
        color: #1f77b4;
        text-align: center;
        margin-bottom: 1rem;
    }
    .sub-header {
        font-size: 1.2rem;
        color: #666;
        text-align: center;
        margin-bottom: 2rem;
    }
    .prediction-box {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        padding: 2rem 2.5rem;
        border-radius: 15px;
        text-align: center;
        margin: 1.5rem 0;
        box-shadow: 0 10px 30px rgba(0,0,0,0.2);
    }
    .prediction-value {
        font-size: 3.5rem;
        font-weight: bold;
        color: white;
        text-shadow: 2px 2px 4px rgba(0,0,0,0.3);
        margin: 0.5rem 0;
    }
    .prediction-label {
        font-size: 1.1rem;
        color: rgba(255,255,255,0.95);
        margin-bottom: 0.5rem;
        font-weight: 500;
    }
    .metric-card {
        background-color: #f8f9fa;
        padding: 1rem;
        border-radius: 10px;
        border-left: 4px solid #1f77b4;
        margin: 0.5rem 0;
    }
    .sidebar .sidebar-content {
        background-color: #f8f9fa;
    }
    /* Ensure tab text is always visible */
    .stTabs [data-baseweb="tab-list"] button {
        color: #262730 !important;
    }
    .stTabs [data-baseweb="tab-list"] button[aria-selected="true"] {
        background-color: #1f77b4 !important;
        color: white !important;
    }
    .stTabs [data-baseweb="tab-list"] button[aria-selected="true"] p {
        color: white !important;
    }
    .stTabs [data-baseweb="tab-list"] {
        gap: 1rem;
        background-color: #f8f9fa;
        padding: 0.5rem;
        border-radius: 10px;
    }
    .stTabs [data-baseweb="tab"] {
        height: 50px;
        padding: 0 20px;
        border-radius: 8px;
        color: #262730;
        font-weight: 500;
    }
    .stTabs [data-baseweb="tab"]:hover {
        background-color: #e9ecef;
    }
    .stTabs [aria-selected="true"] {
        background-color: #1f77b4;
        color: white !important;
    }
    </style>
""", unsafe_allow_html=True)

# Load model and info
@st.cache_resource
def load_model():
    """Read the pickled salary pipeline and its metadata from disk.

    Both files are opened relative to the current working directory, one
    after the other. The function is wrapped by ``st.cache_resource``.

    Returns:
        tuple: ``(model, info)`` - the objects unpickled from
        ``salary_model.pkl`` and ``model_info.pkl`` respectively.

    NOTE: ``pickle.load`` can execute arbitrary code while loading, so both
    files must come from a trusted source.
    """
    artifacts = []
    for filename in ('salary_model.pkl', 'model_info.pkl'):
        with open(filename, 'rb') as handle:
            artifacts.append(pickle.load(handle))
    model, info = artifacts
    return model, info

# Load the artifacts at startup; on any failure show the error in the page
# and stop this script run instead of continuing without a model.
try:
    model_pipeline, model_info = load_model()
except Exception as load_error:
    st.error("❌ Error loading model: " + str(load_error))
    st.stop()

# Choices offered by the sidebar and what-if widgets. The selected strings are
# passed to the model pipeline unchanged, so they must stay exactly as written.

# Only countries using EUR or commonly reporting in EUR
COUNTRY_OPTIONS = [
    "Austria",
    "Belgium",
    "France",
    "Germany",
    "Ireland",
    "Italy",
    "Netherlands",
    "Portugal",
    "Spain",
]

# Education levels
ED_LEVEL_OPTIONS = [
    "Primary/elementary school",
    "Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)",
    "Some college/university study without earning a degree",
    "Associate degree (A.A., A.S., etc.)",
    "Bachelor's degree (B.A., B.S., B.Eng., etc.)",
    "Master's degree (M.A., M.S., M.Eng., MBA, etc.)",
    "Professional degree (JD, MD, Ph.D, Ed.D, etc.)",
    "Something else",
]

# Developer roles
DEV_TYPE_OPTIONS = [
    "Developer, back-end",
    "Developer, full-stack",
    "Developer, front-end",
    "Engineering manager",
    "Developer, desktop or enterprise applications",
    "Developer, mobile",
    "DevOps specialist",
    "Data scientist or machine learning specialist",
    "Data or business analyst",
    "System administrator",
    "Developer, QA or test",
    "Product manager",
    "Other",
]

# Organization size buckets, smallest first
ORG_SIZE_OPTIONS = [
    "2 to 9 employees",
    "10 to 19 employees",
    "20 to 99 employees",
    "100 to 499 employees",
    "500 to 999 employees",
    "1,000 to 4,999 employees",
    "5,000 to 9,999 employees",
    "10,000 or more employees",
]

# Work arrangements
REMOTE_WORK_OPTIONS = [
    "Hybrid (some remote, some in-person)",
    "Fully remote",
    "In-person",
]

# ============================================================================
# SIDEBAR - INPUT FORM
# ============================================================================
with st.sidebar:
    st.title("🎯 Developer Profile")
    st.markdown("---")

    # --- Personal details ---------------------------------------------------
    st.subheader("👤 Personal")

    # The widget returns the numeric code; the label is only for display.
    age_labels = {
        1: "18-24 years",
        2: "25-34 years",
        3: "35-44 years",
        4: "45-54 years",
        5: "55+ years",
    }
    age_group = st.selectbox(
        "Age Group",
        options=list(age_labels),
        format_func=lambda code: age_labels[code],
        key="age",
    )

    years_code_pro = st.slider("Years of Experience", min_value=0, max_value=40, value=5, key="years")

    country = st.selectbox("Country", options=COUNTRY_OPTIONS, key="country")

    st.markdown("---")

    # --- Professional details -----------------------------------------------
    st.subheader("💼 Professional")

    dev_type = st.selectbox("Developer Type", options=DEV_TYPE_OPTIONS, key="dev_type")

    ed_level = st.selectbox("Education Level", options=ED_LEVEL_OPTIONS, key="ed_level")

    org_size = st.selectbox("Organization Size", options=ORG_SIZE_OPTIONS, key="org_size")

    remote_work = st.selectbox("Work Arrangement", options=REMOTE_WORK_OPTIONS, key="remote")

    st.markdown("---")

    # --- Additional yes/no flags --------------------------------------------
    st.subheader("⚙️ Additional")

    so_account = st.checkbox("Stack Overflow Account", value=True, key="so")

    ai_select = st.checkbox("Uses AI Tools", value=True, key="ai")

    st.markdown("---")

    # True only on the script run triggered by clicking the button
    predict_button = st.button(
        "🔮 Predict Salary",
        type="primary",
        use_container_width=True,
        key="predict_btn",
    )

    st.markdown("---")

    # Static model fact sheet at the bottom of the sidebar
    st.subheader("📊 Model Details")
    st.markdown("""
    - **Data Source**: Stack Overflow 2024 Survey
    - **Sample**: 7,000+ European developers
    - **Algorithm**: Optimized Random Forest
    - **Accuracy**: RMSE ~€18,600
    - **Last Updated**: 2025
    """)

# ============================================================================
# MAIN CONTENT AREA
# ============================================================================

# Header: title and subtitle, styled by the main-header / sub-header CSS classes
for css_class, text in (
    ("main-header", "💰 European Developer Salary Predictor"),
    ("sub-header", "Salary estimation for European software developers"),
):
    st.markdown(f'<div class="{css_class}">{text}</div>', unsafe_allow_html=True)

# Handle prediction
if predict_button:
    # Single-row frame holding the sidebar selections, one column per feature
    input_data = pd.DataFrame({
        'age_group': [age_group],
        'years_code_pro': [years_code_pro],
        'remote_work': [remote_work],
        'ed_level': [ed_level],
        'dev_type': [dev_type],
        'org_size': [org_size],
        'country': [country],
        'so_account': [so_account],
        'ai_select': [ai_select]
    })

    # predict() returns one value per row; there is exactly one row here
    prediction = model_pipeline.predict(input_data)[0]

    # Persist the result so it survives the reruns triggered by other widgets
    st.session_state['current_input'] = input_data
    st.session_state['current_prediction'] = prediction
    st.session_state['has_prediction'] = True

    # A new baseline invalidates any what-if comparison stored earlier: its
    # 'modified_input' was derived from the previous profile, so leaving the
    # flag set would compare a stale scenario against the new prediction.
    st.session_state['comparison_active'] = False

# Show results if prediction exists
if st.session_state.get('has_prediction', False):
    prediction = st.session_state['current_prediction']

    # Headline figure, rendered inside the gradient "prediction-box"
    st.markdown(f"""
        <div class="prediction-box">
            <div class="prediction-label">Predicted Annual Salary</div>
            <div class="prediction-value">€{prediction:,.0f}</div>
        </div>
    """, unsafe_allow_html=True)

    # The same amount divided per period: 12 months, 52 weeks, 2080 hours
    period_divisors = (
        ("💰 Annual", 1),
        ("📅 Monthly", 12),
        ("📆 Weekly", 52),
        ("⏰ Hourly", 2080),
    )
    for column, (label, divisor) in zip(st.columns(4), period_divisors):
        with column:
            st.metric(label, f"€{prediction/divisor:,.0f}")

    st.markdown("---")
    
    # Tabs for detailed analysis
    tab1, tab2, tab3 = st.tabs(["📊 Model Insights", "🔄 What-If Analysis", "ℹ️ About Prediction"])

    # TAB 1: MODEL INSIGHTS
    with tab1:
        st.header("🧠 Understanding Your Prediction")
        st.write("See which factors had the biggest impact on your predicted salary.")

        input_data = st.session_state['current_input']

        # SHAP explains the regressor alone, so the input is first run through
        # the pipeline's own preprocessing step.
        preprocessor = model_pipeline.named_steps['preprocessor']
        model = model_pipeline.named_steps['regressor']

        X_transformed = preprocessor.transform(input_data)
        # A preprocessor may return a SciPy sparse matrix; indexing row 0 of
        # that gives a 1xN matrix rather than a flat feature vector, so
        # convert to a dense array before it is indexed below.
        if hasattr(X_transformed, "toarray"):
            X_transformed = X_transformed.toarray()
        feature_names = list(preprocessor.get_feature_names_out())

        # Create SHAP explainer
        with st.spinner("Calculating feature impacts..."):
            explainer = shap.TreeExplainer(model)
            shap_values = explainer.shap_values(X_transformed)

            # expected_value may be a scalar or an array; take its first
            # element either way (np.ravel also copes with 0-d arrays).
            expected_value = float(np.ravel(explainer.expected_value)[0])

        st.subheader("🎯 Feature Impact Visualization")
        st.write(f"**Base salary** (average): €{expected_value:,.0f}")
        st.write("Features in **red** increase your salary. Features in **blue** decrease it.")

        # Force plot for the single explained row
        st_shap(shap.force_plot(
            expected_value,
            shap_values[0],
            X_transformed[0],
            feature_names=feature_names
        ))

        st.markdown("---")

        # Feature contribution chart (wide column) and table (narrow column)
        col1, col2 = st.columns([2, 1])

        with col1:
            st.subheader("📈 Top Contributing Factors")
            
            # Create feature mapping WITHOUT emojis for chart
            def clean_feature_name(feature):
                """Convert a technical feature name to a user-friendly label WITHOUT emojis.

                Args:
                    feature: Feature name, optionally starting with one of the
                        transformer prefixes ``cat__``, ``num__`` or
                        ``remainder__``.

                Returns:
                    str: The mapped label for the known plain features, a
                    ``"<Group>: <value>"`` label for names starting with a
                    known categorical prefix, otherwise the name with
                    underscores turned into spaces and title-cased.
                """
                # Drop the transformer prefix, but only at the very start so a
                # category value containing the same text is left intact.
                for step_prefix in ('cat__', 'num__', 'remainder__'):
                    if feature.startswith(step_prefix):
                        feature = feature[len(step_prefix):]
                        break

                # Simple mappings
                simple_map = {
                    'years_code_pro': 'Years of Experience',
                    'age_group': 'Age Group',
                    'so_account': 'Stack Overflow Account',
                    'ai_select': 'Uses AI Tools'
                }

                if feature in simple_map:
                    return simple_map[feature]

                # Handle categorical variables
                replacements = {
                    'country_': 'Country: ',
                    'remote_work_': 'Work: ',
                    'dev_type_': 'Role: ',
                    'org_size_': 'Company Size: ',
                    'ed_level_': 'Education: '
                }

                # Match on the leading column name only: a substring test would
                # mislabel a feature whose category value happens to contain
                # another column's prefix.
                for prefix, label in replacements.items():
                    if feature.startswith(prefix):
                        return label + feature[len(prefix):].replace('_', ' ')

                # Fallback
                return feature.replace('_', ' ').title()
            
            # Create emoji version for the table only
            def clean_feature_name_with_emoji(feature):
                """Return the user-friendly label for ``feature`` WITH a leading emoji.

                The label comes from ``clean_feature_name``; the first marker
                below that occurs in it decides the emoji. Labels matching no
                marker are returned without an emoji.
                """
                label = clean_feature_name(feature)

                # Checked in order - the first marker found in the label wins
                emoji_by_marker = (
                    ('Years of Experience', '⏱️ '),
                    ('Age Group', '👤 '),
                    ('Country:', '🌍 '),
                    ('Work:', '🏠 '),
                    ('Role:', '💻 '),
                    ('Company Size:', '🏢 '),
                    ('Education:', '🎓 '),
                    ('Stack Overflow', '📚 '),
                    ('AI Tools', '🤖 '),
                )
                for marker, emoji in emoji_by_marker:
                    if marker in label:
                        return emoji + label

                return label
            
            # One row per transformed feature with its SHAP contribution for
            # the single explained sample; keep the 10 largest by magnitude.
            shap_df = pd.DataFrame({
                'Feature': feature_names,
                'SHAP Value': shap_values[0],
                'Impact': ['⬆️ Increases' if x > 0 else '⬇️ Decreases' for x in shap_values[0]]
            })
            shap_df['Abs SHAP'] = shap_df['SHAP Value'].abs()
            shap_df = shap_df.sort_values('Abs SHAP', ascending=False).head(10)

            # Clean feature names - NO emojis for chart, WITH emojis for table
            shap_df['Feature_Clean'] = shap_df['Feature'].apply(clean_feature_name)
            shap_df['Feature_Clean_Emoji'] = shap_df['Feature'].apply(clean_feature_name_with_emoji)

            # Create visualization with improved styling
            fig, ax = plt.subplots(figsize=(10, 6))

            # Green for positive contributions, red for negative ones
            colors = ['#10b981' if x > 0 else '#ef4444' for x in shap_df['SHAP Value']]

            # Create bars
            ax.barh(range(len(shap_df)), shap_df['SHAP Value'], color=colors, alpha=0.85, height=0.7)

            # Add value labels on bars
            max_abs_value = shap_df['Abs SHAP'].max()

            for i, value in enumerate(shap_df['SHAP Value']):
                abs_value = abs(value)

                # For large bars (>30% of max), place label inside
                # For small bars, place label outside
                if abs_value > max_abs_value * 0.3:
                    # Inside the bar
                    x_pos = value / 2
                    color = 'white'
                    ha = 'center'
                else:
                    # Outside the bar
                    offset = max_abs_value * 0.05  # 5% of max value as offset
                    x_pos = value + (offset if value > 0 else -offset)
                    color = '#10b981' if value > 0 else '#ef4444'
                    ha = 'left' if value > 0 else 'right'

                ax.text(x_pos, i, f'€{abs_value:,.0f}',
                    ha=ha, va='center',
                    fontweight='bold', fontsize=10,
                    color=color)

            # Set labels with cleaned names WITHOUT EMOJIS
            ax.set_yticks(range(len(shap_df)))
            ax.set_yticklabels(shap_df['Feature_Clean'], fontsize=10)
            ax.set_xlabel('Impact on Salary (EUR)', fontsize=11, fontweight='bold')
            ax.set_title('How Different Factors Affect Your Salary',
                        fontsize=13, fontweight='bold', pad=20)

            # Add zero line
            ax.axvline(x=0, color='#64748b', linestyle='-', linewidth=2, alpha=0.5)

            # Add legend (Patch is imported here because this is its only use)
            from matplotlib.patches import Patch
            legend_elements = [
                Patch(facecolor='#10b981', alpha=0.85, label='Increases Salary'),
                Patch(facecolor='#ef4444', alpha=0.85, label='Decreases Salary')
            ]
            ax.legend(handles=legend_elements, loc='upper right', frameon=True,
                    fancybox=True, shadow=True, fontsize=10)

            # Styling
            ax.grid(axis='x', alpha=0.2, linestyle='--')
            ax.set_facecolor('#f8fafc')
            fig.patch.set_facecolor('white')
            ax.spines['top'].set_visible(False)
            ax.spines['right'].set_visible(False)

            plt.tight_layout()
            st.pyplot(fig)
            # pyplot keeps every figure it creates until it is closed, and
            # this code runs again on each rerun - close it once rendered so
            # figures do not pile up in memory.
            plt.close(fig)

            # Add explanation box with improved text
            st.info("""
            **💡 How to read this chart:**
            - **Green bars** pointing right → These factors *increase* your salary
            - **Red bars** pointing left → These factors *decrease* your salary  
            - **Longer bars** = Bigger impact on your predicted salary
            - **Why do I see other countries/categories I didn't select?** The chart shows the top 10 most impactful features for your prediction. When you see a **red bar** for a category you *didn't* select (like other countries), it means "not having this characteristic lowers your salary compared to having it." For example, if you see "Country: Germany" with a **red bar showing €1,059**, it means being from Germany would have added €1,059 to your salary compared to your current country.
            """)

        with col2:
            st.subheader("📋 Impact Details")

            def _signed_euros(amount):
                """Format a contribution as +€N when positive, otherwise -€N."""
                if amount > 0:
                    return f"+€{amount:,.0f}"
                return f"-€{abs(amount):,.0f}"

            # Table view of the same top factors, using the emoji labels
            impact_table = (
                shap_df[['Feature_Clean_Emoji', 'SHAP Value']]
                .rename(columns={
                    'Feature_Clean_Emoji': 'Factor',
                    'SHAP Value': 'Impact Amount',
                })
                .reset_index(drop=True)
            )
            impact_table['Impact Amount'] = impact_table['Impact Amount'].map(_signed_euros)

            st.dataframe(impact_table, use_container_width=True, hide_index=True)
    
    # TAB 2: WHAT-IF ANALYSIS
    with tab2:
        st.header("🔄 What-If Scenario Analysis")
        st.write("Explore how changing different factors affects your predicted salary.")

        # Baseline: the stored input and prediction from the last "Predict" click
        original_input = st.session_state['current_input'].copy()
        original_prediction = st.session_state['current_prediction']

        # Display label for every factor that can be varied (insertion order is
        # the order shown in the selector)
        factor_labels = {
            'years_code_pro': '⏱️ Years of Experience',
            'country': '🌍 Country',
            'remote_work': '🏠 Work Arrangement',
            'dev_type': '💻 Developer Type',
            'org_size': '🏢 Organization Size',
            'ed_level': '🎓 Education Level'
        }

        # Categorical factors: (widget label, available options, widget key)
        categorical_choices = {
            'country': ("New country", COUNTRY_OPTIONS, "what_if_country"),
            'remote_work': ("New work arrangement", REMOTE_WORK_OPTIONS, "what_if_remote"),
            'dev_type': ("New developer type", DEV_TYPE_OPTIONS, "what_if_dev"),
            'org_size': ("New org size", ORG_SIZE_OPTIONS, "what_if_org"),
            'ed_level': ("New education level", ED_LEVEL_OPTIONS, "what_if_ed"),
        }

        col1, col2 = st.columns([1, 2])

        with col1:
            st.subheader("🎛️ Modify Factor")

            feature_to_change = st.selectbox(
                "Select factor to modify",
                options=list(factor_labels),
                format_func=lambda name: factor_labels[name]
            )

            modified_input = original_input.copy()
            current_value = original_input[feature_to_change].values[0]

            if feature_to_change == 'years_code_pro':
                new_value = st.slider(
                    "New years of experience",
                    min_value=0, max_value=40,
                    value=int(current_value),
                    key="what_if_years"
                )
            else:
                widget_label, options, widget_key = categorical_choices[feature_to_change]
                # Start from the value used for the prediction, as the slider
                # above does, instead of always starting at the first option.
                start_index = options.index(current_value) if current_value in options else 0
                new_value = st.selectbox(widget_label, options, index=start_index, key=widget_key)

            modified_input[feature_to_change] = new_value

            # The comparison is only (re)computed from what was stored on click
            if st.button("🔄 Compare Scenarios", use_container_width=True):
                st.session_state['comparison_active'] = True
                st.session_state['modified_input'] = modified_input

        with col2:
            if st.session_state.get('comparison_active', False):
                modified_input = st.session_state['modified_input']
                modified_prediction = model_pipeline.predict(modified_input)[0]
                difference = modified_prediction - original_prediction
                # Avoid dividing by a zero baseline (would yield inf/nan)
                if original_prediction:
                    percent_change = (difference / original_prediction) * 100
                else:
                    percent_change = 0.0

                st.subheader("📊 Comparison Results")

                # Visual comparison: baseline in blue, scenario green/red by direction
                fig, ax = plt.subplots(figsize=(10, 5))
                scenarios = ['Current\nProfile', 'Modified\nProfile']
                salaries = [original_prediction, modified_prediction]
                colors = ['#3498db', '#e74c3c' if difference < 0 else '#2ecc71']

                bars = ax.bar(scenarios, salaries, color=colors, alpha=0.7, width=0.6)

                # Add value labels on bars
                for bar, salary in zip(bars, salaries):
                    height = bar.get_height()
                    ax.text(bar.get_x() + bar.get_width()/2., height,
                           f'€{salary:,.0f}',
                           ha='center', va='bottom', fontweight='bold', fontsize=12)

                ax.set_ylabel('Annual Salary (EUR)', fontweight='bold', fontsize=11)
                ax.set_title('Salary Comparison', fontweight='bold', fontsize=13, pad=20)
                ax.grid(axis='y', alpha=0.3)

                plt.tight_layout()
                st.pyplot(fig)
                # Close the figure once rendered; otherwise every rerun leaves
                # another open figure registered with pyplot.
                plt.close(fig)

                # Summary metrics
                col_a, col_b, col_c = st.columns(3)
                with col_a:
                    st.metric("Current Salary", f"€{original_prediction:,.0f}")
                with col_b:
                    st.metric("Modified Salary", f"€{modified_prediction:,.0f}")
                with col_c:
                    st.metric("Difference", f"€{abs(difference):,.0f}",
                             f"{percent_change:+.1f}%")

                # Interpretation
                if difference > 0:
                    st.success(f"✅ This change would **increase** your salary by €{difference:,.0f} ({percent_change:.1f}%)")
                elif difference < 0:
                    st.error(f"⚠️ This change would **decrease** your salary by €{abs(difference):,.0f} ({percent_change:.1f}%)")
                else:
                    st.info("➡️ This change has **no significant impact** on salary")
            else:
                st.info("👈 Select a factor to modify and click 'Compare Scenarios' to see the impact")
    
    # TAB 3: ABOUT PREDICTION
    with tab3:
        st.header("ℹ️ About This Prediction")

        col1, col2 = st.columns(2)

        with col1:
            st.subheader("📊 Model Information")
            st.markdown("""
            - **Algorithm**: Random Forest Regressor (Optimized)
            - **Training Data**: Stack Overflow 2024 Developer Survey
            - **Sample Size**: 7,000+ European developers
            - **Model Accuracy**: RMSE ≈ €18,600
            - **Features Used**: 9 key factors
            """)

            st.subheader("🎯 Prediction Confidence")
            st.info("This model performs best for developers with 0-20 years of experience in Eurozone countries. Average prediction error: ±€18,600")

        with col2:
            st.subheader("📋 Your Profile Summary")

            # Summarise the profile that produced the displayed prediction.
            # Reading the live sidebar widgets instead would show values that
            # no longer match the prediction once the user edits the sidebar
            # without clicking "Predict" again.
            profile = st.session_state['current_input'].iloc[0]
            age_labels = {1: "18-24", 2: "25-34", 3: "35-44", 4: "45-54", 5: "55+"}
            ed_level_text = str(profile['ed_level'])

            profile_data = {
                'Factor': ['Age Group', 'Experience', 'Country', 'Developer Type',
                          'Education', 'Org Size', 'Work Arrangement', 'SO Account', 'Uses AI'],
                'Value': [
                    age_labels[int(profile['age_group'])],
                    f"{int(profile['years_code_pro'])} years",
                    profile['country'],
                    profile['dev_type'],
                    # Long education labels are cut to 30 characters
                    ed_level_text[:30] + "..." if len(ed_level_text) > 30 else ed_level_text,
                    profile['org_size'],
                    profile['remote_work'],
                    "Yes" if profile['so_account'] else "No",
                    "Yes" if profile['ai_select'] else "No"
                ]
            }

            st.dataframe(
                pd.DataFrame(profile_data),
                use_container_width=True,
                hide_index=True
            )

        st.markdown("---")
        st.warning("""
        **⚠️ Important Disclaimer**: This prediction is an **estimate** based on historical survey data. 
        Actual salaries can vary significantly based on:
        - Specific technical skills and expertise
        - Company size, stage, and funding
        - Individual negotiation and performance
        - Local market conditions and demand
        - Benefits, equity, and other compensation
        
        Use this tool as a **reference point**, not a definitive salary expectation.
        """)

else:
    # Only show the instructions when no prediction has been made
    # This branch only runs when 'has_prediction' is unset/False, so the
    # onboarding hints are rendered directly.
    st.markdown("---")
    st.info("👈 **Get Started**: Fill in your profile in the sidebar and click **'Predict Salary'** to see your results!")

    # One column per onboarding step: (heading, instruction)
    onboarding_steps = (
        ("### 🎯 Step 1", "Enter personal information (age, experience, country)"),
        ("### 💼 Step 2", "Add professional details (role, education, company)"),
        ("### 🔮 Step 3", "Click **'Predict Salary'** to see your estimate!"),
    )
    for column, (heading, instruction) in zip(st.columns(3), onboarding_steps):
        with column:
            st.markdown(heading)
            st.write(instruction)