File size: 6,314 Bytes
5f19f8d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
# -*- coding: utf-8 -*-
"""Untitled8.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1SnoorFAucvS1FXD1vzyJnJ-_hoZUfJ_u
"""

import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px

# Page configuration. Streamlit requires this to be the first Streamlit
# command executed in the script, so it stays at module level ahead of main().
st.set_page_config(
    page_title="Developer Salary Explorer",
    # Restored from mis-encoded bytes ("πŸ’»") to the intended laptop emoji.
    page_icon="💻",
    layout="wide"
)

def _preprocess_survey(df):
    """Clean raw survey rows and derive the columns the dashboard plots.

    Args:
        df: Raw survey DataFrame. Must contain the columns
            ``converted_comp_yearly``, ``years_code``, ``years_code_pro``,
            ``age``, ``country`` and ``ed_level``.

    Returns:
        A new DataFrame restricted to rows whose yearly compensation is
        numeric and greater than 1000, with numeric gaps filled by the column
        median, ``country`` collapsed to a short list plus ``'Other'``, and
        two derived columns: ``experience_level`` (categorical) and
        ``education_level`` (string).

    Raises:
        KeyError: If a required column is missing.
    """
    # Coerce compensation first so stray non-numeric entries become NaN and
    # are dropped by the comparison below (NaN > 1000 is False) instead of
    # raising a TypeError.
    comp = pd.to_numeric(df['converted_comp_yearly'], errors='coerce')
    keep = comp > 1000
    clean = df.loc[keep].copy()
    clean['converted_comp_yearly'] = comp.loc[keep]

    # Handle missing values in numeric columns
    for col in ['years_code', 'years_code_pro', 'age']:
        clean[col] = pd.to_numeric(clean[col], errors='coerce')
        clean[col] = clean[col].fillna(clean[col].median())

    # Create experience levels. include_lowest=True keeps respondents with
    # exactly 0 professional years in the Junior bucket (the first interval is
    # otherwise open on the left), and the open-ended top bin keeps anyone
    # above 50 years in the "10+" bucket instead of leaving them NaN.
    clean['experience_level'] = pd.cut(
        clean['years_code_pro'],
        bins=[0, 2, 5, 10, float('inf')],
        labels=['Junior (0-2 yrs)', 'Mid (3-5 yrs)', 'Senior (6-10 yrs)', 'Expert (10+ yrs)'],
        include_lowest=True
    )

    # Simplify country to major regions; everything else (including missing
    # values) becomes 'Other'.
    top_countries = ['United States of America', 'United Kingdom of Great Britain and Northern Ireland',
                     'Germany', 'India', 'Canada', 'France', 'Australia']
    clean['country'] = clean['country'].where(clean['country'].isin(top_countries), 'Other')

    # Map education levels to readable names
    # NOTE(review): assumes ed_level is already encoded as integers 1-5;
    # any other value ends up as 'Other' — confirm against the data file.
    education_map = {
        1: 'Less than Bachelor',
        2: 'Bachelor\'s Degree',
        3: 'Master\'s Degree',
        4: 'Doctoral Degree',
        5: 'Professional Degree'
    }
    clean['education_level'] = clean['ed_level'].map(education_map).fillna('Other')

    return clean


@st.cache_data
def load_data():
    """Load and preprocess the Stack Overflow survey data.

    Reads ``stackoverflow_survey_single_response.txt`` from the working
    directory as CSV and runs it through ``_preprocess_survey``.

    Returns:
        The cleaned DataFrame, or an empty DataFrame if reading or
        preprocessing fails. Failures are reported with ``st.error`` rather
        than raised, so callers should check ``df.empty``.
    """
    try:
        raw = pd.read_csv('stackoverflow_survey_single_response.txt')
        return _preprocess_survey(raw)
    except Exception as e:
        # Deliberate catch-all at the UI boundary: any load/parse/schema
        # problem is shown to the user instead of crashing the app.
        # NOTE(review): the empty result is returned through st.cache_data
        # like any other value — confirm whether a failed load should be
        # retried on the next rerun.
        st.error(f"Error loading data: {str(e)}")
        return pd.DataFrame()

def main():
    """Render the Developer Salary Explorer dashboard.

    Loads the survey data, draws sidebar filters (country, education level,
    years of professional experience), and then shows key salary metrics,
    four Plotly charts and an optional table for the filtered rows.

    Returns early, after showing a message, when no data could be loaded or
    when the current filters match no rows.
    """
    st.title("💻 Developer Salary Explorer")
    st.markdown("Explore how country, education, and experience influence developer salaries worldwide.")

    # Load data
    df = load_data()

    if df.empty:
        st.error("No data loaded. Please check your data file.")
        return

    # Emoji in the UI strings below were restored from mis-encoded bytes.
    st.sidebar.header("🔍 Filter Data")

    # Country filter
    countries = sorted(df['country'].unique())
    selected_countries = st.sidebar.multiselect(
        "Select Countries:",
        options=countries,
        default=countries[:3]  # Default to first 3 countries
    )

    # Education level filter
    education_levels = sorted(df['education_level'].unique())
    selected_education = st.sidebar.multiselect(
        "Select Education Levels:",
        options=education_levels,
        default=education_levels
    )

    # Years of experience slider
    exp_floor = int(df['years_code_pro'].min())
    exp_ceiling = int(min(df['years_code_pro'].max(), 40))  # Cap at 40 for better UX
    # The preferred default window is 0-15 years, but the slider rejects a
    # default outside [min_value, max_value], so clamp it to the data's range.
    default_low = min(max(0, exp_floor), exp_ceiling)
    default_high = max(default_low, min(15, exp_ceiling))
    min_exp, max_exp = st.sidebar.slider(
        "Years of Professional Experience:",
        min_value=exp_floor,
        max_value=exp_ceiling,
        value=(default_low, default_high)
    )

    # Apply filters
    filtered_df = df[
        (df['country'].isin(selected_countries)) &
        (df['education_level'].isin(selected_education)) &
        (df['years_code_pro'] >= min_exp) &
        (df['years_code_pro'] <= max_exp)
    ]

    # Bail out before computing metrics: on an empty selection the median,
    # mean, min and max are all NaN and would be rendered as "$nan".
    sample_size = len(filtered_df)
    if sample_size == 0:
        st.warning("No data matches your filters. Please adjust your selection.")
        return

    # Display metrics
    st.header("📊 Key Metrics")

    salaries = filtered_df['converted_comp_yearly']
    col1, col2, col3, col4 = st.columns(4)

    with col1:
        st.metric("Median Salary", f"${salaries.median():,.0f}")

    with col2:
        st.metric("Average Salary", f"${salaries.mean():,.0f}")

    with col3:
        st.metric("Sample Size", f"{sample_size:,}")

    with col4:
        salary_range = f"${salaries.min():,.0f} - ${salaries.max():,.0f}"
        st.metric("Salary Range", salary_range)

    # Visualizations
    st.header("📈 Salary Analysis")

    # 1. Salary by Country
    st.subheader("🌍 Salary by Country")
    country_stats = filtered_df.groupby('country')['converted_comp_yearly'].median().sort_values(ascending=False)
    fig1 = px.bar(
        x=country_stats.index,
        y=country_stats.values,
        title="Median Salary by Country",
        labels={'x': 'Country', 'y': 'Median Salary (USD)'}
    )
    st.plotly_chart(fig1, use_container_width=True)

    # 2. Salary by Education Level
    st.subheader("🎓 Salary by Education Level")
    fig2 = px.box(
        filtered_df,
        x='education_level',
        y='converted_comp_yearly',
        title="Salary Distribution by Education Level"
    )
    st.plotly_chart(fig2, use_container_width=True)

    # 3. Salary by Experience
    st.subheader("📅 Salary vs Experience")
    fig3 = px.scatter(
        filtered_df,
        x='years_code_pro',
        y='converted_comp_yearly',
        color='country',
        title="Salary Growth with Experience",
        trendline="lowess"
    )
    st.plotly_chart(fig3, use_container_width=True)

    # 4. Experience Level Analysis
    st.subheader("👨‍💻 Salary by Experience Level")
    # experience_level is categorical; observed=False is stated explicitly so
    # every level keeps its slot (in category order) regardless of the pandas
    # default for this option.
    exp_stats = filtered_df.groupby('experience_level', observed=False)['converted_comp_yearly'].median()
    fig4 = px.bar(
        x=exp_stats.index,
        y=exp_stats.values,
        title="Median Salary by Experience Level"
    )
    st.plotly_chart(fig4, use_container_width=True)

    # Data Table
    st.header("📋 Detailed Data View")
    if st.checkbox("Show filtered data table"):
        display_cols = ['country', 'education_level', 'experience_level', 'years_code_pro', 'converted_comp_yearly']
        st.dataframe(
            filtered_df[display_cols].sort_values('converted_comp_yearly', ascending=False),
            use_container_width=True
        )

# Build the dashboard only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()