# -*- coding: utf-8 -*-
"""Untitled8.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1SnoorFAucvS1FXD1vzyJnJ-_hoZUfJ_u
"""

import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px

# Page configuration: browser-tab title and icon, plus a full-width layout.
# Issued at module level, ahead of every other Streamlit call in this file.
_PAGE_SETTINGS = {
    "page_title": "Developer Salary Explorer",
    "page_icon": "💻",
    "layout": "wide",
}
st.set_page_config(**_PAGE_SETTINGS)

@st.cache_data
def load_data():
    """Load and preprocess the Stack Overflow survey data.

    Reads ``stackoverflow_survey_single_response.txt`` as CSV, keeps only rows
    whose ``converted_comp_yearly`` is present and above 1000, and then:

    * coerces ``years_code``, ``years_code_pro`` and ``age`` to numbers and
      fills gaps with the column median;
    * adds ``experience_level`` (binned ``years_code_pro``);
    * collapses ``country`` to a fixed list of countries plus ``'Other'``;
    * adds ``education_level`` (readable name for ``ed_level``, ``'Other'``
      when unmapped).

    Returns:
        pandas.DataFrame: the cleaned rows. If anything goes wrong, the error
        is shown with ``st.error`` and an empty DataFrame is returned.
    """
    try:
        df = pd.read_csv('stackoverflow_survey_single_response.txt')

        # Fail with a readable message instead of a bare KeyError when the
        # file does not have the expected layout.
        required_cols = ['converted_comp_yearly', 'years_code', 'years_code_pro',
                         'age', 'country', 'ed_level']
        missing_cols = [col for col in required_cols if col not in df.columns]
        if missing_cols:
            raise ValueError(f"missing expected column(s): {', '.join(missing_cols)}")

        # Filter only rows with usable compensation data. Both conditions are
        # applied in one mask and the result is copied once, so the column
        # assignments below write to an independent frame.
        comp = df['converted_comp_yearly']
        df_clean = df[comp.notna() & (comp > 1000)].copy()

        # Handle missing values in numeric columns
        for col in ['years_code', 'years_code_pro', 'age']:
            df_clean[col] = pd.to_numeric(df_clean[col], errors='coerce')
            df_clean[col] = df_clean[col].fillna(df_clean[col].median())

        # Create experience levels. include_lowest keeps 0 years in the first
        # bin, and the open upper edge keeps very long careers in the last one
        # (otherwise both ends would silently become NaN).
        df_clean['experience_level'] = pd.cut(
            df_clean['years_code_pro'],
            bins=[0, 2, 5, 10, float('inf')],
            labels=['Junior (0-2 yrs)', 'Mid (3-5 yrs)', 'Senior (6-10 yrs)', 'Expert (10+ yrs)'],
            include_lowest=True
        )

        # Simplify country to major regions; anything else (including missing
        # values) becomes 'Other'.
        top_countries = ['United States of America', 'United Kingdom of Great Britain and Northern Ireland',
                        'Germany', 'India', 'Canada', 'France', 'Australia']
        df_clean['country'] = df_clean['country'].where(
            df_clean['country'].isin(top_countries), 'Other'
        )

        # Map education levels to readable names
        # NOTE(review): assumes ed_level is coded as the integers 1-5 -- confirm
        # against the data file; every other value ends up as 'Other'.
        education_map = {
            1: 'Less than Bachelor',
            2: 'Bachelor\'s Degree',
            3: 'Master\'s Degree',
            4: 'Doctoral Degree',
            5: 'Professional Degree'
        }
        df_clean['education_level'] = df_clean['ed_level'].map(education_map)
        df_clean['education_level'] = df_clean['education_level'].fillna('Other')

        return df_clean

    except Exception as e:
        # Best-effort: report the problem in the UI and let the caller deal
        # with an empty frame.
        # NOTE(review): because this function is wrapped in st.cache_data, the
        # empty result is presumably cached too -- clear the cache after fixing
        # the data file.
        st.error(f"Error loading data: {str(e)}")
        return pd.DataFrame()

def main():
    """Render the Developer Salary Explorer page.

    Loads the survey data, draws the sidebar filters (country, education
    level, years of professional experience) and, for the rows matching the
    filters, shows summary metrics, four charts and an optional data table.
    Returns early with a message when no data could be loaded or when no row
    matches the current filters.
    """
    st.title("💻 Developer Salary Explorer")
    st.markdown("Explore how country, education, and experience influence developer salaries worldwide.")

    # Load data
    df = load_data()

    if df.empty:
        st.error("No data loaded. Please check your data file.")
        return

    st.sidebar.header("🔍 Filter Data")

    # Country filter
    countries = sorted(df['country'].unique())
    selected_countries = st.sidebar.multiselect(
        "Select Countries:",
        options=countries,
        default=countries[:3]  # Default to first 3 countries
    )

    # Education level filter
    education_levels = sorted(df['education_level'].unique())
    selected_education = st.sidebar.multiselect(
        "Select Education Levels:",
        options=education_levels,
        default=education_levels
    )

    # Years of experience slider
    slider_min = int(df['years_code_pro'].min())
    slider_max = int(min(df['years_code_pro'].max(), 40))  # Cap at 40 for better UX
    # The preferred default window is 0-15 years, clamped to the slider bounds
    # so a data set with a narrower range does not produce an invalid default.
    default_low = min(max(0, slider_min), slider_max)
    default_high = max(min(15, slider_max), default_low)
    min_exp, max_exp = st.sidebar.slider(
        "Years of Professional Experience:",
        min_value=slider_min,
        max_value=slider_max,
        value=(default_low, default_high)
    )

    # Apply filters
    filtered_df = df[
        (df['country'].isin(selected_countries)) &
        (df['education_level'].isin(selected_education)) &
        (df['years_code_pro'] >= min_exp) &
        (df['years_code_pro'] <= max_exp)
    ]

    # Bail out before computing any statistic: on an empty selection the
    # median/mean/min/max are NaN and would be rendered as "$nan".
    if filtered_df.empty:
        st.warning("No data matches your filters. Please adjust your selection.")
        return

    # Display metrics
    st.header("📊 Key Metrics")

    salaries = filtered_df['converted_comp_yearly']
    col1, col2, col3, col4 = st.columns(4)

    with col1:
        st.metric("Median Salary", f"${salaries.median():,.0f}")

    with col2:
        st.metric("Average Salary", f"${salaries.mean():,.0f}")

    with col3:
        st.metric("Sample Size", f"{len(filtered_df):,}")

    with col4:
        salary_range = f"${salaries.min():,.0f} - ${salaries.max():,.0f}"
        st.metric("Salary Range", salary_range)

    # Visualizations
    st.header("📈 Salary Analysis")

    # 1. Salary by Country
    st.subheader("🌍 Salary by Country")
    country_stats = filtered_df.groupby('country')['converted_comp_yearly'].median().sort_values(ascending=False)
    fig1 = px.bar(
        x=country_stats.index,
        y=country_stats.values,
        title="Median Salary by Country",
        labels={'x': 'Country', 'y': 'Median Salary (USD)'}
    )
    st.plotly_chart(fig1, use_container_width=True)

    # 2. Salary by Education Level
    st.subheader("🎓 Salary by Education Level")
    fig2 = px.box(
        filtered_df,
        x='education_level',
        y='converted_comp_yearly',
        title="Salary Distribution by Education Level"
    )
    st.plotly_chart(fig2, use_container_width=True)

    # 3. Salary by Experience
    # NOTE: plotly's trendline option relies on the statsmodels package being
    # installed in the environment; it is not imported by this file.
    st.subheader("📅 Salary vs Experience")
    fig3 = px.scatter(
        filtered_df,
        x='years_code_pro',
        y='converted_comp_yearly',
        color='country',
        title="Salary Growth with Experience",
        trendline="lowess"
    )
    st.plotly_chart(fig3, use_container_width=True)

    # 4. Experience Level Analysis
    st.subheader("👨‍💻 Salary by Experience Level")
    # observed=False is spelled out so every experience level keeps its slot
    # on the axis (empty ones as gaps) without depending on the pandas default.
    exp_stats = filtered_df.groupby('experience_level', observed=False)['converted_comp_yearly'].median()
    fig4 = px.bar(
        x=exp_stats.index,
        y=exp_stats.values,
        title="Median Salary by Experience Level",
        labels={'x': 'Experience Level', 'y': 'Median Salary (USD)'}
    )
    st.plotly_chart(fig4, use_container_width=True)

    # Data Table
    st.header("📋 Detailed Data View")
    if st.checkbox("Show filtered data table"):
        display_cols = ['country', 'education_level', 'experience_level', 'years_code_pro', 'converted_comp_yearly']
        st.dataframe(
            filtered_df[display_cols].sort_values('converted_comp_yearly', ascending=False),
            use_container_width=True
        )

# Script entry point: build the page only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()