File size: 6,681 Bytes
fc375a7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36891cf
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def load_data(path='insurance.csv'):
    """Load the insurance dataset and add derived categorical features.

    Args:
        path: Location of the CSV file, passed straight to
            ``pandas.read_csv``. Defaults to ``'insurance.csv'`` (resolved
            against the current working directory).

    Returns:
        The DataFrame read from ``path`` with two extra categorical columns:

        * ``age_group`` -- built from the ``age`` column:
          ``(0, 25]`` Young Adult, ``(25, 35]`` Adult,
          ``(35, 50]`` Middle Age, ``(50, inf)`` Senior.
        * ``bmi_category`` -- built from the ``bmi`` column:
          ``[0, 18.5)`` Underweight, ``[18.5, 25)`` Normal,
          ``[25, 30)`` Overweight, ``[30, inf)`` Obese.
    """
    df = pd.read_csv(path)

    # Age bands are right-closed; the last band is open-ended so that very
    # old ages still get a label instead of NaN.
    df['age_group'] = pd.cut(
        df['age'],
        bins=[0, 25, 35, 50, float('inf')],
        labels=['Young Adult', 'Adult', 'Middle Age', 'Senior'],
    )

    # BMI bands are left-closed with cut-offs at 18.5 / 25 / 30 so boundary
    # values fall into the expected class (18.5 -> Normal, 24.95 -> Normal,
    # 30 -> Obese). Right-closed bins at 24.9 / 29.9 would push values such
    # as 24.95 or 29.95 one class too high.
    df['bmi_category'] = pd.cut(
        df['bmi'],
        bins=[0, 18.5, 25, 30, float('inf')],
        labels=['Underweight', 'Normal', 'Overweight', 'Obese'],
        right=False,
    )

    return df

def _is_numeric(series):
    """Return True when *series* has a numeric dtype according to pandas."""
    return pd.api.types.is_numeric_dtype(series)


def _new_axes():
    """Create a 10x6 inch Matplotlib figure and return ``(fig, ax)``."""
    return plt.subplots(figsize=(10, 6))


def _show_axes(fig, ax, title):
    """Title the axes, tilt the x tick labels, render the figure, then free it."""
    ax.set_title(title)
    ax.tick_params(axis='x', labelrotation=45)
    st.pyplot(fig)
    # Close explicitly: figures made through pyplot stay registered until
    # closed, and the script body runs again on every widget interaction.
    plt.close(fig)


def _render_distribution_tab(df, primary_var, primary_is_numeric):
    """Draw the histogram/count plot and the box plot for the primary variable."""
    col1, col2 = st.columns(2)

    with col1:
        # Histogram for numeric variables, category counts otherwise.
        fig, ax = _new_axes()
        if primary_is_numeric:
            sns.histplot(data=df, x=primary_var, hue='smoker', ax=ax)
        else:
            sns.countplot(data=df, x=primary_var, hue='smoker', ax=ax)
        _show_axes(fig, ax, f'Distribution of {primary_var} by Smoking Status')

    with col2:
        # Box plot: a numeric variable is split by region; a categorical
        # variable is used as the grouping axis for charges instead.
        fig, ax = _new_axes()
        if primary_is_numeric:
            sns.boxplot(data=df, y=primary_var, x='region', hue='smoker', ax=ax)
            title = f'{primary_var} Distribution by Region'
        else:
            sns.boxplot(data=df, y='charges', x=primary_var, hue='smoker', ax=ax)
            title = f'Charges Distribution by {primary_var}'
        _show_axes(fig, ax, title)


def _render_relationship_tab(df, primary_var, secondary_var, primary_is_numeric):
    """Draw the primary-vs-secondary scatter plot and the split violin plot."""
    col1, col2 = st.columns(2)

    with col1:
        # A scatter plot only makes sense when both axes are numeric.
        if primary_is_numeric and _is_numeric(df[secondary_var]):
            fig = px.scatter(df, x=primary_var, y=secondary_var,
                             color='smoker',
                             title=f'Relationship between {primary_var} and {secondary_var}')
            st.plotly_chart(fig, use_container_width=True)
        else:
            st.write("Cannot create scatter plot for categorical variables")

    with col2:
        # Violin plot split by smoking status.
        fig, ax = _new_axes()
        if primary_is_numeric:
            sns.violinplot(data=df, y=primary_var, x='region', hue='smoker',
                           split=True, ax=ax)
            title = f'{primary_var} Distribution by Region and Smoking Status'
        else:
            sns.violinplot(data=df, y='charges', x=primary_var, hue='smoker',
                           split=True, ax=ax)
            title = f'Charges Distribution by {primary_var} and Smoking Status'
        _show_axes(fig, ax, title)


def _render_categorical_tab(df, primary_var, primary_is_numeric):
    """Draw the correlation heatmap and the per-category (or per-region) chart."""
    col1, col2 = st.columns(2)

    with col1:
        # Pairwise correlation of all numeric columns.
        correlation = df.select_dtypes(include='number').corr()
        fig = px.imshow(correlation,
                        title="Correlation Heatmap",
                        labels=dict(color="Correlation"))
        st.plotly_chart(fig, use_container_width=True)

    with col2:
        if not primary_is_numeric:
            # Bar plot of charges per category, split by smoker.
            fig, ax = _new_axes()
            sns.barplot(data=df, x=primary_var, y='charges', hue='smoker', ax=ax)
            _show_axes(fig, ax, f'Average Charges by {primary_var} and Smoking Status')
        else:
            # Numeric primary variable: interactive box plot by region.
            fig = px.box(df, x='region', y=primary_var, color='smoker',
                         title=f'{primary_var} Distribution by Region and Smoking Status')
            st.plotly_chart(fig, use_container_width=True)


def _render_insights(df, primary_var, primary_is_numeric):
    """Show the optional summary-statistics and cross-analysis tables."""
    if st.checkbox("Show Summary Statistics"):
        if primary_is_numeric:
            summary = df.groupby('region')[primary_var].describe()
            st.write(f"Summary Statistics for {primary_var} by Region:")
        else:
            summary = df.groupby(primary_var)['charges'].describe()
            st.write(f"Charges Summary Statistics by {primary_var}:")
        st.dataframe(summary)

    if st.checkbox("Show Cross Analysis"):
        if not primary_is_numeric:
            cross_analysis = pd.crosstab(df[primary_var], df['region'], margins=True)
            st.write(f"Cross Analysis of {primary_var} by Region:")
            st.dataframe(cross_analysis)
        else:
            grouped_analysis = df.groupby('region')[primary_var].agg(
                ['mean', 'median', 'std', 'count'])
            st.write(f"Grouped Analysis of {primary_var} by Region:")
            st.dataframe(grouped_analysis)


def main():
    """Build the insurance analysis dashboard.

    Configures the page, loads the dataset through ``load_data()``, reads the
    primary/secondary variable choices from the sidebar and renders three
    tabs of charts followed by two optional tables. Returns nothing; all
    output goes through Streamlit calls.
    """
    st.set_page_config(layout="wide")

    # Custom styling
    st.markdown("""
        <style>
        .main {
            background-color: #f5f5f5;
        }
        .stButton>button {
            background-color: #4CAF50;
            color: white;
            border-radius: 5px;
        }
        </style>
    """, unsafe_allow_html=True)

    st.title("🏥 Advanced Insurance Data Analysis Dashboard")

    # Load data
    df = load_data()

    # Sidebar for analysis controls
    st.sidebar.header("Analysis Controls")

    primary_var = st.sidebar.selectbox(
        "Select Primary Variable",
        ['age', 'bmi', 'children', 'charges', 'region', 'smoker', 'sex']
    )

    # 'charges' is offered first (and is therefore the default) unless it is
    # already the primary variable, in which case 'age' leads. Filtering one
    # list guarantees no option appears twice.
    secondary_options = [
        var
        for var in ['charges', 'age', 'bmi', 'children', 'region', 'smoker', 'sex']
        if var != primary_var
    ]
    secondary_var = st.sidebar.selectbox(
        "Select Secondary Variable",
        secondary_options
    )

    # Every section branches on this, so compute it once.
    primary_is_numeric = _is_numeric(df[primary_var])

    # Advanced Analysis Section
    st.header("📊 Advanced Data Analysis")

    tab1, tab2, tab3 = st.tabs(["Distribution Analysis", "Relationship Analysis", "Categorical Insights"])

    with tab1:
        _render_distribution_tab(df, primary_var, primary_is_numeric)

    with tab2:
        _render_relationship_tab(df, primary_var, secondary_var, primary_is_numeric)

    with tab3:
        _render_categorical_tab(df, primary_var, primary_is_numeric)

    # Detailed Insights Section
    st.header("🔍 Detailed Insights")
    _render_insights(df, primary_var, primary_is_numeric)

# Build the dashboard only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()