import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
def load_data():
    """Load the insurance dataset and attach derived categorical columns.

    Reads ``insurance.csv`` from the current working directory and adds two
    columns:

    * ``age_group`` -- ``age`` bucketed into (0, 25], (25, 35], (35, 50]
      and (50, inf), labelled 'Young Adult', 'Adult', 'Middle Age', 'Senior'.
    * ``bmi_category`` -- ``bmi`` bucketed into [0, 18.5), [18.5, 25),
      [25, 30) and [30, inf), labelled 'Underweight', 'Normal',
      'Overweight', 'Obese'.

    Returns:
        pandas.DataFrame: the CSV contents plus the two derived columns.
    """
    df = pd.read_csv('insurance.csv')

    # An open-ended top bin keeps out-of-range values from silently
    # turning into NaN.
    unbounded = float('inf')

    df['age_group'] = pd.cut(
        df['age'],
        bins=[0, 25, 35, 50, unbounded],
        labels=['Young Adult', 'Adult', 'Middle Age', 'Senior'],
    )

    # Left-closed bins with cut points at 18.5 / 25 / 30 so that, e.g.,
    # a BMI of 18.5 is 'Normal' and 24.95 is still 'Normal' rather than
    # falling into the next category.
    df['bmi_category'] = pd.cut(
        df['bmi'],
        bins=[0, 18.5, 25, 30, unbounded],
        labels=['Underweight', 'Normal', 'Overweight', 'Obese'],
        right=False,
    )
    return df
# Variables offered in the sidebar selectors, in display order.
_ANALYSIS_VARIABLES = ['age', 'bmi', 'children', 'charges', 'region', 'smoker', 'sex']


def _is_numeric(df, column):
    """Return True when ``df[column]`` has a numeric dtype."""
    return pd.api.types.is_numeric_dtype(df[column])


def _secondary_options(primary_var):
    """Return the secondary-variable choices for the given primary variable.

    Every analysis variable except ``primary_var`` is offered exactly once,
    with ``'charges'`` listed first whenever it is available.
    """
    remaining = [name for name in _ANALYSIS_VARIABLES if name != primary_var]
    # sorted() is stable, so only 'charges' moves (to the front).
    return sorted(remaining, key=lambda name: name != 'charges')


def _seaborn_chart(plot_func, title, **plot_kwargs):
    """Draw one seaborn chart on a fresh figure, show it, then close it.

    Args:
        plot_func: seaborn axes-level plotting function (e.g. ``sns.boxplot``).
        title: title placed on the axes.
        **plot_kwargs: forwarded to ``plot_func`` (``data``, ``x``, ``y``, ...).
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        plot_func(ax=ax, **plot_kwargs)
        ax.set_title(title)
        ax.tick_params(axis='x', labelrotation=45)
        st.pyplot(fig)
    finally:
        # Streamlit reruns the script on every interaction; figures that
        # are never closed would otherwise accumulate in memory.
        plt.close(fig)


def _render_distribution_tab(df, primary_var, numeric):
    """Render the 'Distribution Analysis' tab: histogram/count plot and box plot."""
    left, right = st.columns(2)

    with left:
        # Histogram for numeric variables, bar counts for categorical ones.
        plot_func = sns.histplot if numeric else sns.countplot
        _seaborn_chart(
            plot_func,
            f'Distribution of {primary_var} by Smoking Status',
            data=df, x=primary_var, hue='smoker',
        )

    with right:
        if numeric:
            _seaborn_chart(
                sns.boxplot,
                f'{primary_var} Distribution by Region',
                data=df, y=primary_var, x='region', hue='smoker',
            )
        else:
            _seaborn_chart(
                sns.boxplot,
                f'Charges Distribution by {primary_var}',
                data=df, y='charges', x=primary_var, hue='smoker',
            )


def _render_relationship_tab(df, primary_var, secondary_var, numeric):
    """Render the 'Relationship Analysis' tab: scatter plot and violin plot."""
    left, right = st.columns(2)

    with left:
        # A scatter plot only makes sense when both axes are numeric.
        if numeric and _is_numeric(df, secondary_var):
            fig = px.scatter(
                df, x=primary_var, y=secondary_var,
                color='smoker',
                title=f'Relationship between {primary_var} and {secondary_var}',
            )
            st.plotly_chart(fig, use_container_width=True)
        else:
            st.write("Cannot create scatter plot for categorical variables")

    with right:
        if numeric:
            _seaborn_chart(
                sns.violinplot,
                f'{primary_var} Distribution by Region and Smoking Status',
                data=df, y=primary_var, x='region', hue='smoker', split=True,
            )
        else:
            _seaborn_chart(
                sns.violinplot,
                f'Charges Distribution by {primary_var} and Smoking Status',
                data=df, y='charges', x=primary_var, hue='smoker', split=True,
            )


def _render_categorical_tab(df, primary_var, numeric):
    """Render the 'Categorical Insights' tab: correlation heatmap and bar/box plot."""
    left, right = st.columns(2)

    with left:
        # Correlations are computed over the numeric columns only.
        correlation = df.select_dtypes(include='number').corr()
        fig = px.imshow(
            correlation,
            title="Correlation Heatmap",
            labels=dict(color="Correlation"),
        )
        st.plotly_chart(fig, use_container_width=True)

    with right:
        if numeric:
            fig = px.box(
                df, x='region', y=primary_var, color='smoker',
                title=f'{primary_var} Distribution by Region and Smoking Status',
            )
            st.plotly_chart(fig, use_container_width=True)
        else:
            _seaborn_chart(
                sns.barplot,
                f'Average Charges by {primary_var} and Smoking Status',
                data=df, x=primary_var, y='charges', hue='smoker',
            )


def _render_insights(df, primary_var, numeric):
    """Render the optional summary-statistics and cross-analysis tables."""
    if st.checkbox("Show Summary Statistics"):
        if numeric:
            summary = df.groupby('region')[primary_var].describe()
            st.write(f"Summary Statistics for {primary_var} by Region:")
        else:
            summary = df.groupby(primary_var)['charges'].describe()
            st.write(f"Charges Summary Statistics by {primary_var}:")
        st.dataframe(summary)

    if st.checkbox("Show Cross Analysis"):
        if numeric:
            table = df.groupby('region')[primary_var].agg(['mean', 'median', 'std', 'count'])
            st.write(f"Grouped Analysis of {primary_var} by Region:")
        else:
            table = pd.crosstab(df[primary_var], df['region'], margins=True)
            st.write(f"Cross Analysis of {primary_var} by Region:")
        st.dataframe(table)


def main():
    """Build the insurance analysis dashboard.

    Configures the page, loads the dataset via ``load_data()``, reads the
    primary/secondary variable selections from the sidebar and renders the
    three analysis tabs followed by the optional detail tables.
    """
    st.set_page_config(layout="wide")

    # Custom styling
    st.markdown("""
        <style>
        .main {
            background-color: #f5f5f5;
        }
        .stButton>button {
            background-color: #4CAF50;
            color: white;
            border-radius: 5px;
        }
        </style>
    """, unsafe_allow_html=True)

    st.title("🏥 Advanced Insurance Data Analysis Dashboard")

    df = load_data()

    # Sidebar for analysis controls
    st.sidebar.header("Analysis Controls")
    primary_var = st.sidebar.selectbox(
        "Select Primary Variable",
        list(_ANALYSIS_VARIABLES),
    )
    secondary_var = st.sidebar.selectbox(
        "Select Secondary Variable",
        _secondary_options(primary_var),
    )

    # Decide once whether the primary variable is numeric; every section
    # below branches on it.
    numeric = _is_numeric(df, primary_var)

    # Advanced Analysis Section
    st.header("📊 Advanced Data Analysis")
    tab1, tab2, tab3 = st.tabs(
        ["Distribution Analysis", "Relationship Analysis", "Categorical Insights"]
    )
    with tab1:
        _render_distribution_tab(df, primary_var, numeric)
    with tab2:
        _render_relationship_tab(df, primary_var, secondary_var, numeric)
    with tab3:
        _render_categorical_tab(df, primary_var, numeric)

    # Detailed Insights Section
    st.header("🔍 Detailed Insights")
    _render_insights(df, primary_var, numeric)
# Run the dashboard only when this file is executed as a script.
if __name__ == "__main__":
    main()