EDA / app.py
Surish's picture
Create app.py
fbd38e9
Raw
History Blame Contribute Delete
11.7 kB
import streamlit as st
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patheffects as path_effects
import pandas as pd
import numpy as np
# --- Page header -----------------------------------------------------------
st.title("๐Ÿ“Š ๋ฐ์ดํ„ฐ ์‹œ๊ฐํ™” ๋ฐ ๋ถ„์„")

# --- Step 1: choose a data source (seaborn sample or uploaded CSV) ----------
st.subheader("1๏ธโƒฃ ๋ฐ์ดํ„ฐ ๋ถˆ๋Ÿฌ์˜ค๊ธฐ")
tab1, tab2 = st.tabs(["seaborn ๋ฐ์ดํ„ฐ์…‹", "ํŒŒ์ผ ์—…๋กœ๋“œ"])

# `df` is only assigned once the user ticks one of the confirmation
# checkboxes below; start from None so the "nothing loaded yet" state can be
# detected instead of surfacing as a NameError further down.
df = None

# Source A: a named seaborn example dataset.
with tab1:
    dataset_name = st.text_input('๋ฐ์ดํ„ฐ ์˜ˆ์‹œ: titanic, tips, taxis, penguins, iris...:')
    sample_checked = st.checkbox('seaborn ๋ฐ์ดํ„ฐ ํ™•์ธํ•˜๊ธฐ')
    if sample_checked:
        with st.spinner('์ƒ˜ํ”Œ ๋ฐ์ดํ„ฐ๋ฅผ ๋ถˆ๋Ÿฌ์˜ค๋Š” ์ค‘ ์ž…๋‹ˆ๋‹ค...'):
            try:
                df = sns.load_dataset(dataset_name)
                st.write(df.head(3))
            except Exception:
                # Best-effort: any load failure is reported as a bad dataset
                # name. `Exception` (not a bare except) so that
                # BaseException-derived signals are not swallowed.
                st.write("โš ๋ฐ์ดํ„ฐ์…‹ ์ด๋ฆ„์„ ๋‹ค์‹œ ํ™•์ธํ•ด์ฃผ์„ธ์š”!")

# Source B: a user-uploaded CSV file.
with tab2:
    st.write("๋‹จ, ์ด ๋ฐฉ๋ฒ•์€ csv ํŒŒ์ผ๋งŒ ์ง€์›ํ•ฉ๋‹ˆ๋‹ค.")
    custom_data = st.file_uploader("๋ถ„์„ํ•˜๊ณ  ์‹ถ์€ ํŒŒ์ผ์„ ์—…๋กœ๋“œํ•ด์ฃผ์„ธ์š”.", type="csv")
    if custom_data:
        # Replace the uploaded-file handle with the parsed DataFrame and keep
        # a copy in the session state.
        custom_data = pd.read_csv(custom_data, encoding='utf-8')
        st.session_state['custom_data'] = custom_data
    upload_checked = st.checkbox('์—…๋กœ๋“œํ•œ ํŒŒ์ผ ํ™•์ธํ•˜๊ธฐ!')
    if upload_checked:
        with st.spinner('์ค‘๋ณต์„ ํ™•์ธํ•˜๋Š” ์ค‘์ž…๋‹ˆ๋‹ค...'):
            try:
                # Fails with AttributeError when no file was uploaded
                # (custom_data is still None); reported to the user below.
                st.write(custom_data.head(5))
                df = custom_data
            except Exception:
                st.write("โš ์˜ฌ๋ฐ”๋ฅธ ํŒŒ์ผ์„ ์—…๋กœ๋“œํ•˜์…จ๋Š”์ง€ ํ™•์ธํ•ด์ฃผ์„ธ์š”!")

# Nothing has been loaded yet (first page load, or loading failed): end this
# script run here rather than crashing on `df.columns`.
if df is None:
    st.stop()

# --- Column selection ------------------------------------------------------
column_names = df.columns.tolist()
# Every column is pre-selected; the user can deselect the ones not needed.
selected_columns = st.multiselect('๋ถ„์„ํ•˜๊ณ ์ž ํ•˜๋Š” ์—ด์„ ์„ ํƒํ•˜์„ธ์š”:', column_names, default=column_names)
# Preview the first rows of the selected columns.
if selected_columns:
    st.write(df[selected_columns].head(3))

# --- Analysis tabs (the names tab1/tab2 are rebound here) ------------------
tab1, tab2 = st.tabs(["๊ธฐ์ˆ ํ†ต๊ณ„๋Ÿ‰", "๋ฐ์ดํ„ฐ ์‹œ๊ฐํ™”"])
with tab1:
    st.write('# ๊ธฐ์ˆ ํ†ต๊ณ„๋Ÿ‰')
with tab2:
    # Radio button choosing the variable kind for the univariate view.
    variable_type = st.radio("์ผ๋ณ€๋Ÿ‰ ๋ฐ์ดํ„ฐ๋ฅผ ์„ ํƒํ•ด์ฃผ์„ธ์š”.", ("์ˆ˜์น˜ํ˜•", "๋ฒ”์ฃผํ˜•"))
def get_slider_step(min_value, max_value):
    """Pick bounds and step for a histogram bin-width slider.

    The step is chosen from the spread of the data (``max_value -
    min_value``): 0.1 for a spread below 1, 0.5 for a spread below 10,
    otherwise 1. The smallest selectable bin width is one step; the largest
    is one fifth of the spread (1.0 when the spread is zero), but never less
    than two steps so the slider always offers a choice.

    All three returned values share one numeric type (all ``int`` when the
    step is 1, all ``float`` otherwise) because they are passed together to
    ``st.slider`` as ``min_value``, ``max_value`` and ``step``.

    Args:
        min_value: Smallest value in the column.
        max_value: Largest value in the column.

    Returns:
        A ``(lower, upper, step)`` tuple with ``0 < lower < upper``.

    Raises:
        ValueError: If the spread is NaN (``int(nan)`` cannot be converted).
    """
    value_range = max_value - min_value

    if value_range < 1:
        step = 0.1
    elif value_range < 10:
        step = 0.5
    else:
        step = 1

    # A constant column has no spread; fall back to a nominal width of 1.
    upper = value_range / 5 if value_range != 0 else 1.0
    # Convert to a builtin number of the same type as `step` (inputs may be
    # numpy scalars).
    upper = int(upper) if isinstance(step, int) else float(upper)

    # The lower bound depends only on the step, never on where the data
    # starts: a bin width derived from the data minimum can exceed the upper
    # bound or go negative.
    lower = step
    upper = max(upper, lower + step)
    return lower, upper, step
# ๋ผ๋””์˜ค ๋ฒ„ํŠผ์˜ ์„ ํƒ์— ๋”ฐ๋ผ ์‹คํ–‰๋˜๋Š” ์ฝ”๋“œ ๋ธ”๋ก
if variable_type == "์ˆ˜์น˜ํ˜•":
# ๋ณ€๋Ÿ‰์ด ์ˆ˜์น˜ํ˜•์ธ ๊ฒฝ์šฐ ์‹คํ–‰๋˜๋Š” ์ฝ”๋“œ
st.write("์ˆ˜์น˜ํ˜• ๋ฐ์ดํ„ฐ๋ฅผ ํžˆ์Šคํ† ๊ทธ๋žจ๊ณผ ์ƒ์ž๊ทธ๋ฆผ์œผ๋กœ ํ‘œํ˜„ํ•ฉ๋‹ˆ๋‹ค. ")
try:
colname = st.text_input("์‹œ๊ฐํ™”ํ•˜๊ณ  ์‹ถ์€ ์—ด ์ด๋ฆ„์„ ์จ์ฃผ์„ธ์š”!")
if colname != "":
data = df[colname]
# ๋ฐ์ดํ„ฐ์˜ ๊ธฐ์ˆ ํ†ต๊ณ„๋Ÿ‰ ๊ณ„์‚ฐ
summary_stats = pd.DataFrame({
'ํ‰๊ท ': np.mean(data),
'ํ‘œ์ค€ํŽธ์ฐจ': np.std(data),
'์ตœ์†Ÿ๊ฐ’': np.min(data),
'์ค‘์•™๊ฐ’': np.median(data),
'์ตœ๋Œ“๊ฐ’': np.max(data)
}, index=['ํ†ต๊ณ„๋Ÿ‰'])
st.write(summary_stats)
minvalue = min(df[colname])
maxvalue = max(df[colname])
st.write(colname, '์˜ ์ตœ์†Ÿ๊ฐ’:', minvalue, '์˜ ์ตœ๋Œ“๊ฐ’:',maxvalue)
bins_size_min, bins_size_max, step = get_slider_step(minvalue, maxvalue)
st.write(step)
bins_size = st.slider("๊ณ„๊ธ‰์˜ ํฌ๊ธฐ๋ฅผ ์„ค์ •ํ•ด์ฃผ์„ธ์š”.",
min_value=bins_size_min,
max_value=bins_size_max,
step=step)
# Create a figure and adjust the histogram parameters
fig = plt.figure(figsize=(5, 3))
# bins_size = st.slider("๊ณ„๊ธ‰์˜ ํฌ๊ธฐ๋ฅผ ์„ค์ •ํ•ด์ฃผ์„ธ์š”.",
# min_value= 0.5,#+ int((minvalue // 5) * 5),
# max_value= 1.0+ int((max(df[colname]-min(df[colname])))/5),
# step= 0.5)
st.write("ํžˆ์Šคํ† ๊ทธ๋žจ์˜ ๊ณ„๊ธ‰์˜ ํฌ๊ธฐ:",bins_size)
# Plot the histogram with adjusted parameters
sns.set_style("darkgrid")
plt.title('Histogram of {}'.format(colname))
sns.histplot(x=df[colname], binwidth=bins_size, binrange = [min(df[colname]), max(df[colname])], kde=False)
plt.xlabel("")
st.pyplot(fig)
# ์ด์ƒ์น˜ ์ˆจ๊ธฐ๊ธฐ ์ฒดํฌ๋ฐ•์Šค
hide_outliers = st.checkbox("์ด์ƒ์น˜ ์ˆจ๊ธฐ๊ธฐ")
# ์ด์ƒ์น˜๋ฅผ ์ˆจ๊ธฐ๋Š” ์˜ต์…˜ ์„ค์ •
showfliers = not hide_outliers
fig2 = plt.figure(figsize=(5, 1))
plt.title('Boxplot of {}'.format(colname))
sns.set_style("darkgrid")
# ๋ฐ•์Šค ํ”Œ๋กฏ ๊ทธ๋ฆฌ๊ธฐ
sns.boxplot(x=df[colname], palette="Set2", showfliers=showfliers)
plt.xlabel("")
st.pyplot(fig2)
except ValueError:
st.write("์˜ฌ๋ฐ”๋ฅธ ์—ด ์ด๋ฆ„์„ ์จ์ฃผ์„ธ์š”!")
st.stop()
# ๋ฒ”์ฃผํ˜•
elif variable_type =='๋ฒ”์ฃผํ˜•':
# ๋ณ€๋Ÿ‰์ด ๋ฒ”์ฃผํ˜•์ธ ๊ฒฝ์šฐ ์‹คํ–‰๋˜๋Š” ์ฝ”๋“œ
st.write("๋ฒ”์ฃผํ˜• ๋ฐ์ดํ„ฐ๋ฅผ ๋ง‰๋Œ€๊ทธ๋ž˜ํ”„๋กœ ํ‘œํ˜„ํ•ฉ๋‹ˆ๋‹ค.")
try:
colname = st.text_input("์‹œ๊ฐํ™”ํ•˜๊ณ  ์‹ถ์€ ์—ด ์ด๋ฆ„์„ ์ž…๋ ฅํ•ด์ฃผ์„ธ์š”!")
if colname != "":
# Create a figure and adjust the bar plot parameters
fig = plt.figure(figsize=(5,3))
ax = sns.countplot(x=df[colname], palette="Blues")
# Add frequency labels on top of each bar with white outline
for p in ax.patches:
height = p.get_height()
ax.annotate(format(height, ','),
(p.get_x() + p.get_width() / 2, height),
ha='center', va='center',
xytext=(0, -10), textcoords='offset points',
fontsize=10, color='black',
path_effects=[path_effects.Stroke(linewidth=3, foreground='white'),
path_effects.Normal()])
plt.title('Barplot of {}'.format(colname))
sns.set_style("darkgrid")
plt.xlabel("")
st.pyplot(fig)
except ValueError:
st.write("์˜ฌ๋ฐ”๋ฅธ ์—ด ์ด๋ฆ„์„ ์ž…๋ ฅํ•ด์ฃผ์„ธ์š”!")
st.stop()
# --- Grouped view: one value column broken down by a group column ----------
variable_type_group = st.radio("๊ทธ๋ฃน๋ณ„ ๋ฐ์ดํ„ฐ๋ฅผ ์„ ํƒํ•ด์ฃผ์„ธ์š”. (์˜ˆ: ํด๋ž˜์Šค์— ๋”ฐ๋ฅธ ์ƒ์กด์œจ)", ("์ˆ˜์น˜ํ˜•", "๋ฒ”์ฃผํ˜•"))
if variable_type_group == "์ˆ˜์น˜ํ˜•":
    # Numeric value column: per-group statistics, histogram and box plot.
    st.write("๊ทธ๋ฃน๋ณ„ ์ˆ˜์น˜ํ˜• ๋ฐ์ดํ„ฐ๋ฅผ ํžˆ์Šคํ† ๊ทธ๋žจ๊ณผ ์ƒ์ž๊ทธ๋ฆผ์œผ๋กœ ํ‘œํ˜„ํ•ฉ๋‹ˆ๋‹ค. ")
    try:
        colname_group = st.text_input("๊ทธ๋ฃน ์—ด ์ด๋ฆ„์„ ์จ์ฃผ์„ธ์š”!")
        colname_2 = st.text_input("๊ทธ๋ฃน๋ณ„๋กœ ์‹œ๊ฐํ™”ํ•˜๊ณ  ์‹ถ์€ ์ˆ˜์น˜ํ˜• ์—ด ์ด๋ฆ„์„ ์จ์ฃผ์„ธ์š”!")
        # Both names are needed for a grouped summary; wait until the user
        # has filled in both inputs instead of indexing df with "".
        if colname_2 != "" and colname_group != "":
            data = df[[colname_2, colname_group]]

            # Descriptive statistics of the value column per group.
            stat = data.groupby(colname_group)[colname_2].agg(
                mean='mean',
                std='std',
                min='min',
                median='median',
                max='max'
            )
            stat.columns = ['ํ‰๊ท ', 'ํ‘œ์ค€ํŽธ์ฐจ', '์ตœ์†Ÿ๊ฐ’', '์ค‘์•™๊ฐ’', '์ตœ๋Œ“๊ฐ’']
            st.write(stat)

            minvalue = min(df[colname_2])
            maxvalue = max(df[colname_2])
            st.write(colname_2, '์˜ ์ตœ์†Ÿ๊ฐ’:', minvalue, '์˜ ์ตœ๋Œ“๊ฐ’:', maxvalue)

            # Let the user pick the histogram bin width; bounds and step are
            # derived from the column's value range.
            bins_size_min, bins_size_max, step = get_slider_step(minvalue, maxvalue)
            st.write(step)
            bins_size = st.slider("๊ณ„๊ธ‰์˜ ํฌ๊ธฐ๋ฅผ ์„ค์ •ํ•ด์ฃผ์„ธ์š”. (group)",
                                  min_value=bins_size_min,
                                  max_value=bins_size_max,
                                  step=step)

            # Histogram coloured by group, with the chosen bin width.
            fig = plt.figure(figsize=(5, 3))
            st.write("ํžˆ์Šคํ† ๊ทธ๋žจ์˜ ๊ณ„๊ธ‰์˜ ํฌ๊ธฐ:", bins_size)
            sns.set_style("darkgrid")
            plt.title('Histogram of {}'.format(colname_2))
            sns.histplot(data=data, x=colname_2, hue=colname_group,
                         binwidth=bins_size, binrange=[minvalue, maxvalue], kde=False)
            plt.xlabel("")
            st.pyplot(fig)

            # One box per group; the checkbox toggles whether outliers are
            # drawn. The label's trailing space keeps it distinct from the
            # univariate checkbox label.
            hide_outliers = st.checkbox("์ด์ƒ์น˜ ์ˆจ๊ธฐ๊ธฐ ")
            showfliers = not hide_outliers
            fig2 = plt.figure(figsize=(5, 5))
            plt.title('Boxplot of {}'.format(colname_2))
            sns.set_style("darkgrid")
            sns.boxplot(data=df, x=colname_group, y=colname_2, palette="Set2", showfliers=showfliers)
            plt.xlabel("")
            st.pyplot(fig2)
    except (KeyError, TypeError, ValueError):
        # KeyError: one of the typed names is not a column of df.
        # TypeError/ValueError: the value column cannot be aggregated or
        # binned as numeric data.
        st.write("์˜ฌ๋ฐ”๋ฅธ ์—ด ์ด๋ฆ„์„ ์จ์ฃผ์„ธ์š”!")
        st.stop()
elif variable_type_group == '๋ฒ”์ฃผํ˜•':
    # Categorical value column: bar chart of counts, split by group.
    st.write("๋ฒ”์ฃผํ˜• ๋ฐ์ดํ„ฐ๋ฅผ ๋ง‰๋Œ€๊ทธ๋ž˜ํ”„๋กœ ํ‘œํ˜„ํ•ฉ๋‹ˆ๋‹ค.")
    try:
        colname_group = st.text_input("๊ทธ๋ฃน ์—ด ์ด๋ฆ„์„ ์จ์ฃผ์„ธ์š”!")
        colname_2 = st.text_input("๊ทธ๋ฃน๋ณ„๋กœ ์‹œ๊ฐํ™”ํ•˜๊ณ  ์‹ถ์€ ์ˆ˜์น˜ํ˜• ์—ด ์ด๋ฆ„์„ ์ž…๋ ฅํ•ด์ฃผ์„ธ์š”!")
        if colname_2 != "":
            fig = plt.figure(figsize=(5, 3))
            # Split the bars by the group column when one was given; with an
            # empty group name the plot stays ungrouped.
            ax = sns.countplot(data=df, x=colname_2, hue=colname_group or None, palette="Blues")
            # Write each bar's count just below its top edge, in black text
            # with a white outline so it stays readable on the bar colour.
            for p in ax.patches:
                height = p.get_height()
                if not height > 0:
                    # Zero-height or NaN patches carry no count to label.
                    continue
                ax.annotate(format(height, ','),
                            (p.get_x() + p.get_width() / 2, height),
                            ha='center', va='center',
                            xytext=(0, -10), textcoords='offset points',
                            fontsize=10, color='black',
                            path_effects=[path_effects.Stroke(linewidth=3, foreground='white'),
                                          path_effects.Normal()])
            plt.title('Barplot of {}'.format(colname_2))
            sns.set_style("darkgrid")
            plt.xlabel("")
            st.pyplot(fig)
    except (KeyError, ValueError):
        # Raised when a typed name cannot be resolved to a column of df.
        st.write("์˜ฌ๋ฐ”๋ฅธ ์—ด ์ด๋ฆ„์„ ์ž…๋ ฅํ•ด์ฃผ์„ธ์š”!")
        st.stop()