achmaddhani's picture
Upload 12 files
619bdd7
'''
Achmad Dhani
Objective: Create the EDA page, which presents and explains the insights gained from the EDA.
'''
import streamlit as st
import pandas as pd
from PIL import Image
# ----------------------------------------------------------------------------
# Markdown texts shown inside the "Explanation" expanders of the EDA page.
# They are kept flush-left so the Markdown is passed on without any leading
# indentation.
# ----------------------------------------------------------------------------

# NOTE(review): the second bullet pairs `Sulfate` with itself
# ("`Sulfate` with `Solids` and with `Sulfate`"); one of the names is
# presumably meant to be a different column -- confirm against the notebook.
_CORRELATION_NOTES = '''
Based on both visualization, most of the variables do not have any relationship except for a few.
- `Hardness` has a very positive low value with `ph` in spearman but close to 0 in pearsons. This suggests there might be a very weak positive non
linear relationship.
- `Sulfate` with `Solids` and with `Sulfate` has a very low negative value both in spearman and pearsons. This suggests there might be a very weak
negative linear relationship.
'''

_PH_NOTES = '''
- The water sample taken mostly has ph between `5-9`
- The visualization also suggest a lot of data are in the range for drinkable water but doesn't mean that the water is drinkable.
- This could mean most water samples that's taken could come contaminated water bodies.
'''

_MISSING_COUNT_NOTES = '''
**From Data Loading**
- There are total missing values in the dataset: 1434
- Columns with missing values:
`['ph', 'Sulfate', 'Trihalomethanes']`
Number of missing values per column:
>ph `491`
>
>Sulfate `781`
>
>Trihalomethanes `162`
>
>dtype: int64
Missing data percentage (%):
>ph `15`
>
>Sulfate `24`
>
>Trihalomethanes `5`
'''

_MISSING_CORR_NOTES = '''
- Based on the visualization above, the missing values have no correlation and can be considered the missingness is `completely random`
- The missing values being random could be due to the person that took the water sample did not have the equipment to measure the chemical level.
'''

_PCA_NOTES = '''
- Based on the visualization of PCA, there is a linear relationship between number of components and the EVR cumulative
- This suggest, each feature is important and retains unique information of the dataset
'''


def _show_image(target, path, caption, **image_kwargs):
    '''
    Open the image file at `path` and render it on `target` with `caption`.

    `target` is anything exposing an `image` method (the `st` module or a
    column returned by `st.columns`). Extra keyword arguments are forwarded
    to `target.image` unchanged.
    '''
    # The context manager closes the image once it has been handed over.
    with Image.open(path) as picture:
        target.image(picture, caption=caption, **image_kwargs)


def _show_explanation(text):
    '''
    Render `text` as a caption inside an expander labelled 'Explanation'.
    '''
    with st.expander('Explanation'):
        st.caption(text)


def run():
    '''
    Function for EDA page

    Renders the title, the first/last ten rows of `water_potability.csv`
    and six pre-rendered figures (correlation matrices, ph histogram,
    missing-value plots, PCA chart), each followed by an explanation.
    The CSV and the PNG files are addressed by relative path.
    '''
    st.title('Exploration Data Analysis Section')
    df = pd.read_csv('water_potability.csv')  # reading CSV

    # ============================= Display Data =============================
    head_col, tail_col = st.columns(2)
    with head_col.expander("View the top 10 entries of the original dataset"):
        st.table(df.head(10))
    with tail_col.expander("View the bottom 10 entries of the original dataset"):
        st.table(df.tail(10))

    # ============================= Correlation ==============================
    st.subheader('Correlation Matrix Between The Chemicals')
    pearson_col, spearman_col = st.columns(2)
    # 1st image
    pearson_col.write('Pearsons Correlation Matrix')
    _show_image(
        pearson_col,
        'pearsons.png',
        'Figure 1 Pearsons Correlation Matrix of All Chemicals',
    )
    # 2nd image
    spearman_col.write('Spearman Correlation Matrix')
    _show_image(
        spearman_col,
        'spearman.png',
        'Figure 2 Spearman Correlation Matrix of All Chemicals',
    )
    _show_explanation(_CORRELATION_NOTES)

    # ================================= ph ===================================
    st.subheader('ph Values Distribution')
    _show_image(
        st,
        'ph.png',
        'Figure 3 ph values distribution histogram',
        width=600,
    )
    _show_explanation(_PH_NOTES)

    # ============================ Missing Values ============================
    st.subheader('Missing Values Visualizations')
    # missing plot
    st.write('Missing Values Bar Plot')
    _show_image(
        st,
        'missing_values.png',
        'Figure 4 Bar plot of missing values of each column',
    )
    _show_explanation(_MISSING_COUNT_NOTES)
    # missing matrix
    st.write('Missing Values Correlation Matrix')
    _show_image(
        st,
        'missing_corr.png',
        'Figure 5 Correlation matrix of the missing values',
    )
    _show_explanation(_MISSING_CORR_NOTES)

    # ================================= PCA ==================================
    st.subheader('Feature Importance')
    _show_image(
        st,
        'PCA.png',
        'Figure 6 Linechart of explained variance ratio with number of components',
    )
    _show_explanation(_PCA_NOTES)