Spaces:
Build error
Build error
Commit ·
619bdd7
1
Parent(s): ecc7f18
Upload 12 files
Browse files- PCA.png +0 -0
- app.py +58 -0
- best_model.pkl +3 -0
- eda.py +141 -0
- missing_corr.png +0 -0
- missing_values.png +0 -0
- model.py +67 -0
- pearsons.png +0 -0
- ph.png +0 -0
- requirements.txt +6 -0
- spearman.png +0 -0
- water_potability.csv +0 -0
PCA.png
ADDED
|
app.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
'''
|
| 2 |
+
Achmad Dhani
|
| 3 |
+
|
| 4 |
+
Objective : Creating a main page of the webapps.
|
| 5 |
+
'''
|
| 6 |
+
|
| 7 |
+
import streamlit as st
|
| 8 |
+
import eda
|
| 9 |
+
import model
|
| 10 |
+
|
| 11 |
+
# navigating pages
|
| 12 |
+
# Sidebar selector: its value decides which of the three pages is rendered.
page = st.sidebar.selectbox(
    label='Select Page:',
    options=['Home Page', 'Exploration Data Analysis', 'Prediction Model'],
)

if page == 'Home Page':
    # Project identity and objective; the empty st.write('') calls are spacers.
    st.header('Home Page')
    st.write('')
    st.write('Phase 1 Milestone 2')
    st.write('Name : Achmad Dhani')
    st.write('Batch : HCK-009')
    st.markdown('Dataset: [Water Quality](https://www.kaggle.com/datasets/adityakadiwal/water-potability)')
    st.write('Objective : Water is essential for all forms of life, yet its quality can vary dramatically, with the potential to sustain health or cause disease. The distinction between potable water, which is safe for consumption, and non-potable water, which poses health risks, determined by the presence of certain chemicals. By employing classification model focused on the Recall metric, we can effectively predict the potability of water, ensuring its safety for consumption.')
    st.write('')
    st.caption('Please pick the options in the Select Page Box located on the left of the screen to start!')
    st.write('')
    st.write('')

    #============================= Background Info ==========================

    with st.expander("Background Information"):
        st.caption('The dataset used `Water Quality` is a public dataset from Kaggle consist of data of water samples from different water bodies. It has 3276 entries with 10 columns. The dataset also has a total of 1434 missing values')

    #============================= Work Flow ================================

    with st.expander("Work Flow"):
        st.caption(
            '''
            - Loading the data, checking for duplicated and missing values
            - EDA on the dataset to gain insights regarding the dataset and the missing value
            - Feature Engineering to prepare data for the model
            - Creating and Evaluating to get the best model
            - Deployment in Hugging Face

            '''
        )

    #============================= Conclusion =================================

    with st.expander("Conclusion"):
        # Recall is described as the share of truly drinkable samples that the
        # model finds. NOTE(review): this assumes the reported 65% is the
        # recall of the potable class (label 1) - confirm against the notebook.
        st.caption(
            '''
            The dataset is well documented but has missing values. These missing values after exploration deemed MCAR and most likely due to the person in charge taking the water samples did not have the equipment to measure these missing values chemical levels. All the chemicals don't have a relationship with each other or a trend, and each of them are important because the PCA shows a linear relationship between number of features and the percentage of data kept. Most water sample has a ph of 5-9, has a quite high amount of chemicals like sulfate, chloramines and trihalomethanes which doesn't seem to have difference between the ones potable and non-potable. It can be assumed that the water samples are from environment that is scarce on really good drinkable water or a highly contaminated environment like a factory side of the city. The model that gives the best overall score is RandomForest which has a recall of 65%, meaning that out of the water samples that are actually drinkable, it correctly identifies about 65% of them. There are few things to suggest for the future. The SVC model can be further searched if resources are available, there needed to be more potable water data since the data is imbalanced towards non-potable water and dataset potability validity needed to be checked by the author.
            '''
        )

#============================ Other Page ======================================
elif page == 'Exploration Data Analysis':
    eda.run()
else:
    # Only remaining option in the selectbox is 'Prediction Model'.
    model.run()
|
best_model.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f805e87ad5e8acd605e99c5ce9a5bf8f9c24cc16471f87d7eabcf52d9a81649d
|
| 3 |
+
size 181035
|
eda.py
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
'''
|
| 2 |
+
Achmad Dhani
|
| 3 |
+
|
| 4 |
+
Objective : Creating EDA page specifically to explain insights from EDA
|
| 5 |
+
'''
|
| 6 |
+
|
| 7 |
+
import streamlit as st
|
| 8 |
+
import pandas as pd
|
| 9 |
+
from PIL import Image
|
| 10 |
+
|
| 11 |
+
def _show_image(container, path, caption, **image_kwargs):
    '''
    Open the image file at `path` and draw it inside `container`.

    `container` is anything exposing an `.image()` method (the `st` module
    itself or a column). The file is opened in a `with` block so it is
    closed as soon as it has been handed to Streamlit.
    '''
    with Image.open(path) as img:
        container.image(img, caption=caption, **image_kwargs)


def run():
    '''
    Function for EDA page

    Reads `water_potability.csv`, shows its first and last 10 rows, then
    displays the pre-rendered EDA figures (PNG files next to the app), each
    followed by an expander holding the written explanation.
    '''
    st.title('Exploration Data Analysis Section')

    df = pd.read_csv('water_potability.csv')  # reading CSV

    #============================= Display Data ===============================

    col1, col2 = st.columns(2)

    with col1.expander("View the top 10 entries of the original dataset"):
        st.table(df.head(10))

    with col2.expander("View the bottom 10 entries of the original dataset"):
        st.table(df.tail(10))

    #============================= Correlation =====================================
    st.subheader('Correlation Matrix Between The Chemicals')
    col3, col4 = st.columns(2)

    # 1st image
    col3.write('Pearsons Correlation Matrix')
    _show_image(col3, 'pearsons.png', 'Figure 1 Pearsons Correlation Matrix of All Chemicals')

    # 2nd image
    col4.write('Spearman Correlation Matrix')
    _show_image(col4, 'spearman.png', 'Figure 2 Spearman Correlation Matrix of All Chemicals')

    # explanation (the opening sentence used to be rendered twice)
    with st.expander('Explanation'):
        st.caption(
            '''
            Based on both visualization, most of the variables do not have any relationship except for a few.

            - `Hardness` has a very positive low value with `ph` in spearman but close to 0 in pearsons. This suggests there might be a very weak positive non
            linear relationship.
            - `Sulfate` with `Solids` and with `Sulfate` has a very low negative value both in spearman and pearsons. This suggests there might be a very weak
            negative linear relationship.
            '''
        )

    #================================ ph ==========================================

    st.subheader('ph Values Distribution')
    _show_image(st, 'ph.png', 'Figure 3 ph values distribution histogram', width=600)

    # explanation
    with st.expander('Explanation'):
        st.caption(
            '''
            - The water sample taken mostly has ph between `5-9`
            - The visualization also suggest a lot of data are in the range for drinkable water but doesn't mean that the water is drinkable.
            - This could mean most water samples that's taken could come contaminated water bodies.
            '''
        )

    #================================ Missing Values ===============================
    st.subheader('Missing Values Visualizations')

    # missing plot
    st.write('Missing Values Bar Plot')
    _show_image(st, 'missing_values.png', 'Figure 4 Bar plot of missing values of each column')

    # displaying explanation
    with st.expander('Explanation'):
        st.caption(
            '''
            **From Data Loading**

            - There are total missing values in the dataset: 1434

            - Columns with missing values:

            `['ph', 'Sulfate', 'Trihalomethanes']`

            Number of missing values per column:
            >ph                 `491`
            >
            >Sulfate            `781`
            >
            >Trihalomethanes    `162`
            >
            >dtype: int64

            Missing data percentage (%):
            >ph                 `15`
            >
            >Sulfate            `24`
            >
            >Trihalomethanes    `5`
            '''
        )

    # missing matrix
    st.write('Missing Values Correlation Matrix')
    _show_image(st, 'missing_corr.png', 'Figure 5 Correlation matrix of the missing values')

    # display explanation
    with st.expander('Explanation'):
        st.caption(
            '''
            - Based on the visualization above, the missing values have no correlation and can be considered the missingness is `completely random`
            - The missing values being random could be due to the person that took the water sample did not have the equipment to measure the chemical level.
            '''
        )

    #================================== PCA =============================

    st.subheader('Feature Importance')
    _show_image(st, 'PCA.png', 'Figure 6 Linechart of explained variance ratio with number of components')

    # displaying explanation
    with st.expander('Explanation'):
        st.caption(
            '''
            - Based on the visualization of PCA, there is a linear relationship between number of components and the EVR cumulative
            - This suggest, each feature is important and retains unique information of the dataset
            '''
        )
|
missing_corr.png
ADDED
|
missing_values.png
ADDED
|
model.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
'''
|
| 2 |
+
Achmad Dhani
|
| 3 |
+
|
| 4 |
+
Objective : Creating a page for classification prediction
|
| 5 |
+
'''
|
| 6 |
+
import streamlit as st
|
| 7 |
+
import pandas as pd
|
| 8 |
+
import joblib
|
| 9 |
+
from PIL import Image
|
| 10 |
+
|
| 11 |
+
def run():
    '''
    This function is for running the page for predictions

    Collects the nine water-quality measurements from input widgets, echoes
    them back in a table and, when the Predict button is pressed, loads
    `best_model.pkl` and reports whether the sample is predicted drinkable
    (model output 1) or not.
    '''
    st.title('Is Your Water Drinkable ?')

    # All numeric arguments are floats so fractional pH values can be entered;
    # with the previous integer bounds only whole-number pH was accepted.
    ph = st.number_input(label='Input ph:', min_value=0.0, max_value=14.0, step=0.1)
    hard = st.slider('Hardness', min_value=40, max_value=320)
    solid = st.slider('Total dissolved solids', min_value=320, max_value=60000)
    chlo = st.slider('Chloramines Level', min_value=0.0, max_value=13.0, step=0.1)
    sulf = st.slider('Sulfate', min_value=130, max_value=480)
    cond = st.slider('Conductivity', min_value=180, max_value=750)
    organ = st.slider('Total Organic Carbon', min_value=2.0, max_value=28.0, step=0.1)
    thm = st.slider('Trihalomethanes (THM)', min_value=0, max_value=120)
    turb = st.slider('Turbidity', min_value=0.0, max_value=7.0, step=0.1)

    # data for predictions
    # NOTE(review): keys are presumably the feature names the saved model was
    # trained with - confirm against the training notebook.
    data_pred = {
        'ph': ph,
        'hardness': hard,
        'solids': solid,
        'chloramines': chlo,
        'sulfate': sulf,
        'conductivity': cond,
        'organic_carbon': organ,
        'trihalomethanes': thm,
        'turbidity': turb,
    }

    st.write('The following table is the result of the data you have input : ')

    # display table, derived from data_pred so the echoed values can never
    # drift from what is actually sent to the model
    display = pd.DataFrame({
        'Parameters': list(data_pred),
        'Value': list(data_pred.values()),
    })
    st.table(display)

    # df predictions (single-row frame)
    df = pd.DataFrame([data_pred])

    # button
    if st.button(label='Predict'):
        # Load the model only when a prediction is requested instead of on
        # every call to run().
        model = joblib.load("best_model.pkl")
        y_pred_inf = model.predict(df)

        # printing result; predict() returns one label per input row, so
        # take the single element rather than testing the whole array
        if y_pred_inf[0] == 1:
            st.write('The water is DRINKABLE')
        else:
            st.write('The water is NOT DRINKABLE')
|
| 67 |
+
|
pearsons.png
ADDED
|
ph.png
ADDED
|
requirements.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
pandas
|
| 2 |
+
numpy
|
| 3 |
+
scikit-learn
|
| 4 |
+
matplotlib
|
| 5 |
+
seaborn
|
| 6 |
+
joblib
|
spearman.png
ADDED
|
water_potability.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|