achmaddhani committed on
Commit
619bdd7
·
1 Parent(s): ecc7f18

Upload 12 files

Browse files
Files changed (12) hide show
  1. PCA.png +0 -0
  2. app.py +58 -0
  3. best_model.pkl +3 -0
  4. eda.py +141 -0
  5. missing_corr.png +0 -0
  6. missing_values.png +0 -0
  7. model.py +67 -0
  8. pearsons.png +0 -0
  9. ph.png +0 -0
  10. requirements.txt +6 -0
  11. spearman.png +0 -0
  12. water_potability.csv +0 -0
PCA.png ADDED
app.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ '''
2
+ Achmad Dhani
3
+
4
+ Objective : Creating a main page of the webapps.
5
+ '''
6
+
7
+ import streamlit as st
8
+ import eda
9
+ import model
10
+
11
# Sidebar page labels, defined once so the selectbox options and the routing
# at the bottom of this script cannot drift apart.
HOME_PAGE = 'Home Page'
EDA_PAGE = 'Exploration Data Analysis'
MODEL_PAGE = 'Prediction Model'


def _render_home_page():
    '''
    Render the landing page: project details followed by three expanders
    (background information, work flow and conclusion).
    '''
    st.header('Home Page')
    st.write('')
    st.write('Phase 1 Milestone 2')
    st.write('Name : Achmad Dhani')
    st.write('Batch : HCK-009')
    st.markdown('Dataset: [Water Quality](https://www.kaggle.com/datasets/adityakadiwal/water-potability)')
    st.write('Objective : Water is essential for all forms of life, yet its quality can vary dramatically, with the potential to sustain health or cause disease. The distinction between potable water, which is safe for consumption, and non-potable water, which poses health risks, determined by the presence of certain chemicals. By employing classification model focused on the Recall metric, we can effectively predict the potability of water, ensuring its safety for consumption.')
    st.write('')
    st.caption('Please pick the options in the Select Page Box located on the left of the screen to start!')
    st.write('')
    st.write('')

    #============================= Background Info ==========================

    with st.expander("Background Information"):
        st.caption('The dataset used `Water Quality` is a public dataset from Kaggle consist of data of water samples from different water bodies. It has 3276 entries with 10 columns. The dataset also has a total of 1434 missing values')

    #============================= Work Flow ================================

    with st.expander("Work Flow"):
        st.caption(
            '''
            - Loading the data, checking for duplicated and missing values
            - EDA on the dataset to gain insights regarding the dataset and the missing value
            - Feature Engineering to prepare data for the model
            - Creating and Evaluating to get the best model
            - Deployment in Hugging Face

            '''
        )

    #============================= Conclusion =================================

    # NOTE(review): the text below explains the 65% recall as "it will be
    # correct 65% of the time", which is the definition of accuracy rather
    # than recall. Wording left unchanged -- confirm the intended metric and
    # value with the author before rewording.
    with st.expander("Conclusion"):
        st.caption(
            '''
            The dataset is well documented but has missing values. These missing values after exploration deemed MCAR and most likely due to the person in charge taking the water samples did not have the equipment to measure these missing values chemical levels. All the chemicals don't have a relationship with each other or a trend, and each of them are important because the PCA shows a linear relationship between number of features and the percentage of data kept. Most water sample has a ph of 5-9, has a quite high amount of chemicals like sulfate, chloramines and trihalomethanes which doesn't seem to have difference between the ones potable and non-potable. It can be assumed that the water samples are from environment that is scarce on really good drinkable water or a highly contaminated environment like a factory side of the city. The model that gives the best overall score is RandomForest which has a recall of 65% meaning, when predicting if the water is drinkable or not, it will be correct 65% of the time. There are few things to suggest for the future. The SVC model can be further searched if resources are available, there needed to be more potable water data since the data is imbalanced towards non-potable water and dataset potability validity needed to be checked by the author.
            '''
        )


# navigating pages
page = st.sidebar.selectbox(
    label='Select Page:',
    options=[HOME_PAGE, EDA_PAGE, MODEL_PAGE],
)

#============================ Page Routing ====================================
if page == HOME_PAGE:
    _render_home_page()
elif page == EDA_PAGE:
    eda.run()
else:
    # Only MODEL_PAGE remains among the selectbox options.
    model.run()
best_model.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f805e87ad5e8acd605e99c5ce9a5bf8f9c24cc16471f87d7eabcf52d9a81649d
3
+ size 181035
eda.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ '''
2
+ Achmad Dhani
3
+
4
+ Objective : Creating EDA page specifically to explain insights from EDA
5
+ '''
6
+
7
+ import streamlit as st
8
+ import pandas as pd
9
+ from PIL import Image
10
+
11
def _show_figure(container, path, caption, **image_kwargs):
    '''
    Open the image file at `path` and draw it inside `container`.

    `container` is anything exposing an `image` method (the `st` module or a
    column object); extra keyword arguments are forwarded to that method.
    '''
    container.image(Image.open(path), caption=caption, **image_kwargs)


def run():
    '''
    Function for EDA page.

    Reads `water_potability.csv`, shows its first and last ten rows, then
    displays the pre-rendered EDA figures (PNG files read from the working
    directory), each followed by an expander with its explanation.
    '''
    st.title('Exploration Data Analysis Section')

    df = pd.read_csv('water_potability.csv')  # reading CSV

    #============================= Display Data ===============================

    col1, col2 = st.columns(2)

    with col1.expander("View the top 10 entries of the original dataset"):
        st.table(df.head(10))

    with col2.expander("View the bottom 10 entries of the original dataset"):
        st.table(df.tail(10))

    #============================= Correlation =====================================
    st.subheader('Correlation Matrix Between The Chemicals')
    col3, col4 = st.columns(2)

    # 1st image
    col3.write('Pearsons Correlation Matrix')
    _show_figure(col3, 'pearsons.png', 'Figure 1 Pearsons Correlation Matrix of All Chemicals')

    # 2nd image
    col4.write('Spearman Correlation Matrix')
    _show_figure(col4, 'spearman.png', 'Figure 2 Spearman Correlation Matrix of All Chemicals')

    # explanation
    # NOTE(review): the second bullet pairs `Sulfate` with `Sulfate`; one of
    # the two names is presumably meant to be another column -- confirm
    # against the correlation figures before changing the text.
    with st.expander('Explanation'):
        st.caption(
            '''
            Based on both visualization, most of the variables do not have any relationship except for a few.

            - `Hardness` has a very positive low value with `ph` in spearman but close to 0 in pearsons. This suggests there might be a very weak positive non
            linear relationship.
            - `Sulfate` with `Solids` and with `Sulfate` has a very low negative value both in spearman and pearsons. This suggests there might be a very weak
            negative linear relationship.
            '''
        )

    #================================ ph ==========================================

    st.subheader('ph Values Distribution')
    _show_figure(st, 'ph.png', 'Figure 3 ph values distribution histogram', width=600)

    # explanation
    with st.expander('Explanation'):
        st.caption(
            '''
            - The water sample taken mostly has ph between `5-9`
            - The visualization also suggest a lot of data are in the range for drinkable water but doesn't mean that the water is drinkable.
            - This could mean most water samples that's taken could come from contaminated water bodies.
            '''
        )

    #================================ Missing Values ===============================
    st.subheader('Missing Values Visualizations')

    # missing plot
    st.write('Missing Values Bar Plot')
    _show_figure(st, 'missing_values.png', 'Figure 4 Bar plot of missing values of each column')

    # displaying explanation
    with st.expander('Explanation'):
        st.caption(
            '''
            **From Data Loading**

            - Total missing values in the dataset: 1434

            - Columns with missing values:

            `['ph', 'Sulfate', 'Trihalomethanes']`

            Number of missing values per column:
            >ph `491`
            >
            >Sulfate `781`
            >
            >Trihalomethanes `162`
            >
            >dtype: int64

            Missing data percentage (%):
            >ph `15`
            >
            >Sulfate `24`
            >
            >Trihalomethanes `5`
            '''
        )

    # missing matrix
    st.write('Missing Values Correlation Matrix')
    _show_figure(st, 'missing_corr.png', 'Figure 5 Correlation matrix of the missing values')

    # display explanation
    with st.expander('Explanation'):
        st.caption(
            '''
            - Based on the visualization above, the missing values have no correlation and can be considered the missingness is `completely random`
            - The missing values being random could be due to the person that took the water sample did not have the equipment to measure the chemical level.
            '''
        )

    #================================== PCA =============================

    st.subheader('Feature Importance')
    _show_figure(st, 'PCA.png', 'Figure 6 Linechart of explained variance ratio with number of components')

    # displaying explanation
    with st.expander('Explanation'):
        st.caption(
            '''
            - Based on the visualization of PCA, there is a linear relationship between number of components and the EVR cumulative
            - This suggest, each feature is important and retains unique information of the dataset
            '''
        )
missing_corr.png ADDED
missing_values.png ADDED
model.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ '''
2
+ Achmad Dhani
3
+
4
+ Objective : Creating a page for classification prediction
5
+ '''
6
+ import streamlit as st
7
+ import pandas as pd
8
+ import joblib
9
+ from PIL import Image
10
+
11
def run():
    '''
    This function is for running the page for predictions.

    Loads the pickled model from `best_model.pkl`, collects the nine water
    measurements from input widgets, echoes them back in a table and, when
    the `Predict` button is pressed, writes whether the model classifies the
    sample as drinkable (prediction equal to 1) or not.
    '''
    st.title('Is Your Water Drinkable ?')
    model = joblib.load("best_model.pkl")  # loading the model

    # Float bounds on purpose: with integer bounds the widget only accepts
    # whole numbers, but pH is a continuous measurement.
    ph = st.number_input(label='Input ph:', min_value=0.0, max_value=14.0)
    hard = st.slider('Hardness', min_value=40, max_value=320)
    solid = st.slider('Total dissolved solids', min_value=320, max_value=60000)
    chlo = st.slider('Chloramines Level', min_value=0.0, max_value=13.0, step=0.1)
    sulf = st.slider('Sulfate', min_value=130, max_value=480)
    cond = st.slider('Conductivity', min_value=180, max_value=750)
    organ = st.slider('Total Organic Carbon', min_value=2.0, max_value=28.0, step=0.1)
    thm = st.slider('Trihalomethanes (THM)', min_value=0, max_value=120)
    turb = st.slider('Turbidity', min_value=0.0, max_value=7.0, step=0.1)

    # data for predictions
    # NOTE(review): these keys are all lowercase, while the dataset column
    # names quoted on the EDA page are capitalised ('Sulfate',
    # 'Trihalomethanes'). Presumably the model was trained on renamed
    # columns -- verify against the training notebook.
    data_pred = {
        'ph': ph,
        'hardness': hard,
        'solids': solid,
        'chloramines': chlo,
        'sulfate': sulf,
        'conductivity': cond,
        'organic_carbon': organ,
        'trihalomethanes': thm,
        'turbidity': turb,
    }

    # data for display, derived from data_pred so both tables always list
    # the same parameters in the same order
    data_show = {
        'Parameters': list(data_pred),
        'Value': list(data_pred.values()),
    }

    st.write('The following table is the result of the data you have input : ')

    # display table
    st.table(pd.DataFrame(data_show))

    # single-row frame handed to the model
    df = pd.DataFrame([data_pred])

    # button
    if st.button(label='Predict'):
        y_pred_inf = model.predict(df)

        # predict() returns one label per input row; df has exactly one row,
        # so read that label explicitly instead of truth-testing the array.
        if y_pred_inf[0] == 1:
            st.write('The water is DRINKABLE')
        else:
            st.write('The water is NOT DRINKABLE')
67
+
pearsons.png ADDED
ph.png ADDED
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ pandas
2
+ numpy
3
+ scikit-learn
4
+ matplotlib
5
+ seaborn
6
+ joblib
spearman.png ADDED
water_potability.csv ADDED
The diff for this file is too large to render. See raw diff