usernameiskheejay committed on
Commit
4289ce9
·
1 Parent(s): 9f2720b
Files changed (2) hide show
  1. app.py +160 -0
  2. requirements.txt +8 -0
app.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import matplotlib.pyplot as plt
4
+ import seaborn as sns
5
+ import os
6
+ from sklearn.model_selection import train_test_split, cross_val_score
7
+ from sklearn.preprocessing import StandardScaler
8
+ from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
9
+ from sklearn.linear_model import LogisticRegression
10
+ from sklearn.svm import SVC
11
+ from sklearn.tree import DecisionTreeClassifier
12
+ from sklearn.neighbors import KNeighborsClassifier
13
+ from sklearn.naive_bayes import GaussianNB
14
+ from sklearn.metrics import accuracy_score
15
+ from imblearn.over_sampling import SMOTE
16
+ import joblib
17
+ import time
18
+
19
# Load dataset
@st.cache_data
def load_data():
    """Read ``water_potability.csv`` from the working directory into a DataFrame.

    The function is wrapped in ``st.cache_data``, so Streamlit can reuse the
    parsed result instead of re-reading the file on each script rerun.
    """
    return pd.read_csv("water_potability.csv")
24
+
25
df = load_data()

# Page header and a preview of the raw data.
st.title("Water Potability Prediction(Supervised)")
st.write("This is a supervised machine learning application to predict water potability based on various variables. Note that the accuracy level of the models may not be ideal for practical usage. The essence is to demonstrate the performance comparison of different machine learning models on a particular dataset. To achieve better accuracy, further data preprocessing, feature engineering, hyperparameter tuning, etc., need to be performed.")
st.subheader("Dataset Overview")
st.write("Original Dataset:")
st.write(df.head())

# Data Cleaning: fill every missing value with the median of its column.
column_medians = df.median()
df.fillna(column_medians, inplace=True)
st.write("Dataset after handling missing values:")
st.write(df.head())

# Data Visualization: annotated correlation heatmap over all columns.
st.subheader("Data Visualization")
correlations = df.corr()
heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 5))
sns.heatmap(correlations, annot=True, cmap='coolwarm', ax=heatmap_ax)
st.pyplot(heatmap_fig)
43
+
44
# Separate the feature columns from the target label.
# X is left un-resampled on purpose: the input widgets further down derive
# their ranges and defaults from the real observations in X.
X = df.drop("Potability", axis=1)
y = df["Potability"]

# Train-test split
# Split BEFORE oversampling. Resampling the full dataset first lets SMOTE
# synthesize points from rows that later land in the test partition, which
# leaks held-out information into training. Stratify so both partitions keep
# the original class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Handle class imbalance (training partition only).
# A fixed seed keeps the resampled training set, and therefore the scaler
# fitted below, identical across Streamlit reruns, so it stays consistent
# with a model that was trained and saved on an earlier run.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

# Feature scaling: fit on the training data only, then apply to the test data.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
59
+
60
# Candidate classifiers, keyed by the display name used in the UI and in the
# persisted result files. Insertion order is the evaluation order.
models = dict(
    [
        ("Logistic Regression", LogisticRegression()),
        ("Random Forest", RandomForestClassifier(n_estimators=200, max_depth=20)),
        ("SVM", SVC(kernel='rbf', C=1, probability=True)),
        ("Decision Tree", DecisionTreeClassifier(max_depth=10)),
        ("KNN", KNeighborsClassifier(n_neighbors=5)),
        ("Naive Bayes", GaussianNB()),
        ("Gradient Boosting", GradientBoostingClassifier(n_estimators=100, learning_rate=0.1)),
        ("AdaBoost", AdaBoostClassifier(n_estimators=100)),
        ("Extra Trees", ExtraTreesClassifier(n_estimators=150)),
    ]
)
72
+
73
st.subheader("Model Performance with Cross-Validation")
results = {}
loading_status = st.empty()

# File names for persistence
model_filename = "best_model.pkl"
model_name_filename = "best_model_name.txt"
model_accuracy_filename = "best_model_accuracy.txt"
all_model_accuracies_filename = "all_model_accuracies.txt"

# Training is skipped only when the model, its name and its accuracy are all
# already on disk.
required_artifacts = (model_filename, model_name_filename, model_accuracy_filename)

if all(os.path.exists(path) for path in required_artifacts):
    with open(model_name_filename, "r") as name_file:
        best_model_name = name_file.read().strip()
    with open(model_accuracy_filename, "r") as accuracy_file:
        best_model_accuracy = float(accuracy_file.read().strip())
    st.success(f"Best model ({best_model_name}) already exists. Skipping training.")

    # Display saved model accuracies
    if os.path.exists(all_model_accuracies_filename):
        st.subheader("Saved Model Accuracies")
        with open(all_model_accuracies_filename, "r") as report_file:
            st.text(report_file.read())
else:
    loading_status.text("Training models...")
    time.sleep(1)  # Simulate loading time

    # Score every candidate with 5-fold cross-validation, showing each mean
    # accuracy in the app and appending it to the report file.
    with open(all_model_accuracies_filename, "w") as report_file:
        for model_label, candidate in models.items():
            mean_accuracy = cross_val_score(candidate, X_train, y_train, cv=5).mean()
            results[model_label] = mean_accuracy
            st.write(f"{model_label}: Accuracy = {mean_accuracy:.2f}")
            report_file.write(f"{model_label}: {mean_accuracy:.2f}\n")

    # Select and train the best model (first one reaching the top mean accuracy).
    best_model_name, best_model_accuracy = max(
        results.items(), key=lambda entry: entry[1]
    )
    best_model = models[best_model_name]
    best_model.fit(X_train, y_train)

    # Persist the fitted model together with its name and accuracy.
    joblib.dump(best_model, model_filename)
    with open(model_name_filename, "w") as name_file:
        name_file.write(best_model_name)
    with open(model_accuracy_filename, "w") as accuracy_file:
        accuracy_file.write(str(best_model_accuracy))
    st.success(f"Best Model: {best_model_name} trained and saved!")
118
+
119
# Model Testing with User Input
st.subheader("Test the Model")
st.write("Tips: Based on the data correlation heatmap ph, hardness, and sulfate has a higher relation to each other. (POTABLE = lower ph || higher hardness || higher sulfate)")
user_input = {}

# One numeric input per feature column, bounded by the observed range of that
# column in X.
for col in X.columns:
    # Seed the starting value (the column mean) once. After that the keyed
    # widget keeps its own value in Session State across reruns.
    if col not in st.session_state:
        st.session_state[col] = float(X[col].mean())

    # `value` is deliberately not passed: giving a widget a default while its
    # key has also been set through Session State makes Streamlit show a
    # duplicate-default warning. Session State is the single source of truth.
    user_input[col] = st.number_input(
        f"{col}",
        min_value=float(X[col].min()),
        max_value=float(X[col].max()),
        key=col,
    )
136
+
137
# Prediction button
if st.button("Predict Water Potability"):
    loading_status.text("Testing model...")

    # Load the persisted best model and its recorded name and accuracy.
    model = joblib.load(model_filename)
    with open(model_name_filename, "r") as name_file:
        best_model_name = name_file.read().strip()
    with open(model_accuracy_filename, "r") as accuracy_file:
        best_model_accuracy = float(accuracy_file.read().strip())

    # Build a one-row frame from the widget values and scale it with the
    # scaler fitted on the training data.
    raw_row = pd.DataFrame([user_input])
    scaled_row = scaler.transform(raw_row)

    # Predict: class 1 is reported as potable, anything else as not potable.
    prediction = model.predict(scaled_row)[0]
    if prediction == 1:
        label = "Potable"
    else:
        label = "Not Potable"

    # Display results
    st.write(f"Predicted Potability: {label}")
    st.write(f"Model Used in Prediction: {best_model_name} (Accuracy: {best_model_accuracy:.2f})")
    loading_status.text("")
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ pandas
3
+ numpy
4
+ matplotlib
5
+ seaborn
6
+ scikit-learn
7
+ joblib
8
+ imbalanced-learn