usernameiskheejay
committed on
Commit
·
4289ce9
1
Parent(s):
9f2720b
wp
Browse files- app.py +160 -0
- requirements.txt +8 -0
app.py
ADDED
|
@@ -0,0 +1,160 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import matplotlib.pyplot as plt
|
| 4 |
+
import seaborn as sns
|
| 5 |
+
import os
|
| 6 |
+
from sklearn.model_selection import train_test_split, cross_val_score
|
| 7 |
+
from sklearn.preprocessing import StandardScaler
|
| 8 |
+
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
|
| 9 |
+
from sklearn.linear_model import LogisticRegression
|
| 10 |
+
from sklearn.svm import SVC
|
| 11 |
+
from sklearn.tree import DecisionTreeClassifier
|
| 12 |
+
from sklearn.neighbors import KNeighborsClassifier
|
| 13 |
+
from sklearn.naive_bayes import GaussianNB
|
| 14 |
+
from sklearn.metrics import accuracy_score
|
| 15 |
+
from imblearn.over_sampling import SMOTE
|
| 16 |
+
import joblib
|
| 17 |
+
import time
|
| 18 |
+
|
| 19 |
+
# Load dataset
|
| 20 |
+
@st.cache_data
|
| 21 |
+
def load_data():
|
| 22 |
+
df = pd.read_csv("water_potability.csv")
|
| 23 |
+
return df
|
| 24 |
+
|
| 25 |
+
df = load_data()
|
| 26 |
+
|
| 27 |
+
# Data Cleaning
|
| 28 |
+
st.title("Water Potability Prediction(Supervised)")
|
| 29 |
+
st.write("This is a supervised machine learning application to predict water potability based on various variables. Note that the accuracy level of the models may not be ideal for practical usage. The essence is to demonstrate the performance comparison of different machine learning models on a particular dataset. To achieve better accuracy, further data preprocessing, feature engineering, hyperparameter tuning, etc., need to be performed.")
|
| 30 |
+
st.subheader("Dataset Overview")
|
| 31 |
+
st.write("Original Dataset:")
|
| 32 |
+
st.write(df.head())
|
| 33 |
+
|
| 34 |
+
df.fillna(df.median(), inplace=True)
|
| 35 |
+
st.write("Dataset after handling missing values:")
|
| 36 |
+
st.write(df.head())
|
| 37 |
+
|
| 38 |
+
# Data Visualization
|
| 39 |
+
st.subheader("Data Visualization")
|
| 40 |
+
fig, ax = plt.subplots(figsize=(10, 5))
|
| 41 |
+
sns.heatmap(df.corr(), annot=True, cmap='coolwarm', ax=ax)
|
| 42 |
+
st.pyplot(fig)
|
| 43 |
+
|
| 44 |
+
# Feature Importance Analysis
|
| 45 |
+
X = df.drop("Potability", axis=1)
|
| 46 |
+
y = df["Potability"]
|
| 47 |
+
|
| 48 |
+
# Handle class imbalance
|
| 49 |
+
smote = SMOTE()
|
| 50 |
+
X, y = smote.fit_resample(X, y)
|
| 51 |
+
|
| 52 |
+
# Train-test split
|
| 53 |
+
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
|
| 54 |
+
|
| 55 |
+
# Feature scaling
|
| 56 |
+
scaler = StandardScaler()
|
| 57 |
+
X_train = scaler.fit_transform(X_train)
|
| 58 |
+
X_test = scaler.transform(X_test)
|
| 59 |
+
|
| 60 |
+
# Define models
|
| 61 |
+
models = {
|
| 62 |
+
"Logistic Regression": LogisticRegression(),
|
| 63 |
+
"Random Forest": RandomForestClassifier(n_estimators=200, max_depth=20),
|
| 64 |
+
"SVM": SVC(kernel='rbf', C=1, probability=True),
|
| 65 |
+
"Decision Tree": DecisionTreeClassifier(max_depth=10),
|
| 66 |
+
"KNN": KNeighborsClassifier(n_neighbors=5),
|
| 67 |
+
"Naive Bayes": GaussianNB(),
|
| 68 |
+
"Gradient Boosting": GradientBoostingClassifier(n_estimators=100, learning_rate=0.1),
|
| 69 |
+
"AdaBoost": AdaBoostClassifier(n_estimators=100),
|
| 70 |
+
"Extra Trees": ExtraTreesClassifier(n_estimators=150)
|
| 71 |
+
}
|
| 72 |
+
|
| 73 |
+
st.subheader("Model Performance with Cross-Validation")
|
| 74 |
+
results = {}
|
| 75 |
+
loading_status = st.empty()
|
| 76 |
+
|
| 77 |
+
# File names for persistence
|
| 78 |
+
model_filename = "best_model.pkl"
|
| 79 |
+
model_name_filename = "best_model_name.txt"
|
| 80 |
+
model_accuracy_filename = "best_model_accuracy.txt"
|
| 81 |
+
all_model_accuracies_filename = "all_model_accuracies.txt"
|
| 82 |
+
|
| 83 |
+
if os.path.exists(model_filename) and os.path.exists(model_name_filename) and os.path.exists(model_accuracy_filename):
|
| 84 |
+
with open(model_name_filename, "r") as f:
|
| 85 |
+
best_model_name = f.read().strip()
|
| 86 |
+
with open(model_accuracy_filename, "r") as f:
|
| 87 |
+
best_model_accuracy = float(f.read().strip())
|
| 88 |
+
st.success(f"Best model ({best_model_name}) already exists. Skipping training.")
|
| 89 |
+
|
| 90 |
+
# Display saved model accuracies
|
| 91 |
+
if os.path.exists(all_model_accuracies_filename):
|
| 92 |
+
st.subheader("Saved Model Accuracies")
|
| 93 |
+
with open(all_model_accuracies_filename, "r") as f:
|
| 94 |
+
saved_accuracies = f.read()
|
| 95 |
+
st.text(saved_accuracies)
|
| 96 |
+
else:
|
| 97 |
+
loading_status.text("Training models...")
|
| 98 |
+
time.sleep(1) # Simulate loading time
|
| 99 |
+
with open(all_model_accuracies_filename, "w") as f:
|
| 100 |
+
for name, model in models.items():
|
| 101 |
+
scores = cross_val_score(model, X_train, y_train, cv=5)
|
| 102 |
+
accuracy = scores.mean()
|
| 103 |
+
results[name] = accuracy
|
| 104 |
+
st.write(f"{name}: Accuracy = {accuracy:.2f}")
|
| 105 |
+
f.write(f"{name}: {accuracy:.2f}\n")
|
| 106 |
+
|
| 107 |
+
# Select and train the best model
|
| 108 |
+
best_model_name = max(results, key=results.get)
|
| 109 |
+
best_model_accuracy = results[best_model_name]
|
| 110 |
+
best_model = models[best_model_name]
|
| 111 |
+
best_model.fit(X_train, y_train)
|
| 112 |
+
joblib.dump(best_model, model_filename)
|
| 113 |
+
with open(model_name_filename, "w") as f:
|
| 114 |
+
f.write(best_model_name)
|
| 115 |
+
with open(model_accuracy_filename, "w") as f:
|
| 116 |
+
f.write(str(best_model_accuracy))
|
| 117 |
+
st.success(f"Best Model: {best_model_name} trained and saved!")
|
| 118 |
+
|
| 119 |
+
# Model Testing with User Input
|
| 120 |
+
st.subheader("Test the Model")
|
| 121 |
+
st.write("Tips: Based on the data correlation heatmap ph, hardness, and sulfate has a higher relation to each other. (POTABLE = lower ph || higher hardness || higher sulfate)")
|
| 122 |
+
user_input = {}
|
| 123 |
+
|
| 124 |
+
for col in X.columns:
|
| 125 |
+
# Persist user input values across interactions
|
| 126 |
+
if col not in st.session_state:
|
| 127 |
+
st.session_state[col] = float(X[col].mean())
|
| 128 |
+
|
| 129 |
+
user_input[col] = st.number_input(
|
| 130 |
+
f"{col}",
|
| 131 |
+
float(X[col].min()),
|
| 132 |
+
float(X[col].max()),
|
| 133 |
+
st.session_state[col],
|
| 134 |
+
key=col
|
| 135 |
+
)
|
| 136 |
+
|
| 137 |
+
# Prediction button
|
| 138 |
+
if st.button("Predict Water Potability"):
|
| 139 |
+
loading_status.text("Testing model...")
|
| 140 |
+
|
| 141 |
+
# Load best model
|
| 142 |
+
model = joblib.load(model_filename)
|
| 143 |
+
|
| 144 |
+
with open(model_name_filename, "r") as f:
|
| 145 |
+
best_model_name = f.read().strip()
|
| 146 |
+
with open(model_accuracy_filename, "r") as f:
|
| 147 |
+
best_model_accuracy = float(f.read().strip())
|
| 148 |
+
|
| 149 |
+
# Convert user input to DataFrame
|
| 150 |
+
input_df = pd.DataFrame([user_input])
|
| 151 |
+
input_df = scaler.transform(input_df) # Apply scaling
|
| 152 |
+
|
| 153 |
+
# Predict
|
| 154 |
+
prediction = model.predict(input_df)[0]
|
| 155 |
+
label = "Potable" if prediction == 1 else "Not Potable"
|
| 156 |
+
|
| 157 |
+
# Display results
|
| 158 |
+
st.write(f"Predicted Potability: {label}")
|
| 159 |
+
st.write(f"Model Used in Prediction: {best_model_name} (Accuracy: {best_model_accuracy:.2f})")
|
| 160 |
+
loading_status.text("")
|
requirements.txt
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
streamlit
|
| 2 |
+
pandas
|
| 3 |
+
numpy
|
| 4 |
+
matplotlib
|
| 5 |
+
seaborn
|
| 6 |
+
scikit-learn
|
| 7 |
+
joblib
|
| 8 |
+
imbalanced-learn
|