|
|
import streamlit as st |
|
|
import pandas as pd |
|
|
import matplotlib.pyplot as plt |
|
|
import seaborn as sns |
|
|
import os |
|
|
from sklearn.model_selection import train_test_split, cross_val_score |
|
|
from sklearn.preprocessing import StandardScaler |
|
|
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier |
|
|
from sklearn.linear_model import LogisticRegression |
|
|
from sklearn.svm import SVC |
|
|
from sklearn.tree import DecisionTreeClassifier |
|
|
from sklearn.neighbors import KNeighborsClassifier |
|
|
from sklearn.naive_bayes import GaussianNB |
|
|
from sklearn.metrics import accuracy_score |
|
|
from imblearn.over_sampling import SMOTE |
|
|
import joblib |
|
|
import time |
|
|
from datasets import load_dataset |
|
|
|
|
|
|
|
|
@st.cache_data
def load_data():
    """Fetch the water-potability dataset from the Hub and return its train split as a DataFrame."""
    dataset = load_dataset("kheejay88/water_potability")
    return dataset["train"].to_pandas()
|
|
|
|
|
# Materialise the (cached) dataset for the rest of the script.
df = load_data()

st.title("Water Potability Prediction(Supervised)")

# Long-form description of what this demo is (and is not) meant to show.
intro_text = "This is a supervised machine learning application to predict water potability based on various variables. Note that the accuracy level of the models may not be ideal for practical usage. The essence is to demonstrate the performance comparison of different machine learning models on a particular dataset. To achieve better accuracy, further data preprocessing, feature engineering, hyperparameter tuning, etc., need to be performed."
st.write(intro_text)

st.subheader("Dataset Overview")
st.write("Original Dataset:")
st.write(df.head())
|
|
|
|
|
# Impute missing values with each column's median. Reassignment is used
# instead of `inplace=True` (discouraged pandas idiom), and
# `numeric_only=True` makes the median computation explicit/robust.
# NOTE(review): the medians are computed on the FULL dataset before the
# train/test split, so a little test-set information leaks into the
# imputation — fine for a demo, but fit the imputer on the training
# split only for a rigorous evaluation.
df = df.fillna(df.median(numeric_only=True))

st.write("Dataset after handling missing values:")
st.write(df.head())
|
|
|
|
|
|
|
|
st.subheader("Data Visualization")

# Annotated correlation heatmap across all columns, rendered into the app.
correlations = df.corr()
heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 5))
sns.heatmap(correlations, annot=True, cmap='coolwarm', ax=heatmap_ax)
st.pyplot(heatmap_fig)
|
|
|
|
|
|
|
|
# Separate features (X) from the binary target (y).
X = df.drop("Potability", axis=1)
y = df["Potability"]

# Split BEFORE oversampling. The original code ran SMOTE on the whole
# dataset and then split, which leaks information: synthetic training
# samples are interpolated from test rows (and test rows themselves may
# be synthetic), inflating every evaluation score. `stratify=y`
# preserves the class ratio in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Balance the classes in the training split only; random_state pinned
# so the resampled set is reproducible across app reruns.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)
|
|
|
|
|
|
|
|
# Standardise features to zero mean / unit variance. The scaler is fit
# on the training split only and the same statistics are reused for the
# test split (and later for user input at prediction time).
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
|
|
|
|
|
|
|
|
# Candidate classifiers to compare. `random_state` is pinned on every
# stochastic estimator so cross-validation scores — and therefore the
# persisted "best model" choice — are reproducible across reruns
# (previously each rerun could crown a different winner). KNN and
# GaussianNB are deterministic and take no random_state.
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(n_estimators=200, max_depth=20, random_state=42),
    "SVM": SVC(kernel='rbf', C=1, probability=True, random_state=42),
    "Decision Tree": DecisionTreeClassifier(max_depth=10, random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Naive Bayes": GaussianNB(),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42),
    "AdaBoost": AdaBoostClassifier(n_estimators=100, random_state=42),
    "Extra Trees": ExtraTreesClassifier(n_estimators=150, random_state=42)
}
|
|
|
|
|
st.subheader("Model Performance with Cross-Validation")

# Placeholder reused for transient status text, and the accumulator for
# per-model cross-validation accuracies.
loading_status = st.empty()
results = {}

# Artifacts persisted between runs so training can be skipped next time.
model_filename = "best_model.pkl"
model_name_filename = "best_model_name.txt"
model_accuracy_filename = "best_model_accuracy.txt"
all_model_accuracies_filename = "all_model_accuracies.txt"
|
|
|
|
|
# If a best model (plus its name and CV accuracy) was persisted by an
# earlier run, restore the metadata and skip the training phase entirely.
if os.path.exists(model_filename) and os.path.exists(model_name_filename) and os.path.exists(model_accuracy_filename):
    # Read back the winning model's name and accuracy from sidecar files.
    with open(model_name_filename, "r") as f:
        best_model_name = f.read().strip()
    with open(model_accuracy_filename, "r") as f:
        best_model_accuracy = float(f.read().strip())
    st.success(f"Best model ({best_model_name}) already exists. Skipping training.")

    # Re-display the per-model accuracies recorded during the original
    # training run, if that file is still present.
    if os.path.exists(all_model_accuracies_filename):
        st.subheader("Saved Model Accuracies")
        with open(all_model_accuracies_filename, "r") as f:
            saved_accuracies = f.read()
        st.text(saved_accuracies)
else:
    loading_status.text("Training models...")
    time.sleep(1)  # brief pause so the status message is visible
    # Score every candidate with 5-fold cross-validation on the training
    # split, logging each mean accuracy both to the app and to a file.
    with open(all_model_accuracies_filename, "w") as f:
        for name, model in models.items():
            scores = cross_val_score(model, X_train, y_train, cv=5)
            accuracy = scores.mean()
            results[name] = accuracy
            st.write(f"{name}: Accuracy = {accuracy:.2f}")
            f.write(f"{name}: {accuracy:.2f}\n")

    # Pick the model with the highest mean CV accuracy, refit it on the
    # full training split, and persist model + metadata for later runs.
    best_model_name = max(results, key=results.get)
    best_model_accuracy = results[best_model_name]
    best_model = models[best_model_name]
    best_model.fit(X_train, y_train)
    joblib.dump(best_model, model_filename)
    with open(model_name_filename, "w") as f:
        f.write(best_model_name)
    with open(model_accuracy_filename, "w") as f:
        f.write(str(best_model_accuracy))
    st.success(f"Best Model: {best_model_name} trained and saved!")
|
|
|
|
|
|
|
|
st.subheader("Test the Model")

st.write("Tips: Based on the data correlation heatmap ph, hardness, and sulfate has a higher relation to each other. (POTABLE = lower ph || higher hardness || higher sulfate)")

# One number_input per feature, bounded by the column's observed range
# and defaulting to its mean. The previous code pre-seeded
# st.session_state[col] AND passed that value positionally alongside
# key=col, which triggers Streamlit's "value set via both the widget
# default and the Session State API" warning. Passing the mean as the
# keyword `value` and letting `key` own the widget state is equivalent
# (the default only applies on first render; state persists via the key)
# without the warning.
user_input = {}
for col in X.columns:
    user_input[col] = st.number_input(
        f"{col}",
        min_value=float(X[col].min()),
        max_value=float(X[col].max()),
        value=float(X[col].mean()),
        key=col,
    )
|
|
|
|
|
|
|
|
if st.button("Predict Water Potability"):
    loading_status.text("Testing model...")

    # Restore the persisted best model and its metadata from disk.
    trained_model = joblib.load(model_filename)
    with open(model_name_filename, "r") as f:
        best_model_name = f.read().strip()
    with open(model_accuracy_filename, "r") as f:
        best_model_accuracy = float(f.read().strip())

    # Apply the training-set scaler to the user's raw feature values.
    features = pd.DataFrame([user_input])
    features = scaler.transform(features)

    # Class 1 means potable; anything else is reported as not potable.
    outcome = trained_model.predict(features)[0]
    label = "Potable" if outcome == 1 else "Not Potable"

    st.write(f"Predicted Potability: {label}")
    st.write(f"Model Used in Prediction: {best_model_name} (Accuracy: {best_model_accuracy:.2f})")
    loading_status.text("")
|
|
|