# Source: kheejay88's Hugging Face Space — "Update app.py", commit 056351c (verified)
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
import joblib
import time
from datasets import load_dataset
# Load dataset
@st.cache_data
def load_data():
    """Fetch the water-potability dataset from the Hub as a pandas DataFrame.

    Cached by Streamlit so the download happens once per session, not on
    every rerun of the script.
    """
    # The "train" split of the HF dataset holds all rows; convert to pandas.
    return load_dataset("kheejay88/water_potability")["train"].to_pandas()


df = load_data()
# Data Cleaning
st.title("Water Potability Prediction(Supervised)")
st.write("This is a supervised machine learning application to predict water potability based on various variables. Note that the accuracy level of the models may not be ideal for practical usage. The essence is to demonstrate the performance comparison of different machine learning models on a particular dataset. To achieve better accuracy, further data preprocessing, feature engineering, hyperparameter tuning, etc., need to be performed.")

st.subheader("Dataset Overview")
st.write("Original Dataset:")
st.write(df.head())

# Impute missing values with each column's median (robust to the skewed
# chemistry measurements), then show the cleaned head for comparison.
df = df.fillna(df.median())
st.write("Dataset after handling missing values:")
st.write(df.head())

# Data Visualization: pairwise correlation heatmap of all columns.
st.subheader("Data Visualization")
fig, ax = plt.subplots(figsize=(10, 5))
sns.heatmap(df.corr(), annot=True, cmap='coolwarm', ax=ax)
st.pyplot(fig)
# Feature Importance Analysis
X = df.drop("Potability", axis=1)
y = df["Potability"]

# Train-test split FIRST, so the held-out test set stays untouched real data.
# stratify=y keeps the class ratio identical in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Handle class imbalance on the TRAINING data only. Oversampling before the
# split leaks synthetic near-copies of training rows into the test set and
# inflates reported accuracy. random_state makes the resampling reproducible,
# so the scaler fitted here matches the one used when the persisted model was
# trained on a previous run.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

# Feature scaling: fit on training data only, then apply to the test set.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Define models: candidate classifiers keyed by display name.
# random_state pins every stochastic learner so cross-validation scores are
# reproducible and agree with the accuracies persisted by earlier runs;
# max_iter=1000 keeps LogisticRegression from hitting its iteration cap.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=200, max_depth=20, random_state=42),
    "SVM": SVC(kernel='rbf', C=1, probability=True, random_state=42),
    "Decision Tree": DecisionTreeClassifier(max_depth=10, random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Naive Bayes": GaussianNB(),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42),
    "AdaBoost": AdaBoostClassifier(n_estimators=100, random_state=42),
    "Extra Trees": ExtraTreesClassifier(n_estimators=150, random_state=42)
}
st.subheader("Model Performance with Cross-Validation")
results = {}
loading_status = st.empty()

# File names for persistence: the fitted best model, its display name, its
# cross-validated accuracy, and the accuracies of every candidate model.
model_filename = "best_model.pkl"
model_name_filename = "best_model_name.txt"
model_accuracy_filename = "best_model_accuracy.txt"
all_model_accuracies_filename = "all_model_accuracies.txt"

# If a previous run already selected and saved a best model, skip retraining
# and just report the persisted name/accuracy.
if os.path.exists(model_filename) and os.path.exists(model_name_filename) and os.path.exists(model_accuracy_filename):
    with open(model_name_filename, "r") as f:
        best_model_name = f.read().strip()
    with open(model_accuracy_filename, "r") as f:
        best_model_accuracy = float(f.read().strip())
    st.success(f"Best model ({best_model_name}) already exists. Skipping training.")
    # Display saved model accuracies
    if os.path.exists(all_model_accuracies_filename):
        st.subheader("Saved Model Accuracies")
        with open(all_model_accuracies_filename, "r") as f:
            saved_accuracies = f.read()
        st.text(saved_accuracies)
else:
    loading_status.text("Training models...")
    time.sleep(1)  # Simulate loading time
    # Score every candidate with 5-fold CV on the (scaled, resampled) training
    # data, streaming each model's mean accuracy to both the UI and the file.
    with open(all_model_accuracies_filename, "w") as f:
        for name, model in models.items():
            scores = cross_val_score(model, X_train, y_train, cv=5)
            accuracy = scores.mean()
            results[name] = accuracy
            st.write(f"{name}: Accuracy = {accuracy:.2f}")
            f.write(f"{name}: {accuracy:.2f}\n")
    # Select and train the best model (highest mean CV accuracy), then fit it
    # on the full training split.
    best_model_name = max(results, key=results.get)
    best_model_accuracy = results[best_model_name]
    best_model = models[best_model_name]
    best_model.fit(X_train, y_train)
    # Persist the fitted model and its metadata for later runs / predictions.
    joblib.dump(best_model, model_filename)
    with open(model_name_filename, "w") as f:
        f.write(best_model_name)
    with open(model_accuracy_filename, "w") as f:
        f.write(str(best_model_accuracy))
    st.success(f"Best Model: {best_model_name} trained and saved!")
# Model Testing with User Input
st.subheader("Test the Model")
st.write("Tips: Based on the data correlation heatmap ph, hardness, and sulfate has a higher relation to each other. (POTABLE = lower ph || higher hardness || higher sulfate)")

# One number_input per feature, bounded by the feature's observed range and
# defaulting to its mean. Passing key=col makes Streamlit persist the entered
# value in st.session_state across reruns on its own — pre-seeding
# st.session_state[col] AND passing that value as the widget default triggers
# Streamlit's "value set via the Session State API" warning on every rerun,
# so the manual seeding is dropped.
user_input = {}
for col in X.columns:
    user_input[col] = st.number_input(
        f"{col}",
        min_value=float(X[col].min()),
        max_value=float(X[col].max()),
        value=float(X[col].mean()),
        key=col,
    )
# Prediction button
if st.button("Predict Water Potability"):
    loading_status.text("Testing model...")

    # Reload the persisted best model and its metadata from disk.
    model = joblib.load(model_filename)
    with open(model_name_filename, "r") as f:
        best_model_name = f.read().strip()
    with open(model_accuracy_filename, "r") as f:
        best_model_accuracy = float(f.read().strip())

    # Build a one-row frame from the widget values and scale it exactly as
    # the training data was scaled.
    features = scaler.transform(pd.DataFrame([user_input]))

    # Class 1 means drinkable, class 0 means not.
    label = "Potable" if model.predict(features)[0] == 1 else "Not Potable"

    # Display results
    st.write(f"Predicted Potability: {label}")
    st.write(f"Model Used in Prediction: {best_model_name} (Accuracy: {best_model_accuracy:.2f})")
    loading_status.text("")