File size: 6,030 Bytes
a5c5461
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
056351c
a5c5461
 
 
 
056351c
a5c5461
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
import joblib
import time
from datasets import load_dataset

# Dataset loading
@st.cache_data
def load_data():
    """Download the water-potability dataset and return its train split as a DataFrame.

    Wrapped in st.cache_data so the remote fetch happens once per session
    instead of on every Streamlit rerun.
    """
    return load_dataset("kheejay88/water_potability")["train"].to_pandas()

df = load_data()

# Data Cleaning
st.title("Water Potability Prediction(Supervised)")
st.write("This is a supervised machine learning application to predict water potability based on various variables. Note that the accuracy level of the models may not be ideal for practical usage. The essence is to demonstrate the performance comparison of different machine learning models on a particular dataset. To achieve better accuracy, further data preprocessing, feature engineering, hyperparameter tuning, etc., need to be performed.")
st.subheader("Dataset Overview")
st.write("Original Dataset:")
st.write(df.head())

# Impute missing values with per-column medians. numeric_only=True keeps this
# robust if a non-numeric column ever appears (pandas >= 2.0 raises otherwise),
# and plain assignment avoids the discouraged inplace= mutation pattern.
df = df.fillna(df.median(numeric_only=True))
st.write("Dataset after handling missing values:")
st.write(df.head())

# Data Visualization: pairwise correlations between all columns.
st.subheader("Data Visualization")
fig, ax = plt.subplots(figsize=(10, 5))
sns.heatmap(df.corr(), annot=True, cmap='coolwarm', ax=ax)
st.pyplot(fig)

# Feature / target split
X = df.drop("Potability", axis=1)
y = df["Potability"]

# Train-test split FIRST, stratified so both splits keep the original class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Handle class imbalance on the TRAINING data only. Oversampling before the
# split leaks information: synthetic test points are interpolated from
# training neighbours (and vice versa), inflating every reported accuracy.
# random_state makes the resampling reproducible across reruns.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

# Feature scaling: fit statistics on training data only, then apply to test.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Candidate classifiers. A fixed random_state on every stochastic estimator
# makes the cross-validation scores reproducible between reruns; max_iter is
# raised on LogisticRegression to avoid convergence warnings on this data.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=200, max_depth=20, random_state=42),
    "SVM": SVC(kernel='rbf', C=1, probability=True, random_state=42),
    "Decision Tree": DecisionTreeClassifier(max_depth=10, random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Naive Bayes": GaussianNB(),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42),
    "AdaBoost": AdaBoostClassifier(n_estimators=100, random_state=42),
    "Extra Trees": ExtraTreesClassifier(n_estimators=150, random_state=42)
}

st.subheader("Model Performance with Cross-Validation")
results = {}
loading_status = st.empty()

# Artifacts persisted on disk so training only ever runs once per deployment.
model_filename = "best_model.pkl"
model_name_filename = "best_model_name.txt"
model_accuracy_filename = "best_model_accuracy.txt"
all_model_accuracies_filename = "all_model_accuracies.txt"

required_artifacts = (model_filename, model_name_filename, model_accuracy_filename)
if all(os.path.exists(path) for path in required_artifacts):
    # Cached run: restore the winning model's name and score from disk.
    with open(model_name_filename, "r") as f:
        best_model_name = f.read().strip()
    with open(model_accuracy_filename, "r") as f:
        best_model_accuracy = float(f.read().strip())
    st.success(f"Best model ({best_model_name}) already exists. Skipping training.")

    # Re-display the per-model scores recorded during the original training.
    if os.path.exists(all_model_accuracies_filename):
        st.subheader("Saved Model Accuracies")
        with open(all_model_accuracies_filename, "r") as f:
            saved_accuracies = f.read()
        st.text(saved_accuracies)
else:
    loading_status.text("Training models...")
    time.sleep(1)  # Simulate loading time
    # Score every candidate with 5-fold CV, streaming each result to the UI
    # and to a text file that later reruns can display without retraining.
    with open(all_model_accuracies_filename, "w") as f:
        for name, model in models.items():
            accuracy = cross_val_score(model, X_train, y_train, cv=5).mean()
            results[name] = accuracy
            st.write(f"{name}: Accuracy = {accuracy:.2f}")
            f.write(f"{name}: {accuracy:.2f}\n")

    # Refit the top scorer on the full training split and persist everything.
    best_model_name = max(results, key=results.get)
    best_model_accuracy = results[best_model_name]
    best_model = models[best_model_name]
    best_model.fit(X_train, y_train)
    joblib.dump(best_model, model_filename)
    with open(model_name_filename, "w") as f:
        f.write(best_model_name)
    with open(model_accuracy_filename, "w") as f:
        f.write(str(best_model_accuracy))
    st.success(f"Best Model: {best_model_name} trained and saved!")

# Model Testing with User Input
st.subheader("Test the Model")
st.write("Tips: Based on the data correlation heatmap ph, hardness, and sulfate has a higher relation to each other. (POTABLE = lower ph || higher hardness || higher sulfate)")
user_input = {}

for col in X.columns:
    # Seed each widget's state with the column mean on first render so
    # chosen values persist across Streamlit reruns.
    if col not in st.session_state:
        st.session_state[col] = float(X[col].mean())

    # No explicit default value is passed: supplying both a `value` argument
    # and a pre-set session-state entry under the same key triggers
    # Streamlit's "default value but also set via Session State" warning.
    # The keyed session state alone supplies the initial value.
    user_input[col] = st.number_input(
        f"{col}",
        min_value=float(X[col].min()),
        max_value=float(X[col].max()),
        key=col,
    )

# Prediction button
if st.button("Predict Water Potability"):
    loading_status.text("Testing model...")

    # Restore the persisted best model and its metadata from disk.
    model = joblib.load(model_filename)
    with open(model_name_filename, "r") as f:
        best_model_name = f.read().strip()
    with open(model_accuracy_filename, "r") as f:
        best_model_accuracy = float(f.read().strip())

    # Scale the user's values exactly as the training features were scaled.
    features = scaler.transform(pd.DataFrame([user_input]))

    # Class 1 means drinkable water in this dataset.
    prediction = model.predict(features)[0]
    label = "Potable" if prediction == 1 else "Not Potable"

    # Show the outcome plus which model produced it, then clear the spinner.
    st.write(f"Predicted Potability: {label}")
    st.write(f"Model Used in Prediction: {best_model_name} (Accuracy: {best_model_accuracy:.2f})")
    loading_status.text("")