File size: 6,030 Bytes
a5c5461
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
056351c
a5c5461
 
 
 
056351c
a5c5461
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
import joblib
import time
from datasets import load_dataset

# Dataset loading
@st.cache_data
def load_data():
    """Download the water-potability dataset and return its train split as a DataFrame.

    Wrapped in st.cache_data so the remote fetch happens once per session
    instead of on every Streamlit rerun.
    """
    return load_dataset("kheejay88/water_potability")["train"].to_pandas()

df = load_data()

# Data Cleaning
st.title("Water Potability Prediction(Supervised)")
st.write("This is a supervised machine learning application to predict water potability based on various variables. Note that the accuracy level of the models may not be ideal for practical usage. The essence is to demonstrate the performance comparison of different machine learning models on a particular dataset. To achieve better accuracy, further data preprocessing, feature engineering, hyperparameter tuning, etc., need to be performed.")
st.subheader("Dataset Overview")
st.write("Original Dataset:")
st.write(df.head())

# Impute missing values with per-column medians. numeric_only=True keeps this
# robust if a non-numeric column ever appears (pandas >= 2.0 raises otherwise),
# and plain assignment avoids the discouraged inplace= mutation pattern.
df = df.fillna(df.median(numeric_only=True))
st.write("Dataset after handling missing values:")
st.write(df.head())

# Data Visualization: pairwise correlations between all columns.
st.subheader("Data Visualization")
fig, ax = plt.subplots(figsize=(10, 5))
sns.heatmap(df.corr(), annot=True, cmap='coolwarm', ax=ax)
st.pyplot(fig)

# Feature / target split
X = df.drop("Potability", axis=1)
y = df["Potability"]

# Train-test split FIRST, stratified so both splits keep the original class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Handle class imbalance on the TRAINING data only. Oversampling before the
# split leaks information: synthetic test points are interpolated from
# training neighbours (and vice versa), inflating every reported accuracy.
# random_state makes the resampling reproducible across reruns.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

# Feature scaling: fit statistics on training data only, then apply to test.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Candidate classifiers. A fixed random_state on every stochastic estimator
# makes the cross-validation scores reproducible between reruns; max_iter is
# raised on LogisticRegression to avoid convergence warnings on this data.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=200, max_depth=20, random_state=42),
    "SVM": SVC(kernel='rbf', C=1, probability=True, random_state=42),
    "Decision Tree": DecisionTreeClassifier(max_depth=10, random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Naive Bayes": GaussianNB(),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42),
    "AdaBoost": AdaBoostClassifier(n_estimators=100, random_state=42),
    "Extra Trees": ExtraTreesClassifier(n_estimators=150, random_state=42)
}

st.subheader("Model Performance with Cross-Validation")
results = {}
loading_status = st.empty()

# Artifacts persisted on disk so training only ever runs once per deployment.
model_filename = "best_model.pkl"
model_name_filename = "best_model_name.txt"
model_accuracy_filename = "best_model_accuracy.txt"
all_model_accuracies_filename = "all_model_accuracies.txt"

required_artifacts = (model_filename, model_name_filename, model_accuracy_filename)
if all(os.path.exists(path) for path in required_artifacts):
    # Cached run: restore the winning model's name and score from disk.
    with open(model_name_filename, "r") as f:
        best_model_name = f.read().strip()
    with open(model_accuracy_filename, "r") as f:
        best_model_accuracy = float(f.read().strip())
    st.success(f"Best model ({best_model_name}) already exists. Skipping training.")

    # Re-display the per-model scores recorded during the original training.
    if os.path.exists(all_model_accuracies_filename):
        st.subheader("Saved Model Accuracies")
        with open(all_model_accuracies_filename, "r") as f:
            saved_accuracies = f.read()
        st.text(saved_accuracies)
else:
    loading_status.text("Training models...")
    time.sleep(1)  # Simulate loading time
    # Score every candidate with 5-fold CV, streaming each result to the UI
    # and to a text file that later reruns can display without retraining.
    with open(all_model_accuracies_filename, "w") as f:
        for name, model in models.items():
            accuracy = cross_val_score(model, X_train, y_train, cv=5).mean()
            results[name] = accuracy
            st.write(f"{name}: Accuracy = {accuracy:.2f}")
            f.write(f"{name}: {accuracy:.2f}\n")

    # Refit the top scorer on the full training split and persist everything.
    best_model_name = max(results, key=results.get)
    best_model_accuracy = results[best_model_name]
    best_model = models[best_model_name]
    best_model.fit(X_train, y_train)
    joblib.dump(best_model, model_filename)
    with open(model_name_filename, "w") as f:
        f.write(best_model_name)
    with open(model_accuracy_filename, "w") as f:
        f.write(str(best_model_accuracy))
    st.success(f"Best Model: {best_model_name} trained and saved!")

# Model Testing with User Input
st.subheader("Test the Model")
st.write("Tips: Based on the data correlation heatmap ph, hardness, and sulfate has a higher relation to each other. (POTABLE = lower ph || higher hardness || higher sulfate)")
user_input = {}

for col in X.columns:
    # Seed each widget's state with the column mean on first render so
    # chosen values persist across Streamlit reruns.
    if col not in st.session_state:
        st.session_state[col] = float(X[col].mean())

    # No explicit default value is passed: supplying both a `value` argument
    # and a pre-set session-state entry under the same key triggers
    # Streamlit's "default value but also set via Session State" warning.
    # The keyed session state alone supplies the initial value.
    user_input[col] = st.number_input(
        f"{col}",
        min_value=float(X[col].min()),
        max_value=float(X[col].max()),
        key=col,
    )

# Prediction button
if st.button("Predict Water Potability"):
    loading_status.text("Testing model...")

    # Restore the persisted best model and its metadata from disk.
    model = joblib.load(model_filename)
    with open(model_name_filename, "r") as f:
        best_model_name = f.read().strip()
    with open(model_accuracy_filename, "r") as f:
        best_model_accuracy = float(f.read().strip())

    # Scale the user's values exactly as the training features were scaled.
    features = scaler.transform(pd.DataFrame([user_input]))

    # Class 1 means drinkable water in this dataset.
    prediction = model.predict(features)[0]
    label = "Potable" if prediction == 1 else "Not Potable"

    # Show the outcome plus which model produced it, then clear the spinner.
    st.write(f"Predicted Potability: {label}")
    st.write(f"Model Used in Prediction: {best_model_name} (Accuracy: {best_model_accuracy:.2f})")
    loading_status.text("")