"""Water Potability Prediction (Streamlit demo).

Compares several supervised classifiers on the ``water_potability`` dataset,
persists the best cross-validated model (and its scaler) to disk, and lets the
user score hand-entered water samples.
"""
import os
import time

import joblib
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import streamlit as st
from datasets import load_dataset
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import (
    AdaBoostClassifier,
    ExtraTreesClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Artifact file names used to persist training results between app runs.
MODEL_FILENAME = "best_model.pkl"
SCALER_FILENAME = "scaler.pkl"
MODEL_NAME_FILENAME = "best_model_name.txt"
MODEL_ACCURACY_FILENAME = "best_model_accuracy.txt"
ALL_MODEL_ACCURACIES_FILENAME = "all_model_accuracies.txt"


@st.cache_data
def load_data() -> pd.DataFrame:
    """Download the water_potability dataset and return its train split as a DataFrame."""
    return load_dataset("kheejay88/water_potability")["train"].to_pandas()


df = load_data()

# ---------------------------------------------------------------- data cleaning
st.title("Water Potability Prediction(Supervised)")
st.write(
    "This is a supervised machine learning application to predict water "
    "potability based on various variables. Note that the accuracy level of "
    "the models may not be ideal for practical usage. The essence is to "
    "demonstrate the performance comparison of different machine learning "
    "models on a particular dataset. To achieve better accuracy, further data "
    "preprocessing, feature engineering, hyperparameter tuning, etc., need to "
    "be performed."
)
st.subheader("Dataset Overview")
st.write("Original Dataset:")
st.write(df.head())

# Impute missing values with the column median. Reassign instead of
# ``inplace=True``: the frame comes from st.cache_data and should not be
# mutated in place; ``numeric_only`` makes the median computation explicit.
df = df.fillna(df.median(numeric_only=True))
st.write("Dataset after handling missing values:")
st.write(df.head())

# ------------------------------------------------------------- visualization
st.subheader("Data Visualization")
fig, ax = plt.subplots(figsize=(10, 5))
sns.heatmap(df.corr(), annot=True, cmap='coolwarm', ax=ax)
st.pyplot(fig)

# --------------------------------------------------------- features & target
X = df.drop("Potability", axis=1)
y = df["Potability"]

# Split FIRST, then oversample only the training fold. Applying SMOTE to the
# whole dataset before splitting (the previous behavior) leaks synthetic
# neighbors of test samples into the training/CV folds and inflates every
# reported accuracy. ``stratify=y`` keeps the class ratio in the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
X_train, y_train = SMOTE(random_state=42).fit_resample(X_train, y_train)

# Feature scaling: fit on the (resampled) training data only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Candidate models with lightly hand-picked hyperparameters.
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(n_estimators=200, max_depth=20),
    "SVM": SVC(kernel='rbf', C=1, probability=True),
    "Decision Tree": DecisionTreeClassifier(max_depth=10),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Naive Bayes": GaussianNB(),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, learning_rate=0.1),
    "AdaBoost": AdaBoostClassifier(n_estimators=100),
    "Extra Trees": ExtraTreesClassifier(n_estimators=150),
}

# ------------------------------------------------- training / persisted state
st.subheader("Model Performance with Cross-Validation")
results = {}
loading_status = st.empty()

artifacts_exist = (
    os.path.exists(MODEL_FILENAME)
    and os.path.exists(MODEL_NAME_FILENAME)
    and os.path.exists(MODEL_ACCURACY_FILENAME)
)

if artifacts_exist:
    # Reuse the model trained in a previous run instead of retraining.
    with open(MODEL_NAME_FILENAME, "r") as f:
        best_model_name = f.read().strip()
    with open(MODEL_ACCURACY_FILENAME, "r") as f:
        best_model_accuracy = float(f.read().strip())
    st.success(f"Best model ({best_model_name}) already exists. Skipping training.")
    # Display the per-model accuracies recorded when training happened.
    if os.path.exists(ALL_MODEL_ACCURACIES_FILENAME):
        st.subheader("Saved Model Accuracies")
        with open(ALL_MODEL_ACCURACIES_FILENAME, "r") as f:
            saved_accuracies = f.read()
        st.text(saved_accuracies)
else:
    loading_status.text("Training models...")
    time.sleep(1)  # Simulate loading time
    with open(ALL_MODEL_ACCURACIES_FILENAME, "w") as f:
        for name, model in models.items():
            # 5-fold CV on the (leak-free) training data only.
            scores = cross_val_score(model, X_train, y_train, cv=5)
            accuracy = scores.mean()
            results[name] = accuracy
            st.write(f"{name}: Accuracy = {accuracy:.2f}")
            f.write(f"{name}: {accuracy:.2f}\n")

    # Refit the best-scoring model on the full training fold and persist it
    # together with the scaler so predictions after a restart use matching
    # preprocessing.
    best_model_name = max(results, key=results.get)
    best_model_accuracy = results[best_model_name]
    best_model = models[best_model_name]
    best_model.fit(X_train, y_train)
    joblib.dump(best_model, MODEL_FILENAME)
    joblib.dump(scaler, SCALER_FILENAME)
    with open(MODEL_NAME_FILENAME, "w") as f:
        f.write(best_model_name)
    with open(MODEL_ACCURACY_FILENAME, "w") as f:
        f.write(str(best_model_accuracy))
    st.success(f"Best Model: {best_model_name} trained and saved!")

# ------------------------------------------------------ interactive testing
st.subheader("Test the Model")
st.write(
    "Tips: Based on the data correlation heatmap ph, hardness, and sulfate "
    "has a higher relation to each other. "
    "(POTABLE = lower ph || higher hardness || higher sulfate)"
)

user_input = {}
for col in X.columns:
    # ``key=col`` alone makes Streamlit persist the widget value across
    # reruns; also seeding st.session_state AND passing it as ``value`` (the
    # previous behavior) triggers Streamlit's duplicate-state warning.
    user_input[col] = st.number_input(
        f"{col}",
        float(X[col].min()),
        float(X[col].max()),
        float(X[col].mean()),
        key=col,
    )

if st.button("Predict Water Potability"):
    loading_status.text("Testing model...")
    # Load the persisted artifacts; fall back to the in-memory scaler when an
    # older artifact set (without scaler.pkl) is on disk.
    model = joblib.load(MODEL_FILENAME)
    if os.path.exists(SCALER_FILENAME):
        scaler = joblib.load(SCALER_FILENAME)
    with open(MODEL_NAME_FILENAME, "r") as f:
        best_model_name = f.read().strip()
    with open(MODEL_ACCURACY_FILENAME, "r") as f:
        best_model_accuracy = float(f.read().strip())

    # Scale the user sample exactly like the training data, then predict.
    input_df = pd.DataFrame([user_input])
    input_scaled = scaler.transform(input_df)
    prediction = model.predict(input_scaled)[0]
    label = "Potable" if prediction == 1 else "Not Potable"

    st.write(f"Predicted Potability: {label}")
    st.write(
        f"Model Used in Prediction: {best_model_name} "
        f"(Accuracy: {best_model_accuracy:.2f})"
    )
    loading_status.text("")