|
|
import streamlit as st |
|
|
import pandas as pd |
|
|
import matplotlib.pyplot as plt |
|
|
import seaborn as sns |
|
|
import os |
|
|
from sklearn.model_selection import train_test_split, cross_val_score |
|
|
from sklearn.preprocessing import StandardScaler |
|
|
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier |
|
|
from sklearn.linear_model import LogisticRegression |
|
|
from sklearn.svm import SVC |
|
|
from sklearn.tree import DecisionTreeClassifier |
|
|
from sklearn.neighbors import KNeighborsClassifier |
|
|
from sklearn.naive_bayes import GaussianNB |
|
|
from sklearn.metrics import accuracy_score |
|
|
from imblearn.over_sampling import SMOTE |
|
|
import joblib |
|
|
import time |
|
|
from datasets import load_dataset |
|
|
|
|
|
|
|
|
@st.cache_data
def load_data():
    """Fetch the water-potability dataset from the Hub and return its train split as a DataFrame."""
    dataset = load_dataset("kheejay88/water_potability")
    return dataset["train"].to_pandas()
|
|
|
|
|
# Materialise the (cached) dataset for the rest of the script.
df = load_data()

st.title("Water Potability Prediction(Supervised)")

# Long-form description of what this demo is (and is not) meant to show.
intro_text = "This is a supervised machine learning application to predict water potability based on various variables. Note that the accuracy level of the models may not be ideal for practical usage. The essence is to demonstrate the performance comparison of different machine learning models on a particular dataset. To achieve better accuracy, further data preprocessing, feature engineering, hyperparameter tuning, etc., need to be performed."
st.write(intro_text)

st.subheader("Dataset Overview")
st.write("Original Dataset:")
st.write(df.head())
|
|
|
|
|
# Impute missing values with each column's median. Reassignment is used
# instead of `inplace=True` (discouraged pandas idiom), and
# `numeric_only=True` makes the median computation explicit/robust.
# NOTE(review): the medians are computed on the FULL dataset before the
# train/test split, so a little test-set information leaks into the
# imputation — fine for a demo, but fit the imputer on the training
# split only for a rigorous evaluation.
df = df.fillna(df.median(numeric_only=True))

st.write("Dataset after handling missing values:")
st.write(df.head())
|
|
|
|
|
|
|
|
st.subheader("Data Visualization")

# Annotated correlation heatmap across all columns, rendered into the app.
correlations = df.corr()
heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 5))
sns.heatmap(correlations, annot=True, cmap='coolwarm', ax=heatmap_ax)
st.pyplot(heatmap_fig)
|
|
|
|
|
|
|
|
# Separate features (X) from the binary target (y).
X = df.drop("Potability", axis=1)
y = df["Potability"]

# Split BEFORE oversampling. The original code ran SMOTE on the whole
# dataset and then split, which leaks information: synthetic training
# samples are interpolated from test rows (and test rows themselves may
# be synthetic), inflating every evaluation score. `stratify=y`
# preserves the class ratio in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Balance the classes in the training split only; random_state pinned
# so the resampled set is reproducible across app reruns.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)
|
|
|
|
|
|
|
|
# Standardise features to zero mean / unit variance. The scaler is fit
# on the training split only and the same statistics are reused for the
# test split (and later for user input at prediction time).
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
|
|
|
|
|
|
|
|
# Candidate classifiers to compare. `random_state` is pinned on every
# stochastic estimator so cross-validation scores — and therefore the
# persisted "best model" choice — are reproducible across reruns
# (previously each rerun could crown a different winner). KNN and
# GaussianNB are deterministic and take no random_state.
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(n_estimators=200, max_depth=20, random_state=42),
    "SVM": SVC(kernel='rbf', C=1, probability=True, random_state=42),
    "Decision Tree": DecisionTreeClassifier(max_depth=10, random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Naive Bayes": GaussianNB(),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42),
    "AdaBoost": AdaBoostClassifier(n_estimators=100, random_state=42),
    "Extra Trees": ExtraTreesClassifier(n_estimators=150, random_state=42)
}
|
|
|
|
|
st.subheader("Model Performance with Cross-Validation")

# Placeholder reused for transient status text, and the accumulator for
# per-model cross-validation accuracies.
loading_status = st.empty()
results = {}

# Artifacts persisted between runs so training can be skipped next time.
model_filename = "best_model.pkl"
model_name_filename = "best_model_name.txt"
model_accuracy_filename = "best_model_accuracy.txt"
all_model_accuracies_filename = "all_model_accuracies.txt"
|
|
|
|
|
# If a best model (plus its name and CV accuracy) was persisted by an
# earlier run, restore the metadata and skip the training phase entirely.
if os.path.exists(model_filename) and os.path.exists(model_name_filename) and os.path.exists(model_accuracy_filename):
    # Read back the winning model's name and accuracy from sidecar files.
    with open(model_name_filename, "r") as f:
        best_model_name = f.read().strip()
    with open(model_accuracy_filename, "r") as f:
        best_model_accuracy = float(f.read().strip())
    st.success(f"Best model ({best_model_name}) already exists. Skipping training.")

    # Re-display the per-model accuracies recorded during the original
    # training run, if that file is still present.
    if os.path.exists(all_model_accuracies_filename):
        st.subheader("Saved Model Accuracies")
        with open(all_model_accuracies_filename, "r") as f:
            saved_accuracies = f.read()
        st.text(saved_accuracies)
else:
    loading_status.text("Training models...")
    time.sleep(1)  # brief pause so the status message is visible
    # Score every candidate with 5-fold cross-validation on the training
    # split, logging each mean accuracy both to the app and to a file.
    with open(all_model_accuracies_filename, "w") as f:
        for name, model in models.items():
            scores = cross_val_score(model, X_train, y_train, cv=5)
            accuracy = scores.mean()
            results[name] = accuracy
            st.write(f"{name}: Accuracy = {accuracy:.2f}")
            f.write(f"{name}: {accuracy:.2f}\n")

    # Pick the model with the highest mean CV accuracy, refit it on the
    # full training split, and persist model + metadata for later runs.
    best_model_name = max(results, key=results.get)
    best_model_accuracy = results[best_model_name]
    best_model = models[best_model_name]
    best_model.fit(X_train, y_train)
    joblib.dump(best_model, model_filename)
    with open(model_name_filename, "w") as f:
        f.write(best_model_name)
    with open(model_accuracy_filename, "w") as f:
        f.write(str(best_model_accuracy))
    st.success(f"Best Model: {best_model_name} trained and saved!")
|
|
|
|
|
|
|
|
st.subheader("Test the Model")

st.write("Tips: Based on the data correlation heatmap ph, hardness, and sulfate has a higher relation to each other. (POTABLE = lower ph || higher hardness || higher sulfate)")

# One number_input per feature, bounded by the column's observed range
# and defaulting to its mean. The previous code pre-seeded
# st.session_state[col] AND passed that value positionally alongside
# key=col, which triggers Streamlit's "value set via both the widget
# default and the Session State API" warning. Passing the mean as the
# keyword `value` and letting `key` own the widget state is equivalent
# (the default only applies on first render; state persists via the key)
# without the warning.
user_input = {}
for col in X.columns:
    user_input[col] = st.number_input(
        f"{col}",
        min_value=float(X[col].min()),
        max_value=float(X[col].max()),
        value=float(X[col].mean()),
        key=col,
    )
|
|
|
|
|
|
|
|
if st.button("Predict Water Potability"):
    loading_status.text("Testing model...")

    # Restore the persisted best model and its metadata from disk.
    trained_model = joblib.load(model_filename)
    with open(model_name_filename, "r") as f:
        best_model_name = f.read().strip()
    with open(model_accuracy_filename, "r") as f:
        best_model_accuracy = float(f.read().strip())

    # Apply the training-set scaler to the user's raw feature values.
    features = pd.DataFrame([user_input])
    features = scaler.transform(features)

    # Class 1 means potable; anything else is reported as not potable.
    outcome = trained_model.predict(features)[0]
    label = "Potable" if outcome == 1 else "Not Potable"

    st.write(f"Predicted Potability: {label}")
    st.write(f"Model Used in Prediction: {best_model_name} (Accuracy: {best_model_accuracy:.2f})")
    loading_status.text("")
|
|
|