# AquaTest / code / correct_water_qulity_01.py
# Uploaded by JanithDeshan24 — "feat: Initial project setup" (commit 3815023, verified)
# -*- coding: utf-8 -*-
"""Correct water quality 01
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1P_fudbhG4Zu0c7jfo1ohnHoQLyG5yjyo
"""
# --- 1. SETUP AND IMPORTS ---
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.metrics import confusion_matrix, classification_report, precision_score
from imblearn.over_sampling import SMOTE
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
# Confirm which TF runtime is active before any modelling work begins.
print(f"TensorFlow Version: {tf.__version__}")
# --- 2. DATA LOADING ---
# Load the Kaggle water-potability dataset; the whole script depends on `data`.
try:
    data = pd.read_csv('water_potability.csv')
    print("Dataset loaded successfully.")
except FileNotFoundError:
    print("Error: 'water_potability.csv' not found.")
    print("Please download the dataset from Kaggle and place it in the same directory.")
    # `exit()` is a site-module convenience meant for the interactive prompt and
    # may not exist under `python -S` or some frozen environments; raising
    # SystemExit is the reliable way to abort a script (non-zero = failure).
    raise SystemExit(1)
# --- 3. TASK 1: PREPROCESSING TECHNIQUES & EDA ---
# Each subsection represents a specific technique with its own EDA.
# --------------------------------------------------------------------------
# Technique 1 (Member 1): Handling Missing Values
# --------------------------------------------------------------------------
print("\n--- EDA for Technique 1: Missing Values ---")
# Fraction of NaNs per column, expressed as a percentage.
pct_missing = data.isnull().mean() * 100
plt.figure(figsize=(10, 6))
sns.barplot(x=pct_missing.index, y=pct_missing.values)
plt.title('Percentage of Missing Values per Feature', fontsize=16)
plt.ylabel('Percentage Missing (%)')
plt.xlabel('Features')
plt.xticks(rotation=45)
plt.show()
print("EDA Conclusion: 'ph', 'Sulfate', and 'Trihalomethanes' have significant missing data.")
print("Preprocessing Step: We will use KNNImputer to fill these, as it's more accurate than a simple mean.")
# --------------------------------------------------------------------------
# Technique 2 (Member 2): Handling Class Imbalance
# --------------------------------------------------------------------------
print("\n--- EDA for Technique 2: Class Imbalance ---")
# Visualize how many potable vs. non-potable samples exist.
plt.figure(figsize=(7, 5))
sns.countplot(x='Potability', data=data)
plt.title('Class Distribution (0 = Not Potable, 1 = Potable)', fontsize=16)
plt.xlabel('Potability')
plt.ylabel('Count')
plt.show()
class_ratios = data['Potability'].value_counts(normalize=True)
print(f"Distribution:\n{class_ratios}")
print("EDA Conclusion: The dataset is imbalanced. There are more 'Not Potable' (0) samples.")
print("Preprocessing Step: We will use SMOTE (Synthetic Minority Over-sampling Technique) on the training data to create a balanced dataset for the model to learn from.")
# --------------------------------------------------------------------------
# Technique 3 (Member 3): Exploring Feature Distributions & Outliers
# --------------------------------------------------------------------------
print("\n--- EDA for Technique 3: Feature Distributions (Outliers) ---")
# Long-format frame: one (Feature, Value) row per measurement, for faceted plotting.
long_form = data.melt(id_vars=['Potability'], var_name='Feature', value_name='Value')
plt.figure(figsize=(15, 8))
# showfliers=True keeps the outlier points visible on each box.
sns.boxplot(x='Feature', y='Value', data=long_form, showfliers=True)
plt.title('Boxplots for Each Feature (Showing Outliers)', fontsize=16)
plt.xticks(rotation=45)
# Log scale lets wildly different feature ranges share one axis.
plt.yscale('log')
plt.show()
print("EDA Conclusion: Features have vastly different scales and ranges (e.g., 'Solids' is in 10,000s, 'pH' is 0-14).")
print("Many features also have significant outliers.")
print("Preprocessing Step: Feature Scaling is mandatory for neural networks.")
# --------------------------------------------------------------------------
# Technique 4 (Member 4): Feature Scaling
# --------------------------------------------------------------------------
print("\n--- EDA for Technique 4: Feature Scaling (Before/After) ---")
# Demonstrate standardization on 'Solids' (a high-magnitude feature).
# Only non-null values are used for this particular visualization.
demo_scaler = StandardScaler()
solids = data[['Solids']].dropna()
solids_std = demo_scaler.fit_transform(solids)
fig, (ax_before, ax_after) = plt.subplots(1, 2, figsize=(12, 5))
sns.kdeplot(solids['Solids'], fill=True, ax=ax_before)
ax_before.set_title('Before Scaling (Solids)')
ax_before.set_xlabel('TDS (ppm)')
sns.kdeplot(solids_std.ravel(), fill=True, color='green', ax=ax_after)
ax_after.set_title('After Scaling (Solids)')
ax_after.set_xlabel('Standardized Value')
fig.suptitle('Technique 4: Effect of StandardScaler', fontsize=16)
plt.show()
print("EDA Conclusion: Scaling centers the data around 0 and squashes it to a standard range.")
print("Preprocessing Step: We will apply StandardScaler to all 9 features after splitting the data.")
# --------------------------------------------------------------------------
# Technique 5 (Member 5): Correlation Analysis
# --------------------------------------------------------------------------
print("\n--- EDA for Technique 5: Feature Correlation ---")
# Impute a throwaway copy first — NaNs would leave holes in the heatmap.
demo_imputer = KNNImputer(n_neighbors=5)
filled = pd.DataFrame(demo_imputer.fit_transform(data), columns=data.columns)
plt.figure(figsize=(12, 10))
sns.heatmap(filled.corr(), annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Feature Correlation Heatmap', fontsize=16)
plt.show()
print("EDA Conclusion: No features are extremely highly correlated (e.g., > 0.9 or < -0.9).")
print("This suggests that all 9 features provide unique information and should be kept for the model.")
# --------------------------------------------------------------------------
# Final Combined Preprocessing Pipeline (The "How-To")
# --------------------------------------------------------------------------
print("\n--- Final Preprocessing Pipeline (Code) ---")
print("Combining all techniques to prepare data for the model...")

# 1. Impute Missing Values
print("Step 1: Imputing missing values with KNNImputer...")
imputer = KNNImputer(n_neighbors=5)
data_imputed = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)

# 2. Feature / Target Split
print("Step 2: Separating features (X) and target (y)...")
X = data_imputed.drop('Potability', axis=1)
y = data_imputed['Potability']

# 3. Data Splitting (Train/Test) — stratify keeps the true class ratio in the test set.
print("Step 3: Splitting data into training and test sets...")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)
print(f"Original training samples: {X_train.shape[0]}, Test samples: {X_test.shape[0]}")

# 4. Feature Scaling — deliberately BEFORE SMOTE (fix of the original order):
#    SMOTE interpolates between Euclidean k-nearest neighbours, so on raw data
#    the huge-range 'Solids' feature would dominate neighbour selection.
#    Fitting the scaler on the real (pre-resampling) training rows also keeps
#    the saved scaler's statistics free of synthetic samples, and fitting only
#    on training data avoids test-set leakage.
print("Step 4: Applying StandardScaler (fit on training data only)...")
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 5. Handle Class Imbalance (SMOTE) on the scaled training data.
print("Step 5: Balancing training data with SMOTE...")
smote = SMOTE(random_state=42)
X_train_scaled, y_train_resampled = smote.fit_resample(X_train_std, y_train)
# Alias kept so any external code that referenced the old name still works.
X_train_resampled = X_train_scaled
print(f"Resampled training samples: {X_train_scaled.shape[0]}")

print("\n✅ Final data pipelines are built and ready for model training.")
# Fixed message: the scaler is persisted later (Section 7), not here.
print("The 'scaler' object will be saved later to apply to new user input in the app.")
# --- 4. TASK 2: ALGORITHM SELECTION, IMPLEMENTATION & HYPERPARAMETER TUNING ---
"""
### Task 2.1: Algorithm Selection
For this tabular, binary classification task, we will use a **Deep Neural Network (DNN)**,
also known as a Multi-Layer Perceptron (MLP). This is a powerful and flexible
choice that can learn complex, non-linear relationships between the 9 features.
"""
# --- Task 2.2: Model Implementation ---
def build_model(input_shape):
    """Build an MLP binary classifier for the tabular water-quality features.

    Args:
        input_shape: Number of input features (int), e.g. 9 for this dataset.

    Returns:
        An uncompiled ``tf.keras`` Sequential model whose final layer is a
        single sigmoid unit (probability of class 1, 'Potable').
    """
    model = Sequential([
        # Explicit Input layer: modern Keras deprecates passing `input_shape=`
        # to the first Dense layer; this builds the identical architecture
        # without the deprecation warning.
        tf.keras.Input(shape=(input_shape,)),
        Dense(64, activation='relu'),
        Dropout(0.3),  # dropout between layers to curb overfitting
        Dense(128, activation='relu'),
        Dropout(0.3),
        Dense(64, activation='relu'),
        Dropout(0.3),
        # One sigmoid neuron for binary classification (0 or 1).
        Dense(1, activation='sigmoid'),
    ])
    return model
# Instantiate the network with one input per feature and show its layout.
model = build_model(X_train_scaled.shape[1])
model.summary()
"""
### Task 2.3: Hyperparameter Tuning Strategy
* **Optimizer:** Adam (an efficient and popular choice).
* **Loss Function:** `binary_crossentropy` (This is REQUIRED for a two-class, 0/1 problem).
* **Metrics:** We will monitor `accuracy`.
* **Callbacks:**
* `EarlyStopping`: Stops training when validation accuracy stops improving.
* `ReduceLROnPlateau`: Lowers the learning rate if training plateaus.
"""
# --- Model Training ---
print("\n--- Model Training ---")
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
# Stop once validation accuracy plateaus; shrink the LR when val loss stalls.
stop_early = EarlyStopping(monitor='val_accuracy', patience=20, verbose=1,
                           restore_best_weights=True)
shrink_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=10,
                              min_lr=1e-6, verbose=1)
# Fit on the SMOTE-balanced, scaled training data; validate against the
# untouched (imbalanced) test split.
history = model.fit(
    X_train_scaled,
    y_train_resampled,
    validation_data=(X_test_scaled, y_test),
    epochs=200,  # upper bound — EarlyStopping decides the real stopping point
    batch_size=32,
    callbacks=[stop_early, shrink_lr],
)
# --- 5. TASK 3: EVALUATION METRICS ---
"""
### Task 3.1: Evaluation Metrics
For this problem, **Accuracy is misleading**. We MUST focus on the
**Confusion Matrix** and **Precision for Class 1**.
* **DANGER:** A **False Positive** (model says 'Potable' when it's 'Not Potable')
is the worst possible error.
* **Our Goal:** Minimize False Positives.
* **Key Metric:** **Precision (Class 1)** tells us: "Of all the times the
model said 'Potable', what percentage was it correct?"
"""
print("\n--- Final Model Evaluation ---")
test_loss, test_acc = model.evaluate(X_test_scaled, y_test)
print(f"\nFinal Test Loss: {test_loss:.4f}")
print(f"Final Test Accuracy: {test_acc * 100:.2f}% (Can be misleading!)")
# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 labels.
y_pred = (model.predict(X_test_scaled) > 0.5).astype(int)
# --- CRITICAL EVALUATION ---
cm = confusion_matrix(y_test, y_pred)
potable_precision = precision_score(y_test, y_pred, pos_label=1, zero_division=0)
# Row 0 = actual 'Not Potable'; column 1 = predicted 'Potable' → dangerous calls.
false_positives = cm[0, 1]
print("\n--- Detailed Classification Report ---")
print(classification_report(y_test, y_pred,
                            target_names=['Not Potable (0)', 'Potable (1)'],
                            zero_division=0))
print("\n--- CRITICAL METRIC ANALYSIS ---")
print(f"Precision (Class 1 - Potable): {potable_precision * 100:.2f}%")
print(" > This means when the model says water IS 'Potable', it is correct this % of the time.")
print(f"\nTotal DANGEROUS Predictions (False Positives): {false_positives}")
print(f" > The model incorrectly labeled {false_positives} unsafe samples as 'safe'.")
print("-----------------------------------")
plt.figure(figsize=(8, 6))
# 'Reds' colormap deliberately chosen to highlight the dangerous cell.
sns.heatmap(cm, annot=True, fmt='d', cmap='Reds',
            xticklabels=['Predicted Not Potable (0)', 'Predicted Potable (1)'],
            yticklabels=['Actual Not Potable (0)', 'Actual Potable (1)'])
plt.title(f'Confusion Matrix\n{false_positives} False Positives (DANGEROUS)',
          fontsize=14, color='red')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()
# --- 6. TASK 4: ETHICAL AND BIAS ANALYSIS ---
"""
### Task 4.1: Ethical and Bias Analysis
* **CRITICAL RISK: False Positives.**
As shown in the evaluation, a False Positive (predicting 'Potable' when
water is 'Not Potable') is a severe health risk. The model's Precision
for the 'Potable' class must be as high as possible.
* **Dataset Bias:**
The dataset's origin is not specified. It may represent water from a
specific region or type of source (e.g., municipal vs. well). The
model may not generalize well to water with different chemical profiles
from other parts of the world.
* **Conclusion & Disclaimer:**
This application **MUST** be deployed with a very strong
disclaimer. It should be labeled: "For educational and
informational purposes ONLY. This is NOT a substitute
for a professional, laboratory-based water quality test."
The developer has a responsibility to make this clear to all users.
"""
# --- 7. SAVE THE FINAL MODEL AND SCALER ---
# The Flask app needs two artifacts:
#   1. the fitted StandardScaler (joblib) to transform user input, and
#   2. the trained Keras network (.h5).
joblib.dump(scaler, 'scaler.joblib')
model.save('water_quality_model.h5')
print("\n✅ Final model saved as 'water_quality_model.h5'")
print("✅ Scaler saved as 'scaler.joblib'")
print("\nProject setup complete. You are ready to build the Flask app.")