# AquaTest / code / correct_water_qulity_01.py
# Uploaded by JanithDeshan24 — "feat: Initial project setup" (commit 3815023, verified)
# -*- coding: utf-8 -*-
"""Correct water quality 01
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1P_fudbhG4Zu0c7jfo1ohnHoQLyG5yjyo
"""
# --- 1. SETUP AND IMPORTS ---
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.metrics import confusion_matrix, classification_report, precision_score
from imblearn.over_sampling import SMOTE
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
# Confirm which TF runtime is active before any modelling work begins.
print(f"TensorFlow Version: {tf.__version__}")
# --- 2. DATA LOADING ---
# Load the Kaggle water-potability dataset; the whole script depends on `data`.
try:
    data = pd.read_csv('water_potability.csv')
    print("Dataset loaded successfully.")
except FileNotFoundError:
    print("Error: 'water_potability.csv' not found.")
    print("Please download the dataset from Kaggle and place it in the same directory.")
    # `exit()` is a site-module convenience meant for the interactive prompt and
    # may not exist under `python -S` or some frozen environments; raising
    # SystemExit is the reliable way to abort a script (non-zero = failure).
    raise SystemExit(1)
# --- 3. TASK 1: PREPROCESSING TECHNIQUES & EDA ---
# Each subsection represents a specific technique with its own EDA.
# --------------------------------------------------------------------------
# Technique 1 (Member 1): Handling Missing Values
# --------------------------------------------------------------------------
print("\n--- EDA for Technique 1: Missing Values ---")
# Fraction of NaNs per column, expressed as a percentage.
pct_missing = data.isnull().mean() * 100
plt.figure(figsize=(10, 6))
sns.barplot(x=pct_missing.index, y=pct_missing.values)
plt.title('Percentage of Missing Values per Feature', fontsize=16)
plt.ylabel('Percentage Missing (%)')
plt.xlabel('Features')
plt.xticks(rotation=45)
plt.show()
print("EDA Conclusion: 'ph', 'Sulfate', and 'Trihalomethanes' have significant missing data.")
print("Preprocessing Step: We will use KNNImputer to fill these, as it's more accurate than a simple mean.")
# --------------------------------------------------------------------------
# Technique 2 (Member 2): Handling Class Imbalance
# --------------------------------------------------------------------------
print("\n--- EDA for Technique 2: Class Imbalance ---")
# Visualize how many potable vs. non-potable samples exist.
plt.figure(figsize=(7, 5))
sns.countplot(x='Potability', data=data)
plt.title('Class Distribution (0 = Not Potable, 1 = Potable)', fontsize=16)
plt.xlabel('Potability')
plt.ylabel('Count')
plt.show()
class_ratios = data['Potability'].value_counts(normalize=True)
print(f"Distribution:\n{class_ratios}")
print("EDA Conclusion: The dataset is imbalanced. There are more 'Not Potable' (0) samples.")
print("Preprocessing Step: We will use SMOTE (Synthetic Minority Over-sampling Technique) on the training data to create a balanced dataset for the model to learn from.")
# --------------------------------------------------------------------------
# Technique 3 (Member 3): Exploring Feature Distributions & Outliers
# --------------------------------------------------------------------------
print("\n--- EDA for Technique 3: Feature Distributions (Outliers) ---")
# Long-format frame: one (Feature, Value) row per measurement, for faceted plotting.
long_form = data.melt(id_vars=['Potability'], var_name='Feature', value_name='Value')
plt.figure(figsize=(15, 8))
# showfliers=True keeps the outlier points visible on each box.
sns.boxplot(x='Feature', y='Value', data=long_form, showfliers=True)
plt.title('Boxplots for Each Feature (Showing Outliers)', fontsize=16)
plt.xticks(rotation=45)
# Log scale lets wildly different feature ranges share one axis.
plt.yscale('log')
plt.show()
print("EDA Conclusion: Features have vastly different scales and ranges (e.g., 'Solids' is in 10,000s, 'pH' is 0-14).")
print("Many features also have significant outliers.")
print("Preprocessing Step: Feature Scaling is mandatory for neural networks.")
# --------------------------------------------------------------------------
# Technique 4 (Member 4): Feature Scaling
# --------------------------------------------------------------------------
print("\n--- EDA for Technique 4: Feature Scaling (Before/After) ---")
# Demonstrate standardization on 'Solids' (a high-magnitude feature).
# Only non-null values are used for this particular visualization.
demo_scaler = StandardScaler()
solids = data[['Solids']].dropna()
solids_std = demo_scaler.fit_transform(solids)
fig, (ax_before, ax_after) = plt.subplots(1, 2, figsize=(12, 5))
sns.kdeplot(solids['Solids'], fill=True, ax=ax_before)
ax_before.set_title('Before Scaling (Solids)')
ax_before.set_xlabel('TDS (ppm)')
sns.kdeplot(solids_std.ravel(), fill=True, color='green', ax=ax_after)
ax_after.set_title('After Scaling (Solids)')
ax_after.set_xlabel('Standardized Value')
fig.suptitle('Technique 4: Effect of StandardScaler', fontsize=16)
plt.show()
print("EDA Conclusion: Scaling centers the data around 0 and squashes it to a standard range.")
print("Preprocessing Step: We will apply StandardScaler to all 9 features after splitting the data.")
# --------------------------------------------------------------------------
# Technique 5 (Member 5): Correlation Analysis
# --------------------------------------------------------------------------
print("\n--- EDA for Technique 5: Feature Correlation ---")
# Impute a throwaway copy first — NaNs would leave holes in the heatmap.
demo_imputer = KNNImputer(n_neighbors=5)
filled = pd.DataFrame(demo_imputer.fit_transform(data), columns=data.columns)
plt.figure(figsize=(12, 10))
sns.heatmap(filled.corr(), annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Feature Correlation Heatmap', fontsize=16)
plt.show()
print("EDA Conclusion: No features are extremely highly correlated (e.g., > 0.9 or < -0.9).")
print("This suggests that all 9 features provide unique information and should be kept for the model.")
# --------------------------------------------------------------------------
# Final Combined Preprocessing Pipeline (The "How-To")
# --------------------------------------------------------------------------
print("\n--- Final Preprocessing Pipeline (Code) ---")
print("Combining all techniques to prepare data for the model...")

# 1. Impute Missing Values
print("Step 1: Imputing missing values with KNNImputer...")
imputer = KNNImputer(n_neighbors=5)
data_imputed = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)

# 2. Feature / Target Split
print("Step 2: Separating features (X) and target (y)...")
X = data_imputed.drop('Potability', axis=1)
y = data_imputed['Potability']

# 3. Data Splitting (Train/Test) — stratify keeps the true class ratio in the test set.
print("Step 3: Splitting data into training and test sets...")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)
print(f"Original training samples: {X_train.shape[0]}, Test samples: {X_test.shape[0]}")

# 4. Feature Scaling — deliberately BEFORE SMOTE (fix of the original order):
#    SMOTE interpolates between Euclidean k-nearest neighbours, so on raw data
#    the huge-range 'Solids' feature would dominate neighbour selection.
#    Fitting the scaler on the real (pre-resampling) training rows also keeps
#    the saved scaler's statistics free of synthetic samples, and fitting only
#    on training data avoids test-set leakage.
print("Step 4: Applying StandardScaler (fit on training data only)...")
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 5. Handle Class Imbalance (SMOTE) on the scaled training data.
print("Step 5: Balancing training data with SMOTE...")
smote = SMOTE(random_state=42)
X_train_scaled, y_train_resampled = smote.fit_resample(X_train_std, y_train)
# Alias kept so any external code that referenced the old name still works.
X_train_resampled = X_train_scaled
print(f"Resampled training samples: {X_train_scaled.shape[0]}")

print("\n✅ Final data pipelines are built and ready for model training.")
# Fixed message: the scaler is persisted later (Section 7), not here.
print("The 'scaler' object will be saved later to apply to new user input in the app.")
# --- 4. TASK 2: ALGORITHM SELECTION, IMPLEMENTATION & HYPERPARAMETER TUNING ---
"""
### Task 2.1: Algorithm Selection
For this tabular, binary classification task, we will use a **Deep Neural Network (DNN)**,
also known as a Multi-Layer Perceptron (MLP). This is a powerful and flexible
choice that can learn complex, non-linear relationships between the 9 features.
"""
# --- Task 2.2: Model Implementation ---
def build_model(input_shape):
    """Build an MLP binary classifier for the tabular water-quality features.

    Args:
        input_shape: Number of input features (int), e.g. 9 for this dataset.

    Returns:
        An uncompiled ``tf.keras`` Sequential model whose final layer is a
        single sigmoid unit (probability of class 1, 'Potable').
    """
    model = Sequential([
        # Explicit Input layer: modern Keras deprecates passing `input_shape=`
        # to the first Dense layer; this builds the identical architecture
        # without the deprecation warning.
        tf.keras.Input(shape=(input_shape,)),
        Dense(64, activation='relu'),
        Dropout(0.3),  # dropout between layers to curb overfitting
        Dense(128, activation='relu'),
        Dropout(0.3),
        Dense(64, activation='relu'),
        Dropout(0.3),
        # One sigmoid neuron for binary classification (0 or 1).
        Dense(1, activation='sigmoid'),
    ])
    return model
# Instantiate the network with one input per feature and show its layout.
model = build_model(X_train_scaled.shape[1])
model.summary()
"""
### Task 2.3: Hyperparameter Tuning Strategy
* **Optimizer:** Adam (an efficient and popular choice).
* **Loss Function:** `binary_crossentropy` (This is REQUIRED for a two-class, 0/1 problem).
* **Metrics:** We will monitor `accuracy`.
* **Callbacks:**
* `EarlyStopping`: Stops training when validation accuracy stops improving.
* `ReduceLROnPlateau`: Lowers the learning rate if training plateaus.
"""
# --- Model Training ---
print("\n--- Model Training ---")
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
# Stop once validation accuracy plateaus; shrink the LR when val loss stalls.
stop_early = EarlyStopping(monitor='val_accuracy', patience=20, verbose=1,
                           restore_best_weights=True)
shrink_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=10,
                              min_lr=1e-6, verbose=1)
# Fit on the SMOTE-balanced, scaled training data; validate against the
# untouched (imbalanced) test split.
history = model.fit(
    X_train_scaled,
    y_train_resampled,
    validation_data=(X_test_scaled, y_test),
    epochs=200,  # upper bound — EarlyStopping decides the real stopping point
    batch_size=32,
    callbacks=[stop_early, shrink_lr],
)
# --- 5. TASK 3: EVALUATION METRICS ---
"""
### Task 3.1: Evaluation Metrics
For this problem, **Accuracy is misleading**. We MUST focus on the
**Confusion Matrix** and **Precision for Class 1**.
* **DANGER:** A **False Positive** (model says 'Potable' when it's 'Not Potable')
is the worst possible error.
* **Our Goal:** Minimize False Positives.
* **Key Metric:** **Precision (Class 1)** tells us: "Of all the times the
model said 'Potable', what percentage was it correct?"
"""
print("\n--- Final Model Evaluation ---")
test_loss, test_acc = model.evaluate(X_test_scaled, y_test)
print(f"\nFinal Test Loss: {test_loss:.4f}")
print(f"Final Test Accuracy: {test_acc * 100:.2f}% (Can be misleading!)")
# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 labels.
y_pred = (model.predict(X_test_scaled) > 0.5).astype(int)
# --- CRITICAL EVALUATION ---
cm = confusion_matrix(y_test, y_pred)
potable_precision = precision_score(y_test, y_pred, pos_label=1, zero_division=0)
# Row 0 = actual 'Not Potable'; column 1 = predicted 'Potable' → dangerous calls.
false_positives = cm[0, 1]
print("\n--- Detailed Classification Report ---")
print(classification_report(y_test, y_pred,
                            target_names=['Not Potable (0)', 'Potable (1)'],
                            zero_division=0))
print("\n--- CRITICAL METRIC ANALYSIS ---")
print(f"Precision (Class 1 - Potable): {potable_precision * 100:.2f}%")
print(" > This means when the model says water IS 'Potable', it is correct this % of the time.")
print(f"\nTotal DANGEROUS Predictions (False Positives): {false_positives}")
print(f" > The model incorrectly labeled {false_positives} unsafe samples as 'safe'.")
print("-----------------------------------")
plt.figure(figsize=(8, 6))
# 'Reds' colormap deliberately chosen to highlight the dangerous cell.
sns.heatmap(cm, annot=True, fmt='d', cmap='Reds',
            xticklabels=['Predicted Not Potable (0)', 'Predicted Potable (1)'],
            yticklabels=['Actual Not Potable (0)', 'Actual Potable (1)'])
plt.title(f'Confusion Matrix\n{false_positives} False Positives (DANGEROUS)',
          fontsize=14, color='red')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()
# --- 6. TASK 4: ETHICAL AND BIAS ANALYSIS ---
"""
### Task 4.1: Ethical and Bias Analysis
* **CRITICAL RISK: False Positives.**
As shown in the evaluation, a False Positive (predicting 'Potable' when
water is 'Not Potable') is a severe health risk. The model's Precision
for the 'Potable' class must be as high as possible.
* **Dataset Bias:**
The dataset's origin is not specified. It may represent water from a
specific region or type of source (e.g., municipal vs. well). The
model may not generalize well to water with different chemical profiles
from other parts of the world.
* **Conclusion & Disclaimer:**
This application **MUST** be deployed with a very strong
disclaimer. It should be labeled: "For educational and
informational purposes ONLY. This is NOT a substitute
for a professional, laboratory-based water quality test."
The developer has a responsibility to make this clear to all users.
"""
# --- 7. SAVE THE FINAL MODEL AND SCALER ---
# The Flask app needs two artifacts:
#   1. the fitted StandardScaler (joblib) to transform user input, and
#   2. the trained Keras network (.h5).
joblib.dump(scaler, 'scaler.joblib')
model.save('water_quality_model.h5')
print("\n✅ Final model saved as 'water_quality_model.h5'")
print("✅ Scaler saved as 'scaler.joblib'")
print("\nProject setup complete. You are ready to build the Flask app.")