# Spaces: JanithDeshan24 / AquaTest (Sleeping) - File size: 13,051 Bytes
# -*- coding: utf-8 -*-
"""correct water quality 01

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1P_fudbhG4Zu0c7jfo1ohnHoQLyG5yjyo
"""

# --- 1. SETUP AND IMPORTS ---
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.metrics import confusion_matrix, classification_report, precision_score
from imblearn.over_sampling import SMOTE
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

print("TensorFlow Version:", tf.__version__)

# --- 2. DATA LOADING ---
# Nothing below can run without the dataset, so a missing file aborts the
# script. SystemExit(1) is raised instead of calling exit(): exit() is a helper
# injected by the `site` module for interactive sessions, and calling it with no
# argument reports exit status 0 (success) even though loading failed.
try:
    data = pd.read_csv('water_potability.csv')
except FileNotFoundError:
    print("Error: 'water_potability.csv' not found.")
    print("Please download the dataset from Kaggle and place it in the same directory.")
    raise SystemExit(1) from None
else:
    # Only reached when read_csv returned without raising.
    print("Dataset loaded successfully.")

# --- 3. TASK 1: PREPROCESSING TECHNIQUES & EDA ---
# Each subsection represents a specific technique with its own EDA.

# --------------------------------------------------------------------------
# Technique 1 (Member 1): Handling Missing Values
# --------------------------------------------------------------------------
print("\n--- EDA for Technique 1: Missing Values ---")
# Share of NaN cells in every column, expressed as a percentage of all rows.
missing_percent = (data.isnull().sum() / len(data)) * 100

# One bar per column, drawn on an explicit Axes; tick labels are tilted so the
# column names do not overlap.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=missing_percent.index, y=missing_percent.values, ax=ax)
ax.set_title('Percentage of Missing Values per Feature', fontsize=16)
ax.set_xlabel('Features')
ax.set_ylabel('Percentage Missing (%)')
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

# Written-up findings for this technique (fixed text, not derived from the plot).
for line in (
    "EDA Conclusion: 'ph', 'Sulfate', and 'Trihalomethanes' have significant missing data.",
    "Preprocessing Step: We will use KNNImputer to fill these, as it's more accurate than a simple mean.",
):
    print(line)

# --------------------------------------------------------------------------
# Technique 2 (Member 2): Handling Class Imbalance
# --------------------------------------------------------------------------
print("\n--- EDA for Technique 2: Class Imbalance ---")
# Bar chart of how many rows carry each value of the 'Potability' column.
fig, ax = plt.subplots(figsize=(7, 5))
sns.countplot(data=data, x='Potability', ax=ax)
ax.set_title('Class Distribution (0 = Not Potable, 1 = Potable)', fontsize=16)
ax.set_xlabel('Potability')
ax.set_ylabel('Count')
plt.show()

# Same information as relative frequencies (normalize=True gives fractions).
print(f"Distribution:\n{data['Potability'].value_counts(normalize=True)}")

# Written-up findings for this technique (fixed text).
for line in (
    "EDA Conclusion: The dataset is imbalanced. There are more 'Not Potable' (0) samples.",
    "Preprocessing Step: We will use SMOTE (Synthetic Minority Over-sampling Technique) on the training data to create a balanced dataset for the model to learn from.",
):
    print(line)

# --------------------------------------------------------------------------
# Technique 3 (Member 3): Exploring Feature Distributions & Outliers
# --------------------------------------------------------------------------
print("\n--- EDA for Technique 3: Feature Distributions (Outliers) ---")
# Reshape to long format: one row per (Potability, Feature, Value) triple, so a
# single seaborn call can draw one box per feature column.
data_melted = data.melt(id_vars=['Potability'], var_name='Feature', value_name='Value')

fig, ax = plt.subplots(figsize=(15, 8))
# showfliers=True keeps the individual outlier points visible.
sns.boxplot(data=data_melted, x='Feature', y='Value', showfliers=True, ax=ax)
ax.set_title('Boxplots for Each Feature (Showing Outliers)', fontsize=16)
plt.setp(ax.get_xticklabels(), rotation=45)
# Logarithmic y axis so features with small and large magnitudes share one plot.
ax.set_yscale('log')
plt.show()

# Written-up findings for this technique (fixed text).
for line in (
    "EDA Conclusion: Features have vastly different scales and ranges (e.g., 'Solids' is in 10,000s, 'pH' is 0-14).",
    "Many features also have significant outliers.",
    "Preprocessing Step: Feature Scaling is mandatory for neural networks.",
):
    print(line)

# --------------------------------------------------------------------------
# Technique 4 (Member 4): Feature Scaling
# --------------------------------------------------------------------------
print("\n--- EDA for Technique 4: Feature Scaling (Before/After) ---")
# Demonstration only: standardize the 'Solids' column on its own and compare the
# density before and after. Rows where 'Solids' is NaN are left out of the demo.
solids_data = data[['Solids']].dropna()
scaler_demo = StandardScaler()
solids_scaled = scaler_demo.fit_transform(solids_data)

# Two panels side by side: raw values on the left, standardized on the right.
fig, (ax_before, ax_after) = plt.subplots(1, 2, figsize=(12, 5))

sns.kdeplot(solids_data['Solids'], fill=True, ax=ax_before)
ax_before.set_title('Before Scaling (Solids)')
ax_before.set_xlabel('TDS (ppm)')

# fit_transform returns a 2-D (n, 1) array; kdeplot wants a 1-D vector.
sns.kdeplot(solids_scaled.ravel(), fill=True, color='green', ax=ax_after)
ax_after.set_title('After Scaling (Solids)')
ax_after.set_xlabel('Standardized Value')

fig.suptitle('Technique 4: Effect of StandardScaler', fontsize=16)
plt.show()

# Written-up findings for this technique (fixed text).
for line in (
    "EDA Conclusion: Scaling centers the data around 0 and squashes it to a standard range.",
    "Preprocessing Step: We will apply StandardScaler to all 9 features after splitting the data.",
):
    print(line)

# --------------------------------------------------------------------------
# Technique 5 (Member 5): Correlation Analysis
# --------------------------------------------------------------------------
print("\n--- EDA for Technique 5: Feature Correlation ---")
# Visualization-only copy of the data with NaNs filled in, so the correlation
# matrix is computed over complete rows. This imputer is separate from the one
# used in the real preprocessing pipeline.
imputer_demo = KNNImputer(n_neighbors=5)
filled_values = imputer_demo.fit_transform(data)
data_imputed_demo = pd.DataFrame(filled_values, columns=data.columns)
corr = data_imputed_demo.corr()

# Annotated heatmap of the pairwise correlations (two decimals per cell).
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5, ax=ax)
ax.set_title('Feature Correlation Heatmap', fontsize=16)
plt.show()

# Written-up findings for this technique (fixed text).
for line in (
    "EDA Conclusion: No features are extremely highly correlated (e.g., > 0.9 or < -0.9).",
    "This suggests that all 9 features provide unique information and should be kept for the model.",
):
    print(line)

# --------------------------------------------------------------------------
# Final Combined Preprocessing Pipeline (The "How-To")
# --------------------------------------------------------------------------
print("\n--- Final Preprocessing Pipeline (Code) ---")
print("Combining all techniques to prepare data for the model...")

# Leakage rule for this pipeline: anything that is *fitted* (imputer, SMOTE,
# scaler) only ever sees training rows, and never sees the target column as an
# input feature. The test rows are only passed through `.transform(...)`.
# Previously the KNNImputer was fitted on the full frame (test rows and the
# 'Potability' column included) before the split, which let test data and
# labels influence the imputed training features.

# 1. Feature / Target Split
print("Step 1: Separating features (X) and target (y)...")
# Rows without a label cannot be used for supervised training or for a
# stratified split, so they are dropped rather than having a label imputed.
labelled = data.dropna(subset=['Potability'])
X = labelled.drop('Potability', axis=1)
y = labelled['Potability'].astype(int)

# 2. Data Splitting (Train/Test)
print("Step 2: Splitting data into training and test sets...")
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"Original training samples: {X_train_raw.shape[0]}, Test samples: {X_test_raw.shape[0]}")

# 3. Impute Missing Values (fit on training features only)
print("Step 3: Imputing missing values with KNNImputer (fitted on training data only)...")
imputer = KNNImputer(n_neighbors=5)
X_train = pd.DataFrame(
    imputer.fit_transform(X_train_raw), columns=X.columns, index=X_train_raw.index
)
# The test rows are filled using neighbours found in the training rows.
X_test = pd.DataFrame(
    imputer.transform(X_test_raw), columns=X.columns, index=X_test_raw.index
)
# NOTE(review): only `scaler` is described as being reused by the app. If the
# app must accept inputs with missing values, `imputer` needs persisting too.

# 4. Handle Class Imbalance (SMOTE) - training rows only; the test set keeps
# its original class distribution.
print("Step 4: Balancing training data with SMOTE...")
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
print(f"Resampled training samples: {X_train_resampled.shape[0]}")

# 5. Feature Scaling
print("Step 5: Applying StandardScaler...")
scaler = StandardScaler()
# Fit the scaler ONLY on the training data
X_train_scaled = scaler.fit_transform(X_train_resampled)
# Apply the same (already fitted) scaler to the test data
X_test_scaled = scaler.transform(X_test)

print("\n✅ Final data pipelines are built and ready for model training.")
print("The 'scaler' object is saved to apply to new user input in the app.")

# --- 4. TASK 2: ALGORITHM SELECTION, IMPLEMENTATION & HYPERPARAMETER TUNING ---

"""
### Task 2.1: Algorithm Selection
For this tabular, binary classification task, we will use a **Deep Neural Network (DNN)**,
also known as a Multi-Layer Perceptron (MLP). This is a powerful and flexible
choice that can learn complex, non-linear relationships between the 9 features.
"""

# --- Task 2.2: Model Implementation ---
def build_model(input_shape, dropout_rate=0.3):
    """Build the (uncompiled) feed-forward binary classifier.

    The network is three ReLU hidden layers of 64, 128 and 64 units, each
    followed by dropout, and a single sigmoid output unit.

    Args:
        input_shape: Number of input features per sample (an int, e.g.
            ``X_train_scaled.shape[1]``).
        dropout_rate: Fraction of units dropped after every hidden layer.
            Defaults to 0.3, the value that was previously hard-coded.

    Returns:
        A ``Sequential`` model; the caller is responsible for ``compile``.
    """
    return Sequential([
        # Declare the input explicitly. Passing `input_shape=` to the first
        # Dense layer is deprecated and triggers a warning in Keras 3.
        tf.keras.Input(shape=(input_shape,)),
        Dense(64, activation='relu'),
        Dropout(dropout_rate),  # Dropout layers to reduce overfitting
        Dense(128, activation='relu'),
        Dropout(dropout_rate),
        Dense(64, activation='relu'),
        Dropout(dropout_rate),
        # Output layer: 1 neuron with sigmoid activation
        # for binary classification (0 or 1)
        Dense(1, activation='sigmoid'),
    ])

# Size the network from the number of columns in the prepared training matrix.
model = build_model(X_train_scaled.shape[1])
model.summary()

"""
### Task 2.3: Hyperparameter Tuning Strategy
* **Optimizer:** Adam (an efficient and popular choice).
* **Loss Function:** `binary_crossentropy` (This is REQUIRED for a two-class, 0/1 problem).
* **Metrics:** We will monitor `accuracy`.
* **Validation Data:** A stratified 20% hold-out of the training data. The test
    set is never shown to the callbacks, so it stays unseen until the final evaluation.
* **Callbacks:**
    * `EarlyStopping`: Stops training when validation accuracy stops improving.
    * `ReduceLROnPlateau`: Lowers the learning rate if validation loss plateaus.
"""

# --- Model Training ---
print("\n--- Model Training ---")

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

callbacks = [
    # restore_best_weights=True means the final weights are *selected* using the
    # validation data, which is why that data must not be the test set.
    EarlyStopping(monitor='val_accuracy', patience=20, verbose=1, restore_best_weights=True),
    ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=10, min_lr=1e-6, verbose=1)
]

# Hold out part of the TRAINING data for validation. Previously the test set was
# passed as validation_data, so early stopping / best-weight restoration tuned
# the model on the very rows used for the final report, biasing those metrics.
# NOTE: this hold-out is taken from the resampled training data, so its scores
# are only meaningful for model selection, not as a performance estimate.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train_scaled,
    y_train_resampled,
    test_size=0.2,
    random_state=42,
    stratify=y_train_resampled,
)

# Train on the RESAMPLED and SCALED data
history = model.fit(
    X_fit,
    y_fit,  # Balanced target
    epochs=200,  # Set high, EarlyStopping will handle it
    validation_data=(X_val, y_val),  # Held-out training rows; test set stays untouched
    callbacks=callbacks,
    batch_size=32
)

# --- 5. TASK 3: EVALUATION METRICS ---
"""
### Task 3.1: Evaluation Metrics
For this problem, **Accuracy is misleading**. We MUST focus on the
**Confusion Matrix** and **Precision for Class 1**.

* **DANGER:** A **False Positive** (model says 'Potable' when it's 'Not Potable')
    is the worst possible error.
* **Our Goal:** Minimize False Positives.
* **Key Metric:** **Precision (Class 1)** tells us: "Of all the times the
    model said 'Potable', what percentage was it correct?"
"""
print("\n--- Final Model Evaluation ---")
# evaluate() yields the loss followed by the compiled metric (accuracy).
test_metrics = model.evaluate(X_test_scaled, y_test)
final_loss, final_accuracy = test_metrics
print(f"\nFinal Test Loss: {final_loss:.4f}")
print(f"Final Test Accuracy: {final_accuracy * 100:.2f}% (Can be misleading!)")

# Sigmoid outputs -> hard 0/1 labels using a 0.5 cut-off.
y_pred_probs = model.predict(X_test_scaled)
y_pred = (y_pred_probs > 0.5).astype(int)

# --- CRITICAL EVALUATION ---
# Rows of the confusion matrix are true classes, columns are predicted classes,
# so row 0 / column 1 counts "actually not potable, predicted potable".
cm = confusion_matrix(y_test, y_pred)
false_positives = cm[0, 1]
precision_class_1 = precision_score(y_test, y_pred, pos_label=1, zero_division=0)

class_names = ['Not Potable (0)', 'Potable (1)']
print("\n--- Detailed Classification Report ---")
print(classification_report(y_test, y_pred, target_names=class_names, zero_division=0))

print("\n--- CRITICAL METRIC ANALYSIS ---")
print(f"Precision (Class 1 - Potable): {precision_class_1 * 100:.2f}%")
print("  > This means when the model says water IS 'Potable', it is correct this % of the time.")
print(f"\nTotal DANGEROUS Predictions (False Positives): {false_positives}")
print(f"  > The model incorrectly labeled {false_positives} unsafe samples as 'safe'.")
print("-----------------------------------")


# Confusion-matrix heatmap; the red palette and red title call out the
# false-positive count.
predicted_labels = ['Predicted Not Potable (0)', 'Predicted Potable (1)']
actual_labels = ['Actual Not Potable (0)', 'Actual Potable (1)']

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Reds',
    xticklabels=predicted_labels,
    yticklabels=actual_labels,
    ax=ax,
)
ax.set_title(f'Confusion Matrix\n{false_positives} False Positives (DANGEROUS)', fontsize=14, color='red')
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
plt.show()

# --- 6. TASK 4: ETHICAL AND BIAS ANALYSIS ---
"""
### Task 4.1: Ethical and Bias Analysis

* **CRITICAL RISK: False Positives.**
    As shown in the evaluation, a False Positive (predicting 'Potable' when
    water is 'Not Potable') is a severe health risk. The model's Precision
    for the 'Potable' class must be as high as possible.

* **Dataset Bias:**
    The dataset's origin is not specified. It may represent water from a
    specific region or type of source (e.g., municipal vs. well). The
    model may not generalize well to water with different chemical profiles
    from other parts of the world.

* **Conclusion & Disclaimer:**
    This application **MUST** be deployed with a very strong
    disclaimer. It should be labeled: "For educational and
    informational purposes ONLY. This is NOT a substitute
    for a professional, laboratory-based water quality test."
    The developer has a responsibility to make this clear to all users.
"""

# --- 7. SAVE THE FINAL MODEL AND SCALER ---
# Two artifacts are written to the working directory:
#   * the trained Keras model      -> water_quality_model.h5
#   * the fitted StandardScaler    -> scaler.joblib
model.save('water_quality_model.h5')
joblib.dump(scaler, 'scaler.joblib')

# Confirmation messages, printed in the same order as before.
for line in (
    "\n✅ Final model saved as 'water_quality_model.h5'",
    "✅ Scaler saved as 'scaler.joblib'",
    "\nProject setup complete. You are ready to build the Flask app.",
):
    print(line)