# -*- coding: utf-8 -*-
"""Water quality (potability) classifier — setup, data loading, and EDA intro.

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1P_fudbhG4Zu0c7jfo1ohnHoQLyG5yjyo
"""

# --- 1. SETUP AND IMPORTS ---
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.metrics import confusion_matrix, classification_report, precision_score
from imblearn.over_sampling import SMOTE
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

print("TensorFlow Version:", tf.__version__)

# --- 2. DATA LOADING ---
try:
    data = pd.read_csv('water_potability.csv')
    print("Dataset loaded successfully.")
except FileNotFoundError:
    print("Error: 'water_potability.csv' not found.")
    print("Please download the dataset from Kaggle and place it in the same directory.")
    # `exit()` is an interactive helper injected by the `site` module and is not
    # guaranteed to exist in every execution context (e.g. `python -S`, frozen
    # apps); raising SystemExit is the portable way to abort a script.
    raise SystemExit(1)

# --- 3. TASK 1: PREPROCESSING TECHNIQUES & EDA ---
# Each subsection represents a specific technique with its own EDA.
# --------------------------------------------------------------------------
# Technique 1 (Member 1): Handling Missing Values
# --------------------------------------------------------------------------
print("\n--- EDA for Technique 1: Missing Values ---")

# Fraction of nulls per column, expressed as a percentage
# (mean of the boolean null mask == null count / row count).
pct_missing = data.isnull().mean() * 100

plt.figure(figsize=(10, 6))
sns.barplot(x=pct_missing.index, y=pct_missing.values)
plt.title('Percentage of Missing Values per Feature', fontsize=16)
plt.ylabel('Percentage Missing (%)')
plt.xlabel('Features')
plt.xticks(rotation=45)
plt.show()

print("EDA Conclusion: 'ph', 'Sulfate', and 'Trihalomethanes' have significant missing data.")
print("Preprocessing Step: We will use KNNImputer to fill these, as it's more accurate than a simple mean.")

# --------------------------------------------------------------------------
# Technique 2 (Member 2): Handling Class Imbalance
# --------------------------------------------------------------------------
print("\n--- EDA for Technique 2: Class Imbalance ---")

plt.figure(figsize=(7, 5))
sns.countplot(data=data, x='Potability')
plt.title('Class Distribution (0 = Not Potable, 1 = Potable)', fontsize=16)
plt.xlabel('Potability')
plt.ylabel('Count')
plt.show()

print(f"Distribution:\n{data['Potability'].value_counts(normalize=True)}")
print("EDA Conclusion: The dataset is imbalanced. There are more 'Not Potable' (0) samples.")
print("Preprocessing Step: We will use SMOTE (Synthetic Minority Over-sampling Technique) on the training data to create a balanced dataset for the model to learn from.")

# --------------------------------------------------------------------------
# Technique 3 (Member 3): Exploring Feature Distributions & Outliers
# --------------------------------------------------------------------------
print("\n--- EDA for Technique 3: Feature Distributions (Outliers) ---")

# Reshape to long form so one boxplot call covers every feature.
long_df = data.melt(id_vars=['Potability'], var_name='Feature', value_name='Value')

plt.figure(figsize=(15, 8))
sns.boxplot(data=long_df, x='Feature', y='Value', showfliers=True)  # showfliers=True to show outliers
plt.title('Boxplots for Each Feature (Showing Outliers)', fontsize=16)
plt.xticks(rotation=45)
plt.yscale('log')  # log scale: feature magnitudes differ by orders of magnitude
plt.show()

print("EDA Conclusion: Features have vastly different scales and ranges (e.g., 'Solids' is in 10,000s, 'pH' is 0-14).")
print("Many features also have significant outliers.")
print("Preprocessing Step: Feature Scaling is mandatory for neural networks.")

# --------------------------------------------------------------------------
# Technique 4 (Member 4): Feature Scaling
# --------------------------------------------------------------------------
print("\n--- EDA for Technique 4: Feature Scaling (Before/After) ---")

# We'll simulate the scaling on 'Solids' (a high-value feature) to visualize the effect.
# Note: We only use non-null values for this specific plot.
# Demonstrate standardization on one high-magnitude feature ('Solids');
# only non-null values are used for this demo plot.
scaler_demo = StandardScaler()
solids_data = data[['Solids']].dropna()
solids_scaled = scaler_demo.fit_transform(solids_data)

plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
sns.kdeplot(solids_data['Solids'], fill=True)
plt.title('Before Scaling (Solids)')
plt.xlabel('TDS (ppm)')
plt.subplot(1, 2, 2)
sns.kdeplot(solids_scaled.flatten(), fill=True, color='green')
plt.title('After Scaling (Solids)')
plt.xlabel('Standardized Value')
plt.suptitle('Technique 4: Effect of StandardScaler', fontsize=16)
plt.show()

print("EDA Conclusion: Scaling centers the data around 0 and squashes it to a standard range.")
print("Preprocessing Step: We will apply StandardScaler to all 9 features after splitting the data.")

# --------------------------------------------------------------------------
# Technique 5 (Member 5): Correlation Analysis
# --------------------------------------------------------------------------
print("\n--- EDA for Technique 5: Feature Correlation ---")

# Use the imputed data just for this visualization (otherwise NaNs mess up the heatmap).
imputer_demo = KNNImputer(n_neighbors=5)
data_imputed_demo = pd.DataFrame(imputer_demo.fit_transform(data), columns=data.columns)
corr = data_imputed_demo.corr()

plt.figure(figsize=(12, 10))
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Feature Correlation Heatmap', fontsize=16)
plt.show()

print("EDA Conclusion: No features are extremely highly correlated (e.g., > 0.9 or < -0.9).")
print("This suggests that all 9 features provide unique information and should be kept for the model.")

# --------------------------------------------------------------------------
# Final Combined Preprocessing Pipeline (The "How-To")
# --------------------------------------------------------------------------
print("\n--- Final Preprocessing Pipeline (Code) ---")
print("Combining all techniques to prepare data for the model...")

# 1. Impute Missing Values
print("Step 1: Imputing missing values with KNNImputer...")
imputer = KNNImputer(n_neighbors=5)
data_imputed = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)

# 2. Feature / Target Split
print("Step 2: Separating features (X) and target (y)...")
X = data_imputed.drop('Potability', axis=1)
y = data_imputed['Potability']

# 3. Data Splitting (Train/Test) — stratify keeps the class ratio in both splits.
print("Step 3: Splitting data into training and test sets...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(f"Original training samples: {X_train.shape[0]}, Test samples: {X_test.shape[0]}")

# 4. Handle Class Imbalance (SMOTE) — applied to TRAINING data only, so no
#    synthetic samples leak into the test set.
print("Step 4: Balancing training data with SMOTE...")
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
print(f"Resampled training samples: {X_train_resampled.shape[0]}")

# 5. Feature Scaling — fit the scaler ONLY on the training data, then apply
#    the same transform to the test data.
print("Step 5: Applying StandardScaler...")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)

print("\n✅ Final data pipelines are built and ready for model training.")
print("The 'scaler' object is saved to apply to new user input in the app.")

# --- 4. TASK 2: ALGORITHM SELECTION, IMPLEMENTATION & HYPERPARAMETER TUNING ---
"""
### Task 2.1: Algorithm Selection
For this tabular, binary classification task, we will use a **Deep Neural Network (DNN)**,
also known as a Multi-Layer Perceptron (MLP). This is a powerful and flexible choice that
can learn complex, non-linear relationships between the 9 features.
"""

# --- Task 2.2: Model Implementation ---
def build_model(input_shape):
    """Build the MLP binary classifier for `input_shape` numeric features.

    Architecture: 64 -> 128 -> 64 ReLU hidden units with 30% dropout after
    each hidden layer, and a single sigmoid output neuron for the 0/1
    potability label. The model is returned uncompiled.
    """
    model = Sequential([
        # Explicit Input layer replaces the deprecated `input_shape=` kwarg
        # on the first Dense layer (that kwarg was removed in Keras 3).
        tf.keras.Input(shape=(input_shape,)),
        Dense(64, activation='relu'),
        Dropout(0.3),  # Dropout layer to prevent overfitting
        Dense(128, activation='relu'),
        Dropout(0.3),
        Dense(64, activation='relu'),
        Dropout(0.3),
        # Output layer: 1 neuron with sigmoid activation
        # for binary classification (0 or 1)
        Dense(1, activation='sigmoid'),
    ])
    return model

model = build_model(X_train_scaled.shape[1])
model.summary()

"""
### Task 2.3: Hyperparameter Tuning Strategy
* **Optimizer:** Adam (an efficient and popular choice).
* **Loss Function:** `binary_crossentropy` (This is REQUIRED for a two-class, 0/1 problem).
* **Metrics:** We will monitor `accuracy`.
* **Callbacks:**
    * `EarlyStopping`: Stops training when validation accuracy stops improving.
    * `ReduceLROnPlateau`: Lowers the learning rate if training plateaus.
"""

# --- Model Training ---
print("\n--- Model Training ---")
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

callbacks = [
    EarlyStopping(monitor='val_accuracy', patience=20, verbose=1, restore_best_weights=True),
    ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=10, min_lr=1e-6, verbose=1)
]

# Train on the RESAMPLED and SCALED data.
# NOTE(review): the test set doubles as the validation set for EarlyStopping /
# ReduceLROnPlateau, which leaks test information into model selection — the
# reported test metrics are therefore optimistic. Consider carving a separate
# validation split out of the training data instead.
history = model.fit(
    X_train_scaled,
    y_train_resampled,  # Use the balanced target
    epochs=200,  # Set high, EarlyStopping will handle it
    validation_data=(X_test_scaled, y_test),  # Validate on the original, unbalanced test set
    callbacks=callbacks,
    batch_size=32
)

# --- 5. TASK 3: EVALUATION METRICS ---
"""
### Task 3.1: Evaluation Metrics
For this problem, **Accuracy is misleading**. We MUST focus on the **Confusion Matrix**
and **Precision for Class 1**.
* **DANGER:** A **False Positive** (model says 'Potable' when it's 'Not Potable') is the worst possible error.
* **Our Goal:** Minimize False Positives.
* **Key Metric:** **Precision (Class 1)** tells us: "Of all the times the model said 'Potable', what percentage was it correct?"
"""

print("\n--- Final Model Evaluation ---")
final_loss, final_accuracy = model.evaluate(X_test_scaled, y_test)
print(f"\nFinal Test Loss: {final_loss:.4f}")
print(f"Final Test Accuracy: {final_accuracy * 100:.2f}% (Can be misleading!)")

# Threshold the sigmoid probabilities at 0.5 to get hard 0/1 labels.
y_pred_probs = model.predict(X_test_scaled)
y_pred = (y_pred_probs > 0.5).astype(int)

# --- CRITICAL EVALUATION ---
cm = confusion_matrix(y_test, y_pred)
precision_class_1 = precision_score(y_test, y_pred, pos_label=1, zero_division=0)
# Row 0 = actual 'Not Potable'; column 1 = predicted 'Potable' → false positives.
false_positives = cm[0, 1]

print("\n--- Detailed Classification Report ---")
print(classification_report(y_test, y_pred, target_names=['Not Potable (0)', 'Potable (1)'], zero_division=0))

print("\n--- CRITICAL METRIC ANALYSIS ---")
print(f"Precision (Class 1 - Potable): {precision_class_1 * 100:.2f}%")
print(" > This means when the model says water IS 'Potable', it is correct this % of the time.")
print(f"\nTotal DANGEROUS Predictions (False Positives): {false_positives}")
print(f" > The model incorrectly labeled {false_positives} unsafe samples as 'safe'.")
print("-----------------------------------")

plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Reds',  # Use 'Reds' to highlight danger
    xticklabels=['Predicted Not Potable (0)', 'Predicted Potable (1)'],
    yticklabels=['Actual Not Potable (0)', 'Actual Potable (1)']
)
plt.title(f'Confusion Matrix\n{false_positives} False Positives (DANGEROUS)', fontsize=14, color='red')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

# --- 6. TASK 4: ETHICAL AND BIAS ANALYSIS ---
"""
### Task 4.1: Ethical and Bias Analysis
* **CRITICAL RISK: False Positives.** As shown in the evaluation, a False Positive
  (predicting 'Potable' when water is 'Not Potable') is a severe health risk. The model's
  Precision for the 'Potable' class must be as high as possible.
* **Dataset Bias:** The dataset's origin is not specified. It may represent water from a
  specific region or type of source (e.g., municipal vs. well). The model may not generalize
  well to water with different chemical profiles from other parts of the world.
* **Conclusion & Disclaimer:** This application **MUST** be deployed with a very strong
  disclaimer. It should be labeled: "For educational and informational purposes ONLY.
  This is NOT a substitute for a professional, laboratory-based water quality test."
  The developer has a responsibility to make this clear to all users.
"""

# --- 7. SAVE THE FINAL MODEL AND SCALER ---
# We must save TWO files:
# 1. The trained Keras model (.h5) — legacy HDF5 format, kept because the
#    downstream Flask app expects this filename/format.
# 2. The StandardScaler object (.joblib) — required to transform new user
#    input exactly as the training data was transformed.
model.save('water_quality_model.h5')
joblib.dump(scaler, 'scaler.joblib')

print("\n✅ Final model saved as 'water_quality_model.h5'")
print("✅ Scaler saved as 'scaler.joblib'")
print("\nProject setup complete. You are ready to build the Flask app.")