Spaces:
Sleeping
Sleeping
| # -*- coding: utf-8 -*- | |
| """correct water quality 01 | |
| Automatically generated by Colab. | |
| Original file is located at | |
| https://colab.research.google.com/drive/1P_fudbhG4Zu0c7jfo1ohnHoQLyG5yjyo | |
| """ | |
| # --- 1. SETUP AND IMPORTS --- | |
| import tensorflow as tf | |
| from tensorflow.keras.models import Sequential, load_model | |
| from tensorflow.keras.layers import Dense, Dropout | |
| from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.preprocessing import StandardScaler | |
| from sklearn.impute import KNNImputer | |
| from sklearn.metrics import confusion_matrix, classification_report, precision_score | |
| from imblearn.over_sampling import SMOTE | |
| import pandas as pd | |
| import numpy as np | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| import joblib | |
print("TensorFlow Version:", tf.__version__)

# --- 2. DATA LOADING ---
# Read the water-potability CSV from the current working directory into
# the module-level `data` frame that every later section works on.
try:
    data = pd.read_csv('water_potability.csv')
except FileNotFoundError:
    print("Error: 'water_potability.csv' not found.")
    print("Please download the dataset from Kaggle and place it in the same directory.")
    # `exit()` is a helper injected by the `site` module for interactive
    # sessions and is not guaranteed to exist; raise SystemExit directly,
    # with a non-zero status so the failure is visible to the caller.
    raise SystemExit(1)
else:
    print("Dataset loaded successfully.")
| # --- 3. TASK 1: PREPROCESSING TECHNIQUES & EDA --- | |
| # Each subsection represents a specific technique with its own EDA. | |
# --------------------------------------------------------------------------
# Technique 1 (Member 1): Handling Missing Values
# --------------------------------------------------------------------------
print("\n--- EDA for Technique 1: Missing Values ---")

# Per-column share of null cells, as a percentage of the row count.
missing_percent = data.isna().sum().div(len(data)).mul(100)

# One bar per column showing how much of it is missing.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=missing_percent.index, y=missing_percent.values, ax=ax)
ax.set_title('Percentage of Missing Values per Feature', fontsize=16)
ax.set_ylabel('Percentage Missing (%)')
ax.set_xlabel('Features')
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

print("EDA Conclusion: 'ph', 'Sulfate', and 'Trihalomethanes' have significant missing data.")
print("Preprocessing Step: We will use KNNImputer to fill these, as it's more accurate than a simple mean.")
# --------------------------------------------------------------------------
# Technique 2 (Member 2): Handling Class Imbalance
# --------------------------------------------------------------------------
print("\n--- EDA for Technique 2: Class Imbalance ---")

# Bar chart of how many rows fall into each target class.
fig, ax = plt.subplots(figsize=(7, 5))
sns.countplot(x='Potability', data=data, ax=ax)
ax.set_title('Class Distribution (0 = Not Potable, 1 = Potable)', fontsize=16)
ax.set_xlabel('Potability')
ax.set_ylabel('Count')
plt.show()

# Same information as relative frequencies.
class_share = data['Potability'].value_counts(normalize=True)
print(f"Distribution:\n{class_share}")
print("EDA Conclusion: The dataset is imbalanced. There are more 'Not Potable' (0) samples.")
print("Preprocessing Step: We will use SMOTE (Synthetic Minority Over-sampling Technique) on the training data to create a balanced dataset for the model to learn from.")
# --------------------------------------------------------------------------
# Technique 3 (Member 3): Exploring Feature Distributions & Outliers
# --------------------------------------------------------------------------
print("\n--- EDA for Technique 3: Feature Distributions (Outliers) ---")

# Reshape to long form (one row per feature/value pair) so a single
# seaborn call can draw one box per feature.
data_melted = data.melt(
    id_vars=['Potability'],
    var_name='Feature',
    value_name='Value',
)

fig, ax = plt.subplots(figsize=(15, 8))
# showfliers=True keeps the outlier markers visible.
sns.boxplot(x='Feature', y='Value', data=data_melted, showfliers=True, ax=ax)
ax.set_title('Boxplots for Each Feature (Showing Outliers)', fontsize=16)
plt.setp(ax.get_xticklabels(), rotation=45)
# Log axis so features of very different magnitude share one plot.
ax.set_yscale('log')
plt.show()

print("EDA Conclusion: Features have vastly different scales and ranges (e.g., 'Solids' is in 10,000s, 'pH' is 0-14).")
print("Many features also have significant outliers.")
print("Preprocessing Step: Feature Scaling is mandatory for neural networks.")
# --------------------------------------------------------------------------
# Technique 4 (Member 4): Feature Scaling
# --------------------------------------------------------------------------
print("\n--- EDA for Technique 4: Feature Scaling (Before/After) ---")

# Demonstrate standardization on the 'Solids' column only, using just its
# non-null rows; this scaler is for the plot and is not reused later.
solids_data = data[['Solids']].dropna()
scaler_demo = StandardScaler()
solids_scaled = scaler_demo.fit_transform(solids_data)

# Side-by-side density plots: raw values on the left, standardized on the right.
fig, (ax_before, ax_after) = plt.subplots(1, 2, figsize=(12, 5))

sns.kdeplot(solids_data['Solids'], fill=True, ax=ax_before)
ax_before.set_title('Before Scaling (Solids)')
ax_before.set_xlabel('TDS (ppm)')

sns.kdeplot(solids_scaled.ravel(), fill=True, color='green', ax=ax_after)
ax_after.set_title('After Scaling (Solids)')
ax_after.set_xlabel('Standardized Value')

fig.suptitle('Technique 4: Effect of StandardScaler', fontsize=16)
plt.show()

print("EDA Conclusion: Scaling centers the data around 0 and squashes it to a standard range.")
print("Preprocessing Step: We will apply StandardScaler to all 9 features after splitting the data.")
# --------------------------------------------------------------------------
# Technique 5 (Member 5): Correlation Analysis
# --------------------------------------------------------------------------
print("\n--- EDA for Technique 5: Feature Correlation ---")

# Fill NaNs for this visualization only (they would otherwise leave gaps in
# the heatmap). The imputer sees the feature columns alone: if the target
# were part of the neighbour distance, imputed values would be drawn toward
# same-class rows and the feature-vs-Potability correlations would be biased.
feature_cols = data.columns.drop('Potability')
imputer_demo = KNNImputer(n_neighbors=5)
data_imputed_demo = pd.DataFrame(
    imputer_demo.fit_transform(data[feature_cols]),
    columns=feature_cols,
    index=data.index,
)
# Re-attach the untouched target so it still appears in the matrix.
data_imputed_demo['Potability'] = data['Potability']

corr = data_imputed_demo.corr()
plt.figure(figsize=(12, 10))
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Feature Correlation Heatmap', fontsize=16)
plt.show()

print("EDA Conclusion: No features are extremely highly correlated (e.g., > 0.9 or < -0.9).")
print("This suggests that all 9 features provide unique information and should be kept for the model.")
# --------------------------------------------------------------------------
# Final Combined Preprocessing Pipeline (The "How-To")
# --------------------------------------------------------------------------
# Order matters: every fitted transformer (imputer, SMOTE, scaler) learns
# from the TRAINING rows only, so nothing about the test rows or the target
# leaks into the features the model is evaluated on.
print("\n--- Final Preprocessing Pipeline (Code) ---")
print("Combining all techniques to prepare data for the model...")

# 1. Feature / Target Split
print("Step 1: Separating features (X) and target (y)...")
X = data.drop('Potability', axis=1)
y = data['Potability']

# 2. Data Splitting (Train/Test) -- stratified so both sets keep the class ratio.
print("Step 2: Splitting data into training and test sets...")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"Original training samples: {X_train.shape[0]}, Test samples: {X_test.shape[0]}")

# 3. Impute Missing Values
# Fit on training features only; the test set is transformed with the
# neighbours learned from training data. The target column is never seen
# by the imputer.
print("Step 3: Imputing missing values with KNNImputer...")
imputer = KNNImputer(n_neighbors=5)
X_train = pd.DataFrame(
    imputer.fit_transform(X_train), columns=X.columns, index=X_train.index
)
X_test = pd.DataFrame(
    imputer.transform(X_test), columns=X.columns, index=X_test.index
)

# 4. Handle Class Imbalance (SMOTE) -- applied to the training set only.
print("Step 4: Balancing training data with SMOTE...")
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
print(f"Resampled training samples: {X_train_resampled.shape[0]}")

# 5. Feature Scaling
print("Step 5: Applying StandardScaler...")
scaler = StandardScaler()
# Fit the scaler ONLY on the training data
X_train_scaled = scaler.fit_transform(X_train_resampled)
# Apply the same scaler to the test data
X_test_scaled = scaler.transform(X_test)

print("\n✅ Final data pipelines are built and ready for model training.")
print("The 'scaler' object is saved to apply to new user input in the app.")
| # --- 4. TASK 2: ALGORITHM SELECTION, IMPLEMENTATION & HYPERPARAMETER TUNING --- | |
| """ | |
| ### Task 2.1: Algorithm Selection | |
| For this tabular, binary classification task, we will use a **Deep Neural Network (DNN)**, | |
| also known as a Multi-Layer Perceptron (MLP). This is a powerful and flexible | |
| choice that can learn complex, non-linear relationships between the 9 features. | |
| """ | |
# --- Task 2.2: Model Implementation ---
def build_model(input_shape, hidden_units=(64, 128, 64), dropout_rate=0.3):
    """Build the uncompiled MLP used for binary potability classification.

    Args:
        input_shape: Number of input features per sample (an integer).
        hidden_units: Width of each hidden Dense layer, in order. The
            default reproduces the original 64-128-64 architecture.
        dropout_rate: Dropout fraction applied after every hidden layer
            to reduce overfitting.

    Returns:
        A Keras ``Sequential`` model whose final layer is a single sigmoid
        unit. The caller is responsible for compiling it.
    """
    # Declare the input explicitly instead of passing `input_shape=` to the
    # first Dense layer, which newer Keras versions warn against.
    layers = [tf.keras.Input(shape=(input_shape,))]
    for units in hidden_units:
        layers.append(Dense(units, activation='relu'))
        layers.append(Dropout(dropout_rate))
    # Output layer: 1 neuron with sigmoid activation for binary
    # classification (0 or 1).
    layers.append(Dense(1, activation='sigmoid'))
    return Sequential(layers)
# Size the network from the column count of the scaled training matrix,
# then print the layer-by-layer summary.
n_features = X_train_scaled.shape[1]
model = build_model(n_features)
model.summary()
| """ | |
| ### Task 2.3: Hyperparameter Tuning Strategy | |
| * **Optimizer:** Adam (an efficient and popular choice). | |
| * **Loss Function:** `binary_crossentropy` (This is REQUIRED for a two-class, 0/1 problem). | |
| * **Metrics:** We will monitor `accuracy`. | |
| * **Callbacks:** | |
| * `EarlyStopping`: Stops training when validation accuracy stops improving. | |
| * `ReduceLROnPlateau`: Lowers the learning rate if training plateaus. | |
| """ | |
# --- Model Training ---
print("\n--- Model Training ---")
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

callbacks = [
    EarlyStopping(monitor='val_accuracy', patience=20, verbose=1, restore_best_weights=True),
    ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=10, min_lr=1e-6, verbose=1),
]

# Carve a validation set out of the TRAINING data. The callbacks above pick
# the stopping epoch, the restored weights and the learning rate from the
# validation metrics; feeding them the test set would tune the model on the
# very data used for the final report and make those numbers optimistic.
# A shuffled, stratified split is used rather than `validation_split`, which
# takes the trailing rows of the arrays without shuffling.
# NOTE: this hold-out comes from the resampled training data, so its metrics
# are only a training-time guide, not an estimate of real-world performance.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train_scaled,
    y_train_resampled,
    test_size=0.2,
    random_state=42,
    stratify=y_train_resampled,
)

# Train on the RESAMPLED and SCALED data
history = model.fit(
    X_fit,
    y_fit,  # Use the balanced target
    epochs=200,  # Set high, EarlyStopping will handle it
    validation_data=(X_val, y_val),  # Hold-out from training data; test set stays untouched
    callbacks=callbacks,
    batch_size=32,
)
| # --- 5. TASK 3: EVALUATION METRICS --- | |
| """ | |
| ### Task 3.1: Evaluation Metrics | |
| For this problem, **Accuracy is misleading**. We MUST focus on the | |
| **Confusion Matrix** and **Precision for Class 1**. | |
| * **DANGER:** A **False Positive** (model says 'Potable' when it's 'Not Potable') | |
| is the worst possible error. | |
| * **Our Goal:** Minimize False Positives. | |
| * **Key Metric:** **Precision (Class 1)** tells us: "Of all the times the | |
| model said 'Potable', what percentage was it correct?" | |
| """ | |
print("\n--- Final Model Evaluation ---")
# Loss and accuracy on the scaled test set, as reported by Keras.
final_loss, final_accuracy = model.evaluate(X_test_scaled, y_test)
print(f"\nFinal Test Loss: {final_loss:.4f}")
print(f"Final Test Accuracy: {final_accuracy * 100:.2f}% (Can be misleading!)")

# Turn sigmoid outputs into hard 0/1 labels with a 0.5 cut-off.
y_pred_probs = model.predict(X_test_scaled)
y_pred = (y_pred_probs > 0.5).astype(int)

# --- CRITICAL EVALUATION ---
precision_class_1 = precision_score(y_test, y_pred, pos_label=1, zero_division=0)
cm = confusion_matrix(y_test, y_pred)
# Row 0 / column 1: actual "Not Potable" predicted as "Potable".
false_positives = cm[0, 1]

class_names = ['Not Potable (0)', 'Potable (1)']
print("\n--- Detailed Classification Report ---")
print(classification_report(y_test, y_pred, target_names=class_names, zero_division=0))

print("\n--- CRITICAL METRIC ANALYSIS ---")
print(f"Precision (Class 1 - Potable): {precision_class_1 * 100:.2f}%")
print(" > This means when the model says water IS 'Potable', it is correct this % of the time.")
print(f"\nTotal DANGEROUS Predictions (False Positives): {false_positives}")
print(f" > The model incorrectly labeled {false_positives} unsafe samples as 'safe'.")
print("-----------------------------------")

# Confusion-matrix heatmap; the 'Reds' palette is used to highlight danger.
predicted_labels = ['Predicted Not Potable (0)', 'Predicted Potable (1)']
actual_labels = ['Actual Not Potable (0)', 'Actual Potable (1)']
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Reds',
    xticklabels=predicted_labels,
    yticklabels=actual_labels,
    ax=ax,
)
ax.set_title(
    f'Confusion Matrix\n{false_positives} False Positives (DANGEROUS)',
    fontsize=14,
    color='red',
)
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
plt.show()
| # --- 6. TASK 4: ETHICAL AND BIAS ANALYSIS --- | |
| """ | |
| ### Task 4.1: Ethical and Bias Analysis | |
| * **CRITICAL RISK: False Positives.** | |
| As shown in the evaluation, a False Positive (predicting 'Potable' when | |
| water is 'Not Potable') is a severe health risk. The model's Precision | |
| for the 'Potable' class must be as high as possible. | |
| * **Dataset Bias:** | |
| The dataset's origin is not specified. It may represent water from a | |
| specific region or type of source (e.g., municipal vs. well). The | |
| model may not generalize well to water with different chemical profiles | |
| from other parts of the world. | |
| * **Conclusion & Disclaimer:** | |
| This application **MUST** be deployed with a very strong | |
| disclaimer. It should be labeled: "For educational and | |
| informational purposes ONLY. This is NOT a substitute | |
| for a professional, laboratory-based water quality test." | |
| The developer has a responsibility to make this clear to all users. | |
| """ | |
# --- 7. SAVE THE FINAL MODEL AND SCALER ---
# Two artifacts are written to the working directory:
#   1. the trained Keras model (.h5)
#   2. the fitted StandardScaler (.joblib)
# NOTE(review): .h5 is the legacy HDF5 format; kept because the downstream
# app presumably loads this exact filename -- confirm before changing.
model_path = 'water_quality_model.h5'
scaler_path = 'scaler.joblib'

model.save(model_path)
joblib.dump(scaler, scaler_path)

print("\n✅ Final model saved as 'water_quality_model.h5'")
print("✅ Scaler saved as 'scaler.joblib'")
print("\nProject setup complete. You are ready to build the Flask app.")