Spaces:

YashChowdhary
/

Privacy_Preserving_Machine_Learning

Running

App Files Files Community

YashChowdhary committed 11 days ago

Commit

5eb498a

verified ·

1 Parent(s): 0551c02

Upload 6 files

Browse files

Files changed (6) hide show

Assignment2Dataset-1_encrypted.csv +28 -0
Privacy_Preserving_ML_Report.docx +0 -0
app.py +356 -0
model_comparison_results.csv +5 -0
privacy_ml_solution.py +515 -0
requirements.txt +16 -0

Assignment2Dataset-1_encrypted.csv ADDED Viewed

	@@ -0,0 +1,28 @@

+Country,Sex,Marital Status,Education,Loan,House Status,Blood Type,Blood Pressure,Heart Rate,Oxygen Level,Medical Procedure,Smoking,Alcohol Consumption,Allergies,Vaccinations,Tumor Condition,SSN_Hash,Name_Pseudo,Age_Range,Income_Noisy,Income_Range,Heart_Rate_Noisy
+USA,Male,Married,Bachelor's Degree,Yes,Own,O+,120/80,72.0,98%,Appendectomy,No,No,Pollen,Yes,Normal,69891950fb458416,P_75B5BC,35-44,51155.5844842771,Medium (50-75K),81.57509386656757
+Canada,Female,Single,Master's Degree,No,Rent,A-,110/70,68.0,96%,Laser Eye Surgery,No,Yes,Shellfish,No,Normal,e573f4894e4fcb00,P_088715,35-44,59640.73215253263,Medium (50-75K),62.59225523500463
+UK,Male,Divorced,High School Diploma,Yes,Own,B+,130/85,75.0,97%,Colonoscopy,No,Yes,Cats,Yes,Abnormal,4adda9543e0f08c4,P_04F2A9,45-54,69253.27571360671,Medium-High (75-100K),82.27721675527945
+Australia,Female,Married,Associate's Degree,No,Own,AB-,115/75,70.0,99%,Mammogram,No,No,Dust,No,Normal,bc4c4550714f5339,P_6F9923,35-44,48733.380890115775,Medium-Low (30-50K),69.77898304800297
+USA,Male,Single,Some College,No,Rent,O-,125/80,68.0,97%,Dental Cleaning,Yes,Yes,Peanuts,Yes,Normal,dd3b91b2d74edc57,P_B6108A,25-34,31565.34274597414,Medium-Low (30-50K),59.67859243578279
+USA,Female,Widowed,Doctorate Degree,Yes,Own,A+,120/80,80.0,95%,MRI Scan,No,No,Latex,No,Normal,44be5370eb60f1e4,P_94771B,35-44,50163.70931808812,Medium (50-75K),78.65840113607736
+Canada,Male,Married,Master's Degree,Yes,Own,B-,130/85,75.0,98%,Knee Surgery,Yes,No,Pollen,Yes,Abnormal,505cce52b3edc8cc,P_F0560C,45-54,84259.24677047139,Medium-High (75-100K),75.2419591619475
+UK,Female,Single,Bachelor's Degree,No,Rent,AB+,110/70,70.0,99%,Physical Therapy,No,Yes,Shellfish,No,Normal,913377a1e870f8aa,P_03BFB0,25-34,45801.41294513538,Medium-Low (30-50K),69.18261975188135
+Australia,Male,Married,Bachelor's Degree,Yes,Rent,A-,120/80,72.0,98%,Cataract Surgery,No,Yes,Dust,Yes,Normal,a830c60ec19369b1,P_676D45,35-44,59721.88201617836,Medium (50-75K),72.23989641611111
+USA,Female,Married,High School Diploma,Yes,Own,O+,115/75,68.0,97%,Cholecystectomy,Yes,Yes,Cats,Yes,Normal,9699bfb12929a6c5,P_A30D29,25-34,57232.92907292804,Medium (50-75K),77.64247698528224
+USA,Female,Single,Some College,No,Rent,O+,120/80,70.0,96%,Dental Filling,No,No,Pollen,No,Normal,e82a2e8a465ba7f9,P_CB6D4C,35-44,38532.14871211542,Medium-Low (30-50K),69.77146719536536
+Canada,Male,Married,Master's Degree,Yes,Own,B+,130/85,78.0,98%,Hip Replacement,Yes,Yes,Peanuts,No,Normal,710f1475be4d054a,P_B4F2FE,45-54,72904.38784704273,Medium (50-75K),80.96190601495982
+UK,Female,Single,Bachelor's Degree,No,Rent,A-,110/70,65.0,99%,Colon,,,,,,1ede2e6e64afdd33,P_31442C,25-34,49233.46394962361,Medium (50-75K),52.52619618830219
+Canada,Male,Married,Master's Degree,Yes,Own,B+,130/85,78.0,98%,Hip Replacement,Yes,Yes,Peanuts,No,Normal,710f1475be4d054a,P_B4F2FE,45-54,70803.9157620703,Medium (50-75K),80.52811139533243
+USA,Male,Married,Bachelor's Degree,Yes,Own,O+,120/80,72.0,98%,Appendectomy,No,No,Pollen,Yes,Normal,b287d3ae6c91b428,P_F5B786,35-44,61345.57997661682,Medium (50-75K),66.21390327777792
+Canada,Female,Single,Associate's Degree,No,Rent,B-,110/70,68.0,96%,Laser Eye Surgery,No,Yes,Shellfish,No,Normal,5699c90cc1364467,P_6093E9,25-34,51463.02208034713,Medium (50-75K),77.41488800572482
+USA,Male,Married,Master's Degree,Yes,Own,AB+,130/85,75.0,97%,Colonoscopy,Yes,Yes,Cats,Yes,Abnormal,0cf19876616eada6,P_EF0ACF,45-54,76622.73586878179,Medium-High (75-100K),64.28324654278879
+Australia,Female,Married,High School Diploma,Yes,Own,A-,115/75,70.0,99%,Mammogram,No,No,Dust,Yes,Normal,087631cf05b79e32,P_F29CC2,35-44,41259.68599562599,Medium-Low (30-50K),69.66222411037084
+USA,Male,Single,Some College,No,Rent,A+,120/80,68.0,97%,Dental Cleaning,No,Yes,Peanuts,No,Normal,57ddb7a16d4354e3,P_8F4E61,25-34,34701.94661720233,Medium-Low (30-50K),66.40198214113455
+Canada,Female,Divorced,Doctorate Degree,Yes,Rent,O-,125/80,80.0,95%,MRI Scan,Yes,Yes,Latex,Yes,Normal,f515750bc201d1d0,P_0128F3,35-44,60548.79819565589,Medium (50-75K),85.6713329152437
+USA,Male,Married,Bachelor's Degree,Yes,Own,B+,130/85,75.0,98%,Knee Surgery,Yes,Yes,Pollen,No,Abnormal,9b0b122756eb12a5,P_52A9FD,35-44,84743.72157762632,Medium-High (75-100K),66.14148464953857
+UK,Female,Single,Master's Degree,No,Rent,AB-,110/70,70.0,99%,Physical Therapy,No,Yes,Shellfish,No,Normal,31ac9adf7e207829,P_2275B2,25-34,55961.29781433357,Medium (50-75K),74.99332893386477
+Australia,Male,Married,Bachelor's Degree,Yes,Own,A-,120/80,72.0,98%,Cataract Surgery,No,Yes,Dust,Yes,Normal,79c4c73d3f73ac4a,P_F1F296,35-44,103190.7265862128,Medium (50-75K),71.81775356510083
+USA,Female,Married,High School Diploma,Yes,Own,O+,115/75,68.0,97%,Cholecystectomy,Yes,Yes,Cats,Yes,Normal,bab1b29286838d5d,P_29ABBA,25-34,55212.24984731953,Medium (50-75K),75.97531341973752
+USA,Male,Single,Some College,No,Rent,O+,120/80,70.0,96%,Dental Filling,No,No,Pollen,No,Normal,9621c5c20b3eb8fc,P_CE87DA,35-44,62928.41706081162,Medium-Low (30-50K),64.43977881843813
+Canada,Female,Married,Doctorate Degree,Yes,Own,B-,130/85,78.0,98%,Hip Replacement,Yes,No,Peanuts,Yes,Normal,4c07f0c3b1011bcb,P_91B4F0,45-54,67464.52628665997,Medium (50-75K),82.04101765429179
+UK,Male,Single,Bachelor's Degree,No,Rent,A+,110/,,,,,,,,,4c2061381af63b5e,P_CA4564,25-34,52578.59807847751,Medium (50-75K),

Privacy_Preserving_ML_Report.docx ADDED Viewed

Binary file (15.8 kB). View file

app.py ADDED Viewed

	@@ -0,0 +1,356 @@

+"""
+Privacy-Preserving ML Demo - Hugging Face Spaces
+================================================
+Interactive demo showing how privacy techniques affect ML model performance.
+Upload your data or use the sample dataset to see encryption + DP in action.
+"""
+import gradio as gr
+import pandas as pd
+import numpy as np
+import hashlib
+from datetime import datetime
+import io
+# ML imports
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import LabelEncoder, StandardScaler
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import accuracy_score, f1_score
+# Differential Privacy (lightweight, CPU-friendly)
+try:
+    from diffprivlib.models import LogisticRegression as DPLogisticRegression
+    DP_AVAILABLE = True
+except ImportError:
+    DP_AVAILABLE = False
+# ========== PRIVACY FUNCTIONS ==========
def hash_value(val, salt="privacy2024"):
    """Return a short salted SHA-256 digest for an identifier.

    Missing values (anything ``pd.isna`` reports as NA) map to the literal
    string "NULL". Any other value is prefixed with ``salt``, hashed, and
    truncated to the first 12 hexadecimal characters of the digest.
    """
    if pd.isna(val):
        return "NULL"
    hasher = hashlib.sha256()
    hasher.update(f"{salt}{val}".encode())
    full_hex = hasher.hexdigest()
    return full_hex[:12]
def pseudonymize(name, salt="privacy2024"):
    """Map a name to a stable pseudonym derived from a salted MD5 digest.

    The same (salt, name) pair always produces the same "PERSON_XXXXXX"
    label, where XXXXXX is the upper-cased first six hex digits of the
    digest. Missing names produce "P_NULL".
    """
    if pd.isna(name):
        return "P_NULL"
    digest_hex = hashlib.md5(f"{salt}{name}".encode()).hexdigest()
    tag = digest_hex[:6].upper()
    return "PERSON_" + tag
def generalize_dob(dob_str):
    """Convert a date of birth into a coarse age bracket.

    Accepted formats, tried in this order: MM/DD/YYYY, YYYY-MM-DD,
    DD/MM/YYYY. An ambiguous value such as "05/06/1990" is therefore read
    month-first.

    Args:
        dob_str: Date of birth as a string (or anything ``str()`` renders as
            one of the accepted formats).

    Returns:
        One of "Under 30", "30-44", "45-59", "60+", or "Unknown" when the
        value is missing, cannot be parsed, or lies in the future.
    """
    if pd.isna(dob_str):
        return "Unknown"

    text = str(dob_str).strip()
    dob = None
    for fmt in ('%m/%d/%Y', '%Y-%m-%d', '%d/%m/%Y'):
        try:
            dob = datetime.strptime(text, fmt)
        except ValueError:
            # Wrong format for this value; try the next one.
            continue
        break
    if dob is None:
        return "Unknown"

    today = datetime.now()
    if dob > today:
        # A birth date in the future cannot yield a meaningful age.
        return "Unknown"

    # Calendar-accurate age: subtract one year if this year's birthday has
    # not happened yet. (Dividing elapsed days by 365 drifts by the number of
    # leap days and misplaces people who are a few days short of a birthday.)
    birthday_pending = (today.month, today.day) < (dob.month, dob.day)
    age = today.year - dob.year - int(birthday_pending)

    if age < 30:
        return "Under 30"
    if age < 45:
        return "30-44"
    if age < 60:
        return "45-59"
    return "60+"
def add_laplace_noise(val, epsilon=1.0, sensitivity=1.0):
    """Perturb a numeric value with zero-mean Laplace noise.

    The noise scale is ``sensitivity / epsilon``, so a smaller epsilon means
    larger noise. Missing values are handed back untouched.
    """
    if pd.isna(val):
        return val
    spread = sensitivity / epsilon
    base = float(val)
    return base + np.random.laplace(0, spread)
def encrypt_dataframe(df, epsilon=1.0):
    """Apply the demo's privacy transformations to a copy of ``df``.

    Each recognised column is replaced by a protected counterpart and the
    raw column is removed:

    * ``SSN`` -> ``SSN_Hashed`` (via ``hash_value``)
    * ``Name`` -> ``Name_Pseudo`` (via ``pseudonymize``)
    * ``DOB`` -> ``Age_Range`` (via ``generalize_dob``)
    * ``Income`` -> ``Income_Noisy`` (Laplace noise, sensitivity 5000)
    * ``Heart Rate`` -> ``Heart_Rate_Noisy`` (Laplace noise, sensitivity 5)

    Columns that are absent are skipped; every other column passes through
    unchanged.

    Args:
        df: Input dataframe. It is copied, never modified.
        epsilon: Privacy budget forwarded to ``add_laplace_noise``.

    Returns:
        ``(encrypted, transformations)``: the transformed dataframe and a
        list of human-readable descriptions of the steps that were applied.
    """
    encrypted = df.copy()
    transformations = []

    # Hash SSN
    if 'SSN' in encrypted.columns:
        encrypted['SSN_Hashed'] = encrypted['SSN'].apply(hash_value)
        encrypted = encrypted.drop(columns='SSN')
        transformations.append("SSN → SHA-256 hash")

    # Pseudonymize names
    if 'Name' in encrypted.columns:
        encrypted['Name_Pseudo'] = encrypted['Name'].apply(pseudonymize)
        encrypted = encrypted.drop(columns='Name')
        transformations.append("Name → Pseudonym")

    # Generalize DOB
    if 'DOB' in encrypted.columns:
        encrypted['Age_Range'] = encrypted['DOB'].apply(generalize_dob)
        encrypted = encrypted.drop(columns='DOB')
        transformations.append("DOB → Age range (k-anonymity)")

    # Add noise to income
    if 'Income' in encrypted.columns:
        encrypted['Income_Noisy'] = encrypted['Income'].apply(
            lambda x: add_laplace_noise(x, epsilon, 5000)
        )
        encrypted = encrypted.drop(columns='Income')
        transformations.append(f"Income → Laplace noise (ε={epsilon})")

    # Add noise to heart rate
    if 'Heart Rate' in encrypted.columns:
        encrypted['Heart_Rate_Noisy'] = encrypted['Heart Rate'].apply(
            lambda x: add_laplace_noise(x, epsilon, 5)
        )
        # Remove the raw reading, as for every other transformed column:
        # keeping it next to the noised copy would make the noise pointless.
        encrypted = encrypted.drop(columns='Heart Rate')
        transformations.append("Heart Rate → Laplace noise")

    return encrypted, transformations
def prepare_for_ml(df, target_col='Tumor Condition'):
    """Turn a dataframe into a numeric feature matrix and encoded target.

    Steps: drop columns that are entirely empty, split off the target, drop
    identifier-like columns, label-encode text columns (missing -> 'Unknown'),
    coerce the remaining columns to numbers (missing/unparseable -> 0), and
    label-encode the target (missing -> 'Unknown').

    Args:
        df: Source dataframe; it is not modified.
        target_col: Name of the column to predict.

    Returns:
        ``(X, y, error)``. On success ``X`` is a 2-D array of features, ``y``
        the encoded target and ``error`` is None. On failure ``X`` and ``y``
        are None and ``error`` is a message string.
    """
    if target_col not in df.columns:
        return None, None, f"Target column '{target_col}' not found"

    # Copy and clean
    df_clean = df.dropna(axis=1, how='all').copy()
    if target_col not in df_clean.columns:
        # The target existed but held no values, so the clean-up removed it.
        return None, None, f"Target column '{target_col}' has no values"

    # Separate target
    y = df_clean[target_col].copy()
    X = df_clean.drop(columns=[target_col])

    # Remove identifier columns
    id_cols = ['Name', 'SSN', 'DOB', 'Name_Pseudo', 'SSN_Hashed', 'Age_Range']
    X = X.drop(columns=[c for c in id_cols if c in X.columns], errors='ignore')

    # Encode
    for col in X.columns:
        if X[col].dtype == 'object':
            le = LabelEncoder()
            X[col] = le.fit_transform(X[col].fillna('Unknown').astype(str))
        else:
            X[col] = pd.to_numeric(X[col], errors='coerce').fillna(0)

    if y.isna().any():
        # Filling gaps with the string 'Unknown' would leave a numeric target
        # with mixed str/number labels; render everything as text instead so
        # the encoder receives uniform values.
        y = y.astype(object).where(y.notna(), 'Unknown').astype(str)
    le_y = LabelEncoder()
    y_encoded = le_y.fit_transform(y)
    return X.values, y_encoded, None
def run_ml_comparison(df_original, df_encrypted, epsilon):
    """Fit the demo models and tabulate their held-out scores.

    Both dataframes go through ``prepare_for_ml`` and an 80/20 split with a
    fixed seed, and are standardised with a scaler fitted on their own
    training part. Three models are then evaluated:

    1. plain logistic regression on the original data,
    2. differentially private logistic regression on the original data
       (only when diffprivlib could be imported),
    3. a random forest on the encrypted data.

    Returns a DataFrame with one row per model, or an error string when
    either dataframe cannot be prepared.
    """
    def _row(model_name, data_label, truth, guess, privacy_label):
        # One result row; both metrics are rounded to four decimals.
        return {
            'Model': model_name,
            'Data': data_label,
            'Accuracy': round(accuracy_score(truth, guess), 4),
            'F1 Score': round(f1_score(truth, guess, average='weighted'), 4),
            'Privacy Level': privacy_label,
        }

    # Feature/target preparation for both variants of the data
    X_orig, y_orig, err = prepare_for_ml(df_original)
    if err:
        return f"Error with original data: {err}"
    X_enc, y_enc, err = prepare_for_ml(df_encrypted)
    if err:
        return f"Error with encrypted data: {err}"

    # Identical split settings for both variants
    X_tr_o, X_te_o, y_tr_o, y_te_o = train_test_split(
        X_orig, y_orig, test_size=0.2, random_state=42
    )
    X_tr_e, X_te_e, y_tr_e, y_te_e = train_test_split(
        X_enc, y_enc, test_size=0.2, random_state=42
    )

    # Standardise each variant using statistics from its training part only
    scaler_orig = StandardScaler()
    X_tr_o = scaler_orig.fit_transform(X_tr_o)
    X_te_o = scaler_orig.transform(X_te_o)
    scaler_enc = StandardScaler()
    X_tr_e = scaler_enc.fit_transform(X_tr_e)
    X_te_e = scaler_enc.transform(X_te_e)

    rows = []

    # Model 1: standard logistic regression, original data
    baseline = LogisticRegression(max_iter=1000, random_state=42)
    baseline.fit(X_tr_o, y_tr_o)
    rows.append(_row(
        'Standard Logistic Regression',
        'Original (No Privacy)',
        y_te_o,
        baseline.predict(X_te_o),
        'None ❌',
    ))

    # Model 2: differentially private logistic regression, original data
    if DP_AVAILABLE:
        try:
            row_norm_max = np.linalg.norm(X_tr_o, axis=1).max()
            private_lr = DPLogisticRegression(
                epsilon=epsilon, data_norm=row_norm_max,
                max_iter=1000, random_state=42
            )
            private_lr.fit(X_tr_o, y_tr_o)
            rows.append(_row(
                f'DP Logistic Regression (ε={epsilon})',
                'Original + DP Training',
                y_te_o,
                private_lr.predict(X_te_o),
                f'High ✓ (ε={epsilon})',
            ))
        except Exception as e:
            # Report the failure as a table row instead of aborting the run.
            rows.append({
                'Model': 'DP Logistic Regression',
                'Data': 'Error',
                'Accuracy': 0,
                'F1 Score': 0,
                'Privacy Level': f'Error: {str(e)[:50]}'
            })

    # Model 3: random forest, encrypted data
    forest = RandomForestClassifier(n_estimators=50, max_depth=8, random_state=42)
    forest.fit(X_tr_e, y_tr_e)
    rows.append(_row(
        'Random Forest',
        'Encrypted Data',
        y_te_e,
        forest.predict(X_te_e),
        'High ✓ (Data Encrypted)',
    ))

    return pd.DataFrame(rows)
+# ========== GRADIO INTERFACE ==========
def process_data(file, epsilon, show_sample):
    """Gradio callback: load a CSV, anonymise it and compare models.

    Args:
        file: The uploaded file, either an object exposing the path as
            ``.name`` or the path itself, or None when nothing was uploaded.
        epsilon: Privacy budget passed to the noise and DP-training steps.
        show_sample: When true, the first five rows of the original and the
            encrypted data are returned for display.

    Returns:
        ``(text, comparison, original_sample, encrypted_sample)``. ``text`` is
        Markdown listing the applied transformations, or an error message.
        ``comparison`` is the model results table, or None when it could not
        be produced. The samples are dataframes or None.
    """
    # Load data
    if file is None:
        return "Please upload a CSV file.", None, None, None
    # NOTE(review): the upload component may pass a plain path string rather
    # than an object with ``.name`` depending on the Gradio version/config —
    # accept both; confirm against the deployed version.
    csv_path = getattr(file, "name", file)
    try:
        df = pd.read_csv(csv_path)
    except Exception as e:
        return f"Error reading file: {e}", None, None, None

    # Clean
    df = df.dropna(axis=1, how='all').drop_duplicates()
    df.columns = df.columns.str.strip()

    # Encrypt
    df_encrypted, transformations = encrypt_dataframe(df, epsilon)

    # Run ML comparison
    comparison_df = run_ml_comparison(df, df_encrypted, epsilon)

    # Prepare outputs
    transform_text = "**Privacy Transformations Applied:**\n" + "\n".join(
        [f"• {t}" for t in transformations]
    )
    if isinstance(comparison_df, str):
        # run_ml_comparison reports failures as a message string; show it in
        # the text panel rather than feeding a string to the table output.
        transform_text += f"\n\n**Model comparison failed:** {comparison_df}"
        comparison_df = None

    # Sample data (first 5 rows)
    sample_orig = df.head(5) if show_sample else None
    sample_enc = df_encrypted.head(5) if show_sample else None
    return transform_text, comparison_df, sample_orig, sample_enc
def create_demo():
    """Build the Gradio interface.

    Lays out, top to bottom: an introduction, the input controls (CSV upload,
    epsilon slider from 0.1 to 10.0, "show samples" checkbox, run button), a
    Markdown area for the applied transformations, the model comparison
    table, side-by-side samples of the original and encrypted data, and a
    reference table of the techniques. The run button is wired to
    ``process_data``.

    Returns:
        The assembled ``gr.Blocks`` object; it is not launched here.
    """
    with gr.Blocks(title="Privacy-Preserving ML Demo", theme=gr.themes.Soft()) as demo:
        # Introductory text shown at the top of the page.
        gr.Markdown("""
        # 🔒 Privacy-Preserving Machine Learning Demo
        This demo shows how **differential privacy** and **data encryption** techniques
        can protect sensitive data while still allowing useful ML predictions.
        ## How it works:
        1. Upload your healthcare/financial CSV dataset
        2. Adjust the privacy budget (epsilon) - lower = more privacy, less accuracy
        3. See how different privacy techniques transform your data
        4. Compare model performance: original vs. encrypted data
        ---
        """)
        # Input controls; these three values are the arguments of process_data.
        with gr.Row():
            with gr.Column(scale=1):
                file_input = gr.File(
                    label="📁 Upload CSV Dataset",
                    file_types=[".csv"]
                )
                epsilon_slider = gr.Slider(
                    minimum=0.1, maximum=10.0, value=1.0, step=0.1,
                    label="🔐 Privacy Budget (Epsilon)",
                    info="Lower = more privacy, less utility. Typical: 0.1-2.0"
                )
                show_sample = gr.Checkbox(
                    value=True,
                    label="Show data samples"
                )
                run_btn = gr.Button("🚀 Run Privacy Analysis", variant="primary")
        # Output 1: Markdown list of the transformations that were applied.
        with gr.Row():
            transform_output = gr.Markdown(label="Transformations Applied")
        # Output 2: model comparison table.
        gr.Markdown("## 📊 Model Performance Comparison")
        comparison_output = gr.Dataframe(label="Results")
        # Outputs 3 and 4: data samples before and after the transformations.
        with gr.Row():
            with gr.Column():
                gr.Markdown("### Original Data (Sample)")
                orig_sample = gr.Dataframe(label="First 5 rows")
            with gr.Column():
                gr.Markdown("### Encrypted Data (Sample)")
                enc_sample = gr.Dataframe(label="First 5 rows - PII Protected")
        # Static reference section describing the techniques and epsilon.
        gr.Markdown("""
        ---
        ## 📚 Privacy Techniques Used
        | Technique | What it Does | Applied To |
        |-----------|--------------|------------|
        | **SHA-256 Hashing** | One-way irreversible hash | SSN |
        | **Pseudonymization** | Replace with fake IDs | Names |
        | **K-Anonymity** | Generalize to ranges | DOB, Income |
        | **Laplace Noise** | Add random noise | Numeric values |
        | **Differential Privacy** | Mathematical privacy guarantee | ML training |
        **Privacy Budget (ε):** Controls the trade-off between privacy and utility.
        - ε = 0.1: Very high privacy, significant accuracy loss
        - ε = 1.0: Good balance (recommended)
        - ε = 10.0: Low privacy, minimal accuracy loss
        """)
        # Connect button to function; the order of `outputs` must match the
        # order of the four values returned by process_data.
        run_btn.click(
            fn=process_data,
            inputs=[file_input, epsilon_slider, show_sample],
            outputs=[transform_output, comparison_output, orig_sample, enc_sample]
        )
    return demo
+# Launch
if __name__ == "__main__":
    # Build the interface and start the Gradio server only when this file is
    # executed as a script, not when it is imported.
    demo = create_demo()
    demo.launch()

model_comparison_results.csv ADDED Viewed

	@@ -0,0 +1,5 @@

+Model,Accuracy,F1_Score
+Standard LR (No Privacy),1.0,1.0
+Standard RF (No Privacy),1.0,1.0
+LR on Encrypted Data,1.0,1.0
+RF on Encrypted Data,0.6666666666666666,0.8000000000000002

privacy_ml_solution.py ADDED Viewed

	@@ -0,0 +1,515 @@

+"""
+Privacy-Preserving Machine Learning Solution
+=============================================
+Implements differential privacy and data encryption for healthcare data classification.
+Designed for Hugging Face Spaces deployment (CPU-only, free tier compatible).
+Author: Data Science Assignment
+"""
+import pandas as pd
+import numpy as np
+import hashlib
+import base64
+import warnings
+from datetime import datetime
+from typing import Tuple, Dict, Any
+# Core ML libraries
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import LabelEncoder, StandardScaler
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
+# Differential Privacy library - IBM's diffprivlib
+# Lightweight, sklearn-compatible, works on CPU
+try:
+    from diffprivlib.models import LogisticRegression as DPLogisticRegression
+    from diffprivlib.models import GaussianNB as DPGaussianNB
+    DIFFPRIVLIB_AVAILABLE = True
+except ImportError:
+    DIFFPRIVLIB_AVAILABLE = False
+    print("Warning: diffprivlib not installed. Install with: pip install diffprivlib")
+warnings.filterwarnings('ignore')
+# ============================================================================
+# SECTION 1: DATA ENCRYPTION UTILITIES
+# ============================================================================
class DataPrivacyProcessor:
    """
    Handles multiple privacy-preserving transformations:
    1. Hashing (SHA-256) for direct identifiers like SSN
    2. K-anonymity style generalization for quasi-identifiers
    3. Data masking for names
    4. Noise addition (Laplace mechanism) for numerical values
    """

    def __init__(self, epsilon: float = 1.0):
        """
        Args:
            epsilon: Privacy budget for differential privacy.
                     Lower = more privacy, less utility.
                     Typical range: 0.1 (high privacy) to 10 (low privacy)
        """
        self.epsilon = epsilon
        self.salt = "privacy_salt_2024"  # Salt for hashing

    def hash_identifier(self, value: str) -> str:
        """
        One-way hash for direct identifiers (SSN, etc.).
        Returns the first 16 hex characters of the salted SHA-256 digest,
        or "HASH_NULL" for a missing value.
        """
        if pd.isna(value):
            return "HASH_NULL"
        salted = f"{self.salt}{value}"
        return hashlib.sha256(salted.encode()).hexdigest()[:16]

    def mask_name(self, name: str) -> str:
        """
        Pseudonymizes names with a deterministic label.
        Example: 'John Smith' -> 'P_A1B2C3'. Missing names -> 'P_NULL'.
        """
        if pd.isna(name):
            return "P_NULL"
        # Create deterministic pseudonym from hash
        hash_val = hashlib.md5(f"{self.salt}{name}".encode()).hexdigest()[:6]
        return f"P_{hash_val.upper()}"

    def generalize_age(self, dob_str: str) -> str:
        """
        K-anonymity: Generalizes exact DOB to age ranges.

        Accepted formats, tried in order: MM/DD/YYYY, YYYY-MM-DD, DD/MM/YYYY
        (ambiguous values are therefore read month-first). Returns "Unknown"
        for missing, unparseable or future dates.
        """
        if pd.isna(dob_str):
            return "Unknown"
        text = str(dob_str).strip()
        dob = None
        # Handle multiple date formats
        for fmt in ('%m/%d/%Y', '%Y-%m-%d', '%d/%m/%Y'):
            try:
                dob = datetime.strptime(text, fmt)
            except ValueError:
                continue
            break
        if dob is None:
            return "Unknown"

        today = datetime.now()
        if dob > today:
            # A birth date in the future cannot yield a meaningful age.
            return "Unknown"
        # Calendar-accurate age: one year less if this year's birthday is
        # still ahead. (Elapsed days // 365 drifts with leap days.)
        birthday_pending = (today.month, today.day) < (dob.month, dob.day)
        age = today.year - dob.year - int(birthday_pending)

        # Create age buckets (10-year ranges for k-anonymity).
        # NOTE: every age below 25, including minors, gets the "18-24" label.
        if age < 25:
            return "18-24"
        if age < 35:
            return "25-34"
        if age < 45:
            return "35-44"
        if age < 55:
            return "45-54"
        if age < 65:
            return "55-64"
        return "65+"

    def generalize_income(self, income: float) -> str:
        """
        K-anonymity: Buckets income into ranges.
        Returns "Unknown" for missing or non-numeric input.
        """
        if pd.isna(income):
            return "Unknown"
        try:
            income = float(income)
        except (ValueError, TypeError):
            return "Unknown"
        if income < 30000:
            return "Low (<30K)"
        if income < 50000:
            return "Medium-Low (30-50K)"
        if income < 75000:
            return "Medium (50-75K)"
        if income < 100000:
            return "Medium-High (75-100K)"
        return "High (100K+)"

    def add_laplace_noise(self, value: float, sensitivity: float = 1.0) -> float:
        """
        Differential Privacy: Adds Laplace noise with scale
        ``sensitivity / self.epsilon``. Missing values are returned as-is.
        Args:
            value: Original numeric value
            sensitivity: How much one person can affect the output
        """
        if pd.isna(value):
            return value
        scale = sensitivity / self.epsilon
        noise = np.random.laplace(0, scale)
        # Coerce so numeric strings read from CSV are handled as numbers.
        return float(value) + noise

    def encrypt_dataset(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Applies the matching privacy technique to each recognised column and
        removes the raw column afterwards. The input frame is not modified.

        SSN -> SSN_Hash, Name -> Name_Pseudo, DOB -> Age_Range,
        Income -> Income_Noisy + Income_Range, Heart Rate -> Heart_Rate_Noisy.
        All other columns pass through unchanged.
        """
        encrypted_df = df.copy()
        print("Applying privacy-preserving transformations...")

        # 1. Hash direct identifiers (SSN) - irreversible
        if 'SSN' in encrypted_df.columns:
            encrypted_df['SSN_Hash'] = encrypted_df['SSN'].apply(self.hash_identifier)
            encrypted_df = encrypted_df.drop(columns='SSN')
            print("  ✓ SSN hashed with SHA-256")

        # 2. Pseudonymize names
        if 'Name' in encrypted_df.columns:
            encrypted_df['Name_Pseudo'] = encrypted_df['Name'].apply(self.mask_name)
            encrypted_df = encrypted_df.drop(columns='Name')
            print("  ✓ Names pseudonymized")

        # 3. Generalize DOB to age ranges (k-anonymity)
        if 'DOB' in encrypted_df.columns:
            encrypted_df['Age_Range'] = encrypted_df['DOB'].apply(self.generalize_age)
            encrypted_df = encrypted_df.drop(columns='DOB')
            print("  ✓ DOB generalized to age ranges")

        # 4. Generalize income (k-anonymity)
        if 'Income' in encrypted_df.columns:
            # Keep noisy version for ML, generalized for reporting.
            # The range is derived from the raw income, not the noisy one,
            # so the two columns can disagree near bucket boundaries.
            encrypted_df['Income_Noisy'] = encrypted_df['Income'].apply(
                lambda x: self.add_laplace_noise(x, sensitivity=5000)
            )
            encrypted_df['Income_Range'] = encrypted_df['Income'].apply(self.generalize_income)
            encrypted_df = encrypted_df.drop(columns='Income')
            print("  ✓ Income: noise added + generalized")

        # 5. Add noise to other numerical health metrics
        numeric_noise_cols = ['Heart Rate']
        for col in numeric_noise_cols:
            if col not in encrypted_df.columns:
                continue
            # Underscore-only column name, e.g. "Heart_Rate_Noisy".
            noisy_name = f"{col.replace(' ', '_')}_Noisy"
            encrypted_df[noisy_name] = encrypted_df[col].apply(
                lambda x: self.add_laplace_noise(x, sensitivity=5)
            )
            # Drop the raw reading; keeping it beside the noised copy would
            # defeat the purpose of adding noise.
            encrypted_df = encrypted_df.drop(columns=col)
            print(f"  ✓ {col}: Laplace noise added")

        print(f"\nPrivacy budget (epsilon) used: {self.epsilon}")
        return encrypted_df
+# ============================================================================
+# SECTION 2: DATA PREPROCESSING
+# ============================================================================
class HealthcareDataProcessor:
    """
    Prepares healthcare data for ML model training.
    Handles loading, basic cleaning, label encoding and feature scaling.
    """

    def __init__(self):
        self.label_encoders = {}        # column name (or 'target') -> fitted LabelEncoder
        self.scaler = StandardScaler()  # fitted in prepare_features
        self.feature_columns = []       # feature column order used for X

    def load_and_clean(self, filepath: str) -> pd.DataFrame:
        """Load a CSV, drop all-empty columns and duplicate rows, and strip
        whitespace from the column names."""
        df = pd.read_csv(filepath)
        # Remove completely empty columns
        df = df.dropna(axis=1, how='all')
        # Remove duplicate rows
        df = df.drop_duplicates()
        # Clean column names
        df.columns = df.columns.str.strip()
        print(f"Loaded {len(df)} records with {len(df.columns)} features")
        return df

    def prepare_features(self, df: pd.DataFrame, target_col: str = 'Tumor Condition') -> Tuple[np.ndarray, np.ndarray]:
        """
        Encodes categorical features and prepares for ML.

        Text-like columns are label-encoded (missing -> 'Unknown'); all other
        columns are coerced to numbers with missing values replaced by the
        column median. The result is standardised with ``self.scaler``.

        Returns feature matrix X and target vector y.

        Raises:
            ValueError: if ``target_col`` is not a column of ``df``.
        """
        # Identify target
        if target_col not in df.columns:
            raise ValueError(f"Target column '{target_col}' not found!")

        # Separate features and target
        y = df[target_col].copy()
        X_df = df.drop(columns=[target_col])

        # Remove non-predictive columns (identifiers)
        cols_to_drop = ['Name', 'SSN', 'Name_Pseudo', 'SSN_Hash', 'DOB']
        X_df = X_df.drop(columns=[c for c in cols_to_drop if c in X_df.columns], errors='ignore')

        # Encode target variable
        if y.isna().any():
            # Filling gaps with the string 'Unknown' would leave a numeric
            # target with mixed str/number labels; render everything as text
            # so the encoder receives uniform values.
            y = y.astype(object).where(y.notna(), 'Unknown').astype(str)
        le_target = LabelEncoder()
        y_encoded = le_target.fit_transform(y)
        self.label_encoders['target'] = le_target

        # Process each column
        processed_cols = []
        for col in X_df.columns:
            col_dtype = X_df[col].dtype
            is_textual = (
                pd.api.types.is_object_dtype(col_dtype)
                or pd.api.types.is_string_dtype(col_dtype)
                or isinstance(col_dtype, pd.CategoricalDtype)
            )
            if is_textual:
                # Categorical: label encode. Go through object dtype first so
                # 'Unknown' can be inserted even into a categorical column.
                values = X_df[col].astype(object)
                values = values.where(values.notna(), 'Unknown').astype(str)
                le = LabelEncoder()
                X_df[col] = le.fit_transform(values)
                self.label_encoders[col] = le
            else:
                # Numeric: fill NaN with median
                numeric = pd.to_numeric(X_df[col], errors='coerce')
                fill_value = numeric.median()
                if pd.isna(fill_value):
                    # Column has no usable numbers at all; the median is NaN,
                    # so fall back to 0 instead of passing NaN downstream.
                    fill_value = 0.0
                X_df[col] = numeric.fillna(fill_value)
            processed_cols.append(col)
        self.feature_columns = processed_cols

        # Scale features
        X_scaled = self.scaler.fit_transform(X_df)
        print(f"Prepared {X_scaled.shape[1]} features for {X_scaled.shape[0]} samples")
        return X_scaled, y_encoded
+# ============================================================================
+# SECTION 3: MODEL TRAINING AND EVALUATION
+# ============================================================================
class PrivacyPreservingMLPipeline:
    """
    End-to-end comparison of three training set-ups:
    1. Standard model (no privacy)
    2. Differentially private model
    3. Model trained on encrypted data

    Metrics of every evaluated model are collected in ``self.results``,
    keyed by model name.
    """

    def __init__(self, epsilon: float = 1.0):
        self.epsilon = epsilon
        self.results = {}

    @staticmethod
    def _banner(title: str) -> None:
        # Section header printed before each training run / the final table.
        rule = "=" * 60
        print("\n" + rule)
        print(title)
        print(rule)

    @staticmethod
    def _print_scores(metrics: Dict[str, float]) -> None:
        # Short console summary shared by all training methods.
        print(f"Accuracy: {metrics['accuracy']:.4f}")
        print(f"F1 Score: {metrics['f1']:.4f}")

    def evaluate_model(self, y_true: np.ndarray, y_pred: np.ndarray, model_name: str) -> Dict[str, float]:
        """Compute accuracy and weighted precision/recall/F1, remember them
        under ``model_name`` and return them."""
        weighted = dict(average='weighted', zero_division=0)
        metrics = {
            'accuracy': accuracy_score(y_true, y_pred),
            'precision': precision_score(y_true, y_pred, **weighted),
            'recall': recall_score(y_true, y_pred, **weighted),
            'f1': f1_score(y_true, y_pred, **weighted)
        }
        self.results[model_name] = metrics
        return metrics

    def train_standard_model(self, X_train: np.ndarray, X_test: np.ndarray,
                             y_train: np.ndarray, y_test: np.ndarray) -> Dict[str, float]:
        """Fit a plain logistic regression and score it on the test split."""
        self._banner("TRAINING STANDARD MODEL (No Privacy Protection)")
        classifier = LogisticRegression(max_iter=1000, random_state=42)
        classifier.fit(X_train, y_train)
        scores = self.evaluate_model(y_test, classifier.predict(X_test), 'Standard LR')
        self._print_scores(scores)
        return scores

    def train_dp_model(self, X_train: np.ndarray, X_test: np.ndarray,
                       y_train: np.ndarray, y_test: np.ndarray) -> Dict[str, float]:
        """Fit diffprivlib's logistic regression with ``self.epsilon`` and
        score it; returns an empty dict when diffprivlib is unavailable."""
        self._banner(f"TRAINING DP MODEL (Epsilon = {self.epsilon})")
        if not DIFFPRIVLIB_AVAILABLE:
            print("diffprivlib not available - skipping DP model")
            return {}
        # diffprivlib needs a bound on the row norms; use the largest one
        # observed in the training data.
        largest_row_norm = np.linalg.norm(X_train, axis=1).max()
        classifier = DPLogisticRegression(
            epsilon=self.epsilon,
            data_norm=largest_row_norm,
            max_iter=1000,
            random_state=42
        )
        classifier.fit(X_train, y_train)
        scores = self.evaluate_model(
            y_test, classifier.predict(X_test), f'DP LR (ε={self.epsilon})'
        )
        self._print_scores(scores)
        return scores

    def train_on_encrypted_data(self, X_train: np.ndarray, X_test: np.ndarray,
                                y_train: np.ndarray, y_test: np.ndarray) -> Dict[str, float]:
        """Fit a random forest on data that the caller has already
        encrypted/anonymized, and score it on the test split."""
        self._banner("TRAINING ON ENCRYPTED DATA")
        forest = RandomForestClassifier(
            n_estimators=100,
            max_depth=10,
            random_state=42,
            n_jobs=-1
        )
        forest.fit(X_train, y_train)
        scores = self.evaluate_model(y_test, forest.predict(X_test), 'RF on Encrypted Data')
        self._print_scores(scores)
        return scores

    def compare_results(self) -> pd.DataFrame:
        """Return (and print) one row per evaluated model with its metrics
        rounded to four decimals; an empty frame if nothing was evaluated."""
        if not self.results:
            return pd.DataFrame()
        table = pd.DataFrame(self.results).T.round(4)
        self._banner("MODEL COMPARISON RESULTS")
        print(table.to_string())
        return table
+# ============================================================================
+# SECTION 4: MAIN EXECUTION
+# ============================================================================
+def run_complete_pipeline(data_path: str, epsilon: float = 1.0) -> Tuple[pd.DataFrame, pd.DataFrame, Dict]:
+    """
+    Execute the complete privacy-preserving ML pipeline.
+    Args:
+        data_path: Path to the CSV dataset
+        epsilon: Privacy budget for differential privacy
+    Returns:
+        - Original cleaned DataFrame
+        - Encrypted DataFrame
+        - Dictionary of all results
+    """
+    print("="*70)
+    print("PRIVACY-PRESERVING MACHINE LEARNING PIPELINE")
+    print("="*70)
+    print(f"Privacy budget (epsilon): {epsilon}")
+    print(f"Data file: {data_path}")
+    print("="*70)
+    # Step 1: Load and clean data
+    processor = HealthcareDataProcessor()
+    df_original = processor.load_and_clean(data_path)
+    print("\n--- ORIGINAL DATA SAMPLE ---")
+    print(df_original.head(3).to_string())
+    # Step 2: Apply privacy transformations
+    privacy_processor = DataPrivacyProcessor(epsilon=epsilon)
+    df_encrypted = privacy_processor.encrypt_dataset(df_original)
+    print("\n--- ENCRYPTED DATA SAMPLE ---")
+    print(df_encrypted.head(3).to_string())
+    # Save encrypted dataset
+    encrypted_path = data_path.replace('.csv', '_encrypted.csv')
+    df_encrypted.to_csv(encrypted_path, index=False)
+    print(f"\n✓ Encrypted dataset saved to: {encrypted_path}")
+    # Step 3: Prepare features from ORIGINAL data
+    X_orig, y_orig = processor.prepare_features(df_original.copy())
+    X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(
+        X_orig, y_orig, test_size=0.2, random_state=42, stratify=y_orig
+    )
+    # Step 4: Prepare features from ENCRYPTED data
+    processor_enc = HealthcareDataProcessor()
+    df_enc_clean = df_encrypted.copy()
+    X_enc, y_enc = processor_enc.prepare_features(df_enc_clean)
+    X_train_enc, X_test_enc, y_train_enc, y_test_enc = train_test_split(
+        X_enc, y_enc, test_size=0.2, random_state=42, stratify=y_enc
+    )
+    # Step 5: Train and evaluate models
+    pipeline = PrivacyPreservingMLPipeline(epsilon=epsilon)
+    # Model 1: Standard (no privacy)
+    pipeline.train_standard_model(X_train_orig, X_test_orig, y_train_orig, y_test_orig)
+    # Model 2: Differential Privacy
+    if DIFFPRIVLIB_AVAILABLE:
+        pipeline.train_dp_model(X_train_orig, X_test_orig, y_train_orig, y_test_orig)
+    # Model 3: Trained on encrypted data
+    pipeline.train_on_encrypted_data(X_train_enc, X_test_enc, y_train_enc, y_test_enc)
+    # Step 6: Generate comparison
+    comparison = pipeline.compare_results()
+    # Step 7: Summary
+    results = {
+        'original_shape': df_original.shape,
+        'encrypted_shape': df_encrypted.shape,
+        'epsilon': epsilon,
+        'model_comparison': comparison.to_dict(),
+        'privacy_techniques_applied': [
+            'SHA-256 Hashing (SSN)',
+            'Pseudonymization (Names)',
+            'K-Anonymity Generalization (DOB, Income)',
+            'Laplace Noise Addition (Numerical features)',
+            f'Differential Privacy (ε={epsilon})'
+        ]
+    }
+    print("\n" + "="*70)
+    print("PIPELINE COMPLETED SUCCESSFULLY")
+    print("="*70)
+    return df_original, df_encrypted, results
+# ============================================================================
+# SECTION 5: COMMAND LINE INTERFACE
+# ============================================================================
+if __name__ == "__main__":
+    import sys
+    # Default settings
+    data_file = "Assignment2Dataset-1.csv"
+    epsilon = 1.0  # Balance between privacy and utility
+    # Allow command line arguments
+    if len(sys.argv) > 1:
+        data_file = sys.argv[1]
+    if len(sys.argv) > 2:
+        epsilon = float(sys.argv[2])
+    # Run the complete pipeline
+    df_orig, df_enc, results = run_complete_pipeline(data_file, epsilon)
+    print("\n\nFinal Summary:")
+    print("-" * 40)
+    print(f"Original records: {results['original_shape'][0]}")
+    print(f"Privacy techniques applied: {len(results['privacy_techniques_applied'])}")
+    print(f"Epsilon value: {results['epsilon']}")

requirements.txt ADDED Viewed

	@@ -0,0 +1,16 @@

+# Requirements for Privacy-Preserving ML Demo
+# Hugging Face Spaces - CPU Only (Free Tier Compatible)
+# Core ML
+pandas>=2.0.0
+numpy>=1.24.0
+scikit-learn>=1.3.0
+# Differential Privacy - IBM's library (lightweight, pure Python)
+diffprivlib>=0.6.0
+# Gradio for web interface
+gradio>=4.0.0
+# Note: No GPU libraries needed - runs on CPU
+# Total install size: ~200MB (within free tier limits)