| """ |
| Privacy-Preserving ML Demo - Hugging Face Spaces |
| ================================================ |
| Interactive demo showing how privacy techniques affect ML model performance. |
| Upload your data or use the sample dataset to see encryption + DP in action. |
| """ |
|
|
| import gradio as gr |
| import pandas as pd |
| import numpy as np |
| import hashlib |
| from datetime import datetime |
| import io |
|
|
| |
| from sklearn.model_selection import train_test_split |
| from sklearn.preprocessing import LabelEncoder, StandardScaler |
| from sklearn.ensemble import RandomForestClassifier |
| from sklearn.linear_model import LogisticRegression |
| from sklearn.metrics import accuracy_score, f1_score |
|
|
| |
# diffprivlib is an optional dependency: record whether it could be imported
# so the DP model can be skipped when the package is missing.
try:
    from diffprivlib.models import LogisticRegression as DPLogisticRegression
except ImportError:
    DP_AVAILABLE = False
else:
    DP_AVAILABLE = True
|
|
|
|
| |
|
|
def hash_value(val, salt="privacy2024"):
    """Return a short salted SHA-256 token for an identifier.

    Missing values (anything ``pd.isna`` reports as NA) map to the literal
    ``"NULL"``. Any other value is prefixed with ``salt``, hashed with
    SHA-256, and reduced to the first 12 hex characters of the digest.
    """
    if not pd.isna(val):
        salted = f"{salt}{val}"
        digest = hashlib.sha256(salted.encode()).hexdigest()
        return digest[:12]
    return "NULL"
|
|
def pseudonymize(name, salt="privacy2024"):
    """Map a name to a stable ``PERSON_XXXXXX`` pseudonym.

    The same ``(salt, name)`` pair always yields the same pseudonym. The
    suffix is the first six hex characters of the salted MD5 digest, in
    upper case. Missing names map to ``"P_NULL"``.
    """
    if not pd.isna(name):
        salted = f"{salt}{name}".encode()
        suffix = hashlib.md5(salted).hexdigest()[:6].upper()
        return f"PERSON_{suffix}"
    return "P_NULL"
|
|
def generalize_dob(dob_str, today=None):
    """Convert a date of birth into a coarse age bracket.

    Args:
        dob_str: Date of birth, either a ``datetime`` instance or a string in
            ``MM/DD/YYYY``, ``YYYY-MM-DD`` or ``DD/MM/YYYY`` form. The formats
            are tried in that order, so an ambiguous value such as
            ``03/04/1990`` is read month-first.
        today: Reference date for the age calculation (anything with
            ``year``/``month``/``day`` attributes). Defaults to the current
            local date; pass a fixed value for reproducible results.

    Returns:
        ``"Under 30"``, ``"30-44"``, ``"45-59"`` or ``"60+"``; ``"Unknown"``
        when the value is missing, cannot be parsed, or lies in the future.
    """
    if pd.isna(dob_str):
        return "Unknown"

    if isinstance(dob_str, datetime):
        # Already parsed (e.g. a pandas Timestamp); no string round-trip.
        dob = dob_str
    else:
        text = str(dob_str).strip()
        for fmt in ('%m/%d/%Y', '%Y-%m-%d', '%d/%m/%Y'):
            try:
                dob = datetime.strptime(text, fmt)
                break
            except ValueError:
                continue
        else:
            return "Unknown"

    if today is None:
        today = datetime.now()

    # Count completed calendar years instead of days // 365, which drifts by
    # one day per leap year and misplaces people just before a birthday.
    had_birthday = (today.month, today.day) >= (dob.month, dob.day)
    age = today.year - dob.year - (0 if had_birthday else 1)

    if age < 0:
        # A birth date after the reference date is bad data, not a young person.
        return "Unknown"
    if age < 30:
        return "Under 30"
    if age < 45:
        return "30-44"
    if age < 60:
        return "45-59"
    return "60+"
|
|
def add_laplace_noise(val, epsilon=1.0, sensitivity=1.0, rng=None):
    """Perturb a numeric value with zero-mean Laplace noise.

    The noise scale is ``sensitivity / epsilon``, so a smaller ``epsilon``
    produces larger perturbations.

    Args:
        val: Value to perturb. Missing values are returned unchanged.
        epsilon: Privacy budget; must be strictly positive.
        sensitivity: Numerator of the noise scale.
        rng: Optional ``numpy.random.Generator`` to draw from. When omitted
            the global ``np.random`` state is used, as before.

    Returns:
        ``float(val)`` plus one Laplace draw, or ``val`` itself when missing.

    Raises:
        ValueError: If ``epsilon`` is not strictly positive.
    """
    if pd.isna(val):
        return val
    # ``not (epsilon > 0)`` also rejects NaN. Zero would otherwise surface as
    # a ZeroDivisionError, or as an infinite scale for NumPy scalars.
    if not epsilon > 0:
        raise ValueError(f"epsilon must be > 0, got {epsilon!r}")
    scale = sensitivity / epsilon
    sampler = np.random if rng is None else rng
    return float(val) + sampler.laplace(0, scale)
|
|
|
|
def encrypt_dataframe(df, epsilon=1.0):
    """Apply the demo's privacy transformations to a copy of ``df``.

    Each recognized column is replaced by a protected counterpart and the raw
    column is removed from the result:

    * ``SSN`` -> ``SSN_Hashed`` (``hash_value``)
    * ``Name`` -> ``Name_Pseudo`` (``pseudonymize``)
    * ``DOB`` -> ``Age_Range`` (``generalize_dob``)
    * ``Income`` -> ``Income_Noisy`` (``add_laplace_noise``, sensitivity 5000)
    * ``Heart Rate`` -> ``Heart_Rate_Noisy`` (``add_laplace_noise``,
      sensitivity 5)

    Columns that are absent are skipped; every other column passes through
    untouched. Despite the name, no cryptographic encryption is performed.

    Args:
        df: Input dataframe; it is not modified.
        epsilon: Privacy budget forwarded to ``add_laplace_noise``.

    Returns:
        ``(encrypted, transformations)``: the transformed copy and a list of
        human-readable descriptions of the transformations that were applied.
    """
    encrypted = df.copy()
    transformations = []

    # (raw column, protected column, per-value transform, description)
    replacements = (
        ('SSN', 'SSN_Hashed', hash_value, "SSN → SHA-256 hash"),
        ('Name', 'Name_Pseudo', pseudonymize, "Name → Pseudonym"),
        ('DOB', 'Age_Range', generalize_dob, "DOB → Age range (k-anonymity)"),
    )
    for raw, protected, transform, description in replacements:
        if raw not in encrypted.columns:
            continue
        encrypted[protected] = encrypted[raw].apply(transform)
        encrypted = encrypted.drop(columns=raw)
        transformations.append(description)

    # (raw column, protected column, noise sensitivity, description)
    noisy_columns = (
        ('Income', 'Income_Noisy', 5000,
         f"Income → Laplace noise (ε={epsilon})"),
        ('Heart Rate', 'Heart_Rate_Noisy', 5,
         "Heart Rate → Laplace noise"),
    )
    for raw, protected, sensitivity, description in noisy_columns:
        if raw not in encrypted.columns:
            continue
        # Coerce first so a stray non-numeric cell becomes NaN (which the
        # noise helper passes through) instead of crashing float().
        numeric = pd.to_numeric(encrypted[raw], errors='coerce')
        encrypted[protected] = numeric.apply(
            add_laplace_noise, args=(epsilon, sensitivity)
        )
        # Drop the raw values for every noisy column. Previously only Income
        # was removed, so the exact heart rate stayed in the output.
        encrypted = encrypted.drop(columns=raw)
        transformations.append(description)

    return encrypted, transformations
|
|
|
|
def prepare_for_ml(df, target_col='Tumor Condition'):
    """Turn a dataframe into a numeric feature matrix and encoded labels.

    Identifier columns (raw or protected name, SSN, DOB / age range) are
    excluded from the features. Numeric columns have missing values replaced
    by 0; all other columns are label-encoded as strings with missing values
    mapped to ``'Unknown'``. The target is label-encoded the same way.

    Args:
        df: Input dataframe; it is not modified.
        target_col: Name of the column to predict.

    Returns:
        ``(X, y, error)``. On success ``X`` is a 2-D array, ``y`` a 1-D array
        of integer class codes and ``error`` is ``None``. On failure ``X`` and
        ``y`` are ``None`` and ``error`` is a message describing the problem.
    """
    if target_col not in df.columns:
        return None, None, f"Target column '{target_col}' not found"

    # Read the target before dropping empty columns: an all-missing target
    # would otherwise be removed and the lookup would raise KeyError.
    target = df[target_col]
    if target.isna().all():
        return None, None, f"Target column '{target_col}' has no values"

    id_cols = ['Name', 'SSN', 'DOB', 'Name_Pseudo', 'SSN_Hashed', 'Age_Range']
    features = df.drop(columns=[target_col]).dropna(axis=1, how='all')
    features = features.drop(
        columns=[c for c in id_cols if c in features.columns]
    ).copy()
    if features.shape[1] == 0:
        return None, None, "No usable feature columns found"

    for col in features.columns:
        column = features[col]
        if pd.api.types.is_numeric_dtype(column):
            features[col] = pd.to_numeric(column, errors='coerce').fillna(0)
        else:
            # Anything non-numeric (object, string, category, ...) is encoded
            # through its string form so the encoder sees one uniform type.
            as_text = column.astype(str).mask(column.isna(), 'Unknown')
            features[col] = LabelEncoder().fit_transform(as_text)

    # Cast to str as well: a numeric target with gaps would otherwise mix
    # numbers with the 'Unknown' placeholder, which LabelEncoder rejects.
    labels = target.astype(str).mask(target.isna(), 'Unknown')
    y_encoded = LabelEncoder().fit_transform(labels)

    return features.values, y_encoded, None
|
|
|
|
def _split_and_scale(X, y):
    """Split 80/20 with a fixed seed and standardize using training statistics."""
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    scaler = StandardScaler()
    return scaler.fit_transform(X_train), scaler.transform(X_test), y_train, y_test


def _fit_and_score(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` and return ``(accuracy, weighted F1)`` on the test split."""
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    accuracy = round(accuracy_score(y_test, predictions), 4)
    f1 = round(f1_score(y_test, predictions, average='weighted'), 4)
    return accuracy, f1


def _result_row(model_name, data_name, accuracy, f1, privacy_level):
    """Build one row of the comparison table."""
    return {
        'Model': model_name,
        'Data': data_name,
        'Accuracy': accuracy,
        'F1 Score': f1,
        'Privacy Level': privacy_level,
    }


def _error_row(model_name, exc):
    """Build a placeholder row for a model that failed to train or score."""
    return _result_row(model_name, 'Error', 0, 0, f'Error: {str(exc)[:50]}')


def run_ml_comparison(df_original, df_encrypted, epsilon):
    """Train the demo models and tabulate their test-set performance.

    Three models are evaluated, each on an 80/20 split with a fixed seed:

    1. Logistic regression on the original data.
    2. Differentially private logistic regression on the original data
       (only when diffprivlib could be imported).
    3. Random forest on the privacy-transformed data.

    A model that fails to train or score is reported as an error row rather
    than aborting the whole comparison.

    Args:
        df_original: Raw dataframe.
        df_encrypted: Privacy-transformed dataframe.
        epsilon: Privacy budget for the DP model.

    Returns:
        A ``pandas.DataFrame`` with one row per model, or a ``str`` error
        message when the data could not be prepared or split.
    """
    X_orig, y_orig, err = prepare_for_ml(df_original)
    if err:
        return f"Error with original data: {err}"

    X_enc, y_enc, err = prepare_for_ml(df_encrypted)
    if err:
        return f"Error with encrypted data: {err}"

    try:
        orig_split = _split_and_scale(X_orig, y_orig)
        enc_split = _split_and_scale(X_enc, y_enc)
    except ValueError as exc:
        # Raised by scikit-learn e.g. when there are too few rows to split.
        return f"Error splitting data: {exc}"

    results = []

    # Broad excepts below are deliberate: any failure of a single model is
    # surfaced in its table row so the other models are still reported.

    # 1. Non-private baseline on the original data.
    try:
        accuracy, f1 = _fit_and_score(
            LogisticRegression(max_iter=1000, random_state=42), *orig_split
        )
        results.append(_result_row(
            'Standard Logistic Regression', 'Original (No Privacy)',
            accuracy, f1, 'None ✗',
        ))
    except Exception as exc:
        results.append(_error_row('Standard Logistic Regression', exc))

    # 2. Differentially private training on the original data.
    if DP_AVAILABLE:
        try:
            # NOTE(review): data_norm is derived from the training data
            # itself. diffprivlib's docs say it should be chosen independently
            # of the data for the full DP guarantee - confirm whether a fixed
            # bound is preferable here.
            data_norm = np.linalg.norm(orig_split[0], axis=1).max()
            dp_model = DPLogisticRegression(
                epsilon=epsilon, data_norm=data_norm,
                max_iter=1000, random_state=42
            )
            accuracy, f1 = _fit_and_score(dp_model, *orig_split)
            results.append(_result_row(
                f'DP Logistic Regression (ε={epsilon})',
                'Original + DP Training',
                accuracy, f1, f'High ✓ (ε={epsilon})',
            ))
        except Exception as exc:
            results.append(_error_row('DP Logistic Regression', exc))

    # 3. Ordinary model trained on the privacy-transformed data.
    try:
        accuracy, f1 = _fit_and_score(
            RandomForestClassifier(n_estimators=50, max_depth=8, random_state=42),
            *enc_split
        )
        results.append(_result_row(
            'Random Forest', 'Encrypted Data',
            accuracy, f1, 'High ✓ (Data Encrypted)',
        ))
    except Exception as exc:
        results.append(_error_row('Random Forest', exc))

    return pd.DataFrame(results)
|
|
|
|
| |
|
|
def process_data(file, epsilon, show_sample):
    """Run the full demo pipeline for one uploaded CSV (Gradio callback).

    Args:
        file: The upload from the Gradio file component: either a filesystem
            path or an object exposing the path as ``.name``. ``None`` when
            nothing was uploaded.
        epsilon: Privacy budget chosen in the UI.
        show_sample: Whether to return the first rows of both dataframes.

    Returns:
        ``(transform_text, comparison_df, sample_orig, sample_enc)``. The
        first element is Markdown text (or an error message); the others are
        dataframes, or ``None`` when unavailable or not requested.
    """
    if file is None:
        return "Please upload a CSV file.", None, None, None

    # Accept both a plain path string and a file wrapper carrying ``.name``.
    csv_path = getattr(file, "name", file)
    try:
        df = pd.read_csv(csv_path)
    except Exception as e:
        return f"Error reading file: {e}", None, None, None

    df.columns = df.columns.str.strip()
    df = df.dropna(axis=1, how='all').drop_duplicates()
    if df.empty:
        return "The uploaded file contains no data rows.", None, None, None

    df_encrypted, transformations = encrypt_dataframe(df, epsilon)

    # Training problems are reported in the text output; the samples below
    # are still worth showing, so this is not an early return.
    try:
        comparison = run_ml_comparison(df, df_encrypted, epsilon)
    except Exception as exc:
        comparison = f"Model training failed: {exc}"

    lines = ["**Privacy Transformations Applied:**"]
    if transformations:
        lines.extend(f"• {t}" for t in transformations)
    else:
        lines.append("• None (no recognized sensitive columns found)")
    transform_text = "\n".join(lines)

    if isinstance(comparison, str):
        # run_ml_comparison signals failure with a message, which must not
        # be handed to the Dataframe output.
        transform_text += f"\n\n**Model comparison unavailable:** {comparison}"
        comparison = None

    sample_orig = df.head(5) if show_sample else None
    sample_enc = df_encrypted.head(5) if show_sample else None

    return transform_text, comparison, sample_orig, sample_enc
|
|
|
|
def create_demo():
    """Build the Gradio Blocks interface for the demo.

    The layout has an input column (file upload, epsilon slider, sample
    toggle, run button), followed by the transformation summary, the model
    comparison table, side-by-side data samples and a reference section.
    The run button is wired to ``process_data``.

    Returns:
        The assembled ``gr.Blocks`` app; the caller launches it.
    """
    with gr.Blocks(title="Privacy-Preserving ML Demo", theme=gr.themes.Soft()) as demo:

        gr.Markdown("""
        # 🔒 Privacy-Preserving Machine Learning Demo

        This demo shows how **differential privacy** and **data encryption** techniques
        can protect sensitive data while still allowing useful ML predictions.

        ## How it works:
        1. Upload your healthcare/financial CSV dataset
        2. Adjust the privacy budget (epsilon) - lower = more privacy, less accuracy
        3. See how different privacy techniques transform your data
        4. Compare model performance: original vs. encrypted data

        ---
        """)

        with gr.Row():
            with gr.Column(scale=1):
                file_input = gr.File(
                    label="📁 Upload CSV Dataset",
                    file_types=[".csv"]
                )

                epsilon_slider = gr.Slider(
                    minimum=0.1, maximum=10.0, value=1.0, step=0.1,
                    label="🔐 Privacy Budget (Epsilon)",
                    info="Lower = more privacy, less utility. Typical: 0.1-2.0"
                )

                show_sample = gr.Checkbox(
                    value=True,
                    label="Show data samples"
                )

                run_btn = gr.Button("🚀 Run Privacy Analysis", variant="primary")

        with gr.Row():
            transform_output = gr.Markdown(label="Transformations Applied")

        gr.Markdown("## 📊 Model Performance Comparison")
        comparison_output = gr.Dataframe(label="Results")

        with gr.Row():
            with gr.Column():
                gr.Markdown("### Original Data (Sample)")
                orig_sample = gr.Dataframe(label="First 5 rows")
            with gr.Column():
                gr.Markdown("### Encrypted Data (Sample)")
                enc_sample = gr.Dataframe(label="First 5 rows - PII Protected")

        # The "Applied To" column mirrors what encrypt_dataframe and
        # run_ml_comparison actually do to each field.
        gr.Markdown("""
        ---
        ## 📚 Privacy Techniques Used

        | Technique | What it Does | Applied To |
        |-----------|--------------|------------|
        | **SHA-256 Hashing** | One-way irreversible hash | SSN |
        | **Pseudonymization** | Replace with fake IDs | Names |
        | **K-Anonymity** | Generalize to ranges | DOB |
        | **Laplace Noise** | Add random noise | Income, Heart Rate |
        | **Differential Privacy** | Mathematical privacy guarantee | ML training |

        **Privacy Budget (ε):** Controls the trade-off between privacy and utility.
        - ε = 0.1: Very high privacy, significant accuracy loss
        - ε = 1.0: Good balance (recommended)
        - ε = 10.0: Low privacy, minimal accuracy loss
        """)

        # Output order must match the tuple returned by process_data.
        run_btn.click(
            fn=process_data,
            inputs=[file_input, epsilon_slider, show_sample],
            outputs=[transform_output, comparison_output, orig_sample, enc_sample]
        )

    return demo
|
|
|
|
| |
if __name__ == "__main__":
    # Script entry point: assemble the interface, then start the Gradio server.
    demo = create_demo()
    demo.launch()
|
|