# Source: Hugging Face Space by YashChowdhary ("Upload 6 files", commit 5eb498a, verified)
"""
Privacy-Preserving ML Demo - Hugging Face Spaces
================================================
Interactive demo showing how privacy techniques affect ML model performance.
Upload your data or use the sample dataset to see encryption + DP in action.
"""
import gradio as gr
import pandas as pd
import numpy as np
import hashlib
from datetime import datetime
import io
# ML imports
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
# Differential Privacy (lightweight, CPU-friendly)
# diffprivlib is optional: if it is not installed the demo still runs and
# the DP model row is simply skipped (see run_ml_comparison).
try:
    from diffprivlib.models import LogisticRegression as DPLogisticRegression
    DP_AVAILABLE = True
except ImportError:
    DP_AVAILABLE = False
# ========== PRIVACY FUNCTIONS ==========
def hash_value(val, salt="privacy2024"):
    """Return a salted, truncated SHA-256 digest of an identifier.

    Missing values (None/NaN) map to the literal string "NULL" so the
    hashed column stays fully populated.
    """
    if pd.isna(val):
        return "NULL"
    salted = f"{salt}{val}".encode()
    digest = hashlib.sha256(salted).hexdigest()
    # Keep only the first 12 hex chars: compact for display while still
    # unlikely to collide in small demo datasets.
    return digest[:12]
def pseudonymize(name, salt="privacy2024"):
    """Map a real name to a deterministic pseudonym like ``PERSON_A1B2C3``.

    NOTE(review): MD5 is used here only as a short non-cryptographic
    fingerprint; linkability protection comes from the salt, not the hash.
    """
    if pd.isna(name):
        return "P_NULL"
    fingerprint = hashlib.md5(f"{salt}{name}".encode()).hexdigest()
    return "PERSON_" + fingerprint[:6].upper()
def generalize_dob(dob_str):
    """Generalize a date of birth into a coarse age range (k-anonymity).

    Tries common US/ISO formats; missing or unparseable values become
    "Unknown" instead of raising.

    FIX: replaced the bare ``except:`` clauses (which swallowed every
    error, including KeyboardInterrupt) with the specific ``ValueError``
    that ``strptime`` raises on a format mismatch.
    """
    if pd.isna(dob_str):
        return "Unknown"
    dob = None
    for fmt in ('%m/%d/%Y', '%Y-%m-%d', '%d/%m/%Y'):
        try:
            dob = datetime.strptime(str(dob_str), fmt)
            break
        except ValueError:  # this format didn't match; try the next one
            continue
    if dob is None:
        return "Unknown"
    # Approximate age in whole years (365-day years; leap days ignored).
    age = (datetime.now() - dob).days // 365
    if age < 30:
        return "Under 30"
    if age < 45:
        return "30-44"
    if age < 60:
        return "45-59"
    return "60+"
def add_laplace_noise(val, epsilon=1.0, sensitivity=1.0):
    """Perturb *val* with Laplace(0, sensitivity/epsilon) noise.

    This is the classic Laplace mechanism used in differential privacy;
    a smaller epsilon means a larger noise scale. Missing values are
    passed through untouched.
    """
    if pd.isna(val):
        return val
    noise = np.random.laplace(0, sensitivity / epsilon)
    return float(val) + noise
def encrypt_dataframe(df, epsilon=1.0):
    """Apply all privacy transformations to a dataframe.

    Each step only runs when its source column exists. Returns a tuple
    ``(encrypted_df, transformations)`` where *transformations* is a
    human-readable list of the steps actually applied.
    """
    encrypted = df.copy()
    transformations = []
    # Hash SSN: one-way SHA-256 digest replaces the raw value.
    if 'SSN' in encrypted.columns:
        encrypted['SSN_Hashed'] = encrypted['SSN'].apply(hash_value)
        encrypted = encrypted.drop('SSN', axis=1)
        transformations.append("SSN β†’ SHA-256 hash")
    # Pseudonymize names with deterministic fake IDs.
    if 'Name' in encrypted.columns:
        encrypted['Name_Pseudo'] = encrypted['Name'].apply(pseudonymize)
        encrypted = encrypted.drop('Name', axis=1)
        transformations.append("Name β†’ Pseudonym")
    # Generalize DOB to an age bucket (k-anonymity).
    if 'DOB' in encrypted.columns:
        encrypted['Age_Range'] = encrypted['DOB'].apply(generalize_dob)
        encrypted = encrypted.drop('DOB', axis=1)
        transformations.append("DOB β†’ Age range (k-anonymity)")
    # Add Laplace noise to income (sensitivity 5000).
    if 'Income' in encrypted.columns:
        encrypted['Income_Noisy'] = encrypted['Income'].apply(
            lambda x: add_laplace_noise(x, epsilon, 5000)
        )
        encrypted = encrypted.drop('Income', axis=1)
        transformations.append(f"Income β†’ Laplace noise (Ξ΅={epsilon})")
    # Add Laplace noise to heart rate (sensitivity 5).
    if 'Heart Rate' in encrypted.columns:
        encrypted['Heart_Rate_Noisy'] = encrypted['Heart Rate'].apply(
            lambda x: add_laplace_noise(x, epsilon, 5)
        )
        # BUG FIX: the raw 'Heart Rate' column was previously retained
        # alongside the noisy copy, leaking the exact values. Drop it,
        # consistent with every other transformed column.
        encrypted = encrypted.drop('Heart Rate', axis=1)
        transformations.append("Heart Rate β†’ Laplace noise")
    return encrypted, transformations
def prepare_for_ml(df, target_col='Tumor Condition'):
    """Prepare a dataframe for ML training.

    Drops identifier columns, label-encodes categoricals, coerces the
    rest to numeric, and encodes the target. Returns
    ``(X_values, y_encoded, error)`` where *error* is None on success.
    """
    if target_col not in df.columns:
        return None, None, f"Target column '{target_col}' not found"
    # Copy and clean: drop columns that are entirely NaN.
    df_clean = df.dropna(axis=1, how='all').copy()
    # BUG FIX: an all-NaN target column was dropped by the line above and
    # the lookup below then raised KeyError; return an error instead.
    if target_col not in df_clean.columns:
        return None, None, f"Target column '{target_col}' is empty"
    # Separate target from features.
    y = df_clean[target_col].copy()
    X = df_clean.drop(columns=[target_col])
    # Remove identifier / quasi-identifier columns so they never leak
    # into the feature matrix (covers both raw and transformed names).
    id_cols = ['Name', 'SSN', 'DOB', 'Name_Pseudo', 'SSN_Hashed', 'Age_Range']
    X = X.drop(columns=[c for c in id_cols if c in X.columns], errors='ignore')
    # Encode: label-encode object columns, coerce everything else.
    for col in X.columns:
        if X[col].dtype == 'object':
            le = LabelEncoder()
            X[col] = le.fit_transform(X[col].fillna('Unknown').astype(str))
        else:
            X[col] = pd.to_numeric(X[col], errors='coerce').fillna(0)
    le_y = LabelEncoder()
    # astype(str) keeps LabelEncoder from failing when a numeric target
    # is mixed with the string fill value 'Unknown'.
    y_encoded = le_y.fit_transform(y.fillna('Unknown').astype(str))
    return X.values, y_encoded, None
def run_ml_comparison(df_original, df_encrypted, epsilon):
    """Train models and compare performance.

    Trains three configurations — a no-privacy baseline, a DP-trained
    model (when diffprivlib is available), and a model on the encrypted
    data — and returns a DataFrame of accuracy/F1 rows. On a data
    preparation failure, returns an error string instead.
    """
    results = []
    # Prepare original data
    X_orig, y_orig, err = prepare_for_ml(df_original)
    if err:
        return f"Error with original data: {err}"
    # Prepare encrypted data
    X_enc, y_enc, err = prepare_for_ml(df_encrypted)
    if err:
        return f"Error with encrypted data: {err}"
    # Split data (same random_state so the two splits are comparable)
    X_tr_o, X_te_o, y_tr_o, y_te_o = train_test_split(
        X_orig, y_orig, test_size=0.2, random_state=42
    )
    X_tr_e, X_te_e, y_tr_e, y_te_e = train_test_split(
        X_enc, y_enc, test_size=0.2, random_state=42
    )
    # Scale each dataset with its own scaler, fit on train only to
    # avoid leaking test statistics into training.
    scaler = StandardScaler()
    X_tr_o = scaler.fit_transform(X_tr_o)
    X_te_o = scaler.transform(X_te_o)
    scaler2 = StandardScaler()
    X_tr_e = scaler2.fit_transform(X_tr_e)
    X_te_e = scaler2.transform(X_te_e)
    # Model 1: Standard LR on original data — the no-privacy baseline.
    lr = LogisticRegression(max_iter=1000, random_state=42)
    lr.fit(X_tr_o, y_tr_o)
    pred = lr.predict(X_te_o)
    results.append({
        'Model': 'Standard Logistic Regression',
        'Data': 'Original (No Privacy)',
        'Accuracy': round(accuracy_score(y_te_o, pred), 4),
        'F1 Score': round(f1_score(y_te_o, pred, average='weighted'), 4),
        'Privacy Level': 'None ❌'
    })
    # Model 2: DP Logistic Regression — privacy enforced during training.
    if DP_AVAILABLE:
        try:
            # data_norm bounds the per-sample L2 norm, required by
            # diffprivlib's privacy accounting.
            data_norm = np.linalg.norm(X_tr_o, axis=1).max()
            dp_lr = DPLogisticRegression(
                epsilon=epsilon, data_norm=data_norm,
                max_iter=1000, random_state=42
            )
            dp_lr.fit(X_tr_o, y_tr_o)
            pred = dp_lr.predict(X_te_o)
            results.append({
                'Model': f'DP Logistic Regression (Ξ΅={epsilon})',
                'Data': 'Original + DP Training',
                'Accuracy': round(accuracy_score(y_te_o, pred), 4),
                'F1 Score': round(f1_score(y_te_o, pred, average='weighted'), 4),
                'Privacy Level': f'High βœ“ (Ξ΅={epsilon})'
            })
        except Exception as e:
            # Surface the failure as a result row rather than crashing
            # the whole comparison.
            results.append({
                'Model': 'DP Logistic Regression',
                'Data': 'Error',
                'Accuracy': 0,
                'F1 Score': 0,
                'Privacy Level': f'Error: {str(e)[:50]}'
            })
    # Model 3: RF on encrypted data — privacy enforced on the data itself.
    rf = RandomForestClassifier(n_estimators=50, max_depth=8, random_state=42)
    rf.fit(X_tr_e, y_tr_e)
    pred = rf.predict(X_te_e)
    results.append({
        'Model': 'Random Forest',
        'Data': 'Encrypted Data',
        'Accuracy': round(accuracy_score(y_te_e, pred), 4),
        'F1 Score': round(f1_score(y_te_e, pred, average='weighted'), 4),
        'Privacy Level': 'High βœ“ (Data Encrypted)'
    })
    return pd.DataFrame(results)
# ========== GRADIO INTERFACE ==========
def process_data(file, epsilon, show_sample):
    """Main processing function for Gradio.

    Returns a 4-tuple ``(transform_markdown, comparison_df, sample_orig,
    sample_enc)``; on failure the first element carries the error message
    and the rest are None.
    """
    # Load data
    if file is None:
        return "Please upload a CSV file.", None, None, None
    try:
        df = pd.read_csv(file.name)
    except Exception as e:
        # Surface the parse error in the UI instead of crashing the app.
        return f"Error reading file: {e}", None, None, None
    # Clean: drop all-empty columns, duplicate rows, stray header spaces.
    df = df.dropna(axis=1, how='all').drop_duplicates()
    df.columns = df.columns.str.strip()
    # Apply the privacy transformations.
    df_encrypted, transformations = encrypt_dataframe(df, epsilon)
    # Train and compare models on original vs. encrypted data.
    comparison_df = run_ml_comparison(df, df_encrypted, epsilon)
    # Bullet list of applied transformations for the markdown panel.
    transform_text = "**Privacy Transformations Applied:**\n" + "\n".join(
        f"β€’ {t}" for t in transformations
    )
    # Optional 5-row previews of both dataframes.
    sample_orig = df.head(5) if show_sample else None
    sample_enc = df_encrypted.head(5) if show_sample else None
    # FIX: removed dead code that serialized df_encrypted into a StringIO
    # buffer — the CSV text was never returned or wired to any component.
    return transform_text, comparison_df, sample_orig, sample_enc
def create_demo():
    """Build the Gradio interface.

    Lays out the intro banner, the input controls (file upload, epsilon
    slider, sample checkbox), the output panels, and a static reference
    table, then wires the run button to process_data(). Returns the
    gr.Blocks app; nothing runs until the button is clicked.
    """
    with gr.Blocks(title="Privacy-Preserving ML Demo", theme=gr.themes.Soft()) as demo:
        # Intro / instructions banner.
        gr.Markdown("""
# πŸ”’ Privacy-Preserving Machine Learning Demo
This demo shows how **differential privacy** and **data encryption** techniques
can protect sensitive data while still allowing useful ML predictions.
## How it works:
1. Upload your healthcare/financial CSV dataset
2. Adjust the privacy budget (epsilon) - lower = more privacy, less accuracy
3. See how different privacy techniques transform your data
4. Compare model performance: original vs. encrypted data
---
""")
        # Input controls.
        with gr.Row():
            with gr.Column(scale=1):
                file_input = gr.File(
                    label="πŸ“ Upload CSV Dataset",
                    file_types=[".csv"]
                )
                epsilon_slider = gr.Slider(
                    minimum=0.1, maximum=10.0, value=1.0, step=0.1,
                    label="πŸ” Privacy Budget (Epsilon)",
                    info="Lower = more privacy, less utility. Typical: 0.1-2.0"
                )
                show_sample = gr.Checkbox(
                    value=True,
                    label="Show data samples"
                )
                run_btn = gr.Button("πŸš€ Run Privacy Analysis", variant="primary")
        # Output panels: transformations summary and model comparison.
        with gr.Row():
            transform_output = gr.Markdown(label="Transformations Applied")
        gr.Markdown("## πŸ“Š Model Performance Comparison")
        comparison_output = gr.Dataframe(label="Results")
        # Side-by-side preview of raw vs. privacy-transformed rows.
        with gr.Row():
            with gr.Column():
                gr.Markdown("### Original Data (Sample)")
                orig_sample = gr.Dataframe(label="First 5 rows")
            with gr.Column():
                gr.Markdown("### Encrypted Data (Sample)")
                enc_sample = gr.Dataframe(label="First 5 rows - PII Protected")
        # Static reference table explaining each privacy technique.
        gr.Markdown("""
---
## πŸ“š Privacy Techniques Used
| Technique | What it Does | Applied To |
|-----------|--------------|------------|
| **SHA-256 Hashing** | One-way irreversible hash | SSN |
| **Pseudonymization** | Replace with fake IDs | Names |
| **K-Anonymity** | Generalize to ranges | DOB, Income |
| **Laplace Noise** | Add random noise | Numeric values |
| **Differential Privacy** | Mathematical privacy guarantee | ML training |
**Privacy Budget (Ξ΅):** Controls the trade-off between privacy and utility.
- Ξ΅ = 0.1: Very high privacy, significant accuracy loss
- Ξ΅ = 1.0: Good balance (recommended)
- Ξ΅ = 10.0: Low privacy, minimal accuracy loss
""")
        # Connect button to function
        run_btn.click(
            fn=process_data,
            inputs=[file_input, epsilon_slider, show_sample],
            outputs=[transform_output, comparison_output, orig_sample, enc_sample]
        )
    return demo
# Launch: build the UI and serve it when this file is run as a script.
if __name__ == "__main__":
    create_demo().launch()