""" Privacy-Preserving ML Demo - Hugging Face Spaces ================================================ Interactive demo showing how privacy techniques affect ML model performance. Upload your data or use the sample dataset to see encryption + DP in action. """ import gradio as gr import pandas as pd import numpy as np import hashlib from datetime import datetime import io # ML imports from sklearn.model_selection import train_test_split from sklearn.preprocessing import LabelEncoder, StandardScaler from sklearn.ensemble import RandomForestClassifier from sklearn.linear_model import LogisticRegression from sklearn.metrics import accuracy_score, f1_score # Differential Privacy (lightweight, CPU-friendly) try: from diffprivlib.models import LogisticRegression as DPLogisticRegression DP_AVAILABLE = True except ImportError: DP_AVAILABLE = False # ========== PRIVACY FUNCTIONS ========== def hash_value(val, salt="privacy2024"): """SHA-256 hash for identifiers.""" if pd.isna(val): return "NULL" return hashlib.sha256(f"{salt}{val}".encode()).hexdigest()[:12] def pseudonymize(name, salt="privacy2024"): """Create deterministic pseudonym.""" if pd.isna(name): return "P_NULL" h = hashlib.md5(f"{salt}{name}".encode()).hexdigest()[:6] return f"PERSON_{h.upper()}" def generalize_dob(dob_str): """Convert DOB to age range.""" if pd.isna(dob_str): return "Unknown" try: for fmt in ['%m/%d/%Y', '%Y-%m-%d', '%d/%m/%Y']: try: dob = datetime.strptime(str(dob_str), fmt) break except: continue else: return "Unknown" age = (datetime.now() - dob).days // 365 if age < 30: return "Under 30" elif age < 45: return "30-44" elif age < 60: return "45-59" else: return "60+" except: return "Unknown" def add_laplace_noise(val, epsilon=1.0, sensitivity=1.0): """Add Laplace noise for differential privacy.""" if pd.isna(val): return val scale = sensitivity / epsilon return float(val) + np.random.laplace(0, scale) def encrypt_dataframe(df, epsilon=1.0): """Apply all privacy transformations to a dataframe.""" encrypted = df.copy() transformations = [] # Hash SSN if 'SSN' in encrypted.columns: encrypted['SSN_Hashed'] = encrypted['SSN'].apply(hash_value) encrypted = encrypted.drop('SSN', axis=1) transformations.append("SSN → SHA-256 hash") # Pseudonymize names if 'Name' in encrypted.columns: encrypted['Name_Pseudo'] = encrypted['Name'].apply(pseudonymize) encrypted = encrypted.drop('Name', axis=1) transformations.append("Name → Pseudonym") # Generalize DOB if 'DOB' in encrypted.columns: encrypted['Age_Range'] = encrypted['DOB'].apply(generalize_dob) encrypted = encrypted.drop('DOB', axis=1) transformations.append("DOB → Age range (k-anonymity)") # Add noise to income if 'Income' in encrypted.columns: encrypted['Income_Noisy'] = encrypted['Income'].apply( lambda x: add_laplace_noise(x, epsilon, 5000) ) encrypted = encrypted.drop('Income', axis=1) transformations.append(f"Income → Laplace noise (ε={epsilon})") # Add noise to heart rate if 'Heart Rate' in encrypted.columns: encrypted['Heart_Rate_Noisy'] = encrypted['Heart Rate'].apply( lambda x: add_laplace_noise(x, epsilon, 5) ) transformations.append("Heart Rate → Laplace noise") return encrypted, transformations def prepare_for_ml(df, target_col='Tumor Condition'): """Prepare dataframe for ML training.""" if target_col not in df.columns: return None, None, f"Target column '{target_col}' not found" # Copy and clean df_clean = df.dropna(axis=1, how='all').copy() # Separate target y = df_clean[target_col].copy() X = df_clean.drop(columns=[target_col]) # Remove identifier columns id_cols = ['Name', 'SSN', 'DOB', 'Name_Pseudo', 'SSN_Hashed', 'Age_Range'] X = X.drop(columns=[c for c in id_cols if c in X.columns], errors='ignore') # Encode for col in X.columns: if X[col].dtype == 'object': le = LabelEncoder() X[col] = le.fit_transform(X[col].fillna('Unknown').astype(str)) else: X[col] = pd.to_numeric(X[col], errors='coerce').fillna(0) le_y = LabelEncoder() y_encoded = le_y.fit_transform(y.fillna('Unknown')) return X.values, y_encoded, None def run_ml_comparison(df_original, df_encrypted, epsilon): """Train models and compare performance.""" results = [] # Prepare original data X_orig, y_orig, err = prepare_for_ml(df_original) if err: return f"Error with original data: {err}" # Prepare encrypted data X_enc, y_enc, err = prepare_for_ml(df_encrypted) if err: return f"Error with encrypted data: {err}" # Split data X_tr_o, X_te_o, y_tr_o, y_te_o = train_test_split( X_orig, y_orig, test_size=0.2, random_state=42 ) X_tr_e, X_te_e, y_tr_e, y_te_e = train_test_split( X_enc, y_enc, test_size=0.2, random_state=42 ) # Scale scaler = StandardScaler() X_tr_o = scaler.fit_transform(X_tr_o) X_te_o = scaler.transform(X_te_o) scaler2 = StandardScaler() X_tr_e = scaler2.fit_transform(X_tr_e) X_te_e = scaler2.transform(X_te_e) # Model 1: Standard LR on original data lr = LogisticRegression(max_iter=1000, random_state=42) lr.fit(X_tr_o, y_tr_o) pred = lr.predict(X_te_o) results.append({ 'Model': 'Standard Logistic Regression', 'Data': 'Original (No Privacy)', 'Accuracy': round(accuracy_score(y_te_o, pred), 4), 'F1 Score': round(f1_score(y_te_o, pred, average='weighted'), 4), 'Privacy Level': 'None ❌' }) # Model 2: DP Logistic Regression if DP_AVAILABLE: try: data_norm = np.linalg.norm(X_tr_o, axis=1).max() dp_lr = DPLogisticRegression( epsilon=epsilon, data_norm=data_norm, max_iter=1000, random_state=42 ) dp_lr.fit(X_tr_o, y_tr_o) pred = dp_lr.predict(X_te_o) results.append({ 'Model': f'DP Logistic Regression (ε={epsilon})', 'Data': 'Original + DP Training', 'Accuracy': round(accuracy_score(y_te_o, pred), 4), 'F1 Score': round(f1_score(y_te_o, pred, average='weighted'), 4), 'Privacy Level': f'High ✓ (ε={epsilon})' }) except Exception as e: results.append({ 'Model': 'DP Logistic Regression', 'Data': 'Error', 'Accuracy': 0, 'F1 Score': 0, 'Privacy Level': f'Error: {str(e)[:50]}' }) # Model 3: RF on encrypted data rf = RandomForestClassifier(n_estimators=50, max_depth=8, random_state=42) rf.fit(X_tr_e, y_tr_e) pred = rf.predict(X_te_e) results.append({ 'Model': 'Random Forest', 'Data': 'Encrypted Data', 'Accuracy': round(accuracy_score(y_te_e, pred), 4), 'F1 Score': round(f1_score(y_te_e, pred, average='weighted'), 4), 'Privacy Level': 'High ✓ (Data Encrypted)' }) return pd.DataFrame(results) # ========== GRADIO INTERFACE ========== def process_data(file, epsilon, show_sample): """Main processing function for Gradio.""" # Load data if file is None: return "Please upload a CSV file.", None, None, None try: df = pd.read_csv(file.name) except Exception as e: return f"Error reading file: {e}", None, None, None # Clean df = df.dropna(axis=1, how='all').drop_duplicates() df.columns = df.columns.str.strip() # Encrypt df_encrypted, transformations = encrypt_dataframe(df, epsilon) # Run ML comparison comparison_df = run_ml_comparison(df, df_encrypted, epsilon) # Prepare outputs transform_text = "**Privacy Transformations Applied:**\n" + "\n".join( [f"• {t}" for t in transformations] ) # Sample data (first 5 rows) sample_orig = df.head(5) if show_sample else None sample_enc = df_encrypted.head(5) if show_sample else None # Create downloadable encrypted CSV csv_buffer = io.StringIO() df_encrypted.to_csv(csv_buffer, index=False) csv_content = csv_buffer.getvalue() return transform_text, comparison_df, sample_orig, sample_enc def create_demo(): """Build the Gradio interface.""" with gr.Blocks(title="Privacy-Preserving ML Demo", theme=gr.themes.Soft()) as demo: gr.Markdown(""" # 🔒 Privacy-Preserving Machine Learning Demo This demo shows how **differential privacy** and **data encryption** techniques can protect sensitive data while still allowing useful ML predictions. ## How it works: 1. Upload your healthcare/financial CSV dataset 2. Adjust the privacy budget (epsilon) - lower = more privacy, less accuracy 3. See how different privacy techniques transform your data 4. Compare model performance: original vs. encrypted data --- """) with gr.Row(): with gr.Column(scale=1): file_input = gr.File( label="📁 Upload CSV Dataset", file_types=[".csv"] ) epsilon_slider = gr.Slider( minimum=0.1, maximum=10.0, value=1.0, step=0.1, label="🔐 Privacy Budget (Epsilon)", info="Lower = more privacy, less utility. Typical: 0.1-2.0" ) show_sample = gr.Checkbox( value=True, label="Show data samples" ) run_btn = gr.Button("🚀 Run Privacy Analysis", variant="primary") with gr.Row(): transform_output = gr.Markdown(label="Transformations Applied") gr.Markdown("## 📊 Model Performance Comparison") comparison_output = gr.Dataframe(label="Results") with gr.Row(): with gr.Column(): gr.Markdown("### Original Data (Sample)") orig_sample = gr.Dataframe(label="First 5 rows") with gr.Column(): gr.Markdown("### Encrypted Data (Sample)") enc_sample = gr.Dataframe(label="First 5 rows - PII Protected") gr.Markdown(""" --- ## 📚 Privacy Techniques Used | Technique | What it Does | Applied To | |-----------|--------------|------------| | **SHA-256 Hashing** | One-way irreversible hash | SSN | | **Pseudonymization** | Replace with fake IDs | Names | | **K-Anonymity** | Generalize to ranges | DOB, Income | | **Laplace Noise** | Add random noise | Numeric values | | **Differential Privacy** | Mathematical privacy guarantee | ML training | **Privacy Budget (ε):** Controls the trade-off between privacy and utility. - ε = 0.1: Very high privacy, significant accuracy loss - ε = 1.0: Good balance (recommended) - ε = 10.0: Low privacy, minimal accuracy loss """) # Connect button to function run_btn.click( fn=process_data, inputs=[file_input, epsilon_slider, show_sample], outputs=[transform_output, comparison_output, orig_sample, enc_sample] ) return demo # Launch if __name__ == "__main__": demo = create_demo() demo.launch()