YashChowdhary committed on
Commit
5eb498a
·
verified ·
1 Parent(s): 0551c02

Upload 6 files

Browse files
Assignment2Dataset-1_encrypted.csv ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Country,Sex,Marital Status,Education,Loan,House Status,Blood Type,Blood Pressure,Heart Rate,Oxygen Level,Medical Procedure,Smoking,Alcohol Consumption,Allergies,Vaccinations,Tumor Condition,SSN_Hash,Name_Pseudo,Age_Range,Income_Noisy,Income_Range,Heart_Rate_Noisy
2
+ USA,Male,Married,Bachelor's Degree,Yes,Own,O+,120/80,72.0,98%,Appendectomy,No,No,Pollen,Yes,Normal,69891950fb458416,P_75B5BC,35-44,51155.5844842771,Medium (50-75K),81.57509386656757
3
+ Canada,Female,Single,Master's Degree,No,Rent,A-,110/70,68.0,96%,Laser Eye Surgery,No,Yes,Shellfish,No,Normal,e573f4894e4fcb00,P_088715,35-44,59640.73215253263,Medium (50-75K),62.59225523500463
4
+ UK,Male,Divorced,High School Diploma,Yes,Own,B+,130/85,75.0,97%,Colonoscopy,No,Yes,Cats,Yes,Abnormal,4adda9543e0f08c4,P_04F2A9,45-54,69253.27571360671,Medium-High (75-100K),82.27721675527945
5
+ Australia,Female,Married,Associate's Degree,No,Own,AB-,115/75,70.0,99%,Mammogram,No,No,Dust,No,Normal,bc4c4550714f5339,P_6F9923,35-44,48733.380890115775,Medium-Low (30-50K),69.77898304800297
6
+ USA,Male,Single,Some College,No,Rent,O-,125/80,68.0,97%,Dental Cleaning,Yes,Yes,Peanuts,Yes,Normal,dd3b91b2d74edc57,P_B6108A,25-34,31565.34274597414,Medium-Low (30-50K),59.67859243578279
7
+ USA,Female,Widowed,Doctorate Degree,Yes,Own,A+,120/80,80.0,95%,MRI Scan,No,No,Latex,No,Normal,44be5370eb60f1e4,P_94771B,35-44,50163.70931808812,Medium (50-75K),78.65840113607736
8
+ Canada,Male,Married,Master's Degree,Yes,Own,B-,130/85,75.0,98%,Knee Surgery,Yes,No,Pollen,Yes,Abnormal,505cce52b3edc8cc,P_F0560C,45-54,84259.24677047139,Medium-High (75-100K),75.2419591619475
9
+ UK,Female,Single,Bachelor's Degree,No,Rent,AB+,110/70,70.0,99%,Physical Therapy,No,Yes,Shellfish,No,Normal,913377a1e870f8aa,P_03BFB0,25-34,45801.41294513538,Medium-Low (30-50K),69.18261975188135
10
+ Australia,Male,Married,Bachelor's Degree,Yes,Rent,A-,120/80,72.0,98%,Cataract Surgery,No,Yes,Dust,Yes,Normal,a830c60ec19369b1,P_676D45,35-44,59721.88201617836,Medium (50-75K),72.23989641611111
11
+ USA,Female,Married,High School Diploma,Yes,Own,O+,115/75,68.0,97%,Cholecystectomy,Yes,Yes,Cats,Yes,Normal,9699bfb12929a6c5,P_A30D29,25-34,57232.92907292804,Medium (50-75K),77.64247698528224
12
+ USA,Female,Single,Some College,No,Rent,O+,120/80,70.0,96%,Dental Filling,No,No,Pollen,No,Normal,e82a2e8a465ba7f9,P_CB6D4C,35-44,38532.14871211542,Medium-Low (30-50K),69.77146719536536
13
+ Canada,Male,Married,Master's Degree,Yes,Own,B+,130/85,78.0,98%,Hip Replacement,Yes,Yes,Peanuts,No,Normal,710f1475be4d054a,P_B4F2FE,45-54,72904.38784704273,Medium (50-75K),80.96190601495982
14
+ UK,Female,Single,Bachelor's Degree,No,Rent,A-,110/70,65.0,99%,Colon,,,,,,1ede2e6e64afdd33,P_31442C,25-34,49233.46394962361,Medium (50-75K),52.52619618830219
15
+ Canada,Male,Married,Master's Degree,Yes,Own,B+,130/85,78.0,98%,Hip Replacement,Yes,Yes,Peanuts,No,Normal,710f1475be4d054a,P_B4F2FE,45-54,70803.9157620703,Medium (50-75K),80.52811139533243
16
+ USA,Male,Married,Bachelor's Degree,Yes,Own,O+,120/80,72.0,98%,Appendectomy,No,No,Pollen,Yes,Normal,b287d3ae6c91b428,P_F5B786,35-44,61345.57997661682,Medium (50-75K),66.21390327777792
17
+ Canada,Female,Single,Associate's Degree,No,Rent,B-,110/70,68.0,96%,Laser Eye Surgery,No,Yes,Shellfish,No,Normal,5699c90cc1364467,P_6093E9,25-34,51463.02208034713,Medium (50-75K),77.41488800572482
18
+ USA,Male,Married,Master's Degree,Yes,Own,AB+,130/85,75.0,97%,Colonoscopy,Yes,Yes,Cats,Yes,Abnormal,0cf19876616eada6,P_EF0ACF,45-54,76622.73586878179,Medium-High (75-100K),64.28324654278879
19
+ Australia,Female,Married,High School Diploma,Yes,Own,A-,115/75,70.0,99%,Mammogram,No,No,Dust,Yes,Normal,087631cf05b79e32,P_F29CC2,35-44,41259.68599562599,Medium-Low (30-50K),69.66222411037084
20
+ USA,Male,Single,Some College,No,Rent,A+,120/80,68.0,97%,Dental Cleaning,No,Yes,Peanuts,No,Normal,57ddb7a16d4354e3,P_8F4E61,25-34,34701.94661720233,Medium-Low (30-50K),66.40198214113455
21
+ Canada,Female,Divorced,Doctorate Degree,Yes,Rent,O-,125/80,80.0,95%,MRI Scan,Yes,Yes,Latex,Yes,Normal,f515750bc201d1d0,P_0128F3,35-44,60548.79819565589,Medium (50-75K),85.6713329152437
22
+ USA,Male,Married,Bachelor's Degree,Yes,Own,B+,130/85,75.0,98%,Knee Surgery,Yes,Yes,Pollen,No,Abnormal,9b0b122756eb12a5,P_52A9FD,35-44,84743.72157762632,Medium-High (75-100K),66.14148464953857
23
+ UK,Female,Single,Master's Degree,No,Rent,AB-,110/70,70.0,99%,Physical Therapy,No,Yes,Shellfish,No,Normal,31ac9adf7e207829,P_2275B2,25-34,55961.29781433357,Medium (50-75K),74.99332893386477
24
+ Australia,Male,Married,Bachelor's Degree,Yes,Own,A-,120/80,72.0,98%,Cataract Surgery,No,Yes,Dust,Yes,Normal,79c4c73d3f73ac4a,P_F1F296,35-44,103190.7265862128,Medium (50-75K),71.81775356510083
25
+ USA,Female,Married,High School Diploma,Yes,Own,O+,115/75,68.0,97%,Cholecystectomy,Yes,Yes,Cats,Yes,Normal,bab1b29286838d5d,P_29ABBA,25-34,55212.24984731953,Medium (50-75K),75.97531341973752
26
+ USA,Male,Single,Some College,No,Rent,O+,120/80,70.0,96%,Dental Filling,No,No,Pollen,No,Normal,9621c5c20b3eb8fc,P_CE87DA,35-44,62928.41706081162,Medium-Low (30-50K),64.43977881843813
27
+ Canada,Female,Married,Doctorate Degree,Yes,Own,B-,130/85,78.0,98%,Hip Replacement,Yes,No,Peanuts,Yes,Normal,4c07f0c3b1011bcb,P_91B4F0,45-54,67464.52628665997,Medium (50-75K),82.04101765429179
28
+ UK,Male,Single,Bachelor's Degree,No,Rent,A+,110/,,,,,,,,,4c2061381af63b5e,P_CA4564,25-34,52578.59807847751,Medium (50-75K),
Privacy_Preserving_ML_Report.docx ADDED
Binary file (15.8 kB). View file
 
app.py ADDED
@@ -0,0 +1,356 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Privacy-Preserving ML Demo - Hugging Face Spaces
3
+ ================================================
4
+ Interactive demo showing how privacy techniques affect ML model performance.
5
+ Upload your data or use the sample dataset to see encryption + DP in action.
6
+ """
7
+
8
+ import gradio as gr
9
+ import pandas as pd
10
+ import numpy as np
11
+ import hashlib
12
+ from datetime import datetime
13
+ import io
14
+
15
+ # ML imports
16
+ from sklearn.model_selection import train_test_split
17
+ from sklearn.preprocessing import LabelEncoder, StandardScaler
18
+ from sklearn.ensemble import RandomForestClassifier
19
+ from sklearn.linear_model import LogisticRegression
20
+ from sklearn.metrics import accuracy_score, f1_score
21
+
22
+ # Differential Privacy (lightweight, CPU-friendly)
23
+ try:
24
+ from diffprivlib.models import LogisticRegression as DPLogisticRegression
25
+ DP_AVAILABLE = True
26
+ except ImportError:
27
+ DP_AVAILABLE = False
28
+
29
+
30
+ # ========== PRIVACY FUNCTIONS ==========
31
+
32
def hash_value(val, salt="privacy2024"):
    """Return a short salted SHA-256 digest of an identifier.

    Null values (anything ``pd.isna`` reports as missing) map to the literal
    string ``"NULL"``. Otherwise the salt is prepended to the value's string
    form and the first 12 hex characters of the digest are returned.
    """
    if not pd.isna(val):
        salted_bytes = f"{salt}{val}".encode()
        full_digest = hashlib.sha256(salted_bytes).hexdigest()
        return full_digest[:12]
    return "NULL"
37
+
38
def pseudonymize(name, salt="privacy2024"):
    """Map a name to a deterministic pseudonym.

    Null names yield ``"P_NULL"``. Any other value yields ``"PERSON_"``
    followed by the first six hex digits (upper-cased) of the MD5 digest of
    the salt concatenated with the name, so equal names always receive the
    same pseudonym.
    """
    if pd.isna(name):
        return "P_NULL"
    digest = hashlib.md5(f"{salt}{name}".encode()).hexdigest()
    short_code = digest[:6].upper()
    return "PERSON_" + short_code
44
+
45
def generalize_dob(dob_str):
    """Convert a date of birth into a coarse age bucket.

    The value is parsed as ``MM/DD/YYYY``, then ``YYYY-MM-DD``, then
    ``DD/MM/YYYY``; the first format that parses wins, so an ambiguous string
    such as ``01/02/1990`` is read month-first.

    Args:
        dob_str: Date of birth; converted with ``str()`` before parsing.

    Returns:
        ``"Under 30"``, ``"30-44"``, ``"45-59"`` or ``"60+"``; ``"Unknown"``
        when the value is missing, cannot be parsed, or lies in the future.
    """
    if pd.isna(dob_str):
        return "Unknown"

    text = str(dob_str).strip()
    dob = None
    for fmt in ('%m/%d/%Y', '%Y-%m-%d', '%d/%m/%Y'):
        try:
            dob = datetime.strptime(text, fmt)
        except ValueError:
            # Only a format mismatch is expected here; try the next layout.
            continue
        break
    if dob is None:
        return "Unknown"

    today = datetime.now()
    # Completed calendar years. ``days // 365`` ignores leap days and would
    # move people into the next bucket up to about a week before their birthday.
    birthday_pending = (today.month, today.day) < (dob.month, dob.day)
    age = today.year - dob.year - int(birthday_pending)
    if age < 0:
        # A date of birth in the future is invalid input, not a young person.
        return "Unknown"

    if age < 30:
        return "Under 30"
    if age < 45:
        return "30-44"
    if age < 60:
        return "45-59"
    return "60+"
66
+
67
def add_laplace_noise(val, epsilon=1.0, sensitivity=1.0):
    """Perturb a numeric value with zero-centred Laplace noise.

    The noise scale is ``sensitivity / epsilon``. Null values are handed back
    unchanged; everything else is converted with ``float()`` before the noise
    is added.
    """
    if not pd.isna(val):
        noise_scale = sensitivity / epsilon
        base = float(val)
        return base + np.random.laplace(0, noise_scale)
    return val
73
+
74
+
75
def encrypt_dataframe(df, epsilon=1.0):
    """Return a privacy-transformed copy of ``df`` plus a log of what was done.

    Each step runs only if its source column is present:
    ``SSN`` is hashed, ``Name`` is pseudonymized, ``DOB`` becomes an age range
    and ``Income`` receives Laplace noise; in all four cases the source column
    is dropped. ``Heart Rate`` also receives Laplace noise, but the raw column
    is left in place next to ``Heart_Rate_Noisy``.

    Returns:
        ``(transformed_frame, descriptions)`` where ``descriptions`` is a list
        of human-readable strings, one per applied transformation.
    """
    out = df.copy()
    applied = []

    def _noise_with(sensitivity):
        # Bind epsilon and the per-column sensitivity for Series.apply.
        return lambda x: add_laplace_noise(x, epsilon, sensitivity)

    # Direct identifier: SSN
    if 'SSN' in out.columns:
        out['SSN_Hashed'] = out['SSN'].apply(hash_value)
        out = out.drop(columns=['SSN'])
        applied.append("SSN → SHA-256 hash")

    # Direct identifier: name
    if 'Name' in out.columns:
        out['Name_Pseudo'] = out['Name'].apply(pseudonymize)
        out = out.drop(columns=['Name'])
        applied.append("Name → Pseudonym")

    # Quasi-identifier: date of birth
    if 'DOB' in out.columns:
        out['Age_Range'] = out['DOB'].apply(generalize_dob)
        out = out.drop(columns=['DOB'])
        applied.append("DOB → Age range (k-anonymity)")

    # Sensitive numeric: income
    if 'Income' in out.columns:
        out['Income_Noisy'] = out['Income'].apply(_noise_with(5000))
        out = out.drop(columns=['Income'])
        applied.append(f"Income → Laplace noise (ε={epsilon})")

    # Sensitive numeric: heart rate (original column is intentionally not dropped here)
    if 'Heart Rate' in out.columns:
        out['Heart_Rate_Noisy'] = out['Heart Rate'].apply(_noise_with(5))
        applied.append("Heart Rate → Laplace noise")

    return out, applied
114
+
115
+
116
def prepare_for_ml(df, target_col='Tumor Condition'):
    """Turn a dataframe into a numeric feature matrix and encoded target.

    Columns that are entirely empty are discarded, known identifier columns
    are removed, non-numeric features are label-encoded (missing values become
    the category ``'Unknown'``) and numeric features have missing values
    replaced by 0.

    Args:
        df: Input dataframe (original or privacy-transformed).
        target_col: Name of the label column.

    Returns:
        ``(X, y, error)``. On success ``X`` is a 2-D array, ``y`` the
        label-encoded target and ``error`` is ``None``. On failure ``X`` and
        ``y`` are ``None`` and ``error`` is a message string.
    """
    if target_col not in df.columns:
        return None, None, f"Target column '{target_col}' not found"

    # Copy and clean
    df_clean = df.dropna(axis=1, how='all').copy()
    if target_col not in df_clean.columns:
        # dropna() removed the target because every value is missing; report
        # it instead of failing with a KeyError below.
        return None, None, f"Target column '{target_col}' has no values"

    # Separate target
    y = df_clean[target_col]
    X = df_clean.drop(columns=[target_col])

    # Remove identifier columns
    id_cols = ['Name', 'SSN', 'DOB', 'Name_Pseudo', 'SSN_Hashed', 'Age_Range']
    X = X.drop(columns=[c for c in id_cols if c in X.columns])

    # Encode
    for col in X.columns:
        if pd.api.types.is_numeric_dtype(X[col]):
            X[col] = pd.to_numeric(X[col], errors='coerce').fillna(0)
        else:
            # Go through object dtype so that categorical / string-dtype
            # columns accept the 'Unknown' placeholder.
            as_text = X[col].astype(object).fillna('Unknown').astype(str)
            X[col] = LabelEncoder().fit_transform(as_text)

    if y.isna().any():
        # Stringify so that a numeric target with gaps does not become a
        # mixed float/str column, which LabelEncoder cannot sort.
        y = y.astype(object).fillna('Unknown').astype(str)
    y_encoded = LabelEncoder().fit_transform(y)

    return X.values, y_encoded, None
144
+
145
+
146
def run_ml_comparison(df_original, df_encrypted, epsilon):
    """Fit the demo models and tabulate their held-out scores.

    Three models are evaluated on an 80/20 split (``random_state=42``) of
    standardized features:

    1. plain logistic regression on the original data,
    2. differentially private logistic regression on the original data
       (only when ``DP_AVAILABLE`` is true),
    3. a random forest on the privacy-transformed data.

    Returns:
        A DataFrame with one row per model, or an error string when either
        frame cannot be prepared by ``prepare_for_ml``.
    """
    def _score_row(model_label, data_label, truth, predicted, privacy_label):
        # One result row; key order defines the column order of the table.
        return {
            'Model': model_label,
            'Data': data_label,
            'Accuracy': round(accuracy_score(truth, predicted), 4),
            'F1 Score': round(f1_score(truth, predicted, average='weighted'), 4),
            'Privacy Level': privacy_label,
        }

    # Prepare both frames, bailing out on the first problem
    prepared = []
    for label, frame in (("original", df_original), ("encrypted", df_encrypted)):
        features, target, problem = prepare_for_ml(frame)
        if problem:
            return f"Error with {label} data: {problem}"
        prepared.append((features, target))
    (X_orig, y_orig), (X_enc, y_enc) = prepared

    # Identical split settings for both datasets
    X_tr_o, X_te_o, y_tr_o, y_te_o = train_test_split(
        X_orig, y_orig, test_size=0.2, random_state=42
    )
    X_tr_e, X_te_e, y_tr_e, y_te_e = train_test_split(
        X_enc, y_enc, test_size=0.2, random_state=42
    )

    # Standardize; each dataset gets its own scaler fitted on its training part
    scaler_orig = StandardScaler()
    X_tr_o = scaler_orig.fit_transform(X_tr_o)
    X_te_o = scaler_orig.transform(X_te_o)

    scaler_enc = StandardScaler()
    X_tr_e = scaler_enc.fit_transform(X_tr_e)
    X_te_e = scaler_enc.transform(X_te_e)

    rows = []

    # Model 1: non-private baseline
    baseline = LogisticRegression(max_iter=1000, random_state=42)
    baseline.fit(X_tr_o, y_tr_o)
    rows.append(_score_row(
        'Standard Logistic Regression',
        'Original (No Privacy)',
        y_te_o,
        baseline.predict(X_te_o),
        'None ❌',
    ))

    # Model 2: differentially private logistic regression
    if DP_AVAILABLE:
        try:
            # NOTE(review): the norm bound is derived from the training data itself.
            max_row_norm = np.linalg.norm(X_tr_o, axis=1).max()
            private_lr = DPLogisticRegression(
                epsilon=epsilon, data_norm=max_row_norm,
                max_iter=1000, random_state=42
            )
            private_lr.fit(X_tr_o, y_tr_o)
            rows.append(_score_row(
                f'DP Logistic Regression (ε={epsilon})',
                'Original + DP Training',
                y_te_o,
                private_lr.predict(X_te_o),
                f'High ✓ (ε={epsilon})',
            ))
        except Exception as e:
            # Keep the table usable: report the failure as a row of its own.
            rows.append({
                'Model': 'DP Logistic Regression',
                'Data': 'Error',
                'Accuracy': 0,
                'F1 Score': 0,
                'Privacy Level': f'Error: {str(e)[:50]}'
            })

    # Model 3: random forest on the transformed data
    forest = RandomForestClassifier(n_estimators=50, max_depth=8, random_state=42)
    forest.fit(X_tr_e, y_tr_e)
    rows.append(_score_row(
        'Random Forest',
        'Encrypted Data',
        y_te_e,
        forest.predict(X_te_e),
        'High ✓ (Data Encrypted)',
    ))

    return pd.DataFrame(rows)
228
+
229
+
230
+ # ========== GRADIO INTERFACE ==========
231
+
232
def process_data(file, epsilon, show_sample):
    """Gradio callback: load a CSV, apply privacy transforms, compare models.

    Args:
        file: The uploaded file. Either an object exposing the file path as
            ``.name`` or a plain path string; ``None`` when nothing was
            uploaded.
        epsilon: Privacy budget forwarded to the noise functions and the
            differentially private model.
        show_sample: When true, the first five rows of the original and the
            transformed data are returned for display.

    Returns:
        A 4-tuple ``(markdown_text, comparison, original_sample,
        encrypted_sample)``. ``comparison`` is a DataFrame, or ``None`` when
        the comparison could not be produced (the reason is then appended to
        ``markdown_text``). The samples are ``None`` unless ``show_sample``.
    """
    # Load data
    if file is None:
        return "Please upload a CSV file.", None, None, None

    # Accept both a file-like wrapper and a bare path string.
    path = getattr(file, "name", file)
    try:
        df = pd.read_csv(path)
    except Exception as e:
        # UI boundary: any read/parse failure is shown to the user.
        return f"Error reading file: {e}", None, None, None

    # Clean
    df = df.dropna(axis=1, how='all').drop_duplicates()
    df.columns = df.columns.str.strip()

    # Encrypt
    df_encrypted, transformations = encrypt_dataframe(df, epsilon)

    transform_text = "**Privacy Transformations Applied:**\n" + "\n".join(
        [f"• {t}" for t in transformations]
    )

    # Run ML comparison. run_ml_comparison signals data problems with a
    # string, and training can raise ValueError (e.g. too few rows or a single
    # class); neither may be passed on to the Dataframe output.
    try:
        comparison = run_ml_comparison(df, df_encrypted, epsilon)
    except ValueError as exc:
        comparison = f"Model comparison failed: {exc}"
    if isinstance(comparison, str):
        transform_text += f"\n\n**{comparison}**"
        comparison = None

    # Sample data (first 5 rows)
    sample_orig = df.head(5) if show_sample else None
    sample_enc = df_encrypted.head(5) if show_sample else None

    return transform_text, comparison, sample_orig, sample_enc
269
+
270
+
271
def create_demo():
    """Build the Gradio Blocks interface for the privacy demo.

    Creates the upload/controls column, the result widgets and the static
    explanatory Markdown, and connects the run button to ``process_data``.

    Returns:
        The assembled ``gr.Blocks`` app; launching it is left to the caller.
    """
    with gr.Blocks(title="Privacy-Preserving ML Demo", theme=gr.themes.Soft()) as demo:

        gr.Markdown("""
        # 🔒 Privacy-Preserving Machine Learning Demo

        This demo shows how **differential privacy** and **data encryption** techniques
        can protect sensitive data while still allowing useful ML predictions.

        ## How it works:
        1. Upload your healthcare/financial CSV dataset
        2. Adjust the privacy budget (epsilon) - lower = more privacy, less accuracy
        3. See how different privacy techniques transform your data
        4. Compare model performance: original vs. encrypted data

        ---
        """)

        with gr.Row():
            with gr.Column(scale=1):
                file_input = gr.File(
                    label="📁 Upload CSV Dataset",
                    file_types=[".csv"]
                )

                epsilon_slider = gr.Slider(
                    minimum=0.1, maximum=10.0, value=1.0, step=0.1,
                    label="🔐 Privacy Budget (Epsilon)",
                    info="Lower = more privacy, less utility. Typical: 0.1-2.0"
                )

                show_sample = gr.Checkbox(
                    value=True,
                    label="Show data samples"
                )

                run_btn = gr.Button("🚀 Run Privacy Analysis", variant="primary")

        with gr.Row():
            transform_output = gr.Markdown(label="Transformations Applied")

        gr.Markdown("## 📊 Model Performance Comparison")
        comparison_output = gr.Dataframe(label="Results")

        with gr.Row():
            with gr.Column():
                gr.Markdown("### Original Data (Sample)")
                orig_sample = gr.Dataframe(label="First 5 rows")
            with gr.Column():
                gr.Markdown("### Encrypted Data (Sample)")
                enc_sample = gr.Dataframe(label="First 5 rows - PII Protected")

        # The "Applied To" column mirrors what encrypt_dataframe actually does:
        # only DOB is generalized; Income and Heart Rate receive Laplace noise.
        gr.Markdown("""
        ---
        ## 📚 Privacy Techniques Used

        | Technique | What it Does | Applied To |
        |-----------|--------------|------------|
        | **SHA-256 Hashing** | One-way irreversible hash | SSN |
        | **Pseudonymization** | Replace with fake IDs | Names |
        | **K-Anonymity** | Generalize to ranges | DOB |
        | **Laplace Noise** | Add random noise | Income, Heart Rate |
        | **Differential Privacy** | Mathematical privacy guarantee | ML training |

        **Privacy Budget (ε):** Controls the trade-off between privacy and utility.
        - ε = 0.1: Very high privacy, significant accuracy loss
        - ε = 1.0: Good balance (recommended)
        - ε = 10.0: Low privacy, minimal accuracy loss
        """)

        # Connect button to function
        run_btn.click(
            fn=process_data,
            inputs=[file_input, epsilon_slider, show_sample],
            outputs=[transform_output, comparison_output, orig_sample, enc_sample]
        )

    return demo
351
+
352
+
353
# Launch: build the interface and start the Gradio server, but only when this
# file is run as a script (importing the module has no side effects).
if __name__ == "__main__":
    demo = create_demo()
    demo.launch()
model_comparison_results.csv ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ Model,Accuracy,F1_Score
2
+ Standard LR (No Privacy),1.0,1.0
3
+ Standard RF (No Privacy),1.0,1.0
4
+ LR on Encrypted Data,1.0,1.0
5
+ RF on Encrypted Data,0.6666666666666666,0.8000000000000002
privacy_ml_solution.py ADDED
@@ -0,0 +1,515 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Privacy-Preserving Machine Learning Solution
3
+ =============================================
4
+ Implements differential privacy and data encryption for healthcare data classification.
5
+ Designed for Hugging Face Spaces deployment (CPU-only, free tier compatible).
6
+
7
+ Author: Data Science Assignment
8
+ """
9
+
10
+ import pandas as pd
11
+ import numpy as np
12
+ import hashlib
13
+ import base64
14
+ import warnings
15
+ from datetime import datetime
16
+ from typing import Tuple, Dict, Any
17
+
18
+ # Core ML libraries
19
+ from sklearn.model_selection import train_test_split
20
+ from sklearn.preprocessing import LabelEncoder, StandardScaler
21
+ from sklearn.ensemble import RandomForestClassifier
22
+ from sklearn.linear_model import LogisticRegression
23
+ from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
24
+
25
+ # Differential Privacy library - IBM's diffprivlib
26
+ # Lightweight, sklearn-compatible, works on CPU
27
+ try:
28
+ from diffprivlib.models import LogisticRegression as DPLogisticRegression
29
+ from diffprivlib.models import GaussianNB as DPGaussianNB
30
+ DIFFPRIVLIB_AVAILABLE = True
31
+ except ImportError:
32
+ DIFFPRIVLIB_AVAILABLE = False
33
+ print("Warning: diffprivlib not installed. Install with: pip install diffprivlib")
34
+
35
+ warnings.filterwarnings('ignore')
36
+
37
+
38
+ # ============================================================================
39
+ # SECTION 1: DATA ENCRYPTION UTILITIES
40
+ # ============================================================================
41
+
42
class DataPrivacyProcessor:
    """
    Handles multiple privacy-preserving transformations:
    1. Hashing (SHA-256) for direct identifiers like SSN
    2. K-anonymity style generalization for quasi-identifiers
    3. Data masking for names
    4. Noise addition (Laplace mechanism) for numerical values
    """

    # (exclusive upper bound, label) pairs, checked in order.
    _AGE_BUCKETS = (
        (25, "18-24"),
        (35, "25-34"),
        (45, "35-44"),
        (55, "45-54"),
        (65, "55-64"),
    )
    _INCOME_BUCKETS = (
        (30000, "Low (<30K)"),
        (50000, "Medium-Low (30-50K)"),
        (75000, "Medium (50-75K)"),
        (100000, "Medium-High (75-100K)"),
    )
    _DATE_FORMATS = ('%m/%d/%Y', '%Y-%m-%d', '%d/%m/%Y')

    def __init__(self, epsilon: float = 1.0):
        """
        Args:
            epsilon: Privacy budget for differential privacy.
                     Lower = more privacy, less utility.
                     Typical range: 0.1 (high privacy) to 10 (low privacy)
        """
        self.epsilon = epsilon
        # NOTE(review): a fixed, source-visible salt does not stop anyone who
        # has this file from enumerating a small identifier space - confirm
        # this is acceptable outside a demo.
        self.salt = "privacy_salt_2024"  # Salt for hashing

    def hash_identifier(self, value: str) -> str:
        """
        One-way hash for direct identifiers (SSN, etc.).

        Returns the first 16 hex characters of SHA-256(salt + value), or
        ``"HASH_NULL"`` for missing values.
        """
        if pd.isna(value):
            return "HASH_NULL"
        salted = f"{self.salt}{value}"
        return hashlib.sha256(salted.encode()).hexdigest()[:16]

    def mask_name(self, name: str) -> str:
        """
        Pseudonymizes names with a deterministic token.
        Example: 'John Smith' -> 'P_A1B2C3'

        Missing names become ``"P_NULL"``.
        """
        if pd.isna(name):
            return "P_NULL"
        # Create deterministic pseudonym from hash
        hash_val = hashlib.md5(f"{self.salt}{name}".encode()).hexdigest()[:6]
        return f"P_{hash_val.upper()}"

    def generalize_age(self, dob_str: str) -> str:
        """
        K-anonymity: Generalizes exact DOB to age ranges.

        Accepted layouts are ``MM/DD/YYYY``, ``YYYY-MM-DD`` and
        ``DD/MM/YYYY``, tried in that order. Returns ``"Unknown"`` for
        missing, unparseable or future dates. Every age below 25 is reported
        as ``"18-24"``.
        """
        if pd.isna(dob_str):
            return "Unknown"

        text = str(dob_str)
        dob = None
        for fmt in self._DATE_FORMATS:
            try:
                dob = datetime.strptime(text, fmt)
            except ValueError:
                continue
            break
        if dob is None:
            return "Unknown"

        today = datetime.now()
        # Completed calendar years; ``days // 365`` drifts by the number of
        # leap days and bumps people into the next bucket a few days early.
        birthday_pending = (today.month, today.day) < (dob.month, dob.day)
        age = today.year - dob.year - int(birthday_pending)
        if age < 0:
            return "Unknown"

        # Age buckets (10-year ranges above 25 for k-anonymity)
        for upper, label in self._AGE_BUCKETS:
            if age < upper:
                return label
        return "65+"

    def generalize_income(self, income: float) -> str:
        """
        K-anonymity: Buckets income into ranges.

        Returns ``"Unknown"`` for missing or non-numeric values.
        """
        if pd.isna(income):
            return "Unknown"
        try:
            amount = float(income)
        except (ValueError, TypeError):
            return "Unknown"
        for upper, label in self._INCOME_BUCKETS:
            if amount < upper:
                return label
        return "High (100K+)"

    def add_laplace_noise(self, value: float, sensitivity: float = 1.0) -> float:
        """
        Differential Privacy: Adds Laplace noise with scale
        ``sensitivity / self.epsilon``.

        Args:
            value: Original numeric value (missing values are returned as-is)
            sensitivity: How much one person can affect the output
        """
        if pd.isna(value):
            return value
        scale = sensitivity / self.epsilon
        noise = np.random.laplace(0, scale)
        return value + noise

    def encrypt_dataset(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Applies the matching privacy technique to each known column and
        returns the transformed copy; ``df`` itself is not modified.

        ``SSN``, ``Name``, ``DOB`` and ``Income`` are replaced by derived
        columns. ``Heart Rate`` gains a noisy companion column while the raw
        column is kept.
        """
        encrypted_df = df.copy()

        print("Applying privacy-preserving transformations...")

        # 1. Hash direct identifiers (SSN) - irreversible
        if 'SSN' in encrypted_df.columns:
            encrypted_df['SSN_Hash'] = encrypted_df['SSN'].apply(self.hash_identifier)
            encrypted_df = encrypted_df.drop(columns=['SSN'])
            print(" ✓ SSN hashed with SHA-256")

        # 2. Pseudonymize names
        if 'Name' in encrypted_df.columns:
            encrypted_df['Name_Pseudo'] = encrypted_df['Name'].apply(self.mask_name)
            encrypted_df = encrypted_df.drop(columns=['Name'])
            print(" ✓ Names pseudonymized")

        # 3. Generalize DOB to age ranges (k-anonymity)
        if 'DOB' in encrypted_df.columns:
            encrypted_df['Age_Range'] = encrypted_df['DOB'].apply(self.generalize_age)
            encrypted_df = encrypted_df.drop(columns=['DOB'])
            print(" ✓ DOB generalized to age ranges")

        # 4. Generalize income (k-anonymity)
        if 'Income' in encrypted_df.columns:
            # Keep noisy version for ML, generalized for reporting. The range
            # is derived from the true income, so it can disagree with the
            # noisy value.
            encrypted_df['Income_Noisy'] = encrypted_df['Income'].apply(
                lambda x: self.add_laplace_noise(x, sensitivity=5000)
            )
            encrypted_df['Income_Range'] = encrypted_df['Income'].apply(self.generalize_income)
            encrypted_df = encrypted_df.drop(columns=['Income'])
            print(" ✓ Income: noise added + generalized")

        # 5. Add noise to other numerical health metrics
        numeric_noise_cols = ['Heart Rate']
        for col in numeric_noise_cols:
            if col in encrypted_df.columns:
                # Underscore form ('Heart_Rate_Noisy') matches the name used by
                # the demo app and the published encrypted CSV.
                noisy_name = f"{col.replace(' ', '_')}_Noisy"
                encrypted_df[noisy_name] = encrypted_df[col].apply(
                    lambda x: self.add_laplace_noise(x, sensitivity=5)
                )
                print(f" ✓ {col}: Laplace noise added")

        print(f"\nPrivacy budget (epsilon) used: {self.epsilon}")
        return encrypted_df
203
+
204
+
205
+ # ============================================================================
206
+ # SECTION 2: DATA PREPROCESSING
207
+ # ============================================================================
208
+
209
class HealthcareDataProcessor:
    """
    Prepares healthcare data for ML model training.
    Handles loading, cleaning, label encoding and feature scaling.
    """

    def __init__(self):
        # Fitted encoders keyed by column name; the target's is under 'target'.
        self.label_encoders = {}
        self.scaler = StandardScaler()
        # Names of the feature columns used in the last prepare_features call.
        self.feature_columns = []

    def load_and_clean(self, filepath: str) -> pd.DataFrame:
        """Load a CSV and perform basic cleaning.

        Drops columns that are entirely empty, removes duplicate rows and
        strips surrounding whitespace from column names.
        """
        df = pd.read_csv(filepath)

        # Remove completely empty columns
        df = df.dropna(axis=1, how='all')

        # Remove duplicate rows
        df = df.drop_duplicates()

        # Clean column names
        df.columns = df.columns.str.strip()

        print(f"Loaded {len(df)} records with {len(df.columns)} features")
        return df

    def prepare_features(self, df: pd.DataFrame, target_col: str = 'Tumor Condition') -> Tuple[np.ndarray, np.ndarray]:
        """
        Encodes categorical features and prepares for ML.

        Identifier columns are removed, non-numeric columns are label-encoded
        (missing values become ``'Unknown'``), numeric columns have missing
        values filled with the column median (0.0 if the column has no values
        at all), and the result is standardized with ``self.scaler``.

        Returns:
            Scaled feature matrix ``X`` and label-encoded target vector ``y``.

        Raises:
            ValueError: If ``target_col`` is not a column of ``df``.
        """
        # Identify target
        if target_col not in df.columns:
            raise ValueError(f"Target column '{target_col}' not found!")

        # Separate features and target
        y = df[target_col]
        X_df = df.drop(columns=[target_col])

        # Remove non-predictive columns (identifiers)
        cols_to_drop = ['Name', 'SSN', 'Name_Pseudo', 'SSN_Hash', 'DOB']
        X_df = X_df.drop(columns=[c for c in cols_to_drop if c in X_df.columns])

        # Encode target variable
        if y.isna().any():
            # Stringify so a numeric or categorical target with gaps does not
            # become a mixed-type column that LabelEncoder cannot sort.
            y = y.astype(object).fillna('Unknown').astype(str)
        le_target = LabelEncoder()
        y_encoded = le_target.fit_transform(y)
        self.label_encoders['target'] = le_target

        # Process each column
        for col in X_df.columns:
            if pd.api.types.is_numeric_dtype(X_df[col]):
                numeric = pd.to_numeric(X_df[col], errors='coerce')
                if numeric.isna().any():
                    fill = numeric.median()
                    # An all-missing column has a NaN median; fall back to 0
                    # so no NaN reaches the scaler or the models.
                    numeric = numeric.fillna(0.0 if pd.isna(fill) else fill)
                X_df[col] = numeric
            else:
                # Categorical: label encode. Going through object dtype lets
                # categorical / string-dtype columns accept 'Unknown'.
                le = LabelEncoder()
                as_text = X_df[col].astype(object).fillna('Unknown').astype(str)
                X_df[col] = le.fit_transform(as_text)
                self.label_encoders[col] = le

        self.feature_columns = list(X_df.columns)

        # Scale features
        X_scaled = self.scaler.fit_transform(X_df)

        print(f"Prepared {X_scaled.shape[1]} features for {X_scaled.shape[0]} samples")
        return X_scaled, y_encoded
279
+
280
+
281
+ # ============================================================================
282
+ # SECTION 3: MODEL TRAINING AND EVALUATION
283
+ # ============================================================================
284
+
285
+ class PrivacyPreservingMLPipeline:
286
+ """
287
+ Complete ML pipeline comparing:
288
+ 1. Standard model (no privacy)
289
+ 2. Differentially private model
290
+ 3. Model trained on encrypted data
291
+ """
292
+
293
+ def __init__(self, epsilon: float = 1.0):
294
+ self.epsilon = epsilon
295
+ self.results = {}
296
+
297
+ def evaluate_model(self, y_true: np.ndarray, y_pred: np.ndarray, model_name: str) -> Dict[str, float]:
298
+ """Calculate and store standard metrics."""
299
+ metrics = {
300
+ 'accuracy': accuracy_score(y_true, y_pred),
301
+ 'precision': precision_score(y_true, y_pred, average='weighted', zero_division=0),
302
+ 'recall': recall_score(y_true, y_pred, average='weighted', zero_division=0),
303
+ 'f1': f1_score(y_true, y_pred, average='weighted', zero_division=0)
304
+ }
305
+ self.results[model_name] = metrics
306
+ return metrics
307
+
308
+ def train_standard_model(self, X_train: np.ndarray, X_test: np.ndarray,
309
+ y_train: np.ndarray, y_test: np.ndarray) -> Dict[str, float]:
310
+ """Train standard logistic regression (no privacy)."""
311
+ print("\n" + "="*60)
312
+ print("TRAINING STANDARD MODEL (No Privacy Protection)")
313
+ print("="*60)
314
+
315
+ model = LogisticRegression(max_iter=1000, random_state=42)
316
+ model.fit(X_train, y_train)
317
+ y_pred = model.predict(X_test)
318
+
319
+ metrics = self.evaluate_model(y_test, y_pred, 'Standard LR')
320
+ print(f"Accuracy: {metrics['accuracy']:.4f}")
321
+ print(f"F1 Score: {metrics['f1']:.4f}")
322
+
323
+ return metrics
324
+
325
+ def train_dp_model(self, X_train: np.ndarray, X_test: np.ndarray,
326
+ y_train: np.ndarray, y_test: np.ndarray) -> Dict[str, float]:
327
+ """Train differentially private logistic regression."""
328
+ print("\n" + "="*60)
329
+ print(f"TRAINING DP MODEL (Epsilon = {self.epsilon})")
330
+ print("="*60)
331
+
332
+ if not DIFFPRIVLIB_AVAILABLE:
333
+ print("diffprivlib not available - skipping DP model")
334
+ return {}
335
+
336
+ # Calculate data bounds for DP (required by diffprivlib)
337
+ data_norm = np.linalg.norm(X_train, axis=1).max()
338
+
339
+ dp_model = DPLogisticRegression(
340
+ epsilon=self.epsilon,
341
+ data_norm=data_norm,
342
+ max_iter=1000,
343
+ random_state=42
344
+ )
345
+
346
+ dp_model.fit(X_train, y_train)
347
+ y_pred = dp_model.predict(X_test)
348
+
349
+ metrics = self.evaluate_model(y_test, y_pred, f'DP LR (ε={self.epsilon})')
350
+ print(f"Accuracy: {metrics['accuracy']:.4f}")
351
+ print(f"F1 Score: {metrics['f1']:.4f}")
352
+
353
+ return metrics
354
+
355
def train_on_encrypted_data(self, X_train: np.ndarray, X_test: np.ndarray,
                            y_train: np.ndarray, y_test: np.ndarray) -> Dict[str, float]:
    """Fit a random forest on the already encrypted/anonymized features.

    No privacy transformation happens here; the caller is expected to pass
    splits that were built from the encrypted dataset.

    Args:
        X_train: Training feature matrix.
        X_test: Held-out feature matrix used for prediction.
        y_train: Training labels.
        y_test: Held-out labels the predictions are compared against.

    Returns:
        The metrics dictionary produced by ``self.evaluate_model`` for the
        model name ``'RF on Encrypted Data'``.
    """
    divider = "=" * 60
    print("\n" + divider)
    print("TRAINING ON ENCRYPTED DATA")
    print(divider)

    # Random forest was picked by the original author on the grounds that it
    # copes better with noisy inputs than the linear baseline.
    forest_params = {
        'n_estimators': 100,
        'max_depth': 10,
        'random_state': 42,
        'n_jobs': -1,
    }
    forest = RandomForestClassifier(**forest_params)
    forest.fit(X_train, y_train)
    predictions = forest.predict(X_test)

    scores = self.evaluate_model(y_test, predictions, 'RF on Encrypted Data')
    for label, key in (("Accuracy", "accuracy"), ("F1 Score", "f1")):
        print(f"{label}: {scores[key]:.4f}")

    return scores
379
+
380
def compare_results(self) -> pd.DataFrame:
    """Build and print a side-by-side table of every recorded model.

    Returns:
        A DataFrame with one row per entry in ``self.results`` and one column
        per metric, rounded to four decimals. An empty DataFrame (and no
        printed output) when ``self.results`` is empty.
    """
    if self.results:
        # self.results maps model name -> metrics dict; transposing puts the
        # models on the rows and the metrics on the columns.
        table = pd.DataFrame(self.results).T.round(4)

        divider = "=" * 60
        print("\n" + divider)
        print("MODEL COMPARISON RESULTS")
        print(divider)
        print(table.to_string())

        return table

    return pd.DataFrame()
394
+
395
+
396
+ # ============================================================================
397
+ # SECTION 4: MAIN EXECUTION
398
+ # ============================================================================
399
+
400
def run_complete_pipeline(data_path: str, epsilon: float = 1.0) -> Tuple[pd.DataFrame, pd.DataFrame, Dict]:
    """
    Execute the complete privacy-preserving ML pipeline.

    Loads and cleans the dataset, applies the privacy transformations, writes
    the transformed dataset to ``<data_path without extension>_encrypted.csv``,
    trains the standard, differentially private (when available) and
    encrypted-data models, and prints a comparison table.

    Args:
        data_path: Path to the CSV dataset
        epsilon: Privacy budget for differential privacy

    Returns:
        - Original cleaned DataFrame
        - Encrypted DataFrame
        - Dictionary of all results
    """
    import os  # local import: only needed to derive the output file name

    print("="*70)
    print("PRIVACY-PRESERVING MACHINE LEARNING PIPELINE")
    print("="*70)
    print(f"Privacy budget (epsilon): {epsilon}")
    print(f"Data file: {data_path}")
    print("="*70)

    # Step 1: Load and clean data
    processor = HealthcareDataProcessor()
    df_original = processor.load_and_clean(data_path)

    print("\n--- ORIGINAL DATA SAMPLE ---")
    print(df_original.head(3).to_string())

    # Step 2: Apply privacy transformations
    privacy_processor = DataPrivacyProcessor(epsilon=epsilon)
    df_encrypted = privacy_processor.encrypt_dataset(df_original)

    print("\n--- ENCRYPTED DATA SAMPLE ---")
    print(df_encrypted.head(3).to_string())

    # Save encrypted dataset.
    # The name is built from the path without its extension instead of
    # str.replace('.csv', ...): replace() returns the input path unchanged when
    # it contains no '.csv' (e.g. 'data.CSV', 'data.tsv', 'data'), which would
    # overwrite the original dataset, and it also rewrites '.csv' inside
    # directory names.
    path_root, _ = os.path.splitext(data_path)
    encrypted_path = f"{path_root}_encrypted.csv"
    df_encrypted.to_csv(encrypted_path, index=False)
    print(f"\n✓ Encrypted dataset saved to: {encrypted_path}")

    # Step 3: Prepare features from ORIGINAL data
    # A copy is passed so prepare_features cannot alter the returned frame.
    X_orig, y_orig = processor.prepare_features(df_original.copy())
    X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(
        X_orig, y_orig, test_size=0.2, random_state=42, stratify=y_orig
    )

    # Step 4: Prepare features from ENCRYPTED data
    # A separate processor instance is used for the encrypted frame.
    processor_enc = HealthcareDataProcessor()
    df_enc_clean = df_encrypted.copy()
    X_enc, y_enc = processor_enc.prepare_features(df_enc_clean)
    X_train_enc, X_test_enc, y_train_enc, y_test_enc = train_test_split(
        X_enc, y_enc, test_size=0.2, random_state=42, stratify=y_enc
    )

    # Step 5: Train and evaluate models
    pipeline = PrivacyPreservingMLPipeline(epsilon=epsilon)

    # Model 1: Standard (no privacy)
    pipeline.train_standard_model(X_train_orig, X_test_orig, y_train_orig, y_test_orig)

    # Model 2: Differential Privacy
    if DIFFPRIVLIB_AVAILABLE:
        pipeline.train_dp_model(X_train_orig, X_test_orig, y_train_orig, y_test_orig)

    # Model 3: Trained on encrypted data
    pipeline.train_on_encrypted_data(X_train_enc, X_test_enc, y_train_enc, y_test_enc)

    # Step 6: Generate comparison
    comparison = pipeline.compare_results()

    # Step 7: Summary
    results = {
        'original_shape': df_original.shape,
        'encrypted_shape': df_encrypted.shape,
        'epsilon': epsilon,
        'model_comparison': comparison.to_dict(),
        'privacy_techniques_applied': [
            'SHA-256 Hashing (SSN)',
            'Pseudonymization (Names)',
            'K-Anonymity Generalization (DOB, Income)',
            'Laplace Noise Addition (Numerical features)',
            f'Differential Privacy (ε={epsilon})'
        ]
    }

    print("\n" + "="*70)
    print("PIPELINE COMPLETED SUCCESSFULLY")
    print("="*70)

    return df_original, df_encrypted, results
489
+
490
+
491
+ # ============================================================================
492
+ # SECTION 5: COMMAND LINE INTERFACE
493
+ # ============================================================================
494
+
495
if __name__ == "__main__":
    import sys

    # Optional positional arguments override the defaults:
    #   argv[1] -> dataset path, argv[2] -> epsilon
    cli_args = sys.argv[1:]
    data_file = cli_args[0] if cli_args else "Assignment2Dataset-1.csv"
    # Default epsilon of 1.0 is the author's balance between privacy and utility.
    epsilon = float(cli_args[1]) if len(cli_args) > 1 else 1.0

    # Run the complete pipeline
    df_orig, df_enc, results = run_complete_pipeline(data_file, epsilon)

    record_count = results['original_shape'][0]
    technique_count = len(results['privacy_techniques_applied'])

    print("\n\nFinal Summary:")
    print("-" * 40)
    print(f"Original records: {record_count}")
    print(f"Privacy techniques applied: {technique_count}")
    print(f"Epsilon value: {results['epsilon']}")
requirements.txt ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Requirements for Privacy-Preserving ML Demo
2
+ # Hugging Face Spaces - CPU Only (Free Tier Compatible)
3
+
4
+ # Core ML
5
+ pandas>=2.0.0
6
+ numpy>=1.24.0
7
+ scikit-learn>=1.3.0
8
+
9
+ # Differential Privacy - IBM's library (lightweight, pure Python)
10
+ diffprivlib>=0.6.0
11
+
12
+ # Gradio for web interface
13
+ gradio>=4.0.0
14
+
15
+ # Note: No GPU libraries needed - runs on CPU
16
+ # Total install size: ~200MB (within free tier limits)