fredwebx committed on
Commit
aecd0c4
·
verified ·
1 Parent(s): f2ce9fa

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +261 -0
app.py ADDED
@@ -0,0 +1,261 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ from faker import Faker
4
+ import os
5
+ import re
6
+
7
+ # ── Example CSV (created at startup, before Blocks definition) ────────────────
8
def _create_example():
    """Write the sample dataset to ``example.csv`` unless that file already exists."""
    target = "example.csv"
    if not os.path.exists(target):
        # Five illustrative records covering the column kinds the detector knows.
        columns = {
            "id": range(1, 6),
            "first_name": ["Mario", "Lucia", "Giovanni", "Sofia", "Marco"],
            "last_name": ["Rossi", "Bianchi", "Ferrari", "Esposito", "Romano"],
            "email": [
                "mario.rossi@gmail.com",
                "lucia.b@yahoo.it",
                "g.ferrari@outlook.com",
                "sofia.e@libero.it",
                "m.romano@hotmail.com",
            ],
            "phone": [
                "+39 333 1234567",
                "02 9876543",
                "+39 347 9988776",
                "080 5551234",
                "+39 320 4567890",
            ],
            "city": ["Roma", "Milano", "Napoli", "Bari", "Torino"],
            "notes": ["premium customer", "newsletter yes", "B2B", "trial", "enterprise"],
        }
        pd.DataFrame(columns).to_csv(target, index=False)


# Executed at import time so the sample file exists before the UI is defined.
_create_example()
27
+
28
+ # ── Available anonymization types ─────────────────────────────────────────────
29
# Internal type key -> label shown in the column editor (declaration order is kept).
ANON_TYPES = dict(
    [
        ("email", "📧 Email"),
        ("first_name", "👤 First Name"),
        ("last_name", "👤 Last Name"),
        ("full_name", "👤 Full Name"),
        ("phone", "📱 Phone"),
        ("address", "🏠 Address"),
        ("city", "🏙️ City"),
        ("postal_code", "📮 Postal Code"),
        ("tax_id", "🪪 Tax ID"),
        ("date_of_birth", "📅 Date of Birth"),
        ("generic", "🔒 Generic"),
    ]
)
# Reverse lookup: editor label -> internal type key.
TYPE_LABELS = dict(zip(ANON_TYPES.values(), ANON_TYPES.keys()))
# All labels in declaration order; the last one is the generic entry.
TYPE_CHOICES = [*ANON_TYPES.values()]
44
+
45
+ # ── Automatic type detection ──────────────────────────────────────────────────
46
def _detect_type(series):
    """Guess the anonymization type key for a column.

    The column label is checked first against keyword lists (substring match,
    case-insensitive, first matching rule wins). If no keyword matches, up to
    200 non-null values are tested against e-mail and phone patterns; a
    pattern wins when more than half of the sampled values match it.

    Args:
        series: A pandas Series; its ``name`` is used as the column label.

    Returns:
        One of the type keys: "email", "phone", "full_name", "last_name",
        "first_name", "address", "city", "postal_code", "tax_id",
        "date_of_birth" or "generic".
    """
    # Labels are not guaranteed to be strings (e.g. integer labels), so
    # stringify instead of calling .lower() on the raw label.
    name = "" if series.name is None else str(series.name).lower()

    # Ordered rules: earlier entries take priority over later ones.
    keyword_rules = (
        ("email", ("email", "mail", "e-mail")),
        ("phone", ("telefon", "phone", "cell", "mobile", "tel")),
        ("full_name", ("full_name", "fullname", "nome_completo")),
        # last_name must be tested before first_name: "cognome" contains the
        # substring "nome" and would otherwise be classified as a first name.
        ("last_name", ("cognome", "surname", "last")),
        ("first_name", ("nome", "first", "given")),
        ("address", ("indirizzo", "address", "via", "street")),
        ("city", ("città", "city", "comune", "citta")),
        ("postal_code", ("cap", "postal", "zip")),
        ("tax_id", ("cf", "codice_fiscale", "fiscal", "tax")),
        ("date_of_birth", ("nascita", "birth", "dob", "birthday")),
    )
    for anon_type, keywords in keyword_rules:
        if any(k in name for k in keywords):
            return anon_type

    # Content-based fallback; the sample is only built when it is needed.
    sample = series.dropna().astype(str).head(200)
    if len(sample) > 0:
        if sample.str.match(r"^[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}$").mean() > 0.5:
            return "email"
        if sample.str.match(r"^[\+\d\s\-\(\)]{7,15}$").mean() > 0.5:
            return "phone"

    return "generic"
78
+
79
+ # ── Fake-data generators (deterministic per value) ────────────────────────────
80
def _make_fake(value, anon_type, cache):
    """Return a fake replacement for ``value``, memoised in ``cache``.

    The replacement is derived from a seed computed from ``value`` alone, so
    equal inputs map to equal outputs.

    Args:
        value: Original cell content (callers in this file pass a ``str``).
        anon_type: Type key selecting the generator ("email", "phone", ...).
            "generic" and any unknown key produce an ``ANONYMIZED_#####``
            placeholder.
        cache: Dict of already-produced replacements keyed by value; it is
            read first and updated in place.

    Returns:
        The replacement string.
    """
    if value in cache:
        return cache[value]

    # Function-scope import so this block does not depend on a file-level change.
    import hashlib

    # The built-in hash() of a str is salted per interpreter process, so a seed
    # based on it would give different fake data after every restart. A digest
    # of the value is stable across runs and machines.
    digest = hashlib.sha256(str(value).encode("utf-8", "surrogatepass")).digest()
    seed = int.from_bytes(digest[:4], "big")  # 32-bit seed, 0 .. 2**32 - 1

    generators = {
        "email": lambda fake: fake.email(),
        "first_name": lambda fake: fake.first_name(),
        "last_name": lambda fake: fake.last_name(),
        "full_name": lambda fake: fake.name(),
        "phone": lambda fake: fake.phone_number(),
        "address": lambda fake: fake.street_address(),
        "city": lambda fake: fake.city(),
        "postal_code": lambda fake: fake.postcode(),
        "tax_id": lambda fake: _fake_tax_id(fake),
        "date_of_birth": lambda fake: fake.date_of_birth(
            minimum_age=18, maximum_age=80
        ).strftime("%d/%m/%Y"),
    }

    generator = generators.get(anon_type)
    if generator is None:
        # "generic" or an unknown type: no Faker instance is needed at all.
        result = f"ANONYMIZED_{seed % 100000:05d}"
    else:
        fake_local = Faker(["it_IT", "en_US"])
        fake_local.seed_instance(seed)
        result = generator(fake_local)

    cache[value] = result
    return result
105
+
106
def _fake_tax_id(f):
    """Build a 16-character tax-ID-shaped string from ``f.random_choices``.

    Layout: 6 letters, 2 digits, 1 letter, 2 digits, 1 letter, 3 digits,
    1 letter. ``f`` only needs a ``random_choices(elements, length=...)``
    method returning a sequence of single characters.
    """
    alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    numerals = "0123456789"
    # (character pool, how many to draw); draws happen in this exact order.
    layout = (
        (alphabet, 3),
        (alphabet, 3),
        (numerals, 2),
        (alphabet, 1),
        (numerals, 2),
        (alphabet, 1),
        (numerals, 3),
        (alphabet, 1),
    )
    pieces = []
    for pool, count in layout:
        drawn = f.random_choices(pool, length=count)
        # A single-character segment contributes just its first element.
        pieces.append(drawn[0] if count == 1 else "".join(drawn))
    return "".join(pieces)
119
+
120
+ # ── Main logic ────────────────────────────────────────────────────────────────
121
# Single placeholder row for the column editor: blank column name, the last
# entry of TYPE_CHOICES as type, and the "anonymize" flag switched off.
EMPTY_CONFIG = [["", TYPE_CHOICES[-1], False]]


def reset_state():
    """Return the cleared (DataFrame state, column config, status) triple."""
    cleared_frame = None
    status_text = "Upload a CSV file to get started."
    return cleared_frame, EMPTY_CONFIG, status_text
126
+
127
def load_csv(file_path):
    """Load an uploaded CSV and build the initial column configuration.

    Args:
        file_path: Path of the uploaded file; a falsy value means no file.

    Returns:
        A ``(df, col_config, status)`` tuple. ``df`` is the loaded DataFrame
        (``None`` on failure), ``col_config`` is a list of
        ``[column name, type label, True]`` rows (``EMPTY_CONFIG`` on
        failure) and ``status`` is a Markdown message for the user.
    """
    if not file_path:
        return None, EMPTY_CONFIG, "Upload a CSV file to get started."

    df = None
    # Try UTF-8 first, then latin-1. Both attempts go through the same
    # handlers, so a parse error during the fallback read is reported to the
    # user instead of escaping as an unhandled exception.
    for encoding in ("utf-8", "latin-1"):
        try:
            # dtype=str keeps every cell exactly as written in the file, so
            # columns that are not anonymized keep leading zeros and are not
            # turned into floats when they contain blanks.
            df = pd.read_csv(
                file_path, encoding=encoding, dtype=str, on_bad_lines="skip"
            )
        except UnicodeDecodeError:
            continue
        except Exception as e:
            return None, EMPTY_CONFIG, f"❌ Error reading the file: {e}"
        else:
            break

    if df is None or df.empty or len(df.columns) == 0:
        return None, EMPTY_CONFIG, "❌ The CSV file looks empty or malformed."

    # Every column starts with its detected type and anonymization enabled.
    col_config = [
        [str(c), ANON_TYPES[_detect_type(df[c])], True]
        for c in df.columns
    ]
    msg = (
        f"✅ Loaded: **{len(df)} rows × {len(df.columns)} columns**. "
        f"Review the detected types below and click **Anonymize**."
    )
    return df, col_config, msg
151
+
152
+
153
def anonymize(df, col_config):
    """Anonymize the selected columns of ``df`` and write the result to a CSV.

    Args:
        df: The loaded DataFrame (``None`` or empty means nothing to do).
        col_config: Rows of ``[column name, type label, anonymize flag]``,
            given either as a list of lists or as a DataFrame.

    Returns:
        A ``(preview, csv_path, status)`` tuple: the first 10 rows of the
        anonymized data, the path of the written CSV file (``None`` on
        failure) and a Markdown status message.
    """
    if df is None or not isinstance(df, pd.DataFrame) or df.empty:
        return pd.DataFrame(), None, "❌ Please upload a CSV first."

    # Normalize col_config to a list of [col, type_label, anonymize_flag].
    if isinstance(col_config, pd.DataFrame):
        rows = col_config.values.tolist()
    else:
        rows = list(col_config) if col_config else []

    if not rows:
        return pd.DataFrame(), None, "❌ Column configuration is empty."

    result = df.copy()
    processed = 0

    for row in rows:
        if len(row) < 3:
            continue
        col_name, type_label, do_anon = row[0], row[1], row[2]
        if isinstance(do_anon, str):
            # Defensive: a flag that arrives as text ("false", "0", ...) would
            # otherwise always be truthy. Unrecognised text still anonymizes.
            do_anon = do_anon.strip().lower() not in ("", "false", "0", "no")
        if not do_anon or not col_name or col_name not in result.columns:
            continue

        anon_type = TYPE_LABELS.get(type_label, "generic")
        cache = {}

        def _replace(v, anon_type=anon_type, cache=cache):
            # Missing and blank cells are left untouched.
            if pd.notna(v) and str(v).strip() != "":
                return _make_fake(str(v), anon_type, cache)
            return v

        result[col_name] = result[col_name].apply(_replace)
        processed += 1

    # A fixed output path would be shared by every session of the app, letting
    # concurrent requests overwrite (and download) each other's data. Each
    # call therefore writes to its own uniquely named temporary file.
    import tempfile

    with tempfile.NamedTemporaryFile(
        mode="w",
        suffix=".csv",
        prefix="anonymized_",
        delete=False,
        encoding="utf-8",
        newline="",
    ) as handle:
        result.to_csv(handle, index=False)
        csv_path = handle.name

    msg = f"✅ Anonymized **{processed} columns** out of {len(df.columns)} total. Download below."
    return result.head(10), csv_path, msg
190
+
191
+ # ── UI ────────────────────────────────────────────────────────────────────────
192
# Markdown shown at the top of the page.
DESCRIPTION = """
# 🔒 CSV Data Anonymizer — GDPR Ready

Upload a CSV, review the columns detected as sensitive, and download the anonymized version.
The mapping is **deterministic**: the same value always produces the same fake data, preserving dataset consistency.

✨ Clean, merge, and complete Excel tasks in seconds with [XLclick Add-in for Excel](https://xlclick.com/?so=hface).
"""

with gr.Blocks(theme=gr.themes.Soft(), title="CSV Data Anonymizer") as demo:
    # Holds the loaded DataFrame between the upload and the anonymize click.
    df_state = gr.State()

    gr.Markdown(DESCRIPTION)

    with gr.Row():
        with gr.Column(scale=1):
            file_input = gr.File(
                label="📂 Upload CSV",
                file_types=[".csv"],
                type="filepath",
            )
            status_box = gr.Markdown("Upload a CSV file to get started.")

        with gr.Column(scale=2):
            # One editable row per CSV column: name, type label, anonymize flag.
            col_editor = gr.Dataframe(
                label="⚙️ Column configuration",
                headers=["Column", "Type", "Anonymize"],
                datatype=["str", "str", "bool"],
                value=EMPTY_CONFIG,
                col_count=(3, "fixed"),
                interactive=True,
                wrap=True,
            )

    anon_btn = gr.Button("🔒 Anonymize", variant="primary", size="lg")

    gr.Markdown("### Result preview (first 10 rows)")
    preview_out = gr.Dataframe(label="Anonymized preview", interactive=False, wrap=True)

    download_out = gr.File(label="⬇️ Download anonymized CSV")
    result_status = gr.Markdown("")

    # Upload -> parse the CSV and pre-fill the column configuration.
    file_input.upload(
        fn=load_csv,
        inputs=[file_input],
        outputs=[df_state, col_editor, status_box],
        api_name="load_csv",
    )

    # Clearing the file resets the state, the editor and the status text.
    file_input.clear(
        fn=reset_state,
        inputs=[],
        outputs=[df_state, col_editor, status_box],
        api_name="clear",
    )

    anon_btn.click(
        fn=anonymize,
        inputs=[df_state, col_editor],
        outputs=[preview_out, download_out, result_status],
        api_name="anonymize",
    )

    # The previous footer said nothing is saved to disk, but the anonymized
    # CSV is written to a file on the server so that it can be downloaded.
    gr.Markdown(
        "---\n*The file is processed on this server; the anonymized CSV is "
        "written to a temporary file so it can be downloaded. "
        "No data is transmitted to third parties.*"
    )

if __name__ == "__main__":
    demo.launch()