fredwebx committed on
Commit
f2ce9fa
·
verified ·
1 Parent(s): df202bc

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -259
app.py DELETED
@@ -1,259 +0,0 @@
1
- import gradio as gr
2
- import pandas as pd
3
- from faker import Faker
4
- import os
5
- import re
6
-
7
- # ── Example CSV (created at startup, before Blocks definition) ────────────────
8
- def _create_example():
9
- if os.path.exists("example.csv"):
10
- return
11
- sample = pd.DataFrame({
12
- "id": range(1, 6),
13
- "first_name": ["Mario", "Lucia", "Giovanni", "Sofia", "Marco"],
14
- "last_name": ["Rossi", "Bianchi", "Ferrari", "Esposito", "Romano"],
15
- "email": ["mario.rossi@gmail.com", "lucia.b@yahoo.it",
16
- "g.ferrari@outlook.com", "sofia.e@libero.it",
17
- "m.romano@hotmail.com"],
18
- "phone": ["+39 333 1234567", "02 9876543", "+39 347 9988776",
19
- "080 5551234", "+39 320 4567890"],
20
- "city": ["Roma", "Milano", "Napoli", "Bari", "Torino"],
21
- "notes": ["premium customer", "newsletter yes", "B2B",
22
- "trial", "enterprise"],
23
- })
24
- sample.to_csv("example.csv", index=False)
25
-
26
# Create the sample CSV at import time, before the UI is defined.
_create_example()

# ── Available anonymization types ─────────────────────────────────────────────
# Internal type key -> label shown to the user. Insertion order matters:
# TYPE_CHOICES keeps it, and other code picks the last entry as a default.
ANON_TYPES = dict(
    [
        ("email", "📧 Email"),
        ("first_name", "👤 First Name"),
        ("last_name", "👤 Last Name"),
        ("full_name", "👤 Full Name"),
        ("phone", "📱 Phone"),
        ("address", "🏠 Address"),
        ("city", "🏙️ City"),
        ("postal_code", "📮 Postal Code"),
        ("tax_id", "🪪 Tax ID"),
        ("date_of_birth", "📅 Date of Birth"),
        ("generic", "🔒 Generic"),
    ]
)
# Reverse lookup: user-facing label -> internal type key.
TYPE_LABELS = {label: key for key, label in ANON_TYPES.items()}
# Labels only, in declaration order.
TYPE_CHOICES = [*ANON_TYPES.values()]
44
-
45
- # ── Automatic type detection ──────────────────────────────────────────────────
46
- def _detect_type(series):
47
- name = (series.name or "").lower()
48
- sample = series.dropna().astype(str).head(200)
49
-
50
- if any(k in name for k in ("email", "mail", "e-mail")):
51
- return "email"
52
- if any(k in name for k in ("telefon", "phone", "cell", "mobile", "tel")):
53
- return "phone"
54
- if any(k in name for k in ("full_name", "fullname", "nome_completo")):
55
- return "full_name"
56
- if any(k in name for k in ("nome", "first", "given")):
57
- return "first_name"
58
- if any(k in name for k in ("cognome", "surname", "last")):
59
- return "last_name"
60
- if any(k in name for k in ("indirizzo", "address", "via", "street")):
61
- return "address"
62
- if any(k in name for k in ("città", "city", "comune", "citta")):
63
- return "city"
64
- if any(k in name for k in ("cap", "postal", "zip")):
65
- return "postal_code"
66
- if any(k in name for k in ("cf", "codice_fiscale", "fiscal", "tax")):
67
- return "tax_id"
68
- if any(k in name for k in ("nascita", "birth", "dob", "birthday")):
69
- return "date_of_birth"
70
-
71
- if len(sample) > 0:
72
- if sample.str.match(r"^[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}$").mean() > 0.5:
73
- return "email"
74
- if sample.str.match(r"^[\+\d\s\-\(\)]{7,15}$").mean() > 0.5:
75
- return "phone"
76
-
77
- return "generic"
78
-
79
- # ── Fake-data generators (deterministic per value) ────────────────────────────
80
- def _make_fake(value, anon_type, cache):
81
- if value in cache:
82
- return cache[value]
83
-
84
- seed = abs(hash(value)) % (2**32)
85
- fake_local = Faker(["it_IT", "en_US"])
86
- fake_local.seed_instance(seed)
87
-
88
- generators = {
89
- "email": lambda: fake_local.email(),
90
- "first_name": lambda: fake_local.first_name(),
91
- "last_name": lambda: fake_local.last_name(),
92
- "full_name": lambda: fake_local.name(),
93
- "phone": lambda: fake_local.phone_number(),
94
- "address": lambda: fake_local.street_address(),
95
- "city": lambda: fake_local.city(),
96
- "postal_code": lambda: fake_local.postcode(),
97
- "tax_id": lambda: _fake_tax_id(fake_local),
98
- "date_of_birth": lambda: fake_local.date_of_birth(minimum_age=18, maximum_age=80).strftime("%d/%m/%Y"),
99
- "generic": lambda: f"ANONYMIZED_{abs(hash(value)) % 100000:05d}",
100
- }
101
-
102
- result = generators.get(anon_type, generators["generic"])()
103
- cache[value] = result
104
- return result
105
-
106
- def _fake_tax_id(f):
107
- letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
108
- digits = "0123456789"
109
- return (
110
- "".join(f.random_choices(letters, length=3))
111
- + "".join(f.random_choices(letters, length=3))
112
- + "".join(f.random_choices(digits, length=2))
113
- + f.random_choices(letters, length=1)[0]
114
- + "".join(f.random_choices(digits, length=2))
115
- + f.random_choices(letters, length=1)[0]
116
- + "".join(f.random_choices(digits, length=3))
117
- + f.random_choices(letters, length=1)[0]
118
- )
119
-
120
- # ── Main logic ────────────────────────────────────────────────────────────────
121
# Placeholder column configuration: one row with an empty column name, the
# last entry of TYPE_CHOICES as its type, and the anonymize flag off.
_blank_config_row = ["", TYPE_CHOICES[-1], False]
EMPTY_CONFIG = [_blank_config_row]


def reset_state():
    """Return the cleared (DataFrame state, column config, status) triple.

    Wired to the file input's clear event: the stored DataFrame becomes
    ``None``, the editor goes back to ``EMPTY_CONFIG`` and the status shows
    the initial prompt.
    """
    cleared_df = None
    prompt = "Upload a CSV file to get started."
    return cleared_df, EMPTY_CONFIG, prompt
126
-
127
def load_csv(file_path):
    """Load a CSV and propose an anonymization type for every column.

    The file is read as UTF-8 first and as Latin-1 if that fails to decode.
    Both attempts share the same error handling, so a parse error during the
    Latin-1 retry is reported in the status message instead of propagating.

    Args:
        file_path: Path of the uploaded CSV; a falsy value means no file.

    Returns:
        A ``(df, col_config, status)`` tuple. ``df`` is the loaded DataFrame
        (``None`` on failure), ``col_config`` is a list of
        ``[column name, type label, True]`` rows (``EMPTY_CONFIG`` on
        failure) and ``status`` is a Markdown message.
    """
    if not file_path:
        return None, EMPTY_CONFIG, "Upload a CSV file to get started."

    decode_error = None
    for encoding in ("utf-8", "latin-1"):
        try:
            # on_bad_lines="skip" drops rows pandas rejects as bad lines.
            df = pd.read_csv(file_path, encoding=encoding, on_bad_lines="skip")
        except UnicodeDecodeError as e:
            decode_error = e
            continue
        except Exception as e:
            return None, EMPTY_CONFIG, f"❌ Error reading the file: {e}"
        break
    else:
        # Every encoding failed to decode the file.
        return None, EMPTY_CONFIG, f"❌ Error reading the file: {decode_error}"

    if df.empty or len(df.columns) == 0:
        return None, EMPTY_CONFIG, "❌ The CSV file looks empty or malformed."

    # Every column starts out selected for anonymization (third field True).
    col_config = [
        [str(c), ANON_TYPES[_detect_type(df[c])], True]
        for c in df.columns
    ]
    msg = (
        f"✅ Loaded: **{len(df)} rows × {len(df.columns)} columns**. "
        f"Review the detected types below and click **Anonymize**."
    )
    return df, col_config, msg
151
-
152
-
153
def anonymize(df, col_config):
    """Anonymize the selected columns of *df* and write the result to a CSV.

    Args:
        df: The loaded DataFrame; ``None``, a non-DataFrame or an empty frame
            yields an error status.
        col_config: DataFrame or list of ``[column name, type label,
            anonymize flag]`` rows. Rows with fewer than three fields, a
            falsy flag, an empty name or a name missing from *df* are
            skipped.

    Returns:
        A ``(preview, csv_path, status)`` tuple: the first 10 rows of the
        anonymized frame, the path of the written CSV and a Markdown status.
        On error the preview is an empty DataFrame and the path is ``None``.
    """
    import tempfile  # local import: the file's import block does not have it

    if df is None or not isinstance(df, pd.DataFrame) or df.empty:
        return pd.DataFrame(), None, "❌ Please upload a CSV first."

    # Normalize col_config to a list of [col, type_label, anonymize_bool].
    if isinstance(col_config, pd.DataFrame):
        rows = col_config.values.tolist()
    elif col_config is None:
        rows = []
    else:
        # list() first, so array-like inputs are never truth-tested directly.
        rows = list(col_config)

    if not rows:
        return pd.DataFrame(), None, "❌ Column configuration is empty."

    result = df.copy()
    processed = 0

    for row in rows:
        if len(row) < 3:
            continue
        col_name, type_label, do_anon = row[0], row[1], row[2]
        if not do_anon or not col_name or col_name not in result.columns:
            continue

        anon_type = TYPE_LABELS.get(type_label, "generic")
        # One cache per column: equal values in a column map to the same fake.
        cache = {}

        def _replace(cell, anon_type=anon_type, cache=cache):
            # Missing and blank cells are passed through unchanged.
            if pd.isna(cell) or str(cell).strip() == "":
                return cell
            return _make_fake(str(cell), anon_type, cache)

        result[col_name] = result[col_name].apply(_replace)
        processed += 1

    # A fresh directory per call stops concurrent requests from overwriting
    # each other's output while keeping the download name "anonymized.csv".
    # NOTE: these directories are not removed here.
    out_dir = tempfile.mkdtemp(prefix="anonymized_")
    csv_path = os.path.join(out_dir, "anonymized.csv")
    result.to_csv(csv_path, index=False, encoding="utf-8")

    msg = f"✅ Anonymized **{processed} columns** out of {len(df.columns)} total. Download below."
    return result.head(10), csv_path, msg
190
-
191
- # ── UI ────────────────────────────────────────────────────────────────────────
192
# Markdown shown at the top of the page.
DESCRIPTION = """
# 🔒 CSV Data Anonymizer — GDPR Ready

Upload a CSV, review the columns detected as sensitive, and download the anonymized version.
The mapping is **deterministic**: the same value always produces the same fake data, preserving dataset consistency.
"""

with gr.Blocks(theme=gr.themes.Soft(), title="CSV Data Anonymizer") as demo:
    # Holds the loaded DataFrame between the upload and anonymize events.
    df_state = gr.State()

    gr.Markdown(DESCRIPTION)

    with gr.Row():
        with gr.Column(scale=1):
            file_input = gr.File(
                label="📂 Upload CSV",
                file_types=[".csv"],
                type="filepath",
            )
            status_box = gr.Markdown("Upload a CSV file to get started.")

        with gr.Column(scale=2):
            # Editable table of [column name, type label, anonymize flag] rows.
            col_editor = gr.Dataframe(
                label="⚙️ Column configuration",
                headers=["Column", "Type", "Anonymize"],
                datatype=["str", "str", "bool"],
                value=EMPTY_CONFIG,
                col_count=(3, "fixed"),
                interactive=True,
                wrap=True,
            )

    anon_btn = gr.Button("🔒 Anonymize", variant="primary", size="lg")

    gr.Markdown("### Result preview (first 10 rows)")
    preview_out = gr.Dataframe(label="Anonymized preview", interactive=False, wrap=True)

    download_out = gr.File(label="⬇️ Download anonymized CSV")
    result_status = gr.Markdown("")

    # Upload: parse the CSV and fill the state, the editor and the status.
    file_input.upload(
        fn=load_csv,
        inputs=[file_input],
        outputs=[df_state, col_editor, status_box],
        api_name="load_csv",
    )

    # Clear: reset the same three outputs to their initial values.
    file_input.clear(
        fn=reset_state,
        inputs=[],
        outputs=[df_state, col_editor, status_box],
        api_name="clear",
    )

    # Anonymize: produce the preview, the downloadable file and a status.
    anon_btn.click(
        fn=anonymize,
        inputs=[df_state, col_editor],
        outputs=[preview_out, download_out, result_status],
        api_name="anonymize",
    )

    # The footer must not promise "nothing is saved to disk": the anonymized
    # CSV is written to a file so that it can be offered for download.
    gr.Markdown(
        "---\n*Your data is not transmitted to third parties. "
        "The anonymized CSV is written to a temporary file on the server "
        "so that it can be downloaded.*"
    )

if __name__ == "__main__":
    demo.launch()