Upload app.py
Browse files
app.py
ADDED
|
@@ -0,0 +1,261 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import hashlib
import os
import re
import tempfile

import gradio as gr
import pandas as pd
from faker import Faker
|
| 6 |
+
|
| 7 |
+
# ── Example CSV (created at startup, before Blocks definition) ────────────────
|
| 8 |
+
def _create_example():
    """Write the bundled five-row sample dataset to ``example.csv``.

    Nothing is written when a file with that name already exists in the
    current working directory.
    """
    target = "example.csv"
    if not os.path.exists(target):
        columns = {
            "id": range(1, 6),
            "first_name": ["Mario", "Lucia", "Giovanni", "Sofia", "Marco"],
            "last_name": ["Rossi", "Bianchi", "Ferrari", "Esposito", "Romano"],
            "email": [
                "mario.rossi@gmail.com",
                "lucia.b@yahoo.it",
                "g.ferrari@outlook.com",
                "sofia.e@libero.it",
                "m.romano@hotmail.com",
            ],
            "phone": [
                "+39 333 1234567",
                "02 9876543",
                "+39 347 9988776",
                "080 5551234",
                "+39 320 4567890",
            ],
            "city": ["Roma", "Milano", "Napoli", "Bari", "Torino"],
            "notes": ["premium customer", "newsletter yes", "B2B", "trial", "enterprise"],
        }
        pd.DataFrame(columns).to_csv(target, index=False)


# Executed at import time, so the sample file exists before the UI is defined.
_create_example()
|
| 27 |
+
|
| 28 |
+
# ── Available anonymization types ─────────────────────────────────────────────
|
| 29 |
+
# (internal key, label shown in the UI) pairs, in display order.
_ANON_TYPE_PAIRS = (
    ("email", "📧 Email"),
    ("first_name", "👤 First Name"),
    ("last_name", "👤 Last Name"),
    ("full_name", "👤 Full Name"),
    ("phone", "📱 Phone"),
    ("address", "🏠 Address"),
    ("city", "🏙️ City"),
    ("postal_code", "📮 Postal Code"),
    ("tax_id", "🪪 Tax ID"),
    ("date_of_birth", "📅 Date of Birth"),
    ("generic", "🔒 Generic"),
)

# internal key -> UI label
ANON_TYPES = dict(_ANON_TYPE_PAIRS)
# UI label -> internal key (reverse lookup for values coming back from the UI)
TYPE_LABELS = {label: key for key, label in _ANON_TYPE_PAIRS}
# UI labels only, in the same order as ANON_TYPES
TYPE_CHOICES = [label for _key, label in _ANON_TYPE_PAIRS]
|
| 44 |
+
|
| 45 |
+
# ── Automatic type detection ──────────────────────────────────────────────────
|
| 46 |
+
# Column-name hints, evaluated top to bottom; the first type with a hint that
# occurs anywhere in the lower-cased column name wins.
# Order matters because matching is by substring:
#   * "full_name" precedes the single-name types ("nome" is in "nome_completo");
#   * "last_name" precedes "first_name" ("nome" is in "cognome").
# Short hints such as "tel", "cap" or "via" also fire on unrelated names that
# merely contain them (e.g. "hotel"); the user can correct the type in the UI.
_NAME_HINTS = (
    ("email", ("email", "mail", "e-mail")),
    ("phone", ("telefon", "phone", "cell", "mobile", "tel")),
    ("full_name", ("full_name", "fullname", "nome_completo")),
    ("last_name", ("cognome", "surname", "last")),
    ("first_name", ("nome", "first", "given")),
    ("address", ("indirizzo", "address", "via", "street")),
    ("city", ("città", "city", "comune", "citta")),
    ("postal_code", ("cap", "postal", "zip")),
    ("tax_id", ("cf", "codice_fiscale", "fiscal", "tax")),
    ("date_of_birth", ("nascita", "birth", "dob", "birthday")),
)

# Value-based fallbacks, used only when the column name gave no hint.
_VALUE_PATTERNS = (
    ("email", r"^[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}$"),
    ("phone", r"^[\+\d\s\-\(\)]{7,15}$"),
)


def _detect_type(series):
    """Guess the anonymization type of a column.

    The column name is checked first against ``_NAME_HINTS``. If no hint
    matches, up to 200 non-null values are converted to text and tested
    against ``_VALUE_PATTERNS``; a type is chosen when more than half of the
    sampled values match its pattern.

    Args:
        series: The pandas Series holding one column. Its ``name`` may be
            ``None`` or a non-string label.

    Returns:
        One of the ``ANON_TYPES`` keys; ``"generic"`` when nothing matches.
    """
    # Labels are not guaranteed to be strings (e.g. integer headers), so
    # coerce before lower-casing instead of calling .lower() on the raw label.
    raw_name = series.name
    name = "" if raw_name is None else str(raw_name).lower()

    for anon_type, hints in _NAME_HINTS:
        if any(hint in name for hint in hints):
            return anon_type

    sample = series.dropna().astype(str).head(200)
    if len(sample) > 0:
        for anon_type, pattern in _VALUE_PATTERNS:
            if sample.str.match(pattern).mean() > 0.5:
                return anon_type

    return "generic"
|
| 78 |
+
|
| 79 |
+
# ── Fake-data generators (deterministic per value) ────────────────────────────
|
| 80 |
+
# Generators that need a seeded Faker instance, keyed by anonymization type.
# Any type missing from this table (including "generic") gets an
# ``ANONYMIZED_<digits>`` token instead.
_FAKER_GENERATORS = {
    "email": lambda fake: fake.email(),
    "first_name": lambda fake: fake.first_name(),
    "last_name": lambda fake: fake.last_name(),
    "full_name": lambda fake: fake.name(),
    "phone": lambda fake: fake.phone_number(),
    "address": lambda fake: fake.street_address(),
    "city": lambda fake: fake.city(),
    "postal_code": lambda fake: fake.postcode(),
    # Wrapped in a lambda so the helper is looked up at call time.
    "tax_id": lambda fake: _fake_tax_id(fake),
    "date_of_birth": lambda fake: fake.date_of_birth(minimum_age=18, maximum_age=80).strftime("%d/%m/%Y"),
}


def _stable_seed(value):
    """Return a 32-bit integer derived from *value* that is the same in every process."""
    digest = hashlib.sha256(str(value).encode("utf-8")).digest()
    return int.from_bytes(digest[:4], "big")


def _make_fake(value, anon_type, cache):
    """Return the replacement for *value*, creating and caching it on first use.

    The replacement is derived from a SHA-256 digest of the value rather than
    from the built-in ``hash()``, whose result for strings is salted per
    interpreter process and would therefore change between runs.

    Args:
        value: Original cell content, already converted to ``str``.
        anon_type: An ``ANON_TYPES`` key. Unknown keys are treated as
            ``"generic"``.
        cache: Dict mapping original value -> replacement. It is read and
            updated in place so repeated values get the same replacement.

    Returns:
        The replacement string.
    """
    if value in cache:
        return cache[value]

    seed = _stable_seed(value)
    generate = _FAKER_GENERATORS.get(anon_type)
    if generate is None:
        # Generic token: only 100000 distinct suffixes exist, so two different
        # inputs can end up with the same token.
        result = f"ANONYMIZED_{seed % 100000:05d}"
    else:
        fake_local = Faker(["it_IT", "en_US"])
        fake_local.seed_instance(seed)
        # NOTE(review): with two locales configured, confirm that the locale
        # chosen for each call is also governed by seed_instance(); if it is
        # not, results can still differ between runs for the same seed.
        result = generate(fake_local)

    cache[value] = result
    return result
|
| 105 |
+
|
| 106 |
+
def _fake_tax_id(f):
    """Build a 16-character identifier from random upper-case letters and digits.

    Layout: six letters, two digits, one letter, two digits, one letter,
    three digits, one letter. The shape presumably mirrors an Italian
    "codice fiscale"; no check character is computed, so the result is not a
    valid code.

    Args:
        f: Object exposing ``random_choices(elements, length=...)`` that
            returns a sequence of ``length`` items drawn from ``elements``.

    Returns:
        The assembled string.
    """
    letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    digits = "0123456789"
    # (alphabet, how many characters to draw), in output order.
    layout = (
        (letters, 3),
        (letters, 3),
        (digits, 2),
        (letters, 1),
        (digits, 2),
        (letters, 1),
        (digits, 3),
        (letters, 1),
    )
    return "".join(
        "".join(f.random_choices(alphabet, length=count))
        for alphabet, count in layout
    )
|
| 119 |
+
|
| 120 |
+
# ── Main logic ────────────────────────────────────────────────────────────────
|
| 121 |
+
# Placeholder shown in the column editor while no CSV is loaded: one row with
# an empty column name, the last entry of TYPE_CHOICES as its type, and the
# "Anonymize" flag off.
EMPTY_CONFIG = [
    ["", TYPE_CHOICES[-1], False],
]
|
| 122 |
+
|
| 123 |
+
def reset_state():
    """Return the blank UI state used after the uploaded file is cleared.

    Returns:
        A ``(dataframe, column_config, status)`` triple: no DataFrame, the
        placeholder column configuration, and the initial prompt text.
    """
    cleared_frame = None
    prompt = "Upload a CSV file to get started."
    return cleared_frame, EMPTY_CONFIG, prompt
|
| 126 |
+
|
| 127 |
+
def load_csv(file_path):
    """Read an uploaded CSV and propose an anonymization type for each column.

    The file is parsed as UTF-8 first and as Latin-1 if decoding fails. Both
    attempts share the same error handling, so a parser error during the
    Latin-1 retry is reported in the status message instead of escaping.
    Lines the parser cannot handle are skipped (``on_bad_lines="skip"``).

    Args:
        file_path: Path of the uploaded file; falsy when nothing is uploaded.

    Returns:
        A ``(dataframe, column_config, status)`` triple. On success the
        configuration holds one ``[column name, type label, True]`` row per
        column. On failure the DataFrame is ``None`` and the configuration is
        ``EMPTY_CONFIG``.
    """
    if not file_path:
        return None, EMPTY_CONFIG, "Upload a CSV file to get started."

    decode_error = None
    for encoding in ("utf-8", "latin-1"):
        try:
            df = pd.read_csv(file_path, encoding=encoding, on_bad_lines="skip")
        except UnicodeDecodeError as e:
            # Remember the failure and retry with the next encoding.
            decode_error = e
        except Exception as e:
            return None, EMPTY_CONFIG, f"❌ Error reading the file: {e}"
        else:
            break
    else:
        # Every encoding was rejected.
        return None, EMPTY_CONFIG, f"❌ Error reading the file: {decode_error}"

    if df.empty or len(df.columns) == 0:
        return None, EMPTY_CONFIG, "❌ The CSV file looks empty or malformed."

    col_config = [
        [str(c), ANON_TYPES[_detect_type(df[c])], True]
        for c in df.columns
    ]
    msg = (
        f"✅ Loaded: **{len(df)} rows × {len(df.columns)} columns**. "
        f"Review the detected types below and click **Anonymize**."
    )
    return df, col_config, msg
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
def anonymize(df, col_config):
    """Replace the values of the selected columns with fake data.

    Each call writes its result into a freshly created temporary directory.
    A single fixed output path would be shared by all sessions, letting
    concurrent requests overwrite or download each other's file.

    Args:
        df: The loaded DataFrame (``None`` when nothing has been uploaded).
        col_config: Rows of ``[column name, type label, anonymize flag]``,
            given as a list of lists or as a DataFrame. Rows with fewer than
            three items, a falsy flag, an empty name, or a name that matches
            no column are ignored.

    Returns:
        A ``(preview, csv_path, status)`` triple: the first ten rows of the
        result, the path of the written CSV, and a status message. On invalid
        input the preview is an empty DataFrame and the path is ``None``.
    """
    if df is None or not isinstance(df, pd.DataFrame) or df.empty:
        return pd.DataFrame(), None, "❌ Please upload a CSV first."

    # Normalize col_config to a list of [col, type_label, anonymize_bool].
    if isinstance(col_config, pd.DataFrame):
        rows = col_config.values.tolist()
    else:
        rows = list(col_config) if col_config else []

    if not rows:
        return pd.DataFrame(), None, "❌ Column configuration is empty."

    result = df.copy()
    # The configuration carries column names as text (load_csv emits str(c)),
    # so resolve them back to the real labels, which need not be strings.
    columns_by_label = {str(c): c for c in result.columns}
    processed = 0

    for row in rows:
        if len(row) < 3:
            continue
        col_name, type_label, do_anon = row[0], row[1], row[2]
        if not do_anon or not col_name:
            continue
        label = str(col_name)
        if label not in columns_by_label:
            continue
        column = columns_by_label[label]

        anon_type = TYPE_LABELS.get(type_label, "generic")
        # One cache per column: equal values in a column map to one replacement.
        cache = {}
        result[column] = result[column].apply(
            lambda v: _make_fake(str(v), anon_type, cache)
            if pd.notna(v) and str(v).strip() != "" else v
        )
        processed += 1

    # The directory is not removed here: the file has to outlive this call so
    # that it can be offered for download.
    out_dir = tempfile.mkdtemp(prefix="anonymizer_")
    csv_path = os.path.join(out_dir, "anonymized.csv")
    result.to_csv(csv_path, index=False, encoding="utf-8")

    msg = f"✅ Anonymized **{processed} columns** out of {len(df.columns)} total. Download below."
    return result.head(10), csv_path, msg
|
| 190 |
+
|
| 191 |
+
# ── UI ────────────────────────────────────────────────────────────────────────
|
| 192 |
+
# Markdown shown at the top of the page; one list item per line of text.
DESCRIPTION = "\n".join([
    "",
    "# 🔒 CSV Data Anonymizer — GDPR Ready",
    "",
    "Upload a CSV, review the columns detected as sensitive, and download the anonymized version.",
    "The mapping is **deterministic**: the same value always produces the same fake data, preserving dataset consistency.",
    "",
    "✨ Clean, merge, and complete Excel tasks in seconds with [XLclick Add-in for Excel](https://xlclick.com/?so=hface).",
    "",
])
|
| 200 |
+
|
| 201 |
+
# Page layout and event wiring. The nesting below is reconstructed from the
# statement order: upload and status on the left, column editor on the right,
# then the action button, the preview and the download area.
with gr.Blocks(theme=gr.themes.Soft(), title="CSV Data Anonymizer") as demo:
    # Holds the DataFrame returned by load_csv until the button is clicked.
    df_state = gr.State()

    gr.Markdown(DESCRIPTION)

    with gr.Row():
        with gr.Column(scale=1):
            file_input = gr.File(
                label="📂 Upload CSV",
                file_types=[".csv"],
                type="filepath",
            )
            status_box = gr.Markdown("Upload a CSV file to get started.")

        with gr.Column(scale=2):
            col_editor = gr.Dataframe(
                label="⚙️ Column configuration",
                headers=["Column", "Type", "Anonymize"],
                datatype=["str", "str", "bool"],
                value=EMPTY_CONFIG,
                col_count=(3, "fixed"),
                interactive=True,
                wrap=True,
            )

    anon_btn = gr.Button("🔒 Anonymize", variant="primary", size="lg")

    gr.Markdown("### Result preview (first 10 rows)")
    preview_out = gr.Dataframe(label="Anonymized preview", interactive=False, wrap=True)

    download_out = gr.File(label="⬇️ Download anonymized CSV")
    result_status = gr.Markdown("")

    # Upload -> parse the file and pre-fill the column editor.
    file_input.upload(
        fn=load_csv,
        inputs=[file_input],
        outputs=[df_state, col_editor, status_box],
        api_name="load_csv",
    )

    # Clearing the file -> back to the blank state.
    file_input.clear(
        fn=reset_state,
        inputs=[],
        outputs=[df_state, col_editor, status_box],
        api_name="clear",
    )

    # Button -> anonymize with the (possibly edited) column configuration.
    anon_btn.click(
        fn=anonymize,
        inputs=[df_state, col_editor],
        outputs=[preview_out, download_out, result_status],
        api_name="anonymize",
    )

    # The previous footer claimed in-memory processing with nothing saved to
    # disk, but the upload is received as a file path and the result is
    # written to a CSV file for download; the wording now matches the code.
    gr.Markdown(
        "---\n*The uploaded file and the anonymized CSV are stored as files on "
        "this server so they can be processed and downloaded — "
        "no data is transmitted to third parties.*"
    )
|
| 259 |
+
|
| 260 |
+
# Launch the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()
|