Spaces:

fredwebx
/

CSV-Data-Anonymizer-for-GDPR

Sleeping

App Files Files Community

fredwebx committed on May 26

Commit

aecd0c4

verified ·

1 Parent(s): f2ce9fa

Upload app.py

Browse files

Files changed (1) hide show

app.py +261 -0

app.py ADDED Viewed

	@@ -0,0 +1,261 @@

+import hashlib
+import os
+import re
+import tempfile
+
+import gradio as gr
+import pandas as pd
+from faker import Faker
+# ── Example CSV (created at startup, before Blocks definition) ────────────────
+def _create_example():
+    """Write the demo file ``example.csv`` unless it already exists.
+
+    The file holds five fictitious Italian customers and is written to the
+    current working directory without an index column.
+    """
+    target = "example.csv"
+    if not os.path.exists(target):
+        columns = {
+            "id": range(1, 6),
+            "first_name": ["Mario", "Lucia", "Giovanni", "Sofia", "Marco"],
+            "last_name": ["Rossi", "Bianchi", "Ferrari", "Esposito", "Romano"],
+            "email": [
+                "mario.rossi@gmail.com",
+                "lucia.b@yahoo.it",
+                "g.ferrari@outlook.com",
+                "sofia.e@libero.it",
+                "m.romano@hotmail.com",
+            ],
+            "phone": [
+                "+39 333 1234567",
+                "02 9876543",
+                "+39 347 9988776",
+                "080 5551234",
+                "+39 320 4567890",
+            ],
+            "city": ["Roma", "Milano", "Napoli", "Bari", "Torino"],
+            "notes": [
+                "premium customer",
+                "newsletter yes",
+                "B2B",
+                "trial",
+                "enterprise",
+            ],
+        }
+        pd.DataFrame(columns).to_csv(target, index=False)
+# Executed at import time, before the Blocks UI is defined.
+_create_example()
+# ── Available anonymization types ─────────────────────────────────────────────
+# Internal type key -> label shown in the column-configuration table.
+ANON_TYPES = dict(
+    email="📧 Email",
+    first_name="👤 First Name",
+    last_name="👤 Last Name",
+    full_name="👤 Full Name",
+    phone="📱 Phone",
+    address="🏠 Address",
+    city="🏙️ City",
+    postal_code="📮 Postal Code",
+    tax_id="🪪 Tax ID",
+    date_of_birth="📅 Date of Birth",
+    generic="🔒 Generic",
+)
+# Reverse lookup: displayed label -> internal type key.
+TYPE_LABELS = {label: key for key, label in ANON_TYPES.items()}
+# Labels in declaration order; the last entry is the generic fallback.
+TYPE_CHOICES = [*ANON_TYPES.values()]
+# ── Automatic type detection ──────────────────────────────────────────────────
+# Ordered (keywords, type) rules for header-based detection. The first rule
+# with a keyword contained in the lower-cased column name wins. The surname
+# rule must precede the first-name rule because "cognome" contains "nome".
+_NAME_RULES = (
+    (("email", "mail", "e-mail"), "email"),
+    (("telefon", "phone", "cell", "mobile", "tel"), "phone"),
+    (("full_name", "fullname", "nome_completo"), "full_name"),
+    (("cognome", "surname", "last"), "last_name"),
+    (("nome", "first", "given"), "first_name"),
+    (("indirizzo", "address", "via", "street"), "address"),
+    (("città", "city", "comune", "citta"), "city"),
+    (("cap", "postal", "zip"), "postal_code"),
+    (("cf", "codice_fiscale", "fiscal", "tax"), "tax_id"),
+    (("nascita", "birth", "dob", "birthday"), "date_of_birth"),
+)
+# Value patterns, consulted only when the header gives no hint.
+_EMAIL_PATTERN = r"^[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}$"
+_PHONE_PATTERN = r"^[\+\d\s\-\(\)]{7,15}$"
+def _detect_type(series):
+    """Guess the anonymization type key for one column.
+
+    The column name is checked first against ``_NAME_RULES`` (substring
+    match, case-insensitive). If no keyword matches, up to 200 non-null
+    values are tested: a column is an email or phone column when more than
+    half of the sampled values match the corresponding pattern.
+
+    Args:
+        series: The pandas Series to classify; ``series.name`` may be
+            ``None`` or a non-string label.
+
+    Returns:
+        One of the type keys ("email", "phone", ...), or "generic" when
+        nothing matches.
+    """
+    # Column labels are not guaranteed to be strings (e.g. integer headers).
+    name = "" if series.name is None else str(series.name).lower()
+    for keywords, anon_type in _NAME_RULES:
+        if any(keyword in name for keyword in keywords):
+            return anon_type
+    sample = series.dropna().astype(str).head(200)
+    if not sample.empty:
+        if sample.str.match(_EMAIL_PATTERN).mean() > 0.5:
+            return "email"
+        if sample.str.match(_PHONE_PATTERN).mean() > 0.5:
+            return "phone"
+    return "generic"
+# ── Fake-data generators (deterministic per value) ────────────────────────────
+def _make_fake(value, anon_type, cache):
+    """Return the fake replacement for ``value``, memoized in ``cache``.
+
+    The replacement is derived from a SHA-256 digest of the value rather
+    than the built-in ``hash()``: string hashes are randomized per Python
+    process, so they would yield a different mapping after every restart.
+
+    Args:
+        value: The original cell value (callers pass it as a string).
+        anon_type: Type key such as "email" or "tax_id"; unknown keys are
+            treated as "generic".
+        cache: Dict mapping already-seen values to their replacement; it is
+            updated in place.
+
+    Returns:
+        The replacement string.
+    """
+    if value in cache:
+        return cache[value]
+    digest = hashlib.sha256(str(value).encode("utf-8")).digest()
+    # 32-bit seed, same range as the previous ``% 2**32`` reduction.
+    seed = int.from_bytes(digest[:4], "big")
+    # Callables take the seeded Faker; lambdas defer every name lookup.
+    generators = {
+        "email":         lambda f: f.email(),
+        "first_name":    lambda f: f.first_name(),
+        "last_name":     lambda f: f.last_name(),
+        "full_name":     lambda f: f.name(),
+        "phone":         lambda f: f.phone_number(),
+        "address":       lambda f: f.street_address(),
+        "city":          lambda f: f.city(),
+        "postal_code":   lambda f: f.postcode(),
+        "tax_id":        lambda f: _fake_tax_id(f),
+        "date_of_birth": lambda f: f.date_of_birth(minimum_age=18, maximum_age=80).strftime("%d/%m/%Y"),
+    }
+    generator = generators.get(anon_type)
+    if generator is None:
+        # "generic" and unknown types need no Faker instance at all.
+        result = f"ANONYMIZED_{seed % 100000:05d}"
+    else:
+        fake_local = Faker(["it_IT", "en_US"])
+        fake_local.seed_instance(seed)
+        # NOTE(review): with several locales Faker may pick the locale via
+        # its shared RNG rather than this per-instance seed — confirm
+        # against the Faker seeding docs if cross-run stability matters.
+        result = generator(fake_local)
+    cache[value] = result
+    return result
+def _fake_tax_id(f):
+    """Build a random 16-character code: 6 letters, 2 digits, letter, 2 digits, letter, 3 digits, letter.
+
+    ``f`` must provide ``random_choices(elements, length=...)``; it is
+    called once per segment, in order, so a seeded generator yields a
+    reproducible code.
+    """
+    upper = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+    numerals = "0123456789"
+    # (alphabet, segment length) in output order.
+    layout = (
+        (upper, 3),
+        (upper, 3),
+        (numerals, 2),
+        (upper, 1),
+        (numerals, 2),
+        (upper, 1),
+        (numerals, 3),
+        (upper, 1),
+    )
+    segments = []
+    for alphabet, count in layout:
+        segments.append("".join(f.random_choices(alphabet, length=count)))
+    return "".join(segments)
+# ── Main logic ────────────────────────────────────────────────────────────────
+# Placeholder table: one blank row whose type is the last (generic) choice.
+EMPTY_CONFIG = [["", TYPE_CHOICES[-1], False]]
+def reset_state():
+    """Return the cleared (DataFrame state, column config, status) triple.
+
+    Used when the user removes the uploaded file.
+    """
+    cleared_df = None
+    prompt = "Upload a CSV file to get started."
+    return cleared_df, EMPTY_CONFIG, prompt
+def load_csv(file_path):
+    """Load an uploaded CSV and propose an anonymization type per column.
+
+    The file is decoded as UTF-8 first and as Latin-1 if that fails;
+    malformed lines are skipped. Any read error — including one raised
+    during the Latin-1 retry — is reported through the status message
+    instead of propagating.
+
+    Args:
+        file_path: Path of the uploaded file, or a falsy value when nothing
+            is uploaded.
+
+    Returns:
+        A ``(df, col_config, status)`` tuple. ``df`` is the loaded
+        DataFrame or ``None``; ``col_config`` is a list of
+        ``[column, type_label, anonymize]`` rows (``EMPTY_CONFIG`` on
+        failure); ``status`` is a Markdown message.
+    """
+    if not file_path:
+        return None, EMPTY_CONFIG, "Upload a CSV file to get started."
+    df = None
+    for encoding in ("utf-8", "latin-1"):
+        try:
+            df = pd.read_csv(file_path, encoding=encoding, on_bad_lines="skip")
+        except UnicodeDecodeError:
+            # Try the next, more permissive encoding.
+            continue
+        except Exception as e:
+            # UI boundary: surface parser/IO problems as a status message.
+            return None, EMPTY_CONFIG, f"❌ Error reading the file: {e}"
+        break
+    if df is None or df.empty or len(df.columns) == 0:
+        return None, EMPTY_CONFIG, "❌ The CSV file looks empty or malformed."
+    # Every column starts with its detected type and "Anonymize" ticked.
+    col_config = [
+        [str(c), ANON_TYPES[_detect_type(df[c])], True]
+        for c in df.columns
+    ]
+    msg = (
+        f"✅ Loaded: **{len(df)} rows × {len(df.columns)} columns**. "
+        f"Review the detected types below and click **Anonymize**."
+    )
+    return df, col_config, msg
+def anonymize(df, col_config):
+    """Anonymize the selected columns of ``df`` and write the result to a CSV.
+
+    Args:
+        df: The loaded DataFrame (or ``None`` when nothing is loaded).
+        col_config: Rows of ``[column, type_label, anonymize]`` as a
+            list-of-lists or a DataFrame. Rows with fewer than three cells,
+            an unticked flag, an empty name or an unknown column are skipped.
+
+    Returns:
+        A ``(preview, csv_path, status)`` tuple: the first 10 anonymized
+        rows, the path of the full CSV, and a Markdown status message. On
+        invalid input the preview is an empty DataFrame and the path is
+        ``None``.
+    """
+    if df is None or not isinstance(df, pd.DataFrame) or df.empty:
+        return pd.DataFrame(), None, "❌ Please upload a CSV first."
+    # Normalize col_config to list of [col, type_label, anonymize_bool]
+    if isinstance(col_config, pd.DataFrame):
+        rows = col_config.values.tolist()
+    else:
+        rows = list(col_config) if col_config else []
+    if not rows:
+        return pd.DataFrame(), None, "❌ Column configuration is empty."
+    result = df.copy()
+    processed = 0
+    for row in rows:
+        if len(row) < 3:
+            continue
+        col_name, type_label, do_anon = row[0], row[1], row[2]
+        if not do_anon or not col_name or col_name not in result.columns:
+            continue
+        anon_type = TYPE_LABELS.get(type_label, "generic")
+        # One cache per column: equal values map to the same replacement.
+        cache = {}
+        # Missing and blank cells are left untouched.
+        result[col_name] = result[col_name].apply(
+            lambda v: _make_fake(str(v), anon_type, cache)
+            if pd.notna(v) and str(v).strip() != "" else v
+        )
+        processed += 1
+    # A unique file per call: a fixed shared path would let concurrent
+    # sessions overwrite, and then download, each other's data.
+    with tempfile.NamedTemporaryFile(
+        mode="w",
+        prefix="anonymized_",
+        suffix=".csv",
+        delete=False,
+        encoding="utf-8",
+        newline="",
+    ) as handle:
+        result.to_csv(handle, index=False)
+        csv_path = handle.name
+    msg = f"✅ Anonymized **{processed} columns** out of {len(df.columns)} total. Download below."
+    return result.head(10), csv_path, msg
+# ── UI ────────────────────────────────────────────────────────────────────────
+# Markdown shown at the top of the page.
+DESCRIPTION = """
+# 🔒 CSV Data Anonymizer — GDPR Ready
+Upload a CSV, review the columns detected as sensitive, and download the anonymized version.
+The mapping is **deterministic**: the same value always produces the same fake data, preserving dataset consistency.
+✨ Clean, merge, and complete Excel tasks in seconds with [XLclick Add-in for Excel](https://xlclick.com/?so=hface).
+"""
+with gr.Blocks(theme=gr.themes.Soft(), title="CSV Data Anonymizer") as demo:
+    # Holds the loaded DataFrame between the upload and anonymize events.
+    df_state = gr.State()
+    gr.Markdown(DESCRIPTION)
+    with gr.Row():
+        with gr.Column(scale=1):
+            file_input = gr.File(
+                label="📂 Upload CSV",
+                file_types=[".csv"],
+                type="filepath",
+            )
+            status_box = gr.Markdown("Upload a CSV file to get started.")
+        with gr.Column(scale=2):
+            # Editable table: one row per CSV column (name, type, flag).
+            col_editor = gr.Dataframe(
+                label="⚙️ Column configuration",
+                headers=["Column", "Type", "Anonymize"],
+                datatype=["str", "str", "bool"],
+                value=EMPTY_CONFIG,
+                col_count=(3, "fixed"),
+                interactive=True,
+                wrap=True,
+            )
+    anon_btn = gr.Button("🔒 Anonymize", variant="primary", size="lg")
+    gr.Markdown("### Result preview (first 10 rows)")
+    preview_out = gr.Dataframe(label="Anonymized preview", interactive=False, wrap=True)
+    download_out = gr.File(label="⬇️ Download anonymized CSV")
+    result_status = gr.Markdown("")
+    # Upload -> parse the CSV and pre-fill the column configuration.
+    file_input.upload(
+        fn=load_csv,
+        inputs=[file_input],
+        outputs=[df_state, col_editor, status_box],
+        api_name="load_csv",
+    )
+    # Clearing the file resets state, table and status text.
+    file_input.clear(
+        fn=reset_state,
+        inputs=[],
+        outputs=[df_state, col_editor, status_box],
+        api_name="clear",
+    )
+    # Button -> anonymize using the (possibly edited) configuration.
+    anon_btn.click(
+        fn=anonymize,
+        inputs=[df_state, col_editor],
+        outputs=[preview_out, download_out, result_status],
+        api_name="anonymize",
+    )
+    # The uploaded file arrives as a path on disk and the anonymized CSV is
+    # written to a file for download, so the notice must not claim
+    # in-memory-only processing.
+    gr.Markdown(
+        "---\n*Uploaded and anonymized files are stored as temporary files "
+        "on the server running this app so they can be processed and "
+        "downloaded. The app itself does not send your data to third parties.*"
+    )
+if __name__ == "__main__":
+    demo.launch()