import hashlib
import os
import re
import tempfile

import gradio as gr
import pandas as pd
from faker import Faker


# ── Example CSV (created at startup, before Blocks definition) ────────────────
def _create_example():
    """Write a small demo ``example.csv`` in the working directory if absent."""
    if os.path.exists("example.csv"):
        return
    sample = pd.DataFrame({
        "id": range(1, 6),
        "first_name": ["Mario", "Lucia", "Giovanni", "Sofia", "Marco"],
        "last_name": ["Rossi", "Bianchi", "Ferrari", "Esposito", "Romano"],
        "email": [
            "mario.rossi@gmail.com",
            "lucia.b@yahoo.it",
            "g.ferrari@outlook.com",
            "sofia.e@libero.it",
            "m.romano@hotmail.com",
        ],
        "phone": [
            "+39 333 1234567",
            "02 9876543",
            "+39 347 9988776",
            "080 5551234",
            "+39 320 4567890",
        ],
        "city": ["Roma", "Milano", "Napoli", "Bari", "Torino"],
        "notes": ["premium customer", "newsletter yes", "B2B", "trial", "enterprise"],
    })
    sample.to_csv("example.csv", index=False)


_create_example()


# ── Available anonymization types ─────────────────────────────────────────────
# Internal type key -> label shown in the column-configuration table.
ANON_TYPES = {
    "email": "📧 Email",
    "first_name": "👤 First Name",
    "last_name": "👤 Last Name",
    "full_name": "👤 Full Name",
    "phone": "📱 Phone",
    "address": "🏠 Address",
    "city": "🏙️ City",
    "postal_code": "📮 Postal Code",
    "tax_id": "🪪 Tax ID",
    "date_of_birth": "📅 Date of Birth",
    "generic": "🔒 Generic",
}
# Reverse lookup: UI label -> internal type key.
TYPE_LABELS = {v: k for k, v in ANON_TYPES.items()}
TYPE_CHOICES = list(ANON_TYPES.values())


# ── Automatic type detection ──────────────────────────────────────────────────
# Ordered (type, keywords) rules matched as substrings of the lower-cased
# column name; the first rule that matches wins, so order matters.
_NAME_RULES = (
    ("email", ("email", "mail", "e-mail")),
    ("phone", ("telefon", "phone", "cell", "mobile", "tel")),
    ("full_name", ("full_name", "fullname", "nome_completo")),
    # "cognome" contains "nome", so last-name keywords must be tested before
    # the first-name ones or every "cognome" column is labelled first name.
    ("last_name", ("cognome", "surname", "last")),
    ("first_name", ("nome", "first", "given")),
    ("address", ("indirizzo", "address", "via", "street")),
    ("city", ("città", "city", "comune", "citta")),
    ("postal_code", ("cap", "postal", "zip")),
    ("tax_id", ("cf", "codice_fiscale", "fiscal", "tax")),
    ("date_of_birth", ("nascita", "birth", "dob", "birthday")),
)
_EMAIL_PATTERN = r"^[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}$"
_PHONE_PATTERN = r"^[\+\d\s\-\(\)]{7,15}$"


def _detect_type(series):
    """Guess the anonymization type key for a column.

    The column name is checked against ``_NAME_RULES`` first. If no keyword
    matches, up to 200 non-null values are tested against the email and phone
    patterns, and a type is chosen when more than half of them match.

    Args:
        series: pandas Series holding one column.

    Returns:
        A key of ``ANON_TYPES``; ``"generic"`` when nothing matches.
    """
    # str() guards against non-string column labels, which have no .lower().
    name = "" if series.name is None else str(series.name).lower()
    for anon_type, keywords in _NAME_RULES:
        if any(k in name for k in keywords):
            return anon_type

    sample = series.dropna().astype(str).head(200)
    if len(sample) > 0:
        if sample.str.match(_EMAIL_PATTERN).mean() > 0.5:
            return "email"
        if sample.str.match(_PHONE_PATTERN).mean() > 0.5:
            return "phone"
    return "generic"


# ── Fake-data generators (deterministic per value) ────────────────────────────
_LOCALES = ["it_IT", "en_US"]


def _stable_seed(value):
    """Return a 32-bit seed derived from ``value`` via SHA-256.

    The built-in ``hash()`` is salted per process for strings, so it cannot
    give the same seed for the same value across application restarts.
    """
    digest = hashlib.sha256(str(value).encode("utf-8", "surrogatepass")).digest()
    return int.from_bytes(digest[:4], "big")


def _fake_tax_id(f):
    """Build a 16-character tax-id-like string from the seeded generator ``f``.

    Layout: 3 letters, 3 letters, 2 digits, 1 letter, 2 digits, 1 letter,
    3 digits, 1 letter.
    """
    letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    digits = "0123456789"
    layout = (
        (letters, 3),
        (letters, 3),
        (digits, 2),
        (letters, 1),
        (digits, 2),
        (letters, 1),
        (digits, 3),
        (letters, 1),
    )
    return "".join(
        "".join(f.random_choices(alphabet, length=count))
        for alphabet, count in layout
    )


# Type key -> callable producing a fake value from an already-seeded Faker.
_FAKER_GENERATORS = {
    "email": lambda f: f.email(),
    "first_name": lambda f: f.first_name(),
    "last_name": lambda f: f.last_name(),
    "full_name": lambda f: f.name(),
    "phone": lambda f: f.phone_number(),
    "address": lambda f: f.street_address(),
    "city": lambda f: f.city(),
    "postal_code": lambda f: f.postcode(),
    "tax_id": _fake_tax_id,
    "date_of_birth": lambda f: f.date_of_birth(
        minimum_age=18, maximum_age=80
    ).strftime("%d/%m/%Y"),
}


def _make_fake(value, anon_type, cache, fake=None):
    """Return the replacement for ``value``, memoised in ``cache``.

    Args:
        value: Original cell value as a string.
        anon_type: Key of ``ANON_TYPES``; unknown keys are treated as
            ``"generic"``.
        cache: Dict mapping original value -> replacement. It is read and
            updated in place so repeated values map to the same replacement.
        fake: Optional ``Faker`` instance to reuse. It is re-seeded on every
            call. When omitted, a new instance is created on demand, which is
            slow if done for every value.

    Returns:
        The replacement string.
    """
    if value in cache:
        return cache[value]

    seed = _stable_seed(value)
    generator = _FAKER_GENERATORS.get(anon_type)
    if generator is None:
        # "generic" and unknown types need no Faker instance at all.
        result = f"ANONYMIZED_{seed % 100000:05d}"
    else:
        if fake is None:
            fake = Faker(_LOCALES)
        # NOTE(review): with several locales the Faker proxy chooses which
        # locale generator serves each call; confirm that this choice is
        # reproducible under seed_instance() — TODO verify.
        fake.seed_instance(seed)
        result = generator(fake)

    cache[value] = result
    return result


# ── Main logic ────────────────────────────────────────────────────────────────
EMPTY_CONFIG = [["", TYPE_CHOICES[-1], False]]
_INITIAL_STATUS = "Upload a CSV file to get started."


def reset_state():
    """Reset state when the user clears the uploaded file."""
    return None, EMPTY_CONFIG, _INITIAL_STATUS


def load_csv(file_path):
    """Load a CSV and propose a per-column anonymization configuration.

    Args:
        file_path: Path of the uploaded file; falsy when nothing is uploaded.

    Returns:
        Tuple ``(df, col_config, status)``. ``df`` is the loaded DataFrame, or
        ``None`` on failure. ``col_config`` is a list of
        ``[column, type label, anonymize flag]`` rows. ``status`` is a
        Markdown message for the UI.
    """
    if not file_path:
        return None, EMPTY_CONFIG, _INITIAL_STATUS

    try:
        try:
            df = pd.read_csv(file_path, encoding="utf-8", on_bad_lines="skip")
        except UnicodeDecodeError:
            # The retry lives inside the outer try so that a failure of the
            # latin-1 read is reported to the user as well, not raised.
            df = pd.read_csv(file_path, encoding="latin-1", on_bad_lines="skip")
    except Exception as e:  # UI boundary: report any read failure as a message
        return None, EMPTY_CONFIG, f"❌ Error reading the file: {e}"

    if df.empty or len(df.columns) == 0:
        return None, EMPTY_CONFIG, "❌ The CSV file looks empty or malformed."

    col_config = [
        [str(c), ANON_TYPES[_detect_type(df[c])], True]
        for c in df.columns
    ]
    msg = (
        f"✅ Loaded: **{len(df)} rows × {len(df.columns)} columns**. "
        "Review the detected types below and click **Anonymize**."
    )
    return df, col_config, msg


def _is_checked(flag):
    """Interpret an "Anonymize" cell value as a boolean.

    Strings are parsed so that ``"false"`` is not treated as truthy; ``None``
    and NaN count as unchecked.
    """
    if isinstance(flag, str):
        return flag.strip().lower() in {"true", "1", "yes"}
    if flag is None or (isinstance(flag, float) and flag != flag):
        return False
    return bool(flag)


def anonymize(df, col_config):
    """Anonymize ``df`` according to ``col_config``.

    Args:
        df: DataFrame produced by ``load_csv``.
        col_config: List-of-lists or DataFrame whose rows are
            ``[column, type label, anonymize flag]``.

    Returns:
        Tuple ``(preview, csv_path, status)``. ``preview`` is the first 10
        anonymized rows. ``csv_path`` is the path of the written CSV, or
        ``None`` on failure. ``status`` is a Markdown message.
    """
    if df is None or not isinstance(df, pd.DataFrame) or df.empty:
        return pd.DataFrame(), None, "❌ Please upload a CSV first."

    # Normalize col_config to list of [col, type_label, anonymize_bool]
    if isinstance(col_config, pd.DataFrame):
        rows = col_config.values.tolist()
    else:
        rows = list(col_config) if col_config else []
    if not rows:
        return pd.DataFrame(), None, "❌ Column configuration is empty."

    result = df.copy()
    fake = Faker(_LOCALES)  # built once; _make_fake re-seeds it per value
    processed = 0
    for row in rows:
        if len(row) < 3:
            continue
        col_name, type_label, do_anon = row[0], row[1], row[2]
        if not _is_checked(do_anon) or not col_name or col_name not in result.columns:
            continue

        anon_type = TYPE_LABELS.get(type_label, "generic")
        cache = {}

        def _replace(v, anon_type=anon_type, cache=cache):
            # Missing and blank cells are left untouched.
            if pd.notna(v) and str(v).strip() != "":
                return _make_fake(str(v), anon_type, cache, fake)
            return v

        result[col_name] = result[col_name].apply(_replace)
        processed += 1

    # A private directory per call keeps concurrent sessions from overwriting
    # (or downloading) each other's output; the file name stays stable.
    out_dir = tempfile.mkdtemp(prefix="anonymized_")
    csv_path = os.path.join(out_dir, "anonymized.csv")
    result.to_csv(csv_path, index=False, encoding="utf-8")

    msg = (
        f"✅ Anonymized **{processed} columns** out of {len(df.columns)} total. "
        "Download below."
    )
    return result.head(10), csv_path, msg


# ── UI ────────────────────────────────────────────────────────────────────────
DESCRIPTION = """
# 🔒 CSV Data Anonymizer — GDPR Ready

Upload a CSV, review the columns detected as sensitive, and download the anonymized version.
The mapping is **deterministic**: the same value always produces the same fake data, preserving dataset consistency.

✨ Clean, merge, and complete Excel tasks in seconds with [XLclick Add-in for Excel](https://xlclick.com/?so=hface).
"""

with gr.Blocks(theme=gr.themes.Soft(), title="CSV Data Anonymizer") as demo:
    # Holds the uploaded DataFrame between the upload and anonymize events.
    df_state = gr.State()

    gr.Markdown(DESCRIPTION)

    with gr.Row():
        with gr.Column(scale=1):
            file_input = gr.File(
                label="📂 Upload CSV",
                file_types=[".csv"],
                type="filepath",
            )
            status_box = gr.Markdown("Upload a CSV file to get started.")
        with gr.Column(scale=2):
            col_editor = gr.Dataframe(
                label="⚙️ Column configuration",
                headers=["Column", "Type", "Anonymize"],
                datatype=["str", "str", "bool"],
                value=EMPTY_CONFIG,
                col_count=(3, "fixed"),
                interactive=True,
                wrap=True,
            )

    anon_btn = gr.Button("🔒 Anonymize", variant="primary", size="lg")

    gr.Markdown("### Result preview (first 10 rows)")
    preview_out = gr.Dataframe(label="Anonymized preview", interactive=False, wrap=True)
    download_out = gr.File(label="⬇️ Download anonymized CSV")
    result_status = gr.Markdown("")

    file_input.upload(
        fn=load_csv,
        inputs=[file_input],
        outputs=[df_state, col_editor, status_box],
        api_name="load_csv",
    )
    file_input.clear(
        fn=reset_state,
        inputs=[],
        outputs=[df_state, col_editor, status_box],
        api_name="clear",
    )
    anon_btn.click(
        fn=anonymize,
        inputs=[df_state, col_editor],
        outputs=[preview_out, download_out, result_status],
        api_name="anonymize",
    )

    # The previous footer claimed nothing is saved to disk, but the anonymized
    # CSV is written to a temporary file so it can be downloaded.
    gr.Markdown(
        "---\n*Processing happens on this server only — the anonymized CSV is "
        "written to a temporary file for download, and no data is transmitted "
        "to third parties.*"
    )

if __name__ == "__main__":
    demo.launch()