import hashlib
import os
import re
import tempfile

import gradio as gr
import pandas as pd
from faker import Faker
|
|
| |
def _create_example():
    """Write the bundled demo dataset to ``example.csv`` in the working directory.

    Nothing is written when a file with that name already exists.
    """
    target = "example.csv"
    if os.path.exists(target):
        return

    header = ("id", "first_name", "last_name", "email", "phone", "city", "notes")
    # One tuple per demo customer, in the same order as ``header``.
    records = [
        (1, "Mario", "Rossi", "mario.rossi@gmail.com",
         "+39 333 1234567", "Roma", "premium customer"),
        (2, "Lucia", "Bianchi", "lucia.b@yahoo.it",
         "02 9876543", "Milano", "newsletter yes"),
        (3, "Giovanni", "Ferrari", "g.ferrari@outlook.com",
         "+39 347 9988776", "Napoli", "B2B"),
        (4, "Sofia", "Esposito", "sofia.e@libero.it",
         "080 5551234", "Bari", "trial"),
        (5, "Marco", "Romano", "m.romano@hotmail.com",
         "+39 320 4567890", "Torino", "enterprise"),
    ]
    pd.DataFrame(records, columns=header).to_csv(target, index=False)


# Make sure the demo file exists as soon as the module is loaded.
_create_example()
|
|
| |
# Supported anonymization types: internal key -> label shown in the UI.
ANON_TYPES = dict(
    email="📧 Email",
    first_name="👤 First Name",
    last_name="👤 Last Name",
    full_name="👤 Full Name",
    phone="📱 Phone",
    address="🏠 Address",
    city="🏙️ City",
    postal_code="📮 Postal Code",
    tax_id="🪪 Tax ID",
    date_of_birth="📅 Date of Birth",
    generic="🔒 Generic",
)
# Reverse lookup: UI label -> internal key.
TYPE_LABELS = dict(zip(ANON_TYPES.values(), ANON_TYPES.keys()))
# UI labels in declaration order; the last entry is the generic type.
TYPE_CHOICES = [*ANON_TYPES.values()]
|
|
| |
def _detect_type(series):
    """Guess which anonymization type fits a column.

    The column name is tested first against keyword lists (English and
    Italian). If no keyword matches, up to 200 non-null values are checked
    against an e-mail pattern and then a phone pattern; a pattern wins when
    more than half of the sampled values match it.

    Args:
        series: pandas Series holding one column of the uploaded data.

    Returns:
        The internal type key (e.g. ``"email"``); ``"generic"`` when neither
        the name nor the values give a hint.
    """
    # Column labels are not guaranteed to be strings (or to be set at all).
    name = "" if series.name is None else str(series.name).lower()

    # Ordered rules: the first rule with a keyword contained in the name wins.
    # "last_name" has to come before "first_name" because the Italian
    # "cognome" (surname) contains "nome" (first name) as a substring.
    name_rules = (
        ("email", ("email", "mail", "e-mail")),
        ("phone", ("telefon", "phone", "cell", "mobile", "tel")),
        ("full_name", ("full_name", "fullname", "nome_completo")),
        ("last_name", ("cognome", "surname", "last")),
        ("first_name", ("nome", "first", "given")),
        ("address", ("indirizzo", "address", "via", "street")),
        ("city", ("città", "city", "comune", "citta")),
        ("postal_code", ("cap", "postal", "zip")),
        ("tax_id", ("cf", "codice_fiscale", "fiscal", "tax")),
        ("date_of_birth", ("nascita", "birth", "dob", "birthday")),
    )
    for anon_type, keywords in name_rules:
        if any(keyword in name for keyword in keywords):
            return anon_type

    # Fall back to looking at the values themselves.
    sample = series.dropna().astype(str).head(200)
    if len(sample) > 0:
        value_rules = (
            ("email", r"^[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}$"),
            ("phone", r"^[\+\d\s\-\(\)]{7,15}$"),
        )
        for anon_type, pattern in value_rules:
            if sample.str.match(pattern).mean() > 0.5:
                return anon_type

    return "generic"
|
|
| |
# Locales the fake values are drawn from.
_FAKE_LOCALES = ("it_IT", "en_US")

# Type key -> callable producing a fake value from a seeded Faker instance.
# Any type key missing from this table is handled as "generic".
_FAKER_GENERATORS = {
    "email": lambda f: f.email(),
    "first_name": lambda f: f.first_name(),
    "last_name": lambda f: f.last_name(),
    "full_name": lambda f: f.name(),
    "phone": lambda f: f.phone_number(),
    "address": lambda f: f.street_address(),
    "city": lambda f: f.city(),
    "postal_code": lambda f: f.postcode(),
    # Wrapped in a lambda because _fake_tax_id is defined further down.
    "tax_id": lambda f: _fake_tax_id(f),
    "date_of_birth": lambda f: f.date_of_birth(
        minimum_age=18, maximum_age=80
    ).strftime("%d/%m/%Y"),
}


def _stable_seed(value):
    """Return a 32-bit seed for *value* that is identical in every process.

    The built-in ``hash()`` is salted per interpreter run for strings, so it
    cannot be used for a mapping that has to survive a restart.
    """
    digest = hashlib.sha256(str(value).encode("utf-8", "surrogatepass")).digest()
    return int.from_bytes(digest[:4], "big")


def _make_fake(value, anon_type, cache):
    """Return the fake replacement for *value*, deterministically.

    Args:
        value: Original cell content (a string).
        anon_type: Internal type key; unknown keys are treated as "generic".
        cache: Dict mapping already-seen values to their replacement. It is
            read first and updated with the newly generated value.

    Returns:
        The replacement string. The same *value* and *anon_type* always give
        the same result for a given Faker version.
    """
    if value in cache:
        return cache[value]

    seed = _stable_seed(value)
    generator = _FAKER_GENERATORS.get(anon_type)

    if generator is None:
        # Generic token: no Faker instance is needed.
        result = f"ANONYMIZED_{seed % 100000:05d}"
    else:
        # A single locale is picked from the seed so the outcome does not
        # depend on how Faker chooses a locale in multi-locale mode.
        fake_local = Faker(_FAKE_LOCALES[seed % len(_FAKE_LOCALES)])
        fake_local.seed_instance(seed)
        result = generator(fake_local)

    cache[value] = result
    return result
|
|
def _fake_tax_id(f):
    """Build a random 16-character code shaped like an Italian tax ID.

    Layout: 6 letters, 2 digits, 1 letter, 2 digits, 1 letter, 3 digits,
    1 letter. All characters are drawn through ``f.random_choices``.

    Args:
        f: Object exposing ``random_choices(elements, length=...)``.

    Returns:
        The generated code as a string.
    """
    letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    digits = "0123456789"
    # (alphabet, how many characters) for each consecutive draw.
    layout = (
        (letters, 3),
        (letters, 3),
        (digits, 2),
        (letters, 1),
        (digits, 2),
        (letters, 1),
        (digits, 3),
        (letters, 1),
    )
    pieces = []
    for alphabet, count in layout:
        pieces.append("".join(f.random_choices(alphabet, length=count)))
    return "".join(pieces)
|
|
| |
# Single placeholder row for the column editor while no file is loaded:
# empty column name, the last type choice, anonymization switched off.
_PLACEHOLDER_ROW = ["", TYPE_CHOICES[-1], False]
EMPTY_CONFIG = [_PLACEHOLDER_ROW]


def reset_state():
    """Return the initial (state, column config, status) triple.

    Used when the user clears the uploaded file.
    """
    cleared_df = None
    status = "Upload a CSV file to get started."
    return cleared_df, EMPTY_CONFIG, status
|
|
def load_csv(file_path):
    """Read an uploaded CSV and propose an anonymization type per column.

    The file is read as UTF-8 first and as latin-1 if it is not valid UTF-8.
    Malformed lines are skipped.

    Args:
        file_path: Path of the uploaded file; falsy when nothing is uploaded.

    Returns:
        A ``(df, col_config, status)`` tuple: the loaded DataFrame (``None``
        when loading failed), one ``[column, type label, True]`` row per
        column (``EMPTY_CONFIG`` on failure) and a Markdown status message.
    """
    if not file_path:
        return None, EMPTY_CONFIG, "Upload a CSV file to get started."

    # Both read attempts share one handler so that a failure of the latin-1
    # retry is reported to the user instead of escaping as an exception.
    try:
        try:
            df = pd.read_csv(file_path, encoding="utf-8", on_bad_lines="skip")
        except UnicodeDecodeError:
            df = pd.read_csv(file_path, encoding="latin-1", on_bad_lines="skip")
    except Exception as e:
        return None, EMPTY_CONFIG, f"❌ Error reading the file: {e}"

    if df.empty or len(df.columns) == 0:
        return None, EMPTY_CONFIG, "❌ The CSV file looks empty or malformed."

    # Every column starts out selected, with the auto-detected type.
    col_config = [
        [str(c), ANON_TYPES[_detect_type(df[c])], True]
        for c in df.columns
    ]
    msg = (
        f"✅ Loaded: **{len(df)} rows × {len(df.columns)} columns**. "
        f"Review the detected types below and click **Anonymize**."
    )
    return df, col_config, msg
|
|
|
|
def anonymize(df, col_config):
    """Anonymize the selected columns of *df* and export the result as CSV.

    Args:
        df: DataFrame loaded from the uploaded CSV (or ``None``).
        col_config: Rows of ``[column name, type label, anonymize flag]``,
            given as a list of lists or as a DataFrame.

    Returns:
        A ``(preview, csv_path, status)`` tuple: the first 10 rows of the
        anonymized data, the path of the exported CSV (``None`` on error) and
        a Markdown status message.
    """
    if df is None or not isinstance(df, pd.DataFrame) or df.empty:
        return pd.DataFrame(), None, "❌ Please upload a CSV first."

    # The editor may hand the configuration over as a DataFrame.
    if isinstance(col_config, pd.DataFrame):
        rows = col_config.values.tolist()
    else:
        rows = list(col_config) if col_config else []

    if not rows:
        return pd.DataFrame(), None, "❌ Column configuration is empty."

    result = df.copy()
    processed = 0

    for row in rows:
        if len(row) < 3:
            continue
        col_name, type_label, do_anon = row[0], row[1], row[2]
        if not do_anon or not col_name or col_name not in result.columns:
            continue

        anon_type = TYPE_LABELS.get(type_label, "generic")
        # One cache per column: equal values map to the same replacement.
        cache = {}
        # Missing and blank cells are left as they are.
        result[col_name] = result[col_name].apply(
            lambda v: _make_fake(str(v), anon_type, cache)
            if pd.notna(v) and str(v).strip() != "" else v
        )
        processed += 1

    # Each call gets its own temporary directory so that concurrent sessions
    # cannot overwrite (or download) one another's export; the file itself
    # keeps the name "anonymized.csv".
    out_dir = tempfile.mkdtemp(prefix="anonymizer_")
    csv_path = os.path.join(out_dir, "anonymized.csv")
    result.to_csv(csv_path, index=False, encoding="utf-8")

    msg = f"✅ Anonymized **{processed} columns** out of {len(df.columns)} total. Download below."
    return result.head(10), csv_path, msg
|
|
| |
# Markdown header of the page; passed to gr.Markdown when the UI is built.
DESCRIPTION = """
# 🔒 CSV Data Anonymizer — GDPR Ready

Upload a CSV, review the columns detected as sensitive, and download the anonymized version.
The mapping is **deterministic**: the same value always produces the same fake data, preserving dataset consistency.

✨ Clean, merge, and complete Excel tasks in seconds with [XLclick Add-in for Excel](https://xlclick.com/?so=hface).
"""
|
|
# UI definition: upload + column editor on top, then the Anonymize button,
# a preview of the result and the download link.
with gr.Blocks(theme=gr.themes.Soft(), title="CSV Data Anonymizer") as demo:
    # Holds the DataFrame returned by load_csv between events.
    df_state = gr.State()

    gr.Markdown(DESCRIPTION)

    with gr.Row():
        with gr.Column(scale=1):
            file_input = gr.File(
                label="📂 Upload CSV",
                file_types=[".csv"],
                type="filepath",
            )
            status_box = gr.Markdown("Upload a CSV file to get started.")

        with gr.Column(scale=2):
            # One row per CSV column: name, anonymization type, on/off flag.
            col_editor = gr.Dataframe(
                label="⚙️ Column configuration",
                headers=["Column", "Type", "Anonymize"],
                datatype=["str", "str", "bool"],
                value=EMPTY_CONFIG,
                col_count=(3, "fixed"),
                interactive=True,
                wrap=True,
            )

    anon_btn = gr.Button("🔒 Anonymize", variant="primary", size="lg")

    gr.Markdown("### Result preview (first 10 rows)")
    preview_out = gr.Dataframe(label="Anonymized preview", interactive=False, wrap=True)

    download_out = gr.File(label="⬇️ Download anonymized CSV")
    result_status = gr.Markdown("")

    # Uploading a file loads it and fills the column editor.
    file_input.upload(
        fn=load_csv,
        inputs=[file_input],
        outputs=[df_state, col_editor, status_box],
        api_name="load_csv",
    )

    # Clearing the file puts state, editor and status back to their defaults.
    file_input.clear(
        fn=reset_state,
        inputs=[],
        outputs=[df_state, col_editor, status_box],
        api_name="clear",
    )

    # The button runs the anonymization with the edited configuration.
    anon_btn.click(
        fn=anonymize,
        inputs=[df_state, col_editor],
        outputs=[preview_out, download_out, result_status],
        api_name="anonymize",
    )

    # The footer must not promise in-memory-only processing: anonymize()
    # writes the resulting CSV to disk so that it can be offered for download.
    gr.Markdown(
        "---\n*Your data is not transmitted to third parties. "
        "The anonymized CSV is written to a temporary file only so that "
        "it can be downloaded.*"
    )


if __name__ == "__main__":
    demo.launch()
|
|