Spaces:

fredwebx
/

CSV-Data-Anonymizer-for-GDPR

Sleeping

App Files Files Community

CSV-Data-Anonymizer-for-GDPR / app.py

fredwebx

Upload app.py

aecd0c4 verified 8 days ago

raw

history blame contribute delete

10.1 kB

	import hashlib
	import os
	import re
	import tempfile

	import gradio as gr
	import pandas as pd
	from faker import Faker

	# ── Example CSV (created at startup, before Blocks definition) ────────────────
	def _create_example():
	if os.path.exists("example.csv"):
	return
	sample = pd.DataFrame({
	"id": range(1, 6),
	"first_name": ["Mario", "Lucia", "Giovanni", "Sofia", "Marco"],
	"last_name": ["Rossi", "Bianchi", "Ferrari", "Esposito", "Romano"],
	"email": ["mario.rossi@gmail.com", "lucia.b@yahoo.it",
	"g.ferrari@outlook.com", "sofia.e@libero.it",
	"m.romano@hotmail.com"],
	"phone": ["+39 333 1234567", "02 9876543", "+39 347 9988776",
	"080 5551234", "+39 320 4567890"],
	"city": ["Roma", "Milano", "Napoli", "Bari", "Torino"],
	"notes": ["premium customer", "newsletter yes", "B2B",
	"trial", "enterprise"],
	})
	sample.to_csv("example.csv", index=False)

	_create_example()

	# ── Available anonymization types ─────────────────────────────────────────────
	ANON_TYPES = {
	"email": "📧 Email",
	"first_name": "👤 First Name",
	"last_name": "👤 Last Name",
	"full_name": "👤 Full Name",
	"phone": "📱 Phone",
	"address": "🏠 Address",
	"city": "🏙️ City",
	"postal_code": "📮 Postal Code",
	"tax_id": "🪪 Tax ID",
	"date_of_birth": "📅 Date of Birth",
	"generic": "🔒 Generic",
	}
	TYPE_LABELS = {v: k for k, v in ANON_TYPES.items()}
	TYPE_CHOICES = list(ANON_TYPES.values())

	# ── Automatic type detection ──────────────────────────────────────────────────
	def _detect_type(series):
	name = (series.name or "").lower()
	sample = series.dropna().astype(str).head(200)

	if any(k in name for k in ("email", "mail", "e-mail")):
	return "email"
	if any(k in name for k in ("telefon", "phone", "cell", "mobile", "tel")):
	return "phone"
	if any(k in name for k in ("full_name", "fullname", "nome_completo")):
	return "full_name"
	if any(k in name for k in ("nome", "first", "given")):
	return "first_name"
	if any(k in name for k in ("cognome", "surname", "last")):
	return "last_name"
	if any(k in name for k in ("indirizzo", "address", "via", "street")):
	return "address"
	if any(k in name for k in ("città", "city", "comune", "citta")):
	return "city"
	if any(k in name for k in ("cap", "postal", "zip")):
	return "postal_code"
	if any(k in name for k in ("cf", "codice_fiscale", "fiscal", "tax")):
	return "tax_id"
	if any(k in name for k in ("nascita", "birth", "dob", "birthday")):
	return "date_of_birth"

	if len(sample) > 0:
	if sample.str.match(r"^[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}$").mean() > 0.5:
	return "email"
	if sample.str.match(r"^[\+\d\s\-]{7,15}$").mean() > 0.5:
	return "phone"

	return "generic"

	# ── Fake-data generators (deterministic per value) ────────────────────────────
	def _make_fake(value, anon_type, cache):
	if value in cache:
	return cache[value]

	seed = abs(hash(value)) % (2**32)
	fake_local = Faker(["it_IT", "en_US"])
	fake_local.seed_instance(seed)

	generators = {
	"email": lambda: fake_local.email(),
	"first_name": lambda: fake_local.first_name(),
	"last_name": lambda: fake_local.last_name(),
	"full_name": lambda: fake_local.name(),
	"phone": lambda: fake_local.phone_number(),
	"address": lambda: fake_local.street_address(),
	"city": lambda: fake_local.city(),
	"postal_code": lambda: fake_local.postcode(),
	"tax_id": lambda: _fake_tax_id(fake_local),
	"date_of_birth": lambda: fake_local.date_of_birth(minimum_age=18, maximum_age=80).strftime("%d/%m/%Y"),
	"generic": lambda: f"ANONYMIZED_{abs(hash(value)) % 100000:05d}",
	}

	result = generators.get(anon_type, generators["generic"])()
	cache[value] = result
	return result

	def _fake_tax_id(f):
	letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
	digits = "0123456789"
	return (
	"".join(f.random_choices(letters, length=3))
	+ "".join(f.random_choices(letters, length=3))
	+ "".join(f.random_choices(digits, length=2))
	+ f.random_choices(letters, length=1)[0]
	+ "".join(f.random_choices(digits, length=2))
	+ f.random_choices(letters, length=1)[0]
	+ "".join(f.random_choices(digits, length=3))
	+ f.random_choices(letters, length=1)[0]
	)

	# ── Main logic ────────────────────────────────────────────────────────────────
	# Placeholder row for the column editor while no CSV is loaded:
	# empty column name, the last (generic) type label, checkbox off.
	EMPTY_CONFIG = [["", TYPE_CHOICES[-1], False]]


	def reset_state():
	    """Return the initial (DataFrame state, column config, status) triple.

	    Wired to the file input's clear event.
	    """
	    cleared_df = None
	    status = "Upload a CSV file to get started."
	    return cleared_df, EMPTY_CONFIG, status

	def load_csv(file_path):
	    """Load a CSV and build the initial column configuration.

	    The file is read as UTF-8 first and as latin-1 if decoding fails; rows
	    pandas cannot parse are skipped (``on_bad_lines="skip"``).

	    Args:
	        file_path: path of the uploaded file; falsy when nothing is uploaded.

	    Returns:
	        ``(df, col_config, status)``. ``df`` is the loaded DataFrame, or
	        ``None`` on failure. ``col_config`` holds one
	        ``[column, type label, True]`` row per column, or ``EMPTY_CONFIG``
	        on failure. ``status`` is a message for the UI.
	    """
	    if not file_path:
	        return None, EMPTY_CONFIG, "Upload a CSV file to get started."

	    # Each attempt has its own try block, so a parse error raised during the
	    # latin-1 retry is reported to the user instead of escaping the handler.
	    for encoding in ("utf-8", "latin-1"):
	        try:
	            df = pd.read_csv(file_path, encoding=encoding, on_bad_lines="skip")
	        except UnicodeDecodeError as e:
	            decode_error = e
	            continue
	        except Exception as e:
	            return None, EMPTY_CONFIG, f"❌ Error reading the file: {e}"
	        break
	    else:
	        return None, EMPTY_CONFIG, f"❌ Error reading the file: {decode_error}"

	    if df.empty or len(df.columns) == 0:
	        return None, EMPTY_CONFIG, "❌ The CSV file looks empty or malformed."

	    # Every column starts with its detected type and the checkbox ticked.
	    col_config = [
	        [str(c), ANON_TYPES[_detect_type(df[c])], True]
	        for c in df.columns
	    ]
	    msg = (
	        f"✅ Loaded: {len(df)} rows × {len(df.columns)} columns. "
	        f"Review the detected types below and click Anonymize."
	    )
	    return df, col_config, msg


	def _is_checked(flag):
	    """Interpret one cell of the "Anonymize" column as a boolean."""
	    if isinstance(flag, str):
	        # A textual "false" is truthy in Python; treat only affirmative
	        # strings as checked.
	        return flag.strip().lower() in ("true", "1", "yes")
	    # None / NaN (an empty cell) count as unchecked.
	    return bool(pd.notna(flag) and flag)


	def anonymize(df, col_config):
	    """Anonymize ``df`` according to ``col_config``.

	    Args:
	        df: the loaded DataFrame (``None`` when nothing has been uploaded).
	        col_config: rows of ``[column, type label, anonymize flag]``, given
	            as a list of lists or as a DataFrame.

	    Returns:
	        ``(preview, csv_path, status)``. ``preview`` is the first 10 rows of
	        the result, ``csv_path`` is the written CSV file, ``status`` is a
	        message for the UI. On invalid input: an empty DataFrame, ``None``
	        and an error message.
	    """
	    if not isinstance(df, pd.DataFrame) or df.empty:
	        return pd.DataFrame(), None, "❌ Please upload a CSV first."

	    # Normalize col_config to a list of [col, type_label, anonymize_flag].
	    if isinstance(col_config, pd.DataFrame):
	        rows = col_config.values.tolist()
	    else:
	        rows = list(col_config) if col_config else []

	    if not rows:
	        return pd.DataFrame(), None, "❌ Column configuration is empty."

	    result = df.copy()
	    processed = 0

	    for row in rows:
	        if len(row) < 3:
	            continue
	        col_name, type_label, do_anon = row[0], row[1], row[2]
	        if not _is_checked(do_anon) or not col_name or col_name not in result.columns:
	            continue

	        anon_type = TYPE_LABELS.get(type_label, "generic")
	        # One cache per column: equal values in a column share a replacement.
	        cache = {}

	        def replace(cell, anon_type=anon_type, cache=cache):
	            # Missing and blank cells are left untouched.
	            if pd.isna(cell) or str(cell).strip() == "":
	                return cell
	            return _make_fake(str(cell), anon_type, cache)

	        result[col_name] = result[col_name].apply(replace)
	        processed += 1

	    # A fresh directory per call keeps concurrent sessions from overwriting
	    # (or downloading) each other's output, while the file name stays
	    # "anonymized.csv".
	    out_dir = tempfile.mkdtemp(prefix="anonymizer_")
	    csv_path = os.path.join(out_dir, "anonymized.csv")
	    result.to_csv(csv_path, index=False, encoding="utf-8")

	    msg = f"✅ Anonymized {processed} columns out of {len(df.columns)} total. Download below."
	    return result.head(10), csv_path, msg

	# ── UI ────────────────────────────────────────────────────────────────────────
	# Markdown intro for the page; passed to gr.Markdown(DESCRIPTION) in the UI.
	DESCRIPTION = """
	# 🔒 CSV Data Anonymizer — GDPR Ready

	Upload a CSV, review the columns detected as sensitive, and download the anonymized version.
	The mapping is deterministic: the same value always produces the same fake data, preserving dataset consistency.

	✨ Clean, merge, and complete Excel tasks in seconds with [XLclick Add-in for Excel](https://xlclick.com/?so=hface).
	"""

	# Page layout and event wiring. Components and handlers are created inside
	# the Blocks context.
	with gr.Blocks(theme=gr.themes.Soft(), title="CSV Data Anonymizer") as demo:
	    # Holds the loaded DataFrame between the upload and anonymize events.
	    df_state = gr.State()

	    gr.Markdown(DESCRIPTION)

	    with gr.Row():
	        with gr.Column(scale=1):
	            file_input = gr.File(
	                label="📂 Upload CSV",
	                file_types=[".csv"],
	                type="filepath",
	            )
	            status_box = gr.Markdown("Upload a CSV file to get started.")

	        with gr.Column(scale=2):
	            # One row per CSV column: name, type label, anonymize checkbox.
	            col_editor = gr.Dataframe(
	                label="⚙️ Column configuration",
	                headers=["Column", "Type", "Anonymize"],
	                datatype=["str", "str", "bool"],
	                value=EMPTY_CONFIG,
	                col_count=(3, "fixed"),
	                interactive=True,
	                wrap=True,
	            )

	    anon_btn = gr.Button("🔒 Anonymize", variant="primary", size="lg")

	    gr.Markdown("### Result preview (first 10 rows)")
	    preview_out = gr.Dataframe(label="Anonymized preview", interactive=False, wrap=True)

	    download_out = gr.File(label="⬇️ Download anonymized CSV")
	    result_status = gr.Markdown("")

	    # Upload -> parse the CSV and fill the column editor.
	    file_input.upload(
	        fn=load_csv,
	        inputs=[file_input],
	        outputs=[df_state, col_editor, status_box],
	        api_name="load_csv",
	    )

	    # Clearing the file -> back to the initial state.
	    file_input.clear(
	        fn=reset_state,
	        inputs=[],
	        outputs=[df_state, col_editor, status_box],
	        api_name="clear",
	    )

	    # Button -> anonymize using the current editor contents.
	    anon_btn.click(
	        fn=anonymize,
	        inputs=[df_state, col_editor],
	        outputs=[preview_out, download_out, result_status],
	        api_name="anonymize",
	    )

	    # anonymize() writes the result CSV to disk for download, so the footer
	    # must not claim that nothing is saved to disk.
	    gr.Markdown(
	        "---\n*Files are processed on this server; the anonymized CSV is written to "
	        "a temporary file for download and is not transmitted to third parties.*"
	    )

	if __name__ == "__main__":
	    demo.launch()