Spaces:

MOPAC-DS
/

PII_Remover

Runtime error

App Files Files Community

PII_Remover / app.py

dhammo2

Upload app.py

af9438a verified 10 months ago

raw

history blame contribute delete

7.47 kB

	import gradio as gr
	import pdfplumber
	import io
	from docx import Document
	from pptx import Presentation
	from presidio_analyzer import AnalyzerEngine
	from presidio_anonymizer import AnonymizerEngine, OperatorConfig
	from cryptography.fernet import Fernet
	import hashlib
	import tempfile
	import os

	# Initialize the Presidio engines once at import time; both are shared by
	# every redaction call in this module.
	analyzer = AnalyzerEngine()
	anonymizer = AnonymizerEngine()

	# Generate encryption key (This should be securely stored and retrieved for real-world use)
	# NOTE: a new key is generated on every import, and it is not stored anywhere
	# in this file, so tokens produced by the "Encrypt" method cannot be
	# decrypted once the process exits.
	encryption_key = Fernet.generate_key()
	fernet = Fernet(encryption_key)

	# Entity names offered in the UI: Presidio's global recognizers plus the
	# UK-specific ones (NHS number, National Insurance number).
	PII_ENTITIES = [
	    "CREDIT_CARD",
	    "CRYPTO",
	    "DATE_TIME",
	    "EMAIL_ADDRESS",
	    "IBAN_CODE",
	    "IP_ADDRESS",
	    "NRP",
	    "LOCATION",
	    "PERSON",
	    "PHONE_NUMBER",
	    "MEDICAL_LICENSE",
	    "URL",
	    "UK_NHS",
	    "UK_NINO",
	]

	# Maps each redaction choice shown in the UI to its anonymizer operator.
	REDACTION_METHODS = {
	    # Strip the detected value from the text entirely.
	    "Remove": OperatorConfig("redact"),
	    # Substitute a fixed marker for the detected value.
	    "Redact": OperatorConfig("replace", {"new_value": "[REDACTED]"}),
	    # redact_text() handles "Replace" by name with its own operator, so this
	    # entry is not consulted there.
	    "Replace": OperatorConfig("replace", {"new_value": ""}),
	    # Overwrite the first four characters of the value with "*".
	    "Mask": OperatorConfig(
	        "mask",
	        {"chars_to_mask": 4, "masking_char": "*", "from_end": False},
	    ),
	    # Plain-string placeholders: redact_text() handles these two by name.
	    "Hash": "hash",
	    "Encrypt": "encrypt",
	}

	def hash_pii(text):
	    """Return the hexadecimal SHA-256 digest of ``text`` encoded as UTF-8."""
	    digest = hashlib.sha256()
	    digest.update(text.encode("utf-8"))
	    return digest.hexdigest()

	def encrypt_pii(text):
	    """Encrypt ``text`` with the module-level Fernet instance.

	    The text is encoded as UTF-8 before encryption and the resulting token
	    is returned as a ``str``.
	    """
	    token = fernet.encrypt(text.encode("utf-8"))
	    return token.decode("utf-8")

	def redact_text(text, selected_entities, redaction_method):
	    """Identify PII in ``text`` and redact it with the chosen method.

	    Args:
	        text: The text to scan.
	        selected_entities: Presidio entity names to look for. An empty or
	            ``None`` selection means every supported entity is searched.
	        redaction_method: One of the UI choices: "Remove", "Redact",
	            "Replace", "Mask", "Hash" or "Encrypt".

	    Returns:
	        The text with every detected entity anonymized. Empty input and
	        text without any detected entity are returned unchanged.

	    Raises:
	        KeyError: If PII was found and ``redaction_method`` is not one of
	            the known methods.
	    """
	    if not text:
	        # Nothing to analyze (e.g. an empty paragraph or text box).
	        return text

	    selected_entities = selected_entities or None  # If empty, redact all entities
	    results = analyzer.analyze(text=text, entities=selected_entities, language="en")
	    if not results:
	        return text

	    if redaction_method == "Hash":
	        # Analyzer results carry start/end offsets rather than the matched
	        # string, so the anonymizer's "custom" operator is used to hand each
	        # matched value to hash_pii.
	        operator = OperatorConfig("custom", {"lambda": hash_pii})
	    elif redaction_method == "Encrypt":
	        operator = OperatorConfig("custom", {"lambda": encrypt_pii})
	    elif redaction_method == "Replace":
	        # "replace" without a new_value substitutes the entity type name.
	        operator = OperatorConfig("replace")
	    else:
	        operator = REDACTION_METHODS[redaction_method]

	    # One operator applies to every entity type; the anonymizer resolves
	    # overlapping spans and applies replacements by offset, so repeated
	    # values and already-replaced text are never matched a second time.
	    anonymized = anonymizer.anonymize(
	        text=text,
	        analyzer_results=results,
	        operators={"DEFAULT": operator},
	    )
	    return anonymized.text

	# Document Processing Functions

	def process_pdf(file):
	    """Extract the text of every PDF page and join the pages with newlines."""
	    page_texts = []
	    with pdfplumber.open(file.name) as pdf:
	        for page in pdf.pages:
	            # A page whose extraction yields a falsy value contributes "".
	            page_texts.append(page.extract_text() or "")
	    return "\n".join(page_texts)

	def process_docx(file):
	    """Return the text of a DOCX file's body paragraphs, one per line."""
	    document = Document(file.name)
	    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
	    return "\n".join(paragraph_texts)

	def process_pptx(file):
	    """Return the text of every text-bearing shape in a PPTX, one per line."""
	    presentation = Presentation(file.name)
	    shape_texts = []
	    for slide in presentation.slides:
	        for shape in slide.shapes:
	            # Only shapes exposing a ``text`` attribute are included.
	            if hasattr(shape, "text"):
	                shape_texts.append(shape.text)
	    return "\n".join(shape_texts)

	def process_txt(file):
	    """Read an uploaded plain-text file and return its content as a str.

	    The content is read from the upload's path instead of calling
	    ``file.read()``: the other readers in this module already rely on
	    ``file.name`` being a path, and an upload handed over as a path string
	    has no ``read`` method.

	    Args:
	        file: The uploaded file, either an object whose ``name`` attribute
	            is its path or the path itself.

	    Returns:
	        The file content decoded as UTF-8.

	    Raises:
	        UnicodeDecodeError: If the file is not valid UTF-8.
	        OSError: If the file cannot be opened.
	    """
	    path = getattr(file, "name", file)
	    # Read bytes and decode explicitly so line endings are kept as stored.
	    with open(path, "rb") as handle:
	        return handle.read().decode("utf-8")

	def read_document(file):
	    """Return the plain text of an uploaded document.

	    The reader is picked from the lower-cased text after the last "." in
	    ``file.name``; anything other than pdf, docx or pptx is read as plain
	    text.
	    """
	    suffix = file.name.rsplit(".", 1)[-1].lower()
	    readers = {
	        "pdf": process_pdf,
	        "docx": process_docx,
	        "pptx": process_pptx,
	    }
	    reader = readers.get(suffix, process_txt)
	    return reader(file)

	def _redact_docx_paragraphs(paragraphs, selected_entities, redaction_method):
	    """Redact docx paragraphs in place, rewriting only those that change."""
	    for para in paragraphs:
	        original = para.text
	        cleaned = redact_text(original, selected_entities, redaction_method)
	        # Assigning para.text replaces the paragraph's runs with a single
	        # run, so paragraphs without PII are left alone to keep formatting.
	        if cleaned != original:
	            para.text = cleaned


	def _redact_docx_tables(tables, selected_entities, redaction_method):
	    """Redact the paragraphs of every table cell, including nested tables."""
	    for table in tables:
	        for row in table.rows:
	            for cell in row.cells:
	                _redact_docx_paragraphs(cell.paragraphs, selected_entities, redaction_method)
	                _redact_docx_tables(cell.tables, selected_entities, redaction_method)


	def save_redacted_file(original_file, redacted_text, selected_entities, redaction_method):
	    """Write the redacted document to a new temp directory and return its path.

	    DOCX and PPTX files are re-opened and redacted in place so the document
	    structure survives; every other input is saved as the already redacted
	    plain text.

	    Args:
	        original_file: The uploaded file, either an object whose ``name``
	            attribute is its path or the path itself.
	        redacted_text: Redacted plain text, used for non-DOCX/PPTX inputs.
	        selected_entities: Entity names passed through to ``redact_text``.
	        redaction_method: Redaction method passed through to ``redact_text``.

	    Returns:
	        Path of the written file, named ``redacted_<original name>``. For a
	        PDF input the output is plain text, so its extension is ``.txt``.
	    """
	    source_path = getattr(original_file, "name", original_file)
	    base_name = os.path.basename(source_path)
	    ext = source_path.split(".")[-1].lower()

	    if ext == "pdf":
	        # The PDF branch writes extracted text, not a PDF; a ".pdf" name
	        # would produce a file no PDF viewer can open.
	        output_name = f"redacted_{os.path.splitext(base_name)[0]}.txt"
	    else:
	        output_name = f"redacted_{base_name}"

	    # A fresh directory per call keeps uploads that share a file name from
	    # overwriting each other's output.
	    output_dir = tempfile.mkdtemp(prefix="pii_redacted_")
	    redacted_file_path = os.path.join(output_dir, output_name)

	    if ext == "docx":
	        doc = Document(source_path)
	        _redact_docx_paragraphs(doc.paragraphs, selected_entities, redaction_method)
	        # Table cells are not part of doc.paragraphs and need their own pass.
	        _redact_docx_tables(doc.tables, selected_entities, redaction_method)
	        doc.save(redacted_file_path)
	    elif ext == "pptx":
	        ppt = Presentation(source_path)
	        for slide in ppt.slides:
	            for shape in slide.shapes:
	                # Shapes without a ``text`` attribute are left untouched.
	                if not hasattr(shape, "text"):
	                    continue
	                original = shape.text
	                cleaned = redact_text(original, selected_entities, redaction_method)
	                if cleaned != original:
	                    shape.text = cleaned
	        ppt.save(redacted_file_path)
	    else:
	        with open(redacted_file_path, "w", encoding="utf-8") as f:
	            f.write(redacted_text)

	    return redacted_file_path


	def process_file(file, selected_entities, redaction_method):
	    """Handle a file upload: redact the selected PII and save the result.

	    Args:
	        file: The uploaded file, or ``None`` when nothing was uploaded.
	        selected_entities: Entity names to redact; empty means all.
	        redaction_method: Name of the redaction method chosen in the UI.

	    Returns:
	        A ``(redacted_text, redacted_file_path)`` tuple. When no file was
	        uploaded, a short notice and ``None`` are returned instead.
	    """
	    if file is None:
	        # The button can be pressed before anything is uploaded; without
	        # this guard the readers fail on ``None.name``.
	        return "Please upload a document first.", None

	    text = read_document(file)
	    redacted_text = redact_text(text, selected_entities, redaction_method)
	    redacted_file_path = save_redacted_file(file, redacted_text, selected_entities, redaction_method)

	    return redacted_text, redacted_file_path  # Returning only a valid file path


	def select_all_entities():
	    """Return every supported entity name, for the "Select All" button.

	    A copy is returned so callers cannot mutate the module-level
	    ``PII_ENTITIES`` list.
	    """
	    return list(PII_ENTITIES)

	def deselect_all_entities():
	    """Return an empty selection, for the "Deselect All" button."""
	    nothing_selected = []
	    return nothing_selected

	# Styling for the "Redact Document" button (elem_id "redact_button").
	# The background colour is deliberately disabled with a CSS comment. It must
	# use real /* ... */ delimiters: a bare "/" is not a comment in CSS and would
	# leave a stray "/" in front of the "color" declaration that follows.
	custom_css = """
	<style>
	#redact_button {
	/*background-color: #E691FF !important;*/
	color: #4B23C0;
	}
	</style>
	"""

	# Gradio UI: layout first, event wiring second, then launch.
	with gr.Blocks() as app:

	    # Title banner.
	    gr.Markdown(
	"""
	<div style="
	background-color: #4B23C0;
	color: white;
	padding: 20px;
	text-align: left;
	font-size: 24px;
	font-weight: bold;
	margin: 0;
	border-radius: 4px; /* Rounded edges */
	">
	🔒 PII Remover  -  Secure Document Redaction Tool
	</div>
	""",
	        sanitize_html=False,
	    )

	    # Demonstration warning.
	    gr.Markdown(
	        "<div style='text-align: center; font-size: 24px; font-weight: bold; color: red;'>"
	        "⚠️ THIS IS A DEMONSTRATION. DO NOT UPLOAD SENSITIVE DOCUMENTS. ⚠️"
	        "</div>",
	        sanitize_html=False,
	    )

	    gr.Markdown("Upload a TXT, DOCX, PPTX, or PDF file to remove Personal Identifiable Information (PII) while keeping formatting.")

	    # Inject the button styling defined in custom_css.
	    gr.HTML(custom_css)

	    # Inputs: the document, the entities to redact and the method to use.
	    with gr.Row():
	        file_input = gr.File(label="Upload Document (PDF, DOCX, PPTX, TXT)")

	    entity_selector = gr.CheckboxGroup(
	        PII_ENTITIES,
	        label="Select PII Entities to Redact (Leave blank to redact all)",
	    )

	    with gr.Row():
	        select_all_button = gr.Button("Select All")
	        deselect_all_button = gr.Button("Deselect All")

	    redaction_method = gr.Radio(
	        ["Remove", "Redact", "Replace", "Mask", "Hash", "Encrypt"],
	        label="Redaction Method",
	        value="Redact",
	    )

	    process_button = gr.Button("Redact Document", elem_id="redact_button")

	    # Outputs: a text preview and the downloadable redacted file.
	    output_text = gr.Textbox(label="Redacted Text", lines=10)
	    download_button = gr.File(label="Download Redacted File")

	    # Button actions: both selection buttons overwrite the checkbox group.
	    select_all_button.click(
	        fn=select_all_entities,
	        outputs=entity_selector,
	    )
	    deselect_all_button.click(
	        fn=deselect_all_entities,
	        outputs=entity_selector,
	    )

	    process_button.click(
	        fn=process_file,
	        inputs=[file_input, entity_selector, redaction_method],
	        outputs=[output_text, download_button],
	    )

	app.launch()