# PII_Remover / app.py
# The three lines below are residue from the hosting web page, kept as comments:
# dhammo2's picture
# Upload app.py
# af9438a verified
import gradio as gr
import pdfplumber
import io
from docx import Document
from pptx import Presentation
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine, OperatorConfig
from cryptography.fernet import Fernet
import hashlib
import tempfile
import os
# Initialize Presidio engines
# Both engines are created once at import time and shared by redact_text().
analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()
# Generate encryption key (This should be securely stored and retrieved for real-world use)
# NOTE: a fresh key is generated on every import and is not saved anywhere in
# the visible code, so values produced by encrypt_pii() cannot be decrypted
# once this process exits.
encryption_key = Fernet.generate_key()
fernet = Fernet(encryption_key)
# Entity types offered in the UI checkbox group (Presidio global + UK entities).
# The order here is the order in which the checkboxes are listed.
PII_ENTITIES = [
    # Global entities
    "CREDIT_CARD",
    "CRYPTO",
    "DATE_TIME",
    "EMAIL_ADDRESS",
    "IBAN_CODE",
    "IP_ADDRESS",
    "NRP",
    "LOCATION",
    "PERSON",
    "PHONE_NUMBER",
    "MEDICAL_LICENSE",
    "URL",
    # UK-specific entities
    "UK_NHS",
    "UK_NINO",
]
# Maps the UI's "Redaction Method" choice to the anonymizer operator to apply.
REDACTION_METHODS = {
    # Presidio "redact" operator.
    "Remove": OperatorConfig("redact"),
    # Substitute a fixed placeholder for every detected span.
    "Redact": OperatorConfig("replace", {"new_value": "[REDACTED]"}),
    # NOTE: redact_text() builds its own OperatorConfig("replace") for the
    # "Replace" method and does not read this entry.
    "Replace": OperatorConfig("replace", {"new_value": ""}),
    # Mask four characters with "*", counted from the start of the span.
    "Mask": OperatorConfig(
        "mask",
        {"chars_to_mask": 4, "masking_char": "*", "from_end": False},
    ),
    # "Hash" and "Encrypt" are plain strings, not OperatorConfig objects;
    # redact_text() branches on the method name before consulting this table.
    "Hash": "hash",
    "Encrypt": "encrypt",
}
def hash_pii(text):
    """Return the hexadecimal SHA-256 digest of *text* (encoded as UTF-8)."""
    digest = hashlib.sha256()
    digest.update(text.encode("utf-8"))
    return digest.hexdigest()
def encrypt_pii(text):
    """Encrypt *text* with the module-level Fernet instance and return the token as a str."""
    plaintext = text.encode("utf-8")
    token = fernet.encrypt(plaintext)
    return token.decode("utf-8")
def redact_text(text, selected_entities, redaction_method):
    """Identify PII in *text* and rewrite it with the chosen redaction method.

    Args:
        text: The text to scan.
        selected_entities: Presidio entity names to look for. An empty list or
            ``None`` means "detect every supported entity".
        redaction_method: One of the keys of ``REDACTION_METHODS``.

    Returns:
        The text with every detected span rewritten. Empty input and text in
        which nothing is detected are returned unchanged.

    Raises:
        KeyError: If PII was found and *redaction_method* is not a known method.
    """
    if not text:
        # Nothing to analyse; also avoids running the NLP pipeline on the many
        # empty paragraphs/shapes found in real documents.
        return text

    entities = selected_entities or None  # If empty, redact all entities
    results = analyzer.analyze(text=text, entities=entities, language="en")
    if not results:
        return text

    if redaction_method == "Hash":
        # Analyzer results carry only offsets (start/end), not the matched
        # text, so let the anonymizer slice each span and hand it to the
        # helper. This also gets the anonymizer's handling of overlapping
        # results and touches only the detected span, not every occurrence.
        operator = OperatorConfig("custom", {"lambda": hash_pii})
    elif redaction_method == "Encrypt":
        operator = OperatorConfig("custom", {"lambda": encrypt_pii})
    elif redaction_method == "Replace":
        # "replace" with no new_value falls back to Presidio's default
        # substitution (an entity-type placeholder).
        operator = OperatorConfig("replace")
    else:
        operator = REDACTION_METHODS[redaction_method]

    operators = {found.entity_type: operator for found in results}
    anonymized = anonymizer.anonymize(
        text=text, analyzer_results=results, operators=operators
    )
    return anonymized.text
# Document Processing Functions
def process_pdf(file):
    """Extract the text of every page of the PDF at ``file.name``, one page per line group.

    Pages for which no text can be extracted contribute an empty string.
    """
    extracted = []
    with pdfplumber.open(file.name) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            extracted.append(page_text if page_text else "")
    return "\n".join(extracted)
def process_docx(file):
    """Return the text of the DOCX at ``file.name``, one body paragraph per line."""
    document = Document(file.name)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(paragraph_texts)
def process_pptx(file):
    """Return the text of every text-bearing shape in the PPTX at ``file.name``.

    Shapes are visited slide by slide; shapes without a ``text`` attribute are
    skipped. Each shape's text becomes one newline-separated entry.
    """
    presentation = Presentation(file.name)
    collected = []
    for slide in presentation.slides:
        for shape in slide.shapes:
            if hasattr(shape, "text"):
                collected.append(shape.text)
    return "\n".join(collected)
def process_txt(file):
    """Return the contents of an uploaded plain-text file as a str.

    Accepts either an open file-like object (anything with ``read()``) or a
    path-like upload that only exposes the file location through ``.name``
    (or is itself the path).

    Args:
        file: The uploaded file object or path.

    Returns:
        The file contents; bytes are decoded as UTF-8.

    Raises:
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    reader = getattr(file, "read", None)
    if callable(reader):
        data = reader()
    else:
        # Path-style upload: there is no read(), so open the file ourselves.
        with open(getattr(file, "name", file), "rb") as handle:
            data = handle.read()
    # A text-mode handle already yields str; only bytes need decoding.
    return data.decode("utf-8") if isinstance(data, bytes) else data
def read_document(file):
    """Extract the text of *file*, picking the reader from its file extension.

    ``pdf``, ``docx`` and ``pptx`` (case-insensitive) go to their dedicated
    readers; anything else is read as plain text.
    """
    extension = file.name.rpartition(".")[2].lower()
    readers = {
        "pdf": process_pdf,
        "docx": process_docx,
        "pptx": process_pptx,
    }
    reader = readers.get(extension, process_txt)
    return reader(file)
def _redact_in_place(holder, selected_entities, redaction_method):
    """Redact ``holder.text``, writing it back only when the text actually changed."""
    original = holder.text
    if not original:
        return
    redacted = redact_text(original, selected_entities, redaction_method)
    if redacted != original:
        # Assigning .text rebuilds the paragraph/shape content, so untouched
        # items are left alone to keep their formatting intact.
        holder.text = redacted


def save_redacted_file(original_file, redacted_text, selected_entities, redaction_method):
    """Write a redacted copy of *original_file* and return its path.

    DOCX and PPTX files are re-opened and redacted paragraph by paragraph /
    shape by shape so the document structure is kept. Every other type is
    written as the already-redacted plain text.

    Args:
        original_file: Uploaded file; only its ``.name`` (path) is used.
        redacted_text: Redacted plain text, used for non-DOCX/PPTX output.
        selected_entities: Entity names forwarded to ``redact_text``.
        redaction_method: Method name forwarded to ``redact_text``.

    Returns:
        Path of the written file, inside a newly created temporary directory.
    """
    source_path = original_file.name
    ext = source_path.split(".")[-1].lower()
    base_name = os.path.basename(source_path)
    if ext == "pdf":
        # Only the extracted text is written for PDFs, so the output must not
        # claim to be a PDF.
        base_name = os.path.splitext(base_name)[0] + ".txt"

    # A fresh directory per call keeps concurrent or repeated uploads with the
    # same file name from overwriting each other's output.
    # NOTE: the directory is not cleaned up here.
    out_dir = tempfile.mkdtemp(prefix="pii_redacted_")
    redacted_file_path = os.path.join(out_dir, f"redacted_{base_name}")

    if ext == "docx":
        # NOTE: only body paragraphs are visited, mirroring process_docx().
        doc = Document(source_path)
        for para in doc.paragraphs:
            _redact_in_place(para, selected_entities, redaction_method)
        doc.save(redacted_file_path)
    elif ext == "pptx":
        ppt = Presentation(source_path)
        for slide in ppt.slides:
            for shape in slide.shapes:
                if hasattr(shape, "text"):
                    _redact_in_place(shape, selected_entities, redaction_method)
        ppt.save(redacted_file_path)
    else:
        with open(redacted_file_path, "w", encoding="utf-8") as f:
            f.write(redacted_text)
    return redacted_file_path
def process_file(file, selected_entities, redaction_method):
    """Handle an upload: extract its text, redact the selected PII, and save the result.

    Args:
        file: The uploaded file from the Gradio file input, or ``None`` when
            nothing has been uploaded.
        selected_entities: Entity names chosen in the UI (empty means all).
        redaction_method: The redaction method chosen in the UI.

    Returns:
        A ``(redacted_text, redacted_file_path)`` tuple for the text box and
        the download component.

    Raises:
        gr.Error: If no file was uploaded.
    """
    if file is None:
        # Without this guard, clicking the button before uploading crashes
        # with an AttributeError on ``None.name``.
        raise gr.Error("Please upload a document before redacting.")

    text = read_document(file)
    redacted_text = redact_text(text, selected_entities, redaction_method)
    redacted_file_path = save_redacted_file(
        file, redacted_text, selected_entities, redaction_method
    )
    return redacted_text, redacted_file_path
def select_all_entities():
    """Return the full entity list, used to tick every checkbox in the selector."""
    every_entity = PII_ENTITIES
    return every_entity
def deselect_all_entities():
    """Return an empty selection, used to clear every checkbox in the selector."""
    return list()
# Styling for the element with id "redact_button" (the "Redact Document"
# button). The <style> wrapper is part of the string because it is injected
# into the page through gr.HTML() rather than passed as a stylesheet.
custom_css = """
<style>
#redact_button {
/*background-color: #E691FF !important;*/
color: #4B23C0;
}
</style>
"""
# Gradio UI
# NOTE(review): indentation was lost in this copy of the file, so which
# statements belong inside each `with` block cannot be determined from here.
with gr.Blocks() as app:
# Page banner (raw HTML, so sanitizing is switched off).
gr.Markdown(
"""
<div style="
background-color: #4B23C0;
color: white;
padding: 20px;
text-align: left;
font-size: 24px;
font-weight: bold;
margin: 0;
border-radius: 4px; /* Rounded edges */
">
🔒 PII Remover &nbsp;-&nbsp; Secure Document Redaction Tool
</div>
""",
sanitize_html=False
)
# Demo warning shown in red under the banner.
gr.Markdown(
"<div style='text-align: center; font-size: 24px; font-weight: bold; color: red;'>"
"⚠️ THIS IS A DEMONSTRATION. DO NOT UPLOAD SENSITIVE DOCUMENTS. ⚠️"
"</div>",
sanitize_html=False
)
# Short usage instructions.
gr.Markdown("Upload a **TXT, DOCX, PPTX, or PDF** file to remove **Personal Identifiable Information (PII)** while keeping formatting.")
# Load CSS
gr.HTML(custom_css)
# Inputs: the document to process and the entity types to look for.
with gr.Row():
file_input = gr.File(label="Upload Document (PDF, DOCX, PPTX, TXT)")
entity_selector = gr.CheckboxGroup(PII_ENTITIES, label="Select PII Entities to Redact (Leave blank to redact all)")
# Convenience buttons that fill or clear the entity checkbox group.
with gr.Row():
select_all_button = gr.Button("Select All")
deselect_all_button = gr.Button("Deselect All")
# The choices are the same names that key REDACTION_METHODS; "Redact" is preselected.
redaction_method = gr.Radio(
["Remove", "Redact", "Replace", "Mask", "Hash", "Encrypt"],
label="Redaction Method",
value="Redact"
)
# elem_id matches the "#redact_button" selector in custom_css.
process_button = gr.Button("Redact Document", elem_id="redact_button")
# Outputs: the redacted text preview and the downloadable redacted file.
output_text = gr.Textbox(label="Redacted Text", lines=10)
download_button = gr.File(label="Download Redacted File")
# Button Actions
# The select/deselect handlers return the new value for the checkbox group.
select_all_button.click(fn=select_all_entities, outputs=entity_selector)
deselect_all_button.click(fn=deselect_all_entities, outputs=entity_selector)
# process_file returns (redacted_text, redacted_file_path), matching the two outputs.
process_button.click(fn=process_file, inputs=[file_input, entity_selector, redaction_method], outputs=[output_text, download_button])
# Start the Gradio app.
app.launch()