Spaces:
Runtime error
Runtime error
File size: 7,465 Bytes
8cb0bcb 2301910 8cb0bcb 2301910 8cb0bcb 2301910 8cb0bcb 2301910 8cb0bcb af9438a 8cb0bcb af9438a 8cb0bcb af9438a 8cb0bcb af9438a 8cb0bcb af9438a 8cb0bcb af9438a 2301910 8cb0bcb af9438a 8cb0bcb af9438a 8cb0bcb 2301910 8cb0bcb 2301910 8cb0bcb 2301910 8cb0bcb 2301910 8cb0bcb 2301910 8cb0bcb 2301910 8cb0bcb 2301910 8cb0bcb 2301910 8cb0bcb 2301910 8cb0bcb 2301910 8cb0bcb 2301910 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 |
import gradio as gr
import pdfplumber
import io
from docx import Document
from pptx import Presentation
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine, OperatorConfig
from cryptography.fernet import Fernet
import hashlib
import tempfile
import os
# Shared Presidio engines, created once at import time: `analyzer` locates
# PII spans in text and `anonymizer` rewrites the spans it is given.
analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()
# Key for the "Encrypt" redaction method. It is generated afresh on every
# start and is not persisted anywhere in this file, so encrypted values can
# no longer be decrypted once the process exits. For real-world use the key
# should be securely stored and retrieved instead.
encryption_key = Fernet.generate_key()
fernet = Fernet(encryption_key)
# Microsoft Presidio Global + UK PII Entity List
# (also the set of choices offered by the entity checkbox group in the UI).
PII_ENTITIES = [
    "CREDIT_CARD",
    "CRYPTO",
    "DATE_TIME",
    "EMAIL_ADDRESS",
    "IBAN_CODE",
    "IP_ADDRESS",
    "NRP",
    "LOCATION",
    "PERSON",
    "PHONE_NUMBER",
    "MEDICAL_LICENSE",
    "URL",
    "UK_NHS",
    "UK_NINO",
]

# Maps the method names shown in the UI to the Presidio operator that
# implements them. The keys must match the choices of the method selector.
REDACTION_METHODS = {
    # Presidio "redact" operator: the detected span is removed from the text.
    "Remove": OperatorConfig("redact"),
    # Every detected span becomes the fixed marker "[REDACTED]".
    "Redact": OperatorConfig("replace", {"new_value": "[REDACTED]"}),
    # NOTE(review): redact_text builds its own OperatorConfig("replace") for
    # this method, so this entry does not appear to be consulted - confirm.
    "Replace": OperatorConfig("replace", {"new_value": ""}),
    # Overwrite 4 characters of each span with "*", counted from the start.
    "Mask": OperatorConfig(
        "mask",
        {"chars_to_mask": 4, "masking_char": "*", "from_end": False},
    ),
    # Plain-string markers, not operator configs: redact_text handles these
    # two methods itself through hash_pii / encrypt_pii.
    "Hash": "hash",
    "Encrypt": "encrypt",
}
def hash_pii(text):
    """Return the hexadecimal SHA-256 digest of *text* (encoded as UTF-8)."""
    digest = hashlib.sha256()
    digest.update(text.encode("utf-8"))
    return digest.hexdigest()
def encrypt_pii(text):
    """Encrypt *text* with the module-level Fernet key.

    The text is encoded as UTF-8 before encryption and the resulting token
    is returned decoded as a UTF-8 string.
    """
    plaintext = text.encode("utf-8")
    token = fernet.encrypt(plaintext)
    return token.decode("utf-8")
def _replace_spans(text, results, transform):
    """Return *text* with every detected span replaced by ``transform(span)``.

    ``results`` only need ``start`` and ``end`` offsets. Overlapping spans
    are merged first so each character is replaced at most once and no part
    of an overlapping detection is left behind in clear text.
    """
    merged = []
    for start, end in sorted((res.start, res.end) for res in results):
        if merged and start < merged[-1][1]:
            merged[-1][1] = max(merged[-1][1], end)
        else:
            merged.append([start, end])

    pieces = []
    cursor = 0
    for start, end in merged:
        pieces.append(text[cursor:start])
        pieces.append(transform(text[start:end]))
        cursor = end
    pieces.append(text[cursor:])
    return "".join(pieces)


def redact_text(text, selected_entities, redaction_method):
    """Identify and redact selected PII types based on the chosen method.

    Args:
        text: The text to scan. Empty or ``None`` input yields ``""``
            without running the analyzer.
        selected_entities: Entity names to look for. An empty selection is
            passed to the analyzer as ``None`` so that all entities are
            redacted.
        redaction_method: One of the ``REDACTION_METHODS`` keys.

    Returns:
        The text with every detected span rewritten by the chosen method.

    Raises:
        KeyError: If ``redaction_method`` is not a known method name.
    """
    if not text:
        return ""

    selected_entities = selected_entities or None  # If empty, redact all entities
    results = analyzer.analyze(text=text, entities=selected_entities, language="en")

    if redaction_method in ("Hash", "Encrypt"):
        # Analyzer results carry offsets (start/end), not the matched text,
        # so the PII value has to be sliced out of the input by position.
        transform = hash_pii if redaction_method == "Hash" else encrypt_pii
        return _replace_spans(text, results, transform)

    if redaction_method == "Replace":
        # No explicit new_value: the anonymizer picks the replacement itself.
        operator = OperatorConfig("replace")
    else:
        operator = REDACTION_METHODS[redaction_method]
    operators = {result.entity_type: operator for result in results}
    anonymized = anonymizer.anonymize(
        text=text, analyzer_results=results, operators=operators
    )
    return anonymized.text
# Document Processing Functions
def process_pdf(file):
    """Extract the text of every page of the PDF at ``file.name``.

    Pages for which no text is extracted contribute an empty string; the
    page texts are joined with newlines.
    """
    page_texts = []
    with pdfplumber.open(file.name) as document:
        for page in document.pages:
            extracted = page.extract_text()
            page_texts.append(extracted if extracted else "")
    return "\n".join(page_texts)
def process_docx(file):
    """Return the text of the paragraphs of the DOCX at ``file.name``.

    Only the paragraphs exposed by ``Document.paragraphs`` are read; their
    texts are joined with newlines.
    """
    document = Document(file.name)
    paragraph_texts = []
    for paragraph in document.paragraphs:
        paragraph_texts.append(paragraph.text)
    return "\n".join(paragraph_texts)
def process_pptx(file):
    """Return the text of all text-bearing shapes in the PPTX at ``file.name``.

    Slides are visited in order; shapes without a ``text`` attribute are
    skipped. The collected texts are joined with newlines.
    """
    deck = Presentation(file.name)
    shape_texts = []
    for slide in deck.slides:
        for shape in slide.shapes:
            if not hasattr(shape, "text"):
                continue
            shape_texts.append(shape.text)
    return "\n".join(shape_texts)
def process_txt(file):
    """Return the contents of a plain-text upload decoded as UTF-8.

    Like the other readers, the file is read from the path in ``file.name``
    when that path exists. This works for upload objects that only carry a
    path and does not depend on the position of an already-open handle.
    Objects without a usable path fall back to ``file.read()``.

    Raises:
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    path = getattr(file, "name", None)
    if isinstance(path, str) and os.path.isfile(path):
        with open(path, "rb") as handle:
            raw = handle.read()
    else:
        raw = file.read()
    return raw.decode("utf-8")
def read_document(file):
    """Extract plain text from an upload, choosing the reader by extension.

    The extension is whatever follows the last ``.`` in ``file.name``,
    lower-cased. Anything other than pdf, docx or pptx is read as text.
    """
    readers = {
        "pdf": process_pdf,
        "docx": process_docx,
        "pptx": process_pptx,
    }
    extension = file.name.split(".")[-1].lower()
    reader = readers.get(extension, process_txt)
    return reader(file)
def save_redacted_file(original_file, redacted_text, selected_entities, redaction_method):
    """Write a redacted copy of the upload to a temporary file.

    DOCX and PPTX uploads are re-opened from ``original_file.name`` and
    redacted in place, paragraph by paragraph or shape by shape, so the
    result is still an Office document. Every other type is written out as
    the already-redacted plain text in ``redacted_text``.

    Only ``doc.paragraphs`` and shapes with a ``text`` attribute are
    visited. NOTE(review): text held elsewhere (e.g. tables, headers) is
    presumably left untouched - confirm before relying on this for real data.

    Args:
        original_file: Upload object whose ``name`` is the uploaded path.
        redacted_text: Redacted plain text, used for non-Office files.
        selected_entities: Entity selection forwarded to ``redact_text``.
        redaction_method: Method name forwarded to ``redact_text``.

    Returns:
        Path of the redacted file, ``redacted_<original name>`` inside a
        freshly created temporary directory. For PDF uploads the extension
        is changed to ``.txt`` because the content written is plain text.
    """
    ext = original_file.name.split(".")[-1].lower()
    safe_filename = f"redacted_{os.path.basename(original_file.name)}"
    if ext == "pdf":
        # The output is extracted text, not a PDF; a .pdf name would hand
        # the user a file that no PDF viewer can open.
        safe_filename = os.path.splitext(safe_filename)[0] + ".txt"

    # A private directory per call keeps same-named uploads from different
    # requests from overwriting each other's output.
    output_dir = tempfile.mkdtemp(prefix="redacted_")
    redacted_file_path = os.path.join(output_dir, safe_filename)

    if ext == "docx":
        doc = Document(original_file.name)
        for para in doc.paragraphs:
            original = para.text
            redacted = redact_text(original, selected_entities, redaction_method)
            # Assigning para.text rebuilds the paragraph as a single run,
            # so only touch paragraphs that actually changed.
            if redacted != original:
                para.text = redacted
        doc.save(redacted_file_path)
    elif ext == "pptx":
        ppt = Presentation(original_file.name)
        for slide in ppt.slides:
            for shape in slide.shapes:
                if not hasattr(shape, "text"):
                    continue
                original = shape.text
                redacted = redact_text(original, selected_entities, redaction_method)
                # Same reasoning as for DOCX: leave untouched shapes alone.
                if redacted != original:
                    shape.text = redacted
        ppt.save(redacted_file_path)
    else:
        with open(redacted_file_path, "w", encoding="utf-8") as f:
            f.write(redacted_text)
    return redacted_file_path
def process_file(file, selected_entities, redaction_method):
    """Handle a file upload, redact selected PII, and return the results.

    Args:
        file: The uploaded file object, or ``None`` when nothing has been
            uploaded yet.
        selected_entities: Entity names chosen in the UI.
        redaction_method: Redaction method name chosen in the UI.

    Returns:
        A ``(redacted_text, redacted_file_path)`` tuple. When no file was
        uploaded, a short prompt and ``None`` are returned instead of
        failing on the missing upload.
    """
    if file is None:
        return "Please upload a document first.", None

    text = read_document(file)
    redacted_text = redact_text(text, selected_entities, redaction_method)
    redacted_file_path = save_redacted_file(
        file, redacted_text, selected_entities, redaction_method
    )
    return redacted_text, redacted_file_path
def select_all_entities():
    """Return the full ``PII_ENTITIES`` list as the new checkbox selection."""
    every_entity = PII_ENTITIES
    return every_entity
def deselect_all_entities():
    """Return a new empty list, i.e. a selection with no entity ticked."""
    nothing_selected = list()
    return nothing_selected
# Inline <style> block that the UI injects through gr.HTML. It sets the text
# colour of the element with id "redact_button" (the elem_id given to the
# "Redact Document" button); the background rule is commented out in the CSS.
custom_css = """
<style>
#redact_button {
/*background-color: #E691FF !important;*/
color: #4B23C0;
}
</style>
"""
# Gradio UI
# Static markup is kept in private constants so the layout below reads as a
# plain list of components.
_BANNER_HTML = """
<div style="
background-color: #4B23C0;
color: white;
padding: 20px;
text-align: left;
font-size: 24px;
font-weight: bold;
margin: 0;
border-radius: 4px; /* Rounded edges */
">
🔒 PII Remover - Secure Document Redaction Tool
</div>
"""
_DEMO_WARNING_HTML = (
    "<div style='text-align: center; font-size: 24px; font-weight: bold; color: red;'>"
    "⚠️ THIS IS A DEMONSTRATION. DO NOT UPLOAD SENSITIVE DOCUMENTS. ⚠️"
    "</div>"
)
_INTRO_MARKDOWN = "Upload a **TXT, DOCX, PPTX, or PDF** file to remove **Personal Identifiable Information (PII)** while keeping formatting."
_METHOD_CHOICES = ["Remove", "Redact", "Replace", "Mask", "Hash", "Encrypt"]

# NOTE(review): the source indentation was lost, so which components sit
# inside each gr.Row() is reconstructed here - confirm against the live app.
with gr.Blocks() as app:
    gr.Markdown(_BANNER_HTML, sanitize_html=False)
    gr.Markdown(_DEMO_WARNING_HTML, sanitize_html=False)
    gr.Markdown(_INTRO_MARKDOWN)

    # Load CSS
    gr.HTML(custom_css)

    # Inputs: the document and the entity types to look for.
    with gr.Row():
        file_input = gr.File(label="Upload Document (PDF, DOCX, PPTX, TXT)")
        entity_selector = gr.CheckboxGroup(
            PII_ENTITIES,
            label="Select PII Entities to Redact (Leave blank to redact all)",
        )

    with gr.Row():
        select_all_button = gr.Button("Select All")
        deselect_all_button = gr.Button("Deselect All")

    redaction_method = gr.Radio(
        _METHOD_CHOICES,
        label="Redaction Method",
        value="Redact",
    )
    process_button = gr.Button("Redact Document", elem_id="redact_button")

    # Outputs: a text preview and the downloadable redacted file.
    output_text = gr.Textbox(label="Redacted Text", lines=10)
    download_button = gr.File(label="Download Redacted File")

    # Button Actions
    select_all_button.click(fn=select_all_entities, outputs=entity_selector)
    deselect_all_button.click(fn=deselect_all_entities, outputs=entity_selector)
    process_button.click(
        fn=process_file,
        inputs=[file_input, entity_selector, redaction_method],
        outputs=[output_text, download_button],
    )

app.launch()
|