import gradio as gr import pdfplumber import io from docx import Document from pptx import Presentation from presidio_analyzer import AnalyzerEngine from presidio_anonymizer import AnonymizerEngine, OperatorConfig from cryptography.fernet import Fernet import hashlib import tempfile import os # Initialize Presidio engines analyzer = AnalyzerEngine() anonymizer = AnonymizerEngine() # Generate encryption key (This should be securely stored and retrieved for real-world use) encryption_key = Fernet.generate_key() fernet = Fernet(encryption_key) # Microsoft Presidio Global + UK PII Entity List PII_ENTITIES = [ "CREDIT_CARD", "CRYPTO", "DATE_TIME", "EMAIL_ADDRESS", "IBAN_CODE", "IP_ADDRESS", "NRP", "LOCATION", "PERSON", "PHONE_NUMBER", "MEDICAL_LICENSE", "URL", "UK_NHS", "UK_NINO" ] REDACTION_METHODS = { "Remove": OperatorConfig("redact"), "Redact": OperatorConfig("replace", {"new_value": "[REDACTED]"}), "Replace": OperatorConfig("replace", {"new_value": ""}), "Mask": OperatorConfig("mask", {"chars_to_mask": 4, "masking_char": "*", "from_end": False}), "Hash": "hash", "Encrypt": "encrypt", } def hash_pii(text): return hashlib.sha256(text.encode("utf-8")).hexdigest() def encrypt_pii(text): return fernet.encrypt(text.encode("utf-8")).decode("utf-8") def redact_text(text, selected_entities, redaction_method): """Identifies and redacts selected PII types based on the chosen method.""" selected_entities = selected_entities or None # If empty, redact all entities results = analyzer.analyze(text=text, entities=selected_entities, language="en") if redaction_method == "Hash": for result in results: text = text.replace(result.text, hash_pii(result.text)) elif redaction_method == "Encrypt": for result in results: text = text.replace(result.text, encrypt_pii(result.text)) elif redaction_method == "Replace": operators = {entity.entity_type: OperatorConfig("replace") for entity in results} text = anonymizer.anonymize(text=text, analyzer_results=results, operators=operators).text else: operators = {entity.entity_type: REDACTION_METHODS[redaction_method] for entity in results} text = anonymizer.anonymize(text=text, analyzer_results=results, operators=operators).text return text # Document Processing Functions def process_pdf(file): with pdfplumber.open(file.name) as pdf: pages = [page.extract_text() or "" for page in pdf.pages] return "\n".join(pages) def process_docx(file): doc = Document(file.name) return "\n".join([para.text for para in doc.paragraphs]) def process_pptx(file): ppt = Presentation(file.name) return "\n".join([shape.text for slide in ppt.slides for shape in slide.shapes if hasattr(shape, "text")]) def process_txt(file): return file.read().decode('utf-8') def read_document(file): ext = file.name.split(".")[-1].lower() if ext == "pdf": return process_pdf(file) elif ext == "docx": return process_docx(file) elif ext == "pptx": return process_pptx(file) else: return process_txt(file) def save_redacted_file(original_file, redacted_text, selected_entities, redaction_method): ext = original_file.name.split(".")[-1].lower() temp_dir = tempfile.gettempdir() safe_filename = f"redacted_{os.path.basename(original_file.name)}" redacted_file_path = os.path.join(temp_dir, safe_filename) if ext == "docx": doc = Document(original_file.name) for para in doc.paragraphs: para.text = redact_text(para.text, selected_entities, redaction_method) # Use redaction_method passed from UI doc.save(redacted_file_path) elif ext == "pptx": ppt = Presentation(original_file.name) # Loop through each slide in the original PPTX and add redacted text for slide_num, slide in enumerate(ppt.slides): # Loop through all shapes on the slide for shape in slide.shapes: if hasattr(shape, "text"): # Redact the text inside the shape redacted_text_in_shape = redact_text(shape.text, selected_entities, redaction_method) shape.text = redacted_text_in_shape # Apply the redacted text back to the shape ppt.save(redacted_file_path) else: with open(redacted_file_path, "w", encoding="utf-8") as f: f.write(redacted_text) return redacted_file_path def process_file(file, selected_entities, redaction_method): """Handles file upload, redacts selected PII, and returns redacted file.""" text = read_document(file) redacted_text = redact_text(text, selected_entities, redaction_method) redacted_file_path = save_redacted_file(file, redacted_text, selected_entities, redaction_method) return redacted_text, redacted_file_path # Returning only a valid file path def select_all_entities(): return PII_ENTITIES def deselect_all_entities(): return [] custom_css = """ """ # Gradio UI with gr.Blocks() as app: gr.Markdown( """