Spaces:

MOPAC-DS
/

PII_Remover

Runtime error

File size: 7,465 Bytes

import gradio as gr
import pdfplumber
import io
from docx import Document
from pptx import Presentation
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine, OperatorConfig
from cryptography.fernet import Fernet
import hashlib
import tempfile
import os

# Initialize Presidio engines
# Both engines are created once at import time and shared by every call to
# redact_text below.
analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()

# Generate encryption key (This should be securely stored and retrieved for real-world use)
# NOTE(review): a fresh key is generated on every import and only held in this
# module's memory; nothing visible here persists it, so values produced by
# encrypt_pii presumably cannot be decrypted after the process exits.
encryption_key = Fernet.generate_key()
fernet = Fernet(encryption_key)

# Microsoft Presidio Global + UK PII Entity List
# Used as the choices of the entity checkbox group and as the value returned
# by select_all_entities.
PII_ENTITIES = [
    "CREDIT_CARD", "CRYPTO", "DATE_TIME", "EMAIL_ADDRESS", "IBAN_CODE", "IP_ADDRESS",
    "NRP", "LOCATION", "PERSON", "PHONE_NUMBER", "MEDICAL_LICENSE", "URL", "UK_NHS", "UK_NINO"
]

# Maps the redaction method names offered in the UI to anonymizer operator
# configurations.
# - "Hash" and "Encrypt" are plain-string placeholders, not OperatorConfig
#   objects: redact_text handles those two methods itself via hash_pii and
#   encrypt_pii instead of looking them up here.
# - redact_text also builds its own OperatorConfig("replace") for "Replace",
#   so the "Replace" entry below is not read by that function.
REDACTION_METHODS = {
    "Remove": OperatorConfig("redact"),
    "Redact": OperatorConfig("replace", {"new_value": "[REDACTED]"}),
    "Replace": OperatorConfig("replace", {"new_value": ""}),
    "Mask": OperatorConfig("mask", {"chars_to_mask": 4, "masking_char": "*", "from_end": False}),
    "Hash": "hash",
    "Encrypt": "encrypt",
}

def hash_pii(text):
    """Return the hexadecimal SHA-256 digest of *text* (encoded as UTF-8)."""
    digest = hashlib.sha256()
    digest.update(text.encode("utf-8"))
    return digest.hexdigest()

def encrypt_pii(text):
    """Encrypt *text* with the module-level Fernet instance.

    The input is encoded as UTF-8 before encryption and the resulting token
    bytes are decoded back to ``str`` so they can be embedded in text.
    """
    plaintext = text.encode("utf-8")
    token = fernet.encrypt(plaintext)
    return token.decode("utf-8")

def _transform_spans(text, results, transform):
    """Return *text* with each detected span replaced by ``transform(span)``.

    Overlapping detections are merged into their union first, so no part of
    any detected span is left untouched and no character is transformed twice.
    """
    merged = []
    for start, end in sorted((r.start, r.end) for r in results):
        if merged and start < merged[-1][1]:
            # Overlaps the previous span: grow it instead of adding a new one.
            merged[-1][1] = max(merged[-1][1], end)
        else:
            merged.append([start, end])

    pieces = []
    cursor = 0
    for start, end in merged:
        pieces.append(text[cursor:start])
        pieces.append(transform(text[start:end]))
        cursor = end
    pieces.append(text[cursor:])
    return "".join(pieces)


def redact_text(text, selected_entities, redaction_method):
    """Identify and redact selected PII types based on the chosen method.

    Args:
        text: The text to scan.
        selected_entities: Entity type names to look for; an empty or ``None``
            value means "detect every supported entity".
        redaction_method: One of the keys of ``REDACTION_METHODS``.

    Returns:
        The text with every detected PII span transformed by the method.

    Raises:
        KeyError: If ``redaction_method`` is not a known method name.
    """
    if not text:
        # Nothing to analyze; avoids a pointless analyzer round trip.
        return text

    selected_entities = selected_entities or None  # If empty, redact all entities
    results = analyzer.analyze(text=text, entities=selected_entities, language="en")

    # Analyzer results only carry offsets (start/end), not the matched text,
    # so the PII substring has to be sliced out of the original string.
    # Replacing by offset also avoids touching unrelated identical substrings.
    if redaction_method == "Hash":
        return _transform_spans(text, results, hash_pii)
    if redaction_method == "Encrypt":
        return _transform_spans(text, results, encrypt_pii)

    if redaction_method == "Replace":
        # "replace" without a new_value lets the anonymizer pick its default
        # placeholder for each entity.
        operator = OperatorConfig("replace")
    else:
        operator = REDACTION_METHODS[redaction_method]

    operators = {result.entity_type: operator for result in results}
    anonymized = anonymizer.anonymize(text=text, analyzer_results=results, operators=operators)
    return anonymized.text

# Document Processing Functions

def process_pdf(file):
    """Extract the text of every page of the PDF at ``file.name``.

    Pages for which no text can be extracted contribute an empty string;
    page texts are joined with newlines.
    """
    page_texts = []
    with pdfplumber.open(file.name) as pdf:
        for page in pdf.pages:
            extracted = page.extract_text()
            page_texts.append(extracted if extracted else "")
    return "\n".join(page_texts)

def process_docx(file):
    """Return the text of all body paragraphs of the DOCX at ``file.name``, newline-joined."""
    document = Document(file.name)
    return "\n".join(paragraph.text for paragraph in document.paragraphs)

def process_pptx(file):
    """Return the text of every text-bearing shape in the PPTX at ``file.name``.

    Shapes are visited slide by slide; shapes without a ``text`` attribute are
    skipped. The collected texts are joined with newlines.
    """
    presentation = Presentation(file.name)
    shape_texts = []
    for slide in presentation.slides:
        for shape in slide.shapes:
            if hasattr(shape, "text"):
                shape_texts.append(shape.text)
    return "\n".join(shape_texts)

def process_txt(file):
    """Return the UTF-8 decoded contents of an uploaded plain-text file.

    Args:
        file: Either an object exposing the file's path as ``.name`` (what the
            other ``process_*`` readers rely on), a plain path, or a readable
            file-like object.

    Returns:
        The file contents as ``str``.

    Raises:
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    # Prefer reading from the path: the upload value is not guaranteed to have
    # a .read() method, while every other reader in this file already depends
    # on .name being a usable path.
    path = getattr(file, "name", file)
    if isinstance(path, (str, os.PathLike)):
        # Binary mode + explicit decode keeps line endings exactly as stored.
        with open(path, "rb") as handle:
            return handle.read().decode("utf-8")

    # Fallback for in-memory / already-open file objects.
    raw = file.read()
    return raw.decode("utf-8") if isinstance(raw, bytes) else raw

def read_document(file):
    """Extract plain text from an uploaded document.

    The reader is chosen from the (case-insensitive) text after the last dot
    in ``file.name``; anything other than pdf/docx/pptx is read as plain text.
    """
    extension = file.name.rsplit(".", 1)[-1].lower()
    readers = {
        "pdf": process_pdf,
        "docx": process_docx,
        "pptx": process_pptx,
    }
    reader = readers.get(extension, process_txt)
    return reader(file)

def save_redacted_file(original_file, redacted_text, selected_entities, redaction_method):
    """Write a redacted copy of the uploaded document and return its path.

    DOCX and PPTX files are re-opened and redacted in place (paragraph by
    paragraph / shape by shape) so the document structure is kept. Every other
    type is written out as the already redacted plain text.

    Args:
        original_file: Upload object whose ``.name`` is the source file path.
        redacted_text: Redacted plain text of the whole document; only used
            for types that are not rebuilt in their native format.
        selected_entities: Entity types passed through to ``redact_text``.
        redaction_method: Redaction method passed through to ``redact_text``.

    Returns:
        Path of the newly written file named ``redacted_<original name>``
        (with a ``.txt`` extension for PDF input).
    """
    source_path = original_file.name
    base_name = os.path.basename(source_path)
    ext = base_name.rsplit(".", 1)[-1].lower() if "." in base_name else ""

    if ext == "pdf":
        # The PDF is not rebuilt; the output is plain text. Writing it under a
        # .pdf name would produce a file that PDF viewers cannot open.
        base_name = base_name[: -len(ext)] + "txt"

    # A fresh directory per call keeps same-named uploads from overwriting
    # each other's output in the shared temp dir.
    out_dir = tempfile.mkdtemp(prefix="pii_redacted_")
    redacted_file_path = os.path.join(out_dir, f"redacted_{base_name}")

    if ext == "docx":
        doc = Document(source_path)
        for para in doc.paragraphs:
            original = para.text
            if not original.strip():
                continue  # nothing to redact
            redacted = redact_text(original, selected_entities, redaction_method)
            # Only rewrite paragraphs that actually changed: assigning .text
            # replaces the paragraph's content, which would needlessly discard
            # run-level formatting of untouched paragraphs.
            if redacted != original:
                para.text = redacted
        doc.save(redacted_file_path)
    elif ext == "pptx":
        ppt = Presentation(source_path)
        for slide in ppt.slides:
            for shape in slide.shapes:
                if not hasattr(shape, "text"):
                    continue
                original = shape.text
                if not original.strip():
                    continue
                redacted = redact_text(original, selected_entities, redaction_method)
                if redacted != original:
                    shape.text = redacted  # Apply the redacted text back to the shape
        ppt.save(redacted_file_path)
    else:
        with open(redacted_file_path, "w", encoding="utf-8") as f:
            f.write(redacted_text)

    return redacted_file_path


def process_file(file, selected_entities, redaction_method):
    """Handle a file upload, redact the selected PII and return the results.

    Args:
        file: Uploaded file object (exposes the path as ``.name``), or ``None``
            when the action is triggered without an upload.
        selected_entities: Entity types to redact; empty means all.
        redaction_method: Name of the redaction method chosen in the UI.

    Returns:
        A ``(text, path)`` tuple: the redacted text preview and the path of
        the redacted file. When no file was provided, ``text`` is a short
        notice and ``path`` is ``None``.
    """
    if file is None:
        # Triggered without an upload: report it instead of failing on
        # ``None.name`` further down.
        return "No file uploaded. Please upload a document first.", None

    text = read_document(file)
    redacted_text = redact_text(text, selected_entities, redaction_method)
    redacted_file_path = save_redacted_file(file, redacted_text, selected_entities, redaction_method)

    return redacted_text, redacted_file_path


def select_all_entities() -> list:
    """Return the full list of supported PII entity names (the module-level list itself)."""
    return PII_ENTITIES

def deselect_all_entities() -> list:
    """Return a new empty selection, i.e. no PII entity names chosen."""
    return list()

# Raw <style> markup that is handed to gr.HTML inside the Blocks layout.
# The only rule targets the element with id "redact_button" (the elem_id of
# the "Redact Document" button) and sets its text colour; the background
# override is left commented out inside the CSS.
custom_css = """

<style>

    #redact_button {

        /*background-color: #E691FF !important;*/

        color: #4B23C0;

    }

</style>

"""

# Gradio UI
with gr.Blocks() as app:

    # Title banner (purple bar, rounded edges). The markup is assembled as a
    # single string without blank lines: in Markdown a blank line terminates a
    # raw HTML block, so a <div> whose attributes are spread across blank
    # lines is not rendered as one HTML element.
    gr.Markdown(
        "<div style='background-color: #4B23C0; color: white; padding: 20px; "
        "text-align: left; font-size: 24px; font-weight: bold; margin: 0; "
        "border-radius: 4px;'>"
        "🔒 PII Remover &nbsp;-&nbsp; Secure Document Redaction Tool"
        "</div>",
        sanitize_html=False
    )

    # Prominent demo warning.
    gr.Markdown(
        "<div style='text-align: center; font-size: 24px; font-weight: bold; color: red;'>"
        "⚠️ THIS IS A DEMONSTRATION. DO NOT UPLOAD SENSITIVE DOCUMENTS. ⚠️"
        "</div>",
        sanitize_html=False
    )

    gr.Markdown("Upload a **TXT, DOCX, PPTX, or PDF** file to remove **Personal Identifiable Information (PII)** while keeping formatting.")

    # Load CSS
    gr.HTML(custom_css)

    # Inputs: the document, the entity types to look for, and the method.
    with gr.Row():
        file_input = gr.File(label="Upload Document (PDF, DOCX, PPTX, TXT)")

    entity_selector = gr.CheckboxGroup(PII_ENTITIES, label="Select PII Entities to Redact (Leave blank to redact all)")

    with gr.Row():
        select_all_button = gr.Button("Select All")
        deselect_all_button = gr.Button("Deselect All")

    redaction_method = gr.Radio(
        ["Remove", "Redact", "Replace", "Mask", "Hash", "Encrypt"],
        label="Redaction Method",
        value="Redact"
    )

    process_button = gr.Button("Redact Document", elem_id="redact_button")

    # Outputs: a text preview and the downloadable redacted file.
    output_text = gr.Textbox(label="Redacted Text", lines=10)
    download_button = gr.File(label="Download Redacted File")

    # Button Actions
    select_all_button.click(fn=select_all_entities, outputs=entity_selector)
    deselect_all_button.click(fn=deselect_all_entities, outputs=entity_selector)

    process_button.click(fn=process_file, inputs=[file_input, entity_selector, redaction_method], outputs=[output_text, download_button])

# Start the server only when executed as a script, so importing this module
# (e.g. to reuse redact_text) does not launch the web app as a side effect.
if __name__ == "__main__":
    app.launch()