File size: 7,465 Bytes
8cb0bcb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2301910
8cb0bcb
2301910
8cb0bcb
 
 
 
 
 
 
 
 
 
 
 
2301910
 
8cb0bcb
 
 
 
 
 
 
 
2301910
 
 
8cb0bcb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
af9438a
8cb0bcb
af9438a
8cb0bcb
 
af9438a
 
 
 
 
 
8cb0bcb
af9438a
 
 
 
 
 
 
 
 
 
 
8cb0bcb
af9438a
8cb0bcb
 
af9438a
 
2301910
8cb0bcb
 
af9438a
8cb0bcb
 
af9438a
 
 
 
8cb0bcb
 
 
 
 
 
 
 
 
 
2301910
8cb0bcb
 
 
 
 
 
 
2301910
8cb0bcb
 
2301910
 
 
 
 
 
 
 
 
 
8cb0bcb
 
 
 
 
2301910
 
 
 
 
 
 
 
 
8cb0bcb
2301910
 
8cb0bcb
 
 
 
2301910
 
 
8cb0bcb
 
 
2301910
 
 
 
 
 
 
8cb0bcb
2301910
8cb0bcb
 
2301910
 
8cb0bcb
 
 
2301910
 
8cb0bcb
2301910
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
import gradio as gr
import pdfplumber
import io
from docx import Document
from pptx import Presentation
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine, OperatorConfig
from cryptography.fernet import Fernet
import hashlib
import tempfile
import os

# Initialize Presidio engines
# Both engines are created once at import time and shared by every call to
# redact_text(): `analyzer` finds PII spans, `anonymizer` rewrites them.
analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()

# Generate encryption key (This should be securely stored and retrieved for real-world use)
# NOTE: a fresh key is generated every time this module is executed and it is
# not written anywhere in this file, so tokens produced by encrypt_pii() can
# only be decrypted while this process (and this `fernet` object) is alive.
encryption_key = Fernet.generate_key()
fernet = Fernet(encryption_key)

# Microsoft Presidio Global + UK PII Entity List
# These names are offered as checkboxes in the UI and passed to
# analyzer.analyze(entities=...) by redact_text().
PII_ENTITIES = [
    "CREDIT_CARD", "CRYPTO", "DATE_TIME", "EMAIL_ADDRESS", "IBAN_CODE", "IP_ADDRESS",
    "NRP", "LOCATION", "PERSON", "PHONE_NUMBER", "MEDICAL_LICENSE", "URL", "UK_NHS", "UK_NINO"
]

# Maps the UI's "Redaction Method" choice to an anonymizer operator.
# redact_text() only looks up this table for the methods it does not
# special-case ("Remove", "Redact", "Mask"); it handles "Replace", "Hash" and
# "Encrypt" in dedicated branches, so those three entries are not read there.
REDACTION_METHODS = {
    # "redact" operator with no parameters.
    "Remove": OperatorConfig("redact"),
    # Substitute every finding with the literal "[REDACTED]".
    "Redact": OperatorConfig("replace", {"new_value": "[REDACTED]"}),
    # Not used by redact_text(), which builds OperatorConfig("replace") itself.
    "Replace": OperatorConfig("replace", {"new_value": ""}),
    # Mask 4 characters with "*", counted from the start (from_end=False).
    "Mask": OperatorConfig("mask", {"chars_to_mask": 4, "masking_char": "*", "from_end": False}),
    # Plain-string placeholders (not OperatorConfig objects); see hash_pii()
    # and encrypt_pii() for the actual transformations.
    "Hash": "hash",
    "Encrypt": "encrypt",
}

def hash_pii(text):
    """Return the hexadecimal SHA-256 digest of ``text`` encoded as UTF-8."""
    hasher = hashlib.sha256()
    hasher.update(text.encode("utf-8"))
    return hasher.hexdigest()

def encrypt_pii(text):
    """Encrypt ``text`` with the module-level ``fernet`` and return the token as a str.

    The input is encoded as UTF-8 before encryption and the resulting token
    bytes are decoded as UTF-8 for the return value.
    """
    plaintext = text.encode("utf-8")
    token = fernet.encrypt(plaintext)
    return token.decode("utf-8")

def redact_text(text, selected_entities, redaction_method):
    """Identify PII in ``text`` and anonymize it with the chosen method.

    Args:
        text: Plain text to scan. Empty or ``None`` input is returned as-is.
        selected_entities: Presidio entity names to look for. An empty list
            (or ``None``) is passed to the analyzer as ``None`` so that no
            entity filter is applied.
        redaction_method: ``"Hash"``, ``"Encrypt"``, ``"Replace"`` or any
            other key of ``REDACTION_METHODS``.

    Returns:
        The text with every detected span rewritten by the selected method.

    Raises:
        KeyError: if PII was found and ``redaction_method`` is neither one of
            the special-cased methods nor a key of ``REDACTION_METHODS``.
    """
    if not text:
        # Nothing to analyze (e.g. an empty paragraph or shape).
        return text

    selected_entities = selected_entities or None  # If empty, redact all entities
    results = analyzer.analyze(text=text, entities=selected_entities, language="en")
    if not results:
        return text

    # Analyzer results only carry offsets (start/end), not the matched text,
    # so hashing and encryption are delegated to the anonymizer's "custom"
    # operator, which hands each matched substring to the callable and takes
    # care of offsets and overlapping findings.
    if redaction_method == "Hash":
        operator = OperatorConfig("custom", {"lambda": hash_pii})
    elif redaction_method == "Encrypt":
        operator = OperatorConfig("custom", {"lambda": encrypt_pii})
    elif redaction_method == "Replace":
        # "replace" without parameters, deliberately not the
        # REDACTION_METHODS["Replace"] entry.
        operator = OperatorConfig("replace")
    else:
        operator = REDACTION_METHODS[redaction_method]

    operators = {result.entity_type: operator for result in results}
    anonymized = anonymizer.anonymize(
        text=text, analyzer_results=results, operators=operators
    )
    return anonymized.text

# Document Processing Functions

def process_pdf(file):
    """Extract the text of every page of the PDF at ``file.name``.

    Pages for which no text is extracted contribute an empty string; page
    texts are joined with newlines.
    """
    page_texts = []
    with pdfplumber.open(file.name) as pdf:
        for page in pdf.pages:
            extracted = page.extract_text()
            page_texts.append(extracted if extracted else "")
    return "\n".join(page_texts)

def process_docx(file):
    """Return the text of all body paragraphs of the DOCX at ``file.name``, newline-joined."""
    document = Document(file.name)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(paragraph_texts)

def process_pptx(file):
    """Return the text of every text-bearing shape in the PPTX at ``file.name``.

    Shapes without a ``text`` attribute are skipped; the remaining texts are
    joined with newlines in slide order.
    """
    presentation = Presentation(file.name)
    shape_texts = []
    for slide in presentation.slides:
        for shape in slide.shapes:
            if not hasattr(shape, "text"):
                continue
            shape_texts.append(shape.text)
    return "\n".join(shape_texts)

def process_txt(file):
    """Return the contents of an uploaded text file decoded as UTF-8.

    The other readers in this file treat the upload as an object whose
    ``name`` attribute is a path on disk, so this one does the same: when
    ``file.name`` points to an existing file it is read from disk. Objects
    without such a path (for example an in-memory binary stream) fall back
    to ``file.read()``.

    Args:
        file: Upload object exposing ``name`` (a filesystem path) and/or a
            binary ``read()`` method.

    Returns:
        The decoded text.

    Raises:
        UnicodeDecodeError: if the content is not valid UTF-8.
    """
    path = getattr(file, "name", None)
    if isinstance(path, str) and os.path.isfile(path):
        with open(path, "rb") as handle:
            raw = handle.read()
    else:
        raw = file.read()
    return raw.decode("utf-8")

def read_document(file):
    """Extract plain text from an upload, choosing the reader by file extension.

    The extension is whatever follows the last ``.`` in ``file.name``
    (case-insensitive). ``pdf``, ``docx`` and ``pptx`` have dedicated
    readers; everything else is treated as a text file.
    """
    suffix = file.name.rsplit(".", 1)[-1].lower()
    readers = {
        "pdf": process_pdf,
        "docx": process_docx,
        "pptx": process_pptx,
    }
    reader = readers.get(suffix, process_txt)
    return reader(file)

def save_redacted_file(original_file, redacted_text, selected_entities, redaction_method):
    """Write the redacted version of an upload to disk and return its path.

    DOCX and PPTX files are re-opened from ``original_file.name`` and redacted
    in place (paragraph by paragraph / shape by shape). Every other type is
    written as UTF-8 plain text from ``redacted_text``.

    Args:
        original_file: Upload object whose ``name`` is the source file path.
        redacted_text: Already-redacted text; used for plain-text output only.
        selected_entities: Entity filter forwarded to ``redact_text``.
        redaction_method: Redaction method forwarded to ``redact_text``.

    Returns:
        Path of the written file, named ``redacted_<original name>`` inside a
        freshly created temporary directory. PDF input is saved with a
        ``.txt`` extension because the output is plain text, not a PDF.
    """
    base_name = os.path.basename(original_file.name)
    ext = base_name.rsplit(".", 1)[-1].lower()

    if ext == "pdf":
        # The redacted output is extracted text; keeping ".pdf" would produce
        # a file that PDF viewers cannot open.
        output_name = f"redacted_{os.path.splitext(base_name)[0]}.txt"
    else:
        output_name = f"redacted_{base_name}"

    # A unique directory per call keeps concurrent uploads that share a file
    # name from overwriting each other's output.
    output_dir = tempfile.mkdtemp(prefix="pii_redacted_")
    redacted_file_path = os.path.join(output_dir, output_name)

    if ext == "docx":
        doc = Document(original_file.name)
        for para in doc.paragraphs:
            original = para.text
            updated = redact_text(original, selected_entities, redaction_method)
            # Only rewrite paragraphs that actually changed: assigning
            # para.text replaces the paragraph's runs, which would drop the
            # formatting of untouched paragraphs.
            if updated != original:
                para.text = updated
        # NOTE(review): only body paragraphs are visited; text in tables,
        # headers and footers is not redacted here - confirm whether needed.
        doc.save(redacted_file_path)
    elif ext == "pptx":
        ppt = Presentation(original_file.name)
        for slide in ppt.slides:
            for shape in slide.shapes:
                if not hasattr(shape, "text"):
                    continue
                original = shape.text
                updated = redact_text(original, selected_entities, redaction_method)
                if updated != original:
                    shape.text = updated
        ppt.save(redacted_file_path)
    else:
        with open(redacted_file_path, "w", encoding="utf-8") as f:
            f.write(redacted_text)

    return redacted_file_path


def process_file(file, selected_entities, redaction_method):
    """Handle an upload: extract its text, redact PII and save the result.

    Args:
        file: Uploaded file object from the UI, or ``None`` when the button
            is pressed before anything was uploaded.
        selected_entities: Entity names to redact (empty means all).
        redaction_method: Name of the redaction method chosen in the UI.

    Returns:
        A ``(redacted_text, redacted_file_path)`` tuple for the text box and
        the download component.

    Raises:
        gr.Error: if no file was uploaded.
    """
    if file is None:
        # Without this guard the first `file.name` access fails with an
        # AttributeError; gr.Error shows a readable message in the UI instead.
        raise gr.Error("Please upload a document before redacting.")

    text = read_document(file)
    redacted_text = redact_text(text, selected_entities, redaction_method)
    redacted_file_path = save_redacted_file(file, redacted_text, selected_entities, redaction_method)

    return redacted_text, redacted_file_path  # Returning only a valid file path


def select_all_entities():
    """Return the full ``PII_ENTITIES`` list (used as the "Select All" value)."""
    every_entity = PII_ENTITIES
    return every_entity

def deselect_all_entities():
    """Return an empty list (used as the "Deselect All" value)."""
    nothing_selected = list()
    return nothing_selected

# Raw <style> snippet injected into the page through gr.HTML(custom_css) in
# the UI block; it sets the text colour of the element with id "redact_button"
# (the background rule is commented out inside the CSS itself).
custom_css = """

<style>

    #redact_button {

        /*background-color: #E691FF !important;*/

        color: #4B23C0;

    }

</style>

"""

# Gradio UI
# Builds the page layout, wires the buttons to the handler functions of this
# module, and finally starts the app with app.launch().
with gr.Blocks() as app:
    
    # Title banner, written as inline-styled HTML (HTML sanitizing disabled).
    gr.Markdown(
        """

        <div style="

            background-color: #4B23C0; 

            color: white; 

            padding: 20px; 

            text-align: left; 

            font-size: 24px; 

            font-weight: bold; 

            margin: 0;

            border-radius: 4px; /* Rounded edges */

        ">

            🔒 PII Remover &nbsp;-&nbsp; Secure Document Redaction Tool

        </div>

        """,
        sanitize_html=False
    )
    
    # Prominent warning that this deployment is a demo only.
    gr.Markdown(
        "<div style='text-align: center; font-size: 24px; font-weight: bold; color: red;'>"
        "⚠️ THIS IS A DEMONSTRATION. DO NOT UPLOAD SENSITIVE DOCUMENTS. ⚠️"
        "</div>",
        sanitize_html=False
    )


    # Short usage instructions.
    gr.Markdown("Upload a **TXT, DOCX, PPTX, or PDF** file to remove **Personal Identifiable Information (PII)** while keeping formatting.")

    # Load CSS
    gr.HTML(custom_css)
    
    # Input: the document to redact.
    with gr.Row():
        file_input = gr.File(label="Upload Document (PDF, DOCX, PPTX, TXT)")
    
    # Input: which entity types to redact; an empty selection means "all".
    entity_selector = gr.CheckboxGroup(PII_ENTITIES, label="Select PII Entities to Redact (Leave blank to redact all)")
    
    with gr.Row():
        select_all_button = gr.Button("Select All")
        deselect_all_button = gr.Button("Deselect All")
    
    # Input: redaction method; the choices are the method names that
    # redact_text() understands. "Redact" is preselected.
    redaction_method = gr.Radio(
        ["Remove", "Redact", "Replace", "Mask", "Hash", "Encrypt"], 
        label="Redaction Method", 
        value="Redact"
    )
    
    # elem_id matches the "#redact_button" selector in custom_css.
    process_button = gr.Button("Redact Document", elem_id="redact_button")
    
    # Outputs: redacted text preview and the redacted file for download.
    output_text = gr.Textbox(label="Redacted Text", lines=10)
    download_button = gr.File(label="Download Redacted File")

    # Button Actions
    # The select/deselect handlers return the new value of the checkbox group.
    select_all_button.click(fn=select_all_entities, outputs=entity_selector)
    deselect_all_button.click(fn=deselect_all_entities, outputs=entity_selector)
    
    # process_file returns (redacted_text, redacted_file_path), mapped in
    # order onto the two output components.
    process_button.click(fn=process_file, inputs=[file_input, entity_selector, redaction_method], outputs=[output_text, download_button])

# Start the Gradio server; this runs whenever the module is executed or imported.
app.launch()