dhammo2 committed on
Commit
8cb0bcb
·
verified ·
1 Parent(s): d93b7b1

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +159 -0
app.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pdfplumber
3
+ import io
4
+ from docx import Document
5
+ from pptx import Presentation
6
+ from presidio_analyzer import AnalyzerEngine
7
+ from presidio_anonymizer import AnonymizerEngine, OperatorConfig
8
+ from cryptography.fernet import Fernet
9
+ import hashlib
10
+ import tempfile
11
+ import os
12
+
13
# Presidio engines, created once at import time and reused by redact_text():
# the analyzer locates PII spans and the anonymizer rewrites them.
analyzer, anonymizer = AnalyzerEngine(), AnonymizerEngine()
16
+
17
# Encryption key used by the "Encrypt" redaction method.
#
# A key that is generated at start-up and never stored is lost when the process
# exits, which leaves every value encrypted with it permanently undecryptable.
# To keep ciphertext recoverable, provide a persistent url-safe base64 Fernet
# key through the PII_ENCRYPTION_KEY environment variable (for example as a
# deployment secret).  A fresh random key is generated only as a fallback when
# the variable is unset or blank.
_configured_key = os.environ.get("PII_ENCRYPTION_KEY", "").strip()
encryption_key = _configured_key.encode("ascii") if _configured_key else Fernet.generate_key()
fernet = Fernet(encryption_key)
20
+
21
# Presidio entity names offered in the UI (global recognizers plus UK ones).
PII_ENTITIES = [
    "CREDIT_CARD",
    "CRYPTO",
    "DATE_TIME",
    "EMAIL_ADDRESS",
    "IBAN_CODE",
    "IP_ADDRESS",
    "NRP",
    "LOCATION",
    "PERSON",
    "PHONE_NUMBER",
    "MEDICAL_LICENSE",
    "URL",
    "UK_NHS",
    "UK_NINO",
]
26
+
27
# Maps each redaction choice shown in the UI to how it is applied.  The first
# three values are Presidio operator configurations; "Hash" and "Encrypt" are
# plain string markers because redact_text() handles those two methods itself.
REDACTION_METHODS = dict(
    Redact=OperatorConfig("replace", {"new_value": "[REDACTED]"}),
    Remove=OperatorConfig("remove"),
    Mask=OperatorConfig("mask", {"chars_to_mask": 4, "masking_char": "*", "from_end": False}),
    Hash="hash",
    Encrypt="encrypt",
)
34
+
35
def hash_pii(text):
    """Return the SHA-256 digest of ``text`` (UTF-8 encoded) as a hex string."""
    digest = hashlib.sha256()
    digest.update(text.encode("utf-8"))
    return digest.hexdigest()
37
+
38
def encrypt_pii(text):
    """Encrypt ``text`` with the module-level ``fernet`` and return the token as str."""
    token = fernet.encrypt(text.encode("utf-8"))
    return token.decode("utf-8")
40
+
41
def redact_text(text, selected_entities, redaction_method):
    """Detect PII in ``text`` and rewrite every detected span.

    Args:
        text: Plain text to scan.
        selected_entities: Presidio entity names to look for.  An empty or
            ``None`` selection is handed to the analyzer as ``None``, which
            makes it look for every entity it supports.
        redaction_method: One of the ``REDACTION_METHODS`` keys.

    Returns:
        The text with each detected span rewritten by the chosen method.
        Empty input is returned unchanged without running the analyzer.

    Raises:
        KeyError: If ``redaction_method`` is not a key of ``REDACTION_METHODS``.
    """
    if not text:
        return text

    findings = analyzer.analyze(text=text, entities=selected_entities or None, language="en")

    # Analyzer results expose start/end offsets, not the matched text, so the
    # hash/encrypt helpers are applied through the anonymizer's "custom"
    # operator.  That passes each matched span to the callable and substitutes
    # only that span, instead of a global str.replace() that could also rewrite
    # unrelated occurrences or earlier replacements.
    if redaction_method == "Hash":
        operator = OperatorConfig("custom", {"lambda": hash_pii})
    elif redaction_method == "Encrypt":
        operator = OperatorConfig("custom", {"lambda": encrypt_pii})
    else:
        operator = REDACTION_METHODS[redaction_method]

    # "DEFAULT" applies the same operator to every detected entity type.
    anonymized = anonymizer.anonymize(
        text=text,
        analyzer_results=findings,
        operators={"DEFAULT": operator},
    )
    return anonymized.text
56
+
57
+ # Document Processing Functions
58
+
59
def process_pdf(file):
    """Extract the text of every page of the PDF at ``file.name``, joined by newlines."""
    page_texts = []
    with pdfplumber.open(file.name) as pdf:
        for page in pdf.pages:
            # A page whose extracted text is falsy contributes an empty string.
            page_texts.append(page.extract_text() or "")
    return "\n".join(page_texts)
63
+
64
def process_docx(file):
    """Return the text of each entry in ``doc.paragraphs`` of a DOCX file, one per line."""
    document = Document(file.name)
    return "\n".join(paragraph.text for paragraph in document.paragraphs)
67
+
68
def process_pptx(file):
    """Return the text of every shape exposing ``text`` in a PPTX file, one per line."""
    deck = Presentation(file.name)
    shape_texts = []
    for slide in deck.slides:
        for shape in slide.shapes:
            # Shapes without a ``text`` attribute are skipped.
            if hasattr(shape, "text"):
                shape_texts.append(shape.text)
    return "\n".join(shape_texts)
71
+
72
def process_txt(file):
    """Return the UTF-8 decoded contents of a plain-text upload.

    Args:
        file: A filesystem path (``str`` / ``os.PathLike``), an object whose
            ``name`` attribute is such a path (as the other ``process_*``
            readers expect), or an anonymous binary stream with ``read()``.

    Returns:
        The decoded text.

    Raises:
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    if isinstance(file, (str, os.PathLike)):
        path = file
    else:
        path = getattr(file, "name", None)

    if path is None:
        # Anonymous stream: nothing to reopen, so read it directly.
        return file.read().decode("utf-8")

    # Read by path, like the sibling readers: the upload may be a path-like
    # string with no read() method, and reopening always starts at offset 0.
    with open(path, "rb") as handle:
        return handle.read().decode("utf-8")
74
+
75
def read_document(file):
    """Extract plain text from an upload, choosing the reader by file extension.

    The extension is whatever follows the last "." of ``file.name``, compared
    case-insensitively; anything other than pdf/docx/pptx is read as text.
    """
    suffix = file.name.rpartition(".")[2].lower()
    readers = {
        "pdf": process_pdf,
        "docx": process_docx,
        "pptx": process_pptx,
    }
    reader = readers.get(suffix, process_txt)
    return reader(file)
85
+
86
def save_redacted_file(original_file, redacted_text, selected_entities=None, redaction_method="Redact"):
    """Write the redacted document to a fresh temporary directory and return its path.

    DOCX and PPTX uploads are re-opened and redacted paragraph by paragraph /
    shape by shape, then saved in their own format.  Every other upload is
    written as UTF-8 plain text from ``redacted_text``.

    Args:
        original_file: Uploaded file object; only its ``name`` (a path) is used.
        redacted_text: Already-redacted plain text, used for non-DOCX/PPTX input.
        selected_entities: Entity names applied to DOCX/PPTX content.  ``None``
            (the default) keeps the previous behaviour of using ``PII_ENTITIES``.
        redaction_method: ``REDACTION_METHODS`` key applied to DOCX/PPTX
            content; defaults to ``"Redact"`` as before.

    Returns:
        Path of the written file, named ``redacted_<original name>``.  For a
        PDF upload the suffix is changed to ``.txt``.
    """
    ext = original_file.name.split(".")[-1].lower()
    base_name = os.path.basename(original_file.name)
    if ext == "pdf":
        # Only extracted text is written for PDFs, so a ".pdf" name would
        # produce a file that PDF viewers cannot open.
        base_name = os.path.splitext(base_name)[0] + ".txt"

    # A private directory per call: writing to a predictable path in the shared
    # temp dir lets concurrent uploads with the same filename overwrite (and
    # thereby serve) each other's documents.
    output_dir = tempfile.mkdtemp(prefix="pii_redacted_")
    redacted_file_path = os.path.join(output_dir, f"redacted_{base_name}")

    if ext == "docx":
        entities = PII_ENTITIES if selected_entities is None else selected_entities
        doc = Document(original_file.name)
        for para in doc.paragraphs:
            cleaned = redact_text(para.text, entities, redaction_method)
            # Assign only when something changed, so paragraphs without PII
            # are left untouched instead of being rewritten.
            if cleaned != para.text:
                para.text = cleaned
        doc.save(redacted_file_path)
    elif ext == "pptx":
        entities = PII_ENTITIES if selected_entities is None else selected_entities
        ppt = Presentation(original_file.name)
        for slide in ppt.slides:
            for shape in slide.shapes:
                if hasattr(shape, "text"):
                    cleaned = redact_text(shape.text, entities, redaction_method)
                    if cleaned != shape.text:
                        shape.text = cleaned
        ppt.save(redacted_file_path)
    else:
        with open(redacted_file_path, "w", encoding="utf-8") as f:
            f.write(redacted_text)

    return redacted_file_path


def process_file(file, selected_entities, redaction_method):
    """Redact an uploaded document and return ``(redacted_text, output_path)``.

    Args:
        file: Uploaded file object from the UI, or ``None`` if nothing was uploaded.
        selected_entities: Entity names chosen in the UI.
        redaction_method: Redaction method chosen in the UI.

    Returns:
        A tuple of the redacted plain text and the path of the downloadable
        file.  With no upload, a prompt message and ``None`` are returned.
    """
    if file is None:
        # The button can be clicked before any file is uploaded.
        return "Please upload a document first.", None

    text = read_document(file)
    redacted_text = redact_text(text, selected_entities, redaction_method)
    # Pass the user's choices on so the downloaded DOCX/PPTX matches the
    # preview instead of always using the defaults.
    redacted_file_path = save_redacted_file(file, redacted_text, selected_entities, redaction_method)
    return redacted_text, redacted_file_path
115
+
116
def select_all_entities():
    """Return the full ``PII_ENTITIES`` list (value for the entity checkbox group)."""
    every_entity = PII_ENTITIES
    return every_entity
118
+
119
def deselect_all_entities():
    """Return a new empty list (clears the entity checkbox group)."""
    return list()
121
+
122
# Styling for the element with id "redact_button"; injected into the page
# through gr.HTML(custom_css).
custom_css = """
<style>
#redact_button {
background-color: #E691FF !important;
color: #4B23C0;
}
</style>
"""
130
+
131
# Gradio UI
# NOTE(review): the original indentation was not preserved in this view; the
# Row nesting below is reconstructed from the component order - confirm.
with gr.Blocks() as app:
    # Banner.
    gr.Markdown(
        """
        <div style="background-color: #4B23C0; color: white; padding: 20px; text-align: left; font-size: 24px; font-weight: bold; margin: 0; border-radius: 4px;">
        🔒 PII Remover &nbsp;-&nbsp; Secure Document Redaction Tool
        </div>
        """,
        sanitize_html=False
    )
    gr.Markdown("Upload a **TXT, DOCX, PPTX, or PDF** file to remove **Personal Identifiable Information (PII)** while keeping formatting.")
    gr.HTML(custom_css)

    # Inputs: the document and the entity types to look for.
    with gr.Row():
        file_input = gr.File(label="Upload Document (PDF, DOCX, PPTX, TXT)")
        entity_selector = gr.CheckboxGroup(PII_ENTITIES, label="Select PII Entities to Redact")
    with gr.Row():
        select_all_button = gr.Button("Select All")
        deselect_all_button = gr.Button("Deselect All")
    redaction_method = gr.Radio(["Redact", "Remove", "Mask", "Hash", "Encrypt"], label="Redaction Method", value="Redact")
    process_button = gr.Button("Redact Document", elem_id="redact_button")

    # Outputs: a text preview and the downloadable redacted file.
    output_text = gr.Textbox(label="Redacted Text", lines=10)
    download_button = gr.File(label="Download Redacted File")

    # Event wiring.
    select_all_button.click(fn=select_all_entities, outputs=entity_selector)
    deselect_all_button.click(fn=deselect_all_entities, outputs=entity_selector)
    process_button.click(fn=process_file, inputs=[file_input, entity_selector, redaction_method], outputs=[output_text, download_button])

# Start the server only when this file is run as a script, so importing the
# module (for tests or reuse of the helpers) does not launch a web server.
if __name__ == "__main__":
    app.launch()