dhammo2 committed on
Commit
2301910
·
verified ·
1 Parent(s): 92f82b0

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +69 -23
app.py CHANGED
@@ -25,8 +25,9 @@ PII_ENTITIES = [
25
  ]
26
 
27
  REDACTION_METHODS = {
 
28
  "Redact": OperatorConfig("replace", {"new_value": "[REDACTED]"}),
29
- "Remove": OperatorConfig("remove"),
30
  "Mask": OperatorConfig("mask", {"chars_to_mask": 4, "masking_char": "*", "from_end": False}),
31
  "Hash": "hash",
32
  "Encrypt": "encrypt",
@@ -39,7 +40,8 @@ def encrypt_pii(text):
39
  return fernet.encrypt(text.encode("utf-8")).decode("utf-8")
40
 
41
  def redact_text(text, selected_entities, redaction_method):
42
- selected_entities = selected_entities or None
 
43
  results = analyzer.analyze(text=text, entities=selected_entities, language="en")
44
 
45
  if redaction_method == "Hash":
@@ -48,6 +50,9 @@ def redact_text(text, selected_entities, redaction_method):
48
  elif redaction_method == "Encrypt":
49
  for result in results:
50
  text = text.replace(result.text, encrypt_pii(result.text))
 
 
 
51
  else:
52
  operators = {entity.entity_type: REDACTION_METHODS[redaction_method] for entity in results}
53
  text = anonymizer.anonymize(text=text, analyzer_results=results, operators=operators).text
@@ -84,28 +89,36 @@ def read_document(file):
84
  return process_txt(file)
85
 
86
  def save_redacted_file(original_file, redacted_text):
 
 
87
  ext = original_file.name.split(".")[-1].lower()
88
- temp_dir = tempfile.gettempdir()
 
 
89
  safe_filename = f"redacted_{os.path.basename(original_file.name)}"
90
  redacted_file_path = os.path.join(temp_dir, safe_filename)
91
-
92
- if ext == "docx":
93
- doc = Document(original_file.name)
94
- for para in doc.paragraphs:
95
- para.text = redact_text(para.text, PII_ENTITIES, "Redact")
96
- doc.save(redacted_file_path)
 
 
 
 
97
  elif ext == "pptx":
98
- ppt = Presentation(original_file.name)
99
- for slide in ppt.slides:
100
- for shape in slide.shapes:
101
- if hasattr(shape, "text"):
102
- shape.text = redact_text(shape.text, PII_ENTITIES, "Redact")
103
  ppt.save(redacted_file_path)
104
- else:
105
  with open(redacted_file_path, "w", encoding="utf-8") as f:
106
  f.write(redacted_text)
107
-
108
- return redacted_file_path
 
109
 
110
  def process_file(file, selected_entities, redaction_method):
111
  text = read_document(file)
@@ -122,7 +135,7 @@ def deselect_all_entities():
122
  custom_css = """
123
  <style>
124
  #redact_button {
125
- background-color: #E691FF !important;
126
  color: #4B23C0;
127
  }
128
  </style>
@@ -130,30 +143,63 @@ custom_css = """
130
 
131
  # Gradio UI
132
  with gr.Blocks() as app:
 
133
  gr.Markdown(
134
  """
135
- <div style="background-color: #4B23C0; color: white; padding: 20px; text-align: left; font-size: 24px; font-weight: bold; margin: 0; border-radius: 4px;">
 
 
 
 
 
 
 
 
 
136
  🔒 PII Remover &nbsp;-&nbsp; Secure Document Redaction Tool
137
  </div>
138
  """,
139
  sanitize_html=False
140
  )
 
 
 
 
 
 
 
 
 
141
  gr.Markdown("Upload a **TXT, DOCX, PPTX, or PDF** file to remove **Personal Identifiable Information (PII)** while keeping formatting.")
 
 
142
  gr.HTML(custom_css)
143
 
144
  with gr.Row():
145
  file_input = gr.File(label="Upload Document (PDF, DOCX, PPTX, TXT)")
146
- entity_selector = gr.CheckboxGroup(PII_ENTITIES, label="Select PII Entities to Redact")
 
 
147
  with gr.Row():
148
  select_all_button = gr.Button("Select All")
149
  deselect_all_button = gr.Button("Deselect All")
150
- redaction_method = gr.Radio(["Redact", "Remove", "Mask", "Hash", "Encrypt"], label="Redaction Method", value="Redact")
 
 
 
 
 
 
151
  process_button = gr.Button("Redact Document", elem_id="redact_button")
 
152
  output_text = gr.Textbox(label="Redacted Text", lines=10)
153
  download_button = gr.File(label="Download Redacted File")
154
-
 
155
  select_all_button.click(fn=select_all_entities, outputs=entity_selector)
156
  deselect_all_button.click(fn=deselect_all_entities, outputs=entity_selector)
157
- process_button.click(fn=process_file, inputs=[file_input, entity_selector, redaction_method], outputs=[output_text, download_button])
158
 
 
 
159
  app.launch()
 
 
25
  ]
26
 
27
  REDACTION_METHODS = {
28
+ "Remove": OperatorConfig("redact"),
29
  "Redact": OperatorConfig("replace", {"new_value": "[REDACTED]"}),
30
+ "Replace": OperatorConfig("replace", {"new_value": ""}),
31
  "Mask": OperatorConfig("mask", {"chars_to_mask": 4, "masking_char": "*", "from_end": False}),
32
  "Hash": "hash",
33
  "Encrypt": "encrypt",
 
40
  return fernet.encrypt(text.encode("utf-8")).decode("utf-8")
41
 
42
  def redact_text(text, selected_entities, redaction_method):
43
+ """Identifies and redacts selected PII types based on the chosen method."""
44
+ selected_entities = selected_entities or None # If empty, redact all entities
45
  results = analyzer.analyze(text=text, entities=selected_entities, language="en")
46
 
47
  if redaction_method == "Hash":
 
50
  elif redaction_method == "Encrypt":
51
  for result in results:
52
  text = text.replace(result.text, encrypt_pii(result.text))
53
+ elif redaction_method == "Replace":
54
+ operators = {entity.entity_type: OperatorConfig("replace") for entity in results}
55
+ text = anonymizer.anonymize(text=text, analyzer_results=results, operators=operators).text
56
  else:
57
  operators = {entity.entity_type: REDACTION_METHODS[redaction_method] for entity in results}
58
  text = anonymizer.anonymize(text=text, analyzer_results=results, operators=operators).text
 
89
  return process_txt(file)
90
 
91
  def save_redacted_file(original_file, redacted_text):
92
+ """Saves redacted text back in the original format and returns a valid file path."""
93
+
94
  ext = original_file.name.split(".")[-1].lower()
95
+
96
+ # Create a temporary file with a proper path
97
+ temp_dir = tempfile.gettempdir() # OS-specific temp folder
98
  safe_filename = f"redacted_{os.path.basename(original_file.name)}"
99
  redacted_file_path = os.path.join(temp_dir, safe_filename)
100
+
101
+ if ext == "pdf":
102
+ # Assuming a simple text-based redacted PDF (can be further improved for real PDF processing)
103
+ with open(redacted_file_path, "w", encoding="utf-8") as f:
104
+ f.write(redacted_text) # Saving as text-based PDF
105
+ elif ext == "docx":
106
+ doc = Document()
107
+ for line in redacted_text.split("\n"):
108
+ doc.add_paragraph(line)
109
+ doc.save(redacted_file_path) # Save DOCX in temp folder
110
  elif ext == "pptx":
111
+ ppt = Presentation()
112
+ slide = ppt.slides.add_slide(ppt.slide_layouts[5]) # Blank slide
113
+ textbox = slide.shapes.add_textbox(0, 0, ppt.slide_width, ppt.slide_height)
114
+ textbox.text = redacted_text
 
115
  ppt.save(redacted_file_path)
116
+ else: # TXT
117
  with open(redacted_file_path, "w", encoding="utf-8") as f:
118
  f.write(redacted_text)
119
+
120
+ return redacted_file_path # Return the safe file path
121
+
122
 
123
  def process_file(file, selected_entities, redaction_method):
124
  text = read_document(file)
 
135
  custom_css = """
136
  <style>
137
  #redact_button {
138
+ /*background-color: #E691FF !important;*/
139
  color: #4B23C0;
140
  }
141
  </style>
 
143
 
144
  # Gradio UI
145
  with gr.Blocks() as app:
146
+
147
  gr.Markdown(
148
  """
149
+ <div style="
150
+ background-color: #4B23C0;
151
+ color: white;
152
+ padding: 20px;
153
+ text-align: left;
154
+ font-size: 24px;
155
+ font-weight: bold;
156
+ margin: 0;
157
+ border-radius: 4px; /* Rounded edges */
158
+ ">
159
  🔒 PII Remover &nbsp;-&nbsp; Secure Document Redaction Tool
160
  </div>
161
  """,
162
  sanitize_html=False
163
  )
164
+
165
+ gr.Markdown(
166
+ "<div style='text-align: center; font-size: 24px; font-weight: bold; color: red;'>"
167
+ "⚠️ THIS IS A DEMONSTRATION. DO NOT UPLOAD SENSITIVE DOCUMENTS. ⚠️"
168
+ "</div>",
169
+ sanitize_html=False
170
+ )
171
+
172
+
173
  gr.Markdown("Upload a **TXT, DOCX, PPTX, or PDF** file to remove **Personal Identifiable Information (PII)** while keeping formatting.")
174
+
175
+ # Load CSS
176
  gr.HTML(custom_css)
177
 
178
  with gr.Row():
179
  file_input = gr.File(label="Upload Document (PDF, DOCX, PPTX, TXT)")
180
+
181
+ entity_selector = gr.CheckboxGroup(PII_ENTITIES, label="Select PII Entities to Redact (Leave blank to redact all)")
182
+
183
  with gr.Row():
184
  select_all_button = gr.Button("Select All")
185
  deselect_all_button = gr.Button("Deselect All")
186
+
187
+ redaction_method = gr.Radio(
188
+ ["Remove", "Redact", "Replace", "Mask", "Hash", "Encrypt"],
189
+ label="Redaction Method",
190
+ value="Redact"
191
+ )
192
+
193
  process_button = gr.Button("Redact Document", elem_id="redact_button")
194
+
195
  output_text = gr.Textbox(label="Redacted Text", lines=10)
196
  download_button = gr.File(label="Download Redacted File")
197
+
198
+ # Button Actions
199
  select_all_button.click(fn=select_all_entities, outputs=entity_selector)
200
  deselect_all_button.click(fn=deselect_all_entities, outputs=entity_selector)
 
201
 
202
+ process_button.click(fn=process_file, inputs=[file_input, entity_selector, redaction_method], outputs=[output_text, download_button])
203
+
204
  app.launch()
205
+