dhammo2 committed on
Commit
af9438a
·
verified ·
1 Parent(s): 2301910

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -25
app.py CHANGED
@@ -88,43 +88,45 @@ def read_document(file):
88
  else:
89
  return process_txt(file)
90
 
91
- def save_redacted_file(original_file, redacted_text):
92
- """Saves redacted text back in the original format and returns a valid file path."""
93
-
94
  ext = original_file.name.split(".")[-1].lower()
95
-
96
- # Create a temporary file with a proper path
97
- temp_dir = tempfile.gettempdir() # OS-specific temp folder
98
  safe_filename = f"redacted_{os.path.basename(original_file.name)}"
99
  redacted_file_path = os.path.join(temp_dir, safe_filename)
100
-
101
- if ext == "pdf":
102
- # Assuming a simple text-based redacted PDF (can be further improved for real PDF processing)
103
- with open(redacted_file_path, "w", encoding="utf-8") as f:
104
- f.write(redacted_text) # Saving as text-based PDF
105
- elif ext == "docx":
106
- doc = Document()
107
- for line in redacted_text.split("\n"):
108
- doc.add_paragraph(line)
109
- doc.save(redacted_file_path) # Save DOCX in temp folder
110
  elif ext == "pptx":
111
- ppt = Presentation()
112
- slide = ppt.slides.add_slide(ppt.slide_layouts[5]) # Blank slide
113
- textbox = slide.shapes.add_textbox(0, 0, ppt.slide_width, ppt.slide_height)
114
- textbox.text = redacted_text
 
 
 
 
 
 
 
115
  ppt.save(redacted_file_path)
116
- else: # TXT
117
  with open(redacted_file_path, "w", encoding="utf-8") as f:
118
  f.write(redacted_text)
119
-
120
- return redacted_file_path # Return the safe file path
121
 
122
 
123
  def process_file(file, selected_entities, redaction_method):
 
124
  text = read_document(file)
125
  redacted_text = redact_text(text, selected_entities, redaction_method)
126
- redacted_file_path = save_redacted_file(file, redacted_text)
127
- return redacted_text, redacted_file_path
 
 
128
 
129
  def select_all_entities():
130
  return PII_ENTITIES
 
88
  else:
89
  return process_txt(file)
90
 
91
+ def save_redacted_file(original_file, redacted_text, selected_entities, redaction_method):
 
 
92
  ext = original_file.name.split(".")[-1].lower()
93
+ temp_dir = tempfile.gettempdir()
 
 
94
  safe_filename = f"redacted_{os.path.basename(original_file.name)}"
95
  redacted_file_path = os.path.join(temp_dir, safe_filename)
96
+
97
+ if ext == "docx":
98
+ doc = Document(original_file.name)
99
+ for para in doc.paragraphs:
100
+ para.text = redact_text(para.text, selected_entities, redaction_method) # Use redaction_method passed from UI
101
+ doc.save(redacted_file_path)
 
 
 
 
102
  elif ext == "pptx":
103
+ ppt = Presentation(original_file.name)
104
+
105
+ # Loop through each slide in the original PPTX and add redacted text
106
+ for slide_num, slide in enumerate(ppt.slides):
107
+ # Loop through all shapes on the slide
108
+ for shape in slide.shapes:
109
+ if hasattr(shape, "text"):
110
+ # Redact the text inside the shape
111
+ redacted_text_in_shape = redact_text(shape.text, selected_entities, redaction_method)
112
+ shape.text = redacted_text_in_shape # Apply the redacted text back to the shape
113
+
114
  ppt.save(redacted_file_path)
115
+ else:
116
  with open(redacted_file_path, "w", encoding="utf-8") as f:
117
  f.write(redacted_text)
118
+
119
+ return redacted_file_path
120
 
121
 
122
  def process_file(file, selected_entities, redaction_method):
123
+ """Handles file upload, redacts selected PII, and returns redacted file."""
124
  text = read_document(file)
125
  redacted_text = redact_text(text, selected_entities, redaction_method)
126
+ redacted_file_path = save_redacted_file(file, redacted_text, selected_entities, redaction_method)
127
+
128
+ return redacted_text, redacted_file_path # Returning only a valid file path
129
+
130
 
131
  def select_all_entities():
132
  return PII_ENTITIES