# Uploaded by MosaHosseini ("Upload 2 files", commit 7f196ee, verified)
import fitz # PyMuPDF
import gradio as gr
from anonymize import SwedishTextMasker
# Module-level masker: constructed a single time at import and shared by
# every call to extract_text_from_pdf.
# NOTE(review): threshold presumably is the masker's detection cut-off —
# confirm against SwedishTextMasker in anonymize.py.
text_anonymizer = SwedishTextMasker(threshold=0.5)
def join_short_lines(text, min_length=30):
    """
    Re-flow text by gluing short line fragments onto the line that follows.

    Each line is stripped of surrounding whitespace. A line counts as a
    fragment when it is shorter than ``min_length`` characters and does not
    end in one of ``. : ; ? !``. Consecutive fragments are collected and
    joined with single spaces to the next non-fragment line. A blank line
    ends any pending run and is itself kept as an empty line in the output.

    Args:
        text: The input text, with lines separated by ``\\n``.
        min_length: Lines at least this long are never treated as fragments.

    Returns:
        The re-flowed text, lines joined with ``\\n``.
    """
    sentence_enders = ('.', ':', ';', '?', '!')
    merged = []
    pending = []  # stripped fragments waiting for the line that completes them

    def flush():
        # Emit the collected fragments (if any) as one space-joined line.
        if pending:
            merged.append(" ".join(pending))
            pending.clear()

    for raw in text.split('\n'):
        content = raw.strip()
        if content:
            pending.append(content)
            is_fragment = (
                len(content) < min_length
                and not content.endswith(sentence_enders)
            )
            if not is_fragment:
                # A full line (or one ending a sentence) closes the run.
                flush()
        else:
            # Blank line: close any open run, then preserve the empty line.
            flush()
            merged.append("")

    # Trailing fragments with nothing after them still need to be emitted.
    flush()
    return "\n".join(merged)
def extract_text_from_pdf(pdf_file):
    """Extract the text of an uploaded PDF and return it anonymized.

    Args:
        pdf_file: The upload handed over by the Gradio file component:
            either an object exposing the file path as ``.name`` or a plain
            path string. ``None`` means nothing was uploaded.

    Returns:
        The result of ``text_anonymizer.mask_all`` applied to the stripped,
        concatenated page text, or the message ``"No file uploaded."`` when
        ``pdf_file`` is ``None``.
    """
    if pdf_file is None:
        return "No file uploaded."

    # Accept a bare path string as well as an object carrying the path in
    # ``.name``, so the handler works with either form of upload value.
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name

    # Open via file path; the context manager closes the document afterwards.
    with fitz.open(pdf_path) as doc:
        # NOTE(review): flags=1 appears to be PyMuPDF's
        # TEXT_PRESERVE_LIGATURES bit alone (whitespace/image bits off) —
        # confirm against the PyMuPDF text-extraction flags table.
        page_texts = [page.get_text(flags=1) for page in doc]
    raw_text = "".join(page_texts).strip()

    # join_short_lines(raw_text) could be applied here to re-flow
    # hard-wrapped lines before masking; it is currently not applied.

    # The raw text is intentionally never printed or logged: it is the
    # un-anonymized document and must not leak into stdout/server logs.
    return text_anonymizer.mask_all(raw_text)
# Gradio interface: upload a PDF, press the button, read the masked text.
with gr.Blocks(title="PDF -> Anonymized Text") as demo:
    gr.Markdown("### 📄 PDF Anonymizer (text only, skips images)")
    with gr.Row():
        pdf_input = gr.File(label="Upload a PDF", file_types=[".pdf"])
        text_output = gr.Textbox(label="Anonymized Output", lines=20, interactive=False)
    extract_button = gr.Button("Anonymize Text")
    # Clicking the button sends the uploaded file through
    # extract_text_from_pdf and shows the result in the output box.
    extract_button.click(fn=extract_text_from_pdf, inputs=pdf_input, outputs=text_output)

# Start the web app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()