# Spaces: Build error  (page-status residue captured with this file; not part of the program)
import fitz  # PyMuPDF
import gradio as gr

from anonymize import SwedishTextMasker

# Instantiate once, globally, so every request reuses the same masker
# instead of constructing a new one per call.
# NOTE(review): the meaning of ``threshold`` is defined by SwedishTextMasker
# (not visible here) -- presumably a confidence cut-off; confirm.
text_anonymizer = SwedishTextMasker(threshold=0.5)
def join_short_lines(text: str, min_length: int = 30) -> str:
    """Merge short, unterminated lines with the line(s) that follow them.

    A line counts as a *fragment* when, after stripping surrounding
    whitespace, it is shorter than ``min_length`` and does not end with one
    of ``. : ; ? !``. Consecutive fragments are joined with single spaces
    and emitted together with the first following non-fragment line. A
    blank line flushes any pending fragments and is itself kept as an empty
    line in the output.

    Args:
        text: Input text; lines are separated by ``"\\n"``.
        min_length: Stripped lines shorter than this are merge candidates.

    Returns:
        The re-flowed text, lines joined with ``"\\n"``. Every output line
        has its leading/trailing whitespace removed.
    """
    terminators = ('.', ':', ';', '?', '!')
    merged = []
    pending = []  # stripped pieces of the output line currently being built

    for raw_line in text.split('\n'):
        stripped = raw_line.strip()

        if not stripped:
            # Blank line: emit whatever was being assembled, then keep the
            # blank line itself so paragraph breaks survive.
            if pending:
                merged.append(" ".join(pending))
                pending = []
            merged.append("")
            continue

        pending.append(stripped)
        is_fragment = (
            len(stripped) < min_length and not stripped.endswith(terminators)
        )
        if not is_fragment:
            # A full-length or punctuated line completes the output line.
            merged.append(" ".join(pending))
            pending = []

    # Trailing fragments with no closing line are still emitted.
    if pending:
        merged.append(" ".join(pending))

    return "\n".join(merged)
def extract_text_from_pdf(pdf_file):
    """Extract the text of an uploaded PDF and return it anonymized.

    Args:
        pdf_file: The value delivered by the upload component: ``None`` when
            nothing was uploaded, otherwise either an object whose ``name``
            attribute is the path of the uploaded file or the path itself.

    Returns:
        ``"No file uploaded."`` when ``pdf_file`` is ``None``; otherwise the
        result of ``text_anonymizer.mask_all`` applied to the concatenated,
        stripped text of all pages.
    """
    if pdf_file is None:
        return "No file uploaded."

    # Open via file path. Accept both a file-like wrapper exposing ``.name``
    # and a bare path string.
    # NOTE(review): which of the two arrives depends on the installed Gradio
    # version / the File component's ``type`` setting -- confirm.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    with fitz.open(pdf_path) as doc:
        # NOTE(review): flags=1 appears to correspond to PyMuPDF's
        # TEXT_PRESERVE_LIGATURES only (no image blocks) -- confirm.
        raw_text = "".join(page.get_text(flags=1) for page in doc).strip()

    # The extracted text is deliberately NOT printed or logged: it is the
    # un-anonymized content, and echoing it to stdout would leak the very
    # data this function exists to mask.
    # join_short_lines(raw_text) is currently not applied before masking.
    return text_anonymizer.mask_all(raw_text)
# Gradio interface: one upload widget and one read-only output box side by
# side, with a button that runs extract_text_from_pdf on the uploaded file.
# NOTE(review): the original indentation was lost; the nesting below (button
# outside the Row, inside the Blocks) is the conventional layout -- confirm.
with gr.Blocks(title="PDF -> Anonymized Text") as demo:
    gr.Markdown("### 📄 PDF Anonymizer (text only, skips images)")
    with gr.Row():
        pdf_input = gr.File(label="Upload a PDF", file_types=[".pdf"])
        text_output = gr.Textbox(
            label="Anonymized Output", lines=20, interactive=False
        )
    extract_button = gr.Button("Anonymize Text")
    extract_button.click(
        fn=extract_text_from_pdf, inputs=pdf_input, outputs=text_output
    )

# Only start the server when executed as a script, not when imported.
if __name__ == "__main__":
    demo.launch()