Spaces:

MosaHosseini
/

Swedish_Text_Anonymizer

Build error

App Files Files Community

Swedish_Text_Anonymizer / app.py

MosaHosseini

Upload 2 files

7f196ee verified 9 months ago

raw

history blame contribute delete

2.12 kB

	import fitz # PyMuPDF
	import gradio as gr
	from anonymize import SwedishTextMasker

	# Shared masker instance: built a single time at module import and reused
	# by extract_text_from_pdf() on every call.
	# NOTE(review): threshold=0.5 is handed to SwedishTextMasker as-is; its
	# meaning is defined in anonymize.py (not visible here) — confirm.
	text_anonymizer = SwedishTextMasker(threshold=0.5)

	def join_short_lines(text, min_length=30):
	    """Merge short lines of *text* into the line that follows them.

	    Every line is stripped of surrounding whitespace first. A non-empty
	    line counts as "short" when it has fewer than ``min_length`` characters
	    and does not end with one of ``. : ; ? !``. Consecutive short lines are
	    collected and joined with single spaces. The collected run is emitted:

	    * together with the next non-short line (which is appended to the run),
	    * on its own when a blank line is reached, or
	    * on its own at the end of the input.

	    Blank (or whitespace-only) lines are kept as empty lines in the output.

	    Args:
	        text: The text to re-flow; it is split on newline characters.
	        min_length: Lines with fewer characters than this (after
	            stripping) are candidates for merging.

	    Returns:
	        The re-flowed text, with output lines joined by newline characters.
	    """
	    # A line ending in one of these is treated as complete even if short.
	    terminators = ('.', ':', ';', '?', '!')

	    merged = []
	    pending = []  # stripped short lines waiting to be joined to what follows

	    for line in text.split('\n'):
	        stripped = line.strip()

	        if not stripped:
	            # A blank line ends the current run and is preserved as-is.
	            if pending:
	                merged.append(" ".join(pending))
	                pending = []
	            merged.append("")
	            continue

	        pending.append(stripped)
	        is_short = (
	            len(stripped) < min_length
	            and not stripped.endswith(terminators)
	        )
	        if not is_short:
	            # A full-length or terminated line closes the run it belongs to.
	            merged.append(" ".join(pending))
	            pending = []

	    # Emit short lines left over at the end of the input.
	    if pending:
	        merged.append(" ".join(pending))

	    return "\n".join(merged)


	def extract_text_from_pdf(pdf_file):
	    """Extract the text of an uploaded PDF and return it anonymized.

	    Args:
	        pdf_file: The value received from the upload widget: ``None`` when
	            nothing was uploaded, otherwise either a file path string or an
	            object whose ``name`` attribute holds the file path.

	    Returns:
	        ``"No file uploaded."`` when ``pdf_file`` is ``None``; otherwise the
	        result of ``text_anonymizer.mask_all()`` applied to the stripped,
	        concatenated text of all pages.
	    """
	    if pdf_file is None:
	        return "No file uploaded."

	    # Accept a plain path string as well as an object exposing the path
	    # through ``.name``, so the handler works with either upload value.
	    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name

	    with fitz.open(pdf_path) as doc:
	        # flags=1 is passed through unchanged.
	        # NOTE(review): presumably fitz.TEXT_PRESERVE_LIGATURES only, i.e.
	        # no image blocks in the output — confirm against the PyMuPDF docs.
	        page_texts = [page.get_text(flags=1) for page in doc]

	    raw_text = "".join(page_texts).strip()

	    # TODO(review): a call ``raw_text = join_short_lines(raw_text)`` was
	    # present here but commented out; enabling it changes the output, so it
	    # stays disabled until that is decided.

	    # The raw text is deliberately not printed or logged: at this point it
	    # still contains the data the anonymizer is meant to mask.
	    return text_anonymizer.mask_all(raw_text)

	# Gradio UI: a heading, one row holding the PDF upload and the read-only
	# result box, and a button that runs extract_text_from_pdf on the upload.
	with gr.Blocks(title="PDF -> Anonymized Text") as demo:
	    gr.Markdown("### 📄 PDF Anonymizer (text only, skips images)")

	    with gr.Row():
	        pdf_input = gr.File(
	            label="Upload a PDF",
	            file_types=[".pdf"],
	        )
	        text_output = gr.Textbox(
	            label="Anonymized Output",
	            lines=20,
	            interactive=False,
	        )

	    extract_button = gr.Button("Anonymize Text")
	    # Clicking passes the uploaded file to the handler and shows the
	    # handler's return value in the output textbox.
	    extract_button.click(
	        fn=extract_text_from_pdf,
	        inputs=pdf_input,
	        outputs=text_output,
	    )

	# Launch the app only when this file is executed as a script.
	if __name__ == "__main__":
	    demo.launch()