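"""Streamlit app that detects and de-identifies PHI in uploaded PDFs using Microsoft Presidio."""
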
import logging
import base64
import datetime
import dotenv
import pandas as pd
import streamlit as st
from streamlit_tags import st_tags
from PyPDF2 import PdfReader, PdfWriter
from presidio_helpers import (
    analyzer_engine,
    get_supported_entities,
    analyze,
    anonymize,
)
st.set_page_config(
    page_title="Presidio PHI De-identification",
    layout="wide",
    initial_sidebar_state="expanded",
    menu_items={"About": "https://microsoft.github.io/presidio/"},
)

dotenv.load_dotenv()
logger = logging.getLogger("presidio-streamlit")
# Sidebar
st.sidebar.header("PHI De-identification with Presidio")
model_help_text = "Select Named Entity Recognition (NER) model for PHI detection."
model_list = [
    ("spaCy/en_core_web_lg", "https://huggingface.co/spacy/en_core_web_lg"),
    ("HuggingFace/obi/deid_roberta_i2b2", "https://huggingface.co/obi/deid_roberta_i2b2"),
    ("flair/ner-english-large", "https://huggingface.co/flair/ner-english-large"),
    ("HuggingFace/StanfordAIMI/stanford-deidentifier-base", "https://huggingface.co/StanfordAIMI/stanford-deidentifier-base"),
]
st_model = st.sidebar.selectbox(
    "NER model package",
    [model[0] for model in model_list],
    index=1,
    help=model_help_text,
)

# Display HuggingFace link for selected model
selected_model_url = next(url for model, url in model_list if model == st_model)
st.sidebar.markdown(f"[View model on HuggingFace]({selected_model_url})")
# Split the selection into package and model path,
# e.g. "HuggingFace/obi/deid_roberta_i2b2" -> ("HuggingFace", "obi/deid_roberta_i2b2")
st_model_package = st_model.split("/")[0]
if st_model_package.lower() in ("spacy", "huggingface"):
    st_model = "/".join(st_model.split("/")[1:])
analyzer_params = (st_model_package, st_model)
st.sidebar.warning("Note: Models might take some time to download on first run.")
st_operator = st.sidebar.selectbox(
    "De-identification approach",
    ["replace", "redact", "mask"],
    index=0,
    help="Select PHI manipulation method.",
)
st_threshold = st.sidebar.slider(
    label="Acceptance threshold",
    min_value=0.0,
    max_value=1.0,
    value=0.35,
)
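# Detections scoring below the threshold are dropped by the analyzer,
# so raising it favors precision over recall.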
st_return_decision_process = st.sidebar.checkbox(
    "Add analysis explanations",
    value=False,
)

# Allow and deny lists
with st.sidebar.expander("Allowlists and denylists", expanded=False):
    st_allow_list = st_tags(label="Add words to allowlist", text="Enter word and press enter.")
    st_deny_list = st_tags(label="Add words to denylist", text="Enter word and press enter.")
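# Allowlisted terms are never reported as PHI; denylisted terms are always
# flagged (Presidio treats the deny list as an ad-hoc pattern recognizer).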
# Main panel
col1, col2 = st.columns(2)
with col1:
    st.subheader("Input")
    uploaded_file = st.file_uploader("Upload PDF", type=["pdf"])
    if uploaded_file:
        try:
            # Read PDF; extract_text() may yield nothing for image-only pages
            pdf_reader = PdfReader(uploaded_file)
            text = ""
            for page in pdf_reader.pages:
                text += (page.extract_text() or "") + "\n"
            # Initialize analyzer
            try:
                analyzer = analyzer_engine(*analyzer_params)
            except Exception as e:
                st.error(f"Failed to load model: {str(e)}")
                st.info("Ensure models are downloaded (e.g., 'python -m spacy download en_core_web_lg') and check network/permissions.")
                raise
            # Analyze
            st_analyze_results = analyze(
                analyzer=analyzer,
                text=text,
                entities=get_supported_entities(*analyzer_params),
                language="en",
                score_threshold=st_threshold,
                return_decision_process=st_return_decision_process,
                allow_list=st_allow_list,
                deny_list=st_deny_list,
            )
            # Process results
            phi_types = set(res.entity_type for res in st_analyze_results)
            if phi_types:
                st.success(f"Detected PHI types: {', '.join(phi_types)}")
            else:
                st.info("No PHI detected")
            # Anonymize
            anonymized_result = anonymize(
                text=text,
                operator=st_operator,
                analyze_results=st_analyze_results,
            )
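            # Show the de-identified text. Assumes presidio_helpers' anonymize()
            # returns Presidio's anonymizer result, which exposes the transformed
            # string as `.text`.
            st.text_area("De-identified text", value=anonymized_result.text, height=200)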
            # Create new PDF
            pdf_writer = PdfWriter()
            for page in pdf_reader.pages:
                pdf_writer.add_page(page)
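            # Note: PyPDF2 copies pages verbatim and cannot rewrite rendered
            # page content, so the downloaded PDF still contains the original
            # text; the de-identified text is shown separately above.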
            # Generate output filename with timestamp
            timestamp = datetime.datetime.now().strftime("%I%M%p_%d-%m-%y")
            output_filename = f"{timestamp}_{uploaded_file.name}"

            # Save modified PDF
            try:
                with open(output_filename, "wb") as f:
                    pdf_writer.write(f)
            except PermissionError as e:
                st.error(f"Permission denied when saving PDF: {str(e)}")
                st.info("Check write permissions in the current directory.")
                raise
            # Generate base64 download link
            try:
                with open(output_filename, "rb") as f:
                    pdf_bytes = f.read()
                b64 = base64.b64encode(pdf_bytes).decode()
                href = f'<a href="data:application/pdf;base64,{b64}" download="{output_filename}">Download de-identified PDF</a>'
                st.markdown(href, unsafe_allow_html=True)
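                # Embedding the PDF as a base64 data URI keeps the download
                # self-contained in the page, at the cost of roughly 33%
                # size overhead versus the raw bytes.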
            except Exception as e:
                st.error(f"Error generating download link: {str(e)}")
                raise
            # Display findings
            with col2:
                st.subheader("Findings")
                if st_analyze_results:
                    df = pd.DataFrame.from_records([r.to_dict() for r in st_analyze_results])
                    df["text"] = [text[res.start:res.end] for res in st_analyze_results]
                    df_subset = df[["entity_type", "text", "start", "end", "score"]].rename(
                        {
                            "entity_type": "Entity type",
                            "text": "Text",
                            "start": "Start",
                            "end": "End",
                            "score": "Confidence",
                        },
                        axis=1,
                    )
                    if st_return_decision_process:
                        analysis_explanation_df = pd.DataFrame.from_records(
                            [r.analysis_explanation.to_dict() for r in st_analyze_results]
                        )
                        df_subset = pd.concat([df_subset, analysis_explanation_df], axis=1)
                    st.dataframe(df_subset.reset_index(drop=True), use_container_width=True)
                else:
                    st.text("No findings")
        except Exception as e:
            st.error(f"An error occurred: {str(e)}")
            logger.error(f"Processing error: {str(e)}")