Spaces:

liaad
/

Citilink_Text_Anonymization_Demo

Running

App Files Files Community

Citilink_Text_Anonymization_Demo / src /streamlit_app.py

miguelalmqs

Update src/streamlit_app.py

557b544 verified 6 days ago

raw

history blame contribute delete

26.1 kB

	import streamlit as st
	import torch
	import os
	import re
	import json
	from transformers import AutoTokenizer, AutoModelForTokenClassification
	from sentence_transformers import SentenceTransformer, util
	from collections import defaultdict
	import os

	# Hugging Face Hub identifiers. MODEL_PATH is loaded as both the tokenizer and
	# the token-classification (NER) model; MODEL_REL_PATH is loaded as the
	# SentenceTransformer used to link mentions of the same entity.
	MODEL_PATH = "liaad/Citilink-XLMR-Anonymization-pt"
	MODEL_REL_PATH = "liaad/Citilink-mpnet-Entity-Linker-pt"

	# Page-level Streamlit configuration.
	# The title previously started with an orphaned U+FE0F variation selector and a
	# space (left over from a removed emoji), which ended up in the browser tab
	# title. The shield is already provided through page_icon.
	st.set_page_config(
	    page_title="Text Anonymization Demo",
	    page_icon="🛡️",
	    layout="wide",
	    initial_sidebar_state="expanded",
	)

	# Global stylesheet injected into the page as a raw <style> element.
	# Changes relative to the previous version of this block:
	#   * the second dark-mode @media block is now closed right after the
	#     .result-window rule; before, it stayed open and swallowed .browser-window,
	#     .browser-header and .dot until a stray closing brace further down;
	#   * the second, byte-identical copies of the sidebar resizer / fixed-width
	#     rules were removed (the first copies at the top of the sheet remain).
	# CSS comments are part of the emitted string and are kept verbatim.
	st.markdown("""
	<style>

	/* 1. Elimina a capacidade de arrastar/redimensionar a barra */
	[data-testid="stSidebarResizer"] {
	display: none !important;
	}

	/* 2. Garante que a barra tem uma largura fixa apenas enquanto aberta */
	/* Assim ela não 'dança' e o fecho continua a ser total */
	[data-testid="stSidebar"][aria-expanded="true"] {
	min-width: 320px;
	max-width: 320px;
	}

	/* 3. Ajusta o conteúdo principal para colar à esquerda quando fechada */
	[data-testid="stMain"] {
	margin-left: 0px;
	}
	.main-header {
	font-size: 2.5rem;
	font-weight: bold;
	color: #e63946;
	text-align: center;
	margin-bottom: 1rem;
	}
	.anon-box {
	padding: 1.5rem;
	margin: 0.5rem 0;
	border-radius: 0.5rem;
	border-left: 5px solid #1d3557;
	background-color: #f8f9fa;
	font-family: 'Courier New', Courier, monospace;
	line-height: 1.6;
	color: #1e1e1e;
	}
	@media (prefers-color-scheme: dark) {
	.anon-box { background-color: #1e212b; color: #e0e0e0; border-left: 5px solid #a8dadc; }
	}
	.entity-tag {
	background-color: #e9ecef;
	padding: 2px 6px;
	border-radius: 4px;
	font-weight: bold;
	color: #1d3557;
	}
	.metric-box {
	background-color: #f1faee;
	padding: 1rem;
	border-radius: 0.5rem;
	text-align: center;
	border: 1px solid #a8dadc;
	}

	.small-metric-container {
	display: flex;
	justify-content: space-between;
	gap: 10px;
	margin-top: 10px; /* Espaço logo abaixo do botão */
	}
	.small-metric-box {
	flex: 1;
	background-color: #f8f9fa;
	border-radius: 6px;
	padding: 5px;
	text-align: center;
	border: 1px solid #dee2e6;
	}
	.metric-label {
	font-size: 0.65rem;
	color: #6c757d;
	text-transform: uppercase;
	font-weight: bold;
	}
	.metric-value {
	font-size: 1rem;
	color: #1d3557;
	font-weight: bold;
	}

	.result-window {
	height: 450px;
	overflow-y: auto;
	padding: 1rem;
	border-radius: 8px;
	border: 1px solid #dee2e6;
	background-color: #ffffff;
	font-family: 'Courier New', Courier, monospace;
	font-size: 0.85rem;
	line-height: 1.5;
	}
	@media (prefers-color-scheme: dark) {
	.result-window { background-color: #1e212b; color: #e0e0e0; border: 1px solid #444; }
	}

	.browser-window {
	height: 500px;
	overflow-y: auto;
	padding: 15px;
	border-radius: 0px 0px 8px 8px; /* Arredondado apenas em baixo */
	border: 1px solid #d1d5db;
	background-color: #ffffff;
	font-family: 'Consolas', 'Monaco', monospace;
	font-size: 0.9rem;
	user-select: text; /* Garante que o utilizador pode selecionar o texto */
	}
	.browser-header {
	background-color: #f1f5f9;
	padding: 5px 15px;
	border: 1px solid #d1d5db;
	border-bottom: none;
	border-radius: 8px 8px 0px 0px;
	font-size: 0.75rem;
	font-weight: bold;
	color: #475569;
	display: flex;
	align-items: center;
	gap: 8px;
	}
	.dot { height: 10px; width: 10px; border-radius: 50%; display: inline-block; }
	/* Estilo para a área de conteúdo das Tabs */
	/* Janela de Texto com fundo #262730 */
	.tab-window {
	/* Reduzimos para alinhar com a caixa de estatísticas da esquerda */
	height: 495px;

	overflow-y: auto;
	padding: 20px;
	border: 1px solid #444;
	border-radius: 0px 0px 8px 8px;
	background-color: #262730 !important;
	font-family: 'Consolas', 'Monaco', monospace;
	font-size: 0.95rem;
	line-height: 1.6;
	user-select: text;
	color: #efefef !important;
	margin-bottom: 20px;
	}

	/* Ajuste do sombreado das TAGS para o modo escuro */
	.entity-highlight {
	background-color: #3d3f4b; /* Cinza um pouco mais claro que o fundo */
	color: #a8dadc; /* Texto da tag num azul ciano suave */
	padding: 2px 6px;
	border-radius: 4px;
	border: 1px solid #555;
	display: inline-block;
	line-height: 1.2;
	font-weight: bold;
	}

	/* Estilização das Tabs (Abas) */
	.stTabs [data-baseweb="tab-list"] button [data-testid="stMarkdownContainer"] p {
	color: #9ca3af !important; /* Um cinza claro para as abas "apagadas" */
	font-size: 1rem;
	font-weight: bold;
	transition: color 0.3s ease;
	}

	/* 2. COR DAS ABAS QUANDO ESTÃO SELECIONADAS (ATIVAS) */
	/* Aqui mudamos para o Azul que pediste anteriormente */
	.stTabs [aria-selected="true"] [data-testid="stMarkdownContainer"] p {
	color: #a8dadc !important; /* Um azul ciano/claro para brilhar no dark mode */
	}

	/* 3. A LINHA (BARRA) QUE FICA POR BAIXO DA ABA SELECIONADA */
	.stTabs [data-baseweb="tab-highlight"] {
	background-color: #a8dadc !important; /* Cor da linha que corre por baixo */
	}

	/* 4. EFEITO AO PASSAR O RATO (HOVER) */
	.stTabs [data-baseweb="tab"]:hover [data-testid="stMarkdownContainer"] p {
	color: #ffffff !important; /* Fica branco ao passar o rato */
	}

	/* 1. Remove o espaço em branco excessivo no topo sem quebrar o botão da barra lateral */
	.block-container {
	padding-top: 1.5rem !important;
	padding-bottom: 0rem !important;
	}

	/* 2. Esconde o fundo e a decoração do header, mas MANTÉM o botão de abrir/fechar */
	header[data-testid="stHeader"] {
	background: transparent !important;
	color: transparent !important;
	}

	/* 3. Garante que o botão da barra lateral (Chevron) é visível mesmo com header transparente */
	[data-testid="collapsedControl"] {
	color: #bdbbbb !important; /* Mesma cor do seu título */
	visibility: visible !important;
	display: flex !important;
	}

	/* Ajusta a margem do título principal */
	.main-title {
	margin-top: -45px !important;
	position: relative;
	}

	.streamlit-expanderHeader {
	background-color: #1e212b !important;
	border: 1px solid #444 !important;
	border-radius: 8px !important;
	}
	html, body {
	overflow: hidden !important;
	height: 100%;
	}

	/* 2. Garante que o container principal do Streamlit retém o scroll */
	/* Isso permite que a barra de scroll que vês seja a da App e não a da Web */
	[data-testid="stMainViewContainer"] {
	overflow-y: auto !important;
	}

	/* Opcional: Se quiseres que a barra do Streamlit seja mais discreta/fina */
	[data-testid="stMainViewContainer"]::-webkit-scrollbar {
	width: 8px;
	}
	[data-testid="stMainViewContainer"]::-webkit-scrollbar-thumb {
	background: #444;
	border-radius: 10px;
	}
	</style>
	""", unsafe_allow_html=True)


	@st.cache_resource
	def _load_models_cached():
	    """Load the tokenizer, the NER model and the entity-linking model.

	    Failures propagate as exceptions on purpose: only return values are stored
	    by the resource cache, so a failed attempt is retried on the next call
	    instead of being remembered.

	    Returns:
	        A ``(tokenizer, model_ner, rel_model)`` tuple.
	    """
	    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, add_prefix_space=True)
	    model_ner = AutoModelForTokenClassification.from_pretrained(MODEL_PATH)
	    rel_model = SentenceTransformer(MODEL_REL_PATH)
	    return tokenizer, model_ner, rel_model


	def load_models():
	    """Return the models used by the app together with an error message.

	    Returns:
	        ``(tokenizer, model_ner, rel_model, None)`` on success, or
	        ``(None, None, None, message)`` when loading raised. ``message`` is
	        never empty, so callers can test it for truthiness.
	    """
	    try:
	        tokenizer, model_ner, rel_model = _load_models_cached()
	    except Exception as exc:  # UI boundary: report the failure instead of crashing
	        # Some exceptions stringify to "", which a caller's `if error:` check
	        # would read as success; fall back to repr() in that case.
	        return None, None, None, str(exc) or repr(exc)
	    return tokenizer, model_ner, rel_model, None


	def _trim_trailing_punctuation(text, entity):
	    """Move ``entity["end"]`` left past trailing punctuation of the span, in place."""
	    # Position/department spans keep a trailing period (every other label
	    # has it stripped); the other punctuation marks are always removed.
	    if entity["label"] == "PERSONAL-PositionDepartment":
	        strip_chars = ",;:!?"
	    else:
	        strip_chars = ".,;:!?"
	    while entity["end"] > entity["start"] and text[entity["end"] - 1] in strip_chars:
	        entity["end"] -= 1


	def _merge_entities(raw_entities):
	    """Fuse adjacent same-label spans and resolve overlaps between spans.

	    Spans are visited in order of their start offset. A span that starts at
	    most one character after the previous kept span and has the same label is
	    folded into it. Otherwise, if it overlaps a kept span, the longer of the
	    two survives; if it overlaps nothing, it is kept.
	    """
	    merged = []
	    for current in sorted(raw_entities, key=lambda e: e["start"]):
	        if not merged:
	            merged.append(current)
	            continue

	        last = merged[-1]
	        if current["start"] - last["end"] <= 1 and current["label"] == last["label"]:
	            # max(): a span nested inside `last` (the same entity seen again in
	            # an overlapping window) must not pull the end offset backwards.
	            last["end"] = max(last["end"], current["end"])
	            last["tokens"].extend(current["tokens"])
	            continue

	        keep = True
	        for i, chosen in enumerate(merged):
	            overlap = min(current["end"], chosen["end"]) - max(current["start"], chosen["start"])
	            if overlap > 0:
	                if (current["end"] - current["start"]) > (chosen["end"] - chosen["start"]):
	                    merged[i] = current
	                keep = False
	                break
	        if keep:
	            merged.append(current)
	    return merged


	def _render_anonymized_html(text, entities):
	    """Build the masked HTML string and the per-entity report.

	    The text is assembled left to right so that everything outside the entity
	    spans can be HTML-escaped, and so that overlapping spans can never leave
	    part of an entity unmasked.
	    """
	    import html  # function-scope import: the module header is left untouched

	    pieces = []
	    report = []
	    cursor = 0
	    for ent in sorted(entities, key=lambda e: e["start"]):
	        tag = ent["label"].replace("PERSONAL-", "")
	        entity_id = ent.get("entity_id")
	        id_part = f"-{entity_id}" if entity_id is not None else ""

	        report.append({
	            "category": ent["label"],
	            "text": text[ent["start"]:ent["end"]].strip(),
	            "start": ent["start"],
	            "end": ent["end"],
	            "id": entity_id,
	        })

	        start = max(ent["start"], cursor)
	        pieces.append(html.escape(text[cursor:start], quote=False))
	        # The angle brackets are written as entities; raw "<Name-1>" would be
	        # parsed by the browser as an unknown element and display nothing.
	        pieces.append(
	            f' <span class="entity-highlight"><b>&lt;{html.escape(tag, quote=False)}{id_part}&gt;</b></span> '
	        )
	        cursor = max(cursor, ent["end"])
	    pieces.append(html.escape(text[cursor:], quote=False))

	    return re.sub(r' +', ' ', "".join(pieces)).strip(), report


	def process_anonymization(text, threshold, tokenizer, model_ner, rel_model):
	    """Detect entities in ``text`` and replace them with tagged placeholders.

	    Args:
	        text: Document to anonymize.
	        threshold: Cosine-similarity value a mention must exceed to reuse the
	            ID of a previously seen mention with the same label.
	        tokenizer: Callable tokenizer returning ``input_ids`` and
	            ``offset_mapping`` (and overflow windows) for ``text``.
	        model_ner: Token-classification model exposing ``config.id2label`` and
	            returning an object with ``.logits``.
	        rel_model: Encoder exposing ``encode(str, convert_to_tensor=True)``.

	    Returns:
	        ``(html_text, report)``. ``html_text`` is an HTML fragment in which each
	        entity is replaced by a ``<span class="entity-highlight">`` placeholder
	        and all remaining text is HTML-escaped. ``report`` is a list of dicts
	        with keys ``category``, ``text``, ``start``, ``end`` and ``id``, ordered
	        by start offset. For blank input the message
	        ``"Por favor, insira um texto."`` and an empty list are returned.
	    """
	    if not text.strip():
	        return "Por favor, insira um texto.", []

	    id2label = model_ner.config.id2label
	    inputs = tokenizer(
	        text,
	        truncation=True,
	        max_length=512,
	        stride=164,
	        return_overflowing_tokens=True,
	        return_offsets_mapping=True,
	        padding=True,
	        return_tensors="pt",
	    )
	    offset_mapping_all = inputs.pop("offset_mapping")
	    input_ids_all = inputs["input_ids"]
	    # Overflow windows are padded to a common length; without the mask the
	    # model would attend to the padding positions of the shorter windows.
	    attention_mask = inputs.get("attention_mask")

	    with torch.no_grad():
	        logits = model_ner(input_ids=input_ids_all, attention_mask=attention_mask).logits
	    all_predictions = torch.argmax(logits, dim=2).tolist()

	    space_prefixes = (" ", "▁", "Ġ")
	    raw_entities = []

	    def close_entity(entity):
	        # Store a finished span, dropping spans that are empty after trimming.
	        if entity["label"]:
	            _trim_trailing_punctuation(text, entity)
	            if entity["end"] > entity["start"]:
	                raw_entities.append(entity)

	    for window_idx, (predictions, offsets) in enumerate(zip(all_predictions, offset_mapping_all)):
	        window_tokens = tokenizer.convert_ids_to_tokens(input_ids_all[window_idx].tolist())
	        current = {"tokens": [], "start": None, "end": None, "label": None}

	        for idx, (pred_id, offset) in enumerate(zip(predictions, offsets)):
	            start_char, end_char = int(offset[0]), int(offset[1])
	            if start_char == end_char:
	                # Zero-width offsets (special and padding tokens) carry no text.
	                continue

	            label_name = id2label[pred_id]
	            tag_base = label_name.split('-', 1)[1] if '-' in label_name else None
	            token = window_tokens[idx]
	            starts_new_word = token.startswith(space_prefixes)

	            if label_name.startswith("B-"):
	                close_entity(current)
	                current = {"start": start_char, "end": end_char, "label": tag_base, "tokens": [token]}
	            elif label_name.startswith("I-") and current["label"] == tag_base:
	                current["tokens"].append(token)
	                current["end"] = end_char
	            elif not starts_new_word and current["label"] is not None:
	                # A sub-word piece continuing the current word stays in the span
	                # regardless of its own predicted label.
	                current["tokens"].append(token)
	                current["end"] = end_char
	            else:
	                close_entity(current)
	                current = {"tokens": [], "start": None, "end": None, "label": None}

	        # The trailing special token is skipped above, so a span that runs to
	        # the end of the window has to be flushed here or it would be lost.
	        close_entity(current)

	    final_entities = _merge_entities(raw_entities)

	    # Labels whose mentions are compared by embedding similarity.
	    similarity_labels = {"Name", "Address", "Company", "Vehicle", "PositionDepartment"}
	    # Labels that receive a numeric ID at all.
	    id_labels = {"Name", "AdministrativeInformation", "PositionDepartment", "Address",
	                 "PersonalDocument", "Company", "LicensePlate", "Vehicle"}

	    known_entities = {}
	    known_embeddings = {}
	    id_counters = defaultdict(int)

	    final_entities.sort(key=lambda e: e["start"])
	    for ent in final_entities:
	        tag = ent["label"].replace("PERSONAL-", "")
	        text_key = text[ent["start"]:ent["end"]].strip().lower()
	        assigned_id = None

	        if tag in id_labels:
	            if (tag, text_key) in known_entities:
	                assigned_id = known_entities[(tag, text_key)]
	            elif tag in similarity_labels:
	                emb_current = rel_model.encode(text_key, convert_to_tensor=True)
	                best_score, best_id = 0.0, None
	                candidates = [(tk, tid) for (lbl, tk), tid in known_entities.items() if lbl == tag]

	                for prev_key, prev_id in candidates:
	                    emb_prev = known_embeddings.get(prev_key)
	                    if emb_prev is None:
	                        emb_prev = rel_model.encode(prev_key, convert_to_tensor=True)
	                        known_embeddings[prev_key] = emb_prev

	                    score = util.cos_sim(emb_current, emb_prev).item()
	                    if score > best_score:
	                        best_score, best_id = score, prev_id

	                if best_score > threshold:
	                    assigned_id = best_id
	                known_embeddings[text_key] = emb_current

	            if assigned_id is None:
	                id_counters[tag] += 1
	                assigned_id = id_counters[tag]
	            # NOTE(review): the source this was reconstructed from had lost its
	            # indentation; the mention is assumed to be recorded whether it got
	            # a new ID or was linked to an existing one — confirm.
	            known_entities[(tag, text_key)] = assigned_id

	        ent["entity_id"] = assigned_id

	    return _render_anonymized_html(text, final_entities)


	@st.cache_data
	def load_example_texts():
	    """Return the example documents offered in the sidebar.

	    Reads ``example_text.json`` from the directory of this file. When the file
	    cannot be read or decoded, or does not contain a non-empty JSON object, a
	    built-in mapping of example names to empty strings is returned instead.

	    Returns:
	        A non-empty dict; its keys are used as the example names.
	    """
	    fallback = {"Custom Text": "", "1º Portuguese Meeting Minute": "", "2º Portuguese Meeting Minute": "", "3º Portuguese Meeting Minute": ""}
	    json_path = os.path.join(os.path.dirname(__file__), 'example_text.json')
	    try:
	        with open(json_path, 'r', encoding='utf-8') as f:
	            examples = json.load(f)
	    except (OSError, ValueError):
	        # OSError: missing/unreadable file. ValueError: covers both
	        # json.JSONDecodeError and UnicodeDecodeError.
	        return fallback
	    # The caller lists the keys and indexes by the selected key, so anything
	    # other than a non-empty dict would crash it.
	    if not isinstance(examples, dict) or not examples:
	        return fallback
	    return examples

	def main():
	    """Render the demo page.

	    Draws the title, the configuration sidebar, the "How it works" panel, the
	    input column (text area, button, counters) and the results column
	    (anonymized text, extracted entities, downloadable JSON report). Stops the
	    script with an error message when the models cannot be loaded.

	    NOTE(review): the nesting of the ``with`` blocks below was reconstructed
	    from a copy of the file that had lost its indentation — confirm against
	    the deployed layout.
	    """
	    import html  # function-scope import: the module header is left untouched

	    st.markdown('<p style="font-size: 60px; font-weight: bold; color: #bdbbbb; text-align: center; margin-bottom: 10px;">🛡️ PID: Text Anonymization Demo</p>', unsafe_allow_html=True)
	    st.markdown("""
	    <p style="text-align: center; color: #666;">
	    Automatic text anonymization for city council minutes and administrative documents
	    </p>
	    """, unsafe_allow_html=True)

	    tokenizer, model_ner, rel_model, error = load_models()

	    if error:
	        st.error(f"Erro ao carregar modelos: {error}")
	        st.stop()

	    # ---- Sidebar: example picker, linking threshold, info and links ----
	    st.sidebar.header("⚙️ Configuration")

	    st.sidebar.write("---")

	    example_texts = load_example_texts()

	    selected_example = st.sidebar.selectbox(
	        "Choose an example:",
	        options=list(example_texts.keys()),
	        index=0
	    )

	    st.sidebar.markdown("<br><br>", unsafe_allow_html=True)

	    threshold = st.sidebar.slider(
	        "Entity Linking Threshold",
	        min_value=0.0, max_value=1.0, value=0.80, step=0.05,
	        help="Higher threshold = more strict. Use a higher value to ensure only very similar entities get the same ID, preventing different people from being grouped together."
	    )

	    st.sidebar.markdown("---")

	    st.sidebar.markdown("### 📊 About")

	    st.sidebar.info("""
	    - Anonymization (NER) uses Token Classification to identify and mask sensitive information (PID) in administrative documents.
	    - Model: XLM-RoBERTa fine-tuned for Named Entity Recognition.
	    - Languages: Portuguese (pt-pt).
	    - Method: Sequence Labeling with Bi-Encoder Entity Linking.
	    """)

	    st.sidebar.markdown("")
	    st.sidebar.markdown("### 🔗 Resources")
	    st.sidebar.markdown("""
	    - [📖 Model Card](https://huggingface.co/liaad/Citilink-XLMR-Anonymization-pt) (Anonymization)
	    - [📖 Model Card](https://huggingface.co/liaad/Citilink-mpnet-Entity-Linker-pt) (Entity Linking)
	    - [💾 GitHub Repository](https://github.com/)
	    """)

	    st.write("")

	    # ---- Explanation panel with a static before/after example ----
	    with st.expander("🎯 How it works", expanded=False):
	        st.markdown("""
	        The anonymization process is powered by two specialized AI models working in sequence:

	        1. The Detector (NER): A model designed to extract personal information entities across multiple categories, identifying sensitive data within the document's context.
	        2. The Linker (Entity Linking): Understands when different words refer to the same entity. For example, it knows that "João Silva" and "Sr. Silva" are the same person, assigning them a consistent ID (e.g., `<Name-1>`).

	        """)

	        # Blue box listing the supported entities and which of them get an ID.
	        st.markdown("""
	        <div style="background-color: #262730; padding: 20px; border-radius: 10px; border-left: 5px solid #3b82f6; margin-bottom: 25px;">
	        <p style="color: #3b82f6; font-weight: bold; margin-top: 0; margin-bottom: 5px;">Supported Entities:</p>
	        <p style="font-size: 0.75em; color: #94a3b8; margin-bottom: 15px;">Note: Entities marked with <span style="color: #60a5fa; font-weight: bold;">(ID)</span> support consistent linking across the document.</p>
	        <div style="display: grid; grid-template-columns: repeat(4, 1fr); gap: 10px; font-size: 0.85em; color: white;">
	        <div>
	        • Name <b style="color: #60a5fa;">(ID)</b><br>• Admin. Document <b style="color: #60a5fa;">(ID)</b><br>• Position/Department <b style="color: #60a5fa;">(ID)</b><br>• Address <b style="color: #60a5fa;">(ID)</b><br>• Date
	        </div>
	        <div>
	        • Location<br>• Personal Document <b style="color: #60a5fa;">(ID)</b><br>• Company <b style="color: #60a5fa;">(ID)</b><br>• Artistic Activity
	        </div>
	        <div>
	        • Degree<br>• Time<br>• License <b style="color: #60a5fa;">(ID)</b><br>• Job
	        </div>
	        <div>
	        • Vehicle <b style="color: #60a5fa;">(ID)</b><br>• Faculty<br>• Family Relationship<br>• Other
	        </div>
	        </div>
	        </div>
	        """, unsafe_allow_html=True)

	        st.markdown("INPUT:")
	        st.markdown('''
	        <div class="tab-window" style="height: auto; padding: 15px; border-top: 4px solid #666; margin-bottom: 10px;">
	        O interessado Dr. João Silva submeteu o processo administrativo 5597/2023 no dia 20/05/2023, relativo ao imóvel localizado na Rua das Flores n.º 10 conforme o solicitado.
	        </div>
	        ''', unsafe_allow_html=True)

	        st.markdown("OUTPUT:")
	        # Placeholders use &lt; / &gt;: written as raw "<Name-1>" the browser
	        # parses them as unknown elements and the tags are not displayed.
	        st.markdown('''
	        <div class="tab-window" style="height: auto; padding: 15px; border-top: 4px solid #a8dadc;">
	        O interessado <span class="entity-highlight"><b>&lt;PositionDepartment-1&gt;</b></span> <span class="entity-highlight"><b>&lt;Name-1&gt;</b></span>
	        submeteu o processo administrativo <span class="entity-highlight"><b>&lt;AdministrativeInformation-1&gt;</b></span>
	        no dia <span class="entity-highlight"><b>&lt;Date&gt;</b></span>,
	        relativo ao imóvel localizado na <span class="entity-highlight"><b>&lt;Address&gt;</b></span> conforme o solicitado.
	        </div>
	        ''', unsafe_allow_html=True)

	        st.write("")

	    col1, col2 = st.columns([1, 1])

	    # ---- Left column: input and summary counters ----
	    with col1:
	        st.subheader("📝 Input Document")

	        input_text = st.text_area(
	            "Enter your text here:",
	            value=example_texts[selected_example],
	            height=400,
	            # A per-example key makes the widget reset when the example changes.
	            key=f"input_area_{selected_example}",
	            placeholder="Paste your document text here..."
	        )

	        st.markdown("""
	        <style>
	        /* Remove o puxador de redimensionamento de todas as text areas */
	        div[data-testid="stTextArea"] textarea {
	        resize: none;
	        }
	        </style>
	        """, unsafe_allow_html=True)

	        process_btn = st.button("🔍 Anonymize", type="primary", use_container_width=True)

	        if process_btn and input_text:
	            with st.spinner("Processing..."):
	                texto_final, relatorio = process_anonymization(input_text, threshold, tokenizer, model_ner, rel_model)

	            total_entidades = len(relatorio)
	            tipos_unicos = len({ent["category"] for ent in relatorio})

	            st.markdown(f"""
	            <div class="small-metric-container">
	            <div class="small-metric-box">
	            <div class="metric-label">Entities</div>
	            <div class="metric-value">{total_entidades}</div>
	            </div>
	            <div class="small-metric-box">
	            <div class="metric-label">Categories</div>
	            <div class="metric-value">{tipos_unicos}</div>
	            </div>
	            </div>
	            """, unsafe_allow_html=True)

	    # ---- Right column: results ----
	    with col2:
	        st.subheader("🔒 Anonymization Results")

	        if process_btn and input_text:
	            tab_text, tab_entities = st.tabs(["📄 Anonymized Text", "🔍 Extracted Entities"])

	            with tab_text:
	                st.markdown(f'''
	                <div class="tab-window" style="border-top: 4px solid #2162bf;">
	                {texto_final}
	                </div>
	                ''', unsafe_allow_html=True)

	            with tab_entities:
	                # Group the extracted texts by category, in first-seen order.
	                agrupado = {}
	                for item in relatorio:
	                    cat_limpa = item["category"].replace("PERSONAL-", "")
	                    agrupado.setdefault(cat_limpa, []).append(item["text"])

	                # Entity texts come straight from the user's document and are
	                # rendered with unsafe_allow_html, so they must be escaped.
	                partes = []
	                for cat, lista in agrupado.items():
	                    partes.append("<div style='margin-bottom:20px;'>")
	                    partes.append(f"<b style='color:#a8dadc; font-size:1.1rem;'>{html.escape(cat)}</b> ")
	                    partes.append(f"<span style='color:#666;'>({len(lista)})</span><br>")
	                    partes.extend(
	                        f"<div style='color:#ffffff; margin-left:15px; margin-top:3px;'>- {html.escape(texto)}</div>"
	                        for texto in lista
	                    )
	                    partes.append("</div>")
	                html_content = "".join(partes)

	                st.markdown(f'''
	                <div class="tab-window" style="border-top: 4px solid #2162bf;">
	                {html_content if html_content else "No entities found."}
	                </div>
	                ''', unsafe_allow_html=True)

	            with st.expander("📋 Full Entity Report (JSON)"):
	                download_data = {
	                    "full_text": input_text,
	                    "personal_info": relatorio
	                }

	                json_string = json.dumps(download_data, indent=4, ensure_ascii=False)

	                st.download_button(
	                    label="📥 Download JSON Report",
	                    data=json_string,
	                    file_name="anonymization_report.json",
	                    mime="application/json",
	                    use_container_width=True
	                )

	                st.json(relatorio)
	        else:
	            st.markdown('''
	            <div style="margin-top: 30px;">
	            </div>
	            ''', unsafe_allow_html=True)
	            st.info("Please process a document on the left to view the results.")

	    st.markdown("---")


	if __name__ == "__main__":
	    main()