# miguelalmqs's picture
# Update src/streamlit_app.py
# 557b544 verified
import streamlit as st
import torch
import os
import re
import json
from transformers import AutoTokenizer, AutoModelForTokenClassification
from sentence_transformers import SentenceTransformer, util
from collections import defaultdict
import os
# Hugging Face Hub identifiers of the two models used by the demo:
# the token-classification model that detects personal information and the
# sentence-embedding model used to link mentions of the same entity.
MODEL_PATH = "liaad/Citilink-XLMR-Anonymization-pt"
MODEL_REL_PATH = "liaad/Citilink-mpnet-Entity-Linker-pt"

# Page-level Streamlit settings. The title is plain text (no leading invisible
# characters or whitespace) so the browser tab label renders cleanly.
st.set_page_config(
    page_title="Text Anonymization Demo",
    page_icon="🛡️",
    layout="wide",
    initial_sidebar_state="expanded",
)
# Inject the app's custom stylesheet as a raw <style> element
# (unsafe_allow_html=True is required for the tag to be emitted as HTML).
# The rules pin the sidebar width, style the result panels (.tab-window,
# .entity-highlight, .small-metric-*), recolor the tabs and move page
# scrolling onto the Streamlit main container.
# NOTE(review): the second `@media (prefers-color-scheme: dark)` block below is
# only closed after the `.dot` rule, so `.browser-window`, `.browser-header`
# and `.dot` are nested inside it -- confirm whether that is intended.
# NOTE(review): the stSidebarResizer / stSidebar rules appear twice with the
# same declarations.
st.markdown("""
<style>
/* 1. Elimina a capacidade de arrastar/redimensionar a barra */
[data-testid="stSidebarResizer"] {
display: none !important;
}
/* 2. Garante que a barra tem uma largura fixa apenas enquanto aberta */
/* Assim ela não 'dança' e o fecho continua a ser total */
[data-testid="stSidebar"][aria-expanded="true"] {
min-width: 320px;
max-width: 320px;
}
/* 3. Ajusta o conteúdo principal para colar à esquerda quando fechada */
[data-testid="stMain"] {
margin-left: 0px;
}
.main-header {
font-size: 2.5rem;
font-weight: bold;
color: #e63946;
text-align: center;
margin-bottom: 1rem;
}
.anon-box {
padding: 1.5rem;
margin: 0.5rem 0;
border-radius: 0.5rem;
border-left: 5px solid #1d3557;
background-color: #f8f9fa;
font-family: 'Courier New', Courier, monospace;
line-height: 1.6;
color: #1e1e1e;
}
@media (prefers-color-scheme: dark) {
.anon-box { background-color: #1e212b; color: #e0e0e0; border-left: 5px solid #a8dadc; }
}
.entity-tag {
background-color: #e9ecef;
padding: 2px 6px;
border-radius: 4px;
font-weight: bold;
color: #1d3557;
}
.metric-box {
background-color: #f1faee;
padding: 1rem;
border-radius: 0.5rem;
text-align: center;
border: 1px solid #a8dadc;
}
.small-metric-container {
display: flex;
justify-content: space-between;
gap: 10px;
margin-top: 10px; /* Espaço logo abaixo do botão */
}
.small-metric-box {
flex: 1;
background-color: #f8f9fa;
border-radius: 6px;
padding: 5px;
text-align: center;
border: 1px solid #dee2e6;
}
.metric-label {
font-size: 0.65rem;
color: #6c757d;
text-transform: uppercase;
font-weight: bold;
}
.metric-value {
font-size: 1rem;
color: #1d3557;
font-weight: bold;
}
.result-window {
height: 450px;
overflow-y: auto;
padding: 1rem;
border-radius: 8px;
border: 1px solid #dee2e6;
background-color: #ffffff;
font-family: 'Courier New', Courier, monospace;
font-size: 0.85rem;
line-height: 1.5;
}
@media (prefers-color-scheme: dark) {
.result-window { background-color: #1e212b; color: #e0e0e0; border: 1px solid #444; }
.browser-window {
height: 500px;
overflow-y: auto;
padding: 15px;
border-radius: 0px 0px 8px 8px; /* Arredondado apenas em baixo */
border: 1px solid #d1d5db;
background-color: #ffffff;
font-family: 'Consolas', 'Monaco', monospace;
font-size: 0.9rem;
user-select: text; /* Garante que o utilizador pode selecionar o texto */
}
.browser-header {
background-color: #f1f5f9;
padding: 5px 15px;
border: 1px solid #d1d5db;
border-bottom: none;
border-radius: 8px 8px 0px 0px;
font-size: 0.75rem;
font-weight: bold;
color: #475569;
display: flex;
align-items: center;
gap: 8px;
}
.dot { height: 10px; width: 10px; border-radius: 50%; display: inline-block; }
}
/* Estilo para a área de conteúdo das Tabs */
/* Janela de Texto com fundo #262730 */
.tab-window {
/* Reduzimos para alinhar com a caixa de estatísticas da esquerda */
height: 495px;
overflow-y: auto;
padding: 20px;
border: 1px solid #444;
border-radius: 0px 0px 8px 8px;
background-color: #262730 !important;
font-family: 'Consolas', 'Monaco', monospace;
font-size: 0.95rem;
line-height: 1.6;
user-select: text;
color: #efefef !important;
margin-bottom: 20px;
}
/* Ajuste do sombreado das TAGS para o modo escuro */
.entity-highlight {
background-color: #3d3f4b; /* Cinza um pouco mais claro que o fundo */
color: #a8dadc; /* Texto da tag num azul ciano suave */
padding: 2px 6px;
border-radius: 4px;
border: 1px solid #555;
display: inline-block;
line-height: 1.2;
font-weight: bold;
}
/* Estilização das Tabs (Abas) */
.stTabs [data-baseweb="tab-list"] button [data-testid="stMarkdownContainer"] p {
color: #9ca3af !important; /* Um cinza claro para as abas "apagadas" */
font-size: 1rem;
font-weight: bold;
transition: color 0.3s ease;
}
/* 2. COR DAS ABAS QUANDO ESTÃO SELECIONADAS (ATIVAS) */
/* Aqui mudamos para o Azul que pediste anteriormente */
.stTabs [aria-selected="true"] [data-testid="stMarkdownContainer"] p {
color: #a8dadc !important; /* Um azul ciano/claro para brilhar no dark mode */
}
/* 3. A LINHA (BARRA) QUE FICA POR BAIXO DA ABA SELECIONADA */
.stTabs [data-baseweb="tab-highlight"] {
background-color: #a8dadc !important; /* Cor da linha que corre por baixo */
}
/* 4. EFEITO AO PASSAR O RATO (HOVER) */
.stTabs [data-baseweb="tab"]:hover [data-testid="stMarkdownContainer"] p {
color: #ffffff !important; /* Fica branco ao passar o rato */
}
/* 1. Remove o espaço em branco excessivo no topo sem quebrar o botão da barra lateral */
.block-container {
padding-top: 1.5rem !important;
padding-bottom: 0rem !important;
}
/* 2. Esconde o fundo e a decoração do header, mas MANTÉM o botão de abrir/fechar */
header[data-testid="stHeader"] {
background: transparent !important;
color: transparent !important;
}
/* 3. Garante que o botão da barra lateral (Chevron) é visível mesmo com header transparente */
[data-testid="collapsedControl"] {
color: #bdbbbb !important; /* Mesma cor do seu título */
visibility: visible !important;
display: flex !important;
}
/* 4. Elimina a capacidade de arrastar a largura da barra */
[data-testid="stSidebarResizer"] {
display: none !important;
}
/* 5. Largura fixa da barra lateral */
[data-testid="stSidebar"][aria-expanded="true"] {
min-width: 320px;
max-width: 320px;
}
/* Ajusta a margem do título principal */
.main-title {
margin-top: -45px !important;
position: relative;
}
.streamlit-expanderHeader {
background-color: #1e212b !important;
border: 1px solid #444 !important;
border-radius: 8px !important;
}
html, body {
overflow: hidden !important;
height: 100%;
}
/* 2. Garante que o container principal do Streamlit retém o scroll */
/* Isso permite que a barra de scroll que vês seja a da App e não a da Web */
[data-testid="stMainViewContainer"] {
overflow-y: auto !important;
}
/* Opcional: Se quiseres que a barra do Streamlit seja mais discreta/fina */
[data-testid="stMainViewContainer"]::-webkit-scrollbar {
width: 8px;
}
[data-testid="stMainViewContainer"]::-webkit-scrollbar-thumb {
background: #444;
border-radius: 10px;
}
</style>
""", unsafe_allow_html=True)
@st.cache_resource
def _load_models_cached():
    """Load the tokenizer, the NER model and the entity-linking model once.

    Failures propagate as exceptions on purpose: the resource cache stores
    return values only, so a failed attempt is not remembered and the next
    rerun retries the load.
    """
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, add_prefix_space=True)
    model_ner = AutoModelForTokenClassification.from_pretrained(MODEL_PATH)
    rel_model = SentenceTransformer(MODEL_REL_PATH)
    return tokenizer, model_ner, rel_model


def load_models():
    """Return ``(tokenizer, model_ner, rel_model, error)``.

    On success ``error`` is ``None``. If anything goes wrong while loading,
    the three model slots are ``None`` and ``error`` holds the exception
    message, so the caller can show it in the UI.

    This wrapper is deliberately NOT cached: caching it would also cache the
    error tuple, turning a transient failure (e.g. a network hiccup while
    downloading weights) into a permanent one until the cache is cleared.
    """
    try:
        tokenizer, model_ner, rel_model = _load_models_cached()
    except Exception as e:  # UI boundary: the message is surfaced by the caller
        return None, None, None, str(e)
    return tokenizer, model_ner, rel_model, None
def _close_entity(entity, text, sink):
    """Trim trailing punctuation from ``entity``'s span and append it to ``sink``.

    Position/department mentions keep a trailing period (it is not in their
    strip set); every other label also drops it.
    """
    if entity["label"] == "PERSONAL-PositionDepartment":
        trailing = ",;:!?"
    else:
        trailing = ".,;:!?"
    while entity["end"] > entity["start"] and text[entity["end"] - 1] in trailing:
        entity["end"] -= 1
    sink.append(entity)


def process_anonymization(text, threshold, tokenizer, model_ner, rel_model):
    """Detect personal-information entities in ``text`` and mask them.

    Steps:
      1. Tokenize ``text`` into overlapping windows (max 512 tokens, stride
         164) and run the token-classification model on all windows.
      2. Decode B-/I- predictions into character spans per window.
      3. Merge spans across windows (duplicates from the overlap, adjacent
         spans with the same label, overlapping spans).
      4. Assign per-label numeric IDs: exact (case-insensitive) text matches
         reuse an ID; for some labels a cosine similarity above ``threshold``
         between sentence embeddings also reuses the best-matching ID.
      5. Build the masked HTML and a report list.

    Args:
        text: Document to anonymize.
        threshold: Minimum cosine similarity for two mentions to share an ID.
        tokenizer: Callable tokenizer that also provides
            ``convert_ids_to_tokens``.
        model_ner: Token-classification model exposing ``config.id2label``.
        rel_model: Embedding model exposing ``encode``.

    Returns:
        ``(html_text, report)``. ``html_text`` is the document with every
        entity replaced by a ``<span class="entity-highlight">`` placeholder;
        the surrounding text is HTML-escaped because callers render the result
        as raw HTML. ``report`` is a list of dicts with keys ``category``,
        ``text``, ``start``, ``end`` and ``id``, ordered by position. For
        blank input a message and an empty list are returned.
    """
    # Function-scope import keeps this block self-contained.
    import html

    if not text.strip():
        # Same container type as the normal path (a list of entity dicts).
        return "Por favor, insira um texto.", []

    id2label = model_ner.config.id2label
    inputs = tokenizer(
        text,
        truncation=True,
        max_length=512,
        stride=164,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding=True,
        return_tensors="pt",
    )
    # These two entries are tokenizer bookkeeping, not model inputs.
    offsets_all = inputs.pop("offset_mapping").tolist()
    inputs.pop("overflow_to_sample_mapping", None)
    input_ids_all = inputs["input_ids"]

    with torch.no_grad():
        # padding=True pads the shorter windows; pass the attention mask so the
        # pad positions are masked out instead of being treated as real tokens.
        logits = model_ner(
            input_ids=input_ids_all,
            attention_mask=inputs.get("attention_mask"),
        ).logits
    all_predictions = torch.argmax(logits, dim=2).tolist()

    # Markers that sub-word tokenizers put in front of a word-initial token.
    SPACE_PREFIXES = (" ", "▁", "Ġ")

    # ---- 2. Decode per-window predictions into character spans ----
    entidades_brutas = []
    for window_idx, (predictions, offsets) in enumerate(zip(all_predictions, offsets_all)):
        window_tokens = tokenizer.convert_ids_to_tokens(input_ids_all[window_idx].tolist())
        current = None  # entity currently being built, if any
        for idx, (pred_id, (start_char, end_char)) in enumerate(zip(predictions, offsets)):
            if start_char == end_char:
                # Zero-width offsets: special/padding tokens carry no text.
                continue
            label_name = id2label[pred_id]
            tag_base = label_name.split('-', 1)[1] if '-' in label_name else None
            token = window_tokens[idx]
            comeca_nova_palavra = token.startswith(SPACE_PREFIXES)

            if label_name.startswith("B-"):
                if current is not None:
                    _close_entity(current, text, entidades_brutas)
                current = {"start": start_char, "end": end_char, "label": tag_base, "tokens": [token]}
            elif label_name.startswith("I-") and current is not None and current["label"] == tag_base:
                current["tokens"].append(token)
                current["end"] = end_char
            elif not comeca_nova_palavra and current is not None:
                # Word-internal piece: keep the whole word inside the entity
                # even if this piece was labelled differently.
                current["tokens"].append(token)
                current["end"] = end_char
            else:
                if current is not None:
                    _close_entity(current, text, entidades_brutas)
                current = None
        # An entity that runs up to the last real token of the window is not
        # followed by any token that would close it (the trailing special
        # token is skipped above), so flush it explicitly.
        if current is not None:
            _close_entity(current, text, entidades_brutas)

    # ---- 3. Merge spans coming from all windows ----
    entidades_brutas.sort(key=lambda x: x["start"])
    entidades_finais = []
    for atual in entidades_brutas:
        if not entidades_finais:
            entidades_finais.append(atual)
            continue
        ultima = entidades_finais[-1]
        distancia = atual["start"] - ultima["end"]
        if distancia <= 1 and atual["label"] == ultima["label"]:
            # Touching/overlapping span with the same label: extend, never
            # shrink (a nested duplicate may end before ``ultima`` does).
            ultima["end"] = max(ultima["end"], atual["end"])
            ultima["tokens"].extend(atual["tokens"])
            continue
        adicionar = True
        for i, selecionada in enumerate(entidades_finais):
            interseccao = max(
                0,
                min(atual["end"], selecionada["end"]) - max(atual["start"], selecionada["start"]),
            )
            if interseccao > 0:
                # On overlap keep whichever span is longer.
                if (atual["end"] - atual["start"]) > (selecionada["end"] - selecionada["start"]):
                    entidades_finais[i] = atual
                adicionar = False
                break
        if adicionar:
            entidades_finais.append(atual)

    # ---- 4. Entity linking ----
    # Labels whose mentions are compared by embedding similarity ...
    labels_modelo = {"Name", "Address", "Company", "Vehicle", "PositionDepartment"}
    # ... and labels that receive a numeric ID at all.
    labels_com_id = {
        "Name", "AdministrativeInformation", "PositionDepartment", "Address",
        "PersonalDocument", "Company", "LicensePlate", "Vehicle",
    }
    known_entities = {}    # (label, lowercased text) -> id
    known_embeddings = {}  # lowercased text -> embedding
    id_counters = defaultdict(int)
    entidades_finais.sort(key=lambda x: x["start"])
    for ent in entidades_finais:
        tag_limpa = ent["label"].replace("PERSONAL-", "")
        texto_key = text[ent["start"]:ent["end"]].strip().lower()
        assigned_id = None
        if tag_limpa in labels_com_id:
            if (tag_limpa, texto_key) in known_entities:
                assigned_id = known_entities[(tag_limpa, texto_key)]
            elif tag_limpa in labels_modelo:
                emb_atual = rel_model.encode(texto_key, convert_to_tensor=True)
                best_prob, best_match_id = 0.0, None
                candidatos = [(tk, tid) for (lbl, tk), tid in known_entities.items() if lbl == tag_limpa]
                for prev_text_key, prev_id in candidatos:
                    emb_prev = known_embeddings.get(prev_text_key)
                    if emb_prev is None:
                        emb_prev = rel_model.encode(prev_text_key, convert_to_tensor=True)
                        known_embeddings[prev_text_key] = emb_prev
                    score = util.cos_sim(emb_atual, emb_prev).item()
                    if score > best_prob:
                        best_prob, best_match_id = score, prev_id
                if best_prob > threshold:
                    assigned_id = best_match_id
                known_embeddings[texto_key] = emb_atual
            if assigned_id is None:
                id_counters[tag_limpa] += 1
                assigned_id = id_counters[tag_limpa]
            known_entities[(tag_limpa, texto_key)] = assigned_id
        ent["entity_id"] = assigned_id

    # ---- 5. Output construction ----
    # Walk the entities left to right, escaping the untouched text between
    # them: the result is rendered as raw HTML, so user-supplied "<", ">" and
    # "&" must not be interpreted as markup.
    pieces = []
    relatorio_json = []
    cursor = 0
    for ent in entidades_finais:
        tag_limpa = ent["label"].replace("PERSONAL-", "")
        id_part = f"-{ent['entity_id']}" if ent.get('entity_id') else ""
        relatorio_json.append({
            "category": ent["label"],
            "text": text[ent["start"]:ent["end"]].strip(),
            "start": ent["start"],
            "end": ent["end"],
            "id": ent.get("entity_id"),
        })
        placeholder = f' <span class="entity-highlight"><b>&lt;{tag_limpa}{id_part}&gt;</b></span> '
        # max() guards against residual overlaps so no text is emitted twice.
        inicio = max(ent["start"], cursor)
        pieces.append(html.escape(text[cursor:inicio], quote=False))
        pieces.append(placeholder)
        cursor = max(cursor, ent["end"])
    pieces.append(html.escape(text[cursor:], quote=False))

    return re.sub(r' +', ' ', "".join(pieces)).strip(), relatorio_json
@st.cache_data
def load_example_texts():
    """Return the example documents offered in the sidebar.

    Reads ``example_text.json`` from the directory of this file and returns
    its content (expected: a mapping of example title -> text). If the file
    is missing, unreadable, not valid JSON, or does not contain a non-empty
    JSON object, a default mapping with empty texts is returned so the
    example selector still has options to show.
    """
    fallback = {"Custom Text": "", "1º Portuguese Meeting Minute": "", "2º Portuguese Meeting Minute": "", "3º Portuguese Meeting Minute": ""}
    json_path = os.path.join(os.path.dirname(__file__), 'example_text.json')
    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            examples = json.load(f)
    except (OSError, ValueError):
        # OSError: missing/unreadable file. ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError.
        return fallback
    # The caller uses .keys() and key lookups, so anything but a non-empty
    # dict would break the sidebar selector.
    if not isinstance(examples, dict) or not examples:
        return fallback
    return examples
def main():
    """Render the Streamlit page.

    Layout: a title, a sidebar (example picker, entity-linking threshold,
    about/resources text), a collapsed "How it works" expander with a static
    example, and two columns. The left column holds the input text area, the
    "Anonymize" button and two small counters; the right column shows the
    anonymized text, the extracted entities grouped by category, and a JSON
    report with a download button.

    Results are only rendered on the run triggered by the button click.
    """
    # Function-scope import keeps this block self-contained.
    import html

    st.markdown('<p style="font-size: 60px; font-weight: bold; color: #bdbbbb; text-align: center; margin-bottom: 10px;">🛡️ PID: Text Anonymization Demo</p>', unsafe_allow_html=True)
    st.markdown("""
<p style="text-align: center; color: #666;">
Automatic text anonymization for city council minutes and administrative documents
</p>
""", unsafe_allow_html=True)

    tokenizer, model_ner, rel_model, error = load_models()
    if error:
        st.error(f"Erro ao carregar modelos: {error}")
        st.stop()

    # ---- Sidebar ----
    st.sidebar.header("⚙️ Configuration")
    st.sidebar.write("---")
    example_texts = load_example_texts()
    selected_example = st.sidebar.selectbox(
        "Choose an example:",
        options=list(example_texts.keys()),
        index=0
    )
    st.sidebar.markdown("<br><br>", unsafe_allow_html=True)
    threshold = st.sidebar.slider(
        "Entity Linking Threshold",
        min_value=0.0, max_value=1.0, value=0.80, step=0.05,
        help="Higher threshold = more strict. Use a higher value to ensure only very similar entities get the same ID, preventing different people from being grouped together."
    )
    st.sidebar.markdown("---")
    st.sidebar.markdown("### 📊 About")
    st.sidebar.info("""
- **Anonymization (NER)** uses Token Classification to identify and mask sensitive information (PID) in administrative documents.
- **Model**: XLM-RoBERTa fine-tuned for Named Entity Recognition.
- **Languages**: Portuguese (pt-pt).
- **Method**: Sequence Labeling with Bi-Encoder Entity Linking.
""")
    st.sidebar.markdown("")
    st.sidebar.markdown("### 🔗 Resources")
    st.sidebar.markdown("""
- [📖 Model Card](https://huggingface.co/liaad/Citilink-XLMR-Anonymization-pt) (Anonymization)
- [📖 Model Card](https://huggingface.co/liaad/Citilink-mpnet-Entity-Linker-pt) (Entity Linking)
- [💾 GitHub Repository](https://github.com/)
""")

    # ---- "How it works" explainer with a static example ----
    st.write("")
    with st.expander("🎯 How it works", expanded=False):
        st.markdown("""
The anonymization process is powered by two specialized AI models working in sequence:
1. **The Detector (NER):** A model designed to extract personal information entities across multiple categories, identifying sensitive data within the document's context.
2. **The Linker (Entity Linking):** Understands when different words refer to the same entity. For example, it knows that *"João Silva"* and *"Sr. Silva"* are the same person, assigning them a consistent ID (e.g., `<Name-1>`).
""")
        # Blue box listing the supported entities; "(ID)" marks those with ID linking.
        st.markdown("""
<div style="background-color: #262730; padding: 20px; border-radius: 10px; border-left: 5px solid #3b82f6; margin-bottom: 25px;">
<p style="color: #3b82f6; font-weight: bold; margin-top: 0; margin-bottom: 5px;">Supported Entities:</p>
<p style="font-size: 0.75em; color: #94a3b8; margin-bottom: 15px;">Note: Entities marked with <span style="color: #60a5fa; font-weight: bold;">(ID)</span> support consistent linking across the document.</p>
<div style="display: grid; grid-template-columns: repeat(4, 1fr); gap: 10px; font-size: 0.85em; color: white;">
<div>
• Name <b style="color: #60a5fa;">(ID)</b><br>• Admin. Document <b style="color: #60a5fa;">(ID)</b><br>• Position/Department <b style="color: #60a5fa;">(ID)</b><br>• Address <b style="color: #60a5fa;">(ID)</b><br>• Date
</div>
<div>
• Location<br>• Personal Document <b style="color: #60a5fa;">(ID)</b><br>• Company <b style="color: #60a5fa;">(ID)</b><br>• Artistic Activity
</div>
<div>
• Degree<br>• Time<br>• License <b style="color: #60a5fa;">(ID)</b><br>• Job
</div>
<div>
• Vehicle <b style="color: #60a5fa;">(ID)</b><br>• Faculty<br>• Family Relationship<br>• Other
</div>
</div>
</div>
""", unsafe_allow_html=True)
        st.markdown("**INPUT:**")
        st.markdown('''
<div class="tab-window" style="height: auto; padding: 15px; border-top: 4px solid #666; margin-bottom: 10px;">
O interessado Dr. João Silva submeteu o processo administrativo 5597/2023 no dia 20/05/2023, relativo ao imóvel localizado na Rua das Flores n.º 10 conforme o solicitado.
</div>
''', unsafe_allow_html=True)
        st.markdown("**OUTPUT:**")
        st.markdown('''
<div class="tab-window" style="height: auto; padding: 15px; border-top: 4px solid #a8dadc;">
O interessado <span class="entity-highlight"><b>&lt;PositionDepartment-1&gt;</b></span> <span class="entity-highlight"><b>&lt;Name-1&gt;</b></span>
submeteu o processo administrativo <span class="entity-highlight"><b>&lt;AdministrativeInformation-1&gt;</b></span>
no dia <span class="entity-highlight"><b>&lt;Date&gt;</b></span>,
relativo ao imóvel localizado na <span class="entity-highlight"><b>&lt;Address&gt;</b></span> conforme o solicitado.
</div>
''', unsafe_allow_html=True)

    st.write("")
    # NOTE(review): these two columns are never populated; the call is kept
    # because it still emits an (empty) layout row -- confirm it can be dropped.
    col_ex_in, col_ex_out = st.columns(2)

    col1, col2 = st.columns([1, 1])

    # ---- Left column: input and summary counters ----
    with col1:
        st.subheader("📝 Input Document")
        input_text = st.text_area(
            "Enter your text here:",
            value=example_texts[selected_example],
            height=400,
            # Keying on the example name gives each example its own widget
            # state, so picking another example shows that example's text.
            key=f"input_area_{selected_example}",
            placeholder="Paste your document text here..."
        )
        st.markdown("""
<style>
/* Remove o puxador de redimensionamento de todas as text areas */
div[data-testid="stTextArea"] textarea {
resize: none;
}
</style>
""", unsafe_allow_html=True)
        process_btn = st.button("🔍 Anonymize", type="primary", use_container_width=True)
        if process_btn and input_text:
            with st.spinner("Processing..."):
                texto_final, relatorio = process_anonymization(input_text, threshold, tokenizer, model_ner, rel_model)
            total_entidades = len(relatorio)
            tipos_unicos = len({ent["category"] for ent in relatorio})
            st.markdown(f"""
<div class="small-metric-container">
<div class="small-metric-box">
<div class="metric-label">Entities</div>
<div class="metric-value">{total_entidades}</div>
</div>
<div class="small-metric-box">
<div class="metric-label">Categories</div>
<div class="metric-value">{tipos_unicos}</div>
</div>
</div>
""", unsafe_allow_html=True)

    # ---- Right column: results ----
    with col2:
        st.subheader("🔒 Anonymization Results")
        if process_btn and input_text:
            tab_text, tab_entities = st.tabs(["📄 Anonymized Text", "🔍 Extracted Entities"])
            with tab_text:
                st.markdown(f'''
<div class="tab-window" style="border-top: 4px solid #2162bf;">
{texto_final}
</div>
''', unsafe_allow_html=True)
            with tab_entities:
                # Group the extracted strings by category (prefix removed).
                agrupado = {}
                for item in relatorio:
                    cat_limpa = item["category"].replace("PERSONAL-", "")
                    agrupado.setdefault(cat_limpa, []).append(item["text"])

                sections = []
                for cat, lista in agrupado.items():
                    # Entity text comes straight from the user's document and
                    # is rendered as raw HTML below, so escape it.
                    linhas = "".join(
                        f"<div style='color:#ffffff; margin-left:15px; margin-top:3px;'>- {html.escape(valor, quote=False)}</div>"
                        for valor in lista
                    )
                    sections.append(
                        "<div style='margin-bottom:20px;'>"
                        f"<b style='color:#a8dadc; font-size:1.1rem;'>{cat}</b> "
                        f"<span style='color:#666;'>({len(lista)})</span><br>"
                        f"{linhas}"
                        "</div>"
                    )
                html_content = "".join(sections)
                st.markdown(f'''
<div class="tab-window" style="border-top: 4px solid #2162bf;">
{html_content if html_content else "No entities found."}
</div>
''', unsafe_allow_html=True)
            with st.expander("📋 Full Entity Report (JSON)"):
                download_data = {
                    "full_text": input_text,
                    "personal_info": relatorio
                }
                json_string = json.dumps(download_data, indent=4, ensure_ascii=False)
                st.download_button(
                    label="📥 Download JSON Report",
                    data=json_string,
                    file_name="anonymization_report.json",
                    mime="application/json",
                    use_container_width=True
                )
                st.json(relatorio)
        else:
            st.markdown('''
<div style="margin-top: 30px;">
</div>
''', unsafe_allow_html=True)
            st.info("Please process a document on the left to view the results.")

    st.markdown("---")
# Build the page only when this file is executed as the main script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()