|
|
import html
import json
import os
import re
from collections import defaultdict

import streamlit as st
import torch
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModelForTokenClassification
|
|
|
|
|
# Hugging Face model identifiers: the token-classification model that detects
# personal information, and the sentence encoder used to link mentions.
MODEL_PATH = "liaad/Citilink-XLMR-Anonymization-pt"
MODEL_REL_PATH = "liaad/Citilink-mpnet-Entity-Linker-pt"

# Page-level configuration. The title previously began with an orphaned
# variation selector (an emoji whose base glyph was lost), which rendered as a
# garbled browser-tab title; the shield icon is already supplied by page_icon.
st.set_page_config(
    page_title="Text Anonymization Demo",
    page_icon="🛡️",
    layout="wide",
    initial_sidebar_state="expanded",
)
|
|
|
|
|
# Global stylesheet injected once at start-up.
# Fix: the second `@media (prefers-color-scheme: dark)` block was left open
# after `.result-window`, which scoped `.browser-window`, `.browser-header`
# and `.dot` to dark mode only. It is now closed right after the
# `.result-window` override, matching the first media query in this sheet.
_APP_CSS = """
<style>

/* 1. Elimina a capacidade de arrastar/redimensionar a barra */
[data-testid="stSidebarResizer"] {
    display: none !important;
}

/* 2. Garante que a barra tem uma largura fixa apenas enquanto aberta */
/* Assim ela não 'dança' e o fecho continua a ser total */
[data-testid="stSidebar"][aria-expanded="true"] {
    min-width: 320px;
    max-width: 320px;
}

/* 3. Ajusta o conteúdo principal para colar à esquerda quando fechada */
[data-testid="stMain"] {
    margin-left: 0px;
}
.main-header {
    font-size: 2.5rem;
    font-weight: bold;
    color: #e63946;
    text-align: center;
    margin-bottom: 1rem;
}
.anon-box {
    padding: 1.5rem;
    margin: 0.5rem 0;
    border-radius: 0.5rem;
    border-left: 5px solid #1d3557;
    background-color: #f8f9fa;
    font-family: 'Courier New', Courier, monospace;
    line-height: 1.6;
    color: #1e1e1e;
}
@media (prefers-color-scheme: dark) {
    .anon-box { background-color: #1e212b; color: #e0e0e0; border-left: 5px solid #a8dadc; }
}
.entity-tag {
    background-color: #e9ecef;
    padding: 2px 6px;
    border-radius: 4px;
    font-weight: bold;
    color: #1d3557;
}
.metric-box {
    background-color: #f1faee;
    padding: 1rem;
    border-radius: 0.5rem;
    text-align: center;
    border: 1px solid #a8dadc;
}

.small-metric-container {
    display: flex;
    justify-content: space-between;
    gap: 10px;
    margin-top: 10px; /* Espaço logo abaixo do botão */
}
.small-metric-box {
    flex: 1;
    background-color: #f8f9fa;
    border-radius: 6px;
    padding: 5px;
    text-align: center;
    border: 1px solid #dee2e6;
}
.metric-label {
    font-size: 0.65rem;
    color: #6c757d;
    text-transform: uppercase;
    font-weight: bold;
}
.metric-value {
    font-size: 1rem;
    color: #1d3557;
    font-weight: bold;
}

.result-window {
    height: 450px;
    overflow-y: auto;
    padding: 1rem;
    border-radius: 8px;
    border: 1px solid #dee2e6;
    background-color: #ffffff;
    font-family: 'Courier New', Courier, monospace;
    font-size: 0.85rem;
    line-height: 1.5;
}
@media (prefers-color-scheme: dark) {
    .result-window { background-color: #1e212b; color: #e0e0e0; border: 1px solid #444; }
}

.browser-window {
    height: 500px;
    overflow-y: auto;
    padding: 15px;
    border-radius: 0px 0px 8px 8px; /* Arredondado apenas em baixo */
    border: 1px solid #d1d5db;
    background-color: #ffffff;
    font-family: 'Consolas', 'Monaco', monospace;
    font-size: 0.9rem;
    user-select: text; /* Garante que o utilizador pode selecionar o texto */
}
.browser-header {
    background-color: #f1f5f9;
    padding: 5px 15px;
    border: 1px solid #d1d5db;
    border-bottom: none;
    border-radius: 8px 8px 0px 0px;
    font-size: 0.75rem;
    font-weight: bold;
    color: #475569;
    display: flex;
    align-items: center;
    gap: 8px;
}
.dot { height: 10px; width: 10px; border-radius: 50%; display: inline-block; }

/* Estilo para a área de conteúdo das Tabs */
/* Janela de Texto com fundo #262730 */
.tab-window {
    /* Reduzimos para alinhar com a caixa de estatísticas da esquerda */
    height: 495px;

    overflow-y: auto;
    padding: 20px;
    border: 1px solid #444;
    border-radius: 0px 0px 8px 8px;
    background-color: #262730 !important;
    font-family: 'Consolas', 'Monaco', monospace;
    font-size: 0.95rem;
    line-height: 1.6;
    user-select: text;
    color: #efefef !important;
    margin-bottom: 20px;
}

/* Ajuste do sombreado das TAGS para o modo escuro */
.entity-highlight {
    background-color: #3d3f4b; /* Cinza um pouco mais claro que o fundo */
    color: #a8dadc; /* Texto da tag num azul ciano suave */
    padding: 2px 6px;
    border-radius: 4px;
    border: 1px solid #555;
    display: inline-block;
    line-height: 1.2;
    font-weight: bold;
}

/* Estilização das Tabs (Abas) */
.stTabs [data-baseweb="tab-list"] button [data-testid="stMarkdownContainer"] p {
    color: #9ca3af !important; /* Um cinza claro para as abas "apagadas" */
    font-size: 1rem;
    font-weight: bold;
    transition: color 0.3s ease;
}

/* 2. COR DAS ABAS QUANDO ESTÃO SELECIONADAS (ATIVAS) */
/* Aqui mudamos para o Azul que pediste anteriormente */
.stTabs [aria-selected="true"] [data-testid="stMarkdownContainer"] p {
    color: #a8dadc !important; /* Um azul ciano/claro para brilhar no dark mode */
}

/* 3. A LINHA (BARRA) QUE FICA POR BAIXO DA ABA SELECIONADA */
.stTabs [data-baseweb="tab-highlight"] {
    background-color: #a8dadc !important; /* Cor da linha que corre por baixo */
}

/* 4. EFEITO AO PASSAR O RATO (HOVER) */
.stTabs [data-baseweb="tab"]:hover [data-testid="stMarkdownContainer"] p {
    color: #ffffff !important; /* Fica branco ao passar o rato */
}

/* 1. Remove o espaço em branco excessivo no topo sem quebrar o botão da barra lateral */
.block-container {
    padding-top: 1.5rem !important;
    padding-bottom: 0rem !important;
}

/* 2. Esconde o fundo e a decoração do header, mas MANTÉM o botão de abrir/fechar */
header[data-testid="stHeader"] {
    background: transparent !important;
    color: transparent !important;
}

/* 3. Garante que o botão da barra lateral (Chevron) é visível mesmo com header transparente */
[data-testid="collapsedControl"] {
    color: #bdbbbb !important; /* Mesma cor do seu título */
    visibility: visible !important;
    display: flex !important;
}

/* 4. Elimina a capacidade de arrastar a largura da barra */
[data-testid="stSidebarResizer"] {
    display: none !important;
}

/* 5. Largura fixa da barra lateral */
[data-testid="stSidebar"][aria-expanded="true"] {
    min-width: 320px;
    max-width: 320px;
}

/* Ajusta a margem do título principal */
.main-title {
    margin-top: -45px !important;
    position: relative;
}

.streamlit-expanderHeader {
    background-color: #1e212b !important;
    border: 1px solid #444 !important;
    border-radius: 8px !important;
}
html, body {
    overflow: hidden !important;
    height: 100%;
}

/* 2. Garante que o container principal do Streamlit retém o scroll */
/* Isso permite que a barra de scroll que vês seja a da App e não a da Web */
[data-testid="stMainViewContainer"] {
    overflow-y: auto !important;
}

/* Opcional: Se quiseres que a barra do Streamlit seja mais discreta/fina */
[data-testid="stMainViewContainer"]::-webkit-scrollbar {
    width: 8px;
}
[data-testid="stMainViewContainer"]::-webkit-scrollbar-thumb {
    background: #444;
    border-radius: 10px;
}
</style>
"""

st.markdown(_APP_CSS, unsafe_allow_html=True)
|
|
|
|
|
|
|
|
@st.cache_resource
def _load_models_cached():
    """Load the tokenizer, the NER model and the entity-linking encoder.

    Any loading error propagates to the caller. Keeping the ``try/except``
    out of the cached function means a failed attempt is not stored as the
    cached result, so a later rerun can retry.
    """
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, add_prefix_space=True)
    model_ner = AutoModelForTokenClassification.from_pretrained(MODEL_PATH)
    rel_model = SentenceTransformer(MODEL_REL_PATH)
    return tokenizer, model_ner, rel_model


def load_models():
    """Return ``(tokenizer, model_ner, rel_model, error)``.

    On success ``error`` is ``None``. On failure the three model slots are
    ``None`` and ``error`` is a non-empty description of the exception.
    """
    try:
        tokenizer, model_ner, rel_model = _load_models_cached()
    except Exception as exc:  # UI boundary: report the failure instead of crashing
        # Some exceptions have an empty message; fall back to repr() so the
        # caller's truthiness check on the error string still fires.
        return None, None, None, str(exc) or repr(exc)
    return tokenizer, model_ner, rel_model, None
|
|
|
|
|
|
|
|
# Token prefixes that mark the beginning of a new word in the tokenizer output.
_WORD_START_PREFIXES = (" ", "▁", "Ġ")

# Labels (without the "PERSONAL-" prefix) whose placeholders carry a numeric ID.
_LABELS_WITH_ID = (
    "Name", "AdministrativeInformation", "PositionDepartment", "Address",
    "PersonalDocument", "Company", "LicensePlate", "Vehicle",
)

# Subset of ID-bearing labels that are additionally linked by embedding similarity.
_LABELS_LINKED_BY_MODEL = ("Name", "Address", "Company", "Vehicle", "PositionDepartment")


def _close_entity(entity, text, collected):
    """Trim trailing punctuation from ``entity`` and append it to ``collected``."""
    # PositionDepartment spans keep a trailing period
    # (presumably for abbreviations such as "Dr." — confirm with the model authors).
    if entity["label"] == "PERSONAL-PositionDepartment":
        strip_chars = ",;:!?"
    else:
        strip_chars = ".,;:!?"
    while entity["end"] > entity["start"] and text[entity["end"] - 1] in strip_chars:
        entity["end"] -= 1
    # A span made only of punctuation shrinks to nothing; do not report it.
    if entity["end"] > entity["start"]:
        collected.append(entity)


def _extract_raw_entities(text, tokenizer, model_ner):
    """Run the NER model over sliding windows and return per-window entity spans."""
    id2label = model_ner.config.id2label
    encoded = tokenizer(
        text,
        truncation=True,
        max_length=512,
        stride=164,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding=True,
        return_tensors="pt",
    )
    offsets_per_window = encoded.pop("offset_mapping")
    encoded.pop("overflow_to_sample_mapping", None)
    input_ids = encoded["input_ids"]
    # Windows are padded to a common length, so the mask must be forwarded;
    # otherwise padding positions are attended to.
    attention_mask = encoded.get("attention_mask")

    with torch.no_grad():
        logits = model_ner(input_ids=input_ids, attention_mask=attention_mask).logits
    predictions_per_window = torch.argmax(logits, dim=2).tolist()

    raw_entities = []
    windows = zip(predictions_per_window, offsets_per_window)
    for window_idx, (predictions, offsets) in enumerate(windows):
        tokens = tokenizer.convert_ids_to_tokens(input_ids[window_idx].tolist())
        current = None  # entity being assembled, or None

        for pred_id, offset, token in zip(predictions, offsets, tokens):
            start_char, end_char = int(offset[0]), int(offset[1])
            if start_char == end_char:
                # Zero-width offsets: skipped, exactly as the original loop did.
                continue

            label_name = id2label[pred_id]
            tag_base = label_name.split("-", 1)[1] if "-" in label_name else None
            starts_new_word = token.startswith(_WORD_START_PREFIXES)

            if label_name.startswith("B-"):
                if current is not None:
                    _close_entity(current, text, raw_entities)
                current = {
                    "start": start_char,
                    "end": end_char,
                    "label": tag_base,
                    "tokens": [token],
                }
            elif current is not None and (
                (label_name.startswith("I-") and current["label"] == tag_base)
                or not starts_new_word
            ):
                # Either an explicit continuation, or a word piece glued to
                # the entity's last word regardless of its predicted label.
                current["tokens"].append(token)
                current["end"] = end_char
            else:
                if current is not None:
                    _close_entity(current, text, raw_entities)
                current = None

        # Fix: an entity still open when the window ends used to be dropped,
        # because the closing zero-width token is skipped above.
        if current is not None:
            _close_entity(current, text, raw_entities)

    return raw_entities


def _merge_entities(raw_entities):
    """Merge adjacent same-label spans and resolve overlaps between windows.

    The returned list is sorted by start offset and contains no overlapping spans.
    """
    merged = []
    for current in sorted(raw_entities, key=lambda ent: ent["start"]):
        if merged:
            last = merged[-1]
            same_label = current["label"] == last["label"]
            if same_label and current["start"] - last["end"] <= 1:
                # max() keeps the span from shrinking when `current` lies
                # inside `last` (duplicate detections from overlapping windows).
                last["end"] = max(last["end"], current["end"])
                last["tokens"].extend(current["tokens"])
                continue

        overlapping = [
            kept for kept in merged
            if min(current["end"], kept["end"]) - max(current["start"], kept["start"]) > 0
        ]
        if not overlapping:
            merged.append(current)
            continue

        current_len = current["end"] - current["start"]
        if all(current_len > kept["end"] - kept["start"] for kept in overlapping):
            # The longer span wins and replaces every span it intersects.
            merged = [
                kept for kept in merged
                if not any(kept is other for other in overlapping)
            ]
            merged.append(current)
        # Otherwise `current` is discarded in favour of what is already kept.
    return merged


def _assign_entity_ids(entities, text, threshold, rel_model):
    """Set ``entity_id`` on every entity in place (``None`` for labels without IDs).

    Identical lower-cased texts of the same label share an ID. For labels in
    ``_LABELS_LINKED_BY_MODEL`` a new mention also reuses the ID of the most
    similar earlier mention when the cosine similarity exceeds ``threshold``.
    """
    known_entities = {}    # (label, lower-cased text) -> id
    known_embeddings = {}  # lower-cased text -> embedding
    id_counters = defaultdict(int)

    for ent in entities:
        tag = ent["label"].replace("PERSONAL-", "")
        text_key = text[ent["start"]:ent["end"]].strip().lower()
        assigned_id = None

        if tag in _LABELS_WITH_ID:
            if (tag, text_key) in known_entities:
                assigned_id = known_entities[(tag, text_key)]
            elif tag in _LABELS_LINKED_BY_MODEL:
                current_emb = rel_model.encode(text_key, convert_to_tensor=True)
                best_score, best_id = 0.0, None
                candidates = [
                    (prev_key, prev_id)
                    for (prev_tag, prev_key), prev_id in known_entities.items()
                    if prev_tag == tag
                ]
                for prev_key, prev_id in candidates:
                    prev_emb = known_embeddings.get(prev_key)
                    if prev_emb is None:
                        prev_emb = rel_model.encode(prev_key, convert_to_tensor=True)
                        known_embeddings[prev_key] = prev_emb
                    score = util.cos_sim(current_emb, prev_emb).item()
                    if score > best_score:
                        best_score, best_id = score, prev_id
                if best_score > threshold:
                    assigned_id = best_id
                known_embeddings[text_key] = current_emb

            if assigned_id is None:
                id_counters[tag] += 1
                assigned_id = id_counters[tag]
            known_entities[(tag, text_key)] = assigned_id

        ent["entity_id"] = assigned_id


def _render_anonymized_html(text, entities):
    """Build the HTML-safe anonymized text and the entity report."""
    pieces = []
    report = []
    cursor = 0
    for ent in sorted(entities, key=lambda item: item["start"]):
        tag = ent["label"].replace("PERSONAL-", "")
        id_part = f"-{ent['entity_id']}" if ent.get("entity_id") else ""
        report.append({
            "category": ent["label"],
            "text": text[ent["start"]:ent["end"]].strip(),
            "start": ent["start"],
            "end": ent["end"],
            "id": ent.get("entity_id"),
        })
        # The result is rendered with unsafe_allow_html, so document text is
        # escaped and the placeholder's angle brackets are written as
        # entities; a raw "<Name-1>" would be parsed as an unknown element.
        pieces.append(html.escape(text[cursor:ent["start"]], quote=False))
        pieces.append(
            f' <span class="entity-highlight"><b>&lt;{tag}{id_part}&gt;</b></span> '
        )
        cursor = max(cursor, ent["end"])
    pieces.append(html.escape(text[cursor:], quote=False))
    return re.sub(r' +', ' ', "".join(pieces)).strip(), report


def process_anonymization(text, threshold, tokenizer, model_ner, rel_model):
    """Detect personal information in ``text`` and replace it with placeholders.

    Args:
        text: Document to anonymize.
        threshold: Minimum cosine similarity for two mentions of a linkable
            label to share the same ID.
        tokenizer: Tokenizer matching ``model_ner``; called with
            ``return_offsets_mapping=True``.
        model_ner: Token-classification model exposing ``config.id2label``.
        rel_model: Sentence encoder providing ``encode(..., convert_to_tensor=True)``.

    Returns:
        ``(html_text, report)``. ``html_text`` is the document with every
        entity replaced by an escaped ``<Label-ID>`` placeholder wrapped in a
        highlight span. ``report`` is a list of dicts with ``category``,
        ``text``, ``start``, ``end`` and ``id``, ordered by position. Blank
        input yields a prompt message and an empty list.
    """
    if not text.strip():
        return "Por favor, insira um texto.", []

    raw_entities = _extract_raw_entities(text, tokenizer, model_ner)
    entities = _merge_entities(raw_entities)
    _assign_entity_ids(entities, text, threshold, rel_model)
    return _render_anonymized_html(text, entities)
|
|
|
|
|
|
|
|
@st.cache_data
def load_example_texts():
    """Return the example documents stored in ``example_text.json``.

    The file is looked up next to this module. If it cannot be opened or
    parsed, a mapping with the four expected example names and empty texts
    is returned instead.
    """
    examples_file = os.path.join(os.path.dirname(__file__), 'example_text.json')
    try:
        with open(examples_file, 'r', encoding='utf-8') as handle:
            loaded = json.load(handle)
    except Exception:
        # Best effort: fall back to empty examples.
        return {"Custom Text": "", "1º Portuguese Meeting Minute": "", "2º Portuguese Meeting Minute": "", "3º Portuguese Meeting Minute": ""}
    return loaded
|
|
|
|
|
def main():
    """Render the Streamlit page: sidebar, explanation, input form and results.

    NOTE(review): the source reached review with its indentation stripped.
    The extent of the "How it works" expander and of the two column bodies
    was reconstructed from context — confirm against the deployed layout.
    """
    st.markdown('<p style="font-size: 60px; font-weight: bold; color: #bdbbbb; text-align: center; margin-bottom: 10px;">🛡️ PID: Text Anonymization Demo</p>', unsafe_allow_html=True)
    st.markdown("""
        <p style="text-align: center; color: #666;">
        Automatic text anonymization for city council minutes and administrative documents
        </p>
    """, unsafe_allow_html=True)

    tokenizer, model_ner, rel_model, error = load_models()

    if error:
        st.error(f"Erro ao carregar modelos: {error}")
        st.stop()

    # ---------------------------------------------------------------- sidebar
    st.sidebar.header("⚙️ Configuration")

    st.sidebar.write("---")

    example_texts = load_example_texts()

    selected_example = st.sidebar.selectbox(
        "Choose an example:",
        options=list(example_texts.keys()),
        index=0,
    )

    st.sidebar.markdown("<br><br>", unsafe_allow_html=True)

    threshold = st.sidebar.slider(
        "Entity Linking Threshold",
        min_value=0.0, max_value=1.0, value=0.80, step=0.05,
        help="Higher threshold = more strict. Use a higher value to ensure only very similar entities get the same ID, preventing different people from being grouped together.",
    )

    st.sidebar.markdown("---")

    st.sidebar.markdown("### 📊 About")

    st.sidebar.info("""
        - **Anonymization (NER)** uses Token Classification to identify and mask sensitive information (PID) in administrative documents.
        - **Model**: XLM-RoBERTa fine-tuned for Named Entity Recognition.
        - **Languages**: Portuguese (pt-pt).
        - **Method**: Sequence Labeling with Bi-Encoder Entity Linking.
    """)

    st.sidebar.markdown("")
    st.sidebar.markdown("### 🔗 Resources")
    st.sidebar.markdown("""
        - [📖 Model Card](https://huggingface.co/liaad/Citilink-XLMR-Anonymization-pt) (Anonymization)
        - [📖 Model Card](https://huggingface.co/liaad/Citilink-mpnet-Entity-Linker-pt) (Entity Linking)
        - [💾 GitHub Repository](https://github.com/)
    """)

    st.write("")

    # ----------------------------------------------------------- explanation
    with st.expander("🎯 How it works", expanded=False):
        st.markdown("""
            The anonymization process is powered by two specialized AI models working in sequence:

            1. **The Detector (NER):** A model designed to extract personal information entities across multiple categories, identifying sensitive data within the document's context.
            2. **The Linker (Entity Linking):** Understands when different words refer to the same entity. For example, it knows that *"João Silva"* and *"Sr. Silva"* are the same person, assigning them a consistent ID (e.g., `<Name-1>`).
        """)

        st.markdown("""
            <div style="background-color: #262730; padding: 20px; border-radius: 10px; border-left: 5px solid #3b82f6; margin-bottom: 25px;">
            <p style="color: #3b82f6; font-weight: bold; margin-top: 0; margin-bottom: 5px;">Supported Entities:</p>
            <p style="font-size: 0.75em; color: #94a3b8; margin-bottom: 15px;">Note: Entities marked with <span style="color: #60a5fa; font-weight: bold;">(ID)</span> support consistent linking across the document.</p>
            <div style="display: grid; grid-template-columns: repeat(4, 1fr); gap: 10px; font-size: 0.85em; color: white;">
            <div>
            • Name <b style="color: #60a5fa;">(ID)</b><br>• Admin. Document <b style="color: #60a5fa;">(ID)</b><br>• Position/Department <b style="color: #60a5fa;">(ID)</b><br>• Address <b style="color: #60a5fa;">(ID)</b><br>• Date
            </div>
            <div>
            • Location<br>• Personal Document <b style="color: #60a5fa;">(ID)</b><br>• Company <b style="color: #60a5fa;">(ID)</b><br>• Artistic Activity
            </div>
            <div>
            • Degree<br>• Time<br>• License <b style="color: #60a5fa;">(ID)</b><br>• Job
            </div>
            <div>
            • Vehicle <b style="color: #60a5fa;">(ID)</b><br>• Faculty<br>• Family Relationship<br>• Other
            </div>
            </div>
            </div>
        """, unsafe_allow_html=True)

        st.markdown("**INPUT:**")
        st.markdown('''
            <div class="tab-window" style="height: auto; padding: 15px; border-top: 4px solid #666; margin-bottom: 10px;">
            O interessado Dr. João Silva submeteu o processo administrativo 5597/2023 no dia 20/05/2023, relativo ao imóvel localizado na Rua das Flores n.º 10 conforme o solicitado.
            </div>
        ''', unsafe_allow_html=True)

        st.markdown("**OUTPUT:**")
        # Placeholders are written with &lt;/&gt; so the browser displays them
        # instead of parsing them as unknown tags. Address carries an ID,
        # consistent with the "Supported Entities" list above.
        st.markdown('''
            <div class="tab-window" style="height: auto; padding: 15px; border-top: 4px solid #a8dadc;">
            O interessado <span class="entity-highlight"><b>&lt;PositionDepartment-1&gt;</b></span> <span class="entity-highlight"><b>&lt;Name-1&gt;</b></span>
            submeteu o processo administrativo <span class="entity-highlight"><b>&lt;AdministrativeInformation-1&gt;</b></span>
            no dia <span class="entity-highlight"><b>&lt;Date&gt;</b></span>,
            relativo ao imóvel localizado na <span class="entity-highlight"><b>&lt;Address-1&gt;</b></span> conforme o solicitado.
            </div>
        ''', unsafe_allow_html=True)

    st.write("")

    # NOTE(review): this row of two columns is never filled. The call is kept
    # so the rendered layout stays as before; remove it if the gap is unwanted.
    st.columns(2)

    col1, col2 = st.columns([1, 1])

    # ------------------------------------------------------------ input side
    with col1:
        st.subheader("📝 Input Document")

        input_text = st.text_area(
            "Enter your text here:",
            value=example_texts[selected_example],
            height=400,
            key=f"input_area_{selected_example}",
            placeholder="Paste your document text here...",
        )

        st.markdown("""
            <style>
            /* Remove o puxador de redimensionamento de todas as text areas */
            div[data-testid="stTextArea"] textarea {
                resize: none;
            }
            </style>
        """, unsafe_allow_html=True)

        process_btn = st.button("🔍 Anonymize", type="primary", use_container_width=True)

        should_process = bool(process_btn and input_text)

        if should_process:
            with st.spinner("Processing..."):
                texto_final, relatorio = process_anonymization(input_text, threshold, tokenizer, model_ner, rel_model)

            total_entidades = len(relatorio)
            tipos_unicos = len({ent["category"] for ent in relatorio})

            st.markdown(f"""
                <div class="small-metric-container">
                <div class="small-metric-box">
                <div class="metric-label">Entities</div>
                <div class="metric-value">{total_entidades}</div>
                </div>
                <div class="small-metric-box">
                <div class="metric-label">Categories</div>
                <div class="metric-value">{tipos_unicos}</div>
                </div>
                </div>
            """, unsafe_allow_html=True)

    # ---------------------------------------------------------- results side
    with col2:
        st.subheader("🔒 Anonymization Results")

        if should_process:
            tab_text, tab_entities = st.tabs(["📄 Anonymized Text", "🔍 Extracted Entities"])

            with tab_text:
                # Built without leading indentation: the interpolated text can
                # span several lines, and an indented template would then no
                # longer dedent to a clean HTML block.
                st.markdown(
                    '<div class="tab-window" style="border-top: 4px solid #2162bf;">\n'
                    f"{texto_final}\n"
                    "</div>",
                    unsafe_allow_html=True,
                )

            with tab_entities:
                agrupado = {}
                for item in relatorio:
                    cat_limpa = item["category"].replace("PERSONAL-", "")
                    agrupado.setdefault(cat_limpa, []).append(item["text"])

                sections = []
                for cat, lista in agrupado.items():
                    # Entity text comes straight from the user's document and
                    # must be escaped before being emitted as HTML.
                    rows = "".join(
                        f"<div style='color:#ffffff; margin-left:15px; margin-top:3px;'>- {html.escape(entry)}</div>"
                        for entry in lista
                    )
                    sections.append(
                        "<div style='margin-bottom:20px;'>"
                        f"<b style='color:#a8dadc; font-size:1.1rem;'>{html.escape(cat)}</b> "
                        f"<span style='color:#666;'>({len(lista)})</span><br>"
                        f"{rows}"
                        "</div>"
                    )
                html_content = "".join(sections)

                st.markdown(
                    '<div class="tab-window" style="border-top: 4px solid #2162bf;">\n'
                    f'{html_content if html_content else "No entities found."}\n'
                    "</div>",
                    unsafe_allow_html=True,
                )

            with st.expander("📋 Full Entity Report (JSON)"):
                download_data = {
                    "full_text": input_text,
                    "personal_info": relatorio,
                }

                json_string = json.dumps(download_data, indent=4, ensure_ascii=False)

                st.download_button(
                    label="📥 Download JSON Report",
                    data=json_string,
                    file_name="anonymization_report.json",
                    mime="application/json",
                    use_container_width=True,
                )

                st.json(relatorio)
        else:
            # Spacer so the hint lines up with the input area on the left.
            st.markdown('<div style="margin-top: 30px;"></div>', unsafe_allow_html=True)
            st.info("Please process a document on the left to view the results.")

    st.markdown("---")


if __name__ == "__main__":
    main()