# AiAnonymize_2 / app.py
# Alessandro Tomassini
# vs 2.1
# 3f87064
"""Anonimizzatore Gare d'Appalto — UI Gradio."""
import json
import gradio as gr
from config import (
MODES, DEFAULT_MIN_SCORE,
SEVERITY_ORDER, SEVERITY_LABELS, SEVERITY_COLORS,
SEVERITY_CRITICAL, SEVERITY_HIGH, SEVERITY_MEDIUM, SEVERITY_LOW,
get_severity,
)
from core.pipeline import detect, anonymize_from_entities, apply_custom_lines
from ui.renderers import (
render_highlighted_text, render_anonymized_text,
render_categorized_report, build_json_by_severity,
)
from utils import extract_pdf_text, extract_pdf_text_got_ocr, extract_pdf_text_smoldocling
from demo_text import DEMO_TEXT, DEMO_REGEX, DEMO_GLINER, DEMO_NER
# Demo texts selectable from the dropdown in the text tab, keyed by the label
# shown to the user.
_DEMO_TEXTS = {
    "Bando completo": DEMO_TEXT,
    "Layer 0 — Regex": DEMO_REGEX,
    "Layer 1 — NER": DEMO_NER,
    "Layer 2 — GLiNER": DEMO_GLINER,
}
# PDF text extractors selectable in the PDF tab, keyed by the radio label.
# handle_pdf falls back to extract_pdf_text when the label is not found here.
_PDF_EXTRACTORS = {
    "PyMuPDF (default)": extract_pdf_text,
    "GOT-OCR 2.0 (~30s/pag, CPU)": extract_pdf_text_got_ocr,
    "SmolDocling 256M (CPU)": extract_pdf_text_smoldocling,
}
# Initial/reset value of the gr.State: original text, obfuscation mode,
# anonymized text and the detected entities.
# NOTE(review): handlers reset the state with `_STATE_DEFAULT.copy()`, which is
# a shallow copy — the `entities` list object is shared between copies, so it
# must never be mutated in place.
_STATE_DEFAULT = {"orig_text": "", "mode": "", "anon_text": "", "entities": []}
# Every severity level; used by _active_set when no selection is supplied.
_ALL_SEVERITIES = list(SEVERITY_ORDER)
# Placeholder shown in the JSON output panes when there is nothing to report.
_EMPTY_JSON = "{}"
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _active_set(active_severities: list[str]) -> set[str]:
    """Return the severity levels to treat as active, as a set.

    ``None`` means that no selection was supplied and enables every level in
    ``_ALL_SEVERITIES``. An empty list is respected and yields an empty set.
    """
    if active_severities is None:
        return set(_ALL_SEVERITIES)
    return set(active_severities)
# ---------------------------------------------------------------------------
# Handlers
# ---------------------------------------------------------------------------
def _process(text: str, mode: str, min_score: float, use_regex: bool, active_severities: list[str]):
    """Run detection and anonymization on *text* and build all UI outputs.

    Every detected entity is stored in the returned state, not only those of
    the currently active severities, so that a later change of mode or
    severity can recompute the outputs without running detection again.

    Returns:
        An 8-tuple: highlighted HTML, anonymized HTML, report HTML, the new
        state dict, and the four values produced by
        ``build_json_by_severity`` (one per JSON output pane).
    """
    enabled = _active_set(active_severities)
    detected = detect(text, min_score=min_score, use_regex=use_regex)

    # Only entities whose severity is enabled are actually obfuscated.
    to_mask = []
    for ent in detected:
        if get_severity(ent.entity_type) in enabled:
            to_mask.append(ent)
    anonymized = anonymize_from_entities(text, to_mask, mode)

    updated_state = {
        "orig_text": text,
        "mode": mode,
        "anon_text": anonymized,
        "entities": detected,
    }
    json_c, json_h, json_m, json_l = build_json_by_severity(text, detected, enabled)

    highlighted_html = render_highlighted_text(text, detected, enabled)
    anonymized_html = render_anonymized_text(anonymized)
    report_html = render_categorized_report(text, detected, enabled)
    return (
        highlighted_html,
        anonymized_html,
        report_html,
        updated_state,
        json_c,
        json_h,
        json_m,
        json_l,
    )
def handle_settings_change(state_data: dict, mode: str, c: bool, h: bool, m: bool, l: bool):
    """Recompute the outputs after the mode or a severity toggle changes.

    Detection is not executed again: the entities and original text saved in
    *state_data* are reused. The four booleans are paired positionally with
    ``SEVERITY_ORDER``.

    Returns:
        The same 8-tuple layout as ``_process``. When the state holds no text
        or no entities, every output is a no-op ``gr.update()`` and the state
        is returned unchanged.
    """
    stored_entities = state_data.get("entities") or []
    source_text = state_data.get("orig_text") or ""

    if not (source_text and stored_entities):
        # Nothing has been analysed yet: leave every component untouched.
        untouched_html = tuple(gr.update() for _ in range(3))
        untouched_json = tuple(gr.update() for _ in range(4))
        return (*untouched_html, state_data, *untouched_json)

    checked = [sev for sev, on in zip(SEVERITY_ORDER, (c, h, m, l)) if on]
    enabled = _active_set(checked)

    to_mask = [ent for ent in stored_entities if get_severity(ent.entity_type) in enabled]
    anonymized = anonymize_from_entities(source_text, to_mask, mode)

    # Keep the stored text and entities; only mode and anonymized text change.
    updated_state = dict(state_data, mode=mode, anon_text=anonymized)
    json_c, json_h, json_m, json_l = build_json_by_severity(source_text, stored_entities, enabled)

    highlighted_html = render_highlighted_text(source_text, stored_entities, enabled)
    anonymized_html = render_anonymized_text(anonymized)
    report_html = render_categorized_report(source_text, stored_entities, enabled)
    return (
        highlighted_html,
        anonymized_html,
        report_html,
        updated_state,
        json_c,
        json_h,
        json_m,
        json_l,
    )
def handle_text(text, mode, min_score, use_regex, active_severities):
    """Handle the text tab: run the pipeline on pasted text.

    When *text* is missing or contains only whitespace, a placeholder message
    is shown in the three HTML panes, the state is reset to a copy of
    ``_STATE_DEFAULT`` and the JSON panes receive ``_EMPTY_JSON``.
    """
    if text and text.strip():
        return _process(text, mode, min_score, use_regex, active_severities)

    placeholder_html = (
        '<div style="padding:40px; text-align:center; color:#6b7280; '
        'font-family:Arial,sans-serif;">Inserisci del testo per iniziare.</div>'
    )
    html_panes = (placeholder_html,) * 3
    json_panes = (_EMPTY_JSON,) * 4
    return (*html_panes, _STATE_DEFAULT.copy(), *json_panes)
def handle_pdf(pdf_file, mode, min_score, use_regex, active_severities, extractor_label="PyMuPDF (default)"):
    """Handle the PDF tab: extract text from the upload and run the pipeline.

    Args:
        pdf_file: The uploaded file as delivered by the file component, or
            ``None`` when nothing was uploaded.
        mode: Obfuscation mode forwarded to ``_process``.
        min_score: Minimum confidence score forwarded to ``_process``.
        use_regex: Whether the regex layer is enabled, forwarded to ``_process``.
        active_severities: Severity levels that are currently enabled.
        extractor_label: Key into ``_PDF_EXTRACTORS``; an unknown label falls
            back to ``extract_pdf_text``.

    Returns:
        The 8-tuple produced by ``_process``. When there is no file or no
        text could be extracted, a message is shown in the three HTML panes,
        the state is reset and the JSON panes receive ``_EMPTY_JSON``.
    """
    def _err(msg):
        """Build the 8-output tuple that displays *msg* and resets the state."""
        e = (
            f'<div style="padding:40px; text-align:center; color:#6b7280; '
            f'font-family:Arial,sans-serif;">{msg}</div>'
        )
        return (e, e, e, _STATE_DEFAULT.copy(),
                _EMPTY_JSON, _EMPTY_JSON, _EMPTY_JSON, _EMPTY_JSON)

    if pdf_file is None:
        return _err("Carica un PDF per iniziare.")

    extractor = _PDF_EXTRACTORS.get(extractor_label, extract_pdf_text)
    raw = extractor(pdf_file)

    # Check for a missing result before calling .strip(): an extractor that
    # returns None would otherwise raise AttributeError instead of showing
    # the "no text" message. This matches the guard used for pasted text.
    if not raw or not raw.strip():
        return _err("⚠️ Impossibile estrarre testo dal PDF (scansionato?).")
    return _process(raw, mode, min_score, use_regex, active_severities)
def handle_update_custom(selected_lines_json: str, state_data: dict):
    """Apply the user-selected lines to the anonymized text and render it.

    *selected_lines_json* is a JSON array of line strings (an empty or
    missing value is treated as ``[]``). When no line is selected, or the
    state holds no original text, the stored anonymized text is rendered
    unchanged.
    """
    payload = selected_lines_json if selected_lines_json else "[]"
    chosen: list[str] = json.loads(payload)

    if chosen and state_data.get("orig_text"):
        anon_source = apply_custom_lines(
            state_data["anon_text"],
            state_data["orig_text"],
            chosen,
            state_data["mode"],
        )
    else:
        anon_source = state_data.get("anon_text", "")
    return render_anonymized_text(anon_source)
# ---------------------------------------------------------------------------
# JavaScript
# ---------------------------------------------------------------------------
# Page-level JavaScript, passed to gr.Blocks(js=...). It:
#   * appends a full-screen loading overlay (#anon-fs-overlay) to the body and
#     exposes window._anonHideOverlay() to hide it;
#   * shows the overlay when a `button.primary` whose text is exactly
#     "Anonimizza" is clicked, with a 50-second auto-hide fallback;
#   * toggles #anon-custom-warning whenever an `.anon-line-cb` checkbox
#     changes, showing it while at least one is checked.
# The text below is a runtime string: its content (including the JS comments)
# is sent to the browser as is.
_PAGE_JS = """
(function () {
/* ── Overlay fullscreen di loading ─────────────────────────────────── */
var ov = document.createElement('div');
ov.id = 'anon-fs-overlay';
ov.innerHTML = [
'<div style="display:flex;flex-direction:column;align-items:center;gap:22px;">',
' <svg width="52" height="52" viewBox="0 0 52 52" fill="none"',
' style="animation:_ovSpin .8s linear infinite;">',
' <circle cx="26" cy="26" r="20" stroke="#e5e7eb" stroke-width="4"/>',
' <path d="M26 6a20 20 0 0 1 20 20" stroke="#6366f1"',
' stroke-width="4" stroke-linecap="round"/>',
' </svg>',
' <p id="anon-fs-msg" style="margin:0;font-family:Arial,sans-serif;',
' font-size:1em;color:#374151;font-weight:500;">Analisi in corso…</p>',
'</div>',
'<style>',
' #anon-fs-overlay{position:fixed;inset:0;z-index:9999;',
' background:rgba(255,255,255,.9);backdrop-filter:blur(6px);',
' display:none;align-items:center;justify-content:center;}',
' @keyframes _ovSpin{to{transform:rotate(360deg)}}',
'</style>',
].join('');
document.body.appendChild(ov);
window._anonHideOverlay = function () {
clearTimeout(window._anonOverlayTimer);
ov.style.display = 'none';
};
function showOverlay() {
clearTimeout(window._anonOverlayTimer);
ov.style.display = 'flex';
/* Fallback: si nasconde automaticamente dopo 50 secondi */
window._anonOverlayTimer = setTimeout(window._anonHideOverlay, 50000);
}
/* Mostra overlay al click sui pulsanti "Anonimizza" */
document.addEventListener('click', function (e) {
var btn = e.target.closest('button');
if (btn && btn.classList.contains('primary') &&
(btn.textContent || '').trim() === 'Anonimizza') {
showOverlay();
}
}, true);
/* ── Checkbox righe custom ──────────────────────────────────────────── */
document.addEventListener('change', function (e) {
if (!e.target || !e.target.classList.contains('anon-line-cb')) return;
var hasChecked = document.querySelectorAll('.anon-line-cb:checked').length > 0;
var w = document.getElementById('anon-custom-warning');
if (w) w.style.display = hasChecked ? 'block' : 'none';
}, true);
})();
"""
# JS run from .then() — hides the overlay once the Python processing has finished.
_HIDE_OVERLAY_JS = "() => { if (window._anonHideOverlay) window._anonHideOverlay(); }"
# JS run before handle_update_custom: for every checked `.anon-line-cb` it
# reads the `data-line` attribute of the enclosing `.anon-line-row` and returns
# the collected values as a JSON string in place of the first input, passing
# the second input (the state) through unchanged.
_COLLECT_JS = """
(placeholder, state) => {
var r = [];
document.querySelectorAll('.anon-line-cb:checked').forEach(function (cb) {
var row = cb.closest('.anon-line-row');
if (row) r.push(row.getAttribute('data-line'));
});
return [JSON.stringify(r), state];
}
"""
# ---------------------------------------------------------------------------
# CSS
# ---------------------------------------------------------------------------
def _severity_css(level, palette):
    """Return the two CSS rules that colour the checkbox of one severity level."""
    # Label selectors: background, left border, padding.
    label_rule = (
        f'.sev-{level} .wrap label, .sev-{level} label {{'
        f'background:{palette["bg"]} !important; '
        f'border-left:3px solid {palette["border"]} !important; '
        f'border-radius:6px !important; '
        f'padding:4px 12px !important;}}\n'
    )
    # .label-text is the inner span where Gradio/Svelte puts the checkbox text.
    text_rule = (
        f'.sev-{level} .label-text, .sev-{level} .wrap .label-text {{'
        f'color:{palette["text"]} !important; '
        f'font-size:0.85em !important; font-weight:500 !important;}}\n'
    )
    return label_rule + text_rule


# Per-severity rules, in SEVERITY_ORDER order.
_SEV_CSS = "".join(
    _severity_css(_level, SEVERITY_COLORS[_level]) for _level in SEVERITY_ORDER
)

# Static page styles; the per-severity rules are appended after them.
_BASE_CSS = """
.gradio-container {
max-width: 1200px !important;
margin: 0 auto !important;
font-family: Arial, Helvetica, sans-serif !important;
}
.gradio-container * { font-family: Arial, Helvetica, sans-serif !important; }
.header-block h1 { margin-bottom: 4px !important; font-weight: 700; }
.section-label {
font-size: 0.78em !important; font-weight: 700 !important;
text-transform: uppercase; letter-spacing: 0.06em;
color: #6b7280 !important; margin-bottom: 6px !important;
}
button.primary { font-weight: 600 !important; }
/* Forza il colore del testo nei box HTML di output (Evidenziato / Anonimizzato / Report).
Gradio sovrascrive il color dei figli con regole scoped — questo lo corregge. */
.output-html, .output-html *:not([style*="color"]) {
color: #111827 !important;
}
"""
_CSS = _BASE_CSS + _SEV_CSS
# ---------------------------------------------------------------------------
# UI
# ---------------------------------------------------------------------------
with gr.Blocks(
    title="Anonimizzatore Gare d'Appalto",
    theme=gr.themes.Soft(
        primary_hue="blue",
        neutral_hue="slate",
        font=["Arial", "Helvetica", "sans-serif"],
    ),
    css=_CSS,
    js=_PAGE_JS,
) as demo:
    # Session state, plus a hidden textbox whose value is supplied by
    # _COLLECT_JS when the custom-anonymization button is clicked.
    state = gr.State(_STATE_DEFAULT.copy())
    _lines_placeholder = gr.Textbox(value="[]", visible=False)

    with gr.Row(elem_classes=["header-block"]):
        gr.Markdown(
            "# 🔒 Anonimizzatore Gare d'Appalto\n"
            "Pipeline a 3 livelli: **Regex** → **NER** (Italian_NER_XXL_v2) → "
            "**GLiNER** (PII_ITA zero-shot), con doppio passaggio regex finale."
        )

    # ----- Settings --------------------------------------------------------
    gr.Markdown("### Impostazioni", elem_classes=["section-label"])
    _mode_names = list(MODES.keys())
    mode_radio = gr.Radio(
        choices=_mode_names,
        value=_mode_names[0],
        label="Modalità di offuscamento",
    )
    with gr.Row(equal_height=True):
        score_slider = gr.Slider(
            minimum=0.0,
            maximum=1.0,
            value=DEFAULT_MIN_SCORE,
            step=0.05,
            label="Score minimo di confidenza",
            scale=3,
        )
        use_regex_chk = gr.Checkbox(
            value=True,
            label="Livello 1: regex appalti",
            scale=1,
        )

    # Severity toggles: one checkbox per level, all enabled by default.
    with gr.Row():
        sev_checks = [
            gr.Checkbox(
                value=True,
                label=f"Gravità {SEVERITY_LABELS[level]}",
                elem_classes=[f"sev-{level}"],
                scale=1,
            )
            for level in SEVERITY_ORDER
        ]

    # ----- Input -----------------------------------------------------------
    gr.Markdown("### Input", elem_classes=["section-label"])
    with gr.Tabs():
        with gr.TabItem("📝 Testo"):
            txt_in = gr.Textbox(
                label=None,
                lines=10,
                placeholder="Incolla il testo del documento di gara…",
                show_label=False,
            )
            with gr.Row():
                txt_btn = gr.Button("Anonimizza", variant="primary", size="lg", scale=2)
                _demo_names = list(_DEMO_TEXTS.keys())
                demo_sel = gr.Dropdown(
                    choices=_demo_names,
                    value=_demo_names[0],
                    label=None,
                    show_label=False,
                    scale=1,
                )
                demo_btn = gr.Button("📋 Carica", variant="secondary", size="lg", scale=1)
        with gr.TabItem("📄 PDF"):
            pdf_in = gr.File(label="Carica un PDF", file_types=[".pdf"])
            _extractor_names = list(_PDF_EXTRACTORS.keys())
            pdf_extractor_radio = gr.Radio(
                choices=_extractor_names,
                value=_extractor_names[0],
                label="Estrattore PDF",
            )
            pdf_btn = gr.Button("Anonimizza", variant="primary", size="lg")

    # ----- Output ----------------------------------------------------------
    def _json_pane():
        """Create one read-only JSON viewer."""
        return gr.Code(language="json", interactive=False, show_label=False)

    gr.Markdown("### Risultato", elem_classes=["section-label"])
    with gr.Tabs():
        with gr.TabItem("🎨 Evidenziato"):
            out_highlighted = gr.HTML(elem_classes=["output-html"])
        with gr.TabItem("🔒 Anonimizzato"):
            update_anon_btn = gr.Button(
                "🔄 Applica anonimizzazione custom",
                variant="secondary",
                size="sm",
            )
            out_anonymized = gr.HTML(elem_classes=["output-html"])
        with gr.TabItem("📊 Report"):
            out_report = gr.HTML(elem_classes=["output-html"])
        with gr.TabItem("🔍 JSON"):
            with gr.Tabs():
                with gr.TabItem("💥 Critica"):
                    out_json_c = _json_pane()
                with gr.TabItem("🔴 Alta"):
                    out_json_h = _json_pane()
                with gr.TabItem("🟡 Media"):
                    out_json_m = _json_pane()
                with gr.TabItem("🔵 Bassa"):
                    out_json_l = _json_pane()

    # ----- Event wiring ----------------------------------------------------
    outputs_all = [
        out_highlighted, out_anonymized, out_report, state,
        out_json_c, out_json_h, out_json_m, out_json_l,
    ]
    inputs_common = [mode_radio, score_slider, use_regex_chk, *sev_checks]

    def _checked_levels(*flags):
        """Map the positional checkbox values onto SEVERITY_ORDER names."""
        return [level for level, on in zip(SEVERITY_ORDER, flags) if on]

    def _with_sev(handler):
        """Adapt *handler* so it receives a severity list instead of four booleans."""
        def _wrapped(inp, mode, min_score, use_regex, c, h, m, l):
            return handler(inp, mode, min_score, use_regex, _checked_levels(c, h, m, l))
        return _wrapped

    def _with_sev_pdf(handler):
        """Same as _with_sev, with the extractor label as a trailing input."""
        def _wrapped(inp, mode, min_score, use_regex, c, h, m, l, extractor_label):
            return handler(
                inp, mode, min_score, use_regex, _checked_levels(c, h, m, l), extractor_label
            )
        return _wrapped

    # Each "Anonimizza" click is followed by a JS-only step that hides the overlay.
    _text_run = txt_btn.click(
        _with_sev(handle_text),
        inputs=[txt_in, *inputs_common],
        outputs=outputs_all,
    )
    _text_run.then(fn=None, js=_HIDE_OVERLAY_JS)

    _pdf_run = pdf_btn.click(
        _with_sev_pdf(handle_pdf),
        inputs=[pdf_in, *inputs_common, pdf_extractor_radio],
        outputs=outputs_all,
    )
    _pdf_run.then(fn=None, js=_HIDE_OVERLAY_JS)

    demo_btn.click(lambda sel: _DEMO_TEXTS.get(sel, DEMO_TEXT), inputs=[demo_sel], outputs=txt_in)

    # Recompute the outputs when the mode or a severity toggle changes after
    # an analysis. Detection is not re-run: the entities saved in the state
    # are reused.
    _settings_inputs = [state, mode_radio, *sev_checks]
    for _trigger in [mode_radio, *sev_checks]:
        _trigger.change(handle_settings_change, inputs=_settings_inputs, outputs=outputs_all)

    update_anon_btn.click(
        fn=handle_update_custom,
        inputs=[_lines_placeholder, state],
        outputs=[out_anonymized],
        js=_COLLECT_JS,
    )
# Start the Gradio server.
# NOTE(review): there is no `if __name__ == "__main__":` guard, so this also
# runs when the module is imported — confirm that is intended.
demo.launch()