Spaces:
Running
Running
| """Anonimizzatore Gare d'Appalto — UI Gradio.""" | |
| import json | |
| import gradio as gr | |
| from config import ( | |
| MODES, DEFAULT_MIN_SCORE, | |
| SEVERITY_ORDER, SEVERITY_LABELS, SEVERITY_COLORS, | |
| SEVERITY_CRITICAL, SEVERITY_HIGH, SEVERITY_MEDIUM, SEVERITY_LOW, | |
| get_severity, | |
| ) | |
| from core.pipeline import detect, anonymize_from_entities, apply_custom_lines | |
| from ui.renderers import ( | |
| render_highlighted_text, render_anonymized_text, | |
| render_categorized_report, build_json_by_severity, | |
| ) | |
| from utils import extract_pdf_text, extract_pdf_text_got_ocr, extract_pdf_text_smoldocling | |
| from demo_text import DEMO_TEXT, DEMO_REGEX, DEMO_GLINER, DEMO_NER | |
# Demo documents offered in the UI, keyed by the label shown to the user.
_DEMO_TEXTS = {
    "Bando completo": DEMO_TEXT,
    "Layer 0 — Regex": DEMO_REGEX,
    "Layer 1 — NER": DEMO_NER,
    "Layer 2 — GLiNER": DEMO_GLINER,
}
# PDF text-extraction backends, keyed by the label shown to the user.
_PDF_EXTRACTORS = {
    "PyMuPDF (default)": extract_pdf_text,
    "GOT-OCR 2.0 (~30s/pag, CPU)": extract_pdf_text_got_ocr,
    "SmolDocling 256M (CPU)": extract_pdf_text_smoldocling,
}
# Shape of the per-session state dict: original text, obfuscation mode,
# anonymized text and the detected entities.
# NOTE: callers use ``_STATE_DEFAULT.copy()``, which is a shallow copy, so the
# "entities" list object is shared between copies and must not be mutated in place.
_STATE_DEFAULT = {"orig_text": "", "mode": "", "anon_text": "", "entities": []}
# Every severity level, in the order given by SEVERITY_ORDER.
_ALL_SEVERITIES = list(SEVERITY_ORDER)
# JSON text of an empty object, used when there is nothing to report.
_EMPTY_JSON = "{}"
| # --------------------------------------------------------------------------- | |
| # Helpers | |
| # --------------------------------------------------------------------------- | |
| def _active_set(active_severities: list[str]) -> set[str]: | |
| return set(active_severities) if active_severities is not None else set(_ALL_SEVERITIES) | |
| # --------------------------------------------------------------------------- | |
| # Handlers | |
| # --------------------------------------------------------------------------- | |
def _process(text: str, mode: str, min_score: float, use_regex: bool, active_severities: list[str]):
    """Run the full pipeline on *text* and return the 8 Gradio outputs.

    All detected entities (not only the ones in the active severities) are
    stored in the returned state, so that a later change of mode or severity
    can recompute the outputs without re-running the detection step.

    Returns a tuple: highlighted HTML, anonymized HTML, report HTML, the new
    state dict, and the four JSON values produced by ``build_json_by_severity``.
    """
    enabled = _active_set(active_severities)
    detected = detect(text, min_score=min_score, use_regex=use_regex)

    # Only entities whose severity is enabled are actually masked.
    to_mask = []
    for ent in detected:
        if get_severity(ent.entity_type) in enabled:
            to_mask.append(ent)
    masked_text = anonymize_from_entities(text, to_mask, mode)

    session = dict(orig_text=text, mode=mode, anon_text=masked_text, entities=detected)
    json_c, json_h, json_m, json_l = build_json_by_severity(text, detected, enabled)

    highlighted_html = render_highlighted_text(text, detected, enabled)
    anonymized_html = render_anonymized_text(masked_text)
    report_html = render_categorized_report(text, detected, enabled)
    return (
        highlighted_html,
        anonymized_html,
        report_html,
        session,
        json_c, json_h, json_m, json_l,
    )
def handle_settings_change(state_data: dict, mode: str, c: bool, h: bool, m: bool, l: bool):
    """Recompute the outputs after a change of mode or severity toggles.

    Detection is not re-run: the entities stored in *state_data* are reused.
    ``c``, ``h``, ``m`` and ``l`` are the checkbox values, paired positionally
    with ``SEVERITY_ORDER``. When the state holds no text or no entities,
    every output is left unchanged.
    """
    cached_entities = state_data.get("entities") or []
    source_text = state_data.get("orig_text") or ""

    if not (source_text and cached_entities):
        # Nothing analysed yet: emit a no-op update for each visual output.
        noop = [gr.update() for _ in range(7)]
        return (noop[0], noop[1], noop[2], state_data,
                noop[3], noop[4], noop[5], noop[6])

    chosen = []
    for severity, is_on in zip(SEVERITY_ORDER, (c, h, m, l)):
        if is_on:
            chosen.append(severity)
    enabled = _active_set(chosen)

    to_mask = [ent for ent in cached_entities if get_severity(ent.entity_type) in enabled]
    masked_text = anonymize_from_entities(source_text, to_mask, mode)

    # Keep every existing state key, overriding only mode and anonymized text.
    session = dict(state_data, mode=mode, anon_text=masked_text)
    json_c, json_h, json_m, json_l = build_json_by_severity(source_text, cached_entities, enabled)

    highlighted_html = render_highlighted_text(source_text, cached_entities, enabled)
    anonymized_html = render_anonymized_text(masked_text)
    report_html = render_categorized_report(source_text, cached_entities, enabled)
    return (
        highlighted_html,
        anonymized_html,
        report_html,
        session,
        json_c, json_h, json_m, json_l,
    )
def handle_text(text, mode, min_score, use_regex, active_severities):
    """Handle the text tab: run the pipeline on *text*.

    If *text* is empty or only whitespace, the three HTML panes show a
    placeholder message, the state is reset and the JSON panes are emptied.
    """
    if text and text.strip():
        return _process(text, mode, min_score, use_regex, active_severities)

    placeholder = (
        '<div style="padding:40px; text-align:center; color:#6b7280; '
        'font-family:Arial,sans-serif;">Inserisci del testo per iniziare.</div>'
    )
    return (
        placeholder, placeholder, placeholder,
        _STATE_DEFAULT.copy(),
        _EMPTY_JSON, _EMPTY_JSON, _EMPTY_JSON, _EMPTY_JSON,
    )
def handle_pdf(pdf_file, mode, min_score, use_regex, active_severities, extractor_label="PyMuPDF (default)"):
    """Handle the PDF tab: extract text from *pdf_file* and run the pipeline.

    *extractor_label* selects a backend from ``_PDF_EXTRACTORS``; an unknown
    label falls back to ``extract_pdf_text``. When no file is given, or the
    extracted text is blank, a notice is shown in the three HTML panes, the
    state is reset and the JSON panes are emptied.
    """

    def _notice(message):
        # Same centred grey box in all three HTML panes.
        box = (
            f'<div style="padding:40px; text-align:center; color:#6b7280; '
            f'font-family:Arial,sans-serif;">{message}</div>'
        )
        return (
            box, box, box,
            _STATE_DEFAULT.copy(),
            _EMPTY_JSON, _EMPTY_JSON, _EMPTY_JSON, _EMPTY_JSON,
        )

    if pdf_file is None:
        return _notice("Carica un PDF per iniziare.")

    extract = _PDF_EXTRACTORS.get(extractor_label, extract_pdf_text)
    extracted = extract(pdf_file)
    if extracted.strip():
        return _process(extracted, mode, min_score, use_regex, active_severities)
    return _notice("⚠️ Impossibile estrarre testo dal PDF (scansionato?).")
def handle_update_custom(selected_lines_json: str, state_data: dict):
    """Apply the user-selected lines to the anonymized text and render it.

    Args:
        selected_lines_json: JSON array of line identifiers, collected in the
            browser from the checked line checkboxes. Empty, malformed or
            non-array payloads are treated as "no selection".
        state_data: Session state dict holding ``orig_text``, ``anon_text``
            and ``mode``.

    Returns:
        The HTML produced by ``render_anonymized_text``: the stored
        anonymized text when there is no selection or no analysed text,
        otherwise the result of ``apply_custom_lines``.
    """
    # The payload is produced client-side, so it cannot be trusted to be
    # well-formed JSON (JSONDecodeError is a ValueError subclass).
    try:
        lines = json.loads(selected_lines_json or "[]")
    except (TypeError, ValueError):
        lines = []
    # Anything other than a JSON array is not a valid selection.
    if not isinstance(lines, list):
        lines = []

    if not lines or not state_data.get("orig_text"):
        return render_anonymized_text(state_data.get("anon_text", ""))

    new_anon = apply_custom_lines(
        state_data["anon_text"],
        state_data["orig_text"],
        lines,
        state_data["mode"],
    )
    return render_anonymized_text(new_anon)
| # --------------------------------------------------------------------------- | |
| # JavaScript | |
| # --------------------------------------------------------------------------- | |
# Script passed to ``gr.Blocks(js=...)``. It:
#   * appends a fullscreen loading overlay (#anon-fs-overlay) to the page body;
#   * exposes ``window._anonHideOverlay`` to hide it;
#   * shows the overlay when a ``primary`` button whose text is exactly
#     "Anonimizza" is clicked, with an automatic hide after 50 seconds;
#   * toggles #anon-custom-warning depending on whether any ``.anon-line-cb``
#     checkbox is checked.
_PAGE_JS = """
(function () {
  /* ── Overlay fullscreen di loading ─────────────────────────────────── */
  var ov = document.createElement('div');
  ov.id = 'anon-fs-overlay';
  ov.innerHTML = [
    '<div style="display:flex;flex-direction:column;align-items:center;gap:22px;">',
    ' <svg width="52" height="52" viewBox="0 0 52 52" fill="none"',
    ' style="animation:_ovSpin .8s linear infinite;">',
    ' <circle cx="26" cy="26" r="20" stroke="#e5e7eb" stroke-width="4"/>',
    ' <path d="M26 6a20 20 0 0 1 20 20" stroke="#6366f1"',
    ' stroke-width="4" stroke-linecap="round"/>',
    ' </svg>',
    ' <p id="anon-fs-msg" style="margin:0;font-family:Arial,sans-serif;',
    ' font-size:1em;color:#374151;font-weight:500;">Analisi in corso…</p>',
    '</div>',
    '<style>',
    ' #anon-fs-overlay{position:fixed;inset:0;z-index:9999;',
    ' background:rgba(255,255,255,.9);backdrop-filter:blur(6px);',
    ' display:none;align-items:center;justify-content:center;}',
    ' @keyframes _ovSpin{to{transform:rotate(360deg)}}',
    '</style>',
  ].join('');
  document.body.appendChild(ov);
  window._anonHideOverlay = function () {
    clearTimeout(window._anonOverlayTimer);
    ov.style.display = 'none';
  };
  function showOverlay() {
    clearTimeout(window._anonOverlayTimer);
    ov.style.display = 'flex';
    /* Fallback: si nasconde automaticamente dopo 50 secondi */
    window._anonOverlayTimer = setTimeout(window._anonHideOverlay, 50000);
  }
  /* Mostra overlay al click sui pulsanti "Anonimizza" */
  document.addEventListener('click', function (e) {
    var btn = e.target.closest('button');
    if (btn && btn.classList.contains('primary') &&
        (btn.textContent || '').trim() === 'Anonimizza') {
      showOverlay();
    }
  }, true);
  /* ── Checkbox righe custom ──────────────────────────────────────────── */
  document.addEventListener('change', function (e) {
    if (!e.target || !e.target.classList.contains('anon-line-cb')) return;
    var hasChecked = document.querySelectorAll('.anon-line-cb:checked').length > 0;
    var w = document.getElementById('anon-custom-warning');
    if (w) w.style.display = hasChecked ? 'block' : 'none';
  }, true);
})();
"""
# JS run from a chained .then() step: hides the overlay once the Python
# processing has finished (no-op if the page script did not install the hook).
_HIDE_OVERLAY_JS = "() => { if (window._anonHideOverlay) window._anonHideOverlay(); }"
# JS pre-processing step for the "apply custom" button: gathers the
# ``data-line`` attribute of the ``.anon-line-row`` ancestor of every checked
# ``.anon-line-cb`` checkbox and returns it as a JSON string in place of the
# first input, passing the second input through untouched.
_COLLECT_JS = """
(placeholder, state) => {
  var r = [];
  document.querySelectorAll('.anon-line-cb:checked').forEach(function (cb) {
    var row = cb.closest('.anon-line-row');
    if (row) r.push(row.getAttribute('data-line'));
  });
  return [JSON.stringify(r), state];
}
"""
| # --------------------------------------------------------------------------- | |
| # CSS | |
| # --------------------------------------------------------------------------- | |
# Per-severity checkbox styling: two CSS rules per severity level, collected
# in a list and joined once at the end.
_sev_rules = []
for _s in SEVERITY_ORDER:
    _c = SEVERITY_COLORS[_s]
    # Label selectors: background, left border, radius and padding.
    _sev_rules.append(
        f'.sev-{_s} .wrap label, .sev-{_s} label {{'
        f'background:{_c["bg"]} !important; '
        f'border-left:3px solid {_c["border"]} !important; '
        f'border-radius:6px !important; '
        f'padding:4px 12px !important;}}\n'
    )
    # .label-text is the inner span where Gradio/Svelte puts the checkbox text.
    _sev_rules.append(
        f'.sev-{_s} .label-text, .sev-{_s} .wrap .label-text {{'
        f'color:{_c["text"]} !important; '
        f'font-size:0.85em !important; font-weight:500 !important;}}\n'
    )
_SEV_CSS = "".join(_sev_rules)

# Full stylesheet: static page rules followed by the per-severity rules.
_CSS = """
.gradio-container {
max-width: 1200px !important;
margin: 0 auto !important;
font-family: Arial, Helvetica, sans-serif !important;
}
.gradio-container * { font-family: Arial, Helvetica, sans-serif !important; }
.header-block h1 { margin-bottom: 4px !important; font-weight: 700; }
.section-label {
font-size: 0.78em !important; font-weight: 700 !important;
text-transform: uppercase; letter-spacing: 0.06em;
color: #6b7280 !important; margin-bottom: 6px !important;
}
button.primary { font-weight: 600 !important; }
/* Forza il colore del testo nei box HTML di output (Evidenziato / Anonimizzato / Report).
Gradio sovrascrive il color dei figli con regole scoped — questo lo corregge. */
.output-html, .output-html *:not([style*="color"]) {
color: #111827 !important;
}
""" + _SEV_CSS
| # --------------------------------------------------------------------------- | |
| # UI | |
| # --------------------------------------------------------------------------- | |
# Build the whole interface. Component creation order defines the page layout,
# so statements in this block must not be reordered.
with gr.Blocks(
    title="Anonimizzatore Gare d'Appalto",
    theme=gr.themes.Soft(primary_hue="blue", neutral_hue="slate",
                         font=["Arial", "Helvetica", "sans-serif"]),
    css=_CSS,
    js=_PAGE_JS,
) as demo:
    # Per-session state: original text, mode, anonymized text, detected entities.
    state = gr.State(_STATE_DEFAULT.copy())
    # Hidden input whose value is replaced client-side by _COLLECT_JS.
    _lines_placeholder = gr.Textbox(value="[]", visible=False)
    with gr.Row(elem_classes=["header-block"]):
        gr.Markdown(
            "# 🔒 Anonimizzatore Gare d'Appalto\n"
            "Pipeline a 3 livelli: **Regex** → **NER** (Italian_NER_XXL_v2) → "
            "**GLiNER** (PII_ITA zero-shot), con doppio passaggio regex finale."
        )
    gr.Markdown("### Impostazioni", elem_classes=["section-label"])
    # Obfuscation mode; defaults to the first key of MODES.
    mode_radio = gr.Radio(
        choices=list(MODES.keys()),
        value=list(MODES.keys())[0],
        label="Modalità di offuscamento",
    )
    with gr.Row(equal_height=True):
        score_slider = gr.Slider(
            minimum=0.0, maximum=1.0, value=DEFAULT_MIN_SCORE, step=0.05,
            label="Score minimo di confidenza", scale=3,
        )
        use_regex_chk = gr.Checkbox(
            value=True, label="Livello 1: regex appalti", scale=1,
        )
    # Severity toggles: one checkbox per level, all enabled by default.
    # The list order follows SEVERITY_ORDER; handlers rely on that order.
    with gr.Row():
        sev_checks = []
        for sev in SEVERITY_ORDER:
            cb = gr.Checkbox(
                value=True,
                label=f"Gravità {SEVERITY_LABELS[sev]}",
                elem_classes=[f"sev-{sev}"],
                scale=1,
            )
            sev_checks.append(cb)
    gr.Markdown("### Input", elem_classes=["section-label"])
    with gr.Tabs():
        with gr.TabItem("📝 Testo"):
            txt_in = gr.Textbox(
                label=None, lines=10,
                placeholder="Incolla il testo del documento di gara…",
                show_label=False,
            )
            with gr.Row():
                txt_btn = gr.Button("Anonimizza", variant="primary", size="lg", scale=2)
                demo_sel = gr.Dropdown(
                    choices=list(_DEMO_TEXTS.keys()),
                    value=list(_DEMO_TEXTS.keys())[0],
                    label=None, show_label=False,
                    scale=1,
                )
                demo_btn = gr.Button("📋 Carica", variant="secondary", size="lg", scale=1)
        with gr.TabItem("📄 PDF"):
            pdf_in = gr.File(label="Carica un PDF", file_types=[".pdf"])
            pdf_extractor_radio = gr.Radio(
                choices=list(_PDF_EXTRACTORS.keys()),
                value=list(_PDF_EXTRACTORS.keys())[0],
                label="Estrattore PDF",
            )
            pdf_btn = gr.Button("Anonimizza", variant="primary", size="lg")
    gr.Markdown("### Risultato", elem_classes=["section-label"])
    with gr.Tabs():
        with gr.TabItem("🎨 Evidenziato"):
            out_highlighted = gr.HTML(elem_classes=["output-html"])
        with gr.TabItem("🔒 Anonimizzato"):
            update_anon_btn = gr.Button(
                "🔄 Applica anonimizzazione custom",
                variant="secondary",
                size="sm",
            )
            out_anonymized = gr.HTML(elem_classes=["output-html"])
        with gr.TabItem("📊 Report"):
            out_report = gr.HTML(elem_classes=["output-html"])
        with gr.TabItem("🔍 JSON"):
            # One read-only JSON pane per severity level.
            with gr.Tabs():
                with gr.TabItem("💥 Critica"):
                    out_json_c = gr.Code(language="json", interactive=False, show_label=False)
                with gr.TabItem("🔴 Alta"):
                    out_json_h = gr.Code(language="json", interactive=False, show_label=False)
                with gr.TabItem("🟡 Media"):
                    out_json_m = gr.Code(language="json", interactive=False, show_label=False)
                with gr.TabItem("🔵 Bassa"):
                    out_json_l = gr.Code(language="json", interactive=False, show_label=False)
    # Output order must match the 8-tuple returned by the handlers.
    outputs_all = [out_highlighted, out_anonymized, out_report, state,
                   out_json_c, out_json_h, out_json_m, out_json_l]
    inputs_common = [mode_radio, score_slider, use_regex_chk, *sev_checks]

    def _with_sev(handler):
        # Adapter: turns the four checkbox booleans into the list of active
        # severities expected by the handler.
        def _wrapped(inp, mode, min_score, use_regex, c, h, m, l):
            active = [s for s, on in zip(SEVERITY_ORDER, [c, h, m, l]) if on]
            return handler(inp, mode, min_score, use_regex, active)
        return _wrapped

    def _with_sev_pdf(handler):
        # Same adapter as _with_sev, with the extra extractor label forwarded last.
        def _wrapped(inp, mode, min_score, use_regex, c, h, m, l, extractor_label):
            active = [s for s, on in zip(SEVERITY_ORDER, [c, h, m, l]) if on]
            return handler(inp, mode, min_score, use_regex, active, extractor_label)
        return _wrapped

    # Run the analysis, then hide the loading overlay client-side.
    (txt_btn.click(_with_sev(handle_text), inputs=[txt_in, *inputs_common], outputs=outputs_all)
        .then(fn=None, js=_HIDE_OVERLAY_JS))
    (pdf_btn.click(_with_sev_pdf(handle_pdf),
                   inputs=[pdf_in, *inputs_common, pdf_extractor_radio],
                   outputs=outputs_all)
        .then(fn=None, js=_HIDE_OVERLAY_JS))
    # Load the selected demo document into the text box (DEMO_TEXT as fallback).
    demo_btn.click(lambda sel: _DEMO_TEXTS.get(sel, DEMO_TEXT), inputs=[demo_sel], outputs=txt_in)
    # Recompute the outputs when the user changes mode or severity after an analysis.
    # Detection is not re-run: the entities saved in the state are reused.
    _settings_inputs = [state, mode_radio, *sev_checks]
    mode_radio.change(handle_settings_change, inputs=_settings_inputs, outputs=outputs_all)
    for _cb in sev_checks:
        _cb.change(handle_settings_change, inputs=_settings_inputs, outputs=outputs_all)
    # _COLLECT_JS runs in the browser first and supplies the selected lines
    # in place of the hidden placeholder value.
    update_anon_btn.click(
        fn=handle_update_custom,
        inputs=[_lines_placeholder, state],
        outputs=[out_anonymized],
        js=_COLLECT_JS,
    )
# NOTE(review): launched unconditionally at import time (no __main__ guard) —
# confirm this module is only ever executed as the application entry point.
demo.launch()