BrundageLab committed on
Commit
1adf975
·
verified ·
1 Parent(s): 6f0968f

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +699 -32
src/streamlit_app.py CHANGED
@@ -1,40 +1,707 @@
1
- import altair as alt
2
- import numpy as np
 
 
 
 
 
 
 
 
 
 
 
3
  import pandas as pd
4
  import streamlit as st
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  """
7
- # Welcome to Streamlit!
8
 
9
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
10
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
11
- forums](https://discuss.streamlit.io).
12
 
13
- In the meantime, below is an example of what you can do with just a few lines of code:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  """
 
15
 
16
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
17
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
18
-
19
- indices = np.linspace(0, 1, num_points)
20
- theta = 2 * np.pi * num_turns * indices
21
- radius = indices
22
-
23
- x = radius * np.cos(theta)
24
- y = radius * np.sin(theta)
25
-
26
- df = pd.DataFrame({
27
- "x": x,
28
- "y": y,
29
- "idx": indices,
30
- "rand": np.random.randn(num_points),
31
- })
32
-
33
- st.altair_chart(alt.Chart(df, height=700, width=700)
34
- .mark_point(filled=True)
35
- .encode(
36
- x=alt.X("x", axis=None),
37
- y=alt.Y("y", axis=None),
38
- color=alt.Color("idx", legend=None, scale=alt.Scale()),
39
- size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
40
- ))
 
1
+ # app.py
2
+ # Streamlit "product-like" Vet De-ID demo (PIPELINE-FREE):
3
+ # - Loads model from a Hugging Face repo ID (public or private via HF token)
4
+ # - Runs token-classification via tokenizer+model directly (no HF pipeline kwargs issues)
5
+ # - Single-note + batch (CSV/TXT) processing
6
+ # - Highlighted redaction preview + entity table
7
+ # - Downloads: redacted text, JSON entities, redacted CSV
8
+
9
+ import os
10
+ import re
11
+ import json
12
+ from typing import List, Dict, Any, Optional
13
+ import streamlit.components.v1 as components
14
  import pandas as pd
15
  import streamlit as st
16
+ import torch
17
+ from transformers import AutoTokenizer, AutoModelForTokenClassification
18
+
19
+
20
+ # =========================
21
+ # Core utilities
22
+ # =========================
23
def get_group(ent: Dict[str, Any]) -> str:
    """Return the entity's label, accepting both HF output key styles.

    Pipeline-style dicts use "entity_group"; raw token output uses
    "entity". Falls back to "UNK" when neither key holds a truthy value.
    """
    for key in ("entity_group", "entity"):
        label = ent.get(key)
        if label:
            return label
    return "UNK"
25
+
26
def norm_contact(s: str) -> str:
    """Canonicalize a contact string for comparison.

    Emails are trimmed and lowercased; anything else is treated as a
    phone number and reduced to its digits only.
    """
    cleaned = s.strip().lower()
    return cleaned if "@" in cleaned else re.sub(r"\D", "", cleaned)
31
+
32
def resolve_overlaps(entities: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Greedily drop overlapping spans.

    Candidates are ranked by start offset, then by longer span, then by
    higher score; each is kept only if it does not intersect any span
    already accepted.
    """
    ranked = sorted(
        entities,
        key=lambda e: (e["start"], e["start"] - e["end"], -float(e.get("score", 0.0))),
    )
    accepted: List[Dict[str, Any]] = []
    for cand in ranked:
        # Disjoint from every kept span: ends before it starts, or starts after it ends.
        disjoint = all(
            cand["end"] <= kept["start"] or cand["start"] >= kept["end"]
            for kept in accepted
        )
        if disjoint:
            accepted.append(cand)
    return accepted
48
+
49
def dedup_entities_by_span(ents: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Remove exact duplicates keyed by (label, start, end), keeping the first."""
    unique: List[Dict[str, Any]] = []
    seen_keys = set()
    for ent in ents:
        span_key = (get_group(ent), int(ent["start"]), int(ent["end"]))
        if span_key in seen_keys:
            continue
        seen_keys.add(span_key)
        unique.append(ent)
    return unique
59
+
60
def is_placeholder(word: str) -> bool:
    """Detect form-template residue (e.g. "____", "(___)") masquerading as PII."""
    stripped = word.strip()
    # Entirely underscores / whitespace / dashes / parentheses.
    if re.fullmatch(r"[_\s\-\(\)]+", stripped):
        return True
    # Two or more underscores with fewer than two real characters left over.
    remainder = re.sub(r"[_\s\-\(\)]", "", stripped)
    return stripped.count("_") >= 2 and len(remainder) < 2
67
+
68
def merge_adjacent_entities(entities: List[Dict[str, Any]], text: str) -> List[Dict[str, Any]]:
    """
    Merge same-label spans separated only by safe punctuation/whitespace.
    Prevent merges across newlines / field boundaries.
    """
    if not entities:
        return []

    ordered = sorted(entities, key=lambda x: x["start"])
    result = [dict(ordered[0])]

    for ent in ordered[1:]:
        last = result[-1]
        between = text[last["end"]:ent["start"]]

        # A newline in the gap marks a field boundary — never merge across it.
        # Otherwise require matching labels and a short gap (<= 3 chars) made
        # only of spaces/tabs and , . / - ( ) punctuation.
        mergeable = (
            "\n" not in between
            and "\r" not in between
            and get_group(last) == get_group(ent)
            and (ent["start"] - last["end"]) <= 3
            and re.fullmatch(r"[ \t,./\-()]*", between) is not None
        )

        if mergeable:
            last["end"] = ent["end"]
            last["word"] = text[last["start"]:last["end"]]
            last["score"] = max(float(last.get("score", 0.0)), float(ent.get("score", 0.0)))
        else:
            result.append(dict(ent))

    return result
99
 
100
def find_structured_pii(text: str) -> List[Dict[str, Any]]:
    """Regex pass for deterministic PII: email addresses and US-style phones.

    Returns pipeline-shaped entity dicts (word/entity_group/score/start/end)
    with score 1.0 so downstream overlap resolution and redaction can treat
    them exactly like model output.
    """
    hits: List[Dict[str, Any]] = []
    # Emails. BUGFIX: the TLD class was [A-Z|a-z]{2,}, which put a literal
    # '|' inside the character class and wrongly accepted pipes in the TLD.
    for m in re.finditer(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b", text):
        hits.append({"word": m.group(), "entity_group": "CONTACT", "score": 1.0,
                     "start": m.start(), "end": m.end()})
    # Phones (US-ish): optional parenthesized area code, '-', '.' or space separators.
    for m in re.finditer(r"\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}", text):
        hits.append({"word": m.group(), "entity_group": "CONTACT", "score": 1.0,
                     "start": m.start(), "end": m.end()})
    return hits
109
+
110
def redact_text(text: str, entities: List[Dict[str, Any]], mode: str = "tags") -> str:
    """
    mode="tags": [NAME], [LOC], etc.
    mode="char": ***** preserving length
    """
    # Resolve overlaps first, then splice right-to-left so earlier
    # character offsets remain valid while we substitute.
    spans = sorted(resolve_overlaps(entities), key=lambda x: x["start"], reverse=True)

    result = text
    for ent in spans:
        begin, finish = ent["start"], ent["end"]
        if mode == "tags":
            filler = f"[{get_group(ent)}]"
        else:
            filler = "*" * max(1, finish - begin)
        result = result[:begin] + filler + result[finish:]
    return result
125
+
126
def highlight_entities_html(text: str, entities: List[Dict[str, Any]]) -> str:
    """Render *text* as HTML with each entity span highlighted.

    Spans are overlap-resolved and walked left-to-right; non-entity text is
    HTML-escaped and entity text is wrapped in a colored <span> whose
    background alpha scales with the entity's confidence score. Returns the
    CSS block plus a <div class='note'> container, suitable for
    components.html().
    """
    entities = resolve_overlaps(entities)
    entities = sorted(entities, key=lambda x: x["start"])

    # RGBA base colors (R,G,B); alpha is scaled by score
    palette_rgb = {
        "NAME": (255, 200, 87),
        "LOC": (120, 180, 255),
        "ORG": (140, 220, 160),
        "DATE": (255, 140, 140),
        "ID": (200, 160, 255),
        "CONTACT": (120, 220, 220),
        "UNK": (200, 200, 200),  # fallback for labels not in the palette
    }

    def esc(s: str) -> str:
        # Minimal HTML escaping for text interpolated into the markup.
        return (s.replace("&", "&amp;")
                .replace("<", "&lt;")
                .replace(">", "&gt;")
                .replace('"', "&quot;")
                .replace("'", "&#39;"))

    # Dark-theme note container + per-entity underline and hover label pill.
    css = """
<style>
.note {
  white-space: pre-wrap;
  font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, monospace;
  font-size: 13px;
  line-height: 1.45;

  /* add these */
  color: #e8eaed;
  background: #0e1117;
  padding: 12px 14px;
  border-radius: 10px;
}

.ent {
  position: relative;
  border-radius: 4px;
  padding: 0px 2px;
  margin: 0px 1px;
  box-decoration-break: clone;
  -webkit-box-decoration-break: clone;
  transition: filter 120ms ease;
}
.ent:hover { filter: brightness(1.05); }

.ent::after {
  content: "";
  position: absolute;
  left: 0; right: 0; bottom: -1px;
  height: 2px;
  border-radius: 2px;
  background: rgba(var(--rgb), 0.85);
}

.pill {
  display: none;
  position: absolute;
  top: -14px;
  left: 0px;
  font-size: 10px;
  line-height: 1;
  padding: 2px 6px;
  border-radius: 999px;
  background: rgba(var(--rgb), 0.95);
  color: #111;
  box-shadow: 0 2px 8px rgba(0,0,0,0.25);
  white-space: nowrap;
  z-index: 5;
}
.ent:hover .pill { display: inline-block; }
</style>
"""

    out = []
    cursor = 0
    for e in entities:
        s, t = e["start"], e["end"]
        # Defensive: skip any span that still starts before the cursor.
        if s < cursor:
            continue

        # Plain text between the previous span and this one.
        out.append(esc(text[cursor:s]))

        label = get_group(e)
        r, g, b = palette_rgb.get(label, palette_rgb["UNK"])
        score = float(e.get("score", 0.0))
        # background alpha: 0.10 to 0.32 depending on confidence
        alpha = 0.10 + 0.22 * max(0.0, min(1.0, score))

        span_text = esc(text[s:t])
        title = f"{label} • {score:.2f}"

        out.append(
            f'<span class="ent" title="{esc(title)}" style="--rgb:{r},{g},{b}; background: rgba({r},{g},{b},{alpha});">'
            f'{span_text}'
            f'<span class="pill">{label}</span>'
            f"</span>"
        )
        cursor = t

    # Trailing text after the last entity.
    out.append(esc(text[cursor:]))

    return css + "<div class='note'>" + "".join(out) + "</div>"
232
+
233
+
234
+
235
+ # =========================
236
+ # Model loading from HF (NO PIPELINE)
237
+ # =========================
238
@st.cache_resource
def load_hf_model(
    repo_id: str,
    revision: Optional[str],
    hf_token: Optional[str],
    device_str: str,
):
    """Fetch tokenizer + token-classification model from the Hugging Face Hub.

    Cached by Streamlit across reruns (keyed on the arguments). The model is
    moved to the requested device and switched to eval mode before the
    (tokenizer, model, device) triple is returned.
    """
    target_device = torch.device(device_str)
    tok = AutoTokenizer.from_pretrained(repo_id, revision=revision, token=hf_token)
    mdl = AutoModelForTokenClassification.from_pretrained(repo_id, revision=revision, token=hf_token)
    mdl.to(target_device)
    mdl.eval()
    return tok, mdl, target_device
251
+
252
+
253
+ # =========================
254
+ # NER: model-based inference with offsets (BIO -> spans)
255
+ # =========================
256
def ner_call_model(tokenizer, model, text: str, max_len: int, device: torch.device) -> List[Dict[str, Any]]:
    """Run token classification on *text* and decode BIO tags into char spans.

    Tokenizes with offset mapping, takes the argmax label per token, then
    groups B-/I- runs of the same type into entity dicts
    (word/entity_group/start/end/score). The score is the mean per-token
    confidence over the grouped tokens. Input is truncated at *max_len*
    tokens, so callers should window long texts themselves.
    """
    enc = tokenizer(
        text,
        return_offsets_mapping=True,
        truncation=True,
        max_length=max_len,
        return_tensors="pt",
        padding=False,
    )
    # Offsets map each token back to (start, end) char positions in *text*.
    offsets = enc.pop("offset_mapping")[0].tolist()
    enc = {k: v.to(device) for k, v in enc.items()}

    with torch.inference_mode():
        logits = model(**enc).logits[0]  # (seq_len, num_labels)

    probs = torch.softmax(logits, dim=-1)
    pred_ids = probs.argmax(dim=-1).tolist()
    pred_scores = probs.max(dim=-1).values.tolist()

    id2label = model.config.id2label

    def id_to_label(i: int) -> str:
        # id2label may be keyed by int or str depending on how the config
        # was serialized; try both, defaulting to "O".
        if i in id2label:
            return id2label[i]
        return id2label.get(str(i), "O")

    labels = [id_to_label(i) for i in pred_ids]

    entities: List[Dict[str, Any]] = []
    i = 0
    while i < len(labels):
        lab = labels[i]
        s, e = offsets[i]

        # skip special/empty (special tokens have zero-width offsets)
        if s == e:
            i += 1
            continue
        if lab == "O":
            i += 1
            continue

        # if I- without B-, treat as B- (recover from inconsistent tagging)
        if lab.startswith("I-"):
            lab = "B-" + lab[2:]

        if lab.startswith("B-"):
            typ = lab[2:]
            start = s
            end = e
            scores = [pred_scores[i]]

            # Extend the span over consecutive I-<typ> tokens, skipping
            # zero-width (special) tokens along the way.
            j = i + 1
            while j < len(labels):
                lab2 = labels[j]
                s2, e2 = offsets[j]
                if s2 == e2:
                    j += 1
                    continue
                if lab2 == f"I-{typ}":
                    end = e2
                    scores.append(pred_scores[j])
                    j += 1
                    continue
                break

            entities.append({
                "word": text[start:end],
                "entity_group": typ,
                "start": start,
                "end": end,
                "score": float(sum(scores) / max(1, len(scores))),  # mean token confidence
            })
            i = j
        else:
            i += 1

    return entities
334
+
335
+
336
def run_ner_with_windows_model(
    tokenizer,
    model,
    device: torch.device,
    text: str,
    pipe_max_len: int,
    window_chars: int = 2000,
    overlap_chars: int = 250,
) -> List[Dict[str, Any]]:
    """Run NER over *text* in overlapping character windows.

    Each chunk of *window_chars* characters is passed to ner_call_model;
    consecutive windows overlap by *overlap_chars* so entities straddling a
    boundary are still seen whole by at least one window. Entity offsets are
    shifted back into full-text coordinates. Duplicates produced by the
    overlap are expected to be removed downstream (dedup/resolve).
    """
    ents: List[Dict[str, Any]] = []
    start = 0
    n = len(text)

    while start < n:
        end = min(n, start + window_chars)
        chunk = text[start:end]
        chunk_ents = ner_call_model(tokenizer, model, chunk, max_len=pipe_max_len, device=device)

        for e in chunk_ents:
            e = dict(e)
            # Shift chunk-local offsets into full-text coordinates.
            e["start"] += start
            e["end"] += start
            e["word"] = text[e["start"]:e["end"]]
            ents.append(e)

        if end == n:
            break
        # BUGFIX: was `start = max(0, end - overlap_chars)`, which never
        # advances when overlap_chars >= window_chars (the UI allows overlap
        # up to 1000 with windows as small as 500) — an infinite loop.
        # Guarantee at least one character of forward progress.
        start = max(start + 1, end - overlap_chars)

    return ents
366
def propagate_entities(text: str, entities: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Add additional spans by exact/normalized string matching for selected entity types.
    Returns a new entity list (original + propagated), resolved/deduped.
    """
    # Which labels to propagate and how
    PROPAGATE = {"CONTACT", "ID", "NAME"}  # consider adding DATE if needed
    MIN_ID_LEN = 5    # tune: avoid 2-3 digit labs, doses
    MIN_NAME_LEN = 4  # avoid tiny tokens

    # Build patterns from existing entities.
    # Each pattern is a (label, regex_source, re-flags) triple.
    patterns = []
    for e in entities:
        label = get_group(e)
        if label not in PROPAGATE:
            continue

        val = e["word"].strip()
        if not val:
            continue

        if label == "CONTACT":
            # Exact string match (case-insensitive for emails)
            patterns.append((label, re.escape(val), re.IGNORECASE))

        elif label == "ID":
            # Only propagate "ID-like" tokens
            compact = re.sub(r"\D", "", val)
            if len(compact) < MIN_ID_LEN:
                continue
            # Match the same digit sequence allowing separators
            # e.g. 261808 matches "261808" or "261-808" if present
            digit_pat = r"\D*".join(list(compact))
            patterns.append((label, digit_pat, 0))

        elif label == "NAME":
            # Prefer multi-token names; for single token be conservative
            # You can tune this: in vet notes, patient single-token names are still PII.
            is_multi = bool(re.search(r"\s", val))
            if (not is_multi) and len(val) < MIN_NAME_LEN:
                continue
            # Exact token/phrase match with word boundaries
            pat = r"\b" + re.escape(val) + r"\b"
            patterns.append((label, pat, re.IGNORECASE))

    # Find additional occurrences. Propagated spans get score 1.0 and a
    # "source" marker so they are distinguishable from model output.
    added = []
    for label, pat, flags in patterns:
        for m in re.finditer(pat, text, flags=flags):
            added.append({
                "word": text[m.start():m.end()],
                "entity_group": label,
                "score": 1.0,  # propagated
                "start": m.start(),
                "end": m.end(),
                "source": "propagated",
            })

    # Merge, then dedup exact (label, start, end) repeats, then drop any
    # remaining overlapping spans (resolve_overlaps keeps the best per spot).
    all_ents = list(entities) + added
    all_ents = sorted(all_ents, key=lambda x: x["start"])
    all_ents = dedup_entities_by_span(all_ents)
    all_ents = resolve_overlaps(all_ents)
    return all_ents
429
+
430
def deidentify_note(
    tokenizer,
    model,
    device: torch.device,
    text: str,
    pipe_max_len: int,
    thresh: Dict[str, float],
    global_stoplist: set,
    stop_by_label: Dict[str, set],
    use_windows: bool,
    window_chars: int,
    overlap_chars: int,
) -> List[Dict[str, Any]]:
    """End-to-end entity extraction for one note.

    Pipeline: model NER (optionally windowed) -> merge adjacent same-label
    spans -> add regex CONTACT hits -> filter model spans by per-label
    threshold, placeholder shape, stoplists, and length -> regex wins on any
    CONTACT overlap -> sort/dedup/overlap-resolve. Returns the final entity
    list; redaction is done separately by redact_text().
    """
    def pass_thresh(ent):
        # Per-label confidence threshold, with a "_default" fallback of 0.45.
        g = get_group(ent)
        return float(ent.get("score", 0.0)) >= float(thresh.get(g, thresh.get("_default", 0.45)))

    def stoplisted(ent):
        # Case-insensitive membership in the global list or the label-specific list.
        g = get_group(ent)
        w = ent["word"].strip().lower()
        if w in global_stoplist:
            return True
        return w in stop_by_label.get(g, set())

    # BERT
    if use_windows:
        bert_results = run_ner_with_windows_model(
            tokenizer, model, device, text,
            pipe_max_len=pipe_max_len,
            window_chars=window_chars,
            overlap_chars=overlap_chars,
        )
    else:
        bert_results = ner_call_model(tokenizer, model, text, max_len=pipe_max_len, device=device)

    # Merge adjacent same-label entities
    bert_results = merge_adjacent_entities(bert_results, text)

    # Regex CONTACT
    regex_results = find_structured_pii(text)

    final_entities: List[Dict[str, Any]] = []
    final_entities.extend(regex_results)

    for ent in bert_results:
        word = ent["word"].strip()

        if not pass_thresh(ent):
            continue
        if is_placeholder(word):
            continue
        if stoplisted(ent):
            continue
        # Drop 1-char non-digit fragments (tokenizer noise).
        if len(word) < 2 and not word.isdigit():
            continue

        # if overlaps regex CONTACT, skip BERT (regex wins)
        dup = False
        for reg in regex_results:
            if ent["start"] < reg["end"] and ent["end"] > reg["start"]:
                dup = True
                break
        if dup:
            continue

        final_entities.append(ent)

    final_entities = sorted(final_entities, key=lambda x: x["start"])
    final_entities = dedup_entities_by_span(final_entities)
    final_entities = resolve_overlaps(final_entities)
    return final_entities
501
+
502
+
503
+ # =========================
504
+ # Streamlit UI
505
+ # =========================
506
# ---- Page + sidebar configuration (runs top-to-bottom on every rerun) ----
st.set_page_config(page_title="Vet De-ID Demo", layout="wide")
st.title("Veterinary De-identification Demo (HF model + NER + Regex)")

with st.sidebar:
    # Model source: repo id / revision / token default from env vars so the
    # demo can be preconfigured (see the caption at the bottom of the app).
    st.header("Model (Hugging Face)")
    repo_id = st.text_input("HF repo_id", value=os.environ.get("HF_REPO_ID", "YOUR_ORG/YOUR_VET_DEID_MODEL"))
    revision = st.text_input("Revision (optional)", value=os.environ.get("HF_REVISION", "")).strip() or None
    hf_token = st.text_input("HF token (optional for private repos)", value=os.environ.get("HF_TOKEN", ""), type="password").strip() or None

    st.header("Runtime")
    # GPU checkbox defaults to availability; device_str re-checks availability
    # so a stale checked box can never select a missing CUDA device.
    use_gpu = st.checkbox("Use GPU (CUDA)", value=torch.cuda.is_available())
    device_str = "cuda:0" if (use_gpu and torch.cuda.is_available()) else "cpu"

    pipe_max_len = st.selectbox("Max token length", options=[256, 512], index=0)
    use_windows = st.checkbox("Window long notes (recommended)", value=True)
    window_chars = st.slider("Window size (chars)", 500, 6000, 2000, 100)
    overlap_chars = st.slider("Window overlap (chars)", 0, 1000, 250, 25)

    # Per-label confidence thresholds consumed via the THRESH dict below.
    st.header("Thresholds")
    t_name = st.slider("NAME", 0.0, 1.0, 0.60, 0.01)
    t_org = st.slider("ORG", 0.0, 1.0, 0.60, 0.01)
    t_loc = st.slider("LOC", 0.0, 1.0, 0.60, 0.01)
    t_date = st.slider("DATE", 0.0, 1.0, 0.45, 0.01)
    t_id = st.slider("ID", 0.0, 1.0, 0.50, 0.01)
    t_contact = st.slider("CONTACT (model)", 0.0, 1.0, 0.99, 0.01)  # regex-first anyway
    t_default = st.slider("Default", 0.0, 1.0, 0.45, 0.01)

    redact_mode = st.selectbox("Redaction mode", options=["tags", "char"], index=0)
    show_highlight = st.checkbox("Show highlighted original", value=True)

# Load model/tokenizer (cached by @st.cache_resource); abort the app run
# with a visible error rather than crashing on a bad repo id/token.
try:
    tokenizer, model, device = load_hf_model(repo_id=repo_id, revision=revision, hf_token=hf_token, device_str=device_str)
except Exception as e:
    st.error(f"Failed to load model/tokenizer from HF.\n\nrepo_id={repo_id}\nrevision={revision}\n\n{e}")
    st.stop()

# Stoplists (can be made editable later)
GLOBAL_STOPLIST = {"er", "ve", "w", "dvm", "mph", "sex", "male", "female", "kg", "lb", "patient", "owner", "left", "right"}
STOP_BY_LABEL = {
    "LOC": {"dsh", "feline", "canine", "equine", "bovine", "species", "breed", "color"},
    "NAME": {"owner", "patient"},
}

# Per-label score thresholds; "_default" is the fallback for unlisted labels.
THRESH = {
    "NAME": t_name,
    "ORG": t_org,
    "LOC": t_loc,
    "DATE": t_date,
    "ID": t_id,
    "CONTACT": t_contact,
    "_default": t_default,
}

tab1, tab2, tab3 = st.tabs(["Single note", "Batch (CSV/TXT)", "About"])
561
+
562
with tab1:
    # ---- Single-note workflow: paste text, run, inspect/download results ----
    st.subheader("Single note")
    default_text = "Paste a veterinary note here..."
    text = st.text_area("Input", height=260, value=default_text)

    colA, colB = st.columns([1, 1])
    with colA:
        run_single = st.button("Run", type="primary")
    with colB:
        st.caption("CONTACT is extracted via regex (emails/phones). Model CONTACT output is effectively ignored by default.")

    # BUGFIX: this checkbox used to be created *inside* `if run_single:`.
    # st.button returns True only on the rerun triggered by the click, so
    # toggling the checkbox caused a rerun in which run_single was False and
    # the results (and the checkbox itself) disappeared. Widgets must be
    # created unconditionally so their state survives reruns.
    enable_propagation = st.checkbox("Propagate exact matches (recommended)", value=True)

    if run_single:
        with st.spinner("Running de-identification..."):
            final_ents = deidentify_note(
                tokenizer=tokenizer,
                model=model,
                device=device,
                text=text,
                pipe_max_len=pipe_max_len,
                thresh=THRESH,
                global_stoplist=GLOBAL_STOPLIST,
                stop_by_label=STOP_BY_LABEL,
                use_windows=use_windows,
                window_chars=window_chars,
                overlap_chars=overlap_chars,
            )
        # Optionally re-tag other exact occurrences of found NAME/ID/CONTACT strings.
        if enable_propagation:
            final_ents = propagate_entities(text, final_ents)

        redacted = redact_text(text, final_ents, mode=redact_mode)

        left, right = st.columns([1, 1])

        with left:
            st.subheader("Entities")
            if final_ents:
                df = pd.DataFrame([{
                    "type": get_group(e),
                    "text": e["word"],
                    "score": float(e.get("score", 0.0)),
                    "start": int(e["start"]),
                    "end": int(e["end"]),
                } for e in final_ents])
                st.dataframe(df, use_container_width=True)
            else:
                st.write("No entities found.")

            st.download_button(
                "Download entities (JSON)",
                data=json.dumps(final_ents, indent=2).encode("utf-8"),
                file_name="entities.json",
                mime="application/json",
            )

        with right:
            st.subheader("Redacted output")
            st.text_area("Output", height=260, value=redacted)

            st.download_button(
                "Download redacted text",
                data=redacted.encode("utf-8"),
                file_name="redacted.txt",
                mime="text/plain",
            )

        if show_highlight:
            st.subheader("Highlighted original (for demo)")
            # Rendered in an iframe so the custom CSS cannot leak into the app.
            components.html(
                highlight_entities_html(text, final_ents),
                height=600,
                scrolling=True,
            )
636
+
637
with tab2:
    # ---- Batch workflow: CSV of notes in, redacted CSV out ----
    st.subheader("Batch processing")
    st.write("Upload a CSV (one note per row) or a TXT file (single note).")
    uploaded = st.file_uploader("Upload CSV or TXT", type=["csv", "txt"])

    if uploaded is not None:
        if uploaded.name.lower().endswith(".txt"):
            # TXT: preview only; the single-note tab is the intended path.
            raw = uploaded.getvalue().decode("utf-8", errors="replace")
            st.write("Detected TXT input (single note). Use the Single note tab for best UX.")
            st.text_area("Preview", value=raw[:5000], height=200)

        else:
            df_in = pd.read_csv(uploaded)
            st.write(f"Loaded CSV with {len(df_in)} rows and columns: {list(df_in.columns)}")
            text_col = st.selectbox("Text column", options=list(df_in.columns), index=0)
            max_rows = st.slider("Max rows to process (demo)", 1, min(5000, len(df_in)), min(200, len(df_in)), 1)

            if st.button("Run batch de-identification", type="primary"):
                out_rows = []
                progress = st.progress(0)
                for i in range(max_rows):
                    # ROBUSTNESS: use positional .iloc (and fetch the cell
                    # once). The old `df_in.loc[i, text_col]` used the loop
                    # counter as an index *label*, which breaks for any
                    # non-default index and did the lookup twice.
                    cell = df_in[text_col].iloc[i]
                    note = "" if pd.isna(cell) else str(cell)
                    ents = deidentify_note(
                        tokenizer=tokenizer,
                        model=model,
                        device=device,
                        text=note,
                        pipe_max_len=pipe_max_len,
                        thresh=THRESH,
                        global_stoplist=GLOBAL_STOPLIST,
                        stop_by_label=STOP_BY_LABEL,
                        use_windows=use_windows,
                        window_chars=window_chars,
                        overlap_chars=overlap_chars,
                    )
                    redacted = redact_text(note, ents, mode=redact_mode)
                    out_rows.append({
                        "row": i,
                        "redacted": redacted,
                        "entities_json": json.dumps(ents, ensure_ascii=False),
                        "n_entities": len(ents),
                    })
                    # Update the progress bar every 5 rows (and at the end).
                    if (i + 1) % 5 == 0 or (i + 1) == max_rows:
                        progress.progress((i + 1) / max_rows)

                out_df = pd.DataFrame(out_rows)
                st.success(f"Processed {max_rows} rows.")

                st.subheader("Batch results (preview)")
                st.dataframe(out_df.head(50), use_container_width=True)

                csv_bytes = out_df.to_csv(index=False).encode("utf-8")
                st.download_button(
                    "Download redacted CSV",
                    data=csv_bytes,
                    file_name="redacted_output.csv",
                    mime="text/csv",
                )
695
+
696
with tab3:
    # ---- Static documentation tab ----
    st.subheader("About / demo notes")
    st.markdown(
        """
- **Model source**: loaded directly from a Hugging Face `repo_id` (optionally pinned to a `revision`).
- **CONTACT**: extracted via regex (emails/phones). Model CONTACT output is typically redundant; regex wins on overlaps.
- **Long notes**: enable windowing to avoid truncation artifacts.
- **Security**: run locally for PHI. Do not deploy publicly without access control, logging controls, and a privacy review.
        """
    )

st.caption("Tip: set env vars HF_REPO_ID, HF_REVISION, HF_TOKEN for smoother demos.")