Spaces:

BrundageLab
/

SpotRemover

Running

App Files Files Community

BrundageLab committed on Jan 2

Commit

cc60623

verified ·

1 Parent(s): bd550fc

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +273 -37

src/streamlit_app.py CHANGED Viewed

@@ -15,6 +15,177 @@ import pandas as pd
 import streamlit as st
 import torch
 from transformers import AutoTokenizer, AutoModelForTokenClassification
 # =========================
@@ -129,13 +300,13 @@ def highlight_entities_html(text: str, entities: List[Dict[str, Any]]) -> str:
     # RGBA base colors (R,G,B); alpha is scaled by score
     palette_rgb = {
-        "NAME": (255, 200, 87),
-        "LOC": (120, 180, 255),
-        "ORG": (140, 220, 160),
-        "DATE": (255, 140, 140),
-        "ID": (200, 160, 255),
-        "CONTACT": (120, 220, 220),
-        "UNK": (200, 200, 200),
     }
     def esc(s: str) -> str:
@@ -153,28 +324,31 @@ def highlight_entities_html(text: str, entities: List[Dict[str, Any]]) -> str:
     font-size: 13px;
     line-height: 1.45;
-    /* add these */
-    color: #e8eaed;
-    background: #0e1117;
     padding: 12px 14px;
-    border-radius: 10px;
   }
   .ent {
     position: relative;
-    border-radius: 4px;
-    padding: 0px 2px;
     margin: 0px 1px;
     box-decoration-break: clone;
     -webkit-box-decoration-break: clone;
     transition: filter 120ms ease;
   }
-  .ent:hover { filter: brightness(1.05); }
   .ent::after {
     content: "";
     position: absolute;
-    left: 0; right: 0; bottom: -1px;
     height: 2px;
     border-radius: 2px;
     background: rgba(var(--rgb), 0.85);
@@ -183,15 +357,15 @@ def highlight_entities_html(text: str, entities: List[Dict[str, Any]]) -> str:
   .pill {
     display: none;
     position: absolute;
-    top: -14px;
     left: 0px;
     font-size: 10px;
     line-height: 1;
-    padding: 2px 6px;
     border-radius: 999px;
     background: rgba(var(--rgb), 0.95);
-    color: #111;
-    box-shadow: 0 2px 8px rgba(0,0,0,0.25);
     white-space: nowrap;
     z-index: 5;
   }
@@ -200,6 +374,7 @@ def highlight_entities_html(text: str, entities: List[Dict[str, Any]]) -> str:
 """
     out = []
     cursor = 0
     for e in entities:
@@ -503,17 +678,57 @@ def deidentify_note(
 # =========================
 # Streamlit UI
 # =========================
-st.set_page_config(page_title="Vet De-ID Demo", layout="wide")
-st.title("Veterinary De-identification Demo (HF model + NER + Regex)")
 with st.sidebar:
-    st.header("Model (Hugging Face)")
-    repo_id = st.text_input("HF repo_id", value=os.environ.get("HF_REPO_ID", "YOUR_ORG/YOUR_VET_DEID_MODEL"))
-    revision = st.text_input("Revision (optional)", value=os.environ.get("HF_REVISION", "")).strip() or None
-    hf_token = st.text_input("HF token (optional for private repos)", value=os.environ.get("HF_TOKEN", ""), type="password").strip() or None
     st.header("Runtime")
-    use_gpu = st.checkbox("Use GPU (CUDA)", value=torch.cuda.is_available())
     device_str = "cuda:0" if (use_gpu and torch.cuda.is_available()) else "cpu"
     pipe_max_len = st.selectbox("Max token length", options=[256, 512], index=0)
@@ -567,8 +782,6 @@ with tab1:
     colA, colB = st.columns([1, 1])
     with colA:
         run_single = st.button("Run", type="primary")
-    with colB:
-        st.caption("CONTACT is extracted via regex (emails/phones). Model CONTACT output is effectively ignored by default.")
     if run_single:
         with st.spinner("Running de-identification..."):
@@ -626,7 +839,7 @@ with tab1:
             )
         if show_highlight:
-            st.subheader("Highlighted original (for demo)")
             #st.markdown(highlight_entities_html(text, final_ents), unsafe_allow_html=True)
             components.html(
                             highlight_entities_html(text, final_ents),
@@ -694,14 +907,37 @@ with tab2:
                 )
 with tab3:
-    st.subheader("About / demo notes")
     st.markdown(
-        """
-- **Model source**: loaded directly from a Hugging Face `repo_id` (optionally pinned to a `revision`).
-- **CONTACT**: extracted via regex (emails/phones). Model CONTACT output is typically redundant; regex wins on overlaps.
-- **Long notes**: enable windowing to avoid truncation artifacts.
-- **Security**: run locally for PHI. Do not deploy publicly without access control, logging controls, and a privacy review.
 """
-    )
-st.caption("Tip: set env vars HF_REPO_ID, HF_REVISION, HF_TOKEN for smoother demos.")

 import streamlit as st
 import torch
 from transformers import AutoTokenizer, AutoModelForTokenClassification
+from dotenv import load_dotenv
+load_dotenv()
+import base64
+APP_HOME_URL = "https://www.brundagelab.org/research/apps/"  # change to your desired destination
+def _img_to_data_uri(path: str) -> str:
+    """Return the file at ``path`` encoded as a base64 ``data:`` URI.
+
+    The MIME type is chosen from the file extension (case-insensitive):
+    ``png``, ``jpg``/``jpeg`` and ``svg`` are mapped explicitly; any other
+    extension is looked up with ``mimetypes``; an unrecognised one falls
+    back to ``application/octet-stream`` rather than being mislabelled as SVG.
+
+    Raises:
+        OSError: if ``path`` cannot be opened or read.
+    """
+    import mimetypes  # local import so this block needs no new file-level import
+
+    known_mimes = {
+        "png": "image/png",
+        "jpg": "image/jpeg",
+        "jpeg": "image/jpeg",
+        "svg": "image/svg+xml",
+    }
+    ext = os.path.splitext(path)[1].lower().lstrip(".")
+    mime = known_mimes.get(ext) or mimetypes.guess_type(path)[0] or "application/octet-stream"
+    with open(path, "rb") as f:
+        b64 = base64.b64encode(f.read()).decode("utf-8")
+    return f"data:{mime};base64,{b64}"
+def brundage_header():
+    """Render the page header: linked lab logo on the left, app title on the right.
+
+    The logo is embedded as a data URI inside a link to ``APP_HOME_URL``.
+    If the logo file cannot be read, a plain text label is rendered inside
+    the same link instead, so a missing asset does not stop the app.
+    """
+    col1, col2 = st.columns([2, 5])
+    with col1:
+        # Relative path: resolved against the process working directory.
+        logo_path = "assets/brundage_logo.png"
+        try:
+            logo_uri = _img_to_data_uri(logo_path)
+        except OSError:
+            # Missing/unreadable logo: degrade to text rather than crash.
+            logo_uri = None
+        if logo_uri is not None:
+            logo_html = f'<img src="{logo_uri}" alt="Brundage Lab" style="width:256px; height:auto; cursor:pointer;" />'
+        else:
+            logo_html = '<span style="font-weight:700;">Brundage Lab</span>'
+        st.markdown(
+            f"""
+            <a href="{APP_HOME_URL}" target="_self" style="display:inline-block;">
+              {logo_html}
+            </a>
+            """,
+            unsafe_allow_html=True,
+        )
+    with col2:
+        st.markdown(
+            """
+<div style="padding-top:24px;">
+  <div style="font-size:34px; font-weight:850; letter-spacing:-0.02em; color:#111827;">
+    SpotRemover: Veterinary De-Identification
+  </div>
+</div>
+            """,
+            unsafe_allow_html=True
+        )
+def inject_brundage_theme():
+    """Inject the app-wide CSS theme into the page via ``st.markdown``.
+
+    The stylesheet sets the base font stack, adjusts the main container
+    padding/width, hides the Streamlit menu, footer and header, and styles
+    headings, the sidebar, buttons, inputs, tabs, dataframes and captions.
+    It also defines the ``.card``, ``.note`` and ``.ent`` helper classes.
+    """
+    theme_css = """
+<style>
+/* ---- App canvas ---- */
+html, body, [class*="css"]  {
+  font-family: ui-sans-serif, system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, "Apple Color Emoji","Segoe UI Emoji";
+}
+/* Remove Streamlit default top padding a bit */
+.block-container {
+  padding-top: 2.2rem;
+  padding-bottom: 2.5rem;
+  max-width: 1200px;
+}
+/* Hide Streamlit chrome */
+#MainMenu {visibility: hidden;}
+footer {visibility: hidden;}
+header {visibility: hidden;}
+/* Headings: closer to your site (bold, clean) */
+h1, h2, h3, h4 {
+  letter-spacing: -0.02em;
+  color: #111827;
+}
+h1 { font-weight: 800; }
+h2 { font-weight: 800; }
+h3 { font-weight: 750; }
+/* Sidebar polish */
+section[data-testid="stSidebar"] {
+  background: #FFFFFF;
+  border-right: 1px solid #EEF2F7;
+}
+section[data-testid="stSidebar"] .block-container {
+  padding-top: 1.5rem;
+}
+/* Buttons: purple primary like your site */
+.stButton > button {
+  border-radius: 12px;
+  padding: 0.55rem 0.95rem;
+  border: 1px solid #E5E7EB;
+  background: #FFFFFF;
+  color: #111827;
+  font-weight: 650;
+}
+.stButton > button:hover {
+  border-color: #C7B7FF;
+  background: #FBFAFF;
+}
+.stButton > button[kind="primary"] {
+  background: #6D28D9;
+  color: #FFFFFF;
+  border: 1px solid #6D28D9;
+  box-shadow: 0 6px 18px rgba(109, 40, 217, 0.18);
+}
+.stButton > button[kind="primary"]:hover {
+  background: #5B21B6;
+  border-color: #5B21B6;
+}
+/* Inputs: rounded + subtle border */
+div[data-baseweb="input"] > div,
+div[data-baseweb="textarea"] > div,
+div[data-baseweb="select"] > div {
+  border-radius: 12px !important;
+  border-color: #E5E7EB !important;
+  box-shadow: none !important;
+}
+div[data-baseweb="textarea"] textarea {
+  font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace;
+  font-size: 13px;
+}
+/* Tabs: reduce Streamlit “blocky” feel */
+button[data-baseweb="tab"] {
+  border-radius: 12px 12px 0 0;
+  font-weight: 650;
+}
+/* Dataframe / tables: softer container */
+div[data-testid="stDataFrame"] {
+  border: 1px solid #EEF2F7;
+  border-radius: 14px;
+  overflow: hidden;
+}
+/* “Card” helper class you can use via st.markdown */
+.card {
+  border: 1px solid #EEF2F7;
+  border-radius: 16px;
+  padding: 14px 16px;
+  background: #FFFFFF;
+  box-shadow: 0 10px 24px rgba(17, 24, 39, 0.06);
+}
+/* Highlight rendering container (so text stays readable on light bg) */
+.note {
+  white-space: pre-wrap;
+  font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, monospace;
+  font-size: 13px;
+  line-height: 1.45;
+  color: #111827;
+}
+/* Entity pill styling (used in highlight) */
+.ent {
+  border-radius: 10px;
+  padding: 2px 6px;
+  border: 1px solid rgba(17, 24, 39, 0.08);
+  box-shadow: 0 6px 14px rgba(17, 24, 39, 0.06);
+}
+.ent sup {
+  font-size: 10px;
+  margin-left: 6px;
+  opacity: 0.75;
+}
+/* Make captions less “default streamlit” */
+.stCaption, small {
+  color: #6B7280;
+}
+</style>
+        """
+    # Raw HTML must be explicitly allowed for the <style> tag to be emitted.
+    st.markdown(theme_css, unsafe_allow_html=True)
 # =========================
     # RGBA base colors (R,G,B); alpha is scaled by score
     palette_rgb = {
+        "NAME": (124, 58, 237),
+        "LOC": (59, 130, 246),
+        "ORG": (16, 185, 129),
+        "DATE": (244, 63, 94),
+        "ID": (234, 179, 8),
+        "CONTACT": (14, 165, 233),
+        "UNK": (107, 114, 128),
     }
     def esc(s: str) -> str:
     font-size: 13px;
     line-height: 1.45;
+    /* Light card styling */
+    color: #111827;
+    background: #FFFFFF;
+    border: 1px solid #EEF2F7;
     padding: 12px 14px;
+    border-radius: 14px;
+    box-shadow: 0 10px 24px rgba(17, 24, 39, 0.06);
   }
   .ent {
     position: relative;
+    border-radius: 8px;
+    padding: 1px 4px;
     margin: 0px 1px;
     box-decoration-break: clone;
     -webkit-box-decoration-break: clone;
     transition: filter 120ms ease;
+    border: 1px solid rgba(17, 24, 39, 0.08);
   }
+  .ent:hover { filter: brightness(1.03); }
   .ent::after {
     content: "";
     position: absolute;
+    left: 6px; right: 6px; bottom: -2px;
     height: 2px;
     border-radius: 2px;
     background: rgba(var(--rgb), 0.85);
   .pill {
     display: none;
     position: absolute;
+    top: -18px;
     left: 0px;
     font-size: 10px;
     line-height: 1;
+    padding: 3px 8px;
     border-radius: 999px;
     background: rgba(var(--rgb), 0.95);
+    color: #111827;
+    box-shadow: 0 6px 16px rgba(17, 24, 39, 0.12);
     white-space: nowrap;
     z-index: 5;
   }
 """
     out = []
     cursor = 0
     for e in entities:
 # =========================
 # Streamlit UI
 # =========================
+# Page metadata (browser tab title, wide layout).
+# NOTE(review): Streamlit expects set_page_config before other st.* calls — keep it first.
+st.set_page_config(page_title="SpotRemover", layout="wide")
+# Row of topic "chips" (De-ID / NER / Veterinary / One Health) rendered as raw HTML.
+st.markdown(
+    """
+<div style="display:flex; gap:8px; flex-wrap:wrap; margin: 8px 0 16px 0;">
+  <span style="border:1px solid #E5E7EB; padding:4px 10px; border-radius:999px; font-weight:600; font-size:13px;">De-ID</span>
+  <span style="border:1px solid #E5E7EB; padding:4px 10px; border-radius:999px; font-weight:600; font-size:13px;">NER</span>
+  <span style="border:1px solid #E5E7EB; padding:4px 10px; border-radius:999px; font-weight:600; font-size:13px;">Veterinary</span>
+  <span style="border:1px solid #E5E7EB; padding:4px 10px; border-radius:999px; font-weight:600; font-size:13px;">One Health</span>
+</div>
+    """,
+    unsafe_allow_html=True
+)
+# Logo + title row.
+brundage_header()
+# Fixed 14px vertical spacer below the header.
+st.markdown("<div style='height:14px;'></div>", unsafe_allow_html=True)
+# Inject the global CSS theme.
+inject_brundage_theme()
 with st.sidebar:
+    st.header("Model")
+    # 1) Define your private fine-tuned model repo IDs (store actual values in env or hardcode)
+    # Option A (recommended): keep repo IDs in env so you don't commit them
+    MODEL_REGISTRY = {
+        "VetBERT (fine-tuned)": os.environ.get("HF_REPO_VETBERT", ""),
+        "PetBERT (fine-tuned)": os.environ.get("HF_REPO_PETBERT", ""),
+        "ClinicalBERT (fine-tuned)": os.environ.get("HF_REPO_CLINICALBERT", ""),
+    }
+    # 2) Dropdown selector
+    model_label = st.selectbox(
+        "Select model",
+        options=list(MODEL_REGISTRY.keys()),
+        index=0,
+    )
+    repo_id = MODEL_REGISTRY[model_label]
+    # 3) Optional revision (still OK to keep)
+    revision = (os.environ.get("HF_REVISION", "").strip() or None)
+    # 4) Token comes ONLY from environment
+    hf_token = (os.environ.get("HF_TOKEN", "").strip() or None)
+    if not repo_id:
+        st.error("Model repo_id is not set. Define HF_REPO_VETBERT / HF_REPO_PETBERT / HF_REPO_CLINICALBERT.")
+        st.stop()
+    if hf_token is None:
+        st.error("HF_TOKEN environment variable is not set (required for private models).")
+        st.stop()
     st.header("Runtime")
+    use_gpu = False
     device_str = "cuda:0" if (use_gpu and torch.cuda.is_available()) else "cpu"
     pipe_max_len = st.selectbox("Max token length", options=[256, 512], index=0)
     colA, colB = st.columns([1, 1])
     with colA:
         run_single = st.button("Run", type="primary")
     if run_single:
         with st.spinner("Running de-identification..."):
             )
         if show_highlight:
+            st.subheader("Highlighted original")
             #st.markdown(highlight_entities_html(text, final_ents), unsafe_allow_html=True)
             components.html(
                             highlight_entities_html(text, final_ents),
                 )
 with tab3:
+    # Static "About" tab: a subheader followed by one Markdown document covering
+    # what the tool does, how to read its output, engineering notes, privacy
+    # guidance and attribution.
+    st.subheader("About")
     st.markdown(
+    """
+### About this tool
+This interactive demo is part of the **Brundage Lab (brundagelab.org)** research program on **AI methods for veterinary clinical text** and privacy-preserving data sharing for veterinary and One Health applications.
+**What it does**
+- Performs **veterinary de-identification** on free-text clinical narratives by detecting and redacting identifiers such as **owner/client names**, **addresses/locations**, **dates**, **IDs**, and **contact information**.
+- Uses a **fine-tuned transformer NER model** (selectable backbone such as VetBERT / PetBERT / ClinicalBERT) loaded from a **private Hugging Face repository**.
+- Augments model predictions with **high-precision pattern matching** for structured identifiers (e.g., emails and phone numbers).
+**How to interpret results**
+- This tool prioritizes **high recall** for sensitive identifiers (reducing false negatives), with thresholds adjustable in the sidebar.
+- The highlighted view is provided for **demonstration and error analysis**; the redacted output is the intended downstream artifact.
+**Engineering notes**
+- **Model source**: loaded directly from Hugging Face (optionally pinned to a specific revision for reproducibility).
+- **CONTACT**: extracted via regex (emails/phones). If the model also predicts CONTACT, regex is treated as the source of truth on overlaps.
+- **Long notes**: optional windowing reduces truncation artifacts and improves coverage across multi-page notes.
+**Privacy and intended use**
+- This is a **research and demonstration tool**, not a certified de-identification system.
+- Do **not** paste sensitive/regulated data unless you are running the tool in an approved environment with appropriate controls.
+- For any public deployment, ensure **access control**, **minimal logging**, and a **privacy/security review** consistent with your institution’s policies.
+**Citation / attribution**
+If you use this tool or its outputs in a manuscript, please cite the Brundage Lab and describe the model backbone, training data composition (real vs. synthetic), and evaluation protocol.
 """
+)
+st.caption("Tip: Select your model backbone and explore a single document. Modify default thresholds to finetune your performance.")