Spaces:
Running
Running
Update src/streamlit_app.py
Browse files- src/streamlit_app.py +273 -37
src/streamlit_app.py
CHANGED
|
@@ -15,6 +15,177 @@ import pandas as pd
|
|
| 15 |
import streamlit as st
|
| 16 |
import torch
|
| 17 |
from transformers import AutoTokenizer, AutoModelForTokenClassification
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
|
| 20 |
# =========================
|
|
@@ -129,13 +300,13 @@ def highlight_entities_html(text: str, entities: List[Dict[str, Any]]) -> str:
|
|
| 129 |
|
| 130 |
# RGBA base colors (R,G,B); alpha is scaled by score
|
| 131 |
palette_rgb = {
|
| 132 |
-
"NAME": (
|
| 133 |
-
"LOC": (
|
| 134 |
-
"ORG": (
|
| 135 |
-
"DATE": (
|
| 136 |
-
"ID": (
|
| 137 |
-
"CONTACT": (
|
| 138 |
-
"UNK": (
|
| 139 |
}
|
| 140 |
|
| 141 |
def esc(s: str) -> str:
|
|
@@ -153,28 +324,31 @@ def highlight_entities_html(text: str, entities: List[Dict[str, Any]]) -> str:
|
|
| 153 |
font-size: 13px;
|
| 154 |
line-height: 1.45;
|
| 155 |
|
| 156 |
-
/*
|
| 157 |
-
color: #
|
| 158 |
-
background: #
|
|
|
|
| 159 |
padding: 12px 14px;
|
| 160 |
-
border-radius:
|
|
|
|
| 161 |
}
|
| 162 |
|
| 163 |
.ent {
|
| 164 |
position: relative;
|
| 165 |
-
border-radius:
|
| 166 |
-
padding:
|
| 167 |
margin: 0px 1px;
|
| 168 |
box-decoration-break: clone;
|
| 169 |
-webkit-box-decoration-break: clone;
|
| 170 |
transition: filter 120ms ease;
|
|
|
|
| 171 |
}
|
| 172 |
-
.ent:hover { filter: brightness(1.
|
| 173 |
|
| 174 |
.ent::after {
|
| 175 |
content: "";
|
| 176 |
position: absolute;
|
| 177 |
-
left:
|
| 178 |
height: 2px;
|
| 179 |
border-radius: 2px;
|
| 180 |
background: rgba(var(--rgb), 0.85);
|
|
@@ -183,15 +357,15 @@ def highlight_entities_html(text: str, entities: List[Dict[str, Any]]) -> str:
|
|
| 183 |
.pill {
|
| 184 |
display: none;
|
| 185 |
position: absolute;
|
| 186 |
-
top: -
|
| 187 |
left: 0px;
|
| 188 |
font-size: 10px;
|
| 189 |
line-height: 1;
|
| 190 |
-
padding:
|
| 191 |
border-radius: 999px;
|
| 192 |
background: rgba(var(--rgb), 0.95);
|
| 193 |
-
color: #
|
| 194 |
-
box-shadow: 0
|
| 195 |
white-space: nowrap;
|
| 196 |
z-index: 5;
|
| 197 |
}
|
|
@@ -200,6 +374,7 @@ def highlight_entities_html(text: str, entities: List[Dict[str, Any]]) -> str:
|
|
| 200 |
"""
|
| 201 |
|
| 202 |
|
|
|
|
| 203 |
out = []
|
| 204 |
cursor = 0
|
| 205 |
for e in entities:
|
|
@@ -503,17 +678,57 @@ def deidentify_note(
|
|
| 503 |
# =========================
|
| 504 |
# Streamlit UI
|
| 505 |
# =========================
|
| 506 |
-
st.set_page_config(page_title="
|
| 507 |
-
st.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 508 |
|
| 509 |
with st.sidebar:
|
| 510 |
-
st.header("Model
|
| 511 |
-
|
| 512 |
-
|
| 513 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 514 |
|
| 515 |
st.header("Runtime")
|
| 516 |
-
use_gpu =
|
| 517 |
device_str = "cuda:0" if (use_gpu and torch.cuda.is_available()) else "cpu"
|
| 518 |
|
| 519 |
pipe_max_len = st.selectbox("Max token length", options=[256, 512], index=0)
|
|
@@ -567,8 +782,6 @@ with tab1:
|
|
| 567 |
colA, colB = st.columns([1, 1])
|
| 568 |
with colA:
|
| 569 |
run_single = st.button("Run", type="primary")
|
| 570 |
-
with colB:
|
| 571 |
-
st.caption("CONTACT is extracted via regex (emails/phones). Model CONTACT output is effectively ignored by default.")
|
| 572 |
|
| 573 |
if run_single:
|
| 574 |
with st.spinner("Running de-identification..."):
|
|
@@ -626,7 +839,7 @@ with tab1:
|
|
| 626 |
)
|
| 627 |
|
| 628 |
if show_highlight:
|
| 629 |
-
st.subheader("Highlighted original
|
| 630 |
#st.markdown(highlight_entities_html(text, final_ents), unsafe_allow_html=True)
|
| 631 |
components.html(
|
| 632 |
highlight_entities_html(text, final_ents),
|
|
@@ -694,14 +907,37 @@ with tab2:
|
|
| 694 |
)
|
| 695 |
|
| 696 |
with tab3:
|
| 697 |
-
st.subheader("About
|
| 698 |
st.markdown(
|
| 699 |
-
|
| 700 |
-
|
| 701 |
-
|
| 702 |
-
|
| 703 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 704 |
"""
|
| 705 |
-
|
|
|
|
|
|
|
|
|
|
| 706 |
|
| 707 |
-
st.caption("Tip: set env vars HF_REPO_ID, HF_REVISION, HF_TOKEN for smoother demos.")
|
|
|
|
| 15 |
import streamlit as st
|
| 16 |
import torch
|
| 17 |
from transformers import AutoTokenizer, AutoModelForTokenClassification
|
| 18 |
+
from dotenv import load_dotenv
|
| 19 |
+
load_dotenv()
|
| 20 |
+
|
| 21 |
+
import base64
|
| 22 |
+
|
| 23 |
+
APP_HOME_URL = "https://www.brundagelab.org/research/apps/" # change to your desired destination
|
| 24 |
+
|
| 25 |
+
def _img_to_data_uri(path: str) -> str:
    """Return the image file at *path* encoded as a base64 ``data:`` URI.

    The MIME type is chosen from the file extension (case-insensitive).
    Common web image formats are mapped explicitly; anything else is looked
    up with ``mimetypes`` and, if that does not yield an ``image/*`` type,
    falls back to ``image/svg+xml`` (the historical default of this helper).

    Args:
        path: Filesystem path of the image to embed.

    Returns:
        A string of the form ``data:<mime>;base64,<payload>``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    import mimetypes  # local import: only this helper needs it

    # Explicit table so the result does not depend on the platform's
    # mimetypes database for the formats browsers commonly render.
    known_types = {
        "png": "image/png",
        "jpg": "image/jpeg",
        "jpeg": "image/jpeg",
        "gif": "image/gif",
        "webp": "image/webp",
        "svg": "image/svg+xml",
    }

    ext = os.path.splitext(path)[1].lower().lstrip(".")
    mime = known_types.get(ext)
    if mime is None:
        guessed, _encoding = mimetypes.guess_type(path)
        if guessed is not None and guessed.startswith("image/"):
            mime = guessed
        else:
            # Preserve the previous behavior for unrecognized extensions.
            mime = "image/svg+xml"

    with open(path, "rb") as f:
        b64 = base64.b64encode(f.read()).decode("utf-8")
    return f"data:{mime};base64,{b64}"
|
| 31 |
+
|
| 32 |
+
def brundage_header():
    """Render the page header: a clickable lab logo next to the app title.

    The layout is two Streamlit columns in a 2:5 ratio. The left column
    holds a link to ``APP_HOME_URL`` wrapping the logo (embedded as a data
    URI); the right column holds the styled application title.

    If the logo file cannot be read, the link is rendered with the text
    "Brundage Lab" instead of the image so the rest of the page still loads.
    """
    logo_col, title_col = st.columns([2, 5])

    with logo_col:
        logo_path = "assets/brundage_logo.png"
        try:
            logo_uri = _img_to_data_uri(logo_path)
        except OSError:
            # A missing or unreadable asset should degrade the header,
            # not abort the whole app.
            logo_html = "Brundage Lab"
        else:
            logo_html = (
                f'<img src="{logo_uri}" alt="Brundage Lab" '
                'style="width:256px; height:auto; cursor:pointer;" />'
            )

        st.markdown(
            f"""
            <a href="{APP_HOME_URL}" target="_self" style="display:inline-block;">
              {logo_html}
            </a>
            """,
            unsafe_allow_html=True,
        )

    with title_col:
        st.markdown(
            """
            <div style="padding-top:24px;">
              <div style="font-size:34px; font-weight:850; letter-spacing:-0.02em; color:#111827;">
                SpotRemover: Veterinary De-Identification
              </div>
            </div>
            """,
            unsafe_allow_html=True
        )
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
def inject_brundage_theme():
    """Inject the lab's light theme as a ``<style>`` block into the page.

    The stylesheet covers the app canvas, Streamlit chrome visibility,
    headings, sidebar, buttons, inputs, tabs, dataframes, a reusable
    ``.card`` class, and the ``.note`` / ``.ent`` classes used by the
    highlighted-entity markup. It is emitted through ``st.markdown`` with
    ``unsafe_allow_html=True``.
    """
    theme_css = """
        <style>
        /* ---- App canvas ---- */
        html, body, [class*="css"] {
          font-family: ui-sans-serif, system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, "Apple Color Emoji","Segoe UI Emoji";
        }

        /* Remove Streamlit default top padding a bit */
        .block-container {
          padding-top: 2.2rem;
          padding-bottom: 2.5rem;
          max-width: 1200px;
        }

        /* Hide Streamlit chrome */
        #MainMenu {visibility: hidden;}
        footer {visibility: hidden;}
        header {visibility: hidden;}

        /* Headings: closer to your site (bold, clean) */
        h1, h2, h3, h4 {
          letter-spacing: -0.02em;
          color: #111827;
        }
        h1 { font-weight: 800; }
        h2 { font-weight: 800; }
        h3 { font-weight: 750; }

        /* Sidebar polish */
        section[data-testid="stSidebar"] {
          background: #FFFFFF;
          border-right: 1px solid #EEF2F7;
        }
        section[data-testid="stSidebar"] .block-container {
          padding-top: 1.5rem;
        }

        /* Buttons: purple primary like your site */
        .stButton > button {
          border-radius: 12px;
          padding: 0.55rem 0.95rem;
          border: 1px solid #E5E7EB;
          background: #FFFFFF;
          color: #111827;
          font-weight: 650;
        }
        .stButton > button:hover {
          border-color: #C7B7FF;
          background: #FBFAFF;
        }
        .stButton > button[kind="primary"] {
          background: #6D28D9;
          color: #FFFFFF;
          border: 1px solid #6D28D9;
          box-shadow: 0 6px 18px rgba(109, 40, 217, 0.18);
        }
        .stButton > button[kind="primary"]:hover {
          background: #5B21B6;
          border-color: #5B21B6;
        }

        /* Inputs: rounded + subtle border */
        div[data-baseweb="input"] > div,
        div[data-baseweb="textarea"] > div,
        div[data-baseweb="select"] > div {
          border-radius: 12px !important;
          border-color: #E5E7EB !important;
          box-shadow: none !important;
        }
        div[data-baseweb="textarea"] textarea {
          font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace;
          font-size: 13px;
        }

        /* Tabs: reduce Streamlit “blocky” feel */
        button[data-baseweb="tab"] {
          border-radius: 12px 12px 0 0;
          font-weight: 650;
        }

        /* Dataframe / tables: softer container */
        div[data-testid="stDataFrame"] {
          border: 1px solid #EEF2F7;
          border-radius: 14px;
          overflow: hidden;
        }

        /* “Card” helper class you can use via st.markdown */
        .card {
          border: 1px solid #EEF2F7;
          border-radius: 16px;
          padding: 14px 16px;
          background: #FFFFFF;
          box-shadow: 0 10px 24px rgba(17, 24, 39, 0.06);
        }

        /* Highlight rendering container (so text stays readable on light bg) */
        .note {
          white-space: pre-wrap;
          font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, monospace;
          font-size: 13px;
          line-height: 1.45;
          color: #111827;
        }

        /* Entity pill styling (used in highlight) */
        .ent {
          border-radius: 10px;
          padding: 2px 6px;
          border: 1px solid rgba(17, 24, 39, 0.08);
          box-shadow: 0 6px 14px rgba(17, 24, 39, 0.06);
        }
        .ent sup {
          font-size: 10px;
          margin-left: 6px;
          opacity: 0.75;
        }

        /* Make captions less “default streamlit” */
        .stCaption, small {
          color: #6B7280;
        }
        </style>
        """

    # Raw HTML is required here: a <style> element is not expressible in Markdown.
    st.markdown(theme_css, unsafe_allow_html=True)
|
| 189 |
|
| 190 |
|
| 191 |
# =========================
|
|
|
|
| 300 |
|
| 301 |
# RGBA base colors (R,G,B); alpha is scaled by score
|
| 302 |
palette_rgb = {
|
| 303 |
+
"NAME": (124, 58, 237),
|
| 304 |
+
"LOC": (59, 130, 246),
|
| 305 |
+
"ORG": (16, 185, 129),
|
| 306 |
+
"DATE": (244, 63, 94),
|
| 307 |
+
"ID": (234, 179, 8),
|
| 308 |
+
"CONTACT": (14, 165, 233),
|
| 309 |
+
"UNK": (107, 114, 128),
|
| 310 |
}
|
| 311 |
|
| 312 |
def esc(s: str) -> str:
|
|
|
|
| 324 |
font-size: 13px;
|
| 325 |
line-height: 1.45;
|
| 326 |
|
| 327 |
+
/* Light card styling */
|
| 328 |
+
color: #111827;
|
| 329 |
+
background: #FFFFFF;
|
| 330 |
+
border: 1px solid #EEF2F7;
|
| 331 |
padding: 12px 14px;
|
| 332 |
+
border-radius: 14px;
|
| 333 |
+
box-shadow: 0 10px 24px rgba(17, 24, 39, 0.06);
|
| 334 |
}
|
| 335 |
|
| 336 |
.ent {
|
| 337 |
position: relative;
|
| 338 |
+
border-radius: 8px;
|
| 339 |
+
padding: 1px 4px;
|
| 340 |
margin: 0px 1px;
|
| 341 |
box-decoration-break: clone;
|
| 342 |
-webkit-box-decoration-break: clone;
|
| 343 |
transition: filter 120ms ease;
|
| 344 |
+
border: 1px solid rgba(17, 24, 39, 0.08);
|
| 345 |
}
|
| 346 |
+
.ent:hover { filter: brightness(1.03); }
|
| 347 |
|
| 348 |
.ent::after {
|
| 349 |
content: "";
|
| 350 |
position: absolute;
|
| 351 |
+
left: 6px; right: 6px; bottom: -2px;
|
| 352 |
height: 2px;
|
| 353 |
border-radius: 2px;
|
| 354 |
background: rgba(var(--rgb), 0.85);
|
|
|
|
| 357 |
.pill {
|
| 358 |
display: none;
|
| 359 |
position: absolute;
|
| 360 |
+
top: -18px;
|
| 361 |
left: 0px;
|
| 362 |
font-size: 10px;
|
| 363 |
line-height: 1;
|
| 364 |
+
padding: 3px 8px;
|
| 365 |
border-radius: 999px;
|
| 366 |
background: rgba(var(--rgb), 0.95);
|
| 367 |
+
color: #111827;
|
| 368 |
+
box-shadow: 0 6px 16px rgba(17, 24, 39, 0.12);
|
| 369 |
white-space: nowrap;
|
| 370 |
z-index: 5;
|
| 371 |
}
|
|
|
|
| 374 |
"""
|
| 375 |
|
| 376 |
|
| 377 |
+
|
| 378 |
out = []
|
| 379 |
cursor = 0
|
| 380 |
for e in entities:
|
|
|
|
| 678 |
# =========================
|
| 679 |
# Streamlit UI
|
| 680 |
# =========================
|
| 681 |
+
st.set_page_config(page_title="SpotRemover", layout="wide")
|
| 682 |
+
st.markdown(
|
| 683 |
+
"""
|
| 684 |
+
<div style="display:flex; gap:8px; flex-wrap:wrap; margin: 8px 0 16px 0;">
|
| 685 |
+
<span style="border:1px solid #E5E7EB; padding:4px 10px; border-radius:999px; font-weight:600; font-size:13px;">De-ID</span>
|
| 686 |
+
<span style="border:1px solid #E5E7EB; padding:4px 10px; border-radius:999px; font-weight:600; font-size:13px;">NER</span>
|
| 687 |
+
<span style="border:1px solid #E5E7EB; padding:4px 10px; border-radius:999px; font-weight:600; font-size:13px;">Veterinary</span>
|
| 688 |
+
<span style="border:1px solid #E5E7EB; padding:4px 10px; border-radius:999px; font-weight:600; font-size:13px;">One Health</span>
|
| 689 |
+
</div>
|
| 690 |
+
""",
|
| 691 |
+
unsafe_allow_html=True
|
| 692 |
+
)
|
| 693 |
+
|
| 694 |
+
brundage_header()
|
| 695 |
+
st.markdown("<div style='height:14px;'></div>", unsafe_allow_html=True)
|
| 696 |
+
inject_brundage_theme()
|
| 697 |
|
| 698 |
with st.sidebar:
|
| 699 |
+
st.header("Model")
|
| 700 |
+
|
| 701 |
+
# 1) Define your private fine-tuned model repo IDs (store actual values in env or hardcode)
|
| 702 |
+
# Option A (recommended): keep repo IDs in env so you don't commit them
|
| 703 |
+
MODEL_REGISTRY = {
|
| 704 |
+
"VetBERT (fine-tuned)": os.environ.get("HF_REPO_VETBERT", ""),
|
| 705 |
+
"PetBERT (fine-tuned)": os.environ.get("HF_REPO_PETBERT", ""),
|
| 706 |
+
"ClinicalBERT (fine-tuned)": os.environ.get("HF_REPO_CLINICALBERT", ""),
|
| 707 |
+
}
|
| 708 |
+
|
| 709 |
+
# 2) Dropdown selector
|
| 710 |
+
model_label = st.selectbox(
|
| 711 |
+
"Select model",
|
| 712 |
+
options=list(MODEL_REGISTRY.keys()),
|
| 713 |
+
index=0,
|
| 714 |
+
)
|
| 715 |
+
repo_id = MODEL_REGISTRY[model_label]
|
| 716 |
+
|
| 717 |
+
# 3) Optional revision (still OK to keep)
|
| 718 |
+
revision = (os.environ.get("HF_REVISION", "").strip() or None)
|
| 719 |
+
|
| 720 |
+
# 4) Token comes ONLY from environment
|
| 721 |
+
hf_token = (os.environ.get("HF_TOKEN", "").strip() or None)
|
| 722 |
+
|
| 723 |
+
if not repo_id:
|
| 724 |
+
st.error("Model repo_id is not set. Define HF_REPO_VETBERT / HF_REPO_PETBERT / HF_REPO_CLINICALBERT.")
|
| 725 |
+
st.stop()
|
| 726 |
+
if hf_token is None:
|
| 727 |
+
st.error("HF_TOKEN environment variable is not set (required for private models).")
|
| 728 |
+
st.stop()
|
| 729 |
|
| 730 |
st.header("Runtime")
|
| 731 |
+
use_gpu = False
|
| 732 |
device_str = "cuda:0" if (use_gpu and torch.cuda.is_available()) else "cpu"
|
| 733 |
|
| 734 |
pipe_max_len = st.selectbox("Max token length", options=[256, 512], index=0)
|
|
|
|
| 782 |
colA, colB = st.columns([1, 1])
|
| 783 |
with colA:
|
| 784 |
run_single = st.button("Run", type="primary")
|
|
|
|
|
|
|
| 785 |
|
| 786 |
if run_single:
|
| 787 |
with st.spinner("Running de-identification..."):
|
|
|
|
| 839 |
)
|
| 840 |
|
| 841 |
if show_highlight:
|
| 842 |
+
st.subheader("Highlighted original")
|
| 843 |
#st.markdown(highlight_entities_html(text, final_ents), unsafe_allow_html=True)
|
| 844 |
components.html(
|
| 845 |
highlight_entities_html(text, final_ents),
|
|
|
|
| 907 |
)
|
| 908 |
|
| 909 |
with tab3:
|
| 910 |
+
st.subheader("About")
|
| 911 |
st.markdown(
|
| 912 |
+
"""
|
| 913 |
+
### About this tool
|
| 914 |
+
|
| 915 |
+
This interactive demo is part of the **Brundage Lab (brundagelab.org)** research program on **AI methods for veterinary clinical text** and privacy-preserving data sharing for veterinary and One Health applications.
|
| 916 |
+
|
| 917 |
+
**What it does**
|
| 918 |
+
- Performs **veterinary de-identification** on free-text clinical narratives by detecting and redacting identifiers such as **owner/client names**, **addresses/locations**, **dates**, **IDs**, and **contact information**.
|
| 919 |
+
- Uses a **fine-tuned transformer NER model** (selectable backbone such as VetBERT / PetBERT / ClinicalBERT) loaded from a **private Hugging Face repository**.
|
| 920 |
+
- Augments model predictions with **high-precision pattern matching** for structured identifiers (e.g., emails and phone numbers).
|
| 921 |
+
|
| 922 |
+
**How to interpret results**
|
| 923 |
+
- This tool prioritizes **high recall** for sensitive identifiers (reducing false negatives), with thresholds adjustable in the sidebar.
|
| 924 |
+
- The highlighted view is provided for **demonstration and error analysis**; the redacted output is the intended downstream artifact.
|
| 925 |
+
|
| 926 |
+
**Engineering notes**
|
| 927 |
+
- **Model source**: loaded directly from Hugging Face (optionally pinned to a specific revision for reproducibility).
|
| 928 |
+
- **CONTACT**: extracted via regex (emails/phones). If the model also predicts CONTACT, regex is treated as the source of truth on overlaps.
|
| 929 |
+
- **Long notes**: optional windowing reduces truncation artifacts and improves coverage across multi-page notes.
|
| 930 |
+
|
| 931 |
+
**Privacy and intended use**
|
| 932 |
+
- This is a **research and demonstration tool**, not a certified de-identification system.
|
| 933 |
+
- Do **not** paste sensitive/regulated data unless you are running the tool in an approved environment with appropriate controls.
|
| 934 |
+
- For any public deployment, ensure **access control**, **minimal logging**, and a **privacy/security review** consistent with your institution’s policies.
|
| 935 |
+
|
| 936 |
+
**Citation / attribution**
|
| 937 |
+
If you use this tool or its outputs in a manuscript, please cite the Brundage Lab and describe the model backbone, training data composition (real vs. synthetic), and evaluation protocol.
|
| 938 |
"""
|
| 939 |
+
)
|
| 940 |
+
|
| 941 |
+
|
| 942 |
+
st.caption("Tip: Select your model backbone and explore a single document. Modify default thresholds to finetune your performance.")
|
| 943 |
|
|
|