BrundageLab committed on
Commit
cc60623
·
verified ·
1 Parent(s): bd550fc

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +273 -37
src/streamlit_app.py CHANGED
@@ -15,6 +15,177 @@ import pandas as pd
15
  import streamlit as st
16
  import torch
17
  from transformers import AutoTokenizer, AutoModelForTokenClassification
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
 
20
  # =========================
@@ -129,13 +300,13 @@ def highlight_entities_html(text: str, entities: List[Dict[str, Any]]) -> str:
129
 
130
  # RGBA base colors (R,G,B); alpha is scaled by score
131
  palette_rgb = {
132
- "NAME": (255, 200, 87),
133
- "LOC": (120, 180, 255),
134
- "ORG": (140, 220, 160),
135
- "DATE": (255, 140, 140),
136
- "ID": (200, 160, 255),
137
- "CONTACT": (120, 220, 220),
138
- "UNK": (200, 200, 200),
139
  }
140
 
141
  def esc(s: str) -> str:
@@ -153,28 +324,31 @@ def highlight_entities_html(text: str, entities: List[Dict[str, Any]]) -> str:
153
  font-size: 13px;
154
  line-height: 1.45;
155
 
156
- /* add these */
157
- color: #e8eaed;
158
- background: #0e1117;
 
159
  padding: 12px 14px;
160
- border-radius: 10px;
 
161
  }
162
 
163
  .ent {
164
  position: relative;
165
- border-radius: 4px;
166
- padding: 0px 2px;
167
  margin: 0px 1px;
168
  box-decoration-break: clone;
169
  -webkit-box-decoration-break: clone;
170
  transition: filter 120ms ease;
 
171
  }
172
- .ent:hover { filter: brightness(1.05); }
173
 
174
  .ent::after {
175
  content: "";
176
  position: absolute;
177
- left: 0; right: 0; bottom: -1px;
178
  height: 2px;
179
  border-radius: 2px;
180
  background: rgba(var(--rgb), 0.85);
@@ -183,15 +357,15 @@ def highlight_entities_html(text: str, entities: List[Dict[str, Any]]) -> str:
183
  .pill {
184
  display: none;
185
  position: absolute;
186
- top: -14px;
187
  left: 0px;
188
  font-size: 10px;
189
  line-height: 1;
190
- padding: 2px 6px;
191
  border-radius: 999px;
192
  background: rgba(var(--rgb), 0.95);
193
- color: #111;
194
- box-shadow: 0 2px 8px rgba(0,0,0,0.25);
195
  white-space: nowrap;
196
  z-index: 5;
197
  }
@@ -200,6 +374,7 @@ def highlight_entities_html(text: str, entities: List[Dict[str, Any]]) -> str:
200
  """
201
 
202
 
 
203
  out = []
204
  cursor = 0
205
  for e in entities:
@@ -503,17 +678,57 @@ def deidentify_note(
503
  # =========================
504
  # Streamlit UI
505
  # =========================
506
- st.set_page_config(page_title="Vet De-ID Demo", layout="wide")
507
- st.title("Veterinary De-identification Demo (HF model + NER + Regex)")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
508
 
509
  with st.sidebar:
510
- st.header("Model (Hugging Face)")
511
- repo_id = st.text_input("HF repo_id", value=os.environ.get("HF_REPO_ID", "YOUR_ORG/YOUR_VET_DEID_MODEL"))
512
- revision = st.text_input("Revision (optional)", value=os.environ.get("HF_REVISION", "")).strip() or None
513
- hf_token = st.text_input("HF token (optional for private repos)", value=os.environ.get("HF_TOKEN", ""), type="password").strip() or None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
514
 
515
  st.header("Runtime")
516
- use_gpu = st.checkbox("Use GPU (CUDA)", value=torch.cuda.is_available())
517
  device_str = "cuda:0" if (use_gpu and torch.cuda.is_available()) else "cpu"
518
 
519
  pipe_max_len = st.selectbox("Max token length", options=[256, 512], index=0)
@@ -567,8 +782,6 @@ with tab1:
567
  colA, colB = st.columns([1, 1])
568
  with colA:
569
  run_single = st.button("Run", type="primary")
570
- with colB:
571
- st.caption("CONTACT is extracted via regex (emails/phones). Model CONTACT output is effectively ignored by default.")
572
 
573
  if run_single:
574
  with st.spinner("Running de-identification..."):
@@ -626,7 +839,7 @@ with tab1:
626
  )
627
 
628
  if show_highlight:
629
- st.subheader("Highlighted original (for demo)")
630
  #st.markdown(highlight_entities_html(text, final_ents), unsafe_allow_html=True)
631
  components.html(
632
  highlight_entities_html(text, final_ents),
@@ -694,14 +907,37 @@ with tab2:
694
  )
695
 
696
  with tab3:
697
- st.subheader("About / demo notes")
698
  st.markdown(
699
- """
700
- - **Model source**: loaded directly from a Hugging Face `repo_id` (optionally pinned to a `revision`).
701
- - **CONTACT**: extracted via regex (emails/phones). Model CONTACT output is typically redundant; regex wins on overlaps.
702
- - **Long notes**: enable windowing to avoid truncation artifacts.
703
- - **Security**: run locally for PHI. Do not deploy publicly without access control, logging controls, and a privacy review.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
704
  """
705
- )
 
 
 
706
 
707
- st.caption("Tip: set env vars HF_REPO_ID, HF_REVISION, HF_TOKEN for smoother demos.")
 
15
  import streamlit as st
16
  import torch
17
  from transformers import AutoTokenizer, AutoModelForTokenClassification
18
+ from dotenv import load_dotenv
19
+ load_dotenv()
20
+
21
+ import base64
22
+
23
+ APP_HOME_URL = "https://www.brundagelab.org/research/apps/" # change to your desired destination
24
+
25
def _img_to_data_uri(path: str) -> str:
    """Read an image file and return it as a base64 ``data:`` URI.

    The MIME type is chosen from the file extension (case-insensitive).
    Common web image formats are mapped explicitly; anything else is looked
    up with ``mimetypes.guess_type`` and, failing that, labelled
    ``application/octet-stream``.

    Args:
        path: Filesystem path of the image to embed.

    Returns:
        A string of the form ``data:<mime>;base64,<payload>``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    import mimetypes  # local import: keeps this block self-contained

    known_types = {
        "png": "image/png",
        "jpg": "image/jpeg",
        "jpeg": "image/jpeg",
        "gif": "image/gif",
        "webp": "image/webp",
        "svg": "image/svg+xml",
        "ico": "image/x-icon",
        "bmp": "image/bmp",
    }

    ext = os.path.splitext(path)[1].lower().lstrip(".")
    mime = known_types.get(ext)
    if mime is None:
        # Previously every unrecognised extension was labelled image/svg+xml,
        # which makes browsers try to parse raster bytes as SVG markup.
        guessed, _encoding = mimetypes.guess_type(path)
        mime = guessed or "application/octet-stream"

    with open(path, "rb") as f:
        b64 = base64.b64encode(f.read()).decode("ascii")
    return f"data:{mime};base64,{b64}"
31
+
32
def brundage_header():
    """Render the page header: a linked lab logo beside the app title.

    Two columns are laid out in a 2:5 ratio. The left column shows the logo
    read from ``assets/brundage_logo.png``, inlined as a data URI and wrapped
    in a link to ``APP_HOME_URL``. The right column shows the app title.

    If the logo file cannot be read, a plain "Brundage Lab" text link is
    rendered in its place so that a missing asset does not stop the page
    from rendering.
    """
    logo_col, title_col = st.columns([2, 5])

    with logo_col:
        logo_path = "assets/brundage_logo.png"
        try:
            logo_uri = _img_to_data_uri(logo_path)
        except OSError:
            # The path is relative to the working directory, so the file may
            # be absent depending on where the app is launched from.
            logo_uri = None

        if logo_uri is None:
            link_body = '<span style="font-weight:700; color:#111827;">Brundage Lab</span>'
        else:
            link_body = (
                f'<img src="{logo_uri}" alt="Brundage Lab" '
                'style="width:256px; height:auto; cursor:pointer;" />'
            )

        st.markdown(
            f"""
            <a href="{APP_HOME_URL}" target="_self" style="display:inline-block;">
            {link_body}
            </a>
            """,
            unsafe_allow_html=True,
        )

    with title_col:
        st.markdown(
            """
            <div style="padding-top:24px;">
            <div style="font-size:34px; font-weight:850; letter-spacing:-0.02em; color:#111827;">
            SpotRemover: Veterinary De-Identification
            </div>
            </div>
            """,
            unsafe_allow_html=True
        )
59
+
60
+
61
+ def inject_brundage_theme():
+ """Inject the app's custom CSS as a single <style> block via st.markdown.
+
+ The stylesheet sets the font stack and container padding, hides the
+ #MainMenu, footer and header elements, and restyles the sidebar, buttons,
+ inputs, tabs, dataframes and captions. It also defines the helper classes
+ .card, .note and .ent. Colours are hard-coded light-theme values.
+
+ NOTE(review): the highlighted view is rendered through components.html
+ elsewhere in this file; confirm whether the page-level .note/.ent rules
+ defined here actually reach that output.
+ """
62
+ st.markdown(
63
+ """
64
+ <style>
65
+ /* ---- App canvas ---- */
66
+ html, body, [class*="css"] {
67
+ font-family: ui-sans-serif, system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, "Apple Color Emoji","Segoe UI Emoji";
68
+ }
69
+
70
+ /* Remove Streamlit default top padding a bit */
71
+ .block-container {
72
+ padding-top: 2.2rem;
73
+ padding-bottom: 2.5rem;
74
+ max-width: 1200px;
75
+ }
76
+
77
+ /* Hide Streamlit chrome */
78
+ #MainMenu {visibility: hidden;}
79
+ footer {visibility: hidden;}
80
+ header {visibility: hidden;}
81
+
82
+ /* Headings: closer to your site (bold, clean) */
83
+ h1, h2, h3, h4 {
84
+ letter-spacing: -0.02em;
85
+ color: #111827;
86
+ }
87
+ h1 { font-weight: 800; }
88
+ h2 { font-weight: 800; }
89
+ h3 { font-weight: 750; }
90
+
91
+ /* Sidebar polish */
92
+ section[data-testid="stSidebar"] {
93
+ background: #FFFFFF;
94
+ border-right: 1px solid #EEF2F7;
95
+ }
96
+ section[data-testid="stSidebar"] .block-container {
97
+ padding-top: 1.5rem;
98
+ }
99
+
100
+ /* Buttons: purple primary like your site */
101
+ .stButton > button {
102
+ border-radius: 12px;
103
+ padding: 0.55rem 0.95rem;
104
+ border: 1px solid #E5E7EB;
105
+ background: #FFFFFF;
106
+ color: #111827;
107
+ font-weight: 650;
108
+ }
109
+ .stButton > button:hover {
110
+ border-color: #C7B7FF;
111
+ background: #FBFAFF;
112
+ }
113
+ .stButton > button[kind="primary"] {
114
+ background: #6D28D9;
115
+ color: #FFFFFF;
116
+ border: 1px solid #6D28D9;
117
+ box-shadow: 0 6px 18px rgba(109, 40, 217, 0.18);
118
+ }
119
+ .stButton > button[kind="primary"]:hover {
120
+ background: #5B21B6;
121
+ border-color: #5B21B6;
122
+ }
123
+
124
+ /* Inputs: rounded + subtle border */
125
+ div[data-baseweb="input"] > div,
126
+ div[data-baseweb="textarea"] > div,
127
+ div[data-baseweb="select"] > div {
128
+ border-radius: 12px !important;
129
+ border-color: #E5E7EB !important;
130
+ box-shadow: none !important;
131
+ }
132
+ div[data-baseweb="textarea"] textarea {
133
+ font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace;
134
+ font-size: 13px;
135
+ }
136
+
137
+ /* Tabs: reduce Streamlit “blocky” feel */
138
+ button[data-baseweb="tab"] {
139
+ border-radius: 12px 12px 0 0;
140
+ font-weight: 650;
141
+ }
142
+
143
+ /* Dataframe / tables: softer container */
144
+ div[data-testid="stDataFrame"] {
145
+ border: 1px solid #EEF2F7;
146
+ border-radius: 14px;
147
+ overflow: hidden;
148
+ }
149
+
150
+ /* “Card” helper class you can use via st.markdown */
151
+ .card {
152
+ border: 1px solid #EEF2F7;
153
+ border-radius: 16px;
154
+ padding: 14px 16px;
155
+ background: #FFFFFF;
156
+ box-shadow: 0 10px 24px rgba(17, 24, 39, 0.06);
157
+ }
158
+
159
+ /* Highlight rendering container (so text stays readable on light bg) */
160
+ .note {
161
+ white-space: pre-wrap;
162
+ font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, monospace;
163
+ font-size: 13px;
164
+ line-height: 1.45;
165
+ color: #111827;
166
+ }
167
+
168
+ /* Entity pill styling (used in highlight) */
169
+ .ent {
170
+ border-radius: 10px;
171
+ padding: 2px 6px;
172
+ border: 1px solid rgba(17, 24, 39, 0.08);
173
+ box-shadow: 0 6px 14px rgba(17, 24, 39, 0.06);
174
+ }
175
+ .ent sup {
176
+ font-size: 10px;
177
+ margin-left: 6px;
178
+ opacity: 0.75;
179
+ }
180
+
181
+ /* Make captions less “default streamlit” */
182
+ .stCaption, small {
183
+ color: #6B7280;
184
+ }
185
+ </style>
186
+ """,
187
+ unsafe_allow_html=True,
188
+ )
189
 
190
 
191
  # =========================
 
300
 
301
  # RGBA base colors (R,G,B); alpha is scaled by score
302
  palette_rgb = {
303
+ "NAME": (124, 58, 237),
304
+ "LOC": (59, 130, 246),
305
+ "ORG": (16, 185, 129),
306
+ "DATE": (244, 63, 94),
307
+ "ID": (234, 179, 8),
308
+ "CONTACT": (14, 165, 233),
309
+ "UNK": (107, 114, 128),
310
  }
311
 
312
  def esc(s: str) -> str:
 
324
  font-size: 13px;
325
  line-height: 1.45;
326
 
327
+ /* Light card styling */
328
+ color: #111827;
329
+ background: #FFFFFF;
330
+ border: 1px solid #EEF2F7;
331
  padding: 12px 14px;
332
+ border-radius: 14px;
333
+ box-shadow: 0 10px 24px rgba(17, 24, 39, 0.06);
334
  }
335
 
336
  .ent {
337
  position: relative;
338
+ border-radius: 8px;
339
+ padding: 1px 4px;
340
  margin: 0px 1px;
341
  box-decoration-break: clone;
342
  -webkit-box-decoration-break: clone;
343
  transition: filter 120ms ease;
344
+ border: 1px solid rgba(17, 24, 39, 0.08);
345
  }
346
+ .ent:hover { filter: brightness(1.03); }
347
 
348
  .ent::after {
349
  content: "";
350
  position: absolute;
351
+ left: 6px; right: 6px; bottom: -2px;
352
  height: 2px;
353
  border-radius: 2px;
354
  background: rgba(var(--rgb), 0.85);
 
357
  .pill {
358
  display: none;
359
  position: absolute;
360
+ top: -18px;
361
  left: 0px;
362
  font-size: 10px;
363
  line-height: 1;
364
+ padding: 3px 8px;
365
  border-radius: 999px;
366
  background: rgba(var(--rgb), 0.95);
367
+ color: #111827;
368
+ box-shadow: 0 6px 16px rgba(17, 24, 39, 0.12);
369
  white-space: nowrap;
370
  z-index: 5;
371
  }
 
374
  """
375
 
376
 
377
+
378
  out = []
379
  cursor = 0
380
  for e in entities:
 
678
  # =========================
679
  # Streamlit UI
680
  # =========================
681
+ st.set_page_config(page_title="SpotRemover", layout="wide")
682
+ st.markdown(
683
+ """
684
+ <div style="display:flex; gap:8px; flex-wrap:wrap; margin: 8px 0 16px 0;">
685
+ <span style="border:1px solid #E5E7EB; padding:4px 10px; border-radius:999px; font-weight:600; font-size:13px;">De-ID</span>
686
+ <span style="border:1px solid #E5E7EB; padding:4px 10px; border-radius:999px; font-weight:600; font-size:13px;">NER</span>
687
+ <span style="border:1px solid #E5E7EB; padding:4px 10px; border-radius:999px; font-weight:600; font-size:13px;">Veterinary</span>
688
+ <span style="border:1px solid #E5E7EB; padding:4px 10px; border-radius:999px; font-weight:600; font-size:13px;">One Health</span>
689
+ </div>
690
+ """,
691
+ unsafe_allow_html=True
692
+ )
693
+
694
+ brundage_header()
695
+ st.markdown("<div style='height:14px;'></div>", unsafe_allow_html=True)
696
+ inject_brundage_theme()
697
 
698
  with st.sidebar:
699
+ st.header("Model")
700
+
701
+ # 1) Define your private fine-tuned model repo IDs (store actual values in env or hardcode)
702
+ # Option A (recommended): keep repo IDs in env so you don't commit them
703
+ MODEL_REGISTRY = {
704
+ "VetBERT (fine-tuned)": os.environ.get("HF_REPO_VETBERT", ""),
705
+ "PetBERT (fine-tuned)": os.environ.get("HF_REPO_PETBERT", ""),
706
+ "ClinicalBERT (fine-tuned)": os.environ.get("HF_REPO_CLINICALBERT", ""),
707
+ }
708
+
709
+ # 2) Dropdown selector
710
+ model_label = st.selectbox(
711
+ "Select model",
712
+ options=list(MODEL_REGISTRY.keys()),
713
+ index=0,
714
+ )
715
+ repo_id = MODEL_REGISTRY[model_label]
716
+
717
+ # 3) Optional revision (still OK to keep)
718
+ revision = (os.environ.get("HF_REVISION", "").strip() or None)
719
+
720
+ # 4) Token comes ONLY from environment
721
+ hf_token = (os.environ.get("HF_TOKEN", "").strip() or None)
722
+
723
+ if not repo_id:
724
+ st.error("Model repo_id is not set. Define HF_REPO_VETBERT / HF_REPO_PETBERT / HF_REPO_CLINICALBERT.")
725
+ st.stop()
726
+ if hf_token is None:
727
+ st.error("HF_TOKEN environment variable is not set (required for private models).")
728
+ st.stop()
729
 
730
  st.header("Runtime")
731
+ use_gpu = False
732
  device_str = "cuda:0" if (use_gpu and torch.cuda.is_available()) else "cpu"
733
 
734
  pipe_max_len = st.selectbox("Max token length", options=[256, 512], index=0)
 
782
  colA, colB = st.columns([1, 1])
783
  with colA:
784
  run_single = st.button("Run", type="primary")
 
 
785
 
786
  if run_single:
787
  with st.spinner("Running de-identification..."):
 
839
  )
840
 
841
  if show_highlight:
842
+ st.subheader("Highlighted original")
843
  #st.markdown(highlight_entities_html(text, final_ents), unsafe_allow_html=True)
844
  components.html(
845
  highlight_entities_html(text, final_ents),
 
907
  )
908
 
909
  with tab3:
910
+ st.subheader("About")
911
  st.markdown(
912
+ """
913
+ ### About this tool
914
+
915
+ This interactive demo is part of the **Brundage Lab (brundagelab.org)** research program on **AI methods for veterinary clinical text** and privacy-preserving data sharing for veterinary and One Health applications.
916
+
917
+ **What it does**
918
+ - Performs **veterinary de-identification** on free-text clinical narratives by detecting and redacting identifiers such as **owner/client names**, **addresses/locations**, **dates**, **IDs**, and **contact information**.
919
+ - Uses a **fine-tuned transformer NER model** (selectable backbone such as VetBERT / PetBERT / ClinicalBERT) loaded from a **private Hugging Face repository**.
920
+ - Augments model predictions with **high-precision pattern matching** for structured identifiers (e.g., emails and phone numbers).
921
+
922
+ **How to interpret results**
923
+ - This tool prioritizes **high recall** for sensitive identifiers (reducing false negatives), with thresholds adjustable in the sidebar.
924
+ - The highlighted view is provided for **demonstration and error analysis**; the redacted output is the intended downstream artifact.
925
+
926
+ **Engineering notes**
927
+ - **Model source**: loaded directly from Hugging Face (optionally pinned to a specific revision for reproducibility).
928
+ - **CONTACT**: extracted via regex (emails/phones). If the model also predicts CONTACT, regex is treated as the source of truth on overlaps.
929
+ - **Long notes**: optional windowing reduces truncation artifacts and improves coverage across multi-page notes.
930
+
931
+ **Privacy and intended use**
932
+ - This is a **research and demonstration tool**, not a certified de-identification system.
933
+ - Do **not** paste sensitive/regulated data unless you are running the tool in an approved environment with appropriate controls.
934
+ - For any public deployment, ensure **access control**, **minimal logging**, and a **privacy/security review** consistent with your institution’s policies.
935
+
936
+ **Citation / attribution**
937
+ If you use this tool or its outputs in a manuscript, please cite the Brundage Lab and describe the model backbone, training data composition (real vs. synthetic), and evaluation protocol.
938
  """
939
+ )
940
+
941
+
942
+ st.caption("Tip: Select your model backbone and explore a single document. Modify default thresholds to finetune your performance.")
943