"""LangGraph agent (Gemini + Tavily) wrapped by NoteGuard de-identification.
Guarantee enforced in-graph: the LLM and the Tavily tool only ever receive
DE-IDENTIFIED text. Real identifiers are restored only in the final,
clinician-facing answer (reidentify_out).
Run locally: langgraph dev (serves the `noteguard` graph for Agent Chat UI)
Trace: set LANGSMITH_TRACING=true + LANGSMITH_API_KEY (runs auto-trace)
Graph flow:
deidentify_in -> agent -> reidentify_out -> compute_trust
Version note: import names track LangGraph v1 / LangChain v0.3+. If your installed
versions differ, adjust the two prebuilt imports and the create_react_agent call.
"""
from __future__ import annotations
import os
from dotenv import load_dotenv
load_dotenv(override=True) # pick up .env before any os.getenv / API-key validation
from langchain.chat_models import init_chat_model
from langchain_core.messages import AIMessage, HumanMessage
from langgraph.graph import END, START, MessagesState, StateGraph
from langgraph.prebuilt import create_react_agent
try:
from langchain_tavily import TavilySearch
except ImportError: # older package name
from langchain_community.tools.tavily_search import TavilySearchResults as TavilySearch
from src.deid import NoteGuard
# System prompt handed to the ReAct agent. It pins the output to a three-element
# discharge-summary card and tells the model how to treat surrogate tokens.
# The punctuation is significant: the prompt names the separator explicitly as the
# middle dot (U+00B7), and the dashes are real em dashes (U+2014). Keep this file
# UTF-8 so neither is mis-decoded. The template keeps a blank line between its
# three elements because the prompt asks the model to reproduce the format exactly.
SYSTEM = """\
You are a clinical documentation assistant for NHS clinicians.
Your ONLY output is a compact discharge-summary card. Reproduce EXACTLY the format below — \
no headings, no bullets, no preamble, no sign-off.
## De-identification rules — NEVER violate
You only ever see DE-IDENTIFIED text. Patient identifiers — names, NHS numbers, dates of birth,
addresses, GP names, consultant names — have been replaced with surrogate tokens such as
[PERSON_1], [NHS_1], [DOB_1], [ADDRESS_1], [DATE_1].
Preserve every surrogate token exactly as given. A re-identification step restores real values
for the clinician after you respond. Never invent, guess, or expand a surrogate into a real value.
Do NOT name the patient anywhere in your output — no title, no real name, no patient surrogate
token. Refer to the patient only as "the patient". (Surrogate tokens for OTHER people, e.g. a GP
or consultant, may appear and will be restored.)
## Output format — three elements, blank line between each
Admitted <admission date> after <reason>. Background: <key conditions/meds>. <what was done>. <key finding>.

Follow-up: <GP action> · <action 2> · <action 3>

Grounded: <source name 1>, <source name 2> · via Tavily
## Rules for each element
**Narrative paragraph:** plain clinical prose, max 4 sentences. Include only facts stated in \
the source note — never invent investigations, doses, dates, or diagnoses. \
Refer to the patient as "the patient" — never write the patient's name or their surrogate token. \
Surrogate tokens for other people ([PERSON_1], etc.) may appear here and will be restored. \
For the admission date, copy the admission/visit date EXACTLY as it appears in the source note \
(a date such as 13/02/26) — character-for-character, same format; the system restores it to the \
correct value. Do NOT use the date of birth, and do NOT write a date token like [DATE_1] or any \
placeholder. If the note states no admission/visit date, omit it and write "Admitted after <reason>". \
Drop a sentence entirely when there is nothing to say (e.g. no imaging — omit that sentence).
**Follow-up line:** items separated by " · " (middle dot U+00B7). \
Always include the GP action as the first item.
**Grounded line:** list only the public guidance sources (short readable names, not URLs) \
actually returned by the Tavily search tool this run. \
If Tavily returned no results, omit the Grounded line entirely — never fabricate citations.
**Search tool:** use only for public NICE/NHS clinical guidance. \
Never send patient text or surrogate tokens to the search tool.
"""
class State(MessagesState):
    """Graph state: the chat messages plus the NoteGuard vault and trust-panel fields.

    The vault mappings are read from state and written back by the
    de-identification node, so they persist across nodes and turns.
    """

    # Vault mappings handed to NoteGuard and read back from it after de-identification.
    # NOTE(review): presumably forward maps real value -> surrogate and reverse the
    # opposite; confirm against src.deid.
    forward: dict
    reverse: dict
    # Final answer text after re-identification; empty string when the last message
    # in state is not an AI message.
    clinician_answer: str
    # --- trust panel fields (all about de-identification correctness) ---
    deid_text: str # de-identified note text (what the AI saw)
    identifiers_removed: int # identifiers replaced in this turn
    # NOTE: compute_trust later adds the number of leaked tokens to this value.
    residual_count: int # known identifiers that survived de-id (pre-model)
    leaked_tokens: list # orphaned surrogate tokens in the output (reversibility)
    residual_pii: list # suspected un-redacted PII the model saw (de-id audit)
def build_graph(known: dict | None = None):
    """Assemble and compile the NoteGuard-wrapped agent graph.

    The compiled graph runs four nodes in a fixed line:
    ``deidentify_in -> agent -> reidentify_out -> compute_trust``.

    Args:
        known: Optional seed of known identifiers. It is passed to the
            ``NoteGuard`` instances built by ``deidentify_in`` and
            ``compute_trust``.

    Returns:
        The compiled ``StateGraph`` over ``State``.
    """
    # Model id is configurable through NOTEGUARD_MODEL; Gemini 2.5 Flash otherwise.
    model_id = os.getenv("NOTEGUARD_MODEL", "google_genai:gemini-2.5-flash")
    llm = init_chat_model(model_id)
    search_tools = [TavilySearch(max_results=3)]
    react = create_react_agent(llm, search_tools, prompt=SYSTEM)

    def deidentify_in(state: State):
        """Replace the newest human message with its de-identified text."""
        # Measured before NoteGuard is built so the delta below counts only
        # identifiers added during this turn.
        vault_size_before = len(state.get("forward") or {})
        guard = NoteGuard(
            known=known,
            forward=state.get("forward"),
            reverse=state.get("reverse"),
        )
        newest = state["messages"][-1]
        # NOTE(review): only the newest message is inspected. The edge to the agent
        # is unconditional, so earlier messages (or a newest message that is not a
        # HumanMessage) reach the model as they stand in state — confirm callers
        # submit exactly one new human message per turn.
        if isinstance(newest, HumanMessage):
            outcome = guard.deidentify(newest.content)
            clean_text = outcome.clean_text
            # Hard guarantee before the model/tool see anything.
            guard.assert_clean(clean_text)
            # Same id as the incoming message — presumably so the messages reducer
            # overwrites the raw message instead of appending; confirm.
            replacement = HumanMessage(content=clean_text, id=newest.id)
            return {
                "messages": [replacement],
                "forward": guard.forward,
                "reverse": guard.reverse,
                "deid_text": clean_text,
                "identifiers_removed": len(guard.forward) - vault_size_before,
                "residual_count": len(outcome.residual),
            }
        # Nothing to clean: just hand the vault back.
        return {"forward": guard.forward, "reverse": guard.reverse}

    def run_agent(state: State):
        """Run the ReAct agent and return only the messages it appended."""
        history = state["messages"]
        result = react.invoke({"messages": history})
        # The agent echoes its input first; slice that prefix off.
        return {"messages": result["messages"][len(history) :]}

    def reidentify_out(state: State):
        """Restore surrogates in the final AI message and flag any left unresolved."""
        guard = NoteGuard(reverse=state.get("reverse"))
        newest = state["messages"][-1]
        if not isinstance(newest, AIMessage):
            return {"clinician_answer": "", "leaked_tokens": []}

        payload = newest.content
        if isinstance(payload, list):
            # Gemini can return content as a list of blocks [{type, text}, ...].
            pieces = []
            for block in payload:
                if isinstance(block, dict):
                    pieces.append(block.get("text", ""))
                else:
                    pieces.append(str(block))
            raw_text = " ".join(pieces).strip()
        else:
            raw_text = payload or ""

        # Restore known surrogates for the clinician (the patient is never named).
        answer = guard.reidentify(raw_text)
        # Anything surrogate-shaped still present is either an unrestored surrogate
        # or a stray template placeholder the model echoed (e.g. [DATE_X]); redact
        # and flag it so it never reaches the clinician verbatim.
        answer, unresolved = guard.redact_unresolved(answer)
        flagged = [f"unresolved_token:{tok}" for tok in unresolved]
        return {"clinician_answer": answer, "leaked_tokens": flagged}

    def compute_trust(state: State):
        """Audit de-identification quality for the trust panel.

        Two independent de-id failures are reported:
        - residual_pii: result of scanning deid_text (the text the model saw) for
          PII that the earlier passes missed.
        - leaked_tokens: surrogate-shaped tokens in the output that could not be
          reversed — the reversibility side of the pseudonymisation guarantee.
        """
        guard = NoteGuard(known=known, reverse=state.get("reverse"))
        seen_by_model = state.get("deid_text") or ""
        if seen_by_model:
            residual_pii = guard.scan_pii(seen_by_model)
        else:
            residual_pii = []
        leaked = list(state.get("leaked_tokens") or [])
        # residual_count is topped up with the leaked-token count here.
        total_residual = state.get("residual_count", 0) + len(leaked)
        return {
            "residual_pii": residual_pii,
            "leaked_tokens": leaked,
            "residual_count": total_residual,
        }

    builder = StateGraph(State)
    nodes = (
        ("deidentify_in", deidentify_in),
        ("agent", run_agent),
        ("reidentify_out", reidentify_out),
        ("compute_trust", compute_trust),
    )
    for node_name, node_fn in nodes:
        builder.add_node(node_name, node_fn)

    # Strictly linear pipeline: wire each stage to the next.
    stages = [START, "deidentify_in", "agent", "reidentify_out", "compute_trust", END]
    for src, dst in zip(stages, stages[1:]):
        builder.add_edge(src, dst)
    return builder.compile()
# Demo vault — seeds the known-identifier set so `langgraph dev` / the web UI
# resolve surrogates consistently on startup.
_DEMO_KNOWN = {"PERSON": ["Margaret Okafor"], "NHS": ["485 777 3456"]}
# Module-level compiled graph, built once when this module is imported.
# NOTE(review): build_graph constructs the chat model and the Tavily tool, so the
# import presumably needs the relevant API keys in the environment — confirm.
graph = build_graph(known=_DEMO_KNOWN)
|