Spaces:

chaeyoona
/

noteguard-agent

Running

github-actions[bot]

Deploy 8244dcbebc2e76b6843857728efe8f835b7426b4 from main

4477b4e about 5 hours ago

3.89 kB

	"""LangSmith evaluation for the NoteGuard agent slice.

	Two evaluators that map straight onto the judging story:
	1. zero_phi_to_model - the hard privacy guarantee (must be 1.0)
	2. faithfulness - LLM-as-judge: is every claim supported by the note?

	Run: python -m eval.run_eval
	Needs: LANGSMITH_API_KEY, GOOGLE_API_KEY, TAVILY_API_KEY (+ LANGSMITH_TRACING=true)

	API note: the LangSmith evaluate surface has shifted across versions. This targets
	langsmith>=0.1 with dict-style evaluators (inputs/outputs). Adjust signatures if
	your installed version differs.
	"""

	from __future__ import annotations

	from dotenv import load_dotenv

	load_dotenv()

	from langchain.chat_models import init_chat_model
	from langchain_core.messages import HumanMessage
	from langsmith import Client

	from agent.graph import build_graph
	from src.deid import NoteGuard

	# Identifiers known ahead of time, keyed by entity label. Passed as `known=`
	# to both build_graph() and NoteGuard() further down in this module.
	KNOWN = {"PERSON": ["Margaret Okafor"], "NHS": ["485 777 3456"]}
	# Evaluation examples: each entry pairs a free-text clinical note with the
	# question put to the agent. The __main__ section uploads them as dataset inputs.
	# NOTE(review): presumably a synthetic note -- confirm no real patient data is committed.
	EXAMPLES = [
	{
	"note": (
	"Ward 4B. Pt Margaret Okafor (NHS 485 777 3456, DOB 22/09/1958, F, 45 Elm Road SW1A 1AA). "
	"GP: Dr James Obi, Riverside Surgery, Lambeth SE1 7PB. "
	"Admitted 12 Jan 2025 via ED with acute exacerbation of COPD. "
	"PMH: COPD (GOLD III), T2DM on metformin, hypertension on amlodipine. NKDA. "
	"O2 sats 88% on air. Managed with nebulised salbutamol, ipratropium, IV hydrocortisone, "
	"doxycycline. CXR: bilateral hyperinflation, no consolidation. WBC 11.2, CRP 78. "
	"Discharged 14 Jan 2025. TTO: carbocisteine 375 mg TDS, prednisolone 30 mg OD 5/7, "
	"doxycycline 100 mg OD 4/7. Metformin and amlodipine continued. "
	"Consultant: Dr Sarah Chen, Respiratory Medicine."
	),
	"question": "Draft an NHS eDischarge summary.",
	},
	]

	# LangSmith client, constructed at import time; used by the __main__ section
	# to create the dataset and run the evaluation.
	client = Client()
	# Judge chat model for faithfulness(); stays None until that evaluator first runs.
	_judge = None


	def _content_str(content) -> str:
	"""Flatten AIMessage content — Gemini returns a list of blocks, not a plain string."""
	if isinstance(content, list):
	return " ".join(b.get("text", "") if isinstance(b, dict) else str(b) for b in content)
	return content or ""


	def target(inputs: dict) -> dict:
	    """Run the agent graph on one example and gather the fields the evaluators read.

	    Args:
	        inputs: Example inputs holding a "note" and a "question" string.

	    Returns:
	        A dict with "clinician_answer" (taken from the final graph state,
	        "" when absent) and "model_facing" (the flattened content of every
	        message in the final state, joined with single spaces).
	    """
	    agent = build_graph(known=KNOWN)
	    # The note and the question travel together as one human message.
	    prompt = inputs["note"] + "\n\n" + inputs["question"]
	    final_state = agent.invoke({"messages": [HumanMessage(content=prompt)]})

	    flattened = [
	        _content_str(getattr(message, "content", ""))
	        for message in final_state["messages"]
	    ]
	    return {
	        "clinician_answer": final_state.get("clinician_answer", ""),
	        "model_facing": " ".join(flattened),
	    }


	def zero_phi_to_model(inputs: dict, outputs: dict) -> dict:
	    """Score 1.0 when NoteGuard reports no residual identifiers in the model-facing text.

	    Args:
	        inputs: Example inputs (not used by this evaluator).
	        outputs: Target outputs; only "model_facing" is read.

	    Returns:
	        An evaluator result dict with key "zero_phi_to_model" and a score of
	        1.0 (nothing reported) or 0.0 (at least one identifier reported).
	    """
	    guard = NoteGuard(known=KNOWN)
	    leaked = guard.residual_identifiers(outputs["model_facing"])
	    if leaked:
	        score = 0.0
	    else:
	        score = 1.0
	    return {"key": "zero_phi_to_model", "score": score}


	def faithfulness(inputs: dict, outputs: dict) -> dict:
	    """LLM-as-judge evaluator: is every clinical claim in the summary supported by the note?

	    The judge model is created on first use and cached in the module-level
	    `_judge`, so later calls reuse the same instance.

	    Args:
	        inputs: Example inputs; "note" is the source clinical note.
	        outputs: Target outputs; "clinician_answer" is the summary being judged.

	    Returns:
	        An evaluator result dict with key "faithfulness" and a score in
	        [0.0, 1.0]. The score is the first whitespace-separated token of the
	        judge's reply, clamped to that range. A reply that is empty, not
	        numeric, or NaN scores 0.0.
	    """
	    import math  # local import keeps this block self-contained

	    global _judge
	    _judge = _judge or init_chat_model("google_genai:gemini-2.5-flash")
	    prompt = (
	        f"NOTE:\n{inputs['note']}\n\nSUMMARY:\n{outputs['clinician_answer']}\n\n"
	        "Is every clinical claim in SUMMARY supported by NOTE? "
	        "Reply with a single number between 0 and 1."
	    )
	    raw = _content_str(_judge.invoke(prompt).content)

	    tokens = raw.split()
	    # Tolerate harmless decoration around the number, e.g. "**0.9**" or "0.9.".
	    first = tokens[0].strip("*_`\"'()[]").rstrip(".,;:") if tokens else ""
	    try:
	        value = float(first)
	    except ValueError:
	        value = math.nan

	    # float("nan") parses successfully, and min(1.0, nan) evaluates to 1.0, so a
	    # NaN reply must be rejected explicitly or it would count as a perfect score.
	    score = 0.0 if math.isnan(value) else max(0.0, min(1.0, value))
	    return {"key": "faithfulness", "score": score}


	if __name__ == "__main__":
	    dataset_name = "noteguard-discharge-eval"

	    # Seed the dataset only when it does not exist yet. Checking explicitly,
	    # rather than swallowing every exception, lets authentication, network and
	    # upload failures surface instead of being mistaken for "already exists".
	    if not client.has_dataset(dataset_name=dataset_name):
	        dataset = client.create_dataset(dataset_name)
	        client.create_examples(
	            dataset_id=dataset.id,
	            inputs=[{"note": e["note"], "question": e["question"]} for e in EXAMPLES],
	        )

	    # Run the agent over every example and score it with both evaluators.
	    client.evaluate(
	        target,
	        data=dataset_name,
	        evaluators=[zero_phi_to_model, faithfulness],
	        experiment_prefix="noteguard-slice",
	    )