"""LangSmith evaluation for the NoteGuard agent slice. Two evaluators that map straight onto the judging story: 1. zero_phi_to_model - the hard privacy guarantee (must be 1.0) 2. faithfulness - LLM-as-judge: is every claim supported by the note? Run: python -m eval.run_eval Needs: LANGSMITH_API_KEY, GOOGLE_API_KEY, TAVILY_API_KEY (+ LANGSMITH_TRACING=true) API note: the LangSmith evaluate surface has shifted across versions. This targets langsmith>=0.1 with dict-style evaluators (inputs/outputs). Adjust signatures if your installed version differs. """ from __future__ import annotations from dotenv import load_dotenv load_dotenv() from langchain.chat_models import init_chat_model from langchain_core.messages import HumanMessage from langsmith import Client from agent.graph import build_graph from src.deid import NoteGuard KNOWN = {"PERSON": ["Margaret Okafor"], "NHS": ["485 777 3456"]} EXAMPLES = [ { "note": ( "Ward 4B. Pt Margaret Okafor (NHS 485 777 3456, DOB 22/09/1958, F, 45 Elm Road SW1A 1AA). " "GP: Dr James Obi, Riverside Surgery, Lambeth SE1 7PB. " "Admitted 12 Jan 2025 via ED with acute exacerbation of COPD. " "PMH: COPD (GOLD III), T2DM on metformin, hypertension on amlodipine. NKDA. " "O2 sats 88% on air. Managed with nebulised salbutamol, ipratropium, IV hydrocortisone, " "doxycycline. CXR: bilateral hyperinflation, no consolidation. WBC 11.2, CRP 78. " "Discharged 14 Jan 2025. TTO: carbocisteine 375 mg TDS, prednisolone 30 mg OD 5/7, " "doxycycline 100 mg OD 4/7. Metformin and amlodipine continued. " "Consultant: Dr Sarah Chen, Respiratory Medicine." ), "question": "Draft an NHS eDischarge summary.", }, ] client = Client() _judge = None def _content_str(content) -> str: """Flatten AIMessage content — Gemini returns a list of blocks, not a plain string.""" if isinstance(content, list): return " ".join(b.get("text", "") if isinstance(b, dict) else str(b) for b in content) return content or "" def target(inputs: dict) -> dict: graph = build_graph(known=KNOWN) state = graph.invoke( {"messages": [HumanMessage(content=inputs["note"] + "\n\n" + inputs["question"])]}, ) model_facing = " ".join(_content_str(getattr(m, "content", "")) for m in state["messages"]) return {"clinician_answer": state.get("clinician_answer", ""), "model_facing": model_facing} def zero_phi_to_model(inputs: dict, outputs: dict) -> dict: hits = NoteGuard(known=KNOWN).residual_identifiers(outputs["model_facing"]) return {"key": "zero_phi_to_model", "score": 1.0 if not hits else 0.0} def faithfulness(inputs: dict, outputs: dict) -> dict: global _judge _judge = _judge or init_chat_model("google_genai:gemini-2.5-flash") prompt = ( f"NOTE:\n{inputs['note']}\n\nSUMMARY:\n{outputs['clinician_answer']}\n\n" "Is every clinical claim in SUMMARY supported by NOTE? " "Reply with a single number between 0 and 1." ) raw = _content_str(_judge.invoke(prompt).content) try: score = max(0.0, min(1.0, float(raw.strip().split()[0]))) except (ValueError, IndexError): score = 0.0 return {"key": "faithfulness", "score": score} if __name__ == "__main__": dataset_name = "noteguard-discharge-eval" try: dataset = client.create_dataset(dataset_name) client.create_examples( dataset_id=dataset.id, inputs=[{"note": e["note"], "question": e["question"]} for e in EXAMPLES], ) except Exception: pass # dataset already exists from a previous run client.evaluate( target, data=dataset_name, evaluators=[zero_phi_to_model, faithfulness], experiment_prefix="noteguard-slice", )