import anthropic
import os
import json
from dotenv import load_dotenv
from Agents.agent import run_agent
from sentence_transformers import SentenceTransformer, util
from rouge_score import rouge_scorer
# Load environment settings before the API key is read on the next line
# (presumably from a local .env file -- confirm against the deployment setup).
load_dotenv()
# Anthropic client used by llm_judge; the key comes from ANTHROPIC_API_KEY.
client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
# ── Semantic similarity model ─────────────────────────────────
# Sentence-embedding model shared by compute_similarity; built once at import.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
# ── Test transcripts ──────────────────────────────────────────
# Fixed doctor/patient dialogues used as evaluation inputs. Each entry is a
# dict with a display "name" and a raw "transcript". run_evaluation passes the
# transcript to the agent and reuses it as the reference text for the LLM
# judge, the ROUGE scores and the semantic-similarity score.
TEST_CASES = [
{
"name": "Chest Pain / Pericarditis",
"transcript": """
Dr: Good morning, how are you feeling today?
Patient: Not great, doctor. I've been having really bad chest pain for the past 3 days. It gets worse when I breathe deeply or lie down.
Dr: Can you describe the pain? Is it sharp or dull?
Patient: It's sharp. Like a stabbing feeling on the left side of my chest.
Dr: Does it radiate anywhere β like your arm, jaw, or back?
Patient: No, it stays in my chest mostly.
Dr: Any shortness of breath, dizziness, or sweating?
Patient: Some shortness of breath yes, especially when I climb stairs. No dizziness or sweating though.
Dr: How about coughing or fever?
Patient: I had a mild fever two days ago, about 99.8. No cough.
Dr: Any recent illness?
Patient: Yes, I had an upper respiratory infection about two weeks ago.
Dr: Are you on any medications currently?
Patient: I take lisinopril 10mg daily and aspirin 81mg daily. I also took ibuprofen 400mg a couple times for the chest pain but it didn't help much.
Dr: Any allergies?
Patient: Penicillin. I get a rash.
Dr: Any history of heart disease?
Patient: My father had a heart attack at 62. I've never had any heart issues myself.
Dr: Blood pressure is 138 over 88. Heart rate 92. Temperature 98.9. O2 sat 96% on room air. I can hear a friction rub on auscultation.
Dr: Based on your symptoms I believe you have pericarditis. I'm stopping the ibuprofen and starting colchicine 0.5mg twice daily and aspirin 650mg three times a day. I'm ordering an EKG, chest X-ray, CBC, CRP, and troponin.
Patient: Should I avoid exercise?
Dr: Yes, avoid strenuous activity. Follow up in one week or sooner if pain worsens or fever exceeds 101.
Patient: Thank you doctor.
"""
},
{
"name": "Diabetic Follow-up",
"transcript": """
Dr: Good afternoon. How have you been since your last visit?
Patient: Honestly not great. I've been feeling really tired and urinating a lot more, especially at night.
Dr: How long has this been going on?
Patient: About three weeks now.
Dr: Are you checking your blood sugars at home?
Patient: Yes, they've been running high. Usually between 250 and 310 in the mornings. Sometimes over 350 after meals.
Dr: Any blurry vision, numbness or tingling?
Patient: Yes, my feet have been tingling a lot at night. And my vision has been a little blurry on and off.
Dr: Any chest pain or swelling?
Patient: No chest pain. My ankles swell a little by end of day.
Dr: Are you taking your medications?
Patient: I take metformin 1000mg twice a day. But I ran out of glipizide about two weeks ago and haven't refilled it.
Dr: Any other medications?
Patient: Lisinopril 10mg daily, atorvastatin 40mg at night, aspirin 81mg daily.
Dr: Any allergies?
Patient: No known drug allergies.
Dr: Family history?
Patient: My mother had diabetes and eventually went on dialysis.
Dr: Blood pressure 148 over 92, heart rate 84, temperature 98.6, weight 218 pounds which is up 6 pounds. O2 sat 97%. Feet show decreased sensation bilaterally. Mild pitting edema in both ankles.
Dr: We need better control urgently. I'm refilling your glipizide 10mg daily and increasing lisinopril to 20mg. Check blood sugar four times daily. I'm ordering A1C, metabolic panel, urine microalbumin, and lipid panel. Referrals to ophthalmology and podiatry. Follow up in 4 weeks.
Patient: Should I change my diet?
Dr: Yes. Meet with our diabetes educator. Reduce refined carbs, increase fiber, walk 20 to 30 minutes daily.
Patient: Thank you doctor.
"""
},
{
"name": "Neck Pain",
"transcript": """
Dr: Good morning, what brings you in today?
Patient: I've been having neck pain for the past two weeks. It started after I slept in a bad position. The pain is about a 6 out of 10, radiates to my left shoulder.
Dr: Any headaches or numbness?
Patient: Some headaches yes, but no numbness.
Dr: Are you on any medications?
Patient: I take ibuprofen 400mg when the pain gets bad, and lisinopril 10mg daily for blood pressure.
Dr: Any allergies?
Patient: Penicillin. I get a rash.
Dr: Range of motion is limited on the left side. No neurological deficits. I think this is muscle strain. I'm prescribing cyclobenzaprine 5mg at night. Follow up in 2 weeks if no improvement.
Patient: Thank you doctor.
"""
}
]
# ── LLM Judge ─────────────────────────────────────────────────
def _parse_judge_json(raw: str) -> dict:
    """Decode the judge's reply, tolerating code fences and surrounding prose."""
    # First attempt: strip Markdown code fences and parse what remains.
    clean = raw.replace("```json", "").replace("```", "").strip()
    try:
        return json.loads(clean)
    except json.JSONDecodeError:
        # Second chance: the model may have added text around the JSON, so
        # retry on the outermost {...} span. With no such span, re-raise the
        # original error so callers still see a JSONDecodeError.
        start = clean.find("{")
        end = clean.rfind("}")
        if start == -1 or end < start:
            raise
        return json.loads(clean[start:end + 1])


def llm_judge(transcript: str, soap_note: str) -> dict:
    """Ask the judge model to grade a SOAP note against its transcript.

    Sends one user message containing both texts and a JSON template to the
    module-level Anthropic ``client``, then decodes the first content block of
    the reply as JSON.

    Args:
        transcript: The source conversation the note was generated from.
        soap_note: The generated note to be graded.

    Returns:
        The decoded JSON reply. It is expected to follow the template in the
        prompt (five category objects with ``score``/``reason``, plus
        ``overall_score`` and ``summary``), but the reply is not validated
        against that shape here.

    Raises:
        json.JSONDecodeError: If no JSON can be decoded from the reply.
    """
    prompt = f"""You are a clinical documentation expert.
Evaluate this SOAP note against the original transcript.
Score each category from 1-10.
TRANSCRIPT:
{transcript}
GENERATED SOAP NOTE:
{soap_note}
Return ONLY valid JSON in this exact format with no extra text:
{{
"completeness": {{"score": 0, "reason": ""}},
"accuracy": {{"score": 0, "reason": ""}},
"structure": {{"score": 0, "reason": ""}},
"medication_capture": {{"score": 0, "reason": ""}},
"clinical_reasoning": {{"score": 0, "reason": ""}},
"overall_score": 0,
"summary": ""
}}"""
    response = client.messages.create(
        model="claude-sonnet-4-20250514",
        max_tokens=1000,
        messages=[{"role": "user", "content": prompt}],
    )
    raw = response.content[0].text.strip()
    return _parse_judge_json(raw)
# ── ROUGE Score ───────────────────────────────────────────────
def compute_rouge(reference: str, generated: str) -> dict:
    """Score *generated* against *reference* with ROUGE-1, ROUGE-2 and ROUGE-L.

    A stemming ``RougeScorer`` is built for the three metrics and the
    F-measure of each is returned, rounded to three decimal places, under the
    keys ``"rouge1"``, ``"rouge2"`` and ``"rougeL"``.
    """
    metric_names = ["rouge1", "rouge2", "rougeL"]
    scorer = rouge_scorer.RougeScorer(metric_names, use_stemmer=True)
    result = scorer.score(reference, generated)
    return {name: round(result[name].fmeasure, 3) for name in metric_names}
# ── Semantic Similarity ───────────────────────────────────────
def compute_similarity(transcript: str, soap_note: str) -> float:
    """Return the cosine similarity of the two texts' embeddings.

    Both texts are encoded with the module-level ``embedder`` and compared
    with ``util.cos_sim``; the scalar result is rounded to three decimals.
    """
    transcript_vec = embedder.encode(transcript, convert_to_tensor=True)
    note_vec = embedder.encode(soap_note, convert_to_tensor=True)
    cosine = util.cos_sim(transcript_vec, note_vec)
    return round(cosine.item(), 3)
# ── Field Coverage ────────────────────────────────────────────
def compute_field_coverage(
    soap_note: str,
    required_fields: "list[str] | None" = None,
) -> dict:
    """Report which required section headings appear in a SOAP note.

    A field counts as covered when its name occurs anywhere in the note,
    compared case-insensitively as a plain substring.

    Args:
        soap_note: The generated note text to inspect.
        required_fields: Field names to look for. When omitted, the nine
            default SOAP headings listed below are used.

    Returns:
        A dict with ``covered`` (number of fields found), ``total`` (number of
        fields checked), ``coverage`` (``covered / total`` rounded to two
        decimals, or ``0.0`` when no fields were requested) and ``missing``
        (the fields not found, in their original order).
    """
    if required_fields is None:
        required_fields = [
            "Chief Complaint",
            "History of Present Illness",
            "Current Medications",
            "Allergies",
            "Vital Signs",
            "Physical Exam",
            "Primary Diagnosis",
            "Plan",
            "Follow-up",
        ]
    else:
        # Materialise once so any iterable (including a generator) is accepted.
        required_fields = list(required_fields)

    # Lower-case the note a single time instead of once per field.
    haystack = soap_note.lower()
    covered = []
    missing = []
    for field in required_fields:
        bucket = covered if field.lower() in haystack else missing
        bucket.append(field)

    total = len(required_fields)
    # An empty field list has nothing to cover; avoid dividing by zero.
    coverage = round(len(covered) / total, 2) if total else 0.0
    return {
        "covered": len(covered),
        "total": total,
        "coverage": coverage,
        "missing": missing,
    }
# ── Print helpers ─────────────────────────────────────────────
def print_bar(score: float, out_of: float = 10) -> str:
    """Render *score* as a 10-cell text progress bar.

    Args:
        score: The value to display.
        out_of: The value that corresponds to a completely full bar.

    Returns:
        A string of exactly ten characters: filled cells followed by empty
        cells. The number of filled cells is ``score / out_of`` scaled to ten
        and truncated, then clamped to the range 0..10 so that out-of-range
        scores still produce a ten-cell bar.
    """
    cells = 10
    filled = int((score / out_of) * cells)
    # Clamp so a score below 0 or above out_of cannot change the bar's width.
    filled = max(0, min(cells, filled))
    # Distinct glyphs for filled and empty cells; otherwise every score
    # renders as the same ten characters.
    return "█" * filled + "░" * (cells - filled)
def print_section(title: str):
    """Print *title* as a section heading.

    Output is a blank line, a 55-character rule, the title prefixed with one
    space, and a second 55-character rule.
    """
    rule = "β" * 55
    # One print call, one line per argument: same output as three prints.
    print("\n" + rule, f" {title}", rule, sep="\n")
# ── Main evaluation runner ────────────────────────────────────
def run_evaluation():
    """Run every test case through the agent and print a scored report.

    For each entry in ``TEST_CASES`` the transcript is passed to
    ``run_agent``, and the content of the first returned step whose ``type``
    is ``"final"`` is taken as the SOAP note. The note is scored by
    ``llm_judge``, ``compute_rouge``, ``compute_similarity`` and
    ``compute_field_coverage``; each result is printed as its own section, and
    per-metric averages over the scored cases are printed at the end.

    A case is skipped with a warning when the agent yields no final note or
    when the judge's reply cannot be decoded as JSON, so one bad case does not
    abort the remaining ones. Nothing is returned; all output goes to stdout.
    """
    rule = "β" * 55

    # One list per headline metric. A case contributes to all four lists or to
    # none of them, so they always have the same length.
    all_overall = []
    all_rouge1 = []
    all_sim = []
    all_coverage = []

    print("\n" + rule)
    print(" CLINSCRIBE β EVALUATION REPORT")
    print(rule)

    for case_no, case in enumerate(TEST_CASES, start=1):
        print(f"\n\n{rule}")
        print(f" CASE {case_no}: {case['name']}")
        print(rule)

        # ── Run agent ──────────────────────────────────────
        print("\n β³ Running agent...")
        steps = run_agent(case["transcript"])
        soap_note = next(
            (s["content"] for s in steps if s["type"] == "final"),
            None,
        )
        if not soap_note:
            print(" β οΈ Agent returned no note β skipping")
            continue
        print(" β Note generated\n")

        # ── LLM Judge ──────────────────────────────────────
        print(" β³ Running LLM judge...")
        try:
            judge = llm_judge(case["transcript"], soap_note)
        except json.JSONDecodeError as exc:
            # Same policy as a missing note: report it and move on rather
            # than losing the results of the other cases.
            print(f" Judge reply was not valid JSON ({exc}) - skipping")
            continue
        all_overall.append(judge["overall_score"])
        print_section("LLM JUDGE SCORES")
        categories = [
            ("completeness", "Completeness"),
            ("accuracy", "Accuracy"),
            ("structure", "Structure"),
            ("medication_capture", "Medication Capture"),
            ("clinical_reasoning", "Clinical Reasoning"),
        ]
        for key, label in categories:
            s = judge[key]["score"]
            r = judge[key]["reason"]
            print(f"\n {label}")
            print(f" [{print_bar(s)}] {s}/10")
            print(f" β³ {r}")
        print(f"\n OVERALL: [{print_bar(judge['overall_score'])}] {judge['overall_score']}/10")
        print(f" SUMMARY: {judge['summary']}")

        # ── ROUGE ──────────────────────────────────────────
        rouge = compute_rouge(case["transcript"], soap_note)
        all_rouge1.append(rouge["rouge1"])
        print_section("ROUGE SCORES (overlap with transcript)")
        print(f" ROUGE-1: [{print_bar(rouge['rouge1'], 1)}] {rouge['rouge1']}")
        print(f" ROUGE-2: [{print_bar(rouge['rouge2'], 1)}] {rouge['rouge2']}")
        print(f" ROUGE-L: [{print_bar(rouge['rougeL'], 1)}] {rouge['rougeL']}")

        # ── Semantic Similarity ────────────────────────────
        sim = compute_similarity(case["transcript"], soap_note)
        all_sim.append(sim)
        print_section("SEMANTIC SIMILARITY")
        print(f" Score: [{print_bar(sim, 1)}] {sim}")
        print(" β³ How well the note captures the meaning of the transcript")

        # ── Field Coverage ─────────────────────────────────
        cov = compute_field_coverage(soap_note)
        all_coverage.append(cov["coverage"])
        print_section("FIELD COVERAGE")
        print(f" Covered: {cov['covered']}/{cov['total']} required fields")
        print(f" Score: [{print_bar(cov['coverage'], 1)}] {cov['coverage']}")
        if cov["missing"]:
            print(f" Missing: {', '.join(cov['missing'])}")
        else:
            print(" Missing: None β")

    # ── Aggregate Results ──────────────────────────────────
    print("\n\n" + rule)
    print(" AGGREGATE RESULTS ACROSS ALL CASES")
    print(rule)

    def avg(lst):
        # Mean rounded to two decimals; 0 when no case was scored.
        return round(sum(lst) / len(lst), 2) if lst else 0

    print(f"\n LLM Overall Score: {avg(all_overall)}/10")
    print(f" Avg ROUGE-1: {avg(all_rouge1)}")
    print(f" Avg Semantic Similarity:{avg(all_sim)}")
    print(f" Avg Field Coverage: {avg(all_coverage)}")
    print(f"\n Cases evaluated: {len(all_overall)}/{len(TEST_CASES)}")
    print("\n" + rule)
    print(" Evaluation complete")
    print(rule + "\n")
if __name__ == "__main__":
    # Run the evaluation only when executed as a script, not on import.
    run_evaluation()