Refactor prompt templates and RAG module
Browse files- notebooks/kaggle_medic_demo.ipynb +6 -54
- src/agents.py +46 -217
- src/config.py +15 -71
- src/db/import_data.py +113 -164
- src/graph.py +36 -205
- src/loader.py +20 -92
- src/prompts.py +7 -43
- src/rag.py +68 -295
- src/state.py +15 -47
- src/tools/rag_tools.py +0 -1
- src/utils.py +67 -216
notebooks/kaggle_medic_demo.ipynb
CHANGED
|
@@ -72,15 +72,7 @@
|
|
| 72 |
"id": "205d4ba2",
|
| 73 |
"metadata": {},
|
| 74 |
"outputs": [],
|
| 75 |
-
"source":
|
| 76 |
-
"%%capture\n",
|
| 77 |
-
"!pip install -q \\\n",
|
| 78 |
-
" \"langgraph>=0.0.15\" \"langchain>=0.3.0\" langchain-text-splitters langchain-community \\\n",
|
| 79 |
-
" \"chromadb>=0.4.0\" sentence-transformers \\\n",
|
| 80 |
-
" \"transformers>=4.50.0\" accelerate bitsandbytes \\\n",
|
| 81 |
-
" streamlit huggingface_hub \\\n",
|
| 82 |
-
" \"pydantic>=2.0\" python-dotenv openpyxl pypdf \"pandas>=2.0\" jq"
|
| 83 |
-
]
|
| 84 |
},
|
| 85 |
{
|
| 86 |
"cell_type": "markdown",
|
|
@@ -267,9 +259,7 @@
|
|
| 267 |
"cell_type": "markdown",
|
| 268 |
"id": "37d17f6b",
|
| 269 |
"metadata": {},
|
| 270 |
-
"source":
|
| 271 |
-
"## 5 · Launch the App"
|
| 272 |
-
]
|
| 273 |
},
|
| 274 |
{
|
| 275 |
"cell_type": "code",
|
|
@@ -277,10 +267,7 @@
|
|
| 277 |
"id": "96ff2d63",
|
| 278 |
"metadata": {},
|
| 279 |
"outputs": [],
|
| 280 |
-
"source": [
|
| 281 |
-
"%%capture\n",
|
| 282 |
-
"!pip install -q localtunnel"
|
| 283 |
-
]
|
| 284 |
},
|
| 285 |
{
|
| 286 |
"cell_type": "code",
|
|
@@ -288,28 +275,7 @@
|
|
| 288 |
"id": "ea6b1788",
|
| 289 |
"metadata": {},
|
| 290 |
"outputs": [],
|
| 291 |
-
"source": [
|
| 292 |
-
"import subprocess, time, requests\n",
|
| 293 |
-
"\n",
|
| 294 |
-
"streamlit_proc = subprocess.Popen(\n",
|
| 295 |
-
" [\"streamlit\", \"run\", \"/kaggle/working/Med-I-C/app.py\",\n",
|
| 296 |
-
" \"--server.port\", \"8501\",\n",
|
| 297 |
-
" \"--server.headless\", \"true\",\n",
|
| 298 |
-
" \"--server.enableCORS\", \"false\"],\n",
|
| 299 |
-
" stdout=subprocess.DEVNULL,\n",
|
| 300 |
-
" stderr=subprocess.DEVNULL,\n",
|
| 301 |
-
")\n",
|
| 302 |
-
"\n",
|
| 303 |
-
"for _ in range(15):\n",
|
| 304 |
-
" try:\n",
|
| 305 |
-
" if requests.get(\"http://localhost:8501\", timeout=2).status_code == 200:\n",
|
| 306 |
-
" print(\"Streamlit running on :8501\")\n",
|
| 307 |
-
" break\n",
|
| 308 |
-
" except Exception:\n",
|
| 309 |
-
" time.sleep(2)\n",
|
| 310 |
-
"else:\n",
|
| 311 |
-
" print(\"Streamlit may still be starting…\")"
|
| 312 |
-
]
|
| 313 |
},
|
| 314 |
{
|
| 315 |
"cell_type": "code",
|
|
@@ -317,21 +283,7 @@
|
|
| 317 |
"id": "00ecfb17",
|
| 318 |
"metadata": {},
|
| 319 |
"outputs": [],
|
| 320 |
-
"source":
|
| 321 |
-
"tunnel_proc = subprocess.Popen(\n",
|
| 322 |
-
" [\"npx\", \"localtunnel\", \"--port\", \"8501\"],\n",
|
| 323 |
-
" stdout=subprocess.PIPE,\n",
|
| 324 |
-
" stderr=subprocess.DEVNULL,\n",
|
| 325 |
-
" text=True,\n",
|
| 326 |
-
")\n",
|
| 327 |
-
"\n",
|
| 328 |
-
"for line in tunnel_proc.stdout:\n",
|
| 329 |
-
" if \"https://\" in line:\n",
|
| 330 |
-
" print(\"\\n\" + \"=\"*50)\n",
|
| 331 |
-
" print(f\" App URL: {line.strip()}\")\n",
|
| 332 |
-
" print(\"=\"*50)\n",
|
| 333 |
-
" break"
|
| 334 |
-
]
|
| 335 |
}
|
| 336 |
],
|
| 337 |
"metadata": {
|
|
@@ -347,4 +299,4 @@
|
|
| 347 |
},
|
| 348 |
"nbformat": 4,
|
| 349 |
"nbformat_minor": 5
|
| 350 |
-
}
|
|
|
|
| 72 |
"id": "205d4ba2",
|
| 73 |
"metadata": {},
|
| 74 |
"outputs": [],
|
| 75 |
+
"source": "%%capture\n!pip install -q \\\n \"langgraph>=0.0.15\" \"langchain>=0.3.0\" langchain-text-splitters langchain-community \\\n \"chromadb>=0.4.0\" sentence-transformers \\\n \"transformers>=4.50.0\" accelerate bitsandbytes \\\n gradio huggingface_hub \\\n \"pydantic>=2.0\" python-dotenv openpyxl pypdf \"pandas>=2.0\" jq"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
},
|
| 77 |
{
|
| 78 |
"cell_type": "markdown",
|
|
|
|
| 259 |
"cell_type": "markdown",
|
| 260 |
"id": "37d17f6b",
|
| 261 |
"metadata": {},
|
| 262 |
+
"source": "## 5 · Launch the Gradio App\n\nTwo tabbed scenarios are exposed in a single Gradio interface:\n\n| Tab | Scenario | Agents active |\n|---|---|---|\n| **Stage 1 — Empirical** | No lab results yet | Agent 1 (MedGemma 4B) → Agent 4 (MedGemma 4B + TxGemma 2B) |\n| **Stage 2 — Targeted** | Culture & sensitivity available | Agent 1 (MedGemma 4B) → Agent 2 (MedGemma 4B) → Agent 3 (MedGemma 4B→27B sub) → Agent 4 (MedGemma 4B + TxGemma 2B) |\n\n`demo.launch(share=True)` prints a public Gradio URL — no extra tunnel needed."
|
|
|
|
|
|
|
| 263 |
},
|
| 264 |
{
|
| 265 |
"cell_type": "code",
|
|
|
|
| 267 |
"id": "96ff2d63",
|
| 268 |
"metadata": {},
|
| 269 |
"outputs": [],
|
| 270 |
+
"source": "import json\nimport sys\n\nsys.path.insert(0, \"/kaggle/working/Med-I-C\")\n\n# ── Demo fallback (used when pipeline errors out before models are warm) ──────\n\ndef _demo_result(patient_data: dict, labs_text) -> dict:\n result = {\n \"stage\": \"targeted\" if labs_text else \"empirical\",\n \"creatinine_clearance_ml_min\": 58.3,\n \"intake_notes\": json.dumps({\n \"patient_summary\": (\n f\"{patient_data.get('age_years')}-year-old {patient_data.get('sex')} · \"\n f\"{patient_data.get('suspected_source', 'infection')}\"\n ),\n \"creatinine_clearance_ml_min\": 58.3,\n \"renal_dose_adjustment_needed\": True,\n \"identified_risk_factors\": patient_data.get(\"comorbidities\", []),\n \"infection_severity\": \"moderate\",\n \"recommended_stage\": \"targeted\" if labs_text else \"empirical\",\n }),\n \"recommendation\": {\n \"primary_antibiotic\": \"Ciprofloxacin\",\n \"dose\": \"500 mg\",\n \"route\": \"Oral\",\n \"frequency\": \"Every 12 hours\",\n \"duration\": \"7 days\",\n \"backup_antibiotic\": \"Nitrofurantoin 100 mg MR BD × 5 days\",\n \"rationale\": (\n \"Community-acquired UTI with moderate renal impairment (CrCl 58 mL/min). \"\n \"Ciprofloxacin provides broad Gram-negative coverage. No dose adjustment \"\n \"required above CrCl 30 mL/min.\"\n ),\n \"references\": [\"IDSA UTI Guidelines 2024\", \"EUCAST Breakpoint Tables v16.0\"],\n },\n \"safety_warnings\": [],\n \"errors\": [],\n }\n if labs_text:\n result[\"vision_notes\"] = json.dumps({\n \"specimen_type\": \"urine\",\n \"identified_organisms\": [{\"organism_name\": \"Escherichia coli\", \"significance\": \"pathogen\"}],\n \"susceptibility_results\": [\n {\"organism\": \"E. coli\", \"antibiotic\": \"Ciprofloxacin\", \"mic_value\": 0.25, \"interpretation\": \"S\"},\n {\"organism\": \"E. coli\", \"antibiotic\": \"Nitrofurantoin\", \"mic_value\": 16, \"interpretation\": \"S\"},\n {\"organism\": \"E. 
coli\", \"antibiotic\": \"Ampicillin\", \"mic_value\": \">32\", \"interpretation\": \"R\"},\n ],\n \"extraction_confidence\": 0.95,\n })\n result[\"trend_notes\"] = json.dumps([{\n \"organism\": \"E. coli\",\n \"antibiotic\": \"Ciprofloxacin\",\n \"risk_level\": \"LOW\",\n \"recommendation\": \"No MIC creep detected — continue current therapy.\",\n }])\n return result\n\n\n# ── Output formatters ─────────────────────────────────────────────────────────\n\ndef _parse_json_field(raw):\n if not raw or raw in (\"No lab data provided\", \"No MIC data available for trend analysis\", \"\"):\n return None\n if isinstance(raw, (dict, list)):\n return raw\n try:\n return json.loads(raw)\n except Exception:\n return None\n\n\ndef format_recommendation(result: dict) -> str:\n lines = [\"## ℞ Recommendation\\n\"]\n rec = result.get(\"recommendation\", {})\n if rec:\n drug = rec.get(\"primary_antibiotic\", \"—\")\n dose = rec.get(\"dose\", \"—\")\n route = rec.get(\"route\", \"—\")\n freq = rec.get(\"frequency\", \"—\")\n dur = rec.get(\"duration\", \"—\")\n lines.append(f\"**Drug:** {drug}\")\n lines.append(\n f\"**Dose:** {dose} · **Route:** {route} \"\n f\" · **Frequency:** {freq} · **Duration:** {dur}\"\n )\n if rec.get(\"backup_antibiotic\"):\n lines.append(f\"**Alternative:** {rec['backup_antibiotic']}\")\n if rec.get(\"rationale\"):\n lines.append(f\"\\n**Clinical rationale:** {rec['rationale']}\")\n if rec.get(\"references\"):\n lines.append(\"\\n**References:**\")\n for ref in rec[\"references\"]:\n lines.append(f\"- {ref}\")\n\n intake = _parse_json_field(result.get(\"intake_notes\", \"\"))\n if isinstance(intake, dict):\n lines.append(\"\\n---\\n## Patient Summary\")\n if intake.get(\"patient_summary\"):\n lines.append(f\"> {intake['patient_summary']}\")\n crcl = result.get(\"creatinine_clearance_ml_min\") or intake.get(\"creatinine_clearance_ml_min\")\n if crcl:\n lines.append(f\"**CrCl:** {float(crcl):.1f} mL/min\")\n if 
intake.get(\"renal_dose_adjustment_needed\"):\n lines.append(\"⚠ **Renal dose adjustment required**\")\n factors = intake.get(\"identified_risk_factors\", [])\n if factors:\n lines.append(f\"**Risk factors:** {', '.join(factors)}\")\n\n warnings = result.get(\"safety_warnings\", [])\n if warnings:\n lines.append(\"\\n---\\n## ⚠ Safety Warnings\")\n for w in warnings:\n lines.append(f\"- {w}\")\n\n errors = result.get(\"errors\", [])\n if errors:\n lines.append(\"\\n---\\n## Errors\")\n for e in errors:\n lines.append(f\"- {e}\")\n\n return \"\\n\".join(lines)\n\n\ndef format_lab_analysis(result: dict) -> str:\n lines = []\n vision = _parse_json_field(result.get(\"vision_notes\", \"\"))\n trend = _parse_json_field(result.get(\"trend_notes\", \"\"))\n\n if vision is None:\n return \"*No lab data processed.*\"\n\n if isinstance(vision, dict):\n lines.append(\"## Lab Extraction\")\n if vision.get(\"specimen_type\"):\n lines.append(f\"**Specimen:** {vision['specimen_type'].capitalize()}\")\n if vision.get(\"extraction_confidence\") is not None:\n conf = float(vision[\"extraction_confidence\"])\n lines.append(f\"**Extraction confidence:** {conf:.0%}\")\n\n orgs = vision.get(\"identified_organisms\", [])\n if orgs:\n lines.append(\"\\n**Identified organisms:**\")\n for o in orgs:\n name = o.get(\"organism_name\", \"Unknown\")\n sig = o.get(\"significance\", \"\")\n lines.append(f\"- **{name}**\" + (f\" — {sig}\" if sig else \"\"))\n\n sus = vision.get(\"susceptibility_results\", [])\n if sus:\n lines.append(\"\\n**Susceptibility results:**\")\n lines.append(\"| Organism | Antibiotic | MIC (mg/L) | Result |\")\n lines.append(\"|---|---|---|---|\")\n for s in sus:\n interp = s.get(\"interpretation\", \"\")\n icon = {\"S\": \"✓ S\", \"R\": \"✗ R\", \"I\": \"~ I\"}.get(interp.upper(), interp)\n lines.append(\n f\"| {s.get('organism','')} | {s.get('antibiotic','')} \"\n f\"| {s.get('mic_value','')} | {icon} |\"\n )\n\n if trend:\n items = trend if isinstance(trend, list) else 
[trend]\n lines.append(\"\\n## MIC Trend Analysis\")\n for item in items:\n if not isinstance(item, dict):\n lines.append(str(item))\n continue\n risk = item.get(\"risk_level\", \"UNKNOWN\").upper()\n icon = {\"HIGH\": \"🚨\", \"MODERATE\": \"⚠\"}.get(risk, \"✓\")\n org = item.get(\"organism\", \"\")\n ab = item.get(\"antibiotic\", \"\")\n label = f\"{org} / {ab} — \" if (org or ab) else \"\"\n lines.append(f\"**{icon} {label}{risk}** — {item.get('recommendation', '')}\")\n\n return \"\\n\".join(lines) if lines else \"*No lab analysis available.*\"\n\n\n# ── Pipeline runner helpers ───────────────────────────────────────────────────\n\ndef _build_patient_data(age, weight, height, sex, creatinine,\n infection_site, suspected_source,\n medications_str, allergies_str, comorbidities_str):\n return {\n \"age_years\": float(age),\n \"weight_kg\": float(weight),\n \"height_cm\": float(height),\n \"sex\": sex,\n \"serum_creatinine_mg_dl\": float(creatinine),\n \"infection_site\": infection_site,\n \"suspected_source\": suspected_source or f\"{infection_site} infection\",\n \"medications\": [m.strip() for m in medications_str.split(\"\\n\") if m.strip()],\n \"allergies\": [a.strip() for a in allergies_str.split(\"\\n\") if a.strip()],\n \"comorbidities\":[c.strip() for c in comorbidities_str.split(\"\\n\") if c.strip()],\n }\n\n\ndef run_empirical_scenario(age, weight, height, sex, creatinine,\n infection_site, suspected_source,\n medications_str, allergies_str, comorbidities_str):\n \"\"\"Stage 1 — Empirical: no lab results.\n Active models: MedGemma 4B (Agent 1) → MedGemma 4B + TxGemma 2B (Agent 4).\n \"\"\"\n patient_data = _build_patient_data(\n age, weight, height, sex, creatinine,\n infection_site, suspected_source,\n medications_str, allergies_str, comorbidities_str,\n )\n try:\n from src.graph import run_pipeline\n result = run_pipeline(patient_data, labs_raw_text=None)\n except Exception as exc:\n result = _demo_result(patient_data, None)\n 
result[\"errors\"].append(f\"[Demo mode — pipeline error: {exc}]\")\n return format_recommendation(result)\n\n\ndef run_targeted_scenario(age, weight, height, sex, creatinine,\n infection_site, suspected_source,\n medications_str, allergies_str, comorbidities_str,\n labs_text):\n \"\"\"Stage 2 — Targeted: lab culture & sensitivity available.\n Active models: MedGemma 4B (Agents 1, 2) → MedGemma 4B→27B sub (Agent 3)\n → MedGemma 4B + TxGemma 2B (Agent 4).\n \"\"\"\n patient_data = _build_patient_data(\n age, weight, height, sex, creatinine,\n infection_site, suspected_source,\n medications_str, allergies_str, comorbidities_str,\n )\n labs = labs_text.strip() if labs_text else None\n try:\n from src.graph import run_pipeline\n result = run_pipeline(patient_data, labs_raw_text=labs)\n except Exception as exc:\n result = _demo_result(patient_data, labs)\n result[\"errors\"].append(f\"[Demo mode — pipeline error: {exc}]\")\n return format_recommendation(result), format_lab_analysis(result)\n\n\nprint(\"Helper functions loaded.\")"
|
|
|
|
|
|
|
|
|
|
| 271 |
},
|
| 272 |
{
|
| 273 |
"cell_type": "code",
|
|
|
|
| 275 |
"id": "ea6b1788",
|
| 276 |
"metadata": {},
|
| 277 |
"outputs": [],
|
| 278 |
+
"source": "import gradio as gr\n\nINFECTION_SITES = [\"urinary\", \"respiratory\", \"bloodstream\", \"skin\", \"intra-abdominal\", \"CNS\", \"other\"]\n\n\ndef _patient_inputs():\n \"\"\"Create patient-demographics input widgets inside the current gr.Blocks context.\"\"\"\n with gr.Row():\n age = gr.Number(label=\"Age (years)\", value=65, minimum=0, maximum=120, precision=0)\n weight = gr.Number(label=\"Weight (kg)\", value=70.0, minimum=1, maximum=300)\n height = gr.Number(label=\"Height (cm)\", value=170.0,minimum=50, maximum=250)\n with gr.Row():\n sex = gr.Dropdown(label=\"Biological sex\", choices=[\"male\", \"female\"], value=\"male\")\n creatinine = gr.Number(label=\"Serum Creatinine (mg/dL)\", value=1.2, minimum=0.1, maximum=20.0)\n infection_site = gr.Dropdown(label=\"Infection site\", choices=INFECTION_SITES, value=\"urinary\")\n suspected_source = gr.Textbox(label=\"Suspected source\",\n placeholder=\"e.g., community-acquired UTI\")\n with gr.Row():\n medications = gr.Textbox(label=\"Current medications (one per line)\",\n placeholder=\"Metformin\\nLisinopril\", lines=3)\n allergies = gr.Textbox(label=\"Drug allergies (one per line)\",\n placeholder=\"Penicillin\\nSulfa\", lines=3)\n comorbidities = gr.Textbox(label=\"Comorbidities / MDR risk factors (one per line)\",\n placeholder=\"Diabetes\\nCKD\\nPrior MRSA\", lines=3)\n return [age, weight, height, sex, creatinine, infection_site,\n suspected_source, medications, allergies, comorbidities]\n\n\nwith gr.Blocks(title=\"AMR-Guard · Med-I-C\", theme=gr.themes.Soft()) as demo:\n\n gr.Markdown(\"\"\"\n# ⚕ AMR-Guard — Infection Lifecycle Orchestrator\n\n**Multi-Agent Clinical Decision Support for Antimicrobial Stewardship**\n\n| Model | Agent(s) | Role |\n|---|---|---|\n| `google/medgemma-4b-it` | 1, 2, 4 | Intake · Lab extraction · Final Rx |\n| `google/medgemma-4b-it` (27B sub on T4) | 3 | MIC trend analysis |\n| `google/txgemma-2b-predict` (9B sub on T4) | 4 (safety) | Drug interaction screening |\n\n> ⚠ 
**Research demo only** — not validated for clinical use. All output must be reviewed by a licensed clinician.\n---\n\"\"\")\n\n with gr.Tabs():\n\n # ──────────────────────────────────────────────────────────────────────\n # TAB 1 — Stage 1: Empirical (no lab results)\n # ──────────────────────────────────────────────────────────────────────\n with gr.Tab(\"Stage 1 — Empirical (no lab results)\"):\n gr.Markdown(\"\"\"\n**Scenario:** Patient presents without culture / sensitivity data.\n\n**Pipeline:** Agent 1 — *Intake Historian* (MedGemma 4B IT) → Agent 4 — *Clinical Pharmacologist* (MedGemma 4B IT + TxGemma 2B)\n\"\"\")\n emp_inputs = _patient_inputs()\n emp_btn = gr.Button(\"Run Empirical Pipeline\", variant=\"primary\")\n emp_output = gr.Markdown(label=\"Recommendation\")\n\n emp_btn.click(\n fn=run_empirical_scenario,\n inputs=emp_inputs,\n outputs=emp_output,\n )\n\n # ──────────────────────────────────────────────────────────────────────\n # TAB 2 — Stage 2: Targeted (culture & sensitivity available)\n # ──────────────────────────────────────────────────────────────────────\n with gr.Tab(\"Stage 2 — Targeted (lab results available)\"):\n gr.Markdown(\"\"\"\n**Scenario:** Culture & sensitivity report (any language) is available.\n\n**Pipeline:** Agent 1 (MedGemma 4B IT) → Agent 2 — *Vision Specialist* (MedGemma 4B IT) → Agent 3 — *Trend Analyst* (MedGemma 27B→4B sub) → Agent 4 (MedGemma 4B IT + TxGemma 2B)\n\"\"\")\n tgt_inputs = _patient_inputs()\n tgt_labs = gr.Textbox(\n label=\"Lab / Culture Report — paste text (any language)\",\n placeholder=(\n \"Organism: Escherichia coli\\n\"\n \"Ciprofloxacin: S MIC 0.25 mg/L\\n\"\n \"Nitrofurantoin: S MIC 16 mg/L\\n\"\n \"Ampicillin: R MIC >32 mg/L\"\n ),\n lines=6,\n )\n tgt_btn = gr.Button(\"Run Targeted Pipeline\", variant=\"primary\")\n\n with gr.Row():\n tgt_rec_output = gr.Markdown(label=\"Recommendation\")\n tgt_lab_output = gr.Markdown(label=\"Lab Analysis & MIC Trend\")\n\n tgt_btn.click(\n 
fn=run_targeted_scenario,\n inputs=tgt_inputs + [tgt_labs],\n outputs=[tgt_rec_output, tgt_lab_output],\n )\n\n gr.Markdown(\"\"\"\n---\n**Knowledge bases:** EUCAST v16.0 · WHO AWaRe 2024 · IDSA AMR Guidance 2024 · ATLAS Surveillance · WHO GLASS · DDInter 2.0 \n**Inference:** HuggingFace Transformers · 4-bit quantization · Kaggle T4 GPU\n\"\"\")\n\nprint(\"Gradio app defined. Run the next cell to launch.\")"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 279 |
},
|
| 280 |
{
|
| 281 |
"cell_type": "code",
|
|
|
|
| 283 |
"id": "00ecfb17",
|
| 284 |
"metadata": {},
|
| 285 |
"outputs": [],
|
| 286 |
+
"source": "# share=True creates a public Gradio URL (works out-of-the-box on Kaggle — no localtunnel needed).\n# The URL is printed below and stays live for ~72 hours.\ndemo.launch(share=True, quiet=True)"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 287 |
}
|
| 288 |
],
|
| 289 |
"metadata": {
|
|
|
|
| 299 |
},
|
| 300 |
"nbformat": 4,
|
| 301 |
"nbformat_minor": 5
|
| 302 |
+
}
|
src/agents.py
CHANGED
|
@@ -1,18 +1,15 @@
|
|
| 1 |
"""
|
| 2 |
-
|
| 3 |
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
- Agent 4: Clinical Pharmacologist - Final Rx recommendations + safety checks
|
| 9 |
"""
|
| 10 |
|
| 11 |
-
from __future__ import annotations
|
| 12 |
-
|
| 13 |
import json
|
| 14 |
import logging
|
| 15 |
-
from typing import
|
| 16 |
|
| 17 |
from .config import get_settings
|
| 18 |
from .loader import run_inference, TextModelName
|
|
@@ -40,36 +37,12 @@ from .utils import (
|
|
| 40 |
logger = logging.getLogger(__name__)
|
| 41 |
|
| 42 |
|
| 43 |
-
# =============================================================================
|
| 44 |
-
# AGENT 1: INTAKE HISTORIAN
|
| 45 |
-
# =============================================================================
|
| 46 |
-
|
| 47 |
def run_intake_historian(state: InfectionState) -> InfectionState:
|
| 48 |
-
"""
|
| 49 |
-
Agent 1: Parse patient data, calculate CrCl, identify risk factors.
|
| 50 |
-
|
| 51 |
-
Input state fields used:
|
| 52 |
-
- age_years, weight_kg, height_cm, sex
|
| 53 |
-
- serum_creatinine_mg_dl
|
| 54 |
-
- medications, allergies, comorbidities
|
| 55 |
-
- suspected_source, infection_site
|
| 56 |
-
|
| 57 |
-
Output state fields updated:
|
| 58 |
-
- creatinine_clearance_ml_min
|
| 59 |
-
- intake_notes
|
| 60 |
-
- stage (empirical/targeted)
|
| 61 |
-
- route_to_vision
|
| 62 |
-
"""
|
| 63 |
logger.info("Running Intake Historian agent...")
|
| 64 |
|
| 65 |
-
# Calculate CrCl if we have the required data
|
| 66 |
crcl = None
|
| 67 |
-
if all([
|
| 68 |
-
state.get("age_years"),
|
| 69 |
-
state.get("weight_kg"),
|
| 70 |
-
state.get("serum_creatinine_mg_dl"),
|
| 71 |
-
state.get("sex"),
|
| 72 |
-
]):
|
| 73 |
try:
|
| 74 |
crcl = calculate_crcl(
|
| 75 |
age_years=state["age_years"],
|
|
@@ -85,20 +58,14 @@ def run_intake_historian(state: InfectionState) -> InfectionState:
|
|
| 85 |
logger.warning(f"Could not calculate CrCl: {e}")
|
| 86 |
state.setdefault("errors", []).append(f"CrCl calculation error: {e}")
|
| 87 |
|
| 88 |
-
# Build patient data string for prompt
|
| 89 |
patient_data = _format_patient_data(state)
|
| 90 |
-
|
| 91 |
-
# Get RAG context
|
| 92 |
query = f"treatment {state.get('suspected_source', '')} {state.get('infection_site', '')}"
|
| 93 |
rag_context = get_context_for_agent(
|
| 94 |
agent_name="intake_historian",
|
| 95 |
query=query,
|
| 96 |
-
patient_context={
|
| 97 |
-
"pathogen_type": state.get("suspected_source"),
|
| 98 |
-
},
|
| 99 |
)
|
| 100 |
|
| 101 |
-
# Format the prompt
|
| 102 |
prompt = f"{INTAKE_HISTORIAN_SYSTEM}\n\n{INTAKE_HISTORIAN_PROMPT.format(
|
| 103 |
patient_data=patient_data,
|
| 104 |
medications=', '.join(state.get('medications', [])) or 'None reported',
|
|
@@ -108,34 +75,20 @@ def run_intake_historian(state: InfectionState) -> InfectionState:
|
|
| 108 |
rag_context=rag_context,
|
| 109 |
)}"
|
| 110 |
|
| 111 |
-
# Run inference
|
| 112 |
try:
|
| 113 |
-
response = run_inference(
|
| 114 |
-
prompt=prompt,
|
| 115 |
-
model_name="medgemma_4b",
|
| 116 |
-
max_new_tokens=1024,
|
| 117 |
-
temperature=0.2,
|
| 118 |
-
)
|
| 119 |
-
|
| 120 |
-
# Parse response
|
| 121 |
parsed = safe_json_parse(response)
|
| 122 |
if parsed:
|
| 123 |
state["intake_notes"] = json.dumps(parsed, indent=2)
|
| 124 |
-
|
| 125 |
-
# Update state from parsed response
|
| 126 |
if parsed.get("creatinine_clearance_ml_min") and crcl is None:
|
| 127 |
state["creatinine_clearance_ml_min"] = parsed["creatinine_clearance_ml_min"]
|
| 128 |
-
|
| 129 |
-
# Determine stage
|
| 130 |
-
recommended_stage = parsed.get("recommended_stage", "empirical")
|
| 131 |
-
state["stage"] = recommended_stage
|
| 132 |
-
|
| 133 |
-
# Route to vision if we have lab data to process
|
| 134 |
-
state["route_to_vision"] = bool(state.get("labs_raw_text"))
|
| 135 |
else:
|
| 136 |
state["intake_notes"] = response
|
| 137 |
state["stage"] = "empirical"
|
| 138 |
-
|
|
|
|
|
|
|
| 139 |
|
| 140 |
except Exception as e:
|
| 141 |
logger.error(f"Intake Historian error: {e}")
|
|
@@ -147,23 +100,8 @@ def run_intake_historian(state: InfectionState) -> InfectionState:
|
|
| 147 |
return state
|
| 148 |
|
| 149 |
|
| 150 |
-
# =============================================================================
|
| 151 |
-
# AGENT 2: VISION SPECIALIST
|
| 152 |
-
# =============================================================================
|
| 153 |
-
|
| 154 |
def run_vision_specialist(state: InfectionState) -> InfectionState:
|
| 155 |
-
"""
|
| 156 |
-
Agent 2: Extract structured data from lab reports (text, images, PDFs).
|
| 157 |
-
|
| 158 |
-
Input state fields used:
|
| 159 |
-
- labs_raw_text (extracted text from lab report)
|
| 160 |
-
|
| 161 |
-
Output state fields updated:
|
| 162 |
-
- labs_parsed
|
| 163 |
-
- mic_data
|
| 164 |
-
- vision_notes
|
| 165 |
-
- route_to_trend_analyst
|
| 166 |
-
"""
|
| 167 |
logger.info("Running Vision Specialist agent...")
|
| 168 |
|
| 169 |
labs_raw = state.get("labs_raw_text", "")
|
|
@@ -173,68 +111,54 @@ def run_vision_specialist(state: InfectionState) -> InfectionState:
|
|
| 173 |
state["route_to_trend_analyst"] = False
|
| 174 |
return state
|
| 175 |
|
| 176 |
-
#
|
| 177 |
language = "English (assumed)"
|
| 178 |
-
|
| 179 |
-
# Get RAG context for lab interpretation
|
| 180 |
rag_context = get_context_for_agent(
|
| 181 |
agent_name="vision_specialist",
|
| 182 |
query="culture sensitivity susceptibility interpretation",
|
| 183 |
patient_context={},
|
| 184 |
)
|
| 185 |
|
| 186 |
-
# Format the prompt
|
| 187 |
prompt = f"{VISION_SPECIALIST_SYSTEM}\n\n{VISION_SPECIALIST_PROMPT.format(
|
| 188 |
report_content=labs_raw,
|
| 189 |
source_format='text',
|
| 190 |
language=language,
|
| 191 |
)}"
|
| 192 |
|
| 193 |
-
# Run inference
|
| 194 |
try:
|
| 195 |
-
response = run_inference(
|
| 196 |
-
prompt=prompt,
|
| 197 |
-
model_name="medgemma_4b",
|
| 198 |
-
max_new_tokens=2048,
|
| 199 |
-
temperature=0.1,
|
| 200 |
-
)
|
| 201 |
-
|
| 202 |
-
# Parse response
|
| 203 |
parsed = safe_json_parse(response)
|
| 204 |
if parsed:
|
| 205 |
state["vision_notes"] = json.dumps(parsed, indent=2)
|
| 206 |
|
| 207 |
-
# Extract organisms and susceptibility data
|
| 208 |
organisms = parsed.get("identified_organisms", [])
|
| 209 |
susceptibility = parsed.get("susceptibility_results", [])
|
| 210 |
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
"
|
| 216 |
-
"
|
| 217 |
-
"
|
| 218 |
-
"mic_unit": result.get("mic_unit", "mg/L"),
|
| 219 |
-
"interpretation": result.get("interpretation"),
|
| 220 |
}
|
| 221 |
-
|
|
|
|
| 222 |
|
| 223 |
state["mic_data"] = mic_data
|
| 224 |
-
state["labs_parsed"] = [
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
|
|
|
| 231 |
state["route_to_trend_analyst"] = len(mic_data) > 0
|
| 232 |
|
| 233 |
-
# Check for critical findings
|
| 234 |
critical = parsed.get("critical_findings", [])
|
| 235 |
if critical:
|
| 236 |
state.setdefault("safety_warnings", []).extend(critical)
|
| 237 |
-
|
| 238 |
else:
|
| 239 |
state["vision_notes"] = response
|
| 240 |
state["route_to_trend_analyst"] = False
|
|
@@ -249,23 +173,8 @@ def run_vision_specialist(state: InfectionState) -> InfectionState:
|
|
| 249 |
return state
|
| 250 |
|
| 251 |
|
| 252 |
-
# =============================================================================
|
| 253 |
-
# AGENT 3: TREND ANALYST
|
| 254 |
-
# =============================================================================
|
| 255 |
-
|
| 256 |
def run_trend_analyst(state: InfectionState) -> InfectionState:
|
| 257 |
-
"""
|
| 258 |
-
Agent 3: Analyze MIC trends and detect resistance velocity.
|
| 259 |
-
|
| 260 |
-
Input state fields used:
|
| 261 |
-
- mic_data (current MIC readings)
|
| 262 |
-
- Historical MIC data (if available)
|
| 263 |
-
|
| 264 |
-
Output state fields updated:
|
| 265 |
-
- mic_trend_summary
|
| 266 |
-
- trend_notes
|
| 267 |
-
- safety_warnings (if high risk detected)
|
| 268 |
-
"""
|
| 269 |
logger.info("Running Trend Analyst agent...")
|
| 270 |
|
| 271 |
mic_data = state.get("mic_data", [])
|
|
@@ -274,14 +183,12 @@ def run_trend_analyst(state: InfectionState) -> InfectionState:
|
|
| 274 |
state["trend_notes"] = "No MIC data available for trend analysis"
|
| 275 |
return state
|
| 276 |
|
| 277 |
-
# For each organism-antibiotic pair, analyze trends
|
| 278 |
trend_results = []
|
| 279 |
|
| 280 |
for mic in mic_data:
|
| 281 |
organism = mic.get("organism", "Unknown")
|
| 282 |
antibiotic = mic.get("antibiotic", "Unknown")
|
| 283 |
|
| 284 |
-
# Get RAG context for breakpoints
|
| 285 |
rag_context = get_context_for_agent(
|
| 286 |
agent_name="trend_analyst",
|
| 287 |
query=f"breakpoint {organism} {antibiotic}",
|
|
@@ -292,10 +199,9 @@ def run_trend_analyst(state: InfectionState) -> InfectionState:
|
|
| 292 |
},
|
| 293 |
)
|
| 294 |
|
| 295 |
-
#
|
| 296 |
mic_history = [{"date": "current", "mic_value": mic.get("mic_value", "0")}]
|
| 297 |
|
| 298 |
-
# Format prompt
|
| 299 |
prompt = f"{TREND_ANALYST_SYSTEM}\n\n{TREND_ANALYST_PROMPT.format(
|
| 300 |
organism=organism,
|
| 301 |
antibiotic=antibiotic,
|
|
@@ -305,18 +211,16 @@ def run_trend_analyst(state: InfectionState) -> InfectionState:
|
|
| 305 |
)}"
|
| 306 |
|
| 307 |
try:
|
|
|
|
| 308 |
response = run_inference(
|
| 309 |
prompt=prompt,
|
| 310 |
-
model_name="medgemma_27b",
|
| 311 |
max_new_tokens=1024,
|
| 312 |
temperature=0.2,
|
| 313 |
)
|
| 314 |
-
|
| 315 |
parsed = safe_json_parse(response)
|
| 316 |
if parsed:
|
| 317 |
trend_results.append(parsed)
|
| 318 |
-
|
| 319 |
-
# Add safety warning if high/critical risk
|
| 320 |
risk_level = parsed.get("risk_level", "LOW")
|
| 321 |
if risk_level in ["HIGH", "CRITICAL"]:
|
| 322 |
warning = f"MIC trend alert for {organism}/{antibiotic}: {parsed.get('recommendation', 'Review needed')}"
|
|
@@ -328,10 +232,8 @@ def run_trend_analyst(state: InfectionState) -> InfectionState:
|
|
| 328 |
logger.error(f"Trend analysis error for {organism}/{antibiotic}: {e}")
|
| 329 |
trend_results.append({"error": str(e)})
|
| 330 |
|
| 331 |
-
# Summarize trends
|
| 332 |
state["trend_notes"] = json.dumps(trend_results, indent=2)
|
| 333 |
|
| 334 |
-
# Create summary
|
| 335 |
high_risk_count = sum(1 for t in trend_results if t.get("risk_level") in ["HIGH", "CRITICAL"])
|
| 336 |
state["mic_trend_summary"] = f"Analyzed {len(trend_results)} organism-antibiotic pairs. High-risk findings: {high_risk_count}"
|
| 337 |
|
|
@@ -339,43 +241,21 @@ def run_trend_analyst(state: InfectionState) -> InfectionState:
|
|
| 339 |
return state
|
| 340 |
|
| 341 |
|
| 342 |
-
# =============================================================================
|
| 343 |
-
# AGENT 4: CLINICAL PHARMACOLOGIST
|
| 344 |
-
# =============================================================================
|
| 345 |
-
|
| 346 |
def run_clinical_pharmacologist(state: InfectionState) -> InfectionState:
|
| 347 |
-
"""
|
| 348 |
-
Agent 4: Generate final antibiotic recommendation with safety checks.
|
| 349 |
-
|
| 350 |
-
Input state fields used:
|
| 351 |
-
- intake_notes, vision_notes, trend_notes
|
| 352 |
-
- age_years, weight_kg, creatinine_clearance_ml_min
|
| 353 |
-
- allergies, medications
|
| 354 |
-
- infection_site, suspected_source
|
| 355 |
-
|
| 356 |
-
Output state fields updated:
|
| 357 |
-
- recommendation
|
| 358 |
-
- pharmacology_notes
|
| 359 |
-
- safety_warnings (additional alerts)
|
| 360 |
-
"""
|
| 361 |
logger.info("Running Clinical Pharmacologist agent...")
|
| 362 |
|
| 363 |
-
# Gather all previous agent outputs
|
| 364 |
intake_summary = state.get("intake_notes", "No intake data")
|
| 365 |
lab_results = state.get("vision_notes", "No lab data")
|
| 366 |
trend_analysis = state.get("trend_notes", "No trend data")
|
| 367 |
|
| 368 |
-
# Get RAG context
|
| 369 |
query = f"treatment {state.get('suspected_source', '')} antibiotic recommendation"
|
| 370 |
rag_context = get_context_for_agent(
|
| 371 |
agent_name="clinical_pharmacologist",
|
| 372 |
query=query,
|
| 373 |
-
patient_context={
|
| 374 |
-
"proposed_antibiotic": None, # Will be determined by agent
|
| 375 |
-
},
|
| 376 |
)
|
| 377 |
|
| 378 |
-
# Format prompt
|
| 379 |
prompt = f"{CLINICAL_PHARMACOLOGIST_SYSTEM}\n\n{CLINICAL_PHARMACOLOGIST_PROMPT.format(
|
| 380 |
intake_summary=intake_summary,
|
| 381 |
lab_results=lab_results,
|
|
@@ -392,18 +272,11 @@ def run_clinical_pharmacologist(state: InfectionState) -> InfectionState:
|
|
| 392 |
)}"
|
| 393 |
|
| 394 |
try:
|
| 395 |
-
response = run_inference(
|
| 396 |
-
prompt=prompt,
|
| 397 |
-
model_name="medgemma_4b",
|
| 398 |
-
max_new_tokens=2048,
|
| 399 |
-
temperature=0.2,
|
| 400 |
-
)
|
| 401 |
-
|
| 402 |
parsed = safe_json_parse(response)
|
| 403 |
if parsed:
|
| 404 |
state["pharmacology_notes"] = json.dumps(parsed, indent=2)
|
| 405 |
|
| 406 |
-
# Build recommendation
|
| 407 |
primary = parsed.get("primary_recommendation", {})
|
| 408 |
recommendation = {
|
| 409 |
"primary_antibiotic": primary.get("antibiotic"),
|
|
@@ -416,19 +289,16 @@ def run_clinical_pharmacologist(state: InfectionState) -> InfectionState:
|
|
| 416 |
"safety_alerts": [a.get("message") for a in parsed.get("safety_alerts", [])],
|
| 417 |
}
|
| 418 |
|
| 419 |
-
# Add alternative if provided
|
| 420 |
alt = parsed.get("alternative_recommendation", {})
|
| 421 |
if alt.get("antibiotic"):
|
| 422 |
recommendation["backup_antibiotic"] = alt.get("antibiotic")
|
| 423 |
|
| 424 |
state["recommendation"] = recommendation
|
| 425 |
|
| 426 |
-
# Add safety alerts to state
|
| 427 |
for alert in parsed.get("safety_alerts", []):
|
| 428 |
if alert.get("level") in ["WARNING", "CRITICAL"]:
|
| 429 |
state.setdefault("safety_warnings", []).append(alert.get("message"))
|
| 430 |
|
| 431 |
-
# Run TxGemma safety check (optional)
|
| 432 |
if primary.get("antibiotic"):
|
| 433 |
safety_result = _run_txgemma_safety_check(
|
| 434 |
antibiotic=primary.get("antibiotic"),
|
|
@@ -441,7 +311,6 @@ def run_clinical_pharmacologist(state: InfectionState) -> InfectionState:
|
|
| 441 |
)
|
| 442 |
if safety_result:
|
| 443 |
state.setdefault("debug_log", []).append(f"TxGemma safety: {safety_result}")
|
| 444 |
-
|
| 445 |
else:
|
| 446 |
state["pharmacology_notes"] = response
|
| 447 |
state["recommendation"] = {"rationale": response}
|
|
@@ -455,12 +324,8 @@ def run_clinical_pharmacologist(state: InfectionState) -> InfectionState:
|
|
| 455 |
return state
|
| 456 |
|
| 457 |
|
| 458 |
-
# =============================================================================
|
| 459 |
-
# HELPER FUNCTIONS
|
| 460 |
-
# =============================================================================
|
| 461 |
-
|
| 462 |
def _format_patient_data(state: InfectionState) -> str:
|
| 463 |
-
"""Format patient
|
| 464 |
lines = []
|
| 465 |
|
| 466 |
if state.get("patient_id"):
|
|
@@ -505,11 +370,7 @@ def _run_txgemma_safety_check(
|
|
| 505 |
crcl: Optional[float],
|
| 506 |
medications: list,
|
| 507 |
) -> Optional[str]:
|
| 508 |
-
"""
|
| 509 |
-
Run TxGemma safety check (supplementary).
|
| 510 |
-
|
| 511 |
-
TxGemma is used only for safety validation, not primary recommendations.
|
| 512 |
-
"""
|
| 513 |
try:
|
| 514 |
prompt = TXGEMMA_SAFETY_PROMPT.format(
|
| 515 |
antibiotic=antibiotic,
|
|
@@ -520,25 +381,13 @@ def _run_txgemma_safety_check(
|
|
| 520 |
crcl=crcl or "Unknown",
|
| 521 |
medications=", ".join(medications) if medications else "None",
|
| 522 |
)
|
| 523 |
-
|
| 524 |
-
|
| 525 |
-
prompt=prompt,
|
| 526 |
-
model_name="txgemma_9b", # Agent 4 safety: TxGemma 9B per PLAN.md (env maps to 2B on limited GPU)
|
| 527 |
-
max_new_tokens=256,
|
| 528 |
-
temperature=0.1,
|
| 529 |
-
)
|
| 530 |
-
|
| 531 |
-
return response
|
| 532 |
-
|
| 533 |
except Exception as e:
|
| 534 |
logger.warning(f"TxGemma safety check failed: {e}")
|
| 535 |
return None
|
| 536 |
|
| 537 |
|
| 538 |
-
# =============================================================================
|
| 539 |
-
# AGENT REGISTRY
|
| 540 |
-
# =============================================================================
|
| 541 |
-
|
| 542 |
AGENTS = {
|
| 543 |
"intake_historian": run_intake_historian,
|
| 544 |
"vision_specialist": run_vision_specialist,
|
|
@@ -548,27 +397,7 @@ AGENTS = {
|
|
| 548 |
|
| 549 |
|
| 550 |
def run_agent(agent_name: str, state: InfectionState) -> InfectionState:
|
| 551 |
-
"""
|
| 552 |
-
Run a specific agent by name.
|
| 553 |
-
|
| 554 |
-
Args:
|
| 555 |
-
agent_name: Name of the agent to run
|
| 556 |
-
state: Current infection state
|
| 557 |
-
|
| 558 |
-
Returns:
|
| 559 |
-
Updated infection state
|
| 560 |
-
"""
|
| 561 |
if agent_name not in AGENTS:
|
| 562 |
raise ValueError(f"Unknown agent: {agent_name}")
|
| 563 |
-
|
| 564 |
return AGENTS[agent_name](state)
|
| 565 |
-
|
| 566 |
-
|
| 567 |
-
__all__ = [
|
| 568 |
-
"run_intake_historian",
|
| 569 |
-
"run_vision_specialist",
|
| 570 |
-
"run_trend_analyst",
|
| 571 |
-
"run_clinical_pharmacologist",
|
| 572 |
-
"run_agent",
|
| 573 |
-
"AGENTS",
|
| 574 |
-
]
|
|
|
|
| 1 |
"""
|
| 2 |
+
Four-agent pipeline for the infection lifecycle workflow.
|
| 3 |
|
| 4 |
+
Agent 1 - Intake Historian: parse patient data, calculate CrCl, identify AMR risk factors
|
| 5 |
+
Agent 2 - Vision Specialist: extract organisms and MIC values from lab reports
|
| 6 |
+
Agent 3 - Trend Analyst: detect MIC creep and resistance velocity
|
| 7 |
+
Agent 4 - Clinical Pharmacologist: generate final antibiotic recommendation with safety checks
|
|
|
|
| 8 |
"""
|
| 9 |
|
|
|
|
|
|
|
| 10 |
import json
|
| 11 |
import logging
|
| 12 |
+
from typing import Optional
|
| 13 |
|
| 14 |
from .config import get_settings
|
| 15 |
from .loader import run_inference, TextModelName
|
|
|
|
| 37 |
logger = logging.getLogger(__name__)
|
| 38 |
|
| 39 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
def run_intake_historian(state: InfectionState) -> InfectionState:
|
| 41 |
+
"""Parse patient data, calculate CrCl, identify MDR risk factors, and set the treatment stage."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
logger.info("Running Intake Historian agent...")
|
| 43 |
|
|
|
|
| 44 |
crcl = None
|
| 45 |
+
if all([state.get("age_years"), state.get("weight_kg"), state.get("serum_creatinine_mg_dl"), state.get("sex")]):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
try:
|
| 47 |
crcl = calculate_crcl(
|
| 48 |
age_years=state["age_years"],
|
|
|
|
| 58 |
logger.warning(f"Could not calculate CrCl: {e}")
|
| 59 |
state.setdefault("errors", []).append(f"CrCl calculation error: {e}")
|
| 60 |
|
|
|
|
| 61 |
patient_data = _format_patient_data(state)
|
|
|
|
|
|
|
| 62 |
query = f"treatment {state.get('suspected_source', '')} {state.get('infection_site', '')}"
|
| 63 |
rag_context = get_context_for_agent(
|
| 64 |
agent_name="intake_historian",
|
| 65 |
query=query,
|
| 66 |
+
patient_context={"pathogen_type": state.get("suspected_source")},
|
|
|
|
|
|
|
| 67 |
)
|
| 68 |
|
|
|
|
| 69 |
prompt = f"{INTAKE_HISTORIAN_SYSTEM}\n\n{INTAKE_HISTORIAN_PROMPT.format(
|
| 70 |
patient_data=patient_data,
|
| 71 |
medications=', '.join(state.get('medications', [])) or 'None reported',
|
|
|
|
| 75 |
rag_context=rag_context,
|
| 76 |
)}"
|
| 77 |
|
|
|
|
| 78 |
try:
|
| 79 |
+
response = run_inference(prompt=prompt, model_name="medgemma_4b", max_new_tokens=1024, temperature=0.2)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
parsed = safe_json_parse(response)
|
| 81 |
if parsed:
|
| 82 |
state["intake_notes"] = json.dumps(parsed, indent=2)
|
|
|
|
|
|
|
| 83 |
if parsed.get("creatinine_clearance_ml_min") and crcl is None:
|
| 84 |
state["creatinine_clearance_ml_min"] = parsed["creatinine_clearance_ml_min"]
|
| 85 |
+
state["stage"] = parsed.get("recommended_stage", "empirical")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
else:
|
| 87 |
state["intake_notes"] = response
|
| 88 |
state["stage"] = "empirical"
|
| 89 |
+
|
| 90 |
+
# Route to vision only if lab text was provided
|
| 91 |
+
state["route_to_vision"] = bool(state.get("labs_raw_text"))
|
| 92 |
|
| 93 |
except Exception as e:
|
| 94 |
logger.error(f"Intake Historian error: {e}")
|
|
|
|
| 100 |
return state
|
| 101 |
|
| 102 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
def run_vision_specialist(state: InfectionState) -> InfectionState:
|
| 104 |
+
"""Extract pathogen names, MIC values, and S/I/R interpretations from lab report text."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
logger.info("Running Vision Specialist agent...")
|
| 106 |
|
| 107 |
labs_raw = state.get("labs_raw_text", "")
|
|
|
|
| 111 |
state["route_to_trend_analyst"] = False
|
| 112 |
return state
|
| 113 |
|
| 114 |
+
# Language detection is not implemented; we assume English or instruct the model to translate
|
| 115 |
language = "English (assumed)"
|
|
|
|
|
|
|
| 116 |
rag_context = get_context_for_agent(
|
| 117 |
agent_name="vision_specialist",
|
| 118 |
query="culture sensitivity susceptibility interpretation",
|
| 119 |
patient_context={},
|
| 120 |
)
|
| 121 |
|
|
|
|
| 122 |
prompt = f"{VISION_SPECIALIST_SYSTEM}\n\n{VISION_SPECIALIST_PROMPT.format(
|
| 123 |
report_content=labs_raw,
|
| 124 |
source_format='text',
|
| 125 |
language=language,
|
| 126 |
)}"
|
| 127 |
|
|
|
|
| 128 |
try:
|
| 129 |
+
response = run_inference(prompt=prompt, model_name="medgemma_4b", max_new_tokens=2048, temperature=0.1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 130 |
parsed = safe_json_parse(response)
|
| 131 |
if parsed:
|
| 132 |
state["vision_notes"] = json.dumps(parsed, indent=2)
|
| 133 |
|
|
|
|
| 134 |
organisms = parsed.get("identified_organisms", [])
|
| 135 |
susceptibility = parsed.get("susceptibility_results", [])
|
| 136 |
|
| 137 |
+
mic_data = [
|
| 138 |
+
{
|
| 139 |
+
"organism": normalize_organism_name(r.get("organism", "")),
|
| 140 |
+
"antibiotic": normalize_antibiotic_name(r.get("antibiotic", "")),
|
| 141 |
+
"mic_value": str(r.get("mic_value", "")),
|
| 142 |
+
"mic_unit": r.get("mic_unit", "mg/L"),
|
| 143 |
+
"interpretation": r.get("interpretation"),
|
|
|
|
|
|
|
| 144 |
}
|
| 145 |
+
for r in susceptibility
|
| 146 |
+
]
|
| 147 |
|
| 148 |
state["mic_data"] = mic_data
|
| 149 |
+
state["labs_parsed"] = [
|
| 150 |
+
{
|
| 151 |
+
"name": org.get("organism_name", "Unknown"),
|
| 152 |
+
"value": org.get("colony_count", ""),
|
| 153 |
+
"flag": "pathogen" if org.get("significance") == "pathogen" else None,
|
| 154 |
+
}
|
| 155 |
+
for org in organisms
|
| 156 |
+
]
|
| 157 |
state["route_to_trend_analyst"] = len(mic_data) > 0
|
| 158 |
|
|
|
|
| 159 |
critical = parsed.get("critical_findings", [])
|
| 160 |
if critical:
|
| 161 |
state.setdefault("safety_warnings", []).extend(critical)
|
|
|
|
| 162 |
else:
|
| 163 |
state["vision_notes"] = response
|
| 164 |
state["route_to_trend_analyst"] = False
|
|
|
|
| 173 |
return state
|
| 174 |
|
| 175 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
def run_trend_analyst(state: InfectionState) -> InfectionState:
|
| 177 |
+
"""Analyze MIC trends per organism-antibiotic pair and flag high-risk creep."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 178 |
logger.info("Running Trend Analyst agent...")
|
| 179 |
|
| 180 |
mic_data = state.get("mic_data", [])
|
|
|
|
| 183 |
state["trend_notes"] = "No MIC data available for trend analysis"
|
| 184 |
return state
|
| 185 |
|
|
|
|
| 186 |
trend_results = []
|
| 187 |
|
| 188 |
for mic in mic_data:
|
| 189 |
organism = mic.get("organism", "Unknown")
|
| 190 |
antibiotic = mic.get("antibiotic", "Unknown")
|
| 191 |
|
|
|
|
| 192 |
rag_context = get_context_for_agent(
|
| 193 |
agent_name="trend_analyst",
|
| 194 |
query=f"breakpoint {organism} {antibiotic}",
|
|
|
|
| 199 |
},
|
| 200 |
)
|
| 201 |
|
| 202 |
+
# Single time-point history — trend analysis requires historical data in production
|
| 203 |
mic_history = [{"date": "current", "mic_value": mic.get("mic_value", "0")}]
|
| 204 |
|
|
|
|
| 205 |
prompt = f"{TREND_ANALYST_SYSTEM}\n\n{TREND_ANALYST_PROMPT.format(
|
| 206 |
organism=organism,
|
| 207 |
antibiotic=antibiotic,
|
|
|
|
| 211 |
)}"
|
| 212 |
|
| 213 |
try:
|
| 214 |
+
# Agent 3 is designed for MedGemma 27B; on limited GPU the env var maps this to 4B
|
| 215 |
response = run_inference(
|
| 216 |
prompt=prompt,
|
| 217 |
+
model_name="medgemma_27b",
|
| 218 |
max_new_tokens=1024,
|
| 219 |
temperature=0.2,
|
| 220 |
)
|
|
|
|
| 221 |
parsed = safe_json_parse(response)
|
| 222 |
if parsed:
|
| 223 |
trend_results.append(parsed)
|
|
|
|
|
|
|
| 224 |
risk_level = parsed.get("risk_level", "LOW")
|
| 225 |
if risk_level in ["HIGH", "CRITICAL"]:
|
| 226 |
warning = f"MIC trend alert for {organism}/{antibiotic}: {parsed.get('recommendation', 'Review needed')}"
|
|
|
|
| 232 |
logger.error(f"Trend analysis error for {organism}/{antibiotic}: {e}")
|
| 233 |
trend_results.append({"error": str(e)})
|
| 234 |
|
|
|
|
| 235 |
state["trend_notes"] = json.dumps(trend_results, indent=2)
|
| 236 |
|
|
|
|
| 237 |
high_risk_count = sum(1 for t in trend_results if t.get("risk_level") in ["HIGH", "CRITICAL"])
|
| 238 |
state["mic_trend_summary"] = f"Analyzed {len(trend_results)} organism-antibiotic pairs. High-risk findings: {high_risk_count}"
|
| 239 |
|
|
|
|
| 241 |
return state
|
| 242 |
|
| 243 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 244 |
def run_clinical_pharmacologist(state: InfectionState) -> InfectionState:
|
| 245 |
+
"""Synthesize all agent outputs into a final antibiotic recommendation with safety checks."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 246 |
logger.info("Running Clinical Pharmacologist agent...")
|
| 247 |
|
|
|
|
| 248 |
intake_summary = state.get("intake_notes", "No intake data")
|
| 249 |
lab_results = state.get("vision_notes", "No lab data")
|
| 250 |
trend_analysis = state.get("trend_notes", "No trend data")
|
| 251 |
|
|
|
|
| 252 |
query = f"treatment {state.get('suspected_source', '')} antibiotic recommendation"
|
| 253 |
rag_context = get_context_for_agent(
|
| 254 |
agent_name="clinical_pharmacologist",
|
| 255 |
query=query,
|
| 256 |
+
patient_context={"proposed_antibiotic": None},
|
|
|
|
|
|
|
| 257 |
)
|
| 258 |
|
|
|
|
| 259 |
prompt = f"{CLINICAL_PHARMACOLOGIST_SYSTEM}\n\n{CLINICAL_PHARMACOLOGIST_PROMPT.format(
|
| 260 |
intake_summary=intake_summary,
|
| 261 |
lab_results=lab_results,
|
|
|
|
| 272 |
)}"
|
| 273 |
|
| 274 |
try:
|
| 275 |
+
response = run_inference(prompt=prompt, model_name="medgemma_4b", max_new_tokens=2048, temperature=0.2)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 276 |
parsed = safe_json_parse(response)
|
| 277 |
if parsed:
|
| 278 |
state["pharmacology_notes"] = json.dumps(parsed, indent=2)
|
| 279 |
|
|
|
|
| 280 |
primary = parsed.get("primary_recommendation", {})
|
| 281 |
recommendation = {
|
| 282 |
"primary_antibiotic": primary.get("antibiotic"),
|
|
|
|
| 289 |
"safety_alerts": [a.get("message") for a in parsed.get("safety_alerts", [])],
|
| 290 |
}
|
| 291 |
|
|
|
|
| 292 |
alt = parsed.get("alternative_recommendation", {})
|
| 293 |
if alt.get("antibiotic"):
|
| 294 |
recommendation["backup_antibiotic"] = alt.get("antibiotic")
|
| 295 |
|
| 296 |
state["recommendation"] = recommendation
|
| 297 |
|
|
|
|
| 298 |
for alert in parsed.get("safety_alerts", []):
|
| 299 |
if alert.get("level") in ["WARNING", "CRITICAL"]:
|
| 300 |
state.setdefault("safety_warnings", []).append(alert.get("message"))
|
| 301 |
|
|
|
|
| 302 |
if primary.get("antibiotic"):
|
| 303 |
safety_result = _run_txgemma_safety_check(
|
| 304 |
antibiotic=primary.get("antibiotic"),
|
|
|
|
| 311 |
)
|
| 312 |
if safety_result:
|
| 313 |
state.setdefault("debug_log", []).append(f"TxGemma safety: {safety_result}")
|
|
|
|
| 314 |
else:
|
| 315 |
state["pharmacology_notes"] = response
|
| 316 |
state["recommendation"] = {"rationale": response}
|
|
|
|
| 324 |
return state
|
| 325 |
|
| 326 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 327 |
def _format_patient_data(state: InfectionState) -> str:
|
| 328 |
+
"""Format patient fields from state into a readable string for prompt injection."""
|
| 329 |
lines = []
|
| 330 |
|
| 331 |
if state.get("patient_id"):
|
|
|
|
| 370 |
crcl: Optional[float],
|
| 371 |
medications: list,
|
| 372 |
) -> Optional[str]:
|
| 373 |
+
"""Run a supplementary TxGemma toxicology check on the proposed prescription."""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 374 |
try:
|
| 375 |
prompt = TXGEMMA_SAFETY_PROMPT.format(
|
| 376 |
antibiotic=antibiotic,
|
|
|
|
| 381 |
crcl=crcl or "Unknown",
|
| 382 |
medications=", ".join(medications) if medications else "None",
|
| 383 |
)
|
| 384 |
+
# Agent 4 safety check uses TxGemma 9B; on limited GPU the env var maps this to 2B
|
| 385 |
+
return run_inference(prompt=prompt, model_name="txgemma_9b", max_new_tokens=256, temperature=0.1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 386 |
except Exception as e:
|
| 387 |
logger.warning(f"TxGemma safety check failed: {e}")
|
| 388 |
return None
|
| 389 |
|
| 390 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 391 |
AGENTS = {
|
| 392 |
"intake_historian": run_intake_historian,
|
| 393 |
"vision_specialist": run_vision_specialist,
|
|
|
|
| 397 |
|
| 398 |
|
| 399 |
def run_agent(agent_name: str, state: InfectionState) -> InfectionState:
|
| 400 |
+
"""Dispatch to a named agent."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 401 |
if agent_name not in AGENTS:
|
| 402 |
raise ValueError(f"Unknown agent: {agent_name}")
|
|
|
|
| 403 |
return AGENTS[agent_name](state)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/config.py
CHANGED
|
@@ -1,6 +1,4 @@
|
|
| 1 |
|
| 2 |
-
from __future__ import annotations
|
| 3 |
-
|
| 4 |
import os
|
| 5 |
from functools import lru_cache
|
| 6 |
from pathlib import Path
|
|
@@ -9,104 +7,63 @@ from typing import Literal, Optional
|
|
| 9 |
from dotenv import load_dotenv
|
| 10 |
from pydantic import BaseModel, Field
|
| 11 |
|
| 12 |
-
|
| 13 |
-
# Load variables from a local .env if present (handy for local dev)
|
| 14 |
load_dotenv()
|
| 15 |
|
| 16 |
|
| 17 |
class Settings(BaseModel):
|
| 18 |
"""
|
| 19 |
-
|
| 20 |
|
| 21 |
-
|
| 22 |
-
|
| 23 |
"""
|
| 24 |
|
| 25 |
-
# ------------------------------------------------------------------
|
| 26 |
-
# General environment
|
| 27 |
-
# ------------------------------------------------------------------
|
| 28 |
environment: Literal["local", "kaggle", "production"] = Field(
|
| 29 |
default_factory=lambda: os.getenv("MEDIC_ENV", "local")
|
| 30 |
)
|
| 31 |
-
|
| 32 |
project_root: Path = Field(
|
| 33 |
default_factory=lambda: Path(__file__).resolve().parents[1]
|
| 34 |
)
|
| 35 |
-
|
| 36 |
data_dir: Path = Field(
|
| 37 |
-
default_factory=lambda: Path(
|
| 38 |
-
os.getenv("MEDIC_DATA_DIR", "data")
|
| 39 |
-
)
|
| 40 |
)
|
| 41 |
-
|
| 42 |
chroma_db_dir: Path = Field(
|
| 43 |
-
default_factory=lambda: Path(
|
| 44 |
-
os.getenv("MEDIC_CHROMA_DB_DIR", "data/chroma_db")
|
| 45 |
-
)
|
| 46 |
)
|
| 47 |
|
| 48 |
-
# ------------------------------------------------------------------
|
| 49 |
-
# Model + deployment preferences
|
| 50 |
-
# ------------------------------------------------------------------
|
| 51 |
default_backend: Literal["vertex", "local"] = Field(
|
| 52 |
default_factory=lambda: os.getenv("MEDIC_DEFAULT_BACKEND", "vertex") # type: ignore[arg-type]
|
| 53 |
)
|
| 54 |
-
|
| 55 |
-
# Quantization mode for local models
|
| 56 |
quantization: Literal["none", "4bit"] = Field(
|
| 57 |
default_factory=lambda: os.getenv("MEDIC_QUANTIZATION", "4bit") # type: ignore[arg-type]
|
| 58 |
)
|
| 59 |
-
|
| 60 |
-
# Embedding model used for ChromaDB / RAG
|
| 61 |
embedding_model_name: str = Field(
|
| 62 |
-
default_factory=lambda: os.getenv(
|
| 63 |
-
"MEDIC_EMBEDDING_MODEL",
|
| 64 |
-
"sentence-transformers/all-MiniLM-L6-v2",
|
| 65 |
-
)
|
| 66 |
)
|
| 67 |
|
| 68 |
-
#
|
| 69 |
-
# Vertex AI configuration (MedGemma / TxGemma hosted on Vertex)
|
| 70 |
-
# ------------------------------------------------------------------
|
| 71 |
use_vertex: bool = Field(
|
| 72 |
-
default_factory=lambda: os.getenv("MEDIC_USE_VERTEX", "true").lower()
|
| 73 |
-
in {"1", "true", "yes"}
|
| 74 |
)
|
| 75 |
-
|
| 76 |
vertex_project_id: Optional[str] = Field(
|
| 77 |
default_factory=lambda: os.getenv("MEDIC_VERTEX_PROJECT_ID")
|
| 78 |
)
|
| 79 |
vertex_location: str = Field(
|
| 80 |
default_factory=lambda: os.getenv("MEDIC_VERTEX_LOCATION", "us-central1")
|
| 81 |
)
|
| 82 |
-
|
| 83 |
-
# Model IDs as expected by Vertex / langchain-google-vertexai
|
| 84 |
vertex_medgemma_4b_model: str = Field(
|
| 85 |
-
default_factory=lambda: os.getenv(
|
| 86 |
-
"MEDIC_VERTEX_MEDGEMMA_4B_MODEL",
|
| 87 |
-
"med-gemma-4b-it",
|
| 88 |
-
)
|
| 89 |
)
|
| 90 |
vertex_medgemma_27b_model: str = Field(
|
| 91 |
-
default_factory=lambda: os.getenv(
|
| 92 |
-
"MEDIC_VERTEX_MEDGEMMA_27B_MODEL",
|
| 93 |
-
"med-gemma-27b-text-it",
|
| 94 |
-
)
|
| 95 |
)
|
| 96 |
vertex_txgemma_9b_model: str = Field(
|
| 97 |
-
default_factory=lambda: os.getenv(
|
| 98 |
-
"MEDIC_VERTEX_TXGEMMA_9B_MODEL",
|
| 99 |
-
"tx-gemma-9b",
|
| 100 |
-
)
|
| 101 |
)
|
| 102 |
vertex_txgemma_2b_model: str = Field(
|
| 103 |
-
default_factory=lambda: os.getenv(
|
| 104 |
-
"MEDIC_VERTEX_TXGEMMA_2B_MODEL",
|
| 105 |
-
"tx-gemma-2b",
|
| 106 |
-
)
|
| 107 |
)
|
| 108 |
-
|
| 109 |
-
# Standard GOOGLE_APPLICATION_CREDENTIALS path, if needed
|
| 110 |
google_application_credentials: Optional[Path] = Field(
|
| 111 |
default_factory=lambda: (
|
| 112 |
Path(os.environ["GOOGLE_APPLICATION_CREDENTIALS"])
|
|
@@ -115,9 +72,7 @@ class Settings(BaseModel):
|
|
| 115 |
)
|
| 116 |
)
|
| 117 |
|
| 118 |
-
#
|
| 119 |
-
# Local model paths (for offline / Kaggle GPU usage)
|
| 120 |
-
# ------------------------------------------------------------------
|
| 121 |
local_medgemma_4b_model: Optional[str] = Field(
|
| 122 |
default_factory=lambda: os.getenv("MEDIC_LOCAL_MEDGEMMA_4B_MODEL")
|
| 123 |
)
|
|
@@ -134,17 +89,6 @@ class Settings(BaseModel):
|
|
| 134 |
|
| 135 |
@lru_cache(maxsize=1)
|
| 136 |
def get_settings() -> Settings:
|
| 137 |
-
"""
|
| 138 |
-
Return a cached Settings instance.
|
| 139 |
-
|
| 140 |
-
Use this helper everywhere instead of instantiating Settings directly:
|
| 141 |
-
|
| 142 |
-
from src.config import get_settings
|
| 143 |
-
settings = get_settings()
|
| 144 |
-
"""
|
| 145 |
-
|
| 146 |
return Settings()
|
| 147 |
|
| 148 |
-
|
| 149 |
-
__all__ = ["Settings", "get_settings"]
|
| 150 |
-
|
|
|
|
| 1 |
|
|
|
|
|
|
|
| 2 |
import os
|
| 3 |
from functools import lru_cache
|
| 4 |
from pathlib import Path
|
|
|
|
| 7 |
from dotenv import load_dotenv
|
| 8 |
from pydantic import BaseModel, Field
|
| 9 |
|
|
|
|
|
|
|
| 10 |
load_dotenv()
|
| 11 |
|
| 12 |
|
| 13 |
class Settings(BaseModel):
|
| 14 |
"""
|
| 15 |
+
All configuration for Med-I-C, read from environment variables.
|
| 16 |
|
| 17 |
+
Supports three deployment targets via MEDIC_ENV: local, kaggle, production.
|
| 18 |
+
Backend selection (vertex or local) is controlled by MEDIC_DEFAULT_BACKEND.
|
| 19 |
"""
|
| 20 |
|
|
|
|
|
|
|
|
|
|
| 21 |
environment: Literal["local", "kaggle", "production"] = Field(
|
| 22 |
default_factory=lambda: os.getenv("MEDIC_ENV", "local")
|
| 23 |
)
|
|
|
|
| 24 |
project_root: Path = Field(
|
| 25 |
default_factory=lambda: Path(__file__).resolve().parents[1]
|
| 26 |
)
|
|
|
|
| 27 |
data_dir: Path = Field(
|
| 28 |
+
default_factory=lambda: Path(os.getenv("MEDIC_DATA_DIR", "data"))
|
|
|
|
|
|
|
| 29 |
)
|
|
|
|
| 30 |
chroma_db_dir: Path = Field(
|
| 31 |
+
default_factory=lambda: Path(os.getenv("MEDIC_CHROMA_DB_DIR", "data/chroma_db"))
|
|
|
|
|
|
|
| 32 |
)
|
| 33 |
|
|
|
|
|
|
|
|
|
|
| 34 |
default_backend: Literal["vertex", "local"] = Field(
|
| 35 |
default_factory=lambda: os.getenv("MEDIC_DEFAULT_BACKEND", "vertex") # type: ignore[arg-type]
|
| 36 |
)
|
| 37 |
+
# 4-bit quantization via bitsandbytes (local backend only)
|
|
|
|
| 38 |
quantization: Literal["none", "4bit"] = Field(
|
| 39 |
default_factory=lambda: os.getenv("MEDIC_QUANTIZATION", "4bit") # type: ignore[arg-type]
|
| 40 |
)
|
|
|
|
|
|
|
| 41 |
embedding_model_name: str = Field(
|
| 42 |
+
default_factory=lambda: os.getenv("MEDIC_EMBEDDING_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
|
|
|
|
|
|
|
|
|
|
| 43 |
)
|
| 44 |
|
| 45 |
+
# Vertex AI settings
|
|
|
|
|
|
|
| 46 |
use_vertex: bool = Field(
|
| 47 |
+
default_factory=lambda: os.getenv("MEDIC_USE_VERTEX", "true").lower() in {"1", "true", "yes"}
|
|
|
|
| 48 |
)
|
|
|
|
| 49 |
vertex_project_id: Optional[str] = Field(
|
| 50 |
default_factory=lambda: os.getenv("MEDIC_VERTEX_PROJECT_ID")
|
| 51 |
)
|
| 52 |
vertex_location: str = Field(
|
| 53 |
default_factory=lambda: os.getenv("MEDIC_VERTEX_LOCATION", "us-central1")
|
| 54 |
)
|
|
|
|
|
|
|
| 55 |
vertex_medgemma_4b_model: str = Field(
|
| 56 |
+
default_factory=lambda: os.getenv("MEDIC_VERTEX_MEDGEMMA_4B_MODEL", "med-gemma-4b-it")
|
|
|
|
|
|
|
|
|
|
| 57 |
)
|
| 58 |
vertex_medgemma_27b_model: str = Field(
|
| 59 |
+
default_factory=lambda: os.getenv("MEDIC_VERTEX_MEDGEMMA_27B_MODEL", "med-gemma-27b-text-it")
|
|
|
|
|
|
|
|
|
|
| 60 |
)
|
| 61 |
vertex_txgemma_9b_model: str = Field(
|
| 62 |
+
default_factory=lambda: os.getenv("MEDIC_VERTEX_TXGEMMA_9B_MODEL", "tx-gemma-9b")
|
|
|
|
|
|
|
|
|
|
| 63 |
)
|
| 64 |
vertex_txgemma_2b_model: str = Field(
|
| 65 |
+
default_factory=lambda: os.getenv("MEDIC_VERTEX_TXGEMMA_2B_MODEL", "tx-gemma-2b")
|
|
|
|
|
|
|
|
|
|
| 66 |
)
|
|
|
|
|
|
|
| 67 |
google_application_credentials: Optional[Path] = Field(
|
| 68 |
default_factory=lambda: (
|
| 69 |
Path(os.environ["GOOGLE_APPLICATION_CREDENTIALS"])
|
|
|
|
| 72 |
)
|
| 73 |
)
|
| 74 |
|
| 75 |
+
# Local HuggingFace model paths (used when MEDIC_DEFAULT_BACKEND=local)
|
|
|
|
|
|
|
| 76 |
local_medgemma_4b_model: Optional[str] = Field(
|
| 77 |
default_factory=lambda: os.getenv("MEDIC_LOCAL_MEDGEMMA_4B_MODEL")
|
| 78 |
)
|
|
|
|
| 89 |
|
| 90 |
@lru_cache(maxsize=1)
|
| 91 |
def get_settings() -> Settings:
|
| 92 |
+
"""Return the cached Settings singleton. Import this instead of instantiating Settings directly."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
return Settings()
|
| 94 |
|
|
|
|
|
|
|
|
|
src/db/import_data.py
CHANGED
|
@@ -1,7 +1,6 @@
|
|
| 1 |
"""Data import scripts for Med-I-C structured documents."""
|
| 2 |
|
| 3 |
import pandas as pd
|
| 4 |
-
import re
|
| 5 |
from pathlib import Path
|
| 6 |
from .database import (
|
| 7 |
get_connection, init_database, execute_many,
|
|
@@ -10,7 +9,7 @@ from .database import (
|
|
| 10 |
|
| 11 |
|
| 12 |
def safe_float(value):
|
| 13 |
-
"""
|
| 14 |
if pd.isna(value):
|
| 15 |
return None
|
| 16 |
try:
|
|
@@ -20,7 +19,7 @@ def safe_float(value):
|
|
| 20 |
|
| 21 |
|
| 22 |
def safe_int(value):
|
| 23 |
-
"""
|
| 24 |
if pd.isna(value):
|
| 25 |
return None
|
| 26 |
try:
|
|
@@ -29,41 +28,46 @@ def safe_int(value):
|
|
| 29 |
return None
|
| 30 |
|
| 31 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
def classify_severity(description: str) -> str:
|
| 33 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
if not description:
|
| 35 |
return "unknown"
|
| 36 |
|
| 37 |
desc_lower = description.lower()
|
| 38 |
|
| 39 |
-
# Major severity indicators
|
| 40 |
major_keywords = [
|
| 41 |
"cardiotoxic", "nephrotoxic", "hepatotoxic", "neurotoxic",
|
| 42 |
"fatal", "death", "severe", "contraindicated", "arrhythmia",
|
| 43 |
"qt prolongation", "seizure", "bleeding", "hemorrhage",
|
| 44 |
-
"serotonin syndrome", "neuroleptic malignant"
|
| 45 |
]
|
| 46 |
-
|
| 47 |
-
# Moderate severity indicators
|
| 48 |
moderate_keywords = [
|
| 49 |
"increase", "decrease", "reduce", "enhance", "inhibit",
|
| 50 |
"metabolism", "concentration", "absorption", "excretion",
|
| 51 |
-
"therapeutic effect", "adverse effect", "toxicity"
|
| 52 |
]
|
| 53 |
|
| 54 |
-
for
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
for keyword in moderate_keywords:
|
| 59 |
-
if keyword in desc_lower:
|
| 60 |
-
return "moderate"
|
| 61 |
-
|
| 62 |
return "minor"
|
| 63 |
|
| 64 |
|
| 65 |
def import_eml_antibiotics() -> int:
|
| 66 |
-
"""Import WHO EML antibiotic classification data."""
|
| 67 |
print("Importing EML antibiotic data...")
|
| 68 |
|
| 69 |
eml_files = {
|
|
@@ -79,29 +83,21 @@ def import_eml_antibiotics() -> int:
|
|
| 79 |
continue
|
| 80 |
|
| 81 |
try:
|
| 82 |
-
# Use openpyxl directly with read_only=True for faster loading
|
| 83 |
import openpyxl
|
| 84 |
wb = openpyxl.load_workbook(filepath, read_only=True)
|
| 85 |
ws = wb.active
|
| 86 |
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
|
| 92 |
-
|
| 93 |
-
for row_idx, row in enumerate(ws.iter_rows(min_row=2, values_only=True), start=2):
|
| 94 |
row_dict = dict(zip(headers, row))
|
| 95 |
-
|
| 96 |
medicine = str(row_dict.get('medicine_name', row_dict.get('medicine', '')))
|
| 97 |
-
if not medicine or medicine
|
| 98 |
continue
|
| 99 |
|
| 100 |
-
def safe_str(val):
|
| 101 |
-
if val is None or pd.isna(val):
|
| 102 |
-
return ''
|
| 103 |
-
return str(val)
|
| 104 |
-
|
| 105 |
records.append((
|
| 106 |
medicine,
|
| 107 |
category,
|
|
@@ -114,20 +110,20 @@ def import_eml_antibiotics() -> int:
|
|
| 114 |
))
|
| 115 |
|
| 116 |
wb.close()
|
| 117 |
-
print(f" Loaded {
|
| 118 |
|
| 119 |
except Exception as e:
|
| 120 |
print(f" Warning: Error reading {filepath}: {e}")
|
| 121 |
continue
|
| 122 |
|
| 123 |
if records:
|
| 124 |
-
|
| 125 |
-
INSERT INTO eml_antibiotics
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
print(f" Imported {len(records)} EML antibiotic records total")
|
| 132 |
|
| 133 |
return len(records)
|
|
@@ -143,95 +139,78 @@ def import_atlas_susceptibility() -> int:
|
|
| 143 |
print(f" Warning: {filepath} not found, skipping...")
|
| 144 |
return 0
|
| 145 |
|
| 146 |
-
# Read the raw data to find the header row and extract region
|
| 147 |
df_raw = pd.read_excel(filepath, sheet_name="Percent", header=None)
|
| 148 |
|
| 149 |
-
#
|
| 150 |
region = "Unknown"
|
| 151 |
-
for
|
| 152 |
cell = str(row.iloc[0]) if pd.notna(row.iloc[0]) else ""
|
| 153 |
if "from" in cell.lower():
|
| 154 |
-
# Extract country from "Percentage Susceptibility from Argentina"
|
| 155 |
parts = cell.split("from")
|
| 156 |
if len(parts) > 1:
|
| 157 |
region = parts[1].strip()
|
| 158 |
break
|
| 159 |
|
| 160 |
-
#
|
| 161 |
-
header_row = 4
|
| 162 |
for idx, row in df_raw.head(10).iterrows():
|
| 163 |
if any('Antibacterial' in str(v) for v in row.values if pd.notna(v)):
|
| 164 |
header_row = idx
|
| 165 |
break
|
| 166 |
|
| 167 |
-
# Read with proper header
|
| 168 |
df = pd.read_excel(filepath, sheet_name="Percent", header=header_row)
|
| 169 |
-
|
| 170 |
-
# Standardize column names
|
| 171 |
df.columns = [str(col).strip().lower().replace(' ', '_').replace('.', '') for col in df.columns]
|
| 172 |
|
| 173 |
records = []
|
| 174 |
for _, row in df.iterrows():
|
| 175 |
antibiotic = str(row.get('antibacterial', ''))
|
| 176 |
-
|
| 177 |
-
# Skip empty or non-antibiotic rows
|
| 178 |
if not antibiotic or antibiotic == 'nan' or 'omitted' in antibiotic.lower():
|
| 179 |
continue
|
| 180 |
if 'in vitro' in antibiotic.lower() or 'table cells' in antibiotic.lower():
|
| 181 |
continue
|
| 182 |
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
pct_s = row.get('susc', row.get('susceptible', None))
|
| 186 |
-
pct_i = row.get('int', row.get('intermediate', None))
|
| 187 |
-
pct_r = row.get('res', row.get('resistant', None))
|
| 188 |
-
|
| 189 |
-
# Use safe conversion functions
|
| 190 |
-
n_int = safe_int(n_value)
|
| 191 |
-
s_float = safe_float(pct_s)
|
| 192 |
|
| 193 |
if n_int is not None and s_float is not None:
|
| 194 |
records.append((
|
| 195 |
-
"General",
|
| 196 |
-
"",
|
| 197 |
antibiotic,
|
| 198 |
s_float,
|
| 199 |
-
safe_float(
|
| 200 |
-
safe_float(
|
| 201 |
n_int,
|
| 202 |
-
2024,
|
| 203 |
region,
|
| 204 |
-
"ATLAS"
|
| 205 |
))
|
| 206 |
|
| 207 |
if records:
|
| 208 |
-
|
| 209 |
-
INSERT INTO atlas_susceptibility
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
print(f" Imported {len(records)} ATLAS susceptibility records from {region}")
|
| 217 |
|
| 218 |
return len(records)
|
| 219 |
|
| 220 |
|
| 221 |
def import_mic_breakpoints() -> int:
|
| 222 |
-
"""Import EUCAST MIC breakpoint tables."""
|
| 223 |
print("Importing MIC breakpoint data...")
|
| 224 |
|
| 225 |
filepath = DOCS_DIR / "mic_breakpoints" / "v_16.0__BreakpointTables.xlsx"
|
| 226 |
-
|
| 227 |
if not filepath.exists():
|
| 228 |
print(f" Warning: {filepath} not found, skipping...")
|
| 229 |
return 0
|
| 230 |
|
| 231 |
-
# Get all sheet names
|
| 232 |
xl = pd.ExcelFile(filepath)
|
| 233 |
-
|
| 234 |
-
# Skip non-pathogen sheets
|
| 235 |
skip_sheets = {'Content', 'Changes', 'Notes', 'Guidance', 'Dosages',
|
| 236 |
'Technical uncertainty', 'PK PD breakpoints', 'PK PD cutoffs'}
|
| 237 |
|
|
@@ -239,58 +218,48 @@ def import_mic_breakpoints() -> int:
|
|
| 239 |
for sheet_name in xl.sheet_names:
|
| 240 |
if sheet_name in skip_sheets:
|
| 241 |
continue
|
| 242 |
-
|
| 243 |
try:
|
| 244 |
df = pd.read_excel(filepath, sheet_name=sheet_name, header=None)
|
| 245 |
-
|
| 246 |
-
# Try to find antibiotic data - look for rows with MIC values
|
| 247 |
-
pathogen_group = sheet_name
|
| 248 |
-
|
| 249 |
-
# Simple heuristic: look for rows that might contain antibiotic names and MIC values
|
| 250 |
-
for idx, row in df.iterrows():
|
| 251 |
row_values = [str(v).strip() for v in row.values if pd.notna(v)]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 252 |
|
| 253 |
-
#
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
records.append((
|
| 272 |
-
pathogen_group,
|
| 273 |
-
potential_antibiotic,
|
| 274 |
-
None, # route
|
| 275 |
-
mic_values[0] if len(mic_values) > 0 else None, # S breakpoint
|
| 276 |
-
mic_values[1] if len(mic_values) > 1 else None, # R breakpoint
|
| 277 |
-
None, # disk S
|
| 278 |
-
None, # disk R
|
| 279 |
-
None, # notes
|
| 280 |
-
"16.0"
|
| 281 |
-
))
|
| 282 |
except Exception as e:
|
| 283 |
print(f" Warning: Could not parse sheet '{sheet_name}': {e}")
|
| 284 |
continue
|
| 285 |
|
| 286 |
if records:
|
| 287 |
-
|
| 288 |
-
INSERT INTO mic_breakpoints
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
|
| 294 |
print(f" Imported {len(records)} MIC breakpoint records")
|
| 295 |
|
| 296 |
return len(records)
|
|
@@ -303,36 +272,32 @@ INTERACTIONS_CSV = DOCS_DIR / "drug_safety" / "db_drug_interactions.csv"
|
|
| 303 |
|
| 304 |
def _resolve_interactions_csv() -> Path | None:
|
| 305 |
"""
|
| 306 |
-
|
| 307 |
|
| 308 |
-
|
| 309 |
-
1. docs/drug_safety/db_drug_interactions.csv
|
| 310 |
-
2. /kaggle/input/drug-drug-interactions/
|
| 311 |
-
3. Kaggle API download
|
| 312 |
"""
|
| 313 |
-
# 1. Already present
|
| 314 |
if INTERACTIONS_CSV.exists():
|
| 315 |
return INTERACTIONS_CSV
|
| 316 |
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
| 321 |
|
| 322 |
-
# 3. Download via Kaggle API
|
| 323 |
print(f" CSV not found — downloading from Kaggle dataset '{KAGGLE_DATASET}' ...")
|
| 324 |
try:
|
| 325 |
-
import kaggle # noqa: F401
|
|
|
|
| 326 |
dest = INTERACTIONS_CSV.parent
|
| 327 |
dest.mkdir(parents=True, exist_ok=True)
|
| 328 |
-
import subprocess
|
| 329 |
result = subprocess.run(
|
| 330 |
-
["kaggle", "datasets", "download", "-d", KAGGLE_DATASET,
|
| 331 |
-
"--unzip", "-p", str(dest)],
|
| 332 |
capture_output=True, text=True,
|
| 333 |
)
|
| 334 |
if result.returncode == 0:
|
| 335 |
-
# Find the downloaded CSV
|
| 336 |
for f in dest.glob("*.csv"):
|
| 337 |
print(f" Downloaded: {f.name}")
|
| 338 |
return f
|
|
@@ -347,23 +312,18 @@ def _resolve_interactions_csv() -> Path | None:
|
|
| 347 |
|
| 348 |
|
| 349 |
def import_drug_interactions(limit: int = None) -> int:
|
| 350 |
-
"""Import drug-drug
|
| 351 |
print("Importing drug interactions data...")
|
| 352 |
|
| 353 |
filepath = _resolve_interactions_csv()
|
| 354 |
-
|
| 355 |
if filepath is None:
|
| 356 |
print(" Skipping drug interactions — CSV unavailable.")
|
| 357 |
print(f" To fix: attach the Kaggle dataset '{KAGGLE_DATASET}' to your notebook,")
|
| 358 |
print(" or set up ~/.kaggle/kaggle.json for API access.")
|
| 359 |
return 0
|
| 360 |
|
| 361 |
-
# Read CSV in chunks due to large size
|
| 362 |
-
chunk_size = 10000
|
| 363 |
total_records = 0
|
| 364 |
-
|
| 365 |
-
for chunk in pd.read_csv(filepath, chunksize=chunk_size):
|
| 366 |
-
# Standardize column names
|
| 367 |
chunk.columns = [col.strip().lower().replace(' ', '_') for col in chunk.columns]
|
| 368 |
|
| 369 |
records = []
|
|
@@ -372,19 +332,14 @@ def import_drug_interactions(limit: int = None) -> int:
|
|
| 372 |
drug_2 = str(row.get('drug_2', row.get('drug2', row.iloc[1] if len(row) > 1 else '')))
|
| 373 |
description = str(row.get('interaction_description', row.get('description',
|
| 374 |
row.get('interaction', row.iloc[2] if len(row) > 2 else ''))))
|
| 375 |
-
|
| 376 |
-
severity = classify_severity(description)
|
| 377 |
-
|
| 378 |
if drug_1 and drug_2:
|
| 379 |
-
records.append((drug_1, drug_2, description,
|
| 380 |
|
| 381 |
if records:
|
| 382 |
-
|
| 383 |
-
INSERT INTO drug_interactions
|
| 384 |
-
|
| 385 |
-
|
| 386 |
-
"""
|
| 387 |
-
execute_many(query, records)
|
| 388 |
total_records += len(records)
|
| 389 |
|
| 390 |
if limit and total_records >= limit:
|
|
@@ -395,24 +350,19 @@ def import_drug_interactions(limit: int = None) -> int:
|
|
| 395 |
|
| 396 |
|
| 397 |
def import_all_data(interactions_limit: int = None) -> dict:
|
| 398 |
-
"""
|
| 399 |
print(f"\n{'='*50}")
|
| 400 |
print("Med-I-C Data Import")
|
| 401 |
print(f"{'='*50}\n")
|
| 402 |
|
| 403 |
-
# Initialize database
|
| 404 |
init_database()
|
| 405 |
|
| 406 |
-
# Clear existing data
|
| 407 |
with get_connection() as conn:
|
| 408 |
-
|
| 409 |
-
|
| 410 |
-
conn.execute("DELETE FROM mic_breakpoints")
|
| 411 |
-
conn.execute("DELETE FROM drug_interactions")
|
| 412 |
conn.commit()
|
| 413 |
print("Cleared existing data\n")
|
| 414 |
|
| 415 |
-
# Import all data
|
| 416 |
results = {
|
| 417 |
"eml_antibiotics": import_eml_antibiotics(),
|
| 418 |
"atlas_susceptibility": import_atlas_susceptibility(),
|
|
@@ -430,5 +380,4 @@ def import_all_data(interactions_limit: int = None) -> dict:
|
|
| 430 |
|
| 431 |
|
| 432 |
if __name__ == "__main__":
|
| 433 |
-
# Import with a limit on interactions for faster demo
|
| 434 |
import_all_data(interactions_limit=50000)
|
|
|
|
| 1 |
"""Data import scripts for Med-I-C structured documents."""
|
| 2 |
|
| 3 |
import pandas as pd
|
|
|
|
| 4 |
from pathlib import Path
|
| 5 |
from .database import (
|
| 6 |
get_connection, init_database, execute_many,
|
|
|
|
| 9 |
|
| 10 |
|
| 11 |
def safe_float(value):
|
| 12 |
+
"""Convert value to float; return None if the value is NaN or non-numeric."""
|
| 13 |
if pd.isna(value):
|
| 14 |
return None
|
| 15 |
try:
|
|
|
|
| 19 |
|
| 20 |
|
| 21 |
def safe_int(value):
|
| 22 |
+
"""Convert value to int via float; return None if the value is NaN or non-numeric."""
|
| 23 |
if pd.isna(value):
|
| 24 |
return None
|
| 25 |
try:
|
|
|
|
| 28 |
return None
|
| 29 |
|
| 30 |
|
| 31 |
+
def safe_str(value) -> str:
|
| 32 |
+
"""Convert value to string; return empty string for None or NaN."""
|
| 33 |
+
if value is None or pd.isna(value):
|
| 34 |
+
return ''
|
| 35 |
+
return str(value)
|
| 36 |
+
|
| 37 |
+
|
| 38 |
def classify_severity(description: str) -> str:
|
| 39 |
+
"""
|
| 40 |
+
Classify drug interaction severity from the interaction description text.
|
| 41 |
+
|
| 42 |
+
Returns 'major', 'moderate', or 'minor' based on keyword presence.
|
| 43 |
+
Major keywords take precedence over moderate.
|
| 44 |
+
"""
|
| 45 |
if not description:
|
| 46 |
return "unknown"
|
| 47 |
|
| 48 |
desc_lower = description.lower()
|
| 49 |
|
|
|
|
| 50 |
major_keywords = [
|
| 51 |
"cardiotoxic", "nephrotoxic", "hepatotoxic", "neurotoxic",
|
| 52 |
"fatal", "death", "severe", "contraindicated", "arrhythmia",
|
| 53 |
"qt prolongation", "seizure", "bleeding", "hemorrhage",
|
| 54 |
+
"serotonin syndrome", "neuroleptic malignant",
|
| 55 |
]
|
|
|
|
|
|
|
| 56 |
moderate_keywords = [
|
| 57 |
"increase", "decrease", "reduce", "enhance", "inhibit",
|
| 58 |
"metabolism", "concentration", "absorption", "excretion",
|
| 59 |
+
"therapeutic effect", "adverse effect", "toxicity",
|
| 60 |
]
|
| 61 |
|
| 62 |
+
if any(kw in desc_lower for kw in major_keywords):
|
| 63 |
+
return "major"
|
| 64 |
+
if any(kw in desc_lower for kw in moderate_keywords):
|
| 65 |
+
return "moderate"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
return "minor"
|
| 67 |
|
| 68 |
|
| 69 |
def import_eml_antibiotics() -> int:
|
| 70 |
+
"""Import WHO EML antibiotic classification data from the three AWaRe Excel files."""
|
| 71 |
print("Importing EML antibiotic data...")
|
| 72 |
|
| 73 |
eml_files = {
|
|
|
|
| 83 |
continue
|
| 84 |
|
| 85 |
try:
|
|
|
|
| 86 |
import openpyxl
|
| 87 |
wb = openpyxl.load_workbook(filepath, read_only=True)
|
| 88 |
ws = wb.active
|
| 89 |
|
| 90 |
+
headers = [
|
| 91 |
+
str(cell.value).strip().lower().replace(' ', '_') if cell.value else f'col_{i}'
|
| 92 |
+
for i, cell in enumerate(ws[1])
|
| 93 |
+
]
|
| 94 |
|
| 95 |
+
for row in ws.iter_rows(min_row=2, values_only=True):
|
|
|
|
| 96 |
row_dict = dict(zip(headers, row))
|
|
|
|
| 97 |
medicine = str(row_dict.get('medicine_name', row_dict.get('medicine', '')))
|
| 98 |
+
if not medicine or medicine in ('None', 'nan'):
|
| 99 |
continue
|
| 100 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
records.append((
|
| 102 |
medicine,
|
| 103 |
category,
|
|
|
|
| 110 |
))
|
| 111 |
|
| 112 |
wb.close()
|
| 113 |
+
print(f" Loaded {sum(1 for r in records if r[1] == category)} from {category}")
|
| 114 |
|
| 115 |
except Exception as e:
|
| 116 |
print(f" Warning: Error reading {filepath}: {e}")
|
| 117 |
continue
|
| 118 |
|
| 119 |
if records:
|
| 120 |
+
execute_many(
|
| 121 |
+
"""INSERT INTO eml_antibiotics
|
| 122 |
+
(medicine_name, who_category, eml_section, formulations,
|
| 123 |
+
indication, atc_codes, combined_with, status)
|
| 124 |
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?)""",
|
| 125 |
+
records,
|
| 126 |
+
)
|
| 127 |
print(f" Imported {len(records)} EML antibiotic records total")
|
| 128 |
|
| 129 |
return len(records)
|
|
|
|
| 139 |
print(f" Warning: {filepath} not found, skipping...")
|
| 140 |
return 0
|
| 141 |
|
|
|
|
| 142 |
df_raw = pd.read_excel(filepath, sheet_name="Percent", header=None)
|
| 143 |
|
| 144 |
+
# Title row contains "Percentage Susceptibility from <Country>"
|
| 145 |
region = "Unknown"
|
| 146 |
+
for _, row in df_raw.head(5).iterrows():
|
| 147 |
cell = str(row.iloc[0]) if pd.notna(row.iloc[0]) else ""
|
| 148 |
if "from" in cell.lower():
|
|
|
|
| 149 |
parts = cell.split("from")
|
| 150 |
if len(parts) > 1:
|
| 151 |
region = parts[1].strip()
|
| 152 |
break
|
| 153 |
|
| 154 |
+
# Locate the actual header row by finding "Antibacterial"
|
| 155 |
+
header_row = 4
|
| 156 |
for idx, row in df_raw.head(10).iterrows():
|
| 157 |
if any('Antibacterial' in str(v) for v in row.values if pd.notna(v)):
|
| 158 |
header_row = idx
|
| 159 |
break
|
| 160 |
|
|
|
|
| 161 |
df = pd.read_excel(filepath, sheet_name="Percent", header=header_row)
|
|
|
|
|
|
|
| 162 |
df.columns = [str(col).strip().lower().replace(' ', '_').replace('.', '') for col in df.columns]
|
| 163 |
|
| 164 |
records = []
|
| 165 |
for _, row in df.iterrows():
|
| 166 |
antibiotic = str(row.get('antibacterial', ''))
|
|
|
|
|
|
|
| 167 |
if not antibiotic or antibiotic == 'nan' or 'omitted' in antibiotic.lower():
|
| 168 |
continue
|
| 169 |
if 'in vitro' in antibiotic.lower() or 'table cells' in antibiotic.lower():
|
| 170 |
continue
|
| 171 |
|
| 172 |
+
n_int = safe_int(row.get('n'))
|
| 173 |
+
s_float = safe_float(row.get('susc', row.get('susceptible')))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 174 |
|
| 175 |
if n_int is not None and s_float is not None:
|
| 176 |
records.append((
|
| 177 |
+
"General",
|
| 178 |
+
"",
|
| 179 |
antibiotic,
|
| 180 |
s_float,
|
| 181 |
+
safe_float(row.get('int', row.get('intermediate'))),
|
| 182 |
+
safe_float(row.get('res', row.get('resistant'))),
|
| 183 |
n_int,
|
| 184 |
+
2024,
|
| 185 |
region,
|
| 186 |
+
"ATLAS",
|
| 187 |
))
|
| 188 |
|
| 189 |
if records:
|
| 190 |
+
execute_many(
|
| 191 |
+
"""INSERT INTO atlas_susceptibility
|
| 192 |
+
(species, family, antibiotic, percent_susceptible,
|
| 193 |
+
percent_intermediate, percent_resistant, total_isolates,
|
| 194 |
+
year, region, source)
|
| 195 |
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
|
| 196 |
+
records,
|
| 197 |
+
)
|
| 198 |
print(f" Imported {len(records)} ATLAS susceptibility records from {region}")
|
| 199 |
|
| 200 |
return len(records)
|
| 201 |
|
| 202 |
|
| 203 |
def import_mic_breakpoints() -> int:
|
| 204 |
+
"""Import EUCAST MIC breakpoint tables from the Excel file."""
|
| 205 |
print("Importing MIC breakpoint data...")
|
| 206 |
|
| 207 |
filepath = DOCS_DIR / "mic_breakpoints" / "v_16.0__BreakpointTables.xlsx"
|
|
|
|
| 208 |
if not filepath.exists():
|
| 209 |
print(f" Warning: {filepath} not found, skipping...")
|
| 210 |
return 0
|
| 211 |
|
|
|
|
| 212 |
xl = pd.ExcelFile(filepath)
|
| 213 |
+
# These sheets contain metadata/guidance, not pathogen-specific breakpoints
|
|
|
|
| 214 |
skip_sheets = {'Content', 'Changes', 'Notes', 'Guidance', 'Dosages',
|
| 215 |
'Technical uncertainty', 'PK PD breakpoints', 'PK PD cutoffs'}
|
| 216 |
|
|
|
|
| 218 |
for sheet_name in xl.sheet_names:
|
| 219 |
if sheet_name in skip_sheets:
|
| 220 |
continue
|
|
|
|
| 221 |
try:
|
| 222 |
df = pd.read_excel(filepath, sheet_name=sheet_name, header=None)
|
| 223 |
+
for _, row in df.iterrows():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 224 |
row_values = [str(v).strip() for v in row.values if pd.notna(v)]
|
| 225 |
+
if len(row_values) < 2:
|
| 226 |
+
continue
|
| 227 |
+
|
| 228 |
+
potential_antibiotic = row_values[0]
|
| 229 |
+
if any(kw in potential_antibiotic.lower() for kw in
|
| 230 |
+
['antibiotic', 'agent', 'note', 'disk', 'mic', 'breakpoint']):
|
| 231 |
+
continue
|
| 232 |
|
| 233 |
+
# Extract numeric MIC values; strip inequality signs
|
| 234 |
+
mic_values = []
|
| 235 |
+
for v in row_values[1:]:
|
| 236 |
+
try:
|
| 237 |
+
mic_values.append(float(v.replace('≤', '').replace('>', '').replace('<', '').strip()))
|
| 238 |
+
except (ValueError, AttributeError):
|
| 239 |
+
pass
|
| 240 |
+
|
| 241 |
+
if len(mic_values) >= 2 and len(potential_antibiotic) > 2:
|
| 242 |
+
records.append((
|
| 243 |
+
sheet_name, # pathogen_group
|
| 244 |
+
potential_antibiotic,
|
| 245 |
+
None, # route
|
| 246 |
+
mic_values[0], # S breakpoint
|
| 247 |
+
mic_values[1], # R breakpoint
|
| 248 |
+
None, None, None, # disk S, disk R, notes
|
| 249 |
+
"16.0",
|
| 250 |
+
))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 251 |
except Exception as e:
|
| 252 |
print(f" Warning: Could not parse sheet '{sheet_name}': {e}")
|
| 253 |
continue
|
| 254 |
|
| 255 |
if records:
|
| 256 |
+
execute_many(
|
| 257 |
+
"""INSERT INTO mic_breakpoints
|
| 258 |
+
(pathogen_group, antibiotic, route, mic_susceptible, mic_resistant,
|
| 259 |
+
disk_susceptible, disk_resistant, notes, eucast_version)
|
| 260 |
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""",
|
| 261 |
+
records,
|
| 262 |
+
)
|
| 263 |
print(f" Imported {len(records)} MIC breakpoint records")
|
| 264 |
|
| 265 |
return len(records)
|
|
|
|
| 272 |
|
| 273 |
def _resolve_interactions_csv() -> Path | None:
|
| 274 |
"""
|
| 275 |
+
Find the drug interactions CSV file.
|
| 276 |
|
| 277 |
+
Checks in order:
|
| 278 |
+
1. docs/drug_safety/db_drug_interactions.csv (local)
|
| 279 |
+
2. /kaggle/input/drug-drug-interactions/ (Kaggle notebook with dataset attached)
|
| 280 |
+
3. Kaggle API download (requires ~/.kaggle/kaggle.json)
|
| 281 |
"""
|
|
|
|
| 282 |
if INTERACTIONS_CSV.exists():
|
| 283 |
return INTERACTIONS_CSV
|
| 284 |
|
| 285 |
+
if KAGGLE_INPUT_DIR.exists():
|
| 286 |
+
for candidate in KAGGLE_INPUT_DIR.glob("*.csv"):
|
| 287 |
+
print(f" Found CSV in Kaggle input: {candidate}")
|
| 288 |
+
return candidate
|
| 289 |
|
|
|
|
| 290 |
print(f" CSV not found — downloading from Kaggle dataset '{KAGGLE_DATASET}' ...")
|
| 291 |
try:
|
| 292 |
+
import kaggle # noqa: F401 — triggers credential check
|
| 293 |
+
import subprocess
|
| 294 |
dest = INTERACTIONS_CSV.parent
|
| 295 |
dest.mkdir(parents=True, exist_ok=True)
|
|
|
|
| 296 |
result = subprocess.run(
|
| 297 |
+
["kaggle", "datasets", "download", "-d", KAGGLE_DATASET, "--unzip", "-p", str(dest)],
|
|
|
|
| 298 |
capture_output=True, text=True,
|
| 299 |
)
|
| 300 |
if result.returncode == 0:
|
|
|
|
| 301 |
for f in dest.glob("*.csv"):
|
| 302 |
print(f" Downloaded: {f.name}")
|
| 303 |
return f
|
|
|
|
| 312 |
|
| 313 |
|
| 314 |
def import_drug_interactions(limit: int = None) -> int:
|
| 315 |
+
"""Import drug-drug interactions from the DDInter CSV (Kaggle dataset mghobashy/drug-drug-interactions)."""
|
| 316 |
print("Importing drug interactions data...")
|
| 317 |
|
| 318 |
filepath = _resolve_interactions_csv()
|
|
|
|
| 319 |
if filepath is None:
|
| 320 |
print(" Skipping drug interactions — CSV unavailable.")
|
| 321 |
print(f" To fix: attach the Kaggle dataset '{KAGGLE_DATASET}' to your notebook,")
|
| 322 |
print(" or set up ~/.kaggle/kaggle.json for API access.")
|
| 323 |
return 0
|
| 324 |
|
|
|
|
|
|
|
| 325 |
total_records = 0
|
| 326 |
+
for chunk in pd.read_csv(filepath, chunksize=10000):
|
|
|
|
|
|
|
| 327 |
chunk.columns = [col.strip().lower().replace(' ', '_') for col in chunk.columns]
|
| 328 |
|
| 329 |
records = []
|
|
|
|
| 332 |
drug_2 = str(row.get('drug_2', row.get('drug2', row.iloc[1] if len(row) > 1 else '')))
|
| 333 |
description = str(row.get('interaction_description', row.get('description',
|
| 334 |
row.get('interaction', row.iloc[2] if len(row) > 2 else ''))))
|
|
|
|
|
|
|
|
|
|
| 335 |
if drug_1 and drug_2:
|
| 336 |
+
records.append((drug_1, drug_2, description, classify_severity(description)))
|
| 337 |
|
| 338 |
if records:
|
| 339 |
+
execute_many(
|
| 340 |
+
"INSERT INTO drug_interactions (drug_1, drug_2, interaction_description, severity) VALUES (?, ?, ?, ?)",
|
| 341 |
+
records,
|
| 342 |
+
)
|
|
|
|
|
|
|
| 343 |
total_records += len(records)
|
| 344 |
|
| 345 |
if limit and total_records >= limit:
|
|
|
|
| 350 |
|
| 351 |
|
| 352 |
def import_all_data(interactions_limit: int = None) -> dict:
|
| 353 |
+
"""Initialize the database and import all structured data sources."""
|
| 354 |
print(f"\n{'='*50}")
|
| 355 |
print("Med-I-C Data Import")
|
| 356 |
print(f"{'='*50}\n")
|
| 357 |
|
|
|
|
| 358 |
init_database()
|
| 359 |
|
|
|
|
| 360 |
with get_connection() as conn:
|
| 361 |
+
for table in ("eml_antibiotics", "atlas_susceptibility", "mic_breakpoints", "drug_interactions"):
|
| 362 |
+
conn.execute(f"DELETE FROM {table}")
|
|
|
|
|
|
|
| 363 |
conn.commit()
|
| 364 |
print("Cleared existing data\n")
|
| 365 |
|
|
|
|
| 366 |
results = {
|
| 367 |
"eml_antibiotics": import_eml_antibiotics(),
|
| 368 |
"atlas_susceptibility": import_atlas_susceptibility(),
|
|
|
|
| 380 |
|
| 381 |
|
| 382 |
if __name__ == "__main__":
|
|
|
|
| 383 |
import_all_data(interactions_limit=50000)
|
src/graph.py
CHANGED
|
@@ -1,17 +1,13 @@
|
|
| 1 |
"""
|
| 2 |
-
LangGraph
|
| 3 |
|
| 4 |
-
|
|
|
|
| 5 |
|
| 6 |
-
Stage
|
| 7 |
-
Intake Historian
|
| 8 |
-
|
| 9 |
-
Stage 2 (Targeted - lab results available):
|
| 10 |
-
Intake Historian -> Vision Specialist -> Trend Analyst -> Clinical Pharmacologist
|
| 11 |
"""
|
| 12 |
|
| 13 |
-
from __future__ import annotations
|
| 14 |
-
|
| 15 |
import logging
|
| 16 |
from typing import Literal
|
| 17 |
|
|
@@ -28,189 +24,59 @@ from .state import InfectionState
|
|
| 28 |
logger = logging.getLogger(__name__)
|
| 29 |
|
| 30 |
|
| 31 |
-
# =============================================================================
|
| 32 |
-
# NODE FUNCTIONS (Wrapper for agents)
|
| 33 |
-
# =============================================================================
|
| 34 |
-
|
| 35 |
-
def intake_historian_node(state: InfectionState) -> InfectionState:
|
| 36 |
-
"""Node 1: Run Intake Historian agent."""
|
| 37 |
-
logger.info("Graph: Executing Intake Historian node")
|
| 38 |
-
return run_intake_historian(state)
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
def vision_specialist_node(state: InfectionState) -> InfectionState:
|
| 42 |
-
"""Node 2: Run Vision Specialist agent."""
|
| 43 |
-
logger.info("Graph: Executing Vision Specialist node")
|
| 44 |
-
return run_vision_specialist(state)
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
def trend_analyst_node(state: InfectionState) -> InfectionState:
|
| 48 |
-
"""Node 3: Run Trend Analyst agent."""
|
| 49 |
-
logger.info("Graph: Executing Trend Analyst node")
|
| 50 |
-
return run_trend_analyst(state)
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
def clinical_pharmacologist_node(state: InfectionState) -> InfectionState:
|
| 54 |
-
"""Node 4: Run Clinical Pharmacologist agent."""
|
| 55 |
-
logger.info("Graph: Executing Clinical Pharmacologist node")
|
| 56 |
-
return run_clinical_pharmacologist(state)
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
# =============================================================================
|
| 60 |
-
# CONDITIONAL ROUTING FUNCTIONS
|
| 61 |
-
# =============================================================================
|
| 62 |
-
|
| 63 |
def route_after_intake(state: InfectionState) -> Literal["vision_specialist", "clinical_pharmacologist"]:
|
| 64 |
-
"""
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
Routes to Vision Specialist if:
|
| 68 |
-
- stage is "targeted" AND
|
| 69 |
-
- route_to_vision is True (i.e., we have lab data to process)
|
| 70 |
-
|
| 71 |
-
Otherwise routes directly to Clinical Pharmacologist (empirical path).
|
| 72 |
-
"""
|
| 73 |
-
stage = state.get("stage", "empirical")
|
| 74 |
-
has_lab_data = state.get("route_to_vision", False)
|
| 75 |
-
|
| 76 |
-
if stage == "targeted" and has_lab_data:
|
| 77 |
-
logger.info("Graph: Routing to Vision Specialist (targeted path)")
|
| 78 |
return "vision_specialist"
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
return "clinical_pharmacologist"
|
| 82 |
|
| 83 |
|
| 84 |
def route_after_vision(state: InfectionState) -> Literal["trend_analyst", "clinical_pharmacologist"]:
|
| 85 |
-
"""
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
Routes to Trend Analyst if:
|
| 89 |
-
- route_to_trend_analyst is True (i.e., we have MIC data to analyze)
|
| 90 |
-
|
| 91 |
-
Otherwise skips to Clinical Pharmacologist.
|
| 92 |
-
"""
|
| 93 |
-
should_analyze_trends = state.get("route_to_trend_analyst", False)
|
| 94 |
-
|
| 95 |
-
if should_analyze_trends:
|
| 96 |
-
logger.info("Graph: Routing to Trend Analyst")
|
| 97 |
return "trend_analyst"
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
return "clinical_pharmacologist"
|
| 101 |
-
|
| 102 |
|
| 103 |
-
# =============================================================================
|
| 104 |
-
# GRAPH CONSTRUCTION
|
| 105 |
-
# =============================================================================
|
| 106 |
|
| 107 |
def build_infection_graph() -> StateGraph:
|
| 108 |
-
"""
|
| 109 |
-
Build the LangGraph StateGraph for the infection lifecycle workflow.
|
| 110 |
-
|
| 111 |
-
Returns:
|
| 112 |
-
Compiled StateGraph ready for execution
|
| 113 |
-
"""
|
| 114 |
-
# Create the graph with InfectionState as the state schema
|
| 115 |
graph = StateGraph(InfectionState)
|
| 116 |
|
| 117 |
-
|
| 118 |
-
graph.add_node("
|
| 119 |
-
graph.add_node("
|
| 120 |
-
graph.add_node("
|
| 121 |
-
graph.add_node("clinical_pharmacologist", clinical_pharmacologist_node)
|
| 122 |
|
| 123 |
-
# Set entry point
|
| 124 |
graph.set_entry_point("intake_historian")
|
| 125 |
|
| 126 |
-
# Add conditional edges from intake_historian
|
| 127 |
graph.add_conditional_edges(
|
| 128 |
"intake_historian",
|
| 129 |
route_after_intake,
|
| 130 |
-
{
|
| 131 |
-
"vision_specialist": "vision_specialist",
|
| 132 |
-
"clinical_pharmacologist": "clinical_pharmacologist",
|
| 133 |
-
}
|
| 134 |
)
|
| 135 |
-
|
| 136 |
-
# Add conditional edges from vision_specialist
|
| 137 |
graph.add_conditional_edges(
|
| 138 |
"vision_specialist",
|
| 139 |
route_after_vision,
|
| 140 |
-
{
|
| 141 |
-
"trend_analyst": "trend_analyst",
|
| 142 |
-
"clinical_pharmacologist": "clinical_pharmacologist",
|
| 143 |
-
}
|
| 144 |
)
|
| 145 |
|
| 146 |
-
# Add edge from trend_analyst to clinical_pharmacologist
|
| 147 |
graph.add_edge("trend_analyst", "clinical_pharmacologist")
|
| 148 |
-
|
| 149 |
-
# Add edge from clinical_pharmacologist to END
|
| 150 |
graph.add_edge("clinical_pharmacologist", END)
|
| 151 |
|
| 152 |
return graph
|
| 153 |
|
| 154 |
|
| 155 |
-
def
|
| 156 |
"""
|
| 157 |
-
|
| 158 |
|
| 159 |
-
|
| 160 |
-
|
| 161 |
"""
|
| 162 |
-
graph = build_infection_graph()
|
| 163 |
-
return graph.compile()
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
# =============================================================================
|
| 167 |
-
# EXECUTION HELPERS
|
| 168 |
-
# =============================================================================
|
| 169 |
-
|
| 170 |
-
def run_pipeline(
|
| 171 |
-
patient_data: dict,
|
| 172 |
-
labs_raw_text: str | None = None,
|
| 173 |
-
) -> InfectionState:
|
| 174 |
-
"""
|
| 175 |
-
Run the full infection lifecycle pipeline.
|
| 176 |
-
|
| 177 |
-
This is the main entry point for executing the multi-agent workflow.
|
| 178 |
-
|
| 179 |
-
Args:
|
| 180 |
-
patient_data: Dict containing patient information:
|
| 181 |
-
- age_years: Patient age
|
| 182 |
-
- weight_kg: Patient weight
|
| 183 |
-
- sex: "male" or "female"
|
| 184 |
-
- serum_creatinine_mg_dl: Serum creatinine (optional)
|
| 185 |
-
- medications: List of current medications
|
| 186 |
-
- allergies: List of allergies
|
| 187 |
-
- comorbidities: List of comorbidities
|
| 188 |
-
- infection_site: Site of infection
|
| 189 |
-
- suspected_source: Suspected pathogen/source
|
| 190 |
-
|
| 191 |
-
labs_raw_text: Raw text from lab report (if available).
|
| 192 |
-
If provided, triggers targeted (Stage 2) pathway.
|
| 193 |
-
|
| 194 |
-
Returns:
|
| 195 |
-
Final InfectionState with recommendation
|
| 196 |
-
|
| 197 |
-
Example:
|
| 198 |
-
>>> state = run_pipeline(
|
| 199 |
-
... patient_data={
|
| 200 |
-
... "age_years": 65,
|
| 201 |
-
... "weight_kg": 70,
|
| 202 |
-
... "sex": "male",
|
| 203 |
-
... "serum_creatinine_mg_dl": 1.2,
|
| 204 |
-
... "medications": ["metformin", "lisinopril"],
|
| 205 |
-
... "allergies": ["penicillin"],
|
| 206 |
-
... "infection_site": "urinary",
|
| 207 |
-
... "suspected_source": "community UTI",
|
| 208 |
-
... },
|
| 209 |
-
... labs_raw_text="E. coli isolated. Ciprofloxacin MIC: 0.5 mg/L (S)"
|
| 210 |
-
... )
|
| 211 |
-
>>> print(state["recommendation"]["primary_antibiotic"])
|
| 212 |
-
"""
|
| 213 |
-
# Build initial state from patient data
|
| 214 |
initial_state: InfectionState = {
|
| 215 |
"age_years": patient_data.get("age_years"),
|
| 216 |
"weight_kg": patient_data.get("weight_kg"),
|
|
@@ -224,59 +90,34 @@ def run_pipeline(
|
|
| 224 |
"suspected_source": patient_data.get("suspected_source"),
|
| 225 |
"country_or_region": patient_data.get("country_or_region"),
|
| 226 |
"vitals": patient_data.get("vitals", {}),
|
|
|
|
| 227 |
}
|
| 228 |
|
| 229 |
-
# Add lab data if provided
|
| 230 |
if labs_raw_text:
|
| 231 |
initial_state["labs_raw_text"] = labs_raw_text
|
| 232 |
-
initial_state["stage"] = "targeted"
|
| 233 |
-
else:
|
| 234 |
-
initial_state["stage"] = "empirical"
|
| 235 |
-
|
| 236 |
-
# Compile and run the graph
|
| 237 |
-
logger.info(f"Starting pipeline execution (stage: {initial_state['stage']})")
|
| 238 |
-
|
| 239 |
-
compiled_graph = compile_graph()
|
| 240 |
-
final_state = compiled_graph.invoke(initial_state)
|
| 241 |
-
|
| 242 |
-
logger.info("Pipeline execution complete")
|
| 243 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 244 |
return final_state
|
| 245 |
|
| 246 |
|
| 247 |
def run_empirical_pipeline(patient_data: dict) -> InfectionState:
|
| 248 |
-
"""
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
Shorthand for run_pipeline without lab data.
|
| 252 |
-
"""
|
| 253 |
-
return run_pipeline(patient_data, labs_raw_text=None)
|
| 254 |
|
| 255 |
|
| 256 |
def run_targeted_pipeline(patient_data: dict, labs_raw_text: str) -> InfectionState:
|
| 257 |
-
"""
|
| 258 |
-
Run Stage 2 (Targeted) pipeline with lab data.
|
| 259 |
-
|
| 260 |
-
Shorthand for run_pipeline with lab data.
|
| 261 |
-
"""
|
| 262 |
return run_pipeline(patient_data, labs_raw_text=labs_raw_text)
|
| 263 |
|
| 264 |
|
| 265 |
-
# =============================================================================
|
| 266 |
-
# VISUALIZATION (for debugging)
|
| 267 |
-
# =============================================================================
|
| 268 |
-
|
| 269 |
def get_graph_mermaid() -> str:
|
| 270 |
-
"""
|
| 271 |
-
Get Mermaid diagram representation of the graph.
|
| 272 |
-
|
| 273 |
-
Useful for documentation and debugging.
|
| 274 |
-
"""
|
| 275 |
-
graph = build_infection_graph()
|
| 276 |
try:
|
| 277 |
-
return
|
| 278 |
except Exception:
|
| 279 |
-
# Fallback: return manual diagram
|
| 280 |
return """
|
| 281 |
graph TD
|
| 282 |
A[intake_historian] --> B{route_after_intake}
|
|
@@ -288,13 +129,3 @@ graph TD
|
|
| 288 |
F --> E
|
| 289 |
E --> G[END]
|
| 290 |
"""
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
__all__ = [
|
| 294 |
-
"build_infection_graph",
|
| 295 |
-
"compile_graph",
|
| 296 |
-
"run_pipeline",
|
| 297 |
-
"run_empirical_pipeline",
|
| 298 |
-
"run_targeted_pipeline",
|
| 299 |
-
"get_graph_mermaid",
|
| 300 |
-
]
|
|
|
|
| 1 |
"""
|
| 2 |
+
LangGraph orchestrator for the infection lifecycle workflow.
|
| 3 |
|
| 4 |
+
Stage 1 (empirical - no lab results):
|
| 5 |
+
Intake Historian → Clinical Pharmacologist
|
| 6 |
|
| 7 |
+
Stage 2 (targeted - lab results available):
|
| 8 |
+
Intake Historian → Vision Specialist → [Trend Analyst →] Clinical Pharmacologist
|
|
|
|
|
|
|
|
|
|
| 9 |
"""
|
| 10 |
|
|
|
|
|
|
|
| 11 |
import logging
|
| 12 |
from typing import Literal
|
| 13 |
|
|
|
|
| 24 |
logger = logging.getLogger(__name__)
|
| 25 |
|
| 26 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
def route_after_intake(state: InfectionState) -> Literal["vision_specialist", "clinical_pharmacologist"]:
|
| 28 |
+
"""Route to Vision Specialist if we have lab text to parse; otherwise go straight to pharmacologist."""
|
| 29 |
+
if state.get("stage") == "targeted" and state.get("route_to_vision"):
|
| 30 |
+
logger.info("Graph: routing to Vision Specialist (targeted path)")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
return "vision_specialist"
|
| 32 |
+
logger.info("Graph: routing to Clinical Pharmacologist (empirical path)")
|
| 33 |
+
return "clinical_pharmacologist"
|
|
|
|
| 34 |
|
| 35 |
|
| 36 |
def route_after_vision(state: InfectionState) -> Literal["trend_analyst", "clinical_pharmacologist"]:
|
| 37 |
+
"""Route to Trend Analyst if Vision Specialist extracted MIC values."""
|
| 38 |
+
if state.get("route_to_trend_analyst"):
|
| 39 |
+
logger.info("Graph: routing to Trend Analyst")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
return "trend_analyst"
|
| 41 |
+
logger.info("Graph: skipping Trend Analyst (no MIC data)")
|
| 42 |
+
return "clinical_pharmacologist"
|
|
|
|
|
|
|
| 43 |
|
|
|
|
|
|
|
|
|
|
| 44 |
|
| 45 |
def build_infection_graph() -> StateGraph:
|
| 46 |
+
"""Build and return the compiled LangGraph for the infection pipeline."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
graph = StateGraph(InfectionState)
|
| 48 |
|
| 49 |
+
graph.add_node("intake_historian", run_intake_historian)
|
| 50 |
+
graph.add_node("vision_specialist", run_vision_specialist)
|
| 51 |
+
graph.add_node("trend_analyst", run_trend_analyst)
|
| 52 |
+
graph.add_node("clinical_pharmacologist", run_clinical_pharmacologist)
|
|
|
|
| 53 |
|
|
|
|
| 54 |
graph.set_entry_point("intake_historian")
|
| 55 |
|
|
|
|
| 56 |
graph.add_conditional_edges(
|
| 57 |
"intake_historian",
|
| 58 |
route_after_intake,
|
| 59 |
+
{"vision_specialist": "vision_specialist", "clinical_pharmacologist": "clinical_pharmacologist"},
|
|
|
|
|
|
|
|
|
|
| 60 |
)
|
|
|
|
|
|
|
| 61 |
graph.add_conditional_edges(
|
| 62 |
"vision_specialist",
|
| 63 |
route_after_vision,
|
| 64 |
+
{"trend_analyst": "trend_analyst", "clinical_pharmacologist": "clinical_pharmacologist"},
|
|
|
|
|
|
|
|
|
|
| 65 |
)
|
| 66 |
|
|
|
|
| 67 |
graph.add_edge("trend_analyst", "clinical_pharmacologist")
|
|
|
|
|
|
|
| 68 |
graph.add_edge("clinical_pharmacologist", END)
|
| 69 |
|
| 70 |
return graph
|
| 71 |
|
| 72 |
|
| 73 |
+
def run_pipeline(patient_data: dict, labs_raw_text: str | None = None) -> InfectionState:
|
| 74 |
"""
|
| 75 |
+
Run the full infection pipeline and return the final state.
|
| 76 |
|
| 77 |
+
Pass labs_raw_text to trigger the targeted (Stage 2) pathway.
|
| 78 |
+
Without it, only the empirical (Stage 1) pathway runs.
|
| 79 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
initial_state: InfectionState = {
|
| 81 |
"age_years": patient_data.get("age_years"),
|
| 82 |
"weight_kg": patient_data.get("weight_kg"),
|
|
|
|
| 90 |
"suspected_source": patient_data.get("suspected_source"),
|
| 91 |
"country_or_region": patient_data.get("country_or_region"),
|
| 92 |
"vitals": patient_data.get("vitals", {}),
|
| 93 |
+
"stage": "targeted" if labs_raw_text else "empirical",
|
| 94 |
}
|
| 95 |
|
|
|
|
| 96 |
if labs_raw_text:
|
| 97 |
initial_state["labs_raw_text"] = labs_raw_text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
|
| 99 |
+
logger.info(f"Starting pipeline (stage: {initial_state['stage']})")
|
| 100 |
+
compiled = build_infection_graph().compile()
|
| 101 |
+
final_state = compiled.invoke(initial_state)
|
| 102 |
+
logger.info("Pipeline complete")
|
| 103 |
return final_state
|
| 104 |
|
| 105 |
|
| 106 |
def run_empirical_pipeline(patient_data: dict) -> InfectionState:
    """Run the Stage 1 (empirical) pathway: ``run_pipeline`` with no lab data."""
    return run_pipeline(patient_data, labs_raw_text=None)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 109 |
|
| 110 |
|
| 111 |
def run_targeted_pipeline(patient_data: dict, labs_raw_text: str) -> InfectionState:
    """Run the Stage 2 (targeted) pathway: ``run_pipeline`` with raw lab text."""
    return run_pipeline(patient_data, labs_raw_text)
|
| 114 |
|
| 115 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
def get_graph_mermaid() -> str:
|
| 117 |
+
"""Return a Mermaid diagram of the graph (for documentation and debugging)."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
try:
|
| 119 |
+
return build_infection_graph().compile().get_graph().draw_mermaid()
|
| 120 |
except Exception:
|
|
|
|
| 121 |
return """
|
| 122 |
graph TD
|
| 123 |
A[intake_historian] --> B{route_after_intake}
|
|
|
|
| 129 |
F --> E
|
| 130 |
E --> G[END]
|
| 131 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/loader.py
CHANGED
|
@@ -1,22 +1,17 @@
|
|
| 1 |
|
| 2 |
-
from __future__ import annotations
|
| 3 |
-
|
| 4 |
import logging
|
| 5 |
from functools import lru_cache
|
| 6 |
-
from typing import Any, Callable, Dict, Literal, Optional
|
| 7 |
|
| 8 |
from .config import get_settings
|
| 9 |
|
| 10 |
-
|
| 11 |
logger = logging.getLogger(__name__)
|
| 12 |
|
| 13 |
TextBackend = Literal["vertex", "local"]
|
| 14 |
TextModelName = Literal["medgemma_4b", "medgemma_27b", "txgemma_9b", "txgemma_2b"]
|
| 15 |
|
| 16 |
|
| 17 |
-
def _resolve_backend(
|
| 18 |
-
requested: Optional[TextBackend],
|
| 19 |
-
) -> TextBackend:
|
| 20 |
settings = get_settings()
|
| 21 |
backend = requested or settings.default_backend # type: ignore[assignment]
|
| 22 |
if backend == "vertex" and not settings.use_vertex:
|
|
@@ -27,23 +22,16 @@ def _resolve_backend(
|
|
| 27 |
|
| 28 |
@lru_cache(maxsize=8)
|
| 29 |
def _get_vertex_chat_model(model_name: TextModelName):
|
| 30 |
-
"""
|
| 31 |
-
Lazily construct a Vertex AI chat model via langchain-google-vertexai.
|
| 32 |
-
|
| 33 |
-
Returns an object with an .invoke(str) method; we wrap this in a simple
|
| 34 |
-
callable for downstream use.
|
| 35 |
-
"""
|
| 36 |
-
|
| 37 |
try:
|
| 38 |
from langchain_google_vertexai import ChatVertexAI
|
| 39 |
-
except Exception as exc:
|
| 40 |
raise RuntimeError(
|
| 41 |
"langchain-google-vertexai is not available; "
|
| 42 |
"install it or switch MEDIC_DEFAULT_BACKEND=local."
|
| 43 |
) from exc
|
| 44 |
|
| 45 |
settings = get_settings()
|
| 46 |
-
|
| 47 |
if settings.vertex_project_id is None:
|
| 48 |
raise RuntimeError(
|
| 49 |
"MEDIC_VERTEX_PROJECT_ID is not set. "
|
|
@@ -56,40 +44,28 @@ def _get_vertex_chat_model(model_name: TextModelName):
|
|
| 56 |
"txgemma_9b": settings.vertex_txgemma_9b_model,
|
| 57 |
"txgemma_2b": settings.vertex_txgemma_2b_model,
|
| 58 |
}
|
| 59 |
-
model_id = model_id_map[model_name]
|
| 60 |
|
| 61 |
llm = ChatVertexAI(
|
| 62 |
-
model=
|
| 63 |
project=settings.vertex_project_id,
|
| 64 |
location=settings.vertex_location,
|
| 65 |
temperature=0.2,
|
| 66 |
)
|
| 67 |
|
| 68 |
def _call(prompt: str, **kwargs: Any) -> str:
|
| 69 |
-
"""Thin wrapper returning plain text from ChatVertexAI."""
|
| 70 |
-
|
| 71 |
result = llm.invoke(prompt, **kwargs)
|
| 72 |
-
|
| 73 |
-
content = getattr(result, "content", result)
|
| 74 |
-
return str(content)
|
| 75 |
|
| 76 |
return _call
|
| 77 |
|
| 78 |
|
| 79 |
@lru_cache(maxsize=8)
|
| 80 |
def _get_local_causal_lm(model_name: TextModelName):
|
| 81 |
-
"""
|
| 82 |
-
Lazily load a local transformers model for offline / Kaggle usage.
|
| 83 |
-
|
| 84 |
-
Assumes model paths are provided via MEDIC_LOCAL_* env vars and that
|
| 85 |
-
the appropriate model weights are available in the environment.
|
| 86 |
-
"""
|
| 87 |
-
|
| 88 |
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 89 |
import torch
|
| 90 |
|
| 91 |
settings = get_settings()
|
| 92 |
-
|
| 93 |
model_path_map: Dict[TextModelName, Optional[str]] = {
|
| 94 |
"medgemma_4b": settings.local_medgemma_4b_model,
|
| 95 |
"medgemma_27b": settings.local_medgemma_27b_model,
|
|
@@ -101,31 +77,19 @@ def _get_local_causal_lm(model_name: TextModelName):
|
|
| 101 |
if not model_path:
|
| 102 |
raise RuntimeError(
|
| 103 |
f"No local model path configured for {model_name}. "
|
| 104 |
-
|
| 105 |
)
|
| 106 |
|
| 107 |
-
load_kwargs: Dict[str, Any] = {
|
| 108 |
-
|
| 109 |
-
}
|
| 110 |
-
|
| 111 |
-
# Optional 4-bit quantization via bitsandbytes
|
| 112 |
-
if get_settings().quantization == "4bit":
|
| 113 |
load_kwargs["load_in_4bit"] = True
|
| 114 |
|
| 115 |
tokenizer = AutoTokenizer.from_pretrained(model_path)
|
| 116 |
model = AutoModelForCausalLM.from_pretrained(model_path, **load_kwargs)
|
| 117 |
|
| 118 |
-
def _call(
|
| 119 |
-
|
| 120 |
-
max_new_tokens: int = 512,
|
| 121 |
-
temperature: float = 0.2,
|
| 122 |
-
**generate_kwargs: Any,
|
| 123 |
-
) -> str:
|
| 124 |
-
inputs = tokenizer(prompt, return_tensors="pt")
|
| 125 |
-
inputs = {k: v.to(model.device) for k, v in inputs.items()}
|
| 126 |
-
|
| 127 |
do_sample = temperature > 0
|
| 128 |
-
|
| 129 |
with torch.no_grad():
|
| 130 |
output_ids = model.generate(
|
| 131 |
**inputs,
|
|
@@ -134,11 +98,9 @@ def _get_local_causal_lm(model_name: TextModelName):
|
|
| 134 |
max_new_tokens=max_new_tokens,
|
| 135 |
**generate_kwargs,
|
| 136 |
)
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
text = tokenizer.decode(generated_ids, skip_special_tokens=True)
|
| 141 |
-
return text.strip()
|
| 142 |
|
| 143 |
return _call
|
| 144 |
|
|
@@ -148,22 +110,9 @@ def get_text_model(
|
|
| 148 |
model_name: TextModelName = "medgemma_4b",
|
| 149 |
backend: Optional[TextBackend] = None,
|
| 150 |
) -> Callable[..., str]:
|
| 151 |
-
"""
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
Example:
|
| 155 |
-
|
| 156 |
-
from src.loader import get_text_model
|
| 157 |
-
model = get_text_model("medgemma_4b")
|
| 158 |
-
answer = model("Explain ESBL in simple terms.")
|
| 159 |
-
"""
|
| 160 |
-
|
| 161 |
-
resolved_backend = _resolve_backend(backend)
|
| 162 |
-
|
| 163 |
-
if resolved_backend == "vertex":
|
| 164 |
-
return _get_vertex_chat_model(model_name)
|
| 165 |
-
else:
|
| 166 |
-
return _get_local_causal_lm(model_name)
|
| 167 |
|
| 168 |
|
| 169 |
def run_inference(
|
|
@@ -174,28 +123,7 @@ def run_inference(
|
|
| 174 |
temperature: float = 0.2,
|
| 175 |
**kwargs: Any,
|
| 176 |
) -> str:
|
| 177 |
-
"""
|
| 178 |
-
Convenience wrapper around `get_text_model`.
|
| 179 |
-
|
| 180 |
-
This is the simplest entry point to use inside agents:
|
| 181 |
-
|
| 182 |
-
from src.loader import run_inference
|
| 183 |
-
text = run_inference(prompt, model_name="medgemma_4b")
|
| 184 |
-
"""
|
| 185 |
-
|
| 186 |
model = get_text_model(model_name=model_name, backend=backend)
|
| 187 |
-
return model(
|
| 188 |
-
prompt,
|
| 189 |
-
max_new_tokens=max_new_tokens,
|
| 190 |
-
temperature=temperature,
|
| 191 |
-
**kwargs,
|
| 192 |
-
)
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
__all__ = [
|
| 196 |
-
"TextBackend",
|
| 197 |
-
"TextModelName",
|
| 198 |
-
"get_text_model",
|
| 199 |
-
"run_inference",
|
| 200 |
-
]
|
| 201 |
|
|
|
|
| 1 |
|
|
|
|
|
|
|
| 2 |
import logging
|
| 3 |
from functools import lru_cache
|
| 4 |
+
from typing import Any, Callable, Dict, Literal, Optional
|
| 5 |
|
| 6 |
from .config import get_settings
|
| 7 |
|
|
|
|
| 8 |
logger = logging.getLogger(__name__)
|
| 9 |
|
| 10 |
TextBackend = Literal["vertex", "local"]
|
| 11 |
TextModelName = Literal["medgemma_4b", "medgemma_27b", "txgemma_9b", "txgemma_2b"]
|
| 12 |
|
| 13 |
|
| 14 |
+
def _resolve_backend(requested: Optional[TextBackend]) -> TextBackend:
|
|
|
|
|
|
|
| 15 |
settings = get_settings()
|
| 16 |
backend = requested or settings.default_backend # type: ignore[assignment]
|
| 17 |
if backend == "vertex" and not settings.use_vertex:
|
|
|
|
| 22 |
|
| 23 |
@lru_cache(maxsize=8)
|
| 24 |
def _get_vertex_chat_model(model_name: TextModelName):
|
| 25 |
+
"""Load a Vertex AI chat model and return a callable that takes a prompt string."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
try:
|
| 27 |
from langchain_google_vertexai import ChatVertexAI
|
| 28 |
+
except Exception as exc:
|
| 29 |
raise RuntimeError(
|
| 30 |
"langchain-google-vertexai is not available; "
|
| 31 |
"install it or switch MEDIC_DEFAULT_BACKEND=local."
|
| 32 |
) from exc
|
| 33 |
|
| 34 |
settings = get_settings()
|
|
|
|
| 35 |
if settings.vertex_project_id is None:
|
| 36 |
raise RuntimeError(
|
| 37 |
"MEDIC_VERTEX_PROJECT_ID is not set. "
|
|
|
|
| 44 |
"txgemma_9b": settings.vertex_txgemma_9b_model,
|
| 45 |
"txgemma_2b": settings.vertex_txgemma_2b_model,
|
| 46 |
}
|
|
|
|
| 47 |
|
| 48 |
llm = ChatVertexAI(
|
| 49 |
+
model=model_id_map[model_name],
|
| 50 |
project=settings.vertex_project_id,
|
| 51 |
location=settings.vertex_location,
|
| 52 |
temperature=0.2,
|
| 53 |
)
|
| 54 |
|
| 55 |
def _call(prompt: str, **kwargs: Any) -> str:
|
|
|
|
|
|
|
| 56 |
result = llm.invoke(prompt, **kwargs)
|
| 57 |
+
return str(getattr(result, "content", result))
|
|
|
|
|
|
|
| 58 |
|
| 59 |
return _call
|
| 60 |
|
| 61 |
|
| 62 |
@lru_cache(maxsize=8)
|
| 63 |
def _get_local_causal_lm(model_name: TextModelName):
|
| 64 |
+
"""Load a local HuggingFace causal LM and return a generation callable."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 66 |
import torch
|
| 67 |
|
| 68 |
settings = get_settings()
|
|
|
|
| 69 |
model_path_map: Dict[TextModelName, Optional[str]] = {
|
| 70 |
"medgemma_4b": settings.local_medgemma_4b_model,
|
| 71 |
"medgemma_27b": settings.local_medgemma_27b_model,
|
|
|
|
| 77 |
if not model_path:
|
| 78 |
raise RuntimeError(
|
| 79 |
f"No local model path configured for {model_name}. "
|
| 80 |
+
"Set MEDIC_LOCAL_*_MODEL or use the Vertex backend."
|
| 81 |
)
|
| 82 |
|
| 83 |
+
load_kwargs: Dict[str, Any] = {"device_map": "auto"}
|
| 84 |
+
if settings.quantization == "4bit":
|
|
|
|
|
|
|
|
|
|
|
|
|
| 85 |
load_kwargs["load_in_4bit"] = True
|
| 86 |
|
| 87 |
tokenizer = AutoTokenizer.from_pretrained(model_path)
|
| 88 |
model = AutoModelForCausalLM.from_pretrained(model_path, **load_kwargs)
|
| 89 |
|
| 90 |
+
def _call(prompt: str, max_new_tokens: int = 512, temperature: float = 0.2, **generate_kwargs: Any) -> str:
|
| 91 |
+
inputs = {k: v.to(model.device) for k, v in tokenizer(prompt, return_tensors="pt").items()}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
do_sample = temperature > 0
|
|
|
|
| 93 |
with torch.no_grad():
|
| 94 |
output_ids = model.generate(
|
| 95 |
**inputs,
|
|
|
|
| 98 |
max_new_tokens=max_new_tokens,
|
| 99 |
**generate_kwargs,
|
| 100 |
)
|
| 101 |
+
# Decode only the newly generated tokens, not the input prompt
|
| 102 |
+
generated_ids = output_ids[0, inputs["input_ids"].shape[1]:]
|
| 103 |
+
return tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
|
|
|
|
|
|
|
| 104 |
|
| 105 |
return _call
|
| 106 |
|
|
|
|
| 110 |
model_name: TextModelName = "medgemma_4b",
|
| 111 |
backend: Optional[TextBackend] = None,
|
| 112 |
) -> Callable[..., str]:
|
| 113 |
+
"""Return a cached callable for the requested model and backend."""
|
| 114 |
+
resolved = _resolve_backend(backend)
|
| 115 |
+
return _get_vertex_chat_model(model_name) if resolved == "vertex" else _get_local_causal_lm(model_name)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
|
| 117 |
|
| 118 |
def run_inference(
|
|
|
|
| 123 |
temperature: float = 0.2,
|
| 124 |
**kwargs: Any,
|
| 125 |
) -> str:
|
| 126 |
+
"""Run inference with the specified model. This is the primary entry point for agents."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
model = get_text_model(model_name=model_name, backend=backend)
|
| 128 |
+
return model(prompt, max_new_tokens=max_new_tokens, temperature=temperature, **kwargs)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
|
src/prompts.py
CHANGED
|
@@ -1,18 +1,7 @@
|
|
| 1 |
-
"""
|
| 2 |
-
Prompt templates for Med-I-C multi-agent system.
|
| 3 |
-
|
| 4 |
-
Each agent has a specific role in the infection lifecycle workflow:
|
| 5 |
-
- Agent 1: Intake Historian - Parse patient data, risk factors, calculate CrCl
|
| 6 |
-
- Agent 2: Vision Specialist - Extract structured data from lab reports (images/PDFs)
|
| 7 |
-
- Agent 3: Trend Analyst - Detect MIC creep and resistance velocity
|
| 8 |
-
- Agent 4: Clinical Pharmacologist - Final Rx recommendations + safety checks
|
| 9 |
-
"""
|
| 10 |
|
| 11 |
-
from __future__ import annotations
|
| 12 |
|
| 13 |
-
#
|
| 14 |
-
# AGENT 1: INTAKE HISTORIAN
|
| 15 |
-
# =============================================================================
|
| 16 |
|
| 17 |
INTAKE_HISTORIAN_SYSTEM = """You are an expert clinical intake specialist. Your role is to:
|
| 18 |
|
|
@@ -66,9 +55,7 @@ RAG CONTEXT (Relevant Guidelines):
|
|
| 66 |
Provide your structured assessment following the system instructions."""
|
| 67 |
|
| 68 |
|
| 69 |
-
#
|
| 70 |
-
# AGENT 2: VISION SPECIALIST
|
| 71 |
-
# =============================================================================
|
| 72 |
|
| 73 |
VISION_SPECIALIST_SYSTEM = """You are an expert medical laboratory data extraction specialist. Your role is to:
|
| 74 |
|
|
@@ -131,9 +118,7 @@ Flag any critical findings that require urgent attention.
|
|
| 131 |
Provide your structured extraction following the system instructions."""
|
| 132 |
|
| 133 |
|
| 134 |
-
#
|
| 135 |
-
# AGENT 3: TREND ANALYST
|
| 136 |
-
# =============================================================================
|
| 137 |
|
| 138 |
TREND_ANALYST_SYSTEM = """You are an expert antimicrobial resistance trend analyst. Your role is to:
|
| 139 |
|
|
@@ -195,9 +180,7 @@ Analyze the trend, calculate risk level, and provide recommendations.
|
|
| 195 |
Follow the system instructions for output format."""
|
| 196 |
|
| 197 |
|
| 198 |
-
#
|
| 199 |
-
# AGENT 4: CLINICAL PHARMACOLOGIST
|
| 200 |
-
# =============================================================================
|
| 201 |
|
| 202 |
CLINICAL_PHARMACOLOGIST_SYSTEM = """You are an expert clinical pharmacologist specializing in infectious diseases and antimicrobial stewardship. Your role is to:
|
| 203 |
|
|
@@ -291,9 +274,7 @@ Provide your final recommendation following the system instructions.
|
|
| 291 |
Ensure all safety checks are performed and documented."""
|
| 292 |
|
| 293 |
|
| 294 |
-
#
|
| 295 |
-
# TXGEMMA SAFETY CHECKER (Supplementary)
|
| 296 |
-
# =============================================================================
|
| 297 |
|
| 298 |
TXGEMMA_SAFETY_PROMPT = """Evaluate the safety profile of the following antibiotic prescription:
|
| 299 |
|
|
@@ -315,9 +296,7 @@ Evaluate for:
|
|
| 315 |
Provide a brief safety assessment (2-3 sentences) and a risk rating (LOW/MODERATE/HIGH)."""
|
| 316 |
|
| 317 |
|
| 318 |
-
#
|
| 319 |
-
# HELPER TEMPLATES
|
| 320 |
-
# =============================================================================
|
| 321 |
|
| 322 |
ERROR_RECOVERY_PROMPT = """The previous agent encountered an error or produced invalid output.
|
| 323 |
|
|
@@ -338,18 +317,3 @@ CLINICAL SCENARIO:
|
|
| 338 |
- Local resistance patterns: {local_resistance}
|
| 339 |
|
| 340 |
Recommend appropriate empirical therapy following WHO AWaRe principles."""
|
| 341 |
-
|
| 342 |
-
|
| 343 |
-
__all__ = [
|
| 344 |
-
"INTAKE_HISTORIAN_SYSTEM",
|
| 345 |
-
"INTAKE_HISTORIAN_PROMPT",
|
| 346 |
-
"VISION_SPECIALIST_SYSTEM",
|
| 347 |
-
"VISION_SPECIALIST_PROMPT",
|
| 348 |
-
"TREND_ANALYST_SYSTEM",
|
| 349 |
-
"TREND_ANALYST_PROMPT",
|
| 350 |
-
"CLINICAL_PHARMACOLOGIST_SYSTEM",
|
| 351 |
-
"CLINICAL_PHARMACOLOGIST_PROMPT",
|
| 352 |
-
"TXGEMMA_SAFETY_PROMPT",
|
| 353 |
-
"ERROR_RECOVERY_PROMPT",
|
| 354 |
-
"FALLBACK_EMPIRICAL_PROMPT",
|
| 355 |
-
]
|
|
|
|
| 1 |
+
"""Prompt templates for each agent in the Med-I-C pipeline."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
|
|
|
| 3 |
|
| 4 |
+
# --- Agent 1: Intake Historian ---
|
|
|
|
|
|
|
| 5 |
|
| 6 |
INTAKE_HISTORIAN_SYSTEM = """You are an expert clinical intake specialist. Your role is to:
|
| 7 |
|
|
|
|
| 55 |
Provide your structured assessment following the system instructions."""
|
| 56 |
|
| 57 |
|
| 58 |
+
# --- Agent 2: Vision Specialist ---
|
|
|
|
|
|
|
| 59 |
|
| 60 |
VISION_SPECIALIST_SYSTEM = """You are an expert medical laboratory data extraction specialist. Your role is to:
|
| 61 |
|
|
|
|
| 118 |
Provide your structured extraction following the system instructions."""
|
| 119 |
|
| 120 |
|
| 121 |
+
# --- Agent 3: Trend Analyst ---
|
|
|
|
|
|
|
| 122 |
|
| 123 |
TREND_ANALYST_SYSTEM = """You are an expert antimicrobial resistance trend analyst. Your role is to:
|
| 124 |
|
|
|
|
| 180 |
Follow the system instructions for output format."""
|
| 181 |
|
| 182 |
|
| 183 |
+
# --- Agent 4: Clinical Pharmacologist ---
|
|
|
|
|
|
|
| 184 |
|
| 185 |
CLINICAL_PHARMACOLOGIST_SYSTEM = """You are an expert clinical pharmacologist specializing in infectious diseases and antimicrobial stewardship. Your role is to:
|
| 186 |
|
|
|
|
| 274 |
Ensure all safety checks are performed and documented."""
|
| 275 |
|
| 276 |
|
| 277 |
+
# --- TxGemma safety check (supplementary, not primary decision-making) ---
|
|
|
|
|
|
|
| 278 |
|
| 279 |
TXGEMMA_SAFETY_PROMPT = """Evaluate the safety profile of the following antibiotic prescription:
|
| 280 |
|
|
|
|
| 296 |
Provide a brief safety assessment (2-3 sentences) and a risk rating (LOW/MODERATE/HIGH)."""
|
| 297 |
|
| 298 |
|
| 299 |
+
# --- Fallback templates ---
|
|
|
|
|
|
|
| 300 |
|
| 301 |
ERROR_RECOVERY_PROMPT = """The previous agent encountered an error or produced invalid output.
|
| 302 |
|
|
|
|
| 317 |
- Local resistance patterns: {local_resistance}
|
| 318 |
|
| 319 |
Recommend appropriate empirical therapy following WHO AWaRe principles."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/rag.py
CHANGED
|
@@ -1,116 +1,80 @@
|
|
| 1 |
"""
|
| 2 |
-
RAG
|
| 3 |
|
| 4 |
-
|
| 5 |
-
-
|
| 6 |
-
-
|
| 7 |
-
- drug_safety: Drug interactions
|
| 8 |
-
- pathogen_resistance:
|
| 9 |
"""
|
| 10 |
|
| 11 |
-
from __future__ import annotations
|
| 12 |
-
|
| 13 |
import logging
|
| 14 |
-
from pathlib import Path
|
| 15 |
from typing import Any, Dict, List, Optional
|
| 16 |
|
| 17 |
from .config import get_settings
|
| 18 |
|
| 19 |
logger = logging.getLogger(__name__)
|
| 20 |
|
| 21 |
-
|
| 22 |
-
# =============================================================================
|
| 23 |
-
# CHROMA CLIENT & EMBEDDING SETUP
|
| 24 |
-
# =============================================================================
|
| 25 |
-
|
| 26 |
_chroma_client = None
|
| 27 |
_embedding_function = None
|
| 28 |
|
| 29 |
|
| 30 |
def get_chroma_client():
|
| 31 |
-
"""
|
| 32 |
global _chroma_client
|
| 33 |
if _chroma_client is None:
|
| 34 |
import chromadb
|
| 35 |
-
|
| 36 |
-
settings = get_settings()
|
| 37 |
-
chroma_path = settings.chroma_db_dir
|
| 38 |
chroma_path.mkdir(parents=True, exist_ok=True)
|
| 39 |
_chroma_client = chromadb.PersistentClient(path=str(chroma_path))
|
| 40 |
return _chroma_client
|
| 41 |
|
| 42 |
|
| 43 |
def get_embedding_function():
|
| 44 |
-
"""
|
| 45 |
global _embedding_function
|
| 46 |
if _embedding_function is None:
|
| 47 |
from chromadb.utils import embedding_functions
|
| 48 |
-
|
| 49 |
-
|
| 50 |
_embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
|
| 51 |
-
model_name=
|
| 52 |
)
|
| 53 |
return _embedding_function
|
| 54 |
|
| 55 |
|
| 56 |
def get_collection(name: str):
|
| 57 |
-
"""
|
| 58 |
-
Get a ChromaDB collection by name.
|
| 59 |
-
|
| 60 |
-
Returns None if collection doesn't exist.
|
| 61 |
-
"""
|
| 62 |
-
client = get_chroma_client()
|
| 63 |
-
ef = get_embedding_function()
|
| 64 |
-
|
| 65 |
try:
|
| 66 |
-
return
|
| 67 |
except Exception:
|
| 68 |
logger.warning(f"Collection '{name}' not found")
|
| 69 |
return None
|
| 70 |
|
| 71 |
|
| 72 |
-
# =============================================================================
|
| 73 |
-
# COLLECTION-SPECIFIC RETRIEVERS
|
| 74 |
-
# =============================================================================
|
| 75 |
-
|
| 76 |
def search_antibiotic_guidelines(
|
| 77 |
query: str,
|
| 78 |
n_results: int = 5,
|
| 79 |
pathogen_filter: Optional[str] = None,
|
| 80 |
) -> List[Dict[str, Any]]:
|
| 81 |
-
"""
|
| 82 |
-
Search antibiotic treatment guidelines.
|
| 83 |
-
|
| 84 |
-
Args:
|
| 85 |
-
query: Search query
|
| 86 |
-
n_results: Number of results to return
|
| 87 |
-
pathogen_filter: Optional pathogen type filter (e.g., "ESBL-E", "CRE")
|
| 88 |
-
|
| 89 |
-
Returns:
|
| 90 |
-
List of relevant guideline excerpts with metadata
|
| 91 |
-
"""
|
| 92 |
collection = get_collection("idsa_treatment_guidelines")
|
| 93 |
if collection is None:
|
| 94 |
-
logger.warning("idsa_treatment_guidelines collection not available")
|
| 95 |
return []
|
| 96 |
-
|
| 97 |
-
where_filter = None
|
| 98 |
-
if pathogen_filter:
|
| 99 |
-
where_filter = {"pathogen_type": pathogen_filter}
|
| 100 |
-
|
| 101 |
try:
|
|
|
|
| 102 |
results = collection.query(
|
| 103 |
query_texts=[query],
|
| 104 |
n_results=n_results,
|
| 105 |
-
where=
|
| 106 |
include=["documents", "metadatas", "distances"],
|
| 107 |
)
|
|
|
|
| 108 |
except Exception as e:
|
| 109 |
logger.error(f"Error querying guidelines: {e}")
|
| 110 |
return []
|
| 111 |
|
| 112 |
-
return _format_results(results)
|
| 113 |
-
|
| 114 |
|
| 115 |
def search_mic_breakpoints(
|
| 116 |
query: str,
|
|
@@ -118,79 +82,45 @@ def search_mic_breakpoints(
|
|
| 118 |
organism: Optional[str] = None,
|
| 119 |
antibiotic: Optional[str] = None,
|
| 120 |
) -> List[Dict[str, Any]]:
|
| 121 |
-
"""
|
| 122 |
-
Search MIC breakpoint reference documentation.
|
| 123 |
-
|
| 124 |
-
Args:
|
| 125 |
-
query: Search query
|
| 126 |
-
n_results: Number of results
|
| 127 |
-
organism: Optional organism name filter
|
| 128 |
-
antibiotic: Optional antibiotic name filter
|
| 129 |
-
|
| 130 |
-
Returns:
|
| 131 |
-
List of relevant breakpoint information
|
| 132 |
-
"""
|
| 133 |
collection = get_collection("mic_reference_docs")
|
| 134 |
if collection is None:
|
| 135 |
-
logger.warning("mic_reference_docs collection not available")
|
| 136 |
return []
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
enhanced_query = query
|
| 140 |
-
if organism:
|
| 141 |
-
enhanced_query = f"{organism} {enhanced_query}"
|
| 142 |
-
if antibiotic:
|
| 143 |
-
enhanced_query = f"{antibiotic} {enhanced_query}"
|
| 144 |
-
|
| 145 |
try:
|
| 146 |
results = collection.query(
|
| 147 |
query_texts=[enhanced_query],
|
| 148 |
n_results=n_results,
|
| 149 |
include=["documents", "metadatas", "distances"],
|
| 150 |
)
|
|
|
|
| 151 |
except Exception as e:
|
| 152 |
logger.error(f"Error querying breakpoints: {e}")
|
| 153 |
return []
|
| 154 |
|
| 155 |
-
return _format_results(results)
|
| 156 |
-
|
| 157 |
|
| 158 |
def search_drug_safety(
|
| 159 |
query: str,
|
| 160 |
n_results: int = 5,
|
| 161 |
drug_name: Optional[str] = None,
|
| 162 |
) -> List[Dict[str, Any]]:
|
| 163 |
-
"""
|
| 164 |
-
Search drug safety information (interactions, warnings, contraindications).
|
| 165 |
-
|
| 166 |
-
Args:
|
| 167 |
-
query: Search query
|
| 168 |
-
n_results: Number of results
|
| 169 |
-
drug_name: Optional drug name to focus search
|
| 170 |
-
|
| 171 |
-
Returns:
|
| 172 |
-
List of relevant safety information
|
| 173 |
-
"""
|
| 174 |
collection = get_collection("drug_safety")
|
| 175 |
if collection is None:
|
| 176 |
-
# Fallback: try existing collections
|
| 177 |
-
logger.warning("drug_safety collection not available")
|
| 178 |
return []
|
| 179 |
-
|
| 180 |
enhanced_query = f"{drug_name} {query}" if drug_name else query
|
| 181 |
-
|
| 182 |
try:
|
| 183 |
results = collection.query(
|
| 184 |
query_texts=[enhanced_query],
|
| 185 |
n_results=n_results,
|
| 186 |
include=["documents", "metadatas", "distances"],
|
| 187 |
)
|
|
|
|
| 188 |
except Exception as e:
|
| 189 |
logger.error(f"Error querying drug safety: {e}")
|
| 190 |
return []
|
| 191 |
|
| 192 |
-
return _format_results(results)
|
| 193 |
-
|
| 194 |
|
| 195 |
def search_resistance_patterns(
|
| 196 |
query: str,
|
|
@@ -198,45 +128,22 @@ def search_resistance_patterns(
|
|
| 198 |
organism: Optional[str] = None,
|
| 199 |
region: Optional[str] = None,
|
| 200 |
) -> List[Dict[str, Any]]:
|
| 201 |
-
"""
|
| 202 |
-
Search pathogen resistance pattern data.
|
| 203 |
-
|
| 204 |
-
Args:
|
| 205 |
-
query: Search query
|
| 206 |
-
n_results: Number of results
|
| 207 |
-
organism: Optional organism filter
|
| 208 |
-
region: Optional geographic region filter
|
| 209 |
-
|
| 210 |
-
Returns:
|
| 211 |
-
List of relevant resistance data
|
| 212 |
-
"""
|
| 213 |
collection = get_collection("pathogen_resistance")
|
| 214 |
if collection is None:
|
| 215 |
-
logger.warning("pathogen_resistance collection not available")
|
| 216 |
return []
|
| 217 |
-
|
| 218 |
-
enhanced_query = query
|
| 219 |
-
if organism:
|
| 220 |
-
enhanced_query = f"{organism} {enhanced_query}"
|
| 221 |
-
if region:
|
| 222 |
-
enhanced_query = f"{region} {enhanced_query}"
|
| 223 |
-
|
| 224 |
try:
|
| 225 |
results = collection.query(
|
| 226 |
query_texts=[enhanced_query],
|
| 227 |
n_results=n_results,
|
| 228 |
include=["documents", "metadatas", "distances"],
|
| 229 |
)
|
|
|
|
| 230 |
except Exception as e:
|
| 231 |
logger.error(f"Error querying resistance patterns: {e}")
|
| 232 |
return []
|
| 233 |
|
| 234 |
-
return _format_results(results)
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
# =============================================================================
|
| 238 |
-
# UNIFIED CONTEXT RETRIEVER
|
| 239 |
-
# =============================================================================
|
| 240 |
|
| 241 |
def get_context_for_agent(
|
| 242 |
agent_name: str,
|
|
@@ -245,238 +152,104 @@ def get_context_for_agent(
|
|
| 245 |
n_results: int = 3,
|
| 246 |
) -> str:
|
| 247 |
"""
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
This is the main entry point for agents to retrieve context.
|
| 251 |
-
|
| 252 |
-
Args:
|
| 253 |
-
agent_name: Name of the requesting agent
|
| 254 |
-
query: The primary search query
|
| 255 |
-
patient_context: Optional dict with patient-specific info
|
| 256 |
-
n_results: Number of results per collection
|
| 257 |
|
| 258 |
-
|
| 259 |
-
|
|
|
|
|
|
|
|
|
|
| 260 |
"""
|
| 261 |
-
|
| 262 |
-
|
| 263 |
|
| 264 |
if agent_name == "intake_historian":
|
| 265 |
-
|
| 266 |
-
guidelines = search_antibiotic_guidelines(
|
| 267 |
-
query=query,
|
| 268 |
-
n_results=n_results,
|
| 269 |
-
pathogen_filter=patient_context.get("pathogen_type"),
|
| 270 |
-
)
|
| 271 |
if guidelines:
|
| 272 |
-
|
| 273 |
for g in guidelines:
|
| 274 |
-
|
| 275 |
-
|
| 276 |
|
| 277 |
elif agent_name == "vision_specialist":
|
| 278 |
-
|
| 279 |
-
breakpoints = search_mic_breakpoints(
|
| 280 |
-
query=query,
|
| 281 |
-
n_results=n_results,
|
| 282 |
-
organism=patient_context.get("organism"),
|
| 283 |
-
antibiotic=patient_context.get("antibiotic"),
|
| 284 |
-
)
|
| 285 |
if breakpoints:
|
| 286 |
-
|
| 287 |
for b in breakpoints:
|
| 288 |
-
|
| 289 |
|
| 290 |
elif agent_name == "trend_analyst":
|
| 291 |
-
# Get breakpoints and resistance trends
|
| 292 |
breakpoints = search_mic_breakpoints(
|
| 293 |
-
|
| 294 |
-
n_results=n_results,
|
| 295 |
-
)
|
| 296 |
-
resistance = search_resistance_patterns(
|
| 297 |
-
query=query,
|
| 298 |
n_results=n_results,
|
| 299 |
-
organism=patient_context.get("organism"),
|
| 300 |
-
region=patient_context.get("region"),
|
| 301 |
)
|
| 302 |
-
|
| 303 |
if breakpoints:
|
| 304 |
-
|
| 305 |
for b in breakpoints:
|
| 306 |
-
|
| 307 |
-
|
| 308 |
if resistance:
|
| 309 |
-
|
| 310 |
for r in resistance:
|
| 311 |
-
|
| 312 |
|
| 313 |
elif agent_name == "clinical_pharmacologist":
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
query=query,
|
| 317 |
-
n_results=n_results,
|
| 318 |
-
)
|
| 319 |
-
safety = search_drug_safety(
|
| 320 |
-
query=query,
|
| 321 |
-
n_results=n_results,
|
| 322 |
-
drug_name=patient_context.get("proposed_antibiotic"),
|
| 323 |
-
)
|
| 324 |
-
|
| 325 |
if guidelines:
|
| 326 |
-
|
| 327 |
for g in guidelines:
|
| 328 |
-
|
| 329 |
-
|
| 330 |
if safety:
|
| 331 |
-
|
| 332 |
for s in safety:
|
| 333 |
-
|
| 334 |
|
| 335 |
else:
|
| 336 |
-
# Generic retrieval
|
| 337 |
guidelines = search_antibiotic_guidelines(query, n_results=n_results)
|
| 338 |
-
|
| 339 |
-
|
| 340 |
-
context_parts.append(f"- {g['content'][:500]}...")
|
| 341 |
-
|
| 342 |
-
if not context_parts:
|
| 343 |
-
return "No relevant context found in knowledge base."
|
| 344 |
-
|
| 345 |
-
return "\n".join(context_parts)
|
| 346 |
-
|
| 347 |
|
| 348 |
-
|
| 349 |
-
query: str,
|
| 350 |
-
collections: Optional[List[str]] = None,
|
| 351 |
-
n_results_per_collection: int = 3,
|
| 352 |
-
**filters,
|
| 353 |
-
) -> str:
|
| 354 |
-
"""
|
| 355 |
-
Get a combined context string from multiple collections.
|
| 356 |
|
| 357 |
-
This is a simpler interface for general-purpose RAG retrieval.
|
| 358 |
-
|
| 359 |
-
Args:
|
| 360 |
-
query: Search query
|
| 361 |
-
collections: List of collection names to search (defaults to all)
|
| 362 |
-
n_results_per_collection: Results per collection
|
| 363 |
-
**filters: Additional filters (organism, antibiotic, region, etc.)
|
| 364 |
-
|
| 365 |
-
Returns:
|
| 366 |
-
Combined context string
|
| 367 |
-
"""
|
| 368 |
-
default_collections = [
|
| 369 |
-
"idsa_treatment_guidelines",
|
| 370 |
-
"mic_reference_docs",
|
| 371 |
-
]
|
| 372 |
-
collections = collections or default_collections
|
| 373 |
-
|
| 374 |
-
context_parts = []
|
| 375 |
-
|
| 376 |
-
for collection_name in collections:
|
| 377 |
-
if collection_name == "idsa_treatment_guidelines":
|
| 378 |
-
results = search_antibiotic_guidelines(
|
| 379 |
-
query,
|
| 380 |
-
n_results=n_results_per_collection,
|
| 381 |
-
pathogen_filter=filters.get("pathogen_type"),
|
| 382 |
-
)
|
| 383 |
-
elif collection_name == "mic_reference_docs":
|
| 384 |
-
results = search_mic_breakpoints(
|
| 385 |
-
query,
|
| 386 |
-
n_results=n_results_per_collection,
|
| 387 |
-
organism=filters.get("organism"),
|
| 388 |
-
antibiotic=filters.get("antibiotic"),
|
| 389 |
-
)
|
| 390 |
-
elif collection_name == "drug_safety":
|
| 391 |
-
results = search_drug_safety(
|
| 392 |
-
query,
|
| 393 |
-
n_results=n_results_per_collection,
|
| 394 |
-
drug_name=filters.get("drug_name"),
|
| 395 |
-
)
|
| 396 |
-
elif collection_name == "pathogen_resistance":
|
| 397 |
-
results = search_resistance_patterns(
|
| 398 |
-
query,
|
| 399 |
-
n_results=n_results_per_collection,
|
| 400 |
-
organism=filters.get("organism"),
|
| 401 |
-
region=filters.get("region"),
|
| 402 |
-
)
|
| 403 |
-
else:
|
| 404 |
-
continue
|
| 405 |
-
|
| 406 |
-
if results:
|
| 407 |
-
context_parts.append(f"=== {collection_name.upper()} ===")
|
| 408 |
-
for r in results:
|
| 409 |
-
context_parts.append(r["content"])
|
| 410 |
-
context_parts.append(f"[Relevance: {1 - r.get('distance', 0):.2f}]")
|
| 411 |
-
context_parts.append("")
|
| 412 |
-
|
| 413 |
-
return "\n".join(context_parts) if context_parts else "No relevant context found."
|
| 414 |
-
|
| 415 |
-
|
| 416 |
-
# =============================================================================
|
| 417 |
-
# HELPER FUNCTIONS
|
| 418 |
-
# =============================================================================
|
| 419 |
|
| 420 |
def _format_results(results: Dict[str, Any]) -> List[Dict[str, Any]]:
|
| 421 |
-
"""
|
| 422 |
if not results or not results.get("documents"):
|
| 423 |
return []
|
| 424 |
|
| 425 |
-
formatted = []
|
| 426 |
documents = results["documents"][0] if results["documents"] else []
|
| 427 |
metadatas = results.get("metadatas", [[]])[0]
|
| 428 |
distances = results.get("distances", [[]])[0]
|
| 429 |
|
| 430 |
-
|
| 431 |
-
|
| 432 |
"content": doc,
|
| 433 |
"metadata": metadatas[i] if i < len(metadatas) else {},
|
| 434 |
"distance": distances[i] if i < len(distances) else None,
|
| 435 |
"source": metadatas[i].get("source", "Unknown") if i < len(metadatas) else "Unknown",
|
| 436 |
"relevance_score": 1 - (distances[i] if i < len(distances) else 0),
|
| 437 |
-
}
|
| 438 |
-
|
| 439 |
-
|
| 440 |
|
| 441 |
|
| 442 |
def list_available_collections() -> List[str]:
|
| 443 |
-
"""
|
| 444 |
-
client = get_chroma_client()
|
| 445 |
try:
|
| 446 |
-
|
| 447 |
-
return [c.name for c in collections]
|
| 448 |
except Exception as e:
|
| 449 |
logger.error(f"Error listing collections: {e}")
|
| 450 |
return []
|
| 451 |
|
| 452 |
|
| 453 |
def get_collection_info(name: str) -> Optional[Dict[str, Any]]:
|
| 454 |
-
"""
|
| 455 |
collection = get_collection(name)
|
| 456 |
if collection is None:
|
| 457 |
return None
|
| 458 |
-
|
| 459 |
try:
|
| 460 |
-
return {
|
| 461 |
-
"name": collection.name,
|
| 462 |
-
"count": collection.count(),
|
| 463 |
-
"metadata": collection.metadata,
|
| 464 |
-
}
|
| 465 |
except Exception as e:
|
| 466 |
logger.error(f"Error getting collection info: {e}")
|
| 467 |
return None
|
| 468 |
-
|
| 469 |
-
|
| 470 |
-
__all__ = [
|
| 471 |
-
"get_chroma_client",
|
| 472 |
-
"get_embedding_function",
|
| 473 |
-
"get_collection",
|
| 474 |
-
"search_antibiotic_guidelines",
|
| 475 |
-
"search_mic_breakpoints",
|
| 476 |
-
"search_drug_safety",
|
| 477 |
-
"search_resistance_patterns",
|
| 478 |
-
"get_context_for_agent",
|
| 479 |
-
"get_context_string",
|
| 480 |
-
"list_available_collections",
|
| 481 |
-
"get_collection_info",
|
| 482 |
-
]
|
|
|
|
| 1 |
"""
|
| 2 |
+
RAG module for Med-I-C.
|
| 3 |
|
| 4 |
+
Retrieves context from four ChromaDB collections:
|
| 5 |
+
- idsa_treatment_guidelines: IDSA 2024 AMR guidance
|
| 6 |
+
- mic_reference_docs: EUCAST v16.0 breakpoint tables
|
| 7 |
+
- drug_safety: Drug interactions and contraindications
|
| 8 |
+
- pathogen_resistance: ATLAS regional susceptibility data
|
| 9 |
"""
|
| 10 |
|
|
|
|
|
|
|
| 11 |
import logging
|
|
|
|
| 12 |
from typing import Any, Dict, List, Optional
|
| 13 |
|
| 14 |
from .config import get_settings
|
| 15 |
|
| 16 |
logger = logging.getLogger(__name__)
|
| 17 |
|
| 18 |
+
# Module-level singletons; initialized lazily to avoid import-time side effects
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
_chroma_client = None
|
| 20 |
_embedding_function = None
|
| 21 |
|
| 22 |
|
| 23 |
def get_chroma_client():
    """Return the ChromaDB persistent client, creating it on first call."""
    global _chroma_client
    if _chroma_client is not None:
        return _chroma_client

    import chromadb

    db_path = get_settings().chroma_db_dir
    db_path.mkdir(parents=True, exist_ok=True)
    _chroma_client = chromadb.PersistentClient(path=str(db_path))
    return _chroma_client
|
| 32 |
|
| 33 |
|
| 34 |
def get_embedding_function():
    """Return the SentenceTransformer embedding function, creating it on first call."""
    global _embedding_function
    if _embedding_function is not None:
        return _embedding_function

    from chromadb.utils import embedding_functions

    # Use only the model short name (not the full HuggingFace path)
    short_name = get_settings().embedding_model_name.split("/")[-1]
    _embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name=short_name
    )
    return _embedding_function
|
| 45 |
|
| 46 |
|
| 47 |
def get_collection(name: str):
    """Return a ChromaDB collection by name, or None if it does not exist."""
    client = get_chroma_client()
    try:
        return client.get_collection(
            name=name, embedding_function=get_embedding_function()
        )
    except Exception:
        # Chroma raises when the collection is missing; treat as "not found"
        logger.warning(f"Collection '{name}' not found")
        return None
|
| 54 |
|
| 55 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
def search_antibiotic_guidelines(
    query: str,
    n_results: int = 5,
    pathogen_filter: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """Search the IDSA treatment guidelines collection."""
    collection = get_collection("idsa_treatment_guidelines")
    if collection is None:
        return []

    # Optional metadata filter narrows results to one pathogen type
    metadata_filter = {"pathogen_type": pathogen_filter} if pathogen_filter else None
    try:
        raw = collection.query(
            query_texts=[query],
            n_results=n_results,
            where=metadata_filter,
            include=["documents", "metadatas", "distances"],
        )
        return _format_results(raw)
    except Exception as e:
        logger.error(f"Error querying guidelines: {e}")
        return []
|
| 77 |
|
|
|
|
|
|
|
| 78 |
|
| 79 |
def search_mic_breakpoints(
    query: str,
    n_results: int = 5,
    organism: Optional[str] = None,
    antibiotic: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """Search the EUCAST MIC breakpoint reference collection."""
    collection = get_collection("mic_reference_docs")
    if collection is None:
        return []

    # Prepend organism/antibiotic to query to narrow semantic search
    terms = [t for t in (organism, antibiotic, query) if t]
    try:
        raw = collection.query(
            query_texts=[" ".join(terms)],
            n_results=n_results,
            include=["documents", "metadatas", "distances"],
        )
        return _format_results(raw)
    except Exception as e:
        logger.error(f"Error querying breakpoints: {e}")
        return []
|
| 101 |
|
|
|
|
|
|
|
| 102 |
|
| 103 |
def search_drug_safety(
    query: str,
    n_results: int = 5,
    drug_name: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """Search the drug safety collection (interactions, warnings, contraindications)."""
    collection = get_collection("drug_safety")
    if collection is None:
        return []

    # Prefix the drug name (when given) so results stay drug-specific
    search_text = f"{drug_name} {query}" if drug_name else query
    try:
        raw = collection.query(
            query_texts=[search_text],
            n_results=n_results,
            include=["documents", "metadatas", "distances"],
        )
        return _format_results(raw)
    except Exception as e:
        logger.error(f"Error querying drug safety: {e}")
        return []
|
| 123 |
|
|
|
|
|
|
|
| 124 |
|
| 125 |
def search_resistance_patterns(
    query: str,
    n_results: int = 5,
    organism: Optional[str] = None,
    region: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """Search the ATLAS pathogen resistance collection."""
    collection = get_collection("pathogen_resistance")
    if collection is None:
        return []

    # Prepend region/organism to query to narrow semantic search
    terms = [t for t in (region, organism, query) if t]
    try:
        raw = collection.query(
            query_texts=[" ".join(terms)],
            n_results=n_results,
            include=["documents", "metadatas", "distances"],
        )
        return _format_results(raw)
    except Exception as e:
        logger.error(f"Error querying resistance patterns: {e}")
        return []
|
| 146 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 147 |
|
| 148 |
def get_context_for_agent(
    agent_name: str,
    query: str,
    patient_context: Optional[Dict[str, Any]] = None,
    n_results: int = 3,
) -> str:
    """
    Return a formatted context string for a specific agent.

    Each agent draws from the collections most relevant to its task:
    - intake_historian: IDSA guidelines
    - vision_specialist: MIC breakpoints
    - trend_analyst: MIC breakpoints + resistance patterns
    - clinical_pharmacologist: guidelines + drug safety
    """
    info = patient_context or {}
    sections: List[str] = []

    if agent_name == "intake_historian":
        hits = search_antibiotic_guidelines(
            query, n_results=n_results, pathogen_filter=info.get("pathogen_type")
        )
        if hits:
            sections.append("RELEVANT TREATMENT GUIDELINES:")
            for hit in hits:
                sections.append(f"- {hit['content'][:500]}...")
                sections.append(f" [Source: {hit.get('source', 'IDSA Guidelines')}]")

    elif agent_name == "vision_specialist":
        hits = search_mic_breakpoints(
            query,
            n_results=n_results,
            organism=info.get("organism"),
            antibiotic=info.get("antibiotic"),
        )
        if hits:
            sections.append("RELEVANT BREAKPOINT INFORMATION:")
            for hit in hits:
                sections.append(f"- {hit['content'][:400]}...")

    elif agent_name == "trend_analyst":
        bp_hits = search_mic_breakpoints(
            f"breakpoint {info.get('organism', '')} {info.get('antibiotic', '')}",
            n_results=n_results,
        )
        res_hits = search_resistance_patterns(
            query,
            n_results=n_results,
            organism=info.get("organism"),
            region=info.get("region"),
        )
        if bp_hits:
            sections.append("EUCAST BREAKPOINT DATA:")
            for hit in bp_hits:
                sections.append(f"- {hit['content'][:400]}...")
        if res_hits:
            sections.append("\nRESISTANCE PATTERN DATA:")
            for hit in res_hits:
                sections.append(f"- {hit['content'][:400]}...")

    elif agent_name == "clinical_pharmacologist":
        guide_hits = search_antibiotic_guidelines(query, n_results=n_results)
        safety_hits = search_drug_safety(
            query, n_results=n_results, drug_name=info.get("proposed_antibiotic")
        )
        if guide_hits:
            sections.append("TREATMENT GUIDELINES:")
            for hit in guide_hits:
                sections.append(f"- {hit['content'][:400]}...")
        if safety_hits:
            sections.append("\nDRUG SAFETY INFORMATION:")
            for hit in safety_hits:
                sections.append(f"- {hit['content'][:400]}...")

    else:
        # Unknown agent: fall back to the general guideline collection
        for hit in search_antibiotic_guidelines(query, n_results=n_results):
            sections.append(f"- {hit['content'][:500]}...")

    return "\n".join(sections) if sections else "No relevant context found in knowledge base."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 214 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 215 |
|
| 216 |
def _format_results(results: Dict[str, Any]) -> List[Dict[str, Any]]:
|
| 217 |
+
"""Flatten ChromaDB query results into a list of dicts."""
|
| 218 |
if not results or not results.get("documents"):
|
| 219 |
return []
|
| 220 |
|
|
|
|
| 221 |
documents = results["documents"][0] if results["documents"] else []
|
| 222 |
metadatas = results.get("metadatas", [[]])[0]
|
| 223 |
distances = results.get("distances", [[]])[0]
|
| 224 |
|
| 225 |
+
return [
|
| 226 |
+
{
|
| 227 |
"content": doc,
|
| 228 |
"metadata": metadatas[i] if i < len(metadatas) else {},
|
| 229 |
"distance": distances[i] if i < len(distances) else None,
|
| 230 |
"source": metadatas[i].get("source", "Unknown") if i < len(metadatas) else "Unknown",
|
| 231 |
"relevance_score": 1 - (distances[i] if i < len(distances) else 0),
|
| 232 |
+
}
|
| 233 |
+
for i, doc in enumerate(documents)
|
| 234 |
+
]
|
| 235 |
|
| 236 |
|
| 237 |
def list_available_collections() -> List[str]:
    """Return names of all ChromaDB collections that exist."""
    try:
        client = get_chroma_client()
        return [col.name for col in client.list_collections()]
    except Exception as e:
        logger.error(f"Error listing collections: {e}")
        return []
|
| 244 |
|
| 245 |
|
| 246 |
def get_collection_info(name: str) -> Optional[Dict[str, Any]]:
    """Return count and metadata for a collection, or None if it does not exist."""
    collection = get_collection(name)
    if collection is None:
        return None
    try:
        return {
            "name": collection.name,
            "count": collection.count(),
            "metadata": collection.metadata,
        }
    except Exception as e:
        logger.error(f"Error getting collection info: {e}")
        return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/state.py
CHANGED
|
@@ -1,12 +1,9 @@
|
|
| 1 |
|
| 2 |
-
from __future__ import annotations
|
| 3 |
-
|
| 4 |
from typing import Dict, List, Literal, NotRequired, Optional, TypedDict
|
| 5 |
|
| 6 |
|
| 7 |
class LabResult(TypedDict, total=False):
|
| 8 |
-
"""
|
| 9 |
-
|
| 10 |
name: str
|
| 11 |
value: str
|
| 12 |
unit: NotRequired[Optional[str]]
|
|
@@ -15,21 +12,19 @@ class LabResult(TypedDict, total=False):
|
|
| 15 |
|
| 16 |
|
| 17 |
class MICDatum(TypedDict, total=False):
|
| 18 |
-
"""
|
| 19 |
-
|
| 20 |
organism: str
|
| 21 |
antibiotic: str
|
| 22 |
mic_value: str
|
| 23 |
mic_unit: NotRequired[Optional[str]]
|
| 24 |
interpretation: NotRequired[Optional[Literal["S", "I", "R"]]]
|
| 25 |
-
breakpoint_source: NotRequired[Optional[str]] # e.g. EUCAST v16.0
|
| 26 |
year: NotRequired[Optional[int]]
|
| 27 |
-
site: NotRequired[Optional[str]] # e.g. blood, urine
|
| 28 |
|
| 29 |
|
| 30 |
class Recommendation(TypedDict, total=False):
|
| 31 |
"""Final clinical recommendation assembled by Agent 4."""
|
| 32 |
-
|
| 33 |
primary_antibiotic: Optional[str]
|
| 34 |
backup_antibiotic: NotRequired[Optional[str]]
|
| 35 |
dose: Optional[str]
|
|
@@ -43,24 +38,19 @@ class Recommendation(TypedDict, total=False):
|
|
| 43 |
|
| 44 |
class InfectionState(TypedDict, total=False):
|
| 45 |
"""
|
| 46 |
-
|
| 47 |
|
| 48 |
-
All
|
| 49 |
-
Most keys are optional to keep the schema flexible across stages.
|
| 50 |
"""
|
| 51 |
|
| 52 |
-
# ------------------------------------------------------------------
|
| 53 |
# Patient identity & demographics
|
| 54 |
-
# ------------------------------------------------------------------
|
| 55 |
patient_id: NotRequired[Optional[str]]
|
| 56 |
age_years: NotRequired[Optional[float]]
|
| 57 |
sex: NotRequired[Optional[Literal["male", "female", "other", "unknown"]]]
|
| 58 |
weight_kg: NotRequired[Optional[float]]
|
| 59 |
height_cm: NotRequired[Optional[float]]
|
| 60 |
|
| 61 |
-
# ------------------------------------------------------------------
|
| 62 |
# Clinical context
|
| 63 |
-
# ------------------------------------------------------------------
|
| 64 |
suspected_source: NotRequired[Optional[str]] # e.g. "community UTI"
|
| 65 |
comorbidities: NotRequired[List[str]]
|
| 66 |
medications: NotRequired[List[str]]
|
|
@@ -68,58 +58,36 @@ class InfectionState(TypedDict, total=False):
|
|
| 68 |
infection_site: NotRequired[Optional[str]]
|
| 69 |
country_or_region: NotRequired[Optional[str]]
|
| 70 |
|
| 71 |
-
#
|
| 72 |
-
# Renal function / vitals
|
| 73 |
-
# ------------------------------------------------------------------
|
| 74 |
serum_creatinine_mg_dl: NotRequired[Optional[float]]
|
| 75 |
creatinine_clearance_ml_min: NotRequired[Optional[float]]
|
| 76 |
vitals: NotRequired[Dict[str, str]] # flexible key/value, e.g. {"BP": "120/80"}
|
| 77 |
|
| 78 |
-
# ------------------------------------------------------------------
|
| 79 |
# Lab data & MICs
|
| 80 |
-
#
|
| 81 |
-
labs_raw_text: NotRequired[Optional[str]] # raw OCR / PDF text
|
| 82 |
labs_parsed: NotRequired[List[LabResult]]
|
| 83 |
-
|
| 84 |
mic_data: NotRequired[List[MICDatum]]
|
| 85 |
mic_trend_summary: NotRequired[Optional[str]]
|
| 86 |
|
| 87 |
-
#
|
| 88 |
-
# Stage / routing metadata
|
| 89 |
-
# ------------------------------------------------------------------
|
| 90 |
stage: NotRequired[Literal["empirical", "targeted"]]
|
| 91 |
route_to_vision: NotRequired[bool]
|
| 92 |
route_to_trend_analyst: NotRequired[bool]
|
| 93 |
|
| 94 |
-
# ------------------------------------------------------------------
|
| 95 |
# Agent outputs
|
| 96 |
-
#
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
pharmacology_notes: NotRequired[Optional[str]] # Agent 4
|
| 101 |
-
|
| 102 |
recommendation: NotRequired[Optional[Recommendation]]
|
| 103 |
|
| 104 |
-
#
|
| 105 |
-
# RAG / context + safety
|
| 106 |
-
# ------------------------------------------------------------------
|
| 107 |
rag_context: NotRequired[Optional[str]]
|
| 108 |
guideline_sources: NotRequired[List[str]]
|
| 109 |
breakpoint_sources: NotRequired[List[str]]
|
| 110 |
safety_warnings: NotRequired[List[str]]
|
| 111 |
|
| 112 |
-
#
|
| 113 |
-
# Diagnostics / debugging
|
| 114 |
-
# ------------------------------------------------------------------
|
| 115 |
errors: NotRequired[List[str]]
|
| 116 |
debug_log: NotRequired[List[str]]
|
| 117 |
|
| 118 |
-
|
| 119 |
-
__all__ = [
|
| 120 |
-
"LabResult",
|
| 121 |
-
"MICDatum",
|
| 122 |
-
"Recommendation",
|
| 123 |
-
"InfectionState",
|
| 124 |
-
]
|
| 125 |
-
|
|
|
|
| 1 |
|
|
|
|
|
|
|
| 2 |
from typing import Dict, List, Literal, NotRequired, Optional, TypedDict
|
| 3 |
|
| 4 |
|
| 5 |
class LabResult(TypedDict, total=False):
|
| 6 |
+
"""A single lab value with optional reference range and flag."""
|
|
|
|
| 7 |
name: str
|
| 8 |
value: str
|
| 9 |
unit: NotRequired[Optional[str]]
|
|
|
|
| 12 |
|
| 13 |
|
| 14 |
class MICDatum(TypedDict, total=False):
    """A single MIC measurement for one organism–antibiotic pair."""

    organism: str
    antibiotic: str
    # MIC is kept as a string, matching how it arrives from lab reports
    mic_value: str
    mic_unit: NotRequired[Optional[str]]
    # NOTE(review): S/I/R presumably follow EUCAST/CLSI susceptible /
    # intermediate / resistant categories — confirm with the producer
    interpretation: NotRequired[Optional[Literal["S", "I", "R"]]]
    breakpoint_source: NotRequired[Optional[str]]  # e.g. "EUCAST v16.0"
    year: NotRequired[Optional[int]]
    site: NotRequired[Optional[str]]  # e.g. "blood", "urine"
|
| 24 |
|
| 25 |
|
| 26 |
class Recommendation(TypedDict, total=False):
|
| 27 |
"""Final clinical recommendation assembled by Agent 4."""
|
|
|
|
| 28 |
primary_antibiotic: Optional[str]
|
| 29 |
backup_antibiotic: NotRequired[Optional[str]]
|
| 30 |
dose: Optional[str]
|
|
|
|
| 38 |
|
| 39 |
class InfectionState(TypedDict, total=False):
|
| 40 |
"""
|
| 41 |
+
Shared state object passed between all agents in the pipeline.
|
| 42 |
|
| 43 |
+
All keys are optional so each agent only needs to populate its own outputs.
|
|
|
|
| 44 |
"""
|
| 45 |
|
|
|
|
| 46 |
# Patient identity & demographics
|
|
|
|
| 47 |
patient_id: NotRequired[Optional[str]]
|
| 48 |
age_years: NotRequired[Optional[float]]
|
| 49 |
sex: NotRequired[Optional[Literal["male", "female", "other", "unknown"]]]
|
| 50 |
weight_kg: NotRequired[Optional[float]]
|
| 51 |
height_cm: NotRequired[Optional[float]]
|
| 52 |
|
|
|
|
| 53 |
# Clinical context
|
|
|
|
| 54 |
suspected_source: NotRequired[Optional[str]] # e.g. "community UTI"
|
| 55 |
comorbidities: NotRequired[List[str]]
|
| 56 |
medications: NotRequired[List[str]]
|
|
|
|
| 58 |
infection_site: NotRequired[Optional[str]]
|
| 59 |
country_or_region: NotRequired[Optional[str]]
|
| 60 |
|
| 61 |
+
# Renal function & vitals
|
|
|
|
|
|
|
| 62 |
serum_creatinine_mg_dl: NotRequired[Optional[float]]
|
| 63 |
creatinine_clearance_ml_min: NotRequired[Optional[float]]
|
| 64 |
vitals: NotRequired[Dict[str, str]] # flexible key/value, e.g. {"BP": "120/80"}
|
| 65 |
|
|
|
|
| 66 |
# Lab data & MICs
|
| 67 |
+
labs_raw_text: NotRequired[Optional[str]] # raw OCR or PDF text
|
|
|
|
| 68 |
labs_parsed: NotRequired[List[LabResult]]
|
|
|
|
| 69 |
mic_data: NotRequired[List[MICDatum]]
|
| 70 |
mic_trend_summary: NotRequired[Optional[str]]
|
| 71 |
|
| 72 |
+
# Routing flags set by agents
|
|
|
|
|
|
|
| 73 |
stage: NotRequired[Literal["empirical", "targeted"]]
|
| 74 |
route_to_vision: NotRequired[bool]
|
| 75 |
route_to_trend_analyst: NotRequired[bool]
|
| 76 |
|
|
|
|
| 77 |
# Agent outputs
|
| 78 |
+
intake_notes: NotRequired[Optional[str]] # Agent 1
|
| 79 |
+
vision_notes: NotRequired[Optional[str]] # Agent 2
|
| 80 |
+
trend_notes: NotRequired[Optional[str]] # Agent 3
|
| 81 |
+
pharmacology_notes: NotRequired[Optional[str]] # Agent 4
|
|
|
|
|
|
|
| 82 |
recommendation: NotRequired[Optional[Recommendation]]
|
| 83 |
|
| 84 |
+
# RAG context & safety
|
|
|
|
|
|
|
| 85 |
rag_context: NotRequired[Optional[str]]
|
| 86 |
guideline_sources: NotRequired[List[str]]
|
| 87 |
breakpoint_sources: NotRequired[List[str]]
|
| 88 |
safety_warnings: NotRequired[List[str]]
|
| 89 |
|
| 90 |
+
# Diagnostics
|
|
|
|
|
|
|
| 91 |
errors: NotRequired[List[str]]
|
| 92 |
debug_log: NotRequired[List[str]]
|
| 93 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/tools/rag_tools.py
CHANGED
|
@@ -1,6 +1,5 @@
|
|
| 1 |
"""RAG tools for querying clinical guidelines via ChromaDB."""
|
| 2 |
|
| 3 |
-
from typing import Optional
|
| 4 |
from src.db.vector_store import search_guidelines, search_mic_reference
|
| 5 |
|
| 6 |
|
|
|
|
| 1 |
"""RAG tools for querying clinical guidelines via ChromaDB."""
|
| 2 |
|
|
|
|
| 3 |
from src.db.vector_store import search_guidelines, search_mic_reference
|
| 4 |
|
| 5 |
|
src/utils.py
CHANGED
|
@@ -1,23 +1,19 @@
|
|
| 1 |
"""
|
| 2 |
-
Utility functions for
|
| 3 |
|
| 4 |
-
|
| 5 |
-
- Creatinine Clearance (CrCl) calculator
|
| 6 |
- MIC trend analysis and creep detection
|
| 7 |
- Prescription card formatter
|
| 8 |
-
-
|
| 9 |
"""
|
| 10 |
|
| 11 |
-
from __future__ import annotations
|
| 12 |
-
|
| 13 |
import json
|
| 14 |
import math
|
|
|
|
| 15 |
from typing import Any, Dict, List, Literal, Optional, Tuple
|
| 16 |
|
| 17 |
|
| 18 |
-
#
|
| 19 |
-
# CREATININE CLEARANCE CALCULATOR
|
| 20 |
-
# =============================================================================
|
| 21 |
|
| 22 |
def calculate_crcl(
|
| 23 |
age_years: float,
|
|
@@ -28,40 +24,25 @@ def calculate_crcl(
|
|
| 28 |
height_cm: Optional[float] = None,
|
| 29 |
) -> float:
|
| 30 |
"""
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
Formula:
|
| 34 |
-
CrCl = [(140 - age) × weight × (0.85 if female)] / (72 × SCr)
|
| 35 |
|
| 36 |
-
|
| 37 |
-
age_years: Patient age in years
|
| 38 |
-
weight_kg: Actual body weight in kg
|
| 39 |
-
serum_creatinine_mg_dl: Serum creatinine in mg/dL
|
| 40 |
-
sex: Patient sex ("male" or "female")
|
| 41 |
-
use_ibw: If True, use Ideal Body Weight instead of actual weight
|
| 42 |
-
height_cm: Height in cm (required if use_ibw=True)
|
| 43 |
|
| 44 |
-
|
| 45 |
-
|
|
|
|
| 46 |
"""
|
| 47 |
if serum_creatinine_mg_dl <= 0:
|
| 48 |
raise ValueError("Serum creatinine must be positive")
|
| 49 |
-
|
| 50 |
if age_years <= 0 or weight_kg <= 0:
|
| 51 |
raise ValueError("Age and weight must be positive")
|
| 52 |
|
| 53 |
-
# Calculate weight to use
|
| 54 |
weight = weight_kg
|
| 55 |
if use_ibw and height_cm:
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
if weight_kg > weight * 1.3:
|
| 59 |
-
weight = calculate_adjusted_bw(weight, weight_kg)
|
| 60 |
|
| 61 |
-
# Cockcroft-Gault equation
|
| 62 |
crcl = ((140 - age_years) * weight) / (72 * serum_creatinine_mg_dl)
|
| 63 |
-
|
| 64 |
-
# Apply sex factor
|
| 65 |
if sex == "female":
|
| 66 |
crcl *= 0.85
|
| 67 |
|
|
@@ -70,42 +51,27 @@ def calculate_crcl(
|
|
| 70 |
|
| 71 |
def calculate_ibw(height_cm: float, sex: Literal["male", "female"]) -> float:
|
| 72 |
"""
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
Args:
|
| 76 |
-
height_cm: Height in centimeters
|
| 77 |
-
sex: Patient sex
|
| 78 |
|
| 79 |
-
|
| 80 |
-
|
| 81 |
"""
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
if sex == "male":
|
| 86 |
-
ibw = 50 + 2.3 * height_over_60
|
| 87 |
-
else:
|
| 88 |
-
ibw = 45.5 + 2.3 * height_over_60
|
| 89 |
-
|
| 90 |
-
return round(ibw, 1)
|
| 91 |
|
| 92 |
|
| 93 |
def calculate_adjusted_bw(ibw: float, actual_weight: float) -> float:
    """Return adjusted body weight: IBW plus 40% of the excess over IBW, rounded to 1 dp."""
    excess = actual_weight - ibw
    return round(ibw + 0.4 * excess, 1)
|
| 100 |
|
| 101 |
|
| 102 |
def get_renal_dose_category(crcl: float) -> str:
|
| 103 |
-
"""
|
| 104 |
-
Categorize renal function for dosing purposes.
|
| 105 |
-
|
| 106 |
-
Returns:
|
| 107 |
-
Renal function category
|
| 108 |
-
"""
|
| 109 |
if crcl >= 90:
|
| 110 |
return "normal"
|
| 111 |
elif crcl >= 60:
|
|
@@ -118,9 +84,7 @@ def get_renal_dose_category(crcl: float) -> str:
|
|
| 118 |
return "esrd"
|
| 119 |
|
| 120 |
|
| 121 |
-
#
|
| 122 |
-
# MIC TREND ANALYSIS
|
| 123 |
-
# =============================================================================
|
| 124 |
|
| 125 |
def calculate_mic_trend(
|
| 126 |
mic_values: List[Dict[str, Any]],
|
|
@@ -128,15 +92,11 @@ def calculate_mic_trend(
|
|
| 128 |
resistant_breakpoint: Optional[float] = None,
|
| 129 |
) -> Dict[str, Any]:
|
| 130 |
"""
|
| 131 |
-
Analyze
|
| 132 |
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
resistant_breakpoint: R breakpoint (optional)
|
| 137 |
-
|
| 138 |
-
Returns:
|
| 139 |
-
Dict with trend analysis results
|
| 140 |
"""
|
| 141 |
if len(mic_values) < 2:
|
| 142 |
return {
|
|
@@ -145,49 +105,28 @@ def calculate_mic_trend(
|
|
| 145 |
"alert": "Need at least 2 MIC values for trend analysis",
|
| 146 |
}
|
| 147 |
|
| 148 |
-
# Extract MIC values
|
| 149 |
mics = [float(v["mic_value"]) for v in mic_values]
|
| 150 |
-
|
| 151 |
baseline_mic = mics[0]
|
| 152 |
current_mic = mics[-1]
|
|
|
|
| 153 |
|
| 154 |
-
# Calculate fold change
|
| 155 |
-
if baseline_mic > 0:
|
| 156 |
-
fold_change = current_mic / baseline_mic
|
| 157 |
-
else:
|
| 158 |
-
fold_change = float("inf")
|
| 159 |
-
|
| 160 |
-
# Calculate trend
|
| 161 |
if len(mics) >= 3:
|
| 162 |
-
# Linear regression slope
|
| 163 |
n = len(mics)
|
| 164 |
x_mean = (n - 1) / 2
|
| 165 |
y_mean = sum(mics) / n
|
| 166 |
numerator = sum((i - x_mean) * (mics[i] - y_mean) for i in range(n))
|
| 167 |
denominator = sum((i - x_mean) ** 2 for i in range(n))
|
| 168 |
slope = numerator / denominator if denominator != 0 else 0
|
| 169 |
-
|
| 170 |
-
if slope > 0.5:
|
| 171 |
-
trend = "increasing"
|
| 172 |
-
elif slope < -0.5:
|
| 173 |
-
trend = "decreasing"
|
| 174 |
-
else:
|
| 175 |
-
trend = "stable"
|
| 176 |
else:
|
| 177 |
-
if current_mic > baseline_mic * 1.5
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
trend = "decreasing"
|
| 181 |
-
else:
|
| 182 |
-
trend = "stable"
|
| 183 |
-
|
| 184 |
-
# Calculate resistance velocity (fold change per time point)
|
| 185 |
velocity = fold_change ** (1 / (len(mics) - 1)) if len(mics) > 1 else 1.0
|
| 186 |
|
| 187 |
-
# Determine risk level
|
| 188 |
risk_level, alert = _assess_mic_risk(
|
| 189 |
current_mic, baseline_mic, fold_change, trend,
|
| 190 |
-
susceptible_breakpoint, resistant_breakpoint
|
| 191 |
)
|
| 192 |
|
| 193 |
return {
|
|
@@ -211,51 +150,39 @@ def _assess_mic_risk(
|
|
| 211 |
r_breakpoint: Optional[float],
|
| 212 |
) -> Tuple[str, str]:
|
| 213 |
"""
|
| 214 |
-
|
| 215 |
|
| 216 |
-
|
| 217 |
-
|
| 218 |
"""
|
| 219 |
-
# If we have breakpoints, use them for risk assessment
|
| 220 |
if s_breakpoint is not None and r_breakpoint is not None:
|
| 221 |
margin = s_breakpoint / current_mic if current_mic > 0 else float("inf")
|
| 222 |
|
| 223 |
if current_mic > r_breakpoint:
|
| 224 |
return "CRITICAL", f"MIC ({current_mic}) exceeds resistant breakpoint ({r_breakpoint}). Organism is RESISTANT."
|
| 225 |
-
|
| 226 |
if current_mic > s_breakpoint:
|
| 227 |
return "HIGH", f"MIC ({current_mic}) exceeds susceptible breakpoint ({s_breakpoint}). Consider alternative therapy."
|
| 228 |
-
|
| 229 |
if margin < 2:
|
| 230 |
if trend == "increasing":
|
| 231 |
return "HIGH", f"MIC approaching breakpoint (margin: {margin:.1f}x) with increasing trend. High risk of resistance emergence."
|
| 232 |
-
|
| 233 |
-
return "MODERATE", f"MIC close to breakpoint (margin: {margin:.1f}x). Monitor closely."
|
| 234 |
-
|
| 235 |
if margin < 4:
|
| 236 |
if trend == "increasing":
|
| 237 |
return "MODERATE", f"MIC rising with {margin:.1f}x margin to breakpoint. Consider enhanced monitoring."
|
| 238 |
-
|
| 239 |
-
return "LOW", "MIC stable with adequate margin to breakpoint."
|
| 240 |
-
|
| 241 |
return "LOW", "MIC well below breakpoint with good safety margin."
|
| 242 |
|
| 243 |
-
#
|
| 244 |
if fold_change >= 8:
|
| 245 |
return "CRITICAL", f"MIC increased {fold_change:.1f}-fold from baseline. Urgent review needed."
|
| 246 |
-
|
| 247 |
if fold_change >= 4:
|
| 248 |
return "HIGH", f"MIC increased {fold_change:.1f}-fold from baseline. High risk of treatment failure."
|
| 249 |
-
|
| 250 |
if fold_change >= 2:
|
| 251 |
if trend == "increasing":
|
| 252 |
return "MODERATE", f"MIC increased {fold_change:.1f}-fold with rising trend. Enhanced monitoring recommended."
|
| 253 |
-
|
| 254 |
-
return "LOW", f"MIC increased {fold_change:.1f}-fold but trend is {trend}."
|
| 255 |
-
|
| 256 |
if trend == "increasing":
|
| 257 |
return "MODERATE", "MIC showing upward trend. Continue monitoring."
|
| 258 |
-
|
| 259 |
return "LOW", "MIC stable or decreasing. Current therapy appropriate."
|
| 260 |
|
| 261 |
|
|
@@ -268,58 +195,37 @@ def detect_mic_creep(
|
|
| 268 |
"""
|
| 269 |
Detect MIC creep for a specific organism-antibiotic pair.
|
| 270 |
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
antibiotic: Antibiotic name
|
| 274 |
-
mic_history: Historical MIC values with dates
|
| 275 |
-
breakpoints: Dict with 'susceptible' and 'resistant' keys
|
| 276 |
-
|
| 277 |
-
Returns:
|
| 278 |
-
Comprehensive MIC creep analysis
|
| 279 |
"""
|
| 280 |
-
|
| 281 |
mic_history,
|
| 282 |
susceptible_breakpoint=breakpoints.get("susceptible"),
|
| 283 |
resistant_breakpoint=breakpoints.get("resistant"),
|
| 284 |
)
|
| 285 |
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
trend_analysis["breakpoint_resistant"] = breakpoints.get("resistant")
|
| 291 |
|
| 292 |
-
#
|
| 293 |
-
if
|
| 294 |
-
current =
|
| 295 |
s_bp = breakpoints.get("susceptible")
|
| 296 |
if s_bp and current < s_bp:
|
| 297 |
-
# Estimate doublings needed to reach breakpoint
|
| 298 |
doublings_needed = math.log2(s_bp / current) if current > 0 else 0
|
| 299 |
-
|
| 300 |
-
if
|
| 301 |
-
|
| 302 |
-
if log_velocity > 0:
|
| 303 |
-
time_estimate = doublings_needed / log_velocity
|
| 304 |
-
trend_analysis["estimated_readings_to_resistance"] = round(time_estimate, 1)
|
| 305 |
|
| 306 |
-
return
|
| 307 |
|
| 308 |
|
| 309 |
-
#
|
| 310 |
-
# PRESCRIPTION FORMATTER
|
| 311 |
-
# =============================================================================
|
| 312 |
|
| 313 |
def format_prescription_card(recommendation: Dict[str, Any]) -> str:
|
| 314 |
-
"""
|
| 315 |
-
Format a recommendation into a readable prescription card.
|
| 316 |
-
|
| 317 |
-
Args:
|
| 318 |
-
recommendation: Dict with recommendation details
|
| 319 |
-
|
| 320 |
-
Returns:
|
| 321 |
-
Formatted prescription card as string
|
| 322 |
-
"""
|
| 323 |
lines = []
|
| 324 |
lines.append("=" * 50)
|
| 325 |
lines.append("ANTIBIOTIC PRESCRIPTION")
|
|
@@ -336,14 +242,12 @@ def format_prescription_card(recommendation: Dict[str, Any]) -> str:
|
|
| 336 |
if primary.get("aware_category"):
|
| 337 |
lines.append(f"WHO AWaRe: {primary.get('aware_category')}")
|
| 338 |
|
| 339 |
-
# Dose adjustments
|
| 340 |
adjustments = recommendation.get("dose_adjustments", {})
|
| 341 |
if adjustments.get("renal") and adjustments["renal"] != "None needed":
|
| 342 |
lines.append(f"\nRENAL ADJUSTMENT: {adjustments['renal']}")
|
| 343 |
if adjustments.get("hepatic") and adjustments["hepatic"] != "None needed":
|
| 344 |
lines.append(f"HEPATIC ADJUSTMENT: {adjustments['hepatic']}")
|
| 345 |
|
| 346 |
-
# Safety alerts
|
| 347 |
alerts = recommendation.get("safety_alerts", [])
|
| 348 |
if alerts:
|
| 349 |
lines.append("\n" + "-" * 50)
|
|
@@ -353,7 +257,6 @@ def format_prescription_card(recommendation: Dict[str, Any]) -> str:
|
|
| 353 |
marker = {"CRITICAL": "[!!!]", "WARNING": "[!!]", "INFO": "[i]"}.get(level, "[?]")
|
| 354 |
lines.append(f" {marker} {alert.get('message', '')}")
|
| 355 |
|
| 356 |
-
# Monitoring
|
| 357 |
monitoring = recommendation.get("monitoring_parameters", [])
|
| 358 |
if monitoring:
|
| 359 |
lines.append("\n" + "-" * 50)
|
|
@@ -361,47 +264,33 @@ def format_prescription_card(recommendation: Dict[str, Any]) -> str:
|
|
| 361 |
for param in monitoring:
|
| 362 |
lines.append(f" - {param}")
|
| 363 |
|
| 364 |
-
# Rationale
|
| 365 |
if recommendation.get("rationale"):
|
| 366 |
lines.append("\n" + "-" * 50)
|
| 367 |
lines.append("RATIONALE:")
|
| 368 |
lines.append(f" {recommendation['rationale']}")
|
| 369 |
|
| 370 |
lines.append("\n" + "=" * 50)
|
| 371 |
-
|
| 372 |
return "\n".join(lines)
|
| 373 |
|
| 374 |
|
| 375 |
-
#
|
| 376 |
-
# JSON PARSING HELPERS
|
| 377 |
-
# =============================================================================
|
| 378 |
|
| 379 |
def safe_json_parse(text: str) -> Optional[Dict[str, Any]]:
|
| 380 |
"""
|
| 381 |
-
|
| 382 |
|
| 383 |
-
|
| 384 |
-
|
| 385 |
"""
|
| 386 |
if not text:
|
| 387 |
return None
|
| 388 |
|
| 389 |
-
# Try direct parse first
|
| 390 |
try:
|
| 391 |
return json.loads(text)
|
| 392 |
except json.JSONDecodeError:
|
| 393 |
pass
|
| 394 |
|
| 395 |
-
|
| 396 |
-
import re
|
| 397 |
-
|
| 398 |
-
json_patterns = [
|
| 399 |
-
r"```json\s*\n?(.*?)\n?```", # ```json ... ```
|
| 400 |
-
r"```\s*\n?(.*?)\n?```", # ``` ... ```
|
| 401 |
-
r"\{[\s\S]*\}", # Raw JSON object
|
| 402 |
-
]
|
| 403 |
-
|
| 404 |
-
for pattern in json_patterns:
|
| 405 |
match = re.search(pattern, text, re.DOTALL)
|
| 406 |
if match:
|
| 407 |
try:
|
|
@@ -414,29 +303,15 @@ def safe_json_parse(text: str) -> Optional[Dict[str, Any]]:
|
|
| 414 |
|
| 415 |
|
| 416 |
def validate_agent_output(output: Dict[str, Any], required_fields: List[str]) -> Tuple[bool, List[str]]:
|
| 417 |
-
"""
|
| 418 |
-
|
| 419 |
-
|
| 420 |
-
Args:
|
| 421 |
-
output: Agent output dict
|
| 422 |
-
required_fields: List of required field names
|
| 423 |
-
|
| 424 |
-
Returns:
|
| 425 |
-
Tuple of (is_valid, list_of_missing_fields)
|
| 426 |
-
"""
|
| 427 |
-
missing = [field for field in required_fields if field not in output]
|
| 428 |
return len(missing) == 0, missing
|
| 429 |
|
| 430 |
|
| 431 |
-
#
|
| 432 |
-
# DATA NORMALIZATION
|
| 433 |
-
# =============================================================================
|
| 434 |
|
| 435 |
def normalize_antibiotic_name(name: str) -> str:
|
| 436 |
-
"""
|
| 437 |
-
Normalize antibiotic name to standard format.
|
| 438 |
-
"""
|
| 439 |
-
# Common name mappings
|
| 440 |
mappings = {
|
| 441 |
"amox": "amoxicillin",
|
| 442 |
"amox/clav": "amoxicillin-clavulanate",
|
|
@@ -459,18 +334,11 @@ def normalize_antibiotic_name(name: str) -> str:
|
|
| 459 |
"cefepime": "cefepime",
|
| 460 |
"maxipime": "cefepime",
|
| 461 |
}
|
| 462 |
-
|
| 463 |
-
normalized = name.lower().strip()
|
| 464 |
-
return mappings.get(normalized, normalized)
|
| 465 |
|
| 466 |
|
| 467 |
def normalize_organism_name(name: str) -> str:
|
| 468 |
-
"""
|
| 469 |
-
Normalize organism name to standard format.
|
| 470 |
-
"""
|
| 471 |
-
name = name.strip()
|
| 472 |
-
|
| 473 |
-
# Common abbreviations
|
| 474 |
abbreviations = {
|
| 475 |
"e. coli": "Escherichia coli",
|
| 476 |
"e.coli": "Escherichia coli",
|
|
@@ -485,21 +353,4 @@ def normalize_organism_name(name: str) -> str:
|
|
| 485 |
"enterococcus": "Enterococcus species",
|
| 486 |
"vre": "Enterococcus (VRE)",
|
| 487 |
}
|
| 488 |
-
|
| 489 |
-
lower_name = name.lower()
|
| 490 |
-
return abbreviations.get(lower_name, name)
|
| 491 |
-
|
| 492 |
-
|
| 493 |
-
__all__ = [
|
| 494 |
-
"calculate_crcl",
|
| 495 |
-
"calculate_ibw",
|
| 496 |
-
"calculate_adjusted_bw",
|
| 497 |
-
"get_renal_dose_category",
|
| 498 |
-
"calculate_mic_trend",
|
| 499 |
-
"detect_mic_creep",
|
| 500 |
-
"format_prescription_card",
|
| 501 |
-
"safe_json_parse",
|
| 502 |
-
"validate_agent_output",
|
| 503 |
-
"normalize_antibiotic_name",
|
| 504 |
-
"normalize_organism_name",
|
| 505 |
-
]
|
|
|
|
| 1 |
"""
|
| 2 |
+
Utility functions for clinical calculations and data parsing.
|
| 3 |
|
| 4 |
+
- Creatinine Clearance (CrCl) via Cockcroft-Gault
|
|
|
|
| 5 |
- MIC trend analysis and creep detection
|
| 6 |
- Prescription card formatter
|
| 7 |
+
- JSON parsing and data normalization helpers
|
| 8 |
"""
|
| 9 |
|
|
|
|
|
|
|
| 10 |
import json
|
| 11 |
import math
|
| 12 |
+
import re
|
| 13 |
from typing import Any, Dict, List, Literal, Optional, Tuple
|
| 14 |
|
| 15 |
|
| 16 |
+
# --- CrCl calculator ---
|
|
|
|
|
|
|
| 17 |
|
| 18 |
def calculate_crcl(
|
| 19 |
age_years: float,
|
|
|
|
| 24 |
height_cm: Optional[float] = None,
|
| 25 |
) -> float:
|
| 26 |
"""
|
| 27 |
+
Cockcroft-Gault equation.
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
+
CrCl = [(140 - age) × weight × (0.85 if female)] / (72 × SCr)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
|
| 31 |
+
When use_ibw=True and height is given, uses Ideal Body Weight.
|
| 32 |
+
For obese patients (actual > 1.3 × IBW), switches to Adjusted Body Weight.
|
| 33 |
+
Returns CrCl in mL/min.
|
| 34 |
"""
|
| 35 |
if serum_creatinine_mg_dl <= 0:
|
| 36 |
raise ValueError("Serum creatinine must be positive")
|
|
|
|
| 37 |
if age_years <= 0 or weight_kg <= 0:
|
| 38 |
raise ValueError("Age and weight must be positive")
|
| 39 |
|
|
|
|
| 40 |
weight = weight_kg
|
| 41 |
if use_ibw and height_cm:
|
| 42 |
+
ibw = calculate_ibw(height_cm, sex)
|
| 43 |
+
weight = calculate_adjusted_bw(ibw, weight_kg) if weight_kg > ibw * 1.3 else ibw
|
|
|
|
|
|
|
| 44 |
|
|
|
|
| 45 |
crcl = ((140 - age_years) * weight) / (72 * serum_creatinine_mg_dl)
|
|
|
|
|
|
|
| 46 |
if sex == "female":
|
| 47 |
crcl *= 0.85
|
| 48 |
|
|
|
|
| 51 |
|
| 52 |
def calculate_ibw(height_cm: float, sex: Literal["male", "female"]) -> float:
    """
    Devine formula for Ideal Body Weight.

    Male:   50 kg + 2.3 kg per inch over 5 feet
    Female: 45.5 kg + 2.3 kg per inch over 5 feet

    Args:
        height_cm: Patient height in centimeters; must be positive.
        sex: "male" or "female".

    Returns:
        Ideal body weight in kg, rounded to 1 decimal place.

    Raises:
        ValueError: If height_cm is not positive (mirrors the input
            validation style used by calculate_crcl).
    """
    if height_cm <= 0:
        raise ValueError("Height must be positive")
    # Inches above 5 feet; clamped at 0 so very short patients
    # receive the base weight rather than a negative adjustment.
    height_over_60_inches = max(0, height_cm / 2.54 - 60)
    base = 50 if sex == "male" else 45.5
    return round(base + 2.3 * height_over_60_inches, 1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
|
| 63 |
|
| 64 |
def calculate_adjusted_bw(ibw: float, actual_weight: float) -> float:
    """
    Adjusted Body Weight for obese patients.

    AdjBW = IBW + 0.4 × (Actual - IBW)

    Args:
        ibw: Ideal body weight in kg.
        actual_weight: Actual (measured) body weight in kg.

    Returns:
        Adjusted body weight in kg, rounded to 1 decimal place.
    """
    # Only 40% of the weight above ideal contributes to the dosing weight.
    excess_weight = actual_weight - ibw
    adjusted = ibw + 0.4 * excess_weight
    return round(adjusted, 1)
|
| 71 |
|
| 72 |
|
| 73 |
def get_renal_dose_category(crcl: float) -> str:
|
| 74 |
+
"""Map CrCl value to a dosing category string."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
if crcl >= 90:
|
| 76 |
return "normal"
|
| 77 |
elif crcl >= 60:
|
|
|
|
| 84 |
return "esrd"
|
| 85 |
|
| 86 |
|
| 87 |
+
# --- MIC trend analysis ---
|
|
|
|
|
|
|
| 88 |
|
| 89 |
def calculate_mic_trend(
|
| 90 |
mic_values: List[Dict[str, Any]],
|
|
|
|
| 92 |
resistant_breakpoint: Optional[float] = None,
|
| 93 |
) -> Dict[str, Any]:
|
| 94 |
"""
|
| 95 |
+
Analyze a list of MIC readings over time.
|
| 96 |
|
| 97 |
+
Requires at least 2 readings. Uses linear regression slope for trend
|
| 98 |
+
direction when >= 3 points are available; falls back to ratio comparison
|
| 99 |
+
for exactly 2 points.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
"""
|
| 101 |
if len(mic_values) < 2:
|
| 102 |
return {
|
|
|
|
| 105 |
"alert": "Need at least 2 MIC values for trend analysis",
|
| 106 |
}
|
| 107 |
|
|
|
|
| 108 |
mics = [float(v["mic_value"]) for v in mic_values]
|
|
|
|
| 109 |
baseline_mic = mics[0]
|
| 110 |
current_mic = mics[-1]
|
| 111 |
+
fold_change = (current_mic / baseline_mic) if baseline_mic > 0 else float("inf")
|
| 112 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
if len(mics) >= 3:
|
|
|
|
| 114 |
n = len(mics)
|
| 115 |
x_mean = (n - 1) / 2
|
| 116 |
y_mean = sum(mics) / n
|
| 117 |
numerator = sum((i - x_mean) * (mics[i] - y_mean) for i in range(n))
|
| 118 |
denominator = sum((i - x_mean) ** 2 for i in range(n))
|
| 119 |
slope = numerator / denominator if denominator != 0 else 0
|
| 120 |
+
trend = "increasing" if slope > 0.5 else "decreasing" if slope < -0.5 else "stable"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
else:
|
| 122 |
+
trend = "increasing" if current_mic > baseline_mic * 1.5 else "decreasing" if current_mic < baseline_mic * 0.67 else "stable"
|
| 123 |
+
|
| 124 |
+
# Fold change per time step (geometric rate of change)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
velocity = fold_change ** (1 / (len(mics) - 1)) if len(mics) > 1 else 1.0
|
| 126 |
|
|
|
|
| 127 |
risk_level, alert = _assess_mic_risk(
|
| 128 |
current_mic, baseline_mic, fold_change, trend,
|
| 129 |
+
susceptible_breakpoint, resistant_breakpoint,
|
| 130 |
)
|
| 131 |
|
| 132 |
return {
|
|
|
|
| 150 |
r_breakpoint: Optional[float],
|
| 151 |
) -> Tuple[str, str]:
|
| 152 |
"""
|
| 153 |
+
Assign a risk level (LOW/MODERATE/HIGH/CRITICAL) based on breakpoints and fold change.
|
| 154 |
|
| 155 |
+
Prefers breakpoint-based assessment when breakpoints are available.
|
| 156 |
+
Falls back to fold-change thresholds otherwise.
|
| 157 |
"""
|
|
|
|
| 158 |
if s_breakpoint is not None and r_breakpoint is not None:
|
| 159 |
margin = s_breakpoint / current_mic if current_mic > 0 else float("inf")
|
| 160 |
|
| 161 |
if current_mic > r_breakpoint:
|
| 162 |
return "CRITICAL", f"MIC ({current_mic}) exceeds resistant breakpoint ({r_breakpoint}). Organism is RESISTANT."
|
|
|
|
| 163 |
if current_mic > s_breakpoint:
|
| 164 |
return "HIGH", f"MIC ({current_mic}) exceeds susceptible breakpoint ({s_breakpoint}). Consider alternative therapy."
|
|
|
|
| 165 |
if margin < 2:
|
| 166 |
if trend == "increasing":
|
| 167 |
return "HIGH", f"MIC approaching breakpoint (margin: {margin:.1f}x) with increasing trend. High risk of resistance emergence."
|
| 168 |
+
return "MODERATE", f"MIC close to breakpoint (margin: {margin:.1f}x). Monitor closely."
|
|
|
|
|
|
|
| 169 |
if margin < 4:
|
| 170 |
if trend == "increasing":
|
| 171 |
return "MODERATE", f"MIC rising with {margin:.1f}x margin to breakpoint. Consider enhanced monitoring."
|
| 172 |
+
return "LOW", "MIC stable with adequate margin to breakpoint."
|
|
|
|
|
|
|
| 173 |
return "LOW", "MIC well below breakpoint with good safety margin."
|
| 174 |
|
| 175 |
+
# No breakpoints — use fold change thresholds from EUCAST MIC creep criteria
|
| 176 |
if fold_change >= 8:
|
| 177 |
return "CRITICAL", f"MIC increased {fold_change:.1f}-fold from baseline. Urgent review needed."
|
|
|
|
| 178 |
if fold_change >= 4:
|
| 179 |
return "HIGH", f"MIC increased {fold_change:.1f}-fold from baseline. High risk of treatment failure."
|
|
|
|
| 180 |
if fold_change >= 2:
|
| 181 |
if trend == "increasing":
|
| 182 |
return "MODERATE", f"MIC increased {fold_change:.1f}-fold with rising trend. Enhanced monitoring recommended."
|
| 183 |
+
return "LOW", f"MIC increased {fold_change:.1f}-fold but trend is {trend}."
|
|
|
|
|
|
|
| 184 |
if trend == "increasing":
|
| 185 |
return "MODERATE", "MIC showing upward trend. Continue monitoring."
|
|
|
|
| 186 |
return "LOW", "MIC stable or decreasing. Current therapy appropriate."
|
| 187 |
|
| 188 |
|
|
|
|
| 195 |
"""
|
| 196 |
Detect MIC creep for a specific organism-antibiotic pair.
|
| 197 |
|
| 198 |
+
Augments calculate_mic_trend with a time-to-resistance estimate
|
| 199 |
+
when the MIC is rising and a susceptible breakpoint is available.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 200 |
"""
|
| 201 |
+
result = calculate_mic_trend(
|
| 202 |
mic_history,
|
| 203 |
susceptible_breakpoint=breakpoints.get("susceptible"),
|
| 204 |
resistant_breakpoint=breakpoints.get("resistant"),
|
| 205 |
)
|
| 206 |
|
| 207 |
+
result["organism"] = organism
|
| 208 |
+
result["antibiotic"] = antibiotic
|
| 209 |
+
result["breakpoint_susceptible"] = breakpoints.get("susceptible")
|
| 210 |
+
result["breakpoint_resistant"] = breakpoints.get("resistant")
|
|
|
|
| 211 |
|
| 212 |
+
# Estimate how many more time-points until MIC reaches the susceptible breakpoint
|
| 213 |
+
if result["trend"] == "increasing" and result["velocity"] > 1.0:
|
| 214 |
+
current = result["current_mic"]
|
| 215 |
s_bp = breakpoints.get("susceptible")
|
| 216 |
if s_bp and current < s_bp:
|
|
|
|
| 217 |
doublings_needed = math.log2(s_bp / current) if current > 0 else 0
|
| 218 |
+
log_velocity = math.log(result["velocity"]) / math.log(2)
|
| 219 |
+
if log_velocity > 0:
|
| 220 |
+
result["estimated_readings_to_resistance"] = round(doublings_needed / log_velocity, 1)
|
|
|
|
|
|
|
|
|
|
| 221 |
|
| 222 |
+
return result
|
| 223 |
|
| 224 |
|
| 225 |
+
# --- Prescription formatter ---
|
|
|
|
|
|
|
| 226 |
|
| 227 |
def format_prescription_card(recommendation: Dict[str, Any]) -> str:
|
| 228 |
+
"""Format a recommendation dict as a plain-text prescription card."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 229 |
lines = []
|
| 230 |
lines.append("=" * 50)
|
| 231 |
lines.append("ANTIBIOTIC PRESCRIPTION")
|
|
|
|
| 242 |
if primary.get("aware_category"):
|
| 243 |
lines.append(f"WHO AWaRe: {primary.get('aware_category')}")
|
| 244 |
|
|
|
|
| 245 |
adjustments = recommendation.get("dose_adjustments", {})
|
| 246 |
if adjustments.get("renal") and adjustments["renal"] != "None needed":
|
| 247 |
lines.append(f"\nRENAL ADJUSTMENT: {adjustments['renal']}")
|
| 248 |
if adjustments.get("hepatic") and adjustments["hepatic"] != "None needed":
|
| 249 |
lines.append(f"HEPATIC ADJUSTMENT: {adjustments['hepatic']}")
|
| 250 |
|
|
|
|
| 251 |
alerts = recommendation.get("safety_alerts", [])
|
| 252 |
if alerts:
|
| 253 |
lines.append("\n" + "-" * 50)
|
|
|
|
| 257 |
marker = {"CRITICAL": "[!!!]", "WARNING": "[!!]", "INFO": "[i]"}.get(level, "[?]")
|
| 258 |
lines.append(f" {marker} {alert.get('message', '')}")
|
| 259 |
|
|
|
|
| 260 |
monitoring = recommendation.get("monitoring_parameters", [])
|
| 261 |
if monitoring:
|
| 262 |
lines.append("\n" + "-" * 50)
|
|
|
|
| 264 |
for param in monitoring:
|
| 265 |
lines.append(f" - {param}")
|
| 266 |
|
|
|
|
| 267 |
if recommendation.get("rationale"):
|
| 268 |
lines.append("\n" + "-" * 50)
|
| 269 |
lines.append("RATIONALE:")
|
| 270 |
lines.append(f" {recommendation['rationale']}")
|
| 271 |
|
| 272 |
lines.append("\n" + "=" * 50)
|
|
|
|
| 273 |
return "\n".join(lines)
|
| 274 |
|
| 275 |
|
| 276 |
+
# --- JSON parsing ---
|
|
|
|
|
|
|
| 277 |
|
| 278 |
def safe_json_parse(text: str) -> Optional[Dict[str, Any]]:
|
| 279 |
"""
|
| 280 |
+
Extract and parse the first JSON object from a string.
|
| 281 |
|
| 282 |
+
Handles model output that may wrap JSON in markdown code fences.
|
| 283 |
+
Returns None if no valid JSON is found.
|
| 284 |
"""
|
| 285 |
if not text:
|
| 286 |
return None
|
| 287 |
|
|
|
|
| 288 |
try:
|
| 289 |
return json.loads(text)
|
| 290 |
except json.JSONDecodeError:
|
| 291 |
pass
|
| 292 |
|
| 293 |
+
for pattern in [r"```json\s*\n?(.*?)\n?```", r"```\s*\n?(.*?)\n?```", r"\{[\s\S]*\}"]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 294 |
match = re.search(pattern, text, re.DOTALL)
|
| 295 |
if match:
|
| 296 |
try:
|
|
|
|
| 303 |
|
| 304 |
|
| 305 |
def validate_agent_output(output: Dict[str, Any], required_fields: List[str]) -> Tuple[bool, List[str]]:
    """
    Check that an agent output dict contains every required field.

    Args:
        output: Agent output dict to validate.
        required_fields: Field names that must be present as keys.

    Returns:
        Tuple of (is_valid, missing_fields); missing_fields preserves the
        order of required_fields.
    """
    missing: List[str] = []
    for field_name in required_fields:
        if field_name not in output:
            missing.append(field_name)
    return not missing, missing
|
| 309 |
|
| 310 |
|
| 311 |
+
# --- Name normalization ---
|
|
|
|
|
|
|
| 312 |
|
| 313 |
def normalize_antibiotic_name(name: str) -> str:
|
| 314 |
+
"""Map common abbreviations and brand names to standard antibiotic names."""
|
|
|
|
|
|
|
|
|
|
| 315 |
mappings = {
|
| 316 |
"amox": "amoxicillin",
|
| 317 |
"amox/clav": "amoxicillin-clavulanate",
|
|
|
|
| 334 |
"cefepime": "cefepime",
|
| 335 |
"maxipime": "cefepime",
|
| 336 |
}
|
| 337 |
+
return mappings.get(name.lower().strip(), name.lower().strip())
|
|
|
|
|
|
|
| 338 |
|
| 339 |
|
| 340 |
def normalize_organism_name(name: str) -> str:
|
| 341 |
+
"""Map common abbreviations to full organism names."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 342 |
abbreviations = {
|
| 343 |
"e. coli": "Escherichia coli",
|
| 344 |
"e.coli": "Escherichia coli",
|
|
|
|
| 353 |
"enterococcus": "Enterococcus species",
|
| 354 |
"vre": "Enterococcus (VRE)",
|
| 355 |
}
|
| 356 |
+
return abbreviations.get(name.strip().lower(), name.strip())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|