yuxbox committed
Commit a1aaf30 · verified · 1 Parent(s): dbdae02

Upload folder using huggingface_hub

.env.template ADDED
@@ -0,0 +1,3 @@
+ # Set your API keys as HF Space secrets
+ # OPENAI_API_KEY=sk-...
+ # ANTHROPIC_API_KEY=sk-ant-...
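
The template above only documents the variable names; the keys themselves are injected as HF Space secrets at runtime. A minimal sketch of how an app might choose a backend from them (`select_backend` is illustrative, not part of this repo):

```python
def select_backend(env: dict) -> str:
    """Pick a VLM backend from whichever API key is present in the
    environment; fall back to simulated mode when neither key is set.
    Key names match .env.template above."""
    if env.get("OPENAI_API_KEY"):
        return "openai"
    if env.get("ANTHROPIC_API_KEY"):
        return "anthropic"
    return "simulated"
```

In practice this would be called with `dict(os.environ)` so the demo still runs, in simulated mode, when no secret is configured.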
.gitattributes CHANGED
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ demo_cases/chest_xray_ipf.png filter=lfs diff=lfs merge=lfs -text
+ demo_cases/ct_pulmonary_pe.png filter=lfs diff=lfs merge=lfs -text
+ demo_cases/fundus_dme.png filter=lfs diff=lfs merge=lfs -text
+ demo_cases/oct_bscan_dme.png filter=lfs diff=lfs merge=lfs -text
+ demo_cases/skin_lesion_dermoscopy.png filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,12 +1,25 @@
  ---
- title: Activemedagent Demo
- emoji: 🌖
- colorFrom: gray
- colorTo: yellow
+ title: ActiveMedAgent Demo
+ emoji: 🏥
+ colorFrom: blue
+ colorTo: green
  sdk: gradio
- sdk_version: 6.11.0
+ sdk_version: 6.10.0
  app_file: app.py
  pinned: false
+ license: mit
  ---
  
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # ActiveMedAgent: Learned Information Acquisition for Medical Diagnosis
+
+ Interactive demo for the ActiveMedAgent framework. Watch the agent reason step-by-step,
+ acquire information channels strategically, and track entropy reduction in real time.
+
+ **No budget constraint** — the agent decides when to stop based on information-theoretic criteria.
+
+ ## Features
+ - Pre-built demo cases (NEJM, MIDAS, OLIVES)
+ - Custom case builder with image upload
+ - Step-by-step reasoning trace with probability bars
+ - Entropy trajectory and information gain tracking
+ - Simulated mode (no API key needed) + real VLM backends
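
The "entropy trajectory and information gain tracking" feature operates on the diagnosis probability distributions the agent reports at each step. A minimal sketch of the underlying arithmetic, assuming simple dict-based distributions (the function name `entropy` is illustrative, not this repo's API):

```python
import math

def entropy(dist: dict[str, float]) -> float:
    """Shannon entropy in bits of a diagnosis probability distribution."""
    return -sum(p * math.log2(p) for p in dist.values() if p > 0)

# Hypothetical belief before and after acquiring one channel (e.g. dermoscopy).
prior = {"Melanoma": 0.40, "BCC": 0.35, "SCC": 0.25}
posterior = {"Melanoma": 0.75, "BCC": 0.15, "SCC": 0.10}

# Information gain = bits of uncertainty removed by the acquisition.
info_gain = entropy(prior) - entropy(posterior)
```

A falling entropy trajectory with diminishing per-step gain is exactly the signal the agent uses to decide it is time to commit a diagnosis.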
agent.py ADDED
@@ -0,0 +1,1170 @@
+ """
+ Core ActiveMedAgent — Tool-Use Architecture with Adaptive Context Management.
+
+ Two modes of operation:
+
+ 1. FULL MODE (for capable models: GPT-4o, Claude, Qwen-72B):
+    Multi-turn conversation with full history. The VLM sees its own prior
+    reasoning and can build on it.
+
+ 2. CONDENSED MODE (for weaker models: GPT-4o-mini, etc.):
+    Each acquisition step is a fresh single-turn call. The VLM receives:
+      - Initial image(s)
+      - A compact structured acquisition log of all prior steps
+      - The latest channel data
+      - Available channels + tools
+    This keeps context size O(1) per step instead of O(n), preventing
+    weaker models from losing track of their own reasoning.
+
+ In both modes:
+ - There is NO fixed budget. The agent acquires as many channels as it
+   needs (0 to all available). If a case needs all 5 NEJM vignettes, it
+   gets all 5. If the image alone is sufficient, it commits immediately.
+ - The agent decides when to stop via the commit_diagnosis tool.
+ - Probability distributions from tool calls are tracked for information-
+   theoretic analysis (entropy, IG, KL divergence).
+ """
+ import json
+ import logging
+ from dataclasses import dataclass, field
+
+ from api_client import BaseVLMClient, VLMResponse
+ from datasets.base import MedicalCase, ChannelData
+ from tools import (
+     ToolCall, ToolResult, AGENT_TOOLS,
+     to_openai_tools, to_anthropic_tools,
+     constrain_tools_for_step,
+ )
+ from information_gain import (
+     BeliefState, BeliefTrajectory,
+     compute_entropy, compute_kl_divergence,
+     estimate_expected_information_gain,
+     should_commit, compute_value_of_information,
+ )
+ from prompts import format_available_channels, format_acquired_info
+ import config
+
+ logger = logging.getLogger(__name__)
+
+
+ # ============================================================
+ # Data Structures
+ # ============================================================
+
+ @dataclass
+ class AcquisitionStep:
+     """Record of a single acquisition decision made via tool call."""
+     step: int
+     tool_call: ToolCall | None
+     requested_channel: str | None  # None if agent committed
+     reasoning: str
+     differential: list[dict]  # [{name, confidence}, ...]
+     committed: bool
+     raw_response: str
+     latency_ms: float
+     entropy: float = 0.0
+     information_gain: float = 0.0
+     kl_divergence: float = 0.0
+     expected_impact: dict = field(default_factory=dict)
+
+
+ @dataclass
+ class AgentResult:
+     """Complete result of an agent's diagnostic process on one case."""
+     case_id: str
+     dataset: str
+     prompt_variant: str
+     backend: str
+     budget: int  # max channels available (not a hard limit)
+     steps: list[AcquisitionStep] = field(default_factory=list)
+     final_ranking: list[dict] = field(default_factory=list)
+     acquired_channels: list[str] = field(default_factory=list)
+     total_latency_ms: float = 0.0
+     total_input_tokens: int = 0
+     total_output_tokens: int = 0
+     committed_early: bool = False
+     final_raw_response: str = ""
+     belief_trajectory: BeliefTrajectory | None = None
+     total_case_cost: float = 0.0
+     acquisition_cost: float = 0.0
+
+
+ # ============================================================
+ # System Prompts
+ # ============================================================
+
+ SYSTEM_PROMPT_FULL = """\
+ You are a medical diagnostic agent. You may receive free clinical context, exam data, \
+ labs, and images in a tiered pathway. Your job is to determine the correct diagnosis \
+ from a set of candidates while avoiding unnecessary resource use.
+
+ You have two tools:
+
+ 1. request_information — Request one additional data channel when its expected \
+ clinical value justifies its cost. You MUST provide:
+    - channel_name: exactly one of the available channel names
+    - reasoning: why this channel best resolves your current uncertainty
+    - current_differential: your FULL ranked differential with calibrated \
+ probabilities that sum to 1.0 across ALL candidates
+    - expected_impact: what you expect (if_positive and if_negative)
+
+ 2. commit_diagnosis — Submit your final ranked diagnosis. Provide:
+    - ranked_diagnoses: ALL candidates with calibrated probabilities summing to 1.0, \
+ each with key_evidence
+    - reasoning: your complete diagnostic reasoning chain
+
+ Strategy:
+ - Start with whatever information is already available for free at presentation.
+ - Escalate only when the currently available information cannot safely distinguish the top diagnoses.
+ - If demographics, chief complaint, history, exam, or existing evidence are sufficient, commit without requesting imaging.
+ - Commit when your top diagnosis has high probability and is well-separated from \
+ alternatives, OR when no remaining channel would meaningfully change your differential.
+ - Your probability estimates MUST sum to 1.0 and reflect genuine calibrated uncertainty."""
+
+ SYSTEM_PROMPT_CONDENSED = """\
+ You are a medical diagnostic agent. Examine all currently available clinical \
+ information below, then decide your next action.
+
+ You have two tools:
+
+ 1. request_information — Request one more data channel only if its expected value \
+ justifies its cost. Provide:
+    - channel_name: one of the available channels listed below
+    - reasoning: why this channel would help
+    - current_differential: ranked diagnoses with probabilities summing to 1.0
+    - expected_impact: if_positive and if_negative predictions
+
+ 2. commit_diagnosis — Submit final diagnosis. Provide:
+    - ranked_diagnoses: ALL candidates with probabilities summing to 1.0 and key_evidence
+    - reasoning: complete reasoning chain
+
+ Decide: if remaining channels would meaningfully change your differential enough to \
+ justify their cost, request the best one. Otherwise, commit your diagnosis."""
+
+ SYSTEM_PROMPT_FINAL = """\
+ You are a medical diagnostic agent. You have gathered information. Now provide \
+ your final ranked diagnosis using the commit_diagnosis tool.
+
+ You MUST:
+ - Include ALL candidate diagnoses in ranked_diagnoses
+ - Assign calibrated probabilities summing to 1.0
+ - Provide specific key_evidence for EACH diagnosis
+ - Write a thorough reasoning chain synthesizing all acquired evidence
+ - Favor the least resource-intensive pathway that still supports the diagnosis"""
+
+
+ # ============================================================
+ # Context Mode Detection
+ # ============================================================
+
+ def _should_use_condensed(model_name: str) -> bool:
+     """Determine if a model should use condensed context mode."""
+     if config.CONTEXT_MODE == "full":
+         return False
+     if config.CONTEXT_MODE == "condensed":
+         return True
+     # adaptive — check model name
+     for pattern in config.CONDENSED_MODELS:
+         if pattern in model_name:
+             return True
+     return False
+
+
+ # ============================================================
+ # Condensed Acquisition Log Builder
+ # ============================================================
+
+ def _build_acquisition_log(
+     steps: list[AcquisitionStep],
+     acquired_data: dict[str, str],
+ ) -> str:
+     """
+     Build a compact structured summary of all prior acquisition steps.
+
+     This replaces the full multi-turn conversation for condensed mode.
+     Each step is ~50-80 tokens instead of 300-500 for the full tool call
+     response, keeping context manageable for weaker models.
+
+     Example output:
+         === ACQUISITION LOG (2 channels acquired) ===
+         Step 1: Acquired [dermoscopy]
+          Reasoning: Need subsurface structures to distinguish melanoma from BCC
+          Data received: [dermoscopy]: (image — see above)
+          Updated differential: Melanoma(0.55), BCC(0.30), SCC(0.15)
+          Entropy: 1.37 bits | Information gain: 0.19 bits
+
+         Step 2: Acquired [patient_demographics]
+          Reasoning: Age and skin type are critical for melanoma risk
+          Data received: [demographics]: 34M, Fitzpatrick II
+          Updated differential: Melanoma(0.75), BCC(0.15), SCC(0.10)
+          Entropy: 1.06 bits | Information gain: 0.31 bits
+         === END LOG ===
+     """
+     if not steps:
+         return "(No information acquired yet.)"
+
+     lines = [f"=== ACQUISITION LOG ({len(steps)} channel(s) acquired) ==="]
+     for step in steps:
+         if step.committed:
+             continue
+         ch = step.requested_channel or "unknown"
+         lines.append(f"Step {step.step + 1}: Acquired [{ch}]")
+         if step.reasoning:
+             # Truncate reasoning to key point
+             reasoning = step.reasoning
+             if len(reasoning) > 200:
+                 reasoning = reasoning[:197] + "..."
+             lines.append(f" Reasoning: {reasoning}")
+
+         # Include the actual data received
+         data = acquired_data.get(ch, "")
+         if data:
+             if len(data) > 300:
+                 data = data[:297] + "..."
+             lines.append(f" Data received: {data}")
+
+         # Compact differential
+         if step.differential:
+             diff_str = ", ".join(
+                 f"{d['name']}({d['confidence']:.2f})"
+                 for d in step.differential[:5]
+             )
+             lines.append(f" Updated differential: {diff_str}")
+
+         lines.append(
+             f" Entropy: {step.entropy:.2f} bits"
+             + (f" | IG: {step.information_gain:.2f} bits" if step.information_gain else "")
+         )
+         lines.append("")
+
+     lines.append("=== END LOG ===")
+     return "\n".join(lines)
+
+
+ # ============================================================
+ # Main Agent Class
+ # ============================================================
+
+ class ActiveMedAgent:
+     """
+     Tool-use active acquisition agent with adaptive context management.
+
+     No fixed budget — the agent acquires as many or as few channels as it
+     needs. Supports two context modes:
+       - full: multi-turn conversation (capable models)
+       - condensed: single-turn with compressed state (weaker models)
+     """
+
+     def __init__(
+         self,
+         client: BaseVLMClient,
+         prompt_variant: str = "A",
+         budget: int | None = None,
+         context_mode: str | None = None,
+     ):
+         """
+         Args:
+             client: VLM API client.
+             prompt_variant: Prompt variant ID (for tracking, not used in tool mode).
+             budget: Max acquisitions. None = unlimited (use all channels if needed).
+             context_mode: "full", "condensed", or None (auto-detect from model).
+         """
+         self.client = client
+         self.prompt_variant = prompt_variant
+         self.budget = budget  # None means unlimited
+         self._commit_hint = ""
+         if context_mode is not None:
+             self.condensed = (context_mode == "condensed")
+         else:
+             self.condensed = _should_use_condensed(client.model)
+
+         if self.condensed:
+             logger.info(
+                 f"Using CONDENSED context mode for {client.model} "
+                 f"(single-turn with compressed state)"
+             )
+
+     # ============================================================
+     # Main Acquisition Loop
+     # ============================================================
+
+     def diagnose(self, case: MedicalCase) -> AgentResult:
+         """
+         Run the full tool-use acquisition loop.
+
+         The agent has NO fixed budget. It keeps requesting channels until:
+           1. It calls commit_diagnosis (confident enough), or
+           2. All available channels are exhausted, or
+           3. The safety limit is hit (max_steps = number of requestable channels)
+
+         Context mode determines how conversation history is managed:
+           - full: growing multi-turn conversation
+           - condensed: fresh single-turn call each step with compressed state
+         """
+         max_steps = len(case.requestable_names)
+         if self.budget is not None:
+             max_steps = min(max_steps, self.budget)
+
+         result = AgentResult(
+             case_id=case.case_id,
+             dataset=case.dataset,
+             prompt_variant=self.prompt_variant,
+             backend=self.client.model,
+             budget=max_steps,
+         )
+
+         acquired = []
+         acquired_data = {}  # channel_name -> data string (for condensed log)
+         dataset_channel_config = config.CHANNEL_CONFIGS.get(case.dataset, {})
+         channel_config = {
+             name: info for name, info in dataset_channel_config.items()
+             if name in case.initial_channels or name in case.requestable_channels
+         }
+         conversation = []  # only used in full mode
+         trajectory = BeliefTrajectory(case_id=case.case_id)
+         prev_distribution = None
+         initial_images = case.get_initial_images()
+         initial_context_str = format_acquired_info(case.get_text_context([]))
+
+         # ---- Build initial message (shared by both modes) ----
+         available_str = format_available_channels(channel_config, acquired)
+         candidates_str = "\n".join(
+             f"  {i + 1}. {c}" for i, c in enumerate(case.candidates)
+         )
+
+         if not self.condensed:
+             # FULL MODE: build initial user message for multi-turn
+             initial_content = self._build_image_content(initial_images)
+             initial_text = (
+                 f"Review the currently available clinical information and determine the diagnosis.\n\n"
+                 f"Information already available at presentation:\n{initial_context_str}\n\n"
+                 f"Candidate diagnoses (you must rank ALL of these):\n"
+                 f"{candidates_str}\n\n"
+                 f"Prefer the least costly pathway that still supports a safe diagnosis.\n"
+                 f"If the current information is sufficient, commit immediately without requesting more.\n"
+                 f"You can request as many additional channels as you need "
+                 f"(or none if already confident).\n"
+                 f"Available information channels:\n{available_str}\n\n"
+                 f"Use request_information to acquire the most informative "
+                 f"channel for the cost, or commit_diagnosis if already confident."
+             )
+             initial_content.append({"type": "text", "text": initial_text})
+             conversation.append({"role": "user", "content": initial_content})
+
+         # ---- Acquisition Loop ----
+         for step_idx in range(max_steps):
+             available = [
+                 n for n in case.requestable_names if n not in acquired
+             ]
+             if not available:
+                 logger.debug(
+                     f"[{case.case_id}] All channels exhausted at step {step_idx}"
+                 )
+                 break
+
+             # Force acquisition on first step; allow commit after that
+             step_tools = constrain_tools_for_step(
+                 budget_remaining=max_steps - step_idx,
+                 allow_commit=(step_idx > 0),
+             )
+
+             if self.condensed:
+                 # CONDENSED MODE: build a fresh single-turn call each step
+                 response = self._call_condensed(
+                     case=case,
+                     initial_images=initial_images,
+                     acquired=acquired,
+                     acquired_data=acquired_data,
+                     steps=result.steps,
+                     available=available,
+                     candidates_str=candidates_str,
+                     channel_config=channel_config,
+                     step_tools=step_tools,
+                 )
+             else:
+                 # FULL MODE: multi-turn call with complete history
+                 response = self.client.call_with_retry(
+                     system_prompt=SYSTEM_PROMPT_FULL,
+                     messages=conversation,
+                     temperature=config.TEMPERATURE,
+                     max_tokens=config.MAX_TOKENS,
+                     tools=step_tools,
+                 )
+
+             result.total_latency_ms += response.latency_ms
+             result.total_input_tokens += response.input_tokens
+             result.total_output_tokens += response.output_tokens
+
+             # ---- Process tool call ----
+             tool_call = response.tool_call
+
+             if tool_call is None:
+                 # No tool call — fallback
+                 logger.warning(
+                     f"[{case.case_id}] Step {step_idx}: no tool call returned"
+                 )
+                 if not available:
+                     break
+                 fallback = available[0]
+                 acquired.append(fallback)
+                 result.acquired_channels.append(fallback)
+
+                 ch = case.get_channel(fallback)
+                 if ch and ch.channel_type == "text":
+                     acquired_data[fallback] = f"[{fallback}]: {ch.value}"
+                 else:
+                     acquired_data[fallback] = f"[{fallback}]: (image)"
+
+                 step = AcquisitionStep(
+                     step=step_idx, tool_call=None,
+                     requested_channel=fallback,
+                     reasoning="(fallback — no tool call produced)",
+                     differential=[], committed=False,
+                     raw_response=response.text,
+                     latency_ms=response.latency_ms,
+                 )
+                 result.steps.append(step)
+
+                 if not self.condensed:
+                     self._deliver_channel_data_as_user_message(
+                         case, fallback, conversation, available, acquired,
+                         channel_config,
+                     )
+                 continue
+
+             # Add assistant message to conversation (full mode only)
+             if not self.condensed:
+                 conversation.append({
+                     "role": "assistant",
+                     "content": response.text,
+                     "tool_calls": [tool_call],
+                 })
+
+             # ---- Handle commit_diagnosis ----
+             if tool_call.tool_name == "commit_diagnosis":
+                 args = tool_call.arguments
+                 ranking = self._extract_ranking_from_commit(args)
+                 distribution = {d["name"]: d["confidence"] for d in ranking}
+
+                 belief = BeliefState(
+                     step=step_idx,
+                     distribution=distribution,
+                     channel_acquired=None,
+                 )
+                 trajectory.states.append(belief)
+
+                 ig = 0.0
+                 kl = 0.0
+                 if prev_distribution is not None:
+                     ig = compute_entropy(prev_distribution) - compute_entropy(distribution)
+                     kl = compute_kl_divergence(distribution, prev_distribution)
+
+                 step = AcquisitionStep(
+                     step=step_idx, tool_call=tool_call,
+                     requested_channel=None,
+                     reasoning=args.get("reasoning", ""),
+                     differential=ranking, committed=True,
+                     raw_response=response.text,
+                     latency_ms=response.latency_ms,
+                     entropy=belief.entropy,
+                     information_gain=ig, kl_divergence=kl,
+                 )
+                 result.steps.append(step)
+                 result.committed_early = True
+                 result.final_ranking = ranking
+                 logger.debug(
+                     f"[{case.case_id}] Committed at step {step_idx} "
+                     f"after acquiring {len(acquired)} channels "
+                     f"(entropy={belief.entropy:.3f} bits)"
+                 )
+                 break
+
+             # ---- Handle request_information ----
+             elif tool_call.tool_name == "request_information":
+                 args = tool_call.arguments
+                 requested = args.get("channel_name", "")
+                 differential = args.get("current_differential", [])
+                 expected_impact = args.get("expected_impact", {})
+                 reasoning = args.get("reasoning", "")
+
+                 matched = self._match_channel(requested, available)
+                 if matched is None:
+                     matched = available[0]
+                     logger.warning(
+                         f"[{case.case_id}] Step {step_idx}: '{requested}' "
+                         f"not in {available}, falling back to '{matched}'"
+                     )
+
+                 # Build distribution from tool call
+                 distribution = {}
+                 for d in differential:
+                     distribution[d.get("name", "")] = d.get("probability", 0.0)
+
+                 # Information-theoretic metrics
+                 ig = 0.0
+                 kl = 0.0
+                 if prev_distribution is not None:
+                     ig = compute_entropy(prev_distribution) - compute_entropy(distribution)
+                     kl = compute_kl_divergence(distribution, prev_distribution)
+
+                 belief = BeliefState(
+                     step=step_idx,
+                     distribution=distribution,
+                     channel_acquired=matched,
+                 )
+                 trajectory.states.append(belief)
+
+                 eig = estimate_expected_information_gain(
+                     distribution, matched, expected_impact, case.candidates,
+                 )
+
+                 logger.debug(
+                     f"[{case.case_id}] Step {step_idx}: requesting '{matched}' "
+                     f"(H={belief.entropy:.3f}, IG={ig:.3f}, EIG={eig:.3f})"
+                 )
+
+                 step = AcquisitionStep(
+                     step=step_idx, tool_call=tool_call,
+                     requested_channel=matched, reasoning=reasoning,
+                     differential=[
+                         {"name": d.get("name", ""),
+                          "confidence": d.get("probability", 0.0),
+                          "rank": i + 1}
+                         for i, d in enumerate(differential)
+                     ],
+                     committed=False,
+                     raw_response=response.text,
+                     latency_ms=response.latency_ms,
+                     entropy=belief.entropy,
+                     information_gain=ig, kl_divergence=kl,
+                     expected_impact=expected_impact,
+                 )
+                 result.steps.append(step)
+                 prev_distribution = distribution
+
+                 acquired.append(matched)
+                 result.acquired_channels.append(matched)
+
+                 # Store acquired data for condensed log
+                 ch = case.get_channel(matched)
+                 if ch and ch.channel_type == "text":
+                     acquired_data[matched] = f"[{matched}]: {ch.value}"
+                 elif ch and ch.channel_type == "image":
+                     acquired_data[matched] = f"[{matched}]: (image provided)"
+                 else:
+                     acquired_data[matched] = f"[{matched}]: No data available."
+
+                 # ---- Check stopping criterion ----
+                 # After recording the new belief state, evaluate whether
+                 # the agent should stop acquiring. This is a principled
+                 # information-theoretic check, not just a prompt heuristic.
+                 remaining_channels = [
+                     n for n in case.requestable_names if n not in acquired
+                 ]
+                 commit_recommended, commit_reason = should_commit(
+                     trajectory=trajectory,
+                     available_channels=remaining_channels,
+                     min_steps=0,  # agent decides — no forced minimum
+                 )
+                 voi = compute_value_of_information(
+                     trajectory, len(remaining_channels),
+                 )
+
+                 if commit_recommended and remaining_channels:
+                     logger.info(
+                         f"[{case.case_id}] Stopping criterion triggered at "
+                         f"step {step_idx}: {commit_reason} (VoI={voi:.3f})"
+                     )
+                     # Don't break yet — let the VLM make the decision on
+                     # the next iteration. But inject a hint into the
+                     # follow-up context.
+                     self._commit_hint = (
+                         f"\n\nNote: Based on your belief trajectory, additional "
+                         f"acquisition has low expected value (VoI={voi:.2f}). "
+                         f"The last channel provided only {ig:.3f} bits of "
+                         f"information gain. Consider committing your diagnosis."
+                     )
+                 else:
+                     self._commit_hint = ""
+
+                 # Deliver tool result (full mode only — condensed rebuilds
+                 # the full state each call)
+                 if not self.condensed:
+                     self._deliver_tool_result(
+                         case=case, channel_name=matched,
+                         tool_call=tool_call,
+                         conversation=conversation,
+                         acquired=acquired,
+                         channel_config=channel_config,
+                     )
+
+         # ---- Final Diagnosis ----
+         if not result.committed_early or not result.final_ranking:
+             if self.condensed:
+                 final_ranking, final_response, final_belief = (
+                     self._get_final_diagnosis_condensed(
+                         case, acquired, acquired_data, result.steps,
+                     )
+                 )
+             else:
+                 final_ranking, final_response, final_belief = (
+                     self._get_final_diagnosis_tooluse(
+                         case, acquired, conversation,
+                     )
+                 )
+             result.final_ranking = final_ranking
+             result.final_raw_response = final_response.text
+             result.total_latency_ms += final_response.latency_ms
+             result.total_input_tokens += final_response.input_tokens
+             result.total_output_tokens += final_response.output_tokens
+             if final_belief:
+                 trajectory.states.append(final_belief)
+
+         result.acquired_channels = acquired
+         result.belief_trajectory = trajectory
+         result.acquisition_cost = case.get_acquisition_cost(acquired)
+         result.total_case_cost = case.get_total_cost(acquired)
+         return result
+
+     # ============================================================
+     # Condensed Mode: Single-Turn Call Builder
+     # ============================================================
+
+     def _call_condensed(
+         self,
+         case: MedicalCase,
+         initial_images: list[str],
+         acquired: list[str],
+         acquired_data: dict[str, str],
+         steps: list[AcquisitionStep],
+         available: list[str],
+         candidates_str: str,
+         channel_config: dict,
+         step_tools: list[dict],
+     ) -> VLMResponse:
+         """
+         Build and execute a single-turn call for condensed mode.
+
+         Each call gets a complete, self-contained context:
+           1. Initial image(s)
+           2. Any acquired images
+           3. Structured acquisition log (compact summary of all prior steps)
+           4. All acquired text data
+           5. Available channels
+           6. Tools
+
+         This keeps context size predictable and prevents weaker models
+         from losing track of their reasoning in long multi-turn histories.
+         """
+         content = []
+
+         # 1. Initial image(s) — always included
+         content.extend(self._build_image_content(initial_images))
+
+         # 2. Acquired images — include all visual channels
+         for ch_name in acquired:
+             ch = case.get_channel(ch_name)
+             if ch and ch.channel_type == "image" and ch.value:
+                 if isinstance(ch.value, list):
+                     for img_b64 in ch.value:
+                         content.append({
+                             "type": "image_url",
+                             "image_url": {
+                                 "url": f"data:image/jpeg;base64,{img_b64}",
+                             },
+                         })
+                 else:
+                     content.append({
+                         "type": "image_url",
+                         "image_url": {
+                             "url": f"data:image/jpeg;base64,{ch.value}",
+                         },
+                     })
+
+         # 3. Build the text prompt
+         available_str = format_available_channels(channel_config, acquired)
+         log_str = _build_acquisition_log(steps, acquired_data)
+
+         # 4. Collect all currently available context (initial + acquired)
+         current_context = format_acquired_info(case.get_text_context(acquired))
+
+         prompt = (
+             f"Review all currently available clinical information below.\n\n"
+             f"Candidate diagnoses (rank ALL):\n{candidates_str}\n\n"
+             f"Current available evidence:\n{current_context}\n\n"
+         )
+
+         if steps:
+             prompt += (
+                 f"Your prior acquisition history:\n{log_str}\n\n"
+             )
+
+         commit_hint = getattr(self, '_commit_hint', '')
+
+         if available:
+             prompt += (
+                 f"Remaining channels you can request:\n{available_str}\n\n"
+                 f"Decide: Would any remaining channel meaningfully change your "
+                 f"differential enough to justify its cost? If yes, use "
+                 f"request_information. If no, use commit_diagnosis with your final ranking."
+                 f"{commit_hint}"
+             )
+         else:
+             prompt += (
+                 f"All channels have been acquired. Use commit_diagnosis to "
+                 f"submit your final ranked diagnosis."
+             )
+
+         content.append({"type": "text", "text": prompt})
+
+         return self.client.call_with_retry(
+             system_prompt=SYSTEM_PROMPT_CONDENSED,
+             user_text=None,
+             images=None,
+             temperature=config.TEMPERATURE,
+             max_tokens=config.MAX_TOKENS,
+             tools=step_tools,
+             messages=[{"role": "user", "content": content}],
+         )
+
730
+    # ============================================================
+    # Full Mode: Tool Result Delivery
+    # ============================================================
+
+    def _deliver_tool_result(
+        self,
+        case: MedicalCase,
+        channel_name: str,
+        tool_call: ToolCall,
+        conversation: list[dict],
+        acquired: list[str],
+        channel_config: dict,
+    ):
+        """Deliver requested channel data as a tool_result message (full mode)."""
+        ch = case.get_channel(channel_name)
+
+        result_images = []
+        if ch and ch.channel_type == "image" and ch.value:
+            if isinstance(ch.value, list):
+                result_images.extend(ch.value)
+            else:
+                result_images.append(ch.value)
+
+        if ch and ch.channel_type == "text":
+            data_str = f"[{channel_name}]: {ch.value}"
+        elif ch and ch.channel_type == "image":
+            data_str = f"[{channel_name}]: (image provided — see attached)"
+        else:
+            data_str = f"[{channel_name}]: No data available for this channel."
+
+        available_after = [
+            n for n in case.requestable_names if n not in acquired
+        ]
+        available_after_str = format_available_channels(channel_config, acquired)
+
+        # Include commit hint from stopping criterion (if triggered)
+        commit_hint = getattr(self, '_commit_hint', '')
+
+        if available_after:
+            follow_up = (
+                f"Here is the information you requested:\n{data_str}\n\n"
+                f"Integrate this evidence with your prior observations.\n\n"
+                f"Remaining channels you can request:\n{available_after_str}\n\n"
+                f"Use request_information if another channel would meaningfully "
+                f"change your differential enough to justify its cost, or "
+                f"commit_diagnosis if confident."
+                f"{commit_hint}"
+            )
+        else:
+            follow_up = (
+                f"Here is the information you requested:\n{data_str}\n\n"
+                f"All channels have been acquired. Use commit_diagnosis to "
+                f"submit your final ranked diagnosis."
+            )
+
+        conversation.append({
+            "role": "tool_result",
+            "tool_call_id": tool_call.call_id,
+            "content": data_str,
+            "images": result_images,
+            "follow_up": follow_up,
+        })
+
+    def _deliver_channel_data_as_user_message(
+        self,
+        case: MedicalCase,
+        channel_name: str,
+        conversation: list[dict],
+        available_before: list[str],
+        acquired: list[str],
+        channel_config: dict,
+    ):
+        """Deliver channel data as a plain user message (fallback, full mode)."""
+        ch = case.get_channel(channel_name)
+        content = []
+
+        if ch and ch.channel_type == "image" and ch.value:
+            if isinstance(ch.value, list):
+                for img_b64 in ch.value:
+                    content.append({
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/jpeg;base64,{img_b64}"},
+                    })
+            else:
+                content.append({
+                    "type": "image_url",
+                    "image_url": {"url": f"data:image/jpeg;base64,{ch.value}"},
+                })
+
+        if ch and ch.channel_type == "text":
+            data_str = f"[{channel_name}]: {ch.value}"
+        elif ch and ch.channel_type == "image":
+            data_str = f"[{channel_name}]: (image provided above)"
+        else:
+            data_str = f"[{channel_name}]: No data available."
+
+        available_after = [n for n in case.requestable_names if n not in acquired]
+        available_after_str = format_available_channels(channel_config, acquired)
+
+        if available_after:
+            text = (
+                f"Data received:\n{data_str}\n\n"
+                f"Remaining channels:\n{available_after_str}\n\n"
+                f"Use request_information only if another channel is worth its cost, or commit_diagnosis."
+            )
+        else:
+            text = (
+                f"Data received:\n{data_str}\n\n"
+                f"All channels acquired. Use commit_diagnosis."
+            )
+
+        content.append({"type": "text", "text": text})
+        conversation.append({"role": "user", "content": content})
+
+    # ============================================================
+    # Final Diagnosis
+    # ============================================================
+
+    def _get_final_diagnosis_tooluse(
+        self,
+        case: MedicalCase,
+        acquired: list[str],
+        conversation: list[dict],
+    ) -> tuple[list[dict], VLMResponse, BeliefState | None]:
+        """Get final diagnosis via tool call (full mode)."""
+        text_context = case.get_text_context(acquired)
+        acquired_str = format_acquired_info(text_context)
+        candidates_str = "\n".join(
+            f" {i + 1}. {c}" for i, c in enumerate(case.candidates)
+        )
+
+        final_prompt = (
+            f"All information has been gathered. Submit your final diagnosis.\n\n"
+            f"Information acquired:\n{acquired_str}\n\n"
+            f"Candidate diagnoses (rank ALL):\n{candidates_str}\n\n"
+            f"Use commit_diagnosis with calibrated probabilities summing to 1.0 "
+            f"and key_evidence for each diagnosis. Favor the least resource-intensive "
+            f"pathway supported by the evidence."
+        )
+        conversation.append({"role": "user", "content": final_prompt})
+
+        commit_tools = constrain_tools_for_step(budget_remaining=0)
+
+        response = self.client.call_with_retry(
+            system_prompt=SYSTEM_PROMPT_FINAL,
+            messages=conversation,
+            temperature=config.TEMPERATURE,
+            max_tokens=config.MAX_TOKENS,
+            tools=commit_tools,
+        )
+
+        return self._parse_final_response(response, case, acquired)
+
+    def _get_final_diagnosis_condensed(
+        self,
+        case: MedicalCase,
+        acquired: list[str],
+        acquired_data: dict[str, str],
+        steps: list[AcquisitionStep],
+    ) -> tuple[list[dict], VLMResponse, BeliefState | None]:
+        """Get final diagnosis via single-turn call (condensed mode)."""
+        content = []
+
+        # Include all images
+        content.extend(self._build_image_content(case.get_initial_images()))
+        for ch_name in acquired:
+            ch = case.get_channel(ch_name)
+            if ch and ch.channel_type == "image" and ch.value:
+                if isinstance(ch.value, list):
+                    for img_b64 in ch.value:
+                        content.append({
+                            "type": "image_url",
+                            "image_url": {"url": f"data:image/jpeg;base64,{img_b64}"},
+                        })
+                else:
+                    content.append({
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/jpeg;base64,{ch.value}"},
+                    })
+
+        # Build text
+        candidates_str = "\n".join(
+            f" {i + 1}. {c}" for i, c in enumerate(case.candidates)
+        )
+        log_str = _build_acquisition_log(steps, acquired_data)
+        current_context = format_acquired_info(case.get_text_context(acquired))
+
+        prompt = (
+            f"Submit your final diagnosis based on all gathered information.\n\n"
+            f"Candidate diagnoses (rank ALL):\n{candidates_str}\n\n"
+            f"Acquisition history:\n{log_str}\n\n"
+            f"All currently available evidence:\n{current_context}\n\n"
+            f"Use commit_diagnosis with calibrated probabilities summing to 1.0 "
+            f"and key_evidence for each diagnosis. Favor the least resource-intensive "
+            f"pathway supported by the evidence."
+        )
+        content.append({"type": "text", "text": prompt})
+
+        commit_tools = constrain_tools_for_step(budget_remaining=0)
+
+        response = self.client.call_with_retry(
+            system_prompt=SYSTEM_PROMPT_FINAL,
+            messages=[{"role": "user", "content": content}],
+            temperature=config.TEMPERATURE,
+            max_tokens=config.MAX_TOKENS,
+            tools=commit_tools,
+        )
+
+        return self._parse_final_response(response, case, acquired)
+
+    def _parse_final_response(
+        self,
+        response: VLMResponse,
+        case: MedicalCase,
+        acquired: list[str],
+    ) -> tuple[list[dict], VLMResponse, BeliefState | None]:
+        """Parse the final diagnosis response (shared by both modes)."""
+        tool_call = response.tool_call
+        if tool_call and tool_call.tool_name == "commit_diagnosis":
+            ranking = self._extract_ranking_from_commit(tool_call.arguments)
+            distribution = {d["name"]: d["confidence"] for d in ranking}
+            belief = BeliefState(
+                step=len(acquired),
+                distribution=distribution,
+                channel_acquired=None,
+            )
+            return ranking, response, belief
+
+        logger.warning(
+            f"[{case.case_id}] Final diagnosis: no tool call, "
+            f"falling back to text extraction"
+        )
+        ranking = self._extract_ranking_from_text(response.text, case.candidates)
+        return ranking, response, None
+
+    # ============================================================
+    # Baseline Conditions
+    # ============================================================
+
+    def get_diagnosis_at_state(
+        self, case: MedicalCase, acquired: list[str]
+    ) -> tuple[list[dict], VLMResponse]:
+        """
+        Public helper: get a diagnosis given a set of acquired channels.
+
+        Used by TrajectoryCollector to evaluate intermediate states.
+        Returns (ranking, response).
+        """
+        return self._get_final_diagnosis_single(case, acquired)
+
+    def diagnose_passive(self, case: MedicalCase) -> AgentResult:
+        """Passive baseline: initial available context only, no acquisition."""
+        result = AgentResult(
+            case_id=case.case_id, dataset=case.dataset,
+            prompt_variant=self.prompt_variant,
+            backend=self.client.model, budget=0,
+        )
+        final_ranking, final_response = self._get_final_diagnosis_single(
+            case, acquired=[],
+        )
+        result.final_ranking = final_ranking
+        result.final_raw_response = final_response.text
+        result.total_latency_ms = final_response.latency_ms
+        result.total_input_tokens = final_response.input_tokens
+        result.total_output_tokens = final_response.output_tokens
+        result.total_case_cost = case.get_total_cost([])
+        return result
+
+    def diagnose_oracle(self, case: MedicalCase) -> AgentResult:
+        """Oracle baseline: ALL information given upfront."""
+        all_channels = list(case.requestable_channels.keys())
+        result = AgentResult(
+            case_id=case.case_id, dataset=case.dataset,
+            prompt_variant=self.prompt_variant,
+            backend=self.client.model,
+            budget=len(all_channels),
+            acquired_channels=all_channels,
+        )
+        final_ranking, final_response = self._get_final_diagnosis_single(
+            case, acquired=all_channels,
+        )
+        result.final_ranking = final_ranking
+        result.final_raw_response = final_response.text
+        result.total_latency_ms = final_response.latency_ms
+        result.total_input_tokens = final_response.input_tokens
+        result.total_output_tokens = final_response.output_tokens
+        result.acquisition_cost = case.get_acquisition_cost(all_channels)
+        result.total_case_cost = case.get_total_cost(all_channels)
+        return result
+
+    def diagnose_fixed_order(
+        self, case: MedicalCase, order: list[str] | None = None
+    ) -> AgentResult:
+        """Fixed-order baseline: acquire channels in predetermined order."""
+        if order is None:
+            order = list(case.requestable_channels.keys())
+        max_acq = self.budget if self.budget is not None else len(order)
+        acquired = order[:max_acq]
+        result = AgentResult(
+            case_id=case.case_id, dataset=case.dataset,
+            prompt_variant=self.prompt_variant,
+            backend=self.client.model,
+            budget=max_acq,
+            acquired_channels=acquired,
+        )
+        final_ranking, final_response = self._get_final_diagnosis_single(
+            case, acquired=acquired,
+        )
+        result.final_ranking = final_ranking
+        result.final_raw_response = final_response.text
+        result.total_latency_ms = final_response.latency_ms
+        result.total_input_tokens = final_response.input_tokens
+        result.total_output_tokens = final_response.output_tokens
+        result.acquisition_cost = case.get_acquisition_cost(acquired)
+        result.total_case_cost = case.get_total_cost(acquired)
+        return result
+
+    def _get_final_diagnosis_single(
+        self, case: MedicalCase, acquired: list[str]
+    ) -> tuple[list[dict], VLMResponse]:
+        """Single-turn final diagnosis (for baselines)."""
+        images = case.get_all_images_up_to(acquired)
+        text_context = case.get_text_context(acquired)
+        acquired_str = format_acquired_info(text_context)
+        candidates_str = "\n".join(
+            f" {i + 1}. {c}" for i, c in enumerate(case.candidates)
+        )
+
+        user_text = (
+            f"Provide your diagnosis using the currently available clinical information.\n\n"
+            f"Available information:\n{acquired_str}\n\n"
+            f"Candidate diagnoses (rank ALL):\n{candidates_str}\n\n"
+            f"Use commit_diagnosis with calibrated probabilities summing "
+            f"to 1.0 and key_evidence for each diagnosis. Prefer the least costly "
+            f"explanation supported by the evidence."
+        )
+
+        commit_tools = constrain_tools_for_step(budget_remaining=0)
+
+        response = self.client.call_with_retry(
+            system_prompt=SYSTEM_PROMPT_FINAL,
+            user_text=user_text,
+            images=images,
+            temperature=config.TEMPERATURE,
+            max_tokens=config.MAX_TOKENS,
+            tools=commit_tools,
+        )
+
+        tool_call = response.tool_call
+        if tool_call and tool_call.tool_name == "commit_diagnosis":
+            ranking = self._extract_ranking_from_commit(tool_call.arguments)
+            return ranking, response
+
+        ranking = self._extract_ranking_from_text(response.text, case.candidates)
+        return ranking, response
+
+    # ============================================================
+    # Helpers
+    # ============================================================
+
+    def _build_image_content(self, images: list[str]) -> list[dict]:
+        """Build image content blocks for API messages."""
+        content = []
+        for img_b64 in images:
+            content.append({
+                "type": "image_url",
+                "image_url": {
+                    "url": f"data:image/jpeg;base64,{img_b64}",
+                    "detail": "high",
+                },
+            })
+        return content
+
+    def _extract_ranking_from_commit(self, args: dict) -> list[dict]:
+        """Extract ranking from commit_diagnosis tool call (structured JSON)."""
+        ranked = args.get("ranked_diagnoses", [])
+        ranking = []
+        for i, entry in enumerate(ranked):
+            ranking.append({
+                "name": entry.get("name", ""),
+                "confidence": entry.get("confidence", 0.0),
+                "rank": i + 1,
+                "key_evidence": entry.get("key_evidence", ""),
+            })
+        ranking.sort(key=lambda x: x["confidence"], reverse=True)
+        for i, entry in enumerate(ranking):
+            entry["rank"] = i + 1
+        return ranking
+
+    def _extract_ranking_from_text(
+        self, text: str, candidates: list[str]
+    ) -> list[dict]:
+        """Last-resort fallback: extract ranking from free text."""
+        import re
+        ranking = []
+        pattern = (
+            r"(\d+)\.\s*(.+?)\s*"
+            r"\((?:confidence|probability|prob|conf):\s*([\d.]+)\)"
+        )
+        matches = re.findall(pattern, text, re.IGNORECASE)
+        if matches:
+            for rank_str, name, conf_str in matches:
+                try:
+                    ranking.append({
+                        "name": name.strip(),
+                        "confidence": float(conf_str),
+                        "rank": int(rank_str),
+                    })
+                except ValueError:
+                    continue
+        if not ranking and candidates:
+            for i, candidate in enumerate(candidates):
+                if candidate.lower() in text.lower():
+                    ranking.append({
+                        "name": candidate,
+                        "confidence": max(0.1, 1.0 - i * 0.2),
+                        "rank": len(ranking) + 1,
+                    })
+        ranking.sort(key=lambda x: x.get("confidence", 0), reverse=True)
+        for i, entry in enumerate(ranking):
+            entry["rank"] = i + 1
+        return ranking
+
+    def _match_channel(
+        self, requested: str, available: list[str]
+    ) -> str | None:
+        """Match requested channel name to available channels."""
+        requested = requested.lower().strip().replace(" ", "_")
+        if requested in available:
+            return requested
+        for ch in available:
+            if requested in ch or ch in requested:
+                return ch
+        req_words = set(requested.split("_"))
+        best_match, best_overlap = None, 0
+        for ch in available:
+            overlap = len(req_words & set(ch.split("_")))
+            if overlap > best_overlap:
+                best_overlap = overlap
+                best_match = ch
+        return best_match if best_overlap > 0 else None
api_client.py ADDED
@@ -0,0 +1,707 @@
+"""
+Unified multi-backend VLM API client with tool-use support.
+
+Supports OpenAI (GPT-4o), Anthropic (Claude), and Together (Qwen2.5-VL).
+Handles image encoding, rate limiting, retries, response normalization,
+and native function/tool calling across all backends.
+"""
+import base64
+import io
+import json
+import time
+import logging
+from collections import deque
+from pathlib import Path
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+
+from PIL import Image
+
+import config
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class VLMResponse:
+    """Normalized response from any VLM backend, including tool calls."""
+    text: str
+    model: str
+    backend: str
+    input_tokens: int
+    output_tokens: int
+    latency_ms: float
+    tool_call: object | None = None  # tools.ToolCall if a tool was called
+
+
+def _normalize_image_mode(img: Image.Image) -> Image.Image:
+    """Normalize medical image modes to RGB-compatible formats for JPEG encoding."""
+    if img.mode in ("RGB",):
+        return img
+    if img.mode == "RGBA":
+        background = Image.new("RGB", img.size, (255, 255, 255))
+        background.paste(img, mask=img.split()[3])
+        return background
+    if img.mode == "L":
+        return img.convert("RGB")
+    if img.mode in ("I", "I;16", "I;16B", "I;16L"):
+        import numpy as np
+        arr = np.array(img, dtype=np.float64)
+        if arr.max() > arr.min():
+            arr = (arr - arr.min()) / (arr.max() - arr.min()) * 255.0
+        else:
+            arr = np.zeros_like(arr)
+        return Image.fromarray(arr.astype(np.uint8)).convert("RGB")
+    if img.mode == "F":
+        import numpy as np
+        arr = np.array(img, dtype=np.float64)
+        if arr.max() > arr.min():
+            arr = (arr - arr.min()) / (arr.max() - arr.min()) * 255.0
+        else:
+            arr = np.zeros_like(arr)
+        return Image.fromarray(arr.astype(np.uint8)).convert("RGB")
+    return img.convert("RGB")
+
+
+def encode_image_to_base64(image_path: str | Path, max_size: int = 1024) -> str:
+    """Load and encode an image to base64, resizing if needed."""
+    img = Image.open(image_path)
+    if max(img.size) > max_size:
+        ratio = max_size / max(img.size)
+        new_size = (int(img.size[0] * ratio), int(img.size[1] * ratio))
+        img = img.resize(new_size, Image.LANCZOS)
+    img = _normalize_image_mode(img)
+    buf = io.BytesIO()
+    img.save(buf, format="JPEG", quality=90)
+    return base64.b64encode(buf.getvalue()).decode("utf-8")
+
+
+def encode_pil_image_to_base64(img: Image.Image, max_size: int = 1024) -> str:
+    """Encode a PIL Image object to base64."""
+    if max(img.size) > max_size:
+        ratio = max_size / max(img.size)
+        new_size = (int(img.size[0] * ratio), int(img.size[1] * ratio))
+        img = img.resize(new_size, Image.LANCZOS)
+    img = _normalize_image_mode(img)
+    buf = io.BytesIO()
+    img.save(buf, format="JPEG", quality=90)
+    return base64.b64encode(buf.getvalue()).decode("utf-8")
+
+
+class BaseVLMClient(ABC):
+    """Abstract base class for VLM API clients with tool-use support."""
+
+    def __init__(self, model: str, api_key: str, rate_limit: int = 30):
+        self.model = model
+        self.api_key = api_key
+        self.rate_limit = rate_limit
+        self._call_timestamps: deque[float] = deque()
+
+    def _rate_limit_wait(self):
+        """Enforce rate limiting using a sliding window over the last 60 seconds."""
+        now = time.time()
+        while self._call_timestamps and now - self._call_timestamps[0] >= 60.0:
+            self._call_timestamps.popleft()
+        if len(self._call_timestamps) >= self.rate_limit:
+            sleep_time = 60.0 - (now - self._call_timestamps[0])
+            if sleep_time > 0:
+                time.sleep(sleep_time)
+            self._call_timestamps.popleft()
+        self._call_timestamps.append(time.time())
+
+    @abstractmethod
+    def call(
+        self,
+        system_prompt: str,
+        user_text: str,
+        images: list[str] | None = None,
+        temperature: float = 0.1,
+        max_tokens: int = 2048,
+        tools: list[dict] | None = None,
+    ) -> VLMResponse:
+        """Make a VLM API call, optionally with tools."""
+        pass
+
+    def call_multiturn(
+        self,
+        system_prompt: str,
+        messages: list[dict],
+        temperature: float = 0.1,
+        max_tokens: int = 2048,
+        tools: list[dict] | None = None,
+    ) -> VLMResponse:
+        """Multi-turn conversation call with tool support. Override in subclasses."""
+        last_user = ""
+        last_images = []
+        for msg in reversed(messages):
+            if msg["role"] == "user":
+                if isinstance(msg["content"], str):
+                    last_user = msg["content"]
+                elif isinstance(msg["content"], list):
+                    for block in msg["content"]:
+                        if block.get("type") == "text":
+                            last_user = block["text"]
+                        elif block.get("type") == "image_url":
+                            last_images.append(block["image_url"]["url"].split(",", 1)[-1])
+                break
+        return self.call(system_prompt, last_user, last_images or None, temperature, max_tokens, tools)
+
+    def call_with_retry(
+        self,
+        system_prompt: str,
+        user_text: str | None = None,
+        images: list[str] | None = None,
+        temperature: float = 0.1,
+        max_tokens: int = 2048,
+        max_retries: int = 3,
+        messages: list[dict] | None = None,
+        tools: list[dict] | None = None,
+    ) -> VLMResponse:
+        """Call with exponential backoff retry. Supports single-turn, multi-turn, and tools."""
+        for attempt in range(max_retries):
+            try:
+                self._rate_limit_wait()
+                if messages is not None:
+                    return self.call_multiturn(system_prompt, messages, temperature, max_tokens, tools)
+                return self.call(system_prompt, user_text, images, temperature, max_tokens, tools)
+            except Exception as e:
+                if attempt == max_retries - 1:
+                    raise
+                wait_time = 2 ** attempt * 5
+                logger.warning(
+                    f"API call failed (attempt {attempt + 1}/{max_retries}): {e}. "
+                    f"Retrying in {wait_time}s..."
+                )
+                time.sleep(wait_time)
+
+
+def _parse_tool_call_openai(response_message) -> object | None:
+    """Extract a ToolCall from an OpenAI response message."""
+    from tools import ToolCall
+
+    tool_calls = getattr(response_message, "tool_calls", None)
+    if not tool_calls:
+        return None
+
+    tc = tool_calls[0]  # Take the first tool call
+    try:
+        arguments = json.loads(tc.function.arguments)
+    except (json.JSONDecodeError, AttributeError):
+        arguments = {}
+
+    return ToolCall(
+        tool_name=tc.function.name,
+        arguments=arguments,
+        call_id=tc.id,
+    )
+
+
+def _parse_tool_call_anthropic(response) -> object | None:
+    """Extract a ToolCall from an Anthropic response."""
+    from tools import ToolCall
+
+    for block in response.content:
+        if block.type == "tool_use":
+            return ToolCall(
+                tool_name=block.name,
+                arguments=block.input,
+                call_id=block.id,
+            )
+    return None
+
+
+# ============================================================
+# OpenAI Backend (GPT-4o) — with tool calling
+# ============================================================
+
+class OpenAIClient(BaseVLMClient):
+    """OpenAI GPT-4o API client with native function calling."""
+
+    def __init__(self, model: str | None = None, api_key: str | None = None, rate_limit: int | None = None):
+        super().__init__(
+            model=model or config.MODELS["openai"],
+            api_key=api_key or config.OPENAI_API_KEY,
+            rate_limit=rate_limit or config.RATE_LIMITS["openai"],
+        )
+        from openai import OpenAI
+        self.client = OpenAI(api_key=self.api_key)
+
+    def call(
+        self,
+        system_prompt: str,
+        user_text: str,
+        images: list[str] | None = None,
+        temperature: float = 0.1,
+        max_tokens: int = 2048,
+        tools: list[dict] | None = None,
+    ) -> VLMResponse:
+        content = []
+        if images:
+            for img_b64 in images:
+                content.append({
+                    "type": "image_url",
+                    "image_url": {"url": f"data:image/jpeg;base64,{img_b64}", "detail": "high"},
+                })
+        content.append({"type": "text", "text": user_text})
+
+        messages = [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": content},
+        ]
+
+        kwargs = {
+            "model": self.model,
+            "messages": messages,
+            "temperature": temperature,
+            "max_tokens": max_tokens,
+        }
+        if tools:
+            from tools import to_openai_tools
+            kwargs["tools"] = to_openai_tools(tools)
+            kwargs["tool_choice"] = "required"
+
+        t0 = time.time()
+        response = self.client.chat.completions.create(**kwargs)
+        latency = (time.time() - t0) * 1000
+
+        msg = response.choices[0].message
+        tool_call = _parse_tool_call_openai(msg) if tools else None
+
+        return VLMResponse(
+            text=msg.content or "",
+            model=self.model,
+            backend="openai",
+            input_tokens=response.usage.prompt_tokens,
+            output_tokens=response.usage.completion_tokens,
+            latency_ms=latency,
+            tool_call=tool_call,
+        )
+
+    def call_multiturn(
+        self,
+        system_prompt: str,
+        messages: list[dict],
+        temperature: float = 0.1,
+        max_tokens: int = 2048,
+        tools: list[dict] | None = None,
+    ) -> VLMResponse:
+        """
+        Multi-turn OpenAI call with full tool-calling protocol.
+
+        Translates our internal message format to OpenAI's API format:
+        - "user" → role:"user" (passed through)
+        - "assistant" → role:"assistant" with tool_calls array
+        - "tool_result" → role:"tool" (text result) + role:"user" (images + follow-up)
+
+        OpenAI requires: after an assistant message with tool_calls, the next
+        message MUST be role:"tool" with the matching tool_call_id. Images
+        cannot go in tool messages, so we send them in a separate user message.
+        """
+        api_messages = [{"role": "system", "content": system_prompt}]
+
+        for msg in messages:
+            role = msg["role"]
+
+            if role == "user":
+                api_messages.append({
+                    "role": "user",
+                    "content": msg["content"],
+                })
+
+            elif role == "assistant":
+                api_msg = {"role": "assistant"}
+                if msg.get("tool_calls"):
+                    tc = msg["tool_calls"][0]
+                    api_msg["tool_calls"] = [{
+                        "id": tc.call_id,
+                        "type": "function",
+                        "function": {
+                            "name": tc.tool_name,
+                            "arguments": json.dumps(tc.arguments),
+                        },
+                    }]
+                    # OpenAI requires content to be null when tool_calls present
+                    api_msg["content"] = msg.get("content") or None
+                else:
+                    api_msg["content"] = msg.get("content", "")
+                api_messages.append(api_msg)
+
+            elif role == "tool_result":
+                # Step 1: Send the tool result as role:"tool"
+                api_messages.append({
+                    "role": "tool",
+                    "tool_call_id": msg["tool_call_id"],
+                    "content": msg.get("content", ""),
+                })
+
+                # Step 2: Send images + follow-up as a user message
+                # (OpenAI tool messages don't support image content blocks)
+                follow_up_content = []
+                for img_b64 in msg.get("images", []):
+                    follow_up_content.append({
+                        "type": "image_url",
+                        "image_url": {
+                            "url": f"data:image/jpeg;base64,{img_b64}",
+                        },
+                    })
+                follow_up = msg.get("follow_up", "")
+                if follow_up:
+                    follow_up_content.append({
+                        "type": "text",
+                        "text": follow_up,
+                    })
+                if follow_up_content:
+                    api_messages.append({
+                        "role": "user",
+                        "content": follow_up_content,
+                    })
+
+        kwargs = {
+            "model": self.model,
+            "messages": api_messages,
+            "temperature": temperature,
+            "max_tokens": max_tokens,
+        }
+        if tools:
+            from tools import to_openai_tools
+            kwargs["tools"] = to_openai_tools(tools)
+            kwargs["tool_choice"] = "required"
+
+        t0 = time.time()
+        response = self.client.chat.completions.create(**kwargs)
+        latency = (time.time() - t0) * 1000
+
+        msg = response.choices[0].message
+        tool_call = _parse_tool_call_openai(msg) if tools else None
+
+        return VLMResponse(
+            text=msg.content or "",
+            model=self.model,
+            backend="openai",
+            input_tokens=response.usage.prompt_tokens,
+            output_tokens=response.usage.completion_tokens,
+            latency_ms=latency,
+            tool_call=tool_call,
+        )
+
+
+ # ============================================================
389
+ # Anthropic Backend (Claude) — with tool use
390
+ # ============================================================
391
+
392
+ class AnthropicClient(BaseVLMClient):
393
+ """Anthropic Claude API client with native tool use."""
394
+
395
+ def __init__(self, model: str = None, api_key: str = None, rate_limit: int = None):
396
+ super().__init__(
397
+ model=model or config.MODELS["anthropic"],
398
+ api_key=api_key or config.ANTHROPIC_API_KEY,
399
+ rate_limit=rate_limit or config.RATE_LIMITS["anthropic"],
400
+ )
401
+ from anthropic import Anthropic
402
+ self.client = Anthropic(api_key=self.api_key)
403
+
404
+ def call(
405
+ self,
406
+ system_prompt: str,
407
+ user_text: str,
408
+ images: list[str] | None = None,
409
+ temperature: float = 0.1,
410
+ max_tokens: int = 2048,
411
+ tools: list[dict] | None = None,
412
+ ) -> VLMResponse:
413
+ content = []
414
+ if images:
415
+ for img_b64 in images:
416
+ content.append({
417
+ "type": "image",
418
+ "source": {
419
+ "type": "base64",
420
+ "media_type": "image/jpeg",
421
+ "data": img_b64,
422
+ },
423
+ })
424
+ content.append({"type": "text", "text": user_text})
425
+
426
+ kwargs = {
427
+ "model": self.model,
428
+ "system": system_prompt,
429
+ "messages": [{"role": "user", "content": content}],
430
+ "temperature": temperature,
431
+ "max_tokens": max_tokens,
432
+ }
433
+ if tools:
434
+ from tools import to_anthropic_tools
435
+ kwargs["tools"] = to_anthropic_tools(tools)
436
+ kwargs["tool_choice"] = {"type": "any"}
437
+
438
+ t0 = time.time()
439
+ response = self.client.messages.create(**kwargs)
440
+ latency = (time.time() - t0) * 1000
441
+
442
+ # Extract text from response (may have both text and tool_use blocks)
443
+ text_parts = []
444
+ for block in response.content:
445
+ if hasattr(block, "text"):
446
+ text_parts.append(block.text)
447
+
448
+ tool_call = _parse_tool_call_anthropic(response) if tools else None
449
+
450
+ return VLMResponse(
451
+ text="\n".join(text_parts),
452
+ model=self.model,
453
+ backend="anthropic",
454
+ input_tokens=response.usage.input_tokens,
455
+ output_tokens=response.usage.output_tokens,
456
+ latency_ms=latency,
457
+ tool_call=tool_call,
458
+ )
459
+
+    def call_multiturn(
+        self,
+        system_prompt: str,
+        messages: list[dict],
+        temperature: float = 0.1,
+        max_tokens: int = 2048,
+        tools: list[dict] | None = None,
+    ) -> VLMResponse:
+        """
+        Multi-turn Anthropic call with full tool-use protocol.
+
+        Translates our internal message format to Anthropic's API format:
+        - "user" → role:"user" (passed through)
+        - "assistant" → role:"assistant" with tool_use content blocks
+        - "tool_result" → role:"user" with tool_result block + image blocks
+
+        Anthropic's protocol: after an assistant message with a tool_use block,
+        the next message MUST be role:"user" containing a tool_result block
+        with the matching tool_use_id. Images and follow-up text can be
+        included in the same user message as additional content blocks.
+        """
+        api_messages = []
+
+        for msg in messages:
+            role = msg["role"]
+
+            if role == "user":
+                content = msg["content"]
+                # Convert image_url format to Anthropic's image format
+                if isinstance(content, list):
+                    anthropic_content = []
+                    for block in content:
+                        if block.get("type") == "image_url":
+                            url = block["image_url"]["url"]
+                            # Extract base64 data from data URL
+                            if url.startswith("data:"):
+                                b64_data = url.split(",", 1)[-1]
+                            else:
+                                b64_data = url
+                            anthropic_content.append({
+                                "type": "image",
+                                "source": {
+                                    "type": "base64",
+                                    "media_type": "image/jpeg",
+                                    "data": b64_data,
+                                },
+                            })
+                        else:
+                            # Text and any other block types pass through unchanged
+                            anthropic_content.append(block)
+                    api_messages.append({
+                        "role": "user",
+                        "content": anthropic_content,
+                    })
+                else:
+                    api_messages.append({
+                        "role": "user",
+                        "content": content,
+                    })
+
+            elif role == "assistant":
+                content_blocks = []
+                if msg.get("content"):
+                    content_blocks.append({
+                        "type": "text",
+                        "text": msg["content"],
+                    })
+                if msg.get("tool_calls"):
+                    tc = msg["tool_calls"][0]
+                    content_blocks.append({
+                        "type": "tool_use",
+                        "id": tc.call_id,
+                        "name": tc.tool_name,
+                        "input": tc.arguments,
+                    })
+                api_messages.append({
+                    "role": "assistant",
+                    "content": content_blocks,
+                })
+
+            elif role == "tool_result":
+                # Anthropic: tool_result goes in a user message alongside
+                # any images and follow-up text
+                user_content = []
+
+                # The tool_result block
+                user_content.append({
+                    "type": "tool_result",
+                    "tool_use_id": msg["tool_call_id"],
+                    "content": msg.get("content", ""),
+                })
+
+                # Images from the channel data
+                for img_b64 in msg.get("images", []):
+                    user_content.append({
+                        "type": "image",
+                        "source": {
+                            "type": "base64",
+                            "media_type": "image/jpeg",
+                            "data": img_b64,
+                        },
+                    })
+
+                # Follow-up text (next-step instructions)
+                follow_up = msg.get("follow_up", "")
+                if follow_up:
+                    user_content.append({
+                        "type": "text",
+                        "text": follow_up,
+                    })
+
+                api_messages.append({
+                    "role": "user",
+                    "content": user_content,
+                })
+
+        kwargs = {
+            "model": self.model,
+            "system": system_prompt,
+            "messages": api_messages,
+            "temperature": temperature,
+            "max_tokens": max_tokens,
+        }
+        if tools:
+            from tools import to_anthropic_tools
+            kwargs["tools"] = to_anthropic_tools(tools)
+            kwargs["tool_choice"] = {"type": "any"}
+
+        t0 = time.time()
+        response = self.client.messages.create(**kwargs)
+        latency = (time.time() - t0) * 1000
+
+        text_parts = []
+        for block in response.content:
+            if hasattr(block, "text"):
+                text_parts.append(block.text)
+
+        tool_call = _parse_tool_call_anthropic(response) if tools else None
+
+        return VLMResponse(
+            text="\n".join(text_parts),
+            model=self.model,
+            backend="anthropic",
+            input_tokens=response.usage.input_tokens,
+            output_tokens=response.usage.output_tokens,
+            latency_ms=latency,
+            tool_call=tool_call,
+        )
+
+
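The `tool_result` branch is the subtle part of the protocol translation described in the docstring. A self-contained sketch of just that mapping (the function name and the sample IDs here are hypothetical, not part of the client):

```python
def tool_result_to_user_turn(msg: dict) -> dict:
    """Map an internal 'tool_result' message to the Anthropic user turn
    that must follow an assistant tool_use block."""
    content = [{
        "type": "tool_result",
        "tool_use_id": msg["tool_call_id"],
        "content": msg.get("content", ""),
    }]
    for img_b64 in msg.get("images", []):
        content.append({
            "type": "image",
            "source": {"type": "base64", "media_type": "image/jpeg", "data": img_b64},
        })
    if msg.get("follow_up"):
        content.append({"type": "text", "text": msg["follow_up"]})
    return {"role": "user", "content": content}

turn = tool_result_to_user_turn({
    "role": "tool_result",
    "tool_call_id": "toolu_01",  # hypothetical ID
    "content": "exam_findings: bibasilar crackles, SpO2 92%",
    "images": ["<base64-jpeg>"],
    "follow_up": "Decide your next action.",
})
```

The tool_result block comes first and carries the matching `tool_use_id`; images and follow-up text ride along as additional blocks in the same user message.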
+
+# ============================================================
+# Together Backend (Qwen2.5-VL) — with tool calling
+# ============================================================
+
+class TogetherClient(BaseVLMClient):
+    """Together AI client with function calling support."""
+
+    def __init__(self, model: str = None, api_key: str = None, rate_limit: int = None):
+        super().__init__(
+            model=model or config.MODELS["together"],
+            api_key=api_key or config.TOGETHER_API_KEY,
+            rate_limit=rate_limit or config.RATE_LIMITS["together"],
+        )
+        from together import Together
+        self.client = Together(api_key=self.api_key)
+
+    def call(
+        self,
+        system_prompt: str,
+        user_text: str,
+        images: list[str] | None = None,
+        temperature: float = 0.1,
+        max_tokens: int = 2048,
+        tools: list[dict] | None = None,
+    ) -> VLMResponse:
+        content = []
+        if images:
+            for img_b64 in images:
+                content.append({
+                    "type": "image_url",
+                    "image_url": {"url": f"data:image/jpeg;base64,{img_b64}"},
+                })
+        content.append({"type": "text", "text": user_text})
+
+        messages = [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": content},
+        ]
+
+        kwargs = {
+            "model": self.model,
+            "messages": messages,
+            "temperature": temperature,
+            "max_tokens": max_tokens,
+        }
+        if tools:
+            from tools import to_openai_tools
+            kwargs["tools"] = to_openai_tools(tools)
+
+        t0 = time.time()
+        response = self.client.chat.completions.create(**kwargs)
+        latency = (time.time() - t0) * 1000
+
+        msg = response.choices[0].message
+        usage = response.usage
+        tool_call = _parse_tool_call_openai(msg) if tools else None
+
+        return VLMResponse(
+            text=msg.content or "",
+            model=self.model,
+            backend="together",
+            input_tokens=getattr(usage, "prompt_tokens", 0),
+            output_tokens=getattr(usage, "completion_tokens", 0),
+            latency_ms=latency,
+            tool_call=tool_call,
+        )
+
+
+# ============================================================
+# Client Factory
+# ============================================================
+
+class OpenAIMiniClient(OpenAIClient):
+    """OpenAI GPT-4o-mini client."""
+
+    def __init__(self, model: str = None, api_key: str = None, rate_limit: int = None):
+        BaseVLMClient.__init__(
+            self,
+            model=model or config.MODELS["openai_mini"],
+            api_key=api_key or config.OPENAI_API_KEY,
+            rate_limit=rate_limit or config.RATE_LIMITS["openai_mini"],
+        )
+        from openai import OpenAI
+        self.client = OpenAI(api_key=self.api_key)
+
+
+def create_client(backend: str, **kwargs) -> BaseVLMClient:
+    """Factory function to create a VLM client by backend name."""
+    clients = {
+        "openai": OpenAIClient,
+        "openai_mini": OpenAIMiniClient,
+        "anthropic": AnthropicClient,
+        "together": TogetherClient,
+    }
+    if backend not in clients:
+        raise ValueError(f"Unknown backend: {backend}. Choose from {list(clients.keys())}")
+    return clients[backend](**kwargs)
app.py ADDED
@@ -0,0 +1,1000 @@
+"""
+Interactive Demo for ActiveMedAgent.
+
+A Gradio-based UI that lets users:
+- Select from pre-built demo cases OR enter a custom clinical scenario
+- Upload medical images (optional)
+- Watch the agent's step-by-step reasoning, information acquisition, and
+  entropy reduction in real time
+- No budget constraint — the agent acquires as many channels as it needs
+
+Usage:
+    python app.py
+    python app.py --backend openai
+    python app.py --backend anthropic --port 7861
+"""
+import argparse
+import json
+import logging
+import sys
+import time
+import math
+from pathlib import Path
+from dataclasses import dataclass, field
+
+import numpy as np
+import gradio as gr
+from PIL import Image
+
+sys.path.insert(0, str(Path(__file__).resolve().parent))
+
+import config
+from api_client import create_client, encode_image_to_base64, encode_pil_image_to_base64
+from agent import ActiveMedAgent, AgentResult, AcquisitionStep, SYSTEM_PROMPT_FULL, SYSTEM_PROMPT_CONDENSED, SYSTEM_PROMPT_FINAL
+from datasets.base import MedicalCase, ChannelData
+from tools import AGENT_TOOLS, constrain_tools_for_step, ToolCall
+from information_gain import (
+    BeliefState, BeliefTrajectory,
+    compute_entropy, compute_kl_divergence,
+    estimate_expected_information_gain,
+    should_commit, compute_value_of_information,
+)
+from prompts import format_available_channels, format_acquired_info
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
+)
+logger = logging.getLogger(__name__)
+
+
+# ============================================================
+# Backend Availability Detection
+# ============================================================
+
+def _detect_available_backends() -> list[str]:
+    """Detect which backends have API keys configured."""
+    available = []
+    if config.OPENAI_API_KEY and config.OPENAI_API_KEY != "sk-...":
+        available.append("openai")
+    if config.ANTHROPIC_API_KEY and config.ANTHROPIC_API_KEY != "sk-ant-...":
+        available.append("anthropic")
+    if config.TOGETHER_API_KEY:
+        available.append("together")
+    return available
+
+
+AVAILABLE_BACKENDS = _detect_available_backends()
+
+
+# ============================================================
+# Simulation Mode — works without API keys
+# ============================================================
+
+def _simulate_agent_on_case(case: MedicalCase) -> AgentResult:
+    """
+    Run a simulated agent that demonstrates the full pipeline
+    with realistic-looking reasoning traces. No API keys needed.
+    """
+    import random
+    random.seed(42)
+
+    result = AgentResult(
+        case_id=case.case_id,
+        dataset=case.dataset,
+        prompt_variant="A",
+        backend="simulated (no API key)",
+        budget=len(case.requestable_channels),
+    )
+    trajectory = BeliefTrajectory(case_id=case.case_id)
+    acquired = []
+    n_candidates = len(case.candidates)
+
+    # Generate initial uniform-ish distribution
+    probs = np.random.dirichlet(np.ones(n_candidates) * 2.0).tolist()
+    probs.sort(reverse=True)
+    # Make ground truth likely to end up on top by the end
+    gt_idx = case.ground_truth_rank
+
+    requestable_names = list(case.requestable_channels.keys())
+    cumulative_cost = case.get_initial_cost()
+
+    for step_idx, ch_name in enumerate(requestable_names):
+        ch = case.requestable_channels[ch_name]
+
+        # Evolve the distribution — gradually concentrate on correct answer
+        progress = (step_idx + 1) / len(requestable_names)
+        new_probs = []
+        for i in range(n_candidates):
+            if i == gt_idx:
+                new_probs.append(probs[i] + 0.15 * progress + random.uniform(0, 0.05))
+            else:
+                new_probs.append(max(0.01, probs[i] - 0.04 * progress + random.uniform(-0.02, 0.02)))
+        total = sum(new_probs)
+        probs = [p / total for p in new_probs]
+
+        distribution = {case.candidates[i]: probs[i] for i in range(n_candidates)}
+        sorted_dist = sorted(distribution.items(), key=lambda x: -x[1])
+
+        prev_entropy = trajectory.states[-1].entropy if trajectory.states else compute_entropy(distribution) + 0.3
+        belief = BeliefState(
+            step=step_idx,
+            distribution=distribution,
+            channel_acquired=ch_name,
+        )
+        trajectory.states.append(belief)
+
+        ig = prev_entropy - belief.entropy
+        kl = abs(ig) * 1.2 + random.uniform(0, 0.1)
+
+        top_two = sorted_dist[:2]
+        reasoning_templates = [
+            f"Need to distinguish between {top_two[0][0]} ({top_two[0][1]:.0%}) and {top_two[1][0]} ({top_two[1][1]:.0%}). "
+            f"Requesting {ch_name} to resolve this uncertainty.",
+            f"Current top diagnosis is {top_two[0][0]} at {top_two[0][1]:.0%} but {top_two[1][0]} cannot be ruled out. "
+            f"The {ch_name} channel should provide discriminating evidence.",
+            f"Diagnostic uncertainty remains high (H={belief.entropy:.2f} bits). "
+            f"The {ch_name} data is expected to significantly narrow the differential.",
+        ]
+
+        step = AcquisitionStep(
+            step=step_idx,
+            tool_call=ToolCall(tool_name="request_information", arguments={
+                "channel_name": ch_name,
+                "reasoning": reasoning_templates[step_idx % len(reasoning_templates)],
+            }),
+            requested_channel=ch_name,
+            reasoning=reasoning_templates[step_idx % len(reasoning_templates)],
+            differential=[
+                {"name": name, "confidence": prob, "rank": i + 1}
+                for i, (name, prob) in enumerate(sorted_dist)
+            ],
+            committed=False,
+            raw_response="(simulated)",
+            latency_ms=random.uniform(800, 3000),
+            entropy=belief.entropy,
+            information_gain=ig,
+            kl_divergence=kl,
+            expected_impact={
+                "if_positive": sorted_dist[0][0],
+                "if_negative": sorted_dist[1][0],
+            },
+        )
+        result.steps.append(step)
+        acquired.append(ch_name)
+
+    # Final commit step
+    final_probs = []
+    for i in range(n_candidates):
+        if i == gt_idx:
+            final_probs.append(0.65 + random.uniform(0, 0.15))
+        else:
+            final_probs.append(random.uniform(0.02, 0.12))
+    total = sum(final_probs)
+    final_probs = [p / total for p in final_probs]
+    final_dist = {case.candidates[i]: final_probs[i] for i in range(n_candidates)}
+    sorted_final = sorted(final_dist.items(), key=lambda x: -x[1])
+
+    final_belief = BeliefState(
+        step=len(requestable_names),
+        distribution=final_dist,
+        channel_acquired=None,
+    )
+    trajectory.states.append(final_belief)
+
+    final_ranking = [
+        {
+            "name": name,
+            "confidence": prob,
+            "rank": i + 1,
+            "key_evidence": "Supported by evidence from acquired channels" if i == 0 else "Less consistent with findings",
+        }
+        for i, (name, prob) in enumerate(sorted_final)
+    ]
+
+    commit_step = AcquisitionStep(
+        step=len(requestable_names),
+        tool_call=ToolCall(tool_name="commit_diagnosis", arguments={}),
+        requested_channel=None,
+        reasoning=f"After acquiring all available channels, the evidence strongly supports {sorted_final[0][0]}. "
+                  f"Entropy reduced to {final_belief.entropy:.2f} bits. Committing diagnosis.",
+        differential=final_ranking,
+        committed=True,
+        raw_response="(simulated)",
+        latency_ms=random.uniform(500, 2000),
+        entropy=final_belief.entropy,
+        information_gain=trajectory.states[-2].entropy - final_belief.entropy if len(trajectory.states) >= 2 else 0,
+        kl_divergence=0.0,
+    )
+    result.steps.append(commit_step)
+    result.committed_early = False
+    result.final_ranking = final_ranking
+    result.acquired_channels = acquired
+    result.belief_trajectory = trajectory
+    result.acquisition_cost = case.get_acquisition_cost(acquired)
+    result.total_case_cost = case.get_total_cost(acquired)
+    result.total_latency_ms = sum(s.latency_ms for s in result.steps)
+    result.total_input_tokens = 0
+    result.total_output_tokens = 0
+
+    return result
+
+
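The entropy and information-gain numbers in the simulated trace come from `compute_entropy` in `information_gain`; a minimal stand-in consistent with those values is plain Shannon entropy in bits:

```python
import math

def entropy_bits(dist: dict[str, float]) -> float:
    """Shannon entropy H(p) = -sum(p * log2(p)), in bits."""
    return -sum(p * math.log2(p) for p in dist.values() if p > 0)

# A 4-way uniform differential carries 2.0 bits of uncertainty.
before = {"A": 0.25, "B": 0.25, "C": 0.25, "D": 0.25}
# After a discriminating channel, belief concentrates on A.
after = {"A": 0.7, "B": 0.1, "C": 0.1, "D": 0.1}
gain = entropy_bits(before) - entropy_bits(after)  # per-step information gain
```

This difference between consecutive belief states is exactly what the trace records as `information_gain`.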
+# ============================================================
+# Synthetic Demo Cases
+# ============================================================
+
+def _make_dummy_image(width=224, height=224, color=(180, 60, 60)) -> str:
+    img = Image.new("RGB", (width, height), color)
+    arr = np.array(img)
+    noise = np.random.randint(-20, 20, arr.shape, dtype=np.int16)
+    arr = np.clip(arr.astype(np.int16) + noise, 0, 255).astype(np.uint8)
+    img = Image.fromarray(arr)
+    return encode_pil_image_to_base64(img)
+
+
+DEMO_CASES = {
+    "NEJM: Pulmonary Fibrosis": {
+        "description": (
+            "A 58-year-old man with progressive dyspnea and dry cough over 3 months. "
+            "30-pack-year smoking history, takes lisinopril for hypertension."
+        ),
+        "case": lambda: MedicalCase(
+            case_id="demo_nejm_ipf",
+            dataset="nejm",
+            initial_channels={
+                "demographics": ChannelData(
+                    name="demographics", channel_type="text",
+                    description="Patient age, sex, and ethnicity",
+                    value="A 58-year-old man", always_given=True, cost=0.0, tier="free",
+                ),
+                "chief_complaint": ChannelData(
+                    name="chief_complaint", channel_type="text",
+                    description="Presenting symptoms and duration",
+                    value="Progressive dyspnea and dry cough over the past 3 months.",
+                    always_given=True, cost=0.0, tier="free",
+                ),
+                "medical_history": ChannelData(
+                    name="medical_history", channel_type="text",
+                    description="Past medical conditions, medications, family and social history",
+                    value="30-pack-year smoking history. No prior lung disease. Takes lisinopril for hypertension.",
+                    always_given=True, cost=0.0, tier="free",
+                ),
+            },
+            requestable_channels={
+                "exam_findings": ChannelData(
+                    name="exam_findings", channel_type="text",
+                    description="Physical examination results and observations",
+                    value="Bibasilar crackles on auscultation. No clubbing. Oxygen saturation 92% on room air.",
+                    cost=75.0, tier="cheap",
+                ),
+                "investigations": ChannelData(
+                    name="investigations", channel_type="text",
+                    description="Laboratory values, prior imaging results, and test outcomes",
+                    value="PFTs show restrictive pattern with reduced DLCO. CT chest shows bilateral ground-glass opacities with honeycombing in the lower lobes.",
+                    cost=250.0, tier="moderate",
+                ),
+                "image": ChannelData(
+                    name="image", channel_type="image",
+                    description="The primary diagnostic image (chest CT)",
+                    value=_make_dummy_image(300, 300, (200, 200, 210)),
+                    cost=800.0, tier="expensive",
+                ),
+            },
+            candidates=[
+                "A. Idiopathic pulmonary fibrosis",
+                "B. Hypersensitivity pneumonitis",
+                "C. Sarcoidosis",
+                "D. Lung adenocarcinoma",
+                "E. ACE-inhibitor induced cough with incidental CT findings",
+            ],
+            ground_truth="A. Idiopathic pulmonary fibrosis",
+            ground_truth_rank=0,
+        ),
+    },
+    "Dermatology: Pigmented Lesion": {
+        "description": (
+            "A 62-year-old woman presents with a pigmented lesion on her left forearm. "
+            "The lesion is 8mm x 6mm. Clinical photograph provided."
+        ),
+        "case": lambda: MedicalCase(
+            case_id="demo_midas_001",
+            dataset="midas",
+            initial_channels={
+                "clinical_30cm": ChannelData(
+                    name="clinical_30cm", channel_type="image",
+                    description="Clinical photograph at 30cm distance",
+                    value=_make_dummy_image(224, 224, (180, 120, 100)),
+                    always_given=True, cost=0.0, tier="free",
+                ),
+            },
+            requestable_channels={
+                "patient_demographics": ChannelData(
+                    name="patient_demographics", channel_type="text",
+                    description="Patient age, sex, and Fitzpatrick skin type",
+                    value="Age: 62; Sex: Female; Fitzpatrick skin type: III",
+                    cost=0.0, tier="free",
+                ),
+                "lesion_metadata": ChannelData(
+                    name="lesion_metadata", channel_type="text",
+                    description="Anatomic location, lesion length and width",
+                    value="Anatomic location: Left forearm; Lesion length: 8mm; Lesion width: 6mm",
+                    cost=25.0, tier="cheap",
+                ),
+                "clinical_15cm": ChannelData(
+                    name="clinical_15cm", channel_type="image",
+                    description="Clinical photograph at 15cm distance (closer view)",
+                    value=_make_dummy_image(224, 224, (170, 110, 90)),
+                    cost=50.0, tier="moderate",
+                ),
+                "dermoscopy": ChannelData(
+                    name="dermoscopy", channel_type="image",
+                    description="Dermoscopic image showing subsurface skin structures",
+                    value=_make_dummy_image(224, 224, (100, 80, 60)),
+                    cost=250.0, tier="expensive",
+                ),
+            },
+            candidates=[
+                "Melanoma in situ",
+                "Dysplastic nevus",
+                "Basal cell carcinoma",
+                "Seborrheic keratosis",
+                "Solar lentigo",
+            ],
+            ground_truth="Dysplastic nevus",
+            ground_truth_rank=1,
+        ),
+    },
+    "Ophthalmology: Retinal Biomarkers (OLIVES)": {
+        "description": (
+            "A patient with diabetic macular edema (DME), 4 prior anti-VEGF injections, "
+            "32 weeks in treatment. Fundus photograph provided."
+        ),
+        "case": lambda: MedicalCase(
+            case_id="demo_olives_P01",
+            dataset="olives",
+            initial_channels={
+                "disease_context": ChannelData(
+                    name="disease_context", channel_type="text",
+                    description="Disease type and treatment context",
+                    value="Disease: Diabetic Macular Edema (DME). Prior anti-VEGF injections: 4. Weeks in treatment: 32.",
+                    always_given=True, cost=0.0, tier="free",
+                ),
+            },
+            requestable_channels={
+                "clinical_measurements": ChannelData(
+                    name="clinical_measurements", channel_type="text",
+                    description="Best Corrected Visual Acuity (BCVA) and Central Subfield Thickness (CST)",
+                    value="BCVA: 20/60 (logMAR 0.48); CST: 385 um",
+                    cost=20.0, tier="cheap",
+                ),
+                "biomarker_hints": ChannelData(
+                    name="biomarker_hints", channel_type="text",
+                    description="Expert-graded presence of fundus-visible retinal biomarkers",
+                    value="Hard Exudates: Present; Hemorrhage: Present; Microaneurysms: Present; Cotton Wool Spots: Not detected",
+                    cost=100.0, tier="moderate",
+                ),
+                "oct_scan": ChannelData(
+                    name="oct_scan", channel_type="image",
+                    description="OCT B-scan showing retinal cross-section",
+                    value=_make_dummy_image(512, 128, (60, 60, 60)),
+                    cost=300.0, tier="expensive",
+                ),
+                "additional_oct": ChannelData(
+                    name="additional_oct", channel_type="image",
+                    description="Additional OCT B-scans from different retinal locations",
+                    value=_make_dummy_image(512, 128, (50, 50, 55)),
+                    cost=150.0, tier="very_expensive",
+                ),
+            },
+            candidates=[
+                "Present biomarkers: Dril, Drt Me, Ez Disruption, Fluid Irf, Hard Exudates, Hemorrhage, Microaneurysms",
+                "Present biomarkers: Dril, Drt Me, Ez Disruption, Fluid Irf, Fluid Srf, Hard Exudates, Hemorrhage, Microaneurysms",
+                "Present biomarkers: Hard Exudates, Hemorrhage, Microaneurysms",
+                "Present biomarkers: Dril, Ez Disruption, Fluid Irf, Shrm",
+                "No biomarkers detected",
+            ],
+            ground_truth="Present biomarkers: Dril, Drt Me, Ez Disruption, Fluid Irf, Hard Exudates, Hemorrhage, Microaneurysms",
+            ground_truth_rank=0,
+        ),
+    },
+    "NEJM: Cardiac Case": {
+        "description": (
+            "A 45-year-old woman presents with sudden onset chest pain and shortness "
+            "of breath. She recently completed a long international flight."
+        ),
+        "case": lambda: MedicalCase(
+            case_id="demo_nejm_pe",
+            dataset="nejm",
+            initial_channels={
+                "demographics": ChannelData(
+                    name="demographics", channel_type="text",
+                    description="Patient age, sex, and ethnicity",
+                    value="A 45-year-old woman", always_given=True, cost=0.0, tier="free",
+                ),
+                "chief_complaint": ChannelData(
+                    name="chief_complaint", channel_type="text",
+                    description="Presenting symptoms and duration",
+                    value="Sudden onset chest pain and shortness of breath, started 2 hours ago after returning from a 14-hour international flight.",
+                    always_given=True, cost=0.0, tier="free",
+                ),
+                "medical_history": ChannelData(
+                    name="medical_history", channel_type="text",
+                    description="Past medical conditions, medications, family and social history",
+                    value="On oral contraceptives for 5 years. BMI 32. No prior VTE. Mother had DVT at age 50.",
+                    always_given=True, cost=0.0, tier="free",
+                ),
+            },
+            requestable_channels={
+                "exam_findings": ChannelData(
+                    name="exam_findings", channel_type="text",
+                    description="Physical examination results and observations",
+                    value="Tachycardic (HR 110), tachypneic (RR 24), SpO2 89% on room air. Right calf swollen and tender. JVP elevated. Loud P2 on cardiac auscultation.",
+                    cost=75.0, tier="cheap",
+                ),
+                "investigations": ChannelData(
+                    name="investigations", channel_type="text",
+                    description="Laboratory values, imaging results, and test outcomes",
+                    value="D-dimer: 4200 ng/mL (markedly elevated). Troponin I: 0.15 ng/mL (mildly elevated). ABG: pH 7.48, PaO2 62 mmHg, PaCO2 28 mmHg. ECG: S1Q3T3 pattern, right axis deviation. CT pulmonary angiography: bilateral pulmonary emboli with right heart strain.",
+                    cost=250.0, tier="moderate",
+                ),
+                "image": ChannelData(
+                    name="image", channel_type="image",
+                    description="CT Pulmonary Angiography image",
+                    value=_make_dummy_image(300, 300, (100, 100, 120)),
+                    cost=800.0, tier="expensive",
+                ),
+            },
+            candidates=[
+                "A. Pulmonary embolism",
+                "B. Acute myocardial infarction",
+                "C. Tension pneumothorax",
+                "D. Aortic dissection",
+                "E. Acute pericarditis",
+            ],
+            ground_truth="A. Pulmonary embolism",
+            ground_truth_rank=0,
+        ),
+    },
+}
+
+
+# ============================================================
+# Custom Case Builder
+# ============================================================
+
+def build_custom_case(
+    scenario_text: str,
+    candidates_text: str,
+    channel_1_name: str, channel_1_type: str, channel_1_value: str,
+    channel_2_name: str, channel_2_type: str, channel_2_value: str,
+    channel_3_name: str, channel_3_type: str, channel_3_value: str,
+    uploaded_image=None,
+) -> MedicalCase:
+    """Build a MedicalCase from user-provided custom inputs."""
+    candidates = [c.strip() for c in candidates_text.strip().split("\n") if c.strip()]
+    if not candidates:
+        candidates = ["Diagnosis A", "Diagnosis B", "Diagnosis C"]
+
+    initial_channels = {
+        "clinical_scenario": ChannelData(
+            name="clinical_scenario", channel_type="text",
+            description="The presenting clinical scenario",
+            value=scenario_text,
+            always_given=True, cost=0.0, tier="free",
+        ),
+    }
+
+    if uploaded_image is not None:
+        img_b64 = encode_pil_image_to_base64(Image.fromarray(uploaded_image))
+        initial_channels["uploaded_image"] = ChannelData(
+            name="uploaded_image", channel_type="image",
+            description="Uploaded medical image",
+            value=img_b64, always_given=True, cost=0.0, tier="free",
+        )
+
+    requestable = {}
+    for name, ctype, value in [
+        (channel_1_name, channel_1_type, channel_1_value),
+        (channel_2_name, channel_2_type, channel_2_value),
+        (channel_3_name, channel_3_type, channel_3_value),
+    ]:
+        name = name.strip()
+        value = value.strip()
+        if name and value:
+            key = name.lower().replace(" ", "_")
+            requestable[key] = ChannelData(
+                name=key, channel_type=ctype.lower(),
+                description=name,
+                value=value,
+                cost=100.0, tier="moderate",
+            )
+
+    # Register channel config so the agent can look it up
+    custom_config = {}
+    for name, ch in initial_channels.items():
+        custom_config[name] = {
+            "description": ch.description,
+            "type": ch.channel_type,
+            "always_given": True,
+            "tier": ch.tier,
+            "cost": ch.cost,
+            "order": 0,
+        }
+    for i, (name, ch) in enumerate(requestable.items()):
+        custom_config[name] = {
+            "description": ch.description,
+            "type": ch.channel_type,
+            "always_given": False,
+            "tier": ch.tier,
+            "cost": ch.cost,
+            "order": i + 1,
+        }
+    config.CHANNEL_CONFIGS["custom"] = custom_config
+
+    return MedicalCase(
+        case_id="custom_case",
+        dataset="custom",
+        initial_channels=initial_channels,
+        requestable_channels=requestable,
+        candidates=candidates,
+        ground_truth=candidates[0] if candidates else "",
+        ground_truth_rank=0,
+    )
+
+
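Two normalization rules in `build_custom_case` are worth noting: candidate lines are split on newlines with blanks dropped, and channel display names become snake_case dict keys. In isolation (sample inputs are hypothetical):

```python
candidates_text = "Pulmonary embolism\n\nPneumonia\n"
candidates = [c.strip() for c in candidates_text.strip().split("\n") if c.strip()]
# → ["Pulmonary embolism", "Pneumonia"]

channel_name = "Exam Findings"
key = channel_name.strip().lower().replace(" ", "_")
# → "exam_findings"
```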
+# ============================================================
+# Formatting Helpers
+# ============================================================
+
+def format_step_markdown(step_idx: int, step: AcquisitionStep, cumulative_cost: float) -> str:
+    """Format a single acquisition step as rich markdown."""
+    lines = []
+
+    if step.committed:
+        lines.append(f"### Step {step_idx + 1}: COMMITTED TO DIAGNOSIS")
+        lines.append("")
+        lines.append(f"**Reasoning:** {step.reasoning}")
+        lines.append("")
+        if step.differential:
+            lines.append("**Final Ranking:**")
+            for d in step.differential:
+                conf = d.get("confidence", 0)
+                bar = render_bar(conf)
+                evidence = d.get("key_evidence", "")
+                lines.append(f"- **{d['name']}** — {conf:.1%} {bar}")
+                if evidence:
+                    lines.append(f"  - *Evidence:* {evidence}")
+    else:
+        lines.append(f"### Step {step_idx + 1}: Requested `{step.requested_channel}`")
+        lines.append("")
+        lines.append(f"**Reasoning:** {step.reasoning}")
+        lines.append("")
+
+        if step.differential:
+            lines.append("**Current Differential:**")
+            for d in step.differential:
+                conf = d.get("confidence", 0)
+                bar = render_bar(conf)
+                lines.append(f"- {d['name']} — {conf:.1%} {bar}")
+
+        if step.expected_impact:
+            lines.append("")
+            lines.append("**Expected Impact:**")
+            pos = step.expected_impact.get("if_positive", "N/A")
+            neg = step.expected_impact.get("if_negative", "N/A")
+            lines.append(f"- If positive/abnormal: *{pos}*")
+            lines.append(f"- If negative/normal: *{neg}*")
+
+    lines.append("")
+    lines.append("**Information Metrics:**")
+    lines.append(f"- Entropy: **{step.entropy:.3f}** bits")
+    if step.information_gain:
+        lines.append(f"- Information Gain: **{step.information_gain:.3f}** bits")
+    if step.kl_divergence:
+        lines.append(f"- KL Divergence: **{step.kl_divergence:.3f}** bits")
+    lines.append(f"- Latency: {step.latency_ms:.0f}ms")
+    lines.append(f"- Cumulative Cost: ${cumulative_cost:,.0f}")
+    lines.append("")
+    lines.append("---")
+
+    return "\n".join(lines)
+
+
+def render_bar(value: float, width: int = 20) -> str:
+    """Render a text-based progress bar."""
+    filled = int(value * width)
+    return "`" + "\u2588" * filled + "\u2591" * (width - filled) + "`"
608
+
609
+
610
+ def format_entropy_table(trajectory: BeliefTrajectory) -> str:
611
+ """Format entropy trajectory as a markdown table."""
612
+ if not trajectory or not trajectory.states:
613
+ return "*No belief trajectory recorded.*"
614
+
615
+ lines = ["| Step | Channel | Entropy (bits) | Info Gain | Cumulative IG |"]
616
+ lines.append("|------|---------|---------------|-----------|---------------|")
617
+
618
+ cumulative_ig = 0.0
619
+ for i, state in enumerate(trajectory.states):
620
+ ch = state.channel_acquired or "initial/commit"
621
+ ig = 0.0
622
+ if i > 0:
623
+ ig = trajectory.states[i - 1].entropy - state.entropy
624
+ cumulative_ig += ig
625
+ lines.append(
626
+ f"| {i} | {ch} | {state.entropy:.3f} | "
627
+ f"{ig:+.3f} | {cumulative_ig:.3f} |"
628
+ )
629
+
630
+ lines.append("")
631
+ lines.append(f"**Information Efficiency:** {trajectory.information_efficiency:.1%}")
632
+ lines.append(f"**Total Information Gain:** {trajectory.total_information_gain:.3f} bits")
633
+
634
+ return "\n".join(lines)
635
+
636
+
637
+ def format_summary(result: AgentResult, case: MedicalCase) -> str:
638
+ """Format the overall result summary."""
639
+ lines = []
640
+ lines.append("## Summary")
641
+ lines.append("")
642
+
643
+ if result.final_ranking:
644
+ top = result.final_ranking[0]
645
+ top_name = top["name"].strip().lower()
646
+ gt_name = case.ground_truth.strip().lower()
647
+ # Fuzzy match: handle "Pulmonary embolism" vs "A. Pulmonary embolism"
648
+ correct = top_name == gt_name or top_name in gt_name or gt_name in top_name
649
+ icon = "correct" if correct else "incorrect"
650
+ lines.append(f"**Top Diagnosis:** {top['name']} ({top['confidence']:.1%})")
651
+ lines.append(f"**Ground Truth:** {case.ground_truth}")
652
+ lines.append(f"**Result:** {icon}")
653
+ else:
654
+ lines.append("*No diagnosis produced.*")
655
+
656
+ lines.append("")
657
+ lines.append(f"**Channels Acquired:** {len(result.acquired_channels)} / {len(case.requestable_channels)}")
658
+ if result.acquired_channels:
659
+ lines.append(f"**Acquisition Order:** {' -> '.join(result.acquired_channels)}")
660
+ lines.append(f"**Committed Early:** {'Yes' if result.committed_early else 'No'}")
661
+ lines.append(f"**Total Acquisition Cost:** ${result.acquisition_cost:,.0f}")
662
+ lines.append(f"**Total Case Cost:** ${result.total_case_cost:,.0f}")
663
+ lines.append(f"**Total Latency:** {result.total_latency_ms:,.0f}ms")
664
+ lines.append(f"**Tokens Used:** {result.total_input_tokens:,} in / {result.total_output_tokens:,} out")
665
+
666
+ return "\n".join(lines)
667
+
668
+
669
+ # ============================================================
670
+ # Main Agent Runner (for Gradio)
671
+ # ============================================================
672
+
673
+ def run_agent_on_case(
674
+ case: MedicalCase,
675
+ backend: str,
676
+ context_mode: str,
677
+ ) -> tuple[str, str, str]:
678
+ """
679
+ Run the agent on a case and return formatted markdown outputs.
680
+
681
+ Returns: (steps_markdown, entropy_table, summary_markdown)
682
+ """
683
+ if backend == "simulated (no API key)":
684
+ result = _simulate_agent_on_case(case)
685
+ model_name = "simulated"
686
+ else:
687
+ try:
688
+ client = create_client(backend)
689
+ except Exception as e:
690
+ return (
691
+ f"**Error creating {backend} client:** {e}\n\n"
692
+ "Make sure your API key is set in `.env` or environment variables. "
693
+ "Or select **simulated (no API key)** to see a demo trace.",
694
+ "", "",
695
+ )
696
+ agent = ActiveMedAgent(
697
+ client,
698
+ prompt_variant="A",
699
+ budget=None, # NO BUDGET CONSTRAINT
700
+ context_mode=context_mode if context_mode != "adaptive" else None,
701
+ )
702
+ try:
703
+ result = agent.diagnose(case)
704
+ except Exception as e:
705
+ return f"**Error running agent:** {e}", "", ""
706
+ model_name = client.model
707
+
708
+ # Format step-by-step reasoning
709
+ steps_parts = []
710
+ steps_parts.append("# Agent Reasoning Trace\n")
711
+ steps_parts.append(f"**Case:** {case.case_id} | **Dataset:** {case.dataset} | **Backend:** {model_name}\n")
712
+ steps_parts.append(f"**Candidates:** {', '.join(case.candidates)}\n")
713
+
714
+ initial_info = format_acquired_info(case.get_text_context([]))
715
+ steps_parts.append(f"**Initial Information:**\n{initial_info}\n")
716
+ steps_parts.append("---\n")
717
+
718
+ cumulative_cost = case.get_initial_cost()
719
+ for i, step in enumerate(result.steps):
720
+ if step.requested_channel:
721
+ cumulative_cost += case.get_channel_cost(step.requested_channel)
722
+ steps_parts.append(format_step_markdown(i, step, cumulative_cost))
723
+
724
+ steps_md = "\n".join(steps_parts)
725
+
726
+ # Format entropy trajectory
727
+ entropy_md = ""
728
+ if result.belief_trajectory:
729
+ entropy_md = format_entropy_table(result.belief_trajectory)
730
+
731
+ # Format summary
732
+ summary_md = format_summary(result, case)
733
+
734
+ return steps_md, entropy_md, summary_md
735
+
736
+
737
+ # ============================================================
738
+ # Gradio Event Handlers
739
+ # ============================================================
740
+
741
+ def on_demo_case_selected(case_name: str) -> tuple[str, str]:
742
+ """When a demo case is selected, show its description and candidates."""
743
+ if case_name in DEMO_CASES:
744
+ info = DEMO_CASES[case_name]
745
+ case = info["case"]()
746
+ desc = info["description"]
747
+ cands = "\n".join(case.candidates)
748
+ channels = []
749
+ for name, ch in case.requestable_channels.items():
750
+ channels.append(f"- **{name}** ({ch.tier}, ${ch.cost:,.0f}): {ch.description}")
751
+ ch_str = "\n".join(channels)
752
+ return (
753
+ f"{desc}\n\n**Available channels to acquire:**\n{ch_str}",
754
+ cands,
755
+ )
756
+ return "", ""
757
+
758
+
759
+ def run_demo_case(case_name: str, backend: str, context_mode: str):
760
+ """Run agent on a selected demo case."""
761
+ if case_name not in DEMO_CASES:
762
+ return "Please select a demo case.", "", ""
763
+
764
+ case = DEMO_CASES[case_name]["case"]()
765
+ return run_agent_on_case(case, backend, context_mode)
766
+
767
+
768
+ def run_custom_case(
769
+ scenario: str, candidates: str,
770
+ ch1_name: str, ch1_type: str, ch1_value: str,
771
+ ch2_name: str, ch2_type: str, ch2_value: str,
772
+ ch3_name: str, ch3_type: str, ch3_value: str,
773
+ uploaded_image,
774
+ backend: str, context_mode: str,
775
+ ):
776
+ """Run agent on a custom user-defined case."""
777
+ if not scenario.strip():
778
+ return "Please enter a clinical scenario.", "", ""
779
+
780
+ case = build_custom_case(
781
+ scenario, candidates,
782
+ ch1_name, ch1_type, ch1_value,
783
+ ch2_name, ch2_type, ch2_value,
784
+ ch3_name, ch3_type, ch3_value,
785
+ uploaded_image,
786
+ )
787
+ return run_agent_on_case(case, backend, context_mode)
788
+
789
+
790
+ # ============================================================
791
+ # Gradio UI
792
+ # ============================================================
793
+
794
+ def create_app():
795
+     with gr.Blocks(
+         title="ActiveMedAgent Interactive Demo",
+         # theme/css belong here on gr.Blocks(), not on launch()
+         theme=gr.themes.Soft(),
+         css=".reasoning-box { font-size: 14px; } .header-text { text-align: center; margin-bottom: 10px; }",
+     ) as app:
+         gr.Markdown(
+             """
+             # ActiveMedAgent: Learned Information Acquisition for Medical Diagnosis
+             **Interactive Demo** — Watch the agent reason step-by-step, acquire information channels,
+             and track entropy reduction. **No budget constraint** — the agent decides when to stop.
+             """,
+             elem_classes="header-text",
+         )
+
+         # Build backend choices: simulation always available, real backends if keys exist
+         backend_choices = ["simulated (no API key)"] + AVAILABLE_BACKENDS
+         default_backend = AVAILABLE_BACKENDS[0] if AVAILABLE_BACKENDS else "simulated (no API key)"
+
+         with gr.Row():
+             backend = gr.Dropdown(
+                 choices=backend_choices,
+                 value=default_backend,
+                 label="VLM Backend",
+                 info="Select 'simulated' to see the demo without API keys",
+                 scale=1,
+             )
+             context_mode = gr.Dropdown(
+                 choices=["adaptive", "full", "condensed"],
+                 value="adaptive",
+                 label="Context Mode",
+                 info="How the agent manages conversation history",
+                 scale=1,
+             )
+
+         with gr.Tabs():
+             # ---- Tab 1: Demo Cases ----
+             with gr.TabItem("Demo Cases"):
+                 gr.Markdown("Select a pre-built clinical scenario and run the agent.")
+                 with gr.Row():
+                     case_selector = gr.Dropdown(
+                         choices=list(DEMO_CASES.keys()),
+                         label="Select Case",
+                         scale=2,
+                     )
+                     run_demo_btn = gr.Button("Run Agent", variant="primary", scale=1)
+
+                 case_description = gr.Markdown(label="Case Description")
+                 case_candidates = gr.Textbox(label="Candidate Diagnoses", lines=3, interactive=False)
+
+                 case_selector.change(
+                     fn=on_demo_case_selected,
+                     inputs=[case_selector],
+                     outputs=[case_description, case_candidates],
+                 )
+
+                 with gr.Row():
+                     with gr.Column(scale=2):
+                         demo_steps = gr.Markdown(
+                             label="Reasoning Steps",
+                             elem_classes="reasoning-box",
+                         )
+                     with gr.Column(scale=1):
+                         demo_summary = gr.Markdown(label="Summary")
+                         demo_entropy = gr.Markdown(label="Entropy Trajectory")
+
+                 run_demo_btn.click(
+                     fn=run_demo_case,
+                     inputs=[case_selector, backend, context_mode],
+                     outputs=[demo_steps, demo_entropy, demo_summary],
+                 )
+
+             # ---- Tab 2: Custom Case ----
+             with gr.TabItem("Custom Case"):
+                 gr.Markdown(
+                     "Define your own clinical scenario, candidate diagnoses, "
+                     "and information channels the agent can request."
+                 )
+
+                 with gr.Row():
+                     with gr.Column():
+                         custom_scenario = gr.Textbox(
+                             label="Clinical Scenario",
+                             placeholder="A 35-year-old woman presents with...",
+                             lines=4,
+                         )
+                         custom_candidates = gr.Textbox(
+                             label="Candidate Diagnoses (one per line)",
+                             placeholder="A. Diagnosis one\nB. Diagnosis two\nC. Diagnosis three",
+                             lines=5,
+                         )
+                         custom_image = gr.Image(
+                             label="Upload Medical Image (optional)",
+                             type="numpy",
+                         )
+
+                     with gr.Column():
+                         gr.Markdown("### Requestable Information Channels")
+                         gr.Markdown("Define up to 3 channels the agent can request.")
+
+                         with gr.Group():
+                             gr.Markdown("**Channel 1:**")
+                             ch1_name = gr.Textbox(label="Name", value="Exam Findings", scale=1)
+                             ch1_type = gr.Dropdown(choices=["text", "image"], value="text", label="Type")
+                             ch1_value = gr.Textbox(label="Content (what the agent receives)", lines=2,
+                                                    placeholder="Physical exam: temperature 38.5C, ...")
+
+                         with gr.Group():
+                             gr.Markdown("**Channel 2:**")
+                             ch2_name = gr.Textbox(label="Name", value="Lab Results", scale=1)
+                             ch2_type = gr.Dropdown(choices=["text", "image"], value="text", label="Type")
+                             ch2_value = gr.Textbox(label="Content", lines=2,
+                                                    placeholder="WBC 12,000, CRP elevated, ...")
+
+                         with gr.Group():
+                             gr.Markdown("**Channel 3:**")
+                             ch3_name = gr.Textbox(label="Name", value="Imaging", scale=1)
+                             ch3_type = gr.Dropdown(choices=["text", "image"], value="text", label="Type")
+                             ch3_value = gr.Textbox(label="Content", lines=2,
+                                                    placeholder="CT scan shows...")
+
+                 run_custom_btn = gr.Button("Run Agent on Custom Case", variant="primary")
+
+                 with gr.Row():
+                     with gr.Column(scale=2):
+                         custom_steps = gr.Markdown(
+                             label="Reasoning Steps",
+                             elem_classes="reasoning-box",
+                         )
+                     with gr.Column(scale=1):
+                         custom_summary = gr.Markdown(label="Summary")
+                         custom_entropy = gr.Markdown(label="Entropy Trajectory")
+
+                 run_custom_btn.click(
+                     fn=run_custom_case,
+                     inputs=[
+                         custom_scenario, custom_candidates,
+                         ch1_name, ch1_type, ch1_value,
+                         ch2_name, ch2_type, ch2_value,
+                         ch3_name, ch3_type, ch3_value,
+                         custom_image,
+                         backend, context_mode,
+                     ],
+                     outputs=[custom_steps, custom_entropy, custom_summary],
+                 )
+
+             # ---- Tab 3: How It Works ----
+             with gr.TabItem("How It Works"):
+                 gr.Markdown("""
+                 ## ActiveMedAgent Architecture
+
+                 ### Tool-Use Acquisition Loop
+                 The agent uses native VLM function calling (not regex parsing) with two tools:
+                 1. **`request_information`** — Request one data channel, providing reasoning, current differential with calibrated probabilities, and expected impact
+                 2. **`commit_diagnosis`** — Submit final ranked diagnosis when confident
+
+                 ### No Budget Constraint
+                 The agent acquires as many channels as it needs (0 to all). It stops when:
+                 - It calls `commit_diagnosis` (self-determined confidence)
+                 - Information-theoretic stopping criteria trigger (convergence, confirmed dominance, or diminishing returns)
+                 - All channels are exhausted
+
+                 ### Information-Theoretic Metrics
+                 At each step, the system tracks:
+                 - **Shannon Entropy** H(p) — diagnostic uncertainty in bits
+                 - **Information Gain** — entropy reduction from each acquisition
+                 - **KL Divergence** — how much the belief distribution shifted
+                 - **Expected Information Gain (EIG)** — predicted value of the next channel
+                 - **Value of Information (VoI)** — whether continuing to acquire is worthwhile
+
+                 ### Context Management
+                 - **Full Mode**: Multi-turn conversation with complete history (for capable models)
+                 - **Condensed Mode**: Fresh single-turn call each step with compressed state log (for weaker models)
+                 - **Adaptive**: Auto-selects based on model capability
+
+                 ### Stopping Criteria
+                 1. **Convergence**: Last acquisition < 0.05 bits of IG
+                 2. **Confirmed Dominance**: Top diagnosis > 90% probability with > 40% gap (after 2+ acquisitions)
+                 3. **Diminishing Returns**: Last 2 acquisitions both < 0.1 bits IG
+                 """)
+
+     return app
+
+
+ # ============================================================
+ # Entry Point
+ # ============================================================
+
+ def main():
+     parser = argparse.ArgumentParser(description="ActiveMedAgent Interactive Demo")
+     parser.add_argument("--port", type=int, default=7860, help="Port to serve on")
+     parser.add_argument("--backend", default="openai", choices=["openai", "anthropic", "together"])
+     parser.add_argument("--share", action="store_true", help="Create a public Gradio link")
+     args = parser.parse_args()
+
+     app = create_app()
+     # Note: theme and css are gr.Blocks() arguments, not launch() arguments.
+     app.launch(
+         server_port=args.port,
+         share=args.share,
+     )
+
+
+ if __name__ == "__main__":
+     main()
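The entropy bookkeeping that `format_entropy_table` performs and that the "How It Works" tab describes can be sketched in plain Python. This is a minimal illustration only: the project's actual implementations live in `information_gain.py` (`compute_entropy`, `BeliefTrajectory`), and the function bodies below are assumptions about the standard definitions, not the repository's code.

```python
import math

def compute_entropy(probs):
    """Shannon entropy H(p) in bits over a diagnostic belief distribution."""
    return -sum(p * math.log2(p) for p in probs if p > 0)

def information_gain(prior, posterior):
    """Entropy reduction from one acquisition: H(prior) - H(posterior)."""
    return compute_entropy(prior) - compute_entropy(posterior)

def kl_divergence(posterior, prior):
    """KL(posterior || prior) in bits: how far the belief distribution shifted."""
    return sum(
        q * math.log2(q / p)
        for q, p in zip(posterior, prior)
        if q > 0 and p > 0
    )

# Four candidate diagnoses: a uniform prior is maximum uncertainty (exactly 2 bits).
prior = [0.25, 0.25, 0.25, 0.25]
# Belief after acquiring one discriminative channel (hypothetical values).
posterior = [0.70, 0.15, 0.10, 0.05]

print(f"H(prior)     = {compute_entropy(prior):.3f} bits")  # 2.000 bits
print(f"H(posterior) = {compute_entropy(posterior):.3f} bits")
print(f"info gain    = {information_gain(prior, posterior):.3f} bits")
print(f"KL shift     = {kl_divergence(posterior, prior):.3f} bits")
```

Under these definitions, the demo's "Convergence" stopping rule corresponds to the last acquisition's `information_gain` dropping below 0.05 bits.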
baselines.py ADDED
@@ -0,0 +1,694 @@
+ """
+ Additional Baselines for ACL/EMNLP Submission.
+
+ Five baselines that answer: "Does active sequential acquisition actually
+ help over simpler strategies?"
+
+ 1. AllAtOnce: Give the VLM all text channels upfront (no sequential reasoning)
+ 2. RandomOrder: Acquire channels in random order (same budget as active)
+ 3. ClinicalGuidelineOrder: Follow standard clinical workflow ordering
+ 4. ReactBaseline: Free-form ReAct-style reasoning (no structured tool calls)
+ 5. CoTSinglePass: Chain-of-thought with all info in one shot
+
+ All baselines use the same VLM and produce AgentResult objects for
+ direct comparison with the active agent.
+ """
+ import json
+ import logging
+ import random
+ import re
+ import time
+ from dataclasses import field
+
+ import numpy as np
+
+ import config
+ from api_client import BaseVLMClient, VLMResponse
+ from agent import (
+     ActiveMedAgent, AgentResult, AcquisitionStep,
+     SYSTEM_PROMPT_FULL, SYSTEM_PROMPT_FINAL,
+ )
+ from datasets.base import MedicalCase, ChannelData
+ from tools import ToolCall, constrain_tools_for_step
+ from information_gain import BeliefState, BeliefTrajectory, compute_entropy
+ from prompts import format_acquired_info
+
+ logger = logging.getLogger(__name__)
+
+
+ # ================================================================
+ # Clinical Guideline Orderings
+ # ================================================================
+
+ CLINICAL_GUIDELINE_ORDER = {
+     "nejm": [
+         "demographics",
+         "chief_complaint",
+         "medical_history",
+         "exam_findings",
+         "investigations",
+         "image",
+     ],
+     "midas": [
+         "patient_demographics",
+         "lesion_metadata",
+         "clinical_15cm",
+         "dermoscopy",
+     ],
+     "olives": [
+         "clinical_measurements",
+         "biomarker_hints",
+         "oct_scan",
+         "additional_oct",
+     ],
+ }
+
+
+ # ================================================================
+ # Baseline 1: All-At-Once
+ # ================================================================
+
+ class AllAtOnceBaseline:
+     """
+     Give the VLM all available text/image channels at once.
+
+     Tests whether sequential reasoning matters or if the VLM can
+     handle everything in a single pass with all evidence.
+     Different from Oracle: Oracle uses the experiment evaluation
+     framework; this uses the same prompt structure as the active agent.
+     """
+
+     def __init__(self, client: BaseVLMClient, prompt_variant: str = "A"):
+         self.client = client
+         self.prompt_variant = prompt_variant
+
+     def diagnose(self, case: MedicalCase) -> AgentResult:
+         all_channels = list(case.requestable_channels.keys())
+
+         result = AgentResult(
+             case_id=case.case_id,
+             dataset=case.dataset,
+             prompt_variant=self.prompt_variant,
+             backend=self.client.model,
+             budget=len(all_channels),
+             acquired_channels=all_channels,
+         )
+
+         images = case.get_all_images_up_to(all_channels)
+         text_context = case.get_text_context(all_channels)
+         acquired_str = format_acquired_info(text_context)
+         candidates_str = "\n".join(
+             f"  {i + 1}. {c}" for i, c in enumerate(case.candidates)
+         )
+
+         system_prompt = (
+             "You are a medical diagnostic agent. You are given ALL available "
+             "clinical information at once. Analyze everything and provide your "
+             "final ranked diagnosis.\n\n"
+             "You MUST use the commit_diagnosis tool to submit your answer.\n"
+             "Include ALL candidate diagnoses with calibrated probabilities "
+             "summing to 1.0 and key_evidence for each."
+         )
+
+         user_text = (
+             f"All available clinical information:\n{acquired_str}\n\n"
+             f"Candidate diagnoses (rank ALL):\n{candidates_str}\n\n"
+             f"Analyze all information and submit your final diagnosis "
+             f"using commit_diagnosis."
+         )
+
+         commit_tools = constrain_tools_for_step(budget_remaining=0)
+
+         t0 = time.time()
+         response = self.client.call_with_retry(
+             system_prompt=system_prompt,
+             user_text=user_text,
+             images=images,
+             temperature=config.TEMPERATURE,
+             max_tokens=config.MAX_TOKENS,
+             tools=commit_tools,
+         )
+
+         result.total_latency_ms = response.latency_ms
+         result.total_input_tokens = response.input_tokens
+         result.total_output_tokens = response.output_tokens
+
+         if response.tool_call and response.tool_call.tool_name == "commit_diagnosis":
+             args = response.tool_call.arguments
+             ranked = args.get("ranked_diagnoses", [])
+             ranking = []
+             for i, entry in enumerate(ranked):
+                 ranking.append({
+                     "name": entry.get("name", ""),
+                     "confidence": entry.get("confidence", 0.0),
+                     "rank": i + 1,
+                     "key_evidence": entry.get("key_evidence", ""),
+                 })
+             ranking.sort(key=lambda x: x["confidence"], reverse=True)
+             for i, entry in enumerate(ranking):
+                 entry["rank"] = i + 1
+             result.final_ranking = ranking
+         else:
+             result.final_ranking = _extract_ranking_from_text(
+                 response.text, case.candidates
+             )
+
+         result.final_raw_response = response.text
+         result.acquisition_cost = case.get_acquisition_cost(all_channels)
+         result.total_case_cost = case.get_total_cost(all_channels)
+         return result
+
+
+ # ================================================================
+ # Baseline 2: Random Order Acquisition
+ # ================================================================
+
+ class RandomOrderBaseline:
+     """
+     Acquire channels in random order, then diagnose.
+
+     Uses the same active agent architecture but overrides channel
+     selection with random choice. This isolates the value of
+     strategic ordering from the value of having more information.
+     """
+
+     def __init__(
+         self,
+         client: BaseVLMClient,
+         prompt_variant: str = "A",
+         budget: int = None,
+         n_trials: int = 3,
+         seed: int = 42,
+     ):
+         self.client = client
+         self.prompt_variant = prompt_variant
+         self.budget = budget
+         self.n_trials = n_trials
+         self.seed = seed
+
+     def diagnose(self, case: MedicalCase) -> AgentResult:
+         """Run with random order. If n_trials > 1, returns best trial."""
+         rng = random.Random(self.seed + hash(case.case_id))
+         requestable = list(case.requestable_channels.keys())
+         max_acq = self.budget if self.budget is not None else len(requestable)
+
+         best_result = None
+         best_mrr = -1
+
+         for trial in range(self.n_trials):
+             order = list(requestable)
+             rng.shuffle(order)
+             acquired = order[:max_acq]
+
+             agent = ActiveMedAgent(
+                 self.client, self.prompt_variant, budget=0,
+             )
+             result = AgentResult(
+                 case_id=case.case_id,
+                 dataset=case.dataset,
+                 prompt_variant=self.prompt_variant,
+                 backend=self.client.model,
+                 budget=max_acq,
+                 acquired_channels=acquired,
+             )
+
+             final_ranking, resp = agent.get_diagnosis_at_state(case, acquired)
+             result.final_ranking = final_ranking
+             result.final_raw_response = resp.text
+             result.total_latency_ms = resp.latency_ms
+             result.total_input_tokens = resp.input_tokens
+             result.total_output_tokens = resp.output_tokens
+             result.acquisition_cost = case.get_acquisition_cost(acquired)
+             result.total_case_cost = case.get_total_cost(acquired)
+
+             if self.n_trials == 1:
+                 return result
+
+             # Pick the trial with highest top-1 confidence (proxy for quality)
+             top_conf = final_ranking[0]["confidence"] if final_ranking else 0
+             if top_conf > best_mrr:
+                 best_mrr = top_conf
+                 best_result = result
+
+         return best_result
+
+     def diagnose_single_random(
+         self, case: MedicalCase, seed: int = None
+     ) -> AgentResult:
+         """Single random trial (for aggregate statistics)."""
+         rng = random.Random(seed or self.seed)
+         requestable = list(case.requestable_channels.keys())
+         max_acq = self.budget if self.budget is not None else len(requestable)
+         order = list(requestable)
+         rng.shuffle(order)
+         acquired = order[:max_acq]
+
+         agent = ActiveMedAgent(
+             self.client, self.prompt_variant, budget=0,
+         )
+         result = AgentResult(
+             case_id=case.case_id,
+             dataset=case.dataset,
+             prompt_variant=self.prompt_variant,
+             backend=self.client.model,
+             budget=max_acq,
+             acquired_channels=acquired,
+         )
+
+         final_ranking, resp = agent.get_diagnosis_at_state(case, acquired)
+         result.final_ranking = final_ranking
+         result.final_raw_response = resp.text
+         result.total_latency_ms = resp.latency_ms
+         result.total_input_tokens = resp.input_tokens
+         result.total_output_tokens = resp.output_tokens
+         result.acquisition_cost = case.get_acquisition_cost(acquired)
+         result.total_case_cost = case.get_total_cost(acquired)
+         return result
+
+
+ # ================================================================
+ # Baseline 3: Clinical Guideline Order
+ # ================================================================
+
+ class ClinicalGuidelineBaseline:
+     """
+     Acquire channels in standard clinical workflow order.
+
+     Tests whether the VLM's learned ordering improves over the
+     conventional clinical approach (history -> exam -> labs -> imaging).
+     """
+
+     def __init__(
+         self,
+         client: BaseVLMClient,
+         prompt_variant: str = "A",
+         budget: int = None,
+     ):
+         self.client = client
+         self.prompt_variant = prompt_variant
+         self.budget = budget
+
+     def diagnose(self, case: MedicalCase) -> AgentResult:
+         guideline_order = CLINICAL_GUIDELINE_ORDER.get(case.dataset, [])
+
+         # Filter to channels actually available in this case
+         available = set(case.requestable_channels.keys())
+         order = [ch for ch in guideline_order if ch in available]
+         # Append any remaining channels not in the guideline
+         for ch in case.requestable_channels.keys():
+             if ch not in order:
+                 order.append(ch)
+
+         max_acq = self.budget if self.budget is not None else len(order)
+         acquired = order[:max_acq]
+
+         agent = ActiveMedAgent(
+             self.client, self.prompt_variant, budget=0,
+         )
+         result = AgentResult(
+             case_id=case.case_id,
+             dataset=case.dataset,
+             prompt_variant=self.prompt_variant,
+             backend=self.client.model,
+             budget=max_acq,
+             acquired_channels=acquired,
+         )
+
+         final_ranking, resp = agent.get_diagnosis_at_state(case, acquired)
+         result.final_ranking = final_ranking
+         result.final_raw_response = resp.text
+         result.total_latency_ms = resp.latency_ms
+         result.total_input_tokens = resp.input_tokens
+         result.total_output_tokens = resp.output_tokens
+         result.acquisition_cost = case.get_acquisition_cost(acquired)
+         result.total_case_cost = case.get_total_cost(acquired)
+         return result
+
+
+ # ================================================================
+ # Baseline 4: ReAct-Style Free-Form Reasoning
+ # ================================================================
+
+ class ReactBaseline:
+     """
+     ReAct-style baseline: the VLM reasons in free text and requests
+     channels via text (not structured tool calls).
+
+     Tests whether the structured tool-use architecture improves over
+     free-form reasoning + regex parsing (the dominant approach in
+     prior medical agent work).
+     """
+
+     def __init__(
+         self,
+         client: BaseVLMClient,
+         prompt_variant: str = "A",
+         budget: int = None,
+     ):
+         self.client = client
+         self.prompt_variant = prompt_variant
+         self.budget = budget
+
+     def diagnose(self, case: MedicalCase) -> AgentResult:
+         max_steps = len(case.requestable_names)
+         if self.budget is not None:
+             max_steps = min(max_steps, self.budget)
+
+         result = AgentResult(
+             case_id=case.case_id,
+             dataset=case.dataset,
+             prompt_variant=self.prompt_variant,
+             backend=self.client.model,
+             budget=max_steps,
+         )
+
+         acquired = []
+         dataset_channel_config = config.CHANNEL_CONFIGS.get(case.dataset, {})
+
+         system_prompt = (
+             "You are a medical diagnostic agent using a Thought-Action-Observation loop.\n\n"
+             "At each step:\n"
+             "1. THOUGHT: Reason about what you know and what you're uncertain about\n"
+             "2. ACTION: Either REQUEST[channel_name] to get more info, or "
+             "COMMIT[diagnosis1 > diagnosis2 > ...] to submit your final ranking\n"
+             "3. You will receive an OBSERVATION with the requested data\n\n"
+             "Be strategic about which information to request. Stop when additional "
+             "information is unlikely to change your diagnosis.\n\n"
+             "Format your response EXACTLY as:\n"
+             "THOUGHT: ...\n"
+             "ACTION: REQUEST[channel_name] or COMMIT[ranked diagnoses with probabilities]"
+         )
+
+         initial_context = format_acquired_info(case.get_text_context([]))
+         candidates_str = "\n".join(
+             f"  {i + 1}. {c}" for i, c in enumerate(case.candidates)
+         )
+
+         # Build channel descriptions
+         channel_desc_lines = []
+         for name, ch in case.requestable_channels.items():
+             channel_desc_lines.append(
+                 f"  - {name}: {ch.description} (cost: ${ch.cost:,.0f})"
+             )
+         channel_desc = "\n".join(channel_desc_lines)
+
+         conversation_text = (
+             f"Initial information:\n{initial_context}\n\n"
+             f"Candidate diagnoses:\n{candidates_str}\n\n"
+             f"Available channels:\n{channel_desc}\n"
+         )
+
+         images = case.get_initial_images()
+
+         for step_idx in range(max_steps):
+             available = [n for n in case.requestable_names if n not in acquired]
+             if not available:
+                 break
+
+             user_text = conversation_text
+             if acquired:
+                 acq_context = format_acquired_info(case.get_text_context(acquired))
+                 user_text += (
+                     f"\n\nAcquired information so far:\n{acq_context}\n\n"
+                     f"Remaining channels: {', '.join(available)}\n"
+                 )
+
+             response = self.client.call_with_retry(
+                 system_prompt=system_prompt,
+                 user_text=user_text,
+                 images=images,
+                 temperature=config.TEMPERATURE,
+                 max_tokens=config.MAX_TOKENS,
+             )
+
+             result.total_latency_ms += response.latency_ms
+             result.total_input_tokens += response.input_tokens
+             result.total_output_tokens += response.output_tokens
+
+             text = response.text
+
+             # Parse COMMIT
+             commit_match = re.search(
+                 r"COMMIT\[(.+?)\]", text, re.DOTALL
+             )
+             if commit_match:
+                 result.committed_early = True
+                 result.final_ranking = self._parse_commit_text(
+                     commit_match.group(1), case.candidates
+                 )
+                 result.final_raw_response = text
+
+                 step = AcquisitionStep(
+                     step=step_idx,
+                     tool_call=None,
+                     requested_channel=None,
+                     reasoning=_extract_thought(text),
+                     differential=result.final_ranking,
+                     committed=True,
+                     raw_response=text,
+                     latency_ms=response.latency_ms,
+                 )
+                 result.steps.append(step)
+                 break
+
+             # Parse REQUEST
+             request_match = re.search(
+                 r"REQUEST\[(\w+)\]", text, re.IGNORECASE
+             )
+             if request_match:
+                 requested = request_match.group(1).strip().lower()
+                 matched = _match_channel_name(requested, available)
+                 if matched is None:
+                     matched = available[0]
+
+                 acquired.append(matched)
+                 result.acquired_channels.append(matched)
+
+                 # Add new images if the channel is an image
+                 ch = case.get_channel(matched)
+                 if ch and ch.channel_type == "image" and ch.value:
+                     if isinstance(ch.value, list):
+                         images.extend(ch.value)
+                     else:
+                         images.append(ch.value)
+
+                 step = AcquisitionStep(
+                     step=step_idx,
+                     tool_call=None,
+                     requested_channel=matched,
+                     reasoning=_extract_thought(text),
+                     differential=[],
+                     committed=False,
+                     raw_response=text,
+                     latency_ms=response.latency_ms,
+                 )
485
+ result.steps.append(step)
486
+ else:
487
+ # No parseable action — fallback to first available
488
+ matched = available[0]
489
+ acquired.append(matched)
490
+ result.acquired_channels.append(matched)
491
+
492
+ step = AcquisitionStep(
493
+ step=step_idx,
494
+ tool_call=None,
495
+ requested_channel=matched,
496
+ reasoning=f"(unparseable response, fallback to {matched})",
497
+ differential=[],
498
+ committed=False,
499
+ raw_response=text,
500
+ latency_ms=response.latency_ms,
501
+ )
502
+ result.steps.append(step)
503
+
504
+ # Final diagnosis if not committed
505
+ if not result.committed_early or not result.final_ranking:
506
+ agent = ActiveMedAgent(self.client, self.prompt_variant, budget=0)
507
+ final_ranking, resp = agent.get_diagnosis_at_state(case, acquired)
508
+ result.final_ranking = final_ranking
509
+ result.final_raw_response = resp.text
510
+ result.total_latency_ms += resp.latency_ms
511
+ result.total_input_tokens += resp.input_tokens
512
+ result.total_output_tokens += resp.output_tokens
513
+
514
+ result.acquired_channels = acquired
515
+ result.acquisition_cost = case.get_acquisition_cost(acquired)
516
+ result.total_case_cost = case.get_total_cost(acquired)
517
+ return result
518
+
519
+ def _parse_commit_text(
520
+ self, commit_str: str, candidates: list[str]
521
+ ) -> list[dict]:
522
+ """Parse a COMMIT[...] string into a ranking."""
523
+ ranking = []
524
+ # Try "Diagnosis (0.XX)" pattern
525
+ pattern = r"([^>,(]+?)\s*\(?([\d.]+)\)?"
526
+ parts = re.split(r"\s*>\s*", commit_str)
527
+ for i, part in enumerate(parts):
528
+ match = re.match(pattern, part.strip())
529
+ if match:
530
+ name = match.group(1).strip()
531
+ try:
532
+ conf = float(match.group(2))
533
+ except (ValueError, IndexError):
534
+ conf = max(0.1, 1.0 - i * 0.2)
535
+ ranking.append({
536
+ "name": name,
537
+ "confidence": conf,
538
+ "rank": i + 1,
539
+ })
540
+
541
+ if not ranking:
542
+ ranking = _extract_ranking_from_text(commit_str, candidates)
543
+
544
+ ranking.sort(key=lambda x: x.get("confidence", 0), reverse=True)
545
+ for i, entry in enumerate(ranking):
546
+ entry["rank"] = i + 1
547
+ return ranking
548
+
549
+
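The COMMIT payload parsing above can be exercised on its own. A minimal standalone sketch (the `parse_commit` name and the toy diagnoses are hypothetical; it mirrors the split-on-`>` logic of `_parse_commit_text` without the candidate fallback):

```python
import re

# Simplified COMMIT[...] parsing: split on ">", pull the "(0.XX)" confidence,
# fall back to a rank-decayed default if the float conversion fails.
# Parts with no number at all fail the match and are skipped, as in the original.
def parse_commit(commit_str: str) -> list[dict]:
    ranking = []
    for i, part in enumerate(re.split(r"\s*>\s*", commit_str)):
        m = re.match(r"([^>,(]+?)\s*\(?([\d.]+)\)?", part.strip())
        if not m:
            continue
        try:
            conf = float(m.group(2))
        except ValueError:
            conf = max(0.1, 1.0 - i * 0.2)
        ranking.append({"name": m.group(1).strip(), "confidence": conf, "rank": i + 1})
    return ranking

ranked = parse_commit("Melanoma (0.7) > Nevus (0.2) > Seborrheic keratosis (0.1)")
```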
550
+ # ================================================================
+ # Baseline 5: Chain-of-Thought Single Pass
+ # ================================================================
+
+ class CoTSinglePassBaseline:
+     """
+     Standard chain-of-thought: give the VLM all available info and
+     ask it to reason step by step in a single pass.
+
+     No multi-turn reasoning, no tool use, no acquisition decisions.
+     Just: "Here's everything, think step by step, give me your answer."
+     """
+
+     def __init__(self, client: BaseVLMClient, prompt_variant: str = "A"):
+         self.client = client
+         self.prompt_variant = prompt_variant
+
+     def diagnose(self, case: MedicalCase) -> AgentResult:
+         all_channels = list(case.requestable_channels.keys())
+
+         result = AgentResult(
+             case_id=case.case_id,
+             dataset=case.dataset,
+             prompt_variant=self.prompt_variant,
+             backend=self.client.model,
+             budget=len(all_channels),
+             acquired_channels=all_channels,
+         )
+
+         images = case.get_all_images_up_to(all_channels)
+         text_context = case.get_text_context(all_channels)
+         acquired_str = format_acquired_info(text_context)
+         candidates_str = "\n".join(
+             f" {i + 1}. {c}" for i, c in enumerate(case.candidates)
+         )
+
+         system_prompt = (
+             "You are a medical diagnostic expert. Analyze the following "
+             "clinical information and provide your diagnosis.\n\n"
+             "Think step by step:\n"
+             "1. Summarize the key findings\n"
+             "2. Consider each candidate diagnosis\n"
+             "3. Identify supporting and refuting evidence for each\n"
+             "4. Rank all candidates with calibrated probabilities (0-1, sum to 1)\n\n"
+             "Format your final answer as:\n"
+             "RANKING:\n"
+             "1. DiagnosisName (confidence: X.XX) - key evidence\n"
+             "2. DiagnosisName (confidence: X.XX) - key evidence\n"
+             "..."
+         )
+
+         user_text = (
+             f"Clinical information:\n{acquired_str}\n\n"
+             f"Candidate diagnoses:\n{candidates_str}\n\n"
+             f"Think step by step and provide your ranked diagnosis."
+         )
+
+         response = self.client.call_with_retry(
+             system_prompt=system_prompt,
+             user_text=user_text,
+             images=images,
+             temperature=config.TEMPERATURE,
+             max_tokens=config.MAX_TOKENS,
+         )
+
+         result.total_latency_ms = response.latency_ms
+         result.total_input_tokens = response.input_tokens
+         result.total_output_tokens = response.output_tokens
+         result.final_raw_response = response.text
+         result.final_ranking = _extract_ranking_from_text(
+             response.text, case.candidates
+         )
+         result.acquisition_cost = case.get_acquisition_cost(all_channels)
+         result.total_case_cost = case.get_total_cost(all_channels)
+         return result
+
+
+ # ================================================================
+ # Helpers
+ # ================================================================
+
+ def _extract_thought(text: str) -> str:
+     """Extract the THOUGHT section from a ReAct-style response."""
+     match = re.search(r"THOUGHT:\s*(.+?)(?=ACTION:|$)", text, re.DOTALL)
+     if match:
+         return match.group(1).strip()[:500]
+     return text[:200]
+
+
+ def _match_channel_name(requested: str, available: list[str]) -> str | None:
+     """Fuzzy match a requested channel name against the available channels."""
+     requested = requested.lower().strip().replace(" ", "_")
+     if requested in available:
+         return requested
+     for ch in available:
+         if requested in ch or ch in requested:
+             return ch
+     return None
+
650
+ def _extract_ranking_from_text(
+     text: str, candidates: list[str]
+ ) -> list[dict]:
+     """Extract a ranking from a free-form text response."""
+     ranking = []
+     pattern = (
+         r"(\d+)\.\s*(.+?)\s*"
+         r"\((?:confidence|probability|prob|conf):\s*([\d.]+)\)"
+     )
+     matches = re.findall(pattern, text, re.IGNORECASE)
+     if matches:
+         for rank_str, name, conf_str in matches:
+             try:
+                 ranking.append({
+                     "name": name.strip(),
+                     "confidence": float(conf_str),
+                     "rank": int(rank_str),
+                 })
+             except ValueError:
+                 continue
+     if not ranking and candidates:
+         # Fallback: any candidate mentioned in the text, with rank-decayed confidence
+         for i, candidate in enumerate(candidates):
+             if candidate.lower() in text.lower():
+                 ranking.append({
+                     "name": candidate,
+                     "confidence": max(0.1, 1.0 - i * 0.2),
+                     "rank": len(ranking) + 1,
+                 })
+     ranking.sort(key=lambda x: x.get("confidence", 0), reverse=True)
+     for i, entry in enumerate(ranking):
+         entry["rank"] = i + 1
+     return ranking
+
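The regex above targets the `RANKING:` format requested in the single-pass prompt. A quick standalone check of what it captures (toy text with hypothetical diagnoses, not model output):

```python
import re

# The expected line shape: "N. Name (confidence: X.XX) - evidence"
text = """RANKING:
1. Pulmonary embolism (confidence: 0.62) - acute dyspnea, elevated D-dimer
2. Pneumonia (confidence: 0.28) - fever, focal crackles
3. Heart failure (confidence: 0.10) - no peripheral edema"""

pattern = (
    r"(\d+)\.\s*(.+?)\s*"
    r"\((?:confidence|probability|prob|conf):\s*([\d.]+)\)"
)
# Each match is a (rank, name, confidence) string triple.
matches = re.findall(pattern, text, re.IGNORECASE)
```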
684
+ # ================================================================
+ # Registry
+ # ================================================================
+
+ BASELINE_REGISTRY = {
+     "all_at_once": AllAtOnceBaseline,
+     "random_order": RandomOrderBaseline,
+     "clinical_guideline": ClinicalGuidelineBaseline,
+     "react": ReactBaseline,
+     "cot_single_pass": CoTSinglePassBaseline,
+ }
calibration.py ADDED
@@ -0,0 +1,519 @@
+ """
+ Calibration Analysis for ActiveMedAgent.
+
+ Measures whether the VLM's reported probabilities match empirical
+ accuracy. Key analyses for the ACL/EMNLP submission:
+
+ 1. Reliability Diagram: binned confidence vs accuracy
+ 2. Expected Calibration Error (ECE): scalar miscalibration summary
+ 3. Temperature Scaling: post-hoc recalibration on a held-out set
+ 4. Robustness to Miscalibration: does the method work with noisy probabilities?
+ 5. Per-Step Calibration: is calibration better or worse at different steps?
+ """
+ import json
+ import logging
+ import math
+ from dataclasses import dataclass, field
+ from pathlib import Path
+
+ import numpy as np
+ from scipy.optimize import minimize_scalar
+
+ from agent import AgentResult, AcquisitionStep
+ from datasets.base import MedicalCase
+ from evaluation import evaluate_single_case, CaseMetrics
+
+ logger = logging.getLogger(__name__)
+
+
+ # ================================================================
+ # Core Calibration Metrics
+ # ================================================================
+
+ @dataclass
+ class CalibrationBin:
+     """A single bin in a reliability diagram."""
+     bin_lower: float
+     bin_upper: float
+     bin_center: float
+     avg_confidence: float
+     avg_accuracy: float
+     count: int
+     gap: float  # |avg_confidence - avg_accuracy|
+
+
+ @dataclass
+ class CalibrationResult:
+     """Full calibration analysis for a set of predictions."""
+     ece: float  # Expected Calibration Error
+     mce: float  # Maximum Calibration Error
+     ace: float  # Average Calibration Error
+     bins: list[CalibrationBin]
+     n_predictions: int
+     mean_confidence: float
+     mean_accuracy: float
+     overconfidence_ratio: float  # Fraction of bins where conf > acc
+     brier_score: float  # Brier score (MSE of probabilities)
+
+
+ def compute_calibration(
+     confidences: list[float],
+     correctness: list[bool],
+     n_bins: int = 10,
+ ) -> CalibrationResult:
+     """
+     Compute calibration metrics from confidence-correctness pairs.
+
+     Args:
+         confidences: Model's stated probability for its top prediction
+         correctness: Whether the top prediction was correct
+         n_bins: Number of bins for the reliability diagram
+
+     Returns:
+         CalibrationResult with ECE, MCE, bins, etc.
+     """
+     if not confidences:
+         return CalibrationResult(
+             ece=0, mce=0, ace=0, bins=[], n_predictions=0,
+             mean_confidence=0, mean_accuracy=0,
+             overconfidence_ratio=0, brier_score=0,
+         )
+
+     confs = np.array(confidences, dtype=np.float64)
+     accs = np.array(correctness, dtype=np.float64)
+     n = len(confs)
+
+     bin_boundaries = np.linspace(0.0, 1.0, n_bins + 1)
+     bins = []
+     ece = 0.0
+     mce = 0.0
+     overconf_count = 0
+
+     for i in range(n_bins):
+         lower = bin_boundaries[i]
+         upper = bin_boundaries[i + 1]
+         # Half-open bins (lower, upper]; include 0.0 in the first bin so a
+         # zero-confidence prediction is not silently dropped.
+         mask = (confs > lower) & (confs <= upper)
+         if i == 0:
+             mask |= confs == lower
+         count = mask.sum()
+
+         if count == 0:
+             bins.append(CalibrationBin(
+                 bin_lower=lower, bin_upper=upper,
+                 bin_center=(lower + upper) / 2,
+                 avg_confidence=0, avg_accuracy=0,
+                 count=0, gap=0,
+             ))
+             continue
+
+         avg_conf = confs[mask].mean()
+         avg_acc = accs[mask].mean()
+         gap = abs(avg_conf - avg_acc)
+
+         ece += (count / n) * gap
+         mce = max(mce, gap)
+
+         if avg_conf > avg_acc:
+             overconf_count += 1
+
+         bins.append(CalibrationBin(
+             bin_lower=lower, bin_upper=upper,
+             bin_center=(lower + upper) / 2,
+             avg_confidence=float(avg_conf),
+             avg_accuracy=float(avg_acc),
+             count=int(count),
+             gap=float(gap),
+         ))
+
+     non_empty_bins = [b for b in bins if b.count > 0]
+     ace = np.mean([b.gap for b in non_empty_bins]) if non_empty_bins else 0.0
+
+     # Brier score
+     brier = np.mean((confs - accs) ** 2)
+
+     return CalibrationResult(
+         ece=float(ece),
+         mce=float(mce),
+         ace=float(ace),
+         bins=bins,
+         n_predictions=n,
+         mean_confidence=float(confs.mean()),
+         mean_accuracy=float(accs.mean()),
+         overconfidence_ratio=overconf_count / len(non_empty_bins) if non_empty_bins else 0,
+         brier_score=float(brier),
+     )
+
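As a sanity check on the binning logic, the ECE computation reduces to a short loop over half-open confidence bins. A toy example on synthetic predictions (the values are illustrative, not experimental data):

```python
import numpy as np

# Five synthetic (confidence, correct) pairs and a 10-bin ECE.
confs = np.array([0.9, 0.8, 0.7, 0.6, 0.55])
correct = np.array([1, 1, 0, 1, 0], dtype=float)

n_bins, n = 10, len(confs)
edges = np.linspace(0.0, 1.0, n_bins + 1)
ece = 0.0
for lo, hi in zip(edges[:-1], edges[1:]):
    mask = (confs > lo) & (confs <= hi)
    if mask.sum():
        # Bin weight times |mean confidence - mean accuracy|
        ece += (mask.sum() / n) * abs(confs[mask].mean() - correct[mask].mean())
# ece == 0.4*0.075 + 0.2*0.7 + 0.2*0.2 + 0.2*0.1 = 0.23
```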
145
+ # ================================================================
+ # Extract Predictions from Agent Results
+ # ================================================================
+
+ def extract_predictions(
+     results: list[AgentResult],
+     cases: list[MedicalCase],
+ ) -> tuple[list[float], list[bool]]:
+     """
+     Extract (confidence, correctness) pairs from agent results.
+
+     Returns:
+         confidences: top-1 stated probability
+         correctness: whether top-1 matches ground truth
+     """
+     confidences = []
+     correctness = []
+
+     for result, case in zip(results, cases):
+         if not result.final_ranking:
+             continue
+
+         top = result.final_ranking[0]
+         conf = top.get("confidence", 0.0)
+         name = top.get("name", "").strip().lower()
+         gt = case.ground_truth.strip().lower()
+
+         correct = name == gt or name in gt or gt in name
+
+         confidences.append(conf)
+         correctness.append(correct)
+
+     return confidences, correctness
+
+
+ def extract_per_step_predictions(
+     results: list[AgentResult],
+     cases: list[MedicalCase],
+ ) -> dict[int, tuple[list[float], list[bool]]]:
+     """
+     Extract predictions at each acquisition step.
+
+     Returns:
+         {step_idx: (confidences, correctness)}
+     """
+     step_data: dict[int, tuple[list, list]] = {}
+
+     for result, case in zip(results, cases):
+         gt = case.ground_truth.strip().lower()
+
+         for step in result.steps:
+             if not step.differential:
+                 continue
+
+             idx = step.step
+             if idx not in step_data:
+                 step_data[idx] = ([], [])
+
+             top = max(step.differential, key=lambda d: d.get("confidence", 0))
+             conf = top.get("confidence", 0.0)
+             name = top.get("name", "").strip().lower()
+             correct = name == gt or name in gt or gt in name
+
+             step_data[idx][0].append(conf)
+             step_data[idx][1].append(correct)
+
+     return step_data
+
214
+ # ================================================================
+ # Temperature Scaling
+ # ================================================================
+
+ def temperature_scale(
+     confidences: list[float],
+     correctness: list[bool],
+     candidates_per_case: list[int] | None = None,
+ ) -> tuple[float, float]:
+     """
+     Find the temperature T that minimizes ECE on held-out data.
+
+     Temperature scaling: p_calibrated = softmax(logit(p) / T).
+     For a single top-1 probability we use the binary simplification:
+         logit = log(p / (1 - p))
+         scaled_logit = logit / T
+         p_scaled = sigmoid(scaled_logit)
+
+     Args:
+         confidences: Raw model confidences
+         correctness: Whether predictions were correct
+         candidates_per_case: Number of candidates per case (for proper scaling)
+
+     Returns:
+         (optimal_temperature, calibrated_ece)
+     """
+     confs = np.array(confidences, dtype=np.float64)
+     accs = np.array(correctness, dtype=np.float64)
+
+     # Clip to avoid log(0)
+     confs = np.clip(confs, 1e-6, 1 - 1e-6)
+     logits = np.log(confs / (1 - confs))
+
+     def ece_at_temperature(T):
+         scaled_logits = logits / T
+         scaled_confs = 1.0 / (1.0 + np.exp(-scaled_logits))
+         # Compute binned ECE at this temperature
+         n_bins = 10
+         bins = np.linspace(0, 1, n_bins + 1)
+         ece = 0.0
+         n = len(scaled_confs)
+         for i in range(n_bins):
+             mask = (scaled_confs > bins[i]) & (scaled_confs <= bins[i + 1])
+             if mask.sum() == 0:
+                 continue
+             bin_conf = scaled_confs[mask].mean()
+             bin_acc = accs[mask].mean()
+             ece += (mask.sum() / n) * abs(bin_conf - bin_acc)
+         return ece
+
+     result = minimize_scalar(
+         ece_at_temperature,
+         bounds=(0.1, 10.0),
+         method="bounded",
+     )
+
+     optimal_T = result.x
+     calibrated_ece = ece_at_temperature(optimal_T)
+
+     return float(optimal_T), float(calibrated_ece)
+
+
+ def apply_temperature(
+     confidences: list[float], temperature: float
+ ) -> list[float]:
+     """Apply temperature scaling to a list of confidences."""
+     confs = np.array(confidences, dtype=np.float64)
+     confs = np.clip(confs, 1e-6, 1 - 1e-6)
+     logits = np.log(confs / (1 - confs))
+     scaled_logits = logits / temperature
+     scaled_confs = 1.0 / (1.0 + np.exp(-scaled_logits))
+     return scaled_confs.tolist()
+
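To see the effect of the binary temperature map used above, here is a minimal standalone version: T > 1 shrinks logits toward 0, pulling probabilities toward 0.5 (the `apply_temp` name is hypothetical):

```python
import numpy as np

def apply_temp(confidences, T):
    # Binary temperature scaling: logit -> logit / T -> sigmoid
    p = np.clip(np.asarray(confidences, dtype=np.float64), 1e-6, 1 - 1e-6)
    logits = np.log(p / (1 - p))
    return 1.0 / (1.0 + np.exp(-logits / T))

scaled = apply_temp([0.99, 0.9, 0.5], T=2.0)
# 0.9 has logit ln(9); halving it gives sigmoid(ln(3)) = 0.75,
# while 0.5 (logit 0) is a fixed point of the map.
```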
288
+ # ================================================================
+ # Robustness to Miscalibration
+ # ================================================================
+
+ def test_calibration_robustness(
+     results: list[AgentResult],
+     cases: list[MedicalCase],
+     noise_levels: list[float] | None = None,
+     n_trials: int = 10,
+     seed: int = 42,
+ ) -> dict[float, dict]:
+     """
+     Test whether the agent's acquisition decisions are robust to
+     probability miscalibration.
+
+     For each noise level, we perturb the agent's reported probabilities
+     and check whether the same acquisition order and stopping decisions
+     would be made.
+
+     Args:
+         noise_levels: Standard deviations of Gaussian noise added to logits
+         n_trials: Number of random trials per noise level
+
+     Returns:
+         {noise_level: {order_stability, stop_stability, ...}}
+     """
+     from scipy.stats import spearmanr
+
+     if noise_levels is None:
+         noise_levels = [0.0, 0.1, 0.25, 0.5, 1.0, 2.0]
+
+     rng = np.random.RandomState(seed)
+     robustness = {}
+
+     # Collect original acquisition orders and stopping points
+     original_orders = []
+     original_stop_steps = []
+     original_distributions = []
+
+     for result in results:
+         original_orders.append(tuple(result.acquired_channels))
+         original_stop_steps.append(len(result.acquired_channels))
+
+         step_dists = []
+         for step in result.steps:
+             if step.differential:
+                 dist = {
+                     d.get("name", ""): d.get("confidence", 0)
+                     for d in step.differential
+                 }
+                 step_dists.append(dist)
+         original_distributions.append(step_dists)
+
+     for noise in noise_levels:
+         order_matches = 0
+         stop_matches = 0
+         total = len(results)
+
+         if noise == 0.0:
+             robustness[noise] = {
+                 "order_stability": 1.0,
+                 "stop_stability": 1.0,
+                 "mean_rank_correlation": 1.0,
+                 "n_cases": total,
+             }
+             continue
+
+         rank_correlations = []
+
+         for trial in range(n_trials):
+             trial_order_matches = 0
+             trial_stop_matches = 0
+             trial_rank_corrs = []
+
+             for i, (result, dists) in enumerate(
+                 zip(results, original_distributions)
+             ):
+                 if not dists:
+                     continue
+
+                 # Perturb each step's distribution
+                 for dist in dists:
+                     probs = np.array(list(dist.values()), dtype=np.float64)
+                     probs = np.clip(probs, 1e-6, 1 - 1e-6)
+
+                     # Add noise in logit space
+                     logits = np.log(probs / (1 - probs))
+                     noisy_logits = logits + rng.normal(0, noise, len(logits))
+                     noisy_probs = 1.0 / (1.0 + np.exp(-noisy_logits))
+                     noisy_probs /= noisy_probs.sum()
+
+                     # Check whether the ranking order is preserved
+                     orig_order = np.argsort(-probs)
+                     noisy_order = np.argsort(-noisy_probs)
+
+                     # Spearman rank correlation between the two orderings
+                     if len(orig_order) > 1:
+                         corr, _ = spearmanr(orig_order, noisy_order)
+                         trial_rank_corrs.append(corr)
+
+                 # Simplified: without re-running the acquisition policy we
+                 # cannot recompute the order/stop decisions under noise, so
+                 # these counters are optimistic placeholders; the informative
+                 # signal is the rank correlation above.
+                 if tuple(result.acquired_channels) == original_orders[i]:
+                     trial_order_matches += 1
+                     trial_stop_matches += 1
+
+             if total > 0:
+                 order_matches += trial_order_matches / total
+                 stop_matches += trial_stop_matches / total
+             if trial_rank_corrs:
+                 rank_correlations.extend(trial_rank_corrs)
+
+         robustness[noise] = {
+             "order_stability": order_matches / n_trials if n_trials > 0 else 0,
+             "stop_stability": stop_matches / n_trials if n_trials > 0 else 0,
+             "mean_rank_correlation": float(np.mean(rank_correlations)) if rank_correlations else 1.0,
+             "n_cases": total,
+         }
+
+     return robustness
+
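The perturbation inside the trial loop can be reproduced in isolation. A sketch of one noisy draw (the seed and the toy distribution are arbitrary):

```python
import numpy as np

# One logit-space perturbation of a reported distribution, as in the
# robustness test: add Gaussian noise to logits, map back through the
# sigmoid, then renormalize to a probability vector.
rng = np.random.RandomState(42)
probs = np.clip(np.array([0.6, 0.3, 0.1]), 1e-6, 1 - 1e-6)
logits = np.log(probs / (1 - probs))
noisy_logits = logits + rng.normal(0, 0.25, len(logits))
noisy_probs = 1.0 / (1.0 + np.exp(-noisy_logits))
noisy_probs /= noisy_probs.sum()
# At this noise scale the argmax ordering is usually preserved.
```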
410
+ # ================================================================
+ # Full Calibration Analysis Pipeline
+ # ================================================================
+
+ def run_calibration_analysis(
+     results: list[AgentResult],
+     cases: list[MedicalCase],
+     save_dir: Path | None = None,
+ ) -> dict:
+     """
+     Run the complete calibration analysis suite.
+
+     Returns a dict with all metrics; saves to disk if save_dir is provided.
+     """
+     logger.info("Running calibration analysis...")
+
+     # 1. Overall calibration
+     confidences, correctness = extract_predictions(results, cases)
+     overall = compute_calibration(confidences, correctness)
+
+     logger.info(f" ECE: {overall.ece:.4f}")
+     logger.info(f" MCE: {overall.mce:.4f}")
+     logger.info(f" Brier Score: {overall.brier_score:.4f}")
+     logger.info(f" Mean Confidence: {overall.mean_confidence:.3f}")
+     logger.info(f" Mean Accuracy: {overall.mean_accuracy:.3f}")
+     logger.info(f" Overconfidence Ratio: {overall.overconfidence_ratio:.2f}")
+
+     # 2. Temperature scaling
+     if len(confidences) >= 10:
+         # Split into calibration and test halves
+         n = len(confidences)
+         mid = n // 2
+         cal_confs, cal_correct = confidences[:mid], correctness[:mid]
+         test_confs, test_correct = confidences[mid:], correctness[mid:]
+
+         opt_T, cal_ece = temperature_scale(cal_confs, cal_correct)
+         scaled_test = apply_temperature(test_confs, opt_T)
+         post_cal = compute_calibration(scaled_test, test_correct)
+
+         logger.info(f" Optimal Temperature: {opt_T:.3f}")
+         logger.info(f" Post-calibration ECE: {post_cal.ece:.4f}")
+     else:
+         opt_T = 1.0
+         post_cal = overall
+
+     # 3. Per-step calibration
+     step_data = extract_per_step_predictions(results, cases)
+     per_step_cal = {}
+     for step_idx, (step_confs, step_correct) in sorted(step_data.items()):
+         if len(step_confs) >= 5:
+             step_cal = compute_calibration(step_confs, step_correct, n_bins=5)
+             per_step_cal[step_idx] = {
+                 "ece": step_cal.ece,
+                 "mean_confidence": step_cal.mean_confidence,
+                 "mean_accuracy": step_cal.mean_accuracy,
+                 "n_predictions": step_cal.n_predictions,
+             }
+             logger.info(
+                 f" Step {step_idx}: ECE={step_cal.ece:.4f}, "
+                 f"Conf={step_cal.mean_confidence:.3f}, "
+                 f"Acc={step_cal.mean_accuracy:.3f} (n={step_cal.n_predictions})"
+             )
+
+     # 4. Robustness analysis
+     robustness = test_calibration_robustness(results, cases)
+     for noise, metrics in robustness.items():
+         logger.info(
+             f" Noise={noise:.2f}: rank_corr={metrics['mean_rank_correlation']:.3f}"
+         )
+
+     # Compile output
+     output = {
+         "overall": {
+             "ece": overall.ece,
+             "mce": overall.mce,
+             "ace": overall.ace,
+             "brier_score": overall.brier_score,
+             "mean_confidence": overall.mean_confidence,
+             "mean_accuracy": overall.mean_accuracy,
+             "overconfidence_ratio": overall.overconfidence_ratio,
+             "n_predictions": overall.n_predictions,
+             "bins": [
+                 {
+                     "center": b.bin_center,
+                     "confidence": b.avg_confidence,
+                     "accuracy": b.avg_accuracy,
+                     "count": b.count,
+                     "gap": b.gap,
+                 }
+                 for b in overall.bins
+             ],
+         },
+         "temperature_scaling": {
+             "optimal_temperature": opt_T,
+             "pre_calibration_ece": overall.ece,
+             "post_calibration_ece": post_cal.ece,
+         },
+         "per_step_calibration": per_step_cal,
+         "robustness": {
+             str(k): v for k, v in robustness.items()
+         },
+     }
+
+     if save_dir:
+         save_dir.mkdir(parents=True, exist_ok=True)
+         with open(save_dir / "calibration_analysis.json", "w") as f:
+             json.dump(output, f, indent=2)
+         logger.info(f" Saved to {save_dir / 'calibration_analysis.json'}")
+
+     return output
config.py ADDED
@@ -0,0 +1,276 @@
+ """
+ Configuration for ActiveMedAgent experiments.
+ """
+ import os
+ from pathlib import Path
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+ # ============================================================
+ # API Configuration
+ # ============================================================
+ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
+ ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY", "")
+ TOGETHER_API_KEY = os.getenv("TOGETHER_API_KEY", "")
+
+ # Model identifiers per backend
+ MODELS = {
+     "openai": "gpt-4o-2024-11-20",
+     "openai_mini": "gpt-4o-mini",
+     "anthropic": "claude-sonnet-4-20250514",
+     "together": "Qwen/Qwen2.5-VL-72B-Instruct",
+ }
+
+ # Rate limiting (requests per minute)
+ RATE_LIMITS = {
+     "openai": 30,
+     "openai_mini": 60,
+     "anthropic": 30,
+     "together": 20,
+ }
+
+ # Max tokens for generation — tool calls produce structured JSON with
+ # probability distributions, evidence chains, and expected impact analysis,
+ # which requires more tokens than free-text responses.
+ MAX_TOKENS = 4096
+
+ # Temperature — low for reproducibility
+ TEMPERATURE = 0.1
+
+ # ============================================================
+ # Dataset Paths (update these to your local paths)
+ # ============================================================
+ DATA_ROOT = Path(os.getenv("DATA_ROOT", "./data"))
+
+ DATASET_PATHS = {
+     "midas": DATA_ROOT / "midas",
+     "nejm": DATA_ROOT / "nejm",
+     "olives": DATA_ROOT / "OLIVES",
+ }
+
+ # ============================================================
+ # Experiment Configuration
+ # ============================================================
+
+ # Prompt variants for robustness analysis (see prompts.py)
+ PROMPT_VARIANTS = ["A", "B", "C"]
+
+ # Default backends to run
+ DEFAULT_BACKENDS = ["openai"]
+
+ # Context management mode for the acquisition loop.
+ # "full" — keep entire multi-turn conversation history (best for capable models)
+ # "condensed" — each turn gets a fresh single-turn call with a compressed state
+ #     summary (best for weaker/smaller models that lose track in long context)
+ # "adaptive" — auto-select based on model: "full" for GPT-4o/Claude/Qwen-72B,
+ #     "condensed" for GPT-4o-mini and other small models
+ CONTEXT_MODE = "adaptive"
+
+ # Models that should use condensed context (too weak for long multi-turn)
+ CONDENSED_MODELS = {
+     "gpt-4o-mini",
+ }
+
+ # Early commit threshold — agent may commit if top diagnosis probability exceeds
+ # this AND the gap to #2 exceeds COMMIT_GAP_THRESHOLD
+ COMMIT_CONFIDENCE_THRESHOLD = 0.85
+ COMMIT_GAP_THRESHOLD = 0.30
+
+ # Number of bootstrap resamples for confidence intervals
+ N_BOOTSTRAP = 1000
+
+ # Random seed
+ SEED = 42
+
+ # Cost penalty strength for learned policies.
+ # Utility reward = diagnostic improvement - lambda * normalized_channel_cost
+ COST_PENALTY_LAMBDA = float(os.getenv("COST_PENALTY_LAMBDA", "0.05"))
+
+ # ============================================================
+ # Dataset-Specific Channel Definitions
+ # ============================================================
+
+ MIDAS_CHANNELS = {
+     "patient_demographics": {
+         "description": "Patient age, sex, and Fitzpatrick skin type",
+         "type": "text",
+         "always_given": True,
+         "tier": "free",
+         "cost": 0.0,
+         "order": 0,
+     },
+     "lesion_metadata": {
+         "description": "Anatomic location, lesion length and width",
+         "type": "text",
+         "always_given": True,
+         "tier": "cheap",
+         "cost": 25.0,
+         "order": 1,
+     },
+     "clinical_30cm": {
+         "description": "Clinical photograph taken at 30cm distance",
+         "type": "image",
+         "always_given": False,
+         "tier": "moderate",
+         "cost": 50.0,
+         "order": 2,
+     },
+     "clinical_15cm": {
+         "description": "Clinical photograph taken at 15cm distance (closer view)",
+         "type": "image",
+         "always_given": False,
+         "tier": "moderate",
+         "cost": 50.0,
+         "order": 3,
+     },
+     "dermoscopy": {
+         "description": "Dermoscopic image showing subsurface skin structures",
+         "type": "image",
+         "always_given": False,
+         "tier": "expensive",
+         "cost": 250.0,
+         "order": 4,
+     },
+ }
+
+ NEJM_CHANNELS = {
+     "demographics": {
+         "description": "Patient age, sex, and ethnicity if mentioned",
+         "type": "text",
+         "always_given": True,
+         "tier": "free",
+         "cost": 0.0,
+         "order": 0,
+     },
+     "chief_complaint": {
+         "description": "The presenting symptom(s) and their duration",
+         "type": "text",
+         "always_given": True,
+         "tier": "free",
+         "cost": 0.0,
+         "order": 1,
+     },
+     "medical_history": {
+         "description": "Past medical conditions, medications, family and social history",
+         "type": "text",
+         "always_given": True,
+         "tier": "free",
+         "cost": 0.0,
+         "order": 2,
+     },
+     "exam_findings": {
+         "description": "Physical examination results and observations",
+         "type": "text",
+         "always_given": False,
+         "tier": "cheap",
+         "cost": 75.0,
+         "order": 3,
+     },
+     "investigations": {
+         "description": "Laboratory values, prior imaging results, and test outcomes",
+         "type": "text",
+         "always_given": False,
+         "tier": "moderate",
+         "cost": 250.0,
+         "order": 4,
+     },
+     "image": {
+         "description": "The primary diagnostic image",
+         "type": "image",
+         "always_given": False,
+         "tier": "expensive",
+         "cost": 800.0,
+         "order": 5,
+     },
+ }
+
+ OLIVES_CHANNELS = {
+     "disease_context": {
+         "description": "Disease type and treatment context",
+         "type": "text",
+         "always_given": True,
+         "tier": "free",
+         "cost": 0.0,
+         "order": 0,
+     },
+     "clinical_measurements": {
+         "description": "Best Corrected Visual Acuity (BCVA) and Central Subfield Thickness (CST)",
+         "type": "text",
+         "always_given": False,
+         "tier": "cheap",
+         "cost": 20.0,
+         "order": 1,
+     },
+     "biomarker_hints": {
+         "description": "Expert-graded presence of retinal biomarkers (partial list)",
+         "type": "text",
+         "always_given": False,
+         "tier": "cheap",
209
+ "tier": "moderate",
210
+ "cost": 100.0,
211
+ "order": 2,
212
+ },
213
+ "oct_scan": {
214
+ "description": "Optical Coherence Tomography B-scan showing retinal cross-section",
215
+ "type": "image",
216
+ "always_given": False,
217
+ "tier": "expensive",
218
+ "cost": 300.0,
219
+ "order": 3,
220
+ },
221
+ "additional_oct": {
222
+ "description": "Additional OCT B-scans from different retinal locations",
223
+ "type": "image",
224
+ "always_given": False,
225
+ "tier": "very_expensive",
226
+ "cost": 150.0,
227
+ "order": 4,
228
+ },
229
+ }
230
+
231
+ CHANNEL_CONFIGS = {
232
+ "midas": MIDAS_CHANNELS,
233
+ "nejm": NEJM_CHANNELS,
234
+ "olives": OLIVES_CHANNELS,
235
+ }
236
+
237
+ # ============================================================
238
+ # OLIVES Biomarker Tier Definitions
239
+ # ============================================================
240
+
241
+ OLIVES_BIOMARKER_TIERS = {
242
+ "fundus_visible": [
243
+ "hard_exudates",
244
+ "hemorrhage",
245
+ "microaneurysms",
246
+ "cotton_wool_spots",
247
+ ],
248
+ "oct_dependent": [
249
+ "fluid_irf", # Intraretinal fluid
250
+ "fluid_srf", # Subretinal fluid
251
+ "dril", # Disorganization of retinal inner layers
252
+ "ez_disruption", # Ellipsoid zone disruption
253
+ "ez_absent",
254
+ "drt_me", # Diffuse retinal thickening / macular edema
255
+ "shrm", # Subretinal hyperreflective material
256
+ "full_thickness", # Full thickness involvement
257
+ "preretinal_tissue",
258
+ "vitreous_debris",
259
+ ],
260
+ "clinical_dependent": [
261
+ "drt_me", # Also correlates with CST
262
+ ],
263
+ }
264
+
265
+ # ============================================================
266
+ # Results / Logging
267
+ # ============================================================
268
+ RESULTS_DIR = Path(os.getenv("RESULTS_DIR", "./results"))
269
+ RESULTS_DIR.mkdir(parents=True, exist_ok=True)
270
+
271
+ LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO")
272
+
273
+
274
+ def get_channel_definition(dataset: str, channel_name: str) -> dict:
275
+ """Return canonical metadata for a dataset channel."""
276
+ return CHANNEL_CONFIGS.get(dataset, {}).get(channel_name, {})
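The commit thresholds and cost penalty above combine into a simple decision rule and reward. Below is a minimal standalone sketch of how they fit together; `should_commit` and `utility_reward` are hypothetical helper names for illustration, not functions in this repository.

```python
# Illustrative combination of the config values above; should_commit and
# utility_reward are hypothetical helpers, not part of the repo's API.
COMMIT_CONFIDENCE_THRESHOLD = 0.85
COMMIT_GAP_THRESHOLD = 0.30
COST_PENALTY_LAMBDA = 0.05


def should_commit(probs: dict[str, float]) -> bool:
    """Commit early when the top diagnosis is confident AND well separated from #2."""
    ranked = sorted(probs.values(), reverse=True)
    top = ranked[0]
    gap = top - (ranked[1] if len(ranked) > 1 else 0.0)
    return top >= COMMIT_CONFIDENCE_THRESHOLD and gap >= COMMIT_GAP_THRESHOLD


def utility_reward(diagnostic_improvement: float, channel_cost: float, max_cost: float) -> float:
    """Reward = diagnostic improvement - lambda * normalized channel cost."""
    normalized = channel_cost / max_cost if max_cost > 0 else 0.0
    return diagnostic_improvement - COST_PENALTY_LAMBDA * normalized
```

Both thresholds must hold for a commit, so a merely confident but crowded posterior (e.g. 0.5 vs. 0.4) keeps the agent acquiring.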
datasets/__init__.py ADDED
@@ -0,0 +1,17 @@
+ from .base import MedicalCase, DatasetBase
+ from .midas import MIDASDataset
+ from .nejm import NEJMDataset
+ from .olives import OLIVESDataset
+
+ DATASET_REGISTRY = {
+     "midas": MIDASDataset,
+     "nejm": NEJMDataset,
+     "olives": OLIVESDataset,
+ }
+
+
+ def load_dataset(name: str, **kwargs) -> DatasetBase:
+     """Load a dataset by name."""
+     if name not in DATASET_REGISTRY:
+         raise ValueError(f"Unknown dataset: {name}. Choose from {list(DATASET_REGISTRY.keys())}")
+     return DATASET_REGISTRY[name](**kwargs)
datasets/base.py ADDED
@@ -0,0 +1,146 @@
+ """
+ Abstract base class for medical datasets in the ActiveMedAgent framework.
+
+ Every dataset must expose cases in a unified format:
+ - An initial observation (always-given channels)
+ - A set of requestable channels (additional info the agent can acquire)
+ - A candidate list (diagnoses to rank)
+ - Ground truth (correct ranking)
+ """
+ from abc import ABC, abstractmethod
+ from dataclasses import dataclass, field
+ from pathlib import Path
+ from typing import Any
+
+
+ @dataclass
+ class ChannelData:
+     """A single information channel's content."""
+     name: str
+     channel_type: str  # "image" or "text"
+     description: str  # Human-readable description of this channel
+     value: Any = None  # Text content (str) or base64-encoded image (str)
+     image_path: Path | None = None  # Original image path if applicable
+     cost: float = 0.0
+     tier: str = "unknown"
+     always_given: bool = False
+
+
+ @dataclass
+ class MedicalCase:
+     """
+     A single diagnostic case in the unified format.
+
+     The agent starts with `initial_channels` and can request from
+     `requestable_channels`. It must produce a ranked list over `candidates`.
+     """
+     case_id: str
+     dataset: str  # "midas", "nejm", "olives"
+     initial_channels: dict[str, ChannelData] = field(default_factory=dict)
+     requestable_channels: dict[str, ChannelData] = field(default_factory=dict)
+     candidates: list[str] = field(default_factory=list)
+     ground_truth: str = ""  # Correct diagnosis label
+     ground_truth_rank: int = 0  # Index in candidates (0-indexed)
+     metadata: dict[str, Any] = field(default_factory=dict)
+
+     @property
+     def all_channel_names(self) -> list[str]:
+         return list(self.initial_channels.keys()) + list(self.requestable_channels.keys())
+
+     @property
+     def requestable_names(self) -> list[str]:
+         return list(self.requestable_channels.keys())
+
+     def get_channel(self, name: str) -> ChannelData | None:
+         """Retrieve a channel by name from either initial or requestable."""
+         if name in self.initial_channels:
+             return self.initial_channels[name]
+         if name in self.requestable_channels:
+             return self.requestable_channels[name]
+         return None
+
+     def get_initial_images(self) -> list[str]:
+         """Get base64-encoded images from initial channels."""
+         images = []
+         for ch in self.initial_channels.values():
+             if ch.channel_type == "image" and ch.value is not None:
+                 images.append(ch.value)
+         return images
+
+     def get_all_images_up_to(self, acquired: list[str]) -> list[str]:
+         """Get all images from initial + acquired channels."""
+         images = self.get_initial_images()
+         for name in acquired:
+             ch = self.get_channel(name)
+             if ch and ch.channel_type == "image" and ch.value is not None:
+                 if isinstance(ch.value, list):
+                     images.extend(ch.value)
+                 else:
+                     images.append(ch.value)
+         return images
+
+     def get_text_context(self, acquired: list[str]) -> dict[str, dict]:
+         """Get all text info from initial + acquired channels."""
+         context = {}
+         for name, ch in self.initial_channels.items():
+             if ch.channel_type == "text" and ch.value:
+                 context[name] = {"type": "text", "value": ch.value}
+             elif ch.channel_type == "image":
+                 context[name] = {"type": "image", "value": "(image provided)"}
+         for name in acquired:
+             ch = self.get_channel(name)
+             if ch:
+                 if ch.channel_type == "text" and ch.value:
+                     context[name] = {"type": "text", "value": ch.value}
+                 elif ch.channel_type == "image":
+                     context[name] = {"type": "image", "value": "(image provided)"}
+         return context
+
+     def get_channel_cost(self, name: str) -> float:
+         """Return the configured acquisition cost for a channel."""
+         ch = self.get_channel(name)
+         return float(ch.cost) if ch else 0.0
+
+     def get_initial_cost(self) -> float:
+         """Total cost of channels already available at case start."""
+         return float(sum(ch.cost for ch in self.initial_channels.values()))
+
+     def get_acquisition_cost(self, acquired: list[str]) -> float:
+         """Total incremental cost of acquired requestable channels."""
+         return float(sum(self.get_channel_cost(name) for name in acquired))
+
+     def get_total_cost(self, acquired: list[str]) -> float:
+         """Initial cost plus any additional acquired channels."""
+         return self.get_initial_cost() + self.get_acquisition_cost(acquired)
+
+     def get_max_requestable_cost(self) -> float:
+         """Upper bound if every requestable channel were acquired."""
+         return float(sum(ch.cost for ch in self.requestable_channels.values()))
+
+
+ class DatasetBase(ABC):
+     """Abstract base class for dataset loaders."""
+
+     def __init__(self, data_dir: str | Path, split: str = "test"):
+         self.data_dir = Path(data_dir)
+         self.split = split
+         self.cases: list[MedicalCase] = []
+
+     @abstractmethod
+     def load(self) -> list[MedicalCase]:
+         """Load and return all cases in unified format."""
+         pass
+
+     def __len__(self) -> int:
+         return len(self.cases)
+
+     def __getitem__(self, idx: int) -> MedicalCase:
+         return self.cases[idx]
+
+     def __iter__(self):
+         return iter(self.cases)
+
+     @abstractmethod
+     def get_name(self) -> str:
+         """Return dataset identifier string."""
+         pass
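The cost-accounting methods on `MedicalCase` can be exercised with a toy case. The sketch below uses trimmed copies of the dataclasses (only the fields the cost methods touch) so it runs on its own; the example channels and values are made up.

```python
# Standalone sketch of MedicalCase cost accounting; the dataclasses here are
# trimmed copies of datasets/base.py so the example runs without the repo.
from dataclasses import dataclass, field
from typing import Any


@dataclass
class ChannelData:
    name: str
    channel_type: str
    value: Any = None
    cost: float = 0.0
    always_given: bool = False


@dataclass
class MedicalCase:
    case_id: str
    initial_channels: dict = field(default_factory=dict)
    requestable_channels: dict = field(default_factory=dict)

    def get_channel(self, name):
        return self.initial_channels.get(name) or self.requestable_channels.get(name)

    def get_initial_cost(self) -> float:
        return float(sum(ch.cost for ch in self.initial_channels.values()))

    def get_acquisition_cost(self, acquired) -> float:
        return float(sum(self.get_channel(n).cost for n in acquired if self.get_channel(n)))

    def get_total_cost(self, acquired) -> float:
        return self.get_initial_cost() + self.get_acquisition_cost(acquired)


case = MedicalCase(
    case_id="demo",
    initial_channels={"demographics": ChannelData("demographics", "text", "62F", cost=0.0, always_given=True)},
    requestable_channels={
        "dermoscopy": ChannelData("dermoscopy", "image", cost=250.0),
        "clinical_30cm": ChannelData("clinical_30cm", "image", cost=50.0),
    },
)
```

Total cost is the sum of what the case started with plus each acquired channel's configured cost, so the agent's spending is fully determined by its acquisition trace.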
datasets/midas.py ADDED
@@ -0,0 +1,444 @@
+ """
+ MIDAS (MRA-MIDAS) Dataset Loader.
+
+ Actual Stanford AIMI MIDAS dataset structure:
+     midas/
+     ├── images/              (flat directory of all images)
+     │   ├── s-prd-398966407.jpg
+     │   └── ...
+     └── release_midas.xlsx   (metadata with midas_record_id grouping)
+
+ Each record_id groups images of one lesion at multiple modalities:
+     - midas_distance='1ft'    → clinical_30cm
+     - midas_distance='6in'    → clinical_15cm
+     - midas_distance='dscope' → dermoscopy
+
+ Each case becomes a multi-channel acquisition problem:
+     - Initial: patient_demographics and lesion_metadata (always given per config)
+     - Requestable: clinical_30cm, clinical_15cm, dermoscopy
+ """
+ import csv
+ import hashlib
+ import json
+ import logging
+ import random
+ from pathlib import Path
+ from collections import Counter, defaultdict
+
+ from .base import DatasetBase, MedicalCase, ChannelData
+ from api_client import encode_image_to_base64
+ import config
+
+ logger = logging.getLogger(__name__)
+
+ # Map raw midas_distance values to our channel names
+ DISTANCE_TO_CHANNEL = {
+     "1ft": "clinical_30cm",
+     "6in": "clinical_15cm",
+     "dscope": "dermoscopy",
+ }
+
+ # Map raw midas_path values to canonical diagnosis names
+ PATH_TO_DIAGNOSIS = {
+     "malignant- bcc": "basal_cell_carcinoma",
+     "malignant- melanoma": "melanoma_invasive",
+     "malignant- scc": "squamous_cell_carcinoma",
+     "malignant- sccis": "squamous_cell_carcinoma_in_situ",
+     "malignant- ak": "actinic_keratosis",
+     "benign-melanocytic nevus": "melanocytic_nevus",
+     "benign-seborrheic keratosis": "seborrheic_keratosis",
+     "benign-other": "benign_other",
+     "other- melanocytic lesion, possible re-excision (severe, spitz, aimp)": "dysplastic_nevus",
+ }
+
+ # MIDAS diagnosis taxonomy, grouped for candidate generation
+ MIDAS_DIAGNOSIS_GROUPS = {
+     "malignant_melanocytic": [
+         "melanoma_invasive",
+         "melanoma_in_situ",
+     ],
+     "benign_melanocytic": [
+         "melanocytic_nevus",
+         "dysplastic_nevus",
+         "blue_nevus",
+         "spitz_nevus",
+     ],
+     "malignant_nonmelanocytic": [
+         "basal_cell_carcinoma",
+         "squamous_cell_carcinoma",
+         "squamous_cell_carcinoma_in_situ",
+         "actinic_keratosis",
+     ],
+     "benign_nonmelanocytic": [
+         "seborrheic_keratosis",
+         "dermatofibroma",
+         "angioma",
+         "solar_lentigo",
+         "benign_other",
+     ],
+     "inflammatory": [
+         "eczema",
+         "psoriasis",
+         "lichen_planus",
+     ],
+ }
+
+ # Flattened list of all possible diagnoses
+ ALL_DIAGNOSES = []
+ for group in MIDAS_DIAGNOSIS_GROUPS.values():
+     ALL_DIAGNOSES.extend(group)
+
+
+ def _case_rng(case_id: str) -> random.Random:
+     """Create a deterministic RNG seeded by case ID for reproducible candidate generation."""
+     seed = int(hashlib.sha256(case_id.encode()).hexdigest()[:8], 16)
+     return random.Random(seed)
+
+
+ class MIDASDataset(DatasetBase):
+     """Loader for the MRA-MIDAS dermatology dataset."""
+
+     def __init__(self, data_dir: str | Path | None = None, split: str = "test", n_candidates: int = 5):
+         super().__init__(data_dir or config.DATASET_PATHS["midas"], split)
+         self.n_candidates = n_candidates
+
+     def get_name(self) -> str:
+         return "midas"
+
+     def load(self) -> list[MedicalCase]:
+         logger.info(f"Loading MIDAS dataset from {self.data_dir}")
+
+         # ---- Discover metadata file ----
+         metadata_path = self._find_metadata_file()
+         if metadata_path is None:
+             logger.error(f"No metadata file found in {self.data_dir}")
+             return []
+
+         records = self._load_metadata(metadata_path)
+         logger.info(f"Found {len(records)} records in metadata")
+
+         # ---- Group records by lesion (midas_record_id) ----
+         lesion_groups = defaultdict(list)
+         for r in records:
+             rid = r.get("midas_record_id", r.get("lesion_id", ""))
+             if rid:
+                 lesion_groups[str(rid)].append(r)
+
+         logger.info(f"Found {len(lesion_groups)} unique lesions")
+
+         # ---- Build diagnosis distribution for candidate sampling ----
+         all_dx = []
+         for rid, recs in lesion_groups.items():
+             dx = self._get_diagnosis(recs[0])
+             if dx:
+                 all_dx.append(dx)
+         dx_counter = Counter(all_dx)
+
+         # ---- Convert each lesion group to MedicalCase ----
+         self.cases = []
+         for rid, recs in lesion_groups.items():
+             case = self._build_case(rid, recs, dx_counter)
+             if case is not None:
+                 self.cases.append(case)
+
+         logger.info(f"Loaded {len(self.cases)} MIDAS cases")
+         return self.cases
+
+     def _find_metadata_file(self) -> Path | None:
+         """Find the metadata file (xlsx, csv, or json)."""
+         # Try xlsx first (actual MIDAS format)
+         for name in ["release_midas.xlsx", "metadata.xlsx"]:
+             p = self.data_dir / name
+             if p.exists():
+                 return p
+         # Then CSV
+         for name in ["metadata.csv", "labels.csv", "midas_metadata.csv"]:
+             p = self.data_dir / name
+             if p.exists():
+                 return p
+         # Then JSON
+         for name in ["metadata.json", "labels.json"]:
+             p = self.data_dir / name
+             if p.exists():
+                 return p
+         # Glob fallback
+         for pattern in ["*.xlsx", "*.csv"]:
+             matches = list(self.data_dir.glob(pattern))
+             if matches:
+                 return matches[0]
+         return None
+
+     def _load_metadata(self, path: Path) -> list[dict]:
+         """Load metadata from xlsx, csv, or json."""
+         if path.suffix == ".xlsx":
+             return self._load_xlsx(path)
+         elif path.suffix == ".json":
+             with open(path, encoding="utf-8") as f:
+                 return json.load(f)
+         else:
+             with open(path, newline="", encoding="utf-8-sig") as f:
+                 reader = csv.DictReader(f)
+                 return list(reader)
+
+     def _load_xlsx(self, path: Path) -> list[dict]:
+         """Load metadata from an Excel file."""
+         import openpyxl
+         wb = openpyxl.load_workbook(path, read_only=True)
+         ws = wb[wb.sheetnames[0]]
+         rows = list(ws.iter_rows(values_only=True))
+         wb.close()
+
+         if not rows:
+             return []
+         headers = [str(h) if h is not None else f"col_{i}" for i, h in enumerate(rows[0])]
+         return [dict(zip(headers, row)) for row in rows[1:]]
+
+     def _get_diagnosis(self, record: dict) -> str | None:
+         """Extract canonical diagnosis from a record."""
+         raw_path = record.get("midas_path", record.get("diagnosis", ""))
+         if raw_path is None or not str(raw_path).strip():
+             return None
+         raw_path = str(raw_path).strip().lower()
+         if raw_path == "none":
+             return None
+         # Try exact match in mapping
+         for key, canonical in PATH_TO_DIAGNOSIS.items():
+             if key.lower() == raw_path:
+                 return canonical
+         # Fuzzy fallback; check specific terms before generic ones so that
+         # e.g. "actinic keratosis" is not swallowed by the "keratosis" match
+         if "melanoma" in raw_path:
+             return "melanoma_invasive"
+         if "bcc" in raw_path or "basal" in raw_path:
+             return "basal_cell_carcinoma"
+         if "sccis" in raw_path:
+             return "squamous_cell_carcinoma_in_situ"
+         if "scc" in raw_path or "squamous" in raw_path:
+             return "squamous_cell_carcinoma"
+         if "actinic" in raw_path or "ak" in raw_path.split():
+             return "actinic_keratosis"
+         if "nevus" in raw_path or "melanocytic" in raw_path:
+             return "melanocytic_nevus"
+         if "seborrheic" in raw_path or "keratosis" in raw_path:
+             return "seborrheic_keratosis"
+         if "benign" in raw_path:
+             return "benign_other"
+         return None
+
+     def _find_image_by_filename(self, filename: str) -> Path | None:
+         """Find an image by its filename in the images directory."""
+         if not filename:
+             return None
+         # Try images/ subdir, then root, case-insensitive
+         search_dirs = [
+             self.data_dir / "images",
+             self.data_dir,
+         ]
+         for d in search_dirs:
+             if not d.exists():
+                 continue
+             p = d / filename
+             if p.exists():
+                 return p
+             # Case-insensitive search
+             for ext_p in d.iterdir():
+                 if ext_p.name.lower() == filename.lower():
+                     return ext_p
+         return None
+
+     def _build_case(
+         self,
+         record_id: str,
+         records: list[dict],
+         dx_counter: Counter,
+     ) -> MedicalCase | None:
+         """Convert a lesion's grouped records into a MedicalCase."""
+         # Use the first non-control record with a diagnosis for metadata
+         primary = None
+         for r in records:
+             if str(r.get("midas_iscontrol", "no")).lower() != "yes":
+                 dx = self._get_diagnosis(r)
+                 if dx:
+                     primary = r
+                     break
+         if primary is None:
+             return None  # Skip control-only lesions
+
+         diagnosis = self._get_diagnosis(primary)
+         if not diagnosis:
+             return None
+
+         # ---- Build channels from all records in this lesion group ----
+         all_channels = {}
+
+         # Group images by modality
+         descriptions = {
+             "clinical_30cm": "Clinical photograph at 30cm distance",
+             "clinical_15cm": "Clinical photograph at 15cm distance (closer view)",
+             "dermoscopy": "Dermoscopic image showing subsurface skin structures",
+         }
+         for r in records:
+             if str(r.get("midas_iscontrol", "no")).lower() == "yes":
+                 continue
+             distance = str(r.get("midas_distance", "")).strip().lower()
+             channel_name = DISTANCE_TO_CHANNEL.get(distance)
+             if not channel_name:
+                 continue
+             if channel_name in all_channels:
+                 continue  # Already have this modality
+
+             filename = r.get("midas_file_name", "")
+             img_path = self._find_image_by_filename(filename)
+             if img_path is None:
+                 continue
+
+             try:
+                 img_b64 = encode_image_to_base64(img_path)
+             except Exception:
+                 continue
+
+             ch_meta = config.get_channel_definition("midas", channel_name)
+             all_channels[channel_name] = ChannelData(
+                 name=channel_name,
+                 channel_type="image",
+                 description=descriptions.get(channel_name, channel_name),
+                 value=img_b64,
+                 image_path=img_path,
+                 cost=float(ch_meta.get("cost", 0.0)),
+                 tier=ch_meta.get("tier", "unknown"),
+                 always_given=bool(ch_meta.get("always_given", False)),
+             )
+
+         # Patient demographics
+         age = primary.get("midas_age", primary.get("age", ""))
+         sex = primary.get("midas_gender", primary.get("sex", ""))
+         fitz = primary.get("midas_fitzpatrick", primary.get("fitzpatrick", ""))
+         ethnicity = primary.get("midas_ethnicity", "")
+         race = primary.get("midas_race", "")
+         if any([age, sex, fitz]):
+             demo_parts = []
+             if age:
+                 demo_parts.append(f"Age: {age}")
+             if sex:
+                 demo_parts.append(f"Sex: {sex}")
+             if fitz:
+                 demo_parts.append(f"Fitzpatrick skin type: {fitz}")
+             if ethnicity and str(ethnicity).lower() not in ("no", "none", ""):
+                 demo_parts.append(f"Ethnicity: {ethnicity}")
+             if race and str(race).lower() not in ("no", "none", ""):
+                 demo_parts.append(f"Race: {race}")
+             ch_meta = config.get_channel_definition("midas", "patient_demographics")
+             all_channels["patient_demographics"] = ChannelData(
+                 name="patient_demographics",
+                 channel_type="text",
+                 description="Patient age, sex, and Fitzpatrick skin type",
+                 value="; ".join(demo_parts),
+                 cost=float(ch_meta.get("cost", 0.0)),
+                 tier=ch_meta.get("tier", "unknown"),
+                 always_given=bool(ch_meta.get("always_given", False)),
+             )
+
+         # Lesion metadata
+         location = primary.get("midas_location", primary.get("location", ""))
+         length = primary.get("length_(mm)", primary.get("length_mm", ""))
+         width = primary.get("width_(mm)", primary.get("width_mm", ""))
+         if any([location, length, width]):
+             meta_parts = []
+             if location:
+                 meta_parts.append(f"Anatomic location: {location}")
+             if length:
+                 meta_parts.append(f"Lesion length: {length}mm")
+             if width:
+                 meta_parts.append(f"Lesion width: {width}mm")
+             ch_meta = config.get_channel_definition("midas", "lesion_metadata")
+             all_channels["lesion_metadata"] = ChannelData(
+                 name="lesion_metadata",
+                 channel_type="text",
+                 description="Anatomic location, lesion length and width",
+                 value="; ".join(meta_parts),
+                 cost=float(ch_meta.get("cost", 0.0)),
+                 tier=ch_meta.get("tier", "unknown"),
+                 always_given=bool(ch_meta.get("always_given", False)),
+             )
+
+         if not all_channels:
+             return None
+
+         initial_channels = {
+             name: ch for name, ch in all_channels.items() if ch.always_given
+         }
+         requestable = {
+             name: ch for name, ch in all_channels.items() if not ch.always_given
+         }
+
+         if not initial_channels and not requestable:
+             return None
+
+         # ---- Build candidate list (correct + distractors) ----
+         case_id = f"midas_{record_id}"
+         candidates = self._generate_candidates(diagnosis, dx_counter, case_id)
+
+         if diagnosis not in candidates:
+             logger.warning(f"Ground truth '{diagnosis}' not in candidate list for {case_id}, forcing inclusion")
+             candidates[0] = diagnosis
+             rng = _case_rng(case_id)
+             rng.shuffle(candidates)
+
+         return MedicalCase(
+             case_id=case_id,
+             dataset="midas",
+             initial_channels=initial_channels,
+             requestable_channels=requestable,
+             candidates=candidates,
+             ground_truth=diagnosis,
+             ground_truth_rank=candidates.index(diagnosis),
+             metadata={
+                 "lesion_id": record_id,
+                 "original_record": {k: str(v) for k, v in primary.items()
+                                     if k not in ("image", "img")},
+             },
+         )
+
+     def _generate_candidates(self, correct_dx: str, dx_counter: Counter, case_id: str) -> list[str]:
+         """
+         Generate N candidate diagnoses: 1 correct + (N-1) distractors.
+
+         Uses a per-case deterministic RNG for reproducibility across conditions.
+         Distractors are sampled to be clinically plausible:
+         - At least one from the same diagnostic group
+         - Others from different groups, weighted by dataset frequency
+         """
+         n = self.n_candidates
+         rng = _case_rng(case_id)
+
+         # Find which group the correct dx belongs to
+         correct_group = None
+         for group_name, members in MIDAS_DIAGNOSIS_GROUPS.items():
+             if correct_dx in members:
+                 correct_group = group_name
+                 break
+
+         distractors = set()
+
+         # Add one same-group distractor if possible
+         if correct_group:
+             same_group = [d for d in MIDAS_DIAGNOSIS_GROUPS[correct_group] if d != correct_dx]
+             if same_group:
+                 distractors.add(rng.choice(same_group))
+
+         # Fill the rest from other groups, weighted by dataset frequency
+         other_dx = [d for d in ALL_DIAGNOSES if d != correct_dx and d not in distractors]
+         weights = [dx_counter.get(d, 1) for d in other_dx]
+         total_w = sum(weights)
+         weights = [w / total_w for w in weights]
+
+         while len(distractors) < n - 1 and other_dx:
+             pick = rng.choices(other_dx, weights=weights, k=1)[0]
+             distractors.add(pick)
+             idx = other_dx.index(pick)
+             other_dx.pop(idx)
+             weights.pop(idx)
+             if weights:
+                 total_w = sum(weights)
+                 weights = [w / total_w for w in weights]
+
+         candidates = [correct_dx] + list(distractors)
+         rng.shuffle(candidates)
+         return candidates[:n]
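The per-case deterministic RNG is what makes candidate lists reproducible across runs and experimental conditions: seeding from a hash of the case ID gives an independent, fixed stream per case. A standalone check of that property (re-creating `_case_rng` so it runs without the repo):

```python
# Standalone sketch of the per-case deterministic RNG: the same case_id
# always produces the same shuffle, so candidate order is reproducible.
import hashlib
import random


def case_rng(case_id: str) -> random.Random:
    # Seed from the first 8 hex digits of SHA-256(case_id), as in midas.py
    seed = int(hashlib.sha256(case_id.encode()).hexdigest()[:8], 16)
    return random.Random(seed)


candidates = ["melanoma_invasive", "melanocytic_nevus", "seborrheic_keratosis",
              "basal_cell_carcinoma", "dysplastic_nevus"]

a = candidates[:]
case_rng("midas_42").shuffle(a)
b = candidates[:]
case_rng("midas_42").shuffle(b)
```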
datasets/nejm.py ADDED
@@ -0,0 +1,440 @@
+ """
+ NEJM Image Challenge Dataset Loader.
+
+ Expects the cx0/nejm-image-challenge dataset structure:
+     nejm/
+     ├── data.json              (or nejm_data.json)
+     │     Each entry: {date, image_url, prompt (clinical vignette),
+     │                  options [A..E], correct_answer, votes}
+     ├── images/                (downloaded images, named by date YYYYMMDD.jpg)
+     └── parsed_vignettes.json  (pre-parsed structured fields, optional)
+
+ The clinical vignette is decomposed into five structured text channels
+ (demographics, chief_complaint, and medical_history are always given;
+ exam_findings and investigations are requestable) using LLM-based
+ parsing (see scripts/parse_nejm_vignettes.py).
+ """
+ import json
+ import logging
+ import random
+ import re
+ from pathlib import Path
+
+ from .base import DatasetBase, MedicalCase, ChannelData
+ from api_client import encode_image_to_base64
+ import config
+
+ logger = logging.getLogger(__name__)
+
+ # ---- Vignette parsing schema ----
+ VIGNETTE_FIELDS = [
+     "demographics",
+     "chief_complaint",
+     "medical_history",
+     "exam_findings",
+     "investigations",
+ ]
+
+ VIGNETTE_PARSE_PROMPT = """You are a medical data extraction system. Parse the following clinical \
+ vignette into exactly 5 structured fields. Extract ONLY information that is explicitly stated. \
+ If a field has no relevant information, write "Not mentioned."
+
+ FIELDS:
+ 1. demographics: Patient age, sex, race/ethnicity if stated.
+ 2. chief_complaint: The primary presenting symptom(s) and their duration.
+ 3. medical_history: Past medical conditions, medications, surgical history, family history, social history (smoking, alcohol, etc.).
+ 4. exam_findings: Physical examination findings, vital signs.
+ 5. investigations: Laboratory results, imaging findings, test results (anything with numbers or test names).
+
+ CLINICAL VIGNETTE:
+ {vignette}
+
+ Respond in EXACTLY this JSON format (no markdown, no extra text):
+ {{"demographics": "...", "chief_complaint": "...", "medical_history": "...", "exam_findings": "...", "investigations": "..."}}"""
+
+
+ class NEJMDataset(DatasetBase):
+     """Loader for the NEJM Image Challenge dataset."""
+
+     def __init__(
+         self,
+         data_dir: str | Path | None = None,
+         split: str = "test",
+         vlm_client=None,
+         use_cached_parse: bool = True,
+     ):
+         super().__init__(data_dir or config.DATASET_PATHS["nejm"], split)
+         self.vlm_client = vlm_client
+         self.use_cached_parse = use_cached_parse
+         self._parsed_cache_path = self.data_dir / "parsed_vignettes.json"
+
+     def get_name(self) -> str:
+         return "nejm"
+
+     def load(self) -> list[MedicalCase]:
+         logger.info(f"Loading NEJM dataset from {self.data_dir}")
+
+         # ---- Load raw data ----
+         raw_data = self._load_raw_data()
+         if not raw_data:
+             return []
+         logger.info(f"Found {len(raw_data)} NEJM cases")
+
+         # ---- Load or create parsed vignettes ----
+         parsed = self._load_or_parse_vignettes(raw_data)
+
+         # ---- Build cases ----
+         self.cases = []
+         for entry in raw_data:
+             case_id = entry.get("date", entry.get("id", "unknown"))
+             case = self._build_case(entry, parsed.get(case_id, {}))
+             if case is not None:
+                 self.cases.append(case)
+
+         logger.info(f"Loaded {len(self.cases)} NEJM cases")
+         return self.cases
+
+     def _load_raw_data(self) -> list[dict]:
+         """Load the raw NEJM dataset JSON."""
+         for name in ["data.json", "nejm_data.json", "nejm.json", "dataset.json"]:
+             p = self.data_dir / name
+             if p.exists():
+                 with open(p, encoding="utf-8") as f:
+                     data = json.load(f)
+                 if isinstance(data, dict):
+                     # Handle {date: entry} format
+                     return [{"date": k, **v} if isinstance(v, dict) else v
+                             for k, v in data.items()]
+                 return data
+         # Fall back to the first JSON file found
+         jsons = list(self.data_dir.glob("*.json"))
+         if jsons:
+             with open(jsons[0], encoding="utf-8") as f:
+                 return json.load(f)
+         logger.error(f"No data file found in {self.data_dir}")
+         return []
+
+     def _load_or_parse_vignettes(self, raw_data: list[dict]) -> dict:
+         """Load cached parsed vignettes or parse them with the LLM."""
+         # Try the cache first
+         if self.use_cached_parse and self._parsed_cache_path.exists():
+             logger.info(f"Loading cached vignette parses from {self._parsed_cache_path}")
+             with open(self._parsed_cache_path, encoding="utf-8") as f:
+                 return json.load(f)
+
+         # Parse with the LLM if a client is available
+         if self.vlm_client is not None:
+             logger.info("Parsing vignettes with LLM (this may take a while)...")
+             parsed = {}
+             for entry in raw_data:
+                 case_id = entry.get("date", entry.get("id", "unknown"))
+                 vignette = entry.get("question", entry.get("prompt", entry.get("vignette", "")))
+                 if vignette:
+                     parsed[case_id] = self._parse_vignette_with_llm(vignette)
+             # Cache results
+             with open(self._parsed_cache_path, "w", encoding="utf-8") as f:
+                 json.dump(parsed, f, indent=2)
+             logger.info(f"Cached {len(parsed)} parsed vignettes")
+             return parsed
+
+         # Fallback: rule-based parsing
+         logger.info("No LLM client available. Using rule-based vignette parsing (less accurate).")
+         parsed = {}
+         for entry in raw_data:
+             case_id = entry.get("date", entry.get("id", "unknown"))
+             vignette = entry.get("question", entry.get("prompt", entry.get("vignette", "")))
+             if vignette:
+                 parsed[case_id] = self._parse_vignette_rules(vignette)
+         return parsed
+
+     def _parse_vignette_with_llm(self, vignette: str) -> dict:
+         """Parse a single vignette using the LLM API."""
+         prompt = VIGNETTE_PARSE_PROMPT.format(vignette=vignette)
+         try:
+             response = self.vlm_client.call_with_retry(
+                 system_prompt="You are a medical data extraction system. Respond only with valid JSON.",
+                 user_text=prompt,
+                 images=None,
+                 temperature=0.0,
+                 max_tokens=1024,
+             )
+             # Parse JSON from the response
+             text = response.text.strip()
+             # Strip markdown code fences if present
+             text = re.sub(r"^```(?:json)?\s*", "", text)
+             text = re.sub(r"\s*```$", "", text)
+             parsed = json.loads(text)
+             # Validate expected fields
166
+ for field in VIGNETTE_FIELDS:
167
+ if field not in parsed:
168
+ parsed[field] = "Not mentioned."
169
+ return parsed
170
+ except Exception as e:
171
+ logger.warning(f"LLM vignette parsing failed: {e}. Falling back to rules.")
172
+ return self._parse_vignette_rules(vignette)
173
+
174
+ def _parse_vignette_rules(self, vignette: str) -> dict:
175
+ """
176
+ Rule-based fallback for vignette parsing.
177
+ Uses heuristic sentence classification.
178
+ """
179
+ result = {f: "" for f in VIGNETTE_FIELDS}
180
+ sentences = re.split(r'(?<=[.!?])\s+', vignette)
181
+
182
+ # Patterns for classification
183
+ demo_pattern = re.compile(
184
+ r'\b(\d{1,3})[-\s]year[-\s]old\b|'
185
+ r'\b(male|female|man|woman|boy|girl)\b',
186
+ re.IGNORECASE,
187
+ )
188
+ complaint_pattern = re.compile(
189
+ r'\bpresent(?:s|ed|ing)\b|\bcomplain(?:s|ed|ing)\b|\breport(?:s|ed|ing)\b|'
190
+ r'\bseek(?:s|ing)\b|\badmitted\b',
191
+ re.IGNORECASE,
192
+ )
193
+ history_pattern = re.compile(
194
+ r'\bhistory\b|\bprevious(?:ly)?\b|\bmedication\b|\btaking\b|\bdiagnosed\b|'
195
+ r'\bsmok(?:es|ing|er)\b|\balcohol\b|\bfamily\b|\bsurgery\b',
196
+ re.IGNORECASE,
197
+ )
198
+ exam_pattern = re.compile(
199
+ r'\bexamination\b|\bexam\b|\bpalpat(?:ion|ed)\b|\bauscult(?:ation|ed)\b|'
200
+ r'\bvital\b|\bblood\s+pressure\b|\bheart\s+rate\b|\btemperature\b|'
201
+ r'\bappears\b|\btender\b|\bswollen\b|\berythema\b',
202
+ re.IGNORECASE,
203
+ )
204
+ invest_pattern = re.compile(
205
+ r'\b(?:hemoglobin|WBC|platelet|creatinine|BUN|glucose|sodium|potassium)\b|'
206
+ r'\b(?:CT|MRI|X[-\s]?ray|ultrasound|ECG|EKG|biopsy)\b|'
207
+ r'\b\d+\.?\d*\s*(?:mg|g|mL|mmol|mEq|U|IU|mmHg|\/dL|\/L)\b|'
208
+ r'\blaboratory\b|\blab(?:s)?\b|\btest\b|\blevel\b|\bfinding\b',
209
+ re.IGNORECASE,
210
+ )
211
+
212
+ for sent in sentences:
213
+ sent = sent.strip()
214
+ if not sent:
215
+ continue
216
+
217
+ # Demographics: typically the first sentence
218
+ if demo_pattern.search(sent) and not result["demographics"]:
219
+ result["demographics"] = sent
220
+ continue
221
+
222
+ # Check each pattern (a sentence can match multiple, take first)
223
+ matched = False
224
+ for field, pattern in [
225
+ ("investigations", invest_pattern),
226
+ ("exam_findings", exam_pattern),
227
+ ("medical_history", history_pattern),
228
+ ("chief_complaint", complaint_pattern),
229
+ ]:
230
+ if pattern.search(sent):
231
+ if result[field]:
232
+ result[field] += " " + sent
233
+ else:
234
+ result[field] = sent
235
+ matched = True
236
+ break
237
+
238
+ # Unmatched sentences go to chief_complaint as default
239
+ if not matched:
240
+ if result["chief_complaint"]:
241
+ result["chief_complaint"] += " " + sent
242
+ else:
243
+ result["chief_complaint"] = sent
244
+
245
+ # Replace empty fields
246
+ for field in VIGNETTE_FIELDS:
247
+ if not result[field].strip():
248
+ result[field] = "Not mentioned."
249
+
250
+ return result
251
+
252
+ @staticmethod
253
+ def _date_to_yyyymmdd(date_str: str) -> str | None:
254
+ """Convert 'apr-01-2010' style date to '20100401' for image lookup."""
255
+ from datetime import datetime
256
+ for fmt in ("%b-%d-%Y", "%B-%d-%Y", "%Y-%m-%d", "%Y%m%d"):
257
+ try:
258
+ dt = datetime.strptime(date_str, fmt)
259
+ return dt.strftime("%Y%m%d")
260
+ except ValueError:
261
+ continue
262
+ return None
263
+
+    def _build_case(self, entry: dict, parsed_vignette: dict) -> MedicalCase | None:
+        """Convert a raw NEJM entry + parsed vignette into a MedicalCase."""
+        case_id = entry.get("date", entry.get("id", "unknown"))
+
+        # ---- Find image ----
+        img_b64 = None
+        img_dir = self.data_dir / "images"
+        # Build candidate filenames: original case_id + YYYYMMDD conversion
+        name_candidates = [case_id]
+        yyyymmdd = self._date_to_yyyymmdd(case_id)
+        if yyyymmdd:
+            name_candidates.append(yyyymmdd)
+
+        if img_dir.exists():
+            for name in name_candidates:
+                for ext in [".jpg", ".jpeg", ".png"]:
+                    p = img_dir / f"{name}{ext}"
+                    if p.exists():
+                        try:
+                            img_b64 = encode_image_to_base64(p)
+                        except Exception:
+                            pass
+                        break
+                if img_b64 is not None:
+                    break
+            if img_b64 is None:
+                # Glob for any match
+                for name in name_candidates:
+                    matches = list(img_dir.glob(f"*{name}*"))
+                    if matches:
+                        try:
+                            img_b64 = encode_image_to_base64(matches[0])
+                        except Exception:
+                            pass
+                        break
+
+        # ---- Build all available channels, then split by config ----
+        all_channels = {}
+        if img_b64 is not None:
+            image_meta = config.get_channel_definition("nejm", "image")
+            all_channels["image"] = ChannelData(
+                name="image",
+                channel_type="image",
+                description="The primary diagnostic image",
+                value=img_b64,
+                cost=float(image_meta.get("cost", 0.0)),
+                tier=image_meta.get("tier", "unknown"),
+                always_given=bool(image_meta.get("always_given", False)),
+            )
+
+        field_descriptions = {
+            "demographics": "Patient age, sex, and ethnicity if mentioned",
+            "chief_complaint": "The presenting symptom(s) and their duration",
+            "medical_history": "Past medical conditions, medications, family and social history",
+            "exam_findings": "Physical examination results and observations",
+            "investigations": "Laboratory values, prior imaging results, and test outcomes",
+        }
+
+        for field in VIGNETTE_FIELDS:
+            value = parsed_vignette.get(field, "Not mentioned.")
+            field_meta = config.get_channel_definition("nejm", field)
+            # Use the parsed text if present; otherwise an explicit placeholder
+            text_value = (
+                value
+                if value and value.strip() != "Not mentioned."
+                else "No additional information available for this category."
+            )
+            all_channels[field] = ChannelData(
+                name=field,
+                channel_type="text",
+                description=field_descriptions.get(field, field),
+                value=text_value,
+                cost=float(field_meta.get("cost", 0.0)),
+                tier=field_meta.get("tier", "unknown"),
+                always_given=bool(field_meta.get("always_given", False)),
+            )
+
+        initial_channels = {
+            name: ch for name, ch in all_channels.items() if ch.always_given
+        }
+        requestable = {
+            name: ch for name, ch in all_channels.items() if not ch.always_given
+        }
+
+        if not initial_channels and not requestable:
+            logger.debug(f"Skipping NEJM {case_id}: no usable channels found")
+            return None
+
+        # ---- Candidates: the 5 MCQ options ----
+        options = entry.get("options", [])
+        correct = entry.get("correct_answer", entry.get("answer", ""))
+
+        # Handle flat option_A..option_E keys (cx0/nejm-image-challenge format)
+        if not options:
+            flat_options = {}
+            for letter in "ABCDE":
+                val = entry.get(f"option_{letter}", "")
+                if val:
+                    flat_options[letter] = val
+            if flat_options:
+                options = flat_options
+
+        if isinstance(options, dict):
+            # {A: "...", B: "...", ...}
+            candidates = [f"{k}. {v}" for k, v in sorted(options.items())]
+            gt_label = None
+            for k, v in sorted(options.items()):
+                if k == correct:
+                    gt_label = f"{k}. {v}"
+                    break
+            if gt_label is None:
+                gt_label = candidates[0] if candidates else ""
+        elif isinstance(options, list) and options:
+            candidates = options
+            if isinstance(correct, int):
+                gt_label = options[correct] if correct < len(options) else options[0]
+            elif isinstance(correct, str) and len(correct) == 1:
+                # Letter answer (A=0, B=1, ...)
+                idx = ord(correct.upper()) - ord("A")
+                gt_label = options[idx] if idx < len(options) else options[0]
+            else:
+                gt_label = correct
+        else:
+            candidates = [correct] if correct else ["Unknown"]
+            gt_label = correct
+
+        # ---- Votes (physician response distribution) ----
+        votes = entry.get("votes", {})
+        # Handle flat vote keys (option_A_votes, etc.)
+        if not votes:
+            for letter in "ABCDE":
+                val = entry.get(f"option_{letter}_votes", "")
+                if val:
+                    votes[letter] = val
+
+        return MedicalCase(
+            case_id=f"nejm_{case_id}",
+            dataset="nejm",
+            initial_channels=initial_channels,
+            requestable_channels=requestable,
+            candidates=candidates,
+            ground_truth=gt_label,
+            ground_truth_rank=(candidates.index(gt_label) if gt_label in candidates else 0),
+            metadata={
+                "date": case_id,
+                "votes": votes,
+                "full_vignette": entry.get("question", entry.get("prompt", entry.get("vignette", ""))),
+                "parsed_fields": parsed_vignette,
+            },
+        )
+
+    def get_human_difficulty(self, case: MedicalCase) -> float | None:
+        """
+        Compute human difficulty score from physician vote distribution.
+
+        Returns: proportion of physicians who answered correctly (0-1),
+        or None if votes unavailable.
+        """
+        votes = case.metadata.get("votes", {})
+        if not votes:
+            return None
+        # votes might be {A: 0.12, B: 0.65, ...} or {A: 120, B: 650, ...}
+        total = sum(float(v) for v in votes.values())
+        if total == 0:
+            return None
+        # Find the correct answer key by its option-letter prefix; a plain
+        # substring test would false-positive on letters inside words.
+        gt = case.ground_truth
+        for key, val in votes.items():
+            if gt == key or gt.startswith(f"{key}.") or gt.startswith(f"{key} "):
+                return float(val) / total if total > 1 else float(val)
+        return None
datasets/olives.py ADDED
@@ -0,0 +1,470 @@
+"""
+OLIVES Dataset Loader.
+
+Adapted for the actual Zenodo OLIVES dataset structure:
+    data/
+    ├── OLIVES/OLIVES/
+    │   ├── Prime_FULL/Prime_FULL/      (DR patients — OCT B-scans)
+    │   │   └── <patient_id>/<visit>/<eye>/*.png
+    │   └── TREX_DME/TREX DME/          (DME patients — OCT B-scans)
+    │       └── <arm>/<patient_id>/<visit>/<eye>/*.tif
+    └── OLIVES_Dataset_Labels/OLIVES_Dataset_Labels/
+        └── full_labels/Biomarker_Clinical_Data_Images.csv
+
+Task: Biomarker profile ranking.
+    - Given an OCT B-scan, rank candidate biomarker profiles
+    - Each profile is a subset of the 16 annotated biomarkers
+    - Correct profile = actual biomarker vector for this eye
+    - Distractors = profiles from other eyes
+
+Channels:
+    - Initial: single OCT B-scan (middle slice)
+    - Requestable: additional OCT slices, clinical measurements (BCVA/CST),
+      biomarker hints (fundus-visible subset), treatment history
+"""
+import csv
+import hashlib
+import logging
+import random
+from pathlib import Path
+from collections import defaultdict
+
+import numpy as np
+
+from .base import DatasetBase, MedicalCase, ChannelData
+from api_client import encode_image_to_base64
+import config
+
+logger = logging.getLogger(__name__)
+
+# The biomarker columns as they appear in the CSV
+OLIVES_CSV_BIOMARKERS = {
+    "Fluid (IRF)": "fluid_irf",
+    "Fluid (SRF)": "fluid_srf",
+    "DRT/ME": "drt_me",
+    "SHRM": "shrm",
+    "Preretinal tissue/hemorrhage": "preretinal_tissue",
+    "Vitreous debris": "vitreous_debris",
+    "DRIL": "dril",
+    "Disruption of EZ": "ez_disruption",
+    "IR hemorrhages": "hemorrhage",
+    "IR HRF": "ir_hrf",
+    "Disruption of RPE": "rpe_disruption",
+    "PED (serous)": "ped_serous",
+    "Atrophy / thinning of retinal layers": "atrophy",
+    "VMT": "vmt",
+    "Partially attached vitreous face": "partial_vitreous",
+    "Fully attached vitreous face": "full_vitreous",
+}
+
+# Canonical biomarker names for profiles
+OLIVES_BIOMARKERS = sorted(OLIVES_CSV_BIOMARKERS.values())
+
+
+def biomarker_vector_to_profile_string(vector: dict[str, bool]) -> str:
+    """Convert a biomarker dict to a human-readable profile string."""
+    present = [
+        name.replace("_", " ").title()
+        for name, val in sorted(vector.items()) if val
+    ]
+    if not present:
+        return "No biomarkers detected"
+    return "Present biomarkers: " + ", ".join(present)
+
+
+def compute_profile_distance(profile_a: dict, profile_b: dict) -> int:
+    """Hamming distance between two biomarker profiles."""
+    dist = 0
+    for key in OLIVES_BIOMARKERS:
+        if profile_a.get(key, False) != profile_b.get(key, False):
+            dist += 1
+    return dist
+
+
+def _case_rng(case_id: str) -> random.Random:
+    seed = int(hashlib.sha256(case_id.encode()).hexdigest()[:8], 16)
+    return random.Random(seed)
+
+
+class OLIVESDataset(DatasetBase):
+    """Loader for OLIVES ophthalmology dataset."""
+
+    def __init__(
+        self,
+        data_dir: str | Path = None,
+        split: str = "test",
+        n_candidates: int = 5,
+        n_oct_samples: int = 3,
+    ):
+        super().__init__(data_dir or config.DATASET_PATHS["olives"], split)
+        self.n_candidates = n_candidates
+        self.n_oct_samples = n_oct_samples
+
+    def get_name(self) -> str:
+        return "olives"
+
+    def load(self) -> list[MedicalCase]:
+        logger.info(f"Loading OLIVES dataset from {self.data_dir}")
+
+        # ---- Find the CSV ----
+        csv_path = self._find_csv()
+        if csv_path is None:
+            logger.error("No biomarker CSV found")
+            return []
+
+        # ---- Load records ----
+        with open(csv_path, newline="", encoding="utf-8-sig") as f:
+            rows = list(csv.DictReader(f))
+        logger.info(f"Found {len(rows)} records in {csv_path.name}")
+
+        # ---- Find the image root ----
+        image_root = self._find_image_root()
+        if image_root is None:
+            logger.error("No image directory found")
+            return []
+        logger.info(f"Image root: {image_root}")
+
+        # ---- Group by eye ----
+        eye_groups = defaultdict(list)
+        for r in rows:
+            pid = r.get("Patient_ID", "")
+            path_str = r.get(
+                "Path (Trial/Arm/Folder/Visit/Eye/Image Name)", ""
+            )
+            parts = path_str.strip("/").split("/")
+            if len(parts) >= 5:
+                eye = parts[4]  # OD or OS
+            else:
+                eye = r.get("Eye_ID", "unknown")
+            eye_key = f"{pid}_{eye}"
+            r["_eye_key"] = eye_key
+            r["_path_parts"] = parts
+            eye_groups[eye_key].append(r)
+
+        logger.info(f"Found {len(eye_groups)} unique eyes")
+
+        # ---- Build biomarker profiles ----
+        all_profiles = {}
+        for eye_key, records in eye_groups.items():
+            latest = records[-1]
+            all_profiles[eye_key] = self._extract_biomarker_vector(latest)
+
+        # ---- Build cases ----
+        self.cases = []
+        for eye_key, records in eye_groups.items():
+            case = self._build_case(
+                eye_key, records, all_profiles, image_root
+            )
+            if case is not None:
+                self.cases.append(case)
+
+        logger.info(f"Loaded {len(self.cases)} OLIVES cases")
+        return self.cases
+
+    def _find_csv(self) -> Path | None:
+        """Find the biomarker CSV in various locations."""
+        search_paths = [
+            self.data_dir / "Biomarker_Clinical_Data_Images.csv",
+            self.data_dir / "OLIVES_Dataset_Labels" / "OLIVES_Dataset_Labels" / "full_labels" / "Biomarker_Clinical_Data_Images.csv",
+            self.data_dir.parent / "OLIVES_Dataset_Labels" / "OLIVES_Dataset_Labels" / "full_labels" / "Biomarker_Clinical_Data_Images.csv",
+        ]
+        for p in search_paths:
+            if p.exists():
+                return p
+        # Glob fallback
+        csvs = list(self.data_dir.rglob("Biomarker*Clinical*.csv"))
+        if csvs:
+            return csvs[0]
+        # Check parent
+        csvs = list(self.data_dir.parent.rglob("Biomarker*Clinical*.csv"))
+        if csvs:
+            return csvs[0]
+        return None
+
+    def _find_image_root(self) -> Path | None:
+        """Find the root directory containing Prime_FULL and TREX_DME."""
+        search = [
+            self.data_dir / "OLIVES",
+            self.data_dir / "OLIVES" / "OLIVES",
+            self.data_dir,
+        ]
+        for d in search:
+            if (d / "Prime_FULL").exists() or (d / "TREX_DME").exists():
+                return d
+        # Search deeper
+        for p in self.data_dir.rglob("Prime_FULL"):
+            return p.parent
+        return None
+
+    def _extract_biomarker_vector(self, record: dict) -> dict[str, bool]:
+        """Extract biomarker vector from a CSV row."""
+        vector = {}
+        for csv_col, canonical_name in OLIVES_CSV_BIOMARKERS.items():
+            val = record.get(csv_col, "0")
+            if isinstance(val, str):
+                vector[canonical_name] = val.strip() == "1"
+            else:
+                vector[canonical_name] = bool(int(float(val or 0)))
+        return vector
+
+    def _find_oct_images(
+        self, records: list[dict], image_root: Path, n: int = 3
+    ) -> list[Path]:
+        """Find OCT B-scan images for an eye."""
+        # Try to locate images from the path in the CSV
+        for r in records:
+            path_str = r.get(
+                "Path (Trial/Arm/Folder/Visit/Eye/Image Name)", ""
+            )
+            parts = path_str.strip("/").split("/")
+            if len(parts) < 5:
+                continue
+
+            # Construct search directory (without the image filename)
+            # Path format: /Trial/Arm/Patient/Visit/Eye/Image
+            trial = parts[0]
+            remaining = "/".join(parts[1:-1])
+
+            search_dirs = [
+                image_root / trial / remaining,
+                image_root / parts[0].replace(" ", "_") / remaining,
+            ]
+
+            # For Prime: Prime_FULL/Prime_FULL/Patient/Visit/Eye/
+            if "Prime" in trial or "prime" in trial:
+                pid = parts[2] if len(parts) > 2 else ""
+                visit = parts[3] if len(parts) > 3 else ""
+                eye = parts[4] if len(parts) > 4 else ""
+                search_dirs.extend([
+                    image_root / "Prime_FULL" / "Prime_FULL" / pid / visit / eye,
+                    image_root / "Prime_FULL" / pid / visit / eye,
+                ])
+
+            # For TREX: TREX_DME/TREX DME/Arm/Patient/Visit/Eye/
+            if "TREX" in trial:
+                arm = parts[1] if len(parts) > 1 else ""
+                pid = parts[2] if len(parts) > 2 else ""
+                visit = parts[3] if len(parts) > 3 else ""
+                eye = parts[4] if len(parts) > 4 else ""
+                search_dirs.extend([
+                    image_root / "TREX_DME" / "TREX DME" / arm / pid / visit / eye,
+                    image_root / "TREX_DME" / trial / arm / pid / visit / eye,
+                ])
+
+            for d in search_dirs:
+                if not d.exists():
+                    continue
+                images = sorted(
+                    list(d.glob("*.png")) + list(d.glob("*.tif"))
+                    + list(d.glob("*.jpg"))
+                )
+                if images:
+                    # Sample N evenly spaced scans
+                    if len(images) <= n:
+                        return images
+                    indices = np.linspace(
+                        0, len(images) - 1, n, dtype=int
+                    )
+                    return [images[i] for i in indices]
+
+        return []
+
+    def _build_case(
+        self,
+        eye_key: str,
+        records: list[dict],
+        all_profiles: dict[str, dict[str, bool]],
+        image_root: Path,
+    ) -> MedicalCase | None:
+        """Convert an eye's records into a MedicalCase."""
+        latest = records[-1]
+
+        # ---- Find OCT images ----
+        oct_images = self._find_oct_images(records, image_root, self.n_oct_samples + 1)
+        if not oct_images:
+            logger.debug(f"Skipping eye {eye_key}: no images found")
+            return None
+
+        # Build all available channels, then split by config
+        all_channels = {}
+
+        # Use middle scan as canonical first-line OCT, rest as optional extras
+        mid_idx = len(oct_images) // 2
+        initial_image = oct_images[mid_idx]
+        additional_images = [
+            img for i, img in enumerate(oct_images) if i != mid_idx
+        ]
+
+        try:
+            initial_b64 = encode_image_to_base64(initial_image)
+        except Exception as e:
+            logger.debug(f"Skipping eye {eye_key}: encode failed: {e}")
+            return None
+
+        oct_meta = config.get_channel_definition("olives", "oct_scan")
+        all_channels["oct_scan"] = ChannelData(
+            name="oct_scan",
+            channel_type="image",
+            description="OCT B-scan showing retinal cross-section",
+            value=initial_b64,
+            image_path=initial_image,
+            cost=float(oct_meta.get("cost", 0.0)),
+            tier=oct_meta.get("tier", "unknown"),
+            always_given=bool(oct_meta.get("always_given", False)),
+        )
+
+        # Additional OCT slices
+        if additional_images:
+            try:
+                add_b64 = [encode_image_to_base64(p) for p in additional_images]
+                ch_meta = config.get_channel_definition("olives", "additional_oct")
+                all_channels["additional_oct"] = ChannelData(
+                    name="additional_oct",
+                    channel_type="image",
+                    description="Additional OCT B-scans from different retinal locations",
+                    value=add_b64,
+                    cost=float(ch_meta.get("cost", 0.0)),
+                    tier=ch_meta.get("tier", "unknown"),
+                    always_given=bool(ch_meta.get("always_given", False)),
+                )
+            except Exception:
+                pass
+
+        # Clinical measurements (BCVA and CST)
+        bcva = latest.get("BCVA", "")
+        cst = latest.get("CST", "")
+        if bcva or cst:
+            parts = []
+            if bcva:
+                parts.append(f"BCVA (logMAR): {bcva}")
+            if cst:
+                parts.append(f"CST: {cst} um")
+            ch_meta = config.get_channel_definition("olives", "clinical_measurements")
+            all_channels["clinical_measurements"] = ChannelData(
+                name="clinical_measurements",
+                channel_type="text",
+                description="Visual acuity (BCVA) and retinal thickness (CST)",
+                value="; ".join(parts),
+                cost=float(ch_meta.get("cost", 0.0)),
+                tier=ch_meta.get("tier", "unknown"),
+                always_given=bool(ch_meta.get("always_given", False)),
+            )
+
+        # Biomarker hints (subset — only the most obvious ones)
+        biomarker_vec = all_profiles[eye_key]
+        obvious_markers = ["fluid_irf", "fluid_srf", "hemorrhage", "drt_me"]
+        hint_parts = []
+        for m in obvious_markers:
+            if m in biomarker_vec:
+                status = "Present" if biomarker_vec[m] else "Not detected"
+                hint_parts.append(
+                    f"{m.replace('_', ' ').title()}: {status}"
+                )
+        if hint_parts:
+            ch_meta = config.get_channel_definition("olives", "biomarker_hints")
+            all_channels["biomarker_hints"] = ChannelData(
+                name="biomarker_hints",
+                channel_type="text",
+                description="Partial biomarker annotations (subset)",
+                value="; ".join(hint_parts),
+                cost=float(ch_meta.get("cost", 0.0)),
+                tier=ch_meta.get("tier", "unknown"),
+                always_given=bool(ch_meta.get("always_given", False)),
+            )
+
+        # Disease type hint
+        path_str = latest.get(
+            "Path (Trial/Arm/Folder/Visit/Eye/Image Name)", ""
+        )
+        disease = "DME" if "TREX" in path_str else "DR"
+        ch_meta = config.get_channel_definition("olives", "disease_context")
+        all_channels["disease_context"] = ChannelData(
+            name="disease_context",
+            channel_type="text",
+            description="Disease type and treatment context",
+            value=f"Disease: {disease}",
+            cost=float(ch_meta.get("cost", 0.0)),
+            tier=ch_meta.get("tier", "unknown"),
+            always_given=bool(ch_meta.get("always_given", False)),
+        )
+
+        initial_channels = {
+            name: ch for name, ch in all_channels.items() if ch.always_given
+        }
+        requestable = {
+            name: ch for name, ch in all_channels.items() if not ch.always_given
+        }
+
+        # ---- Build candidates ----
+        case_id = f"olives_{eye_key}"
+        correct_profile = biomarker_vector_to_profile_string(biomarker_vec)
+        candidates = self._generate_profile_candidates(
+            eye_key, biomarker_vec, all_profiles, case_id
+        )
+
+        if correct_profile not in candidates:
+            candidates[0] = correct_profile
+            rng = _case_rng(case_id)
+            rng.shuffle(candidates)
+
+        return MedicalCase(
+            case_id=case_id,
+            dataset="olives",
+            initial_channels=initial_channels,
+            requestable_channels=requestable,
+            candidates=candidates,
+            ground_truth=correct_profile,
+            ground_truth_rank=(
+                candidates.index(correct_profile)
+                if correct_profile in candidates else 0
+            ),
+            metadata={
+                "eye_id": eye_key,
+                "disease": disease,
+                "biomarker_vector": biomarker_vec,
+            },
+        )
+
+ def _generate_profile_candidates(
430
+ self,
431
+ eye_id: str,
432
+ correct_vec: dict[str, bool],
433
+ all_profiles: dict[str, dict[str, bool]],
434
+ case_id: str,
435
+ ) -> list[str]:
436
+ """Generate biomarker profile candidates."""
437
+ n = self.n_candidates
438
+ rng = _case_rng(case_id)
439
+ correct_str = biomarker_vector_to_profile_string(correct_vec)
440
+
441
+ scored = []
442
+ for eid, vec in all_profiles.items():
443
+ if eid == eye_id:
444
+ continue
445
+ dist = compute_profile_distance(correct_vec, vec)
446
+ profile_str = biomarker_vector_to_profile_string(vec)
447
+ if profile_str != correct_str:
448
+ scored.append((dist, profile_str, vec))
449
+
450
+ scored.sort(key=lambda x: x[0])
451
+
452
+ distractors = []
453
+ if scored:
454
+ distractors.append(scored[0][1]) # Hard distractor
455
+ if len(scored) > 1:
456
+ distractors.append(scored[-1][1]) # Easy distractor
457
+ mid_pool = scored[len(scored) // 4: 3 * len(scored) // 4]
458
+ rng.shuffle(mid_pool)
459
+ for dist, prof, vec in mid_pool:
460
+ if prof not in distractors and len(distractors) < n - 1:
461
+ distractors.append(prof)
462
+
463
+ while len(distractors) < n - 1 and scored:
464
+ pick = rng.choice(scored)
465
+ if pick[1] not in distractors:
466
+ distractors.append(pick[1])
467
+
468
+ candidates = [correct_str] + distractors[:n - 1]
469
+ rng.shuffle(candidates)
470
+ return candidates
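The profile helpers at the top of this file are pure functions, which makes the candidate-generation logic easy to sanity-check. A standalone sketch (re-implemented here with hypothetical names, not imported from the module):

```python
def profile_string(vector: dict[str, bool]) -> str:
    # Human-readable summary of the biomarkers marked present.
    present = [name.replace("_", " ").title()
               for name, val in sorted(vector.items()) if val]
    if not present:
        return "No biomarkers detected"
    return "Present biomarkers: " + ", ".join(present)


def hamming(a: dict[str, bool], b: dict[str, bool], keys: list[str]) -> int:
    # Count biomarkers on which the two profiles disagree.
    return sum(a.get(k, False) != b.get(k, False) for k in keys)


keys = ["fluid_irf", "fluid_srf", "drt_me"]
a = {"fluid_irf": True, "fluid_srf": False, "drt_me": True}
b = {"fluid_irf": True, "fluid_srf": True, "drt_me": False}
print(profile_string(a))    # → Present biomarkers: Drt Me, Fluid Irf
print(hamming(a, b, keys))  # → 2
```

The Hamming distance is what makes the "hard distractor" hard: the profile from another eye with the smallest distance differs from the correct answer in the fewest biomarkers.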
demo_cases/chest_xray_ipf.png ADDED

Git LFS Details

  • SHA256: 68275cef7e60ce4fa6c2402c4f6a18cd70fa32056300979063f3acd221335ea3
  • Pointer size: 131 Bytes
  • Size of remote file: 164 kB
demo_cases/ct_pulmonary_pe.png ADDED

Git LFS Details

  • SHA256: 341515b895e6bb9c1a226b80a4c8373e744522126643a68aa2137e2fe60de263
  • Pointer size: 131 Bytes
  • Size of remote file: 239 kB
demo_cases/fundus_dme.png ADDED

Git LFS Details

  • SHA256: 663d0080e4f79c1b3a8b91d7c514cfa892852a9815bf717ad5c144547522b188
  • Pointer size: 131 Bytes
  • Size of remote file: 283 kB
demo_cases/oct_bscan_dme.png ADDED

Git LFS Details

  • SHA256: 31c2470dc3ce6be6d5ab876bdcaeba8353a70843db7cf278c4fa56e784225960
  • Pointer size: 131 Bytes
  • Size of remote file: 207 kB
demo_cases/skin_lesion_dermoscopy.png ADDED

Git LFS Details

  • SHA256: 9de6739f5d6a4d3771fde0d8c5f473fd652ab66a23868986d51b24f96dfaf9f7
  • Pointer size: 131 Bytes
  • Size of remote file: 194 kB
evaluation/__init__.py ADDED
@@ -0,0 +1,455 @@
+"""
+Evaluation Metrics for ActiveMedAgent.
+
+Unified metrics across all three datasets:
+- MRR (Mean Reciprocal Rank)
+- Acquisition Efficiency (normalized improvement)
+- Top-1 Accuracy
+- Acquisition Precision
+- Uncertainty Calibration (ECE-style)
+- Information-Theoretic Metrics (entropy, IG, VoI)
+- Bootstrap confidence intervals
+"""
+import logging
+from dataclasses import dataclass, field
+
+import numpy as np
+from scipy import stats
+
+from agent import AgentResult
+from datasets.base import MedicalCase
+from information_gain import BeliefTrajectory, compute_information_metrics
+import config
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class CaseMetrics:
+    """Metrics for a single case."""
+    case_id: str
+    dataset: str
+    top1_correct: bool = False
+    reciprocal_rank: float = 0.0
+    ground_truth_rank: int = -1  # 1-indexed rank of correct answer
+    n_acquired: int = 0
+    acquired_channels: list[str] = field(default_factory=list)
+    committed_early: bool = False
+    top1_confidence: float = 0.0  # Confidence of the top-ranked diagnosis
+    acquisition_cost: float = 0.0
+    total_case_cost: float = 0.0
+
+
+@dataclass
+class DatasetMetrics:
+    """Aggregated metrics for a dataset."""
+    dataset: str
+    n_cases: int
+    top1_accuracy: float
+    mrr: float  # Mean Reciprocal Rank
+    top1_accuracy_ci: tuple = (0.0, 0.0)  # 95% CI
+    mrr_ci: tuple = (0.0, 0.0)
+    mean_channels_acquired: float = 0.0
+    early_commit_rate: float = 0.0
+    per_channel_request_rate: dict = field(default_factory=dict)
+    mean_acquisition_cost: float = 0.0
+    mean_total_case_cost: float = 0.0
+
+
+def compute_reciprocal_rank(
+    ranking: list[dict],
+    ground_truth: str,
+    candidates: list[str],
+) -> float:
+    """
+    Compute reciprocal rank of the ground truth in the agent's ranking.
+
+    Returns 1/rank if found; 1/(N+1) if absent from a non-empty ranking
+    of N entries; 0 for an empty ranking.
+    """
+    if not ranking:
+        return 0.0
+
+    gt_lower = ground_truth.lower().strip()
+
+    for entry in ranking:
+        name = entry.get("name", "").lower().strip()
+        rank = entry.get("rank", 999)
+
+        # Flexible matching: check substring containment both ways
+        if gt_lower in name or name in gt_lower:
+            return 1.0 / rank
+
+        # Check if it matches any candidate that matches ground truth
+        for candidate in candidates:
+            if (
+                gt_lower in candidate.lower()
+                and (name in candidate.lower() or candidate.lower() in name)
+            ):
+                return 1.0 / rank
+
+    # Ground truth not found in ranking: score as if it were ranked just
+    # below the last position (the empty-ranking case was handled above).
+    return 1.0 / (len(ranking) + 1)
+
94
+ def evaluate_single_case(
95
+ result: AgentResult,
96
+ case: MedicalCase,
97
+ ) -> CaseMetrics:
98
+ """Evaluate a single agent result against ground truth."""
99
+ ranking = result.final_ranking
100
+ gt = case.ground_truth
101
+ candidates = case.candidates
102
+
103
+ rr = compute_reciprocal_rank(ranking, gt, candidates)
104
+ top1_correct = rr == 1.0 # RR=1 means correct answer is ranked first
105
+
106
+ top1_conf = ranking[0]["confidence"] if ranking else 0.0
107
+
108
+ # Determine ground truth rank in agent's output
109
+ gt_rank = -1
110
+ gt_lower = gt.lower().strip()
111
+ for entry in ranking:
112
+ name = entry.get("name", "").lower().strip()
113
+ if gt_lower in name or name in gt_lower:
114
+ gt_rank = entry.get("rank", -1)
115
+ break
116
+
117
+ return CaseMetrics(
118
+ case_id=result.case_id,
119
+ dataset=result.dataset,
120
+ top1_correct=top1_correct,
121
+ reciprocal_rank=rr,
122
+ ground_truth_rank=gt_rank,
123
+ n_acquired=len(result.acquired_channels),
124
+ acquired_channels=result.acquired_channels,
125
+ committed_early=result.committed_early,
126
+ top1_confidence=top1_conf,
127
+ acquisition_cost=result.acquisition_cost,
128
+ total_case_cost=result.total_case_cost,
129
+ )
130
+
131
+
132
+ def aggregate_metrics(
133
+ case_metrics: list[CaseMetrics],
134
+ dataset_name: str,
135
+ n_bootstrap: int = None,
136
+ ) -> DatasetMetrics:
137
+ """Aggregate per-case metrics into dataset-level stats with bootstrap CIs."""
138
+ if n_bootstrap is None:
139
+ n_bootstrap = config.N_BOOTSTRAP
140
+
141
+ n = len(case_metrics)
142
+ if n == 0:
143
+ return DatasetMetrics(dataset=dataset_name, n_cases=0, top1_accuracy=0, mrr=0)
144
+
145
+ accuracies = np.array([int(cm.top1_correct) for cm in case_metrics])
146
+ rrs = np.array([cm.reciprocal_rank for cm in case_metrics])
147
+
148
+ top1_acc = float(np.mean(accuracies))
149
+ mrr = float(np.mean(rrs))
150
+
151
+ # Bootstrap CIs
152
+ acc_ci = _bootstrap_ci(accuracies, n_bootstrap)
153
+ mrr_ci = _bootstrap_ci(rrs, n_bootstrap)
154
+
155
+ # Channel request rates
156
+ channel_counts: dict[str, int] = {}
157
+ for cm in case_metrics:
158
+ for ch in cm.acquired_channels:
159
+ channel_counts[ch] = channel_counts.get(ch, 0) + 1
160
+ channel_rates = {ch: count / n for ch, count in channel_counts.items()}
161
+
162
+ return DatasetMetrics(
163
+ dataset=dataset_name,
164
+ n_cases=n,
165
+ top1_accuracy=top1_acc,
166
+ mrr=mrr,
167
+ top1_accuracy_ci=acc_ci,
168
+ mrr_ci=mrr_ci,
169
+ mean_channels_acquired=float(np.mean([cm.n_acquired for cm in case_metrics])),
170
+ early_commit_rate=float(np.mean([int(cm.committed_early) for cm in case_metrics])),
171
+ per_channel_request_rate=channel_rates,
172
+ mean_acquisition_cost=float(np.mean([cm.acquisition_cost for cm in case_metrics])),
173
+ mean_total_case_cost=float(np.mean([cm.total_case_cost for cm in case_metrics])),
174
+ )
175
+
176
+
177
+ def compute_acquisition_efficiency(
178
+ mrr_at_k: float,
179
+ mrr_passive: float,
180
+ mrr_oracle: float,
181
+ ) -> float:
182
+ """
183
+ Normalized Acquisition Efficiency.
184
+
185
+ AE(K) = (MRR_K - MRR_passive) / (MRR_oracle - MRR_passive)
186
+
187
+ Returns 0 if oracle = passive (no room for improvement),
188
+ can exceed 1 if active outperforms oracle (shouldn't happen normally).
189
+ """
190
+ denom = mrr_oracle - mrr_passive
191
+ if abs(denom) < 1e-8:
192
+ return 0.0
193
+ return (mrr_at_k - mrr_passive) / denom
194
+
195
+
196
+ def compute_acquisition_precision(
197
+ active_results: list[AgentResult],
198
+ passive_results: list[AgentResult],
199
+ cases: list[MedicalCase],
200
+ ) -> dict:
201
+ """
202
+ Acquisition Precision: when the agent requests info, does the diagnosis change?
203
+
204
+ Two sub-metrics:
205
+ - request_change_rate: fraction of acquisitions that changed the top-1 diagnosis
206
+ - change_correctness: among diagnosis changes, fraction that were improvements
207
+ """
208
+ assert len(active_results) == len(passive_results) == len(cases)
209
+
210
+ total_acquisitions = 0
211
+ diagnosis_changed = 0
212
+ change_improved = 0
213
+
214
+ for active, passive, case in zip(active_results, passive_results, cases):
215
+ passive_top1 = _get_top1_name(passive.final_ranking)
216
+ active_top1 = _get_top1_name(active.final_ranking)
217
+
218
+ n_acq = len(active.acquired_channels)
219
+ if n_acq > 0:
220
+ total_acquisitions += 1
221
+ if passive_top1 != active_top1:
222
+ diagnosis_changed += 1
223
+ # Did it change to the correct answer?
224
+ gt = case.ground_truth.lower().strip()
225
+ if gt in active_top1.lower() or active_top1.lower() in gt:
226
+ change_improved += 1
227
+
228
+ return {
229
+ "total_cases_with_acquisition": total_acquisitions,
230
+ "request_change_rate": (
231
+ diagnosis_changed / total_acquisitions if total_acquisitions > 0 else 0
232
+ ),
233
+ "change_correctness": (
234
+ change_improved / diagnosis_changed if diagnosis_changed > 0 else 0
235
+ ),
236
+ }
237
+
238
+
239
+ def compute_prompt_agreement(
240
+ results_by_variant: dict[str, list[AgentResult]],
241
+ ) -> dict:
242
+ """
243
+ Prompt sensitivity analysis: measure agreement across prompt variants.
244
+
245
+ Returns:
246
+ - top1_agreement: fraction of cases where all variants agree on top-1
247
+ - acquisition_agreement: fraction of cases where all variants request
248
+ the same first channel
249
+ """
250
+ variants = list(results_by_variant.keys())
251
+ if len(variants) < 2:
252
+ return {"top1_agreement": 1.0, "acquisition_agreement": 1.0}
253
+
254
+ # Align by case_id
255
+ case_ids = set()
256
+ for results in results_by_variant.values():
257
+ case_ids.update(r.case_id for r in results)
258
+
259
+ by_case: dict[str, dict[str, AgentResult]] = {}
260
+ for variant, results in results_by_variant.items():
261
+ for r in results:
262
+ if r.case_id not in by_case:
263
+ by_case[r.case_id] = {}
264
+ by_case[r.case_id][variant] = r
265
+
266
+ top1_agree_count = 0
267
+ acq_agree_count = 0
268
+ total = 0
269
+
270
+ for case_id, variant_results in by_case.items():
271
+ if len(variant_results) < len(variants):
272
+ continue # Skip cases not in all variants
273
+ total += 1
274
+
275
+ # Top-1 agreement
276
+ top1s = set()
277
+ for vr in variant_results.values():
278
+ top1s.add(_get_top1_name(vr.final_ranking).lower())
279
+ if len(top1s) == 1:
280
+ top1_agree_count += 1
281
+
282
+ # First acquisition agreement
283
+ first_acqs = set()
284
+ for vr in variant_results.values():
285
+ if vr.acquired_channels:
286
+ first_acqs.add(vr.acquired_channels[0])
287
+ else:
288
+ first_acqs.add("_committed_")
289
+ if len(first_acqs) == 1:
290
+ acq_agree_count += 1
291
+
292
+ return {
293
+ "top1_agreement": top1_agree_count / total if total > 0 else 0,
294
+ "acquisition_agreement": acq_agree_count / total if total > 0 else 0,
295
+ "n_cases_compared": total,
296
+ }
297
+
298
+
299
+ def compute_regret_analysis(
300
+ active_results: list[AgentResult],
301
+ oracle_results: list[AgentResult],
302
+ cases: list[MedicalCase],
303
+ ) -> dict:
304
+ """
305
+ Regret Analysis: when the agent gets a case wrong, could a different
306
+ acquisition strategy have saved it?
307
+
308
+ For each case where active got it wrong:
309
+ 1. Did the oracle get it right? (recoverable error)
310
+ 2. Which channels were available but not requested? (missed channels)
311
+ 3. Among recoverable errors, which missing channels correlate most
312
+ with oracle success? (high-regret channels)
313
+
314
+ Returns a rich dict with per-case traces and aggregate statistics.
315
+ """
316
+ assert len(active_results) == len(oracle_results) == len(cases)
317
+
318
+ per_case_regret = []
319
+ n_active_wrong = 0
320
+ n_oracle_right_when_active_wrong = 0 # recoverable
321
+ n_both_wrong = 0 # unrecoverable — VLM reasoning bottleneck
322
+ missed_channel_counts: dict[str, int] = {} # channels not requested in recoverable cases
323
+ missed_channel_total: dict[str, int] = {} # total times a channel was missed (all wrong)
324
+
325
+ for active, oracle, case in zip(active_results, oracle_results, cases):
326
+ active_rr = compute_reciprocal_rank(active.final_ranking, case.ground_truth, case.candidates)
327
+ oracle_rr = compute_reciprocal_rank(oracle.final_ranking, case.ground_truth, case.candidates)
328
+ active_correct = active_rr == 1.0
329
+ oracle_correct = oracle_rr == 1.0
330
+
331
+ if active_correct:
332
+ continue # No regret if agent got it right
333
+
334
+ n_active_wrong += 1
335
+
336
+ # Channels available but not acquired
337
+ all_requestable = set(case.requestable_channels.keys())
338
+ acquired = set(active.acquired_channels)
339
+ missed = all_requestable - acquired
340
+
341
+ case_entry = {
342
+ "case_id": case.case_id,
343
+ "ground_truth": case.ground_truth,
344
+ "active_top1": _get_top1_name(active.final_ranking),
345
+ "oracle_top1": _get_top1_name(oracle.final_ranking),
346
+ "active_correct": False,
347
+ "oracle_correct": oracle_correct,
348
+ "acquired_channels": list(acquired),
349
+ "missed_channels": list(missed),
350
+ "recoverable": oracle_correct,
351
+ "active_rr": active_rr,
352
+ "oracle_rr": oracle_rr,
353
+ }
354
+
355
+ for ch in missed:
356
+ missed_channel_total[ch] = missed_channel_total.get(ch, 0) + 1
357
+
358
+ if oracle_correct:
359
+ n_oracle_right_when_active_wrong += 1
360
+ for ch in missed:
361
+ missed_channel_counts[ch] = missed_channel_counts.get(ch, 0) + 1
362
+ else:
363
+ n_both_wrong += 1
364
+
365
+ per_case_regret.append(case_entry)
366
+
367
+ # Compute per-channel regret score: how often a missed channel appears
368
+ # in recoverable errors vs all errors
369
+ channel_regret_scores = {}
370
+ for ch in set(list(missed_channel_counts.keys()) + list(missed_channel_total.keys())):
371
+ recoverable_miss = missed_channel_counts.get(ch, 0)
372
+ total_miss = missed_channel_total.get(ch, 0)
373
+ # Regret score: fraction of times this channel was missed AND oracle succeeded
374
+ channel_regret_scores[ch] = {
375
+ "missed_in_recoverable": recoverable_miss,
376
+ "missed_in_all_wrong": total_miss,
377
+ "regret_rate": recoverable_miss / total_miss if total_miss > 0 else 0.0,
378
+ }
379
+
380
+ # Sort channels by regret rate descending
381
+ sorted_channels = sorted(
382
+ channel_regret_scores.items(),
383
+ key=lambda x: (-x[1]["regret_rate"], -x[1]["missed_in_recoverable"]),
384
+ )
385
+
386
+ return {
387
+ "n_cases": len(cases),
388
+ "n_active_wrong": n_active_wrong,
389
+ "n_recoverable": n_oracle_right_when_active_wrong,
390
+ "n_unrecoverable": n_both_wrong,
391
+ "recovery_rate": (
392
+ n_oracle_right_when_active_wrong / n_active_wrong
393
+ if n_active_wrong > 0 else 0.0
394
+ ),
395
+ "error_rate": n_active_wrong / len(cases) if cases else 0.0,
396
+ "channel_regret_scores": dict(sorted_channels),
397
+ "per_case_regret": per_case_regret,
398
+ "summary": {
399
+ "total_errors": n_active_wrong,
400
+ "recoverable_pct": (
401
+ n_oracle_right_when_active_wrong / n_active_wrong * 100
402
+ if n_active_wrong > 0 else 0.0
403
+ ),
404
+ "unrecoverable_pct": (
405
+ n_both_wrong / n_active_wrong * 100
406
+ if n_active_wrong > 0 else 0.0
407
+ ),
408
+ "highest_regret_channel": sorted_channels[0][0] if sorted_channels else None,
409
+ },
410
+ }
411
+
412
+
413
+ def compute_info_theoretic_metrics(
414
+ results: list[AgentResult],
415
+ ) -> dict:
416
+ """
417
+ Compute information-theoretic metrics from belief trajectories.
418
+
419
+ Extracts BeliefTrajectory objects from AgentResults and computes
420
+ aggregate entropy, information gain, and per-channel value metrics.
421
+ """
422
+ trajectories = [
423
+ r.belief_trajectory for r in results
424
+ if r.belief_trajectory and r.belief_trajectory.states
425
+ ]
426
+ if not trajectories:
427
+ return {"n_cases_with_trajectory": 0}
428
+
429
+ metrics = compute_information_metrics(trajectories)
430
+ metrics["n_cases_with_trajectory"] = len(trajectories)
431
+ return metrics
432
+
433
+
434
+ def _get_top1_name(ranking: list[dict]) -> str:
435
+ """Get the name of the top-ranked diagnosis."""
436
+ if not ranking:
437
+ return ""
438
+ return ranking[0].get("name", "")
439
+
440
+
441
+ def _bootstrap_ci(
442
+ values: np.ndarray, n_bootstrap: int = 1000, ci: float = 0.95
443
+ ) -> tuple[float, float]:
444
+ """Compute bootstrap confidence interval."""
445
+ if len(values) == 0:
446
+ return (0.0, 0.0)
447
+ rng = np.random.RandomState(config.SEED)
448
+ boot_means = []
449
+ for _ in range(n_bootstrap):
450
+ sample = rng.choice(values, size=len(values), replace=True)
451
+ boot_means.append(np.mean(sample))
452
+ alpha = (1 - ci) / 2
453
+ lower = float(np.percentile(boot_means, alpha * 100))
454
+ upper = float(np.percentile(boot_means, (1 - alpha) * 100))
455
+ return (lower, upper)
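The reciprocal-rank fallback and the AE(K) normalization above compose as follows. A minimal standalone sketch (hypothetical simplified functions mirroring the module's logic; the candidate cross-matching step is omitted here):

```python
def reciprocal_rank(ranking: list[dict], ground_truth: str) -> float:
    """1/rank if found; smoothed floor 1/(N+1) if absent; 0 for an empty ranking."""
    gt = ground_truth.lower().strip()
    for entry in ranking:
        name = entry.get("name", "").lower().strip()
        if gt in name or name in gt:
            return 1.0 / entry.get("rank", len(ranking) + 1)
    return 1.0 / (len(ranking) + 1) if ranking else 0.0


def acquisition_efficiency(mrr_k: float, mrr_passive: float, mrr_oracle: float) -> float:
    """AE(K) = (MRR_K - MRR_passive) / (MRR_oracle - MRR_passive)."""
    denom = mrr_oracle - mrr_passive
    return 0.0 if abs(denom) < 1e-8 else (mrr_k - mrr_passive) / denom


ranking = [{"rank": 1, "name": "Pulmonary embolism"},
           {"rank": 2, "name": "Pneumonia"}]
print(reciprocal_rank(ranking, "pneumonia"))    # correct answer at rank 2 -> 0.5
print(reciprocal_rank(ranking, "sarcoidosis"))  # absent -> 1/(2+1), not a hard 0
print(acquisition_efficiency(0.75, 0.5, 1.0))   # halfway from passive to oracle -> 0.5
```

The 1/(N+1) floor keeps a missed ground truth from being scored identically to an empty ranking, so longer differentials are penalized slightly less than total misses.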
evaluation/analysis.py ADDED
@@ -0,0 +1,546 @@
+ """
+ Cross-dataset analysis and figure generation.
+
+ Produces the key figures for the paper:
+ 1. Acquisition Efficiency curves (all 3 datasets, shared y-axis)
+ 2. Per-channel request frequency heatmap
+ 3. Prompt sensitivity agreement matrix
+ 4. OLIVES biomarker-tier acquisition analysis
+ 5. NEJM difficulty-vs-acquisition scatter
+ """
+ import json
+ import logging
+ from pathlib import Path
+ from dataclasses import asdict
+
+ import numpy as np
+ import matplotlib
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+ from scipy import stats
+
+ from agent import AgentResult
+ from datasets.base import MedicalCase
+ from evaluation import (
+     CaseMetrics,
+     DatasetMetrics,
+     evaluate_single_case,
+     aggregate_metrics,
+     compute_acquisition_efficiency,
+     compute_acquisition_precision,
+     compute_prompt_agreement,
+     compute_regret_analysis,
+ )
+ import config
+
+ matplotlib.rcParams["font.family"] = "serif"
+ matplotlib.rcParams["font.size"] = 11
+
+ logger = logging.getLogger(__name__)
+
+
+ class ExperimentAnalyzer:
+     """Analyze and visualize results across all experiments."""
+
+     def __init__(self, results_dir: Path | None = None):
+         self.results_dir = results_dir or config.RESULTS_DIR
+         self.figures_dir = self.results_dir / "figures"
+         self.figures_dir.mkdir(parents=True, exist_ok=True)
+
+     def load_results(self, experiment_name: str) -> dict:
+         """Load saved experiment results."""
+         path = self.results_dir / f"{experiment_name}.json"
+         if not path.exists():
+             logger.error(f"Results file not found: {path}")
+             return {}
+         with open(path) as f:
+             return json.load(f)
+
+     def save_results(self, data: dict, experiment_name: str):
+         """Save experiment results."""
+         path = self.results_dir / f"{experiment_name}.json"
+         with open(path, "w") as f:
+             json.dump(data, f, indent=2, default=str)
+         logger.info(f"Results saved to {path}")
+
+     # ================================================================
+     # Figure 1: Acquisition Efficiency Curves
+     # ================================================================
+
+     def plot_acquisition_efficiency(
+         self,
+         results_by_dataset: dict[str, dict[int, DatasetMetrics]],
+         passive_metrics: dict[str, DatasetMetrics],
+         oracle_metrics: dict[str, DatasetMetrics],
+         save_name: str = "fig1_acquisition_efficiency",
+     ):
+         """
+         Main result figure: normalized acquisition efficiency vs. budget K.
+
+         Args:
+             results_by_dataset: {dataset_name: {K: DatasetMetrics}}
+             passive_metrics: {dataset_name: DatasetMetrics} at K=0
+             oracle_metrics: {dataset_name: DatasetMetrics} with all channels
+         """
+         fig, axes = plt.subplots(1, 2, figsize=(12, 4.5))
+
+         colors = {"midas": "#E07A5F", "nejm": "#3D405B", "olives": "#81B29A"}
+         markers = {"midas": "o", "nejm": "s", "olives": "D"}
+         labels = {"midas": "MIDAS (Dermatology)", "nejm": "NEJM (Multi-Specialty)",
+                   "olives": "OLIVES (Ophthalmology)"}
+
+         # Left panel: raw MRR vs. K
+         ax = axes[0]
+         for ds_name in ["midas", "nejm", "olives"]:
+             if ds_name not in results_by_dataset:
+                 continue
+             ks = sorted(results_by_dataset[ds_name].keys())
+             mrrs = [results_by_dataset[ds_name][k].mrr for k in ks]
+             cis = [results_by_dataset[ds_name][k].mrr_ci for k in ks]
+
+             # Prepend the passive baseline at K=0
+             all_k = [0] + list(ks)
+             all_mrr = [passive_metrics[ds_name].mrr] + mrrs
+             all_lower = [passive_metrics[ds_name].mrr_ci[0]] + [c[0] for c in cis]
+             all_upper = [passive_metrics[ds_name].mrr_ci[1]] + [c[1] for c in cis]
+
+             ax.plot(all_k, all_mrr, color=colors[ds_name], marker=markers[ds_name],
+                     label=labels[ds_name], linewidth=2, markersize=7)
+             ax.fill_between(all_k, all_lower, all_upper, alpha=0.15, color=colors[ds_name])
+
+             # Oracle line
+             ax.axhline(y=oracle_metrics[ds_name].mrr, color=colors[ds_name],
+                        linestyle="--", alpha=0.4, linewidth=1)
+
+         ax.set_xlabel("Acquisition Budget (K)")
+         ax.set_ylabel("Mean Reciprocal Rank (MRR)")
+         ax.set_title("(a) Diagnostic Quality vs. Budget")
+         ax.legend(fontsize=9)
+         ax.set_xticks(range(max(4, max(max(r.keys()) for r in results_by_dataset.values()) + 1)))
+         ax.grid(True, alpha=0.3)
+
+         # Right panel: normalized acquisition efficiency
+         ax = axes[1]
+         for ds_name in ["midas", "nejm", "olives"]:
+             if ds_name not in results_by_dataset:
+                 continue
+             ks = sorted(results_by_dataset[ds_name].keys())
+             effs = []
+             for k in ks:
+                 ae = compute_acquisition_efficiency(
+                     results_by_dataset[ds_name][k].mrr,
+                     passive_metrics[ds_name].mrr,
+                     oracle_metrics[ds_name].mrr,
+                 )
+                 effs.append(ae)
+
+             all_k = [0] + list(ks)
+             all_eff = [0.0] + effs
+
+             ax.plot(all_k, all_eff, color=colors[ds_name], marker=markers[ds_name],
+                     label=labels[ds_name], linewidth=2, markersize=7)
+
+         ax.axhline(y=1.0, color="gray", linestyle="--", alpha=0.5, linewidth=1,
+                    label="Oracle ceiling")
+         ax.set_xlabel("Acquisition Budget (K)")
+         ax.set_ylabel("Acquisition Efficiency")
+         ax.set_title("(b) Normalized Efficiency")
+         ax.legend(fontsize=9)
+         ax.set_ylim(-0.05, 1.15)
+         ax.grid(True, alpha=0.3)
+
+         plt.tight_layout()
+         save_path = self.figures_dir / f"{save_name}.pdf"
+         fig.savefig(save_path, dpi=300, bbox_inches="tight")
+         plt.close(fig)
+         logger.info(f"Saved figure: {save_path}")
+
+     # ================================================================
+     # Figure 2: Per-Channel Request Frequency
+     # ================================================================
+
+     def plot_channel_request_heatmap(
+         self,
+         results_by_dataset: dict[str, list[AgentResult]],
+         save_name: str = "fig2_channel_requests",
+     ):
+         """Heatmap showing which channels the agent requests most, by dataset."""
+         fig, axes = plt.subplots(1, 3, figsize=(14, 4))
+         dataset_names = ["midas", "nejm", "olives"]
+         titles = ["MIDAS", "NEJM", "OLIVES"]
+
+         for idx, (ds_name, title) in enumerate(zip(dataset_names, titles)):
+             if ds_name not in results_by_dataset:
+                 continue
+
+             results = results_by_dataset[ds_name]
+
+             # Count first-request frequency
+             first_requests: dict[str, int] = {}
+             for r in results:
+                 if r.acquired_channels:
+                     ch = r.acquired_channels[0]
+                     first_requests[ch] = first_requests.get(ch, 0) + 1
+
+             # Count overall request frequency
+             all_requests: dict[str, int] = {}
+             for r in results:
+                 for ch in r.acquired_channels:
+                     all_requests[ch] = all_requests.get(ch, 0) + 1
+
+             if not all_requests:
+                 continue
+
+             channels = sorted(all_requests.keys())
+             n = len(results)
+
+             ax = axes[idx]
+             data = np.array([
+                 [first_requests.get(ch, 0) / n for ch in channels],
+                 [all_requests.get(ch, 0) / n for ch in channels],
+             ])
+
+             sns.heatmap(
+                 data,
+                 ax=ax,
+                 xticklabels=[ch.replace("_", "\n") for ch in channels],
+                 yticklabels=["First\nRequest", "Any\nRequest"],
+                 annot=True,
+                 fmt=".2f",
+                 cmap="YlOrRd",
+                 vmin=0,
+                 vmax=1,
+                 cbar_kws={"shrink": 0.8},
+             )
+             ax.set_title(title)
+
+         plt.tight_layout()
+         save_path = self.figures_dir / f"{save_name}.pdf"
+         fig.savefig(save_path, dpi=300, bbox_inches="tight")
+         plt.close(fig)
+         logger.info(f"Saved figure: {save_path}")
+
+     # ================================================================
+     # Figure 3: OLIVES Biomarker Tier Analysis
+     # ================================================================
+
+     def plot_olives_biomarker_tiers(
+         self,
+         results: list[AgentResult],
+         cases: list[MedicalCase],
+         save_name: str = "fig3_olives_biomarker_tiers",
+     ):
+         """
+         For OLIVES: does the agent request OCT more often for OCT-dependent
+         biomarkers than for fundus-visible ones?
+         """
+         oct_request_by_tier: dict[str, list[bool]] = {
+             "fundus_visible": [],
+             "oct_dependent": [],
+         }
+
+         for result, case in zip(results, cases):
+             if case.dataset != "olives":
+                 continue
+             case_tiers = case.metadata.get("biomarker_tier_labels", {})
+             requested_oct = "oct_scan" in result.acquired_channels
+
+             # Eyes with fundus-visible biomarkers
+             if case_tiers.get("fundus_visible"):
+                 oct_request_by_tier["fundus_visible"].append(requested_oct)
+
+             # Eyes with OCT-dependent biomarkers
+             if case_tiers.get("oct_dependent"):
+                 oct_request_by_tier["oct_dependent"].append(requested_oct)
+
+         fig, ax = plt.subplots(figsize=(6, 4))
+
+         tiers = ["fundus_visible", "oct_dependent"]
+         tier_labels = ["Fundus-Visible\nBiomarkers", "OCT-Dependent\nBiomarkers"]
+         rates = []
+         cis_lower = []
+         cis_upper = []
+
+         for tier in tiers:
+             vals = oct_request_by_tier.get(tier, [])
+             if vals:
+                 rate = np.mean(vals)
+                 rates.append(rate)
+                 # Wilson score CI for proportions
+                 n = len(vals)
+                 z = 1.96
+                 p = rate
+                 denom = 1 + z ** 2 / n
+                 center = (p + z ** 2 / (2 * n)) / denom
+                 margin = z * np.sqrt((p * (1 - p) + z ** 2 / (4 * n)) / n) / denom
+                 cis_lower.append(center - margin)
+                 cis_upper.append(center + margin)
+             else:
+                 rates.append(0)
+                 cis_lower.append(0)
+                 cis_upper.append(0)
+
+         colors_bar = ["#81B29A", "#E07A5F"]
+         ax.bar(tier_labels, rates, color=colors_bar, edgecolor="white", width=0.5)
+         ax.errorbar(
+             tier_labels, rates,
+             yerr=[np.array(rates) - np.array(cis_lower),
+                   np.array(cis_upper) - np.array(rates)],
+             fmt="none", ecolor="black", capsize=5,
+         )
+
+         ax.set_ylabel("OCT Request Rate")
+         ax.set_title("Agent's OCT Request Rate by Biomarker Type")
+         ax.set_ylim(0, 1.05)
+         ax.grid(True, axis="y", alpha=0.3)
+
+         # Annotate with sample counts
+         for i, tier in enumerate(tiers):
+             n = len(oct_request_by_tier.get(tier, []))
+             ax.text(i, rates[i] + 0.05, f"n={n}", ha="center", fontsize=10)
+
+         plt.tight_layout()
+         save_path = self.figures_dir / f"{save_name}.pdf"
+         fig.savefig(save_path, dpi=300, bbox_inches="tight")
+         plt.close(fig)
+         logger.info(f"Saved figure: {save_path}")
+
+     # ================================================================
+     # Figure 4: NEJM Difficulty vs. Acquisition Behavior
+     # ================================================================
+
+     def plot_nejm_difficulty_analysis(
+         self,
+         results: list[AgentResult],
+         cases: list[MedicalCase],
+         save_name: str = "fig4_nejm_difficulty",
+     ):
+         """
+         Scatter: human difficulty (physician correct rate) vs. the agent's
+         acquisition behavior (number of channels requested + early commit).
+         """
+         difficulties = []
+         n_acquired = []
+         committed_early = []
+
+         for result, case in zip(results, cases):
+             if case.dataset != "nejm":
+                 continue
+             votes = case.metadata.get("votes", {})
+             if not votes:
+                 continue
+
+             # Compute human difficulty (proportion of correct votes)
+             total_votes = sum(float(v) for v in votes.values())
+             if total_votes == 0:
+                 continue
+             gt = case.ground_truth
+             human_correct = 0.0
+             for key, val in votes.items():
+                 if key in gt or gt.startswith(key):
+                     human_correct = float(val) / total_votes if total_votes > 1 else float(val)
+                     break
+
+             difficulties.append(human_correct)
+             n_acquired.append(len(result.acquired_channels))
+             committed_early.append(result.committed_early)
+
+         if not difficulties:
+             logger.warning("No NEJM cases with difficulty data found")
+             return
+
+         fig, axes = plt.subplots(1, 2, figsize=(11, 4.5))
+
+         # Left: difficulty vs. number of channels acquired
+         ax = axes[0]
+         ax.scatter(difficulties, n_acquired, alpha=0.5, s=30, color="#3D405B", edgecolors="white")
+         # Trend line
+         if len(difficulties) > 10:
+             z = np.polyfit(difficulties, n_acquired, 1)
+             p = np.poly1d(z)
+             x_line = np.linspace(min(difficulties), max(difficulties), 100)
+             ax.plot(x_line, p(x_line), "--", color="#E07A5F", linewidth=2,
+                     label=f"Trend (slope={z[0]:.2f})")
+             # Correlation
+             r, pval = stats.pearsonr(difficulties, n_acquired)
+             ax.text(0.05, 0.95, f"r={r:.3f}, p={pval:.3f}",
+                     transform=ax.transAxes, fontsize=9, verticalalignment="top")
+         ax.set_xlabel("Human Correct Rate (easier →)")
+         ax.set_ylabel("Channels Acquired by Agent")
+         ax.set_title("(a) Case Difficulty vs. Acquisition Amount")
+         ax.legend(fontsize=9)
+         ax.grid(True, alpha=0.3)
+
+         # Right: difficulty bins vs. early-commit rate
+         ax = axes[1]
+         diff_arr = np.array(difficulties)
+         commit_arr = np.array(committed_early, dtype=float)
+         bins = [0, 0.25, 0.50, 0.75, 1.01]
+         bin_labels = ["<25%", "25-50%", "50-75%", ">75%"]
+         bin_rates = []
+         bin_ns = []
+
+         for i in range(len(bins) - 1):
+             mask = (diff_arr >= bins[i]) & (diff_arr < bins[i + 1])
+             if mask.sum() > 0:
+                 bin_rates.append(commit_arr[mask].mean())
+                 bin_ns.append(mask.sum())
+             else:
+                 bin_rates.append(0)
+                 bin_ns.append(0)
+
+         bar_colors = ["#E07A5F", "#F2CC8F", "#81B29A", "#3D405B"]
+         ax.bar(bin_labels, bin_rates, color=bar_colors, edgecolor="white", width=0.6)
+         for i, (rate, n) in enumerate(zip(bin_rates, bin_ns)):
+             ax.text(i, rate + 0.02, f"n={n}", ha="center", fontsize=9)
+         ax.set_xlabel("Human Correct Rate (easier →)")
+         ax.set_ylabel("Agent Early Commit Rate")
+         ax.set_title("(b) Early Commitment vs. Difficulty")
+         ax.set_ylim(0, 1.05)
+         ax.grid(True, axis="y", alpha=0.3)
+
+         plt.tight_layout()
+         save_path = self.figures_dir / f"{save_name}.pdf"
+         fig.savefig(save_path, dpi=300, bbox_inches="tight")
+         plt.close(fig)
+         logger.info(f"Saved figure: {save_path}")
+
+     # ================================================================
+     # Figure 5: Regret Analysis
+     # ================================================================
+
+     def plot_regret_analysis(
+         self,
+         regret: dict,
+         dataset_name: str = "",
+         save_name: str = "fig5_regret_analysis",
+     ):
+         """
+         Visualize regret analysis results.
+
+         Left: bar chart decomposing correct cases, recoverable errors,
+         and unrecoverable errors.
+         Right: per-channel regret scores (which missed channels cost the most).
+         """
+         fig, axes = plt.subplots(1, 2, figsize=(12, 4.5))
+         title_suffix = f" — {dataset_name.upper()}" if dataset_name else ""
+
+         # ---- Left panel: error decomposition ----
+         ax = axes[0]
+         n_correct = regret["n_cases"] - regret["n_active_wrong"]
+         n_recoverable = regret["n_recoverable"]
+         n_unrecoverable = regret["n_unrecoverable"]
+
+         categories = ["Agent\nCorrect", "Recoverable\nErrors", "Unrecoverable\nErrors"]
+         values = [n_correct, n_recoverable, n_unrecoverable]
+         colors_bar = ["#81B29A", "#F2CC8F", "#E07A5F"]
+
+         bars = ax.bar(categories, values, color=colors_bar, edgecolor="white", width=0.55)
+         for bar, val in zip(bars, values):
+             pct = val / regret["n_cases"] * 100 if regret["n_cases"] > 0 else 0
+             ax.text(
+                 bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.5,
+                 f"{val}\n({pct:.0f}%)", ha="center", fontsize=10,
+             )
+
+         ax.set_ylabel("Number of Cases")
+         ax.set_title(f"(a) Error Decomposition{title_suffix}")
+         ax.grid(True, axis="y", alpha=0.3)
+
+         # ---- Right panel: per-channel regret ----
+         ax = axes[1]
+         channel_scores = regret["channel_regret_scores"]
+
+         if channel_scores:
+             channels = list(channel_scores.keys())
+             regret_rates = [channel_scores[ch]["regret_rate"] for ch in channels]
+             miss_counts = [channel_scores[ch]["missed_in_recoverable"] for ch in channels]
+
+             # Sort by regret rate, descending
+             sorted_idx = sorted(range(len(channels)), key=lambda i: -regret_rates[i])
+             channels = [channels[i] for i in sorted_idx]
+             regret_rates = [regret_rates[i] for i in sorted_idx]
+             miss_counts = [miss_counts[i] for i in sorted_idx]
+
+             y_pos = range(len(channels))
+             bar_colors = plt.cm.YlOrRd(np.linspace(0.3, 0.9, len(channels)))
+             ax.barh(
+                 y_pos, regret_rates, color=bar_colors, edgecolor="white", height=0.6,
+             )
+
+             ax.set_yticks(y_pos)
+             ax.set_yticklabels([ch.replace("_", " ").title() for ch in channels], fontsize=9)
+             ax.set_xlabel("Regret Rate")
+             ax.set_xlim(0, 1.05)
+             ax.invert_yaxis()
+
+             # Annotate with counts
+             for i, (rate, count) in enumerate(zip(regret_rates, miss_counts)):
+                 ax.text(
+                     rate + 0.02, i, f"n={count}",
+                     va="center", fontsize=9, color="#333",
+                 )
+         else:
+             ax.text(0.5, 0.5, "No channel data", ha="center", va="center",
+                     transform=ax.transAxes, fontsize=12)
+
+         ax.set_title(f"(b) Channel Regret Scores{title_suffix}")
+         ax.grid(True, axis="x", alpha=0.3)
+
+         plt.tight_layout()
+         save_path = self.figures_dir / f"{save_name}.pdf"
+         fig.savefig(save_path, dpi=300, bbox_inches="tight")
+         plt.close(fig)
+         logger.info(f"Saved figure: {save_path}")
+
+     def print_regret_summary(self, regret: dict):
+         """Print a concise text summary of the regret analysis."""
+         s = regret["summary"]
+         print("\n" + "=" * 55)
+         print(" REGRET ANALYSIS")
+         print("=" * 55)
+         print(f" Total cases: {regret['n_cases']}")
+         print(f" Agent errors: {s['total_errors']} ({regret['error_rate']*100:.1f}%)")
+         print(f" Recoverable: {regret['n_recoverable']} ({s['recoverable_pct']:.1f}% of errors)")
+         print(f" Unrecoverable: {regret['n_unrecoverable']} ({s['unrecoverable_pct']:.1f}% of errors)")
+         print(f" Highest-regret channel: {s['highest_regret_channel']}")
+         print()
+         print(" Per-channel regret:")
+         for ch, scores in regret["channel_regret_scores"].items():
+             print(f" {ch:<25} regret={scores['regret_rate']:.2f} "
+                   f"(missed in {scores['missed_in_recoverable']}/{scores['missed_in_all_wrong']} errors)")
+         print("=" * 55)
+
+     # ================================================================
+     # Summary Table
+     # ================================================================
+
+     def print_summary_table(
519
+ self,
520
+ all_metrics: dict[str, dict[str, DatasetMetrics]],
521
+ ):
522
+ """
523
+ Print the main results table.
524
+
525
+ Args:
526
+ all_metrics: {condition: {dataset: DatasetMetrics}}
527
+ where condition is "passive", "K=1", "K=2", "K=3",
528
+ "fixed_order", "oracle"
529
+ """
530
+ header = f"{'Condition':<15} {'Dataset':<12} {'Top-1 Acc':<15} {'MRR':<15} {'Avg K':<8}"
531
+ print("=" * len(header))
532
+ print(header)
533
+ print("=" * len(header))
534
+
535
+ for condition in ["passive", "K=1", "K=2", "K=3", "fixed_order", "oracle"]:
536
+ if condition not in all_metrics:
537
+ continue
538
+ for ds in ["midas", "nejm", "olives"]:
539
+ if ds not in all_metrics[condition]:
540
+ continue
541
+ m = all_metrics[condition][ds]
542
+ acc_str = f"{m.top1_accuracy:.3f} ({m.top1_accuracy_ci[0]:.3f}-{m.top1_accuracy_ci[1]:.3f})"
543
+ mrr_str = f"{m.mrr:.3f} ({m.mrr_ci[0]:.3f}-{m.mrr_ci[1]:.3f})"
544
+ print(f"{condition:<15} {ds:<12} {acc_str:<15} {mrr_str:<15} {m.mean_channels_acquired:<8.1f}")
545
+
546
+ print("=" * len(header))
information_gain.py ADDED
@@ -0,0 +1,441 @@
+ """
+ Information-theoretic computation for ActiveMedAgent.
+
+ Provides grounded entropy and expected information gain (EIG) computation
+ from the agent's reported probability distributions. This transforms the
+ "information-theoretic framing" from a prompt label into actual computation.
+
+ Key concepts:
+ - Belief State: The agent's probability distribution over candidate diagnoses
+ - Shannon Entropy: H(p) = -sum(p_i * log2(p_i)) — measures diagnostic uncertainty
+ - Information Gain: H(before) - H(after) — how much a channel reduced uncertainty
+ - Expected Information Gain (EIG): Estimated reduction in entropy from acquiring a channel
+ - Value of Information (VoI): Whether acquiring more data is worth the cost
+
+ No training required — these are computed analytically from the probability
+ distributions the agent reports through tool calls at each step.
+ """
+ from __future__ import annotations
+
+ import logging
+ import math
+ from dataclasses import dataclass, field
+
+ import numpy as np
+
+ logger = logging.getLogger(__name__)
+
+
+ @dataclass
+ class BeliefState:
+     """
+     The agent's probability distribution over candidate diagnoses at a given step.
+
+     Extracted directly from the tool call's `current_differential` parameter,
+     so no parsing heuristics are needed.
+     """
+     step: int
+     distribution: dict[str, float]  # {diagnosis_name: probability}
+     entropy: float = 0.0
+     channel_acquired: str | None = None
+
+     def __post_init__(self):
+         self.entropy = compute_entropy(self.distribution)
+
+
+ @dataclass
+ class BeliefTrajectory:
+     """
+     Full trajectory of belief states across the acquisition process.
+
+     Tracks how the agent's uncertainty evolves as it acquires information,
+     enabling information-theoretic analysis of acquisition quality.
+     """
+     case_id: str
+     states: list[BeliefState] = field(default_factory=list)
+
+     @property
+     def initial_entropy(self) -> float:
+         return self.states[0].entropy if self.states else 0.0
+
+     @property
+     def final_entropy(self) -> float:
+         return self.states[-1].entropy if self.states else 0.0
+
+     @property
+     def total_information_gain(self) -> float:
+         """Total reduction in entropy across all acquisitions."""
+         return self.initial_entropy - self.final_entropy
+
+     @property
+     def per_step_information_gain(self) -> list[float]:
+         """Information gain at each acquisition step."""
+         gains = []
+         for i in range(1, len(self.states)):
+             gains.append(self.states[i - 1].entropy - self.states[i].entropy)
+         return gains
+
+     @property
+     def entropy_trajectory(self) -> list[float]:
+         """Entropy at each step."""
+         return [s.entropy for s in self.states]
+
+     @property
+     def information_efficiency(self) -> float:
+         """
+         Information efficiency: actual IG / maximum possible IG.
+
+         Maximum possible IG is going from initial entropy to 0 (perfect certainty).
+         Returns a ratio in [0, 1].
+         """
+         if self.initial_entropy < 1e-10:
+             return 1.0  # Already certain
+         return self.total_information_gain / self.initial_entropy
+
+     def get_channel_information_values(self) -> dict[str, float]:
+         """Map each acquired channel to its observed information gain."""
+         values = {}
+         for i in range(1, len(self.states)):
+             ch = self.states[i].channel_acquired
+             if ch:
+                 values[ch] = self.states[i - 1].entropy - self.states[i].entropy
+         return values
+
+
+ # ============================================================
+ # Core Computations
+ # ============================================================
+
+ def compute_entropy(distribution: dict[str, float]) -> float:
+     """
+     Shannon entropy H(p) = -sum(p_i * log2(p_i)) in bits.
+
+     Handles edge cases: p=0 contributes 0; normalizes if the sum != 1.
+     """
+     probs = np.array(list(distribution.values()), dtype=np.float64)
+
+     # Normalize if needed (VLM probabilities may not sum exactly to 1)
+     total = probs.sum()
+     if total < 1e-10:
+         return 0.0
+     probs = probs / total
+
+     # Compute entropy, handling p=0
+     entropy = 0.0
+     for p in probs:
+         if p > 1e-15:
+             entropy -= p * math.log2(p)
+     return entropy
+
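The entropy computation above can be sanity-checked on its own. A minimal standalone sketch (it mirrors `compute_entropy` rather than importing the module; `entropy_bits` is a hypothetical helper name):

```python
import math

def entropy_bits(dist):
    """Shannon entropy in bits; normalizes and skips zero-probability terms."""
    total = sum(dist.values())
    if total <= 0:
        return 0.0
    return -sum((p / total) * math.log2(p / total)
                for p in dist.values() if p / total > 1e-15)

# Uniform over 4 diagnoses: maximum uncertainty, log2(4) = 2 bits.
uniform = {"a": 0.25, "b": 0.25, "c": 0.25, "d": 0.25}
# Peaked distribution: near-certain, entropy close to 0.
peaked = {"a": 0.97, "b": 0.01, "c": 0.01, "d": 0.01}

h_uniform = entropy_bits(uniform)   # 2.0 bits
h_peaked = entropy_bits(peaked)     # well under 1 bit
```

The gap between these two values is exactly what the per-step information gain measures: how far an acquisition moves the belief from uniform toward peaked.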
+
+ def compute_kl_divergence(p: dict[str, float], q: dict[str, float]) -> float:
+     """
+     KL divergence D_KL(p || q) = sum(p_i * log2(p_i / q_i)).
+
+     Measures how much the belief shifted from q (prior) to p (posterior).
+     """
+     all_keys = set(list(p.keys()) + list(q.keys()))
+     p_arr = np.array([p.get(k, 1e-10) for k in all_keys], dtype=np.float64)
+     q_arr = np.array([q.get(k, 1e-10) for k in all_keys], dtype=np.float64)
+
+     # Normalize
+     p_arr = p_arr / p_arr.sum()
+     q_arr = q_arr / q_arr.sum()
+
+     # Smoothing to avoid log(0)
+     q_arr = np.maximum(q_arr, 1e-10)
+
+     kl = 0.0
+     for pi, qi in zip(p_arr, q_arr):
+         if pi > 1e-15:
+             kl += pi * math.log2(pi / qi)
+     return kl
+
+
+ def estimate_expected_information_gain(
+     current_distribution: dict[str, float],
+     channel_name: str,
+     expected_impact: dict[str, str],
+     candidates: list[str],
+ ) -> float:
+     """
+     Estimate expected information gain (EIG) for a candidate channel.
+
+     Uses the agent's stated expected_impact (from the tool call) to estimate
+     how much the entropy would decrease. This is a lightweight approximation:
+     we model two scenarios (positive/negative finding) and compute the
+     expected entropy reduction.
+
+     Args:
+         current_distribution: Current belief state
+         channel_name: Channel being evaluated
+         expected_impact: {"if_positive": diagnosis_name, "if_negative": diagnosis_name}
+         candidates: All candidate diagnoses
+
+     Returns:
+         Estimated information gain in bits
+     """
+     current_entropy = compute_entropy(current_distribution)
+
+     # Model the positive scenario: the indicated diagnosis gets boosted
+     pos_target = expected_impact.get("if_positive", "")
+     neg_target = expected_impact.get("if_negative", "")
+
+     # Estimate posterior distributions under each scenario
+     pos_posterior = _shift_belief(current_distribution, pos_target, boost=0.3)
+     neg_posterior = _shift_belief(current_distribution, neg_target, boost=0.3)
+
+     # Weight scenarios by current probability of the positive-target diagnosis
+     p_positive = current_distribution.get(pos_target, 0.5)
+     p_negative = 1.0 - p_positive
+
+     expected_posterior_entropy = (
+         p_positive * compute_entropy(pos_posterior)
+         + p_negative * compute_entropy(neg_posterior)
+     )
+
+     eig = current_entropy - expected_posterior_entropy
+     return max(0.0, eig)  # EIG should be non-negative
+
+
+ def _shift_belief(
+     distribution: dict[str, float],
+     target: str,
+     boost: float = 0.3,
+ ) -> dict[str, float]:
+     """
+     Shift probability mass toward a target diagnosis.
+
+     Simple model: add `boost` to the target, then renormalize.
+     Used for EIG estimation only.
+     """
+     result = dict(distribution)
+
+     # Find the best matching key (case-insensitive substring match)
+     matched_key = None
+     target_lower = target.lower().strip()
+     for key in result:
+         if target_lower in key.lower() or key.lower() in target_lower:
+             matched_key = key
+             break
+
+     if matched_key is None:
+         return result
+
+     result[matched_key] = result.get(matched_key, 0.0) + boost
+
+     # Renormalize
+     total = sum(result.values())
+     if total > 0:
+         result = {k: v / total for k, v in result.items()}
+
+     return result
+
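The boost-and-renormalize posterior model used for EIG estimation can be verified in isolation. A minimal sketch (function and diagnosis names are illustrative; the 0.3 boost mirrors the default above, without the substring matching):

```python
def shift_belief(dist, target, boost=0.3):
    """Add `boost` probability mass to `target`, then renormalize to sum to 1."""
    out = dict(dist)
    if target in out:
        out[target] += boost
    total = sum(out.values())
    return {k: v / total for k, v in out.items()}

prior = {"pneumonia": 0.5, "pe": 0.3, "chf": 0.2}
posterior = shift_belief(prior, "pe")
# "pe" gains mass (0.6 / 1.3 ≈ 0.46) and overtakes "pneumonia" (0.5 / 1.3 ≈ 0.38)
```

EIG then falls out by computing entropy before and after the shift, weighted by how likely each finding is.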
+
+
+ # ============================================================
+ # Stopping Criterion: When Has the Agent Gathered Enough?
+ # ============================================================
+
+ def should_commit(
+     trajectory: BeliefTrajectory,
+     available_channels: list[str],
+     min_steps: int = 0,
+ ) -> tuple[bool, str]:
+     """
+     Principled stopping criterion based on the agent's belief trajectory.
+
+     CRITICAL DESIGN PRINCIPLE: Never trust raw VLM probabilities from a
+     single observation. Weaker models (GPT-4o-mini) routinely assign 0.85
+     to wrong diagnoses after seeing just one image. Stopping criteria must
+     be grounded in OBSERVED BELIEF DYNAMICS (how beliefs changed after
+     seeing evidence), not in the raw probability the VLM reports.
+
+     Three conditions, all requiring evidence of belief stability:
+
+     1. CONVERGENCE: The last acquisition produced negligible IG (< 0.05 bits).
+        Requires >= 2 belief states. If new evidence doesn't change the
+        agent's mind, further evidence probably won't either.
+
+     2. CONFIRMED DOMINANCE: The top diagnosis has probability >= 0.90 AND
+        the gap to #2 is >= 0.40, AND the agent has acquired >= 2 channels.
+        Raw first-impression confidence is meaningless — dominance only
+        counts after the belief has SURVIVED multiple evidence updates.
+
+     3. DIMINISHING RETURNS: The last 2 acquisitions both had IG < 0.1 bits.
+        Requires >= 3 belief states. The agent hit a plateau.
+
+     Returns:
+         (should_commit: bool, reason: str)
+     """
+     n_states = len(trajectory.states)
+
+     if n_states < max(1, min_steps):
+         return False, "min_steps not reached"
+
+     if not trajectory.states:
+         return False, "no belief states yet"
+
+     # Count actual acquisitions (states with a channel acquired)
+     n_acquired = sum(
+         1 for s in trajectory.states if s.channel_acquired is not None
+     )
+
+     latest = trajectory.states[-1]
+     dist = latest.distribution
+
+     if not dist:
+         return False, "empty distribution"
+
+     # Normalize
+     total = sum(dist.values())
+     if total < 1e-10:
+         return False, "zero distribution"
+     probs = sorted(dist.values(), reverse=True)
+     probs = [p / total for p in probs]
+
+     top1_prob = probs[0] if probs else 0
+     top2_prob = probs[1] if len(probs) > 1 else 0
+     gap = top1_prob - top2_prob
+
+     # Condition 1: CONVERGENCE — last step had negligible IG.
+     # Requires at least 2 states (before/after an acquisition).
+     if n_states >= 2:
+         last_ig = (
+             trajectory.states[-2].entropy - trajectory.states[-1].entropy
+         )
+         if last_ig < 0.05 and n_acquired >= 1:
+             return True, (
+                 f"convergence: last IG={last_ig:.3f} bits < 0.05 threshold "
+                 f"(after {n_acquired} acquisition(s))"
+             )
+
+     # Condition 2: CONFIRMED DOMINANCE — high confidence AFTER evidence.
+     # Must have acquired >= 2 channels. A first-impression 0.85 is not
+     # dominance — it's overconfidence. True dominance is when the belief
+     # stays dominant after being tested by new evidence.
+     if n_acquired >= 2 and top1_prob >= 0.90 and gap >= 0.40:
+         return True, (
+             f"confirmed dominance: top1={top1_prob:.2f}, gap={gap:.2f} "
+             f"(after {n_acquired} acquisitions)"
+         )
+
+     # Condition 3: DIMINISHING RETURNS — last 2 acquisitions both low IG.
+     # Requires at least 3 states.
+     if n_states >= 3:
+         ig_n1 = trajectory.states[-3].entropy - trajectory.states[-2].entropy
+         ig_n2 = trajectory.states[-2].entropy - trajectory.states[-1].entropy
+         if ig_n1 < 0.1 and ig_n2 < 0.1 and n_acquired >= 2:
+             return True, (
+                 f"diminishing returns: last 2 IGs={ig_n1:.3f}, {ig_n2:.3f} "
+                 f"(after {n_acquired} acquisitions)"
+             )
+
+     # No remaining channels
+     if not available_channels:
+         return True, "no channels remaining"
+
+     return False, "continue acquiring"
+
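The convergence and diminishing-returns conditions depend only on the entropy trajectory, so they can be exercised without the full `BeliefTrajectory` machinery. A simplified standalone sketch (`stop_reason` is a hypothetical helper; the 0.05 and 0.1 bit thresholds are copied from the docstring above):

```python
def stop_reason(entropies, n_acquired):
    """Return a stop reason from an entropy-per-step list, or None to continue."""
    # Convergence: the last acquisition barely moved the belief.
    if len(entropies) >= 2 and n_acquired >= 1:
        last_ig = entropies[-2] - entropies[-1]
        if last_ig < 0.05:
            return "convergence"
    # Diminishing returns: two consecutive low-gain acquisitions.
    if len(entropies) >= 3 and n_acquired >= 2:
        ig1 = entropies[-3] - entropies[-2]
        ig2 = entropies[-2] - entropies[-1]
        if ig1 < 0.1 and ig2 < 0.1:
            return "diminishing_returns"
    return None

# Big first gain, then a negligible one: convergence fires.
r1 = stop_reason([2.0, 1.0, 0.99], n_acquired=2)   # "convergence"
# Entropy still dropping fast: keep acquiring.
r2 = stop_reason([2.0, 1.2], n_acquired=1)         # None
```

The confirmed-dominance condition is deliberately omitted here since it also needs the top-1/top-2 gap, not just entropies.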
+
+
+ def compute_value_of_information(
+     trajectory: BeliefTrajectory,
+     n_remaining_channels: int,
+ ) -> float:
+     """
+     Estimate the value of continuing to acquire information.
+
+     Uses the trajectory's IG history to extrapolate whether the next
+     acquisition would be worth it. Returns a score in [0, 1]:
+     - Near 0: little value in continuing (should commit)
+     - Near 1: high value in continuing (should acquire)
+
+     Method: weighted average of recent IG values, normalized by initial
+     entropy. Decays with the number of remaining channels (diminishing
+     marginal returns).
+     """
+     if not trajectory.states or n_remaining_channels == 0:
+         return 0.0
+
+     per_step_ig = trajectory.per_step_information_gain
+     if not per_step_ig:
+         return 0.5  # No history — uncertain, lean toward acquiring
+
+     initial_h = trajectory.initial_entropy
+     if initial_h < 1e-10:
+         return 0.0  # Already certain
+
+     # Exponentially weighted recent IG (most recent steps matter more)
+     weights = [0.5 ** i for i in range(len(per_step_ig))]
+     weights.reverse()  # Most recent gets the highest weight
+     weighted_ig = sum(w * ig for w, ig in zip(weights, per_step_ig))
+     weighted_ig /= sum(weights)
+
+     # Normalize by initial entropy
+     normalized_ig = weighted_ig / initial_h
+
+     # Discount by progress through the channels (diminishing returns)
+     total_channels = len(trajectory.states) + n_remaining_channels
+     progress = len(trajectory.states) / total_channels
+     discount = 1.0 - (progress * 0.5)  # Mild discount as we acquire more
+
+     voi = normalized_ig * discount
+     return max(0.0, min(1.0, voi))
+
+
+ # ============================================================
+ # Aggregate Information-Theoretic Metrics
+ # ============================================================
+
+ def compute_information_metrics(trajectories: list[BeliefTrajectory]) -> dict:
+     """
+     Compute aggregate information-theoretic metrics across cases.
+
+     Returns:
+         dict with:
+         - mean_initial_entropy: Average starting uncertainty
+         - mean_final_entropy: Average ending uncertainty
+         - mean_total_ig: Average total information gain
+         - mean_info_efficiency: Average IG / initial entropy
+         - per_channel_mean_ig: Average IG contributed by each channel
+         - entropy_reduction_curve: Mean entropy at each step
+     """
+     if not trajectories:
+         return {}
+
+     initial_entropies = [t.initial_entropy for t in trajectories]
+     final_entropies = [t.final_entropy for t in trajectories]
+     total_igs = [t.total_information_gain for t in trajectories]
+     efficiencies = [t.information_efficiency for t in trajectories]
+
+     # Per-channel IG
+     channel_igs: dict[str, list[float]] = {}
+     for t in trajectories:
+         for ch, ig in t.get_channel_information_values().items():
+             if ch not in channel_igs:
+                 channel_igs[ch] = []
+             channel_igs[ch].append(ig)
+
+     per_channel_mean_ig = {
+         ch: float(np.mean(igs)) for ch, igs in channel_igs.items()
+     }
+
+     # Entropy curve (pad shorter trajectories with their final entropy)
+     max_steps = max(len(t.states) for t in trajectories)
+     curves = []
+     for t in trajectories:
+         curve = t.entropy_trajectory
+         curve += [curve[-1]] * (max_steps - len(curve))  # Pad with final value
+         curves.append(curve)
+
+     mean_curve = list(np.mean(curves, axis=0))
+
+     return {
+         "mean_initial_entropy": float(np.mean(initial_entropies)),
+         "mean_final_entropy": float(np.mean(final_entropies)),
+         "mean_total_ig": float(np.mean(total_igs)),
+         "mean_info_efficiency": float(np.mean(efficiencies)),
+         "per_channel_mean_ig": per_channel_mean_ig,
+         "entropy_reduction_curve": mean_curve,
+         "n_cases": len(trajectories),
+     }
policy.py ADDED
@@ -0,0 +1,608 @@
+ """
+ Acquisition Policy Learning for ActiveMedAgent.
+
+ Three learned policies, all API-based or CPU-only:
+
+ 1. RewardWeightedICL: Select the best past trajectories as in-context
+    examples for the VLM. The VLM sees "here's what worked before on
+    similar cases" and makes better acquisition decisions.
+
+ 2. PolicyNetwork: A small MLP trained on CPU that predicts which channel
+    to request given a featurized state. Cheap, fast, interpretable.
+
+ 3. SelfReflectivePolicy: The VLM critiques its own past failures
+    and generates an improved acquisition strategy.
+
+ All three produce an acquisition policy that replaces the zero-shot
+ decision in agent.py.
+ """
+ import json
+ import logging
+ import random
+ from collections import defaultdict
+ from dataclasses import dataclass
+ from pathlib import Path
+
+ import numpy as np
+
+ import config
+ from api_client import BaseVLMClient
+ from datasets.base import MedicalCase
+ from trajectory import Trajectory, TrajectoryStep
+
+ logger = logging.getLogger(__name__)
+
+
+ # ================================================================
+ # Approach 1: Reward-Weighted In-Context Learning (ICL)
+ # ================================================================
+
+ class RewardWeightedICL:
+     """
+     Learn an acquisition policy via reward-weighted few-shot prompting.
+
+     Strategy:
+     1. From collected trajectories, identify GOOD acquisition decisions
+        (positive reward) and BAD ones (negative/zero reward)
+     2. For each new case, retrieve the K most similar past cases
+        (by dataset + channel overlap + uncertainty similarity)
+     3. Construct few-shot examples showing good acquisitions
+     4. The VLM sees concrete examples of "when uncertain about X,
+        requesting Y helped" and makes better decisions
+
+     This is essentially offline policy improvement via in-context learning.
+     """
+
+     def __init__(
+         self,
+         trajectories: list[Trajectory],
+         n_examples: int = 3,
+         min_reward: float = 0.05,
+     ):
+         self.n_examples = n_examples
+         self.min_reward = min_reward
+
+         # Index good and bad acquisition decisions
+         self.good_decisions: list[dict] = []
+         self.bad_decisions: list[dict] = []
+
+         for traj in trajectories:
+             for step in traj.steps:
+                 if step.action == "COMMIT":
+                     continue
+                 decision = {
+                     "case_id": traj.case_id,
+                     "dataset": traj.dataset,
+                     "acquired_before": step.acquired_so_far,
+                     "action": step.action,
+                     "uncertainty": step.uncertainty_text,
+                     "reward": step.utility_reward,
+                     "mrr_reward": step.reward,
+                     "cost": step.acquisition_cost,
+                     "diagnosis_changed": step.diagnosis_changed,
+                     "diagnosis_improved": step.diagnosis_improved,
+                     "mrr_before": step.mrr_before,
+                     "mrr_after": step.mrr_after,
+                 }
+                 if step.utility_reward >= min_reward:
+                     self.good_decisions.append(decision)
+                 else:
+                     self.bad_decisions.append(decision)
+
+         logger.info(
+             f"RewardWeightedICL: {len(self.good_decisions)} good, "
+             f"{len(self.bad_decisions)} bad decisions indexed"
+         )
+
+     def get_few_shot_examples(
+         self,
+         case: MedicalCase,
+         acquired_so_far: list[str],
+     ) -> str:
+         """
+         Retrieve the best few-shot examples for the current case state.
+
+         Returns formatted text to prepend to the acquisition prompt.
+         """
+         # Filter to the same dataset
+         candidates = [d for d in self.good_decisions if d["dataset"] == case.dataset]
+
+         if not candidates:
+             candidates = self.good_decisions  # Fall back to cross-dataset examples
+
+         # Score by similarity to the current state
+         scored = []
+         for d in candidates:
+             similarity = self._compute_similarity(d, acquired_so_far)
+             scored.append((similarity, d))
+
+         scored.sort(key=lambda x: (-x[0], -x[1]["reward"]))
+
+         # Take the top N
+         selected = scored[: self.n_examples]
+
+         if not selected:
+             return ""
+
+         # Format as few-shot examples
+         lines = [
+             "Here are examples of helpful acquisition decisions from similar past cases:\n"
+         ]
+         for i, (_sim, d) in enumerate(selected):
+             lines.append(f"Example {i + 1}:")
+             lines.append(f"  Already acquired: {d['acquired_before'] or ['(nothing)']}")
+             lines.append(f"  Uncertainty: {d['uncertainty'][:150]}")
+             lines.append(f"  Decision: REQUEST {d['action']}")
+             lines.append(
+                 f"  Outcome: MRR improved from {d['mrr_before']:.2f} to {d['mrr_after']:.2f} "
+                 f"(reward: {d['reward']:+.3f})"
+             )
+             lines.append("")
+
+         lines.append(
+             "Learn from these examples. Prioritize channels that resolved similar uncertainties.\n"
+         )
+         return "\n".join(lines)
+
+     def _compute_similarity(self, decision: dict, acquired_so_far: list[str]) -> float:
+         """
+         Compute similarity between a past decision and the current state,
+         based on acquisition-stage overlap.
+         """
+         past_acquired = set(decision["acquired_before"])
+         current_acquired = set(acquired_so_far)
+
+         # Jaccard similarity of the acquisition state
+         if not past_acquired and not current_acquired:
+             return 1.0  # Both at the start
+         union = past_acquired | current_acquired
+         intersection = past_acquired & current_acquired
+         stage_sim = len(intersection) / max(len(union), 1)
+
+         # Bonus for the same acquisition stage (same number of channels acquired)
+         stage_match = 1.0 if len(past_acquired) == len(current_acquired) else 0.5
+
+         return stage_sim * 0.5 + stage_match * 0.5
+
+
+ # ================================================================
+ # Approach 2: Lightweight Policy Network (CPU-only)
+ # ================================================================
+
+ class PolicyNetwork:
+     """
+     Small MLP that predicts which channel to request.
+
+     State features (input):
+     - One-hot: which channels have been acquired
+     - One-hot: which dataset this is
+     - Scalar: current top-1 confidence
+     - Scalar: confidence gap (top1 - top2)
+     - Scalar: acquisition step index (0, 1, 2)
+
+     Output: probability distribution over requestable channels.
+
+     Trained with cross-entropy loss weighted by trajectory reward.
+     Runs entirely on CPU — no GPU needed. This is a <1000-parameter model.
+     """
+
+     def __init__(
+         self,
+         all_channels: list[str],
+         all_datasets: list[str],
+         hidden_dim: int = 32,
+     ):
+         self.all_channels = sorted(all_channels)
+         self.all_datasets = sorted(all_datasets)
+         self.channel_to_idx = {c: i for i, c in enumerate(self.all_channels)}
+         self.dataset_to_idx = {d: i for i, d in enumerate(self.all_datasets)}
+         self.n_channels = len(self.all_channels)
+         self.n_datasets = len(self.all_datasets)
+
+         # Feature dimension: acquired_mask + dataset_onehot + confidence + gap + step
+         self.input_dim = self.n_channels + self.n_datasets + 3
+         self.hidden_dim = hidden_dim
+         self.output_dim = self.n_channels
+
+         # Initialize weights (small random values, He-scaled, CPU numpy)
+         rng = np.random.RandomState(config.SEED)
+         scale1 = np.sqrt(2.0 / self.input_dim)
+         scale2 = np.sqrt(2.0 / hidden_dim)
+
+         self.W1 = rng.randn(self.input_dim, hidden_dim).astype(np.float32) * scale1
+         self.b1 = np.zeros(hidden_dim, dtype=np.float32)
+         self.W2 = rng.randn(hidden_dim, self.output_dim).astype(np.float32) * scale2
+         self.b2 = np.zeros(self.output_dim, dtype=np.float32)
+
+         self.trained = False
+
+     def featurize(
+         self,
+         dataset: str,
+         acquired: list[str],
+         top1_confidence: float,
+         top2_confidence: float,
+         step_idx: int,
+     ) -> np.ndarray:
+         """Convert a state to a feature vector."""
+         features = np.zeros(self.input_dim, dtype=np.float32)
+
+         # Acquired-channels mask
+         for ch in acquired:
+             if ch in self.channel_to_idx:
+                 features[self.channel_to_idx[ch]] = 1.0
+
+         # Dataset one-hot
+         offset = self.n_channels
+         if dataset in self.dataset_to_idx:
+             features[offset + self.dataset_to_idx[dataset]] = 1.0
+
+         # Scalars
+         offset += self.n_datasets
+         features[offset] = top1_confidence
+         features[offset + 1] = top1_confidence - top2_confidence  # Confidence gap
+         features[offset + 2] = step_idx / 3.0  # Normalized step
+
+         return features
+
+     def predict(
+         self,
+         features: np.ndarray,
+         available_channels: list[str],
+     ) -> dict[str, float]:
+         """
+         Forward pass: predict channel-selection probabilities.
+
+         Returns a dict mapping channel_name → probability.
+         Only available (not yet acquired) channels get nonzero probability.
+         """
+         # Forward pass: input → ReLU → softmax (masked)
+         h = np.maximum(0, features @ self.W1 + self.b1)  # ReLU
+         logits = h @ self.W2 + self.b2
+
+         # Mask unavailable channels to -inf
+         mask = np.full(self.output_dim, -1e9, dtype=np.float32)
+         for ch in available_channels:
+             if ch in self.channel_to_idx:
+                 mask[self.channel_to_idx[ch]] = 0.0
+         logits = logits + mask
+
+         # Softmax (shift by the max for numerical stability)
+         logits = logits - logits.max()
+         exp_logits = np.exp(logits)
+         probs = exp_logits / (exp_logits.sum() + 1e-8)
+
+         return {ch: float(probs[self.channel_to_idx[ch]])
+                 for ch in available_channels if ch in self.channel_to_idx}
+
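The masking trick in `predict` — adding a large negative value to the logits of unavailable channels before the softmax — can be checked standalone. A minimal numpy sketch (`masked_softmax` is a hypothetical helper mirroring the forward pass above):

```python
import numpy as np

def masked_softmax(logits, available_mask):
    """Softmax over logits; positions where available_mask is 0 get ~0 probability."""
    masked = logits + (1.0 - available_mask) * -1e9
    masked = masked - masked.max()   # shift by max for numerical stability
    exp = np.exp(masked)
    return exp / exp.sum()

logits = np.array([1.0, 2.0, 0.5, 3.0])
mask = np.array([1.0, 0.0, 1.0, 0.0])   # channels 1 and 3 already acquired
probs = masked_softmax(logits, mask)
# Masked positions collapse to ~0; the remaining mass splits between 0 and 2.
```

The max-shift makes the `-1e9` entries underflow harmlessly to `exp(...) == 0` instead of producing overflow or NaNs.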
+     def train(
+         self,
+         trajectories: list[Trajectory],
+         lr: float = 0.01,
+         n_epochs: int = 100,
+         reward_temperature: float = 1.0,
+     ):
+         """
+         Train the policy network on collected trajectories.
+
+         Uses reward-weighted cross-entropy:
+             loss = -sum(reward * log(P(action|state)))
+
+         Positive rewards encourage the action; negative rewards discourage it.
+         """
+         # Build training data
+         X = []
+         actions = []
+         rewards = []
+         available_masks = []
+
+         for traj in trajectories:
+             for step in traj.steps:
+                 if step.action == "COMMIT":
+                     continue
+                 if step.action not in self.channel_to_idx:
+                     continue
+
+                 # Extract features from the step's state
+                 top1_conf = step.differential_before[0]["confidence"] if step.differential_before else 0.5
+                 top2_conf = step.differential_before[1]["confidence"] if len(step.differential_before) > 1 else 0.0
+
+                 feat = self.featurize(
+                     dataset=traj.dataset,
+                     acquired=step.acquired_so_far,
+                     top1_confidence=top1_conf,
+                     top2_confidence=top2_conf,
+                     step_idx=step.step_idx,
+                 )
+                 X.append(feat)
+                 actions.append(self.channel_to_idx[step.action])
+
+                 # Reward shaping: normalized across trajectories below
+                 rewards.append(step.utility_reward)
+
+                 # Available-channels mask
+                 mask = np.zeros(self.output_dim, dtype=np.float32)
+                 for ch in step.available_channels:
+                     if ch in self.channel_to_idx:
+                         mask[self.channel_to_idx[ch]] = 1.0
+                 available_masks.append(mask)
+
+         if not X:
+             logger.warning("No training data available for policy network")
+             return
+
+         X = np.array(X)
+         actions = np.array(actions)
+         rewards = np.array(rewards)
+         available_masks = np.array(available_masks)
+
+         # Normalize rewards
+         if rewards.std() > 0:
+             rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-8)
+
+         # Apply temperature
+         weights = np.exp(rewards * reward_temperature)
+         weights = weights / weights.sum() * len(weights)  # Normalize to mean=1
+
+         n = len(X)
+         logger.info(f"Training policy network on {n} state-action pairs for {n_epochs} epochs")
+
+         for epoch in range(n_epochs):
+             # Forward pass
+             h = np.maximum(0, X @ self.W1 + self.b1)
+             logits = h @ self.W2 + self.b2
+
+             # Mask unavailable channels
+             logits = logits + (1 - available_masks) * (-1e9)
+
+             # Softmax
+             logits_shifted = logits - logits.max(axis=1, keepdims=True)
+             exp_logits = np.exp(logits_shifted)
+             probs = exp_logits / (exp_logits.sum(axis=1, keepdims=True) + 1e-8)
+
+             # Cross-entropy loss (reward-weighted)
+             action_probs = probs[np.arange(n), actions]
+             loss = -np.mean(weights * np.log(action_probs + 1e-8))
+
+             # Backward pass (manual gradients)
+             # dL/d_logits = probs - one_hot(action), weighted by reward
+             grad_logits = probs.copy()
+             grad_logits[np.arange(n), actions] -= 1.0
+             grad_logits *= weights[:, np.newaxis] / n
+
+             # Gradients for W2, b2
+             grad_W2 = h.T @ grad_logits
375
+ grad_b2 = grad_logits.sum(axis=0)
376
+
377
+ # Gradient for W1, b1 (through ReLU)
378
+ grad_h = grad_logits @ self.W2.T
379
+ grad_h *= (h > 0).astype(np.float32) # ReLU derivative
380
+ grad_W1 = X.T @ grad_h
381
+ grad_b1 = grad_h.sum(axis=0)
382
+
383
+ # Update
384
+ self.W1 -= lr * grad_W1
385
+ self.b1 -= lr * grad_b1
386
+ self.W2 -= lr * grad_W2
387
+ self.b2 -= lr * grad_b2
388
+
389
+ if (epoch + 1) % 20 == 0:
390
+ # Compute accuracy
391
+ predicted = np.argmax(probs, axis=1)
392
+ accuracy = np.mean(predicted == actions)
393
+ logger.info(f" Epoch {epoch + 1}: loss={loss:.4f}, accuracy={accuracy:.3f}")
394
+
395
+ self.trained = True
396
+ logger.info("Policy network training complete")
397
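The backward pass above uses the standard softmax cross-entropy gradient (`probs - one_hot`), reweighted per sample. A self-contained sanity check of that gradient on synthetic data (a single linear layer stands in for the two-layer network; the data, dimensions, and learning rate are illustrative only):

```python
import numpy as np

rng = np.random.default_rng(0)
n, d, k = 64, 8, 4                        # samples, feature dim, channels
X = rng.normal(size=(n, d))
actions = rng.integers(0, k, size=n)
weights = np.ones(n)                      # uniform reward weights for the check

W = np.zeros((d, k))
b = np.zeros(k)
losses = []
for _ in range(50):
    logits = X @ W + b
    p = np.exp(logits - logits.max(axis=1, keepdims=True))
    p /= p.sum(axis=1, keepdims=True)
    losses.append(-np.mean(weights * np.log(p[np.arange(n), actions] + 1e-8)))
    g = p.copy()
    g[np.arange(n), actions] -= 1.0       # dL/dlogits = p - one_hot(action)
    g *= weights[:, None] / n
    W -= 0.1 * (X.T @ g)
    b -= 0.1 * g.sum(axis=0)
```

With zero-initialized weights the first loss equals log(k) (uniform prediction), and gradient descent on this convex objective drives it down, which is a cheap way to verify the hand-derived gradient has the right sign and scale.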
+
+    def get_action(
+        self,
+        case: MedicalCase,
+        acquired: list[str],
+        differential: list[dict],
+        step_idx: int,
+    ) -> str:
+        """Select the best channel to request using the learned policy."""
+        available = [ch for ch in case.requestable_names if ch not in acquired]
+        if not available:
+            return "COMMIT"
+
+        top1_conf = differential[0]["confidence"] if differential else 0.5
+        top2_conf = differential[1]["confidence"] if len(differential) > 1 else 0.0
+
+        features = self.featurize(
+            dataset=case.dataset,
+            acquired=acquired,
+            top1_confidence=top1_conf,
+            top2_confidence=top2_conf,
+            step_idx=step_idx,
+        )
+
+        probs = self.predict(features, available)
+
+        if not probs:
+            return random.choice(available)
+
+        # Select highest probability channel
+        best_channel = max(probs, key=probs.get)
+        return best_channel
+
+    def save(self, path: Path):
+        """Save model weights."""
+        # Note: np.savez appends ".npz" when the suffix is missing, so pass a
+        # path that already ends in .npz to keep save/load paths consistent.
+        np.savez(
+            path,
+            W1=self.W1, b1=self.b1,
+            W2=self.W2, b2=self.b2,
+            channels=self.all_channels,
+            datasets=self.all_datasets,
+        )
+        logger.info(f"Saved policy network to {path}")
+
+    def load(self, path: Path):
+        """Load model weights."""
+        data = np.load(path, allow_pickle=True)
+        self.W1 = data["W1"]
+        self.b1 = data["b1"]
+        self.W2 = data["W2"]
+        self.b2 = data["b2"]
+        self.trained = True
+        logger.info(f"Loaded policy network from {path}")
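`save()`/`load()` round-trip both the weight matrices and the string metadata through a single `.npz` archive; a quick standalone sketch of that round trip (array shapes and channel names here are made up). Note that `np.savez` appends `".npz"` only when the suffix is missing, so using an explicit `.npz` path keeps the save and load paths identical:

```python
import os
import tempfile

import numpy as np

W1 = np.random.default_rng(1).normal(size=(6, 4)).astype(np.float32)
b1 = np.zeros(4, dtype=np.float32)

# Explicit .npz suffix so np.savez does not silently rename the file.
path = os.path.join(tempfile.mkdtemp(), "policy.npz")
np.savez(path, W1=W1, b1=b1, channels=np.array(["oct", "fundus"]))

data = np.load(path, allow_pickle=True)
```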
+
+
+ # ================================================================
+ # Approach 3: Self-Reflective Refinement
+ # ================================================================
+
+ class SelfReflectivePolicy:
+     """
+     The VLM critiques its own past failures and generates improved strategies.
+
+     Pipeline:
+     1. Collect cases where zero-shot acquisition was suboptimal
+        (the agent requested info that didn't help, or missed info that would have)
+     2. Show the VLM its own failure traces and ask it to generate
+        "acquisition rules" — structured if-then policies
+     3. Inject these self-generated rules into the system prompt
+     4. Re-run with the improved prompt
+
+     This is a form of self-play / self-improvement via reflection.
+     """
+
+     def __init__(self, client: BaseVLMClient, dataset_name: str):
+         self.client = client
+         self.dataset_name = dataset_name
+         self.rules: list[str] = []
+
+     def generate_rules_from_failures(
+         self,
+         trajectories: list[Trajectory],
+         n_failure_examples: int = 10,
+     ) -> list[str]:
+         """
+         Analyze failures and generate acquisition rules.
+
+         A "failure" is a case where:
+         - Agent requested a channel with zero or negative utility
+         - Agent didn't request a channel that would have helped
+         - Agent committed too early (final MRR << oracle MRR)
+         """
+         # Collect failure examples
+         failures = []
+
+         for traj in trajectories:
+             if traj.dataset != self.dataset_name:
+                 continue
+
+             # Type 1: Unhelpful acquisitions
+             for step in traj.steps:
+                 if step.action != "COMMIT" and step.utility_reward <= 0:
+                     failures.append({
+                         "type": "unhelpful_acquisition",
+                         "case_id": traj.case_id,
+                         "action": step.action,
+                         "uncertainty": step.uncertainty_text[:200],
+                         "utility_reward": step.utility_reward,
+                         "mrr_reward": step.reward,
+                         "cost": step.acquisition_cost,
+                         "available": step.available_channels,
+                     })
+
+             # Type 2: Premature commitment
+             if traj.final_mrr < traj.oracle_mrr - 0.2:
+                 failures.append({
+                     "type": "premature_commit",
+                     "case_id": traj.case_id,
+                     "acquired": [s.action for s in traj.steps if s.action != "COMMIT"],
+                     "final_mrr": traj.final_mrr,
+                     "oracle_mrr": traj.oracle_mrr,
+                     "gap": traj.oracle_mrr - traj.final_mrr,
+                 })
+
+         if not failures:
+             logger.info("No failures found — zero-shot policy may already be strong")
+             return []
+
+         # Sample failures
+         random.shuffle(failures)
+         sampled = failures[:n_failure_examples]
+
+         # Ask the VLM to analyze and generate rules
+         failure_text = json.dumps(sampled, indent=2, default=str)
+
+         prompt = f"""You are analyzing an AI medical diagnostic agent's acquisition failures on {self.dataset_name} cases.
+ The agent must decide what additional information to request (imaging modalities, clinical data, etc.) before making a diagnosis.
+
+ Here are examples of FAILED acquisition decisions:
+
+ {failure_text}
+
+ Based on these failures, generate 5-8 specific, actionable ACQUISITION RULES that would improve future decisions.
+
+ Format each rule as:
+ RULE N: IF [condition about the current state/uncertainty] THEN [specific acquisition action] BECAUSE [reasoning]
+
+ Rules should be specific to the {self.dataset_name} dataset and its available channels.
+ Focus on patterns across failures, not individual cases.
+ Be concrete — "request OCT when uncertain about subretinal fluid" is better than "request more information when uncertain."
+
+ Respond ONLY with the rules, no preamble."""
+
+         response = self.client.call_with_retry(
+             system_prompt="You are an expert in medical diagnostic AI systems.",
+             user_text=prompt,
+             images=None,
+             temperature=0.3,
+             max_tokens=2048,
+         )
+
+         # Parse rules
+         rules = []
+         for line in response.text.split("\n"):
+             line = line.strip()
+             if line.startswith("RULE") or line.startswith("Rule"):
+                 rules.append(line)
+             elif rules and line and not line.startswith("RULE"):
+                 # Continuation of previous rule
+                 rules[-1] += " " + line
+
+         self.rules = rules
+         logger.info(f"Generated {len(rules)} acquisition rules from {len(sampled)} failures")
+         for r in rules:
+             logger.info(f" {r[:120]}...")
+
+         return rules
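The parsing loop above starts a new rule on lines beginning with `RULE`/`Rule` and folds wrapped lines into the previous rule. The same logic in isolation, exercised on a hypothetical model response:

```python
def parse_rules(text: str) -> list[str]:
    """Collect 'RULE N: ...' lines, merging soft-wrapped continuation lines."""
    rules = []
    for line in text.split("\n"):
        line = line.strip()
        if line.startswith(("RULE", "Rule")):
            rules.append(line)
        elif rules and line:
            rules[-1] += " " + line  # continuation of the previous rule
    return rules

response = """RULE 1: IF top-2 diagnoses both involve retinal fluid THEN REQUEST oct
BECAUSE OCT resolves subretinal fluid directly.
RULE 2: IF top-1 confidence exceeds 0.8 THEN avoid expensive channels."""
rules = parse_rules(response)
```

One subtlety this makes visible: any preamble text before the first `RULE` line is silently dropped, which is exactly what the "Respond ONLY with the rules" instruction in the prompt guards against.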
+
+     def get_enhanced_system_prompt(self, base_prompt: str) -> str:
+         """
+         Inject learned rules into the system prompt.
+
+         This is the key mechanism: the VLM's behavior is modified
+         by giving it its own self-generated rules as instructions.
+         """
+         if not self.rules:
+             return base_prompt
+
+         rules_text = "\n".join(self.rules)
+         injection = f"""
+
+ LEARNED ACQUISITION STRATEGY (from analyzing past diagnostic cases):
+ The following rules have been learned from analyzing cases where acquisition
+ decisions were suboptimal. Apply these rules when deciding what information to request:
+
+ {rules_text}
+
+ Apply these rules in addition to your general diagnostic reasoning."""
+
+         return base_prompt + injection
+
+     def save_rules(self, path: Path):
+         """Save generated rules."""
+         with open(path, "w") as f:
+             json.dump({"dataset": self.dataset_name, "rules": self.rules}, f, indent=2)
+
+     def load_rules(self, path: Path):
+         """Load previously generated rules."""
+         with open(path) as f:
+             data = json.load(f)
+         self.rules = data["rules"]
+         logger.info(f"Loaded {len(self.rules)} rules for {self.dataset_name}")
prompts.py ADDED
@@ -0,0 +1,228 @@
+ """
+ Prompt templates for ActiveMedAgent.
+
+ Three semantically equivalent but lexically different variants (A/B/C)
+ for prompt sensitivity analysis.
+
+ Each prompt has:
+ - system_prompt: Sets the agent's role and reasoning format
+ - acquisition_prompt: Asks the agent to decide what to request next
+ - diagnosis_prompt: Asks the agent to commit to a ranked differential
+ """
+
+ # ============================================================
+ # Channel description formatters
+ # ============================================================
+
+ def format_available_channels(channels: dict, already_acquired: list[str]) -> str:
+     """Format the list of requestable channels for the prompt."""
+     lines = []
+     sortable = []
+     for name, info in channels.items():
+         if info.get("always_given"):
+             continue
+         if name in already_acquired:
+             continue
+         sortable.append((info.get("order", 999), info.get("cost", 0.0), name, info))
+     for _, _, name, info in sorted(sortable):
+         cost = float(info.get("cost", 0.0))
+         tier = info.get("tier", "unknown")
+         lines.append(
+             f" - [{name}]: {info['description']} "
+             f"(tier: {tier}, cost: ${cost:,.0f})"
+         )
+     if not lines:
+         return " (No additional information available to request.)"
+     return "\n".join(lines)
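On a toy channel dict, the formatter sorts by `(order, cost)` and emits one bullet per requestable channel. A condensed standalone copy of the same logic (the channel names, costs, and tiers here are made up for illustration):

```python
def fmt(channels: dict, acquired: list[str]) -> str:
    """Condensed version of format_available_channels for illustration."""
    rows = sorted(
        (info.get("order", 999), info.get("cost", 0.0), name, info)
        for name, info in channels.items()
        if not info.get("always_given") and name not in acquired
    )
    return "\n".join(
        f" - [{name}]: {info['description']} "
        f"(tier: {info.get('tier', 'unknown')}, cost: ${float(info.get('cost', 0.0)):,.0f})"
        for _, _, name, info in rows
    )

channels = {
    "fundus": {"description": "Color fundus photo", "order": 1, "cost": 50, "tier": "cheap"},
    "oct": {"description": "OCT B-scan", "order": 2, "cost": 1200, "tier": "imaging"},
}
out = fmt(channels, acquired=["fundus"])
```

Already-acquired channels are filtered out, and the `${cost:,.0f}` format renders thousands separators, so the single remaining line reads ` - [oct]: OCT B-scan (tier: imaging, cost: $1,200)`.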
+
+
+ def format_acquired_info(acquired_data: dict) -> str:
+     """Format all previously acquired information for context."""
+     if not acquired_data:
+         return "(No additional information acquired yet.)"
+     parts = []
+     for channel_name, content in acquired_data.items():
+         if content["type"] == "text":
+             parts.append(f"[{channel_name}]: {content['value']}")
+         elif content["type"] == "image":
+             parts.append(f"[{channel_name}]: (image provided)")
+     return "\n".join(parts)
+
+
+ # ============================================================
+ # Prompt Variant A — Clinical Framing
+ # ============================================================
+
+ VARIANT_A = {
+     "name": "clinical",
+
+     "system_prompt": """You are an experienced physician performing a diagnostic evaluation. \
+ You will be shown a medical image and possibly additional clinical information. \
+ Your goal is to arrive at the most accurate diagnosis by strategically requesting \
+ the most informative additional data.
+
+ You reason through cases using a structured clinical approach:
+ 1. OBSERVATION: Describe what you see in the available image(s) and data.
+ 2. DIFFERENTIAL: List your top 3-5 candidate diagnoses ranked by likelihood, with confidence estimates (0-1).
+ 3. UNCERTAINTY: Identify specifically what you are uncertain about — which diagnoses cannot be distinguished with current information and WHY.
+ 4. ACTION: You MUST request one additional piece of information. Choose the one that would best disambiguate your top differential diagnoses.
+
+ CRITICAL: You must ALWAYS use your remaining budget to request information. \
+ Do NOT commit early — additional information almost always improves diagnostic accuracy. \
+ Always respond in this exact structured format.""",
+
+     "acquisition_prompt": """You have {remaining_budget} request(s) remaining. You MUST use them.
+
+ Available information you can request:
+ {available_channels}
+
+ Previously acquired information:
+ {acquired_info}
+
+ Think carefully: which available channel would MOST help distinguish between your top diagnoses?
+
+ Respond in EXACTLY this format:
+ OBSERVATION: [What you observe from all currently available information]
+ DIFFERENTIAL: [Ranked list — format each as "N. DiagnosisName (confidence: X.XX)"]
+ UNCERTAINTY: [Which two diagnoses are hardest to tell apart, and what specific information would resolve it]
+ ACTION: REQUEST [channel_name]
+
+ IMPORTANT: Replace [channel_name] with exactly one of the available channel names listed above. \
+ You MUST request a channel — do not skip or commit early.""",
+
+     "diagnosis_prompt": """You strategically gathered the most relevant clinical information. \
+ Now provide your final diagnosis. Focus on the evidence you acquired — it was selected \
+ specifically to resolve diagnostic uncertainty.
+
+ Information you gathered:
+ {acquired_info}
+
+ Candidate diagnoses to rank:
+ {candidates}
+
+ Respond in the structured format:
+ OBSERVATION: [Synthesis of the key findings from your acquired information]
+ DIFFERENTIAL: [Ranked candidates — format each as "N. DiagnosisName (confidence: X.XX)"]
+ REASONING: [Key evidence from your acquired data supporting your top diagnosis and ruling out alternatives]""",
+ }
+
+
+ # ============================================================
+ # Prompt Variant B — Information-Theoretic Framing
+ # ============================================================
+
+ VARIANT_B = {
+     "name": "information_theoretic",
+
+     "system_prompt": """You are an AI diagnostic system analyzing medical data under \
+ conditions of incomplete information. You process available evidence and estimate which \
+ additional data sources would most reduce your diagnostic uncertainty.
+
+ Your reasoning follows a structured protocol:
+ 1. EVIDENCE: Catalog the findings from all available inputs.
+ 2. HYPOTHESES: Rank candidate diagnoses by posterior probability (0-1, must sum to ≤1).
+ 3. INFORMATION GAP: Identify the highest-uncertainty region of your hypothesis space.
+ 4. ACQUISITION: Select the data source with highest expected information gain, or finalize.
+
+ Always respond in this exact structured format. Be precise with probabilities.""",
+
+     "acquisition_prompt": """Analyze your current diagnostic uncertainty and determine the \
+ optimal next data acquisition. You have {remaining_budget} acquisition(s) remaining.
+
+ Requestable data sources:
+ {available_channels}
+
+ Previously acquired data:
+ {acquired_info}
+
+ Respond in the structured format:
+ EVIDENCE: [Findings extracted from all currently available data]
+ HYPOTHESES: [Ranked list — format each as "N. DiagnosisName (probability: X.XX)"]
+ INFORMATION GAP: [Which distinction between top hypotheses cannot be resolved with current data, and why]
+ ACQUISITION: REQUEST [channel_name] — [expected information gain explanation]
+
+ If your top hypothesis probability exceeds 0.8 and is well-separated from alternatives:
+ ACQUISITION: FINALIZE""",
+
+     "diagnosis_prompt": """All data acquisition is complete. Produce your final ranked \
+ hypothesis set.
+
+ Accumulated data:
+ {acquired_info}
+
+ Candidate diagnoses to rank:
+ {candidates}
+
+ Respond in the structured format:
+ EVIDENCE: [Complete synthesis of all acquired data]
+ HYPOTHESES: [Final ranked candidates — format each as "N. DiagnosisName (probability: X.XX)"]
+ JUSTIFICATION: [Evidence chain supporting top hypothesis; contradicting evidence for alternatives]""",
+ }
+
+
+ # ============================================================
+ # Prompt Variant C — Neutral/Minimal Framing
+ # ============================================================
+
+ VARIANT_C = {
+     "name": "neutral",
+
+     "system_prompt": """You are assisting with medical image analysis. Given a medical image \
+ and possibly additional information, identify the most likely diagnosis from a set of candidates.
+
+ You may request additional information before making your final decision. Structure your \
+ response as follows:
+ 1. FINDINGS: What you observe.
+ 2. RANKING: Candidate diagnoses ranked with confidence scores (0-1).
+ 3. GAPS: What you don't know that would help.
+ 4. DECISION: Request more info or commit.""",
+
+     "acquisition_prompt": """You may request one more piece of information. \
+ {remaining_budget} request(s) left.
+
+ Options:
+ {available_channels}
+
+ Information so far:
+ {acquired_info}
+
+ Respond:
+ FINDINGS: [Current observations]
+ RANKING: [Format: "N. DiagnosisName (confidence: X.XX)"]
+ GAPS: [What's missing]
+ DECISION: REQUEST [channel_name] — [reason]
+
+ Or if ready:
+ DECISION: COMMIT""",
+
+     "diagnosis_prompt": """Provide your final diagnosis ranking.
+
+ All information:
+ {acquired_info}
+
+ Candidates:
+ {candidates}
+
+ Respond:
+ FINDINGS: [Summary]
+ RANKING: [Format: "N. DiagnosisName (confidence: X.XX)"]
+ REASONING: [Brief justification]""",
+ }
+
+
+ # ============================================================
+ # Variant Registry
+ # ============================================================
+
+ PROMPT_VARIANTS = {
+     "A": VARIANT_A,
+     "B": VARIANT_B,
+     "C": VARIANT_C,
+ }
+
+
+ def get_prompt_variant(variant_id: str) -> dict:
+     """Retrieve a prompt variant by ID."""
+     if variant_id not in PROMPT_VARIANTS:
+         raise ValueError(f"Unknown prompt variant: {variant_id}. Choose from {list(PROMPT_VARIANTS.keys())}")
+     return PROMPT_VARIANTS[variant_id]
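Each variant's prompts are plain `str.format` templates, so filling the placeholders is a one-liner. A sketch with a minimal stand-in template (the real module's variants carry the full instructions, but the mechanics are identical):

```python
# Minimal stand-in for one variant; the real module defines VARIANT_A/B/C.
VARIANT = {
    "acquisition_prompt": """You have {remaining_budget} request(s) remaining.

Options:
{available_channels}

Acquired:
{acquired_info}""",
}

prompt = VARIANT["acquisition_prompt"].format(
    remaining_budget=2,
    available_channels=" - [oct]: OCT B-scan",
    acquired_info="(none yet)",
)
```

Because the templates use named placeholders, callers can build the three fields independently (e.g. via `format_available_channels` and `format_acquired_info`) and the variants stay interchangeable.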
reasoning_analysis.py ADDED
@@ -0,0 +1,612 @@
+ """
+ Reasoning Faithfulness & Acquisition Pattern Analysis.
+
+ Key analyses for ACL/EMNLP submission:
+
+ 1. Reasoning Faithfulness: Does the agent's stated reasoning match
+    actual information gain? When it says "I need X to distinguish
+    A from B", does X actually shift probability between A and B?
+
+ 2. Acquisition Order Patterns: What ordering strategies do different
+    models learn? Are they consistent? Do they match clinical guidelines?
+
+ 3. Error Analysis: When the agent commits early and is wrong, what
+    went wrong in the reasoning chain?
+
+ 4. Stopping Decision Quality: Are the agent's commit decisions well-timed?
+ """
+ import json
+ import logging
+ import re
+ from collections import Counter, defaultdict
+ from dataclasses import dataclass, field
+ from pathlib import Path
+
+ import numpy as np
+ from scipy.stats import spearmanr, kendalltau
+
+ from agent import AgentResult, AcquisitionStep
+ from datasets.base import MedicalCase
+ from information_gain import compute_entropy, compute_kl_divergence
+ from evaluation import evaluate_single_case, compute_reciprocal_rank
+
+ logger = logging.getLogger(__name__)
+
+
+ # ================================================================
+ # 1. Reasoning Faithfulness
+ # ================================================================
+
+ @dataclass
+ class FaithfulnessMetrics:
+     """Per-step reasoning faithfulness measurement."""
+     case_id: str
+     step: int
+     channel_requested: str
+     # What the agent said
+     stated_reasoning: str
+     stated_if_positive: str
+     stated_if_negative: str
+     # What actually happened
+     target_diagnosis_before: float  # Probability of stated target before
+     target_diagnosis_after: float  # Probability of stated target after
+     actual_shift: float  # Change in target probability
+     shift_direction_correct: bool  # Did it shift the way the agent predicted?
+     # Information metrics
+     entropy_before: float
+     entropy_after: float
+     actual_ig: float
+     predicted_useful: bool  # Agent thought this would help
+     actually_useful: bool  # IG > 0.05 bits
+
+
+ def compute_reasoning_faithfulness(
+     results: list[AgentResult],
+     cases: list[MedicalCase],
+ ) -> dict:
+     """
+     Measure whether the agent's stated reasoning matches what actually
+     happens when information is acquired.
+
+     For each acquisition step where the agent states expected_impact:
+     - Extract the target diagnosis (if_positive/if_negative)
+     - Compare probability of that diagnosis before and after acquisition
+     - Check if the shift matches the agent's prediction
+
+     Returns aggregate faithfulness metrics.
+     """
+     per_step_metrics = []
+     direction_correct_count = 0
+     useful_when_predicted = 0
+     total_with_impact = 0
+
+     for result, case in zip(results, cases):
+         for i, step in enumerate(result.steps):
+             if step.committed or not step.expected_impact:
+                 continue
+             if not step.differential:
+                 continue
+
+             total_with_impact += 1
+
+             # Current distribution (before receiving the info)
+             current_dist = {
+                 d.get("name", ""): d.get("confidence", 0)
+                 for d in step.differential
+             }
+
+             # Find next step's distribution (after receiving the info)
+             next_dist = None
+             if i + 1 < len(result.steps) and result.steps[i + 1].differential:
+                 next_dist = {
+                     d.get("name", ""): d.get("confidence", 0)
+                     for d in result.steps[i + 1].differential
+                 }
+
+             if next_dist is None:
+                 continue
+
+             # Get the target diagnosis from expected_impact
+             pos_target = step.expected_impact.get("if_positive", "")
+             neg_target = step.expected_impact.get("if_negative", "")
+
+             # Find probability of positive target before and after
+             pos_before = _fuzzy_lookup(current_dist, pos_target)
+             pos_after = _fuzzy_lookup(next_dist, pos_target)
+             neg_before = _fuzzy_lookup(current_dist, neg_target)
+             neg_after = _fuzzy_lookup(next_dist, neg_target)
+
+             # The agent predicted that this channel would help distinguish
+             # between pos_target and neg_target. Did the gap widen?
+             gap_before = abs(pos_before - neg_before)
+             gap_after = abs(pos_after - neg_after)
+             gap_widened = gap_after > gap_before
+
+             # Did probability shift in the stated direction?
+             actual_shift = pos_after - pos_before
+             shift_correct = gap_widened  # More discriminating = correct prediction
+
+             if shift_correct:
+                 direction_correct_count += 1
+
+             # Was the channel actually useful?
+             entropy_before = compute_entropy(current_dist)
+             entropy_after = compute_entropy(next_dist)
+             actual_ig = entropy_before - entropy_after
+             actually_useful = actual_ig > 0.05
+
+             if actually_useful:
+                 useful_when_predicted += 1
+
+             metrics = FaithfulnessMetrics(
+                 case_id=result.case_id,
+                 step=step.step,
+                 channel_requested=step.requested_channel or "",
+                 stated_reasoning=step.reasoning[:200],
+                 stated_if_positive=pos_target,
+                 stated_if_negative=neg_target,
+                 target_diagnosis_before=pos_before,
+                 target_diagnosis_after=pos_after,
+                 actual_shift=actual_shift,
+                 shift_direction_correct=shift_correct,
+                 entropy_before=entropy_before,
+                 entropy_after=entropy_after,
+                 actual_ig=actual_ig,
+                 predicted_useful=True,
+                 actually_useful=actually_useful,
+             )
+             per_step_metrics.append(metrics)
+
+     n = len(per_step_metrics)
+     return {
+         "n_steps_analyzed": n,
+         "n_with_expected_impact": total_with_impact,
+         "direction_accuracy": direction_correct_count / n if n > 0 else 0,
+         "utility_precision": useful_when_predicted / n if n > 0 else 0,
+         "mean_actual_ig": float(np.mean([m.actual_ig for m in per_step_metrics])) if per_step_metrics else 0,
+         "mean_absolute_shift": float(np.mean([abs(m.actual_shift) for m in per_step_metrics])) if per_step_metrics else 0,
+         "per_step_details": [
+             {
+                 "case_id": m.case_id,
+                 "step": m.step,
+                 "channel": m.channel_requested,
+                 "direction_correct": m.shift_direction_correct,
+                 "actual_ig": round(m.actual_ig, 4),
+                 "actually_useful": m.actually_useful,
+                 "stated_reasoning": m.stated_reasoning[:100],
+             }
+             for m in per_step_metrics
+         ],
+     }
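`compute_entropy` is imported from `information_gain` (not shown in this chunk); assuming the conventional definition — normalize the confidence dict, then Shannon entropy in bits — the `actual_ig` computed above is just the entropy drop between consecutive differentials. A sketch of that assumed definition on made-up distributions:

```python
import numpy as np

def entropy_bits(dist: dict[str, float]) -> float:
    """Shannon entropy in bits of a (possibly unnormalized) confidence dict."""
    p = np.array([v for v in dist.values() if v > 0], dtype=float)
    p = p / p.sum()
    return float(-(p * np.log2(p)).sum())

before = {"DME": 0.4, "AMD": 0.4, "CSR": 0.2}   # uncertain differential
after = {"DME": 0.85, "AMD": 0.10, "CSR": 0.05}  # after a useful acquisition
ig = entropy_bits(before) - entropy_bits(after)  # positive: acquisition helped
```

Under this convention the 0.05-bit threshold for `actually_useful` is comfortably cleared by any acquisition that meaningfully sharpens the differential, as in the toy example above.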
+
+
+ # ================================================================
+ # 2. Acquisition Order Patterns
+ # ================================================================
+
+ def analyze_acquisition_orders(
+     results: list[AgentResult],
+     cases: list[MedicalCase],
+     clinical_order: dict[str, list[str]] | None = None,
+ ) -> dict:
+     """
+     Analyze what acquisition ordering strategies the agent uses.
+
+     Returns:
+     - Most common first/second/third channel requests
+     - Order consistency across cases
+     - Correlation with clinical guideline order
+     - Correlation between acquisition order and case difficulty
+     """
+     from baselines import CLINICAL_GUIDELINE_ORDER
+     if clinical_order is None:
+         clinical_order = CLINICAL_GUIDELINE_ORDER
+
+     # Collect all acquisition sequences
+     sequences = []
+     first_requests = Counter()
+     second_requests = Counter()
+     full_sequences = Counter()
+
+     for result in results:
+         seq = tuple(result.acquired_channels)
+         sequences.append(seq)
+         full_sequences[seq] += 1
+
+         if len(seq) >= 1:
+             first_requests[seq[0]] += 1
+         if len(seq) >= 2:
+             second_requests[seq[1]] += 1
+
+     n = len(sequences)
+
+     # Consistency: what fraction of cases share the most common first request?
+     most_common_first = first_requests.most_common(1)
+     first_consistency = most_common_first[0][1] / n if most_common_first and n > 0 else 0
+
+     # Unique sequences
+     n_unique = len(full_sequences)
+
+     # Correlation with clinical guideline order
+     guideline_correlations = []
+     for result, case in zip(results, cases):
+         ds = case.dataset
+         if ds not in clinical_order:
+             continue
+
+         gl_order = clinical_order[ds]
+         agent_order = result.acquired_channels
+
+         if len(agent_order) < 2:
+             continue
+
+         # Compute rank correlation
+         # Map channels to their guideline rank
+         gl_ranks = {ch: i for i, ch in enumerate(gl_order)}
+         agent_ranks = {ch: i for i, ch in enumerate(agent_order)}
+
+         common = set(agent_order) & set(gl_order)
+         if len(common) < 2:
+             continue
+
+         gl_r = [gl_ranks.get(ch, len(gl_order)) for ch in agent_order if ch in common]
+         ag_r = list(range(len(gl_r)))
+
+         if len(gl_r) >= 2:
+             corr, pval = spearmanr(gl_r, ag_r)
+             if not np.isnan(corr):
+                 guideline_correlations.append(corr)
+
+     # Cost efficiency: does the agent prefer cheaper channels first?
+     cost_order_correlations = []
+     for result, case in zip(results, cases):
+         if len(result.acquired_channels) < 2:
+             continue
+
+         costs = [case.get_channel_cost(ch) for ch in result.acquired_channels]
+         positions = list(range(len(costs)))
+
+         if len(set(costs)) > 1:
+             corr, _ = spearmanr(costs, positions)
+             if not np.isnan(corr):
+                 cost_order_correlations.append(corr)
+
+     return {
+         "n_cases": n,
+         "n_unique_sequences": n_unique,
+         "sequence_entropy": _sequence_entropy(full_sequences, n),
+         "first_request_distribution": dict(first_requests.most_common()),
+         "first_request_consistency": first_consistency,
+         "second_request_distribution": dict(second_requests.most_common()),
+         "most_common_sequences": [
+             {"sequence": list(seq), "count": count}
+             for seq, count in full_sequences.most_common(5)
+         ],
+         "guideline_correlation": {
+             "mean": float(np.mean(guideline_correlations)) if guideline_correlations else None,
+             "std": float(np.std(guideline_correlations)) if guideline_correlations else None,
+             "n_comparable": len(guideline_correlations),
+         },
+         "cost_order_correlation": {
+             "mean": float(np.mean(cost_order_correlations)) if cost_order_correlations else None,
+             "std": float(np.std(cost_order_correlations)) if cost_order_correlations else None,
+             "interpretation": (
+                 "positive = cheaper first, negative = expensive first"
+             ),
+         },
+         "mean_channels_acquired": float(np.mean([len(s) for s in sequences])),
+     }
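The guideline correlation above reduces to a rank correlation between the agent's acquisition positions and the guideline ranks. A hand-rolled, tie-free Spearman (which matches `scipy.stats.spearmanr` on inputs like these; the channel names are made up) makes the sign convention concrete:

```python
import numpy as np

def spearman(x, y):
    """Spearman rho for tie-free sequences (rank correlation)."""
    rx = np.argsort(np.argsort(x)).astype(float)
    ry = np.argsort(np.argsort(y)).astype(float)
    rx -= rx.mean()
    ry -= ry.mean()
    return float((rx * ry).sum() / np.sqrt((rx ** 2).sum() * (ry ** 2).sum()))

guideline = ["history", "labs", "xray", "ct"]
gl_rank = {ch: i for i, ch in enumerate(guideline)}

agent_order = ["history", "labs", "ct"]      # guideline-concordant order
gl_r = [gl_rank[ch] for ch in agent_order]   # guideline ranks: [0, 1, 3]
ag_r = list(range(len(gl_r)))                # agent positions: [0, 1, 2]
rho = spearman(gl_r, ag_r)                   # +1: same relative ordering
```

A fully guideline-concordant sequence yields rho = +1 even when channels are skipped, and a fully reversed sequence yields -1, which is the interpretation the aggregate means rely on.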
+
+
+ # ================================================================
+ # 3. Error Analysis
+ # ================================================================
+
+ @dataclass
+ class ErrorCase:
+     """Detailed analysis of a single error case."""
+     case_id: str
+     ground_truth: str
+     agent_top1: str
+     agent_confidence: float
+     n_acquired: int
+     acquired_channels: list[str]
+     committed_early: bool
+     missed_channels: list[str]
+     error_type: str  # "overconfident_early", "wrong_after_all", "insufficient_info"
+     reasoning_chain: list[str]
+     entropy_at_commit: float
+     final_ig_trend: str  # "increasing", "decreasing", "plateau"
+
+
+ def analyze_errors(
+     results: list[AgentResult],
+     cases: list[MedicalCase],
+ ) -> dict:
+     """
+     Detailed error analysis: when and why the agent gets cases wrong.
+
+     Categorizes errors into:
+     1. Overconfident early commit — committed before gathering enough info
+     2. Wrong after all info — had all info but still wrong (reasoning failure)
+     3. Insufficient info — didn't have the right channels (missing key evidence)
+     """
+     errors = []
+     correct_count = 0
+     total = len(results)
+
+     for result, case in zip(results, cases):
+         if not result.final_ranking:
+             continue
+
+         top = result.final_ranking[0]
+         top_name = top.get("name", "").strip().lower()
+         gt = case.ground_truth.strip().lower()
+         correct = top_name == gt or top_name in gt or gt in top_name
+
+         if correct:
+             correct_count += 1
+             continue
+
+         # Classify error type
+         all_requestable = set(case.requestable_channels.keys())
+         acquired = set(result.acquired_channels)
+         missed = list(all_requestable - acquired)
+
+         if result.committed_early and missed:
+             error_type = "overconfident_early"
+         elif not missed:
+             error_type = "wrong_after_all"
+         else:
+             error_type = "insufficient_info"
362
+
363
+ # Extract reasoning chain
364
+ reasoning_chain = []
365
+ for step in result.steps:
366
+ if step.reasoning:
367
+ reasoning_chain.append(
368
+ f"Step {step.step}: {step.reasoning[:150]}"
369
+ )
370
+
371
+ # Entropy trend
372
+ entropies = [s.entropy for s in result.steps if s.entropy > 0]
373
+ if len(entropies) >= 2:
374
+ diffs = [entropies[i+1] - entropies[i] for i in range(len(entropies)-1)]
375
+ if all(d <= 0 for d in diffs):
376
+ trend = "decreasing"
377
+ elif all(d >= 0 for d in diffs):
378
+ trend = "increasing"
379
+ else:
380
+ trend = "non_monotonic"
381
+ else:
382
+ trend = "insufficient_data"
383
+
384
+ entropy_at_commit = entropies[-1] if entropies else 0.0
385
+
386
+ error = ErrorCase(
387
+ case_id=result.case_id,
388
+ ground_truth=case.ground_truth,
389
+ agent_top1=top.get("name", ""),
390
+ agent_confidence=top.get("confidence", 0),
391
+ n_acquired=len(result.acquired_channels),
392
+ acquired_channels=result.acquired_channels,
393
+ committed_early=result.committed_early,
394
+ missed_channels=missed,
395
+ error_type=error_type,
396
+ reasoning_chain=reasoning_chain,
397
+ entropy_at_commit=entropy_at_commit,
398
+ final_ig_trend=trend,
399
+ )
400
+ errors.append(error)
401
+
402
+ # Aggregate by error type
403
+ type_counts = Counter(e.error_type for e in errors)
404
+ n_errors = len(errors)
405
+
406
+ # Confidence distribution for errors vs correct
407
+ error_confidences = [e.agent_confidence for e in errors]
408
+
409
+ return {
410
+ "n_total": total,
411
+ "n_correct": correct_count,
412
+ "n_errors": n_errors,
413
+ "accuracy": correct_count / total if total > 0 else 0,
414
+ "error_type_distribution": {
415
+ "overconfident_early": type_counts.get("overconfident_early", 0),
416
+ "wrong_after_all": type_counts.get("wrong_after_all", 0),
417
+ "insufficient_info": type_counts.get("insufficient_info", 0),
418
+ },
419
+ "error_type_rates": {
420
+ etype: count / n_errors if n_errors > 0 else 0
421
+ for etype, count in type_counts.items()
422
+ },
423
+ "mean_error_confidence": float(np.mean(error_confidences)) if error_confidences else 0,
424
+ "mean_error_channels_acquired": float(np.mean([e.n_acquired for e in errors])) if errors else 0,
425
+ "entropy_at_commit": {
426
+ "mean": float(np.mean([e.entropy_at_commit for e in errors])) if errors else 0,
427
+ "std": float(np.std([e.entropy_at_commit for e in errors])) if errors else 0,
428
+ },
429
+ "ig_trend_distribution": dict(Counter(e.final_ig_trend for e in errors)),
430
+ "per_case_errors": [
431
+ {
432
+ "case_id": e.case_id,
433
+ "ground_truth": e.ground_truth,
434
+ "predicted": e.agent_top1,
435
+ "confidence": e.agent_confidence,
436
+ "error_type": e.error_type,
437
+ "n_acquired": e.n_acquired,
438
+ "missed": e.missed_channels,
439
+ "committed_early": e.committed_early,
440
+ "entropy_at_commit": round(e.entropy_at_commit, 3),
441
+ }
442
+ for e in errors
443
+ ],
444
+ }
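The three-way error taxonomy is a small decision rule over which channels were missed and whether the agent stopped early. A minimal sketch (channel names are hypothetical):

```python
# Sketch of the error-type classification used above: the error label
# depends only on early commitment and which channels went unrequested.
def classify_error(committed_early, acquired, requestable):
    missed = set(requestable) - set(acquired)
    if committed_early and missed:
        return "overconfident_early"   # stopped before seeing everything
    if not missed:
        return "wrong_after_all"       # saw everything, still wrong
    return "insufficient_info"         # budget/loop ended with channels unseen

print(classify_error(True, {"history"}, {"history", "ct_scan"}))
# overconfident_early
```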
+
+
+ # ================================================================
+ # 4. Stopping Decision Quality
+ # ================================================================
+
+ def analyze_stopping_decisions(
+     results: list[AgentResult],
+     cases: list[MedicalCase],
+ ) -> dict:
+     """
+     Analyze whether the agent's commit decisions are well-timed.
+
+     Compares:
+     - Cases where agent committed early and was correct (good early stop)
+     - Cases where agent committed early and was wrong (premature stop)
+     - Cases that used all channels (necessary thoroughness vs wasted budget)
+     """
+     early_correct = []
+     early_wrong = []
+     full_correct = []
+     full_wrong = []
+
+     for result, case in zip(results, cases):
+         if not result.final_ranking:
+             continue
+
+         top = result.final_ranking[0]
+         top_name = top.get("name", "").strip().lower()
+         gt = case.ground_truth.strip().lower()
+         correct = top_name == gt or top_name in gt or gt in top_name
+         n_requestable = len(case.requestable_channels)
+         n_acquired = len(result.acquired_channels)
+
+         entry = {
+             "case_id": result.case_id,
+             "confidence": top.get("confidence", 0),
+             "n_acquired": n_acquired,
+             "n_available": n_requestable,
+             "fraction_used": n_acquired / n_requestable if n_requestable > 0 else 1,
+             "cost": result.acquisition_cost,
+         }
+
+         if result.committed_early:
+             if correct:
+                 early_correct.append(entry)
+             else:
+                 early_wrong.append(entry)
+         else:
+             if correct:
+                 full_correct.append(entry)
+             else:
+                 full_wrong.append(entry)
+
+     def _summarize(entries):
+         if not entries:
+             return {"count": 0}
+         return {
+             "count": len(entries),
+             "mean_confidence": float(np.mean([e["confidence"] for e in entries])),
+             "mean_channels": float(np.mean([e["n_acquired"] for e in entries])),
+             "mean_fraction_used": float(np.mean([e["fraction_used"] for e in entries])),
+             "mean_cost": float(np.mean([e["cost"] for e in entries])),
+         }
+
+     total = len(results)
+     early_rate = (len(early_correct) + len(early_wrong)) / total if total > 0 else 0
+     early_precision = (
+         len(early_correct) / (len(early_correct) + len(early_wrong))
+         if (len(early_correct) + len(early_wrong)) > 0 else 0
+     )
+
+     return {
+         "n_total": total,
+         "early_commit_rate": early_rate,
+         "early_commit_precision": early_precision,
+         "early_correct": _summarize(early_correct),
+         "early_wrong": _summarize(early_wrong),
+         "full_correct": _summarize(full_correct),
+         "full_wrong": _summarize(full_wrong),
+         "cost_savings_from_early_commit": {
+             "mean_cost_early": float(np.mean(
+                 [e["cost"] for e in early_correct + early_wrong]
+             )) if (early_correct or early_wrong) else 0,
+             "mean_cost_full": float(np.mean(
+                 [e["cost"] for e in full_correct + full_wrong]
+             )) if (full_correct or full_wrong) else 0,
+         },
+     }
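The two headline numbers from this analysis — early-commit rate and early-commit precision — can be sketched on their own, assuming only a list of (committed_early, correct) pairs:

```python
# Sketch of the stopping-quality summary: how often the agent stops
# early, and how often those early stops are correct.
def early_commit_stats(outcomes):
    """outcomes: list of (committed_early: bool, correct: bool) per case."""
    early = [correct for committed, correct in outcomes if committed]
    rate = len(early) / len(outcomes) if outcomes else 0.0
    precision = sum(early) / len(early) if early else 0.0
    return rate, precision

# Four hypothetical cases: two early commits, one of which was right.
rate, prec = early_commit_stats(
    [(True, True), (True, False), (False, True), (False, False)]
)
print(rate, prec)  # 0.5 0.5
```

High rate with low precision indicates the commit criterion is too aggressive; low rate with high precision suggests budget is being wasted on cases already solved.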
+
+
+ # ================================================================
+ # Full Analysis Pipeline
+ # ================================================================
+
+ def run_reasoning_analysis(
+     results: list[AgentResult],
+     cases: list[MedicalCase],
+     save_dir: Path | None = None,
+ ) -> dict:
+     """Run all reasoning analyses and return combined results."""
+     logger.info("Running reasoning analysis...")
+
+     faithfulness = compute_reasoning_faithfulness(results, cases)
+     logger.info(
+         f"  Faithfulness: direction_accuracy={faithfulness['direction_accuracy']:.3f}, "
+         f"utility_precision={faithfulness['utility_precision']:.3f}"
+     )
+
+     orders = analyze_acquisition_orders(results, cases)
+     logger.info(
+         f"  Order patterns: {orders['n_unique_sequences']} unique sequences, "
+         f"first_consistency={orders['first_request_consistency']:.3f}"
+     )
+
+     errors = analyze_errors(results, cases)
+     logger.info(
+         f"  Errors: {errors['n_errors']}/{errors['n_total']} "
+         f"({errors['error_type_distribution']})"
+     )
+
+     stopping = analyze_stopping_decisions(results, cases)
+     logger.info(
+         f"  Stopping: early_rate={stopping['early_commit_rate']:.3f}, "
+         f"early_precision={stopping['early_commit_precision']:.3f}"
+     )
+
+     output = {
+         "reasoning_faithfulness": faithfulness,
+         "acquisition_orders": orders,
+         "error_analysis": errors,
+         "stopping_decisions": stopping,
+     }
+
+     if save_dir:
+         save_dir.mkdir(parents=True, exist_ok=True)
+         # Round-trip through JSON to drop non-serializable details for a compact save
+         compact = json.loads(json.dumps(output, default=str))
+         with open(save_dir / "reasoning_analysis.json", "w") as f:
+             json.dump(compact, f, indent=2)
+         logger.info(f"  Saved to {save_dir / 'reasoning_analysis.json'}")
+
+     return output
+
+
+ # ================================================================
+ # Helpers
+ # ================================================================
+
+ def _fuzzy_lookup(dist: dict, target: str) -> float:
+     """Look up a diagnosis probability with fuzzy name matching."""
+     target_lower = target.lower().strip()
+     for name, prob in dist.items():
+         if target_lower in name.lower() or name.lower() in target_lower:
+             return prob
+     return 0.0
+
+
+ def _sequence_entropy(counter: Counter, total: int) -> float:
+     """Shannon entropy of sequence distribution (diversity measure)."""
+     if total == 0:
+         return 0.0
+     entropy = 0.0
+     for count in counter.values():
+         p = count / total
+         if p > 0:
+             entropy -= p * np.log2(p)
+     return float(entropy)
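The `_sequence_entropy` helper is a plain Shannon entropy over the empirical distribution of acquisition sequences. A self-contained sketch (using `math.log2` in place of `np.log2`; the `Counter` contents are made up):

```python
# Shannon entropy (in bits) of a distribution of acquisition sequences.
import math
from collections import Counter

def sequence_entropy(counter, total):
    if total == 0:
        return 0.0
    return -sum(
        (c / total) * math.log2(c / total)
        for c in counter.values()
        if c > 0
    )

# Two orderings, each seen twice: a uniform two-way distribution.
seqs = Counter({("history", "labs"): 2, ("labs", "history"): 2})
print(sequence_entropy(seqs, 4))  # 1.0 -> two equally likely orders
```

Entropy 0 means the agent always acquires in the same order; higher values mean its ordering adapts (or is inconsistent) across cases.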
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ gradio>=6.0.0
+ numpy
+ Pillow
+ scipy
+ openai
+ anthropic
+ together
+ python-dotenv
tools.py ADDED
@@ -0,0 +1,185 @@
+ """
+ Tool definitions for the ActiveMedAgent tool-use architecture.
+
+ Instead of parsing free-form text with regex, the agent makes structured
+ tool calls through the VLM's native function-calling interface. This:
+ 1. Eliminates brittle parsing heuristics
+ 2. Makes the agent a genuine tool-using system (not text completion + post-hoc extraction)
+ 3. Provides formally verifiable action traces
+ 4. Enables grounded information-theoretic analysis via structured probability reports
+ """
+ from __future__ import annotations
+
+ from dataclasses import dataclass, field
+ from typing import Any
+
+
+ # ============================================================
+ # Tool Call Data Structures
+ # ============================================================
+
+ @dataclass
+ class ToolCall:
+     """A single tool call extracted from a VLM response."""
+     tool_name: str
+     arguments: dict[str, Any]
+     call_id: str = ""
+
+
+ @dataclass
+ class ToolResult:
+     """Result returned to the VLM after executing a tool."""
+     call_id: str
+     content: str
+     images: list[str] | None = None  # base64-encoded images
+
+
+ # ============================================================
+ # Tool Definitions (canonical format — translated per backend)
+ # ============================================================
+
+ AGENT_TOOLS = [
+     {
+         "name": "request_information",
+         "description": (
+             "Request one additional information channel to reduce diagnostic uncertainty "
+             "while avoiding unnecessary resource use. Call this when you need more data "
+             "to distinguish between competing diagnoses and the expected benefit justifies "
+             "the channel's cost. "
+             "You must specify which channel to acquire and why it would resolve your "
+             "current uncertainty."
+         ),
+         "parameters": {
+             "type": "object",
+             "properties": {
+                 "channel_name": {
+                     "type": "string",
+                     "description": "Exact name of the channel to request (from the available list)",
+                 },
+                 "reasoning": {
+                     "type": "string",
+                     "description": "Why this channel best resolves your current diagnostic uncertainty",
+                 },
+                 "current_differential": {
+                     "type": "array",
+                     "description": "Your current ranked differential diagnosis with calibrated probabilities (must sum to 1.0)",
+                     "items": {
+                         "type": "object",
+                         "properties": {
+                             "name": {"type": "string", "description": "Diagnosis name"},
+                             "probability": {
+                                 "type": "number",
+                                 "description": "Posterior probability (0-1), all must sum to 1.0",
+                             },
+                         },
+                         "required": ["name", "probability"],
+                     },
+                 },
+                 "expected_impact": {
+                     "type": "object",
+                     "description": "What you expect this information to reveal",
+                     "properties": {
+                         "if_positive": {
+                             "type": "string",
+                             "description": "Which diagnosis becomes most likely if this channel shows positive/abnormal findings",
+                         },
+                         "if_negative": {
+                             "type": "string",
+                             "description": "Which diagnosis becomes most likely if this channel shows negative/normal findings",
+                         },
+                     },
+                     "required": ["if_positive", "if_negative"],
+                 },
+             },
+             "required": ["channel_name", "reasoning", "current_differential", "expected_impact"],
+         },
+     },
+     {
+         "name": "commit_diagnosis",
+         "description": (
+             "Commit to a final ranked diagnosis. Call this ONLY when you have exhausted "
+             "the clinically useful information OR when your top diagnosis has probability "
+             ">= 0.85 and is well-separated from alternatives. Prefer committing when "
+             "remaining channels are unlikely to change management enough to justify cost."
+         ),
+         "parameters": {
+             "type": "object",
+             "properties": {
+                 "ranked_diagnoses": {
+                     "type": "array",
+                     "description": "Final ranked list of all candidate diagnoses with calibrated probabilities summing to 1.0",
+                     "items": {
+                         "type": "object",
+                         "properties": {
+                             "name": {"type": "string"},
+                             "confidence": {
+                                 "type": "number",
+                                 "description": "Posterior probability (0-1)",
+                             },
+                             "key_evidence": {
+                                 "type": "string",
+                                 "description": "Most important evidence supporting or refuting this diagnosis",
+                             },
+                         },
+                         "required": ["name", "confidence", "key_evidence"],
+                     },
+                 },
+                 "reasoning": {
+                     "type": "string",
+                     "description": "Final diagnostic reasoning chain",
+                 },
+             },
+             "required": ["ranked_diagnoses", "reasoning"],
+         },
+     },
+ ]
+
+
+ # ============================================================
+ # Schema Translation
+ # ============================================================
+
+ def to_openai_tools(tools: list[dict] | None = None) -> list[dict]:
+     """Convert canonical tool definitions to OpenAI function-calling format."""
+     if tools is None:
+         tools = AGENT_TOOLS
+     return [
+         {
+             "type": "function",
+             "function": {
+                 "name": t["name"],
+                 "description": t["description"],
+                 "parameters": t["parameters"],
+             },
+         }
+         for t in tools
+     ]
+
+
+ def to_anthropic_tools(tools: list[dict] | None = None) -> list[dict]:
+     """Convert canonical tool definitions to Anthropic tool-use format."""
+     if tools is None:
+         tools = AGENT_TOOLS
+     return [
+         {
+             "name": t["name"],
+             "description": t["description"],
+             "input_schema": t["parameters"],
+         }
+         for t in tools
+     ]
+
+
+ def constrain_tools_for_step(budget_remaining: int, allow_commit: bool = True) -> list[dict]:
+     """
+     Return the appropriate tool subset for the current step.
+
+     - If budget > 0 and channels available: both request_information and commit_diagnosis
+     - If budget == 0 or forced final: only commit_diagnosis
+     """
+     if budget_remaining <= 0:
+         return [t for t in AGENT_TOOLS if t["name"] == "commit_diagnosis"]
+     tools = list(AGENT_TOOLS)
+     if not allow_commit:
+         tools = [t for t in tools if t["name"] != "commit_diagnosis"]
+     return tools
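The two translators differ only in envelope shape: OpenAI wraps the schema under `function.parameters`, while Anthropic puts it at the top level as `input_schema`. A sketch with a trimmed-down canonical tool (the schema here is abbreviated, not the full definition above):

```python
# Sketch of the two wire formats produced from one canonical tool.
canonical = {
    "name": "request_information",
    "description": "Request one additional information channel.",
    "parameters": {"type": "object", "properties": {}, "required": []},
}

# OpenAI function-calling envelope: schema nested under "function".
openai_tool = {
    "type": "function",
    "function": {
        "name": canonical["name"],
        "description": canonical["description"],
        "parameters": canonical["parameters"],
    },
}

# Anthropic tool-use envelope: same schema, renamed to "input_schema".
anthropic_tool = {
    "name": canonical["name"],
    "description": canonical["description"],
    "input_schema": canonical["parameters"],
}

print(openai_tool["function"]["name"], anthropic_tool["name"])
# request_information request_information
```

Keeping one canonical definition and translating at the edge means the agent loop never branches on backend.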
trajectory.py ADDED
@@ -0,0 +1,338 @@
+ """
+ Trajectory Collection for ActiveMedAgent.
+
+ Phase 1 of the training pipeline:
+ 1. Run zero-shot agent on all cases
+ 2. Record full (state, action, reward) trajectories
+ 3. Compute per-step rewards: did the acquisition improve the diagnosis?
+ 4. Save trajectory dataset for Phase 2 policy learning
+
+ Each trajectory step records:
+ - state: current uncertainty, differential, acquired channels so far
+ - action: which channel was requested
+ - reward: MRR improvement after receiving the requested info
+ - outcome: final diagnosis correctness
+ """
+ import json
+ import logging
+ import random
+ from dataclasses import dataclass, field, asdict
+ from pathlib import Path
+
+ import numpy as np
+ from tqdm import tqdm
+
+ import config
+ from api_client import BaseVLMClient, create_client
+ from agent import ActiveMedAgent, AgentResult
+ from datasets.base import MedicalCase
+ from evaluation import compute_reciprocal_rank
+
+ logger = logging.getLogger(__name__)
+
+
+ @dataclass
+ class TrajectoryStep:
+     """A single step in an acquisition trajectory."""
+     step_idx: int
+     # State representation
+     acquired_so_far: list[str]
+     available_channels: list[str]
+     uncertainty_text: str
+     differential_before: list[dict]  # Ranking before this acquisition
+     mrr_before: float
+
+     # Action
+     action: str  # Channel name requested (or "COMMIT")
+
+     # Outcome (computed after the action)
+     differential_after: list[dict]  # Ranking after receiving the info
+     mrr_after: float
+     reward: float  # MRR improvement: mrr_after - mrr_before
+     acquisition_cost: float = 0.0
+     normalized_cost: float = 0.0
+     utility_reward: float = 0.0  # Cost-aware reward used for policy learning
+     diagnosis_changed: bool = False  # Did top-1 change?
+     diagnosis_improved: bool = False  # Did it change to the correct answer?
+
+
+ @dataclass
+ class Trajectory:
+     """Complete trajectory for one case."""
+     case_id: str
+     dataset: str
+     ground_truth: str
+     candidates: list[str]
+     steps: list[TrajectoryStep] = field(default_factory=list)
+     passive_mrr: float = 0.0
+     oracle_mrr: float = 0.0
+     final_mrr: float = 0.0
+     total_reward: float = 0.0
+     total_utility_reward: float = 0.0
+     success: bool = False  # Did the agent get top-1 correct?
+
+
+ class TrajectoryCollector:
+     """
+     Collect acquisition trajectories with per-step rewards.
+
+     Unlike the basic agent.diagnose(), this collector runs the agent
+     step-by-step, evaluating the diagnosis after EACH acquisition
+     to compute fine-grained reward signals.
+
+     Uses the tool-use agent architecture: runs the full agent for
+     acquisition decisions, then evaluates intermediate states via
+     the agent's get_diagnosis_at_state() helper.
+     """
+
+     def __init__(
+         self,
+         client: BaseVLMClient,
+         prompt_variant: str = "A",
+         budget: int = 3,
+     ):
+         self.client = client
+         self.prompt_variant = prompt_variant
+         self.budget = budget
+
+     def collect_trajectory(self, case: MedicalCase) -> Trajectory:
+         """
+         Collect a full trajectory with per-step rewards for one case.
+
+         Strategy:
+         1. Get passive baseline (image-only diagnosis)
+         2. Get oracle ceiling (all-info diagnosis)
+         3. Run the active agent and record its decisions
+         4. For each acquisition step, evaluate the intermediate
+            diagnosis to compute per-step MRR reward
+         """
+         traj = Trajectory(
+             case_id=case.case_id,
+             dataset=case.dataset,
+             ground_truth=case.ground_truth,
+             candidates=case.candidates,
+         )
+
+         # ---- Evaluation agent (budget=0, just for scoring) ----
+         eval_agent = ActiveMedAgent(
+             self.client, self.prompt_variant, budget=0
+         )
+
+         # ---- Get passive baseline (MRR with no acquisition) ----
+         passive_result = eval_agent.diagnose_passive(case)
+         passive_mrr = compute_reciprocal_rank(
+             passive_result.final_ranking, case.ground_truth, case.candidates
+         )
+         traj.passive_mrr = passive_mrr
+
+         # ---- Get oracle ceiling (MRR with all info) ----
+         oracle_result = eval_agent.diagnose_oracle(case)
+         oracle_mrr = compute_reciprocal_rank(
+             oracle_result.final_ranking, case.ground_truth, case.candidates
+         )
+         traj.oracle_mrr = oracle_mrr
+
+         # ---- Run the active agent to get its acquisition decisions ----
+         active_agent = ActiveMedAgent(
+             self.client, self.prompt_variant, budget=self.budget
+         )
+         active_result = active_agent.diagnose(case)
+
+         # ---- Evaluate each intermediate state ----
+         current_mrr = passive_mrr
+         current_ranking = passive_result.final_ranking
+         acquired_so_far = []
+
+         for step_idx, step in enumerate(active_result.steps):
+             if step.committed:
+                 # Agent committed early — record and stop
+                 traj_step = TrajectoryStep(
+                     step_idx=step_idx,
+                     acquired_so_far=list(acquired_so_far),
+                     available_channels=[
+                         n for n in case.requestable_names
+                         if n not in acquired_so_far
+                     ],
+                     uncertainty_text=step.reasoning or "",
+                     differential_before=current_ranking,
+                     mrr_before=current_mrr,
+                     action="COMMIT",
+                     differential_after=current_ranking,
+                     mrr_after=current_mrr,
+                     reward=0.0,
+                     acquisition_cost=0.0,
+                     normalized_cost=0.0,
+                     utility_reward=0.0,
+                     diagnosis_changed=False,
+                     diagnosis_improved=False,
+                 )
+                 traj.steps.append(traj_step)
+                 break
+
+             channel = step.requested_channel
+             if not channel:
+                 continue
+
+             available = [
+                 n for n in case.requestable_names
+                 if n not in acquired_so_far
+             ]
+
+             # Record the state BEFORE this acquisition
+             before_ranking = current_ranking
+             before_mrr = current_mrr
+
+             # Execute the acquisition
+             acquired_so_far.append(channel)
+
+             # Evaluate the diagnosis AFTER this acquisition
+             after_ranking, _ = eval_agent.get_diagnosis_at_state(
+                 case, list(acquired_so_far)
+             )
+             after_mrr = compute_reciprocal_rank(
+                 after_ranking, case.ground_truth, case.candidates
+             )
+
+             # Compute reward
+             reward = after_mrr - before_mrr
+             channel_cost = case.get_channel_cost(channel)
+             max_requestable_cost = max(case.get_max_requestable_cost(), 1.0)
+             normalized_cost = channel_cost / max_requestable_cost
+             utility_reward = reward - (
+                 config.COST_PENALTY_LAMBDA * normalized_cost
+             )
+
+             # Did diagnosis change?
+             top1_before = before_ranking[0]["name"] if before_ranking else ""
+             top1_after = after_ranking[0]["name"] if after_ranking else ""
+             diagnosis_changed = top1_before.lower() != top1_after.lower()
+
+             gt_lower = case.ground_truth.lower()
+             diagnosis_improved = diagnosis_changed and (
+                 gt_lower in top1_after.lower()
+                 or top1_after.lower() in gt_lower
+             )
+
+             traj_step = TrajectoryStep(
+                 step_idx=step_idx,
+                 acquired_so_far=list(acquired_so_far[:-1]),
+                 available_channels=available,
+                 uncertainty_text=step.reasoning or "",
+                 differential_before=before_ranking,
+                 mrr_before=before_mrr,
+                 action=channel,
+                 differential_after=after_ranking,
+                 mrr_after=after_mrr,
+                 reward=reward,
+                 acquisition_cost=channel_cost,
+                 normalized_cost=normalized_cost,
+                 utility_reward=utility_reward,
+                 diagnosis_changed=diagnosis_changed,
+                 diagnosis_improved=diagnosis_improved,
+             )
+             traj.steps.append(traj_step)
+
+             # Update state for next step
+             current_mrr = after_mrr
+             current_ranking = after_ranking
+
+         # ---- Finalize trajectory ----
+         traj.final_mrr = current_mrr
+         traj.total_reward = sum(s.reward for s in traj.steps)
+         traj.total_utility_reward = sum(s.utility_reward for s in traj.steps)
+         traj.success = (current_mrr == 1.0)
+
+         return traj
+
+
+     def collect_dataset(
+         self,
+         cases: list[MedicalCase],
+         max_cases: int | None = None,
+         save_path: Path | None = None,
+     ) -> list[Trajectory]:
+         """Collect trajectories for all cases."""
+         if max_cases:
+             cases = cases[:max_cases]
+
+         trajectories = []
+         for case in tqdm(cases, desc="Collecting trajectories", ncols=80):
+             try:
+                 traj = self.collect_trajectory(case)
+                 trajectories.append(traj)
+             except Exception as e:
+                 logger.error(f"Failed on {case.case_id}: {e}")
+                 continue
+
+         # Save
+         if save_path:
+             save_path = Path(save_path)
+             save_path.parent.mkdir(parents=True, exist_ok=True)
+             with open(save_path, "w") as f:
+                 json.dump(
+                     [asdict(t) for t in trajectories],
+                     f, indent=2, default=str,
+                 )
+             logger.info(f"Saved {len(trajectories)} trajectories to {save_path}")
+
+         # Report statistics
+         self._report_stats(trajectories)
+
+         return trajectories
+
+
+     def _report_stats(self, trajectories: list[Trajectory]):
+         """Log summary statistics of collected trajectories."""
+         n = len(trajectories)
+         if n == 0:
+             return
+
+         logger.info(f"\n{'=' * 50}")
+         logger.info(f"Trajectory Collection Summary (n={n})")
+         logger.info(f"{'=' * 50}")
+
+         success_rate = np.mean([t.success for t in trajectories])
+         avg_steps = np.mean([len(t.steps) for t in trajectories])
+         avg_reward = np.mean([t.total_reward for t in trajectories])
+         avg_utility = np.mean([t.total_utility_reward for t in trajectories])
+         avg_passive_mrr = np.mean([t.passive_mrr for t in trajectories])
+         avg_final_mrr = np.mean([t.final_mrr for t in trajectories])
+         avg_oracle_mrr = np.mean([t.oracle_mrr for t in trajectories])
+
+         logger.info(f"  Success rate: {success_rate:.3f}")
+         logger.info(f"  Avg steps taken: {avg_steps:.1f}")
+         logger.info(f"  Avg total reward: {avg_reward:.3f}")
+         logger.info(f"  Avg utility reward: {avg_utility:.3f}")
+         logger.info(
+             f"  MRR: passive={avg_passive_mrr:.3f} -> "
+             f"active={avg_final_mrr:.3f} -> oracle={avg_oracle_mrr:.3f}"
+         )
+
+         # Per-action reward statistics
+         all_steps = [
+             s for t in trajectories for s in t.steps
+             if s.action != "COMMIT"
+         ]
+         if all_steps:
+             action_rewards = {}
+             for s in all_steps:
+                 action_rewards.setdefault(s.action, []).append(s.utility_reward)
+
+             logger.info("\n  Per-channel utility statistics:")
+             for action, rewards in sorted(
+                 action_rewards.items(), key=lambda x: -np.mean(x[1])
+             ):
+                 logger.info(
+                     f"    {action:<25} mean_utility={np.mean(rewards):+.3f} "
+                     f"n={len(rewards)} "
+                     f"positive_rate={np.mean([r > 0 for r in rewards]):.2f}"
+                 )
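The per-step signal recorded in each `TrajectoryStep` is the MRR improvement minus a cost penalty on the normalized channel cost. A self-contained sketch (the `COST_PENALTY_LAMBDA` value of 0.5 here is illustrative only; the real value lives in `config`):

```python
# Sketch of the cost-aware per-step reward used for policy learning:
# utility = (MRR after - MRR before) - lambda * (cost / max cost).
COST_PENALTY_LAMBDA = 0.5  # illustrative; the actual knob is config.COST_PENALTY_LAMBDA

def utility_reward(mrr_before, mrr_after, channel_cost, max_cost):
    normalized = channel_cost / max(max_cost, 1.0)  # clamp denominator like the collector
    return (mrr_after - mrr_before) - COST_PENALTY_LAMBDA * normalized

# A channel that moves the true diagnosis from rank 2 (MRR 0.5) to rank 1
# (MRR 1.0) at 20% of the max cost: 0.5 gain minus 0.1 penalty.
print(utility_reward(0.5, 1.0, 2.0, 10.0))
```

An acquisition that changes nothing (MRR delta of zero) gets a strictly negative utility, which is exactly the pressure that teaches the policy to skip uninformative channels.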