VictorLJZ committed
Commit 35945d9 · Parents: 0d14c76 ab428dd

Merge pull request #16 from bowang-lab/victor
.gitignore CHANGED
@@ -175,4 +175,6 @@ temp/
 
 hf_files/
 medrax-pdfs/
-model-weights/
+model-weights/
+
+.DS_Store
benchmarking/__init__.py ADDED
@@ -0,0 +1 @@
+"""Benchmarking pipeline for MedRAX2 and other medical AI models."""
benchmarking/benchmarks/__init__.py ADDED
@@ -0,0 +1,12 @@
+"""Benchmark abstractions for medical AI evaluation."""
+
+from .base import Benchmark, BenchmarkDataPoint
+from .rexvqa_benchmark import ReXVQABenchmark
+from .chestagentbench_benchmark import ChestAgentBenchBenchmark
+
+__all__ = [
+    "Benchmark",
+    "BenchmarkDataPoint",
+    "ReXVQABenchmark",
+    "ChestAgentBenchBenchmark",
+]
benchmarking/benchmarks/base.py ADDED
@@ -0,0 +1,172 @@
+"""Base class for benchmarks."""
+
+from abc import ABC, abstractmethod
+from typing import Dict, List, Optional, Any, Iterator, Tuple
+from dataclasses import dataclass
+from pathlib import Path
+
+
+@dataclass
+class BenchmarkDataPoint:
+    """A single data point from a benchmark."""
+    id: str
+    text: str  # The question/prompt
+    images: Optional[List[str]] = None  # List of image paths
+    correct_answer: Optional[str] = None  # Ground truth answer
+    case_id: Optional[str] = None  # For grouping related questions
+    category: Optional[str] = None  # Type of question/task
+    metadata: Optional[Dict[str, Any]] = None  # Additional metadata
+
+
+class Benchmark(ABC):
+    """Abstract base class for benchmarks.
+
+    This class defines the interface for all benchmarks, standardizing
+    how data is loaded and accessed across different benchmark datasets.
+    """
+
+    def __init__(self, data_dir: str, **kwargs):
+        """Initialize the benchmark.
+
+        Args:
+            data_dir (str): Directory containing benchmark data
+            **kwargs: Additional configuration parameters
+        """
+        self.data_dir = Path(data_dir)
+        self.config = kwargs
+        self.data_points = []
+        self._load_data()
+
+    @abstractmethod
+    def _load_data(self) -> None:
+        """Load benchmark data from the data directory."""
+        pass
+
+    def get_data_point(self, index: int) -> BenchmarkDataPoint:
+        """Get a specific data point by index.
+
+        Args:
+            index (int): Index of the data point to retrieve
+
+        Returns:
+            BenchmarkDataPoint: The data point at the given index
+        """
+        if index < 0 or index >= len(self.data_points):
+            raise IndexError(f"Index {index} out of range for {len(self.data_points)} data points")
+
+        return self.data_points[index]
+
+    def get_subset(self, indices: List[int]) -> List[BenchmarkDataPoint]:
+        """Get a subset of data points by indices.
+
+        Args:
+            indices (List[int]): List of indices to retrieve
+
+        Returns:
+            List[BenchmarkDataPoint]: List of data points at the given indices
+        """
+        return [self.get_data_point(i) for i in indices]
+
+    def get_by_category(self, category: str) -> List[BenchmarkDataPoint]:
+        """Get all data points of a specific category.
+
+        Args:
+            category (str): Category to filter by
+
+        Returns:
+            List[BenchmarkDataPoint]: List of data points in the category
+        """
+        return [dp for dp in self if dp.category == category]
+
+    def get_by_case_id(self, case_id: str) -> List[BenchmarkDataPoint]:
+        """Get all data points for a specific case.
+
+        Args:
+            case_id (str): Case ID to filter by
+
+        Returns:
+            List[BenchmarkDataPoint]: List of data points for the case
+        """
+        return [dp for dp in self if dp.case_id == case_id]
+
+    def __str__(self) -> str:
+        """String representation of the benchmark."""
+        return f"{self.__class__.__name__}(data_dir={self.data_dir}, size={len(self)})"
+
+    def __len__(self) -> int:
+        """Return the number of data points in the benchmark."""
+        return len(self.data_points)
+
+    def __iter__(self) -> Iterator[BenchmarkDataPoint]:
+        """Iterate over all data points in the benchmark."""
+        for i in range(len(self)):
+            yield self.get_data_point(i)
+
+    def get_categories(self) -> List[str]:
+        """Get all unique categories in the benchmark.
+
+        Returns:
+            List[str]: List of unique categories
+        """
+        categories = set()
+        for dp in self:
+            if dp.category:
+                categories.add(dp.category)
+        return sorted(categories)
+
+    def get_case_ids(self) -> List[str]:
+        """Get all unique case IDs in the benchmark.
+
+        Returns:
+            List[str]: List of unique case IDs
+        """
+        case_ids = set()
+        for dp in self:
+            if dp.case_id:
+                case_ids.add(dp.case_id)
+        return sorted(case_ids)
+
+    def get_stats(self) -> Dict[str, Any]:
+        """Get statistics about the benchmark.
+
+        Returns:
+            Dict[str, Any]: Dictionary containing benchmark statistics
+        """
+        stats = {
+            "total_questions": len(self),
+            "total_cases": len(self.get_case_ids()),
+            "categories": self.get_categories(),
+            "category_counts": {},
+            "has_images": False,
+            "num_images": 0,
+        }
+
+        for dp in self:
+            # Category counts
+            if dp.category:
+                stats["category_counts"][dp.category] = stats["category_counts"].get(dp.category, 0) + 1
+
+            # Image statistics
+            if dp.images:
+                stats["has_images"] = True
+                stats["num_images"] += len(dp.images)
+        return stats
+
+    def validate_images(self) -> Tuple[List[str], List[str]]:
+        """Validate that all image paths exist.
+
+        Returns:
+            Tuple[List[str], List[str]]: Tuple of (valid_image_paths, invalid_image_paths)
+        """
+        valid_images = []
+        invalid_images = []
+
+        for dp in self:
+            if dp.images:
+                for image_path in dp.images:
+                    if Path(image_path).exists():
+                        valid_images.append(image_path)
+                    else:
+                        invalid_images.append(image_path)
+
+        return valid_images, invalid_images
benchmarking/benchmarks/chestagentbench_benchmark.py ADDED
@@ -0,0 +1,72 @@
+import json
+from pathlib import Path
+from typing import Dict, Optional, Any
+from .base import Benchmark, BenchmarkDataPoint
+
+class ChestAgentBenchBenchmark(Benchmark):
+    """ChestAgentBench benchmark for complex CXR interpretation and reasoning.
+
+    Loads the dataset from a local metadata.jsonl file and parses each entry into a BenchmarkDataPoint.
+    """
+    def __init__(self, data_dir: str, **kwargs):
+        self.max_questions = kwargs.get("max_questions", None)
+        super().__init__(data_dir, **kwargs)
+
+    def _load_data(self) -> None:
+        metadata_path = Path(self.data_dir) / "metadata.jsonl"
+        if not metadata_path.exists():
+            raise FileNotFoundError(f"Could not find metadata.jsonl in {self.data_dir}")
+        print(f"Loading ChestAgentBench from local file: {metadata_path}")
+        self.data_points = []
+        with open(metadata_path, "r", encoding="utf-8") as f:
+            for i, line in enumerate(f):
+                if self.max_questions and i >= self.max_questions:
+                    break
+                try:
+                    item = json.loads(line)
+                    data_point = self._parse_item(item, i)
+                    if data_point:
+                        self.data_points.append(data_point)
+                except Exception as e:
+                    print(f"Error loading item {i}: {e}")
+                    continue
+
+    def _parse_item(self, item: Dict[str, Any], index: int) -> Optional[BenchmarkDataPoint]:
+        # Use full_question_id or question_id if available, else fall back to the index
+        question_id = item.get("full_question_id") or item.get("question_id") or f"chestagentbench_{index}"
+        question = item.get("question", "")
+        correct_answer = item.get("answer", "")
+        explanation = item.get("explanation", "")
+        images = item.get("images", [])
+        case_id = item.get("case_id", "")
+        category = item.get("categories", "")
+        # Compose the question text (options are embedded in the question string)
+        question_with_options = question
+        # Map image paths to the local figures directory
+        local_images = None
+        if images:
+            figures_dir = Path(self.data_dir) / "figures"
+            local_images = []
+            for img in images:
+                # Handle relative paths like "figures/11583/figure_1.jpg"
+                if img.startswith("figures/"):
+                    # Remove the "figures/" prefix and construct the full path
+                    relative_path = img[8:]
+                    full_path = figures_dir / relative_path
+                    local_images.append(str(full_path))
+                else:
+                    # Fall back to the original logic
+                    local_images.append(str(figures_dir / Path(img).name))
+        # Metadata
+        metadata = dict(item)
+        metadata["explanation"] = explanation
+        metadata["dataset"] = "chestagentbench"
+        return BenchmarkDataPoint(
+            id=question_id,
+            text=question_with_options,
+            images=local_images,
+            correct_answer=correct_answer,
+            metadata=metadata,
+            case_id=case_id,
+            category=category,
+        )
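For orientation, a metadata.jsonl entry consistent with the fields this parser reads might look like the following (values are illustrative, not taken from the dataset):

    {"full_question_id": "11583_detection_1", "question": "Which finding is present? A) ... B) ...",
     "answer": "B", "explanation": "...", "images": ["figures/11583/figure_1.jpg"],
     "case_id": "11583", "categories": "detection"}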
benchmarking/benchmarks/rexvqa_benchmark.py ADDED
@@ -0,0 +1,203 @@
+"""ReXVQA benchmark implementation."""
+
+import json
+import os
+from typing import Dict, List, Optional, Any
+from datasets import load_dataset
+from .base import Benchmark, BenchmarkDataPoint
+from pathlib import Path
+
+
+class ReXVQABenchmark(Benchmark):
+    """ReXVQA benchmark for chest radiology visual question answering.
+
+    ReXVQA is a large-scale VQA dataset for chest radiology comprising approximately
+    696,000 questions paired with 160,000 chest X-rays. It tests 5 core radiological
+    reasoning skills: presence assessment, location analysis, negation detection,
+    differential diagnosis, and geometric reasoning.
+
+    The dataset consists of two separate HuggingFace datasets:
+    - ReXVQA: Contains questions, answers, and metadata
+    - ReXGradient-160K: Contains metadata only (images are in separate part files)
+
+    Paper: https://arxiv.org/abs/2506.04353
+    Dataset: https://huggingface.co/datasets/rajpurkarlab/ReXVQA
+    Images: https://huggingface.co/datasets/rajpurkarlab/ReXGradient-160K
+    """
+
+    def __init__(self, data_dir: str, **kwargs):
+        """Initialize the ReXVQA benchmark.
+
+        Args:
+            data_dir (str): Directory containing test_vqa_data.json
+            **kwargs: Additional configuration parameters
+                split (str): Dataset split to use (default: 'test')
+                cache_dir (str): Directory for caching HuggingFace datasets
+                trust_remote_code (bool): Whether to trust remote code (default: False)
+                max_questions (int): Maximum number of questions to load (default: None, load all)
+                images_dir (str): Directory containing extracted PNG images
+        """
+        self.split = kwargs.get("split", "test")
+        self.cache_dir = kwargs.get("cache_dir", None)
+        self.trust_remote_code = kwargs.get("trust_remote_code", False)
+        self.max_questions = kwargs.get("max_questions", None)
+        self.images_dir = kwargs.get("images_dir", "benchmarking/data/rexvqa/images/deid_png")
+        self.image_dataset = None
+        self.image_mapping = {}  # Maps study_id to image data
+
+        super().__init__(data_dir, **kwargs)
+
+    def _load_data(self) -> None:
+        """Load ReXVQA data from a local JSON file."""
+        try:
+            # Construct the path to the JSON file inside data_dir
+            json_file_path = os.path.join(str(self.data_dir), "test_vqa_data.json")
+
+            # Check that the file exists
+            if not os.path.exists(json_file_path):
+                raise FileNotFoundError(f"Could not find test_vqa_data.json in the expected location: {json_file_path}")
+
+            print(f"Loading ReXVQA {self.split} split from local JSON file: {json_file_path}")
+
+            # Load the JSON file directly
+            with open(json_file_path, 'r', encoding='utf-8') as f:
+                questions_data = json.load(f)
+
+            # ReXVQA format: {question_id: {question_data}, ...}
+            questions_list = []
+            for question_id, question_data in questions_data.items():
+                # Add the question_id to the question_data for reference
+                question_data['id'] = question_id
+                questions_list.append(question_data)
+
+            print(f"Loaded {len(questions_list)} questions from local JSON file")
+
+            # Load the image metadata dataset from ReXGradient-160K (metadata only)
+            print("Loading ReXGradient-160K metadata dataset...")
+            try:
+                self.image_dataset = load_dataset(
+                    "rajpurkarlab/ReXGradient-160K",
+                    split="test",
+                    cache_dir=self.cache_dir,
+                    trust_remote_code=self.trust_remote_code
+                )
+                print(f"Loaded {len(self.image_dataset)} image metadata entries from ReXGradient-160K")
+
+                # Create a mapping from study_id to image metadata
+                self._create_image_mapping()
+
+            except Exception as e:
+                print(f"Warning: Could not load ReXGradient-160K dataset: {e}")
+                print("Proceeding without images...")
+                self.load_images = False
+
+            self.data_points = []
+
+            # Process questions (limit if max_questions is specified)
+            questions_to_process = questions_list
+            if self.max_questions:
+                questions_to_process = questions_list[:min(self.max_questions, len(questions_list))]
+
+            for i, item in enumerate(questions_to_process):
+                try:
+                    data_point = self._parse_rexvqa_item(item, i)
+                    if data_point:
+                        self.data_points.append(data_point)
+
+                except Exception as e:
+                    print(f"Error loading item {i}: {e}")
+                    continue
+
+        except Exception as e:
+            raise RuntimeError(f"Failed to load ReXVQA dataset: {e}") from e
+
+    def _create_image_mapping(self) -> None:
+        """Create a mapping from study_id to image metadata."""
+        if not self.image_dataset:
+            return
+
+        print("Creating image mapping...")
+
+        for item in self.image_dataset:
+            study_instance_uid = item.get("StudyInstanceUid", "")
+            if study_instance_uid:
+                # Store the image metadata for this study using StudyInstanceUid as the key
+                if study_instance_uid not in self.image_mapping:
+                    self.image_mapping[study_instance_uid] = []
+                self.image_mapping[study_instance_uid].append(item)
+
+        print(f"Created image mapping for {len(self.image_mapping)} studies")
+
+    def _parse_rexvqa_item(self, item: Dict[str, Any], index: int) -> Optional[BenchmarkDataPoint]:
+        """Parse a ReXVQA dataset item.
+
+        Args:
+            item (Dict[str, Any]): Dataset item from the JSON file
+            index (int): Item index
+
+        Returns:
+            Optional[BenchmarkDataPoint]: Parsed data point
+        """
+        # Extract basic information
+        question_id = item.get("id", f"rexvqa_{self.split}_{index}")
+        question = item.get("question", "")
+
+        # Handle multiple-choice options
+        options = item.get("options", [])
+        if options:
+            # Append options to the question for the multiple-choice format
+            question_with_options = question + "\n\nOptions:\n" + "\n".join(options)
+        else:
+            question_with_options = question
+
+        # Get the correct answer
+        correct_answer = item.get("correct_answer", "")
+
+        if not question:
+            return None
+
+        # Handle images using the ImagePath field
+        images = None
+        if self.images_dir and "ImagePath" in item and item["ImagePath"]:
+            images = []
+            for rel_path in item["ImagePath"]:
+                # Strip any leading '.' and '/' characters (e.g. "../" prefixes)
+                norm_rel_path = rel_path.lstrip("./")
+                # Join with the parent of the images_dir root
+                full_path = str(Path(self.images_dir).parent / norm_rel_path)
+                images.append(full_path)
+
+        # Extract metadata
+        metadata = {
+            "dataset": "rexvqa",
+            "split": self.split,
+            "study_id": item.get("study_id", ""),
+            "study_instance_uid": item.get("StudyInstanceUid", ""),
+            "reasoning_type": item.get("task_name", ""),  # task_name maps to reasoning_type
+            "category": item.get("category", ""),
+            "class": item.get("class", ""),
+            "subcategory": item.get("subcategory", ""),
+            "patient_id": item.get("PatientID", ""),
+            "patient_age": item.get("PatientAge", ""),
+            "patient_sex": item.get("PatientSex", ""),
+            "study_date": item.get("StudyDate", ""),
+            "indication": item.get("Indication", ""),
+            "findings": item.get("Findings", ""),
+            "impression": item.get("Impression", ""),
+            "image_modality": item.get("ImageModality", []),
+            "image_view_position": item.get("ImageViewPosition", []),
+            "correct_answer_explanation": item.get("correct_answer_explanation", ""),
+        }
+
+        case_id = item.get("study_id", "")
+        category = item.get("task_name", "")
+
+        return BenchmarkDataPoint(
+            id=question_id,
+            text=question_with_options,
+            images=images,
+            correct_answer=correct_answer,
+            metadata=metadata,
+            case_id=case_id,
+            category=category,
+        )
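A minimal usage sketch (the data-directory layout is an assumption based on the defaults above):

    from benchmarking.benchmarks import ReXVQABenchmark

    benchmark = ReXVQABenchmark(
        data_dir="benchmarking/data/rexvqa",  # must contain test_vqa_data.json
        split="test",
        max_questions=100,
    )
    print(benchmark.get_stats())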
benchmarking/cli.py ADDED
@@ -0,0 +1,141 @@
+"""Command-line interface for the benchmarking pipeline."""
+
+import argparse
+import sys
+
+from .llm_providers import *
+from .benchmarks import *
+from .runner import BenchmarkRunner, BenchmarkRunConfig
+
+
+def create_llm_provider(model_name: str, provider_type: str, **kwargs) -> LLMProvider:
+    """Create an LLM provider based on the model name and type.
+
+    Args:
+        model_name (str): Name of the model
+        provider_type (str): Type of provider (openai, google, openrouter, medrax)
+        **kwargs: Additional configuration parameters
+
+    Returns:
+        LLMProvider: The configured LLM provider
+    """
+    provider_map = {
+        "openai": OpenAIProvider,
+        "google": GoogleProvider,
+        "openrouter": OpenRouterProvider,
+        "medrax": MedRAXProvider,
+    }
+
+    if provider_type not in provider_map:
+        raise ValueError(f"Unknown provider type: {provider_type}. Available: {list(provider_map.keys())}")
+
+    provider_class = provider_map[provider_type]
+    return provider_class(model_name, **kwargs)
+
+
+def create_benchmark(benchmark_name: str, data_dir: str, **kwargs) -> Benchmark:
+    """Create a benchmark based on the benchmark name.
+
+    Args:
+        benchmark_name (str): Name of the benchmark
+        data_dir (str): Directory containing benchmark data
+        **kwargs: Additional configuration parameters
+
+    Returns:
+        Benchmark: The configured benchmark
+    """
+    benchmark_map = {
+        "rexvqa": ReXVQABenchmark,
+        "chestagentbench": ChestAgentBenchBenchmark,
+    }
+
+    if benchmark_name not in benchmark_map:
+        raise ValueError(f"Unknown benchmark: {benchmark_name}. Available: {list(benchmark_map.keys())}")
+
+    benchmark_class = benchmark_map[benchmark_name]
+    return benchmark_class(data_dir, **kwargs)
+
+
+def run_benchmark_command(args) -> None:
+    """Run a benchmark."""
+    print(f"Running benchmark: {args.benchmark} with model: {args.model}")
+
+    # Create the LLM provider
+    provider_kwargs = {}
+
+    llm_provider = create_llm_provider(args.model, args.provider, **provider_kwargs)
+
+    # Create the benchmark
+    benchmark_kwargs = {}
+
+    benchmark = create_benchmark(args.benchmark, args.data_dir, **benchmark_kwargs)
+
+    # Create the runner config
+    config = BenchmarkRunConfig(
+        provider_name=args.provider,
+        model_name=args.model,
+        benchmark_name=args.benchmark,
+        output_dir=args.output_dir,
+        max_questions=args.max_questions,
+        temperature=args.temperature,
+        top_p=args.top_p,
+        max_tokens=args.max_tokens
+    )
+
+    # Run the benchmark
+    runner = BenchmarkRunner(config)
+    summary = runner.run_benchmark(llm_provider, benchmark)
+
+    print("\n" + "=" * 50)
+    print("BENCHMARK COMPLETED")
+    print("=" * 50)
+
+    # Check whether the benchmark run was successful
+    if "error" in summary:
+        print(f"Error: {summary['error']}")
+        return
+
+    # Print results
+    print(f"Model: {args.model}")
+    print(f"Benchmark: {args.benchmark}")
+    print(f"Total Questions: {summary['results']['total_questions']}")
+    print(f"Correct Answers: {summary['results']['correct_answers']}")
+    print(f"Overall Accuracy: {summary['results']['accuracy']:.2f}%")
+    print(f"Total Duration: {summary['results']['total_duration']:.2f}s")
+    print(f"Results saved to: {summary['results_file']}")
+
+
+def main():
+    """Main CLI entry point."""
+    parser = argparse.ArgumentParser(description="MedRAX Benchmarking Pipeline")
+    subparsers = parser.add_subparsers(dest="command", help="Available commands")
+
+    # Run benchmark command
+    run_parser = subparsers.add_parser("run", help="Run a benchmark")
+    run_parser.add_argument("--model", required=True, help="Model name (e.g., gpt-4o, gemini-2.5-pro)")
+    run_parser.add_argument("--provider", required=True, choices=["openai", "google", "openrouter", "medrax"], help="LLM provider")
+    run_parser.add_argument("--benchmark", required=True, choices=["rexvqa", "chestagentbench"], help="Benchmark to run")
+    run_parser.add_argument("--data-dir", required=True, help="Directory containing benchmark data")
+    run_parser.add_argument("--output-dir", default="benchmark_results", help="Output directory for results")
+    run_parser.add_argument("--max-questions", type=int, help="Maximum number of questions to process")
+    run_parser.add_argument("--temperature", type=float, default=0.7, help="Model temperature")
+    run_parser.add_argument("--top-p", type=float, default=0.95, help="Top-p value")
+    run_parser.add_argument("--max-tokens", type=int, default=1000, help="Maximum tokens per response")
+
+    run_parser.set_defaults(func=run_benchmark_command)
+
+    args = parser.parse_args()
+
+    if args.command is None:
+        parser.print_help()
+        return
+
+    try:
+        args.func(args)
+    except Exception as e:
+        print(f"Error: {e}")
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
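Assuming the package is importable from the repository root, a typical invocation looks like this (paths and model name are illustrative):

    python -m benchmarking.cli run \
        --model gpt-4o \
        --provider openai \
        --benchmark rexvqa \
        --data-dir benchmarking/data/rexvqa \
        --output-dir benchmark_results \
        --max-questions 50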
benchmarking/data/rexvqa/download_rexgradient_images.py ADDED
@@ -0,0 +1,174 @@
+#!/usr/bin/env python3
+"""
+Utility script to download and extract ReXGradient-160K images.
+
+This script helps users download the actual PNG images from the ReXGradient-160K dataset,
+which are stored as part files on HuggingFace and need to be concatenated and extracted.
+
+Usage:
+    python download_rexgradient_images.py --output_dir /path/to/images
+"""
+
+import argparse
+import shutil
+import subprocess
+from pathlib import Path
+from huggingface_hub import hf_hub_download, list_repo_files
+import requests
+from tqdm import tqdm
+
+
+def download_file(url, output_path, chunk_size=8192):
+    """Download a file with a progress bar."""
+    response = requests.get(url, stream=True)
+    total_size = int(response.headers.get('content-length', 0))
+
+    with open(output_path, 'wb') as f:
+        with tqdm(total=total_size, unit='B', unit_scale=True, desc=output_path.name) as pbar:
+            for chunk in response.iter_content(chunk_size=chunk_size):
+                if chunk:
+                    f.write(chunk)
+                    pbar.update(len(chunk))
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Download ReXGradient-160K images")
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        required=True,
+        help="Directory to save extracted images"
+    )
+    parser.add_argument(
+        "--repo_id",
+        type=str,
+        default="rajpurkarlab/ReXGradient-160K",
+        help="HuggingFace repository ID"
+    )
+    parser.add_argument(
+        "--skip_download",
+        action="store_true",
+        help="Skip downloading and only extract if files exist"
+    )
+
+    args = parser.parse_args()
+
+    output_dir = Path(args.output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    print(f"Output directory: {output_dir}")
+
+    # Check whether the license needs to be accepted first
+    print("Note: You may need to accept the dataset license on HuggingFace first:")
+    print(f"Visit: https://huggingface.co/datasets/{args.repo_id}")
+    print("Click 'Access repository' and accept the license agreement.")
+    print()
+
+    try:
+        # List files in the repository
+        print("Listing files in repository...")
+        files = list_repo_files(args.repo_id, repo_type='dataset')
+        part_files = [f for f in files if f.startswith("deid_png.part")]
+
+        if not part_files:
+            print("No part files found. The images might be in a different format.")
+            print("Available files:")
+            for f in files:
+                print(f"  - {f}")
+            return
+
+        print(f"Found {len(part_files)} part files:")
+        for f in part_files:
+            print(f"  - {f}")
+
+        # Download part files
+        if not args.skip_download:
+            print("\nDownloading part files...")
+            for part_file in part_files:
+                output_path = output_dir / part_file
+                if output_path.exists():
+                    print(f"Skipping {part_file} (already exists)")
+                    continue
+
+                print(f"Downloading {part_file}...")
+                try:
+                    hf_hub_download(
+                        repo_id=args.repo_id,
+                        filename=part_file,
+                        local_dir=output_dir,
+                        local_dir_use_symlinks=False,
+                        repo_type='dataset'
+                    )
+                except Exception as e:
+                    print(f"Error downloading {part_file}: {e}")
+                    print("You may need to accept the license agreement on HuggingFace.")
+                    return
+
+        # Concatenate part files
+        tar_path = output_dir / "deid_png.tar"
+        if not tar_path.exists():
+            print("\nConcatenating part files...")
+            with open(tar_path, 'wb') as tar_file:
+                for part_file in sorted(part_files):
+                    part_path = output_dir / part_file
+                    if part_path.exists():
+                        print(f"Adding {part_file}...")
+                        with open(part_path, 'rb') as f:
+                            # Stream in chunks instead of reading the whole part into memory
+                            shutil.copyfileobj(f, tar_file)
+                    else:
+                        print(f"Warning: {part_file} not found, skipping...")
+        else:
+            print(f"Tar file already exists: {tar_path}")
+
+    # Extract the tar file
+        if tar_path.exists():
+            print("\nExtracting images...")
+            images_dir = output_dir / "images"
+            images_dir.mkdir(exist_ok=True)
+
+            # Check if already extracted (images may sit in nested per-study directories)
+            if any(images_dir.rglob("*.png")):
+                print("Images already extracted.")
+            else:
+                try:
+                    subprocess.run([
+                        "tar", "-xf", str(tar_path),
+                        "-C", str(images_dir)
+                    ], check=True)
+                    print("Extraction completed!")
+                except subprocess.CalledProcessError as e:
+                    print(f"Error extracting tar file: {e}")
+                    return
+                except FileNotFoundError:
+                    print("Error: 'tar' command not found. Please install tar or extract manually.")
+                    return
+
+            # Count extracted images (search recursively)
+            png_files = list(images_dir.rglob("*.png"))
+            print(f"Extracted {len(png_files)} PNG images to {images_dir}")
+
+            # Show some example filenames
+            if png_files:
+                print("\nExample image filenames:")
+                for f in png_files[:5]:
+                    print(f"  - {f.name}")
+                if len(png_files) > 5:
+                    print(f"  ... and {len(png_files) - 5} more")
+
+            print("\nSetup complete! Use this directory as images_dir in ReXVQABenchmark:")
+            print(f"images_dir='{images_dir}'")
+
+    except Exception as e:
+        print(f"Error: {e}")
+        print("\nManual setup instructions:")
+        print("1. Visit https://huggingface.co/datasets/rajpurkarlab/ReXGradient-160K")
+        print("2. Accept the license agreement")
+        print("3. Download the deid_png.part* files")
+        print("4. Concatenate: cat deid_png.part* > deid_png.tar")
+        print("5. Extract: tar -xf deid_png.tar")
+        print("6. Use the extracted directory as images_dir")
+
+
+if __name__ == "__main__":
+    main()
benchmarking/llm_providers/__init__.py ADDED
@@ -0,0 +1,17 @@
+"""LLM provider abstractions for benchmarking."""
+
+from .base import LLMProvider, LLMRequest, LLMResponse
+from .openai_provider import OpenAIProvider
+from .google_provider import GoogleProvider
+from .medrax_provider import MedRAXProvider
+from .openrouter_provider import OpenRouterProvider
+
+__all__ = [
+    "LLMProvider",
+    "LLMRequest",
+    "LLMResponse",
+    "OpenAIProvider",
+    "GoogleProvider",
+    "MedRAXProvider",
+    "OpenRouterProvider",
+]
benchmarking/llm_providers/base.py ADDED
@@ -0,0 +1,127 @@
+"""Base class for LLM providers."""
+
+from abc import ABC, abstractmethod
+from typing import Dict, List, Optional, Any
+from dataclasses import dataclass
+import base64
+from pathlib import Path
+from medrax.utils.utils import load_prompts_from_file
+
+
+@dataclass
+class LLMRequest:
+    """Request to an LLM provider."""
+    text: str
+    images: Optional[List[str]] = None  # List of image paths
+    temperature: float = 0.7
+    top_p: float = 0.95
+    max_tokens: int = 5000
+    additional_params: Optional[Dict[str, Any]] = None
+
+
+@dataclass
+class LLMResponse:
+    """Response from an LLM provider."""
+    content: str
+    usage: Optional[Dict[str, Any]] = None
+    duration: Optional[float] = None
+    raw_response: Optional[Any] = None
+
+
+class LLMProvider(ABC):
+    """Abstract base class for LLM providers.
+
+    This class defines the interface for all LLM providers, standardizing
+    text + image input -> text output across different models and APIs.
+    """
+
+    def __init__(self, model_name: str, **kwargs):
+        """Initialize the LLM provider.
+
+        Args:
+            model_name (str): Name of the model to use
+            **kwargs: Additional configuration parameters
+        """
+        self.model_name = model_name
+        self.config = kwargs
+
+        # Always load the system prompt from file
+        try:
+            prompts = load_prompts_from_file("medrax/docs/system_prompts.txt")
+            self.system_prompt = prompts.get("CHESTAGENTBENCH_PROMPT", None)
+            if self.system_prompt is None:
+                print("Warning: System prompt not found in medrax/docs/system_prompts.txt.")
+        except Exception as e:
+            print(f"Error loading system prompt: {e}")
+            self.system_prompt = None
+
+        self._setup()
+
+    @abstractmethod
+    def _setup(self) -> None:
+        """Set up the provider (API keys, client initialization, etc.)."""
+        pass
+
+    @abstractmethod
+    def generate_response(self, request: LLMRequest) -> LLMResponse:
+        """Generate a response from the LLM.
+
+        Args:
+            request (LLMRequest): The request containing text, images, and parameters
+
+        Returns:
+            LLMResponse: The response from the LLM
+        """
+        pass
+
+    def test_connection(self) -> bool:
+        """Test the connection to the LLM provider.
+
+        Returns:
+            bool: True if the connection is successful, False otherwise
+        """
+        try:
+            # Simple test request
+            test_request = LLMRequest(
+                text="Hello",
+                temperature=0.5,
+                max_tokens=1000
+            )
+            response = self.generate_response(test_request)
+            return response.content is not None and len(response.content.strip()) > 0
+        except Exception as e:
+            print(f"Connection test failed: {e}")
+            return False
+
+    def _encode_image(self, image_path: str) -> str:
+        """Encode an image as a base64 string.
+
+        Args:
+            image_path (str): Path to the image file
+
+        Returns:
+            str: Base64-encoded image string
+        """
+        with open(image_path, "rb") as image_file:
+            return base64.b64encode(image_file.read()).decode('utf-8')
+
+    def _validate_image_paths(self, image_paths: List[str]) -> List[str]:
+        """Validate that image paths exist and are readable.
+
+        Args:
+            image_paths (List[str]): List of image paths to validate
+
+        Returns:
+            List[str]: List of valid image paths
+        """
+        valid_paths = []
+        for path in image_paths:
+            if Path(path).exists() and Path(path).is_file():
+                valid_paths.append(path)
+            else:
+                print(f"Warning: Image path does not exist: {path}")
+        return valid_paths
+
+    def __str__(self) -> str:
+        """String representation of the provider."""
+        return f"{self.__class__.__name__}(model={self.model_name})"
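As a reference point for the abstract interface, here is a minimal sketch of a stub provider (hypothetical, not part of this commit; useful for dry-running the pipeline without API keys):

    import time
    from benchmarking.llm_providers.base import LLMProvider, LLMRequest, LLMResponse

    class EchoProvider(LLMProvider):
        """Stub provider that echoes the prompt; no client setup required."""

        def _setup(self) -> None:
            pass  # nothing to initialize

        def generate_response(self, request: LLMRequest) -> LLMResponse:
            start = time.time()
            return LLMResponse(
                content=f"echo: {request.text[:80]}",
                duration=time.time() - start,
            )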
benchmarking/llm_providers/google_provider.py ADDED
@@ -0,0 +1,104 @@
+"""Google LLM provider implementation using langchain_google_genai."""
+
+import os
+import time
+from tenacity import retry, wait_exponential, stop_after_attempt
+from langchain_google_genai import ChatGoogleGenerativeAI
+from langchain_core.messages import HumanMessage, SystemMessage
+
+from .base import LLMProvider, LLMRequest, LLMResponse
+
+
+class GoogleProvider(LLMProvider):
+    """Google LLM provider for Gemini models using langchain_google_genai."""
+
+    def _setup(self) -> None:
+        """Set up the Google langchain client."""
+        api_key = os.getenv("GOOGLE_API_KEY")
+        if not api_key:
+            raise ValueError("GOOGLE_API_KEY environment variable is required")
+
+        # Create a ChatGoogleGenerativeAI instance
+        self.client = ChatGoogleGenerativeAI(
+            model=self.model_name,
+            google_api_key=api_key
+        )
+
+    @retry(wait=wait_exponential(multiplier=1, min=4, max=10), stop=stop_after_attempt(3))
+    def generate_response(self, request: LLMRequest) -> LLMResponse:
+        """Generate a response using langchain Google Gemini.
+
+        Args:
+            request (LLMRequest): The request containing text, images, and parameters
+
+        Returns:
+            LLMResponse: The response from Google Gemini
+        """
+        start_time = time.time()
+
+        # Build messages
+        messages = []
+
+        # Add the system prompt if provided
+        if self.system_prompt:
+            messages.append(SystemMessage(content=self.system_prompt))
+
+        # Construct content for multimodal input
+        if request.images:
+            # For multimodal content, use a list of typed content parts
+            content_parts = [{"type": "text", "text": request.text}]
+
+            # Add images if provided
+            valid_images = self._validate_image_paths(request.images)
+            for image_path in valid_images:
+                try:
+                    # For langchain Google, pass image data as base64
+                    image_b64 = self._encode_image(image_path)
+                    content_parts.append({
+                        "type": "image_url",
+                        "image_url": f"data:image/jpeg;base64,{image_b64}"
+                    })
+                except Exception as e:
+                    print(f"Error reading image {image_path}: {e}")
+
+            messages.append(HumanMessage(content=content_parts))
+        else:
+            # Text-only message
+            messages.append(HumanMessage(content=request.text))
+
+        # Make the API call using langchain
+        try:
+            # Update client parameters for this request
+            self.client.temperature = request.temperature
+            self.client.max_output_tokens = request.max_tokens
+            self.client.top_p = request.top_p
+
+            response = self.client.invoke(messages)
+
+            duration = time.time() - start_time
+
+            # Extract the response content
+            content = response.content if response.content else ""
+
+            # Get usage information if available
+            usage = {}
+            if hasattr(response, 'usage_metadata') and response.usage_metadata:
+                usage = {
+                    "prompt_tokens": response.usage_metadata.get("input_tokens", 0),
+                    "completion_tokens": response.usage_metadata.get("output_tokens", 0),
+                    "total_tokens": response.usage_metadata.get("total_tokens", 0)
+                }
+
+            return LLMResponse(
+                content=content,
+                usage=usage,
+                duration=duration,
+                raw_response=response
+            )
+
+        except Exception as e:
+            return LLMResponse(
+                content=f"Error: {str(e)}",
+                duration=time.time() - start_time,
+                raw_response=None
+            )
benchmarking/llm_providers/medrax_provider.py ADDED
@@ -0,0 +1,187 @@
+"""MedRAX LLM provider implementation."""
+
+import time
+import shutil
+from pathlib import Path
+
+from .base import LLMProvider, LLMRequest, LLMResponse
+
+from medrax.rag.rag import RAGConfig
+from main import initialize_agent
+
+
+class MedRAXProvider(LLMProvider):
+    """MedRAX LLM provider that uses the full MedRAX agent system."""
+
+    def __init__(self, model_name: str, **kwargs):
+        """Initialize the MedRAX provider.
+
+        Args:
+            model_name (str): Base LLM model name (e.g., "gpt-4.1-2025-04-14")
+            **kwargs: Additional configuration parameters
+        """
+        self.model_name = model_name
+        self.agent = None
+        self.tools_dict = None
+
+        super().__init__(model_name, **kwargs)
+
+    def _setup(self) -> None:
+        """Set up the MedRAX agent system."""
+        try:
+            print("Starting server...")
+
+            selected_tools = [
+                # "ImageVisualizerTool",            # For displaying images in the UI
+                # "DicomProcessorTool",             # For processing DICOM medical image files
+                # "TorchXRayVisionClassifierTool",  # For classifying chest X-rays using TorchXRayVision
+                # "ArcPlusClassifierTool",          # For advanced chest X-ray classification using ArcPlus
+                # "ChestXRaySegmentationTool",      # For segmenting anatomical regions in chest X-rays
+                # "ChestXRayReportGeneratorTool",   # For generating medical reports from X-rays
+                # "XRayVQATool",                    # For visual question answering on X-rays
+                # "LlavaMedTool",                   # For multimodal medical image understanding
+                # "XRayPhraseGroundingTool",        # For locating described features in X-rays
+                # "ChestXRayGeneratorTool",         # For generating synthetic chest X-rays
+                "WebBrowserTool",                   # For web browsing and search capabilities
+                "MedicalRAGTool",                   # For retrieval-augmented generation with medical knowledge
+                # "PythonSandboxTool",              # Python sandbox tool
+            ]
+
+            rag_config = RAGConfig(
+                model="command-a-03-2025",          # Chat model for generating responses
+                embedding_model="embed-v4.0",       # Embedding model for the RAG system
+                rerank_model="rerank-v3.5",         # Reranking model for the RAG system
+                temperature=0.3,
+                pinecone_index_name="medrax2",      # Name for the Pinecone index
+                chunk_size=1500,
+                chunk_overlap=300,
+                retriever_k=7,
+                local_docs_dir="rag_docs",          # Path of the documents for RAG
+                huggingface_datasets=["VictorLJZ/medrax2"],  # List of HuggingFace datasets to load
+                dataset_split="train",              # Which split of the datasets to use
+            )
+
+            # Prepare any additional model-specific kwargs
+            model_kwargs = {}
+
+            agent, tools_dict = initialize_agent(
+                prompt_file="medrax/docs/system_prompts.txt",
+                tools_to_use=selected_tools,
+                model_dir="/model-weights",
+                temp_dir="temp",                    # Path of the temporary directory
+                device="cpu",
+                model=self.model_name,              # e.g. gpt-4.1-2025-04-14, gemini-2.5-pro
+                temperature=0.7,
+                top_p=0.95,
+                model_kwargs=model_kwargs,
+                rag_config=rag_config,
+                debug=True,
+            )
+
+            self.agent = agent
+            self.tools_dict = tools_dict
+
+            print(f"MedRAX agent initialized with tools: {list(self.tools_dict.keys())}")
+
+        except Exception as e:
+            print(f"Error initializing MedRAX agent: {e}")
+            raise
+
+    def generate_response(self, request: LLMRequest) -> LLMResponse:
+        """Generate a response using the MedRAX agent.
+
+        Args:
+            request (LLMRequest): The request containing text, images, and parameters
+
+        Returns:
+            LLMResponse: The response from the MedRAX agent
+        """
+        start_time = time.time()
+
+        if self.agent is None:
+            return LLMResponse(
+                content="Error: MedRAX agent not initialized",
+                duration=time.time() - start_time,
+                raw_response=None
+            )
+
+        try:
+            # Build messages for the agent
+            messages = []
+            thread_id = str(int(time.time() * 1000))  # Unique thread ID
+
+            # Copy images to the session temp directory and provide paths
+            image_paths = []
+            if request.images:
+                valid_images = self._validate_image_paths(request.images)
+                print(f"Processing {len(valid_images)} images")
+                Path("temp").mkdir(parents=True, exist_ok=True)  # Ensure the temp directory exists
+                for i, image_path in enumerate(valid_images):
+                    print(f"Original image path: {image_path}")
+                    # Copy the image to the session temp directory
+                    dest_path = Path("temp") / f"image_{i}_{Path(image_path).name}"
+                    print(f"Destination path: {dest_path}")
+                    shutil.copy2(image_path, dest_path)
+                    image_paths.append(str(dest_path))
+
+                    # Verify the file exists after the copy
+                    if not dest_path.exists():
+                        print(f"ERROR: File not found after copy: {dest_path}")
+                    else:
+                        print(f"File successfully copied: {dest_path}")
+
+                    # Add an image path message for tools
+                    messages.append({
+                        "role": "user",
+                        "content": f"image_path: {dest_path}"
+                    })
+
+                    # Add image content for the multimodal LLM
+                    img_base64 = self._encode_image(image_path)
+
+                    messages.append({
+                        "role": "user",
+                        "content": [{
+                            "type": "image_url",
+                            "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"}
+                        }]
+                    })
+
+            # Add the text message
+            messages.append({
+                "role": "user",
+                "content": [{
+                    "type": "text",
+                    "text": request.text
+                }]
+            })
+
+            # Run the agent
+            response_content = ""
+            for chunk in self.agent.workflow.stream(
+                {"messages": messages},
+                {"configurable": {"thread_id": thread_id}},
+                stream_mode="updates"
+            ):
+                if isinstance(chunk, dict):
+                    for node_name, node_output in chunk.items():
+                        if "messages" in node_output:
+                            for msg in node_output["messages"]:
+                                if hasattr(msg, 'content') and msg.content:
+                                    response_content += str(msg.content)
+
+            duration = time.time() - start_time
+
+            return LLMResponse(
+                content=response_content.strip(),
+                usage={"agent_tools": list(self.tools_dict.keys())},
+                duration=duration,
+                raw_response={"thread_id": thread_id, "image_paths": image_paths}
+            )
+
+        except Exception as e:
+            return LLMResponse(
+                content=f"Error: {str(e)}",
+                duration=time.time() - start_time,
+                raw_response=None
+            )
benchmarking/llm_providers/openai_provider.py ADDED
@@ -0,0 +1,113 @@
+"""OpenAI LLM provider implementation using langchain_openai."""
+
+import os
+import time
+from tenacity import retry, wait_exponential, stop_after_attempt
+from langchain_openai import ChatOpenAI
+from langchain_core.messages import HumanMessage, SystemMessage
+
+from .base import LLMProvider, LLMRequest, LLMResponse
+
+
+class OpenAIProvider(LLMProvider):
+    """OpenAI LLM provider for GPT models using langchain_openai."""
+
+    def _setup(self) -> None:
+        """Set up the OpenAI langchain client."""
+        api_key = os.getenv("OPENAI_API_KEY")
+        if not api_key:
+            raise ValueError("OPENAI_API_KEY environment variable is required")
+
+        base_url = os.getenv("OPENAI_BASE_URL")
+
+        # Create a ChatOpenAI instance
+        kwargs = {
+            "model": self.model_name,
+            "api_key": api_key,
+        }
+
+        if base_url:
+            kwargs["base_url"] = base_url
+
+        self.client = ChatOpenAI(**kwargs)
+
+    @retry(wait=wait_exponential(multiplier=1, min=4, max=10), stop=stop_after_attempt(3))
+    def generate_response(self, request: LLMRequest) -> LLMResponse:
+        """Generate a response using langchain OpenAI.
+
+        Args:
+            request (LLMRequest): The request containing text, images, and parameters
+
+        Returns:
+            LLMResponse: The response from OpenAI
+        """
+        start_time = time.time()
+
+        # Build messages
+        messages = []
+
+        # Add the system prompt if provided
+        if self.system_prompt:
+            messages.append(SystemMessage(content=self.system_prompt))
+
+        # Build the user message content
+        user_content = []
+        user_content.append({
+            "type": "text",
+            "text": request.text
+        })
+
+        # Add images if provided
+        if request.images:
+            valid_images = self._validate_image_paths(request.images)
+            for image_path in valid_images:
+                try:
+                    image_b64 = self._encode_image(image_path)
+                    user_content.append({
+                        "type": "image_url",
+                        "image_url": {
+                            "url": f"data:image/jpeg;base64,{image_b64}",
+                            "detail": "high"
+                        }
+                    })
+                except Exception as e:
+                    print(f"Error reading image {image_path}: {e}")
+
+        messages.append(HumanMessage(content=user_content))
+
+        # Make the API call using langchain
+        try:
+            # Update client parameters for this request
+            self.client.temperature = request.temperature
+            self.client.max_tokens = request.max_tokens
+            self.client.top_p = request.top_p
+
+            response = self.client.invoke(messages)
+
+            duration = time.time() - start_time
+
+            # Extract the response content
+            content = response.content if response.content else ""
+
+            # Get usage information if available
+            usage = {}
+            if hasattr(response, 'usage_metadata') and response.usage_metadata:
+                usage = {
+                    "prompt_tokens": response.usage_metadata.get("input_tokens", 0),
+                    "completion_tokens": response.usage_metadata.get("output_tokens", 0),
+                    "total_tokens": response.usage_metadata.get("total_tokens", 0)
+                }
+
+            return LLMResponse(
+                content=content,
+                usage=usage,
+                duration=duration,
+                raw_response=response
+            )
+
+        except Exception as e:
+            return LLMResponse(
+                content=f"Error: {str(e)}",
+                duration=time.time() - start_time,
+                raw_response=None
+            )
benchmarking/llm_providers/openrouter_provider.py ADDED
@@ -0,0 +1,89 @@
+"""OpenRouter LLM provider implementation using the OpenAI SDK."""
+
+import os
+import time
+from tenacity import retry, wait_exponential, stop_after_attempt
+from openai import OpenAI
+
+from .base import LLMProvider, LLMRequest, LLMResponse
+
+
+class OpenRouterProvider(LLMProvider):
+    """LLM provider using the OpenRouter API via the OpenAI SDK."""
+
+    def _setup(self) -> None:
+        """Set up the OpenRouter client."""
+        api_key = os.getenv("OPENROUTER_API_KEY")
+        if not api_key:
+            raise ValueError("OPENROUTER_API_KEY environment variable is required for OpenRouter.")
+        base_url = os.getenv("OPENROUTER_BASE_URL", "https://openrouter.ai/api/v1")
+        # Use the OpenAI SDK with the OpenRouter endpoint
+        self.client = OpenAI(api_key=api_key, base_url=base_url)
+
+    @retry(wait=wait_exponential(multiplier=1, min=4, max=10), stop=stop_after_attempt(3))
+    def generate_response(self, request: LLMRequest) -> LLMResponse:
+        """Generate a response using an OpenRouter model via the OpenAI SDK.
+
+        Args:
+            request (LLMRequest): The request containing text, images, and parameters
+
+        Returns:
+            LLMResponse: The response from OpenRouter
+        """
+        start_time = time.time()
+
+        # Build messages
+        messages = []
+        if self.system_prompt:
+            messages.append({"role": "system", "content": self.system_prompt})
+
+        user_content = []
+        user_content.append({"type": "text", "text": request.text})
+
+        # Add images if provided
+        if request.images:
+            valid_images = self._validate_image_paths(request.images)
+            for image_path in valid_images:
+                try:
+                    image_b64 = self._encode_image(image_path)
+                    user_content.append({
+                        "type": "image_url",
+                        "image_url": {
+                            "url": f"data:image/jpeg;base64,{image_b64}",
+                            "detail": "high"
+                        }
+                    })
+                except Exception as e:
+                    print(f"Error reading image {image_path}: {e}")
+
+        messages.append({"role": "user", "content": user_content})
+
+        try:
+            response = self.client.chat.completions.create(
+                model=self.model_name,
+                messages=messages,
+                temperature=request.temperature,
+                top_p=request.top_p,
+                max_tokens=request.max_tokens,
+                **(request.additional_params or {})
+            )
+            duration = time.time() - start_time
+            content = response.choices[0].message.content if response.choices else ""
+            usage = {}
+            if hasattr(response, 'usage') and response.usage:
+                usage = {
+                    "prompt_tokens": getattr(response.usage, "prompt_tokens", 0),
+                    "completion_tokens": getattr(response.usage, "completion_tokens", 0),
+                    "total_tokens": getattr(response.usage, "total_tokens", 0)
+                }
+            return LLMResponse(
+                content=content,
+                usage=usage,
+                duration=duration,
+                raw_response=response
+            )
+        except Exception as e:
+            return LLMResponse(
+                content=f"Error: {str(e)}",
+                duration=time.time() - start_time,
+                raw_response=None
+            )
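The only configuration this provider reads is environment variables, so a run can be set up entirely from the shell (the base URL override is optional):

    export OPENROUTER_API_KEY=...                              # required
    export OPENROUTER_BASE_URL=https://openrouter.ai/api/v1   # optional, this is the default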
benchmarking/runner.py ADDED
@@ -0,0 +1,377 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Main test runner for benchmarking pipeline."""
2
+
3
+ import json
4
+ import time
5
+ import logging
6
+ from datetime import datetime
7
+ from pathlib import Path
8
+ from typing import Dict, Optional, Any
9
+ from dataclasses import dataclass
10
+ from tqdm import tqdm
11
+ import re
12
+ from .llm_providers import LLMProvider, LLMRequest, LLMResponse
13
+ from .benchmarks import Benchmark, BenchmarkDataPoint
14
+
15
+
16
+ @dataclass
17
+ class BenchmarkResult:
18
+ """Result of running a benchmark on a single data point."""
19
+ data_point_id: str
20
+ question: str
21
+ model_answer: str
22
+ correct_answer: str
23
+ is_correct: bool
24
+ duration: float
25
+ usage: Optional[Dict[str, Any]] = None
26
+ error: Optional[str] = None
27
+ metadata: Optional[Dict[str, Any]] = None
28
+
29
+
30
+ @dataclass
31
+ class BenchmarkRunConfig:
32
+ """Configuration for a benchmark run."""
33
+ provider_name: str
34
+ model_name: str
35
+ benchmark_name: str
36
+ output_dir: str
37
+ max_questions: Optional[int] = None
38
+ temperature: float = 0.7
39
+ top_p: float = 0.95
40
+ max_tokens: int = 5000
41
+ additional_params: Optional[Dict[str, Any]] = None
42
+
43
+
44
+ class BenchmarkRunner:
45
+ """Main class for running benchmarks against LLM providers."""
46
+
47
+ def __init__(self, config: BenchmarkRunConfig):
48
+ """Initialize the benchmark runner.
49
+
50
+ Args:
51
+ config (BenchmarkRunConfig): Configuration for the benchmark run
52
+ """
53
+ self.config = config
54
+ self.results = []
55
+ self.output_dir = Path(config.output_dir)
56
+ self.output_dir.mkdir(parents=True, exist_ok=True)
57
+
58
+ # Generate unique run ID
59
+ self.run_id = f"{config.benchmark_name}_{config.provider_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
60
+
61
+ # Set up logging
62
+ self._setup_logging()
63
+
64
+ self.logger.info(f"Initialized benchmark runner with ID: {self.run_id}")
65
+
66
+ def _setup_logging(self) -> None:
67
+ """Set up logging configuration."""
68
+ log_file = self.output_dir / f"benchmark_run_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
69
+
70
+ # Create logger
71
+ self.logger = logging.getLogger(f"benchmark_runner_{self.run_id}")
72
+ self.logger.setLevel(logging.INFO)
73
+
74
+ # Create handlers
75
+ file_handler = logging.FileHandler(log_file)
76
+ console_handler = logging.StreamHandler()
77
+
78
+ # Create formatter
79
+ formatter = logging.Formatter(
80
+ '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
81
+ )
82
+ file_handler.setFormatter(formatter)
83
+ console_handler.setFormatter(formatter)
84
+
85
+ # Add handlers to logger
86
+ self.logger.addHandler(file_handler)
87
+ self.logger.addHandler(console_handler)
88
+
89
+    def run_benchmark(
+        self,
+        llm_provider: LLMProvider,
+        benchmark: Benchmark,
+    ) -> Dict[str, Any]:
+        """Run a benchmark against an LLM provider.
+
+        Args:
+            llm_provider (LLMProvider): The LLM provider to test
+            benchmark (Benchmark): The benchmark to run
+
+        Returns:
+            Dict[str, Any]: Summary of benchmark results
+        """
+        self.logger.info(f"Starting benchmark run: {self.run_id}")
+        self.logger.info(f"Model: {llm_provider.model_name}")
+        self.logger.info(f"Benchmark: {type(benchmark).__name__}")
+
+        # Test provider connection
+        if not llm_provider.test_connection():
+            self.logger.error("LLM provider connection test failed")
+            return {"error": "LLM provider connection test failed"}
+
+        # Determine how many data points to process
+        total_questions = len(benchmark)
+        max_questions = self.config.max_questions or total_questions
+        end_index = min(max_questions, total_questions)
+
+        self.logger.info(f"Processing questions 0 to {end_index - 1} of {total_questions}")
+
+        # Initialize counters
+        processed = 0
+        correct = 0
+        total_duration = 0.0
+
+        # Process each data point
+        for i in tqdm(range(end_index), desc="Processing questions"):
+            try:
+                data_point = benchmark.get_data_point(i)
+
+                # Run the model on this data point
+                result = self._process_data_point(llm_provider, data_point)
+
+                # Update counters
+                processed += 1
+                if result.is_correct:
+                    correct += 1
+                total_duration += result.duration
+
+                # Add to results
+                self.results.append(result)
+
+                # Periodically checkpoint and log progress
+                if processed % 10 == 0:
+                    self._save_intermediate_results()
+                    accuracy = (correct / processed) * 100
+                    avg_duration = total_duration / processed
+
+                    self.logger.info(
+                        f"Progress: {processed}/{end_index} | "
+                        f"Accuracy: {accuracy:.2f}% | "
+                        f"Avg Duration: {avg_duration:.2f}s"
+                    )
+
+            except Exception as e:
+                self.logger.error(f"Error processing data point {i}: {e}")
+                # Record an error result so the failure is visible in the output
+                error_result = BenchmarkResult(
+                    data_point_id=f"error_{i}",
+                    question="",
+                    model_answer="",
+                    correct_answer="",
+                    is_correct=False,
+                    duration=0.0,
+                    error=str(e)
+                )
+                self.results.append(error_result)
+                continue
+
+        # Save final results
+        summary = self._save_final_results(benchmark)
+
+        self.logger.info(f"Benchmark run completed: {self.run_id}")
+        self.logger.info(f"Final accuracy: {summary['results']['accuracy']:.2f}%")
+        self.logger.info(f"Total duration: {summary['results']['total_duration']:.2f}s")
+
+        return summary
+
+    def _process_data_point(
+        self,
+        llm_provider: LLMProvider,
+        data_point: BenchmarkDataPoint,
+    ) -> BenchmarkResult:
+        """Process a single data point.
+
+        Args:
+            llm_provider (LLMProvider): The LLM provider to use
+            data_point (BenchmarkDataPoint): The data point to process
+
+        Returns:
+            BenchmarkResult: Result of processing the data point
+        """
+        start_time = time.time()
+
+        try:
+            # Create request
+            request = LLMRequest(
+                text=data_point.text,
+                images=data_point.images,
+                temperature=self.config.temperature,
+                top_p=self.config.top_p,
+                max_tokens=self.config.max_tokens,
+                additional_params=self.config.additional_params
+            )
+
+            # Get response from LLM
+            response: LLMResponse = llm_provider.generate_response(request)
+
+            # Extract answer (this may need customization based on benchmark)
+            model_answer = self._extract_answer(response.content)
+
+            # Check if correct
+            is_correct = self._is_correct_answer(model_answer, data_point.correct_answer)
+
+            duration = time.time() - start_time
+
+            return BenchmarkResult(
+                data_point_id=data_point.id,
+                question=data_point.text,
+                model_answer=model_answer,
+                correct_answer=data_point.correct_answer,
+                is_correct=is_correct,
+                duration=duration,
+                usage=response.usage,
+                metadata={
+                    "data_point_metadata": data_point.metadata,
+                    "case_id": data_point.case_id,
+                    "category": data_point.category,
+                    "raw_response": response.content,
+                }
+            )
+
+        except Exception as e:
+            duration = time.time() - start_time
+            return BenchmarkResult(
+                data_point_id=data_point.id,
+                question=data_point.text,
+                model_answer="",
+                correct_answer=data_point.correct_answer,
+                is_correct=False,
+                duration=duration,
+                error=str(e),
+                metadata={
+                    "data_point_metadata": data_point.metadata,
+                    "case_id": data_point.case_id,
+                    "category": data_point.category,
+                }
+            )
+
+    def _extract_answer(self, response_text: str) -> str:
+        """Extract the answer from the model response.
+
+        Args:
+            response_text (str): The full response text from the model
+
+        Returns:
+            str: The extracted answer
+        """
+        # Look for the '<|A|>' final-answer format; take the last occurrence,
+        # since the prompt asks for the marker at the end of the response
+        matches = re.findall(r'<\|([A-F])\|>', response_text)
+        if matches:
+            return matches[-1].upper()
+
+        # If no pattern matches, return the full response
+        return response_text.strip()
+
+    def _is_correct_answer(self, model_answer: str, correct_answer: str) -> bool:
+        """Check if the model answer is correct.
+
+        Args:
+            model_answer (str): The model's answer
+            correct_answer (str): The correct answer
+
+        Returns:
+            bool: True if the answer is correct
+        """
+        if not model_answer or not correct_answer:
+            return False
+
+        # For multiple choice, compare just the letter
+        model_clean = model_answer.strip().upper()
+        correct_clean = correct_answer.strip().upper()
+
+        # Extract just the first letter for comparison
+        model_letter = model_clean[0] if model_clean else ""
+        correct_letter = correct_clean[0] if correct_clean else ""
+
+        return model_letter == correct_letter
+
+    def _save_intermediate_results(self, results_file: Optional[Path] = None) -> None:
+        """Save results to disk.
+
+        Args:
+            results_file (Optional[Path]): Target file; defaults to this
+                run's intermediate results file.
+        """
+        if results_file is None:
+            results_file = self.output_dir / f"{self.run_id}_intermediate.json"
+
+        # Convert results to serializable format
+        results_data = []
+        for result in self.results:
+            results_data.append({
+                "data_point_id": result.data_point_id,
+                "question": result.question,
+                "model_answer": result.model_answer,
+                "correct_answer": result.correct_answer,
+                "is_correct": result.is_correct,
+                "duration": result.duration,
+                "usage": result.usage,
+                "error": result.error,
+                "metadata": result.metadata,
+            })
+
+        with open(results_file, 'w') as f:
+            json.dump(results_data, f, indent=2)
+
+    def _save_final_results(self, benchmark: Benchmark) -> Dict[str, Any]:
+        """Save final results and return summary.
+
+        Args:
+            benchmark (Benchmark): The benchmark that was run
+
+        Returns:
+            Dict[str, Any]: Summary of results
+        """
+        # Save detailed results to the final results file
+        results_file = self.output_dir / f"{self.run_id}_results.json"
+        self._save_intermediate_results(results_file)
+
+        # Calculate summary statistics
+        total_questions = len(self.results)
+        correct_answers = sum(1 for r in self.results if r.is_correct)
+        total_duration = sum(r.duration for r in self.results)
+
+        accuracy = (correct_answers / total_questions) * 100 if total_questions > 0 else 0
+
+        # Calculate per-category accuracy
+        category_stats = {}
+        for result in self.results:
+            if result.metadata and result.metadata.get("category"):
+                category = result.metadata["category"]
+                if category not in category_stats:
+                    category_stats[category] = {"correct": 0, "total": 0}
+                category_stats[category]["total"] += 1
+                if result.is_correct:
+                    category_stats[category]["correct"] += 1
+
+        # Calculate accuracy for each category
+        category_accuracies = {}
+        for category, stats in category_stats.items():
+            category_accuracies[category] = (stats["correct"] / stats["total"]) * 100
+
+        # Create summary
+        summary = {
+            "run_id": self.run_id,
+            "timestamp": datetime.now().isoformat(),
+            "config": {
+                "model_name": self.config.model_name,
+                "benchmark_name": self.config.benchmark_name,
+                "temperature": self.config.temperature,
+                "max_tokens": self.config.max_tokens,
+            },
+            "benchmark_info": {
+                "total_size": len(benchmark),
+                "processed_questions": total_questions,
+            },
+            "results": {
+                "accuracy": accuracy,
+                "correct_answers": correct_answers,
+                "total_questions": total_questions,
+                "total_duration": total_duration,
+                "avg_duration_per_question": total_duration / total_questions if total_questions > 0 else 0,
+                "category_accuracies": category_accuracies,
+            },
+            "results_file": str(results_file),
+        }
+
+        # Save summary
+        summary_file = self.output_dir / f"{self.run_id}_summary.json"
+        with open(summary_file, 'w') as f:
+            json.dump(summary, f, indent=2)
+
+        return summary
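
A minimal usage sketch for this runner (assumptions: the class is named BenchmarkRunner, it lives at benchmarking/runner.py, and the remaining BenchmarkRunConfig fields have defaults; the provider and benchmark objects come from LLMProvider and Benchmark implementations not shown here):

from benchmarking.runner import BenchmarkRunner, BenchmarkRunConfig  # assumed module path

config = BenchmarkRunConfig(
    benchmark_name="chestagentbench",
    provider_name="openai",
    model_name="gpt-4o",
    output_dir="results",
    max_questions=100,  # None processes every question in the benchmark
    temperature=0.0,
)
runner = BenchmarkRunner(config)
# provider: any LLMProvider with test_connection()/generate_response();
# benchmark: any Benchmark subclass, constructed elsewhere
summary = runner.run_benchmark(llm_provider=provider, benchmark=benchmark)
print(f"Accuracy: {summary['results']['accuracy']:.2f}%")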
main.py CHANGED
@@ -9,7 +9,6 @@ The system uses OpenAI's language models for reasoning and can be configured
 with different model weights, tools, and parameters.
 """
 
-import os
 import warnings
 from typing import Dict, List, Optional, Any
 from dotenv import load_dotenv
@@ -175,14 +174,6 @@ if __name__ == "__main__":
     # Prepare any additional model-specific kwargs
     model_kwargs = {}
 
-    # Set up API keys for the web browser tool
-    # You'll need to set these environment variables:
-    # - GOOGLE_SEARCH_API_KEY: Your Google Custom Search API key
-    # - GOOGLE_SEARCH_ENGINE_ID: Your Google Custom Search Engine ID
-    # - COHERE_API_KEY: Your Cohere API key
-    # - OPENAI_API_KEY: Your OpenAI API key
-    # - PINECONE_API_KEY: Your Pinecone API key
-
     agent, tools_dict = initialize_agent(
        prompt_file="medrax/docs/system_prompts.txt",
        tools_to_use=selected_tools,
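
With the env-var comment block dropped from main.py, the required keys now live only in the environment or a .env file picked up by load_dotenv. A quick sanity check like the following can fail fast when keys are missing (a sketch; which keys are actually required depends on the tools selected):

import os
from dotenv import load_dotenv

load_dotenv()
required = [
    "OPENAI_API_KEY",
    "GOOGLE_SEARCH_API_KEY",
    "GOOGLE_SEARCH_ENGINE_ID",
    "COHERE_API_KEY",
    "PINECONE_API_KEY",
]
missing = [key for key in required if not os.getenv(key)]
if missing:
    raise RuntimeError(f"Missing environment variables: {', '.join(missing)}")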
medrax/docs/system_prompts.txt CHANGED
@@ -1,20 +1,26 @@
 [MEDICAL_ASSISTANT]
 You are an expert medical AI assistant who can answer any medical questions and analyze medical images similar to a doctor.
 Solve using your own vision and reasoning and use tools to complement your reasoning.
-Make multiple tool calls in parallel or sequence as needed for comprehensive answers.
-Critically think about and criticize the tool outputs.
+You can make multiple tool calls in parallel or in sequence as needed for comprehensive answers.
+Think critically about and criticize the tool outputs.
 If you need to look up some information before asking a follow up question, you are allowed to do that.
 
 CITATION REQUIREMENTS:
-- When referencing information from the RAG and web search tools, ALWAYS include numbered citations [1], [2], [3], etc.
-- Use citations immediately after making claims or statements based on the above tool results
-- Be consistent with citation numbering throughout your response
-- Only cite sources that actually contain the information you're referencing
+- When referencing information from RAG and/or web search tools, ALWAYS include numbered citations [1], [2], [3], etc.
+- Use citations immediately after making claims or statements based on the above tool results.
+- Be consistent with citation numbering throughout your response.
+- Only cite sources that actually contain the information you're referencing.
 
 Examples:
 - "According to recent research [1], chest X-rays can show signs of pneumonia..."
 - "The medical literature indicates [2] that this condition typically presents with..."
 - "Based on clinical guidelines [3], the recommended treatment approach is..."
 
-[GENERAL_ASSISTANT]
-You are a helpful AI assistant. Your role is to assist users with a wide range of tasks and questions, providing accurate and useful information on various topics.
+[CHESTAGENTBENCH_PROMPT]
+You are an expert medical AI assistant who can answer any medical questions and analyze medical images similar to a doctor.
+Solve using your own vision and reasoning and use tools (if available) to complement your reasoning.
+You can make multiple tool calls in parallel or in sequence as needed for comprehensive answers.
+Think critically about and criticize the tool outputs.
+If you need to look up some information before asking a follow up question, you are allowed to do that.
+When encountering a multiple-choice question, your final response should end with "Final answer: <|A|>" from the list of possible choices A, B, C, D, E, F.
+It is extremely important that you strictly answer in the format mentioned above.
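
The "Final answer: <|A|>" convention in [CHESTAGENTBENCH_PROMPT] is what the runner's _extract_answer regex targets. A standalone sketch of the same parsing logic:

import re
from typing import Optional

def parse_final_answer(response: str) -> Optional[str]:
    """Return the letter from the last '<|X|>' marker, or None if absent."""
    matches = re.findall(r"<\|([A-F])\|>", response)
    return matches[-1] if matches else None

assert parse_final_answer("The findings suggest pneumonia. Final answer: <|C|>") == "C"
assert parse_final_answer("No marker here.") is None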
medrax/models/model_factory.py CHANGED
@@ -28,7 +28,11 @@ class ModelFactory:
             "env_key": "OPENAI_API_KEY",
             "base_url_key": "OPENAI_BASE_URL",
         },
-        "gemini": {"class": ChatGoogleGenerativeAI, "env_key": "GOOGLE_API_KEY"},
+        "gemini": {
+            "class": ChatGoogleGenerativeAI,
+            "env_key": "GOOGLE_API_KEY",
+            "base_url_key": "GOOGLE_BASE_URL",
+        },
         "openrouter": {
             "class": ChatOpenAI,  # OpenRouter uses OpenAI-compatible interface
             "env_key": "OPENROUTER_API_KEY",
@@ -36,8 +40,8 @@ class ModelFactory:
             "default_base_url": "https://openrouter.ai/api/v1",
         },
         "grok": {
-        "class": ChatXAI,
-        "env_key": "XAI_API_KEY",
+            "class": ChatXAI,
+            "env_key": "XAI_API_KEY",
         }
         # Add more providers with default configurations here
     }
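
For context, provider entries such as the new gemini one are presumably resolved along these lines (a sketch only; the dict name PROVIDERS and the lookup code are assumptions, since the factory's resolution logic is not part of this diff):

import os

entry = ModelFactory.PROVIDERS["gemini"]       # dict attribute name is an assumption
llm_class = entry["class"]                     # ChatGoogleGenerativeAI
api_key = os.environ[entry["env_key"]]         # reads GOOGLE_API_KEY
base_url = os.getenv(entry["base_url_key"])    # optional GOOGLE_BASE_URL override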
pyproject.toml CHANGED
@@ -72,6 +72,8 @@ dependencies = [
     "langchain-google-genai>=0.1.0",
     "ray>=2.9.0",
     "langchain-sandbox>=0.0.6",
+    "seaborn>=0.12.0",
+    "huggingface_hub>=0.17.0",
     "iopath>=0.1.10",
 ]
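
The two new dependencies presumably support the benchmarking pipeline: huggingface_hub for fetching benchmark assets and seaborn for plotting results. For example, category accuracies from a run's summary JSON could be visualized like this (a sketch; the summary file path is hypothetical):

import json
import seaborn as sns
import matplotlib.pyplot as plt

with open("results/run_summary.json") as f:  # hypothetical summary path
    summary = json.load(f)

acc = summary["results"]["category_accuracies"]
sns.barplot(x=list(acc.keys()), y=list(acc.values()))
plt.ylabel("Accuracy (%)")
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.savefig("category_accuracies.png")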