VictorLJZ committed
Commit 99f2cbc
1 Parent(s): eba86cf

changes so far

.DS_Store ADDED
Binary file (6.15 kB).
 
benchmarking/__init__.py ADDED
@@ -0,0 +1,3 @@
+"""Benchmarking pipeline for MedRAX and other medical AI models."""
+
+__version__ = "1.0.0"
benchmarking/benchmarks/__init__.py ADDED
@@ -0,0 +1,12 @@
+"""Benchmark abstractions for medical AI evaluation."""
+
+from .base import Benchmark, BenchmarkDataPoint
+from .chest_agent_bench import ChestAgentBench
+from .rexvqa_benchmark import ReXVQABenchmark
+
+__all__ = [
+    "Benchmark",
+    "BenchmarkDataPoint",
+    "ChestAgentBench",
+    "ReXVQABenchmark",
+]
benchmarking/benchmarks/base.py ADDED
@@ -0,0 +1,210 @@
+"""Base class for benchmarks."""
+
+from abc import ABC, abstractmethod
+from typing import Dict, List, Optional, Any, Iterator, Tuple
+from dataclasses import dataclass
+from pathlib import Path
+import json
+
+
+@dataclass
+class BenchmarkDataPoint:
+    """A single data point from a benchmark."""
+    id: str
+    text: str  # The question/prompt
+    images: Optional[List[str]] = None  # List of image paths
+    correct_answer: Optional[str] = None  # Ground truth answer
+    case_id: Optional[str] = None  # For grouping related questions
+    category: Optional[str] = None  # Type of question/task
+    metadata: Optional[Dict[str, Any]] = None  # Additional metadata
+
+
+class Benchmark(ABC):
+    """Abstract base class for benchmarks.
+
+    This class defines the interface for all benchmarks, standardizing
+    how data is loaded and accessed across different benchmark datasets.
+    """
+
+    def __init__(self, data_dir: str, **kwargs):
+        """Initialize the benchmark.
+
+        Args:
+            data_dir (str): Directory containing benchmark data
+            **kwargs: Additional configuration parameters
+        """
+        self.data_dir = Path(data_dir)
+        self.config = kwargs
+        self.data_points = []
+        self._load_data()
+
+    @abstractmethod
+    def _load_data(self) -> None:
+        """Load benchmark data from the data directory."""
+        pass
+
+    def get_data_point(self, index: int) -> BenchmarkDataPoint:
+        """Get a specific data point by index.
+
+        Args:
+            index (int): Index of the data point to retrieve
+
+        Returns:
+            BenchmarkDataPoint: The data point at the given index
+        """
+        if index < 0 or index >= len(self.data_points):
+            raise IndexError(f"Index {index} out of range for {len(self.data_points)} data points")
+
+        return self.data_points[index]
+
+    def get_subset(self, indices: List[int]) -> List[BenchmarkDataPoint]:
+        """Get a subset of data points by indices.
+
+        Args:
+            indices (List[int]): List of indices to retrieve
+
+        Returns:
+            List[BenchmarkDataPoint]: List of data points at the given indices
+        """
+        return [self.get_data_point(i) for i in indices]
+
+    def save_subset(self, indices: List[int], output_path: str) -> None:
+        """Save a subset of the benchmark to a file.
+
+        Args:
+            indices (List[int]): Indices of data points to save
+            output_path (str): Path to save the subset
+        """
+        subset = self.get_subset(indices)
+
+        # Convert to serializable format
+        subset_data = []
+        for dp in subset:
+            subset_data.append({
+                "id": dp.id,
+                "text": dp.text,
+                "images": dp.images,
+                "correct_answer": dp.correct_answer,
+                "metadata": dp.metadata,
+                "case_id": dp.case_id,
+                "category": dp.category,
+            })
+
+        with open(output_path, 'w') as f:
+            json.dump(subset_data, f, indent=2)
+
+    def get_by_category(self, category: str) -> List[BenchmarkDataPoint]:
+        """Get all data points of a specific category.
+
+        Args:
+            category (str): Category to filter by
+
+        Returns:
+            List[BenchmarkDataPoint]: List of data points in the category
+        """
+        return [dp for dp in self if dp.category == category]
+
+    def get_by_case_id(self, case_id: str) -> List[BenchmarkDataPoint]:
+        """Get all data points for a specific case.
+
+        Args:
+            case_id (str): Case ID to filter by
+
+        Returns:
+            List[BenchmarkDataPoint]: List of data points for the case
+        """
+        return [dp for dp in self if dp.case_id == case_id]
+
+    def __str__(self) -> str:
+        """String representation of the benchmark."""
+        return f"{self.__class__.__name__}(data_dir={self.data_dir}, size={len(self)})"
+
+    def __len__(self) -> int:
+        """Return the number of data points in the benchmark."""
+        return len(self.data_points)
+
+    def __iter__(self) -> Iterator[BenchmarkDataPoint]:
+        """Iterate over all data points in the benchmark."""
+        for i in range(len(self)):
+            yield self.get_data_point(i)
+
+    def get_categories(self) -> List[str]:
+        """Get all unique categories in the benchmark.
+
+        Returns:
+            List[str]: List of unique categories
+        """
+        categories = set()
+        for dp in self:
+            if dp.category:
+                categories.add(dp.category)
+        return sorted(list(categories))
+
+    def get_case_ids(self) -> List[str]:
+        """Get all unique case IDs in the benchmark.
+
+        Returns:
+            List[str]: List of unique case IDs
+        """
+        case_ids = set()
+        for dp in self:
+            if dp.case_id:
+                case_ids.add(dp.case_id)
+        return sorted(list(case_ids))
+
+    def get_stats(self) -> Dict[str, Any]:
+        """Get statistics about the benchmark.
+
+        Returns:
+            Dict[str, Any]: Dictionary containing benchmark statistics
+        """
+        stats = {
+            "total_questions": len(self),
+            "total_cases": len(self.get_case_ids()),
+            "categories": self.get_categories(),
+            "category_counts": {},
+            "images_per_question": [],
+            "has_images": 0,
+            "no_images": 0,
+        }
+
+        for dp in self:
+            # Category counts
+            if dp.category:
+                stats["category_counts"][dp.category] = stats["category_counts"].get(dp.category, 0) + 1
+
+            # Image statistics
+            if dp.images:
+                stats["images_per_question"].append(len(dp.images))
+                stats["has_images"] += 1
+            else:
+                stats["images_per_question"].append(0)
+                stats["no_images"] += 1
+
+        if stats["images_per_question"]:
+            stats["avg_images_per_question"] = sum(stats["images_per_question"]) / len(stats["images_per_question"])
+            stats["max_images_per_question"] = max(stats["images_per_question"])
+        else:
+            stats["avg_images_per_question"] = 0
+            stats["max_images_per_question"] = 0
+
+        return stats
+
+    def validate_images(self) -> Tuple[List[str], List[str]]:
+        """Validate that all image paths exist.
+
+        Returns:
+            Tuple[List[str], List[str]]: Tuple of (valid_image_paths, invalid_image_paths)
+        """
+        valid_images = []
+        invalid_images = []
+
+        for dp in self:
+            if dp.images:
+                for image_path in dp.images:
+                    if Path(image_path).exists():
+                        valid_images.append(image_path)
+                    else:
+                        invalid_images.append(image_path)
+
+        return valid_images, invalid_images
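For orientation, here is a minimal sketch of how a concrete benchmark could plug into this interface. The JSONLinesBenchmark class and the questions.jsonl layout are hypothetical illustrations, not part of this commit; only the Benchmark and BenchmarkDataPoint API above is taken from the code.

import json
from benchmarking.benchmarks.base import Benchmark, BenchmarkDataPoint

class JSONLinesBenchmark(Benchmark):
    """Hypothetical benchmark: one JSON object per line in <data_dir>/questions.jsonl."""

    def _load_data(self) -> None:
        with open(self.data_dir / "questions.jsonl") as f:
            for line in f:
                record = json.loads(line)
                self.data_points.append(
                    BenchmarkDataPoint(
                        id=record["id"],
                        text=record["question"],
                        correct_answer=record.get("answer"),
                        category=record.get("category"),
                    )
                )

bench = JSONLinesBenchmark("data/my_benchmark")  # placeholder path
print(len(bench), bench.get_categories(), bench.get_stats()["avg_images_per_question"])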
benchmarking/benchmarks/chest_agent_bench.py ADDED
@@ -0,0 +1,200 @@
+"""ChestAgentBench benchmark implementation."""
+
+import json
+import glob
+from pathlib import Path
+from typing import Dict, List, Optional, Any
+
+from .base import Benchmark, BenchmarkDataPoint
+
+
+class ChestAgentBench(Benchmark):
+    """ChestAgentBench benchmark for complex medical reasoning tasks."""
+
+    def __init__(self, data_dir: str, **kwargs):
+        """Initialize ChestAgentBench.
+
+        Args:
+            data_dir (str): Directory containing benchmark data
+            **kwargs: Additional configuration parameters
+        """
+        # Expected structure:
+        # data_dir/
+        #     eurorad_metadata.json  # Case metadata
+        #     questions/
+        #         case_id1/
+        #             case_id1_question1.json
+        #             case_id1_question2.json
+        #         case_id2/
+        #             ...
+        #     figures/
+        #         case_id1/
+        #             figure1.jpg
+        #             figure2.jpg
+        #         case_id2/
+        #             ...
+
+        self.metadata_file = kwargs.get("metadata_file", "eurorad_metadata.json")
+        self.questions_dir = kwargs.get("questions_dir", "questions")
+        self.figures_dir = kwargs.get("figures_dir", "figures")
+
+        super().__init__(data_dir, **kwargs)
+
+    def _load_data(self) -> None:
+        """Load ChestAgentBench data."""
+        # Load case metadata
+        metadata_path = self.data_dir / self.metadata_file
+        if not metadata_path.exists():
+            raise FileNotFoundError(f"Metadata file not found: {metadata_path}")
+
+        with open(metadata_path, 'r') as f:
+            case_metadata = json.load(f)
+
+        # Load questions for each case
+        questions_path = self.data_dir / self.questions_dir
+        if not questions_path.exists():
+            raise FileNotFoundError(f"Questions directory not found: {questions_path}")
+
+        figures_path = self.data_dir / self.figures_dir
+
+        self.data_points = []
+
+        for case_id, case_details in case_metadata.items():
+            # Find all question files for this case
+            case_questions_dir = questions_path / case_id
+            if not case_questions_dir.exists():
+                continue
+
+            question_files = glob.glob(str(case_questions_dir / f"{case_id}_*.json"))
+
+            for question_file in question_files:
+                try:
+                    with open(question_file, 'r') as f:
+                        question_data = json.load(f)
+
+                    question_id = Path(question_file).stem
+
+                    # Parse figure information
+                    images = []
+                    if question_data.get("figures"):
+                        required_figures = self._parse_figures(question_data["figures"])
+
+                        # Find actual image files
+                        case_figures_dir = figures_path / case_id
+                        if case_figures_dir.exists():
+                            for figure_id in required_figures:
+                                # Look for the figure file
+                                figure_files = glob.glob(str(case_figures_dir / f"{figure_id}.*"))
+                                if figure_files:
+                                    images.append(figure_files[0])  # Take the first match
+
+                    # Extract categories from metadata
+                    categories = []
+                    if question_data.get("metadata", {}).get("categories"):
+                        categories = question_data["metadata"]["categories"]
+
+                    category = categories[0] if categories else None
+
+                    # Create data point
+                    data_point = BenchmarkDataPoint(
+                        id=question_id,
+                        text=question_data["question"],
+                        images=images if images else None,
+                        correct_answer=question_data.get("answer", [None])[0],
+                        metadata={
+                            "case_details": case_details,
+                            "question_metadata": question_data.get("metadata", {}),
+                            "explanation": question_data.get("explanation", ""),
+                            "categories": categories,
+                            "figures": question_data.get("figures", []),
+                        },
+                        case_id=case_id,
+                        category=category,
+                    )
+
+                    self.data_points.append(data_point)
+
+                except Exception as e:
+                    print(f"Error loading question {question_file}: {e}")
+                    continue
+
+    def _parse_figures(self, figures_data: Any) -> List[str]:
+        """Parse figure information from question data.
+
+        Args:
+            figures_data: Figure information from question JSON
+
+        Returns:
+            List[str]: List of figure IDs
+        """
+        if isinstance(figures_data, str):
+            try:
+                # Try to parse as JSON
+                figures_list = json.loads(figures_data)
+                return figures_list if isinstance(figures_list, list) else [figures_data]
+            except json.JSONDecodeError:
+                return [figures_data]
+        elif isinstance(figures_data, list):
+            return figures_data
+        else:
+            return [str(figures_data)]
+
+    def get_data_point(self, index: int) -> BenchmarkDataPoint:
+        """Get a specific data point by index.
+
+        Args:
+            index (int): Index of the data point to retrieve
+
+        Returns:
+            BenchmarkDataPoint: The data point at the given index
+        """
+        if index < 0 or index >= len(self.data_points):
+            raise IndexError(f"Index {index} out of range for {len(self.data_points)} data points")
+
+        return self.data_points[index]
+
+    def get_multiple_choice_options(self, data_point: BenchmarkDataPoint) -> List[str]:
+        """Get multiple choice options for a data point.
+
+        Args:
+            data_point (BenchmarkDataPoint): The data point
+
+        Returns:
+            List[str]: List of multiple choice options (A, B, C, D, E, F)
+        """
+        # ChestAgentBench uses A-F multiple choice
+        return ["A", "B", "C", "D", "E", "F"]
+
+    def format_question_with_choices(self, data_point: BenchmarkDataPoint) -> str:
+        """Format question text with multiple choice options.
+
+        Args:
+            data_point (BenchmarkDataPoint): The data point
+
+        Returns:
+            str: Formatted question with choices
+        """
+        question = data_point.text
+
+        # Add instruction for multiple choice format
+        question += "\n\nPlease provide your answer as a single letter (A, B, C, D, E, or F)."
+
+        return question
+
+    def get_category_mapping(self) -> Dict[str, str]:
+        """Get mapping of category names to descriptions.
+
+        Returns:
+            Dict[str, str]: Mapping of category names to descriptions
+        """
+        return {
+            "detection": "Identify and locate specific findings in the chest X-ray",
+            "classification": "Determine whether specific findings are present or absent",
+            "enumeration": "Count the number of target findings in the chest X-ray",
+            "localization": "Locate a given finding in the chest X-ray",
+            "comparison": "Compare the size or position of a specific finding",
+            "relationship": "Determine the relationship between two or more findings",
+            "diagnosis": "Make a diagnosis or determine a treatment plan",
+            "characterization": "Describe specific attributes of findings",
+            "reasoning": "Explain the medical rationale behind findings and conclusions",
+        }
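A rough usage sketch for the loader above; the ./ChestAgentBench path is a placeholder matching the expected directory structure documented in __init__:

from benchmarking.benchmarks import ChestAgentBench

bench = ChestAgentBench("./ChestAgentBench")  # expects eurorad_metadata.json, questions/, figures/
dp = bench.get_data_point(0)
prompt = bench.format_question_with_choices(dp)  # appends the single-letter A-F instruction
print(dp.case_id, dp.category, len(dp.images or []))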
benchmarking/benchmarks/rexvqa_benchmark.py ADDED
@@ -0,0 +1,171 @@
+"""ReXVQA benchmark implementation."""
+
+from typing import Dict, List, Optional, Any
+from datasets import load_dataset
+from .base import Benchmark, BenchmarkDataPoint
+
+
+class ReXVQABenchmark(Benchmark):
+    """ReXVQA benchmark for chest radiology visual question answering.
+
+    ReXVQA is a large-scale VQA dataset for chest radiology comprising approximately
+    696,000 questions paired with 160,000 chest X-rays. It tests 5 core radiological
+    reasoning skills: presence assessment, location analysis, negation detection,
+    differential diagnosis, and geometric reasoning.
+
+    Paper: https://arxiv.org/abs/2506.04353
+    Dataset: https://huggingface.co/datasets/rajpurkarlab/ReXVQA
+    """
+
+    def __init__(self, data_dir: str, **kwargs):
+        """Initialize ReXVQA benchmark.
+
+        Args:
+            data_dir (str): Directory to store/cache downloaded data
+            **kwargs: Additional configuration parameters
+                split (str): Dataset split to use ('validation' or 'test', default: 'validation')
+                cache_dir (str): Directory for caching HuggingFace datasets
+                trust_remote_code (bool): Whether to trust remote code (default: False)
+        """
+        self.split = kwargs.get("split", "validation")
+        self.cache_dir = kwargs.get("cache_dir", None)
+        self.trust_remote_code = kwargs.get("trust_remote_code", False)
+
+        super().__init__(data_dir, **kwargs)
+
+    def _load_data(self) -> None:
+        """Load ReXVQA data from HuggingFace."""
+        try:
+            # Load dataset from HuggingFace
+            print(f"Loading ReXVQA {self.split} split from HuggingFace...")
+
+            dataset = load_dataset(
+                "rajpurkarlab/ReXVQA",
+                split=self.split,
+                cache_dir=self.cache_dir,
+                trust_remote_code=self.trust_remote_code
+            )
+
+            print(f"Loaded {len(dataset)} examples from ReXVQA {self.split} split")
+
+            self.data_points = []
+
+            for i, item in enumerate(dataset):
+                try:
+                    data_point = self._parse_rexvqa_item(item, i)
+                    if data_point:
+                        self.data_points.append(data_point)
+
+                except Exception as e:
+                    print(f"Error loading item {i}: {e}")
+                    continue
+
+        except Exception as e:
+            raise RuntimeError(f"Failed to load ReXVQA dataset: {e}")
+
+    def _parse_rexvqa_item(self, item: Dict[str, Any], index: int) -> Optional[BenchmarkDataPoint]:
+        """Parse a ReXVQA dataset item.
+
+        Args:
+            item (Dict[str, Any]): Dataset item from HuggingFace
+            index (int): Item index
+
+        Returns:
+            Optional[BenchmarkDataPoint]: Parsed data point
+        """
+        # Extract basic information
+        question_id = item.get("id", f"rexvqa_{self.split}_{index}")
+        question = item.get("question", "")
+        answer = item.get("answer", "")
+
+        if not question:
+            return None
+
+        # Handle image
+        images = None
+        if "image" in item and item["image"] is not None:
+            # Save image to local cache directory
+            image_filename = f"{question_id}.png"
+            image_path = self.data_dir / "images" / image_filename
+
+            # Create images directory if it doesn't exist
+            image_path.parent.mkdir(parents=True, exist_ok=True)
+
+            # Save image if it doesn't exist
+            if not image_path.exists():
+                try:
+                    item["image"].save(str(image_path))
+                except Exception as e:
+                    print(f"Error saving image for {question_id}: {e}")
+                    return None
+
+            images = [str(image_path)]
+
+        # Extract metadata
+        metadata = {
+            "dataset": "rexvqa",
+            "split": self.split,
+            "study_id": item.get("study_id", ""),
+            "image_id": item.get("image_id", ""),
+            "reasoning_type": item.get("reasoning_type", ""),
+            "anatomical_location": item.get("anatomical_location", ""),
+            "pathology": item.get("pathology", ""),
+        }
+
+        # Determine category from reasoning type
+        category = item.get("reasoning_type", "")
+
+        # Use study_id as case_id for grouping related questions
+        case_id = item.get("study_id", "")
+
+        return BenchmarkDataPoint(
+            id=question_id,
+            text=question,
+            images=images,
+            correct_answer=answer,
+            metadata=metadata,
+            case_id=case_id,
+            category=category,
+        )
+
+    def get_pathologies(self) -> List[str]:
+        """Get all unique pathologies in the dataset.
+
+        Returns:
+            List[str]: List of unique pathologies
+        """
+        pathologies = set()
+        for dp in self:
+            pathology = dp.metadata.get("pathology", "")
+            if pathology:
+                pathologies.add(pathology)
+        return sorted(list(pathologies))
+
+    def get_by_pathology(self, pathology: str) -> List[BenchmarkDataPoint]:
+        """Get all data points about a specific pathology.
+
+        Args:
+            pathology (str): Pathology to filter by
+
+        Returns:
+            List[BenchmarkDataPoint]: List of data points about the pathology
+        """
+        return [dp for dp in self if dp.metadata.get("pathology", "") == pathology]
+
+    def get_dataset_info(self) -> Dict[str, Any]:
+        """Get information about the ReXVQA dataset.
+
+        Returns:
+            Dict[str, Any]: Dataset information
+        """
+        return {
+            "name": "ReXVQA",
+            "description": "Large-scale Visual Question Answering Benchmark for Chest Radiology",
+            "split": self.split,
+            "size": len(self.data_points),
+            "reasoning_types": self.get_categories(),  # category mirrors reasoning_type; no separate accessor is defined
+            "pathologies": self.get_pathologies(),
+            "categories": self.get_categories(),
+            "paper": "https://arxiv.org/abs/2506.04353",
+            "dataset_url": "https://huggingface.co/datasets/rajpurkarlab/ReXVQA",
+        }
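A short sketch of pulling the validation split through this wrapper; it assumes network access to HuggingFace and whatever credentials the rajpurkarlab/ReXVQA dataset requires, and the local directory is a placeholder:

from benchmarking.benchmarks import ReXVQABenchmark

bench = ReXVQABenchmark(
    "data/rexvqa",  # local directory where decoded images are cached as PNGs
    split="validation",
)
info = bench.get_dataset_info()
print(info["size"], info["categories"])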
benchmarking/cli.py ADDED
@@ -0,0 +1,246 @@
+"""Command-line interface for the benchmarking pipeline."""
+
+import argparse
+import sys
+from pathlib import Path
+from typing import Dict, Any, Optional
+
+from .llm_providers import *
+from .benchmarks import *
+from .runner import BenchmarkRunner, BenchmarkRunConfig
+from .evaluation import BenchmarkEvaluator
+
+
+def create_llm_provider(model_name: str, provider_type: str, **kwargs) -> LLMProvider:
+    """Create an LLM provider based on the model name and type.
+
+    Args:
+        model_name (str): Name of the model
+        provider_type (str): Type of provider (openai, google, openrouter, medrax)
+        **kwargs: Additional configuration parameters
+
+    Returns:
+        LLMProvider: The configured LLM provider
+    """
+    provider_map = {
+        "openai": OpenAIProvider,
+        "google": GoogleProvider,
+        "openrouter": OpenRouterProvider,
+        "medrax": MedRAXProvider,
+    }
+
+    if provider_type not in provider_map:
+        raise ValueError(f"Unknown provider type: {provider_type}. Available: {list(provider_map.keys())}")
+
+    provider_class = provider_map[provider_type]
+    return provider_class(model_name, **kwargs)
+
+
+def create_benchmark(benchmark_name: str, data_dir: str, **kwargs) -> Benchmark:
+    """Create a benchmark based on the benchmark name.
+
+    Args:
+        benchmark_name (str): Name of the benchmark
+        data_dir (str): Directory containing benchmark data
+        **kwargs: Additional configuration parameters
+
+    Returns:
+        Benchmark: The configured benchmark
+    """
+    benchmark_map = {
+        "chest_agent_bench": ChestAgentBench,
+        "rexvqa": ReXVQABenchmark,
+    }
+
+    if benchmark_name not in benchmark_map:
+        raise ValueError(f"Unknown benchmark: {benchmark_name}. Available: {list(benchmark_map.keys())}")
+
+    benchmark_class = benchmark_map[benchmark_name]
+    return benchmark_class(data_dir, **kwargs)
+
+
+def run_benchmark_command(args) -> None:
+    """Run a benchmark."""
+    print(f"Running benchmark: {args.benchmark} with model: {args.model}")
+
+    # Create LLM provider
+    provider_kwargs = {}
+    if args.provider == "medrax":
+        provider_kwargs = {
+            "tools_to_use": args.medrax_tools.split(",") if args.medrax_tools else None,
+            "model_dir": args.model_dir,
+            "temp_dir": args.temp_dir,
+            "device": args.device,
+            "rag_config": None,  # You might want to add RAG config options
+        }
+
+    llm_provider = create_llm_provider(args.model, args.provider, **provider_kwargs)
+
+    # Create benchmark
+    benchmark_kwargs = {}
+
+    benchmark = create_benchmark(args.benchmark, args.data_dir, **benchmark_kwargs)
+
+    # Create runner config
+    config = BenchmarkRunConfig(
+        model_name=args.model,
+        benchmark_name=args.benchmark,
+        output_dir=args.output_dir,
+        max_questions=args.max_questions,
+        start_index=args.start_index,
+        temperature=args.temperature,
+        max_tokens=args.max_tokens,
+        system_prompt=args.system_prompt,
+        save_frequency=args.save_frequency,
+        log_level=args.log_level,
+    )
+
+    # Run benchmark
+    runner = BenchmarkRunner(config)
+    summary = runner.run_benchmark(llm_provider, benchmark)
+
+    print("\n" + "="*50)
+    print("BENCHMARK COMPLETED")
+    print("="*50)
+    print(f"Overall Accuracy: {summary['results']['accuracy']:.2f}%")
+    print(f"Total Questions: {summary['results']['total_questions']}")
+    print(f"Correct Answers: {summary['results']['correct_answers']}")
+    print(f"Total Duration: {summary['results']['total_duration']:.2f}s")
+    print(f"Results saved to: {summary['results_file']}")
+
+
+def evaluate_results_command(args) -> None:
+    """Evaluate benchmark results."""
+    print(f"Evaluating results: {args.results_files}")
+
+    evaluator = BenchmarkEvaluator(args.output_dir)
+
+    if len(args.results_files) == 1:
+        # Single model evaluation
+        evaluation = evaluator.evaluate_single_run(args.results_files[0])
+        print("\n" + "="*50)
+        print("SINGLE MODEL EVALUATION")
+        print("="*50)
+        print(f"Model: {evaluation.model_name}")
+        print(f"Benchmark: {evaluation.benchmark_name}")
+        print(f"Overall Accuracy: {evaluation.overall_accuracy:.2f}%")
+        print(f"Total Questions: {evaluation.total_questions}")
+        print(f"Error Rate: {evaluation.error_rate:.2f}%")
+        print(f"Total Duration: {evaluation.total_duration:.2f}s")
+
+        if evaluation.category_accuracies:
+            print("\nCategory Accuracies:")
+            for category, accuracy in evaluation.category_accuracies.items():
+                print(f"  {category}: {accuracy:.2f}%")
+
+    else:
+        # Multiple model comparison
+        comparison = evaluator.compare_models(args.results_files)
+
+        if "error" in comparison:
+            print(f"Error: {comparison['error']}")
+            return
+
+        print("\n" + "="*50)
+        print("MODEL COMPARISON")
+        print("="*50)
+
+        summary = comparison["summary"]
+        print(f"Models Compared: {summary['models_compared']}")
+        print(f"Best Overall Accuracy: {summary['best_overall_accuracy']:.2f}%")
+        print(f"Accuracy Range: {summary['accuracy_range'][0]:.2f}% - {summary['accuracy_range'][1]:.2f}%")
+
+        best_model = comparison["best_model"]
+        print(f"\nBest Model: {best_model['Model']} ({best_model['Accuracy (%)']:.2f}%)")
+
+        # Generate comprehensive report
+        report_path = evaluator.generate_report(args.results_files, args.report_name)
+        print(f"\nDetailed report saved to: {report_path}")
+
+        # Statistical significance test
+        if args.statistical_test:
+            print("\nRunning statistical significance tests...")
+            sig_results = evaluator.statistical_significance_test(args.results_files)
+            print(f"Found {len(sig_results['comparisons'])} pairwise comparisons")
+
+            for comp in sig_results["comparisons"]:
+                significance = "significant" if comp["significant"] else "not significant"
+                print(f"{comp['model1']} vs {comp['model2']}: {significance} (p={comp['p_value']:.4f})")
+
+
+def list_providers_command(args) -> None:
+    """List available LLM providers."""
+    print("Available LLM Providers:")
+    print("- openai: OpenAI GPT models")
+    print("- google: Google Gemini models")
+    print("- openrouter: OpenRouter API (multiple models)")
+    print("- medrax: MedRAX agent system")
+
+
+def list_benchmarks_command(args) -> None:
+    """List available benchmarks."""
+    print("Available Benchmarks:")
+    print("- chest_agent_bench: ChestAgentBench (complex medical reasoning on chest X-ray cases)")
+    print("- rexvqa: ReXVQA (large-scale chest radiology VQA)")
+
+
+def main():
+    """Main CLI entry point."""
+    parser = argparse.ArgumentParser(description="MedRAX Benchmarking Pipeline")
+    subparsers = parser.add_subparsers(dest="command", help="Available commands")
+
+    # Run benchmark command
+    run_parser = subparsers.add_parser("run", help="Run a benchmark")
+    run_parser.add_argument("--model", required=True, help="Model name (e.g., gpt-4o, gemini-2.5-pro)")
+    run_parser.add_argument("--provider", required=True, choices=["openai", "google", "openrouter", "medrax"], help="LLM provider")
+    run_parser.add_argument("--benchmark", required=True, choices=["chest_agent_bench", "rexvqa"], help="Benchmark to run")
+    run_parser.add_argument("--data-dir", required=True, help="Directory containing benchmark data")
+    run_parser.add_argument("--output-dir", default="benchmark_results", help="Output directory for results")
+    run_parser.add_argument("--max-questions", type=int, help="Maximum number of questions to process")
+    run_parser.add_argument("--start-index", type=int, default=0, help="Starting index for questions")
+    run_parser.add_argument("--temperature", type=float, default=0.7, help="Model temperature")
+    run_parser.add_argument("--max-tokens", type=int, default=1500, help="Maximum tokens per response")
+    run_parser.add_argument("--system-prompt", help="System prompt for the model")
+    run_parser.add_argument("--save-frequency", type=int, default=10, help="Save results every N questions")
+    run_parser.add_argument("--log-level", default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"])
+
+    # MedRAX-specific arguments
+    run_parser.add_argument("--medrax-tools", help="Comma-separated list of tools for MedRAX (e.g., WebBrowserTool,MedicalRAGTool)")
+    run_parser.add_argument("--model-dir", default="/model-weights", help="Directory containing model weights for MedRAX")
+    run_parser.add_argument("--temp-dir", default="temp", help="Temporary directory for MedRAX")
+    run_parser.add_argument("--device", default="cuda", help="Device for MedRAX models")
+
+    run_parser.set_defaults(func=run_benchmark_command)
+
+    # Evaluate results command
+    eval_parser = subparsers.add_parser("evaluate", help="Evaluate benchmark results")
+    eval_parser.add_argument("results_files", nargs="+", help="Path(s) to results files")
+    eval_parser.add_argument("--output-dir", default="evaluation_results", help="Output directory for evaluation")
+    eval_parser.add_argument("--report-name", default="evaluation_report", help="Name for the evaluation report")
+    eval_parser.add_argument("--statistical-test", action="store_true", help="Run statistical significance tests")
+    eval_parser.set_defaults(func=evaluate_results_command)
+
+    # List providers command
+    list_providers_parser = subparsers.add_parser("list-providers", help="List available LLM providers")
+    list_providers_parser.set_defaults(func=list_providers_command)
+
+    # List benchmarks command
+    list_benchmarks_parser = subparsers.add_parser("list-benchmarks", help="List available benchmarks")
+    list_benchmarks_parser.set_defaults(func=list_benchmarks_command)
+
+    args = parser.parse_args()
+
+    if args.command is None:
+        parser.print_help()
+        return
+
+    try:
+        args.func(args)
+    except Exception as e:
+        print(f"Error: {e}")
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
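As an illustration, and assuming the package is invoked as a module named benchmarking (this commit does not pin down an entry point), a run followed by an evaluation might look like:

python -m benchmarking.cli run --model gpt-4o --provider openai --benchmark rexvqa --data-dir data/rexvqa --max-questions 100
python -m benchmarking.cli evaluate benchmark_results/rexvqa_gpt-4o_20250101_120000_results.json

The results filename follows the runner's run_id pattern (benchmark_model_timestamp); the timestamp shown here is a placeholder.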
benchmarking/evaluation.py ADDED
@@ -0,0 +1,114 @@
+"""Evaluation code for analyzing benchmark results."""
+
+import json
+import pandas as pd
+import numpy as np
+from pathlib import Path
+from typing import Dict, List, Optional, Any, Tuple
+from dataclasses import dataclass
+from collections import defaultdict
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+
+@dataclass
+class EvaluationResult:
+    """Results of evaluating a benchmark run."""
+    model_name: str
+    benchmark_name: str
+    overall_accuracy: float
+    total_questions: int
+    correct_answers: int
+    total_duration: float
+    category_accuracies: Dict[str, float]
+    category_counts: Dict[str, int]
+    error_rate: float
+    avg_duration_per_question: float
+
+
+class BenchmarkEvaluator:
+    """Class for evaluating and comparing benchmark results."""
+
+    def __init__(self, output_dir: str = "evaluation_results"):
+        """Initialize the evaluator.
+
+        Args:
+            output_dir (str): Directory to save evaluation results
+        """
+        self.output_dir = Path(output_dir)
+        self.output_dir.mkdir(parents=True, exist_ok=True)
+
+    def load_results(self, results_file: str) -> Dict[str, Any]:
+        """Load benchmark results from file.
+
+        Args:
+            results_file (str): Path to the results file
+
+        Returns:
+            Dict[str, Any]: Loaded results data
+        """
+        with open(results_file, 'r') as f:
+            return json.load(f)
+
+    def evaluate_single_run(self, results_file: str) -> EvaluationResult:
+        """Evaluate a single benchmark run.
+
+        Args:
+            results_file (str): Path to the results file
+
+        Returns:
+            EvaluationResult: Evaluation results
+        """
+        results = self.load_results(results_file)
+
+        # Calculate basic metrics
+        total_questions = len(results)
+        correct_answers = sum(1 for r in results if r.get("is_correct", False))
+        total_duration = sum(r.get("duration", 0) for r in results)
+        errors = sum(1 for r in results if r.get("error") is not None)
+
+        overall_accuracy = (correct_answers / total_questions) * 100 if total_questions > 0 else 0
+        error_rate = (errors / total_questions) * 100 if total_questions > 0 else 0
+
+        # Calculate per-category metrics
+        category_stats = defaultdict(lambda: {"correct": 0, "total": 0})
+
+        for result in results:
+            metadata = result.get("metadata", {})
+            category = metadata.get("category")
+
+            if category:
+                category_stats[category]["total"] += 1
+                if result.get("is_correct", False):
+                    category_stats[category]["correct"] += 1
+
+        # Calculate category accuracies
+        category_accuracies = {}
+        category_counts = {}
+        for category, stats in category_stats.items():
+            category_accuracies[category] = (stats["correct"] / stats["total"]) * 100
+            category_counts[category] = stats["total"]
+
+        # Extract model and benchmark names (assuming they're in the filename or metadata)
+        results_path = Path(results_file)
+        filename_parts = results_path.stem.split("_")
+
+        model_name = "unknown"
+        benchmark_name = "unknown"
+
+        if len(filename_parts) >= 2:
+            benchmark_name = filename_parts[0]
+            model_name = filename_parts[1]
+
+        return EvaluationResult(
+            model_name=model_name,
+            benchmark_name=benchmark_name,
+            overall_accuracy=overall_accuracy,
+            total_questions=total_questions,
+            correct_answers=correct_answers,
+            total_duration=total_duration,
+            category_accuracies=category_accuracies,
+            category_counts=category_counts,
+            error_rate=error_rate,
+            avg_duration_per_question=total_duration / total_questions if total_questions > 0 else 0,
+        )
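A small sketch of driving the evaluator directly; the results path is a placeholder that follows the runner's naming scheme:

from benchmarking.evaluation import BenchmarkEvaluator

evaluator = BenchmarkEvaluator("evaluation_results")
ev = evaluator.evaluate_single_run("benchmark_results/rexvqa_gpt-4o_20250101_120000_results.json")
print(f"{ev.overall_accuracy:.2f}% over {ev.total_questions} questions ({ev.error_rate:.2f}% errors)")
for category, acc in ev.category_accuracies.items():
    print(f"  {category}: {acc:.2f}%")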
benchmarking/llm_providers/__init__.py CHANGED
@@ -1,6 +1,6 @@
 """LLM provider abstractions for benchmarking."""
 
-from .base import LLMProvider
+from .base import LLMProvider, LLMRequest, LLMResponse
 from .openai_provider import OpenAIProvider
 from .google_provider import GoogleProvider
 from .openrouter_provider import OpenRouterProvider
@@ -8,6 +8,8 @@ from .medrax_provider import MedRAXProvider
 
 __all__ = [
     "LLMProvider",
+    "LLMRequest",
+    "LLMResponse",
     "OpenAIProvider",
     "GoogleProvider",
    "OpenRouterProvider",
benchmarking/llm_providers/base.py CHANGED
@@ -63,6 +63,25 @@ class LLMProvider(ABC):
         """
         pass
 
+    def test_connection(self) -> bool:
+        """Test the connection to the LLM provider.
+
+        Returns:
+            bool: True if connection is successful, False otherwise
+        """
+        try:
+            # Simple test request
+            test_request = LLMRequest(
+                text="Hello",
+                temperature=0.0,
+                max_tokens=10
+            )
+            response = self.generate_response(test_request)
+            return response.content is not None and len(response.content.strip()) > 0
+        except Exception as e:
+            print(f"Connection test failed: {e}")
+            return False
+
     def _encode_image(self, image_path: str) -> str:
         """Encode image to base64 string.
 
benchmarking/llm_providers/medrax_provider.py CHANGED
@@ -13,7 +13,8 @@ from .base import LLMProvider, LLMRequest, LLMResponse
 # Import MedRAX components
 from medrax.agent import Agent
 from medrax.tools import *
-from medrax.utils import load_prompts_from_file, RAGConfig
+from medrax.utils import load_prompts_from_file
+from medrax.rag.rag import RAGConfig
 from medrax.models import ModelFactory
 from langgraph.checkpoint.memory import MemorySaver
 from langchain_core.messages import HumanMessage
benchmarking/runner.py ADDED
@@ -0,0 +1,397 @@
+"""Main test runner for benchmarking pipeline."""
+
+import json
+import time
+import logging
+from datetime import datetime
+from pathlib import Path
+from typing import Dict, Optional, Any
+from dataclasses import dataclass
+from tqdm import tqdm
+import re
+from .llm_providers import LLMProvider, LLMRequest, LLMResponse
+from .benchmarks import Benchmark, BenchmarkDataPoint
+
+
+@dataclass
+class BenchmarkResult:
+    """Result of running a benchmark on a single data point."""
+    data_point_id: str
+    question: str
+    model_answer: str
+    correct_answer: str
+    is_correct: bool
+    duration: float
+    usage: Optional[Dict[str, Any]] = None
+    error: Optional[str] = None
+    metadata: Optional[Dict[str, Any]] = None
+
+
+@dataclass
+class BenchmarkRunConfig:
+    """Configuration for a benchmark run."""
+    model_name: str
+    benchmark_name: str
+    output_dir: str
+    max_questions: Optional[int] = None
+    start_index: int = 0
+    temperature: float = 0.7
+    max_tokens: int = 1500
+    system_prompt: Optional[str] = None
+    save_frequency: int = 10  # Save results every N questions
+    log_level: str = "INFO"
+    additional_params: Optional[Dict[str, Any]] = None
+
+
+class BenchmarkRunner:
+    """Main class for running benchmarks against LLM providers."""
+
+    def __init__(self, config: BenchmarkRunConfig):
+        """Initialize the benchmark runner.
+
+        Args:
+            config (BenchmarkRunConfig): Configuration for the benchmark run
+        """
+        self.config = config
+        self.results = []
+        self.output_dir = Path(config.output_dir)
+        self.output_dir.mkdir(parents=True, exist_ok=True)
+
+        # Generate unique run ID (must exist before logging setup, which names the logger after it)
+        self.run_id = f"{config.benchmark_name}_{config.model_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
+
+        # Set up logging
+        self._setup_logging()
+
+        self.logger.info(f"Initialized benchmark runner with ID: {self.run_id}")
+
+    def _setup_logging(self) -> None:
+        """Set up logging configuration."""
+        log_file = self.output_dir / f"benchmark_run_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
+
+        # Create logger
+        self.logger = logging.getLogger(f"benchmark_runner_{self.run_id}")
+        self.logger.setLevel(getattr(logging, self.config.log_level))
+
+        # Create handlers
+        file_handler = logging.FileHandler(log_file)
+        console_handler = logging.StreamHandler()
+
+        # Create formatter
+        formatter = logging.Formatter(
+            '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+        )
+        file_handler.setFormatter(formatter)
+        console_handler.setFormatter(formatter)
+
+        # Add handlers to logger
+        self.logger.addHandler(file_handler)
+        self.logger.addHandler(console_handler)
+
+    def run_benchmark(
+        self,
+        llm_provider: LLMProvider,
+        benchmark: Benchmark,
+    ) -> Dict[str, Any]:
+        """Run a benchmark against an LLM provider.
+
+        Args:
+            llm_provider (LLMProvider): The LLM provider to test
+            benchmark (Benchmark): The benchmark to run
+
+        Returns:
+            Dict[str, Any]: Summary of benchmark results
+        """
+        self.logger.info(f"Starting benchmark run: {self.run_id}")
+        self.logger.info(f"Model: {llm_provider.model_name}")
+        self.logger.info(f"Benchmark: {benchmark}")
+
+        # Test provider connection
+        if not llm_provider.test_connection():
+            self.logger.error("LLM provider connection test failed")
+            return {"error": "LLM provider connection test failed"}
+
+        # Get data points to process
+        total_questions = len(benchmark)
+        max_questions = self.config.max_questions or total_questions
+        end_index = min(self.config.start_index + max_questions, total_questions)
+
+        self.logger.info(f"Processing questions {self.config.start_index} to {end_index-1} of {total_questions}")
+
+        # Initialize counters
+        processed = 0
+        correct = 0
+        total_duration = 0.0
+
+        # Process each data point
+        for i in tqdm(range(self.config.start_index, end_index), desc="Processing questions"):
+            try:
+                data_point = benchmark.get_data_point(i)
+
+                # Run the model on this data point
+                result = self._process_data_point(llm_provider, data_point)
+
+                # Update counters
+                processed += 1
+                if result.is_correct:
+                    correct += 1
+                total_duration += result.duration
+
+                # Add to results
+                self.results.append(result)
+
+                # Log progress
+                if processed % self.config.save_frequency == 0:
+                    self._save_intermediate_results()
+                    accuracy = (correct / processed) * 100
+                    avg_duration = total_duration / processed
+
+                    self.logger.info(
+                        f"Progress: {processed}/{end_index - self.config.start_index} | "
+                        f"Accuracy: {accuracy:.2f}% | "
+                        f"Avg Duration: {avg_duration:.2f}s"
+                    )
+
+            except Exception as e:
+                self.logger.error(f"Error processing data point {i}: {e}")
+                # Add error result
+                error_result = BenchmarkResult(
+                    data_point_id=f"error_{i}",
+                    question="",
+                    model_answer="",
+                    correct_answer="",
+                    is_correct=False,
+                    duration=0.0,
+                    error=str(e)
+                )
+                self.results.append(error_result)
+                continue
+
+        # Save final results
+        summary = self._save_final_results(benchmark)
+
+        self.logger.info(f"Benchmark run completed: {self.run_id}")
+        self.logger.info(f"Final accuracy: {summary['results']['accuracy']:.2f}%")
+        self.logger.info(f"Total duration: {summary['results']['total_duration']:.2f}s")
+
+        return summary
+
+    def _process_data_point(
+        self,
+        llm_provider: LLMProvider,
+        data_point: BenchmarkDataPoint,
+    ) -> BenchmarkResult:
+        """Process a single data point.
+
+        Args:
+            llm_provider (LLMProvider): The LLM provider to use
+            data_point (BenchmarkDataPoint): The data point to process
+
+        Returns:
+            BenchmarkResult: Result of processing the data point
+        """
+        start_time = time.time()
+
+        try:
+            # Create request
+            request = LLMRequest(
+                text=data_point.text,
+                images=data_point.images,
+                system_prompt=self.config.system_prompt,
+                temperature=self.config.temperature,
+                max_tokens=self.config.max_tokens,
+                additional_params=self.config.additional_params
+            )
+
+            # Get response from LLM
+            response: LLMResponse = llm_provider.generate_response(request)
+
+            # Extract answer (this may need customization based on benchmark)
+            model_answer = self._extract_answer(response.content)
+
+            # Check if correct
+            is_correct = self._is_correct_answer(model_answer, data_point.correct_answer)
+
+            duration = time.time() - start_time
+
+            return BenchmarkResult(
+                data_point_id=data_point.id,
+                question=data_point.text,
+                model_answer=model_answer,
+                correct_answer=data_point.correct_answer,
+                is_correct=is_correct,
+                duration=duration,
+                usage=response.usage,
+                metadata={
+                    "data_point_metadata": data_point.metadata,
+                    "case_id": data_point.case_id,
+                    "category": data_point.category,
+                    "raw_response": response.content,
+                }
+            )
+
+        except Exception as e:
+            duration = time.time() - start_time
+            return BenchmarkResult(
+                data_point_id=data_point.id,
+                question=data_point.text,
+                model_answer="",
+                correct_answer=data_point.correct_answer,
+                is_correct=False,
+                duration=duration,
+                error=str(e),
+                metadata={
+                    "data_point_metadata": data_point.metadata,
+                    "case_id": data_point.case_id,
+                    "category": data_point.category,
+                }
+            )
+
+    def _extract_answer(self, response_text: str) -> str:
+        """Extract the answer from the model response.
+
+        Args:
+            response_text (str): The full response text from the model
+
+        Returns:
+            str: The extracted answer
+        """
+        # This is a simple implementation - may need customization per benchmark
+        # For multiple choice, look for single letters A, B, C, D, E, F
+
+        # Look for patterns like "A", "B)", "(C)", "Answer: D", etc.
+        patterns = [
+            r'\b([A-F])\b',  # Single letter
+            r'\b([A-F])\)',  # Letter with closing parenthesis
+            r'\(([A-F])\)',  # Letter in parentheses
+            r'[Aa]nswer\s*:?\s*([A-F])',  # "Answer: X" format
+            r'[Cc]hoice\s*:?\s*([A-F])',  # "Choice: X" format
+        ]
+
+        for pattern in patterns:
+            match = re.search(pattern, response_text)
+            if match:
+                return match.group(1).upper()
+
+        # If no pattern matches, return the first letter found
+        letters = re.findall(r'\b[A-F]\b', response_text)
+        if letters:
+            return letters[0].upper()
+
+        # If no letters found, return the full response (truncated)
+        return response_text.strip()[:100]
+
+    def _is_correct_answer(self, model_answer: str, correct_answer: str) -> bool:
+        """Check if the model answer is correct.
+
+        Args:
+            model_answer (str): The model's answer
+            correct_answer (str): The correct answer
+
+        Returns:
+            bool: True if the answer is correct
+        """
+        if not model_answer or not correct_answer:
+            return False
+
+        # For multiple choice, compare just the letter
+        model_clean = model_answer.strip().upper()
+        correct_clean = correct_answer.strip().upper()
+
+        # Extract just the first letter for comparison
+        model_letter = model_clean[0] if model_clean else ""
+        correct_letter = correct_clean[0] if correct_clean else ""
+
+        return model_letter == correct_letter
+
+    def _save_intermediate_results(self) -> None:
+        """Save intermediate results to disk."""
+        results_file = self.output_dir / f"{self.run_id}_intermediate.json"
+
+        # Convert results to serializable format
+        results_data = []
+        for result in self.results:
+            results_data.append({
+                "data_point_id": result.data_point_id,
+                "question": result.question,
+                "model_answer": result.model_answer,
+                "correct_answer": result.correct_answer,
+                "is_correct": result.is_correct,
+                "duration": result.duration,
+                "usage": result.usage,
+                "error": result.error,
+                "metadata": result.metadata,
+            })
+
+        with open(results_file, 'w') as f:
+            json.dump(results_data, f, indent=2)
+
+    def _save_final_results(self, benchmark: Benchmark) -> Dict[str, Any]:
+        """Save final results and return summary.
+
+        Args:
+            benchmark (Benchmark): The benchmark that was run
+
+        Returns:
+            Dict[str, Any]: Summary of results
+        """
+        # Save detailed results
+        results_file = self.output_dir / f"{self.run_id}_results.json"
+        self._save_intermediate_results()
+
+        # Calculate summary statistics
+        total_questions = len(self.results)
+        correct_answers = sum(1 for r in self.results if r.is_correct)
+        total_duration = sum(r.duration for r in self.results)
+
+        accuracy = (correct_answers / total_questions) * 100 if total_questions > 0 else 0
+
+        # Calculate per-category accuracy
+        category_stats = {}
+        for result in self.results:
+            if result.metadata and result.metadata.get("category"):
+                category = result.metadata["category"]
+                if category not in category_stats:
+                    category_stats[category] = {"correct": 0, "total": 0}
+                category_stats[category]["total"] += 1
+                if result.is_correct:
+                    category_stats[category]["correct"] += 1
+
+        # Calculate accuracy for each category
+        category_accuracies = {}
+        for category, stats in category_stats.items():
+            category_accuracies[category] = (stats["correct"] / stats["total"]) * 100
+
+        # Create summary
+        summary = {
+            "run_id": self.run_id,
+            "timestamp": datetime.now().isoformat(),
+            "config": {
+                "model_name": self.config.model_name,
+                "benchmark_name": self.config.benchmark_name,
+                "temperature": self.config.temperature,
+                "max_tokens": self.config.max_tokens,
+                "system_prompt": self.config.system_prompt,
+            },
+            "benchmark_info": {
+                "total_size": len(benchmark),
+                "processed_questions": total_questions,
+                "start_index": self.config.start_index,
+            },
+            "results": {
+                "accuracy": accuracy,
+                "correct_answers": correct_answers,
+                "total_questions": total_questions,
+                "total_duration": total_duration,
+                "avg_duration_per_question": total_duration / total_questions if total_questions > 0 else 0,
+                "category_accuracies": category_accuracies,
+            },
+            "results_file": str(results_file),
+        }
+
+        # Save summary
+        summary_file = self.output_dir / f"{self.run_id}_summary.json"
+        with open(summary_file, 'w') as f:
+            json.dump(summary, f, indent=2)
+
+        return summary
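Wired together programmatically, mirroring what run_benchmark_command in cli.py does; the provider choice, model name, and data directory are illustrative:

from benchmarking.llm_providers import OpenAIProvider
from benchmarking.benchmarks import ReXVQABenchmark
from benchmarking.runner import BenchmarkRunner, BenchmarkRunConfig

config = BenchmarkRunConfig(
    model_name="gpt-4o",
    benchmark_name="rexvqa",
    output_dir="benchmark_results",
    max_questions=50,
)
runner = BenchmarkRunner(config)
summary = runner.run_benchmark(OpenAIProvider("gpt-4o"), ReXVQABenchmark("data/rexvqa"))
print(summary["results"]["accuracy"], summary["results_file"])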
pyproject.toml CHANGED
@@ -72,6 +72,8 @@ dependencies = [
     "langchain-google-genai>=0.1.0",
     "ray>=2.9.0",
     "langchain-sandbox>=0.0.6",
+    "seaborn>=0.12.0",
+    "huggingface_hub>=0.17.0",
 ]
 
 [project.optional-dependencies]