Junzhe Li committed
Commit 89321e2 · 1 Parent(s): b93ad3f

revamped benchmarking suite

benchmarking/benchmarks/base.py CHANGED
@@ -39,7 +39,7 @@ class Benchmark(ABC):
39
  self._load_data()
40
  self._shuffle_data()
41
 
42
- self.max_questions = kwargs.get("max_questions", None)
43
  if self.max_questions:
44
  self.data_points = self.data_points[:self.max_questions]
45
  print(f"Randomly sampled {self.max_questions} questions from {self.__class__.__name__}")
@@ -51,12 +51,13 @@ class Benchmark(ABC):
51
  """Load benchmark data from the data directory."""
52
  pass
53
 
54
- def _shuffle_data(self, random_seed: Optional[int]=42) -> None:
55
  """Shuffle the data points if a random seed is provided. If no random seed is provided, use 42 as default.
56
 
57
  This method is called automatically after data loading to ensure
58
  reproducible benchmark runs when a random seed is specified.
59
  """
 
60
  random.seed(random_seed)
61
  random.shuffle(self.data_points)
62
  print(f"Shuffled {len(self.data_points)} data points with seed {random_seed}")
@@ -99,21 +100,3 @@ class Benchmark(ABC):
99
  for i in range(len(self)):
100
  yield self.get_data_point(i)
101
 
102
- def validate_images(self) -> Tuple[List[str], List[str]]:
103
- """Validate that all image paths exist.
104
-
105
- Returns:
106
- Tuple[List[str], List[str]]: Tuple of (valid_image_paths, invalid_image_paths)
107
- """
108
- valid_images = []
109
- invalid_images = []
110
-
111
- for dp in self:
112
- if dp.images:
113
- for image_path in dp.images:
114
- if Path(image_path).exists():
115
- valid_images.append(image_path)
116
- else:
117
- invalid_images.append(image_path)
118
-
119
- return valid_images, invalid_images
 
39
  self._load_data()
40
  self._shuffle_data()
41
 
42
+ self.max_questions = self.config.get("max_questions", None)
43
  if self.max_questions:
44
  self.data_points = self.data_points[:self.max_questions]
45
  print(f"Randomly sampled {self.max_questions} questions from {self.__class__.__name__}")
 
51
  """Load benchmark data from the data directory."""
52
  pass
53
 
54
+ def _shuffle_data(self) -> None:
55
  """Shuffle the data points if a random seed is provided. If no random seed is provided, use 42 as default.
56
 
57
  This method is called automatically after data loading to ensure
58
  reproducible benchmark runs when a random seed is specified.
59
  """
60
+ random_seed = self.config.get("random_seed", 42)
61
  random.seed(random_seed)
62
  random.shuffle(self.data_points)
63
  print(f"Shuffled {len(self.data_points)} data points with seed {random_seed}")
 
100
  for i in range(len(self)):
101
  yield self.get_data_point(i)
102
 
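Benchmark sizing and shuffling now flow through a single kwargs-backed config: max_questions and random_seed are both read from self.config instead of being looked up ad hoc. A minimal sketch of how a subclass is expected to consume this, assuming the base constructor stores **kwargs as self.config and then calls _load_data() followed by _shuffle_data() (the subclass and its data are purely illustrative):

    class ToyBenchmark(Benchmark):
        def _load_data(self) -> None:
            # stand-in for a real loader
            self.data_points = [{"id": i} for i in range(100)]

    # shuffles with seed 7, then keeps the first 10 shuffled points
    bench = ToyBenchmark("data/toy", max_questions=10, random_seed=7)
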
benchmarking/benchmarks/rexvqa_benchmark.py CHANGED
@@ -48,11 +48,11 @@ class ReXVQABenchmark(Benchmark):
48
  max_questions (int): Maximum number of questions to load (default: None, load all)
49
  images_dir (str): Directory containing extracted PNG images (default: None)
50
  """
51
- super().__init__(data_dir, **kwargs)
52
-
53
  self.split = kwargs.get("split", "test")
54
  self.images_dir = f"{data_dir}/images/deid_png"
55
 
 
 
56
  def _load_data(self) -> None:
57
  """Load ReXVQA data from HuggingFace."""
58
  try:
 
48
  max_questions (int): Maximum number of questions to load (default: None, load all)
49
  images_dir (str): Directory containing extracted PNG images (default: None)
50
  """
 
 
51
  self.split = kwargs.get("split", "test")
52
  self.images_dir = f"{data_dir}/images/deid_png"
53
 
54
+ super().__init__(data_dir, **kwargs)
55
+
56
  def _load_data(self) -> None:
57
  """Load ReXVQA data from HuggingFace."""
58
  try:
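The only substantive change in this file is the ordering: super().__init__() now runs after split and images_dir are set, because the base constructor immediately calls _load_data(), which needs those attributes. A minimal sketch of the assumed initialization flow (structure taken from the hunk above):

    class ReXVQABenchmark(Benchmark):
        def __init__(self, data_dir: str, **kwargs):
            # attributes consumed by _load_data() must exist first
            self.split = kwargs.get("split", "test")
            self.images_dir = f"{data_dir}/images/deid_png"
            # base __init__ triggers _load_data() -> _shuffle_data() -> truncation
            super().__init__(data_dir, **kwargs)
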
benchmarking/cli.py CHANGED
@@ -8,12 +8,35 @@ from .benchmarks import *
8
  from .runner import BenchmarkRunner, BenchmarkRunConfig
9
 
10
 
11
- def create_llm_provider(model_name: str, provider_type: str, system_prompt: str, **kwargs) -> LLMProvider:
12
  """Create an LLM provider based on the model name and type.
13
 
14
  Args:
 
15
  model_name (str): Name of the model
16
- provider_type (str): Type of provider (openai, google, openrouter, medrax)
17
  system_prompt (str): System prompt identifier to load from file
18
  **kwargs: Additional configuration parameters
19
 
@@ -33,85 +56,50 @@ def create_llm_provider(model_name: str, provider_type: str, system_prompt: str,
33
  elif provider_type == "medrax":
34
  from .llm_providers.medrax_provider import MedRAXProvider
35
  provider_class = MedRAXProvider
 
 
 
36
  else:
37
- raise ValueError(f"Unknown provider type: {provider_type}. Available: openai, google, openrouter, medrax")
38
 
39
  return provider_class(model_name, system_prompt, **kwargs)
40
 
41
 
42
- def create_benchmark(benchmark_name: str, data_dir: str, **kwargs) -> Benchmark:
43
- """Create a benchmark based on the benchmark name.
44
-
45
- Args:
46
- benchmark_name (str): Name of the benchmark
47
- data_dir (str): Directory containing benchmark data
48
- **kwargs: Additional configuration parameters
49
-
50
- Returns:
51
- Benchmark: The configured benchmark
52
- """
53
- benchmark_map = {
54
- "rexvqa": ReXVQABenchmark,
55
- "chestagentbench": ChestAgentBenchBenchmark,
56
- }
57
-
58
- if benchmark_name not in benchmark_map:
59
- raise ValueError(f"Unknown benchmark: {benchmark_name}. Available: {list(benchmark_map.keys())}")
60
-
61
- benchmark_class = benchmark_map[benchmark_name]
62
- return benchmark_class(data_dir, **kwargs)
63
-
64
-
65
  def run_benchmark_command(args) -> None:
66
  """Run a benchmark."""
67
- print(f"Running benchmark: {args.benchmark} with model: {args.model}")
68
-
69
- # Create LLM provider
70
- provider_kwargs = {}
71
-
72
- llm_provider = create_llm_provider(model_name=args.model, provider_type=args.provider, system_prompt=args.system_prompt, **provider_kwargs)
73
 
74
  # Create benchmark
75
  benchmark_kwargs = {}
76
- if args.random_seed is not None:
77
- benchmark_kwargs["random_seed"] = args.random_seed
78
-
79
  benchmark = create_benchmark(benchmark_name=args.benchmark, data_dir=args.data_dir, **benchmark_kwargs)
80
 
81
  # Create runner config
82
  config = BenchmarkRunConfig(
 
83
  provider_name=args.provider,
84
  model_name=args.model,
85
- benchmark_name=args.benchmark,
86
  output_dir=args.output_dir,
87
  max_questions=args.max_questions,
88
  temperature=args.temperature,
89
  top_p=args.top_p,
90
  max_tokens=args.max_tokens,
91
- concurrency=args.concurrency
 
92
  )
93
 
94
  # Run benchmark
95
  runner = BenchmarkRunner(config)
96
- summary = runner.run_benchmark(llm_provider, benchmark)
97
-
98
- print("\n" + "="*50)
99
- print("BENCHMARK COMPLETED")
100
- print("="*50)
101
-
102
- # Check if benchmark run was successful
103
- if "error" in summary:
104
- print(f"Error: {summary['error']}")
105
- return
106
-
107
- # Print results
108
- print(f"Model: {args.model}")
109
- print(f"Benchmark: {args.benchmark}")
110
- print(f"Total Questions: {summary['results']['total_questions']}")
111
- print(f"Correct Answers: {summary['results']['correct_answers']}")
112
- print(f"Overall Accuracy: {summary['results']['accuracy']:.2f}%")
113
- print(f"Total Duration: {summary['results']['total_duration']:.2f}s")
114
- print(f"Results saved to: {summary['results_file']}")
115
 
116
 
117
  def main():
@@ -121,17 +109,17 @@ def main():
121
 
122
  # Run benchmark command
123
  run_parser = subparsers.add_parser("run", help="Run a benchmark evaluation")
124
- run_parser.add_argument("--model", required=True,
125
- help="Model name (e.g., gpt-4o, gpt-4.1-2025-04-14, gemini-2.5-pro)")
 
126
  run_parser.add_argument("--provider", required=True,
127
- choices=["openai", "google", "openrouter", "medrax"],
128
  help="LLM provider to use")
 
 
129
  run_parser.add_argument("--system-prompt", required=True,
130
  choices=["MEDICAL_ASSISTANT", "CHESTAGENTBENCH_PROMPT"],
131
  help="System prompt: MEDICAL_ASSISTANT (general) or CHESTAGENTBENCH_PROMPT (benchmarks)")
132
- run_parser.add_argument("--benchmark", required=True,
133
- choices=["rexvqa", "chestagentbench"],
134
- help="Benchmark dataset: rexvqa (radiology VQA) or chestagentbench (chest X-ray reasoning)")
135
  run_parser.add_argument("--data-dir", required=True,
136
  help="Directory containing benchmark data files")
137
  run_parser.add_argument("--output-dir", default="benchmark_results",
@@ -144,10 +132,10 @@ def main():
144
  help="Top-p nucleus sampling parameter (default: 0.95)")
145
  run_parser.add_argument("--max-tokens", type=int, default=5000,
146
  help="Maximum tokens per model response (default: 5000)")
147
- run_parser.add_argument("--random-seed", type=int, default=42,
148
- help="Random seed for shuffling benchmark data (enables reproducible runs, default: None)")
149
  run_parser.add_argument("--concurrency", type=int, default=1,
150
  help="Number of datapoints to process in parallel (default: 1)")
 
 
151
 
152
  run_parser.set_defaults(func=run_benchmark_command)
153
 
 
8
  from .runner import BenchmarkRunner, BenchmarkRunConfig
9
 
10
 
11
+ def create_benchmark(benchmark_name: str, data_dir: str, **kwargs) -> Benchmark:
12
+ """Create a benchmark based on the benchmark name.
13
+
14
+ Args:
15
+ benchmark_name (str): Name of the benchmark
16
+ data_dir (str): Directory containing benchmark data
17
+ **kwargs: Additional configuration parameters
18
+
19
+ Returns:
20
+ Benchmark: The configured benchmark
21
+ """
22
+ benchmark_map = {
23
+ "rexvqa": ReXVQABenchmark,
24
+ "chestagentbench": ChestAgentBenchBenchmark,
25
+ }
26
+
27
+ if benchmark_name not in benchmark_map:
28
+ raise ValueError(f"Unknown benchmark: {benchmark_name}. Available: {list(benchmark_map.keys())}")
29
+
30
+ benchmark_class = benchmark_map[benchmark_name]
31
+ return benchmark_class(data_dir, **kwargs)
32
+
33
+
34
+ def create_llm_provider(provider_type: str, model_name: str, system_prompt: str, **kwargs) -> LLMProvider:
35
  """Create an LLM provider based on the model name and type.
36
 
37
  Args:
38
+ provider_type (str): Type of provider (openai, google, openrouter, medrax, medgemma)
39
  model_name (str): Name of the model
 
40
  system_prompt (str): System prompt identifier to load from file
41
  **kwargs: Additional configuration parameters
42
 
 
56
  elif provider_type == "medrax":
57
  from .llm_providers.medrax_provider import MedRAXProvider
58
  provider_class = MedRAXProvider
59
+ elif provider_type == "medgemma":
60
+ from .llm_providers.medgemma_provider import MedGemmaProvider
61
+ provider_class = MedGemmaProvider
62
  else:
63
+ raise ValueError(f"Unknown provider type: {provider_type}. Available: openai, google, openrouter, medrax, medgemma")
64
 
65
  return provider_class(model_name, system_prompt, **kwargs)
66
 
67
 
68
  def run_benchmark_command(args) -> None:
69
  """Run a benchmark."""
70
+ print(f"Running benchmark: {args.benchmark} with provider: {args.provider}, model: {args.model}")
71
 
72
  # Create benchmark
73
  benchmark_kwargs = {}
74
+ benchmark_kwargs["max_questions"] = args.max_questions
75
+ benchmark_kwargs["random_seed"] = args.random_seed
 
76
  benchmark = create_benchmark(benchmark_name=args.benchmark, data_dir=args.data_dir, **benchmark_kwargs)
77
+
78
+ # Create LLM provider
79
+ provider_kwargs = {}
80
+ provider_kwargs["temperature"] = args.temperature
81
+ provider_kwargs["top_p"] = args.top_p
82
+ provider_kwargs["max_tokens"] = args.max_tokens
83
+ llm_provider = create_llm_provider(provider_type=args.provider, model_name=args.model, system_prompt=args.system_prompt, **provider_kwargs)
84
 
85
  # Create runner config
86
  config = BenchmarkRunConfig(
87
+ benchmark_name=args.benchmark,
88
  provider_name=args.provider,
89
  model_name=args.model,
 
90
  output_dir=args.output_dir,
91
  max_questions=args.max_questions,
92
  temperature=args.temperature,
93
  top_p=args.top_p,
94
  max_tokens=args.max_tokens,
95
+ concurrency=args.concurrency,
96
+ random_seed=args.random_seed
97
  )
98
 
99
  # Run benchmark
100
  runner = BenchmarkRunner(config)
101
+ summary = runner.run_benchmark(benchmark, llm_provider)
102
+ print(summary)
103
 
104
 
105
  def main():
 
109
 
110
  # Run benchmark command
111
  run_parser = subparsers.add_parser("run", help="Run a benchmark evaluation")
112
+ run_parser.add_argument("--benchmark", required=True,
113
+ choices=["rexvqa", "chestagentbench"],
114
+ help="Benchmark dataset: rexvqa (radiology VQA) or chestagentbench (chest X-ray reasoning)")
115
  run_parser.add_argument("--provider", required=True,
116
+ choices=["openai", "google", "openrouter", "medrax", "medgemma"],
117
  help="LLM provider to use")
118
+ run_parser.add_argument("--model", required=True,
119
+ help="Model name (e.g., gpt-4o, gpt-4.1-2025-04-14, gemini-2.5-pro)")
120
  run_parser.add_argument("--system-prompt", required=True,
121
  choices=["MEDICAL_ASSISTANT", "CHESTAGENTBENCH_PROMPT"],
122
  help="System prompt: MEDICAL_ASSISTANT (general) or CHESTAGENTBENCH_PROMPT (benchmarks)")
 
 
 
123
  run_parser.add_argument("--data-dir", required=True,
124
  help="Directory containing benchmark data files")
125
  run_parser.add_argument("--output-dir", default="benchmark_results",
 
132
  help="Top-p nucleus sampling parameter (default: 0.95)")
133
  run_parser.add_argument("--max-tokens", type=int, default=5000,
134
  help="Maximum tokens per model response (default: 5000)")
 
 
135
  run_parser.add_argument("--concurrency", type=int, default=1,
136
  help="Number of datapoints to process in parallel (default: 1)")
137
+ run_parser.add_argument("--random-seed", type=int, default=42,
138
+ help="Random seed for shuffling benchmark data (enables reproducible runs, default: 42)")
139
 
140
  run_parser.set_defaults(func=run_benchmark_command)
141
 
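Taken together, the reworked run command wires benchmark, provider, and runner roughly as sketched below; this mirrors what run_benchmark_command now does, with illustrative argument values rather than defaults:

    benchmark = create_benchmark("rexvqa", data_dir="data/rexvqa",
                                 max_questions=100, random_seed=42)
    llm_provider = create_llm_provider(provider_type="openai", model_name="gpt-4o",
                                       system_prompt="MEDICAL_ASSISTANT",
                                       temperature=0.2, top_p=0.95, max_tokens=5000)
    config = BenchmarkRunConfig(benchmark_name="rexvqa", provider_name="openai",
                                model_name="gpt-4o", output_dir="benchmark_results",
                                max_questions=100, temperature=0.2, top_p=0.95,
                                max_tokens=5000, concurrency=4, random_seed=42)
    summary = BenchmarkRunner(config).run_benchmark(benchmark, llm_provider)
    print(summary)
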
benchmarking/llm_providers/__init__.py CHANGED
@@ -5,6 +5,17 @@ from .openai_provider import OpenAIProvider
5
  from .google_provider import GoogleProvider
6
  from .medrax_provider import MedRAXProvider
7
  from .openrouter_provider import OpenRouterProvider
8
 
9
  __all__ = [
10
  "LLMProvider",
@@ -14,4 +25,7 @@ __all__ = [
14
  "GoogleProvider",
15
  "MedRAXProvider",
16
  "OpenRouterProvider",
 
 
 
17
  ]
 
5
  from .google_provider import GoogleProvider
6
  from .medrax_provider import MedRAXProvider
7
  from .openrouter_provider import OpenRouterProvider
8
+ from .medgemma_provider import MedGemmaProvider
9
+
10
+ # QwenProvider is optional - only import if dependencies are compatible
11
+ try:
12
+ from .qwen_provider import QwenProvider
13
+ QWEN_AVAILABLE = True
14
+ except ImportError as e:
15
+ QWEN_AVAILABLE = False
16
+ QwenProvider = None
17
+ print(f"QwenProvider not available: {e}")
18
+ print("To use Qwen models, upgrade transformers: pip install --upgrade git+https://github.com/huggingface/transformers")
19
 
20
  __all__ = [
21
  "LLMProvider",
 
25
  "GoogleProvider",
26
  "MedRAXProvider",
27
  "OpenRouterProvider",
28
+ "MedGemmaProvider",
29
+ "QwenProvider",
30
+ "QWEN_AVAILABLE",
31
  ]
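Because the Qwen import is optional, callers are expected to gate on QWEN_AVAILABLE before instantiating it. A small sketch of that guard (the model name is illustrative and the constructor signature is assumed to match the other providers):

    from benchmarking.llm_providers import QWEN_AVAILABLE, QwenProvider

    if QWEN_AVAILABLE:
        provider = QwenProvider("Qwen2.5-VL-7B-Instruct", "MEDICAL_ASSISTANT")
    else:
        print("Qwen support not installed; falling back to another provider")
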
benchmarking/llm_providers/base.py CHANGED
@@ -13,10 +13,6 @@ class LLMRequest:
13
  """Request to an LLM provider."""
14
  text: str
15
  images: Optional[List[str]] = None # List of image paths
16
- temperature: float = 0.7
17
- top_p: float = 0.95
18
- max_tokens: int = 5000
19
- additional_params: Optional[Dict[str, Any]] = None
20
 
21
 
22
  @dataclass
@@ -44,15 +40,17 @@ class LLMProvider(ABC):
44
  **kwargs: Additional configuration parameters
45
  """
46
  self.model_name = model_name
47
- self.config = kwargs
48
- self.prompt_name = system_prompt # Store the original prompt identifier
 
 
49
 
50
  # Load system prompt content from file
51
  try:
52
- prompts = load_prompts_from_file("medrax/docs/system_prompts.txt")
53
- self.system_prompt = prompts.get(system_prompt, None)
54
  if self.system_prompt is None:
55
- print(f"Warning: System prompt '{system_prompt}' not found in medrax/docs/system_prompts.txt.")
56
  except Exception as e:
57
  print(f"Error loading system prompt: {e}")
58
  self.system_prompt = None
@@ -85,9 +83,7 @@ class LLMProvider(ABC):
85
  try:
86
  # Simple test request
87
  test_request = LLMRequest(
88
- text="Hello! What model are you? Tell me your full specification.",
89
- temperature=0.5,
90
- max_tokens=1000
91
  )
92
  response = self.generate_response(test_request)
93
  return response.content is not None and len(response.content.strip()) > 0
@@ -95,6 +91,23 @@ class LLMProvider(ABC):
95
  print(f"Connection test failed: {e}")
96
  return False
97
 
98
  def _encode_image(self, image_path: str) -> str:
99
  """Encode image to base64 string.
100
 
@@ -110,23 +123,30 @@ class LLMProvider(ABC):
110
  except Exception as e:
111
  print(f"ERROR: _encode_image failed for {image_path} (type: {type(image_path)}): {e}")
112
  raise
113
-
114
- def _validate_image_paths(self, image_paths: List[str]) -> List[str]:
115
- """Validate that image paths exist and are readable.
116
 
117
  Args:
118
- image_paths (List[str]): List of image paths to validate
119
 
120
  Returns:
121
- List[str]: List of valid image paths
122
  """
123
- valid_paths = []
124
- for path in image_paths:
125
- if Path(path).exists() and Path(path).is_file():
126
- valid_paths.append(path)
127
- else:
128
- print(f"Warning: Image path does not exist: {path}")
129
- return valid_paths
130
 
131
  def __str__(self) -> str:
132
  """String representation of the provider."""
 
13
  """Request to an LLM provider."""
14
  text: str
15
  images: Optional[List[str]] = None # List of image paths
 
 
 
 
16
 
17
 
18
  @dataclass
 
40
  **kwargs: Additional configuration parameters
41
  """
42
  self.model_name = model_name
43
+ self.temperature = kwargs.get("temperature", 0.7)
44
+ self.top_p = kwargs.get("top_p", 0.95)
45
+ self.max_tokens = kwargs.get("max_tokens", 5000)
46
+ self.prompt_name = system_prompt
47
 
48
  # Load system prompt content from file
49
  try:
50
+ prompts = load_prompts_from_file("benchmarking/system_prompts.txt")
51
+ self.system_prompt = prompts.get(self.prompt_name, None)
52
  if self.system_prompt is None:
53
+ print(f"Warning: System prompt '{system_prompt}' not found in benchmarking/system_prompts.txt.")
54
  except Exception as e:
55
  print(f"Error loading system prompt: {e}")
56
  self.system_prompt = None
 
83
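The net effect of the LLMRequest and __init__ changes is that temperature, top_p, and max_tokens are now provider-level settings instead of per-request fields, so requests only carry text and image paths. A minimal sketch of the new call pattern (the concrete provider class and paths are illustrative):

    provider = OpenAIProvider("gpt-4o", "MEDICAL_ASSISTANT",
                              temperature=0.2, top_p=0.95, max_tokens=5000)
    request = LLMRequest(
        text="Describe the key findings in this chest X-ray.",
        images=["data/rexvqa/images/deid_png/example.png"],  # hypothetical path
    )
    response = provider.generate_response(request)
    print(response.content, response.duration)
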
  try:
84
  # Simple test request
85
  test_request = LLMRequest(
86
+ text="Hello! What model are you? Tell me your full specification."
 
 
87
  )
88
  response = self.generate_response(test_request)
89
  return response.content is not None and len(response.content.strip()) > 0
 
91
  print(f"Connection test failed: {e}")
92
  return False
93
 
94
+ def _validate_image_paths(self, image_paths: List[str]) -> List[str]:
95
+ """Validate that image paths exist and are readable.
96
+
97
+ Args:
98
+ image_paths (List[str]): List of image paths to validate
99
+
100
+ Returns:
101
+ List[str]: List of valid image paths
102
+ """
103
+ valid_paths = []
104
+ for path in image_paths:
105
+ if Path(path).exists() and Path(path).is_file():
106
+ valid_paths.append(path)
107
+ else:
108
+ print(f"Warning: Image path does not exist: {path}")
109
+ return valid_paths
110
+
111
  def _encode_image(self, image_path: str) -> str:
112
  """Encode image to base64 string.
113
 
 
123
  except Exception as e:
124
  print(f"ERROR: _encode_image failed for {image_path} (type: {type(image_path)}): {e}")
125
  raise
126
+
127
+ def _get_image_mime_type(self, image_path: str) -> str:
128
+ """Detect the MIME type of an image file.
129
 
130
  Args:
131
+ image_path (str): Path to the image file
132
 
133
  Returns:
134
+ str: MIME type (e.g., 'image/png', 'image/jpeg')
135
  """
136
+ # Get file extension
137
+ ext = Path(image_path).suffix.lower()
138
+
139
+ # Map extensions to MIME types
140
+ mime_types = {
141
+ '.png': 'image/png',
142
+ '.jpg': 'image/jpeg',
143
+ '.jpeg': 'image/jpeg',
144
+ '.gif': 'image/gif',
145
+ '.webp': 'image/webp',
146
+ '.bmp': 'image/bmp',
147
+ }
148
+
149
+ return mime_types.get(ext, 'image/png') # Default to PNG for medical images
150
 
151
  def __str__(self) -> str:
152
  """String representation of the provider."""
benchmarking/llm_providers/google_provider.py CHANGED
@@ -14,6 +14,10 @@ class GoogleProvider(LLMProvider):
14
 
15
  def _setup(self) -> None:
16
  """Set up Google langchain client."""
17
  api_key = os.getenv("GOOGLE_API_KEY")
18
  if not api_key:
19
  raise ValueError("GOOGLE_API_KEY environment variable is required")
@@ -21,7 +25,10 @@ class GoogleProvider(LLMProvider):
21
  # Create ChatGoogleGenerativeAI instance
22
  self.client = ChatGoogleGenerativeAI(
23
  model=self.model_name,
24
- google_api_key=api_key
 
 
 
25
  )
26
 
27
  @retry(wait=wait_exponential(multiplier=1, min=4, max=10), stop=stop_after_attempt(3))
@@ -54,9 +61,10 @@ class GoogleProvider(LLMProvider):
54
  try:
55
  # For langchain Google, pass image data as base64
56
  image_b64 = self._encode_image(image_path)
 
57
  content_parts.append({
58
  "type": "image_url",
59
- "image_url": f"data:image/jpeg;base64,{image_b64}"
60
  })
61
  except Exception as e:
62
  print(f"Error reading image {image_path}: {e}")
@@ -68,18 +76,13 @@ class GoogleProvider(LLMProvider):
68
 
69
  # Make API call using langchain
70
  try:
71
- # Update client parameters for this request
72
- self.client.temperature = request.temperature
73
- self.client.max_output_tokens = request.max_tokens
74
- self.client.top_p = request.top_p
75
-
76
  response = self.client.invoke(messages)
 
77
 
 
78
  duration = time.time() - start_time
79
 
80
- # Extract response content
81
- content = response.content if response.content else ""
82
-
83
  # Get usage information if available
84
  usage = {}
85
  if hasattr(response, 'usage_metadata') and response.usage_metadata:
 
14
 
15
  def _setup(self) -> None:
16
  """Set up Google langchain client."""
17
+ # Set provider name
18
+ self.provider_name = "google"
19
+
20
+ # Get API key from environment variable
21
  api_key = os.getenv("GOOGLE_API_KEY")
22
  if not api_key:
23
  raise ValueError("GOOGLE_API_KEY environment variable is required")
 
25
  # Create ChatGoogleGenerativeAI instance
26
  self.client = ChatGoogleGenerativeAI(
27
  model=self.model_name,
28
+ google_api_key=api_key,
29
+ temperature=self.temperature,
30
+ max_output_tokens=self.max_tokens,
31
+ top_p=self.top_p
32
  )
33
 
34
  @retry(wait=wait_exponential(multiplier=1, min=4, max=10), stop=stop_after_attempt(3))
 
61
  try:
62
  # For langchain Google, pass image data as base64
63
  image_b64 = self._encode_image(image_path)
64
+ mime_type = self._get_image_mime_type(image_path)
65
  content_parts.append({
66
  "type": "image_url",
67
+ "image_url": f"data:{mime_type};base64,{image_b64}"
68
  })
69
  except Exception as e:
70
  print(f"Error reading image {image_path}: {e}")
 
76
 
77
  # Make API call using langchain
78
  try:
79
+ # Make API call
80
  response = self.client.invoke(messages)
81
+ content = response.content if response.content else ""
82
 
83
+ # Calculate duration
84
  duration = time.time() - start_time
85
 
 
 
 
86
  # Get usage information if available
87
  usage = {}
88
  if hasattr(response, 'usage_metadata') and response.usage_metadata:
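With _get_image_mime_type added to the base class, PNG chest X-rays are no longer mislabeled as JPEG in the data URL. A small sketch of the assumed helper behavior (the file name is illustrative):

    mime_type = provider._get_image_mime_type("study_001.png")   # -> "image/png"
    image_b64 = provider._encode_image("study_001.png")
    image_url = f"data:{mime_type};base64,{image_b64}"
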
benchmarking/llm_providers/medgemma_provider.py ADDED
@@ -0,0 +1,222 @@
1
+ """MedGemma LLM provider implementation using the MedGemma FastAPI service."""
2
+
3
+ import os
4
+ import time
5
+ import httpx
6
+ from typing import Optional
7
+ from pathlib import Path
8
+ from tenacity import retry, wait_exponential, stop_after_attempt
9
+
10
+ from .base import LLMProvider, LLMRequest, LLMResponse
11
+
12
+
13
+ class MedGemmaProvider(LLMProvider):
14
+ """MedGemma LLM provider that communicates with the MedGemma FastAPI service.
15
+
16
+ This provider wraps Google's MedGemma 4B model as an LLMProvider for benchmarking.
17
+ It communicates with a running MedGemma FastAPI service on localhost:8002.
18
+
19
+ MedGemma is a specialized multimodal AI model trained on medical images and text.
20
+ It provides expert-level analysis for chest X-rays, dermatology images,
21
+ ophthalmology images, and histopathology slides.
22
+
23
+ Requirements:
24
+ - MedGemma FastAPI service must be running on the configured API URL
25
+ - Default URL: http://localhost:8002
26
+ - Can be overridden via MEDGEMMA_API_URL environment variable
27
+ """
28
+
29
+ def __init__(self, model_name: str, system_prompt: str, **kwargs):
30
+ """Initialize MedGemma provider.
31
+
32
+ Args:
33
+ model_name (str): Model name (for consistency with other providers)
34
+ system_prompt (str): System prompt identifier to load from file
35
+ **kwargs: Additional configuration parameters
36
+ - api_url: URL of the MedGemma FastAPI service
37
+ - max_new_tokens: Maximum tokens to generate (default: 300)
38
+ """
39
+ # Extract MedGemma-specific config before calling super().__init__
40
+ self.api_url = kwargs.pop('api_url', None) or os.getenv('MEDGEMMA_API_URL', 'http://localhost:8002')
41
+ self.max_new_tokens = kwargs.pop('max_new_tokens', 300)
42
+ self.client = None
43
+
44
+ # Call parent constructor
45
+ super().__init__(model_name, system_prompt, **kwargs)
46
+
47
+ def _setup(self) -> None:
48
+ """Set up httpx client for communicating with MedGemma API."""
49
+ # Create httpx client with reasonable timeouts
50
+ timeout_config = httpx.Timeout(
51
+ timeout=300.0, # 5 minutes for inference
52
+ connect=10.0 # 10 seconds to establish connection
53
+ )
54
+ self.client = httpx.Client(timeout=timeout_config)
55
+
56
+ # Test connection to MedGemma service
57
+ try:
58
+ response = self.client.get(f"{self.api_url}/docs")
59
+ if response.status_code != 200:
60
+ print(f"Warning: MedGemma API at {self.api_url} may not be running (status: {response.status_code})")
61
+ except httpx.ConnectError:
62
+ print(f"Warning: Could not connect to MedGemma API at {self.api_url}")
63
+ print("Please ensure the MedGemma FastAPI service is running:")
64
+ print(f" python medrax/tools/vqa/medgemma/medgemma.py")
65
+
66
+ @retry(wait=wait_exponential(multiplier=1, min=4, max=10), stop=stop_after_attempt(3))
67
+ def generate_response(self, request: LLMRequest) -> LLMResponse:
68
+ """Generate response using MedGemma API.
69
+
70
+ Args:
71
+ request (LLMRequest): The request containing text, images, and parameters
72
+
73
+ Returns:
74
+ LLMResponse: The response from MedGemma
75
+ """
76
+ start_time = time.time()
77
+
78
+ if self.client is None:
79
+ return LLMResponse(
80
+ content="Error: MedGemma client not initialized",
81
+ duration=time.time() - start_time
82
+ )
83
+
84
+ try:
85
+ # Validate and prepare images
86
+ if not request.images:
87
+ return LLMResponse(
88
+ content="Error: MedGemma requires at least one image",
89
+ duration=time.time() - start_time
90
+ )
91
+
92
+ valid_images = self._validate_image_paths(request.images)
93
+ if not valid_images:
94
+ return LLMResponse(
95
+ content="Error: No valid image paths provided",
96
+ duration=time.time() - start_time
97
+ )
98
+
99
+ # Prepare multipart form data
100
+ files_to_send = []
101
+ for image_path in valid_images:
102
+ try:
103
+ # Detect correct MIME type based on file extension
104
+ ext = Path(image_path).suffix.lower()
105
+ mime_type = "image/png" if ext == ".png" else "image/jpeg"
106
+
107
+ # Read image file
108
+ with open(image_path, "rb") as f:
109
+ image_data = f.read()
110
+
111
+ # Add to files list
112
+ files_to_send.append(
113
+ ("images", (os.path.basename(image_path), image_data, mime_type))
114
+ )
115
+ except Exception as e:
116
+ print(f"Error reading image {image_path}: {e}")
117
+ continue
118
+
119
+ if not files_to_send:
120
+ return LLMResponse(
121
+ content="Error: Failed to read any image files",
122
+ duration=time.time() - start_time
123
+ )
124
+
125
+ # Prepare form data
126
+ # Use system_prompt if provided, otherwise use default
127
+ system_prompt_text = self.system_prompt if self.system_prompt else "You are an expert radiologist."
128
+
129
+ # Override max_new_tokens if provided in request
130
+ max_tokens = getattr(request, 'max_tokens', self.max_new_tokens)
131
+
132
+ data = {
133
+ "prompt": request.text,
134
+ "system_prompt": system_prompt_text,
135
+ "max_new_tokens": max_tokens,
136
+ }
137
+
138
+ # Make API request
139
+ response = self.client.post(
140
+ f"{self.api_url}/analyze-images/",
141
+ data=data,
142
+ files=files_to_send,
143
+ )
144
+
145
+ # Check for errors
146
+ response.raise_for_status()
147
+
148
+ # Parse response
149
+ response_data = response.json()
150
+ content = response_data.get("response", "")
151
+ metadata = response_data.get("metadata", {})
152
+
153
+ duration = time.time() - start_time
154
+
155
+ # MedGemma doesn't provide token usage, but we can include request info
156
+ usage = {
157
+ "num_images": len(valid_images),
158
+ "max_new_tokens": max_tokens,
159
+ }
160
+
161
+ return LLMResponse(
162
+ content=content,
163
+ usage=usage,
164
+ duration=duration
165
+ )
166
+
167
+ except httpx.TimeoutException as e:
168
+ duration = time.time() - start_time
169
+ error_msg = f"MedGemma API request timed out after {duration:.1f}s. The server might be overloaded or the model is taking too long to process."
170
+ print(f"Error: {error_msg}")
171
+ return LLMResponse(
172
+ content=f"Error: {error_msg}",
173
+ duration=duration
174
+ )
175
+
176
+ except httpx.ConnectError as e:
177
+ duration = time.time() - start_time
178
+ error_msg = f"Could not connect to MedGemma API at {self.api_url}. Please ensure the service is running."
179
+ print(f"Error: {error_msg}")
180
+ return LLMResponse(
181
+ content=f"Error: {error_msg}",
182
+ duration=duration
183
+ )
184
+
185
+ except httpx.HTTPStatusError as e:
186
+ duration = time.time() - start_time
187
+ error_msg = f"MedGemma API returned error {e.response.status_code}: {e.response.text}"
188
+ print(f"Error: {error_msg}")
189
+ return LLMResponse(
190
+ content=f"Error: {error_msg}",
191
+ duration=duration
192
+ )
193
+
194
+ except Exception as e:
195
+ duration = time.time() - start_time
196
+ error_msg = f"Unexpected error calling MedGemma API: {str(e)}"
197
+ print(f"Error: {error_msg}")
198
+ return LLMResponse(
199
+ content=f"Error: {error_msg}",
200
+ duration=duration
201
+ )
202
+
203
+ def test_connection(self) -> bool:
204
+ """Test the connection to the MedGemma API service.
205
+
206
+ Returns:
207
+ bool: True if connection is successful and service is responding
208
+ """
209
+ try:
210
+ # Try to access the API docs endpoint
211
+ response = self.client.get(f"{self.api_url}/docs")
212
+ return response.status_code == 200
213
+ except Exception as e:
214
+ print(f"MedGemma connection test failed: {e}")
215
+ return False
216
+
217
+ def __del__(self):
218
+ """Clean up httpx client on deletion."""
219
+ if self.client is not None:
220
+ self.client.close()
221
+
222
+
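A minimal usage sketch for the new provider, assuming the MedGemma FastAPI service is already running on localhost:8002 (the model name and image path are illustrative):

    provider = MedGemmaProvider("medgemma-4b-it", "MEDICAL_ASSISTANT",
                                api_url="http://localhost:8002", max_new_tokens=300)
    request = LLMRequest(text="Is there evidence of pneumothorax?",
                         images=["temp/example_cxr.png"])  # hypothetical image
    response = provider.generate_response(request)
    print(response.content, response.usage)
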
benchmarking/llm_providers/medrax_provider.py CHANGED
@@ -21,7 +21,9 @@ class MedRAXProvider(LLMProvider):
21
  system_prompt (str): System prompt to use
22
  **kwargs: Additional configuration parameters
23
  """
24
- self.model_name = model_name
 
 
25
  self.agent = None
26
  self.tools_dict = None
27
 
@@ -33,15 +35,15 @@ class MedRAXProvider(LLMProvider):
33
  print("Starting server...")
34
 
35
  selected_tools = [
36
- "TorchXRayVisionClassifierTool", # For classifying chest X-ray images using TorchXRayVision
37
- "ArcPlusClassifierTool", # For advanced chest X-ray classification using ArcPlus
38
- "ChestXRayReportGeneratorTool", # For generating medical reports from X-rays
39
- "XRayPhraseGroundingTool", # For locating described features in X-rays
40
- "MedGemmaVQATool", # Google MedGemma VQA tool
41
- "XRayVQATool", # For visual question answering on X-rays
42
- "MedicalRAGTool", # For retrieval-augmented generation with medical knowledge
43
- "WebBrowserTool", # For web browsing and search capabilities
44
- "DuckDuckGoSearchTool", # For privacy-focused web search using DuckDuckGo
45
  ]
46
 
47
  rag_config = RAGConfig(
@@ -62,14 +64,15 @@ class MedRAXProvider(LLMProvider):
62
  model_kwargs = {}
63
 
64
  agent, tools_dict = initialize_agent(
65
- prompt_file="medrax/docs/system_prompts.txt",
66
  tools_to_use=selected_tools,
67
- model_dir="/scratch/ssd004/scratch/victorli/model-weights",
68
  temp_dir="temp", # Change this to the path of the temporary directory
69
  device="cuda:0",
70
  model=self.model_name, # Change this to the model you want to use, e.g. gpt-4.1-2025-04-14, gemini-2.5-pro
71
- temperature=1.0,
72
- top_p=0.95,
 
73
  model_kwargs=model_kwargs,
74
  rag_config=rag_config,
75
  system_prompt=self.prompt_name,
@@ -107,32 +110,34 @@ class MedRAXProvider(LLMProvider):
107
  thread_id = str(int(time.time() * 1000)) # Unique thread ID
108
 
109
  if request.images:
110
  valid_images = self._validate_image_paths(request.images)
111
  print(f"Processing {len(valid_images)} images")
112
- for i, image_path in enumerate(valid_images):
113
- # Add image path message for tools
114
- messages.append(HumanMessage(content=f"image_path: {image_path}"))
115
-
116
- # Add image content for multimodal LLM
 
 
117
  try:
118
- with open(image_path, "rb") as img_file:
119
- img_base64 = self._encode_image(image_path)
120
 
121
- messages.append(HumanMessage(content=[{
122
  "type": "image_url",
123
- "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"}
124
- }]))
125
  except Exception as e:
126
  print(f"ERROR: Image encoding failed for {image_path}: {e}")
127
  raise
128
-
129
- # Add text message
130
- if request.images:
131
- # If there are images, add text as part of multimodal content
132
- messages.append(HumanMessage(content=[{
133
- "type": "text",
134
- "text": request.text
135
- }]))
136
  else:
137
  # If no images, add text as simple string
138
  messages.append(HumanMessage(content=request.text))
@@ -216,8 +221,67 @@ class MedRAXProvider(LLMProvider):
216
  "type": type(msg).__name__,
217
  "content": str(msg.content) if hasattr(msg, 'content') else str(msg)
218
  }
219
  chunk_messages.append(msg_info)
220
- print(f"Message in chunk: {msg_info}")
221
  serializable_chunk["messages"] = chunk_messages
222
 
223
  return serializable_chunk
 
21
  system_prompt (str): System prompt to use
22
  **kwargs: Additional configuration parameters
23
  """
24
+ # Set provider name
25
+ self.provider_name = "medrax"
26
+
27
  self.agent = None
28
  self.tools_dict = None
29
 
 
35
  print("Starting server...")
36
 
37
  selected_tools = [
38
+ # "TorchXRayVisionClassifierTool", # For classifying chest X-ray images using TorchXRayVision
39
+ # "ArcPlusClassifierTool", # For advanced chest X-ray classification using ArcPlus
40
+ # "ChestXRayReportGeneratorTool", # For generating medical reports from X-rays
41
+ # "XRayPhraseGroundingTool", # For locating described features in X-rays
42
+ # "MedGemmaVQATool", # Google MedGemma VQA tool
43
+ # "XRayVQATool", # For visual question answering on X-rays
44
+ # "MedicalRAGTool", # For retrieval-augmented generation with medical knowledge
45
+ # "WebBrowserTool", # For web browsing and search capabilities
46
+ # "DuckDuckGoSearchTool", # For privacy-focused web search using DuckDuckGo
47
  ]
48
 
49
  rag_config = RAGConfig(
 
64
  model_kwargs = {}
65
 
66
  agent, tools_dict = initialize_agent(
67
+ prompt_file="benchmarking/system_prompts.txt",
68
  tools_to_use=selected_tools,
69
+ model_dir="/home/lijunzh3/scratch/MedRAX2/model-weights",
70
  temp_dir="temp", # Change this to the path of the temporary directory
71
  device="cuda:0",
72
  model=self.model_name, # Change this to the model you want to use, e.g. gpt-4.1-2025-04-14, gemini-2.5-pro
73
+ temperature=self.temperature,
74
+ top_p=self.top_p,
75
+ max_tokens=self.max_tokens,
76
  model_kwargs=model_kwargs,
77
  rag_config=rag_config,
78
  system_prompt=self.prompt_name,
 
110
  thread_id = str(int(time.time() * 1000)) # Unique thread ID
111
 
112
  if request.images:
113
+ # Build multimodal content with text and images
114
+ content = [{"type": "text", "text": request.text}]
115
+
116
+ # Validate image paths
117
  valid_images = self._validate_image_paths(request.images)
118
  print(f"Processing {len(valid_images)} images")
119
+
120
+ # Add image paths for tools
121
+ for image_path in valid_images:
122
+ content.append({"type": "text", "text": f"image_path: {image_path}"})
123
+
124
+ # Add image content for multimodal LLM
125
+ for image_path in valid_images:
126
  try:
127
+ img_base64 = self._encode_image(image_path)
128
+ mime_type = self._get_image_mime_type(image_path)
129
 
130
+ content.append({
131
  "type": "image_url",
132
+ "image_url": {"url": f"data:{mime_type};base64,{img_base64}"}
133
+ })
134
  except Exception as e:
135
  print(f"ERROR: Image encoding failed for {image_path}: {e}")
136
  raise
137
+
138
+ # Create single multimodal message
139
+ messages.append(HumanMessage(content=content))
140
+
141
  else:
142
  # If no images, add text as simple string
143
  messages.append(HumanMessage(content=request.text))
 
221
  "type": type(msg).__name__,
222
  "content": str(msg.content) if hasattr(msg, 'content') else str(msg)
223
  }
224
+
225
+ # Extract response metadata (reasoning/thinking traces)
226
+ if hasattr(msg, 'response_metadata') and msg.response_metadata:
227
+ try:
228
+ msg_info["response_metadata"] = dict(msg.response_metadata)
229
+
230
+ # Extract specific reasoning fields for easier access
231
+ # Gemini 2.0 Flash Thinking uses 'thoughts'
232
+ if "thoughts" in msg.response_metadata:
233
+ msg_info["thinking"] = msg.response_metadata["thoughts"]
234
+
235
+ # DeepSeek-R1 and similar models use 'reasoning_content'
236
+ if "reasoning_content" in msg.response_metadata:
237
+ msg_info["reasoning"] = msg.response_metadata["reasoning_content"]
238
+
239
+ # Some models expose thinking in other fields
240
+ if "extended_thinking" in msg.response_metadata:
241
+ msg_info["extended_thinking"] = msg.response_metadata["extended_thinking"]
242
+ except Exception as e:
243
+ print(f"Warning: Could not serialize response_metadata: {e}")
244
+
245
+ # Extract usage metadata (reasoning tokens for o1/o3 models)
246
+ if hasattr(msg, 'usage_metadata') and msg.usage_metadata:
247
+ try:
248
+ msg_info["usage_metadata"] = dict(msg.usage_metadata)
249
+
250
+ # Highlight reasoning tokens if present
251
+ if isinstance(msg.usage_metadata, dict) and "reasoning_tokens" in msg.usage_metadata:
252
+ msg_info["reasoning_tokens"] = msg.usage_metadata["reasoning_tokens"]
253
+ except Exception as e:
254
+ print(f"Warning: Could not serialize usage_metadata: {e}")
255
+
256
+ # Extract additional kwargs (some models put reasoning here)
257
+ if hasattr(msg, 'additional_kwargs') and msg.additional_kwargs:
258
+ try:
259
+ # Filter for reasoning-related fields
260
+ reasoning_kwargs = {}
261
+ for key in ['thinking', 'reasoning', 'thoughts', 'chain_of_thought']:
262
+ if key in msg.additional_kwargs:
263
+ reasoning_kwargs[key] = msg.additional_kwargs[key]
264
+
265
+ if reasoning_kwargs:
266
+ msg_info["additional_reasoning"] = reasoning_kwargs
267
+
268
+ # Include full additional_kwargs for completeness (may contain other useful info)
269
+ msg_info["additional_kwargs"] = dict(msg.additional_kwargs)
270
+ except Exception as e:
271
+ print(f"Warning: Could not serialize additional_kwargs: {e}")
272
+
273
  chunk_messages.append(msg_info)
274
+
275
+ # Enhanced logging for debugging
276
+ log_msg = f"Message in chunk: type={msg_info['type']}"
277
+ if "thinking" in msg_info:
278
+ log_msg += f", has_thinking=True (length={len(str(msg_info['thinking']))})"
279
+ if "reasoning" in msg_info:
280
+ log_msg += f", has_reasoning=True (length={len(str(msg_info['reasoning']))})"
281
+ if "reasoning_tokens" in msg_info:
282
+ log_msg += f", reasoning_tokens={msg_info['reasoning_tokens']}"
283
+ print(log_msg)
284
+
285
  serializable_chunk["messages"] = chunk_messages
286
 
287
  return serializable_chunk
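The extra serialization above surfaces reasoning traces in the chunk history. A sketch of how a results consumer might read them back (field names as written in the hunk; the chunk structure is assumed):

    for chunk in (response.chunk_history or []):
        for msg in chunk.get("messages", []):
            if "thinking" in msg:
                print("thoughts:", str(msg["thinking"])[:200])
            if "reasoning" in msg:
                print("reasoning:", str(msg["reasoning"])[:200])
            if "reasoning_tokens" in msg:
                print("reasoning tokens:", msg["reasoning_tokens"])
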
benchmarking/llm_providers/openai_provider.py CHANGED
@@ -14,21 +14,28 @@ class OpenAIProvider(LLMProvider):
14
 
15
  def _setup(self) -> None:
16
  """Set up OpenAI langchain client."""
17
  api_key = os.getenv("OPENAI_API_KEY")
18
- if not api_key:
19
- raise ValueError("OPENAI_API_KEY environment variable is required")
20
-
21
  base_url = os.getenv("OPENAI_BASE_URL")
 
 
22
 
23
- # Create ChatOpenAI instance
24
  kwargs = {
25
  "model": self.model_name,
26
  "api_key": api_key,
 
 
27
  }
28
-
29
  if base_url:
30
  kwargs["base_url"] = base_url
31
-
 
 
 
32
  self.client = ChatOpenAI(**kwargs)
33
 
34
  @retry(wait=wait_exponential(multiplier=1, min=4, max=10), stop=stop_after_attempt(3))
@@ -63,10 +70,11 @@ class OpenAIProvider(LLMProvider):
63
  for image_path in valid_images:
64
  try:
65
  image_b64 = self._encode_image(image_path)
 
66
  user_content.append({
67
  "type": "image_url",
68
  "image_url": {
69
- "url": f"data:image/jpeg;base64,{image_b64}",
70
  "detail": "high"
71
  }
72
  })
@@ -75,13 +83,8 @@ class OpenAIProvider(LLMProvider):
75
 
76
  messages.append(HumanMessage(content=user_content))
77
 
78
- # Make API call using langchain
79
  try:
80
- # Update client parameters for this request
81
- self.client.temperature = request.temperature
82
- self.client.max_tokens = request.max_tokens
83
- self.client.top_p = request.top_p
84
-
85
  response = self.client.invoke(messages)
86
 
87
  duration = time.time() - start_time
 
14
 
15
  def _setup(self) -> None:
16
  """Set up OpenAI langchain client."""
17
+ # Set provider name
18
+ self.provider_name = "openai"
19
+
20
+ # Get API key and base URL from environment variables
21
  api_key = os.getenv("OPENAI_API_KEY")
 
 
 
22
  base_url = os.getenv("OPENAI_BASE_URL")
23
+ if not api_key or not base_url:
24
+ raise ValueError("OPENAI_API_KEY and OPENAI_BASE_URL environment variables are required")
25
 
26
+ # Construct kwargs for ChatOpenAI instance
27
  kwargs = {
28
  "model": self.model_name,
29
  "api_key": api_key,
30
+ "temperature": self.temperature,
31
+ "max_tokens": self.max_tokens
32
  }
 
33
  if base_url:
34
  kwargs["base_url"] = base_url
35
+ if self.model_name.startswith("gpt-5") or self.model_name.startswith("o1") or self.model_name.startswith("o3"):
36
+ kwargs["reasoning_effort"] = "high"
37
+
38
+ # Create ChatOpenAI instance
39
  self.client = ChatOpenAI(**kwargs)
40
 
41
  @retry(wait=wait_exponential(multiplier=1, min=4, max=10), stop=stop_after_attempt(3))
 
70
  for image_path in valid_images:
71
  try:
72
  image_b64 = self._encode_image(image_path)
73
+ mime_type = self._get_image_mime_type(image_path)
74
  user_content.append({
75
  "type": "image_url",
76
  "image_url": {
77
+ "url": f"data:{mime_type};base64,{image_b64}",
78
  "detail": "high"
79
  }
80
  })
 
83
 
84
  messages.append(HumanMessage(content=user_content))
85
 
86
+ # Make API call
87
  try:
 
 
 
 
 
88
  response = self.client.invoke(messages)
89
 
90
  duration = time.time() - start_time
benchmarking/llm_providers/openrouter_provider.py CHANGED
@@ -13,11 +13,16 @@ class OpenRouterProvider(LLMProvider):
13
 
14
  def _setup(self) -> None:
15
  """Set up OpenRouter client models."""
 
 
 
 
16
  api_key = os.getenv("OPENROUTER_API_KEY")
17
- if not api_key:
18
- raise ValueError("OPENROUTER_API_KEY environment variable is required for xAI Grok via OpenRouter.")
19
  base_url = os.getenv("OPENROUTER_BASE_URL", "https://openrouter.ai/api/v1")
20
- # Use OpenAI SDK with OpenRouter endpoint
 
 
 
21
  self.client = OpenAI(api_key=api_key, base_url=base_url)
22
 
23
  @retry(wait=wait_exponential(multiplier=1, min=4, max=10), stop=stop_after_attempt(3))
@@ -45,10 +50,11 @@ class OpenRouterProvider(LLMProvider):
45
  for image_path in valid_images:
46
  try:
47
  image_b64 = self._encode_image(image_path)
 
48
  user_content.append({
49
  "type": "image_url",
50
  "image_url": {
51
- "url": f"data:image/jpeg;base64,{image_b64}",
52
  "detail": "high"
53
  }
54
  })
@@ -57,14 +63,14 @@ class OpenRouterProvider(LLMProvider):
57
 
58
  messages.append({"role": "user", "content": user_content})
59
 
 
60
  try:
61
  response = self.client.chat.completions.create(
62
  model=self.model_name,
63
  messages=messages,
64
- temperature=request.temperature,
65
- top_p=request.top_p,
66
- max_tokens=request.max_tokens,
67
- **(request.additional_params or {})
68
  )
69
  duration = time.time() - start_time
70
  content = response.choices[0].message.content if response.choices else ""
 
13
 
14
  def _setup(self) -> None:
15
  """Set up OpenRouter client models."""
16
+ # Set provider name
17
+ self.provider_name = "openrouter"
18
+
19
+ # Get API key and base URL from environment variables
20
  api_key = os.getenv("OPENROUTER_API_KEY")
 
 
21
  base_url = os.getenv("OPENROUTER_BASE_URL", "https://openrouter.ai/api/v1")
22
+ if not api_key or not base_url:
23
+ raise ValueError("OPENROUTER_API_KEY and OPENROUTER_BASE_URL environment variables are required")
24
+
25
+ # Create OpenAI client with OpenRouter endpoint
26
  self.client = OpenAI(api_key=api_key, base_url=base_url)
27
 
28
  @retry(wait=wait_exponential(multiplier=1, min=4, max=10), stop=stop_after_attempt(3))
 
50
  for image_path in valid_images:
51
  try:
52
  image_b64 = self._encode_image(image_path)
53
+ mime_type = self._get_image_mime_type(image_path)
54
  user_content.append({
55
  "type": "image_url",
56
  "image_url": {
57
+ "url": f"data:{mime_type};base64,{image_b64}",
58
  "detail": "high"
59
  }
60
  })
 
63
 
64
  messages.append({"role": "user", "content": user_content})
65
 
66
+ # Make API call
67
  try:
68
  response = self.client.chat.completions.create(
69
  model=self.model_name,
70
  messages=messages,
71
+ temperature=self.temperature,
72
+ max_tokens=self.max_tokens,
73
+ top_p=self.top_p
 
74
  )
75
  duration = time.time() - start_time
76
  content = response.choices[0].message.content if response.choices else ""
benchmarking/runner.py CHANGED
@@ -32,16 +32,17 @@ class BenchmarkResult:
32
  @dataclass
33
  class BenchmarkRunConfig:
34
  """Configuration for a benchmark run."""
 
35
  provider_name: str
36
  model_name: str
37
- benchmark_name: str
38
  output_dir: str
39
  max_questions: Optional[int] = None
40
  temperature: float = 0.7
41
  top_p: float = 0.95
42
  max_tokens: int = 5000
43
- additional_params: Optional[Dict[str, Any]] = None
44
  concurrency: int = 1
 
 
45
 
46
 
47
  class BenchmarkRunner:
@@ -59,11 +60,10 @@ class BenchmarkRunner:
59
  self.output_dir.mkdir(parents=True, exist_ok=True)
60
 
61
  # Generate unique run ID
62
- self.run_id = f"{config.benchmark_name}_{config.provider_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
63
 
64
  # Set up logging
65
  self._setup_logging()
66
-
67
  self.logger.info(f"Initialized benchmark runner with ID: {self.run_id}")
68
 
69
  def _setup_logging(self) -> None:
@@ -91,34 +91,28 @@ class BenchmarkRunner:
91
 
92
  def run_benchmark(
93
  self,
94
- llm_provider: LLMProvider,
95
  benchmark: Benchmark,
 
96
  ) -> Dict[str, Any]:
97
  """Run a benchmark against an LLM provider.
98
 
99
  Args:
100
- llm_provider (LLMProvider): The LLM provider to test
101
  benchmark (Benchmark): The benchmark to run
 
102
 
103
  Returns:
104
  Dict[str, Any]: Summary of benchmark results
105
  """
106
  self.logger.info(f"Starting benchmark run: {self.run_id}")
107
- self.logger.info(f"Model: {llm_provider.model_name}")
108
  self.logger.info(f"Benchmark: {benchmark}")
 
 
109
 
110
  # Test provider connection
111
  if not llm_provider.test_connection():
112
  self.logger.error("LLM provider connection test failed")
113
  return {"error": "LLM provider connection test failed"}
114
 
115
- # Get data points to process
116
- total_questions = len(benchmark)
117
- max_questions = self.config.max_questions or total_questions
118
- end_index = min(max_questions, total_questions)
119
-
120
- self.logger.info(f"Processing questions {0} to {end_index-1} of {total_questions}")
121
-
122
  # Initialize counters
123
  processed = 0
124
  correct = 0
@@ -127,29 +121,10 @@ class BenchmarkRunner:
127
  # Determine concurrency
128
  max_workers = max(1, int(getattr(self.config, "concurrency", 1) or 1))
129
 
130
- # Prefetch data points to avoid potential thread-safety issues inside benchmark access
131
- data_points = []
132
- for i in range(0, end_index):
133
- try:
134
- data_points.append(benchmark.get_data_point(i))
135
- except Exception as e:
136
- self.logger.error(f"Error fetching data point {i}: {e}")
137
- error_result = BenchmarkResult(
138
- data_point_id=f"error_{i}",
139
- question="",
140
- model_answer="",
141
- correct_answer="",
142
- is_correct=False,
143
- duration=0.0,
144
- error=str(e)
145
- )
146
- self.results.append(error_result)
147
- self._save_individual_result(error_result)
148
-
149
  # Process data points in parallel using a bounded thread pool
150
- with tqdm(total=end_index, desc="Processing questions") as pbar:
151
  with ThreadPoolExecutor(max_workers=max_workers) as executor:
152
- future_to_index = {executor.submit(self._process_data_point, llm_provider, dp): idx for idx, dp in enumerate(data_points)}
153
  for future in as_completed(future_to_index):
154
  idx = future_to_index[future]
155
  try:
@@ -184,30 +159,29 @@ class BenchmarkRunner:
184
  accuracy = (correct / processed) * 100
185
  avg_duration = total_duration / processed if processed > 0 else 0.0
186
  self.logger.info(
187
- f"Progress: {processed}/{end_index} | "
188
  f"Accuracy: {accuracy:.2f}% | "
189
  f"Avg Duration: {avg_duration:.2f}s"
190
  )
191
 
192
  # Save final results
193
  summary = self._save_final_results(benchmark)
194
-
195
  self.logger.info(f"Benchmark run completed: {self.run_id}")
196
- self.logger.info(f"Final accuracy: {summary['results']['accuracy']:.2f}%")
197
- self.logger.info(f"Total duration: {summary['results']['total_duration']:.2f}s")
198
-
199
  return summary
200
 
201
  def _process_data_point(
202
  self,
203
- llm_provider: LLMProvider,
204
  data_point: BenchmarkDataPoint,
 
205
  ) -> BenchmarkResult:
206
  """Process a single data point.
207
 
208
  Args:
209
- llm_provider (LLMProvider): The LLM provider to use
210
  data_point (BenchmarkDataPoint): The data point to process
 
211
 
212
  Returns:
213
  BenchmarkResult: Result of processing the data point
@@ -215,14 +189,10 @@ class BenchmarkRunner:
215
  start_time = time.time()
216
 
217
  try:
218
- # Create request
219
  request = LLMRequest(
220
  text=data_point.text,
221
- images=data_point.images,
222
- temperature=self.config.temperature,
223
- top_p=self.config.top_p,
224
- max_tokens=self.config.max_tokens,
225
- additional_params=self.config.additional_params
226
  )
227
 
228
  # Get response from LLM
@@ -232,10 +202,12 @@ class BenchmarkRunner:
232
  model_answer = self._extract_answer(response.content)
233
 
234
  # Check if correct
235
- is_correct = self._is_correct_answer(model_answer, data_point.correct_answer)
236
 
 
237
  duration = time.time() - start_time
238
 
 
239
  return BenchmarkResult(
240
  data_point_id=data_point.id,
241
  question=data_point.text,
@@ -247,8 +219,6 @@ class BenchmarkRunner:
247
  chunk_history=response.chunk_history,
248
  metadata={
249
  "data_point_metadata": data_point.metadata,
250
- "case_id": data_point.case_id,
251
- "category": data_point.category,
252
  "raw_response": response.content,
253
  }
254
  )
@@ -265,9 +235,7 @@ class BenchmarkRunner:
265
  error=str(e),
266
  chunk_history=None,
267
  metadata={
268
- "data_point_metadata": data_point.metadata,
269
- "case_id": data_point.case_id,
270
- "category": data_point.category,
271
  }
272
  )
273
 
@@ -289,29 +257,6 @@ class BenchmarkRunner:
289
  # If no pattern matches, return the full response
290
  return response_text.strip()
291
 
292
- def _is_correct_answer(self, model_answer: str, correct_answer: str) -> bool:
293
- """Check if the model answer is correct.
294
-
295
- Args:
296
- model_answer (str): The model's answer
297
- correct_answer (str): The correct answer
298
-
299
- Returns:
300
- bool: True if the answer is correct
301
- """
302
- if not model_answer or not correct_answer:
303
- return False
304
-
305
- # For multiple choice, compare just the letter
306
- model_clean = model_answer.strip().upper()
307
- correct_clean = correct_answer.strip().upper()
308
-
309
- # Extract just the first letter for comparison
310
- model_letter = model_clean[0] if model_clean else ""
311
- correct_letter = correct_clean[0] if correct_clean else ""
312
-
313
- return model_letter == correct_letter
314
-
315
  def _save_individual_result(self, result: BenchmarkResult) -> None:
316
  """Save a single result to its own JSON file.
317
 
@@ -321,12 +266,14 @@ class BenchmarkRunner:
321
  # Sanitize data_point_id for filename (remove invalid characters)
322
  safe_id = re.sub(r'[^\w\-_.]', '_', result.data_point_id)
323
 
324
  # Create filename with benchmark name and data point ID
325
  filename = f"{self.config.benchmark_name}_{safe_id}.json"
326
- result_file = self.output_dir / "individual_results" / filename
327
-
328
- # Create individual_results directory if it doesn't exist
329
- result_file.parent.mkdir(exist_ok=True)
330
 
331
  # Convert result to serializable format
332
  result_data = {
@@ -341,7 +288,7 @@ class BenchmarkRunner:
341
  "usage": result.usage,
342
  "error": result.error,
343
  "chunk_history": result.chunk_history,
344
- "metadata": result.metadata
345
  }
346
 
347
  # Save to file
@@ -357,8 +304,13 @@ class BenchmarkRunner:
357
  Returns:
358
  Dict[str, Any]: Summary of results
359
  """
360
  # Save detailed results
361
- results_file = self.output_dir / f"{self.run_id}_results.json"
362
 
363
  # Convert results to serializable format for final file
364
  results_data = []
@@ -385,29 +337,14 @@ class BenchmarkRunner:
385
 
386
  accuracy = (correct_answers / total_questions) * 100 if total_questions > 0 else 0
387
 
388
- # Calculate per-category accuracy
389
- category_stats = {}
390
- for result in self.results:
391
- if result.metadata and result.metadata.get("category"):
392
- category = result.metadata["category"]
393
- if category not in category_stats:
394
- category_stats[category] = {"correct": 0, "total": 0}
395
- category_stats[category]["total"] += 1
396
- if result.is_correct:
397
- category_stats[category]["correct"] += 1
398
-
399
- # Calculate accuracy for each category
400
- category_accuracies = {}
401
- for category, stats in category_stats.items():
402
- category_accuracies[category] = (stats["correct"] / stats["total"]) * 100
403
-
404
  # Create summary
405
  summary = {
406
  "run_id": self.run_id,
407
  "timestamp": datetime.now().isoformat(),
408
  "config": {
409
- "model_name": self.config.model_name,
410
  "benchmark_name": self.config.benchmark_name,
 
 
411
  "temperature": self.config.temperature,
412
  "top_p": self.config.top_p,
413
  "max_tokens": self.config.max_tokens,
@@ -422,13 +359,12 @@ class BenchmarkRunner:
422
  "total_questions": total_questions,
423
  "total_duration": total_duration,
424
  "avg_duration_per_question": total_duration / total_questions if total_questions > 0 else 0,
425
- "category_accuracies": category_accuracies,
426
  },
427
  "results_file": str(results_file),
428
  }
429
 
430
  # Save summary
431
- summary_file = self.output_dir / f"{self.run_id}_summary.json"
432
  with open(summary_file, 'w') as f:
433
  json.dump(summary, f, indent=2)
434
 
 
32
  @dataclass
33
  class BenchmarkRunConfig:
34
  """Configuration for a benchmark run."""
35
+ benchmark_name: str
36
  provider_name: str
37
  model_name: str
 
38
  output_dir: str
39
  max_questions: Optional[int] = None
40
  temperature: float = 0.7
41
  top_p: float = 0.95
42
  max_tokens: int = 5000
 
43
  concurrency: int = 1
44
+ random_seed: Optional[int] = None
45
+
46
 
47
 
48
  class BenchmarkRunner:
 
60
  self.output_dir.mkdir(parents=True, exist_ok=True)
61
 
62
  # Generate unique run ID
63
+ self.run_id = f"{config.benchmark_name}_{config.provider_name}_{config.model_name}_{config.max_questions}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
64
 
65
  # Set up logging
66
  self._setup_logging()
67
  self.logger.info(f"Initialized benchmark runner with ID: {self.run_id}")
68
 
69
  def _setup_logging(self) -> None:
 
91
 
92
  def run_benchmark(
93
  self,
94
  benchmark: Benchmark,
95
+ llm_provider: LLMProvider,
96
  ) -> Dict[str, Any]:
97
  """Run a benchmark against an LLM provider.
98
 
99
  Args:
100
  benchmark (Benchmark): The benchmark to run
101
+ llm_provider (LLMProvider): The LLM provider to test
102
 
103
  Returns:
104
  Dict[str, Any]: Summary of benchmark results
105
  """
106
  self.logger.info(f"Starting benchmark run: {self.run_id}")
107
  self.logger.info(f"Benchmark: {benchmark}")
108
+ self.logger.info(f"Provider: {llm_provider.provider_name}")
109
+ self.logger.info(f"Model: {llm_provider.model_name}")
110
 
111
  # Test provider connection
112
  if not llm_provider.test_connection():
113
  self.logger.error("LLM provider connection test failed")
114
  return {"error": "LLM provider connection test failed"}
115
 
116
  # Initialize counters
117
  processed = 0
118
  correct = 0
 
121
  # Determine concurrency
122
  max_workers = max(1, int(getattr(self.config, "concurrency", 1) or 1))
123
 
124
  # Process data points in parallel using a bounded thread pool
125
+ with tqdm(total=len(benchmark), desc="Processing questions") as pbar:
126
  with ThreadPoolExecutor(max_workers=max_workers) as executor:
127
+ future_to_index = {executor.submit(self._process_data_point, dp, llm_provider): idx for idx, dp in enumerate(benchmark)}
128
  for future in as_completed(future_to_index):
129
  idx = future_to_index[future]
130
  try:
 
159
  accuracy = (correct / processed) * 100
160
  avg_duration = total_duration / processed if processed > 0 else 0.0
161
  self.logger.info(
162
+ f"Progress: {processed}/{len(benchmark)} | "
163
  f"Accuracy: {accuracy:.2f}% | "
164
  f"Avg Duration: {avg_duration:.2f}s"
165
  )
166
 
167
  # Save final results
168
  summary = self._save_final_results(benchmark)
169
+
170
  self.logger.info(f"Benchmark run completed: {self.run_id}")
171
+ self.logger.info(f"Summary: {summary}")
172
+
173
  return summary
174
 
175
  def _process_data_point(
176
  self,
177
  data_point: BenchmarkDataPoint,
178
+ llm_provider: LLMProvider
179
  ) -> BenchmarkResult:
180
  """Process a single data point.
181
 
182
  Args:
183
  data_point (BenchmarkDataPoint): The data point to process
184
+ llm_provider (LLMProvider): The LLM provider to use
185
 
186
  Returns:
187
  BenchmarkResult: Result of processing the data point
 
189
  start_time = time.time()
190
 
191
  try:
192
+ # Create request for LLM
193
  request = LLMRequest(
194
  text=data_point.text,
195
+ images=data_point.images
196
  )
197
 
198
  # Get response from LLM
 
202
  model_answer = self._extract_answer(response.content)
203
 
204
  # Check if correct
205
+ is_correct = model_answer == data_point.correct_answer
206
 
207
+ # Calculate duration
208
  duration = time.time() - start_time
209
 
210
+ # Return result
211
  return BenchmarkResult(
212
  data_point_id=data_point.id,
213
  question=data_point.text,
 
219
  chunk_history=response.chunk_history,
220
  metadata={
221
  "data_point_metadata": data_point.metadata,
222
  "raw_response": response.content,
223
  }
224
  )
 
235
  error=str(e),
236
  chunk_history=None,
237
  metadata={
238
+ "data_point_metadata": data_point.metadata
239
  }
240
  )
241
 
 
257
  # If no pattern matches, return the full response
258
  return response_text.strip()
259
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
260
  def _save_individual_result(self, result: BenchmarkResult) -> None:
261
  """Save a single result to its own JSON file.
262
 
 
266
  # Sanitize data_point_id for filename (remove invalid characters)
267
  safe_id = re.sub(r'[^\w\-_.]', '_', result.data_point_id)
268
 
269
+ # Create run_id directory and individual_results subdirectory
270
+ run_dir = self.output_dir / self.run_id
271
+ individual_results_dir = run_dir / "individual_results"
272
+ individual_results_dir.mkdir(parents=True, exist_ok=True)
273
+
274
  # Create filename with benchmark name and data point ID
275
  filename = f"{self.config.benchmark_name}_{safe_id}.json"
276
+ result_file = individual_results_dir / filename
277
 
278
  # Convert result to serializable format
279
  result_data = {
 
288
  "usage": result.usage,
289
  "error": result.error,
290
  "chunk_history": result.chunk_history,
291
+ "metadata": result.metadata,
292
  }
293
 
294
  # Save to file
 
304
  Returns:
305
  Dict[str, Any]: Summary of results
306
  """
307
+ # Create run_id directory and final_results subdirectory
308
+ run_dir = self.output_dir / self.run_id
309
+ final_results_dir = run_dir / "final_results"
310
+ final_results_dir.mkdir(parents=True, exist_ok=True)
311
+
312
  # Save detailed results
313
+ results_file = final_results_dir / f"{self.run_id}_results.json"
314
 
315
  # Convert results to serializable format for final file
316
  results_data = []
 
337
 
338
  accuracy = (correct_answers / total_questions) * 100 if total_questions > 0 else 0
339
 
340
  # Create summary
341
  summary = {
342
  "run_id": self.run_id,
343
  "timestamp": datetime.now().isoformat(),
344
  "config": {
345
  "benchmark_name": self.config.benchmark_name,
346
+ "provider_name": self.config.provider_name,
347
+ "model_name": self.config.model_name,
348
  "temperature": self.config.temperature,
349
  "top_p": self.config.top_p,
350
  "max_tokens": self.config.max_tokens,
 
359
  "total_questions": total_questions,
360
  "total_duration": total_duration,
361
  "avg_duration_per_question": total_duration / total_questions if total_questions > 0 else 0,
362
  },
363
  "results_file": str(results_file),
364
  }
365
 
366
  # Save summary
367
+ summary_file = final_results_dir / f"{self.run_id}_summary.json"
368
  with open(summary_file, 'w') as f:
369
  json.dump(summary, f, indent=2)
370
 
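Note: after this change every run writes under its own directory: <output_dir>/<run_id>/individual_results/ holds one JSON file per question, and <output_dir>/<run_id>/final_results/ holds <run_id>_results.json plus <run_id>_summary.json. A minimal usage sketch of the revised API follows; the import path and the example argument values are assumptions (the diff does not show the module name), and benchmark / llm_provider stand for a Benchmark and LLMProvider constructed elsewhere.

    # Sketch only: the module path "benchmarking.runner" is assumed, not confirmed by this diff.
    from benchmarking.runner import BenchmarkRunConfig, BenchmarkRunner

    def run_example(benchmark, llm_provider):
        # Field names mirror the BenchmarkRunConfig dataclass shown above; values are illustrative.
        config = BenchmarkRunConfig(
            benchmark_name="rexvqa",     # hypothetical benchmark name
            provider_name="openai",      # hypothetical provider name
            model_name="gpt-4o",         # hypothetical model name
            output_dir="results",
            max_questions=100,
            temperature=0.7,
            top_p=0.95,
            max_tokens=5000,
            concurrency=4,
            random_seed=42,
        )
        runner = BenchmarkRunner(config)
        # run_benchmark returns the summary dict that is also written to
        # <output_dir>/<run_id>/final_results/<run_id>_summary.json
        return runner.run_benchmark(benchmark, llm_provider)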
benchmarking/system_prompts.txt ADDED
@@ -0,0 +1,36 @@
1
+ [MEDICAL_ASSISTANT]
2
+ You are an expert medical AI assistant who can answer any medical question and analyze medical images as a doctor would.
3
+ Solve problems using your own vision and reasoning, and use tools to complement that reasoning.
4
+ You can make multiple tool calls in parallel or in sequence as needed for comprehensive answers.
5
+ Think critically about the tool outputs and critique them where appropriate.
6
+ If you need to look up information before asking a follow-up question, you may do so.
7
+
8
+ CITATION REQUIREMENTS:
9
+ - When referencing information from RAG and/or web search tools, ALWAYS include numbered citations [1], [2], [3], etc.
10
+ - Use citations immediately after making claims or statements based on the above tool results.
11
+ - Be consistent with citation numbering throughout your response.
12
+ - Only cite sources that actually contain the information you're referencing.
13
+
14
+ Examples:
15
+ - "According to recent research [1], chest X-rays can show signs of pneumonia..."
16
+ - "The medical literature indicates [2] that this condition typically presents with..."
17
+ - "Based on clinical guidelines [3], the recommended treatment approach is..."
18
+
19
+ [CHESTAGENTBENCH_PROMPT]
20
+ You are a highly skilled radiology AI agent, an expert in interpreting medical images, specifically chest X-rays, CT scans, and MRIs, with world-class accuracy and precision.
21
+ Your primary function is to assist in the analysis of these images and answer diagnostic questions.
22
+
23
+ Your task is to provide a step-by-step, structured analysis. First, carefully examine the provided image and describe all relevant findings in a clear, concise manner.
24
+ Next, use your expert medical knowledge to form a differential diagnosis based on these findings. Finally, critically evaluate the provided question and all possible choices.
25
+
26
+ You have access to a suite of powerful tools to aid in your analysis. Use these tools as needed to retrieve external medical knowledge, access patient history, or perform specific image processing tasks.
27
+ You should always scrutinize the output from your tools and integrate it into your reasoning. If tool outputs conflict with your initial assessment, explain the discrepancy and justify your final conclusion.
28
+ You must pass the image paths exactly as provided or the tools will not work. Do not mangle the image paths.
29
+
30
+ Your final response for a multiple-choice question must strictly follow this format, including your step-by-step reasoning:
31
+ 1. **Image Analysis:** [Describe image findings here]
32
+ 2. **Differential Diagnosis:** [List possible diagnoses and their justifications]
33
+ 3. **Critical Thinking & Tool Use:** [Show your reasoning, including how you used tools and evaluated their output]
34
+ 4. **Final Answer:** \boxed{A}
35
+
36
+ Do not provide a definitive diagnosis or treatment plan for a patient. Your purpose is to assist medical professionals with your analysis, not to replace them. You must maintain this persona and adhere to all instructions.
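The [CHESTAGENTBENCH_PROMPT] requires the final answer inside a \boxed{A} marker. The runner's _extract_answer implementation is not part of this diff, but a parser for that convention could look roughly like the sketch below; this is illustrative only, and extract_boxed_answer is a hypothetical helper, not the repository's actual code.

    import re

    def extract_boxed_answer(response_text: str) -> str:
        """Return the content of the last \\boxed{...}; fall back to the stripped response."""
        matches = re.findall(r"\\boxed\{([^}]*)\}", response_text)
        if matches:
            return matches[-1].strip()
        # Same fallback the runner uses when no pattern matches: return the full response
        return response_text.strip()

    # extract_boxed_answer("4. **Final Answer:** \\boxed{A}") -> "A"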