Spaces:
Paused
Paused
added chestagentbench
Browse files- benchmarking/benchmarks/__init__.py +2 -0
- benchmarking/benchmarks/base.py +2 -12
- benchmarking/benchmarks/chestagentbench_benchmark.py +73 -0
- benchmarking/cli.py +3 -1
- benchmarking/llm_providers/base.py +12 -1
- benchmarking/llm_providers/google_provider.py +2 -2
- benchmarking/llm_providers/medrax_provider.py +17 -7
- benchmarking/llm_providers/openai_provider.py +2 -2
- benchmarking/runner.py +16 -14
- medrax/docs/system_prompts.txt +5 -4
benchmarking/benchmarks/__init__.py
CHANGED
|
@@ -2,9 +2,11 @@
|
|
| 2 |
|
| 3 |
from .base import Benchmark, BenchmarkDataPoint
|
| 4 |
from .rexvqa_benchmark import ReXVQABenchmark
|
|
|
|
| 5 |
|
| 6 |
__all__ = [
|
| 7 |
"Benchmark",
|
| 8 |
"BenchmarkDataPoint",
|
| 9 |
"ReXVQABenchmark",
|
|
|
|
| 10 |
]
|
|
|
|
| 2 |
|
| 3 |
from .base import Benchmark, BenchmarkDataPoint
|
| 4 |
from .rexvqa_benchmark import ReXVQABenchmark
|
| 5 |
+
from .chestagentbench_benchmark import ChestAgentBenchBenchmark
|
| 6 |
|
| 7 |
__all__ = [
|
| 8 |
"Benchmark",
|
| 9 |
"BenchmarkDataPoint",
|
| 10 |
"ReXVQABenchmark",
|
| 11 |
+
"ChestAgentBenchBenchmark",
|
| 12 |
]
|
benchmarking/benchmarks/base.py
CHANGED
|
@@ -138,7 +138,7 @@ class Benchmark(ABC):
|
|
| 138 |
"categories": self.get_categories(),
|
| 139 |
"category_counts": {},
|
| 140 |
"has_images": False,
|
| 141 |
-
"
|
| 142 |
}
|
| 143 |
|
| 144 |
for dp in self:
|
|
@@ -149,17 +149,7 @@ class Benchmark(ABC):
|
|
| 149 |
# Image statistics
|
| 150 |
if dp.images:
|
| 151 |
stats["has_images"] = True
|
| 152 |
-
stats["
|
| 153 |
-
else:
|
| 154 |
-
stats["images_per_question"].append(0)
|
| 155 |
-
|
| 156 |
-
if stats["images_per_question"]:
|
| 157 |
-
stats["avg_images_per_question"] = sum(stats["images_per_question"]) / len(stats["images_per_question"])
|
| 158 |
-
stats["max_images_per_question"] = max(stats["images_per_question"])
|
| 159 |
-
else:
|
| 160 |
-
stats["avg_images_per_question"] = 0
|
| 161 |
-
stats["max_images_per_question"] = 0
|
| 162 |
-
|
| 163 |
return stats
|
| 164 |
|
| 165 |
def validate_images(self) -> Tuple[List[str], List[str]]:
|
|
|
|
| 138 |
"categories": self.get_categories(),
|
| 139 |
"category_counts": {},
|
| 140 |
"has_images": False,
|
| 141 |
+
"num_images": 0,
|
| 142 |
}
|
| 143 |
|
| 144 |
for dp in self:
|
|
|
|
| 149 |
# Image statistics
|
| 150 |
if dp.images:
|
| 151 |
stats["has_images"] = True
|
| 152 |
+
stats["num_images"] += len(dp.images)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
return stats
|
| 154 |
|
| 155 |
def validate_images(self) -> Tuple[List[str], List[str]]:
|
benchmarking/benchmarks/chestagentbench_benchmark.py
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import json
from pathlib import Path
from typing import Any, Dict, Optional

from .base import Benchmark, BenchmarkDataPoint


class ChestAgentBenchBenchmark(Benchmark):
    """ChestAgentBench benchmark for complex CXR interpretation and reasoning.

    Loads the dataset from a local ``metadata.jsonl`` file in ``data_dir`` and
    parses each JSONL record into a ``BenchmarkDataPoint``. Image paths in the
    records are resolved against the ``figures/`` subdirectory of ``data_dir``.
    """

    # Prefix used by image paths in metadata.jsonl that are already expressed
    # relative to the dataset's figures directory (e.g. "figures/11583/figure_1.jpg").
    _FIGURES_PREFIX = "figures/"

    def __init__(self, data_dir: str, **kwargs):
        """Initialize the benchmark.

        Args:
            data_dir: Directory containing ``metadata.jsonl`` and a ``figures/``
                subdirectory with the referenced images.
            **kwargs: Forwarded to the base class. Recognizes ``max_questions``
                (int, optional): cap on how many questions are loaded.
        """
        # NOTE: a max_questions of 0 behaves like None (no cap) because of the
        # truthiness check in _load_data; this mirrors the original behavior.
        self.max_questions = kwargs.get("max_questions", None)
        super().__init__(data_dir, **kwargs)

    def _load_data(self) -> None:
        """Read ``metadata.jsonl`` line by line and populate ``self.data_points``.

        Malformed lines are skipped with a printed warning (best-effort loading)
        rather than aborting the whole load.

        Raises:
            FileNotFoundError: If ``metadata.jsonl`` is missing from ``data_dir``.
        """
        metadata_path = Path(self.data_dir) / "metadata.jsonl"
        if not metadata_path.exists():
            raise FileNotFoundError(f"Could not find metadata.jsonl in {self.data_dir}")
        print(f"Loading ChestAgentBench from local file: {metadata_path}")
        self.data_points = []
        with open(metadata_path, "r", encoding="utf-8") as f:
            for i, line in enumerate(f):
                if self.max_questions and i >= self.max_questions:
                    break
                try:
                    item = json.loads(line)
                    data_point = self._parse_item(item, i)
                    if data_point:
                        self.data_points.append(data_point)
                except Exception as e:
                    # Best-effort: log and continue past a bad record.
                    print(f"Error loading item {i}: {e}")
                    continue

    def _parse_item(self, item: Dict[str, Any], index: int) -> Optional[BenchmarkDataPoint]:
        """Convert one raw JSONL record into a ``BenchmarkDataPoint``.

        Args:
            item: Decoded JSON object for a single question.
            index: Zero-based position of the record in the file; used only as
                a fallback component of the question ID.

        Returns:
            The parsed data point. (Currently never None; the Optional return
            type leaves room for future filtering.)
        """
        # Prefer explicit IDs from the record; fall back to a positional ID.
        question_id = (
            item.get("full_question_id")
            or item.get("question_id")
            or f"chestagentbench_{index}"
        )
        question = item.get("question", "")
        correct_answer = item.get("answer", "")
        explanation = item.get("explanation", "")
        images = item.get("images", [])
        case_id = item.get("case_id", "")
        category = item.get("categories", "")

        # Answer options are already embedded in the question string, so the
        # prompt text needs no further composition.
        question_with_options = question

        # Resolve record image paths to files under <data_dir>/figures.
        local_images = None
        if images:
            figures_dir = Path(self.data_dir) / "figures"
            local_images = []
            for img in images:
                if img.startswith(self._FIGURES_PREFIX):
                    # Strip the "figures/" prefix and resolve the remainder
                    # against the local figures directory.
                    relative_path = img[len(self._FIGURES_PREFIX):]
                    local_images.append(str(figures_dir / relative_path))
                else:
                    # Fallback: keep only the file name under figures/.
                    local_images.append(str(figures_dir / Path(img).name))

        # Keep the full raw record in metadata, plus normalized fields.
        metadata = dict(item)
        metadata["explanation"] = explanation
        metadata["dataset"] = "chestagentbench"

        return BenchmarkDataPoint(
            id=question_id,
            text=question_with_options,
            images=local_images,
            correct_answer=correct_answer,
            metadata=metadata,
            case_id=case_id,
            category=category,
        )
|
benchmarking/cli.py
CHANGED
|
@@ -45,6 +45,7 @@ def create_benchmark(benchmark_name: str, data_dir: str, **kwargs) -> Benchmark:
|
|
| 45 |
"""
|
| 46 |
benchmark_map = {
|
| 47 |
"rexvqa": ReXVQABenchmark,
|
|
|
|
| 48 |
}
|
| 49 |
|
| 50 |
if benchmark_name not in benchmark_map:
|
|
@@ -70,6 +71,7 @@ def run_benchmark_command(args) -> None:
|
|
| 70 |
|
| 71 |
# Create runner config
|
| 72 |
config = BenchmarkRunConfig(
|
|
|
|
| 73 |
model_name=args.model,
|
| 74 |
benchmark_name=args.benchmark,
|
| 75 |
output_dir=args.output_dir,
|
|
@@ -110,7 +112,7 @@ def main():
|
|
| 110 |
run_parser = subparsers.add_parser("run", help="Run a benchmark")
|
| 111 |
run_parser.add_argument("--model", required=True, help="Model name (e.g., gpt-4o, gemini-2.5-pro)")
|
| 112 |
run_parser.add_argument("--provider", required=True, choices=["openai", "google", "medrax"], help="LLM provider")
|
| 113 |
-
run_parser.add_argument("--benchmark", required=True, choices=["rexvqa"], help="Benchmark to run")
|
| 114 |
run_parser.add_argument("--data-dir", required=True, help="Directory containing benchmark data")
|
| 115 |
run_parser.add_argument("--output-dir", default="benchmark_results", help="Output directory for results")
|
| 116 |
run_parser.add_argument("--max-questions", type=int, help="Maximum number of questions to process")
|
|
|
|
| 45 |
"""
|
| 46 |
benchmark_map = {
|
| 47 |
"rexvqa": ReXVQABenchmark,
|
| 48 |
+
"chestagentbench": ChestAgentBenchBenchmark,
|
| 49 |
}
|
| 50 |
|
| 51 |
if benchmark_name not in benchmark_map:
|
|
|
|
| 71 |
|
| 72 |
# Create runner config
|
| 73 |
config = BenchmarkRunConfig(
|
| 74 |
+
provider_name=args.provider,
|
| 75 |
model_name=args.model,
|
| 76 |
benchmark_name=args.benchmark,
|
| 77 |
output_dir=args.output_dir,
|
|
|
|
| 112 |
run_parser = subparsers.add_parser("run", help="Run a benchmark")
|
| 113 |
run_parser.add_argument("--model", required=True, help="Model name (e.g., gpt-4o, gemini-2.5-pro)")
|
| 114 |
run_parser.add_argument("--provider", required=True, choices=["openai", "google", "medrax"], help="LLM provider")
|
| 115 |
+
run_parser.add_argument("--benchmark", required=True, choices=["rexvqa", "chestagentbench"], help="Benchmark to run")
|
| 116 |
run_parser.add_argument("--data-dir", required=True, help="Directory containing benchmark data")
|
| 117 |
run_parser.add_argument("--output-dir", default="benchmark_results", help="Output directory for results")
|
| 118 |
run_parser.add_argument("--max-questions", type=int, help="Maximum number of questions to process")
|
benchmarking/llm_providers/base.py
CHANGED
|
@@ -5,6 +5,7 @@ from typing import Dict, List, Optional, Any
|
|
| 5 |
from dataclasses import dataclass
|
| 6 |
import base64
|
| 7 |
from pathlib import Path
|
|
|
|
| 8 |
|
| 9 |
|
| 10 |
@dataclass
|
|
@@ -12,7 +13,6 @@ class LLMRequest:
|
|
| 12 |
"""Request to an LLM provider."""
|
| 13 |
text: str
|
| 14 |
images: Optional[List[str]] = None # List of image paths
|
| 15 |
-
system_prompt: Optional[str] = None
|
| 16 |
temperature: float = 0.7
|
| 17 |
max_tokens: int = 1500
|
| 18 |
additional_params: Optional[Dict[str, Any]] = None
|
|
@@ -43,6 +43,17 @@ class LLMProvider(ABC):
|
|
| 43 |
"""
|
| 44 |
self.model_name = model_name
|
| 45 |
self.config = kwargs
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
self._setup()
|
| 47 |
|
| 48 |
@abstractmethod
|
|
|
|
| 5 |
from dataclasses import dataclass
|
| 6 |
import base64
|
| 7 |
from pathlib import Path
|
| 8 |
+
from medrax.utils.utils import load_prompts_from_file
|
| 9 |
|
| 10 |
|
| 11 |
@dataclass
|
|
|
|
| 13 |
"""Request to an LLM provider."""
|
| 14 |
text: str
|
| 15 |
images: Optional[List[str]] = None # List of image paths
|
|
|
|
| 16 |
temperature: float = 0.7
|
| 17 |
max_tokens: int = 1500
|
| 18 |
additional_params: Optional[Dict[str, Any]] = None
|
|
|
|
| 43 |
"""
|
| 44 |
self.model_name = model_name
|
| 45 |
self.config = kwargs
|
| 46 |
+
|
| 47 |
+
# Always load system prompt from file
|
| 48 |
+
try:
|
| 49 |
+
prompts = load_prompts_from_file("medrax/docs/system_prompts.txt")
|
| 50 |
+
self.system_prompt = prompts.get("MEDICAL_ASSISTANT", None)
|
| 51 |
+
if self.system_prompt is None:
|
| 52 |
+
print(f"Warning: System prompt type 'MEDICAL_ASSISTANT' not found in medrax/docs/system_prompts.txt.")
|
| 53 |
+
except Exception as e:
|
| 54 |
+
print(f"Error loading system prompt: {e}")
|
| 55 |
+
self.system_prompt = None
|
| 56 |
+
|
| 57 |
self._setup()
|
| 58 |
|
| 59 |
@abstractmethod
|
benchmarking/llm_providers/google_provider.py
CHANGED
|
@@ -40,8 +40,8 @@ class GoogleProvider(LLMProvider):
|
|
| 40 |
messages = []
|
| 41 |
|
| 42 |
# Add system prompt if provided
|
| 43 |
-
if
|
| 44 |
-
messages.append(SystemMessage(content=
|
| 45 |
|
| 46 |
# Construct content for multimodal content
|
| 47 |
if request.images:
|
|
|
|
| 40 |
messages = []
|
| 41 |
|
| 42 |
# Add system prompt if provided
|
| 43 |
+
if self.system_prompt:
|
| 44 |
+
messages.append(SystemMessage(content=self.system_prompt))
|
| 45 |
|
| 46 |
# Construct content for multimodal content
|
| 47 |
if request.images:
|
benchmarking/llm_providers/medrax_provider.py
CHANGED
|
@@ -33,7 +33,7 @@ class MedRAXProvider(LLMProvider):
|
|
| 33 |
print("Starting server...")
|
| 34 |
|
| 35 |
selected_tools = [
|
| 36 |
-
"ImageVisualizerTool", # For displaying images in the UI
|
| 37 |
# "DicomProcessorTool", # For processing DICOM medical image files
|
| 38 |
# "TorchXRayVisionClassifierTool", # For classifying chest X-ray images using TorchXRayVision
|
| 39 |
# "ArcPlusClassifierTool", # For advanced chest X-ray classification using ArcPlus
|
|
@@ -45,7 +45,7 @@ class MedRAXProvider(LLMProvider):
|
|
| 45 |
# "ChestXRayGeneratorTool", # For generating synthetic chest X-rays
|
| 46 |
"WebBrowserTool", # For web browsing and search capabilities
|
| 47 |
"MedicalRAGTool", # For retrieval-augmented generation with medical knowledge
|
| 48 |
-
"PythonSandboxTool", # Add the Python sandbox tool
|
| 49 |
]
|
| 50 |
|
| 51 |
rag_config = RAGConfig(
|
|
@@ -73,7 +73,7 @@ class MedRAXProvider(LLMProvider):
|
|
| 73 |
tools_to_use=selected_tools,
|
| 74 |
model_dir="/model-weights",
|
| 75 |
temp_dir=self.session_temp_dir, # Change this to the path of the temporary directory
|
| 76 |
-
device="
|
| 77 |
model=self.model_name, # Change this to the model you want to use, e.g. gpt-4.1-2025-04-14, gemini-2.5-pro
|
| 78 |
temperature=0.7,
|
| 79 |
top_p=0.95,
|
|
@@ -118,12 +118,21 @@ class MedRAXProvider(LLMProvider):
|
|
| 118 |
image_paths = []
|
| 119 |
if request.images:
|
| 120 |
valid_images = self._validate_image_paths(request.images)
|
|
|
|
| 121 |
for i, image_path in enumerate(valid_images):
|
|
|
|
| 122 |
# Copy image to session temp directory
|
| 123 |
dest_path = self.session_temp_dir / f"image_{i}_{Path(image_path).name}"
|
|
|
|
| 124 |
shutil.copy2(image_path, dest_path)
|
| 125 |
image_paths.append(str(dest_path))
|
| 126 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
# Add image path message for tools
|
| 128 |
messages.append({
|
| 129 |
"role": "user",
|
|
@@ -167,9 +176,6 @@ class MedRAXProvider(LLMProvider):
|
|
| 167 |
|
| 168 |
duration = time.time() - start_time
|
| 169 |
|
| 170 |
-
# Clean up temporary files
|
| 171 |
-
self._cleanup_temp_files()
|
| 172 |
-
|
| 173 |
return LLMResponse(
|
| 174 |
content=response_content.strip(),
|
| 175 |
usage={"agent_tools": list(self.tools_dict.keys())},
|
|
@@ -178,7 +184,6 @@ class MedRAXProvider(LLMProvider):
|
|
| 178 |
)
|
| 179 |
|
| 180 |
except Exception as e:
|
| 181 |
-
self._cleanup_temp_files()
|
| 182 |
return LLMResponse(
|
| 183 |
content=f"Error: {str(e)}",
|
| 184 |
duration=time.time() - start_time,
|
|
@@ -190,5 +195,10 @@ class MedRAXProvider(LLMProvider):
|
|
| 190 |
try:
|
| 191 |
if hasattr(self, 'session_temp_dir') and self.session_temp_dir.exists():
|
| 192 |
shutil.rmtree(self.session_temp_dir)
|
|
|
|
| 193 |
except Exception as e:
|
| 194 |
print(f"Warning: Failed to cleanup temp files: {e}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
print("Starting server...")
|
| 34 |
|
| 35 |
selected_tools = [
|
| 36 |
+
# "ImageVisualizerTool", # For displaying images in the UI
|
| 37 |
# "DicomProcessorTool", # For processing DICOM medical image files
|
| 38 |
# "TorchXRayVisionClassifierTool", # For classifying chest X-ray images using TorchXRayVision
|
| 39 |
# "ArcPlusClassifierTool", # For advanced chest X-ray classification using ArcPlus
|
|
|
|
| 45 |
# "ChestXRayGeneratorTool", # For generating synthetic chest X-rays
|
| 46 |
"WebBrowserTool", # For web browsing and search capabilities
|
| 47 |
"MedicalRAGTool", # For retrieval-augmented generation with medical knowledge
|
| 48 |
+
# "PythonSandboxTool", # Add the Python sandbox tool
|
| 49 |
]
|
| 50 |
|
| 51 |
rag_config = RAGConfig(
|
|
|
|
| 73 |
tools_to_use=selected_tools,
|
| 74 |
model_dir="/model-weights",
|
| 75 |
temp_dir=self.session_temp_dir, # Change this to the path of the temporary directory
|
| 76 |
+
device="cpu",
|
| 77 |
model=self.model_name, # Change this to the model you want to use, e.g. gpt-4.1-2025-04-14, gemini-2.5-pro
|
| 78 |
temperature=0.7,
|
| 79 |
top_p=0.95,
|
|
|
|
| 118 |
image_paths = []
|
| 119 |
if request.images:
|
| 120 |
valid_images = self._validate_image_paths(request.images)
|
| 121 |
+
print(f"Processing {len(valid_images)} images")
|
| 122 |
for i, image_path in enumerate(valid_images):
|
| 123 |
+
print(f"Original image path: {image_path}")
|
| 124 |
# Copy image to session temp directory
|
| 125 |
dest_path = self.session_temp_dir / f"image_{i}_{Path(image_path).name}"
|
| 126 |
+
print(f"Destination path: {dest_path}")
|
| 127 |
shutil.copy2(image_path, dest_path)
|
| 128 |
image_paths.append(str(dest_path))
|
| 129 |
|
| 130 |
+
# Verify file exists after copy
|
| 131 |
+
if not dest_path.exists():
|
| 132 |
+
print(f"ERROR: File not found after copy: {dest_path}")
|
| 133 |
+
else:
|
| 134 |
+
print(f"File successfully copied: {dest_path}")
|
| 135 |
+
|
| 136 |
# Add image path message for tools
|
| 137 |
messages.append({
|
| 138 |
"role": "user",
|
|
|
|
| 176 |
|
| 177 |
duration = time.time() - start_time
|
| 178 |
|
|
|
|
|
|
|
|
|
|
| 179 |
return LLMResponse(
|
| 180 |
content=response_content.strip(),
|
| 181 |
usage={"agent_tools": list(self.tools_dict.keys())},
|
|
|
|
| 184 |
)
|
| 185 |
|
| 186 |
except Exception as e:
|
|
|
|
| 187 |
return LLMResponse(
|
| 188 |
content=f"Error: {str(e)}",
|
| 189 |
duration=time.time() - start_time,
|
|
|
|
| 195 |
try:
|
| 196 |
if hasattr(self, 'session_temp_dir') and self.session_temp_dir.exists():
|
| 197 |
shutil.rmtree(self.session_temp_dir)
|
| 198 |
+
print(f"Cleaned up temporary directory: {self.session_temp_dir}")
|
| 199 |
except Exception as e:
|
| 200 |
print(f"Warning: Failed to cleanup temp files: {e}")
|
| 201 |
+
|
| 202 |
+
def cleanup(self) -> None:
|
| 203 |
+
"""Clean up resources when done with the provider."""
|
| 204 |
+
self._cleanup_temp_files()
|
benchmarking/llm_providers/openai_provider.py
CHANGED
|
@@ -48,8 +48,8 @@ class OpenAIProvider(LLMProvider):
|
|
| 48 |
messages = []
|
| 49 |
|
| 50 |
# Add system prompt if provided
|
| 51 |
-
if
|
| 52 |
-
messages.append(SystemMessage(content=
|
| 53 |
|
| 54 |
# Build user message content
|
| 55 |
user_content = []
|
|
|
|
| 48 |
messages = []
|
| 49 |
|
| 50 |
# Add system prompt if provided
|
| 51 |
+
if self.system_prompt:
|
| 52 |
+
messages.append(SystemMessage(content=self.system_prompt))
|
| 53 |
|
| 54 |
# Build user message content
|
| 55 |
user_content = []
|
benchmarking/runner.py
CHANGED
|
@@ -30,16 +30,13 @@ class BenchmarkResult:
|
|
| 30 |
@dataclass
|
| 31 |
class BenchmarkRunConfig:
|
| 32 |
"""Configuration for a benchmark run."""
|
|
|
|
| 33 |
model_name: str
|
| 34 |
benchmark_name: str
|
| 35 |
output_dir: str
|
| 36 |
max_questions: Optional[int] = None
|
| 37 |
-
start_index: int = 0
|
| 38 |
temperature: float = 0.7
|
| 39 |
max_tokens: int = 1500
|
| 40 |
-
system_prompt: Optional[str] = None
|
| 41 |
-
save_frequency: int = 10 # Save results every N questions
|
| 42 |
-
log_level: str = "INFO"
|
| 43 |
additional_params: Optional[Dict[str, Any]] = None
|
| 44 |
|
| 45 |
|
|
@@ -58,7 +55,7 @@ class BenchmarkRunner:
|
|
| 58 |
self.output_dir.mkdir(parents=True, exist_ok=True)
|
| 59 |
|
| 60 |
# Generate unique run ID
|
| 61 |
-
self.run_id = f"{config.benchmark_name}_{config.
|
| 62 |
|
| 63 |
# Set up logging
|
| 64 |
self._setup_logging()
|
|
@@ -71,7 +68,7 @@ class BenchmarkRunner:
|
|
| 71 |
|
| 72 |
# Create logger
|
| 73 |
self.logger = logging.getLogger(f"benchmark_runner_{self.run_id}")
|
| 74 |
-
self.logger.setLevel(
|
| 75 |
|
| 76 |
# Create handlers
|
| 77 |
file_handler = logging.FileHandler(log_file)
|
|
@@ -114,9 +111,9 @@ class BenchmarkRunner:
|
|
| 114 |
# Get data points to process
|
| 115 |
total_questions = len(benchmark)
|
| 116 |
max_questions = self.config.max_questions or total_questions
|
| 117 |
-
end_index = min(
|
| 118 |
|
| 119 |
-
self.logger.info(f"Processing questions {
|
| 120 |
|
| 121 |
# Initialize counters
|
| 122 |
processed = 0
|
|
@@ -124,7 +121,7 @@ class BenchmarkRunner:
|
|
| 124 |
total_duration = 0.0
|
| 125 |
|
| 126 |
# Process each data point
|
| 127 |
-
for i in tqdm(range(
|
| 128 |
try:
|
| 129 |
data_point = benchmark.get_data_point(i)
|
| 130 |
|
|
@@ -141,13 +138,13 @@ class BenchmarkRunner:
|
|
| 141 |
self.results.append(result)
|
| 142 |
|
| 143 |
# Log progress
|
| 144 |
-
if processed %
|
| 145 |
self._save_intermediate_results()
|
| 146 |
accuracy = (correct / processed) * 100
|
| 147 |
avg_duration = total_duration / processed
|
| 148 |
|
| 149 |
self.logger.info(
|
| 150 |
-
f"Progress: {processed}/{end_index
|
| 151 |
f"Accuracy: {accuracy:.2f}% | "
|
| 152 |
f"Avg Duration: {avg_duration:.2f}s"
|
| 153 |
)
|
|
@@ -170,6 +167,14 @@ class BenchmarkRunner:
|
|
| 170 |
# Save final results
|
| 171 |
summary = self._save_final_results(benchmark)
|
| 172 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 173 |
self.logger.info(f"Benchmark run completed: {self.run_id}")
|
| 174 |
self.logger.info(f"Final accuracy: {summary['results']['accuracy']:.2f}%")
|
| 175 |
self.logger.info(f"Total duration: {summary['results']['total_duration']:.2f}s")
|
|
@@ -197,7 +202,6 @@ class BenchmarkRunner:
|
|
| 197 |
request = LLMRequest(
|
| 198 |
text=data_point.text,
|
| 199 |
images=data_point.images,
|
| 200 |
-
system_prompt=self.config.system_prompt,
|
| 201 |
temperature=self.config.temperature,
|
| 202 |
max_tokens=self.config.max_tokens,
|
| 203 |
additional_params=self.config.additional_params
|
|
@@ -371,12 +375,10 @@ class BenchmarkRunner:
|
|
| 371 |
"benchmark_name": self.config.benchmark_name,
|
| 372 |
"temperature": self.config.temperature,
|
| 373 |
"max_tokens": self.config.max_tokens,
|
| 374 |
-
"system_prompt": self.config.system_prompt,
|
| 375 |
},
|
| 376 |
"benchmark_info": {
|
| 377 |
"total_size": len(benchmark),
|
| 378 |
"processed_questions": total_questions,
|
| 379 |
-
"start_index": self.config.start_index,
|
| 380 |
},
|
| 381 |
"results": {
|
| 382 |
"accuracy": accuracy,
|
|
|
|
| 30 |
@dataclass
|
| 31 |
class BenchmarkRunConfig:
|
| 32 |
"""Configuration for a benchmark run."""
|
| 33 |
+
provider_name: str
|
| 34 |
model_name: str
|
| 35 |
benchmark_name: str
|
| 36 |
output_dir: str
|
| 37 |
max_questions: Optional[int] = None
|
|
|
|
| 38 |
temperature: float = 0.7
|
| 39 |
max_tokens: int = 1500
|
|
|
|
|
|
|
|
|
|
| 40 |
additional_params: Optional[Dict[str, Any]] = None
|
| 41 |
|
| 42 |
|
|
|
|
| 55 |
self.output_dir.mkdir(parents=True, exist_ok=True)
|
| 56 |
|
| 57 |
# Generate unique run ID
|
| 58 |
+
self.run_id = f"{config.benchmark_name}_{config.provider_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
|
| 59 |
|
| 60 |
# Set up logging
|
| 61 |
self._setup_logging()
|
|
|
|
| 68 |
|
| 69 |
# Create logger
|
| 70 |
self.logger = logging.getLogger(f"benchmark_runner_{self.run_id}")
|
| 71 |
+
self.logger.setLevel(logging.INFO)
|
| 72 |
|
| 73 |
# Create handlers
|
| 74 |
file_handler = logging.FileHandler(log_file)
|
|
|
|
| 111 |
# Get data points to process
|
| 112 |
total_questions = len(benchmark)
|
| 113 |
max_questions = self.config.max_questions or total_questions
|
| 114 |
+
end_index = min(max_questions, total_questions)
|
| 115 |
|
| 116 |
+
self.logger.info(f"Processing questions {0} to {end_index-1} of {total_questions}")
|
| 117 |
|
| 118 |
# Initialize counters
|
| 119 |
processed = 0
|
|
|
|
| 121 |
total_duration = 0.0
|
| 122 |
|
| 123 |
# Process each data point
|
| 124 |
+
for i in tqdm(range(0, end_index), desc="Processing questions"):
|
| 125 |
try:
|
| 126 |
data_point = benchmark.get_data_point(i)
|
| 127 |
|
|
|
|
| 138 |
self.results.append(result)
|
| 139 |
|
| 140 |
# Log progress
|
| 141 |
+
if processed % 10 == 0:
|
| 142 |
self._save_intermediate_results()
|
| 143 |
accuracy = (correct / processed) * 100
|
| 144 |
avg_duration = total_duration / processed
|
| 145 |
|
| 146 |
self.logger.info(
|
| 147 |
+
f"Progress: {processed}/{end_index} | "
|
| 148 |
f"Accuracy: {accuracy:.2f}% | "
|
| 149 |
f"Avg Duration: {avg_duration:.2f}s"
|
| 150 |
)
|
|
|
|
| 167 |
# Save final results
|
| 168 |
summary = self._save_final_results(benchmark)
|
| 169 |
|
| 170 |
+
# Clean up provider resources
|
| 171 |
+
if hasattr(llm_provider, 'cleanup'):
|
| 172 |
+
try:
|
| 173 |
+
llm_provider.cleanup()
|
| 174 |
+
self.logger.info("Provider cleanup completed")
|
| 175 |
+
except Exception as e:
|
| 176 |
+
self.logger.warning(f"Provider cleanup failed: {e}")
|
| 177 |
+
|
| 178 |
self.logger.info(f"Benchmark run completed: {self.run_id}")
|
| 179 |
self.logger.info(f"Final accuracy: {summary['results']['accuracy']:.2f}%")
|
| 180 |
self.logger.info(f"Total duration: {summary['results']['total_duration']:.2f}s")
|
|
|
|
| 202 |
request = LLMRequest(
|
| 203 |
text=data_point.text,
|
| 204 |
images=data_point.images,
|
|
|
|
| 205 |
temperature=self.config.temperature,
|
| 206 |
max_tokens=self.config.max_tokens,
|
| 207 |
additional_params=self.config.additional_params
|
|
|
|
| 375 |
"benchmark_name": self.config.benchmark_name,
|
| 376 |
"temperature": self.config.temperature,
|
| 377 |
"max_tokens": self.config.max_tokens,
|
|
|
|
| 378 |
},
|
| 379 |
"benchmark_info": {
|
| 380 |
"total_size": len(benchmark),
|
| 381 |
"processed_questions": total_questions,
|
|
|
|
| 382 |
},
|
| 383 |
"results": {
|
| 384 |
"accuracy": accuracy,
|
medrax/docs/system_prompts.txt
CHANGED
|
@@ -4,12 +4,13 @@ Solve using your own vision and reasoning and use tools to complement your reaso
|
|
| 4 |
Make multiple tool calls in parallel or sequence as needed for comprehensive answers.
|
| 5 |
Critically think about and criticize the tool outputs.
|
| 6 |
If you need to look up some information before asking a follow up question, you are allowed to do that.
|
|
|
|
| 7 |
|
| 8 |
CITATION REQUIREMENTS:
|
| 9 |
-
- When referencing information from
|
| 10 |
-
- Use citations immediately after making claims or statements based on the above tool results
|
| 11 |
-
- Be consistent with citation numbering throughout your response
|
| 12 |
-
- Only cite sources that actually contain the information you're referencing
|
| 13 |
|
| 14 |
Examples:
|
| 15 |
- "According to recent research [1], chest X-rays can show signs of pneumonia..."
|
|
|
|
| 4 |
Make multiple tool calls in parallel or sequence as needed for comprehensive answers.
|
| 5 |
Critically think about and criticize the tool outputs.
|
| 6 |
If you need to look up some information before asking a follow up question, you are allowed to do that.
|
| 7 |
+
When encountering a multiple-choice question, give the final answer in closed parentheses without further elaborations; give a definitive answer even if you're not sure.
|
| 8 |
|
| 9 |
CITATION REQUIREMENTS:
|
| 10 |
+
- When referencing information from RAG and/or web search tools, ALWAYS include numbered citations [1], [2], [3], etc.
|
| 11 |
+
- Use citations immediately after making claims or statements based on the above tool results.
|
| 12 |
+
- Be consistent with citation numbering throughout your response.
|
| 13 |
+
- Only cite sources that actually contain the information you're referencing.
|
| 14 |
|
| 15 |
Examples:
|
| 16 |
- "According to recent research [1], chest X-rays can show signs of pneumonia..."
|