samwell Claude committed on
Commit b2fc7a6 · 1 Parent(s): 27f1dea

Replace Gemini with MedGemma-4B as main orchestrator


- Create ChatMedGemma LangChain wrapper with multimodal support
- Add MedGemma provider to ModelFactory with 4-bit quantization
- Update app.py to use MedGemma-4B instead of Gemini 2.0 Flash
- Benefits: Medical specialization (88.9% F1 on MIMIC-CXR), privacy, cost savings

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
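
As rough sizing context for the quantization benefit (an estimate, not a measured number): at 4-bit NF4 the ~4B weights occupy about 4e9 × 0.5 bytes ≈ 2 GB, versus roughly 8 GB in bfloat16, before activations and KV cache; that is what makes running the orchestrator on a single local GPU practical.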

app.py CHANGED
@@ -113,9 +113,11 @@ except Exception as e:
 checkpointer = MemorySaver()
 
 llm = ModelFactory.create_model(
-    model_name="gemini-2.0-flash",
-    temperature=0.7,
-    max_tokens=5000
+    model_name="medgemma-4b-it",
+    temperature=1.0,
+    max_tokens=2048,
+    device=device,
+    load_in_4bit=True
 )
 
 prompts = load_prompts_from_file("medrax/docs/system_prompts.txt")
@@ -167,7 +169,7 @@ def chat(message, history):
 
 # Custom interface with image output
 with gr.Blocks() as demo:
-    gr.Markdown(f"# MedRAX2 - Medical AI Assistant\n**Device:** {device} | **Tools:** {len(tools)} loaded")
+    gr.Markdown(f"# MedRAX2 - Medical AI Assistant (MedGemma-4B)\n**Device:** {device} | **Tools:** {len(tools)} loaded")
 
     chatbot = gr.Chatbot()
     viz_output = gr.Image(label="Grounding Visualization", visible=True)
medrax/models/medgemma.py ADDED
@@ -0,0 +1,184 @@
+"""MedGemma model wrapper for LangChain compatibility."""
+from typing import Any, List, Optional
+import torch
+from transformers import AutoProcessor, AutoModelForImageTextToText, BitsAndBytesConfig
+from langchain_core.language_models import BaseChatModel
+from langchain_core.messages import BaseMessage, HumanMessage, AIMessage, SystemMessage
+from langchain_core.outputs import ChatGeneration, ChatResult
+from langchain_core.callbacks import CallbackManagerForLLMRun
+
+
+class ChatMedGemma(BaseChatModel):
+    """LangChain wrapper for the MedGemma multimodal model."""
+
+    model: Any = None
+    processor: Any = None
+    model_name: str = "google/medgemma-4b-it"
+    device: str = "cuda"
+    max_new_tokens: int = 2048
+    temperature: float = 1.0
+    top_p: float = 0.95
+    top_k: int = 64
+
+    def __init__(
+        self,
+        model_name: str = "google/medgemma-4b-it",
+        device: str = "cuda",
+        load_in_4bit: bool = True,
+        max_new_tokens: int = 2048,
+        temperature: float = 1.0,
+        top_p: float = 0.95,
+        top_k: int = 64,
+        **kwargs
+    ):
+        """Initialize the MedGemma model.
+
+        Args:
+            model_name: Hugging Face model name
+            device: Device to load the model on (cuda/cpu)
+            load_in_4bit: Whether to use 4-bit quantization
+            max_new_tokens: Maximum number of tokens to generate
+            temperature: Sampling temperature
+            top_p: Top-p sampling parameter
+            top_k: Top-k sampling parameter
+        """
+        super().__init__(**kwargs)
+        self.model_name = model_name
+        self.device = device
+        self.max_new_tokens = max_new_tokens
+        self.temperature = temperature
+        self.top_p = top_p
+        self.top_k = top_k
+
+        # Set up 4-bit NF4 quantization (CUDA only)
+        if load_in_4bit and device == "cuda":
+            quantization_config = BitsAndBytesConfig(
+                load_in_4bit=True,
+                bnb_4bit_compute_dtype=torch.bfloat16,
+                bnb_4bit_use_double_quant=True,
+                bnb_4bit_quant_type="nf4",
+            )
+        else:
+            quantization_config = None
+
+        # Load model and processor
+        print(f"Loading MedGemma model: {model_name}...")
+        self.processor = AutoProcessor.from_pretrained(model_name)
+        self.model = AutoModelForImageTextToText.from_pretrained(
+            model_name,
+            device_map=device,
+            torch_dtype=torch.bfloat16,
+            quantization_config=quantization_config,
+            trust_remote_code=True,
+        ).eval()
+
+        # Enable sampling by default
+        self.model.generation_config.do_sample = True
+        print("✓ MedGemma model loaded successfully")
+
+    def _convert_messages_to_medgemma_format(self, messages: List[BaseMessage]) -> List[dict]:
+        """Convert LangChain messages to MedGemma's chat format."""
+        converted_messages = []
+
+        for message in messages:
+            if isinstance(message, SystemMessage):
+                # MedGemma's chat template accepts a system turn directly
+                converted_messages.append({
+                    "role": "system",
+                    "content": [{"type": "text", "text": message.content}]
+                })
+            elif isinstance(message, HumanMessage):
+                content = []
+
+                # Handle multimodal content (a list of text/image parts)
+                if isinstance(message.content, list):
+                    for item in message.content:
+                        if isinstance(item, dict):
+                            if item.get("type") == "image_url":
+                                # Extract the image path or URL
+                                image_url = item.get("image_url", {})
+                                if isinstance(image_url, dict):
+                                    url = image_url.get("url", "")
+                                else:
+                                    url = image_url
+                                content.append({"type": "image", "url": url})
+                            elif item.get("type") == "text":
+                                content.append({"type": "text", "text": item.get("text", "")})
+                        elif isinstance(item, str):
+                            content.append({"type": "text", "text": item})
+                elif isinstance(message.content, str):
+                    content = [{"type": "text", "text": message.content}]
+
+                converted_messages.append({"role": "user", "content": content})
+
+            elif isinstance(message, AIMessage):
+                converted_messages.append({
+                    "role": "assistant",
+                    "content": [{"type": "text", "text": message.content}]
+                })
+
+        return converted_messages
+
+    def _generate(
+        self,
+        messages: List[BaseMessage],
+        stop: Optional[List[str]] = None,
+        run_manager: Optional[CallbackManagerForLLMRun] = None,
+        **kwargs: Any,
+    ) -> ChatResult:
+        """Generate a response from MedGemma."""
+        # Convert messages to MedGemma format
+        medgemma_messages = self._convert_messages_to_medgemma_format(messages)
+
+        # Apply the chat template and tokenize
+        inputs = self.processor.apply_chat_template(
+            medgemma_messages,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            return_tensors="pt",
+        ).to(device=self.model.device, dtype=torch.bfloat16)
+
+        # Generate the response
+        with torch.inference_mode():
+            output_ids = self.model.generate(
+                **inputs,
+                max_new_tokens=self.max_new_tokens,
+                do_sample=True,
+                temperature=self.temperature,
+                top_p=self.top_p,
+                top_k=self.top_k,
+                pad_token_id=self.processor.tokenizer.eos_token_id,
+            )
+
+        # Decode only the newly generated tokens
+        prompt_length = inputs["input_ids"].shape[-1]
+        generated_ids = output_ids[0][prompt_length:]
+        response_text = self.processor.decode(
+            generated_ids,
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=True
+        )
+
+        # Create the ChatGeneration
+        message = AIMessage(content=response_text)
+        generation = ChatGeneration(message=message)
+
+        return ChatResult(generations=[generation])
+
+    @property
+    def _llm_type(self) -> str:
+        """Return the type of LLM."""
+        return "medgemma"
+
+    @property
+    def _identifying_params(self) -> dict:
+        """Return identifying parameters."""
+        return {
+            "model_name": self.model_name,
+            "device": self.device,
+            "max_new_tokens": self.max_new_tokens,
+            "temperature": self.temperature,
+            "top_p": self.top_p,
+            "top_k": self.top_k,
+        }
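
For reference, a minimal usage sketch of the wrapper above, driven directly rather than through the Gradio app. The image path and prompt are hypothetical, and it assumes a CUDA GPU with enough free VRAM for the 4-bit checkpoint:

from langchain_core.messages import HumanMessage, SystemMessage
from medrax.models.medgemma import ChatMedGemma

# Loads google/medgemma-4b-it with 4-bit NF4 quantization (see __init__ above)
llm = ChatMedGemma(model_name="google/medgemma-4b-it", load_in_4bit=True)

messages = [
    SystemMessage(content="You are a radiology assistant."),
    HumanMessage(content=[
        # Hypothetical local path; converted to {"type": "image", "url": ...}
        {"type": "image_url", "image_url": {"url": "/tmp/chest_xray.png"}},
        {"type": "text", "text": "Describe any abnormalities in this chest X-ray."},
    ]),
]

result = llm.invoke(messages)  # BaseChatModel.invoke returns an AIMessage
print(result.content)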
medrax/models/model_factory.py CHANGED
@@ -7,6 +7,7 @@ from langchain_core.language_models import BaseLanguageModel
 from langchain_openai import ChatOpenAI
 from langchain_google_genai import ChatGoogleGenerativeAI
 from langchain_xai import ChatXAI
+from .medgemma import ChatMedGemma
 
 
 class ModelFactory:
@@ -43,6 +44,11 @@ class ModelFactory:
             "class": ChatXAI,
             "env_key": "XAI_API_KEY",
         },
+        "medgemma": {
+            "class": ChatMedGemma,
+            "env_key": None,  # Local model, no API key needed
+            "is_local": True,
+        },
         # Add more providers with default configurations here
     }
 
@@ -90,16 +96,18 @@ class ModelFactory:
         provider = cls._model_providers[provider_prefix]
         model_class = provider["class"]
         env_key = provider["env_key"]
+        is_local = provider.get("is_local", False)
 
         # Set up provider-specific kwargs
         provider_kwargs = {}
 
-        # Handle API key
-        if env_key in os.environ:
-            provider_kwargs["api_key"] = os.environ[env_key]
-        else:
-            # Log warning but don't fail - the model class might handle missing API keys differently
-            print(f"Warning: Environment variable {env_key} not found. Authentication may fail.")
+        # Handle API key (skip for local models)
+        if not is_local:
+            if env_key and env_key in os.environ:
+                provider_kwargs["api_key"] = os.environ[env_key]
+            elif env_key:
+                # Log warning but don't fail - the model class might handle missing API keys differently
+                print(f"Warning: Environment variable {env_key} not found. Authentication may fail.")
 
         # Check for base_url if applicable
         if "base_url_key" in provider:
@@ -131,6 +139,19 @@
             **kwargs,
         )
 
+        # Handle MedGemma (a local model with different parameter names)
+        if model_name.startswith("medgemma"):
+            return model_class(
+                model_name=actual_model_name,
+                temperature=temperature,
+                top_p=top_p,
+                top_k=kwargs.get("top_k", 64),
+                max_new_tokens=max_tokens,
+                device=kwargs.get("device", "cuda"),
+                load_in_4bit=kwargs.get("load_in_4bit", True),
+                **provider_kwargs,
+            )
+
         # Create and return the model instance
         return model_class(
             model=actual_model_name,
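
And a sketch of the new factory route, assuming create_model's existing signature (temperature, max_tokens, top_p, plus **kwargs) as implied by the hunks above. The "medgemma" prefix selects the local provider, and max_tokens is forwarded as max_new_tokens:

from medrax.models.model_factory import ModelFactory

llm = ModelFactory.create_model(
    model_name="medgemma-4b-it",  # "medgemma" prefix routes to ChatMedGemma
    temperature=1.0,
    max_tokens=2048,        # becomes max_new_tokens for the local model
    device="cuda",          # read via kwargs.get("device", "cuda")
    load_in_4bit=True,      # read via kwargs.get("load_in_4bit", True)
)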