SebAustin committed
Commit b904a07 · 1 Parent(s): b6d4c5f

Speed: 4-bit default on Spaces, SDPA option, lower token limits; CUDA greedy fix

config.py CHANGED
@@ -38,15 +38,17 @@ class ModelConfig:
     # Model parameters
     USE_GPU: bool = os.getenv("USE_GPU", "true").lower() == "true"
     MAX_LENGTH: int = int(os.getenv("MAX_LENGTH", "2048"))
-    MAX_NEW_TOKENS: int = int(os.getenv("MAX_NEW_TOKENS", "512"))
+    MAX_NEW_TOKENS: int = int(os.getenv("MAX_NEW_TOKENS", "384"))
     TEMPERATURE: float = float(os.getenv("TEMPERATURE", "0.7"))
     TOP_P: float = 0.9
     TOP_K: int = 50
 
-    # Performance (set LOAD_IN_4BIT=true in .env for faster inference and less VRAM)
+    # Performance: 4-bit greatly speeds up inference on GPU (e.g. HF Spaces). Default on when SPACE_ID is set.
+    _default_4bit = "true" if os.getenv("SPACE_ID") else "false"
     LOAD_IN_8BIT: bool = os.getenv("LOAD_IN_8BIT", "false").lower() == "true"
-    LOAD_IN_4BIT: bool = os.getenv("LOAD_IN_4BIT", "false").lower() == "true"
-    USE_FLASH_ATTENTION: bool = False
+    LOAD_IN_4BIT: bool = os.getenv("LOAD_IN_4BIT", _default_4bit).lower() == "true"
+    # Attention: "eager" (default), "sdpa" (faster on GPU), "flash_attention_2" (fastest, needs flash-attn; Gemma can be flaky)
+    ATTN_IMPLEMENTATION: str = os.getenv("ATTN_IMPLEMENTATION", "eager").lower()
 
     @classmethod
     def get_device(cls) -> str:
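
For reference, a standalone sketch of how the new defaults resolve. SPACE_ID is set automatically by Hugging Face Spaces; the value used here is a made-up example, not from this repo:

import os

# Simulate running on a Space (SPACE_ID value is hypothetical).
os.environ["SPACE_ID"] = "user/medgemma-demo"

_default_4bit = "true" if os.getenv("SPACE_ID") else "false"
LOAD_IN_4BIT = os.getenv("LOAD_IN_4BIT", _default_4bit).lower() == "true"
ATTN_IMPLEMENTATION = os.getenv("ATTN_IMPLEMENTATION", "eager").lower()

print(LOAD_IN_4BIT)         # True: 4-bit defaults on whenever SPACE_ID is present
print(ATTN_IMPLEMENTATION)  # "eager" unless overridden, e.g. ATTN_IMPLEMENTATION=sdpa

Both knobs remain plain environment variables, so local runs can still opt out with LOAD_IN_4BIT=false.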
src/agents/care_agent.py CHANGED
@@ -37,7 +37,7 @@ class CareRecommendationAgent(BaseAgent):
             urgency_reasoning=urgency_reasoning
         )
 
-        recommendations = self._generate(prompt, temperature=0.5, max_length=1536, max_new_tokens=512)
+        recommendations = self._generate(prompt, temperature=0.5, max_length=1536, max_new_tokens=384)
 
         # Extract structured components
         care_setting = self._extract_care_setting(recommendations, urgency_level)
src/agents/communication_agent.py CHANGED
@@ -45,7 +45,7 @@ class CommunicationAgent(BaseAgent):
             care_recommendation=care_recommendation_text
         )
 
-        report = self._generate(prompt, temperature=0.6, max_length=2048, max_new_tokens=768)
+        report = self._generate(prompt, temperature=0.6, max_length=2048, max_new_tokens=512)
 
         # Create structured formatted report
         formatted_report = self._create_formatted_report(
src/agents/intake_agent.py CHANGED
@@ -48,7 +48,7 @@ class IntakeAgent(BaseAgent):
         })
 
         # Generate response
-        response = self._generate(prompt, temperature=0.7, max_length=1024, max_new_tokens=384)
+        response = self._generate(prompt, temperature=0.7, max_length=1024, max_new_tokens=256)
 
         self.conversation_history.append({
             "role": "assistant",
@@ -155,7 +155,7 @@ Provide a structured summary including:
 
 Be concise and focus on medically relevant information."""
 
-        summary = self._generate(summary_prompt, temperature=0.5, max_length=1024, max_new_tokens=384)
+        summary = self._generate(summary_prompt, temperature=0.5, max_length=1024, max_new_tokens=256)
 
         logger.info(f"{self.name} generated case summary")
         return summary
src/agents/symptom_agent.py CHANGED
@@ -32,7 +32,7 @@ class SymptomAssessmentAgent(BaseAgent):
 
         # Generate symptom analysis with lower temperature for more focused analysis
         prompt = PromptTemplates.format_symptom_assessment(case_summary)
-        analysis = self._generate(prompt, temperature=0.4, max_length=1536, max_new_tokens=512)
+        analysis = self._generate(prompt, temperature=0.4, max_length=1536, max_new_tokens=384)
 
         # Extract key components
         primary_symptoms = self._extract_primary_symptoms(analysis)
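
All four agents get the same treatment: max_new_tokens drops by 128-256 per call. Since autoregressive decoding cost grows roughly linearly with tokens generated, the saving per call is easy to estimate; the throughput figure below is an illustrative assumption, not a measurement from this repo:

# Back-of-the-envelope decode-time model; 20 tok/s is an assumed throughput.
def estimated_decode_seconds(max_new_tokens: int, tokens_per_second: float = 20.0) -> float:
    """Decode latency scales roughly linearly with tokens generated."""
    return max_new_tokens / tokens_per_second

print(estimated_decode_seconds(512))  # 25.6 s -> old care/symptom-agent ceiling
print(estimated_decode_seconds(384))  # 19.2 s -> new ceiling, ~25% faster worst case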
src/models/medgemma_client.py CHANGED
@@ -80,9 +80,11 @@ class MedGemmaClient:
             "cache_dir": ModelConfig.MODEL_CACHE_DIR,
             "token": self.token,
             "torch_dtype": torch.float16 if self.device == "cuda" else torch.float32,
-            "low_cpu_mem_usage": True
+            "low_cpu_mem_usage": True,
         }
-
+        if getattr(ModelConfig, "ATTN_IMPLEMENTATION", "eager") != "eager" and self.device == "cuda":
+            model_kwargs["attn_implementation"] = ModelConfig.ATTN_IMPLEMENTATION
+
         # Use BitsAndBytesConfig for 4/8-bit (Gemma 3 etc. don't accept load_in_4bit kwarg). Only on GPU.
         if self.device == "cuda" and ModelConfig.LOAD_IN_8BIT:
             model_kwargs["quantization_config"] = BitsAndBytesConfig(load_in_8bit=True)
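
The hunk above only shows the 8-bit branch; the 4-bit path that LOAD_IN_4BIT enables presumably mirrors it just below. A minimal sketch of what such a branch typically looks like, where the bnb_4bit_* settings are illustrative assumptions rather than values from this commit:

import torch
from transformers import BitsAndBytesConfig

def build_quantization_config(device: str, load_in_8bit: bool, load_in_4bit: bool):
    """Pick a bitsandbytes config for GPU runs; None means full precision."""
    if device != "cuda":
        return None  # as in the diff, quantization is applied only on GPU
    if load_in_8bit:
        return BitsAndBytesConfig(load_in_8bit=True)
    if load_in_4bit:
        return BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,  # matches the fp16 torch_dtype above
            bnb_4bit_quant_type="nf4",             # common default for 4-bit inference
        )
    return None

The resulting config is passed via model_kwargs["quantization_config"], which is why the attn_implementation kwarg is added separately: quantization and attention backend are independent knobs in transformers' from_pretrained.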