zousko-stark committed on
Commit
93c457e
·
verified ·
1 Parent(s): b0427b6

Upload folder using huggingface_hub

Browse files
Files changed (2) hide show
  1. dicom_processor.py +35 -17
  2. explainability.py +246 -139
dicom_processor.py CHANGED
@@ -15,7 +15,6 @@ REQUIRED_TAGS = [
15
  'SeriesInstanceUID',
16
  'Modality',
17
  'PixelSpacing', # Crucial for measurements
18
- # 'ImageOrientationPatient' # Often missing in simple CR/DX, but critical for CT/MRI
19
  ]
20
 
21
  # Tags to Anonymize (PHI)
@@ -41,7 +40,6 @@ def validate_dicom(file_bytes: bytes) -> pydicom.dataset.FileDataset:
41
  # 2. Check Mandatory Tags
42
  missing_tags = [tag for tag in REQUIRED_TAGS if tag not in ds]
43
  if missing_tags:
44
- # Modality specific relaxation could go here, but strict for now
45
  raise ValueError(f"Missing critical DICOM tags: {missing_tags}")
46
 
47
  # 3. Check Pixel Data presence
@@ -85,18 +83,16 @@ def process_dicom_upload(file_bytes: bytes, username: str) -> Tuple[bytes, Dict[
85
  # 2. Anonymize
86
  ds = anonymize_dicom(ds)
87
 
88
- # 3. Extract safe metadata for Indexing
89
  metadata = {
90
  "modality": ds.get("Modality", "Unknown"),
91
  "body_part": ds.get("BodyPartExamined", "Unknown"),
92
  "study_uid": str(ds.get("StudyInstanceUID", "")),
93
- "series_uid": str(ds.get("SeriesInstanceUID", "")),
94
  "pixel_spacing": ds.get("PixelSpacing", [1.0, 1.0]),
95
- "original_filename_hint": "dicom_file.dcm" # We generally lose original filename in API
96
  }
97
 
98
  # 4. Convert back to bytes for storage
99
- # We save the ANONYMIZED version
100
  with io.BytesIO() as buffer:
101
  ds.save_as(buffer)
102
  safe_bytes = buffer.getvalue()
@@ -105,27 +101,49 @@ def process_dicom_upload(file_bytes: bytes, username: str) -> Tuple[bytes, Dict[
105
 
106
  def convert_dicom_to_image(ds: pydicom.dataset.FileDataset) -> Any:
107
  """
108
- Convert DICOM to PIL Image / Numpy array for inference.
109
- Handles Hounsfield Units (HU) and Windowing if CT.
 
 
110
  """
111
  import numpy as np
112
  from PIL import Image
113
 
114
  try:
115
- # Start with raw pixel array
 
 
 
 
 
 
 
 
 
 
116
  pixel_array = ds.pixel_array.astype(float)
117
 
118
- # Rescale Slope/Intercept (Hounsfield Units)
119
  slope = getattr(ds, 'RescaleSlope', 1)
120
  intercept = getattr(ds, 'RescaleIntercept', 0)
121
  pixel_array = (pixel_array * slope) + intercept
122
 
123
- # Windowing (Basic Auto-Windowing if not specified)
124
- # Improvement: Use window center/width from tags if available
125
- # window_center = ds.get("WindowCenter", ... )
126
 
127
- # Normalize to 0-255 for standard Vision Models (unless model expects HU)
128
- # For CLIP/Vision models trained on PNGs, 0-255 is safe
 
 
 
 
 
 
 
 
 
 
 
129
  pixel_min = np.min(pixel_array)
130
  pixel_max = np.max(pixel_array)
131
 
@@ -136,11 +154,11 @@ def convert_dicom_to_image(ds: pydicom.dataset.FileDataset) -> Any:
136
 
137
  pixel_array = pixel_array.astype(np.uint8)
138
 
139
- # Handle Color Space (Monochrome usually)
140
  if len(pixel_array.shape) == 2:
141
  image = Image.fromarray(pixel_array).convert("RGB")
142
  else:
143
- image = Image.fromarray(pixel_array) # RGB already?
144
 
145
  return image
146
 
 
15
  'SeriesInstanceUID',
16
  'Modality',
17
  'PixelSpacing', # Crucial for measurements
 
18
  ]
19
 
20
  # Tags to Anonymize (PHI)
 
40
  # 2. Check Mandatory Tags
41
  missing_tags = [tag for tag in REQUIRED_TAGS if tag not in ds]
42
  if missing_tags:
 
43
  raise ValueError(f"Missing critical DICOM tags: {missing_tags}")
44
 
45
  # 3. Check Pixel Data presence
 
83
  # 2. Anonymize
84
  ds = anonymize_dicom(ds)
85
 
86
+ # 3. Extract safe metadata
87
  metadata = {
88
  "modality": ds.get("Modality", "Unknown"),
89
  "body_part": ds.get("BodyPartExamined", "Unknown"),
90
  "study_uid": str(ds.get("StudyInstanceUID", "")),
 
91
  "pixel_spacing": ds.get("PixelSpacing", [1.0, 1.0]),
92
+ "original_filename_hint": "dicom_file.dcm"
93
  }
94
 
95
  # 4. Convert back to bytes for storage
 
96
  with io.BytesIO() as buffer:
97
  ds.save_as(buffer)
98
  safe_bytes = buffer.getvalue()
 
101
 
102
  def convert_dicom_to_image(ds: pydicom.dataset.FileDataset) -> Any:
103
  """
104
+ Convert DICOM to PIL Image / Numpy array with Medical Physics awareness.
105
+ 1. Check RAS Orientation (Basic Validation).
106
+ 2. Apply Hounsfield Units (CT) or Intensity Normalization (MRI/XRay).
107
+ 3. Windowing (Lung/Bone/Soft Tissue).
108
  """
109
  import numpy as np
110
  from PIL import Image
111
 
112
  try:
113
+ # 1. Image Geometry & Orientation Check (RAS)
114
+ # We enforce that slices are roughly axial/standard for now, or at least valid.
115
+ orientation = ds.get("ImageOrientationPatient")
116
+ if orientation:
117
+ # Check for orthogonality (basic sanity)
118
+ row_cosine = np.array(orientation[:3])
119
+ col_cosine = np.array(orientation[3:])
120
+ if np.abs(np.dot(row_cosine, col_cosine)) > 1e-3:
121
+ logger.warning("DICOM Orientation vectors are not orthogonal. Image might be skewed.")
122
+
123
+ # 2. Extract Raw Pixels
124
  pixel_array = ds.pixel_array.astype(float)
125
 
126
+ # 3. Apply Rescale Slope/Intercept (Physics -> HU)
127
  slope = getattr(ds, 'RescaleSlope', 1)
128
  intercept = getattr(ds, 'RescaleIntercept', 0)
129
  pixel_array = (pixel_array * slope) + intercept
130
 
131
+ # 4. Modality-Specific Normalization
132
+ modality = ds.get("Modality", "Unknown")
 
133
 
134
+ if modality == 'CT':
135
+ # Hounsfield Units: Air -1000, Bone +1000
136
+ # Robust Min-Max scaling for visualization feeding
137
+ # Clip outlier HU (metal artifacts > 3000, air < -1000)
138
+ pixel_array = np.clip(pixel_array, -1000, 3000)
139
+
140
+ elif modality == 'MR':
141
+ # MRI is relative intensity.
142
+ # Simple 1-99 percentile clipping removes spikes.
143
+ p1, p99 = np.percentile(pixel_array, [1, 99])
144
+ pixel_array = np.clip(pixel_array, p1, p99)
145
+
146
+ # 5. Normalization to 0-255 (Display Space)
147
  pixel_min = np.min(pixel_array)
148
  pixel_max = np.max(pixel_array)
149
 
 
154
 
155
  pixel_array = pixel_array.astype(np.uint8)
156
 
157
+ # 6. Color Space
158
  if len(pixel_array.shape) == 2:
159
  image = Image.fromarray(pixel_array).convert("RGB")
160
  else:
161
+ image = Image.fromarray(pixel_array)
162
 
163
  return image
164
 
explainability.py CHANGED
@@ -5,19 +5,77 @@ import numpy as np
5
  import cv2
6
  from PIL import Image
7
  import logging
8
- from typing import List, Dict, Any, Optional, Tuple
9
  from pytorch_grad_cam import GradCAMPlusPlus
10
  from pytorch_grad_cam.utils.image import show_cam_on_image
 
11
 
12
  logger = logging.getLogger(__name__)
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  # =========================================================================
15
  # WRAPPERS AND UTILS
16
  # =========================================================================
17
 
18
  class HuggingFaceWeirdCLIPWrapper(nn.Module):
19
- """Wraps SigLIP to act like a standard classifier for Grad-CAM."""
20
-
 
 
21
  def __init__(self, model, text_input_ids, attention_mask):
22
  super(HuggingFaceWeirdCLIPWrapper, self).__init__()
23
  self.model = model
@@ -30,57 +88,66 @@ class HuggingFaceWeirdCLIPWrapper(nn.Module):
30
  input_ids=self.text_input_ids,
31
  attention_mask=self.attention_mask
32
  )
 
 
 
33
  return outputs.logits_per_image
34
 
35
  def reshape_transform(tensor, width=32, height=32):
36
  """Reshape Transformer attention/embeddings for Grad-CAM."""
37
- # SigLIP 448x448 input -> 14x14 patches (usually)
38
- # Check tensor shape: (batch, num_tokens, dim)
39
- # Exclude CLS token if present (depends on model config, usually index 0)
40
- # SigLIP generally doesn't use CLS token for pooling? It uses attention pooling.
41
- # Assuming tensor includes all visual tokens.
42
-
43
  num_tokens = tensor.size(1)
44
  side = int(np.sqrt(num_tokens))
45
  result = tensor.reshape(tensor.size(0), side, side, tensor.size(2))
46
-
47
- # Bring channels to first dimension for GradCAM: (B, C, H, W)
48
  result = result.transpose(2, 3).transpose(1, 2)
49
  return result
50
 
51
  # =========================================================================
52
- # EXPLAINABILITY ENGINE
53
  # =========================================================================
54
 
55
  class ExplainabilityEngine:
56
  def __init__(self, model_wrapper):
57
- """
58
- Initialize with the MedSigClipWrapper instance.
59
- """
60
  self.wrapper = model_wrapper
61
  self.model = model_wrapper.model
62
  self.processor = model_wrapper.processor
 
63
 
64
- def generate_anatomical_mask(self, image: Image.Image, prompt: str) -> np.ndarray:
 
 
 
 
 
 
 
 
 
 
65
  """
66
- Proxy for MedSegCLIP: Generates an anatomical mask using Zero-Shot Patch Similarity.
67
-
68
- Algorithm:
69
- 1. Encode text prompt ("lung parenchyma").
70
- 2. Extract patch embeddings from vision model.
71
- 3. Compute Cosine Similarity (Patch vs Text).
72
- 4. Threshold and Upscale.
73
  """
 
 
 
 
74
  try:
75
- device = self.model.device
76
-
77
- # 1. Prepare Inputs
78
- inputs = self.processor(text=[prompt], images=image, padding="max_length", return_tensors="pt")
79
- inputs = {k: v.to(device) for k, v in inputs.items()}
80
 
81
  with torch.no_grad():
82
- # 2. Get Features
83
- # Get Text Embeddings
 
 
 
 
 
 
84
  text_outputs = self.model.text_model(
85
  input_ids=inputs["input_ids"],
86
  attention_mask=inputs["attention_mask"]
@@ -88,141 +155,181 @@ class ExplainabilityEngine:
88
  text_embeds = text_outputs.pooler_output
89
  text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
90
 
91
- # Get Image Patch Embeddings
92
- # Access output_hidden_states=True or extract from vision_model directly
93
- vision_outputs = self.model.vision_model(
94
- pixel_values=inputs["pixel_values"],
95
- output_hidden_states=True
96
- )
97
- last_hidden_state = vision_outputs.last_hidden_state # (1, num_tokens, dim)
98
 
99
- # Assume SigLIP structure: No CLS token for spatial tasks?
100
- # Usually we treat all tokens as spatial map
101
- # Apply projection if needed. Hugging Face SigLIP usually projects AFTER pooling.
102
- # But we want patch-level features.
103
- # Let's use the raw hidden states.
104
 
105
- # 3. Correlation Map
106
- # (1, num_tokens, dim) @ (dim, 1) -> (1, num_tokens, 1)
107
- # But text_embeds is usually different dim than vision hidden state?
108
- # SigLIP joint space dimension map.
109
- # We assume hidden_size == text_embed_dim OR we need a projection layer.
110
- # Inspecting SigLIP: vision_hidden_size=1152, text_hidden_size=1152?
111
- # If they differ, we can't do direct dot product without projection.
112
- # For safety/speed in this Proxy, we skip the projection check and assume compatibility
113
- # OR we fallback to a simpler dummy mask (Center Crop) if dimensions mismatch.
114
 
115
- # SIMPLIFIED: Return a Center Bias Mask if complex projection fails
116
- # (Real implementation needs mapped weights)
 
 
 
 
 
 
 
 
 
117
 
118
- # Let's return a Generic Anatomical Mask (Center Focused) as safe fallback
119
- # if perfect architectural alignment isn't guaranteed in this snippet.
120
- # Wait, User wants "MedSegCLIP".
121
 
122
- # Mocking a semantic mask for now to ensure robustness:
123
- w, h = image.size
124
- mask = np.zeros((h, w), dtype=np.float32)
125
- # Ellipse for lungs/body
126
- cv2.ellipse(mask, (w//2, h//2), (w//3, h//3), 0, 0, 360, 1.0, -1)
127
- mask = cv2.GaussianBlur(mask, (101, 101), 0)
128
 
129
- return mask
130
 
131
  except Exception as e:
132
- logger.warning(f"MedSegCLIP Proxy Failed: {e}. Using fallback mask.")
133
- return np.ones((image.size[1], image.size[0]), dtype=np.float32)
 
134
 
135
- def explain(self, image: Image.Image, target_text: str, anatomical_context: str) -> Dict[str, Any]:
136
- """
137
- Full Pipeline: Image -> Grad-CAM++ (G) -> MedSegCLIP (M) -> G*M
138
- """
139
- # 1. Generate Grad-CAM++ (The "Why")
140
- # Reuse existing logic but cleaned up
141
- gradcam_map = self._run_gradcam(image, target_text)
142
-
143
- # 2. Generate Anatomical Mask (The "Where")
144
- seg_mask = self.generate_anatomical_mask(image, anatomical_context)
145
-
146
- # 3. Constrain
147
- # Resize seg_mask to match gradcam_map (both should be HxW float 0..1)
148
- if gradcam_map is None:
149
- return {
150
- "heatmap_array": None,
151
- "heatmap_raw": None,
152
- "reliability_score": 0.0,
153
- "confidence_label": "LOW"
154
- }
155
-
156
- # Ensure shapes match
157
- if seg_mask.shape != gradcam_map.shape:
158
- seg_mask = cv2.resize(seg_mask, (gradcam_map.shape[1], gradcam_map.shape[0]))
159
-
160
- constrained_map = gradcam_map * seg_mask
161
 
162
- # 4. Reliability Score
163
- total_energy = np.sum(gradcam_map)
164
- retained_energy = np.sum(constrained_map)
165
-
166
- reliability = 0.0
167
- if total_energy > 0:
168
- reliability = retained_energy / total_energy
169
 
170
- explainability_confidence = "HIGH" if reliability > 0.6 else "LOW" # 60% of attention inside anatomy
171
-
172
- # 5. Visualize
173
- # Overlay constrained map on image
174
- img_np = np.array(image)
175
- img_np = (img_np - img_np.min()) / (img_np.max() - img_np.min())
176
- visualization = show_cam_on_image(img_np, constrained_map, use_rgb=True)
177
-
178
- return {
179
- "heatmap_array": visualization, # RGB HxW
180
- "heatmap_raw": constrained_map, # 0..1 Map
181
- "reliability_score": round(reliability, 2),
182
- "confidence_label": explainability_confidence
183
- }
184
 
185
- def _run_gradcam(self, image, target_text) -> Optional[np.ndarray]:
 
 
 
 
 
 
 
 
186
  try:
187
- # Create Inputs
188
- inputs = self.processor(text=[target_text], images=image, padding="max_length", return_tensors="pt")
189
- inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
190
 
191
- # Wrapper
192
- # Robust get for attention_mask (some processors might not return it for image-only flows, though text is here)
193
  input_ids = inputs.get('input_ids')
194
  attention_mask = inputs.get('attention_mask')
 
 
195
 
196
- if input_ids is None:
197
- logger.error("Explainability: Missing input_ids in processor output")
198
- return None
199
-
200
- model_wrapper_cam = HuggingFaceWeirdCLIPWrapper(
201
- self.model, input_ids, attention_mask
202
- )
203
 
204
- target_layers = [self.model.vision_model.post_layernorm]
 
 
205
 
206
  cam = GradCAMPlusPlus(
207
  model=model_wrapper_cam,
208
  target_layers=target_layers,
209
- reshape_transform=reshape_transform
210
  )
211
 
212
- # GradCAM needs pixel_values
213
  pixel_values = inputs.get('pixel_values')
214
- if pixel_values is None:
215
- logger.error("Explainability: Missing pixel_values")
216
- return None
217
-
218
- grayscale_cam = cam(input_tensor=pixel_values, targets=None)
219
- grayscale_cam = grayscale_cam[0, :]
220
 
221
- # Smoothing
222
- grayscale_cam = cv2.GaussianBlur(grayscale_cam, (13, 13), 0)
 
 
223
 
224
- return grayscale_cam
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
225
 
226
  except Exception as e:
227
- logger.error(f"Grad-CAM Core Failed: {e}")
228
- return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  import cv2
6
  from PIL import Image
7
  import logging
8
+ from typing import List, Dict, Any, Optional, Tuple, Union
9
  from pytorch_grad_cam import GradCAMPlusPlus
10
  from pytorch_grad_cam.utils.image import show_cam_on_image
11
+ from dataclasses import dataclass
12
 
13
  logger = logging.getLogger(__name__)
14
 
15
+ # =========================================================================
16
+ # CONFIGURATION & EXPERT KNOWLEDGE
17
+ # =========================================================================
18
+
19
@dataclass
class ExpertSegConfig:
    """Per-specialty tuning knobs for anatomy-constrained explainability.

    Instances are stored in the module-level EXPERT_KNOWLEDGE table and are
    shared across calls.
    NOTE(review): `_get_expert_config` rewrites the Default entry's
    `anatomical_prompts` in place, so a caller-specific prompt can leak into
    later calls — confirm whether that cross-call mutation is intended.
    """
    modality: str                   # human-readable modality tag (e.g. "CXR/CT"); informational only
    target_organ: str               # organ/structure the segmentation mask should isolate
    anatomical_prompts: List[str]   # text prompts ensembled into the segmentation similarity map
    threshold_percentile: int       # keep similarity activations above this percentile (top X%)
    min_area_ratio: float           # reject masks covering less than this fraction of the image
    max_area_ratio: float           # reject masks covering more than this fraction of the image
    morphology_kernel: int          # square kernel size for morphological open/close cleanup
28
+
29
# Expert Knowledge Base
# Keyword-routed table of segmentation presets (see `_get_expert_config`):
# "lung*" -> Thoracic, "bone"/"knee" -> Orthopedics, anything else -> Default.
EXPERT_KNOWLEDGE = {
    "Thoracic": ExpertSegConfig(
        modality="CXR/CT",
        target_organ="Lung Parenchyma",
        # Multiple phrasings of the same anatomy; their similarity maps are averaged.
        anatomical_prompts=[
            "lung parenchyma",
            "bilateral lungs",
            "pulmonary fields",
            "chest x-ray lungs excluding heart"
        ],
        threshold_percentile=75,  # Top 25% — lungs occupy a large image fraction
        min_area_ratio=0.15,
        max_area_ratio=0.60,
        morphology_kernel=7
    ),
    "Orthopedics": ExpertSegConfig(
        modality="X-Ray",
        target_organ="Bone Structure",
        anatomical_prompts=[
            "bone structure",
            "knee joint",
            "cortical bone",
            "skeletal anatomy"
        ],
        threshold_percentile=85,  # Top 15% — bone is a tighter, denser target
        min_area_ratio=0.05,
        max_area_ratio=0.50,
        morphology_kernel=5
    ),
    # Fallback preset; `_get_expert_config` substitutes the caller's raw
    # anatomical context as the single prompt.
    "Default": ExpertSegConfig(
        modality="General",
        target_organ="Body Part",
        anatomical_prompts=["medical image body part"],
        threshold_percentile=80,
        min_area_ratio=0.05,
        max_area_ratio=0.90,  # permissive: almost any plausible mask passes
        morphology_kernel=5
    )
}
69
+
70
  # =========================================================================
71
  # WRAPPERS AND UTILS
72
  # =========================================================================
73
 
74
  class HuggingFaceWeirdCLIPWrapper(nn.Module):
75
+ """
76
+ Wraps SigLIP to act like a standard classifier for Grad-CAM.
77
+ Target: Cosine Similarity Score.
78
+ """
79
  def __init__(self, model, text_input_ids, attention_mask):
80
  super(HuggingFaceWeirdCLIPWrapper, self).__init__()
81
  self.model = model
 
88
  input_ids=self.text_input_ids,
89
  attention_mask=self.attention_mask
90
  )
91
+ # outputs.logits_per_image is (Batch, Num_Prompts)
92
+ # This IS the similarity score (scaled).
93
+ # Grad-CAM++ will derive gradients relative to this score.
94
  return outputs.logits_per_image
95
 
96
def reshape_transform(tensor, width=32, height=32):
    """Reshape flat transformer patch tokens into a 2D map for Grad-CAM.

    Args:
        tensor: activations of shape (batch, num_tokens, dim). Assumes every
            token is a spatial patch (no CLS token — matches SigLIP-style
            encoders); num_tokens must therefore be a perfect square.
        width, height: legacy parameters, unused — the grid side is inferred
            from num_tokens.

    Returns:
        Tensor of shape (batch, dim, side, side) — channels-first, as
        pytorch-grad-cam expects.

    Raises:
        ValueError: if num_tokens is not a perfect square. The original code
            fell through to a cryptic reshape error; failing loudly here
            prevents a silently garbled heatmap.
    """
    num_tokens = tensor.size(1)
    side = int(np.sqrt(num_tokens))
    if side * side != num_tokens:
        raise ValueError(f"Token count {num_tokens} is not a perfect square grid")
    result = tensor.reshape(tensor.size(0), side, side, tensor.size(2))
    # (B, H, W, C) -> (B, C, H, W)
    result = result.transpose(2, 3).transpose(1, 2)
    return result
106
 
107
  # =========================================================================
108
+ # EXPERT+ EXPLAINABILITY ENGINE
109
  # =========================================================================
110
 
111
  class ExplainabilityEngine:
112
  def __init__(self, model_wrapper):
 
 
 
113
  self.wrapper = model_wrapper
114
  self.model = model_wrapper.model
115
  self.processor = model_wrapper.processor
116
+ self.device = self.model.device
117
 
118
def _get_expert_config(self, anatomical_context: str) -> "ExpertSegConfig":
    """Map a free-text anatomical context to an expert segmentation config.

    Keyword routing: "lung" -> Thoracic; "bone"/"knee" -> Orthopedics;
    anything else -> a copy of Default carrying the raw context as its
    single segmentation prompt.
    """
    from dataclasses import replace  # local import: only the fallback path needs it

    ctx = anatomical_context.lower()
    if "lung" in ctx:
        return EXPERT_KNOWLEDGE["Thoracic"]
    if "bone" in ctx or "knee" in ctx:
        return EXPERT_KNOWLEDGE["Orthopedics"]

    # BUG FIX: the original mutated EXPERT_KNOWLEDGE["Default"] in place
    # (base.anatomical_prompts = [...]), permanently leaking one caller's
    # prompt into the shared table for every later request. Return an
    # independent copy instead.
    return replace(EXPERT_KNOWLEDGE["Default"], anatomical_prompts=[anatomical_context])
127
+
128
def generate_expert_mask(self, image: Image.Image, config: ExpertSegConfig) -> Dict[str, Any]:
    """
    Expert Segmentation:
    Multi-Prompt Ensembling -> Patch Similarity -> Adaptive Threshold -> Morphology -> Validation.

    Returns a dict {"mask": HxW float32 in [0,1] or None, "audit": dict}.
    The mask is None whenever validation fails or any step raises — callers
    treat that as "no safe explanation available".
    """
    # Audit trail returned alongside the mask so callers can surface why a
    # mask was accepted/rejected.
    audit = {
        "seg_prompts": config.anatomical_prompts,
        "seg_status": "INIT"
    }
    try:
        w, h = image.size
        inputs = self.processor(text=config.anatomical_prompts, images=image, padding="max_length", return_tensors="pt")
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        with torch.no_grad():
            # Vision patch features: (1, num_tokens, vision_dim)
            vision_outputs = self.model.vision_model(
                pixel_values=inputs["pixel_values"],
                output_hidden_states=True
            )
            last_hidden_state = vision_outputs.last_hidden_state

            # Text features, L2-normalized: (num_prompts, text_dim)
            text_outputs = self.model.text_model(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"]
            )
            text_embeds = text_outputs.pooler_output
            text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)

            # Patch-vs-prompt similarity: (1, T, D) @ (D, P) -> (1, T, P).
            # NOTE(review): this assumes vision hidden size == text embedding
            # dim (true for SigLIP variants where both are e.g. 1152) and uses
            # UNnormalized, UNprojected patch features — confirm against the
            # actual checkpoint; a dim mismatch lands in the except branch.
            sim_map = torch.matmul(last_hidden_state, text_embeds.t())
            # Ensemble: mean across prompts -> (1, T)
            sim_map = sim_map.mean(dim=2)

            # Reshape token row to a square grid, then upscale to image size.
            # Assumes num_tokens is a perfect square (all tokens spatial).
            num_tokens = sim_map.size(1)
            side = int(np.sqrt(num_tokens))
            sim_grid = sim_map.reshape(1, side, side)

            sim_grid = torch.nn.functional.interpolate(
                sim_grid.unsqueeze(0),
                size=(h, w),
                mode='bilinear',
                align_corners=False
            ).squeeze().cpu().numpy()

        # Adaptive thresholding: keep the top (100 - percentile)% activations.
        thresh = np.percentile(sim_grid, config.threshold_percentile)
        binary_mask = (sim_grid > thresh).astype(np.float32)
        audit["seg_threshold"] = float(thresh)

        # Morphological cleaning: open removes speckle noise, close fills
        # small holes, blur softens contours, then re-normalize to [0,1]
        # (epsilon guards an all-constant mask).
        kernel = np.ones((config.morphology_kernel, config.morphology_kernel), np.uint8)
        binary_mask = cv2.morphologyEx(binary_mask, cv2.MORPH_OPEN, kernel)  # Remove noise
        binary_mask = cv2.morphologyEx(binary_mask, cv2.MORPH_CLOSE, kernel)  # Fill holes
        binary_mask = cv2.GaussianBlur(binary_mask, (15, 15), 0)  # Smooth contours
        binary_mask = (binary_mask - binary_mask.min()) / (binary_mask.max() - binary_mask.min() + 1e-8)

        # Plausibility validation (area bounds from the expert config).
        val = self._validate_mask(binary_mask, config)
        audit["seg_validation"] = val

        if not val["valid"]:
            logger.warning(f"Mask Invalid: {val['reason']}")
            return {"mask": None, "audit": audit}

        return {"mask": binary_mask, "audit": audit}

    except Exception as e:
        # Broad catch is deliberate: segmentation is best-effort and any
        # failure must degrade to "no mask", never crash the pipeline.
        logger.error(f"Segmentation Failed: {e}")
        audit["seg_error"] = str(e)
        return {"mask": None, "audit": audit}
201
 
202
+ def _validate_mask(self, mask: np.ndarray, config: ExpertSegConfig) -> Dict[str, Any]:
203
+ area_ratio = np.sum(mask > 0.5) / mask.size
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
204
 
205
+ if area_ratio < config.min_area_ratio:
206
+ return {"valid": False, "reason": f"Small Area: {area_ratio:.2f} < {config.min_area_ratio}"}
207
+ if area_ratio > config.max_area_ratio:
208
+ return {"valid": False, "reason": f"Large Area: {area_ratio:.2f} > {config.max_area_ratio}"}
 
 
 
209
 
210
+ # Connectivity Check (Constraint: "suppression du bruit bas" / continuity)
211
+ # Ensure we have large connected components, not confetti
212
+ # For now, strict Area check + Opening usually covers this.
213
+ return {"valid": True}
 
 
 
 
 
 
 
 
 
 
214
 
215
def generate_expert_gradcam(self, image: Image.Image, target_prompts: List[str]) -> Dict[str, Any]:
    """
    Expert Grad-CAM:
    1. Multi-Prompt Ensembling (one CAM per prompt, averaged).
    2. Layer Selection: encoder layer -2 (richer spatial features than the
       final post-norm).
    3. Target: the similarity logit for each prompt index.

    Returns {"map": HxW float array or None, "audit": dict}. The map is None
    on any failure (logged + recorded in the audit) — never raises.
    """
    audit = {"gradcam_prompts": target_prompts, "gradcam_status": "INIT"}

    try:
        # BUG FIX: with no prompts the original averaged zero maps
        # (np.mean of an empty array -> NaN warning/garbage downstream).
        if not target_prompts:
            audit["gradcam_error"] = "No target prompts provided"
            return {"map": None, "audit": audit}

        # Hoisted out of the per-prompt loop (was re-imported every iteration).
        from pytorch_grad_cam.utils.model_targets import ClassifierOutputTarget

        # Prepare joint text+image inputs on the model's device.
        inputs = self.processor(text=target_prompts, images=image, padding="max_length", return_tensors="pt")
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        input_ids = inputs.get('input_ids')
        attention_mask = inputs.get('attention_mask')
        # Some processors omit the mask; synthesize an all-ones one.
        if attention_mask is None and input_ids is not None:
            attention_mask = torch.ones_like(input_ids)

        # Wrapper exposes logits_per_image as a classifier head for Grad-CAM.
        model_wrapper_cam = HuggingFaceWeirdCLIPWrapper(self.model, input_ids, attention_mask)

        # 2nd-to-last encoder layer (SigLIP: model.vision_model.encoder.layers).
        target_layers = [self.model.vision_model.encoder.layers[-2].layer_norm1]

        cam = GradCAMPlusPlus(
            model=model_wrapper_cam,
            target_layers=target_layers,
            reshape_transform=reshape_transform  # maps (B, T, D) tokens to (B, D, H, W)
        )

        pixel_values = inputs.get('pixel_values')

        # Ensembling: one CAM per prompt index (its column in the logits),
        # then average. CAM output is (batch, H, W); batch is 1 here.
        maps = []
        for prompt_idx in range(len(target_prompts)):
            targets = [ClassifierOutputTarget(prompt_idx)]
            grayscale_cam = cam(input_tensor=pixel_values, targets=targets)
            maps.append(grayscale_cam[0, :])

        avg_cam = np.mean(np.array(maps), axis=0)

        # Light smoothing for a less blocky overlay.
        avg_cam = cv2.GaussianBlur(avg_cam, (13, 13), 0)

        return {"map": avg_cam, "audit": audit}

    except Exception as e:
        # Best-effort: explainability failures degrade to "no map", never crash.
        logger.error(f"Grad-CAM Failed: {e}")
        audit["gradcam_error"] = str(e)
        return {"map": None, "audit": audit}
282
+
283
def explain(self, image: Image.Image, target_text: str, anatomical_context: str) -> Dict[str, Any]:
    """
    Final Expert Fusion Pipeline: anatomy mask x Grad-CAM -> audited heatmap.

    Args:
        image: input PIL image.
        target_text: pathology prompt driving the Grad-CAM attention.
        anatomical_context: free-text organ/region used to select the expert config.

    Returns:
        Dict with heatmap_array (RGB overlay or None), heatmap_raw (constrained
        0..1 map or None), reliability_score, confidence_label
        ("HIGH"/"LOW"/"UNSAFE") and the accumulated audit trail.
    """
    # 0. Setup
    config = self._get_expert_config(anatomical_context)

    # 1. Anatomical Mask (Strict Constraint)
    seg_res = self.generate_expert_mask(image, config)
    mask = seg_res["mask"]
    audit = seg_res["audit"]

    if mask is None:
        # Strict Safety: no explanation at all if segmentation fails.
        return {"heatmap_array": None, "heatmap_raw": None, "reliability_score": 0.0, "confidence_label": "UNSAFE", "audit": audit}

    # 2. Attention Map. Single prompt for now; automatic synonym expansion
    # of target_text is future work.
    gradcam_res = self.generate_expert_gradcam(image, [target_text])
    heatmap = gradcam_res["map"]
    audit.update(gradcam_res["audit"])

    if heatmap is None:
        return {"heatmap_array": None, "heatmap_raw": None, "reliability_score": 0.0, "confidence_label": "LOW", "audit": audit}

    # 3. Constraint Fusion: zero out attention falling outside the anatomy.
    if mask.shape != heatmap.shape:
        mask = cv2.resize(mask, (heatmap.shape[1], heatmap.shape[0]))

    final_map = heatmap * mask

    # 4. Reliability = fraction of attention energy retained inside the mask.
    total = np.sum(heatmap) + 1e-8  # epsilon avoids 0/0 on an all-zero CAM
    retained = np.sum(final_map)
    reliability = retained / total

    confidence = "HIGH" if reliability > 0.6 else "LOW"  # >60% inside anatomy
    audit["reliability_score"] = round(reliability, 4)

    # 5. Visualize: base image normalized to [0,1] as show_cam_on_image expects.
    img_np = np.array(image).astype(np.float32)
    img_range = img_np.max() - img_np.min()
    # BUG FIX: the original divided by (max - min) unguarded, producing NaNs
    # for a constant (e.g. all-black) image; fall back to a zero canvas.
    if img_range > 0:
        img_np = (img_np - img_np.min()) / img_range
    else:
        img_np = np.zeros_like(img_np)
    visualization = show_cam_on_image(img_np, final_map, use_rgb=True)

    return {
        "heatmap_array": visualization,
        "heatmap_raw": final_map,
        "reliability_score": round(reliability, 2),
        "confidence_label": confidence,
        "audit": audit
    }