Lab-Rasool
/

sybil

@@ -1,140 +1,298 @@
 """
-Simplified Hugging Face wrapper for original Sybil model
-This ensures full compatibility with the original implementation
 """
 import os
-import sys
 import json
 import torch
-import torch.nn as nn
-from typing import Optional, List, Dict
-from transformers import PreTrainedModel
 from dataclasses import dataclass
 from transformers.modeling_outputs import BaseModelOutput
-# Add original Sybil to path
-sys.path.append('/mnt/f/Projects/hfsybil/Sybil')
-from sybil import Sybil as OriginalSybil
-from sybil import Serie
 try:
     from .configuration_sybil import SybilConfig
 except ImportError:
     from configuration_sybil import SybilConfig
 @dataclass
 class SybilOutput(BaseModelOutput):
     """
-    Output class for Sybil model.
     """
     risk_scores: torch.FloatTensor = None
     attentions: Optional[Dict] = None
-class SybilHFWrapper(PreTrainedModel):
     """
-    Hugging Face wrapper around the original Sybil model.
-    This ensures complete compatibility while providing HF interface.
     """
-    config_class = SybilConfig
-    base_model_prefix = "sybil"
-    def __init__(self, config: SybilConfig):
-        super().__init__(config)
-        self.config = config
-        # Load the original Sybil model with ensemble
-        checkpoint_dir = "/mnt/f/Projects/hfsybil/checkpoints"
-        # Copy checkpoints to ~/.sybil if needed
-        cache_dir = os.path.expanduser("~/.sybil")
-        os.makedirs(cache_dir, exist_ok=True)
-        # Map of checkpoint files
-        checkpoint_files = {
-            "28a7cd44f5bcd3e6cc760b65c7e0d54d.ckpt": "sybil_1",
-            "56ce1a7d241dc342982f5466c4a9d7ef.ckpt": "sybil_2",
-            "624407ef8e3a2a009f9fa51f9846fe9a.ckpt": "sybil_3",
-            "64a91b25f84141d32852e75a3aec7305.ckpt": "sybil_4",
-            "65fd1f04cb4c5847d86a9ed8ba31ac1a.ckpt": "sybil_5",
-            "sybil_ensemble_simple_calibrator.json": "ensemble_calibrator"
-        }
-        # Copy checkpoint files
-        for filename in checkpoint_files.keys():
-            src = os.path.join(checkpoint_dir, filename)
-            dst = os.path.join(cache_dir, filename)
-            if os.path.exists(src) and not os.path.exists(dst):
-                import shutil
-                shutil.copy2(src, dst)
-        # Initialize the original model
-        self.sybil_model = OriginalSybil("sybil_ensemble")
-    def forward(
-        self,
-        pixel_values: torch.FloatTensor = None,
-        dicom_paths: List[str] = None,
-        return_attentions: bool = False,
-        **kwargs
-    ) -> SybilOutput:
         """
-        Forward pass using original Sybil model.
         Args:
-            pixel_values: Pre-processed tensor (not used directly, for compatibility)
-            dicom_paths: List of DICOM file paths
             return_attentions: Whether to return attention maps
         Returns:
-            SybilOutput with risk scores and optional attentions
         """
-        if dicom_paths is None:
-            raise ValueError("dicom_paths must be provided")
-        # Create Serie object
-        serie = Serie(dicom_paths)
-        # Run prediction
-        prediction = self.sybil_model.predict([serie], return_attentions=return_attentions)
-        # Convert to torch tensors
-        risk_scores = torch.tensor(prediction.scores[0])
-        return SybilOutput(
-            risk_scores=risk_scores,
-            attentions=prediction.attentions[0] if return_attentions else None
-        )
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
         """
-        Load the model. Since we're using the original Sybil,
-        we just need to ensure the checkpoints are available.
         """
-        config = kwargs.pop("config", None)
-        if config is None:
-            config = SybilConfig.from_pretrained(pretrained_model_name_or_path)
-        return cls(config)
-    def save_pretrained(self, save_directory, **kwargs):
         """
-        Save the model configuration.
-        The actual model weights are handled by the original Sybil.
         """
-        os.makedirs(save_directory, exist_ok=True)
-        self.config.save_pretrained(save_directory)
-        # Save info about checkpoint locations
-        info = {
-            "model_type": "sybil_wrapper",
-            "checkpoint_dir": "/mnt/f/Projects/hfsybil/checkpoints",
-            "note": "This model uses the original Sybil implementation"
-        }
-        with open(os.path.join(save_directory, "model_info.json"), "w") as f:
-            json.dump(info, f, indent=2)

 """
+Self-contained Hugging Face wrapper for Sybil lung cancer risk prediction model.
+This version works directly from HF without requiring external Sybil package.
 """
 import os
 import json
+import sys
 import torch
+import numpy as np
+from typing import List, Dict, Optional
 from dataclasses import dataclass
 from transformers.modeling_outputs import BaseModelOutput
+from safetensors.torch import load_file
+# Add model path to sys.path for imports
+current_dir = os.path.dirname(os.path.abspath(__file__))
+if current_dir not in sys.path:
+    sys.path.insert(0, current_dir)
 try:
     from .configuration_sybil import SybilConfig
+    from .modeling_sybil import SybilForRiskPrediction
+    from .image_processing_sybil import SybilImageProcessor
 except ImportError:
     from configuration_sybil import SybilConfig
+    from modeling_sybil import SybilForRiskPrediction
+    from image_processing_sybil import SybilImageProcessor
 @dataclass
 class SybilOutput(BaseModelOutput):
     """
+    Output container for Sybil model predictions.
+    Extends ``BaseModelOutput`` with a ``risk_scores`` field and an
+    ``attentions`` field typed as an optional dict.
+    Args:
+        risk_scores: Risk scores for each year (1-6 years by default)
+        attentions: Optional attention maps if requested; when produced by
+            ``SybilHFWrapper.predict`` this is a dict with the single key
+            ``"image_attention"``, otherwise ``None``
     """
+    # Both fields default to None so an output can be built with only the
+    # values that were actually computed.
     risk_scores: torch.FloatTensor = None
     attentions: Optional[Dict] = None
+class SybilHFWrapper:
+    """
+    Hugging Face wrapper for the Sybil ensemble model.
+    Builds ``SybilForRiskPrediction`` members from weight files stored next to
+    this module, averages their risk scores and optionally calibrates the
+    average. This is a plain Python class: call the instance or use
+    :meth:`predict` to run inference on DICOM files.
+    """
+    # Ensemble members are looked up as ``sybil_1`` ... ``sybil_<N>``.
+    _ENSEMBLE_SIZE = 5
+    def __init__(self, config: SybilConfig = None):
+        """
+        Initialize the Sybil model ensemble.
+        Args:
+            config: Model configuration; a default ``SybilConfig()`` is used
+                when omitted.
+        Raises:
+            ValueError: If no ensemble member could be loaded.
+        """
+        self.config = config if config is not None else SybilConfig()
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        # Weights and calibrator files are resolved relative to this file.
+        self.model_dir = os.path.dirname(os.path.abspath(__file__))
+        self.image_processor = SybilImageProcessor()
+        self.calibrator = self._load_calibrator()
+        self.models = self._load_ensemble_models()
+    def _load_calibrator(self) -> Dict:
+        """
+        Load the ensemble calibrator data.
+        Looks for ``checkpoints/sybil_ensemble_simple_calibrator.json`` first
+        and ``calibrator_data.json`` second, both under ``self.model_dir``.
+        Returns:
+            The parsed JSON of the first file found, or an empty dict when
+            neither exists (calibration is then skipped).
+        """
+        candidates = (
+            os.path.join(self.model_dir, "checkpoints", "sybil_ensemble_simple_calibrator.json"),
+            os.path.join(self.model_dir, "calibrator_data.json"),
+        )
+        for calibrator_path in candidates:
+            if os.path.exists(calibrator_path):
+                with open(calibrator_path, 'r', encoding="utf-8") as f:
+                    return json.load(f)
+        return {}
+    def _load_weights(self, model: torch.nn.Module, state_dict: Dict, label: str) -> None:
+        """
+        Load ``state_dict`` into ``model`` non-strictly and report key mismatches.
+        ``strict=False`` tolerates missing/unexpected keys, which would
+        otherwise leave layers at their initial values without any notice.
+        """
+        result = model.load_state_dict(state_dict, strict=False)
+        missing = getattr(result, "missing_keys", None) or []
+        unexpected = getattr(result, "unexpected_keys", None) or []
+        if missing or unexpected:
+            print(
+                f"Warning: {label} loaded with {len(missing)} missing and "
+                f"{len(unexpected)} unexpected keys"
+            )
+    def _load_ensemble_models(self) -> List[torch.nn.Module]:
+        """
+        Load all available ensemble members.
+        For each index ``i`` the loader prefers ``sybil_{i}/model.safetensors``
+        and falls back to ``checkpoints/sybil_{i}.ckpt``; members with neither
+        file are skipped. A safetensors file that fails to load is reported
+        and skipped, whereas a failing ``.ckpt`` load propagates its error.
+        Returns:
+            The loaded models, moved to ``self.device`` and set to eval mode.
+        Raises:
+            ValueError: If no member could be loaded at all.
+        """
+        models = []
+        for i in range(1, self._ENSEMBLE_SIZE + 1):
+            weights_path = os.path.join(self.model_dir, f"sybil_{i}", "model.safetensors")
+            checkpoint_path = os.path.join(self.model_dir, "checkpoints", f"sybil_{i}.ckpt")
+            if os.path.exists(weights_path):
+                model = SybilForRiskPrediction(self.config)
+                try:
+                    self._load_weights(model, load_file(weights_path), f"sybil_{i}")
+                except Exception as e:
+                    print(f"Warning: Could not load weights for sybil_{i}: {e}")
+                    continue
+            elif os.path.exists(checkpoint_path):
+                model = SybilForRiskPrediction(self.config)
+                # NOTE(security): torch.load unpickles the file and can run
+                # arbitrary code; only use checkpoints from a trusted source
+                # (consider ``weights_only=True`` where the torch version and
+                # checkpoint contents allow it).
+                checkpoint = torch.load(checkpoint_path, map_location='cpu')
+                state_dict = checkpoint['state_dict'] if 'state_dict' in checkpoint else checkpoint
+                # Checkpoint keys may carry a 'model.' prefix that the
+                # HF model's parameter names do not have.
+                prefix = 'model.'
+                cleaned_state_dict = {
+                    (k[len(prefix):] if k.startswith(prefix) else k): v
+                    for k, v in state_dict.items()
+                }
+                self._load_weights(model, cleaned_state_dict, f"sybil_{i}")
+            else:
+                continue
+            model.to(self.device)
+            model.eval()
+            models.append(model)
+        if not models:
+            raise ValueError("No models could be loaded from the ensemble. Please ensure model files are present.")
+        print(f"Loaded {len(models)} models in ensemble")
+        return models
+    def _apply_calibration(self, scores: np.ndarray) -> np.ndarray:
+        """
+        Apply per-year calibration to raw ensemble scores.
+        For column ``year`` the calibrator entry ``"Year{year + 1}"`` is used
+        (its first element when the entry is a non-empty list). When that
+        entry has ``coef`` and ``intercept`` the column becomes
+        ``sigmoid(score * coef + intercept)``; otherwise it is left unchanged.
+        Args:
+            scores: 2-D array of raw risk scores indexed ``[sample, year]``.
+        Returns:
+            Calibrated scores with the shape and dtype of ``scores``; the
+            input itself is returned when no calibrator is loaded.
+        """
+        if not self.calibrator:
+            return scores
+        # Start from a copy so uncalibrated years simply keep their raw value.
+        calibrated = np.array(scores, copy=True)
+        for year in range(scores.shape[1]):
+            cal_data = self.calibrator.get(f"Year{year + 1}")
+            if isinstance(cal_data, list) and len(cal_data) > 0:
+                cal_data = cal_data[0]
+            if not (isinstance(cal_data, dict) and "coef" in cal_data and "intercept" in cal_data):
+                continue
+            # np.ravel accepts a scalar, a flat list or a nested list, so the
+            # first coefficient is found whatever nesting the JSON uses.
+            coef = float(np.ravel(cal_data["coef"])[0])
+            intercept = float(np.ravel(cal_data["intercept"])[0])
+            linear = scores[:, year] * coef + intercept
+            calibrated[:, year] = 1 / (1 + np.exp(-linear))  # Sigmoid
+        return calibrated
+    def preprocess_dicom(self, dicom_paths: List[str]) -> torch.Tensor:
+        """
+        Preprocess DICOM files for model input.
+        Args:
+            dicom_paths: List of paths to DICOM files
+        Returns:
+            The processor's ``pixel_values`` on ``self.device``; a 4-D result
+            gets a leading batch dimension added.
+        """
+        result = self.image_processor(dicom_paths, file_type="dicom", return_tensors="pt")
+        pixel_values = result["pixel_values"]
+        # Ensure we have 5D tensor (B, C, D, H, W)
+        if pixel_values.ndim == 4:
+            pixel_values = pixel_values.unsqueeze(0)
+        return pixel_values.to(self.device)
+    def predict(self, dicom_paths: List[str], return_attentions: bool = False) -> SybilOutput:
+        """
+        Run prediction on a CT scan series.
+        Every ensemble member is evaluated under ``torch.no_grad()``; the
+        member scores are averaged and then passed through calibration.
+        Args:
+            dicom_paths: List of paths to DICOM files for a single CT series
+            return_attentions: Whether to return attention maps
+        Returns:
+            SybilOutput with ``risk_scores`` (float tensor on CPU) and, when
+            requested and provided by the models, ``attentions`` holding the
+            member-averaged ``"image_attention"``.
+        Raises:
+            ValueError: If ``dicom_paths`` is ``None`` or empty.
+        """
+        if dicom_paths is None or len(dicom_paths) == 0:
+            raise ValueError("dicom_paths must contain at least one DICOM file path")
+        pixel_values = self.preprocess_dicom(dicom_paths)
+        all_predictions = []
+        all_attentions = []
+        with torch.no_grad():
+            for model in self.models:
+                output = model(
+                    pixel_values=pixel_values,
+                    return_attentions=return_attentions
+                )
+                if hasattr(output, 'risk_scores'):
+                    predictions = output.risk_scores
+                else:
+                    predictions = output[0] if isinstance(output, tuple) else output
+                all_predictions.append(predictions.cpu().numpy())
+                # The attribute can exist but be None; stacking None would fail.
+                attention = getattr(output, 'image_attention', None)
+                if return_attentions and attention is not None:
+                    all_attentions.append(attention)
+        ensemble_pred = np.mean(all_predictions, axis=0)
+        calibrated_pred = self._apply_calibration(ensemble_pred)
+        risk_scores = torch.from_numpy(calibrated_pred).float()
+        attentions = None
+        if return_attentions and all_attentions:
+            attentions = {"image_attention": torch.stack(all_attentions).mean(dim=0)}
+        return SybilOutput(risk_scores=risk_scores, attentions=attentions)
+    def __call__(self, dicom_paths: List[str] = None, dicom_series: List[List[str]] = None, **kwargs) -> SybilOutput:
+        """
+        Convenience method for prediction.
+        ``dicom_series`` takes precedence when both arguments are given. In
+        batch mode only the stacked risk scores are returned; attention maps
+        are not included in the result.
+        Args:
+            dicom_paths: List of DICOM file paths for a single series
+            dicom_series: List of lists of DICOM paths for batch processing
+            **kwargs: Additional arguments passed to predict()
+        Returns:
+            SybilOutput with predictions
+        Raises:
+            ValueError: If neither argument is provided or ``dicom_series``
+                is empty.
+        """
+        if dicom_series is not None:
+            if len(dicom_series) == 0:
+                raise ValueError("dicom_series must contain at least one series")
+            per_series = [self.predict(paths, **kwargs).risk_scores for paths in dicom_series]
+            return SybilOutput(risk_scores=torch.stack(per_series))
+        if dicom_paths is not None:
+            return self.predict(dicom_paths, **kwargs)
+        raise ValueError("Either dicom_paths or dicom_series must be provided")
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs):
+        """
+        Create a wrapper using the configuration found at the given location.
+        Only the configuration is read from ``pretrained_model_name_or_path``;
+        weights are always loaded from the directory containing this module.
+        Args:
+            pretrained_model_name_or_path: HF model ID or local path
+            **kwargs: ``config`` may be passed to skip loading it; any other
+                keyword arguments are currently ignored
+        Returns:
+            SybilHFWrapper instance
+        """
+        config = kwargs.pop("config", None)
+        if config is None:
+            try:
+                config = SybilConfig.from_pretrained(pretrained_model_name_or_path)
+            except Exception as e:
+                # Best-effort fallback, but never swallow KeyboardInterrupt /
+                # SystemExit and always say why the default is being used.
+                print(
+                    f"Warning: Could not load config from {pretrained_model_name_or_path}: {e}. "
+                    "Falling back to default SybilConfig."
+                )
+                config = SybilConfig()
+        return cls(config=config)