1NEYRON1
/

whisper

@@ -769,22 +769,36 @@ class WhisperSSLEnsemble(PreTrainedModel):  # type: ignore
         ssl_ensemble_config = self.config.ssl_ensemble_config
         # Determine if 'weights' is a path or a variant name for whisper.load_model
-        whisper_path_or_variant = whisper_weights_path if whisper_weights_path else whisper_variant
-        logger.info(f"Loading Whisper model: '{whisper_path_or_variant}'...")
-        try:
-            # Pass the _target_device directly to whisper.load_model
-            wm = whisper.load_model(whisper_path_or_variant, device=self._target_device)
-            # with torch.device("cpu"):
-            #     wm = whisper.load_model(whisper_path_or_variant, device='cpu')
-            self.whisper_model = wm  # Assign to self.whisper_model AFTER loading
-            self.whisper_model.eval()
-            self._audio_embedding_dim = self.whisper_model.encoder.ln_post.normalized_shape[0]
-            logger.info(f"  Whisper loaded. Audio embedding dimension: {self._audio_embedding_dim}. Actual Whisper device: {self.whisper_model.device}")
-        except Exception as e:
-            logger.error(f"Error loading Whisper model: {e}")
-            raise RuntimeError(f"Failed to load Whisper model '{whisper_path_or_variant}'") from e
         self.use_text = text_model_type is not None and text_model_type.lower() != "none"
         if self.use_text:
@@ -852,7 +866,28 @@ class WhisperSSLEnsemble(PreTrainedModel):  # type: ignore
         self.to(self._target_device)
         logger.info(f"WhisperSSLEnsemble initialization complete. Final model device: {self.device}")
     def preprocess_audio(self, audios: List[Union[np.ndarray, torch.Tensor]]) -> torch.Tensor:
         processed_mels = []
         # Use self.whisper_model.device as the definitive device for mel spectrograms
         # as whisper.load_model puts its tensors on that device.
@@ -893,6 +928,7 @@ class WhisperSSLEnsemble(PreTrainedModel):  # type: ignore
         return inputs
     def get_embeddings(self, audios: List[Union[np.ndarray, torch.Tensor]], texts: Optional[List[str]] = None) -> tuple:
         if self.use_text and texts is None:
             pass

         ssl_ensemble_config = self.config.ssl_ensemble_config
         # Determine if 'weights' is a path or a variant name for whisper.load_model
+        # whisper_path_or_variant = whisper_weights_path if whisper_weights_path else whisper_variant
+        # logger.info(f"Loading Whisper model: '{whisper_path_or_variant}'...")
+        # try:
+        #     # Pass the _target_device directly to whisper.load_model
+        #     wm = whisper.load_model(whisper_path_or_variant, device=self._target_device)
+        #     # with torch.device("cpu"):
+        #     #     wm = whisper.load_model(whisper_path_or_variant, device='cpu')
+        #     self.whisper_model = wm  # Assign to self.whisper_model AFTER loading
+        #     self.whisper_model.eval()
+        #     self._audio_embedding_dim = self.whisper_model.encoder.ln_post.normalized_shape[0]
+        #     logger.info(f"  Whisper loaded. Audio embedding dimension: {self._audio_embedding_dim}. Actual Whisper device: {self.whisper_model.device}")
+        # except Exception as e:
+        #     logger.error(f"Error loading Whisper model: {e}")
+        #     raise RuntimeError(f"Failed to load Whisper model '{whisper_path_or_variant}'") from e
+        self.whisper_model = None
+        # Embedding dimensions must now be specified explicitly in the config,
+        # because we cannot read them from a model that has not been loaded yet.
+        # Make sure your WhisperSSLEnsembleConfig has these fields.
+        if not hasattr(self.config, 'whisper_embedding_dim'):
+             raise ValueError("config.json must contain 'whisper_embedding_dim'")
+        if not hasattr(self.config, 'text_embedding_dim'):
+             raise ValueError("config.json must contain 'text_embedding_dim'")
+        self._audio_embedding_dim = self.config.whisper_embedding_dim
+        self._text_embedding_dim = 0 # Будет обновлено ниже, если есть текстовая модель
+        text_model_type = self.config.text_model_type
         self.use_text = text_model_type is not None and text_model_type.lower() != "none"
         if self.use_text:
         self.to(self._target_device)
         logger.info(f"WhisperSSLEnsemble initialization complete. Final model device: {self.device}")
+    def _load_whisper_if_needed(self):
+        """
+        Lazily loads the whisper model on the first call to a method that needs it.
+        This avoids the 'meta' device conflict during __init__.
+
+        If self.whisper_model is already set (not None), this returns
+        immediately. Otherwise the model is loaded via whisper.load_model
+        onto self.device and switched to eval mode.
+
+        Raises:
+            RuntimeError: if any exception occurs while loading; the original
+                exception is chained as the cause.
+        """
+        # Already loaded: nothing to do.
+        if self.whisper_model is not None:
+            return
+        # NOTE(review): this message names config.whisper_variant, but the load
+        # below prefers config.whisper_weights_path when it is truthy, so the
+        # log may not name what is actually loaded - confirm intent.
+        logger.info(f"Lazily loading Whisper model '{self.config.whisper_variant}' onto device '{self.device}'...")
+        try:
+            # By the time this method is called, the main model is already on its final device (self.device).
+            # We can safely load the Whisper model directly onto that device.
+            whisper_path_or_variant = self.config.whisper_weights_path or self.config.whisper_variant
+            self.whisper_model = whisper.load_model(whisper_path_or_variant, device=self.device)
+            self.whisper_model.eval()
+            logger.info("Whisper model loaded successfully.")
+        except Exception as e:
+            # Log the underlying error, then re-raise as RuntimeError with the cause chained.
+            logger.error(f"Failed to lazily load Whisper model: {e}")
+            raise RuntimeError("Could not initialize the Whisper sub-component.") from e
     def preprocess_audio(self, audios: List[Union[np.ndarray, torch.Tensor]]) -> torch.Tensor:
+        self._load_whisper_if_needed()
         processed_mels = []
         # Use self.whisper_model.device as the definitive device for mel spectrograms
         # as whisper.load_model puts its tensors on that device.
         return inputs
     def get_embeddings(self, audios: List[Union[np.ndarray, torch.Tensor]], texts: Optional[List[str]] = None) -> tuple:
+        self._load_whisper_if_needed()
         if self.use_text and texts is None:
             pass