phronetic-ai
/

owlet-phi-2-audio

@@ -615,7 +615,7 @@ class WhisperAudioTower(nn.Module):
         if self.is_loaded:
             return
-        self.audio_tower = WhisperModel.from_pretrained(self.audio_tower_name)
         self.audio_tower.requires_grad_(False)
         self.audio_tower.eval()
@@ -2627,10 +2627,8 @@ class BunnyPhiForCausalLM(PhiForCausalLM, BunnyMetaForCausalLM):
         audio_tower = self.get_audio_tower()
         if not audio_tower.is_loaded:
             audio_tower.load_model()
-        audio_tower.to(device='cuda', dtype=torch.float16)
         audio_processor = audio_tower.audio_processor
-        audio_processor.to(device='cuda')
-        features = audio_processor(audio, sampling_rate=16000, return_tensors="pt").input_features  # replace 16k with arg later
         audio_tensor = features.to(self.device, dtype=self.dtype)
         return audio_tensor

         if self.is_loaded:
             return
+        self.audio_tower = WhisperModel.from_pretrained(self.audio_tower_name, torch_dtype=torch.float16)
         self.audio_tower.requires_grad_(False)
         self.audio_tower.eval()
         audio_tower = self.get_audio_tower()
         if not audio_tower.is_loaded:
             audio_tower.load_model()
         audio_processor = audio_tower.audio_processor
+        features = audio_processor(audio, sampling_rate=16000, return_tensors="pt", device='cuda').input_features  # replace 16k with arg later
         audio_tensor = features.to(self.device, dtype=self.dtype)
         return audio_tensor