hareeshbabu82
/

TeluguIndicF5

Hareesh Polla committed on Jun 20, 2025

Commit

a60c006

1 Parent(s): 4cb53db

add MPS support for Apple Silicon

Files changed (1) hide show

model.py CHANGED Viewed

@@ -37,10 +37,10 @@ class INF5Model(PreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
-        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         # Load vocoder
-        self.vocoder = torch.compile(load_vocoder(vocoder_name="vocos", is_local=False, device=device))
         # Download and load model weights
         # safetensors_path = hf_hub_download(config.name_or_path, filename="model.safetensors")
@@ -55,7 +55,7 @@ class INF5Model(PreTrainedModel):
                 dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4),
                 mel_spec_type="vocos",
                 vocab_file=vocab_path,
-                device=device
             )
         )
@@ -83,8 +83,8 @@ class INF5Model(PreTrainedModel):
         ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_path, ref_text)
-        self.ema_model.to(self.device)
-        self.vocoder.to(self.device)
         # Perform inference
         audio, final_sample_rate, _ = infer_process(
@@ -95,7 +95,7 @@ class INF5Model(PreTrainedModel):
             self.vocoder,
             mel_spec_type="vocos",
             speed=self.config.speed,
-            device=self.device,
         )
         # Convert to pydub format and remove silence if needed

     def __init__(self, config):
         super().__init__(config)
+        self._device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
         # Load vocoder
+        self.vocoder = torch.compile(load_vocoder(vocoder_name="vocos", is_local=False, device=self._device))
         # Download and load model weights
         # safetensors_path = hf_hub_download(config.name_or_path, filename="model.safetensors")
                 dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4),
                 mel_spec_type="vocos",
                 vocab_file=vocab_path,
+                device=self._device
             )
         )
         ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_path, ref_text)
+        self.ema_model.to(self._device)
+        self.vocoder.to(self._device)
         # Perform inference
         audio, final_sample_rate, _ = infer_process(
             self.vocoder,
             mel_spec_type="vocos",
             speed=self.config.speed,
+            device=self._device,
         )
         # Convert to pydub format and remove silence if needed