Spaces:

codewithjarair
/

Kokoro_TTS

Sleeping

App Files Community

codewithjarair committed on Feb 15

Commit

8973227

verified ·

1 Parent(s): 722bcf4

Update kokoro_engine.py

Browse files

Files changed (1) hide show

kokoro_engine.py +29 -20

kokoro_engine.py CHANGED Viewed

@@ -1,5 +1,5 @@
 import torch
-from kokoro import KModel
 import numpy as np
 import os
@@ -7,7 +7,11 @@ class KokoroEngine:
     def __init__(self, model_path="hexgrad/Kokoro-82M", device=None):
         self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
         print(f"Initializing KokoroEngine on {self.device}...")
-        self.model = KModel(model_path).to(self.device).eval()
         # Available voices categorized
         self.voices = {
@@ -22,6 +26,14 @@ class KokoroEngine:
             "Portuguese": ["pf_dora", "pm_alex"]
         }
     def get_voice_list(self):
         all_voices = []
         for category in self.voices.values():
@@ -31,23 +43,20 @@ class KokoroEngine:
     def generate(self, text, voice="af_heart", speed=1.0, lang='a'):
         """
         Generates audio from text using a specified voice.
-        Args:
-            text (str): The text to synthesize.
-            voice (str or torch.Tensor): The voice ID to use or a voice tensor.
-            speed (float): The speed factor (default 1.0).
-            lang (str): Language code (default 'a').
-        Returns:
-            tuple: (audio_numpy, sample_rate)
-        """
-        # If voice is a path to a custom .pt file, load it
-        if isinstance(voice, str) and (voice.endswith(".pt") or voice.endswith(".bin")):
-            if os.path.exists(voice):
-                voice = torch.load(voice, map_location=self.device)
-            else:
-                print(f"Warning: Voice file {voice} not found. Falling back to af_heart.")
-                voice = "af_heart"
-        audio, out_ps = self.model(text, voice=voice, speed=speed, lang=lang)
-        return audio, 24000 # Kokoro standard sample rate is 24k

 import torch
+from kokoro import KModel, KPipeline
 import numpy as np
 import os
     def __init__(self, model_path="hexgrad/Kokoro-82M", device=None):
         self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
         print(f"Initializing KokoroEngine on {self.device}...")
+        # Load the base model
+        self.model = KModel().to(self.device).eval()
+        # Initialize a dictionary to cache pipelines for different languages
+        self.pipelines = {}
         # Available voices categorized
         self.voices = {
             "Portuguese": ["pf_dora", "pm_alex"]
         }
+    def get_pipeline(self, lang_code):
+        """Returns or creates a pipeline for the given language code."""
+        if lang_code not in self.pipelines:
+            print(f"Creating pipeline for language: {lang_code}")
+            # We pass model=self.model to share the underlying weights
+            self.pipelines[lang_code] = KPipeline(lang_code=lang_code, model=self.model, device=self.device)
+        return self.pipelines[lang_code]
     def get_voice_list(self):
         all_voices = []
         for category in self.voices.values():
     def generate(self, text, voice="af_heart", speed=1.0, lang='a'):
         """
         Generates audio from text using a specified voice.
+        """
+        pipeline = self.get_pipeline(lang)
+        # Generator returns (gs, ps, audio)
+        generator = pipeline(text, voice=voice, speed=speed)
+        # Collect all audio segments
+        all_audio = []
+        for gs, ps, audio in generator:
+            if audio is not None:
+                all_audio.append(audio)
+        if not all_audio:
+            return None, 24000
+        final_audio = np.concatenate(all_audio)
+        return final_audio, 24000