PatnaikAshish committed on
Commit
b72a5dc
·
verified ·
1 Parent(s): 64dda9e

Upload 5 files

Browse files
Files changed (5) hide show
  1. cli.py +22 -0
  2. core/__init__.py +0 -0
  3. core/cloner.py +119 -0
  4. inference.py +12 -0
  5. requirements.txt +12 -0
cli.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import argparse

from core.cloner import KokoClone


def main():
    """Command-line entry point: parse arguments and run one cloning pass."""
    arg_parser = argparse.ArgumentParser(description="KokoClone: Zero-Shot Multilingual Voice Cloning")
    arg_parser.add_argument("--text", type=str, required=True, help="Text to synthesize")
    arg_parser.add_argument("--lang", type=str, default="en", help="Language code (en, hi, fr, ja, zh, it, pt, es)")
    arg_parser.add_argument("--ref", type=str, required=True, help="Path to reference audio file (.wav)")
    arg_parser.add_argument("--out", type=str, default="output.wav", help="Output file path (.wav)")
    opts = arg_parser.parse_args()

    # Build the cloner and synthesize directly from the parsed options.
    KokoClone().generate(
        text=opts.text,
        lang=opts.lang,
        reference_audio=opts.ref,
        output_path=opts.out,
    )


if __name__ == "__main__":
    main()
core/__init__.py ADDED
File without changes
core/cloner.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import tempfile
3
+ import torch
4
+ import soundfile as sf
5
+ from huggingface_hub import hf_hub_download
6
+ from kanade_tokenizer import KanadeModel, load_audio, load_vocoder, vocode
7
+ from kokoro_onnx import Kokoro
8
+ from misaki import espeak
9
+ from misaki.espeak import EspeakG2P
10
+
11
class KokoClone:
    """Zero-shot multilingual voice cloning.

    Pipeline: Kokoro ONNX TTS renders the text in a stock voice, then the
    Kanade voice-conversion model re-voices it to match a reference clip.
    """

    def __init__(self, kanade_model="frothywater/kanade-25hz-clean", hf_repo="PatnaikAshish/kokoclone"):
        # Auto-detect GPU (CUDA) or fall back to CPU.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Initializing KokoClone on: {self.device.type.upper()}")

        # Hugging Face repo that hosts the Kokoro model/voice files.
        self.hf_repo = hf_repo

        # Load Kanade & vocoder once, move to the detected device.
        print("Loading Kanade Voice Conversion model...")
        self.kanade = KanadeModel.from_pretrained(kanade_model).to(self.device).eval()
        self.vocoder = load_vocoder(self.kanade.config.vocoder_name).to(self.device)
        self.sample_rate = self.kanade.config.sample_rate

        # One Kokoro instance per ONNX model file, populated lazily in generate().
        self.kokoro_cache = {}

        # Initialize fallback (Misaki handles this globally in the background).
        self.fallback = espeak.EspeakFallback(british=False)

    def _ensure_file(self, folder, filename):
        """Return ``folder/filename``, auto-downloading it from ``self.hf_repo`` if missing."""
        filepath = os.path.join(folder, filename)

        if not os.path.exists(filepath):
            # FIX: the repo path must include the actual filename; previously a
            # placeholder string was requested instead, so every cache miss
            # asked the Hub for a nonexistent file.
            repo_filepath = f"{folder}/{filename}"
            print(f"Downloading missing file '{filename}' from {self.hf_repo}...")
            hf_hub_download(
                repo_id=self.hf_repo,
                filename=repo_filepath,
                local_dir="."  # Downloads securely into local ./model or ./voice
            )
        return filepath

    def _get_config(self, lang):
        """Route the correct model, voices file, vocab, G2P engine and voice for *lang*.

        Returns ``(model_file, voices_file, vocab, g2p, voice)``.
        Raises ``ValueError`` for unsupported language codes.
        """
        model_file = self._ensure_file("model", "kokoro.onnx")
        voices_file = self._ensure_file("voice", "voices-v1.0.bin")
        vocab = None
        g2p = None

        # G2P entries are factories so that only the requested language's
        # engine is actually constructed (the original built all of them on
        # every call).
        routes = {
            "en": (None, "af_bella"),
            "hi": (lambda: EspeakG2P(language="hi"), "hf_alpha"),
            "fr": (lambda: EspeakG2P(language="fr-fr"), "ff_siwis"),
            "it": (lambda: EspeakG2P(language="it"), "im_nicola"),
            # NOTE(review): "es" reuses the Italian voice "im_nicola" —
            # confirm this is intentional rather than a copy-paste slip.
            "es": (lambda: EspeakG2P(language="es"), "im_nicola"),
            "pt": (lambda: EspeakG2P(language="pt-br"), "pf_dora"),
        }

        if lang in routes:
            factory, voice = routes[lang]
            g2p = factory() if factory else None
        elif lang == "ja":
            from misaki import ja
            g2p = ja.JAG2P()
            voice = "jf_alpha"
            vocab = self._ensure_file("model", "config.json")
        elif lang == "zh":
            from misaki import zh
            g2p = zh.ZHG2P(version="1.1")
            voice = "zf_001"
            model_file = self._ensure_file("model", "kokoro-v1.1-zh.onnx")
            voices_file = self._ensure_file("voice", "voices-v1.1-zh.bin")
            vocab = self._ensure_file("model", "config.json")
        else:
            raise ValueError(f"Language '{lang}' not supported.")

        return model_file, voices_file, vocab, g2p, voice

    def generate(self, text, lang, reference_audio, output_path="output.wav"):
        """Synthesize *text* in *lang* and re-voice it to match *reference_audio*.

        Writes the converted audio to *output_path*; returns None.
        """
        model_file, voices_file, vocab, g2p, voice = self._get_config(lang)

        # 1. Kokoro TTS phase — reuse a cached Kokoro instance per model file.
        if model_file not in self.kokoro_cache:
            self.kokoro_cache[model_file] = (
                Kokoro(model_file, voices_file, vocab_config=vocab)
                if vocab
                else Kokoro(model_file, voices_file)
            )
        kokoro = self.kokoro_cache[model_file]

        print(f"Synthesizing text ({lang.upper()})...")
        if g2p:
            phonemes, _ = g2p(text)
            samples, sr = kokoro.create(phonemes, voice=voice, speed=1.0, is_phonemes=True)
        else:
            samples, sr = kokoro.create(text, voice=voice, speed=0.9, lang="en-us")

        # Use a secure temporary file for the base (pre-conversion) audio;
        # write after the handle is closed so this also works on Windows.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
            temp_path = temp_audio.name
        sf.write(temp_path, samples, sr)

        # 2. Kanade voice-conversion phase.
        try:
            print("Applying Voice Clone...")
            # Load both waveforms and push them to the compute device.
            source_wav = load_audio(temp_path, sample_rate=self.sample_rate).to(self.device)
            ref_wav = load_audio(reference_audio, sample_rate=self.sample_rate).to(self.device)

            with torch.inference_mode():
                converted_mel = self.kanade.voice_conversion(source_waveform=source_wav, reference_waveform=ref_wav)
                converted_wav = vocode(self.vocoder, converted_mel.unsqueeze(0))

            sf.write(output_path, converted_wav.squeeze().cpu().numpy(), self.sample_rate)
            print(f"Success! Saved: {output_path}")

        finally:
            os.remove(temp_path)  # Clean up temp file silently
inference.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
from core.cloner import KokoClone


def main():
    """Demo: clone a reference voice onto English text and save the result."""
    # Initialize the cloner (auto-downloads models if missing, and auto-detects CPU/GPU).
    # Wrapped in main() + __main__ guard so importing this module does not
    # trigger model downloads and synthesis as a side effect.
    cloner = KokoClone()

    # Generate your cloned audio!
    cloner.generate(
        text="Welcome to KokoClone! This is incredibly easy to use.",
        lang="en",
        reference_audio="ss.wav",  # Replace with your actual reference audio file
        output_path="english_output.wav",
    )


if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core Audio & AI Libraries
2
+ torch
3
+ torchaudio
4
+ soundfile
5
+ huggingface_hub
6
+
7
+ # Text-to-Speech (Kokoro)
8
+ kokoro-onnx
9
+ misaki-fork[en,ja,zh]
10
+
11
+ # Voice Conversion (Kanade)
12
+ git+https://github.com/frothywater/kanade-tokenizer