PatnaikAshish committed on
Commit
b72a5dc
·
verified ·
1 Parent(s): 64dda9e

Upload 5 files

Browse files
Files changed (5) hide show
  1. cli.py +22 -0
  2. core/__init__.py +0 -0
  3. core/cloner.py +119 -0
  4. inference.py +12 -0
  5. requirements.txt +12 -0
cli.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import argparse

from core.cloner import KokoClone


def main():
    """Command-line entry point: parse arguments and run one cloning pass."""
    arg_parser = argparse.ArgumentParser(description="KokoClone: Zero-Shot Multilingual Voice Cloning")
    arg_parser.add_argument("--text", type=str, required=True, help="Text to synthesize")
    arg_parser.add_argument("--lang", type=str, default="en", help="Language code (en, hi, fr, ja, zh, it, pt, es)")
    arg_parser.add_argument("--ref", type=str, required=True, help="Path to reference audio file (.wav)")
    arg_parser.add_argument("--out", type=str, default="output.wav", help="Output file path (.wav)")
    opts = arg_parser.parse_args()

    # Build the cloner and synthesize directly from the parsed options.
    KokoClone().generate(
        text=opts.text,
        lang=opts.lang,
        reference_audio=opts.ref,
        output_path=opts.out,
    )


if __name__ == "__main__":
    main()
core/__init__.py ADDED
File without changes
core/cloner.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import tempfile
3
+ import torch
4
+ import soundfile as sf
5
+ from huggingface_hub import hf_hub_download
6
+ from kanade_tokenizer import KanadeModel, load_audio, load_vocoder, vocode
7
+ from kokoro_onnx import Kokoro
8
+ from misaki import espeak
9
+ from misaki.espeak import EspeakG2P
10
+
11
class KokoClone:
    """Zero-shot multilingual voice cloning.

    Pipeline: Kokoro ONNX TTS renders the text in a stock voice, then the
    Kanade voice-conversion model re-voices it to match a reference clip.
    """

    def __init__(self, kanade_model="frothywater/kanade-25hz-clean", hf_repo="PatnaikAshish/kokoclone"):
        # Auto-detect GPU (CUDA) or fall back to CPU.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Initializing KokoClone on: {self.device.type.upper()}")

        # Hugging Face repo that hosts the Kokoro model/voice files.
        self.hf_repo = hf_repo

        # Load Kanade & vocoder once, move to the detected device.
        print("Loading Kanade Voice Conversion model...")
        self.kanade = KanadeModel.from_pretrained(kanade_model).to(self.device).eval()
        self.vocoder = load_vocoder(self.kanade.config.vocoder_name).to(self.device)
        self.sample_rate = self.kanade.config.sample_rate

        # One Kokoro instance per ONNX model file, populated lazily in generate().
        self.kokoro_cache = {}

        # Initialize fallback (Misaki handles this globally in the background).
        self.fallback = espeak.EspeakFallback(british=False)

    def _ensure_file(self, folder, filename):
        """Return ``folder/filename``, auto-downloading it from ``self.hf_repo`` if missing."""
        filepath = os.path.join(folder, filename)

        if not os.path.exists(filepath):
            # FIX: the repo path must include the actual filename; previously a
            # placeholder string was requested instead, so every cache miss
            # asked the Hub for a nonexistent file.
            repo_filepath = f"{folder}/{filename}"
            print(f"Downloading missing file '{filename}' from {self.hf_repo}...")
            hf_hub_download(
                repo_id=self.hf_repo,
                filename=repo_filepath,
                local_dir="."  # Downloads securely into local ./model or ./voice
            )
        return filepath

    def _get_config(self, lang):
        """Route the correct model, voices file, vocab, G2P engine and voice for *lang*.

        Returns ``(model_file, voices_file, vocab, g2p, voice)``.
        Raises ``ValueError`` for unsupported language codes.
        """
        model_file = self._ensure_file("model", "kokoro.onnx")
        voices_file = self._ensure_file("voice", "voices-v1.0.bin")
        vocab = None
        g2p = None

        # G2P entries are factories so that only the requested language's
        # engine is actually constructed (the original built all of them on
        # every call).
        routes = {
            "en": (None, "af_bella"),
            "hi": (lambda: EspeakG2P(language="hi"), "hf_alpha"),
            "fr": (lambda: EspeakG2P(language="fr-fr"), "ff_siwis"),
            "it": (lambda: EspeakG2P(language="it"), "im_nicola"),
            # NOTE(review): "es" reuses the Italian voice "im_nicola" —
            # confirm this is intentional rather than a copy-paste slip.
            "es": (lambda: EspeakG2P(language="es"), "im_nicola"),
            "pt": (lambda: EspeakG2P(language="pt-br"), "pf_dora"),
        }

        if lang in routes:
            factory, voice = routes[lang]
            g2p = factory() if factory else None
        elif lang == "ja":
            from misaki import ja
            g2p = ja.JAG2P()
            voice = "jf_alpha"
            vocab = self._ensure_file("model", "config.json")
        elif lang == "zh":
            from misaki import zh
            g2p = zh.ZHG2P(version="1.1")
            voice = "zf_001"
            model_file = self._ensure_file("model", "kokoro-v1.1-zh.onnx")
            voices_file = self._ensure_file("voice", "voices-v1.1-zh.bin")
            vocab = self._ensure_file("model", "config.json")
        else:
            raise ValueError(f"Language '{lang}' not supported.")

        return model_file, voices_file, vocab, g2p, voice

    def generate(self, text, lang, reference_audio, output_path="output.wav"):
        """Synthesize *text* in *lang* and re-voice it to match *reference_audio*.

        Writes the converted audio to *output_path*; returns None.
        """
        model_file, voices_file, vocab, g2p, voice = self._get_config(lang)

        # 1. Kokoro TTS phase — reuse a cached Kokoro instance per model file.
        if model_file not in self.kokoro_cache:
            self.kokoro_cache[model_file] = (
                Kokoro(model_file, voices_file, vocab_config=vocab)
                if vocab
                else Kokoro(model_file, voices_file)
            )
        kokoro = self.kokoro_cache[model_file]

        print(f"Synthesizing text ({lang.upper()})...")
        if g2p:
            phonemes, _ = g2p(text)
            samples, sr = kokoro.create(phonemes, voice=voice, speed=1.0, is_phonemes=True)
        else:
            samples, sr = kokoro.create(text, voice=voice, speed=0.9, lang="en-us")

        # Use a secure temporary file for the base (pre-conversion) audio;
        # write after the handle is closed so this also works on Windows.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
            temp_path = temp_audio.name
        sf.write(temp_path, samples, sr)

        # 2. Kanade voice-conversion phase.
        try:
            print("Applying Voice Clone...")
            # Load both waveforms and push them to the compute device.
            source_wav = load_audio(temp_path, sample_rate=self.sample_rate).to(self.device)
            ref_wav = load_audio(reference_audio, sample_rate=self.sample_rate).to(self.device)

            with torch.inference_mode():
                converted_mel = self.kanade.voice_conversion(source_waveform=source_wav, reference_waveform=ref_wav)
                converted_wav = vocode(self.vocoder, converted_mel.unsqueeze(0))

            sf.write(output_path, converted_wav.squeeze().cpu().numpy(), self.sample_rate)
            print(f"Success! Saved: {output_path}")

        finally:
            os.remove(temp_path)  # Clean up temp file silently
inference.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
from core.cloner import KokoClone


def main():
    """Demo: clone a reference voice onto English text and save the result."""
    # Initialize the cloner (auto-downloads models if missing, and auto-detects CPU/GPU).
    # Wrapped in main() + __main__ guard so importing this module does not
    # trigger model downloads and synthesis as a side effect.
    cloner = KokoClone()

    # Generate your cloned audio!
    cloner.generate(
        text="Welcome to KokoClone! This is incredibly easy to use.",
        lang="en",
        reference_audio="ss.wav",  # Replace with your actual reference audio file
        output_path="english_output.wav",
    )


if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core Audio & AI Libraries
2
+ torch
3
+ torchaudio
4
+ soundfile
5
+ huggingface_hub
6
+
7
+ # Text-to-Speech (Kokoro)
8
+ kokoro-onnx
9
+ misaki-fork[en,ja,zh]
10
+
11
+ # Voice Conversion (Kanade)
12
+ git+https://github.com/frothywater/kanade-tokenizer