PatnaikAshish committed on
Commit
bee5d4f
·
verified ·
1 Parent(s): 5cdab36

Update core/cloner.py

Browse files
Files changed (1) hide show
  1. core/cloner.py +130 -119
core/cloner.py CHANGED
@@ -1,119 +1,130 @@
1
- import os
2
- import tempfile
3
- import torch
4
- import soundfile as sf
5
- from huggingface_hub import hf_hub_download
6
- from kanade_tokenizer import KanadeModel, load_audio, load_vocoder, vocode
7
- from kokoro_onnx import Kokoro
8
- from misaki import espeak
9
- from misaki.espeak import EspeakG2P
10
-
11
- class KokoClone:
12
- def __init__(self, kanade_model="frothywater/kanade-25hz-clean", hf_repo="PatnaikAshish/kokoclone"):
13
- # Auto-detect GPU (CUDA) or fallback to CPU
14
- self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
15
- print(f"Initializing KokoClone on: {self.device.type.upper()}")
16
-
17
- self.hf_repo = hf_repo
18
-
19
- # Load Kanade & Vocoder once, move to detected device
20
- print("Loading Kanade Voice Conversion model...")
21
- self.kanade = KanadeModel.from_pretrained(kanade_model).to(self.device).eval()
22
- self.vocoder = load_vocoder(self.kanade.config.vocoder_name).to(self.device)
23
- self.sample_rate = self.kanade.config.sample_rate
24
-
25
- # Cache for Kokoro
26
- self.kokoro_cache = {}
27
-
28
- # Initialize fallback (Misaki handles this globally in the background)
29
- self.fallback = espeak.EspeakFallback(british=False)
30
-
31
- def _ensure_file(self, folder, filename):
32
- """Auto-downloads missing models from your Hugging Face repo."""
33
- filepath = os.path.join(folder, filename)
34
- repo_filepath = f"{folder}/{filename}"
35
-
36
- if not os.path.exists(filepath):
37
- print(f"Downloading missing file '{filename}' from {self.hf_repo}...")
38
- hf_hub_download(
39
- repo_id=self.hf_repo,
40
- filename=repo_filepath,
41
- local_dir="." # Downloads securely into local ./model or ./voice
42
- )
43
- return filepath
44
-
45
- def _get_config(self, lang):
46
- """Routes the correct model, voice, and G2P based on language."""
47
- model_file = self._ensure_file("model", "kokoro.onnx")
48
- voices_file = self._ensure_file("voice", "voices-v1.0.bin")
49
- vocab = None
50
- g2p = None
51
-
52
- # REMOVED the 'fallback=' kwargs here
53
- routes = {
54
- "en": {"voice": "af_bella"},
55
- "hi": {"g2p": EspeakG2P(language="hi"), "voice": "hf_alpha"},
56
- "fr": {"g2p": EspeakG2P(language="fr-fr"), "voice": "ff_siwis"},
57
- "it": {"g2p": EspeakG2P(language="it"), "voice": "im_nicola"},
58
- "es": {"g2p": EspeakG2P(language="es"), "voice": "im_nicola"},
59
- "pt": {"g2p": EspeakG2P(language="pt-br"), "voice": "pf_dora"},
60
- }
61
-
62
- if lang in routes:
63
- g2p = routes[lang].get("g2p")
64
- voice = routes[lang]["voice"]
65
- elif lang == "ja":
66
- from misaki import ja
67
- g2p = ja.JAG2P()
68
- voice = "jf_alpha"
69
- vocab = self._ensure_file("model", "config.json")
70
- elif lang == "zh":
71
- from misaki import zh
72
- g2p = zh.ZHG2P(version="1.1")
73
- voice = "zf_001"
74
- model_file = self._ensure_file("model", "kokoro-v1.1-zh.onnx")
75
- voices_file = self._ensure_file("voice", "voices-v1.1-zh.bin")
76
- vocab = self._ensure_file("model", "config.json")
77
- else:
78
- raise ValueError(f"Language '{lang}' not supported.")
79
-
80
- return model_file, voices_file, vocab, g2p, voice
81
-
82
- def generate(self, text, lang, reference_audio, output_path="output.wav"):
83
- """Generates the speech and applies the target voice."""
84
- model_file, voices_file, vocab, g2p, voice = self._get_config(lang)
85
-
86
- # 1. Kokoro TTS Phase
87
- if model_file not in self.kokoro_cache:
88
- self.kokoro_cache[model_file] = Kokoro(model_file, voices_file, vocab_config=vocab) if vocab else Kokoro(model_file, voices_file)
89
-
90
- kokoro = self.kokoro_cache[model_file]
91
-
92
- print(f"Synthesizing text ({lang.upper()})...")
93
- if g2p:
94
- phonemes, _ = g2p(text)
95
- samples, sr = kokoro.create(phonemes, voice=voice, speed=1.0, is_phonemes=True)
96
- else:
97
- samples, sr = kokoro.create(text, voice=voice, speed=0.9, lang="en-us")
98
-
99
- # Use a secure temporary file for the base audio
100
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
101
- temp_path = temp_audio.name
102
- sf.write(temp_path, samples, sr)
103
-
104
- # 2. Kanade Voice Conversion Phase
105
- try:
106
- print("Applying Voice Clone...")
107
- # Load and push to device
108
- source_wav = load_audio(temp_path, sample_rate=self.sample_rate).to(self.device)
109
- ref_wav = load_audio(reference_audio, sample_rate=self.sample_rate).to(self.device)
110
-
111
- with torch.inference_mode():
112
- converted_mel = self.kanade.voice_conversion(source_waveform=source_wav, reference_waveform=ref_wav)
113
- converted_wav = vocode(self.vocoder, converted_mel.unsqueeze(0))
114
-
115
- sf.write(output_path, converted_wav.squeeze().cpu().numpy(), self.sample_rate)
116
- print(f"Success! Saved: {output_path}")
117
-
118
- finally:
119
- os.remove(temp_path) # Clean up temp file silently
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import tempfile
3
+ import torch
4
+ import soundfile as sf
5
+ from huggingface_hub import hf_hub_download
6
+ from kanade_tokenizer import KanadeModel, load_audio, load_vocoder, vocode
7
+ from kokoro_onnx import Kokoro
8
+ from misaki import espeak
9
+ from misaki.espeak import EspeakG2P
10
+
11
class KokoClone:
    """Text-to-speech with voice cloning: Kokoro synthesis + Kanade conversion.

    ``generate`` first synthesizes the text with a Kokoro ONNX model and then
    passes the result, together with a reference recording, through the Kanade
    voice-conversion model and its vocoder.  Model and voice files that are not
    present locally are downloaded from ``hf_repo`` on demand.
    """

    # lang code -> (espeak language for EspeakG2P, or None to let Kokoro
    # phonemize the raw text itself; Kokoro voice name).
    # NOTE(review): "es" uses the same "im_nicola" voice as "it" -- confirm
    # this is intended rather than a copy/paste slip.
    _ESPEAK_ROUTES = {
        "en": (None, "af_bella"),
        "hi": ("hi", "hf_alpha"),
        "fr": ("fr-fr", "ff_siwis"),
        "it": ("it", "im_nicola"),
        "es": ("es", "im_nicola"),
        "pt": ("pt-br", "pf_dora"),
    }

    def __init__(self, kanade_model="frothywater/kanade-12.5hz", hf_repo="PatnaikAshish/kokoclone"):
        """Load the Kanade model and its vocoder onto the best available device.

        Args:
            kanade_model: Identifier passed to ``KanadeModel.from_pretrained``.
            hf_repo: Hugging Face repo id used by ``_ensure_file`` to fetch
                missing Kokoro model/voice files.
        """
        # Auto-detect GPU (CUDA) or fall back to CPU.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Initializing KokoClone on: {self.device.type.upper()}")

        self.hf_repo = hf_repo

        # Load Kanade and the vocoder once and keep them on the chosen device.
        print("Loading Kanade model...")
        self.kanade = KanadeModel.from_pretrained(kanade_model).to(self.device).eval()
        self.vocoder = load_vocoder(self.kanade.config.vocoder_name).to(self.device)
        self.sample_rate = self.kanade.config.sample_rate

        # Kokoro engines keyed by (model_file, voices_file, vocab); see
        # ``_get_kokoro``.
        self.kokoro_cache = {}

    def _ensure_file(self, folder, filename):
        """Return ``folder/filename``, downloading it from ``self.hf_repo`` if absent.

        Args:
            folder: Local directory, also used as the directory inside the repo.
            filename: File name inside ``folder``.

        Returns:
            The local path ``os.path.join(folder, filename)``.
        """
        filepath = os.path.join(folder, filename)

        if not os.path.exists(filepath):
            print(f"Downloading missing file '{filename}' from {self.hf_repo}...")
            hf_hub_download(
                repo_id=self.hf_repo,
                # Repo paths always use forward slashes, independent of the OS.
                filename=f"{folder}/{filename}",
                local_dir=".",  # mirrors the repo layout under the current directory
            )
        return filepath

    def _get_config(self, lang):
        """Pick the model file, voices file, vocab config, G2P and voice for ``lang``.

        The language is validated and routed *before* any file is ensured, so
        an unsupported language fails immediately and each language only
        fetches the files it actually uses.

        Args:
            lang: One of "en", "hi", "fr", "it", "es", "pt", "ja", "zh".

        Returns:
            Tuple ``(model_file, voices_file, vocab, g2p, voice)`` where
            ``vocab`` is a config path or ``None`` and ``g2p`` is a callable
            G2P object or ``None`` (plain English text path).

        Raises:
            ValueError: If ``lang`` is not supported.
        """
        model_name = "kokoro.onnx"
        voices_name = "voices-v1.0.bin"
        needs_vocab = False
        g2p = None

        if lang in self._ESPEAK_ROUTES:
            espeak_lang, voice = self._ESPEAK_ROUTES[lang]
            if espeak_lang is not None:
                # Only the requested espeak G2P engine is instantiated.
                g2p = EspeakG2P(language=espeak_lang)
        elif lang == "ja":
            import subprocess
            import sys

            import unidic
            from misaki import ja

            # Fetch the Japanese dictionary on first use.  ``sys.executable``
            # guarantees the download runs in the interpreter/environment that
            # has ``unidic`` installed, instead of whatever "python" is on PATH.
            if not os.path.exists(unidic.DICDIR):
                print("Downloading missing Japanese dictionary (this takes a minute but only happens once)...")
                subprocess.run([sys.executable, "-m", "unidic", "download"], check=True)

            g2p = ja.JAG2P()
            voice = "jf_alpha"
            needs_vocab = True
        elif lang == "zh":
            from misaki import zh

            g2p = zh.ZHG2P(version="1.1")
            voice = "zf_001"
            model_name = "kokoro-v1.1-zh.onnx"
            voices_name = "voices-v1.1-zh.bin"
            needs_vocab = True
        else:
            raise ValueError(f"Language '{lang}' not supported.")

        model_file = self._ensure_file("model", model_name)
        voices_file = self._ensure_file("voice", voices_name)
        vocab = self._ensure_file("model", "config.json") if needs_vocab else None

        return model_file, voices_file, vocab, g2p, voice

    def _get_kokoro(self, model_file, voices_file, vocab):
        """Return a cached Kokoro engine for this exact file/vocab combination.

        The cache key includes the voices file and vocab config because the
        same ONNX model file can be requested with different settings (for
        example "en" and "ja" share ``kokoro.onnx`` but differ in vocab).
        """
        cache_key = (model_file, voices_file, vocab)
        if cache_key not in self.kokoro_cache:
            if vocab:
                engine = Kokoro(model_file, voices_file, vocab_config=vocab)
            else:
                engine = Kokoro(model_file, voices_file)
            self.kokoro_cache[cache_key] = engine
        return self.kokoro_cache[cache_key]

    def generate(self, text, lang, reference_audio, output_path="output.wav"):
        """Synthesize ``text`` and convert it to the voice in ``reference_audio``.

        Args:
            text: Text to speak.
            lang: Language code accepted by ``_get_config``.
            reference_audio: Path of the recording whose voice is applied.
            output_path: Where the converted WAV is written.

        Raises:
            ValueError: If ``lang`` is not supported.
        """
        model_file, voices_file, vocab, g2p, voice = self._get_config(lang)

        # 1. Kokoro TTS phase
        kokoro = self._get_kokoro(model_file, voices_file, vocab)

        print(f"Synthesizing text ({lang.upper()})...")
        if g2p is not None:
            phonemes, _ = g2p(text)
            samples, sr = kokoro.create(phonemes, voice=voice, speed=1.0, is_phonemes=True)
        else:
            samples, sr = kokoro.create(text, voice=voice, speed=0.9, lang="en-us")

        # Reserve a unique temp path for the intermediate audio.  The handle is
        # closed before the file is written by name so the path can be
        # reopened on every platform.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
            temp_path = temp_audio.name

        # 2. Kanade voice conversion phase
        try:
            # Written inside the try so a failed write still gets cleaned up.
            sf.write(temp_path, samples, sr)

            print("Applying Voice Clone...")
            # Load both clips at the Kanade sample rate and push to the device.
            source_wav = load_audio(temp_path, sample_rate=self.sample_rate).to(self.device)
            ref_wav = load_audio(reference_audio, sample_rate=self.sample_rate).to(self.device)

            with torch.inference_mode():
                converted_mel = self.kanade.voice_conversion(
                    source_waveform=source_wav,
                    reference_waveform=ref_wav,
                )
                converted_wav = vocode(self.vocoder, converted_mel.unsqueeze(0))

            sf.write(output_path, converted_wav.squeeze().cpu().numpy(), self.sample_rate)
            print(f"Success! Saved: {output_path}")
        finally:
            # Always remove the intermediate file, even when conversion fails.
            if os.path.exists(temp_path):
                os.remove(temp_path)