britto224 committed on
Commit
ab7c93f
·
verified ·
1 Parent(s): ccae3c3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +98 -89
app.py CHANGED
@@ -1,11 +1,13 @@
1
  """
2
  Kanade Tokenizer — Text-to-Audio with Voice Cloning
3
  =====================================================
4
- Original project: Audio-to-Audio (voice conversion)
5
- This version: Text + Reference Audio → Cloned Voice Audio
 
 
6
 
7
  Pipeline:
8
- 1. Text → [TTS engine] → intermediate WAV (content only)
9
  2. Reference Audio → Kanade encode → global_embedding (speaker identity)
10
  3. intermediate WAV → Kanade encode → content_token_indices
11
  4. Kanade decode(content_tokens, reference_speaker_embedding) → output mel
@@ -14,72 +16,76 @@ Pipeline:
14
 
15
  import os
16
  import tempfile
17
- import torch
18
- import gradio as gr
19
  import numpy as np
 
20
  import soundfile as sf
 
21
 
22
- # ── Kanade ──────────────────────────────────────────────────────────────────
23
  from kanade_tokenizer import KanadeModel, load_audio, load_vocoder, vocode
24
 
25
- # ── TTS back-end (edge-tts is zero-install, async) ──────────────────────────
26
- import asyncio
27
- import edge_tts
28
-
29
- # ────────────────────────────────────────────────────────────────────────────
30
- DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
31
- MODEL_ID = "frothywater/kanade-25hz-clean" # change to kanade-12.5hz if preferred
32
-
33
- print(f"[init] Loading Kanade model: {MODEL_ID} ({DEVICE})")
34
- kanade = KanadeModel.from_pretrained(MODEL_ID).eval().to(DEVICE)
35
- vocoder = load_vocoder(kanade.config.vocoder_name).to(DEVICE)
36
- SR = kanade.config.sample_rate # typically 16000
37
- print("[init] Models ready.")
38
-
39
- # ── TTS voices available via edge-tts ───────────────────────────────────────
40
- TTS_VOICES = {
41
- "English (US) Female β€” Aria": "en-US-AriaNeural",
42
- "English (US) Male β€” Guy": "en-US-GuyNeural",
43
- "English (UK) Female β€” Sonia": "en-GB-SoniaNeural",
44
- "English (UK) Male β€” Ryan": "en-GB-RyanNeural",
45
- "English (AU) Female β€” Natasha": "en-AU-NatashaNeural",
46
- "English (IN) Female β€” Neerja": "en-IN-NeerjaNeural",
 
 
 
 
 
 
 
 
 
 
 
 
47
  }
48
 
49
  # ── helpers ──────────────────────────────────────────────────────────────────
50
 
51
- def tts_to_wav(text: str, voice: str) -> str:
52
- """Run edge-tts and return path to a temp WAV file."""
53
- tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
54
- tmp.close()
55
 
56
- async def _run():
57
- communicate = edge_tts.Communicate(text, voice)
58
- # edge-tts outputs MP3; write to mp3 then convert
59
- mp3_path = tmp.name.replace(".wav", ".mp3")
60
- await communicate.save(mp3_path)
61
- return mp3_path
62
-
63
- mp3_path = asyncio.run(_run())
64
-
65
- # Convert MP3 → WAV via soundfile / pydub fallback
66
- try:
67
- import pydub
68
- audio = pydub.AudioSegment.from_mp3(mp3_path)
69
- audio = audio.set_frame_rate(SR).set_channels(1)
70
- audio.export(tmp.name, format="wav")
71
- except Exception:
72
- # fallback: scipy / librosa
73
- import librosa
74
- y, _ = librosa.load(mp3_path, sr=SR, mono=True)
75
- sf.write(tmp.name, y, SR)
76
 
77
- os.unlink(mp3_path)
 
 
 
 
 
 
 
 
78
  return tmp.name
79
 
80
 
81
  def load_wav_tensor(path: str) -> torch.Tensor:
82
- """Load a WAV file → 1-D float32 tensor at Kanade's sample rate."""
83
  return load_audio(path, sample_rate=SR).to(DEVICE)
84
 
85
 
@@ -87,51 +93,49 @@ def load_wav_tensor(path: str) -> torch.Tensor:
87
 
88
  def synthesize(
89
  text: str,
90
- tts_voice_label: str,
91
- reference_audio_path: str,
92
  speed: float,
93
  ) -> tuple[int, np.ndarray]:
94
- """
95
- Returns (sample_rate, waveform_numpy) for Gradio Audio output.
96
- """
97
  if not text.strip():
98
  raise gr.Error("Please enter some text to synthesize.")
99
  if reference_audio_path is None:
100
  raise gr.Error("Please upload a reference audio clip (the voice to clone).")
101
 
102
- voice_id = TTS_VOICES[tts_voice_label]
103
 
104
- # ── Step 1: Text → intermediate speech WAV ─────────────────────────────
105
- gr.Info("Step 1/4 β€” Synthesising text with TTS…")
106
- tts_wav_path = tts_to_wav(text, voice_id)
107
 
108
- # ── Step 2: Encode TTS audio → content tokens ──────────────────────────
109
- gr.Info("Step 2/4 β€” Extracting content tokens from TTS audio…")
110
- tts_waveform = load_wav_tensor(tts_wav_path)
111
- os.unlink(tts_wav_path)
112
 
113
  with torch.inference_mode():
114
  tts_features = kanade.encode(tts_waveform)
115
 
116
- # ── Step 3: Encode reference audio → speaker embedding ─────────────────
117
  gr.Info("Step 3/4 β€” Extracting speaker embedding from reference audio…")
118
  ref_waveform = load_wav_tensor(reference_audio_path)
119
 
120
  with torch.inference_mode():
121
  ref_features = kanade.encode(ref_waveform)
122
 
123
- # ── Step 4: Decode with cloned speaker embedding ────────────────────────
124
  gr.Info("Step 4/4 β€” Decoding with cloned voice…")
125
  with torch.inference_mode():
126
  mel = kanade.decode(
127
- content_token_indices=tts_features.content_token_indices, # WHAT to say
128
- global_embedding=ref_features.global_embedding, # WHO says it
129
  )
130
  waveform = vocode(vocoder, mel.unsqueeze(0)) # (1, samples)
131
 
132
  audio_np = waveform.squeeze().cpu().float().numpy()
133
 
134
- # Optional speed adjustment via resampling
135
  if abs(speed - 1.0) > 0.05:
136
  import librosa
137
  audio_np = librosa.effects.time_stretch(audio_np, rate=speed)
@@ -139,7 +143,7 @@ def synthesize(
139
  return int(SR), audio_np
140
 
141
 
142
- # ── Gradio UI ─────────────────────────────────────────────────────────────────
143
 
144
  CSS = """
145
  #title { text-align: center; }
@@ -147,11 +151,11 @@ CSS = """
147
  footer { display: none !important; }
148
  """
149
 
150
- with gr.Blocks(title="Kanade TTS Voice Cloner", css=CSS, theme=gr.themes.Soft()) as demo:
151
  gr.Markdown("# πŸŽ™οΈ Kanade β€” Text-to-Audio with Voice Cloning", elem_id="title")
152
  gr.Markdown(
153
- "Type any text, upload a **reference audio** (the voice you want to clone), "
154
- "and Kanade will speak your text **in that person's voice**.",
155
  elem_id="banner",
156
  )
157
 
@@ -163,10 +167,10 @@ with gr.Blocks(title="Kanade TTS Voice Cloner", css=CSS, theme=gr.themes.Soft())
163
  lines=5,
164
  )
165
  tts_voice = gr.Dropdown(
166
- label="πŸ”Š Base TTS voice (used for content extraction only)",
167
- choices=list(TTS_VOICES.keys()),
168
- value=list(TTS_VOICES.keys())[0],
169
- info="This voice provides pronunciation β€” the output will sound like the reference speaker.",
170
  )
171
  speed_slider = gr.Slider(
172
  label="⏩ Speed",
@@ -181,14 +185,16 @@ with gr.Blocks(title="Kanade TTS Voice Cloner", css=CSS, theme=gr.themes.Soft())
181
  )
182
  gr.Markdown(
183
  "πŸ’‘ **Tips for best results:**\n"
184
- "- Use 5–30 seconds of clean speech\n"
185
- "- Single speaker, minimal background noise\n"
186
  "- WAV or high-quality MP3\n"
187
  )
188
 
189
  run_btn = gr.Button("πŸš€ Generate Cloned Speech", variant="primary", size="lg")
190
-
191
- output_audio = gr.Audio(label="πŸ”ˆ Output β€” Your text in the reference speaker's voice", type="numpy")
 
 
192
 
193
  run_btn.click(
194
  fn=synthesize,
@@ -198,12 +204,15 @@ with gr.Blocks(title="Kanade TTS Voice Cloner", css=CSS, theme=gr.themes.Soft())
198
 
199
  gr.Markdown("---")
200
  gr.Markdown(
201
- "**How it works:** Kanade disentangles speech into *content tokens* (what is said) "
202
  "and a *global speaker embedding* (who says it). "
203
- "We extract content from a TTS-generated intermediate and speaker identity from your "
204
- "reference audio, then recombine them. "
205
- "Model: [`frothywater/kanade-25hz-clean`](https://huggingface.co/frothywater/kanade-25hz-clean)"
206
  )
207
 
208
  if __name__ == "__main__":
209
- demo.launch()
 
 
 
 
1
  """
2
  Kanade Tokenizer — Text-to-Audio with Voice Cloning
3
  =====================================================
4
+ Fixes vs v1:
5
+ - Replaced edge-tts (needs internet) with kokoro (100% offline, local model)
6
+ - Fixed Gradio 6.0 API: theme/css moved to launch()
7
+ - asyncio.run() issue eliminated (kokoro is sync)
8
 
9
  Pipeline:
10
+ 1. Text → [Kokoro TTS, offline] → intermediate WAV (content only)
11
  2. Reference Audio → Kanade encode → global_embedding (speaker identity)
12
  3. intermediate WAV → Kanade encode → content_token_indices
13
  4. Kanade decode(content_tokens, reference_speaker_embedding) → output mel
 
16
 
17
  import os
18
  import tempfile
 
 
19
  import numpy as np
20
+ import torch
21
  import soundfile as sf
22
+ import gradio as gr
23
 
24
+ # ── Kanade ───────────────────────────────────────────────────────────────────
25
  from kanade_tokenizer import KanadeModel, load_audio, load_vocoder, vocode
26
 
27
+ # ── Kokoro offline TTS ───────────────────────────────────────────────────────
28
+ from kokoro import KPipeline
29
+
30
+ # ─────────────────────────────────────────────────────────────────────────────
31
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
32
+ MODEL_ID = "frothywater/kanade-25hz-clean"
33
+ KOKORO_SR = 24000 # Kokoro always outputs 24 kHz
34
+
35
+ print(f"[init] Loading Kanade: {MODEL_ID} ({DEVICE})")
36
+ kanade = KanadeModel.from_pretrained(MODEL_ID).eval().to(DEVICE)
37
+ vocoder = load_vocoder(kanade.config.vocoder_name).to(DEVICE)
38
+ SR = kanade.config.sample_rate # 16000
39
+ print("[init] Kanade ready.")
40
+
41
+ print("[init] Loading Kokoro TTS pipeline…")
42
+ # lang_code='a' = American English | 'b' = British English
43
+ kokoro_pipeline_us = KPipeline(lang_code='a')
44
+ kokoro_pipeline_uk = KPipeline(lang_code='b')
45
+ print("[init] Kokoro ready. All models loaded.")
46
+
47
+ # ── Available Kokoro voices ───────────────────────────────────────────────────
48
+ # Full list: https://huggingface.co/hexgrad/Kokoro-82M/tree/main/voices
49
+ VOICES = {
50
+ # American English (lang='a')
51
+ "πŸ‡ΊπŸ‡Έ Female β€” Heart (warm)": ("a", "af_heart"),
52
+ "πŸ‡ΊπŸ‡Έ Female β€” Bella (smooth)": ("a", "af_bella"),
53
+ "πŸ‡ΊπŸ‡Έ Female β€” Nicole (breathy)": ("a", "af_nicole"),
54
+ "πŸ‡ΊπŸ‡Έ Female β€” Sarah": ("a", "af_sarah"),
55
+ "πŸ‡ΊπŸ‡Έ Male β€” Adam": ("a", "am_adam"),
56
+ "πŸ‡ΊπŸ‡Έ Male β€” Michael": ("a", "am_michael"),
57
+ # British English (lang='b')
58
+ "πŸ‡¬πŸ‡§ Female β€” Emma": ("b", "bf_emma"),
59
+ "πŸ‡¬πŸ‡§ Male β€” George": ("b", "bm_george"),
60
+ "πŸ‡¬πŸ‡§ Male β€” Lewis": ("b", "bm_lewis"),
61
  }
62
 
63
  # ── helpers ──────────────────────────────────────────────────────────────────
64
 
65
+ def tts_to_wav(text: str, lang: str, voice_id: str) -> str:
66
+ """Run Kokoro TTS (offline) → temp WAV resampled to Kanade's SR."""
67
+ pipeline = kokoro_pipeline_us if lang == 'a' else kokoro_pipeline_uk
 
68
 
69
+ chunks = []
70
+ for _, _, audio in pipeline(text, voice=voice_id, speed=1.0, split_pattern=r'(?<=[.!?])\s+'):
71
+ chunks.append(audio)
72
+
73
+ if not chunks:
74
+ raise RuntimeError("Kokoro produced no audio β€” check your text input.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
 
76
+ audio_24k = np.concatenate(chunks)
77
+
78
+ # Resample 24 kHz → 16 kHz for Kanade
79
+ import librosa
80
+ audio_16k = librosa.resample(audio_24k, orig_sr=KOKORO_SR, target_sr=SR)
81
+
82
+ tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
83
+ sf.write(tmp.name, audio_16k, SR)
84
+ tmp.close()
85
  return tmp.name
86
 
87
 
88
  def load_wav_tensor(path: str) -> torch.Tensor:
 
89
  return load_audio(path, sample_rate=SR).to(DEVICE)
90
 
91
 
 
93
 
94
  def synthesize(
95
  text: str,
96
+ voice_label: str,
97
+ reference_audio_path,
98
  speed: float,
99
  ) -> tuple[int, np.ndarray]:
100
+
 
 
101
  if not text.strip():
102
  raise gr.Error("Please enter some text to synthesize.")
103
  if reference_audio_path is None:
104
  raise gr.Error("Please upload a reference audio clip (the voice to clone).")
105
 
106
+ lang, voice_id = VOICES[voice_label]
107
 
108
+ # Step 1 — Text → intermediate WAV via Kokoro (offline)
109
+ gr.Info("Step 1/4 β€” Synthesising text with Kokoro (offline)…")
110
+ tts_path = tts_to_wav(text, lang, voice_id)
111
 
112
+ # Step 2 — Encode TTS → content tokens
113
+ gr.Info("Step 2/4 β€” Extracting content tokens…")
114
+ tts_waveform = load_wav_tensor(tts_path)
115
+ os.unlink(tts_path)
116
 
117
  with torch.inference_mode():
118
  tts_features = kanade.encode(tts_waveform)
119
 
120
+ # Step 3 — Encode reference audio → speaker embedding
121
  gr.Info("Step 3/4 β€” Extracting speaker embedding from reference audio…")
122
  ref_waveform = load_wav_tensor(reference_audio_path)
123
 
124
  with torch.inference_mode():
125
  ref_features = kanade.encode(ref_waveform)
126
 
127
+ # Step 4 — Decode: content from TTS, voice from reference
128
  gr.Info("Step 4/4 β€” Decoding with cloned voice…")
129
  with torch.inference_mode():
130
  mel = kanade.decode(
131
+ content_token_indices=tts_features.content_token_indices, # WHAT to say
132
+ global_embedding=ref_features.global_embedding, # WHO says it
133
  )
134
  waveform = vocode(vocoder, mel.unsqueeze(0)) # (1, samples)
135
 
136
  audio_np = waveform.squeeze().cpu().float().numpy()
137
 
138
+ # Optional speed change via time-stretching (librosa.effects.time_stretch)
139
  if abs(speed - 1.0) > 0.05:
140
  import librosa
141
  audio_np = librosa.effects.time_stretch(audio_np, rate=speed)
 
143
  return int(SR), audio_np
144
 
145
 
146
+ # ── Gradio UI (Gradio 6 compatible) ──────────────────────────────────────────
147
 
148
  CSS = """
149
  #title { text-align: center; }
 
151
  footer { display: none !important; }
152
  """
153
 
154
+ with gr.Blocks(title="Kanade TTS Voice Cloner") as demo:
155
  gr.Markdown("# πŸŽ™οΈ Kanade β€” Text-to-Audio with Voice Cloning", elem_id="title")
156
  gr.Markdown(
157
+ "Type any text Β· Upload a **reference audio** (the voice to clone) Β· "
158
+ "Kanade will speak your text **in that person's voice** β€” 100% offline.",
159
  elem_id="banner",
160
  )
161
 
 
167
  lines=5,
168
  )
169
  tts_voice = gr.Dropdown(
170
+ label="πŸ”Š Base TTS voice (Kokoro β€” offline)",
171
+ choices=list(VOICES.keys()),
172
+ value=list(VOICES.keys())[0],
173
+ info="Provides pronunciation only β€” output will sound like the reference speaker.",
174
  )
175
  speed_slider = gr.Slider(
176
  label="⏩ Speed",
 
185
  )
186
  gr.Markdown(
187
  "πŸ’‘ **Tips for best results:**\n"
188
+ "- 5–30 seconds of clean speech\n"
189
+ "- Single speaker, minimal noise\n"
190
  "- WAV or high-quality MP3\n"
191
  )
192
 
193
  run_btn = gr.Button("πŸš€ Generate Cloned Speech", variant="primary", size="lg")
194
+ output_audio = gr.Audio(
195
+ label="πŸ”ˆ Output β€” Your text in the reference speaker's voice",
196
+ type="numpy",
197
+ )
198
 
199
  run_btn.click(
200
  fn=synthesize,
 
204
 
205
  gr.Markdown("---")
206
  gr.Markdown(
207
+ "**How it works:** Kanade separates speech into *content tokens* (what is said) "
208
  "and a *global speaker embedding* (who says it). "
209
+ "Kokoro (82M offline TTS) generates the content β€” then Kanade re-voices it using your reference. "
210
+ "Models: [`frothywater/kanade-25hz-clean`](https://huggingface.co/frothywater/kanade-25hz-clean) Β· "
211
+ "[`hexgrad/Kokoro-82M`](https://huggingface.co/hexgrad/Kokoro-82M)"
212
  )
213
 
214
  if __name__ == "__main__":
215
+ demo.launch(
216
+ theme=gr.themes.Soft(),
217
+ css=CSS,
218
+ )