britto224 committed on
Commit
ae2f25b
·
verified ·
1 Parent(s): 2cba492

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +200 -110
app.py CHANGED
@@ -1,119 +1,209 @@
1
- import sys
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import os
3
- import time
4
  import torch
5
  import gradio as gr
 
 
6
 
7
- # --- 1. PATH SETUP ---
8
- current_dir = os.path.dirname(os.path.abspath(__file__))
9
- src_path = os.path.join(current_dir, "src")
10
- if src_path not in sys.path:
11
- sys.path.append(src_path)
12
-
13
- # --- 2. Imports ---
14
- try:
15
- from kanade_tokenizer.model import KanadeModel
16
- from kanade_tokenizer.util import load_vocoder, vocode, load_audio
17
- except ImportError as e:
18
- print(f"❌ IMPORT ERROR: {e}")
19
- raise e
20
-
21
# --- Configuration ---
KANADE_REPO = "frothywater/kanade-25hz-clean"
KANADE_VOCODER = "hift"
DEVICE = "cpu"  # this revision always runs on the CPU
SAMPLE_RATE = 24000  # Hz; used when loading inputs and when labelling the output
MAX_AUDIO_SECONDS = 30  # Limit audio to 30 seconds

print(f"🚀 Initializing on {DEVICE}...")

# --- 3. Load Models ---
# Both networks are loaded once at import time, moved to DEVICE and switched
# to eval mode so every request reuses the same instances.
print("📥 Loading Kanade...")
kanade_model = KanadeModel.from_pretrained(repo_id=KANADE_REPO).to(DEVICE).eval()

print("🔊 Loading HiFT Vocoder...")
kanade_vocoder = load_vocoder(name=KANADE_VOCODER).to(DEVICE).eval()

print("✅ Models Loaded.")
38
-
39
- # --- Core Inference ---
40
def run_inference(source_wav, ref_wav):
    """Convert ``source_wav`` into the voice of ``ref_wav`` and vocode the result.

    Both model calls run under ``torch.inference_mode()``. The output of
    ``kanade_model.voice_conversion`` gets a leading axis added before it is
    handed to ``vocode`` together with ``kanade_vocoder``.

    Returns:
        Whatever ``vocode`` returns for the converted mel.
    """
    with torch.inference_mode():
        converted_mel = kanade_model.voice_conversion(source_wav, ref_wav)
        batched_mel = converted_mel.unsqueeze(0)
        return vocode(kanade_vocoder, batched_mel)
46
-
47
- # --- Main Handler ---
48
def voice_conversion(source_path, reference_path):
    """Gradio handler: re-voice the source recording with the reference speaker.

    Args:
        source_path: File path of the audio that supplies the spoken content.
        reference_path: File path of the audio that supplies the target voice.

    Returns:
        An ``(audio, status)`` pair. ``audio`` is ``(SAMPLE_RATE, waveform)``
        with a NumPy waveform on success, or ``None`` when an input is missing
        or the conversion fails. ``status`` is a human-readable message.
    """
    if not source_path or not reference_path:
        return None, "⚠️ Please provide both source and reference audio."

    try:
        # Load audio
        source_wav = load_audio(source_path, sample_rate=SAMPLE_RATE).to(DEVICE)
        ref_wav = load_audio(reference_path, sample_rate=SAMPLE_RATE).to(DEVICE)

        # Keep at most MAX_AUDIO_SECONDS of each clip; slicing along the last
        # axis is a no-op for clips that are already short enough.
        max_samples = MAX_AUDIO_SECONDS * SAMPLE_RATE
        source_wav = source_wav[..., :max_samples]
        ref_wav = ref_wav[..., :max_samples]

        # Run inference; perf_counter is monotonic, so the measured time
        # cannot be skewed by system clock adjustments.
        start = time.perf_counter()
        final_wav = run_inference(source_wav, ref_wav)
        proc_time = time.perf_counter() - start

        output_np = final_wav.squeeze().cpu().float().numpy()
        output_duration = len(output_np) / SAMPLE_RATE

        # RTF = processing time / audio duration (lower is better, <1 means faster than real-time)
        rtf = proc_time / output_duration if output_duration > 0 else 0

        status = (
            f"✅ {proc_time:.2f}s to convert {output_duration:.1f}s of audio"
            f" | RTF: {rtf:.2f}x"
        )
        return (SAMPLE_RATE, output_np), status

    except Exception as e:
        # UI boundary: report the failure in the status box instead of
        # crashing the request; the full trace goes to the server log.
        import traceback
        traceback.print_exc()
        return None, f"❌ Error: {e}"
83
-
84
# --- Gradio Interface ---
# Two-column layout: inputs and the convert button on the left, the converted
# audio and a status line on the right.
with gr.Blocks(title="Kanade Voice Cloning") as demo:
    gr.Markdown("""
    # 🗣️ Kanade Voice Cloning
    **Model:** `frothywater/kanade-25hz-clean`

    Convert any audio into a target voice. Upload a source audio (what to say) and a reference audio (whose voice to use).

    ⏱️ **Limit:** Audio is trimmed to 30 seconds max.
    """)

    with gr.Row():
        with gr.Column():
            source_audio = gr.Audio(label="Source Audio (Content - what to say)", type="filepath")
            reference_audio = gr.Audio(label="Reference Audio (Target Voice - whose voice)", type="filepath")
            convert_btn = gr.Button("🎤 Convert Voice", variant="primary")

        with gr.Column():
            output_audio = gr.Audio(label="Result")
            status_text = gr.Textbox(label="Status", interactive=False)

    # voice_conversion returns (audio, status), matching the two outputs below.
    convert_btn.click(
        voice_conversion,
        inputs=[source_audio, reference_audio],
        outputs=[output_audio, status_text],
    )

    gr.Markdown("""
    ---
    **Tips:**
    - For best results, use clean reference audio (3-10 seconds of clear speech)
    - Source and reference should ideally be similar in speaking pace
    """)

if __name__ == "__main__":
    demo.launch()
 
1
+ """
2
+ Kanade Tokenizer — Text-to-Audio with Voice Cloning
3
+ =====================================================
4
+ Original project: Audio-to-Audio (voice conversion)
5
+ This version: Text + Reference Audio → Cloned Voice Audio
6
+
7
+ Pipeline:
8
+ 1. Text → [TTS engine] → intermediate WAV (content only)
9
+ 2. Reference Audio → Kanade encode → global_embedding (speaker identity)
10
+ 3. intermediate WAV → Kanade encode → content_token_indices
11
+ 4. Kanade decode(content_tokens, reference_speaker_embedding) → output mel
12
+ 5. Vocoder → final WAV (your text, in the reference speaker's voice)
13
+ """
14
+
15
  import os
16
+ import tempfile
17
  import torch
18
  import gradio as gr
19
+ import numpy as np
20
+ import soundfile as sf
21
 
22
+ # ── Kanade ──────────────────────────────────────────────────────────────────
23
+ from kanade_tokenizer import KanadeModel, load_audio, load_vocoder, vocode
24
+
25
+ # ── TTS back-end (edge-tts is zero-install, async) ──────────────────────────
26
+ import asyncio
27
+ import edge_tts
28
+
29
+ # ────────────────────────────────────────────────────────────────────────────
30
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_ID = "frothywater/kanade-25hz-clean"  # change to kanade-12.5hz if preferred

# Models are loaded once at import time and shared by every request.
print(f"[init] Loading Kanade model: {MODEL_ID} ({DEVICE})")
kanade = KanadeModel.from_pretrained(MODEL_ID).eval().to(DEVICE)
# Put the vocoder in eval mode as well, as the previous revision of this app
# did, so inference does not depend on its default training/eval state.
vocoder = load_vocoder(kanade.config.vocoder_name).to(DEVICE).eval()
# Sample rate in Hz, read from the model config.
# NOTE(review): an earlier comment said "typically 16000", but the previous
# revision hard-coded 24000 for this same model — confirm the actual value.
SR = kanade.config.sample_rate
print("[init] Models ready.")

# ── TTS voices available via edge-tts ───────────────────────────────────────
# Maps the label shown in the UI dropdown to the edge-tts voice identifier.
TTS_VOICES = {
    "English (US) Female — Aria": "en-US-AriaNeural",
    "English (US) Male — Guy": "en-US-GuyNeural",
    "English (UK) Female — Sonia": "en-GB-SoniaNeural",
    "English (UK) Male — Ryan": "en-GB-RyanNeural",
    "English (AU) Female — Natasha": "en-AU-NatashaNeural",
    "English (IN) Female — Neerja": "en-IN-NeerjaNeural",
}
48
+
49
+ # ── helpers ──────────────────────────────────────────────────────────────────
50
+
51
def tts_to_wav(text: str, voice: str) -> str:
    """Synthesise ``text`` with edge-tts and return the path of a temp WAV file.

    edge-tts writes MP3, so the speech is first saved next to the reserved
    WAV path and then converted to mono audio at ``SR``. pydub is tried
    first; if it is missing or fails, librosa + soundfile are used instead.

    Args:
        text: Text to speak.
        voice: edge-tts voice identifier (a value of ``TTS_VOICES``).

    Returns:
        Path of the WAV file. The caller is responsible for deleting it.

    Raises:
        Exception: Whatever edge-tts or the fallback converter raises. No
            temporary file is left behind in that case.
    """
    # Reserve a unique name and close the handle right away so the audio
    # libraries can reopen the file by path.
    reserved = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    reserved.close()
    wav_path = reserved.name
    # Swap only the extension; a plain str.replace would also rewrite any
    # ".wav" that happens to occur earlier in the directory part of the path.
    mp3_path = os.path.splitext(wav_path)[0] + ".mp3"

    async def _run():
        # Built inside the coroutine so the client is created on the running loop.
        communicate = edge_tts.Communicate(text, voice)
        await communicate.save(mp3_path)

    try:
        asyncio.run(_run())

        # Convert MP3 → WAV: pydub first, librosa + soundfile as the fallback.
        try:
            import pydub
            audio = pydub.AudioSegment.from_mp3(mp3_path)
            audio = audio.set_frame_rate(SR).set_channels(1)
            audio.export(wav_path, format="wav")
        except Exception:
            # Deliberately broad: any pydub problem (not installed, decoder
            # missing, decode error) should fall through to the second decoder.
            import librosa
            y, _ = librosa.load(mp3_path, sr=SR, mono=True)
            sf.write(wav_path, y, SR)
    except BaseException:
        # Nothing usable was produced: do not leak the reserved WAV file.
        if os.path.exists(wav_path):
            os.unlink(wav_path)
        raise
    finally:
        # The intermediate MP3 is never needed once this function returns.
        if os.path.exists(mp3_path):
            os.unlink(mp3_path)

    return wav_path
79
+
80
+
81
def load_wav_tensor(path: str) -> torch.Tensor:
    """Read the audio at ``path`` with ``load_audio`` at rate ``SR`` and move it to ``DEVICE``.

    NOTE(review): the result is presumably a 1-D float32 waveform, but that
    depends on ``load_audio`` — confirm against its implementation.
    """
    waveform = load_audio(path, sample_rate=SR)
    return waveform.to(DEVICE)
84
+
85
+
86
+ # ── main inference ────────────────────────────────────────────────────────────
87
+
88
def synthesize(
    text: str,
    tts_voice_label: str,
    reference_audio_path: str,
    speed: float,
) -> tuple[int, np.ndarray]:
    """Speak ``text`` in the voice of the reference recording.

    The text is first rendered by the selected edge-tts voice. Kanade then
    encodes that intermediate audio and the reference audio, and decodes the
    content tokens of the former with the global embedding of the latter.
    The resulting mel is passed to the vocoder.

    Args:
        text: Text to synthesise.
        tts_voice_label: A key of ``TTS_VOICES`` selecting the intermediate voice.
        reference_audio_path: File path of the voice to clone.
        speed: Playback-rate factor; values within 0.05 of 1.0 leave the
            audio untouched.

    Returns:
        ``(sample_rate, waveform_numpy)`` for a Gradio Audio output.

    Raises:
        gr.Error: If the text is empty, no reference audio was supplied, or
            the voice label is unknown.
    """
    # `not text` also covers None, which `.strip()` alone would crash on.
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")
    if not reference_audio_path:
        raise gr.Error("Please upload a reference audio clip (the voice to clone).")

    voice_id = TTS_VOICES.get(tts_voice_label)
    if voice_id is None:
        raise gr.Error(f"Unknown TTS voice: {tts_voice_label!r}")

    # ── Step 1: Text → intermediate speech WAV ─────────────────────────────
    gr.Info("Step 1/4 — Synthesising text with TTS…")
    tts_wav_path = tts_to_wav(text, voice_id)

    # ── Step 2: Encode TTS audio → content tokens ──────────────────────────
    gr.Info("Step 2/4 — Extracting content tokens from TTS audio…")
    try:
        tts_waveform = load_wav_tensor(tts_wav_path)
    finally:
        # The temp file is only needed for loading; remove it even on failure.
        os.unlink(tts_wav_path)

    with torch.inference_mode():
        tts_features = kanade.encode(tts_waveform)

    # ── Step 3: Encode reference audio → speaker embedding ─────────────────
    gr.Info("Step 3/4 — Extracting speaker embedding from reference audio…")
    ref_waveform = load_wav_tensor(reference_audio_path)

    with torch.inference_mode():
        ref_features = kanade.encode(ref_waveform)

    # ── Step 4: Decode with cloned speaker embedding ────────────────────────
    gr.Info("Step 4/4 — Decoding with cloned voice…")
    with torch.inference_mode():
        mel = kanade.decode(
            content_token_indices=tts_features.content_token_indices,  # WHAT to say
            global_embedding=ref_features.global_embedding,            # WHO says it
        )
        waveform = vocode(vocoder, mel.unsqueeze(0))

    audio_np = waveform.squeeze().cpu().float().numpy()

    # Optional speed adjustment via time-stretching (the sample rate is unchanged)
    if abs(speed - 1.0) > 0.05:
        import librosa
        audio_np = librosa.effects.time_stretch(audio_np, rate=speed)

    return int(SR), audio_np
140
+
141
+
142
+ # ── Gradio UI ─────────────────────────────────────────────────────────────────
143
+
144
# Page-level styling: centre the title and banner, hide the Gradio footer.
CSS = """
#title { text-align: center; }
#banner { text-align: center; color: #6366f1; font-size: 0.9em; }
footer { display: none !important; }
"""

# Layout: text, voice and speed controls on the left; the reference clip and
# tips on the right; the generate button and output player underneath.
with gr.Blocks(title="Kanade TTS Voice Cloner", css=CSS, theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🎙️ Kanade — Text-to-Audio with Voice Cloning", elem_id="title")
    gr.Markdown(
        "Type any text, upload a **reference audio** (the voice you want to clone), "
        "and Kanade will speak your text **in that person's voice**.",
        elem_id="banner",
    )

    with gr.Row():
        with gr.Column(scale=3):
            text_input = gr.Textbox(
                label="📝 Text to synthesise",
                placeholder="Enter any text here…",
                lines=5,
            )
            tts_voice = gr.Dropdown(
                label="🔊 Base TTS voice (used for content extraction only)",
                choices=list(TTS_VOICES),
                value=next(iter(TTS_VOICES)),  # first entry is the default
                info="This voice provides pronunciation — the output will sound like the reference speaker.",
            )
            speed_slider = gr.Slider(
                label="⏩ Speed",
                minimum=0.7, maximum=1.5, value=1.0, step=0.05,
            )

        with gr.Column(scale=2):
            reference_audio = gr.Audio(
                label="🎤 Reference audio (voice to clone)",
                type="filepath",
                sources=["upload", "microphone"],
            )
            gr.Markdown(
                "💡 **Tips for best results:**\n"
                "- Use 5–30 seconds of clean speech\n"
                "- Single speaker, minimal background noise\n"
                "- WAV or high-quality MP3\n"
            )

    run_btn = gr.Button("🚀 Generate Cloned Speech", variant="primary", size="lg")

    output_audio = gr.Audio(label="🔈 Output — Your text in the reference speaker's voice", type="numpy")

    # The input order must match the parameter order of synthesize().
    run_btn.click(
        fn=synthesize,
        inputs=[text_input, tts_voice, reference_audio, speed_slider],
        outputs=output_audio,
    )

    gr.Markdown("---")
    gr.Markdown(
        "**How it works:** Kanade disentangles speech into *content tokens* (what is said) "
        "and a *global speaker embedding* (who says it). "
        "We extract content from a TTS-generated intermediate and speaker identity from your "
        "reference audio, then recombine them. "
        "Model: [`frothywater/kanade-25hz-clean`](https://huggingface.co/frothywater/kanade-25hz-clean)"
    )

if __name__ == "__main__":
    demo.launch()