"""Gradio demo app: Kanade voice cloning (CPU voice conversion)."""

import os
import sys
import time

import torch
import gradio as gr

# --- 1. PATH SETUP ---
# Make the bundled "src" directory importable so the local
# `kanade_tokenizer` package resolves without an install step.
current_dir = os.path.dirname(os.path.abspath(__file__))
src_path = os.path.join(current_dir, "src")
if src_path not in sys.path:
    sys.path.append(src_path)

# --- 2. Imports ---
try:
    from kanade_tokenizer.model import KanadeModel
    from kanade_tokenizer.util import load_vocoder, vocode, load_audio
except ImportError as e:
    print(f"❌ IMPORT ERROR: {e}")
    raise  # bare raise keeps the original traceback intact

# --- Configuration ---
KANADE_REPO = "frothywater/kanade-25hz-clean"
KANADE_VOCODER = "hift"
DEVICE = "cpu"
SAMPLE_RATE = 24000
MAX_AUDIO_SECONDS = 30  # Limit audio to 30 seconds

print(f"🚀 Initializing on {DEVICE}...")

# --- 3. Load Models ---
# Loaded once at import time so every request reuses the warm models.
print("📥 Loading Kanade...")
kanade_model = KanadeModel.from_pretrained(repo_id=KANADE_REPO).to(DEVICE).eval()
print("🔊 Loading HiFT Vocoder...")
kanade_vocoder = load_vocoder(name=KANADE_VOCODER).to(DEVICE).eval()
print("✅ Models Loaded.")


# --- Core Inference ---
def run_inference(source_wav: torch.Tensor, ref_wav: torch.Tensor) -> torch.Tensor:
    """Run voice conversion inference on CPU.

    Args:
        source_wav: waveform providing the spoken content.
        ref_wav: waveform providing the target voice.

    Returns:
        The converted waveform tensor produced by the vocoder.
    """
    with torch.inference_mode():  # no autograd bookkeeping during inference
        mel_output = kanade_model.voice_conversion(source_wav, ref_wav)
        # Vocoder expects a batch dimension on the mel spectrogram.
        generated_wav = vocode(kanade_vocoder, mel_output.unsqueeze(0))
    return generated_wav


# --- Main Handler ---
def voice_conversion(source_path, reference_path):
    """Gradio callback: convert `source_path` audio into the voice of `reference_path`.

    Args:
        source_path: filepath of the source audio (content), or None.
        reference_path: filepath of the reference audio (target voice), or None.

    Returns:
        A tuple `((sample_rate, numpy_audio), status_message)` on success,
        or `(None, error_message)` on missing input or failure.
    """
    if not source_path or not reference_path:
        return None, "⚠️ Please provide both source and reference audio."
    try:
        # Load audio and move to the inference device.
        source_wav = load_audio(source_path, sample_rate=SAMPLE_RATE).to(DEVICE)
        ref_wav = load_audio(reference_path, sample_rate=SAMPLE_RATE).to(DEVICE)

        # Enforce the duration cap by trimming; slicing past the end is a no-op,
        # so no length check is needed.
        max_samples = MAX_AUDIO_SECONDS * SAMPLE_RATE
        source_wav = source_wav[..., :max_samples]
        ref_wav = ref_wav[..., :max_samples]

        # Run inference and time it.
        start = time.time()
        final_wav = run_inference(source_wav, ref_wav)
        proc_time = time.time() - start

        output_np = final_wav.squeeze().cpu().float().numpy()
        output_duration = len(output_np) / SAMPLE_RATE
        # RTF = processing time / audio duration (lower is better, <1 means faster than real-time)
        rtf = proc_time / output_duration if output_duration > 0 else 0
        return (
            (SAMPLE_RATE, output_np),
            f"✅ {proc_time:.2f}s to convert {output_duration:.1f}s of audio | RTF: {rtf:.2f}x",
        )
    except Exception as e:
        # Boundary handler: surface the error in the UI instead of crashing the worker.
        import traceback
        traceback.print_exc()
        return None, f"❌ Error: {str(e)}"


# --- Gradio Interface ---
with gr.Blocks(title="Kanade Voice Cloning") as demo:
    gr.Markdown(
        """
        # 🗣️ Kanade Voice Cloning
        **Model:** `frothywater/kanade-25hz-clean`

        Convert any audio into a target voice. Upload a source audio (what to say)
        and a reference audio (whose voice to use).

        ⏱️ **Limit:** Audio is trimmed to 30 seconds max.
        """
    )
    with gr.Row():
        with gr.Column():
            source_audio = gr.Audio(label="Source Audio (Content - what to say)", type="filepath")
            reference_audio = gr.Audio(label="Reference Audio (Target Voice - whose voice)", type="filepath")
            convert_btn = gr.Button("🎤 Convert Voice", variant="primary")
        with gr.Column():
            output_audio = gr.Audio(label="Result")
            status_text = gr.Textbox(label="Status", interactive=False)

    convert_btn.click(
        voice_conversion,
        inputs=[source_audio, reference_audio],
        outputs=[output_audio, status_text],
    )

    gr.Markdown(
        """
        ---
        **Tips:**
        - For best results, use clean reference audio (3-10 seconds of clear speech)
        - Source and reference should ideally be similar in speaking pace
        """
    )

if __name__ == "__main__":
    demo.launch()