# Hugging Face Space demo — Kanade voice cloning (Gradio app).
| import sys | |
| import os | |
| import time | |
| import torch | |
| import gradio as gr | |
# --- 1. PATH SETUP ---
# Make the sibling "src" directory importable so the bundled
# kanade_tokenizer package resolves without installation.
current_dir = os.path.abspath(os.path.dirname(__file__))
src_path = os.path.join(current_dir, "src")
if src_path not in sys.path:
    sys.path.append(src_path)
# --- 2. Imports ---
# These come from the local "src" tree added to sys.path above; fail loudly
# (with a printed banner) if the package layout is broken.
try:
    from kanade_tokenizer.model import KanadeModel
    from kanade_tokenizer.util import load_audio, load_vocoder, vocode
except ImportError as err:
    print(f"β IMPORT ERROR: {err}")
    raise err
# --- Configuration ---
# Hugging Face Hub repo id for the pretrained Kanade checkpoint.
KANADE_REPO = "frothywater/kanade-25hz-clean"
# Vocoder name passed to load_vocoder() below (HiFT, per the load banner).
KANADE_VOCODER = "hift"
# All tensors and models are kept on CPU in this demo.
DEVICE = "cpu"
# Sample rate in Hz; passed to load_audio() and used for duration math and output.
SAMPLE_RATE = 24000
MAX_AUDIO_SECONDS = 30  # Limit audio to 30 seconds
print(f"π Initializing on {DEVICE}...")
# --- 3. Load Models ---
# Models are loaded eagerly at import time so the Gradio app serves requests
# "warm". from_pretrained presumably downloads from the Hub on first run —
# NOTE(review): confirm caching behavior against the kanade_tokenizer package.
print(f"π₯ Loading Kanade...")
kanade_model = KanadeModel.from_pretrained(repo_id=KANADE_REPO).to(DEVICE).eval()
print(f"π Loading HiFT Vocoder...")
kanade_vocoder = load_vocoder(name=KANADE_VOCODER).to(DEVICE).eval()
print("β Models Loaded.")
# --- Core Inference ---
def run_inference(source_wav, ref_wav):
    """Convert source_wav into the voice of ref_wav; return the raw waveform tensor.

    Uses the module-level kanade_model and kanade_vocoder; the whole pipeline
    runs under torch.inference_mode() so no autograd state is recorded.
    """
    with torch.inference_mode():
        mel = kanade_model.voice_conversion(source_wav, ref_wav)
        # Vocoder expects a batch dimension, hence the unsqueeze.
        return vocode(kanade_vocoder, mel.unsqueeze(0))
# --- Main Handler ---
def voice_conversion(source_path, reference_path):
    """Gradio handler: convert the source audio into the reference speaker's voice.

    Args:
        source_path: Filepath of the audio supplying the spoken content.
        reference_path: Filepath of the audio supplying the target voice.

    Returns:
        ((SAMPLE_RATE, numpy_waveform), status_message) on success, or
        (None, message) when an input is missing or inference raises.
    """
    if not source_path or not reference_path:
        return None, "β οΈ Please provide both source and reference audio."

    def _load_clipped(path):
        # Load at the model sample rate and trim to the demo's 30-second cap.
        # (Shared by source and reference — previously duplicated inline.)
        wav = load_audio(path, sample_rate=SAMPLE_RATE).to(DEVICE)
        max_samples = MAX_AUDIO_SECONDS * SAMPLE_RATE
        return wav[..., :max_samples] if wav.shape[-1] > max_samples else wav

    try:
        source_wav = _load_clipped(source_path)
        ref_wav = _load_clipped(reference_path)

        # Time only the model + vocoder work, not the audio loading.
        start = time.time()
        final_wav = run_inference(source_wav, ref_wav)
        proc_time = time.time() - start

        output_np = final_wav.squeeze().cpu().float().numpy()
        output_duration = len(output_np) / SAMPLE_RATE
        # RTF = processing time / audio duration (lower is better, <1 means faster than real-time)
        rtf = proc_time / output_duration if output_duration > 0 else 0
        return (SAMPLE_RATE, output_np), f"β {proc_time:.2f}s to convert {output_duration:.1f}s of audio | RTF: {rtf:.2f}x"
    except Exception as e:
        # UI boundary: report the failure in the status box instead of crashing the app.
        import traceback
        traceback.print_exc()
        return None, f"β Error: {str(e)}"
# --- Gradio Interface ---
# Two-column layout: inputs + convert button on the left, result + status on the right.
with gr.Blocks(title="Kanade Voice Cloning") as demo:
    gr.Markdown("""
    # π£οΈ Kanade Voice Cloning
    **Model:** `frothywater/kanade-25hz-clean`
    Convert any audio into a target voice. Upload a source audio (what to say) and a reference audio (whose voice to use).
    β±οΈ **Limit:** Audio is trimmed to 30 seconds max.
    """)
    with gr.Row():
        with gr.Column():
            # type="filepath" means voice_conversion receives path strings, not raw samples.
            source_audio = gr.Audio(label="Source Audio (Content - what to say)", type="filepath")
            reference_audio = gr.Audio(label="Reference Audio (Target Voice - whose voice)", type="filepath")
            convert_btn = gr.Button("π€ Convert Voice", variant="primary")
        with gr.Column():
            output_audio = gr.Audio(label="Result")
            status_text = gr.Textbox(label="Status", interactive=False)
    # Wire the button: (source, reference) paths in -> (audio, status message) out.
    convert_btn.click(
        voice_conversion,
        inputs=[source_audio, reference_audio],
        outputs=[output_audio, status_text]
    )
    gr.Markdown("""
    ---
    **Tips:**
    - For best results, use clean reference audio (3-10 seconds of clear speech)
    - Source and reference should ideally be similar in speaking pace
    """)

if __name__ == "__main__":
    # Launch the Gradio server (blocking) when run as a script.
    demo.launch()