# Dalzymodderever
# Initial Commit
# 2cba492
import sys
import os
import time
import torch
import gradio as gr
# --- 1. PATH SETUP ---
# Make the local "src" directory importable so the vendored
# kanade_tokenizer package resolves without being pip-installed.
current_dir = os.path.dirname(os.path.abspath(__file__))
src_path = os.path.join(current_dir, "src")
if src_path not in sys.path:
    sys.path.append(src_path)
# --- 2. Imports ---
# These must come after the sys.path tweak so the vendored package is found.
try:
    from kanade_tokenizer.model import KanadeModel
    from kanade_tokenizer.util import load_vocoder, vocode, load_audio
except ImportError as e:
    # Log a clear marker for Space/container logs, then fail fast —
    # nothing below can work without these imports.
    print(f"❌ IMPORT ERROR: {e}")
    raise e
# --- Configuration ---
KANADE_REPO = "frothywater/kanade-25hz-clean"
KANADE_VOCODER = "hift"
DEVICE = "cpu"
SAMPLE_RATE = 24000
MAX_AUDIO_SECONDS = 30 # Limit audio to 30 seconds
print(f"πŸš€ Initializing on {DEVICE}...")
# --- 3. Load Models ---
print(f"πŸ“₯ Loading Kanade...")
kanade_model = KanadeModel.from_pretrained(repo_id=KANADE_REPO).to(DEVICE).eval()
print(f"πŸ”Š Loading HiFT Vocoder...")
kanade_vocoder = load_vocoder(name=KANADE_VOCODER).to(DEVICE).eval()
print("βœ… Models Loaded.")
# --- Core Inference ---
def run_inference(source_wav, ref_wav):
"""Run voice conversion inference on CPU"""
with torch.inference_mode():
mel_output = kanade_model.voice_conversion(source_wav, ref_wav)
generated_wav = vocode(kanade_vocoder, mel_output.unsqueeze(0))
return generated_wav
# --- Main Handler ---
def voice_conversion(source_path, reference_path):
if not source_path or not reference_path:
return None, "⚠️ Please provide both source and reference audio."
try:
# Load audio
source_wav = load_audio(source_path, sample_rate=SAMPLE_RATE).to(DEVICE)
ref_wav = load_audio(reference_path, sample_rate=SAMPLE_RATE).to(DEVICE)
# Check duration (30 second limit)
max_samples = MAX_AUDIO_SECONDS * SAMPLE_RATE
if source_wav.shape[-1] > max_samples:
source_wav = source_wav[..., :max_samples]
if ref_wav.shape[-1] > max_samples:
ref_wav = ref_wav[..., :max_samples]
# Run inference
start = time.time()
final_wav = run_inference(source_wav, ref_wav)
proc_time = time.time() - start
output_np = final_wav.squeeze().cpu().float().numpy()
output_duration = len(output_np) / SAMPLE_RATE
# RTF = processing time / audio duration (lower is better, <1 means faster than real-time)
rtf = proc_time / output_duration if output_duration > 0 else 0
return (SAMPLE_RATE, output_np), f"βœ… {proc_time:.2f}s to convert {output_duration:.1f}s of audio | RTF: {rtf:.2f}x"
except Exception as e:
import traceback
traceback.print_exc()
return None, f"❌ Error: {str(e)}"
# --- Gradio Interface ---
with gr.Blocks(title="Kanade Voice Cloning") as demo:
gr.Markdown("""
# πŸ—£οΈ Kanade Voice Cloning
**Model:** `frothywater/kanade-25hz-clean`
Convert any audio into a target voice. Upload a source audio (what to say) and a reference audio (whose voice to use).
⏱️ **Limit:** Audio is trimmed to 30 seconds max.
""")
with gr.Row():
with gr.Column():
source_audio = gr.Audio(label="Source Audio (Content - what to say)", type="filepath")
reference_audio = gr.Audio(label="Reference Audio (Target Voice - whose voice)", type="filepath")
convert_btn = gr.Button("🎀 Convert Voice", variant="primary")
with gr.Column():
output_audio = gr.Audio(label="Result")
status_text = gr.Textbox(label="Status", interactive=False)
convert_btn.click(
voice_conversion,
inputs=[source_audio, reference_audio],
outputs=[output_audio, status_text]
)
gr.Markdown("""
---
**Tips:**
- For best results, use clean reference audio (3-10 seconds of clear speech)
- Source and reference should ideally be similar in speaking pace
""")
# Launch the Gradio server only when run as a script (not on import).
if __name__ == "__main__":
    demo.launch()