import sys
import os
import time
import torch
import gradio as gr

# --- 1. PATH SETUP ---
current_dir = os.path.dirname(os.path.abspath(__file__))
src_path = os.path.join(current_dir, "src")
if src_path not in sys.path:
    sys.path.append(src_path)

# --- 2. Imports ---
try:
    from kanade_tokenizer.model import KanadeModel
    from kanade_tokenizer.util import load_vocoder, vocode, load_audio
except ImportError as e:
    print(f"❌ IMPORT ERROR: {e}")
    raise e

# --- Configuration ---
KANADE_REPO = "frothywater/kanade-25hz-clean"
KANADE_VOCODER = "hift"
DEVICE = "cpu"
SAMPLE_RATE = 24000
MAX_AUDIO_SECONDS = 30  # Limit audio to 30 seconds

print(f"πŸš€ Initializing on {DEVICE}...")

# --- 3. Load Models ---
print(f"πŸ“₯ Loading Kanade...")
kanade_model = KanadeModel.from_pretrained(repo_id=KANADE_REPO).to(DEVICE).eval()

print(f"πŸ”Š Loading HiFT Vocoder...")
kanade_vocoder = load_vocoder(name=KANADE_VOCODER).to(DEVICE).eval()

print("βœ… Models Loaded.")

# --- Core Inference ---
def run_inference(source_wav, ref_wav):
    """Convert source_wav to the speaker of ref_wav and vocode to a waveform tensor."""
    with torch.inference_mode():  # no autograd bookkeeping during inference
        # Kanade emits a mel spectrogram; the vocoder expects a leading batch dim.
        mel = kanade_model.voice_conversion(source_wav, ref_wav)
        waveform = vocode(kanade_vocoder, mel.unsqueeze(0))
    return waveform

# --- Main Handler ---
def voice_conversion(source_path, reference_path):
    """Gradio handler: re-voice the audio at source_path with the speaker of reference_path.

    Returns an ``(audio, status)`` pair, where ``audio`` is either ``None`` or a
    ``(sample_rate, numpy_waveform)`` tuple suitable for a gr.Audio output and
    ``status`` is a human-readable message.
    """
    # Guard clause: both file paths are required.
    if not source_path or not reference_path:
        return None, "⚠️ Please provide both source and reference audio."

    try:
        # Load both clips at the model's working sample rate.
        src = load_audio(source_path, sample_rate=SAMPLE_RATE).to(DEVICE)
        ref = load_audio(reference_path, sample_rate=SAMPLE_RATE).to(DEVICE)

        # Enforce the 30-second cap by trimming along the sample axis
        # (slicing past the end is a no-op, so no length check is needed).
        limit = MAX_AUDIO_SECONDS * SAMPLE_RATE
        src = src[..., :limit]
        ref = ref[..., :limit]

        # Time the conversion for the status readout.
        t0 = time.time()
        converted = run_inference(src, ref)
        proc_time = time.time() - t0

        audio_np = converted.squeeze().cpu().float().numpy()
        duration = len(audio_np) / SAMPLE_RATE

        # Real-time factor: processing seconds per second of produced audio
        # (lower is better; < 1 means faster than real-time).
        rtf = proc_time / duration if duration > 0 else 0

        status = f"✅ {proc_time:.2f}s to convert {duration:.1f}s of audio | RTF: {rtf:.2f}x"
        return (SAMPLE_RATE, audio_np), status

    except Exception as e:
        import traceback
        traceback.print_exc()
        return None, f"❌ Error: {str(e)}"

# --- Gradio Interface ---
with gr.Blocks(title="Kanade Voice Cloning") as demo:
    gr.Markdown("""
    # πŸ—£οΈ Kanade Voice Cloning
    **Model:** `frothywater/kanade-25hz-clean`
    
    Convert any audio into a target voice. Upload a source audio (what to say) and a reference audio (whose voice to use).
    
    ⏱️ **Limit:** Audio is trimmed to 30 seconds max.
    """)
    
    with gr.Row():
        with gr.Column():
            source_audio = gr.Audio(label="Source Audio (Content - what to say)", type="filepath")
            reference_audio = gr.Audio(label="Reference Audio (Target Voice - whose voice)", type="filepath")
            convert_btn = gr.Button("🎀 Convert Voice", variant="primary")
        
        with gr.Column():
            output_audio = gr.Audio(label="Result")
            status_text = gr.Textbox(label="Status", interactive=False)
    
    convert_btn.click(
        voice_conversion, 
        inputs=[source_audio, reference_audio], 
        outputs=[output_audio, status_text]
    )
    
    gr.Markdown("""
    ---
    **Tips:**
    - For best results, use clean reference audio (3-10 seconds of clear speech)
    - Source and reference should ideally be similar in speaking pace
    """)

if __name__ == "__main__":
    demo.launch()