File size: 5,244 Bytes
f3b5388
d996a8f
4f3b5bd
f3b5388
4f3b5bd
 
621e6b2
4f3b5bd
f3b5388
4f3b5bd
621e6b2
4f3b5bd
 
 
 
c26c2ec
 
4f3b5bd
 
 
 
 
 
 
 
 
 
 
d996a8f
4f3b5bd
 
c26c2ec
d996a8f
4f3b5bd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d996a8f
4f3b5bd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f3b5388
 
4f3b5bd
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import os
import gradio as gr
from engine import VoiceCloningEngine

# Single module-level VoiceCloningEngine instance, created once at import time
# and reused by process_tts for every generation request.
engine = VoiceCloningEngine()

def process_tts(text, ref_audio, exaggeration, cfg_weight, temperature, seed, progress=gr.Progress()):
    """
    Main TTS processing function connecting the UI with the VoiceCloningEngine.

    Args:
        text: Script to synthesize. ``None``, empty, or whitespace-only input
            is rejected with an error status.
        ref_audio: Reference voice clip as supplied by the UI; ``None`` is
            rejected with an error status.
        exaggeration: Forwarded unchanged to ``engine.generate``.
        cfg_weight: Forwarded unchanged to ``engine.generate``.
        temperature: Forwarded unchanged to ``engine.generate``.
        seed: Forwarded to ``engine.generate``. ``None`` is replaced by 0,
            which the UI documents as "random seed each time".
        progress: Gradio progress tracker, handed to the engine as
            ``progress_callback``.

    Returns:
        A ``(output_path, status_message)`` tuple. ``output_path`` is ``None``
        whenever validation fails or the engine raises.
    """
    # Guard against None as well as blank text: calling .strip() on None would
    # raise outside the try block and bypass the friendly status message.
    if not text or not text.strip():
        return None, "Error: Please enter a script."
    if ref_audio is None:
        return None, "Error: Please upload a reference audio clip."

    # An emptied Seed field is delivered as None; fall back to 0, the value the
    # UI advertises for "random seed".
    # NOTE(review): assumes the engine treats 0 as "pick a random seed" — confirm in engine.py.
    if seed is None:
        seed = 0

    try:
        # Call the engine with the Gradio Progress callback
        output_path, used_seed = engine.generate(
            text=text,
            ref_audio=ref_audio,
            exaggeration=exaggeration,
            cfg_weight=cfg_weight,
            temperature=temperature,
            seed=seed,
            progress_callback=progress
        )
    except Exception as e:
        # UI boundary: report any engine failure in the status box rather than
        # crashing the request, but keep the full traceback in the server log.
        import traceback
        traceback.print_exc()
        return None, f"Error: {str(e)}"

    return output_path, f"Successfully generated audio with seed {used_seed}."

# UI Layout and Configuration
def create_ui():
    """
    Assemble the Gradio Blocks interface and return it without launching.

    The layout is a two-column row: the left column collects the script, the
    reference voice clip and the generation settings; the right column shows
    the generated audio, a status line and a short documentation panel. The
    "Generate Speech" button is wired to ``process_tts``.
    """
    with gr.Blocks(title="Voice Cloning TTS Chatterbox", theme=gr.themes.Soft()) as app:
        gr.Markdown("# 🗣️ Voice Cloning TTS Engine")
        gr.Markdown("""
        **A high-performance voice cloning application powered by Chatterbox TTS.** 
        Optimized for long scripts with intelligent chunking, context preservation, and smooth concatenation.
        """)

        with gr.Row():
            # Left column: script, reference voice and generation settings
            with gr.Column(scale=1):
                script_box = gr.Textbox(
                    value="Welcome to the modular voice cloning application. By separating the core processing engine into its own file, we ensure cleaner code and better scalability. This tool automatically handles long texts, ensuring that your narration is smooth and continuous across multiple sentences.",
                    lines=10,
                    placeholder="Paste your long script here. The engine automatically splits it at sentence boundaries for smooth narration...",
                    label="Script",
                )
                voice_sample = gr.Audio(
                    sources=["upload", "microphone"],
                    type="filepath",
                    label="Reference Voice (Voice to Clone)",
                )

                with gr.Row():
                    exaggeration_slider = gr.Slider(
                        minimum=0.1,
                        maximum=1.0,
                        step=0.05,
                        value=0.5,
                        label="Exaggeration",
                        info="Intensity of cloned voice traits. Default 0.5. Warning: >0.8 can be unstable.",
                    )
                    cfg_slider = gr.Slider(
                        minimum=0.0,
                        maximum=1.0,
                        step=0.05,
                        value=0.5,
                        label="CFG/Pace",
                        info="Balance between text adherence and reference voice speed.",
                    )

                with gr.Accordion("Advanced Options", open=False):
                    seed_field = gr.Number(
                        value=0,
                        precision=0,
                        label="Seed",
                        info="Set to 0 for a random seed each time.",
                    )
                    temperature_slider = gr.Slider(
                        minimum=0.1,
                        maximum=2.0,
                        step=0.05,
                        value=1.0,
                        label="Temperature",
                        info="Higher values increase expressiveness and randomness.",
                    )

                run_button = gr.Button("Generate Speech", variant="primary")

            # Right column: generated audio, status line and documentation
            with gr.Column(scale=1):
                result_player = gr.Audio(type="filepath", label="Generated Speech")
                status_box = gr.Textbox(interactive=False, label="Status")

                gr.Markdown("### 📖 Documentation")
                gr.Markdown("""
                ### Features
                - **Modular Engine**: The `VoiceCloningEngine` in `engine.py` handles all core processing, making the app easier to maintain.
                - **Intelligent Chunking**: Scripts are automatically split at sentence boundaries (~250 chars) for stability.
                - **Context Preservation**: Audio segments are concatenated smoothly for long-form narration.
                
                ### Deployment & Secrets
                - **Secrets Management**: If your app requires API keys, set them in the **Hugging Face Space Secrets** and access them via `os.getenv()`.
                - **GPU Recommended**: This app runs best on a T4 or L4 GPU Space.
                """)

        # Input order must match the positional parameters of process_tts
        tts_inputs = [
            script_box,
            voice_sample,
            exaggeration_slider,
            cfg_slider,
            temperature_slider,
            seed_field,
        ]
        run_button.click(
            process_tts,
            inputs=tts_inputs,
            outputs=[result_player, status_box],
        )

    return app

if __name__ == "__main__":
    # Build the interface and serve it on all network interfaces; the explicit
    # server_name is kept for Hugging Face compatibility.
    create_ui().launch(server_name="0.0.0.0")