File size: 2,164 Bytes
6ddd7e4
3f3d89a
b5b5082
 
 
a8f789a
7890f41
 
7b8a54e
d2078f6
3c03d8e
7b8a54e
ad0617c
d2078f6
b5b5082
72db052
d2078f6
b5b5082
d2078f6
b5b5082
7890f41
a8f789a
 
 
 
 
ea7eb85
 
 
 
 
 
 
 
 
 
b5b5082
d2078f6
b5b5082
 
 
58068be
 
b5b5082
 
 
 
 
 
e30854e
 
 
b5b5082
 
d2078f6
 
b5b5082
 
 
d2078f6
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import spaces
import os
import sys
sys.path.append("neutts-air")
from neuttsair.neutts import NeuTTSAir
import numpy as np
import gradio as gr

# Default demo inputs. The sample directory is resolved against the current
# working directory, so the app is expected to be started from the folder
# that contains the "neutts-air" checkout.
_NEUTTS_DIR = os.path.join(os.getcwd(), "neutts-air")
SAMPLES_PATH = os.path.join(_NEUTTS_DIR, "samples")

# Reference clip and its transcript (pre-filled in the UI).
DEFAULT_REF_PATH = os.path.join(SAMPLES_PATH, "dave.wav")
DEFAULT_REF_TEXT = (
    "So I'm live on radio. And I say, well, my dear friend James here clearly, "
    "and the whole room just froze. Turns out I'd completely misspoken and "
    "mentioned our other friend."
)

# Text that is synthesized when the user does not change anything.
DEFAULT_GEN_TEXT = "My name is Dave, and um, I'm from London."

# --- Force CPU usage ---
# Both the backbone and the codec are pinned to the CPU; the single shared
# instance is created once at import time and reused by every request.
_TTS_SETTINGS = {
    "backbone_repo": "neuphonic/neutts-air",
    "codec_repo": "neuphonic/neucodec",
    "backbone_device": "cpu",
    "codec_device": "cpu",
}
tts = NeuTTSAir(**_TTS_SETTINGS)

def infer(
    ref_text: str,
    ref_audio_path: str,
    gen_text: str,
) -> tuple[int, np.ndarray]:
    """
    Generates speech using NeuTTS-Air given a reference audio and text, and new text to synthesize.

    Args:
        ref_text (str): The text corresponding to the reference audio.
        ref_audio_path (str): The file path to the reference audio.
        gen_text (str): The new text to synthesize.
    Returns:
        tuple [int, np.ndarray]: A tuple containing the sample rate (24000) and the generated audio waveform as a numpy array.
    Raises:
        gr.Error: If no reference audio is supplied or the text to synthesize is empty.
    """
    # Fail fast with a message the UI can display instead of letting a
    # missing input surface as an opaque traceback from inside the model.
    # NOTE(review): the audio input is expected to arrive as None/empty when
    # the user clears the widget -- confirm against the gradio version in use.
    if not ref_audio_path:
        raise gr.Error("Please provide a reference audio file.")
    if not gen_text or not gen_text.strip():
        raise gr.Error("Please enter the text to generate.")

    gr.Info("Starting inference request (CPU mode)!")
    gr.Info("Encoding reference...")
    ref_codes = tts.encode_reference(ref_audio_path)

    gr.Info(f"Generating audio for input text: {gen_text}")
    wav = tts.infer(gen_text, ref_codes, ref_text)

    # Sample rate is fixed at 24 kHz, paired with the waveform for gr.Audio.
    return (24_000, wav)

# Input widgets, listed in the same order as the parameters of `infer`.
_ref_text_input = gr.Textbox(value=DEFAULT_REF_TEXT, label="Reference Text")
_ref_audio_input = gr.Audio(value=DEFAULT_REF_PATH, label="Reference Audio", type="filepath")
_gen_text_input = gr.Textbox(value=DEFAULT_GEN_TEXT, label="Text to Generate")

# Output widget: receives the (sample_rate, waveform) tuple returned by `infer`.
_speech_output = gr.Audio(label="Generated Speech", type="numpy")

demo = gr.Interface(
    title="NeuTTS-Air☁️ (CPU Mode)",
    description="Upload a reference audio sample, provide the reference text, and enter new text to synthesize (running on CPU).",
    fn=infer,
    inputs=[_ref_text_input, _ref_audio_input, _gen_text_input],
    outputs=_speech_output,
)

if __name__ == "__main__":
    # Start the app only when run as a script. The samples directory is passed
    # as an allowed path so the default reference clip can be served.
    demo.launch(
        inbrowser=True,
        mcp_server=True,
        allowed_paths=[SAMPLES_PATH],
    )