File size: 2,350 Bytes
db98300
d71be36
8cef076
db98300
 
 
8cef076
 
 
 
 
a4bf808
 
 
 
 
db98300
 
 
 
 
 
 
 
 
 
8cef076
 
 
db98300
8cef076
db98300
8cef076
 
db98300
 
8cef076
 
 
db98300
 
8cef076
db98300
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import tempfile

import gradio as gr
import librosa
import numpy as np
import torch

from inference import Mars5TTS

# Release any cached GPU memory before loading the model (guards against
# out-of-memory errors on the free hosting tier).
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Load the MARS5 TTS model and build the inference configuration.
# Both names are pre-bound to None so that a failed load still leaves them
# defined; code that uses them can then report "model unavailable" instead of
# failing with a NameError.
mars5 = None
config = None
try:
    # Imported here rather than at the top of the file so that a missing
    # symbol is reported by the handler below instead of aborting start-up.
    from inference import InferenceConfig

    mars5 = Mars5TTS.from_pretrained("CAMB-AI/MARS5-TTS")
    # Sampling temperature used for prosody control.
    config = InferenceConfig(temperature=0.7)
except Exception as e:
    # Start-up is deliberately best-effort: log the failure and keep going so
    # the UI can still come up and show an error per request.
    print(f"Model loading error: {str(e)}")

def clone_with_prosody(text, ref_audio, enhance_prosody=True):
    """Synthesize ``text`` in the speaking style of a reference recording.

    :param text: Text to convert to speech.
    :param ref_audio: Reference recording whose speaking style is cloned.
        A string is treated as a file path and decoded at 16 kHz; any other
        value is passed to the model unchanged. A clip of at least
        3-5 seconds is recommended.
    :param enhance_prosody: When true, the module-level ``config`` is passed
        to the model; otherwise ``None`` is passed.
    :return: Path of the WAV file holding the generated audio.
    :raises gr.Error: If an input is missing, the model is not loaded, or
        synthesis fails. Raising (rather than returning the message) keeps an
        error string from being handed to the audio output as a file path.
    """
    if not text or not text.strip():
        raise gr.Error("Please enter the text to convert to speech.")
    if ref_audio is None:
        raise gr.Error("Please upload a reference audio clip.")

    # Looked up through globals() because module-level loading is best-effort
    # and may have failed before these names were bound.
    model = globals().get("mars5")
    prosody_config = globals().get("config")
    if model is None:
        raise gr.Error("The MARS5 model is not loaded; check the start-up logs.")

    try:
        if isinstance(ref_audio, str):
            # File path from the Gradio audio input: decode at 16 kHz.
            audio_data, _ = librosa.load(ref_audio, sr=16000)
        else:
            # Already-decoded audio: forwarded as-is.
            audio_data = ref_audio

        # NOTE(review): the upstream MARS5 README shows
        # `mars5.tts(text, ref_audio_tensor, ref_transcript, cfg=cfg)` returning
        # `(ar_codes, wav)`. Confirm that the `inference` module used here
        # accepts these keyword arguments and returns an object with `.save`.
        output_audio = model.tts(
            text=text,
            ref_audio=audio_data,
            ref_sr=16000,  # must match the rate used when decoding above
            config=prosody_config if enhance_prosody else None,
            language="ko",  # Korean
        )

        # Write to a unique file so concurrent requests cannot overwrite each
        # other's result.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            output_path = tmp.name
        output_audio.save(output_path)
        return output_path
    except Exception as e:
        raise gr.Error(f"Error: {str(e)}") from e

# Gradio interface setup.
#
# Gradio 4 renamed the Audio ``source`` keyword to ``sources`` (a list) and
# rejects the old spelling with a TypeError, so try the current form first and
# fall back to the legacy keyword for older releases.
try:
    reference_audio_input = gr.Audio(
        label="Reference Audio (Your Voice)",
        type="filepath",
        sources=["upload"],
    )
except TypeError:
    reference_audio_input = gr.Audio(
        label="Reference Audio (Your Voice)",
        type="filepath",
        source="upload",
    )

interface = gr.Interface(
    fn=clone_with_prosody,
    inputs=[
        gr.Textbox(label="Text to Convert", placeholder="Enter text to convert to speech"),
        reference_audio_input,
        gr.Checkbox(label="Enhance Prosody (Intonation/Rhythm)", value=True),
    ],
    outputs=gr.Audio(label="Cloned Voice Output"),
    title="MARS5 Voice Cloner with Prosody",
    description="Upload a 3-5 second audio of your voice and enter text to clone your voice with prosody (intonation, rhythm, emotion).",
    allow_flagging="never",
)

# Start the app.
interface.launch()