Spaces:
Runtime error
Runtime error
File size: 2,350 Bytes
db98300 d71be36 8cef076 db98300 8cef076 a4bf808 db98300 8cef076 db98300 8cef076 db98300 8cef076 db98300 8cef076 db98300 8cef076 db98300 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 |
import gradio as gr
import librosa
import numpy as np
import torch

from inference import InferenceConfig, Mars5TTS
# Release cached GPU memory before loading the model (guards against
# out-of-memory on the free hardware tier).
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Load the MARS5 TTS model.
# Both names are bound up front so that a failed load leaves a well-defined
# "not available" state (None) instead of unbound globals that would surface
# later as an unrelated NameError inside the request handler.
mars5 = None
config = None
try:
    mars5 = Mars5TTS.from_pretrained("CAMB-AI/MARS5-TTS")
    # Sampling temperature used when prosody enhancement is requested.
    config = InferenceConfig(temperature=0.7)
except Exception as e:
    # Best effort: keep the app importable so the UI can still start and
    # report the problem; the handler must check for a missing model.
    print(f"Model loading error: {e}")
def clone_with_prosody(text, ref_audio, enhance_prosody=True):
    """
    Clone the speaking style of a reference recording onto new text.

    :param text: text to convert to speech
    :param ref_audio: reference audio whose voice is cloned (3-5 seconds or
        longer recommended); a file path is loaded with librosa at 16 kHz,
        any other value is handed to the model unchanged
    :param enhance_prosody: when true, pass the module-level prosody config
        to the model; otherwise pass ``config=None``
    :return: path of the generated audio file
    :raises gr.Error: if an input is missing, the model is not loaded, or
        synthesis fails
    """
    # Reject unusable requests up front so the user gets a precise message
    # instead of a failure from deep inside the model.
    if not text or not text.strip():
        raise gr.Error("Please enter the text to convert to speech.")
    if ref_audio is None:
        raise gr.Error("Please upload a reference audio clip.")

    # Look the model objects up by name: startup may have failed before
    # binding them, and a bare NameError here would hide the real cause.
    model = globals().get("mars5")
    if model is None:
        raise gr.Error("The MARS5 model is not loaded; check the startup logs.")
    # Falls back to None if the config could not be built at startup.
    prosody_config = globals().get("config") if enhance_prosody else None

    try:
        if isinstance(ref_audio, str):
            # Gradio supplied a file path: decode it at 16 kHz.
            audio_data, _ = librosa.load(ref_audio, sr=16000)
        else:
            # NOTE(review): assumes a non-path value is already a 16 kHz
            # waveform array -- confirm against how callers supply it.
            audio_data = ref_audio

        # NOTE(review): the keyword names below and the `.save()` method on
        # the result are kept from the original code; the `inference` module
        # is not visible here. The published MARS5-TTS README appears to use
        # `tts(text, wav, ref_transcript, cfg=...)` returning a tuple --
        # confirm against the actual `Mars5TTS.tts` signature.
        output_audio = model.tts(
            text=text,
            ref_audio=audio_data,
            ref_sr=16000,  # matches the rate used when loading above
            config=prosody_config,
            language="ko",  # Korean
        )

        # Persist the result and hand the path back to Gradio.
        output_path = "output_cloned_audio.wav"
        output_audio.save(output_path)
        return output_path
    except Exception as e:
        # Surface the failure through Gradio's error channel; returning a
        # message string would be treated as an audio file path by the
        # gr.Audio output component.
        raise gr.Error(f"Error: {e}") from e
# Gradio interface setup.
# The reference-audio input is built separately because the keyword that
# restricts it to file uploads differs between Gradio major versions.
try:
    _reference_audio = gr.Audio(
        label="Reference Audio (Your Voice)",
        type="filepath",
        source="upload",
    )
except TypeError:
    # Gradio 4+ replaced the `source` keyword with a `sources` list.
    _reference_audio = gr.Audio(
        label="Reference Audio (Your Voice)",
        type="filepath",
        sources=["upload"],
    )

interface = gr.Interface(
    fn=clone_with_prosody,
    inputs=[
        gr.Textbox(label="Text to Convert", placeholder="Enter text to convert to speech"),
        _reference_audio,
        gr.Checkbox(label="Enhance Prosody (Intonation/Rhythm)", value=True),
    ],
    outputs=gr.Audio(label="Cloned Voice Output"),
    title="MARS5 Voice Cloner with Prosody",
    description="Upload a 3-5 second audio of your voice and enter text to clone your voice with prosody (intonation, rhythm, emotion).",
    allow_flagging="never",
)

# Start the app.
interface.launch()