Spaces:

rwine
/

testpublic

Runtime error

File size: 2,350 Bytes

db98300
d71be36
8cef076
db98300
 
 
8cef076
 
 
 
 
a4bf808
 
 
 
 
db98300
 
 
 
 
 
 
 
 
 
8cef076
 
 
db98300
8cef076
db98300
8cef076
 
db98300
 
8cef076
 
 
db98300
 
8cef076
db98300

import gradio as gr
from inference import Mars5TTS
import librosa
import torch
import numpy as np

# Release any cached GPU memory before loading the model (helps avoid
# out-of-memory failures on the memory-constrained free tier).
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Pre-bind both names so that a failed load leaves them defined (as None)
# instead of surfacing later as an unrelated NameError in the request handler.
mars5 = None
config = None

# Load the MARS5 TTS model and build the inference configuration.
try:
    # InferenceConfig must be in scope before it is instantiated below; it is
    # imported inside the try so that an import problem is reported like any
    # other load failure rather than crashing the app at start-up.
    # NOTE(review): assumes `inference` exports InferenceConfig next to
    # Mars5TTS, as shown in the MARS5-TTS README - confirm for this checkout.
    from inference import InferenceConfig

    mars5 = Mars5TTS.from_pretrained("CAMB-AI/MARS5-TTS")
    config = InferenceConfig(temperature=0.7)  # temperature setting used for prosody control
except Exception as e:
    # Deliberately best-effort: keep the module importable and report why the
    # model is unavailable.
    print(f"Model loading error: {str(e)}")

def clone_with_prosody(text, ref_audio, enhance_prosody=True):
    """Synthesize ``text`` in the voice and speaking style of a reference clip.

    Args:
        text: Text to convert to speech.
        ref_audio: Reference audio whose speaking style is cloned (a clip of
            at least 3-5 seconds is recommended). Accepted forms are a file
            path, a ``(sample_rate, samples)`` tuple, or a bare sample array,
            which is taken to already be at 16 kHz.
        enhance_prosody: When true, the module-level ``config`` is passed to
            the model; otherwise ``None`` is passed as the config.

    Returns:
        Path of the WAV file holding the generated audio.

    Raises:
        gr.Error: If ``text`` is empty, ``ref_audio`` is missing, or loading
            the reference / running the model / saving the result fails. The
            output component is an audio player, so failures are raised for
            Gradio to display instead of being returned as a string that
            would be treated as a file path.
    """
    import os
    import tempfile

    # Validate before touching the model so the user gets a precise message.
    if not text or not text.strip():
        raise gr.Error("Error: no text was provided")
    if ref_audio is None:
        raise gr.Error("Error: no reference audio was provided")

    target_sr = 16000  # rate that file-based references are loaded at

    try:
        if isinstance(ref_audio, str):
            # File path (what the UI sends with type="filepath").
            audio_data, ref_sr = librosa.load(ref_audio, sr=target_sr)
        elif isinstance(ref_audio, tuple):
            # (sample_rate, samples) pair: forward the clip's real rate
            # rather than claiming it is 16 kHz.
            ref_sr, audio_data = ref_audio
        else:
            # Bare sample array: no rate information, assume the target rate.
            audio_data, ref_sr = ref_audio, target_sr

        # NOTE(review): the keyword names below and the `.save()` call on the
        # result are kept exactly as written; confirm them against the
        # installed MARS5 `tts` API.
        output_audio = mars5.tts(
            text=text,
            ref_audio=audio_data,
            ref_sr=ref_sr,
            config=config if enhance_prosody else None,
            language="ko",  # Korean
        )

        # Unique file per call so concurrent requests cannot overwrite each
        # other's result.
        fd, output_path = tempfile.mkstemp(prefix="cloned_audio_", suffix=".wav")
        os.close(fd)
        output_audio.save(output_path)
        return output_path
    except Exception as e:
        # Top-level UI boundary: surface the failure to the user, keeping the
        # original exception as the cause for the server log.
        raise gr.Error(f"Error: {e}") from e

# Gradio interface setup.
#
# The reference-clip input is upload-only. Gradio 3.x spells that
# `source="upload"`; Gradio 4+ renamed the parameter to `sources=[...]` and
# rejects the old keyword with a TypeError, which would abort start-up. Try
# the original spelling first and fall back to the newer one.
_ref_audio_kwargs = {"label": "Reference Audio (Your Voice)", "type": "filepath"}
try:
    _ref_audio_input = gr.Audio(source="upload", **_ref_audio_kwargs)
except TypeError:
    _ref_audio_input = gr.Audio(sources=["upload"], **_ref_audio_kwargs)

interface = gr.Interface(
    fn=clone_with_prosody,
    # Inputs are passed positionally to clone_with_prosody in this order:
    # text, ref_audio, enhance_prosody.
    inputs=[
        gr.Textbox(label="Text to Convert", placeholder="Enter text to convert to speech"),
        _ref_audio_input,
        gr.Checkbox(label="Enhance Prosody (Intonation/Rhythm)", value=True),
    ],
    outputs=gr.Audio(label="Cloned Voice Output"),
    title="MARS5 Voice Cloner with Prosody",
    description="Upload a 3-5 second audio of your voice and enter text to clone your voice with prosody (intonation, rhythm, emotion).",
    allow_flagging="never",
)

# Start the app.
interface.launch()