Spaces:

rwine
/

testpublic

Runtime error

File size: 1,959 Bytes

fe8a53b
 
db98300
fe8a53b
8cef076
db98300
 
 
fe8a53b
 
 
 
 
 
 
 
8cef076
 
 
 
a4bf808
 
fe8a53b
a4bf808
 
f3d3f37
db98300
 
 
8cef076
fe8a53b
db98300
fe8a53b
db98300
8cef076
db98300
 
fe8a53b
 
 
db98300

import subprocess
import sys
import gradio as gr
from inference import Mars5TTS, InferenceConfig
import librosa
import torch
import numpy as np

# Install the dependencies listed in requirements.txt at startup. A failed
# install is reported on stdout but does not stop the script.
# NOTE(review): this runs after the third-party imports at the top of the
# file, so it cannot rescue a package that is missing on first start —
# confirm whether that ordering is intended.
try:
    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", "-r", "requirements.txt"]
    )
except subprocess.CalledProcessError as e:
    print(f"Failed to install requirements.txt: {e}")
else:
    print("Successfully installed requirements.txt")

# Clear the CUDA cache before the model is loaded (skipped without CUDA).
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Load the MARS5 TTS model and build the inference configuration that
# clone_with_prosody uses. A load failure is logged and then re-raised,
# because the app cannot work without the model.
try:
    mars5 = Mars5TTS.from_pretrained("CAMB-AI/MARS5-TTS")
    config = InferenceConfig(temperature=0.7)
except Exception as e:
    print(f"Model loading error: {str(e)}")
    raise

def clone_with_prosody(text, ref_audio, enhance_prosody=True):
    """Synthesize ``text`` in the voice of ``ref_audio`` and return the WAV path.

    Args:
        text: Text to convert to speech.
        ref_audio: Reference recording. A string is treated as a file path
            and loaded with librosa at 16 kHz; any other value is handed to
            the model unchanged.
        enhance_prosody: When true, the module-level ``config`` is passed to
            the model; otherwise ``None`` is passed as the config.

    Returns:
        Path of the generated audio file (``output_cloned_audio.wav``).

    Raises:
        gr.Error: If the text or reference audio is missing, or if loading
            or synthesis fails. The output component is ``gr.Audio``, which
            interprets a returned string as a file path, so failures are
            raised for Gradio to display instead of being returned as text.
    """
    # Reject missing inputs up front with a message the user can act on.
    if not text or not text.strip():
        raise gr.Error("Error: no text was provided")
    if ref_audio is None:
        raise gr.Error("Error: no reference audio was provided")

    try:
        if isinstance(ref_audio, str):
            # Only the samples are needed; the sample rate is fixed to 16 kHz.
            audio_data, _ = librosa.load(ref_audio, sr=16000)
        else:
            audio_data = ref_audio

        # NOTE(review): the keyword names below (ref_sr, config, language) and
        # the .save() call on the result are not visible in this file —
        # confirm them against inference.Mars5TTS.tts.
        output_audio = mars5.tts(
            text=text,
            ref_audio=audio_data,
            ref_sr=16000,
            config=config if enhance_prosody else None,
            language="ko"
        )

        output_path = "output_cloned_audio.wav"
        output_audio.save(output_path)
    except Exception as e:
        raise gr.Error(f"Error: {str(e)}") from e

    return output_path

def _upload_only_audio_kwargs():
    """Return the ``gr.Audio`` keyword that limits input to file upload.

    Gradio 3.x takes ``source="upload"``; Gradio 4+ renamed the parameter to
    ``sources`` (a list) and rejects the old keyword. The installed
    component's signature decides which spelling is used, so the app builds
    on either major version.
    """
    import inspect  # local import: only needed for this one-off check

    audio_params = inspect.signature(gr.Audio.__init__).parameters
    if "sources" in audio_params:
        return {"sources": ["upload"]}
    return {"source": "upload"}


# Web UI: text + reference recording + prosody toggle in, cloned audio out.
interface = gr.Interface(
    fn=clone_with_prosody,
    inputs=[
        gr.Textbox(label="Text to Convert", placeholder="Enter text to convert to speech"),
        gr.Audio(
            label="Reference Audio (Your Voice)",
            type="filepath",
            **_upload_only_audio_kwargs(),
        ),
        gr.Checkbox(label="Enhance Prosody (Intonation/Rhythm)", value=True)
    ],
    outputs=gr.Audio(label="Cloned Voice Output"),
    title="MARS5 Voice Cloner with Prosody",
    description="Upload a 3-5 second audio of your voice and enter text to clone your voice with prosody (intonation, rhythm, emotion).",
    allow_flagging="never"
)

interface.launch()