# testpublic / app.py
# rwine's picture
# Update app.py
# fe8a53b verified
# raw
# history blame
# 1.96 kB
import subprocess
import sys
import gradio as gr
from inference import Mars5TTS, InferenceConfig
import librosa
import torch
import numpy as np
# Make sure the packages listed in requirements.txt are installed.
_pip_install_cmd = [sys.executable, "-m", "pip", "install", "-r", "requirements.txt"]
try:
    subprocess.check_call(_pip_install_cmd)
except subprocess.CalledProcessError as install_err:
    # Best effort: the failure is only reported, start-up continues.
    print(f"Failed to install requirements.txt: {install_err}")
else:
    print("Successfully installed requirements.txt")

# Release cached GPU memory when a CUDA device is present.
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Load the MARS5 TTS model and build the inference configuration that
# clone_with_prosody uses. A failure here is logged and then re-raised, so
# the app does not start without a model.
try:
    mars5 = Mars5TTS.from_pretrained("CAMB-AI/MARS5-TTS")
    config = InferenceConfig(temperature=0.7)
except Exception as load_err:
    print(f"Model loading error: {load_err!s}")
    raise
def clone_with_prosody(text, ref_audio, enhance_prosody=True):
    """Synthesize ``text`` in the voice of a reference recording.

    This is the Gradio callback: it loads the reference clip, runs the
    module-level ``mars5`` model on it and writes the result to a WAV file.

    Args:
        text: Text to convert to speech.
        ref_audio: Path of the reference recording, or already-loaded audio
            data, which is forwarded to the model unchanged.
        enhance_prosody: When true, the module-level ``config`` is passed to
            the model; otherwise ``None`` is passed.

    Returns:
        Path of the WAV file that was written.

    Raises:
        gr.Error: If the text is blank, no reference audio was supplied, or
            loading/synthesis fails. Raising (rather than returning the
            message) keeps an error string from being handed to the audio
            output component as if it were a file path.
    """
    # Validate outside the try block so these messages are not re-wrapped.
    if not text or not text.strip():
        raise gr.Error("Error: no text was provided")
    if ref_audio is None:
        raise gr.Error("Error: no reference audio was provided")

    # Single source of truth for the sample rate used both when loading the
    # clip and when describing it to the model.
    ref_sr = 16000
    output_path = "output_cloned_audio.wav"

    try:
        if isinstance(ref_audio, str):
            audio_data, _ = librosa.load(ref_audio, sr=ref_sr)
        else:
            # NOTE(review): non-path input is assumed to already be sampled
            # at ref_sr; nothing here resamples it. Confirm with callers.
            audio_data = ref_audio

        # NOTE(review): the keyword names accepted by mars5.tts and the
        # .save() method on its result are used as found; verify them
        # against the project's `inference` module.
        output_audio = mars5.tts(
            text=text,
            ref_audio=audio_data,
            ref_sr=ref_sr,
            config=config if enhance_prosody else None,
            language="ko",
        )
        output_audio.save(output_path)
    except Exception as e:
        raise gr.Error(f"Error: {e}") from e

    return output_path
# Input widgets, listed in the positional order of clone_with_prosody's
# parameters (text, ref_audio, enhance_prosody).
text_box = gr.Textbox(
    label="Text to Convert",
    placeholder="Enter text to convert to speech",
)
# NOTE(review): `source=` is the Gradio 3.x spelling; newer releases expect
# `sources=[...]`. Confirm against the Gradio version pinned for this app.
voice_sample = gr.Audio(
    label="Reference Audio (Your Voice)",
    type="filepath",
    source="upload",
)
prosody_toggle = gr.Checkbox(
    label="Enhance Prosody (Intonation/Rhythm)",
    value=True,
)
cloned_voice = gr.Audio(label="Cloned Voice Output")

# Wire the widgets to the callback; flagging is turned off.
interface = gr.Interface(
    fn=clone_with_prosody,
    inputs=[text_box, voice_sample, prosody_toggle],
    outputs=cloned_voice,
    title="MARS5 Voice Cloner with Prosody",
    description="Upload a 3-5 second audio of your voice and enter text to clone your voice with prosody (intonation, rhythm, emotion).",
    allow_flagging="never",
)

# Start the web UI as soon as the module is executed.
interface.launch()