Spaces:
Runtime error
Runtime error
| import gradio as gr | |
| from inference import Mars5TTS, InferenceConfig as config_class | |
| import librosa | |
| import torch | |
| import numpy as np | |
# Drop any cached CUDA allocations before the model is loaded, to leave as
# much GPU memory as possible available for it.
_cuda_available = torch.cuda.is_available()
if _cuda_available:
    torch.cuda.empty_cache()

# Load the pretrained MARS5 TTS model once at import time; it is shared by
# every request handled by clone_with_prosody().
_MODEL_REPO = "CAMB-AI/MARS5-TTS"
mars5 = Mars5TTS.from_pretrained(_MODEL_REPO)

# Inference settings used when prosody enhancement is requested; only the
# sampling temperature is overridden here.
_PROSODY_TEMPERATURE = 0.7
config = config_class(temperature=_PROSODY_TEMPERATURE)
def clone_with_prosody(text, ref_audio, enhance_prosody=True):
    """Synthesize ``text`` in the voice of ``ref_audio`` and save it as a WAV.

    Args:
        text: Text to convert to speech.
        ref_audio: Recording of the voice to clone (a clip of roughly 3-5
            seconds or longer is recommended). Accepted forms:
            a file path (what ``gr.Audio(type="filepath")`` supplies),
            a Gradio ``(sample_rate, samples)`` tuple, or any other object,
            which is forwarded to the model unchanged and is expected to
            already be sampled at 16 kHz.
        enhance_prosody: When true, the module-level ``config`` is passed to
            the model; when false, ``None`` is passed instead.

    Returns:
        Path of the WAV file the synthesized audio was written to.

    Raises:
        gr.Error: If ``text`` is empty, ``ref_audio`` is missing, or loading /
            synthesis fails. Raising (rather than returning an error string)
            lets Gradio show the message to the user instead of trying to
            play the string back as an audio file.
    """
    target_sr = 16000  # sample rate the reference audio is normalized to

    # Validate up front so the user gets a clear message instead of a model
    # stack trace.
    if text is None or not str(text).strip():
        raise gr.Error("Please enter the text to convert to speech.")
    if ref_audio is None:
        raise gr.Error("Please provide a reference audio clip.")

    try:
        if isinstance(ref_audio, str):
            # File path: let librosa decode and resample in one step.
            audio_data, _ = librosa.load(ref_audio, sr=target_sr)
        elif isinstance(ref_audio, tuple):
            # Gradio's numpy audio format is (sample_rate, samples).
            source_sr, samples = ref_audio
            samples = np.asarray(samples)
            if np.issubdtype(samples.dtype, np.integer):
                # Integer PCM -> float in [-1, 1], which librosa expects.
                full_scale = np.iinfo(samples.dtype).max
                samples = samples.astype(np.float32) / full_scale
            else:
                samples = samples.astype(np.float32)
            if samples.ndim > 1:
                # Multi-channel clips arrive as (samples, channels); mix down.
                samples = samples.mean(axis=1)
            audio_data = librosa.resample(
                samples, orig_sr=source_sr, target_sr=target_sr
            )
        else:
            # Anything else is passed through as-is (assumed 16 kHz already).
            audio_data = ref_audio

        # NOTE(review): the upstream MARS5 README shows
        # `tts(text, ref_audio_tensor, ref_transcript, cfg=...)` with audio
        # loaded at `mars5.sr` and a `(ar_codes, wav_tensor)` return value.
        # The keyword names below and the `.save()` call are kept from the
        # original code; confirm them against the bundled `inference.py`.
        output_audio = mars5.tts(
            text=text,
            ref_audio=audio_data,
            ref_sr=target_sr,
            config=config if enhance_prosody else None,
            language="ko",  # Korean
        )

        output_path = "output_cloned_audio.wav"
        output_audio.save(output_path)
        return output_path
    except gr.Error:
        # Already a user-facing error; do not wrap it a second time.
        raise
    except Exception as e:
        # Top-level UI boundary: surface the failure in the Gradio error
        # dialog while keeping the original traceback chained for the logs.
        raise gr.Error(f"Voice cloning failed: {e}") from e
# Build the Gradio UI. Components are created in the same order they appear
# in the form: text, reference clip, prosody toggle, then the output player.
# NOTE(review): `source=` and `allow_flagging=` are the older Gradio keyword
# spellings; newer releases use `sources=[...]` and `flagging_mode=`.
# Confirm against the Gradio version this app is pinned to.
_text_input = gr.Textbox(
    label="Text to Convert",
    placeholder="Enter text to convert to speech",
)
_reference_input = gr.Audio(
    label="Reference Audio (Your Voice)",
    type="filepath",
    source="upload",
)
_prosody_toggle = gr.Checkbox(
    label="Enhance Prosody (Intonation/Rhythm)",
    value=True,
)
_cloned_output = gr.Audio(label="Cloned Voice Output")

interface = gr.Interface(
    fn=clone_with_prosody,
    inputs=[_text_input, _reference_input, _prosody_toggle],
    outputs=_cloned_output,
    title="MARS5 Voice Cloner with Prosody",
    description="Upload a 3-5 second audio of your voice and enter text to clone your voice with prosody (intonation, rhythm, emotion).",
    allow_flagging="never",
)

# Start the app server.
interface.launch()