rwine committed on
Commit
db98300
·
verified ·
1 Parent(s): 590a47e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +56 -0
app.py CHANGED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from mars5 import Mars5TTS
3
+ import torch
4
+ import numpy as np
5
+
6
+ # Load the MARS5 TTS model (from the Hugging Face Hub)
7
+ model = Mars5TTS.from_pretrained("camb-ai/mars5-tts")
8
+
9
def clone_with_prosody(text, ref_audio, enhance_prosody=True):
    """
    Synthesize ``text`` in the speaking style of a reference recording.

    :param text: Text to convert to speech.
    :param ref_audio: Reference audio whose speaking style is cloned
        (3-5 seconds or longer recommended). It is handed to ``model.tts``
        unchanged, whether it is a file path or a numpy array.
    :param enhance_prosody: Whether to emphasize prosody (intonation/rhythm).
    :return: Path of the WAV file the synthesized audio was saved to.
    :raises gr.Error: If ``text`` is empty, ``ref_audio`` is missing, or
        synthesis/saving fails.
    """
    # Reject unusable input up front so the user gets a clear message instead
    # of an opaque failure from inside the model.
    if not text or not text.strip():
        raise gr.Error("Please enter the text to convert.")
    if ref_audio is None:
        raise gr.Error("Please upload a reference audio clip.")

    output_path = "output_cloned_audio.wav"
    try:
        # NOTE(review): the keyword names below are kept exactly as in the
        # original call; confirm them against the installed mars5 package.
        output_audio = model.tts(
            text=text,
            ref_audio=ref_audio,
            prosody_enhance=enhance_prosody,  # strengthen intonation/rhythm cloning
            language="ko",  # Korean
        )
        # Save the generated audio to a file
        output_audio.save(output_path)
    except Exception as e:
        # The return value feeds an audio output component, which would treat
        # an error string as a file path; raise a Gradio error so the message
        # is shown to the user instead.
        raise gr.Error(f"Error: {str(e)}") from e
    return output_path
40
+
41
# Gradio interface setup: build each component first, then wire them together.
text_input = gr.Textbox(
    label="Text to Convert",
    placeholder="Enter text to convert to speech",
)
reference_input = gr.Audio(
    label="Reference Audio (Your Voice)",
    type="filepath",
    source="upload",
)
prosody_toggle = gr.Checkbox(
    label="Enhance Prosody (Intonation/Rhythm)",
    value=True,
)
cloned_output = gr.Audio(label="Cloned Voice Output")

interface = gr.Interface(
    fn=clone_with_prosody,
    inputs=[text_input, reference_input, prosody_toggle],
    outputs=cloned_output,
    title="MARS5 Voice Cloner with Prosody",
    description="Upload a 3-5 second audio of your voice and enter text to clone your voice with prosody (intonation, rhythm, emotion).",
    allow_flagging="never",
)

# Run the app
interface.launch()