issamTTS / app.py
sh2124's picture
Update app.py
5578cd0 verified
import gradio as gr
from pocket_tts import TTSModel
import pocket_tts
import numpy as np
# Key point: explicitly import the 'data' submodule so that pocket_tts.data is loaded (explicit import).
import pocket_tts.data
# Load the TTS model once, at import time; generate_speech() reuses this
# module-level instance on every request.
print("โณ ๋ชจ๋ธ ๋กœ๋”ฉ ์ค‘...")
model = TTSModel.load_model()
print("โœ… ๋ชจ๋ธ ๋กœ๋“œ ์™„๋ฃŒ.")

# ---------------------------------------------------
# [DEBUG] Inspect the contents of the pocket_tts.data module.
# ---------------------------------------------------
separator = "=" * 40
print("\n" + separator)
print("๐Ÿ“ฆ [DEBUG] 'pocket_tts.data' ์ฐฝ๊ณ  ๊ฐœ๋ฐฉ")

# Dump every attribute name the module exposes so it shows up in the logs.
data_contents = dir(pocket_tts.data)
print(f"์ฐฝ๊ณ  ๋‚ด์šฉ๋ฌผ: {data_contents}")

# Attribute names that might hold a voice catalogue. The first name from
# dir() that matches one of these wins (dir() order, not the order below).
voice_attr_names = ('speakers', 'voices', 'catalog', 'predefined_voices')
item_name = next(
    (attr for attr in data_contents if attr in voice_attr_names),
    None,
)

# found_voices stays None when no candidate attribute exists.
found_voices = None
if item_name is not None:
    print(f"โœ… ์œ ๋ ฅํ•œ ์šฉ์˜์ž ๋ฐœ๊ฒฌ: pocket_tts.data.{item_name}")
    found_voices = getattr(pocket_tts.data, item_name)

print(separator + "\n")
def _silence():
    """Return one second of silence (24000 zero samples at 24000 Hz)."""
    return (24000, np.zeros(24000))


def generate_speech(text):
    """Synthesize *text* with the 'alba' voice for a Gradio Audio output.

    Args:
        text: Text to speak. ``None``, an empty string, or a string that
            contains only whitespace produces silence without touching
            the model.

    Returns:
        A ``(sample_rate, samples)`` tuple. On success this is
        ``(model.sample_rate, audio.numpy())``; for blank input, or when
        generation raises, it is 24000 zero samples at 24000 Hz.
    """
    if not text or not text.strip():
        return _silence()

    # 1. Try to obtain the voice data.
    voice_state = None

    # Strategy A: if the object discovered in pocket_tts.data is a dict,
    # take its 'alba' entry.
    if isinstance(found_voices, dict) and 'alba' in found_voices:
        print("๐ŸŽ‰ ์„ฑ๊ณต! ์ฐฝ๊ณ  ๋ชฉ๋ก์—์„œ 'alba'๋ฅผ ์ฐพ์•˜์Šต๋‹ˆ๋‹ค.")
        voice_state = found_voices['alba']

    # Strategy B: nothing usable was discovered, so fall back to the
    # built-in voice name.
    if voice_state is None:
        print("โš ๏ธ ์ฐฝ๊ณ ์—์„œ ๋ฐ์ดํ„ฐ๋ฅผ ์ฐพ์ง€ ๋ชปํ•ด, ๊ธฐ๋ณธ ์ด๋ฆ„ํ‘œ('alba')๋ฅผ ์‚ฌ์šฉํ•ฉ๋‹ˆ๋‹ค.")
        voice_state = 'alba'

    try:
        if isinstance(voice_state, str):
            # Passing a bare name straight to generate_audio() is the call
            # that previously failed. Per the pocket-tts README, a voice
            # name/path must first be turned into a voice state with
            # get_state_for_audio_prompt(). The getattr guard keeps the old
            # behaviour on builds that lack that method.
            # NOTE(review): confirm against the installed pocket_tts version.
            resolve_voice = getattr(model, "get_state_for_audio_prompt", None)
            if resolve_voice is not None:
                voice_state = resolve_voice(voice_state)

        # 2. Generate the audio.
        print(f"๐Ÿ—ฃ๏ธ ์Œ์„ฑ ์ƒ์„ฑ ์‹œ๋„ (๋ฐ์ดํ„ฐ ํƒ€์ž…: {type(voice_state)})")
        audio = model.generate_audio(voice_state, text)
        return (model.sample_rate, audio.numpy())
    except Exception as e:
        # Deliberate best-effort boundary: log the failure and hand the UI
        # silence instead of surfacing the exception.
        print(f"โŒ ์ƒ์„ฑ ์‹คํŒจ: {e}")
        return _silence()
# 3. Build the UI: two Markdown notes, a text box, a button and an audio
#    player; clicking the button runs generate_speech on the text box value.
with gr.Blocks(title="๐Ÿš€ issamTTS (Data Warehouse)") as demo:
    gr.Markdown(value="## ๐Ÿ“ฆ ์ฐฝ๊ณ (Data Module) ์ˆ˜์ƒ‰ ๋ชจ๋“œ")
    gr.Markdown(
        value="Logs ํƒญ์—์„œ **'์ฐฝ๊ณ  ๋‚ด์šฉ๋ฌผ'** ๋ฆฌ์ŠคํŠธ๋ฅผ ํ™•์ธํ•ด ์ฃผ์„ธ์š”. ๊ทธ ์•ˆ์— ์ •๋‹ต์ด ์žˆ์Šต๋‹ˆ๋‹ค."
    )

    input_text = gr.Textbox(
        value="๋ฐ•์„ฑํ•œ ์ด์‚ฌ๋‹˜, ์ฐฝ๊ณ  ๋ฌธ์„ ์—ด์—ˆ์Šต๋‹ˆ๋‹ค. ์•ˆ์— ๋ฌด์—‡์ด ์žˆ์Šต๋‹ˆ๊นŒ?",
        label="ํ…์ŠคํŠธ",
    )
    btn = gr.Button(value="์ˆ˜์ƒ‰ ๋ฐ ์ƒ์„ฑ (Search & Generate)")
    output_audio = gr.Audio()

    # A single component is equivalent to a one-element list here.
    btn.click(fn=generate_speech, inputs=input_text, outputs=output_audio)


# Start the Gradio server only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()