bgtts / app.py
englissi's picture
Update app.py
9ed0b14 verified
import gradio as gr
import PyPDF2
from transformers import AutoProcessor, AutoModel
import torch
import numpy as np
import nltk
# Download the NLTK data used for sentence splitting.
nltk.download('punkt')
nltk.download('punkt_tab')
# 1. Load the Hugging Face Bark model and its processor.
model_id = "suno/bark-small"
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id)
# Run on the GPU when CUDA is available, otherwise stay on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
def extract_text_from_pdf(pdf_path):
    """Pull the text out of a PDF file (Gradio 5.x compatible).

    Args:
        pdf_path: Location of the PDF, handed straight to
            ``PyPDF2.PdfReader``. ``None`` means no file was supplied.

    Returns:
        The text of every page that yielded any, each followed by a
        newline; ``""`` when ``pdf_path`` is ``None``; or a message
        describing the failure when reading the PDF raised an exception.
    """
    if pdf_path is None:
        return ""

    try:
        # Recent Gradio versions pass the uploaded file's temporary path
        # (a string) directly, so it can go to the reader unchanged.
        document = PyPDF2.PdfReader(pdf_path)
        per_page = (page.extract_text() for page in document.pages)
        # Pages with no extractable text are skipped.
        return "".join(page_text + "\n" for page_text in per_page if page_text)
    except Exception as exc:
        # Failures are reported as a message string rather than raised.
        return f"PDF ์ฝ๊ธฐ ์˜ค๋ฅ˜: {str(exc)}"
# Voice presets to try for each gender, most preferred first.
# The original code requested Bulgarian ("bg") presets (author's note: speaker 0
# is closer to a female tone, speaker 1 to a male tone). Bark's published
# speaker library does not list Bulgarian voices, and an unknown preset name
# makes the processor look for a local "<name>.npz" file and fail.
# A Russian (Cyrillic-script) voice is therefore kept as a fallback.
# NOTE(review): ru_speaker_3 / ru_speaker_5 are listed as male / female in
# Bark's speaker library -- confirm by ear.
_MALE_VOICE_PRESETS = ("v2/bg_speaker_1", "v2/ru_speaker_3")
_FEMALE_VOICE_PRESETS = ("v2/bg_speaker_0", "v2/ru_speaker_5")


def _resolve_voice_preset(candidates):
    """Return the first preset in *candidates* that the loaded processor ships."""
    shipped = getattr(processor, "speaker_embeddings", None)
    if not isinstance(shipped, dict):
        # Preset table cannot be inspected: keep the preferred name untouched.
        return candidates[0]
    # None means "no preset", which lets the model use its default voice.
    return next((name for name in candidates if name in shipped), None)


def synthesize_speech(text, gender):
    """Convert text to speech, one sentence at a time.

    Args:
        text: Text to read aloud; it is split into sentences with
            ``nltk.sent_tokenize``.
        gender: Voice selector. The exact label "๋‚จ์„ฑ (Male)" picks the male
            presets; any other value picks the female presets.

    Returns:
        ``(sample_rate, audio)`` where ``audio`` is the concatenation of the
        per-sentence waveforms and ``sample_rate`` comes from
        ``model.generation_config``; ``None`` when no non-blank sentence was
        found.
    """
    if gender == "๋‚จ์„ฑ (Male)":
        candidates = _MALE_VOICE_PRESETS
    else:
        candidates = _FEMALE_VOICE_PRESETS
    # Resolve once so every sentence is spoken with the same voice and an
    # unavailable preset cannot abort the whole request.
    voice_preset = _resolve_voice_preset(candidates)

    audio_chunks = []
    for sentence in nltk.sent_tokenize(text):
        if not sentence.strip():
            continue
        inputs = processor(
            sentence, voice_preset=voice_preset, return_tensors="pt"
        ).to(device)
        # Inference only: no gradients are needed.
        with torch.no_grad():
            speech_output = model.generate(**inputs)
        # Keep the first generated waveform, moved to host memory as NumPy.
        audio_chunks.append(speech_output[0].cpu().numpy())

    if not audio_chunks:
        return None

    final_audio = np.concatenate(audio_chunks)
    sample_rate = model.generation_config.sample_rate
    return (sample_rate, final_audio)
def process_input(text, pdf_file, gender):
    """Main controller: choose the input text, cap its length, synthesize it.

    Args:
        text: Text typed by the user; ``None`` is treated as empty.
        pdf_file: PDF to read, passed to ``extract_text_from_pdf``, or
            ``None``. Non-blank text from the PDF takes precedence over
            ``text``.
        gender: Voice selector forwarded to ``synthesize_speech``.

    Returns:
        ``(message, None)`` when there is nothing to read, otherwise
        ``(spoken_text, audio)`` where ``spoken_text`` is the first 1500
        characters of the chosen text and ``audio`` is whatever
        ``synthesize_speech`` returned for it.

    NOTE(review): ``extract_text_from_pdf`` reports read failures as a
    message string, which cannot be told apart from extracted text here.
    """
    # Guard against a missing text value so .strip() below cannot fail.
    text = text or ""

    if pdf_file is not None:
        extracted = extract_text_from_pdf(pdf_file)
        if extracted.strip():
            text = extracted

    if not text.strip():
        return "ํ…์ŠคํŠธ๋ฅผ ์ง์ ‘ ์ž…๋ ฅํ•˜๊ฑฐ๋‚˜ PDF ํŒŒ์ผ์„ ์—…๋กœ๋“œํ•ด์ฃผ์„ธ์š”.", None

    # Character cap to keep the server from being overloaded.
    limited_text = text[:1500]
    audio = synthesize_speech(limited_text, gender)
    return limited_text, audio
# 2. Build the Gradio UI (latest Blocks syntax)
with gr.Blocks(theme=gr.themes.Soft()) as app:
gr.Markdown("# ๐Ÿ‡ง๐Ÿ‡ฌ ๋ถˆ๊ฐ€๋ฆฌ์•„์–ด TTS ๋ฆฌ๋” (์‹ ๋ฌธ/PDF)")
gr.Markdown("๋ถˆ๊ฐ€๋ฆฌ์•„์–ด ํ…์ŠคํŠธ๋‚˜ PDF ๊ธฐ์‚ฌ๋ฅผ ์ž…๋ ฅํ•˜๋ฉด ์ง€์ •ํ•œ ์„ฑ๋ณ„์˜ ์Œ์„ฑ์œผ๋กœ ์ฝ์–ด์ค๋‹ˆ๋‹ค.")
with gr.Row():
with gr.Column():
pdf_input = gr.File(label="PDF ํŒŒ์ผ ์—…๋กœ๋“œ (์„ ํƒ)", file_types=[".pdf"])
text_input = gr.Textbox(label="๋ถˆ๊ฐ€๋ฆฌ์•„์–ด ํ…์ŠคํŠธ ์ง์ ‘ ์ž…๋ ฅ", lines=8, placeholder="์—ฌ๊ธฐ์— ๋ถˆ๊ฐ€๋ฆฌ์•„์–ด ๊ธฐ์‚ฌ ๋‚ด์šฉ์„ ์ž…๋ ฅํ•˜์„ธ์š”...")
gender_input = gr.Radio(["๋‚จ์„ฑ (Male)", "์—ฌ์„ฑ (Female)"], label="๋ชฉ์†Œ