# youtube_auto_image1 / logic_tts.py
# Last update by PLXR (commit 7f424a1, verified)
import io
import wave
import re
def _chunk_long_paragraph(paragraph, limit):
    """Break one over-long paragraph into chunks at sentence boundaries.

    Sentences (delimited by ./?/! followed by whitespace) are greedily
    packed into a buffer until adding the next one would exceed *limit*;
    a single sentence longer than *limit* is kept whole.
    """
    pieces = []
    buffer = ""
    for sentence in re.split(r'(?<=[.?!])\s+', paragraph):
        if not sentence.strip():
            continue
        if len(buffer) + len(sentence) > limit:
            # Buffer is full: flush it and start over with this sentence.
            if buffer:
                pieces.append(buffer.strip())
            buffer = sentence
        else:
            buffer += " " + sentence
    # Flush whatever remains after the last sentence.
    if buffer:
        pieces.append(buffer.strip())
    return pieces


def split_text_smartly(text, limit=500):
    """Split *text* into scene-sized chunks.

    1. Newlines always start a new chunk (they reflect user intent).
    2. A paragraph is further split on sentence punctuation only when it
       exceeds *limit* characters.

    Args:
        text: Full input text, possibly multi-line.
        limit: Maximum desired chunk length in characters.

    Returns:
        List of non-empty, stripped text chunks.
    """
    chunks = []
    for paragraph in (p.strip() for p in text.split('\n')):
        if not paragraph:
            continue
        if len(paragraph) <= limit:
            # Short enough: the whole paragraph is one scene.
            chunks.append(paragraph)
        else:
            chunks.extend(_chunk_long_paragraph(paragraph, limit))
    return chunks
def raw_pcm_to_wav(pcm_data, sample_rate=24000):
    """Wrap raw 16-bit mono PCM bytes in a WAV container.

    Gemini TTS returns headerless PCM; browsers need a proper WAV file,
    so this prepends the RIFF/WAVE header.

    Args:
        pcm_data: Raw little-endian 16-bit mono PCM samples.
        sample_rate: Sampling rate in Hz (Gemini TTS emits 24 kHz).

    Returns:
        WAV file contents as bytes, or None if conversion failed.
    """
    try:
        out = io.BytesIO()
        with wave.open(out, "wb") as wav:
            # (nchannels, sampwidth, framerate, nframes, comptype, compname)
            # -> mono, 2 bytes per sample (16-bit); nframes is fixed up by
            #    wave on close, so 0 is fine here.
            wav.setparams((1, 2, sample_rate, 0, "NONE", "not compressed"))
            wav.writeframes(pcm_data)
        return out.getvalue()
    except Exception as exc:
        print(f"PCM to WAV Error: {exc}")
        return None
def generate_speech_chunk(client, model_id, text, voice_name):
    """Synthesize a short text with the Gemini TTS API (preview use).

    Args:
        client: Gemini API client (google-genai style, with .models).
        model_id: TTS-capable model identifier.
        text: Text to speak.
        voice_name: Prebuilt voice to use.

    Returns:
        Raw audio bytes from the first response part's inline data,
        or None if the request failed or produced no audio.
    """
    tts_config = {
        "response_modalities": ["AUDIO"],
        "speech_config": {
            "voice_config": {
                "prebuilt_voice_config": {"voice_name": voice_name}
            }
        },
    }
    try:
        response = client.models.generate_content(
            model=model_id,
            contents=text,
            config=tts_config,
        )
        # Pull the audio payload out of the first candidate, if any.
        candidates = response.candidates
        if candidates and candidates[0].content.parts:
            first_part = candidates[0].content.parts[0]
            if first_part.inline_data:
                return first_part.inline_data.data
        return None
    except Exception as e:
        print(f"TTS Chunk Error: {e}")
        return None
def process_tts_task(index, text, client, model_id, voice_name):
    """Worker function for parallel TTS generation.

    Returns an ``(index, pcm_or_None)`` tuple so the caller can
    reassemble concurrently-produced chunks in their original order.
    The audio is returned as raw PCM (no WAV header) so that multiple
    chunks can later be concatenated into one file.
    """
    pcm = generate_speech_chunk(client, model_id, text, voice_name)
    return (index, pcm) if pcm else (index, None)
def merge_wav_bytes(audio_chunks_pcm, sample_rate=24000):
    """Join multiple raw PCM chunks into one long WAV file.

    Falsy entries (e.g. None from failed TTS calls) are skipped.

    Args:
        audio_chunks_pcm: Iterable of raw PCM byte chunks (or None).
        sample_rate: Sampling rate of the PCM data in Hz.

    Returns:
        Combined WAV file bytes, or None if merging failed.
    """
    try:
        combined = b"".join(chunk for chunk in audio_chunks_pcm if chunk)
        return raw_pcm_to_wav(combined, sample_rate)
    except Exception as exc:
        print(f"Merge Error: {exc}")
        return None