Vo Hoang Minh committed on
Commit ·
fe1b1fb
1
Parent(s): 47c696c
up
Browse files
- dockerbox/render/Dockerfile +4 -2
- pkg/tts_run.py +33 -35
dockerbox/render/Dockerfile
CHANGED
|
@@ -4,7 +4,9 @@ RUN apt-get update && \
|
|
| 4 |
apt-get install -y wget unzip ffmpeg fonts-nanum && \
|
| 5 |
rm -rf /var/lib/apt/lists/*
|
| 6 |
|
|
|
|
| 7 |
WORKDIR /app
|
| 8 |
-
COPY requirements.txt .
|
| 9 |
|
| 10 |
-
RUN pip install --no-cache-dir
|
|
|
|
|
|
|
|
|
| 4 |
apt-get install -y wget unzip ffmpeg fonts-nanum && \
|
| 5 |
rm -rf /var/lib/apt/lists/*
|
| 6 |
|
| 7 |
+
|
| 8 |
WORKDIR /app
|
|
|
|
| 9 |
|
| 10 |
+
RUN pip install --no-cache-dir requests pandas
|
| 11 |
+
|
| 12 |
+
|
pkg/tts_run.py
CHANGED
|
@@ -5,26 +5,23 @@ from functools import lru_cache
|
|
| 5 |
import re
|
| 6 |
import tempfile
|
| 7 |
from cachetools import TTLCache, cached
|
| 8 |
-
from cachetools.keys import hashkey
|
| 9 |
import edge_tts
|
| 10 |
import pysubs2
|
| 11 |
import requests
|
| 12 |
|
| 13 |
-
# Cache with a max size of 128 items and a TTL of 2 hours
|
| 14 |
cache = TTLCache(maxsize=128, ttl=7200)
|
| 15 |
-
|
| 16 |
VOICES = ['en-US', 'ja-JP', 'ko-KR', 'zh-CN', 'vi-VN']
|
| 17 |
|
| 18 |
def config_subs():
|
| 19 |
subs = pysubs2.SSAFile()
|
| 20 |
s = subs.styles["Default"]
|
| 21 |
s.fontname = "Nanum Barun Gothic"
|
| 22 |
-
s.fontsize =
|
| 23 |
-
s.outline =
|
| 24 |
s.shadow = 1
|
| 25 |
s.bold = True
|
| 26 |
s.alignment = 2
|
| 27 |
-
s.marginv =
|
| 28 |
subs.info["WrapStyle"] = 1
|
| 29 |
return subs
|
| 30 |
|
|
@@ -37,6 +34,14 @@ def get_voices():
|
|
| 37 |
return asyncio.run(list_voices())
|
| 38 |
|
| 39 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
def text_gdoc(text: str) -> str:
|
| 41 |
if not re.search(r"docs\.google\.com", text):
|
| 42 |
return text
|
|
@@ -50,7 +55,6 @@ def decode_base64(text: str):
|
|
| 50 |
return ""
|
| 51 |
return base64.b64decode(text, validate=True).decode("utf-8")
|
| 52 |
|
| 53 |
-
|
| 54 |
@cached(cache)
|
| 55 |
def render_audio(text, voice, rate, pitch):
|
| 56 |
sub_data = []
|
|
@@ -63,7 +67,6 @@ def render_audio(text, voice, rate, pitch):
|
|
| 63 |
audio_chunk, subs_chunk = await tts_chunk(chunk, voice_short, rate, pitch)
|
| 64 |
audio_buf.extend(audio_chunk)
|
| 65 |
sub_data.extend(subs_chunk)
|
| 66 |
-
|
| 67 |
return bytes(audio_buf), sub_data
|
| 68 |
|
| 69 |
return asyncio.run(run())
|
|
@@ -78,14 +81,10 @@ async def tts_comunicate(chunk, voice, rate, pitch):
|
|
| 78 |
async for c in comm.stream():
|
| 79 |
if c["type"] == "audio":
|
| 80 |
audio_buf.extend(c["data"])
|
| 81 |
-
|
| 82 |
elif c["type"] == "WordBoundary":
|
| 83 |
start = int(c["offset"] / 10_000)
|
| 84 |
end = start + int(c["duration"] / 10_000)
|
| 85 |
words_data.append((start, end, c["text"]))
|
| 86 |
-
|
| 87 |
-
print(f"Progress: {c.get('text', '')}", end="\r")
|
| 88 |
-
|
| 89 |
return audio_buf, words_data
|
| 90 |
|
| 91 |
async def tts_chunk(chunk, voice, rate, pitch):
|
|
@@ -98,17 +97,15 @@ async def tts_chunk(chunk, voice, rate, pitch):
|
|
| 98 |
subs = []
|
| 99 |
for line in grouped_lines:
|
| 100 |
word_count = len(line.split())
|
| 101 |
-
|
| 102 |
if idx + word_count - 1 >= len(words_data):
|
| 103 |
last_word = words_data[-1]
|
| 104 |
subs.append({
|
| 105 |
"start": last_word[0],
|
| 106 |
"end": last_word[1],
|
| 107 |
"text": line,
|
| 108 |
-
"words": words_data[idx:]
|
| 109 |
})
|
| 110 |
break
|
| 111 |
-
|
| 112 |
start = words_data[idx][0]
|
| 113 |
end = words_data[idx + word_count - 1][1]
|
| 114 |
subs.append({"start": start, "end": end, "text": line, "words": words_data[idx:idx + word_count]})
|
|
@@ -148,32 +145,25 @@ def group_words(words, max_length=15):
|
|
| 148 |
return groups
|
| 149 |
|
| 150 |
def create_word_highlight(words, current_start, current_end):
|
| 151 |
-
"""Tạo text với chỉ 1 từ được highlight"""
|
| 152 |
parts = []
|
| 153 |
for start, end, word in words:
|
| 154 |
if start == current_start and end == current_end:
|
| 155 |
-
# Từ đang đọc - màu vàng
|
| 156 |
parts.append(f"{{\\c&H00FFFF&}}{word}{{\\c&HFFFFFF&}}")
|
| 157 |
else:
|
| 158 |
-
# Từ khác - màu trắng
|
| 159 |
parts.append(word)
|
| 160 |
return " ".join(parts)
|
| 161 |
|
| 162 |
def add_karaoke_subtitles(subs, words):
|
| 163 |
if not words:
|
| 164 |
return
|
| 165 |
-
|
| 166 |
for i, (start, end, word) in enumerate(words):
|
| 167 |
event_start = start
|
| 168 |
-
if i < len(words) - 1
|
| 169 |
-
event_end = words[i + 1][0]
|
| 170 |
-
else:
|
| 171 |
-
event_end = end
|
| 172 |
-
|
| 173 |
highlighted = create_word_highlight(words, start, end)
|
| 174 |
subs.append(pysubs2.SSAEvent(start=event_start, end=event_end, text=highlighted))
|
| 175 |
|
| 176 |
-
|
|
|
|
| 177 |
text = text.strip()
|
| 178 |
if not text:
|
| 179 |
raise ValueError("Please enter text")
|
|
@@ -194,7 +184,7 @@ def speech(text, voice, rate, pitch):
|
|
| 194 |
add_karaoke_subtitles(subs, sub["words"])
|
| 195 |
else:
|
| 196 |
subs.append(pysubs2.SSAEvent(start=sub["start"], end=sub["end"], text=sub["text"]))
|
| 197 |
-
|
| 198 |
srt_content = subs.to_string("ass")
|
| 199 |
|
| 200 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as audio_file:
|
|
@@ -202,30 +192,38 @@ def speech(text, voice, rate, pitch):
|
|
| 202 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".txt", mode="w", encoding="utf-8") as srt_file:
|
| 203 |
srt_file.write(srt_content)
|
| 204 |
|
| 205 |
-
|
| 206 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 207 |
|
|
|
|
| 208 |
|
|
|
|
|
|
|
|
|
|
| 209 |
voices = get_voices()
|
| 210 |
with gr.Blocks() as demo:
|
| 211 |
with gr.Row():
|
| 212 |
with gr.Column(scale=3):
|
| 213 |
gr.Markdown("# Text to Speech")
|
|
|
|
| 214 |
with gr.Column(scale=1):
|
| 215 |
submit_btn = gr.Button("RUN", variant="primary")
|
| 216 |
|
|
|
|
|
|
|
|
|
|
| 217 |
with gr.Accordion("Advanced Options", open=False):
|
| 218 |
rate_in = gr.Slider(-100, 100, value=0, label="Speech Rate (%)", step=10)
|
| 219 |
pitch_in = gr.Slider(-100, 100, value=0, label="Pitch (Hz)", step=10)
|
|
|
|
| 220 |
|
| 221 |
submit_btn.click(
|
| 222 |
fn=speech,
|
| 223 |
-
inputs=[
|
| 224 |
-
gr.Textbox(label="Input", lines=5, info="You can enter text or a Google Docs link", placeholder="Enter text or Google Docs link here..."),
|
| 225 |
-
gr.Dropdown(choices=list(voices.keys()), label="Voice", value=list(voices.keys())[0]),
|
| 226 |
-
rate_in,
|
| 227 |
-
pitch_in
|
| 228 |
-
],
|
| 229 |
outputs=[
|
| 230 |
gr.Audio(label="Generated Audio", autoplay=False, show_download_button=True, type="filepath"),
|
| 231 |
gr.File(label="ASS Subtitle", interactive=False),
|
|
@@ -233,4 +231,4 @@ with gr.Blocks() as demo:
|
|
| 233 |
)
|
| 234 |
|
| 235 |
if __name__ == "__main__":
|
| 236 |
-
demo.launch(show_api=False, show_error=True)
|
|
|
|
| 5 |
import re
|
| 6 |
import tempfile
|
| 7 |
from cachetools import TTLCache, cached
|
|
|
|
| 8 |
import edge_tts
|
| 9 |
import pysubs2
|
| 10 |
import requests
|
| 11 |
|
|
|
|
| 12 |
cache = TTLCache(maxsize=128, ttl=7200)
|
|
|
|
| 13 |
VOICES = ['en-US', 'ja-JP', 'ko-KR', 'zh-CN', 'vi-VN']
|
| 14 |
|
| 15 |
def config_subs():
|
| 16 |
subs = pysubs2.SSAFile()
|
| 17 |
s = subs.styles["Default"]
|
| 18 |
s.fontname = "Nanum Barun Gothic"
|
| 19 |
+
s.fontsize = 40
|
| 20 |
+
s.outline = 2
|
| 21 |
s.shadow = 1
|
| 22 |
s.bold = True
|
| 23 |
s.alignment = 2
|
| 24 |
+
s.marginv = 54
|
| 25 |
subs.info["WrapStyle"] = 1
|
| 26 |
return subs
|
| 27 |
|
|
|
|
| 34 |
return asyncio.run(list_voices())
|
| 35 |
|
| 36 |
|
| 37 |
+
try:
|
| 38 |
+
r = requests.get("http://ip-api.com/json/", timeout=5)
|
| 39 |
+
data = r.json()
|
| 40 |
+
ip_info = f"{data.get('query','')} | {data.get('country','')} | {data.get('city','')}"
|
| 41 |
+
except Exception as e:
|
| 42 |
+
ip_info = "unknown"
|
| 43 |
+
|
| 44 |
+
|
| 45 |
def text_gdoc(text: str) -> str:
|
| 46 |
if not re.search(r"docs\.google\.com", text):
|
| 47 |
return text
|
|
|
|
| 55 |
return ""
|
| 56 |
return base64.b64decode(text, validate=True).decode("utf-8")
|
| 57 |
|
|
|
|
| 58 |
@cached(cache)
|
| 59 |
def render_audio(text, voice, rate, pitch):
|
| 60 |
sub_data = []
|
|
|
|
| 67 |
audio_chunk, subs_chunk = await tts_chunk(chunk, voice_short, rate, pitch)
|
| 68 |
audio_buf.extend(audio_chunk)
|
| 69 |
sub_data.extend(subs_chunk)
|
|
|
|
| 70 |
return bytes(audio_buf), sub_data
|
| 71 |
|
| 72 |
return asyncio.run(run())
|
|
|
|
| 81 |
async for c in comm.stream():
|
| 82 |
if c["type"] == "audio":
|
| 83 |
audio_buf.extend(c["data"])
|
|
|
|
| 84 |
elif c["type"] == "WordBoundary":
|
| 85 |
start = int(c["offset"] / 10_000)
|
| 86 |
end = start + int(c["duration"] / 10_000)
|
| 87 |
words_data.append((start, end, c["text"]))
|
|
|
|
|
|
|
|
|
|
| 88 |
return audio_buf, words_data
|
| 89 |
|
| 90 |
async def tts_chunk(chunk, voice, rate, pitch):
|
|
|
|
| 97 |
subs = []
|
| 98 |
for line in grouped_lines:
|
| 99 |
word_count = len(line.split())
|
|
|
|
| 100 |
if idx + word_count - 1 >= len(words_data):
|
| 101 |
last_word = words_data[-1]
|
| 102 |
subs.append({
|
| 103 |
"start": last_word[0],
|
| 104 |
"end": last_word[1],
|
| 105 |
"text": line,
|
| 106 |
+
"words": words_data[idx:]
|
| 107 |
})
|
| 108 |
break
|
|
|
|
| 109 |
start = words_data[idx][0]
|
| 110 |
end = words_data[idx + word_count - 1][1]
|
| 111 |
subs.append({"start": start, "end": end, "text": line, "words": words_data[idx:idx + word_count]})
|
|
|
|
| 145 |
return groups
|
| 146 |
|
| 147 |
def create_word_highlight(words, current_start, current_end):
|
|
|
|
| 148 |
parts = []
|
| 149 |
for start, end, word in words:
|
| 150 |
if start == current_start and end == current_end:
|
|
|
|
| 151 |
parts.append(f"{{\\c&H00FFFF&}}{word}{{\\c&HFFFFFF&}}")
|
| 152 |
else:
|
|
|
|
| 153 |
parts.append(word)
|
| 154 |
return " ".join(parts)
|
| 155 |
|
| 156 |
def add_karaoke_subtitles(subs, words):
|
| 157 |
if not words:
|
| 158 |
return
|
|
|
|
| 159 |
for i, (start, end, word) in enumerate(words):
|
| 160 |
event_start = start
|
| 161 |
+
event_end = words[i + 1][0] if i < len(words) - 1 else end
|
|
|
|
|
|
|
|
|
|
|
|
|
| 162 |
highlighted = create_word_highlight(words, start, end)
|
| 163 |
subs.append(pysubs2.SSAEvent(start=event_start, end=event_end, text=highlighted))
|
| 164 |
|
| 165 |
+
|
| 166 |
+
def speech(text, voice, rate, pitch, webhook=None):
|
| 167 |
text = text.strip()
|
| 168 |
if not text:
|
| 169 |
raise ValueError("Please enter text")
|
|
|
|
| 184 |
add_karaoke_subtitles(subs, sub["words"])
|
| 185 |
else:
|
| 186 |
subs.append(pysubs2.SSAEvent(start=sub["start"], end=sub["end"], text=sub["text"]))
|
| 187 |
+
|
| 188 |
srt_content = subs.to_string("ass")
|
| 189 |
|
| 190 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as audio_file:
|
|
|
|
| 192 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".txt", mode="w", encoding="utf-8") as srt_file:
|
| 193 |
srt_file.write(srt_content)
|
| 194 |
|
| 195 |
+
if webhook:
|
| 196 |
+
data = {
|
| 197 |
+
"audio": audio_file.name,
|
| 198 |
+
"subtitle": srt_file.name
|
| 199 |
+
}
|
| 200 |
+
requests.post(webhook, json=data, timeout=5)
|
| 201 |
|
| 202 |
+
return audio_file.name, srt_file.name
|
| 203 |
|
| 204 |
+
# --------------------------
|
| 205 |
+
# Gradio UI
|
| 206 |
+
# --------------------------
|
| 207 |
voices = get_voices()
|
| 208 |
with gr.Blocks() as demo:
|
| 209 |
with gr.Row():
|
| 210 |
with gr.Column(scale=3):
|
| 211 |
gr.Markdown("# Text to Speech")
|
| 212 |
+
gr.Markdown(ip_info) # hiển thị IP gọn ngay khi load
|
| 213 |
with gr.Column(scale=1):
|
| 214 |
submit_btn = gr.Button("RUN", variant="primary")
|
| 215 |
|
| 216 |
+
input_text = gr.Textbox(label="Input", lines=5, info="You can enter text or a Google Docs link", placeholder="Enter text or Google Docs link here...")
|
| 217 |
+
voice_dropdown = gr.Dropdown(choices=list(voices.keys()), label="Voice", value=list(voices.keys())[0])
|
| 218 |
+
|
| 219 |
with gr.Accordion("Advanced Options", open=False):
|
| 220 |
rate_in = gr.Slider(-100, 100, value=0, label="Speech Rate (%)", step=10)
|
| 221 |
pitch_in = gr.Slider(-100, 100, value=0, label="Pitch (Hz)", step=10)
|
| 222 |
+
webhook_in = gr.Textbox(label="Webhook URL (optional)", placeholder="Enter webhook URL if you want the output sent there...")
|
| 223 |
|
| 224 |
submit_btn.click(
|
| 225 |
fn=speech,
|
| 226 |
+
inputs=[input_text, voice_dropdown, rate_in, pitch_in, webhook_in],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 227 |
outputs=[
|
| 228 |
gr.Audio(label="Generated Audio", autoplay=False, show_download_button=True, type="filepath"),
|
| 229 |
gr.File(label="ASS Subtitle", interactive=False),
|
|
|
|
| 231 |
)
|
| 232 |
|
| 233 |
if __name__ == "__main__":
|
| 234 |
+
demo.launch(show_api=False, show_error=True)
|