Vo Hoang Minh committed on
Commit
fe1b1fb
·
1 Parent(s): 47c696c
Files changed (2) hide show
  1. dockerbox/render/Dockerfile +4 -2
  2. pkg/tts_run.py +33 -35
dockerbox/render/Dockerfile CHANGED
@@ -4,7 +4,9 @@ RUN apt-get update && \
4
  apt-get install -y wget unzip ffmpeg fonts-nanum && \
5
  rm -rf /var/lib/apt/lists/*
6
 
 
7
  WORKDIR /app
8
- COPY requirements.txt .
9
 
10
- RUN pip install --no-cache-dir -r requirements.txt
 
 
 
4
  apt-get install -y wget unzip ffmpeg fonts-nanum && \
5
  rm -rf /var/lib/apt/lists/*
6
 
7
+
8
  WORKDIR /app
 
9
 
10
+ RUN pip install --no-cache-dir requests pandas
11
+
12
+
pkg/tts_run.py CHANGED
@@ -5,26 +5,23 @@ from functools import lru_cache
5
  import re
6
  import tempfile
7
  from cachetools import TTLCache, cached
8
- from cachetools.keys import hashkey
9
  import edge_tts
10
  import pysubs2
11
  import requests
12
 
13
- # Cache with a max size of 128 items and a TTL of 2 hours
14
  cache = TTLCache(maxsize=128, ttl=7200)
15
-
16
  VOICES = ['en-US', 'ja-JP', 'ko-KR', 'zh-CN', 'vi-VN']
17
 
18
  def config_subs():
19
  subs = pysubs2.SSAFile()
20
  s = subs.styles["Default"]
21
  s.fontname = "Nanum Barun Gothic"
22
- s.fontsize = 36
23
- s.outline = 3
24
  s.shadow = 1
25
  s.bold = True
26
  s.alignment = 2
27
- s.marginv = 52
28
  subs.info["WrapStyle"] = 1
29
  return subs
30
 
@@ -37,6 +34,14 @@ def get_voices():
37
  return asyncio.run(list_voices())
38
 
39
 
 
 
 
 
 
 
 
 
40
  def text_gdoc(text: str) -> str:
41
  if not re.search(r"docs\.google\.com", text):
42
  return text
@@ -50,7 +55,6 @@ def decode_base64(text: str):
50
  return ""
51
  return base64.b64decode(text, validate=True).decode("utf-8")
52
 
53
-
54
  @cached(cache)
55
  def render_audio(text, voice, rate, pitch):
56
  sub_data = []
@@ -63,7 +67,6 @@ def render_audio(text, voice, rate, pitch):
63
  audio_chunk, subs_chunk = await tts_chunk(chunk, voice_short, rate, pitch)
64
  audio_buf.extend(audio_chunk)
65
  sub_data.extend(subs_chunk)
66
-
67
  return bytes(audio_buf), sub_data
68
 
69
  return asyncio.run(run())
@@ -78,14 +81,10 @@ async def tts_comunicate(chunk, voice, rate, pitch):
78
  async for c in comm.stream():
79
  if c["type"] == "audio":
80
  audio_buf.extend(c["data"])
81
-
82
  elif c["type"] == "WordBoundary":
83
  start = int(c["offset"] / 10_000)
84
  end = start + int(c["duration"] / 10_000)
85
  words_data.append((start, end, c["text"]))
86
-
87
- print(f"Progress: {c.get('text', '')}", end="\r")
88
-
89
  return audio_buf, words_data
90
 
91
  async def tts_chunk(chunk, voice, rate, pitch):
@@ -98,17 +97,15 @@ async def tts_chunk(chunk, voice, rate, pitch):
98
  subs = []
99
  for line in grouped_lines:
100
  word_count = len(line.split())
101
-
102
  if idx + word_count - 1 >= len(words_data):
103
  last_word = words_data[-1]
104
  subs.append({
105
  "start": last_word[0],
106
  "end": last_word[1],
107
  "text": line,
108
- "words": words_data[idx:] # lấy các từ còn lại
109
  })
110
  break
111
-
112
  start = words_data[idx][0]
113
  end = words_data[idx + word_count - 1][1]
114
  subs.append({"start": start, "end": end, "text": line, "words": words_data[idx:idx + word_count]})
@@ -148,32 +145,25 @@ def group_words(words, max_length=15):
148
  return groups
149
 
150
  def create_word_highlight(words, current_start, current_end):
151
- """Tạo text với chỉ 1 từ được highlight"""
152
  parts = []
153
  for start, end, word in words:
154
  if start == current_start and end == current_end:
155
- # Từ đang đọc - màu vàng
156
  parts.append(f"{{\\c&H00FFFF&}}{word}{{\\c&HFFFFFF&}}")
157
  else:
158
- # Từ khác - màu trắng
159
  parts.append(word)
160
  return " ".join(parts)
161
 
162
  def add_karaoke_subtitles(subs, words):
163
  if not words:
164
  return
165
-
166
  for i, (start, end, word) in enumerate(words):
167
  event_start = start
168
- if i < len(words) - 1:
169
- event_end = words[i + 1][0]
170
- else:
171
- event_end = end
172
-
173
  highlighted = create_word_highlight(words, start, end)
174
  subs.append(pysubs2.SSAEvent(start=event_start, end=event_end, text=highlighted))
175
 
176
- def speech(text, voice, rate, pitch):
 
177
  text = text.strip()
178
  if not text:
179
  raise ValueError("Please enter text")
@@ -194,7 +184,7 @@ def speech(text, voice, rate, pitch):
194
  add_karaoke_subtitles(subs, sub["words"])
195
  else:
196
  subs.append(pysubs2.SSAEvent(start=sub["start"], end=sub["end"], text=sub["text"]))
197
-
198
  srt_content = subs.to_string("ass")
199
 
200
  with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as audio_file:
@@ -202,30 +192,38 @@ def speech(text, voice, rate, pitch):
202
  with tempfile.NamedTemporaryFile(delete=False, suffix=".txt", mode="w", encoding="utf-8") as srt_file:
203
  srt_file.write(srt_content)
204
 
205
- return audio_file.name, srt_file.name
206
-
 
 
 
 
207
 
 
208
 
 
 
 
209
  voices = get_voices()
210
  with gr.Blocks() as demo:
211
  with gr.Row():
212
  with gr.Column(scale=3):
213
  gr.Markdown("# Text to Speech")
 
214
  with gr.Column(scale=1):
215
  submit_btn = gr.Button("RUN", variant="primary")
216
 
 
 
 
217
  with gr.Accordion("Advanced Options", open=False):
218
  rate_in = gr.Slider(-100, 100, value=0, label="Speech Rate (%)", step=10)
219
  pitch_in = gr.Slider(-100, 100, value=0, label="Pitch (Hz)", step=10)
 
220
 
221
  submit_btn.click(
222
  fn=speech,
223
- inputs=[
224
- gr.Textbox(label="Input", lines=5, info="You can enter text or a Google Docs link", placeholder="Enter text or Google Docs link here..."),
225
- gr.Dropdown(choices=list(voices.keys()), label="Voice", value=list(voices.keys())[0]),
226
- rate_in,
227
- pitch_in
228
- ],
229
  outputs=[
230
  gr.Audio(label="Generated Audio", autoplay=False, show_download_button=True, type="filepath"),
231
  gr.File(label="ASS Subtitle", interactive=False),
@@ -233,4 +231,4 @@ with gr.Blocks() as demo:
233
  )
234
 
235
  if __name__ == "__main__":
236
- demo.launch(show_api=False, show_error=True)
 
5
  import re
6
  import tempfile
7
  from cachetools import TTLCache, cached
 
8
  import edge_tts
9
  import pysubs2
10
  import requests
11
 
 
12
  cache = TTLCache(maxsize=128, ttl=7200)
 
13
  VOICES = ['en-US', 'ja-JP', 'ko-KR', 'zh-CN', 'vi-VN']
14
 
15
  def config_subs():
16
  subs = pysubs2.SSAFile()
17
  s = subs.styles["Default"]
18
  s.fontname = "Nanum Barun Gothic"
19
+ s.fontsize = 40
20
+ s.outline = 2
21
  s.shadow = 1
22
  s.bold = True
23
  s.alignment = 2
24
+ s.marginv = 54
25
  subs.info["WrapStyle"] = 1
26
  return subs
27
 
 
34
  return asyncio.run(list_voices())
35
 
36
 
37
+ try:
38
+ r = requests.get("http://ip-api.com/json/", timeout=5)
39
+ data = r.json()
40
+ ip_info = f"{data.get('query','')} | {data.get('country','')} | {data.get('city','')}"
41
+ except Exception as e:
42
+ ip_info = "unknown"
43
+
44
+
45
  def text_gdoc(text: str) -> str:
46
  if not re.search(r"docs\.google\.com", text):
47
  return text
 
55
  return ""
56
  return base64.b64decode(text, validate=True).decode("utf-8")
57
 
 
58
  @cached(cache)
59
  def render_audio(text, voice, rate, pitch):
60
  sub_data = []
 
67
  audio_chunk, subs_chunk = await tts_chunk(chunk, voice_short, rate, pitch)
68
  audio_buf.extend(audio_chunk)
69
  sub_data.extend(subs_chunk)
 
70
  return bytes(audio_buf), sub_data
71
 
72
  return asyncio.run(run())
 
81
  async for c in comm.stream():
82
  if c["type"] == "audio":
83
  audio_buf.extend(c["data"])
 
84
  elif c["type"] == "WordBoundary":
85
  start = int(c["offset"] / 10_000)
86
  end = start + int(c["duration"] / 10_000)
87
  words_data.append((start, end, c["text"]))
 
 
 
88
  return audio_buf, words_data
89
 
90
  async def tts_chunk(chunk, voice, rate, pitch):
 
97
  subs = []
98
  for line in grouped_lines:
99
  word_count = len(line.split())
 
100
  if idx + word_count - 1 >= len(words_data):
101
  last_word = words_data[-1]
102
  subs.append({
103
  "start": last_word[0],
104
  "end": last_word[1],
105
  "text": line,
106
+ "words": words_data[idx:]
107
  })
108
  break
 
109
  start = words_data[idx][0]
110
  end = words_data[idx + word_count - 1][1]
111
  subs.append({"start": start, "end": end, "text": line, "words": words_data[idx:idx + word_count]})
 
145
  return groups
146
 
147
  def create_word_highlight(words, current_start, current_end):
 
148
  parts = []
149
  for start, end, word in words:
150
  if start == current_start and end == current_end:
 
151
  parts.append(f"{{\\c&H00FFFF&}}{word}{{\\c&HFFFFFF&}}")
152
  else:
 
153
  parts.append(word)
154
  return " ".join(parts)
155
 
156
  def add_karaoke_subtitles(subs, words):
157
  if not words:
158
  return
 
159
  for i, (start, end, word) in enumerate(words):
160
  event_start = start
161
+ event_end = words[i + 1][0] if i < len(words) - 1 else end
 
 
 
 
162
  highlighted = create_word_highlight(words, start, end)
163
  subs.append(pysubs2.SSAEvent(start=event_start, end=event_end, text=highlighted))
164
 
165
+
166
+ def speech(text, voice, rate, pitch, webhook=None):
167
  text = text.strip()
168
  if not text:
169
  raise ValueError("Please enter text")
 
184
  add_karaoke_subtitles(subs, sub["words"])
185
  else:
186
  subs.append(pysubs2.SSAEvent(start=sub["start"], end=sub["end"], text=sub["text"]))
187
+
188
  srt_content = subs.to_string("ass")
189
 
190
  with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as audio_file:
 
192
  with tempfile.NamedTemporaryFile(delete=False, suffix=".txt", mode="w", encoding="utf-8") as srt_file:
193
  srt_file.write(srt_content)
194
 
195
+ if webhook:
196
+ data = {
197
+ "audio": audio_file.name,
198
+ "subtitle": srt_file.name
199
+ }
200
+ requests.post(webhook, json=data, timeout=5)
201
 
202
+ return audio_file.name, srt_file.name
203
 
204
+ # --------------------------
205
+ # Gradio UI
206
+ # --------------------------
207
  voices = get_voices()
208
  with gr.Blocks() as demo:
209
  with gr.Row():
210
  with gr.Column(scale=3):
211
  gr.Markdown("# Text to Speech")
212
+ gr.Markdown(ip_info) # hiển thị IP gọn ngay khi load
213
  with gr.Column(scale=1):
214
  submit_btn = gr.Button("RUN", variant="primary")
215
 
216
+ input_text = gr.Textbox(label="Input", lines=5, info="You can enter text or a Google Docs link", placeholder="Enter text or Google Docs link here...")
217
+ voice_dropdown = gr.Dropdown(choices=list(voices.keys()), label="Voice", value=list(voices.keys())[0])
218
+
219
  with gr.Accordion("Advanced Options", open=False):
220
  rate_in = gr.Slider(-100, 100, value=0, label="Speech Rate (%)", step=10)
221
  pitch_in = gr.Slider(-100, 100, value=0, label="Pitch (Hz)", step=10)
222
+ webhook_in = gr.Textbox(label="Webhook URL (optional)", placeholder="Enter webhook URL if you want the output sent there...")
223
 
224
  submit_btn.click(
225
  fn=speech,
226
+ inputs=[input_text, voice_dropdown, rate_in, pitch_in, webhook_in],
 
 
 
 
 
227
  outputs=[
228
  gr.Audio(label="Generated Audio", autoplay=False, show_download_button=True, type="filepath"),
229
  gr.File(label="ASS Subtitle", interactive=False),
 
231
  )
232
 
233
  if __name__ == "__main__":
234
+ demo.launch(show_api=False, show_error=True)