Spaces:

minhvh
/

tool

Sleeping

App Files Community

Vo Hoang Minh committed on Oct 4, 2025

Commit

fe1b1fb

1 Parent(s): 47c696c

up

Browse files

Files changed (2) hide show

dockerbox/render/Dockerfile +4 -2
pkg/tts_run.py +33 -35

dockerbox/render/Dockerfile CHANGED Viewed

@@ -4,7 +4,9 @@ RUN apt-get update && \
     apt-get install -y wget unzip ffmpeg fonts-nanum && \
     rm -rf /var/lib/apt/lists/*
 WORKDIR /app
-COPY requirements.txt .
-RUN pip install --no-cache-dir -r requirements.txt

     apt-get install -y wget unzip ffmpeg fonts-nanum && \
     rm -rf /var/lib/apt/lists/*
 WORKDIR /app
+RUN pip install --no-cache-dir requests pandas

pkg/tts_run.py CHANGED Viewed

@@ -5,26 +5,23 @@ from functools import lru_cache
 import re
 import tempfile
 from cachetools import TTLCache, cached
-from cachetools.keys import hashkey
 import edge_tts
 import pysubs2
 import requests
-# Cache with a max size of 128 items and a TTL of 2 hours
 cache = TTLCache(maxsize=128, ttl=7200)
 VOICES = ['en-US', 'ja-JP', 'ko-KR', 'zh-CN', 'vi-VN']
 def config_subs():
     subs = pysubs2.SSAFile()
     s = subs.styles["Default"]
     s.fontname = "Nanum Barun Gothic"
-    s.fontsize = 36
-    s.outline = 3
     s.shadow = 1
     s.bold = True
     s.alignment = 2
-    s.marginv = 52
     subs.info["WrapStyle"] = 1
     return subs
@@ -37,6 +34,14 @@ def get_voices():
     return asyncio.run(list_voices())
 def text_gdoc(text: str) -> str:
     if not re.search(r"docs\.google\.com", text):
         return text
@@ -50,7 +55,6 @@ def decode_base64(text: str):
         return ""
     return base64.b64decode(text, validate=True).decode("utf-8")
 @cached(cache)
 def render_audio(text, voice, rate, pitch):
     sub_data = []
@@ -63,7 +67,6 @@ def render_audio(text, voice, rate, pitch):
             audio_chunk, subs_chunk = await tts_chunk(chunk, voice_short, rate, pitch)
             audio_buf.extend(audio_chunk)
             sub_data.extend(subs_chunk)
         return bytes(audio_buf), sub_data
     return asyncio.run(run())
@@ -78,14 +81,10 @@ async def tts_comunicate(chunk, voice, rate, pitch):
     async for c in comm.stream():
         if c["type"] == "audio":
             audio_buf.extend(c["data"])
         elif c["type"] == "WordBoundary":
             start = int(c["offset"] / 10_000)
             end = start + int(c["duration"] / 10_000)
             words_data.append((start, end, c["text"]))
-        print(f"Progress: {c.get('text', '')}", end="\r")
     return audio_buf, words_data
 async def tts_chunk(chunk, voice, rate, pitch):
@@ -98,17 +97,15 @@ async def tts_chunk(chunk, voice, rate, pitch):
     subs = []
     for line in grouped_lines:
         word_count = len(line.split())
         if idx + word_count - 1 >= len(words_data):
             last_word = words_data[-1]
             subs.append({
                 "start": last_word[0],
                 "end": last_word[1],
                 "text": line,
-                "words": words_data[idx:]  # lấy các từ còn lại
             })
             break
         start = words_data[idx][0]
         end = words_data[idx + word_count - 1][1]
         subs.append({"start": start, "end": end, "text": line, "words": words_data[idx:idx + word_count]})
@@ -148,32 +145,25 @@ def group_words(words, max_length=15):
     return groups
 def create_word_highlight(words, current_start, current_end):
-    """Tạo text với chỉ 1 từ được highlight"""
     parts = []
     for start, end, word in words:
         if start == current_start and end == current_end:
-            # Từ đang đọc - màu vàng
             parts.append(f"{{\\c&H00FFFF&}}{word}{{\\c&HFFFFFF&}}")
         else:
-            # Từ khác - màu trắng
             parts.append(word)
     return " ".join(parts)
 def add_karaoke_subtitles(subs, words):
     if not words:
         return
     for i, (start, end, word) in enumerate(words):
         event_start = start
-        if i < len(words) - 1:
-            event_end = words[i + 1][0]
-        else:
-            event_end = end
         highlighted = create_word_highlight(words, start, end)
         subs.append(pysubs2.SSAEvent(start=event_start, end=event_end, text=highlighted))
-def speech(text, voice, rate, pitch):
     text = text.strip()
     if not text:
         raise ValueError("Please enter text")
@@ -194,7 +184,7 @@ def speech(text, voice, rate, pitch):
             add_karaoke_subtitles(subs, sub["words"])
         else:
             subs.append(pysubs2.SSAEvent(start=sub["start"], end=sub["end"], text=sub["text"]))
     srt_content = subs.to_string("ass")
     with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as audio_file:
@@ -202,30 +192,38 @@ def speech(text, voice, rate, pitch):
     with tempfile.NamedTemporaryFile(delete=False, suffix=".txt", mode="w", encoding="utf-8") as srt_file:
         srt_file.write(srt_content)
-    return audio_file.name, srt_file.name
 voices = get_voices()
 with gr.Blocks() as demo:
     with gr.Row():
         with gr.Column(scale=3):
             gr.Markdown("# Text to Speech")
         with gr.Column(scale=1):
             submit_btn = gr.Button("RUN", variant="primary")
     with gr.Accordion("Advanced Options", open=False):
         rate_in = gr.Slider(-100, 100, value=0, label="Speech Rate (%)", step=10)
         pitch_in = gr.Slider(-100, 100, value=0, label="Pitch (Hz)", step=10)
     submit_btn.click(
         fn=speech,
-        inputs=[
-            gr.Textbox(label="Input", lines=5, info="You can enter text or a Google Docs link", placeholder="Enter text or Google Docs link here..."),
-            gr.Dropdown(choices=list(voices.keys()), label="Voice", value=list(voices.keys())[0]),
-            rate_in,
-            pitch_in
-        ],
         outputs=[
             gr.Audio(label="Generated Audio", autoplay=False, show_download_button=True, type="filepath"),
             gr.File(label="ASS Subtitle", interactive=False),
@@ -233,4 +231,4 @@ with gr.Blocks() as demo:
     )
 if __name__ == "__main__":
-    demo.launch(show_api=False, show_error=True)

 import re
 import tempfile
 from cachetools import TTLCache, cached
 import edge_tts
 import pysubs2
 import requests
 cache = TTLCache(maxsize=128, ttl=7200)
 VOICES = ['en-US', 'ja-JP', 'ko-KR', 'zh-CN', 'vi-VN']
 def config_subs():
     subs = pysubs2.SSAFile()
     s = subs.styles["Default"]
     s.fontname = "Nanum Barun Gothic"
+    s.fontsize = 40
+    s.outline = 2
     s.shadow = 1
     s.bold = True
     s.alignment = 2
+    s.marginv = 54
     subs.info["WrapStyle"] = 1
     return subs
     return asyncio.run(list_voices())
+try:
+    r = requests.get("http://ip-api.com/json/", timeout=5)
+    data = r.json()
+    ip_info = f"{data.get('query','')} | {data.get('country','')} | {data.get('city','')}"
+except Exception as e:
+    ip_info = "unknown"
 def text_gdoc(text: str) -> str:
     if not re.search(r"docs\.google\.com", text):
         return text
         return ""
     return base64.b64decode(text, validate=True).decode("utf-8")
 @cached(cache)
 def render_audio(text, voice, rate, pitch):
     sub_data = []
             audio_chunk, subs_chunk = await tts_chunk(chunk, voice_short, rate, pitch)
             audio_buf.extend(audio_chunk)
             sub_data.extend(subs_chunk)
         return bytes(audio_buf), sub_data
     return asyncio.run(run())
     async for c in comm.stream():
         if c["type"] == "audio":
             audio_buf.extend(c["data"])
         elif c["type"] == "WordBoundary":
             start = int(c["offset"] / 10_000)
             end = start + int(c["duration"] / 10_000)
             words_data.append((start, end, c["text"]))
     return audio_buf, words_data
 async def tts_chunk(chunk, voice, rate, pitch):
     subs = []
     for line in grouped_lines:
         word_count = len(line.split())
         if idx + word_count - 1 >= len(words_data):
             last_word = words_data[-1]
             subs.append({
                 "start": last_word[0],
                 "end": last_word[1],
                 "text": line,
+                "words": words_data[idx:]
             })
             break
         start = words_data[idx][0]
         end = words_data[idx + word_count - 1][1]
         subs.append({"start": start, "end": end, "text": line, "words": words_data[idx:idx + word_count]})
     return groups
 def create_word_highlight(words, current_start, current_end):
     parts = []
     for start, end, word in words:
         if start == current_start and end == current_end:
             parts.append(f"{{\\c&H00FFFF&}}{word}{{\\c&HFFFFFF&}}")
         else:
             parts.append(word)
     return " ".join(parts)
 def add_karaoke_subtitles(subs, words):
     if not words:
         return
     for i, (start, end, word) in enumerate(words):
         event_start = start
+        event_end = words[i + 1][0] if i < len(words) - 1 else end
         highlighted = create_word_highlight(words, start, end)
         subs.append(pysubs2.SSAEvent(start=event_start, end=event_end, text=highlighted))
+def speech(text, voice, rate, pitch, webhook=None):
     text = text.strip()
     if not text:
         raise ValueError("Please enter text")
             add_karaoke_subtitles(subs, sub["words"])
         else:
             subs.append(pysubs2.SSAEvent(start=sub["start"], end=sub["end"], text=sub["text"]))
     srt_content = subs.to_string("ass")
     with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as audio_file:
     with tempfile.NamedTemporaryFile(delete=False, suffix=".txt", mode="w", encoding="utf-8") as srt_file:
         srt_file.write(srt_content)
+    if webhook:
+        data = {
+            "audio": audio_file.name,
+            "subtitle": srt_file.name
+        }
+        requests.post(webhook, json=data, timeout=5)
+    return audio_file.name, srt_file.name
+# --------------------------
+# Gradio UI
+# --------------------------
 voices = get_voices()
 with gr.Blocks() as demo:
     with gr.Row():
         with gr.Column(scale=3):
             gr.Markdown("# Text to Speech")
+            gr.Markdown(ip_info)  # show the compact IP info as soon as the page loads
         with gr.Column(scale=1):
             submit_btn = gr.Button("RUN", variant="primary")
+    input_text = gr.Textbox(label="Input", lines=5, info="You can enter text or a Google Docs link", placeholder="Enter text or Google Docs link here...")
+    voice_dropdown = gr.Dropdown(choices=list(voices.keys()), label="Voice", value=list(voices.keys())[0])
     with gr.Accordion("Advanced Options", open=False):
         rate_in = gr.Slider(-100, 100, value=0, label="Speech Rate (%)", step=10)
         pitch_in = gr.Slider(-100, 100, value=0, label="Pitch (Hz)", step=10)
+        webhook_in = gr.Textbox(label="Webhook URL (optional)", placeholder="Enter webhook URL if you want the output sent there...")
     submit_btn.click(
         fn=speech,
+        inputs=[input_text, voice_dropdown, rate_in, pitch_in, webhook_in],
         outputs=[
             gr.Audio(label="Generated Audio", autoplay=False, show_download_button=True, type="filepath"),
             gr.File(label="ASS Subtitle", interactive=False),
     )
 if __name__ == "__main__":
+    demo.launch(show_api=False, show_error=True)