Spaces:
Sleeping
Sleeping
GitHub Actions commited on
Commit ·
1c7725b
1
Parent(s): 2886be7
Deploy from GitHub: 0bf18943d192a2812c57599f6c25bf9739d523bf
Browse files- LICENSE +21 -0
- pyproject.toml +1 -6
- src/talking_snake/__main__.py +7 -0
- src/talking_snake/app.py +165 -20
- src/talking_snake/extract.py +448 -19
- src/talking_snake/static/app.js +439 -104
- src/talking_snake/static/index.html +35 -24
- src/talking_snake/static/styles.css +292 -97
- src/talking_snake/tts.py +226 -16
LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
MIT License
|
| 2 |
+
|
| 3 |
+
Copyright (c) 2026 Luca Cappelletti
|
| 4 |
+
|
| 5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 7 |
+
in the Software without restriction, including without limitation the rights
|
| 8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 10 |
+
furnished to do so, subject to the following conditions:
|
| 11 |
+
|
| 12 |
+
The above copyright notice and this permission notice shall be included in all
|
| 13 |
+
copies or substantial portions of the Software.
|
| 14 |
+
|
| 15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 21 |
+
SOFTWARE.
|
pyproject.toml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
[project]
|
| 2 |
name = "talking-snake"
|
| 3 |
-
version = "0.1.
|
| 4 |
description = "Just a talking snake that reads PDFs and web pages aloud."
|
| 5 |
readme = "README.md"
|
| 6 |
license = { text = "MIT" }
|
|
@@ -45,11 +45,6 @@ dev = [
|
|
| 45 |
"mypy>=1.14.0",
|
| 46 |
"pre-commit>=4.0.0",
|
| 47 |
]
|
| 48 |
-
# Flash Attention for ~2x faster inference (requires CUDA 11.6+)
|
| 49 |
-
# Install separately: pip install flash-attn --no-build-isolation
|
| 50 |
-
fast = [
|
| 51 |
-
"flash-attn>=2.5.0",
|
| 52 |
-
]
|
| 53 |
|
| 54 |
[project.scripts]
|
| 55 |
talking-snake = "talking_snake.__main__:main"
|
|
|
|
| 1 |
[project]
|
| 2 |
name = "talking-snake"
|
| 3 |
+
version = "0.1.1"
|
| 4 |
description = "Just a talking snake that reads PDFs and web pages aloud."
|
| 5 |
readme = "README.md"
|
| 6 |
license = { text = "MIT" }
|
|
|
|
| 45 |
"mypy>=1.14.0",
|
| 46 |
"pre-commit>=4.0.0",
|
| 47 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
|
| 49 |
[project.scripts]
|
| 50 |
talking-snake = "talking_snake.__main__:main"
|
src/talking_snake/__main__.py
CHANGED
|
@@ -99,6 +99,13 @@ def main() -> int:
|
|
| 99 |
return 1
|
| 100 |
|
| 101 |
print("✅ TTS model loaded!")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
print()
|
| 103 |
|
| 104 |
# Create app with engine
|
|
|
|
| 99 |
return 1
|
| 100 |
|
| 101 |
print("✅ TTS model loaded!")
|
| 102 |
+
|
| 103 |
+
# Run calibration to get accurate time estimates
|
| 104 |
+
print("⏱️ Calibrating speech timing...")
|
| 105 |
+
try:
|
| 106 |
+
tts_engine.calibrate()
|
| 107 |
+
except Exception as e:
|
| 108 |
+
print(f"⚠️ Calibration failed (using defaults): {e}")
|
| 109 |
print()
|
| 110 |
|
| 111 |
# Create app with engine
|
src/talking_snake/app.py
CHANGED
|
@@ -11,12 +11,12 @@ import time
|
|
| 11 |
import uuid
|
| 12 |
from pathlib import Path
|
| 13 |
from typing import TYPE_CHECKING
|
| 14 |
-
from urllib.parse import urlparse
|
| 15 |
|
| 16 |
import httpx
|
| 17 |
import trafilatura
|
| 18 |
from fastapi import FastAPI, File, Form, HTTPException, Request, UploadFile
|
| 19 |
-
from fastapi.responses import HTMLResponse, StreamingResponse
|
| 20 |
from fastapi.staticfiles import StaticFiles
|
| 21 |
from pydantic import BaseModel
|
| 22 |
|
|
@@ -24,6 +24,7 @@ from talking_snake.extract import clean_text, extract_text, get_page_count
|
|
| 24 |
from talking_snake.tts import (
|
| 25 |
DEFAULT_CHUNK_SIZE,
|
| 26 |
LANGUAGE_VOICES,
|
|
|
|
| 27 |
MockTTSEngine,
|
| 28 |
TTSEngineProtocol,
|
| 29 |
)
|
|
@@ -52,15 +53,31 @@ class AudioJob:
|
|
| 52 |
def __init__(self, job_id: str):
|
| 53 |
self.job_id = job_id
|
| 54 |
self.audio_queue: queue.Queue[bytes | None] = queue.Queue()
|
|
|
|
| 55 |
self.started = time.time()
|
| 56 |
self.completed = False
|
|
|
|
| 57 |
self.error: str | None = None
|
| 58 |
self.sample_rate = 24000 # Default, will be set by TTS engine
|
| 59 |
self.header_sent = False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
|
| 61 |
def put_audio(self, audio_bytes: bytes) -> None:
|
| 62 |
-
"""Add audio data to the queue."""
|
| 63 |
self.audio_queue.put(audio_bytes)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
|
| 65 |
def finish(self) -> None:
|
| 66 |
"""Signal that audio generation is complete."""
|
|
@@ -117,6 +134,7 @@ class UrlRequest(BaseModel):
|
|
| 117 |
|
| 118 |
url: str
|
| 119 |
language: str = "english"
|
|
|
|
| 120 |
|
| 121 |
|
| 122 |
class TextRequest(BaseModel):
|
|
@@ -124,6 +142,7 @@ class TextRequest(BaseModel):
|
|
| 124 |
|
| 125 |
text: str
|
| 126 |
language: str = "english"
|
|
|
|
| 127 |
|
| 128 |
|
| 129 |
class EstimateResponse(BaseModel):
|
|
@@ -170,6 +189,7 @@ def create_app(tts_engine: TTSEngineProtocol | None = None) -> FastAPI:
|
|
| 170 |
app.add_api_route("/api/read-url-stream", read_url_stream, methods=["POST"])
|
| 171 |
app.add_api_route("/api/read-text-stream", read_text_stream, methods=["POST"])
|
| 172 |
app.add_api_route("/api/audio/{job_id}", stream_audio, methods=["GET"])
|
|
|
|
| 173 |
app.add_api_route("/api/languages", get_languages, methods=["GET"])
|
| 174 |
app.add_api_route("/api/device-info-stream", stream_device_info, methods=["GET"])
|
| 175 |
app.add_api_route("/api/health", health_check, methods=["GET"])
|
|
@@ -389,6 +409,9 @@ def _get_device_info() -> dict:
|
|
| 389 |
Returns:
|
| 390 |
Device type, memory usage, and model info.
|
| 391 |
"""
|
|
|
|
|
|
|
|
|
|
| 392 |
import torch
|
| 393 |
|
| 394 |
info = {
|
|
@@ -398,8 +421,20 @@ def _get_device_info() -> dict:
|
|
| 398 |
"memory_total_gb": 0,
|
| 399 |
"memory_percent": 0,
|
| 400 |
"batch_size": 1,
|
|
|
|
|
|
|
|
|
|
| 401 |
}
|
| 402 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 403 |
if torch.cuda.is_available():
|
| 404 |
props = torch.cuda.get_device_properties(0)
|
| 405 |
# Use reserved memory for more accurate GPU usage (includes PyTorch cache)
|
|
@@ -421,6 +456,15 @@ def _get_device_info() -> dict:
|
|
| 421 |
if _tts_engine is not None:
|
| 422 |
info["batch_size"] = getattr(_tts_engine, "batch_size", 1)
|
| 423 |
info["chunk_size"] = getattr(_tts_engine, "chunk_size", 800)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 424 |
|
| 425 |
return info
|
| 426 |
|
|
@@ -461,9 +505,7 @@ async def stream_device_info() -> StreamingResponse:
|
|
| 461 |
)
|
| 462 |
|
| 463 |
|
| 464 |
-
def _estimate_time(
|
| 465 |
-
text: str, seconds_per_char: float = INITIAL_SECONDS_PER_CHAR
|
| 466 |
-
) -> tuple[int, float]:
|
| 467 |
"""Estimate processing time for text.
|
| 468 |
|
| 469 |
Args:
|
|
@@ -473,6 +515,8 @@ def _estimate_time(
|
|
| 473 |
Returns:
|
| 474 |
Tuple of (chunk_count, estimated_seconds).
|
| 475 |
"""
|
|
|
|
|
|
|
| 476 |
# Count chunks (500 chars per chunk approximately)
|
| 477 |
chunk_count = max(1, len(text) // 500 + (1 if len(text) % 500 else 0))
|
| 478 |
estimated_seconds = len(text) * seconds_per_char
|
|
@@ -521,6 +565,7 @@ def _generate_audio_to_job(
|
|
| 521 |
text: str,
|
| 522 |
tts_engine: TTSEngineProtocol,
|
| 523 |
language: str = "english",
|
|
|
|
| 524 |
doc_name: str = "document",
|
| 525 |
doc_type: str = "text",
|
| 526 |
page_count: int | None = None,
|
|
@@ -536,11 +581,10 @@ def _generate_audio_to_job(
|
|
| 536 |
text: Text to synthesize.
|
| 537 |
tts_engine: TTS engine to use.
|
| 538 |
language: Language for TTS (english, chinese, japanese, korean).
|
|
|
|
| 539 |
doc_name: Name of the document being processed.
|
| 540 |
doc_type: Type of document (pdf, url, text).
|
| 541 |
page_count: Number of pages (for PDFs).
|
| 542 |
-
tts_engine: TTS engine to use.
|
| 543 |
-
language: Language for TTS (english, chinese, japanese, korean).
|
| 544 |
|
| 545 |
Yields:
|
| 546 |
SSE events for progress.
|
|
@@ -551,6 +595,10 @@ def _generate_audio_to_job(
|
|
| 551 |
if hasattr(tts_engine, "set_language"):
|
| 552 |
tts_engine.set_language(language)
|
| 553 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 554 |
# Get chunk size and batch size from engine
|
| 555 |
chunk_size = getattr(tts_engine, "chunk_size", DEFAULT_CHUNK_SIZE)
|
| 556 |
batch_size = getattr(tts_engine, "batch_size", 1)
|
|
@@ -578,9 +626,13 @@ def _generate_audio_to_job(
|
|
| 578 |
total_chunks = len(chunks) if chunks else 1
|
| 579 |
total_chars = sum(len(c) for c in chunks)
|
| 580 |
|
| 581 |
-
# Use
|
| 582 |
-
seconds_per_char = INITIAL_SECONDS_PER_CHAR
|
| 583 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 584 |
|
| 585 |
# Send initial progress event with job_id and batch info
|
| 586 |
progress_data = {
|
|
@@ -589,7 +641,7 @@ def _generate_audio_to_job(
|
|
| 589 |
"current": 0,
|
| 590 |
"total": total_chunks,
|
| 591 |
"percent": 0,
|
| 592 |
-
"estimated_remaining": estimated_total,
|
| 593 |
"batch_size": batch_size,
|
| 594 |
"doc_name": doc_name,
|
| 595 |
"doc_type": doc_type,
|
|
@@ -648,13 +700,14 @@ def _generate_audio_to_job(
|
|
| 648 |
# Signal audio generation complete
|
| 649 |
job.finish()
|
| 650 |
|
| 651 |
-
# Send completion event
|
| 652 |
total_time = time.time() - start_time
|
| 653 |
complete_data = {
|
| 654 |
"type": "complete",
|
| 655 |
"total_time": round(total_time, 1),
|
| 656 |
"chunks_processed": chunks_processed,
|
| 657 |
"batch_size": batch_size,
|
|
|
|
| 658 |
}
|
| 659 |
yield f"event: complete\ndata: {json.dumps(complete_data)}\n\n".encode()
|
| 660 |
|
|
@@ -664,6 +717,7 @@ async def stream_audio(job_id: str) -> StreamingResponse:
|
|
| 664 |
|
| 665 |
This endpoint streams the raw WAV audio as it's being generated.
|
| 666 |
The browser can start playing as soon as data arrives.
|
|
|
|
| 667 |
|
| 668 |
Args:
|
| 669 |
job_id: The job ID to stream audio for.
|
|
@@ -675,7 +729,9 @@ async def stream_audio(job_id: str) -> StreamingResponse:
|
|
| 675 |
if job is None:
|
| 676 |
raise HTTPException(status_code=404, detail="Job not found")
|
| 677 |
|
| 678 |
-
def
|
|
|
|
|
|
|
| 679 |
# Send WAV header first
|
| 680 |
yield _create_wav_header(sample_rate=24000)
|
| 681 |
|
|
@@ -689,8 +745,6 @@ async def stream_audio(job_id: str) -> StreamingResponse:
|
|
| 689 |
break
|
| 690 |
# Skip WAV headers from individual chunks, only send raw PCM
|
| 691 |
if audio_data[:4] == b"RIFF":
|
| 692 |
-
# This is a WAV file, extract just the PCM data
|
| 693 |
-
# WAV header is 44 bytes for standard PCM
|
| 694 |
yield audio_data[44:]
|
| 695 |
else:
|
| 696 |
yield audio_data
|
|
@@ -698,11 +752,21 @@ async def stream_audio(job_id: str) -> StreamingResponse:
|
|
| 698 |
# Timeout waiting for data
|
| 699 |
break
|
| 700 |
|
| 701 |
-
|
| 702 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 703 |
|
| 704 |
return StreamingResponse(
|
| 705 |
-
|
| 706 |
media_type="audio/wav",
|
| 707 |
headers={
|
| 708 |
"Cache-Control": "no-cache",
|
|
@@ -711,9 +775,76 @@ async def stream_audio(job_id: str) -> StreamingResponse:
|
|
| 711 |
)
|
| 712 |
|
| 713 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 714 |
async def read_pdf_stream(
|
| 715 |
file: UploadFile = File(...),
|
| 716 |
language: str = Form("english"),
|
|
|
|
| 717 |
) -> StreamingResponse:
|
| 718 |
"""Read a PDF with streaming progress updates.
|
| 719 |
|
|
@@ -722,6 +853,7 @@ async def read_pdf_stream(
|
|
| 722 |
Args:
|
| 723 |
file: Uploaded PDF file.
|
| 724 |
language: Language for TTS (english, chinese, japanese, korean).
|
|
|
|
| 725 |
|
| 726 |
Returns:
|
| 727 |
Streaming response with progress events including job_id.
|
|
@@ -767,6 +899,7 @@ async def read_pdf_stream(
|
|
| 767 |
text,
|
| 768 |
_tts_engine,
|
| 769 |
language,
|
|
|
|
| 770 |
doc_name=file.filename or "document.pdf",
|
| 771 |
doc_type="pdf",
|
| 772 |
page_count=page_count,
|
|
@@ -796,6 +929,7 @@ async def read_text_stream(request: TextRequest) -> StreamingResponse:
|
|
| 796 |
|
| 797 |
text = request.text.strip()
|
| 798 |
language = request.language if request.language in LANGUAGE_VOICES else "english"
|
|
|
|
| 799 |
|
| 800 |
if not text:
|
| 801 |
raise HTTPException(status_code=400, detail="Text is required")
|
|
@@ -809,6 +943,14 @@ async def read_text_stream(request: TextRequest) -> StreamingResponse:
|
|
| 809 |
if not text.strip():
|
| 810 |
raise HTTPException(status_code=400, detail="No readable text provided")
|
| 811 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 812 |
# Create a job for this request
|
| 813 |
job = _job_manager.create_job()
|
| 814 |
|
|
@@ -818,7 +960,8 @@ async def read_text_stream(request: TextRequest) -> StreamingResponse:
|
|
| 818 |
text,
|
| 819 |
_tts_engine,
|
| 820 |
language,
|
| 821 |
-
|
|
|
|
| 822 |
doc_type="text",
|
| 823 |
),
|
| 824 |
media_type="text/event-stream",
|
|
@@ -846,6 +989,7 @@ async def read_url_stream(request: UrlRequest) -> StreamingResponse:
|
|
| 846 |
|
| 847 |
url = request.url.strip()
|
| 848 |
language = request.language if request.language in LANGUAGE_VOICES else "english"
|
|
|
|
| 849 |
|
| 850 |
if not url:
|
| 851 |
raise HTTPException(status_code=400, detail="URL is required")
|
|
@@ -922,6 +1066,7 @@ async def read_url_stream(request: UrlRequest) -> StreamingResponse:
|
|
| 922 |
text,
|
| 923 |
_tts_engine,
|
| 924 |
language,
|
|
|
|
| 925 |
doc_name=doc_name,
|
| 926 |
doc_type="pdf" if is_pdf else "url",
|
| 927 |
page_count=page_count,
|
|
|
|
| 11 |
import uuid
|
| 12 |
from pathlib import Path
|
| 13 |
from typing import TYPE_CHECKING
|
| 14 |
+
from urllib.parse import quote, urlparse
|
| 15 |
|
| 16 |
import httpx
|
| 17 |
import trafilatura
|
| 18 |
from fastapi import FastAPI, File, Form, HTTPException, Request, UploadFile
|
| 19 |
+
from fastapi.responses import HTMLResponse, Response, StreamingResponse
|
| 20 |
from fastapi.staticfiles import StaticFiles
|
| 21 |
from pydantic import BaseModel
|
| 22 |
|
|
|
|
| 24 |
from talking_snake.tts import (
|
| 25 |
DEFAULT_CHUNK_SIZE,
|
| 26 |
LANGUAGE_VOICES,
|
| 27 |
+
TTS_STYLES,
|
| 28 |
MockTTSEngine,
|
| 29 |
TTSEngineProtocol,
|
| 30 |
)
|
|
|
|
| 53 |
def __init__(self, job_id: str):
|
| 54 |
self.job_id = job_id
|
| 55 |
self.audio_queue: queue.Queue[bytes | None] = queue.Queue()
|
| 56 |
+
self.audio_cache: list[bytes] = [] # Cache PCM chunks for replay/download
|
| 57 |
self.started = time.time()
|
| 58 |
self.completed = False
|
| 59 |
+
self.stream_started = False # Track if live stream has started
|
| 60 |
self.error: str | None = None
|
| 61 |
self.sample_rate = 24000 # Default, will be set by TTS engine
|
| 62 |
self.header_sent = False
|
| 63 |
+
self._total_pcm_bytes = 0 # Track total audio bytes for duration calc
|
| 64 |
+
|
| 65 |
+
@property
|
| 66 |
+
def audio_duration(self) -> float:
|
| 67 |
+
"""Calculate audio duration in seconds from cached PCM data."""
|
| 68 |
+
# 16-bit mono audio: duration = bytes / (sample_rate * 2)
|
| 69 |
+
return self._total_pcm_bytes / (self.sample_rate * 2)
|
| 70 |
|
| 71 |
def put_audio(self, audio_bytes: bytes) -> None:
|
| 72 |
+
"""Add audio data to the queue and cache."""
|
| 73 |
self.audio_queue.put(audio_bytes)
|
| 74 |
+
# Cache the PCM data (strip WAV header if present)
|
| 75 |
+
if audio_bytes[:4] == b"RIFF":
|
| 76 |
+
pcm_data = audio_bytes[44:]
|
| 77 |
+
else:
|
| 78 |
+
pcm_data = audio_bytes
|
| 79 |
+
self.audio_cache.append(pcm_data)
|
| 80 |
+
self._total_pcm_bytes += len(pcm_data)
|
| 81 |
|
| 82 |
def finish(self) -> None:
|
| 83 |
"""Signal that audio generation is complete."""
|
|
|
|
| 134 |
|
| 135 |
url: str
|
| 136 |
language: str = "english"
|
| 137 |
+
style: str = "technical"
|
| 138 |
|
| 139 |
|
| 140 |
class TextRequest(BaseModel):
|
|
|
|
| 142 |
|
| 143 |
text: str
|
| 144 |
language: str = "english"
|
| 145 |
+
style: str = "technical"
|
| 146 |
|
| 147 |
|
| 148 |
class EstimateResponse(BaseModel):
|
|
|
|
| 189 |
app.add_api_route("/api/read-url-stream", read_url_stream, methods=["POST"])
|
| 190 |
app.add_api_route("/api/read-text-stream", read_text_stream, methods=["POST"])
|
| 191 |
app.add_api_route("/api/audio/{job_id}", stream_audio, methods=["GET"])
|
| 192 |
+
app.add_api_route("/api/download/{job_id}", download_audio, methods=["GET"])
|
| 193 |
app.add_api_route("/api/languages", get_languages, methods=["GET"])
|
| 194 |
app.add_api_route("/api/device-info-stream", stream_device_info, methods=["GET"])
|
| 195 |
app.add_api_route("/api/health", health_check, methods=["GET"])
|
|
|
|
| 409 |
Returns:
|
| 410 |
Device type, memory usage, and model info.
|
| 411 |
"""
|
| 412 |
+
import shutil
|
| 413 |
+
|
| 414 |
+
import psutil
|
| 415 |
import torch
|
| 416 |
|
| 417 |
info = {
|
|
|
|
| 421 |
"memory_total_gb": 0,
|
| 422 |
"memory_percent": 0,
|
| 423 |
"batch_size": 1,
|
| 424 |
+
"ram_used_gb": 0,
|
| 425 |
+
"ram_total_gb": 0,
|
| 426 |
+
"disk_free_gb": 0,
|
| 427 |
}
|
| 428 |
|
| 429 |
+
# Get RAM info
|
| 430 |
+
ram = psutil.virtual_memory()
|
| 431 |
+
info["ram_used_gb"] = round(ram.used / 1024**3, 1)
|
| 432 |
+
info["ram_total_gb"] = round(ram.total / 1024**3, 1)
|
| 433 |
+
|
| 434 |
+
# Get disk free space
|
| 435 |
+
disk = shutil.disk_usage("/")
|
| 436 |
+
info["disk_free_gb"] = round(disk.free / 1024**3, 1)
|
| 437 |
+
|
| 438 |
if torch.cuda.is_available():
|
| 439 |
props = torch.cuda.get_device_properties(0)
|
| 440 |
# Use reserved memory for more accurate GPU usage (includes PyTorch cache)
|
|
|
|
| 456 |
if _tts_engine is not None:
|
| 457 |
info["batch_size"] = getattr(_tts_engine, "batch_size", 1)
|
| 458 |
info["chunk_size"] = getattr(_tts_engine, "chunk_size", 800)
|
| 459 |
+
# Include model state
|
| 460 |
+
info["model_state"] = getattr(_tts_engine, "model_state", "unknown")
|
| 461 |
+
# Include timing stats
|
| 462 |
+
seconds_per_char = getattr(_tts_engine, "seconds_per_char", None)
|
| 463 |
+
if seconds_per_char is not None:
|
| 464 |
+
info["seconds_per_char"] = round(seconds_per_char, 4)
|
| 465 |
+
total_chars = getattr(_tts_engine, "total_chars_processed", 0)
|
| 466 |
+
if total_chars > 0:
|
| 467 |
+
info["total_chars_processed"] = total_chars
|
| 468 |
|
| 469 |
return info
|
| 470 |
|
|
|
|
| 505 |
)
|
| 506 |
|
| 507 |
|
| 508 |
+
def _estimate_time(text: str, seconds_per_char: float | None = None) -> tuple[int, float]:
|
|
|
|
|
|
|
| 509 |
"""Estimate processing time for text.
|
| 510 |
|
| 511 |
Args:
|
|
|
|
| 515 |
Returns:
|
| 516 |
Tuple of (chunk_count, estimated_seconds).
|
| 517 |
"""
|
| 518 |
+
if seconds_per_char is None:
|
| 519 |
+
seconds_per_char = INITIAL_SECONDS_PER_CHAR
|
| 520 |
# Count chunks (500 chars per chunk approximately)
|
| 521 |
chunk_count = max(1, len(text) // 500 + (1 if len(text) % 500 else 0))
|
| 522 |
estimated_seconds = len(text) * seconds_per_char
|
|
|
|
| 565 |
text: str,
|
| 566 |
tts_engine: TTSEngineProtocol,
|
| 567 |
language: str = "english",
|
| 568 |
+
style: str = "technical",
|
| 569 |
doc_name: str = "document",
|
| 570 |
doc_type: str = "text",
|
| 571 |
page_count: int | None = None,
|
|
|
|
| 581 |
text: Text to synthesize.
|
| 582 |
tts_engine: TTS engine to use.
|
| 583 |
language: Language for TTS (english, chinese, japanese, korean).
|
| 584 |
+
style: TTS style (technical, narrative, news, casual, academic).
|
| 585 |
doc_name: Name of the document being processed.
|
| 586 |
doc_type: Type of document (pdf, url, text).
|
| 587 |
page_count: Number of pages (for PDFs).
|
|
|
|
|
|
|
| 588 |
|
| 589 |
Yields:
|
| 590 |
SSE events for progress.
|
|
|
|
| 595 |
if hasattr(tts_engine, "set_language"):
|
| 596 |
tts_engine.set_language(language)
|
| 597 |
|
| 598 |
+
# Apply style if the engine supports it
|
| 599 |
+
if hasattr(tts_engine, "set_style"):
|
| 600 |
+
tts_engine.set_style(style)
|
| 601 |
+
|
| 602 |
# Get chunk size and batch size from engine
|
| 603 |
chunk_size = getattr(tts_engine, "chunk_size", DEFAULT_CHUNK_SIZE)
|
| 604 |
batch_size = getattr(tts_engine, "batch_size", 1)
|
|
|
|
| 626 |
total_chunks = len(chunks) if chunks else 1
|
| 627 |
total_chars = sum(len(c) for c in chunks)
|
| 628 |
|
| 629 |
+
# Use calibrated estimate if available, otherwise initial estimate
|
| 630 |
+
seconds_per_char = getattr(tts_engine, "seconds_per_char", None) or INITIAL_SECONDS_PER_CHAR
|
| 631 |
+
|
| 632 |
+
# Account for batch efficiency: processing N chunks in parallel is ~N times faster
|
| 633 |
+
# The efficiency isn't perfectly linear, so use a conservative factor of sqrt(batch_size)
|
| 634 |
+
batch_efficiency = batch_size**0.5 if batch_size > 1 else 1.0
|
| 635 |
+
estimated_total = (total_chars * seconds_per_char) / batch_efficiency
|
| 636 |
|
| 637 |
# Send initial progress event with job_id and batch info
|
| 638 |
progress_data = {
|
|
|
|
| 641 |
"current": 0,
|
| 642 |
"total": total_chunks,
|
| 643 |
"percent": 0,
|
| 644 |
+
"estimated_remaining": round(estimated_total, 1),
|
| 645 |
"batch_size": batch_size,
|
| 646 |
"doc_name": doc_name,
|
| 647 |
"doc_type": doc_type,
|
|
|
|
| 700 |
# Signal audio generation complete
|
| 701 |
job.finish()
|
| 702 |
|
| 703 |
+
# Send completion event with actual audio duration
|
| 704 |
total_time = time.time() - start_time
|
| 705 |
complete_data = {
|
| 706 |
"type": "complete",
|
| 707 |
"total_time": round(total_time, 1),
|
| 708 |
"chunks_processed": chunks_processed,
|
| 709 |
"batch_size": batch_size,
|
| 710 |
+
"audio_duration": round(job.audio_duration, 2),
|
| 711 |
}
|
| 712 |
yield f"event: complete\ndata: {json.dumps(complete_data)}\n\n".encode()
|
| 713 |
|
|
|
|
| 717 |
|
| 718 |
This endpoint streams the raw WAV audio as it's being generated.
|
| 719 |
The browser can start playing as soon as data arrives.
|
| 720 |
+
First request streams live; subsequent requests return cached audio.
|
| 721 |
|
| 722 |
Args:
|
| 723 |
job_id: The job ID to stream audio for.
|
|
|
|
| 729 |
if job is None:
|
| 730 |
raise HTTPException(status_code=404, detail="Job not found")
|
| 731 |
|
| 732 |
+
def generate_audio_live() -> Iterator[bytes]:
|
| 733 |
+
"""Stream audio live from queue (first request)."""
|
| 734 |
+
job.stream_started = True
|
| 735 |
# Send WAV header first
|
| 736 |
yield _create_wav_header(sample_rate=24000)
|
| 737 |
|
|
|
|
| 745 |
break
|
| 746 |
# Skip WAV headers from individual chunks, only send raw PCM
|
| 747 |
if audio_data[:4] == b"RIFF":
|
|
|
|
|
|
|
| 748 |
yield audio_data[44:]
|
| 749 |
else:
|
| 750 |
yield audio_data
|
|
|
|
| 752 |
# Timeout waiting for data
|
| 753 |
break
|
| 754 |
|
| 755 |
+
def generate_audio_cached() -> Iterator[bytes]:
|
| 756 |
+
"""Stream audio from cache (subsequent requests)."""
|
| 757 |
+
# Send WAV header first
|
| 758 |
+
yield _create_wav_header(sample_rate=24000)
|
| 759 |
+
# Send all cached chunks
|
| 760 |
+
yield from job.audio_cache
|
| 761 |
+
|
| 762 |
+
# Use live stream for first request, cached for subsequent
|
| 763 |
+
if not job.stream_started:
|
| 764 |
+
generator = generate_audio_live()
|
| 765 |
+
else:
|
| 766 |
+
generator = generate_audio_cached()
|
| 767 |
|
| 768 |
return StreamingResponse(
|
| 769 |
+
generator,
|
| 770 |
media_type="audio/wav",
|
| 771 |
headers={
|
| 772 |
"Cache-Control": "no-cache",
|
|
|
|
| 775 |
)
|
| 776 |
|
| 777 |
|
| 778 |
+
async def download_audio(job_id: str, filename: str = "audio.wav") -> Response:
|
| 779 |
+
"""Download complete audio file for a job.
|
| 780 |
+
|
| 781 |
+
This endpoint returns the full WAV file with correct headers for download.
|
| 782 |
+
Only works after generation is complete.
|
| 783 |
+
|
| 784 |
+
Args:
|
| 785 |
+
job_id: The job ID to download audio for.
|
| 786 |
+
filename: Suggested filename for download.
|
| 787 |
+
|
| 788 |
+
Returns:
|
| 789 |
+
Complete WAV audio file response.
|
| 790 |
+
"""
|
| 791 |
+
job = _job_manager.get_job(job_id)
|
| 792 |
+
if job is None:
|
| 793 |
+
raise HTTPException(status_code=404, detail="Job not found")
|
| 794 |
+
|
| 795 |
+
if not job.audio_cache:
|
| 796 |
+
raise HTTPException(status_code=404, detail="No audio available")
|
| 797 |
+
|
| 798 |
+
# Combine all cached audio data
|
| 799 |
+
audio_data = b"".join(job.audio_cache)
|
| 800 |
+
|
| 801 |
+
# Create proper WAV header with actual size
|
| 802 |
+
sample_rate = 24000
|
| 803 |
+
bits_per_sample = 16
|
| 804 |
+
channels = 1
|
| 805 |
+
byte_rate = sample_rate * channels * bits_per_sample // 8
|
| 806 |
+
block_align = channels * bits_per_sample // 8
|
| 807 |
+
data_size = len(audio_data)
|
| 808 |
+
file_size = data_size + 36 # Header is 44 bytes, minus 8 for RIFF header
|
| 809 |
+
|
| 810 |
+
header = io.BytesIO()
|
| 811 |
+
header.write(b"RIFF")
|
| 812 |
+
header.write(struct.pack("<I", file_size))
|
| 813 |
+
header.write(b"WAVE")
|
| 814 |
+
header.write(b"fmt ")
|
| 815 |
+
header.write(struct.pack("<I", 16)) # fmt chunk size
|
| 816 |
+
header.write(struct.pack("<H", 1)) # PCM format
|
| 817 |
+
header.write(struct.pack("<H", channels))
|
| 818 |
+
header.write(struct.pack("<I", sample_rate))
|
| 819 |
+
header.write(struct.pack("<I", byte_rate))
|
| 820 |
+
header.write(struct.pack("<H", block_align))
|
| 821 |
+
header.write(struct.pack("<H", bits_per_sample))
|
| 822 |
+
header.write(b"data")
|
| 823 |
+
header.write(struct.pack("<I", data_size))
|
| 824 |
+
|
| 825 |
+
wav_data = header.getvalue() + audio_data
|
| 826 |
+
|
| 827 |
+
# RFC 5987 encoding for non-ASCII filenames
|
| 828 |
+
# Use ASCII-safe fallback + UTF-8 encoded filename*
|
| 829 |
+
safe_filename = filename.encode("ascii", "replace").decode("ascii")
|
| 830 |
+
encoded_filename = quote(filename, safe="")
|
| 831 |
+
|
| 832 |
+
return Response(
|
| 833 |
+
content=wav_data,
|
| 834 |
+
media_type="audio/wav",
|
| 835 |
+
headers={
|
| 836 |
+
"Content-Disposition": (
|
| 837 |
+
f"attachment; filename=\"{safe_filename}\"; filename*=UTF-8''{encoded_filename}"
|
| 838 |
+
),
|
| 839 |
+
"Content-Length": str(len(wav_data)),
|
| 840 |
+
},
|
| 841 |
+
)
|
| 842 |
+
|
| 843 |
+
|
| 844 |
async def read_pdf_stream(
|
| 845 |
file: UploadFile = File(...),
|
| 846 |
language: str = Form("english"),
|
| 847 |
+
style: str = Form("technical"),
|
| 848 |
) -> StreamingResponse:
|
| 849 |
"""Read a PDF with streaming progress updates.
|
| 850 |
|
|
|
|
| 853 |
Args:
|
| 854 |
file: Uploaded PDF file.
|
| 855 |
language: Language for TTS (english, chinese, japanese, korean).
|
| 856 |
+
style: TTS style (technical, narrative, news, casual, academic).
|
| 857 |
|
| 858 |
Returns:
|
| 859 |
Streaming response with progress events including job_id.
|
|
|
|
| 899 |
text,
|
| 900 |
_tts_engine,
|
| 901 |
language,
|
| 902 |
+
style,
|
| 903 |
doc_name=file.filename or "document.pdf",
|
| 904 |
doc_type="pdf",
|
| 905 |
page_count=page_count,
|
|
|
|
| 929 |
|
| 930 |
text = request.text.strip()
|
| 931 |
language = request.language if request.language in LANGUAGE_VOICES else "english"
|
| 932 |
+
style = request.style if request.style in TTS_STYLES else "technical"
|
| 933 |
|
| 934 |
if not text:
|
| 935 |
raise HTTPException(status_code=400, detail="Text is required")
|
|
|
|
| 943 |
if not text.strip():
|
| 944 |
raise HTTPException(status_code=400, detail="No readable text provided")
|
| 945 |
|
| 946 |
+
# Generate doc name from first few words
|
| 947 |
+
words = text.split()[:5]
|
| 948 |
+
doc_name = " ".join(words)
|
| 949 |
+
if len(doc_name) > 30:
|
| 950 |
+
doc_name = doc_name[:30] + "..."
|
| 951 |
+
elif len(words) == 5:
|
| 952 |
+
doc_name = doc_name + "..."
|
| 953 |
+
|
| 954 |
# Create a job for this request
|
| 955 |
job = _job_manager.create_job()
|
| 956 |
|
|
|
|
| 960 |
text,
|
| 961 |
_tts_engine,
|
| 962 |
language,
|
| 963 |
+
style,
|
| 964 |
+
doc_name=doc_name,
|
| 965 |
doc_type="text",
|
| 966 |
),
|
| 967 |
media_type="text/event-stream",
|
|
|
|
| 989 |
|
| 990 |
url = request.url.strip()
|
| 991 |
language = request.language if request.language in LANGUAGE_VOICES else "english"
|
| 992 |
+
style = request.style if request.style in TTS_STYLES else "technical"
|
| 993 |
|
| 994 |
if not url:
|
| 995 |
raise HTTPException(status_code=400, detail="URL is required")
|
|
|
|
| 1066 |
text,
|
| 1067 |
_tts_engine,
|
| 1068 |
language,
|
| 1069 |
+
style,
|
| 1070 |
doc_name=doc_name,
|
| 1071 |
doc_type="pdf" if is_pdf else "url",
|
| 1072 |
page_count=page_count,
|
src/talking_snake/extract.py
CHANGED
|
@@ -8,7 +8,14 @@ from collections import Counter
|
|
| 8 |
from dataclasses import dataclass
|
| 9 |
|
| 10 |
from pdfminer.high_level import extract_pages
|
| 11 |
-
from pdfminer.layout import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
|
| 14 |
@dataclass
|
|
@@ -19,6 +26,260 @@ class TextBlock:
|
|
| 19 |
y_ratio: float # 0.0 = bottom, 1.0 = top
|
| 20 |
font_size: float
|
| 21 |
page_num: int
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
|
| 24 |
def extract_text_blocks(pdf_bytes: bytes) -> list[TextBlock]:
|
|
@@ -50,21 +311,52 @@ def extract_text_blocks(pdf_bytes: bytes) -> list[TextBlock]:
|
|
| 50 |
if not isinstance(element, LTTextBoxHorizontal):
|
| 51 |
continue
|
| 52 |
|
| 53 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
if not text:
|
| 55 |
continue
|
| 56 |
|
| 57 |
# Calculate Y position as ratio (0=bottom, 1=top)
|
| 58 |
y_ratio = element.y0 / page_height if page_height > 0 else 0.5
|
| 59 |
|
| 60 |
-
# Extract average font size from characters
|
| 61 |
-
font_sizes: list[float] = []
|
| 62 |
-
for line in element:
|
| 63 |
-
if isinstance(line, LTTextLineHorizontal):
|
| 64 |
-
for char in line:
|
| 65 |
-
if isinstance(char, LTChar):
|
| 66 |
-
font_sizes.append(char.size)
|
| 67 |
-
|
| 68 |
avg_font_size = sum(font_sizes) / len(font_sizes) if font_sizes else 10.0
|
| 69 |
|
| 70 |
blocks.append(
|
|
@@ -107,6 +399,9 @@ def extract_text(pdf_bytes: bytes) -> str:
|
|
| 107 |
if not blocks:
|
| 108 |
return ""
|
| 109 |
|
|
|
|
|
|
|
|
|
|
| 110 |
cleaned_blocks = clean_text_blocks(blocks)
|
| 111 |
text = "\n\n".join(block.text for block in cleaned_blocks)
|
| 112 |
|
|
@@ -161,6 +456,10 @@ def clean_text_blocks(blocks: list[TextBlock]) -> list[TextBlock]:
|
|
| 161 |
if is_page_number(block.text):
|
| 162 |
continue
|
| 163 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 164 |
# Skip very short lines with small font (likely captions/footnotes)
|
| 165 |
if len(block.text) < 20 and block.font_size < median_font_size * 0.8:
|
| 166 |
continue
|
|
@@ -280,6 +579,96 @@ def normalize_for_tts(text: str) -> str:
|
|
| 280 |
Returns:
|
| 281 |
Normalized text optimized for TTS.
|
| 282 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 283 |
# === CODE AND TECHNICAL CONTENT ===
|
| 284 |
# Handle common programming patterns that read poorly
|
| 285 |
|
|
@@ -357,10 +746,37 @@ def normalize_for_tts(text: str) -> str:
|
|
| 357 |
text = text.replace("'''", "")
|
| 358 |
|
| 359 |
# === UNICODE NORMALIZATION ===
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 360 |
# Convert smart quotes to simple quotes
|
| 361 |
-
text = text.replace(""
|
| 362 |
-
text = text.replace("
|
| 363 |
-
text = text.replace("
|
| 364 |
|
| 365 |
# Normalize dashes to standard hyphen or remove
|
| 366 |
text = text.replace("–", "-") # en-dash
|
|
@@ -471,19 +887,32 @@ def normalize_for_tts(text: str) -> str:
|
|
| 471 |
# Remove content in angle brackets (often HTML/XML artifacts)
|
| 472 |
text = re.sub(r"<[^>]+>", "", text)
|
| 473 |
|
| 474 |
-
# Normalize multiple spaces
|
| 475 |
-
text = re.sub(r"[ \t]+", " ", text)
|
| 476 |
-
|
| 477 |
# Remove spaces before punctuation
|
| 478 |
text = re.sub(r"\s+([.,;:!?])", r"\1", text)
|
| 479 |
|
| 480 |
# Ensure space after punctuation (but not before another punctuation)
|
| 481 |
text = re.sub(r"([.,;:!?])([^\s.,;:!?'\"])", r"\1 \2", text)
|
| 482 |
|
| 483 |
-
#
|
| 484 |
-
|
| 485 |
|
| 486 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 487 |
text = re.sub(r"\n{3,}", "\n\n", text)
|
| 488 |
|
|
|
|
|
|
|
|
|
|
| 489 |
return text
|
|
|
|
| 8 |
from dataclasses import dataclass
|
| 9 |
|
| 10 |
from pdfminer.high_level import extract_pages
|
| 11 |
+
from pdfminer.layout import (
|
| 12 |
+
LAParams,
|
| 13 |
+
LTAnno,
|
| 14 |
+
LTChar,
|
| 15 |
+
LTPage,
|
| 16 |
+
LTTextBoxHorizontal,
|
| 17 |
+
LTTextLineHorizontal,
|
| 18 |
+
)
|
| 19 |
|
| 20 |
|
| 21 |
@dataclass
|
|
|
|
| 26 |
y_ratio: float # 0.0 = bottom, 1.0 = top
|
| 27 |
font_size: float
|
| 28 |
page_num: int
|
| 29 |
+
x0: float = 0.0 # Left edge position for table detection
|
| 30 |
+
x1: float = 0.0 # Right edge position for table detection
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def _is_caption(text: str) -> bool:
|
| 34 |
+
"""Check if text is a figure/table caption.
|
| 35 |
+
|
| 36 |
+
Captions typically start with:
|
| 37 |
+
- "Figure 1:", "Fig. 2:", "Figure 1."
|
| 38 |
+
- "Table 1:", "Table 2."
|
| 39 |
+
- "Exhibit A:", "Chart 1:"
|
| 40 |
+
- "Source:", "Note:", "Notes:"
|
| 41 |
+
|
| 42 |
+
Args:
|
| 43 |
+
text: Text to check.
|
| 44 |
+
|
| 45 |
+
Returns:
|
| 46 |
+
True if text appears to be a caption.
|
| 47 |
+
"""
|
| 48 |
+
text = text.strip()
|
| 49 |
+
if not text:
|
| 50 |
+
return False
|
| 51 |
+
|
| 52 |
+
# Common caption patterns (case-insensitive start)
|
| 53 |
+
caption_patterns = [
|
| 54 |
+
r"^fig(?:ure)?\.?\s*\d",
|
| 55 |
+
r"^table\.?\s*\d",
|
| 56 |
+
r"^exhibit\.?\s*[a-z0-9]",
|
| 57 |
+
r"^chart\.?\s*\d",
|
| 58 |
+
r"^graph\.?\s*\d",
|
| 59 |
+
r"^diagram\.?\s*\d",
|
| 60 |
+
r"^plate\.?\s*\d",
|
| 61 |
+
r"^scheme\.?\s*\d",
|
| 62 |
+
r"^box\.?\s*\d",
|
| 63 |
+
r"^panel\.?\s*[a-z0-9]",
|
| 64 |
+
r"^appendix\.?\s*[a-z0-9]",
|
| 65 |
+
r"^source\s*:",
|
| 66 |
+
r"^sources\s*:",
|
| 67 |
+
r"^note\s*:",
|
| 68 |
+
r"^notes\s*:",
|
| 69 |
+
r"^data\s*:",
|
| 70 |
+
r"^\*\s*p\s*[<>=]", # Statistical notes like "* p < 0.05"
|
| 71 |
+
r"^legend\s*:",
|
| 72 |
+
]
|
| 73 |
+
|
| 74 |
+
text_lower = text.lower()
|
| 75 |
+
for pattern in caption_patterns:
|
| 76 |
+
if re.match(pattern, text_lower):
|
| 77 |
+
return True
|
| 78 |
+
|
| 79 |
+
return False
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def _is_table_like_text(text: str) -> bool:
|
| 83 |
+
"""Check if text looks like table content.
|
| 84 |
+
|
| 85 |
+
Tables often have:
|
| 86 |
+
- Very short text fragments
|
| 87 |
+
- Mostly numbers or single words
|
| 88 |
+
- Lots of whitespace-separated values
|
| 89 |
+
- Column headers or row labels
|
| 90 |
+
- Short phrases without sentence structure
|
| 91 |
+
|
| 92 |
+
Args:
|
| 93 |
+
text: Text to check.
|
| 94 |
+
|
| 95 |
+
Returns:
|
| 96 |
+
True if the text appears to be table content.
|
| 97 |
+
"""
|
| 98 |
+
text = text.strip()
|
| 99 |
+
|
| 100 |
+
# Very short fragments are likely table cells
|
| 101 |
+
if len(text) < 5:
|
| 102 |
+
return True
|
| 103 |
+
|
| 104 |
+
# Count numbers vs letters
|
| 105 |
+
digits = sum(1 for c in text if c.isdigit())
|
| 106 |
+
letters = sum(1 for c in text if c.isalpha())
|
| 107 |
+
|
| 108 |
+
# Mostly numbers with few letters (like "123.45" or "2024")
|
| 109 |
+
if digits > 0 and letters < 3 and digits >= letters:
|
| 110 |
+
return True
|
| 111 |
+
|
| 112 |
+
# Check for patterns common in tables
|
| 113 |
+
# Multiple tab-separated or heavily spaced values
|
| 114 |
+
if "\t" in text or " " in text:
|
| 115 |
+
parts = re.split(r"\s{2,}|\t", text)
|
| 116 |
+
if len(parts) >= 3:
|
| 117 |
+
# Multiple short parts suggests table row
|
| 118 |
+
short_parts = sum(1 for p in parts if len(p.strip()) < 15)
|
| 119 |
+
if short_parts >= len(parts) * 0.6:
|
| 120 |
+
return True
|
| 121 |
+
|
| 122 |
+
# Single words that look like column headers
|
| 123 |
+
words = text.split()
|
| 124 |
+
if len(words) == 1 and len(text) < 20:
|
| 125 |
+
# Common table headers/labels
|
| 126 |
+
table_keywords = {
|
| 127 |
+
"total",
|
| 128 |
+
"sum",
|
| 129 |
+
"avg",
|
| 130 |
+
"average",
|
| 131 |
+
"mean",
|
| 132 |
+
"count",
|
| 133 |
+
"min",
|
| 134 |
+
"max",
|
| 135 |
+
"date",
|
| 136 |
+
"time",
|
| 137 |
+
"year",
|
| 138 |
+
"month",
|
| 139 |
+
"day",
|
| 140 |
+
"name",
|
| 141 |
+
"id",
|
| 142 |
+
"no",
|
| 143 |
+
"no.",
|
| 144 |
+
"value",
|
| 145 |
+
"amount",
|
| 146 |
+
"price",
|
| 147 |
+
"cost",
|
| 148 |
+
"qty",
|
| 149 |
+
"quantity",
|
| 150 |
+
"unit",
|
| 151 |
+
"row",
|
| 152 |
+
"column",
|
| 153 |
+
"col",
|
| 154 |
+
"item",
|
| 155 |
+
"description",
|
| 156 |
+
"desc",
|
| 157 |
+
"note",
|
| 158 |
+
"status",
|
| 159 |
+
"type",
|
| 160 |
+
"category",
|
| 161 |
+
"code",
|
| 162 |
+
"ref",
|
| 163 |
+
"reference",
|
| 164 |
+
}
|
| 165 |
+
if text.lower() in table_keywords:
|
| 166 |
+
return True
|
| 167 |
+
|
| 168 |
+
# Short phrases without sentence structure (likely table cells)
|
| 169 |
+
# Table cells typically:
|
| 170 |
+
# - Are short (< 50 chars)
|
| 171 |
+
# - Don't end with sentence-ending punctuation
|
| 172 |
+
# - Don't start with lowercase (unless very short)
|
| 173 |
+
# - Have few words (< 8)
|
| 174 |
+
if len(text) < 50 and len(words) < 8:
|
| 175 |
+
# Doesn't end like a sentence
|
| 176 |
+
if not text.rstrip().endswith((".", "!", "?", ":")):
|
| 177 |
+
# Common table cell patterns
|
| 178 |
+
text_lower = text.lower()
|
| 179 |
+
|
| 180 |
+
# Technical/status phrases common in tables
|
| 181 |
+
table_phrases = [
|
| 182 |
+
"supported",
|
| 183 |
+
"not supported",
|
| 184 |
+
"yes",
|
| 185 |
+
"no",
|
| 186 |
+
"n/a",
|
| 187 |
+
"none",
|
| 188 |
+
"required",
|
| 189 |
+
"optional",
|
| 190 |
+
"enabled",
|
| 191 |
+
"disabled",
|
| 192 |
+
"active",
|
| 193 |
+
"inactive",
|
| 194 |
+
"read-only",
|
| 195 |
+
"read only",
|
| 196 |
+
"write",
|
| 197 |
+
"read/write",
|
| 198 |
+
"read-write",
|
| 199 |
+
"must be",
|
| 200 |
+
"can be",
|
| 201 |
+
"should be",
|
| 202 |
+
"will be",
|
| 203 |
+
"available",
|
| 204 |
+
"unavailable",
|
| 205 |
+
"pending",
|
| 206 |
+
"completed",
|
| 207 |
+
"failed",
|
| 208 |
+
"true",
|
| 209 |
+
"false",
|
| 210 |
+
"default",
|
| 211 |
+
"custom",
|
| 212 |
+
"manual",
|
| 213 |
+
"automatic",
|
| 214 |
+
"identical",
|
| 215 |
+
"different",
|
| 216 |
+
"same",
|
| 217 |
+
"other",
|
| 218 |
+
]
|
| 219 |
+
for phrase in table_phrases:
|
| 220 |
+
if phrase in text_lower:
|
| 221 |
+
return True
|
| 222 |
+
|
| 223 |
+
# Looks like a label or header (Title Case or ALL CAPS, short)
|
| 224 |
+
if len(words) <= 4 and len(text) < 40:
|
| 225 |
+
# Check if it's Title Case or contains common label patterns
|
| 226 |
+
if text.istitle() or text.isupper():
|
| 227 |
+
return True
|
| 228 |
+
# Two-three word phrases that look like labels
|
| 229 |
+
if len(words) in (2, 3) and all(w[0].isupper() for w in words if w):
|
| 230 |
+
return True
|
| 231 |
+
|
| 232 |
+
return False
|
| 233 |
+
|
| 234 |
+
|
| 235 |
+
def _filter_table_blocks(blocks: list[TextBlock]) -> list[TextBlock]:
|
| 236 |
+
"""Filter out blocks that appear to be part of tables.
|
| 237 |
+
|
| 238 |
+
Detects tables by looking for:
|
| 239 |
+
- Multiple blocks at similar Y positions (table rows)
|
| 240 |
+
- Blocks with table-like content
|
| 241 |
+
|
| 242 |
+
Args:
|
| 243 |
+
blocks: List of text blocks.
|
| 244 |
+
|
| 245 |
+
Returns:
|
| 246 |
+
Filtered list with table content removed.
|
| 247 |
+
"""
|
| 248 |
+
if not blocks:
|
| 249 |
+
return blocks
|
| 250 |
+
|
| 251 |
+
# Group blocks by page and approximate Y position (row detection)
|
| 252 |
+
# Blocks within 1% of page height are considered same row
|
| 253 |
+
filtered = []
|
| 254 |
+
|
| 255 |
+
for page_num in set(b.page_num for b in blocks):
|
| 256 |
+
page_blocks = [b for b in blocks if b.page_num == page_num]
|
| 257 |
+
|
| 258 |
+
# Group by Y position (rounded to detect rows)
|
| 259 |
+
y_groups: dict[float, list[TextBlock]] = {}
|
| 260 |
+
for block in page_blocks:
|
| 261 |
+
y_key = round(block.y_ratio, 2) # Group within ~1% of page
|
| 262 |
+
if y_key not in y_groups:
|
| 263 |
+
y_groups[y_key] = []
|
| 264 |
+
y_groups[y_key].append(block)
|
| 265 |
+
|
| 266 |
+
for y_key, row_blocks in y_groups.items():
|
| 267 |
+
# If many blocks at same Y position, likely a table row
|
| 268 |
+
if len(row_blocks) >= 3:
|
| 269 |
+
# Check if most blocks look like table cells
|
| 270 |
+
table_like = sum(1 for b in row_blocks if _is_table_like_text(b.text))
|
| 271 |
+
if table_like >= len(row_blocks) * 0.5:
|
| 272 |
+
# Skip this entire row - it's a table
|
| 273 |
+
continue
|
| 274 |
+
|
| 275 |
+
# Filter individual blocks that look like table content
|
| 276 |
+
for block in row_blocks:
|
| 277 |
+
if not _is_table_like_text(block.text):
|
| 278 |
+
filtered.append(block)
|
| 279 |
+
|
| 280 |
+
# Sort by page and position (top to bottom)
|
| 281 |
+
filtered.sort(key=lambda b: (b.page_num, -b.y_ratio))
|
| 282 |
+
return filtered
|
| 283 |
|
| 284 |
|
| 285 |
def extract_text_blocks(pdf_bytes: bytes) -> list[TextBlock]:
|
|
|
|
| 311 |
if not isinstance(element, LTTextBoxHorizontal):
|
| 312 |
continue
|
| 313 |
|
| 314 |
+
# Extract characters with their font sizes
|
| 315 |
+
# LTChar has font size, LTAnno is whitespace (use size=-1 to always keep)
|
| 316 |
+
chars_with_sizes: list[tuple[str, float]] = []
|
| 317 |
+
for line in element:
|
| 318 |
+
if isinstance(line, LTTextLineHorizontal):
|
| 319 |
+
for char in line:
|
| 320 |
+
if isinstance(char, LTChar):
|
| 321 |
+
chars_with_sizes.append((char.get_text(), char.size))
|
| 322 |
+
elif isinstance(char, LTAnno):
|
| 323 |
+
# Whitespace/newlines - always keep (use -1 as marker)
|
| 324 |
+
chars_with_sizes.append((char.get_text(), -1))
|
| 325 |
+
|
| 326 |
+
if not chars_with_sizes:
|
| 327 |
+
text = element.get_text().strip()
|
| 328 |
+
if text:
|
| 329 |
+
blocks.append(
|
| 330 |
+
TextBlock(
|
| 331 |
+
text=text,
|
| 332 |
+
y_ratio=element.y0 / page_height if page_height > 0 else 0.5,
|
| 333 |
+
font_size=10.0,
|
| 334 |
+
page_num=page_num,
|
| 335 |
+
)
|
| 336 |
+
)
|
| 337 |
+
continue
|
| 338 |
+
|
| 339 |
+
# Find dominant font size (most common, excluding whitespace markers)
|
| 340 |
+
font_sizes = [size for _, size in chars_with_sizes if size > 0]
|
| 341 |
+
if not font_sizes:
|
| 342 |
+
continue
|
| 343 |
+
size_counts = Counter(round(s, 1) for s in font_sizes)
|
| 344 |
+
dominant_size = max(size_counts, key=lambda x: size_counts[x])
|
| 345 |
+
|
| 346 |
+
# Filter out superscript/subscript characters (< 70% of dominant size)
|
| 347 |
+
# Keep whitespace (size=-1) and normal-sized characters
|
| 348 |
+
min_size = dominant_size * 0.7
|
| 349 |
+
filtered_text = "".join(
|
| 350 |
+
char for char, size in chars_with_sizes if size < 0 or size >= min_size
|
| 351 |
+
)
|
| 352 |
+
|
| 353 |
+
text = filtered_text.strip()
|
| 354 |
if not text:
|
| 355 |
continue
|
| 356 |
|
| 357 |
# Calculate Y position as ratio (0=bottom, 1=top)
|
| 358 |
y_ratio = element.y0 / page_height if page_height > 0 else 0.5
|
| 359 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 360 |
avg_font_size = sum(font_sizes) / len(font_sizes) if font_sizes else 10.0
|
| 361 |
|
| 362 |
blocks.append(
|
|
|
|
| 399 |
if not blocks:
|
| 400 |
return ""
|
| 401 |
|
| 402 |
+
# Filter out table content first
|
| 403 |
+
blocks = _filter_table_blocks(blocks)
|
| 404 |
+
|
| 405 |
cleaned_blocks = clean_text_blocks(blocks)
|
| 406 |
text = "\n\n".join(block.text for block in cleaned_blocks)
|
| 407 |
|
|
|
|
| 456 |
if is_page_number(block.text):
|
| 457 |
continue
|
| 458 |
|
| 459 |
+
# Skip figure/table captions
|
| 460 |
+
if _is_caption(block.text):
|
| 461 |
+
continue
|
| 462 |
+
|
| 463 |
# Skip very short lines with small font (likely captions/footnotes)
|
| 464 |
if len(block.text) < 20 and block.font_size < median_font_size * 0.8:
|
| 465 |
continue
|
|
|
|
| 579 |
Returns:
|
| 580 |
Normalized text optimized for TTS.
|
| 581 |
"""
|
| 582 |
+
# === REMOVE ACADEMIC/PAPER ARTIFACTS ===
|
| 583 |
+
# Remove inline citations like (Smith et al., 2020) or (Smith, 2020; Jones, 2019)
|
| 584 |
+
# Also handles (Chen, 2018; Lee et al., 2020)
|
| 585 |
+
text = re.sub(r"\([^()]*\b\d{4}[a-z]?\b[^()]*\)", "", text)
|
| 586 |
+
|
| 587 |
+
# Remove author-year citations like "Smith (2020)" or "Smith et al. (2020)"
|
| 588 |
+
text = re.sub(
|
| 589 |
+
r"\b[A-Z][a-z]+(?:\s+(?:et\s+al\.?|and|&)\s+[A-Z][a-z]+)?\s*\(\d{4}[a-z]?\)", "", text
|
| 590 |
+
)
|
| 591 |
+
|
| 592 |
+
# Clean up "by [Author]" patterns - remove the author part, keep "by" for grammar
|
| 593 |
+
# "by Smith" -> "" (will be cleaned up), "study by Smith found" -> "study found"
|
| 594 |
+
text = re.sub(
|
| 595 |
+
r"\bby\s+[A-Z][a-z]+(?:\s+(?:et\s+al\.?|and|&)\s+[A-Z][a-z]+)?\s*,?\s*(?=found|showed|demonstrated|reported|observed|noted|suggested|concluded|argued|claimed|stated|proposed|discovered|revealed|indicated|confirmed)",
|
| 596 |
+
"",
|
| 597 |
+
text,
|
| 598 |
+
)
|
| 599 |
+
|
| 600 |
+
# Remove orphaned "et al." and similar
|
| 601 |
+
text = re.sub(r"\s+et\s+al\.?,?\s*", " ", text)
|
| 602 |
+
|
| 603 |
+
# Remove figure/table references like "see Figure 1" or "(see Table 2)"
|
| 604 |
+
text = re.sub(
|
| 605 |
+
r"\(?see\s+(?:Figure|Fig\.?|Table|Exhibit|Chart|Graph|Appendix)\s*\d+[a-z]?\)?",
|
| 606 |
+
"",
|
| 607 |
+
text,
|
| 608 |
+
flags=re.IGNORECASE,
|
| 609 |
+
)
|
| 610 |
+
|
| 611 |
+
# Remove standalone figure/table references like "Figure 1 shows" -> "shows"
|
| 612 |
+
text = re.sub(
|
| 613 |
+
r"(?:Figure|Fig\.?|Table|Exhibit|Chart|Graph)\s*\d+[a-z]?\s*(?:shows?|depicts?|illustrates?|presents?|displays?|summarizes?)",
|
| 614 |
+
"",
|
| 615 |
+
text,
|
| 616 |
+
flags=re.IGNORECASE,
|
| 617 |
+
)
|
| 618 |
+
|
| 619 |
+
# Remove section references like "Section 2.1" or "Chapter 3" (with surrounding context)
|
| 620 |
+
text = re.sub(
|
| 621 |
+
r"(?:in|see|as\s+(?:shown|described|discussed)\s+in|according\s+to)\s+(?:Section|Chapter|Part)\s*\d+(?:\.\d+)*,?\s*",
|
| 622 |
+
"",
|
| 623 |
+
text,
|
| 624 |
+
flags=re.IGNORECASE,
|
| 625 |
+
)
|
| 626 |
+
text = re.sub(r"(?:Section|Chapter|Part)\s*\d+(?:\.\d+)*", "", text, flags=re.IGNORECASE)
|
| 627 |
+
|
| 628 |
+
# Remove equation references like "Equation 1" or "Eq. (2)"
|
| 629 |
+
text = re.sub(r"(?:Equation|Eq\.?)\s*\(?\d+\)?", "", text, flags=re.IGNORECASE)
|
| 630 |
+
|
| 631 |
+
# Remove DOIs
|
| 632 |
+
text = re.sub(r"(?:doi:|DOI:?)\s*10\.\d{4,}/[^\s]+", "", text, flags=re.IGNORECASE)
|
| 633 |
+
|
| 634 |
+
# Remove arXiv references
|
| 635 |
+
text = re.sub(r"arXiv:\d{4}\.\d{4,}(?:v\d+)?", "", text, flags=re.IGNORECASE)
|
| 636 |
+
|
| 637 |
+
# Remove ISSN/ISBN numbers
|
| 638 |
+
text = re.sub(r"(?:ISSN|ISBN)[:\s]*[\d-]+", "", text, flags=re.IGNORECASE)
|
| 639 |
+
|
| 640 |
+
# Remove page ranges like "pp. 123-456" or "p. 42" or "pages 10-20"
|
| 641 |
+
text = re.sub(r"(?:p{1,2}\.?|pages?)\s*\d+(?:\s*[-–—]\s*\d+)?", "", text, flags=re.IGNORECASE)
|
| 642 |
+
|
| 643 |
+
# Remove volume/issue numbers like "Vol. 12, No. 3" (entire phrase)
|
| 644 |
+
text = re.sub(
|
| 645 |
+
r"(?:Vol(?:ume)?\.?\s*\d+,?\s*)?(?:Issue|No\.?)\s*\d+,?\s*", "", text, flags=re.IGNORECASE
|
| 646 |
+
)
|
| 647 |
+
text = re.sub(r"Vol(?:ume)?\.?\s*\d+,?\s*", "", text, flags=re.IGNORECASE)
|
| 648 |
+
|
| 649 |
+
# Remove copyright notices
|
| 650 |
+
text = re.sub(r"©\s*\d{4}[^.]*\.", "", text)
|
| 651 |
+
text = re.sub(r"Copyright\s*©?\s*\d{4}[^.]*\.", "", text, flags=re.IGNORECASE)
|
| 652 |
+
|
| 653 |
+
# Remove "All rights reserved" and similar
|
| 654 |
+
text = re.sub(r"All rights reserved\.?", "", text, flags=re.IGNORECASE)
|
| 655 |
+
|
| 656 |
+
# Remove asterisks used for footnote markers
|
| 657 |
+
text = re.sub(r"\*{1,3}(?=\s|$)", "", text)
|
| 658 |
+
|
| 659 |
+
# === NORMALIZE NEWLINES FIRST ===
|
| 660 |
+
# Convert various newline formats to standard \n
|
| 661 |
+
text = text.replace("\r\n", "\n").replace("\r", "\n")
|
| 662 |
+
|
| 663 |
+
# Replace single newlines (mid-sentence line breaks) with spaces
|
| 664 |
+
# Keep double newlines as paragraph separators
|
| 665 |
+
# First, normalize multiple newlines to exactly two
|
| 666 |
+
text = re.sub(r"\n{3,}", "\n\n", text)
|
| 667 |
+
|
| 668 |
+
# Replace single newlines that aren't paragraph breaks with spaces
|
| 669 |
+
# A single newline not preceded by sentence-ending punctuation is likely a line wrap
|
| 670 |
+
text = re.sub(r"(?<![.!?:\n])\n(?!\n)", " ", text)
|
| 671 |
+
|
| 672 |
# === CODE AND TECHNICAL CONTENT ===
|
| 673 |
# Handle common programming patterns that read poorly
|
| 674 |
|
|
|
|
| 746 |
text = text.replace("'''", "")
|
| 747 |
|
| 748 |
# === UNICODE NORMALIZATION ===
|
| 749 |
+
|
| 750 |
+
# Remove superscript characters (often footnote references)
|
| 751 |
+
# Includes Unicode superscript digits, letters, and modifier letters
|
| 752 |
+
superscripts = (
|
| 753 |
+
"⁰¹²³⁴⁵⁶⁷⁸⁹" # Superscript digits
|
| 754 |
+
"⁺⁻⁼⁽⁾" # Superscript operators
|
| 755 |
+
"ⁿⁱ" # Common superscript letters
|
| 756 |
+
"ᵃᵇᶜᵈᵉᶠᵍʰⁱʲᵏˡᵐⁿᵒᵖʳˢᵗᵘᵛʷˣʸᶻ" # Superscript lowercase
|
| 757 |
+
"ᴬᴮᴰᴱᴳᴴᴵᴶᴷᴸᴹᴺᴼᴾᴿᵀᵁⱽᵂᴬᴭᴮᴯᴰᴱᴲᴳᴴᴵᴶᴷᴸᴹᴺᴻᴼᴽᴾᴿᵀᵁᵂ" # Superscript uppercase
|
| 758 |
+
"ᶦᶧᶨᶩᶪᶫᶬᶭᶮᶯᶰᶱᶲᶳᶴᶵᶶᶷᶸᶹᶺᶻᶼᶽᶾᶿ" # More modifier letters
|
| 759 |
+
"ʰʱʲʳʴʵʶʷʸʹʺʻʼʽˀˁˆˇˈˉˊˋˌˍˎˏːˑ" # Modifier letters
|
| 760 |
+
)
|
| 761 |
+
for char in superscripts:
|
| 762 |
+
text = text.replace(char, "")
|
| 763 |
+
|
| 764 |
+
# Also use regex to catch any remaining superscript-like characters
|
| 765 |
+
# Unicode categories for superscripts and modifiers
|
| 766 |
+
text = re.sub(r"[\u2070-\u209F]", "", text) # Superscripts and Subscripts block
|
| 767 |
+
text = re.sub(r"[\u1D2C-\u1D6A]", "", text) # Phonetic Extensions (modifier letters)
|
| 768 |
+
text = re.sub(r"[\u1D78-\u1D7F]", "", text) # More phonetic extensions
|
| 769 |
+
text = re.sub(r"[\u02B0-\u02FF]", "", text) # Spacing Modifier Letters
|
| 770 |
+
|
| 771 |
+
# Remove subscript characters
|
| 772 |
+
subscripts = "₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎ₐₑₒₓₔₕₖₗₘₙₚₛₜ"
|
| 773 |
+
for char in subscripts:
|
| 774 |
+
text = text.replace(char, "")
|
| 775 |
+
|
| 776 |
# Convert smart quotes to simple quotes
|
| 777 |
+
text = text.replace("\u201c", '"').replace("\u201d", '"')
|
| 778 |
+
text = text.replace("\u2018", "'").replace("\u2019", "'")
|
| 779 |
+
text = text.replace("\u201e", '"').replace("\u201f", '"')
|
| 780 |
|
| 781 |
# Normalize dashes to standard hyphen or remove
|
| 782 |
text = text.replace("–", "-") # en-dash
|
|
|
|
| 887 |
# Remove content in angle brackets (often HTML/XML artifacts)
|
| 888 |
text = re.sub(r"<[^>]+>", "", text)
|
| 889 |
|
|
|
|
|
|
|
|
|
|
| 890 |
# Remove spaces before punctuation
|
| 891 |
text = re.sub(r"\s+([.,;:!?])", r"\1", text)
|
| 892 |
|
| 893 |
# Ensure space after punctuation (but not before another punctuation)
|
| 894 |
text = re.sub(r"([.,;:!?])([^\s.,;:!?'\"])", r"\1 \2", text)
|
| 895 |
|
| 896 |
+
# === FINAL WHITESPACE NORMALIZATION ===
|
| 897 |
+
# This must happen LAST after all substitutions that can create gaps
|
| 898 |
|
| 899 |
+
# Collapse all whitespace (spaces, tabs, multiple spaces) to single space
|
| 900 |
+
# Do this per-line to preserve intentional paragraph breaks
|
| 901 |
+
lines = text.split("\n")
|
| 902 |
+
normalized_lines = []
|
| 903 |
+
for line in lines:
|
| 904 |
+
# Replace any sequence of whitespace with single space
|
| 905 |
+
line = re.sub(r"[ \t]+", " ", line)
|
| 906 |
+
# Strip leading/trailing whitespace from each line
|
| 907 |
+
line = line.strip()
|
| 908 |
+
normalized_lines.append(line)
|
| 909 |
+
|
| 910 |
+
text = "\n".join(normalized_lines)
|
| 911 |
+
|
| 912 |
+
# Remove excessive blank lines (keep max 1 blank line between paragraphs)
|
| 913 |
text = re.sub(r"\n{3,}", "\n\n", text)
|
| 914 |
|
| 915 |
+
# Remove blank lines at start/end
|
| 916 |
+
text = text.strip()
|
| 917 |
+
|
| 918 |
return text
|
src/talking_snake/static/app.js
CHANGED
|
@@ -24,14 +24,15 @@ const deviceInfo = document.getElementById("deviceInfo");
|
|
| 24 |
const docInfo = document.getElementById("docInfo");
|
| 25 |
const languageButtons = document.querySelectorAll("#languageButtons .style-btn");
|
| 26 |
const processingProgressBar = document.getElementById("processingProgressBar");
|
|
|
|
| 27 |
|
| 28 |
// Custom player elements
|
| 29 |
const playerPlayBtn = document.getElementById("playerPlayBtn");
|
| 30 |
const progressBar = document.getElementById("progressBar");
|
| 31 |
const progressSlider = document.getElementById("progressSlider");
|
| 32 |
const timeDisplay = document.getElementById("timeDisplay");
|
| 33 |
-
const volumeBtn = document.getElementById("volumeBtn");
|
| 34 |
const downloadBtn = document.getElementById("downloadBtn");
|
|
|
|
| 35 |
|
| 36 |
// Constants
|
| 37 |
const MAX_FILE_SIZE = 50 * 1024 * 1024; // 50MB
|
|
@@ -39,11 +40,12 @@ const MAX_FILE_SIZE = 50 * 1024 * 1024; // 50MB
|
|
| 39 |
// State
|
| 40 |
let currentAbortController = null;
|
| 41 |
let selectedLanguage = "english";
|
|
|
|
| 42 |
let isPaused = false;
|
| 43 |
let estimatedDuration = 0; // Estimated total duration from server
|
| 44 |
-
let isMuted = false;
|
| 45 |
-
let currentAudioBlob = null; // Store audio blob for download
|
| 46 |
let currentDocName = ""; // Store document name for download filename
|
|
|
|
|
|
|
| 47 |
|
| 48 |
/**
|
| 49 |
* Format time in seconds to MM:SS
|
|
@@ -91,10 +93,32 @@ function updateDocInfo(data) {
|
|
| 91 |
const pageInfo = data.page_count ? `<span class="doc-pages"><i class="fa-solid fa-file"></i> ${data.page_count}p</span>` : "";
|
| 92 |
const charInfo = data.total_chars ? `<span class="doc-chars"><i class="fa-solid fa-font"></i> ${formatNumber(data.total_chars)}</span>` : "";
|
| 93 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
docInfo.innerHTML = `
|
| 95 |
<span class="doc-name" title="${docName}"><i class="fa-solid ${icon}"></i><span class="doc-name-text">${docName}</span></span>
|
| 96 |
${pageInfo}
|
| 97 |
${charInfo}
|
|
|
|
|
|
|
| 98 |
`;
|
| 99 |
}
|
| 100 |
|
|
@@ -102,16 +126,30 @@ function updateDocInfo(data) {
|
|
| 102 |
* Update the custom player progress bar and time display
|
| 103 |
*/
|
| 104 |
function updatePlayerProgress() {
|
| 105 |
-
|
| 106 |
-
//
|
| 107 |
-
let
|
| 108 |
-
if (
|
| 109 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
}
|
| 111 |
|
| 112 |
const progress = duration > 0 ? (currentTime / duration) * 100 : 0;
|
| 113 |
progressBar.style.width = `${Math.min(progress, 100)}%`;
|
| 114 |
-
progressSlider.value = progress;
|
| 115 |
timeDisplay.textContent = `${formatTime(currentTime)} / ${formatTime(duration)}`;
|
| 116 |
}
|
| 117 |
|
|
@@ -120,11 +158,19 @@ function updatePlayerProgress() {
|
|
| 120 |
*/
|
| 121 |
function handleSeek(e) {
|
| 122 |
const percent = parseFloat(e.target.value);
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
}
|
| 127 |
-
audio.currentTime = (percent / 100) * duration;
|
| 128 |
updatePlayerProgress();
|
| 129 |
}
|
| 130 |
|
|
@@ -152,13 +198,39 @@ function updatePlayButton() {
|
|
| 152 |
}
|
| 153 |
|
| 154 |
/**
|
| 155 |
-
*
|
|
|
|
|
|
|
| 156 |
*/
|
| 157 |
-
function
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 162 |
}
|
| 163 |
|
| 164 |
/**
|
|
@@ -167,14 +239,27 @@ function toggleMute() {
|
|
| 167 |
*/
|
| 168 |
function updateDeviceInfo(info) {
|
| 169 |
const icon = info.device === "cuda" ? "fa-microchip" : "fa-server";
|
| 170 |
-
const
|
| 171 |
-
?
|
| 172 |
-
: "CPU
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 173 |
deviceInfo.innerHTML = `
|
| 174 |
-
<i class="fa-solid ${icon}"></i>
|
| 175 |
-
<span>${info.device_name}</span>
|
| 176 |
-
|
| 177 |
-
|
|
|
|
|
|
|
|
|
|
| 178 |
`;
|
| 179 |
deviceInfo.classList.add("visible");
|
| 180 |
}
|
|
@@ -207,58 +292,145 @@ initDeviceInfoStream();
|
|
| 207 |
// Custom player event listeners
|
| 208 |
playerPlayBtn.addEventListener("click", togglePlayerPlay);
|
| 209 |
progressSlider.addEventListener("input", handleSeek);
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 213 |
audio.addEventListener("timeupdate", updatePlayerProgress);
|
| 214 |
audio.addEventListener("ended", () => {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 215 |
updatePlayButton();
|
| 216 |
progressBar.style.width = "100%";
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 217 |
});
|
| 218 |
// Show pause button when audio actually starts playing
|
| 219 |
audio.addEventListener("playing", () => {
|
|
|
|
| 220 |
pauseBtn.classList.remove("hidden");
|
| 221 |
});
|
| 222 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 223 |
/**
|
| 224 |
-
*
|
| 225 |
* @param {string} jobId - The job ID for the audio
|
| 226 |
*/
|
| 227 |
-
async function
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 238 |
}
|
| 239 |
|
| 240 |
/**
|
| 241 |
* Download the current audio as a WAV file
|
| 242 |
*/
|
| 243 |
function downloadAudio() {
|
| 244 |
-
|
|
|
|
| 245 |
return;
|
| 246 |
}
|
| 247 |
|
| 248 |
-
const url = URL.createObjectURL(currentAudioBlob);
|
| 249 |
-
const a = document.createElement("a");
|
| 250 |
-
a.href = url;
|
| 251 |
-
|
| 252 |
// Create filename from document name
|
| 253 |
let filename = currentDocName || "audio";
|
| 254 |
-
// Remove file extension if present and add .wav
|
| 255 |
filename = filename.replace(/\.[^.]+$/, "") + ".wav";
|
| 256 |
-
a.download = filename;
|
| 257 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 258 |
document.body.appendChild(a);
|
| 259 |
a.click();
|
| 260 |
document.body.removeChild(a);
|
| 261 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 262 |
}
|
| 263 |
|
| 264 |
/**
|
|
@@ -269,6 +441,112 @@ function getSelectedLanguage() {
|
|
| 269 |
return selectedLanguage;
|
| 270 |
}
|
| 271 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 272 |
/**
|
| 273 |
* Show the input section and hide processing section
|
| 274 |
*/
|
|
@@ -283,9 +561,10 @@ function showInputSection() {
|
|
| 283 |
function showProcessingSection() {
|
| 284 |
inputSection.classList.add("hidden");
|
| 285 |
processingSection.classList.add("visible");
|
| 286 |
-
// Reset progress bar and hide
|
| 287 |
processingProgressBar.style.width = "0%";
|
| 288 |
pauseBtn.classList.add("hidden");
|
|
|
|
| 289 |
}
|
| 290 |
|
| 291 |
/**
|
|
@@ -318,10 +597,10 @@ function stopGeneration() {
|
|
| 318 |
isPaused = false;
|
| 319 |
updatePauseButton();
|
| 320 |
|
| 321 |
-
// Hide download button and
|
| 322 |
downloadBtn.classList.add("hidden");
|
| 323 |
pauseBtn.classList.add("hidden");
|
| 324 |
-
|
| 325 |
|
| 326 |
// Reset progress bar
|
| 327 |
processingProgressBar.style.width = "0%";
|
|
@@ -370,16 +649,23 @@ function updatePauseButton() {
|
|
| 370 |
}
|
| 371 |
}
|
| 372 |
|
|
|
|
|
|
|
| 373 |
/**
|
| 374 |
-
*
|
| 375 |
-
* @param {
|
| 376 |
-
* @returns {string}
|
| 377 |
*/
|
| 378 |
-
function
|
| 379 |
-
|
| 380 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 381 |
}
|
| 382 |
-
return `~${Math.ceil(seconds)}s remaining`;
|
| 383 |
}
|
| 384 |
|
| 385 |
/**
|
|
@@ -387,15 +673,15 @@ function formatTimeRemaining(seconds) {
|
|
| 387 |
* Sets up audio stream once job_id is received
|
| 388 |
* @param {Response} response - Fetch response with SSE stream
|
| 389 |
* @param {string} docName - Document name for display
|
|
|
|
| 390 |
* @returns {Promise<void>}
|
| 391 |
* @throws {Error} If stream contains an error event or fails
|
| 392 |
*/
|
| 393 |
-
async function processStream(response, docName) {
|
| 394 |
const reader = response.body.getReader();
|
| 395 |
const decoder = new TextDecoder();
|
| 396 |
let lastStatus = "";
|
| 397 |
-
let
|
| 398 |
-
let audioStarted = false;
|
| 399 |
|
| 400 |
// Reset estimated duration
|
| 401 |
estimatedDuration = 0;
|
|
@@ -419,52 +705,66 @@ async function processStream(response, docName) {
|
|
| 419 |
throw new Error(data.message || "TTS generation failed");
|
| 420 |
} else if (data.type === "start" && data.job_id) {
|
| 421 |
// Got job ID - start audio stream immediately
|
| 422 |
-
jobId = data.job_id;
|
| 423 |
-
//
|
| 424 |
-
|
| 425 |
-
|
|
|
|
| 426 |
}
|
| 427 |
// Display document info
|
| 428 |
updateDocInfo(data);
|
| 429 |
-
if (!
|
| 430 |
-
|
| 431 |
-
//
|
| 432 |
-
|
| 433 |
-
audio.src = `/api/audio/${jobId}`;
|
| 434 |
-
audio.load();
|
| 435 |
-
// Try to play (may need user interaction first time)
|
| 436 |
-
audio.play().catch(() => {
|
| 437 |
-
// Autoplay blocked - will play when user clicks
|
| 438 |
-
});
|
| 439 |
-
updatePlayButton();
|
| 440 |
-
// Pause button will be shown by the 'playing' event listener
|
| 441 |
}
|
| 442 |
-
|
| 443 |
showStatus(
|
| 444 |
-
|
| 445 |
"loading"
|
| 446 |
);
|
| 447 |
// Update progress bar
|
| 448 |
processingProgressBar.style.width = "5%";
|
| 449 |
} else if (data.type === "progress") {
|
| 450 |
lastStatus = data.status;
|
| 451 |
-
|
| 452 |
showStatus(
|
| 453 |
-
`<span class="spinner"></span>${data.percent}%
|
| 454 |
"loading"
|
| 455 |
);
|
| 456 |
// Update progress bar
|
| 457 |
processingProgressBar.style.width = `${data.percent}%`;
|
| 458 |
} else if (data.type === "complete") {
|
| 459 |
// Generation complete - show player
|
| 460 |
-
//
|
| 461 |
-
if (data.
|
| 462 |
-
|
| 463 |
-
// Use total_time as a rough guide
|
| 464 |
-
estimatedDuration = Math.max(estimatedDuration, audio.currentTime + 10);
|
| 465 |
}
|
| 466 |
-
filename
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 467 |
currentDocName = docName;
|
|
|
|
|
|
|
|
|
|
|
|
|
| 468 |
player.classList.add("visible");
|
| 469 |
// Set progress to 100%
|
| 470 |
processingProgressBar.style.width = "100%";
|
|
@@ -474,11 +774,6 @@ async function processStream(response, docName) {
|
|
| 474 |
"success"
|
| 475 |
);
|
| 476 |
updatePlayerProgress();
|
| 477 |
-
|
| 478 |
-
// Fetch audio blob for download capability
|
| 479 |
-
if (jobId) {
|
| 480 |
-
fetchAudioBlob(jobId);
|
| 481 |
-
}
|
| 482 |
}
|
| 483 |
} catch (parseError) {
|
| 484 |
// Check if it's our thrown error or a JSON parse error
|
|
@@ -518,11 +813,11 @@ async function handleFile(file) {
|
|
| 518 |
showStatus('<span class="spinner"></span> Extracting text...', "loading");
|
| 519 |
player.classList.remove("visible");
|
| 520 |
downloadBtn.classList.add("hidden");
|
| 521 |
-
currentAudioBlob = null;
|
| 522 |
|
| 523 |
const formData = new FormData();
|
| 524 |
formData.append("file", file);
|
| 525 |
formData.append("language", getSelectedLanguage());
|
|
|
|
| 526 |
|
| 527 |
// Create abort controller for this request
|
| 528 |
currentAbortController = new AbortController();
|
|
@@ -540,7 +835,7 @@ async function handleFile(file) {
|
|
| 540 |
}
|
| 541 |
|
| 542 |
// Process stream handles both progress SSE and starting audio playback
|
| 543 |
-
await processStream(response, file.name);
|
| 544 |
} catch (error) {
|
| 545 |
if (error.name === "AbortError") {
|
| 546 |
// User cancelled - already handled in stopGeneration
|
|
@@ -577,7 +872,6 @@ async function handleUrl(url) {
|
|
| 577 |
showStatus('<span class="spinner"></span> Fetching content...', "loading");
|
| 578 |
player.classList.remove("visible");
|
| 579 |
downloadBtn.classList.add("hidden");
|
| 580 |
-
currentAudioBlob = null;
|
| 581 |
urlSubmit.disabled = true;
|
| 582 |
|
| 583 |
// Create abort controller for this request
|
|
@@ -591,7 +885,8 @@ async function handleUrl(url) {
|
|
| 591 |
},
|
| 592 |
body: JSON.stringify({
|
| 593 |
url,
|
| 594 |
-
language: getSelectedLanguage()
|
|
|
|
| 595 |
}),
|
| 596 |
signal: currentAbortController.signal,
|
| 597 |
});
|
|
@@ -606,7 +901,7 @@ async function handleUrl(url) {
|
|
| 606 |
const docName = urlPath.split("/").pop() || "document";
|
| 607 |
|
| 608 |
// Process stream handles both progress SSE and starting audio playback
|
| 609 |
-
await processStream(response, docName);
|
| 610 |
} catch (error) {
|
| 611 |
if (error.name === "AbortError") {
|
| 612 |
// User cancelled - already handled in stopGeneration
|
|
@@ -641,7 +936,6 @@ async function handleText(text) {
|
|
| 641 |
showStatus('<span class="spinner"></span> Processing text...', "loading");
|
| 642 |
player.classList.remove("visible");
|
| 643 |
downloadBtn.classList.add("hidden");
|
| 644 |
-
currentAudioBlob = null;
|
| 645 |
textSubmit.disabled = true;
|
| 646 |
|
| 647 |
// Create abort controller for this request
|
|
@@ -655,7 +949,8 @@ async function handleText(text) {
|
|
| 655 |
},
|
| 656 |
body: JSON.stringify({
|
| 657 |
text,
|
| 658 |
-
language: getSelectedLanguage()
|
|
|
|
| 659 |
}),
|
| 660 |
signal: currentAbortController.signal,
|
| 661 |
});
|
|
@@ -665,8 +960,12 @@ async function handleText(text) {
|
|
| 665 |
throw new Error(error.detail || "Failed to process text");
|
| 666 |
}
|
| 667 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 668 |
// Process stream handles both progress SSE and starting audio playback
|
| 669 |
-
await processStream(response,
|
| 670 |
} catch (error) {
|
| 671 |
if (error.name === "AbortError") {
|
| 672 |
// User cancelled - already handled in stopGeneration
|
|
@@ -683,6 +982,15 @@ async function handleText(text) {
|
|
| 683 |
// Tab switching
|
| 684 |
tabs.forEach((tab) => {
|
| 685 |
tab.addEventListener("click", () => {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 686 |
tabs.forEach((t) => t.classList.remove("active"));
|
| 687 |
tabContents.forEach((tc) => tc.classList.remove("active"));
|
| 688 |
tab.classList.add("active");
|
|
@@ -712,7 +1020,7 @@ dropZone.addEventListener("drop", (e) => {
|
|
| 712 |
|
| 713 |
// Click to select file
|
| 714 |
dropZone.addEventListener("click", (e) => {
|
| 715 |
-
if (e.target !== fileInput
|
| 716 |
fileInput.click();
|
| 717 |
}
|
| 718 |
});
|
|
@@ -746,15 +1054,34 @@ textInput.addEventListener("keydown", (e) => {
|
|
| 746 |
}
|
| 747 |
});
|
| 748 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 749 |
// Stop button
|
| 750 |
stopBtn.addEventListener("click", stopGeneration);
|
| 751 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 752 |
// Pause button
|
| 753 |
pauseBtn.addEventListener("click", togglePause);
|
| 754 |
|
| 755 |
// Download button
|
| 756 |
downloadBtn.addEventListener("click", downloadAudio);
|
| 757 |
|
|
|
|
|
|
|
|
|
|
| 758 |
// Update pause button when audio state changes
|
| 759 |
audio.addEventListener("play", updatePauseButton);
|
| 760 |
audio.addEventListener("pause", updatePauseButton);
|
|
@@ -766,8 +1093,16 @@ audio.addEventListener("ended", () => {
|
|
| 766 |
// Language selection
|
| 767 |
languageButtons.forEach((btn) => {
|
| 768 |
btn.addEventListener("click", () => {
|
| 769 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 770 |
btn.classList.add("active");
|
| 771 |
-
|
| 772 |
});
|
| 773 |
});
|
|
|
|
| 24 |
const docInfo = document.getElementById("docInfo");
|
| 25 |
const languageButtons = document.querySelectorAll("#languageButtons .style-btn");
|
| 26 |
const processingProgressBar = document.getElementById("processingProgressBar");
|
| 27 |
+
const streamPlayBtn = document.getElementById("streamPlayBtn");
|
| 28 |
|
| 29 |
// Custom player elements
|
| 30 |
const playerPlayBtn = document.getElementById("playerPlayBtn");
|
| 31 |
const progressBar = document.getElementById("progressBar");
|
| 32 |
const progressSlider = document.getElementById("progressSlider");
|
| 33 |
const timeDisplay = document.getElementById("timeDisplay");
|
|
|
|
| 34 |
const downloadBtn = document.getElementById("downloadBtn");
|
| 35 |
+
const deleteBtn = document.getElementById("deleteBtn");
|
| 36 |
|
| 37 |
// Constants
|
| 38 |
const MAX_FILE_SIZE = 50 * 1024 * 1024; // 50MB
|
|
|
|
| 40 |
// State
|
| 41 |
let currentAbortController = null;
|
| 42 |
let selectedLanguage = "english";
|
| 43 |
+
let selectedStyle = "technical";
|
| 44 |
let isPaused = false;
|
| 45 |
let estimatedDuration = 0; // Estimated total duration from server
|
|
|
|
|
|
|
| 46 |
let currentDocName = ""; // Store document name for download filename
|
| 47 |
+
let playbackStartTime = 0; // When playback started (for tracking real elapsed time)
|
| 48 |
+
let playbackElapsed = 0; // Total elapsed playback time
|
| 49 |
|
| 50 |
/**
|
| 51 |
* Format time in seconds to MM:SS
|
|
|
|
| 93 |
const pageInfo = data.page_count ? `<span class="doc-pages"><i class="fa-solid fa-file"></i> ${data.page_count}p</span>` : "";
|
| 94 |
const charInfo = data.total_chars ? `<span class="doc-chars"><i class="fa-solid fa-font"></i> ${formatNumber(data.total_chars)}</span>` : "";
|
| 95 |
|
| 96 |
+
// Style icons mapping
|
| 97 |
+
const styleIcons = {
|
| 98 |
+
technical: "fa-microchip",
|
| 99 |
+
narrative: "fa-book-open",
|
| 100 |
+
child_narrative: "fa-child",
|
| 101 |
+
news: "fa-newspaper",
|
| 102 |
+
academic: "fa-graduation-cap"
|
| 103 |
+
};
|
| 104 |
+
|
| 105 |
+
// Language flags mapping
|
| 106 |
+
const langFlags = {
|
| 107 |
+
english: "🇬🇧",
|
| 108 |
+
chinese: "🇨🇳",
|
| 109 |
+
japanese: "🇯🇵",
|
| 110 |
+
korean: "🇰🇷"
|
| 111 |
+
};
|
| 112 |
+
|
| 113 |
+
const styleIcon = styleIcons[selectedStyle] || "fa-microchip";
|
| 114 |
+
const langFlag = langFlags[selectedLanguage] || "🇬🇧";
|
| 115 |
+
|
| 116 |
docInfo.innerHTML = `
|
| 117 |
<span class="doc-name" title="${docName}"><i class="fa-solid ${icon}"></i><span class="doc-name-text">${docName}</span></span>
|
| 118 |
${pageInfo}
|
| 119 |
${charInfo}
|
| 120 |
+
<span class="doc-style" title="Style: ${selectedStyle}"><i class="fa-solid ${styleIcon}"></i></span>
|
| 121 |
+
<span class="doc-lang" title="Language: ${selectedLanguage}">${langFlag}</span>
|
| 122 |
`;
|
| 123 |
}
|
| 124 |
|
|
|
|
| 126 |
* Update the custom player progress bar and time display
|
| 127 |
*/
|
| 128 |
/**
 * Refresh the custom player's progress bar, slider and time readout.
 * Position is tracked with wall-clock timestamps (playbackElapsed /
 * playbackStartTime) because the browser's currentTime/duration are
 * unreliable for a streaming WAV source.
 */
function updatePlayerProgress() {
  // Add the currently running segment only while audio is playing.
  const running = playbackStartTime > 0 && !audio.paused;
  const currentTime = running
    ? playbackElapsed + (Date.now() - playbackStartTime) / 1000
    : playbackElapsed;

  // Stretch the estimate whenever real playback outruns it.
  if (currentTime > estimatedDuration) {
    estimatedDuration = currentTime + 10;
  }
  // Fall back to one minute when no estimate exists yet.
  const duration = estimatedDuration > 0 ? estimatedDuration : 60;

  const pct = Math.min(duration > 0 ? (currentTime / duration) * 100 : 0, 100);
  progressBar.style.width = `${pct}%`;
  progressSlider.value = pct;
  timeDisplay.textContent = `${formatTime(currentTime)} / ${formatTime(duration)}`;
}
|
| 155 |
|
|
|
|
| 158 |
*/
|
| 159 |
/**
 * Handle a drag on the progress slider by moving the wall-clock playback
 * tracker (and, best-effort, the audio element) to the chosen position.
 * @param {Event} e - Input event from the range slider
 */
function handleSeek(e) {
  const fraction = parseFloat(e.target.value) / 100;
  const targetTime = fraction * (estimatedDuration || 60);

  // Rebase our own tracker at the new position.
  playbackElapsed = targetTime;
  playbackStartTime = audio.paused ? 0 : Date.now();

  // Seeking a streaming source is best-effort; failures are ignored
  // because the tracker above already reflects the new position.
  try {
    audio.currentTime = targetTime;
  } catch {
    // Not seekable (streaming) - nothing else to do.
  }
  updatePlayerProgress();
}
|
| 176 |
|
|
|
|
| 198 |
}
|
| 199 |
|
| 200 |
/**
|
| 201 |
+
* Get HTML for model state indicator
|
| 202 |
+
* @param {string} state - Model state: loaded, loading, unloaded, unloading
|
| 203 |
+
* @returns {string} HTML string for the model state indicator
|
| 204 |
*/
|
| 205 |
+
/**
 * Build the HTML snippet that describes the TTS model's load state.
 * Unknown states fall back to the "unloaded" presentation.
 * @param {string} state - Model state: loaded, loading, unloaded, unloading
 * @returns {string} HTML string for the model state indicator
 */
function getModelStateHtml(state) {
  const presentations = {
    loaded: {
      icon: "fa-circle-check",
      class: "model-loaded",
      text: "Model loaded",
      tooltip: "TTS model is loaded in memory and ready for inference"
    },
    loading: {
      icon: "fa-spinner fa-spin",
      class: "model-loading",
      text: "Loading...",
      tooltip: "TTS model is being loaded into memory"
    },
    unloaded: {
      icon: "fa-circle-xmark",
      class: "model-unloaded",
      text: "Model unloaded",
      tooltip: "TTS model is not loaded (will load on first request)"
    },
    unloading: {
      icon: "fa-spinner fa-spin",
      class: "model-unloading",
      text: "Unloading...",
      tooltip: "TTS model is being unloaded from memory"
    }
  };
  const { icon, class: cssClass, text, tooltip } =
    presentations[state] || presentations.unloaded;
  return `<span class="model-state ${cssClass}" title="${tooltip}"><i class="fa-solid ${icon}"></i> ${text}</span>`;
}
|
| 235 |
|
| 236 |
/**
|
|
|
|
| 239 |
*/
|
| 240 |
/**
 * Render the device/runtime info bar (device name, model state, memory
 * usage, timing stats, privacy note) and make it visible.
 * @param {Object} info - Device status payload from the server
 */
function updateDeviceInfo(info) {
  const isCuda = info.device === "cuda";
  const icon = isCuda ? "fa-microchip" : "fa-server";
  const deviceTooltip = isCuda
    ? "GPU accelerated inference for faster audio generation"
    : "CPU-based inference (slower than GPU)";
  // GPU memory figures only exist on CUDA devices.
  const gpuMemoryInfo = isCuda
    ? `<span class="device-memory" title="GPU memory used for model and inference"><i class="fa-solid fa-memory"></i> GPU: ${info.memory_used_gb}/${info.memory_total_gb}GB</span>`
    : "";
  const ramInfo = `<span class="device-memory" title="System RAM usage"><i class="fa-solid fa-memory"></i> RAM: ${info.ram_used_gb}/${info.ram_total_gb}GB</span>`;
  // Per-character generation speed, shown only once stats are available.
  const timingInfo = info.seconds_per_char !== undefined
    ? `<span class="device-timing" title="Average time to generate audio per character of text"><i class="fa-solid fa-stopwatch"></i> ${info.seconds_per_char.toFixed(4)}s/char</span>`
    : "";
  const modelStateInfo = getModelStateHtml(info.model_state);
  deviceInfo.innerHTML = `
    <i class="fa-solid ${icon}" title="${deviceTooltip}"></i>
    <span title="${deviceTooltip}">${info.device_name}</span>
    ${modelStateInfo}
    ${gpuMemoryInfo}
    ${ramInfo}
    ${timingInfo}
    <span class="device-ephemeral" title="Your documents are processed in memory only. Nothing is saved to disk or stored after processing."><i class="fa-solid fa-shield-halved"></i> No files stored</span>
  `;
  deviceInfo.classList.add("visible");
}
|
|
|
|
| 292 |
// Custom player event listeners
|
| 293 |
playerPlayBtn.addEventListener("click", togglePlayerPlay);
|
| 294 |
progressSlider.addEventListener("input", handleSeek);
|
| 295 |
+
audio.addEventListener("play", () => {
|
| 296 |
+
// Start tracking real playback time
|
| 297 |
+
playbackStartTime = Date.now();
|
| 298 |
+
updatePlayButton();
|
| 299 |
+
});
|
| 300 |
+
audio.addEventListener("pause", () => {
|
| 301 |
+
// Save elapsed time when pausing
|
| 302 |
+
if (playbackStartTime > 0) {
|
| 303 |
+
playbackElapsed += (Date.now() - playbackStartTime) / 1000;
|
| 304 |
+
playbackStartTime = 0;
|
| 305 |
+
}
|
| 306 |
+
updatePlayButton();
|
| 307 |
+
});
|
| 308 |
audio.addEventListener("timeupdate", updatePlayerProgress);
|
| 309 |
audio.addEventListener("ended", () => {
|
| 310 |
+
// Update elapsed to match duration on completion
|
| 311 |
+
if (playbackStartTime > 0) {
|
| 312 |
+
playbackElapsed += (Date.now() - playbackStartTime) / 1000;
|
| 313 |
+
playbackStartTime = 0;
|
| 314 |
+
}
|
| 315 |
+
// Ensure we show completion
|
| 316 |
+
if (estimatedDuration > 0 && playbackElapsed < estimatedDuration) {
|
| 317 |
+
playbackElapsed = estimatedDuration;
|
| 318 |
+
}
|
| 319 |
updatePlayButton();
|
| 320 |
progressBar.style.width = "100%";
|
| 321 |
+
timeDisplay.textContent = `${formatTime(estimatedDuration)} / ${formatTime(estimatedDuration)}`;
|
| 322 |
+
});
|
| 323 |
+
// Update duration when metadata is available
|
| 324 |
+
audio.addEventListener("loadedmetadata", () => {
|
| 325 |
+
// If browser has a valid duration, use it instead of estimate
|
| 326 |
+
if (isFinite(audio.duration) && audio.duration > 0 && audio.duration < 36000) {
|
| 327 |
+
estimatedDuration = audio.duration;
|
| 328 |
+
}
|
| 329 |
+
updatePlayerProgress();
|
| 330 |
+
});
|
| 331 |
+
// Also check duration changes (for streaming audio)
|
| 332 |
+
audio.addEventListener("durationchange", () => {
|
| 333 |
+
if (isFinite(audio.duration) && audio.duration > 0 && audio.duration < 36000) {
|
| 334 |
+
estimatedDuration = audio.duration;
|
| 335 |
+
}
|
| 336 |
+
updatePlayerProgress();
|
| 337 |
+
});
|
| 338 |
+
// Log audio errors for debugging
|
| 339 |
+
audio.addEventListener("error", () => {
|
| 340 |
+
console.error("Audio error:", audio.error?.message || "Unknown error");
|
| 341 |
});
|
| 342 |
// Show pause button when audio actually starts playing
|
| 343 |
audio.addEventListener("playing", () => {
|
| 344 |
+
streamPlayBtn.classList.add("hidden");
|
| 345 |
pauseBtn.classList.remove("hidden");
|
| 346 |
});
|
| 347 |
|
| 348 |
+
// Show stream play button when audio has enough data to start playing
|
| 349 |
+
audio.addEventListener("canplay", () => {
|
| 350 |
+
// Only show if processing is still in progress (player not visible yet)
|
| 351 |
+
// and audio is paused (not already playing) and pause button isn't showing
|
| 352 |
+
if (!player.classList.contains("visible") && audio.paused && pauseBtn.classList.contains("hidden")) {
|
| 353 |
+
streamPlayBtn.classList.remove("hidden");
|
| 354 |
+
}
|
| 355 |
+
});
|
| 356 |
+
|
| 357 |
/**
|
| 358 |
+
* Start streaming audio playback and enable download from cache
|
| 359 |
* @param {string} jobId - The job ID for the audio
|
| 360 |
*/
|
| 361 |
+
/**
 * Start streaming audio for a job and remember the job ID for download.
 * Does not auto-play; the `canplay` handler reveals the play button.
 * @param {string} jobId - The job ID for the audio
 */
async function startAudioStream(jobId) {
  // New stream: clear the wall-clock playback tracker.
  playbackStartTime = 0;
  playbackElapsed = 0;

  // Point the audio element at the streaming endpoint and begin loading.
  audio.src = `/api/audio/${jobId}`;
  audio.load();

  // Stash the job ID on the element so downloadAudio() can find it later.
  audio.dataset.jobId = jobId;
}
|
| 377 |
|
| 378 |
/**
|
| 379 |
* Download the current audio as a WAV file
|
| 380 |
*/
|
| 381 |
/**
 * Download the generated audio as a WAV file via the download endpoint.
 * Uses the job ID stashed on the audio element; no-op when it is absent.
 */
function downloadAudio() {
  const jobId = audio.dataset.jobId;
  if (!jobId) {
    return;
  }

  // Derive "<docname>.wav" from the current document name.
  const baseName = (currentDocName || "audio").replace(/\.[^.]+$/, "");
  const wavName = baseName + ".wav";

  // Trigger the browser download through a transient anchor element.
  const link = document.createElement("a");
  link.href = `/api/download/${jobId}?filename=${encodeURIComponent(wavName)}`;
  link.download = wavName;
  document.body.appendChild(link);
  link.click();
  document.body.removeChild(link);
}
|
| 399 |
+
|
| 400 |
+
/**
|
| 401 |
+
* Delete the current audio and reset the player
|
| 402 |
+
*/
|
| 403 |
+
/**
 * Remove the current audio: stop playback, run the delete animation,
 * then reset all player state and return to the input view.
 */
function deleteAudio() {
  audio.pause();

  // Kick off the CSS removal animation; tear down once it has finished.
  player.classList.add("deleting");

  const ANIMATION_MS = 300;
  setTimeout(() => {
    // Drop the audio source and rewind.
    audio.src = "";
    audio.currentTime = 0;

    // Forget the document and duration estimate.
    currentDocName = "";
    estimatedDuration = 0;

    // Hide the player and its action buttons.
    player.classList.remove("visible", "deleting");
    downloadBtn.classList.add("hidden");
    deleteBtn.classList.add("hidden");

    // Zero out the progress UI.
    progressBar.style.width = "0%";
    progressSlider.value = 0;
    timeDisplay.textContent = "0:00 / 0:00";
    updatePlayButton();

    // Bring the input form back.
    inputSection.classList.remove("hidden");
  }, ANIMATION_MS);
}
|
| 435 |
|
| 436 |
/**
|
|
|
|
| 441 |
return selectedLanguage;
|
| 442 |
}
|
| 443 |
|
| 444 |
+
/**
|
| 445 |
+
* Detect language from text based on character scripts.
|
| 446 |
+
* @param {string} text - The text to analyze
|
| 447 |
+
* @returns {string|null} Detected language or null if mostly ASCII/Latin
|
| 448 |
+
*/
|
| 449 |
+
/**
 * Detect language from text based on character scripts.
 * Counts CJK ideographs, kana, hangul and basic Latin letters, then picks
 * the dominant script by ratio. Uses code points rather than raw UTF-16
 * units so surrogate pairs (astral characters) are classified as single
 * characters instead of as two meaningless halves.
 * @param {string} text - The text to analyze
 * @returns {string|null} Detected language or null if mostly ASCII/Latin-free
 */
function detectLanguage(text) {
  // Too little text to make a meaningful call.
  if (!text || text.length < 5) {
    return null;
  }

  let chinese = 0;
  let japanese = 0; // Hiragana + Katakana
  let korean = 0;
  let latin = 0;

  // for...of iterates code points, so pair it with codePointAt (charCodeAt
  // would return only the high surrogate for characters outside the BMP).
  for (const char of text) {
    const code = char.codePointAt(0);
    if (code >= 0x4e00 && code <= 0x9fff) {
      // CJK Unified Ideographs (shared by Chinese/Japanese)
      chinese++;
    } else if (code >= 0x3040 && code <= 0x30ff) {
      // Hiragana (3040-309F) and Katakana (30A0-30FF) are contiguous
      japanese++;
    } else if (code >= 0xac00 && code <= 0xd7af) {
      // Hangul Syllables
      korean++;
    } else if (code >= 0x1100 && code <= 0x11ff) {
      // Hangul Jamo
      korean++;
    } else if ((code >= 0x41 && code <= 0x5a) || (code >= 0x61 && code <= 0x7a)) {
      // Basic Latin letters
      latin++;
    }
  }

  const total = chinese + japanese + korean + latin;
  if (total === 0) {
    return null;
  }

  // Japanese mixes kanji (CJK ideographs) with kana, so kana presence wins.
  if (japanese > 0 && (japanese + chinese) / total > 0.3) {
    return "japanese";
  }
  if (korean / total > 0.3) {
    return "korean";
  }
  // Chinese: CJK ideographs without any kana.
  if (chinese / total > 0.3) {
    return "chinese";
  }
  // Default to English for predominantly Latin text.
  if (latin / total > 0.5) {
    return "english";
  }
  return null;
}
|
| 513 |
+
|
| 514 |
+
/**
|
| 515 |
+
* Set the selected language, optionally marking it as auto-detected.
|
| 516 |
+
* @param {string} lang - Language to select
|
| 517 |
+
* @param {boolean} isAuto - Whether this was auto-detected
|
| 518 |
+
*/
|
| 519 |
+
/**
 * Select a language button, optionally flagging it as auto-detected.
 * No-op when the language has no button or is already selected.
 * @param {string} lang - Language to select
 * @param {boolean} isAuto - Whether this was auto-detected
 */
function setLanguage(lang, isAuto = false) {
  const target = document.querySelector(
    `#languageButtons .style-btn[data-language="${lang}"]`
  );
  if (!target || selectedLanguage === lang) {
    return;
  }

  // Clear the previous selection and any lingering auto-detect highlight.
  for (const button of languageButtons) {
    button.classList.remove("active", "auto-detected");
  }
  target.classList.add("active");
  selectedLanguage = lang;

  if (isAuto) {
    // Briefly pulse the button so the user notices the automatic switch.
    target.classList.add("auto-detected");
    setTimeout(() => target.classList.remove("auto-detected"), 1500);
  }
}
|
| 541 |
+
|
| 542 |
+
/**
|
| 543 |
+
* Get the currently selected style
|
| 544 |
+
* @returns {string} The selected style ID
|
| 545 |
+
*/
|
| 546 |
+
/**
 * Get the currently selected narration style.
 * @returns {string} The selected style ID (e.g. "technical")
 */
function getSelectedStyle() {
  return selectedStyle;
}
|
| 549 |
+
|
| 550 |
/**
|
| 551 |
* Show the input section and hide processing section
|
| 552 |
*/
|
|
|
|
| 561 |
/**
 * Switch the UI from the input form to the processing view,
 * clearing the progress bar and hiding the playback controls.
 */
function showProcessingSection() {
  inputSection.classList.add("hidden");
  processingSection.classList.add("visible");
  // Fresh run: empty progress bar, no pause/stream-play controls yet.
  processingProgressBar.style.width = "0%";
  [pauseBtn, streamPlayBtn].forEach((btn) => btn.classList.add("hidden"));
}
|
| 569 |
|
| 570 |
/**
|
|
|
|
| 597 |
isPaused = false;
|
| 598 |
updatePauseButton();
|
| 599 |
|
| 600 |
+
// Hide download button, pause button, and stream play button
|
| 601 |
downloadBtn.classList.add("hidden");
|
| 602 |
pauseBtn.classList.add("hidden");
|
| 603 |
+
streamPlayBtn.classList.add("hidden");
|
| 604 |
|
| 605 |
// Reset progress bar
|
| 606 |
processingProgressBar.style.width = "0%";
|
|
|
|
| 649 |
}
|
| 650 |
}
|
| 651 |
|
| 652 |
+
|
| 653 |
+
|
| 654 |
/**
|
| 655 |
+
* Get icon class for source type
|
| 656 |
+
* @param {string} sourceType - The source type ("pdf", "url", "text")
|
| 657 |
+
* @returns {string} Font Awesome icon class
|
| 658 |
*/
|
| 659 |
+
/**
 * Get the Font Awesome icon class for a source type.
 * Unknown types fall back to the keyboard (plain text) icon.
 * @param {string} sourceType - The source type ("pdf", "url", "text")
 * @returns {string} Font Awesome icon class
 */
function getSourceIcon(sourceType) {
  const icons = {
    pdf: "fa-file-pdf",
    url: "fa-link",
    text: "fa-keyboard"
  };
  return icons[sourceType] || "fa-keyboard";
}
|
| 670 |
|
| 671 |
/**
|
|
|
|
| 673 |
* Sets up audio stream once job_id is received
|
| 674 |
* @param {Response} response - Fetch response with SSE stream
|
| 675 |
* @param {string} docName - Document name for display
|
| 676 |
+
* @param {string} sourceType - Source type ("pdf", "url", "text")
|
| 677 |
* @returns {Promise<void>}
|
| 678 |
* @throws {Error} If stream contains an error event or fails
|
| 679 |
*/
|
| 680 |
+
async function processStream(response, docName, sourceType = "text") {
|
| 681 |
const reader = response.body.getReader();
|
| 682 |
const decoder = new TextDecoder();
|
| 683 |
let lastStatus = "";
|
| 684 |
+
let audioJobId = null;
|
|
|
|
| 685 |
|
| 686 |
// Reset estimated duration
|
| 687 |
estimatedDuration = 0;
|
|
|
|
| 705 |
throw new Error(data.message || "TTS generation failed");
|
| 706 |
} else if (data.type === "start" && data.job_id) {
|
| 707 |
// Got job ID - start audio stream immediately
|
| 708 |
+
const jobId = data.job_id;
|
| 709 |
+
// Estimate audio duration from character count
|
| 710 |
+
// Typical speech is ~14 chars/sec (150 wpm, 5 chars/word)
|
| 711 |
+
if (data.total_chars) {
|
| 712 |
+
estimatedDuration = data.total_chars / 14;
|
| 713 |
}
|
| 714 |
// Display document info
|
| 715 |
updateDocInfo(data);
|
| 716 |
+
if (!audioJobId) {
|
| 717 |
+
audioJobId = jobId;
|
| 718 |
+
// Start streaming playback immediately
|
| 719 |
+
startAudioStream(jobId);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 720 |
}
|
| 721 |
+
// Show generating status
|
| 722 |
showStatus(
|
| 723 |
+
'<span class="spinner"></span> Generating...',
|
| 724 |
"loading"
|
| 725 |
);
|
| 726 |
// Update progress bar
|
| 727 |
processingProgressBar.style.width = "5%";
|
| 728 |
} else if (data.type === "progress") {
|
| 729 |
lastStatus = data.status;
|
| 730 |
+
// Show progress percentage
|
| 731 |
showStatus(
|
| 732 |
+
`<span class="spinner"></span> ${data.percent}%`,
|
| 733 |
"loading"
|
| 734 |
);
|
| 735 |
// Update progress bar
|
| 736 |
processingProgressBar.style.width = `${data.percent}%`;
|
| 737 |
} else if (data.type === "complete") {
|
| 738 |
// Generation complete - show player
|
| 739 |
+
// Use actual audio duration from server if available
|
| 740 |
+
if (data.audio_duration && data.audio_duration > 0) {
|
| 741 |
+
estimatedDuration = data.audio_duration;
|
|
|
|
|
|
|
| 742 |
}
|
| 743 |
+
// Build filename with style and language indicators
|
| 744 |
+
const styleIcons = {
|
| 745 |
+
technical: "fa-microchip",
|
| 746 |
+
conversational: "fa-comments",
|
| 747 |
+
storytelling: "fa-book-open",
|
| 748 |
+
child_narrative: "fa-child",
|
| 749 |
+
news: "fa-newspaper",
|
| 750 |
+
academic: "fa-graduation-cap"
|
| 751 |
+
};
|
| 752 |
+
const langFlags = {
|
| 753 |
+
english: "🇬🇧",
|
| 754 |
+
chinese: "🇨🇳",
|
| 755 |
+
japanese: "🇯🇵",
|
| 756 |
+
korean: "🇰🇷"
|
| 757 |
+
};
|
| 758 |
+
const usedStyle = getSelectedStyle();
|
| 759 |
+
const usedLang = getSelectedLanguage();
|
| 760 |
+
const styleIcon = styleIcons[usedStyle] || "fa-microchip";
|
| 761 |
+
const langFlag = langFlags[usedLang] || "🇬🇧";
|
| 762 |
+
filename.innerHTML = `<i class="fa-solid ${getSourceIcon(sourceType)}"></i> ${docName} <span class="filename-meta"><i class="fa-solid ${styleIcon}" title="Style: ${usedStyle}"></i><span title="Language: ${usedLang}">${langFlag}</span></span>`;
|
| 763 |
currentDocName = docName;
|
| 764 |
+
// Hide stream buttons, show full player with download
|
| 765 |
+
streamPlayBtn.classList.add("hidden");
|
| 766 |
+
downloadBtn.classList.remove("hidden");
|
| 767 |
+
deleteBtn.classList.remove("hidden");
|
| 768 |
player.classList.add("visible");
|
| 769 |
// Set progress to 100%
|
| 770 |
processingProgressBar.style.width = "100%";
|
|
|
|
| 774 |
"success"
|
| 775 |
);
|
| 776 |
updatePlayerProgress();
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 777 |
}
|
| 778 |
} catch (parseError) {
|
| 779 |
// Check if it's our thrown error or a JSON parse error
|
|
|
|
| 813 |
showStatus('<span class="spinner"></span> Extracting text...', "loading");
|
| 814 |
player.classList.remove("visible");
|
| 815 |
downloadBtn.classList.add("hidden");
|
|
|
|
| 816 |
|
| 817 |
const formData = new FormData();
|
| 818 |
formData.append("file", file);
|
| 819 |
formData.append("language", getSelectedLanguage());
|
| 820 |
+
formData.append("style", getSelectedStyle());
|
| 821 |
|
| 822 |
// Create abort controller for this request
|
| 823 |
currentAbortController = new AbortController();
|
|
|
|
| 835 |
}
|
| 836 |
|
| 837 |
// Process stream handles both progress SSE and starting audio playback
|
| 838 |
+
await processStream(response, file.name, "pdf");
|
| 839 |
} catch (error) {
|
| 840 |
if (error.name === "AbortError") {
|
| 841 |
// User cancelled - already handled in stopGeneration
|
|
|
|
| 872 |
showStatus('<span class="spinner"></span> Fetching content...', "loading");
|
| 873 |
player.classList.remove("visible");
|
| 874 |
downloadBtn.classList.add("hidden");
|
|
|
|
| 875 |
urlSubmit.disabled = true;
|
| 876 |
|
| 877 |
// Create abort controller for this request
|
|
|
|
| 885 |
},
|
| 886 |
body: JSON.stringify({
|
| 887 |
url,
|
| 888 |
+
language: getSelectedLanguage(),
|
| 889 |
+
style: getSelectedStyle()
|
| 890 |
}),
|
| 891 |
signal: currentAbortController.signal,
|
| 892 |
});
|
|
|
|
| 901 |
const docName = urlPath.split("/").pop() || "document";
|
| 902 |
|
| 903 |
// Process stream handles both progress SSE and starting audio playback
|
| 904 |
+
await processStream(response, docName, "url");
|
| 905 |
} catch (error) {
|
| 906 |
if (error.name === "AbortError") {
|
| 907 |
// User cancelled - already handled in stopGeneration
|
|
|
|
| 936 |
showStatus('<span class="spinner"></span> Processing text...', "loading");
|
| 937 |
player.classList.remove("visible");
|
| 938 |
downloadBtn.classList.add("hidden");
|
|
|
|
| 939 |
textSubmit.disabled = true;
|
| 940 |
|
| 941 |
// Create abort controller for this request
|
|
|
|
| 949 |
},
|
| 950 |
body: JSON.stringify({
|
| 951 |
text,
|
| 952 |
+
language: getSelectedLanguage(),
|
| 953 |
+
style: getSelectedStyle()
|
| 954 |
}),
|
| 955 |
signal: currentAbortController.signal,
|
| 956 |
});
|
|
|
|
| 960 |
throw new Error(error.detail || "Failed to process text");
|
| 961 |
}
|
| 962 |
|
| 963 |
+
// Generate document name from first few words
|
| 964 |
+
const words = text.trim().split(/\s+/).slice(0, 5).join(" ");
|
| 965 |
+
const docName = words.length > 30 ? words.slice(0, 30) + "..." : words;
|
| 966 |
+
|
| 967 |
// Process stream handles both progress SSE and starting audio playback
|
| 968 |
+
await processStream(response, docName, "text");
|
| 969 |
} catch (error) {
|
| 970 |
if (error.name === "AbortError") {
|
| 971 |
// User cancelled - already handled in stopGeneration
|
|
|
|
| 982 |
// Tab switching
|
| 983 |
tabs.forEach((tab) => {
|
| 984 |
tab.addEventListener("click", () => {
|
| 985 |
+
const isAlreadyActive = tab.classList.contains("active");
|
| 986 |
+
const isUploadTab = tab.dataset.tab === "upload";
|
| 987 |
+
|
| 988 |
+
// If clicking on already-active upload tab, open file picker
|
| 989 |
+
if (isAlreadyActive && isUploadTab) {
|
| 990 |
+
fileInput.click();
|
| 991 |
+
return;
|
| 992 |
+
}
|
| 993 |
+
|
| 994 |
tabs.forEach((t) => t.classList.remove("active"));
|
| 995 |
tabContents.forEach((tc) => tc.classList.remove("active"));
|
| 996 |
tab.classList.add("active");
|
|
|
|
| 1020 |
|
| 1021 |
// Click to select file
|
| 1022 |
dropZone.addEventListener("click", (e) => {
|
| 1023 |
+
if (e.target !== fileInput) {
|
| 1024 |
fileInput.click();
|
| 1025 |
}
|
| 1026 |
});
|
|
|
|
| 1054 |
}
|
| 1055 |
});
|
| 1056 |
|
| 1057 |
+
// Auto-detect language from text input
|
| 1058 |
+
textInput.addEventListener("input", () => {
|
| 1059 |
+
const detected = detectLanguage(textInput.value);
|
| 1060 |
+
if (detected) {
|
| 1061 |
+
setLanguage(detected, true);
|
| 1062 |
+
}
|
| 1063 |
+
});
|
| 1064 |
+
|
| 1065 |
// Stop button
|
| 1066 |
stopBtn.addEventListener("click", stopGeneration);
|
| 1067 |
|
| 1068 |
+
// Stream play button (during processing)
|
| 1069 |
+
streamPlayBtn.addEventListener("click", () => {
|
| 1070 |
+
audio.play().catch(() => {});
|
| 1071 |
+
// Hide stream play button and show pause button
|
| 1072 |
+
streamPlayBtn.classList.add("hidden");
|
| 1073 |
+
pauseBtn.classList.remove("hidden");
|
| 1074 |
+
});
|
| 1075 |
+
|
| 1076 |
// Pause button
|
| 1077 |
pauseBtn.addEventListener("click", togglePause);
|
| 1078 |
|
| 1079 |
// Download button
|
| 1080 |
downloadBtn.addEventListener("click", downloadAudio);
|
| 1081 |
|
| 1082 |
+
// Delete button
|
| 1083 |
+
deleteBtn.addEventListener("click", deleteAudio);
|
| 1084 |
+
|
| 1085 |
// Update pause button when audio state changes
|
| 1086 |
audio.addEventListener("play", updatePauseButton);
|
| 1087 |
audio.addEventListener("pause", updatePauseButton);
|
|
|
|
| 1093 |
// Language selection
|
| 1094 |
languageButtons.forEach((btn) => {
|
| 1095 |
btn.addEventListener("click", () => {
|
| 1096 |
+
setLanguage(btn.dataset.language, false);
|
| 1097 |
+
});
|
| 1098 |
+
});
|
| 1099 |
+
|
| 1100 |
+
// Style selection
|
| 1101 |
+
const styleButtons = document.querySelectorAll("#styleButtons .style-btn");
|
| 1102 |
+
styleButtons.forEach((btn) => {
|
| 1103 |
+
btn.addEventListener("click", () => {
|
| 1104 |
+
styleButtons.forEach((b) => b.classList.remove("active"));
|
| 1105 |
btn.classList.add("active");
|
| 1106 |
+
selectedStyle = btn.dataset.style;
|
| 1107 |
});
|
| 1108 |
});
|
src/talking_snake/static/index.html
CHANGED
|
@@ -26,6 +26,9 @@
|
|
| 26 |
<link rel="icon" type="image/png" sizes="512x512" href="/static/icon-512.png">
|
| 27 |
|
| 28 |
<link rel="stylesheet" href="/static/styles.css">
|
|
|
|
|
|
|
|
|
|
| 29 |
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.5.1/css/all.min.css" integrity="sha512-DTOQO9RWCH3ppGqcWaEA1BIZOC6xxalwEsw9c2QQeAIftl+Vegovlnee1c9QX4TctnWMn13TZye+giMm8e2LwA==" crossorigin="anonymous" referrerpolicy="no-referrer">
|
| 30 |
<script src="https://unpkg.com/htmx.org@2.0.4"></script>
|
| 31 |
</head>
|
|
@@ -33,26 +36,37 @@
|
|
| 33 |
<div class="main-content">
|
| 34 |
<img src="/static/talking_snake.png" alt="Talking Snake" class="logo">
|
| 35 |
<h1>Talking Snake</h1>
|
| 36 |
-
<p class="subtitle">Transform PDFs & Web into Audio</p>
|
| 37 |
|
| 38 |
<div class="container">
|
| 39 |
<div class="input-section" id="inputSection">
|
| 40 |
<div class="options-row">
|
| 41 |
-
<div class="
|
| 42 |
-
<span class="style-label">
|
| 43 |
-
<div class="style-buttons" id="
|
| 44 |
-
<button class="style-btn
|
| 45 |
-
|
| 46 |
</button>
|
| 47 |
-
<button class="style-btn
|
| 48 |
-
|
| 49 |
</button>
|
| 50 |
-
<button class="style-btn
|
| 51 |
-
|
| 52 |
</button>
|
| 53 |
-
<button class="style-btn
|
| 54 |
-
|
| 55 |
</button>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
</div>
|
| 57 |
</div>
|
| 58 |
</div>
|
|
@@ -67,19 +81,16 @@
|
|
| 67 |
<div class="drop-zone" id="dropZone">
|
| 68 |
<i class="fa-solid fa-file-pdf drop-icon"></i>
|
| 69 |
<p>Drag & drop a PDF here</p>
|
| 70 |
-
<
|
| 71 |
-
<i class="fa-solid fa-folder-open"></i> Choose File
|
| 72 |
-
<input type="file" id="fileInput" accept=".pdf">
|
| 73 |
-
</label>
|
| 74 |
-
<p class="hint">Supports PDF documents up to 50MB</p>
|
| 75 |
</div>
|
| 76 |
</div>
|
| 77 |
|
| 78 |
<div class="tab-content" id="url-tab">
|
| 79 |
<div class="url-form">
|
| 80 |
-
<
|
| 81 |
-
|
| 82 |
-
|
|
|
|
| 83 |
</div>
|
| 84 |
</div>
|
| 85 |
|
|
@@ -87,7 +98,6 @@
|
|
| 87 |
<div class="text-form">
|
| 88 |
<textarea id="textInput" placeholder="Paste or type your text here..." rows="6"></textarea>
|
| 89 |
<button class="submit-btn" id="textSubmit"><i class="fa-solid fa-microphone"></i> Read Text</button>
|
| 90 |
-
<p class="hint">Paste any text you want to hear read aloud</p>
|
| 91 |
</div>
|
| 92 |
</div>
|
| 93 |
</div>
|
|
@@ -102,6 +112,7 @@
|
|
| 102 |
<div class="processing-progress-bar" id="processingProgressBar"></div>
|
| 103 |
</div>
|
| 104 |
<div class="control-buttons">
|
|
|
|
| 105 |
<button class="control-btn pause-btn hidden" id="pauseBtn" title="Pause/Resume"><i class="fa-solid fa-pause"></i></button>
|
| 106 |
<button class="control-btn stop-btn" id="stopBtn" title="Stop generation"><i class="fa-solid fa-stop"></i></button>
|
| 107 |
</div>
|
|
@@ -121,12 +132,12 @@
|
|
| 121 |
<input type="range" class="progress-slider" id="progressSlider" min="0" max="100" value="0">
|
| 122 |
</div>
|
| 123 |
<span class="time-display" id="timeDisplay">0:00 / 0:00</span>
|
| 124 |
-
<button class="player-btn volume-btn" id="volumeBtn" title="Mute/Unmute">
|
| 125 |
-
<i class="fa-solid fa-volume-high"></i>
|
| 126 |
-
</button>
|
| 127 |
<button class="player-btn download-btn hidden" id="downloadBtn" title="Download Audio">
|
| 128 |
<i class="fa-solid fa-download"></i>
|
| 129 |
</button>
|
|
|
|
|
|
|
|
|
|
| 130 |
</div>
|
| 131 |
<audio id="audio" preload="auto"></audio>
|
| 132 |
</div>
|
|
|
|
| 26 |
<link rel="icon" type="image/png" sizes="512x512" href="/static/icon-512.png">
|
| 27 |
|
| 28 |
<link rel="stylesheet" href="/static/styles.css">
|
| 29 |
+
<link rel="preconnect" href="https://fonts.googleapis.com">
|
| 30 |
+
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
|
| 31 |
+
<link rel="stylesheet" href="https://fonts.googleapis.com/css2?family=Fredoka:wght@500&display=swap">
|
| 32 |
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.5.1/css/all.min.css" integrity="sha512-DTOQO9RWCH3ppGqcWaEA1BIZOC6xxalwEsw9c2QQeAIftl+Vegovlnee1c9QX4TctnWMn13TZye+giMm8e2LwA==" crossorigin="anonymous" referrerpolicy="no-referrer">
|
| 33 |
<script src="https://unpkg.com/htmx.org@2.0.4"></script>
|
| 34 |
</head>
|
|
|
|
| 36 |
<div class="main-content">
|
| 37 |
<img src="/static/talking_snake.png" alt="Talking Snake" class="logo">
|
| 38 |
<h1>Talking Snake</h1>
|
|
|
|
| 39 |
|
| 40 |
<div class="container">
|
| 41 |
<div class="input-section" id="inputSection">
|
| 42 |
<div class="options-row">
|
| 43 |
+
<div class="style-selector">
|
| 44 |
+
<span class="style-label">Style:</span>
|
| 45 |
+
<div class="style-buttons" id="styleButtons">
|
| 46 |
+
<button class="style-btn active" data-style="technical" title="Clear, precise reading for code and technical documentation">
|
| 47 |
+
<i class="fa-solid fa-microchip"></i>
|
| 48 |
</button>
|
| 49 |
+
<button class="style-btn" data-style="narrative" title="Natural, engaging reading for articles and stories">
|
| 50 |
+
<i class="fa-solid fa-book-open"></i>
|
| 51 |
</button>
|
| 52 |
+
<button class="style-btn" data-style="child_narrative" title="Playful, expressive reading for children's stories">
|
| 53 |
+
<i class="fa-solid fa-child"></i>
|
| 54 |
</button>
|
| 55 |
+
<button class="style-btn" data-style="news" title="Authoritative, clear delivery for news and reports">
|
| 56 |
+
<i class="fa-solid fa-newspaper"></i>
|
| 57 |
</button>
|
| 58 |
+
<button class="style-btn" data-style="academic" title="Measured, scholarly reading for papers and research">
|
| 59 |
+
<i class="fa-solid fa-graduation-cap"></i>
|
| 60 |
+
</button>
|
| 61 |
+
</div>
|
| 62 |
+
</div>
|
| 63 |
+
<div class="language-selector">
|
| 64 |
+
<span class="style-label">Language:</span>
|
| 65 |
+
<div class="style-buttons" id="languageButtons">
|
| 66 |
+
<button class="style-btn lang-btn active" data-language="english" title="English">🇬🇧</button>
|
| 67 |
+
<button class="style-btn lang-btn" data-language="chinese" title="Chinese">🇨🇳</button>
|
| 68 |
+
<button class="style-btn lang-btn" data-language="japanese" title="Japanese">🇯🇵</button>
|
| 69 |
+
<button class="style-btn lang-btn" data-language="korean" title="Korean">🇰🇷</button>
|
| 70 |
</div>
|
| 71 |
</div>
|
| 72 |
</div>
|
|
|
|
| 81 |
<div class="drop-zone" id="dropZone">
|
| 82 |
<i class="fa-solid fa-file-pdf drop-icon"></i>
|
| 83 |
<p>Drag & drop a PDF here</p>
|
| 84 |
+
<input type="file" id="fileInput" accept=".pdf" class="hidden-file-input">
|
|
|
|
|
|
|
|
|
|
|
|
|
| 85 |
</div>
|
| 86 |
</div>
|
| 87 |
|
| 88 |
<div class="tab-content" id="url-tab">
|
| 89 |
<div class="url-form">
|
| 90 |
+
<div class="url-input-row">
|
| 91 |
+
<input type="url" id="urlInput" placeholder="https://example.com/article or .pdf">
|
| 92 |
+
<button class="submit-btn" id="urlSubmit"><i class="fa-solid fa-microphone"></i></button>
|
| 93 |
+
</div>
|
| 94 |
</div>
|
| 95 |
</div>
|
| 96 |
|
|
|
|
| 98 |
<div class="text-form">
|
| 99 |
<textarea id="textInput" placeholder="Paste or type your text here..." rows="6"></textarea>
|
| 100 |
<button class="submit-btn" id="textSubmit"><i class="fa-solid fa-microphone"></i> Read Text</button>
|
|
|
|
| 101 |
</div>
|
| 102 |
</div>
|
| 103 |
</div>
|
|
|
|
| 112 |
<div class="processing-progress-bar" id="processingProgressBar"></div>
|
| 113 |
</div>
|
| 114 |
<div class="control-buttons">
|
| 115 |
+
<button class="control-btn play-btn hidden" id="streamPlayBtn" title="Play audio"><i class="fa-solid fa-play"></i></button>
|
| 116 |
<button class="control-btn pause-btn hidden" id="pauseBtn" title="Pause/Resume"><i class="fa-solid fa-pause"></i></button>
|
| 117 |
<button class="control-btn stop-btn" id="stopBtn" title="Stop generation"><i class="fa-solid fa-stop"></i></button>
|
| 118 |
</div>
|
|
|
|
| 132 |
<input type="range" class="progress-slider" id="progressSlider" min="0" max="100" value="0">
|
| 133 |
</div>
|
| 134 |
<span class="time-display" id="timeDisplay">0:00 / 0:00</span>
|
|
|
|
|
|
|
|
|
|
| 135 |
<button class="player-btn download-btn hidden" id="downloadBtn" title="Download Audio">
|
| 136 |
<i class="fa-solid fa-download"></i>
|
| 137 |
</button>
|
| 138 |
+
<button class="player-btn delete-btn hidden" id="deleteBtn" title="Delete Audio">
|
| 139 |
+
<i class="fa-solid fa-trash"></i>
|
| 140 |
+
</button>
|
| 141 |
</div>
|
| 142 |
<audio id="audio" preload="auto"></audio>
|
| 143 |
</div>
|
src/talking_snake/static/styles.css
CHANGED
|
@@ -45,15 +45,25 @@ body {
|
|
| 45 |
}
|
| 46 |
|
| 47 |
h1 {
|
|
|
|
| 48 |
font-size: 1.75rem;
|
| 49 |
-
margin: 0 0 0.
|
| 50 |
color: var(--primary);
|
|
|
|
|
|
|
| 51 |
}
|
| 52 |
|
| 53 |
.subtitle {
|
| 54 |
color: var(--text-muted);
|
| 55 |
-
margin: 0
|
| 56 |
font-size: 0.9rem;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
}
|
| 58 |
|
| 59 |
.container {
|
|
@@ -65,9 +75,9 @@ h1 {
|
|
| 65 |
.options-row {
|
| 66 |
display: flex;
|
| 67 |
justify-content: center;
|
| 68 |
-
gap:
|
| 69 |
-
margin-bottom:
|
| 70 |
-
flex-wrap:
|
| 71 |
}
|
| 72 |
|
| 73 |
/* Style Selector */
|
|
@@ -75,38 +85,46 @@ h1 {
|
|
| 75 |
.language-selector {
|
| 76 |
display: flex;
|
| 77 |
align-items: center;
|
| 78 |
-
gap: 0.
|
| 79 |
-
flex-wrap:
|
| 80 |
}
|
| 81 |
|
| 82 |
.style-label {
|
| 83 |
-
font-size: 0.
|
| 84 |
color: var(--text-muted);
|
| 85 |
}
|
| 86 |
|
| 87 |
.style-buttons {
|
| 88 |
display: flex;
|
| 89 |
-
gap: 0.
|
| 90 |
}
|
| 91 |
|
| 92 |
.style-btn {
|
| 93 |
-
width:
|
| 94 |
-
height:
|
| 95 |
border: 1px solid var(--border);
|
| 96 |
-
border-radius:
|
| 97 |
background: var(--surface);
|
| 98 |
color: var(--text-muted);
|
| 99 |
cursor: pointer;
|
| 100 |
-
font-size: 0.
|
|
|
|
| 101 |
transition: all 0.15s ease;
|
| 102 |
display: flex;
|
| 103 |
align-items: center;
|
| 104 |
justify-content: center;
|
| 105 |
}
|
| 106 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
/* Language buttons use emoji flags */
|
| 108 |
.style-btn.lang-btn {
|
| 109 |
-
font-size:
|
|
|
|
|
|
|
| 110 |
}
|
| 111 |
|
| 112 |
.style-btn:hover {
|
|
@@ -120,6 +138,29 @@ h1 {
|
|
| 120 |
color: var(--primary);
|
| 121 |
}
|
| 122 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
/* Input Section - hidden during processing */
|
| 124 |
.input-section.hidden {
|
| 125 |
display: none;
|
|
@@ -207,6 +248,21 @@ h1 {
|
|
| 207 |
opacity: 0.6;
|
| 208 |
}
|
| 209 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 210 |
/* Status in processing */
|
| 211 |
.processing-section .status {
|
| 212 |
padding: 0;
|
|
@@ -245,12 +301,13 @@ h1 {
|
|
| 245 |
width: 36px;
|
| 246 |
height: 36px;
|
| 247 |
padding: 0;
|
| 248 |
-
color:
|
| 249 |
-
|
|
|
|
| 250 |
border-radius: 8px;
|
| 251 |
cursor: pointer;
|
| 252 |
font-size: 0.9rem;
|
| 253 |
-
transition: all 0.
|
| 254 |
display: flex;
|
| 255 |
align-items: center;
|
| 256 |
justify-content: center;
|
|
@@ -261,27 +318,28 @@ h1 {
|
|
| 261 |
}
|
| 262 |
|
| 263 |
.control-btn:hover {
|
| 264 |
-
|
|
|
|
|
|
|
| 265 |
}
|
| 266 |
|
| 267 |
-
.pause-btn {
|
| 268 |
-
|
| 269 |
-
background-size: 200% 200%;
|
| 270 |
-
animation: gradient-idle 3s ease infinite;
|
| 271 |
}
|
| 272 |
|
| 273 |
-
.
|
| 274 |
-
|
|
|
|
| 275 |
}
|
| 276 |
|
| 277 |
-
.
|
| 278 |
-
background:
|
| 279 |
-
background-size: 200% 200%;
|
| 280 |
-
animation: gradient-idle 3s ease infinite;
|
| 281 |
}
|
| 282 |
|
| 283 |
.stop-btn:hover {
|
| 284 |
-
|
|
|
|
|
|
|
| 285 |
}
|
| 286 |
|
| 287 |
@keyframes gradient-idle {
|
|
@@ -299,7 +357,7 @@ h1 {
|
|
| 299 |
.drop-zone {
|
| 300 |
border: 2px dashed var(--border);
|
| 301 |
border-radius: 8px;
|
| 302 |
-
padding:
|
| 303 |
text-align: center;
|
| 304 |
transition: all 0.2s ease;
|
| 305 |
cursor: pointer;
|
|
@@ -317,11 +375,6 @@ h1 {
|
|
| 317 |
font-size: 0.95rem;
|
| 318 |
}
|
| 319 |
|
| 320 |
-
.drop-zone .hint {
|
| 321 |
-
color: var(--text-muted);
|
| 322 |
-
font-size: 0.8rem;
|
| 323 |
-
}
|
| 324 |
-
|
| 325 |
.drop-icon {
|
| 326 |
font-size: 2.5rem;
|
| 327 |
color: var(--primary);
|
|
@@ -361,37 +414,60 @@ h1 {
|
|
| 361 |
|
| 362 |
.tab-content {
|
| 363 |
display: none;
|
|
|
|
|
|
|
| 364 |
}
|
| 365 |
|
| 366 |
.tab-content.active {
|
| 367 |
display: block;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 368 |
}
|
| 369 |
|
| 370 |
/* URL Form */
|
| 371 |
.url-form {
|
| 372 |
background: var(--surface);
|
| 373 |
border-radius: 8px;
|
| 374 |
-
padding:
|
| 375 |
}
|
| 376 |
|
| 377 |
-
.url-
|
| 378 |
-
|
| 379 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 380 |
background: var(--bg);
|
| 381 |
border: 1px solid var(--border);
|
| 382 |
border-radius: 6px;
|
| 383 |
color: var(--text);
|
| 384 |
font-size: 0.9rem;
|
| 385 |
-
margin-bottom: 0.75rem;
|
| 386 |
transition: border-color 0.15s ease;
|
| 387 |
}
|
| 388 |
|
| 389 |
-
.url-
|
| 390 |
outline: none;
|
| 391 |
border-color: var(--primary);
|
| 392 |
}
|
| 393 |
|
| 394 |
-
.url-
|
| 395 |
color: var(--text-muted);
|
| 396 |
}
|
| 397 |
|
|
@@ -427,70 +503,79 @@ h1 {
|
|
| 427 |
color: var(--text-muted);
|
| 428 |
}
|
| 429 |
|
| 430 |
-
.text-form .hint {
|
| 431 |
-
color: var(--text-muted);
|
| 432 |
-
font-size: 0.8rem;
|
| 433 |
-
text-align: center;
|
| 434 |
-
margin: 0;
|
| 435 |
-
}
|
| 436 |
-
|
| 437 |
/* Buttons */
|
| 438 |
.submit-btn {
|
| 439 |
width: 100%;
|
| 440 |
padding: 0.6rem 1rem;
|
| 441 |
-
background:
|
| 442 |
-
|
| 443 |
-
|
| 444 |
-
color: white;
|
| 445 |
-
border: none;
|
| 446 |
border-radius: 8px;
|
| 447 |
cursor: pointer;
|
| 448 |
font-size: 0.9rem;
|
| 449 |
font-weight: 500;
|
| 450 |
-
transition:
|
| 451 |
margin-bottom: 0.5rem;
|
| 452 |
}
|
| 453 |
|
| 454 |
.submit-btn:hover {
|
| 455 |
-
|
| 456 |
-
|
|
|
|
| 457 |
}
|
| 458 |
|
| 459 |
.submit-btn:disabled {
|
| 460 |
-
opacity: 0.
|
| 461 |
cursor: not-allowed;
|
| 462 |
-
filter: none;
|
| 463 |
-
animation: none;
|
| 464 |
}
|
| 465 |
|
| 466 |
-
|
| 467 |
-
|
| 468 |
-
|
| 469 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 470 |
margin: 0;
|
|
|
|
|
|
|
|
|
|
| 471 |
}
|
| 472 |
|
| 473 |
input[type="file"] {
|
| 474 |
display: none;
|
| 475 |
}
|
| 476 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 477 |
.file-label {
|
| 478 |
display: inline-block;
|
| 479 |
padding: 0.5rem 1rem;
|
| 480 |
-
background:
|
| 481 |
-
|
| 482 |
-
|
| 483 |
-
color: white;
|
| 484 |
border-radius: 8px;
|
| 485 |
cursor: pointer;
|
| 486 |
font-weight: 500;
|
| 487 |
font-size: 0.9rem;
|
| 488 |
-
transition:
|
| 489 |
}
|
| 490 |
|
| 491 |
.file-label:hover {
|
| 492 |
-
|
| 493 |
-
|
|
|
|
| 494 |
}
|
| 495 |
|
| 496 |
/* Device Info - Subtle footer-like display */
|
|
@@ -498,17 +583,19 @@ input[type="file"] {
|
|
| 498 |
display: none;
|
| 499 |
justify-content: center;
|
| 500 |
align-items: center;
|
| 501 |
-
gap:
|
| 502 |
-
padding: 0.
|
| 503 |
font-size: 0.7rem;
|
| 504 |
color: var(--text-muted);
|
| 505 |
-
margin-top: 0.
|
| 506 |
opacity: 0.7;
|
|
|
|
| 507 |
}
|
| 508 |
|
| 509 |
.device-info.visible {
|
| 510 |
display: flex;
|
| 511 |
flex-wrap: wrap;
|
|
|
|
| 512 |
}
|
| 513 |
|
| 514 |
.device-info i {
|
|
@@ -517,7 +604,66 @@ input[type="file"] {
|
|
| 517 |
}
|
| 518 |
|
| 519 |
.device-memory {
|
| 520 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 521 |
}
|
| 522 |
|
| 523 |
.device-batch {
|
|
@@ -562,10 +708,10 @@ input[type="file"] {
|
|
| 562 |
|
| 563 |
/* Audio Player */
|
| 564 |
.player {
|
| 565 |
-
margin-top:
|
| 566 |
width: 100%;
|
| 567 |
display: none;
|
| 568 |
-
padding:
|
| 569 |
background: var(--surface);
|
| 570 |
border-radius: 12px;
|
| 571 |
border: 1px solid var(--border);
|
|
@@ -573,6 +719,35 @@ input[type="file"] {
|
|
| 573 |
|
| 574 |
.player.visible {
|
| 575 |
display: block;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 576 |
}
|
| 577 |
|
| 578 |
/* Hidden audio element */
|
|
@@ -590,31 +765,32 @@ input[type="file"] {
|
|
| 590 |
.player-btn {
|
| 591 |
width: 36px;
|
| 592 |
height: 36px;
|
| 593 |
-
border:
|
| 594 |
border-radius: 8px;
|
| 595 |
-
background:
|
| 596 |
-
|
| 597 |
-
animation: gradient-idle 3s ease infinite;
|
| 598 |
-
color: white;
|
| 599 |
cursor: pointer;
|
| 600 |
display: flex;
|
| 601 |
align-items: center;
|
| 602 |
justify-content: center;
|
| 603 |
font-size: 0.85rem;
|
| 604 |
-
transition:
|
| 605 |
flex-shrink: 0;
|
| 606 |
}
|
| 607 |
|
| 608 |
.player-btn:hover {
|
| 609 |
-
|
| 610 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 611 |
}
|
| 612 |
|
| 613 |
.player-btn.volume-btn {
|
| 614 |
-
background: linear-gradient(135deg, var(--bg), #252540, var(--bg));
|
| 615 |
-
background-size: 200% 200%;
|
| 616 |
-
animation: gradient-idle 3s ease infinite;
|
| 617 |
-
color: var(--text-muted);
|
| 618 |
width: 32px;
|
| 619 |
height: 32px;
|
| 620 |
font-size: 0.8rem;
|
|
@@ -622,14 +798,9 @@ input[type="file"] {
|
|
| 622 |
|
| 623 |
.player-btn.volume-btn:hover {
|
| 624 |
color: var(--text);
|
| 625 |
-
animation: gradient-shift 0.8s ease infinite;
|
| 626 |
}
|
| 627 |
|
| 628 |
.player-btn.download-btn {
|
| 629 |
-
background: linear-gradient(135deg, var(--bg), #252540, var(--bg));
|
| 630 |
-
background-size: 200% 200%;
|
| 631 |
-
animation: gradient-idle 3s ease infinite;
|
| 632 |
-
color: var(--text-muted);
|
| 633 |
width: 32px;
|
| 634 |
height: 32px;
|
| 635 |
font-size: 0.8rem;
|
|
@@ -637,7 +808,18 @@ input[type="file"] {
|
|
| 637 |
|
| 638 |
.player-btn.download-btn:hover {
|
| 639 |
color: var(--primary);
|
| 640 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 641 |
}
|
| 642 |
|
| 643 |
.progress-container {
|
|
@@ -697,11 +879,24 @@ input[type="file"] {
|
|
| 697 |
}
|
| 698 |
|
| 699 |
.filename {
|
| 700 |
-
margin-bottom: 0.
|
| 701 |
font-size: 0.85rem;
|
| 702 |
font-weight: 500;
|
| 703 |
color: var(--text);
|
| 704 |
word-break: break-all;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 705 |
}
|
| 706 |
|
| 707 |
/* Spinner Animation */
|
|
@@ -714,7 +909,7 @@ input[type="file"] {
|
|
| 714 |
border-radius: 50%;
|
| 715 |
animation: spin 1s linear infinite;
|
| 716 |
margin-right: 0.4rem;
|
| 717 |
-
vertical-align:
|
| 718 |
}
|
| 719 |
|
| 720 |
@keyframes spin {
|
|
|
|
| 45 |
}
|
| 46 |
|
| 47 |
h1 {
|
| 48 |
+
font-family: Fredoka, sans-serif;
|
| 49 |
font-size: 1.75rem;
|
| 50 |
+
margin: 0 0 0.5rem;
|
| 51 |
color: var(--primary);
|
| 52 |
+
display: inline;
|
| 53 |
+
vertical-align: baseline;
|
| 54 |
}
|
| 55 |
|
| 56 |
.subtitle {
|
| 57 |
color: var(--text-muted);
|
| 58 |
+
margin: 0;
|
| 59 |
font-size: 0.9rem;
|
| 60 |
+
display: inline;
|
| 61 |
+
vertical-align: baseline;
|
| 62 |
+
}
|
| 63 |
+
|
| 64 |
+
.header-row {
|
| 65 |
+
margin-bottom: 0.5rem;
|
| 66 |
+
text-align: center;
|
| 67 |
}
|
| 68 |
|
| 69 |
.container {
|
|
|
|
| 75 |
.options-row {
|
| 76 |
display: flex;
|
| 77 |
justify-content: center;
|
| 78 |
+
gap: 0.75rem;
|
| 79 |
+
margin-bottom: 0.75rem;
|
| 80 |
+
flex-wrap: nowrap;
|
| 81 |
}
|
| 82 |
|
| 83 |
/* Style Selector */
|
|
|
|
| 85 |
.language-selector {
|
| 86 |
display: flex;
|
| 87 |
align-items: center;
|
| 88 |
+
gap: 0.4rem;
|
| 89 |
+
flex-wrap: nowrap;
|
| 90 |
}
|
| 91 |
|
| 92 |
.style-label {
|
| 93 |
+
font-size: 0.75rem;
|
| 94 |
color: var(--text-muted);
|
| 95 |
}
|
| 96 |
|
| 97 |
.style-buttons {
|
| 98 |
display: flex;
|
| 99 |
+
gap: 0.25rem;
|
| 100 |
}
|
| 101 |
|
| 102 |
.style-btn {
|
| 103 |
+
width: 28px;
|
| 104 |
+
height: 28px;
|
| 105 |
border: 1px solid var(--border);
|
| 106 |
+
border-radius: 5px;
|
| 107 |
background: var(--surface);
|
| 108 |
color: var(--text-muted);
|
| 109 |
cursor: pointer;
|
| 110 |
+
font-size: 0.75rem;
|
| 111 |
+
line-height: 1;
|
| 112 |
transition: all 0.15s ease;
|
| 113 |
display: flex;
|
| 114 |
align-items: center;
|
| 115 |
justify-content: center;
|
| 116 |
}
|
| 117 |
|
| 118 |
+
.style-btn i {
|
| 119 |
+
display: block;
|
| 120 |
+
line-height: 1;
|
| 121 |
+
}
|
| 122 |
+
|
| 123 |
/* Language buttons use emoji flags */
|
| 124 |
.style-btn.lang-btn {
|
| 125 |
+
font-size: 1rem;
|
| 126 |
+
line-height: 1;
|
| 127 |
+
padding: 0;
|
| 128 |
}
|
| 129 |
|
| 130 |
.style-btn:hover {
|
|
|
|
| 138 |
color: var(--primary);
|
| 139 |
}
|
| 140 |
|
| 141 |
+
/* Auto-detected language indicator */
|
| 142 |
+
.style-btn.lang-btn.auto-detected {
|
| 143 |
+
animation: auto-detect-pulse 0.5s ease-out;
|
| 144 |
+
box-shadow: 0 0 0 2px var(--primary);
|
| 145 |
+
}
|
| 146 |
+
|
| 147 |
+
@keyframes auto-detect-pulse {
|
| 148 |
+
0% {
|
| 149 |
+
transform: scale(1);
|
| 150 |
+
box-shadow: 0 0 0 0 rgba(212, 118, 58, 0.7);
|
| 151 |
+
}
|
| 152 |
+
|
| 153 |
+
50% {
|
| 154 |
+
transform: scale(1.1);
|
| 155 |
+
box-shadow: 0 0 0 4px rgba(212, 118, 58, 0.4);
|
| 156 |
+
}
|
| 157 |
+
|
| 158 |
+
100% {
|
| 159 |
+
transform: scale(1);
|
| 160 |
+
box-shadow: 0 0 0 2px var(--primary);
|
| 161 |
+
}
|
| 162 |
+
}
|
| 163 |
+
|
| 164 |
/* Input Section - hidden during processing */
|
| 165 |
.input-section.hidden {
|
| 166 |
display: none;
|
|
|
|
| 248 |
opacity: 0.6;
|
| 249 |
}
|
| 250 |
|
| 251 |
+
.doc-info .doc-style,
|
| 252 |
+
.doc-info .doc-lang {
|
| 253 |
+
color: var(--text-muted);
|
| 254 |
+
font-size: 0.75rem;
|
| 255 |
+
display: flex;
|
| 256 |
+
align-items: center;
|
| 257 |
+
white-space: nowrap;
|
| 258 |
+
flex-shrink: 0;
|
| 259 |
+
opacity: 0.7;
|
| 260 |
+
}
|
| 261 |
+
|
| 262 |
+
.doc-info .doc-style i {
|
| 263 |
+
font-size: 0.7rem;
|
| 264 |
+
}
|
| 265 |
+
|
| 266 |
/* Status in processing */
|
| 267 |
.processing-section .status {
|
| 268 |
padding: 0;
|
|
|
|
| 301 |
width: 36px;
|
| 302 |
height: 36px;
|
| 303 |
padding: 0;
|
| 304 |
+
color: var(--text-muted);
|
| 305 |
+
background: var(--surface);
|
| 306 |
+
border: 1px solid var(--border);
|
| 307 |
border-radius: 8px;
|
| 308 |
cursor: pointer;
|
| 309 |
font-size: 0.9rem;
|
| 310 |
+
transition: all 0.2s ease;
|
| 311 |
display: flex;
|
| 312 |
align-items: center;
|
| 313 |
justify-content: center;
|
|
|
|
| 318 |
}
|
| 319 |
|
| 320 |
.control-btn:hover {
|
| 321 |
+
color: var(--primary);
|
| 322 |
+
border-color: var(--primary);
|
| 323 |
+
background: rgb(212, 118, 58, 0.08);
|
| 324 |
}
|
| 325 |
|
| 326 |
+
.pause-btn:hover {
|
| 327 |
+
color: var(--primary);
|
|
|
|
|
|
|
| 328 |
}
|
| 329 |
|
| 330 |
+
.control-btn.play-btn {
|
| 331 |
+
color: var(--success);
|
| 332 |
+
border-color: var(--success);
|
| 333 |
}
|
| 334 |
|
| 335 |
+
.control-btn.play-btn:hover {
|
| 336 |
+
background: rgba(116, 184, 22, 0.15);
|
|
|
|
|
|
|
| 337 |
}
|
| 338 |
|
| 339 |
.stop-btn:hover {
|
| 340 |
+
color: var(--error);
|
| 341 |
+
border-color: var(--error);
|
| 342 |
+
background: rgb(196, 90, 74, 0.08);
|
| 343 |
}
|
| 344 |
|
| 345 |
@keyframes gradient-idle {
|
|
|
|
| 357 |
.drop-zone {
|
| 358 |
border: 2px dashed var(--border);
|
| 359 |
border-radius: 8px;
|
| 360 |
+
padding: 1rem 0;
|
| 361 |
text-align: center;
|
| 362 |
transition: all 0.2s ease;
|
| 363 |
cursor: pointer;
|
|
|
|
| 375 |
font-size: 0.95rem;
|
| 376 |
}
|
| 377 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 378 |
.drop-icon {
|
| 379 |
font-size: 2.5rem;
|
| 380 |
color: var(--primary);
|
|
|
|
| 414 |
|
| 415 |
.tab-content {
|
| 416 |
display: none;
|
| 417 |
+
opacity: 0;
|
| 418 |
+
transform: translateY(-8px);
|
| 419 |
}
|
| 420 |
|
| 421 |
.tab-content.active {
|
| 422 |
display: block;
|
| 423 |
+
opacity: 1;
|
| 424 |
+
transform: translateY(0);
|
| 425 |
+
animation: tab-fade-in 0.2s ease-out;
|
| 426 |
+
}
|
| 427 |
+
|
| 428 |
+
@keyframes tab-fade-in {
|
| 429 |
+
from {
|
| 430 |
+
opacity: 0;
|
| 431 |
+
transform: translateY(-8px);
|
| 432 |
+
}
|
| 433 |
+
|
| 434 |
+
to {
|
| 435 |
+
opacity: 1;
|
| 436 |
+
transform: translateY(0);
|
| 437 |
+
}
|
| 438 |
}
|
| 439 |
|
| 440 |
/* URL Form */
|
| 441 |
.url-form {
|
| 442 |
background: var(--surface);
|
| 443 |
border-radius: 8px;
|
| 444 |
+
padding: 0.75rem;
|
| 445 |
}
|
| 446 |
|
| 447 |
+
.url-input-row {
|
| 448 |
+
display: flex;
|
| 449 |
+
gap: 0.5rem;
|
| 450 |
+
align-items: center;
|
| 451 |
+
}
|
| 452 |
+
|
| 453 |
+
.url-input-row input[type="url"] {
|
| 454 |
+
flex: 1;
|
| 455 |
+
height: 40px;
|
| 456 |
+
padding: 0 0.75rem;
|
| 457 |
background: var(--bg);
|
| 458 |
border: 1px solid var(--border);
|
| 459 |
border-radius: 6px;
|
| 460 |
color: var(--text);
|
| 461 |
font-size: 0.9rem;
|
|
|
|
| 462 |
transition: border-color 0.15s ease;
|
| 463 |
}
|
| 464 |
|
| 465 |
+
.url-input-row input[type="url"]:focus {
|
| 466 |
outline: none;
|
| 467 |
border-color: var(--primary);
|
| 468 |
}
|
| 469 |
|
| 470 |
+
.url-input-row input[type="url"]::placeholder {
|
| 471 |
color: var(--text-muted);
|
| 472 |
}
|
| 473 |
|
|
|
|
| 503 |
color: var(--text-muted);
|
| 504 |
}
|
| 505 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 506 |
/* Buttons */
|
| 507 |
.submit-btn {
|
| 508 |
width: 100%;
|
| 509 |
padding: 0.6rem 1rem;
|
| 510 |
+
background: var(--surface);
|
| 511 |
+
color: var(--text);
|
| 512 |
+
border: 1px solid var(--border);
|
|
|
|
|
|
|
| 513 |
border-radius: 8px;
|
| 514 |
cursor: pointer;
|
| 515 |
font-size: 0.9rem;
|
| 516 |
font-weight: 500;
|
| 517 |
+
transition: all 0.2s ease;
|
| 518 |
margin-bottom: 0.5rem;
|
| 519 |
}
|
| 520 |
|
| 521 |
.submit-btn:hover {
|
| 522 |
+
color: var(--primary);
|
| 523 |
+
border-color: var(--primary);
|
| 524 |
+
background: rgb(212, 118, 58, 0.08);
|
| 525 |
}
|
| 526 |
|
| 527 |
.submit-btn:disabled {
|
| 528 |
+
opacity: 0.5;
|
| 529 |
cursor: not-allowed;
|
|
|
|
|
|
|
| 530 |
}
|
| 531 |
|
| 532 |
+
/* URL form button override - must come after base .submit-btn */
|
| 533 |
+
.url-input-row .submit-btn {
|
| 534 |
+
width: 40px;
|
| 535 |
+
height: 40px;
|
| 536 |
+
min-width: 40px;
|
| 537 |
+
min-height: 40px;
|
| 538 |
+
padding: 0;
|
| 539 |
+
margin: 0;
|
| 540 |
+
flex-shrink: 0;
|
| 541 |
+
display: flex;
|
| 542 |
+
align-items: center;
|
| 543 |
+
justify-content: center;
|
| 544 |
+
border-radius: 6px;
|
| 545 |
+
}
|
| 546 |
+
|
| 547 |
+
.url-input-row .submit-btn i {
|
| 548 |
margin: 0;
|
| 549 |
+
padding: 0;
|
| 550 |
+
line-height: 1;
|
| 551 |
+
display: block;
|
| 552 |
}
|
| 553 |
|
| 554 |
input[type="file"] {
|
| 555 |
display: none;
|
| 556 |
}
|
| 557 |
|
| 558 |
+
.hidden-file-input {
|
| 559 |
+
display: none !important;
|
| 560 |
+
}
|
| 561 |
+
|
| 562 |
.file-label {
|
| 563 |
display: inline-block;
|
| 564 |
padding: 0.5rem 1rem;
|
| 565 |
+
background: var(--surface);
|
| 566 |
+
color: var(--text);
|
| 567 |
+
border: 1px solid var(--border);
|
|
|
|
| 568 |
border-radius: 8px;
|
| 569 |
cursor: pointer;
|
| 570 |
font-weight: 500;
|
| 571 |
font-size: 0.9rem;
|
| 572 |
+
transition: all 0.2s ease;
|
| 573 |
}
|
| 574 |
|
| 575 |
.file-label:hover {
|
| 576 |
+
color: var(--primary);
|
| 577 |
+
border-color: var(--primary);
|
| 578 |
+
background: rgb(212, 118, 58, 0.08);
|
| 579 |
}
|
| 580 |
|
| 581 |
/* Device Info - Subtle footer-like display */
|
|
|
|
| 583 |
display: none;
|
| 584 |
justify-content: center;
|
| 585 |
align-items: center;
|
| 586 |
+
gap: 0.6rem;
|
| 587 |
+
padding: 0.4rem 1rem;
|
| 588 |
font-size: 0.7rem;
|
| 589 |
color: var(--text-muted);
|
| 590 |
+
margin-top: 0.25rem;
|
| 591 |
opacity: 0.7;
|
| 592 |
+
line-height: 1.2;
|
| 593 |
}
|
| 594 |
|
| 595 |
.device-info.visible {
|
| 596 |
display: flex;
|
| 597 |
flex-wrap: wrap;
|
| 598 |
+
row-gap: 0.2rem;
|
| 599 |
}
|
| 600 |
|
| 601 |
.device-info i {
|
|
|
|
| 604 |
}
|
| 605 |
|
| 606 |
.device-memory {
|
| 607 |
+
display: flex;
|
| 608 |
+
align-items: center;
|
| 609 |
+
gap: 0.25rem;
|
| 610 |
+
}
|
| 611 |
+
|
| 612 |
+
.device-memory i {
|
| 613 |
+
font-size: 0.6rem;
|
| 614 |
+
}
|
| 615 |
+
|
| 616 |
+
.device-ephemeral {
|
| 617 |
+
display: flex;
|
| 618 |
+
align-items: center;
|
| 619 |
+
gap: 0.25rem;
|
| 620 |
+
color: var(--success);
|
| 621 |
+
}
|
| 622 |
+
|
| 623 |
+
.device-ephemeral i {
|
| 624 |
+
color: var(--success);
|
| 625 |
+
font-size: 0.6rem;
|
| 626 |
+
}
|
| 627 |
+
|
| 628 |
+
.device-timing {
|
| 629 |
+
display: flex;
|
| 630 |
+
align-items: center;
|
| 631 |
+
gap: 0.25rem;
|
| 632 |
+
color: var(--text-muted);
|
| 633 |
+
}
|
| 634 |
+
|
| 635 |
+
.device-timing i {
|
| 636 |
+
font-size: 0.6rem;
|
| 637 |
+
}
|
| 638 |
+
|
| 639 |
+
/* Model state indicators */
|
| 640 |
+
.model-state {
|
| 641 |
+
display: flex;
|
| 642 |
+
align-items: center;
|
| 643 |
+
gap: 0.25rem;
|
| 644 |
+
padding: 0.15rem 0.4rem;
|
| 645 |
+
border-radius: 4px;
|
| 646 |
+
font-size: 0.65rem;
|
| 647 |
+
}
|
| 648 |
+
|
| 649 |
+
.model-state i {
|
| 650 |
+
font-size: 0.55rem;
|
| 651 |
+
}
|
| 652 |
+
|
| 653 |
+
.model-loaded {
|
| 654 |
+
background: rgb(16, 185, 129, 0.15);
|
| 655 |
+
color: var(--success);
|
| 656 |
+
}
|
| 657 |
+
|
| 658 |
+
.model-loading,
|
| 659 |
+
.model-unloading {
|
| 660 |
+
background: rgb(245, 158, 11, 0.15);
|
| 661 |
+
color: #f59e0b;
|
| 662 |
+
}
|
| 663 |
+
|
| 664 |
+
.model-unloaded {
|
| 665 |
+
background: rgb(107, 114, 128, 0.15);
|
| 666 |
+
color: var(--text-muted);
|
| 667 |
}
|
| 668 |
|
| 669 |
.device-batch {
|
|
|
|
| 708 |
|
| 709 |
/* Audio Player */
|
| 710 |
.player {
|
| 711 |
+
margin-top: 1rem;
|
| 712 |
width: 100%;
|
| 713 |
display: none;
|
| 714 |
+
padding: 0.75rem 1rem;
|
| 715 |
background: var(--surface);
|
| 716 |
border-radius: 12px;
|
| 717 |
border: 1px solid var(--border);
|
|
|
|
| 719 |
|
| 720 |
.player.visible {
|
| 721 |
display: block;
|
| 722 |
+
animation: slide-in 0.3s ease-out;
|
| 723 |
+
}
|
| 724 |
+
|
| 725 |
+
.player.deleting {
|
| 726 |
+
animation: slide-out 0.3s ease-out forwards;
|
| 727 |
+
}
|
| 728 |
+
|
| 729 |
+
@keyframes slide-in {
|
| 730 |
+
from {
|
| 731 |
+
opacity: 0;
|
| 732 |
+
transform: translateY(-10px);
|
| 733 |
+
}
|
| 734 |
+
|
| 735 |
+
to {
|
| 736 |
+
opacity: 1;
|
| 737 |
+
transform: translateY(0);
|
| 738 |
+
}
|
| 739 |
+
}
|
| 740 |
+
|
| 741 |
+
@keyframes slide-out {
|
| 742 |
+
from {
|
| 743 |
+
opacity: 1;
|
| 744 |
+
transform: translateY(0);
|
| 745 |
+
}
|
| 746 |
+
|
| 747 |
+
to {
|
| 748 |
+
opacity: 0;
|
| 749 |
+
transform: translateY(-10px);
|
| 750 |
+
}
|
| 751 |
}
|
| 752 |
|
| 753 |
/* Hidden audio element */
|
|
|
|
| 765 |
.player-btn {
|
| 766 |
width: 36px;
|
| 767 |
height: 36px;
|
| 768 |
+
border: 1px solid var(--border);
|
| 769 |
border-radius: 8px;
|
| 770 |
+
background: var(--surface);
|
| 771 |
+
color: var(--text-muted);
|
|
|
|
|
|
|
| 772 |
cursor: pointer;
|
| 773 |
display: flex;
|
| 774 |
align-items: center;
|
| 775 |
justify-content: center;
|
| 776 |
font-size: 0.85rem;
|
| 777 |
+
transition: all 0.2s ease;
|
| 778 |
flex-shrink: 0;
|
| 779 |
}
|
| 780 |
|
| 781 |
.player-btn:hover {
|
| 782 |
+
color: var(--primary);
|
| 783 |
+
border-color: var(--primary);
|
| 784 |
+
background: rgb(212, 118, 58, 0.08);
|
| 785 |
+
}
|
| 786 |
+
|
| 787 |
+
.player-btn.play-btn {
|
| 788 |
+
width: 40px;
|
| 789 |
+
height: 40px;
|
| 790 |
+
font-size: 0.9rem;
|
| 791 |
}
|
| 792 |
|
| 793 |
.player-btn.volume-btn {
|
|
|
|
|
|
|
|
|
|
|
|
|
| 794 |
width: 32px;
|
| 795 |
height: 32px;
|
| 796 |
font-size: 0.8rem;
|
|
|
|
| 798 |
|
| 799 |
.player-btn.volume-btn:hover {
|
| 800 |
color: var(--text);
|
|
|
|
| 801 |
}
|
| 802 |
|
| 803 |
.player-btn.download-btn {
|
|
|
|
|
|
|
|
|
|
|
|
|
| 804 |
width: 32px;
|
| 805 |
height: 32px;
|
| 806 |
font-size: 0.8rem;
|
|
|
|
| 808 |
|
| 809 |
.player-btn.download-btn:hover {
|
| 810 |
color: var(--primary);
|
| 811 |
+
}
|
| 812 |
+
|
| 813 |
+
.player-btn.delete-btn {
|
| 814 |
+
width: 32px;
|
| 815 |
+
height: 32px;
|
| 816 |
+
font-size: 0.8rem;
|
| 817 |
+
}
|
| 818 |
+
|
| 819 |
+
.player-btn.delete-btn:hover {
|
| 820 |
+
color: var(--error);
|
| 821 |
+
border-color: var(--error);
|
| 822 |
+
background: rgb(196, 90, 74, 0.08);
|
| 823 |
}
|
| 824 |
|
| 825 |
.progress-container {
|
|
|
|
| 879 |
}
|
| 880 |
|
| 881 |
.filename {
|
| 882 |
+
margin-bottom: 0.5rem;
|
| 883 |
font-size: 0.85rem;
|
| 884 |
font-weight: 500;
|
| 885 |
color: var(--text);
|
| 886 |
word-break: break-all;
|
| 887 |
+
display: flex;
|
| 888 |
+
align-items: center;
|
| 889 |
+
gap: 0.4rem;
|
| 890 |
+
flex-wrap: wrap;
|
| 891 |
+
}
|
| 892 |
+
|
| 893 |
+
.filename-meta {
|
| 894 |
+
display: inline-flex;
|
| 895 |
+
align-items: center;
|
| 896 |
+
gap: 0.35rem;
|
| 897 |
+
font-size: 0.8rem;
|
| 898 |
+
color: var(--text-muted);
|
| 899 |
+
margin-left: auto;
|
| 900 |
}
|
| 901 |
|
| 902 |
/* Spinner Animation */
|
|
|
|
| 909 |
border-radius: 50%;
|
| 910 |
animation: spin 1s linear infinite;
|
| 911 |
margin-right: 0.4rem;
|
| 912 |
+
vertical-align: -2px;
|
| 913 |
}
|
| 914 |
|
| 915 |
@keyframes spin {
|
src/talking_snake/tts.py
CHANGED
|
@@ -8,6 +8,7 @@ import time
|
|
| 8 |
import wave
|
| 9 |
from abc import ABC, abstractmethod
|
| 10 |
from collections.abc import Iterator
|
|
|
|
| 11 |
from typing import TYPE_CHECKING
|
| 12 |
|
| 13 |
if TYPE_CHECKING:
|
|
@@ -42,18 +43,146 @@ class TTSEngineProtocol(ABC):
|
|
| 42 |
return 1
|
| 43 |
|
| 44 |
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
)
|
| 56 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
# Language to default voice mapping
|
| 58 |
LANGUAGE_VOICES: dict[str, str] = {
|
| 59 |
"english": "Ryan",
|
|
@@ -65,8 +194,8 @@ LANGUAGE_VOICES: dict[str, str] = {
|
|
| 65 |
# Default chunk size for streaming
|
| 66 |
# Larger chunks = more stable voice, fewer artifacts at boundaries
|
| 67 |
# Smaller chunks = faster first audio but potential voice instability
|
| 68 |
-
#
|
| 69 |
-
DEFAULT_CHUNK_SIZE =
|
| 70 |
|
| 71 |
# Idle timeout before unloading model from GPU (seconds)
|
| 72 |
# Set to 0 to disable auto-unloading
|
|
@@ -140,9 +269,19 @@ class QwenTTSEngine(TTSEngineProtocol):
|
|
| 140 |
self._idle_timeout = idle_timeout
|
| 141 |
self._last_activity = time.time()
|
| 142 |
self._model_loaded = False
|
|
|
|
| 143 |
self._lock = threading.Lock()
|
| 144 |
self._unload_timer: threading.Timer | None = None
|
| 145 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 146 |
# Model will be loaded on first request (lazy loading)
|
| 147 |
self.model = None
|
| 148 |
|
|
@@ -150,6 +289,67 @@ class QwenTTSEngine(TTSEngineProtocol):
|
|
| 150 |
if idle_timeout == 0:
|
| 151 |
self._load_model()
|
| 152 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
def _load_model(self) -> None:
|
| 154 |
"""Load the model onto GPU or CPU."""
|
| 155 |
if self._model_loaded:
|
|
@@ -158,6 +358,7 @@ class QwenTTSEngine(TTSEngineProtocol):
|
|
| 158 |
import torch
|
| 159 |
from qwen_tts import Qwen3TTSModel
|
| 160 |
|
|
|
|
| 161 |
device_name = "GPU" if self.device == "cuda" else "CPU"
|
| 162 |
print(f"🔄 Loading TTS model onto {device_name}...")
|
| 163 |
start = time.time()
|
|
@@ -186,6 +387,7 @@ class QwenTTSEngine(TTSEngineProtocol):
|
|
| 186 |
)
|
| 187 |
|
| 188 |
self._model_loaded = True
|
|
|
|
| 189 |
|
| 190 |
# Calculate optimal batch size based on available VRAM
|
| 191 |
if self.device == "cuda":
|
|
@@ -205,6 +407,7 @@ class QwenTTSEngine(TTSEngineProtocol):
|
|
| 205 |
|
| 206 |
import torch
|
| 207 |
|
|
|
|
| 208 |
print("💤 Unloading TTS model from GPU (idle timeout)...")
|
| 209 |
|
| 210 |
# Delete model and clear references
|
|
@@ -218,6 +421,7 @@ class QwenTTSEngine(TTSEngineProtocol):
|
|
| 218 |
torch.cuda.empty_cache()
|
| 219 |
torch.cuda.synchronize()
|
| 220 |
|
|
|
|
| 221 |
print("✅ GPU memory freed")
|
| 222 |
|
| 223 |
def _schedule_unload(self) -> None:
|
|
@@ -307,6 +511,10 @@ class QwenTTSEngine(TTSEngineProtocol):
|
|
| 307 |
# Type guard - model is guaranteed to be loaded after _ensure_model_loaded
|
| 308 |
assert self.model is not None, "Model failed to load"
|
| 309 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 310 |
try:
|
| 311 |
# Split text into chunks for streaming
|
| 312 |
chunks = self._split_text(text)
|
|
@@ -326,10 +534,9 @@ class QwenTTSEngine(TTSEngineProtocol):
|
|
| 326 |
continue
|
| 327 |
|
| 328 |
# Always use batched call for consistent GPU memory allocation
|
| 329 |
-
# Use
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
)
|
| 333 |
audios, sr = self.model.generate_custom_voice(
|
| 334 |
text=batch if len(batch) > 1 else batch[0],
|
| 335 |
speaker=[self.voice] * len(batch) if len(batch) > 1 else self.voice,
|
|
@@ -349,6 +556,9 @@ class QwenTTSEngine(TTSEngineProtocol):
|
|
| 349 |
first_chunk = False
|
| 350 |
yield wav_bytes
|
| 351 |
finally:
|
|
|
|
|
|
|
|
|
|
| 352 |
# Schedule model unload after idle timeout
|
| 353 |
self._schedule_unload()
|
| 354 |
|
|
|
|
| 8 |
import wave
|
| 9 |
from abc import ABC, abstractmethod
|
| 10 |
from collections.abc import Iterator
|
| 11 |
+
from dataclasses import dataclass
|
| 12 |
from typing import TYPE_CHECKING
|
| 13 |
|
| 14 |
if TYPE_CHECKING:
|
|
|
|
| 43 |
return 1
|
| 44 |
|
| 45 |
|
| 46 |
+
@dataclass
|
| 47 |
+
class TTSStyle:
|
| 48 |
+
"""Defines a TTS speaking style with its configuration."""
|
| 49 |
+
|
| 50 |
+
id: str # Unique identifier (e.g., "technical", "narrative")
|
| 51 |
+
name: str # Display name (e.g., "Technical Documentation")
|
| 52 |
+
icon: str # Font Awesome icon class (e.g., "fa-gear")
|
| 53 |
+
description: str # Short description for tooltips
|
| 54 |
+
prompt: str # The instruct prompt for the TTS model
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
# === TTS STYLES ===
|
| 58 |
+
# Each style provides a different speaking approach optimized for specific content types
|
| 59 |
+
|
| 60 |
+
STYLE_TECHNICAL = TTSStyle(
|
| 61 |
+
id="technical",
|
| 62 |
+
name="Technical",
|
| 63 |
+
icon="fa-microchip",
|
| 64 |
+
description="Clear, precise reading for code and technical documentation",
|
| 65 |
+
prompt=(
|
| 66 |
+
"You are a technical speech engine reading engineering documents. "
|
| 67 |
+
"Your task is to convert text into clear, accurate spoken output. "
|
| 68 |
+
"Read in a neutral, controlled, professional voice. "
|
| 69 |
+
"Do not sound expressive, emotional, or conversational. "
|
| 70 |
+
"Do not use audiobook, storytelling, or presenter intonation. "
|
| 71 |
+
"Prioritize intelligibility and correctness over naturalness. "
|
| 72 |
+
"Maintain steady pacing and flat prosody appropriate for scientific material. "
|
| 73 |
+
"Pronounce all acronyms as individual letters unless they are standard spoken words. "
|
| 74 |
+
"Pronounce symbols, operators, and punctuation when they affect meaning. "
|
| 75 |
+
"Preserve capitalization, parentheses, and formatting as part of the spoken output. "
|
| 76 |
+
"When reading code, equations, or identifiers, slow down and speak every token clearly. "
|
| 77 |
+
"Insert short pauses at commas and longer pauses at periods and line breaks. "
|
| 78 |
+
"Do not summarize, interpret, or rephrase. "
|
| 79 |
+
"Read exactly what is written."
|
| 80 |
+
),
|
| 81 |
)
|
| 82 |
|
| 83 |
+
STYLE_NARRATIVE = TTSStyle(
|
| 84 |
+
id="narrative",
|
| 85 |
+
name="Narrative",
|
| 86 |
+
icon="fa-book-open",
|
| 87 |
+
description="Natural, engaging reading for articles and stories",
|
| 88 |
+
prompt=(
|
| 89 |
+
"You are a professional narrative voice reading long-form text. "
|
| 90 |
+
"Your task is to tell a story in a clear, engaging, and natural way. "
|
| 91 |
+
"Use a warm, expressive, and fluid voice. "
|
| 92 |
+
"Vary intonation and rhythm to reflect meaning, emotion, and emphasis. "
|
| 93 |
+
"Sound human and immersive, not robotic or monotone. "
|
| 94 |
+
"Maintain smooth pacing, slowing for important moments, speeding up for transitions. "
|
| 95 |
+
"Use natural pauses at punctuation and paragraph breaks. "
|
| 96 |
+
"Pronounce all words clearly, but do not over-articulate symbols or formatting. "
|
| 97 |
+
"Read acronyms as spoken words when they are commonly pronounced that way. "
|
| 98 |
+
"Preserve the narrative flow and emotional tone of the text. "
|
| 99 |
+
"Do not flatten or neutralize the delivery."
|
| 100 |
+
),
|
| 101 |
+
)
|
| 102 |
+
|
| 103 |
+
STYLE_CHILD_NARRATIVE = TTSStyle(
|
| 104 |
+
id="child_narrative",
|
| 105 |
+
name="Child Narrative",
|
| 106 |
+
icon="fa-child",
|
| 107 |
+
description="Playful, expressive reading for children's stories",
|
| 108 |
+
prompt=(
|
| 109 |
+
"You are a storyteller reading aloud to young children. "
|
| 110 |
+
"Your task is to tell a story in a friendly, gentle, and engaging way. "
|
| 111 |
+
"Use a warm, soft, and expressive voice. "
|
| 112 |
+
"Sound kind, calm, and reassuring. "
|
| 113 |
+
"Vary intonation to match emotions and actions in the story. "
|
| 114 |
+
"Maintain a slow to moderate pace with clear articulation. "
|
| 115 |
+
"Insert natural pauses so children can follow along. "
|
| 116 |
+
"Pronounce words simply and clearly. "
|
| 117 |
+
"Read acronyms and difficult words in their most familiar spoken form. "
|
| 118 |
+
"Keep the tone playful but soothing. "
|
| 119 |
+
"Do not sound technical, formal, or adult-oriented."
|
| 120 |
+
),
|
| 121 |
+
)
|
| 122 |
+
|
| 123 |
+
STYLE_NEWS = TTSStyle(
|
| 124 |
+
id="news",
|
| 125 |
+
name="News",
|
| 126 |
+
icon="fa-newspaper",
|
| 127 |
+
description="Authoritative, clear delivery for news and reports",
|
| 128 |
+
prompt=(
|
| 129 |
+
"You are a professional news anchor delivering broadcast news. "
|
| 130 |
+
"Your task is to read information clearly, confidently, and with authority. "
|
| 131 |
+
"Use a neutral, composed, and trustworthy voice. "
|
| 132 |
+
"Avoid emotional or dramatic delivery. "
|
| 133 |
+
"Do not sound conversational or casual. "
|
| 134 |
+
"Maintain a steady, moderate pace with crisp articulation. "
|
| 135 |
+
"Use controlled intonation to mark headlines, key facts, and transitions. "
|
| 136 |
+
"Pronounce names, numbers, acronyms, and places carefully and accurately. "
|
| 137 |
+
"Pause briefly at commas and longer at periods and topic changes. "
|
| 138 |
+
"Sound factual, objective, and broadcast-ready at all times."
|
| 139 |
+
),
|
| 140 |
+
)
|
| 141 |
+
|
| 142 |
+
STYLE_ACADEMIC = TTSStyle(
|
| 143 |
+
id="academic",
|
| 144 |
+
name="Academic",
|
| 145 |
+
icon="fa-graduation-cap",
|
| 146 |
+
description="Measured, scholarly reading for papers and research",
|
| 147 |
+
prompt=(
|
| 148 |
+
"You are an academic speech engine reading peer-reviewed scientific papers. "
|
| 149 |
+
"Your task is to render complex scholarly text into clear, precise spoken language. "
|
| 150 |
+
"Use a neutral, formal, and controlled voice. "
|
| 151 |
+
"Do not sound expressive, emotional, or conversational. "
|
| 152 |
+
"Do not use audiobook or presenter intonation. "
|
| 153 |
+
"Maintain steady pacing suitable for dense technical material. "
|
| 154 |
+
"Favor clarity and accuracy over naturalness. "
|
| 155 |
+
"Pronounce technical terminology, Greek letters, acronyms, and units correctly. "
|
| 156 |
+
"Read acronyms as individual letters unless they are standard spoken words. "
|
| 157 |
+
"Preserve capitalization, punctuation, and structure when they affect meaning. "
|
| 158 |
+
"Insert short pauses at commas and longer pauses at periods and section breaks. "
|
| 159 |
+
"Slow down slightly for equations, symbols, gene names, and references. "
|
| 160 |
+
"Do not summarize, interpret, or simplify the text. "
|
| 161 |
+
"Read exactly what is written."
|
| 162 |
+
),
|
| 163 |
+
)
|
| 164 |
+
|
| 165 |
+
# Registry of all available styles
|
| 166 |
+
TTS_STYLES: dict[str, TTSStyle] = {
|
| 167 |
+
style.id: style
|
| 168 |
+
for style in [
|
| 169 |
+
STYLE_TECHNICAL,
|
| 170 |
+
STYLE_NARRATIVE,
|
| 171 |
+
STYLE_CHILD_NARRATIVE,
|
| 172 |
+
STYLE_NEWS,
|
| 173 |
+
STYLE_ACADEMIC,
|
| 174 |
+
]
|
| 175 |
+
}
|
| 176 |
+
|
| 177 |
+
# Default style
|
| 178 |
+
DEFAULT_STYLE = STYLE_TECHNICAL
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+
def get_style(style_id: str) -> TTSStyle:
|
| 182 |
+
"""Get a TTS style by ID, falling back to default if not found."""
|
| 183 |
+
return TTS_STYLES.get(style_id, DEFAULT_STYLE)
|
| 184 |
+
|
| 185 |
+
|
| 186 |
# Language to default voice mapping
|
| 187 |
LANGUAGE_VOICES: dict[str, str] = {
|
| 188 |
"english": "Ryan",
|
|
|
|
| 194 |
# Default chunk size for streaming
|
| 195 |
# Larger chunks = more stable voice, fewer artifacts at boundaries
|
| 196 |
# Smaller chunks = faster first audio but potential voice instability
|
| 197 |
+
# 1800 chars provides good balance for natural speech flow
|
| 198 |
+
DEFAULT_CHUNK_SIZE = 1800
|
| 199 |
|
| 200 |
# Idle timeout before unloading model from GPU (seconds)
|
| 201 |
# Set to 0 to disable auto-unloading
|
|
|
|
| 269 |
self._idle_timeout = idle_timeout
|
| 270 |
self._last_activity = time.time()
|
| 271 |
self._model_loaded = False
|
| 272 |
+
self._model_state = "unloaded" # unloaded, loading, loaded, unloading
|
| 273 |
self._lock = threading.Lock()
|
| 274 |
self._unload_timer: threading.Timer | None = None
|
| 275 |
|
| 276 |
+
# Calibrated seconds per character (measured and updated over time)
|
| 277 |
+
self._seconds_per_char: float | None = None
|
| 278 |
+
# Cumulative stats for running average
|
| 279 |
+
self._total_chars_processed: int = 0
|
| 280 |
+
self._total_time_spent: float = 0.0
|
| 281 |
+
|
| 282 |
+
# Current style for TTS
|
| 283 |
+
self._style: TTSStyle = DEFAULT_STYLE
|
| 284 |
+
|
| 285 |
# Model will be loaded on first request (lazy loading)
|
| 286 |
self.model = None
|
| 287 |
|
|
|
|
| 289 |
if idle_timeout == 0:
|
| 290 |
self._load_model()
|
| 291 |
|
| 292 |
+
@property
|
| 293 |
+
def style(self) -> TTSStyle:
|
| 294 |
+
"""Return the current TTS style."""
|
| 295 |
+
return self._style
|
| 296 |
+
|
| 297 |
+
def set_style(self, style_id: str) -> None:
|
| 298 |
+
"""Set the TTS style by ID.
|
| 299 |
+
|
| 300 |
+
Args:
|
| 301 |
+
style_id: Style identifier (technical, narrative, news, casual, academic).
|
| 302 |
+
"""
|
| 303 |
+
self._style = get_style(style_id)
|
| 304 |
+
|
| 305 |
+
@property
|
| 306 |
+
def model_state(self) -> str:
|
| 307 |
+
"""Return the current model state: unloaded, loading, loaded, or unloading."""
|
| 308 |
+
return self._model_state
|
| 309 |
+
|
| 310 |
+
@property
|
| 311 |
+
def seconds_per_char(self) -> float | None:
|
| 312 |
+
"""Return calibrated seconds per character, or None if not yet measured."""
|
| 313 |
+
return self._seconds_per_char
|
| 314 |
+
|
| 315 |
+
@property
|
| 316 |
+
def total_chars_processed(self) -> int:
|
| 317 |
+
"""Return total characters processed since startup."""
|
| 318 |
+
return self._total_chars_processed
|
| 319 |
+
|
| 320 |
+
def _update_timing_stats(self, chars: int, elapsed: float) -> None:
|
| 321 |
+
"""Update cumulative timing statistics.
|
| 322 |
+
|
| 323 |
+
Args:
|
| 324 |
+
chars: Number of characters processed.
|
| 325 |
+
elapsed: Time taken in seconds.
|
| 326 |
+
"""
|
| 327 |
+
self._total_chars_processed += chars
|
| 328 |
+
self._total_time_spent += elapsed
|
| 329 |
+
if self._total_chars_processed > 0:
|
| 330 |
+
self._seconds_per_char = self._total_time_spent / self._total_chars_processed
|
| 331 |
+
|
| 332 |
+
def calibrate(self, test_text: str = "Hello, this is a calibration test.") -> float:
|
| 333 |
+
"""Run a calibration test to measure seconds per character.
|
| 334 |
+
|
| 335 |
+
Args:
|
| 336 |
+
test_text: Short text to use for calibration.
|
| 337 |
+
|
| 338 |
+
Returns:
|
| 339 |
+
Measured seconds per character.
|
| 340 |
+
"""
|
| 341 |
+
self._ensure_model_loaded()
|
| 342 |
+
|
| 343 |
+
start = time.time()
|
| 344 |
+
# Consume the generator to complete synthesis
|
| 345 |
+
for _ in self.synthesize(test_text):
|
| 346 |
+
pass
|
| 347 |
+
elapsed = time.time() - start
|
| 348 |
+
|
| 349 |
+
self._seconds_per_char = elapsed / len(test_text)
|
| 350 |
+
print(f"⏱️ Calibrated: {self._seconds_per_char:.4f}s per character")
|
| 351 |
+
return self._seconds_per_char
|
| 352 |
+
|
| 353 |
def _load_model(self) -> None:
|
| 354 |
"""Load the model onto GPU or CPU."""
|
| 355 |
if self._model_loaded:
|
|
|
|
| 358 |
import torch
|
| 359 |
from qwen_tts import Qwen3TTSModel
|
| 360 |
|
| 361 |
+
self._model_state = "loading"
|
| 362 |
device_name = "GPU" if self.device == "cuda" else "CPU"
|
| 363 |
print(f"🔄 Loading TTS model onto {device_name}...")
|
| 364 |
start = time.time()
|
|
|
|
| 387 |
)
|
| 388 |
|
| 389 |
self._model_loaded = True
|
| 390 |
+
self._model_state = "loaded"
|
| 391 |
|
| 392 |
# Calculate optimal batch size based on available VRAM
|
| 393 |
if self.device == "cuda":
|
|
|
|
| 407 |
|
| 408 |
import torch
|
| 409 |
|
| 410 |
+
self._model_state = "unloading"
|
| 411 |
print("💤 Unloading TTS model from GPU (idle timeout)...")
|
| 412 |
|
| 413 |
# Delete model and clear references
|
|
|
|
| 421 |
torch.cuda.empty_cache()
|
| 422 |
torch.cuda.synchronize()
|
| 423 |
|
| 424 |
+
self._model_state = "unloaded"
|
| 425 |
print("✅ GPU memory freed")
|
| 426 |
|
| 427 |
def _schedule_unload(self) -> None:
|
|
|
|
| 511 |
# Type guard - model is guaranteed to be loaded after _ensure_model_loaded
|
| 512 |
assert self.model is not None, "Model failed to load"
|
| 513 |
|
| 514 |
+
# Track timing for this synthesis
|
| 515 |
+
synthesis_start = time.time()
|
| 516 |
+
chars_in_text = len(text)
|
| 517 |
+
|
| 518 |
try:
|
| 519 |
# Split text into chunks for streaming
|
| 520 |
chunks = self._split_text(text)
|
|
|
|
| 534 |
continue
|
| 535 |
|
| 536 |
# Always use batched call for consistent GPU memory allocation
|
| 537 |
+
# Use the current style's prompt for delivery
|
| 538 |
+
style_prompt = self._style.prompt
|
| 539 |
+
batch_instruct = [style_prompt] * len(batch) if len(batch) > 1 else style_prompt
|
|
|
|
| 540 |
audios, sr = self.model.generate_custom_voice(
|
| 541 |
text=batch if len(batch) > 1 else batch[0],
|
| 542 |
speaker=[self.voice] * len(batch) if len(batch) > 1 else self.voice,
|
|
|
|
| 556 |
first_chunk = False
|
| 557 |
yield wav_bytes
|
| 558 |
finally:
|
| 559 |
+
# Update timing stats for future estimates
|
| 560 |
+
elapsed = time.time() - synthesis_start
|
| 561 |
+
self._update_timing_stats(chars_in_text, elapsed)
|
| 562 |
# Schedule model unload after idle timeout
|
| 563 |
self._schedule_unload()
|
| 564 |
|