GitHub Actions commited on
Commit
1c7725b
·
1 Parent(s): 2886be7

Deploy from GitHub: 0bf18943d192a2812c57599f6c25bf9739d523bf

Browse files
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Luca Cappelletti
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
pyproject.toml CHANGED
@@ -1,6 +1,6 @@
1
  [project]
2
  name = "talking-snake"
3
- version = "0.1.0"
4
  description = "Just a talking snake that reads PDFs and web pages aloud."
5
  readme = "README.md"
6
  license = { text = "MIT" }
@@ -45,11 +45,6 @@ dev = [
45
  "mypy>=1.14.0",
46
  "pre-commit>=4.0.0",
47
  ]
48
- # Flash Attention for ~2x faster inference (requires CUDA 11.6+)
49
- # Install separately: pip install flash-attn --no-build-isolation
50
- fast = [
51
- "flash-attn>=2.5.0",
52
- ]
53
 
54
  [project.scripts]
55
  talking-snake = "talking_snake.__main__:main"
 
1
  [project]
2
  name = "talking-snake"
3
+ version = "0.1.1"
4
  description = "Just a talking snake that reads PDFs and web pages aloud."
5
  readme = "README.md"
6
  license = { text = "MIT" }
 
45
  "mypy>=1.14.0",
46
  "pre-commit>=4.0.0",
47
  ]
 
 
 
 
 
48
 
49
  [project.scripts]
50
  talking-snake = "talking_snake.__main__:main"
src/talking_snake/__main__.py CHANGED
@@ -99,6 +99,13 @@ def main() -> int:
99
  return 1
100
 
101
  print("✅ TTS model loaded!")
 
 
 
 
 
 
 
102
  print()
103
 
104
  # Create app with engine
 
99
  return 1
100
 
101
  print("✅ TTS model loaded!")
102
+
103
+ # Run calibration to get accurate time estimates
104
+ print("⏱️ Calibrating speech timing...")
105
+ try:
106
+ tts_engine.calibrate()
107
+ except Exception as e:
108
+ print(f"⚠️ Calibration failed (using defaults): {e}")
109
  print()
110
 
111
  # Create app with engine
src/talking_snake/app.py CHANGED
@@ -11,12 +11,12 @@ import time
11
  import uuid
12
  from pathlib import Path
13
  from typing import TYPE_CHECKING
14
- from urllib.parse import urlparse
15
 
16
  import httpx
17
  import trafilatura
18
  from fastapi import FastAPI, File, Form, HTTPException, Request, UploadFile
19
- from fastapi.responses import HTMLResponse, StreamingResponse
20
  from fastapi.staticfiles import StaticFiles
21
  from pydantic import BaseModel
22
 
@@ -24,6 +24,7 @@ from talking_snake.extract import clean_text, extract_text, get_page_count
24
  from talking_snake.tts import (
25
  DEFAULT_CHUNK_SIZE,
26
  LANGUAGE_VOICES,
 
27
  MockTTSEngine,
28
  TTSEngineProtocol,
29
  )
@@ -52,15 +53,31 @@ class AudioJob:
52
  def __init__(self, job_id: str):
53
  self.job_id = job_id
54
  self.audio_queue: queue.Queue[bytes | None] = queue.Queue()
 
55
  self.started = time.time()
56
  self.completed = False
 
57
  self.error: str | None = None
58
  self.sample_rate = 24000 # Default, will be set by TTS engine
59
  self.header_sent = False
 
 
 
 
 
 
 
60
 
61
  def put_audio(self, audio_bytes: bytes) -> None:
62
- """Add audio data to the queue."""
63
  self.audio_queue.put(audio_bytes)
 
 
 
 
 
 
 
64
 
65
  def finish(self) -> None:
66
  """Signal that audio generation is complete."""
@@ -117,6 +134,7 @@ class UrlRequest(BaseModel):
117
 
118
  url: str
119
  language: str = "english"
 
120
 
121
 
122
  class TextRequest(BaseModel):
@@ -124,6 +142,7 @@ class TextRequest(BaseModel):
124
 
125
  text: str
126
  language: str = "english"
 
127
 
128
 
129
  class EstimateResponse(BaseModel):
@@ -170,6 +189,7 @@ def create_app(tts_engine: TTSEngineProtocol | None = None) -> FastAPI:
170
  app.add_api_route("/api/read-url-stream", read_url_stream, methods=["POST"])
171
  app.add_api_route("/api/read-text-stream", read_text_stream, methods=["POST"])
172
  app.add_api_route("/api/audio/{job_id}", stream_audio, methods=["GET"])
 
173
  app.add_api_route("/api/languages", get_languages, methods=["GET"])
174
  app.add_api_route("/api/device-info-stream", stream_device_info, methods=["GET"])
175
  app.add_api_route("/api/health", health_check, methods=["GET"])
@@ -389,6 +409,9 @@ def _get_device_info() -> dict:
389
  Returns:
390
  Device type, memory usage, and model info.
391
  """
 
 
 
392
  import torch
393
 
394
  info = {
@@ -398,8 +421,20 @@ def _get_device_info() -> dict:
398
  "memory_total_gb": 0,
399
  "memory_percent": 0,
400
  "batch_size": 1,
 
 
 
401
  }
402
 
 
 
 
 
 
 
 
 
 
403
  if torch.cuda.is_available():
404
  props = torch.cuda.get_device_properties(0)
405
  # Use reserved memory for more accurate GPU usage (includes PyTorch cache)
@@ -421,6 +456,15 @@ def _get_device_info() -> dict:
421
  if _tts_engine is not None:
422
  info["batch_size"] = getattr(_tts_engine, "batch_size", 1)
423
  info["chunk_size"] = getattr(_tts_engine, "chunk_size", 800)
 
 
 
 
 
 
 
 
 
424
 
425
  return info
426
 
@@ -461,9 +505,7 @@ async def stream_device_info() -> StreamingResponse:
461
  )
462
 
463
 
464
- def _estimate_time(
465
- text: str, seconds_per_char: float = INITIAL_SECONDS_PER_CHAR
466
- ) -> tuple[int, float]:
467
  """Estimate processing time for text.
468
 
469
  Args:
@@ -473,6 +515,8 @@ def _estimate_time(
473
  Returns:
474
  Tuple of (chunk_count, estimated_seconds).
475
  """
 
 
476
  # Count chunks (500 chars per chunk approximately)
477
  chunk_count = max(1, len(text) // 500 + (1 if len(text) % 500 else 0))
478
  estimated_seconds = len(text) * seconds_per_char
@@ -521,6 +565,7 @@ def _generate_audio_to_job(
521
  text: str,
522
  tts_engine: TTSEngineProtocol,
523
  language: str = "english",
 
524
  doc_name: str = "document",
525
  doc_type: str = "text",
526
  page_count: int | None = None,
@@ -536,11 +581,10 @@ def _generate_audio_to_job(
536
  text: Text to synthesize.
537
  tts_engine: TTS engine to use.
538
  language: Language for TTS (english, chinese, japanese, korean).
 
539
  doc_name: Name of the document being processed.
540
  doc_type: Type of document (pdf, url, text).
541
  page_count: Number of pages (for PDFs).
542
- tts_engine: TTS engine to use.
543
- language: Language for TTS (english, chinese, japanese, korean).
544
 
545
  Yields:
546
  SSE events for progress.
@@ -551,6 +595,10 @@ def _generate_audio_to_job(
551
  if hasattr(tts_engine, "set_language"):
552
  tts_engine.set_language(language)
553
 
 
 
 
 
554
  # Get chunk size and batch size from engine
555
  chunk_size = getattr(tts_engine, "chunk_size", DEFAULT_CHUNK_SIZE)
556
  batch_size = getattr(tts_engine, "batch_size", 1)
@@ -578,9 +626,13 @@ def _generate_audio_to_job(
578
  total_chunks = len(chunks) if chunks else 1
579
  total_chars = sum(len(c) for c in chunks)
580
 
581
- # Use initial estimate before calibration
582
- seconds_per_char = INITIAL_SECONDS_PER_CHAR
583
- estimated_total = total_chars * seconds_per_char
 
 
 
 
584
 
585
  # Send initial progress event with job_id and batch info
586
  progress_data = {
@@ -589,7 +641,7 @@ def _generate_audio_to_job(
589
  "current": 0,
590
  "total": total_chunks,
591
  "percent": 0,
592
- "estimated_remaining": estimated_total,
593
  "batch_size": batch_size,
594
  "doc_name": doc_name,
595
  "doc_type": doc_type,
@@ -648,13 +700,14 @@ def _generate_audio_to_job(
648
  # Signal audio generation complete
649
  job.finish()
650
 
651
- # Send completion event
652
  total_time = time.time() - start_time
653
  complete_data = {
654
  "type": "complete",
655
  "total_time": round(total_time, 1),
656
  "chunks_processed": chunks_processed,
657
  "batch_size": batch_size,
 
658
  }
659
  yield f"event: complete\ndata: {json.dumps(complete_data)}\n\n".encode()
660
 
@@ -664,6 +717,7 @@ async def stream_audio(job_id: str) -> StreamingResponse:
664
 
665
  This endpoint streams the raw WAV audio as it's being generated.
666
  The browser can start playing as soon as data arrives.
 
667
 
668
  Args:
669
  job_id: The job ID to stream audio for.
@@ -675,7 +729,9 @@ async def stream_audio(job_id: str) -> StreamingResponse:
675
  if job is None:
676
  raise HTTPException(status_code=404, detail="Job not found")
677
 
678
- def generate_audio() -> Iterator[bytes]:
 
 
679
  # Send WAV header first
680
  yield _create_wav_header(sample_rate=24000)
681
 
@@ -689,8 +745,6 @@ async def stream_audio(job_id: str) -> StreamingResponse:
689
  break
690
  # Skip WAV headers from individual chunks, only send raw PCM
691
  if audio_data[:4] == b"RIFF":
692
- # This is a WAV file, extract just the PCM data
693
- # WAV header is 44 bytes for standard PCM
694
  yield audio_data[44:]
695
  else:
696
  yield audio_data
@@ -698,11 +752,21 @@ async def stream_audio(job_id: str) -> StreamingResponse:
698
  # Timeout waiting for data
699
  break
700
 
701
- # Clean up job after streaming
702
- _job_manager.remove_job(job_id)
 
 
 
 
 
 
 
 
 
 
703
 
704
  return StreamingResponse(
705
- generate_audio(),
706
  media_type="audio/wav",
707
  headers={
708
  "Cache-Control": "no-cache",
@@ -711,9 +775,76 @@ async def stream_audio(job_id: str) -> StreamingResponse:
711
  )
712
 
713
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
714
  async def read_pdf_stream(
715
  file: UploadFile = File(...),
716
  language: str = Form("english"),
 
717
  ) -> StreamingResponse:
718
  """Read a PDF with streaming progress updates.
719
 
@@ -722,6 +853,7 @@ async def read_pdf_stream(
722
  Args:
723
  file: Uploaded PDF file.
724
  language: Language for TTS (english, chinese, japanese, korean).
 
725
 
726
  Returns:
727
  Streaming response with progress events including job_id.
@@ -767,6 +899,7 @@ async def read_pdf_stream(
767
  text,
768
  _tts_engine,
769
  language,
 
770
  doc_name=file.filename or "document.pdf",
771
  doc_type="pdf",
772
  page_count=page_count,
@@ -796,6 +929,7 @@ async def read_text_stream(request: TextRequest) -> StreamingResponse:
796
 
797
  text = request.text.strip()
798
  language = request.language if request.language in LANGUAGE_VOICES else "english"
 
799
 
800
  if not text:
801
  raise HTTPException(status_code=400, detail="Text is required")
@@ -809,6 +943,14 @@ async def read_text_stream(request: TextRequest) -> StreamingResponse:
809
  if not text.strip():
810
  raise HTTPException(status_code=400, detail="No readable text provided")
811
 
 
 
 
 
 
 
 
 
812
  # Create a job for this request
813
  job = _job_manager.create_job()
814
 
@@ -818,7 +960,8 @@ async def read_text_stream(request: TextRequest) -> StreamingResponse:
818
  text,
819
  _tts_engine,
820
  language,
821
- doc_name="Pasted Text",
 
822
  doc_type="text",
823
  ),
824
  media_type="text/event-stream",
@@ -846,6 +989,7 @@ async def read_url_stream(request: UrlRequest) -> StreamingResponse:
846
 
847
  url = request.url.strip()
848
  language = request.language if request.language in LANGUAGE_VOICES else "english"
 
849
 
850
  if not url:
851
  raise HTTPException(status_code=400, detail="URL is required")
@@ -922,6 +1066,7 @@ async def read_url_stream(request: UrlRequest) -> StreamingResponse:
922
  text,
923
  _tts_engine,
924
  language,
 
925
  doc_name=doc_name,
926
  doc_type="pdf" if is_pdf else "url",
927
  page_count=page_count,
 
11
  import uuid
12
  from pathlib import Path
13
  from typing import TYPE_CHECKING
14
+ from urllib.parse import quote, urlparse
15
 
16
  import httpx
17
  import trafilatura
18
  from fastapi import FastAPI, File, Form, HTTPException, Request, UploadFile
19
+ from fastapi.responses import HTMLResponse, Response, StreamingResponse
20
  from fastapi.staticfiles import StaticFiles
21
  from pydantic import BaseModel
22
 
 
24
  from talking_snake.tts import (
25
  DEFAULT_CHUNK_SIZE,
26
  LANGUAGE_VOICES,
27
+ TTS_STYLES,
28
  MockTTSEngine,
29
  TTSEngineProtocol,
30
  )
 
53
  def __init__(self, job_id: str):
54
  self.job_id = job_id
55
  self.audio_queue: queue.Queue[bytes | None] = queue.Queue()
56
+ self.audio_cache: list[bytes] = [] # Cache PCM chunks for replay/download
57
  self.started = time.time()
58
  self.completed = False
59
+ self.stream_started = False # Track if live stream has started
60
  self.error: str | None = None
61
  self.sample_rate = 24000 # Default, will be set by TTS engine
62
  self.header_sent = False
63
+ self._total_pcm_bytes = 0 # Track total audio bytes for duration calc
64
+
65
+ @property
66
+ def audio_duration(self) -> float:
67
+ """Calculate audio duration in seconds from cached PCM data."""
68
+ # 16-bit mono audio: duration = bytes / (sample_rate * 2)
69
+ return self._total_pcm_bytes / (self.sample_rate * 2)
70
 
71
  def put_audio(self, audio_bytes: bytes) -> None:
72
+ """Add audio data to the queue and cache."""
73
  self.audio_queue.put(audio_bytes)
74
+ # Cache the PCM data (strip WAV header if present)
75
+ if audio_bytes[:4] == b"RIFF":
76
+ pcm_data = audio_bytes[44:]
77
+ else:
78
+ pcm_data = audio_bytes
79
+ self.audio_cache.append(pcm_data)
80
+ self._total_pcm_bytes += len(pcm_data)
81
 
82
  def finish(self) -> None:
83
  """Signal that audio generation is complete."""
 
134
 
135
  url: str
136
  language: str = "english"
137
+ style: str = "technical"
138
 
139
 
140
  class TextRequest(BaseModel):
 
142
 
143
  text: str
144
  language: str = "english"
145
+ style: str = "technical"
146
 
147
 
148
  class EstimateResponse(BaseModel):
 
189
  app.add_api_route("/api/read-url-stream", read_url_stream, methods=["POST"])
190
  app.add_api_route("/api/read-text-stream", read_text_stream, methods=["POST"])
191
  app.add_api_route("/api/audio/{job_id}", stream_audio, methods=["GET"])
192
+ app.add_api_route("/api/download/{job_id}", download_audio, methods=["GET"])
193
  app.add_api_route("/api/languages", get_languages, methods=["GET"])
194
  app.add_api_route("/api/device-info-stream", stream_device_info, methods=["GET"])
195
  app.add_api_route("/api/health", health_check, methods=["GET"])
 
409
  Returns:
410
  Device type, memory usage, and model info.
411
  """
412
+ import shutil
413
+
414
+ import psutil
415
  import torch
416
 
417
  info = {
 
421
  "memory_total_gb": 0,
422
  "memory_percent": 0,
423
  "batch_size": 1,
424
+ "ram_used_gb": 0,
425
+ "ram_total_gb": 0,
426
+ "disk_free_gb": 0,
427
  }
428
 
429
+ # Get RAM info
430
+ ram = psutil.virtual_memory()
431
+ info["ram_used_gb"] = round(ram.used / 1024**3, 1)
432
+ info["ram_total_gb"] = round(ram.total / 1024**3, 1)
433
+
434
+ # Get disk free space
435
+ disk = shutil.disk_usage("/")
436
+ info["disk_free_gb"] = round(disk.free / 1024**3, 1)
437
+
438
  if torch.cuda.is_available():
439
  props = torch.cuda.get_device_properties(0)
440
  # Use reserved memory for more accurate GPU usage (includes PyTorch cache)
 
456
  if _tts_engine is not None:
457
  info["batch_size"] = getattr(_tts_engine, "batch_size", 1)
458
  info["chunk_size"] = getattr(_tts_engine, "chunk_size", 800)
459
+ # Include model state
460
+ info["model_state"] = getattr(_tts_engine, "model_state", "unknown")
461
+ # Include timing stats
462
+ seconds_per_char = getattr(_tts_engine, "seconds_per_char", None)
463
+ if seconds_per_char is not None:
464
+ info["seconds_per_char"] = round(seconds_per_char, 4)
465
+ total_chars = getattr(_tts_engine, "total_chars_processed", 0)
466
+ if total_chars > 0:
467
+ info["total_chars_processed"] = total_chars
468
 
469
  return info
470
 
 
505
  )
506
 
507
 
508
+ def _estimate_time(text: str, seconds_per_char: float | None = None) -> tuple[int, float]:
 
 
509
  """Estimate processing time for text.
510
 
511
  Args:
 
515
  Returns:
516
  Tuple of (chunk_count, estimated_seconds).
517
  """
518
+ if seconds_per_char is None:
519
+ seconds_per_char = INITIAL_SECONDS_PER_CHAR
520
  # Count chunks (500 chars per chunk approximately)
521
  chunk_count = max(1, len(text) // 500 + (1 if len(text) % 500 else 0))
522
  estimated_seconds = len(text) * seconds_per_char
 
565
  text: str,
566
  tts_engine: TTSEngineProtocol,
567
  language: str = "english",
568
+ style: str = "technical",
569
  doc_name: str = "document",
570
  doc_type: str = "text",
571
  page_count: int | None = None,
 
581
  text: Text to synthesize.
582
  tts_engine: TTS engine to use.
583
  language: Language for TTS (english, chinese, japanese, korean).
584
+ style: TTS style (technical, narrative, news, casual, academic).
585
  doc_name: Name of the document being processed.
586
  doc_type: Type of document (pdf, url, text).
587
  page_count: Number of pages (for PDFs).
 
 
588
 
589
  Yields:
590
  SSE events for progress.
 
595
  if hasattr(tts_engine, "set_language"):
596
  tts_engine.set_language(language)
597
 
598
+ # Apply style if the engine supports it
599
+ if hasattr(tts_engine, "set_style"):
600
+ tts_engine.set_style(style)
601
+
602
  # Get chunk size and batch size from engine
603
  chunk_size = getattr(tts_engine, "chunk_size", DEFAULT_CHUNK_SIZE)
604
  batch_size = getattr(tts_engine, "batch_size", 1)
 
626
  total_chunks = len(chunks) if chunks else 1
627
  total_chars = sum(len(c) for c in chunks)
628
 
629
+ # Use calibrated estimate if available, otherwise initial estimate
630
+ seconds_per_char = getattr(tts_engine, "seconds_per_char", None) or INITIAL_SECONDS_PER_CHAR
631
+
632
+ # Account for batch efficiency: processing N chunks in parallel is ~N times faster
633
+ # The efficiency isn't perfectly linear, so use a conservative factor of sqrt(batch_size)
634
+ batch_efficiency = batch_size**0.5 if batch_size > 1 else 1.0
635
+ estimated_total = (total_chars * seconds_per_char) / batch_efficiency
636
 
637
  # Send initial progress event with job_id and batch info
638
  progress_data = {
 
641
  "current": 0,
642
  "total": total_chunks,
643
  "percent": 0,
644
+ "estimated_remaining": round(estimated_total, 1),
645
  "batch_size": batch_size,
646
  "doc_name": doc_name,
647
  "doc_type": doc_type,
 
700
  # Signal audio generation complete
701
  job.finish()
702
 
703
+ # Send completion event with actual audio duration
704
  total_time = time.time() - start_time
705
  complete_data = {
706
  "type": "complete",
707
  "total_time": round(total_time, 1),
708
  "chunks_processed": chunks_processed,
709
  "batch_size": batch_size,
710
+ "audio_duration": round(job.audio_duration, 2),
711
  }
712
  yield f"event: complete\ndata: {json.dumps(complete_data)}\n\n".encode()
713
 
 
717
 
718
  This endpoint streams the raw WAV audio as it's being generated.
719
  The browser can start playing as soon as data arrives.
720
+ First request streams live; subsequent requests return cached audio.
721
 
722
  Args:
723
  job_id: The job ID to stream audio for.
 
729
  if job is None:
730
  raise HTTPException(status_code=404, detail="Job not found")
731
 
732
+ def generate_audio_live() -> Iterator[bytes]:
733
+ """Stream audio live from queue (first request)."""
734
+ job.stream_started = True
735
  # Send WAV header first
736
  yield _create_wav_header(sample_rate=24000)
737
 
 
745
  break
746
  # Skip WAV headers from individual chunks, only send raw PCM
747
  if audio_data[:4] == b"RIFF":
 
 
748
  yield audio_data[44:]
749
  else:
750
  yield audio_data
 
752
  # Timeout waiting for data
753
  break
754
 
755
+ def generate_audio_cached() -> Iterator[bytes]:
756
+ """Stream audio from cache (subsequent requests)."""
757
+ # Send WAV header first
758
+ yield _create_wav_header(sample_rate=24000)
759
+ # Send all cached chunks
760
+ yield from job.audio_cache
761
+
762
+ # Use live stream for first request, cached for subsequent
763
+ if not job.stream_started:
764
+ generator = generate_audio_live()
765
+ else:
766
+ generator = generate_audio_cached()
767
 
768
  return StreamingResponse(
769
+ generator,
770
  media_type="audio/wav",
771
  headers={
772
  "Cache-Control": "no-cache",
 
775
  )
776
 
777
 
778
+ async def download_audio(job_id: str, filename: str = "audio.wav") -> Response:
779
+ """Download complete audio file for a job.
780
+
781
+ This endpoint returns the full WAV file with correct headers for download.
782
+ Only works after generation is complete.
783
+
784
+ Args:
785
+ job_id: The job ID to download audio for.
786
+ filename: Suggested filename for download.
787
+
788
+ Returns:
789
+ Complete WAV audio file response.
790
+ """
791
+ job = _job_manager.get_job(job_id)
792
+ if job is None:
793
+ raise HTTPException(status_code=404, detail="Job not found")
794
+
795
+ if not job.audio_cache:
796
+ raise HTTPException(status_code=404, detail="No audio available")
797
+
798
+ # Combine all cached audio data
799
+ audio_data = b"".join(job.audio_cache)
800
+
801
+ # Create proper WAV header with actual size
802
+ sample_rate = 24000
803
+ bits_per_sample = 16
804
+ channels = 1
805
+ byte_rate = sample_rate * channels * bits_per_sample // 8
806
+ block_align = channels * bits_per_sample // 8
807
+ data_size = len(audio_data)
808
+ file_size = data_size + 36 # Header is 44 bytes, minus 8 for RIFF header
809
+
810
+ header = io.BytesIO()
811
+ header.write(b"RIFF")
812
+ header.write(struct.pack("<I", file_size))
813
+ header.write(b"WAVE")
814
+ header.write(b"fmt ")
815
+ header.write(struct.pack("<I", 16)) # fmt chunk size
816
+ header.write(struct.pack("<H", 1)) # PCM format
817
+ header.write(struct.pack("<H", channels))
818
+ header.write(struct.pack("<I", sample_rate))
819
+ header.write(struct.pack("<I", byte_rate))
820
+ header.write(struct.pack("<H", block_align))
821
+ header.write(struct.pack("<H", bits_per_sample))
822
+ header.write(b"data")
823
+ header.write(struct.pack("<I", data_size))
824
+
825
+ wav_data = header.getvalue() + audio_data
826
+
827
+ # RFC 5987 encoding for non-ASCII filenames
828
+ # Use ASCII-safe fallback + UTF-8 encoded filename*
829
+ safe_filename = filename.encode("ascii", "replace").decode("ascii")
830
+ encoded_filename = quote(filename, safe="")
831
+
832
+ return Response(
833
+ content=wav_data,
834
+ media_type="audio/wav",
835
+ headers={
836
+ "Content-Disposition": (
837
+ f"attachment; filename=\"{safe_filename}\"; filename*=UTF-8''{encoded_filename}"
838
+ ),
839
+ "Content-Length": str(len(wav_data)),
840
+ },
841
+ )
842
+
843
+
844
  async def read_pdf_stream(
845
  file: UploadFile = File(...),
846
  language: str = Form("english"),
847
+ style: str = Form("technical"),
848
  ) -> StreamingResponse:
849
  """Read a PDF with streaming progress updates.
850
 
 
853
  Args:
854
  file: Uploaded PDF file.
855
  language: Language for TTS (english, chinese, japanese, korean).
856
+ style: TTS style (technical, narrative, news, casual, academic).
857
 
858
  Returns:
859
  Streaming response with progress events including job_id.
 
899
  text,
900
  _tts_engine,
901
  language,
902
+ style,
903
  doc_name=file.filename or "document.pdf",
904
  doc_type="pdf",
905
  page_count=page_count,
 
929
 
930
  text = request.text.strip()
931
  language = request.language if request.language in LANGUAGE_VOICES else "english"
932
+ style = request.style if request.style in TTS_STYLES else "technical"
933
 
934
  if not text:
935
  raise HTTPException(status_code=400, detail="Text is required")
 
943
  if not text.strip():
944
  raise HTTPException(status_code=400, detail="No readable text provided")
945
 
946
+ # Generate doc name from first few words
947
+ words = text.split()[:5]
948
+ doc_name = " ".join(words)
949
+ if len(doc_name) > 30:
950
+ doc_name = doc_name[:30] + "..."
951
+ elif len(words) == 5:
952
+ doc_name = doc_name + "..."
953
+
954
  # Create a job for this request
955
  job = _job_manager.create_job()
956
 
 
960
  text,
961
  _tts_engine,
962
  language,
963
+ style,
964
+ doc_name=doc_name,
965
  doc_type="text",
966
  ),
967
  media_type="text/event-stream",
 
989
 
990
  url = request.url.strip()
991
  language = request.language if request.language in LANGUAGE_VOICES else "english"
992
+ style = request.style if request.style in TTS_STYLES else "technical"
993
 
994
  if not url:
995
  raise HTTPException(status_code=400, detail="URL is required")
 
1066
  text,
1067
  _tts_engine,
1068
  language,
1069
+ style,
1070
  doc_name=doc_name,
1071
  doc_type="pdf" if is_pdf else "url",
1072
  page_count=page_count,
src/talking_snake/extract.py CHANGED
@@ -8,7 +8,14 @@ from collections import Counter
8
  from dataclasses import dataclass
9
 
10
  from pdfminer.high_level import extract_pages
11
- from pdfminer.layout import LAParams, LTChar, LTPage, LTTextBoxHorizontal, LTTextLineHorizontal
 
 
 
 
 
 
 
12
 
13
 
14
  @dataclass
@@ -19,6 +26,260 @@ class TextBlock:
19
  y_ratio: float # 0.0 = bottom, 1.0 = top
20
  font_size: float
21
  page_num: int
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
 
24
  def extract_text_blocks(pdf_bytes: bytes) -> list[TextBlock]:
@@ -50,21 +311,52 @@ def extract_text_blocks(pdf_bytes: bytes) -> list[TextBlock]:
50
  if not isinstance(element, LTTextBoxHorizontal):
51
  continue
52
 
53
- text = element.get_text().strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  if not text:
55
  continue
56
 
57
  # Calculate Y position as ratio (0=bottom, 1=top)
58
  y_ratio = element.y0 / page_height if page_height > 0 else 0.5
59
 
60
- # Extract average font size from characters
61
- font_sizes: list[float] = []
62
- for line in element:
63
- if isinstance(line, LTTextLineHorizontal):
64
- for char in line:
65
- if isinstance(char, LTChar):
66
- font_sizes.append(char.size)
67
-
68
  avg_font_size = sum(font_sizes) / len(font_sizes) if font_sizes else 10.0
69
 
70
  blocks.append(
@@ -107,6 +399,9 @@ def extract_text(pdf_bytes: bytes) -> str:
107
  if not blocks:
108
  return ""
109
 
 
 
 
110
  cleaned_blocks = clean_text_blocks(blocks)
111
  text = "\n\n".join(block.text for block in cleaned_blocks)
112
 
@@ -161,6 +456,10 @@ def clean_text_blocks(blocks: list[TextBlock]) -> list[TextBlock]:
161
  if is_page_number(block.text):
162
  continue
163
 
 
 
 
 
164
  # Skip very short lines with small font (likely captions/footnotes)
165
  if len(block.text) < 20 and block.font_size < median_font_size * 0.8:
166
  continue
@@ -280,6 +579,96 @@ def normalize_for_tts(text: str) -> str:
280
  Returns:
281
  Normalized text optimized for TTS.
282
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
283
  # === CODE AND TECHNICAL CONTENT ===
284
  # Handle common programming patterns that read poorly
285
 
@@ -357,10 +746,37 @@ def normalize_for_tts(text: str) -> str:
357
  text = text.replace("'''", "")
358
 
359
  # === UNICODE NORMALIZATION ===
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
360
  # Convert smart quotes to simple quotes
361
- text = text.replace(""", '"').replace(""", '"')
362
- text = text.replace("'", "'").replace("'", "'")
363
- text = text.replace("", '"').replace("", '"')
364
 
365
  # Normalize dashes to standard hyphen or remove
366
  text = text.replace("–", "-") # en-dash
@@ -471,19 +887,32 @@ def normalize_for_tts(text: str) -> str:
471
  # Remove content in angle brackets (often HTML/XML artifacts)
472
  text = re.sub(r"<[^>]+>", "", text)
473
 
474
- # Normalize multiple spaces
475
- text = re.sub(r"[ \t]+", " ", text)
476
-
477
  # Remove spaces before punctuation
478
  text = re.sub(r"\s+([.,;:!?])", r"\1", text)
479
 
480
  # Ensure space after punctuation (but not before another punctuation)
481
  text = re.sub(r"([.,;:!?])([^\s.,;:!?'\"])", r"\1 \2", text)
482
 
483
- # Remove leading/trailing whitespace from lines
484
- text = "\n".join(line.strip() for line in text.split("\n"))
485
 
486
- # Remove empty lines that resulted from cleaning
 
 
 
 
 
 
 
 
 
 
 
 
 
487
  text = re.sub(r"\n{3,}", "\n\n", text)
488
 
 
 
 
489
  return text
 
8
  from dataclasses import dataclass
9
 
10
  from pdfminer.high_level import extract_pages
11
+ from pdfminer.layout import (
12
+ LAParams,
13
+ LTAnno,
14
+ LTChar,
15
+ LTPage,
16
+ LTTextBoxHorizontal,
17
+ LTTextLineHorizontal,
18
+ )
19
 
20
 
21
  @dataclass
 
26
  y_ratio: float # 0.0 = bottom, 1.0 = top
27
  font_size: float
28
  page_num: int
29
+ x0: float = 0.0 # Left edge position for table detection
30
+ x1: float = 0.0 # Right edge position for table detection
31
+
32
+
33
+ def _is_caption(text: str) -> bool:
34
+ """Check if text is a figure/table caption.
35
+
36
+ Captions typically start with:
37
+ - "Figure 1:", "Fig. 2:", "Figure 1."
38
+ - "Table 1:", "Table 2."
39
+ - "Exhibit A:", "Chart 1:"
40
+ - "Source:", "Note:", "Notes:"
41
+
42
+ Args:
43
+ text: Text to check.
44
+
45
+ Returns:
46
+ True if text appears to be a caption.
47
+ """
48
+ text = text.strip()
49
+ if not text:
50
+ return False
51
+
52
+ # Common caption patterns (case-insensitive start)
53
+ caption_patterns = [
54
+ r"^fig(?:ure)?\.?\s*\d",
55
+ r"^table\.?\s*\d",
56
+ r"^exhibit\.?\s*[a-z0-9]",
57
+ r"^chart\.?\s*\d",
58
+ r"^graph\.?\s*\d",
59
+ r"^diagram\.?\s*\d",
60
+ r"^plate\.?\s*\d",
61
+ r"^scheme\.?\s*\d",
62
+ r"^box\.?\s*\d",
63
+ r"^panel\.?\s*[a-z0-9]",
64
+ r"^appendix\.?\s*[a-z0-9]",
65
+ r"^source\s*:",
66
+ r"^sources\s*:",
67
+ r"^note\s*:",
68
+ r"^notes\s*:",
69
+ r"^data\s*:",
70
+ r"^\*\s*p\s*[<>=]", # Statistical notes like "* p < 0.05"
71
+ r"^legend\s*:",
72
+ ]
73
+
74
+ text_lower = text.lower()
75
+ for pattern in caption_patterns:
76
+ if re.match(pattern, text_lower):
77
+ return True
78
+
79
+ return False
80
+
81
+
82
+ def _is_table_like_text(text: str) -> bool:
83
+ """Check if text looks like table content.
84
+
85
+ Tables often have:
86
+ - Very short text fragments
87
+ - Mostly numbers or single words
88
+ - Lots of whitespace-separated values
89
+ - Column headers or row labels
90
+ - Short phrases without sentence structure
91
+
92
+ Args:
93
+ text: Text to check.
94
+
95
+ Returns:
96
+ True if the text appears to be table content.
97
+ """
98
+ text = text.strip()
99
+
100
+ # Very short fragments are likely table cells
101
+ if len(text) < 5:
102
+ return True
103
+
104
+ # Count numbers vs letters
105
+ digits = sum(1 for c in text if c.isdigit())
106
+ letters = sum(1 for c in text if c.isalpha())
107
+
108
+ # Mostly numbers with few letters (like "123.45" or "2024")
109
+ if digits > 0 and letters < 3 and digits >= letters:
110
+ return True
111
+
112
+ # Check for patterns common in tables
113
+ # Multiple tab-separated or heavily spaced values
114
+ if "\t" in text or " " in text:
115
+ parts = re.split(r"\s{2,}|\t", text)
116
+ if len(parts) >= 3:
117
+ # Multiple short parts suggests table row
118
+ short_parts = sum(1 for p in parts if len(p.strip()) < 15)
119
+ if short_parts >= len(parts) * 0.6:
120
+ return True
121
+
122
+ # Single words that look like column headers
123
+ words = text.split()
124
+ if len(words) == 1 and len(text) < 20:
125
+ # Common table headers/labels
126
+ table_keywords = {
127
+ "total",
128
+ "sum",
129
+ "avg",
130
+ "average",
131
+ "mean",
132
+ "count",
133
+ "min",
134
+ "max",
135
+ "date",
136
+ "time",
137
+ "year",
138
+ "month",
139
+ "day",
140
+ "name",
141
+ "id",
142
+ "no",
143
+ "no.",
144
+ "value",
145
+ "amount",
146
+ "price",
147
+ "cost",
148
+ "qty",
149
+ "quantity",
150
+ "unit",
151
+ "row",
152
+ "column",
153
+ "col",
154
+ "item",
155
+ "description",
156
+ "desc",
157
+ "note",
158
+ "status",
159
+ "type",
160
+ "category",
161
+ "code",
162
+ "ref",
163
+ "reference",
164
+ }
165
+ if text.lower() in table_keywords:
166
+ return True
167
+
168
+ # Short phrases without sentence structure (likely table cells)
169
+ # Table cells typically:
170
+ # - Are short (< 50 chars)
171
+ # - Don't end with sentence-ending punctuation
172
+ # - Don't start with lowercase (unless very short)
173
+ # - Have few words (< 8)
174
+ if len(text) < 50 and len(words) < 8:
175
+ # Doesn't end like a sentence
176
+ if not text.rstrip().endswith((".", "!", "?", ":")):
177
+ # Common table cell patterns
178
+ text_lower = text.lower()
179
+
180
+ # Technical/status phrases common in tables
181
+ table_phrases = [
182
+ "supported",
183
+ "not supported",
184
+ "yes",
185
+ "no",
186
+ "n/a",
187
+ "none",
188
+ "required",
189
+ "optional",
190
+ "enabled",
191
+ "disabled",
192
+ "active",
193
+ "inactive",
194
+ "read-only",
195
+ "read only",
196
+ "write",
197
+ "read/write",
198
+ "read-write",
199
+ "must be",
200
+ "can be",
201
+ "should be",
202
+ "will be",
203
+ "available",
204
+ "unavailable",
205
+ "pending",
206
+ "completed",
207
+ "failed",
208
+ "true",
209
+ "false",
210
+ "default",
211
+ "custom",
212
+ "manual",
213
+ "automatic",
214
+ "identical",
215
+ "different",
216
+ "same",
217
+ "other",
218
+ ]
219
+ for phrase in table_phrases:
220
+ if phrase in text_lower:
221
+ return True
222
+
223
+ # Looks like a label or header (Title Case or ALL CAPS, short)
224
+ if len(words) <= 4 and len(text) < 40:
225
+ # Check if it's Title Case or contains common label patterns
226
+ if text.istitle() or text.isupper():
227
+ return True
228
+ # Two-three word phrases that look like labels
229
+ if len(words) in (2, 3) and all(w[0].isupper() for w in words if w):
230
+ return True
231
+
232
+ return False
233
+
234
+
235
+ def _filter_table_blocks(blocks: list[TextBlock]) -> list[TextBlock]:
236
+ """Filter out blocks that appear to be part of tables.
237
+
238
+ Detects tables by looking for:
239
+ - Multiple blocks at similar Y positions (table rows)
240
+ - Blocks with table-like content
241
+
242
+ Args:
243
+ blocks: List of text blocks.
244
+
245
+ Returns:
246
+ Filtered list with table content removed.
247
+ """
248
+ if not blocks:
249
+ return blocks
250
+
251
+ # Group blocks by page and approximate Y position (row detection)
252
+ # Blocks within 1% of page height are considered same row
253
+ filtered = []
254
+
255
+ for page_num in set(b.page_num for b in blocks):
256
+ page_blocks = [b for b in blocks if b.page_num == page_num]
257
+
258
+ # Group by Y position (rounded to detect rows)
259
+ y_groups: dict[float, list[TextBlock]] = {}
260
+ for block in page_blocks:
261
+ y_key = round(block.y_ratio, 2) # Group within ~1% of page
262
+ if y_key not in y_groups:
263
+ y_groups[y_key] = []
264
+ y_groups[y_key].append(block)
265
+
266
+ for y_key, row_blocks in y_groups.items():
267
+ # If many blocks at same Y position, likely a table row
268
+ if len(row_blocks) >= 3:
269
+ # Check if most blocks look like table cells
270
+ table_like = sum(1 for b in row_blocks if _is_table_like_text(b.text))
271
+ if table_like >= len(row_blocks) * 0.5:
272
+ # Skip this entire row - it's a table
273
+ continue
274
+
275
+ # Filter individual blocks that look like table content
276
+ for block in row_blocks:
277
+ if not _is_table_like_text(block.text):
278
+ filtered.append(block)
279
+
280
+ # Sort by page and position (top to bottom)
281
+ filtered.sort(key=lambda b: (b.page_num, -b.y_ratio))
282
+ return filtered
283
 
284
 
285
  def extract_text_blocks(pdf_bytes: bytes) -> list[TextBlock]:
 
311
  if not isinstance(element, LTTextBoxHorizontal):
312
  continue
313
 
314
+ # Extract characters with their font sizes
315
+ # LTChar has font size, LTAnno is whitespace (use size=-1 to always keep)
316
+ chars_with_sizes: list[tuple[str, float]] = []
317
+ for line in element:
318
+ if isinstance(line, LTTextLineHorizontal):
319
+ for char in line:
320
+ if isinstance(char, LTChar):
321
+ chars_with_sizes.append((char.get_text(), char.size))
322
+ elif isinstance(char, LTAnno):
323
+ # Whitespace/newlines - always keep (use -1 as marker)
324
+ chars_with_sizes.append((char.get_text(), -1))
325
+
326
+ if not chars_with_sizes:
327
+ text = element.get_text().strip()
328
+ if text:
329
+ blocks.append(
330
+ TextBlock(
331
+ text=text,
332
+ y_ratio=element.y0 / page_height if page_height > 0 else 0.5,
333
+ font_size=10.0,
334
+ page_num=page_num,
335
+ )
336
+ )
337
+ continue
338
+
339
+ # Find dominant font size (most common, excluding whitespace markers)
340
+ font_sizes = [size for _, size in chars_with_sizes if size > 0]
341
+ if not font_sizes:
342
+ continue
343
+ size_counts = Counter(round(s, 1) for s in font_sizes)
344
+ dominant_size = max(size_counts, key=lambda x: size_counts[x])
345
+
346
+ # Filter out superscript/subscript characters (< 70% of dominant size)
347
+ # Keep whitespace (size=-1) and normal-sized characters
348
+ min_size = dominant_size * 0.7
349
+ filtered_text = "".join(
350
+ char for char, size in chars_with_sizes if size < 0 or size >= min_size
351
+ )
352
+
353
+ text = filtered_text.strip()
354
  if not text:
355
  continue
356
 
357
  # Calculate Y position as ratio (0=bottom, 1=top)
358
  y_ratio = element.y0 / page_height if page_height > 0 else 0.5
359
 
 
 
 
 
 
 
 
 
360
  avg_font_size = sum(font_sizes) / len(font_sizes) if font_sizes else 10.0
361
 
362
  blocks.append(
 
399
  if not blocks:
400
  return ""
401
 
402
+ # Filter out table content first
403
+ blocks = _filter_table_blocks(blocks)
404
+
405
  cleaned_blocks = clean_text_blocks(blocks)
406
  text = "\n\n".join(block.text for block in cleaned_blocks)
407
 
 
456
  if is_page_number(block.text):
457
  continue
458
 
459
+ # Skip figure/table captions
460
+ if _is_caption(block.text):
461
+ continue
462
+
463
  # Skip very short lines with small font (likely captions/footnotes)
464
  if len(block.text) < 20 and block.font_size < median_font_size * 0.8:
465
  continue
 
579
  Returns:
580
  Normalized text optimized for TTS.
581
  """
582
+ # === REMOVE ACADEMIC/PAPER ARTIFACTS ===
583
+ # Remove inline citations like (Smith et al., 2020) or (Smith, 2020; Jones, 2019)
584
+ # Also handles (Chen, 2018; Lee et al., 2020)
585
+ text = re.sub(r"\([^()]*\b\d{4}[a-z]?\b[^()]*\)", "", text)
586
+
587
+ # Remove author-year citations like "Smith (2020)" or "Smith et al. (2020)"
588
+ text = re.sub(
589
+ r"\b[A-Z][a-z]+(?:\s+(?:et\s+al\.?|and|&)\s+[A-Z][a-z]+)?\s*\(\d{4}[a-z]?\)", "", text
590
+ )
591
+
592
+ # Clean up "by [Author]" patterns - remove the author part, keep "by" for grammar
593
+ # "by Smith" -> "" (will be cleaned up), "study by Smith found" -> "study found"
594
+ text = re.sub(
595
+ r"\bby\s+[A-Z][a-z]+(?:\s+(?:et\s+al\.?|and|&)\s+[A-Z][a-z]+)?\s*,?\s*(?=found|showed|demonstrated|reported|observed|noted|suggested|concluded|argued|claimed|stated|proposed|discovered|revealed|indicated|confirmed)",
596
+ "",
597
+ text,
598
+ )
599
+
600
+ # Remove orphaned "et al." and similar
601
+ text = re.sub(r"\s+et\s+al\.?,?\s*", " ", text)
602
+
603
+ # Remove figure/table references like "see Figure 1" or "(see Table 2)"
604
+ text = re.sub(
605
+ r"\(?see\s+(?:Figure|Fig\.?|Table|Exhibit|Chart|Graph|Appendix)\s*\d+[a-z]?\)?",
606
+ "",
607
+ text,
608
+ flags=re.IGNORECASE,
609
+ )
610
+
611
+ # Remove standalone figure/table references like "Figure 1 shows" -> "shows"
612
+ text = re.sub(
613
+ r"(?:Figure|Fig\.?|Table|Exhibit|Chart|Graph)\s*\d+[a-z]?\s*(?:shows?|depicts?|illustrates?|presents?|displays?|summarizes?)",
614
+ "",
615
+ text,
616
+ flags=re.IGNORECASE,
617
+ )
618
+
619
+ # Remove section references like "Section 2.1" or "Chapter 3" (with surrounding context)
620
+ text = re.sub(
621
+ r"(?:in|see|as\s+(?:shown|described|discussed)\s+in|according\s+to)\s+(?:Section|Chapter|Part)\s*\d+(?:\.\d+)*,?\s*",
622
+ "",
623
+ text,
624
+ flags=re.IGNORECASE,
625
+ )
626
+ text = re.sub(r"(?:Section|Chapter|Part)\s*\d+(?:\.\d+)*", "", text, flags=re.IGNORECASE)
627
+
628
+ # Remove equation references like "Equation 1" or "Eq. (2)"
629
+ text = re.sub(r"(?:Equation|Eq\.?)\s*\(?\d+\)?", "", text, flags=re.IGNORECASE)
630
+
631
+ # Remove DOIs
632
+ text = re.sub(r"(?:doi:|DOI:?)\s*10\.\d{4,}/[^\s]+", "", text, flags=re.IGNORECASE)
633
+
634
+ # Remove arXiv references
635
+ text = re.sub(r"arXiv:\d{4}\.\d{4,}(?:v\d+)?", "", text, flags=re.IGNORECASE)
636
+
637
+ # Remove ISSN/ISBN numbers
638
+ text = re.sub(r"(?:ISSN|ISBN)[:\s]*[\d-]+", "", text, flags=re.IGNORECASE)
639
+
640
+ # Remove page ranges like "pp. 123-456" or "p. 42" or "pages 10-20"
641
+ text = re.sub(r"(?:p{1,2}\.?|pages?)\s*\d+(?:\s*[-–—]\s*\d+)?", "", text, flags=re.IGNORECASE)
642
+
643
+ # Remove volume/issue numbers like "Vol. 12, No. 3" (entire phrase)
644
+ text = re.sub(
645
+ r"(?:Vol(?:ume)?\.?\s*\d+,?\s*)?(?:Issue|No\.?)\s*\d+,?\s*", "", text, flags=re.IGNORECASE
646
+ )
647
+ text = re.sub(r"Vol(?:ume)?\.?\s*\d+,?\s*", "", text, flags=re.IGNORECASE)
648
+
649
+ # Remove copyright notices
650
+ text = re.sub(r"©\s*\d{4}[^.]*\.", "", text)
651
+ text = re.sub(r"Copyright\s*©?\s*\d{4}[^.]*\.", "", text, flags=re.IGNORECASE)
652
+
653
+ # Remove "All rights reserved" and similar
654
+ text = re.sub(r"All rights reserved\.?", "", text, flags=re.IGNORECASE)
655
+
656
+ # Remove asterisks used for footnote markers
657
+ text = re.sub(r"\*{1,3}(?=\s|$)", "", text)
658
+
659
+ # === NORMALIZE NEWLINES FIRST ===
660
+ # Convert various newline formats to standard \n
661
+ text = text.replace("\r\n", "\n").replace("\r", "\n")
662
+
663
+ # Replace single newlines (mid-sentence line breaks) with spaces
664
+ # Keep double newlines as paragraph separators
665
+ # First, normalize multiple newlines to exactly two
666
+ text = re.sub(r"\n{3,}", "\n\n", text)
667
+
668
+ # Replace single newlines that aren't paragraph breaks with spaces
669
+ # A single newline not preceded by sentence-ending punctuation is likely a line wrap
670
+ text = re.sub(r"(?<![.!?:\n])\n(?!\n)", " ", text)
671
+
672
  # === CODE AND TECHNICAL CONTENT ===
673
  # Handle common programming patterns that read poorly
674
 
 
746
  text = text.replace("'''", "")
747
 
748
  # === UNICODE NORMALIZATION ===
749
+
750
+ # Remove superscript characters (often footnote references)
751
+ # Includes Unicode superscript digits, letters, and modifier letters
752
+ superscripts = (
753
+ "⁰¹²³⁴⁵⁶⁷⁸⁹" # Superscript digits
754
+ "⁺⁻⁼⁽⁾" # Superscript operators
755
+ "ⁿⁱ" # Common superscript letters
756
+ "ᵃᵇᶜᵈᵉᶠᵍʰⁱʲᵏˡᵐⁿᵒᵖʳˢᵗᵘᵛʷˣʸᶻ" # Superscript lowercase
757
+ "ᴬᴮᴰᴱᴳᴴᴵᴶᴷᴸᴹᴺᴼᴾᴿᵀᵁⱽᵂᴬᴭᴮᴯᴰᴱᴲᴳᴴᴵᴶᴷᴸᴹᴺᴻᴼᴽᴾᴿᵀᵁᵂ" # Superscript uppercase
758
+ "ᶦᶧᶨᶩᶪᶫᶬᶭᶮᶯᶰᶱᶲᶳᶴᶵᶶᶷᶸᶹᶺᶻᶼᶽᶾᶿ" # More modifier letters
759
+ "ʰʱʲʳʴʵʶʷʸʹʺʻʼʽˀˁˆˇˈˉˊˋˌˍˎˏːˑ" # Modifier letters
760
+ )
761
+ for char in superscripts:
762
+ text = text.replace(char, "")
763
+
764
+ # Also use regex to catch any remaining superscript-like characters
765
+ # Unicode categories for superscripts and modifiers
766
+ text = re.sub(r"[\u2070-\u209F]", "", text) # Superscripts and Subscripts block
767
+ text = re.sub(r"[\u1D2C-\u1D6A]", "", text) # Phonetic Extensions (modifier letters)
768
+ text = re.sub(r"[\u1D78-\u1D7F]", "", text) # More phonetic extensions
769
+ text = re.sub(r"[\u02B0-\u02FF]", "", text) # Spacing Modifier Letters
770
+
771
+ # Remove subscript characters
772
+ subscripts = "₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎ₐₑₒₓₔₕₖₗₘₙₚₛₜ"
773
+ for char in subscripts:
774
+ text = text.replace(char, "")
775
+
776
  # Convert smart quotes to simple quotes
777
+ text = text.replace("\u201c", '"').replace("\u201d", '"')
778
+ text = text.replace("\u2018", "'").replace("\u2019", "'")
779
+ text = text.replace("\u201e", '"').replace("\u201f", '"')
780
 
781
  # Normalize dashes to standard hyphen or remove
782
  text = text.replace("–", "-") # en-dash
 
887
  # Remove content in angle brackets (often HTML/XML artifacts)
888
  text = re.sub(r"<[^>]+>", "", text)
889
 
 
 
 
890
  # Remove spaces before punctuation
891
  text = re.sub(r"\s+([.,;:!?])", r"\1", text)
892
 
893
  # Ensure space after punctuation (but not before another punctuation)
894
  text = re.sub(r"([.,;:!?])([^\s.,;:!?'\"])", r"\1 \2", text)
895
 
896
+ # === FINAL WHITESPACE NORMALIZATION ===
897
+ # This must happen LAST after all substitutions that can create gaps
898
 
899
+ # Collapse all whitespace (spaces, tabs, multiple spaces) to single space
900
+ # Do this per-line to preserve intentional paragraph breaks
901
+ lines = text.split("\n")
902
+ normalized_lines = []
903
+ for line in lines:
904
+ # Replace any sequence of whitespace with single space
905
+ line = re.sub(r"[ \t]+", " ", line)
906
+ # Strip leading/trailing whitespace from each line
907
+ line = line.strip()
908
+ normalized_lines.append(line)
909
+
910
+ text = "\n".join(normalized_lines)
911
+
912
+ # Remove excessive blank lines (keep max 1 blank line between paragraphs)
913
  text = re.sub(r"\n{3,}", "\n\n", text)
914
 
915
+ # Remove blank lines at start/end
916
+ text = text.strip()
917
+
918
  return text
src/talking_snake/static/app.js CHANGED
@@ -24,14 +24,15 @@ const deviceInfo = document.getElementById("deviceInfo");
24
  const docInfo = document.getElementById("docInfo");
25
  const languageButtons = document.querySelectorAll("#languageButtons .style-btn");
26
  const processingProgressBar = document.getElementById("processingProgressBar");
 
27
 
28
  // Custom player elements
29
  const playerPlayBtn = document.getElementById("playerPlayBtn");
30
  const progressBar = document.getElementById("progressBar");
31
  const progressSlider = document.getElementById("progressSlider");
32
  const timeDisplay = document.getElementById("timeDisplay");
33
- const volumeBtn = document.getElementById("volumeBtn");
34
  const downloadBtn = document.getElementById("downloadBtn");
 
35
 
36
  // Constants
37
  const MAX_FILE_SIZE = 50 * 1024 * 1024; // 50MB
@@ -39,11 +40,12 @@ const MAX_FILE_SIZE = 50 * 1024 * 1024; // 50MB
39
  // State
40
  let currentAbortController = null;
41
  let selectedLanguage = "english";
 
42
  let isPaused = false;
43
  let estimatedDuration = 0; // Estimated total duration from server
44
- let isMuted = false;
45
- let currentAudioBlob = null; // Store audio blob for download
46
  let currentDocName = ""; // Store document name for download filename
 
 
47
 
48
  /**
49
  * Format time in seconds to MM:SS
@@ -91,10 +93,32 @@ function updateDocInfo(data) {
91
  const pageInfo = data.page_count ? `<span class="doc-pages"><i class="fa-solid fa-file"></i> ${data.page_count}p</span>` : "";
92
  const charInfo = data.total_chars ? `<span class="doc-chars"><i class="fa-solid fa-font"></i> ${formatNumber(data.total_chars)}</span>` : "";
93
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
  docInfo.innerHTML = `
95
  <span class="doc-name" title="${docName}"><i class="fa-solid ${icon}"></i><span class="doc-name-text">${docName}</span></span>
96
  ${pageInfo}
97
  ${charInfo}
 
 
98
  `;
99
  }
100
 
@@ -102,16 +126,30 @@ function updateDocInfo(data) {
102
  * Update the custom player progress bar and time display
103
  */
104
  function updatePlayerProgress() {
105
- const currentTime = audio.currentTime || 0;
106
- // Use estimated duration if audio duration is unrealistic (streaming issue)
107
- let duration = audio.duration;
108
- if (!isFinite(duration) || duration > 36000 || duration <= 0) {
109
- duration = estimatedDuration || currentTime + 60; // Fallback
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
  }
111
 
112
  const progress = duration > 0 ? (currentTime / duration) * 100 : 0;
113
  progressBar.style.width = `${Math.min(progress, 100)}%`;
114
- progressSlider.value = progress;
115
  timeDisplay.textContent = `${formatTime(currentTime)} / ${formatTime(duration)}`;
116
  }
117
 
@@ -120,11 +158,19 @@ function updatePlayerProgress() {
120
  */
121
  function handleSeek(e) {
122
  const percent = parseFloat(e.target.value);
123
- let duration = audio.duration;
124
- if (!isFinite(duration) || duration > 36000) {
125
- duration = estimatedDuration || 60;
 
 
 
 
 
 
 
 
 
126
  }
127
- audio.currentTime = (percent / 100) * duration;
128
  updatePlayerProgress();
129
  }
130
 
@@ -152,13 +198,39 @@ function updatePlayButton() {
152
  }
153
 
154
  /**
155
- * Toggle mute
 
 
156
  */
157
- function toggleMute() {
158
- isMuted = !isMuted;
159
- audio.muted = isMuted;
160
- const icon = volumeBtn.querySelector("i");
161
- icon.className = isMuted ? "fa-solid fa-volume-xmark" : "fa-solid fa-volume-high";
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
162
  }
163
 
164
  /**
@@ -167,14 +239,27 @@ function toggleMute() {
167
  */
168
  function updateDeviceInfo(info) {
169
  const icon = info.device === "cuda" ? "fa-microchip" : "fa-server";
170
- const memoryInfo = info.device === "cuda"
171
- ? `${info.memory_used_gb}GB / ${info.memory_total_gb}GB (${info.memory_percent}%)`
172
- : "CPU mode";
 
 
 
 
 
 
 
 
 
 
173
  deviceInfo.innerHTML = `
174
- <i class="fa-solid ${icon}"></i>
175
- <span>${info.device_name}</span>
176
- <span class="device-memory">${memoryInfo}</span>
177
- <span class="device-batch">Batch: ${info.batch_size}</span>
 
 
 
178
  `;
179
  deviceInfo.classList.add("visible");
180
  }
@@ -207,58 +292,145 @@ initDeviceInfoStream();
207
  // Custom player event listeners
208
  playerPlayBtn.addEventListener("click", togglePlayerPlay);
209
  progressSlider.addEventListener("input", handleSeek);
210
- volumeBtn.addEventListener("click", toggleMute);
211
- audio.addEventListener("play", updatePlayButton);
212
- audio.addEventListener("pause", updatePlayButton);
 
 
 
 
 
 
 
 
 
 
213
  audio.addEventListener("timeupdate", updatePlayerProgress);
214
  audio.addEventListener("ended", () => {
 
 
 
 
 
 
 
 
 
215
  updatePlayButton();
216
  progressBar.style.width = "100%";
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
217
  });
218
  // Show pause button when audio actually starts playing
219
  audio.addEventListener("playing", () => {
 
220
  pauseBtn.classList.remove("hidden");
221
  });
222
 
 
 
 
 
 
 
 
 
 
223
  /**
224
- * Fetch audio blob from the server for download capability
225
  * @param {string} jobId - The job ID for the audio
226
  */
227
- async function fetchAudioBlob(jobId) {
228
- try {
229
- const response = await fetch(`/api/audio/${jobId}`);
230
- if (response.ok) {
231
- currentAudioBlob = await response.blob();
232
- // Show download button
233
- downloadBtn.classList.remove("hidden");
234
- }
235
- } catch (error) {
236
- console.error("Failed to fetch audio for download:", error);
237
- }
 
 
 
 
238
  }
239
 
240
  /**
241
  * Download the current audio as a WAV file
242
  */
243
  function downloadAudio() {
244
- if (!currentAudioBlob) {
 
245
  return;
246
  }
247
 
248
- const url = URL.createObjectURL(currentAudioBlob);
249
- const a = document.createElement("a");
250
- a.href = url;
251
-
252
  // Create filename from document name
253
  let filename = currentDocName || "audio";
254
- // Remove file extension if present and add .wav
255
  filename = filename.replace(/\.[^.]+$/, "") + ".wav";
256
- a.download = filename;
257
 
 
 
 
 
258
  document.body.appendChild(a);
259
  a.click();
260
  document.body.removeChild(a);
261
- URL.revokeObjectURL(url);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
262
  }
263
 
264
  /**
@@ -269,6 +441,112 @@ function getSelectedLanguage() {
269
  return selectedLanguage;
270
  }
271
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
272
  /**
273
  * Show the input section and hide processing section
274
  */
@@ -283,9 +561,10 @@ function showInputSection() {
283
  function showProcessingSection() {
284
  inputSection.classList.add("hidden");
285
  processingSection.classList.add("visible");
286
- // Reset progress bar and hide pause button
287
  processingProgressBar.style.width = "0%";
288
  pauseBtn.classList.add("hidden");
 
289
  }
290
 
291
  /**
@@ -318,10 +597,10 @@ function stopGeneration() {
318
  isPaused = false;
319
  updatePauseButton();
320
 
321
- // Hide download button and pause button
322
  downloadBtn.classList.add("hidden");
323
  pauseBtn.classList.add("hidden");
324
- currentAudioBlob = null;
325
 
326
  // Reset progress bar
327
  processingProgressBar.style.width = "0%";
@@ -370,16 +649,23 @@ function updatePauseButton() {
370
  }
371
  }
372
 
 
 
373
  /**
374
- * Format remaining time for display
375
- * @param {number} seconds - Remaining time in seconds
376
- * @returns {string} Formatted time string
377
  */
378
- function formatTimeRemaining(seconds) {
379
- if (seconds > 60) {
380
- return `~${Math.ceil(seconds / 60)} min remaining`;
 
 
 
 
 
 
381
  }
382
- return `~${Math.ceil(seconds)}s remaining`;
383
  }
384
 
385
  /**
@@ -387,15 +673,15 @@ function formatTimeRemaining(seconds) {
387
  * Sets up audio stream once job_id is received
388
  * @param {Response} response - Fetch response with SSE stream
389
  * @param {string} docName - Document name for display
 
390
  * @returns {Promise<void>}
391
  * @throws {Error} If stream contains an error event or fails
392
  */
393
- async function processStream(response, docName) {
394
  const reader = response.body.getReader();
395
  const decoder = new TextDecoder();
396
  let lastStatus = "";
397
- let jobId = null;
398
- let audioStarted = false;
399
 
400
  // Reset estimated duration
401
  estimatedDuration = 0;
@@ -419,52 +705,66 @@ async function processStream(response, docName) {
419
  throw new Error(data.message || "TTS generation failed");
420
  } else if (data.type === "start" && data.job_id) {
421
  // Got job ID - start audio stream immediately
422
- jobId = data.job_id;
423
- // Capture initial duration estimate
424
- if (data.estimated_remaining) {
425
- estimatedDuration = data.estimated_remaining;
 
426
  }
427
  // Display document info
428
  updateDocInfo(data);
429
- if (!audioStarted) {
430
- audioStarted = true;
431
- // Set audio source to stream endpoint
432
- // Browser will start playing as data arrives
433
- audio.src = `/api/audio/${jobId}`;
434
- audio.load();
435
- // Try to play (may need user interaction first time)
436
- audio.play().catch(() => {
437
- // Autoplay blocked - will play when user clicks
438
- });
439
- updatePlayButton();
440
- // Pause button will be shown by the 'playing' event listener
441
  }
442
- const timeStr = formatTimeRemaining(data.estimated_remaining);
443
  showStatus(
444
- `<span class="spinner"></span>ETA ${timeStr}`,
445
  "loading"
446
  );
447
  // Update progress bar
448
  processingProgressBar.style.width = "5%";
449
  } else if (data.type === "progress") {
450
  lastStatus = data.status;
451
- const timeStr = formatTimeRemaining(data.estimated_remaining);
452
  showStatus(
453
- `<span class="spinner"></span>${data.percent}% • ETA ${timeStr}`,
454
  "loading"
455
  );
456
  // Update progress bar
457
  processingProgressBar.style.width = `${data.percent}%`;
458
  } else if (data.type === "complete") {
459
  // Generation complete - show player
460
- // Update estimated duration based on actual processing time
461
- if (data.total_time) {
462
- // Estimate audio duration: ~0.1s per char at normal speech rate
463
- // Use total_time as a rough guide
464
- estimatedDuration = Math.max(estimatedDuration, audio.currentTime + 10);
465
  }
466
- filename.textContent = docName;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
467
  currentDocName = docName;
 
 
 
 
468
  player.classList.add("visible");
469
  // Set progress to 100%
470
  processingProgressBar.style.width = "100%";
@@ -474,11 +774,6 @@ async function processStream(response, docName) {
474
  "success"
475
  );
476
  updatePlayerProgress();
477
-
478
- // Fetch audio blob for download capability
479
- if (jobId) {
480
- fetchAudioBlob(jobId);
481
- }
482
  }
483
  } catch (parseError) {
484
  // Check if it's our thrown error or a JSON parse error
@@ -518,11 +813,11 @@ async function handleFile(file) {
518
  showStatus('<span class="spinner"></span> Extracting text...', "loading");
519
  player.classList.remove("visible");
520
  downloadBtn.classList.add("hidden");
521
- currentAudioBlob = null;
522
 
523
  const formData = new FormData();
524
  formData.append("file", file);
525
  formData.append("language", getSelectedLanguage());
 
526
 
527
  // Create abort controller for this request
528
  currentAbortController = new AbortController();
@@ -540,7 +835,7 @@ async function handleFile(file) {
540
  }
541
 
542
  // Process stream handles both progress SSE and starting audio playback
543
- await processStream(response, file.name);
544
  } catch (error) {
545
  if (error.name === "AbortError") {
546
  // User cancelled - already handled in stopGeneration
@@ -577,7 +872,6 @@ async function handleUrl(url) {
577
  showStatus('<span class="spinner"></span> Fetching content...', "loading");
578
  player.classList.remove("visible");
579
  downloadBtn.classList.add("hidden");
580
- currentAudioBlob = null;
581
  urlSubmit.disabled = true;
582
 
583
  // Create abort controller for this request
@@ -591,7 +885,8 @@ async function handleUrl(url) {
591
  },
592
  body: JSON.stringify({
593
  url,
594
- language: getSelectedLanguage()
 
595
  }),
596
  signal: currentAbortController.signal,
597
  });
@@ -606,7 +901,7 @@ async function handleUrl(url) {
606
  const docName = urlPath.split("/").pop() || "document";
607
 
608
  // Process stream handles both progress SSE and starting audio playback
609
- await processStream(response, docName);
610
  } catch (error) {
611
  if (error.name === "AbortError") {
612
  // User cancelled - already handled in stopGeneration
@@ -641,7 +936,6 @@ async function handleText(text) {
641
  showStatus('<span class="spinner"></span> Processing text...', "loading");
642
  player.classList.remove("visible");
643
  downloadBtn.classList.add("hidden");
644
- currentAudioBlob = null;
645
  textSubmit.disabled = true;
646
 
647
  // Create abort controller for this request
@@ -655,7 +949,8 @@ async function handleText(text) {
655
  },
656
  body: JSON.stringify({
657
  text,
658
- language: getSelectedLanguage()
 
659
  }),
660
  signal: currentAbortController.signal,
661
  });
@@ -665,8 +960,12 @@ async function handleText(text) {
665
  throw new Error(error.detail || "Failed to process text");
666
  }
667
 
 
 
 
 
668
  // Process stream handles both progress SSE and starting audio playback
669
- await processStream(response, "Pasted Text");
670
  } catch (error) {
671
  if (error.name === "AbortError") {
672
  // User cancelled - already handled in stopGeneration
@@ -683,6 +982,15 @@ async function handleText(text) {
683
  // Tab switching
684
  tabs.forEach((tab) => {
685
  tab.addEventListener("click", () => {
 
 
 
 
 
 
 
 
 
686
  tabs.forEach((t) => t.classList.remove("active"));
687
  tabContents.forEach((tc) => tc.classList.remove("active"));
688
  tab.classList.add("active");
@@ -712,7 +1020,7 @@ dropZone.addEventListener("drop", (e) => {
712
 
713
  // Click to select file
714
  dropZone.addEventListener("click", (e) => {
715
- if (e.target !== fileInput && !e.target.classList.contains("file-label")) {
716
  fileInput.click();
717
  }
718
  });
@@ -746,15 +1054,34 @@ textInput.addEventListener("keydown", (e) => {
746
  }
747
  });
748
 
 
 
 
 
 
 
 
 
749
  // Stop button
750
  stopBtn.addEventListener("click", stopGeneration);
751
 
 
 
 
 
 
 
 
 
752
  // Pause button
753
  pauseBtn.addEventListener("click", togglePause);
754
 
755
  // Download button
756
  downloadBtn.addEventListener("click", downloadAudio);
757
 
 
 
 
758
  // Update pause button when audio state changes
759
  audio.addEventListener("play", updatePauseButton);
760
  audio.addEventListener("pause", updatePauseButton);
@@ -766,8 +1093,16 @@ audio.addEventListener("ended", () => {
766
  // Language selection
767
  languageButtons.forEach((btn) => {
768
  btn.addEventListener("click", () => {
769
- languageButtons.forEach((b) => b.classList.remove("active"));
 
 
 
 
 
 
 
 
770
  btn.classList.add("active");
771
- selectedLanguage = btn.dataset.language;
772
  });
773
  });
 
24
  const docInfo = document.getElementById("docInfo");
25
  const languageButtons = document.querySelectorAll("#languageButtons .style-btn");
26
  const processingProgressBar = document.getElementById("processingProgressBar");
27
+ const streamPlayBtn = document.getElementById("streamPlayBtn");
28
 
29
  // Custom player elements
30
  const playerPlayBtn = document.getElementById("playerPlayBtn");
31
  const progressBar = document.getElementById("progressBar");
32
  const progressSlider = document.getElementById("progressSlider");
33
  const timeDisplay = document.getElementById("timeDisplay");
 
34
  const downloadBtn = document.getElementById("downloadBtn");
35
+ const deleteBtn = document.getElementById("deleteBtn");
36
 
37
  // Constants
38
  const MAX_FILE_SIZE = 50 * 1024 * 1024; // 50MB
 
40
  // State
41
  let currentAbortController = null;
42
  let selectedLanguage = "english";
43
+ let selectedStyle = "technical";
44
  let isPaused = false;
45
  let estimatedDuration = 0; // Estimated total duration from server
 
 
46
  let currentDocName = ""; // Store document name for download filename
47
+ let playbackStartTime = 0; // When playback started (for tracking real elapsed time)
48
+ let playbackElapsed = 0; // Total elapsed playback time
49
 
50
  /**
51
  * Format time in seconds to MM:SS
 
93
  const pageInfo = data.page_count ? `<span class="doc-pages"><i class="fa-solid fa-file"></i> ${data.page_count}p</span>` : "";
94
  const charInfo = data.total_chars ? `<span class="doc-chars"><i class="fa-solid fa-font"></i> ${formatNumber(data.total_chars)}</span>` : "";
95
 
96
+ // Style icons mapping
97
+ const styleIcons = {
98
+ technical: "fa-microchip",
99
+ narrative: "fa-book-open",
100
+ child_narrative: "fa-child",
101
+ news: "fa-newspaper",
102
+ academic: "fa-graduation-cap"
103
+ };
104
+
105
+ // Language flags mapping
106
+ const langFlags = {
107
+ english: "🇬🇧",
108
+ chinese: "🇨🇳",
109
+ japanese: "🇯🇵",
110
+ korean: "🇰🇷"
111
+ };
112
+
113
+ const styleIcon = styleIcons[selectedStyle] || "fa-microchip";
114
+ const langFlag = langFlags[selectedLanguage] || "🇬🇧";
115
+
116
  docInfo.innerHTML = `
117
  <span class="doc-name" title="${docName}"><i class="fa-solid ${icon}"></i><span class="doc-name-text">${docName}</span></span>
118
  ${pageInfo}
119
  ${charInfo}
120
+ <span class="doc-style" title="Style: ${selectedStyle}"><i class="fa-solid ${styleIcon}"></i></span>
121
+ <span class="doc-lang" title="Language: ${selectedLanguage}">${langFlag}</span>
122
  `;
123
  }
124
 
 
126
  * Update the custom player progress bar and time display
127
  */
128
  function updatePlayerProgress() {
129
+ // For streaming WAV, browser's duration/currentTime are unreliable
130
+ // Track real playback time ourselves
131
+ let currentTime;
132
+ if (playbackStartTime > 0 && !audio.paused) {
133
+ currentTime = playbackElapsed + (Date.now() - playbackStartTime) / 1000;
134
+ } else {
135
+ currentTime = playbackElapsed;
136
+ }
137
+
138
+ // Use our estimated duration, update it if playback exceeds estimate
139
+ let duration = estimatedDuration;
140
+ if (currentTime > duration) {
141
+ estimatedDuration = currentTime + 10; // Extend estimate
142
+ duration = estimatedDuration;
143
+ }
144
+
145
+ // Ensure we have reasonable values
146
+ if (duration <= 0) {
147
+ duration = 60; // Fallback
148
  }
149
 
150
  const progress = duration > 0 ? (currentTime / duration) * 100 : 0;
151
  progressBar.style.width = `${Math.min(progress, 100)}%`;
152
+ progressSlider.value = Math.min(progress, 100);
153
  timeDisplay.textContent = `${formatTime(currentTime)} / ${formatTime(duration)}`;
154
  }
155
 
 
158
  */
159
  function handleSeek(e) {
160
  const percent = parseFloat(e.target.value);
161
+ const duration = estimatedDuration || 60;
162
+ const seekTime = (percent / 100) * duration;
163
+
164
+ // Set our playback tracker
165
+ playbackElapsed = seekTime;
166
+ playbackStartTime = audio.paused ? 0 : Date.now();
167
+
168
+ // Try to seek the audio (may not work well with streaming)
169
+ try {
170
+ audio.currentTime = seekTime;
171
+ } catch {
172
+ // Seeking may fail with streaming audio
173
  }
 
174
  updatePlayerProgress();
175
  }
176
 
 
198
  }
199
 
200
  /**
201
+ * Get HTML for model state indicator
202
+ * @param {string} state - Model state: loaded, loading, unloaded, unloading
203
+ * @returns {string} HTML string for the model state indicator
204
  */
205
+ function getModelStateHtml(state) {
206
+ const stateConfig = {
207
+ loaded: {
208
+ icon: "fa-circle-check",
209
+ class: "model-loaded",
210
+ text: "Model loaded",
211
+ tooltip: "TTS model is loaded in memory and ready for inference"
212
+ },
213
+ loading: {
214
+ icon: "fa-spinner fa-spin",
215
+ class: "model-loading",
216
+ text: "Loading...",
217
+ tooltip: "TTS model is being loaded into memory"
218
+ },
219
+ unloaded: {
220
+ icon: "fa-circle-xmark",
221
+ class: "model-unloaded",
222
+ text: "Model unloaded",
223
+ tooltip: "TTS model is not loaded (will load on first request)"
224
+ },
225
+ unloading: {
226
+ icon: "fa-spinner fa-spin",
227
+ class: "model-unloading",
228
+ text: "Unloading...",
229
+ tooltip: "TTS model is being unloaded from memory"
230
+ }
231
+ };
232
+ const config = stateConfig[state] || stateConfig.unloaded;
233
+ return `<span class="model-state ${config.class}" title="${config.tooltip}"><i class="fa-solid ${config.icon}"></i> ${config.text}</span>`;
234
  }
235
 
236
  /**
 
239
  */
240
  function updateDeviceInfo(info) {
241
  const icon = info.device === "cuda" ? "fa-microchip" : "fa-server";
242
+ const deviceTooltip = info.device === "cuda"
243
+ ? "GPU accelerated inference for faster audio generation"
244
+ : "CPU-based inference (slower than GPU)";
245
+ const gpuMemoryInfo = info.device === "cuda"
246
+ ? `<span class="device-memory" title="GPU memory used for model and inference"><i class="fa-solid fa-memory"></i> GPU: ${info.memory_used_gb}/${info.memory_total_gb}GB</span>`
247
+ : "";
248
+ const ramInfo = `<span class="device-memory" title="System RAM usage"><i class="fa-solid fa-memory"></i> RAM: ${info.ram_used_gb}/${info.ram_total_gb}GB</span>`;
249
+ // Show timing stats if available
250
+ const timingInfo = info.seconds_per_char !== undefined
251
+ ? `<span class="device-timing" title="Average time to generate audio per character of text"><i class="fa-solid fa-stopwatch"></i> ${info.seconds_per_char.toFixed(4)}s/char</span>`
252
+ : "";
253
+ // Show model state
254
+ const modelStateInfo = getModelStateHtml(info.model_state);
255
  deviceInfo.innerHTML = `
256
+ <i class="fa-solid ${icon}" title="${deviceTooltip}"></i>
257
+ <span title="${deviceTooltip}">${info.device_name}</span>
258
+ ${modelStateInfo}
259
+ ${gpuMemoryInfo}
260
+ ${ramInfo}
261
+ ${timingInfo}
262
+ <span class="device-ephemeral" title="Your documents are processed in memory only. Nothing is saved to disk or stored after processing."><i class="fa-solid fa-shield-halved"></i> No files stored</span>
263
  `;
264
  deviceInfo.classList.add("visible");
265
  }
 
292
  // Custom player event listeners
293
  playerPlayBtn.addEventListener("click", togglePlayerPlay);
294
  progressSlider.addEventListener("input", handleSeek);
295
+ audio.addEventListener("play", () => {
296
+ // Start tracking real playback time
297
+ playbackStartTime = Date.now();
298
+ updatePlayButton();
299
+ });
300
+ audio.addEventListener("pause", () => {
301
+ // Save elapsed time when pausing
302
+ if (playbackStartTime > 0) {
303
+ playbackElapsed += (Date.now() - playbackStartTime) / 1000;
304
+ playbackStartTime = 0;
305
+ }
306
+ updatePlayButton();
307
+ });
308
  audio.addEventListener("timeupdate", updatePlayerProgress);
309
  audio.addEventListener("ended", () => {
310
+ // Update elapsed to match duration on completion
311
+ if (playbackStartTime > 0) {
312
+ playbackElapsed += (Date.now() - playbackStartTime) / 1000;
313
+ playbackStartTime = 0;
314
+ }
315
+ // Ensure we show completion
316
+ if (estimatedDuration > 0 && playbackElapsed < estimatedDuration) {
317
+ playbackElapsed = estimatedDuration;
318
+ }
319
  updatePlayButton();
320
  progressBar.style.width = "100%";
321
+ timeDisplay.textContent = `${formatTime(estimatedDuration)} / ${formatTime(estimatedDuration)}`;
322
+ });
323
+ // Update duration when metadata is available
324
+ audio.addEventListener("loadedmetadata", () => {
325
+ // If browser has a valid duration, use it instead of estimate
326
+ if (isFinite(audio.duration) && audio.duration > 0 && audio.duration < 36000) {
327
+ estimatedDuration = audio.duration;
328
+ }
329
+ updatePlayerProgress();
330
+ });
331
+ // Also check duration changes (for streaming audio)
332
+ audio.addEventListener("durationchange", () => {
333
+ if (isFinite(audio.duration) && audio.duration > 0 && audio.duration < 36000) {
334
+ estimatedDuration = audio.duration;
335
+ }
336
+ updatePlayerProgress();
337
+ });
338
+ // Log audio errors for debugging
339
+ audio.addEventListener("error", () => {
340
+ console.error("Audio error:", audio.error?.message || "Unknown error");
341
  });
342
  // Show pause button when audio actually starts playing
343
  audio.addEventListener("playing", () => {
344
+ streamPlayBtn.classList.add("hidden");
345
  pauseBtn.classList.remove("hidden");
346
  });
347
 
348
+ // Show stream play button when audio has enough data to start playing
349
+ audio.addEventListener("canplay", () => {
350
+ // Only show if processing is still in progress (player not visible yet)
351
+ // and audio is paused (not already playing) and pause button isn't showing
352
+ if (!player.classList.contains("visible") && audio.paused && pauseBtn.classList.contains("hidden")) {
353
+ streamPlayBtn.classList.remove("hidden");
354
+ }
355
+ });
356
+
357
  /**
358
+ * Start streaming audio playback and enable download from cache
359
  * @param {string} jobId - The job ID for the audio
360
  */
361
+ async function startAudioStream(jobId) {
362
+ const audioUrl = `/api/audio/${jobId}`;
363
+
364
+ // Reset playback tracking for new stream
365
+ playbackStartTime = 0;
366
+ playbackElapsed = 0;
367
+
368
+ // Set up audio source for streaming (user can click play)
369
+ audio.src = audioUrl;
370
+ audio.load();
371
+
372
+ // Store job ID for download - will fetch from cache
373
+ audio.dataset.jobId = jobId;
374
+
375
+ // Play button will be shown by the canplay event handler
376
  }
377
 
378
  /**
379
  * Download the current audio as a WAV file
380
  */
381
  function downloadAudio() {
382
+ const jobId = audio.dataset.jobId;
383
+ if (!jobId) {
384
  return;
385
  }
386
 
 
 
 
 
387
  // Create filename from document name
388
  let filename = currentDocName || "audio";
 
389
  filename = filename.replace(/\.[^.]+$/, "") + ".wav";
 
390
 
391
+ // Use download endpoint which returns proper WAV file
392
+ const a = document.createElement("a");
393
+ a.href = `/api/download/${jobId}?filename=${encodeURIComponent(filename)}`;
394
+ a.download = filename;
395
  document.body.appendChild(a);
396
  a.click();
397
  document.body.removeChild(a);
398
+ }
399
+
400
+ /**
401
+ * Delete the current audio and reset the player
402
+ */
403
+ function deleteAudio() {
404
+ // Stop audio immediately
405
+ audio.pause();
406
+
407
+ // Add deleting animation
408
+ player.classList.add("deleting");
409
+
410
+ // Wait for animation to complete
411
+ setTimeout(() => {
412
+ // Reset audio
413
+ audio.src = "";
414
+ audio.currentTime = 0;
415
+
416
+ // Clear state
417
+ currentDocName = "";
418
+ estimatedDuration = 0;
419
+
420
+ // Hide player and buttons
421
+ player.classList.remove("visible", "deleting");
422
+ downloadBtn.classList.add("hidden");
423
+ deleteBtn.classList.add("hidden");
424
+
425
+ // Reset progress
426
+ progressBar.style.width = "0%";
427
+ progressSlider.value = 0;
428
+ timeDisplay.textContent = "0:00 / 0:00";
429
+ updatePlayButton();
430
+
431
+ // Show input section again
432
+ inputSection.classList.remove("hidden");
433
+ }, 300);
434
  }
435
 
436
  /**
 
441
  return selectedLanguage;
442
  }
443
 
444
+ /**
445
+ * Detect language from text based on character scripts.
446
+ * @param {string} text - The text to analyze
447
+ * @returns {string|null} Detected language or null if mostly ASCII/Latin
448
+ */
449
+ function detectLanguage(text) {
450
+ if (!text || text.length < 5) {
451
+ return null;
452
+ }
453
+
454
+ let chinese = 0;
455
+ let japanese = 0; // Hiragana + Katakana
456
+ let korean = 0;
457
+ let latin = 0;
458
+
459
+ for (const char of text) {
460
+ const code = char.charCodeAt(0);
461
+ // CJK Unified Ideographs (shared by Chinese/Japanese)
462
+ if (code >= 0x4e00 && code <= 0x9fff) {
463
+ chinese++;
464
+ }
465
+ // Hiragana
466
+ else if (code >= 0x3040 && code <= 0x309f) {
467
+ japanese++;
468
+ }
469
+ // Katakana
470
+ else if (code >= 0x30a0 && code <= 0x30ff) {
471
+ japanese++;
472
+ }
473
+ // Hangul Syllables
474
+ else if (code >= 0xac00 && code <= 0xd7af) {
475
+ korean++;
476
+ }
477
+ // Hangul Jamo
478
+ else if (code >= 0x1100 && code <= 0x11ff) {
479
+ korean++;
480
+ }
481
+ // Basic Latin letters
482
+ else if (
483
+ (code >= 0x41 && code <= 0x5a) ||
484
+ (code >= 0x61 && code <= 0x7a)
485
+ ) {
486
+ latin++;
487
+ }
488
+ }
489
+
490
+ const total = chinese + japanese + korean + latin;
491
+ if (total === 0) {
492
+ return null;
493
+ }
494
+
495
+ // Japanese uses kanji (chinese chars) + kana, so check for kana first
496
+ if (japanese > 0 && (japanese + chinese) / total > 0.3) {
497
+ return "japanese";
498
+ }
499
+ // Korean
500
+ if (korean / total > 0.3) {
501
+ return "korean";
502
+ }
503
+ // Chinese (CJK without kana)
504
+ if (chinese / total > 0.3) {
505
+ return "chinese";
506
+ }
507
+ // Default to English for Latin text
508
+ if (latin / total > 0.5) {
509
+ return "english";
510
+ }
511
+ return null;
512
+ }
513
+
514
+ /**
515
+ * Set the selected language, optionally marking it as auto-detected.
516
+ * @param {string} lang - Language to select
517
+ * @param {boolean} isAuto - Whether this was auto-detected
518
+ */
519
+ function setLanguage(lang, isAuto = false) {
520
+ const btn = document.querySelector(
521
+ `#languageButtons .style-btn[data-language="${lang}"]`
522
+ );
523
+ if (!btn || selectedLanguage === lang) {
524
+ return;
525
+ }
526
+
527
+ // Update selection state
528
+ languageButtons.forEach((b) => {
529
+ b.classList.remove("active", "auto-detected");
530
+ });
531
+ btn.classList.add("active");
532
+ selectedLanguage = lang;
533
+
534
+ // Visual feedback for auto-detection
535
+ if (isAuto) {
536
+ btn.classList.add("auto-detected");
537
+ // Remove animation class after it completes
538
+ setTimeout(() => btn.classList.remove("auto-detected"), 1500);
539
+ }
540
+ }
541
+
542
+ /**
543
+ * Get the currently selected style
544
+ * @returns {string} The selected style ID
545
+ */
546
+ function getSelectedStyle() {
547
+ return selectedStyle;
548
+ }
549
+
550
  /**
551
  * Show the input section and hide processing section
552
  */
 
561
  function showProcessingSection() {
562
  inputSection.classList.add("hidden");
563
  processingSection.classList.add("visible");
564
+ // Reset progress bar and hide buttons
565
  processingProgressBar.style.width = "0%";
566
  pauseBtn.classList.add("hidden");
567
+ streamPlayBtn.classList.add("hidden");
568
  }
569
 
570
  /**
 
597
  isPaused = false;
598
  updatePauseButton();
599
 
600
+ // Hide download button, pause button, and stream play button
601
  downloadBtn.classList.add("hidden");
602
  pauseBtn.classList.add("hidden");
603
+ streamPlayBtn.classList.add("hidden");
604
 
605
  // Reset progress bar
606
  processingProgressBar.style.width = "0%";
 
649
  }
650
  }
651
 
652
+
653
+
654
  /**
655
+ * Get icon class for source type
656
+ * @param {string} sourceType - The source type ("pdf", "url", "text")
657
+ * @returns {string} Font Awesome icon class
658
  */
659
+ function getSourceIcon(sourceType) {
660
+ switch (sourceType) {
661
+ case "pdf":
662
+ return "fa-file-pdf";
663
+ case "url":
664
+ return "fa-link";
665
+ case "text":
666
+ default:
667
+ return "fa-keyboard";
668
  }
 
669
  }
670
 
671
  /**
 
673
  * Sets up audio stream once job_id is received
674
  * @param {Response} response - Fetch response with SSE stream
675
  * @param {string} docName - Document name for display
676
+ * @param {string} sourceType - Source type ("pdf", "url", "text")
677
  * @returns {Promise<void>}
678
  * @throws {Error} If stream contains an error event or fails
679
  */
680
+ async function processStream(response, docName, sourceType = "text") {
681
  const reader = response.body.getReader();
682
  const decoder = new TextDecoder();
683
  let lastStatus = "";
684
+ let audioJobId = null;
 
685
 
686
  // Reset estimated duration
687
  estimatedDuration = 0;
 
705
  throw new Error(data.message || "TTS generation failed");
706
  } else if (data.type === "start" && data.job_id) {
707
  // Got job ID - start audio stream immediately
708
+ const jobId = data.job_id;
709
+ // Estimate audio duration from character count
710
+ // Typical speech is ~14 chars/sec (150 wpm, 5 chars/word)
711
+ if (data.total_chars) {
712
+ estimatedDuration = data.total_chars / 14;
713
  }
714
  // Display document info
715
  updateDocInfo(data);
716
+ if (!audioJobId) {
717
+ audioJobId = jobId;
718
+ // Start streaming playback immediately
719
+ startAudioStream(jobId);
 
 
 
 
 
 
 
 
720
  }
721
+ // Show generating status
722
  showStatus(
723
+ '<span class="spinner"></span> Generating...',
724
  "loading"
725
  );
726
  // Update progress bar
727
  processingProgressBar.style.width = "5%";
728
  } else if (data.type === "progress") {
729
  lastStatus = data.status;
730
+ // Show progress percentage
731
  showStatus(
732
+ `<span class="spinner"></span> ${data.percent}%`,
733
  "loading"
734
  );
735
  // Update progress bar
736
  processingProgressBar.style.width = `${data.percent}%`;
737
  } else if (data.type === "complete") {
738
  // Generation complete - show player
739
+ // Use actual audio duration from server if available
740
+ if (data.audio_duration && data.audio_duration > 0) {
741
+ estimatedDuration = data.audio_duration;
 
 
742
  }
743
+ // Build filename with style and language indicators
744
+ const styleIcons = {
745
+ technical: "fa-microchip",
746
+ conversational: "fa-comments",
747
+ storytelling: "fa-book-open",
748
+ child_narrative: "fa-child",
749
+ news: "fa-newspaper",
750
+ academic: "fa-graduation-cap"
751
+ };
752
+ const langFlags = {
753
+ english: "🇬🇧",
754
+ chinese: "🇨🇳",
755
+ japanese: "🇯🇵",
756
+ korean: "🇰🇷"
757
+ };
758
+ const usedStyle = getSelectedStyle();
759
+ const usedLang = getSelectedLanguage();
760
+ const styleIcon = styleIcons[usedStyle] || "fa-microchip";
761
+ const langFlag = langFlags[usedLang] || "🇬🇧";
762
+ filename.innerHTML = `<i class="fa-solid ${getSourceIcon(sourceType)}"></i> ${docName} <span class="filename-meta"><i class="fa-solid ${styleIcon}" title="Style: ${usedStyle}"></i><span title="Language: ${usedLang}">${langFlag}</span></span>`;
763
  currentDocName = docName;
764
+ // Hide stream buttons, show full player with download
765
+ streamPlayBtn.classList.add("hidden");
766
+ downloadBtn.classList.remove("hidden");
767
+ deleteBtn.classList.remove("hidden");
768
  player.classList.add("visible");
769
  // Set progress to 100%
770
  processingProgressBar.style.width = "100%";
 
774
  "success"
775
  );
776
  updatePlayerProgress();
 
 
 
 
 
777
  }
778
  } catch (parseError) {
779
  // Check if it's our thrown error or a JSON parse error
 
813
  showStatus('<span class="spinner"></span> Extracting text...', "loading");
814
  player.classList.remove("visible");
815
  downloadBtn.classList.add("hidden");
 
816
 
817
  const formData = new FormData();
818
  formData.append("file", file);
819
  formData.append("language", getSelectedLanguage());
820
+ formData.append("style", getSelectedStyle());
821
 
822
  // Create abort controller for this request
823
  currentAbortController = new AbortController();
 
835
  }
836
 
837
  // Process stream handles both progress SSE and starting audio playback
838
+ await processStream(response, file.name, "pdf");
839
  } catch (error) {
840
  if (error.name === "AbortError") {
841
  // User cancelled - already handled in stopGeneration
 
872
  showStatus('<span class="spinner"></span> Fetching content...', "loading");
873
  player.classList.remove("visible");
874
  downloadBtn.classList.add("hidden");
 
875
  urlSubmit.disabled = true;
876
 
877
  // Create abort controller for this request
 
885
  },
886
  body: JSON.stringify({
887
  url,
888
+ language: getSelectedLanguage(),
889
+ style: getSelectedStyle()
890
  }),
891
  signal: currentAbortController.signal,
892
  });
 
901
  const docName = urlPath.split("/").pop() || "document";
902
 
903
  // Process stream handles both progress SSE and starting audio playback
904
+ await processStream(response, docName, "url");
905
  } catch (error) {
906
  if (error.name === "AbortError") {
907
  // User cancelled - already handled in stopGeneration
 
936
  showStatus('<span class="spinner"></span> Processing text...', "loading");
937
  player.classList.remove("visible");
938
  downloadBtn.classList.add("hidden");
 
939
  textSubmit.disabled = true;
940
 
941
  // Create abort controller for this request
 
949
  },
950
  body: JSON.stringify({
951
  text,
952
+ language: getSelectedLanguage(),
953
+ style: getSelectedStyle()
954
  }),
955
  signal: currentAbortController.signal,
956
  });
 
960
  throw new Error(error.detail || "Failed to process text");
961
  }
962
 
963
+ // Generate document name from first few words
964
+ const words = text.trim().split(/\s+/).slice(0, 5).join(" ");
965
+ const docName = words.length > 30 ? words.slice(0, 30) + "..." : words;
966
+
967
  // Process stream handles both progress SSE and starting audio playback
968
+ await processStream(response, docName, "text");
969
  } catch (error) {
970
  if (error.name === "AbortError") {
971
  // User cancelled - already handled in stopGeneration
 
982
  // Tab switching
983
  tabs.forEach((tab) => {
984
  tab.addEventListener("click", () => {
985
+ const isAlreadyActive = tab.classList.contains("active");
986
+ const isUploadTab = tab.dataset.tab === "upload";
987
+
988
+ // If clicking on already-active upload tab, open file picker
989
+ if (isAlreadyActive && isUploadTab) {
990
+ fileInput.click();
991
+ return;
992
+ }
993
+
994
  tabs.forEach((t) => t.classList.remove("active"));
995
  tabContents.forEach((tc) => tc.classList.remove("active"));
996
  tab.classList.add("active");
 
1020
 
1021
  // Click to select file
1022
  dropZone.addEventListener("click", (e) => {
1023
+ if (e.target !== fileInput) {
1024
  fileInput.click();
1025
  }
1026
  });
 
1054
  }
1055
  });
1056
 
1057
+ // Auto-detect language from text input
1058
+ textInput.addEventListener("input", () => {
1059
+ const detected = detectLanguage(textInput.value);
1060
+ if (detected) {
1061
+ setLanguage(detected, true);
1062
+ }
1063
+ });
1064
+
1065
  // Stop button
1066
  stopBtn.addEventListener("click", stopGeneration);
1067
 
1068
+ // Stream play button (during processing)
1069
+ streamPlayBtn.addEventListener("click", () => {
1070
+ audio.play().catch(() => {});
1071
+ // Hide stream play button and show pause button
1072
+ streamPlayBtn.classList.add("hidden");
1073
+ pauseBtn.classList.remove("hidden");
1074
+ });
1075
+
1076
  // Pause button
1077
  pauseBtn.addEventListener("click", togglePause);
1078
 
1079
  // Download button
1080
  downloadBtn.addEventListener("click", downloadAudio);
1081
 
1082
+ // Delete button
1083
+ deleteBtn.addEventListener("click", deleteAudio);
1084
+
1085
  // Update pause button when audio state changes
1086
  audio.addEventListener("play", updatePauseButton);
1087
  audio.addEventListener("pause", updatePauseButton);
 
1093
  // Language selection
1094
  languageButtons.forEach((btn) => {
1095
  btn.addEventListener("click", () => {
1096
+ setLanguage(btn.dataset.language, false);
1097
+ });
1098
+ });
1099
+
1100
+ // Style selection
1101
+ const styleButtons = document.querySelectorAll("#styleButtons .style-btn");
1102
+ styleButtons.forEach((btn) => {
1103
+ btn.addEventListener("click", () => {
1104
+ styleButtons.forEach((b) => b.classList.remove("active"));
1105
  btn.classList.add("active");
1106
+ selectedStyle = btn.dataset.style;
1107
  });
1108
  });
src/talking_snake/static/index.html CHANGED
@@ -26,6 +26,9 @@
26
  <link rel="icon" type="image/png" sizes="512x512" href="/static/icon-512.png">
27
 
28
  <link rel="stylesheet" href="/static/styles.css">
 
 
 
29
  <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.5.1/css/all.min.css" integrity="sha512-DTOQO9RWCH3ppGqcWaEA1BIZOC6xxalwEsw9c2QQeAIftl+Vegovlnee1c9QX4TctnWMn13TZye+giMm8e2LwA==" crossorigin="anonymous" referrerpolicy="no-referrer">
30
  <script src="https://unpkg.com/htmx.org@2.0.4"></script>
31
  </head>
@@ -33,26 +36,37 @@
33
  <div class="main-content">
34
  <img src="/static/talking_snake.png" alt="Talking Snake" class="logo">
35
  <h1>Talking Snake</h1>
36
- <p class="subtitle">Transform PDFs & Web into Audio</p>
37
 
38
  <div class="container">
39
  <div class="input-section" id="inputSection">
40
  <div class="options-row">
41
- <div class="language-selector">
42
- <span class="style-label">Language:</span>
43
- <div class="style-buttons" id="languageButtons">
44
- <button class="style-btn lang-btn active" data-language="english" title="English">
45
- 🇬🇧
46
  </button>
47
- <button class="style-btn lang-btn" data-language="chinese" title="Chinese">
48
- 🇨🇳
49
  </button>
50
- <button class="style-btn lang-btn" data-language="japanese" title="Japanese">
51
- 🇯🇵
52
  </button>
53
- <button class="style-btn lang-btn" data-language="korean" title="Korean">
54
- 🇰🇷
55
  </button>
 
 
 
 
 
 
 
 
 
 
 
 
56
  </div>
57
  </div>
58
  </div>
@@ -67,19 +81,16 @@
67
  <div class="drop-zone" id="dropZone">
68
  <i class="fa-solid fa-file-pdf drop-icon"></i>
69
  <p>Drag & drop a PDF here</p>
70
- <label class="file-label">
71
- <i class="fa-solid fa-folder-open"></i> Choose File
72
- <input type="file" id="fileInput" accept=".pdf">
73
- </label>
74
- <p class="hint">Supports PDF documents up to 50MB</p>
75
  </div>
76
  </div>
77
 
78
  <div class="tab-content" id="url-tab">
79
  <div class="url-form">
80
- <input type="url" id="urlInput" placeholder="https://example.com/article or .pdf">
81
- <button class="submit-btn" id="urlSubmit"><i class="fa-solid fa-microphone"></i> Read Content</button>
82
- <p class="hint">Enter a link to a PDF or web page (articles, docs, blogs)</p>
 
83
  </div>
84
  </div>
85
 
@@ -87,7 +98,6 @@
87
  <div class="text-form">
88
  <textarea id="textInput" placeholder="Paste or type your text here..." rows="6"></textarea>
89
  <button class="submit-btn" id="textSubmit"><i class="fa-solid fa-microphone"></i> Read Text</button>
90
- <p class="hint">Paste any text you want to hear read aloud</p>
91
  </div>
92
  </div>
93
  </div>
@@ -102,6 +112,7 @@
102
  <div class="processing-progress-bar" id="processingProgressBar"></div>
103
  </div>
104
  <div class="control-buttons">
 
105
  <button class="control-btn pause-btn hidden" id="pauseBtn" title="Pause/Resume"><i class="fa-solid fa-pause"></i></button>
106
  <button class="control-btn stop-btn" id="stopBtn" title="Stop generation"><i class="fa-solid fa-stop"></i></button>
107
  </div>
@@ -121,12 +132,12 @@
121
  <input type="range" class="progress-slider" id="progressSlider" min="0" max="100" value="0">
122
  </div>
123
  <span class="time-display" id="timeDisplay">0:00 / 0:00</span>
124
- <button class="player-btn volume-btn" id="volumeBtn" title="Mute/Unmute">
125
- <i class="fa-solid fa-volume-high"></i>
126
- </button>
127
  <button class="player-btn download-btn hidden" id="downloadBtn" title="Download Audio">
128
  <i class="fa-solid fa-download"></i>
129
  </button>
 
 
 
130
  </div>
131
  <audio id="audio" preload="auto"></audio>
132
  </div>
 
26
  <link rel="icon" type="image/png" sizes="512x512" href="/static/icon-512.png">
27
 
28
  <link rel="stylesheet" href="/static/styles.css">
29
+ <link rel="preconnect" href="https://fonts.googleapis.com">
30
+ <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
31
+ <link rel="stylesheet" href="https://fonts.googleapis.com/css2?family=Fredoka:wght@500&display=swap">
32
  <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.5.1/css/all.min.css" integrity="sha512-DTOQO9RWCH3ppGqcWaEA1BIZOC6xxalwEsw9c2QQeAIftl+Vegovlnee1c9QX4TctnWMn13TZye+giMm8e2LwA==" crossorigin="anonymous" referrerpolicy="no-referrer">
33
  <script src="https://unpkg.com/htmx.org@2.0.4"></script>
34
  </head>
 
36
  <div class="main-content">
37
  <img src="/static/talking_snake.png" alt="Talking Snake" class="logo">
38
  <h1>Talking Snake</h1>
 
39
 
40
  <div class="container">
41
  <div class="input-section" id="inputSection">
42
  <div class="options-row">
43
+ <div class="style-selector">
44
+ <span class="style-label">Style:</span>
45
+ <div class="style-buttons" id="styleButtons">
46
+ <button class="style-btn active" data-style="technical" title="Clear, precise reading for code and technical documentation">
47
+ <i class="fa-solid fa-microchip"></i>
48
  </button>
49
+ <button class="style-btn" data-style="narrative" title="Natural, engaging reading for articles and stories">
50
+ <i class="fa-solid fa-book-open"></i>
51
  </button>
52
+ <button class="style-btn" data-style="child_narrative" title="Playful, expressive reading for children's stories">
53
+ <i class="fa-solid fa-child"></i>
54
  </button>
55
+ <button class="style-btn" data-style="news" title="Authoritative, clear delivery for news and reports">
56
+ <i class="fa-solid fa-newspaper"></i>
57
  </button>
58
+ <button class="style-btn" data-style="academic" title="Measured, scholarly reading for papers and research">
59
+ <i class="fa-solid fa-graduation-cap"></i>
60
+ </button>
61
+ </div>
62
+ </div>
63
+ <div class="language-selector">
64
+ <span class="style-label">Language:</span>
65
+ <div class="style-buttons" id="languageButtons">
66
+ <button class="style-btn lang-btn active" data-language="english" title="English">🇬🇧</button>
67
+ <button class="style-btn lang-btn" data-language="chinese" title="Chinese">🇨🇳</button>
68
+ <button class="style-btn lang-btn" data-language="japanese" title="Japanese">🇯🇵</button>
69
+ <button class="style-btn lang-btn" data-language="korean" title="Korean">🇰🇷</button>
70
  </div>
71
  </div>
72
  </div>
 
81
  <div class="drop-zone" id="dropZone">
82
  <i class="fa-solid fa-file-pdf drop-icon"></i>
83
  <p>Drag & drop a PDF here</p>
84
+ <input type="file" id="fileInput" accept=".pdf" class="hidden-file-input">
 
 
 
 
85
  </div>
86
  </div>
87
 
88
  <div class="tab-content" id="url-tab">
89
  <div class="url-form">
90
+ <div class="url-input-row">
91
+ <input type="url" id="urlInput" placeholder="https://example.com/article or .pdf">
92
+ <button class="submit-btn" id="urlSubmit"><i class="fa-solid fa-microphone"></i></button>
93
+ </div>
94
  </div>
95
  </div>
96
 
 
98
  <div class="text-form">
99
  <textarea id="textInput" placeholder="Paste or type your text here..." rows="6"></textarea>
100
  <button class="submit-btn" id="textSubmit"><i class="fa-solid fa-microphone"></i> Read Text</button>
 
101
  </div>
102
  </div>
103
  </div>
 
112
  <div class="processing-progress-bar" id="processingProgressBar"></div>
113
  </div>
114
  <div class="control-buttons">
115
+ <button class="control-btn play-btn hidden" id="streamPlayBtn" title="Play audio"><i class="fa-solid fa-play"></i></button>
116
  <button class="control-btn pause-btn hidden" id="pauseBtn" title="Pause/Resume"><i class="fa-solid fa-pause"></i></button>
117
  <button class="control-btn stop-btn" id="stopBtn" title="Stop generation"><i class="fa-solid fa-stop"></i></button>
118
  </div>
 
132
  <input type="range" class="progress-slider" id="progressSlider" min="0" max="100" value="0">
133
  </div>
134
  <span class="time-display" id="timeDisplay">0:00 / 0:00</span>
 
 
 
135
  <button class="player-btn download-btn hidden" id="downloadBtn" title="Download Audio">
136
  <i class="fa-solid fa-download"></i>
137
  </button>
138
+ <button class="player-btn delete-btn hidden" id="deleteBtn" title="Delete Audio">
139
+ <i class="fa-solid fa-trash"></i>
140
+ </button>
141
  </div>
142
  <audio id="audio" preload="auto"></audio>
143
  </div>
src/talking_snake/static/styles.css CHANGED
@@ -45,15 +45,25 @@ body {
45
  }
46
 
47
  h1 {
 
48
  font-size: 1.75rem;
49
- margin: 0 0 0.25rem;
50
  color: var(--primary);
 
 
51
  }
52
 
53
  .subtitle {
54
  color: var(--text-muted);
55
- margin: 0 0 1rem;
56
  font-size: 0.9rem;
 
 
 
 
 
 
 
57
  }
58
 
59
  .container {
@@ -65,9 +75,9 @@ h1 {
65
  .options-row {
66
  display: flex;
67
  justify-content: center;
68
- gap: 1.5rem;
69
- margin-bottom: 1rem;
70
- flex-wrap: wrap;
71
  }
72
 
73
  /* Style Selector */
@@ -75,38 +85,46 @@ h1 {
75
  .language-selector {
76
  display: flex;
77
  align-items: center;
78
- gap: 0.5rem;
79
- flex-wrap: wrap;
80
  }
81
 
82
  .style-label {
83
- font-size: 0.85rem;
84
  color: var(--text-muted);
85
  }
86
 
87
  .style-buttons {
88
  display: flex;
89
- gap: 0.35rem;
90
  }
91
 
92
  .style-btn {
93
- width: 38px;
94
- height: 38px;
95
  border: 1px solid var(--border);
96
- border-radius: 6px;
97
  background: var(--surface);
98
  color: var(--text-muted);
99
  cursor: pointer;
100
- font-size: 0.95rem;
 
101
  transition: all 0.15s ease;
102
  display: flex;
103
  align-items: center;
104
  justify-content: center;
105
  }
106
 
 
 
 
 
 
107
  /* Language buttons use emoji flags */
108
  .style-btn.lang-btn {
109
- font-size: 1.2rem;
 
 
110
  }
111
 
112
  .style-btn:hover {
@@ -120,6 +138,29 @@ h1 {
120
  color: var(--primary);
121
  }
122
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
  /* Input Section - hidden during processing */
124
  .input-section.hidden {
125
  display: none;
@@ -207,6 +248,21 @@ h1 {
207
  opacity: 0.6;
208
  }
209
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
210
  /* Status in processing */
211
  .processing-section .status {
212
  padding: 0;
@@ -245,12 +301,13 @@ h1 {
245
  width: 36px;
246
  height: 36px;
247
  padding: 0;
248
- color: white;
249
- border: none;
 
250
  border-radius: 8px;
251
  cursor: pointer;
252
  font-size: 0.9rem;
253
- transition: all 0.15s ease;
254
  display: flex;
255
  align-items: center;
256
  justify-content: center;
@@ -261,27 +318,28 @@ h1 {
261
  }
262
 
263
  .control-btn:hover {
264
- filter: brightness(1.1);
 
 
265
  }
266
 
267
- .pause-btn {
268
- background: linear-gradient(135deg, var(--primary), #c06030, var(--primary));
269
- background-size: 200% 200%;
270
- animation: gradient-idle 3s ease infinite;
271
  }
272
 
273
- .pause-btn:hover {
274
- animation: gradient-shift 0.8s ease infinite;
 
275
  }
276
 
277
- .stop-btn {
278
- background: linear-gradient(135deg, var(--error), #8b3a30, var(--error));
279
- background-size: 200% 200%;
280
- animation: gradient-idle 3s ease infinite;
281
  }
282
 
283
  .stop-btn:hover {
284
- animation: gradient-shift 0.8s ease infinite;
 
 
285
  }
286
 
287
  @keyframes gradient-idle {
@@ -299,7 +357,7 @@ h1 {
299
  .drop-zone {
300
  border: 2px dashed var(--border);
301
  border-radius: 8px;
302
- padding: 1.5rem 1rem;
303
  text-align: center;
304
  transition: all 0.2s ease;
305
  cursor: pointer;
@@ -317,11 +375,6 @@ h1 {
317
  font-size: 0.95rem;
318
  }
319
 
320
- .drop-zone .hint {
321
- color: var(--text-muted);
322
- font-size: 0.8rem;
323
- }
324
-
325
  .drop-icon {
326
  font-size: 2.5rem;
327
  color: var(--primary);
@@ -361,37 +414,60 @@ h1 {
361
 
362
  .tab-content {
363
  display: none;
 
 
364
  }
365
 
366
  .tab-content.active {
367
  display: block;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
368
  }
369
 
370
  /* URL Form */
371
  .url-form {
372
  background: var(--surface);
373
  border-radius: 8px;
374
- padding: 1rem;
375
  }
376
 
377
- .url-form input[type="url"] {
378
- width: 100%;
379
- padding: 0.6rem 0.75rem;
 
 
 
 
 
 
 
380
  background: var(--bg);
381
  border: 1px solid var(--border);
382
  border-radius: 6px;
383
  color: var(--text);
384
  font-size: 0.9rem;
385
- margin-bottom: 0.75rem;
386
  transition: border-color 0.15s ease;
387
  }
388
 
389
- .url-form input[type="url"]:focus {
390
  outline: none;
391
  border-color: var(--primary);
392
  }
393
 
394
- .url-form input[type="url"]::placeholder {
395
  color: var(--text-muted);
396
  }
397
 
@@ -427,70 +503,79 @@ h1 {
427
  color: var(--text-muted);
428
  }
429
 
430
- .text-form .hint {
431
- color: var(--text-muted);
432
- font-size: 0.8rem;
433
- text-align: center;
434
- margin: 0;
435
- }
436
-
437
  /* Buttons */
438
  .submit-btn {
439
  width: 100%;
440
  padding: 0.6rem 1rem;
441
- background: linear-gradient(135deg, var(--primary), #c06030, var(--primary));
442
- background-size: 200% 200%;
443
- animation: gradient-idle 3s ease infinite;
444
- color: white;
445
- border: none;
446
  border-radius: 8px;
447
  cursor: pointer;
448
  font-size: 0.9rem;
449
  font-weight: 500;
450
- transition: filter 0.15s ease;
451
  margin-bottom: 0.5rem;
452
  }
453
 
454
  .submit-btn:hover {
455
- filter: brightness(1.1);
456
- animation: gradient-shift 0.8s ease infinite;
 
457
  }
458
 
459
  .submit-btn:disabled {
460
- opacity: 0.6;
461
  cursor: not-allowed;
462
- filter: none;
463
- animation: none;
464
  }
465
 
466
- .url-form .hint {
467
- color: var(--text-muted);
468
- font-size: 0.8rem;
469
- text-align: center;
 
 
 
 
 
 
 
 
 
 
 
 
470
  margin: 0;
 
 
 
471
  }
472
 
473
  input[type="file"] {
474
  display: none;
475
  }
476
 
 
 
 
 
477
  .file-label {
478
  display: inline-block;
479
  padding: 0.5rem 1rem;
480
- background: linear-gradient(135deg, var(--primary), #c06030, var(--primary));
481
- background-size: 200% 200%;
482
- animation: gradient-idle 3s ease infinite;
483
- color: white;
484
  border-radius: 8px;
485
  cursor: pointer;
486
  font-weight: 500;
487
  font-size: 0.9rem;
488
- transition: filter 0.15s ease;
489
  }
490
 
491
  .file-label:hover {
492
- filter: brightness(1.1);
493
- animation: gradient-shift 0.8s ease infinite;
 
494
  }
495
 
496
  /* Device Info - Subtle footer-like display */
@@ -498,17 +583,19 @@ input[type="file"] {
498
  display: none;
499
  justify-content: center;
500
  align-items: center;
501
- gap: 1rem;
502
- padding: 0.75rem 1rem;
503
  font-size: 0.7rem;
504
  color: var(--text-muted);
505
- margin-top: 0.5rem;
506
  opacity: 0.7;
 
507
  }
508
 
509
  .device-info.visible {
510
  display: flex;
511
  flex-wrap: wrap;
 
512
  }
513
 
514
  .device-info i {
@@ -517,7 +604,66 @@ input[type="file"] {
517
  }
518
 
519
  .device-memory {
520
- opacity: 0.9;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
521
  }
522
 
523
  .device-batch {
@@ -562,10 +708,10 @@ input[type="file"] {
562
 
563
  /* Audio Player */
564
  .player {
565
- margin-top: 1.5rem;
566
  width: 100%;
567
  display: none;
568
- padding: 1.25rem;
569
  background: var(--surface);
570
  border-radius: 12px;
571
  border: 1px solid var(--border);
@@ -573,6 +719,35 @@ input[type="file"] {
573
 
574
  .player.visible {
575
  display: block;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
576
  }
577
 
578
  /* Hidden audio element */
@@ -590,31 +765,32 @@ input[type="file"] {
590
  .player-btn {
591
  width: 36px;
592
  height: 36px;
593
- border: none;
594
  border-radius: 8px;
595
- background: linear-gradient(135deg, var(--primary), #c06030, var(--primary));
596
- background-size: 200% 200%;
597
- animation: gradient-idle 3s ease infinite;
598
- color: white;
599
  cursor: pointer;
600
  display: flex;
601
  align-items: center;
602
  justify-content: center;
603
  font-size: 0.85rem;
604
- transition: filter 0.15s ease;
605
  flex-shrink: 0;
606
  }
607
 
608
  .player-btn:hover {
609
- filter: brightness(1.1);
610
- animation: gradient-shift 0.8s ease infinite;
 
 
 
 
 
 
 
611
  }
612
 
613
  .player-btn.volume-btn {
614
- background: linear-gradient(135deg, var(--bg), #252540, var(--bg));
615
- background-size: 200% 200%;
616
- animation: gradient-idle 3s ease infinite;
617
- color: var(--text-muted);
618
  width: 32px;
619
  height: 32px;
620
  font-size: 0.8rem;
@@ -622,14 +798,9 @@ input[type="file"] {
622
 
623
  .player-btn.volume-btn:hover {
624
  color: var(--text);
625
- animation: gradient-shift 0.8s ease infinite;
626
  }
627
 
628
  .player-btn.download-btn {
629
- background: linear-gradient(135deg, var(--bg), #252540, var(--bg));
630
- background-size: 200% 200%;
631
- animation: gradient-idle 3s ease infinite;
632
- color: var(--text-muted);
633
  width: 32px;
634
  height: 32px;
635
  font-size: 0.8rem;
@@ -637,7 +808,18 @@ input[type="file"] {
637
 
638
  .player-btn.download-btn:hover {
639
  color: var(--primary);
640
- animation: gradient-shift 0.8s ease infinite;
 
 
 
 
 
 
 
 
 
 
 
641
  }
642
 
643
  .progress-container {
@@ -697,11 +879,24 @@ input[type="file"] {
697
  }
698
 
699
  .filename {
700
- margin-bottom: 0.75rem;
701
  font-size: 0.85rem;
702
  font-weight: 500;
703
  color: var(--text);
704
  word-break: break-all;
 
 
 
 
 
 
 
 
 
 
 
 
 
705
  }
706
 
707
  /* Spinner Animation */
@@ -714,7 +909,7 @@ input[type="file"] {
714
  border-radius: 50%;
715
  animation: spin 1s linear infinite;
716
  margin-right: 0.4rem;
717
- vertical-align: middle;
718
  }
719
 
720
  @keyframes spin {
 
45
  }
46
 
47
  h1 {
48
+ font-family: Fredoka, sans-serif;
49
  font-size: 1.75rem;
50
+ margin: 0 0 0.5rem;
51
  color: var(--primary);
52
+ display: inline;
53
+ vertical-align: baseline;
54
  }
55
 
56
  .subtitle {
57
  color: var(--text-muted);
58
+ margin: 0;
59
  font-size: 0.9rem;
60
+ display: inline;
61
+ vertical-align: baseline;
62
+ }
63
+
64
+ .header-row {
65
+ margin-bottom: 0.5rem;
66
+ text-align: center;
67
  }
68
 
69
  .container {
 
75
  .options-row {
76
  display: flex;
77
  justify-content: center;
78
+ gap: 0.75rem;
79
+ margin-bottom: 0.75rem;
80
+ flex-wrap: nowrap;
81
  }
82
 
83
  /* Style Selector */
 
85
  .language-selector {
86
  display: flex;
87
  align-items: center;
88
+ gap: 0.4rem;
89
+ flex-wrap: nowrap;
90
  }
91
 
92
  .style-label {
93
+ font-size: 0.75rem;
94
  color: var(--text-muted);
95
  }
96
 
97
  .style-buttons {
98
  display: flex;
99
+ gap: 0.25rem;
100
  }
101
 
102
  .style-btn {
103
+ width: 28px;
104
+ height: 28px;
105
  border: 1px solid var(--border);
106
+ border-radius: 5px;
107
  background: var(--surface);
108
  color: var(--text-muted);
109
  cursor: pointer;
110
+ font-size: 0.75rem;
111
+ line-height: 1;
112
  transition: all 0.15s ease;
113
  display: flex;
114
  align-items: center;
115
  justify-content: center;
116
  }
117
 
118
+ .style-btn i {
119
+ display: block;
120
+ line-height: 1;
121
+ }
122
+
123
  /* Language buttons use emoji flags */
124
  .style-btn.lang-btn {
125
+ font-size: 1rem;
126
+ line-height: 1;
127
+ padding: 0;
128
  }
129
 
130
  .style-btn:hover {
 
138
  color: var(--primary);
139
  }
140
 
141
+ /* Auto-detected language indicator */
142
+ .style-btn.lang-btn.auto-detected {
143
+ animation: auto-detect-pulse 0.5s ease-out;
144
+ box-shadow: 0 0 0 2px var(--primary);
145
+ }
146
+
147
+ @keyframes auto-detect-pulse {
148
+ 0% {
149
+ transform: scale(1);
150
+ box-shadow: 0 0 0 0 rgba(212, 118, 58, 0.7);
151
+ }
152
+
153
+ 50% {
154
+ transform: scale(1.1);
155
+ box-shadow: 0 0 0 4px rgba(212, 118, 58, 0.4);
156
+ }
157
+
158
+ 100% {
159
+ transform: scale(1);
160
+ box-shadow: 0 0 0 2px var(--primary);
161
+ }
162
+ }
163
+
164
  /* Input Section - hidden during processing */
165
  .input-section.hidden {
166
  display: none;
 
248
  opacity: 0.6;
249
  }
250
 
251
+ .doc-info .doc-style,
252
+ .doc-info .doc-lang {
253
+ color: var(--text-muted);
254
+ font-size: 0.75rem;
255
+ display: flex;
256
+ align-items: center;
257
+ white-space: nowrap;
258
+ flex-shrink: 0;
259
+ opacity: 0.7;
260
+ }
261
+
262
+ .doc-info .doc-style i {
263
+ font-size: 0.7rem;
264
+ }
265
+
266
  /* Status in processing */
267
  .processing-section .status {
268
  padding: 0;
 
301
  width: 36px;
302
  height: 36px;
303
  padding: 0;
304
+ color: var(--text-muted);
305
+ background: var(--surface);
306
+ border: 1px solid var(--border);
307
  border-radius: 8px;
308
  cursor: pointer;
309
  font-size: 0.9rem;
310
+ transition: all 0.2s ease;
311
  display: flex;
312
  align-items: center;
313
  justify-content: center;
 
318
  }
319
 
320
  .control-btn:hover {
321
+ color: var(--primary);
322
+ border-color: var(--primary);
323
+ background: rgb(212, 118, 58, 0.08);
324
  }
325
 
326
+ .pause-btn:hover {
327
+ color: var(--primary);
 
 
328
  }
329
 
330
+ .control-btn.play-btn {
331
+ color: var(--success);
332
+ border-color: var(--success);
333
  }
334
 
335
+ .control-btn.play-btn:hover {
336
+ background: rgba(116, 184, 22, 0.15);
 
 
337
  }
338
 
339
  .stop-btn:hover {
340
+ color: var(--error);
341
+ border-color: var(--error);
342
+ background: rgb(196, 90, 74, 0.08);
343
  }
344
 
345
  @keyframes gradient-idle {
 
357
  .drop-zone {
358
  border: 2px dashed var(--border);
359
  border-radius: 8px;
360
+ padding: 1rem 0;
361
  text-align: center;
362
  transition: all 0.2s ease;
363
  cursor: pointer;
 
375
  font-size: 0.95rem;
376
  }
377
 
 
 
 
 
 
378
  .drop-icon {
379
  font-size: 2.5rem;
380
  color: var(--primary);
 
414
 
415
  .tab-content {
416
  display: none;
417
+ opacity: 0;
418
+ transform: translateY(-8px);
419
  }
420
 
421
  .tab-content.active {
422
  display: block;
423
+ opacity: 1;
424
+ transform: translateY(0);
425
+ animation: tab-fade-in 0.2s ease-out;
426
+ }
427
+
428
+ @keyframes tab-fade-in {
429
+ from {
430
+ opacity: 0;
431
+ transform: translateY(-8px);
432
+ }
433
+
434
+ to {
435
+ opacity: 1;
436
+ transform: translateY(0);
437
+ }
438
  }
439
 
440
  /* URL Form */
441
  .url-form {
442
  background: var(--surface);
443
  border-radius: 8px;
444
+ padding: 0.75rem;
445
  }
446
 
447
+ .url-input-row {
448
+ display: flex;
449
+ gap: 0.5rem;
450
+ align-items: center;
451
+ }
452
+
453
+ .url-input-row input[type="url"] {
454
+ flex: 1;
455
+ height: 40px;
456
+ padding: 0 0.75rem;
457
  background: var(--bg);
458
  border: 1px solid var(--border);
459
  border-radius: 6px;
460
  color: var(--text);
461
  font-size: 0.9rem;
 
462
  transition: border-color 0.15s ease;
463
  }
464
 
465
+ .url-input-row input[type="url"]:focus {
466
  outline: none;
467
  border-color: var(--primary);
468
  }
469
 
470
+ .url-input-row input[type="url"]::placeholder {
471
  color: var(--text-muted);
472
  }
473
 
 
503
  color: var(--text-muted);
504
  }
505
 
 
 
 
 
 
 
 
506
  /* Buttons */
507
  .submit-btn {
508
  width: 100%;
509
  padding: 0.6rem 1rem;
510
+ background: var(--surface);
511
+ color: var(--text);
512
+ border: 1px solid var(--border);
 
 
513
  border-radius: 8px;
514
  cursor: pointer;
515
  font-size: 0.9rem;
516
  font-weight: 500;
517
+ transition: all 0.2s ease;
518
  margin-bottom: 0.5rem;
519
  }
520
 
521
  .submit-btn:hover {
522
+ color: var(--primary);
523
+ border-color: var(--primary);
524
+ background: rgb(212, 118, 58, 0.08);
525
  }
526
 
527
  .submit-btn:disabled {
528
+ opacity: 0.5;
529
  cursor: not-allowed;
 
 
530
  }
531
 
532
+ /* URL form button override - must come after base .submit-btn */
533
+ .url-input-row .submit-btn {
534
+ width: 40px;
535
+ height: 40px;
536
+ min-width: 40px;
537
+ min-height: 40px;
538
+ padding: 0;
539
+ margin: 0;
540
+ flex-shrink: 0;
541
+ display: flex;
542
+ align-items: center;
543
+ justify-content: center;
544
+ border-radius: 6px;
545
+ }
546
+
547
+ .url-input-row .submit-btn i {
548
  margin: 0;
549
+ padding: 0;
550
+ line-height: 1;
551
+ display: block;
552
  }
553
 
554
  input[type="file"] {
555
  display: none;
556
  }
557
 
558
+ .hidden-file-input {
559
+ display: none !important;
560
+ }
561
+
562
  .file-label {
563
  display: inline-block;
564
  padding: 0.5rem 1rem;
565
+ background: var(--surface);
566
+ color: var(--text);
567
+ border: 1px solid var(--border);
 
568
  border-radius: 8px;
569
  cursor: pointer;
570
  font-weight: 500;
571
  font-size: 0.9rem;
572
+ transition: all 0.2s ease;
573
  }
574
 
575
  .file-label:hover {
576
+ color: var(--primary);
577
+ border-color: var(--primary);
578
+ background: rgb(212, 118, 58, 0.08);
579
  }
580
 
581
  /* Device Info - Subtle footer-like display */
 
583
  display: none;
584
  justify-content: center;
585
  align-items: center;
586
+ gap: 0.6rem;
587
+ padding: 0.4rem 1rem;
588
  font-size: 0.7rem;
589
  color: var(--text-muted);
590
+ margin-top: 0.25rem;
591
  opacity: 0.7;
592
+ line-height: 1.2;
593
  }
594
 
595
  .device-info.visible {
596
  display: flex;
597
  flex-wrap: wrap;
598
+ row-gap: 0.2rem;
599
  }
600
 
601
  .device-info i {
 
604
  }
605
 
606
  .device-memory {
607
+ display: flex;
608
+ align-items: center;
609
+ gap: 0.25rem;
610
+ }
611
+
612
+ .device-memory i {
613
+ font-size: 0.6rem;
614
+ }
615
+
616
+ .device-ephemeral {
617
+ display: flex;
618
+ align-items: center;
619
+ gap: 0.25rem;
620
+ color: var(--success);
621
+ }
622
+
623
+ .device-ephemeral i {
624
+ color: var(--success);
625
+ font-size: 0.6rem;
626
+ }
627
+
628
+ .device-timing {
629
+ display: flex;
630
+ align-items: center;
631
+ gap: 0.25rem;
632
+ color: var(--text-muted);
633
+ }
634
+
635
+ .device-timing i {
636
+ font-size: 0.6rem;
637
+ }
638
+
639
+ /* Model state indicators */
640
+ .model-state {
641
+ display: flex;
642
+ align-items: center;
643
+ gap: 0.25rem;
644
+ padding: 0.15rem 0.4rem;
645
+ border-radius: 4px;
646
+ font-size: 0.65rem;
647
+ }
648
+
649
+ .model-state i {
650
+ font-size: 0.55rem;
651
+ }
652
+
653
+ .model-loaded {
654
+ background: rgb(16, 185, 129, 0.15);
655
+ color: var(--success);
656
+ }
657
+
658
+ .model-loading,
659
+ .model-unloading {
660
+ background: rgb(245, 158, 11, 0.15);
661
+ color: #f59e0b;
662
+ }
663
+
664
+ .model-unloaded {
665
+ background: rgb(107, 114, 128, 0.15);
666
+ color: var(--text-muted);
667
  }
668
 
669
  .device-batch {
 
708
 
709
  /* Audio Player */
710
  .player {
711
+ margin-top: 1rem;
712
  width: 100%;
713
  display: none;
714
+ padding: 0.75rem 1rem;
715
  background: var(--surface);
716
  border-radius: 12px;
717
  border: 1px solid var(--border);
 
719
 
720
  .player.visible {
721
  display: block;
722
+ animation: slide-in 0.3s ease-out;
723
+ }
724
+
725
+ .player.deleting {
726
+ animation: slide-out 0.3s ease-out forwards;
727
+ }
728
+
729
+ @keyframes slide-in {
730
+ from {
731
+ opacity: 0;
732
+ transform: translateY(-10px);
733
+ }
734
+
735
+ to {
736
+ opacity: 1;
737
+ transform: translateY(0);
738
+ }
739
+ }
740
+
741
+ @keyframes slide-out {
742
+ from {
743
+ opacity: 1;
744
+ transform: translateY(0);
745
+ }
746
+
747
+ to {
748
+ opacity: 0;
749
+ transform: translateY(-10px);
750
+ }
751
  }
752
 
753
  /* Hidden audio element */
 
765
  .player-btn {
766
  width: 36px;
767
  height: 36px;
768
+ border: 1px solid var(--border);
769
  border-radius: 8px;
770
+ background: var(--surface);
771
+ color: var(--text-muted);
 
 
772
  cursor: pointer;
773
  display: flex;
774
  align-items: center;
775
  justify-content: center;
776
  font-size: 0.85rem;
777
+ transition: all 0.2s ease;
778
  flex-shrink: 0;
779
  }
780
 
781
  .player-btn:hover {
782
+ color: var(--primary);
783
+ border-color: var(--primary);
784
+ background: rgb(212, 118, 58, 0.08);
785
+ }
786
+
787
+ .player-btn.play-btn {
788
+ width: 40px;
789
+ height: 40px;
790
+ font-size: 0.9rem;
791
  }
792
 
793
  .player-btn.volume-btn {
 
 
 
 
794
  width: 32px;
795
  height: 32px;
796
  font-size: 0.8rem;
 
798
 
799
  .player-btn.volume-btn:hover {
800
  color: var(--text);
 
801
  }
802
 
803
  .player-btn.download-btn {
 
 
 
 
804
  width: 32px;
805
  height: 32px;
806
  font-size: 0.8rem;
 
808
 
809
  .player-btn.download-btn:hover {
810
  color: var(--primary);
811
+ }
812
+
813
+ .player-btn.delete-btn {
814
+ width: 32px;
815
+ height: 32px;
816
+ font-size: 0.8rem;
817
+ }
818
+
819
+ .player-btn.delete-btn:hover {
820
+ color: var(--error);
821
+ border-color: var(--error);
822
+ background: rgb(196, 90, 74, 0.08);
823
  }
824
 
825
  .progress-container {
 
879
  }
880
 
881
  .filename {
882
+ margin-bottom: 0.5rem;
883
  font-size: 0.85rem;
884
  font-weight: 500;
885
  color: var(--text);
886
  word-break: break-all;
887
+ display: flex;
888
+ align-items: center;
889
+ gap: 0.4rem;
890
+ flex-wrap: wrap;
891
+ }
892
+
893
+ .filename-meta {
894
+ display: inline-flex;
895
+ align-items: center;
896
+ gap: 0.35rem;
897
+ font-size: 0.8rem;
898
+ color: var(--text-muted);
899
+ margin-left: auto;
900
  }
901
 
902
  /* Spinner Animation */
 
909
  border-radius: 50%;
910
  animation: spin 1s linear infinite;
911
  margin-right: 0.4rem;
912
+ vertical-align: -2px;
913
  }
914
 
915
  @keyframes spin {
src/talking_snake/tts.py CHANGED
@@ -8,6 +8,7 @@ import time
8
  import wave
9
  from abc import ABC, abstractmethod
10
  from collections.abc import Iterator
 
11
  from typing import TYPE_CHECKING
12
 
13
  if TYPE_CHECKING:
@@ -42,18 +43,146 @@ class TTSEngineProtocol(ABC):
42
  return 1
43
 
44
 
45
- # Professional narration style prompt
46
- # This instructs the model to read with clear, authoritative delivery
47
- PROFESSIONAL_STYLE = (
48
- "Read this as a professional narrator with clear enunciation, "
49
- "measured pacing, and an authoritative yet warm tone. "
50
- "Speak naturally as if presenting an audiobook or documentary. "
51
- "Avoid sounding robotic or monotone. Emphasize key points and maintain a steady rhythm. "
52
- "Use appropriate intonation to convey meaning and keep the listener engaged. "
53
- "This is not casual conversation, but a polished narration style. "
54
- "Use proper diction, read correctly acronyms, and pronounce all words clearly."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  )
56
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  # Language to default voice mapping
58
  LANGUAGE_VOICES: dict[str, str] = {
59
  "english": "Ryan",
@@ -65,8 +194,8 @@ LANGUAGE_VOICES: dict[str, str] = {
65
  # Default chunk size for streaming
66
  # Larger chunks = more stable voice, fewer artifacts at boundaries
67
  # Smaller chunks = faster first audio but potential voice instability
68
- # 1200 chars provides good balance for natural speech flow
69
- DEFAULT_CHUNK_SIZE = 1200
70
 
71
  # Idle timeout before unloading model from GPU (seconds)
72
  # Set to 0 to disable auto-unloading
@@ -140,9 +269,19 @@ class QwenTTSEngine(TTSEngineProtocol):
140
  self._idle_timeout = idle_timeout
141
  self._last_activity = time.time()
142
  self._model_loaded = False
 
143
  self._lock = threading.Lock()
144
  self._unload_timer: threading.Timer | None = None
145
 
 
 
 
 
 
 
 
 
 
146
  # Model will be loaded on first request (lazy loading)
147
  self.model = None
148
 
@@ -150,6 +289,67 @@ class QwenTTSEngine(TTSEngineProtocol):
150
  if idle_timeout == 0:
151
  self._load_model()
152
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
  def _load_model(self) -> None:
154
  """Load the model onto GPU or CPU."""
155
  if self._model_loaded:
@@ -158,6 +358,7 @@ class QwenTTSEngine(TTSEngineProtocol):
158
  import torch
159
  from qwen_tts import Qwen3TTSModel
160
 
 
161
  device_name = "GPU" if self.device == "cuda" else "CPU"
162
  print(f"🔄 Loading TTS model onto {device_name}...")
163
  start = time.time()
@@ -186,6 +387,7 @@ class QwenTTSEngine(TTSEngineProtocol):
186
  )
187
 
188
  self._model_loaded = True
 
189
 
190
  # Calculate optimal batch size based on available VRAM
191
  if self.device == "cuda":
@@ -205,6 +407,7 @@ class QwenTTSEngine(TTSEngineProtocol):
205
 
206
  import torch
207
 
 
208
  print("💤 Unloading TTS model from GPU (idle timeout)...")
209
 
210
  # Delete model and clear references
@@ -218,6 +421,7 @@ class QwenTTSEngine(TTSEngineProtocol):
218
  torch.cuda.empty_cache()
219
  torch.cuda.synchronize()
220
 
 
221
  print("✅ GPU memory freed")
222
 
223
  def _schedule_unload(self) -> None:
@@ -307,6 +511,10 @@ class QwenTTSEngine(TTSEngineProtocol):
307
  # Type guard - model is guaranteed to be loaded after _ensure_model_loaded
308
  assert self.model is not None, "Model failed to load"
309
 
 
 
 
 
310
  try:
311
  # Split text into chunks for streaming
312
  chunks = self._split_text(text)
@@ -326,10 +534,9 @@ class QwenTTSEngine(TTSEngineProtocol):
326
  continue
327
 
328
  # Always use batched call for consistent GPU memory allocation
329
- # Use professional narration style for clear, authoritative delivery
330
- batch_instruct = (
331
- [PROFESSIONAL_STYLE] * len(batch) if len(batch) > 1 else PROFESSIONAL_STYLE
332
- )
333
  audios, sr = self.model.generate_custom_voice(
334
  text=batch if len(batch) > 1 else batch[0],
335
  speaker=[self.voice] * len(batch) if len(batch) > 1 else self.voice,
@@ -349,6 +556,9 @@ class QwenTTSEngine(TTSEngineProtocol):
349
  first_chunk = False
350
  yield wav_bytes
351
  finally:
 
 
 
352
  # Schedule model unload after idle timeout
353
  self._schedule_unload()
354
 
 
8
  import wave
9
  from abc import ABC, abstractmethod
10
  from collections.abc import Iterator
11
+ from dataclasses import dataclass
12
  from typing import TYPE_CHECKING
13
 
14
  if TYPE_CHECKING:
 
43
  return 1
44
 
45
 
46
+ @dataclass
47
+ class TTSStyle:
48
+ """Defines a TTS speaking style with its configuration."""
49
+
50
+ id: str # Unique identifier (e.g., "technical", "narrative")
51
+ name: str # Display name (e.g., "Technical Documentation")
52
+ icon: str # Font Awesome icon class (e.g., "fa-gear")
53
+ description: str # Short description for tooltips
54
+ prompt: str # The instruct prompt for the TTS model
55
+
56
+
57
+ # === TTS STYLES ===
58
+ # Each style provides a different speaking approach optimized for specific content types
59
+
60
+ STYLE_TECHNICAL = TTSStyle(
61
+ id="technical",
62
+ name="Technical",
63
+ icon="fa-microchip",
64
+ description="Clear, precise reading for code and technical documentation",
65
+ prompt=(
66
+ "You are a technical speech engine reading engineering documents. "
67
+ "Your task is to convert text into clear, accurate spoken output. "
68
+ "Read in a neutral, controlled, professional voice. "
69
+ "Do not sound expressive, emotional, or conversational. "
70
+ "Do not use audiobook, storytelling, or presenter intonation. "
71
+ "Prioritize intelligibility and correctness over naturalness. "
72
+ "Maintain steady pacing and flat prosody appropriate for scientific material. "
73
+ "Pronounce all acronyms as individual letters unless they are standard spoken words. "
74
+ "Pronounce symbols, operators, and punctuation when they affect meaning. "
75
+ "Preserve capitalization, parentheses, and formatting as part of the spoken output. "
76
+ "When reading code, equations, or identifiers, slow down and speak every token clearly. "
77
+ "Insert short pauses at commas and longer pauses at periods and line breaks. "
78
+ "Do not summarize, interpret, or rephrase. "
79
+ "Read exactly what is written."
80
+ ),
81
  )
82
 
83
+ STYLE_NARRATIVE = TTSStyle(
84
+ id="narrative",
85
+ name="Narrative",
86
+ icon="fa-book-open",
87
+ description="Natural, engaging reading for articles and stories",
88
+ prompt=(
89
+ "You are a professional narrative voice reading long-form text. "
90
+ "Your task is to tell a story in a clear, engaging, and natural way. "
91
+ "Use a warm, expressive, and fluid voice. "
92
+ "Vary intonation and rhythm to reflect meaning, emotion, and emphasis. "
93
+ "Sound human and immersive, not robotic or monotone. "
94
+ "Maintain smooth pacing, slowing for important moments, speeding up for transitions. "
95
+ "Use natural pauses at punctuation and paragraph breaks. "
96
+ "Pronounce all words clearly, but do not over-articulate symbols or formatting. "
97
+ "Read acronyms as spoken words when they are commonly pronounced that way. "
98
+ "Preserve the narrative flow and emotional tone of the text. "
99
+ "Do not flatten or neutralize the delivery."
100
+ ),
101
+ )
102
+
103
+ STYLE_CHILD_NARRATIVE = TTSStyle(
104
+ id="child_narrative",
105
+ name="Child Narrative",
106
+ icon="fa-child",
107
+ description="Playful, expressive reading for children's stories",
108
+ prompt=(
109
+ "You are a storyteller reading aloud to young children. "
110
+ "Your task is to tell a story in a friendly, gentle, and engaging way. "
111
+ "Use a warm, soft, and expressive voice. "
112
+ "Sound kind, calm, and reassuring. "
113
+ "Vary intonation to match emotions and actions in the story. "
114
+ "Maintain a slow to moderate pace with clear articulation. "
115
+ "Insert natural pauses so children can follow along. "
116
+ "Pronounce words simply and clearly. "
117
+ "Read acronyms and difficult words in their most familiar spoken form. "
118
+ "Keep the tone playful but soothing. "
119
+ "Do not sound technical, formal, or adult-oriented."
120
+ ),
121
+ )
122
+
123
+ STYLE_NEWS = TTSStyle(
124
+ id="news",
125
+ name="News",
126
+ icon="fa-newspaper",
127
+ description="Authoritative, clear delivery for news and reports",
128
+ prompt=(
129
+ "You are a professional news anchor delivering broadcast news. "
130
+ "Your task is to read information clearly, confidently, and with authority. "
131
+ "Use a neutral, composed, and trustworthy voice. "
132
+ "Avoid emotional or dramatic delivery. "
133
+ "Do not sound conversational or casual. "
134
+ "Maintain a steady, moderate pace with crisp articulation. "
135
+ "Use controlled intonation to mark headlines, key facts, and transitions. "
136
+ "Pronounce names, numbers, acronyms, and places carefully and accurately. "
137
+ "Pause briefly at commas and longer at periods and topic changes. "
138
+ "Sound factual, objective, and broadcast-ready at all times."
139
+ ),
140
+ )
141
+
142
+ STYLE_ACADEMIC = TTSStyle(
143
+ id="academic",
144
+ name="Academic",
145
+ icon="fa-graduation-cap",
146
+ description="Measured, scholarly reading for papers and research",
147
+ prompt=(
148
+ "You are an academic speech engine reading peer-reviewed scientific papers. "
149
+ "Your task is to render complex scholarly text into clear, precise spoken language. "
150
+ "Use a neutral, formal, and controlled voice. "
151
+ "Do not sound expressive, emotional, or conversational. "
152
+ "Do not use audiobook or presenter intonation. "
153
+ "Maintain steady pacing suitable for dense technical material. "
154
+ "Favor clarity and accuracy over naturalness. "
155
+ "Pronounce technical terminology, Greek letters, acronyms, and units correctly. "
156
+ "Read acronyms as individual letters unless they are standard spoken words. "
157
+ "Preserve capitalization, punctuation, and structure when they affect meaning. "
158
+ "Insert short pauses at commas and longer pauses at periods and section breaks. "
159
+ "Slow down slightly for equations, symbols, gene names, and references. "
160
+ "Do not summarize, interpret, or simplify the text. "
161
+ "Read exactly what is written."
162
+ ),
163
+ )
164
+
165
+ # Registry of all available styles
166
+ TTS_STYLES: dict[str, TTSStyle] = {
167
+ style.id: style
168
+ for style in [
169
+ STYLE_TECHNICAL,
170
+ STYLE_NARRATIVE,
171
+ STYLE_CHILD_NARRATIVE,
172
+ STYLE_NEWS,
173
+ STYLE_ACADEMIC,
174
+ ]
175
+ }
176
+
177
+ # Default style
178
+ DEFAULT_STYLE = STYLE_TECHNICAL
179
+
180
+
181
+ def get_style(style_id: str) -> TTSStyle:
182
+ """Get a TTS style by ID, falling back to default if not found."""
183
+ return TTS_STYLES.get(style_id, DEFAULT_STYLE)
184
+
185
+
186
  # Language to default voice mapping
187
  LANGUAGE_VOICES: dict[str, str] = {
188
  "english": "Ryan",
 
194
  # Default chunk size for streaming
195
  # Larger chunks = more stable voice, fewer artifacts at boundaries
196
  # Smaller chunks = faster first audio but potential voice instability
197
+ # 1800 chars provides good balance for natural speech flow
198
+ DEFAULT_CHUNK_SIZE = 1800
199
 
200
  # Idle timeout before unloading model from GPU (seconds)
201
  # Set to 0 to disable auto-unloading
 
269
  self._idle_timeout = idle_timeout
270
  self._last_activity = time.time()
271
  self._model_loaded = False
272
+ self._model_state = "unloaded" # unloaded, loading, loaded, unloading
273
  self._lock = threading.Lock()
274
  self._unload_timer: threading.Timer | None = None
275
 
276
+ # Calibrated seconds per character (measured and updated over time)
277
+ self._seconds_per_char: float | None = None
278
+ # Cumulative stats for running average
279
+ self._total_chars_processed: int = 0
280
+ self._total_time_spent: float = 0.0
281
+
282
+ # Current style for TTS
283
+ self._style: TTSStyle = DEFAULT_STYLE
284
+
285
  # Model will be loaded on first request (lazy loading)
286
  self.model = None
287
 
 
289
  if idle_timeout == 0:
290
  self._load_model()
291
 
292
+ @property
293
+ def style(self) -> TTSStyle:
294
+ """Return the current TTS style."""
295
+ return self._style
296
+
297
+ def set_style(self, style_id: str) -> None:
298
+ """Set the TTS style by ID.
299
+
300
+ Args:
301
+ style_id: Style identifier (technical, narrative, child_narrative, news, academic).
302
+ """
303
+ self._style = get_style(style_id)
304
+
305
+ @property
306
+ def model_state(self) -> str:
307
+ """Return the current model state: unloaded, loading, loaded, or unloading."""
308
+ return self._model_state
309
+
310
+ @property
311
+ def seconds_per_char(self) -> float | None:
312
+ """Return calibrated seconds per character, or None if not yet measured."""
313
+ return self._seconds_per_char
314
+
315
+ @property
316
+ def total_chars_processed(self) -> int:
317
+ """Return total characters processed since startup."""
318
+ return self._total_chars_processed
319
+
320
+ def _update_timing_stats(self, chars: int, elapsed: float) -> None:
321
+ """Update cumulative timing statistics.
322
+
323
+ Args:
324
+ chars: Number of characters processed.
325
+ elapsed: Time taken in seconds.
326
+ """
327
+ self._total_chars_processed += chars
328
+ self._total_time_spent += elapsed
329
+ if self._total_chars_processed > 0:
330
+ self._seconds_per_char = self._total_time_spent / self._total_chars_processed
331
+
332
+ def calibrate(self, test_text: str = "Hello, this is a calibration test.") -> float:
333
+ """Run a calibration test to measure seconds per character.
334
+
335
+ Args:
336
+ test_text: Short text to use for calibration.
337
+
338
+ Returns:
339
+ Measured seconds per character.
340
+ """
341
+ self._ensure_model_loaded()
342
+
343
+ start = time.time()
344
+ # Consume the generator to complete synthesis
345
+ for _ in self.synthesize(test_text):
346
+ pass
347
+ elapsed = time.time() - start
348
+
349
+ self._seconds_per_char = elapsed / len(test_text)
350
+ print(f"⏱️ Calibrated: {self._seconds_per_char:.4f}s per character")
351
+ return self._seconds_per_char
352
+
353
  def _load_model(self) -> None:
354
  """Load the model onto GPU or CPU."""
355
  if self._model_loaded:
 
358
  import torch
359
  from qwen_tts import Qwen3TTSModel
360
 
361
+ self._model_state = "loading"
362
  device_name = "GPU" if self.device == "cuda" else "CPU"
363
  print(f"🔄 Loading TTS model onto {device_name}...")
364
  start = time.time()
 
387
  )
388
 
389
  self._model_loaded = True
390
+ self._model_state = "loaded"
391
 
392
  # Calculate optimal batch size based on available VRAM
393
  if self.device == "cuda":
 
407
 
408
  import torch
409
 
410
+ self._model_state = "unloading"
411
  print("💤 Unloading TTS model from GPU (idle timeout)...")
412
 
413
  # Delete model and clear references
 
421
  torch.cuda.empty_cache()
422
  torch.cuda.synchronize()
423
 
424
+ self._model_state = "unloaded"
425
  print("✅ GPU memory freed")
426
 
427
  def _schedule_unload(self) -> None:
 
511
  # Type guard - model is guaranteed to be loaded after _ensure_model_loaded
512
  assert self.model is not None, "Model failed to load"
513
 
514
+ # Track timing for this synthesis
515
+ synthesis_start = time.time()
516
+ chars_in_text = len(text)
517
+
518
  try:
519
  # Split text into chunks for streaming
520
  chunks = self._split_text(text)
 
534
  continue
535
 
536
  # Always use batched call for consistent GPU memory allocation
537
+ # Use the current style's prompt for delivery
538
+ style_prompt = self._style.prompt
539
+ batch_instruct = [style_prompt] * len(batch) if len(batch) > 1 else style_prompt
 
540
  audios, sr = self.model.generate_custom_voice(
541
  text=batch if len(batch) > 1 else batch[0],
542
  speaker=[self.voice] * len(batch) if len(batch) > 1 else self.voice,
 
556
  first_chunk = False
557
  yield wav_bytes
558
  finally:
559
+ # Update timing stats for future estimates
560
+ elapsed = time.time() - synthesis_start
561
+ self._update_timing_stats(chars_in_text, elapsed)
562
  # Schedule model unload after idle timeout
563
  self._schedule_unload()
564