GitHub Actions commited on
Commit
1c7725b
·
1 Parent(s): 2886be7

Deploy from GitHub: 0bf18943d192a2812c57599f6c25bf9739d523bf

Browse files
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Luca Cappelletti
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
pyproject.toml CHANGED
@@ -1,6 +1,6 @@
1
  [project]
2
  name = "talking-snake"
3
- version = "0.1.0"
4
  description = "Just a talking snake that reads PDFs and web pages aloud."
5
  readme = "README.md"
6
  license = { text = "MIT" }
@@ -45,11 +45,6 @@ dev = [
45
  "mypy>=1.14.0",
46
  "pre-commit>=4.0.0",
47
  ]
48
- # Flash Attention for ~2x faster inference (requires CUDA 11.6+)
49
- # Install separately: pip install flash-attn --no-build-isolation
50
- fast = [
51
- "flash-attn>=2.5.0",
52
- ]
53
 
54
  [project.scripts]
55
  talking-snake = "talking_snake.__main__:main"
 
1
  [project]
2
  name = "talking-snake"
3
+ version = "0.1.1"
4
  description = "Just a talking snake that reads PDFs and web pages aloud."
5
  readme = "README.md"
6
  license = { text = "MIT" }
 
45
  "mypy>=1.14.0",
46
  "pre-commit>=4.0.0",
47
  ]
 
 
 
 
 
48
 
49
  [project.scripts]
50
  talking-snake = "talking_snake.__main__:main"
src/talking_snake/__main__.py CHANGED
@@ -99,6 +99,13 @@ def main() -> int:
99
  return 1
100
 
101
  print("✅ TTS model loaded!")
 
 
 
 
 
 
 
102
  print()
103
 
104
  # Create app with engine
 
99
  return 1
100
 
101
  print("✅ TTS model loaded!")
102
+
103
+ # Run calibration to get accurate time estimates
104
+ print("⏱️ Calibrating speech timing...")
105
+ try:
106
+ tts_engine.calibrate()
107
+ except Exception as e:
108
+ print(f"⚠️ Calibration failed (using defaults): {e}")
109
  print()
110
 
111
  # Create app with engine
src/talking_snake/app.py CHANGED
@@ -11,12 +11,12 @@ import time
11
  import uuid
12
  from pathlib import Path
13
  from typing import TYPE_CHECKING
14
- from urllib.parse import urlparse
15
 
16
  import httpx
17
  import trafilatura
18
  from fastapi import FastAPI, File, Form, HTTPException, Request, UploadFile
19
- from fastapi.responses import HTMLResponse, StreamingResponse
20
  from fastapi.staticfiles import StaticFiles
21
  from pydantic import BaseModel
22
 
@@ -24,6 +24,7 @@ from talking_snake.extract import clean_text, extract_text, get_page_count
24
  from talking_snake.tts import (
25
  DEFAULT_CHUNK_SIZE,
26
  LANGUAGE_VOICES,
 
27
  MockTTSEngine,
28
  TTSEngineProtocol,
29
  )
@@ -52,15 +53,31 @@ class AudioJob:
52
  def __init__(self, job_id: str):
53
  self.job_id = job_id
54
  self.audio_queue: queue.Queue[bytes | None] = queue.Queue()
 
55
  self.started = time.time()
56
  self.completed = False
 
57
  self.error: str | None = None
58
  self.sample_rate = 24000 # Default, will be set by TTS engine
59
  self.header_sent = False
 
 
 
 
 
 
 
60
 
61
  def put_audio(self, audio_bytes: bytes) -> None:
62
- """Add audio data to the queue."""
63
  self.audio_queue.put(audio_bytes)
 
 
 
 
 
 
 
64
 
65
  def finish(self) -> None:
66
  """Signal that audio generation is complete."""
@@ -117,6 +134,7 @@ class UrlRequest(BaseModel):
117
 
118
  url: str
119
  language: str = "english"
 
120
 
121
 
122
  class TextRequest(BaseModel):
@@ -124,6 +142,7 @@ class TextRequest(BaseModel):
124
 
125
  text: str
126
  language: str = "english"
 
127
 
128
 
129
  class EstimateResponse(BaseModel):
@@ -170,6 +189,7 @@ def create_app(tts_engine: TTSEngineProtocol | None = None) -> FastAPI:
170
  app.add_api_route("/api/read-url-stream", read_url_stream, methods=["POST"])
171
  app.add_api_route("/api/read-text-stream", read_text_stream, methods=["POST"])
172
  app.add_api_route("/api/audio/{job_id}", stream_audio, methods=["GET"])
 
173
  app.add_api_route("/api/languages", get_languages, methods=["GET"])
174
  app.add_api_route("/api/device-info-stream", stream_device_info, methods=["GET"])
175
  app.add_api_route("/api/health", health_check, methods=["GET"])
@@ -389,6 +409,9 @@ def _get_device_info() -> dict:
389
  Returns:
390
  Device type, memory usage, and model info.
391
  """
 
 
 
392
  import torch
393
 
394
  info = {
@@ -398,8 +421,20 @@ def _get_device_info() -> dict:
398
  "memory_total_gb": 0,
399
  "memory_percent": 0,
400
  "batch_size": 1,
 
 
 
401
  }
402
 
 
 
 
 
 
 
 
 
 
403
  if torch.cuda.is_available():
404
  props = torch.cuda.get_device_properties(0)
405
  # Use reserved memory for more accurate GPU usage (includes PyTorch cache)
@@ -421,6 +456,15 @@ def _get_device_info() -> dict:
421
  if _tts_engine is not None:
422
  info["batch_size"] = getattr(_tts_engine, "batch_size", 1)
423
  info["chunk_size"] = getattr(_tts_engine, "chunk_size", 800)
 
 
 
 
 
 
 
 
 
424
 
425
  return info
426
 
@@ -461,9 +505,7 @@ async def stream_device_info() -> StreamingResponse:
461
  )
462
 
463
 
464
- def _estimate_time(
465
- text: str, seconds_per_char: float = INITIAL_SECONDS_PER_CHAR
466
- ) -> tuple[int, float]:
467
  """Estimate processing time for text.
468
 
469
  Args:
@@ -473,6 +515,8 @@ def _estimate_time(
473
  Returns:
474
  Tuple of (chunk_count, estimated_seconds).
475
  """
 
 
476
  # Count chunks (500 chars per chunk approximately)
477
  chunk_count = max(1, len(text) // 500 + (1 if len(text) % 500 else 0))
478
  estimated_seconds = len(text) * seconds_per_char
@@ -521,6 +565,7 @@ def _generate_audio_to_job(
521
  text: str,
522
  tts_engine: TTSEngineProtocol,
523
  language: str = "english",
 
524
  doc_name: str = "document",
525
  doc_type: str = "text",
526
  page_count: int | None = None,
@@ -536,11 +581,10 @@ def _generate_audio_to_job(
536
  text: Text to synthesize.
537
  tts_engine: TTS engine to use.
538
  language: Language for TTS (english, chinese, japanese, korean).
 
539
  doc_name: Name of the document being processed.
540
  doc_type: Type of document (pdf, url, text).
541
  page_count: Number of pages (for PDFs).
542
- tts_engine: TTS engine to use.
543
- language: Language for TTS (english, chinese, japanese, korean).
544
 
545
  Yields:
546
  SSE events for progress.
@@ -551,6 +595,10 @@ def _generate_audio_to_job(
551
  if hasattr(tts_engine, "set_language"):
552
  tts_engine.set_language(language)
553
 
 
 
 
 
554
  # Get chunk size and batch size from engine
555
  chunk_size = getattr(tts_engine, "chunk_size", DEFAULT_CHUNK_SIZE)
556
  batch_size = getattr(tts_engine, "batch_size", 1)
@@ -578,9 +626,13 @@ def _generate_audio_to_job(
578
  total_chunks = len(chunks) if chunks else 1
579
  total_chars = sum(len(c) for c in chunks)
580
 
581
- # Use initial estimate before calibration
582
- seconds_per_char = INITIAL_SECONDS_PER_CHAR
583
- estimated_total = total_chars * seconds_per_char
 
 
 
 
584
 
585
  # Send initial progress event with job_id and batch info
586
  progress_data = {
@@ -589,7 +641,7 @@ def _generate_audio_to_job(
589
  "current": 0,
590
  "total": total_chunks,
591
  "percent": 0,
592
- "estimated_remaining": estimated_total,
593
  "batch_size": batch_size,
594
  "doc_name": doc_name,
595
  "doc_type": doc_type,
@@ -648,13 +700,14 @@ def _generate_audio_to_job(
648
  # Signal audio generation complete
649
  job.finish()
650
 
651
- # Send completion event
652
  total_time = time.time() - start_time
653
  complete_data = {
654
  "type": "complete",
655
  "total_time": round(total_time, 1),
656
  "chunks_processed": chunks_processed,
657
  "batch_size": batch_size,
 
658
  }
659
  yield f"event: complete\ndata: {json.dumps(complete_data)}\n\n".encode()
660
 
@@ -664,6 +717,7 @@ async def stream_audio(job_id: str) -> StreamingResponse:
664
 
665
  This endpoint streams the raw WAV audio as it's being generated.
666
  The browser can start playing as soon as data arrives.
 
667
 
668
  Args:
669
  job_id: The job ID to stream audio for.
@@ -675,7 +729,9 @@ async def stream_audio(job_id: str) -> StreamingResponse:
675
  if job is None:
676
  raise HTTPException(status_code=404, detail="Job not found")
677
 
678
- def generate_audio() -> Iterator[bytes]:
 
 
679
  # Send WAV header first
680
  yield _create_wav_header(sample_rate=24000)
681
 
@@ -689,8 +745,6 @@ async def stream_audio(job_id: str) -> StreamingResponse:
689
  break
690
  # Skip WAV headers from individual chunks, only send raw PCM
691
  if audio_data[:4] == b"RIFF":
692
- # This is a WAV file, extract just the PCM data
693
- # WAV header is 44 bytes for standard PCM
694
  yield audio_data[44:]
695
  else:
696
  yield audio_data
@@ -698,11 +752,21 @@ async def stream_audio(job_id: str) -> StreamingResponse:
698
  # Timeout waiting for data
699
  break
700
 
701
- # Clean up job after streaming
702
- _job_manager.remove_job(job_id)
 
 
 
 
 
 
 
 
 
 
703
 
704
  return StreamingResponse(
705
- generate_audio(),
706
  media_type="audio/wav",
707
  headers={
708
  "Cache-Control": "no-cache",
@@ -711,9 +775,76 @@ async def stream_audio(job_id: str) -> StreamingResponse:
711
  )
712
 
713
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
714
  async def read_pdf_stream(
715
  file: UploadFile = File(...),
716
  language: str = Form("english"),
 
717
  ) -> StreamingResponse:
718
  """Read a PDF with streaming progress updates.
719
 
@@ -722,6 +853,7 @@ async def read_pdf_stream(
722
  Args:
723
  file: Uploaded PDF file.
724
  language: Language for TTS (english, chinese, japanese, korean).
 
725
 
726
  Returns:
727
  Streaming response with progress events including job_id.
@@ -767,6 +899,7 @@ async def read_pdf_stream(
767
  text,
768
  _tts_engine,
769
  language,
 
770
  doc_name=file.filename or "document.pdf",
771
  doc_type="pdf",
772
  page_count=page_count,
@@ -796,6 +929,7 @@ async def read_text_stream(request: TextRequest) -> StreamingResponse:
796
 
797
  text = request.text.strip()
798
  language = request.language if request.language in LANGUAGE_VOICES else "english"
 
799
 
800
  if not text:
801
  raise HTTPException(status_code=400, detail="Text is required")
@@ -809,6 +943,14 @@ async def read_text_stream(request: TextRequest) -> StreamingResponse:
809
  if not text.strip():
810
  raise HTTPException(status_code=400, detail="No readable text provided")
811
 
 
 
 
 
 
 
 
 
812
  # Create a job for this request
813
  job = _job_manager.create_job()
814
 
@@ -818,7 +960,8 @@ async def read_text_stream(request: TextRequest) -> StreamingResponse:
818
  text,
819
  _tts_engine,
820
  language,
821
- doc_name="Pasted Text",
 
822
  doc_type="text",
823
  ),
824
  media_type="text/event-stream",
@@ -846,6 +989,7 @@ async def read_url_stream(request: UrlRequest) -> StreamingResponse:
846
 
847
  url = request.url.strip()
848
  language = request.language if request.language in LANGUAGE_VOICES else "english"
 
849
 
850
  if not url:
851
  raise HTTPException(status_code=400, detail="URL is required")
@@ -922,6 +1066,7 @@ async def read_url_stream(request: UrlRequest) -> StreamingResponse:
922
  text,
923
  _tts_engine,
924
  language,
 
925
  doc_name=doc_name,
926
  doc_type="pdf" if is_pdf else "url",
927
  page_count=page_count,
 
11
  import uuid
12
  from pathlib import Path
13
  from typing import TYPE_CHECKING
14
+ from urllib.parse import quote, urlparse
15
 
16
  import httpx
17
  import trafilatura
18
  from fastapi import FastAPI, File, Form, HTTPException, Request, UploadFile
19
+ from fastapi.responses import HTMLResponse, Response, StreamingResponse
20
  from fastapi.staticfiles import StaticFiles
21
  from pydantic import BaseModel
22
 
 
24
  from talking_snake.tts import (
25
  DEFAULT_CHUNK_SIZE,
26
  LANGUAGE_VOICES,
27
+ TTS_STYLES,
28
  MockTTSEngine,
29
  TTSEngineProtocol,
30
  )
 
53
  def __init__(self, job_id: str):
54
  self.job_id = job_id
55
  self.audio_queue: queue.Queue[bytes | None] = queue.Queue()
56
+ self.audio_cache: list[bytes] = [] # Cache PCM chunks for replay/download
57
  self.started = time.time()
58
  self.completed = False
59
+ self.stream_started = False # Track if live stream has started
60
  self.error: str | None = None
61
  self.sample_rate = 24000 # Default, will be set by TTS engine
62
  self.header_sent = False
63
+ self._total_pcm_bytes = 0 # Track total audio bytes for duration calc
64
+
65
+ @property
66
+ def audio_duration(self) -> float:
67
+ """Calculate audio duration in seconds from cached PCM data."""
68
+ # 16-bit mono audio: duration = bytes / (sample_rate * 2)
69
+ return self._total_pcm_bytes / (self.sample_rate * 2)
70
 
71
  def put_audio(self, audio_bytes: bytes) -> None:
72
+ """Add audio data to the queue and cache."""
73
  self.audio_queue.put(audio_bytes)
74
+ # Cache the PCM data (strip WAV header if present)
75
+ if audio_bytes[:4] == b"RIFF":
76
+ pcm_data = audio_bytes[44:]
77
+ else:
78
+ pcm_data = audio_bytes
79
+ self.audio_cache.append(pcm_data)
80
+ self._total_pcm_bytes += len(pcm_data)
81
 
82
  def finish(self) -> None:
83
  """Signal that audio generation is complete."""
 
134
 
135
  url: str
136
  language: str = "english"
137
+ style: str = "technical"
138
 
139
 
140
  class TextRequest(BaseModel):
 
142
 
143
  text: str
144
  language: str = "english"
145
+ style: str = "technical"
146
 
147
 
148
  class EstimateResponse(BaseModel):
 
189
  app.add_api_route("/api/read-url-stream", read_url_stream, methods=["POST"])
190
  app.add_api_route("/api/read-text-stream", read_text_stream, methods=["POST"])
191
  app.add_api_route("/api/audio/{job_id}", stream_audio, methods=["GET"])
192
+ app.add_api_route("/api/download/{job_id}", download_audio, methods=["GET"])
193
  app.add_api_route("/api/languages", get_languages, methods=["GET"])
194
  app.add_api_route("/api/device-info-stream", stream_device_info, methods=["GET"])
195
  app.add_api_route("/api/health", health_check, methods=["GET"])
 
409
  Returns:
410
  Device type, memory usage, and model info.
411
  """
412
+ import shutil
413
+
414
+ import psutil
415
  import torch
416
 
417
  info = {
 
421
  "memory_total_gb": 0,
422
  "memory_percent": 0,
423
  "batch_size": 1,
424
+ "ram_used_gb": 0,
425
+ "ram_total_gb": 0,
426
+ "disk_free_gb": 0,
427
  }
428
 
429
+ # Get RAM info
430
+ ram = psutil.virtual_memory()
431
+ info["ram_used_gb"] = round(ram.used / 1024**3, 1)
432
+ info["ram_total_gb"] = round(ram.total / 1024**3, 1)
433
+
434
+ # Get disk free space
435
+ disk = shutil.disk_usage("/")
436
+ info["disk_free_gb"] = round(disk.free / 1024**3, 1)
437
+
438
  if torch.cuda.is_available():
439
  props = torch.cuda.get_device_properties(0)
440
  # Use reserved memory for more accurate GPU usage (includes PyTorch cache)
 
456
  if _tts_engine is not None:
457
  info["batch_size"] = getattr(_tts_engine, "batch_size", 1)
458
  info["chunk_size"] = getattr(_tts_engine, "chunk_size", 800)
459
+ # Include model state
460
+ info["model_state"] = getattr(_tts_engine, "model_state", "unknown")
461
+ # Include timing stats
462
+ seconds_per_char = getattr(_tts_engine, "seconds_per_char", None)
463
+ if seconds_per_char is not None:
464
+ info["seconds_per_char"] = round(seconds_per_char, 4)
465
+ total_chars = getattr(_tts_engine, "total_chars_processed", 0)
466
+ if total_chars > 0:
467
+ info["total_chars_processed"] = total_chars
468
 
469
  return info
470
 
 
505
  )
506
 
507
 
508
+ def _estimate_time(text: str, seconds_per_char: float | None = None) -> tuple[int, float]:
 
 
509
  """Estimate processing time for text.
510
 
511
  Args:
 
515
  Returns:
516
  Tuple of (chunk_count, estimated_seconds).
517
  """
518
+ if seconds_per_char is None:
519
+ seconds_per_char = INITIAL_SECONDS_PER_CHAR
520
  # Count chunks (500 chars per chunk approximately)
521
  chunk_count = max(1, len(text) // 500 + (1 if len(text) % 500 else 0))
522
  estimated_seconds = len(text) * seconds_per_char
 
565
  text: str,
566
  tts_engine: TTSEngineProtocol,
567
  language: str = "english",
568
+ style: str = "technical",
569
  doc_name: str = "document",
570
  doc_type: str = "text",
571
  page_count: int | None = None,
 
581
  text: Text to synthesize.
582
  tts_engine: TTS engine to use.
583
  language: Language for TTS (english, chinese, japanese, korean).
584
+ style: TTS style (technical, narrative, news, casual, academic).
585
  doc_name: Name of the document being processed.
586
  doc_type: Type of document (pdf, url, text).
587
  page_count: Number of pages (for PDFs).
 
 
588
 
589
  Yields:
590
  SSE events for progress.
 
595
  if hasattr(tts_engine, "set_language"):
596
  tts_engine.set_language(language)
597
 
598
+ # Apply style if the engine supports it
599
+ if hasattr(tts_engine, "set_style"):
600
+ tts_engine.set_style(style)
601
+
602
  # Get chunk size and batch size from engine
603
  chunk_size = getattr(tts_engine, "chunk_size", DEFAULT_CHUNK_SIZE)
604
  batch_size = getattr(tts_engine, "batch_size", 1)
 
626
  total_chunks = len(chunks) if chunks else 1
627
  total_chars = sum(len(c) for c in chunks)
628
 
629
+ # Use calibrated estimate if available, otherwise initial estimate
630
+ seconds_per_char = getattr(tts_engine, "seconds_per_char", None) or INITIAL_SECONDS_PER_CHAR
631
+
632
+ # Account for batch efficiency: processing N chunks in parallel is ~N times faster
633
+ # The efficiency isn't perfectly linear, so use a conservative factor of sqrt(batch_size)
634
+ batch_efficiency = batch_size**0.5 if batch_size > 1 else 1.0
635
+ estimated_total = (total_chars * seconds_per_char) / batch_efficiency
636
 
637
  # Send initial progress event with job_id and batch info
638
  progress_data = {
 
641
  "current": 0,
642
  "total": total_chunks,
643
  "percent": 0,
644
+ "estimated_remaining": round(estimated_total, 1),
645
  "batch_size": batch_size,
646
  "doc_name": doc_name,
647
  "doc_type": doc_type,
 
700
  # Signal audio generation complete
701
  job.finish()
702
 
703
+ # Send completion event with actual audio duration
704
  total_time = time.time() - start_time
705
  complete_data = {
706
  "type": "complete",
707
  "total_time": round(total_time, 1),
708
  "chunks_processed": chunks_processed,
709
  "batch_size": batch_size,
710
+ "audio_duration": round(job.audio_duration, 2),
711
  }
712
  yield f"event: complete\ndata: {json.dumps(complete_data)}\n\n".encode()
713
 
 
717
 
718
  This endpoint streams the raw WAV audio as it's being generated.
719
  The browser can start playing as soon as data arrives.
720
+ First request streams live; subsequent requests return cached audio.
721
 
722
  Args:
723
  job_id: The job ID to stream audio for.
 
729
  if job is None:
730
  raise HTTPException(status_code=404, detail="Job not found")
731
 
732
+ def generate_audio_live() -> Iterator[bytes]:
733
+ """Stream audio live from queue (first request)."""
734
+ job.stream_started = True
735
  # Send WAV header first
736
  yield _create_wav_header(sample_rate=24000)
737
 
 
745
  break
746
  # Skip WAV headers from individual chunks, only send raw PCM
747
  if audio_data[:4] == b"RIFF":
 
 
748
  yield audio_data[44:]
749
  else:
750
  yield audio_data
 
752
  # Timeout waiting for data
753
  break
754
 
755
+ def generate_audio_cached() -> Iterator[bytes]:
756
+ """Stream audio from cache (subsequent requests)."""
757
+ # Send WAV header first
758
+ yield _create_wav_header(sample_rate=24000)
759
+ # Send all cached chunks
760
+ yield from job.audio_cache
761
+
762
+ # Use live stream for first request, cached for subsequent
763
+ if not job.stream_started:
764
+ generator = generate_audio_live()
765
+ else:
766
+ generator = generate_audio_cached()
767
 
768
  return StreamingResponse(
769
+ generator,
770
  media_type="audio/wav",
771
  headers={
772
  "Cache-Control": "no-cache",
 
775
  )
776
 
777
 
778
+ async def download_audio(job_id: str, filename: str = "audio.wav") -> Response:
779
+ """Download complete audio file for a job.
780
+
781
+ This endpoint returns the full WAV file with correct headers for download.
782
+ Only works after generation is complete.
783
+
784
+ Args:
785
+ job_id: The job ID to download audio for.
786
+ filename: Suggested filename for download.
787
+
788
+ Returns:
789
+ Complete WAV audio file response.
790
+ """
791
+ job = _job_manager.get_job(job_id)
792
+ if job is None:
793
+ raise HTTPException(status_code=404, detail="Job not found")
794
+
795
+ if not job.audio_cache:
796
+ raise HTTPException(status_code=404, detail="No audio available")
797
+
798
+ # Combine all cached audio data
799
+ audio_data = b"".join(job.audio_cache)
800
+
801
+ # Create proper WAV header with actual size
802
+ sample_rate = 24000
803
+ bits_per_sample = 16
804
+ channels = 1
805
+ byte_rate = sample_rate * channels * bits_per_sample // 8
806
+ block_align = channels * bits_per_sample // 8
807
+ data_size = len(audio_data)
808
+ file_size = data_size + 36 # Header is 44 bytes, minus 8 for RIFF header
809
+
810
+ header = io.BytesIO()
811
+ header.write(b"RIFF")
812
+ header.write(struct.pack("<I", file_size))
813
+ header.write(b"WAVE")
814
+ header.write(b"fmt ")
815
+ header.write(struct.pack("<I", 16)) # fmt chunk size
816
+ header.write(struct.pack("<H", 1)) # PCM format
817
+ header.write(struct.pack("<H", channels))
818
+ header.write(struct.pack("<I", sample_rate))
819
+ header.write(struct.pack("<I", byte_rate))
820
+ header.write(struct.pack("<H", block_align))
821
+ header.write(struct.pack("<H", bits_per_sample))
822
+ header.write(b"data")
823
+ header.write(struct.pack("<I", data_size))
824
+
825
+ wav_data = header.getvalue() + audio_data
826
+
827
+ # RFC 5987 encoding for non-ASCII filenames
828
+ # Use ASCII-safe fallback + UTF-8 encoded filename*
829
+ safe_filename = filename.encode("ascii", "replace").decode("ascii")
830
+ encoded_filename = quote(filename, safe="")
831
+
832
+ return Response(
833
+ content=wav_data,
834
+ media_type="audio/wav",
835
+ headers={
836
+ "Content-Disposition": (
837
+ f"attachment; filename=\"{safe_filename}\"; filename*=UTF-8''{encoded_filename}"
838
+ ),
839
+ "Content-Length": str(len(wav_data)),
840
+ },
841
+ )
842
+
843
+
844
  async def read_pdf_stream(
845
  file: UploadFile = File(...),
846
  language: str = Form("english"),
847
+ style: str = Form("technical"),
848
  ) -> StreamingResponse:
849
  """Read a PDF with streaming progress updates.
850
 
 
853
  Args:
854
  file: Uploaded PDF file.
855
  language: Language for TTS (english, chinese, japanese, korean).
856
+ style: TTS style (technical, narrative, news, casual, academic).
857
 
858
  Returns:
859
  Streaming response with progress events including job_id.
 
899
  text,
900
  _tts_engine,
901
  language,
902
+ style,
903
  doc_name=file.filename or "document.pdf",
904
  doc_type="pdf",
905
  page_count=page_count,
 
929
 
930
  text = request.text.strip()
931
  language = request.language if request.language in LANGUAGE_VOICES else "english"
932
+ style = request.style if request.style in TTS_STYLES else "technical"
933
 
934
  if not text:
935
  raise HTTPException(status_code=400, detail="Text is required")
 
943
  if not text.strip():
944
  raise HTTPException(status_code=400, detail="No readable text provided")
945
 
946
+ # Generate doc name from first few words
947
+ words = text.split()[:5]
948
+ doc_name = " ".join(words)
949
+ if len(doc_name) > 30:
950
+ doc_name = doc_name[:30] + "..."
951
+ elif len(words) == 5:
952
+ doc_name = doc_name + "..."
953
+
954
  # Create a job for this request
955
  job = _job_manager.create_job()
956
 
 
960
  text,
961
  _tts_engine,
962
  language,
963
+ style,
964
+ doc_name=doc_name,
965
  doc_type="text",
966
  ),
967
  media_type="text/event-stream",
 
989
 
990
  url = request.url.strip()
991
  language = request.language if request.language in LANGUAGE_VOICES else "english"
992
+ style = request.style if request.style in TTS_STYLES else "technical"
993
 
994
  if not url:
995
  raise HTTPException(status_code=400, detail="URL is required")
 
1066
  text,
1067
  _tts_engine,
1068
  language,
1069
+ style,
1070
  doc_name=doc_name,
1071
  doc_type="pdf" if is_pdf else "url",
1072
  page_count=page_count,
src/talking_snake/extract.py CHANGED
@@ -8,7 +8,14 @@ from collections import Counter
8
  from dataclasses import dataclass
9
 
10
  from pdfminer.high_level import extract_pages
11
- from pdfminer.layout import LAParams, LTChar, LTPage, LTTextBoxHorizontal, LTTextLineHorizontal
 
 
 
 
 
 
 
12
 
13
 
14
  @dataclass
@@ -19,6 +26,260 @@ class TextBlock:
19
  y_ratio: float # 0.0 = bottom, 1.0 = top
20
  font_size: float
21
  page_num: int
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
 
24
  def extract_text_blocks(pdf_bytes: bytes) -> list[TextBlock]:
@@ -50,21 +311,52 @@ def extract_text_blocks(pdf_bytes: bytes) -> list[TextBlock]:
50
  if not isinstance(element, LTTextBoxHorizontal):
51
  continue
52
 
53
- text = element.get_text().strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  if not text:
55
  continue
56
 
57
  # Calculate Y position as ratio (0=bottom, 1=top)
58
  y_ratio = element.y0 / page_height if page_height > 0 else 0.5
59
 
60
- # Extract average font size from characters
61
- font_sizes: list[float] = []
62
- for line in element:
63
- if isinstance(line, LTTextLineHorizontal):
64
- for char in line:
65
- if isinstance(char, LTChar):
66
- font_sizes.append(char.size)
67
-
68
  avg_font_size = sum(font_sizes) / len(font_sizes) if font_sizes else 10.0
69
 
70
  blocks.append(
@@ -107,6 +399,9 @@ def extract_text(pdf_bytes: bytes) -> str:
107
  if not blocks:
108
  return ""
109
 
 
 
 
110
  cleaned_blocks = clean_text_blocks(blocks)
111
  text = "\n\n".join(block.text for block in cleaned_blocks)
112
 
@@ -161,6 +456,10 @@ def clean_text_blocks(blocks: list[TextBlock]) -> list[TextBlock]:
161
  if is_page_number(block.text):
162
  continue
163
 
 
 
 
 
164
  # Skip very short lines with small font (likely captions/footnotes)
165
  if len(block.text) < 20 and block.font_size < median_font_size * 0.8:
166
  continue
@@ -280,6 +579,96 @@ def normalize_for_tts(text: str) -> str:
280
  Returns:
281
  Normalized text optimized for TTS.
282
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
283
  # === CODE AND TECHNICAL CONTENT ===
284
  # Handle common programming patterns that read poorly
285
 
@@ -357,10 +746,37 @@ def normalize_for_tts(text: str) -> str:
357
  text = text.replace("'''", "")
358
 
359
  # === UNICODE NORMALIZATION ===
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
360
  # Convert smart quotes to simple quotes
361
- text = text.replace(""", '"').replace(""", '"')
362
- text = text.replace("'", "'").replace("'", "'")
363
- text = text.replace("", '"').replace("", '"')
364
 
365
  # Normalize dashes to standard hyphen or remove
366
  text = text.replace("–", "-") # en-dash
@@ -471,19 +887,32 @@ def normalize_for_tts(text: str) -> str:
471
  # Remove content in angle brackets (often HTML/XML artifacts)
472
  text = re.sub(r"<[^>]+>", "", text)
473
 
474
- # Normalize multiple spaces
475
- text = re.sub(r"[ \t]+", " ", text)
476
-
477
  # Remove spaces before punctuation
478
  text = re.sub(r"\s+([.,;:!?])", r"\1", text)
479
 
480
  # Ensure space after punctuation (but not before another punctuation)
481
  text = re.sub(r"([.,;:!?])([^\s.,;:!?'\"])", r"\1 \2", text)
482
 
483
- # Remove leading/trailing whitespace from lines
484
- text = "\n".join(line.strip() for line in text.split("\n"))
485
 
486
- # Remove empty lines that resulted from cleaning
 
 
 
 
 
 
 
 
 
 
 
 
 
487
  text = re.sub(r"\n{3,}", "\n\n", text)
488
 
 
 
 
489
  return text
 
8
  from dataclasses import dataclass
9
 
10
  from pdfminer.high_level import extract_pages
11
+ from pdfminer.layout import (
12
+ LAParams,
13
+ LTAnno,
14
+ LTChar,
15
+ LTPage,
16
+ LTTextBoxHorizontal,
17
+ LTTextLineHorizontal,
18
+ )
19
 
20
 
21
  @dataclass
 
26
  y_ratio: float # 0.0 = bottom, 1.0 = top
27
  font_size: float
28
  page_num: int
29
+ x0: float = 0.0 # Left edge position for table detection
30
+ x1: float = 0.0 # Right edge position for table detection
31
+
32
+
33
+ def _is_caption(text: str) -> bool:
34
+ """Check if text is a figure/table caption.
35
+
36
+ Captions typically start with:
37
+ - "Figure 1:", "Fig. 2:", "Figure 1."
38
+ - "Table 1:", "Table 2."
39
+ - "Exhibit A:", "Chart 1:"
40
+ - "Source:", "Note:", "Notes:"
41
+
42
+ Args:
43
+ text: Text to check.
44
+
45
+ Returns:
46
+ True if text appears to be a caption.
47
+ """
48
+ text = text.strip()
49
+ if not text:
50
+ return False
51
+
52
+ # Common caption patterns (case-insensitive start)
53
+ caption_patterns = [
54
+ r"^fig(?:ure)?\.?\s*\d",
55
+ r"^table\.?\s*\d",
56
+ r"^exhibit\.?\s*[a-z0-9]",
57
+ r"^chart\.?\s*\d",
58
+ r"^graph\.?\s*\d",
59
+ r"^diagram\.?\s*\d",
60
+ r"^plate\.?\s*\d",
61
+ r"^scheme\.?\s*\d",
62
+ r"^box\.?\s*\d",
63
+ r"^panel\.?\s*[a-z0-9]",
64
+ r"^appendix\.?\s*[a-z0-9]",
65
+ r"^source\s*:",
66
+ r"^sources\s*:",
67
+ r"^note\s*:",
68
+ r"^notes\s*:",
69
+ r"^data\s*:",
70
+ r"^\*\s*p\s*[<>=]", # Statistical notes like "* p < 0.05"
71
+ r"^legend\s*:",
72
+ ]
73
+
74
+ text_lower = text.lower()
75
+ for pattern in caption_patterns:
76
+ if re.match(pattern, text_lower):
77
+ return True
78
+
79
+ return False
80
+
81
+
82
+ def _is_table_like_text(text: str) -> bool:
83
+ """Check if text looks like table content.
84
+
85
+ Tables often have:
86
+ - Very short text fragments
87
+ - Mostly numbers or single words
88
+ - Lots of whitespace-separated values
89
+ - Column headers or row labels
90
+ - Short phrases without sentence structure
91
+
92
+ Args:
93
+ text: Text to check.
94
+
95
+ Returns:
96
+ True if the text appears to be table content.
97
+ """
98
+ text = text.strip()
99
+
100
+ # Very short fragments are likely table cells
101
+ if len(text) < 5:
102
+ return True
103
+
104
+ # Count numbers vs letters
105
+ digits = sum(1 for c in text if c.isdigit())
106
+ letters = sum(1 for c in text if c.isalpha())
107
+
108
+ # Mostly numbers with few letters (like "123.45" or "2024")
109
+ if digits > 0 and letters < 3 and digits >= letters:
110
+ return True
111
+
112
+ # Check for patterns common in tables
113
+ # Multiple tab-separated or heavily spaced values
114
+ if "\t" in text or " " in text:
115
+ parts = re.split(r"\s{2,}|\t", text)
116
+ if len(parts) >= 3:
117
+ # Multiple short parts suggests table row
118
+ short_parts = sum(1 for p in parts if len(p.strip()) < 15)
119
+ if short_parts >= len(parts) * 0.6:
120
+ return True
121
+
122
+ # Single words that look like column headers
123
+ words = text.split()
124
+ if len(words) == 1 and len(text) < 20:
125
+ # Common table headers/labels
126
+ table_keywords = {
127
+ "total",
128
+ "sum",
129
+ "avg",
130
+ "average",
131
+ "mean",
132
+ "count",
133
+ "min",
134
+ "max",
135
+ "date",
136
+ "time",
137
+ "year",
138
+ "month",
139
+ "day",
140
+ "name",
141
+ "id",
142
+ "no",
143
+ "no.",
144
+ "value",
145
+ "amount",
146
+ "price",
147
+ "cost",
148
+ "qty",
149
+ "quantity",
150
+ "unit",
151
+ "row",
152
+ "column",
153
+ "col",
154
+ "item",
155
+ "description",
156
+ "desc",
157
+ "note",
158
+ "status",
159
+ "type",
160
+ "category",
161
+ "code",
162
+ "ref",
163
+ "reference",
164
+ }
165
+ if text.lower() in table_keywords:
166
+ return True
167
+
168
+ # Short phrases without sentence structure (likely table cells)
169
+ # Table cells typically:
170
+ # - Are short (< 50 chars)
171
+ # - Don't end with sentence-ending punctuation
172
+ # - Don't start with lowercase (unless very short)
173
+ # - Have few words (< 8)
174
+ if len(text) < 50 and len(words) < 8:
175
+ # Doesn't end like a sentence
176
+ if not text.rstrip().endswith((".", "!", "?", ":")):
177
+ # Common table cell patterns
178
+ text_lower = text.lower()
179
+
180
+ # Technical/status phrases common in tables
181
+ table_phrases = [
182
+ "supported",
183
+ "not supported",
184
+ "yes",
185
+ "no",
186
+ "n/a",
187
+ "none",
188
+ "required",
189
+ "optional",
190
+ "enabled",
191
+ "disabled",
192
+ "active",
193
+ "inactive",
194
+ "read-only",
195
+ "read only",
196
+ "write",
197
+ "read/write",
198
+ "read-write",
199
+ "must be",
200
+ "can be",
201
+ "should be",
202
+ "will be",
203
+ "available",
204
+ "unavailable",
205
+ "pending",
206
+ "completed",
207
+ "failed",
208
+ "true",
209
+ "false",
210
+ "default",
211
+ "custom",
212
+ "manual",
213
+ "automatic",
214
+ "identical",
215
+ "different",
216
+ "same",
217
+ "other",
218
+ ]
219
+ for phrase in table_phrases:
220
+ if phrase in text_lower:
221
+ return True
222
+
223
+ # Looks like a label or header (Title Case or ALL CAPS, short)
224
+ if len(words) <= 4 and len(text) < 40:
225
+ # Check if it's Title Case or contains common label patterns
226
+ if text.istitle() or text.isupper():
227
+ return True
228
+ # Two-three word phrases that look like labels
229
+ if len(words) in (2, 3) and all(w[0].isupper() for w in words if w):
230
+ return True
231
+
232
+ return False
233
+
234
+
235
+ def _filter_table_blocks(blocks: list[TextBlock]) -> list[TextBlock]:
236
+ """Filter out blocks that appear to be part of tables.
237
+
238
+ Detects tables by looking for:
239
+ - Multiple blocks at similar Y positions (table rows)
240
+ - Blocks with table-like content
241
+
242
+ Args:
243
+ blocks: List of text blocks.
244
+
245
+ Returns:
246
+ Filtered list with table content removed.
247
+ """
248
+ if not blocks:
249
+ return blocks
250
+
251
+ # Group blocks by page and approximate Y position (row detection)
252
+ # Blocks within 1% of page height are considered same row
253
+ filtered = []
254
+
255
+ for page_num in set(b.page_num for b in blocks):
256
+ page_blocks = [b for b in blocks if b.page_num == page_num]
257
+
258
+ # Group by Y position (rounded to detect rows)
259
+ y_groups: dict[float, list[TextBlock]] = {}
260
+ for block in page_blocks:
261
+ y_key = round(block.y_ratio, 2) # Group within ~1% of page
262
+ if y_key not in y_groups:
263
+ y_groups[y_key] = []
264
+ y_groups[y_key].append(block)
265
+
266
+ for y_key, row_blocks in y_groups.items():
267
+ # If many blocks at same Y position, likely a table row
268
+ if len(row_blocks) >= 3:
269
+ # Check if most blocks look like table cells
270
+ table_like = sum(1 for b in row_blocks if _is_table_like_text(b.text))
271
+ if table_like >= len(row_blocks) * 0.5:
272
+ # Skip this entire row - it's a table
273
+ continue
274
+
275
+ # Filter individual blocks that look like table content
276
+ for block in row_blocks:
277
+ if not _is_table_like_text(block.text):
278
+ filtered.append(block)
279
+
280
+ # Sort by page and position (top to bottom)
281
+ filtered.sort(key=lambda b: (b.page_num, -b.y_ratio))
282
+ return filtered
283
 
284
 
285
  def extract_text_blocks(pdf_bytes: bytes) -> list[TextBlock]:
 
311
  if not isinstance(element, LTTextBoxHorizontal):
312
  continue
313
 
314
+ # Extract characters with their font sizes
315
+ # LTChar has font size, LTAnno is whitespace (use size=-1 to always keep)
316
+ chars_with_sizes: list[tuple[str, float]] = []
317
+ for line in element:
318
+ if isinstance(line, LTTextLineHorizontal):
319
+ for char in line:
320
+ if isinstance(char, LTChar):
321
+ chars_with_sizes.append((char.get_text(), char.size))
322
+ elif isinstance(char, LTAnno):
323
+ # Whitespace/newlines - always keep (use -1 as marker)
324
+ chars_with_sizes.append((char.get_text(), -1))
325
+
326
+ if not chars_with_sizes:
327
+ text = element.get_text().strip()
328
+ if text:
329
+ blocks.append(
330
+ TextBlock(
331
+ text=text,
332
+ y_ratio=element.y0 / page_height if page_height > 0 else 0.5,
333
+ font_size=10.0,
334
+ page_num=page_num,
335
+ )
336
+ )
337
+ continue
338
+
339
+ # Find dominant font size (most common, excluding whitespace markers)
340
+ font_sizes = [size for _, size in chars_with_sizes if size > 0]
341
+ if not font_sizes:
342
+ continue
343
+ size_counts = Counter(round(s, 1) for s in font_sizes)
344
+ dominant_size = max(size_counts, key=lambda x: size_counts[x])
345
+
346
+ # Filter out superscript/subscript characters (< 70% of dominant size)
347
+ # Keep whitespace (size=-1) and normal-sized characters
348
+ min_size = dominant_size * 0.7
349
+ filtered_text = "".join(
350
+ char for char, size in chars_with_sizes if size < 0 or size >= min_size
351
+ )
352
+
353
+ text = filtered_text.strip()
354
  if not text:
355
  continue
356
 
357
  # Calculate Y position as ratio (0=bottom, 1=top)
358
  y_ratio = element.y0 / page_height if page_height > 0 else 0.5
359
 
 
 
 
 
 
 
 
 
360
  avg_font_size = sum(font_sizes) / len(font_sizes) if font_sizes else 10.0
361
 
362
  blocks.append(
 
399
  if not blocks:
400
  return ""
401
 
402
+ # Filter out table content first
403
+ blocks = _filter_table_blocks(blocks)
404
+
405
  cleaned_blocks = clean_text_blocks(blocks)
406
  text = "\n\n".join(block.text for block in cleaned_blocks)
407
 
 
456
  if is_page_number(block.text):
457
  continue
458
 
459
+ # Skip figure/table captions
460
+ if _is_caption(block.text):
461
+ continue
462
+
463
  # Skip very short lines with small font (likely captions/footnotes)
464
  if len(block.text) < 20 and block.font_size < median_font_size * 0.8:
465
  continue
 
579
  Returns:
580
  Normalized text optimized for TTS.
581
  """
582
+ # === REMOVE ACADEMIC/PAPER ARTIFACTS ===
583
+ # Remove inline citations like (Smith et al., 2020) or (Smith, 2020; Jones, 2019)
584
+ # Also handles (Chen, 2018; Lee et al., 2020)
585
+ text = re.sub(r"\([^()]*\b\d{4}[a-z]?\b[^()]*\)", "", text)
586
+
587
+ # Remove author-year citations like "Smith (2020)" or "Smith et al. (2020)"
588
+ text = re.sub(
589
+ r"\b[A-Z][a-z]+(?:\s+(?:et\s+al\.?|and|&)\s+[A-Z][a-z]+)?\s*\(\d{4}[a-z]?\)", "", text
590
+ )
591
+
592
+ # Clean up "by [Author]" patterns - remove the author part, keep "by" for grammar
593
+ # "by Smith" -> "" (will be cleaned up), "study by Smith found" -> "study found"
594
+ text = re.sub(
595
+ r"\bby\s+[A-Z][a-z]+(?:\s+(?:et\s+al\.?|and|&)\s+[A-Z][a-z]+)?\s*,?\s*(?=found|showed|demonstrated|reported|observed|noted|suggested|concluded|argued|claimed|stated|proposed|discovered|revealed|indicated|confirmed)",
596
+ "",
597
+ text,
598
+ )
599
+
600
+ # Remove orphaned "et al." and similar
601
+ text = re.sub(r"\s+et\s+al\.?,?\s*", " ", text)
602
+
603
+ # Remove figure/table references like "see Figure 1" or "(see Table 2)"
604
+ text = re.sub(
605
+ r"\(?see\s+(?:Figure|Fig\.?|Table|Exhibit|Chart|Graph|Appendix)\s*\d+[a-z]?\)?",
606
+ "",
607
+ text,
608
+ flags=re.IGNORECASE,
609
+ )
610
+
611
+ # Remove standalone figure/table references like "Figure 1 shows" -> "shows"
612
+ text = re.sub(
613
+ r"(?:Figure|Fig\.?|Table|Exhibit|Chart|Graph)\s*\d+[a-z]?\s*(?:shows?|depicts?|illustrates?|presents?|displays?|summarizes?)",
614
+ "",
615
+ text,
616
+ flags=re.IGNORECASE,
617
+ )
618
+
619
+ # Remove section references like "Section 2.1" or "Chapter 3" (with surrounding context)
620
+ text = re.sub(
621
+ r"(?:in|see|as\s+(?:shown|described|discussed)\s+in|according\s+to)\s+(?:Section|Chapter|Part)\s*\d+(?:\.\d+)*,?\s*",
622
+ "",
623
+ text,
624
+ flags=re.IGNORECASE,
625
+ )
626
+ text = re.sub(r"(?:Section|Chapter|Part)\s*\d+(?:\.\d+)*", "", text, flags=re.IGNORECASE)
627
+
628
+ # Remove equation references like "Equation 1" or "Eq. (2)"
629
+ text = re.sub(r"(?:Equation|Eq\.?)\s*\(?\d+\)?", "", text, flags=re.IGNORECASE)
630
+
631
+ # Remove DOIs
632
+ text = re.sub(r"(?:doi:|DOI:?)\s*10\.\d{4,}/[^\s]+", "", text, flags=re.IGNORECASE)
633
+
634
+ # Remove arXiv references
635
+ text = re.sub(r"arXiv:\d{4}\.\d{4,}(?:v\d+)?", "", text, flags=re.IGNORECASE)
636
+
637
+ # Remove ISSN/ISBN numbers
638
+ text = re.sub(r"(?:ISSN|ISBN)[:\s]*[\d-]+", "", text, flags=re.IGNORECASE)
639
+
640
+ # Remove page ranges like "pp. 123-456" or "p. 42" or "pages 10-20"
641
+ text = re.sub(r"(?:p{1,2}\.?|pages?)\s*\d+(?:\s*[-–—]\s*\d+)?", "", text, flags=re.IGNORECASE)
642
+
643
+ # Remove volume/issue numbers like "Vol. 12, No. 3" (entire phrase)
644
+ text = re.sub(
645
+ r"(?:Vol(?:ume)?\.?\s*\d+,?\s*)?(?:Issue|No\.?)\s*\d+,?\s*", "", text, flags=re.IGNORECASE
646
+ )
647
+ text = re.sub(r"Vol(?:ume)?\.?\s*\d+,?\s*", "", text, flags=re.IGNORECASE)
648
+
649
+ # Remove copyright notices
650
+ text = re.sub(r"©\s*\d{4}[^.]*\.", "", text)
651
+ text = re.sub(r"Copyright\s*©?\s*\d{4}[^.]*\.", "", text, flags=re.IGNORECASE)
652
+
653
+ # Remove "All rights reserved" and similar
654
+ text = re.sub(r"All rights reserved\.?", "", text, flags=re.IGNORECASE)
655
+
656
+ # Remove asterisks used for footnote markers
657
+ text = re.sub(r"\*{1,3}(?=\s|$)", "", text)
658
+
659
+ # === NORMALIZE NEWLINES FIRST ===
660
+ # Convert various newline formats to standard \n
661
+ text = text.replace("\r\n", "\n").replace("\r", "\n")
662
+
663
+ # Replace single newlines (mid-sentence line breaks) with spaces
664
+ # Keep double newlines as paragraph separators
665
+ # First, normalize multiple newlines to exactly two
666
+ text = re.sub(r"\n{3,}", "\n\n", text)
667
+
668
+ # Replace single newlines that aren't paragraph breaks with spaces
669
+ # A single newline not preceded by sentence-ending punctuation is likely a line wrap
670
+ text = re.sub(r"(?<![.!?:\n])\n(?!\n)", " ", text)
671
+
672
  # === CODE AND TECHNICAL CONTENT ===
673
  # Handle common programming patterns that read poorly
674
 
 
746
  text = text.replace("'''", "")
747
 
748
  # === UNICODE NORMALIZATION ===
749
+
750
+ # Remove superscript characters (often footnote references)
751
+ # Includes Unicode superscript digits, letters, and modifier letters
752
+ superscripts = (
753
+ "⁰¹²³⁴⁵⁶⁷⁸⁹" # Superscript digits
754
+ "⁺⁻⁼⁽⁾" # Superscript operators
755
+ "ⁿⁱ" # Common superscript letters
756
+ "ᵃᵇᶜᵈᵉᶠᵍʰⁱʲᵏˡᵐⁿᵒᵖʳˢᵗᵘᵛʷˣʸᶻ" # Superscript lowercase
757
+ "ᴬᴮᴰᴱᴳᴴᴵᴶᴷᴸᴹᴺᴼᴾᴿᵀᵁⱽᵂᴬᴭᴮᴯᴰᴱᴲᴳᴴᴵᴶᴷᴸᴹᴺᴻᴼᴽᴾᴿᵀᵁᵂ" # Superscript uppercase
758
+ "ᶦᶧᶨᶩᶪᶫᶬᶭᶮᶯᶰᶱᶲᶳᶴᶵᶶᶷᶸᶹᶺᶻᶼᶽᶾᶿ" # More modifier letters
759
+ "ʰʱʲʳʴʵʶʷʸʹʺʻʼʽˀˁˆˇˈˉˊˋˌˍˎˏːˑ" # Modifier letters
760
+ )
761
+ for char in superscripts:
762
+ text = text.replace(char, "")
763
+
764
+ # Also use regex to catch any remaining superscript-like characters
765
+ # Unicode categories for superscripts and modifiers
766
+ text = re.sub(r"[\u2070-\u209F]", "", text) # Superscripts and Subscripts block
767
+ text = re.sub(r"[\u1D2C-\u1D6A]", "", text) # Phonetic Extensions (modifier letters)
768
+ text = re.sub(r"[\u1D78-\u1D7F]", "", text) # More phonetic extensions
769
+ text = re.sub(r"[\u02B0-\u02FF]", "", text) # Spacing Modifier Letters
770
+
771
+ # Remove subscript characters
772
+ subscripts = "₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎ₐₑₒₓₔₕₖₗₘₙₚₛₜ"
773
+ for char in subscripts:
774
+ text = text.replace(char, "")
775
+
776
  # Convert smart quotes to simple quotes
777
+ text = text.replace("\u201c", '"').replace("\u201d", '"')
778
+ text = text.replace("\u2018", "'").replace("\u2019", "'")
779
+ text = text.replace("\u201e", '"').replace("\u201f", '"')
780
 
781
  # Normalize dashes to standard hyphen or remove
782
  text = text.replace("–", "-") # en-dash
 
887
  # Remove content in angle brackets (often HTML/XML artifacts)
888
  text = re.sub(r"<[^>]+>", "", text)
889
 
 
 
 
890
  # Remove spaces before punctuation
891
  text = re.sub(r"\s+([.,;:!?])", r"\1", text)
892
 
893
  # Ensure space after punctuation (but not before another punctuation)
894
  text = re.sub(r"([.,;:!?])([^\s.,;:!?'\"])", r"\1 \2", text)
895
 
896
+ # === FINAL WHITESPACE NORMALIZATION ===
897
+ # This must happen LAST after all substitutions that can create gaps
898
 
899
+ # Collapse all whitespace (spaces, tabs, multiple spaces) to single space
900
+ # Do this per-line to preserve intentional paragraph breaks
901
+ lines = text.split("\n")
902
+ normalized_lines = []
903
+ for line in lines:
904
+ # Replace any sequence of whitespace with single space
905
+ line = re.sub(r"[ \t]+", " ", line)
906
+ # Strip leading/trailing whitespace from each line
907
+ line = line.strip()
908
+ normalized_lines.append(line)
909
+
910
+ text = "\n".join(normalized_lines)
911
+
912
+ # Remove excessive blank lines (keep max 1 blank line between paragraphs)
913
  text = re.sub(r"\n{3,}", "\n\n", text)
914
 
915
+ # Remove blank lines at start/end
916
+ text = text.strip()
917
+
918
  return text
src/talking_snake/static/app.js CHANGED
@@ -24,14 +24,15 @@ const deviceInfo = document.getElementById("deviceInfo");
24
  const docInfo = document.getElementById("docInfo");
25
  const languageButtons = document.querySelectorAll("#languageButtons .style-btn");
26
  const processingProgressBar = document.getElementById("processingProgressBar");
 
27
 
28
  // Custom player elements
29
  const playerPlayBtn = document.getElementById("playerPlayBtn");
30
  const progressBar = document.getElementById("progressBar");
31
  const progressSlider = document.getElementById("progressSlider");
32
  const timeDisplay = document.getElementById("timeDisplay");
33
- const volumeBtn = document.getElementById("volumeBtn");
34
  const downloadBtn = document.getElementById("downloadBtn");
 
35
 
36
  // Constants
37
  const MAX_FILE_SIZE = 50 * 1024 * 1024; // 50MB
@@ -39,11 +40,12 @@ const MAX_FILE_SIZE = 50 * 1024 * 1024; // 50MB
39
  // State
40
  let currentAbortController = null;
41
  let selectedLanguage = "english";
 
42
  let isPaused = false;
43
  let estimatedDuration = 0; // Estimated total duration from server
44
- let isMuted = false;
45
- let currentAudioBlob = null; // Store audio blob for download
46
  let currentDocName = ""; // Store document name for download filename
 
 
47
 
48
  /**
49
  * Format time in seconds to MM:SS
@@ -91,10 +93,32 @@ function updateDocInfo(data) {
91
  const pageInfo = data.page_count ? `<span class="doc-pages"><i class="fa-solid fa-file"></i> ${data.page_count}p</span>` : "";
92
  const charInfo = data.total_chars ? `<span class="doc-chars"><i class="fa-solid fa-font"></i> ${formatNumber(data.total_chars)}</span>` : "";
93
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
  docInfo.innerHTML = `
95
  <span class="doc-name" title="${docName}"><i class="fa-solid ${icon}"></i><span class="doc-name-text">${docName}</span></span>
96
  ${pageInfo}
97
  ${charInfo}
 
 
98
  `;
99
  }
100
 
@@ -102,16 +126,30 @@ function updateDocInfo(data) {
102
  * Update the custom player progress bar and time display
103
  */
104
  function updatePlayerProgress() {
105
- const currentTime = audio.currentTime || 0;
106
- // Use estimated duration if audio duration is unrealistic (streaming issue)
107
- let duration = audio.duration;
108
- if (!isFinite(duration) || duration > 36000 || duration <= 0) {
109
- duration = estimatedDuration || currentTime + 60; // Fallback
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
  }
111
 
112
  const progress = duration > 0 ? (currentTime / duration) * 100 : 0;
113
  progressBar.style.width = `${Math.min(progress, 100)}%`;
114
- progressSlider.value = progress;
115
  timeDisplay.textContent = `${formatTime(currentTime)} / ${formatTime(duration)}`;
116
  }
117
 
@@ -120,11 +158,19 @@ function updatePlayerProgress() {
120
  */
121
  function handleSeek(e) {
122
  const percent = parseFloat(e.target.value);
123
- let duration = audio.duration;
124
- if (!isFinite(duration) || duration > 36000) {
125
- duration = estimatedDuration || 60;
 
 
 
 
 
 
 
 
 
126
  }
127
- audio.currentTime = (percent / 100) * duration;
128
  updatePlayerProgress();
129
  }
130
 
@@ -152,13 +198,39 @@ function updatePlayButton() {
152
  }
153
 
154
  /**
155
- * Toggle mute
 
 
156
  */
157
- function toggleMute() {
158
- isMuted = !isMuted;
159
- audio.muted = isMuted;
160
- const icon = volumeBtn.querySelector("i");
161
- icon.className = isMuted ? "fa-solid fa-volume-xmark" : "fa-solid fa-volume-high";
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
162
  }
163
 
164
  /**
@@ -167,14 +239,27 @@ function toggleMute() {
167
  */
168
  function updateDeviceInfo(info) {
169
  const icon = info.device === "cuda" ? "fa-microchip" : "fa-server";
170
- const memoryInfo = info.device === "cuda"
171
- ? `${info.memory_used_gb}GB / ${info.memory_total_gb}GB (${info.memory_percent}%)`
172
- : "CPU mode";
 
 
 
 
 
 
 
 
 
 
173
  deviceInfo.innerHTML = `
174
- <i class="fa-solid ${icon}"></i>
175
- <span>${info.device_name}</span>
176
- <span class="device-memory">${memoryInfo}</span>
177
- <span class="device-batch">Batch: ${info.batch_size}</span>
 
 
 
178
  `;
179
  deviceInfo.classList.add("visible");
180
  }
@@ -207,58 +292,145 @@ initDeviceInfoStream();
207
  // Custom player event listeners
208
  playerPlayBtn.addEventListener("click", togglePlayerPlay);
209
  progressSlider.addEventListener("input", handleSeek);
210
- volumeBtn.addEventListener("click", toggleMute);
211
- audio.addEventListener("play", updatePlayButton);
212
- audio.addEventListener("pause", updatePlayButton);
 
 
 
 
 
 
 
 
 
 
213
  audio.addEventListener("timeupdate", updatePlayerProgress);
214
  audio.addEventListener("ended", () => {
 
 
 
 
 
 
 
 
 
215
  updatePlayButton();
216
  progressBar.style.width = "100%";
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
217
  });
218
  // Show pause button when audio actually starts playing
219
  audio.addEventListener("playing", () => {
 
220
  pauseBtn.classList.remove("hidden");
221
  });
222
 
 
 
 
 
 
 
 
 
 
223
  /**
224
- * Fetch audio blob from the server for download capability
225
  * @param {string} jobId - The job ID for the audio
226
  */
227
- async function fetchAudioBlob(jobId) {
228
- try {
229
- const response = await fetch(`/api/audio/${jobId}`);
230
- if (response.ok) {
231
- currentAudioBlob = await response.blob();
232
- // Show download button
233
- downloadBtn.classList.remove("hidden");
234
- }
235
- } catch (error) {
236
- console.error("Failed to fetch audio for download:", error);
237
- }
 
 
 
 
238
  }
239
 
240
  /**
241
  * Download the current audio as a WAV file
242
  */
243
  function downloadAudio() {
244
- if (!currentAudioBlob) {
 
245
  return;
246
  }
247
 
248
- const url = URL.createObjectURL(currentAudioBlob);
249
- const a = document.createElement("a");
250
- a.href = url;
251
-
252
  // Create filename from document name
253
  let filename = currentDocName || "audio";
254
- // Remove file extension if present and add .wav
255
  filename = filename.replace(/\.[^.]+$/, "") + ".wav";
256
- a.download = filename;
257
 
 
 
 
 
258
  document.body.appendChild(a);
259
  a.click();
260
  document.body.removeChild(a);
261
- URL.revokeObjectURL(url);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
262
  }
263
 
264
  /**
@@ -269,6 +441,112 @@ function getSelectedLanguage() {
269
  return selectedLanguage;
270
  }
271
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
272
  /**
273
  * Show the input section and hide processing section
274
  */
@@ -283,9 +561,10 @@ function showInputSection() {
283
  function showProcessingSection() {
284
  inputSection.classList.add("hidden");
285
  processingSection.classList.add("visible");
286
- // Reset progress bar and hide pause button
287
  processingProgressBar.style.width = "0%";
288
  pauseBtn.classList.add("hidden");
 
289
  }
290
 
291
  /**
@@ -318,10 +597,10 @@ function stopGeneration() {
318
  isPaused = false;
319
  updatePauseButton();
320
 
321
- // Hide download button and pause button
322
  downloadBtn.classList.add("hidden");
323
  pauseBtn.classList.add("hidden");
324
- currentAudioBlob = null;
325
 
326
  // Reset progress bar
327
  processingProgressBar.style.width = "0%";
@@ -370,16 +649,23 @@ function updatePauseButton() {
370
  }
371
  }
372
 
 
 
373
  /**
374
- * Format remaining time for display
375
- * @param {number} seconds - Remaining time in seconds
376
- * @returns {string} Formatted time string
377
  */
378
- function formatTimeRemaining(seconds) {
379
- if (seconds > 60) {
380
- return `~${Math.ceil(seconds / 60)} min remaining`;
 
 
 
 
 
 
381
  }
382
- return `~${Math.ceil(seconds)}s remaining`;
383
  }
384
 
385
  /**
@@ -387,15 +673,15 @@ function formatTimeRemaining(seconds) {
387
  * Sets up audio stream once job_id is received
388
  * @param {Response} response - Fetch response with SSE stream
389
  * @param {string} docName - Document name for display
 
390
  * @returns {Promise<void>}
391
  * @throws {Error} If stream contains an error event or fails
392
  */
393
- async function processStream(response, docName) {
394
  const reader = response.body.getReader();
395
  const decoder = new TextDecoder();
396
  let lastStatus = "";
397
- let jobId = null;
398
- let audioStarted = false;
399
 
400
  // Reset estimated duration
401
  estimatedDuration = 0;
@@ -419,52 +705,66 @@ async function processStream(response, docName) {
419
  throw new Error(data.message || "TTS generation failed");
420
  } else if (data.type === "start" && data.job_id) {
421
  // Got job ID - start audio stream immediately
422
- jobId = data.job_id;
423
- // Capture initial duration estimate
424
- if (data.estimated_remaining) {
425
- estimatedDuration = data.estimated_remaining;
 
426
  }
427
  // Display document info
428
  updateDocInfo(data);
429
- if (!audioStarted) {
430
- audioStarted = true;
431
- // Set audio source to stream endpoint
432
- // Browser will start playing as data arrives
433
- audio.src = `/api/audio/${jobId}`;
434
- audio.load();
435
- // Try to play (may need user interaction first time)
436
- audio.play().catch(() => {
437
- // Autoplay blocked - will play when user clicks
438
- });
439
- updatePlayButton();
440
- // Pause button will be shown by the 'playing' event listener
441
  }
442
- const timeStr = formatTimeRemaining(data.estimated_remaining);
443
  showStatus(
444
- `<span class="spinner"></span>ETA ${timeStr}`,
445
  "loading"
446
  );
447
  // Update progress bar
448
  processingProgressBar.style.width = "5%";
449
  } else if (data.type === "progress") {
450
  lastStatus = data.status;
451
- const timeStr = formatTimeRemaining(data.estimated_remaining);
452
  showStatus(
453
- `<span class="spinner"></span>${data.percent}% • ETA ${timeStr}`,
454
  "loading"
455
  );
456
  // Update progress bar
457
  processingProgressBar.style.width = `${data.percent}%`;
458
  } else if (data.type === "complete") {
459
  // Generation complete - show player
460
- // Update estimated duration based on actual processing time
461
- if (data.total_time) {
462
- // Estimate audio duration: ~0.1s per char at normal speech rate
463
- // Use total_time as a rough guide
464
- estimatedDuration = Math.max(estimatedDuration, audio.currentTime + 10);
465
  }
466
- filename.textContent = docName;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
467
  currentDocName = docName;
 
 
 
 
468
  player.classList.add("visible");
469
  // Set progress to 100%
470
  processingProgressBar.style.width = "100%";
@@ -474,11 +774,6 @@ async function processStream(response, docName) {
474
  "success"
475
  );
476
  updatePlayerProgress();
477
-
478
- // Fetch audio blob for download capability
479
- if (jobId) {
480
- fetchAudioBlob(jobId);
481
- }
482
  }
483
  } catch (parseError) {
484
  // Check if it's our thrown error or a JSON parse error
@@ -518,11 +813,11 @@ async function handleFile(file) {
518
  showStatus('<span class="spinner"></span> Extracting text...', "loading");
519
  player.classList.remove("visible");
520
  downloadBtn.classList.add("hidden");
521
- currentAudioBlob = null;
522
 
523
  const formData = new FormData();
524
  formData.append("file", file);
525
  formData.append("language", getSelectedLanguage());
 
526
 
527
  // Create abort controller for this request
528
  currentAbortController = new AbortController();
@@ -540,7 +835,7 @@ async function handleFile(file) {
540
  }
541
 
542
  // Process stream handles both progress SSE and starting audio playback
543
- await processStream(response, file.name);
544
  } catch (error) {
545
  if (error.name === "AbortError") {
546
  // User cancelled - already handled in stopGeneration
@@ -577,7 +872,6 @@ async function handleUrl(url) {
577
  showStatus('<span class="spinner"></span> Fetching content...', "loading");
578
  player.classList.remove("visible");
579
  downloadBtn.classList.add("hidden");
580
- currentAudioBlob = null;
581
  urlSubmit.disabled = true;
582
 
583
  // Create abort controller for this request
@@ -591,7 +885,8 @@ async function handleUrl(url) {
591
  },
592
  body: JSON.stringify({
593
  url,
594
- language: getSelectedLanguage()
 
595
  }),
596
  signal: currentAbortController.signal,
597
  });
@@ -606,7 +901,7 @@ async function handleUrl(url) {
606
  const docName = urlPath.split("/").pop() || "document";
607
 
608
  // Process stream handles both progress SSE and starting audio playback
609
- await processStream(response, docName);
610
  } catch (error) {
611
  if (error.name === "AbortError") {
612
  // User cancelled - already handled in stopGeneration
@@ -641,7 +936,6 @@ async function handleText(text) {
641
  showStatus('<span class="spinner"></span> Processing text...', "loading");
642
  player.classList.remove("visible");
643
  downloadBtn.classList.add("hidden");
644
- currentAudioBlob = null;
645
  textSubmit.disabled = true;
646
 
647
  // Create abort controller for this request
@@ -655,7 +949,8 @@ async function handleText(text) {
655
  },
656
  body: JSON.stringify({
657
  text,
658
- language: getSelectedLanguage()
 
659
  }),
660
  signal: currentAbortController.signal,
661
  });
@@ -665,8 +960,12 @@ async function handleText(text) {
665
  throw new Error(error.detail || "Failed to process text");
666
  }
667
 
 
 
 
 
668
  // Process stream handles both progress SSE and starting audio playback
669
- await processStream(response, "Pasted Text");
670
  } catch (error) {
671
  if (error.name === "AbortError") {
672
  // User cancelled - already handled in stopGeneration
@@ -683,6 +982,15 @@ async function handleText(text) {
683
  // Tab switching
684
  tabs.forEach((tab) => {
685
  tab.addEventListener("click", () => {
 
 
 
 
 
 
 
 
 
686
  tabs.forEach((t) => t.classList.remove("active"));
687
  tabContents.forEach((tc) => tc.classList.remove("active"));
688
  tab.classList.add("active");
@@ -712,7 +1020,7 @@ dropZone.addEventListener("drop", (e) => {
712
 
713
  // Click to select file
714
  dropZone.addEventListener("click", (e) => {
715
- if (e.target !== fileInput && !e.target.classList.contains("file-label")) {
716
  fileInput.click();
717
  }
718
  });
@@ -746,15 +1054,34 @@ textInput.addEventListener("keydown", (e) => {
746
  }
747
  });
748
 
 
 
 
 
 
 
 
 
749
  // Stop button
750
  stopBtn.addEventListener("click", stopGeneration);
751
 
 
 
 
 
 
 
 
 
752
  // Pause button
753
  pauseBtn.addEventListener("click", togglePause);
754
 
755
  // Download button
756
  downloadBtn.addEventListener("click", downloadAudio);
757
 
 
 
 
758
  // Update pause button when audio state changes
759
  audio.addEventListener("play", updatePauseButton);
760
  audio.addEventListener("pause", updatePauseButton);
@@ -766,8 +1093,16 @@ audio.addEventListener("ended", () => {
766
  // Language selection
767
  languageButtons.forEach((btn) => {
768
  btn.addEventListener("click", () => {
769
- languageButtons.forEach((b) => b.classList.remove("active"));
 
 
 
 
 
 
 
 
770
  btn.classList.add("active");
771
- selectedLanguage = btn.dataset.language;
772
  });
773
  });
 
24
  const docInfo = document.getElementById("docInfo");
25
  const languageButtons = document.querySelectorAll("#languageButtons .style-btn");
26
  const processingProgressBar = document.getElementById("processingProgressBar");
27
+ const streamPlayBtn = document.getElementById("streamPlayBtn");
28
 
29
  // Custom player elements
30
  const playerPlayBtn = document.getElementById("playerPlayBtn");
31
  const progressBar = document.getElementById("progressBar");
32
  const progressSlider = document.getElementById("progressSlider");
33
  const timeDisplay = document.getElementById("timeDisplay");
 
34
  const downloadBtn = document.getElementById("downloadBtn");
35
+ const deleteBtn = document.getElementById("deleteBtn");
36
 
37
  // Constants
38
  const MAX_FILE_SIZE = 50 * 1024 * 1024; // 50MB
 
40
  // State
41
  let currentAbortController = null;
42
  let selectedLanguage = "english";
43
+ let selectedStyle = "technical";
44
  let isPaused = false;
45
  let estimatedDuration = 0; // Estimated total duration from server
 
 
46
  let currentDocName = ""; // Store document name for download filename
47
+ let playbackStartTime = 0; // When playback started (for tracking real elapsed time)
48
+ let playbackElapsed = 0; // Total elapsed playback time
49
 
50
  /**
51
  * Format time in seconds to MM:SS
 
93
  const pageInfo = data.page_count ? `<span class="doc-pages"><i class="fa-solid fa-file"></i> ${data.page_count}p</span>` : "";
94
  const charInfo = data.total_chars ? `<span class="doc-chars"><i class="fa-solid fa-font"></i> ${formatNumber(data.total_chars)}</span>` : "";
95
 
96
+ // Style icons mapping
97
+ const styleIcons = {
98
+ technical: "fa-microchip",
99
+ narrative: "fa-book-open",
100
+ child_narrative: "fa-child",
101
+ news: "fa-newspaper",
102
+ academic: "fa-graduation-cap"
103
+ };
104
+
105
+ // Language flags mapping
106
+ const langFlags = {
107
+ english: "🇬🇧",
108
+ chinese: "🇨🇳",
109
+ japanese: "🇯🇵",
110
+ korean: "🇰🇷"
111
+ };
112
+
113
+ const styleIcon = styleIcons[selectedStyle] || "fa-microchip";
114
+ const langFlag = langFlags[selectedLanguage] || "🇬🇧";
115
+
116
  docInfo.innerHTML = `
117
  <span class="doc-name" title="${docName}"><i class="fa-solid ${icon}"></i><span class="doc-name-text">${docName}</span></span>
118
  ${pageInfo}
119
  ${charInfo}
120
+ <span class="doc-style" title="Style: ${selectedStyle}"><i class="fa-solid ${styleIcon}"></i></span>
121
+ <span class="doc-lang" title="Language: ${selectedLanguage}">${langFlag}</span>
122
  `;
123
  }
124
 
 
126
  * Update the custom player progress bar and time display
127
  */
128
  function updatePlayerProgress() {
129
+ // For streaming WAV, browser's duration/currentTime are unreliable
130
+ // Track real playback time ourselves
131
+ let currentTime;
132
+ if (playbackStartTime > 0 && !audio.paused) {
133
+ currentTime = playbackElapsed + (Date.now() - playbackStartTime) / 1000;
134
+ } else {
135
+ currentTime = playbackElapsed;
136
+ }
137
+
138
+ // Use our estimated duration, update it if playback exceeds estimate
139
+ let duration = estimatedDuration;
140
+ if (currentTime > duration) {
141
+ estimatedDuration = currentTime + 10; // Extend estimate
142
+ duration = estimatedDuration;
143
+ }
144
+
145
+ // Ensure we have reasonable values
146
+ if (duration <= 0) {
147
+ duration = 60; // Fallback
148
  }
149
 
150
  const progress = duration > 0 ? (currentTime / duration) * 100 : 0;
151
  progressBar.style.width = `${Math.min(progress, 100)}%`;
152
+ progressSlider.value = Math.min(progress, 100);
153
  timeDisplay.textContent = `${formatTime(currentTime)} / ${formatTime(duration)}`;
154
  }
155
 
 
158
  */
159
  function handleSeek(e) {
160
  const percent = parseFloat(e.target.value);
161
+ const duration = estimatedDuration || 60;
162
+ const seekTime = (percent / 100) * duration;
163
+
164
+ // Set our playback tracker
165
+ playbackElapsed = seekTime;
166
+ playbackStartTime = audio.paused ? 0 : Date.now();
167
+
168
+ // Try to seek the audio (may not work well with streaming)
169
+ try {
170
+ audio.currentTime = seekTime;
171
+ } catch {
172
+ // Seeking may fail with streaming audio
173
  }
 
174
  updatePlayerProgress();
175
  }
176
 
 
198
  }
199
 
200
  /**
201
+ * Get HTML for model state indicator
202
+ * @param {string} state - Model state: loaded, loading, unloaded, unloading
203
+ * @returns {string} HTML string for the model state indicator
204
  */
205
+ function getModelStateHtml(state) {
206
+ const stateConfig = {
207
+ loaded: {
208
+ icon: "fa-circle-check",
209
+ class: "model-loaded",
210
+ text: "Model loaded",
211
+ tooltip: "TTS model is loaded in memory and ready for inference"
212
+ },
213
+ loading: {
214
+ icon: "fa-spinner fa-spin",
215
+ class: "model-loading",
216
+ text: "Loading...",
217
+ tooltip: "TTS model is being loaded into memory"
218
+ },
219
+ unloaded: {
220
+ icon: "fa-circle-xmark",
221
+ class: "model-unloaded",
222
+ text: "Model unloaded",
223
+ tooltip: "TTS model is not loaded (will load on first request)"
224
+ },
225
+ unloading: {
226
+ icon: "fa-spinner fa-spin",
227
+ class: "model-unloading",
228
+ text: "Unloading...",
229
+ tooltip: "TTS model is being unloaded from memory"
230
+ }
231
+ };
232
+ const config = stateConfig[state] || stateConfig.unloaded;
233
+ return `<span class="model-state ${config.class}" title="${config.tooltip}"><i class="fa-solid ${config.icon}"></i> ${config.text}</span>`;
234
  }
235
 
236
  /**
 
239
  */
240
  function updateDeviceInfo(info) {
241
  const icon = info.device === "cuda" ? "fa-microchip" : "fa-server";
242
+ const deviceTooltip = info.device === "cuda"
243
+ ? "GPU accelerated inference for faster audio generation"
244
+ : "CPU-based inference (slower than GPU)";
245
+ const gpuMemoryInfo = info.device === "cuda"
246
+ ? `<span class="device-memory" title="GPU memory used for model and inference"><i class="fa-solid fa-memory"></i> GPU: ${info.memory_used_gb}/${info.memory_total_gb}GB</span>`
247
+ : "";
248
+ const ramInfo = `<span class="device-memory" title="System RAM usage"><i class="fa-solid fa-memory"></i> RAM: ${info.ram_used_gb}/${info.ram_total_gb}GB</span>`;
249
+ // Show timing stats if available
250
+ const timingInfo = info.seconds_per_char !== undefined
251
+ ? `<span class="device-timing" title="Average time to generate audio per character of text"><i class="fa-solid fa-stopwatch"></i> ${info.seconds_per_char.toFixed(4)}s/char</span>`
252
+ : "";
253
+ // Show model state
254
+ const modelStateInfo = getModelStateHtml(info.model_state);
255
  deviceInfo.innerHTML = `
256
+ <i class="fa-solid ${icon}" title="${deviceTooltip}"></i>
257
+ <span title="${deviceTooltip}">${info.device_name}</span>
258
+ ${modelStateInfo}
259
+ ${gpuMemoryInfo}
260
+ ${ramInfo}
261
+ ${timingInfo}
262
+ <span class="device-ephemeral" title="Your documents are processed in memory only. Nothing is saved to disk or stored after processing."><i class="fa-solid fa-shield-halved"></i> No files stored</span>
263
  `;
264
  deviceInfo.classList.add("visible");
265
  }
 
292
  // Custom player event listeners
293
  playerPlayBtn.addEventListener("click", togglePlayerPlay);
294
  progressSlider.addEventListener("input", handleSeek);
295
+ audio.addEventListener("play", () => {
296
+ // Start tracking real playback time
297
+ playbackStartTime = Date.now();
298
+ updatePlayButton();
299
+ });
300
+ audio.addEventListener("pause", () => {
301
+ // Save elapsed time when pausing
302
+ if (playbackStartTime > 0) {
303
+ playbackElapsed += (Date.now() - playbackStartTime) / 1000;
304
+ playbackStartTime = 0;
305
+ }
306
+ updatePlayButton();
307
+ });
308
  audio.addEventListener("timeupdate", updatePlayerProgress);
309
  audio.addEventListener("ended", () => {
310
+ // Update elapsed to match duration on completion
311
+ if (playbackStartTime > 0) {
312
+ playbackElapsed += (Date.now() - playbackStartTime) / 1000;
313
+ playbackStartTime = 0;
314
+ }
315
+ // Ensure we show completion
316
+ if (estimatedDuration > 0 && playbackElapsed < estimatedDuration) {
317
+ playbackElapsed = estimatedDuration;
318
+ }
319
  updatePlayButton();
320
  progressBar.style.width = "100%";
321
+ timeDisplay.textContent = `${formatTime(estimatedDuration)} / ${formatTime(estimatedDuration)}`;
322
+ });
323
+ // Update duration when metadata is available
324
+ audio.addEventListener("loadedmetadata", () => {
325
+ // If browser has a valid duration, use it instead of estimate
326
+ if (isFinite(audio.duration) && audio.duration > 0 && audio.duration < 36000) {
327
+ estimatedDuration = audio.duration;
328
+ }
329
+ updatePlayerProgress();
330
+ });
331
+ // Also check duration changes (for streaming audio)
332
+ audio.addEventListener("durationchange", () => {
333
+ if (isFinite(audio.duration) && audio.duration > 0 && audio.duration < 36000) {
334
+ estimatedDuration = audio.duration;
335
+ }
336
+ updatePlayerProgress();
337
+ });
338
+ // Log audio errors for debugging
339
+ audio.addEventListener("error", () => {
340
+ console.error("Audio error:", audio.error?.message || "Unknown error");
341
  });
342
  // Show pause button when audio actually starts playing
343
  audio.addEventListener("playing", () => {
344
+ streamPlayBtn.classList.add("hidden");
345
  pauseBtn.classList.remove("hidden");
346
  });
347
 
348
+ // Show stream play button when audio has enough data to start playing
349
+ audio.addEventListener("canplay", () => {
350
+ // Only show if processing is still in progress (player not visible yet)
351
+ // and audio is paused (not already playing) and pause button isn't showing
352
+ if (!player.classList.contains("visible") && audio.paused && pauseBtn.classList.contains("hidden")) {
353
+ streamPlayBtn.classList.remove("hidden");
354
+ }
355
+ });
356
+
357
  /**
358
+ * Start streaming audio playback and enable download from cache
359
  * @param {string} jobId - The job ID for the audio
360
  */
361
+ async function startAudioStream(jobId) {
362
+ const audioUrl = `/api/audio/${jobId}`;
363
+
364
+ // Reset playback tracking for new stream
365
+ playbackStartTime = 0;
366
+ playbackElapsed = 0;
367
+
368
+ // Set up audio source for streaming (user can click play)
369
+ audio.src = audioUrl;
370
+ audio.load();
371
+
372
+ // Store job ID for download - will fetch from cache
373
+ audio.dataset.jobId = jobId;
374
+
375
+ // Play button will be shown by the canplay event handler
376
  }
377
 
378
  /**
379
  * Download the current audio as a WAV file
380
  */
381
  function downloadAudio() {
382
+ const jobId = audio.dataset.jobId;
383
+ if (!jobId) {
384
  return;
385
  }
386
 
 
 
 
 
387
  // Create filename from document name
388
  let filename = currentDocName || "audio";
 
389
  filename = filename.replace(/\.[^.]+$/, "") + ".wav";
 
390
 
391
+ // Use download endpoint which returns proper WAV file
392
+ const a = document.createElement("a");
393
+ a.href = `/api/download/${jobId}?filename=${encodeURIComponent(filename)}`;
394
+ a.download = filename;
395
  document.body.appendChild(a);
396
  a.click();
397
  document.body.removeChild(a);
398
+ }
399
+
400
+ /**
401
+ * Delete the current audio and reset the player
402
+ */
403
+ function deleteAudio() {
404
+ // Stop audio immediately
405
+ audio.pause();
406
+
407
+ // Add deleting animation
408
+ player.classList.add("deleting");
409
+
410
+ // Wait for animation to complete
411
+ setTimeout(() => {
412
+ // Reset audio
413
+ audio.src = "";
414
+ audio.currentTime = 0;
415
+
416
+ // Clear state
417
+ currentDocName = "";
418
+ estimatedDuration = 0;
419
+
420
+ // Hide player and buttons
421
+ player.classList.remove("visible", "deleting");
422
+ downloadBtn.classList.add("hidden");
423
+ deleteBtn.classList.add("hidden");
424
+
425
+ // Reset progress
426
+ progressBar.style.width = "0%";
427
+ progressSlider.value = 0;
428
+ timeDisplay.textContent = "0:00 / 0:00";
429
+ updatePlayButton();
430
+
431
+ // Show input section again
432
+ inputSection.classList.remove("hidden");
433
+ }, 300);
434
  }
435
 
436
  /**
 
441
  return selectedLanguage;
442
  }
443
 
444
+ /**
445
+ * Detect language from text based on character scripts.
446
+ * @param {string} text - The text to analyze
447
+ * @returns {string|null} Detected language or null if mostly ASCII/Latin
448
+ */
449
+ function detectLanguage(text) {
450
+ if (!text || text.length < 5) {
451
+ return null;
452
+ }
453
+
454
+ let chinese = 0;
455
+ let japanese = 0; // Hiragana + Katakana
456
+ let korean = 0;
457
+ let latin = 0;
458
+
459
+ for (const char of text) {
460
+ const code = char.charCodeAt(0);
461
+ // CJK Unified Ideographs (shared by Chinese/Japanese)
462
+ if (code >= 0x4e00 && code <= 0x9fff) {
463
+ chinese++;
464
+ }
465
+ // Hiragana
466
+ else if (code >= 0x3040 && code <= 0x309f) {
467
+ japanese++;
468
+ }
469
+ // Katakana
470
+ else if (code >= 0x30a0 && code <= 0x30ff) {
471
+ japanese++;
472
+ }
473
+ // Hangul Syllables
474
+ else if (code >= 0xac00 && code <= 0xd7af) {
475
+ korean++;
476
+ }
477
+ // Hangul Jamo
478
+ else if (code >= 0x1100 && code <= 0x11ff) {
479
+ korean++;
480
+ }
481
+ // Basic Latin letters
482
+ else if (
483
+ (code >= 0x41 && code <= 0x5a) ||
484
+ (code >= 0x61 && code <= 0x7a)
485
+ ) {
486
+ latin++;
487
+ }
488
+ }
489
+
490
+ const total = chinese + japanese + korean + latin;
491
+ if (total === 0) {
492
+ return null;
493
+ }
494
+
495
+ // Japanese uses kanji (chinese chars) + kana, so check for kana first
496
+ if (japanese > 0 && (japanese + chinese) / total > 0.3) {
497
+ return "japanese";
498
+ }
499
+ // Korean
500
+ if (korean / total > 0.3) {
501
+ return "korean";
502
+ }
503
+ // Chinese (CJK without kana)
504
+ if (chinese / total > 0.3) {
505
+ return "chinese";
506
+ }
507
+ // Default to English for Latin text
508
+ if (latin / total > 0.5) {
509
+ return "english";
510
+ }
511
+ return null;
512
+ }
513
+
514
+ /**
515
+ * Set the selected language, optionally marking it as auto-detected.
516
+ * @param {string} lang - Language to select
517
+ * @param {boolean} isAuto - Whether this was auto-detected
518
+ */
519
+ function setLanguage(lang, isAuto = false) {
520
+ const btn = document.querySelector(
521
+ `#languageButtons .style-btn[data-language="${lang}"]`
522
+ );
523
+ if (!btn || selectedLanguage === lang) {
524
+ return;
525
+ }
526
+
527
+ // Update selection state
528
+ languageButtons.forEach((b) => {
529
+ b.classList.remove("active", "auto-detected");
530
+ });
531
+ btn.classList.add("active");
532
+ selectedLanguage = lang;
533
+
534
+ // Visual feedback for auto-detection
535
+ if (isAuto) {
536
+ btn.classList.add("auto-detected");
537
+ // Remove animation class after it completes
538
+ setTimeout(() => btn.classList.remove("auto-detected"), 1500);
539
+ }
540
+ }
541
+
542
+ /**
543
+ * Get the currently selected style
544
+ * @returns {string} The selected style ID
545
+ */
546
+ function getSelectedStyle() {
547
+ return selectedStyle;
548
+ }
549
+
550
  /**
551
  * Show the input section and hide processing section
552
  */
 
561
  function showProcessingSection() {
562
  inputSection.classList.add("hidden");
563
  processingSection.classList.add("visible");
564
+ // Reset progress bar and hide buttons
565
  processingProgressBar.style.width = "0%";
566
  pauseBtn.classList.add("hidden");
567
+ streamPlayBtn.classList.add("hidden");
568
  }
569
 
570
  /**
 
597
  isPaused = false;
598
  updatePauseButton();
599
 
600
+ // Hide download button, pause button, and stream play button
601
  downloadBtn.classList.add("hidden");
602
  pauseBtn.classList.add("hidden");
603
+ streamPlayBtn.classList.add("hidden");
604
 
605
  // Reset progress bar
606
  processingProgressBar.style.width = "0%";
 
649
  }
650
  }
651
 
652
+
653
+
654
  /**
655
+ * Get icon class for source type
656
+ * @param {string} sourceType - The source type ("pdf", "url", "text")
657
+ * @returns {string} Font Awesome icon class
658
  */
659
+ function getSourceIcon(sourceType) {
660
+ switch (sourceType) {
661
+ case "pdf":
662
+ return "fa-file-pdf";
663
+ case "url":
664
+ return "fa-link";
665
+ case "text":
666
+ default:
667
+ return "fa-keyboard";
668
  }
 
669
  }
670
 
671
  /**
 
673
  * Sets up audio stream once job_id is received
674
  * @param {Response} response - Fetch response with SSE stream
675
  * @param {string} docName - Document name for display
676
+ * @param {string} sourceType - Source type ("pdf", "url", "text")
677
  * @returns {Promise<void>}
678
  * @throws {Error} If stream contains an error event or fails
679
  */
680
+ async function processStream(response, docName, sourceType = "text") {
681
  const reader = response.body.getReader();
682
  const decoder = new TextDecoder();
683
  let lastStatus = "";
684
+ let audioJobId = null;
 
685
 
686
  // Reset estimated duration
687
  estimatedDuration = 0;
 
705
  throw new Error(data.message || "TTS generation failed");
706
  } else if (data.type === "start" && data.job_id) {
707
  // Got job ID - start audio stream immediately
708
+ const jobId = data.job_id;
709
+ // Estimate audio duration from character count
710
+ // Typical speech is ~14 chars/sec (150 wpm, 5 chars/word)
711
+ if (data.total_chars) {
712
+ estimatedDuration = data.total_chars / 14;
713
  }
714
  // Display document info
715
  updateDocInfo(data);
716
+ if (!audioJobId) {
717
+ audioJobId = jobId;
718
+ // Start streaming playback immediately
719
+ startAudioStream(jobId);
 
 
 
 
 
 
 
 
720
  }
721
+ // Show generating status
722
  showStatus(
723
+ '<span class="spinner"></span> Generating...',
724
  "loading"
725
  );
726
  // Update progress bar
727
  processingProgressBar.style.width = "5%";
728
  } else if (data.type === "progress") {
729
  lastStatus = data.status;
730
+ // Show progress percentage
731
  showStatus(
732
+ `<span class="spinner"></span> ${data.percent}%`,
733
  "loading"
734
  );
735
  // Update progress bar
736
  processingProgressBar.style.width = `${data.percent}%`;
737
  } else if (data.type === "complete") {
738
  // Generation complete - show player
739
+ // Use actual audio duration from server if available
740
+ if (data.audio_duration && data.audio_duration > 0) {
741
+ estimatedDuration = data.audio_duration;
 
 
742
  }
743
+ // Build filename with style and language indicators
744
+ const styleIcons = {
745
+ technical: "fa-microchip",
746
+ conversational: "fa-comments",
747
+ storytelling: "fa-book-open",
748
+ child_narrative: "fa-child",
749
+ news: "fa-newspaper",
750
+ academic: "fa-graduation-cap"
751
+ };
752
+ const langFlags = {
753
+ english: "🇬🇧",
754
+ chinese: "🇨🇳",
755
+ japanese: "🇯🇵",
756
+ korean: "🇰🇷"
757
+ };
758
+ const usedStyle = getSelectedStyle();
759
+ const usedLang = getSelectedLanguage();
760
+ const styleIcon = styleIcons[usedStyle] || "fa-microchip";
761
+ const langFlag = langFlags[usedLang] || "🇬🇧";
762
+ filename.innerHTML = `<i class="fa-solid ${getSourceIcon(sourceType)}"></i> ${docName} <span class="filename-meta"><i class="fa-solid ${styleIcon}" title="Style: ${usedStyle}"></i><span title="Language: ${usedLang}">${langFlag}</span></span>`;
763
  currentDocName = docName;
764
+ // Hide stream buttons, show full player with download
765
+ streamPlayBtn.classList.add("hidden");
766
+ downloadBtn.classList.remove("hidden");
767
+ deleteBtn.classList.remove("hidden");
768
  player.classList.add("visible");
769
  // Set progress to 100%
770
  processingProgressBar.style.width = "100%";
 
774
  "success"
775
  );
776
  updatePlayerProgress();
 
 
 
 
 
777
  }
778
  } catch (parseError) {
779
  // Check if it's our thrown error or a JSON parse error
 
813
  showStatus('<span class="spinner"></span> Extracting text...', "loading");
814
  player.classList.remove("visible");
815
  downloadBtn.classList.add("hidden");
 
816
 
817
  const formData = new FormData();
818
  formData.append("file", file);
819
  formData.append("language", getSelectedLanguage());
820
+ formData.append("style", getSelectedStyle());
821
 
822
  // Create abort controller for this request
823
  currentAbortController = new AbortController();
 
835
  }
836
 
837
  // Process stream handles both progress SSE and starting audio playback
838
+ await processStream(response, file.name, "pdf");
839
  } catch (error) {
840
  if (error.name === "AbortError") {
841
  // User cancelled - already handled in stopGeneration
 
872
  showStatus('<span class="spinner"></span> Fetching content...', "loading");
873
  player.classList.remove("visible");
874
  downloadBtn.classList.add("hidden");
 
875
  urlSubmit.disabled = true;
876
 
877
  // Create abort controller for this request
 
885
  },
886
  body: JSON.stringify({
887
  url,
888
+ language: getSelectedLanguage(),
889
+ style: getSelectedStyle()
890
  }),
891
  signal: currentAbortController.signal,
892
  });
 
901
  const docName = urlPath.split("/").pop() || "document";
902
 
903
  // Process stream handles both progress SSE and starting audio playback
904
+ await processStream(response, docName, "url");
905
  } catch (error) {
906
  if (error.name === "AbortError") {
907
  // User cancelled - already handled in stopGeneration
 
936
  showStatus('<span class="spinner"></span> Processing text...', "loading");
937
  player.classList.remove("visible");
938
  downloadBtn.classList.add("hidden");
 
939
  textSubmit.disabled = true;
940
 
941
  // Create abort controller for this request
 
949
  },
950
  body: JSON.stringify({
951
  text,
952
+ language: getSelectedLanguage(),
953
+ style: getSelectedStyle()
954
  }),
955
  signal: currentAbortController.signal,
956
  });
 
960
  throw new Error(error.detail || "Failed to process text");
961
  }
962
 
963
+ // Generate document name from first few words
964
+ const words = text.trim().split(/\s+/).slice(0, 5).join(" ");
965
+ const docName = words.length > 30 ? words.slice(0, 30) + "..." : words;
966
+
967
  // Process stream handles both progress SSE and starting audio playback
968
+ await processStream(response, docName, "text");
969
  } catch (error) {
970
  if (error.name === "AbortError") {
971
  // User cancelled - already handled in stopGeneration
 
982
  // Tab switching
983
  tabs.forEach((tab) => {
984
  tab.addEventListener("click", () => {
985
+ const isAlreadyActive = tab.classList.contains("active");
986
+ const isUploadTab = tab.dataset.tab === "upload";
987
+
988
+ // If clicking on already-active upload tab, open file picker
989
+ if (isAlreadyActive && isUploadTab) {
990
+ fileInput.click();
991
+ return;
992
+ }
993
+
994
  tabs.forEach((t) => t.classList.remove("active"));
995
  tabContents.forEach((tc) => tc.classList.remove("active"));
996
  tab.classList.add("active");
 
1020
 
1021
  // Click to select file
1022
  dropZone.addEventListener("click", (e) => {
1023
+ if (e.target !== fileInput) {
1024
  fileInput.click();
1025
  }
1026
  });
 
1054
  }
1055
  });
1056
 
1057
+ // Auto-detect language from text input
1058
+ textInput.addEventListener("input", () => {
1059
+ const detected = detectLanguage(textInput.value);
1060
+ if (detected) {
1061
+ setLanguage(detected, true);
1062
+ }
1063
+ });
1064
+
1065
  // Stop button
1066
  stopBtn.addEventListener("click", stopGeneration);
1067
 
1068
+ // Stream play button (during processing)
1069
+ streamPlayBtn.addEventListener("click", () => {
1070
+ audio.play().catch(() => {});
1071
+ // Hide stream play button and show pause button
1072
+ streamPlayBtn.classList.add("hidden");
1073
+ pauseBtn.classList.remove("hidden");
1074
+ });
1075
+
1076
  // Pause button
1077
  pauseBtn.addEventListener("click", togglePause);
1078
 
1079
  // Download button
1080
  downloadBtn.addEventListener("click", downloadAudio);
1081
 
1082
+ // Delete button
1083
+ deleteBtn.addEventListener("click", deleteAudio);
1084
+
1085
  // Update pause button when audio state changes
1086
  audio.addEventListener("play", updatePauseButton);
1087
  audio.addEventListener("pause", updatePauseButton);
 
1093
  // Language selection
1094
  languageButtons.forEach((btn) => {
1095
  btn.addEventListener("click", () => {
1096
+ setLanguage(btn.dataset.language, false);
1097
+ });
1098
+ });
1099
+
1100
+ // Style selection
1101
+ const styleButtons = document.querySelectorAll("#styleButtons .style-btn");
1102
+ styleButtons.forEach((btn) => {
1103
+ btn.addEventListener("click", () => {
1104
+ styleButtons.forEach((b) => b.classList.remove("active"));
1105
  btn.classList.add("active");
1106
+ selectedStyle = btn.dataset.style;
1107
  });
1108
  });
src/talking_snake/static/index.html CHANGED
@@ -26,6 +26,9 @@
26
  <link rel="icon" type="image/png" sizes="512x512" href="/static/icon-512.png">
27
 
28
  <link rel="stylesheet" href="/static/styles.css">
 
 
 
29
  <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.5.1/css/all.min.css" integrity="sha512-DTOQO9RWCH3ppGqcWaEA1BIZOC6xxalwEsw9c2QQeAIftl+Vegovlnee1c9QX4TctnWMn13TZye+giMm8e2LwA==" crossorigin="anonymous" referrerpolicy="no-referrer">
30
  <script src="https://unpkg.com/htmx.org@2.0.4"></script>
31
  </head>
@@ -33,26 +36,37 @@
33
  <div class="main-content">
34
  <img src="/static/talking_snake.png" alt="Talking Snake" class="logo">
35
  <h1>Talking Snake</h1>
36
- <p class="subtitle">Transform PDFs & Web into Audio</p>
37
 
38
  <div class="container">
39
  <div class="input-section" id="inputSection">
40
  <div class="options-row">
41
- <div class="language-selector">
42
- <span class="style-label">Language:</span>
43
- <div class="style-buttons" id="languageButtons">
44
- <button class="style-btn lang-btn active" data-language="english" title="English">
45
- 🇬🇧
46
  </button>
47
- <button class="style-btn lang-btn" data-language="chinese" title="Chinese">
48
- 🇨🇳
49
  </button>
50
- <button class="style-btn lang-btn" data-language="japanese" title="Japanese">
51
- 🇯🇵
52
  </button>
53
- <button class="style-btn lang-btn" data-language="korean" title="Korean">
54
- 🇰🇷
55
  </button>
 
 
 
 
 
 
 
 
 
 
 
 
56
  </div>
57
  </div>
58
  </div>
@@ -67,19 +81,16 @@
67
  <div class="drop-zone" id="dropZone">
68
  <i class="fa-solid fa-file-pdf drop-icon"></i>
69
  <p>Drag & drop a PDF here</p>
70
- <label class="file-label">
71
- <i class="fa-solid fa-folder-open"></i> Choose File
72
- <input type="file" id="fileInput" accept=".pdf">
73
- </label>
74
- <p class="hint">Supports PDF documents up to 50MB</p>
75
  </div>
76
  </div>
77
 
78
  <div class="tab-content" id="url-tab">
79
  <div class="url-form">
80
- <input type="url" id="urlInput" placeholder="https://example.com/article or .pdf">
81
- <button class="submit-btn" id="urlSubmit"><i class="fa-solid fa-microphone"></i> Read Content</button>
82
- <p class="hint">Enter a link to a PDF or web page (articles, docs, blogs)</p>
 
83
  </div>
84
  </div>
85
 
@@ -87,7 +98,6 @@
87
  <div class="text-form">
88
  <textarea id="textInput" placeholder="Paste or type your text here..." rows="6"></textarea>
89
  <button class="submit-btn" id="textSubmit"><i class="fa-solid fa-microphone"></i> Read Text</button>
90
- <p class="hint">Paste any text you want to hear read aloud</p>
91
  </div>
92
  </div>
93
  </div>
@@ -102,6 +112,7 @@
102
  <div class="processing-progress-bar" id="processingProgressBar"></div>
103
  </div>
104
  <div class="control-buttons">
 
105
  <button class="control-btn pause-btn hidden" id="pauseBtn" title="Pause/Resume"><i class="fa-solid fa-pause"></i></button>
106
  <button class="control-btn stop-btn" id="stopBtn" title="Stop generation"><i class="fa-solid fa-stop"></i></button>
107
  </div>
@@ -121,12 +132,12 @@
121
  <input type="range" class="progress-slider" id="progressSlider" min="0" max="100" value="0">
122
  </div>
123
  <span class="time-display" id="timeDisplay">0:00 / 0:00</span>
124
- <button class="player-btn volume-btn" id="volumeBtn" title="Mute/Unmute">
125
- <i class="fa-solid fa-volume-high"></i>
126
- </button>
127
  <button class="player-btn download-btn hidden" id="downloadBtn" title="Download Audio">
128
  <i class="fa-solid fa-download"></i>
129
  </button>
 
 
 
130
  </div>
131
  <audio id="audio" preload="auto"></audio>
132
  </div>
 
26
  <link rel="icon" type="image/png" sizes="512x512" href="/static/icon-512.png">
27
 
28
  <link rel="stylesheet" href="/static/styles.css">
29
+ <link rel="preconnect" href="https://fonts.googleapis.com">
30
+ <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
31
+ <link rel="stylesheet" href="https://fonts.googleapis.com/css2?family=Fredoka:wght@500&display=swap">
32
  <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.5.1/css/all.min.css" integrity="sha512-DTOQO9RWCH3ppGqcWaEA1BIZOC6xxalwEsw9c2QQeAIftl+Vegovlnee1c9QX4TctnWMn13TZye+giMm8e2LwA==" crossorigin="anonymous" referrerpolicy="no-referrer">
33
  <script src="https://unpkg.com/htmx.org@2.0.4"></script>
34
  </head>
 
36
  <div class="main-content">
37
  <img src="/static/talking_snake.png" alt="Talking Snake" class="logo">
38
  <h1>Talking Snake</h1>
 
39
 
40
  <div class="container">
41
  <div class="input-section" id="inputSection">
42
  <div class="options-row">
43
+ <div class="style-selector">
44
+ <span class="style-label">Style:</span>
45
+ <div class="style-buttons" id="styleButtons">
46
+ <button class="style-btn active" data-style="technical" title="Clear, precise reading for code and technical documentation">
47
+ <i class="fa-solid fa-microchip"></i>
48
  </button>
49
+ <button class="style-btn" data-style="narrative" title="Natural, engaging reading for articles and stories">
50
+ <i class="fa-solid fa-book-open"></i>
51
  </button>
52
+ <button class="style-btn" data-style="child_narrative" title="Playful, expressive reading for children's stories">
53
+ <i class="fa-solid fa-child"></i>
54
  </button>
55
+ <button class="style-btn" data-style="news" title="Authoritative, clear delivery for news and reports">
56
+ <i class="fa-solid fa-newspaper"></i>
57
  </button>
58
+ <button class="style-btn" data-style="academic" title="Measured, scholarly reading for papers and research">
59
+ <i class="fa-solid fa-graduation-cap"></i>
60
+ </button>
61
+ </div>
62
+ </div>
63
+ <div class="language-selector">
64
+ <span class="style-label">Language:</span>
65
+ <div class="style-buttons" id="languageButtons">
66
+ <button class="style-btn lang-btn active" data-language="english" title="English">🇬🇧</button>
67
+ <button class="style-btn lang-btn" data-language="chinese" title="Chinese">🇨🇳</button>
68
+ <button class="style-btn lang-btn" data-language="japanese" title="Japanese">🇯🇵</button>
69
+ <button class="style-btn lang-btn" data-language="korean" title="Korean">🇰🇷</button>
70
  </div>
71
  </div>
72
  </div>
 
81
  <div class="drop-zone" id="dropZone">
82
  <i class="fa-solid fa-file-pdf drop-icon"></i>
83
  <p>Drag & drop a PDF here</p>
84
+ <input type="file" id="fileInput" accept=".pdf" class="hidden-file-input">
 
 
 
 
85
  </div>
86
  </div>
87
 
88
  <div class="tab-content" id="url-tab">
89
  <div class="url-form">
90
+ <div class="url-input-row">
91
+ <input type="url" id="urlInput" placeholder="https://example.com/article or .pdf">
92
+ <button class="submit-btn" id="urlSubmit"><i class="fa-solid fa-microphone"></i></button>
93
+ </div>
94
  </div>
95
  </div>
96
 
 
98
  <div class="text-form">
99
  <textarea id="textInput" placeholder="Paste or type your text here..." rows="6"></textarea>
100
  <button class="submit-btn" id="textSubmit"><i class="fa-solid fa-microphone"></i> Read Text</button>
 
101
  </div>
102
  </div>
103
  </div>
 
112
  <div class="processing-progress-bar" id="processingProgressBar"></div>
113
  </div>
114
  <div class="control-buttons">
115
+ <button class="control-btn play-btn hidden" id="streamPlayBtn" title="Play audio"><i class="fa-solid fa-play"></i></button>
116
  <button class="control-btn pause-btn hidden" id="pauseBtn" title="Pause/Resume"><i class="fa-solid fa-pause"></i></button>
117
  <button class="control-btn stop-btn" id="stopBtn" title="Stop generation"><i class="fa-solid fa-stop"></i></button>
118
  </div>
 
132
  <input type="range" class="progress-slider" id="progressSlider" min="0" max="100" value="0">
133
  </div>
134
  <span class="time-display" id="timeDisplay">0:00 / 0:00</span>
 
 
 
135
  <button class="player-btn download-btn hidden" id="downloadBtn" title="Download Audio">
136
  <i class="fa-solid fa-download"></i>
137
  </button>
138
+ <button class="player-btn delete-btn hidden" id="deleteBtn" title="Delete Audio">
139
+ <i class="fa-solid fa-trash"></i>
140
+ </button>
141
  </div>
142
  <audio id="audio" preload="auto"></audio>
143
  </div>
src/talking_snake/static/styles.css CHANGED
@@ -45,15 +45,25 @@ body {
45
  }
46
 
47
  h1 {
 
48
  font-size: 1.75rem;
49
- margin: 0 0 0.25rem;
50
  color: var(--primary);
 
 
51
  }
52
 
53
  .subtitle {
54
  color: var(--text-muted);
55
- margin: 0 0 1rem;
56
  font-size: 0.9rem;
 
 
 
 
 
 
 
57
  }
58
 
59
  .container {
@@ -65,9 +75,9 @@ h1 {
65
  .options-row {
66
  display: flex;
67
  justify-content: center;
68
- gap: 1.5rem;
69
- margin-bottom: 1rem;
70
- flex-wrap: wrap;
71
  }
72
 
73
  /* Style Selector */
@@ -75,38 +85,46 @@ h1 {
75
  .language-selector {
76
  display: flex;
77
  align-items: center;
78
- gap: 0.5rem;
79
- flex-wrap: wrap;
80
  }
81
 
82
  .style-label {
83
- font-size: 0.85rem;
84
  color: var(--text-muted);
85
  }
86
 
87
  .style-buttons {
88
  display: flex;
89
- gap: 0.35rem;
90
  }
91
 
92
  .style-btn {
93
- width: 38px;
94
- height: 38px;
95
  border: 1px solid var(--border);
96
- border-radius: 6px;
97
  background: var(--surface);
98
  color: var(--text-muted);
99
  cursor: pointer;
100
- font-size: 0.95rem;
 
101
  transition: all 0.15s ease;
102
  display: flex;
103
  align-items: center;
104
  justify-content: center;
105
  }
106
 
 
 
 
 
 
107
  /* Language buttons use emoji flags */
108
  .style-btn.lang-btn {
109
- font-size: 1.2rem;
 
 
110
  }
111
 
112
  .style-btn:hover {
@@ -120,6 +138,29 @@ h1 {
120
  color: var(--primary);
121
  }
122
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
  /* Input Section - hidden during processing */
124
  .input-section.hidden {
125
  display: none;
@@ -207,6 +248,21 @@ h1 {
207
  opacity: 0.6;
208
  }
209
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
210
  /* Status in processing */
211
  .processing-section .status {
212
  padding: 0;
@@ -245,12 +301,13 @@ h1 {
245
  width: 36px;
246
  height: 36px;
247
  padding: 0;
248
- color: white;
249
- border: none;
 
250
  border-radius: 8px;
251
  cursor: pointer;
252
  font-size: 0.9rem;
253
- transition: all 0.15s ease;
254
  display: flex;
255
  align-items: center;
256
  justify-content: center;
@@ -261,27 +318,28 @@ h1 {
261
  }
262
 
263
  .control-btn:hover {
264
- filter: brightness(1.1);
 
 
265
  }
266
 
267
- .pause-btn {
268
- background: linear-gradient(135deg, var(--primary), #c06030, var(--primary));
269
- background-size: 200% 200%;
270
- animation: gradient-idle 3s ease infinite;
271
  }
272
 
273
- .pause-btn:hover {
274
- animation: gradient-shift 0.8s ease infinite;
 
275
  }
276
 
277
- .stop-btn {
278
- background: linear-gradient(135deg, var(--error), #8b3a30, var(--error));
279
- background-size: 200% 200%;
280
- animation: gradient-idle 3s ease infinite;
281
  }
282
 
283
  .stop-btn:hover {
284
- animation: gradient-shift 0.8s ease infinite;
 
 
285
  }
286
 
287
  @keyframes gradient-idle {
@@ -299,7 +357,7 @@ h1 {
299
  .drop-zone {
300
  border: 2px dashed var(--border);
301
  border-radius: 8px;
302
- padding: 1.5rem 1rem;
303
  text-align: center;
304
  transition: all 0.2s ease;
305
  cursor: pointer;
@@ -317,11 +375,6 @@ h1 {
317
  font-size: 0.95rem;
318
  }
319
 
320
- .drop-zone .hint {
321
- color: var(--text-muted);
322
- font-size: 0.8rem;
323
- }
324
-
325
  .drop-icon {
326
  font-size: 2.5rem;
327
  color: var(--primary);
@@ -361,37 +414,60 @@ h1 {
361
 
362
  .tab-content {
363
  display: none;
 
 
364
  }
365
 
366
  .tab-content.active {
367
  display: block;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
368
  }
369
 
370
  /* URL Form */
371
  .url-form {
372
  background: var(--surface);
373
  border-radius: 8px;
374
- padding: 1rem;
375
  }
376
 
377
- .url-form input[type="url"] {
378
- width: 100%;
379
- padding: 0.6rem 0.75rem;
 
 
 
 
 
 
 
380
  background: var(--bg);
381
  border: 1px solid var(--border);
382
  border-radius: 6px;
383
  color: var(--text);
384
  font-size: 0.9rem;
385
- margin-bottom: 0.75rem;
386
  transition: border-color 0.15s ease;
387
  }
388
 
389
- .url-form input[type="url"]:focus {
390
  outline: none;
391
  border-color: var(--primary);
392
  }
393
 
394
- .url-form input[type="url"]::placeholder {
395
  color: var(--text-muted);
396
  }
397
 
@@ -427,70 +503,79 @@ h1 {
427
  color: var(--text-muted);
428
  }
429
 
430
- .text-form .hint {
431
- color: var(--text-muted);
432
- font-size: 0.8rem;
433
- text-align: center;
434
- margin: 0;
435
- }
436
-
437
  /* Buttons */
438
  .submit-btn {
439
  width: 100%;
440
  padding: 0.6rem 1rem;
441
- background: linear-gradient(135deg, var(--primary), #c06030, var(--primary));
442
- background-size: 200% 200%;
443
- animation: gradient-idle 3s ease infinite;
444
- color: white;
445
- border: none;
446
  border-radius: 8px;
447
  cursor: pointer;
448
  font-size: 0.9rem;
449
  font-weight: 500;
450
- transition: filter 0.15s ease;
451
  margin-bottom: 0.5rem;
452
  }
453
 
454
  .submit-btn:hover {
455
- filter: brightness(1.1);
456
- animation: gradient-shift 0.8s ease infinite;
 
457
  }
458
 
459
  .submit-btn:disabled {
460
- opacity: 0.6;
461
  cursor: not-allowed;
462
- filter: none;
463
- animation: none;
464
  }
465
 
466
- .url-form .hint {
467
- color: var(--text-muted);
468
- font-size: 0.8rem;
469
- text-align: center;
 
 
 
 
 
 
 
 
 
 
 
 
470
  margin: 0;
 
 
 
471
  }
472
 
473
  input[type="file"] {
474
  display: none;
475
  }
476
 
 
 
 
 
477
  .file-label {
478
  display: inline-block;
479
  padding: 0.5rem 1rem;
480
- background: linear-gradient(135deg, var(--primary), #c06030, var(--primary));
481
- background-size: 200% 200%;
482
- animation: gradient-idle 3s ease infinite;
483
- color: white;
484
  border-radius: 8px;
485
  cursor: pointer;
486
  font-weight: 500;
487
  font-size: 0.9rem;
488
- transition: filter 0.15s ease;
489
  }
490
 
491
  .file-label:hover {
492
- filter: brightness(1.1);
493
- animation: gradient-shift 0.8s ease infinite;
 
494
  }
495
 
496
  /* Device Info - Subtle footer-like display */
@@ -498,17 +583,19 @@ input[type="file"] {
498
  display: none;
499
  justify-content: center;
500
  align-items: center;
501
- gap: 1rem;
502
- padding: 0.75rem 1rem;
503
  font-size: 0.7rem;
504
  color: var(--text-muted);
505
- margin-top: 0.5rem;
506
  opacity: 0.7;
 
507
  }
508
 
509
  .device-info.visible {
510
  display: flex;
511
  flex-wrap: wrap;
 
512
  }
513
 
514
  .device-info i {
@@ -517,7 +604,66 @@ input[type="file"] {
517
  }
518
 
519
  .device-memory {
520
- opacity: 0.9;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
521
  }
522
 
523
  .device-batch {
@@ -562,10 +708,10 @@ input[type="file"] {
562
 
563
  /* Audio Player */
564
  .player {
565
- margin-top: 1.5rem;
566
  width: 100%;
567
  display: none;
568
- padding: 1.25rem;
569
  background: var(--surface);
570
  border-radius: 12px;
571
  border: 1px solid var(--border);
@@ -573,6 +719,35 @@ input[type="file"] {
573
 
574
  .player.visible {
575
  display: block;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
576
  }
577
 
578
  /* Hidden audio element */
@@ -590,31 +765,32 @@ input[type="file"] {
590
  .player-btn {
591
  width: 36px;
592
  height: 36px;
593
- border: none;
594
  border-radius: 8px;
595
- background: linear-gradient(135deg, var(--primary), #c06030, var(--primary));
596
- background-size: 200% 200%;
597
- animation: gradient-idle 3s ease infinite;
598
- color: white;
599
  cursor: pointer;
600
  display: flex;
601
  align-items: center;
602
  justify-content: center;
603
  font-size: 0.85rem;
604
- transition: filter 0.15s ease;
605
  flex-shrink: 0;
606
  }
607
 
608
  .player-btn:hover {
609
- filter: brightness(1.1);
610
- animation: gradient-shift 0.8s ease infinite;
 
 
 
 
 
 
 
611
  }
612
 
613
  .player-btn.volume-btn {
614
- background: linear-gradient(135deg, var(--bg), #252540, var(--bg));
615
- background-size: 200% 200%;
616
- animation: gradient-idle 3s ease infinite;
617
- color: var(--text-muted);
618
  width: 32px;
619
  height: 32px;
620
  font-size: 0.8rem;
@@ -622,14 +798,9 @@ input[type="file"] {
622
 
623
  .player-btn.volume-btn:hover {
624
  color: var(--text);
625
- animation: gradient-shift 0.8s ease infinite;
626
  }
627
 
628
  .player-btn.download-btn {
629
- background: linear-gradient(135deg, var(--bg), #252540, var(--bg));
630
- background-size: 200% 200%;
631
- animation: gradient-idle 3s ease infinite;
632
- color: var(--text-muted);
633
  width: 32px;
634
  height: 32px;
635
  font-size: 0.8rem;
@@ -637,7 +808,18 @@ input[type="file"] {
637
 
638
  .player-btn.download-btn:hover {
639
  color: var(--primary);
640
- animation: gradient-shift 0.8s ease infinite;
 
 
 
 
 
 
 
 
 
 
 
641
  }
642
 
643
  .progress-container {
@@ -697,11 +879,24 @@ input[type="file"] {
697
  }
698
 
699
  .filename {
700
- margin-bottom: 0.75rem;
701
  font-size: 0.85rem;
702
  font-weight: 500;
703
  color: var(--text);
704
  word-break: break-all;
 
 
 
 
 
 
 
 
 
 
 
 
 
705
  }
706
 
707
  /* Spinner Animation */
@@ -714,7 +909,7 @@ input[type="file"] {
714
  border-radius: 50%;
715
  animation: spin 1s linear infinite;
716
  margin-right: 0.4rem;
717
- vertical-align: middle;
718
  }
719
 
720
  @keyframes spin {
 
45
  }
46
 
47
  h1 {
48
+ font-family: Fredoka, sans-serif;
49
  font-size: 1.75rem;
50
+ margin: 0 0 0.5rem;
51
  color: var(--primary);
52
+ display: inline;
53
+ vertical-align: baseline;
54
  }
55
 
56
  .subtitle {
57
  color: var(--text-muted);
58
+ margin: 0;
59
  font-size: 0.9rem;
60
+ display: inline;
61
+ vertical-align: baseline;
62
+ }
63
+
64
+ .header-row {
65
+ margin-bottom: 0.5rem;
66
+ text-align: center;
67
  }
68
 
69
  .container {
 
75
  .options-row {
76
  display: flex;
77
  justify-content: center;
78
+ gap: 0.75rem;
79
+ margin-bottom: 0.75rem;
80
+ flex-wrap: nowrap;
81
  }
82
 
83
  /* Style Selector */
 
85
  .language-selector {
86
  display: flex;
87
  align-items: center;
88
+ gap: 0.4rem;
89
+ flex-wrap: nowrap;
90
  }
91
 
92
  .style-label {
93
+ font-size: 0.75rem;
94
  color: var(--text-muted);
95
  }
96
 
97
  .style-buttons {
98
  display: flex;
99
+ gap: 0.25rem;
100
  }
101
 
102
  .style-btn {
103
+ width: 28px;
104
+ height: 28px;
105
  border: 1px solid var(--border);
106
+ border-radius: 5px;
107
  background: var(--surface);
108
  color: var(--text-muted);
109
  cursor: pointer;
110
+ font-size: 0.75rem;
111
+ line-height: 1;
112
  transition: all 0.15s ease;
113
  display: flex;
114
  align-items: center;
115
  justify-content: center;
116
  }
117
 
118
+ .style-btn i {
119
+ display: block;
120
+ line-height: 1;
121
+ }
122
+
123
  /* Language buttons use emoji flags */
124
  .style-btn.lang-btn {
125
+ font-size: 1rem;
126
+ line-height: 1;
127
+ padding: 0;
128
  }
129
 
130
  .style-btn:hover {
 
138
  color: var(--primary);
139
  }
140
 
141
+ /* Auto-detected language indicator */
142
+ .style-btn.lang-btn.auto-detected {
143
+ animation: auto-detect-pulse 0.5s ease-out;
144
+ box-shadow: 0 0 0 2px var(--primary);
145
+ }
146
+
147
+ @keyframes auto-detect-pulse {
148
+ 0% {
149
+ transform: scale(1);
150
+ box-shadow: 0 0 0 0 rgba(212, 118, 58, 0.7);
151
+ }
152
+
153
+ 50% {
154
+ transform: scale(1.1);
155
+ box-shadow: 0 0 0 4px rgba(212, 118, 58, 0.4);
156
+ }
157
+
158
+ 100% {
159
+ transform: scale(1);
160
+ box-shadow: 0 0 0 2px var(--primary);
161
+ }
162
+ }
163
+
164
  /* Input Section - hidden during processing */
165
  .input-section.hidden {
166
  display: none;
 
248
  opacity: 0.6;
249
  }
250
 
251
+ .doc-info .doc-style,
252
+ .doc-info .doc-lang {
253
+ color: var(--text-muted);
254
+ font-size: 0.75rem;
255
+ display: flex;
256
+ align-items: center;
257
+ white-space: nowrap;
258
+ flex-shrink: 0;
259
+ opacity: 0.7;
260
+ }
261
+
262
+ .doc-info .doc-style i {
263
+ font-size: 0.7rem;
264
+ }
265
+
266
  /* Status in processing */
267
  .processing-section .status {
268
  padding: 0;
 
301
  width: 36px;
302
  height: 36px;
303
  padding: 0;
304
+ color: var(--text-muted);
305
+ background: var(--surface);
306
+ border: 1px solid var(--border);
307
  border-radius: 8px;
308
  cursor: pointer;
309
  font-size: 0.9rem;
310
+ transition: all 0.2s ease;
311
  display: flex;
312
  align-items: center;
313
  justify-content: center;
 
318
  }
319
 
320
  .control-btn:hover {
321
+ color: var(--primary);
322
+ border-color: var(--primary);
323
+ background: rgb(212, 118, 58, 0.08);
324
  }
325
 
326
+ .pause-btn:hover {
327
+ color: var(--primary);
 
 
328
  }
329
 
330
+ .control-btn.play-btn {
331
+ color: var(--success);
332
+ border-color: var(--success);
333
  }
334
 
335
+ .control-btn.play-btn:hover {
336
+ background: rgba(116, 184, 22, 0.15);
 
 
337
  }
338
 
339
  .stop-btn:hover {
340
+ color: var(--error);
341
+ border-color: var(--error);
342
+ background: rgb(196, 90, 74, 0.08);
343
  }
344
 
345
  @keyframes gradient-idle {
 
357
  .drop-zone {
358
  border: 2px dashed var(--border);
359
  border-radius: 8px;
360
+ padding: 1rem 0;
361
  text-align: center;
362
  transition: all 0.2s ease;
363
  cursor: pointer;
 
375
  font-size: 0.95rem;
376
  }
377
 
 
 
 
 
 
378
  .drop-icon {
379
  font-size: 2.5rem;
380
  color: var(--primary);
 
414
 
415
  .tab-content {
416
  display: none;
417
+ opacity: 0;
418
+ transform: translateY(-8px);
419
  }
420
 
421
  .tab-content.active {
422
  display: block;
423
+ opacity: 1;
424
+ transform: translateY(0);
425
+ animation: tab-fade-in 0.2s ease-out;
426
+ }
427
+
428
+ @keyframes tab-fade-in {
429
+ from {
430
+ opacity: 0;
431
+ transform: translateY(-8px);
432
+ }
433
+
434
+ to {
435
+ opacity: 1;
436
+ transform: translateY(0);
437
+ }
438
  }
439
 
440
  /* URL Form */
441
  .url-form {
442
  background: var(--surface);
443
  border-radius: 8px;
444
+ padding: 0.75rem;
445
  }
446
 
447
+ .url-input-row {
448
+ display: flex;
449
+ gap: 0.5rem;
450
+ align-items: center;
451
+ }
452
+
453
+ .url-input-row input[type="url"] {
454
+ flex: 1;
455
+ height: 40px;
456
+ padding: 0 0.75rem;
457
  background: var(--bg);
458
  border: 1px solid var(--border);
459
  border-radius: 6px;
460
  color: var(--text);
461
  font-size: 0.9rem;
 
462
  transition: border-color 0.15s ease;
463
  }
464
 
465
+ .url-input-row input[type="url"]:focus {
466
  outline: none;
467
  border-color: var(--primary);
468
  }
469
 
470
+ .url-input-row input[type="url"]::placeholder {
471
  color: var(--text-muted);
472
  }
473
 
 
503
  color: var(--text-muted);
504
  }
505
 
 
 
 
 
 
 
 
506
  /* Buttons */
507
  .submit-btn {
508
  width: 100%;
509
  padding: 0.6rem 1rem;
510
+ background: var(--surface);
511
+ color: var(--text);
512
+ border: 1px solid var(--border);
 
 
513
  border-radius: 8px;
514
  cursor: pointer;
515
  font-size: 0.9rem;
516
  font-weight: 500;
517
+ transition: all 0.2s ease;
518
  margin-bottom: 0.5rem;
519
  }
520
 
521
  .submit-btn:hover {
522
+ color: var(--primary);
523
+ border-color: var(--primary);
524
+ background: rgb(212, 118, 58, 0.08);
525
  }
526
 
527
  .submit-btn:disabled {
528
+ opacity: 0.5;
529
  cursor: not-allowed;
 
 
530
  }
531
 
532
+ /* URL form button override - must come after base .submit-btn */
533
+ .url-input-row .submit-btn {
534
+ width: 40px;
535
+ height: 40px;
536
+ min-width: 40px;
537
+ min-height: 40px;
538
+ padding: 0;
539
+ margin: 0;
540
+ flex-shrink: 0;
541
+ display: flex;
542
+ align-items: center;
543
+ justify-content: center;
544
+ border-radius: 6px;
545
+ }
546
+
547
+ .url-input-row .submit-btn i {
548
  margin: 0;
549
+ padding: 0;
550
+ line-height: 1;
551
+ display: block;
552
  }
553
 
554
  input[type="file"] {
555
  display: none;
556
  }
557
 
558
+ .hidden-file-input {
559
+ display: none !important;
560
+ }
561
+
562
  .file-label {
563
  display: inline-block;
564
  padding: 0.5rem 1rem;
565
+ background: var(--surface);
566
+ color: var(--text);
567
+ border: 1px solid var(--border);
 
568
  border-radius: 8px;
569
  cursor: pointer;
570
  font-weight: 500;
571
  font-size: 0.9rem;
572
+ transition: all 0.2s ease;
573
  }
574
 
575
  .file-label:hover {
576
+ color: var(--primary);
577
+ border-color: var(--primary);
578
+ background: rgb(212, 118, 58, 0.08);
579
  }
580
 
581
  /* Device Info - Subtle footer-like display */
 
583
  display: none;
584
  justify-content: center;
585
  align-items: center;
586
+ gap: 0.6rem;
587
+ padding: 0.4rem 1rem;
588
  font-size: 0.7rem;
589
  color: var(--text-muted);
590
+ margin-top: 0.25rem;
591
  opacity: 0.7;
592
+ line-height: 1.2;
593
  }
594
 
595
  .device-info.visible {
596
  display: flex;
597
  flex-wrap: wrap;
598
+ row-gap: 0.2rem;
599
  }
600
 
601
  .device-info i {
 
604
  }
605
 
606
  .device-memory {
607
+ display: flex;
608
+ align-items: center;
609
+ gap: 0.25rem;
610
+ }
611
+
612
+ .device-memory i {
613
+ font-size: 0.6rem;
614
+ }
615
+
616
+ .device-ephemeral {
617
+ display: flex;
618
+ align-items: center;
619
+ gap: 0.25rem;
620
+ color: var(--success);
621
+ }
622
+
623
+ .device-ephemeral i {
624
+ color: var(--success);
625
+ font-size: 0.6rem;
626
+ }
627
+
628
+ .device-timing {
629
+ display: flex;
630
+ align-items: center;
631
+ gap: 0.25rem;
632
+ color: var(--text-muted);
633
+ }
634
+
635
+ .device-timing i {
636
+ font-size: 0.6rem;
637
+ }
638
+
639
+ /* Model state indicators */
640
+ .model-state {
641
+ display: flex;
642
+ align-items: center;
643
+ gap: 0.25rem;
644
+ padding: 0.15rem 0.4rem;
645
+ border-radius: 4px;
646
+ font-size: 0.65rem;
647
+ }
648
+
649
+ .model-state i {
650
+ font-size: 0.55rem;
651
+ }
652
+
653
+ .model-loaded {
654
+ background: rgb(16, 185, 129, 0.15);
655
+ color: var(--success);
656
+ }
657
+
658
+ .model-loading,
659
+ .model-unloading {
660
+ background: rgb(245, 158, 11, 0.15);
661
+ color: #f59e0b;
662
+ }
663
+
664
+ .model-unloaded {
665
+ background: rgb(107, 114, 128, 0.15);
666
+ color: var(--text-muted);
667
  }
668
 
669
  .device-batch {
 
708
 
709
  /* Audio Player */
710
  .player {
711
+ margin-top: 1rem;
712
  width: 100%;
713
  display: none;
714
+ padding: 0.75rem 1rem;
715
  background: var(--surface);
716
  border-radius: 12px;
717
  border: 1px solid var(--border);
 
719
 
720
  .player.visible {
721
  display: block;
722
+ animation: slide-in 0.3s ease-out;
723
+ }
724
+
725
+ .player.deleting {
726
+ animation: slide-out 0.3s ease-out forwards;
727
+ }
728
+
729
+ @keyframes slide-in {
730
+ from {
731
+ opacity: 0;
732
+ transform: translateY(-10px);
733
+ }
734
+
735
+ to {
736
+ opacity: 1;
737
+ transform: translateY(0);
738
+ }
739
+ }
740
+
741
+ @keyframes slide-out {
742
+ from {
743
+ opacity: 1;
744
+ transform: translateY(0);
745
+ }
746
+
747
+ to {
748
+ opacity: 0;
749
+ transform: translateY(-10px);
750
+ }
751
  }
752
 
753
  /* Hidden audio element */
 
765
  .player-btn {
766
  width: 36px;
767
  height: 36px;
768
+ border: 1px solid var(--border);
769
  border-radius: 8px;
770
+ background: var(--surface);
771
+ color: var(--text-muted);
 
 
772
  cursor: pointer;
773
  display: flex;
774
  align-items: center;
775
  justify-content: center;
776
  font-size: 0.85rem;
777
+ transition: all 0.2s ease;
778
  flex-shrink: 0;
779
  }
780
 
781
  .player-btn:hover {
782
+ color: var(--primary);
783
+ border-color: var(--primary);
784
+ background: rgb(212, 118, 58, 0.08);
785
+ }
786
+
787
+ .player-btn.play-btn {
788
+ width: 40px;
789
+ height: 40px;
790
+ font-size: 0.9rem;
791
  }
792
 
793
  .player-btn.volume-btn {
 
 
 
 
794
  width: 32px;
795
  height: 32px;
796
  font-size: 0.8rem;
 
798
 
799
  .player-btn.volume-btn:hover {
800
  color: var(--text);
 
801
  }
802
 
803
  .player-btn.download-btn {
 
 
 
 
804
  width: 32px;
805
  height: 32px;
806
  font-size: 0.8rem;
 
808
 
809
  .player-btn.download-btn:hover {
810
  color: var(--primary);
811
+ }
812
+
813
+ .player-btn.delete-btn {
814
+ width: 32px;
815
+ height: 32px;
816
+ font-size: 0.8rem;
817
+ }
818
+
819
+ .player-btn.delete-btn:hover {
820
+ color: var(--error);
821
+ border-color: var(--error);
822
+ background: rgb(196, 90, 74, 0.08);
823
  }
824
 
825
  .progress-container {
 
879
  }
880
 
881
  .filename {
882
+ margin-bottom: 0.5rem;
883
  font-size: 0.85rem;
884
  font-weight: 500;
885
  color: var(--text);
886
  word-break: break-all;
887
+ display: flex;
888
+ align-items: center;
889
+ gap: 0.4rem;
890
+ flex-wrap: wrap;
891
+ }
892
+
893
+ .filename-meta {
894
+ display: inline-flex;
895
+ align-items: center;
896
+ gap: 0.35rem;
897
+ font-size: 0.8rem;
898
+ color: var(--text-muted);
899
+ margin-left: auto;
900
  }
901
 
902
  /* Spinner Animation */
 
909
  border-radius: 50%;
910
  animation: spin 1s linear infinite;
911
  margin-right: 0.4rem;
912
+ vertical-align: -2px;
913
  }
914
 
915
  @keyframes spin {
src/talking_snake/tts.py CHANGED
@@ -8,6 +8,7 @@ import time
8
  import wave
9
  from abc import ABC, abstractmethod
10
  from collections.abc import Iterator
 
11
  from typing import TYPE_CHECKING
12
 
13
  if TYPE_CHECKING:
@@ -42,18 +43,146 @@ class TTSEngineProtocol(ABC):
42
  return 1
43
 
44
 
45
- # Professional narration style prompt
46
- # This instructs the model to read with clear, authoritative delivery
47
- PROFESSIONAL_STYLE = (
48
- "Read this as a professional narrator with clear enunciation, "
49
- "measured pacing, and an authoritative yet warm tone. "
50
- "Speak naturally as if presenting an audiobook or documentary. "
51
- "Avoid sounding robotic or monotone. Emphasize key points and maintain a steady rhythm. "
52
- "Use appropriate intonation to convey meaning and keep the listener engaged. "
53
- "This is not casual conversation, but a polished narration style. "
54
- "Use proper diction, read correctly acronyms, and pronounce all words clearly."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  )
56
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  # Language to default voice mapping
58
  LANGUAGE_VOICES: dict[str, str] = {
59
  "english": "Ryan",
@@ -65,8 +194,8 @@ LANGUAGE_VOICES: dict[str, str] = {
65
  # Default chunk size for streaming
66
  # Larger chunks = more stable voice, fewer artifacts at boundaries
67
  # Smaller chunks = faster first audio but potential voice instability
68
- # 1200 chars provides good balance for natural speech flow
69
- DEFAULT_CHUNK_SIZE = 1200
70
 
71
  # Idle timeout before unloading model from GPU (seconds)
72
  # Set to 0 to disable auto-unloading
@@ -140,9 +269,19 @@ class QwenTTSEngine(TTSEngineProtocol):
140
  self._idle_timeout = idle_timeout
141
  self._last_activity = time.time()
142
  self._model_loaded = False
 
143
  self._lock = threading.Lock()
144
  self._unload_timer: threading.Timer | None = None
145
 
 
 
 
 
 
 
 
 
 
146
  # Model will be loaded on first request (lazy loading)
147
  self.model = None
148
 
@@ -150,6 +289,67 @@ class QwenTTSEngine(TTSEngineProtocol):
150
  if idle_timeout == 0:
151
  self._load_model()
152
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
  def _load_model(self) -> None:
154
  """Load the model onto GPU or CPU."""
155
  if self._model_loaded:
@@ -158,6 +358,7 @@ class QwenTTSEngine(TTSEngineProtocol):
158
  import torch
159
  from qwen_tts import Qwen3TTSModel
160
 
 
161
  device_name = "GPU" if self.device == "cuda" else "CPU"
162
  print(f"🔄 Loading TTS model onto {device_name}...")
163
  start = time.time()
@@ -186,6 +387,7 @@ class QwenTTSEngine(TTSEngineProtocol):
186
  )
187
 
188
  self._model_loaded = True
 
189
 
190
  # Calculate optimal batch size based on available VRAM
191
  if self.device == "cuda":
@@ -205,6 +407,7 @@ class QwenTTSEngine(TTSEngineProtocol):
205
 
206
  import torch
207
 
 
208
  print("💤 Unloading TTS model from GPU (idle timeout)...")
209
 
210
  # Delete model and clear references
@@ -218,6 +421,7 @@ class QwenTTSEngine(TTSEngineProtocol):
218
  torch.cuda.empty_cache()
219
  torch.cuda.synchronize()
220
 
 
221
  print("✅ GPU memory freed")
222
 
223
  def _schedule_unload(self) -> None:
@@ -307,6 +511,10 @@ class QwenTTSEngine(TTSEngineProtocol):
307
  # Type guard - model is guaranteed to be loaded after _ensure_model_loaded
308
  assert self.model is not None, "Model failed to load"
309
 
 
 
 
 
310
  try:
311
  # Split text into chunks for streaming
312
  chunks = self._split_text(text)
@@ -326,10 +534,9 @@ class QwenTTSEngine(TTSEngineProtocol):
326
  continue
327
 
328
  # Always use batched call for consistent GPU memory allocation
329
- # Use professional narration style for clear, authoritative delivery
330
- batch_instruct = (
331
- [PROFESSIONAL_STYLE] * len(batch) if len(batch) > 1 else PROFESSIONAL_STYLE
332
- )
333
  audios, sr = self.model.generate_custom_voice(
334
  text=batch if len(batch) > 1 else batch[0],
335
  speaker=[self.voice] * len(batch) if len(batch) > 1 else self.voice,
@@ -349,6 +556,9 @@ class QwenTTSEngine(TTSEngineProtocol):
349
  first_chunk = False
350
  yield wav_bytes
351
  finally:
 
 
 
352
  # Schedule model unload after idle timeout
353
  self._schedule_unload()
354
 
 
8
  import wave
9
  from abc import ABC, abstractmethod
10
  from collections.abc import Iterator
11
+ from dataclasses import dataclass
12
  from typing import TYPE_CHECKING
13
 
14
  if TYPE_CHECKING:
 
43
  return 1
44
 
45
 
46
+ @dataclass
47
+ class TTSStyle:
48
+ """Defines a TTS speaking style with its configuration."""
49
+
50
+ id: str # Unique identifier (e.g., "technical", "narrative")
51
+ name: str # Display name (e.g., "Technical Documentation")
52
+ icon: str # Font Awesome icon class (e.g., "fa-gear")
53
+ description: str # Short description for tooltips
54
+ prompt: str # The instruct prompt for the TTS model
55
+
56
+
57
+ # === TTS STYLES ===
58
+ # Each style provides a different speaking approach optimized for specific content types
59
+
60
+ STYLE_TECHNICAL = TTSStyle(
61
+ id="technical",
62
+ name="Technical",
63
+ icon="fa-microchip",
64
+ description="Clear, precise reading for code and technical documentation",
65
+ prompt=(
66
+ "You are a technical speech engine reading engineering documents. "
67
+ "Your task is to convert text into clear, accurate spoken output. "
68
+ "Read in a neutral, controlled, professional voice. "
69
+ "Do not sound expressive, emotional, or conversational. "
70
+ "Do not use audiobook, storytelling, or presenter intonation. "
71
+ "Prioritize intelligibility and correctness over naturalness. "
72
+ "Maintain steady pacing and flat prosody appropriate for scientific material. "
73
+ "Pronounce all acronyms as individual letters unless they are standard spoken words. "
74
+ "Pronounce symbols, operators, and punctuation when they affect meaning. "
75
+ "Preserve capitalization, parentheses, and formatting as part of the spoken output. "
76
+ "When reading code, equations, or identifiers, slow down and speak every token clearly. "
77
+ "Insert short pauses at commas and longer pauses at periods and line breaks. "
78
+ "Do not summarize, interpret, or rephrase. "
79
+ "Read exactly what is written."
80
+ ),
81
  )
82
 
83
+ STYLE_NARRATIVE = TTSStyle(
84
+ id="narrative",
85
+ name="Narrative",
86
+ icon="fa-book-open",
87
+ description="Natural, engaging reading for articles and stories",
88
+ prompt=(
89
+ "You are a professional narrative voice reading long-form text. "
90
+ "Your task is to tell a story in a clear, engaging, and natural way. "
91
+ "Use a warm, expressive, and fluid voice. "
92
+ "Vary intonation and rhythm to reflect meaning, emotion, and emphasis. "
93
+ "Sound human and immersive, not robotic or monotone. "
94
+ "Maintain smooth pacing, slowing for important moments, speeding up for transitions. "
95
+ "Use natural pauses at punctuation and paragraph breaks. "
96
+ "Pronounce all words clearly, but do not over-articulate symbols or formatting. "
97
+ "Read acronyms as spoken words when they are commonly pronounced that way. "
98
+ "Preserve the narrative flow and emotional tone of the text. "
99
+ "Do not flatten or neutralize the delivery."
100
+ ),
101
+ )
102
+
103
+ STYLE_CHILD_NARRATIVE = TTSStyle(
104
+ id="child_narrative",
105
+ name="Child Narrative",
106
+ icon="fa-child",
107
+ description="Playful, expressive reading for children's stories",
108
+ prompt=(
109
+ "You are a storyteller reading aloud to young children. "
110
+ "Your task is to tell a story in a friendly, gentle, and engaging way. "
111
+ "Use a warm, soft, and expressive voice. "
112
+ "Sound kind, calm, and reassuring. "
113
+ "Vary intonation to match emotions and actions in the story. "
114
+ "Maintain a slow to moderate pace with clear articulation. "
115
+ "Insert natural pauses so children can follow along. "
116
+ "Pronounce words simply and clearly. "
117
+ "Read acronyms and difficult words in their most familiar spoken form. "
118
+ "Keep the tone playful but soothing. "
119
+ "Do not sound technical, formal, or adult-oriented."
120
+ ),
121
+ )
122
+
123
+ STYLE_NEWS = TTSStyle(
124
+ id="news",
125
+ name="News",
126
+ icon="fa-newspaper",
127
+ description="Authoritative, clear delivery for news and reports",
128
+ prompt=(
129
+ "You are a professional news anchor delivering broadcast news. "
130
+ "Your task is to read information clearly, confidently, and with authority. "
131
+ "Use a neutral, composed, and trustworthy voice. "
132
+ "Avoid emotional or dramatic delivery. "
133
+ "Do not sound conversational or casual. "
134
+ "Maintain a steady, moderate pace with crisp articulation. "
135
+ "Use controlled intonation to mark headlines, key facts, and transitions. "
136
+ "Pronounce names, numbers, acronyms, and places carefully and accurately. "
137
+ "Pause briefly at commas and longer at periods and topic changes. "
138
+ "Sound factual, objective, and broadcast-ready at all times."
139
+ ),
140
+ )
141
+
142
+ STYLE_ACADEMIC = TTSStyle(
143
+ id="academic",
144
+ name="Academic",
145
+ icon="fa-graduation-cap",
146
+ description="Measured, scholarly reading for papers and research",
147
+ prompt=(
148
+ "You are an academic speech engine reading peer-reviewed scientific papers. "
149
+ "Your task is to render complex scholarly text into clear, precise spoken language. "
150
+ "Use a neutral, formal, and controlled voice. "
151
+ "Do not sound expressive, emotional, or conversational. "
152
+ "Do not use audiobook or presenter intonation. "
153
+ "Maintain steady pacing suitable for dense technical material. "
154
+ "Favor clarity and accuracy over naturalness. "
155
+ "Pronounce technical terminology, Greek letters, acronyms, and units correctly. "
156
+ "Read acronyms as individual letters unless they are standard spoken words. "
157
+ "Preserve capitalization, punctuation, and structure when they affect meaning. "
158
+ "Insert short pauses at commas and longer pauses at periods and section breaks. "
159
+ "Slow down slightly for equations, symbols, gene names, and references. "
160
+ "Do not summarize, interpret, or simplify the text. "
161
+ "Read exactly what is written."
162
+ ),
163
+ )
164
+
165
+ # Registry of all available styles
166
+ TTS_STYLES: dict[str, TTSStyle] = {
167
+ style.id: style
168
+ for style in [
169
+ STYLE_TECHNICAL,
170
+ STYLE_NARRATIVE,
171
+ STYLE_CHILD_NARRATIVE,
172
+ STYLE_NEWS,
173
+ STYLE_ACADEMIC,
174
+ ]
175
+ }
176
+
177
+ # Default style
178
+ DEFAULT_STYLE = STYLE_TECHNICAL
179
+
180
+
181
+ def get_style(style_id: str) -> TTSStyle:
182
+ """Get a TTS style by ID, falling back to default if not found."""
183
+ return TTS_STYLES.get(style_id, DEFAULT_STYLE)
184
+
185
+
186
  # Language to default voice mapping
187
  LANGUAGE_VOICES: dict[str, str] = {
188
  "english": "Ryan",
 
194
  # Default chunk size for streaming
195
  # Larger chunks = more stable voice, fewer artifacts at boundaries
196
  # Smaller chunks = faster first audio but potential voice instability
197
+ # 1800 chars provides good balance for natural speech flow
198
+ DEFAULT_CHUNK_SIZE = 1800
199
 
200
  # Idle timeout before unloading model from GPU (seconds)
201
  # Set to 0 to disable auto-unloading
 
269
  self._idle_timeout = idle_timeout
270
  self._last_activity = time.time()
271
  self._model_loaded = False
272
+ self._model_state = "unloaded" # unloaded, loading, loaded, unloading
273
  self._lock = threading.Lock()
274
  self._unload_timer: threading.Timer | None = None
275
 
276
+ # Calibrated seconds per character (measured and updated over time)
277
+ self._seconds_per_char: float | None = None
278
+ # Cumulative stats for running average
279
+ self._total_chars_processed: int = 0
280
+ self._total_time_spent: float = 0.0
281
+
282
+ # Current style for TTS
283
+ self._style: TTSStyle = DEFAULT_STYLE
284
+
285
  # Model will be loaded on first request (lazy loading)
286
  self.model = None
287
 
 
289
  if idle_timeout == 0:
290
  self._load_model()
291
 
292
+ @property
293
+ def style(self) -> TTSStyle:
294
+ """Return the current TTS style."""
295
+ return self._style
296
+
297
+ def set_style(self, style_id: str) -> None:
298
+ """Set the TTS style by ID.
299
+
300
+ Args:
301
+ style_id: Style identifier (technical, narrative, child_narrative, news, academic).
302
+ """
303
+ self._style = get_style(style_id)
304
+
305
+ @property
306
+ def model_state(self) -> str:
307
+ """Return the current model state: unloaded, loading, loaded, or unloading."""
308
+ return self._model_state
309
+
310
+ @property
311
+ def seconds_per_char(self) -> float | None:
312
+ """Return calibrated seconds per character, or None if not yet measured."""
313
+ return self._seconds_per_char
314
+
315
+ @property
316
+ def total_chars_processed(self) -> int:
317
+ """Return total characters processed since startup."""
318
+ return self._total_chars_processed
319
+
320
+ def _update_timing_stats(self, chars: int, elapsed: float) -> None:
321
+ """Update cumulative timing statistics.
322
+
323
+ Args:
324
+ chars: Number of characters processed.
325
+ elapsed: Time taken in seconds.
326
+ """
327
+ self._total_chars_processed += chars
328
+ self._total_time_spent += elapsed
329
+ if self._total_chars_processed > 0:
330
+ self._seconds_per_char = self._total_time_spent / self._total_chars_processed
331
+
332
+ def calibrate(self, test_text: str = "Hello, this is a calibration test.") -> float:
333
+ """Run a calibration test to measure seconds per character.
334
+
335
+ Args:
336
+ test_text: Short text to use for calibration.
337
+
338
+ Returns:
339
+ Measured seconds per character.
340
+ """
341
+ self._ensure_model_loaded()
342
+
343
+ start = time.time()
344
+ # Consume the generator to complete synthesis
345
+ for _ in self.synthesize(test_text):
346
+ pass
347
+ elapsed = time.time() - start
348
+
349
+ self._seconds_per_char = elapsed / len(test_text)
350
+ print(f"⏱️ Calibrated: {self._seconds_per_char:.4f}s per character")
351
+ return self._seconds_per_char
352
+
353
  def _load_model(self) -> None:
354
  """Load the model onto GPU or CPU."""
355
  if self._model_loaded:
 
358
  import torch
359
  from qwen_tts import Qwen3TTSModel
360
 
361
+ self._model_state = "loading"
362
  device_name = "GPU" if self.device == "cuda" else "CPU"
363
  print(f"🔄 Loading TTS model onto {device_name}...")
364
  start = time.time()
 
387
  )
388
 
389
  self._model_loaded = True
390
+ self._model_state = "loaded"
391
 
392
  # Calculate optimal batch size based on available VRAM
393
  if self.device == "cuda":
 
407
 
408
  import torch
409
 
410
+ self._model_state = "unloading"
411
  print("💤 Unloading TTS model from GPU (idle timeout)...")
412
 
413
  # Delete model and clear references
 
421
  torch.cuda.empty_cache()
422
  torch.cuda.synchronize()
423
 
424
+ self._model_state = "unloaded"
425
  print("✅ GPU memory freed")
426
 
427
  def _schedule_unload(self) -> None:
 
511
  # Type guard - model is guaranteed to be loaded after _ensure_model_loaded
512
  assert self.model is not None, "Model failed to load"
513
 
514
+ # Track timing for this synthesis
515
+ synthesis_start = time.time()
516
+ chars_in_text = len(text)
517
+
518
  try:
519
  # Split text into chunks for streaming
520
  chunks = self._split_text(text)
 
534
  continue
535
 
536
  # Always use batched call for consistent GPU memory allocation
537
+ # Use the current style's prompt for delivery
538
+ style_prompt = self._style.prompt
539
+ batch_instruct = [style_prompt] * len(batch) if len(batch) > 1 else style_prompt
 
540
  audios, sr = self.model.generate_custom_voice(
541
  text=batch if len(batch) > 1 else batch[0],
542
  speaker=[self.voice] * len(batch) if len(batch) > 1 else self.voice,
 
556
  first_chunk = False
557
  yield wav_bytes
558
  finally:
559
+ # Update timing stats for future estimates
560
+ elapsed = time.time() - synthesis_start
561
+ self._update_timing_stats(chars_in_text, elapsed)
562
  # Schedule model unload after idle timeout
563
  self._schedule_unload()
564