jkorstad committed on
Commit
d63ba06
·
1 Parent(s): e5292e6

Major enhancement: file upload, chapter detection, segment previews, multi-format export, speed control, project save/load, quick generate, sample stories

Browse files
__pycache__/app.cpython-311.pyc CHANGED
Binary files a/__pycache__/app.cpython-311.pyc and b/__pycache__/app.cpython-311.pyc differ
 
__pycache__/backend.cpython-311.pyc CHANGED
Binary files a/__pycache__/backend.cpython-311.pyc and b/__pycache__/backend.cpython-311.pyc differ
 
app.py CHANGED
@@ -1,9 +1,12 @@
1
  """
2
- AudioBook Forge - Gradio Frontend
3
- High-fidelity audiobook generator with character voice mapping.
 
4
  """
5
 
6
  import os
 
 
7
  from pathlib import Path
8
  from typing import Dict, List, Optional
9
 
@@ -31,6 +34,10 @@ from backend import (
31
  AudiobookPipeline,
32
  VoiceConfig,
33
  PRESET_SPEAKERS,
 
 
 
 
34
  )
35
 
36
  # ---------------------------------------------------------------------------
@@ -79,6 +86,25 @@ body, .gradio-container {
79
  padding: 1.25rem !important;
80
  }
81
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  button.primary {
83
  background: linear-gradient(135deg, #6366f1, #4f46e5) !important;
84
  border: none !important;
@@ -145,6 +171,42 @@ code {
145
  padding: 0.1rem 0.3rem !important;
146
  border-radius: 4px !important;
147
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
  """
149
 
150
  # ---------------------------------------------------------------------------
@@ -152,6 +214,10 @@ code {
152
  # ---------------------------------------------------------------------------
153
 
154
  _pipeline: Optional[AudiobookPipeline] = None
 
 
 
 
155
 
156
 
157
  def get_pipeline() -> AudiobookPipeline:
@@ -175,6 +241,36 @@ def on_mode_change(mode: str) -> tuple:
175
  return gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=True)
176
 
177
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
  def extract_chars(text: str, use_ai: bool) -> tuple:
179
  if not text or len(text.strip()) < 20:
180
  return [], "Text too short. Please paste at least a paragraph."
@@ -184,6 +280,13 @@ def extract_chars(text: str, use_ai: bool) -> tuple:
184
  return chars, status
185
 
186
 
 
 
 
 
 
 
 
187
  # ---------------------------------------------------------------------------
188
  # GPU-wrapped functions (ZeroGPU)
189
  # ---------------------------------------------------------------------------
@@ -191,12 +294,12 @@ def extract_chars(text: str, use_ai: bool) -> tuple:
191
  @spaces.GPU(duration=180)
192
  def generate_audiobook_gpu(
193
  text,
194
- nar_mode, nar_preset, nar_audio, nar_ref_text, nar_design, nar_instruct, nar_lang,
195
- gen_temp, gen_seed,
196
- names, descs, modes, presets, audios, ref_texts, designs, instructs, langs,
197
  ):
198
  if not text or len(text.strip()) < 50:
199
- return None, "Error: Please provide at least 50 characters of story text."
200
 
201
  pipe = get_pipeline()
202
 
@@ -209,6 +312,7 @@ def generate_audiobook_gpu(
209
  design_desc=nar_design if nar_mode == "design" else None,
210
  instruct=nar_instruct,
211
  language=nar_lang,
 
212
  )
213
 
214
  char_configs = {}
@@ -224,6 +328,7 @@ def generate_audiobook_gpu(
224
  design_desc=designs[i] if modes[i] == "design" else None,
225
  instruct=instructs[i] or "",
226
  language=langs[i],
 
227
  )
228
  char_configs[names[i]] = vc
229
 
@@ -235,7 +340,7 @@ def generate_audiobook_gpu(
235
  print(progress_text)
236
 
237
  try:
238
- output_path, _ = pipe.generate(
239
  text=text,
240
  narrator_config=nar_cfg,
241
  character_configs=char_configs,
@@ -243,15 +348,38 @@ def generate_audiobook_gpu(
243
  temperature=gen_temp,
244
  seed=int(gen_seed),
245
  )
246
- return output_path, f"Done! Audiobook generated."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
247
  except Exception as e:
248
  import traceback
249
  traceback.print_exc()
250
- return None, f"Error: {str(e)}"
251
 
252
 
253
  @spaces.GPU(duration=60)
254
- def preview_narrator_gpu(mode, preset, audio, ref_text, design, instruct, lang):
255
  pipe = get_pipeline()
256
  vc = VoiceConfig(
257
  name="Narrator",
@@ -262,6 +390,7 @@ def preview_narrator_gpu(mode, preset, audio, ref_text, design, instruct, lang):
262
  design_desc=design if mode == "design" else None,
263
  instruct=instruct,
264
  language=lang,
 
265
  )
266
  try:
267
  wav, sr = pipe.preview_voice(vc)
@@ -272,6 +401,163 @@ def preview_narrator_gpu(mode, preset, audio, ref_text, design, instruct, lang):
272
  return None, f"Preview failed: {e}"
273
 
274
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
275
  # ---------------------------------------------------------------------------
276
  # Build UI
277
  # ---------------------------------------------------------------------------
@@ -308,107 +594,126 @@ def build_app():
308
  """)
309
 
310
  with gr.Tabs():
311
- # ==================== TAB 1 ====================
312
- with gr.TabItem("📖 Story Setup"):
313
  with gr.Row():
314
  with gr.Column(scale=2):
 
 
 
 
 
315
  story_input = gr.TextArea(
316
  label="Story Text",
317
  placeholder="Paste your book chapter, short story, or script here...",
318
- lines=20,
319
  max_lines=40,
320
  )
 
 
 
 
 
 
321
  with gr.Column(scale=1):
322
- gr.Markdown("### Character Detection")
323
- use_ai_check = gr.Checkbox(
324
- label="Use AI enhancement (slower, more accurate)",
325
- value=False,
 
 
 
 
 
 
 
 
 
 
 
 
326
  )
327
- extract_btn = gr.Button("🔍 Extract Characters", variant="primary")
 
 
328
  gr.Markdown("---")
329
- gr.Markdown("**Tips:**")
330
- gr.Markdown("- Use `Character: \"dialogue\"` format for best results.")
331
- gr.Markdown("- Or standard prose with quoted dialogue.")
332
- gr.Markdown("- AI mode uses a small LLM for deeper analysis.")
 
 
 
 
 
 
 
 
 
 
 
333
 
334
  extract_status = gr.Textbox(label="Status", interactive=False)
335
 
336
- # ==================== TAB 2 ====================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
337
  with gr.TabItem("🎭 Voice Cast"):
338
  with gr.Row():
339
  with gr.Column(scale=1):
340
  gr.Markdown("## Narrator")
341
  with gr.Column(elem_classes="ab-card"):
342
- nar_mode = gr.Dropdown(
343
- choices=["preset", "clone", "design"],
344
- value="preset",
345
- label="Narrator Mode",
346
- )
347
- nar_preset = gr.Dropdown(
348
- choices=list(PRESET_SPEAKERS.keys()),
349
- value="Ryan",
350
- label="Preset Voice",
351
- )
352
- nar_audio = gr.Audio(
353
- label="Upload Voice Sample (3–10s)",
354
- type="filepath",
355
- visible=False,
356
- )
357
- nar_ref_text = gr.Textbox(
358
- label="Reference Transcript",
359
- placeholder="What does the reference audio say?",
360
- visible=False,
361
- )
362
- nar_design = gr.TextArea(
363
- label="Voice Description",
364
- placeholder="e.g. A warm, raspy baritone with a slight British accent.",
365
- visible=False,
366
- lines=2,
367
- )
368
- nar_instruct = gr.Textbox(
369
- label="Style Instruction",
370
- placeholder="e.g. Calm, measured storytelling pace.",
371
- )
372
- nar_lang = gr.Dropdown(
373
- choices=["English", "Chinese", "Japanese", "Korean", "German", "French", "Spanish", "Italian", "Portuguese", "Russian"],
374
- value="English",
375
- label="Language",
376
- )
377
  nar_preview_btn = gr.Button("🔊 Preview Narrator", variant="secondary")
378
  nar_preview_audio = gr.Audio(label="Preview", interactive=False)
379
  nar_preview_status = gr.Textbox(show_label=False, interactive=False)
380
 
381
- nar_mode.change(
382
- on_mode_change,
383
- inputs=nar_mode,
384
- outputs=[nar_preset, nar_audio, nar_ref_text, nar_design],
385
- )
386
  nar_preview_btn.click(
387
  preview_narrator_gpu,
388
- inputs=[nar_mode, nar_preset, nar_audio, nar_ref_text, nar_design, nar_instruct, nar_lang],
389
  outputs=[nar_preview_audio, nar_preview_status],
390
  )
391
 
392
  with gr.Column(scale=2):
393
  gr.Markdown("## Character Voices")
394
- gr.Markdown("Configure up to 8 characters. Use **preset** for built-in speakers, **clone** to upload a voice sample, or **design** to describe a voice from text.")
395
-
396
- char_names = []
397
- char_descs = []
398
- char_modes = []
399
- char_presets = []
400
- char_audios = []
401
- char_ref_texts = []
402
- char_designs = []
403
- char_instructs = []
404
- char_langs = []
405
- char_rows = []
406
 
407
  for i in range(8):
408
  visible_default = (i == 0)
409
  with gr.Group(visible=visible_default) as row:
410
  with gr.Row():
411
- cn = gr.Textbox(label=f"Name", placeholder="e.g. Alice", visible=visible_default)
412
  cd = gr.Textbox(label="Description", placeholder="Personality note", visible=visible_default)
413
  cm = gr.Dropdown(label="Mode", choices=["preset", "clone", "design"], value="preset", visible=visible_default)
414
  cp = gr.Dropdown(label="Preset", choices=list(PRESET_SPEAKERS.keys()), value="Ryan", visible=visible_default)
@@ -418,11 +723,16 @@ def build_app():
418
  cdes = gr.TextArea(label="Voice Description", placeholder="e.g. A shrill, nervous teenager.", visible=False, lines=2)
419
  cinstr = gr.Textbox(label="Style Instruction", placeholder="e.g. Angry and loud.", visible=visible_default)
420
  cl = gr.Dropdown(label="Language", choices=["English", "Chinese", "Japanese", "Korean", "German", "French", "Spanish", "Italian", "Portuguese", "Russian"], value="English", visible=visible_default)
421
-
422
- cm.change(
423
- on_mode_change,
424
- inputs=cm,
425
- outputs=[cp, ca, crt, cdes],
 
 
 
 
 
426
  )
427
 
428
  char_rows.append(row)
@@ -435,46 +745,72 @@ def build_app():
435
  char_designs.append(cdes)
436
  char_instructs.append(cinstr)
437
  char_langs.append(cl)
 
 
 
438
 
439
- # ==================== TAB 3 ====================
440
  with gr.TabItem("⚡ Generate"):
441
  with gr.Row():
442
  with gr.Column(scale=1):
443
  gr.Markdown("### Settings")
444
  gen_temp = gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.05, label="Temperature")
445
  gen_seed = gr.Number(value=42, precision=0, label="Seed (fix for consistency)")
446
- gen_btn = gr.Button("▶️ Generate Audiobook", variant="primary", size="lg")
 
447
  gen_progress = gr.Textbox(label="Progress", interactive=False, value="Ready.")
448
 
449
  with gr.Column(scale=2):
450
  gr.Markdown("### Output")
451
  output_audio = gr.Audio(label="Generated Audiobook", type="filepath", interactive=False)
452
  output_status = gr.Textbox(label="Status", interactive=False)
 
453
 
454
- # ==================== TAB 4 ====================
 
 
 
 
 
 
 
 
 
 
 
 
 
455
  with gr.TabItem("ℹ️ About"):
456
  gr.Markdown("""
457
  ## AudioBook Forge
458
 
459
- **Model-agnostic, high-fidelity audiobook generator** powered by [Qwen3-TTS](https://github.com/QwenLM/Qwen3-TTS). Create audiobooks where every character speaks with their own unique voice.
460
-
461
- ## Features
462
-
463
- - 🎙️ **Character Voice Mapping** — Automatically detect characters from your story and assign unique voices to each one
464
- - 🎭 **Three Voice Modes**
465
- - **Preset** — 9 premium built-in speakers (English, Chinese, Japanese, Korean, dialects)
466
- - **Clone** — Upload a 3–10 second voice sample to clone any voice
467
- - **Design** — Describe a voice in text and the AI creates it
468
- - 📖 **Smart Text Processing** — Automatically distinguishes narration from dialogue and routes each segment to the correct voice
469
- - 🌐 **Multilingual** — Supports 10 languages via Qwen3-TTS
470
- - **ZeroGPU** — Runs on Hugging Face ZeroGPU (free A100 compute)
471
- - 🔧 **Model Agnostic** — Backend is swappable; upgrade to future SOTA TTS models without changing the UI
 
 
 
 
 
 
 
472
 
473
  ### Tips for Best Quality
474
- - Use clean, noise-free voice samples for cloning (3–10 seconds).
475
- - Keep reference transcripts accurate — they guide the cloning quality.
476
- - Lower temperature (0.5–0.6) for stable narration; higher (0.8–0.9) for expressive dialogue.
477
- - Use a fixed seed across chunks to prevent voice drift.
 
478
  """)
479
 
480
  # ---------- Extract wiring ----------
@@ -494,6 +830,9 @@ def build_app():
494
  gr.update(visible=False),
495
  gr.update(value=chars[i].get("voice_instruct", ""), visible=True),
496
  gr.update(value=chars[i].get("language", "English"), visible=True),
 
 
 
497
  ])
498
  else:
499
  updates.extend([
@@ -507,35 +846,68 @@ def build_app():
507
  gr.update(visible=False),
508
  gr.update(visible=False),
509
  gr.update(visible=False),
 
 
 
510
  ])
511
  return [status] + updates
512
 
513
- extract_btn.click(
514
- do_extract,
515
- inputs=[story_input, use_ai_check],
516
- outputs=[extract_status] + [
517
- item for sublist in [
518
- [char_rows[i], char_names[i], char_descs[i], char_modes[i], char_presets[i],
519
- char_audios[i], char_ref_texts[i], char_designs[i], char_instructs[i], char_langs[i]]
520
- for i in range(8)
521
- ] for item in sublist
522
- ],
523
- )
524
 
525
  # ---------- Generate wiring ----------
526
  all_char_inputs = (
527
  char_names + char_descs + char_modes + char_presets +
528
- char_audios + char_ref_texts + char_designs + char_instructs + char_langs
529
  )
530
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
531
  gen_btn.click(
532
- generate_audiobook_gpu,
 
 
 
 
 
 
 
533
  inputs=[
534
  story_input,
535
- nar_mode, nar_preset, nar_audio, nar_ref_text, nar_design, nar_instruct, nar_lang,
536
- gen_temp, gen_seed,
537
- ] + all_char_inputs,
538
- outputs=[output_audio, output_status],
 
 
 
 
 
539
  )
540
 
541
  return demo
 
1
  """
2
+ AudioBook Forge - Enhanced Gradio Frontend
3
+ High-fidelity audiobook generator with character voice mapping,
4
+ file upload, chapter selection, segment previews, and project save/load.
5
  """
6
 
7
  import os
8
+ import json
9
+ import base64
10
  from pathlib import Path
11
  from typing import Dict, List, Optional
12
 
 
34
  AudiobookPipeline,
35
  VoiceConfig,
36
  PRESET_SPEAKERS,
37
+ SAMPLE_STORIES,
38
+ save_project,
39
+ load_project,
40
+ estimate_duration,
41
  )
42
 
43
  # ---------------------------------------------------------------------------
 
86
  padding: 1.25rem !important;
87
  }
88
 
89
+ .ab-stat {
90
+ background: #0f172a;
91
+ border: 1px solid #334155;
92
+ border-radius: 10px;
93
+ padding: 0.75rem 1rem;
94
+ text-align: center;
95
+ }
96
+ .ab-stat .value {
97
+ font-size: 1.4rem;
98
+ font-weight: 700;
99
+ color: #22d3ee;
100
+ }
101
+ .ab-stat .label {
102
+ font-size: 0.75rem;
103
+ color: #94a3b8;
104
+ text-transform: uppercase;
105
+ letter-spacing: 0.05em;
106
+ }
107
+
108
  button.primary {
109
  background: linear-gradient(135deg, #6366f1, #4f46e5) !important;
110
  border: none !important;
 
171
  padding: 0.1rem 0.3rem !important;
172
  border-radius: 4px !important;
173
  }
174
+
175
+ /* Progress bar styling */
176
+ progress {
177
+ width: 100%;
178
+ height: 8px;
179
+ border-radius: 4px;
180
+ background: #334155;
181
+ }
182
+ progress::-webkit-progress-bar {
183
+ background: #334155;
184
+ border-radius: 4px;
185
+ }
186
+ progress::-webkit-progress-value {
187
+ background: linear-gradient(90deg, #6366f1, #22d3ee);
188
+ border-radius: 4px;
189
+ }
190
+
191
+ /* Segment list styling */
192
+ .seg-item {
193
+ background: #0f172a;
194
+ border: 1px solid #334155;
195
+ border-radius: 8px;
196
+ padding: 0.5rem 0.75rem;
197
+ margin-bottom: 0.4rem;
198
+ font-size: 0.85rem;
199
+ }
200
+ .seg-item .seg-type {
201
+ display: inline-block;
202
+ padding: 0.1rem 0.4rem;
203
+ border-radius: 4px;
204
+ font-size: 0.7rem;
205
+ font-weight: 600;
206
+ text-transform: uppercase;
207
+ }
208
+ .seg-type.narration { background: #4f46e5; color: #fff; }
209
+ .seg-type.dialogue { background: #22d3ee; color: #0f172a; }
210
  """
211
 
212
  # ---------------------------------------------------------------------------
 
214
  # ---------------------------------------------------------------------------
215
 
216
  _pipeline: Optional[AudiobookPipeline] = None
217
+ _stored_text: str = ""
218
+ _stored_chapters: List[Dict] = []
219
+ _stored_segments_meta: List[Dict] = []
220
+ _stored_segment_paths: List[str] = []
221
 
222
 
223
  def get_pipeline() -> AudiobookPipeline:
 
241
  return gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=True)
242
 
243
 
244
def update_stats(text: str) -> tuple:
    """Return the word count (as a string) and the estimated duration of *text*.

    Falsy text (``None`` or ``""``) counts as zero words. The second element
    is whatever ``estimate_duration`` returns for that word count.
    """
    word_count = 0
    if text:
        # Whitespace-delimited tokens are treated as words.
        word_count = len(text.split())
    duration = estimate_duration(word_count)
    return str(word_count), duration
248
+
249
+
250
def load_sample(name: str) -> str:
    """Look up a bundled sample story by name.

    Names that are not keys of ``SAMPLE_STORIES`` (including ``None``, which
    the dropdown starts with) yield an empty string.
    """
    if name in SAMPLE_STORIES:
        return SAMPLE_STORIES[name]
    return ""
252
+
253
+
254
def handle_upload(file_obj) -> tuple:
    """Parse an uploaded document and cache its cleaned text and chapter list.

    Args:
        file_obj: The value delivered by the upload widget, or ``None`` when
            no file is selected.

    Returns:
        A ``(story_text, status_message)`` pair. When parsing fails the first
        element is a no-op ``gr.update()`` so the text box keeps whatever it
        already contained instead of being blanked.

    Side effects:
        On success, replaces the module-level ``_stored_text`` and
        ``_stored_chapters``. Both are written together, only after chapter
        detection has succeeded, so they never describe different documents.
    """
    global _stored_text, _stored_chapters

    if file_obj is None:
        return "", "No file uploaded."

    try:
        pipe = get_pipeline()
        text, fname = pipe.parse_upload(file_obj)
        text = pipe.processor.clean_text(text)
        chapters = pipe.detect_chapters(text)

        # Commit the cache only once every parsing step has succeeded.
        _stored_text, _stored_chapters = text, chapters

        # Summarise at most the first five chapters to keep the status short.
        ch_info = " | ".join(
            f"Ch{c['idx']+1}: {c['word_count']}w" for c in chapters[:5]
        )
        if len(chapters) > 5:
            ch_info += f" (+{len(chapters)-5} more)"

        wc = len(text.split())
        dur = estimate_duration(wc)
        chapter_summary = ch_info if chapters else "1 (auto)"
        return text, f"Loaded {fname} — {wc} words (~{dur}) | Chapters: {chapter_summary}"
    except Exception as e:
        # UI boundary: log the traceback like the other handlers in this file
        # and report the failure without touching the story text box.
        import traceback
        traceback.print_exc()
        return gr.update(), f"Error: {e}"
272
+
273
+
274
  def extract_chars(text: str, use_ai: bool) -> tuple:
275
  if not text or len(text.strip()) < 20:
276
  return [], "Text too short. Please paste at least a paragraph."
 
280
  return chars, status
281
 
282
 
283
def get_chapter_text(text: str, chapter_idx: int) -> str:
    """Return one chapter's text by delegating to the shared pipeline.

    Falsy *text* short-circuits to an empty string without creating or
    touching the pipeline.
    """
    return get_pipeline().get_chapter_text(text, chapter_idx) if text else ""
288
+
289
+
290
  # ---------------------------------------------------------------------------
291
  # GPU-wrapped functions (ZeroGPU)
292
  # ---------------------------------------------------------------------------
 
294
  @spaces.GPU(duration=180)
295
  def generate_audiobook_gpu(
296
  text,
297
+ nar_mode, nar_preset, nar_audio, nar_ref_text, nar_design, nar_instruct, nar_lang, nar_speed,
298
+ gen_temp, gen_seed, output_fmt,
299
+ names, descs, modes, presets, audios, ref_texts, designs, instructs, langs, speeds,
300
  ):
301
  if not text or len(text.strip()) < 50:
302
+ return None, None, "Error: Please provide at least 50 characters of story text.", ""
303
 
304
  pipe = get_pipeline()
305
 
 
312
  design_desc=nar_design if nar_mode == "design" else None,
313
  instruct=nar_instruct,
314
  language=nar_lang,
315
+ speed=float(nar_speed),
316
  )
317
 
318
  char_configs = {}
 
328
  design_desc=designs[i] if modes[i] == "design" else None,
329
  instruct=instructs[i] or "",
330
  language=langs[i],
331
+ speed=float(speeds[i]) if speeds[i] else 1.0,
332
  )
333
  char_configs[names[i]] = vc
334
 
 
340
  print(progress_text)
341
 
342
  try:
343
+ output_path, seg_paths, seg_meta = pipe.generate(
344
  text=text,
345
  narrator_config=nar_cfg,
346
  character_configs=char_configs,
 
348
  temperature=gen_temp,
349
  seed=int(gen_seed),
350
  )
351
+ global _stored_segment_paths, _stored_segments_meta
352
+ _stored_segment_paths = seg_paths
353
+ _stored_segments_meta = seg_meta
354
+
355
+ # Build segment list HTML
356
+ seg_html = "<div style='max-height: 300px; overflow-y: auto;'>"
357
+ for s in seg_meta[:50]:
358
+ tclass = "narration" if s['type'] == 'narration' else "dialogue"
359
+ seg_html += f"<div class='seg-item'><span class='seg-type {tclass}'>{s['type']}</span> <strong>{s['speaker']}</strong>: {s['text']}</div>"
360
+ if len(seg_meta) > 50:
361
+ seg_html += f"<div style='text-align:center;color:#94a3b8;padding:0.5rem;'>... and {len(seg_meta)-50} more segments</div>"
362
+ seg_html += "</div>"
363
+
364
+ # Extra export
365
+ extra_path = None
366
+ if output_fmt == "wav":
367
+ extra_path = output_path.replace(".mp3", ".wav")
368
+ from backend import save_audiobook
369
+ save_audiobook(seg_paths, extra_path, fmt="wav")
370
+ elif output_fmt == "zip":
371
+ extra_path = pipe.export_segments_zip(seg_paths)
372
+
373
+ final_path = extra_path if extra_path else output_path
374
+ return final_path, seg_html, f"Done! {len(seg_meta)} segments generated.", progress_text
375
  except Exception as e:
376
  import traceback
377
  traceback.print_exc()
378
+ return None, "", f"Error: {str(e)}", progress_text
379
 
380
 
381
  @spaces.GPU(duration=60)
382
+ def preview_narrator_gpu(mode, preset, audio, ref_text, design, instruct, lang, speed):
383
  pipe = get_pipeline()
384
  vc = VoiceConfig(
385
  name="Narrator",
 
390
  design_desc=design if mode == "design" else None,
391
  instruct=instruct,
392
  language=lang,
393
+ speed=float(speed),
394
  )
395
  try:
396
  wav, sr = pipe.preview_voice(vc)
 
401
  return None, f"Preview failed: {e}"
402
 
403
 
404
@spaces.GPU(duration=60)
def preview_char_voice_gpu(name, mode, preset, audio, ref_text, design, instruct, lang, speed):
    """Synthesize a short self-introduction in a character's configured voice.

    Only the fields relevant to the selected *mode* ("preset", "clone" or
    "design") are forwarded to ``VoiceConfig``; the others are passed as
    ``None``. A falsy *speed* falls back to 1.0.

    Returns:
        ``((sample_rate, waveform), status)`` on success, or
        ``(None, error_message)`` if the preview call raises.
    """
    pipe = get_pipeline()

    cloning = mode == "clone"
    voice = VoiceConfig(
        name=name or "Character",
        mode=mode,
        preset=preset if mode == "preset" else None,
        # A clone without an uploaded sample is passed as "no reference audio".
        ref_audio=audio if cloning and audio else None,
        ref_text=ref_text if cloning else None,
        design_desc=design if mode == "design" else None,
        instruct=instruct,
        language=lang,
        speed=float(speed) if speed else 1.0,
    )

    try:
        spoken_name = name or 'your character'
        sample = f"Hello, I am {spoken_name}. This is how I sound in the story."
        wav, sr = pipe.preview_voice(voice, sample_text=sample)
    except Exception as e:
        import traceback
        traceback.print_exc()
        return None, f"Preview failed: {e}"
    return (sr, wav), f"{name or 'Character'} preview ready!"
426
+
427
+
428
+ # ---------------------------------------------------------------------------
429
+ # Project Save/Load
430
+ # ---------------------------------------------------------------------------
431
+
432
def do_save_project(text, nar_mode, nar_preset, nar_audio, nar_ref_text, nar_design, nar_instruct, nar_lang, nar_speed,
                    names, descs, modes, presets, audios, ref_texts, designs, instructs, langs, speeds,
                    gen_temp, gen_seed):
    """Serialize the current story, voice cast and settings via ``save_project``.

    The narrator and each named character (slots 0-7) are turned into
    ``VoiceConfig`` objects; slots with an empty name are skipped. Only the
    fields relevant to each voice's mode are populated. ``descs`` is accepted
    for signature parity with the other handlers but is not used here.

    Returns:
        Whatever ``save_project`` returns for the assembled project.
    """
    narrator_clones = nar_mode == "clone"
    nar_cfg = VoiceConfig(
        name="Narrator",
        mode=nar_mode,
        preset=nar_preset if nar_mode == "preset" else None,
        ref_audio=nar_audio if narrator_clones and nar_audio else None,
        ref_text=nar_ref_text if narrator_clones else None,
        design_desc=nar_design if nar_mode == "design" else None,
        instruct=nar_instruct,
        language=nar_lang,
        speed=float(nar_speed),
    )

    char_configs = {}
    for slot in range(8):
        char_name = names[slot]
        if not char_name:
            # Unused character row.
            continue
        char_mode = modes[slot]
        char_configs[char_name] = VoiceConfig(
            name=char_name,
            mode=char_mode,
            preset=presets[slot] if char_mode == "preset" else None,
            ref_audio=audios[slot] if char_mode == "clone" and audios[slot] else None,
            ref_text=ref_texts[slot] if char_mode == "clone" else None,
            design_desc=designs[slot] if char_mode == "design" else None,
            instruct=instructs[slot] or "",
            language=langs[slot],
            speed=float(speeds[slot]) if speeds[slot] else 1.0,
        )

    settings = {"temperature": gen_temp, "seed": int(gen_seed)}
    return save_project(text, nar_cfg, char_configs, settings)
458
+
459
+
460
def do_load_project(json_str):
    """Turn a saved project into UI updates for the story and voice-cast widgets.

    The returned list always has the same layout and length:

    * 1 value for the story text,
    * 8 updates for the narrator widgets (mode, preset, audio, ref text,
      design, instruct, language, speed),
    * 8 character rows x 11 updates each (row group, name, description, mode,
      preset, audio, ref text, design, instruct, language, speed),
    * 1 status message.

    On failure every widget slot is a no-op ``gr.update()`` so nothing in the
    UI is changed, and the status message carries the error. The failure path
    previously returned only 43 placeholder updates, which did not match the
    96 widget updates produced on success.
    """
    max_chars = 8
    narrator_fields = 8
    fields_per_char = 11

    try:
        data = load_project(json_str)
        nar = data["narrator"]
        chars = data.get("characters", {})

        # Narrator: mode-specific widgets are shown only for the active mode.
        nar_updates = [
            gr.update(value=nar.mode),
            gr.update(value=nar.preset if nar.preset else "Ryan", visible=nar.mode == "preset"),
            gr.update(value=nar.ref_audio, visible=nar.mode == "clone"),
            gr.update(value=nar.ref_text, visible=nar.mode == "clone"),
            gr.update(value=nar.design_desc, visible=nar.mode == "design"),
            gr.update(value=nar.instruct),
            gr.update(value=nar.language),
            gr.update(value=nar.speed),
        ]

        # Characters: fill the first rows from the project, hide the rest.
        loaded = list(chars.values())[:max_chars]
        char_updates = []
        for i in range(max_chars):
            if i < len(loaded):
                c = loaded[i]
                char_updates.extend([
                    gr.update(visible=True),
                    gr.update(value=c.name, visible=True),
                    gr.update(value="", visible=True),
                    gr.update(value=c.mode, visible=True),
                    gr.update(value=c.preset if c.preset else "Ryan", visible=c.mode == "preset"),
                    gr.update(value=c.ref_audio, visible=c.mode == "clone"),
                    gr.update(value=c.ref_text, visible=c.mode == "clone"),
                    gr.update(value=c.design_desc, visible=c.mode == "design"),
                    gr.update(value=c.instruct, visible=True),
                    gr.update(value=c.language, visible=True),
                    gr.update(value=c.speed, visible=True),
                ])
            else:
                char_updates.extend(
                    gr.update(visible=False) for _ in range(fields_per_char)
                )

        text_sample = data.get("text_sample", "")
        status = f"Project loaded! {len(chars)} characters configured."
        return [text_sample] + nar_updates + char_updates + [status]
    except Exception as e:
        import traceback
        traceback.print_exc()
        # One no-op per output slot (story text + narrator + all character
        # rows) keeps the output count identical to the success path and
        # leaves the current UI state, including the story text, untouched.
        untouched = 1 + narrator_fields + max_chars * fields_per_char
        return [gr.update() for _ in range(untouched)] + [f"Error loading project: {e}"]
516
+
517
+
518
+ # ---------------------------------------------------------------------------
519
+ # Quick Generate
520
+ # ---------------------------------------------------------------------------
521
+
522
@spaces.GPU(duration=180)
def quick_generate_gpu(text, narrator_preset, gen_temp, gen_seed=42, output_fmt=None):
    """One-click generation: a single preset narrator voice for the whole text.

    Args:
        text: Story text; must contain at least 50 non-padding characters.
        narrator_preset: Name of the preset speaker used for every segment.
        gen_temp: Sampling temperature forwarded to the pipeline.
        gen_seed: Seed forwarded to the pipeline (converted with ``int``).
        output_fmt: ``"mp3"``, ``"wav"`` or ``"zip"``.

    Returns:
        ``(path, status)`` on success or ``(None, error_message)`` on failure.

    Note:
        The Story-tab button wires only four inputs (text, preset,
        temperature, format), so the format arrives in the ``gen_seed``
        position and ``output_fmt`` is absent. That call shape is detected
        below and handled with the default seed; five-argument callers are
        unaffected.
        NOTE(review): the cleaner long-term fix is to add a seed input to
        the ``quick_btn.click`` wiring.
    """
    if output_fmt is None:
        if isinstance(gen_seed, str) and gen_seed in ("mp3", "wav", "zip"):
            # Four-argument call: the fourth value is really the format.
            gen_seed, output_fmt = 42, gen_seed
        else:
            output_fmt = "mp3"

    if not text or len(text.strip()) < 50:
        return None, "Error: Text too short."

    pipe = get_pipeline()
    nar_cfg = VoiceConfig(name="Narrator", mode="preset", preset=narrator_preset,
                          language="English", speed=1.0)

    def prog_cb(ratio: float, msg: str):
        # Progress is only logged; nothing is streamed back to the UI here.
        print(f"[{ratio*100:.0f}%] {msg}")

    try:
        output_path, seg_paths, seg_meta = pipe.generate(
            text=text,
            narrator_config=nar_cfg,
            character_configs={},
            progress_callback=prog_cb,
            temperature=gen_temp,
            seed=int(gen_seed),
        )

        extra_path = None
        if output_fmt == "wav":
            # Swap only the file extension; a plain str.replace would also
            # rewrite any ".mp3" that happens to appear in a directory name.
            extra_path = str(Path(output_path).with_suffix(".wav"))
            from backend import save_audiobook
            save_audiobook(seg_paths, extra_path, fmt="wav")
        elif output_fmt == "zip":
            extra_path = pipe.export_segments_zip(seg_paths)

        final_path = extra_path if extra_path else output_path
        return final_path, f"Quick audiobook ready! {len(seg_meta)} segments."
    except Exception as e:
        import traceback
        traceback.print_exc()
        return None, f"Error: {str(e)}"
559
+
560
+
561
  # ---------------------------------------------------------------------------
562
  # Build UI
563
  # ---------------------------------------------------------------------------
 
594
  """)
595
 
596
  with gr.Tabs():
597
+ # ==================== TAB 1: Story ====================
598
+ with gr.TabItem("📖 Story"):
599
  with gr.Row():
600
  with gr.Column(scale=2):
601
+ gr.Markdown("### Upload or Paste")
602
+ file_upload = gr.File(
603
+ label="Upload EPUB, PDF, TXT, or HTML",
604
+ file_types=[".txt", ".epub", ".pdf", ".html", ".htm"],
605
+ )
606
  story_input = gr.TextArea(
607
  label="Story Text",
608
  placeholder="Paste your book chapter, short story, or script here...",
609
+ lines=18,
610
  max_lines=40,
611
  )
612
+ sample_dropdown = gr.Dropdown(
613
+ label="Or try a sample story",
614
+ choices=list(SAMPLE_STORIES.keys()),
615
+ value=None,
616
+ )
617
+
618
  with gr.Column(scale=1):
619
+ gr.Markdown("### Stats")
620
+ with gr.Row():
621
+ stat_words = gr.Textbox(label="Words", value="0", interactive=False)
622
+ stat_dur = gr.Textbox(label="Est. Duration", value="0 sec", interactive=False)
623
+ gr.Markdown("---")
624
+ gr.Markdown("### Quick Generate")
625
+ quick_preset = gr.Dropdown(
626
+ choices=list(PRESET_SPEAKERS.keys()),
627
+ value="Ryan",
628
+ label="Narrator Voice",
629
+ )
630
+ quick_temp = gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.05, label="Temperature")
631
+ quick_fmt = gr.Dropdown(
632
+ choices=["mp3", "wav", "zip"],
633
+ value="mp3",
634
+ label="Output Format",
635
  )
636
+ quick_btn = gr.Button(" Quick Generate", variant="primary")
637
+ quick_audio = gr.Audio(label="Quick Audiobook", interactive=False)
638
+ quick_status = gr.Textbox(show_label=False, interactive=False)
639
  gr.Markdown("---")
640
+ gr.Markdown("**Quick Generate** uses a single narrator voice for the entire text — perfect for articles, essays, and simple stories.")
641
+
642
+ with gr.Row():
643
+ chapter_selector = gr.Dropdown(
644
+ label="Chapter / Section",
645
+ choices=["All"],
646
+ value="All",
647
+ interactive=True,
648
+ )
649
+ refresh_chapters_btn = gr.Button("🔄 Detect Chapters")
650
+
651
+ with gr.Row():
652
+ gr.Markdown("### Character Detection")
653
+ use_ai_check = gr.Checkbox(label="Use AI enhancement (slower, more accurate)", value=False)
654
+ extract_btn = gr.Button("🔍 Extract Characters", variant="primary")
655
 
656
  extract_status = gr.Textbox(label="Status", interactive=False)
657
 
658
+ # Wiring
659
+ file_upload.change(handle_upload, inputs=[file_upload], outputs=[story_input, extract_status])
660
+ sample_dropdown.change(load_sample, inputs=[sample_dropdown], outputs=[story_input])
661
+ story_input.change(update_stats, inputs=[story_input], outputs=[stat_words, stat_dur])
662
+ quick_btn.click(
663
+ quick_generate_gpu,
664
+ inputs=[story_input, quick_preset, quick_temp, quick_fmt],
665
+ outputs=[quick_audio, quick_status],
666
+ )
667
+
668
+ # Chapter detection
669
def refresh_chapters(text):
    """Rebuild the chapter dropdown from the current story text.

    The dropdown always offers "All" first and is reset to it; when *text* is
    non-empty, one entry per detected chapter follows, labelled with its
    1-based number and the first 60 characters of its title.
    """
    choices = ["All"]
    if text:
        for ch in get_pipeline().detect_chapters(text):
            choices.append(f"Ch{ch['idx']+1}: {ch['title'][:60]}")
    return gr.update(choices=choices, value="All")
676
+
677
+ refresh_chapters_btn.click(refresh_chapters, inputs=[story_input], outputs=[chapter_selector])
678
+
679
+ # ==================== TAB 2: Voice Cast ====================
680
  with gr.TabItem("🎭 Voice Cast"):
681
  with gr.Row():
682
  with gr.Column(scale=1):
683
  gr.Markdown("## Narrator")
684
  with gr.Column(elem_classes="ab-card"):
685
+ nar_mode = gr.Dropdown(choices=["preset", "clone", "design"], value="preset", label="Mode")
686
+ nar_preset = gr.Dropdown(choices=list(PRESET_SPEAKERS.keys()), value="Ryan", label="Preset Voice")
687
+ nar_audio = gr.Audio(label="Upload Voice Sample (3–10s)", type="filepath", visible=False)
688
+ nar_ref_text = gr.Textbox(label="Reference Transcript", placeholder="What does the sample say?", visible=False)
689
+ nar_design = gr.TextArea(label="Voice Description", placeholder="e.g. A warm, raspy baritone...", visible=False, lines=2)
690
+ nar_instruct = gr.Textbox(label="Style Instruction", placeholder="e.g. Calm, measured storytelling.")
691
+ nar_lang = gr.Dropdown(choices=["English", "Chinese", "Japanese", "Korean", "German", "French", "Spanish", "Italian", "Portuguese", "Russian"], value="English", label="Language")
692
+ nar_speed = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="Speed")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
693
  nar_preview_btn = gr.Button("🔊 Preview Narrator", variant="secondary")
694
  nar_preview_audio = gr.Audio(label="Preview", interactive=False)
695
  nar_preview_status = gr.Textbox(show_label=False, interactive=False)
696
 
697
+ nar_mode.change(on_mode_change, inputs=nar_mode, outputs=[nar_preset, nar_audio, nar_ref_text, nar_design])
 
 
 
 
698
  nar_preview_btn.click(
699
  preview_narrator_gpu,
700
+ inputs=[nar_mode, nar_preset, nar_audio, nar_ref_text, nar_design, nar_instruct, nar_lang, nar_speed],
701
  outputs=[nar_preview_audio, nar_preview_status],
702
  )
703
 
704
  with gr.Column(scale=2):
705
  gr.Markdown("## Character Voices")
706
+ gr.Markdown("Configure up to 8 characters. Each can use Preset, Clone, or Design mode.")
707
+
708
+ char_names, char_descs, char_modes, char_presets = [], [], [], []
709
+ char_audios, char_ref_texts, char_designs, char_instructs, char_langs, char_speeds = [], [], [], [], [], []
710
+ char_rows, char_preview_btns, char_preview_audios = [], [], []
 
 
 
 
 
 
 
711
 
712
  for i in range(8):
713
  visible_default = (i == 0)
714
  with gr.Group(visible=visible_default) as row:
715
  with gr.Row():
716
+ cn = gr.Textbox(label="Name", placeholder="e.g. Alice", visible=visible_default)
717
  cd = gr.Textbox(label="Description", placeholder="Personality note", visible=visible_default)
718
  cm = gr.Dropdown(label="Mode", choices=["preset", "clone", "design"], value="preset", visible=visible_default)
719
  cp = gr.Dropdown(label="Preset", choices=list(PRESET_SPEAKERS.keys()), value="Ryan", visible=visible_default)
 
723
  cdes = gr.TextArea(label="Voice Description", placeholder="e.g. A shrill, nervous teenager.", visible=False, lines=2)
724
  cinstr = gr.Textbox(label="Style Instruction", placeholder="e.g. Angry and loud.", visible=visible_default)
725
  cl = gr.Dropdown(label="Language", choices=["English", "Chinese", "Japanese", "Korean", "German", "French", "Spanish", "Italian", "Portuguese", "Russian"], value="English", visible=visible_default)
726
+ cspd = gr.Slider(label="Speed", minimum=0.5, maximum=2.0, value=1.0, step=0.1, visible=visible_default)
727
+ with gr.Row():
728
+ cpv_btn = gr.Button("🔊 Preview", variant="secondary", visible=visible_default)
729
+ cpv_audio = gr.Audio(label="Preview", interactive=False, visible=visible_default)
730
+
731
+ cm.change(on_mode_change, inputs=cm, outputs=[cp, ca, crt, cdes])
732
+ cpv_btn.click(
733
+ preview_char_voice_gpu,
734
+ inputs=[cn, cm, cp, ca, crt, cdes, cinstr, cl, cspd],
735
+ outputs=[cpv_audio, cpv_btn], # reuse button for status
736
  )
737
 
738
  char_rows.append(row)
 
745
  char_designs.append(cdes)
746
  char_instructs.append(cinstr)
747
  char_langs.append(cl)
748
+ char_speeds.append(cspd)
749
+ char_preview_btns.append(cpv_btn)
750
+ char_preview_audios.append(cpv_audio)
751
 
752
+ # ==================== TAB 3: Generate ====================
753
  with gr.TabItem("⚡ Generate"):
754
  with gr.Row():
755
  with gr.Column(scale=1):
756
  gr.Markdown("### Settings")
757
  gen_temp = gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.05, label="Temperature")
758
  gen_seed = gr.Number(value=42, precision=0, label="Seed (fix for consistency)")
759
+ output_fmt = gr.Dropdown(choices=["mp3", "wav", "zip"], value="mp3", label="Output Format")
760
+ gen_btn = gr.Button("▶️ Generate Full Audiobook", variant="primary", size="lg")
761
  gen_progress = gr.Textbox(label="Progress", interactive=False, value="Ready.")
762
 
763
  with gr.Column(scale=2):
764
  gr.Markdown("### Output")
765
  output_audio = gr.Audio(label="Generated Audiobook", type="filepath", interactive=False)
766
  output_status = gr.Textbox(label="Status", interactive=False)
767
+ segment_list = gr.HTML(label="Segments")
768
 
769
+ # ==================== TAB 4: Project ====================
770
+ with gr.TabItem("💾 Project"):
771
+ with gr.Row():
772
+ with gr.Column():
773
+ gr.Markdown("### Save Project")
774
+ save_btn = gr.Button("💾 Save Configuration", variant="primary")
775
+ project_json = gr.TextArea(label="Project JSON (copy this to save)", lines=10, interactive=True)
776
+ with gr.Column():
777
+ gr.Markdown("### Load Project")
778
+ load_json = gr.TextArea(label="Paste Project JSON here", lines=10, interactive=True)
779
+ load_btn = gr.Button("📂 Load Configuration", variant="secondary")
780
+ load_status = gr.Textbox(label="Status", interactive=False)
781
+
782
+ # ==================== TAB 5: About ====================
783
  with gr.TabItem("ℹ️ About"):
784
  gr.Markdown("""
785
  ## AudioBook Forge
786
 
787
+ **Model-agnostic, high-fidelity audiobook generator** powered by [Qwen3-TTS](https://github.com/QwenLM/Qwen3-TTS).
788
+
789
+ ### Features
790
+ - 📁 **File Upload** — Import EPUB, PDF, TXT, or HTML directly
791
+ - 📖 **Chapter Detection** — Auto-detects chapters/sections for selective generation
792
+ - 🎙️ **Character Voice Mapping** — Auto-extract characters and assign unique voices
793
+ - 🎭 **Three Voice Modes** — Preset (9 speakers), Clone (upload sample), Design (text description)
794
+ - **Quick Generate** — One-click audiobook with a single narrator voice
795
+ - 🎚️ **Speed Control** — Adjust playback speed per voice (0.5x–2.0x)
796
+ - 📦 **Multi-format Export** — MP3, WAV, or ZIP of individual segments
797
+ - 💾 **Save/Load Projects** — Export and restore your voice configurations
798
+ - 🌐 **10 Languages** — English, Chinese, Japanese, Korean, German, French, Spanish, Italian, Portuguese, Russian
799
+ - **ZeroGPU** — Runs on Hugging Face ZeroGPU (free compute)
800
+
801
+ ### Workflow
802
+ 1. **Upload or paste** your story text
803
+ 2. **Detect chapters** (optional) and select a range
804
+ 3. **Extract characters** or use Quick Generate for simple narration
805
+ 4. **Assign voices** to narrator and each character
806
+ 5. **Generate** and download your audiobook
807
 
808
  ### Tips for Best Quality
809
+ - Use clean, noise-free voice samples for cloning (3–10 seconds)
810
+ - Keep reference transcripts accurate
811
+ - Lower temperature (0.5–0.6) for stable narration; higher (0.8–0.9) for expressive dialogue
812
+ - Use a fixed seed to prevent voice drift across segments
813
+ - Use speed adjustment to fine-tune pacing per character
814
  """)
815
 
816
  # ---------- Extract wiring ----------
 
830
  gr.update(visible=False),
831
  gr.update(value=chars[i].get("voice_instruct", ""), visible=True),
832
  gr.update(value=chars[i].get("language", "English"), visible=True),
833
+ gr.update(value=chars[i].get("speed", 1.0), visible=True),
834
+ gr.update(visible=True),
835
+ gr.update(visible=True),
836
  ])
837
  else:
838
  updates.extend([
 
846
  gr.update(visible=False),
847
  gr.update(visible=False),
848
  gr.update(visible=False),
849
+ gr.update(visible=False),
850
+ gr.update(visible=False),
851
+ gr.update(visible=False),
852
  ])
853
  return [status] + updates
854
 
855
+ extract_outputs = [extract_status] + [
856
+ item for sublist in [
857
+ [char_rows[i], char_names[i], char_descs[i], char_modes[i], char_presets[i],
858
+ char_audios[i], char_ref_texts[i], char_designs[i], char_instructs[i], char_langs[i],
859
+ char_speeds[i], char_preview_btns[i], char_preview_audios[i]]
860
+ for i in range(8)
861
+ ] for item in sublist
862
+ ]
863
+ extract_btn.click(do_extract, inputs=[story_input, use_ai_check], outputs=extract_outputs)
 
 
864
 
865
  # ---------- Generate wiring ----------
866
  all_char_inputs = (
867
  char_names + char_descs + char_modes + char_presets +
868
+ char_audios + char_ref_texts + char_designs + char_instructs + char_langs + char_speeds
869
  )
870
 
871
def get_text_for_gen(story_text, chapter_sel):
    """Return the text to synthesize for the chosen chapter selection.

    Args:
        story_text: The full story text.
        chapter_sel: Dropdown value; "All" (or an empty value) selects the
            whole story, otherwise a label of the form "Ch<N>: <title>".

    Returns:
        The text of chapter N, or ``story_text`` unchanged when the whole
        story is selected or the label cannot be resolved to a chapter.
    """
    if not chapter_sel or chapter_sel == "All":
        return story_text
    try:
        # Labels are 1-based ("Ch3: ..."); chapter lookup is 0-based.
        idx = int(chapter_sel.split(":")[0].replace("Ch", "")) - 1
        if idx < 0:
            # "Ch0"/negative labels would otherwise turn into a negative index.
            return story_text
        return get_chapter_text(story_text, idx)
    except Exception:
        # Best-effort fallback to the full story. Deliberately not a bare
        # ``except`` so KeyboardInterrupt/SystemExit still propagate.
        return story_text
880
+
881
def wrapped_generate(story_text, chapter_sel, *args):
    """Generate audio for the selected chapter (or the whole story).

    Resolves the chapter selection to text first, then forwards that text
    and every remaining positional argument to generate_audiobook_gpu.
    """
    return generate_audiobook_gpu(get_text_for_gen(story_text, chapter_sel), *args)
884
+
885
+ gen_inputs = [
886
+ story_input, chapter_selector,
887
+ nar_mode, nar_preset, nar_audio, nar_ref_text, nar_design, nar_instruct, nar_lang, nar_speed,
888
+ gen_temp, gen_seed, output_fmt,
889
+ ] + all_char_inputs
890
+
891
  gen_btn.click(
892
+ wrapped_generate,
893
+ inputs=gen_inputs,
894
+ outputs=[output_audio, segment_list, output_status, gen_progress],
895
+ )
896
+
897
+ # ---------- Project wiring ----------
898
+ save_btn.click(
899
+ do_save_project,
900
  inputs=[
901
  story_input,
902
+ nar_mode, nar_preset, nar_audio, nar_ref_text, nar_design, nar_instruct, nar_lang, nar_speed,
903
+ ] + all_char_inputs + [gen_temp, gen_seed],
904
+ outputs=[project_json],
905
+ )
906
+
907
+ load_btn.click(
908
+ do_load_project,
909
+ inputs=[load_json],
910
+ outputs=[story_input, nar_mode, nar_preset, nar_audio, nar_ref_text, nar_design, nar_instruct, nar_lang, nar_speed] + extract_outputs[1:] + [load_status],
911
  )
912
 
913
  return demo
backend.py CHANGED
@@ -1,7 +1,8 @@
1
  """
2
  AudioBook Forge - Backend
3
  Model-agnostic TTS engine with Qwen3-TTS support.
4
- Character extraction, dialogue parsing, and audio stitching.
 
5
  """
6
 
7
  import os
@@ -9,10 +10,12 @@ import re
9
  import json
10
  import hashlib
11
  import tempfile
 
12
  from pathlib import Path
13
  from typing import List, Dict, Optional, Tuple, Any
14
- from dataclasses import dataclass, field
15
  from collections import defaultdict
 
16
  import warnings
17
 
18
  import numpy as np
@@ -39,6 +42,75 @@ PRESET_SPEAKERS = {
39
  MAX_CHUNK_CHARS = 380
40
  MIN_CHUNK_CHARS = 80
41
  CROSSFADE_MS = 80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
  # ---------------------------------------------------------------------------
44
  # Data Classes
@@ -47,21 +119,30 @@ CROSSFADE_MS = 80
47
  @dataclass
48
  class VoiceConfig:
49
  name: str = "Narrator"
50
- mode: str = "preset" # preset | clone | design
51
- preset: Optional[str] = None # e.g., "Ryan"
52
  ref_audio: Optional[str] = None
53
  ref_text: Optional[str] = None
54
  design_desc: Optional[str] = None
55
- instruct: str = "" # style instruction
56
  language: str = "English"
 
 
 
 
 
 
 
 
57
 
58
 
59
  @dataclass
60
  class TextSegment:
61
  text: str
62
- seg_type: str # narration | dialogue
63
  speaker: Optional[str] = None
64
  emotion_hint: Optional[str] = None
 
65
 
66
 
67
  @dataclass
@@ -72,16 +153,254 @@ class CharacterProfile:
72
  occurrences: int = 0
73
 
74
 
 
 
 
 
 
 
 
 
75
  # ---------------------------------------------------------------------------
76
- # TTS Engine (Model-Agnostic Wrapper)
77
  # ---------------------------------------------------------------------------
78
 
79
- class TTSEngine:
80
- """
81
- Model-agnostic TTS engine.
82
- Currently backed by Qwen3-TTS. Swappable architecture.
83
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
 
 
85
  def __init__(self, device: str = "cuda"):
86
  self.device = device
87
  self._custom_voice_model = None
@@ -150,7 +469,10 @@ class TTSEngine:
150
  return self._design_model
151
 
152
  def _cache_key(self, text: str, voice: VoiceConfig) -> str:
153
- payload = f"{text}|{voice.mode}|{voice.preset}|{voice.ref_audio}|{voice.design_desc}|{voice.instruct}|{voice.language}"
 
 
 
154
  return hashlib.md5(payload.encode()).hexdigest()
155
 
156
  def _cached_path(self, key: str) -> Path:
@@ -163,7 +485,6 @@ class TTSEngine:
163
  temperature: float = 0.7,
164
  seed: int = 42,
165
  ) -> Tuple[np.ndarray, int]:
166
- """Generate audio for a text chunk. Returns (audio_array, sample_rate)."""
167
  cache_key = self._cache_key(text, voice)
168
  cache_path = self._cached_path(cache_key)
169
  if cache_path.exists():
@@ -205,15 +526,43 @@ class TTSEngine:
205
  else:
206
  raise ValueError(f"Unknown voice mode: {voice.mode}")
207
 
208
- # Handle stereo or list returns
209
  if isinstance(wavs, list):
210
  wavs = wavs[0]
211
  if wavs.ndim > 1:
212
  wavs = wavs.mean(axis=1)
213
 
 
 
 
 
214
  sf.write(str(cache_path), wavs, sr)
215
  return wavs, sr
216
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
217
  def status(self) -> Dict[str, Any]:
218
  return {
219
  "custom_loaded": self._custom_voice_model is not None,
@@ -222,144 +571,16 @@ class TTSEngine:
222
  }
223
 
224
 
225
- # ---------------------------------------------------------------------------
226
- # Text Processing
227
- # ---------------------------------------------------------------------------
228
-
229
class TextProcessor:
    """Extract characters, parse dialogue, and chunk text for synthesis.

    All methods are static; the class only namespaces the shared patterns.
    """

    # A double-quoted span of 3-500 characters at the start of the text or
    # after sentence-ending punctuation / a newline followed by whitespace.
    DIALOGUE_RE = re.compile(
        r'(?:^|[.!?\n]\s+)\s*"([^"]{3,500})"'
    )
    # Script-style line: Name, then ":", "-" or "–", then a quoted line.
    SPEAKER_RE = re.compile(
        r'(?:^|\n)\s*([A-Z][a-zA-Z\s]{1,20})(?:\s*[:\-–])\s*"([^"]+)"'
    )
    # Any capitalized word of 2-16 letters (candidate proper noun).
    NAME_RE = re.compile(
        r'\b([A-Z][a-z]{1,15})\b'
    )
    # "<Name> said/cried/..." attribution, searched in the text before a quote.
    _SAID_RE = re.compile(
        r'([A-Z][a-z]{1,15})\s+(?:said|cried|shouted|whispered|replied|asked)'
    )
    # Capitalized common words that must not be mistaken for character names.
    _STOP_WORDS = frozenset({
        "the", "and", "but", "for", "are", "was", "were", "had", "have",
        "has", "his", "her", "she", "him", "they", "them", "said", "with",
        "from", "that", "this", "what", "when", "where", "would", "could",
        "should",
    })

    @staticmethod
    def extract_characters(text: str, use_ai: bool = False) -> List[CharacterProfile]:
        """Extract character names and occurrence counts from text.

        Three heuristics are applied in order: script-style ``Name: "..."``
        lines, quotes preceded (within 120 characters) by ``Name said`` and
        similar verbs, and finally capitalized words that appear at least
        three times. ``use_ai`` is accepted for interface compatibility and
        is not used by this method.

        Returns:
            At most 12 profiles, most frequently seen first.
        """
        from collections import Counter

        profiles: Dict[str, CharacterProfile] = {}

        def bump(name: str) -> None:
            # Create the profile on first sight, then count the occurrence.
            if name not in profiles:
                profiles[name] = CharacterProfile(name=name)
            profiles[name].occurrences += 1

        # Pattern: Name: "dialogue"
        for match in TextProcessor.SPEAKER_RE.finditer(text):
            name = match.group(1).strip()
            if len(name) > 2:
                bump(name)

        # Pattern: quoted dialogue near "he said / she said"
        for match in TextProcessor.DIALOGUE_RE.finditer(text):
            before = text[max(0, match.start() - 120):match.start()]
            said_match = TextProcessor._SAID_RE.search(before)
            if said_match:
                bump(said_match.group(1))

        # Fallback: capitalized names appearing frequently
        common = Counter(TextProcessor.NAME_RE.findall(text)).most_common(30)
        for name, count in common:
            if count < 3 or len(name) <= 2 or name in profiles:
                continue
            if name.lower() in TextProcessor._STOP_WORDS:
                continue
            profiles[name] = CharacterProfile(name=name, occurrences=count)

        ranked = sorted(profiles.values(), key=lambda p: p.occurrences, reverse=True)
        return ranked[:12]  # Cap at 12 characters

    @staticmethod
    def segment_text(text: str, characters: List[str]) -> List[TextSegment]:
        """Split text into narration and dialogue segments.

        Paragraphs are separated by blank lines. A paragraph that starts with
        ``Name: "..."`` yields a dialogue segment attributed to ``Name`` plus
        any trailing narration. Otherwise quoted spans of 3-500 characters
        become dialogue with ``speaker=None`` and the rest becomes narration.
        Adjacent narration segments are merged.

        ``characters`` is currently unused: inline quotes are not attributed
        to a speaker.
        """
        segments = []
        # Normalize newlines
        text = text.replace("\r\n", "\n").replace("\r", "\n")

        for para in (p.strip() for p in re.split(r'\n\s*\n', text)):
            if not para:
                continue

            # Script-style paragraph: Character: "dialogue"
            speaker_match = re.match(r'^([A-Z][a-zA-Z\s]{1,20})[:\-–]\s*"([^"]+)"', para)
            if speaker_match:
                segments.append(TextSegment(
                    text=speaker_match.group(2),
                    seg_type="dialogue",
                    speaker=speaker_match.group(1).strip(),
                ))
                # Remainder of paragraph as narration
                remainder = para[speaker_match.end():].strip()
                if remainder:
                    segments.append(TextSegment(text=remainder, seg_type="narration"))
                continue

            # re.split with one capture group alternates outside/inside quotes,
            # so odd indices are the quoted spans.
            for i, part in enumerate(re.split(r'"([^"]{3,500})"', para)):
                part = part.strip()
                if not part:
                    continue
                if i % 2 == 1:
                    segments.append(TextSegment(text=part, seg_type="dialogue", speaker=None))
                else:
                    segments.append(TextSegment(text=part, seg_type="narration"))

        # Merge adjacent narration segments
        merged = []
        for seg in segments:
            if merged and seg.seg_type == "narration" and merged[-1].seg_type == "narration":
                merged[-1].text += " " + seg.text
            else:
                merged.append(seg)
        return merged

    @staticmethod
    def _split_to_fit(text: str, max_chars: int) -> List[str]:
        """Greedily pack sentences of ``text`` into chunks of <= max_chars.

        A sentence that is itself longer than ``max_chars`` is broken at
        whitespace so it cannot produce an over-length chunk. A single word
        longer than ``max_chars`` is kept whole.
        """
        units = []
        for sentence in re.split(r'(?<=[.!?])\s+', text):
            if len(sentence) <= max_chars:
                units.append(sentence)
            else:
                units.extend(sentence.split())

        chunks = []
        current = ""
        for unit in units:
            if not unit:
                continue
            candidate = f"{current} {unit}" if current else unit
            if len(candidate) <= max_chars:
                current = candidate
                continue
            if current:
                chunks.append(current.strip())
            current = unit
        if current:
            chunks.append(current.strip())
        return chunks

    @staticmethod
    def chunk_segments(segments: List[TextSegment], max_chars: int = MAX_CHUNK_CHARS) -> List[TextSegment]:
        """Break long segments into chunks of at most ``max_chars`` characters.

        Segments that already fit are passed through untouched. Longer ones
        are split at sentence boundaries (falling back to word boundaries for
        over-long sentences); every chunk keeps the parent segment's type,
        speaker and emotion hint.
        """
        result = []
        for seg in segments:
            if len(seg.text) <= max_chars:
                result.append(seg)
                continue
            for piece in TextProcessor._split_to_fit(seg.text, max_chars):
                result.append(TextSegment(
                    text=piece,
                    seg_type=seg.seg_type,
                    speaker=seg.speaker,
                    emotion_hint=seg.emotion_hint,
                ))
        return result
349
-
350
-
351
  # ---------------------------------------------------------------------------
352
  # Audio Utils
353
  # ---------------------------------------------------------------------------
354
 
355
  def stitch_audio(paths: List[str], crossfade_ms: int = CROSSFADE_MS) -> AudioSegment:
356
- """Concatenate WAV files with crossfade."""
357
  if not paths:
358
  return AudioSegment.silent(duration=0)
359
  combined = AudioSegment.from_wav(paths[0])
360
  for p in paths[1:]:
361
  next_seg = AudioSegment.from_wav(p)
362
- # Simple overlap crossfade
363
  if crossfade_ms > 0 and len(combined) > crossfade_ms and len(next_seg) > crossfade_ms:
364
  combined = combined.append(next_seg, crossfade=crossfade_ms)
365
  else:
@@ -368,41 +589,91 @@ def stitch_audio(paths: List[str], crossfade_ms: int = CROSSFADE_MS) -> AudioSeg
368
 
369
 
370
def normalize_audio(audio: AudioSegment, target_dBFS: float = -1.5) -> AudioSegment:
    """Peak-normalize ``audio`` so its loudest sample sits at ``target_dBFS``.

    Args:
        audio: The clip to normalize.
        target_dBFS: Desired peak level in dBFS.

    Returns:
        A gain-adjusted clip, or ``audio`` unchanged when it is silent or
        empty.
    """
    peak = audio.max_dBFS
    if peak == float("-inf"):
        # pydub reports -inf for silent/empty audio; the gain would be +inf,
        # so there is nothing meaningful to scale.
        return audio
    return audio.apply_gain(target_dBFS - peak)
374
 
375
 
376
- def save_audiobook(segments_paths: List[str], output_path: str, title: str = "Audiobook") -> str:
377
- """Stitch segments and export final audiobook."""
378
  if not segments_paths:
379
  return ""
380
  combined = stitch_audio(segments_paths)
381
  combined = normalize_audio(combined)
382
- combined.export(output_path, format="mp3", bitrate="192k", tags={"title": title, "artist": "AudioBook Forge"})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
383
  return output_path
384
 
385
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
386
  # ---------------------------------------------------------------------------
387
- # Optional: AI Character Extraction via HF Inference
388
  # ---------------------------------------------------------------------------
389
 
390
  def ai_extract_characters(text: str, api_token: Optional[str] = None) -> List[CharacterProfile]:
391
- """Use a small HF model to extract characters with descriptions."""
392
  try:
393
  from huggingface_hub import InferenceClient
394
  client = InferenceClient(token=api_token or os.getenv("HF_TOKEN"))
395
-
396
- # Truncate text for context window
397
  sample = text[:4000] + ("\n...[truncated]" if len(text) > 4000 else "")
398
-
399
  prompt = (
400
  "Extract all named characters from the following story excerpt. "
401
  "For each character, provide their name and a brief description of their personality/role. "
402
  "Return ONLY a JSON array like: [{\"name\":\"Alice\",\"description\":\"Curious young girl\"},...]\n\n"
403
  f"STORY:\n{sample}\n\nJSON:"
404
  )
405
-
406
  response = client.text_generation(
407
  model="Qwen/Qwen3-1.7B",
408
  prompt=prompt,
@@ -410,8 +681,6 @@ def ai_extract_characters(text: str, api_token: Optional[str] = None) -> List[Ch
410
  temperature=0.3,
411
  return_full_text=False,
412
  )
413
-
414
- # Extract JSON from response
415
  json_match = re.search(r'\[.*?\]', response, re.DOTALL)
416
  if json_match:
417
  data = json.loads(json_match.group())
@@ -438,6 +707,22 @@ class AudiobookPipeline:
438
  self.temp_dir = Path(tempfile.gettempdir()) / "audiobook_segments"
439
  self.temp_dir.mkdir(exist_ok=True)
440
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
441
  def extract_characters(self, text: str, use_ai: bool = False) -> List[Dict]:
442
  if use_ai:
443
  profiles = ai_extract_characters(text)
@@ -453,10 +738,21 @@ class AudiobookPipeline:
453
  "voice_mode": "preset",
454
  "voice_preset": "Ryan",
455
  "voice_instruct": "",
 
 
456
  }
457
  for p in profiles
458
  ]
459
 
 
 
 
 
 
 
 
 
 
460
  def generate(
461
  self,
462
  text: str,
@@ -465,22 +761,22 @@ class AudiobookPipeline:
465
  progress_callback=None,
466
  temperature: float = 0.7,
467
  seed: int = 42,
468
- ) -> Tuple[str, List[str]]:
469
  """
470
  Generate audiobook.
471
- Returns (final_mp3_path, list_of_segment_wav_paths).
472
  """
473
  segments = self.processor.segment_text(text, list(character_configs.keys()))
474
  segments = self.processor.chunk_segments(segments)
475
 
476
  segment_paths = []
 
477
  total = len(segments)
478
 
479
  for i, seg in enumerate(segments):
480
  if progress_callback:
481
- progress_callback(i / total, f"Generating segment {i+1}/{total} ({seg.seg_type})...")
482
 
483
- # Determine voice
484
  if seg.seg_type == "dialogue" and seg.speaker and seg.speaker in character_configs:
485
  voice = character_configs[seg.speaker]
486
  else:
@@ -491,20 +787,38 @@ class AudiobookPipeline:
491
  seg_path = self.temp_dir / f"seg_{i:04d}_{voice.name}.wav"
492
  sf.write(str(seg_path), wav, sr)
493
  segment_paths.append(str(seg_path))
 
 
 
 
 
 
 
494
  except Exception as e:
495
  print(f"[Pipeline] Segment {i} failed: {e}")
496
- # Insert silence to maintain timing
497
  silent = AudioSegment.silent(duration=500)
498
  seg_path = self.temp_dir / f"seg_{i:04d}_silent.wav"
499
  silent.export(str(seg_path), format="wav")
500
  segment_paths.append(str(seg_path))
 
 
 
 
 
 
 
 
501
 
502
  if progress_callback:
503
- progress_callback(1.0, "Stitching final audiobook...")
504
 
505
  output_path = str(self.temp_dir / "audiobook_final.mp3")
506
  save_audiobook(segment_paths, output_path, title="Generated Audiobook")
507
- return output_path, segment_paths
 
 
 
 
508
 
509
  def preview_voice(
510
  self,
 
1
  """
2
  AudioBook Forge - Backend
3
  Model-agnostic TTS engine with Qwen3-TTS support.
4
+ Character extraction, dialogue parsing, audio stitching, file import,
5
+ chapter detection, segment preview, and multi-format export.
6
  """
7
 
8
  import os
 
10
  import json
11
  import hashlib
12
  import tempfile
13
+ import zipfile
14
  from pathlib import Path
15
  from typing import List, Dict, Optional, Tuple, Any
16
+ from dataclasses import dataclass, field, asdict
17
  from collections import defaultdict
18
+ from html.parser import HTMLParser
19
  import warnings
20
 
21
  import numpy as np
 
42
  MAX_CHUNK_CHARS = 380
43
  MIN_CHUNK_CHARS = 80
44
  CROSSFADE_MS = 80
45
+ WORDS_PER_MINUTE = 150
46
+
47
+ SAMPLE_STORIES = {
48
+ "The Velveteen Rabbit (excerpt)": """There was once a velveteen rabbit, and in the beginning he was really splendid. He was fat and bunchy, as a rabbit should be; his coat was spotted brown and white, he had real thread whiskers, and his ears were lined with pink sateen.
49
+
50
+ On Christmas morning, when he sat wedged in the top of the Boy's stocking, with a sprig of holly between his paws, the effect was charming.
51
+
52
+ There were other things in the stocking, nuts and oranges and a toy engine, and chocolate almonds and a clockwork mouse, but the Rabbit was quite the best of all. For at least two hours the Boy loved him, and then Aunts and Uncles came to dinner, and there was a great rustling of tissue paper and unwrapping of parcels, and in the excitement of looking at all the new presents the Velveteen Rabbit was forgotten.
53
+
54
+ For a long time he lived in the toy cupboard or on the nursery floor, and no one thought very much about him. He was naturally shy, and being only made of velveteen, some of the more expensive toys quite snubbed him. The mechanical toys were very superior, and looked down upon every one else; they were full of modern ideas, and pretended they were real.
55
+
56
+ The Rabbit could not claim to be a model of anything, for he didn't know that real rabbits existed; he thought they were all stuffed with sawdust like himself, and he understood that sawdust was quite out-of-date and should never be mentioned in modern circles.
57
+
58
+ Even Timothy, the jointed wooden lion, who was made by the disabled soldiers, and should have had broader views, put on airs and pretended he was connected with Government. Between them all the poor little Rabbit was made to feel himself very insignificant and commonplace, and the only person who was kind to him at all was the Skin Horse.
59
+
60
+ The Skin Horse had lived longer in the nursery than any of the others. He was so old that his brown coat was bald in patches and showed the seams underneath, and most of the hairs in his tail had been pulled out to string bead necklaces.
61
+
62
+ He was wise, for he had seen a long succession of mechanical toys arrive to boast and swagger, and by-and-by break their mainsprings and pass away, and he knew that they were only toys, and would never turn into anything else. For nursery magic is very strange and wonderful, and only those playthings that are old and wise and experienced like the Skin Horse understand all about it.""",
63
+
64
+ "A Study in Scarlet (excerpt)": """In the year 1878 I took my degree of Doctor of Medicine of the University of London, and proceeded to Netley to go through the course prescribed for surgeons in the army. Having completed my studies there, I was duly attached to the Fifth Northumberland Fusiliers as Assistant Surgeon.
65
+
66
+ The regiment was stationed in India at the time, and before I could join it, the second Afghan war had broken out. On landing at Bombay, I learned that my corps had advanced through the passes, and was already deep in the enemy's country.
67
+
68
+ I followed, however, with many other officers who were in the same situation as myself, and succeeded in reaching Candahar in safety, where I found my regiment, and at once entered upon my new duties.
69
+
70
+ The campaign brought honours and promotion to many, but for me it had nothing but misfortune and disaster. I was removed from my brigade and attached to the Berkshires, with whom I served at the fatal battle of Maiwand.
71
+
72
+ There I was struck on the shoulder by a Jezail bullet, which shattered the bone and grazed the subclavian artery. I should have fallen into the hands of the murderous Ghazis had it not been for the devotion and courage shown by Murray, my orderly, who threw me across a pack-horse, and succeeded in bringing me safely to the British lines.
73
+
74
+ Worn with pain, and weak from the prolonged hardships which I had undergone, I was removed, with a great train of wounded sufferers, to the base hospital at Peshawar. Here I rallied, and had already improved so far as to be able to walk about the wards, and even to bask a little upon the verandah, when I was struck down by enteric fever, that curse of our Indian possessions.
75
+
76
+ For months my life was despaired of, and when at last I came to myself and became convalescent, I was so weak and emaciated that a medical board determined that not a day should be lost in sending me back to England. I was dispatched, accordingly, in the troopship Orontes, and landed a month later on Portsmouth jetty, with my health irretrievably ruined, but with permission from a paternal government to spend the next nine months in attempting to improve it.""",
77
+
78
+ "Pride and Prejudice (excerpt)": """It is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife.
79
+
80
+ However little known the feelings or views of such a man may be on his first entering a neighbourhood, this truth is so well fixed in the minds of the surrounding families, that he is considered the rightful property of some one or other of their daughters.
81
+
82
+ \"My dear Mr. Bennet,\" said his lady to him one day, \"have you heard that Netherfield Park is let at last?\"
83
+
84
+ Mr. Bennet replied that he had not.
85
+
86
+ \"But it is,\" returned she; \"for Mrs. Long has just been here, and she told me all about it.\"
87
+
88
+ Mr. Bennet made no answer.
89
+
90
+ \"Do you not want to know who has taken it?\" cried his wife impatiently.
91
+
92
+ \"You want to tell me, and I have no objection to hearing it.\"
93
+
94
+ This was invitation enough.
95
+
96
+ \"Why, my dear, you must know, Mrs. Long says that Netherfield is taken by a young man of large fortune from the north of England; that he came down on Monday in a chaise and four to see the place, and was so much delighted with it, that he agreed with Mr. Morris immediately; that he is to take possession before Michaelmas, and some of his servants are to be in the house by the end of next week.\"
97
+
98
+ \"What is his name?\"
99
+
100
+ \"Bingley.\"
101
+
102
+ \"Is he married or single?\"
103
+
104
+ \"Oh! Single, my dear, to be sure! A single man of large fortune; four or five thousand a year. What a fine thing for our girls!\"
105
+
106
+ \"How so? How can it affect them?\"
107
+
108
+ \"My dear Mr. Bennet,\" replied his wife, \"how can you be so tiresome! You must know that I am thinking of his marrying one of them.\"
109
+
110
+ \"Is that his design in settling here?\"
111
+
112
+ \"Design! Nonsense, how can you talk so! But it is very likely that he may fall in love with one of them, and therefore you must visit him as soon as he comes.\"""",
113
+ }
114
 
115
  # ---------------------------------------------------------------------------
116
  # Data Classes
 
119
@dataclass
class VoiceConfig:
    """Voice settings for one speaker (the narrator or a character).

    Instances are plain data: they can be turned into a dict with
    ``to_dict`` and rebuilt with ``from_dict``.
    """

    # Display name of the speaker this voice belongs to.
    name: str = "Narrator"
    # Voice mode selector; "preset" is the default.
    mode: str = "preset"
    # Preset speaker name — presumably only relevant when mode == "preset".
    preset: Optional[str] = None
    # Reference audio path and its transcript — presumably for voice cloning;
    # TODO confirm against TTSEngine.synthesize.
    ref_audio: Optional[str] = None
    ref_text: Optional[str] = None
    # Free-text voice description — presumably for a "design" mode; TODO confirm.
    design_desc: Optional[str] = None
    # Extra instruction text for the TTS model.
    instruct: str = ""
    language: str = "English"
    speed: float = 1.0  # 0.5 to 2.0

    def to_dict(self) -> dict:
        """Return every field as a plain ``{field_name: value}`` dict."""
        return asdict(self)

    @classmethod
    def from_dict(cls, d: dict) -> "VoiceConfig":
        """Build a config from ``d``, silently ignoring unknown keys.

        Keys missing from ``d`` keep their dataclass defaults.
        """
        known_fields = cls.__dataclass_fields__
        accepted = {}
        for key, value in d.items():
            if key in known_fields:
                accepted[key] = value
        return cls(**accepted)
137
 
138
 
139
@dataclass
class TextSegment:
    """A piece of story text that is synthesized as one unit with one voice."""

    # The words to speak. For dialogue produced by TextProcessor.segment_text
    # the surrounding double quotes have already been removed.
    text: str
    # "narration" or "dialogue" — the two values TextProcessor.segment_text assigns.
    seg_type: str = "narration"
    # Speaker label for dialogue; None when no speaker was identified.
    speaker: Optional[str] = None
    # Not set by any code visible in this section — presumably a free-form
    # delivery hint for the TTS engine; TODO confirm.
    emotion_hint: Optional[str] = None
    # Presumably the index of the Chapter this segment came from; TODO confirm.
    chapter_idx: int = 0
146
 
147
 
148
  @dataclass
 
153
  occurrences: int = 0
154
 
155
 
156
@dataclass
class Chapter:
    """One chapter as produced by TextProcessor.detect_chapters."""

    # Zero-based position of the chapter among the detected chapters.
    idx: int
    # The matched heading line, stripped ("Full Text" when no chapters were found).
    title: str
    # Chapter body, starting at its heading and ending before the next heading.
    text: str
    # Number of whitespace-separated tokens in ``text``.
    word_count: int = 0
162
+
163
+
164
  # ---------------------------------------------------------------------------
165
+ # File Importers
166
  # ---------------------------------------------------------------------------
167
 
168
class EPUBTextExtractor(HTMLParser):
    """Collect the readable text of an HTML/XHTML document.

    Text inside ``<script>``, ``<style>`` and ``<head>`` (which includes
    ``<title>``) is skipped so that only body content is returned. Block-level
    tags and ``<br>`` are turned into newlines so paragraphs stay separated.

    Usage: ``feed()`` the markup, then call ``get_text()``.
    """

    # Elements whose text content must never reach the output.
    _SKIPPED_TAGS = ("script", "style", "head")
    # Elements that start and end on their own line.
    _BLOCK_TAGS = ("p", "div", "h1", "h2", "h3", "h4")

    def __init__(self):
        super().__init__()
        self.text_parts = []
        # True while the parser is inside any skipped element.
        self.in_script = False
        # True once a <body> start tag has been seen.
        self.in_body = False
        # Nesting depth of skipped elements (e.g. <style> inside <head>).
        self._skip_depth = 0

    def handle_starttag(self, tag, attrs):
        """Track skipped regions and emit a newline before block elements."""
        if tag in self._SKIPPED_TAGS:
            self._skip_depth += 1
        if tag == "body":
            self.in_body = True
            # HTML allows </head> to be omitted; a <body> tag always ends
            # the skipped region so the story text is not swallowed.
            self._skip_depth = 0
        self.in_script = self._skip_depth > 0
        if tag in self._BLOCK_TAGS or tag == "br":
            self.text_parts.append("\n")

    def handle_endtag(self, tag):
        """Leave skipped regions and emit a newline after block elements."""
        if tag in self._SKIPPED_TAGS and self._skip_depth > 0:
            self._skip_depth -= 1
            self.in_script = self._skip_depth > 0
        if tag in self._BLOCK_TAGS:
            self.text_parts.append("\n")

    def handle_data(self, data):
        """Keep text that is not inside a skipped element."""
        if not self.in_script:
            self.text_parts.append(data)

    def get_text(self) -> str:
        """Return the collected text with whitespace normalised.

        Runs of three or more line breaks collapse to one blank line, runs of
        spaces/tabs collapse to one space, and the result is stripped.
        """
        text = "".join(self.text_parts)
        text = re.sub(r"\n\s*\n\s*\n+", "\n\n", text)
        text = re.sub(r"[ \t]+", " ", text)
        return text.strip()
198
+
199
+
200
def parse_file(filepath: str) -> Tuple[str, str]:
    """Parse an uploaded file and return ``(text, filename)``.

    Supported types are chosen by file extension (case-insensitive):
    ``.txt``, ``.epub``, ``.pdf``, ``.html`` and ``.htm``.

    Args:
        filepath: Path of the file to read.

    Returns:
        The extracted text and the file's base name.

    Raises:
        ValueError: If the extension is unsupported, or if PDF parsing fails
            for any reason (the original error is chained as ``__cause__``).
    """
    path = Path(filepath)
    suffix = path.suffix.lower()

    if suffix == ".txt":
        # Undecodable bytes are dropped rather than failing the upload.
        with open(path, "r", encoding="utf-8", errors="ignore") as f:
            return f.read(), path.name

    if suffix == ".epub":
        texts = []
        with zipfile.ZipFile(path, "r") as z:
            # NOTE(review): members are read in archive order, which is not
            # guaranteed to be the book's reading order (the OPF spine is not
            # consulted) — confirm this is acceptable.
            for name in z.namelist():
                if not name.endswith((".html", ".htm", ".xhtml", ".xml")):
                    continue
                with z.open(name) as f:
                    content = f.read().decode("utf-8", errors="ignore")
                parser = EPUBTextExtractor()
                parser.feed(content)
                # close() flushes text the parser may still be buffering.
                parser.close()
                extracted = parser.get_text()
                # Members with no text would only add empty gaps.
                if extracted:
                    texts.append(extracted)
        return "\n\n".join(texts), path.name

    if suffix == ".pdf":
        try:
            from PyPDF2 import PdfReader
            reader = PdfReader(str(path))
            texts = []
            for page in reader.pages:
                t = page.extract_text()
                if t:
                    texts.append(t)
            return "\n\n".join(texts), path.name
        except Exception as e:
            # Any failure (missing PyPDF2 included) becomes one error type
            # for callers; chain the cause so the real traceback is kept.
            raise ValueError(f"PDF parsing failed: {e}") from e

    if suffix in (".html", ".htm"):
        with open(path, "r", encoding="utf-8", errors="ignore") as f:
            content = f.read()
        parser = EPUBTextExtractor()
        parser.feed(content)
        parser.close()
        return parser.get_text(), path.name

    raise ValueError(f"Unsupported file type: {suffix}")
243
+
244
+
245
+ # ---------------------------------------------------------------------------
246
+ # Text Processing
247
+ # ---------------------------------------------------------------------------
248
+
249
class TextProcessor:
    """Stateless text helpers: cleanup, chapter detection, character
    discovery, and splitting a story into narration/dialogue segments.

    All methods are static; the class only groups them with their regexes.
    """

    DIALOGUE_RE = re.compile(r'(?:^|[.!?\n]\s+)\s*"([^"]{3,500})"')
    SPEAKER_RE = re.compile(r'(?:^|\n)\s*([A-Z][a-zA-Z\s]{1,20})(?:\s*[:\-–])\s*"([^"]+)"')
    NAME_RE = re.compile(r'\b([A-Z][a-z]{1,15})\b')
    CHAPTER_RE = re.compile(
        r'^(?:\s*(?:Chapter|CHAPTER|Part|PART|Book|BOOK|Section|SECTION)\s*(?:[IVX\d]+|[A-Z]).*)$',
        re.MULTILINE,
    )
    # NOTE(review): the bare-number alternative (`\d+\s+`) needs whitespace
    # after the digits before end-of-line, so a page number immediately
    # followed by a text line is not matched — confirm whether intended.
    HEADER_RE = re.compile(
        r'^(?:\s*\d+\s+|\s*Page\s*\d+.*|\s*www\.\S+.*|\s*Copyright.*|\s*All rights reserved.*)$',
        re.MULTILINE | re.IGNORECASE,
    )

    # "<Name> said/cried/..." attribution, searched in the text before a quote.
    _ATTRIBUTION_RE = re.compile(
        r'([A-Z][a-z]{1,15})\s+(?:said|cried|shouted|whispered|replied|asked|answered|called|exclaimed)'
    )
    # "<Speaker>: "quote"" at the start of a paragraph (script-style dialogue).
    _LABELLED_LINE_RE = re.compile(r'^([A-Z][a-zA-Z\s]{1,20})[:\-–]\s*"([^"]+)"')
    # Capitalised words that are common sentence openers, not character names.
    _NON_NAME_WORDS = frozenset({
        "the", "and", "but", "for", "are", "was", "were", "had", "have", "has",
        "his", "her", "she", "him", "they", "them", "said", "with", "from",
        "that", "this", "what", "when", "where", "would", "could", "should",
        "not", "you", "all", "any", "can", "one", "our",
        "out", "day", "get", "how", "its", "may", "new",
        "now", "old", "see", "two", "who", "boy", "man", "way", "too", "upon",
    })

    @staticmethod
    def clean_text(text: str) -> str:
        """Remove headers, page numbers, excessive whitespace.

        Paragraph breaks (blank lines) are preserved, because
        ``segment_text`` relies on them to split paragraphs.
        """
        text = TextProcessor.HEADER_RE.sub("", text)
        text = re.sub(r"\n\s*\n\s*\n+", "\n\n", text)
        text = re.sub(r"[ \t]+", " ", text)
        # Strip leading indentation only. Using `\s` here would also match
        # the newline of empty lines and silently delete paragraph breaks.
        text = re.sub(r"^[ \t]+", "", text, flags=re.MULTILINE)
        return text.strip()

    @staticmethod
    def detect_chapters(text: str) -> List[Chapter]:
        """Split text into chapters by chapter headings.

        With fewer than two headings the whole text is returned as a single
        chapter titled "Full Text". Otherwise each chapter runs from its
        heading up to the next heading (or the end of the text).

        NOTE(review): text before the first heading is not part of any
        returned chapter — confirm this is the intended handling of front
        matter.
        """
        matches = list(TextProcessor.CHAPTER_RE.finditer(text))
        if len(matches) < 2:
            # No clear chapters; return as single chapter
            words = len(text.split())
            return [Chapter(idx=0, title="Full Text", text=text, word_count=words)]

        chapters = []
        for i, match in enumerate(matches):
            start = match.start()
            title = match.group(0).strip()
            end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
            ch_text = text[start:end].strip()
            words = len(ch_text.split())
            chapters.append(Chapter(idx=i, title=title, text=ch_text, word_count=words))
        return chapters

    @staticmethod
    def extract_characters(text: str, use_ai: bool = False) -> List[CharacterProfile]:
        """Find likely character names with three regex heuristics.

        1. Script-style labels (``Name: "..."``).
        2. A ``Name said/asked/...`` attribution in the 120 characters before
           a quoted passage.
        3. Capitalised words occurring at least three times that are not
           common English words.

        ``use_ai`` is accepted for signature compatibility but is not used by
        this method. Returns at most 12 profiles, most frequent first.
        """
        profiles: Dict[str, CharacterProfile] = {}

        def _count(name: str) -> None:
            # Register the name on first sight, then bump its tally.
            if name not in profiles:
                profiles[name] = CharacterProfile(name=name)
            profiles[name].occurrences += 1

        for match in TextProcessor.SPEAKER_RE.finditer(text):
            name = match.group(1).strip()
            if len(name) > 2:
                _count(name)

        for match in TextProcessor.DIALOGUE_RE.finditer(text):
            before = text[max(0, match.start() - 120):match.start()]
            said_match = TextProcessor._ATTRIBUTION_RE.search(before)
            if said_match:
                _count(said_match.group(1))

        all_names = TextProcessor.NAME_RE.findall(text)
        from collections import Counter
        common = Counter(all_names).most_common(30)
        for name, count in common:
            if count < 3 or len(name) <= 2 or name in profiles:
                continue
            if name.lower() in TextProcessor._NON_NAME_WORDS:
                continue
            profiles[name] = CharacterProfile(name=name, occurrences=count)

        result = sorted(profiles.values(), key=lambda p: p.occurrences, reverse=True)
        return result[:12]

    @staticmethod
    def segment_text(text: str, characters: List[str]) -> List[TextSegment]:
        """Split text into ordered narration and dialogue segments.

        Paragraphs are separated by blank lines. A paragraph starting with
        ``Name: "..."`` yields a dialogue segment attributed to ``Name``;
        otherwise double-quoted passages of 3-500 characters become dialogue
        segments with ``speaker=None`` and the rest becomes narration.
        Adjacent narration segments are merged.

        ``characters`` is accepted but not used by the current implementation.
        """
        text = text.replace("\r\n", "\n").replace("\r", "\n")
        paragraphs = [p.strip() for p in re.split(r'\n\s*\n', text) if p.strip()]
        segments = []

        for para in paragraphs:
            speaker_match = TextProcessor._LABELLED_LINE_RE.match(para)
            if speaker_match:
                speaker = speaker_match.group(1).strip()
                dialogue = speaker_match.group(2)
                segments.append(TextSegment(text=dialogue, seg_type="dialogue", speaker=speaker))
                remainder = para[speaker_match.end():].strip()
                if remainder:
                    segments.append(TextSegment(text=remainder, seg_type="narration"))
                continue

            # re.split with one capture group alternates outside/inside
            # quotes, so odd indices are the quoted passages.
            parts = re.split(r'"([^"]{3,500})"', para)
            for i, part in enumerate(parts):
                part = part.strip()
                if not part:
                    continue
                if i % 2 == 1:
                    segments.append(TextSegment(text=part, seg_type="dialogue", speaker=None))
                else:
                    segments.append(TextSegment(text=part, seg_type="narration"))

        merged = []
        for seg in segments:
            if merged and seg.seg_type == "narration" and merged[-1].seg_type == "narration":
                merged[-1].text += " " + seg.text
            else:
                merged.append(seg)
        return merged

    @staticmethod
    def chunk_segments(segments: List[TextSegment], max_chars: int = MAX_CHUNK_CHARS) -> List[TextSegment]:
        """Split segments longer than ``max_chars`` at sentence boundaries.

        Short segments pass through unchanged (same objects). Long segments
        are broken after ``.``, ``!`` or ``?`` and sentences are packed
        greedily into pieces of at most ``max_chars`` characters. A single
        sentence longer than ``max_chars`` is emitted whole. Every piece
        keeps the type, speaker, emotion hint and chapter index of the
        segment it came from.
        """
        def _piece(source: TextSegment, piece_text: str) -> TextSegment:
            # Copy all metadata so split pieces stay attributable.
            return TextSegment(
                text=piece_text.strip(),
                seg_type=source.seg_type,
                speaker=source.speaker,
                emotion_hint=source.emotion_hint,
                chapter_idx=source.chapter_idx,
            )

        result = []
        for seg in segments:
            if len(seg.text) <= max_chars:
                result.append(seg)
                continue
            sentences = re.split(r'(?<=[.!?])\s+', seg.text)
            current_text = ""
            for sent in sentences:
                if len(current_text) + len(sent) + 1 <= max_chars:
                    current_text += (" " if current_text else "") + sent
                else:
                    if current_text:
                        result.append(_piece(seg, current_text))
                    current_text = sent
            if current_text:
                result.append(_piece(seg, current_text))
        return result
397
+
398
+
399
+ # ---------------------------------------------------------------------------
400
+ # TTS Engine
401
+ # ---------------------------------------------------------------------------
402
 
403
+ class TTSEngine:
404
  def __init__(self, device: str = "cuda"):
405
  self.device = device
406
  self._custom_voice_model = None
 
469
  return self._design_model
470
 
471
  def _cache_key(self, text: str, voice: VoiceConfig) -> str:
472
+ payload = (
473
+ f"{text}|{voice.mode}|{voice.preset}|{voice.ref_audio}|"
474
+ f"{voice.design_desc}|{voice.instruct}|{voice.language}|{voice.speed}"
475
+ )
476
  return hashlib.md5(payload.encode()).hexdigest()
477
 
478
  def _cached_path(self, key: str) -> Path:
 
485
  temperature: float = 0.7,
486
  seed: int = 42,
487
  ) -> Tuple[np.ndarray, int]:
 
488
  cache_key = self._cache_key(text, voice)
489
  cache_path = self._cached_path(cache_key)
490
  if cache_path.exists():
 
526
  else:
527
  raise ValueError(f"Unknown voice mode: {voice.mode}")
528
 
 
529
  if isinstance(wavs, list):
530
  wavs = wavs[0]
531
  if wavs.ndim > 1:
532
  wavs = wavs.mean(axis=1)
533
 
534
+ # Apply speed adjustment
535
+ if voice.speed != 1.0 and voice.speed > 0.3:
536
+ wavs = self._adjust_speed(wavs, sr, voice.speed)
537
+
538
  sf.write(str(cache_path), wavs, sr)
539
  return wavs, sr
540
 
541
+ @staticmethod
542
+ def _adjust_speed(audio: np.ndarray, sr: int, speed: float) -> np.ndarray:
543
+ """Adjust audio speed using pydub."""
544
+ if abs(speed - 1.0) < 0.05:
545
+ return audio
546
+ # Convert to pydub AudioSegment
547
+ audio = (audio * 32767).astype(np.int16)
548
+ seg = AudioSegment(
549
+ audio.tobytes(),
550
+ frame_rate=sr,
551
+ sample_width=2,
552
+ channels=1 if audio.ndim == 1 else audio.shape[1],
553
+ )
554
+ if speed > 1.0:
555
+ seg = seg.speedup(playback_speed=speed)
556
+ else:
557
+ # slowdown by adding frames
558
+ seg = seg._spawn(seg.raw_data, overrides={
559
+ "frame_rate": int(seg.frame_rate * speed)
560
+ })
561
+ seg = seg.set_frame_rate(sr)
562
+ # Convert back to numpy
563
+ samples = np.array(seg.get_array_of_samples())
564
+ return samples.astype(np.float32) / 32767.0
565
+
566
  def status(self) -> Dict[str, Any]:
567
  return {
568
  "custom_loaded": self._custom_voice_model is not None,
 
571
  }
572
 
573
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
574
  # ---------------------------------------------------------------------------
575
  # Audio Utils
576
  # ---------------------------------------------------------------------------
577
 
578
  def stitch_audio(paths: List[str], crossfade_ms: int = CROSSFADE_MS) -> AudioSegment:
 
579
  if not paths:
580
  return AudioSegment.silent(duration=0)
581
  combined = AudioSegment.from_wav(paths[0])
582
  for p in paths[1:]:
583
  next_seg = AudioSegment.from_wav(p)
 
584
  if crossfade_ms > 0 and len(combined) > crossfade_ms and len(next_seg) > crossfade_ms:
585
  combined = combined.append(next_seg, crossfade=crossfade_ms)
586
  else:
 
589
 
590
 
591
def normalize_audio(audio: AudioSegment, target_dBFS: float = -1.5) -> AudioSegment:
    """Apply a uniform gain so the loudest peak sits at ``target_dBFS``.

    Completely silent audio is returned unchanged: its peak level is
    negative infinity, so no finite gain could reach the target.
    """
    peak = audio.max_dBFS
    if peak == float("-inf"):
        return audio
    return audio.apply_gain(target_dBFS - peak)
594
 
595
 
596
def save_audiobook(segments_paths: List[str], output_path: str, title: str = "Audiobook", fmt: str = "mp3") -> str:
    """Stitch, normalise and export the segments as one audio file.

    Args:
        segments_paths: WAV files to join, in playback order.
        output_path: Destination file path.
        title: Title written to the file's metadata tags.
        fmt: Output format: "mp3" (192k bitrate), "wav" or "ogg";
            matching is case-insensitive.

    Returns:
        ``output_path``, or "" when ``segments_paths`` is empty (nothing is
        written in that case).

    Raises:
        ValueError: If ``fmt`` is not a supported format. Previously such a
            call wrote nothing yet still returned ``output_path``.
    """
    if not segments_paths:
        return ""

    # Extra export arguments per supported format.
    export_options = {
        "mp3": {"bitrate": "192k"},
        "wav": {},
        "ogg": {},
    }
    fmt_key = fmt.lower()
    if fmt_key not in export_options:
        # Fail before the expensive stitching step.
        raise ValueError(
            f"Unsupported audio format: {fmt!r} (expected one of: mp3, wav, ogg)"
        )

    combined = stitch_audio(segments_paths)
    combined = normalize_audio(combined)
    combined.export(
        output_path,
        format=fmt_key,
        tags={"title": title, "artist": "AudioBook Forge"},
        **export_options[fmt_key],
    )
    return output_path
608
+
609
+
610
def save_segment_zip(segments_paths: List[str], output_path: str) -> str:
    """Bundle the segment files into a deflate-compressed ZIP archive.

    Each file is stored as ``segment_NNNN.wav``, numbered by its position in
    ``segments_paths`` starting at 0000. Returns ``output_path``.
    """
    member_names = (f"segment_{n:04d}.wav" for n in range(len(segments_paths)))
    with zipfile.ZipFile(output_path, "w", zipfile.ZIP_DEFLATED) as bundle:
        for source_path, member in zip(segments_paths, member_names):
            bundle.write(source_path, member)
    return output_path
617
 
618
 
619
def estimate_duration(word_count: int, wpm: int = WORDS_PER_MINUTE) -> str:
    """Return a human-readable narration time estimate.

    Args:
        word_count: Number of words to narrate; negative values count as 0.
        wpm: Assumed speaking rate in words per minute; must be positive.

    Returns:
        "N seconds" below one minute, "M.M minutes" below one hour, and
        "Hh Mm" otherwise.

    Raises:
        ValueError: If ``wpm`` is zero or negative.
    """
    if wpm <= 0:
        raise ValueError(f"wpm must be positive, got {wpm}")
    minutes = max(word_count, 0) / wpm
    if minutes < 1:
        return f"{int(minutes * 60)} seconds"
    rounded = f"{minutes:.1f}"
    # Compare the rounded value so 59.96 minutes is not shown as "60.0 minutes".
    if float(rounded) < 60:
        return f"{rounded} minutes"
    # Values that only reach an hour through rounding count as exactly 60.
    whole_minutes = max(int(minutes), 60)
    hours, mins = divmod(whole_minutes, 60)
    return f"{hours}h {mins}m"
629
+
630
+
631
+ # ---------------------------------------------------------------------------
632
+ # Project Save/Load
633
+ # ---------------------------------------------------------------------------
634
+
635
def save_project(
    text: str,
    narrator: VoiceConfig,
    characters: Dict[str, VoiceConfig],
    settings: dict,
) -> str:
    """Serialise a project to an indented JSON string.

    Only the first 2000 characters of ``text`` are stored (followed by "..."
    when truncated), together with the first 16 hex digits of the MD5 of the
    full text. Voice configurations are stored as dicts and ``settings`` is
    stored as given.
    """
    was_truncated = len(text) > 2000
    excerpt = text[:2000]
    if was_truncated:
        excerpt += "..."
    fingerprint = hashlib.md5(text.encode()).hexdigest()[:16]
    narrator_dict = narrator.to_dict()
    character_dicts = {}
    for char_name, char_voice in characters.items():
        character_dicts[char_name] = char_voice.to_dict()

    project = {
        "version": "1.1",
        "text_sample": excerpt,
        "text_hash": fingerprint,
        "narrator": narrator_dict,
        "characters": character_dicts,
        "settings": settings,
    }
    return json.dumps(project, indent=2)
651
+
652
+
653
def load_project(json_str: str) -> dict:
    """Load project from JSON string.

    For version "1.x" data (a missing version counts as "1.0") the
    ``narrator`` entry and every ``characters`` entry are converted to
    ``VoiceConfig`` objects. A missing or null narrator becomes a default
    ``VoiceConfig``; missing or null characters become an empty dict. Other
    versions are returned as parsed.

    Raises:
        ValueError: If the JSON is invalid (``json.JSONDecodeError`` is a
            ``ValueError`` subclass) or its top level is not an object.
    """
    data = json.loads(json_str)
    if not isinstance(data, dict):
        raise ValueError("Project data must be a JSON object")
    # A hand-edited file may carry the version as a number (1.1), not a string.
    version = str(data.get("version", "1.0"))
    if version.startswith("1."):
        data["narrator"] = VoiceConfig.from_dict(data.get("narrator") or {})
        data["characters"] = {
            k: VoiceConfig.from_dict(v)
            for k, v in (data.get("characters") or {}).items()
        }
    return data
660
+
661
+
662
  # ---------------------------------------------------------------------------
663
+ # AI Character Extraction
664
  # ---------------------------------------------------------------------------
665
 
666
  def ai_extract_characters(text: str, api_token: Optional[str] = None) -> List[CharacterProfile]:
 
667
  try:
668
  from huggingface_hub import InferenceClient
669
  client = InferenceClient(token=api_token or os.getenv("HF_TOKEN"))
 
 
670
  sample = text[:4000] + ("\n...[truncated]" if len(text) > 4000 else "")
 
671
  prompt = (
672
  "Extract all named characters from the following story excerpt. "
673
  "For each character, provide their name and a brief description of their personality/role. "
674
  "Return ONLY a JSON array like: [{\"name\":\"Alice\",\"description\":\"Curious young girl\"},...]\n\n"
675
  f"STORY:\n{sample}\n\nJSON:"
676
  )
 
677
  response = client.text_generation(
678
  model="Qwen/Qwen3-1.7B",
679
  prompt=prompt,
 
681
  temperature=0.3,
682
  return_full_text=False,
683
  )
 
 
684
  json_match = re.search(r'\[.*?\]', response, re.DOTALL)
685
  if json_match:
686
  data = json.loads(json_match.group())
 
707
  self.temp_dir = Path(tempfile.gettempdir()) / "audiobook_segments"
708
  self.temp_dir.mkdir(exist_ok=True)
709
 
710
+ def parse_upload(self, filepath: str) -> Tuple[str, str]:
711
+ return parse_file(filepath)
712
+
713
+ def detect_chapters(self, text: str) -> List[Dict]:
714
+ chapters = self.processor.detect_chapters(text)
715
+ return [
716
+ {"idx": c.idx, "title": c.title, "word_count": c.word_count}
717
+ for c in chapters
718
+ ]
719
+
720
+ def get_chapter_text(self, text: str, chapter_idx: int) -> str:
721
+ chapters = self.processor.detect_chapters(text)
722
+ if 0 <= chapter_idx < len(chapters):
723
+ return chapters[chapter_idx].text
724
+ return text
725
+
726
  def extract_characters(self, text: str, use_ai: bool = False) -> List[Dict]:
727
  if use_ai:
728
  profiles = ai_extract_characters(text)
 
738
  "voice_mode": "preset",
739
  "voice_preset": "Ryan",
740
  "voice_instruct": "",
741
+ "speed": 1.0,
742
+ "language": "English",
743
  }
744
  for p in profiles
745
  ]
746
 
747
+ def preview_segment(
748
+ self,
749
+ text: str,
750
+ voice: VoiceConfig,
751
+ temperature: float = 0.7,
752
+ seed: int = 42,
753
+ ) -> Tuple[np.ndarray, int]:
754
+ return self.tts.synthesize(text, voice, temperature=temperature, seed=seed)
755
+
756
  def generate(
757
  self,
758
  text: str,
 
761
  progress_callback=None,
762
  temperature: float = 0.7,
763
  seed: int = 42,
764
+ ) -> Tuple[str, List[str], List[Dict]]:
765
  """
766
  Generate audiobook.
767
+ Returns (final_path, segment_paths, segment_metadata).
768
  """
769
  segments = self.processor.segment_text(text, list(character_configs.keys()))
770
  segments = self.processor.chunk_segments(segments)
771
 
772
  segment_paths = []
773
+ segment_meta = []
774
  total = len(segments)
775
 
776
  for i, seg in enumerate(segments):
777
  if progress_callback:
778
+ progress_callback(i / total, f"Segment {i+1}/{total} ({seg.seg_type})...")
779
 
 
780
  if seg.seg_type == "dialogue" and seg.speaker and seg.speaker in character_configs:
781
  voice = character_configs[seg.speaker]
782
  else:
 
787
  seg_path = self.temp_dir / f"seg_{i:04d}_{voice.name}.wav"
788
  sf.write(str(seg_path), wav, sr)
789
  segment_paths.append(str(seg_path))
790
+ segment_meta.append({
791
+ "idx": i,
792
+ "type": seg.seg_type,
793
+ "speaker": seg.speaker or voice.name,
794
+ "text": seg.text[:100] + ("..." if len(seg.text) > 100 else ""),
795
+ "path": str(seg_path),
796
+ })
797
  except Exception as e:
798
  print(f"[Pipeline] Segment {i} failed: {e}")
 
799
  silent = AudioSegment.silent(duration=500)
800
  seg_path = self.temp_dir / f"seg_{i:04d}_silent.wav"
801
  silent.export(str(seg_path), format="wav")
802
  segment_paths.append(str(seg_path))
803
+ segment_meta.append({
804
+ "idx": i,
805
+ "type": seg.seg_type,
806
+ "speaker": voice.name,
807
+ "text": seg.text[:100] + "...",
808
+ "path": str(seg_path),
809
+ "error": str(e),
810
+ })
811
 
812
  if progress_callback:
813
+ progress_callback(1.0, "Finalizing audiobook...")
814
 
815
  output_path = str(self.temp_dir / "audiobook_final.mp3")
816
  save_audiobook(segment_paths, output_path, title="Generated Audiobook")
817
+ return output_path, segment_paths, segment_meta
818
+
819
+ def export_segments_zip(self, segment_paths: List[str]) -> str:
820
+ output_path = str(self.temp_dir / "audiobook_segments.zip")
821
+ return save_segment_zip(segment_paths, output_path)
822
 
823
  def preview_voice(
824
  self,
requirements.txt CHANGED
@@ -9,3 +9,4 @@ huggingface-hub>=0.23.0
9
  soundfile>=0.12.0
10
  pydub>=0.25.0
11
  numpy>=1.26.0
 
 
9
  soundfile>=0.12.0
10
  pydub>=0.25.0
11
  numpy>=1.26.0
12
+ PyPDF2>=3.0.0