XXiao committed on
Commit
df2999f
·
1 Parent(s): 6c66831
Files changed (2) hide show
  1. README.md +2 -0
  2. app.py +381 -72
README.md CHANGED
@@ -84,6 +84,7 @@ git push hf main
84
  API_UR="https://dashscope.aliyuncs.com/compatible-mode/v1"
85
  API_KEY="你的Key"
86
  USE_MOCK_MODELS=0
 
87
 
88
  # TTS 走 HF Space(优先)
89
  HF_TTS_SPACE_ID="your-org/audio"
@@ -100,6 +101,7 @@ HF_TTS_ALLOW_FALLBACK=1
100
 
101
  - 如果你更希望用完整 URL,可以改为 `HF_TTS_SPACE_URL="https://your-org-audio.hf.space"`。
102
  - 如果不想回退到原 TTS 接口,设置 `HF_TTS_ALLOW_FALLBACK=0`。
 
103
 
104
  ## 角色目录结构(自动发现)
105
 
 
84
  API_UR="https://dashscope.aliyuncs.com/compatible-mode/v1"
85
  API_KEY="你的Key"
86
  USE_MOCK_MODELS=0
87
+ USE_MOCK_TTS=0
88
 
89
  # TTS 走 HF Space(优先)
90
  HF_TTS_SPACE_ID="your-org/audio"
 
101
 
102
  - 如果你更希望用完整 URL,可以改为 `HF_TTS_SPACE_URL="https://your-org-audio.hf.space"`。
103
  - 如果不想回退到原 TTS 接口,设置 `HF_TTS_ALLOW_FALLBACK=0`。
104
+ - 如果只想 mock 文本/题目,但 TTS 用真实接口:`USE_MOCK_MODELS=1` 且 `USE_MOCK_TTS=0`。
105
 
106
  ## 角色目录结构(自动发现)
107
 
app.py CHANGED
@@ -63,6 +63,7 @@ _load_dotenv_file(APP_DIR / ".env")
63
  API_URL = os.getenv("API_URL") or os.getenv("API_UR", "")
64
  API_KEY = os.getenv("API_KEY", "")
65
  USE_MOCK_MODELS = os.getenv("USE_MOCK_MODELS", "0" if (API_URL and API_KEY) else "1") == "1"
 
66
  CHAT_MODEL_ID = os.getenv("QWEN_VL_MODEL_ID", "qwen-vl-max")
67
  TTS_MODEL_ID = os.getenv("QWEN_TTS_MODEL_ID", "qwen-tts")
68
  TTS_SPEAKER = os.getenv("QWEN_TTS_SPEAKER", "longxiaochun_v2")
@@ -258,6 +259,7 @@ def new_session_state() -> Dict[str, Any]:
258
  return {
259
  "lecture_text": "",
260
  "lecture_audio_path": None,
 
261
  "explanation_audio_path": None,
262
  "last_explanation_tts_text": "",
263
  "pdf_path": None,
@@ -416,10 +418,10 @@ def _is_hf_tts_enabled() -> bool:
416
 
417
 
418
  def _tts_backend_name() -> str:
 
 
419
  if _is_hf_tts_enabled():
420
  return f"hf_space:{HF_TTS_SPACE_ID or HF_TTS_SPACE_URL}"
421
- if USE_MOCK_MODELS:
422
- return "mock_tts"
423
  return "api_tts"
424
 
425
 
@@ -492,6 +494,34 @@ def split_text_for_tts(text: str, max_len: int = 480) -> List[str]:
492
  return chunks
493
 
494
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
495
  def concat_wav_files(wav_paths: List[str], out_path: str) -> str:
496
  if not wav_paths:
497
  raise RuntimeError("No WAV chunks to concatenate.")
@@ -525,7 +555,8 @@ class QwenPipelineEngine:
525
  PDF -> lecture text -> MCQs -> TTS audio
526
 
527
  This ships with a mock mode by default so the workflow is runnable immediately.
528
- When USE_MOCK_MODELS=0, it calls remote APIs:
 
529
  - VL: OpenAI-compatible /chat/completions (works with DashScope compatible-mode and vLLM-style APIs)
530
  - TTS: HF Space /tts_chunk (optional) or DashScope/OpenAI-compatible endpoints
531
  """
@@ -551,11 +582,11 @@ class QwenPipelineEngine:
551
  def ensure_tts_loaded(self) -> None:
552
  if self.tts_loaded:
553
  return
554
- if _is_hf_tts_enabled():
555
- self._ensure_hf_tts_client()
556
  self.tts_loaded = True
557
  return
558
- if self.mock_mode:
 
559
  self.tts_loaded = True
560
  return
561
  _require_api_url()
@@ -586,33 +617,62 @@ class QwenPipelineEngine:
586
  return self._hf_tts_client
587
 
588
  def _hf_space_tts_single(self, text: str, out_path: str, *, voice: str, language: str) -> str:
589
- client = self._ensure_hf_tts_client()
590
  configured = (HF_TTS_API_NAME or "").strip()
591
  normalized = configured.lstrip("/")
592
- api_candidates: List[str] = []
593
- for cand in [configured, f"/{normalized}" if normalized else "", normalized, "/tts_chunk", "tts_chunk", "/predict", "predict"]:
594
- cand = cand.strip()
595
- if cand and cand not in api_candidates:
596
- api_candidates.append(cand)
597
 
598
  result: Any = None
599
  last_exc: Optional[Exception] = None
600
- for api_name in api_candidates:
601
- try:
602
- result = client.predict(
603
- text=text,
604
- voice=voice,
605
- language=language,
606
- api_name=api_name,
607
- )
608
- last_exc = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
609
  break
610
- except Exception as exc:
611
- msg = str(exc)
612
- if "Cannot find a function with api_name" in msg:
613
- last_exc = exc
614
- continue
615
- raise
616
  if last_exc is not None:
617
  available_hint = ""
618
  view_api = getattr(client, "view_api", None)
@@ -757,7 +817,7 @@ class QwenPipelineEngine:
757
  except Exception as exc:
758
  if not HF_TTS_ALLOW_FALLBACK:
759
  raise RuntimeError(f"HF Space TTS failed and fallback is disabled: {type(exc).__name__}: {exc}")
760
- if self.mock_mode:
761
  return write_tone_wav(text, out_path)
762
 
763
  openai_url = f"{_require_api_url()}/audio/speech"
@@ -808,12 +868,8 @@ class QwenPipelineEngine:
808
  raise RuntimeError(f"Failed to download TTS audio {audio_resp.status_code}: {audio_resp.text[:500]}")
809
  return _save_binary_audio(audio_resp.content, out_path)
810
 
811
- def _real_tts(self, text: str, out_path: str, *, voice: Optional[str] = None) -> str:
812
- if not TEXT_SPLIT_TO_CHUNK:
813
- if not str(text or "").strip():
814
- return write_tone_wav("empty", out_path)
815
- return self._real_tts_single(str(text), out_path, voice=voice)
816
- chunks = split_text_for_tts(text, max_len=480)
817
  if not chunks:
818
  return write_tone_wav("empty", out_path)
819
  if len(chunks) == 1:
@@ -825,6 +881,31 @@ class QwenPipelineEngine:
825
  chunk_paths.append(self._real_tts_single(chunk, chunk_path, voice=voice))
826
  return concat_wav_files(chunk_paths, out_path)
827
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
828
  @spaces.GPU
829
  def build_lesson_and_quiz(self, pdf_path: str, character_cfg: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
830
  self.ensure_vl_loaded()
@@ -942,7 +1023,7 @@ class QwenPipelineEngine:
942
  def synthesize_tts(self, text: str, name_prefix: str = "audio", *, voice: Optional[str] = None) -> str:
943
  self.ensure_tts_loaded()
944
  out_path = str(TMP_DIR / f"{name_prefix}_{uuid.uuid4().hex}.wav")
945
- if self.mock_mode and not _is_hf_tts_enabled():
946
  return write_tone_wav(text, out_path)
947
  return self._real_tts(text, out_path, voice=voice)
948
 
@@ -1416,6 +1497,7 @@ def reset_ui_from_state(
1416
  submit_interactive = quiz_ready and not state.get("completed", False)
1417
  radio_interactive = submit_interactive
1418
  lecture_tts_ready = bool(state.get("lecture_text"))
 
1419
  if state.get("completed"):
1420
  radio_interactive = False
1421
  return (
@@ -1428,7 +1510,13 @@ def reset_ui_from_state(
1428
  gr.update(visible=show_explain_page),
1429
  gr.update(visible=show_exam_page),
1430
  state.get("status", "Idle"),
1431
- build_clickable_lecture_html(state.get("lecture_text", "")),
 
 
 
 
 
 
1432
  state.get("lecture_audio_path", None),
1433
  gr.update(interactive=lecture_tts_ready),
1434
  gr.update(visible=lecture_tts_ready, interactive=lecture_tts_ready),
@@ -1465,6 +1553,7 @@ def process_pdf(pdf_file: Optional[str], character_id: str, state: Dict[str, Any
1465
 
1466
  state["lecture_text"] = lecture_text
1467
  state["lecture_audio_path"] = None
 
1468
  state["explanation_audio_path"] = None
1469
  state["last_explanation_tts_text"] = ""
1470
  state["pdf_path"] = pdf_file
@@ -1712,6 +1801,7 @@ def on_character_change(character_id: str, state: Dict[str, Any]):
1712
  state["character_id"] = cfg["id"]
1713
  state["current_page"] = "explain"
1714
  state["lecture_audio_path"] = None
 
1715
  state["explanation_audio_path"] = None
1716
  state["last_explanation_tts_text"] = ""
1717
  # Keep generated content if user wants to compare, but hide result pages until next generate.
@@ -1739,14 +1829,26 @@ def tts_voice_for_character(character_id: Optional[str]) -> str:
1739
  def play_lecture_audio(state: Dict[str, Any]):
1740
  if not state.get("lecture_text"):
1741
  state["status"] = "No lecture text available."
1742
- return state, state["status"], state.get("lecture_audio_path"), "Generate lecture first."
 
 
 
 
 
 
1743
  backend = _tts_backend_name()
1744
  voice = tts_voice_for_character(state.get("character_id"))
1745
  try:
1746
  state["status"] = f"Generating lecture audio ({backend})..."
1747
  state["lecture_audio_path"] = engine.synthesize_tts(state["lecture_text"], name_prefix="lecture", voice=voice)
1748
  state["status"] = "Lecture audio ready."
1749
- return state, state["status"], state["lecture_audio_path"], f"Lecture audio generated via `{backend}`."
 
 
 
 
 
 
1750
  except Exception as exc:
1751
  state["status"] = "Lecture audio generation failed."
1752
  return (
@@ -1754,6 +1856,7 @@ def play_lecture_audio(state: Dict[str, Any]):
1754
  state["status"],
1755
  state.get("lecture_audio_path"),
1756
  f"TTS error via `{backend}`: {type(exc).__name__}: {exc}",
 
1757
  )
1758
 
1759
 
@@ -1765,14 +1868,39 @@ def split_lecture_paragraphs(text: str) -> List[str]:
1765
  return [p.strip() for p in pieces if p and p.strip()]
1766
 
1767
 
1768
- def build_clickable_lecture_html(lecture_text: str) -> str:
 
 
 
 
 
 
 
 
 
 
 
1769
  paragraphs = split_lecture_paragraphs(lecture_text)
1770
  if not paragraphs:
1771
  return '<div class="lecture-empty">Generated lecture explanation will appear here...</div>'
 
1772
  parts: List[str] = ['<div class="lecture-clickable">']
1773
  for i, p in enumerate(paragraphs):
1774
  safe = html.escape(p, quote=False).replace("\n", "<br>")
1775
- parts.append(f'<div class="lecture-paragraph" data-idx="{i}">{safe}</div>')
 
 
 
 
 
 
 
 
 
 
 
 
 
1776
  parts.append("</div>")
1777
  return "".join(parts)
1778
 
@@ -1782,7 +1910,13 @@ def play_lecture_paragraph_audio(paragraph_idx: str, state: Dict[str, Any]):
1782
  paragraphs = split_lecture_paragraphs(str(lecture_text or ""))
1783
  if not paragraphs:
1784
  state["status"] = "暂无讲解内容。"
1785
- return state, state.get("status", "Idle"), state.get("lecture_audio_path"), "请先生成讲解。"
 
 
 
 
 
 
1786
 
1787
  try:
1788
  idx = int(str(paragraph_idx or "").strip())
@@ -1790,11 +1924,18 @@ def play_lecture_paragraph_audio(paragraph_idx: str, state: Dict[str, Any]):
1790
  idx = -1
1791
  if idx < 0 or idx >= len(paragraphs):
1792
  state["status"] = "段落选择无效。"
1793
- return state, state.get("status", "Idle"), state.get("lecture_audio_path"), "请重新点击要播放的段落。"
 
 
 
 
 
 
1794
 
1795
  backend = _tts_backend_name()
1796
  voice = tts_voice_for_character(state.get("character_id"))
1797
  try:
 
1798
  state["status"] = f"正在生成段落语音({backend})..."
1799
  audio_path = engine.synthesize_tts(
1800
  paragraphs[idx],
@@ -1803,10 +1944,23 @@ def play_lecture_paragraph_audio(paragraph_idx: str, state: Dict[str, Any]):
1803
  )
1804
  state["lecture_audio_path"] = audio_path
1805
  state["status"] = "段落语音已生成。"
1806
- return state, state["status"], audio_path, f"已生成第 {idx+1}/{len(paragraphs)} 段语音,可在下方播放。"
 
 
 
 
 
 
 
1807
  except Exception as exc:
1808
  state["status"] = "段落语音生成失败。"
1809
- return state, state["status"], state.get("lecture_audio_path"), f"TTS error via `{backend}`: {type(exc).__name__}: {exc}"
 
 
 
 
 
 
1810
 
1811
 
1812
  def play_explanation_audio(state: Dict[str, Any]):
@@ -1825,6 +1979,32 @@ def play_explanation_audio(state: Dict[str, Any]):
1825
  return state, state["status"], state.get("explanation_audio_path"), f"TTS error: {type(exc).__name__}: {exc}"
1826
 
1827
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1828
  def build_css() -> str:
1829
  bg_css = ""
1830
 
@@ -2130,8 +2310,16 @@ body {{
2130
  line-height: 1.45 !important;
2131
  color: rgba(244,246,251,0.95) !important;
2132
  }}
 
 
 
 
 
 
 
2133
  #lecture-clickable .lecture-paragraph {{
2134
  cursor: pointer;
 
2135
  padding: 10px 12px;
2136
  border-radius: 14px;
2137
  margin: 0 0 10px 0;
@@ -2145,6 +2333,18 @@ body {{
2145
  background: rgba(255,255,255,0.08);
2146
  border-color: rgba(255,255,255,0.14);
2147
  }}
 
 
 
 
 
 
 
 
 
 
 
 
2148
  .lecture-empty {{
2149
  padding: 10px 12px;
2150
  color: rgba(244,246,251,0.72);
@@ -2367,6 +2567,18 @@ body {{
2367
  background: rgba(15, 23, 42, 0.06);
2368
  border-color: rgba(15, 23, 42, 0.16);
2369
  }}
 
 
 
 
 
 
 
 
 
 
 
 
2370
  .lecture-empty {{
2371
  color: rgba(15, 23, 42, 0.72);
2372
  }}
@@ -2739,7 +2951,7 @@ with gr.Blocks(css=CSS) as demo:
2739
  () => {
2740
  const state = window.__lectureClickTtsGlobal || (window.__lectureClickTtsGlobal = {});
2741
  if (state.bound) return;
2742
- state.bound = true;
2743
  const grRoot = (typeof window.gradioApp === "function") ? window.gradioApp() : null;
2744
  const rootCandidates = [
2745
  document,
@@ -2786,30 +2998,112 @@ with gr.Blocks(css=CSS) as demo:
2786
  state.observer = new MutationObserver(() => bindAudioLoading());
2787
  state.observer.observe(document.body, { childList: true, subtree: true, attributes: true });
2788
  }
2789
- document.addEventListener(
2790
- "click",
2791
- (e) => {
2792
- const path = (e && typeof e.composedPath === "function") ? e.composedPath() : [];
2793
- let para = null;
2794
- for (const n of path) {
2795
- if (n && n.classList && n.classList.contains("lecture-paragraph")) { para = n; break; }
 
 
 
 
 
 
 
 
 
 
 
 
2796
  }
2797
- if (!para) return;
2798
- const idx = para.getAttribute("data-idx");
2799
- const input = q("#selected-paragraph textarea, #selected-paragraph input");
2800
- const btn = q("#play-paragraph-btn button");
2801
- if (!input || !btn) {
2802
- showLoading("未找到段落播放控件,请刷新页面重试。");
2803
- return;
 
 
2804
  }
2805
- input.value = idx || "";
2806
- input.dispatchEvent(new Event("input", { bubbles: true }));
2807
- input.dispatchEvent(new Event("change", { bubbles: true }));
2808
- showLoading("正在生成语音...");
2809
- btn.click();
2810
- },
2811
- true
2812
- );
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2813
  }
2814
  """,
2815
  )
@@ -2827,7 +3121,15 @@ with gr.Blocks(css=CSS) as demo:
2827
  )
2828
  with gr.Row(elem_id="lecture-actions"):
2829
  play_lecture_btn = gr.Button("Play Lecture Audio", interactive=False, scale=0)
2830
- gr.Markdown("提示:点击任意段落可生成该段语音并在下方播放。", elem_id="paragraph-tts-tip")
 
 
 
 
 
 
 
 
2831
  lecture_feedback = gr.Markdown("")
2832
  with gr.Row(elem_id="exam-entry-wrap"):
2833
  exam_btn = gr.Button("Go to Exam", interactive=False, variant="secondary", scale=0)
@@ -2943,6 +3245,7 @@ with gr.Blocks(css=CSS) as demo:
2943
  exam_page,
2944
  status_box,
2945
  lecture_box,
 
2946
  lecture_audio,
2947
  play_lecture_btn,
2948
  exam_btn,
@@ -2969,15 +3272,21 @@ with gr.Blocks(css=CSS) as demo:
2969
  submit_btn.click(fn=submit_answer, inputs=[choice_radio, state], outputs=outputs, show_progress="hidden")
2970
  restart_btn.click(fn=restart_quiz, inputs=[state], outputs=outputs, show_progress="hidden")
2971
  play_lecture_btn.click(
2972
- fn=play_lecture_audio,
2973
  inputs=[state],
2974
- outputs=[state, status_box, lecture_audio, lecture_feedback],
2975
  show_progress="minimal",
2976
  )
2977
  play_paragraph_btn.click(
2978
- fn=play_lecture_paragraph_audio,
2979
  inputs=[paragraph_idx, state],
2980
- outputs=[state, status_box, lecture_audio, lecture_feedback],
 
 
 
 
 
 
2981
  show_progress="minimal",
2982
  )
2983
 
 
63
  API_URL = os.getenv("API_URL") or os.getenv("API_UR", "")
64
  API_KEY = os.getenv("API_KEY", "")
65
  USE_MOCK_MODELS = os.getenv("USE_MOCK_MODELS", "0" if (API_URL and API_KEY) else "1") == "1"
66
+ USE_MOCK_TTS = os.getenv("USE_MOCK_TTS", "0") == "1"
67
  CHAT_MODEL_ID = os.getenv("QWEN_VL_MODEL_ID", "qwen-vl-max")
68
  TTS_MODEL_ID = os.getenv("QWEN_TTS_MODEL_ID", "qwen-tts")
69
  TTS_SPEAKER = os.getenv("QWEN_TTS_SPEAKER", "longxiaochun_v2")
 
259
  return {
260
  "lecture_text": "",
261
  "lecture_audio_path": None,
262
+ "selected_paragraph_idx": "",
263
  "explanation_audio_path": None,
264
  "last_explanation_tts_text": "",
265
  "pdf_path": None,
 
418
 
419
 
420
def _tts_backend_name() -> str:
    """Return a short identifier for the TTS backend that will be used.

    Priority order: explicit mock flag, then HF Space, then the generic
    OpenAI-compatible API.
    """
    if USE_MOCK_TTS:
        # The mock flag wins so TTS can be mocked independently of the
        # text-generation mock (USE_MOCK_MODELS).
        return "mock_tts"
    if not _is_hf_tts_enabled():
        return "api_tts"
    target = HF_TTS_SPACE_ID or HF_TTS_SPACE_URL
    return f"hf_space:{target}"
426
 
427
 
 
494
  return chunks
495
 
496
 
497
def split_text_every_two_sentences(text: str, max_len: int = 480) -> List[str]:
    """Split *text* into TTS-sized chunks of roughly two sentences each.

    Whitespace is normalized first, and inputs at or under ``max_len`` are
    returned as a single chunk.  Sentence boundaries are CJK/ASCII terminal
    punctuation.  A two-sentence group that still exceeds ``max_len`` falls
    back to the generic length-based splitter.
    """
    cleaned = re.sub(r"\s+", " ", (text or "")).strip()
    if not cleaned:
        return []
    if len(cleaned) <= max_len:
        return [cleaned]

    sentences = [s.strip() for s in re.split(r"(?<=[。!?!?;;::\.])\s*", cleaned) if s and s.strip()]
    if not sentences:
        # No recognizable sentence boundaries: defer to the generic splitter.
        return split_text_for_tts(cleaned, max_len=max_len)

    chunks: List[str] = []
    # Walk the sentences two at a time; a trailing odd sentence forms its
    # own group.
    for start in range(0, len(sentences), 2):
        group = " ".join(sentences[start:start + 2]).strip()
        if not group:
            continue
        if len(group) <= max_len:
            chunks.append(group)
        else:
            chunks.extend(split_text_for_tts(group, max_len=max_len))
    return [c for c in chunks if c and c.strip()]
523
+
524
+
525
  def concat_wav_files(wav_paths: List[str], out_path: str) -> str:
526
  if not wav_paths:
527
  raise RuntimeError("No WAV chunks to concatenate.")
 
555
  PDF -> lecture text -> MCQs -> TTS audio
556
 
557
  This ships with a mock mode by default so the workflow is runnable immediately.
558
+ When USE_MOCK_MODELS=0, it calls remote APIs for text generation.
559
+ TTS mock is controlled separately by USE_MOCK_TTS.
560
  - VL: OpenAI-compatible /chat/completions (works with DashScope compatible-mode and vLLM-style APIs)
561
  - TTS: HF Space /tts_chunk (optional) or DashScope/OpenAI-compatible endpoints
562
  """
 
582
  def ensure_tts_loaded(self) -> None:
583
  if self.tts_loaded:
584
  return
585
+ if USE_MOCK_TTS:
 
586
  self.tts_loaded = True
587
  return
588
+ if _is_hf_tts_enabled():
589
+ self._ensure_hf_tts_client()
590
  self.tts_loaded = True
591
  return
592
  _require_api_url()
 
617
  return self._hf_tts_client
618
 
619
  def _hf_space_tts_single(self, text: str, out_path: str, *, voice: str, language: str) -> str:
 
620
  configured = (HF_TTS_API_NAME or "").strip()
621
  normalized = configured.lstrip("/")
 
 
 
 
 
622
 
623
  result: Any = None
624
  last_exc: Optional[Exception] = None
625
+ api_candidates: List[str] = []
626
+ for attempt in range(2):
627
+ client = self._ensure_hf_tts_client()
628
+ api_prefix = ""
629
+ cfg = getattr(client, "config", None)
630
+ if isinstance(cfg, dict):
631
+ api_prefix = str(cfg.get("api_prefix") or "").strip()
632
+
633
+ api_candidates = []
634
+ prefixed = f"{api_prefix.rstrip('/')}/{normalized}" if api_prefix and normalized else ""
635
+ for cand in [
636
+ configured,
637
+ f"/{normalized}" if normalized else "",
638
+ normalized,
639
+ prefixed,
640
+ "/gradio_api/tts_chunk",
641
+ "gradio_api/tts_chunk",
642
+ "/tts_chunk",
643
+ "tts_chunk",
644
+ "/predict",
645
+ "predict",
646
+ ]:
647
+ cand = cand.strip()
648
+ if cand and cand not in api_candidates:
649
+ api_candidates.append(cand)
650
+
651
+ result = None
652
+ last_exc = None
653
+ for api_name in api_candidates:
654
+ try:
655
+ result = client.predict(
656
+ text=text,
657
+ voice=voice,
658
+ language=language,
659
+ api_name=api_name,
660
+ )
661
+ last_exc = None
662
+ break
663
+ except Exception as exc:
664
+ msg = str(exc)
665
+ lower_msg = msg.lower()
666
+ if ("cannot find a function" in lower_msg) and ("api_name" in lower_msg):
667
+ last_exc = exc
668
+ continue
669
+ raise
670
+ if last_exc is None:
671
  break
672
+ # Refresh cached client once in case the upstream app reloaded and endpoints changed.
673
+ if attempt == 0:
674
+ self._hf_tts_client = None
675
+
 
 
676
  if last_exc is not None:
677
  available_hint = ""
678
  view_api = getattr(client, "view_api", None)
 
817
  except Exception as exc:
818
  if not HF_TTS_ALLOW_FALLBACK:
819
  raise RuntimeError(f"HF Space TTS failed and fallback is disabled: {type(exc).__name__}: {exc}")
820
+ if USE_MOCK_TTS:
821
  return write_tone_wav(text, out_path)
822
 
823
  openai_url = f"{_require_api_url()}/audio/speech"
 
868
  raise RuntimeError(f"Failed to download TTS audio {audio_resp.status_code}: {audio_resp.text[:500]}")
869
  return _save_binary_audio(audio_resp.content, out_path)
870
 
871
+ def _synthesize_tts_chunks(self, chunks: List[str], out_path: str, *, voice: Optional[str] = None) -> str:
872
+ chunks = [str(c or "").strip() for c in chunks if str(c or "").strip()]
 
 
 
 
873
  if not chunks:
874
  return write_tone_wav("empty", out_path)
875
  if len(chunks) == 1:
 
881
  chunk_paths.append(self._real_tts_single(chunk, chunk_path, voice=voice))
882
  return concat_wav_files(chunk_paths, out_path)
883
 
884
def _real_tts(self, text: str, out_path: str, *, voice: Optional[str] = None) -> str:
    """Synthesize *text* into a WAV file at *out_path* via the real backend.

    Empty input yields a placeholder tone.  When TEXT_SPLIT_TO_CHUNK is
    enabled the text is always length-split before synthesis; otherwise a
    single-shot synthesis is attempted first, and only "text too long" style
    backend failures trigger a retry with two-sentence chunking.
    """
    cleaned = str(text or "").strip()
    if not cleaned:
        return write_tone_wav("empty", out_path)

    if TEXT_SPLIT_TO_CHUNK:
        return self._synthesize_tts_chunks(split_text_for_tts(cleaned, max_len=480), out_path, voice=voice)

    try:
        return self._real_tts_single(cleaned, out_path, voice=voice)
    except Exception as exc:
        # Retry with chunking only for backend "too long" complaints;
        # anything else is a genuine failure and propagates unchanged.
        err = str(exc).lower()
        if "text too long" not in err and "chunk-level api" not in err:
            raise
        return self._synthesize_tts_chunks(
            split_text_every_two_sentences(cleaned, max_len=480),
            out_path,
            voice=voice,
        )
908
+
909
  @spaces.GPU
910
  def build_lesson_and_quiz(self, pdf_path: str, character_cfg: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
911
  self.ensure_vl_loaded()
 
1023
  def synthesize_tts(self, text: str, name_prefix: str = "audio", *, voice: Optional[str] = None) -> str:
1024
  self.ensure_tts_loaded()
1025
  out_path = str(TMP_DIR / f"{name_prefix}_{uuid.uuid4().hex}.wav")
1026
+ if USE_MOCK_TTS:
1027
  return write_tone_wav(text, out_path)
1028
  return self._real_tts(text, out_path, voice=voice)
1029
 
 
1497
  submit_interactive = quiz_ready and not state.get("completed", False)
1498
  radio_interactive = submit_interactive
1499
  lecture_tts_ready = bool(state.get("lecture_text"))
1500
+ selected_paragraph_value = str(state.get("selected_paragraph_idx", "")).strip() or None
1501
  if state.get("completed"):
1502
  radio_interactive = False
1503
  return (
 
1510
  gr.update(visible=show_explain_page),
1511
  gr.update(visible=show_exam_page),
1512
  state.get("status", "Idle"),
1513
+ build_clickable_lecture_html(state.get("lecture_text", ""), str(state.get("selected_paragraph_idx", ""))),
1514
+ gr.update(
1515
+ choices=paragraph_picker_choices(state.get("lecture_text", "")),
1516
+ value=selected_paragraph_value,
1517
+ interactive=lecture_tts_ready,
1518
+ visible=lecture_tts_ready,
1519
+ ),
1520
  state.get("lecture_audio_path", None),
1521
  gr.update(interactive=lecture_tts_ready),
1522
  gr.update(visible=lecture_tts_ready, interactive=lecture_tts_ready),
 
1553
 
1554
  state["lecture_text"] = lecture_text
1555
  state["lecture_audio_path"] = None
1556
+ state["selected_paragraph_idx"] = ""
1557
  state["explanation_audio_path"] = None
1558
  state["last_explanation_tts_text"] = ""
1559
  state["pdf_path"] = pdf_file
 
1801
  state["character_id"] = cfg["id"]
1802
  state["current_page"] = "explain"
1803
  state["lecture_audio_path"] = None
1804
+ state["selected_paragraph_idx"] = ""
1805
  state["explanation_audio_path"] = None
1806
  state["last_explanation_tts_text"] = ""
1807
  # Keep generated content if user wants to compare, but hide result pages until next generate.
 
1829
  def play_lecture_audio(state: Dict[str, Any]):
1830
  if not state.get("lecture_text"):
1831
  state["status"] = "No lecture text available."
1832
+ return (
1833
+ state,
1834
+ state["status"],
1835
+ state.get("lecture_audio_path"),
1836
+ "Generate lecture first.",
1837
+ build_clickable_lecture_html(state.get("lecture_text", ""), str(state.get("selected_paragraph_idx", ""))),
1838
+ )
1839
  backend = _tts_backend_name()
1840
  voice = tts_voice_for_character(state.get("character_id"))
1841
  try:
1842
  state["status"] = f"Generating lecture audio ({backend})..."
1843
  state["lecture_audio_path"] = engine.synthesize_tts(state["lecture_text"], name_prefix="lecture", voice=voice)
1844
  state["status"] = "Lecture audio ready."
1845
+ return (
1846
+ state,
1847
+ state["status"],
1848
+ state["lecture_audio_path"],
1849
+ f"Lecture audio generated via `{backend}`.",
1850
+ build_clickable_lecture_html(state.get("lecture_text", ""), str(state.get("selected_paragraph_idx", ""))),
1851
+ )
1852
  except Exception as exc:
1853
  state["status"] = "Lecture audio generation failed."
1854
  return (
 
1856
  state["status"],
1857
  state.get("lecture_audio_path"),
1858
  f"TTS error via `{backend}`: {type(exc).__name__}: {exc}",
1859
+ build_clickable_lecture_html(state.get("lecture_text", ""), str(state.get("selected_paragraph_idx", ""))),
1860
  )
1861
 
1862
 
 
1868
  return [p.strip() for p in pieces if p and p.strip()]
1869
 
1870
 
1871
def paragraph_picker_choices(lecture_text: str) -> List[tuple[str, str]]:
    """Build (label, value) dropdown choices, one per lecture paragraph.

    Each label is "Chunk N: <preview>" with a whitespace-normalized preview
    capped at ~110 characters; the value is the paragraph index as a string.
    """
    choices: List[tuple[str, str]] = []
    for idx, paragraph in enumerate(split_lecture_paragraphs(lecture_text)):
        preview = re.sub(r"\s+", " ", str(paragraph or "")).strip()
        if len(preview) > 110:
            # Truncate and add an ellipsis so long paragraphs stay readable.
            preview = preview[:107].rstrip() + "..."
        choices.append((f"Chunk {idx + 1}: {preview}", str(idx)))
    return choices
1880
+
1881
+
1882
def build_clickable_lecture_html(lecture_text: str, selected_idx: str = "") -> str:
    """Render the lecture text as clickable paragraph ``<div>`` elements.

    Each paragraph div carries its index in ``data-idx`` plus an onclick hook
    (``window.__lectureSelectParagraph``).  The paragraph matching
    *selected_idx* is highlighted both with the ``is-selected`` class and
    duplicated inline styles (the inline copy survives re-renders that drop
    injected CSS).
    """
    paragraphs = split_lecture_paragraphs(lecture_text)
    if not paragraphs:
        return '<div class="lecture-empty">Generated lecture explanation will appear here...</div>'
    selected = str(selected_idx or "").strip()
    highlight_style = (
        "background: #f97316 !important; "
        "border-color: #f97316 !important; "
        "box-shadow: 0 0 0 1px rgba(255,255,255,0.16) inset !important; "
        "color: #ffffff !important;"
    )
    parts: List[str] = ['<div class="lecture-clickable">']
    for i, p in enumerate(paragraphs):
        safe = html.escape(p, quote=False).replace("\n", "<br>")
        is_selected = bool(selected) and selected == str(i)
        selected_cls = " is-selected" if is_selected else ""
        selected_style = highlight_style if is_selected else ""
        parts.append(
            f'<div class="lecture-paragraph{selected_cls}" data-idx="{i}" '
            f'style="{selected_style}" '
            f'onclick="window.__lectureSelectParagraph && window.__lectureSelectParagraph({i}, this, true);">{safe}</div>'
        )
    parts.append("</div>")
    return "".join(parts)
1906
 
 
1910
  paragraphs = split_lecture_paragraphs(str(lecture_text or ""))
1911
  if not paragraphs:
1912
  state["status"] = "暂无讲解内容。"
1913
+ return (
1914
+ state,
1915
+ state.get("status", "Idle"),
1916
+ state.get("lecture_audio_path"),
1917
+ "请先生成讲解。",
1918
+ build_clickable_lecture_html(state.get("lecture_text", ""), str(state.get("selected_paragraph_idx", ""))),
1919
+ )
1920
 
1921
  try:
1922
  idx = int(str(paragraph_idx or "").strip())
 
1924
  idx = -1
1925
  if idx < 0 or idx >= len(paragraphs):
1926
  state["status"] = "段落选择无效。"
1927
+ return (
1928
+ state,
1929
+ state.get("status", "Idle"),
1930
+ state.get("lecture_audio_path"),
1931
+ "请重新点击要播放的段落。",
1932
+ build_clickable_lecture_html(state.get("lecture_text", ""), str(state.get("selected_paragraph_idx", ""))),
1933
+ )
1934
 
1935
  backend = _tts_backend_name()
1936
  voice = tts_voice_for_character(state.get("character_id"))
1937
  try:
1938
+ state["selected_paragraph_idx"] = str(idx)
1939
  state["status"] = f"正在生成段落语音({backend})..."
1940
  audio_path = engine.synthesize_tts(
1941
  paragraphs[idx],
 
1944
  )
1945
  state["lecture_audio_path"] = audio_path
1946
  state["status"] = "段落语音已生成。"
1947
+ char_len = len(paragraphs[idx])
1948
+ return (
1949
+ state,
1950
+ state["status"],
1951
+ audio_path,
1952
+ f"已生成第 {idx+1}/{len(paragraphs)} 段语音({char_len} 字符),可在下方播放。",
1953
+ build_clickable_lecture_html(state.get("lecture_text", ""), str(state.get("selected_paragraph_idx", ""))),
1954
+ )
1955
  except Exception as exc:
1956
  state["status"] = "段落语音生成失败。"
1957
+ return (
1958
+ state,
1959
+ state["status"],
1960
+ state.get("lecture_audio_path"),
1961
+ f"TTS error via `{backend}`: {type(exc).__name__}: {exc}",
1962
+ build_clickable_lecture_html(state.get("lecture_text", ""), str(state.get("selected_paragraph_idx", ""))),
1963
+ )
1964
 
1965
 
1966
  def play_explanation_audio(state: Dict[str, Any]):
 
1979
  return state, state["status"], state.get("explanation_audio_path"), f"TTS error: {type(exc).__name__}: {exc}"
1980
 
1981
 
1982
def on_play_lecture_audio_click(state: Dict[str, Any]):
    """UI handler: generate full-lecture audio, then sync the paragraph picker.

    Wraps play_lecture_audio() and appends a gr.update that keeps the
    paragraph dropdown consistent with state["selected_paragraph_idx"].
    """
    state, status, audio_path, feedback, lecture_html = play_lecture_audio(state)
    # Dropdown expects None (no selection) rather than an empty string.
    picker_value = str(state.get("selected_paragraph_idx", "")).strip() or None
    return (
        state,
        status,
        audio_path,
        feedback,
        lecture_html,
        gr.update(value=picker_value),
    )
1993
+
1994
+
1995
def on_play_paragraph_click(paragraph_idx: str, state: Dict[str, Any]):
    """UI handler: synthesize one paragraph's audio, then sync the picker.

    Wraps play_lecture_paragraph_audio() and appends a gr.update keeping the
    paragraph dropdown consistent with state["selected_paragraph_idx"].
    """
    state, status, audio_path, feedback, lecture_html = play_lecture_paragraph_audio(paragraph_idx, state)
    # Dropdown expects None (no selection) rather than an empty string.
    picker_value = str(state.get("selected_paragraph_idx", "")).strip() or None
    return (
        state,
        status,
        audio_path,
        feedback,
        lecture_html,
        gr.update(value=picker_value),
    )
2006
+
2007
+
2008
  def build_css() -> str:
2009
  bg_css = ""
2010
 
 
2310
  line-height: 1.45 !important;
2311
  color: rgba(244,246,251,0.95) !important;
2312
  }}
2313
+ #lecture-clickable,
2314
+ #lecture-clickable .html-container,
2315
+ #lecture-clickable .html-container *,
2316
+ #lecture-clickable .lecture-clickable,
2317
+ #lecture-clickable .lecture-clickable * {{
2318
+ pointer-events: auto !important;
2319
+ }}
2320
  #lecture-clickable .lecture-paragraph {{
2321
  cursor: pointer;
2322
+ pointer-events: auto !important;
2323
  padding: 10px 12px;
2324
  border-radius: 14px;
2325
  margin: 0 0 10px 0;
 
2333
  background: rgba(255,255,255,0.08);
2334
  border-color: rgba(255,255,255,0.14);
2335
  }}
2336
+ #lecture-clickable .lecture-paragraph.is-selected {{
2337
+ background: #f97316 !important;
2338
+ border-color: #f97316 !important;
2339
+ box-shadow: 0 0 0 1px rgba(255,255,255,0.16) inset !important;
2340
+ color: #ffffff !important;
2341
+ }}
2342
+ #lecture-clickable .lecture-paragraph[data-selected="1"] {{
2343
+ background: #f97316 !important;
2344
+ border-color: #f97316 !important;
2345
+ box-shadow: 0 0 0 1px rgba(255,255,255,0.16) inset !important;
2346
+ color: #ffffff !important;
2347
+ }}
2348
  .lecture-empty {{
2349
  padding: 10px 12px;
2350
  color: rgba(244,246,251,0.72);
 
2567
  background: rgba(15, 23, 42, 0.06);
2568
  border-color: rgba(15, 23, 42, 0.16);
2569
  }}
2570
+ #lecture-clickable .lecture-paragraph.is-selected {{
2571
+ background: #f97316 !important;
2572
+ border-color: #f97316 !important;
2573
+ box-shadow: 0 0 0 1px rgba(255,255,255,0.18) inset !important;
2574
+ color: #ffffff !important;
2575
+ }}
2576
+ #lecture-clickable .lecture-paragraph[data-selected="1"] {{
2577
+ background: #f97316 !important;
2578
+ border-color: #f97316 !important;
2579
+ box-shadow: 0 0 0 1px rgba(255,255,255,0.18) inset !important;
2580
+ color: #ffffff !important;
2581
+ }}
2582
  .lecture-empty {{
2583
  color: rgba(15, 23, 42, 0.72);
2584
  }}
 
2951
  () => {
2952
  const state = window.__lectureClickTtsGlobal || (window.__lectureClickTtsGlobal = {});
2953
  if (state.bound) return;
2954
+ try {
2955
  const grRoot = (typeof window.gradioApp === "function") ? window.gradioApp() : null;
2956
  const rootCandidates = [
2957
  document,
 
2998
  state.observer = new MutationObserver(() => bindAudioLoading());
2999
  state.observer.observe(document.body, { childList: true, subtree: true, attributes: true });
3000
  }
3001
+ const selectParagraph = (idx, para, autoPlay) => {
3002
+ const indexText = String(idx ?? "").trim();
3003
+ const selectedInlineStyle = {
3004
+ background: "#f97316",
3005
+ borderColor: "#f97316",
3006
+ boxShadow: "0 0 0 1px rgba(255,255,255,0.16) inset",
3007
+ color: "#ffffff",
3008
+ };
3009
+ for (const r of rootCandidates) {
3010
+ const nodes = r.querySelectorAll ? r.querySelectorAll("#lecture-clickable .lecture-paragraph.is-selected") : [];
3011
+ for (const node of nodes) {
3012
+ node.classList.remove("is-selected");
3013
+ node.removeAttribute("data-selected");
3014
+ if (node.style) {
3015
+ node.style.removeProperty("background");
3016
+ node.style.removeProperty("border-color");
3017
+ node.style.removeProperty("box-shadow");
3018
+ node.style.removeProperty("color");
3019
+ }
3020
  }
3021
+ }
3022
+ if (para && para.classList) {
3023
+ para.classList.add("is-selected");
3024
+ para.setAttribute("data-selected", "1");
3025
+ if (para.style) {
3026
+ para.style.setProperty("background", selectedInlineStyle.background, "important");
3027
+ para.style.setProperty("border-color", selectedInlineStyle.borderColor, "important");
3028
+ para.style.setProperty("box-shadow", selectedInlineStyle.boxShadow, "important");
3029
+ para.style.setProperty("color", selectedInlineStyle.color, "important");
3030
  }
3031
+ }
3032
+
3033
+ let input = q("#selected-paragraph textarea, #selected-paragraph input");
3034
+ if (!input) {
3035
+ const inputWrap = q("#selected-paragraph");
3036
+ input = inputWrap && inputWrap.querySelector ? inputWrap.querySelector("textarea, input") : null;
3037
+ }
3038
+ if (!input) {
3039
+ showLoading("未找到段落选择控件,请刷新页面重试。");
3040
+ return;
3041
+ }
3042
+ input.value = indexText;
3043
+ input.dispatchEvent(new Event("input", { bubbles: true }));
3044
+ input.dispatchEvent(new Event("change", { bubbles: true }));
3045
+
3046
+ if (!autoPlay) return;
3047
+ let btn = q("#play-paragraph-btn button, #play-paragraph-btn");
3048
+ if (btn && btn.querySelector && btn.tagName !== "BUTTON") {
3049
+ const innerBtn = btn.querySelector("button");
3050
+ if (innerBtn) btn = innerBtn;
3051
+ }
3052
+ if (!btn) {
3053
+ showLoading("未找到段落播放控件,请刷新页面重试。");
3054
+ return;
3055
+ }
3056
+ showLoading("正在生成语音...");
3057
+ btn.click();
3058
+ };
3059
+ window.__lectureSelectParagraph = (idx, el, autoPlay = true) => {
3060
+ selectParagraph(idx, el, autoPlay);
3061
+ };
3062
+
3063
+ const paragraphFromEvent = (e) => {
3064
+ const target = e ? e.target : null;
3065
+ if (target && target.nodeType === 1 && target.closest) {
3066
+ const p = target.closest(".lecture-paragraph");
3067
+ if (p) return p;
3068
+ }
3069
+ const path = (e && typeof e.composedPath === "function") ? e.composedPath() : [];
3070
+ for (const n of path) {
3071
+ if (n && n.classList && n.classList.contains("lecture-paragraph")) return n;
3072
+ }
3073
+ return null;
3074
+ };
3075
+
3076
+ const onParagraphClick = (e) => {
3077
+ const para = paragraphFromEvent(e);
3078
+ if (!para) return;
3079
+ const idx = para.getAttribute("data-idx");
3080
+ if (typeof idx !== "string" || idx.trim() === "") return;
3081
+ selectParagraph(idx, para, true);
3082
+ };
3083
+ const bindClickRoot = (root) => {
3084
+ if (!root || !root.addEventListener) return;
3085
+ if (root.__lectureClickBound) return;
3086
+ root.__lectureClickBound = true;
3087
+ root.addEventListener("click", onParagraphClick, true);
3088
+ };
3089
+
3090
+ for (const r of rootCandidates) bindClickRoot(r);
3091
+ bindClickRoot(window);
3092
+
3093
+ if (!state.rebindObserver) {
3094
+ state.rebindObserver = new MutationObserver(() => {
3095
+ const nextRoot = (typeof window.gradioApp === "function") ? window.gradioApp() : null;
3096
+ for (const r of [document, nextRoot && nextRoot.shadowRoot ? nextRoot.shadowRoot : null, nextRoot]) {
3097
+ bindClickRoot(r);
3098
+ }
3099
+ });
3100
+ state.rebindObserver.observe(document.body, { childList: true, subtree: true });
3101
+ }
3102
+ state.bound = true;
3103
+ } catch (err) {
3104
+ state.bound = false;
3105
+ try { console.error("lecture click bridge failed:", err); } catch (_) {}
3106
+ }
3107
  }
3108
  """,
3109
  )
 
3121
  )
3122
  with gr.Row(elem_id="lecture-actions"):
3123
  play_lecture_btn = gr.Button("Play Lecture Audio", interactive=False, scale=0)
3124
+ gr.Markdown("提示:可直接点击段落播放;若浏览器拦截点击,请使用下方 Chunk selector。", elem_id="paragraph-tts-tip")
3125
+ paragraph_picker = gr.Radio(
3126
+ choices=[],
3127
+ value=None,
3128
+ interactive=False,
3129
+ visible=False,
3130
+ label="Chunks (fallback selector)",
3131
+ elem_id="paragraph-picker",
3132
+ )
3133
  lecture_feedback = gr.Markdown("")
3134
  with gr.Row(elem_id="exam-entry-wrap"):
3135
  exam_btn = gr.Button("Go to Exam", interactive=False, variant="secondary", scale=0)
 
3245
  exam_page,
3246
  status_box,
3247
  lecture_box,
3248
+ paragraph_picker,
3249
  lecture_audio,
3250
  play_lecture_btn,
3251
  exam_btn,
 
3272
  submit_btn.click(fn=submit_answer, inputs=[choice_radio, state], outputs=outputs, show_progress="hidden")
3273
  restart_btn.click(fn=restart_quiz, inputs=[state], outputs=outputs, show_progress="hidden")
3274
  play_lecture_btn.click(
3275
+ fn=on_play_lecture_audio_click,
3276
  inputs=[state],
3277
+ outputs=[state, status_box, lecture_audio, lecture_feedback, lecture_box, paragraph_picker],
3278
  show_progress="minimal",
3279
  )
3280
  play_paragraph_btn.click(
3281
+ fn=on_play_paragraph_click,
3282
  inputs=[paragraph_idx, state],
3283
+ outputs=[state, status_box, lecture_audio, lecture_feedback, lecture_box, paragraph_picker],
3284
+ show_progress="minimal",
3285
+ )
3286
+ paragraph_picker.change(
3287
+ fn=on_play_paragraph_click,
3288
+ inputs=[paragraph_picker, state],
3289
+ outputs=[state, status_box, lecture_audio, lecture_feedback, lecture_box, paragraph_picker],
3290
  show_progress="minimal",
3291
  )
3292