Jiaxuan Yang committed on
Commit
f718c5e
·
1 Parent(s): c19cf53
Files changed (1) hide show
  1. app.py +33 -531
app.py CHANGED
@@ -1,6 +1,5 @@
1
  import base64
2
  import html
3
- import io
4
  import json
5
  import math
6
  import os
@@ -35,11 +34,6 @@ try:
35
  except Exception: # pragma: no cover
36
  pdfium = None # type: ignore
37
 
38
- try:
39
- import soundfile as sf # type: ignore
40
- except Exception: # pragma: no cover
41
- sf = None # type: ignore
42
-
43
 
44
  APP_DIR = Path(__file__).parent.resolve()
45
  TMP_DIR = APP_DIR / "tmp_outputs"
@@ -68,41 +62,11 @@ CHAT_MODEL_ID = os.getenv("QWEN_VL_MODEL_ID", "qwen-vl-max")
68
  TTS_MODEL_ID = os.getenv("QWEN_TTS_MODEL_ID", "qwen-tts")
69
  TTS_SPEAKER = os.getenv("QWEN_TTS_SPEAKER", "longxiaochun_v2")
70
  TTS_FORMAT = os.getenv("QWEN_TTS_FORMAT", "wav")
71
- TTS_BACKEND = (os.getenv("TTS_BACKEND") or "gpt_sovits_local").strip().lower()
72
  API_TIMEOUT_SEC = int(os.getenv("API_TIMEOUT_SEC", "180"))
73
  QWEN_VL_MAX_PAGES = int(os.getenv("QWEN_VL_MAX_PAGES", "4"))
74
  QWEN_VL_RENDER_SCALE = float(os.getenv("QWEN_VL_RENDER_SCALE", "1.5"))
75
  QWEN_VL_MAX_NEW_TOKENS = int(os.getenv("QWEN_VL_MAX_NEW_TOKENS", "800"))
76
  QWEN_VL_MCQ_MAX_NEW_TOKENS = int(os.getenv("QWEN_VL_MCQ_MAX_NEW_TOKENS", "1800"))
77
- GPT_SOVITS_BASE_URL = (os.getenv("GPT_SOVITS_BASE_URL") or "http://127.0.0.1:9880").rstrip("/")
78
- GPT_SOVITS_TTS_ENDPOINT = os.getenv("GPT_SOVITS_TTS_ENDPOINT", "/tts")
79
- GPT_SOVITS_SET_SOVITS_ENDPOINT = os.getenv("GPT_SOVITS_SET_SOVITS_ENDPOINT", "/set_sovits_weights")
80
- GPT_SOVITS_SET_GPT_ENDPOINT = os.getenv("GPT_SOVITS_SET_GPT_ENDPOINT", "/set_gpt_weights")
81
- GPT_SOVITS_DEFAULT_SOVITS_PATH = os.getenv(
82
- "GPT_SOVITS_DEFAULT_SOVITS_PATH",
83
- str((APP_DIR / "audio" / "s2Gv2ProPlus.pth").resolve()),
84
- )
85
- GPT_SOVITS_DEFAULT_GPT_PATH = os.getenv("GPT_SOVITS_DEFAULT_GPT_PATH", "")
86
- GPT_SOVITS_DEFAULT_REF_AUDIO_PATH = os.getenv("GPT_SOVITS_REF_AUDIO_PATH", "")
87
- GPT_SOVITS_DEFAULT_PROMPT_TEXT = os.getenv("GPT_SOVITS_PROMPT_TEXT", "")
88
- GPT_SOVITS_DEFAULT_PROMPT_LANG = os.getenv("GPT_SOVITS_PROMPT_LANG", "zh")
89
- GPT_SOVITS_DEFAULT_TEXT_LANG = os.getenv("GPT_SOVITS_TEXT_LANG", "zh")
90
- GPT_SOVITS_MEDIA_TYPE = os.getenv("GPT_SOVITS_MEDIA_TYPE", "wav")
91
- GPT_SOVITS_STREAMING_MODE = os.getenv("GPT_SOVITS_STREAMING_MODE", "0").strip() == "1"
92
- GPT_SOVITS_ROLE_MODEL_MAP_RAW = os.getenv("GPT_SOVITS_ROLE_MODEL_MAP", "")
93
-
94
-
95
- def _parse_json_dict_env(raw: str) -> Dict[str, Any]:
96
- if not raw.strip():
97
- return {}
98
- try:
99
- data = json.loads(raw)
100
- except Exception:
101
- return {}
102
- return data if isinstance(data, dict) else {}
103
-
104
-
105
- GPT_SOVITS_ROLE_MODEL_MAP = _parse_json_dict_env(GPT_SOVITS_ROLE_MODEL_MAP_RAW)
106
 
107
 
108
  DEFAULT_LECTURE_PROMPT_TEMPLATE = """
@@ -221,14 +185,6 @@ def load_character_configs() -> Dict[str, Dict[str, Any]]:
221
  d / str(meta.get("mcq_retry_prompt_file", "mcq_retry_prompt.txt")),
222
  DEFAULT_MCQ_RETRY_PROMPT_TEMPLATE,
223
  ),
224
- # Optional local GPT-SoVITS overrides.
225
- "voice_model": str(meta.get("voice_model", meta.get("display_name", cid))),
226
- "gpt_sovits_sovits_path": str(meta.get("gpt_sovits_sovits_path", "")).strip(),
227
- "gpt_sovits_gpt_path": str(meta.get("gpt_sovits_gpt_path", "")).strip(),
228
- "gpt_sovits_ref_audio_path": str(meta.get("gpt_sovits_ref_audio_path", "")).strip(),
229
- "gpt_sovits_prompt_text": str(meta.get("gpt_sovits_prompt_text", "")).strip(),
230
- "gpt_sovits_prompt_lang": str(meta.get("gpt_sovits_prompt_lang", "")).strip(),
231
- "gpt_sovits_text_lang": str(meta.get("gpt_sovits_text_lang", "")).strip(),
232
  }
233
  configs[cid] = config
234
 
@@ -245,13 +201,6 @@ def load_character_configs() -> Dict[str, Dict[str, Any]]:
245
  "lecture_prompt_template": DEFAULT_LECTURE_PROMPT_TEMPLATE,
246
  "mcq_prompt_template": DEFAULT_MCQ_PROMPT_TEMPLATE,
247
  "mcq_retry_prompt_template": DEFAULT_MCQ_RETRY_PROMPT_TEMPLATE,
248
- "voice_model": "default",
249
- "gpt_sovits_sovits_path": "",
250
- "gpt_sovits_gpt_path": "",
251
- "gpt_sovits_ref_audio_path": "",
252
- "gpt_sovits_prompt_text": "",
253
- "gpt_sovits_prompt_lang": "",
254
- "gpt_sovits_text_lang": "",
255
  }
256
  return configs
257
 
@@ -266,45 +215,6 @@ def get_character_config(character_id: Optional[str]) -> Dict[str, Any]:
266
  return CHARACTER_CONFIGS[DEFAULT_CHARACTER_ID]
267
 
268
 
269
- def normalize_role_key(value: Optional[str]) -> str:
270
- s = str(value or "").strip().lower()
271
- return re.sub(r"[^a-z0-9]+", "", s)
272
-
273
-
274
- def build_role_aliases(character_id: Optional[str], character_cfg: Optional[Dict[str, Any]] = None) -> List[str]:
275
- raws: List[str] = []
276
- if character_id:
277
- raws.append(character_id)
278
- if character_cfg:
279
- for k in ["id", "display_name", "chat_label", "voice_model"]:
280
- v = character_cfg.get(k)
281
- if isinstance(v, str) and v.strip():
282
- raws.append(v.strip())
283
- seen: set[str] = set()
284
- out: List[str] = []
285
- for raw in raws:
286
- for candidate in [raw, normalize_role_key(raw)]:
287
- if not candidate or candidate in seen:
288
- continue
289
- seen.add(candidate)
290
- out.append(candidate)
291
- return out
292
-
293
-
294
- def resolve_local_path_maybe(value: Optional[str]) -> str:
295
- s = str(value or "").strip()
296
- if not s:
297
- return ""
298
- p = Path(s).expanduser()
299
- if not p.is_absolute():
300
- p = (APP_DIR / p).resolve()
301
- return str(p)
302
-
303
-
304
- def file_exists(path_str: Optional[str]) -> bool:
305
- return bool(path_str) and Path(str(path_str)).expanduser().exists()
306
-
307
-
308
  @dataclass
309
  class MCQItem:
310
  question: str
@@ -551,12 +461,9 @@ class QwenPipelineEngine:
551
 
552
  def __init__(self) -> None:
553
  self.mock_mode = USE_MOCK_MODELS
554
- self.tts_backend = TTS_BACKEND
555
  self.vl_loaded = False
556
  self.tts_loaded = False
557
  self._pdf_page_cache: Dict[str, List[str]] = {}
558
- self._loaded_sovits_weights: Optional[str] = None
559
- self._loaded_gpt_weights: Optional[str] = None
560
 
561
  def ensure_vl_loaded(self) -> None:
562
  if self.vl_loaded:
@@ -572,11 +479,6 @@ class QwenPipelineEngine:
572
  def ensure_tts_loaded(self) -> None:
573
  if self.tts_loaded:
574
  return
575
- if self.tts_backend == "gpt_sovits_local":
576
- if not GPT_SOVITS_BASE_URL:
577
- raise RuntimeError("Missing GPT_SOVITS_BASE_URL for local GPT-SoVITS TTS.")
578
- self.tts_loaded = True
579
- return
580
  if self.mock_mode:
581
  self.tts_loaded = True
582
  return
@@ -585,196 +487,6 @@ class QwenPipelineEngine:
585
  raise RuntimeError("Missing API_KEY for TTS API calls.")
586
  self.tts_loaded = True
587
 
588
- def _gptsovits_endpoint_url(self, endpoint: str) -> str:
589
- endpoint = endpoint.strip()
590
- if endpoint.startswith("http://") or endpoint.startswith("https://"):
591
- return endpoint
592
- if not endpoint.startswith("/"):
593
- endpoint = "/" + endpoint
594
- return f"{GPT_SOVITS_BASE_URL}{endpoint}"
595
-
596
- def _find_gptsovits_role_entry(
597
- self,
598
- character_id: Optional[str],
599
- character_cfg: Optional[Dict[str, Any]],
600
- ) -> Dict[str, Any]:
601
- aliases = build_role_aliases(character_id, character_cfg)
602
-
603
- for key in aliases:
604
- entry = GPT_SOVITS_ROLE_MODEL_MAP.get(key)
605
- if entry is None:
606
- continue
607
- if isinstance(entry, str):
608
- return {"sovits_path": entry}
609
- if isinstance(entry, dict):
610
- return dict(entry)
611
-
612
- norm_map: Dict[str, Any] = {}
613
- for k, v in GPT_SOVITS_ROLE_MODEL_MAP.items():
614
- nk = normalize_role_key(k)
615
- if nk and nk not in norm_map:
616
- norm_map[nk] = v
617
- for key in aliases:
618
- entry = norm_map.get(normalize_role_key(key))
619
- if entry is None:
620
- continue
621
- if isinstance(entry, str):
622
- return {"sovits_path": entry}
623
- if isinstance(entry, dict):
624
- return dict(entry)
625
-
626
- return {}
627
-
628
- def _guess_sovits_path_from_audio_dir(self, aliases: List[str]) -> str:
629
- audio_dir = APP_DIR / "audio"
630
- if not audio_dir.exists():
631
- return ""
632
- pth_files = [p for p in audio_dir.iterdir() if p.is_file() and p.suffix.lower() == ".pth"]
633
- if not pth_files:
634
- return ""
635
-
636
- alias_set = {normalize_role_key(a) for a in aliases if a}
637
- for p in pth_files:
638
- stem_norm = normalize_role_key(p.stem)
639
- if stem_norm and stem_norm in alias_set:
640
- return str(p.resolve())
641
- for p in pth_files:
642
- name_norm = normalize_role_key(p.name)
643
- if name_norm and any(a and a in name_norm for a in alias_set):
644
- return str(p.resolve())
645
- return ""
646
-
647
- def _guess_role_ref_audio_path(self, aliases: List[str]) -> str:
648
- audio_dir = APP_DIR / "audio"
649
- if not audio_dir.exists():
650
- return ""
651
- candidates = [p for p in audio_dir.iterdir() if p.is_file() and p.suffix.lower() in {".wav", ".mp3", ".flac", ".m4a"}]
652
- alias_set = {normalize_role_key(a) for a in aliases if a}
653
- for p in candidates:
654
- stem_norm = normalize_role_key(p.stem)
655
- if stem_norm and stem_norm in alias_set:
656
- return str(p.resolve())
657
- return ""
658
-
659
- def _guess_role_prompt_text(self, aliases: List[str]) -> str:
660
- audio_dir = APP_DIR / "audio"
661
- if not audio_dir.exists():
662
- return ""
663
- alias_set = {normalize_role_key(a) for a in aliases if a}
664
- for p in audio_dir.iterdir():
665
- if not p.is_file() or p.suffix.lower() != ".txt":
666
- continue
667
- stem_norm = normalize_role_key(p.stem)
668
- if stem_norm and stem_norm in alias_set:
669
- try:
670
- return p.read_text(encoding="utf-8").strip()
671
- except Exception:
672
- return ""
673
- return ""
674
-
675
- def _gptsovits_role_tts_config(
676
- self,
677
- character_id: Optional[str],
678
- character_cfg: Optional[Dict[str, Any]],
679
- ) -> Dict[str, str]:
680
- aliases = build_role_aliases(character_id, character_cfg)
681
- entry = self._find_gptsovits_role_entry(character_id, character_cfg)
682
-
683
- cfg = character_cfg or {}
684
- sovits_path = resolve_local_path_maybe(
685
- str(entry.get("sovits_path") or entry.get("model_path") or cfg.get("gpt_sovits_sovits_path") or "")
686
- )
687
- if not file_exists(sovits_path):
688
- guessed = self._guess_sovits_path_from_audio_dir(aliases)
689
- if guessed:
690
- sovits_path = guessed
691
- if not file_exists(sovits_path):
692
- sovits_path = resolve_local_path_maybe(GPT_SOVITS_DEFAULT_SOVITS_PATH)
693
-
694
- gpt_path = resolve_local_path_maybe(
695
- str(entry.get("gpt_path") or cfg.get("gpt_sovits_gpt_path") or GPT_SOVITS_DEFAULT_GPT_PATH or "")
696
- )
697
- if gpt_path and not file_exists(gpt_path):
698
- gpt_path = ""
699
-
700
- ref_audio_path = resolve_local_path_maybe(
701
- str(entry.get("ref_audio_path") or cfg.get("gpt_sovits_ref_audio_path") or GPT_SOVITS_DEFAULT_REF_AUDIO_PATH or "")
702
- )
703
- if ref_audio_path and not file_exists(ref_audio_path):
704
- ref_audio_path = ""
705
- if not ref_audio_path:
706
- guessed_ref = self._guess_role_ref_audio_path(aliases)
707
- if guessed_ref:
708
- ref_audio_path = guessed_ref
709
-
710
- prompt_text = str(
711
- entry.get("prompt_text") or cfg.get("gpt_sovits_prompt_text") or GPT_SOVITS_DEFAULT_PROMPT_TEXT or ""
712
- ).strip()
713
- if not prompt_text:
714
- prompt_text = self._guess_role_prompt_text(aliases)
715
- prompt_lang = str(
716
- entry.get("prompt_lang") or cfg.get("gpt_sovits_prompt_lang") or GPT_SOVITS_DEFAULT_PROMPT_LANG or "zh"
717
- ).strip() or "zh"
718
- text_lang = str(
719
- entry.get("text_lang") or cfg.get("gpt_sovits_text_lang") or GPT_SOVITS_DEFAULT_TEXT_LANG or "zh"
720
- ).strip() or "zh"
721
-
722
- return {
723
- "sovits_path": sovits_path,
724
- "gpt_path": gpt_path,
725
- "ref_audio_path": ref_audio_path,
726
- "prompt_text": prompt_text,
727
- "prompt_lang": prompt_lang,
728
- "text_lang": text_lang,
729
- }
730
-
731
- def _gptsovits_set_weights(self, endpoint: str, weights_path: str) -> None:
732
- if not weights_path:
733
- return
734
- url = self._gptsovits_endpoint_url(endpoint)
735
- attempts = [
736
- ("POST", {"weights_path": weights_path}),
737
- ("POST", {"path": weights_path}),
738
- ("GET", {"weights_path": weights_path}),
739
- ("GET", {"path": weights_path}),
740
- ]
741
- last_err = ""
742
- for method, payload in attempts:
743
- try:
744
- if method == "POST":
745
- resp = requests.post(url, json=payload, timeout=API_TIMEOUT_SEC)
746
- else:
747
- resp = requests.get(url, params=payload, timeout=API_TIMEOUT_SEC)
748
- if resp.status_code < 400:
749
- return
750
- last_err = f"{resp.status_code}: {resp.text[:400]}"
751
- except requests.RequestException as exc:
752
- last_err = f"{type(exc).__name__}: {exc}"
753
- raise RuntimeError(f"Failed to load GPT-SoVITS weights via {url}. Last error: {last_err}")
754
-
755
- def _gptsovits_ensure_role_model(
756
- self,
757
- character_id: Optional[str],
758
- character_cfg: Optional[Dict[str, Any]],
759
- ) -> Dict[str, str]:
760
- cfg = self._gptsovits_role_tts_config(character_id, character_cfg)
761
- sovits_path = cfg.get("sovits_path", "")
762
- if not sovits_path:
763
- raise RuntimeError(
764
- "No SoVITS weight found. Put role-specific .pth in ./audio/ or set GPT_SOVITS_DEFAULT_SOVITS_PATH."
765
- )
766
- if not file_exists(sovits_path):
767
- raise RuntimeError(f"SoVITS weight file not found: {sovits_path}")
768
- if self._loaded_sovits_weights != sovits_path:
769
- self._gptsovits_set_weights(GPT_SOVITS_SET_SOVITS_ENDPOINT, sovits_path)
770
- self._loaded_sovits_weights = sovits_path
771
-
772
- gpt_path = cfg.get("gpt_path", "")
773
- if gpt_path and self._loaded_gpt_weights != gpt_path:
774
- self._gptsovits_set_weights(GPT_SOVITS_SET_GPT_ENDPOINT, gpt_path)
775
- self._loaded_gpt_weights = gpt_path
776
- return cfg
777
-
778
  def _mock_generate_lecture(self, pdf_excerpt: str) -> str:
779
  excerpt = re.sub(r"\s+", " ", pdf_excerpt).strip()
780
  excerpt = excerpt[:1000]
@@ -958,132 +670,6 @@ class QwenPipelineEngine:
958
  chunk_paths.append(self._real_tts_single(chunk, chunk_path))
959
  return concat_wav_files(chunk_paths, out_path)
960
 
961
- def _maybe_transcode_to_wav(self, audio_bytes: bytes, out_path: str) -> str:
962
- if not audio_bytes:
963
- raise RuntimeError("Empty audio payload from GPT-SoVITS.")
964
- if audio_bytes[:4] == b"RIFF":
965
- return _save_binary_audio(audio_bytes, out_path)
966
- if sf is None:
967
- return _save_binary_audio(audio_bytes, out_path)
968
- try:
969
- data, sr = sf.read(io.BytesIO(audio_bytes))
970
- sf.write(out_path, data, sr, format="WAV")
971
- return out_path
972
- except Exception:
973
- return _save_binary_audio(audio_bytes, out_path)
974
-
975
- def _extract_audio_bytes_from_json(self, data: Dict[str, Any]) -> bytes:
976
- candidates = [
977
- data.get("audio"),
978
- data.get("audio_base64"),
979
- data.get("audioData"),
980
- (data.get("data") or {}).get("audio") if isinstance(data.get("data"), dict) else None,
981
- (data.get("output") or {}).get("audio") if isinstance(data.get("output"), dict) else None,
982
- ]
983
- for item in candidates:
984
- if isinstance(item, str) and item.strip():
985
- s = item.strip()
986
- if s.startswith("data:"):
987
- _, _, s = s.partition(",")
988
- try:
989
- return base64.b64decode(s)
990
- except Exception:
991
- continue
992
- url_candidates = [
993
- data.get("audio_url"),
994
- data.get("url"),
995
- (data.get("output") or {}).get("audio_url") if isinstance(data.get("output"), dict) else None,
996
- ]
997
- for u in url_candidates:
998
- if isinstance(u, str) and u.strip():
999
- resp = requests.get(u.strip(), timeout=API_TIMEOUT_SEC)
1000
- if resp.status_code >= 400:
1001
- raise RuntimeError(f"Failed downloading GPT-SoVITS audio URL {resp.status_code}: {resp.text[:300]}")
1002
- return resp.content
1003
- raise RuntimeError(f"GPT-SoVITS JSON response did not contain audio payload: {str(data)[:500]}")
1004
-
1005
- def _gptsovits_tts_single(
1006
- self,
1007
- text: str,
1008
- out_path: str,
1009
- role_cfg: Dict[str, str],
1010
- ) -> str:
1011
- if not text.strip():
1012
- return write_tone_wav("empty", out_path)
1013
- payload: Dict[str, Any] = {
1014
- "text": text,
1015
- "text_lang": role_cfg.get("text_lang") or "zh",
1016
- "media_type": GPT_SOVITS_MEDIA_TYPE,
1017
- "streaming_mode": GPT_SOVITS_STREAMING_MODE,
1018
- }
1019
- ref_audio_path = role_cfg.get("ref_audio_path", "").strip()
1020
- prompt_text = role_cfg.get("prompt_text", "").strip()
1021
- prompt_lang = role_cfg.get("prompt_lang", "").strip() or "zh"
1022
- if ref_audio_path:
1023
- payload["ref_audio_path"] = ref_audio_path
1024
- if prompt_text:
1025
- payload["prompt_text"] = prompt_text
1026
- payload["prompt_lang"] = prompt_lang
1027
-
1028
- url = self._gptsovits_endpoint_url(GPT_SOVITS_TTS_ENDPOINT)
1029
- last_err = ""
1030
- responses: List[requests.Response] = []
1031
- try:
1032
- responses.append(requests.post(url, json=payload, timeout=API_TIMEOUT_SEC))
1033
- except requests.RequestException as exc:
1034
- last_err = f"POST {type(exc).__name__}: {exc}"
1035
- if not responses or responses[-1].status_code in {404, 405, 422}:
1036
- try:
1037
- responses.append(requests.get(url, params=payload, timeout=API_TIMEOUT_SEC))
1038
- except requests.RequestException as exc:
1039
- last_err = f"{last_err}; GET {type(exc).__name__}: {exc}".strip("; ")
1040
-
1041
- for resp in responses:
1042
- if resp.status_code >= 400:
1043
- last_err = f"{resp.status_code}: {resp.text[:500]}"
1044
- continue
1045
- content_type = (resp.headers.get("content-type") or "").lower()
1046
- if "application/json" in content_type:
1047
- data = resp.json()
1048
- audio_bytes = self._extract_audio_bytes_from_json(data)
1049
- return self._maybe_transcode_to_wav(audio_bytes, out_path)
1050
- return self._maybe_transcode_to_wav(resp.content, out_path)
1051
-
1052
- missing_bits = []
1053
- if not ref_audio_path:
1054
- missing_bits.append("GPT_SOVITS_REF_AUDIO_PATH/ref_audio_path")
1055
- if not prompt_text:
1056
- missing_bits.append("GPT_SOVITS_PROMPT_TEXT/prompt_text")
1057
- hint = ""
1058
- if missing_bits:
1059
- hint = f" (check {', '.join(missing_bits)} for your GPT-SoVITS API setup)"
1060
- raise RuntimeError(f"GPT-SoVITS /tts request failed: {last_err}{hint}")
1061
-
1062
- def _gptsovits_tts(
1063
- self,
1064
- text: str,
1065
- out_path: str,
1066
- *,
1067
- character_id: Optional[str] = None,
1068
- character_cfg: Optional[Dict[str, Any]] = None,
1069
- ) -> str:
1070
- role_cfg = self._gptsovits_ensure_role_model(character_id, character_cfg)
1071
- # For non-WAV outputs, avoid chunking because concatenation is WAV-only.
1072
- if GPT_SOVITS_MEDIA_TYPE.lower() != "wav":
1073
- return self._gptsovits_tts_single(text, out_path, role_cfg)
1074
-
1075
- chunks = split_text_for_tts(text, max_len=220)
1076
- if not chunks:
1077
- return write_tone_wav("empty", out_path)
1078
- if len(chunks) == 1:
1079
- return self._gptsovits_tts_single(chunks[0], out_path, role_cfg)
1080
-
1081
- chunk_paths: List[str] = []
1082
- for idx, chunk in enumerate(chunks, start=1):
1083
- chunk_out = str(TMP_DIR / f"gptsovits_chunk_{idx}_{uuid.uuid4().hex}.wav")
1084
- chunk_paths.append(self._gptsovits_tts_single(chunk, chunk_out, role_cfg))
1085
- return concat_wav_files(chunk_paths, out_path)
1086
-
1087
  @spaces.GPU
1088
  def build_lesson_and_quiz(self, pdf_path: str, character_cfg: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
1089
  self.ensure_vl_loaded()
@@ -1198,18 +784,9 @@ class QwenPipelineEngine:
1198
  return rebalance_mcq_answers([asdict(q) for q in mcqs])
1199
 
1200
  @spaces.GPU
1201
- def synthesize_tts(
1202
- self,
1203
- text: str,
1204
- name_prefix: str = "audio",
1205
- *,
1206
- character_id: Optional[str] = None,
1207
- character_cfg: Optional[Dict[str, Any]] = None,
1208
- ) -> str:
1209
  self.ensure_tts_loaded()
1210
  out_path = str(TMP_DIR / f"{name_prefix}_{uuid.uuid4().hex}.wav")
1211
- if self.tts_backend == "gpt_sovits_local":
1212
- return self._gptsovits_tts(text, out_path, character_id=character_id, character_cfg=character_cfg)
1213
  if self.mock_mode:
1214
  return write_tone_wav(text, out_path)
1215
  return self._real_tts(text, out_path)
@@ -2000,15 +1577,8 @@ def play_lecture_audio(state: Dict[str, Any]):
2000
  state["status"] = "No lecture text available."
2001
  return state, state["status"], state.get("lecture_audio_path"), "Generate lecture first."
2002
  try:
2003
- character_id = str(state.get("character_id") or DEFAULT_CHARACTER_ID)
2004
- character_cfg = get_character_config(character_id)
2005
  state["status"] = "Generating lecture audio..."
2006
- state["lecture_audio_path"] = engine.synthesize_tts(
2007
- state["lecture_text"],
2008
- name_prefix="lecture",
2009
- character_id=character_id,
2010
- character_cfg=character_cfg,
2011
- )
2012
  state["status"] = "Lecture audio ready."
2013
  return state, state["status"], state["lecture_audio_path"], "Lecture audio generated."
2014
  except Exception as exc:
@@ -2022,15 +1592,8 @@ def play_explanation_audio(state: Dict[str, Any]):
2022
  state["status"] = "No explanation available for TTS."
2023
  return state, state["status"], state.get("explanation_audio_path"), "Answer a question incorrectly first."
2024
  try:
2025
- character_id = str(state.get("exam_character_id") or state.get("character_id") or DEFAULT_CHARACTER_ID)
2026
- character_cfg = get_character_config(character_id)
2027
  state["status"] = "Generating explanation audio..."
2028
- state["explanation_audio_path"] = engine.synthesize_tts(
2029
- text,
2030
- name_prefix="explanation",
2031
- character_id=character_id,
2032
- character_cfg=character_cfg,
2033
- )
2034
  state["status"] = "Explanation audio ready."
2035
  return state, state["status"], state["explanation_audio_path"], "Explanation audio generated."
2036
  except Exception as exc:
@@ -2045,14 +1608,16 @@ def build_css() -> str:
2045
  @import url('https://fonts.googleapis.com/css2?family=Instrument+Serif:ital@0;1&family=Inter:wght@400;500;600;700&display=swap');
2046
 
2047
  html, body {{
 
2048
  min-height: 100%;
2049
- height: auto;
2050
  }}
2051
  body {{
2052
  background-color: #ffffff !important;
 
2053
  font-family: "Inter", sans-serif !important;
2054
  }}
2055
  .app, #root, .gradio-container, .gradio-container > .main {{
 
2056
  background: transparent !important;
2057
  }}
2058
  .gradio-container {{
@@ -2087,8 +1652,8 @@ body {{
2087
  color: #eef1f6 !important;
2088
  }}
2089
  #page-shell {{
2090
- min-height: 100vh;
2091
- padding: 2rem 1.2rem 2rem 1.2rem;
2092
  max-width: 980px;
2093
  margin: 0 auto;
2094
  }}
@@ -2354,8 +1919,12 @@ body {{
2354
  margin-top: 0.25rem !important;
2355
  }}
2356
  #bottom-composer {{
2357
- width: min(860px, 100%);
2358
- margin: 0 auto 1rem auto;
 
 
 
 
2359
  background: rgba(24, 26, 34, 0.88);
2360
  border: 1px solid rgba(255,255,255,0.08);
2361
  border-radius: 999px;
@@ -2474,79 +2043,7 @@ body {{
2474
  border: 1px solid rgba(59, 130, 246, 0.28);
2475
  color: rgba(255, 255, 255, 0.95);
2476
  }}
2477
- #exam-picker-overlay {{
2478
- position: fixed;
2479
- inset: 0;
2480
- z-index: 200;
2481
- display: none;
2482
- align-items: center;
2483
- justify-content: center;
2484
- background: rgba(2, 6, 23, 0.55);
2485
- backdrop-filter: blur(6px);
2486
- padding: 16px;
2487
- }}
2488
- #exam-picker-overlay:not(.hide) {{
2489
- display: flex;
2490
- }}
2491
- #exam-picker-overlay.hide {{
2492
- display: none !important;
2493
- pointer-events: none !important;
2494
- }}
2495
- #exam-picker-modal {{
2496
- width: min(720px, 94vw);
2497
- border-radius: 16px;
2498
- background: rgba(14, 16, 24, 0.96);
2499
- border: 1px solid rgba(255, 255, 255, 0.12);
2500
- box-shadow: 0 18px 50px rgba(0, 0, 0, 0.45);
2501
- padding: 16px;
2502
- height: auto !important;
2503
- max-height: 320px;
2504
- overflow: hidden;
2505
- }}
2506
- #exam-picker-modal .block,
2507
- #exam-picker-modal .wrap,
2508
- #exam-picker-modal .panel {{
2509
- background: transparent !important;
2510
- border: none !important;
2511
- box-shadow: none !important;
2512
- }}
2513
- #exam-picker-title {{
2514
- font-weight: 700;
2515
- color: #f4f6fb;
2516
- margin-bottom: 10px;
2517
- }}
2518
- .exam-picker-grid {{
2519
- display: flex !important;
2520
- flex-wrap: nowrap;
2521
- gap: 12px;
2522
- }}
2523
- .exam-picker-card {{
2524
- flex: 1 1 0;
2525
- min-width: 0 !important;
2526
- border-radius: 14px;
2527
- border: 1px solid rgba(255, 255, 255, 0.14);
2528
- background: rgba(255, 255, 255, 0.06);
2529
- padding: 12px;
2530
- overflow: hidden;
2531
- transition: transform 120ms ease, border-color 120ms ease, box-shadow 120ms ease;
2532
- }}
2533
- .exam-picker-card:hover {{
2534
- transform: translateY(-2px);
2535
- border-color: rgba(59, 130, 246, 0.42);
2536
- box-shadow: 0 10px 24px rgba(0, 0, 0, 0.35);
2537
- }}
2538
- .exam-picker-avatar {{
2539
- width: 56px;
2540
- height: 56px;
2541
- border-radius: 999px;
2542
- object-fit: cover;
2543
- display: block;
2544
- margin: 0 auto 10px auto;
2545
- }}
2546
- .exam-picker-card button {{
2547
- width: 100%;
2548
- }}
2549
- @media (prefers-color-scheme: light) and (prefers-color-scheme: dark) {{
2550
  body {{
2551
  background: linear-gradient(180deg, #f5f7fb 0%, #eef2f8 100%) !important;
2552
  }}
@@ -2935,18 +2432,6 @@ with gr.Blocks(css=CSS) as demo:
2935
  container=False,
2936
  )
2937
 
2938
- with gr.Row(elem_id="bottom-composer"):
2939
- pdf_input = gr.File(
2940
- label="",
2941
- show_label=False,
2942
- file_types=[".pdf"],
2943
- type="filepath",
2944
- elem_id="pdf-uploader",
2945
- scale=7,
2946
- min_width=0,
2947
- )
2948
- run_btn = gr.Button("Generate", variant="primary", elem_id="generate-btn", scale=3, min_width=120)
2949
-
2950
  state = gr.State(new_session_state())
2951
 
2952
  loading_md = gr.HTML("", elem_id="gen-loading", visible=False)
@@ -3055,6 +2540,18 @@ with gr.Blocks(css=CSS) as demo:
3055
  score_box = gr.Textbox(label="Score", value="Score: 0 / 0", interactive=False, visible=False)
3056
  feedback_box = gr.Textbox(label="Feedback / Explanation", lines=8, interactive=False, visible=False)
3057
 
 
 
 
 
 
 
 
 
 
 
 
 
3058
  outputs = [
3059
  state,
3060
  character_header_html,
@@ -3102,4 +2599,9 @@ with gr.Blocks(css=CSS) as demo:
3102
  demo.queue()
3103
 
3104
  if __name__ == "__main__":
3105
- demo.launch()
 
 
 
 
 
 
1
  import base64
2
  import html
 
3
  import json
4
  import math
5
  import os
 
34
  except Exception: # pragma: no cover
35
  pdfium = None # type: ignore
36
 
 
 
 
 
 
37
 
38
  APP_DIR = Path(__file__).parent.resolve()
39
  TMP_DIR = APP_DIR / "tmp_outputs"
 
62
  TTS_MODEL_ID = os.getenv("QWEN_TTS_MODEL_ID", "qwen-tts")
63
  TTS_SPEAKER = os.getenv("QWEN_TTS_SPEAKER", "longxiaochun_v2")
64
  TTS_FORMAT = os.getenv("QWEN_TTS_FORMAT", "wav")
 
65
  API_TIMEOUT_SEC = int(os.getenv("API_TIMEOUT_SEC", "180"))
66
  QWEN_VL_MAX_PAGES = int(os.getenv("QWEN_VL_MAX_PAGES", "4"))
67
  QWEN_VL_RENDER_SCALE = float(os.getenv("QWEN_VL_RENDER_SCALE", "1.5"))
68
  QWEN_VL_MAX_NEW_TOKENS = int(os.getenv("QWEN_VL_MAX_NEW_TOKENS", "800"))
69
  QWEN_VL_MCQ_MAX_NEW_TOKENS = int(os.getenv("QWEN_VL_MCQ_MAX_NEW_TOKENS", "1800"))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
 
71
 
72
  DEFAULT_LECTURE_PROMPT_TEMPLATE = """
 
185
  d / str(meta.get("mcq_retry_prompt_file", "mcq_retry_prompt.txt")),
186
  DEFAULT_MCQ_RETRY_PROMPT_TEMPLATE,
187
  ),
 
 
 
 
 
 
 
 
188
  }
189
  configs[cid] = config
190
 
 
201
  "lecture_prompt_template": DEFAULT_LECTURE_PROMPT_TEMPLATE,
202
  "mcq_prompt_template": DEFAULT_MCQ_PROMPT_TEMPLATE,
203
  "mcq_retry_prompt_template": DEFAULT_MCQ_RETRY_PROMPT_TEMPLATE,
 
 
 
 
 
 
 
204
  }
205
  return configs
206
 
 
215
  return CHARACTER_CONFIGS[DEFAULT_CHARACTER_ID]
216
 
217
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
218
  @dataclass
219
  class MCQItem:
220
  question: str
 
461
 
462
  def __init__(self) -> None:
463
  self.mock_mode = USE_MOCK_MODELS
 
464
  self.vl_loaded = False
465
  self.tts_loaded = False
466
  self._pdf_page_cache: Dict[str, List[str]] = {}
 
 
467
 
468
  def ensure_vl_loaded(self) -> None:
469
  if self.vl_loaded:
 
479
  def ensure_tts_loaded(self) -> None:
480
  if self.tts_loaded:
481
  return
 
 
 
 
 
482
  if self.mock_mode:
483
  self.tts_loaded = True
484
  return
 
487
  raise RuntimeError("Missing API_KEY for TTS API calls.")
488
  self.tts_loaded = True
489
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
490
  def _mock_generate_lecture(self, pdf_excerpt: str) -> str:
491
  excerpt = re.sub(r"\s+", " ", pdf_excerpt).strip()
492
  excerpt = excerpt[:1000]
 
670
  chunk_paths.append(self._real_tts_single(chunk, chunk_path))
671
  return concat_wav_files(chunk_paths, out_path)
672
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
673
  @spaces.GPU
674
  def build_lesson_and_quiz(self, pdf_path: str, character_cfg: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
675
  self.ensure_vl_loaded()
 
784
  return rebalance_mcq_answers([asdict(q) for q in mcqs])
785
 
786
  @spaces.GPU
787
+ def synthesize_tts(self, text: str, name_prefix: str = "audio") -> str:
 
 
 
 
 
 
 
788
  self.ensure_tts_loaded()
789
  out_path = str(TMP_DIR / f"{name_prefix}_{uuid.uuid4().hex}.wav")
 
 
790
  if self.mock_mode:
791
  return write_tone_wav(text, out_path)
792
  return self._real_tts(text, out_path)
 
1577
  state["status"] = "No lecture text available."
1578
  return state, state["status"], state.get("lecture_audio_path"), "Generate lecture first."
1579
  try:
 
 
1580
  state["status"] = "Generating lecture audio..."
1581
+ state["lecture_audio_path"] = engine.synthesize_tts(state["lecture_text"], name_prefix="lecture")
 
 
 
 
 
1582
  state["status"] = "Lecture audio ready."
1583
  return state, state["status"], state["lecture_audio_path"], "Lecture audio generated."
1584
  except Exception as exc:
 
1592
  state["status"] = "No explanation available for TTS."
1593
  return state, state["status"], state.get("explanation_audio_path"), "Answer a question incorrectly first."
1594
  try:
 
 
1595
  state["status"] = "Generating explanation audio..."
1596
+ state["explanation_audio_path"] = engine.synthesize_tts(text, name_prefix="explanation")
 
 
 
 
 
1597
  state["status"] = "Explanation audio ready."
1598
  return state, state["status"], state["explanation_audio_path"], "Explanation audio generated."
1599
  except Exception as exc:
 
1608
  @import url('https://fonts.googleapis.com/css2?family=Instrument+Serif:ital@0;1&family=Inter:wght@400;500;600;700&display=swap');
1609
 
1610
  html, body {{
1611
+ height: 100%;
1612
  min-height: 100%;
 
1613
  }}
1614
  body {{
1615
  background-color: #ffffff !important;
1616
+ color: #0f172a !important;
1617
  font-family: "Inter", sans-serif !important;
1618
  }}
1619
  .app, #root, .gradio-container, .gradio-container > .main {{
1620
+ min-height: 100%;
1621
  background: transparent !important;
1622
  }}
1623
  .gradio-container {{
 
1652
  color: #eef1f6 !important;
1653
  }}
1654
  #page-shell {{
1655
+ min-height: 100%;
1656
+ padding: 2rem 1.2rem 9rem 1.2rem;
1657
  max-width: 980px;
1658
  margin: 0 auto;
1659
  }}
 
1919
  margin-top: 0.25rem !important;
1920
  }}
1921
  #bottom-composer {{
1922
+ position: fixed;
1923
+ left: 50%;
1924
+ transform: translateX(-50%);
1925
+ bottom: 18px;
1926
+ width: min(860px, calc(100vw - 28px));
1927
+ z-index: 40;
1928
  background: rgba(24, 26, 34, 0.88);
1929
  border: 1px solid rgba(255,255,255,0.08);
1930
  border-radius: 999px;
 
2043
  border: 1px solid rgba(59, 130, 246, 0.28);
2044
  color: rgba(255, 255, 255, 0.95);
2045
  }}
2046
+ @media (prefers-color-scheme: light) {{
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2047
  body {{
2048
  background: linear-gradient(180deg, #f5f7fb 0%, #eef2f8 100%) !important;
2049
  }}
 
2432
  container=False,
2433
  )
2434
 
 
 
 
 
 
 
 
 
 
 
 
 
2435
  state = gr.State(new_session_state())
2436
 
2437
  loading_md = gr.HTML("", elem_id="gen-loading", visible=False)
 
2540
  score_box = gr.Textbox(label="Score", value="Score: 0 / 0", interactive=False, visible=False)
2541
  feedback_box = gr.Textbox(label="Feedback / Explanation", lines=8, interactive=False, visible=False)
2542
 
2543
+ with gr.Row(elem_id="bottom-composer"):
2544
+ pdf_input = gr.File(
2545
+ label="",
2546
+ show_label=False,
2547
+ file_types=[".pdf"],
2548
+ type="filepath",
2549
+ elem_id="pdf-uploader",
2550
+ scale=7,
2551
+ min_width=0,
2552
+ )
2553
+ run_btn = gr.Button("Generate", variant="primary", elem_id="generate-btn", scale=3, min_width=120)
2554
+
2555
  outputs = [
2556
  state,
2557
  character_header_html,
 
2599
  demo.queue()
2600
 
2601
  if __name__ == "__main__":
2602
+ demo.launch(
2603
+ server_name="0.0.0.0",
2604
+ server_port=7860,
2605
+ css=CSS,
2606
+ ssr_mode=False,
2607
+ )