Spaces:

OpenMOSS-Team
/

MOSS-TTS-Nano

Running on Zero

App Files Files Community

Kuangwei Chen committed 4 days ago

Commit

ebd7ee5

1 Parent(s): 2d87bb9

Add normalization toggles to Space UI

Browse files

Files changed (3) hide show

app.py +23 -4
requirements.txt +1 -0
text_normalization_pipeline.py +32 -16

app.py CHANGED Viewed

@@ -26,7 +26,7 @@ except ImportError:
     spaces = _SpacesFallback()
 from nano_tts_runtime import DEFAULT_VOICE, NanoTTSService, build_default_voice_presets
-from text_normalization_pipeline import prepare_tts_request_texts
 APP_DIR = Path(__file__).resolve().parent
 CHECKPOINT_PATH = APP_DIR / "weights" / "tts"
@@ -162,6 +162,11 @@ def get_runtime_tts_service() -> NanoTTSService:
     return get_tts_service(bool(torch.cuda.is_available()))
 def preload_service() -> None:
     started_at = time.monotonic()
     service = get_runtime_tts_service()
@@ -346,6 +351,8 @@ def run_inference(
     voice: str,
     prompt_audio_path: str | None,
     selected_demo_audio_path: str | None,
     max_new_frames: int,
     voice_clone_max_text_tokens: int,
     do_sample: bool,
@@ -361,6 +368,7 @@ def run_inference(
     generated_audio_path: str | None = None
     try:
         service = get_runtime_tts_service()
         effective_prompt_audio_path = resolve_effective_prompt_audio_path(
             voice=voice,
             prompt_audio_path=prompt_audio_path,
@@ -374,8 +382,9 @@ def run_inference(
             text=normalized_text,
             prompt_text="",
             voice=voice,
-            enable_wetext=False,
-            text_normalizer_manager=None,
         )
         prompt_source = build_prompt_source_text(
             voice=voice,
@@ -471,10 +480,18 @@ def build_demo():
                 )
                 gr.Markdown(
-                    "Robust text normalization is always on. Runtime device and backbone are fixed by the Space and are not user-configurable. Uploaded reference audio overrides the selected example case."
                 )
                 with gr.Accordion("Advanced Parameters", open=False):
                     max_new_frames = gr.Slider(
                         minimum=64,
                         maximum=512,
@@ -589,6 +606,8 @@ def build_demo():
                 voice,
                 prompt_audio,
                 selected_demo_audio_path,
                 max_new_frames,
                 voice_clone_max_text_tokens,
                 do_sample,

     spaces = _SpacesFallback()
 from nano_tts_runtime import DEFAULT_VOICE, NanoTTSService, build_default_voice_presets
+from text_normalization_pipeline import WeTextProcessingManager, prepare_tts_request_texts
 APP_DIR = Path(__file__).resolve().parent
 CHECKPOINT_PATH = APP_DIR / "weights" / "tts"
     return get_tts_service(bool(torch.cuda.is_available()))
+@functools.lru_cache(maxsize=1)
+def get_text_normalizer_manager() -> WeTextProcessingManager:
+    return WeTextProcessingManager()
 def preload_service() -> None:
     started_at = time.monotonic()
     service = get_runtime_tts_service()
     voice: str,
     prompt_audio_path: str | None,
     selected_demo_audio_path: str | None,
+    enable_wetext_processing: bool,
+    enable_normalize_tts_text: bool,
     max_new_frames: int,
     voice_clone_max_text_tokens: int,
     do_sample: bool,
     generated_audio_path: str | None = None
     try:
         service = get_runtime_tts_service()
+        text_normalizer_manager = get_text_normalizer_manager() if enable_wetext_processing else None
         effective_prompt_audio_path = resolve_effective_prompt_audio_path(
             voice=voice,
             prompt_audio_path=prompt_audio_path,
             text=normalized_text,
             prompt_text="",
             voice=voice,
+            enable_wetext=bool(enable_wetext_processing),
+            enable_normalize_tts_text=bool(enable_normalize_tts_text),
+            text_normalizer_manager=text_normalizer_manager,
         )
         prompt_source = build_prompt_source_text(
             voice=voice,
                 )
                 gr.Markdown(
+                    "Runtime device and backbone are fixed by the Space and are not user-configurable. Uploaded reference audio overrides the selected example case."
                 )
                 with gr.Accordion("Advanced Parameters", open=False):
+                    enable_wetext_processing = gr.Checkbox(
+                        value=False,
+                        label="Enable WeTextProcessing",
+                    )
+                    enable_normalize_tts_text = gr.Checkbox(
+                        value=True,
+                        label="Enable normalize_tts_text",
+                    )
                     max_new_frames = gr.Slider(
                         minimum=64,
                         maximum=512,
                 voice,
                 prompt_audio,
                 selected_demo_audio_path,
+                enable_wetext_processing,
+                enable_normalize_tts_text,
                 max_new_frames,
                 voice_clone_max_text_tokens,
                 do_sample,

requirements.txt CHANGED Viewed

@@ -7,3 +7,4 @@ safetensors>=0.4.3
 soundfile>=0.13.1
 gradio==6.5.1
 spaces

 soundfile>=0.13.1
 gradio==6.5.1
 spaces
+WeTextProcessing>=1.0.4.1

text_normalization_pipeline.py CHANGED Viewed

@@ -135,6 +135,7 @@ def prepare_tts_request_texts(
     prompt_text: str,
     voice: str,
     enable_wetext: bool,
     text_normalizer_manager: WeTextProcessingManager | None,
 ) -> dict[str, object]:
     raw_text = str(text or "")
@@ -168,28 +169,43 @@ def prepare_tts_request_texts(
                 normalization_language,
             )
-    final_text = normalize_tts_text(intermediate_text)
-    final_prompt_text = normalize_tts_text(intermediate_prompt_text) if intermediate_prompt_text else ""
-    if final_text != intermediate_text:
-        logging.info(
-            "normalized text chars_before=%d chars_after=%d stage=robust_final",
-            len(intermediate_text),
-            len(final_text),
-        )
-    if intermediate_prompt_text and final_prompt_text != intermediate_prompt_text:
-        logging.info(
-            "normalized prompt_text chars_before=%d chars_after=%d stage=robust_final",
-            len(intermediate_prompt_text),
-            len(final_prompt_text),
-        )
     return {
         "text": final_text,
         "prompt_text": final_prompt_text,
         "normalized_text": final_text,
         "normalized_prompt_text": final_prompt_text,
-        "normalization_method": (f"wetext:{normalization_language}+robust" if enable_wetext else "robust"),
         "text_normalization_language": normalization_language,
-        "text_normalization_enabled": bool(enable_wetext),
     }

     prompt_text: str,
     voice: str,
     enable_wetext: bool,
+    enable_normalize_tts_text: bool,
     text_normalizer_manager: WeTextProcessingManager | None,
 ) -> dict[str, object]:
     raw_text = str(text or "")
                 normalization_language,
             )
+    if enable_normalize_tts_text:
+        final_text = normalize_tts_text(intermediate_text)
+        final_prompt_text = normalize_tts_text(intermediate_prompt_text) if intermediate_prompt_text else ""
+        if final_text != intermediate_text:
+            logging.info(
+                "normalized text chars_before=%d chars_after=%d stage=robust_final",
+                len(intermediate_text),
+                len(final_text),
+            )
+        if intermediate_prompt_text and final_prompt_text != intermediate_prompt_text:
+            logging.info(
+                "normalized prompt_text chars_before=%d chars_after=%d stage=robust_final",
+                len(intermediate_prompt_text),
+                len(final_prompt_text),
+            )
+    else:
+        final_text = intermediate_text
+        final_prompt_text = intermediate_prompt_text
+    if enable_wetext and enable_normalize_tts_text:
+        normalization_method = f"wetext:{normalization_language}+robust"
+    elif enable_wetext:
+        normalization_method = f"wetext:{normalization_language}"
+    elif enable_normalize_tts_text:
+        normalization_method = "robust"
+    else:
+        normalization_method = "none"
     return {
         "text": final_text,
         "prompt_text": final_prompt_text,
         "normalized_text": final_text,
         "normalized_prompt_text": final_prompt_text,
+        "normalization_method": normalization_method,
         "text_normalization_language": normalization_language,
+        "text_normalization_enabled": bool(enable_wetext or enable_normalize_tts_text),
+        "wetext_enabled": bool(enable_wetext),
+        "normalize_tts_text_enabled": bool(enable_normalize_tts_text),
     }