Approximetal committed
Commit c9c7e92 · verified · 1 Parent(s): 1f053ff

Upload gradio_mix.py with huggingface_hub

Files changed (1):
  gradio_mix.py (+66 -82)
gradio_mix.py CHANGED
@@ -36,16 +36,10 @@ _JIEBA_DICT = os.path.join(
 if os.path.isfile(_JIEBA_DICT):
     jieba.set_dictionary(_JIEBA_DICT)
 
-# import sys
-# sys.path.append("/cto_labs/vistring/zhaozhiyuan/code/SpeechAugment/versatile_audio_super_resolution")
-# from inference import Predictor
-
 # from inference_tts_scale import inference_one_sample as inference_tts
 import langid
-langid.set_languages(['es','pt','zh','en','de','fr','it', 'ar', 'ru', 'ja', 'ko', 'hi', 'th', 'id', 'vi'])
+langid.set_languages(['es','pt','zh','en','de','fr','it', 'ru', 'id', 'vi'])
 
-# import nltk
-# nltk.download('punkt')
 
 os.environ['CURL_CA_BUNDLE'] = ''
 DEMO_PATH = os.getenv("DEMO_PATH", "./demo")
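Note on the `langid` change: `set_languages` restricts classification to the listed codes, so dropping 'ar', 'ja', 'ko', 'hi' and 'th' changes which labels detection can return, not just a config detail. A minimal sketch of the constrained behavior (the sample strings are illustrative):

```python
import langid

# Constrain detection to the subset this commit keeps.
langid.set_languages(['es', 'pt', 'zh', 'en', 'de', 'fr', 'it', 'ru', 'id', 'vi'])

# classify() returns a (language_code, score) pair; input in a language
# outside the constrained set is mapped to the closest remaining code.
print(langid.classify("This is an English sentence."))      # e.g. ('en', ...)
print(langid.classify("Ceci est une phrase en français."))  # e.g. ('fr', ...)
```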
@@ -77,57 +71,6 @@ def seed_everything(seed):
     torch.backends.cudnn.deterministic = True
 
 
-# class AudioSR:
-#     def __init__(self, model_name):
-#         code_dir = "/cto_labs/vistring/zhaozhiyuan/code/SpeechAugment/versatile_audio_super_resolution"
-#         self.model = self.load_model(model_name, code_dir)
-#         self.sr = 48000
-#         self.chunk_size = 10.24
-#         self.overlap = 0.16
-#         self.guidance_scale = 1
-#         self.ddim_steps = 20
-#         self.multiband_ensemble = False
-
-#     def load_model(self, model_name, code_dir):
-#         import sys, json
-#         sys.path.append(code_dir)
-#         from inference import Predictor
-#         sr_model = Predictor()
-#         sr_model.setup(model_name)
-#         return sr_model
-
-#     def audiosr(self, in_wav, src_sr, tar_sr, chunk_size=10.24, overlap=0.16, seed=0, guidance_scale=1, ddim_steps=20, multiband_ensemble=False):
-#         if seed == 0:
-#             seed = random.randint(0, 2**32 - 1)
-#             print(f"Setting seed to: {seed}")
-#         print(f"overlap = {overlap}")
-#         print(f"guidance_scale = {guidance_scale}")
-#         print(f"ddim_steps = {ddim_steps}")
-#         print(f"chunk_size = {chunk_size}")
-#         print(f"multiband_ensemble = {multiband_ensemble}")
-#         print(f"in_wav.shape = {in_wav.shape}")
-
-#         in_wav = torchaudio.functional.resample(in_wav.squeeze(), src_sr, 24000)
-#         in_wav = in_wav.squeeze().numpy()
-
-#         out_wav = self.model.process_audio(
-#             in_wav, 24000,
-#             chunk_size=chunk_size,
-#             overlap=overlap,
-#             seed=seed,
-#             guidance_scale=guidance_scale,
-#             ddim_steps=ddim_steps,
-#             multiband_ensemble=multiband_ensemble,
-#         )
-#         out_wav = out_wav[:int(self.sr * in_wav.shape[0] / 24000)].T
-#         if tar_sr != self.sr:
-#             out_wav = torchaudio.functional.resample(torch.from_numpy(out_wav).squeeze(), self.sr, tar_sr)
-#         else:
-#             out_wav = torch.from_numpy(out_wav)
-#         print(f"out.shape = {out_wav.shape} tar_sr={tar_sr}")
-#         return out_wav.squeeze()
-
-
 class UVR5:
     """Small wrapper around the bundled uvr5 implementation for denoising."""
 
@@ -465,7 +408,7 @@ class WhisperxModel:
         ASR_DEVICE,
         compute_type="float32",
         asr_options={
-            "suppress_numerals": True,
+            "suppress_numerals": False,
             "max_new_tokens": None,
             "clip_timestamps": None,
             "initial_prompt": prompt,
@@ -481,10 +424,7 @@ class WhisperxModel:
         audio = load_wav(audio_info).numpy()
         if lang is None:
             lang = self.model.detect_language(audio)
-        if lang == 'zh':
-            self.model.options._replace(initial_prompt="简体中文:")
-        else:
-            self.model.options._replace(initial_prompt=None)
+
         segments = self.model.transcribe(audio, batch_size=8, language=lang)["segments"]
         transcript = " ".join([segment["text"] for segment in segments])
 
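For context on the `suppress_numerals` flip: it is a whisperX ASR option; `True` makes the decoder spell numbers out as words, `False` keeps digits verbatim. A minimal load sketch under assumed model size, device, and file name (only the `asr_options` keys are taken from the diff):

```python
import whisperx

# asr_options are merged into whisperX's decoding defaults.
model = whisperx.load_model(
    "medium",
    device="cuda",
    compute_type="float32",
    asr_options={
        "suppress_numerals": False,  # keep "42" rather than "forty two"
        "max_new_tokens": None,
        "clip_timestamps": None,
        "initial_prompt": None,
    },
)
result = model.transcribe("audio.wav", batch_size=8)
print(" ".join(seg["text"] for seg in result["segments"]))
```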
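The deleted `zh` branch was also a silent no-op: faster-whisper's transcription options object is a named tuple, and `_replace` returns a modified copy instead of mutating in place, so the result was thrown away. A tiny demonstration of the pitfall (the `Options` type is a stand-in, not whisperX's class):

```python
from typing import NamedTuple, Optional

class Options(NamedTuple):
    initial_prompt: Optional[str] = None

opts = Options()

# _replace builds a *new* tuple; without rebinding, nothing changes.
opts._replace(initial_prompt="简体中文:")
assert opts.initial_prompt is None  # result was discarded

# The effective form rebinds the name to the copy.
opts = opts._replace(initial_prompt="简体中文:")
assert opts.initial_prompt == "简体中文:"
```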
@@ -587,7 +527,7 @@ def load_models(lemas_model_name, whisper_model_name, alignment_model_name, deno
     if denoise_model_name == "UVR5":
         denoise_model = UVR5(os.path.join(str(PRETRAINED_ROOT), "uvr5"))
     elif denoise_model_name == "DeepFilterNet":
-        denoise_model = DeepFilterNet("./audio_preprocess/denoiser_model.onnx")
+        denoise_model = DeepFilterNet("./pretrained_models/denoiser_model.onnx")
 
     if alignment_model_name == "MMS":
         align_model = MMSAlignModel()
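The `DeepFilterNet` wrapper itself is outside this diff; if it loads the `.onnx` file with onnxruntime, a defensive loader along these lines (entirely an assumption about the wrapper's internals) makes the relocated path fail loudly when the file is missing:

```python
import os
import onnxruntime as ort

def load_denoiser(path: str = "./pretrained_models/denoiser_model.onnx") -> ort.InferenceSession:
    # Raise a clear error instead of onnxruntime's generic load failure.
    if not os.path.isfile(path):
        raise FileNotFoundError(f"Denoiser ONNX model not found: {path}")
    return ort.InferenceSession(path, providers=["CPUExecutionProvider"])
```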
@@ -615,18 +555,29 @@ def load_models(lemas_model_name, whisper_model_name, alignment_model_name, deno
 
     prosody_cfg = Path(CKPTS_ROOT) / "prosody_encoder" / "pretssel_cfg.json"
     prosody_ckpt = Path(CKPTS_ROOT) / "prosody_encoder" / "prosody_encoder_UnitY2.pt"
-    use_prosody = prosody_cfg.is_file() and prosody_ckpt.is_file()
+
+    # Decide whether to enable the prosody encoder:
+    #   - multilingual_prosody: True (if assets exist)
+    #   - multilingual_grl: False (GRL-only variant)
+    #   - others: fall back to presence of assets.
+    if lemas_model_name.endswith("prosody"):
+        use_prosody = prosody_cfg.is_file() and prosody_ckpt.is_file()
+    elif lemas_model_name.endswith("grl"):
+        use_prosody = False
+    else:
+        use_prosody = prosody_cfg.is_file() and prosody_ckpt.is_file()
 
     tts_edit_model = TTS(
         model=lemas_model_name,
         ckpt_file=ckpt_file,
         vocab_file=str(vocab_file),
         device=device,
-        use_ema=True,
-        frontend="phone",
         use_prosody_encoder=use_prosody,
         prosody_cfg_path=str(prosody_cfg) if use_prosody else "",
         prosody_ckpt_path=str(prosody_ckpt) if use_prosody else "",
+        ode_method="euler",
+        use_ema=True,
+        frontend="phone",
     )
     logging.info(f"Loaded LEMAS-TTS edit model from {ckpt_file}")
 
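The new branching reduces to a small pure function, which makes the three cases easy to unit-test in isolation; a standalone sketch (the function name is hypothetical):

```python
from pathlib import Path

def decide_use_prosody(model_name: str, cfg: Path, ckpt: Path) -> bool:
    """Mirror of the hunk above: GRL-only variants never use the encoder;
    prosody variants (and unknown names) require both asset files."""
    if model_name.endswith("grl"):
        return False
    return cfg.is_file() and ckpt.is_file()

# With missing assets, even the prosody variant stays disabled.
cfg, ckpt = Path("missing_cfg.json"), Path("missing_ckpt.pt")
assert decide_use_prosody("multilingual_prosody", cfg, ckpt) is False
assert decide_use_prosody("multilingual_grl", cfg, ckpt) is False
```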
@@ -819,19 +770,23 @@ def run(seed, nfe_step, speed, cfg_strength, sway_sampling_coef, ref_ratio,
 
     seed_val = None if seed == -1 else int(seed)
 
+    # Decide whether to use prosody encoder at inference based on how TTS was built
+    use_prosody_flag = bool(getattr(tts_edit_model, "use_prosody_encoder", False))
+
     wav_out, _ = gen_wav_multilingual(
         tts_edit_model,
         segment_audio,
         tts_edit_model.target_sample_rate,
         target_text,
         parts_to_edit,
+        speed=float(speed),
         nfe_step=int(nfe_step),
         cfg_strength=float(cfg_strength),
         sway_sampling_coef=float(sway_sampling_coef),
         ref_ratio=float(ref_ratio),
         no_ref_audio=False,
         use_acc_grl=False,
-        use_prosody_encoder_flag=True,
+        use_prosody_encoder_flag=use_prosody_flag,
         seed=seed_val,
     )
 
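The `getattr(..., False)` guard above means a `TTS` object built without the prosody encoder (or by an older code path that never sets the attribute) safely yields `False` rather than raising. The pattern in isolation (stand-in classes, not the real `TTS`):

```python
class LegacyTTS:
    pass  # predates the use_prosody_encoder attribute

class ProsodyTTS:
    use_prosody_encoder = True

for model in (LegacyTTS(), ProsodyTTS()):
    flag = bool(getattr(model, "use_prosody_encoder", False))
    print(type(model).__name__, flag)  # LegacyTTS False, ProsodyTTS True
```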
@@ -969,22 +924,46 @@ def get_app():
     with gr.Accordion("Select models", open=False) as models_selector:
         # For LEMAS-TTS editing, we expose a simple model selector
        # between the two multilingual variants.
-        lemas_model_choice = gr.Radio(
-            label="LEMAS-TTS Model",
-            choices=["multilingual_grl", "multilingual_prosody"],
-            value="multilingual_grl",
-            interactive=True,
-        )
         with gr.Row():
+            lemas_model_choice = gr.Radio(
+                label="Edit Model",
+                choices=["multilingual_grl", "multilingual_prosody"],
+                value="multilingual_grl",
+                interactive=True,
+                scale=3,
+            )
             denoise_model_choice = gr.Radio(label="Denoise Model", scale=2, value="UVR5", choices=["UVR5", "DeepFilterNet"])  # "830M", "330M_TTSEnhanced", "830M_TTSEnhanced"])
             # whisper_backend_choice = gr.Radio(label="Whisper backend", value="", choices=["whisperX", "whisper"])
             whisper_model_choice = gr.Radio(label="Whisper model", scale=3, value="medium", choices=["base", "small", "medium", "large"])
             align_model_choice = gr.Radio(label="Forced alignment model", scale=2, value="MMS", choices=["whisperX", "MMS"], visible=False)
-            # audiosr_choice = gr.Radio(label="AudioSR model", scale=2, value="None", choices=["basic", "speech", "None"])
 
     with gr.Row():
         with gr.Column(scale=2):
-            input_audio = gr.Audio(value=f"{DEMO_PATH}/V-00013_en-US.wav", label="Input Audio", interactive=True)
+            # Use a numpy waveform as default value to avoid Gradio's
+            # InvalidPathError with local filesystem paths.
+            _demo_value = None
+            demo_candidates = [
+                os.path.join(DEMO_PATH, "V-00013_en-US.wav"),
+                os.path.join(os.path.dirname(__file__), "..", "VoiceCraft", "demo", "V-00013_en-US.wav"),
+            ]
+            for demo_path in demo_candidates:
+                try:
+                    if not os.path.isfile(demo_path):
+                        continue
+                    _demo_wav, _demo_sr = torchaudio.load(demo_path)
+                    if _demo_wav.dim() > 1 and _demo_wav.shape[0] > 1:
+                        _demo_wav = _demo_wav.mean(dim=0, keepdim=True)
+                    _demo_value = (_demo_sr, _demo_wav.squeeze(0).numpy())
+                    break
+                except Exception:
+                    continue
+
+            input_audio = gr.Audio(
+                value=_demo_value,
+                label="Input Audio",
+                interactive=True,
+                type="numpy",
+            )
 
             with gr.Row():
                 transcribe_btn = gr.Button(value="Transcribe")
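Background on the `type="numpy"` switches in this hunk and the ones below: Gradio's `Audio` component then exchanges audio as a `(sample_rate, samples)` tuple of an `int` and a NumPy array, which is why the default value is assembled as a tuple rather than a file path. A self-contained sketch (the passthrough app is illustrative):

```python
import numpy as np
import gradio as gr

def passthrough(audio):
    # With type="numpy" the callback receives (sample_rate, samples)
    # and may return the same kind of tuple for the output widget.
    sr, samples = audio
    return sr, samples

# One second of a 440 Hz tone as the default value; no file path involved.
sr = 16000
t = np.linspace(0, 1, sr, endpoint=False)
tone = (0.3 * np.sin(2 * np.pi * 440 * t)).astype(np.float32)

demo = gr.Interface(
    fn=passthrough,
    inputs=gr.Audio(value=(sr, tone), type="numpy"),
    outputs=gr.Audio(type="numpy"),
)

if __name__ == "__main__":
    demo.launch()
```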
@@ -1000,7 +979,7 @@ def get_app():
             with gr.Row():
                 denoise_btn = gr.Button(value="Denoise")
                 cancel_btn = gr.Button(value="Cancel Denoise")
-            denoise_audio = gr.Audio(label="Denoised Audio", value=None, interactive=False)
+            denoise_audio = gr.Audio(label="Denoised Audio", value=None, interactive=False, type="numpy")
 
         with gr.Column(scale=3):
             with gr.Group():
@@ -1035,20 +1014,25 @@ def get_app():
             with gr.Row():
                 edit_start_time = gr.Slider(label="Edit from time", minimum=0, maximum=7.614, step=0.001, value=4.022)
                 edit_end_time = gr.Slider(label="Edit to time", minimum=0, maximum=7.614, step=0.001, value=5.768)
+            # Put the button and audio in separate columns so that
+            # the tall audio widget does not overlap the clickable
+            # area of the button.
             with gr.Row():
-                check_btn = gr.Button(value="Check edit words", scale=1)
-                edit_audio = gr.Audio(label="Edit word(s)", scale=3)
+                with gr.Column(scale=1):
+                    check_btn = gr.Button(value="Check edit words")
+                with gr.Column(scale=3):
+                    edit_audio = gr.Audio(label="Edit word(s)", scale=3, type="numpy")
 
             run_btn = gr.Button(value="Run", variant="primary")
 
         with gr.Column(scale=2):
-            output_audio = gr.Audio(label="Output Audio")
+            output_audio = gr.Audio(label="Output Audio", type="numpy")
             with gr.Accordion("Inference transcript", open=True):
                 inference_transcript = gr.Textbox(label="Inference transcript", lines=5, interactive=False, info="Inference was performed on this transcript.")
             with gr.Group(visible=False) as long_tts_sentence_editor:
                 sentence_selector = gr.Dropdown(label="Sentence", value=None,
                                                 info="Select sentence you want to regenerate")
-                sentence_audio = gr.Audio(label="Sentence Audio", scale=2)
+                sentence_audio = gr.Audio(label="Sentence Audio", scale=2, type="numpy")
                 rerun_btn = gr.Button(value="Rerun")
 
     with gr.Row():
@@ -1064,7 +1048,7 @@ def get_app():
             label="Speed",
             minimum=0.5,
             maximum=1.5,
-            step=0.05,
+            step=0.1,
             value=1.0,
             info="Placeholder for future use; currently not applied.",
         )