Spaces:

nukopy
/

vallex-prototyping

Sleeping

App Files Community

nukopy committed on Oct 29, 2025

Commit

df70e48

1 Parent(s): ae18720

feat: 計測用のログの追加

Browse files

Files changed (2) hide show

apps/audio_cloning/cheched_vallex.py +12 -3
apps/audio_cloning/vallex/main.py +8 -7

apps/audio_cloning/cheched_vallex.py CHANGED Viewed

@@ -7,6 +7,7 @@ from typing import List, Optional, Tuple
 import gradio as gr
 import numpy as np
 import torch
 from .vallex import main as vallex
@@ -132,6 +133,8 @@ def refresh_prompt_choices():
     )
 def infer_from_cached_prompt(
     text: str,
     language: str,
@@ -165,7 +168,7 @@ def infer_from_cached_prompt(
     except Exception as err:  # pylint: disable=broad-except
         logger.exception("Failed to load cached prompt", exc_info=err)
         return (f"プロンプトの読み込みに失敗しました: {err}", None)
-    timings.append(("プロンプト読込", time.perf_counter() - start_time))
     lang_pr = code2lang.get(lang_code, "en")
@@ -178,6 +181,9 @@ def infer_from_cached_prompt(
     conditioned_text = f"{lang_token}{text}{lang_token}"
     phone_tokens, langs = vallex.text_tokenizer.tokenize(
         text=f"_{conditioned_text}".strip()
     )
@@ -186,7 +192,7 @@ def infer_from_cached_prompt(
     enroll_x_lens = torch.IntTensor([text_prompts.shape[-1]])
     text_tokens = torch.cat([text_prompts, text_tokens], dim=-1)
     text_tokens_lens += enroll_x_lens
-    timings.append(("テキスト準備", time.perf_counter() - start_time))
     vallex.model.to(vallex.device)
@@ -209,7 +215,7 @@ def infer_from_cached_prompt(
         else token2lang[langdropdown2token[accent]],
         best_of=5,
     )
-    timings.append(("モデル推論", time.perf_counter() - start_time))
     logger.info("Inference completed")
     start_time = time.perf_counter()
@@ -228,6 +234,9 @@ def infer_from_cached_prompt(
         f"Synthesized text: {conditioned_text}"
     )
     timing_report = "\n↓\n".join(
         f"{step}：{duration:.4f} sec" for step, duration in timings
     )

 import gradio as gr
 import numpy as np
+import spaces
 import torch
 from .vallex import main as vallex
     )
+@spaces.GPU(duration=120)
+@torch.no_grad()
 def infer_from_cached_prompt(
     text: str,
     language: str,
     except Exception as err:  # pylint: disable=broad-except
         logger.exception("Failed to load cached prompt", exc_info=err)
         return (f"プロンプトの読み込みに失敗しました: {err}", None)
+    timings.append(("[cached] 話者特徴抽出", time.perf_counter() - start_time))
     lang_pr = code2lang.get(lang_code, "en")
     conditioned_text = f"{lang_token}{text}{lang_token}"
+    timings.append(("テキスト準備", time.perf_counter() - start_time))
+    start_time = time.perf_counter()
     phone_tokens, langs = vallex.text_tokenizer.tokenize(
         text=f"_{conditioned_text}".strip()
     )
     enroll_x_lens = torch.IntTensor([text_prompts.shape[-1]])
     text_tokens = torch.cat([text_prompts, text_tokens], dim=-1)
     text_tokens_lens += enroll_x_lens
+    timings.append(("音素化/トークナイズ", time.perf_counter() - start_time))
     vallex.model.to(vallex.device)
         else token2lang[langdropdown2token[accent]],
         best_of=5,
     )
+    timings.append(("音響モデル推論", time.perf_counter() - start_time))
     logger.info("Inference completed")
     start_time = time.perf_counter()
         f"Synthesized text: {conditioned_text}"
     )
+    for step, duration in timings:
+        logger.info("%s：%.4f sec", step, duration)
     timing_report = "\n↓\n".join(
         f"{step}：{duration:.4f} sec" for step, duration in timings
     )

apps/audio_cloning/vallex/main.py CHANGED Viewed

@@ -373,7 +373,7 @@ def infer_from_audio(
     if wav_pr.ndim == 1:
         wav_pr = wav_pr.unsqueeze(0)
     assert wav_pr.ndim and wav_pr.size(0) == 1
-    timings.append(("音声前処理", time.perf_counter() - start_time))
     start_time = time.perf_counter()
     if transcript_content == "":
@@ -382,16 +382,14 @@ def infer_from_audio(
         lang_pr = langid.classify(str(transcript_content))[0]
         lang_token = lang2token[lang_pr]
         text_pr = f"{lang_token}{str(transcript_content)}{lang_token}"
-    timings.append(("プロンプト生成", time.perf_counter() - start_time))
-    start_time = time.perf_counter()
     if language == "auto-detect":
         lang_token = lang2token[langid.classify(text)[0]]
     else:
         lang_token = langdropdown2token[language]
     lang = token2lang[lang_token]
     text = lang_token + text + lang_token
-    timings.append(("言語設定", time.perf_counter() - start_time))
     # onload model
     model.to(device)
@@ -400,7 +398,7 @@ def infer_from_audio(
     # tokenize audio
     encoded_frames = tokenize_audio(audio_tokenizer, (wav_pr, sr))
     audio_prompts = encoded_frames[0][0].transpose(2, 1).to(device)
-    timings.append(("音声トークナイズ", time.perf_counter() - start_time))
     start_time = time.perf_counter()
     # tokenize text
@@ -415,7 +413,7 @@ def infer_from_audio(
     text_tokens = torch.cat([text_prompts, text_tokens], dim=-1)
     text_tokens_lens += enroll_x_lens
     lang = lang if accent == "no-accent" else token2lang[langdropdown2token[accent]]
-    timings.append(("テキストトークナイズ", time.perf_counter() - start_time))
     start_time = time.perf_counter()
     encoded_frames = model.inference(
@@ -429,7 +427,7 @@ def infer_from_audio(
         text_language=langs if accent == "no-accent" else lang,
         best_of=5,
     )
-    timings.append(("モデル推論", time.perf_counter() - start_time))
     # Decode with Vocos
     start_time = time.perf_counter()
     frames = encoded_frames.permute(2, 0, 1)
@@ -437,6 +435,9 @@ def infer_from_audio(
     samples = vocos.decode(features, bandwidth_id=torch.tensor([2], device=device))
     timings.append(("ボコーダ復号", time.perf_counter() - start_time))
     timing_report = "\n↓\n".join(
         f"{step}：{duration:.4f} sec" for step, duration in timings
     )

     if wav_pr.ndim == 1:
         wav_pr = wav_pr.unsqueeze(0)
     assert wav_pr.ndim and wav_pr.size(0) == 1
+    timings.append(("前処理", time.perf_counter() - start_time))
     start_time = time.perf_counter()
     if transcript_content == "":
         lang_pr = langid.classify(str(transcript_content))[0]
         lang_token = lang2token[lang_pr]
         text_pr = f"{lang_token}{str(transcript_content)}{lang_token}"
     if language == "auto-detect":
         lang_token = lang2token[langid.classify(text)[0]]
     else:
         lang_token = langdropdown2token[language]
     lang = token2lang[lang_token]
     text = lang_token + text + lang_token
+    timings.append(("テキスト準備", time.perf_counter() - start_time))
     # onload model
     model.to(device)
     # tokenize audio
     encoded_frames = tokenize_audio(audio_tokenizer, (wav_pr, sr))
     audio_prompts = encoded_frames[0][0].transpose(2, 1).to(device)
+    timings.append(("話者特徴抽出", time.perf_counter() - start_time))
     start_time = time.perf_counter()
     # tokenize text
     text_tokens = torch.cat([text_prompts, text_tokens], dim=-1)
     text_tokens_lens += enroll_x_lens
     lang = lang if accent == "no-accent" else token2lang[langdropdown2token[accent]]
+    timings.append(("音素化/トークナイズ", time.perf_counter() - start_time))
     start_time = time.perf_counter()
     encoded_frames = model.inference(
         text_language=langs if accent == "no-accent" else lang,
         best_of=5,
     )
+    timings.append(("音響モデル推論", time.perf_counter() - start_time))
     # Decode with Vocos
     start_time = time.perf_counter()
     frames = encoded_frames.permute(2, 0, 1)
     samples = vocos.decode(features, bandwidth_id=torch.tensor([2], device=device))
     timings.append(("ボコーダ復号", time.perf_counter() - start_time))
+    for step, duration in timings:
+        logger.info("%s：%.4f sec", step, duration)
     timing_report = "\n↓\n".join(
         f"{step}：{duration:.4f} sec" for step, duration in timings
     )