Spaces:
Sleeping
Sleeping
nukopy committed on
Commit ·
9236f5d
1
Parent(s): df70e48
fix: logger.info to print for HuggingFace Spaces logging
Browse files
apps/audio_cloning/cheched_vallex.py
CHANGED
|
@@ -154,7 +154,7 @@ def infer_from_cached_prompt(
|
|
| 154 |
timings: List[Tuple[str, float]] = []
|
| 155 |
start_time = time.perf_counter()
|
| 156 |
try:
|
| 157 |
-
|
| 158 |
prompt_data = np.load(prompt_path)
|
| 159 |
audio_tokens = torch.from_numpy(prompt_data["audio_tokens"]).to(
|
| 160 |
dtype=torch.long
|
|
@@ -201,7 +201,7 @@ def infer_from_cached_prompt(
|
|
| 201 |
audio_prompts = audio_prompts.unsqueeze(0)
|
| 202 |
|
| 203 |
start_time = time.perf_counter()
|
| 204 |
-
|
| 205 |
encoded_frames = vallex.model.inference(
|
| 206 |
text_tokens.to(vallex.device),
|
| 207 |
text_tokens_lens.to(vallex.device),
|
|
@@ -216,17 +216,17 @@ def infer_from_cached_prompt(
|
|
| 216 |
best_of=5,
|
| 217 |
)
|
| 218 |
timings.append(("音響モデル推論", time.perf_counter() - start_time))
|
| 219 |
-
|
| 220 |
|
| 221 |
start_time = time.perf_counter()
|
| 222 |
-
|
| 223 |
frames = encoded_frames.permute(2, 0, 1)
|
| 224 |
features = vallex.vocos.codes_to_features(frames)
|
| 225 |
samples = vallex.vocos.decode(
|
| 226 |
features, bandwidth_id=torch.tensor([2], device=vallex.device)
|
| 227 |
)
|
| 228 |
timings.append(("ボコーダ復号", time.perf_counter() - start_time))
|
| 229 |
-
|
| 230 |
|
| 231 |
message = (
|
| 232 |
f"Loaded cached prompt: {prompt_filename}\n"
|
|
@@ -235,12 +235,12 @@ def infer_from_cached_prompt(
|
|
| 235 |
)
|
| 236 |
|
| 237 |
for step, duration in timings:
|
| 238 |
-
|
| 239 |
|
| 240 |
timing_report = "\n↓\n".join(
|
| 241 |
f"{step}:{duration:.4f} sec" for step, duration in timings
|
| 242 |
)
|
| 243 |
-
|
| 244 |
|
| 245 |
return message, (24000, samples.squeeze(0).cpu().numpy())
|
| 246 |
|
|
|
|
| 154 |
timings: List[Tuple[str, float]] = []
|
| 155 |
start_time = time.perf_counter()
|
| 156 |
try:
|
| 157 |
+
print("Loading cached prompt from: %s", prompt_path)
|
| 158 |
prompt_data = np.load(prompt_path)
|
| 159 |
audio_tokens = torch.from_numpy(prompt_data["audio_tokens"]).to(
|
| 160 |
dtype=torch.long
|
|
|
|
| 201 |
audio_prompts = audio_prompts.unsqueeze(0)
|
| 202 |
|
| 203 |
start_time = time.perf_counter()
|
| 204 |
+
print("Start inferring from cached prompt: %s", prompt_path)
|
| 205 |
encoded_frames = vallex.model.inference(
|
| 206 |
text_tokens.to(vallex.device),
|
| 207 |
text_tokens_lens.to(vallex.device),
|
|
|
|
| 216 |
best_of=5,
|
| 217 |
)
|
| 218 |
timings.append(("音響モデル推論", time.perf_counter() - start_time))
|
| 219 |
+
print("Inference completed")
|
| 220 |
|
| 221 |
start_time = time.perf_counter()
|
| 222 |
+
print("Decoding with Vocos...")
|
| 223 |
frames = encoded_frames.permute(2, 0, 1)
|
| 224 |
features = vallex.vocos.codes_to_features(frames)
|
| 225 |
samples = vallex.vocos.decode(
|
| 226 |
features, bandwidth_id=torch.tensor([2], device=vallex.device)
|
| 227 |
)
|
| 228 |
timings.append(("ボコーダ復号", time.perf_counter() - start_time))
|
| 229 |
+
print("Decoding completed")
|
| 230 |
|
| 231 |
message = (
|
| 232 |
f"Loaded cached prompt: {prompt_filename}\n"
|
|
|
|
| 235 |
)
|
| 236 |
|
| 237 |
for step, duration in timings:
|
| 238 |
+
print("%s:%.4f sec", step, duration)
|
| 239 |
|
| 240 |
timing_report = "\n↓\n".join(
|
| 241 |
f"{step}:{duration:.4f} sec" for step, duration in timings
|
| 242 |
)
|
| 243 |
+
print("推論ステップ計測結果\n%s", timing_report)
|
| 244 |
|
| 245 |
return message, (24000, samples.squeeze(0).cpu().numpy())
|
| 246 |
|
apps/audio_cloning/main.py
CHANGED
|
@@ -15,7 +15,7 @@ def main():
|
|
| 15 |
setup_logger()
|
| 16 |
|
| 17 |
# gradio app
|
| 18 |
-
|
| 19 |
gr.Markdown("# Charamix Audio Cloning Prototype")
|
| 20 |
|
| 21 |
# zero-shot audio cloning
|
|
|
|
| 15 |
setup_logger()
|
| 16 |
|
| 17 |
# gradio app
|
| 18 |
+
print("Initializing Gradio app")
|
| 19 |
gr.Markdown("# Charamix Audio Cloning Prototype")
|
| 20 |
|
| 21 |
# zero-shot audio cloning
|
apps/audio_cloning/vallex/main.py
CHANGED
|
@@ -43,25 +43,25 @@ logger = logging.getLogger(__name__)
|
|
| 43 |
# set base directory
|
| 44 |
OUTPUT_BASE_DIR = os.getenv("HF_HOME", ".")
|
| 45 |
PREPARED_BASE_DIR = "."
|
| 46 |
-
|
| 47 |
-
|
| 48 |
|
| 49 |
# set languages
|
| 50 |
langid.set_languages(["en", "zh", "ja"])
|
| 51 |
|
| 52 |
# set nltk data path
|
| 53 |
nltk.data.path = nltk.data.path + [os.path.join(os.getcwd(), "nltk_data")]
|
| 54 |
-
|
| 55 |
|
| 56 |
# get encoding
|
| 57 |
-
|
| 58 |
"default encoding is %s,file system encoding is %s",
|
| 59 |
sys.getdefaultencoding(),
|
| 60 |
sys.getfilesystemencoding(),
|
| 61 |
)
|
| 62 |
|
| 63 |
# check python version
|
| 64 |
-
|
| 65 |
if sys.version_info[0] < 3 or sys.version_info[1] < 7:
|
| 66 |
logger.warning("The Python version is too low and may cause problems")
|
| 67 |
if platform.system().lower() == "windows":
|
|
@@ -74,7 +74,7 @@ os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
|
|
| 74 |
|
| 75 |
# set torch threads (guarded for hot-reload)
|
| 76 |
thread_count = multiprocessing.cpu_count()
|
| 77 |
-
|
| 78 |
if not getattr(torch, "_vallex_threads_configured", False):
|
| 79 |
torch.set_num_threads(thread_count)
|
| 80 |
try:
|
|
@@ -88,10 +88,10 @@ if not getattr(torch, "_vallex_threads_configured", False):
|
|
| 88 |
# gradio のリロード時に torch.set_num_iterop_threads を実行するとエラーになるので、設定済みのフラグをセット
|
| 89 |
setattr(torch, "_vallex_threads_configured", True)
|
| 90 |
else:
|
| 91 |
-
|
| 92 |
|
| 93 |
# set text tokenizer and collater
|
| 94 |
-
|
| 95 |
tokenizer_path = os.path.join(
|
| 96 |
PREPARED_BASE_DIR, "apps/audio_cloning/vallex/g2p/bpe_69.json"
|
| 97 |
)
|
|
@@ -99,13 +99,13 @@ text_tokenizer = PhonemeBpeTokenizer(tokenizer_path=tokenizer_path)
|
|
| 99 |
text_collater = get_text_token_collater()
|
| 100 |
|
| 101 |
# set device
|
| 102 |
-
|
| 103 |
device = torch.device("cpu")
|
| 104 |
if torch.cuda.is_available():
|
| 105 |
device = torch.device("cuda", 0)
|
| 106 |
# if torch.backends.mps.is_available():
|
| 107 |
# device = torch.device("mps")
|
| 108 |
-
|
| 109 |
|
| 110 |
# Download VALL-E-X model weights if not exists
|
| 111 |
OUTPUT_DIR_CHECKPOINTS = os.path.join(OUTPUT_BASE_DIR, "models/checkpoints")
|
|
@@ -127,7 +127,7 @@ if not os.path.exists(OUTPUT_PATH_CHECKPOINTS):
|
|
| 127 |
out=OUTPUT_PATH_CHECKPOINTS,
|
| 128 |
bar=wget.bar_adaptive,
|
| 129 |
)
|
| 130 |
-
|
| 131 |
except Exception as e:
|
| 132 |
logger.error("Error downloading model weights: %s", e)
|
| 133 |
raise Exception(
|
|
@@ -156,7 +156,7 @@ assert not missing_keys
|
|
| 156 |
model.eval()
|
| 157 |
|
| 158 |
# Encodec-based tokenizer: converts reference audio into discrete conditioning tokens for VALLE
|
| 159 |
-
|
| 160 |
audio_tokenizer = AudioTokenizer(device)
|
| 161 |
|
| 162 |
# Vocos vocoder: decodes VALLE's discrete acoustic codes back into a 24 kHz waveform
|
|
@@ -168,12 +168,12 @@ if not os.path.exists(OUTPUT_DIR_WHISPER):
|
|
| 168 |
os.makedirs(OUTPUT_DIR_WHISPER, exist_ok=True)
|
| 169 |
|
| 170 |
try:
|
| 171 |
-
|
| 172 |
model_name = "tiny"
|
| 173 |
whisper_model = whisper.load_model(
|
| 174 |
model_name, device="cpu", download_root=OUTPUT_DIR_WHISPER
|
| 175 |
)
|
| 176 |
-
|
| 177 |
except NotImplementedError as e:
|
| 178 |
logger.error("Error on loading Whisper model: %s", e)
|
| 179 |
raise Exception(
|
|
@@ -188,7 +188,7 @@ except Exception as e:
|
|
| 188 |
) from e
|
| 189 |
|
| 190 |
# Initialize Voice Presets
|
| 191 |
-
|
| 192 |
PRESETS_DIR = os.path.join(PREPARED_BASE_DIR, "apps/audio_cloning/vallex/presets")
|
| 193 |
preset_list = os.walk(PRESETS_DIR).__next__()[2]
|
| 194 |
preset_list = [preset[:-4] for preset in preset_list if preset.endswith(".npz")]
|
|
@@ -436,12 +436,12 @@ def infer_from_audio(
|
|
| 436 |
timings.append(("ボコーダ復号", time.perf_counter() - start_time))
|
| 437 |
|
| 438 |
for step, duration in timings:
|
| 439 |
-
|
| 440 |
|
| 441 |
timing_report = "\n↓\n".join(
|
| 442 |
f"{step}:{duration:.4f} sec" for step, duration in timings
|
| 443 |
)
|
| 444 |
-
|
| 445 |
|
| 446 |
message = f"text prompt: {text_pr}\nsythesized text: {text}"
|
| 447 |
return message, (24000, samples.squeeze(0).cpu().numpy())
|
|
|
|
| 43 |
# set base directory
|
| 44 |
OUTPUT_BASE_DIR = os.getenv("HF_HOME", ".")
|
| 45 |
PREPARED_BASE_DIR = "."
|
| 46 |
+
print("Base directory: %s", OUTPUT_BASE_DIR)
|
| 47 |
+
print("Prepared base directory: %s", PREPARED_BASE_DIR)
|
| 48 |
|
| 49 |
# set languages
|
| 50 |
langid.set_languages(["en", "zh", "ja"])
|
| 51 |
|
| 52 |
# set nltk data path
|
| 53 |
nltk.data.path = nltk.data.path + [os.path.join(os.getcwd(), "nltk_data")]
|
| 54 |
+
print("nltk_data path: %s", nltk.data.path)
|
| 55 |
|
| 56 |
# get encoding
|
| 57 |
+
print(
|
| 58 |
"default encoding is %s,file system encoding is %s",
|
| 59 |
sys.getdefaultencoding(),
|
| 60 |
sys.getfilesystemencoding(),
|
| 61 |
)
|
| 62 |
|
| 63 |
# check python version
|
| 64 |
+
print("You are using Python version %s", platform.python_version())
|
| 65 |
if sys.version_info[0] < 3 or sys.version_info[1] < 7:
|
| 66 |
logger.warning("The Python version is too low and may cause problems")
|
| 67 |
if platform.system().lower() == "windows":
|
|
|
|
| 74 |
|
| 75 |
# set torch threads (guarded for hot-reload)
|
| 76 |
thread_count = multiprocessing.cpu_count()
|
| 77 |
+
print("Use %d cpu cores for computing", thread_count)
|
| 78 |
if not getattr(torch, "_vallex_threads_configured", False):
|
| 79 |
torch.set_num_threads(thread_count)
|
| 80 |
try:
|
|
|
|
| 88 |
# gradio のリロード時に torch.set_num_iterop_threads を実行するとエラーになるので、設定済みのフラグをセット
|
| 89 |
setattr(torch, "_vallex_threads_configured", True)
|
| 90 |
else:
|
| 91 |
+
print("Torch threads already configured; skipping reconfiguration")
|
| 92 |
|
| 93 |
# set text tokenizer and collater
|
| 94 |
+
print("Setting text tokenizer and collater...")
|
| 95 |
tokenizer_path = os.path.join(
|
| 96 |
PREPARED_BASE_DIR, "apps/audio_cloning/vallex/g2p/bpe_69.json"
|
| 97 |
)
|
|
|
|
| 99 |
text_collater = get_text_token_collater()
|
| 100 |
|
| 101 |
# set device
|
| 102 |
+
print("Setting device...")
|
| 103 |
device = torch.device("cpu")
|
| 104 |
if torch.cuda.is_available():
|
| 105 |
device = torch.device("cuda", 0)
|
| 106 |
# if torch.backends.mps.is_available():
|
| 107 |
# device = torch.device("mps")
|
| 108 |
+
print("Device set to %s", device)
|
| 109 |
|
| 110 |
# Download VALL-E-X model weights if not exists
|
| 111 |
OUTPUT_DIR_CHECKPOINTS = os.path.join(OUTPUT_BASE_DIR, "models/checkpoints")
|
|
|
|
| 127 |
out=OUTPUT_PATH_CHECKPOINTS,
|
| 128 |
bar=wget.bar_adaptive,
|
| 129 |
)
|
| 130 |
+
print("Model weights downloaded successfully")
|
| 131 |
except Exception as e:
|
| 132 |
logger.error("Error downloading model weights: %s", e)
|
| 133 |
raise Exception(
|
|
|
|
| 156 |
model.eval()
|
| 157 |
|
| 158 |
# Encodec-based tokenizer: converts reference audio into discrete conditioning tokens for VALLE
|
| 159 |
+
print("Initializing Encodec-based tokenizer...")
|
| 160 |
audio_tokenizer = AudioTokenizer(device)
|
| 161 |
|
| 162 |
# Vocos vocoder: decodes VALLE's discrete acoustic codes back into a 24 kHz waveform
|
|
|
|
| 168 |
os.makedirs(OUTPUT_DIR_WHISPER, exist_ok=True)
|
| 169 |
|
| 170 |
try:
|
| 171 |
+
print("Loading Whisper model...")
|
| 172 |
model_name = "tiny"
|
| 173 |
whisper_model = whisper.load_model(
|
| 174 |
model_name, device="cpu", download_root=OUTPUT_DIR_WHISPER
|
| 175 |
)
|
| 176 |
+
print("Whisper model loaded successfully")
|
| 177 |
except NotImplementedError as e:
|
| 178 |
logger.error("Error on loading Whisper model: %s", e)
|
| 179 |
raise Exception(
|
|
|
|
| 188 |
) from e
|
| 189 |
|
| 190 |
# Initialize Voice Presets
|
| 191 |
+
print("Initializing Voice Presets...")
|
| 192 |
PRESETS_DIR = os.path.join(PREPARED_BASE_DIR, "apps/audio_cloning/vallex/presets")
|
| 193 |
preset_list = os.walk(PRESETS_DIR).__next__()[2]
|
| 194 |
preset_list = [preset[:-4] for preset in preset_list if preset.endswith(".npz")]
|
|
|
|
| 436 |
timings.append(("ボコーダ復号", time.perf_counter() - start_time))
|
| 437 |
|
| 438 |
for step, duration in timings:
|
| 439 |
+
print("%s:%.4f sec", step, duration)
|
| 440 |
|
| 441 |
timing_report = "\n↓\n".join(
|
| 442 |
f"{step}:{duration:.4f} sec" for step, duration in timings
|
| 443 |
)
|
| 444 |
+
print("推論ステップ計測結果\n%s", timing_report)
|
| 445 |
|
| 446 |
message = f"text prompt: {text_pr}\nsythesized text: {text}"
|
| 447 |
return message, (24000, samples.squeeze(0).cpu().numpy())
|