buchi-stdesign committed on
Commit
94766d1
·
verified ·
1 Parent(s): f6f2d6e

Update inference.py

Browse files
Files changed (1) hide show
  1. inference.py +55 -43
inference.py CHANGED
@@ -1,77 +1,89 @@
1
  import os
2
  import torch
3
  import numpy as np
4
- import json
5
- from scipy.io import wavfile
 
6
  from huggingface_hub import hf_hub_download
7
- from src.sbv2.synthesizer_trn import SynthesizerTrn
8
- from src.sbv2 import commons
9
  from src.sbv2 import utils
 
10
  from src.sbv2.text import text_to_sequence
11
 
12
- device = "cuda" if torch.cuda.is_available() else "cpu"
13
- model = None
14
- hps = None
15
-
16
  MODEL_REPO = os.getenv("MODEL_REPO")
17
  HF_TOKEN = os.getenv("HF_TOKEN")
18
  CACHE_DIR = "/tmp/models"
19
 
 
 
 
 
20
def load_model():
    """Download the model artifacts from the Hub and build the synthesizer.

    Populates the module-level ``model`` and ``hps`` globals as a side
    effect; callers use those globals rather than a return value.
    """
    global model, hps

    # Fetch config, weights and style vectors into the local cache dir.
    cfg_file = hf_hub_download(repo_id=MODEL_REPO, filename="config.json", token=HF_TOKEN, cache_dir=CACHE_DIR)
    weights_file = hf_hub_download(repo_id=MODEL_REPO, filename="model.safetensors", token=HF_TOKEN, cache_dir=CACHE_DIR)
    style_file = hf_hub_download(repo_id=MODEL_REPO, filename="style_vectors.npy", token=HF_TOKEN, cache_dir=CACHE_DIR)

    with open(cfg_file, "r", encoding="utf-8") as fh:
        hps = json.load(fh)

    # Fixed symbol-table size for this particular voice model.
    n_vocab = 77

    model = SynthesizerTrn(
        n_vocab,
        hps["model"]["inter_channels"],
        hps["model"]["hidden_channels"],
        hps["model"]["filter_channels"],
        hps["model"]["n_heads"],
        hps["model"]["n_layers"],
        hps["model"]["kernel_size"],
        hps["model"]["p_dropout"],
        resblock=hps["model"]["resblock"],
        resblock_kernel_sizes=hps["model"]["resblock_kernel_sizes"],
        resblock_dilation_sizes=hps["model"]["resblock_dilation_sizes"],
        upsample_rates=hps["model"]["upsample_rates"],
        upsample_initial_channel=hps["model"]["upsample_initial_channel"],
        upsample_kernel_sizes=hps["model"]["upsample_kernel_sizes"],
        gin_channels=hps["model"].get("gin_channels", 0),
    ).to(device)

    # NOTE(review): style_file is downloaded but not consumed in this
    # function — presumably used elsewhere; confirm before removing.
    _ = utils.load_checkpoint(weights_file, model, None, strict=True)
    model.eval()

    print("✅ Model loaded successfully (strict=True).")
54
-
55
-
56
def synthesize(text):
    """Synthesize *text* into a waveform using the globally loaded model.

    Returns a 1-D numpy float array of audio samples.
    Raises RuntimeError if load_model() has not been called yet.
    """
    global model, hps

    if model is None or hps is None:
        raise RuntimeError("Model not loaded!")

    cleaned = hps["data"].get("cleaned_text", True)
    token_ids = text_to_sequence(text, hps["data"]["text_cleaners"], cleaned)
    inputs = torch.LongTensor(token_ids).unsqueeze(0).to(device)

    with torch.no_grad():
        lengths = torch.LongTensor([inputs.size(1)]).to(device)
        # Multi-speaker checkpoints require a speaker id; default to 0.
        speaker = torch.LongTensor([0]).to(device) if hps["data"].get("n_speakers", 0) > 0 else None

        out = model.infer(
            inputs,
            lengths,
            sid=speaker,
            noise_scale=0.667,
            noise_scale_w=0.8,
            length_scale=1.0
        )
        audio = out[0][0, 0].data.cpu().float().numpy()

    return audio
 
 
 
 
 
1
  import os
2
  import torch
3
  import numpy as np
4
+ import soundfile as sf
5
+
6
+ from fastapi import FastAPI
7
  from huggingface_hub import hf_hub_download
8
+
 
9
  from src.sbv2 import utils
10
+ from src.sbv2.synthesizer_trn import SynthesizerTrn
11
  from src.sbv2.text import text_to_sequence
12
 
 
 
 
 
13
  MODEL_REPO = os.getenv("MODEL_REPO")
14
  HF_TOKEN = os.getenv("HF_TOKEN")
15
  CACHE_DIR = "/tmp/models"
16
 
17
+ app = FastAPI()
18
+
19
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
20
+
21
def load_model():
    """Download config.json, model.safetensors and style_vectors.npy and
    build the synthesizer on the module-level ``device``.

    Populates the ``model`` and ``hps`` globals as a side effect.
    """
    import json

    global model, hps

    # Pull the three artifacts into the local cache.
    config_path = hf_hub_download(repo_id=MODEL_REPO, filename="config.json", token=HF_TOKEN, cache_dir=CACHE_DIR)
    model_path = hf_hub_download(repo_id=MODEL_REPO, filename="model.safetensors", token=HF_TOKEN, cache_dir=CACHE_DIR)
    style_path = hf_hub_download(repo_id=MODEL_REPO, filename="style_vectors.npy", token=HF_TOKEN, cache_dir=CACHE_DIR)

    # Load the hyper-parameter config.
    with open(config_path, "r", encoding="utf-8") as f:
        hps = json.load(f)

    n_vocab = 77  # symbol count for the Koharune Ami voice

    # Aliases purely for readability of the long positional argument list.
    m = hps["model"]
    a = hps["audio"]

    model = SynthesizerTrn(
        n_vocab,
        m["p_dropout"],
        hps["data"]["segment_size"] // 2,
        m["inter_channels"],
        m["out_channels"],
        m["hidden_channels"],
        m["filter_channels"],
        m["dec_kernel_size"],
        m["enc_channels"],
        m["enc_out_channels"],
        m["enc_kernel_size"],
        m["enc_dilation_rate"],
        m["enc_n_layers"],
        m["flow_hidden_channels"],
        m["flow_kernel_size"],
        m["flow_n_layers"],
        m["flow_n_flows"],
        m["sdp_hidden_channels"],
        m["sdp_kernel_size"],
        m["sdp_n_layers"],
        m["sdp_dropout"],
        a["sampling_rate"],
        a["filter_length"],
        a["hop_length"],
        a["win_length"],
        m["resblock"],
        m["resblock_kernel_sizes"],
        m["resblock_dilation_sizes"],
        m["upsample_rates"],
        m["upsample_initial_channel"],
        m["upsample_kernel_sizes"],
        m.get("gin_channels", 0)
    ).to(device)

    # Load the safetensors weights into the freshly built module.
    utils.load_checkpoint(model_path, model, strict=True)
    model.eval()
74
 
75
@app.get("/voice")
def synthesize(text: str):
    """Synthesize *text* to speech and return the path of the written WAV.

    Query params:
        text: input text to synthesize.

    Returns:
        {"audio_path": <path>} pointing at a per-request WAV file in /tmp.

    Raises:
        RuntimeError: if load_model() has not populated the globals yet.
    """
    import tempfile

    # Fail fast with a clear error instead of a NameError from the
    # globals that load_model() is supposed to have set.
    if globals().get("model") is None or globals().get("hps") is None:
        raise RuntimeError("Model not loaded!")

    # Convert the text to a phoneme-id sequence.
    sequence = np.array(text_to_sequence(text, hps["data"]["text_cleaners"]), dtype=np.int64)
    sequence = torch.LongTensor(sequence).unsqueeze(0).to(device)

    # Inference.
    with torch.no_grad():
        audio = model.infer(sequence, noise_scale=0.667, noise_scale_w=0.8, length_scale=1.0)[0][0, 0].data.cpu().numpy()

    # Write to a unique temp file: a fixed path ("/tmp/output.wav") would be
    # clobbered by concurrent requests.
    with tempfile.NamedTemporaryFile(prefix="output_", suffix=".wav", dir="/tmp", delete=False) as tmp:
        output_path = tmp.name
    sf.write(output_path, audio, hps["audio"]["sampling_rate"])

    return {"audio_path": output_path}