Spaces:

jonathanjordan21
/

tts-rvc-autopst

Runtime error

App Files Files Community

jonathanjordan21 committed on Jul 19, 2024

Commit

f28083c

verified ·

1 Parent(s): ad6dc1b

Update app.py

Browse files

Files changed (1) hide show

app.py +152 -39

app.py CHANGED Viewed

@@ -16,6 +16,8 @@ from collections import OrderedDict
 from onmt_modules.misc import sequence_mask
 from model_autopst import Generator_2 as Predictor
 from hparams_autopst import hparams
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -71,6 +73,10 @@ model = build_model().to(device)
 checkpoint = torch.load(hf_hub_download(repo_id="jonathanjordan21/AutoPST", filename="checkpoint_step001000000_ema.pth"), map_location=torch.device('cpu'))
 model.load_state_dict(checkpoint["state_dict"])
 # for name, sp in spect_vc.items():
 #     print(name)
@@ -81,57 +87,164 @@ model.load_state_dict(checkpoint["state_dict"])
-def respond(
-    message,
-    history: list[tuple[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-):
-    messages = [{"role": "system", "content": system_message}]
-    for val in history:
-        if val[0]:
-            messages.append({"role": "user", "content": val[0]})
-        if val[1]:
-            messages.append({"role": "assistant", "content": val[1]})
-    messages.append({"role": "user", "content": message})
-    response = ""
-    for message in client.chat_completion(
-        messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
-    ):
-        token = message.choices[0].delta.content
-        response += token
-        yield response
 """
 For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
 """
-demo = gr.ChatInterface(
-    respond,
-    additional_inputs=[
-        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-p (nucleus sampling)",
-        ),
     ],
 )
 if __name__ == "__main__":
     demo.launch()

 from onmt_modules.misc import sequence_mask
 from model_autopst import Generator_2 as Predictor
 from hparams_autopst import hparams
+from model_sea import Generator
+from hparams_sea import hparams as sea_hparams
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 checkpoint = torch.load(hf_hub_download(repo_id="jonathanjordan21/AutoPST", filename="checkpoint_step001000000_ema.pth"), map_location=torch.device('cpu'))
 model.load_state_dict(checkpoint["state_dict"])
+# sea_checkpoint = torch.load(hf_hub_download(repo_id="jonathanjordan21/AutoPST", filename='sea.ckpt'), map_location=lambda storage, loc: storage)
+# gen =Generator(sea_hparams)
+# gen.load_state_dict(sea_checkpoint['model'], strict=True)
 # for name, sp in spect_vc.items():
 #     print(name)
+# def respond(
+#     message,
+#     history: list[tuple[str, str]],
+#     system_message,
+#     max_tokens,
+#     temperature,
+#     top_p,
+# ):
+#     messages = [{"role": "system", "content": system_message}]
+#     for val in history:
+#         if val[0]:
+#             messages.append({"role": "user", "content": val[0]})
+#         if val[1]:
+#             messages.append({"role": "assistant", "content": val[1]})
+#     messages.append({"role": "user", "content": message})
+#     response = ""
+#     for message in client.chat_completion(
+#         messages,
+#         max_tokens=max_tokens,
+#         stream=True,
+#         temperature=temperature,
+#         top_p=top_p,
+#     ):
+#         token = message.choices[0].delta.content
+#         response += token
+#         yield response
 """
 For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
 """
+# demo = gr.ChatInterface(
+#     respond,
+#     additional_inputs=[
+#         gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
+#         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
+#         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
+#         gr.Slider(
+#             minimum=0.1,
+#             maximum=1.0,
+#             value=0.95,
+#             step=0.05,
+#             label="Top-p (nucleus sampling)",
+#         ),
+#     ],
+# )
+import os
+import pickle
+import numpy as np
+import soundfile as sf
+from scipy import signal
+from scipy.signal import get_window
+from librosa.filters import mel
+from numpy.random import RandomState
+def butter_highpass(cutoff, fs, order=5):
+    """Design a digital Butterworth high-pass filter.
+
+    Args:
+        cutoff: Cutoff frequency, in the same unit as ``fs``.
+        fs: Sampling rate of the signal that will be filtered.
+        order: Filter order handed to ``scipy.signal.butter``.
+
+    Returns:
+        The ``(b, a)`` numerator and denominator coefficients.
+    """
+    # scipy wants the cutoff expressed as a fraction of the Nyquist frequency.
+    nyquist = 0.5 * fs
+    return signal.butter(order, cutoff / nyquist, btype='high', analog=False)
+def pySTFT(x, fft_length=1024, hop_length=256):
+    """Return the magnitude short-time Fourier transform of ``x``.
+
+    The input is reflect-padded by ``fft_length // 2`` samples, split into
+    overlapping frames of ``fft_length`` samples taken every ``hop_length``
+    samples, multiplied by a Hann window and transformed with a real FFT.
+
+    Args:
+        x: Signal samples as a NumPy array.
+        fft_length: Frame length and FFT size.
+        hop_length: Step between the starts of consecutive frames.
+
+    Returns:
+        Magnitudes; for 1-D input the shape is
+        ``(fft_length // 2 + 1, n_frames)``.
+    """
+    padded = np.pad(x, int(fft_length//2), mode='reflect')
+    overlap = fft_length - hop_length
+    n_frames = (padded.shape[-1] - overlap) // hop_length
+    sample_stride = padded.strides[-1]
+    # Build the frame matrix as a strided view so no samples are copied:
+    # each row starts hop_length samples after the previous one.
+    frames = np.lib.stride_tricks.as_strided(
+        padded,
+        shape=padded.shape[:-1] + (n_frames, fft_length),
+        strides=padded.strides[:-1] + (hop_length * sample_stride, sample_stride),
+    )
+    window = get_window('hann', fft_length, fftbins=True)
+    spectrum = np.fft.rfft(window * frames, n=fft_length).T
+    return np.abs(spectrum)
+def create_sp(cep_real, spk_emb):
+    """Predict a spectrogram for ``cep_real`` conditioned on ``spk_emb``.
+
+    Args:
+        cep_real: NumPy array of cepstral features for the source utterance.
+            Assumed to be 2-D (frames, coefficients); only the first 14
+            coefficients are passed to the predictor -- TODO confirm layout.
+        spk_emb: NumPy array describing the target speaker.
+
+    Returns:
+        NumPy array holding the first ``len_spect[0]`` predicted frames.
+    """
+    # torch.from_numpy keeps the NumPy dtype, so float64 arrays (NumPy's
+    # default, e.g. from np.zeros) would become double tensors. Cast to
+    # float32, which is a no-op when the input is already float32.
+    # NOTE(review): assumes the predictor weights are float32 -- confirm.
+    cep_real_A = torch.from_numpy(cep_real).float().unsqueeze(0).to(device)
+    len_real_A = torch.tensor(cep_real_A.size(1)).unsqueeze(0).to(device)
+    real_mask_A = sequence_mask(len_real_A, cep_real_A.size(1)).float()
+    spk_emb_B = torch.from_numpy(spk_emb).float().unsqueeze(0).to(device)
+    # NOTE(review): `P` is not defined in the visible part of this file; it is
+    # presumably the AutoPST predictor instance created earlier -- confirm.
+    with torch.no_grad():
+        spect_output, len_spect = P.infer_onmt(cep_real_A.transpose(2,1)[:,:14,:],
+                                                real_mask_A,
+                                                len_real_A,
+                                                spk_emb_B)
+    # Keep only the valid frames of the first (and only) batch item.
+    uttr_tgt = spect_output[:len_spect[0],0,:].cpu().numpy()
+    return uttr_tgt
+def create_mel(x):
+    """Compute the mel spectrogram and normalised mel cepstrum of a waveform.
+
+    The analysis settings are fixed in the body: 16 kHz sampling rate,
+    1024-point FFT, 80 mel bands between 90 Hz and 7600 Hz, and a 30 Hz
+    high-pass filter applied before the STFT.
+
+    Args:
+        x: 1-D NumPy array of audio samples (expected at 16 kHz).
+
+    Returns:
+        Tuple ``(S, cc_norm)``: the mel spectrogram clipped to ``[0, 1]`` and
+        the cepstral features normalised with the statistics stored in
+        ``assets/mfcc_stats.pkl``.
+    """
+    mel_basis = mel(sr=16000, n_fft=1024, fmin=90, fmax=7600, n_mels=80).T
+    min_level = np.exp(-100 / 20 * np.log(10))
+    b, a = butter_highpass(30, 16000, order=5)
+    # Close the stats file deterministically instead of leaking the handle on
+    # every request. NOTE: pickle must only be used on trusted files; this one
+    # is a bundled asset.
+    with open('assets/mfcc_stats.pkl', 'rb') as stats_file:
+        mfcc_mean, mfcc_std, dctmx = pickle.load(stats_file)
+    # Append one tiny sample when the length is an exact multiple of the hop
+    # size (256).
+    if x.shape[0] % 256 == 0:
+        x = np.concatenate((x, np.array([1e-06])), axis=0)
+    y = signal.filtfilt(b, a, x)
+    D = pySTFT(y * 0.96).T
+    D_mel = np.dot(D, mel_basis)
+    D_db = 20 * np.log10(np.maximum(min_level, D_mel))
+    # mel sp
+    S = (D_db + 80) / 100
+    # mel cep (computed from the unclipped spectrogram)
+    cc_tmp = S.dot(dctmx)
+    cc_norm = (cc_tmp - mfcc_mean) / mfcc_std
+    S = np.clip(S, 0, 1)
+    # teacher code
+    # cc_torch = torch.from_numpy(cc_norm[:,0:20].astype(np.float32)).unsqueeze(0).to(device)
+    # with torch.no_grad():
+    #     codes = gen.encode(cc_torch, torch.ones_like(cc_torch[:,:,0])).squeeze(0)
+    return S, cc_norm
+def transcribe(audio, spk):
+    """Convert a recorded utterance to the selected target speaker.
+
+    Args:
+        audio: ``(sample_rate, samples)`` tuple as delivered by ``gr.Audio``;
+            ``samples`` may be integer PCM and may have a channel axis.
+        spk: 1-based speaker number (1..82) from the slider.
+
+    Returns:
+        ``(16000, waveform)`` for the Gradio audio output.
+
+    Raises:
+        gr.Error: If no audio was provided, the clip is empty, or ``spk`` is
+            outside 1..82.
+    """
+    # Local import: only `librosa.filters.mel` is imported in the visible part
+    # of the file, so the bare `librosa` name may not be in scope.
+    import librosa
+
+    if audio is None:
+        raise gr.Error("Please record or upload an audio clip first.")
+    sr, y = audio
+    # Cast before resampling: Gradio hands over integer PCM, while librosa
+    # operates on floating-point audio.
+    y = np.asarray(y, dtype=np.float32)
+    if y.ndim > 1:
+        # Multi-channel audio arrives as (samples, channels); mix down to mono.
+        y = y.mean(axis=1)
+    if y.size == 0:
+        raise gr.Error("The audio clip is empty.")
+    y = librosa.resample(y, orig_sr=sr, target_sr=16000)
+    # Peak-normalise, but leave an all-zero (silent) clip untouched instead of
+    # dividing by zero and filling it with NaNs.
+    peak = np.max(np.abs(y))
+    if peak > 0:
+        y = y / peak
+    # The slider may deliver a float; array indices must be integers.
+    spk_index = int(round(spk)) - 1
+    if not 0 <= spk_index < 82:
+        raise gr.Error("Speaker must be between 1 and 82.")
+    spk_emb = np.zeros((82,), dtype=np.float32)
+    spk_emb[spk_index] = 1
+    mel_sp, mel_cep = create_mel(y)
+    sp = create_sp(mel_cep, spk_emb)
+    waveform = wavegen(model, c=sp)
+    return 16000, waveform.numpy()
+    # return transcriber({"sampling_rate": sr, "raw": y})["text"]
+demo = gr.Interface(
+    transcribe,
+    [
+        gr.Audio(),
+        # step=1 keeps the value integral: it is used as a 1-based index into
+        # an 82-entry one-hot speaker vector.
+        gr.Slider(1, 82, value=21, step=1, label="Speaker ID", info="Choose between 1 and 82")
     ],
+    "audio",
 )
 if __name__ == "__main__":
     demo.launch()