Update app.py
Browse files
app.py
CHANGED
|
@@ -7,6 +7,7 @@ import torch.nn as nn
|
|
| 7 |
import lightning_module
|
| 8 |
import pdb
|
| 9 |
import jiwer
|
|
|
|
| 10 |
# ASR part
|
| 11 |
from transformers import pipeline
|
| 12 |
p = pipeline("automatic-speech-recognition")
|
|
@@ -19,6 +20,11 @@ transformation = jiwer.Compose([
|
|
| 19 |
jiwer.ReduceToListOfListOfWords(word_delimiter=" ")
|
| 20 |
])
|
| 21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
class ChangeSampleRate(nn.Module):
|
| 23 |
def __init__(self, input_rate: int, output_rate: int):
|
| 24 |
super().__init__()
|
|
@@ -35,7 +41,8 @@ class ChangeSampleRate(nn.Module):
|
|
| 35 |
output = round_down * (1. - indices.fmod(1.)).unsqueeze(0) + round_up * indices.fmod(1.).unsqueeze(0)
|
| 36 |
return output
|
| 37 |
|
| 38 |
-
model = lightning_module.BaselineLightningModule.load_from_checkpoint("
|
|
|
|
| 39 |
def calc_mos(audio_path, ref):
|
| 40 |
wav, sr = torchaudio.load(audio_path)
|
| 41 |
osr = 16_000
|
|
@@ -46,7 +53,7 @@ def calc_mos(audio_path, ref):
|
|
| 46 |
trans = p(audio_path)["text"]
|
| 47 |
# WER
|
| 48 |
wer = jiwer.wer(ref, trans, truth_transform=transformation, hypothesis_transform=transformation)
|
| 49 |
-
|
| 50 |
batch = {
|
| 51 |
'wav': out_wavs,
|
| 52 |
'domains': torch.tensor([0]),
|
|
@@ -54,10 +61,17 @@ def calc_mos(audio_path, ref):
|
|
| 54 |
}
|
| 55 |
with torch.no_grad():
|
| 56 |
output = model(batch)
|
| 57 |
-
|
| 58 |
predic_mos = output.mean(dim=1).squeeze().detach().numpy()*2 + 3
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
|
| 60 |
-
return predic_mos, trans, wer
|
| 61 |
|
| 62 |
description ="""
|
| 63 |
MOS prediction demo using UTMOS-strong w/o phoneme encoder model, which is trained on the main track dataset.
|
|
@@ -71,9 +85,14 @@ Add WER interface.
|
|
| 71 |
|
| 72 |
iface = gr.Interface(
|
| 73 |
fn=calc_mos,
|
| 74 |
-
inputs=[gr.Audio(type='filepath'
|
| 75 |
-
|
| 76 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
description=description,
|
| 78 |
allow_flagging="auto",
|
| 79 |
)
|
|
|
|
import lightning_module
import pdb  # NOTE(review): debug-only import left in; consider removing before release
import jiwer

# ASR part: default Hugging Face speech-recognition pipeline, used to
# transcribe the uploaded audio so WER can be computed against the reference.
from transformers import pipeline
p = pipeline("automatic-speech-recognition")
|
|
|
|
| 20 |
jiwer.ReduceToListOfListOfWords(word_delimiter=" ")
|
| 21 |
])
|
| 22 |
|
# WPM part: wav2vec2 phoneme recognizer (espeak phoneme vocabulary), used
# downstream to count recognized phonemes for the phonemes-per-minute metric.
# NOTE(review): Wav2Vec2PhonemeCTCTokenizer is imported but not referenced in
# the visible code — presumably needed elsewhere; verify before removing.
from transformers import Wav2Vec2PhonemeCTCTokenizer, Wav2Vec2Processor, Wav2Vec2ForCTC
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-xlsr-53-espeak-cv-ft")
phoneme_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-xlsr-53-espeak-cv-ft")
|
| 27 |
+
|
| 28 |
class ChangeSampleRate(nn.Module):
|
| 29 |
def __init__(self, input_rate: int, output_rate: int):
|
| 30 |
super().__init__()
|
|
|
|
| 41 |
output = round_down * (1. - indices.fmod(1.)).unsqueeze(0) + round_up * indices.fmod(1.).unsqueeze(0)
|
| 42 |
return output
|
| 43 |
|
# Load the trained MOS-prediction checkpoint and switch it to eval mode for
# inference (disables dropout/batch-norm updates). The checkpoint file is
# expected alongside the app at runtime.
model = lightning_module.BaselineLightningModule.load_from_checkpoint("epoch=3-step=7459.ckpt").eval()
|
| 45 |
+
|
| 46 |
def calc_mos(audio_path, ref):
|
| 47 |
wav, sr = torchaudio.load(audio_path)
|
| 48 |
osr = 16_000
|
|
|
|
| 53 |
trans = p(audio_path)["text"]
|
| 54 |
# WER
|
| 55 |
wer = jiwer.wer(ref, trans, truth_transform=transformation, hypothesis_transform=transformation)
|
| 56 |
+
# MOS
|
| 57 |
batch = {
|
| 58 |
'wav': out_wavs,
|
| 59 |
'domains': torch.tensor([0]),
|
|
|
|
| 61 |
}
|
| 62 |
with torch.no_grad():
|
| 63 |
output = model(batch)
|
|
|
|
| 64 |
predic_mos = output.mean(dim=1).squeeze().detach().numpy()*2 + 3
|
| 65 |
+
# Phonemes per minute (PPM)
|
| 66 |
+
with torch.no_grad():
|
| 67 |
+
logits = phoneme_model(out_wavs).logits
|
| 68 |
+
phone_predicted_ids = torch.argmax(logits, dim=-1)
|
| 69 |
+
phone_transcription = processor.batch_decode(phone_predicted_ids)
|
| 70 |
+
lst_phonemes = phone_transcription[0].split(" ")
|
| 71 |
+
wav_vad = torchaudio.functional.vad(wav, sample_rate=sr)
|
| 72 |
+
ppm = len(lst_phonemes) / (wav_vad.shape[-1] / sr) * 60
|
| 73 |
|
| 74 |
+
return predic_mos, trans, wer, phone_transcription, ppm
|
| 75 |
|
| 76 |
description ="""
|
| 77 |
MOS prediction demo using UTMOS-strong w/o phoneme encoder model, which is trained on the main track dataset.
|
|
|
|
| 85 |
|
| 86 |
# Gradio UI: takes an audio file plus a reference transcript, and reports the
# predicted MOS, the ASR hypothesis, WER, predicted phonemes, and speech rate
# (phonemes per minute) — all produced by calc_mos.
# Fixes user-facing typos: "referance"/"Referance" -> reference/Reference,
# "Phonemes per minutes" -> "Phonemes per minute".
iface = gr.Interface(
    fn=calc_mos,
    inputs=[gr.Audio(type='filepath', label="Audio to evaluate"),
            gr.Textbox(placeholder="Input reference here", label="Reference")],
    outputs=[gr.Textbox(placeholder="Predicted MOS", label="Predicted MOS"),
             gr.Textbox(placeholder="Hypothesis", label="Hypothesis"),
             gr.Textbox(placeholder="Word Error Rate", label="WER"),
             gr.Textbox(placeholder="Predicted Phonemes", label="Predicted Phonemes"),
             gr.Textbox(placeholder="Phonemes per minute", label="PPM")],
    title="Laronix's Voice Quality Checking System Demo",
    description=description,
    allow_flagging="auto",
)
|