Spaces:

KevinGeng
/

Laronix_voice_quality_checking_system_FILEIO

Build error

App Files Files Community

KevinGeng committed on Oct 9, 2023

Commit

211fff4

1 Parent(s): b27c1ab

Update app.py

Browse files

Support multi-channel audio
Use a better ASR model

Files changed (1) hide show

app.py +17 -10

app.py CHANGED Viewed

@@ -1,4 +1,3 @@
 from random import sample
 import gradio as gr
 import torchaudio
@@ -10,8 +9,12 @@ import jiwer
 # ASR part
 from transformers import pipeline
-p = pipeline("automatic-speech-recognition")
 # WER part
 transformation = jiwer.Compose([
     jiwer.ToLowerCase(),
@@ -21,10 +24,10 @@ transformation = jiwer.Compose([
 ])
 # WPM part
-from transformers import Wav2Vec2PhonemeCTCTokenizer, Wav2Vec2Processor, Wav2Vec2ForCTC
 processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-xlsr-53-espeak-cv-ft")
 phoneme_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-xlsr-53-espeak-cv-ft")
 class ChangeSampleRate(nn.Module):
     def __init__(self, input_rate: int, output_rate: int):
         super().__init__()
@@ -44,7 +47,9 @@ class ChangeSampleRate(nn.Module):
 model = lightning_module.BaselineLightningModule.load_from_checkpoint("epoch=3-step=7459.ckpt").eval()
 def calc_mos(audio_path, ref):
-    wav, sr = torchaudio.load(audio_path, channels_first=True)  # Mono channel
     osr = 16_000
     batch = wav.unsqueeze(0).repeat(10, 1, 1)
     csr = ChangeSampleRate(sr, osr)
@@ -73,6 +78,7 @@ def calc_mos(audio_path, ref):
     return predic_mos, trans, wer, phone_transcription, ppm
 description ="""
 MOS prediction demo using UTMOS-strong w/o phoneme encoder model, which is trained on the main track dataset.
 This demo only accepts .wav format. Best at 16 kHz sampling rate.
@@ -83,15 +89,16 @@ Add ASR based on wav2vec-960, currently only English available.
 Add WER interface.
 """
 iface = gr.Interface(
   fn=calc_mos,
   inputs=[gr.Audio(type='filepath', label="Audio to evaluate"),
-          gr.Textbox(placeholder="Input referance here", label="Referance")],
-  outputs=[gr.Textbox(placeholder="Predicted MOS", label="Predicted MOS"),
            gr.Textbox(placeholder="Hypothesis", label="Hypothesis"),
-           gr.Textbox(placeholder="Word Error Rate", label = "WER"),
            gr.Textbox(placeholder="Predicted Phonemes", label="Predicted Phonemes"),
-           gr.Textbox(placeholder="Phonemes per minutes", label="PPM")],
   title="Laronix's Voice Quality Checking System Demo",
   description=description,
   allow_flagging="auto",

 from random import sample
 import gradio as gr
 import torchaudio
 # ASR part
 from transformers import pipeline
+# p = pipeline("automatic-speech-recognition")
+p = pipeline(
+    "automatic-speech-recognition",
+    model="KevinGeng/whipser_medium_en_PAL300_step25",
+    device=0,
+)
 # WER part
 transformation = jiwer.Compose([
     jiwer.ToLowerCase(),
 ])
 # WPM part
+from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
 processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-xlsr-53-espeak-cv-ft")
 phoneme_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-xlsr-53-espeak-cv-ft")
+# phoneme_model =  pipeline(model="facebook/wav2vec2-xlsr-53-espeak-cv-ft")
 class ChangeSampleRate(nn.Module):
     def __init__(self, input_rate: int, output_rate: int):
         super().__init__()
 model = lightning_module.BaselineLightningModule.load_from_checkpoint("epoch=3-step=7459.ckpt").eval()
 def calc_mos(audio_path, ref):
+    wav, sr = torchaudio.load(audio_path, channels_first=True)
+    if wav.shape[0] > 1:
+        wav = wav.mean(dim=0, keepdim=True) # Mono channel
     osr = 16_000
     batch = wav.unsqueeze(0).repeat(10, 1, 1)
     csr = ChangeSampleRate(sr, osr)
     return predic_mos, trans, wer, phone_transcription, ppm
 description ="""
 MOS prediction demo using UTMOS-strong w/o phoneme encoder model, which is trained on the main track dataset.
 This demo only accepts .wav format. Best at 16 kHz sampling rate.
 Add WER interface.
 """
 iface = gr.Interface(
   fn=calc_mos,
   inputs=[gr.Audio(type='filepath', label="Audio to evaluate"),
+          gr.Textbox(placeholder="Input reference here (Don't keep this empty)", label="Reference")],
+  outputs=[gr.Textbox(placeholder="Naturalness evaluation, ranged 1 to 5, the higher the better.", label="Predicted MOS"),
            gr.Textbox(placeholder="Hypothesis", label="Hypothesis"),
+           gr.Textbox(placeholder="Word Error Rate: Only valid when Reference is given", label = "WER"),
            gr.Textbox(placeholder="Predicted Phonemes", label="Predicted Phonemes"),
+           gr.Textbox(placeholder="Speaking Rate, Phonemes per minutes", label="PPM")],
   title="Laronix's Voice Quality Checking System Demo",
   description=description,
   allow_flagging="auto",