# PhonoLearn — Gradio demo for Taiwanese Hokkien pronunciation practice.
# (Removed Hugging Face Spaces page artifacts that were captured during extraction.)
import gradio as gr
import numpy as np
from difflib import Differ
import librosa
# import spaces  # [uncomment to use ZeroGPU]
import torch

# ################ CHANGE THIS TO CHANGE THE LANGUAGE ###################### #
from TaiwaneseHokkien import TaiwaneseHokkien

# Run on GPU when available; fp16 halves memory on CUDA, fp32 elsewhere.
device = "cuda" if torch.cuda.is_available() else "cpu"
model_repo_id = "emlinking/wav2vec2-large-xls-r-300m-tsm-asr-v6"
# NOTE(review): model_repo_id is never used below — presumably the checkpoint
# is loaded inside TaiwaneseHokkien; confirm and either pass it in or drop it.
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Language backend bundling the ASR model and the text-comparison helpers
# (asr(), compare(), compare_colors) used by the UI below.
language = TaiwaneseHokkien(device=device, torch_dtype=torch_dtype)
# ########################################################################## #
| # @spaces.GPU #[uncomment to use ZeroGPU] | |
| def infer( | |
| audio, | |
| target | |
| ): | |
| if type(audio) != tuple or type(target) != str: return [None, None] | |
| # preprocess | |
| sampling_rate, wav = audio | |
| if wav.ndim > 1: | |
| wav = wav.mean(axis=1) | |
| wav = wav.astype(np.float32) | |
| wav /= np.max(np.abs(wav)) | |
| wav = librosa.resample(y=wav, orig_sr=sampling_rate, target_sr=16_000) | |
| user_pron = language.asr(wav) | |
| # compare texts | |
| d_toks = language.compare(target, user_pron) | |
| return (user_pron, d_toks) | |
# Narrow, centered column layout for the demo page.
css = """
#col-container {
margin: 0 auto;
max-width: 640px;
}
"""

with gr.Blocks(css=css) as demo:
    gr.Markdown(" # PhonoLearn")
    # Reference sentence the learner is trying to pronounce.
    target = gr.Textbox(label='Practice Sentence (Tâi-lô)')
    # Learner's recording: microphone or an uploaded file.
    input_audio = gr.Audio(
        sources=["microphone", "upload"]
    )
    # ASR transcription of what the learner actually said.
    output = gr.Textbox(label='Your Pronunciation')
    # Colour-coded diff between the target text and the transcription.
    diff = gr.HighlightedText(
        label='Comparison',
        combine_adjacent=True,
        show_legend=True,
        color_map=language.compare_colors
    )
    # Re-run inference whenever the user records or uploads new audio.
    input_audio.input(fn=infer, inputs=[input_audio, target], outputs=[output, diff])

if __name__ == "__main__":
    demo.launch()