Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import subprocess, os, sys, pickle, random, librosa, huggingface_hub | |
| sys.path.append('./pronunciation-score-icelandic') | |
| from captinialign import makeAlign | |
| from captiniscore import PronunciationScorer | |
| #from captinifeedback import FeedbackConverter | |
| from hffeedback import FeedbackConverter | |
| from os.path import basename, splitext | |
| from gradio_rangeslider import RangeSlider | |
def setup():
    """Load everything the app needs once, at start-up.

    Fetches the speech featurizer model and the scoring reference data with
    ``huggingface_hub.snapshot_download``, builds the PronunciationScorer,
    and reads the exercise prompts from the task text file.

    Returns:
        A 3-tuple ``(scorer, prompts, fb_inits)``:

        * ``scorer`` -- the initialised PronunciationScorer.
        * ``prompts`` -- a list with one tuple per line of the task text
          file, holding that line's tab-separated fields.
        * ``fb_inits`` -- dict with the FeedbackConverter defaults: task key
          path (``'t_key'``), phone key path (``'p_key'``), lower bound
          (``'lb'``) and upper bound (``'ub'``).
    """
    # The speech embedding model and layer must match the pre-computed
    # scoring models.
    # A local snapshot is fetched instead of handing the repo id straight to
    # the featurizer: with kaldi==5.5.1016 the w2v2 model apparently has to
    # be downloaded locally before the pretrained weights can be loaded.
    speech_featurizer_path = huggingface_hub.snapshot_download(
        repo_id='language-and-voice-lab/wav2vec2-large-xlsr-53-icelandic-ep30-967h',
        repo_type='model')
    speech_featurizer_layer = 8

    models_data_dir = huggingface_hub.snapshot_download(
        repo_id='clr/captini-scoring-references',
        repo_type='dataset')

    task_scoring_models = os.path.join(
        models_data_dir, 'models/task_models_w2v2-IS-30e967h_l8_3EP49G/')
    task_key_path = os.path.join(models_data_dir, 'models/task_key_3EP49G.json')
    phone_key_path = os.path.join(models_data_dir, 'models/phone_key_3EP49G.tsv')
    monophone_reference_feat_path = os.path.join(
        models_data_dir, 'models/monophones/w2v2-IS-30e967h_SPLIT1.pickle')
    task_text_path = os.path.join(models_data_dir, 'models/task2text.txt')

    # Constants for converting pronunciation scores to user feedback.
    lower_bound_100 = -0.1
    upper_bound_100 = 0.03

    # PronunciationScorer takes considerable time to initialise because it
    # loads the w2v2 featurizer (faster on GPU). Once loaded it scores each
    # new recording quickly, so it is built exactly once here and must not
    # be re-created every time a user speaks.
    scorer = PronunciationScorer(
        task_scoring_models,
        speech_featurizer_path,
        speech_featurizer_layer,
        task_text_path,
        monophone_reference_feat_path)

    # Only the constructor arguments are returned; FeedbackConverter objects
    # are built from them by the callers.
    fb_inits = {'t_key': task_key_path,
                'p_key': phone_key_path,
                'lb': lower_bound_100,
                'ub': upper_bound_100}

    # The prompts contain Icelandic characters, so decode explicitly rather
    # than relying on the platform's default encoding.
    # NOTE(review): assumes the task text file is UTF-8 -- confirm.
    with open(task_text_path, 'r', encoding='utf-8') as handle:
        prompts = [tuple(line.split('\t'))
                   for line in handle.read().splitlines()]

    return scorer, prompts, fb_inits
# Pick a random prompt from the loaded prompt list and reset the display.
def refresh_prompt_fn():
    """Choose a new exercise at random and clear the previous results.

    Returns:
        A list of eight values: the ``(task_id, normed_text)`` pair, an
        update showing the sentence in the highlighted-text box, the reading
        instructions for that sentence, ``None``, the placeholder scorer-type
        line, the placeholder score heading, and two more ``None`` values.
    """
    chosen = random.choice(prompts)
    task_id, sentence, normed_text = chosen
    instructions = _user_instructions(sentence)
    highlighted_sentence = gr.update(value=[(sentence, "hl")])
    return [
        (task_id, normed_text),
        highlighted_sentence,
        instructions,
        None,
        _disp_scorertype('...'),
        "## Einkunn/gæði framburðar: ...",
        None,
        None,
    ]
| # heuristic for different instructions | |
| # based on how users should speak short vs longer prompts | |
| def _user_instructions(to_speak): | |
| if (len(to_speak.split(' '))<2) or ' – ' in to_speak: | |
| return """Lestu eftirfarandi texta. Talaðu skýrt:""" | |
| else: | |
| return """Lestu eftirfarandi texta eins og þu segir í venjulegu samtali:""" | |
def recal_fb_fn(cslider):
    """Build a FeedbackConverter from the bounds chosen on the slider.

    Args:
        cslider: a pair ``(lower, upper)`` as delivered by the range slider.

    Returns:
        A new FeedbackConverter that uses the configured task and phone key
        files together with the given lower and upper bounds.
    """
    new_lower, new_upper = cslider
    task_key = fb_params['t_key']
    phone_key = fb_params['p_key']
    return FeedbackConverter(task_key, phone_key, new_lower, new_upper)
def score_speech_fn(current_prompt, user_wav, fbc, devopts):
    """Align and score one recording and build the feedback to display.

    Args:
        current_prompt: ``(task_id, normalised_text)`` pair of the exercise
            being attempted; ``(None, None)`` until a prompt has been chosen.
        user_wav: path of the recorded audio file, or ``None`` when nothing
            has been recorded yet.
        fbc: the FeedbackConverter currently in use.
        devopts: truthy when the developer options are switched on.

    Returns:
        A 4-tuple ``(score_markdown, scorer_type_markdown, plot, blocks)``.
        ``plot`` is ``None`` when alignment produced no words. ``blocks`` is
        the blocks figure (or ``None``) when ``devopts`` is on, otherwise an
        update that clears the blocks plot.

    Raises:
        gr.Error: if no prompt has been chosen or there is no recording.
            Without these checks the scorer or the audio loader would fail
            with an unhelpful internal error.
    """
    task_id, _ = current_prompt
    if task_id is None:
        raise gr.Error("No exercise text selected yet - press the refresh button first.")
    if not user_wav:
        raise gr.Error("No recording found - record yourself reading the text first.")

    task_text, task_model = scorer.task_scorer(task_id)
    print(user_wav)
    user_wav_duration = librosa.get_duration(path=user_wav)
    word_aligns, phone_aligns = makeAlign(
        task_text,
        user_wav,
        user_wav_duration,
        splitext(basename(user_wav))[0],
        './pronunciation-score-icelandic/alignment/new/',
        './pronunciation-score-icelandic/alignment/captini_pretrained_aligner/'
    )
    print('TASK ID', task_id, 'TEXT:', task_text)

    disp_plot1 = None
    disp_blocksplot = None
    if word_aligns:
        word_scores, phone_scores = scorer.score_one(
            task_model,
            user_wav,
            word_aligns,
            phone_aligns)
        print('feedback with:', fbc.lower_bound_100, fbc.upper_bound_100)
        task_feedback, _word_feedback, phone_feedback = fbc.convert(
            word_scores,
            phone_scores,
            task_id)
        disp_fb = f'## Einkunn/gæði framburðar: {task_feedback}'
        disp_plot1 = fbc.generate_graphic_feedback_0(
            user_wav, word_aligns, phone_aligns, phone_feedback, devopts)
        # The blocks figure is only shown with developer options on, so
        # there is no point building it otherwise.
        if devopts:
            disp_blocksplot = fbc.generate_graphic_feedback_blocks(phone_scores)
    else:
        # The aligner returned nothing usable for this recording.
        disp_fb = ("Failure. If this happens every time try a different device/browser, "
                   "Kaldi speech processor does not work on some (older?) devices, "
                   "this should not be a possible issue on Spaces but unfortunately it is.")

    scorer_info = _disp_scorertype(task_model[1])
    if devopts:
        return disp_fb, scorer_info, disp_plot1, disp_blocksplot
    return disp_fb, scorer_info, disp_plot1, gr.update(value=None)
| def _disp_scorertype(scorertype): | |
| if scorertype == 'task': | |
| scoring_model_type = 'Full' | |
| elif scorertype == 'phone': | |
| scoring_model_type = 'Monophone' | |
| else: | |
| scoring_model_type = '...' | |
| return f"### Scoring model for this exercise was: [{scoring_model_type}]" | |
# Toggle the developer-option components on or off.
def dev_opts_fn(check_box, phone_output_keyinfo):
    """Show or hide the developer tools and reset the feedback calibration.

    Args:
        check_box: truthy when the developer options have just been turned
            on, falsy when they have just been turned off.
        phone_output_keyinfo: current markdown text of the colour legend.

    Returns:
        A list of eight values: the legend text with the energy entry added
        or removed, an update for the scorer-type line (reset to the
        placeholder), an update putting the calibration slider back on the
        default bounds, a FeedbackConverter built with the default bounds,
        and four updates that only set visibility.
    """
    energy_key = """, <span style='color:#F49098;'>--- Hljóðstyrkur (root mean square energy)</span>"""
    reset_fb = FeedbackConverter(fb_params['t_key'], fb_params['p_key'], fb_params['lb'], fb_params['ub'])

    new_vis = bool(check_box)
    has_energy_key = phone_output_keyinfo.endswith(energy_key)
    if new_vis and not has_energy_key:
        # Turned ON: extend the legend, but never append the entry twice.
        new_key = phone_output_keyinfo + energy_key
    elif has_energy_key and not new_vis:
        # Turned OFF: strip the entry only when it really is the suffix, so
        # unrelated legend text is never cut off.
        new_key = phone_output_keyinfo[:-len(energy_key)]
    else:
        new_key = phone_output_keyinfo

    return [new_key,
            gr.update(value=_disp_scorertype('...'), visible=new_vis),
            gr.update(value=(fb_params['lb'], fb_params['ub']), visible=new_vis),
            reset_fb,
            gr.update(visible=new_vis),
            gr.update(visible=new_vis),
            gr.update(visible=new_vis),
            gr.update(visible=new_vis)]
def display0(score_output):
    """Render the feedback held in a score dict as plain, tab-indented text.

    Args:
        score_output: dict with the keys ``'task_feedback'``,
            ``'word_feedback'`` and ``'phone_feedback'``. The word and phone
            feedback sequences are walked in parallel; each entry starts
            with the word it refers to, and each phone entry's second item
            is a sequence of two-item records.

    Returns:
        The task feedback, a ``--`` separator line, then one line per word
        (word, tab, second item, ``--``), each followed by one tab-indented
        line per record.

    Raises:
        AssertionError: if a word entry and its phone entry name different
            words.
    """
    task_feedback = score_output['task_feedback']
    word_feedback = score_output['word_feedback']
    phone_feedback = score_output['phone_feedback']

    pieces = [f'{task_feedback}', '\n--\n']
    for word_entry, phone_entry in zip(word_feedback, phone_feedback):
        assert word_entry[0] == phone_entry[0]
        pieces.append(f'{word_entry[0]}\t{word_entry[1]}--\n')
        pieces.extend(f'\t{record[0]}\t{record[1]}\n' for record in phone_entry[1])
    return ''.join(pieces)
| # Load the scorer, the prompt list and the feedback defaults once at start-up. | |
| # The handler functions above read these three names as module-level globals. | |
| scorer, prompts, fb_params = setup() | |
| # Build the Gradio interface: components first, event wiring at the end. | |
| bl = gr.Blocks() | |
| with bl: | |
| gr.Markdown( | |
| """ | |
| ## Framburðarþjálfun [Nýrómur] | |
| """) | |
| #setup user-adjustable feedback calibration | |
| fb_lb, fb_ub = fb_params['lb'], fb_params['ub'] | |
| # fb holds the FeedbackConverter in use; it is replaced by recal_fb_fn and dev_opts_fn. | |
| fb = gr.State(FeedbackConverter(fb_params['t_key'], fb_params['p_key'], fb_lb, fb_ub)) | |
| # current_prompt holds the (task_id, normed_text) pair set by refresh_prompt_fn; (None, None) until a prompt is chosen. | |
| current_prompt = gr.State((None,None)) | |
| with gr.Row(equal_height=True): | |
| new_prompt_button = gr.Button("🔃 Endurnýja texta ⮕",scale=0) | |
| with gr.Column(): | |
| prompt_instructions = gr.Markdown("""Lestu eftirfarandi texta:""") | |
| prompt_text = gr.HighlightedText(value=[("[--- dæmi ---]","hl")], | |
| color_map={"hl": "#A8DADC"}, | |
| show_legend=False, show_label=False,show_inline_category=False) | |
| #with gr.Row(equal_height=True): | |
| #with gr.Column(): | |
| # Microphone recording, handed to score_speech_fn as a file path. | |
| user_speech = gr.Audio(sources=["microphone"],type="filepath", | |
| waveform_options=gr.WaveformOptions(sample_rate=16000), | |
| label="Lestu upp texta og vistaðu upptöku") | |
| #with gr.Column(scale=0): | |
| score_speech_button = gr.Button("⇩⇩⇩ Greina upptökuna ⇩⇩⇩") | |
| with gr.Row(equal_height=True): | |
| with gr.Column(scale=1): | |
| placeholder_output = gr.Markdown("## Einkunn/gæði framburðar: ...") | |
| with gr.Column(scale=3): | |
| # Created hidden; dev_opts_fn switches the visibility of blocks_output and blocks_info. | |
| blocks_output = gr.Plot(visible=False) | |
| blocks_info = gr.Markdown("""*This colour feedback was not calibrated by exercise/phone, | |
| relative colours may mismatch other scores""",visible=False) | |
| plot_output = gr.Plot() | |
| with gr.Row(): | |
| with gr.Column(scale=2): | |
| #phone_output_keyinfo = gr.Markdown("""### Lykill: <span style='color:#26701C;'>Meira nálægt</span>, <span style='color:#E85907;'>Minna nálægt</span>, <span style='color:#BBBBBB;'>Ekki hægt að greina (of stutt hljóð)</span>, <span style='color:#88447F;'>••• Ítónun (tónhæð)</span>""") | |
| # Colour legend; dev_opts_fn appends or removes the energy entry at the end of this text. | |
| phone_output_keyinfo = gr.Markdown("""### Lykill: <span style='color:#26701C;'>Nálægt réttum framburði</span>, <span style='color:#E85907;'>Fjarri réttum framburði</span>, <span style='color:#BBBBBB;'>Ekki hægt að greina (of stutt hljóð)</span>, <span style='color:#88447F;'>••• Ítónun (tónhæð)</span>""") | |
| with gr.Column(scale=1): | |
| dev_checkbox = gr.Checkbox(label="[Sýna villuleitarverkfæri]", value=False) | |
| modeltype_info = gr.Markdown(_disp_scorertype('...'),visible=False) | |
| with gr.Row(): | |
| #modeltype_info = gr.Markdown(_disp_scorertype('...'),visible=False) # doesnt fit on mobile | |
| with gr.Column(): | |
| # Calibration controls start hidden and on the default bounds; dev_opts_fn shows or hides them. | |
| calibrate_slider = RangeSlider(minimum=-1, maximum=1, value=(fb_lb, fb_ub), interactive=True, visible=False) | |
| cali_title = gr.Markdown("""#### ◭ Re-calibrate scoring""", visible=False) | |
| cali_info=gr.Markdown("""Too easy, always 100? Raise the maximum. Too hard, always 0? Lower the minimum. | |
| ### Re-calibration does not affect Monophone model. | |
| """, visible=False) | |
| # Event wiring. Each outputs list has one entry per value the handler returns, in the same order. | |
| new_prompt_button.click(refresh_prompt_fn, | |
| inputs = [], | |
| outputs = [current_prompt, prompt_text, prompt_instructions, user_speech, modeltype_info, placeholder_output, plot_output, blocks_output]) | |
| dev_checkbox.input(dev_opts_fn, | |
| inputs = [dev_checkbox,phone_output_keyinfo], | |
| outputs = [phone_output_keyinfo, modeltype_info, calibrate_slider, fb, cali_title, cali_info, blocks_output, blocks_info]) | |
| # Releasing the slider stores a FeedbackConverter built with the newly chosen bounds in fb. | |
| calibrate_slider.release(recal_fb_fn, | |
| inputs = [calibrate_slider], | |
| outputs = [fb] ) | |
| score_speech_button.click(score_speech_fn, | |
| inputs=[current_prompt, user_speech, fb, dev_checkbox], | |
| outputs = [placeholder_output, modeltype_info, plot_output, blocks_output]) | |
| #outputs = [placeholder_output, phone_output, modeltype_info, plot_output]) | |
| # Start the app. | |
| bl.launch() | |