"""Gradio web app for Icelandic pronunciation training ("Nýrómur" / Captini).

The user is shown a random prompt sentence, records themselves reading it,
and receives colour-coded pronunciation feedback. Scoring uses a wav2vec2
speech featurizer plus pre-computed per-task reference models; alignment is
done with a pre-trained Kaldi aligner from the local
``pronunciation-score-icelandic`` checkout.
"""
import gradio as gr
# NOTE(review): subprocess and pickle appear unused in this file; kept in case
# a sibling module relies on this import side effect — confirm before removing.
import subprocess, os, sys, pickle, random, librosa, huggingface_hub
sys.path.append('./pronunciation-score-icelandic')  # local clone with aligner/scorer modules
from captinialign import makeAlign
from captiniscore import PronunciationScorer
from hffeedback import FeedbackConverter
from os.path import basename, splitext
from gradio_rangeslider import RangeSlider


def setup():
    """Download models/reference data and initialise the pronunciation scorer.

    Returns:
        scorer: a ``PronunciationScorer``. Slow to build (loads the w2v2
            featurizer, faster on GPU) but fast per utterance afterwards —
            it must be created once and reused, never per recording.
        prompts: list of ``(task_id, sentence, normed_text)`` tuples parsed
            from the tab-separated task2text file.
        fb_inits: dict of FeedbackConverter constructor arguments
            (task key path, phone key path, lower/upper score bounds).
    """
    # Speech embedding model and layer must match the pre-computed scoring
    # models. With kaldi==5.5.1016, w2v2 needs a local snapshot downloaded
    # before loading pretrained weights.
    speech_featurizer_path = huggingface_hub.snapshot_download(
        repo_id='language-and-voice-lab/wav2vec2-large-xlsr-53-icelandic-ep30-967h',
        repo_type='model')
    speech_featurizer_layer = 8

    models_data_dir = huggingface_hub.snapshot_download(
        repo_id='clr/captini-scoring-references',
        repo_type='dataset')
    task_scoring_models = os.path.join(models_data_dir, 'models/task_models_w2v2-IS-30e967h_l8_3EP49G/')
    task_key_path = os.path.join(models_data_dir, 'models/task_key_3EP49G.json')
    phone_key_path = os.path.join(models_data_dir, 'models/phone_key_3EP49G.tsv')
    monophone_reference_feat_path = os.path.join(models_data_dir, 'models/monophones/w2v2-IS-30e967h_SPLIT1.pickle')
    task_text_path = os.path.join(models_data_dir, 'models/task2text.txt')

    # Default bounds for converting raw pronunciation scores to the 0-100
    # user feedback scale; user-adjustable later via the dev range slider.
    lower_bound_100 = -0.1
    upper_bound_100 = 0.03

    scorer = PronunciationScorer(
        task_scoring_models, speech_featurizer_path, speech_featurizer_layer,
        task_text_path, monophone_reference_feat_path)

    fb_inits = {'t_key': task_key_path, 'p_key': phone_key_path,
                'lb': lower_bound_100, 'ub': upper_bound_100}

    # FIX: read the prompt file as UTF-8 explicitly — the prompts are
    # Icelandic text and the platform default encoding is not guaranteed.
    with open(task_text_path, 'r', encoding='utf-8') as handle:
        prompts = handle.read().splitlines()
    prompts = [tuple(l.split('\t')) for l in prompts]
    return scorer, prompts, fb_inits


# Return a random prompt from the prompt list and reset every output
# component for the new exercise.
def refresh_prompt_fn():
    task_id, sentence, normed_text = random.choice(prompts)
    tips = _user_instructions(sentence)
    return [tuple([task_id, normed_text]),
            gr.update(value=[(sentence, "hl")]),
            tips,
            None,                                  # clear recorded audio
            _disp_scorertype('...'),
            "## Einkunn/gæði framburðar: ...",     # reset score heading
            None,                                  # clear waveform plot
            None]                                  # clear blocks plot


# Heuristic for different instructions based on how users should speak
# short vs longer prompts.
def _user_instructions(to_speak):
    if (len(to_speak.split(' ')) < 2) or ' – ' in to_speak:
        # Single word (or dash-separated item): speak clearly.
        return """Lestu eftirfarandi texta. Talaðu skýrt:"""
    else:
        # FIX: corrected user-facing typo 'þu' -> 'þú'.
        return """Lestu eftirfarandi texta eins og þú segir í venjulegu samtali:"""


def recal_fb_fn(cslider):
    """Rebuild the FeedbackConverter state with user-chosen score bounds."""
    lowr, uppr = cslider
    return FeedbackConverter(fb_params['t_key'], fb_params['p_key'], lowr, uppr)


def score_speech_fn(current_prompt, user_wav, fbc, devopts):
    """Align and score one user recording and build the feedback displays.

    Args:
        current_prompt: ``(task_id, normed_text)`` for the active exercise.
        user_wav: filepath of the recorded audio (gr.Audio type="filepath").
        fbc: the current FeedbackConverter (gr.State).
        devopts: True when the developer-options checkbox is on.

    Returns:
        Feedback markdown, scorer-type markdown, the waveform feedback plot,
        and the dev-only blocks plot (or an empty update when dev is off).
    """
    task_id, norm_text = current_prompt
    task_text, task_model = scorer.task_scorer(task_id)
    print(user_wav)
    user_wav_duration = librosa.get_duration(path=user_wav)
    # Kaldi forced alignment; an empty word_aligns signals failure.
    word_aligns, phone_aligns = makeAlign(
        task_text, user_wav, user_wav_duration,
        splitext(basename(user_wav))[0],
        './pronunciation-score-icelandic/alignment/new/',
        './pronunciation-score-icelandic/alignment/captini_pretrained_aligner/')
    print('TASK ID', task_id, 'TEXT:', task_text)
    if word_aligns:
        word_scores, phone_scores = scorer.score_one(
            task_model, user_wav, word_aligns, phone_aligns)
        print('feedback with:', fbc.lower_bound_100, fbc.upper_bound_100)
        task_feedback, word_feedback, phone_feedback = fbc.convert(
            word_scores, phone_scores, task_id)
        # (dead code removed: unused collected_info dict and disp_fb2 string)
        disp_fb = f'## Einkunn/gæði framburðar: {task_feedback}'
        disp_plot1 = fbc.generate_graphic_feedback_0(
            user_wav, word_aligns, phone_aligns, phone_feedback, devopts)
        disp_blocksplot = fbc.generate_graphic_feedback_blocks(phone_scores)
    else:
        disp_fb = "Failure. If this happens every time try a different device/browser, "
        disp_fb += "Kaldi speech processor does not work on some (older?) devices, "
        disp_fb += "this should not be a possible issue on Spaces but unfortunately it is."
        disp_plot1 = None
        disp_blocksplot = None
    if devopts:
        return disp_fb, _disp_scorertype(task_model[1]), disp_plot1, disp_blocksplot
    else:
        # Blocks plot stays empty/hidden while dev options are off.
        return disp_fb, _disp_scorertype(task_model[1]), disp_plot1, gr.update(value=None)


def _disp_scorertype(scorertype):
    """Markdown line naming which scoring model handled the exercise."""
    if scorertype == 'task':
        scoring_model_type = 'Full'
    elif scorertype == 'phone':
        scoring_model_type = 'Monophone'
    else:
        scoring_model_type = '...'
    return f"### Scoring model for this exercise was: [{scoring_model_type}]"


# Toggle visibility of the developer-only components and reset the
# feedback calibration to its defaults.
def dev_opts_fn(check_box, phone_output_keyinfo):
    energy_key = """, --- Hljóðstyrkur (root mean square energy)"""
    reset_fb = FeedbackConverter(fb_params['t_key'], fb_params['p_key'],
                                 fb_params['lb'], fb_params['ub'])
    if check_box:
        # User has just turned ON extra options: extend the legend.
        new_key = phone_output_keyinfo + energy_key
        new_vis = True
    else:
        # FIX: only strip the suffix when actually present, so a stray
        # toggle-off event cannot truncate the legend text.
        if phone_output_keyinfo.endswith(energy_key):
            new_key = phone_output_keyinfo[:-len(energy_key)]
        else:
            new_key = phone_output_keyinfo
        new_vis = False
    return [new_key,
            gr.update(value=_disp_scorertype('...'), visible=new_vis),
            gr.update(value=(fb_params['lb'], fb_params['ub']), visible=new_vis),
            reset_fb,
            gr.update(visible=new_vis),
            gr.update(visible=new_vis),
            gr.update(visible=new_vis),
            gr.update(visible=new_vis)]


def display0(score_output):
    """Debug helper: format a collected-feedback dict as plain text."""
    task_feedback = score_output['task_feedback']
    word_feedback = score_output['word_feedback']
    phone_feedback = score_output['phone_feedback']
    ostring = f'{task_feedback}'
    ostring += '\n--\n'
    for w_s, p_s in zip(word_feedback, phone_feedback):
        assert w_s[0] == p_s[0]   # word labels must line up
        ostring += f'{w_s[0]}\t{w_s[1]}--\n'
        for i in range(len(p_s[1])):
            ostring += f'\t{p_s[1][i][0]}\t{p_s[1][i][1]}\n'
    return ostring


# ---- application start-up (module level, runs once) ----
scorer, prompts, fb_params = setup()

bl = gr.Blocks()
with bl:
    gr.Markdown(
        """
        ## Framburðarþjálfun [Nýrómur]
        """)

    # User-adjustable feedback calibration held in per-session state.
    fb_lb, fb_ub = fb_params['lb'], fb_params['ub']
    fb = gr.State(FeedbackConverter(fb_params['t_key'], fb_params['p_key'], fb_lb, fb_ub))
    current_prompt = gr.State((None, None))

    with gr.Row(equal_height=True):
        new_prompt_button = gr.Button("🔃 Endurnýja texta ⮕", scale=0)
        with gr.Column():
            prompt_instructions = gr.Markdown("""Lestu eftirfarandi texta:""")
            prompt_text = gr.HighlightedText(value=[("[--- dæmi ---]", "hl")],
                color_map={"hl": "#A8DADC"}, show_legend=False,
                show_label=False, show_inline_category=False)

    user_speech = gr.Audio(sources=["microphone"], type="filepath",
        waveform_options=gr.WaveformOptions(sample_rate=16000),
        label="Lestu upp texta og vistaðu upptöku")
    score_speech_button = gr.Button("⇩⇩⇩ Greina upptökuna ⇩⇩⇩")

    with gr.Row(equal_height=True):
        with gr.Column(scale=1):
            placeholder_output = gr.Markdown("## Einkunn/gæði framburðar: ...")
        with gr.Column(scale=3):
            # Dev-only alternative score view.
            blocks_output = gr.Plot(visible=False)
            blocks_info = gr.Markdown("""*This colour feedback was not calibrated by exercise/phone, relative colours may mismatch other scores""", visible=False)
    plot_output = gr.Plot()

    with gr.Row():
        with gr.Column(scale=2):
            phone_output_keyinfo = gr.Markdown("""### Lykill: Nálægt réttum framburði, Fjarri réttum framburði, Ekki hægt að greina (of stutt hljóð), ••• Ítónun (tónhæð)""")
        with gr.Column(scale=1):
            dev_checkbox = gr.Checkbox(label="[Sýna villuleitarverkfæri]", value=False)
            modeltype_info = gr.Markdown(_disp_scorertype('...'), visible=False)

    with gr.Row():
        with gr.Column():
            calibrate_slider = RangeSlider(minimum=-1, maximum=1,
                value=(fb_lb, fb_ub), interactive=True, visible=False)
            cali_title = gr.Markdown("""#### ◭ Re-calibrate scoring""", visible=False)
            cali_info = gr.Markdown("""Too easy, always 100? Raise the maximum. Too hard, always 0? Lower the minimum.
### Re-calibration does not affect Monophone model. """, visible=False)

    # ---- event wiring ----
    new_prompt_button.click(refresh_prompt_fn,
        inputs=[],
        outputs=[current_prompt, prompt_text, prompt_instructions, user_speech,
                 modeltype_info, placeholder_output, plot_output, blocks_output])
    dev_checkbox.input(dev_opts_fn,
        inputs=[dev_checkbox, phone_output_keyinfo],
        outputs=[phone_output_keyinfo, modeltype_info, calibrate_slider, fb,
                 cali_title, cali_info, blocks_output, blocks_info])
    calibrate_slider.release(recal_fb_fn, inputs=[calibrate_slider], outputs=[fb])
    score_speech_button.click(score_speech_fn,
        inputs=[current_prompt, user_speech, fb, dev_checkbox],
        outputs=[placeholder_output, modeltype_info, plot_output, blocks_output])

bl.launch()