Spaces:

clr
/

nyro-dev-p

Running

App Files Files Community

clr committed 20 days ago

Commit

f2fb25b

1 Parent(s): 2c42716

Create app.py

Browse files

Files changed (1) hide show

app.py +237 -0

app.py ADDED Viewed

	@@ -0,0 +1,237 @@

+import gradio as gr
+import subprocess, os, sys, pickle, random, librosa, huggingface_hub
+sys.path.append('./pronunciation-score-icelandic')
+from captinialign import makeAlign
+from captiniscore import PronunciationScorer
+#from captinifeedback import FeedbackConverter
+from hffeedback import FeedbackConverter
+from os.path import basename, splitext
+from gradio_rangeslider import RangeSlider
+def setup():
+    # Speech embedding model and layer must match the pre-computed scoring models.
+    #speech_featurizer_path = 'language-and-voice-lab/wav2vec2-large-xlsr-53-icelandic-ep30-967h'
+    # when kaldi==5.5.1016, w2v2 needs to download local copy before loading pretrained??????????
+    speech_featurizer_path = huggingface_hub.snapshot_download(
+        repo_id='language-and-voice-lab/wav2vec2-large-xlsr-53-icelandic-ep30-967h',
+        repo_type='model')
+    speech_featurizer_layer = 8
+    models_data_dir = huggingface_hub.snapshot_download(
+        repo_id='clr/captini-scoring-references',
+        repo_type='dataset') #local_dir="path/to/folder"
+    task_scoring_models = os.path.join(models_data_dir, 'models/task_models_w2v2-IS-30e967h_l8_3EP49G/')
+    task_key_path = os.path.join(models_data_dir, 'models/task_key_3EP49G.json')
+    phone_key_path = os.path.join(models_data_dir, 'models/phone_key_3EP49G.tsv')
+    monophone_reference_feat_path = os.path.join(models_data_dir, 'models/monophones/w2v2-IS-30e967h_SPLIT1.pickle')
+    task_text_path = os.path.join(models_data_dir, 'models/task2text.txt')
+    # Define constants for converting pronunciation scores
+    # to user feedback
+    lower_bound_100 = -0.1
+    upper_bound_100 = 0.03 #0.02
+    # PronunciationScorer takes considerable time to initialise,
+    #     due to loading the w2v2 featurizer.
+    # After the first loading, it quickly scores each new user input speech.
+    # It's faster on GPU.
+    # Do not re-load a new w2v2 featurizer each time a user speaks.
+    scorer = PronunciationScorer(
+        task_scoring_models,
+        speech_featurizer_path,
+        speech_featurizer_layer,
+        task_text_path,
+        monophone_reference_feat_path)
+    # FeedbackConverter new module to process scores into user feedback
+    #fb = FeedbackConverter(task_key_path, phone_key_path, lower_bound_100, upper_bound_100)
+    fb_inits = {'t_key': task_key_path,
+                'p_key': phone_key_path,
+                'lb': lower_bound_100,
+                'ub': upper_bound_100}
+    with open(task_text_path,'r') as handle:
+        prompts = handle.read().splitlines()
+    prompts = [tuple(l.split('\t')) for l in prompts]
+    return scorer, prompts, fb_inits
+# return a random prompt text from the list of prompt texts
+def refresh_prompt_fn():
+    task_id, sentence, normed_text = random.choice(prompts)
+    return tuple([task_id, normed_text]), sentence, None, _disp_scorertype('...'), None, None
+def recal_fb_fn(cslider):
+    lowr, uppr = cslider
+    return FeedbackConverter(fb_params['t_key'], fb_params['p_key'], lowr, uppr)
+def score_speech_fn(current_prompt, user_wav, fbc):
+    task_id, norm_text = current_prompt
+    task_text, task_model = scorer.task_scorer(task_id)
+    print(user_wav)
+    user_wav_duration = librosa.get_duration(path=user_wav)
+    word_aligns, phone_aligns = makeAlign(
+        task_text,
+        user_wav,
+        user_wav_duration,
+        splitext(basename(user_wav))[0],
+        './pronunciation-score-icelandic/alignment/new/',
+        './pronunciation-score-icelandic/alignment/captini_pretrained_aligner/'
+    )
+    #print('an audio file! : ', user_wav)
+    print('TASK ID', task_id, 'TEXT:', task_text)
+    #print('Duration --', user_wav_duration)
+    #print('WORD ALIGNS:', word_aligns)
+    #print('PHONE ALIgNS:', phone_aligns)
+    if word_aligns:
+        word_scores, phone_scores = scorer.score_one(
+            task_model,
+            user_wav,
+            word_aligns,
+            phone_aligns)
+        print('feedback with:', fbc.lower_bound_100, fbc.upper_bound_100)
+        task_feedback, word_feedback, phone_feedback = fbc.convert(
+            word_scores,
+            phone_scores,
+            task_id)
+        collected_info = {'task_feedback': task_feedback,
+                          'word_feedback': word_feedback, 'phone_feedback': phone_feedback,
+                          'word_scores': word_scores, 'phone_scores': phone_scores,
+                          'word_aligns': word_aligns, 'phone_aligns': phone_aligns}
+        #print('WORD SCORES:', word_scores)
+        #print('PHONE SCORES:', phone_scores)
+        #print('TASK FB:', task_feedback)
+        #print('WORD FB:', word_feedback)
+        #print('PHONE FB', phone_feedback)
+        #disp_fb = display0(collected_info)
+        disp_fb = f'## Einkunn/gæði: {task_feedback}'
+        # colorised html version....
+#return ''.join([ hc_from_3(self.phone_3sort_monophone(score,label)) for label,score in scores_list ])
+        disp_fb2 = ' '.join([''.join([fbc.hc_from_3(sc3,phid) for sc3,phid in wphones]) for wrd, wphones in phone_feedback])
+#disp_fb2 = ' '.join([htxt for wrd, htxt in phone_feedback])
+        disp_fb2 = f'## {disp_fb2}'
+        disp_plot1 = fbc.generate_graphic_feedback_0(user_wav, word_aligns, phone_aligns, phone_feedback)
+        disp_blocksplot = fbc.generate_graphic_feedback_blocks(phone_scores)
+    else:
+        disp_fb = "Failure. If this happens every time try a different device/browser, "
+        disp_fb += "Kaldi speech processor does not work on some (older?) devices, "
+        disp_fb += "this should not be a possible issue on Spaces but unfortunately it is."
+        disp_fb2 = ''
+        disp_plot1 = None
+        disp_blocksplot = None
+    return disp_fb, _disp_scorertype(task_model[1]), disp_plot1, disp_blocksplot
+    #return disp_fb, disp_fb2, _disp_scorertype(task_model[1]), disp_plot1
+def _disp_scorertype(scorertype):
+    if scorertype == 'task':
+        scoring_model_type = 'Full'
+    elif scorertype == 'phone':
+        scoring_model_type = 'Monophone'
+    else:
+        scoring_model_type = '...'
+    return f"### Scoring model for this exercise was: [{scoring_model_type}]"
+def display0(score_output):
+    task_feedback = score_output['task_feedback']
+    word_feedback = score_output['word_feedback']
+    phone_feedback = score_output['phone_feedback']
+    ostring = f'{task_feedback}'
+    ostring += '\n--\n'
+    for w_s,p_s in zip(word_feedback, phone_feedback):
+        assert w_s[0] == p_s[0]
+        ostring += f'{w_s[0]}\t{w_s[1]}--\n'
+        for i in range(len(p_s[1])):
+            ostring += f'\t{p_s[1][i][0]}\t{p_s[1][i][1]}\n'
+    return ostring
+# Build the scorer, prompt list and feedback defaults once at start-up;
+# the callbacks above read scorer, prompts and fb_params as module globals.
+scorer, prompts, fb_params = setup()
+# Lay out the Gradio interface and wire the callbacks to its widgets.
+bl = gr.Blocks()
+with bl:
+    gr.Markdown(
+        """
+    ## Framburðarþjálfun [Nýrómur]
+    """)
+    #setup user-adjustable feedback calibration
+    fb_lb, fb_ub = fb_params['lb'], fb_params['ub']
+    # State holding the FeedbackConverter: passed into score_speech_fn and
+    # replaced by recal_fb_fn when the calibration slider is released.
+    fb = gr.State(FeedbackConverter(fb_params['t_key'], fb_params['p_key'], fb_lb, fb_ub))
+    # State holding the current (task_id, normalised text) pair;
+    # stays (None, None) until refresh_prompt_fn has run.
+    current_prompt = gr.State((None,None))
+    with gr.Row(equal_height=True):
+        new_prompt_button = gr.Button("🔃 ⮕ \nEndurnýja textann",scale=0)
+        prompt_text = gr.Textbox(label="Texti", value="[--- stuttur æfingatexti ---]", interactive=False)
+    #with gr.Row(equal_height=True):
+    #with gr.Column():
+    # Audio input is handed to the callback as a file path (type="filepath").
+    user_speech = gr.Audio(sources=["upload", "microphone"],type="filepath",
+                        waveform_options=gr.WaveformOptions(sample_rate=16000),
+                        label="Lestu textann upphátt og vistaðu upptöku")
+    #with gr.Column(scale=0):
+    score_speech_button = gr.Button("⇩⇩⇩")
+    with gr.Row(equal_height=True):
+        with gr.Column(scale=1):
+            # Receives the overall feedback text from score_speech_fn.
+            placeholder_output = gr.Markdown("## Einkunn/gæði: ...")
+        with gr.Column(scale=3):
+            # Receives the result of fbc.generate_graphic_feedback_blocks.
+            blocks_output = gr.Plot()
+            blocks_info = gr.Markdown("""*This colour feedback was not calibrated by exercise/phone,
+                                        relative colours may mismatch other scores""")
+    #phone_output = gr.Markdown('')
+    # Receives the result of fbc.generate_graphic_feedback_0.
+    plot_output = gr.Plot()
+    with gr.Row():
+        with gr.Column():
+            modeltype_info = gr.Markdown(_disp_scorertype('...'))
+            phone_output_keyinfo = gr.Markdown("""### Lykill: <span style='color:#0000FF;'>Rétt</span>, <span style='color:#FF0000;'>Rangt</span>, <span style='color:#BBBBBB;'>Ekki hægt að greina (of stutt hljóð)</span>, <span style='color:#8a2be2;'>••• Pitch</span>, <span style='color:#FFDAB9;'>--- Energy</span>""")
+        # Slider starts at the default bounds returned by setup().
+        calibrate_slider = RangeSlider(minimum=-1, maximum=1, value=(fb_lb, fb_ub), interactive=True)
+        cali_info=gr.Markdown("""
+        #### ◄ Re-calibrate scoring
+        Too easy, always 100? Raise the maximum. Too hard, always 0? Lower the minimum.
+        ### Note: Does not affect Monophone model.
+        """)
+    # Event wiring. The order of each outputs list must match the order of
+    # the values returned by the corresponding callback.
+    # New prompt: stores the prompt, shows its text and clears audio and plots.
+    new_prompt_button.click(refresh_prompt_fn, inputs = [], outputs = [current_prompt, prompt_text, user_speech, modeltype_info, plot_output, blocks_output])
+    # Slider released: rebuild the FeedbackConverter with the chosen bounds.
+    calibrate_slider.release(recal_fb_fn, inputs = [calibrate_slider], outputs = [fb] )
+    # Score button: score the recording for the current prompt.
+    score_speech_button.click(score_speech_fn,
+                              inputs=[current_prompt, user_speech, fb],
+                              outputs = [placeholder_output, modeltype_info, plot_output, blocks_output])
+                              #outputs = [placeholder_output, phone_output, modeltype_info, plot_output])
+# Start the app; this runs at import time because there is no __main__ guard.
+bl.launch()