Spaces:
Running
Running
Create app.py
Browse files
app.py
ADDED
|
@@ -0,0 +1,237 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import subprocess, os, sys, pickle, random, librosa, huggingface_hub
|
| 3 |
+
sys.path.append('./pronunciation-score-icelandic')
|
| 4 |
+
from captinialign import makeAlign
|
| 5 |
+
from captiniscore import PronunciationScorer
|
| 6 |
+
#from captinifeedback import FeedbackConverter
|
| 7 |
+
from hffeedback import FeedbackConverter
|
| 8 |
+
from os.path import basename, splitext
|
| 9 |
+
from gradio_rangeslider import RangeSlider
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def setup():
    """Load the pronunciation scorer, the exercise prompts and feedback settings.

    Fetches the wav2vec2 speech featurizer and the scoring reference data with
    ``huggingface_hub.snapshot_download``, builds the ``PronunciationScorer``
    from them and reads the exercise texts.

    Returns:
        A ``(scorer, prompts, fb_inits)`` tuple:

        * ``scorer`` -- the initialised ``PronunciationScorer``.
        * ``prompts`` -- one tuple per non-blank line of the task text file,
          holding that line's tab-separated fields.
        * ``fb_inits`` -- dict with the keys ``'t_key'``, ``'p_key'``,
          ``'lb'`` and ``'ub'`` (task key path, phone key path, lower and
          upper score bounds) for building a ``FeedbackConverter``.
    """
    # Speech embedding model and layer must match the pre-computed scoring models.
    # The model is fetched as a local snapshot instead of being loaded by repo
    # id: with kaldi==5.5.1016, w2v2 seemed to need a local copy before the
    # pretrained model could be loaded (reason not understood).
    speech_featurizer_path = huggingface_hub.snapshot_download(
        repo_id='language-and-voice-lab/wav2vec2-large-xlsr-53-icelandic-ep30-967h',
        repo_type='model')
    speech_featurizer_layer = 8

    models_data_dir = huggingface_hub.snapshot_download(
        repo_id='clr/captini-scoring-references',
        repo_type='dataset')

    task_scoring_models = os.path.join(models_data_dir, 'models/task_models_w2v2-IS-30e967h_l8_3EP49G/')
    task_key_path = os.path.join(models_data_dir, 'models/task_key_3EP49G.json')

    phone_key_path = os.path.join(models_data_dir, 'models/phone_key_3EP49G.tsv')
    monophone_reference_feat_path = os.path.join(models_data_dir, 'models/monophones/w2v2-IS-30e967h_SPLIT1.pickle')

    task_text_path = os.path.join(models_data_dir, 'models/task2text.txt')

    # Constants for converting pronunciation scores to user feedback.
    lower_bound_100 = -0.1
    upper_bound_100 = 0.03  # previously 0.02

    # PronunciationScorer takes considerable time to initialise,
    # due to loading the w2v2 featurizer.
    # After the first loading, it quickly scores each new user input speech.
    # It's faster on GPU.
    # Do not re-load a new w2v2 featurizer each time a user speaks.
    scorer = PronunciationScorer(
        task_scoring_models,
        speech_featurizer_path,
        speech_featurizer_layer,
        task_text_path,
        monophone_reference_feat_path)

    # Only the constructor arguments are returned here; the FeedbackConverter
    # itself is built by the caller so it can be rebuilt with other bounds.
    fb_inits = {'t_key': task_key_path,
                'p_key': phone_key_path,
                'lb': lower_bound_100,
                'ub': upper_bound_100}

    # The prompts contain non-ASCII (Icelandic) text, so decode explicitly as
    # UTF-8 instead of relying on the platform's default encoding.
    with open(task_text_path, 'r', encoding='utf-8') as handle:
        # Blank lines carry no prompt and would yield a 1-tuple that cannot be
        # unpacked into (task_id, sentence, normed_text) later on.
        prompts = [tuple(line.split('\t'))
                   for line in handle.read().splitlines()
                   if line.strip()]

    return scorer, prompts, fb_inits
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
# return a random prompt text from the list of prompt texts
|
| 68 |
+
def refresh_prompt_fn():
    """Pick a random exercise and reset the widgets that depend on it.

    Returns:
        A 6-tuple: the new ``(task_id, normed_text)`` prompt state, the
        sentence to display, ``None`` for the audio input, the placeholder
        scoring-model line, and ``None`` for each of the two plots.
    """
    chosen = random.choice(prompts)
    task_id, sentence, normed_text = chosen
    prompt_state = (task_id, normed_text)
    model_line = _disp_scorertype('...')
    return prompt_state, sentence, None, model_line, None, None
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def recal_fb_fn(cslider):
    """Build a feedback converter for the bounds chosen on the range slider.

    Args:
        cslider: ``(lower, upper)`` pair taken from the calibration slider.

    Returns:
        A new ``FeedbackConverter`` using the task/phone key paths from
        ``fb_params`` and the given bounds.
    """
    new_lower, new_upper = cslider
    task_key = fb_params['t_key']
    phone_key = fb_params['p_key']
    return FeedbackConverter(task_key, phone_key, new_lower, new_upper)
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def score_speech_fn(current_prompt, user_wav, fbc):
    """Score one recording against the current exercise and build the UI outputs.

    Aligns the audio to the exercise text with ``makeAlign``, scores it with
    the module-level ``scorer`` and converts the scores to feedback with
    ``fbc``. Progress information is printed to stdout.

    Args:
        current_prompt: ``(task_id, normalised_text)`` pair of the exercise;
            ``(None, None)`` until a prompt has been chosen.
        user_wav: Path of the user's audio file, or ``None`` when nothing has
            been recorded or uploaded yet.
        fbc: Feedback converter providing ``convert``,
            ``generate_graphic_feedback_0``,
            ``generate_graphic_feedback_blocks`` and the attributes
            ``lower_bound_100`` / ``upper_bound_100``.

    Returns:
        A 4-tuple ``(feedback_markdown, scoring_model_markdown, plot,
        blocks_plot)``. Both plots are ``None`` whenever nothing was scored.
    """
    task_id, _ = current_prompt

    # The score button can be pressed before an exercise was picked or before
    # any audio exists; report that instead of failing inside the scorer.
    if task_id is None:
        return ("No exercise text selected yet. Use the refresh button to get a text first.",
                _disp_scorertype('...'), None, None)
    if not user_wav:
        return ("No recording found. Record or upload your speech before scoring.",
                _disp_scorertype('...'), None, None)

    task_text, task_model = scorer.task_scorer(task_id)
    print(user_wav)
    user_wav_duration = librosa.get_duration(path=user_wav)

    word_aligns, phone_aligns = makeAlign(
        task_text,
        user_wav,
        user_wav_duration,
        splitext(basename(user_wav))[0],
        './pronunciation-score-icelandic/alignment/new/',
        './pronunciation-score-icelandic/alignment/captini_pretrained_aligner/'
    )

    print('TASK ID', task_id, 'TEXT:', task_text)

    if word_aligns:
        word_scores, phone_scores = scorer.score_one(
            task_model,
            user_wav,
            word_aligns,
            phone_aligns)

        print('feedback with:', fbc.lower_bound_100, fbc.upper_bound_100)

        # The word-level feedback is not displayed by this handler.
        task_feedback, _word_feedback, phone_feedback = fbc.convert(
            word_scores,
            phone_scores,
            task_id)

        disp_fb = f'## Einkunn/gæði: {task_feedback}'
        disp_plot1 = fbc.generate_graphic_feedback_0(user_wav, word_aligns, phone_aligns, phone_feedback)
        disp_blocksplot = fbc.generate_graphic_feedback_blocks(phone_scores)
    else:
        # An empty alignment means the aligner produced nothing to score.
        disp_fb = ("Failure. If this happens every time try a different device/browser, "
                   "Kaldi speech processor does not work on some (older?) devices, "
                   "this should not be a possible issue on Spaces but unfortunately it is.")
        disp_plot1 = None
        disp_blocksplot = None

    return disp_fb, _disp_scorertype(task_model[1]), disp_plot1, disp_blocksplot
|
| 148 |
+
|
| 149 |
+
def _disp_scorertype(scorertype):
|
| 150 |
+
if scorertype == 'task':
|
| 151 |
+
scoring_model_type = 'Full'
|
| 152 |
+
elif scorertype == 'phone':
|
| 153 |
+
scoring_model_type = 'Monophone'
|
| 154 |
+
else:
|
| 155 |
+
scoring_model_type = '...'
|
| 156 |
+
return f"### Scoring model for this exercise was: [{scoring_model_type}]"
|
| 157 |
+
|
| 158 |
+
def display0(score_output):
    """Render task, word and phone feedback as a plain-text report.

    Args:
        score_output: Mapping with the keys ``'task_feedback'``,
            ``'word_feedback'`` and ``'phone_feedback'``. Word entries are
            indexed as ``(word, feedback)``; phone entries are indexed as
            ``(word, phones)`` where each item of ``phones`` is indexed as
            ``(first, second)``.

    Returns:
        The task feedback, a ``--`` separator line, then one line per word
        followed by one tab-indented line per phone of that word.

    Raises:
        AssertionError: If a word entry and its paired phone entry name
            different words.
    """
    task_feedback = score_output['task_feedback']
    word_feedback = score_output['word_feedback']
    phone_feedback = score_output['phone_feedback']

    # Collect the pieces and join once instead of growing a string with +=.
    pieces = [f'{task_feedback}', '\n--\n']
    for w_s, p_s in zip(word_feedback, phone_feedback):
        # Raised explicitly (not via `assert`) so the check still runs when
        # Python is started with -O.
        if w_s[0] != p_s[0]:
            raise AssertionError(
                f'word/phone feedback mismatch: {w_s[0]!r} != {p_s[0]!r}')
        pieces.append(f'{w_s[0]}\t{w_s[1]}--\n')
        pieces.extend(f'\t{phone[0]}\t{phone[1]}\n' for phone in p_s[1])

    return ''.join(pieces)
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
# Initialise the scorer, prompt list and feedback settings once at start-up;
# the handler functions in this file read these names as module-level globals.
scorer, prompts, fb_params = setup()


# NOTE(review): the indentation of this layout was not preserved in the copy
# under review, so the nesting of the Row/Column contexts below is a
# best-effort reconstruction -- confirm against the running app.
# Multi-line string bodies are kept flush-left so their content is unchanged.
bl = gr.Blocks()
with bl:


    # Page title ("Pronunciation training").
    gr.Markdown(
    """
## Framburðarþjálfun [Nýrómur]

""")

    #setup user-adjustable feedback calibration
    fb_lb, fb_ub = fb_params['lb'], fb_params['ub']
    # Feedback converter held in state; replaced when the slider is released.
    fb = gr.State(FeedbackConverter(fb_params['t_key'], fb_params['p_key'], fb_lb, fb_ub))

    # (task_id, normalised text) of the current exercise; (None, None) until
    # the refresh button has been used.
    current_prompt = gr.State((None,None))
    with gr.Row(equal_height=True):
        # Button label: "Refresh the text"; textbox shows the exercise sentence.
        new_prompt_button = gr.Button("🔃 ⮕ \nEndurnýja textann",scale=0)
        prompt_text = gr.Textbox(label="Texti", value="[--- stuttur æfingatexti ---]", interactive=False)

    #with gr.Row(equal_height=True):
        #with gr.Column():
    # Audio is handed to the scoring handler as a file path.
    user_speech = gr.Audio(sources=["upload", "microphone"],type="filepath",
        waveform_options=gr.WaveformOptions(sample_rate=16000),
        label="Lestu textann upphátt og vistaðu upptöku")
        #with gr.Column(scale=0):
    # Triggers scoring of the current recording.
    score_speech_button = gr.Button("⇩⇩⇩")

    with gr.Row(equal_height=True):
        with gr.Column(scale=1):
            # Overall feedback line ("Grade/quality"), filled in by score_speech_fn.
            placeholder_output = gr.Markdown("## Einkunn/gæði: ...")
        with gr.Column(scale=3):
            blocks_output = gr.Plot()
            blocks_info = gr.Markdown("""*This colour feedback was not calibrated by exercise/phone,
relative colours may mismatch other scores""")

    #phone_output = gr.Markdown('')
    plot_output = gr.Plot()

    with gr.Row():
        with gr.Column():
            modeltype_info = gr.Markdown(_disp_scorertype('...'))
            # Colour key for the feedback display.
            phone_output_keyinfo = gr.Markdown("""### Lykill: <span style='color:#0000FF;'>Rétt</span>, <span style='color:#FF0000;'>Rangt</span>, <span style='color:#BBBBBB;'>Ekki hægt að greina (of stutt hljóð)</span>, <span style='color:#8a2be2;'>••• Pitch</span>, <span style='color:#FFDAB9;'>--- Energy</span>""")
        # Range slider starting at the default lower/upper feedback bounds.
        calibrate_slider = RangeSlider(minimum=-1, maximum=1, value=(fb_lb, fb_ub), interactive=True)
        cali_info=gr.Markdown("""
#### ◄ Re-calibrate scoring

Too easy, always 100? Raise the maximum. Too hard, always 0? Lower the minimum.

### Note: Does not affect Monophone model.
""")

    # New prompt: updates the prompt state and text, and clears the audio,
    # scoring-model line and both plots.
    new_prompt_button.click(refresh_prompt_fn, inputs = [], outputs = [current_prompt, prompt_text, user_speech, modeltype_info, plot_output, blocks_output])
    # Slider released: rebuild the feedback converter with the chosen bounds.
    calibrate_slider.release(recal_fb_fn, inputs = [calibrate_slider], outputs = [fb] )
    # Score button: run scoring and show feedback, model type and both plots.
    score_speech_button.click(score_speech_fn,
        inputs=[current_prompt, user_speech, fb],
        outputs = [placeholder_output, modeltype_info, plot_output, blocks_output])
        #outputs = [placeholder_output, phone_output, modeltype_info, plot_output])


bl.launch()
|
| 237 |
+
|