Spaces:
Running
Running
File size: 12,472 Bytes
f2fb25b abf7733 68bf109 8c1c1d7 68bf109 58b221a 68bf109 f2fb25b abf7733 8c1c1d7 abf7733 8c1c1d7 abf7733 f2fb25b 8eab7fb f2fb25b 8eab7fb f2fb25b 8eab7fb f2fb25b 58b221a f2fb25b 58b221a f2fb25b 218a109 5ac3262 8eab7fb 218a109 8eab7fb 218a109 f2fb25b 1ddd191 8c1c1d7 2d1753b 8ca15d7 f2fb25b 68bf109 f2fb25b 1ddd191 f2fb25b 1ddd191 f2fb25b e7f7e3b f2fb25b cead3b5 f2fb25b 218a109 f2fb25b 218a109 f2fb25b a8de61c f2fb25b a8de61c 8eab7fb a8de61c 5ac3262 7994434 a8de61c 7994434 a8de61c f2fb25b a8de61c 218a109 f2fb25b 218a109 58b221a 218a109 a8de61c 218a109 f2fb25b 8eab7fb f2fb25b | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 | import gradio as gr
import subprocess, os, sys, pickle, random, librosa, huggingface_hub
sys.path.append('./pronunciation-score-icelandic')
from captinialign import makeAlign
from captiniscore import PronunciationScorer
#from captinifeedback import FeedbackConverter
from hffeedback import FeedbackConverter
from os.path import basename, splitext
from gradio_rangeslider import RangeSlider
def setup():
    """Load everything the app needs once, at start-up.

    Fetches the speech featurizer and the pre-computed scoring references
    with ``huggingface_hub.snapshot_download``, builds the
    ``PronunciationScorer`` and reads the list of exercise prompts.

    Returns:
        A tuple ``(scorer, prompts, fb_inits)`` where

        * ``scorer`` is the initialised ``PronunciationScorer``;
        * ``prompts`` is a list with one tuple per non-blank line of the
          task text file, holding that line's tab-separated fields
          (``refresh_prompt_fn`` unpacks each as
          ``task_id, sentence, normed_text``);
        * ``fb_inits`` is a dict with keys ``'t_key'``, ``'p_key'``,
          ``'lb'`` and ``'ub'``: the arguments later handed to
          ``FeedbackConverter``.
    """
    # Speech embedding model and layer must match the pre-computed scoring models.
    #speech_featurizer_path = 'language-and-voice-lab/wav2vec2-large-xlsr-53-icelandic-ep30-967h'
    # when kaldi==5.5.1016, w2v2 needs to download local copy before loading pretrained??????????
    speech_featurizer_path = huggingface_hub.snapshot_download(
        repo_id='language-and-voice-lab/wav2vec2-large-xlsr-53-icelandic-ep30-967h',
        repo_type='model')
    speech_featurizer_layer = 8

    models_data_dir = huggingface_hub.snapshot_download(
        repo_id='clr/captini-scoring-references',
        repo_type='dataset')  # local_dir="path/to/folder"
    task_scoring_models = os.path.join(models_data_dir, 'models/task_models_w2v2-IS-30e967h_l8_3EP49G/')
    task_key_path = os.path.join(models_data_dir, 'models/task_key_3EP49G.json')
    phone_key_path = os.path.join(models_data_dir, 'models/phone_key_3EP49G.tsv')
    monophone_reference_feat_path = os.path.join(models_data_dir, 'models/monophones/w2v2-IS-30e967h_SPLIT1.pickle')
    task_text_path = os.path.join(models_data_dir, 'models/task2text.txt')

    # Define constants for converting pronunciation scores
    # to user feedback
    lower_bound_100 = -0.1
    upper_bound_100 = 0.03  # 0.02

    # PronunciationScorer takes considerable time to initialise,
    # due to loading the w2v2 featurizer.
    # After the first loading, it quickly scores each new user input speech.
    # It's faster on GPU.
    # Do not re-load a new w2v2 featurizer each time a user speaks.
    scorer = PronunciationScorer(
        task_scoring_models,
        speech_featurizer_path,
        speech_featurizer_layer,
        task_text_path,
        monophone_reference_feat_path)

    # FeedbackConverter new module to process scores into user feedback.
    # Only its constructor arguments are returned here; the converter itself
    # is built by the callers.
    #fb = FeedbackConverter(task_key_path, phone_key_path, lower_bound_100, upper_bound_100)
    fb_inits = {'t_key': task_key_path,
                'p_key': phone_key_path,
                'lb': lower_bound_100,
                'ub': upper_bound_100}

    # The prompts contain non-ASCII (Icelandic) text, so decode explicitly
    # instead of relying on the platform's default encoding.
    with open(task_text_path, 'r', encoding='utf-8') as handle:
        prompt_lines = handle.read().splitlines()
    # Skip blank lines: they would become 1-tuples and break the three-way
    # unpacking done when a prompt is picked.
    prompts = [tuple(line.split('\t')) for line in prompt_lines if line.strip()]

    return scorer, prompts, fb_inits
def refresh_prompt_fn():
    """Pick a random prompt and reset the outputs tied to the previous one.

    Returns:
        A list of eight values, in the order of the outputs wired to the
        "new prompt" button: the new ``(task_id, normed_text)`` state, an
        update showing the sentence in the highlighted-text box, the reading
        instructions, ``None`` for the audio recorder, the placeholder
        scoring-model line, the placeholder score line, and ``None`` for
        each of the two plots.
    """
    chosen = random.choice(prompts)
    task_id, sentence, normed_text = chosen
    instructions = _user_instructions(sentence)

    new_state = (task_id, normed_text)
    highlighted_sentence = gr.update(value=[(sentence, "hl")])

    return [
        new_state,
        highlighted_sentence,
        instructions,
        None,
        _disp_scorertype('...'),
        "## Einkunn/gæði framburðar: ...",
        None,
        None,
    ]
# heuristic for different instructions
# based on how users should speak short vs longer prompts
def _user_instructions(to_speak):
if (len(to_speak.split(' '))<2) or ' – ' in to_speak:
return """Lestu eftirfarandi texta. Talaðu skýrt:"""
else:
return """Lestu eftirfarandi texta eins og þu segir í venjulegu samtali:"""
def recal_fb_fn(cslider):
    """Build a FeedbackConverter using the bounds picked on the range slider.

    Args:
        cslider: the slider value, a pair ``(lower, upper)``.

    Returns:
        A new ``FeedbackConverter`` created with the task/phone key paths
        from ``fb_params`` and the two bounds from the slider.
    """
    new_lower, new_upper = cslider
    task_key = fb_params['t_key']
    phone_key = fb_params['p_key']
    return FeedbackConverter(task_key, phone_key, new_lower, new_upper)
def score_speech_fn(current_prompt, user_wav, fbc, devopts):
    """Align and score one recording and build the feedback to display.

    Args:
        current_prompt: ``(task_id, normalised_text)`` pair stored when the
            prompt was last refreshed; ``(None, None)`` until then.
        user_wav: path of the recorded audio file, or ``None`` when nothing
            has been recorded.
        fbc: the ``FeedbackConverter`` used to turn scores into feedback.
        devopts: truthy when the debugging tools are switched on; only then
            is the blocks plot returned.

    Returns:
        A 4-tuple matching the outputs wired to the scoring button:
        the score line (markdown), the scoring-model line (markdown), the
        main feedback plot (or ``None``), and either the blocks plot or an
        update that clears it.
    """
    task_id, _ = current_prompt

    # Nothing to score yet: either no prompt has been picked (the state is
    # still its initial (None, None)) or there is no recording. Reading the
    # duration of / aligning a missing file would raise, so answer with a
    # message instead.
    if task_id is None or not user_wav:
        return (
            "Nothing to analyse yet. Choose a text and record yourself reading it first.",
            _disp_scorertype('...'),
            None,
            gr.update(value=None),
        )

    task_text, task_model = scorer.task_scorer(task_id)
    print(user_wav)
    user_wav_duration = librosa.get_duration(path=user_wav)
    word_aligns, phone_aligns = makeAlign(
        task_text,
        user_wav,
        user_wav_duration,
        splitext(basename(user_wav))[0],
        './pronunciation-score-icelandic/alignment/new/',
        './pronunciation-score-icelandic/alignment/captini_pretrained_aligner/',
    )
    print('TASK ID', task_id, 'TEXT:', task_text)

    if word_aligns:
        word_scores, phone_scores = scorer.score_one(
            task_model,
            user_wav,
            word_aligns,
            phone_aligns)
        print('feedback with:', fbc.lower_bound_100, fbc.upper_bound_100)
        task_feedback, _word_feedback, phone_feedback = fbc.convert(
            word_scores,
            phone_scores,
            task_id)

        disp_fb = f'## Einkunn/gæði framburðar: {task_feedback}'
        disp_plot1 = fbc.generate_graphic_feedback_0(
            user_wav, word_aligns, phone_aligns, phone_feedback, devopts)
        disp_blocksplot = fbc.generate_graphic_feedback_blocks(phone_scores)
    else:
        # An empty alignment result is treated as a failure of the aligner.
        disp_fb = (
            "Failure. If this happens every time try a different device/browser, "
            "Kaldi speech processor does not work on some (older?) devices, "
            "this should not be a possible issue on Spaces but unfortunately it is."
        )
        disp_plot1 = None
        disp_blocksplot = None

    # The blocks plot is a debugging aid: clear it unless the tools are on.
    blocks_value = disp_blocksplot if devopts else gr.update(value=None)
    # task_model[1] is the scorer type that _disp_scorertype turns into a label.
    return disp_fb, _disp_scorertype(task_model[1]), disp_plot1, blocks_value
def _disp_scorertype(scorertype):
if scorertype == 'task':
scoring_model_type = 'Full'
elif scorertype == 'phone':
scoring_model_type = 'Monophone'
else:
scoring_model_type = '...'
return f"### Scoring model for this exercise was: [{scoring_model_type}]"
def dev_opts_fn(check_box, phone_output_keyinfo):
    """Show or hide the debugging tools and reset the feedback calibration.

    Switching the tools on appends the energy-curve entry to the colour
    legend; switching them off removes it again. The entry is only added
    when it is not already at the end of the legend, and only removed when
    it is, so the legend text is never duplicated or truncated if the
    checkbox and the legend get out of step.

    Args:
        check_box: current state of the debugging-tools checkbox.
        phone_output_keyinfo: current markdown text of the colour legend.

    Returns:
        A list of eight values matching the outputs wired to the checkbox:
        the new legend text, an update for the scoring-model line (reset to
        its placeholder), an update for the calibration slider (reset to the
        default bounds), a fresh ``FeedbackConverter`` with the default
        bounds, and four visibility updates (calibration title, calibration
        info, blocks plot, blocks info).
    """
    energy_key = """, <span style='color:#F49098;'>--- Hljóðstyrkur (root mean square energy)</span>"""
    reset_fb = FeedbackConverter(fb_params['t_key'], fb_params['p_key'], fb_params['lb'], fb_params['ub'])

    has_energy_key = phone_output_keyinfo.endswith(energy_key)
    if check_box:  # user has just turned ON extra options
        new_key = phone_output_keyinfo if has_energy_key else phone_output_keyinfo + energy_key
        new_vis = True
    else:
        # Strip the entry only if it is really there; slicing blindly would
        # cut into the rest of the legend.
        new_key = phone_output_keyinfo[:-len(energy_key)] if has_energy_key else phone_output_keyinfo
        new_vis = False

    return [new_key,
            gr.update(value=_disp_scorertype('...'), visible=new_vis),
            gr.update(value=(fb_params['lb'], fb_params['ub']), visible=new_vis),
            reset_fb,
            gr.update(visible=new_vis),
            gr.update(visible=new_vis),
            gr.update(visible=new_vis),  # gr.Plot(),
            gr.update(visible=new_vis)]
def display0(score_output):
    """Render task, word and phone feedback as tab-indented plain text.

    Args:
        score_output: dict with keys ``'task_feedback'``,
            ``'word_feedback'`` and ``'phone_feedback'``. Word entries are
            indexed as ``(word, value)``; phone entries as
            ``(word, [(phone, value), ...])``.

    Returns:
        A string: the task feedback, a ``--`` separator line, then one line
        per word followed by one tab-indented line per phone of that word.

    Raises:
        AssertionError: if a word entry and the phone entry paired with it
            do not start with the same word.
    """
    task_feedback = score_output['task_feedback']
    word_feedback = score_output['word_feedback']
    phone_feedback = score_output['phone_feedback']

    pieces = [f'{task_feedback}', '\n--\n']
    for word_entry, phone_entry in zip(word_feedback, phone_feedback):
        assert word_entry[0] == phone_entry[0]
        pieces.append(f'{word_entry[0]}\t{word_entry[1]}--\n')
        word_phones = phone_entry[1]
        for idx in range(len(word_phones)):
            phone_item = word_phones[idx]
            pieces.append(f'\t{phone_item[0]}\t{phone_item[1]}\n')
    return ''.join(pieces)
# Load the scorer, the prompt list and the default FeedbackConverter arguments
# once at start-up; the callbacks defined in this file read them as globals.
scorer, prompts, fb_params = setup()
# NOTE(review): the indentation of this section (i.e. which components sit in
# which Row/Column) was reconstructed from a copy of the file that had lost
# its leading whitespace — confirm the nesting against the intended layout.
bl = gr.Blocks()
with bl:
    # Page title.
    gr.Markdown(
        """
## Framburðarþjálfun [Nýrómur]
""")
    #setup user-adjustable feedback calibration
    fb_lb, fb_ub = fb_params['lb'], fb_params['ub']
    # Values held in gr.State and handed to the callbacks: the active
    # FeedbackConverter and the (task_id, normalised text) of the current
    # prompt, which is (None, None) until a prompt has been chosen.
    fb = gr.State(FeedbackConverter(fb_params['t_key'], fb_params['p_key'], fb_lb, fb_ub))
    current_prompt = gr.State((None,None))
    # Prompt row: refresh button next to the instructions and the text to read.
    with gr.Row(equal_height=True):
        new_prompt_button = gr.Button("🔃 Endurnýja texta ⮕",scale=0)
        with gr.Column():
            prompt_instructions = gr.Markdown("""Lestu eftirfarandi texta:""")
            prompt_text = gr.HighlightedText(value=[("[--- dæmi ---]","hl")],
                color_map={"hl": "#A8DADC"},
                show_legend=False, show_label=False,show_inline_category=False)
    #with gr.Row(equal_height=True):
    #with gr.Column():
    # Microphone recorder; the callback receives the recording as a file path.
    user_speech = gr.Audio(sources=["microphone"],type="filepath",
        waveform_options=gr.WaveformOptions(sample_rate=16000),
        label="Lestu upp texta og vistaðu upptöku")
    #with gr.Column(scale=0):
    score_speech_button = gr.Button("⇩⇩⇩ Greina upptökuna ⇩⇩⇩")
    # Result row: overall score on the left, debugging blocks plot (hidden by
    # default) on the right.
    with gr.Row(equal_height=True):
        with gr.Column(scale=1):
            placeholder_output = gr.Markdown("## Einkunn/gæði framburðar: ...")
        with gr.Column(scale=3):
            blocks_output = gr.Plot(visible=False)
            blocks_info = gr.Markdown("""*This colour feedback was not calibrated by exercise/phone,
relative colours may mismatch other scores""",visible=False)
    # Main feedback plot.
    plot_output = gr.Plot()
    # Legend row: colour key for the plot, and the checkbox that toggles the
    # debugging tools.
    with gr.Row():
        with gr.Column(scale=2):
            #phone_output_keyinfo = gr.Markdown("""### Lykill: <span style='color:#26701C;'>Meira nálægt</span>, <span style='color:#E85907;'>Minna nálægt</span>, <span style='color:#BBBBBB;'>Ekki hægt að greina (of stutt hljóð)</span>, <span style='color:#88447F;'>••• Ítónun (tónhæð)</span>""")
            phone_output_keyinfo = gr.Markdown("""### Lykill: <span style='color:#26701C;'>Nálægt réttum framburði</span>, <span style='color:#E85907;'>Fjarri réttum framburði</span>, <span style='color:#BBBBBB;'>Ekki hægt að greina (of stutt hljóð)</span>, <span style='color:#88447F;'>••• Ítónun (tónhæð)</span>""")
        with gr.Column(scale=1):
            dev_checkbox = gr.Checkbox(label="[Sýna villuleitarverkfæri]", value=False)
            modeltype_info = gr.Markdown(_disp_scorertype('...'),visible=False)
    # Calibration controls; all start hidden and are revealed by dev_opts_fn.
    with gr.Row():
        #modeltype_info = gr.Markdown(_disp_scorertype('...'),visible=False) # doesnt fit on mobile
        with gr.Column():
            calibrate_slider = RangeSlider(minimum=-1, maximum=1, value=(fb_lb, fb_ub), interactive=True, visible=False)
            cali_title = gr.Markdown("""#### ◭ Re-calibrate scoring""", visible=False)
            cali_info=gr.Markdown("""Too easy, always 100? Raise the maximum. Too hard, always 0? Lower the minimum.
### Re-calibration does not affect Monophone model.
""", visible=False)
    # Event wiring. The order of each `outputs` list must match the order of
    # the values returned by the corresponding callback.
    # New prompt: store it, show it, and clear the previous recording/results.
    new_prompt_button.click(refresh_prompt_fn,
        inputs = [],
        outputs = [current_prompt, prompt_text, prompt_instructions, user_speech, modeltype_info, placeholder_output, plot_output, blocks_output])
    # Debugging checkbox: toggle the extra components and reset calibration.
    dev_checkbox.input(dev_opts_fn,
        inputs = [dev_checkbox,phone_output_keyinfo],
        outputs = [phone_output_keyinfo, modeltype_info, calibrate_slider, fb, cali_title, cali_info, blocks_output, blocks_info])
    # Slider released: replace the FeedbackConverter with one using the new bounds.
    calibrate_slider.release(recal_fb_fn,
        inputs = [calibrate_slider],
        outputs = [fb] )
    # Score the current recording against the current prompt.
    score_speech_button.click(score_speech_fn,
        inputs=[current_prompt, user_speech, fb, dev_checkbox],
        outputs = [placeholder_output, modeltype_info, plot_output, blocks_output])
    #outputs = [placeholder_output, phone_output, modeltype_info, plot_output])
# Start the Gradio server.
bl.launch()
|