clr committed on
Commit
f2fb25b
·
1 Parent(s): 2c42716

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +237 -0
app.py ADDED
@@ -0,0 +1,237 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import subprocess, os, sys, pickle, random, librosa, huggingface_hub
3
+ sys.path.append('./pronunciation-score-icelandic')
4
+ from captinialign import makeAlign
5
+ from captiniscore import PronunciationScorer
6
+ #from captinifeedback import FeedbackConverter
7
+ from hffeedback import FeedbackConverter
8
+ from os.path import basename, splitext
9
+ from gradio_rangeslider import RangeSlider
10
+
11
+
12
def setup():
    """Download the model assets and build the objects shared by all requests.

    Returns:
        A ``(scorer, prompts, fb_inits)`` tuple:

        * ``scorer`` -- the ``PronunciationScorer`` built from the downloaded
          featurizer snapshot and the scoring-reference files.
        * ``prompts`` -- one tuple per non-blank line of the task text file,
          holding that line's tab-separated fields.
        * ``fb_inits`` -- dict with keys ``'t_key'``, ``'p_key'``, ``'lb'``
          and ``'ub'``: the arguments later handed to ``FeedbackConverter``.
    """
    # Speech embedding model and layer must match the pre-computed scoring models.
    # The featurizer is fetched as a local snapshot instead of being loaded by
    # repo name: the original author noted that with kaldi==5.5.1016 the w2v2
    # model apparently needs a local copy before the pretrained weights load.
    speech_featurizer_path = huggingface_hub.snapshot_download(
        repo_id='language-and-voice-lab/wav2vec2-large-xlsr-53-icelandic-ep30-967h',
        repo_type='model')
    speech_featurizer_layer = 8

    models_data_dir = huggingface_hub.snapshot_download(
        repo_id='clr/captini-scoring-references',
        repo_type='dataset')

    task_scoring_models = os.path.join(
        models_data_dir, 'models/task_models_w2v2-IS-30e967h_l8_3EP49G/')
    task_key_path = os.path.join(models_data_dir, 'models/task_key_3EP49G.json')

    phone_key_path = os.path.join(models_data_dir, 'models/phone_key_3EP49G.tsv')
    monophone_reference_feat_path = os.path.join(
        models_data_dir, 'models/monophones/w2v2-IS-30e967h_SPLIT1.pickle')

    task_text_path = os.path.join(models_data_dir, 'models/task2text.txt')

    # Default bounds used when converting pronunciation scores to user
    # feedback; the user can change them later through the range slider.
    lower_bound_100 = -0.1
    upper_bound_100 = 0.03

    # PronunciationScorer takes considerable time to initialise,
    # due to loading the w2v2 featurizer.
    # After the first loading, it quickly scores each new user input speech.
    # It's faster on GPU.
    # Do not re-load a new w2v2 featurizer each time a user speaks.
    scorer = PronunciationScorer(
        task_scoring_models,
        speech_featurizer_path,
        speech_featurizer_layer,
        task_text_path,
        monophone_reference_feat_path)

    # Only the constructor arguments are returned here; the FeedbackConverter
    # itself is built by the caller so it can be rebuilt with new bounds.
    fb_inits = {'t_key': task_key_path,
                'p_key': phone_key_path,
                'lb': lower_bound_100,
                'ub': upper_bound_100}

    # The prompts are Icelandic text, so decode explicitly as UTF-8 instead of
    # relying on the platform default. Blank lines are skipped because they
    # would yield a 1-tuple that the prompt picker cannot unpack.
    with open(task_text_path, 'r', encoding='utf-8') as handle:
        prompts = [tuple(line.split('\t'))
                   for line in handle.read().splitlines()
                   if line.strip()]

    return scorer, prompts, fb_inits
65
+
66
+
67
def refresh_prompt_fn():
    """Pick a random prompt and reset the UI for a fresh attempt.

    Returns six values, in the order of the refresh button's outputs: the new
    ``(task_id, normed_text)`` prompt state, the sentence to display, ``None``
    to clear the recorded audio, the placeholder scoring-model line, and
    ``None`` for each of the two plots.
    """
    chosen_prompt = random.choice(prompts)
    task_id, sentence, normed_text = chosen_prompt
    prompt_state = (task_id, normed_text)
    return prompt_state, sentence, None, _disp_scorertype('...'), None, None
71
+
72
+
73
def recal_fb_fn(cslider):
    """Build a new FeedbackConverter from the range slider's two bounds.

    ``cslider`` is the slider value, a ``(lower, upper)`` pair. The key file
    paths come from the module-level ``fb_params``.
    """
    new_lower, new_upper = cslider
    return FeedbackConverter(
        fb_params['t_key'],
        fb_params['p_key'],
        new_lower,
        new_upper)
76
+
77
+
78
def score_speech_fn(current_prompt, user_wav, fbc):
    """Align, score and visualise one recording against the current prompt.

    Args:
        current_prompt: ``(task_id, normalised_text)`` pair as produced by
            ``refresh_prompt_fn``; it is ``(None, None)`` until the user has
            drawn a prompt.
        user_wav: Path of the recorded or uploaded audio file, or ``None``
            when the audio component is empty.
        fbc: The ``FeedbackConverter`` currently held in the UI state.

    Returns:
        ``(feedback_markdown, scoring_model_markdown, waveform_plot,
        blocks_plot)``. Both plots are ``None`` whenever no score could be
        produced (missing prompt, missing audio, or failed alignment).
    """
    task_id, _norm_text = current_prompt

    # Guard clauses: the score button can be pressed before a prompt has been
    # drawn or before anything was recorded. Without these checks the scorer
    # lookup or the audio duration probe would raise on the None values.
    if task_id is None:
        return ("No exercise text selected yet. "
                "Press the refresh button to get a text first.",
                _disp_scorertype('...'), None, None)
    if not user_wav:
        return ("No recording found. "
                "Record or upload your speech, then try again.",
                _disp_scorertype('...'), None, None)

    task_text, task_model = scorer.task_scorer(task_id)
    print(user_wav)
    user_wav_duration = librosa.get_duration(path=user_wav)

    word_aligns, phone_aligns = makeAlign(
        task_text,
        user_wav,
        user_wav_duration,
        splitext(basename(user_wav))[0],
        './pronunciation-score-icelandic/alignment/new/',
        './pronunciation-score-icelandic/alignment/captini_pretrained_aligner/'
    )

    print('TASK ID', task_id, 'TEXT:', task_text)

    # task_model[1] is what _disp_scorertype maps to 'Full' / 'Monophone'.
    modeltype_line = _disp_scorertype(task_model[1])

    # An empty alignment means the aligner could not process this audio.
    if not word_aligns:
        failure = "Failure. If this happens every time try a different device/browser, "
        failure += "Kaldi speech processor does not work on some (older?) devices, "
        failure += "this should not be a possible issue on Spaces but unfortunately it is."
        return failure, modeltype_line, None, None

    word_scores, phone_scores = scorer.score_one(
        task_model,
        user_wav,
        word_aligns,
        phone_aligns)

    print('feedback with:', fbc.lower_bound_100, fbc.upper_bound_100)

    # Word-level feedback is produced by convert() but not shown in this UI.
    task_feedback, _word_feedback, phone_feedback = fbc.convert(
        word_scores,
        phone_scores,
        task_id)

    disp_fb = f'## Einkunn/gæði: {task_feedback}'

    disp_plot1 = fbc.generate_graphic_feedback_0(
        user_wav, word_aligns, phone_aligns, phone_feedback)
    disp_blocksplot = fbc.generate_graphic_feedback_blocks(phone_scores)

    return disp_fb, modeltype_line, disp_plot1, disp_blocksplot
148
+
149
+ def _disp_scorertype(scorertype):
150
+ if scorertype == 'task':
151
+ scoring_model_type = 'Full'
152
+ elif scorertype == 'phone':
153
+ scoring_model_type = 'Monophone'
154
+ else:
155
+ scoring_model_type = '...'
156
+ return f"### Scoring model for this exercise was: [{scoring_model_type}]"
157
+
158
def display0(score_output):
    """Render task, word and phone feedback as a tab-indented text report.

    Args:
        score_output: Mapping with the keys ``'task_feedback'``,
            ``'word_feedback'`` and ``'phone_feedback'``. Word entries are
            indexed as ``(word, feedback)``; phone entries as
            ``(word, phones)`` where each phone is indexed as a pair.

    Returns:
        The task feedback, a ``--`` separator line, then for each word a
        ``word<TAB>feedback--`` line followed by one
        ``<TAB>first<TAB>second`` line per phone of that word.

    Raises:
        AssertionError: If a word entry and the phone entry at the same
            position refer to different words.
    """
    task_feedback = score_output['task_feedback']
    word_feedback = score_output['word_feedback']
    phone_feedback = score_output['phone_feedback']

    # Collect the pieces and join once instead of growing a string with +=.
    pieces = [f'{task_feedback}', '\n--\n']
    for word_entry, phone_entry in zip(word_feedback, phone_feedback):
        # Explicit raise (same exception type as the former assert) so the
        # consistency check still runs when Python is started with -O.
        if word_entry[0] != phone_entry[0]:
            raise AssertionError(
                f'word/phone feedback mismatch: '
                f'{word_entry[0]!r} != {phone_entry[0]!r}')
        pieces.append(f'{word_entry[0]}\t{word_entry[1]}--\n')
        pieces.extend(f'\t{phone[0]}\t{phone[1]}\n' for phone in phone_entry[1])

    return ''.join(pieces)
172
+
173
+
174
# Build the scorer, the prompt list and the default feedback settings once at
# start-up; the callback functions read these module-level names.
scorer, prompts, fb_params = setup()


with gr.Blocks() as bl:

    gr.Markdown(
        """
## Framburðarþjálfun [Nýrómur]

""")

    # Set up user-adjustable feedback calibration: the converter lives in
    # per-session state so the slider can swap in a re-calibrated instance.
    fb_lb, fb_ub = fb_params['lb'], fb_params['ub']
    fb = gr.State(
        FeedbackConverter(fb_params['t_key'], fb_params['p_key'], fb_lb, fb_ub))

    # (task_id, normed_text) of the prompt on screen; empty until first refresh.
    current_prompt = gr.State((None,None))

    with gr.Row(equal_height=True):
        new_prompt_button = gr.Button("🔃 ⮕ \nEndurnýja textann",scale=0)
        prompt_text = gr.Textbox(
            label="Texti",
            value="[--- stuttur æfingatexti ---]",
            interactive=False)

    # Audio is handed to the callbacks as a file path.
    user_speech = gr.Audio(
        sources=["upload", "microphone"],
        type="filepath",
        waveform_options=gr.WaveformOptions(sample_rate=16000),
        label="Lestu textann upphátt og vistaðu upptöku")
    score_speech_button = gr.Button("⇩⇩⇩")

    # Overall grade on the left, per-phone colour blocks on the right.
    with gr.Row(equal_height=True):
        with gr.Column(scale=1):
            placeholder_output = gr.Markdown("## Einkunn/gæði: ...")
        with gr.Column(scale=3):
            blocks_output = gr.Plot()
            blocks_info = gr.Markdown("""*This colour feedback was not calibrated by exercise/phone,
relative colours may mismatch other scores""")

    plot_output = gr.Plot()

    with gr.Row():
        with gr.Column():
            modeltype_info = gr.Markdown(_disp_scorertype('...'))
            phone_output_keyinfo = gr.Markdown("""### Lykill: <span style='color:#0000FF;'>Rétt</span>, <span style='color:#FF0000;'>Rangt</span>, <span style='color:#BBBBBB;'>Ekki hægt að greina (of stutt hljóð)</span>, <span style='color:#8a2be2;'>••• Pitch</span>, <span style='color:#FFDAB9;'>--- Energy</span>""")
        calibrate_slider = RangeSlider(
            minimum=-1,
            maximum=1,
            value=(fb_lb, fb_ub),
            interactive=True)
        cali_info=gr.Markdown("""
#### ◄ Re-calibrate scoring

Too easy, always 100? Raise the maximum. Too hard, always 0? Lower the minimum.

### Note: Does not affect Monophone model.
""")

    # Event wiring. A new prompt also clears the recording and both plots.
    new_prompt_button.click(
        refresh_prompt_fn,
        inputs = [],
        outputs = [current_prompt, prompt_text, user_speech,
                   modeltype_info, plot_output, blocks_output])
    # Releasing the slider replaces the converter stored in session state.
    calibrate_slider.release(
        recal_fb_fn,
        inputs = [calibrate_slider],
        outputs = [fb])
    score_speech_button.click(
        score_speech_fn,
        inputs=[current_prompt, user_speech, fb],
        outputs = [placeholder_output, modeltype_info, plot_output, blocks_output])


bl.launch()
237
+