Spaces:

clr
/

nyro-dev-p

Sleeping

App Files Files Community

nyro-dev-p / app.py

clr

Update app.py

7994434 verified 4 days ago

raw

history blame contribute delete

12.5 kB

	import gradio as gr
	import subprocess, os, sys, pickle, random, librosa, huggingface_hub
	sys.path.append('./pronunciation-score-icelandic')
	from captinialign import makeAlign
	from captiniscore import PronunciationScorer
	#from captinifeedback import FeedbackConverter
	from hffeedback import FeedbackConverter
	from os.path import basename, splitext
	from gradio_rangeslider import RangeSlider


	def setup():
	    """Download the model assets, build the scorer and load the prompt list.

	    Returns:
	        A 3-tuple ``(scorer, prompts, fb_inits)``:

	        * ``scorer`` -- the ``PronunciationScorer`` built from the downloaded
	          scoring models and speech featurizer.
	        * ``prompts`` -- a list of tuples, one per non-blank line of the task
	          text file, holding that line's tab-separated fields.
	        * ``fb_inits`` -- dict with keys ``'t_key'``, ``'p_key'``, ``'lb'``
	          and ``'ub'``: the task/phone key paths and the default lower and
	          upper score bounds.
	    """
	    # Speech embedding model and layer must match the pre-computed scoring models.
	    # The featurizer is fetched as a local snapshot instead of being loaded by
	    # repo id: per the original author's note, with kaldi==5.5.1016 w2v2 seemed
	    # to need a local copy before the pretrained model could be loaded.
	    speech_featurizer_path = huggingface_hub.snapshot_download(
	        repo_id='language-and-voice-lab/wav2vec2-large-xlsr-53-icelandic-ep30-967h',
	        repo_type='model')
	    speech_featurizer_layer = 8

	    models_data_dir = huggingface_hub.snapshot_download(
	        repo_id='clr/captini-scoring-references',
	        repo_type='dataset')  # local_dir="path/to/folder"

	    task_scoring_models = os.path.join(models_data_dir, 'models/task_models_w2v2-IS-30e967h_l8_3EP49G/')
	    task_key_path = os.path.join(models_data_dir, 'models/task_key_3EP49G.json')

	    phone_key_path = os.path.join(models_data_dir, 'models/phone_key_3EP49G.tsv')
	    monophone_reference_feat_path = os.path.join(models_data_dir, 'models/monophones/w2v2-IS-30e967h_SPLIT1.pickle')

	    task_text_path = os.path.join(models_data_dir, 'models/task2text.txt')

	    # Define constants for converting pronunciation scores
	    # to user feedback
	    lower_bound_100 = -0.1
	    upper_bound_100 = 0.03  # 0.02

	    # PronunciationScorer takes considerable time to initialise,
	    # due to loading the w2v2 featurizer.
	    # After the first loading, it quickly scores each new user input speech.
	    # It's faster on GPU.
	    # Do not re-load a new w2v2 featurizer each time a user speaks.
	    scorer = PronunciationScorer(
	        task_scoring_models,
	        speech_featurizer_path,
	        speech_featurizer_layer,
	        task_text_path,
	        monophone_reference_feat_path)

	    # Parameters for building FeedbackConverter instances later; the converter
	    # itself is created by the callers so it can be re-calibrated per session.
	    fb_inits = {'t_key': task_key_path,
	                'p_key': phone_key_path,
	                'lb': lower_bound_100,
	                'ub': upper_bound_100}

	    # The prompt file holds Icelandic text, so decode it explicitly as UTF-8
	    # instead of relying on the locale default. Blank lines are skipped:
	    # refresh_prompt_fn unpacks every prompt into three fields and would fail
	    # on the one-element tuple an empty line produces.
	    with open(task_text_path, 'r', encoding='utf-8') as handle:
	        prompts = [tuple(line.split('\t'))
	                   for line in handle.read().splitlines()
	                   if line.strip()]

	    return scorer, prompts, fb_inits


	# Pick a random prompt and clear everything left over from the last attempt.
	def refresh_prompt_fn():
	    """Choose a new random prompt and return reset values for the UI.

	    Returns:
	        A list of eight values, in the order of the outputs wired to the
	        new-prompt button: the ``(task_id, normed_text)`` state, an update
	        showing the sentence as highlighted text, the speaking instructions,
	        ``None`` for the recording, the placeholder scoring-model line, the
	        placeholder score heading, and ``None`` for each of the two plots.
	    """
	    task_id, sentence, normed_text = random.choice(prompts)
	    tips = _user_instructions(sentence)

	    prompt_state = (task_id, normed_text)
	    highlighted_sentence = gr.update(value=[(sentence, "hl")])
	    cleared_recording = None
	    cleared_plot = None
	    cleared_blocks = None

	    return [
	        prompt_state,
	        highlighted_sentence,
	        tips,
	        cleared_recording,
	        _disp_scorertype('...'),
	        "## Einkunn/gæði framburðar: ...",
	        cleared_plot,
	        cleared_blocks,
	    ]

	# heuristic for different instructions
	# based on how users should speak short vs longer prompts
	def _user_instructions(to_speak):
	if (len(to_speak.split(' '))<2) or ' – ' in to_speak:
	return """Lestu eftirfarandi texta. Talaðu skýrt:"""
	else:
	return """Lestu eftirfarandi texta eins og þu segir í venjulegu samtali:"""


	def recal_fb_fn(cslider):
	    """Build a FeedbackConverter from the bounds chosen on the slider.

	    Args:
	        cslider: the range slider value, a ``(lower, upper)`` pair.

	    Returns:
	        A new ``FeedbackConverter`` using the default task/phone key paths
	        from ``fb_params`` together with the slider's bounds.
	    """
	    new_lower, new_upper = cslider
	    task_key = fb_params['t_key']
	    phone_key = fb_params['p_key']
	    return FeedbackConverter(task_key, phone_key, new_lower, new_upper)


	def score_speech_fn(current_prompt, user_wav, fbc, devopts):
	    """Align and score one recording and build the values for the UI outputs.

	    Args:
	        current_prompt: the ``(task_id, normed_text)`` pair held in the prompt
	            state; it is ``(None, None)`` until a prompt has been chosen.
	        user_wav: file path of the recording, or ``None`` when nothing has
	            been recorded yet.
	        fbc: the ``FeedbackConverter`` used to turn scores into feedback.
	        devopts: truthy when the debugging tools are switched on.

	    Returns:
	        A 4-tuple: the score heading (markdown), the scoring-model line
	        (markdown), the main feedback plot (or ``None``), and the blocks plot
	        when ``devopts`` is on, otherwise an update that clears it.
	    """
	    task_id, _norm_text = current_prompt

	    # Nothing to score yet: without this guard a missing prompt or recording
	    # surfaces as an unexplained exception from the scorer or from librosa.
	    if task_id is None or not user_wav:
	        return ("## Please choose a text and record yourself before analysing.",
	                _disp_scorertype('...'),
	                None,
	                None if devopts else gr.update(value=None))

	    task_text, task_model = scorer.task_scorer(task_id)
	    print(user_wav)
	    user_wav_duration = librosa.get_duration(path=user_wav)

	    word_aligns, phone_aligns = makeAlign(
	        task_text,
	        user_wav,
	        user_wav_duration,
	        splitext(basename(user_wav))[0],
	        './pronunciation-score-icelandic/alignment/new/',
	        './pronunciation-score-icelandic/alignment/captini_pretrained_aligner/'
	    )

	    print('TASK ID', task_id, 'TEXT:', task_text)

	    if word_aligns:
	        word_scores, phone_scores = scorer.score_one(
	            task_model,
	            user_wav,
	            word_aligns,
	            phone_aligns)

	        print('feedback with:', fbc.lower_bound_100, fbc.upper_bound_100)

	        # Only the task-level and phone-level feedback are displayed.
	        task_feedback, _word_feedback, phone_feedback = fbc.convert(
	            word_scores,
	            phone_scores,
	            task_id)

	        disp_fb = f'## Einkunn/gæði framburðar: {task_feedback}'
	        disp_plot1 = fbc.generate_graphic_feedback_0(
	            user_wav, word_aligns, phone_aligns, phone_feedback, devopts)
	        disp_blocksplot = fbc.generate_graphic_feedback_blocks(phone_scores)
	    else:
	        # An empty alignment means the aligner produced nothing usable.
	        disp_fb = (
	            "Failure. If this happens every time try a different device/browser, "
	            "Kaldi speech processor does not work on some (older?) devices, "
	            "this should not be a possible issue on Spaces but unfortunately it is."
	        )
	        disp_plot1 = None
	        disp_blocksplot = None

	    # The blocks plot is only shown with the debugging tools on; otherwise
	    # send an update that clears whatever it held.
	    blocks_value = disp_blocksplot if devopts else gr.update(value=None)
	    return disp_fb, _disp_scorertype(task_model[1]), disp_plot1, blocks_value

	#return disp_fb, disp_fb2, _disp_scorertype(task_model[1]), disp_plot1

	def _disp_scorertype(scorertype):
	if scorertype == 'task':
	scoring_model_type = 'Full'
	elif scorertype == 'phone':
	scoring_model_type = 'Monophone'
	else:
	scoring_model_type = '...'
	return f"### Scoring model for this exercise was: [{scoring_model_type}]"

	# toggle on/off some components
	def dev_opts_fn(check_box, phone_output_keyinfo):
	    """Show or hide the debugging components and reset the calibration.

	    Args:
	        check_box: the checkbox value; truthy when the extra tools have just
	            been switched on.
	        phone_output_keyinfo: the current markdown text of the colour key.

	    Returns:
	        A list of eight values in the order of the outputs wired to the
	        checkbox: the new colour-key text, an update resetting the
	        scoring-model line, an update resetting the calibration slider to the
	        default bounds, a fresh ``FeedbackConverter`` with the default bounds,
	        and four visibility updates for the remaining components.
	    """
	    energy_key = """, <span style='color:#F49098;'>--- Hljóðstyrkur (root mean square energy)</span>"""
	    reset_fb = FeedbackConverter(fb_params['t_key'], fb_params['p_key'], fb_params['lb'], fb_params['ub'])
	    new_vis = bool(check_box)

	    # Strip the energy entry only when it is really there, then add it back
	    # if the tools are on. Blind slicing would cut real key text when the
	    # suffix is missing, and blind appending could duplicate the entry.
	    base_key = phone_output_keyinfo
	    if base_key.endswith(energy_key):
	        base_key = base_key[:-len(energy_key)]
	    new_key = base_key + energy_key if new_vis else base_key

	    return [new_key,
	            gr.update(value=_disp_scorertype('...'), visible=new_vis),
	            gr.update(value=(fb_params['lb'], fb_params['ub']), visible=new_vis),
	            reset_fb,
	            gr.update(visible=new_vis),
	            gr.update(visible=new_vis),
	            gr.update(visible=new_vis),
	            gr.update(visible=new_vis)]


	def display0(score_output):
	    """Render task, word and phone feedback as tab-separated plain text.

	    Args:
	        score_output: dict with keys ``'task_feedback'``, ``'word_feedback'``
	            and ``'phone_feedback'``. Word and phone feedback are walked in
	            parallel; each entry's first item must match between the two.

	    Returns:
	        The task feedback, a ``--`` separator line, then one line per word
	        followed by one tab-indented line per phone of that word.

	    Raises:
	        AssertionError: if a word entry and its phone entry disagree on
	            their first item.
	    """
	    task_fb = score_output['task_feedback']
	    word_entries = score_output['word_feedback']
	    phone_entries = score_output['phone_feedback']

	    chunks = [f'{task_fb}', '\n--\n']
	    for word_entry, phone_entry in zip(word_entries, phone_entries):
	        assert word_entry[0] == phone_entry[0]
	        chunks.append(f'{word_entry[0]}\t{word_entry[1]}--\n')
	        chunks.extend(f'\t{phone[0]}\t{phone[1]}\n' for phone in phone_entry[1])

	    return ''.join(chunks)


	# Build the shared objects once at start-up; the handler functions above
	# read `scorer`, `prompts` and `fb_params` as module-level globals.
	scorer, prompts, fb_params = setup()


	# NOTE(review): the Row/Column nesting below is reconstructed from the order
	# of the components -- confirm it against the deployed layout.
	bl = gr.Blocks()
	with bl:


	    # Page title.
	    gr.Markdown(
	        """
	## Framburðarþjálfun [Nýrómur]

	""")

	    #setup user-adjustable feedback calibration
	    fb_lb, fb_ub = fb_params['lb'], fb_params['ub']
	    # State holding the FeedbackConverter; replaced by recal_fb_fn and dev_opts_fn.
	    fb = gr.State(FeedbackConverter(fb_params['t_key'], fb_params['p_key'], fb_lb, fb_ub))

	    # State holding the (task_id, normed_text) pair set by refresh_prompt_fn;
	    # (None, None) until the first prompt is chosen.
	    current_prompt = gr.State((None,None))
	    with gr.Row(equal_height=True):
	        new_prompt_button = gr.Button("🔃 Endurnýja texta ⮕",scale=0)
	        with gr.Column():
	            prompt_instructions = gr.Markdown("""Lestu eftirfarandi texta:""")
	            prompt_text = gr.HighlightedText(value=[("[--- dæmi ---]","hl")],
	                color_map={"hl": "#A8DADC"},
	                show_legend=False, show_label=False,show_inline_category=False)


	    #with gr.Row(equal_height=True):
	    #with gr.Column():
	    # Microphone recording, handed to the scoring handler as a file path.
	    user_speech = gr.Audio(sources=["microphone"],type="filepath",
	        waveform_options=gr.WaveformOptions(sample_rate=16000),
	        label="Lestu upp texta og vistaðu upptöku")
	    #with gr.Column(scale=0):
	    score_speech_button = gr.Button("⇩⇩⇩ Greina upptökuna ⇩⇩⇩")

	    with gr.Row(equal_height=True):
	        with gr.Column(scale=1):
	            # Score heading; overwritten by score_speech_fn and refresh_prompt_fn.
	            placeholder_output = gr.Markdown("## Einkunn/gæði framburðar: ...")
	        with gr.Column(scale=3):
	            # Hidden until the debugging tools are switched on (see dev_opts_fn).
	            blocks_output = gr.Plot(visible=False)
	            blocks_info = gr.Markdown("""*This colour feedback was not calibrated by exercise/phone,
	relative colours may mismatch other scores""",visible=False)

	    # Main graphic feedback plot.
	    plot_output = gr.Plot()

	    with gr.Row():
	        with gr.Column(scale=2):
	            #phone_output_keyinfo = gr.Markdown("""### Lykill: <span style='color:#26701C;'>Meira nálægt</span>, <span style='color:#E85907;'>Minna nálægt</span>, <span style='color:#BBBBBB;'>Ekki hægt að greina (of stutt hljóð)</span>, <span style='color:#88447F;'>••• Ítónun (tónhæð)</span>""")
	            # Colour key for the plot; dev_opts_fn appends/removes an extra entry.
	            phone_output_keyinfo = gr.Markdown("""### Lykill: <span style='color:#26701C;'>Nálægt réttum framburði</span>, <span style='color:#E85907;'>Fjarri réttum framburði</span>, <span style='color:#BBBBBB;'>Ekki hægt að greina (of stutt hljóð)</span>, <span style='color:#88447F;'>••• Ítónun (tónhæð)</span>""")

	        with gr.Column(scale=1):
	            # Checkbox that toggles the debugging tools below.
	            dev_checkbox = gr.Checkbox(label="[Sýna villuleitarverkfæri]", value=False)


	    # Debugging components: all start hidden and are revealed by dev_opts_fn.
	    modeltype_info = gr.Markdown(_disp_scorertype('...'),visible=False)
	    with gr.Row():
	        #modeltype_info = gr.Markdown(_disp_scorertype('...'),visible=False) # doesnt fit on mobile
	        with gr.Column():
	            calibrate_slider = RangeSlider(minimum=-1, maximum=1, value=(fb_lb, fb_ub), interactive=True, visible=False)
	            cali_title = gr.Markdown("""#### ◭ Re-calibrate scoring""", visible=False)
	            cali_info=gr.Markdown("""Too easy, always 100? Raise the maximum. Too hard, always 0? Lower the minimum.

	### Re-calibration does not affect Monophone model.
	""", visible=False)

	    # New prompt: replaces the prompt state/text and clears the previous results.
	    new_prompt_button.click(refresh_prompt_fn,
	        inputs = [],
	        outputs = [current_prompt, prompt_text, prompt_instructions, user_speech, modeltype_info, placeholder_output, plot_output, blocks_output])

	    # Debugging tools toggle: the outputs list must stay in the order of the
	    # eight values returned by dev_opts_fn.
	    dev_checkbox.input(dev_opts_fn,
	        inputs = [dev_checkbox,phone_output_keyinfo],
	        outputs = [phone_output_keyinfo, modeltype_info, calibrate_slider, fb, cali_title, cali_info, blocks_output, blocks_info])

	    # Slider released: store a FeedbackConverter built with the new bounds.
	    calibrate_slider.release(recal_fb_fn,
	        inputs = [calibrate_slider],
	        outputs = [fb] )

	    # Score the recording against the current prompt.
	    score_speech_button.click(score_speech_fn,
	        inputs=[current_prompt, user_speech, fb, dev_checkbox],
	        outputs = [placeholder_output, modeltype_info, plot_output, blocks_output])
	    #outputs = [placeholder_output, phone_output, modeltype_info, plot_output])


	bl.launch()