Spaces:

TlanextliOpenLab
/

MultilanguageTranscTransl

Sleeping

App Files Files Community

MultilanguageTranscTransl / app.py

Tlanextli

Update app.py

6b86b4a over 2 years ago

raw

history blame contribute delete

5.74 kB

	import os
	import gradio as gr
	import torch
	import numpy
	import librosa
	import languages_dic
	from transformers import WhisperForConditionalGeneration, WhisperProcessor, pipeline


	# Heading text handed to the Gradio interfaces as their title.
	title = "Multilanguage Transcription and Translation"

	# Human-readable list of supported input languages; appended verbatim to both descriptions below.
	availableLang = "Afrikaans, Arabic, Armenian, Azerbaijani, Belarusian, Bosnian, Bulgarian, Catalan, Chinese, Croatian, Czech, Danish, Dutch, English, Estonian, Finnish, French, Galician, German, Greek, Hebrew, Hindi, Hungarian, Icelandic, Indonesian, Italian, Japanese, Kannada, Kazakh, Korean, Latvian, Lithuanian, Macedonian, Malay, Marathi, Maori, Nepali, Norwegian, Persian, Polish, Portuguese, Romanian, Russian, Serbian, Slovak, Slovenian, Spanish, Swahili, Swedish, Tagalog, Tamil, Thai, Turkish, Ukrainian, Urdu, Vietnamese, and Welsh."

	# HTML description for the file-upload interface: two paragraphs followed by the language list.
	description1 = """<p style='font-size: 18px;'> Transcribe an audio file containing a speech in any of the languages listed below and translate it to English. </p>
	<p style='font-size: 16px;'> This demo uses the ASR system Whisper and runs on CPU basis hence responses might be slow. </p> \n
	""" + availableLang

	# HTML description for the microphone interface; differs from description1 only in its first paragraph.
	description2 ="""<p style='font-size: 18px;'> Transcribe a recording with your microphone of a speech in any of the languages listed below and translate it to English. </p>
	<p style='font-size: 16px;'> This demo uses the ASR system Whisper and runs on CPU basis hence responses might be slow. </p> \n
	""" + availableLang


	# Choose the compute device once at import time: the first CUDA GPU when
	# torch reports one, otherwise the CPU.
	if torch.cuda.is_available():
	    device = "cuda:0"
	else:
	    device = "cpu"
	#modelType = "openai/whisper-small"

	class LM:
	    """Registry of the Whisper objects used by the app, keyed by size name.

	    Attributes:
	        LMsizes: size names (suffixes of "openai/whisper-<size>") the app works with.
	        model: dict mapping a size name to its model object.
	        processor: dict mapping a size name to its processor object.
	        pipe: dict mapping a size name to its speech-recognition pipeline.
	    """

	    # "tiny" and "large" were part of an earlier, now disabled, size list.
	    LMsizes = ["base", "small", "medium"]

	    def __init__(self):
	        # Created per instance: dicts declared in the class body would be
	        # one shared object for every LM instance.
	        self.model = {}
	        self.processor = {}
	        self.pipe = {}

	# Single registry instance shared by the request handlers below.
	myLM = LM()

	# Load every configured Whisper size once at start-up.
	for LMsize in myLM.LMsizes:
	    modelType = "openai/whisper-" + LMsize
	    whisperModel = WhisperForConditionalGeneration.from_pretrained(modelType).to(device)
	    whisperProcessor = WhisperProcessor.from_pretrained(modelType)
	    myLM.model[LMsize] = whisperModel
	    myLM.processor[LMsize] = whisperProcessor
	    # Hand the already-loaded objects to the pipeline. Passing the checkpoint
	    # name instead would load a second copy of the same weights.
	    myLM.pipe[LMsize] = pipeline(
	        task="automatic-speech-recognition",
	        model=whisperModel,
	        tokenizer=whisperProcessor.tokenizer,
	        feature_extractor=whisperProcessor.feature_extractor,
	        device=device,
	        chunk_length_s=29,
	        stride_length_s=[5, 0],
	    )



	def detect_language(audio_path, model, processor, asr_pipe_whisper):
	    """Return the name of the language detected in an audio file.

	    The predicted language is not available from the pipeline result, so it
	    is read from the token ids produced by ``model.generate``.

	    Args:
	        audio_path: path of the audio file to analyse.
	        model: model whose ``generate`` output contains the language token.
	        processor: object whose ``feature_extractor`` builds the model input.
	        asr_pipe_whisper: pipeline whose ``tokenizer`` decodes the language token.

	    Returns:
	        The entry of ``languages_dic.LANGUAGES`` for the detected language
	        code, or ``None`` when the code is not a key of that mapping.
	    """
	    # Load as 16 kHz mono and keep only the first 20 seconds of audio.
	    speech_data, sampling_rate = librosa.load(audio_path, sr=16000, mono=True, duration=20)
	    # Turn the raw samples into model input features on the active device.
	    input_features = processor.feature_extractor(
	        speech_data, return_tensors="pt", sampling_rate=sampling_rate
	    ).input_features.to(device)
	    predicted_ids = model.generate(input_features, task="transcribe")
	    # The second id of the first sequence is the language token, e.g. "<|de|>".
	    language_token = asr_pipe_whisper.tokenizer.decode(predicted_ids[0, 1])
	    # Drop the "<|" and "|>" delimiters ("<|de|>" -> "de") and look up the
	    # expanded name (presumably "german" for "de"; depends on languages_dic).
	    return languages_dic.LANGUAGES.get(language_token.strip("<|>"))


	# def transcribe(inputs):
	# # predicted_ids = model.generate(inputs, language="<|es|>", task="transcribe")
	# predicted_ids = model.generate(inputs, task="transcribe")
	# transcription = processor.tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)[0]
	# return transcription

	def getLM(modelsize):
	    """Return the model, processor and pipeline for a model-size label.

	    Args:
	        modelsize: label whose first whitespace-separated word is the size
	            name, e.g. "small -244M" selects "small".

	    Returns:
	        Tuple ``(myLM.model[size], myLM.processor[size], myLM.pipe[size])``.

	    Raises:
	        KeyError: if no model is registered under the extracted size name.
	    """
	    # split() without a separator ignores leading and repeated whitespace,
	    # so the size name is always the first real word of the label.
	    words = modelsize.split()
	    size = words[0] if words else ""
	    if size not in myLM.model:
	        raise KeyError(f"Unknown model size {size!r}; available sizes: {sorted(myLM.model)}")
	    return (myLM.model[size], myLM.processor[size], myLM.pipe[size])


	def processAudio(audio_path, modelsize):
	    """Detect the language of an audio file, transcribe it and translate it.

	    Args:
	        audio_path: path of the audio file to process.
	        modelsize: model-size label, resolved through ``getLM``.

	    Returns:
	        Tuple ``(detected language, transcription text, translation text)``.

	    Raises:
	        gr.Error: if no audio input was provided.
	    """
	    # Without this guard a missing input fails inside the pipeline with an
	    # error that says nothing useful to the user.
	    if not audio_path:
	        raise gr.Error("Please provide an audio input before submitting.")
	    model, processor, asr_pipe_whisper = getLM(modelsize)
	    # Two passes over the same audio with the same pipeline: one per task.
	    translation = asr_pipe_whisper(audio_path, max_new_tokens=256, generate_kwargs={"task": "translate"})
	    transcription = asr_pipe_whisper(audio_path, generate_kwargs={"task": "transcribe"})
	    inputLang = detect_language(audio_path, model, processor, asr_pipe_whisper)
	    return (inputLang, transcription["text"], translation["text"])


	# Help text shown with the model-size selector.
	modelsizeInfo = "Try out the performance for different model sizes. Larger models are more robust and deliver better results but are also slower."


	def _buildInterface(audioSource, description):
	    """Build one transcription/translation interface.

	    Args:
	        audioSource: value for the ``source`` argument of ``gr.Audio``
	            ("upload" or "microphone").
	        description: HTML description shown with the interface.

	    Returns:
	        A ``gr.Interface`` that calls ``processAudio`` with the audio file
	        path and the selected model-size label.
	    """
	    return gr.Interface(
	        fn=processAudio,
	        inputs=[
	            gr.Audio(source=audioSource, type="filepath", label="Audio Input"),
	            # Only the first word of each label is used to select the model.
	            gr.Radio(
	                ["base - 74M", "small -244M", "medium - 769M"],
	                label="Select the model size",
	                info=modelsizeInfo,
	                value="small -244M",
	            ),
	        ],
	        outputs=[
	            gr.Textbox(label="Detected input language"),
	            gr.Textbox(label="Transcription"),
	            gr.Textbox(label="Translation to english"),
	        ],
	        title=title,
	        description=description,
	    )


	# Interface fed by an uploaded audio file.
	app1 = _buildInterface("upload", description1)

	# Interface fed by a microphone recording.
	app2 = _buildInterface("microphone", description2)

	# Present both interfaces as tabs of a single application.
	demo = gr.TabbedInterface(
	    interface_list=[app1, app2],
	    tab_names=["Audio File", "Microphone"],
	)

	# Start the web app only when this file is executed as a script.
	if __name__ == "__main__":
	    demo.launch()