Spaces:

RASMUS
/

Finnish-Audio-to-Text

Runtime error

App Files Files Community

Finnish-Audio-to-Text / app.py

RASMUS

update to use fi language flag

4487b3c about 2 years ago

raw

history blame

7.69 kB

	import os
	import time
	import gradio as gr
	from pathlib import Path
	import pysrt
	import pandas as pd

	# One-time bootstrap: fetch and build whisper.cpp and download the Finnish
	# ggml models, unless a whisper.cpp directory already exists in the current
	# working directory.
	if os.path.isdir(os.path.join(os.getcwd(), 'whisper.cpp')):
	    print("Models already loaded")
	else:
	    os.system('git clone https://github.com/ggerganov/whisper.cpp.git')
	    # Run the reset inside the fresh clone (-C); a bare `git reset` acts on
	    # whatever repository the current working directory belongs to, which
	    # leaves the whisper.cpp checkout unpinned.
	    os.system('git -C ./whisper.cpp reset --hard 3163090d89c47933d7c2a080b224f0d2e842b468')
	    os.system('git clone https://huggingface.co/Finnish-NLP/Finnish-finetuned-whisper-models-ggml-format')
	    os.system('make -C ./whisper.cpp')



	# Maps each selectable model name to the ggml weights file it loads.
	whisper_modelpath_translator = dict(
	    medium="./Finnish-finetuned-whisper-models-ggml-format/ggml-model-fi-medium.bin",
	    large="./Finnish-finetuned-whisper-models-ggml-format/ggml-model-fi-large-v3.bin",
	)
	# Model names offered to the user, in the mapping's insertion order.
	whisper_models = list(whisper_modelpath_translator)



	def _srt_timestamp(moment):
	    """Render a pysrt time value as zero-padded ``HH:MM:SS.mmm`` text."""
	    return (
	        f'{int(moment.hours):02d}:{int(moment.minutes):02d}:'
	        f'{int(moment.seconds):02d}.{int(moment.milliseconds):03d}'
	    )


	def speech_to_text(audio_path, whisper_model):
	    """Transcribe an audio file with whisper.cpp and return the cues as a table.

	    The input is converted with ffmpeg to a 16 kHz mono 16-bit PCM wav file
	    next to the original, whisper.cpp is run on it with the Finnish language
	    flag and SRT output enabled, and the resulting ``.srt`` file is parsed.

	    Args:
	        audio_path: Path of the audio file to transcribe, or ``None`` when no
	            audio was supplied.
	        whisper_model: Key of ``whisper_modelpath_translator`` selecting the
	            ggml model file.

	    Returns:
	        A pandas DataFrame with the string columns ``start``, ``end`` and
	        ``text``, one row per subtitle. It is empty when the SRT file could
	        not be parsed.

	    Raises:
	        ValueError: If ``audio_path`` is ``None`` or ``whisper_model`` is not
	            a known model name.
	        RuntimeError: If ffmpeg or the whisper.cpp binary cannot be run or
	            exits with a non-zero status.
	    """
	    import contextlib
	    import subprocess

	    # The argument cannot change while this function runs, so waiting and
	    # re-checking it is pointless: fail immediately.
	    if audio_path is None:
	        raise ValueError("Error no audio input")
	    print(audio_path)

	    model_path = whisper_modelpath_translator.get(whisper_model)
	    if model_path is None:
	        raise ValueError(f'Unknown whisper model: {whisper_model!r}')

	    # Derive the wav name from the stem. str.replace() on the extension would
	    # also rewrite earlier occurrences of it in the path and misbehaves
	    # when the extension is empty.
	    stem, file_ending = os.path.splitext(audio_path)
	    print(f'file enging is {file_ending}')
	    new_path = f'{stem}_converted.wav'

	    # Argument lists (no shell) keep file names with quotes or shell
	    # metacharacters from being interpreted as commands.
	    convert_cmd = [
	        'ffmpeg', '-i', audio_path, '-ar', '16000', '-y', '-ac', '1',
	        '-c:a', 'pcm_s16le', new_path,
	    ]
	    max_attempts = 3
	    print("starting conversion to wav")
	    for attempt in range(1, max_attempts + 1):
	        try:
	            subprocess.run(convert_cmd, check=True)
	            break
	        except (OSError, subprocess.CalledProcessError) as e:
	            if attempt == max_attempts:
	                raise RuntimeError(f'Error converting audio to wav: {e}') from e
	            print(f'Retrying, retry counter: {attempt}')
	            time.sleep(0.5)
	    print("conversion to wav ready")

	    # Remove any stale transcript so an old result is never parsed by mistake.
	    srt_path = new_path + ".srt"
	    with contextlib.suppress(FileNotFoundError):
	        os.remove(srt_path)

	    print("starting whisper c++")
	    whisper_cmd = [
	        './whisper.cpp/main', new_path, '-t', '4', '-m', model_path,
	        '-osrt', '-l', 'fi',
	    ]
	    try:
	        subprocess.run(whisper_cmd, check=True)
	    except (OSError, subprocess.CalledProcessError) as e:
	        raise RuntimeError(f'Error running Whisper cpp model: {e}') from e
	    print("starting whisper done with whisper")

	    # Parsing stays best-effort: a broken SRT file is reported on stdout and
	    # yields an empty table instead of an exception.
	    try:
	        rows = [
	            [_srt_timestamp(sub.start), _srt_timestamp(sub.end), sub.text]
	            for sub in pysrt.open(srt_path)
	        ]
	    except Exception as e:
	        print(f"Error creating srt df with error: {e}")
	        rows = []

	    return pd.DataFrame(rows, columns=['start', 'end', 'text'])

	def _render_cues(cues, decimal_mark):
	    """Return numbered subtitle blocks using *decimal_mark* before the milliseconds."""
	    other_mark = ',' if decimal_mark == '.' else '.'
	    return [
	        f'{number}\n'
	        f'{start.replace(other_mark, decimal_mark)} --> {stop.replace(other_mark, decimal_mark)}\n'
	        f'{text}'
	        for number, (start, stop, text) in enumerate(cues, start=1)
	    ]


	def output_to_files(df):
	    """Write the transcription table to ``subtitles.vtt`` and ``subtitles.srt``.

	    Both files are created in the current working directory, overwriting any
	    previous ones. Every row becomes one numbered cue. The WebVTT file starts
	    with the ``WEBVTT`` header followed by a blank line and uses ``.`` before
	    the milliseconds; the SRT file uses ``,`` there.

	    Args:
	        df: DataFrame with ``start``, ``end`` and ``text`` columns. It is not
	            modified.

	    Returns:
	        ``['subtitles.vtt', 'subtitles.srt']``.
	    """
	    print("Starting SRT-file creation")
	    print(df.head())

	    # Read the columns directly; resetting the index in place would add an
	    # extra column to the caller's frame.
	    cues = [
	        (str(start).strip(), str(stop).strip(), str(text))
	        for start, stop, text in zip(df['start'], df['end'], df['text'])
	    ]

	    print("Starting WEBVTT-file creation")
	    # The header is its own block so the first row is still emitted as a cue.
	    vtt_blocks = ['WEBVTT'] + _render_cues(cues, '.')
	    with open('subtitles.vtt', 'w', encoding="utf-8") as file:
	        file.write('\n\n'.join(vtt_blocks) + '\n')
	    print("WEBVTT DONE")

	    print("Starting SRT-file creation")
	    srt_blocks = _render_cues(cues, ',')
	    with open('subtitles.srt', 'w', encoding="utf-8") as file:
	        if srt_blocks:
	            file.write('\n\n'.join(srt_blocks) + '\n')
	    print("SRT DONE")

	    subtitle_files_out = ['subtitles.vtt', 'subtitles.srt']
	    return subtitle_files_out

	# ---- Gradio Layout -----





	# Top-level Blocks container; the css argument carries the page stylesheet.
	demo = gr.Blocks(css='''
	#cut_btn, #reset_btn { align-self:stretch; }
	#\\31 3 { max-width: 540px; }
	.output-markdown {max-width: 65ch !important;}
	''')
	demo.encrypt = False


	with demo:
	    # Introductory text.
	    with gr.Row(), gr.Column():
	        gr.Markdown('''
	# Simple Finnish Audio --> Text app
	### This space allows you to:
	1. Insert audio file or record with microphone
	2. Run audio through transcription process using speech recognition models
	3. Download generated transcriptions in .vtt and .srt formats
	''')

	    # Step 1 inputs: the audio source, the trigger button and the model picker.
	    with gr.Row(), gr.Column():
	        audio_in = gr.Audio(label="Audio file", type='filepath')
	        transcribe_btn = gr.Button("Step 1. Transcribe audio")
	        selected_whisper_model = gr.Dropdown(
	            choices=whisper_models,
	            type="value",
	            value="large",
	            label="Selected Whisper model",
	            interactive=True,
	        )

	    # Table that receives the transcription rows.
	    with gr.Row(), gr.Column():
	        transcription_df = gr.DataFrame(headers=['start', 'end', 'text'], label="Transcription dataframe")

	    # Step 2 trigger.
	    with gr.Row(), gr.Column():
	        translate_transcriptions_button = gr.Button("Step 2. Create subtitle files")

	    # Download area for the generated subtitle files.
	    with gr.Row(), gr.Column():
	        gr.Markdown('''##### From here you can download subtitles in .srt or .vtt format''')
	        subtitle_files = gr.File(
	            label="Download files",
	            file_count="multiple",
	            type="filepath",
	            interactive=False,
	        )

	    # Event wiring: button clicks feed component values into the two handlers.
	    transcribe_btn.click(
	        fn=speech_to_text,
	        inputs=[audio_in, selected_whisper_model],
	        outputs=[transcription_df],
	    )
	    translate_transcriptions_button.click(
	        fn=output_to_files,
	        inputs=transcription_df,
	        outputs=[subtitle_files],
	    )

	demo.launch()