deeppunct-gr

Runtime error

App Files Files Community

deeppunct-gr / app.py

wldmr

added queue function

af5d68b almost 3 years ago

raw

history blame contribute delete

4.54 kB

	from deepmultilingualpunctuation import PunctuationModel
	import gradio as gr
	import re
	import metrics

	# https://stackoverflow.com/questions/22800401/how-to-capitalize-the-first-letter-of-every-sentence
	def cap(match):
	    """Return the text matched by *match*, capitalized.

	    Takes a regular-expression match object, so it has the shape of a
	    replacement callback for ``re.sub``; nothing in the visible code calls
	    it. ``str.capitalize`` upper-cases the first character and lower-cases
	    all the others.
	    """
	    matched_text = match.group()
	    return matched_text.capitalize()

	def remove_filler_words(transcript):
	    """Remove filler words from *transcript* while preserving its line breaks.

	    Leading and trailing whitespace of the whole transcript is stripped, then
	    every line is filtered on its own: whitespace-separated words whose
	    lower-cased form is a known filler are dropped and the remaining words
	    are re-joined with single spaces.

	    Filtering line by line (instead of marking the breaks with a sentinel
	    token and restoring them afterwards) keeps the number of lines
	    unchanged: a line that is blank, or that consists only of filler words,
	    becomes an empty line rather than leaving a stray marker in the text,
	    and a literal "#" in the transcript is left alone.

	    Only whole words are compared, so a filler with punctuation attached
	    (for example "um,") is kept.

	    Args:
	        transcript: The raw transcript text, possibly spanning several lines.

	    Returns:
	        The cleaned transcript with lines separated by "\\n".
	    """
	    filler_words = frozenset(("um", "uh", "hmm", "ha", "er", "ah", "yeah"))
	    cleaned_lines = [
	        ' '.join(word for word in line.split() if word.lower() not in filler_words)
	        for line in transcript.strip().splitlines()
	    ]
	    return '\n'.join(cleaned_lines)

	# Shared PunctuationModel instance, created lazily by _get_punctuation_model().
	_punctuation_model = None


	def _get_punctuation_model():
	    """Return the shared PunctuationModel, constructing it on first use."""
	    global _punctuation_model
	    if _punctuation_model is None:
	        # Presumably the constructor loads the model weights; doing that once
	        # instead of on every request avoids repeating the work.
	        _punctuation_model = PunctuationModel()
	    return _punctuation_model


	def _restore_line_breaks(source_text, punctuated_text):
	    """Re-wrap *punctuated_text* using the words-per-line layout of *source_text*."""
	    words_per_line = [len(line.split()) for line in source_text.strip().splitlines()]
	    punctuated_words = punctuated_text.split()
	    expected = sum(words_per_line)
	    if expected != len(punctuated_words):
	        # The interface declares five outputs, so the problem is reported as
	        # an error instead of returning a tuple of a different length.
	        raise gr.Error(
	            "The length of the transcript and the punctuated file should be "
	            f"the same: {expected} != {len(punctuated_words)}"
	        )
	    rebuilt_lines = []
	    start = 0
	    for count in words_per_line:
	        rebuilt_lines.append(' '.join(punctuated_words[start:start + count]))
	        start += count
	    return '\n'.join(rebuilt_lines)


	def _capitalize(text):
	    """Upper-case the pronoun "i", sentence starts and the first character of *text*."""
	    text = re.sub(r"\bi\b", "I", text)
	    # Require whitespace after the punctuation mark so that a letter that
	    # directly follows a period inside a token is left untouched.
	    text = re.sub(r"(?<=[.?!;])\s+\w", lambda m: m.group().upper(), text)
	    return re.sub(r"^\w", lambda m: m.group().upper(), text)


	def predict(brakes, transcript):
	    """Punctuate and capitalize *transcript* and compute text metrics for it.

	    Filler words are removed first, then the punctuation model restores the
	    punctuation. Depending on *brakes* the result is laid out as:

	    * contains "textlines": the same number of words per line as the
	      (filler-free) input, so every output line corresponds to an input line;
	    * contains "sentences": one sentence per line, breaking after ".", "?"
	      and "!" when followed by a space;
	    * anything else: the model output is used as returned.

	    Args:
	        brakes: Line-break mode; the UI offers "no brakes", "sentences" and
	            "textlines". ``None`` is treated like "no brakes".
	        transcript: The raw transcript text.

	    Returns:
	        A 5-tuple ``(text, n_words, n_sents, n_chars, n_tokens)`` where the
	        counts come from the ``metrics`` module applied to the final text.

	    Raises:
	        gr.Error: In "textlines" mode, when the punctuated text does not
	            contain the same number of words as the input.
	    """
	    input_text = remove_filler_words(transcript)
	    output_text = _get_punctuation_model().restore_punctuation(input_text)

	    mode = brakes or ""
	    if 'textlines' in mode:
	        pcnt_file_cr = _restore_line_breaks(input_text, output_text)
	    elif 'sentences' in mode:
	        pcnt_file_cr = re.sub(r"(?<=[.?!]) ", "\n", output_text)
	    else:
	        pcnt_file_cr = output_text

	    pcnt_file_cr_cap = _capitalize(pcnt_file_cr)

	    n_tokens = metrics.num_tokens(pcnt_file_cr_cap)
	    n_sents = metrics.num_sentences(pcnt_file_cr_cap)
	    n_words = metrics.num_words(pcnt_file_cr_cap)
	    n_chars = metrics.num_chars(pcnt_file_cr_cap)

	    return pcnt_file_cr_cap, n_words, n_sents, n_chars, n_tokens

	if __name__ == "__main__":
	    # Presumably prepares the NLTK resources the metrics helpers rely on;
	    # confirm against the metrics module.
	    metrics.load_nltk()

	    title = "Deep Punkt App"
	    description = """
	<b>Description</b>: <br>
	Model restores punctuation and case i.e. of the following punctuations -- [! ? . , - : ; ' ] and also the upper-casing of words. <br>
	"""
	    examples = [['sentences', "my name is clara i live in berkeley california"]]

	    # Inputs: the line-break mode selector followed by the transcript text box.
	    break_mode_selector = gr.Radio(
	        ["no brakes", "sentences", "textlines"],
	        value="no brakes",
	        label="preserve line brakes",
	    )

	    # Outputs, in the order predict() returns them.
	    result_components = [
	        gr.Textbox(label="Punctuated Transcript"),
	        gr.Number(label="Number of Words"),
	        gr.Number(label="Number of Sentences"),
	        gr.Number(label="Number of Characters"),
	        gr.Number(label="Number of Tokens"),
	    ]

	    interface = gr.Interface(
	        fn=predict,
	        inputs=[break_mode_selector, "text"],
	        outputs=result_components,
	        title=title,
	        description=description,
	        examples=examples,
	        allow_flagging="never",
	    ).queue(concurrency_count=2)

	    interface.launch()