Spaces:
Runtime error
Runtime error
| from deepmultilingualpunctuation import PunctuationModel | |
| import gradio as gr | |
| import re | |
| import metrics | |
# Approach taken from:
# https://stackoverflow.com/questions/22800401/how-to-capitalize-the-first-letter-of-every-sentence
def cap(match):
    """Return the text of ``match`` passed through ``str.capitalize``.

    ``match`` is any object exposing ``group()`` (e.g. an ``re`` match).
    Note that ``str.capitalize`` upper-cases the first character and
    lower-cases all the others.
    """
    matched_text = match.group()
    return matched_text.capitalize()
# Tokens treated as filler; compared case-insensitively against whole
# whitespace-separated words.
_FILLER_WORDS = frozenset({"um", "uh", "hmm", "ha", "er", "ah", "yeah"})


def remove_filler_words(transcript):
    """Remove filler words from ``transcript`` while preserving its lines.

    The transcript is stripped of leading/trailing whitespace and then
    processed line by line, so the number of lines in the result always
    equals the number of lines in the stripped input. A line that consisted
    only of filler words becomes an empty line. Runs of whitespace inside a
    line are collapsed to single spaces.

    Args:
        transcript: Raw transcript text, possibly spanning several lines.

    Returns:
        The cleaned text, with lines joined by ``"\\n"``.
    """
    # Filtering each line independently avoids smuggling a "#" sentinel
    # through the text, which corrupted transcripts containing a literal "#"
    # and left stray "#" characters when a line began or ended with a filler.
    cleaned_lines = [
        " ".join(
            word for word in line.split() if word.lower() not in _FILLER_WORDS
        )
        for line in transcript.strip().splitlines()
    ]
    return "\n".join(cleaned_lines)
| # Define a regular expression pattern that matches any filler word surrounded by whitespace or punctuation | |
| #pattern = r"(?<=\s|\b)(" + "|".join(fillers) + r")(?=\s|\b)" | |
| # Use re.sub to replace the filler words with empty strings | |
| #clean_input_text = re.sub(pattern, "", input_text) | |
# Shared PunctuationModel instance, created on first use so the model is not
# rebuilt for every request.
_PUNCTUATION_MODEL = None


def _get_punctuation_model():
    """Return the shared PunctuationModel, constructing it on first use."""
    global _PUNCTUATION_MODEL
    if _PUNCTUATION_MODEL is None:
        _PUNCTUATION_MODEL = PunctuationModel()
    return _PUNCTUATION_MODEL


def _restore_line_breaks(source_text, punctuated_text):
    """Re-wrap ``punctuated_text`` using the line layout of ``source_text``.

    Each output line receives as many words as the corresponding line of
    ``source_text``. The caller must ensure both texts hold the same number
    of whitespace-separated words.
    """
    words_per_line = [
        len(line.split()) for line in source_text.strip().splitlines()
    ]
    punctuated_words = punctuated_text.split()
    lines = []
    start = 0
    for count in words_per_line:
        lines.append(" ".join(punctuated_words[start:start + count]))
        start += count
    return "\n".join(lines)


def _capitalize_text(text):
    """Upper-case the pronoun "i", sentence starts and the first character."""
    text = re.sub(r"\bi\b", "I", text)
    # First word character following sentence-ending punctuation.
    text = re.sub(r"(?<=[.?!;])\s*\w", lambda m: m.group().upper(), text)
    # Very first character of the text.
    return re.sub(r"^\w", lambda m: m.group().upper(), text)


def predict(brakes, transcript):
    """Punctuate and capitalise a transcript and compute its text metrics.

    Args:
        brakes: Line-break mode selected in the UI. If it contains
            ``'textlines'`` the line layout of the (filler-free) input is
            restored in the output; if it contains ``'sentences'`` every
            sentence is placed on its own line; otherwise the punctuated
            text is returned as produced by the model.
        transcript: Raw transcript text.

    Returns:
        A 5-tuple ``(text, n_words, n_sents, n_chars, n_tokens)`` matching
        the five output components of the interface. The counts come from
        the project ``metrics`` module. If the line layout cannot be
        restored because the model changed the number of words, ``text`` is
        an error message and all counts are ``0``.
    """
    mode = brakes or ""  # tolerate a missing selection
    input_text = remove_filler_words(transcript)

    # Punctuation restoration; skip the model when there is nothing to do.
    if input_text.strip():
        output_text = _get_punctuation_model().restore_punctuation(input_text)
    else:
        output_text = ""

    punctuated = output_text
    if 'textlines' in mode:
        # Goal: the same number of lines as the input, because each line of
        # an srt-style transcript corresponds to a frame of the video.
        n_source_words = len(input_text.split())
        n_punctuated_words = len(output_text.split())
        if n_source_words != n_punctuated_words:
            # Always return five values so the interface can render them.
            message = (
                "AssertError: The length of the transcript and the punctuated"
                " file should be the same: "
                f"{n_source_words} {n_punctuated_words}"
            )
            return message, 0, 0, 0, 0
        punctuated = _restore_line_breaks(input_text, output_text)
    elif 'sentences' in mode:
        # Break after every sentence terminator, not only after a period.
        punctuated = re.sub(r"(?<=[.?!])\s+", "\n", output_text)

    capitalized = _capitalize_text(punctuated)

    n_tokens = metrics.num_tokens(capitalized)
    n_sents = metrics.num_sentences(capitalized)
    n_words = metrics.num_words(capitalized)
    n_chars = metrics.num_chars(capitalized)
    return capitalized, n_words, n_sents, n_chars, n_tokens
if __name__ == "__main__":
    # One-time setup provided by the project's metrics module
    # (presumably makes the NLTK resources available -- confirm in metrics).
    metrics.load_nltk()

    title = "Deep Punkt App"
    description = """
<b>Description</b>: <br>
Model restores punctuation and case i.e. of the following punctuations -- [! ? . , - : ; ' ] and also the upper-casing of words. <br>
"""
    examples = [['sentences', "my name is clara i live in berkeley california"]]

    # The two inputs are handed to predict(brakes, transcript) in this order.
    break_mode_input = gr.Radio(
        ["no brakes", "sentences", "textlines"],
        value="no brakes",
        label="preserve line brakes",
    )

    # One component per value returned by predict, in the same order.
    result_outputs = [
        gr.Textbox(label="Punctuated Transcript"),
        gr.Number(label="Number of Words"),
        gr.Number(label="Number of Sentences"),
        gr.Number(label="Number of Characters"),
        gr.Number(label="Number of Tokens"),
    ]

    interface = gr.Interface(
        fn=predict,
        inputs=[break_mode_input, "text"],
        outputs=result_outputs,
        title=title,
        description=description,
        examples=examples,
        allow_flagging="never",
    )
    # NOTE(review): `concurrency_count` is only accepted by older Gradio
    # releases -- confirm against the Gradio version this app is pinned to.
    interface = interface.queue(concurrency_count=2)
    interface.launch()