from deepmultilingualpunctuation import PunctuationModel
import gradio as gr
import re
import metrics
# https://stackoverflow.com/questions/22800401/how-to-capitalize-the-first-letter-of-every-sentence
def cap(match):
    """Return the text of a regex match with its first letter capitalized."""
    return match.group().capitalize()
def remove_filler_words(transcript):
    """Strip common spoken filler words from a transcript, preserving line breaks.

    Each line of the input is filtered independently, so the number of lines
    (including empty ones) in the output matches the input exactly.  The
    previous implementation joined lines with a ' # ' sentinel and restored
    it with str.replace, which corrupted blank lines because consecutive
    sentinels only matched once.

    Args:
        transcript: Raw transcript text, possibly multi-line.

    Returns:
        The transcript with filler words removed, line structure intact.
    """
    # Set membership is O(1); matching is case-insensitive.
    filler_words = {"um", "uh", "hmm", "ha", "er", "ah", "yeah"}
    cleaned_lines = []
    for line in transcript.strip().splitlines():
        kept = [word for word in line.split() if word.lower() not in filler_words]
        cleaned_lines.append(' '.join(kept))
    return '\n'.join(cleaned_lines)
# Define a regular expression pattern that matches any filler word surrounded by whitespace or punctuation
#pattern = r"(?<=\s|\b)(" + "|".join(fillers) + r")(?=\s|\b)"
# Use re.sub to replace the filler words with empty strings
#clean_input_text = re.sub(pattern, "", input_text)
def predict(brakes, transcript):
    """Restore punctuation and casing in a transcript.

    Args:
        brakes: Line-break mode selected in the UI; one of
            "no brakes", "sentences", or "textlines".
        transcript: Raw transcript text.

    Returns:
        A 5-tuple matching the Gradio outputs:
        (punctuated text, n_words, n_sents, n_chars, n_tokens).
        On a word-count mismatch in "textlines" mode, the first element is an
        error message and the four counts are 0 — still a 5-tuple, so the
        Gradio interface (which declares five outputs) renders correctly.
    """
    input_text = remove_filler_words(transcript)
    # Punctuation restoration.
    model = PunctuationModel()
    output_text = model.restore_punctuation(input_text)
    # Default ("no brakes"): return the model output as-is.
    pcnt_file_cr = output_text
    if 'textlines' in brakes:
        # Mark each original line end with '#' so the break positions
        # survive the whitespace split below.
        srt_file_hash = '# '.join(input_text.strip().splitlines())
        srt_file_array = srt_file_hash.split()
        pcnt_file_array = output_text.split()
        # Goal: restore the break points, i.e. the same number of lines as
        # the input.  This matters because each input line corresponds to a
        # frame from the video.  Punctuation restoration must not add or
        # remove words; if the counts differ we cannot realign the lines.
        if len(srt_file_array) != len(pcnt_file_array):
            msg = (
                "AssertError: The length of the transcript and the punctuated "
                f"file should be the same: {len(srt_file_array)} vs "
                f"{len(pcnt_file_array)}"
            )
            return msg, 0, 0, 0, 0
        # Copy the '#' line-end markers onto the punctuated words, then
        # turn each marker back into a newline.
        pcnt_file_array_hash = []
        for idx, item in enumerate(srt_file_array):
            if item.endswith('#'):
                pcnt_file_array_hash.append(pcnt_file_array[idx] + '#')
            else:
                pcnt_file_array_hash.append(pcnt_file_array[idx])
        pcnt_file_cr = ' '.join(pcnt_file_array_hash).replace('#', '\n')
    elif 'sentences' in brakes:
        # One sentence per line.
        split_text = output_text.split('. ')
        pcnt_file_cr = '.\n'.join(split_text)
    # Casing fixes: standalone "i" -> "I", capitalize after sentence-ending
    # punctuation, and capitalize the very first character.
    regex1 = r"\bi\b"
    regex2 = r"(?<=[.?!;])\s*\w"
    regex3 = r"^\w"
    pcnt_file_cr_cap = re.sub(
        regex3, lambda x: x.group().upper(),
        re.sub(regex2, lambda x: x.group().upper(),
               re.sub(regex1, "I", pcnt_file_cr)))
    # Text statistics for the numeric outputs.
    n_tokens = metrics.num_tokens(pcnt_file_cr_cap)
    n_sents = metrics.num_sentences(pcnt_file_cr_cap)
    n_words = metrics.num_words(pcnt_file_cr_cap)
    n_chars = metrics.num_chars(pcnt_file_cr_cap)
    return pcnt_file_cr_cap, n_words, n_sents, n_chars, n_tokens
if __name__ == "__main__":
metrics.load_nltk()
title = "Deep Punkt App"
description = """
<b>Description</b>: <br>
Model restores punctuation and case i.e. of the following punctuations -- [! ? . , - : ; ' ] and also the upper-casing of words. <br>
"""
examples = [['sentences', "my name is clara i live in berkeley california"]]
interface = gr.Interface(fn = predict,
inputs = [gr.Radio(["no brakes","sentences", "textlines"], value="no brakes", label="preserve line brakes"),
"text"],
outputs=[gr.Textbox(label="Punctuated Transcript"),
gr.Number(label="Number of Words"),
gr.Number(label="Number of Sentences"),
gr.Number(label="Number of Characters"),
gr.Number(label="Number of Tokens")],
title = title,
description = description,
examples=examples,
allow_flagging="never").queue(concurrency_count=2)
interface.launch()