File size: 4,537 Bytes
6d42c53
47ae719
 
22e7a05
6d42c53
29d3b86
 
 
6491f4a
25e3dec
 
 
 
 
 
6491f4a
 
4e99e82
25e3dec
750c85a
25e3dec
 
 
 
6491f4a
c5b4e7d
6491f4a
25e3dec
 
 
6491f4a
25e3dec
6491f4a
6d42c53
c5b4e7d
47ae719
d76f980
 
 
47ae719
ce2a6bf
25e3dec
 
 
 
 
 
 
 
 
 
 
d76f980
 
 
 
25e3dec
d76f980
 
 
 
 
 
 
 
 
47ae719
cc4ad4c
9d7acb6
349d7f3
fc7642f
816be8b
e88ea86
82e77d6
 
 
22e7a05
 
 
 
 
 
47ae719
 
 
072aee3
 
33e875f
47ae719
 
 
 
349d7f3
47ae719
 
22e7a05
d76f980
22e7a05
 
 
 
 
47ae719
 
9d7acb6
af5d68b
47ae719
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
from deepmultilingualpunctuation import PunctuationModel
import gradio as gr
import re
import metrics
        
# https://stackoverflow.com/questions/22800401/how-to-capitalize-the-first-letter-of-every-sentence
def cap(match):
    """Return the matched text with its first letter upper-cased (rest lowered)."""
    matched_text = match.group()
    return matched_text.capitalize()

def remove_filler_words(transcript):
    """Remove common spoken filler words ("um", "uh", ...) from a transcript.

    Line breaks in the input are preserved: each newline is temporarily
    encoded as a standalone " # " token, the text is filtered word by word,
    and the markers are turned back into newlines at the end.

    Args:
        transcript: Raw transcript text, possibly spanning multiple lines.

    Returns:
        The transcript with filler words removed and line breaks restored.
    """
    # Encode line breaks as " # " so they survive whitespace splitting.
    transcript_hash = " # ".join(transcript.strip().splitlines())
    # Filter out filler words. Matching is case-insensitive but exact-token
    # only: a filler with attached punctuation such as "um," is kept.
    filler_words = ["um", "uh", "hmm", "ha", "er", "ah", "yeah"]
    words = transcript_hash.split()
    clean_words = [word for word in words if word.lower() not in filler_words]
    input_text_clean = ' '.join(clean_words)
    # Restore the line breaks from the "#" markers.
    input_text = input_text_clean.replace(' # ', '\n')
    return input_text

def predict(brakes, transcript):
    """Restore punctuation/casing in a transcript and compute text metrics.

    Args:
        brakes: Line-break mode; one of "no brakes", "sentences" or
            "textlines" (spelling matches the UI radio labels).
        transcript: Raw transcript text.

    Returns:
        Tuple of (punctuated text, word count, sentence count,
        character count, token count). On a length mismatch in
        "textlines" mode the first element is an error message and the
        four counts are 0, keeping the five-output shape the UI expects.
    """
    input_text = remove_filler_words(transcript)
    # Restore punctuation with the pretrained model.
    model = PunctuationModel()
    output_text = model.restore_punctuation(input_text)

    # Default ("no brakes"): return the punctuated text as-is.
    pcnt_file_cr = output_text

    if 'textlines' in brakes:
        # Encode each original line break as a '#' suffix on the word that
        # precedes it, so breaks can be re-applied word by word below.
        srt_file_hash = '# '.join(input_text.strip().splitlines())
        srt_file_array = srt_file_hash.split()
        pcnt_file_array = output_text.split()

        # Goal: restore the break points, i.e. the same number of lines as
        # the input file. This is necessary because each line in the srt
        # file corresponds to a frame from the video. It only works when
        # punctuation restoration did not merge or split any tokens.
        if len(srt_file_array) != len(pcnt_file_array):
            return (
                "AssertError: The length of the transcript "
                f"({len(srt_file_array)}) and the punctuated file "
                f"({len(pcnt_file_array)}) should be the same",
                0, 0, 0, 0,
            )

        pcnt_file_array_hash = []
        for idx, item in enumerate(srt_file_array):
            if item.endswith('#'):
                pcnt_file_array_hash.append(pcnt_file_array[idx] + '#')
            else:
                pcnt_file_array_hash.append(pcnt_file_array[idx])

        # Reassemble the array back into a string with newlines restored.
        # NOTE(review): a literal '#' already present in the transcript
        # would also be turned into a newline here.
        pcnt_file_cr = ' '.join(pcnt_file_array_hash).replace('#', '\n')

    elif 'sentences' in brakes:
        # One sentence per line.
        split_text = output_text.split('. ')
        pcnt_file_cr = '.\n'.join(split_text)

    # Capitalization passes: standalone "i" -> "I", first letter after
    # sentence-ending punctuation, then the very first character.
    regex1 = r"\bi\b"
    regex2 = r"(?<=[.?!;])\s*\w"
    regex3 = r"^\w"
    pcnt_file_cr_cap = re.sub(
        regex3, lambda x: x.group().upper(),
        re.sub(regex2, lambda x: x.group().upper(),
               re.sub(regex1, "I", pcnt_file_cr)))

    n_tokens = metrics.num_tokens(pcnt_file_cr_cap)
    n_sents = metrics.num_sentences(pcnt_file_cr_cap)
    n_words = metrics.num_words(pcnt_file_cr_cap)
    n_chars = metrics.num_chars(pcnt_file_cr_cap)

    return pcnt_file_cr_cap, n_words, n_sents, n_chars, n_tokens
 
if __name__ == "__main__":

    metrics.load_nltk()

    app_title = "Deep Punkt App"
    app_description = """
<b>Description</b>: <br>
Model restores punctuation and case i.e. of the following punctuations -- [! ? . , - : ; ' ] and also the upper-casing of words. <br>
"""
    example_inputs = [['sentences', "my name is clara i live in berkeley california"]]

    # Radio selector for the line-break handling mode ("brakes" spelling
    # matches the predict() parameter and labels shown in the UI).
    brake_selector = gr.Radio(
        ["no brakes", "sentences", "textlines"],
        value="no brakes",
        label="preserve line brakes",
    )
    output_widgets = [
        gr.Textbox(label="Punctuated Transcript"),
        gr.Number(label="Number of Words"),
        gr.Number(label="Number of Sentences"),
        gr.Number(label="Number of Characters"),
        gr.Number(label="Number of Tokens"),
    ]

    interface = gr.Interface(
        fn=predict,
        inputs=[brake_selector, "text"],
        outputs=output_widgets,
        title=app_title,
        description=app_description,
        examples=example_inputs,
        allow_flagging="never",
    ).queue(concurrency_count=2)

    interface.launch()