deeppunct-gr

Runtime error

wldmr committed on Mar 24, 2023

Commit

6491f4a

1 Parent(s): 349d7f3

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -8,12 +8,20 @@ def cap(match):
 def predict(brakes, input_text):
     model = PunctuationModel()
-    output_text = model.restore_punctuation(input_text)
-    # restore the carrige returns
-    srt_file = input_text
     punctuated = output_text
     # if any of the line brake methods are implemented,
@@ -21,6 +29,7 @@ def predict(brakes, input_text):
     pcnt_file_cr = output_text
     if 'timelines' in brakes:
         srt_file_strip=srt_file.strip()
         srt_file_sub=re.sub('\s*\n\s*','# ',srt_file_strip)
         srt_file_array=srt_file_sub.split(' ')

 def predict(brakes, input_text):
+    # preprocess the text by removing filler words
+    # Define a list of filler words to remove
+    fillers = ["um", "uh", "hmm", "ha", "er", "ah"]
+    # Define a regular expression pattern that matches any filler word surrounded by whitespace or punctuation
+    pattern = r"(?<=\s|\b)(" + "|".join(fillers) + r")(?=\s|\b)"
+    # Use re.sub to replace the filler words with empty strings
+    clean_input_text = re.sub(pattern, "", input_text)
+    # Do the punctuation restoration
     model = PunctuationModel()
+    output_text = model.restore_punctuation(clean_input_text)
+    srt_file = clean_input_text
     punctuated = output_text
     # if any of the line brake methods are implemented,
     pcnt_file_cr = output_text
     if 'timelines' in brakes:
+        # restore the carrige returns
         srt_file_strip=srt_file.strip()
         srt_file_sub=re.sub('\s*\n\s*','# ',srt_file_strip)
         srt_file_array=srt_file_sub.split(' ')