wldmr committed on
Commit
6491f4a
·
1 Parent(s): 349d7f3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -4
app.py CHANGED
@@ -8,12 +8,20 @@ def cap(match):
8
 
9
 
10
  def predict(brakes, input_text):
11
-
 
 
 
 
 
 
 
 
 
12
  model = PunctuationModel()
13
- output_text = model.restore_punctuation(input_text)
14
 
15
- # restore the carriage returns
16
- srt_file = input_text
17
  punctuated = output_text
18
 
19
  # if any of the line break methods are implemented,
@@ -21,6 +29,7 @@ def predict(brakes, input_text):
21
  pcnt_file_cr = output_text
22
 
23
  if 'timelines' in brakes:
 
24
  srt_file_strip=srt_file.strip()
25
  srt_file_sub=re.sub('\s*\n\s*','# ',srt_file_strip)
26
  srt_file_array=srt_file_sub.split(' ')
 
8
 
9
 
10
  def predict(brakes, input_text):
11
+
12
+ # preprocess the text by removing filler words
13
+ # Define a list of filler words to remove
14
+ fillers = ["um", "uh", "hmm", "ha", "er", "ah"]
15
+ # Define a regular expression pattern that matches any filler word surrounded by whitespace or punctuation
16
+ pattern = r"(?<=\s|\b)(" + "|".join(fillers) + r")(?=\s|\b)"
17
+ # Use re.sub to replace the filler words with empty strings
18
+ clean_input_text = re.sub(pattern, "", input_text)
19
+
20
+ # Do the punctuation restoration
21
  model = PunctuationModel()
22
+ output_text = model.restore_punctuation(clean_input_text)
23
 
24
+ srt_file = clean_input_text
 
25
  punctuated = output_text
26
 
27
  # if any of the line break methods are implemented,
 
29
  pcnt_file_cr = output_text
30
 
31
  if 'timelines' in brakes:
32
+ # restore the carriage returns
33
  srt_file_strip=srt_file.strip()
34
  srt_file_sub=re.sub('\s*\n\s*','# ',srt_file_strip)
35
  srt_file_array=srt_file_sub.split(' ')