rosyvs committed on
Commit
d971130
·
1 Parent(s): 97631c5

Refactor to use a single func convert_and_trim_video for both tasks, fix some webm --> mp4 / wav issues

Browse files
Files changed (3) hide show
  1. .gitignore +2 -0
  2. app.py +28 -39
  3. utils.py +127 -164
.gitignore CHANGED
@@ -1,3 +1,5 @@
1
  .DS_Store
2
  __pycache__/
3
  flagged/
 
 
 
1
  .DS_Store
2
  __pycache__/
3
  flagged/
4
+ results_*/
5
+ logs/
app.py CHANGED
@@ -8,8 +8,8 @@ import random
8
 
9
  import gradio as gr
10
 
11
- from utils import (HHMMSS_to_sec, convert_video_format, molly_xlsx_to_table,
12
- sort_transcript, table_to_ELAN_tsv, trim_media,
13
  xlsx_to_table)
14
 
15
 
@@ -19,26 +19,19 @@ def delete_files(files):
19
  try:
20
  os.remove(file)
21
  except FileNotFoundError:
 
22
  pass
23
  print("files deleted")
24
 
25
  def classify_input_format(input_string):
26
- # check if the input string is a valid time in the format HH:MM:SS
27
- hhmmss_pattern = re.compile('^\d{1,2}:\d{1,2}:\d{1,2}$')
28
- if hhmmss_pattern.match(str(input_string)):
29
- print("input string is a valid time in the format HH:MM:SS")
30
- return HHMMSS_to_sec(time_str=input_string)
31
-
32
- # check if the input string is a valid number in the format of seconds
33
- try:
34
- seconds = float(input_string)
35
- print("input string is a valid number in the format of seconds")
36
- return float(input_string)
37
- except ValueError:
38
- pass
39
-
40
- # if the input format is neither seconds nor HH:MM:SS, return None
41
- gr.Error("Input format not supported. Please enter a valid time in the format HH:MM:SS or seconds.")
42
 
43
  def set_output_file(input_file, output_format, folder, insert_string = 'trimmed'):
44
  # Set output file name and extension
@@ -57,32 +50,28 @@ def trim_video_helper(input_file, output_file, start_time, end_time):
57
  end_time = 300
58
  elif start_time != "" and end_time == "":
59
  end_time = 300 + HHMMSS_to_sec(time_str=start_time)
60
- print("start time: ", start_time)
61
- print("end time: ", end_time)
62
  # Trim the video
63
  print("start trimming")
64
- start_time = classify_input_format(start_time)
65
- print("start time: ", start_time)
66
- end_time = classify_input_format(end_time)
67
- print("end time: ", end_time)
68
- output_file = trim_media(input_file.name, output_file, start_time, end_time)
69
- print("finish trimming")
70
  return output_file
71
 
72
  def convert_video_helper(input_file, output_file, output_format):
73
  # convert video
74
  print("start converting")
75
- output_file = convert_video_format(input_file.name, output_file)
76
- print("finish converting")
77
  return output_file
78
 
79
  def convert_transcript_helper(input_transcript, output_transcript):
80
  # convert transcript
81
  print("start converting transcript")
82
  table = xlsx_to_table(xl_file=input_transcript)
83
- print("finish converting transcript to table")
84
  output_file = table_to_ELAN_tsv(table, output_transcript)
85
- print("finish converting transcript")
86
  return output_file
87
 
88
 
@@ -93,10 +82,10 @@ def trim_video_vtr(input_file, output_format):
93
  # randomly select start time
94
  start_time = random.randint(300, 900)
95
  end_time = start_time + 600 # since 10 minutes
96
- print("start time: ", start_time)
97
- print("end time: ", end_time)
98
- aduio_name = input_file.name.split("/")[-1].split(".")[-2]
99
- print("aduio_name: ", aduio_name)
100
 
101
  # set output file
102
  insert_string = f"start{start_time}_end{end_time}"
@@ -104,7 +93,7 @@ def trim_video_vtr(input_file, output_format):
104
  output_file = set_output_file(input_file, output_format, output_folder, insert_string)
105
 
106
  # write the start time, and end time to a txt file
107
- log_file = f"{os.getcwd()}/results_vtr/{aduio_name}_start_end_time.txt"
108
  print("log_file: ", log_file)
109
  with open(log_file, "w") as f:
110
  f.write(f"{start_time}\n")
@@ -152,7 +141,7 @@ def sort_transcript_helper(input_transcript, output_transcript):
152
  print("input_transcript: ", input_transcript)
153
  print("output_transcript: ", output_transcript)
154
  output_transcript = sort_transcript(input_transcript, output_transcript)
155
- print("finish sorting transcript")
156
  return output_transcript
157
 
158
  def sort_transcript_wrapper(input_file):
@@ -185,12 +174,12 @@ def convert_video(input_file, output_format):
185
  output_folder = f"{os.getcwd()}/results_vc/"
186
  output_file = set_output_file(input_file, output_format, output_folder, \
187
  insert_string = 'converted')
188
-
189
  # Convert video
190
  output_file = convert_video_helper(input_file, output_file, output_format)
191
-
192
  # remove file after 10 minutes for security
193
- print("start deleting files")
194
  path_to_delete = [input_file.name, output_file]
195
  threading.Thread(target=delete_files, args=([path_to_delete])).start()
196
 
 
8
 
9
  import gradio as gr
10
 
11
+ from utils import (HHMMSS_to_sec, molly_xlsx_to_table, convert_and_trim_video,
12
+ sort_transcript, table_to_ELAN_tsv,
13
  xlsx_to_table)
14
 
15
 
 
19
  try:
20
  os.remove(file)
21
  except FileNotFoundError:
22
+ print(f"File {file} not found for deletion.")
23
  pass
24
  print("files deleted")
25
 
def classify_input_format(input_string):
    """Convert a user-supplied time stamp to seconds.

    The value is handed to ``HHMMSS_to_sec``; whatever that helper accepts
    (colon-separated time stamps or a plain number of seconds) is accepted
    here.

    Args:
        input_string: The time stamp as entered by the user.

    Returns:
        The number of seconds returned by ``HHMMSS_to_sec``.

    Raises:
        gr.Error: If ``HHMMSS_to_sec`` returns ``None``, i.e. the value is
            missing or not in a supported format.
    """
    seconds = HHMMSS_to_sec(time_str=input_string)
    if seconds is None:
        # gr.Error is an exception class: merely constructing it shows nothing
        # to the user. It has to be raised for Gradio to display the message.
        raise gr.Error("Input time stamp format not supported. Please enter a valid time in the format HH:MM:SS or seconds.")
    print("Successfully converted timestamps to seconds")
    return seconds
 
 
 
 
 
 
 
 
35
 
36
  def set_output_file(input_file, output_format, folder, insert_string = 'trimmed'):
37
  # Set output file name and extension
 
50
  end_time = 300
51
  elif start_time != "" and end_time == "":
52
  end_time = 300 + HHMMSS_to_sec(time_str=start_time)
53
+ print("start time (s): ", start_time)
54
+ print("end time (s): ", end_time)
55
  # Trim the video
56
  print("start trimming")
57
+ output_file = convert_and_trim_video(input_file.name, output_file, start_time, end_time)
58
+ print("finished trimming")
 
 
 
 
59
  return output_file
60
 
def convert_video_helper(input_file, output_file, output_format):
    """Convert an uploaded media file without trimming it.

    Args:
        input_file: Uploaded file object; only its ``.name`` path is used.
        output_file: Destination path passed to ``convert_and_trim_video``.
        output_format: Not used by this helper.

    Returns:
        Whatever ``convert_and_trim_video`` returns for the conversion.
    """
    print("start converting")
    source_path = input_file.name
    # No start/end is passed, so the whole file is converted.
    converted = convert_and_trim_video(source_path, output_file)
    print("finished converting")
    return converted
67
 
def convert_transcript_helper(input_transcript, output_transcript):
    """Convert an xlsx transcript into a TSV file via an intermediate table.

    Args:
        input_transcript: The xlsx transcript passed to ``xlsx_to_table``.
        output_transcript: Destination path passed to ``table_to_ELAN_tsv``.

    Returns:
        Whatever ``table_to_ELAN_tsv`` returns for the written file.
    """
    print("start converting transcript")
    transcript_table = xlsx_to_table(xl_file=input_transcript)
    print("finished converting transcript to table")
    written = table_to_ELAN_tsv(transcript_table, output_transcript)
    print("finished converting transcript")
    return written
76
 
77
 
 
82
  # randomly select start time
83
  start_time = random.randint(300, 900)
84
  end_time = start_time + 600 # since 10 minutes
85
+ print("start time (s): ", start_time)
86
+ print("end time (s): ", end_time)
87
+ audio_base_name = input_file.name.split("/")[-1].split(".")[-2]
88
+ print("audio_base_name: ", audio_base_name)
89
 
90
  # set output file
91
  insert_string = f"start{start_time}_end{end_time}"
 
93
  output_file = set_output_file(input_file, output_format, output_folder, insert_string)
94
 
95
  # write the start time, and end time to a txt file
96
+ log_file = f"{os.getcwd()}/results_vtr/{audio_base_name}_start_end_time.txt"
97
  print("log_file: ", log_file)
98
  with open(log_file, "w") as f:
99
  f.write(f"{start_time}\n")
 
141
  print("input_transcript: ", input_transcript)
142
  print("output_transcript: ", output_transcript)
143
  output_transcript = sort_transcript(input_transcript, output_transcript)
144
+ print("finished sorting transcript")
145
  return output_transcript
146
 
147
  def sort_transcript_wrapper(input_file):
 
174
  output_folder = f"{os.getcwd()}/results_vc/"
175
  output_file = set_output_file(input_file, output_format, output_folder, \
176
  insert_string = 'converted')
177
+ print("Begin converting to output format: ", output_format)
178
  # Convert video
179
  output_file = convert_video_helper(input_file, output_file, output_format)
180
+ print("created output file: ", output_file)
181
  # remove file after 10 minutes for security
182
+ print("Done converting. start deleting files")
183
  path_to_delete = [input_file.name, output_file]
184
  threading.Thread(target=delete_files, args=([path_to_delete])).start()
185
 
utils.py CHANGED
@@ -75,174 +75,44 @@ def sort_transcript(file_path: str, save_path: str) -> str:
75
 
76
  return save_path
77
 
78
- def convert_video_format(media_in, media_out):
79
- """
80
- Function to convert video format using ffmpeg.
81
-
82
- """
83
- logging.info(f'...Converting video format from {media_in} to {media_out}...')
84
-
85
- try:
86
- WAV_CHANNELS = 1
87
- WAV_SAMPLE_RATE = 16000
88
-
89
- if not isinstance(media_in, (str, Path)):
90
- raise TypeError("media_in must be a string or a PathLike object")
91
-
92
- if not isinstance(media_out, (str, Path)):
93
- raise TypeError("media_out must be a string or a PathLike object")
94
-
95
- in_ext = Path(media_in).suffix.lower()
96
- out_ext = Path(media_out).suffix.lower()
97
- logging.info(f'...detected extensions from filename: input={in_ext} output={out_ext}')
98
-
99
-
100
- if in_ext !='.webm':
101
- # input is not in webm format
102
- if out_ext == '.wav':
103
- # convert to wav with standard format for audio models
104
- logging.info(f'...generating {media_out}...')
105
- command = ['ffmpeg', '-y', '-i',
106
- media_in, '-acodec', 'pcm_s16le',
107
- '-ac', str(WAV_CHANNELS), '-ar',
108
- str(WAV_SAMPLE_RATE), media_out,
109
- '-hide_banner', '-loglevel', 'warning']
110
- process = subprocess.run(command, capture_output=True, text=True)
111
- if process.returncode != 0:
112
- logging.info(f"Error: {process.stderr}")
113
- else:
114
- logging.info(process.stdout)
115
- return media_out
116
-
117
- else:
118
- logging.info(f'...Using ffmpeg to convert {media_in} to {media_out}...')
119
- logging.info(f'...generating {media_out}...')
120
- command = ['ffmpeg',
121
- '-y',
122
- '-i',
123
- media_in,
124
- '-c',
125
- 'copy',
126
- media_out,
127
- '-hide_banner',
128
- '-loglevel',
129
- 'warning'
130
- ]
131
- process = subprocess.run(command, check=True)
132
- if process.returncode != 0:
133
- logging.info(f"Error: {process.stderr}")
134
- else:
135
- logging.info(process.stdout)
136
- return media_out
137
-
138
- if in_ext == '.webm':
139
- if out_ext == '.wav':
140
- command = ['ffmpeg', '-i', media_in, media_out]
141
- process = subprocess.run(command, capture_output=True, text=True)
142
- if process.returncode != 0:
143
- logging.info(f"Error: {process.stderr}")
144
- else:
145
- logging.info(process.stdout)
146
- return media_out
147
-
148
- else:
149
- command = ['ffmpeg', '-i', media_in, '-vcodec', 'h264', '-acodec', 'aac', media_out]
150
- process = subprocess.run(command, capture_output=True, text=True)
151
- if process.returncode != 0:
152
- logging.info(f"Error: {process.stderr}")
153
- else:
154
- logging.info(process.stdout)
155
- return media_out
156
-
157
-
158
- except Exception as e:
159
- logging.error(f"Error converting video format: {e}")
160
- gr.Error(f"Error converting video format: {e}")
161
-
162
-
163
- def trim_media(media_in, media_out, start, end):
164
- try:
165
- # options for writing out audio if converting
166
- WAV_CHANNELS = 1
167
- WAV_SAMPLE_RATE = 16000
168
-
169
- media_type = Path(media_in).suffix.lower()
170
- ext = Path(media_out).suffix.lower()
171
- print("EXT", ext)
172
- if isinstance(start, str):
173
- start_sec = HHMMSS_to_sec(start)
174
- else:
175
- start_sec = float(start)
176
- if isinstance(end, str):
177
- end_sec = HHMMSS_to_sec(end)
178
- else:
179
- end_sec = float(end)
180
-
181
- if ext == '.wav':
182
- # convert to wav with standard format for audio models
183
- print(f'...Using ffmpeg to trim video from {start} to {end} \n and convert to {WAV_SAMPLE_RATE}Hz WAV with {WAV_CHANNELS} channels...')
184
- print(f'...generating {media_out}...')
185
-
186
- subprocess.run(f'ffmpeg -y -i "{media_in}" -ss {start_sec} -to {end_sec} -acodec pcm_s16le -ac {WAV_CHANNELS} -ar {WAV_SAMPLE_RATE} "{media_out}" -hide_banner -loglevel warning', check=True, shell=True)
187
-
188
- print(f'...done trimming and converting to {media_out}...')
189
- return media_out
190
-
191
- else:
192
- print(f'...Using ffmpeg to trim video from {start_sec} to {end_sec}...')
193
- print(f'...generating {media_out}...')
194
-
195
- subprocess.run(['ffmpeg',
196
- '-y',
197
- '-i',
198
- media_in,
199
- '-ss',
200
- f'{start_sec}',
201
- '-to',
202
- f'{end_sec}',
203
- '-c',
204
- 'copy',
205
- media_out,
206
- '-hide_banner',
207
- '-loglevel',
208
- 'warning'
209
- ], check=True)
210
- return media_out
211
- except Exception as e:
212
- gr.Error(f"Error trimming media: {e}")
213
-
214
  def HHMMSS_to_sec(time_str):
215
  """Get Seconds from timestamp string with milliseconds."""
216
- try:
217
- if not time_str:
218
- return None
219
- if time_str.count(':')==2:
220
- h, m, s = time_str.split(':')
221
- elif time_str.count(':')==3:
222
- # weird timestamps where there is a field followign seconds delimited by colon
223
- h, m, s, u = time_str.split(':')
224
- # determine whether ms field is in tenths or hundredths or thousandths by countng how many digits
225
- if len(u)==1:
226
- print('Weird time format detected - HH:MM:SS:tenths - please verify this is how you want the time interpreted')
227
- ms = float(u)/10
228
- elif len(u)==2: # hundredths
229
- ms = float(u)/100
230
- elif len(u)==3: # hundredths
231
- ms = float(u)/1000
232
- else:
233
- print(f'input string format not supported: {time_str}')
234
- return None
235
- s = int(s)+ms
236
- elif time_str.count(':')==1:
237
- # print('missing HH from timestamp, assuming MM:SS')
238
- m, s = time_str.split(':')
239
- h=0
240
  else:
241
  print(f'input string format not supported: {time_str}')
242
  return None
243
- return int(h) * 3600 + int(m) * 60 + float(s)
244
- except Exception as e:
245
- gr.Error(f"Error converting time to seconds: {e}")
 
 
 
 
 
 
 
 
 
 
 
246
 
247
  def molly_xlsx_to_table(xl_file):
248
  # contractor transcribers provide an xlsx with the following columns
@@ -307,4 +177,97 @@ def xlsx_to_table(xl_file):
307
  def table_to_ELAN_tsv(table:pd.DataFrame, path:str):
308
  # write table to tsv compatible with ELAN import
309
  table.to_csv(path, index=False, float_format='%.3f',sep='\t')
310
- return path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
 
76
  return save_path
77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
def HHMMSS_to_sec(time_str):
    """Get seconds from a timestamp string, optionally with a sub-second field.

    Supported inputs:

    * ``None`` or ``""``: no timestamp given, returns ``None``.
    * ``int`` / ``float``: already in seconds, returned as ``float``.
    * ``"SS"`` / ``"SS.fff"``: a plain number of seconds.
    * ``"MM:SS"`` and ``"HH:MM:SS"`` (the seconds field may carry decimals).
    * ``"HH:MM:SS:X"``, ``"HH:MM:SS:XX"``, ``"HH:MM:SS:XXX"``: the fourth
      field is read as tenths, hundredths or milliseconds depending on how
      many digits it has.

    Args:
        time_str: The timestamp (str, int, float or None).

    Returns:
        The time in seconds as a float, or ``None`` when the value is missing
        or its format is not supported.
    """
    # Test for "missing" explicitly: a plain truthiness check would also
    # reject a legitimate numeric 0 (e.g. "trim from the very beginning").
    if time_str is None or time_str == "":
        return None
    if isinstance(time_str, (int, float)):
        return float(time_str)

    n_colons = time_str.count(':')
    try:
        if n_colons == 0:
            return float(time_str)  # maybe it's already in seconds!
        if n_colons == 1:
            # missing HH from timestamp, assuming MM:SS
            h = 0
            m, s = time_str.split(':')
        elif n_colons == 2:
            h, m, s = time_str.split(':')
        elif n_colons == 3:
            # weird timestamps where there is a field following seconds,
            # delimited by a colon
            h, m, s, u = time_str.split(':')
            # decide whether the extra field is tenths, hundredths or
            # thousandths by counting how many digits it has
            unit_names = {
                1: 'tenths of a second',
                2: 'hundredths of a second',
                3: 'milliseconds',
            }
            if len(u) not in unit_names:
                print(f'input string format not supported: {time_str}')
                return None
            placeholder = 'X' * len(u)
            print(f'Weird time format with 3 colons detected - HH:MM:SS:{placeholder} . '
                  f'Interpreting {placeholder} as {unit_names[len(u)]}. '
                  '- please verify this is how you want the time interpreted')
            s = int(s) + float(u) / 10 ** len(u)
        else:
            print(f'input string format not supported: {time_str}')
            return None
        return int(h) * 3600 + int(m) * 60 + float(s)
    except (TypeError, ValueError):
        # Non-numeric fields: report it the same way as any other unsupported
        # format and let the caller decide how to surface it.
        print(f'input string format not supported: {time_str}')
        return None
115
+
116
 
117
  def molly_xlsx_to_table(xl_file):
118
  # contractor transcribers provide an xlsx with the following columns
 
177
  def table_to_ELAN_tsv(table:pd.DataFrame, path:str):
178
  # write table to tsv compatible with ELAN import
179
  table.to_csv(path, index=False, float_format='%.3f',sep='\t')
180
+ return path
181
+
def _media_time_to_seconds(value, label):
    """Return *value* as float seconds, or None when no time was given."""
    if value is None or value == "":
        return None
    if isinstance(value, (int, float)):
        # Already seconds; handled here so that 0 is kept as a valid time.
        return float(value)
    seconds = HHMMSS_to_sec(value)
    if seconds is None:
        raise ValueError(f"could not interpret {label} time {value!r}")
    return seconds


def convert_and_trim_video(media_in, media_out, start=None, end=None):
    """Convert and/or trim a media file with ffmpeg.

    The output format is chosen from the extension of ``media_out``:

    * ``.wav``: video is dropped and audio is written as 16-bit PCM,
      1 channel, 16 kHz (standard format for audio models).
    * input ``.webm`` going to a different extension: re-encoded with
      h264 video and aac audio.
    * anything else, including same-extension trimming: streams are copied
      without re-encoding.

    Args:
        media_in: Path of the source file (str or Path).
        media_out: Path of the file to create (str or Path); overwritten if
            it exists.
        start: Optional start of the section to keep, as seconds or a
            timestamp understood by ``HHMMSS_to_sec``. ``None``/``""`` means
            "from the beginning".
        end: Optional end of the section to keep, same formats as ``start``.
            ``None``/``""`` means "to the end of the file".

    Returns:
        ``media_out`` once ffmpeg has finished successfully.

    Raises:
        gr.Error: If the arguments are invalid or ffmpeg fails. The original
            exception is chained as the cause.
    """
    WAV_CHANNELS = 1
    WAV_SAMPLE_RATE = 16000
    try:
        if not isinstance(media_in, (str, Path)):
            raise TypeError("media_in must be a string or a PathLike object")
        if not isinstance(media_out, (str, Path)):
            raise TypeError("media_out must be a string or a PathLike object")

        start_sec = _media_time_to_seconds(start, 'start')
        end_sec = _media_time_to_seconds(end, 'end')

        if start_sec is None and end_sec is None:
            logging.info('...No start and end times provided. Converting entire video without trimming...')
            trim_command = []
        else:
            if start_sec is None:
                logging.info('...No start time provided. Trimming video from start to specified end...')
                start_sec = 0.0
            trim_command = ['-ss', f'{start_sec}']
            if end_sec is None:
                logging.info('...No end time provided. Trimming video from specified start to end of video...')
            else:
                trim_command.extend(['-to', f'{end_sec}'])

        in_ext = Path(media_in).suffix.lower()
        out_ext = Path(media_out).suffix.lower()
        logging.info(f'...detected extensions from filename: input={in_ext} output={out_ext}')
        if in_ext == out_ext:
            logging.info('...No media conversion needed...')
        else:
            logging.info(f'...Using ffmpeg to convert {in_ext} to {out_ext}...')

        if out_ext == '.wav':
            # convert to wav with standard format for audio models
            codec_args = ['-vn',
                          '-acodec', 'pcm_s16le',
                          '-ac', str(WAV_CHANNELS),
                          '-ar', str(WAV_SAMPLE_RATE)]
        elif in_ext == '.webm' and out_ext != in_ext:
            # webm streams are re-encoded when leaving the webm container
            codec_args = ['-vcodec', 'h264', '-acodec', 'aac']
        else:
            # same container family: copy the streams untouched
            codec_args = ['-c', 'copy']

        # Global options go first. No '-f' is placed before '-i': ffmpeg
        # applies it to the next file, so it would force the *input* to be
        # read as raw PCM instead of letting ffmpeg detect the container.
        command = ['ffmpeg',
                   '-hide_banner', '-loglevel', 'warning',
                   '-y',
                   '-i', str(media_in),
                   *trim_command,
                   *codec_args,
                   str(media_out)]
        logging.info(f"FFMPEG command: {' '.join(command)}")
        process = subprocess.run(command, capture_output=True, text=True)
        if process.returncode != 0:
            raise RuntimeError(
                f"ffmpeg exited with code {process.returncode}: {process.stderr}")
        if process.stderr:
            # ffmpeg writes its warnings to stderr even when it succeeds
            logging.warning(f"FFMPEG warnings: {process.stderr}")
        logging.info(process.stdout)
        return media_out

    except Exception as e:
        logging.error(f"Error converting video format: {e}")
        # gr.Error only reaches the user when it is raised.
        raise gr.Error(f"Error converting video format: {e}") from e