rosyvs committed on
Commit
5d0f90f
·
1 Parent(s): 0df4506

Add transcript sorting and merging tool for xlsx or csv input t

Browse files
Files changed (2) hide show
  1. app.py +60 -30
  2. utils.py +3 -3
app.py CHANGED
@@ -7,8 +7,8 @@ import random
7
  import gradio as gr
8
 
9
  from utils import (HHMMSS_to_sec, convert_and_trim_video,
10
- sort_transcript, table_to_ELAN_tsv,
11
- xlsx_to_table,
12
  convert_transcript_for_TM, convert_transcript_for_annotation,
13
  table_to_ELAN_tsv, ELAN_to_labels_csv, deidentify_speaker)
14
 
@@ -134,22 +134,6 @@ def trim_video_wt(input_file, input_transcript, output_format, start_time, end_t
134
  gr.Error(f"Error: {str(e)}")
135
  return f"Error: {str(e)}"
136
 
137
-
138
- def sort_transcript_helper(input_transcript, output_transcript):
139
- # sort transcript
140
- print("input_transcript: ", input_transcript)
141
- print("output_transcript: ", output_transcript)
142
- output_transcript = sort_transcript(input_transcript, output_transcript)
143
- print("finished sorting transcript")
144
- return output_transcript
145
-
146
- def sort_transcript_wrapper(input_file):
147
- print(f"\nBEGIN TASK: sorting transcript {input_file}")
148
- output_folder = f"{os.getcwd()}/results/"
149
- output_file_path = set_output_file(input_file, "tsv", output_folder, insert_string = 'sorted')
150
- output_file_path = sort_transcript_helper(input_file.name, output_file_path)
151
- return output_file_path
152
-
153
  def trim_video(input_file, output_format, start_time, end_time):
154
  print(f"\nBEGIN TASK: trimming {input_file} from {start_time} to {end_time}")
155
  try:
@@ -252,28 +236,67 @@ def convert_xlsx_to_ELANtsv(input_file_list):
252
  output_files.append(output_file)
253
  return output_files
254
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
255
 
256
  #TODO: support sort and merge for XLSX output if this is needed
257
 
258
- def convert_ELANtsv_to_CSV(input_file_list, merge_ellipsis=False):
259
  output_files=[]
260
  for input_transcript in input_file_list:
261
  # convert transcript
262
  print("start converting transcript")
263
  output_transcript = input_transcript.replace('.tsv', '.csv')
264
- output_file = ELAN_to_labels_csv(input_transcript, merge_segments = merge_ellipsis)
265
  print("finish converting transcript")
266
  output_files.append(output_file)
267
  return output_files
268
 
269
  # TODO: XLSX to csv (seg_labels or utt_labels)
270
- def convert_xlsx_to_csv(input_file_list, merge_ellipsis=False):
271
  output_files=[]
272
  for input_transcript in input_file_list:
273
  # read xl file to table
274
  # write table to csv with option to merge segments on ellipsis
275
  output_transcript = input_transcript.replace('.xlsx', '.csv')
276
- output_file = old_xlsx_to_labels_csv(input_transcript, merge_segments = merge_ellipsis)
277
  output_files.append(output_file)
278
  return output_files
279
 
@@ -329,13 +352,6 @@ interface_c = gr.Interface(fn=convert_video, inputs=[input_file_c, output_format
329
  description="Convert a video file to a different format. Please wait for the file to upload before clicking the 'Submit' button.")
330
 
331
 
332
- # gr components for transcript sorter
333
- input_file_s = gr.File(label="Select transcript file")
334
- output_file_s = gr.File(label="Download sorted transcript")
335
- interface_s = gr.Interface(fn=sort_transcript_wrapper, inputs=input_file_s, outputs=output_file_s, title="Transcript Sorter", flagging_mode="never",
336
- description="Sort a transcript file by time. Please wait for the file to upload before clicking the 'Submit' button.")
337
-
338
-
339
 
340
 
341
  # gr components for video trimmer with random start
@@ -428,7 +444,6 @@ interface_x2c = gr.Interface(fn=convert_xlsx_to_csv, # TODO: swap out for correc
428
  # gr components for annotation XLSX
429
  input_c2a = gr.Files(label="Input CSV file", type="filepath", file_types=[".csv"])
430
  annotation_scheme_c2a = gr.Radio(label="Annotation Scheme", choices=[("CPS","CPS"), ("TalkMove","TM"),("None",None)])
431
-
432
  output_c2a = gr.Files(label="Output XLSX file", type="filepath", file_types=[".xlsx"])
433
  interface_c2a = gr.Interface(
434
  fn=convert_for_annotation, # TODO: swap out for correct fn
@@ -456,6 +471,19 @@ interface_di = gr.Interface(
456
  )
457
 
458
 
 
 
 
 
 
 
 
 
 
 
 
 
 
459
 
460
  ######## LAUNCH APP ########
461
  demo = gr.TabbedInterface(
@@ -466,6 +494,7 @@ demo = gr.TabbedInterface(
466
  interface_c2a,
467
  interface_tm,
468
  interface_di,
 
469
  interface_c,
470
  interface,
471
  interface_vtr,
@@ -478,6 +507,7 @@ demo = gr.TabbedInterface(
478
  "πŸ—’οΈβ†’βŽβ˜· CSVβ†’XLSX",
479
  "πŸ—’οΈβ†’βŽπŸ’¬ CSVβ†’XLSX+TM",
480
  "πŸ—’οΈβ†’πŸ₯·πŸ» Deidentify",
 
481
  "πŸŽ₯β†’πŸ“½ Convert",
482
  "πŸŽ₯βœ‚οΈ Trim",
483
  "πŸŽ₯βœ‚οΈπŸŽ² Trim Random",
 
7
  import gradio as gr
8
 
9
  from utils import (HHMMSS_to_sec, convert_and_trim_video,
10
+ table_to_ELAN_tsv, parse_label_csv,
11
+ xlsx_to_table, merge_ellipsis,
12
  convert_transcript_for_TM, convert_transcript_for_annotation,
13
  table_to_ELAN_tsv, ELAN_to_labels_csv, deidentify_speaker)
14
 
 
134
  gr.Error(f"Error: {str(e)}")
135
  return f"Error: {str(e)}"
136
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
  def trim_video(input_file, output_format, start_time, end_time):
138
  print(f"\nBEGIN TASK: trimming {input_file} from {start_time} to {end_time}")
139
  try:
 
236
  output_files.append(output_file)
237
  return output_files
238
 
239
def _sort_merge_output_path(input_path, merged):
    """Return the .csv path a sorted (and optionally merged) transcript is saved to.

    Only the file name is rewritten; the directory part of ``input_path`` is
    kept as-is. The original extension is always replaced by ``.csv`` so the
    result can never be the input path itself.
    """
    folder, filename = os.path.split(input_path)
    stem, _ = os.path.splitext(filename)
    if not merged:
        return os.path.join(folder, f"{stem}_sorted.csv")
    if 'seg_labels' in stem:
        stem = stem.replace('seg_labels', 'utt_labels')
    elif 'seglabels' in stem:
        stem = stem.replace('seglabels', 'utt_labels')
    else:
        # no segment marker in the name: prefix the file name, not the path
        stem = f"utt_labels_{stem}"
    return os.path.join(folder, f"{stem}.csv")


def sort_and_merge(input_file_list, merge_on_ellipsis=False):
    """Sort transcripts by start time, optionally merge on ellipsis, save as CSV.

    Each input is loaded with ``xlsx_to_table`` (.xlsx/.xls) or
    ``parse_label_csv`` (.csv/.txt/.tsv), sorted on its ``start_sec`` column,
    optionally passed through ``merge_ellipsis``, and written next to the
    input as a .csv file. Inputs with any other extension are reported and
    skipped.

    Args:
        input_file_list: paths of the transcript files to process; ``None``
            is treated as an empty list.
        merge_on_ellipsis: when true, merge segments with ``merge_ellipsis``
            and name the output ``utt_labels``; otherwise the output gets a
            ``_sorted`` suffix.

    Returns:
        List of paths of the CSV files that were written, in input order.
    """
    output_files = []
    for input_transcript in input_file_list or []:
        lowered = input_transcript.lower()
        if lowered.endswith(('.xlsx', '.xls')):
            print("...input is xlsx")
            table = xlsx_to_table(xl_file=input_transcript)
        elif lowered.endswith(('.csv', '.txt', '.tsv')):
            print("...input is csv, txt, or tsv")
            table = parse_label_csv(input_transcript)
        else:
            print(f"...input {input_transcript} is not a supported file type")
            continue
        if table is None:
            # NOTE(review): xlsx_to_table appears to report failures through
            # gr.Error without raising, so it may return None - confirm.
            print(f"...could not load {input_transcript}, skipping")
            continue
        table = table.sort_values(by=['start_sec'])
        if merge_on_ellipsis:
            table = merge_ellipsis(table)
            print("finished sorting and merging segments")
        else:
            print("finished sorting segments")
        # The name is derived from the basename stem so that .xls/.tsv/.txt
        # inputs are never overwritten with CSV content.
        output_file = _sort_merge_output_path(input_transcript, merge_on_ellipsis)
        table.to_csv(output_file, index=False)
        print("saved processed transcript to csv")
        output_files.append(output_file)
    return output_files
278
 
279
  #TODO: support sort and merge for XLSX output if this is needed
280
 
281
def convert_ELANtsv_to_CSV(input_file_list, merge_on_ellipsis=False):
    """Convert each ELAN .tsv transcript in a list to a labels CSV.

    Args:
        input_file_list: paths of the ELAN .tsv files to convert; ``None``
            is treated as an empty list.
        merge_on_ellipsis: forwarded to ``ELAN_to_labels_csv`` as
            ``merge_segments``.

    Returns:
        List with the value returned by ``ELAN_to_labels_csv`` for every
        input, in input order.
    """
    output_files = []
    # `or []` lets a submit with no files selected return an empty result
    # instead of failing on iteration over None.
    for input_transcript in input_file_list or []:
        print("start converting transcript")
        output_file = ELAN_to_labels_csv(input_transcript, merge_segments=merge_on_ellipsis)
        print("finish converting transcript")
        output_files.append(output_file)
    return output_files
291
 
292
  # TODO: XLSX to csv (seg_labels or utt_labels)
293
def convert_xlsx_to_csv(input_file_list, merge_on_ellipsis=False):
    """Convert each .xlsx transcript in a list to a labels CSV.

    Args:
        input_file_list: paths of the .xlsx files to convert; ``None`` is
            treated as an empty list.
        merge_on_ellipsis: forwarded to ``old_xlsx_to_labels_csv`` as
            ``merge_segments``.

    Returns:
        List with the value returned by ``old_xlsx_to_labels_csv`` for every
        input, in input order.
    """
    output_files = []
    # `or []` lets a submit with no files selected return an empty result
    # instead of failing on iteration over None.
    for input_transcript in input_file_list or []:
        # NOTE(review): old_xlsx_to_labels_csv is not in the visible
        # `from utils import (...)` list - confirm it is in scope.
        output_file = old_xlsx_to_labels_csv(input_transcript, merge_segments=merge_on_ellipsis)
        output_files.append(output_file)
    return output_files
302
 
 
352
  description="Convert a video file to a different format. Please wait for the file to upload before clicking the 'Submit' button.")
353
 
354
 
 
 
 
 
 
 
 
355
 
356
 
357
  # gr components for video trimmer with random start
 
444
  # gr components for annotation XLSX
445
  input_c2a = gr.Files(label="Input CSV file", type="filepath", file_types=[".csv"])
446
  annotation_scheme_c2a = gr.Radio(label="Annotation Scheme", choices=[("CPS","CPS"), ("TalkMove","TM"),("None",None)])
 
447
  output_c2a = gr.Files(label="Output XLSX file", type="filepath", file_types=[".xlsx"])
448
  interface_c2a = gr.Interface(
449
  fn=convert_for_annotation, # TODO: swap out for correct fn
 
471
  )
472
 
473
 
474
# Sort+Merge tab: upload one or more transcripts, optionally merge on
# ellipsis, and download the processed .csv files.
input_file_s = gr.Files(
    label="Select transcript files",
    type="filepath",
    file_types=[".csv", ".xlsx", ".xls", ".tsv", ".txt"],
)
merge_s = gr.Checkbox(label="Merge segments on ellipsis?")
output_file_s = gr.Files(
    label="Download sorted/merged transcript as .csv",
    type="filepath",
    file_types=[".csv"],
)
interface_s = gr.Interface(
    fn=sort_and_merge,
    inputs=[input_file_s, merge_s],
    outputs=output_file_s,
    title="Sort+Merge",
    description="Sort a transcript file by time, and optionally merge partial utterances on ellipsis. Output is a .csv file in standard format.",
    live=False,
    flagging_mode="never",
)
485
+
486
+
487
 
488
  ######## LAUNCH APP ########
489
  demo = gr.TabbedInterface(
 
494
  interface_c2a,
495
  interface_tm,
496
  interface_di,
497
+ interface_s,
498
  interface_c,
499
  interface,
500
  interface_vtr,
 
507
  "πŸ—’οΈβ†’βŽβ˜· CSVβ†’XLSX",
508
  "πŸ—’οΈβ†’βŽπŸ’¬ CSVβ†’XLSX+TM",
509
  "πŸ—’οΈβ†’πŸ₯·πŸ» Deidentify",
510
+ "πŸ—’οΈπŸ”€πŸ—’οΈ Sort+Merge",
511
  "πŸŽ₯β†’πŸ“½ Convert",
512
  "πŸŽ₯βœ‚οΈ Trim",
513
  "πŸŽ₯βœ‚οΈπŸŽ² Trim Random",
utils.py CHANGED
@@ -25,7 +25,7 @@ def subprocess_run_verbose(cmd):
25
  res = subprocess.check_call(cmd, stdout=sys.stdout, stderr=subprocess.STDOUT)
26
  return res
27
 
28
- def sort_transcript(file_path: str, save_path: str) -> str:
29
  """
30
  Sort the rows of a transcript file by start time.
31
 
@@ -181,7 +181,8 @@ def xlsx_to_table(xl_file):
181
  # reorder columns
182
  print(f'...reordering columns...')
183
  table = table[['uttID', 'speaker', 'transcript', 'start_sec', 'end_sec']]
184
-
 
185
  return table
186
  except Exception as e:
187
  gr.Error(f'Error converting {xl_file}: {e}')
@@ -892,7 +893,6 @@ def parse_label_csv(label_csv:str):
892
  table=table[['uttID','speaker','start_sec','end_sec','utterance']]
893
  return table
894
 
895
-
896
  def deidentify_speaker(df, who='all'):
897
  """replace speaker ID with generic labels
898
  in order of appearance (speaker1, speaker2)'
 
25
  res = subprocess.check_call(cmd, stdout=sys.stdout, stderr=subprocess.STDOUT)
26
  return res
27
 
28
+ def sort_transcript(file_path: str):
29
  """
30
  Sort the rows of a transcript file by start time.
31
 
 
181
  # reorder columns
182
  print(f'...reordering columns...')
183
  table = table[['uttID', 'speaker', 'transcript', 'start_sec', 'end_sec']]
184
+ # sort by start time
185
+ table.sort_values('start_sec', inplace=True)
186
  return table
187
  except Exception as e:
188
  gr.Error(f'Error converting {xl_file}: {e}')
 
893
  table=table[['uttID','speaker','start_sec','end_sec','utterance']]
894
  return table
895
 
 
896
  def deidentify_speaker(df, who='all'):
897
  """replace speaker ID with generic labels
898
  in order of appearance (speaker1, speaker2)'