Spaces:
Sleeping
Sleeping
rosyvs
committed on
Commit
·
5d0f90f
1
Parent(s):
0df4506
Add transcript sorting and merging tool for xlsx or csv input t
Browse files
app.py
CHANGED
|
@@ -7,8 +7,8 @@ import random
|
|
| 7 |
import gradio as gr
|
| 8 |
|
| 9 |
from utils import (HHMMSS_to_sec, convert_and_trim_video,
|
| 10 |
-
|
| 11 |
-
xlsx_to_table,
|
| 12 |
convert_transcript_for_TM, convert_transcript_for_annotation,
|
| 13 |
table_to_ELAN_tsv, ELAN_to_labels_csv, deidentify_speaker)
|
| 14 |
|
|
@@ -134,22 +134,6 @@ def trim_video_wt(input_file, input_transcript, output_format, start_time, end_t
|
|
| 134 |
gr.Error(f"Error: {str(e)}")
|
| 135 |
return f"Error: {str(e)}"
|
| 136 |
|
| 137 |
-
|
| 138 |
-
def sort_transcript_helper(input_transcript, output_transcript):
|
| 139 |
-
# sort transcript
|
| 140 |
-
print("input_transcript: ", input_transcript)
|
| 141 |
-
print("output_transcript: ", output_transcript)
|
| 142 |
-
output_transcript = sort_transcript(input_transcript, output_transcript)
|
| 143 |
-
print("finished sorting transcript")
|
| 144 |
-
return output_transcript
|
| 145 |
-
|
| 146 |
-
def sort_transcript_wrapper(input_file):
|
| 147 |
-
print(f"\nBEGIN TASK: sorting transcript {input_file}")
|
| 148 |
-
output_folder = f"{os.getcwd()}/results/"
|
| 149 |
-
output_file_path = set_output_file(input_file, "tsv", output_folder, insert_string = 'sorted')
|
| 150 |
-
output_file_path = sort_transcript_helper(input_file.name, output_file_path)
|
| 151 |
-
return output_file_path
|
| 152 |
-
|
| 153 |
def trim_video(input_file, output_format, start_time, end_time):
|
| 154 |
print(f"\nBEGIN TASK: trimming {input_file} from {start_time} to {end_time}")
|
| 155 |
try:
|
|
@@ -252,28 +236,67 @@ def convert_xlsx_to_ELANtsv(input_file_list):
|
|
| 252 |
output_files.append(output_file)
|
| 253 |
return output_files
|
| 254 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 255 |
|
| 256 |
#TODO: support sort and merge for XLSX output if this is needed
|
| 257 |
|
| 258 |
-
def convert_ELANtsv_to_CSV(input_file_list,
|
| 259 |
output_files=[]
|
| 260 |
for input_transcript in input_file_list:
|
| 261 |
# convert transcript
|
| 262 |
print("start converting transcript")
|
| 263 |
output_transcript = input_transcript.replace('.tsv', '.csv')
|
| 264 |
-
output_file = ELAN_to_labels_csv(input_transcript, merge_segments =
|
| 265 |
print("finish converting transcript")
|
| 266 |
output_files.append(output_file)
|
| 267 |
return output_files
|
| 268 |
|
| 269 |
# TODO: XLSX to csv (seg_labels or utt_labels)
|
| 270 |
-
def convert_xlsx_to_csv(input_file_list,
|
| 271 |
output_files=[]
|
| 272 |
for input_transcript in input_file_list:
|
| 273 |
# read xl file to table
|
| 274 |
# write table to csv with option to merge segments on ellipsis
|
| 275 |
output_transcript = input_transcript.replace('.xlsx', '.csv')
|
| 276 |
-
output_file = old_xlsx_to_labels_csv(input_transcript, merge_segments =
|
| 277 |
output_files.append(output_file)
|
| 278 |
return output_files
|
| 279 |
|
|
@@ -329,13 +352,6 @@ interface_c = gr.Interface(fn=convert_video, inputs=[input_file_c, output_format
|
|
| 329 |
description="Convert a video file to a different format. Please wait for the file to upload before clicking the 'Submit' button.")
|
| 330 |
|
| 331 |
|
| 332 |
-
# gr components for transcript sorter
|
| 333 |
-
input_file_s = gr.File(label="Select transcript file")
|
| 334 |
-
output_file_s = gr.File(label="Download sorted transcript")
|
| 335 |
-
interface_s = gr.Interface(fn=sort_transcript_wrapper, inputs=input_file_s, outputs=output_file_s, title="Transcript Sorter", flagging_mode="never",
|
| 336 |
-
description="Sort a transcript file by time. Please wait for the file to upload before clicking the 'Submit' button.")
|
| 337 |
-
|
| 338 |
-
|
| 339 |
|
| 340 |
|
| 341 |
# gr components for video trimmer with random start
|
|
@@ -428,7 +444,6 @@ interface_x2c = gr.Interface(fn=convert_xlsx_to_csv, # TODO: swap out for correc
|
|
| 428 |
# gr components for annotation XLSX
|
| 429 |
input_c2a = gr.Files(label="Input CSV file", type="filepath", file_types=[".csv"])
|
| 430 |
annotation_scheme_c2a = gr.Radio(label="Annotation Scheme", choices=[("CPS","CPS"), ("TalkMove","TM"),("None",None)])
|
| 431 |
-
|
| 432 |
output_c2a = gr.Files(label="Output XLSX file", type="filepath", file_types=[".xlsx"])
|
| 433 |
interface_c2a = gr.Interface(
|
| 434 |
fn=convert_for_annotation, # TODO: swap out for correct fn
|
|
@@ -456,6 +471,19 @@ interface_di = gr.Interface(
|
|
| 456 |
)
|
| 457 |
|
| 458 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 459 |
|
| 460 |
######## LAUNCH APP ########
|
| 461 |
demo = gr.TabbedInterface(
|
|
@@ -466,6 +494,7 @@ demo = gr.TabbedInterface(
|
|
| 466 |
interface_c2a,
|
| 467 |
interface_tm,
|
| 468 |
interface_di,
|
|
|
|
| 469 |
interface_c,
|
| 470 |
interface,
|
| 471 |
interface_vtr,
|
|
@@ -478,6 +507,7 @@ demo = gr.TabbedInterface(
|
|
| 478 |
"ποΈβββ· CSVβXLSX",
|
| 479 |
"ποΈββπ¬ CSVβXLSX+TM",
|
| 480 |
"ποΈβπ₯·π» Deidentify",
|
|
|
|
| 481 |
"π₯βπ½ Convert",
|
| 482 |
"π₯βοΈ Trim",
|
| 483 |
"π₯βοΈπ² Trim Random",
|
|
|
|
| 7 |
import gradio as gr
|
| 8 |
|
| 9 |
from utils import (HHMMSS_to_sec, convert_and_trim_video,
|
| 10 |
+
table_to_ELAN_tsv, parse_label_csv,
|
| 11 |
+
xlsx_to_table, merge_ellipsis,
|
| 12 |
convert_transcript_for_TM, convert_transcript_for_annotation,
|
| 13 |
table_to_ELAN_tsv, ELAN_to_labels_csv, deidentify_speaker)
|
| 14 |
|
|
|
|
| 134 |
gr.Error(f"Error: {str(e)}")
|
| 135 |
return f"Error: {str(e)}"
|
| 136 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 137 |
def trim_video(input_file, output_format, start_time, end_time):
|
| 138 |
print(f"\nBEGIN TASK: trimming {input_file} from {start_time} to {end_time}")
|
| 139 |
try:
|
|
|
|
| 236 |
output_files.append(output_file)
|
| 237 |
return output_files
|
| 238 |
|
| 239 |
+
def _sort_merge_output_path(input_transcript, merge_on_ellipsis):
    """Build the output .csv path for a sorted (and optionally merged) transcript.

    The name is derived from the input file's stem so the result always ends
    in ``.csv`` and can never be the same path as the input, whatever the
    input extension was.
    """
    folder, filename = os.path.split(input_transcript)
    stem = os.path.splitext(filename)[0]
    if not merge_on_ellipsis:
        out_name = f"{stem}_sorted.csv"
    elif 'seg_labels' in stem:
        # only rename inside the file name, never inside directory names
        out_name = stem.replace('seg_labels', 'utt_labels') + '.csv'
    elif 'seglabels' in stem:
        out_name = stem.replace('seglabels', 'utt_labels') + '.csv'
    else:
        out_name = f"utt_labels_{stem}.csv"
    return os.path.join(folder, out_name)


def sort_and_merge(input_file_list, merge_on_ellipsis=False):
    """Sort transcripts by start time, optionally merge on ellipsis, save as csv.

    Each input is loaded with ``xlsx_to_table`` (.xlsx/.xls) or
    ``parse_label_csv`` (.csv/.txt/.tsv), sorted on the ``start_sec`` column,
    optionally passed through ``merge_ellipsis``, and written next to the
    input file as a new .csv.

    Args:
        input_file_list: iterable of transcript file paths (strings).
        merge_on_ellipsis: when True, merge segments with ``merge_ellipsis``
            and name the output ``utt_labels``; otherwise append ``_sorted``.

    Returns:
        List of paths of the csv files written. Inputs with an unsupported
        extension are reported on stdout and skipped.
    """
    output_files = []
    for input_transcript in input_file_list:
        # extension match is case-insensitive so e.g. ".XLSX" is accepted too
        lowered = input_transcript.lower()
        if lowered.endswith(('.xlsx', '.xls')):
            print("...input is xlsx")
            table = xlsx_to_table(xl_file=input_transcript)
        elif lowered.endswith(('.csv', '.txt', '.tsv')):
            print("...input is csv, txt, or tsv")
            table = parse_label_csv(input_transcript)
        else:
            print(f"...input {input_transcript} is not a supported file type")
            continue

        table = table.sort_values(by=['start_sec'])
        if merge_on_ellipsis:
            table = merge_ellipsis(table)
            print("finished sorting and merging segments")
        else:
            print("finished sorting segments")

        # previously .xls/.tsv/.txt inputs could yield output_file == input
        # path (overwriting the upload) or a non-csv extension
        output_file = _sort_merge_output_path(input_transcript, merge_on_ellipsis)
        table.to_csv(output_file, index=False)
        print("saved processed transcript to csv")
        output_files.append(output_file)
    return output_files
|
| 278 |
|
| 279 |
#TODO: support sort and merge for XLSX output if this is needed
|
| 280 |
|
| 281 |
+
def convert_ELANtsv_to_CSV(input_file_list, merge_on_ellipsis=False):
    """Convert each ELAN tsv transcript to a labels csv.

    Args:
        input_file_list: iterable of ELAN .tsv file paths.
        merge_on_ellipsis: forwarded to ``ELAN_to_labels_csv`` as
            ``merge_segments``.

    Returns:
        List of whatever ``ELAN_to_labels_csv`` returns for each input
        (presumably the written csv path — confirm against utils).
    """
    output_files = []
    for input_transcript in input_file_list:
        # convert transcript; the output location is chosen by the helper
        print("start converting transcript")
        output_file = ELAN_to_labels_csv(input_transcript, merge_segments=merge_on_ellipsis)
        print("finish converting transcript")
        output_files.append(output_file)
    return output_files
|
| 291 |
|
| 292 |
# TODO: XLSX to csv (seg_labels or utt_labels)
|
| 293 |
+
def convert_xlsx_to_csv(input_file_list, merge_on_ellipsis=False):
    """Convert each xlsx transcript to a labels csv.

    Args:
        input_file_list: iterable of .xlsx file paths.
        merge_on_ellipsis: forwarded to ``old_xlsx_to_labels_csv`` as
            ``merge_segments``.

    Returns:
        List of whatever ``old_xlsx_to_labels_csv`` returns for each input
        (presumably the written csv path — confirm against utils).
    """
    # NOTE(review): old_xlsx_to_labels_csv is not among the names in the
    # visible `from utils import (...)` statement — confirm it is in scope.
    output_files = []
    for input_transcript in input_file_list:
        # read xl file to table and write it to csv, optionally merging
        # segments on ellipsis; the output location is chosen by the helper
        output_file = old_xlsx_to_labels_csv(input_transcript, merge_segments=merge_on_ellipsis)
        output_files.append(output_file)
    return output_files
|
| 302 |
|
|
|
|
| 352 |
description="Convert a video file to a different format. Please wait for the file to upload before clicking the 'Submit' button.")
|
| 353 |
|
| 354 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 355 |
|
| 356 |
|
| 357 |
# gr components for video trimmer with random start
|
|
|
|
| 444 |
# gr components for annotation XLSX
|
| 445 |
input_c2a = gr.Files(label="Input CSV file", type="filepath", file_types=[".csv"])
|
| 446 |
annotation_scheme_c2a = gr.Radio(label="Annotation Scheme", choices=[("CPS","CPS"), ("TalkMove","TM"),("None",None)])
|
|
|
|
| 447 |
output_c2a = gr.Files(label="Output XLSX file", type="filepath", file_types=[".xlsx"])
|
| 448 |
interface_c2a = gr.Interface(
|
| 449 |
fn=convert_for_annotation, # TODO: swap out for correct fn
|
|
|
|
| 471 |
)
|
| 472 |
|
| 473 |
|
| 474 |
+
# gr components for transcript sorter / merger
# Inputs: one or more transcript files plus a checkbox that toggles merging.
input_file_s = gr.Files(
    label="Select transcript files",
    type="filepath",
    file_types=[".csv", ".xlsx", ".xls", ".tsv", ".txt"],
)
merge_s = gr.Checkbox(label="Merge segments on ellipsis?")
# Output: the csv files returned by sort_and_merge.
output_file_s = gr.Files(
    label="Download sorted/merged transcript as .csv",
    type="filepath",
    file_types=[".csv"],
)
interface_s = gr.Interface(
    fn=sort_and_merge,
    inputs=[input_file_s, merge_s],
    outputs=output_file_s,
    title="Sort+Merge",
    description="Sort a transcript file by time, and optionally merge partial utterances on ellipsis. Output is a .csv file in standard format.",
    live=False,
    flagging_mode="never",
)
|
| 485 |
+
|
| 486 |
+
|
| 487 |
|
| 488 |
######## LAUNCH APP ########
|
| 489 |
demo = gr.TabbedInterface(
|
|
|
|
| 494 |
interface_c2a,
|
| 495 |
interface_tm,
|
| 496 |
interface_di,
|
| 497 |
+
interface_s,
|
| 498 |
interface_c,
|
| 499 |
interface,
|
| 500 |
interface_vtr,
|
|
|
|
| 507 |
"ποΈβββ· CSVβXLSX",
|
| 508 |
"ποΈββπ¬ CSVβXLSX+TM",
|
| 509 |
"ποΈβπ₯·π» Deidentify",
|
| 510 |
+
"ποΈπποΈ Sort+Merge",
|
| 511 |
"π₯βπ½ Convert",
|
| 512 |
"π₯βοΈ Trim",
|
| 513 |
"π₯βοΈπ² Trim Random",
|
utils.py
CHANGED
|
@@ -25,7 +25,7 @@ def subprocess_run_verbose(cmd):
|
|
| 25 |
res = subprocess.check_call(cmd, stdout=sys.stdout, stderr=subprocess.STDOUT)
|
| 26 |
return res
|
| 27 |
|
| 28 |
-
def sort_transcript(file_path: str
|
| 29 |
"""
|
| 30 |
Sort the rows of a transcript file by start time.
|
| 31 |
|
|
@@ -181,7 +181,8 @@ def xlsx_to_table(xl_file):
|
|
| 181 |
# reorder columns
|
| 182 |
print(f'...reordering columns...')
|
| 183 |
table = table[['uttID', 'speaker', 'transcript', 'start_sec', 'end_sec']]
|
| 184 |
-
|
|
|
|
| 185 |
return table
|
| 186 |
except Exception as e:
|
| 187 |
gr.Error(f'Error converting {xl_file}: {e}')
|
|
@@ -892,7 +893,6 @@ def parse_label_csv(label_csv:str):
|
|
| 892 |
table=table[['uttID','speaker','start_sec','end_sec','utterance']]
|
| 893 |
return table
|
| 894 |
|
| 895 |
-
|
| 896 |
def deidentify_speaker(df, who='all'):
|
| 897 |
"""replace speaker ID with generic labels
|
| 898 |
in order of appearance (speaker1, speaker2)'
|
|
|
|
| 25 |
res = subprocess.check_call(cmd, stdout=sys.stdout, stderr=subprocess.STDOUT)
|
| 26 |
return res
|
| 27 |
|
| 28 |
+
def sort_transcript(file_path: str):
|
| 29 |
"""
|
| 30 |
Sort the rows of a transcript file by start time.
|
| 31 |
|
|
|
|
| 181 |
# reorder columns
|
| 182 |
print(f'...reordering columns...')
|
| 183 |
table = table[['uttID', 'speaker', 'transcript', 'start_sec', 'end_sec']]
|
| 184 |
+
# sort by start time
|
| 185 |
+
table.sort_values('start_sec', inplace=True)
|
| 186 |
return table
|
| 187 |
except Exception as e:
|
| 188 |
gr.Error(f'Error converting {xl_file}: {e}')
|
|
|
|
| 893 |
table=table[['uttID','speaker','start_sec','end_sec','utterance']]
|
| 894 |
return table
|
| 895 |
|
|
|
|
| 896 |
def deidentify_speaker(df, who='all'):
|
| 897 |
"""replace speaker ID with generic labels
|
| 898 |
in order of appearance (speaker1, speaker2)'
|