Spaces:

levicu
/

transcriber_tools

Sleeping

App Files Files Community

rosyvs committed on May 2, 2025

Commit

5d0f90f

1 Parent(s): 0df4506

Add transcript sorting and merging tool for xlsx or csv input t

Browse files

Files changed (2) hide show

app.py +60 -30
utils.py +3 -3

app.py CHANGED Viewed

@@ -7,8 +7,8 @@ import random
 import gradio as gr
 from utils import (HHMMSS_to_sec,  convert_and_trim_video,
-                   sort_transcript, table_to_ELAN_tsv,
-                   xlsx_to_table,
                    convert_transcript_for_TM, convert_transcript_for_annotation,
                    table_to_ELAN_tsv, ELAN_to_labels_csv, deidentify_speaker)
@@ -134,22 +134,6 @@ def trim_video_wt(input_file, input_transcript, output_format, start_time, end_t
         gr.Error(f"Error: {str(e)}")
         return f"Error: {str(e)}"
-def sort_transcript_helper(input_transcript, output_transcript):
-    # sort transcript
-    print("input_transcript: ", input_transcript)
-    print("output_transcript: ", output_transcript)
-    output_transcript = sort_transcript(input_transcript, output_transcript)
-    print("finished sorting transcript")
-    return output_transcript
-def sort_transcript_wrapper(input_file):
-    print(f"\nBEGIN TASK: sorting transcript {input_file}")
-    output_folder = f"{os.getcwd()}/results/"
-    output_file_path = set_output_file(input_file, "tsv", output_folder, insert_string = 'sorted')
-    output_file_path = sort_transcript_helper(input_file.name, output_file_path)
-    return output_file_path
 def trim_video(input_file, output_format, start_time, end_time):
     print(f"\nBEGIN TASK: trimming {input_file} from {start_time} to {end_time}")
     try:
@@ -252,28 +236,67 @@ def convert_xlsx_to_ELANtsv(input_file_list):
         output_files.append(output_file)
     return output_files
 #TODO: support sort and merge for XLSX output if this is needed
-def convert_ELANtsv_to_CSV(input_file_list, merge_ellipsis=False):
     output_files=[]
     for input_transcript in input_file_list:
         # convert transcript
         print("start converting transcript")
         output_transcript = input_transcript.replace('.tsv', '.csv')
-        output_file = ELAN_to_labels_csv(input_transcript, merge_segments = merge_ellipsis)
         print("finish converting transcript")
         output_files.append(output_file)
     return output_files
 # TODO: XLSX to csv (seg_labels or utt_labels)
-def convert_xlsx_to_csv(input_file_list, merge_ellipsis=False):
     output_files=[]
     for input_transcript in input_file_list:
         # read xl file to table
         # write table to csv with option to merge segments on ellipsis
         output_transcript = input_transcript.replace('.xlsx', '.csv')
-        output_file = old_xlsx_to_labels_csv(input_transcript, merge_segments = merge_ellipsis)
         output_files.append(output_file)
     return output_files
@@ -329,13 +352,6 @@ interface_c = gr.Interface(fn=convert_video, inputs=[input_file_c, output_format
                             description="Convert a video file to a different format. Please wait for the file to upload before clicking the 'Submit' button.")
-# gr components for transcript sorter
-input_file_s = gr.File(label="Select transcript file")
-output_file_s = gr.File(label="Download sorted transcript")
-interface_s = gr.Interface(fn=sort_transcript_wrapper, inputs=input_file_s, outputs=output_file_s, title="Transcript Sorter", flagging_mode="never",
-                            description="Sort a transcript file by time. Please wait for the file to upload before clicking the 'Submit' button.")
 # gr components for video trimmer with random start
@@ -428,7 +444,6 @@ interface_x2c = gr.Interface(fn=convert_xlsx_to_csv, # TODO: swap out for correc
 # gr components for annotation XLSX
 input_c2a = gr.Files(label="Input CSV file", type="filepath", file_types=[".csv"])
 annotation_scheme_c2a = gr.Radio(label="Annotation Scheme", choices=[("CPS","CPS"), ("TalkMove","TM"),("None",None)])
 output_c2a = gr.Files(label="Output XLSX file", type="filepath", file_types=[".xlsx"])
 interface_c2a = gr.Interface(
                         fn=convert_for_annotation, # TODO: swap out for correct fn
@@ -456,6 +471,19 @@ interface_di = gr.Interface(
     )
 ######## LAUNCH APP ########
 demo = gr.TabbedInterface(
@@ -466,6 +494,7 @@ demo = gr.TabbedInterface(
     interface_c2a,
     interface_tm,
     interface_di,
     interface_c,
     interface,
     interface_vtr,
@@ -478,6 +507,7 @@ demo = gr.TabbedInterface(
     "🗒️→❎☷ CSV→XLSX",
     "🗒️→❎💬 CSV→XLSX+TM",
     "🗒️→🥷🏻 Deidentify",
     "🎥→📽 Convert",
     "🎥✂️ Trim",
     "🎥✂️🎲 Trim Random",

 import gradio as gr
 from utils import (HHMMSS_to_sec,  convert_and_trim_video,
+                   table_to_ELAN_tsv, parse_label_csv,
+                   xlsx_to_table, merge_ellipsis,
                    convert_transcript_for_TM, convert_transcript_for_annotation,
                    table_to_ELAN_tsv, ELAN_to_labels_csv, deidentify_speaker)
         gr.Error(f"Error: {str(e)}")
         return f"Error: {str(e)}"
 def trim_video(input_file, output_format, start_time, end_time):
     print(f"\nBEGIN TASK: trimming {input_file} from {start_time} to {end_time}")
     try:
         output_files.append(output_file)
     return output_files
+def sort_and_merge(input_file_list, merge_on_ellipsis=False):
+    """Sort transcripts by start time, optionally merge segments, and save as CSV.
+
+    Each input path is loaded into a table (``xlsx_to_table`` for .xlsx/.xls,
+    ``parse_label_csv`` for .csv/.txt/.tsv), sorted on the ``start_sec`` column,
+    optionally passed through ``merge_ellipsis``, and written next to the input
+    as a new ``.csv`` file. Unsupported or unreadable inputs are skipped.
+
+    Args:
+        input_file_list: iterable of file path strings; ``None`` is treated as
+            an empty list.
+        merge_on_ellipsis: when True, run ``merge_ellipsis`` on the sorted table
+            and name the output ``utt_labels``; otherwise append ``_sorted``.
+
+    Returns:
+        List of paths of the CSV files that were written.
+    """
+    excel_exts = ('.xlsx', '.xls')
+    text_exts = ('.csv', '.txt', '.tsv')
+    output_files = []
+    # `or []` guards against None, e.g. when no files were provided
+    for input_transcript in input_file_list or []:
+        # Work on the file name only, so replacements never touch directory names
+        folder, filename = os.path.split(input_transcript)
+        stem, ext = os.path.splitext(filename)
+        ext = ext.lower()
+        if ext in excel_exts:
+            print("...input is xlsx")
+            table = xlsx_to_table(xl_file=input_transcript)
+        elif ext in text_exts:
+            print("...input is csv, txt, or tsv")
+            table = parse_label_csv(input_transcript)
+        else:
+            print(f"...input {input_transcript} is not a supported file type")
+            continue
+        # NOTE(review): xlsx_to_table builds gr.Error without raising in its except
+        # branch, so it presumably returns None on failure - skip instead of crashing
+        if table is None:
+            print(f"...could not read {input_transcript}, skipping")
+            continue
+        # mergesort is stable: segments with equal start_sec keep their file order
+        table = table.sort_values(by=['start_sec'], kind='mergesort')
+        if merge_on_ellipsis:
+            table = merge_ellipsis(table)
+            print("finished sorting and merging segments")
+            # make filename
+            if 'seg_labels' in stem:
+                output_stem = stem.replace('seg_labels', 'utt_labels')
+            elif 'seglabels' in stem:
+                output_stem = stem.replace('seglabels', 'utt_labels')
+            else:
+                output_stem = f"utt_labels_{stem}"
+        else:
+            print("finished sorting segments")
+            output_stem = f"{stem}_sorted"
+        # Always write a new .csv name so the input (.xls/.tsv/.txt) is never overwritten
+        output_file = os.path.join(folder, f"{output_stem}.csv")
+        # save to csv
+        table.to_csv(output_file, index=False)
+        print("saved processed transcript to csv")
+        output_files.append(output_file)
+    return output_files
 #TODO: support sort and merge for XLSX output if this is needed
+def convert_ELANtsv_to_CSV(input_file_list, merge_on_ellipsis=False):
+    """Run ELAN_to_labels_csv on every path in input_file_list.
+
+    merge_on_ellipsis is forwarded as the merge_segments argument.
+    Returns the list of values returned by ELAN_to_labels_csv, in input order.
+    """
     output_files=[]
     for input_transcript in input_file_list:
         # convert transcript
         print("start converting transcript")
+        # NOTE(review): output_transcript is computed but not used in this function
         output_transcript = input_transcript.replace('.tsv', '.csv')
+        output_file = ELAN_to_labels_csv(input_transcript, merge_segments = merge_on_ellipsis)
         print("finish converting transcript")
         output_files.append(output_file)
     return output_files
 # TODO: XLSX to csv (seg_labels or utt_labels)
+def convert_xlsx_to_csv(input_file_list, merge_on_ellipsis=False):
+    """Run old_xlsx_to_labels_csv on every path in input_file_list.
+
+    merge_on_ellipsis is forwarded as the merge_segments argument.
+    Returns the list of values returned by old_xlsx_to_labels_csv, in input order.
+    """
     output_files=[]
     for input_transcript in input_file_list:
         # read xl file to table
         # write table to csv with option to merge segments on ellipsis
+        # NOTE(review): output_transcript is computed but not used in this function
         output_transcript = input_transcript.replace('.xlsx', '.csv')
+        # NOTE(review): old_xlsx_to_labels_csv is not in the `from utils import` list
+        # shown in this diff - confirm it is defined or imported elsewhere
+        output_file = old_xlsx_to_labels_csv(input_transcript, merge_segments = merge_on_ellipsis)
         output_files.append(output_file)
     return output_files
                             description="Convert a video file to a different format. Please wait for the file to upload before clicking the 'Submit' button.")
 # gr components for video trimmer with random start
 # gr components for annotation XLSX
 input_c2a = gr.Files(label="Input CSV file", type="filepath", file_types=[".csv"])
 annotation_scheme_c2a = gr.Radio(label="Annotation Scheme", choices=[("CPS","CPS"), ("TalkMove","TM"),("None",None)])
 output_c2a = gr.Files(label="Output XLSX file", type="filepath", file_types=[".xlsx"])
 interface_c2a = gr.Interface(
                         fn=convert_for_annotation, # TODO: swap out for correct fn
     )
+# gr components for transcript sorter
+# The two inputs map positionally onto sort_and_merge(input_file_list, merge_on_ellipsis).
+# type="filepath" is used because sort_and_merge calls str methods (.endswith) on each item.
+input_file_s = gr.Files(label="Select transcript files", type="filepath", file_types=[".csv", ".xlsx",".xls", ".tsv", ".txt"])
+merge_s = gr.Checkbox(label="Merge segments on ellipsis?")
+# sort_and_merge returns a list of output paths, shown here as downloadable files
+output_file_s = gr.Files(label="Download sorted/merged transcript as .csv", type="filepath", file_types=[".csv"])
+interface_s = gr.Interface(fn=sort_and_merge,
+                           inputs=[input_file_s, merge_s],
+                            outputs=output_file_s,
+                            title="Sort+Merge",
+                            description="Sort a transcript file by time, and optionally merge partial utterances on ellipsis. Output is a .csv file in standard format.",
+                            live=False,
+                            flagging_mode="never")
 ######## LAUNCH APP ########
 demo = gr.TabbedInterface(
     interface_c2a,
     interface_tm,
     interface_di,
+    interface_s,
     interface_c,
     interface,
     interface_vtr,
     "🗒️→❎☷ CSV→XLSX",
     "🗒️→❎💬 CSV→XLSX+TM",
     "🗒️→🥷🏻 Deidentify",
+    "🗒️🔀🗒️ Sort+Merge",
     "🎥→📽 Convert",
     "🎥✂️ Trim",
     "🎥✂️🎲 Trim Random",

utils.py CHANGED Viewed

@@ -25,7 +25,7 @@ def subprocess_run_verbose(cmd):
     res = subprocess.check_call(cmd, stdout=sys.stdout, stderr=subprocess.STDOUT)
     return res
-def sort_transcript(file_path: str, save_path: str) -> str:
     """
     Sort the rows of a transcript file by start time.
@@ -181,7 +181,8 @@ def xlsx_to_table(xl_file):
         # reorder columns
         print(f'...reordering columns...')
         table = table[['uttID', 'speaker', 'transcript', 'start_sec', 'end_sec']]
         return table
     except Exception as e:
         gr.Error(f'Error converting {xl_file}: {e}')
@@ -892,7 +893,6 @@ def parse_label_csv(label_csv:str):
     table=table[['uttID','speaker','start_sec','end_sec','utterance']]
     return table
 def deidentify_speaker(df, who='all'):
     """replace speaker ID with generic labels
     in order of appearance (speaker1, speaker2)'

     res = subprocess.check_call(cmd, stdout=sys.stdout, stderr=subprocess.STDOUT)
     return res
+def sort_transcript(file_path: str):
     """
     Sort the rows of a transcript file by start time.
         # reorder columns
         print(f'...reordering columns...')
         table = table[['uttID', 'speaker', 'transcript', 'start_sec', 'end_sec']]
+        # sort by start time
+        table.sort_values('start_sec', inplace=True)
         return table
     except Exception as e:
         gr.Error(f'Error converting {xl_file}: {e}')
     table=table[['uttID','speaker','start_sec','end_sec','utterance']]
     return table
 def deidentify_speaker(df, who='all'):
     """replace speaker ID with generic labels
     in order of appearance (speaker1, speaker2)'