rosyvs committed on
Commit
d9fb961
·
1 Parent(s): 9636874

Add transcript processing functions to utils, expose them in the interface, and enhance utils for TM conversion

Browse files
Files changed (2) hide show
  1. app.py +234 -4
  2. utils.py +626 -1
app.py CHANGED
@@ -6,9 +6,11 @@ from pathlib import Path
6
  import random
7
  import gradio as gr
8
 
9
- from utils import (HHMMSS_to_sec, molly_xlsx_to_table, convert_and_trim_video,
10
  sort_transcript, table_to_ELAN_tsv,
11
- xlsx_to_table)
 
 
12
 
13
 
14
  def delete_files(files):
@@ -193,6 +195,121 @@ def convert_video(input_file, output_format):
193
  gr.Error(f"Error: {str(e)}")
194
  return f"Error: {str(e)}"
195
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
196
 
197
  # gr components for video trimmer
198
  input_file = gr.File(label="Select video file")
@@ -254,5 +371,118 @@ interface_wt = gr.Interface(fn=trim_video_wt, inputs=[input_file_wt, input_trans
254
  `Annotations`: a string that may be blank, representing any annotations for the utterance. \n\
255
  `Error Type`: a string that may be blank, representing any errors in the transcription of the utterance. ")
256
 
257
- demo = gr.TabbedInterface([interface_c, interface, interface_vtr, interface_wt ], ["Video Converter", "Video Trimmer", "Video Trimmer with Random Start Time", "Video Trimmer with Transcript"])
258
- demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  import random
7
  import gradio as gr
8
 
9
+ from utils import (HHMMSS_to_sec, convert_and_trim_video,
10
  sort_transcript, table_to_ELAN_tsv,
11
+ xlsx_to_table,
12
+ convert_transcript_for_TM, convert_transcript_for_annotation,
13
+ table_to_ELAN_tsv, ELAN_to_labels_csv, deidentify_speaker)
14
 
15
 
16
  def delete_files(files):
 
195
  gr.Error(f"Error: {str(e)}")
196
  return f"Error: {str(e)}"
197
 
198
def delete_files(output_filepath_list, trans_log_filepath_list, global_log_filepath):
    """Best-effort removal of generated output and log files.

    Removes every path in ``output_filepath_list`` and
    ``trans_log_filepath_list`` plus the single ``global_log_filepath``,
    silently ignoring paths that no longer exist on disk.

    NOTE(review): this redefines the earlier ``delete_files(files)`` handler
    in this module and shadows it -- confirm that is intentional.
    """
    all_paths = [*output_filepath_list, *trans_log_filepath_list, global_log_filepath]
    for path in all_paths:
        try:
            os.remove(path)
        except FileNotFoundError:
            pass
    print("Files deleted")
214
+
215
def delete_files_thread(output_filepath_list, trans_log_filepath_list, global_log_filepath):
    """Delayed cleanup worker: sleep 20 s, then delete the listed files.

    Presumably the delay gives the client time to download the generated
    files before they are removed -- TODO confirm.
    """
    print("Thread started")
    grace_period_sec = 20
    time.sleep(grace_period_sec)
    delete_files(output_filepath_list, trans_log_filepath_list, global_log_filepath)
219
+
220
def convert_xlsx_to_TMxlsx(input_file_list):
    """Gradio handler: convert uploaded transcripts to TalkMoves XLSX.

    Returns a tuple of (output file paths, per-file log paths, global
    transfer log path, error report string/list), and schedules all
    generated files for background deletion after a grace period.
    """
    uploaded_paths = [uploaded.name for uploaded in input_file_list]
    outputs, logs, errors, global_log = convert_transcript_for_TM(file_list=uploaded_paths)
    if not errors:
        errors = "No errors found."
    # Clean up the generated artifacts in the background once served.
    cleanup = threading.Thread(
        target=delete_files_thread,
        args=(outputs, logs, global_log),
    )
    cleanup.start()
    return outputs, logs, global_log, errors
231
+
232
def convert_for_annotation(input_file_list, annotation_scheme):
    """Convert each uploaded transcript into an annotation-ready XLSX.

    ``annotation_scheme`` is forwarded to convert_transcript_for_annotation
    (expected values, per the UI radio: 'CPS', 'TM', or None).
    Returns the list of converted file paths.
    """
    converted = []
    for transcript_path in input_file_list:
        print("start converting transcript")
        result = convert_transcript_for_annotation(
            file=transcript_path, annotation_scheme=annotation_scheme
        )
        print("finished converting transcript to xlsx for annotation")
        converted.append(result)
    return converted
240
+
241
+
242
def convert_xlsx_to_ELANtsv(input_file_list):
    """Convert each XLSX transcript into an ELAN-compatible TSV file.

    Returns the list of output paths produced by table_to_ELAN_tsv.

    NOTE(review): `old_xlsx_to_table` is not among the names imported from
    utils at the top of this file -- calling this handler will raise
    NameError at runtime. Confirm whether `xlsx_to_table` was intended.
    """
    output_files=[]
    for input_transcript in input_file_list:
        # convert transcript
        print("start converting transcript")
        table = old_xlsx_to_table(xl_file=input_transcript)
        print("finished converting transcript to table")
        # output path: same location, .tsv extension
        output_transcript = input_transcript.replace('.xlsx', '.tsv')
        output_file = table_to_ELAN_tsv(table, output_transcript)
        print("saved table to tsv")
        output_files.append(output_file)
    return output_files
254
+
255
+
256
+ #TODO: support sort and merge for XLSX output if this is needed
257
+
258
def convert_ELANtsv_to_CSV(input_file_list, merge_ellipsis=False):
    """Convert ELAN-exported TSV transcripts to standardized CSV files.

    Args:
        input_file_list: paths of ELAN .tsv/.txt export files.
        merge_ellipsis: if True, merge segments that were split on ellipsis.

    Returns:
        List of output file paths returned by ELAN_to_labels_csv.
    """
    output_files = []
    for input_transcript in input_file_list:
        print("start converting transcript")
        # ELAN_to_labels_csv derives the output path itself; the previously
        # precomputed '.csv' path was never used and has been removed.
        output_file = ELAN_to_labels_csv(input_transcript, merge_segments=merge_ellipsis)
        print("finish converting transcript")
        output_files.append(output_file)
    return output_files
268
+
269
+ # TODO: XLSX to csv (seg_labels or utt_labels)
270
def convert_xlsx_to_csv(input_file_list, merge_ellipsis=False):
    """Convert old-style XLSX transcripts (single Timecode column) to CSV.

    Args:
        input_file_list: paths of .xlsx transcript files.
        merge_ellipsis: if True, merge segments that were split on ellipsis.

    Returns:
        List of output CSV file paths.

    NOTE(review): `old_xlsx_to_labels_csv` is not among the names imported
    from utils at the top of this file -- confirm the intended helper name.
    """
    output_files = []
    for input_transcript in input_file_list:
        # old_xlsx_to_labels_csv derives the output path itself; the unused
        # precomputed output_transcript variable was removed.
        output_file = old_xlsx_to_labels_csv(input_transcript, merge_segments=merge_ellipsis)
        output_files.append(output_file)
    return output_files
279
+
280
def deidentify_transcripts(input_file_list, who='student'):
    """Replace speaker labels with deidentified labels in each transcript.

    Supports .xlsx/.xls/.csv/.tsv/.txt inputs; writes a sibling file with
    '_deidentified' inserted before the extension.

    Args:
        input_file_list: paths of transcript files.
        who: 'student' to deidentify only students, 'all' for all speakers
            (forwarded to deidentify_speaker).

    Returns:
        List of output file paths; inputs that fail (unsupported type, or
        deidentify_speaker raising ValueError) are skipped with a warning.
    """
    output_files = []
    for file in input_file_list:
        basename = os.path.basename(file)
        ext = file.split('.')[-1]
        if file.endswith(('.xlsx', '.xls')):
            df = pd.read_excel(file)
        elif file.endswith('.csv'):
            df = pd.read_csv(file)
        elif file.endswith(('.tsv', '.txt')):
            df = pd.read_csv(file, sep='\t')
        else:
            gr.Warning("File type not supported (must be .xlsx, .xls, .csv, .tsv, or .txt)")
            # BUGFIX: previously fell through with `df` unbound, so the next
            # statement raised NameError; skip unsupported files instead.
            continue
        try:
            df = deidentify_speaker(df, who=who)
        except ValueError as e:
            gr.Warning(f"{e}: {basename} ")
            continue
        output_file = file.replace(f'.{ext}', f'_deidentified.{ext}')
        if ext in ('xlsx', 'xls'):
            df.to_excel(output_file, index=False)
        elif ext == 'csv':
            df.to_csv(output_file, index=False)
        elif ext in ('tsv', 'txt'):
            df.to_csv(output_file, sep='\t', index=False)
        output_files.append(output_file)
    return output_files
309
+
310
+
311
+
312
+ ###### GRADIO INTERFACE ######
313
 
314
  # gr components for video trimmer
315
  input_file = gr.File(label="Select video file")
 
371
  `Annotations`: a string that may be blank, representing any annotations for the utterance. \n\
372
  `Error Type`: a string that may be blank, representing any errors in the transcription of the utterance. ")
373
 
374
+
375
#### TRANSCRIPT COMPONENTS ####
# gr components for TM converter: transcript -> XLSX with TalkMoves dropdowns
input_xlsx = gr.Files(label="Input XLSX or CSV transcript file", type="filepath", file_types=[".xlsx", ".csv"])
output_xlsx_tm = gr.Files(label="Output XLSX file", type="filepath", file_types=[".xlsx"])
process_log_tm = gr.File(label="Process Log", type="filepath", file_types=[".log", ".txt"] )
global_transfer_log_tm = gr.File(label="Global transfer log", type="filepath", file_types=[".log", ".txt"])
error_check_tm = gr.Textbox(label="Error Check", type="text")
interface_tm = gr.Interface(fn=convert_xlsx_to_TMxlsx,
                            inputs=input_xlsx,
                            outputs=[output_xlsx_tm, process_log_tm, global_transfer_log_tm, error_check_tm],
                            title="transcript-->XLSX+TM_dropdown",
                            description="Converts XLSX or csv transcript to XLSX+TM transcript with prefilled dropdown for talkmoves",
                            live=False,
                            allow_flagging="never",)

# gr components for xlsx to ELAN
input_x2e = gr.Files(label="Input XLSX or CSV transcript file", type="filepath", file_types=[".xlsx", ".csv"])
output_x2e = gr.Files(label="Output ELAN-compatible tsv file", type="filepath", file_types=[".tsv",'.txt'])
# process_log_x2e = gr.File(label="Process Log", type="filepath", file_types=[".log", ".txt"] )
# global_transfer_log_x2e = gr.File(label="Global transfer log", type="filepath", file_types=[".log", ".txt"])
# error_check_x2e = gr.Textbox(label="Error Check", type="text")
interface_x2e = gr.Interface(fn=convert_xlsx_to_ELANtsv, # TODO: swap out for correct fn
                             inputs=input_x2e,
                             outputs=output_x2e,
                             title="XLSX-->ELAN",
                             description="Converts XLSX transcript to ELAN-compatible tsv file",
                             live=False,
                             allow_flagging="never",)

# gr components for ELAN to CSV
input_e2c = gr.Files(label="Input ELAN-compatible tsv file", type="filepath", file_types=[".tsv",'.txt'])
merge_e2c = gr.Checkbox(label="Merge segments on ellipsis?")
output_e2c = gr.Files(label="Output CSV file", type="filepath", file_types=[".csv"])
interface_e2c = gr.Interface(fn=convert_ELANtsv_to_CSV, # TODO: swap out for correct fn
                             inputs=[input_e2c, merge_e2c],
                             outputs=[output_e2c],
                             title="ELAN-->CSV",
                             description="Converts ELAN-exported file (.txt or .tsv, tab separated values) to standardized CSV file with rows sorted by segment start time. Optionally merges segments on ellipsis.",
                             live=False,
                             allow_flagging="never",)

# gr components for XLSX to CSV
input_x2c = gr.Files(label="Input XLSX file", type="filepath", file_types=[".xlsx", ".csv"])
merge_x2c = gr.Checkbox(label="Merge segments on ellipsis?")
output_x2c = gr.Files(label="Output CSV file", type="filepath", file_types=[".csv"])
interface_x2c = gr.Interface(fn=convert_xlsx_to_csv, # TODO: swap out for correct fn
                             inputs=[input_x2c, merge_x2c],
                             outputs=[output_x2c],
                             title="XLSX-->CSV",
                             description="Converts old version XLSX transcript (with a single Timecode column) to standardized CSV file with rows sorted by segment start time. Optionally merges segments on ellipsis.",
                             live=False,
                             allow_flagging="never",)

# gr components for annotation XLSX (CSV -> XLSX with CPS/TM columns)
input_c2a = gr.Files(label="Input CSV file", type="filepath", file_types=[".csv"])
# Radio choices are (display label, value) pairs; value None means no scheme.
annotation_scheme_c2a = gr.Radio(label="Annotation Scheme", choices=[("CPS","CPS"), ("TalkMove","TM"),("None",None)])

output_c2a = gr.Files(label="Output XLSX file", type="filepath", file_types=[".xlsx"])
interface_c2a = gr.Interface(
    fn=convert_for_annotation, # TODO: swap out for correct fn
    inputs=[input_c2a, annotation_scheme_c2a],
    outputs=[output_c2a],
    title="CSV-->XLSX+annotation",
    description="Converts CSV file to XLSX file for annotation (added columns for CPS or TM or None)",
    live=False,
    allow_flagging="never",
    # submit_btn="Convert"
)

# gr components for deidentification
input_di = gr.Files(label="Input transcript file", type="filepath", file_types=[".xlsx", ".xls",".csv", ".tsv", ".txt"])
who_di = gr.Radio(label="Who to deidentify", choices=[("student","student"), ("all","all")])
output_di = gr.Files(label="Output deidentified transcript file", type="filepath", file_types=[".xlsx", ".xls",".csv", ".tsv", ".txt"])
interface_di = gr.Interface(
    fn=deidentify_transcripts,
    inputs=[input_di, who_di],
    outputs=[output_di],
    title="Deidentify",
    description="Deidentify speaker labels in a transcript. Compatible with .xlsx, .xls, .csv, .tsv, .txt files with a column containing speaker labels. Will not work if speaker column is missing a header. Speaker names or IDs will be replaced with a deidentified label numbered in order of appearance. Choose whether to deidentify just students or all speakers.",
    live=False,
    allow_flagging="never",
)



######## LAUNCH APP ########
# The two lists below are positional pairs: interface N gets tab label N.
demo = gr.TabbedInterface(
    [
        interface_e2c,
        interface_x2e,
        interface_x2c,
        interface_c2a,
        interface_tm,
        interface_di,
        interface_c,
        interface,
        interface_vtr,
        interface_wt
    ],
    [
        "📝→🗒️ ELAN→CSV",
        "❎→📝 XLSX→ELAN",
        "❎→🗒️ XLSX→CSV",
        "🗒️→❎☷ CSV→XLSX+annotation",
        "🗒️→❎💬 transcript→XLSX+TM_dropdown",
        "🗒️→🥷🏻 Deidentify",
        "🎥→📽 Video Converter",
        "🎥✂️ Video Trimmer",
        "🎥🎲 Video Trimmer with Random Start Time",
        "🎥🗒️✂️ Video Trimmer with Transcript"
    ]
)

demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)
utils.py CHANGED
@@ -7,6 +7,11 @@ from pathlib import Path
7
  import sys
8
  import gradio as gr
9
  import pandas as pd
 
 
 
 
 
10
 
11
  os.makedirs(f'{os.getcwd()}/logs', exist_ok=True)
12
  os.makedirs(f'{os.getcwd()}/results', exist_ok=True)
@@ -291,4 +296,624 @@ def convert_and_trim_video(media_in, media_out, start=None, end=None):
291
 
292
  except Exception as e:
293
  print(f"Error converting video format: {e}")
294
- gr.Error(f"Error converting video format: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  import sys
8
  import gradio as gr
9
  import pandas as pd
10
+ from pathlib import Path
11
+ import nltk
12
+ from openpyxl import Workbook
13
+ from openpyxl.utils.dataframe import dataframe_to_rows
14
+ from openpyxl.worksheet.datavalidation import DataValidation
15
 
16
  os.makedirs(f'{os.getcwd()}/logs', exist_ok=True)
17
  os.makedirs(f'{os.getcwd()}/results', exist_ok=True)
 
296
 
297
  except Exception as e:
298
  print(f"Error converting video format: {e}")
299
+ gr.Error(f"Error converting video format: {e}")
300
+
301
+
302
+
303
+
304
+
305
+ ###### TRANSCRIPT UTILS ######
306
+
307
def convert_transcript_for_TM(file_list):
    """Convert transcripts for TalkMoves (TM) annotation.

    Input can be .xlsx or .csv transcript files. Handles either separate
    start and end time columns (in seconds) or a single timecode column;
    output rows carry a single "HH:MM:SS.sss - HH:MM:SS.sss" timestamp
    string. Each turn is sentence-tokenized; sentences that consist entirely
    of a bracketed marker (e.g. "[inaudible]") are optionally dropped, with
    care taken not to erase speaker context switches. Results are written as
    XLSX files with dropdown data validation for Teacher_TM / Student_TM.

    Args:
        file_list: list of paths to .xlsx or .csv transcript files.

    Raises:
        gr.Error: if a file has an unsupported extension, or if the very
            first turn of a file has an empty speaker.

    Returns:
        Tuple (output_filepath_list, trans_log_filepath_list, error_message,
        global_log_filepath): converted XLSX paths, per-file log paths, a
        list of warning/error strings, and the global statistics log path.

    NOTE(review): `re`, `json`, and `sent_tokenize` are used below but do
    not appear in the imports visible at the top of this file -- presumably
    imported elsewhere (e.g. `from nltk.tokenize import sent_tokenize`);
    confirm before relying on this function.
    NOTE(review): an empty `file_list` would raise NameError at the
    `global_log_filepath` construction below (`filepath` unbound).
    """


    # Regular expression pattern for matching speaker names and timecodes.
    # NOTE(review): the [...] here is a character CLASS, not alternation --
    # it matches any single character from the listed words; presumably
    # alternation (?:UI|ui|Inaudible|...) was intended. Confirm.
    bracket_re = re.compile(r'(?:\[[UI|ui|Inaudible|inaudible|overlapping speech|VIDEO SILENCE|teacher explaining in background].*\]\W{0,2})')
    # Regular expression pattern for matching anything enclosed in square brackets.
    all_bracket_re = re.compile(r'(?:\[.*\]\W{0,2})')
    # whether to remove the inaudible markers
    do_remove_inaudible = True
    # whether to keep context switches (empty turn that changes speaker)
    do_keep_context_switch = True
    # whether to convert to timestamp if start and end time are in seconds and in separate columns
    convert_to_timestamp = True

    error_message = [] # List of error messages to be displayed to the user.
    global_stat_dict = {} # Dictionary of global statistics.
    output_filepath_list = [] # List of output file paths.
    trans_log_filepath_list = [] # List of transcription log file paths.
    for file in file_list:
        filename = file.split('/')[-1] # Get the filename from the file.
        filepath = os.path.dirname(file) # Get the file path from the file.
        # Read the file into a Pandas DataFrame depending on its file format.
        if filename.endswith('.xlsx'):
            df = pd.read_excel(file, index_col=0)
            output_filename = f"{filename[:-5]}" + "_TMcoded.xlsx"
        elif filename.endswith('.csv'):
            # NOTE(review): `error_bad_lines` was deprecated in pandas 1.3
            # and removed in 2.0 (use on_bad_lines='skip'); confirm the
            # pinned pandas version supports it.
            df = pd.read_csv(file, index_col=0, error_bad_lines=False)
            output_filename = f"{filename[:-4]}" + "_TMcoded.xlsx"

        else:
            raise gr.Error(f"{file} format is wrong")

        # Remove the "Copy of" prefix from the output filename, if present.
        if output_filename.startswith("Copy of "):
            output_filename = output_filename[8:]

        # Remove the word "_Transcript" from the output filename, if present.
        if '_Transcript' in output_filename:
            # print("before: "+output_filename)
            error_message.append("before: "+output_filename)
            output_filename = ''.join(output_filename.split('_Transcript'))
            # print("after: "+output_filename)
            error_message.append("after: "+output_filename)

        # Construct the output file and transcription log file paths.
        output_filepath = os.path.join(filepath, output_filename)
        trans_log_filepath = os.path.join(filepath, f"{output_filename}"+ ".log")

        # Open the transcription log file for writing.
        with open(trans_log_filepath, "w") as outfile:
            # Per-file counters for the statistics block written at the end.
            sub_cnt_in_file = 0
            empty_speaker_cnt_in_file = 0
            turn_skipped_in_file = 0
            turn_skipped_speaker_switch_in_file = 0
            snt_mark_skip_in_file = 0
            snt_skipped_in_file = 0
            chat_flag_in_speaker_time_line = 0
            chat_flag_in_content_line = 0
            all_inaudible_in_file = 0
            all_bracket_in_file = 0
            all_snts_in_file = 0
            all_token_cnt_in_file = 0
            # Example source row layout:
            #index Timecode Duration Speaker Dialogue Annotations Error Type
            #1 00:00:05:04 - 00:00:07:12 00:00:02:08 Tutor Did you... How was your Halloween?
            turns = []
            time_stamps = []
            speakers = []
            chat_flags = []  # NOTE(review): collected nowhere below; appears unused.
            sentences = []
            snt_ids = []

            ## parse the df flexibly: find key column names which might vary depending on transcript source
            # set all column names to lowercase
            df.columns = map(str.lower, df.columns)
            # several possibilities for column names, detect which are present
            uttID_keys = ['utt','seg','utt_id','seg_id','index']
            speaker_keys = ['speaker']
            start_keys=['start_sec','start','start_time','timestart']
            end_keys=['end_sec','end','end_time','timeend']
            timestamp_keys = ['timecode','timestamp']
            content_keys=['dialogue','utterance','transcript','text']
            # detect which is used in this df
            uttID_key = next((key for key in uttID_keys if key in df.columns), None)
            speaker_key = next((key for key in speaker_keys if key in df.columns), None)
            content_key = next((key for key in content_keys if key in df.columns), None)
            # check if separate start and end times are present, otherwise assume single timecode column
            if any(df.columns.isin(start_keys)):
                start_key = next((key for key in start_keys if key in df.columns), None)
                end_key = next((key for key in end_keys if key in df.columns), None)
                time_format = 'seconds'
                if convert_to_timestamp:
                    # convert to timestamp format HH:MM:SS.sss - HH:MM:SS.sss
                    df['timecode'] = df.apply(lambda x: f"{sec_to_HHMMSS(x[start_key])} - {sec_to_HHMMSS(x[end_key])}", axis=1)
                    timestamp_key='timecode'
                    time_format = 'timestamp'
            else:
                timestamp_key=next((key for key in timestamp_keys if key in df.columns), None)
                time_format = 'timestamp'
            # Turn started with 1, the same as molly's transcripts
            for i, row in df.iterrows():
                turn = row[uttID_key] if uttID_key else i+1
                speaker = row[speaker_key]
                time_str = row[timestamp_key]
                content = "" if pd.isna(row[content_key]) else row[content_key].strip("\n")
                # when speaker is empty, use the previous speaker
                if speaker == "":
                    if speakers:
                        speaker = speakers[-1]
                        empty_speaker_cnt_in_file += 1
                        outfile.write(f"{turn}: found empty speaker, use the speaker in previous turn: {speaker}\n")
                    else:
                        raise gr.Error(f"{row}, the first turn is empty speaker")

                # clean after the sentence tokenize
                snts = sent_tokenize(content)
                all_snts_in_file += len(snts)
                snt_skipped_in_turn = 0
                # NOTE(review): this inner `i` shadows the row index above;
                # harmless today but fragile.
                for i, snt in enumerate(snts):
                    remove_flag = False
                    inaudible_search = re.findall(bracket_re, snt)
                    if inaudible_search:
                        all_inaudible_in_file += len(inaudible_search)
                        outfile.write(f"{turn}, {inaudible_search}, inaudible found in snt: {snt}\n")

                    all_bracket_search = re.findall(all_bracket_re, snt)
                    if all_bracket_search:
                        all_bracket_in_file += len(all_bracket_search)
                        outfile.write(f"{turn}, {all_bracket_search} bracket found in snt: {snt}\n")

                    # only remove the [inaudible xxx] when it is the whole sentence.
                    inaudible_match = re.fullmatch(bracket_re, snt)

                    if inaudible_match:
                        if do_keep_context_switch:
                            # if keeping context switches
                            if speakers and speaker == speakers[-1]:
                                # share the same speaker, no context switching, just remove it
                                remove_flag = True
                            else:
                                # different speakers, it is the context switching.
                                if len(snts) == 1:
                                    # current empty sentence is the only single sentence
                                    remove_flag = False
                                else:
                                    if i != len(snts)-1:
                                        # current empty utterance is not the last one, just delete it
                                        remove_flag = True
                                    else:
                                        # current empty utterance is the last one, keep it.
                                        if snt_skipped_in_turn == len(snts)-1:
                                            # all previous snts are empty, then keep this to not skip the whole turn
                                            remove_flag = False
                                        else:
                                            remove_flag = True
                        else:
                            # if not keeping context switches, then simply remove all empty utterances
                            remove_flag = True

                    # If remove_flag is true:
                    if remove_flag:
                        # Increment sub_cnt_in_file and snt_mark_skip_in_file
                        sub_cnt_in_file += 1
                        snt_mark_skip_in_file += 1
                        # Write the following message to outfile:
                        outfile.write(f"{turn}, sub happend: {snt}, skip this sentence\n")
                        # If do_remove_inaudible is true:
                        if do_remove_inaudible:
                            snt_skipped_in_file += 1
                            snt_skipped_in_turn += 1
                            continue

                    # Add to pd:
                    # Append turn to turns list
                    turns.append(turn)
                    # Set snt_id to the string f"{turn}.{i}"
                    snt_id = f"{turn}.{i}"
                    # Append time_str to time_stamps list
                    time_stamps.append(time_str)
                    # Append speaker to speakers list
                    speakers.append(speaker)
                    # Set sentence to the string representation of snt, with whitespace removed from the start and end
                    sentence = str(snt).strip().rstrip("\n")
                    # Calculate the number of tokens in sentence and add to all_token_cnt_in_file
                    token_cnt = len(nltk.word_tokenize(sentence))
                    all_token_cnt_in_file += token_cnt
                    # Append snt_id to snt_ids list
                    snt_ids.append(snt_id)
                    # Append sentence to sentences list
                    sentences.append(sentence)

                if snt_skipped_in_turn == len(snts):
                    # all snts in turn are skipped, then skip the turn
                    turn_skipped_in_file += 1
                    if (speakers and speaker != speakers[-1]) or not speakers:
                        turn_skipped_speaker_switch_in_file += 1
                    outfile.write(f"{turn}, since all snts are empty, skip this whole turn {content}\n")
            # Create a new DataFrame with the following columns:
            new_df = pd.DataFrame({
                "Sentence_ID": snt_ids, # A
                "TimeStamp": time_stamps, #B
                "Turn" : turns, #C
                "Speaker" : speakers, #D
                "Sentence" : sentences #E
            })

            # assert turn_skipped_speaker_switch_in_file==0, "Some speaker switch turn skipped"
            new_df["Teacher_TM"] = None #F
            new_df["Student_TM"] = None #G

            # write new_df to xlsx file
            # NOTE(review): this write is immediately overwritten by the
            # openpyxl wb.save below; presumably redundant -- confirm.
            new_df.to_excel(output_filepath, index=False)


            # https://openpyxl.readthedocs.io/en/latest/api/openpyxl.utils.dataframe.html#openpyxl.utils.dataframe.dataframe_to_rows
            # Rebuild the workbook with list-type data validation (dropdowns)
            # on the Teacher_TM (col F) and Student_TM (col G) columns.
            wb = Workbook()
            ws = wb.active
            teacher_dv = DataValidation(type="list", formula1='",1-None,2-Keep-Together,3-Getting-Student-to-Relate,4-Restating,5-Revoicing,6-Context,7-Press-for-Accuracy,8-Press-for-Reasoning"', allow_blank=True)
            student_dv = DataValidation(type="list", formula1='",1-None,2-Relate-to-Another-Student,3-Asking-for-More-info,4-Making-a-Claim,5-Providing-Evidence/Reasoning"', allow_blank=True)
            ws.add_data_validation(teacher_dv)
            ws.add_data_validation(student_dv)
            teacher_dv.add('F2:F1048576')
            student_dv.add('G2:G1048576')
            for r in dataframe_to_rows(new_df, index=False, header=True):
                ws.append(r)
            wb.save(output_filepath)

            stat_dict = {
                "chat_flag_in_speaker_time_line": chat_flag_in_speaker_time_line,
                "chat_flag_in_content_line": chat_flag_in_content_line,
                "empty_speaker_cnt_in_file": empty_speaker_cnt_in_file,
                "ori_total_turn": df.shape[0],
                "ori_total_snt": all_snts_in_file,
                "turn_skipped": turn_skipped_in_file,
                "turn_skipped_speaker_switch_in_file": turn_skipped_speaker_switch_in_file,
                "snt_skipped": snt_skipped_in_file,
                "remaining_snt": all_snts_in_file - snt_skipped_in_file,
                "all_token_cnt_in_file": all_token_cnt_in_file,
                # NOTE(review): ZeroDivisionError if every sentence in the
                # file was skipped -- confirm inputs make that impossible.
                "avg_token_cnt_per_snt": all_token_cnt_in_file/(all_snts_in_file - snt_skipped_in_file),
                "sub_cnt_in_file": sub_cnt_in_file,
                "all_inaudible_in_file": all_inaudible_in_file,
                "all_bracket_in_file": all_bracket_in_file,
                "other_bracket_in_file": all_bracket_in_file - all_inaudible_in_file
            }
            if all_inaudible_in_file != all_bracket_in_file:
                # print(f"(unknown) has special brakets")
                error_message.append(f"Warning: (unknown) has special brakets")
            for k, v in stat_dict.items():
                global_stat_dict[k] = global_stat_dict.get(k,0) + v
            outfile.write(f"{output_filepath}, {json.dumps(stat_dict, indent=4)}")

        output_filepath_list.append(output_filepath)
        trans_log_filepath_list.append(trans_log_filepath)

    # Averages were summed per-file above; divide by file count here.
    for k, v in global_stat_dict.items():
        if "avg" in k:
            global_stat_dict[k] = global_stat_dict[k]/len(file_list)
    # `filepath` is the directory of the LAST processed file.
    global_log_filepath = os.path.join(filepath, "global_transfer"+ ".log")
    with open(global_log_filepath, "w") as outfile:
        outfile.write(f"global_stat_dict: {json.dumps(global_stat_dict, indent=4)}")

    # error_check
    if global_stat_dict["all_inaudible_in_file"] != global_stat_dict["all_bracket_in_file"]:
        error_message.append("Error: 'all_inaudible_in_file' does not match 'all_bracket_in_file'")
    if global_stat_dict["other_bracket_in_file"] != 0:
        error_message.append("Error: 'other_bracket_in_file' is not zero")

    return output_filepath_list, trans_log_filepath_list, error_message, global_log_filepath
589
+
590
+
591
+
592
def add_CPS_columns(df):
    """Append the empty CPS annotation columns (plus Notes) to df, in order.

    Mutates and returns df with each new column initialized to ''.
    """
    annotation_columns = (
        'Observation', 'Instructions',
        'CONST_SharesU_Situation', 'CONST_SharesU_CorrectSolutions',
        'CONST_SharesU_IncorrectSolutions', 'CONST_EstablishesCG_Confirms',
        'CONST_EstablishesCG_Interrupts', 'NEG_Responds_Reasons',
        'NEG_Responds_QuestionsOthers', 'NEG_Responds_Responds',
        'MAINTAIN_Initiative_Criticizes', 'NEG_MonitorsE_Results',
        'NEG_MonitorsE_GivingUp', 'NEG_MonitorsE_Strategizes',
        'NEG_MonitorsE_Save', 'MAINTAIN_Initiative_Suggestions',
        'MAINTAIN_Initiative_Compliments', 'MAINTAIN_FulfillsR_InitiatesOffTopic',
        'MAINTAIN_FulfillsR_JoinsOffTopic', 'MAINTAIN_FulfillsR_Support',
        'MAINTAIN_FulfillsR_Apologizes', 'Notes',
    )
    for column_name in annotation_columns:
        df[column_name] = ''
    return df
599
+
600
def add_TM_columns(df):
    """Append empty Teacher_TM and Student_TM annotation columns to df.

    Mutates and returns df with both new columns initialized to ''.
    """
    for column_name in ('Teacher_TM', 'Student_TM'):
        df[column_name] = ''
    return df
606
+
607
+
608
+
609
def convert_transcript_for_annotation(file, annotation_scheme=None):
    """Convert a transcript for annotation.

    Input: a standard CSV transcript file (parsed via parse_label_csv).
    Output XLSX has separate TimeStart/TimeEnd timestamps in HH:MM:SS.sss
    format; a recordingID column inferred from the transcript filename; and,
    depending on `annotation_scheme` ('CPS', 'TM', or None), extra empty
    columns for CPS or TalkMoves annotators.

    Returns the output XLSX path; raises gr.Error on any failure.

    NOTE(review): the literal "(unknown)" in the output filenames below
    looks like a redacted placeholder -- presumably the original code
    interpolated the transcript filename there; confirm against the repo.
    """
    filename,ext = os.path.splitext(os.path.basename(file)) # Get the filename from the file.
    filepath = os.path.dirname(file) # Get the file path from the file.
    # Read the file into a Pandas DataFrame depending on its file format.
    try:
        table = parse_label_csv(file)
        media_filename = get_sessname_from_filename(filename)
        out_df=table.copy()
        out_df['recordingID']=media_filename
        # Convert second-based times to HH:MM:SS.sss timestamp strings.
        out_df['TimeStart']=out_df['start_sec'].apply(sec_to_HHMMSS)
        out_df['TimeEnd']=out_df['end_sec'].apply(sec_to_HHMMSS)
        out_df=out_df[['speaker','TimeStart','TimeEnd','utterance','recordingID','uttID']]
        if annotation_scheme=='CPS':
            out_df=add_CPS_columns(out_df)
            output_file = os.path.join(filepath, f"CPS_(unknown).xlsx")
            out_df.to_excel(output_file, index=False)
        elif annotation_scheme=='TM':
            out_df=add_TM_columns(out_df)
            output_file = os.path.join(filepath, f"TM_(unknown).xlsx")
            out_df.to_excel(output_file, index=False)
        else:
            # No annotation scheme: write the standardized columns only.
            output_file = os.path.join(filepath, f"(unknown).xlsx")
            out_df.to_excel(output_file, index=False)
        return output_file
    except Exception as e:
        # Surface any failure to the Gradio UI as an error dialog.
        raise gr.Error(f"(unknown): error {e}")
641
+
642
+
643
def sec_to_HHMMSS(seconds):
    """Format a duration in seconds as an 'HH:MM:SS.sss' timestamp string.

    Args:
        seconds: duration in seconds (int, float, or numeric string).

    Returns:
        Timestamp string 'HH:MM:SS.sss'; hours are zero-padded to two
        digits but not capped (e.g. 100 hours formats as '100:00:00.000').
    """
    # BUGFIX: round to millisecond precision BEFORE splitting into fields,
    # so values such as 59.9999 roll over to "00:01:00.000" instead of the
    # previous invalid "00:00:60.000".
    total_ms = round(float(seconds) * 1000)
    s, ms = divmod(total_ms, 1000)
    m, s = divmod(s, 60)
    h, m = divmod(m, 60)
    return f"{h:02d}:{m:02d}:{s:02d}.{ms:03d}"
651
+
652
+
653
def readELANtsv(file, fmt=None):
    """Read an ELAN tab-separated export into a unified labels DataFrame.

    ELAN exports arrive in many column layouts (4, 5, 6, 9, 10 or 12
    columns); the layout is inferred per-row from the column count.

    Args:
        file: path to the tab-separated ELAN export.
        fmt: optional format hint; 'AUG23' marks the 5-column variant whose
            blank column is first rather than second.

    Returns:
        pd.DataFrame with columns ('seg', 'speaker', 'utterance',
        'start_sec', 'end_sec'), sorted by start time, 'seg' being the
        row order after sorting.

    Raises:
        ValueError: on a row with an unrecognised number of columns.
    """
    with open(file) as in_file:

        reader = csv.reader(in_file, delimiter="\t")

        # Count leading header rows: real ELAN data rows have >= 4 columns.
        skiprows = 0
        row = next(reader)

        while not len(row) >= 4:  # 4 being the min number of cols ELAN exports have
            skiprows += 1
            row = next(reader)

        # BUGFIX: seek() takes a BYTE offset, not a row count, so the old
        # in_file.seek(skiprows) landed mid-line whenever header rows were
        # present. Rewind to the start of the file and skip the detected
        # header rows by reading them through the csv reader instead.
        in_file.seek(0)

        if skiprows > 0:
            print(f'Detected {skiprows} header rows to skip')
        reader = csv.reader(in_file, delimiter="\t")
        for _ in range(skiprows):
            next(reader)

        labels = []  # transcript with speaker labels and timestamp in sec

        for i, utt in enumerate(reader):
            if not ''.join(utt).strip():  # skip blank lines
                continue
            try:
                if len(utt) == 5:  # IF data comes straight from ELAN sometimes there is a superfluous blank column 2
                    if i == 0:
                        print('detected extra blank column in first row, will remove')
                    if fmt == 'AUG23':
                        if i == 0:
                            print('detected extra blank 1st column, will remove')
                        _, speaker, start_HHMMSS, end_HHMMSS, utterance = utt
                        convert_timestamps = True
                    else:
                        if i == 0:
                            print('detected extra blank 2nd column, will remove')
                        speaker, _, start_HHMMSS, end_HHMMSS, utterance = utt
                        convert_timestamps = True
                elif len(utt) == 4:  # sometimes the blank col is already removed
                    if i == 0:
                        print('detected 4 columns, assuming: speaker,start_HHMMSS, end_HHMMSS, utterance ')
                    speaker, start_HHMMSS, end_HHMMSS, utterance = utt
                    convert_timestamps = True
                elif len(utt) == 6:  # New one from 2023 Aug has a redundant extra start col!?
                    if i == 0:
                        print('detected 6 columns, assuming: _,speaker,start_HHMMSS, end_HHMMSS, utterance,_ ')
                    _, speaker, start_HHMMSS, end_HHMMSS, utterance, _ = utt
                    convert_timestamps = True
                elif len(utt) == 9:  # 2023 transcribers tend to give full elan output
                    if i == 0:
                        print('detected 9 columns, assuming: speaker,_,start_HHMMSS,_,end_HHMMSS,_,_,_,utterance ')
                    speaker, _, start_HHMMSS, _, end_HHMMSS, _, _, _, utterance = utt
                    convert_timestamps = True
                elif len(utt) == 10:  # sometimes an extra blank column appears at the end
                    if i == 0:
                        print('detected 10 columns, assuming: speaker,_,start_HHMMSS,_,end_HHMMSS,_,_,_,utterance,_ ')
                    speaker, _, start_HHMMSS, _, end_HHMMSS, _, _, _, utterance, _ = utt
                    convert_timestamps = True
                elif len(utt) == 12:  # WOw how many redundant columns can ELAN make...
                    if i == 0:
                        print('detected 12 columns, assuming: speaker,_,start_HHMMSS,_,_,end_HHMMSS,_,_,_,_,_,utterance ')
                    speaker, _, start_HHMMSS, _, _, end_HHMMSS, _, _, _, _, _, utterance = utt
                    convert_timestamps = True

                else:
                    raise ValueError(f'Unknown transcript format with {len(utt)} columns for {file}')
            except BaseException as err:
                print(f'!!! transcript parse error on line {i} for {file}')
                print(utt)
                raise err
            if convert_timestamps:
                # every recognised layout carries HH:MM:SS timestamps
                start_sec = HHMMSS_to_sec(start_HHMMSS)
                end_sec = HHMMSS_to_sec(end_HHMMSS)

            labels.append((speaker, utterance, start_sec, end_sec))
    labels = pd.DataFrame(labels, columns=('speaker', 'utterance', 'start_sec', 'end_sec'))
    labels.sort_values(by='start_sec', inplace=True, ignore_index=True)
    labels.reset_index(inplace=True)
    labels = labels.rename(columns={'index': 'seg'})

    return labels
734
+
735
+
736
def merge_ellipsis(seg_labels):
    """Merge consecutive same-speaker segments whose boundary is marked '...'.

    Transcribers split long utterances into segments, ending the first part
    with '...' and starting the continuation with '...'. This joins such runs
    back into single utterances, keeping the first start and last end time.

    Args:
        seg_labels: path to a csv/tsv/txt file or a pd.DataFrame with columns
            [optional seg index,] speaker, utterance, start_sec, end_sec.

    Returns:
        pd.DataFrame with columns 'utt' (new utterance index), 'seg' (list of
        original segment indices merged into each utterance), plus speaker,
        utterance and timing columns. Runs of dots in the utterance text are
        replaced by a single space.
    """
    # merge utterances with ellipsis
    # input is seg_labels format: [optional index] speaker, utterance, start_sec, end_sec
    if isinstance(seg_labels,str) and seg_labels.endswith(('.csv','.tsv','.txt')):
        df=pd.read_csv(seg_labels)
    elif isinstance(seg_labels, pd.DataFrame):
        df=seg_labels
    else:
        raise ValueError('input seg_labels should be path to csv or pd.DataFrame')

    if len(df.columns)==4:
        # no seg index yet
        df.reset_index(inplace=True)
        df = df.rename(columns = {'index':'seg'})
    elif len(df.columns)==5:
        # first col is seg
        df.columns = ['seg','speaker','utterance','start_sec','end_sec']
    else:
        raise ValueError('input seg_labels should have 4 or 5 columns')
    df2=[]
    # NOTE(review): prev_spk/prev_utt/prev_start/prev_end are never read;
    # the running state lives entirely in merge_utt and segs.
    prev_spk=None
    prev_utt=""
    prev_start=0
    prev_end=0
    segs=[0]
    merge_utt={"seg":None, "speaker":None,"utterance":None,"start_sec":None, "end_sec":None}
    for i,row in df.iterrows():
        if i==0:
            # first row seeds the pending merged utterance
            merge_utt=row

        else:
            # if same speaker as last and ellipsis
            if merge_utt["speaker"]==row["speaker"] and str(merge_utt["utterance"]).endswith('...') and str(row["utterance"]).startswith('...'):
                # append current to temporary merged utt: use prev_ items

                merge_utt["utterance"]+=str(row["utterance"])
                merge_utt["end_sec"]=row["end_sec"]
                segs.append(row["seg"])
            else:
                # run ended: flush the pending utterance and start a new one
                # append merge_utt to df2
                merge_utt["seg"]=segs
                df2.append(merge_utt)
                # clear merge_utt and set to current
                merge_utt=row
                segs=[merge_utt["seg"]]

    merge_utt["seg"]=segs
    # if not isinstance(merge_utt["seg"],list):
    #     merge_utt["seg"]=list(segs)
    df2.append(merge_utt) # catch final merge_utt if not terminated

    df2=pd.DataFrame(df2)
    # collapse any run of dots ('...', '......') left in the joined text
    df2['utterance']=df2['utterance'].str.replace('\.+',' ', regex=True)

    # clear up "......"
    # enumerate utterances
    df2.reset_index(inplace=True,drop=True)
    df2 = df2.reset_index().rename(columns = {'index':'utt'})
    return df2
795
+
796
+
797
+
798
def add_dummy_seg_column(table):
    """Ensure *table* has 'utt' and 'seg' columns in the unified label format.

    Label files produced by merge_ellipsis() carry an 'utt' utterance index
    and a 'seg' column listing the original segments behind each utterance;
    files from other sources may lack one or both. This fills in a trivial
    one-segment-per-utterance 'seg' column so every label file shares the
    exact same schema.

    Args:
        table: pd.DataFrame with at least speaker/start_sec/end_sec/utterance
            columns, optionally 'uttID', 'utt' or 'seg'.

    Returns:
        pd.DataFrame with columns ['utt','seg','speaker','start_sec',
        'end_sec','utterance']; returned unchanged if 'seg' already exists.
    """
    existing = table.columns.tolist()
    if 'seg' in existing:
        print('\'seg\' column already exists, not changing anything')
        return table
    if 'uttID' in existing:
        table = table.rename(columns={"uttID": "utt"})
    if 'utt' not in table.columns.tolist():
        table['utt'] = table.index
    # each utterance maps to a singleton list of its own id
    table['seg'] = [[utt_id] for utt_id in table['utt']]
    table = table[['utt', 'seg', 'speaker', 'start_sec', 'end_sec', 'utterance']]

    return table
817
+
818
+
819
def get_sessname_from_filename(filename):
    """Derive a session name from a transcript/label file path.

    Strips the extension, then removes every known transcript-pipeline
    prefix/suffix decoration (case-insensitively, longest variants first).

    Args:
        filename: path or bare filename of a transcript/label file.

    Returns:
        The bare session name string.
    """
    # order matters: longer compound decorations must be stripped before
    # their shorter substrings (e.g. before plain 'reworked_')
    decorations = (
        'reworked-transcript-diarized-timestamped-',
        'reworked_transcript-diarized-timestamped-',
        'reworked-diarized-timestamped-',
        'reworked_timestamped_',
        'reworked_',
        'reworked-',
        'transcript_diarized_timestamped_',
        'transcript-diarized-timestamped_',
        'transcript-diarized-timestamped-',
        '_transcript',
        '_tmcoded',
        'utt_labels_',
        'seg_labels_',
        '_redacted',
    )
    sessname = Path(filename).stem
    for pattern in decorations:
        sessname = re.sub(pattern, '', sessname, flags=re.I)
    return sessname
836
+
837
+
838
def ELAN_to_labels_csv(ELANfile, merge_segments = True):
    """Convert an ELAN tsv export into a unified labels csv.

    Args:
        ELANfile: path to the ELAN tab-separated export.
        merge_segments: when True, join ellipsis-split segments into whole
            utterances via merge_ellipsis() and save as utt_labels_<sess>.csv;
            otherwise save the raw segments as seg_labels_<sess>.csv.

    Returns:
        The name of the csv file written (in the current directory).
    """
    # dumb but effective string wrangling to get sess name
    session = get_sessname_from_filename(ELANfile)

    # reads ELAN output to pd.DataFrame in a unified format
    table = readELANtsv(ELANfile)

    if merge_segments:
        # merge segments to form utterances where there have been splits separated by '...'
        table = merge_ellipsis(table)
        prefix = 'utt_labels_'
    else:
        prefix = 'seg_labels_'

    save_file = f'{prefix}{session}.csv'
    table.to_csv(save_file, index=False, float_format='%.3f')
    return save_file
854
+
855
+
856
def parse_label_csv(label_csv:str):
    """Parse a diarized, timestamped transcript csv into the unified table.

    Label csvs come in several variants: with or without a header row, and
    with 4-6 columns (optionally carrying 'seg' and/or 'utt' index columns).
    A header is assumed when the first row contains no purely numeric cell.

    Args:
        label_csv: path to the label csv file.

    Returns:
        pd.DataFrame with columns [uttID, speaker, start_sec, end_sec,
        utterance], or None when a headerless file's column count is
        unrecognised.
    """
    table = pd.read_csv(label_csv, keep_default_na=False, header=None)
    first_row = table.iloc[0]

    # header row <=> no cell in the first row looks like a number
    has_header = not any(str(cell).replace('.', '').isdigit() for cell in first_row)
    if has_header:
        table.columns = first_row.tolist()
        table = table.iloc[1:].reset_index(drop=True)
    else:
        ncols = len(table.columns)
        if ncols == 4:
            print('no header detected, assuming annotation file has columns [speaker,utterance,start_sec, end_sec] ')
            table.columns = ['speaker','utterance','start_sec', 'end_sec']
        elif ncols == 5:
            print('no header detected, assuming annotation file has columns [seg,speaker,utterance,start_sec, end_sec] ')
            table.columns = ['seg','speaker','utterance','start_sec', 'end_sec']
        elif ncols == 6:
            print('no header detected, assuming annotation file has columns [utt,seg,speaker,utterance,start_sec, end_sec] ')
            table.columns = ['utt','seg','speaker','utterance','start_sec', 'end_sec']
        else:
            print(f'no header detected, csv has {ncols} columns, could not determine column names.')
            return None

    # choose which column to use for uttID: prefer 'utt', then 'seg',
    # otherwise fall back to the row index
    names = table.columns.tolist()
    if 'utt' in names:
        table = table.rename(columns={"utt": "uttID"}).drop('seg', axis=1)
    elif 'seg' in names:
        table = table.rename(columns={"seg": "uttID"})
    else:
        table = table.reset_index().rename(columns={"index": "uttID"})

    return table[['uttID', 'speaker', 'start_sec', 'end_sec', 'utterance']]
894
+
895
+
896
def deidentify_speaker(df, who='all'):
    """Replace speaker IDs with generic labels, in order of appearance.

    All speakers become speaker_1, speaker_2, ...; with who='student' only
    student names (containing 'student' or matching the numeric 00-0000 ID
    pattern) are replaced, becoming student_1, student_2, ... The speaker
    column is located among the common names speaker/Speaker/speaker_id/
    Speaker_ID and modified in place.

    Args:
        df: pd.DataFrame holding a speaker column.
        who (str, optional): 'all','student'. Which names to replace. Defaults to 'all'.

    Returns:
        The same DataFrame with its speaker column de-identified.

    Raises:
        ValueError: if no recognised speaker column is present.
    """
    speaker_key = None
    for candidate in ['speaker', 'Speaker', 'speaker_id', 'Speaker_ID']:
        if candidate in df.columns.tolist():
            speaker_key = candidate
            break
    if not speaker_key:
        raise ValueError('No speaker column found in dataframe!')

    speakers = df[speaker_key].unique()
    if who == 'student':
        # detect student. ID format can be student_xxx or 00-0000 numeric
        speakers = [s for s in speakers
                    if 'student' in s.lower() or re.match(r'^\d{2}-\d{4}$', s)]
        mapping = {name: f'student_{n + 1}' for n, name in enumerate(speakers)}
    else:
        mapping = {name: f'speaker_{n + 1}' for n, name in enumerate(speakers)}

    df[speaker_key] = df[speaker_key].replace(mapping)
    return df