Spaces:

levicu
/

transcriber_tools

Sleeping

App Files Files Community

rosyvs commited on Apr 21, 2025

Commit

ff6eb07

1 Parent(s): 9bb8ff3

Add transcript processing application and utility functions from file_convertor, not yet integrated into app.

Browse files

Files changed (7) hide show

.gitignore +7 -2
Dockerfile +21 -3
README.md +11 -0
requirements.txt +2 -0
setup.py +6 -0
transcript_app.py +225 -0
transcript_utils.py +745 -0

.gitignore CHANGED Viewed

@@ -1,5 +1,10 @@
 .DS_Store
 __pycache__/
-flagged/
 results*/
-logs/

 .DS_Store
 __pycache__/
 results*/
+logs/
+*.xlsx
+*.log
+*.csv
+*.xls
+flagged/
+test.py

Dockerfile CHANGED Viewed

@@ -8,12 +8,28 @@ WORKDIR /app
 COPY requirements.txt .
 # Install Python dependencies without storing cache, for a smaller image
-RUN pip install --no-cache-dir -r requirements.txt
 # Update package lists and install FFmpeg for media processing
 RUN apt-get update && apt-get install -y ffmpeg
-# Set an environment variable for Matplotlib to store its configuration in /tmp
 ENV MPLCONFIGDIR /tmp/matplotlib
 # Create and set permissions for result directories and logs inside the container
@@ -25,5 +41,7 @@ done
 # Copy all Python files from the current directory to the container
 COPY *.py .
 # Specify the command to run on container start
-CMD ["python", "app.py"]

 COPY requirements.txt .
 # Install Python dependencies without storing cache, for a smaller image
+RUN pip install --no-cache-dir --upgrade -r requirements.txt
 # Update package lists and install FFmpeg for media processing
 RUN apt-get update && apt-get install -y ffmpeg
+RUN useradd -m -u 1000 user
+# Switch to root user to change directory ownership
+USER root
+RUN mkdir -p /usr/share/nltk_data && chown -R user:user /usr/share/nltk_data
+USER user
+ENV HOME=/home/user \
+	PATH=/home/user/.local/bin:$PATH
+# Set the working directory to the user's home directory
+WORKDIR $HOME/app
+# Copy the current directory contents into the container at $HOME/app setting the owner to the user
+COPY --chown=user . $HOME/app
+# Set environment variables for NLTK data and Matplotlib configuration
+ENV NLTK_DATA /usr/share/nltk_data
 ENV MPLCONFIGDIR /tmp/matplotlib
 # Create and set permissions for result directories and logs inside the container
 # Copy all Python files from the current directory to the container
 COPY *.py .
+RUN python setup.py
 # Specify the command to run on container start
+CMD ["python", "app.py"]

README.md CHANGED Viewed

@@ -8,4 +8,15 @@ pinned: false
 license: mit
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 license: mit
 ---
+Various tools for transcribers.
+converting media files
+converting transcription files
+- XLSX-->XLSX+TM: from xlsx to xlsx with TM annotation labels
+- XLSX-->ELAN: from xlsx to ELAN-compatible TSV
+- ELAN-->CSV: from ELAN output tsv to standardized transcript csv format (seg_labels)
+  - supports merging adjacent segments from the same speaker to reconstitute utterances
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

requirements.txt CHANGED Viewed

@@ -3,3 +3,5 @@ moviepy==1.0.3
 pandas==2.2.3
 xlrd==1.2.0
 numpy==2.2.5

 pandas==2.2.3
 xlrd==1.2.0
 numpy==2.2.5
+nltk==3.5
+openpyxl==3.0.10

setup.py ADDED Viewed

	@@ -0,0 +1,6 @@

+# Helper script (the Dockerfile runs it with `python setup.py`): downloads the
+# NLTK 'punkt' tokenizer data into a fixed directory.
+import nltk
+import os
+# expanduser leaves this path unchanged because it does not start with '~'.
+download_dir = os.path.expanduser('/usr/share/nltk_data/')
+# Create the target directory if it is not there yet.
+os.makedirs(name=download_dir, exist_ok=True)
+nltk.download('punkt', download_dir=download_dir)
+# Echo the directory that was used.
+print(download_dir)

transcript_app.py ADDED Viewed

	@@ -0,0 +1,225 @@

+import threading
+import os
+import time
+import pandas as pd
+import gradio as gr
+from utils import (HHMMSS_to_sec, molly_old_xlsx_to_table, convert_transcript_for_TM, convert_transcript_for_annotation,
+                   table_to_ELAN_tsv, ELAN_to_labels_csv, old_xlsx_to_table, old_xlsx_to_labels_csv, deidentify_speaker)
+def delete_files(output_filepath_list, trans_log_filepath_list, global_log_filepath):
+    for output_filepath in output_filepath_list:
+        try:
+            os.remove(output_filepath)
+        except FileNotFoundError:
+            pass
+    for trans_log_filepath in trans_log_filepath_list:
+        try:
+            os.remove(trans_log_filepath)
+        except FileNotFoundError:
+            pass
+    try:
+        os.remove(global_log_filepath)
+    except FileNotFoundError:
+        pass
+    print("Files deleted")
+def delete_files_thread(output_filepath_list, trans_log_filepath_list, global_log_filepath):
+    print("Thread started")
+    time.sleep(20)
+    delete_files(output_filepath_list, trans_log_filepath_list, global_log_filepath)
+def convert_xlsx_to_TMxlsx(input_file_list):
+    file_list = [file.name for file in input_file_list]
+    output_filepath_list, trans_log_filepath_list, error_check, global_transfer_log_path = convert_transcript_for_TM(file_list=file_list)
+    if not error_check:
+        error_check = "No errors found."
+    delete_thread = threading.Thread(target=delete_files_thread, args=(output_filepath_list, trans_log_filepath_list, global_transfer_log_path))
+    delete_thread.start()
+    return output_filepath_list, trans_log_filepath_list, global_transfer_log_path, error_check
+def convert_for_annotation(input_file_list, annotation_scheme):
+    output_files=[]
+    for input_transcript in input_file_list:
+        print("start converting transcript")
+        output_file = convert_transcript_for_annotation(file=input_transcript, annotation_scheme=annotation_scheme)
+        print("finished converting transcript to xlsx for annotation")
+        output_files.append(output_file)
+    return output_files
+def convert_xlsx_to_ELANtsv(input_file_list):
+    output_files=[]
+    for input_transcript in input_file_list:
+        # convert transcript
+        print("start converting transcript")
+        table = old_xlsx_to_table(xl_file=input_transcript)
+        print("finished converting transcript to table")
+        output_transcript = input_transcript.replace('.xlsx', '.tsv')
+        output_file = table_to_ELAN_tsv(table, output_transcript)
+        print("saved table to tsv")
+        output_files.append(output_file)
+    return output_files
+#TODO: support sort and merge for XLSX output if this is needed
+def convert_ELANtsv_to_CSV(input_file_list, merge_ellipsis=False):
+    output_files=[]
+    for input_transcript in input_file_list:
+        # convert transcript
+        print("start converting transcript")
+        output_transcript = input_transcript.replace('.tsv', '.csv')
+        output_file = ELAN_to_labels_csv(input_transcript, merge_segments = merge_ellipsis)
+        print("finish converting transcript")
+        output_files.append(output_file)
+    return output_files
+# TODO: XLSX to csv (seg_labels or utt_labels)
+def convert_xlsx_to_csv(input_file_list, merge_ellipsis=False):
+    output_files=[]
+    for input_transcript in input_file_list:
+        # read xl file to table
+        # write table to csv with option to merge segments on ellipsis
+        output_transcript = input_transcript.replace('.xlsx', '.csv')
+        output_file = old_xlsx_to_labels_csv(input_transcript, merge_segments = merge_ellipsis)
+        output_files.append(output_file)
+    return output_files
+def deidentify_transcripts(input_file_list, who='student'):
+    output_files=[]
+    for file in input_file_list:
+        basename = os.path.basename(file)
+        ext = file.split('.')[-1]
+        if file.endswith('.xlsx') or file.endswith('.xls'):
+            df = pd.read_excel(file)
+        elif file.endswith('.csv'):
+            df = pd.read_csv(file)
+        elif file.endswith('.tsv'):
+            df = pd.read_csv(file, sep='\t')
+        elif file.endswith('.txt'):
+            df = pd.read_csv(file, sep='\t')
+        else:
+            gr.Warning("File type not supported (must be .xlsx, .xls, .csv, .tsv, or .txt)")
+        try:
+            df = deidentify_speaker(df, who=who)
+        except ValueError as e:
+            gr.Warning(f"{e}: {basename} ")
+            continue
+        output_file = file.replace(f'.{ext}', f'_deidentified.{ext}')
+        if ext == 'xlsx' or ext == 'xls':
+            df.to_excel(output_file, index=False)
+        elif ext == 'csv':
+            df.to_csv(output_file, index=False)
+        elif ext == 'tsv' or ext == 'txt':
+            df.to_csv(output_file, sep='\t', index=False)
+        output_files.append(output_file)
+    return output_files
+# --- Tab: transcript -> XLSX with TalkMoves dropdown columns ---
+# Components for the TM converter. The four outputs are listed in the same
+# order as the values returned by convert_xlsx_to_TMxlsx:
+# output files, per-transcript logs, global log, error text.
+input_xlsx = gr.Files(label="Input XLSX or CSV transcript file", type="filepath", file_types=[".xlsx", ".csv"])
+output_xlsx_tm = gr.Files(label="Output XLSX file", type="filepath", file_types=[".xlsx"])
+# NOTE(review): convert_xlsx_to_TMxlsx returns a list of log paths for this
+# slot, while this is a single-file component - confirm that is intended.
+process_log_tm = gr.File(label="Process Log", type="filepath", file_types=[".log", ".txt"] )
+global_transfer_log_tm = gr.File(label="Global transfer log", type="filepath", file_types=[".log", ".txt"])
+error_check_tm = gr.Textbox(label="Error Check", type="text")
+interface_tm = gr.Interface(fn=convert_xlsx_to_TMxlsx,
+                        inputs=input_xlsx,
+                        outputs=[output_xlsx_tm, process_log_tm, global_transfer_log_tm, error_check_tm],
+                        title="transcript-->XLSX+TM_dropdown",
+                        description="Converts XLSX or csv transcript to XLSX+TM transcript with prefilled dropdown for talkmoves",
+                        live=False,
+                        allow_flagging="never",)
+# --- Tab: XLSX -> ELAN-compatible TSV ---
+# Components for the XLSX-to-ELAN converter, wired to convert_xlsx_to_ELANtsv.
+# NOTE(review): the input accepts .csv as well, but the handler derives the
+# output name by replacing '.xlsx' - confirm CSV input is really supported.
+input_x2e = gr.Files(label="Input XLSX or CSV transcript file", type="filepath", file_types=[".xlsx", ".csv"])
+output_x2e = gr.Files(label="Output ELAN-compatible tsv file", type="filepath", file_types=[".tsv",'.txt'])
+# Log and error-check outputs are not wired up for this tab (kept for reference):
+# process_log_x2e = gr.File(label="Process Log", type="filepath", file_types=[".log", ".txt"] )
+# global_transfer_log_x2e = gr.File(label="Global transfer log", type="filepath", file_types=[".log", ".txt"])
+# error_check_x2e = gr.Textbox(label="Error Check", type="text")
+interface_x2e = gr.Interface(fn=convert_xlsx_to_ELANtsv, # TODO: swap out for correct fn
+                        inputs=input_x2e,
+                        outputs=output_x2e,
+                        title="XLSX-->ELAN",
+                        description="Converts XLSX transcript to ELAN-compatible tsv file",
+                        live=False,
+                        allow_flagging="never",)
+# --- Tab: ELAN export -> CSV ---
+# Components for the ELAN-to-CSV converter. The checkbox value is passed as
+# the second argument (merge_ellipsis) of convert_ELANtsv_to_CSV.
+input_e2c = gr.Files(label="Input ELAN-compatible tsv file", type="filepath", file_types=[".tsv",'.txt'])
+merge_e2c = gr.Checkbox(label="Merge segments on ellipsis?")
+output_e2c = gr.Files(label="Output CSV file", type="filepath", file_types=[".csv"])
+interface_e2c = gr.Interface(fn=convert_ELANtsv_to_CSV, # TODO: swap out for correct fn
+                        inputs=[input_e2c, merge_e2c],
+                        outputs=[output_e2c],
+                        title="ELAN-->CSV",
+                        description="Converts ELAN-exported file (.txt or .tsv, tab separated values) to standardized CSV file with rows sorted by segment start time. Optionally merges segments on ellipsis.",
+                        live=False,
+                        allow_flagging="never",)
+# --- Tab: XLSX -> CSV ---
+# Components for the XLSX-to-CSV converter. The checkbox value is passed as
+# the second argument (merge_ellipsis) of convert_xlsx_to_csv.
+input_x2c = gr.Files(label="Input XLSX file", type="filepath", file_types=[".xlsx", ".csv"])
+merge_x2c = gr.Checkbox(label="Merge segments on ellipsis?")
+output_x2c = gr.Files(label="Output CSV file", type="filepath", file_types=[".csv"])
+interface_x2c = gr.Interface(fn=convert_xlsx_to_csv, # TODO: swap out for correct fn
+                        inputs=[input_x2c, merge_x2c],
+                        outputs=[output_x2c],
+                        title="XLSX-->CSV",
+                        description="Converts old version XLSX transcript (with a single Timecode column) to standardized CSV file with rows sorted by segment start time. Optionally merges segments on ellipsis.",
+                        live=False,
+                        allow_flagging="never",)
+# --- Tab: CSV -> XLSX prepared for annotation ---
+# Components for the annotation converter. Each radio choice is a
+# (label, value) pair; the value is passed to convert_for_annotation as
+# annotation_scheme, so "TalkMove" sends "TM" and "None" sends None.
+input_c2a = gr.Files(label="Input CSV file", type="filepath", file_types=[".csv"])
+annotation_scheme_c2a = gr.Radio(label="Annotation Scheme", choices=[("CPS","CPS"), ("TalkMove","TM"),("None",None)])
+output_c2a = gr.Files(label="Output XLSX file", type="filepath", file_types=[".xlsx"])
+interface_c2a = gr.Interface(
+                        fn=convert_for_annotation, # TODO: swap out for correct fn
+                        inputs=[input_c2a, annotation_scheme_c2a],
+                        outputs=[output_c2a],
+                        title="CSV-->XLSX+annotation",
+                        description="Converts CSV file to XLSX file for annotation (added columns for CPS or TM or None)",
+                        live=False,
+                        allow_flagging="never",
+                        # submit_btn="Convert"
+                        )
+# --- Tab: speaker deidentification ---
+# Components for the deidentifier. The radio value is passed to
+# deidentify_transcripts as `who`.
+# NOTE(review): no default is set on the radio, while the handler's own
+# default is 'student' - confirm what is sent when nothing is selected.
+input_di = gr.Files(label="Input transcript file", type="filepath", file_types=[".xlsx", ".xls",".csv", ".tsv", ".txt"])
+who_di = gr.Radio(label="Who to deidentify", choices=[("student","student"), ("all","all")])
+output_di = gr.Files(label="Output deidentified transcript file", type="filepath", file_types=[".xlsx", ".xls",".csv", ".tsv", ".txt"])
+interface_di = gr.Interface(
+    fn=deidentify_transcripts,
+    inputs=[input_di, who_di],
+    outputs=[output_di],
+    title="Deidentify",
+    description="Deidentify speaker labels in a transcript. Compatible with .xlsx, .xls, .csv, .tsv, .txt files with a column containing speaker labels. Will not work if speaker column is missing a header. Speaker names or IDs will be replaced with a deidentified label numbered in order of appearance. Choose whether to deidentify just students or all speakers.",
+    live=False,
+    allow_flagging="never",
+    )
+# Combine the individual interfaces into one tabbed app. The two lists are
+# positional: the n-th tab name labels the n-th interface.
+tab_interface = gr.TabbedInterface(
+    [
+    interface_e2c,
+    interface_c2a,
+    interface_x2e,
+    interface_x2c,
+    interface_tm,
+    interface_di
+    ]
+    ,
+    ["ELAN→CSV",
+    "CSV→XLSX+annotation",
+    "XLSX→ELAN",
+    "XLSX→CSV",
+    "transcript→XLSX+TM_dropdown",
+    "Deidentify"
+    ]
+)
+# TODO: XLSX to csv (seg_labels or utt_labels)
+# TODO: XLSX to merged on ellipsis, keep XLSX format
+# Start the server at import time, listening on all interfaces on port 7860.
+tab_interface.launch(server_name="0.0.0.0", server_port=7860)

transcript_utils.py ADDED Viewed

	@@ -0,0 +1,745 @@

+import json
+import math
+import os
+import re
+import csv
+from pathlib import Path
+import gradio as gr
+import nltk
+import pandas as pd
+from nltk.tokenize import sent_tokenize
+from openpyxl import Workbook
+from openpyxl.utils.dataframe import dataframe_to_rows
+from openpyxl.worksheet.datavalidation import DataValidation
+from pandas._libs.tslibs import timestamps
+def convert_transcript_for_TM(file_list):
+    """Convert transcripts for TalkMoves Annotation
+    Input can be xlsx or csv transcript file
+    Can handle sepraate start and end time columns or a single timecode column
+    Output will have separate start and end timestamps in HH:MM:SS.sss format
+    Args:
+        file_list (_type_): _description_
+    Raises:
+        gr.Error: _description_
+        gr.Error: _description_
+    Returns:
+        _type_: _description_
+    """
+    # Regular expression pattern for matching speaker names and timecodes.
+    bracket_re = re.compile(r'(?:\[[UI|ui|Inaudible|inaudible|overlapping speech|VIDEO SILENCE|teacher explaining in background].*\]\W{0,2})')
+    # Regular expression pattern for matching anything enclosed in square brackets.
+    all_bracket_re = re.compile(r'(?:\[.*\]\W{0,2})')
+    # whether remove the inaudible
+    do_remove_inaudible = True
+    # whether_keep_context_switch
+    do_keep_context_switch = True
+    # whether_convert_to_timestamp if start and end time are in seconds and in separate columns
+    convert_to_timestamp = True
+    error_message = [] # List of error messages to be displayed to the user.
+    global_stat_dict = {} # Dictionary of global statistics.
+    output_filepath_list = [] # List of output file paths.
+    trans_log_filepath_list = [] # List of transcription log file paths.
+    for file in file_list:
+        filename = file.split('/')[-1] # Get the filename from the file.
+        filepath = os.path.dirname(file) # Get the file path from the file.
+        # Read the file into a Pandas DataFrame depending on its file format.
+        if filename.endswith('.xlsx'):
+            df = pd.read_excel(file, index_col=0)
+            output_filename = f"{filename[:-5]}" + "_TMcoded.xlsx"
+        elif filename.endswith('.csv'):
+            df = pd.read_csv(file, index_col=0, error_bad_lines=False)
+            output_filename = f"{filename[:-4]}" + "_TMcoded.xlsx"
+        else:
+            raise gr.Error(f"{file} format is wrong")
+        # Remove the "Copy of" prefix from the output filename, if present.
+        if output_filename.startswith("Copy of "):
+            output_filename = output_filename[8:]
+        # Remove the word "_Transcript" from the output filename, if present.
+        if '_Transcript' in output_filename:
+            # print("before: "+output_filename)
+            error_message.append("before: "+output_filename)
+            output_filename = ''.join(output_filename.split('_Transcript'))
+            # print("after: "+output_filename)
+            error_message.append("after: "+output_filename)
+        # Construct the output file and transcription log file paths.
+        output_filepath = os.path.join(filepath, output_filename)
+        trans_log_filepath = os.path.join(filepath, f"{output_filename}"+ ".log")
+        # Open the transcription log file for writing.
+        with open(trans_log_filepath, "w") as outfile:
+            sub_cnt_in_file = 0
+            empty_speaker_cnt_in_file = 0
+            turn_skipped_in_file = 0
+            turn_skipped_speaker_switch_in_file = 0
+            snt_mark_skip_in_file = 0
+            snt_skipped_in_file = 0
+            chat_flag_in_speaker_time_line = 0
+            chat_flag_in_content_line = 0
+            all_inaudible_in_file = 0
+            all_bracket_in_file = 0
+            all_snts_in_file = 0
+            all_token_cnt_in_file = 0
+            #index	Timecode	Duration	Speaker	Dialogue	Annotations	Error Type
+            #1	00:00:05:04 - 00:00:07:12	00:00:02:08	Tutor	Did you... How was your Halloween?
+            turns = []
+            time_stamps = []
+            speakers = []
+            chat_flags = []
+            sentences = []
+            snt_ids = []
+            ## parse the df flexibly: find key column names which might vary dependign on transcript source
+            # set all column names to lowercase
+            df.columns = map(str.lower, df.columns)
+            # several possibilities for column names, detect which are present
+            uttID_keys = ['utt','seg','utt_id','seg_id','index']
+            speaker_keys = ['speaker']
+            start_keys=['start_sec','start','start_time','timestart']
+            end_keys=['end_sec','end','end_time','timeend']
+            timestamp_keys = ['timecode','timestamp']
+            content_keys=['dialogue','utterance','transcript','text']
+            # detect which is used in this df
+            uttID_key = next((key for key in uttID_keys if key in df.columns), None)
+            speaker_key = next((key for key in speaker_keys if key in df.columns), None)
+            content_key = next((key for key in content_keys if key in df.columns), None)
+            # check if separate start and end times are present, otherwise assume single timecode column
+            if any(df.columns.isin(start_keys)):
+                start_key = next((key for key in start_keys if key in df.columns), None)
+                end_key = next((key for key in end_keys if key in df.columns), None)
+                time_format = 'seconds'
+                if convert_to_timestamp:
+                    # convert to timestamp format HH:MM:SS.sss - HH:MM:SS.sss
+                    df['timecode'] = df.apply(lambda x: f"{sec_to_HHMMSS(x[start_key])} - {sec_to_HHMMSS(x[end_key])}", axis=1)
+                    timestamp_key='timecode'
+                    time_format = 'timestamp'
+            else:
+                timestamp_key=next((key for key in timestamp_keys if key in df.columns), None)
+                time_format = 'timestamp'
+            # Turn started with 1, the same as molly's transcripts
+            for i, row in df.iterrows():
+                turn = row[uttID_key] if uttID_key else i+1
+                speaker = row[speaker_key]
+                time_str = row[timestamp_key]
+                content = "" if pd.isna(row[content_key]) else row[content_key].strip("\n")
+                # when speaker is empty, use the previous speaker
+                if speaker == "":
+                    if speakers:
+                        speaker = speakers[-1]
+                        empty_speaker_cnt_in_file += 1
+                        outfile.write(f"{turn}: found empty speaker, use the speaker in previous turn: {speaker}\n")
+                    else:
+                        raise gr.Error(f"{row}, the first turn is empty speaker")
+                # clean after the sentence tokenize
+                snts = sent_tokenize(content)
+                all_snts_in_file += len(snts)
+                snt_skipped_in_turn = 0
+                for i, snt in enumerate(snts):
+                    remove_flag = False
+                    inaudible_search = re.findall(bracket_re, snt)
+                    if inaudible_search:
+                        all_inaudible_in_file += len(inaudible_search)
+                        outfile.write(f"{turn}, {inaudible_search}, inaudible found in snt: {snt}\n")
+                    all_bracket_search = re.findall(all_bracket_re, snt)
+                    if all_bracket_search:
+                        all_bracket_in_file += len(all_bracket_search)
+                        outfile.write(f"{turn}, {all_bracket_search} bracket found in snt: {snt}\n")
+                    # only remove the [inaudible xxx] when it is the whole sentence.
+                    inaudible_match = re.fullmatch(bracket_re, snt)
+                    if inaudible_match:
+                        if do_keep_context_switch:
+                            # if keep context switch
+                            if speakers and speaker == speakers[-1]:
+                                # share the same speaker, no context switching, just remove it
+                                remove_flag = True
+                            else:
+                                # different speakers, it is the context switching.
+                                if len(snts) == 1:
+                                    # current empty sentence is the only single sentence
+                                    remove_flag = False
+                                else:
+                                    if i != len(snts)-1:
+                                        # current empty utterance is not the last one, just delete it
+                                        remove_flag = True
+                                    else:
+                                        # current empty utterance is the last one, keep it.
+                                        if snt_skipped_in_turn == len(snts)-1:
+                                            # all previous snts are empty, then keep this to not skip the whole turn
+                                            remove_flag = False
+                                        else:
+                                            remove_flag = True
+                        else:
+                            # if not keep context switch, then simply remove all empty utterance
+                            remove_flag = True
+                    # If remove_flag is true:
+                    if remove_flag:
+                        # Increment sub_cnt_in_file and snt_mark_skip_in_file
+                        sub_cnt_in_file += 1
+                        snt_mark_skip_in_file += 1
+                        # Write the following message to outfile:
+                        outfile.write(f"{turn}, sub happend: {snt}, skip this sentence\n")
+                        # If do_remove_inaudible is true:
+                        if do_remove_inaudible:
+                            snt_skipped_in_file += 1
+                            snt_skipped_in_turn += 1
+                            continue
+                    # Add to pd:
+                    # Append turn to turns list
+                    turns.append(turn)
+                    # Set snt_id to the string f"{turn}.{i}"
+                    snt_id = f"{turn}.{i}"
+                    # Append time_str to time_stamps list
+                    time_stamps.append(time_str)
+                    # Append speaker to speakers list
+                    speakers.append(speaker)
+                    # Set sentence to the string representation of snt, with whitespace removed from the start and end
+                    sentence = str(snt).strip().rstrip("\n")
+                    # Calculate the number of tokens in sentence and add to all_token_cnt_in_file
+                    token_cnt = len(nltk.word_tokenize(sentence))
+                    all_token_cnt_in_file += token_cnt
+                    # Append snt_id to snt_ids list
+                    snt_ids.append(snt_id)
+                    # Append sentence to sentences list
+                    sentences.append(sentence)
+                if snt_skipped_in_turn == len(snts):
+                    # all snts in turn are skiped, then skip the turn
+                    turn_skipped_in_file += 1
+                    if (speakers and speaker != speakers[-1]) or not speakers:
+                        turn_skipped_speaker_switch_in_file += 1
+                    outfile.write(f"{turn}, since all snts are empty, skip this whole turn {content}\n")
+            # Create a new DataFrame with the following columns:
+            new_df = pd.DataFrame({
+                "Sentence_ID": snt_ids, # A
+                "TimeStamp": time_stamps, #B
+                "Turn" : turns, #C
+                "Speaker" : speakers, #D
+                "Sentence" : sentences #E
+            })
+            # assert turn_skipped_speaker_switch_in_file==0, "Some speaker switch turn skipped"
+            new_df["Teacher_TM"] = None #F
+            new_df["Student_TM"] = None #G
+            # write new_df to xlsx file
+            new_df.to_excel(output_filepath, index=False)
+            # https://openpyxl.readthedocs.io/en/latest/api/openpyxl.utils.dataframe.html#openpyxl.utils.dataframe.dataframe_to_rows
+            wb = Workbook()
+            ws = wb.active
+            teacher_dv = DataValidation(type="list", formula1='",1-None,2-Keep-Together,3-Getting-Student-to-Relate,4-Restating,5-Revoicing,6-Context,7-Press-for-Accuracy,8-Press-for-Reasoning"', allow_blank=True)
+            student_dv = DataValidation(type="list", formula1='",1-None,2-Relate-to-Another-Student,3-Asking-for-More-info,4-Making-a-Claim,5-Providing-Evidence/Reasoning"', allow_blank=True)
+            ws.add_data_validation(teacher_dv)
+            ws.add_data_validation(student_dv)
+            teacher_dv.add('F2:F1048576')
+            student_dv.add('G2:G1048576')
+            for r in dataframe_to_rows(new_df, index=False, header=True):
+                ws.append(r)
+            wb.save(output_filepath)
+            stat_dict = {
+                "chat_flag_in_speaker_time_line": chat_flag_in_speaker_time_line,
+                "chat_flag_in_content_line": chat_flag_in_content_line,
+                "empty_speaker_cnt_in_file": empty_speaker_cnt_in_file,
+                "ori_total_turn": df.shape[0],
+                "ori_total_snt": all_snts_in_file,
+                "turn_skipped": turn_skipped_in_file,
+                "turn_skipped_speaker_switch_in_file": turn_skipped_speaker_switch_in_file,
+                "snt_skipped": snt_skipped_in_file,
+                "remaining_snt": all_snts_in_file - snt_skipped_in_file,
+                "all_token_cnt_in_file": all_token_cnt_in_file,
+                "avg_token_cnt_per_snt": all_token_cnt_in_file/(all_snts_in_file - snt_skipped_in_file),
+                "sub_cnt_in_file": sub_cnt_in_file,
+                "all_inaudible_in_file": all_inaudible_in_file,
+                "all_bracket_in_file": all_bracket_in_file,
+                "other_bracket_in_file": all_bracket_in_file - all_inaudible_in_file
+            }
+            if all_inaudible_in_file != all_bracket_in_file:
+                # print(f"{filename} has special brakets")
+                error_message.append(f"Warning: {filename} has special brakets")
+            for k, v in stat_dict.items():
+                global_stat_dict[k] = global_stat_dict.get(k,0) + v
+            outfile.write(f"{output_filepath}, {json.dumps(stat_dict, indent=4)}")
+        output_filepath_list.append(output_filepath)
+        trans_log_filepath_list.append(trans_log_filepath)
+    for k, v in global_stat_dict.items():
+        if "avg" in k:
+            global_stat_dict[k] = global_stat_dict[k]/len(file_list)
+    global_log_filepath = os.path.join(filepath, "global_transfer"+ ".log")
+    with open(global_log_filepath, "w") as outfile:
+        outfile.write(f"global_stat_dict: {json.dumps(global_stat_dict, indent=4)}")
+    # error_check
+    if global_stat_dict["all_inaudible_in_file"] != global_stat_dict["all_bracket_in_file"]:
+        error_message.append("Error: 'all_inaudible_in_file' does not match 'all_bracket_in_file'")
+    if global_stat_dict["other_bracket_in_file"] != 0:
+        error_message.append("Error: 'other_bracket_in_file' is not zero")
+    return output_filepath_list, trans_log_filepath_list, error_message, global_log_filepath
+def add_CPS_columns(df):
+    # Observation	Instructions	CONST_SharesU_Situation	CONST_SharesU_CorrectSolutions	CONST_SharesU_IncorrectSolutions	CONST_EstablishesCG_Confirms	CONST_EstablishesCG_Interrupts	NEG_Responds_Reasons	NEG_Responds_QuestionsOthers	NEG_Responds_Responds	MAINTAIN_Initiative_Criticizes	NEG_MonitorsE_Results	NEG_MonitorsE_GivingUp	NEG_MonitorsE_Strategizes	NEG_MonitorsE_Save	MAINTAIN_Initiative_Suggestions	MAINTAIN_Initiative_Compliments	MAINTAIN_FulfillsR_InitiatesOffTopic	MAINTAIN_FulfillsR_JoinsOffTopic	MAINTAIN_FulfillsR_Support	MAINTAIN_FulfillsR_Apologizes	Notes
+    annotation_columns = ['Observation','Instructions', 'CONST_SharesU_Situation', 'CONST_SharesU_CorrectSolutions', 'CONST_SharesU_IncorrectSolutions', 'CONST_EstablishesCG_Confirms', 'CONST_EstablishesCG_Interrupts', 'NEG_Responds_Reasons', 'NEG_Responds_QuestionsOthers', 'NEG_Responds_Responds', 'MAINTAIN_Initiative_Criticizes', 'NEG_MonitorsE_Results', 'NEG_MonitorsE_GivingUp', 'NEG_MonitorsE_Strategizes', 'NEG_MonitorsE_Save', 'MAINTAIN_Initiative_Suggestions', 'MAINTAIN_Initiative_Compliments', 'MAINTAIN_FulfillsR_InitiatesOffTopic', 'MAINTAIN_FulfillsR_JoinsOffTopic', 'MAINTAIN_FulfillsR_Support', 'MAINTAIN_FulfillsR_Apologizes', 'Notes']
+    # add these columns to the end of the df in this order
+    for col in annotation_columns:
+        df[col]=''
+    return df
+def add_TM_columns(df):
+    annotation_columns = ['Teacher_TM', 'Student_TM']
+    # add these columns to the end of the df in this order
+    for col in annotation_columns:
+        df[col]=''
+    return df
def convert_transcript_for_annotation(file, annotation_scheme=None):
    """Export a label-csv transcript as an .xlsx sheet for annotators.

    The csv is read with ``parse_label_csv``. The output keeps the speaker,
    utterance and uttID columns, replaces the second-based times with
    ``TimeStart``/``TimeEnd`` strings produced by ``sec_to_HHMMSS``, and adds a
    ``recordingID`` column derived from the transcript filename.

    Args:
        file: path to the transcript csv.
        annotation_scheme: 'CPS' or 'TM' appends that scheme's blank annotation
            columns and prefixes the output filename with the scheme name; any
            other value writes the plain table under the original name.

    Returns:
        Path of the written .xlsx file (same directory as ``file``).

    Raises:
        gr.Error: wrapping any exception raised during the conversion.
    """
    filename, _ext = os.path.splitext(os.path.basename(file))
    filepath = os.path.dirname(file)
    try:
        transcript = parse_label_csv(file)
        recording_id = get_sessname_from_filename(filename)

        out_df = transcript.copy()
        out_df['recordingID'] = recording_id
        for stamp_col, sec_col in (('TimeStart', 'start_sec'), ('TimeEnd', 'end_sec')):
            out_df[stamp_col] = out_df[sec_col].apply(sec_to_HHMMSS)
        out_df = out_df[['speaker', 'TimeStart', 'TimeEnd', 'utterance', 'recordingID', 'uttID']]

        # The scheme decides both the extra columns and the filename prefix.
        if annotation_scheme == 'CPS':
            out_df = add_CPS_columns(out_df)
            prefix = 'CPS_'
        elif annotation_scheme == 'TM':
            out_df = add_TM_columns(out_df)
            prefix = 'TM_'
        else:
            prefix = ''

        output_file = os.path.join(filepath, f"{prefix}{filename}.xlsx")
        out_df.to_excel(output_file, index=False)
        return output_file
    except Exception as e:
        raise gr.Error(f"{filename}: error {e}")
def HHMMSS_to_sec(time_str):
    """Convert a timestamp to seconds.

    Supported inputs:
      * ``HH:MM:SS[.sss]``
      * ``MM:SS[.sss]`` (hours assumed to be 0)
      * ``HH:MM:SS:f`` where the trailing field has 1-3 digits and is read as
        tenths, hundredths or thousandths of a second respectively
      * a plain number of seconds, either as a string or as an int/float

    Surrounding whitespace is ignored.

    Returns:
        Seconds as a float, or None when the input is None, empty, NaN, or in
        a format that is not supported (a message is printed in that case).

    Raises:
        ValueError: if a colon-separated field is not numeric.
    """
    if time_str is None:
        return None
    if isinstance(time_str, (int, float)):
        # Already in seconds. NaN is the only float unequal to itself; treat
        # it as a missing value.
        return None if time_str != time_str else float(time_str)

    time_str = str(time_str).strip()
    if not time_str:
        return None

    n_colons = time_str.count(':')
    if n_colons == 2:
        h, m, s = time_str.split(':')
    elif n_colons == 3:
        # Timestamps with a colon-delimited sub-second field after the seconds.
        h, m, s, u = time_str.split(':')
        # The number of digits tells us the unit of the sub-second field.
        if len(u) == 1:
            print('Weird time format detected - HH:MM:SS:tenths - please verify this is how you want the time interpreted')
        if not 1 <= len(u) <= 3:
            print(f'input string format not supported: {time_str}')
            return None
        s = int(s) + float(u) / 10 ** len(u)
    elif n_colons == 1:
        m, s = time_str.split(':')
        h = 0
    else:
        try:
            return float(time_str)  # maybe it is already in seconds
        except ValueError as e:
            # Report and return None like the other unsupported-format
            # branches; an un-raised gr.Error would be silently discarded.
            print(f"Error converting time to seconds: {e}")
            return None
    return int(h) * 3600 + int(m) * 60 + float(s)
def sec_to_HHMMSS(seconds):
    """Format a number of seconds as an ``HH:MM:SS.sss`` timestamp string.

    ``seconds`` may be anything ``float()`` accepts. The value is rounded to
    milliseconds *before* being split into hours/minutes/seconds, so an input
    such as 59.9996 carries into the minutes field ("00:01:00.000") instead of
    producing an invalid "00:00:60.000".
    """
    seconds = round(float(seconds), 3)
    m, s = divmod(seconds, 60)
    h, m = divmod(m, 60)
    return f"{int(h):02d}:{int(m):02d}:{s:06.3f}"
def molly_old_xlsx_to_table(xl_file): #TODO: check against isatasr
    """Read a contractor transcription .xlsx into a timed-utterance table.

    The first sheet is expected to have these columns:
      #:           utterance number
      Timecode:    "HH:MM:SS:ss - HH:MM:SS:ss"
      Duration:    HH:MM:SS:ss
      Speaker:     str
      Dialogue:    str
      Annotations: blank
      Error Type:  blank

    Returns a DataFrame with columns
    ``uttID, speaker, transcript, start_sec, end_sec``.
    """
    with pd.ExcelFile(xl_file) as workbook:
        sheet_names = workbook.sheet_names
        table = pd.DataFrame(pd.read_excel(workbook, sheet_names[0]))

    # "start - end" -> two text columns, then each converted to seconds.
    table[['start_time', 'end_time']] = table['Timecode'].str.split('-', expand=True)
    for sec_col, stamp_col in (('start_sec', 'start_time'), ('end_sec', 'end_time')):
        table[sec_col] = table[stamp_col].str.strip().apply(HHMMSS_to_sec)

    table.drop(labels=['Annotations', 'Error Type', 'Duration'], axis=1, inplace=True)
    selected = table[['#', 'Speaker', 'Dialogue', 'start_sec', 'end_sec']]
    return selected.rename(columns={'#': 'uttID', 'Speaker': 'speaker', 'Dialogue': 'transcript'})
def old_xlsx_to_table(xl_file):#TODO: check against isatasr
    """Read a single-timecode transcription .xlsx into a timed-utterance table.

    The first sheet is read, column names are lower-cased, and the
    ``timecode`` column ("start - end") is split and converted to seconds with
    ``HHMMSS_to_sec``.

    Returns:
        DataFrame with columns ``uttID, speaker, transcript, start_sec,
        end_sec``, sorted by ``start_sec`` with a fresh 0..n-1 index.

    Raises:
        gr.Error: wrapping any exception raised while reading or converting.
    """
    try:
        # read the first sheet of the Excel file into a DataFrame
        print(f'...reading {xl_file}...')
        table = pd.read_excel(xl_file, sheet_name=0)
        print(f'...done reading {xl_file}...')
        # convert column names to lowercase (str() guards non-string headers)
        table.columns = [str(col).lower() for col in table.columns]
        # extract start and end time from the Timecode column
        print('...splitting Timecode column into start and end time...')
        timecodes = table['timecode'].str.split(' - ', expand=True)
        table['start_time'] = timecodes[0]
        table['end_time'] = timecodes[1]
        print('...done splitting Timecode column into start and end time...')
        # convert start and end time to seconds using the HHMMSS_to_sec function
        print('...converting start and end time to seconds...')
        table['start_sec'] = table['start_time'].apply(HHMMSS_to_sec)
        table['end_sec'] = table['end_time'].apply(HHMMSS_to_sec)
        print('...done converting start and end time to seconds...')
        # drop unnecessary columns
        print('...dropping unnecessary columns...')
        table.drop(['timecode', 'annotations', 'error type', 'duration'], axis=1, inplace=True)
        # rename columns
        print('...renaming columns...')
        table.rename(columns={'#': 'uttID', 'speaker': 'speaker', 'dialogue': 'transcript'}, inplace=True)
        # reorder columns
        print('...reordering columns...')
        table = table[['uttID', 'speaker', 'transcript', 'start_sec', 'end_sec']]
        # ignore_index already renumbers the rows; a further reset_index()
        # would add a sixth 'index' column, which merge_ellipsis rejects.
        table = table.sort_values(by='start_sec', ignore_index=True)
        return table
    except Exception as e:
        # gr.Error is an exception class: it has no effect unless raised.
        raise gr.Error(f'Error converting {xl_file}: {e}') from e
def table_to_ELAN_tsv(table:pd.DataFrame, path:str):#TODO: check against isatasr
    """Write ``table`` as a tab-separated file for ELAN import; return ``path``.

    The index is not written and floats are formatted with three decimals.
    """
    table.to_csv(
        path,
        sep='\t',
        float_format='%.3f',
        index=False,
    )
    return path
def table_to_labels_csv(table:pd.DataFrame, path:str):
    """Write ``table`` in the utt_labels csv format and return ``path``.

    Empty strings are treated as missing values, and rows in which both
    ``speaker`` and ``utterance`` are missing are dropped before writing.
    The index is not written and floats are formatted with three decimals.
    """
    with_missing = table.replace('', np.nan)
    # how='all': a row is dropped only when speaker AND utterance are missing.
    kept_rows = with_missing.dropna(subset=['speaker', 'utterance'], how='all')
    kept_rows.to_csv(path, float_format='%.3f', index=False)
    return path
def readELANtsv(file, fmt=None):
    """Read an ELAN tab-delimited export into a unified segment table.

    Leading rows with fewer than 4 tab-separated fields are treated as header
    rows and skipped. Every following non-blank row must have one of the known
    column counts (4, 5, 6, 9, 10 or 12); the positions of the speaker, start
    time, end time and utterance fields are chosen from that count. Times are
    converted to seconds with ``HHMMSS_to_sec``.

    Args:
        file: path to the tab-separated file.
        fmt: pass 'AUG23' when 5-column rows have their blank column first
            rather than second.

    Returns:
        DataFrame with columns ``seg, speaker, utterance, start_sec, end_sec``
        sorted by ``start_sec``; ``seg`` numbers the rows in sorted order.

    Raises:
        ValueError: if no row has at least 4 columns, or a data row has an
            unrecognised number of columns.
    """
    # column count -> ((speaker, start, end, utterance) positions, message)
    layouts = {
        4: ((0, 1, 2, 3), 'detected 4 columns, assuming: speaker,start_HHMMSS, end_HHMMSS, utterance '),
        6: ((1, 2, 3, 4), 'detected 6 columns, assuming: _,speaker,start_HHMMSS, end_HHMMSS, utterance,_ '),
        9: ((0, 2, 4, 8), 'detected 9 columns, assuming: speaker,_,start_HHMMSS,_,end_HHMMSS,_,_,_,utterance '),
        10: ((0, 2, 4, 8), 'detected 10 columns, assuming: speaker,_,start_HHMMSS,_,end_HHMMSS,_,_,_,utterance,_ '),
        12: ((0, 2, 5, 11), 'detected 12 columns, assuming: speaker,_,start_HHMMSS,_,_,end_HHMMSS,_,_,_,_,_,utterance '),
    }
    # Straight from ELAN there is sometimes a superfluous blank column; which
    # one depends on the export flavour.
    if fmt == 'AUG23':
        layouts[5] = ((1, 2, 3, 4), 'detected extra blank 1st column, will remove')
    else:
        layouts[5] = ((0, 2, 3, 4), 'detected extra blank 2nd column, will remove')

    # Read everything once. Re-positioning the file with seek(skiprows) would
    # seek to a *byte* offset and lose data rows when header lines are short.
    with open(file, newline='') as in_file:
        rows = list(csv.reader(in_file, delimiter="\t"))

    # 4 being the min number of cols ELAN exports have
    skiprows = next((ix for ix, row in enumerate(rows) if len(row) >= 4), None)
    if skiprows is None:
        raise ValueError(f'No row with at least 4 tab-separated columns found in {file}')
    if skiprows > 0:
        print(f'Detected {skiprows} header rows to skip')

    labels = []  # transcript with speaker labels and timestamp in sec
    for i, utt in enumerate(rows[skiprows:]):
        if not ''.join(utt).strip():  # skip blank lines
            continue
        layout = layouts.get(len(utt))
        if layout is None:
            print(f'!!! transcript parse error on line {i} for {file}')
            print(utt)
            raise ValueError(f'Unknown transcript format with {len(utt)} columns for {file}')
        (spk_ix, start_ix, end_ix, utt_ix), message = layout
        if i == 0:
            print(message)
        labels.append((
            utt[spk_ix],
            utt[utt_ix],
            HHMMSS_to_sec(utt[start_ix]),
            HHMMSS_to_sec(utt[end_ix]),
        ))

    labels = pd.DataFrame(labels, columns=['speaker', 'utterance', 'start_sec', 'end_sec'])
    labels = labels.sort_values(by='start_sec', ignore_index=True)
    # Expose the sorted row number as the segment id.
    labels = labels.reset_index().rename(columns={'index': 'seg'})
    return labels
def merge_ellipsis(seg_labels):
    """Merge consecutive segments that were split with ellipses.

    A row is appended to the previous one when both have the same speaker, the
    previous utterance ends with '...' and the current one starts with '...'.
    The merged utterance keeps the first segment's start time, takes the last
    segment's end time and records every contributing segment id.

    Args:
        seg_labels: a DataFrame, or a path to a .csv/.tsv/.txt file readable
            by ``pd.read_csv``, with columns
            ``[seg,] speaker, utterance, start_sec, end_sec``. With 4 columns
            the index is used as ``seg``; with 5 columns the columns are
            renamed positionally. A DataFrame argument is not modified.

    Returns:
        DataFrame with columns ``utt, seg, speaker, utterance, start_sec,
        end_sec`` where ``utt`` numbers the merged utterances from 0 and
        ``seg`` is the list of original segment ids. Empty input gives an
        empty table.

    Raises:
        ValueError: for an unsupported argument type or column count.
    """
    if isinstance(seg_labels, str) and seg_labels.endswith(('.csv', '.tsv', '.txt')):
        df = pd.read_csv(seg_labels)
    elif isinstance(seg_labels, pd.DataFrame):
        df = seg_labels.copy()  # never mutate the caller's table
    else:
        raise ValueError('input seg_labels should be path to csv or pd.DataFrame')

    if len(df.columns) == 4:
        # no seg index yet
        df = df.reset_index().rename(columns={'index': 'seg'})
    elif len(df.columns) == 5:
        # first col is seg
        df.columns = ['seg', 'speaker', 'utterance', 'start_sec', 'end_sec']
    else:
        raise ValueError('input seg_labels should have 4 or 5 columns')

    merged = []
    current = None  # utterance being accumulated
    # Iterate by position so the result does not depend on the index labels.
    for row in df.to_dict('records'):
        continues_previous = (
            current is not None
            and current['speaker'] == row['speaker']
            and str(current['utterance']).endswith('...')
            and str(row['utterance']).startswith('...')
        )
        if continues_previous:
            current['utterance'] = str(current['utterance']) + str(row['utterance'])
            current['end_sec'] = row['end_sec']
            current['seg'].append(row['seg'])
        else:
            if current is not None:
                merged.append(current)
            current = dict(row)
            current['seg'] = [row['seg']]
    if current is not None:
        merged.append(current)  # catch final utterance

    df2 = pd.DataFrame(merged, columns=list(df.columns))
    # Clear up "......" left by merging. NOTE(review): this replaces every run
    # of periods, including a single sentence-final one, with a space.
    df2['utterance'] = df2['utterance'].str.replace(r'\.+', ' ', regex=True)
    # enumerate utterances
    df2.insert(0, 'utt', range(len(df2)))
    return df2
def add_dummy_seg_column(table):
    """Give a label table the ``utt``/``seg`` layout that merge_ellipsis emits.

    Label files produced by merge_ellipsis() have an 'utt' column (utterance
    id) and a 'seg' column listing the original segments of each utterance.
    This builds the same layout for tables made another way: 'uttID' is
    renamed to 'utt' (or the index is used when neither exists) and each
    'seg' entry is a one-element list holding that row's 'utt'.

    A table that already has a 'seg' column is returned unchanged. Otherwise
    the result has columns ``utt, seg, speaker, start_sec, end_sec,
    utterance``.
    """
    if 'seg' in table.columns.tolist():
        print("'seg' column already exists, not changing anything")
        return table

    if 'uttID' in table.columns.tolist():
        table = table.rename(columns={"uttID": "utt"})
    if 'utt' not in table.columns.tolist():
        table['utt'] = table.index

    table['seg'] = [[utt_id] for utt_id in table['utt']]
    ordered_columns = ['utt', 'seg', 'speaker', 'start_sec', 'end_sec', 'utterance']
    return table[ordered_columns]
def old_xlsx_to_labels_csv(xl_file, merge_segments=True):
    """Convert a single-timecode transcription .xlsx to a labels csv.

    The workbook is read with ``old_xlsx_to_table`` and the session name is
    taken from the filename. The csv is written to the current working
    directory with floats formatted to three decimals.

    Args:
        xl_file: path to the .xlsx file.
        merge_segments: if True, segments split with '...' are merged by
            ``merge_ellipsis`` and the file is named ``utt_labels_<sess>.csv``;
            if False the segments are written as read, to
            ``seg_labels_<sess>.csv``.

    Returns:
        The name of the csv file written.
    """
    table = old_xlsx_to_table(xl_file)
    sessname = get_sessname_from_filename(xl_file)

    if not merge_segments:
        save_file = f'seg_labels_{sessname}.csv'
        table.to_csv(save_file, index=False, float_format='%.3f')
        return save_file

    save_file = f'utt_labels_{sessname}.csv'
    merge_ellipsis(table).to_csv(save_file, index=False, float_format='%.3f')
    return save_file
def get_sessname_from_filename(filename):
    """Derive the session name from a transcript filename.

    Takes the file stem (no directory, no final extension) and removes each
    known transcript prefix/suffix marker, case-insensitively, in the order
    listed below. Longer markers come before the shorter ones they contain.
    """
    markers = (
        'reworked-transcript-diarized-timestamped-',
        'reworked_transcript-diarized-timestamped-',
        'reworked-diarized-timestamped-',
        'reworked_timestamped_',
        'reworked_',
        'reworked-',
        'transcript_diarized_timestamped_',
        'transcript-diarized-timestamped_',
        'transcript-diarized-timestamped-',
        '_transcript',
        '_tmcoded',
        'utt_labels_',
        'seg_labels_',
        '_redacted',
    )
    sessname = Path(filename).stem
    for marker in markers:
        sessname = re.sub(marker, '', sessname, flags=re.I)
    return sessname
def ELAN_to_labels_csv(ELANfile, merge_segments = True):
    """Convert an ELAN tab-delimited export to a labels csv.

    The session name is taken from the filename and the export is read with
    ``readELANtsv``. The csv is written to the current working directory with
    floats formatted to three decimals.

    Args:
        ELANfile: path to the ELAN export.
        merge_segments: if True, segments split with '...' are merged by
            ``merge_ellipsis`` and the file is named ``utt_labels_<sess>.csv``;
            if False the segments are written as read, to
            ``seg_labels_<sess>.csv``.

    Returns:
        The name of the csv file written.
    """
    # dumb but effective string wrangling to get sess name
    sessname = get_sessname_from_filename(ELANfile)
    # ELAN output as a DataFrame in the unified segment format
    labels = readELANtsv(ELANfile)

    if not merge_segments:
        save_file = f'seg_labels_{sessname}.csv'
        labels.to_csv(save_file, index=False, float_format='%.3f')
        return save_file

    save_file = f'utt_labels_{sessname}.csv'
    merge_ellipsis(labels).to_csv(save_file, index=False, float_format='%.3f')
    return save_file
def parse_label_csv(label_csv:str):
    """Read a labels csv in any of its known variants into one table layout.

    The first row is treated as a header when none of its cells looks like a
    number (digits, optionally with '.'). Without a header, column names are
    assigned from the column count:
      4 -> speaker, utterance, start_sec, end_sec
      5 -> seg, speaker, utterance, start_sec, end_sec
      6 -> utt, seg, speaker, utterance, start_sec, end_sec

    ``uttID`` is taken from the 'utt' column if present, else 'seg', else the
    row number. ``start_sec``/``end_sec`` are converted to numbers when every
    value parses, so files with and without a header give the same types;
    otherwise they are left as read.

    Returns:
        DataFrame with columns ``uttID, speaker, start_sec, end_sec,
        utterance``, or None when a headerless file has an unsupported number
        of columns.
    """
    table = pd.read_csv(label_csv, keep_default_na=False, header=None)
    row0 = table.iloc[0]
    is_header = not any(str(cell).replace('.', '').isdigit() for cell in row0)
    if is_header:
        table.columns = row0.tolist()
        table = table.iloc[1:].reset_index(drop=True)
    else:
        headerless_layouts = {
            4: ['speaker', 'utterance', 'start_sec', 'end_sec'],
            5: ['seg', 'speaker', 'utterance', 'start_sec', 'end_sec'],
            6: ['utt', 'seg', 'speaker', 'utterance', 'start_sec', 'end_sec'],
        }
        names = headerless_layouts.get(len(table.columns))
        if names is None:
            print(f'no header detected, csv has {len(table.columns)} columns, could not determine column names.')
            return None
        listed = ','.join(names[:-1]) + ', ' + names[-1]
        print(f'no header detected, assuming annotation file has columns [{listed}] ')
        table.columns = names

    # choose which column to use for uttID in table
    columns = table.columns.tolist()
    if 'utt' in columns:
        # 'seg' may be absent when a header names 'utt' only.
        table = table.rename(columns={"utt": "uttID"}).drop(columns='seg', errors='ignore')
    elif 'seg' in columns:
        table = table.rename(columns={"seg": "uttID"})
    else:
        table = table.reset_index().rename(columns={"index": "uttID"})
    table = table[['uttID', 'speaker', 'start_sec', 'end_sec', 'utterance']].copy()

    # With a header row every cell is read as text; make the times numeric.
    for time_col in ('start_sec', 'end_sec'):
        try:
            table[time_col] = pd.to_numeric(table[time_col])
        except (ValueError, TypeError):
            pass  # blanks or non-numeric timestamps: keep the column as read
    return table
def deidentify_speaker(df, who='all'):
    """Replace speaker IDs with generic labels numbered in order of appearance.

    The speaker column is the first of 'speaker', 'Speaker', 'speaker_id',
    'Speaker_ID' found in ``df``. The column is replaced in place.

    Args:
        df: table containing a speaker column.
        who: 'all' relabels every distinct speaker as speaker_1, speaker_2, ...
            'student' relabels only student IDs (text containing "student",
            or the numeric form 00-0000) as student_1, student_2, ... and
            leaves every other value, including missing ones, untouched.

    Returns:
        The same DataFrame, with the speaker column relabelled.

    Raises:
        ValueError: if no speaker column is found.
    """
    colnames = df.columns.tolist()
    speaker_key = next((key for key in ['speaker', 'Speaker', 'speaker_id', 'Speaker_ID'] if key in colnames), None)
    if not speaker_key:
        raise ValueError('No speaker column found in dataframe!')
    speakers = df[speaker_key].unique()
    if who == 'student':
        # detect student. ID format can be student_xxx or 00-0000 numeric.
        # Non-string values (e.g. NaN for a missing speaker) are never students.
        speakers = [
            s for s in speakers
            if isinstance(s, str) and ('student' in s.lower() or re.match(r'^\d{2}-\d{4}$', s))
        ]
        label = 'student'
    else:
        label = 'speaker'
    generic_speakers = [f'{label}_{i+1}' for i in range(len(speakers))]
    speaker_dict = dict(zip(speakers, generic_speakers))
    df[speaker_key] = df[speaker_key].replace(speaker_dict)
    return df