rosyvs committed on
Commit
d9fb961
·
1 Parent(s): 9636874

Add transcript processing functions to utils, expose them in the interface, and enhance utils for TM conversion

Browse files
Files changed (2) hide show
  1. app.py +234 -4
  2. utils.py +626 -1
app.py CHANGED
@@ -6,9 +6,11 @@ from pathlib import Path
6
  import random
7
  import gradio as gr
8
 
9
- from utils import (HHMMSS_to_sec, molly_xlsx_to_table, convert_and_trim_video,
10
  sort_transcript, table_to_ELAN_tsv,
11
- xlsx_to_table)
 
 
12
 
13
 
14
  def delete_files(files):
@@ -193,6 +195,121 @@ def convert_video(input_file, output_format):
193
  gr.Error(f"Error: {str(e)}")
194
  return f"Error: {str(e)}"
195
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
196
 
197
  # gr components for video trimmer
198
  input_file = gr.File(label="Select video file")
@@ -254,5 +371,118 @@ interface_wt = gr.Interface(fn=trim_video_wt, inputs=[input_file_wt, input_trans
254
  `Annotations`: a string that may be blank, representing any annotations for the utterance. \n\
255
  `Error Type`: a string that may be blank, representing any errors in the transcription of the utterance. ")
256
 
257
- demo = gr.TabbedInterface([interface_c, interface, interface_vtr, interface_wt ], ["Video Converter", "Video Trimmer", "Video Trimmer with Random Start Time", "Video Trimmer with Transcript"])
258
- demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  import random
7
  import gradio as gr
8
 
9
+ from utils import (HHMMSS_to_sec, convert_and_trim_video,
10
  sort_transcript, table_to_ELAN_tsv,
11
+ xlsx_to_table,
12
+ convert_transcript_for_TM, convert_transcript_for_annotation,
13
+ table_to_ELAN_tsv, ELAN_to_labels_csv, deidentify_speaker)
14
 
15
 
16
  def delete_files(files):
 
195
  gr.Error(f"Error: {str(e)}")
196
  return f"Error: {str(e)}"
197
 
198
def delete_files(output_filepath_list, trans_log_filepath_list, global_log_filepath):
    """Best-effort removal of generated output and log files.

    Removes every path in ``output_filepath_list`` and
    ``trans_log_filepath_list`` plus the single ``global_log_filepath``,
    silently ignoring paths that no longer exist on disk.

    NOTE(review): this redefines the earlier ``delete_files(files)`` handler
    in this module and shadows it -- confirm that is intentional.
    """
    all_paths = [*output_filepath_list, *trans_log_filepath_list, global_log_filepath]
    for path in all_paths:
        try:
            os.remove(path)
        except FileNotFoundError:
            pass
    print("Files deleted")
214
+
215
def delete_files_thread(output_filepath_list, trans_log_filepath_list, global_log_filepath):
    """Delayed cleanup worker: sleep 20 s, then delete the listed files.

    Presumably the delay gives the client time to download the generated
    files before they are removed -- TODO confirm.
    """
    print("Thread started")
    grace_period_sec = 20
    time.sleep(grace_period_sec)
    delete_files(output_filepath_list, trans_log_filepath_list, global_log_filepath)
219
+
220
def convert_xlsx_to_TMxlsx(input_file_list):
    """Gradio handler: convert uploaded transcripts to TalkMoves XLSX.

    Returns a tuple of (output file paths, per-file log paths, global
    transfer log path, error report string/list), and schedules all
    generated files for background deletion after a grace period.
    """
    uploaded_paths = [uploaded.name for uploaded in input_file_list]
    outputs, logs, errors, global_log = convert_transcript_for_TM(file_list=uploaded_paths)
    if not errors:
        errors = "No errors found."
    # Clean up the generated artifacts in the background once served.
    cleanup = threading.Thread(
        target=delete_files_thread,
        args=(outputs, logs, global_log),
    )
    cleanup.start()
    return outputs, logs, global_log, errors
231
+
232
def convert_for_annotation(input_file_list, annotation_scheme):
    """Convert each uploaded transcript into an annotation-ready XLSX.

    ``annotation_scheme`` is forwarded to convert_transcript_for_annotation
    (expected values, per the UI radio: 'CPS', 'TM', or None).
    Returns the list of converted file paths.
    """
    converted = []
    for transcript_path in input_file_list:
        print("start converting transcript")
        result = convert_transcript_for_annotation(
            file=transcript_path, annotation_scheme=annotation_scheme
        )
        print("finished converting transcript to xlsx for annotation")
        converted.append(result)
    return converted
240
+
241
+
242
def convert_xlsx_to_ELANtsv(input_file_list):
    """Convert each XLSX transcript into an ELAN-compatible TSV file.

    Returns the list of output paths produced by table_to_ELAN_tsv.

    NOTE(review): `old_xlsx_to_table` is not among the names imported from
    utils at the top of this file -- calling this handler will raise
    NameError at runtime. Confirm whether `xlsx_to_table` was intended.
    """
    output_files=[]
    for input_transcript in input_file_list:
        # convert transcript
        print("start converting transcript")
        table = old_xlsx_to_table(xl_file=input_transcript)
        print("finished converting transcript to table")
        # output path: same location, .tsv extension
        output_transcript = input_transcript.replace('.xlsx', '.tsv')
        output_file = table_to_ELAN_tsv(table, output_transcript)
        print("saved table to tsv")
        output_files.append(output_file)
    return output_files
254
+
255
+
256
+ #TODO: support sort and merge for XLSX output if this is needed
257
+
258
def convert_ELANtsv_to_CSV(input_file_list, merge_ellipsis=False):
    """Convert ELAN-exported TSV transcripts to standardized CSV files.

    Args:
        input_file_list: paths of ELAN .tsv/.txt export files.
        merge_ellipsis: if True, merge segments that were split on ellipsis.

    Returns:
        List of output file paths returned by ELAN_to_labels_csv.
    """
    output_files = []
    for input_transcript in input_file_list:
        print("start converting transcript")
        # ELAN_to_labels_csv derives the output path itself; the previously
        # precomputed '.csv' path was never used and has been removed.
        output_file = ELAN_to_labels_csv(input_transcript, merge_segments=merge_ellipsis)
        print("finish converting transcript")
        output_files.append(output_file)
    return output_files
268
+
269
+ # TODO: XLSX to csv (seg_labels or utt_labels)
270
def convert_xlsx_to_csv(input_file_list, merge_ellipsis=False):
    """Convert old-style XLSX transcripts (single Timecode column) to CSV.

    Args:
        input_file_list: paths of .xlsx transcript files.
        merge_ellipsis: if True, merge segments that were split on ellipsis.

    Returns:
        List of output CSV file paths.

    NOTE(review): `old_xlsx_to_labels_csv` is not among the names imported
    from utils at the top of this file -- confirm the intended helper name.
    """
    output_files = []
    for input_transcript in input_file_list:
        # old_xlsx_to_labels_csv derives the output path itself; the unused
        # precomputed output_transcript variable was removed.
        output_file = old_xlsx_to_labels_csv(input_transcript, merge_segments=merge_ellipsis)
        output_files.append(output_file)
    return output_files
279
+
280
def deidentify_transcripts(input_file_list, who='student'):
    """Replace speaker labels with deidentified labels in each transcript.

    Supports .xlsx/.xls/.csv/.tsv/.txt inputs; writes a sibling file with
    '_deidentified' inserted before the extension.

    Args:
        input_file_list: paths of transcript files.
        who: 'student' to deidentify only students, 'all' for all speakers
            (forwarded to deidentify_speaker).

    Returns:
        List of output file paths; inputs that fail (unsupported type, or
        deidentify_speaker raising ValueError) are skipped with a warning.
    """
    output_files = []
    for file in input_file_list:
        basename = os.path.basename(file)
        ext = file.split('.')[-1]
        if file.endswith(('.xlsx', '.xls')):
            df = pd.read_excel(file)
        elif file.endswith('.csv'):
            df = pd.read_csv(file)
        elif file.endswith(('.tsv', '.txt')):
            df = pd.read_csv(file, sep='\t')
        else:
            gr.Warning("File type not supported (must be .xlsx, .xls, .csv, .tsv, or .txt)")
            # BUGFIX: previously fell through with `df` unbound, so the next
            # statement raised NameError; skip unsupported files instead.
            continue
        try:
            df = deidentify_speaker(df, who=who)
        except ValueError as e:
            gr.Warning(f"{e}: {basename} ")
            continue
        output_file = file.replace(f'.{ext}', f'_deidentified.{ext}')
        if ext in ('xlsx', 'xls'):
            df.to_excel(output_file, index=False)
        elif ext == 'csv':
            df.to_csv(output_file, index=False)
        elif ext in ('tsv', 'txt'):
            df.to_csv(output_file, sep='\t', index=False)
        output_files.append(output_file)
    return output_files
309
+
310
+
311
+
312
+ ###### GRADIO INTERFACE ######
313
 
314
  # gr components for video trimmer
315
  input_file = gr.File(label="Select video file")
 
371
  `Annotations`: a string that may be blank, representing any annotations for the utterance. \n\
372
  `Error Type`: a string that may be blank, representing any errors in the transcription of the utterance. ")
373
 
374
+
375
#### TRANSCRIPT COMPONENTS ####
# gr components for TM converter: transcript -> XLSX with TalkMoves dropdowns
input_xlsx = gr.Files(label="Input XLSX or CSV transcript file", type="filepath", file_types=[".xlsx", ".csv"])
output_xlsx_tm = gr.Files(label="Output XLSX file", type="filepath", file_types=[".xlsx"])
process_log_tm = gr.File(label="Process Log", type="filepath", file_types=[".log", ".txt"] )
global_transfer_log_tm = gr.File(label="Global transfer log", type="filepath", file_types=[".log", ".txt"])
error_check_tm = gr.Textbox(label="Error Check", type="text")
interface_tm = gr.Interface(fn=convert_xlsx_to_TMxlsx,
                            inputs=input_xlsx,
                            outputs=[output_xlsx_tm, process_log_tm, global_transfer_log_tm, error_check_tm],
                            title="transcript-->XLSX+TM_dropdown",
                            description="Converts XLSX or csv transcript to XLSX+TM transcript with prefilled dropdown for talkmoves",
                            live=False,
                            allow_flagging="never",)

# gr components for xlsx to ELAN
input_x2e = gr.Files(label="Input XLSX or CSV transcript file", type="filepath", file_types=[".xlsx", ".csv"])
output_x2e = gr.Files(label="Output ELAN-compatible tsv file", type="filepath", file_types=[".tsv",'.txt'])
# process_log_x2e = gr.File(label="Process Log", type="filepath", file_types=[".log", ".txt"] )
# global_transfer_log_x2e = gr.File(label="Global transfer log", type="filepath", file_types=[".log", ".txt"])
# error_check_x2e = gr.Textbox(label="Error Check", type="text")
interface_x2e = gr.Interface(fn=convert_xlsx_to_ELANtsv, # TODO: swap out for correct fn
                             inputs=input_x2e,
                             outputs=output_x2e,
                             title="XLSX-->ELAN",
                             description="Converts XLSX transcript to ELAN-compatible tsv file",
                             live=False,
                             allow_flagging="never",)

# gr components for ELAN to CSV
input_e2c = gr.Files(label="Input ELAN-compatible tsv file", type="filepath", file_types=[".tsv",'.txt'])
merge_e2c = gr.Checkbox(label="Merge segments on ellipsis?")
output_e2c = gr.Files(label="Output CSV file", type="filepath", file_types=[".csv"])
interface_e2c = gr.Interface(fn=convert_ELANtsv_to_CSV, # TODO: swap out for correct fn
                             inputs=[input_e2c, merge_e2c],
                             outputs=[output_e2c],
                             title="ELAN-->CSV",
                             description="Converts ELAN-exported file (.txt or .tsv, tab separated values) to standardized CSV file with rows sorted by segment start time. Optionally merges segments on ellipsis.",
                             live=False,
                             allow_flagging="never",)

# gr components for XLSX to CSV
input_x2c = gr.Files(label="Input XLSX file", type="filepath", file_types=[".xlsx", ".csv"])
merge_x2c = gr.Checkbox(label="Merge segments on ellipsis?")
output_x2c = gr.Files(label="Output CSV file", type="filepath", file_types=[".csv"])
interface_x2c = gr.Interface(fn=convert_xlsx_to_csv, # TODO: swap out for correct fn
                             inputs=[input_x2c, merge_x2c],
                             outputs=[output_x2c],
                             title="XLSX-->CSV",
                             description="Converts old version XLSX transcript (with a single Timecode column) to standardized CSV file with rows sorted by segment start time. Optionally merges segments on ellipsis.",
                             live=False,
                             allow_flagging="never",)

# gr components for annotation XLSX (CSV -> XLSX with CPS/TM columns)
input_c2a = gr.Files(label="Input CSV file", type="filepath", file_types=[".csv"])
# Radio choices are (display label, value) pairs; value None means no scheme.
annotation_scheme_c2a = gr.Radio(label="Annotation Scheme", choices=[("CPS","CPS"), ("TalkMove","TM"),("None",None)])

output_c2a = gr.Files(label="Output XLSX file", type="filepath", file_types=[".xlsx"])
interface_c2a = gr.Interface(
    fn=convert_for_annotation, # TODO: swap out for correct fn
    inputs=[input_c2a, annotation_scheme_c2a],
    outputs=[output_c2a],
    title="CSV-->XLSX+annotation",
    description="Converts CSV file to XLSX file for annotation (added columns for CPS or TM or None)",
    live=False,
    allow_flagging="never",
    # submit_btn="Convert"
)

# gr components for deidentification
input_di = gr.Files(label="Input transcript file", type="filepath", file_types=[".xlsx", ".xls",".csv", ".tsv", ".txt"])
who_di = gr.Radio(label="Who to deidentify", choices=[("student","student"), ("all","all")])
output_di = gr.Files(label="Output deidentified transcript file", type="filepath", file_types=[".xlsx", ".xls",".csv", ".tsv", ".txt"])
interface_di = gr.Interface(
    fn=deidentify_transcripts,
    inputs=[input_di, who_di],
    outputs=[output_di],
    title="Deidentify",
    description="Deidentify speaker labels in a transcript. Compatible with .xlsx, .xls, .csv, .tsv, .txt files with a column containing speaker labels. Will not work if speaker column is missing a header. Speaker names or IDs will be replaced with a deidentified label numbered in order of appearance. Choose whether to deidentify just students or all speakers.",
    live=False,
    allow_flagging="never",
)



######## LAUNCH APP ########
# The two lists below are positional pairs: interface N gets tab label N.
demo = gr.TabbedInterface(
    [
        interface_e2c,
        interface_x2e,
        interface_x2c,
        interface_c2a,
        interface_tm,
        interface_di,
        interface_c,
        interface,
        interface_vtr,
        interface_wt
    ],
    [
        "📝→🗒️ ELAN→CSV",
        "❎→📝 XLSX→ELAN",
        "❎→🗒️ XLSX→CSV",
        "🗒️→❎☷ CSV→XLSX+annotation",
        "🗒️→❎💬 transcript→XLSX+TM_dropdown",
        "🗒️→🥷🏻 Deidentify",
        "🎥→📽 Video Converter",
        "🎥✂️ Video Trimmer",
        "🎥🎲 Video Trimmer with Random Start Time",
        "🎥🗒️✂️ Video Trimmer with Transcript"
    ]
)

demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)
utils.py CHANGED
@@ -7,6 +7,11 @@ from pathlib import Path
7
  import sys
8
  import gradio as gr
9
  import pandas as pd
 
 
 
 
 
10
 
11
  os.makedirs(f'{os.getcwd()}/logs', exist_ok=True)
12
  os.makedirs(f'{os.getcwd()}/results', exist_ok=True)
@@ -291,4 +296,624 @@ def convert_and_trim_video(media_in, media_out, start=None, end=None):
291
 
292
  except Exception as e:
293
  print(f"Error converting video format: {e}")
294
- gr.Error(f"Error converting video format: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  import sys
8
  import gradio as gr
9
  import pandas as pd
10
+ from pathlib import Path
11
+ import nltk
12
+ from openpyxl import Workbook
13
+ from openpyxl.utils.dataframe import dataframe_to_rows
14
+ from openpyxl.worksheet.datavalidation import DataValidation
15
 
16
  os.makedirs(f'{os.getcwd()}/logs', exist_ok=True)
17
  os.makedirs(f'{os.getcwd()}/results', exist_ok=True)
 
296
 
297
  except Exception as e:
298
  print(f"Error converting video format: {e}")
299
+ gr.Error(f"Error converting video format: {e}")
300
+
301
+
302
+
303
+
304
+
305
+ ###### TRANSCRIPT UTILS ######
306
+
307
def convert_transcript_for_TM(file_list):
    """Convert transcripts for TalkMoves (TM) annotation.

    Input can be .xlsx or .csv transcript files. Handles either separate
    start and end time columns (in seconds) or a single timecode column;
    output rows carry a single "HH:MM:SS.sss - HH:MM:SS.sss" timestamp
    string. Each turn is sentence-tokenized; sentences that consist entirely
    of a bracketed marker (e.g. "[inaudible]") are optionally dropped, with
    care taken not to erase speaker context switches. Results are written as
    XLSX files with dropdown data validation for Teacher_TM / Student_TM.

    Args:
        file_list: list of paths to .xlsx or .csv transcript files.

    Raises:
        gr.Error: if a file has an unsupported extension, or if the very
            first turn of a file has an empty speaker.

    Returns:
        Tuple (output_filepath_list, trans_log_filepath_list, error_message,
        global_log_filepath): converted XLSX paths, per-file log paths, a
        list of warning/error strings, and the global statistics log path.

    NOTE(review): `re`, `json`, and `sent_tokenize` are used below but do
    not appear in the imports visible at the top of this file -- presumably
    imported elsewhere (e.g. `from nltk.tokenize import sent_tokenize`);
    confirm before relying on this function.
    NOTE(review): an empty `file_list` would raise NameError at the
    `global_log_filepath` construction below (`filepath` unbound).
    """


    # Regular expression pattern for matching speaker names and timecodes.
    # NOTE(review): the [...] here is a character CLASS, not alternation --
    # it matches any single character from the listed words; presumably
    # alternation (?:UI|ui|Inaudible|...) was intended. Confirm.
    bracket_re = re.compile(r'(?:\[[UI|ui|Inaudible|inaudible|overlapping speech|VIDEO SILENCE|teacher explaining in background].*\]\W{0,2})')
    # Regular expression pattern for matching anything enclosed in square brackets.
    all_bracket_re = re.compile(r'(?:\[.*\]\W{0,2})')
    # whether to remove the inaudible markers
    do_remove_inaudible = True
    # whether to keep context switches (empty turn that changes speaker)
    do_keep_context_switch = True
    # whether to convert to timestamp if start and end time are in seconds and in separate columns
    convert_to_timestamp = True

    error_message = [] # List of error messages to be displayed to the user.
    global_stat_dict = {} # Dictionary of global statistics.
    output_filepath_list = [] # List of output file paths.
    trans_log_filepath_list = [] # List of transcription log file paths.
    for file in file_list:
        filename = file.split('/')[-1] # Get the filename from the file.
        filepath = os.path.dirname(file) # Get the file path from the file.
        # Read the file into a Pandas DataFrame depending on its file format.
        if filename.endswith('.xlsx'):
            df = pd.read_excel(file, index_col=0)
            output_filename = f"{filename[:-5]}" + "_TMcoded.xlsx"
        elif filename.endswith('.csv'):
            # NOTE(review): `error_bad_lines` was deprecated in pandas 1.3
            # and removed in 2.0 (use on_bad_lines='skip'); confirm the
            # pinned pandas version supports it.
            df = pd.read_csv(file, index_col=0, error_bad_lines=False)
            output_filename = f"{filename[:-4]}" + "_TMcoded.xlsx"

        else:
            raise gr.Error(f"{file} format is wrong")

        # Remove the "Copy of" prefix from the output filename, if present.
        if output_filename.startswith("Copy of "):
            output_filename = output_filename[8:]

        # Remove the word "_Transcript" from the output filename, if present.
        if '_Transcript' in output_filename:
            # print("before: "+output_filename)
            error_message.append("before: "+output_filename)
            output_filename = ''.join(output_filename.split('_Transcript'))
            # print("after: "+output_filename)
            error_message.append("after: "+output_filename)

        # Construct the output file and transcription log file paths.
        output_filepath = os.path.join(filepath, output_filename)
        trans_log_filepath = os.path.join(filepath, f"{output_filename}"+ ".log")

        # Open the transcription log file for writing.
        with open(trans_log_filepath, "w") as outfile:
            # Per-file counters for the statistics block written at the end.
            sub_cnt_in_file = 0
            empty_speaker_cnt_in_file = 0
            turn_skipped_in_file = 0
            turn_skipped_speaker_switch_in_file = 0
            snt_mark_skip_in_file = 0
            snt_skipped_in_file = 0
            chat_flag_in_speaker_time_line = 0
            chat_flag_in_content_line = 0
            all_inaudible_in_file = 0
            all_bracket_in_file = 0
            all_snts_in_file = 0
            all_token_cnt_in_file = 0
            # Example source row layout:
            #index Timecode Duration Speaker Dialogue Annotations Error Type
            #1 00:00:05:04 - 00:00:07:12 00:00:02:08 Tutor Did you... How was your Halloween?
            turns = []
            time_stamps = []
            speakers = []
            chat_flags = []  # NOTE(review): collected nowhere below; appears unused.
            sentences = []
            snt_ids = []

            ## parse the df flexibly: find key column names which might vary depending on transcript source
            # set all column names to lowercase
            df.columns = map(str.lower, df.columns)
            # several possibilities for column names, detect which are present
            uttID_keys = ['utt','seg','utt_id','seg_id','index']
            speaker_keys = ['speaker']
            start_keys=['start_sec','start','start_time','timestart']
            end_keys=['end_sec','end','end_time','timeend']
            timestamp_keys = ['timecode','timestamp']
            content_keys=['dialogue','utterance','transcript','text']
            # detect which is used in this df
            uttID_key = next((key for key in uttID_keys if key in df.columns), None)
            speaker_key = next((key for key in speaker_keys if key in df.columns), None)
            content_key = next((key for key in content_keys if key in df.columns), None)
            # check if separate start and end times are present, otherwise assume single timecode column
            if any(df.columns.isin(start_keys)):
                start_key = next((key for key in start_keys if key in df.columns), None)
                end_key = next((key for key in end_keys if key in df.columns), None)
                time_format = 'seconds'
                if convert_to_timestamp:
                    # convert to timestamp format HH:MM:SS.sss - HH:MM:SS.sss
                    df['timecode'] = df.apply(lambda x: f"{sec_to_HHMMSS(x[start_key])} - {sec_to_HHMMSS(x[end_key])}", axis=1)
                    timestamp_key='timecode'
                    time_format = 'timestamp'
            else:
                timestamp_key=next((key for key in timestamp_keys if key in df.columns), None)
                time_format = 'timestamp'
            # Turn started with 1, the same as molly's transcripts
            for i, row in df.iterrows():
                turn = row[uttID_key] if uttID_key else i+1
                speaker = row[speaker_key]
                time_str = row[timestamp_key]
                content = "" if pd.isna(row[content_key]) else row[content_key].strip("\n")
                # when speaker is empty, use the previous speaker
                if speaker == "":
                    if speakers:
                        speaker = speakers[-1]
                        empty_speaker_cnt_in_file += 1
                        outfile.write(f"{turn}: found empty speaker, use the speaker in previous turn: {speaker}\n")
                    else:
                        raise gr.Error(f"{row}, the first turn is empty speaker")

                # clean after the sentence tokenize
                snts = sent_tokenize(content)
                all_snts_in_file += len(snts)
                snt_skipped_in_turn = 0
                # NOTE(review): this inner `i` shadows the row index above;
                # harmless today but fragile.
                for i, snt in enumerate(snts):
                    remove_flag = False
                    inaudible_search = re.findall(bracket_re, snt)
                    if inaudible_search:
                        all_inaudible_in_file += len(inaudible_search)
                        outfile.write(f"{turn}, {inaudible_search}, inaudible found in snt: {snt}\n")

                    all_bracket_search = re.findall(all_bracket_re, snt)
                    if all_bracket_search:
                        all_bracket_in_file += len(all_bracket_search)
                        outfile.write(f"{turn}, {all_bracket_search} bracket found in snt: {snt}\n")

                    # only remove the [inaudible xxx] when it is the whole sentence.
                    inaudible_match = re.fullmatch(bracket_re, snt)

                    if inaudible_match:
                        if do_keep_context_switch:
                            # if keeping context switches
                            if speakers and speaker == speakers[-1]:
                                # share the same speaker, no context switching, just remove it
                                remove_flag = True
                            else:
                                # different speakers, it is the context switching.
                                if len(snts) == 1:
                                    # current empty sentence is the only single sentence
                                    remove_flag = False
                                else:
                                    if i != len(snts)-1:
                                        # current empty utterance is not the last one, just delete it
                                        remove_flag = True
                                    else:
                                        # current empty utterance is the last one, keep it.
                                        if snt_skipped_in_turn == len(snts)-1:
                                            # all previous snts are empty, then keep this to not skip the whole turn
                                            remove_flag = False
                                        else:
                                            remove_flag = True
                        else:
                            # if not keeping context switches, then simply remove all empty utterances
                            remove_flag = True

                    # If remove_flag is true:
                    if remove_flag:
                        # Increment sub_cnt_in_file and snt_mark_skip_in_file
                        sub_cnt_in_file += 1
                        snt_mark_skip_in_file += 1
                        # Write the following message to outfile:
                        outfile.write(f"{turn}, sub happend: {snt}, skip this sentence\n")
                        # If do_remove_inaudible is true:
                        if do_remove_inaudible:
                            snt_skipped_in_file += 1
                            snt_skipped_in_turn += 1
                            continue

                    # Add to pd:
                    # Append turn to turns list
                    turns.append(turn)
                    # Set snt_id to the string f"{turn}.{i}"
                    snt_id = f"{turn}.{i}"
                    # Append time_str to time_stamps list
                    time_stamps.append(time_str)
                    # Append speaker to speakers list
                    speakers.append(speaker)
                    # Set sentence to the string representation of snt, with whitespace removed from the start and end
                    sentence = str(snt).strip().rstrip("\n")
                    # Calculate the number of tokens in sentence and add to all_token_cnt_in_file
                    token_cnt = len(nltk.word_tokenize(sentence))
                    all_token_cnt_in_file += token_cnt
                    # Append snt_id to snt_ids list
                    snt_ids.append(snt_id)
                    # Append sentence to sentences list
                    sentences.append(sentence)

                if snt_skipped_in_turn == len(snts):
                    # all snts in turn are skipped, then skip the turn
                    turn_skipped_in_file += 1
                    if (speakers and speaker != speakers[-1]) or not speakers:
                        turn_skipped_speaker_switch_in_file += 1
                    outfile.write(f"{turn}, since all snts are empty, skip this whole turn {content}\n")
            # Create a new DataFrame with the following columns:
            new_df = pd.DataFrame({
                "Sentence_ID": snt_ids, # A
                "TimeStamp": time_stamps, #B
                "Turn" : turns, #C
                "Speaker" : speakers, #D
                "Sentence" : sentences #E
            })

            # assert turn_skipped_speaker_switch_in_file==0, "Some speaker switch turn skipped"
            new_df["Teacher_TM"] = None #F
            new_df["Student_TM"] = None #G

            # write new_df to xlsx file
            # NOTE(review): this write is immediately overwritten by the
            # openpyxl wb.save below; presumably redundant -- confirm.
            new_df.to_excel(output_filepath, index=False)


            # https://openpyxl.readthedocs.io/en/latest/api/openpyxl.utils.dataframe.html#openpyxl.utils.dataframe.dataframe_to_rows
            # Rebuild the workbook with list-type data validation (dropdowns)
            # on the Teacher_TM (col F) and Student_TM (col G) columns.
            wb = Workbook()
            ws = wb.active
            teacher_dv = DataValidation(type="list", formula1='",1-None,2-Keep-Together,3-Getting-Student-to-Relate,4-Restating,5-Revoicing,6-Context,7-Press-for-Accuracy,8-Press-for-Reasoning"', allow_blank=True)
            student_dv = DataValidation(type="list", formula1='",1-None,2-Relate-to-Another-Student,3-Asking-for-More-info,4-Making-a-Claim,5-Providing-Evidence/Reasoning"', allow_blank=True)
            ws.add_data_validation(teacher_dv)
            ws.add_data_validation(student_dv)
            teacher_dv.add('F2:F1048576')
            student_dv.add('G2:G1048576')
            for r in dataframe_to_rows(new_df, index=False, header=True):
                ws.append(r)
            wb.save(output_filepath)

            stat_dict = {
                "chat_flag_in_speaker_time_line": chat_flag_in_speaker_time_line,
                "chat_flag_in_content_line": chat_flag_in_content_line,
                "empty_speaker_cnt_in_file": empty_speaker_cnt_in_file,
                "ori_total_turn": df.shape[0],
                "ori_total_snt": all_snts_in_file,
                "turn_skipped": turn_skipped_in_file,
                "turn_skipped_speaker_switch_in_file": turn_skipped_speaker_switch_in_file,
                "snt_skipped": snt_skipped_in_file,
                "remaining_snt": all_snts_in_file - snt_skipped_in_file,
                "all_token_cnt_in_file": all_token_cnt_in_file,
                # NOTE(review): ZeroDivisionError if every sentence in the
                # file was skipped -- confirm inputs make that impossible.
                "avg_token_cnt_per_snt": all_token_cnt_in_file/(all_snts_in_file - snt_skipped_in_file),
                "sub_cnt_in_file": sub_cnt_in_file,
                "all_inaudible_in_file": all_inaudible_in_file,
                "all_bracket_in_file": all_bracket_in_file,
                "other_bracket_in_file": all_bracket_in_file - all_inaudible_in_file
            }
            if all_inaudible_in_file != all_bracket_in_file:
                # print(f"(unknown) has special brakets")
                error_message.append(f"Warning: (unknown) has special brakets")
            for k, v in stat_dict.items():
                global_stat_dict[k] = global_stat_dict.get(k,0) + v
            outfile.write(f"{output_filepath}, {json.dumps(stat_dict, indent=4)}")

        output_filepath_list.append(output_filepath)
        trans_log_filepath_list.append(trans_log_filepath)

    # Averages were summed per-file above; divide by file count here.
    for k, v in global_stat_dict.items():
        if "avg" in k:
            global_stat_dict[k] = global_stat_dict[k]/len(file_list)
    # `filepath` is the directory of the LAST processed file.
    global_log_filepath = os.path.join(filepath, "global_transfer"+ ".log")
    with open(global_log_filepath, "w") as outfile:
        outfile.write(f"global_stat_dict: {json.dumps(global_stat_dict, indent=4)}")

    # error_check
    if global_stat_dict["all_inaudible_in_file"] != global_stat_dict["all_bracket_in_file"]:
        error_message.append("Error: 'all_inaudible_in_file' does not match 'all_bracket_in_file'")
    if global_stat_dict["other_bracket_in_file"] != 0:
        error_message.append("Error: 'other_bracket_in_file' is not zero")

    return output_filepath_list, trans_log_filepath_list, error_message, global_log_filepath
589
+
590
+
591
+
592
def add_CPS_columns(df):
    """Append the empty CPS annotation columns (plus Notes) to df, in order.

    Mutates and returns df with each new column initialized to ''.
    """
    annotation_columns = (
        'Observation', 'Instructions',
        'CONST_SharesU_Situation', 'CONST_SharesU_CorrectSolutions',
        'CONST_SharesU_IncorrectSolutions', 'CONST_EstablishesCG_Confirms',
        'CONST_EstablishesCG_Interrupts', 'NEG_Responds_Reasons',
        'NEG_Responds_QuestionsOthers', 'NEG_Responds_Responds',
        'MAINTAIN_Initiative_Criticizes', 'NEG_MonitorsE_Results',
        'NEG_MonitorsE_GivingUp', 'NEG_MonitorsE_Strategizes',
        'NEG_MonitorsE_Save', 'MAINTAIN_Initiative_Suggestions',
        'MAINTAIN_Initiative_Compliments', 'MAINTAIN_FulfillsR_InitiatesOffTopic',
        'MAINTAIN_FulfillsR_JoinsOffTopic', 'MAINTAIN_FulfillsR_Support',
        'MAINTAIN_FulfillsR_Apologizes', 'Notes',
    )
    for column_name in annotation_columns:
        df[column_name] = ''
    return df
599
+
600
def add_TM_columns(df):
    """Append empty Teacher_TM and Student_TM annotation columns to df.

    Mutates and returns df with both new columns initialized to ''.
    """
    for column_name in ('Teacher_TM', 'Student_TM'):
        df[column_name] = ''
    return df
606
+
607
+
608
+
609
def convert_transcript_for_annotation(file, annotation_scheme=None):
    """Convert a transcript for annotation.

    Input: a standard CSV transcript file (parsed via parse_label_csv).
    Output XLSX has separate TimeStart/TimeEnd timestamps in HH:MM:SS.sss
    format; a recordingID column inferred from the transcript filename; and,
    depending on `annotation_scheme` ('CPS', 'TM', or None), extra empty
    columns for CPS or TalkMoves annotators.

    Returns the output XLSX path; raises gr.Error on any failure.

    NOTE(review): the literal "(unknown)" in the output filenames below
    looks like a redacted placeholder -- presumably the original code
    interpolated the transcript filename there; confirm against the repo.
    """
    filename,ext = os.path.splitext(os.path.basename(file)) # Get the filename from the file.
    filepath = os.path.dirname(file) # Get the file path from the file.
    # Read the file into a Pandas DataFrame depending on its file format.
    try:
        table = parse_label_csv(file)
        media_filename = get_sessname_from_filename(filename)
        out_df=table.copy()
        out_df['recordingID']=media_filename
        # Convert second-based times to HH:MM:SS.sss timestamp strings.
        out_df['TimeStart']=out_df['start_sec'].apply(sec_to_HHMMSS)
        out_df['TimeEnd']=out_df['end_sec'].apply(sec_to_HHMMSS)
        out_df=out_df[['speaker','TimeStart','TimeEnd','utterance','recordingID','uttID']]
        if annotation_scheme=='CPS':
            out_df=add_CPS_columns(out_df)
            output_file = os.path.join(filepath, f"CPS_(unknown).xlsx")
            out_df.to_excel(output_file, index=False)
        elif annotation_scheme=='TM':
            out_df=add_TM_columns(out_df)
            output_file = os.path.join(filepath, f"TM_(unknown).xlsx")
            out_df.to_excel(output_file, index=False)
        else:
            # No annotation scheme: write the standardized columns only.
            output_file = os.path.join(filepath, f"(unknown).xlsx")
            out_df.to_excel(output_file, index=False)
        return output_file
    except Exception as e:
        # Surface any failure to the Gradio UI as an error dialog.
        raise gr.Error(f"(unknown): error {e}")
641
+
642
+
643
def sec_to_HHMMSS(seconds):
    """Format a duration in seconds as an 'HH:MM:SS.sss' timestamp string.

    Args:
        seconds: duration in seconds (int, float, or numeric string).

    Returns:
        Timestamp string 'HH:MM:SS.sss'; hours are zero-padded to two
        digits but not capped (e.g. 100 hours formats as '100:00:00.000').
    """
    # BUGFIX: round to millisecond precision BEFORE splitting into fields,
    # so values such as 59.9999 roll over to "00:01:00.000" instead of the
    # previous invalid "00:00:60.000".
    total_ms = round(float(seconds) * 1000)
    s, ms = divmod(total_ms, 1000)
    m, s = divmod(s, 60)
    h, m = divmod(m, 60)
    return f"{h:02d}:{m:02d}:{s:02d}.{ms:03d}"
651
+
652
+
653
def readELANtsv(file, fmt=None):
    """Read an ELAN tab-separated export into a unified labels DataFrame.

    ELAN exports arrive in many column layouts (4, 5, 6, 9, 10 or 12
    columns); the layout is inferred per-row from the column count.

    Args:
        file: path to the tab-separated ELAN export.
        fmt: optional format hint; 'AUG23' marks the 5-column variant whose
            blank column is first rather than second.

    Returns:
        pd.DataFrame with columns ('seg', 'speaker', 'utterance',
        'start_sec', 'end_sec'), sorted by start time, 'seg' being the
        row order after sorting.

    Raises:
        ValueError: on a row with an unrecognised number of columns.
    """
    with open(file) as in_file:

        reader = csv.reader(in_file, delimiter="\t")

        # Count leading header rows: real ELAN data rows have >= 4 columns.
        skiprows = 0
        row = next(reader)

        while not len(row) >= 4:  # 4 being the min number of cols ELAN exports have
            skiprows += 1
            row = next(reader)

        # BUGFIX: seek() takes a BYTE offset, not a row count, so the old
        # in_file.seek(skiprows) landed mid-line whenever header rows were
        # present. Rewind to the start of the file and skip the detected
        # header rows by reading them through the csv reader instead.
        in_file.seek(0)

        if skiprows > 0:
            print(f'Detected {skiprows} header rows to skip')
        reader = csv.reader(in_file, delimiter="\t")
        for _ in range(skiprows):
            next(reader)

        labels = []  # transcript with speaker labels and timestamp in sec

        for i, utt in enumerate(reader):
            if not ''.join(utt).strip():  # skip blank lines
                continue
            try:
                if len(utt) == 5:  # IF data comes straight from ELAN sometimes there is a superfluous blank column 2
                    if i == 0:
                        print('detected extra blank column in first row, will remove')
                    if fmt == 'AUG23':
                        if i == 0:
                            print('detected extra blank 1st column, will remove')
                        _, speaker, start_HHMMSS, end_HHMMSS, utterance = utt
                        convert_timestamps = True
                    else:
                        if i == 0:
                            print('detected extra blank 2nd column, will remove')
                        speaker, _, start_HHMMSS, end_HHMMSS, utterance = utt
                        convert_timestamps = True
                elif len(utt) == 4:  # sometimes the blank col is already removed
                    if i == 0:
                        print('detected 4 columns, assuming: speaker,start_HHMMSS, end_HHMMSS, utterance ')
                    speaker, start_HHMMSS, end_HHMMSS, utterance = utt
                    convert_timestamps = True
                elif len(utt) == 6:  # New one from 2023 Aug has a redundant extra start col!?
                    if i == 0:
                        print('detected 6 columns, assuming: _,speaker,start_HHMMSS, end_HHMMSS, utterance,_ ')
                    _, speaker, start_HHMMSS, end_HHMMSS, utterance, _ = utt
                    convert_timestamps = True
                elif len(utt) == 9:  # 2023 transcribers tend to give full elan output
                    if i == 0:
                        print('detected 9 columns, assuming: speaker,_,start_HHMMSS,_,end_HHMMSS,_,_,_,utterance ')
                    speaker, _, start_HHMMSS, _, end_HHMMSS, _, _, _, utterance = utt
                    convert_timestamps = True
                elif len(utt) == 10:  # sometimes an extra blank column appears at the end
                    if i == 0:
                        print('detected 10 columns, assuming: speaker,_,start_HHMMSS,_,end_HHMMSS,_,_,_,utterance,_ ')
                    speaker, _, start_HHMMSS, _, end_HHMMSS, _, _, _, utterance, _ = utt
                    convert_timestamps = True
                elif len(utt) == 12:  # WOw how many redundant columns can ELAN make...
                    if i == 0:
                        print('detected 12 columns, assuming: speaker,_,start_HHMMSS,_,_,end_HHMMSS,_,_,_,_,_,utterance ')
                    speaker, _, start_HHMMSS, _, _, end_HHMMSS, _, _, _, _, _, utterance = utt
                    convert_timestamps = True

                else:
                    raise ValueError(f'Unknown transcript format with {len(utt)} columns for {file}')
            except BaseException as err:
                print(f'!!! transcript parse error on line {i} for {file}')
                print(utt)
                raise err
            if convert_timestamps:
                # every recognised layout carries HH:MM:SS timestamps
                start_sec = HHMMSS_to_sec(start_HHMMSS)
                end_sec = HHMMSS_to_sec(end_HHMMSS)

            labels.append((speaker, utterance, start_sec, end_sec))
    labels = pd.DataFrame(labels, columns=('speaker', 'utterance', 'start_sec', 'end_sec'))
    labels.sort_values(by='start_sec', inplace=True, ignore_index=True)
    labels.reset_index(inplace=True)
    labels = labels.rename(columns={'index': 'seg'})

    return labels
734
+
735
+
736
def merge_ellipsis(seg_labels):
    """Merge consecutive same-speaker segments whose boundary is marked '...'.

    Transcribers split long utterances into segments, ending the first part
    with '...' and starting the continuation with '...'. This joins such runs
    back into single utterances, keeping the first start and last end time.

    Args:
        seg_labels: path to a csv/tsv/txt file or a pd.DataFrame with columns
            [optional seg index,] speaker, utterance, start_sec, end_sec.

    Returns:
        pd.DataFrame with columns 'utt' (new utterance index), 'seg' (list of
        original segment indices merged into each utterance), plus speaker,
        utterance and timing columns. Runs of dots in the utterance text are
        replaced by a single space.
    """
    # merge utterances with ellipsis
    # input is seg_labels format: [optional index] speaker, utterance, start_sec, end_sec
    if isinstance(seg_labels,str) and seg_labels.endswith(('.csv','.tsv','.txt')):
        df=pd.read_csv(seg_labels)
    elif isinstance(seg_labels, pd.DataFrame):
        df=seg_labels
    else:
        raise ValueError('input seg_labels should be path to csv or pd.DataFrame')

    if len(df.columns)==4:
        # no seg index yet
        df.reset_index(inplace=True)
        df = df.rename(columns = {'index':'seg'})
    elif len(df.columns)==5:
        # first col is seg
        df.columns = ['seg','speaker','utterance','start_sec','end_sec']
    else:
        raise ValueError('input seg_labels should have 4 or 5 columns')
    df2=[]
    # NOTE(review): prev_spk/prev_utt/prev_start/prev_end are never read;
    # the running state lives entirely in merge_utt and segs.
    prev_spk=None
    prev_utt=""
    prev_start=0
    prev_end=0
    segs=[0]
    merge_utt={"seg":None, "speaker":None,"utterance":None,"start_sec":None, "end_sec":None}
    for i,row in df.iterrows():
        if i==0:
            # first row seeds the pending merged utterance
            merge_utt=row

        else:
            # if same speaker as last and ellipsis
            if merge_utt["speaker"]==row["speaker"] and str(merge_utt["utterance"]).endswith('...') and str(row["utterance"]).startswith('...'):
                # append current to temporary merged utt: use prev_ items

                merge_utt["utterance"]+=str(row["utterance"])
                merge_utt["end_sec"]=row["end_sec"]
                segs.append(row["seg"])
            else:
                # run ended: flush the pending utterance and start a new one
                # append merge_utt to df2
                merge_utt["seg"]=segs
                df2.append(merge_utt)
                # clear merge_utt and set to current
                merge_utt=row
                segs=[merge_utt["seg"]]

    merge_utt["seg"]=segs
    # if not isinstance(merge_utt["seg"],list):
    #     merge_utt["seg"]=list(segs)
    df2.append(merge_utt) # catch final merge_utt if not terminated

    df2=pd.DataFrame(df2)
    # collapse any run of dots ('...', '......') left in the joined text
    df2['utterance']=df2['utterance'].str.replace('\.+',' ', regex=True)

    # clear up "......"
    # enumerate utterances
    df2.reset_index(inplace=True,drop=True)
    df2 = df2.reset_index().rename(columns = {'index':'utt'})
    return df2
795
+
796
+
797
+
798
def add_dummy_seg_column(table):
    """Ensure *table* has 'utt' and 'seg' columns in the unified label format.

    Label files produced by merge_ellipsis() carry an 'utt' utterance index
    and a 'seg' column listing the original segments behind each utterance;
    files from other sources may lack one or both. This fills in a trivial
    one-segment-per-utterance 'seg' column so every label file shares the
    exact same schema.

    Args:
        table: pd.DataFrame with at least speaker/start_sec/end_sec/utterance
            columns, optionally 'uttID', 'utt' or 'seg'.

    Returns:
        pd.DataFrame with columns ['utt','seg','speaker','start_sec',
        'end_sec','utterance']; returned unchanged if 'seg' already exists.
    """
    existing = table.columns.tolist()
    if 'seg' in existing:
        print('\'seg\' column already exists, not changing anything')
        return table
    if 'uttID' in existing:
        table = table.rename(columns={"uttID": "utt"})
    if 'utt' not in table.columns.tolist():
        table['utt'] = table.index
    # each utterance maps to a singleton list of its own id
    table['seg'] = [[utt_id] for utt_id in table['utt']]
    table = table[['utt', 'seg', 'speaker', 'start_sec', 'end_sec', 'utterance']]

    return table
817
+
818
+
819
def get_sessname_from_filename(filename):
    """Derive a session name from a transcript/label file path.

    Strips the extension, then removes every known transcript-pipeline
    prefix/suffix decoration (case-insensitively, longest variants first).

    Args:
        filename: path or bare filename of a transcript/label file.

    Returns:
        The bare session name string.
    """
    # order matters: longer compound decorations must be stripped before
    # their shorter substrings (e.g. before plain 'reworked_')
    decorations = (
        'reworked-transcript-diarized-timestamped-',
        'reworked_transcript-diarized-timestamped-',
        'reworked-diarized-timestamped-',
        'reworked_timestamped_',
        'reworked_',
        'reworked-',
        'transcript_diarized_timestamped_',
        'transcript-diarized-timestamped_',
        'transcript-diarized-timestamped-',
        '_transcript',
        '_tmcoded',
        'utt_labels_',
        'seg_labels_',
        '_redacted',
    )
    sessname = Path(filename).stem
    for pattern in decorations:
        sessname = re.sub(pattern, '', sessname, flags=re.I)
    return sessname
836
+
837
+
838
def ELAN_to_labels_csv(ELANfile, merge_segments = True):
    """Convert an ELAN tsv export into a unified labels csv.

    Args:
        ELANfile: path to the ELAN tab-separated export.
        merge_segments: when True, join ellipsis-split segments into whole
            utterances via merge_ellipsis() and save as utt_labels_<sess>.csv;
            otherwise save the raw segments as seg_labels_<sess>.csv.

    Returns:
        The name of the csv file written (in the current directory).
    """
    # dumb but effective string wrangling to get sess name
    session = get_sessname_from_filename(ELANfile)

    # reads ELAN output to pd.DataFrame in a unified format
    table = readELANtsv(ELANfile)

    if merge_segments:
        # merge segments to form utterances where there have been splits separated by '...'
        table = merge_ellipsis(table)
        prefix = 'utt_labels_'
    else:
        prefix = 'seg_labels_'

    save_file = f'{prefix}{session}.csv'
    table.to_csv(save_file, index=False, float_format='%.3f')
    return save_file
854
+
855
+
856
def parse_label_csv(label_csv:str):
    """Parse a diarized, timestamped transcript csv into the unified table.

    Label csvs come in several variants: with or without a header row, and
    with 4-6 columns (optionally carrying 'seg' and/or 'utt' index columns).
    A header is assumed when the first row contains no purely numeric cell.

    Args:
        label_csv: path to the label csv file.

    Returns:
        pd.DataFrame with columns [uttID, speaker, start_sec, end_sec,
        utterance], or None when a headerless file's column count is
        unrecognised.
    """
    table = pd.read_csv(label_csv, keep_default_na=False, header=None)
    first_row = table.iloc[0]

    # header row <=> no cell in the first row looks like a number
    has_header = not any(str(cell).replace('.', '').isdigit() for cell in first_row)
    if has_header:
        table.columns = first_row.tolist()
        table = table.iloc[1:].reset_index(drop=True)
    else:
        ncols = len(table.columns)
        if ncols == 4:
            print('no header detected, assuming annotation file has columns [speaker,utterance,start_sec, end_sec] ')
            table.columns = ['speaker','utterance','start_sec', 'end_sec']
        elif ncols == 5:
            print('no header detected, assuming annotation file has columns [seg,speaker,utterance,start_sec, end_sec] ')
            table.columns = ['seg','speaker','utterance','start_sec', 'end_sec']
        elif ncols == 6:
            print('no header detected, assuming annotation file has columns [utt,seg,speaker,utterance,start_sec, end_sec] ')
            table.columns = ['utt','seg','speaker','utterance','start_sec', 'end_sec']
        else:
            print(f'no header detected, csv has {ncols} columns, could not determine column names.')
            return None

    # choose which column to use for uttID: prefer 'utt', then 'seg',
    # otherwise fall back to the row index
    names = table.columns.tolist()
    if 'utt' in names:
        table = table.rename(columns={"utt": "uttID"}).drop('seg', axis=1)
    elif 'seg' in names:
        table = table.rename(columns={"seg": "uttID"})
    else:
        table = table.reset_index().rename(columns={"index": "uttID"})

    return table[['uttID', 'speaker', 'start_sec', 'end_sec', 'utterance']]
894
+
895
+
896
def deidentify_speaker(df, who='all'):
    """Replace speaker IDs with generic labels, in order of appearance.

    All speakers become speaker_1, speaker_2, ...; with who='student' only
    student names (containing 'student' or matching the numeric 00-0000 ID
    pattern) are replaced, becoming student_1, student_2, ... The speaker
    column is located among the common names speaker/Speaker/speaker_id/
    Speaker_ID and modified in place.

    Args:
        df: pd.DataFrame holding a speaker column.
        who (str, optional): 'all','student'. Which names to replace. Defaults to 'all'.

    Returns:
        The same DataFrame with its speaker column de-identified.

    Raises:
        ValueError: if no recognised speaker column is present.
    """
    speaker_key = None
    for candidate in ['speaker', 'Speaker', 'speaker_id', 'Speaker_ID']:
        if candidate in df.columns.tolist():
            speaker_key = candidate
            break
    if not speaker_key:
        raise ValueError('No speaker column found in dataframe!')

    speakers = df[speaker_key].unique()
    if who == 'student':
        # detect student. ID format can be student_xxx or 00-0000 numeric
        speakers = [s for s in speakers
                    if 'student' in s.lower() or re.match(r'^\d{2}-\d{4}$', s)]
        mapping = {name: f'student_{n + 1}' for n, name in enumerate(speakers)}
    else:
        mapping = {name: f'speaker_{n + 1}' for n, name in enumerate(speakers)}

    df[speaker_key] = df[speaker_key].replace(mapping)
    return df