rosyvs committed
Commit 55f82cb · 1 Parent(s): d9fb961

Remove transcript_app.py and transcript_utils.py files

Files changed (3)
  1. app.py +7 -7
  2. transcript_app.py +0 -225
  3. transcript_utils.py +0 -745
app.py CHANGED
@@ -382,7 +382,7 @@ error_check_tm = gr.Textbox(label="Error Check", type="text")
 interface_tm = gr.Interface(fn=convert_xlsx_to_TMxlsx,
                             inputs=input_xlsx,
                             outputs=[output_xlsx_tm, process_log_tm, global_transfer_log_tm, error_check_tm],
-                            title="transcript-->XLSX+TM_dropdown",
+                            title="transcript-->XLSX+TM",
                             description="Converts XLSX or csv transcript to XLSX+TM transcript with prefilled dropdown for talkmoves",
                             live=False,
                             allow_flagging="never",)
@@ -475,13 +475,13 @@ demo = gr.TabbedInterface(
     "📝→🗒️ ELAN→CSV",
     "⎍→📝 XLSX→ELAN",
     "⎍→🗒️ XLSX→CSV",
-    "🗒️→⎍☷ CSV→XLSX+annotation",
-    "🗒️→⎍💬 transcript→XLSX+TM_dropdown",
+    "🗒️→⎍☷ CSV→XLSX",
+    "🗒️→⎍💬 CSV→XLSX+TM",
     "🗒️→🥷🏻 Deidentify",
-    "🎥→📽 Video Converter",
-    "🎥✂️ Video Trimmer",
-    "🎥🎲 Video Trimmer with Random Start Time",
-    "🎥🗒️✂️ Video Trimmer with Transcript"
+    "🎥→📽 Convert",
+    "🎥✂️ Trim",
+    "🎥✂️🎲 Trim Random",
+    "🎥🗒️✂️ Trim + Transcript"
     ]
     )
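For context, the second hunk only renames tab titles; the interface list itself is unchanged. As a reminder of the pattern involved, here is a minimal, self-contained sketch of how gr.TabbedInterface pairs interfaces with tab titles by position (the two toy interfaces are illustrative stand-ins, not from this repo):

import gradio as gr

# Stand-in interfaces; app.py passes its real converter interfaces here.
upper = gr.Interface(fn=lambda s: s.upper(), inputs="text", outputs="text")
lower = gr.Interface(fn=lambda s: s.lower(), inputs="text", outputs="text")

# Tab titles are matched to interfaces by position, so renaming a tab
# (e.g. "CSV→XLSX+TM") touches only the title list, as in the hunk above.
demo = gr.TabbedInterface([upper, lower], ["Upper", "Lower"])

if __name__ == "__main__":
    demo.launch()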
transcript_app.py DELETED
@@ -1,225 +0,0 @@
- import threading
- import os
- import time
- import pandas as pd
-
- import gradio as gr
- from utils import (HHMMSS_to_sec, molly_old_xlsx_to_table, convert_transcript_for_TM, convert_transcript_for_annotation,
-                    table_to_ELAN_tsv, ELAN_to_labels_csv, old_xlsx_to_table, old_xlsx_to_labels_csv, deidentify_speaker)
-
-
- def delete_files(output_filepath_list, trans_log_filepath_list, global_log_filepath):
-     for output_filepath in output_filepath_list:
-         try:
-             os.remove(output_filepath)
-         except FileNotFoundError:
-             pass
-     for trans_log_filepath in trans_log_filepath_list:
-         try:
-             os.remove(trans_log_filepath)
-         except FileNotFoundError:
-             pass
-     try:
-         os.remove(global_log_filepath)
-     except FileNotFoundError:
-         pass
-     print("Files deleted")
-
- def delete_files_thread(output_filepath_list, trans_log_filepath_list, global_log_filepath):
-     print("Thread started")
-     time.sleep(20)
-     delete_files(output_filepath_list, trans_log_filepath_list, global_log_filepath)
-
- def convert_xlsx_to_TMxlsx(input_file_list):
-
-     file_list = [file.name for file in input_file_list]
-     output_filepath_list, trans_log_filepath_list, error_check, global_transfer_log_path = convert_transcript_for_TM(file_list=file_list)
-     if not error_check:
-         error_check = "No errors found."
-
-     delete_thread = threading.Thread(target=delete_files_thread, args=(output_filepath_list, trans_log_filepath_list, global_transfer_log_path))
-     delete_thread.start()
-
-     return output_filepath_list, trans_log_filepath_list, global_transfer_log_path, error_check
-
- def convert_for_annotation(input_file_list, annotation_scheme):
-     output_files=[]
-     for input_transcript in input_file_list:
-         print("start converting transcript")
-         output_file = convert_transcript_for_annotation(file=input_transcript, annotation_scheme=annotation_scheme)
-         print("finished converting transcript to xlsx for annotation")
-         output_files.append(output_file)
-     return output_files
-
-
- def convert_xlsx_to_ELANtsv(input_file_list):
-     output_files=[]
-     for input_transcript in input_file_list:
-         # convert transcript
-         print("start converting transcript")
-         table = old_xlsx_to_table(xl_file=input_transcript)
-         print("finished converting transcript to table")
-         output_transcript = input_transcript.replace('.xlsx', '.tsv')
-         output_file = table_to_ELAN_tsv(table, output_transcript)
-         print("saved table to tsv")
-         output_files.append(output_file)
-     return output_files
-
-
- #TODO: support sort and merge for XLSX output if this is needed
-
- def convert_ELANtsv_to_CSV(input_file_list, merge_ellipsis=False):
-     output_files=[]
-     for input_transcript in input_file_list:
-         # convert transcript
-         print("start converting transcript")
-         output_transcript = input_transcript.replace('.tsv', '.csv')
-         output_file = ELAN_to_labels_csv(input_transcript, merge_segments = merge_ellipsis)
-         print("finish converting transcript")
-         output_files.append(output_file)
-     return output_files
-
- # TODO: XLSX to csv (seg_labels or utt_labels)
- def convert_xlsx_to_csv(input_file_list, merge_ellipsis=False):
-     output_files=[]
-     for input_transcript in input_file_list:
-         # read xl file to table
-         # write table to csv with option to merge segments on ellipsis
-         output_transcript = input_transcript.replace('.xlsx', '.csv')
-         output_file = old_xlsx_to_labels_csv(input_transcript, merge_segments = merge_ellipsis)
-         output_files.append(output_file)
-     return output_files
-
- def deidentify_transcripts(input_file_list, who='student'):
-     output_files=[]
-     for file in input_file_list:
-         basename = os.path.basename(file)
-         ext = file.split('.')[-1]
-         if file.endswith('.xlsx') or file.endswith('.xls'):
-             df = pd.read_excel(file)
-         elif file.endswith('.csv'):
-             df = pd.read_csv(file)
-         elif file.endswith('.tsv'):
-             df = pd.read_csv(file, sep='\t')
-         elif file.endswith('.txt'):
-             df = pd.read_csv(file, sep='\t')
-         else:
-             gr.Warning("File type not supported (must be .xlsx, .xls, .csv, .tsv, or .txt)")
-         try:
-             df = deidentify_speaker(df, who=who)
-         except ValueError as e:
-             gr.Warning(f"{e}: {basename} ")
-             continue
-         output_file = file.replace(f'.{ext}', f'_deidentified.{ext}')
-         if ext == 'xlsx' or ext == 'xls':
-             df.to_excel(output_file, index=False)
-         elif ext == 'csv':
-             df.to_csv(output_file, index=False)
-         elif ext == 'tsv' or ext == 'txt':
-             df.to_csv(output_file, sep='\t', index=False)
-         output_files.append(output_file)
-     return output_files
-
- # gr components for TM converter
- input_xlsx = gr.Files(label="Input XLSX or CSV transcript file", type="filepath", file_types=[".xlsx", ".csv"])
- output_xlsx_tm = gr.Files(label="Output XLSX file", type="filepath", file_types=[".xlsx"])
- process_log_tm = gr.File(label="Process Log", type="filepath", file_types=[".log", ".txt"])
- global_transfer_log_tm = gr.File(label="Global transfer log", type="filepath", file_types=[".log", ".txt"])
- error_check_tm = gr.Textbox(label="Error Check", type="text")
- interface_tm = gr.Interface(fn=convert_xlsx_to_TMxlsx,
-                             inputs=input_xlsx,
-                             outputs=[output_xlsx_tm, process_log_tm, global_transfer_log_tm, error_check_tm],
-                             title="transcript-->XLSX+TM_dropdown",
-                             description="Converts XLSX or csv transcript to XLSX+TM transcript with prefilled dropdown for talkmoves",
-                             live=False,
-                             allow_flagging="never",)
-
- # gr components for xlsx to ELAN
- input_x2e = gr.Files(label="Input XLSX or CSV transcript file", type="filepath", file_types=[".xlsx", ".csv"])
- output_x2e = gr.Files(label="Output ELAN-compatible tsv file", type="filepath", file_types=[".tsv",'.txt'])
- # process_log_x2e = gr.File(label="Process Log", type="filepath", file_types=[".log", ".txt"])
- # global_transfer_log_x2e = gr.File(label="Global transfer log", type="filepath", file_types=[".log", ".txt"])
- # error_check_x2e = gr.Textbox(label="Error Check", type="text")
- interface_x2e = gr.Interface(fn=convert_xlsx_to_ELANtsv, # TODO: swap out for correct fn
-                              inputs=input_x2e,
-                              outputs=output_x2e,
-                              title="XLSX-->ELAN",
-                              description="Converts XLSX transcript to ELAN-compatible tsv file",
-                              live=False,
-                              allow_flagging="never",)
-
- # gr components for ELAN to CSV
- input_e2c = gr.Files(label="Input ELAN-compatible tsv file", type="filepath", file_types=[".tsv",'.txt'])
- merge_e2c = gr.Checkbox(label="Merge segments on ellipsis?")
- output_e2c = gr.Files(label="Output CSV file", type="filepath", file_types=[".csv"])
- interface_e2c = gr.Interface(fn=convert_ELANtsv_to_CSV, # TODO: swap out for correct fn
-                              inputs=[input_e2c, merge_e2c],
-                              outputs=[output_e2c],
-                              title="ELAN-->CSV",
-                              description="Converts ELAN-exported file (.txt or .tsv, tab separated values) to standardized CSV file with rows sorted by segment start time. Optionally merges segments on ellipsis.",
-                              live=False,
-                              allow_flagging="never",)
-
- # gr components for XLSX to CSV
- input_x2c = gr.Files(label="Input XLSX file", type="filepath", file_types=[".xlsx", ".csv"])
- merge_x2c = gr.Checkbox(label="Merge segments on ellipsis?")
- output_x2c = gr.Files(label="Output CSV file", type="filepath", file_types=[".csv"])
- interface_x2c = gr.Interface(fn=convert_xlsx_to_csv, # TODO: swap out for correct fn
-                              inputs=[input_x2c, merge_x2c],
-                              outputs=[output_x2c],
-                              title="XLSX-->CSV",
-                              description="Converts old version XLSX transcript (with a single Timecode column) to standardized CSV file with rows sorted by segment start time. Optionally merges segments on ellipsis.",
-                              live=False,
-                              allow_flagging="never",)
-
- # gr components for annotation XLSX
- input_c2a = gr.Files(label="Input CSV file", type="filepath", file_types=[".csv"])
- annotation_scheme_c2a = gr.Radio(label="Annotation Scheme", choices=[("CPS","CPS"), ("TalkMove","TM"),("None",None)])
-
- output_c2a = gr.Files(label="Output XLSX file", type="filepath", file_types=[".xlsx"])
- interface_c2a = gr.Interface(
-     fn=convert_for_annotation, # TODO: swap out for correct fn
-     inputs=[input_c2a, annotation_scheme_c2a],
-     outputs=[output_c2a],
-     title="CSV-->XLSX+annotation",
-     description="Converts CSV file to XLSX file for annotation (added columns for CPS or TM or None)",
-     live=False,
-     allow_flagging="never",
-     # submit_btn="Convert"
- )
-
- # gr components for deidentification
- input_di = gr.Files(label="Input transcript file", type="filepath", file_types=[".xlsx", ".xls",".csv", ".tsv", ".txt"])
- who_di = gr.Radio(label="Who to deidentify", choices=[("student","student"), ("all","all")])
- output_di = gr.Files(label="Output deidentified transcript file", type="filepath", file_types=[".xlsx", ".xls",".csv", ".tsv", ".txt"])
- interface_di = gr.Interface(
-     fn=deidentify_transcripts,
-     inputs=[input_di, who_di],
-     outputs=[output_di],
-     title="Deidentify",
-     description="Deidentify speaker labels in a transcript. Compatible with .xlsx, .xls, .csv, .tsv, .txt files with a column containing speaker labels. Will not work if speaker column is missing a header. Speaker names or IDs will be replaced with a deidentified label numbered in order of appearance. Choose whether to deidentify just students or all speakers.",
-     live=False,
-     allow_flagging="never",
- )
-
- tab_interface = gr.TabbedInterface(
-     [
-         interface_e2c,
-         interface_c2a,
-         interface_x2e,
-         interface_x2c,
-         interface_tm,
-         interface_di
-     ]
-     ,
-     ["ELAN→CSV",
-      "CSV→XLSX+annotation",
-      "XLSX→ELAN",
-      "XLSX→CSV",
-      "transcript→XLSX+TM_dropdown",
-      "Deidentify"
-     ]
- )
- # TODO: XLSX to csv (seg_labels or utt_labels)
- # TODO: XLSX to merged on ellipsis, keep XLSX format
- tab_interface.launch(server_name="0.0.0.0", server_port=7860)
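Worth noting from the deleted file above: convert_xlsx_to_TMxlsx serves its output files and then relies on delete_files_thread to remove them 20 seconds later, so temporary outputs don't accumulate on the Space. A self-contained sketch of that delayed-cleanup pattern (the paths here are hypothetical):

import os
import threading
import time

def delete_files_later(paths, delay=20):
    # Remove each path after `delay` seconds, ignoring files already gone.
    def _worker():
        time.sleep(delay)  # give the client time to download the served outputs
        for p in paths:
            try:
                os.remove(p)
            except FileNotFoundError:
                pass
    threading.Thread(target=_worker).start()

# Example: clean up two served output files 20 s after the response is returned.
delete_files_later(["/tmp/output_TMcoded.xlsx", "/tmp/output_TMcoded.xlsx.log"])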
transcript_utils.py DELETED
@@ -1,745 +0,0 @@
- import json
- import math
- import os
- import re
- import csv
- from pathlib import Path
- import gradio as gr
- import nltk
- import pandas as pd
- from nltk.tokenize import sent_tokenize
- from openpyxl import Workbook
- from openpyxl.utils.dataframe import dataframe_to_rows
- from openpyxl.worksheet.datavalidation import DataValidation
- from pandas._libs.tslibs import timestamps
-
-
- def convert_transcript_for_TM(file_list):
-     """Convert transcripts for TalkMoves annotation.
-     Input can be an xlsx or csv transcript file.
-     Can handle separate start and end time columns or a single timecode column.
-     Output will have separate start and end timestamps in HH:MM:SS.sss format.
-
-     Args:
-         file_list (_type_): _description_
-
-     Raises:
-         gr.Error: _description_
-         gr.Error: _description_
-
-     Returns:
-         _type_: _description_
-     """
-
-
-     # Regular expression pattern for matching bracketed annotations such as [inaudible] or [overlapping speech].
-     bracket_re = re.compile(r'(?:\[[UI|ui|Inaudible|inaudible|overlapping speech|VIDEO SILENCE|teacher explaining in background].*\]\W{0,2})')
-     # Regular expression pattern for matching anything enclosed in square brackets.
-     all_bracket_re = re.compile(r'(?:\[.*\]\W{0,2})')
-     # whether to remove [inaudible] markers
-     do_remove_inaudible = True
-     # whether to keep context switches
-     do_keep_context_switch = True
-     # whether to convert to timestamp if start and end times are in seconds in separate columns
-     convert_to_timestamp = True
-
-     error_message = []  # List of error messages to be displayed to the user.
-     global_stat_dict = {}  # Dictionary of global statistics.
-     output_filepath_list = []  # List of output file paths.
-     trans_log_filepath_list = []  # List of transcription log file paths.
-     for file in file_list:
-         filename = file.split('/')[-1]  # Get the filename from the file.
-         filepath = os.path.dirname(file)  # Get the file path from the file.
-         # Read the file into a Pandas DataFrame depending on its file format.
-         if filename.endswith('.xlsx'):
-             df = pd.read_excel(file, index_col=0)
-             output_filename = f"{filename[:-5]}" + "_TMcoded.xlsx"
-         elif filename.endswith('.csv'):
-             df = pd.read_csv(file, index_col=0, error_bad_lines=False)
-             output_filename = f"{filename[:-4]}" + "_TMcoded.xlsx"
-
-         else:
-             raise gr.Error(f"{file} format is wrong")
-
-         # Remove the "Copy of" prefix from the output filename, if present.
-         if output_filename.startswith("Copy of "):
-             output_filename = output_filename[8:]
-
-         # Remove the word "_Transcript" from the output filename, if present.
-         if '_Transcript' in output_filename:
-             # print("before: "+output_filename)
-             error_message.append("before: "+output_filename)
-             output_filename = ''.join(output_filename.split('_Transcript'))
-             # print("after: "+output_filename)
-             error_message.append("after: "+output_filename)
-
-         # Construct the output file and transcription log file paths.
-         output_filepath = os.path.join(filepath, output_filename)
-         trans_log_filepath = os.path.join(filepath, f"{output_filename}" + ".log")
-
-         # Open the transcription log file for writing.
-         with open(trans_log_filepath, "w") as outfile:
-             sub_cnt_in_file = 0
-             empty_speaker_cnt_in_file = 0
-             turn_skipped_in_file = 0
-             turn_skipped_speaker_switch_in_file = 0
-             snt_mark_skip_in_file = 0
-             snt_skipped_in_file = 0
-             chat_flag_in_speaker_time_line = 0
-             chat_flag_in_content_line = 0
-             all_inaudible_in_file = 0
-             all_bracket_in_file = 0
-             all_snts_in_file = 0
-             all_token_cnt_in_file = 0
-             # index  Timecode                   Duration     Speaker  Dialogue                            Annotations  Error Type
-             # 1      00:00:05:04 - 00:00:07:12  00:00:02:08  Tutor    Did you... How was your Halloween?
-             turns = []
-             time_stamps = []
-             speakers = []
-             chat_flags = []
-             sentences = []
-             snt_ids = []
-
-             ## parse the df flexibly: find key column names which might vary depending on transcript source
-             # set all column names to lowercase
-             df.columns = map(str.lower, df.columns)
-             # several possibilities for column names, detect which are present
-             uttID_keys = ['utt','seg','utt_id','seg_id','index']
-             speaker_keys = ['speaker']
-             start_keys=['start_sec','start','start_time','timestart']
-             end_keys=['end_sec','end','end_time','timeend']
-             timestamp_keys = ['timecode','timestamp']
-             content_keys=['dialogue','utterance','transcript','text']
-             # detect which is used in this df
-             uttID_key = next((key for key in uttID_keys if key in df.columns), None)
-             speaker_key = next((key for key in speaker_keys if key in df.columns), None)
-             content_key = next((key for key in content_keys if key in df.columns), None)
-             # check if separate start and end times are present, otherwise assume single timecode column
-             if any(df.columns.isin(start_keys)):
-                 start_key = next((key for key in start_keys if key in df.columns), None)
-                 end_key = next((key for key in end_keys if key in df.columns), None)
-                 time_format = 'seconds'
-                 if convert_to_timestamp:
-                     # convert to timestamp format HH:MM:SS.sss - HH:MM:SS.sss
-                     df['timecode'] = df.apply(lambda x: f"{sec_to_HHMMSS(x[start_key])} - {sec_to_HHMMSS(x[end_key])}", axis=1)
-                     timestamp_key='timecode'
-                     time_format = 'timestamp'
-             else:
-                 timestamp_key=next((key for key in timestamp_keys if key in df.columns), None)
-                 time_format = 'timestamp'
-             # Turn numbering starts at 1, the same as Molly's transcripts
-             for i, row in df.iterrows():
-                 turn = row[uttID_key] if uttID_key else i+1
-                 speaker = row[speaker_key]
-                 time_str = row[timestamp_key]
-                 content = "" if pd.isna(row[content_key]) else row[content_key].strip("\n")
-                 # when speaker is empty, use the previous speaker
-                 if speaker == "":
-                     if speakers:
-                         speaker = speakers[-1]
-                         empty_speaker_cnt_in_file += 1
-                         outfile.write(f"{turn}: found empty speaker, use the speaker in previous turn: {speaker}\n")
-                     else:
-                         raise gr.Error(f"{row}, the first turn is empty speaker")
-
-                 # clean after the sentence tokenize
-                 snts = sent_tokenize(content)
-                 all_snts_in_file += len(snts)
-                 snt_skipped_in_turn = 0
-                 for i, snt in enumerate(snts):
-                     remove_flag = False
-                     inaudible_search = re.findall(bracket_re, snt)
-                     if inaudible_search:
-                         all_inaudible_in_file += len(inaudible_search)
-                         outfile.write(f"{turn}, {inaudible_search}, inaudible found in snt: {snt}\n")
-
-                     all_bracket_search = re.findall(all_bracket_re, snt)
-                     if all_bracket_search:
-                         all_bracket_in_file += len(all_bracket_search)
-                         outfile.write(f"{turn}, {all_bracket_search} bracket found in snt: {snt}\n")
-
-                     # only remove the [inaudible xxx] when it is the whole sentence.
-                     inaudible_match = re.fullmatch(bracket_re, snt)
-
-                     if inaudible_match:
-                         if do_keep_context_switch:
-                             # if keeping context switches
-                             if speakers and speaker == speakers[-1]:
-                                 # same speaker, so no context switch: just remove it
-                                 remove_flag = True
-                             else:
-                                 # different speakers, so this is a context switch.
-                                 if len(snts) == 1:
-                                     # current empty sentence is the only sentence
-                                     remove_flag = False
-                                 else:
-                                     if i != len(snts)-1:
-                                         # current empty utterance is not the last one, just delete it
-                                         remove_flag = True
-                                     else:
-                                         # current empty utterance is the last one, keep it.
-                                         if snt_skipped_in_turn == len(snts)-1:
-                                             # all previous snts are empty, so keep this one to avoid skipping the whole turn
-                                             remove_flag = False
-                                         else:
-                                             remove_flag = True
-                         else:
-                             # if not keeping context switches, simply remove all empty utterances
-                             remove_flag = True
-
-                     # If remove_flag is true:
-                     if remove_flag:
-                         # Increment sub_cnt_in_file and snt_mark_skip_in_file
-                         sub_cnt_in_file += 1
-                         snt_mark_skip_in_file += 1
-                         # Write the following message to outfile:
-                         outfile.write(f"{turn}, sub happened: {snt}, skip this sentence\n")
-                         # If do_remove_inaudible is true:
-                         if do_remove_inaudible:
-                             snt_skipped_in_file += 1
-                             snt_skipped_in_turn += 1
-                             continue
-
-                     # Add to pd:
-                     # Append turn to turns list
-                     turns.append(turn)
-                     # Set snt_id to the string f"{turn}.{i}"
-                     snt_id = f"{turn}.{i}"
-                     # Append time_str to time_stamps list
-                     time_stamps.append(time_str)
-                     # Append speaker to speakers list
-                     speakers.append(speaker)
-                     # Set sentence to the string representation of snt, with whitespace removed from the start and end
-                     sentence = str(snt).strip().rstrip("\n")
-                     # Calculate the number of tokens in sentence and add to all_token_cnt_in_file
-                     token_cnt = len(nltk.word_tokenize(sentence))
-                     all_token_cnt_in_file += token_cnt
-                     # Append snt_id to snt_ids list
-                     snt_ids.append(snt_id)
-                     # Append sentence to sentences list
-                     sentences.append(sentence)
-
-                 if snt_skipped_in_turn == len(snts):
-                     # all snts in turn are skipped, so skip the turn
-                     turn_skipped_in_file += 1
-                     if (speakers and speaker != speakers[-1]) or not speakers:
-                         turn_skipped_speaker_switch_in_file += 1
-                     outfile.write(f"{turn}, since all snts are empty, skip this whole turn {content}\n")
-             # Create a new DataFrame with the following columns:
-             new_df = pd.DataFrame({
-                 "Sentence_ID": snt_ids,    # A
-                 "TimeStamp": time_stamps,  # B
-                 "Turn": turns,             # C
-                 "Speaker": speakers,       # D
-                 "Sentence": sentences      # E
-             })
-
-             # assert turn_skipped_speaker_switch_in_file==0, "Some speaker switch turn skipped"
-             new_df["Teacher_TM"] = None  # F
-             new_df["Student_TM"] = None  # G
-
-             # write new_df to xlsx file
-             new_df.to_excel(output_filepath, index=False)
-
-
-             # https://openpyxl.readthedocs.io/en/latest/api/openpyxl.utils.dataframe.html#openpyxl.utils.dataframe.dataframe_to_rows
-             wb = Workbook()
-             ws = wb.active
-             teacher_dv = DataValidation(type="list", formula1='",1-None,2-Keep-Together,3-Getting-Student-to-Relate,4-Restating,5-Revoicing,6-Context,7-Press-for-Accuracy,8-Press-for-Reasoning"', allow_blank=True)
-             student_dv = DataValidation(type="list", formula1='",1-None,2-Relate-to-Another-Student,3-Asking-for-More-info,4-Making-a-Claim,5-Providing-Evidence/Reasoning"', allow_blank=True)
-             ws.add_data_validation(teacher_dv)
-             ws.add_data_validation(student_dv)
-             teacher_dv.add('F2:F1048576')
-             student_dv.add('G2:G1048576')
-             for r in dataframe_to_rows(new_df, index=False, header=True):
-                 ws.append(r)
-             wb.save(output_filepath)
-
-             stat_dict = {
-                 "chat_flag_in_speaker_time_line": chat_flag_in_speaker_time_line,
-                 "chat_flag_in_content_line": chat_flag_in_content_line,
-                 "empty_speaker_cnt_in_file": empty_speaker_cnt_in_file,
-                 "ori_total_turn": df.shape[0],
-                 "ori_total_snt": all_snts_in_file,
-                 "turn_skipped": turn_skipped_in_file,
-                 "turn_skipped_speaker_switch_in_file": turn_skipped_speaker_switch_in_file,
-                 "snt_skipped": snt_skipped_in_file,
-                 "remaining_snt": all_snts_in_file - snt_skipped_in_file,
-                 "all_token_cnt_in_file": all_token_cnt_in_file,
-                 "avg_token_cnt_per_snt": all_token_cnt_in_file/(all_snts_in_file - snt_skipped_in_file),
-                 "sub_cnt_in_file": sub_cnt_in_file,
-                 "all_inaudible_in_file": all_inaudible_in_file,
-                 "all_bracket_in_file": all_bracket_in_file,
-                 "other_bracket_in_file": all_bracket_in_file - all_inaudible_in_file
-             }
-             if all_inaudible_in_file != all_bracket_in_file:
-                 # print(f"{filename} has special brackets")
-                 error_message.append(f"Warning: {filename} has special brackets")
-             for k, v in stat_dict.items():
-                 global_stat_dict[k] = global_stat_dict.get(k,0) + v
-             outfile.write(f"{output_filepath}, {json.dumps(stat_dict, indent=4)}")
-
-         output_filepath_list.append(output_filepath)
-         trans_log_filepath_list.append(trans_log_filepath)
-
-     for k, v in global_stat_dict.items():
-         if "avg" in k:
-             global_stat_dict[k] = global_stat_dict[k]/len(file_list)
-     global_log_filepath = os.path.join(filepath, "global_transfer" + ".log")
-     with open(global_log_filepath, "w") as outfile:
-         outfile.write(f"global_stat_dict: {json.dumps(global_stat_dict, indent=4)}")
-
-     # error_check
-     if global_stat_dict["all_inaudible_in_file"] != global_stat_dict["all_bracket_in_file"]:
-         error_message.append("Error: 'all_inaudible_in_file' does not match 'all_bracket_in_file'")
-     if global_stat_dict["other_bracket_in_file"] != 0:
-         error_message.append("Error: 'other_bracket_in_file' is not zero")
-
-     return output_filepath_list, trans_log_filepath_list, error_message, global_log_filepath
-
- def add_CPS_columns(df):
-     # Observation Instructions CONST_SharesU_Situation CONST_SharesU_CorrectSolutions CONST_SharesU_IncorrectSolutions CONST_EstablishesCG_Confirms CONST_EstablishesCG_Interrupts NEG_Responds_Reasons NEG_Responds_QuestionsOthers NEG_Responds_Responds MAINTAIN_Initiative_Criticizes NEG_MonitorsE_Results NEG_MonitorsE_GivingUp NEG_MonitorsE_Strategizes NEG_MonitorsE_Save MAINTAIN_Initiative_Suggestions MAINTAIN_Initiative_Compliments MAINTAIN_FulfillsR_InitiatesOffTopic MAINTAIN_FulfillsR_JoinsOffTopic MAINTAIN_FulfillsR_Support MAINTAIN_FulfillsR_Apologizes Notes
-     annotation_columns = ['Observation','Instructions', 'CONST_SharesU_Situation', 'CONST_SharesU_CorrectSolutions', 'CONST_SharesU_IncorrectSolutions', 'CONST_EstablishesCG_Confirms', 'CONST_EstablishesCG_Interrupts', 'NEG_Responds_Reasons', 'NEG_Responds_QuestionsOthers', 'NEG_Responds_Responds', 'MAINTAIN_Initiative_Criticizes', 'NEG_MonitorsE_Results', 'NEG_MonitorsE_GivingUp', 'NEG_MonitorsE_Strategizes', 'NEG_MonitorsE_Save', 'MAINTAIN_Initiative_Suggestions', 'MAINTAIN_Initiative_Compliments', 'MAINTAIN_FulfillsR_InitiatesOffTopic', 'MAINTAIN_FulfillsR_JoinsOffTopic', 'MAINTAIN_FulfillsR_Support', 'MAINTAIN_FulfillsR_Apologizes', 'Notes']
-     # add these columns to the end of the df in this order
-     for col in annotation_columns:
-         df[col]=''
-     return df
-
- def add_TM_columns(df):
-     annotation_columns = ['Teacher_TM', 'Student_TM']
-     # add these columns to the end of the df in this order
-     for col in annotation_columns:
-         df[col]=''
-     return df
-
- def convert_transcript_for_annotation(file, annotation_scheme=None):
-     """Convert transcript for annotation:
-     Input standard csv transcript file
-     Output will have separate start and end timestamps in HH:MM:SS.sss format
-     Filename column will infer the video filename from the transcript filename
-     Columns for CPS annotators are added
-     """
-     filename,ext = os.path.splitext(os.path.basename(file))  # Get the filename from the file.
-     filepath = os.path.dirname(file)  # Get the file path from the file.
-     # Read the file into a Pandas DataFrame depending on its file format.
-     try:
-         table = parse_label_csv(file)
-         media_filename = get_sessname_from_filename(filename)
-         out_df=table.copy()
-         out_df['recordingID']=media_filename
-         out_df['TimeStart']=out_df['start_sec'].apply(sec_to_HHMMSS)
-         out_df['TimeEnd']=out_df['end_sec'].apply(sec_to_HHMMSS)
-         out_df=out_df[['speaker','TimeStart','TimeEnd','utterance','recordingID','uttID']]
-         if annotation_scheme=='CPS':
-             out_df=add_CPS_columns(out_df)
-             output_file = os.path.join(filepath, f"CPS_{filename}.xlsx")
-             out_df.to_excel(output_file, index=False)
-         elif annotation_scheme=='TM':
-             out_df=add_TM_columns(out_df)
-             output_file = os.path.join(filepath, f"TM_{filename}.xlsx")
-             out_df.to_excel(output_file, index=False)
-         else:
-             output_file = os.path.join(filepath, f"{filename}.xlsx")
-             out_df.to_excel(output_file, index=False)
-         return output_file
-     except Exception as e:
-         raise gr.Error(f"{filename}: error {e}")
-
- def HHMMSS_to_sec(time_str):
-     """Get seconds from timestamp string with milliseconds."""
-     if not time_str:
-         return None
-     if time_str.count(':')==2:
-         h, m, s = time_str.split(':')
-     elif time_str.count(':')==3:
-         # weird timestamps where there is a field following seconds delimited by colon
-         h, m, s, u = time_str.split(':')
-         # determine whether the subsecond field is in tenths, hundredths, or thousandths by counting how many digits
-         if len(u)==1:
-             print('Weird time format detected - HH:MM:SS:tenths - please verify this is how you want the time interpreted')
-             ms = float(u)/10
-         elif len(u)==2:  # hundredths
-             ms = float(u)/100
-         elif len(u)==3:  # thousandths
-             ms = float(u)/1000
-         else:
-             print(f'input string format not supported: {time_str}')
-             return None
-         s = int(s)+ms
-     elif time_str.count(':')==1:
-         # print('missing HH from timestamp, assuming MM:SS')
-         m, s = time_str.split(':')
-         h=0
-     else:
-         try:
-             time_str=float(time_str)  # maybe it's already in seconds!
-             return time_str
-         except Exception as e:
-             gr.Error(f"Error converting time to seconds: {e}")
-             return None
-     return int(h) * 3600 + int(m) * 60 + float(s)
-
-
- def sec_to_HHMMSS(seconds):
-     """Get timestamp string from seconds."""
-     seconds = float(seconds)
-     m, s = divmod(seconds, 60)
-     h, m = divmod(m, 60)
-     h=int(h)
-     m=int(m)
-     return f"{h:02d}:{m:02d}:{s:06.3f}"
-
- def molly_old_xlsx_to_table(xl_file):  # TODO: check against isatasr
-     # contractor transcribers provide an xlsx with the following columns
-     # utt_ix: int
-     # Timecode: "HH:MM:SS:ss - HH:MM:SS:ss"
-     # Duration: HH:MM:SS:ss
-     # Speaker: str
-     # Dialogue: str
-     # Annotations: blank
-     # Error Type: blank
-     with pd.ExcelFile(xl_file) as xls:
-         sheetname = xls.sheet_names
-         table = pd.DataFrame(pd.read_excel(xls, sheetname[0]))
-     table[['start_time','end_time']] = table['Timecode'].str.split('-',expand=True)
-     table['start_sec'] = table['start_time'].str.strip().apply(HHMMSS_to_sec)
-     table['end_sec'] = table['end_time'].str.strip().apply(HHMMSS_to_sec)
-     table.drop(labels=['Annotations','Error Type','Duration'], axis=1, inplace=True)
-     table=table[['#','Speaker','Dialogue','start_sec','end_sec']]
-     table.rename(columns={'#':'uttID','Speaker':'speaker', 'Dialogue':'transcript'}, inplace=True)
-
-     return table
-
- def old_xlsx_to_table(xl_file):  # TODO: check against isatasr
-     try:
-         # read the first sheet of the Excel file into a DataFrame
-         print(f'...reading {xl_file}...')
-         table = pd.read_excel(xl_file, sheet_name=0)
-         print(f'...done reading {xl_file}...')
-
-         # convert column names to lowercase
-         table.columns = map(str.lower, table.columns)
-
-         # extract start and end time from the Timecode column
-         print(f'...splitting Timecode column into start and end time...')
-         timecodes = table['timecode'].str.split(' - ', expand=True)
-         table['start_time'] = timecodes[0]
-         table['end_time'] = timecodes[1]
-         print(f'...done splitting Timecode column into start and end time...')
-
-         # convert start and end time to seconds using the HHMMSS_to_sec function
-         print(f'...converting start and end time to seconds...')
-         table['start_sec'] = table['start_time'].apply(HHMMSS_to_sec)
-         table['end_sec'] = table['end_time'].apply(HHMMSS_to_sec)
-         print(f'...done converting start and end time to seconds...')
-
-         # drop unnecessary columns
-         print(f'...dropping unnecessary columns...')
-         table.drop(['timecode', 'annotations', 'error type', 'duration'], axis=1, inplace=True)
-
-         # rename columns
-         print(f'...renaming columns...')
-         table.rename(columns={'#': 'uttID', 'speaker': 'speaker', 'dialogue': 'transcript'}, inplace=True)
-
-         # reorder columns
-         print(f'...reordering columns...')
-         table = table[['uttID', 'speaker', 'transcript', 'start_sec', 'end_sec']]
-
-         table.sort_values(by='start_sec', inplace=True, ignore_index=True)
-         table.reset_index(inplace=True)
-
-         return table
-     except Exception as e:
-         gr.Error(f'Error converting {xl_file}: {e}')
-
- def table_to_ELAN_tsv(table:pd.DataFrame, path:str):  # TODO: check against isatasr
-     # write table to tsv compatible with ELAN import
-     table.to_csv(path, index=False, float_format='%.3f', sep='\t')
-     return path
-
- def table_to_labels_csv(table:pd.DataFrame, path:str):
-     # write table to utt_labels csv format compatible with rosy's isatasr lib
-     table=table.replace('', np.nan).dropna(subset=['speaker','utterance'], how='all')  # drop rows with missing values in speaker and utterance
-     table.to_csv(path, index=False, float_format='%.3f')
-     return path
- def readELANtsv(file, fmt=None):
-     with open(file) as in_file:
-
-         reader = csv.reader(in_file, delimiter="\t")
-
-         skiprows=0
-         row=next(reader)
-
-         while not len(row)>=4:  # 4 being the minimum number of columns ELAN exports have
-             skiprows+=1
-             row=next(reader)
-         in_file.seek(skiprows)
-
-         if skiprows>0:
-             print(f'Detected {skiprows} header rows to skip')
-         reader = csv.reader(in_file, delimiter="\t")
-         for _ in range(skiprows):
-             next(reader)
-
-         labels = []  # transcript with speaker labels and timestamp in sec
-
-         for i,utt in enumerate(reader):
-             if not ''.join(utt).strip():  # skip blank lines
-                 continue
-             try:
-                 if len(utt) == 5:  # if data comes straight from ELAN sometimes there is a superfluous blank column 2
-                     if i==0:
-                         print('detected extra blank column in first row, will remove')
-                     if fmt=='AUG23':
-                         if i==0:
-                             print('detected extra blank 1st column, will remove')
-                         _,speaker,start_HHMMSS,end_HHMMSS,utterance = utt
-                         convert_timestamps=True
-                     else:
-                         if i==0:
-                             print('detected extra blank 2nd column, will remove')
-                         speaker,_,start_HHMMSS, end_HHMMSS, utterance = utt
-                         convert_timestamps=True
-                 elif len(utt) == 4:  # sometimes the blank col is already removed
-                     if i==0:
-                         print('detected 4 columns, assuming: speaker,start_HHMMSS, end_HHMMSS, utterance ')
-                     speaker,start_HHMMSS, end_HHMMSS, utterance = utt
-                     convert_timestamps=True
-                 elif len(utt) == 6:  # new one from 2023 Aug has a redundant extra start col!?
-                     if i==0:
-                         print('detected 6 columns, assuming: _,speaker,start_HHMMSS, end_HHMMSS, utterance,_ ')
-                     _,speaker,start_HHMMSS,end_HHMMSS,utterance,_ = utt
-                     convert_timestamps=True
-                 elif len(utt) == 9:  # 2023 transcribers tend to give full ELAN output
-                     if i==0:
-                         print('detected 9 columns, assuming: speaker,_,start_HHMMSS,_,end_HHMMSS,_,_,_,utterance ')
-                     speaker,_,start_HHMMSS,_,end_HHMMSS,_,_,_,utterance = utt
-                     convert_timestamps=True
-                 elif len(utt) == 10:  # sometimes an extra blank column appears at the end
-                     if i==0:
-                         print('detected 10 columns, assuming: speaker,_,start_HHMMSS,_,end_HHMMSS,_,_,_,utterance,_ ')
-                     speaker,_,start_HHMMSS,_,end_HHMMSS,_,_,_,utterance,_ = utt
-                     convert_timestamps=True
-                 elif len(utt) == 12:  # wow, how many redundant columns can ELAN make...
-                     if i==0:
-                         print('detected 12 columns, assuming: speaker,_,start_HHMMSS,_,_,end_HHMMSS,_,_,_,_,_,utterance ')
-                     speaker,_,start_HHMMSS,_,_,end_HHMMSS,_,_,_,_,_,utterance = utt
-                     convert_timestamps=True
-
-                 else:
-                     raise ValueError(f'Unknown transcript format with {len(utt)} columns for {file}')
-             except BaseException as err:
-                 print(f'!!! transcript parse error on line {i} for {file}')
-                 print(utt)
-                 raise err
-             if convert_timestamps:
-                 start_sec = HHMMSS_to_sec(start_HHMMSS)
-                 end_sec = HHMMSS_to_sec(end_HHMMSS)
-
-             labels.append((speaker, utterance, start_sec, end_sec))
-     labels = pd.DataFrame(labels, columns=('speaker', 'utterance', 'start_sec', 'end_sec'))
-     labels.sort_values(by='start_sec', inplace=True, ignore_index=True)
-     labels.reset_index(inplace=True)
-     labels = labels.rename(columns={'index':'seg'})
-
-     return(labels)
-
-
- def merge_ellipsis(seg_labels):
-     # merge utterances with ellipsis
-     # input is seg_labels format: [optional index] speaker, utterance, start_sec, end_sec
-     if isinstance(seg_labels,str) and seg_labels.endswith(('.csv','.tsv','.txt')):
-         df=pd.read_csv(seg_labels)
-     elif isinstance(seg_labels, pd.DataFrame):
-         df=seg_labels
-     else:
-         raise ValueError('input seg_labels should be path to csv or pd.DataFrame')
-
-     if len(df.columns)==4:
-         # no seg index yet
-         df.reset_index(inplace=True)
-         df = df.rename(columns = {'index':'seg'})
-     elif len(df.columns)==5:
-         # first col is seg
-         df.columns = ['seg','speaker','utterance','start_sec','end_sec']
-     else:
-         raise ValueError('input seg_labels should have 4 or 5 columns')
-     df2=[]
-     prev_spk=None
-     prev_utt=""
-     prev_start=0
-     prev_end=0
-     segs=[0]
-     merge_utt={"seg":None, "speaker":None,"utterance":None,"start_sec":None, "end_sec":None}
-     for i,row in df.iterrows():
-         if i==0:
-             merge_utt=row
-
-         else:
-             # if same speaker as last and ellipsis
-             if merge_utt["speaker"]==row["speaker"] and str(merge_utt["utterance"]).endswith('...') and str(row["utterance"]).startswith('...'):
-                 # append current to temporary merged utt: use prev_ items
-
-                 merge_utt["utterance"]+=str(row["utterance"])
-                 merge_utt["end_sec"]=row["end_sec"]
-                 segs.append(row["seg"])
-             else:
-                 # append merge_utt to df2
-                 merge_utt["seg"]=segs
-                 df2.append(merge_utt)
-                 # clear merge_utt and set to current
-                 merge_utt=row
-                 segs=[merge_utt["seg"]]
-
-     merge_utt["seg"]=segs
-     # if not isinstance(merge_utt["seg"],list):
-     #     merge_utt["seg"]=list(segs)
-     df2.append(merge_utt)  # catch final merge_utt if not terminated
-
-     df2=pd.DataFrame(df2)
-     df2['utterance']=df2['utterance'].str.replace('\.+',' ', regex=True)
-
-     # clear up "......"
-     # enumerate utterances
-     df2.reset_index(inplace=True,drop=True)
-     df2 = df2.reset_index().rename(columns = {'index':'utt'})
-     return df2
-
-
- def add_dummy_seg_column(table):
-     # adds a dummy seg column (listing segments comprising utterance) for a df without this column
-     # labelfiles generated from merge_ellipsis have an 'utt' column giving utterance ID, and a seg column
-     # containing a list of original segments comprising each utterance
-     # but you may need all label files to have the exact same format even if they weren't produced by
-     # merge_ellipsis()
-     # returns a table with columns 'utt' and 'seg'
-
-     if 'seg' in table.columns.tolist():
-         print('\'seg\' column already exists, not changing anything')
-         return table
-     if 'uttID' in table.columns.tolist():
-         table=table.rename(columns={"uttID":"utt"})
-     if not 'utt' in table.columns.tolist():
-         table['utt']=table.index
-     table['seg']=[[u] for u in table['utt']]
-     table=table[['utt','seg','speaker','start_sec','end_sec','utterance']]
-
-     return table
-
-
- def old_xlsx_to_labels_csv(xl_file, merge_segments=True):
-     # converts an xlsx file (from the contractor transcription service, which has a single timecode col) to a csv in the format required by rosy's isatasr lib
-     # if merge_segments=True, will merge segments to form utterances where there have been splits separated by '...'
-     # if merge_segments=False, will keep segments as they were in the ELAN output
-     # returns the path to the csv file
-     table=old_xlsx_to_table(xl_file)
-     sessname=get_sessname_from_filename(xl_file)
-
-     if merge_segments:
-         save_file=f'utt_labels_{sessname}.csv'
-         merged_labels=merge_ellipsis(table)
-         merged_labels.to_csv(save_file, index=False, float_format='%.3f')
-     else:
-         save_file=f'seg_labels_{sessname}.csv'
-         table.to_csv(save_file, index=False, float_format='%.3f')
-     return save_file
-
- def get_sessname_from_filename(filename):
-     sessname=Path(filename).stem
-     sessname = re.sub('reworked-transcript-diarized-timestamped-', '', sessname, flags=re.I)
-     sessname = re.sub('reworked_transcript-diarized-timestamped-', '', sessname, flags=re.I)
-     sessname = re.sub('reworked-diarized-timestamped-', '', sessname, flags=re.I)
-     sessname = re.sub('reworked_timestamped_', '', sessname, flags=re.I)
-     sessname = re.sub('reworked_', '', sessname, flags=re.I)
-     sessname = re.sub('reworked-', '', sessname, flags=re.I)
-     sessname = re.sub('transcript_diarized_timestamped_', '', sessname, flags=re.I)
-     sessname = re.sub('transcript-diarized-timestamped_', '', sessname, flags=re.I)
-     sessname = re.sub('transcript-diarized-timestamped-', '', sessname, flags=re.I)
-     sessname = re.sub('_transcript', '', sessname, flags=re.I)
-     sessname = re.sub('_tmcoded', '', sessname, flags=re.I)
-     sessname = re.sub('utt_labels_', '', sessname, flags=re.I)
-     sessname = re.sub('seg_labels_', '', sessname, flags=re.I)
-     sessname = re.sub('_redacted', '', sessname, flags=re.I)
-     return sessname
-
- def ELAN_to_labels_csv(ELANfile, merge_segments = True):
-     # dumb but effective string wrangling to get sess name
-     sessname=get_sessname_from_filename(ELANfile)
-
-     # reads ELAN output to pd.DataFrame in a unified format
-     labels=readELANtsv(ELANfile)
-
-     if merge_segments:
-         save_file=f'utt_labels_{sessname}.csv'
-         # merge segments to form utterances where there have been splits separated by '...'
-         merged_labels=merge_ellipsis(labels)
-         merged_labels.to_csv(save_file, index=False, float_format='%.3f')
-     else:
-         save_file=f'seg_labels_{sessname}.csv'
-         labels.to_csv(save_file, index=False, float_format='%.3f')
-     return save_file
-
- def parse_label_csv(label_csv:str):
-     # utt_labels_csv is the usual format used for diarized, timed transcripts in this repo
-     # There are several versions with different columns (with/without segment and/or utterance index,
-     # without column headers, etc.)
-     # table:
-     # [uttID, speaker, transcript, start_sec, end_sec]
-
-     table = pd.read_csv(label_csv, keep_default_na=False, header=None)
-     row0=table.iloc[0]
-
-     is_header = not any(str(cell).replace('.','').isdigit() for cell in row0)
-     if is_header:
-         table.columns=row0.tolist()
-         table=table.iloc[1:]
-         table=table.reset_index(drop=True)
-     else:
-         if len(table.columns)==4:
-             print('no header detected, assuming annotation file has columns [speaker,utterance,start_sec, end_sec] ')
-             table.columns=['speaker','utterance','start_sec', 'end_sec']
-         elif len(table.columns)==5:
-             print('no header detected, assuming annotation file has columns [seg,speaker,utterance,start_sec, end_sec] ')
-             table.columns=['seg','speaker','utterance','start_sec', 'end_sec']
-         elif len(table.columns)==6:
-             print('no header detected, assuming annotation file has columns [utt,seg,speaker,utterance,start_sec, end_sec] ')
-             table.columns=['utt','seg','speaker','utterance','start_sec', 'end_sec']
-         else:
-             print(f'no header detected, csv has {len(table.columns)} columns, could not determine column names.')
-             return None
-     # choose which column to use for uttID in table
-     if 'utt' in table.columns.tolist():
-         table=table.rename(columns={"utt":"uttID"}).drop('seg', axis=1)
-     elif 'seg' in table.columns.tolist():
-         table=table.rename(columns={"seg":"uttID"})
-     else:
-         table=table.reset_index().rename(columns={"index":"uttID"})
-
-     table=table[['uttID','speaker','start_sec','end_sec','utterance']]
-     return table
-
- def deidentify_speaker(df, who='all'):
-     """Replace speaker IDs with generic labels
-     in order of appearance (speaker_1, speaker_2, ...).
-     If who is "student", only student names are replaced.
-
-
-     Args:
-         df (_type_): _description_
-         who (str, optional): 'all','student'. Which names to replace. Defaults to 'all'.
-     """
-     colnames = df.columns.tolist()
-     speaker_key = next((key for key in ['speaker','Speaker','speaker_id','Speaker_ID'] if key in colnames), None)
-     if not speaker_key:
-         raise ValueError('No speaker column found in dataframe!')
-     speakers = df[speaker_key].unique()
-     if who=='student':
-         # detect students. ID format can be student_xxx or 00-0000 numeric
-         speakers = [s for s in speakers if ('student' in s.lower() or re.match(r'^\d{2}-\d{4}$',s))]
-         generic_speakers = [f'student_{i+1}' for i in range(len(speakers))]
-     else:
-         generic_speakers = [f'speaker_{i+1}' for i in range(len(speakers))]
-     speaker_dict = dict(zip(speakers, generic_speakers))
-     df[speaker_key] = df[speaker_key].replace(speaker_dict)
-     return df
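The heart of the "+TM dropdown" output that convert_transcript_for_TM produced is openpyxl's DataValidation: a list-type validation attached to a whole column renders as an in-cell dropdown in Excel. A minimal standalone sketch of that technique (the filename and the abbreviated choice list are illustrative; the deleted code used the full talkmove lists shown above):

from openpyxl import Workbook
from openpyxl.worksheet.datavalidation import DataValidation

wb = Workbook()
ws = wb.active
ws.append(["Sentence", "Teacher_TM"])  # header row
ws.append(["How was your Halloween?", None])

# A list-type validation; formula1 is a quoted, comma-separated set of choices.
dv = DataValidation(type="list",
                    formula1='"1-None,2-Keep-Together,3-Restating"',
                    allow_blank=True)
ws.add_data_validation(dv)
dv.add("B2:B1048576")  # apply to the whole Teacher_TM column below the header

wb.save("tm_dropdown_demo.xlsx")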