rosyvs committed on
Commit
ff6eb07
·
1 Parent(s): 9bb8ff3

Add transcript processing application and utility functions from file_convertor, not yet integrated into app.

Browse files
Files changed (7) hide show
  1. .gitignore +7 -2
  2. Dockerfile +21 -3
  3. README.md +11 -0
  4. requirements.txt +2 -0
  5. setup.py +6 -0
  6. transcript_app.py +225 -0
  7. transcript_utils.py +745 -0
.gitignore CHANGED
@@ -1,5 +1,10 @@
1
  .DS_Store
2
  __pycache__/
3
- flagged/
4
  results*/
5
- logs/
 
 
 
 
 
 
 
1
  .DS_Store
2
  __pycache__/
 
3
  results*/
4
+ logs/
5
+ *.xlsx
6
+ *.log
7
+ *.csv
8
+ *.xls
9
+ flagged/
10
+ test.py
Dockerfile CHANGED
@@ -8,12 +8,28 @@ WORKDIR /app
8
  COPY requirements.txt .
9
 
10
  # Install Python dependencies without storing cache, for a smaller image
11
- RUN pip install --no-cache-dir -r requirements.txt
12
 
13
  # Update package lists and install FFmpeg for media processing
14
  RUN apt-get update && apt-get install -y ffmpeg
15
 
16
- # Set an environment variable for Matplotlib to store its configuration in /tmp
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  ENV MPLCONFIGDIR /tmp/matplotlib
18
 
19
  # Create and set permissions for result directories and logs inside the container
@@ -25,5 +41,7 @@ done
25
  # Copy all Python files from the current directory to the container
26
  COPY *.py .
27
 
 
 
28
  # Specify the command to run on container start
29
- CMD ["python", "app.py"]
 
8
  COPY requirements.txt .
9
 
10
  # Install Python dependencies without storing cache, for a smaller image
11
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt
12
 
13
  # Update package lists and install FFmpeg for media processing
14
  RUN apt-get update && apt-get install -y ffmpeg
15
 
16
+ RUN useradd -m -u 1000 user
17
+
18
+ # Switch to root user to change directory ownership
19
+ USER root
20
+ RUN mkdir -p /usr/share/nltk_data && chown -R user:user /usr/share/nltk_data
21
+
22
+ USER user
23
+ ENV HOME=/home/user \
24
+ PATH=/home/user/.local/bin:$PATH
25
+
26
+ # Set the working directory to the user's home directory
27
+ WORKDIR $HOME/app
28
+ # Copy the current directory contents into the container at $HOME/app setting the owner to the user
29
+ COPY --chown=user . $HOME/app
30
+
31
+ # Set environment variables for NLTK data and Matplotlib configuration
32
+ ENV NLTK_DATA /usr/share/nltk_data
33
  ENV MPLCONFIGDIR /tmp/matplotlib
34
 
35
  # Create and set permissions for result directories and logs inside the container
 
41
  # Copy all Python files from the current directory to the container
42
  COPY *.py .
43
 
44
+ RUN python setup.py
45
+
46
  # Specify the command to run on container start
47
+ CMD ["python", "app.py"]
README.md CHANGED
@@ -8,4 +8,15 @@ pinned: false
8
  license: mit
9
  ---
10
 
 
 
 
 
 
 
 
 
 
 
 
11
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
8
  license: mit
9
  ---
10
 
11
+ Various tools for transcribers.
12
+
13
+ converting media files
14
+
15
+
16
+ converting transcription files
17
+ - XLSX-->XLSX+TM: from xlsx to xlsx with TM annotation labels
18
+ - XLSX-->ELAN: from xlsx to ELAN-compatible TSV
19
+ - ELAN-->CSV: from ELAN output tsv to standardized transcript csv format (seg_labels)
20
+ - supports merging adjacent segments from the same speaker to reconstitute utterances
21
+
22
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
requirements.txt CHANGED
@@ -3,3 +3,5 @@ moviepy==1.0.3
3
  pandas==2.2.3
4
  xlrd==1.2.0
5
  numpy==2.2.5
 
 
 
3
  pandas==2.2.3
4
  xlrd==1.2.0
5
  numpy==2.2.5
6
+ nltk==3.5
7
+ openpyxl==3.0.10
setup.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ import nltk
2
+ import os
3
+ download_dir = os.path.expanduser('/usr/share/nltk_data/')
4
+ os.makedirs(name=download_dir, exist_ok=True)
5
+ nltk.download('punkt', download_dir=download_dir)
6
+ print(download_dir)
transcript_app.py ADDED
@@ -0,0 +1,225 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import threading
2
+ import os
3
+ import time
4
+ import pandas as pd
5
+
6
+ import gradio as gr
7
+ from utils import (HHMMSS_to_sec, molly_old_xlsx_to_table, convert_transcript_for_TM, convert_transcript_for_annotation,
8
+ table_to_ELAN_tsv, ELAN_to_labels_csv, old_xlsx_to_table, old_xlsx_to_labels_csv, deidentify_speaker)
9
+
10
+
11
def delete_files(output_filepath_list, trans_log_filepath_list, global_log_filepath):
    """Delete generated output files, per-transcript logs and the global log.

    Files that are already gone are skipped silently so the cleanup is
    idempotent. Falsy entries (e.g. ``None`` when no global log was produced)
    are ignored instead of being passed to ``os.remove``.

    Args:
        output_filepath_list: Paths of the converted output files.
        trans_log_filepath_list: Paths of the per-transcript log files.
        global_log_filepath: Path of the global transfer log.
    """
    # Same deletion order as before: outputs, then per-file logs, then the global log.
    all_paths = [*output_filepath_list, *trans_log_filepath_list, global_log_filepath]
    for path in all_paths:
        if not path:
            continue
        try:
            os.remove(path)
        except FileNotFoundError:
            # Already removed (or never written): nothing to clean up.
            pass
    print("Files deleted")
27
+
28
def delete_files_thread(output_filepath_list, trans_log_filepath_list, global_log_filepath, delay_sec=20):
    """Thread target: wait, then delete the generated files.

    Args:
        output_filepath_list: Paths of the converted output files.
        trans_log_filepath_list: Paths of the per-transcript log files.
        global_log_filepath: Path of the global transfer log.
        delay_sec: Seconds to wait before deleting. Defaults to 20, the
            previously hard-coded value, so existing callers are unaffected.
    """
    print("Thread started")
    time.sleep(delay_sec)
    delete_files(output_filepath_list, trans_log_filepath_list, global_log_filepath)
32
+
33
def convert_xlsx_to_TMxlsx(input_file_list):
    """Gradio handler: convert uploaded transcripts to TalkMoves-ready XLSX.

    Args:
        input_file_list: Uploaded files. Entries may be plain path strings or
            objects exposing the path as ``.name``; both are accepted so this
            handler agrees with the other handlers in this file, which treat
            uploads as path strings.

    Returns:
        tuple: ``(output_filepath_list, trans_log_filepath_list,
        global_transfer_log_path, error_check)`` where ``error_check`` is the
        list of messages from the converter, or ``"No errors found."`` when
        that list is empty.
    """
    # Fall back to the entry itself when it has no ``.name`` attribute.
    file_list = [getattr(file, "name", file) for file in input_file_list]
    (output_filepath_list,
     trans_log_filepath_list,
     error_check,
     global_transfer_log_path) = convert_transcript_for_TM(file_list=file_list)
    if not error_check:
        error_check = "No errors found."

    # Clean up the generated files in the background after a delay.
    delete_thread = threading.Thread(
        target=delete_files_thread,
        args=(output_filepath_list, trans_log_filepath_list, global_transfer_log_path),
    )
    delete_thread.start()

    return output_filepath_list, trans_log_filepath_list, global_transfer_log_path, error_check
44
+
45
def convert_for_annotation(input_file_list, annotation_scheme):
    """Gradio handler: convert each uploaded transcript to an annotation XLSX.

    Args:
        input_file_list: Paths of the transcript files to convert.
        annotation_scheme: Scheme passed through to
            ``convert_transcript_for_annotation`` for every file.

    Returns:
        list: One converter result per input file, in input order.
    """
    converted_paths = []
    for transcript_path in input_file_list:
        print("start converting transcript")
        xlsx_path = convert_transcript_for_annotation(
            file=transcript_path,
            annotation_scheme=annotation_scheme,
        )
        print("finished converting transcript to xlsx for annotation")
        converted_paths.append(xlsx_path)
    return converted_paths
53
+
54
+
55
def convert_xlsx_to_ELANtsv(input_file_list):
    """Gradio handler: convert transcripts to ELAN-compatible TSV files.

    The output path is the input path with its extension swapped for ``.tsv``.
    The extension is replaced with ``os.path.splitext`` rather than
    ``str.replace`` so that a non-``.xlsx`` upload (the widget also allows
    ``.csv``) does not end up writing the TSV to the input path, and so that
    an ``.xlsx`` substring earlier in the path is left alone.

    Args:
        input_file_list: Paths of the transcript files to convert.

    Returns:
        list: Whatever ``table_to_ELAN_tsv`` returns for each input, in order.
    """
    output_files = []
    for input_transcript in input_file_list:
        # convert transcript
        print("start converting transcript")
        table = old_xlsx_to_table(xl_file=input_transcript)
        print("finished converting transcript to table")
        output_transcript = os.path.splitext(input_transcript)[0] + '.tsv'
        output_file = table_to_ELAN_tsv(table, output_transcript)
        print("saved table to tsv")
        output_files.append(output_file)
    return output_files
67
+
68
+
69
+ #TODO: support sort and merge for XLSX output if this is needed
70
+
71
def convert_ELANtsv_to_CSV(input_file_list, merge_ellipsis=False):
    """Gradio handler: convert ELAN-exported TSV files to labels CSV files.

    The output path is decided by ``ELAN_to_labels_csv``; the previously
    computed but never used local output name has been removed.

    Args:
        input_file_list: Paths of the ELAN tab-separated files.
        merge_ellipsis: Forwarded as ``merge_segments`` to the converter.

    Returns:
        list: Whatever ``ELAN_to_labels_csv`` returns for each input, in order.
    """
    output_files = []
    for input_transcript in input_file_list:
        # convert transcript
        print("start converting transcript")
        output_file = ELAN_to_labels_csv(input_transcript, merge_segments=merge_ellipsis)
        print("finish converting transcript")
        output_files.append(output_file)
    return output_files
81
+
82
+ # TODO: XLSX to csv (seg_labels or utt_labels)
83
def convert_xlsx_to_csv(input_file_list, merge_ellipsis=False):
    """Gradio handler: convert old-format XLSX transcripts to labels CSV files.

    The output path is decided by ``old_xlsx_to_labels_csv``; the previously
    computed but never used local output name has been removed.

    Args:
        input_file_list: Paths of the XLSX transcripts.
        merge_ellipsis: Forwarded as ``merge_segments`` to the converter.

    Returns:
        list: Whatever ``old_xlsx_to_labels_csv`` returns for each input, in order.
    """
    return [
        old_xlsx_to_labels_csv(input_transcript, merge_segments=merge_ellipsis)
        for input_transcript in input_file_list
    ]
92
+
93
def deidentify_transcripts(input_file_list, who='student'):
    """Gradio handler: write de-identified copies of the uploaded transcripts.

    Each supported file is read into a DataFrame, passed through
    ``deidentify_speaker`` and written next to the input as
    ``<name>_deidentified.<ext>`` in the same format.

    Files with an unsupported extension are now skipped after the warning.
    Previously execution fell through and either raised ``NameError`` (first
    file) or re-processed the DataFrame left over from the previous file.

    Args:
        input_file_list: Paths of the transcript files.
        who: Forwarded to ``deidentify_speaker``.

    Returns:
        list: Paths of the de-identified files that were written.
    """
    output_files = []
    for file in input_file_list:
        basename = os.path.basename(file)
        # Split once so the suffix is only ever replaced at the end of the path.
        root, dot_ext = os.path.splitext(file)
        ext = dot_ext.lstrip('.')
        if ext in ('xlsx', 'xls'):
            df = pd.read_excel(file)
        elif ext == 'csv':
            df = pd.read_csv(file)
        elif ext in ('tsv', 'txt'):
            df = pd.read_csv(file, sep='\t')
        else:
            gr.Warning("File type not supported (must be .xlsx, .xls, .csv, .tsv, or .txt)")
            continue
        try:
            df = deidentify_speaker(df, who=who)
        except ValueError as e:
            gr.Warning(f"{e}: {basename} ")
            continue
        output_file = f"{root}_deidentified{dot_ext}"
        if ext in ('xlsx', 'xls'):
            df.to_excel(output_file, index=False)
        elif ext == 'csv':
            df.to_csv(output_file, index=False)
        else:
            # Only 'tsv' and 'txt' can reach this branch.
            df.to_csv(output_file, sep='\t', index=False)
        output_files.append(output_file)
    return output_files
122
+
123
# ---------------------------------------------------------------------------
# Gradio UI wiring. Each converter gets its own gr.Interface; all of them are
# combined into one tabbed app that is launched when this module is executed
# (the launch call at the bottom runs at import time as well).
# ---------------------------------------------------------------------------

# gr components for TM converter
input_xlsx = gr.Files(label="Input XLSX or CSV transcript file", type="filepath", file_types=[".xlsx", ".csv"])
output_xlsx_tm = gr.Files(label="Output XLSX file", type="filepath", file_types=[".xlsx"])
# NOTE(review): convert_xlsx_to_TMxlsx returns a *list* of log paths for this
# slot, while this is a single-file gr.File component - confirm this renders as intended.
process_log_tm = gr.File(label="Process Log", type="filepath", file_types=[".log", ".txt"] )
global_transfer_log_tm = gr.File(label="Global transfer log", type="filepath", file_types=[".log", ".txt"])
error_check_tm = gr.Textbox(label="Error Check", type="text")
interface_tm = gr.Interface(fn=convert_xlsx_to_TMxlsx,
                            inputs=input_xlsx,
                            outputs=[output_xlsx_tm, process_log_tm, global_transfer_log_tm, error_check_tm],
                            title="transcript-->XLSX+TM_dropdown",
                            description="Converts XLSX or csv transcript to XLSX+TM transcript with prefilled dropdown for talkmoves",
                            live=False,
                            allow_flagging="never",)

# gr components for xlsx to ELAN
input_x2e = gr.Files(label="Input XLSX or CSV transcript file", type="filepath", file_types=[".xlsx", ".csv"])
output_x2e = gr.Files(label="Output ELAN-compatible tsv file", type="filepath", file_types=[".tsv",'.txt'])
# Log/error outputs are not wired up for this tab (kept for reference):
# process_log_x2e = gr.File(label="Process Log", type="filepath", file_types=[".log", ".txt"] )
# global_transfer_log_x2e = gr.File(label="Global transfer log", type="filepath", file_types=[".log", ".txt"])
# error_check_x2e = gr.Textbox(label="Error Check", type="text")
interface_x2e = gr.Interface(fn=convert_xlsx_to_ELANtsv, # TODO: swap out for correct fn
                             inputs=input_x2e,
                             outputs=output_x2e,
                             title="XLSX-->ELAN",
                             description="Converts XLSX transcript to ELAN-compatible tsv file",
                             live=False,
                             allow_flagging="never",)

# gr components for ELAN to CSV
input_e2c = gr.Files(label="Input ELAN-compatible tsv file", type="filepath", file_types=[".tsv",'.txt'])
# Checkbox value is passed as the handler's merge_ellipsis argument.
merge_e2c = gr.Checkbox(label="Merge segments on ellipsis?")
output_e2c = gr.Files(label="Output CSV file", type="filepath", file_types=[".csv"])
interface_e2c = gr.Interface(fn=convert_ELANtsv_to_CSV, # TODO: swap out for correct fn
                             inputs=[input_e2c, merge_e2c],
                             outputs=[output_e2c],
                             title="ELAN-->CSV",
                             description="Converts ELAN-exported file (.txt or .tsv, tab separated values) to standardized CSV file with rows sorted by segment start time. Optionally merges segments on ellipsis.",
                             live=False,
                             allow_flagging="never",)

# gr components for XLSX to CSV
input_x2c = gr.Files(label="Input XLSX file", type="filepath", file_types=[".xlsx", ".csv"])
# Checkbox value is passed as the handler's merge_ellipsis argument.
merge_x2c = gr.Checkbox(label="Merge segments on ellipsis?")
output_x2c = gr.Files(label="Output CSV file", type="filepath", file_types=[".csv"])
interface_x2c = gr.Interface(fn=convert_xlsx_to_csv, # TODO: swap out for correct fn
                             inputs=[input_x2c, merge_x2c],
                             outputs=[output_x2c],
                             title="XLSX-->CSV",
                             description="Converts old version XLSX transcript (with a single Timecode column) to standardized CSV file with rows sorted by segment start time. Optionally merges segments on ellipsis.",
                             live=False,
                             allow_flagging="never",)

# gr components for annotation XLSX
input_c2a = gr.Files(label="Input CSV file", type="filepath", file_types=[".csv"])
# Radio choices are (label shown in the UI, value passed to the handler) pairs.
annotation_scheme_c2a = gr.Radio(label="Annotation Scheme", choices=[("CPS","CPS"), ("TalkMove","TM"),("None",None)])

output_c2a = gr.Files(label="Output XLSX file", type="filepath", file_types=[".xlsx"])
interface_c2a = gr.Interface(
    fn=convert_for_annotation, # TODO: swap out for correct fn
    inputs=[input_c2a, annotation_scheme_c2a],
    outputs=[output_c2a],
    title="CSV-->XLSX+annotation",
    description="Converts CSV file to XLSX file for annotation (added columns for CPS or TM or None)",
    live=False,
    allow_flagging="never",
    # submit_btn="Convert"
)

# gr components for deidentification
input_di = gr.Files(label="Input transcript file", type="filepath", file_types=[".xlsx", ".xls",".csv", ".tsv", ".txt"])
# Radio value is passed as the handler's ``who`` argument.
who_di = gr.Radio(label="Who to deidentify", choices=[("student","student"), ("all","all")])
output_di = gr.Files(label="Output deidentified transcript file", type="filepath", file_types=[".xlsx", ".xls",".csv", ".tsv", ".txt"])
interface_di = gr.Interface(
    fn=deidentify_transcripts,
    inputs=[input_di, who_di],
    outputs=[output_di],
    title="Deidentify",
    description="Deidentify speaker labels in a transcript. Compatible with .xlsx, .xls, .csv, .tsv, .txt files with a column containing speaker labels. Will not work if speaker column is missing a header. Speaker names or IDs will be replaced with a deidentified label numbered in order of appearance. Choose whether to deidentify just students or all speakers.",
    live=False,
    allow_flagging="never",
)

# Combine the interfaces into tabs; the two lists below are positional, so the
# n-th tab name labels the n-th interface.
tab_interface = gr.TabbedInterface(
    [
        interface_e2c,
        interface_c2a,
        interface_x2e,
        interface_x2c,
        interface_tm,
        interface_di
    ]
    ,
    ["ELAN→CSV",
     "CSV→XLSX+annotation",
     "XLSX→ELAN",
     "XLSX→CSV",
     "transcript→XLSX+TM_dropdown",
     "Deidentify"
     ]
)
# TODO: XLSX to csv (seg_labels or utt_labels)
# TODO: XLSX to merged on ellipsis, keep XLSX format
# Listen on all interfaces, port 7860.
tab_interface.launch(server_name="0.0.0.0", server_port=7860)
transcript_utils.py ADDED
@@ -0,0 +1,745 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import math
3
+ import os
4
+ import re
5
+ import csv
6
+ from pathlib import Path
7
+ import gradio as gr
8
+ import nltk
9
+ import pandas as pd
10
+ from nltk.tokenize import sent_tokenize
11
+ from openpyxl import Workbook
12
+ from openpyxl.utils.dataframe import dataframe_to_rows
13
+ from openpyxl.worksheet.datavalidation import DataValidation
14
+ from pandas._libs.tslibs import timestamps
15
+
16
+
17
def convert_transcript_for_TM(file_list):
    """Convert transcripts into sentence-level XLSX sheets for TalkMoves annotation.

    Input can be an xlsx or csv transcript file. It may have separate start
    and end time columns (converted with ``sec_to_HHMMSS`` into a single
    ``"<start> - <end>"`` timecode) or a single timecode/timestamp column.
    Every turn is split into sentences; sentences that consist only of an
    inaudible-style bracket tag are dropped according to the context-switch
    rules below. For each input an ``*_TMcoded.xlsx`` workbook with dropdown
    validation on the Teacher_TM / Student_TM columns and a ``.log`` file are
    written next to the input, plus one ``global_transfer.log`` in the
    directory of the last input.

    Fixes relative to the previous version:
      * ``bracket_re`` used a character class (``[UI|ui|...]``), which matches
        one character out of a set, where a group of alternative tags was
        intended; both bracket patterns also used a greedy ``.*`` that could
        span from the first ``[`` to the last ``]`` of a sentence.
      * ``error_bad_lines`` no longer exists in pandas 2.x; ``on_bad_lines`` is
        used instead.
      * Empty speaker cells arrive as NaN, so they are detected with
        ``pd.isna`` in addition to the empty string.
      * The sentence loop no longer reuses the row loop's index variable.
      * No division by zero when every sentence of a file is skipped.
      * An empty ``file_list`` or missing required columns raise ``gr.Error``
        instead of ``NameError`` / ``KeyError``.
      * The workbook is written once (the earlier ``to_excel`` call was
        immediately overwritten by ``wb.save``).

    Args:
        file_list: Paths of the transcript files to convert.

    Raises:
        gr.Error: If ``file_list`` is empty, a file is neither .xlsx nor .csv,
            a required column cannot be found, or the first turn has no speaker.

    Returns:
        tuple: ``(output_filepath_list, trans_log_filepath_list, error_message,
        global_log_filepath)``.
    """
    if not file_list:
        raise gr.Error("No transcript files were provided")

    # Bracket tags that mark inaudible / non-speech content, e.g. "[inaudible 00:01:02]".
    bracket_re = re.compile(
        r'(?:\[(?:UI|ui|Inaudible|inaudible|overlapping speech|VIDEO SILENCE'
        r'|teacher explaining in background)[^\]]*\]\W{0,2})'
    )
    # Anything enclosed in a single pair of square brackets.
    all_bracket_re = re.compile(r'(?:\[[^\]]*\]\W{0,2})')
    # whether remove the inaudible
    do_remove_inaudible = True
    # whether_keep_context_switch
    do_keep_context_switch = True
    # whether_convert_to_timestamp if start and end time are in seconds and in separate columns
    convert_to_timestamp = True

    error_message = []  # List of error messages to be displayed to the user.
    global_stat_dict = {}  # Dictionary of global statistics.
    output_filepath_list = []  # List of output file paths.
    trans_log_filepath_list = []  # List of transcription log file paths.
    for file in file_list:
        filename = os.path.basename(file)
        filepath = os.path.dirname(file)
        # Read the file into a Pandas DataFrame depending on its file format.
        if filename.endswith('.xlsx'):
            df = pd.read_excel(file, index_col=0)
            output_filename = f"{filename[:-5]}" + "_TMcoded.xlsx"
        elif filename.endswith('.csv'):
            df = pd.read_csv(file, index_col=0, on_bad_lines='skip')
            output_filename = f"{filename[:-4]}" + "_TMcoded.xlsx"
        else:
            raise gr.Error(f"{file} format is wrong")

        # Remove the "Copy of" prefix from the output filename, if present.
        if output_filename.startswith("Copy of "):
            output_filename = output_filename[8:]

        # Remove the word "_Transcript" from the output filename, if present.
        if '_Transcript' in output_filename:
            error_message.append("before: "+output_filename)
            output_filename = ''.join(output_filename.split('_Transcript'))
            error_message.append("after: "+output_filename)

        # Construct the output file and transcription log file paths.
        output_filepath = os.path.join(filepath, output_filename)
        trans_log_filepath = os.path.join(filepath, f"{output_filename}"+ ".log")

        # Open the transcription log file for writing.
        with open(trans_log_filepath, "w") as outfile:
            sub_cnt_in_file = 0
            empty_speaker_cnt_in_file = 0
            turn_skipped_in_file = 0
            turn_skipped_speaker_switch_in_file = 0
            snt_mark_skip_in_file = 0
            snt_skipped_in_file = 0
            chat_flag_in_speaker_time_line = 0
            chat_flag_in_content_line = 0
            all_inaudible_in_file = 0
            all_bracket_in_file = 0
            all_snts_in_file = 0
            all_token_cnt_in_file = 0
            #index Timecode Duration Speaker Dialogue Annotations Error Type
            #1 00:00:05:04 - 00:00:07:12 00:00:02:08 Tutor Did you... How was your Halloween?
            turns = []
            time_stamps = []
            speakers = []
            sentences = []
            snt_ids = []

            ## parse the df flexibly: find key column names which might vary depending on transcript source
            # set all column names to lowercase (str() guards against non-string headers)
            df.columns = [str(col).lower() for col in df.columns]
            # several possibilities for column names, detect which are present
            uttID_keys = ['utt','seg','utt_id','seg_id','index']
            speaker_keys = ['speaker']
            start_keys=['start_sec','start','start_time','timestart']
            end_keys=['end_sec','end','end_time','timeend']
            timestamp_keys = ['timecode','timestamp']
            content_keys=['dialogue','utterance','transcript','text']
            # detect which is used in this df
            uttID_key = next((key for key in uttID_keys if key in df.columns), None)
            speaker_key = next((key for key in speaker_keys if key in df.columns), None)
            content_key = next((key for key in content_keys if key in df.columns), None)
            timestamp_key = None
            # check if separate start and end times are present, otherwise assume single timecode column
            if any(df.columns.isin(start_keys)):
                start_key = next((key for key in start_keys if key in df.columns), None)
                end_key = next((key for key in end_keys if key in df.columns), None)
                if end_key is None:
                    raise gr.Error(f"{filename}: found a start time column but no end time column")
                if convert_to_timestamp:
                    # convert to a single "<start> - <end>" timecode string
                    df['timecode'] = df.apply(lambda x: f"{sec_to_HHMMSS(x[start_key])} - {sec_to_HHMMSS(x[end_key])}", axis=1)
                    timestamp_key='timecode'
            else:
                timestamp_key=next((key for key in timestamp_keys if key in df.columns), None)

            # Fail with a readable message instead of a KeyError on row[None].
            missing = [
                label
                for label, key in (("speaker", speaker_key), ("text", content_key), ("time", timestamp_key))
                if key is None
            ]
            if missing:
                raise gr.Error(f"{filename}: could not find required column(s): {', '.join(missing)}")

            # Turn started with 1, the same as molly's transcripts
            for row_idx, row in df.iterrows():
                turn = row[uttID_key] if uttID_key else row_idx+1
                speaker = row[speaker_key]
                time_str = row[timestamp_key]
                raw_content = row[content_key]
                content = "" if pd.isna(raw_content) else str(raw_content).strip("\n")
                # when speaker is empty (NaN or blank), use the previous speaker
                if pd.isna(speaker) or str(speaker).strip() == "":
                    if speakers:
                        speaker = speakers[-1]
                        empty_speaker_cnt_in_file += 1
                        outfile.write(f"{turn}: found empty speaker, use the speaker in previous turn: {speaker}\n")
                    else:
                        raise gr.Error(f"{row}, the first turn is empty speaker")

                # clean after the sentence tokenize
                snts = sent_tokenize(content)
                all_snts_in_file += len(snts)
                snt_skipped_in_turn = 0
                for snt_idx, snt in enumerate(snts):
                    remove_flag = False
                    inaudible_search = bracket_re.findall(snt)
                    if inaudible_search:
                        all_inaudible_in_file += len(inaudible_search)
                        outfile.write(f"{turn}, {inaudible_search}, inaudible found in snt: {snt}\n")

                    all_bracket_search = all_bracket_re.findall(snt)
                    if all_bracket_search:
                        all_bracket_in_file += len(all_bracket_search)
                        outfile.write(f"{turn}, {all_bracket_search} bracket found in snt: {snt}\n")

                    # only remove the [inaudible xxx] when it is the whole sentence.
                    if bracket_re.fullmatch(snt):
                        is_last_snt = snt_idx == len(snts)-1
                        if not do_keep_context_switch:
                            # not keeping context switches: remove every empty utterance
                            remove_flag = True
                        elif speakers and speaker == speakers[-1]:
                            # same speaker as the previous kept sentence: no context switch, remove
                            remove_flag = True
                        elif len(snts) == 1:
                            # speaker switch and this is the turn's only sentence: keep it
                            remove_flag = False
                        elif not is_last_snt:
                            # empty utterance that is not the last one of the turn: remove
                            remove_flag = True
                        else:
                            # last sentence: keep it only if every previous sentence of the
                            # turn was skipped, so the whole turn does not disappear
                            remove_flag = snt_skipped_in_turn != len(snts)-1

                    if remove_flag:
                        sub_cnt_in_file += 1
                        snt_mark_skip_in_file += 1
                        outfile.write(f"{turn}, sub happend: {snt}, skip this sentence\n")
                        if do_remove_inaudible:
                            snt_skipped_in_file += 1
                            snt_skipped_in_turn += 1
                            continue

                    # Record the kept sentence.
                    turns.append(turn)
                    snt_id = f"{turn}.{snt_idx}"
                    time_stamps.append(time_str)
                    speakers.append(speaker)
                    sentence = str(snt).strip().rstrip("\n")
                    token_cnt = len(nltk.word_tokenize(sentence))
                    all_token_cnt_in_file += token_cnt
                    snt_ids.append(snt_id)
                    sentences.append(sentence)

                if snt_skipped_in_turn == len(snts):
                    # all snts in turn are skipped, then skip the turn
                    turn_skipped_in_file += 1
                    if (speakers and speaker != speakers[-1]) or not speakers:
                        turn_skipped_speaker_switch_in_file += 1
                    outfile.write(f"{turn}, since all snts are empty, skip this whole turn {content}\n")

            # Create a new DataFrame with the following columns:
            new_df = pd.DataFrame({
                "Sentence_ID": snt_ids,  # A
                "TimeStamp": time_stamps,  # B
                "Turn": turns,  # C
                "Speaker": speakers,  # D
                "Sentence": sentences  # E
            })

            # assert turn_skipped_speaker_switch_in_file==0, "Some speaker switch turn skipped"
            new_df["Teacher_TM"] = None  # F
            new_df["Student_TM"] = None  # G

            # Write the workbook with openpyxl so dropdown validation can be attached.
            # https://openpyxl.readthedocs.io/en/latest/api/openpyxl.utils.dataframe.html#openpyxl.utils.dataframe.dataframe_to_rows
            wb = Workbook()
            ws = wb.active
            teacher_dv = DataValidation(type="list", formula1='",1-None,2-Keep-Together,3-Getting-Student-to-Relate,4-Restating,5-Revoicing,6-Context,7-Press-for-Accuracy,8-Press-for-Reasoning"', allow_blank=True)
            student_dv = DataValidation(type="list", formula1='",1-None,2-Relate-to-Another-Student,3-Asking-for-More-info,4-Making-a-Claim,5-Providing-Evidence/Reasoning"', allow_blank=True)
            ws.add_data_validation(teacher_dv)
            ws.add_data_validation(student_dv)
            # Columns F and G hold Teacher_TM and Student_TM (row 1 is the header).
            teacher_dv.add('F2:F1048576')
            student_dv.add('G2:G1048576')
            for r in dataframe_to_rows(new_df, index=False, header=True):
                ws.append(r)
            wb.save(output_filepath)

            remaining_snt = all_snts_in_file - snt_skipped_in_file
            stat_dict = {
                "chat_flag_in_speaker_time_line": chat_flag_in_speaker_time_line,
                "chat_flag_in_content_line": chat_flag_in_content_line,
                "empty_speaker_cnt_in_file": empty_speaker_cnt_in_file,
                "ori_total_turn": df.shape[0],
                "ori_total_snt": all_snts_in_file,
                "turn_skipped": turn_skipped_in_file,
                "turn_skipped_speaker_switch_in_file": turn_skipped_speaker_switch_in_file,
                "snt_skipped": snt_skipped_in_file,
                "remaining_snt": remaining_snt,
                "all_token_cnt_in_file": all_token_cnt_in_file,
                # Guard against files in which every sentence was skipped.
                "avg_token_cnt_per_snt": all_token_cnt_in_file/remaining_snt if remaining_snt else 0,
                "sub_cnt_in_file": sub_cnt_in_file,
                "all_inaudible_in_file": all_inaudible_in_file,
                "all_bracket_in_file": all_bracket_in_file,
                "other_bracket_in_file": all_bracket_in_file - all_inaudible_in_file
            }
            if all_inaudible_in_file != all_bracket_in_file:
                error_message.append(f"Warning: {filename} has special brakets")
            for k, v in stat_dict.items():
                global_stat_dict[k] = global_stat_dict.get(k,0) + v
            outfile.write(f"{output_filepath}, {json.dumps(stat_dict, indent=4)}")

        output_filepath_list.append(output_filepath)
        trans_log_filepath_list.append(trans_log_filepath)

    # Per-file averages were summed above; turn them into a mean over files.
    for k in global_stat_dict:
        if "avg" in k:
            global_stat_dict[k] = global_stat_dict[k]/len(file_list)
    # The global log is written next to the last processed input file.
    global_log_filepath = os.path.join(filepath, "global_transfer"+ ".log")
    with open(global_log_filepath, "w") as outfile:
        outfile.write(f"global_stat_dict: {json.dumps(global_stat_dict, indent=4)}")

    # error_check
    if global_stat_dict["all_inaudible_in_file"] != global_stat_dict["all_bracket_in_file"]:
        error_message.append("Error: 'all_inaudible_in_file' does not match 'all_bracket_in_file'")
    if global_stat_dict["other_bracket_in_file"] != 0:
        error_message.append("Error: 'other_bracket_in_file' is not zero")

    return output_filepath_list, trans_log_filepath_list, error_message, global_log_filepath
299
+
300
def add_CPS_columns(df):
    """Append the empty CPS annotation columns to ``df`` and return it.

    The columns are added in place, in the order listed below, each filled
    with the empty string. The same DataFrame object is returned.
    """
    cps_columns = (
        'Observation',
        'Instructions',
        'CONST_SharesU_Situation',
        'CONST_SharesU_CorrectSolutions',
        'CONST_SharesU_IncorrectSolutions',
        'CONST_EstablishesCG_Confirms',
        'CONST_EstablishesCG_Interrupts',
        'NEG_Responds_Reasons',
        'NEG_Responds_QuestionsOthers',
        'NEG_Responds_Responds',
        'MAINTAIN_Initiative_Criticizes',
        'NEG_MonitorsE_Results',
        'NEG_MonitorsE_GivingUp',
        'NEG_MonitorsE_Strategizes',
        'NEG_MonitorsE_Save',
        'MAINTAIN_Initiative_Suggestions',
        'MAINTAIN_Initiative_Compliments',
        'MAINTAIN_FulfillsR_InitiatesOffTopic',
        'MAINTAIN_FulfillsR_JoinsOffTopic',
        'MAINTAIN_FulfillsR_Support',
        'MAINTAIN_FulfillsR_Apologizes',
        'Notes',
    )
    # Assign one column at a time so they land at the end of df in this order.
    for column_name in cps_columns:
        df[column_name] = ''
    return df
307
+
308
def add_TM_columns(df):
    """Append the empty TalkMoves annotation columns to ``df`` and return it.

    ``Teacher_TM`` and ``Student_TM`` are added in place, in that order, each
    filled with the empty string. The same DataFrame object is returned.
    """
    tm_columns = ('Teacher_TM', 'Student_TM')
    # Assign one column at a time so they land at the end of df in this order.
    for column_name in tm_columns:
        df[column_name] = ''
    return df
314
+
315
def convert_transcript_for_annotation(file, annotation_scheme=None):
    """Convert a standard csv transcript into an XLSX sheet for annotation.

    The transcript is parsed with ``parse_label_csv``. ``TimeStart`` and
    ``TimeEnd`` are derived from ``start_sec`` / ``end_sec`` with
    ``sec_to_HHMMSS``, and ``recordingID`` is filled with the result of
    ``get_sessname_from_filename`` for the transcript's base name. The sheet
    is written next to the input file.

    Args:
        file: Path of the csv transcript.
        annotation_scheme: ``'CPS'`` adds the CPS columns and prefixes the
            output name with ``CPS_``; ``'TM'`` adds the TalkMoves columns and
            prefixes it with ``TM_``; any other value adds no columns and no
            prefix.

    Raises:
        gr.Error: If any step fails. The original exception is chained as the
            cause so the traceback is preserved.

    Returns:
        str: Path of the written XLSX file.
    """
    # The extension is not needed; only the base name feeds the output filename.
    filename, _ = os.path.splitext(os.path.basename(file))
    filepath = os.path.dirname(file)
    try:
        table = parse_label_csv(file)
        media_filename = get_sessname_from_filename(filename)
        out_df = table.copy()
        out_df['recordingID'] = media_filename
        out_df['TimeStart'] = out_df['start_sec'].apply(sec_to_HHMMSS)
        out_df['TimeEnd'] = out_df['end_sec'].apply(sec_to_HHMMSS)
        out_df = out_df[['speaker', 'TimeStart', 'TimeEnd', 'utterance', 'recordingID', 'uttID']]
        if annotation_scheme == 'CPS':
            out_df = add_CPS_columns(out_df)
            prefix = "CPS_"
        elif annotation_scheme == 'TM':
            out_df = add_TM_columns(out_df)
            prefix = "TM_"
        else:
            prefix = ""
        output_file = os.path.join(filepath, f"{prefix}{filename}.xlsx")
        out_df.to_excel(output_file, index=False)
        return output_file
    except Exception as e:
        raise gr.Error(f"{filename}: error {e}") from e
347
+
348
def HHMMSS_to_sec(time_str):
    """Convert a timestamp to seconds.

    Supported inputs:
        * ``HH:MM:SS[.sss]``
        * ``MM:SS[.sss]`` (hours assumed to be 0)
        * ``HH:MM:SS:f`` where the trailing colon-delimited field has 1, 2 or
          3 digits and is read as tenths, hundredths or thousandths of a
          second respectively
        * a plain number of seconds, either as a string or as an int/float

    Args:
        time_str: the timestamp. Non-string, non-numeric values are converted
            with ``str()`` before parsing.

    Returns:
        Seconds as a float, or ``None`` when the input is missing (``None``,
        NaN, empty string) or is not in a supported format.

    Raises:
        ValueError: if a colon-delimited field is not numeric.
    """
    if time_str is None:
        return None
    if isinstance(time_str, (int, float)):
        # Already numeric (e.g. a cell pandas parsed as a number). NaN is the
        # only float that differs from itself and stands for a missing value.
        if time_str != time_str:
            return None
        return float(time_str)

    time_str = str(time_str).strip()
    if not time_str:
        return None

    n_colons = time_str.count(':')
    if n_colons == 2:
        h, m, s = time_str.split(':')
    elif n_colons == 3:
        # weird timestamps where there is a field following seconds delimited by colon
        h, m, s, u = time_str.split(':')
        # the number of digits tells whether the field is tenths, hundredths or thousandths
        if len(u) == 1:
            print('Weird time format detected - HH:MM:SS:tenths - please verify this is how you want the time interpreted')
            ms = float(u) / 10
        elif len(u) == 2:  # hundredths
            ms = float(u) / 100
        elif len(u) == 3:  # thousandths
            ms = float(u) / 1000
        else:
            print(f'input string format not supported: {time_str}')
            return None
        s = int(s) + ms
    elif n_colons == 1:
        # missing HH from timestamp, assuming MM:SS
        m, s = time_str.split(':')
        h = 0
    else:
        try:
            return float(time_str)  # maybe it's already in seconds!
        except ValueError as e:
            # Report the problem instead of building an exception object that
            # is never raised; callers still receive None for bad input.
            print(f"Error converting time to seconds: {e}")
            return None
    return int(h) * 3600 + int(m) * 60 + float(s)
381
+
382
+
383
def sec_to_HHMMSS(seconds):
    """Format a number of seconds as an ``HH:MM:SS.sss`` timestamp string.

    The value is rounded to whole milliseconds *before* it is split into
    hours, minutes and seconds, so a value such as 59.9996 becomes
    ``00:01:00.000`` rather than ``00:00:60.000``.

    Args:
        seconds: anything accepted by ``float()``.

    Returns:
        The timestamp string. Hours are zero-padded to at least two digits;
        negative inputs are rendered with a leading ``-``.
    """
    total_ms = round(float(seconds) * 1000)
    sign = '-' if total_ms < 0 else ''
    total_s, ms = divmod(abs(total_ms), 1000)
    m, s = divmod(total_s, 60)
    h, m = divmod(m, 60)
    return f"{sign}{h:02d}:{m:02d}:{s:02d}.{ms:03d}"
391
+
392
def molly_old_xlsx_to_table(xl_file):  # TODO: check against isatasr
    """Read a contractor transcription xlsx into a transcript table.

    Contractor transcribers provide an xlsx with the following columns:
        #: int (utterance index)
        Timecode: "HH:MM:SS:ss - HH:MM:SS:ss"
        Duration: HH:MM:SS:ss
        Speaker: str
        Dialogue: str
        Annotations: blank
        Error Type: blank

    Only the first sheet is read. The Timecode column is split on ``-`` into
    start and end times, which are converted with ``HHMMSS_to_sec``.

    Returns:
        DataFrame with columns uttID, speaker, transcript, start_sec, end_sec.
    """
    with pd.ExcelFile(xl_file) as xls:
        first_sheet = xls.sheet_names[0]
        table = pd.DataFrame(pd.read_excel(xls, first_sheet))

    table[['start_time', 'end_time']] = table['Timecode'].str.split('-', expand=True)
    start_clean = table['start_time'].str.strip()
    end_clean = table['end_time'].str.strip()
    table['start_sec'] = start_clean.apply(HHMMSS_to_sec)
    table['end_sec'] = end_clean.apply(HHMMSS_to_sec)

    table = table.drop(columns=['Annotations', 'Error Type', 'Duration'])
    table = table[['#', 'Speaker', 'Dialogue', 'start_sec', 'end_sec']]
    return table.rename(
        columns={'#': 'uttID', 'Speaker': 'speaker', 'Dialogue': 'transcript'}
    )
412
+
413
def old_xlsx_to_table(xl_file):  # TODO: check against isatasr
    """Read a single-timecode transcription xlsx into a transcript table.

    The first sheet is read and its column names are lower-cased. The
    ``timecode`` column (``"<start> - <end>"``) is split into start and end
    times, which are converted to seconds with ``HHMMSS_to_sec``. Rows are
    sorted by start time.

    Args:
        xl_file: path to the xlsx file.

    Returns:
        DataFrame with columns index, uttID, speaker, transcript, start_sec,
        end_sec.

    Raises:
        gr.Error: wraps any exception raised while reading or converting.
            Previously the error object was created but never raised, so a
            failure made the function silently return ``None``.
    """
    try:
        # read the first sheet of the Excel file into a DataFrame
        print(f'...reading {xl_file}...')
        table = pd.read_excel(xl_file, sheet_name=0)
        print(f'...done reading {xl_file}...')

        # convert column names to lowercase; str() guards against
        # non-string column labels such as numbers
        table.columns = [str(col).lower() for col in table.columns]

        # extract start and end time from the Timecode column
        print('...splitting Timecode column into start and end time...')
        timecodes = table['timecode'].str.split(' - ', expand=True)
        table['start_time'] = timecodes[0]
        table['end_time'] = timecodes[1]
        print('...done splitting Timecode column into start and end time...')

        # convert start and end time to seconds using the HHMMSS_to_sec function
        print('...converting start and end time to seconds...')
        table['start_sec'] = table['start_time'].apply(HHMMSS_to_sec)
        table['end_sec'] = table['end_time'].apply(HHMMSS_to_sec)
        print('...done converting start and end time to seconds...')

        # drop unnecessary columns
        print('...dropping unnecessary columns...')
        table = table.drop(columns=['timecode', 'annotations', 'error type', 'duration'])

        # rename columns
        print('...renaming columns...')
        table = table.rename(columns={'#': 'uttID', 'speaker': 'speaker', 'dialogue': 'transcript'})

        # reorder columns
        print('...reordering columns...')
        table = table[['uttID', 'speaker', 'transcript', 'start_sec', 'end_sec']]

        table = table.sort_values(by='start_sec', ignore_index=True)
        # NOTE(review): this adds an extra 'index' column (6 columns total),
        # while merge_ellipsis in this file only accepts 4 or 5 columns.
        # Kept to preserve the returned shape - confirm whether it is intended.
        table = table.reset_index()

        return table
    except Exception as e:
        raise gr.Error(f'Error converting {xl_file}: {e}') from e
454
+
455
def table_to_ELAN_tsv(table: pd.DataFrame, path: str):  # TODO: check against isatasr
    """Write ``table`` as a tab-separated file compatible with ELAN import.

    Floats are written with three decimals and the index is omitted.

    Returns:
        ``path``, unchanged.
    """
    table.to_csv(
        path,
        sep='\t',
        float_format='%.3f',
        index=False,
    )
    return path
459
+
460
def table_to_labels_csv(table: pd.DataFrame, path: str):
    """Write ``table`` in the utt_labels csv format used by rosy's isatasr lib.

    Empty strings are treated as missing values, and rows in which both
    ``speaker`` and ``utterance`` are missing are dropped before writing.
    Floats are written with three decimals and the index is omitted.

    Returns:
        ``path``, unchanged.
    """
    cleaned = table.replace('', np.nan)
    # how='all': a row is dropped only when speaker AND utterance are both missing
    cleaned = cleaned.dropna(how='all', subset=['speaker', 'utterance'])
    cleaned.to_csv(path, float_format='%.3f', index=False)
    return path
465
+
466
def _unpack_ELAN_row(utt, fmt, announce, file):
    """Pick (speaker, start, end, utterance) out of one ELAN row by column count."""
    n_cols = len(utt)
    if n_cols == 5:  # IF data comes straight from ELAN sometimes there is a superfluous blank column
        if announce:
            print('detected extra blank column in first row, will remove')
        if fmt == 'AUG23':
            if announce:
                print('detected extra blank 1st column, will remove')
            _, speaker, start_HHMMSS, end_HHMMSS, utterance = utt
        else:
            if announce:
                print('detected extra blank 2nd column, will remove')
            speaker, _, start_HHMMSS, end_HHMMSS, utterance = utt
    elif n_cols == 4:  # sometimes the blank col is already removed
        if announce:
            print('detected 4 columns, assuming: speaker,start_HHMMSS, end_HHMMSS, utterance ')
        speaker, start_HHMMSS, end_HHMMSS, utterance = utt
    elif n_cols == 6:  # New one from 2023 Aug has a redundant extra start col!?
        if announce:
            print('detected 6 columns, assuming: _,speaker,start_HHMMSS, end_HHMMSS, utterance,_ ')
        _, speaker, start_HHMMSS, end_HHMMSS, utterance, _ = utt
    elif n_cols == 9:  # 2023 transcribers tend to give full elan output
        if announce:
            print('detected 9 columns, assuming: speaker,_,start_HHMMSS,_,end_HHMMSS,_,_,_,utterance ')
        speaker, _, start_HHMMSS, _, end_HHMMSS, _, _, _, utterance = utt
    elif n_cols == 10:  # sometimes an extra blank column appears at the end
        if announce:
            print('detected 10 columns, assuming: speaker,_,start_HHMMSS,_,end_HHMMSS,_,_,_,utterance,_ ')
        speaker, _, start_HHMMSS, _, end_HHMMSS, _, _, _, utterance, _ = utt
    elif n_cols == 12:  # Wow how many redundant columns can ELAN make...
        if announce:
            print('detected 12 columns, assuming: speaker,_,start_HHMMSS,_,_,end_HHMMSS,_,_,_,_,_,utterance ')
        speaker, _, start_HHMMSS, _, _, end_HHMMSS, _, _, _, _, _, utterance = utt
    else:
        raise ValueError(f'Unknown transcript format with {len(utt)} columns for {file}')
    return speaker, start_HHMMSS, end_HHMMSS, utterance


def readELANtsv(file, fmt=None):
    """Read an ELAN tab-delimited export into a unified segment table.

    Leading rows with fewer than 4 columns are treated as header rows and
    skipped. Each remaining non-blank row is interpreted according to its
    number of columns (4, 5, 6, 9, 10 or 12); timestamps are converted to
    seconds with ``HHMMSS_to_sec``.

    Args:
        file: path to the tab-delimited ELAN export.
        fmt: ``'AUG23'`` selects the 5-column layout whose blank column comes
            first; otherwise the blank column is assumed to be the second.

    Returns:
        DataFrame with columns seg, speaker, utterance, start_sec, end_sec,
        sorted by start_sec, where seg is the row number after sorting.

    Raises:
        ValueError: if no row has at least 4 columns, or a row has an
            unsupported number of columns.
    """
    # newline='' lets the csv module do its own line-ending handling
    with open(file, newline='') as in_file:
        rows = list(csv.reader(in_file, delimiter="\t"))

    # 4 being the min number of cols ELAN exports have. Rows are sliced by
    # position here; seeking the file to a character offset equal to the
    # number of skipped rows dropped data rows when header lines were short.
    skiprows = next((ix for ix, row in enumerate(rows) if len(row) >= 4), None)
    if skiprows is None:
        raise ValueError(f'No row with at least 4 tab-separated columns found in {file}')
    if skiprows > 0:
        print(f'Detected {skiprows} header rows to skip')

    labels = []  # transcript with speaker labels and timestamp in sec
    for i, utt in enumerate(rows[skiprows:]):
        if not ''.join(utt).strip():  # skip blank lines
            continue
        try:
            speaker, start_HHMMSS, end_HHMMSS, utterance = _unpack_ELAN_row(
                utt, fmt, announce=(i == 0), file=file
            )
        except Exception:
            print(f'!!! transcript parse error on line {i} for {file}')
            print(utt)
            raise
        start_sec = HHMMSS_to_sec(start_HHMMSS)
        end_sec = HHMMSS_to_sec(end_HHMMSS)
        labels.append((speaker, utterance, start_sec, end_sec))

    labels = pd.DataFrame(labels, columns=('speaker', 'utterance', 'start_sec', 'end_sec'))
    labels.sort_values(by='start_sec', inplace=True, ignore_index=True)
    labels.reset_index(inplace=True)
    labels = labels.rename(columns={'index': 'seg'})

    return labels
547
+
548
+
549
def merge_ellipsis(seg_labels):
    """Merge consecutive segments that were split with an ellipsis.

    Two consecutive rows are merged when they have the same speaker, the
    first utterance ends with ``...`` and the next one starts with ``...``.
    The merged row keeps the first row's start time, takes the last row's
    end time and concatenates the utterances.

    Args:
        seg_labels: a ``pd.DataFrame`` or a path ending in .csv/.tsv/.txt
            (read with ``pd.read_csv``). With 4 columns the row index is
            added as the ``seg`` column; with 5 columns they are renamed
            positionally to seg, speaker, utterance, start_sec, end_sec.
            A DataFrame argument is not modified.

    Returns:
        DataFrame with an ``utt`` column (0-based utterance number), a
        ``seg`` column holding the list of segment ids merged into each
        utterance, and the remaining input columns. Empty input gives an
        empty frame with those columns.

    Raises:
        ValueError: for an unsupported input type or column count.
    """
    if isinstance(seg_labels, str) and seg_labels.endswith(('.csv', '.tsv', '.txt')):
        df = pd.read_csv(seg_labels)
    elif isinstance(seg_labels, pd.DataFrame):
        # copy so that re-indexing / renaming below does not leak to the caller
        df = seg_labels.copy()
    else:
        raise ValueError('input seg_labels should be path to csv or pd.DataFrame')

    if len(df.columns) == 4:
        # no seg index yet
        df = df.reset_index().rename(columns={'index': 'seg'})
    elif len(df.columns) == 5:
        # first col is seg
        df.columns = ['seg', 'speaker', 'utterance', 'start_sec', 'end_sec']
    else:
        raise ValueError('input seg_labels should have 4 or 5 columns')

    merged = []
    current = None  # utterance being accumulated
    # iterate by position so a non-default index cannot confuse first-row detection
    for row in df.to_dict('records'):
        continues_previous = (
            current is not None
            and current['speaker'] == row['speaker']
            and str(current['utterance']).endswith('...')
            and str(row['utterance']).startswith('...')
        )
        if continues_previous:
            current['utterance'] = str(current['utterance']) + str(row['utterance'])
            current['end_sec'] = row['end_sec']
            current['seg'].append(row['seg'])
        else:
            if current is not None:
                merged.append(current)
            current = dict(row)
            # start the segment list from this row's own id rather than assuming 0
            current['seg'] = [row['seg']]
    if current is not None:
        merged.append(current)  # catch final utterance

    df2 = pd.DataFrame(merged, columns=df.columns)
    if not df2.empty:
        # clear up "......"
        # NOTE(review): the pattern matches every run of periods, including a
        # single sentence-final one - confirm that is intended.
        df2['utterance'] = df2['utterance'].str.replace(r'\.+', ' ', regex=True)

    # enumerate utterances
    df2.insert(0, 'utt', range(len(df2)))
    return df2
608
+
609
+
610
def add_dummy_seg_column(table):
    """Add a dummy ``seg`` column to a table that lacks one.

    Label files generated by ``merge_ellipsis`` have an ``utt`` column giving
    the utterance ID and a ``seg`` column listing the original segments that
    make up each utterance. This gives other label tables the same layout:
    each row's ``seg`` is a one-element list holding its own ``utt`` value.

    If ``seg`` already exists the table is returned untouched. An ``uttID``
    column is renamed to ``utt``; if neither exists, ``utt`` is taken from
    the index.

    Returns:
        Table with columns utt, seg, speaker, start_sec, end_sec, utterance.
    """
    if 'seg' in table.columns.tolist():
        print('\'seg\' column already exists, not changing anything')
        return table

    if 'uttID' in table.columns.tolist():
        table = table.rename(columns={"uttID": "utt"})

    if 'utt' not in table.columns.tolist():
        table['utt'] = table.index

    # every utterance consists of exactly one (dummy) segment: itself
    table['seg'] = [[utt_id] for utt_id in table['utt']]
    ordered_columns = ['utt', 'seg', 'speaker', 'start_sec', 'end_sec', 'utterance']
    return table[ordered_columns]
629
+
630
+
631
def old_xlsx_to_labels_csv(xl_file, merge_segments=True):
    """Convert a contractor xlsx transcript to a labels csv for rosy's isatasr lib.

    The xlsx (from the contractor transcription service, with a single
    timecode column) is read with ``old_xlsx_to_table``.

    Args:
        xl_file: path to the xlsx file.
        merge_segments: if true, segments split with '...' are merged into
            utterances via ``merge_ellipsis`` and the file is named
            ``utt_labels_<session>.csv``; otherwise segments are kept as
            they were and the file is named ``seg_labels_<session>.csv``.

    Returns:
        The name of the csv file written (relative to the working directory).
    """
    table = old_xlsx_to_table(xl_file)
    sessname = get_sessname_from_filename(xl_file)

    if not merge_segments:
        save_file = f'seg_labels_{sessname}.csv'
        table.to_csv(save_file, index=False, float_format='%.3f')
        return save_file

    save_file = f'utt_labels_{sessname}.csv'
    merge_ellipsis(table).to_csv(save_file, index=False, float_format='%.3f')
    return save_file
647
+
648
def get_sessname_from_filename(filename):
    """Derive the session name from a transcript filename.

    The directory and extension are dropped, then a fixed list of known
    transcript prefixes/suffixes is removed, case-insensitively and in
    order (longer variants first so that shorter ones do not leave
    fragments behind).
    """
    removable_fragments = (
        'reworked-transcript-diarized-timestamped-',
        'reworked_transcript-diarized-timestamped-',
        'reworked-diarized-timestamped-',
        'reworked_timestamped_',
        'reworked_',
        'reworked-',
        'transcript_diarized_timestamped_',
        'transcript-diarized-timestamped_',
        'transcript-diarized-timestamped-',
        '_transcript',
        '_tmcoded',
        'utt_labels_',
        'seg_labels_',
        '_redacted',
    )
    sessname = Path(filename).stem
    for fragment in removable_fragments:
        sessname = re.sub(fragment, '', sessname, flags=re.I)
    return sessname
665
+
666
def ELAN_to_labels_csv(ELANfile, merge_segments=True):
    """Convert an ELAN tab-delimited export to a labels csv.

    The export is read into a unified table with ``readELANtsv``; the
    session name is derived from the filename.

    Args:
        ELANfile: path to the ELAN export.
        merge_segments: if true, segments split with '...' are merged into
            utterances via ``merge_ellipsis`` and the file is named
            ``utt_labels_<session>.csv``; otherwise the segments are written
            as read and the file is named ``seg_labels_<session>.csv``.

    Returns:
        The name of the csv file written (relative to the working directory).
    """
    # dumb but effective string wrangling to get sess name
    sessname = get_sessname_from_filename(ELANfile)
    labels = readELANtsv(ELANfile)

    if not merge_segments:
        save_file = f'seg_labels_{sessname}.csv'
        labels.to_csv(save_file, index=False, float_format='%.3f')
        return save_file

    save_file = f'utt_labels_{sessname}.csv'
    merge_ellipsis(labels).to_csv(save_file, index=False, float_format='%.3f')
    return save_file
682
+
683
def parse_label_csv(label_csv: str):
    """Read a labels csv into a table with a fixed column layout.

    utt_labels csv is the usual format used for diarized, timed transcripts
    in this repo. Several variants exist (with/without segment and/or
    utterance index, with/without a header row).

    The first row is taken to be a header when none of its cells is a plain
    number. Without a header, column names are assigned from the column
    count (4, 5 or 6 columns).

    The ``uttID`` column is chosen in this order: an existing ``uttID``
    column, else ``utt``, else ``seg``, else the row number.

    Args:
        label_csv: path to the csv file.

    Returns:
        DataFrame with columns uttID, speaker, start_sec, end_sec, utterance,
        or ``None`` when the file has no header and an unsupported number of
        columns.

    Raises:
        KeyError: if a header row is present but lacks one of the speaker,
            start_sec, end_sec or utterance columns.
    """
    table = pd.read_csv(label_csv, keep_default_na=False, header=None)
    row0 = table.iloc[0]

    is_header = not any(str(cell).replace('.', '').isdigit() for cell in row0)
    if is_header:
        table.columns = row0.tolist()
        table = table.iloc[1:].reset_index(drop=True)
    else:
        if len(table.columns) == 4:
            print('no header detected, assuming annotation file has columns [speaker,utterance,start_sec, end_sec] ')
            table.columns = ['speaker', 'utterance', 'start_sec', 'end_sec']
        elif len(table.columns) == 5:
            print('no header detected, assuming annotation file has columns [seg,speaker,utterance,start_sec, end_sec] ')
            table.columns = ['seg', 'speaker', 'utterance', 'start_sec', 'end_sec']
        elif len(table.columns) == 6:
            print('no header detected, assuming annotation file has columns [utt,seg,speaker,utterance,start_sec, end_sec] ')
            table.columns = ['utt', 'seg', 'speaker', 'utterance', 'start_sec', 'end_sec']
        else:
            print(f'no header detected, csv has {len(table.columns)} columns, could not determine column names.')
            return None

    # choose which column to use for uttID in table
    columns = table.columns.tolist()
    if 'uttID' in columns:
        # already in the output layout; adding another uttID would duplicate the column
        pass
    elif 'utt' in columns:
        # any 'seg' column is discarded by the column selection below, so it
        # does not need to exist here
        table = table.rename(columns={"utt": "uttID"})
    elif 'seg' in columns:
        table = table.rename(columns={"seg": "uttID"})
    else:
        table = table.reset_index().rename(columns={"index": "uttID"})

    table = table[['uttID', 'speaker', 'start_sec', 'end_sec', 'utterance']]
    return table
721
+
722
def deidentify_speaker(df, who='all'):
    """Replace speaker IDs with generic labels in order of appearance.

    The speaker column is the first of ``speaker``, ``Speaker``,
    ``speaker_id``, ``Speaker_ID`` found in ``df``.

    Args:
        df (pd.DataFrame): transcript table. The speaker column is
            overwritten in place.
        who (str, optional): ``'student'`` replaces only student IDs (string
            values that contain "student", case-insensitively, or look like
            ``00-0000``) with ``student_1``, ``student_2``, ...; any other
            value replaces every speaker with ``speaker_1``, ``speaker_2``,
            ... Defaults to 'all'.

    Returns:
        The same ``df`` object with the speaker column replaced.

    Raises:
        ValueError: if no speaker column is found.
    """
    colnames = df.columns.tolist()
    speaker_key = next(
        (key for key in ['speaker', 'Speaker', 'speaker_id', 'Speaker_ID'] if key in colnames),
        None,
    )
    if not speaker_key:
        raise ValueError('No speaker column found in dataframe!')

    speakers = df[speaker_key].unique()
    if who == 'student':
        # detect student. ID format can be student_xxx or 00-0000 numeric.
        # Non-string values (missing speakers) cannot be student IDs and
        # previously crashed on .lower().
        speakers = [
            s for s in speakers
            if isinstance(s, str)
            and ('student' in s.lower() or re.match(r'^\d{2}-\d{4}$', s))
        ]
        label_prefix = 'student'
    else:
        label_prefix = 'speaker'

    speaker_dict = {
        original: f'{label_prefix}_{number}'
        for number, original in enumerate(speakers, start=1)
    }
    if speaker_dict:
        df[speaker_key] = df[speaker_key].replace(speaker_dict)
    return df