Spaces:
Sleeping
Sleeping
rosyvs
committed on
Commit
·
693e4cf
1
Parent(s):
5d0f90f
Remove unused sort_transcript function and update column renaming in merge_ellipsis and parse_label_csv functions
Browse files
utils.py
CHANGED
|
@@ -25,68 +25,6 @@ def subprocess_run_verbose(cmd):
|
|
| 25 |
res = subprocess.check_call(cmd, stdout=sys.stdout, stderr=subprocess.STDOUT)
|
| 26 |
return res
|
| 27 |
|
| 28 |
-
def sort_transcript(file_path: str):
|
| 29 |
-
"""
|
| 30 |
-
Sort the rows of a transcript file by start time.
|
| 31 |
-
|
| 32 |
-
Parameters
|
| 33 |
-
----------
|
| 34 |
-
file_path : str
|
| 35 |
-
The file path of the transcript file.
|
| 36 |
-
save_path : str
|
| 37 |
-
The file path to save the sorted transcript file.
|
| 38 |
-
|
| 39 |
-
Returns
|
| 40 |
-
-------
|
| 41 |
-
str
|
| 42 |
-
The file path of the sorted transcript file.
|
| 43 |
-
"""
|
| 44 |
-
|
| 45 |
-
logging.info(f"Received file_path: {file_path}")
|
| 46 |
-
logging.info(f"Expected save_path: {save_path}")
|
| 47 |
-
|
| 48 |
-
file_ext = os.path.splitext(file_path)[1]
|
| 49 |
-
|
| 50 |
-
if file_ext in ['.txt', '.tsv']:
|
| 51 |
-
try:
|
| 52 |
-
with open(file_path, 'r') as file:
|
| 53 |
-
# Read the lines from the file, ignoring lines starting with '#'
|
| 54 |
-
lines = [line.strip() for line in file if not line.startswith('''"#file''')]
|
| 55 |
-
logging.info(f"Read {len(lines)} lines from {file_path}.")
|
| 56 |
-
|
| 57 |
-
# Create a DataFrame from the lines
|
| 58 |
-
table = pd.DataFrame([line.split('\t') for line in lines])
|
| 59 |
-
# Adjust column names based on the input file format
|
| 60 |
-
if len(table.columns) == 5:
|
| 61 |
-
table.columns = ['Speaker', 'Empty', 'Start', 'End', 'Transcript']
|
| 62 |
-
else:
|
| 63 |
-
table.columns = ['Speaker', 'Empty', 'Start', 'Start (s)', 'End', 'End (s)', 'Duration', 'Duration (s)', 'Transcript']
|
| 64 |
-
logging.info(f"Processed the file into a DataFrame with {table.shape[0]} rows and {table.shape[1]} columns.")
|
| 65 |
-
|
| 66 |
-
except Exception as e:
|
| 67 |
-
print(f"Error processing the file: {e}")
|
| 68 |
-
raise e
|
| 69 |
-
else:
|
| 70 |
-
error_msg = f"Unsupported file format: {file_ext}. Must be '.txt' or '.tsv'."
|
| 71 |
-
print(error_msg)
|
| 72 |
-
raise ValueError(error_msg)
|
| 73 |
-
|
| 74 |
-
try:
|
| 75 |
-
# Process the table for sorting
|
| 76 |
-
table.columns = map(str.lower, table.columns)
|
| 77 |
-
table = table[['speaker', 'start', 'end', 'transcript']]
|
| 78 |
-
table = table.rename(columns={'transcript': 'utterance'})
|
| 79 |
-
table['start_time'] = table['start'].str.split('.', expand=True)[0]
|
| 80 |
-
sorted_table = table.sort_values('start_time')
|
| 81 |
-
sorted_table = sorted_table.drop(columns=['start_time'])
|
| 82 |
-
sorted_table.to_csv(save_path, sep='\t', index=False, header=False)
|
| 83 |
-
logging.info(f"Saved sorted transcript to {save_path}")
|
| 84 |
-
except Exception as e:
|
| 85 |
-
print(f"Error sorting and saving the transcript: {e}")
|
| 86 |
-
raise e
|
| 87 |
-
|
| 88 |
-
return save_path
|
| 89 |
-
|
| 90 |
def HHMMSS_to_sec(time_str):
|
| 91 |
"""Get Seconds from timestamp string with milliseconds."""
|
| 92 |
if not time_str:
|
|
@@ -750,7 +688,7 @@ def merge_ellipsis(seg_labels):
|
|
| 750 |
df = df.rename(columns = {'index':'seg'})
|
| 751 |
elif len(df.columns)==5:
|
| 752 |
# first col is seg
|
| 753 |
-
df.columns = ['seg'
|
| 754 |
else:
|
| 755 |
raise ValueError('input seg_labels should have 4 or 5 columns')
|
| 756 |
df2=[]
|
|
@@ -890,7 +828,7 @@ def parse_label_csv(label_csv:str):
|
|
| 890 |
else:
|
| 891 |
table=table.reset_index().rename(columns={"index":"uttID"})
|
| 892 |
|
| 893 |
-
table=table[['uttID','speaker','
|
| 894 |
return table
|
| 895 |
|
| 896 |
def deidentify_speaker(df, who='all'):
|
|
|
|
| 25 |
res = subprocess.check_call(cmd, stdout=sys.stdout, stderr=subprocess.STDOUT)
|
| 26 |
return res
|
| 27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
def HHMMSS_to_sec(time_str):
|
| 29 |
"""Get Seconds from timestamp string with milliseconds."""
|
| 30 |
if not time_str:
|
|
|
|
| 688 |
df = df.rename(columns = {'index':'seg'})
|
| 689 |
elif len(df.columns)==5:
|
| 690 |
# first col is seg
|
| 691 |
+
df = df.rename(columns = {df.columns[0]:'seg'})
|
| 692 |
else:
|
| 693 |
raise ValueError('input seg_labels should have 4 or 5 columns')
|
| 694 |
df2=[]
|
|
|
|
| 828 |
else:
|
| 829 |
table=table.reset_index().rename(columns={"index":"uttID"})
|
| 830 |
|
| 831 |
+
table=table[['uttID','speaker','utterance','start_sec','end_sec']]
|
| 832 |
return table
|
| 833 |
|
| 834 |
def deidentify_speaker(df, who='all'):
|