rosyvs committed on
Commit
693e4cf
·
1 Parent(s): 5d0f90f

Remove unused sort_transcript function and update column renaming in merge_ellipsis and parse_label_csv functions

Browse files
Files changed (1) hide show
  1. utils.py +2 -64
utils.py CHANGED
@@ -25,68 +25,6 @@ def subprocess_run_verbose(cmd):
25
  res = subprocess.check_call(cmd, stdout=sys.stdout, stderr=subprocess.STDOUT)
26
  return res
27
 
28
- def sort_transcript(file_path: str):
29
- """
30
- Sort the rows of a transcript file by start time.
31
-
32
- Parameters
33
- ----------
34
- file_path : str
35
- The file path of the transcript file.
36
- save_path : str
37
- The file path to save the sorted transcript file.
38
-
39
- Returns
40
- -------
41
- str
42
- The file path of the sorted transcript file.
43
- """
44
-
45
- logging.info(f"Received file_path: {file_path}")
46
- logging.info(f"Expected save_path: {save_path}")
47
-
48
- file_ext = os.path.splitext(file_path)[1]
49
-
50
- if file_ext in ['.txt', '.tsv']:
51
- try:
52
- with open(file_path, 'r') as file:
53
- # Read the lines from the file, ignoring lines starting with '#'
54
- lines = [line.strip() for line in file if not line.startswith('''"#file''')]
55
- logging.info(f"Read {len(lines)} lines from {file_path}.")
56
-
57
- # Create a DataFrame from the lines
58
- table = pd.DataFrame([line.split('\t') for line in lines])
59
- # Adjust column names based on the input file format
60
- if len(table.columns) == 5:
61
- table.columns = ['Speaker', 'Empty', 'Start', 'End', 'Transcript']
62
- else:
63
- table.columns = ['Speaker', 'Empty', 'Start', 'Start (s)', 'End', 'End (s)', 'Duration', 'Duration (s)', 'Transcript']
64
- logging.info(f"Processed the file into a DataFrame with {table.shape[0]} rows and {table.shape[1]} columns.")
65
-
66
- except Exception as e:
67
- print(f"Error processing the file: {e}")
68
- raise e
69
- else:
70
- error_msg = f"Unsupported file format: {file_ext}. Must be '.txt' or '.tsv'."
71
- print(error_msg)
72
- raise ValueError(error_msg)
73
-
74
- try:
75
- # Process the table for sorting
76
- table.columns = map(str.lower, table.columns)
77
- table = table[['speaker', 'start', 'end', 'transcript']]
78
- table = table.rename(columns={'transcript': 'utterance'})
79
- table['start_time'] = table['start'].str.split('.', expand=True)[0]
80
- sorted_table = table.sort_values('start_time')
81
- sorted_table = sorted_table.drop(columns=['start_time'])
82
- sorted_table.to_csv(save_path, sep='\t', index=False, header=False)
83
- logging.info(f"Saved sorted transcript to {save_path}")
84
- except Exception as e:
85
- print(f"Error sorting and saving the transcript: {e}")
86
- raise e
87
-
88
- return save_path
89
-
90
  def HHMMSS_to_sec(time_str):
91
  """Get Seconds from timestamp string with milliseconds."""
92
  if not time_str:
@@ -750,7 +688,7 @@ def merge_ellipsis(seg_labels):
750
  df = df.rename(columns = {'index':'seg'})
751
  elif len(df.columns)==5:
752
  # first col is seg
753
- df.columns = ['seg','speaker','utterance','start_sec','end_sec']
754
  else:
755
  raise ValueError('input seg_labels should have 4 or 5 columns')
756
  df2=[]
@@ -890,7 +828,7 @@ def parse_label_csv(label_csv:str):
890
  else:
891
  table=table.reset_index().rename(columns={"index":"uttID"})
892
 
893
- table=table[['uttID','speaker','start_sec','end_sec','utterance']]
894
  return table
895
 
896
  def deidentify_speaker(df, who='all'):
 
25
  res = subprocess.check_call(cmd, stdout=sys.stdout, stderr=subprocess.STDOUT)
26
  return res
27
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  def HHMMSS_to_sec(time_str):
29
  """Get Seconds from timestamp string with milliseconds."""
30
  if not time_str:
 
688
  df = df.rename(columns = {'index':'seg'})
689
  elif len(df.columns)==5:
690
  # first col is seg
691
+ df = df.rename(columns = {df.columns[0]:'seg'})
692
  else:
693
  raise ValueError('input seg_labels should have 4 or 5 columns')
694
  df2=[]
 
828
  else:
829
  table=table.reset_index().rename(columns={"index":"uttID"})
830
 
831
+ table=table[['uttID','speaker','utterance','start_sec','end_sec']]
832
  return table
833
 
834
  def deidentify_speaker(df, who='all'):