Ericwang committed on
Commit
8c96951
·
1 Parent(s): cf52440

Modified the sort_transcript function to support a new input format

Browse files
Files changed (1) hide show
  1. utlis.py +10 -4
utlis.py CHANGED
@@ -22,10 +22,16 @@ def sort_transcript(file_path, save_path):
22
  file_ext = os.path.splitext(file_path)[1]
23
 
24
  if file_ext == '.txt' or file_ext == '.tsv':
25
- # read the tab-separated plaintext file into a DataFrame
26
- table = pd.read_csv(file_path, sep='\t', header=None, \
27
- names=['Speaker', 'Empty', 'Start', 'Start (s)', 'End', 'End (s)', 'Duration', 'Duration (s)', 'Transcript'],
28
- index_col=False)
 
 
 
 
 
 
29
  else:
30
  raise ValueError("Unsupported file format. Must be '.txt' or '.tsv'.")
31
 
 
22
  file_ext = os.path.splitext(file_path)[1]
23
 
24
  if file_ext == '.txt' or file_ext == '.tsv':
25
+ with open(file_path, 'r') as file:
26
+ # Read the lines from the file, skipping blank lines and lines starting with '"#file'
27
+ lines = [line.strip() for line in file if line.strip() and not line.startswith('''"#file''')]
28
+ # Create a DataFrame from the remaining lines
29
+ table = pd.DataFrame([line.split('\t') for line in lines])
30
+ # since there are two formats of input files (5 columns or 9 columns)
31
+ if len(table.columns) == 5:
32
+ table.columns = ['Speaker', 'Empty', 'Start', 'End', 'Transcript']
33
+ else:
34
+ table.columns = ['Speaker', 'Empty', 'Start', 'Start (s)', 'End', 'End (s)', 'Duration', 'Duration (s)', 'Transcript']
35
  else:
36
  raise ValueError("Unsupported file format. Must be '.txt' or '.tsv'.")
37