Spaces:
Sleeping
Sleeping
Modified the sort_transcript function to support a new input format
Browse files
utlis.py
CHANGED
|
@@ -22,10 +22,16 @@ def sort_transcript(file_path, save_path):
|
|
| 22 |
file_ext = os.path.splitext(file_path)[1]
|
| 23 |
|
| 24 |
if file_ext == '.txt' or file_ext == '.tsv':
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
else:
|
| 30 |
raise ValueError("Unsupported file format. Must be '.txt' or '.tsv'.")
|
| 31 |
|
|
|
|
| 22 |
file_ext = os.path.splitext(file_path)[1]
|
| 23 |
|
| 24 |
if file_ext == '.txt' or file_ext == '.tsv':
|
| 25 |
+
with open(file_path, 'r') as file:
|
| 26 |
+
# Read the lines from the file, skipping blank lines and lines starting with '"#file'
|
| 27 |
+
lines = [line.strip() for line in file if line.strip() and not line.startswith('''"#file''')]
|
| 28 |
+
# Create a DataFrame from the remaining lines
|
| 29 |
+
table = pd.DataFrame([line.split('\t') for line in lines])
|
| 30 |
+
# since there are two formats of input files
|
| 31 |
+
if len(table.columns) == 5:
|
| 32 |
+
table.columns = ['Speaker', 'Empty', 'Start', 'End', 'Transcript']
|
| 33 |
+
else:
|
| 34 |
+
table.columns = ['Speaker', 'Empty', 'Start', 'Start (s)', 'End', 'End (s)', 'Duration', 'Duration (s)', 'Transcript']
|
| 35 |
else:
|
| 36 |
raise ValueError("Unsupported file format. Must be '.txt' or '.tsv'.")
|
| 37 |
|