Spaces:

levicu
/

transcriber_tools

Sleeping

App Files Community

rosyvs committed on May 2, 2025

Commit

693e4cf

1 Parent(s): 5d0f90f

Remove unused sort_transcript function and update column renaming in merge_ellipsis and parse_label_csv functions

Browse files

Files changed (1) hide show

utils.py +2 -64

utils.py CHANGED Viewed

@@ -25,68 +25,6 @@ def subprocess_run_verbose(cmd):
     res = subprocess.check_call(cmd, stdout=sys.stdout, stderr=subprocess.STDOUT)
     return res
-def sort_transcript(file_path: str):
-    """
-    Sort the rows of a transcript file by start time.
-    Parameters
-    ----------
-    file_path : str
-        The file path of the transcript file.
-    save_path : str
-        The file path to save the sorted transcript file.
-    Returns
-    -------
-    str
-        The file path of the sorted transcript file.
-    """
-    logging.info(f"Received file_path: {file_path}")
-    logging.info(f"Expected save_path: {save_path}")
-    file_ext = os.path.splitext(file_path)[1]
-    if file_ext in ['.txt', '.tsv']:
-        try:
-            with open(file_path, 'r') as file:
-                # Read the lines from the file, ignoring lines starting with '#'
-                lines = [line.strip() for line in file if not line.startswith('''"#file''')]
-            logging.info(f"Read {len(lines)} lines from {file_path}.")
-            # Create a DataFrame from the lines
-            table = pd.DataFrame([line.split('\t') for line in lines])
-            # Adjust column names based on the input file format
-            if len(table.columns) == 5:
-                table.columns = ['Speaker', 'Empty', 'Start', 'End', 'Transcript']
-            else:
-                table.columns = ['Speaker', 'Empty', 'Start', 'Start (s)', 'End', 'End (s)', 'Duration', 'Duration (s)', 'Transcript']
-            logging.info(f"Processed the file into a DataFrame with {table.shape[0]} rows and {table.shape[1]} columns.")
-        except Exception as e:
-            print(f"Error processing the file: {e}")
-            raise e
-    else:
-        error_msg = f"Unsupported file format: {file_ext}. Must be '.txt' or '.tsv'."
-        print(error_msg)
-        raise ValueError(error_msg)
-    try:
-        # Process the table for sorting
-        table.columns = map(str.lower, table.columns)
-        table = table[['speaker', 'start', 'end', 'transcript']]
-        table = table.rename(columns={'transcript': 'utterance'})
-        table['start_time'] = table['start'].str.split('.', expand=True)[0]
-        sorted_table = table.sort_values('start_time')
-        sorted_table = sorted_table.drop(columns=['start_time'])
-        sorted_table.to_csv(save_path, sep='\t', index=False, header=False)
-        logging.info(f"Saved sorted transcript to {save_path}")
-    except Exception as e:
-        print(f"Error sorting and saving the transcript: {e}")
-        raise e
-    return save_path
 def HHMMSS_to_sec(time_str):
     """Get Seconds from timestamp string with milliseconds."""
     if not time_str:
@@ -750,7 +688,7 @@ def merge_ellipsis(seg_labels):
         df = df.rename(columns = {'index':'seg'})
     elif len(df.columns)==5:
         # first col is seg
-        df.columns = ['seg','speaker','utterance','start_sec','end_sec']
     else:
         raise ValueError('input seg_labels should have 4 or 5 columns')
     df2=[]
@@ -890,7 +828,7 @@ def parse_label_csv(label_csv:str):
     else:
         table=table.reset_index().rename(columns={"index":"uttID"})
-    table=table[['uttID','speaker','start_sec','end_sec','utterance']]
     return table
 def deidentify_speaker(df, who='all'):

     res = subprocess.check_call(cmd, stdout=sys.stdout, stderr=subprocess.STDOUT)
     return res
 def HHMMSS_to_sec(time_str):
     """Get Seconds from timestamp string with milliseconds."""
     if not time_str:
         df = df.rename(columns = {'index':'seg'})
     elif len(df.columns)==5:
         # first col is seg
+        df = df.rename(columns = {df.columns[0]:'seg'})
     else:
         raise ValueError('input seg_labels should have 4 or 5 columns')
     df2=[]
     else:
         table=table.reset_index().rename(columns={"index":"uttID"})
+    table=table[['uttID','speaker','utterance','start_sec','end_sec']]
     return table
 def deidentify_speaker(df, who='all'):