Spaces:

levicu
/

transcriber_tools

Sleeping

App Files Files Community

rosyvs committed on Oct 31, 2024

Commit

d971130

1 Parent(s): 97631c5

Refactor to use a single func convert_and_trim_video for both tasks, fix some webm --> mp4 / wav issues

Browse files

Files changed (3) hide show

.gitignore +2 -0
app.py +28 -39
utils.py +127 -164

.gitignore CHANGED Viewed

@@ -1,3 +1,5 @@
 .DS_Store
 __pycache__/
 flagged/

 .DS_Store
 __pycache__/
 flagged/
+results_*/
+logs/

app.py CHANGED Viewed

@@ -8,8 +8,8 @@ import random
 import gradio as gr
-from utils import (HHMMSS_to_sec, convert_video_format, molly_xlsx_to_table,
-                   sort_transcript, table_to_ELAN_tsv, trim_media,
                    xlsx_to_table)
@@ -19,26 +19,19 @@ def delete_files(files):
         try:
             os.remove(file)
         except FileNotFoundError:
             pass
     print("files deleted")
 def classify_input_format(input_string):
-    # check if the input string is a valid time in the format HH:MM:SS
-    hhmmss_pattern = re.compile('^\d{1,2}:\d{1,2}:\d{1,2}$')
-    if hhmmss_pattern.match(str(input_string)):
-        print("input string is a valid time in the format HH:MM:SS")
-        return HHMMSS_to_sec(time_str=input_string)
-    # check if the input string is a valid number in the format of seconds
-    try:
-        seconds = float(input_string)
-        print("input string is a valid number in the format of seconds")
-        return float(input_string)
-    except ValueError:
-        pass
-    # if the input format is neither seconds nor HH:MM:SS, return None
-    gr.Error("Input format not supported. Please enter a valid time in the format HH:MM:SS or seconds.")
 def set_output_file(input_file, output_format, folder, insert_string = 'trimmed'):
     # Set output file name and extension
@@ -57,32 +50,28 @@ def trim_video_helper(input_file, output_file, start_time, end_time):
         end_time = 300
     elif start_time != "" and end_time == "":
         end_time = 300 + HHMMSS_to_sec(time_str=start_time)
-    print("start time: ", start_time)
-    print("end time: ", end_time)
     # Trim the video
     print("start trimming")
-    start_time = classify_input_format(start_time)
-    print("start time: ", start_time)
-    end_time = classify_input_format(end_time)
-    print("end time: ", end_time)
-    output_file = trim_media(input_file.name, output_file, start_time, end_time)
-    print("finish trimming")
     return output_file
 def convert_video_helper(input_file, output_file, output_format):
     # convert video
     print("start converting")
-    output_file = convert_video_format(input_file.name, output_file)
-    print("finish converting")
     return output_file
 def convert_transcript_helper(input_transcript, output_transcript):
     # convert transcript
     print("start converting transcript")
     table = xlsx_to_table(xl_file=input_transcript)
-    print("finish converting transcript to table")
     output_file = table_to_ELAN_tsv(table, output_transcript)
-    print("finish converting transcript")
     return output_file
@@ -93,10 +82,10 @@ def trim_video_vtr(input_file, output_format):
         # randomly select start time
         start_time = random.randint(300, 900)
         end_time = start_time + 600 # since 10 minutes
-        print("start time: ", start_time)
-        print("end time: ", end_time)
-        aduio_name = input_file.name.split("/")[-1].split(".")[-2]
-        print("aduio_name: ", aduio_name)
         # set output file
         insert_string = f"start{start_time}_end{end_time}"
@@ -104,7 +93,7 @@ def trim_video_vtr(input_file, output_format):
         output_file = set_output_file(input_file, output_format, output_folder, insert_string)
         # write the start time, and end time to a txt file
-        log_file = f"{os.getcwd()}/results_vtr/{aduio_name}_start_end_time.txt"
         print("log_file: ", log_file)
         with open(log_file, "w") as f:
             f.write(f"{start_time}\n")
@@ -152,7 +141,7 @@ def sort_transcript_helper(input_transcript, output_transcript):
     print("input_transcript: ", input_transcript)
     print("output_transcript: ", output_transcript)
     output_transcript = sort_transcript(input_transcript, output_transcript)
-    print("finish sorting transcript")
     return output_transcript
 def sort_transcript_wrapper(input_file):
@@ -185,12 +174,12 @@ def convert_video(input_file, output_format):
         output_folder = f"{os.getcwd()}/results_vc/"
         output_file = set_output_file(input_file, output_format, output_folder, \
                                       insert_string = 'converted')
         # Convert video
         output_file = convert_video_helper(input_file, output_file, output_format)
         # remove file after 10 minutes for security
-        print("start deleting files")
         path_to_delete = [input_file.name, output_file]
         threading.Thread(target=delete_files, args=([path_to_delete])).start()

 import gradio as gr
+from utils import (HHMMSS_to_sec,  molly_xlsx_to_table, convert_and_trim_video,
+                   sort_transcript, table_to_ELAN_tsv,
                    xlsx_to_table)
         try:
             os.remove(file)
         except FileNotFoundError:
+            print(f"File {file} not found for deletion.")
             pass
     print("files deleted")
 def classify_input_format(input_string):
     """Convert a user-supplied time stamp to seconds.

     Args:
         input_string: Time stamp to parse; it is passed unchanged to
             ``HHMMSS_to_sec``.

     Returns:
         The value returned by ``HHMMSS_to_sec`` when it is not ``None``.

     Raises:
         gr.Error: If ``HHMMSS_to_sec`` returns ``None``, i.e. the input could
             not be interpreted as a time stamp.
     """
     seconds = HHMMSS_to_sec(time_str=input_string)
     if seconds is None:
         # gr.Error is an exception class: merely constructing it shows nothing
         # to the user, so it has to be raised for the message to surface.
         raise gr.Error("Input time stamp format not supported. Please enter a valid time in the format HH:MM:SS or seconds.")
     print("Successfully converted timestamps to seconds")
     return seconds
 def set_output_file(input_file, output_format, folder, insert_string = 'trimmed'):
     # Set output file name and extension
         end_time = 300
     elif start_time != "" and end_time == "":
         end_time = 300 + HHMMSS_to_sec(time_str=start_time)
+    print("start time (s): ", start_time)
+    print("end time (s): ", end_time)
     # Trim the video
     print("start trimming")
+    output_file = convert_and_trim_video(input_file.name, output_file, start_time, end_time)
+    print("finished trimming")
     return output_file
 def convert_video_helper(input_file, output_file, output_format):
     """Convert an uploaded media file via ``convert_and_trim_video``.

     ``input_file.name`` is used as the source path and no start/end times are
     passed, so nothing is trimmed. ``output_format`` is accepted but not used
     in this function.

     Returns:
         The value returned by ``convert_and_trim_video``.
     """
     print("start converting")
     source_path = input_file.name
     converted = convert_and_trim_video(source_path, output_file)
     print("finished converting")
     return converted
 def convert_transcript_helper(input_transcript, output_transcript):
     """Convert an xlsx transcript into a TSV file for ELAN import.

     The spreadsheet is first loaded with ``xlsx_to_table`` and the resulting
     table is then written out by ``table_to_ELAN_tsv``.

     Returns:
         The value returned by ``table_to_ELAN_tsv``.
     """
     print("start converting transcript")
     transcript_table = xlsx_to_table(xl_file=input_transcript)
     print("finished converting transcript to table")
     tsv_path = table_to_ELAN_tsv(transcript_table, output_transcript)
     print("finished converting transcript")
     return tsv_path
         # randomly select start time
         start_time = random.randint(300, 900)
         end_time = start_time + 600 # since 10 minutes
+        print("start time (s): ", start_time)
+        print("end time (s): ", end_time)
+        audio_base_name = input_file.name.split("/")[-1].split(".")[-2]
+        print("audio_base_name: ", audio_base_name)
         # set output file
         insert_string = f"start{start_time}_end{end_time}"
         output_file = set_output_file(input_file, output_format, output_folder, insert_string)
         # write the start time, and end time to a txt file
+        log_file = f"{os.getcwd()}/results_vtr/{audio_base_name}_start_end_time.txt"
         print("log_file: ", log_file)
         with open(log_file, "w") as f:
             f.write(f"{start_time}\n")
     print("input_transcript: ", input_transcript)
     print("output_transcript: ", output_transcript)
     output_transcript = sort_transcript(input_transcript, output_transcript)
+    print("finished sorting transcript")
     return output_transcript
 def sort_transcript_wrapper(input_file):
         output_folder = f"{os.getcwd()}/results_vc/"
         output_file = set_output_file(input_file, output_format, output_folder, \
                                       insert_string = 'converted')
+        print("Begin converting to output format: ", output_format)
         # Convert video
         output_file = convert_video_helper(input_file, output_file, output_format)
+        print("created output file: ", output_file)
         # remove file after 10 minutes for security
+        print("Done converting. start deleting files")
         path_to_delete = [input_file.name, output_file]
         threading.Thread(target=delete_files, args=([path_to_delete])).start()

utils.py CHANGED Viewed

@@ -75,174 +75,44 @@ def sort_transcript(file_path: str, save_path: str) -> str:
     return save_path
-def convert_video_format(media_in, media_out):
-    """
-      Function to convert video format using ffmpeg.
-    """
-    logging.info(f'...Converting video format from {media_in} to {media_out}...')
-    try:
-        WAV_CHANNELS = 1
-        WAV_SAMPLE_RATE = 16000
-        if not isinstance(media_in, (str, Path)):
-            raise TypeError("media_in must be a string or a PathLike object")
-        if not isinstance(media_out, (str, Path)):
-            raise TypeError("media_out must be a string or a PathLike object")
-        in_ext = Path(media_in).suffix.lower()
-        out_ext = Path(media_out).suffix.lower()
-        logging.info(f'...detected extensions from filename: input={in_ext} output={out_ext}')
-        if in_ext !='.webm':
-          # input is not in webm format
-          if out_ext == '.wav':
-              # convert to wav with standard format for audio models
-              logging.info(f'...generating {media_out}...')
-              command = ['ffmpeg', '-y', '-i',
-                          media_in, '-acodec', 'pcm_s16le',
-                          '-ac', str(WAV_CHANNELS), '-ar',
-                          str(WAV_SAMPLE_RATE), media_out,
-                          '-hide_banner', '-loglevel', 'warning']
-              process = subprocess.run(command, capture_output=True, text=True)
-              if process.returncode != 0:
-                logging.info(f"Error: {process.stderr}")
-              else:
-                logging.info(process.stdout)
-              return media_out
-          else:
-              logging.info(f'...Using ffmpeg to convert {media_in} to {media_out}...')
-              logging.info(f'...generating {media_out}...')
-              command = ['ffmpeg',
-                              '-y',
-                              '-i',
-                              media_in,
-                              '-c',
-                              'copy',
-                              media_out,
-                              '-hide_banner',
-                              '-loglevel',
-                              'warning'
-                              ]
-              process = subprocess.run(command, check=True)
-              if process.returncode != 0:
-                logging.info(f"Error: {process.stderr}")
-              else:
-                logging.info(process.stdout)
-              return media_out
-        if in_ext == '.webm':
-          if out_ext == '.wav':
-            command = ['ffmpeg', '-i', media_in, media_out]
-            process = subprocess.run(command, capture_output=True, text=True)
-            if process.returncode != 0:
-                logging.info(f"Error: {process.stderr}")
-            else:
-                logging.info(process.stdout)
-            return media_out
-          else:
-            command = ['ffmpeg', '-i', media_in, '-vcodec', 'h264', '-acodec', 'aac', media_out]
-            process = subprocess.run(command, capture_output=True, text=True)
-            if process.returncode != 0:
-                logging.info(f"Error: {process.stderr}")
-            else:
-                logging.info(process.stdout)
-            return media_out
-    except Exception as e:
-        logging.error(f"Error converting video format: {e}")
-        gr.Error(f"Error converting video format: {e}")
-def trim_media(media_in, media_out, start, end):
-    try:
-        # options for writing out audio if converting
-        WAV_CHANNELS = 1
-        WAV_SAMPLE_RATE = 16000
-        media_type = Path(media_in).suffix.lower()
-        ext = Path(media_out).suffix.lower()
-        print("EXT", ext)
-        if isinstance(start, str):
-            start_sec = HHMMSS_to_sec(start)
-        else:
-            start_sec = float(start)
-        if isinstance(end, str):
-            end_sec = HHMMSS_to_sec(end)
-        else:
-            end_sec = float(end)
-        if ext == '.wav':
-            # convert to wav with standard format for audio models
-            print(f'...Using ffmpeg to trim video from {start} to {end} \n   and convert to {WAV_SAMPLE_RATE}Hz WAV with {WAV_CHANNELS} channels...')
-            print(f'...generating {media_out}...')
-            subprocess.run(f'ffmpeg -y -i "{media_in}" -ss {start_sec} -to {end_sec} -acodec pcm_s16le -ac {WAV_CHANNELS} -ar {WAV_SAMPLE_RATE} "{media_out}" -hide_banner -loglevel warning', check=True, shell=True)
-            print(f'...done trimming and converting to {media_out}...')
-            return media_out
-        else:
-            print(f'...Using ffmpeg to trim video from {start_sec} to {end_sec}...')
-            print(f'...generating {media_out}...')
-            subprocess.run(['ffmpeg',
-                            '-y',
-                            '-i',
-                            media_in,
-                            '-ss',
-                            f'{start_sec}',
-                            '-to',
-                            f'{end_sec}',
-                            '-c',
-                            'copy',
-                            media_out,
-                            '-hide_banner',
-                            '-loglevel',
-                            'warning'
-                            ], check=True)
-            return media_out
-    except Exception as e:
-        gr.Error(f"Error trimming media: {e}")
 def HHMMSS_to_sec(time_str):
     """Get Seconds from timestamp string with milliseconds."""
-    try:
-        if not time_str:
-            return None
-        if time_str.count(':')==2:
-            h, m, s = time_str.split(':')
-        elif time_str.count(':')==3:
-        # weird timestamps where there is a field followign seconds delimited by colon
-            h, m, s, u = time_str.split(':')
-            # determine whether ms field is in tenths or hundredths or thousandths by countng how many digits
-            if len(u)==1:
-                print('Weird time format detected - HH:MM:SS:tenths - please verify this is how you want the time interpreted')
-                ms = float(u)/10
-            elif len(u)==2: # hundredths
-                ms = float(u)/100
-            elif len(u)==3: # hundredths
-                ms = float(u)/1000
-            else:
-                print(f'input string format not supported: {time_str}')
-                return None
-            s = int(s)+ms
-        elif time_str.count(':')==1:
-            # print('missing HH from timestamp, assuming MM:SS')
-            m, s = time_str.split(':')
-            h=0
         else:
             print(f'input string format not supported: {time_str}')
             return None
-        return int(h) * 3600 + int(m) * 60 + float(s)
-    except Exception as e:
-        gr.Error(f"Error converting time to seconds: {e}")
 def molly_xlsx_to_table(xl_file):
     # contractor transcribers provide an xlsx with the following columns
@@ -307,4 +177,97 @@ def xlsx_to_table(xl_file):
 def table_to_ELAN_tsv(table:pd.DataFrame, path:str):
     # write table to tsv compatible with ELAN import
     table.to_csv(path, index=False, float_format='%.3f',sep='\t')
-    return path

     return save_path
 def HHMMSS_to_sec(time_str):
     """Convert a time stamp to a number of seconds.

     Supported inputs:
       * an ``int`` or ``float``, taken to be seconds already;
       * ``"MM:SS"`` or ``"HH:MM:SS"`` (the seconds field may have a decimal
         fraction);
       * ``"HH:MM:SS:X"``, ``"HH:MM:SS:XX"`` or ``"HH:MM:SS:XXX"``, where the
         last field is read as tenths, hundredths or thousandths of a second
         depending on how many digits it has;
       * a plain numeric string such as ``"12.5"``, taken to be seconds.

     Args:
         time_str: The time stamp (string or number).

     Returns:
         The time in seconds as a ``float``, or ``None`` when ``time_str`` is
         empty/``None`` or is not in one of the supported formats.
     """
     # Numbers are handled before the emptiness test so that 0 is returned as
     # 0.0 seconds instead of being mistaken for a missing value.
     if isinstance(time_str, (int, float)):
         return float(time_str)
     if not time_str:
         return None

     colon_count = time_str.count(':')
     try:
         if colon_count == 0:
             # maybe it's already in seconds!
             return float(time_str)
         if colon_count == 1:
             # missing HH from timestamp, assuming MM:SS
             h = 0
             m, s = time_str.split(':')
         elif colon_count == 2:
             h, m, s = time_str.split(':')
         elif colon_count == 3:
             # weird timestamps where there is a field following seconds,
             # delimited by a colon
             h, m, s, u = time_str.split(':')
             # decide whether the extra field is tenths, hundredths or
             # thousandths by counting its digits
             if len(u) == 1:  # tenths
                 print('Weird time format with 3 colons detected - HH:MM:SS:X . Interpreting X as tenths of a second. - please verify this is how you want the time interpreted')
                 fraction = float(u) / 10
             elif len(u) == 2:  # hundredths
                 print('Weird time format with 3 colons detected - HH:MM:SS:XX . Interpreting XX as hundredths of a second. - please verify this is how you want the time interpreted')
                 fraction = float(u) / 100
             elif len(u) == 3:  # thousandths
                 print('Weird time format with 3 colons detected - HH:MM:SS:XXX . Interpreting XX as milliseconds. - please verify this is how you want the time interpreted')
                 fraction = float(u) / 1000
             else:
                 print(f'input string format not supported: {time_str}')
                 return None
             s = int(s) + fraction
         else:
             print(f'input string format not supported: {time_str}')
             return None
         return int(h) * 3600 + int(m) * 60 + float(s)
     except ValueError:
         # A field that is not a number: report it the same way as every other
         # unsupported format so callers only have to check for None.
         print(f'input string format not supported: {time_str}')
         return None
 def molly_xlsx_to_table(xl_file):
     # contractor transcribers provide an xlsx with the following columns
 def table_to_ELAN_tsv(table:pd.DataFrame, path:str):
     """Write ``table`` to ``path`` as a tab-separated file for ELAN import.

     The index column is omitted and floats are written with three decimal
     places.

     Returns:
         ``path``, unchanged.
     """
     tsv_options = {'index': False, 'float_format': '%.3f', 'sep': '\t'}
     table.to_csv(path, **tsv_options)
     return path
def _timestamp_to_seconds(stamp):
    """Return ``stamp`` as float seconds, or ``None`` when no time was given."""
    if stamp is None or stamp == "":
        return None
    if isinstance(stamp, (int, float)):
        # Handled here so that a numeric 0 is kept as 0.0 seconds.
        return float(stamp)
    seconds = HHMMSS_to_sec(stamp)
    if seconds is None:
        raise ValueError(f"unsupported time stamp: {stamp!r}")
    return seconds


def convert_and_trim_video(media_in, media_out, start=None, end=None):
    """Convert and/or trim a media file with ffmpeg.

    The output container is chosen from the extension of ``media_out``:

    * ``.wav``: audio only, 16-bit PCM, 1 channel, 16000 Hz;
    * ``.webm`` input written to a non-webm output: re-encoded with the
      ``h264`` video and ``aac`` audio codecs;
    * anything else: streams are copied without re-encoding.

    Args:
        media_in: Path of the input file (``str`` or ``Path``).
        media_out: Path of the output file (``str`` or ``Path``); an existing
            file is overwritten.
        start: Optional start of the section to keep, as seconds or a time
            stamp string understood by ``HHMMSS_to_sec``. ``None`` or ``""``
            means "from the beginning".
        end: Optional end of the section to keep, same formats as ``start``.
            ``None`` or ``""`` means "to the end of the media".

    Returns:
        ``media_out`` once ffmpeg has finished successfully.

    Raises:
        gr.Error: If an argument is invalid or ffmpeg exits with a non-zero
            status; the message contains the underlying error.
    """
    WAV_CHANNELS = 1
    WAV_SAMPLE_RATE = 16000
    try:
        if not isinstance(media_in, (str, Path)):
            raise TypeError("media_in must be a string or a PathLike object")
        if not isinstance(media_out, (str, Path)):
            raise TypeError("media_out must be a string or a PathLike object")

        start_sec = _timestamp_to_seconds(start)
        end_sec = _timestamp_to_seconds(end)
        if start_sec is None and end_sec is None:
            logging.info('...No start and end times provided. Converting entire video without trimming...')
            trim_command = []
        else:
            if start_sec is None:
                logging.info('...No start time provided. Trimming video from start to specified end...')
                start_sec = 0.0
            trim_command = ['-ss', f'{start_sec}']
            if end_sec is None:
                logging.info('...No end time provided. Trimming video from specified start to end of video...')
            else:
                trim_command.extend(['-to', f'{end_sec}'])

        in_ext = Path(media_in).suffix.lower()
        out_ext = Path(media_out).suffix.lower()
        logging.info(f'...detected extensions from filename: input={in_ext} output={out_ext}')
        if in_ext == out_ext:
            logging.info('...No media conversion needed...')
        else:
            logging.info(f'...Using ffmpeg to convert {in_ext} to {out_ext}...')

        # Global options go first. No "-f" is placed before "-i": there it
        # would force the *input* to be parsed as that format instead of
        # letting ffmpeg detect the real container.
        command = ['ffmpeg', '-hide_banner', '-loglevel', 'warning', '-y',
                   '-i', str(media_in), *trim_command]
        if out_ext == '.wav':
            # convert to wav with standard format for audio models
            command += ['-vn',
                        '-acodec', 'pcm_s16le',
                        '-ac', str(WAV_CHANNELS),
                        '-ar', str(WAV_SAMPLE_RATE)]
        elif in_ext == '.webm' and out_ext != '.webm':
            # Re-encode when leaving the webm container; "-c copy" is not
            # combined with these because explicit codecs would override it.
            command += ['-vcodec', 'h264', '-acodec', 'aac']
        else:
            # Same container family (including webm -> webm): copy streams.
            command += ['-c', 'copy']
        command.append(str(media_out))

        logging.info(f"FFMPEG command: {' '.join(command)}")
        process = subprocess.run(command, capture_output=True, text=True)
        if process.returncode != 0:
            raise RuntimeError(
                f"ffmpeg exited with code {process.returncode}: {process.stderr}")
        logging.info(process.stdout)
        return media_out
    except Exception as e:
        logging.error(f"Error converting video format: {e}")
        # gr.Error only reaches the user when it is raised.
        raise gr.Error(f"Error converting video format: {e}") from e