Improve code documentation and add docstrings
- Add comprehensive docstrings to functions in app.py and slice_audio.py
- Enhance code readability with clear function descriptions and parameter explanations
- Remove commented-out code and improve inline comments
- Standardize code comments and improve code clarity
- app.py +44 -19
- slice_audio.py +45 -3
app.py
CHANGED
@@ -39,9 +39,7 @@ TRANSLATE_TRANSCRIPTIONS = True
 ADD_SUBTITLES_TO_VIDEO = True
 REMOVE_FILES = True
 if DEVICE == "cpu":
-    #
-    # Get RAM space
-    # ram = int(os.popen("free -m | grep Mem | awk '{print $2}'").read())
+    # Assuming we are on huggingface server
     ram = 16000
     factor = 1
     CHUNK_SECONDS = int(ram*factor)
@@ -49,8 +47,7 @@ if DEVICE == "cpu":
     CHUNK_OVERLAP_SECONDS = 5
     print(f"RAM: {ram}, CHUNK_SECONDS: {CHUNK_SECONDS}, CHUNK_OVERLAP_SECONDS: {CHUNK_OVERLAP_SECONDS}")
 else:
-    #
-    # Get VRAM space
+    # Assuming we are on local machine
     CHUNK_SECONDS = 30
     CHUNK_OVERLAP_SECONDS = 5

@@ -73,6 +70,7 @@ html_subtify_logo_small = get_html_subtify_logo_small(new_width, new_height)
 language_dict = union_language_dict()

 def remove_all_files():
+    """Remove all temporary files and folders"""
     if os.path.exists("audios"):
         command = f"rm -r audios"
         os.system(command)
@@ -96,6 +94,7 @@ def remove_all_files():
         os.system(command)

 def reset_frontend():
+    """Reset all frontend elements to their default state"""
     visible = False
     return (
         None,
@@ -118,9 +117,11 @@ def reset_frontend():
     )

 def show_auxiliar_block1():
+    """Show auxiliary block 1 with URL checked message"""
     return gr.Textbox(value="URL checked", visible=False)

 def change_visibility_texboxes():
+    """Change visibility of progress info textboxes"""
     return (
         gr.update(value="Done"), # auxiliar_block1
         gr.update(visible=True), # get_audio_from_video_info
@@ -134,6 +135,15 @@ def change_visibility_texboxes():
     )

 def get_audio(video_path):
+    """
+    Extract audio from video file.
+
+    Args:
+        video_path (str): Path to video file
+
+    Returns:
+        list: Status update and audio file path
+    """
     print('*'*NUMBER)
     print(f"Getting audio from video {video_path}")

@@ -161,6 +171,12 @@ def get_audio(video_path):
     ]

 def slice_audio(input_audio_path):
+    """
+    Slice audio into chunks.
+
+    Args:
+        input_audio_path (str): Path to input audio file
+    """
     print('*'*NUMBER)
     print(f"Slicing audio {input_audio_path} in chunks of {CHUNK_SECONDS} seconds with {CHUNK_OVERLAP_SECONDS} seconds overlap")

@@ -180,6 +196,15 @@ def slice_audio(input_audio_path):
     )

 def diarize(input_audio_path, num_speakers, min_speakers, max_speakers):
+    """
+    Perform speaker diarization on audio file.
+
+    Args:
+        input_audio_path (str): Path to audio file
+        num_speakers (int): Expected number of speakers
+        min_speakers (int): Minimum number of speakers
+        max_speakers (int): Maximum number of speakers
+    """
     print('*'*NUMBER)
     print(f"Diarize {input_audio_path}")

@@ -364,10 +389,10 @@ def process_uploaded_video(video_path):

 def merge_transcription_and_diarization():
     """
-
+    Merge transcription and diarization results to assign speakers to each word.

     Returns:
-        dict:
+        dict: Combined transcription with speaker information
     """
     print('*'*NUMBER)
     print("Merge transcription and diarization")
@@ -382,20 +407,20 @@ def merge_transcription_and_diarization():
         gr.update(value=merged_transcription)
     ]

+    # Load JSON files
     transcription_path = "transcriptions/transcription_English.json"
     diarization_path = "diarization/diarization.json"
-    # Cargar los archivos JSON
     with open(transcription_path, 'r') as f:
         transcription = json.load(f)
     with open(diarization_path, 'r') as f:
         diarization = json.load(f)

-    #
+    # Create new list for combined chunks
     merged_chunks = []

-    #
+    # For each word in transcription
     for chunk in transcription.get('chunks', []):
-        #
+        # Verify chunk has valid timestamps
         if not (isinstance(chunk.get('start'), (int, float)) and
                 isinstance(chunk.get('end'), (int, float))):
             continue
@@ -403,10 +428,10 @@ def merge_transcription_and_diarization():
         word_start = float(chunk['start'])
         word_end = float(chunk['end'])

-        #
+        # Find corresponding speaker in diarization
        speaker = None
         for segment in diarization:
-            #
+            # Verify segment has valid timestamps
             if not (isinstance(segment.get('start'), (int, float)) and
                     isinstance(segment.get('end'), (int, float))):
                 continue
@@ -414,12 +439,12 @@ def merge_transcription_and_diarization():
             segment_start = float(segment['start'])
             segment_end = float(segment['end'])

-            #
+            # If word is within segment time range
             if (word_start >= segment_start and word_end <= segment_end):
                 speaker = segment['speaker']
                 break

-            #
+            # If word is mostly within segment (>50% duration)
             word_duration = word_end - word_start
             overlap_start = max(word_start, segment_start)
             overlap_end = min(word_end, segment_end)
@@ -429,7 +454,7 @@ def merge_transcription_and_diarization():
                 speaker = segment['speaker']
                 break

-        #
+        # Create new chunk with speaker information
         merged_chunk = {
             'start': word_start,
             'end': word_end,
@@ -438,17 +463,17 @@ def merge_transcription_and_diarization():
         }
         merged_chunks.append(merged_chunk)

-    #
+    # Create final dictionary
     merged_transcription = {
         'text': transcription.get('text', ''),
         'chunks': merged_chunks
     }

-    #
+    # Create directory if it doesn't exist
     if not os.path.exists(merged_transcription_path):
         os.makedirs(merged_transcription_path)

-    #
+    # Save result to new directory
     with open(merged_transcription_path, 'w', encoding='utf-8') as f:
         json.dump(merged_transcription, f, ensure_ascii=False, indent=2)
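Note: the comments added to merge_transcription_and_diarization() describe a two-step matching rule: a word keeps the speaker of the diarization segment that fully contains it, and otherwise the first segment that covers more than half of the word's duration. The following is a minimal standalone sketch of that rule, assuming the diarization segments are plain dicts as loaded from diarization.json; the assign_speaker helper and the sample data are illustrative only and do not exist in app.py.

# Illustrative sketch of the matching rule described by the new comments;
# assign_speaker is a hypothetical helper, not a function in app.py.
def assign_speaker(word_start, word_end, diarization):
    """Return the speaker whose segment contains the word, or covers >50% of it."""
    word_duration = word_end - word_start
    for segment in diarization:
        segment_start = float(segment['start'])
        segment_end = float(segment['end'])
        # Case 1: the word lies entirely inside the segment
        if word_start >= segment_start and word_end <= segment_end:
            return segment['speaker']
        # Case 2: the segment covers more than half of the word's duration
        overlap = min(word_end, segment_end) - max(word_start, segment_start)
        if word_duration > 0 and overlap > 0.5 * word_duration:
            return segment['speaker']
    return None

# Example: a word spanning 1.0-2.0 s overlaps SPEAKER_00's 0.0-1.8 s segment by 0.8 s
diarization = [{'start': 0.0, 'end': 1.8, 'speaker': 'SPEAKER_00'},
               {'start': 1.8, 'end': 5.0, 'speaker': 'SPEAKER_01'}]
print(assign_speaker(1.0, 2.0, diarization))  # SPEAKER_00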
slice_audio.py
CHANGED
@@ -7,6 +7,19 @@ FOLDER = "chunks"
 DEBUG = True

 def seconds_to_hms(seconds):
+    """
+    Convert seconds to hours, minutes, seconds format.
+
+    Args:
+        seconds (int): Total number of seconds to convert
+
+    Returns:
+        tuple: A tuple containing (hours, minutes, seconds)
+
+    Example:
+        >>> seconds_to_hms(3665)
+        (1, 1, 5)  # 1 hour, 1 minute, 5 seconds
+    """
     hour = 00
     minute = 00
     second = seconds
@@ -21,17 +34,46 @@ def seconds_to_hms(seconds):
     return hour, minute, second

 def hms_to_seconds(hour, minute, second):
+    """
+    Convert hours, minutes, seconds to total seconds.
+
+    Args:
+        hour (int): Number of hours
+        minute (int): Number of minutes
+        second (int): Number of seconds
+
+    Returns:
+        int: Total number of seconds
+
+    Example:
+        >>> hms_to_seconds(1, 1, 5)
+        3665  # 1 hour + 1 minute + 5 seconds in seconds
+    """
     return hour*3600 + minute*60 + second

 def slice_audio(input_audio_path, output_folder, chunks_seconds, chunk_overlap_seconds):
     """
     Slice audio into chunks with specified duration and overlap.

+    This function takes an audio file and splits it into smaller chunks with a specified
+    duration and overlap between chunks. It uses ffmpeg for the actual audio processing.
+
     Args:
-        input_audio_path (str): Path to input audio file
-        output_folder (str):
+        input_audio_path (str): Path to the input audio file
+        output_folder (str): Directory where the chunks will be saved
         chunks_seconds (int): Duration of each chunk in seconds
-        chunk_overlap_seconds (int):
+        chunk_overlap_seconds (int): Amount of overlap between consecutive chunks in seconds
+
+    Returns:
+        None: Creates audio chunks in the specified output folder and generates
+              a text file listing all chunk files
+
+    Raises:
+        ValueError: If chunk_overlap_seconds is greater than or equal to chunks_seconds
+
+    Example:
+        >>> slice_audio("input.mp3", "chunks", 30, 5)
+        # Creates chunks of 30 seconds with 5 seconds overlap
+    """
     _, filename = os.path.split(input_audio_path)
     name, extension = os.path.splitext(filename)
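Note: the expanded slice_audio docstring spells out the chunking contract: fixed-length chunks that overlap by chunk_overlap_seconds, written to output_folder together with a text file listing the chunk paths, with ffmpeg doing the cutting. Below is a rough sketch of that scheme, assuming chunk start times advance by (chunks_seconds - chunk_overlap_seconds); the chunk_starts and slice_with_ffmpeg helpers, the chunk file naming, and the exact ffmpeg invocation are assumptions for illustration, since the body of slice_audio.py's implementation is not shown in this diff.

import os

def chunk_starts(total_seconds, chunks_seconds, chunk_overlap_seconds):
    """Yield the start time of each overlapping chunk."""
    if chunk_overlap_seconds >= chunks_seconds:
        raise ValueError("Overlap must be shorter than the chunk length")
    step = chunks_seconds - chunk_overlap_seconds
    start = 0
    while start < total_seconds:
        yield start
        start += step

def slice_with_ffmpeg(input_audio_path, output_folder, total_seconds,
                      chunks_seconds, chunk_overlap_seconds):
    """Hypothetical helper showing how the chunks and the listing file could be produced."""
    os.makedirs(output_folder, exist_ok=True)
    chunk_files = []
    for i, start in enumerate(chunk_starts(total_seconds, chunks_seconds, chunk_overlap_seconds)):
        output_path = os.path.join(output_folder, f"chunk_{i:03d}.mp3")
        # Cut one chunk of chunks_seconds starting at `start` (illustrative command)
        os.system(f"ffmpeg -y -ss {start} -i {input_audio_path} -t {chunks_seconds} {output_path}")
        chunk_files.append(output_path)
    # Text file listing all chunk files, as described in the docstring's Returns section
    with open(os.path.join(output_folder, "chunks.txt"), "w") as f:
        f.write("\n".join(chunk_files))

# Example: a 70-second file with 30-second chunks and 5-second overlap
# produces chunks starting at 0, 25 and 50 seconds
print(list(chunk_starts(70, 30, 5)))  # [0, 25, 50]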