Spaces:
Build error
Build error
Commit
·
cfab262
1
Parent(s):
fa0a8cc
init commit
Browse files- experiments/empty_file.md +0 -0
- gradio_app.py +212 -0
- requirements.txt +7 -0
experiments/empty_file.md
ADDED
|
File without changes
|
gradio_app.py
ADDED
|
@@ -0,0 +1,212 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
## Imports
|
| 2 |
+
from __future__ import unicode_literals
|
| 3 |
+
from IPython.display import Video
|
| 4 |
+
import whisper
|
| 5 |
+
import cv2
|
| 6 |
+
import pandas as pd
|
| 7 |
+
from moviepy import VideoFileClip
|
| 8 |
+
from IPython.display import display, Markdown
|
| 9 |
+
# from moviepy.editor import *
|
| 10 |
+
from moviepy.video.tools.subtitles import SubtitlesClip
|
| 11 |
+
import os
|
| 12 |
+
|
| 13 |
+
from moviepy.video.tools.subtitles import SubtitlesClip
|
| 14 |
+
from moviepy.video.io.VideoFileClip import VideoFileClip
|
| 15 |
+
from moviepy import CompositeVideoClip
|
| 16 |
+
from moviepy import TextClip
|
| 17 |
+
import nemo.collections.asr as nemo_asr
|
| 18 |
+
import gradio as gr
|
| 19 |
+
#!/usr/bin/env python3
|
| 20 |
+
import csv, re, sys
|
| 21 |
+
from pathlib import Path
|
| 22 |
+
|
| 23 |
+
def parse_time_to_srt(t):
    """Render a time value as an SRT timestamp (``HH:MM:SS,mmm``).

    Accepts bare seconds ("90", "1.5"), "MM:SS", or "HH:MM:SS"; the
    seconds field may be fractional.  Raises ValueError on any other
    format.
    """
    s = str(t).strip()
    if re.fullmatch(r"\d+(\.\d+)?", s):
        # Plain (possibly fractional) second count.
        total_ms = int(round(float(s) * 1000))
    else:
        fields = s.split(':')
        if len(fields) == 2:
            minutes_str, seconds_str = fields
            total_ms = int(round((int(minutes_str) * 60 + float(seconds_str)) * 1000))
        elif len(fields) == 3:
            hours_str, minutes_str, seconds_str = fields
            total_ms = int(round((int(hours_str) * 3600 + int(minutes_str) * 60 + float(seconds_str)) * 1000))
        else:
            raise ValueError(f"Unrecognized time format: {s}")
    # Decompose milliseconds into H/M/S/ms via divmod instead of
    # repeated floor-divide / modulo pairs.
    hours, rem = divmod(total_ms, 3_600_000)
    minutes, rem = divmod(rem, 60_000)
    seconds, millis = divmod(rem, 1000)
    return f"{hours:02}:{minutes:02}:{seconds:02},{millis:03}"
|
| 46 |
+
|
| 47 |
+
def map_position_to_tag(pos):
    """Map a free-form position description to an ASS alignment tag.

    An explicit ``anN`` / ``\\anN`` marker wins; otherwise corner keyword
    pairs ("top left", ...) are checked before single keywords.  Returns
    "" when no position can be recognized.
    """
    if not pos:
        return ""
    s = str(pos).strip().lower()
    explicit = re.search(r"\\?an([1-9])", s)
    if explicit:
        return "{\\an" + explicit.group(1) + "}"

    def mentions(*words):
        # True when every keyword occurs somewhere in the description.
        return all(w in s for w in words)

    # Corners first.  (The original's literal "x y" check was subsumed by
    # the pairwise containment check, so only the latter is kept.)
    if mentions("top", "left"):
        return "{\\an7}"
    if mentions("top", "right"):
        return "{\\an9}"
    if mentions("bottom", "left"):
        return "{\\an1}"
    if mentions("bottom", "right"):
        return "{\\an3}"
    # Then edges / center.
    if "top" in s:
        return "{\\an8}"
    if mentions("middle") or mentions("center") or mentions("centre"):
        return "{\\an5}"
    if "bottom" in s:
        return "{\\an2}"
    return ""
|
| 69 |
+
|
| 70 |
+
def looks_like_header(row):
    """Heuristic: does this CSV row look like a header line?

    Joins (up to) the first four cells, lowercased, and scans for typical
    header keywords.  Matching is by substring, so e.g. "position" also
    satisfies the "pos" keyword.
    """
    probe = ",".join(cell.strip().lower() for cell in row[:4])
    keywords = ("position", "pos", "align", "start", "begin", "end", "stop", "subtitle", "text", "caption")
    return any(word in probe for word in keywords)
|
| 74 |
+
|
| 75 |
+
def csv_to_srt(csv_path: Path, srt_path: Path):
    """Convert a 4-column CSV (position, start, end, text) into an SRT file.

    Blank rows are dropped, a detected header row is skipped, and each cue
    is prefixed with an ASS alignment tag when its position column maps to
    one.  Raises ValueError for an empty CSV or a row with fewer than four
    columns.
    """
    with open(csv_path, "r", encoding="utf-8-sig", newline="") as src:
        # Keep only rows that have at least one non-blank cell.
        rows = [r for r in csv.reader(src) if any(cell.strip() for cell in r)]
    if not rows:
        raise ValueError("CSV is empty.")
    first_data = 1 if looks_like_header(rows[0]) else 0
    cues = []
    for i, row in enumerate(rows[first_data:], start=first_data + 1):
        if len(row) < 4:
            raise ValueError(f"Row {i} has fewer than 4 columns: {row}")
        # Text (column 4) keeps its whitespace; the other columns are trimmed.
        cues.append((row[0].strip(), row[1].strip(), row[2].strip(), row[3]))
    with open(srt_path, "w", encoding="utf-8") as out:
        for idx, (position, start, end, text) in enumerate(cues, start=1):
            tag = map_position_to_tag(position)
            body = f"{tag}{text}" if tag else text
            out.write(f"{idx}\n")
            out.write(f"{parse_time_to_srt(start)} --> {parse_time_to_srt(end)}\n")
            out.write(f"{body}\n\n")
|
| 97 |
+
|
| 98 |
+
from pydub import AudioSegment
|
| 99 |
+
|
| 100 |
+
def convert_audio_to_mono_16khz(input_path, output_path):
    """
    Converts an audio file to mono and resamples it to 16 kHz.

    Args:
        input_path (str): The path to the input audio file.
        output_path (str): The path to save the converted audio file.

    Raises:
        Exception: re-raises whatever pydub/ffmpeg raised, after logging,
        so a caller is never left with a missing or stale output file.
    """
    try:
        audio = AudioSegment.from_file(input_path)
        # Downmix to a single channel, then resample to the 16 kHz rate
        # that the downstream ASR model expects.
        audio = audio.set_channels(1)
        audio = audio.set_frame_rate(16000)
        audio.export(output_path, format="wav")  # Export as WAV or desired format
        print(f"Audio converted successfully to mono, 16kHz at: {output_path}")
    except Exception as e:
        # Previously the error was printed and swallowed, which let the
        # pipeline continue and fail later with a confusing message.
        print(f"Error converting audio: {e}")
        raise
|
| 118 |
+
|
| 119 |
+
def subtitle_video(input_file):
    """Transcribe a video with NVIDIA parakeet and composite subtitles onto it.

    Args:
        input_file (str): path to the video file to caption.

    Returns:
        tuple: (path to the captioned mp4, DataFrame of segment timestamps
        with columns start/end/text, path to the generated .srt file).

    All artifacts for a run are written under ``experiments/run/``.
    """
    name = 'run'
    # Replaces the original try/bare-except around os.mkdir: idempotent and
    # does not silently swallow unrelated OS errors.
    os.makedirs(f'experiments/{name}', exist_ok=True)
    print('Starting AutoCaptioning...')
    print(f'Results will be stored in experiments/{name}')

    # Keep a copy of the source video alongside the run artifacts and
    # extract its audio track for transcription.
    my_clip = VideoFileClip(input_file)
    my_clip.write_videofile(f"experiments/{name}/{input_file.split('/')[-1]}")
    # BUGFIX: the original passed codec="mp3" while writing a .wav file;
    # with no codec argument moviepy picks the proper PCM codec from the
    # .wav extension.
    my_clip.audio.write_audiofile(f'experiments/{name}/audio_file.wav')

    # Instantiate the parakeet ASR model (downloaded/cached by NeMo).
    model = nemo_asr.models.ASRModel.from_pretrained(model_name="nvidia/parakeet-tdt-0.6b-v2")

    # parakeet expects mono 16 kHz input; convert in place.
    convert_audio_to_mono_16khz(f'experiments/{name}/audio_file.wav', f'experiments/{name}/audio_file.wav')

    # Transcribe with segment-level timestamps.
    output = model.transcribe([f'experiments/{name}/audio_file.wav'], timestamps=True)

    # Collect segment timestamps into a DataFrame with columns start/end/text.
    df = pd.DataFrame(output[0].timestamp['segment'])
    df['text'] = df['segment']
    df = df.drop(['start_offset', 'end_offset', 'segment'], axis=1)

    # Persist subtitles as CSV, then convert to SRT for MoviePy.
    df.to_csv(f'experiments/{name}/subs.csv')
    csv_to_srt(f"experiments/{name}/subs.csv", f"experiments/{name}/subs.srt")

    # NOTE: removed an unused cv2.VideoCapture call here — its results were
    # never read and it pointed at a non-existent path
    # (experiments/<name>/<full input path>).

    # TextClip factory used by SubtitlesClip to render each caption line,
    # centered at the bottom of the frame with a thin black outline.
    generator = lambda txt: TextClip(
        "./P052-Roman.ttf",
        text=txt,
        font_size=int(my_clip.w / 50),
        stroke_width=1,
        color="white",
        stroke_color="black",
        size=(my_clip.w, my_clip.h),
        vertical_align='bottom',
        horizontal_align='center',
        method='caption')

    subs = SubtitlesClip(f"experiments/{name}/subs.srt", make_textclip=generator)

    # Composite the subtitles over the original video and render the result.
    video = VideoFileClip(input_file)
    final = CompositeVideoClip([video, subs])
    final.write_videofile(f'experiments/{name}/output.mp4', fps=video.fps,
                          remove_temp=True, codec="libx264", audio_codec="aac")

    return f'experiments/{name}/output.mp4', df, f"experiments/{name}/subs.srt"
|
| 191 |
+
|
| 192 |
+
with gr.Blocks() as demo:
    # Header banner.
    gr.Markdown("<div style='display:flex;justify-content:center;align-items:center;gap:.5rem;font-size:24px;'>🦜 <strong>Parakeet AutoCaption Web App</strong></div>")
    with gr.Column():
        input_video = gr.Video(label='Input your video for captioning')
        # input_name = gr.Textbox(label = 'Name of your experiment run')
    with gr.Column():
        run_button = gr.Button('Run Video Captioning')
    with gr.Column():
        output_video = gr.Video(label='Output Video')
        output_subs = gr.Dataframe(label='Output Subtitles')
        output_subs_srt_file = gr.DownloadButton(label='Download subtitles as SRT file')
    # Wire the button to the captioning pipeline.  Equivalent to the
    # single-trigger gr.on(triggers=[run_button.click], ...) form.
    run_button.click(
        fn=subtitle_video,
        inputs=[input_video],
        outputs=[output_video, output_subs, output_subs_srt_file],
    )

if __name__ == "__main__":
    demo.launch(share=True)
|
requirements.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
torch
|
| 2 |
+
torchvision
|
| 3 |
+
moviepy
|
| 4 |
+
pydub
|
| 5 |
+
opencv-python-headless
|
| 6 |
+
nemo-toolkit[all]
|
| 7 |
+
gradio
|