Spaces:
Build error
Build error
Commit
·
cfab262
1
Parent(s):
fa0a8cc
init commit
Browse files- experiments/empty_file.md +0 -0
- gradio_app.py +212 -0
- requirements.txt +7 -0
experiments/empty_file.md
ADDED
|
File without changes
|
gradio_app.py
ADDED
|
@@ -0,0 +1,212 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
## Imports
|
| 2 |
+
from __future__ import unicode_literals
|
| 3 |
+
from IPython.display import Video
|
| 4 |
+
import whisper
|
| 5 |
+
import cv2
|
| 6 |
+
import pandas as pd
|
| 7 |
+
from moviepy import VideoFileClip
|
| 8 |
+
from IPython.display import display, Markdown
|
| 9 |
+
# from moviepy.editor import *
|
| 10 |
+
from moviepy.video.tools.subtitles import SubtitlesClip
|
| 11 |
+
import os
|
| 12 |
+
|
| 13 |
+
from moviepy.video.tools.subtitles import SubtitlesClip
|
| 14 |
+
from moviepy.video.io.VideoFileClip import VideoFileClip
|
| 15 |
+
from moviepy import CompositeVideoClip
|
| 16 |
+
from moviepy import TextClip
|
| 17 |
+
import nemo.collections.asr as nemo_asr
|
| 18 |
+
import gradio as gr
|
| 19 |
+
#!/usr/bin/env python3
|
| 20 |
+
import csv, re, sys
|
| 21 |
+
from pathlib import Path
|
| 22 |
+
|
| 23 |
+
def parse_time_to_srt(t):
    """Render a time value as an SRT timestamp (``HH:MM:SS,mmm``).

    Accepts bare seconds ("90", "1.5"), "MM:SS", or "HH:MM:SS"; the
    seconds field may be fractional.  Raises ValueError on any other
    format.
    """
    s = str(t).strip()
    if re.fullmatch(r"\d+(\.\d+)?", s):
        # Plain (possibly fractional) second count.
        total_ms = int(round(float(s) * 1000))
    else:
        fields = s.split(':')
        if len(fields) == 2:
            minutes_str, seconds_str = fields
            total_ms = int(round((int(minutes_str) * 60 + float(seconds_str)) * 1000))
        elif len(fields) == 3:
            hours_str, minutes_str, seconds_str = fields
            total_ms = int(round((int(hours_str) * 3600 + int(minutes_str) * 60 + float(seconds_str)) * 1000))
        else:
            raise ValueError(f"Unrecognized time format: {s}")
    # Decompose milliseconds into H/M/S/ms via divmod instead of
    # repeated floor-divide / modulo pairs.
    hours, rem = divmod(total_ms, 3_600_000)
    minutes, rem = divmod(rem, 60_000)
    seconds, millis = divmod(rem, 1000)
    return f"{hours:02}:{minutes:02}:{seconds:02},{millis:03}"
|
| 46 |
+
|
| 47 |
+
def map_position_to_tag(pos):
    """Map a free-form position description to an ASS alignment tag.

    An explicit ``anN`` / ``\\anN`` marker wins; otherwise corner keyword
    pairs ("top left", ...) are checked before single keywords.  Returns
    "" when no position can be recognized.
    """
    if not pos:
        return ""
    s = str(pos).strip().lower()
    explicit = re.search(r"\\?an([1-9])", s)
    if explicit:
        return "{\\an" + explicit.group(1) + "}"

    def mentions(*words):
        # True when every keyword occurs somewhere in the description.
        return all(w in s for w in words)

    # Corners first.  (The original's literal "x y" check was subsumed by
    # the pairwise containment check, so only the latter is kept.)
    if mentions("top", "left"):
        return "{\\an7}"
    if mentions("top", "right"):
        return "{\\an9}"
    if mentions("bottom", "left"):
        return "{\\an1}"
    if mentions("bottom", "right"):
        return "{\\an3}"
    # Then edges / center.
    if "top" in s:
        return "{\\an8}"
    if mentions("middle") or mentions("center") or mentions("centre"):
        return "{\\an5}"
    if "bottom" in s:
        return "{\\an2}"
    return ""
|
| 69 |
+
|
| 70 |
+
def looks_like_header(row):
    """Heuristic: does this CSV row look like a header line?

    Joins (up to) the first four cells, lowercased, and scans for typical
    header keywords.  Matching is by substring, so e.g. "position" also
    satisfies the "pos" keyword.
    """
    probe = ",".join(cell.strip().lower() for cell in row[:4])
    keywords = ("position", "pos", "align", "start", "begin", "end", "stop", "subtitle", "text", "caption")
    return any(word in probe for word in keywords)
|
| 74 |
+
|
| 75 |
+
def csv_to_srt(csv_path: Path, srt_path: Path):
    """Convert a 4-column CSV (position, start, end, text) into an SRT file.

    Blank rows are dropped, a detected header row is skipped, and each cue
    is prefixed with an ASS alignment tag when its position column maps to
    one.  Raises ValueError for an empty CSV or a row with fewer than four
    columns.
    """
    with open(csv_path, "r", encoding="utf-8-sig", newline="") as src:
        # Keep only rows that have at least one non-blank cell.
        rows = [r for r in csv.reader(src) if any(cell.strip() for cell in r)]
    if not rows:
        raise ValueError("CSV is empty.")
    first_data = 1 if looks_like_header(rows[0]) else 0
    cues = []
    for i, row in enumerate(rows[first_data:], start=first_data + 1):
        if len(row) < 4:
            raise ValueError(f"Row {i} has fewer than 4 columns: {row}")
        # Text (column 4) keeps its whitespace; the other columns are trimmed.
        cues.append((row[0].strip(), row[1].strip(), row[2].strip(), row[3]))
    with open(srt_path, "w", encoding="utf-8") as out:
        for idx, (position, start, end, text) in enumerate(cues, start=1):
            tag = map_position_to_tag(position)
            body = f"{tag}{text}" if tag else text
            out.write(f"{idx}\n")
            out.write(f"{parse_time_to_srt(start)} --> {parse_time_to_srt(end)}\n")
            out.write(f"{body}\n\n")
|
| 97 |
+
|
| 98 |
+
from pydub import AudioSegment
|
| 99 |
+
|
| 100 |
+
def convert_audio_to_mono_16khz(input_path, output_path):
    """
    Converts an audio file to mono and resamples it to 16 kHz.

    Args:
        input_path (str): The path to the input audio file.
        output_path (str): The path to save the converted audio file.

    Raises:
        Exception: re-raises whatever pydub/ffmpeg raised, after logging,
        so a caller is never left with a missing or stale output file.
    """
    try:
        audio = AudioSegment.from_file(input_path)
        # Downmix to a single channel, then resample to the 16 kHz rate
        # that the downstream ASR model expects.
        audio = audio.set_channels(1)
        audio = audio.set_frame_rate(16000)
        audio.export(output_path, format="wav")  # Export as WAV or desired format
        print(f"Audio converted successfully to mono, 16kHz at: {output_path}")
    except Exception as e:
        # Previously the error was printed and swallowed, which let the
        # pipeline continue and fail later with a confusing message.
        print(f"Error converting audio: {e}")
        raise
|
| 118 |
+
|
| 119 |
+
def subtitle_video(input_file):
    """Transcribe a video with NVIDIA parakeet and composite subtitles onto it.

    Args:
        input_file (str): path to the video file to caption.

    Returns:
        tuple: (path to the captioned mp4, DataFrame of segment timestamps
        with columns start/end/text, path to the generated .srt file).

    All artifacts for a run are written under ``experiments/run/``.
    """
    name = 'run'
    # Replaces the original try/bare-except around os.mkdir: idempotent and
    # does not silently swallow unrelated OS errors.
    os.makedirs(f'experiments/{name}', exist_ok=True)
    print('Starting AutoCaptioning...')
    print(f'Results will be stored in experiments/{name}')

    # Keep a copy of the source video alongside the run artifacts and
    # extract its audio track for transcription.
    my_clip = VideoFileClip(input_file)
    my_clip.write_videofile(f"experiments/{name}/{input_file.split('/')[-1]}")
    # BUGFIX: the original passed codec="mp3" while writing a .wav file;
    # with no codec argument moviepy picks the proper PCM codec from the
    # .wav extension.
    my_clip.audio.write_audiofile(f'experiments/{name}/audio_file.wav')

    # Instantiate the parakeet ASR model (downloaded/cached by NeMo).
    model = nemo_asr.models.ASRModel.from_pretrained(model_name="nvidia/parakeet-tdt-0.6b-v2")

    # parakeet expects mono 16 kHz input; convert in place.
    convert_audio_to_mono_16khz(f'experiments/{name}/audio_file.wav', f'experiments/{name}/audio_file.wav')

    # Transcribe with segment-level timestamps.
    output = model.transcribe([f'experiments/{name}/audio_file.wav'], timestamps=True)

    # Collect segment timestamps into a DataFrame with columns start/end/text.
    df = pd.DataFrame(output[0].timestamp['segment'])
    df['text'] = df['segment']
    df = df.drop(['start_offset', 'end_offset', 'segment'], axis=1)

    # Persist subtitles as CSV, then convert to SRT for MoviePy.
    df.to_csv(f'experiments/{name}/subs.csv')
    csv_to_srt(f"experiments/{name}/subs.csv", f"experiments/{name}/subs.srt")

    # NOTE: removed an unused cv2.VideoCapture call here — its results were
    # never read and it pointed at a non-existent path
    # (experiments/<name>/<full input path>).

    # TextClip factory used by SubtitlesClip to render each caption line,
    # centered at the bottom of the frame with a thin black outline.
    generator = lambda txt: TextClip(
        "./P052-Roman.ttf",
        text=txt,
        font_size=int(my_clip.w / 50),
        stroke_width=1,
        color="white",
        stroke_color="black",
        size=(my_clip.w, my_clip.h),
        vertical_align='bottom',
        horizontal_align='center',
        method='caption')

    subs = SubtitlesClip(f"experiments/{name}/subs.srt", make_textclip=generator)

    # Composite the subtitles over the original video and render the result.
    video = VideoFileClip(input_file)
    final = CompositeVideoClip([video, subs])
    final.write_videofile(f'experiments/{name}/output.mp4', fps=video.fps,
                          remove_temp=True, codec="libx264", audio_codec="aac")

    return f'experiments/{name}/output.mp4', df, f"experiments/{name}/subs.srt"
|
| 191 |
+
|
| 192 |
+
with gr.Blocks() as demo:
    # Header banner.
    gr.Markdown("<div style='display:flex;justify-content:center;align-items:center;gap:.5rem;font-size:24px;'>🦜 <strong>Parakeet AutoCaption Web App</strong></div>")
    with gr.Column():
        input_video = gr.Video(label='Input your video for captioning')
        # input_name = gr.Textbox(label = 'Name of your experiment run')
    with gr.Column():
        run_button = gr.Button('Run Video Captioning')
    with gr.Column():
        output_video = gr.Video(label='Output Video')
        output_subs = gr.Dataframe(label='Output Subtitles')
        output_subs_srt_file = gr.DownloadButton(label='Download subtitles as SRT file')
    # Wire the button to the captioning pipeline.  Equivalent to the
    # single-trigger gr.on(triggers=[run_button.click], ...) form.
    run_button.click(
        fn=subtitle_video,
        inputs=[input_video],
        outputs=[output_video, output_subs, output_subs_srt_file],
    )

if __name__ == "__main__":
    demo.launch(share=True)
|
requirements.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
torch
|
| 2 |
+
torchvision
|
| 3 |
+
moviepy
|
| 4 |
+
pydub
|
| 5 |
+
opencv-python-headless
|
| 6 |
+
nemo-toolkit[all]
|
| 7 |
+
gradio
|