jamesesqueleto committed on
Commit
cfab262
·
1 Parent(s): fa0a8cc

init commit

Browse files
Files changed (3) hide show
  1. experiments/empty_file.md +0 -0
  2. gradio_app.py +212 -0
  3. requirements.txt +7 -0
experiments/empty_file.md ADDED
File without changes
gradio_app.py ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Imports
2
+ from __future__ import unicode_literals
3
+ from IPython.display import Video
4
+ import whisper
5
+ import cv2
6
+ import pandas as pd
7
+ from moviepy import VideoFileClip
8
+ from IPython.display import display, Markdown
9
+ # from moviepy.editor import *
10
+ from moviepy.video.tools.subtitles import SubtitlesClip
11
+ import os
12
+
13
+ from moviepy.video.tools.subtitles import SubtitlesClip
14
+ from moviepy.video.io.VideoFileClip import VideoFileClip
15
+ from moviepy import CompositeVideoClip
16
+ from moviepy import TextClip
17
+ import nemo.collections.asr as nemo_asr
18
+ import gradio as gr
19
+ #!/usr/bin/env python3
20
+ import csv, re, sys
21
+ from pathlib import Path
22
+
23
def parse_time_to_srt(t):
    """Convert a timestamp to the SRT format ``HH:MM:SS,mmm``.

    Accepts plain seconds (``"3.25"``), ``MM:SS`` or ``HH:MM:SS`` strings
    (the seconds part may be fractional).

    Raises:
        ValueError: If the string matches none of the accepted formats.
    """
    raw = str(t).strip()
    if re.fullmatch(r"\d+(\.\d+)?", raw):
        # Bare number: interpret as seconds.
        total_ms = int(round(float(raw) * 1000))
    else:
        pieces = raw.split(':')
        if len(pieces) == 2:
            minutes_part, seconds_part = pieces
            total_seconds = int(minutes_part) * 60 + float(seconds_part)
        elif len(pieces) == 3:
            hours_part, minutes_part, seconds_part = pieces
            total_seconds = (int(hours_part) * 3600
                             + int(minutes_part) * 60
                             + float(seconds_part))
        else:
            raise ValueError(f"Unrecognized time format: {raw}")
        total_ms = int(round(total_seconds * 1000))
    # Decompose milliseconds into the four SRT fields.
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    seconds, millis = divmod(remainder, 1000)
    return f"{hours:02}:{minutes:02}:{seconds:02},{millis:03}"
46
+
47
def map_position_to_tag(pos):
    """Map a free-text position description to an ASS ``{\\anN}`` override tag.

    An explicit ``anN`` (optionally backslash-prefixed) in the text wins;
    otherwise keyword matching picks a corner, edge, or center alignment.
    Returns "" when the position is empty or unrecognized.
    """
    if not pos:
        return ""
    desc = str(pos).strip().lower()
    # An explicit numeric alignment overrides keyword matching.
    explicit = re.search(r"\\?an([1-9])", desc)
    if explicit:
        return "{\\an" + explicit.group(1) + "}"
    # Corner combinations are checked before single edges so that
    # e.g. "top left" resolves to 7 rather than 8.
    rules = (
        (("top", "left"), "7"),
        (("top", "right"), "9"),
        (("bottom", "left"), "1"),
        (("bottom", "right"), "3"),
        (("top",), "8"),
        (("middle",), "5"),
        (("center",), "5"),
        (("centre",), "5"),
        (("bottom",), "2"),
    )
    for keywords, code in rules:
        if all(word in desc for word in keywords):
            return "{\\an" + code + "}"
    return ""
69
+
70
def looks_like_header(row):
    """Heuristic: True when the first four cells of *row* look like column names."""
    blob = ",".join(cell.strip().lower() for cell in row[:4])
    for keyword in ("position", "pos", "align", "start", "begin",
                    "end", "stop", "subtitle", "text", "caption"):
        if keyword in blob:
            return True
    return False
74
+
75
def csv_to_srt(csv_path: Path, srt_path: Path):
    """Convert a 4-column CSV (position, start, end, text) into an SRT file.

    A leading header row is detected with ``looks_like_header`` and skipped.
    Times may be plain seconds, ``MM:SS``, or ``HH:MM:SS``; the position
    column is turned into an ``{\\anN}`` tag via ``map_position_to_tag``.

    Args:
        csv_path: Path to the input CSV file.
        srt_path: Path the generated SRT file is written to.

    Raises:
        ValueError: If the CSV has no non-blank rows, or a row has fewer
            than 4 columns.
    """
    with open(csv_path, "r", encoding="utf-8-sig", newline="") as f:
        reader = csv.reader(f)
        # Keep only rows with at least one non-blank cell.
        rows = [row for row in reader if any(cell.strip() for cell in row)]
    if not rows:
        raise ValueError("CSV is empty.")
    start_index = 1 if looks_like_header(rows[0]) else 0
    normalized = []
    for i, row in enumerate(rows[start_index:], start=start_index + 1):
        if len(row) < 4:
            raise ValueError(f"Row {i} has fewer than 4 columns: {row}")
        # Rejoin any extra cells so subtitle text containing unquoted
        # commas is not silently truncated at the 4th column.
        text = ",".join(row[3:])
        normalized.append((row[0].strip(), row[1].strip(), row[2].strip(), text))
    with open(srt_path, "w", encoding="utf-8") as out:
        for idx, (position, start, end, text) in enumerate(normalized, start=1):
            start_srt = parse_time_to_srt(start)
            end_srt = parse_time_to_srt(end)
            pos_tag = map_position_to_tag(position)
            final_text = f"{pos_tag}{text}" if pos_tag else text
            out.write(f"{idx}\n")
            out.write(f"{start_srt} --> {end_srt}\n")
            out.write(f"{final_text}\n\n")
97
+
98
+ from pydub import AudioSegment
99
+
100
def convert_audio_to_mono_16khz(input_path, output_path):
    """
    Converts an audio file to mono and resamples it to 16 kHz.

    Best-effort: failures are printed rather than raised.

    Args:
        input_path (str): The path to the input audio file.
        output_path (str): The path to save the converted audio file.
    """
    try:
        # Load, collapse to one channel, and resample in a single chain.
        segment = (
            AudioSegment.from_file(input_path)
            .set_channels(1)
            .set_frame_rate(16000)
        )
        segment.export(output_path, format="wav")  # Export as WAV or desired format
    except Exception as err:
        print(f"Error converting audio: {err}")
    else:
        print(f"Audio converted successfully to mono, 16kHz at: {output_path}")
118
+
119
def subtitle_video(input_file):
    """Transcribe *input_file* with NVIDIA Parakeet and burn in subtitles.

    Pipeline: copy the clip into ``experiments/run/``, extract and
    downsample its audio, transcribe with segment timestamps, write the
    segments as CSV and SRT, then composite the subtitles over the video.

    Args:
        input_file (str): Path to the video file to caption.

    Returns:
        tuple: (path to the captioned mp4, DataFrame with start/end/text
        per segment, path to the generated .srt file).
    """
    # All intermediate artifacts live under experiments/<name>/.
    name = 'run'
    try:
        # makedirs(exist_ok=True) also creates the parent 'experiments'
        # directory; reruns with an existing directory are fine.
        os.makedirs(f'experiments/{name}', exist_ok=True)
        print('Starting AutoCaptioning...')
        print(f'Results will be stored in experiments/{name}')
    except OSError as e:
        # Narrow except: don't silently swallow unrelated failures.
        print(f'Could not create experiments/{name}: {e}')

    # Copy the clip into the run directory and extract its audio track.
    my_clip = VideoFileClip(input_file)
    my_clip.write_videofile(f"experiments/{name}/{input_file.split('/')[-1]}")
    # NOTE(review): codec="mp3" for a .wav target looks inconsistent —
    # confirm ffmpeg honours it; pydub re-reads the file below either way.
    my_clip.audio.write_audiofile(f'experiments/{name}/audio_file.wav', codec="mp3")

    # Instantiate the Parakeet ASR model.
    model = nemo_asr.models.ASRModel.from_pretrained(model_name="nvidia/parakeet-tdt-0.6b-v2")

    # Parakeet expects mono 16 kHz input; convert in place.
    convert_audio_to_mono_16khz(f'experiments/{name}/audio_file.wav', f'experiments/{name}/audio_file.wav')

    # Transcribe with segment-level timestamps.
    output = model.transcribe([f'experiments/{name}/audio_file.wav'], timestamps=True)

    # Shape the segment timestamps into start/end/text columns.
    df = pd.DataFrame(output[0].timestamp['segment'])
    df['text'] = df['segment']
    df = df.drop(['start_offset', 'end_offset', 'segment'], axis=1)

    # Persist subtitles as CSV, then convert to SRT for MoviePy.
    df.to_csv(f'experiments/{name}/subs.csv')
    csv_to_srt(f"experiments/{name}/subs.csv", f"experiments/{name}/subs.srt")

    # Subtitle generator: full-frame caption box anchored at the bottom,
    # font size scaled to the clip width.
    generator = lambda txt: TextClip(
        "./P052-Roman.ttf",
        text=txt,
        font_size=int(my_clip.w / 50),
        stroke_width=1,
        color="white",
        stroke_color="black",
        size=(my_clip.w, my_clip.h),
        vertical_align='bottom',
        horizontal_align='center',
        method='caption')

    subs = SubtitlesClip(f"experiments/{name}/subs.srt", make_textclip=generator)

    # Composite the subtitles over the original video and render.
    video = VideoFileClip(input_file)
    final = CompositeVideoClip([video, subs])
    final.write_videofile(f'experiments/{name}/output.mp4', fps=video.fps,
                          remove_temp=True, codec="libx264", audio_codec="aac")

    # Release the ffmpeg readers held by the clips.
    final.close()
    video.close()
    my_clip.close()
    return f'experiments/{name}/output.mp4', df, f"experiments/{name}/subs.srt"
191
+
192
# Gradio UI: one column per stage — input, trigger, outputs.
with gr.Blocks() as demo:
    # Centered title banner.
    gr.Markdown("<div style='display:flex;justify-content:center;align-items:center;gap:.5rem;font-size:24px;'>🦜 <strong>Parakeet AutoCaption Web App</strong></div>")
    with gr.Column():
        input_video = gr.Video(label='Input your video for captioning')
        # input_name = gr.Textbox(label = 'Name of your experiment run')
    with gr.Column():
        run_button = gr.Button('Run Video Captioning')
    with gr.Column():
        output_video = gr.Video(label='Output Video')
        output_subs = gr.Dataframe(label='Output Subtitles')
        output_subs_srt_file = gr.DownloadButton(label='Download subtitles as SRT file')
    # Wire the button to the captioning pipeline.
    run_button.click(
        fn=subtitle_video,
        inputs=[input_video],
        outputs=[output_video, output_subs, output_subs_srt_file],
    )

if __name__ == "__main__":
    demo.launch(share=True)
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ torch
2
+ torchvision
3
+ moviepy
4
+ pydub
5
+ opencv-python-headless
6
+ nemo-toolkit[all]
7
+ gradio