Add an option to split the reference text on newlines (\n). Add the generated .ass subtitle text as an output
#1
by DEVMAXXING - opened
app.py
CHANGED
|
@@ -104,7 +104,7 @@ def delete_mp4s_except_given_filepath(filepath):
|
|
| 104 |
|
| 105 |
|
| 106 |
|
| 107 |
-
def align(lang, Microphone, File_Upload, text, col1, col2, col3, progress=gr.Progress()):
|
| 108 |
# Create utt_id, specify output_video_filepath and delete any MP4s
|
| 109 |
# that are not that filepath. These stray MP4s can be created
|
| 110 |
# if a user refreshes or exits the page while this 'align' function is executing.
|
|
@@ -115,6 +115,7 @@ def align(lang, Microphone, File_Upload, text, col1, col2, col3, progress=gr.Pro
|
|
| 115 |
delete_mp4s_except_given_filepath(output_video_filepath)
|
| 116 |
|
| 117 |
output_info = ""
|
|
|
|
| 118 |
|
| 119 |
progress(0, desc="Validating input")
|
| 120 |
|
|
@@ -197,6 +198,10 @@ def align(lang, Microphone, File_Upload, text, col1, col2, col3, progress=gr.Pro
|
|
| 197 |
with open(manifest_path, 'w') as fout:
|
| 198 |
fout.write(f"{json.dumps(data)}\n")
|
| 199 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 200 |
# run alignment
|
| 201 |
if "|" in text:
|
| 202 |
resegment_text_to_fill_space = False
|
|
@@ -238,6 +243,9 @@ def align(lang, Microphone, File_Upload, text, col1, col2, col3, progress=gr.Pro
|
|
| 238 |
# make video file from the word-level ASS file
|
| 239 |
ass_file_for_video = f"{tmpdir}/nfa_output/ass/words/{utt_id}.ass"
|
| 240 |
|
|
|
|
|
|
|
|
|
|
| 241 |
ffmpeg_command = (
|
| 242 |
f"ffmpeg -y -i {audio_path} "
|
| 243 |
"-f lavfi -i color=c=white:s=1280x720:r=50 "
|
|
@@ -248,7 +256,7 @@ def align(lang, Microphone, File_Upload, text, col1, col2, col3, progress=gr.Pro
|
|
| 248 |
|
| 249 |
os.system(ffmpeg_command)
|
| 250 |
|
| 251 |
-
return output_video_filepath, gr.update(value=output_info, visible=True), output_video_filepath
|
| 252 |
|
| 253 |
|
| 254 |
def delete_non_tmp_video(video_path):
|
|
@@ -281,6 +289,9 @@ with gr.Blocks(title="NeMo Forced Aligner", theme="huggingface") as demo:
|
|
| 281 |
label="[Optional] The reference text. Use '|' separators to specify which text will appear together. "
|
| 282 |
"Leave this field blank to use an ASR model's transcription as the reference text instead."
|
| 283 |
)
|
|
|
|
|
|
|
|
|
|
| 284 |
|
| 285 |
gr.Markdown("[Optional] For fun - adjust the colors of the text in the output video")
|
| 286 |
with gr.Row():
|
|
@@ -294,6 +305,7 @@ with gr.Blocks(title="NeMo Forced Aligner", theme="huggingface") as demo:
|
|
| 294 |
gr.Markdown("## Output")
|
| 295 |
video_out = gr.Video(label="output video")
|
| 296 |
text_out = gr.Textbox(label="output info", visible=False)
|
|
|
|
| 297 |
|
| 298 |
with gr.Row():
|
| 299 |
gr.HTML(
|
|
@@ -306,8 +318,8 @@ with gr.Blocks(title="NeMo Forced Aligner", theme="huggingface") as demo:
|
|
| 306 |
|
| 307 |
submit_button.click(
|
| 308 |
fn=align,
|
| 309 |
-
inputs=[lang_drop, mic_in, audio_file_in, ref_text, col1, col2, col3,],
|
| 310 |
-
outputs=[video_out, text_out, non_tmp_output_video_filepath],
|
| 311 |
).then(
|
| 312 |
fn=delete_non_tmp_video, inputs=[non_tmp_output_video_filepath], outputs=None,
|
| 313 |
)
|
|
|
|
| 104 |
|
| 105 |
|
| 106 |
|
| 107 |
+
def align(lang, Microphone, File_Upload, text, col1, col2, col3, split_on_newline, progress=gr.Progress()):
|
| 108 |
# Create utt_id, specify output_video_filepath and delete any MP4s
|
| 109 |
# that are not that filepath. These stray MP4s can be created
|
| 110 |
# if a user refreshes or exits the page while this 'align' function is executing.
|
|
|
|
| 115 |
delete_mp4s_except_given_filepath(output_video_filepath)
|
| 116 |
|
| 117 |
output_info = ""
|
| 118 |
+
ass_text=""
|
| 119 |
|
| 120 |
progress(0, desc="Validating input")
|
| 121 |
|
|
|
|
| 198 |
with open(manifest_path, 'w') as fout:
|
| 199 |
fout.write(f"{json.dumps(data)}\n")
|
| 200 |
|
| 201 |
+
# if requested, split the text on newlines, drop empty lines, and join the rest with the '|' separator
|
| 202 |
+
if split_on_newline:
|
| 203 |
+
text = "|".join(list(filter(None, text.split("\n"))))
|
| 204 |
+
|
| 205 |
# run alignment
|
| 206 |
if "|" in text:
|
| 207 |
resegment_text_to_fill_space = False
|
|
|
|
| 243 |
# make video file from the word-level ASS file
|
| 244 |
ass_file_for_video = f"{tmpdir}/nfa_output/ass/words/{utt_id}.ass"
|
| 245 |
|
| 246 |
+
with open(ass_file_for_video, "r") as ass_file:
|
| 247 |
+
ass_text = ass_file.read()
|
| 248 |
+
|
| 249 |
ffmpeg_command = (
|
| 250 |
f"ffmpeg -y -i {audio_path} "
|
| 251 |
"-f lavfi -i color=c=white:s=1280x720:r=50 "
|
|
|
|
| 256 |
|
| 257 |
os.system(ffmpeg_command)
|
| 258 |
|
| 259 |
+
return output_video_filepath, gr.update(value=output_info, visible=True), output_video_filepath, ass_text
|
| 260 |
|
| 261 |
|
| 262 |
def delete_non_tmp_video(video_path):
|
|
|
|
| 289 |
label="[Optional] The reference text. Use '|' separators to specify which text will appear together. "
|
| 290 |
"Leave this field blank to use an ASR model's transcription as the reference text instead."
|
| 291 |
)
|
| 292 |
+
split_on_newline = gr.Checkbox(
|
| 293 |
+
label="Separate text on new lines", default=False
|
| 294 |
+
)
|
| 295 |
|
| 296 |
gr.Markdown("[Optional] For fun - adjust the colors of the text in the output video")
|
| 297 |
with gr.Row():
|
|
|
|
| 305 |
gr.Markdown("## Output")
|
| 306 |
video_out = gr.Video(label="output video")
|
| 307 |
text_out = gr.Textbox(label="output info", visible=False)
|
| 308 |
+
ass_out = gr.Textbox(label="output .ass")
|
| 309 |
|
| 310 |
with gr.Row():
|
| 311 |
gr.HTML(
|
|
|
|
| 318 |
|
| 319 |
submit_button.click(
|
| 320 |
fn=align,
|
| 321 |
+
inputs=[lang_drop, mic_in, audio_file_in, ref_text, col1, col2, col3,split_on_newline,],
|
| 322 |
+
outputs=[video_out, text_out, non_tmp_output_video_filepath, ass_out],
|
| 323 |
).then(
|
| 324 |
fn=delete_non_tmp_video, inputs=[non_tmp_output_video_filepath], outputs=None,
|
| 325 |
)
|