Spaces:

erastorgueva-nv
/

NeMo-Forced-Aligner

Running

App Files Community

Add option to split on \n. Add .ass output

by DEVMAXXING - opened Mar 31, 2024

base: refs/heads/main

←

from: refs/pr/1

Discussion Files changed

+16

-4

Files changed (1) hide show

app.py +16 -4

app.py CHANGED Viewed

@@ -104,7 +104,7 @@ def delete_mp4s_except_given_filepath(filepath):
-def align(lang, Microphone, File_Upload, text, col1, col2, col3, progress=gr.Progress()):
 	# Create utt_id,  specify output_video_filepath and delete any MP4s
 	# that are not that filepath. These stray MP4s can be created
 	# if a user refreshes or exits the page while this 'align' function is executing.
@@ -115,6 +115,7 @@ def align(lang, Microphone, File_Upload, text, col1, col2, col3, progress=gr.Pro
 	delete_mp4s_except_given_filepath(output_video_filepath)
 	output_info = ""
 	progress(0, desc="Validating input")
@@ -197,6 +198,10 @@ def align(lang, Microphone, File_Upload, text, col1, col2, col3, progress=gr.Pro
 		with open(manifest_path, 'w') as fout:
 			fout.write(f"{json.dumps(data)}\n")
 		# run alignment
 		if "|" in text:
 			resegment_text_to_fill_space = False
@@ -238,6 +243,9 @@ def align(lang, Microphone, File_Upload, text, col1, col2, col3, progress=gr.Pro
 			# make video file from the word-level ASS file
 			ass_file_for_video = f"{tmpdir}/nfa_output/ass/words/{utt_id}.ass"
 		ffmpeg_command = (
 			f"ffmpeg -y -i {audio_path} "
 			"-f lavfi -i color=c=white:s=1280x720:r=50 "
@@ -248,7 +256,7 @@ def align(lang, Microphone, File_Upload, text, col1, col2, col3, progress=gr.Pro
 		os.system(ffmpeg_command)
-	return output_video_filepath, gr.update(value=output_info, visible=True), output_video_filepath
 def delete_non_tmp_video(video_path):
@@ -281,6 +289,9 @@ with gr.Blocks(title="NeMo Forced Aligner", theme="huggingface") as demo:
 				label="[Optional] The reference text. Use '|' separators to specify which text will appear together. "
 				"Leave this field blank to use an ASR model's transcription as the reference text instead."
 			)
 			gr.Markdown("[Optional] For fun - adjust the colors of the text in the output video")
 			with gr.Row():
@@ -294,6 +305,7 @@ with gr.Blocks(title="NeMo Forced Aligner", theme="huggingface") as demo:
 			gr.Markdown("## Output")
 			video_out = gr.Video(label="output video")
 			text_out = gr.Textbox(label="output info", visible=False)
 	with gr.Row():
 		gr.HTML(
@@ -306,8 +318,8 @@ with gr.Blocks(title="NeMo Forced Aligner", theme="huggingface") as demo:
 	submit_button.click(
 		fn=align,
-		inputs=[lang_drop, mic_in, audio_file_in, ref_text, col1, col2, col3,],
-		outputs=[video_out, text_out, non_tmp_output_video_filepath],
 	).then(
 		fn=delete_non_tmp_video, inputs=[non_tmp_output_video_filepath], outputs=None,
 	)

+def align(lang, Microphone, File_Upload, text, col1, col2, col3, split_on_newline, progress=gr.Progress()):
 	# Create utt_id,  specify output_video_filepath and delete any MP4s
 	# that are not that filepath. These stray MP4s can be created
 	# if a user refreshes or exits the page while this 'align' function is executing.
 	delete_mp4s_except_given_filepath(output_video_filepath)
 	output_info = ""
+    ass_text=""
 	progress(0, desc="Validating input")
 		with open(manifest_path, 'w') as fout:
 			fout.write(f"{json.dumps(data)}\n")
+        # split text on new lines if requested
+        if split_on_newline:
+            text = "|".join(list(filter(None, text.split("\n"))))
 		# run alignment
 		if "|" in text:
 			resegment_text_to_fill_space = False
 			# make video file from the word-level ASS file
 			ass_file_for_video = f"{tmpdir}/nfa_output/ass/words/{utt_id}.ass"
+        with open(ass_file_for_video, "r") as ass_file:
+            ass_text = ass_file.read()
 		ffmpeg_command = (
 			f"ffmpeg -y -i {audio_path} "
 			"-f lavfi -i color=c=white:s=1280x720:r=50 "
 		os.system(ffmpeg_command)
+	return output_video_filepath, gr.update(value=output_info, visible=True), output_video_filepath, ass_text
 def delete_non_tmp_video(video_path):
 				label="[Optional] The reference text. Use '|' separators to specify which text will appear together. "
 				"Leave this field blank to use an ASR model's transcription as the reference text instead."
 			)
+            split_on_newline = gr.Checkbox(
+                label="Separate text on new lines", default=False
+            )
 			gr.Markdown("[Optional] For fun - adjust the colors of the text in the output video")
 			with gr.Row():
 			gr.Markdown("## Output")
 			video_out = gr.Video(label="output video")
 			text_out = gr.Textbox(label="output info", visible=False)
+            ass_out = gr.Textbox(label="output .ass")
 	with gr.Row():
 		gr.HTML(
 	submit_button.click(
 		fn=align,
+		inputs=[lang_drop, mic_in, audio_file_in, ref_text, col1, col2, col3,split_on_newline,],
+		outputs=[video_out, text_out, non_tmp_output_video_filepath, ass_out],
 	).then(
 		fn=delete_non_tmp_video, inputs=[non_tmp_output_video_filepath], outputs=None,
 	)