Spaces:

hidenoriyamano37
/

srtLineBreakTool

Runtime error

App Files Files Community

srtLineBreakTool / app.py

hiDenorIYamano

ホワイトリスト追加、最大行数追加

cd646ca over 2 years ago

raw

history blame contribute delete

3.74 kB

	import gradio as gr
	import re
	import MeCab


	def split_japanese_line(line, max_length, tagger=None):
	    """Break a long Japanese subtitle line into at most two display lines.

	    Lines shorter than ``max_length`` are returned untouched. Longer lines
	    have the punctuation marks ``、`` and ``。`` removed, are tokenized with
	    MeCab, cut after case / adverbial / sentence-final particles, and the
	    resulting pieces are merged back together into at most two lines.

	    Args:
	        line: Subtitle text without line breaks.
	        max_length: Target maximum number of characters per display line.
	        tagger: Optional object exposing ``parse(text)`` that returns
	            MeCab-style output. A new ``MeCab.Tagger()`` is created when
	            omitted; passing one lets a caller reuse a single tagger.

	    Returns:
	        The text with a newline inserted at the chosen break point. Because
	        the number of lines is capped at two, the last line can be longer
	        than ``max_length``.

	    NOTE(review): the part-of-speech lookup assumes UniDic-style output in
	    which the fourth feature field after the surface form is the
	    hyphen-joined POS (e.g. ``助詞-格助詞``) -- confirm against the
	    dictionary installed alongside MeCab.
	    """
	    if len(line) < max_length:
	        return line

	    max_lines = 2          # hard cap on the number of produced lines
	    min_merge_length = 12  # a pending line this short always absorbs the next piece
	    # Two-token sequences that are always kept together on one line.
	    glued_pairs = ("という", "っていう", "になって")
	    # A break is allowed right after a token tagged with one of these.
	    break_after = ("助詞-格助詞", "助詞-副助詞", "助詞-終助詞")

	    line = re.sub(r'[、。]', '', line)

	    if tagger is None:
	        tagger = MeCab.Tagger()

	    words = []
	    features = []
	    for node in tagger.parse(line).split("\n"):
	        # Skip blank rows and the end-of-sentence marker only; a real token
	        # whose surface happens to start with "EOS" must be kept.
	        if not node or node.strip() == "EOS":
	            continue
	        surface, _, feature_text = node.partition("\t")
	        # Split the features separately so a comma inside the surface form
	        # cannot shift the field positions.
	        fields = re.split('[\t,]', feature_text)
	        words.append(surface)
	        features.append(fields[3] if len(fields) > 3 else None)

	    # Pass 1: cut the token stream after every particle listed above.
	    pieces = []
	    current = ""
	    idx = 0
	    while idx < len(words):
	        word = words[idx]
	        if idx + 1 < len(words) and word + words[idx + 1] in glued_pairs:
	            # Whitelisted pair: consume both tokens without breaking.
	            current += word + words[idx + 1]
	            idx += 2
	            continue
	        # Only look back when there is a previous token; index -1 would
	        # wrap around to the last token of the sentence.
	        if idx > 0 and features[idx - 1] in break_after:
	            if current:
	                pieces.append(current)
	            current = ""
	        current += word
	        idx += 1

	    if current:
	        pieces.append(current)

	    # Pass 2: glue pieces back together while the pending line is still
	    # short, while the result still fits, or once the line cap is reached.
	    merged = []
	    pending = ""
	    for piece in pieces:
	        fits = len(pending) + len(piece) <= max_length
	        if len(pending) <= min_merge_length or fits or len(merged) >= max_lines - 1:
	            pending += piece
	        else:
	            merged.append(pending)
	            pending = piece
	    if pending:
	        merged.append(pending)

	    return "\n".join(merged)


	def split_japanese_srt_text(srt_content, max_length):
	    """Re-wrap the subtitle text lines of an SRT document.

	    Cue numbers, timestamp lines and blank lines are copied through
	    unchanged; every other line is treated as subtitle text and handed to
	    ``split_japanese_line``.

	    Args:
	        srt_content: Full text of an SRT file.
	        max_length: Target maximum number of characters per subtitle line.

	    Returns:
	        The SRT text with long subtitle lines broken up. Line endings of the
	        input (LF or CRLF) are preserved.
	    """
	    timestamp = r"[0-9]{1,2}:[0-9]{2}:[0-9]{2},[0-9]{3} --> [0-9]{1,2}:[0-9]{2}:[0-9]{2},[0-9]{3}"
	    modified_lines = []

	    for raw in srt_content.split("\n"):
	        # With CRLF input each line still carries a trailing "\r"; classify
	        # on the text without it so numbers and timestamps are recognised
	        # instead of being sent to the morphological splitter.
	        ending = "\r" if raw.endswith("\r") else ""
	        text = raw[:-1] if ending else raw

	        if text == "" or text.isdigit() or re.fullmatch(timestamp, text):
	            modified_lines.append(raw)
	        else:
	            modified_lines.extend(
	                piece + ending
	                for piece in split_japanese_line(text, max_length).split("\n")
	            )

	    return "\n".join(modified_lines)

	def format_transcription_for_japanese(file_path, max_length):
	    """Read an SRT file, re-wrap its subtitle lines, and save the result.

	    The input is read as UTF-8 from ``file_path``; the processed text is
	    written as UTF-8 to ``formatted_transcription.srt`` in the current
	    working directory. Nothing is returned.
	    """
	    with open(file_path, "r", encoding="utf-8") as source:
	        original_text = source.read()

	    # Process before opening the output so a failure leaves it untouched.
	    reformatted_text = split_japanese_srt_text(original_text, max_length)

	    with open("formatted_transcription.srt", "w", encoding="utf-8") as target:
	        target.write(reformatted_text)

	def main(srt_obj, max_length):
	    """Gradio callback: reformat the uploaded SRT file.

	    Args:
	        srt_obj: The value delivered by the file input. Either an object
	            whose ``name`` attribute is the path of the uploaded file, or
	            the path itself as a string.
	        max_length: Target maximum number of characters per subtitle line.

	    Returns:
	        The path of the written file, ``"formatted_transcription.srt"``.

	    Raises:
	        gr.Error: If no file was uploaded.
	    """
	    if srt_obj is None:
	        # Surface a readable message in the UI instead of an AttributeError.
	        raise gr.Error("srtファイルをアップロードしてください")

	    # Accept both a file wrapper (``.name``) and a plain path string.
	    srt_path = srt_obj if isinstance(srt_obj, str) else srt_obj.name
	    format_transcription_for_japanese(srt_path, max_length)
	    return "formatted_transcription.srt"



	# Web UI: an .srt upload and a per-line character limit go in,
	# the reformatted file comes out.
	app = gr.Interface(
	    fn=main,
	    inputs=[
	        gr.File(file_types=[".srt"], label="srtファイル"),
	        gr.Number(value=20, label="1行あたりの最大文字数"),
	    ],
	    outputs="file",
	)

	if __name__ == "__main__":
	    # Enable the request queue with concurrency_count=1, then start the server.
	    app.queue(concurrency_count=1)
	    app.launch()