# Source page header: hiDenorIYamano's picture
# Commit message: ホワイトリスト追加、最大行数追加 (added a whitelist; added a maximum line count)
# Commit: cd646ca
import gradio as gr
import re
import MeCab
def split_japanese_line(line, max_length):
    """Split a Japanese subtitle line into at most two display lines.

    The text is tokenized with MeCab, cut after case / adverbial /
    sentence-final particles, and the resulting segments are merged back
    together so that no more than two lines are produced.

    Args:
        line: One line of subtitle text.
        max_length: Target maximum number of characters per output line.

    Returns:
        ``line`` unchanged when it already fits within ``max_length``.
        Otherwise the text with "、" and "。" removed and a newline inserted
        at the chosen break point. Because the number of lines is capped,
        the last line may still be longer than ``max_length``.
    """
    # A line that already fits (including one of exactly max_length
    # characters) is returned untouched, matching the "<= max_length" rule
    # used by the merge step below.
    if len(line) <= max_length:
        return line

    max_line = 2      # never emit more than this many lines
    min_length = 12   # keep extending a line until it is longer than this
    # Two-token phrases that must never be separated by a line break.
    joined_pairs = ("という", "っていう", "になって")
    # Part-of-speech tags after which a break is allowed.
    break_after = ("助詞-格助詞", "助詞-副助詞", "助詞-終助詞")

    line = re.sub(r'[、。]', '', line)

    tagger = MeCab.Tagger()
    words = []
    word_features = []
    for node in tagger.parse(line).split("\n"):
        if not node or node.startswith("EOS"):
            continue
        fields = re.split('[\t,]', node)
        words.append(node.split("\t")[0])
        # NOTE(review): assumes the dictionary's output format puts a tag such
        # as "助詞-格助詞" in the fifth tab/comma-separated field — confirm
        # against the MeCab dictionary in use. The guard must be "> 4" for
        # index 4 to exist.
        word_features.append(fields[4] if len(fields) > 4 else None)

    # Pass 1: cut the token stream into segments that end on a particle.
    segments = []
    current = ""
    idx = 0
    while idx < len(words):
        word = words[idx]
        if idx + 1 < len(words) and word + words[idx + 1] in joined_pairs:
            # Keep the whitelisted phrase together and skip its second token.
            current += word + words[idx + 1]
            idx += 2
            continue
        # Start a new segment when the previous token was a particle.
        # "idx > 0" prevents index -1 from wrapping around to the last token.
        if idx > 0 and word_features[idx - 1] in break_after:
            segments.append(current)
            current = ""
        current += word
        idx += 1
    if current:
        segments.append(current)

    # Pass 2: glue segments back together into at most max_line lines.
    merged_lines = []
    pending = ""
    for segment in segments:
        if (len(pending) <= min_length
                or len(pending) + len(segment) <= max_length
                or len(merged_lines) >= max_line - 1):
            # Too short to stand alone, still fits, or the line budget is
            # used up: keep extending the current line.
            pending += segment
        else:
            merged_lines.append(pending)
            pending = segment
    if pending:
        merged_lines.append(pending)
    return "\n".join(merged_lines)
def split_japanese_srt_text(srt_content, max_length):
    """Re-wrap the subtitle text lines of an SRT document.

    Index lines (digits only), timestamp lines and blank lines are copied
    through unchanged; every other line is treated as subtitle text and passed
    to ``split_japanese_line``.

    Args:
        srt_content: Full text of an SRT file.
        max_length: Target maximum number of characters per subtitle line.

    Returns:
        The SRT text with subtitle lines re-wrapped. The line-ending style of
        the input (CRLF or LF) is used for the output.
    """
    timestamp = re.compile(
        r"[0-9]{1,2}:[0-9]{2}:[0-9]{2},[0-9]{3} --> [0-9]{1,2}:[0-9]{2}:[0-9]{2},[0-9]{3}"
    )
    # Classify lines without a trailing "\r"; otherwise CRLF input makes index
    # and timestamp lines look like subtitle text and they get re-wrapped.
    newline = "\r\n" if "\r\n" in srt_content else "\n"
    modified_lines = []
    for line in srt_content.replace("\r\n", "\n").split("\n"):
        if line == "" or line.isdigit() or timestamp.fullmatch(line):
            modified_lines.append(line)
        else:
            # Subtitle text: one input line may become several output lines.
            modified_lines.extend(split_japanese_line(line, max_length).split("\n"))
    return newline.join(modified_lines)
def format_transcription_for_japanese(file_path, max_length, output_path="formatted_transcription.srt"):
    """Read an SRT file, re-wrap its Japanese subtitle lines and write the result.

    Args:
        file_path: Path of the UTF-8 encoded SRT file to read.
        max_length: Target maximum number of characters per subtitle line.
        output_path: Where to write the re-wrapped SRT. Defaults to
            "formatted_transcription.srt" in the current working directory.

    Returns:
        The path the formatted SRT was written to.
    """
    with open(file_path, "r", encoding="utf-8") as source:
        srt_content = source.read()
    formatted = split_japanese_srt_text(srt_content, max_length)
    with open(output_path, "w", encoding="utf-8") as target:
        target.write(formatted)
    return output_path
def main(srt_obj, max_length):
    """Handle one request: format the uploaded SRT and return the result file name.

    Args:
        srt_obj: Uploaded file object; its ``name`` attribute is used as the
            path of the SRT file to read.
        max_length: Target maximum number of characters per subtitle line.

    Returns:
        The fixed file name "formatted_transcription.srt".
    """
    source_path = srt_obj.name
    format_transcription_for_japanese(source_path, max_length)
    result_file = "formatted_transcription.srt"
    return result_file
# Web UI: takes an .srt upload plus a per-line character limit, returns a file.
app = gr.Interface(
    fn=main,
    inputs=[
        gr.File(file_types=[".srt"], label="srtファイル"),
        # Numeric input for the maximum number of characters per line (default 20).
        gr.Number(value=20, label="1行あたりの最大文字数"),
    ],
    outputs="file",
)

if __name__ == "__main__":
    # Enable the request queue with concurrency_count=1, then start the app.
    app.queue(concurrency_count=1)
    app.launch()