Spaces:
Runtime error
Runtime error
| import gradio as gr | |
| import re | |
| import MeCab | |
def split_japanese_line(line, max_length):
    """Split a long Japanese subtitle line into at most two display lines.

    A line that already fits within ``max_length`` characters is returned
    unchanged. Otherwise the punctuation marks ``、`` and ``。`` are removed,
    the text is tokenized with MeCab, and a break candidate is placed after
    every case, adverbial or sentence-final particle. The resulting segments
    are then greedily merged back together.

    Args:
        line: One line of subtitle text.
        max_length: Target maximum number of characters per output line.

    Returns:
        The text with a newline inserted at the chosen break, if any. At
        most two lines are produced, so the final line absorbs whatever is
        left and may still be longer than ``max_length``.
    """
    # Nothing to do for a line that already fits (the limit is inclusive,
    # matching the merge step below).
    if len(line) <= max_length:
        return line

    max_line = 2
    # A segment this short is always extended before a break is considered.
    min_segment_length = 12
    # Part-of-speech labels after which a break is allowed:
    # case particle, adverbial particle, sentence-final particle.
    break_after_pos = ("助詞-格助詞", "助詞-副助詞", "助詞-終助詞")
    # Two-token sequences that must never be separated.
    keep_together = ("という", "っていう", "になって")

    line = re.sub(r'[、。]', '', line)

    tagger = MeCab.Tagger()
    words = []
    word_features = []
    for node in tagger.parse(line).split("\n"):
        # Skip blank rows and the end-of-sentence marker. Compare the whole
        # row so that a real token starting with "EOS" is not dropped.
        if not node or node.rstrip() == "EOS":
            continue
        fields = re.split('[\t,]', node)
        words.append(node.split("\t")[0])
        # Field 4 is compared against the POS labels above; presumably this
        # is the POS column of a UniDic-style dictionary -- verify against
        # the installed dictionary.
        word_features.append(fields[4] if len(fields) > 4 else None)

    # Pass 1: cut the token stream into segments that end on a particle.
    segments = []
    current_segment = ""
    idx = 0
    while idx < len(words):
        word = words[idx]
        if idx + 1 < len(words) and word + words[idx + 1] in keep_together:
            current_segment += word + words[idx + 1]
            idx += 2  # the following token has been consumed as well
            continue
        # Only look back when there is a previous token; index -1 would
        # silently wrap around to the last token of the line.
        if idx > 0 and word_features[idx - 1] in break_after_pos:
            segments.append(current_segment)
            current_segment = ""
        current_segment += word
        idx += 1
    if current_segment:
        segments.append(current_segment)

    # Pass 2: merge segments back together. Keep growing the pending line
    # while it is still short, while the next segment fits, or once the
    # line budget is used up; otherwise start a new line.
    merged_lines = []
    pending = ""
    for segment in segments:
        if (
            len(pending) <= min_segment_length
            or len(pending + segment) <= max_length
            or len(merged_lines) >= max_line - 1
        ):
            pending += segment
        else:
            merged_lines.append(pending)
            pending = segment
    if pending:
        merged_lines.append(pending)
    return "\n".join(merged_lines)
def split_japanese_srt_text(srt_content, max_length):
    """Re-wrap the subtitle text lines of an SRT document.

    Cue numbers, timestamp lines and blank lines are passed through
    untouched; every other line is treated as subtitle text and handed to
    ``split_japanese_line``.

    Args:
        srt_content: Full text of an SRT file.
        max_length: Target maximum number of characters per subtitle line.

    Returns:
        The SRT text with long subtitle lines split. Line endings of the
        input (LF or CRLF) are kept as they were.
    """
    timestamp_pattern = re.compile(
        r"^[0-9]{1,2}:[0-9]{2}:[0-9]{2},[0-9]{3} --> [0-9]{1,2}:[0-9]{2}:[0-9]{2},[0-9]{3}$"
    )
    modified_lines = []
    for raw_line in srt_content.split("\n"):
        # Classify on the text without a trailing carriage return, so that
        # CRLF input does not turn cue numbers and timestamps into "text".
        content = raw_line.rstrip("\r")
        line_ending = raw_line[len(content):]
        is_structural = (
            content == ""
            or content.isdigit()
            or timestamp_pattern.match(content) is not None
        )
        if is_structural:
            modified_lines.append(raw_line)
            continue
        # Subtitle text: split it and give every piece the original ending.
        pieces = split_japanese_line(content, max_length).split("\n")
        modified_lines.extend(piece + line_ending for piece in pieces)
    return "\n".join(modified_lines)
def format_transcription_for_japanese(file_path, max_length, output_path="formatted_transcription.srt"):
    """Read an SRT file, re-wrap its Japanese subtitle lines and save the result.

    Args:
        file_path: Path of the UTF-8 encoded SRT file to read.
        max_length: Target maximum number of characters per subtitle line.
        output_path: Where to write the reformatted SRT. Defaults to
            ``formatted_transcription.srt`` in the current working directory.

    Returns:
        The path the reformatted file was written to.
    """
    with open(file_path, "r", encoding="utf-8") as source:
        srt_content = source.read()

    formatted_content = split_japanese_srt_text(srt_content, max_length)

    with open(output_path, "w", encoding="utf-8") as target:
        target.write(formatted_content)
    return output_path
def main(srt_obj, max_length):
    """Gradio callback: reformat the uploaded SRT and return the output path.

    Args:
        srt_obj: The uploaded file as delivered by the Gradio File
            component: either an object exposing the file path as
            ``.name`` or the path string itself.
        max_length: Target maximum number of characters per subtitle line.

    Returns:
        Path of the reformatted SRT file, for the file output component.
    """
    # Accept both forms of upload: use .name when present, otherwise treat
    # the value itself as the path.
    source_path = getattr(srt_obj, "name", srt_obj)
    format_transcription_for_japanese(source_path, max_length)
    return "formatted_transcription.srt"
# Gradio UI: an SRT upload and a per-line character limit go in, the
# reformatted SRT file comes out.
app = gr.Interface(
    fn=main,
    inputs=[
        gr.File(file_types=[".srt"], label="srtファイル"),
        # Maximum number of characters per subtitle line (initial value 20).
        gr.Number(20, label="1行あたりの最大文字数"),
    ],
    outputs="file",
)

if __name__ == "__main__":
    # NOTE(review): newer Gradio releases appear to have dropped the
    # `concurrency_count` argument of queue() -- confirm against the
    # Gradio version this app is deployed with.
    app.queue(concurrency_count=1)
    app.launch()