| import re |
|
|
|
|
def timeformat_srt(time):
    """Format a time offset in seconds as an SRT timestamp (``HH:MM:SS,mmm``).

    Args:
        time: Offset in seconds as an int or float (assumed non-negative).

    Returns:
        The timestamp string, e.g. ``"00:01:02,500"`` for ``62.5``.
    """
    # Round to whole milliseconds once, then split with integer arithmetic.
    # Truncating the float fraction instead turns e.g. 2.3 s into ",299"
    # because 2.3 - 2 is slightly below 0.3 in binary floating point.
    total_ms = int(round(float(time) * 1000))
    total_seconds, milliseconds = divmod(total_ms, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"
|
|
def timeformat_txt(time):
    """Render a time offset in seconds as ``MM:SS``, or ``HH:MM:SS`` from one hour on.

    Fractional seconds are truncated, not rounded.
    """
    whole_hours, remainder = divmod(time, 3600)
    whole_minutes, secs = divmod(remainder, 60)
    clock = f"{int(whole_minutes):02d}:{int(secs):02d}"
    # The hour field is only shown once at least one full hour has elapsed.
    if whole_hours > 0:
        return f"{int(whole_hours):02d}:{clock}"
    return clock
| |
def timeformat_vtt(time):
    """Format a time offset in seconds as a WebVTT timestamp (``HH:MM:SS.mmm``).

    Args:
        time: Offset in seconds as an int or float (assumed non-negative).

    Returns:
        The timestamp string, e.g. ``"00:01:02.500"`` for ``62.5``.
    """
    # Round to whole milliseconds once, then split with integer arithmetic.
    # Truncating the float fraction instead turns e.g. 2.3 s into ".299"
    # because 2.3 - 2 is slightly below 0.3 in binary floating point.
    total_ms = int(round(float(time) * 1000))
    total_seconds, milliseconds = divmod(total_ms, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d}.{milliseconds:03d}"
|
|
|
|
def write_file(subtitle, output_file):
    """Write the ``subtitle`` text to ``output_file`` as UTF-8.

    The file is opened in write mode, so existing content is replaced.
    """
    with open(output_file, mode='w', encoding='utf-8') as handle:
        handle.write(subtitle)
|
|
|
|
def get_srt(segments):
    """Serialize transcription segments as SubRip (SRT) text.

    Args:
        segments: Iterable of dict-like objects indexed by ``'start'`` and
            ``'end'`` (passed to ``timeformat_srt``) and ``'text'`` (str).
            The segments are not modified.

    Returns:
        One numbered cue per segment (counting from 1), each followed by a
        blank line; an empty string when there are no segments.
    """
    cues = []
    for number, segment in enumerate(segments, start=1):
        text = segment['text']
        # Drop a single leading space on a local copy only. Writing the
        # trimmed text back into the segment would make the output of a later
        # exporter depend on whether this one ran first.
        if text.startswith(' '):
            text = text[1:]
        cues.append(
            f"{number}\n"
            f"{timeformat_srt(segment['start'])} --> {timeformat_srt(segment['end'])}\n"
            f"{text}\n\n"
        )
    return "".join(cues)
|
|
def get_csv(segments):
    """Serialize transcription segments as semicolon-separated CSV text.

    If any segment text contains a ``SPEAKER NN: `` label, every row gets an
    extra ``Speaker`` column holding the part of the text before its first
    colon; a segment without a colon gets an empty speaker.

    Args:
        segments: Iterable of dict-like objects indexed by ``'start'`` and
            ``'end'`` (passed to ``timeformat_srt``) and ``'text'`` (str).

    Returns:
        A header line followed by one row per segment (numbered from 1),
        without a trailing newline. Fields containing the delimiter, a double
        quote or a line break are quoted per the usual CSV rules.
    """
    import csv
    import io

    # Materialize once: the segments are scanned twice below.
    segments = list(segments)
    has_speakers = any(
        re.search(r'SPEAKER [0-9][0-9]: ', segment['text']) is not None
        for segment in segments
    )

    buffer = io.StringIO()
    # csv.writer takes care of quoting, so a ';' or '"' inside the spoken
    # text can no longer shift or corrupt the columns.
    writer = csv.writer(buffer, delimiter=';', lineterminator='\n')
    if has_speakers:
        writer.writerow(["Line", "Start time", "End time", "Speaker", "Text"])
    else:
        writer.writerow(["Line", "Start time", "End time", "Text"])

    for number, segment in enumerate(segments, start=1):
        start = timeformat_srt(segment['start'])
        end = timeformat_srt(segment['end'])
        if has_speakers:
            head, colon, tail = segment['text'].partition(":")
            if colon:
                speaker_id, speaker_text = head.strip(), tail.strip()
            else:
                # No colon in this segment: keep the column layout with an
                # empty speaker instead of failing on the missing part.
                speaker_id, speaker_text = "", head.strip()
            writer.writerow([number, start, end, speaker_id, speaker_text])
        else:
            writer.writerow([number, start, end, segment['text'].strip()])

    return buffer.getvalue().rstrip("\n")
|
|
def get_vtt(segments):
    """Serialize transcription segments as WebVTT text.

    Args:
        segments: Iterable of dict-like objects indexed by ``'start'`` and
            ``'end'`` (passed to ``timeformat_vtt``) and ``'text'`` (str).
            The segments are not modified.

    Returns:
        The header line and a blank line, then one numbered cue per segment
        (counting from 1), each followed by a blank line.
    """
    # NOTE(review): the WebVTT specification spells the file signature
    # "WEBVTT" (upper case). The mixed-case header is kept unchanged here
    # because parse_vtt in this file recognizes the header by this spelling.
    parts = ["WebVTT\n\n"]
    for number, segment in enumerate(segments, start=1):
        text = segment['text']
        # Drop a single leading space on a local copy only. Writing the
        # trimmed text back into the segment would make the output of a later
        # exporter depend on whether this one ran first.
        if text.startswith(' '):
            text = text[1:]
        parts.append(
            f"{number}\n"
            f"{timeformat_vtt(segment['start'])} --> {timeformat_vtt(segment['end'])}\n"
            f"{text}\n\n"
        )
    return "".join(parts)
|
|
|
|
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
|
|
def get_txt(segments):
    """Serialize transcription segments as tab-separated text with start times.

    If any segment text contains a ``SPEAKER NN: `` label, every line gets a
    speaker column holding the part of the text before its first colon; a
    segment without a colon gets an empty speaker.

    Args:
        segments: Iterable of dict-like objects indexed by ``'start'``
            (passed to ``timeformat_txt``) and ``'text'`` (str).

    Returns:
        One newline-terminated line per segment: ``time<TAB>text`` or, with
        speakers, ``time<TAB>speaker<TAB>text``. Empty string for no segments.
    """
    # Materialize once: the segments are scanned twice below.
    segments = list(segments)
    has_speakers = any(
        re.search(r'SPEAKER [0-9][0-9]: ', segment['text']) is not None
        for segment in segments
    )

    rows = []
    for segment in segments:
        stamp = timeformat_txt(segment['start'])
        if has_speakers:
            head, colon, tail = segment['text'].partition(":")
            if colon:
                speaker_id, speaker_text = head.strip(), tail.strip()
            else:
                # No colon in this segment: keep the column layout with an
                # empty speaker instead of failing on the missing part.
                speaker_id, speaker_text = "", head.strip()
            rows.append(f"{stamp}\t{speaker_id}\t{speaker_text}\n")
        else:
            speaker_text = segment['text'].strip()
            rows.append(f"{stamp}\t{speaker_text}\n")
    return "".join(rows)
|
|
def get_plaintext(segments):
    """Serialize transcription segments as plain text, one segment per line.

    If any segment text contains a ``SPEAKER NN: `` label, every line gets a
    leading speaker column holding the part of the text before its first
    colon; a segment without a colon gets an empty speaker.

    Args:
        segments: Iterable of dict-like objects indexed by ``'text'`` (str).

    Returns:
        One newline-terminated line per segment: the stripped text or, with
        speakers, ``speaker<TAB>text``. Empty string for no segments.
    """
    # Materialize once: the segments are scanned twice below.
    segments = list(segments)
    has_speakers = any(
        re.search(r'SPEAKER [0-9][0-9]: ', segment['text']) is not None
        for segment in segments
    )

    rows = []
    for segment in segments:
        if has_speakers:
            head, colon, tail = segment['text'].partition(":")
            if colon:
                speaker_id, speaker_text = head.strip(), tail.strip()
            else:
                # No colon in this segment: keep the column layout with an
                # empty speaker instead of failing on the missing part.
                speaker_id, speaker_text = "", head.strip()
            rows.append(f"{speaker_id}\t{speaker_text}\n")
        else:
            speaker_text = segment['text'].strip()
            rows.append(f"{speaker_text}\n")
    return "".join(rows)
|
|
def parse_srt(file_path):
    """Read an SRT file and return its cues as a list of dicts.

    Args:
        file_path: Path of the UTF-8 encoded SRT file; a leading byte order
            mark is tolerated.

    Returns:
        A list with one dict per cue, in file order, with the keys
        ``"index"`` (first line of the block), ``"timestamp"`` (second line)
        and ``"sentence"`` (remaining lines joined by single spaces). All
        values are strings. Blocks with fewer than two lines are skipped.
    """
    # 'utf-8-sig' drops a byte order mark if one is present; with plain
    # 'utf-8' the mark would end up inside the first cue's index.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        srt_data = file.read()

    data = []
    for block in srt_data.split('\n\n'):
        lines = block.strip().split('\n')
        # Empty blocks and stray single lines have no timestamp line to read.
        if len(lines) < 2:
            continue
        index, timestamp, *text_lines = lines
        data.append({
            "index": index,
            "timestamp": timestamp,
            "sentence": ' '.join(text_lines),
        })
    return data
|
|
|
|
def parse_vtt(file_path):
    """Read a WebVTT file and return its cues as a list of dicts.

    Args:
        file_path: Path of the UTF-8 encoded WebVTT file; a leading byte
            order mark is tolerated.

    Returns:
        A list with one dict per cue, in file order, with the keys
        ``"index"`` (first line of the block), ``"timestamp"`` (second line)
        and ``"sentence"`` (remaining lines joined by single spaces). All
        values are strings. Header blocks and blocks with fewer than two
        lines are skipped.
    """
    # 'utf-8-sig' drops a byte order mark if one is present; with plain
    # 'utf-8' the mark would hide the header from the check below.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        webvtt_data = file.read()

    data = []
    for block in webvtt_data.split('\n\n'):
        stripped = block.strip()
        # Match the header in any letter case: files written by this module
        # start with "WebVTT", spec-conformant files with "WEBVTT".
        if stripped.upper().startswith("WEBVTT"):
            continue
        lines = stripped.split('\n')
        # Empty blocks and stray single lines have no timestamp line to read.
        if len(lines) < 2:
            continue
        index, timestamp, *text_lines = lines
        data.append({
            "index": index,
            "timestamp": timestamp,
            "sentence": ' '.join(text_lines),
        })
    return data
|
|
|
|
def get_serialized_srt(dicts):
    """Turn cue dicts back into SRT text.

    Each dict contributes its ``"index"``, ``"timestamp"`` and ``"sentence"``
    values on consecutive lines, followed by one blank line.
    """
    return "".join(
        f'{cue["index"]}\n{cue["timestamp"]}\n{cue["sentence"]}\n\n'
        for cue in dicts
    )
|
|
|
|
def get_serialized_vtt(dicts):
    """Turn cue dicts back into WebVTT text.

    The output starts with the header line and a blank line; each dict then
    contributes its ``"index"``, ``"timestamp"`` and ``"sentence"`` values on
    consecutive lines, followed by one blank line.
    """
    cue_blocks = [
        f'{cue["index"]}\n{cue["timestamp"]}\n{cue["sentence"]}\n\n'
        for cue in dicts
    ]
    return "WebVTT\n\n" + "".join(cue_blocks)
|
|
|
|
def safe_filename(name, max_length=20):
    """Replace characters that are invalid in file names and cap the length.

    Each of ``< > : " / \\ | ? *`` and every control character
    (``\\x00``-``\\x1f``) is replaced by an underscore. A result longer than
    ``max_length`` is shortened; the text after the last dot is kept as the
    extension when it fits, otherwise the name is simply cut.

    Args:
        name: The proposed file name.
        max_length: Maximum length of the returned name. Defaults to 20.

    Returns:
        The sanitized name, at most ``max_length`` characters long.

    Raises:
        ValueError: If ``max_length`` is smaller than 1.
    """
    if max_length < 1:
        raise ValueError("max_length must be at least 1")

    safe_name = re.sub(r'[<>:"/\\|?*\x00-\x1f]', '_', name)
    if len(safe_name) <= max_length:
        return safe_name

    # Without any dot the "extension" is the whole name, which is too long
    # to keep, so the plain cut at the end of the function applies.
    file_extension = safe_name.split('.')[-1]
    if len(file_extension) + 1 < max_length:
        stem_length = max_length - len(file_extension) - 1
        return safe_name[:stem_length] + '.' + file_extension
    return safe_name[:max_length]
|
|