Spaces:
Sleeping
Sleeping
| """SRT 字幕文件解析器""" | |
| import re | |
| from dataclasses import dataclass | |
@dataclass
class SubtitleEntry:
    """A single subtitle cue: its sequence number, time span and text."""

    index: int  # cue sequence number
    start_time: float  # seconds
    end_time: float  # seconds
    text: str  # cue text
def parse_timestamp(timestamp: str) -> float:
    """Convert an SRT timestamp into a number of seconds.

    The expected layout is ``HH:MM:SS,mmm``. A period is also accepted as
    the millisecond separator (``HH:MM:SS.mmm``), and the hour field may be
    wider than two digits so very long recordings still parse.

    Only the start of the string has to match; anything following the
    milliseconds is ignored.

    Args:
        timestamp: Timestamp string, e.g. "00:01:23,456".

    Returns:
        The timestamp expressed in seconds.

    Raises:
        ValueError: If the string does not start with a valid timestamp.
    """
    # re.match anchors at the start only, so trailing text is tolerated.
    match = re.match(r"(\d{2,}):(\d{2}):(\d{2})[,.](\d{3})", timestamp)
    if not match:
        raise ValueError(f"无效的时间戳格式: {timestamp}")

    hours, minutes, seconds, milliseconds = map(int, match.groups())
    return hours * 3600 + minutes * 60 + seconds + milliseconds / 1000
def parse_srt(file_path: str) -> list[SubtitleEntry]:
    """Parse an SRT subtitle file into a list of subtitle entries.

    Each cue block is expected to hold a numeric index line, a timing line
    of the form ``start --> end``, and one or more text lines. Blocks that
    have fewer than three lines, a non-numeric index, or no ``-->`` on the
    timing line are skipped. Multi-line cue text is joined with single
    spaces.

    Args:
        file_path: Path to the SRT file.

    Returns:
        The parsed entries, in file order.

    Raises:
        ValueError: If a timing line contains a timestamp that
            ``parse_timestamp`` rejects.
    """
    # "utf-8-sig" drops a leading byte-order mark if present; with plain
    # "utf-8" the BOM stays glued to the first index line, int() fails on
    # it, and the first cue is silently lost.
    with open(file_path, encoding="utf-8-sig") as f:
        content = f.read()

    entries = []

    # Cue blocks are separated by blank lines. Splitting on a regex rather
    # than the literal "\n\n" also treats whitespace-only lines as
    # separators, so neighbouring cues are not fused together.
    blocks = re.split(r"\n\s*\n", content.strip())

    for block in blocks:
        lines = block.strip().split("\n")
        if len(lines) < 3:
            continue

        # First line: cue sequence number.
        try:
            index = int(lines[0])
        except ValueError:
            continue

        # Second line: "start --> end".
        match = re.match(r"(.+?)\s*-->\s*(.+)", lines[1])
        if not match:
            continue

        start_str, end_str = match.groups()
        start_time = parse_timestamp(start_str.strip())
        end_time = parse_timestamp(end_str.strip())

        # Remaining lines: cue text.
        text = " ".join(lines[2:])

        entries.append(
            SubtitleEntry(
                index=index,
                start_time=start_time,
                end_time=end_time,
                text=text,
            )
        )

    return entries