Spaces:

bbruceyuan
/

Auto-Chapter-Bar

Sleeping

File size: 1,926 Bytes

db4f540

"""SRT 字幕文件解析器"""

import re
from dataclasses import dataclass


@dataclass
class SubtitleEntry:
    """One subtitle cue: its sequence number, time span, and text."""

    # Sequence number of the cue.
    index: int
    # Time at which the cue appears, in seconds.
    start_time: float  # seconds
    # Time at which the cue disappears, in seconds.
    end_time: float  # seconds
    # Text shown while the cue is active.
    text: str


def parse_timestamp(timestamp: str) -> float:
    """Convert an SRT timestamp into a number of seconds.

    The canonical form is ``HH:MM:SS,mmm``. Two common variants are accepted
    as well: a ``.`` instead of ``,`` before the milliseconds, and an hour
    field with a number of digits other than two (e.g. ``0:01:23,456``).

    Only the beginning of the string is matched, so any text following the
    timestamp is ignored rather than rejected.

    Args:
        timestamp: Timestamp string, e.g. ``"00:01:23,456"``.

    Returns:
        The timestamp expressed in seconds.

    Raises:
        ValueError: If ``timestamp`` does not start with a recognizable
            timestamp.
    """
    # Groups: hours, minutes, seconds, milliseconds.
    match = re.match(r"(\d+):(\d{2}):(\d{2})[,.](\d{3})", timestamp)
    if not match:
        raise ValueError(f"无效的时间戳格式: {timestamp}")

    hours, minutes, seconds, milliseconds = map(int, match.groups())
    return hours * 3600 + minutes * 60 + seconds + milliseconds / 1000


def parse_srt(file_path: str) -> list[SubtitleEntry]:
    """Parse an SRT subtitle file into a list of subtitle entries.

    The file is split into blocks separated by blank lines. Each block is
    expected to hold an integer index line, a ``start --> end`` timestamp
    line, and one or more text lines. A block is silently skipped when it has
    fewer than three lines, when its first line is not an integer, or when
    its second line contains no ``-->`` separator.

    Args:
        file_path: Path to the SRT file. It is decoded as UTF-8; a leading
            byte-order mark, if present, is discarded.

    Returns:
        The parsed entries, in the order they appear in the file. Multi-line
        subtitle text is joined into a single line with spaces.

    Raises:
        ValueError: Propagated from ``parse_timestamp`` when a block has a
            ``-->`` line whose start or end timestamp cannot be parsed.
    """
    # "utf-8-sig" drops a leading BOM. With plain "utf-8" the BOM stays glued
    # to the first index ("\ufeff1"), int() rejects it, and the first
    # subtitle is silently lost.
    with open(file_path, encoding="utf-8-sig") as f:
        content = f.read()

    entries = []
    # Split on blank lines. The separator line may itself contain spaces or
    # tabs, which a literal "\n\n" split would not recognize and would
    # therefore merge two cues into one.
    blocks = re.split(r"\n\s*\n", content.strip())

    for block in blocks:
        lines = block.strip().split("\n")
        if len(lines) < 3:
            continue

        # First line: sequence number.
        try:
            index = int(lines[0])
        except ValueError:
            continue

        # Second line: "start --> end".
        match = re.match(r"(.+?)\s*-->\s*(.+)", lines[1])
        if not match:
            continue

        start_str, end_str = match.groups()
        start_time = parse_timestamp(start_str.strip())
        end_time = parse_timestamp(end_str.strip())

        # Remaining lines: subtitle text, flattened to one line.
        text = " ".join(lines[2:])

        entries.append(
            SubtitleEntry(
                index=index,
                start_time=start_time,
                end_time=end_time,
                text=text,
            )
        )

    return entries