File size: 4,680 Bytes
dd5bcef
6672a34
dd5bcef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6672a34
dd5bcef
 
6672a34
dd5bcef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
379a259
dd5bcef
6672a34
dd5bcef
 
 
6672a34
 
dd5bcef
 
 
 
 
6672a34
 
 
dd5bcef
6672a34
dd5bcef
 
 
 
 
 
 
6672a34
dd5bcef
 
6672a34
 
 
 
dd5bcef
6672a34
dd5bcef
 
379a259
dd5bcef
 
 
 
 
6672a34
 
 
dd5bcef
6672a34
 
 
 
dd5bcef
ef51d6b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
379a259
dd5bcef
6672a34
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
"""Utilities for VTT validation and cleaning."""

import re
from typing import Tuple


def parse_timestamp(timestamp_str: str) -> int | None:
    """
    Parse timestamp string to milliseconds.

    Args:
        timestamp_str: Timestamp in format HH:MM:SS.mmm

    Returns:
        Milliseconds as integer, or None if parsing fails
    """
    try:
        parts = timestamp_str.strip().split(":")
        hours = int(parts[0])
        minutes = int(parts[1])
        seconds_parts = parts[2].split(".")
        seconds = int(seconds_parts[0])
        milliseconds = int(seconds_parts[1])

        total_ms = (hours * 3600 + minutes * 60 + seconds) * 1000 + milliseconds
        return total_ms
    except (ValueError, IndexError, AttributeError):
        return None


# File signature: "WEBVTT" followed by whitespace or the end of the text, so
# that look-alikes such as "WEBVTTX" are not accepted as a header.
_VTT_HEADER_RE = re.compile(r"WEBVTT(?:\s|$)")
# Cue timing line "HH:MM:SS.mmm --> HH:MM:SS.mmm", anchored at line start.
_VTT_CUE_TIMING_RE = re.compile(
    r"(\d{2}:\d{2}:\d{2}\.\d{3})\s*-->\s*(\d{2}:\d{2}:\d{2}\.\d{3})"
)
# Opening voice tag, e.g. "<v Speaker Name>".
_VTT_VOICE_TAG_RE = re.compile(r"<v\s+[^>]+>")
# Sentence-ending punctuation, whitespace, then a lowercase ASCII letter.
_VTT_PUNCT_THEN_LOWER_RE = re.compile(r"([.!?])\s+([a-z])")


def validate_vtt(vtt_content: str) -> Tuple[str, str]:
    """
    Validate VTT format and return status message.

    Checks run in this order and only the first problem found is reported:

    1. Content is present (a leading byte-order mark is ignored).
    2. The text, after stripping surrounding whitespace, starts with a
       WEBVTT header.
    3. Every line containing "-->" is a well-formed cue timing whose start
       is strictly before its end.
    4. At least one cue timing exists.
    5. No cue ends after the following cue starts.
    6. In cue text (the non-blank lines following a timing line, with
       opening ``<v ...>`` tags removed), sentence-ending punctuation is not
       followed by a lowercase letter, either within a line or from the end
       of one text line to the start of the next (including across cues).

    Line numbers in messages are 1-based positions in the "\\n"-split input.

    Args:
        vtt_content: VTT file content as string

    Returns:
        Tuple of (status_message, status_type) where status_type is "error", "warning", "success", or ""
        ("" is used only when there is no content). Any unexpected exception
        raised while validating is reported as an "error" status rather than
        propagated.
    """
    if not vtt_content:
        return "βšͺ No content", ""

    try:
        # str.strip() does not treat U+FEFF as whitespace, so a BOM left in
        # by the file reader would otherwise hide the header. It sits on the
        # first line, so removing it does not shift line numbers.
        content = vtt_content.lstrip("\ufeff")
        stripped = content.strip()
        if not stripped:
            return "βšͺ No content", ""

        if not _VTT_HEADER_RE.match(stripped):
            return "πŸ”΄ Invalid: Missing WEBVTT header", "error"

        lines = content.split("\n")

        # (start_ms, end_ms, line_number) for every cue timing, in file order.
        cues = []
        for line_num, line in enumerate(lines, 1):
            if "-->" not in line:
                continue

            timing = _VTT_CUE_TIMING_RE.match(line)
            if not timing:
                return (
                    f"🟑 Warning: Malformed timestamp found at line {line_num}",
                    "warning",
                )

            start_str, end_str = timing.groups()
            start_ms = parse_timestamp(start_str)
            end_ms = parse_timestamp(end_str)

            if start_ms is None or end_ms is None:
                return (
                    f"🟑 Warning: Invalid timestamp values at line {line_num}",
                    "warning",
                )

            if start_ms >= end_ms:
                return (
                    f"🟑 Warning: Start timestamp >= end timestamp at line {line_num}",
                    "warning",
                )

            cues.append((start_ms, end_ms, line_num))

        # Every "-->" line either returned above or was recorded, so an empty
        # list means no such line was present.
        if not cues:
            return "πŸ”΄ Invalid: No timestamps found", "error"

        # Only neighbouring cues (in file order) are compared.
        for (_, current_end, current_line), (next_start, _, next_line) in zip(
            cues, cues[1:]
        ):
            if current_end > next_start:
                return (
                    f"🟑 Warning: Overlapping timestamps detected (Lines {current_line} and {next_line})",
                    "warning",
                )

        # Punctuation followed by lowercase. Cue text is every non-blank line
        # after a timing line, up to the next blank line or timing line.
        in_cue_text = False
        last_char = None
        last_line_num = None

        for line_num, line in enumerate(lines, 1):
            if "-->" in line:
                in_cue_text = True
                continue
            if line.strip() == "":
                in_cue_text = False
                continue
            if not in_cue_text:
                continue

            # Remove speaker tag for validation
            clean_text = _VTT_VOICE_TAG_RE.sub("", line).strip()
            if not clean_text:
                continue

            # Check internal to the line
            if _VTT_PUNCT_THEN_LOWER_RE.search(clean_text):
                return (
                    f"🟑 Warning: Punctuation followed by lowercase at line {line_num}",
                    "warning",
                )

            # Check across boundary with the previous text line
            if last_char and last_char in ".!?" and clean_text[0].islower():
                return (
                    f"🟑 Warning: Punctuation followed by lowercase across lines {last_line_num} and {line_num}",
                    "warning",
                )

            last_char = clean_text[-1]
            last_line_num = line_num

        return "🟒 Valid", "success"
    except Exception as e:
        return f"πŸ”΄ Validation error: {str(e)}", "error"