Spaces:
Sleeping
Sleeping
File size: 4,680 Bytes
dd5bcef 6672a34 dd5bcef 6672a34 dd5bcef 6672a34 dd5bcef 379a259 dd5bcef 6672a34 dd5bcef 6672a34 dd5bcef 6672a34 dd5bcef 6672a34 dd5bcef 6672a34 dd5bcef 6672a34 dd5bcef 6672a34 dd5bcef 379a259 dd5bcef 6672a34 dd5bcef 6672a34 dd5bcef ef51d6b 379a259 dd5bcef 6672a34 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 |
"""Utilities for VTT validation and cleaning."""
import re
from typing import Tuple
def parse_timestamp(timestamp_str: str) -> int | None:
"""
Parse timestamp string to milliseconds.
Args:
timestamp_str: Timestamp in format HH:MM:SS.mmm
Returns:
Milliseconds as integer, or None if parsing fails
"""
try:
parts = timestamp_str.strip().split(":")
hours = int(parts[0])
minutes = int(parts[1])
seconds_parts = parts[2].split(".")
seconds = int(seconds_parts[0])
milliseconds = int(seconds_parts[1])
total_ms = (hours * 3600 + minutes * 60 + seconds) * 1000 + milliseconds
return total_ms
except (ValueError, IndexError, AttributeError):
return None
def validate_vtt(vtt_content: str) -> Tuple[str, str]:
"""
Validate VTT format and return status message.
Args:
vtt_content: VTT file content as string
Returns:
Tuple of (status_message, status_type) where status_type is "error", "warning", "success", or ""
"""
if not vtt_content or vtt_content.strip() == "":
return "βͺ No content", ""
try:
# Check if starts with WEBVTT
if not vtt_content.strip().startswith("WEBVTT"):
return "π΄ Invalid: Missing WEBVTT header", "error"
lines = vtt_content.split("\n")
has_timestamps = False
timestamps = []
for i, line in enumerate(lines, 1):
if "-->" not in line:
continue
has_timestamps = True
# Validate timestamp format
match = re.match(
r"(\d{2}:\d{2}:\d{2}\.\d{3})\s*-->\s*(\d{2}:\d{2}:\d{2}\.\d{3})", line
)
if not match:
return f"π‘ Warning: Malformed timestamp found at line {i}", "warning"
# Parse and validate timestamps
start_str, end_str = match.groups()
start_ms = parse_timestamp(start_str)
end_ms = parse_timestamp(end_str)
if start_ms is None or end_ms is None:
return f"π‘ Warning: Invalid timestamp values at line {i}", "warning"
if start_ms >= end_ms:
return (
f"π‘ Warning: Start timestamp >= end timestamp at line {i}",
"warning",
)
timestamps.append((start_ms, end_ms, i))
if not has_timestamps:
return "π΄ Invalid: No timestamps found", "error"
# Check for overlapping timestamps
for i in range(len(timestamps) - 1):
current_end = timestamps[i][1]
next_start = timestamps[i + 1][0]
current_line = timestamps[i][2]
next_line = timestamps[i + 1][2]
if current_end > next_start:
return (
f"π‘ Warning: Overlapping timestamps detected (Lines {current_line} and {next_line})",
"warning",
)
# Check for punctuation followed by lowercase
last_char = None
last_line_num = None
for i, line in enumerate(lines):
if "-->" not in line:
continue
# Get text lines for this cue
j = i + 1
while j < len(lines):
content_line = lines[j]
if "-->" in content_line:
break
if content_line.strip() == "":
break
# Process text line
# Remove speaker tag for validation
clean_text = re.sub(r"<v\s+[^>]+>", "", content_line).strip()
if clean_text:
# Check internal to the line
match = re.search(r"([.!?])\s+([a-z])", clean_text)
if match:
return (
f"π‘ Warning: Punctuation followed by lowercase at line {j + 1}",
"warning",
)
# Check across boundary
if last_char and last_char in ".!?":
if clean_text[0].islower():
return (
f"π‘ Warning: Punctuation followed by lowercase across lines {last_line_num} and {j + 1}",
"warning",
)
last_char = clean_text[-1]
last_line_num = j + 1
j += 1
return "π’ Valid", "success"
except Exception as e:
return f"π΄ Validation error: {str(e)}", "error"
|