whisper-diarization / src /vtt_utils.py
lucamartinelli's picture
Validazione
ef51d6b
"""Utilities for VTT validation and cleaning."""
import re
from typing import Tuple
def parse_timestamp(timestamp_str: str) -> int | None:
"""
Parse timestamp string to milliseconds.
Args:
timestamp_str: Timestamp in format HH:MM:SS.mmm
Returns:
Milliseconds as integer, or None if parsing fails
"""
try:
parts = timestamp_str.strip().split(":")
hours = int(parts[0])
minutes = int(parts[1])
seconds_parts = parts[2].split(".")
seconds = int(seconds_parts[0])
milliseconds = int(seconds_parts[1])
total_ms = (hours * 3600 + minutes * 60 + seconds) * 1000 + milliseconds
return total_ms
except (ValueError, IndexError, AttributeError):
return None
def validate_vtt(vtt_content: str) -> Tuple[str, str]:
"""
Validate VTT format and return status message.
Args:
vtt_content: VTT file content as string
Returns:
Tuple of (status_message, status_type) where status_type is "error", "warning", "success", or ""
"""
if not vtt_content or vtt_content.strip() == "":
return "βšͺ No content", ""
try:
# Check if starts with WEBVTT
if not vtt_content.strip().startswith("WEBVTT"):
return "πŸ”΄ Invalid: Missing WEBVTT header", "error"
lines = vtt_content.split("\n")
has_timestamps = False
timestamps = []
for i, line in enumerate(lines, 1):
if "-->" not in line:
continue
has_timestamps = True
# Validate timestamp format
match = re.match(
r"(\d{2}:\d{2}:\d{2}\.\d{3})\s*-->\s*(\d{2}:\d{2}:\d{2}\.\d{3})", line
)
if not match:
return f"🟑 Warning: Malformed timestamp found at line {i}", "warning"
# Parse and validate timestamps
start_str, end_str = match.groups()
start_ms = parse_timestamp(start_str)
end_ms = parse_timestamp(end_str)
if start_ms is None or end_ms is None:
return f"🟑 Warning: Invalid timestamp values at line {i}", "warning"
if start_ms >= end_ms:
return (
f"🟑 Warning: Start timestamp >= end timestamp at line {i}",
"warning",
)
timestamps.append((start_ms, end_ms, i))
if not has_timestamps:
return "πŸ”΄ Invalid: No timestamps found", "error"
# Check for overlapping timestamps
for i in range(len(timestamps) - 1):
current_end = timestamps[i][1]
next_start = timestamps[i + 1][0]
current_line = timestamps[i][2]
next_line = timestamps[i + 1][2]
if current_end > next_start:
return (
f"🟑 Warning: Overlapping timestamps detected (Lines {current_line} and {next_line})",
"warning",
)
# Check for punctuation followed by lowercase
last_char = None
last_line_num = None
for i, line in enumerate(lines):
if "-->" not in line:
continue
# Get text lines for this cue
j = i + 1
while j < len(lines):
content_line = lines[j]
if "-->" in content_line:
break
if content_line.strip() == "":
break
# Process text line
# Remove speaker tag for validation
clean_text = re.sub(r"<v\s+[^>]+>", "", content_line).strip()
if clean_text:
# Check internal to the line
match = re.search(r"([.!?])\s+([a-z])", clean_text)
if match:
return (
f"🟑 Warning: Punctuation followed by lowercase at line {j + 1}",
"warning",
)
# Check across boundary
if last_char and last_char in ".!?":
if clean_text[0].islower():
return (
f"🟑 Warning: Punctuation followed by lowercase across lines {last_line_num} and {j + 1}",
"warning",
)
last_char = clean_text[-1]
last_line_num = j + 1
j += 1
return "🟒 Valid", "success"
except Exception as e:
return f"πŸ”΄ Validation error: {str(e)}", "error"