"""Utilities for VTT validation and cleaning.""" import re from typing import Tuple def parse_timestamp(timestamp_str: str) -> int | None: """ Parse timestamp string to milliseconds. Args: timestamp_str: Timestamp in format HH:MM:SS.mmm Returns: Milliseconds as integer, or None if parsing fails """ try: parts = timestamp_str.strip().split(":") hours = int(parts[0]) minutes = int(parts[1]) seconds_parts = parts[2].split(".") seconds = int(seconds_parts[0]) milliseconds = int(seconds_parts[1]) total_ms = (hours * 3600 + minutes * 60 + seconds) * 1000 + milliseconds return total_ms except (ValueError, IndexError, AttributeError): return None def validate_vtt(vtt_content: str) -> Tuple[str, str]: """ Validate VTT format and return status message. Args: vtt_content: VTT file content as string Returns: Tuple of (status_message, status_type) where status_type is "error", "warning", "success", or "" """ if not vtt_content or vtt_content.strip() == "": return "⚪ No content", "" try: # Check if starts with WEBVTT if not vtt_content.strip().startswith("WEBVTT"): return "🔴 Invalid: Missing WEBVTT header", "error" lines = vtt_content.split("\n") has_timestamps = False timestamps = [] for i, line in enumerate(lines, 1): if "-->" not in line: continue has_timestamps = True # Validate timestamp format match = re.match( r"(\d{2}:\d{2}:\d{2}\.\d{3})\s*-->\s*(\d{2}:\d{2}:\d{2}\.\d{3})", line ) if not match: return f"🟡 Warning: Malformed timestamp found at line {i}", "warning" # Parse and validate timestamps start_str, end_str = match.groups() start_ms = parse_timestamp(start_str) end_ms = parse_timestamp(end_str) if start_ms is None or end_ms is None: return f"🟡 Warning: Invalid timestamp values at line {i}", "warning" if start_ms >= end_ms: return ( f"🟡 Warning: Start timestamp >= end timestamp at line {i}", "warning", ) timestamps.append((start_ms, end_ms, i)) if not has_timestamps: return "🔴 Invalid: No timestamps found", "error" # Check for overlapping timestamps for i in range(len(timestamps) - 1): current_end = timestamps[i][1] next_start = timestamps[i + 1][0] current_line = timestamps[i][2] next_line = timestamps[i + 1][2] if current_end > next_start: return ( f"🟡 Warning: Overlapping timestamps detected (Lines {current_line} and {next_line})", "warning", ) # Check for punctuation followed by lowercase last_char = None last_line_num = None for i, line in enumerate(lines): if "-->" not in line: continue # Get text lines for this cue j = i + 1 while j < len(lines): content_line = lines[j] if "-->" in content_line: break if content_line.strip() == "": break # Process text line # Remove speaker tag for validation clean_text = re.sub(r"]+>", "", content_line).strip() if clean_text: # Check internal to the line match = re.search(r"([.!?])\s+([a-z])", clean_text) if match: return ( f"🟡 Warning: Punctuation followed by lowercase at line {j + 1}", "warning", ) # Check across boundary if last_char and last_char in ".!?": if clean_text[0].islower(): return ( f"🟡 Warning: Punctuation followed by lowercase across lines {last_line_num} and {j + 1}", "warning", ) last_char = clean_text[-1] last_line_num = j + 1 j += 1 return "🟢 Valid", "success" except Exception as e: return f"🔴 Validation error: {str(e)}", "error"