Spaces:
Sleeping
Sleeping
| """Utilities for VTT validation and cleaning.""" | |
| import re | |
| from typing import Tuple | |
| def parse_timestamp(timestamp_str: str) -> int | None: | |
| """ | |
| Parse timestamp string to milliseconds. | |
| Args: | |
| timestamp_str: Timestamp in format HH:MM:SS.mmm | |
| Returns: | |
| Milliseconds as integer, or None if parsing fails | |
| """ | |
| try: | |
| parts = timestamp_str.strip().split(":") | |
| hours = int(parts[0]) | |
| minutes = int(parts[1]) | |
| seconds_parts = parts[2].split(".") | |
| seconds = int(seconds_parts[0]) | |
| milliseconds = int(seconds_parts[1]) | |
| total_ms = (hours * 3600 + minutes * 60 + seconds) * 1000 + milliseconds | |
| return total_ms | |
| except (ValueError, IndexError, AttributeError): | |
| return None | |
| def validate_vtt(vtt_content: str) -> Tuple[str, str]: | |
| """ | |
| Validate VTT format and return status message. | |
| Args: | |
| vtt_content: VTT file content as string | |
| Returns: | |
| Tuple of (status_message, status_type) where status_type is "error", "warning", "success", or "" | |
| """ | |
| if not vtt_content or vtt_content.strip() == "": | |
| return "βͺ No content", "" | |
| try: | |
| # Check if starts with WEBVTT | |
| if not vtt_content.strip().startswith("WEBVTT"): | |
| return "π΄ Invalid: Missing WEBVTT header", "error" | |
| lines = vtt_content.split("\n") | |
| has_timestamps = False | |
| timestamps = [] | |
| for i, line in enumerate(lines, 1): | |
| if "-->" not in line: | |
| continue | |
| has_timestamps = True | |
| # Validate timestamp format | |
| match = re.match( | |
| r"(\d{2}:\d{2}:\d{2}\.\d{3})\s*-->\s*(\d{2}:\d{2}:\d{2}\.\d{3})", line | |
| ) | |
| if not match: | |
| return f"π‘ Warning: Malformed timestamp found at line {i}", "warning" | |
| # Parse and validate timestamps | |
| start_str, end_str = match.groups() | |
| start_ms = parse_timestamp(start_str) | |
| end_ms = parse_timestamp(end_str) | |
| if start_ms is None or end_ms is None: | |
| return f"π‘ Warning: Invalid timestamp values at line {i}", "warning" | |
| if start_ms >= end_ms: | |
| return ( | |
| f"π‘ Warning: Start timestamp >= end timestamp at line {i}", | |
| "warning", | |
| ) | |
| timestamps.append((start_ms, end_ms, i)) | |
| if not has_timestamps: | |
| return "π΄ Invalid: No timestamps found", "error" | |
| # Check for overlapping timestamps | |
| for i in range(len(timestamps) - 1): | |
| current_end = timestamps[i][1] | |
| next_start = timestamps[i + 1][0] | |
| current_line = timestamps[i][2] | |
| next_line = timestamps[i + 1][2] | |
| if current_end > next_start: | |
| return ( | |
| f"π‘ Warning: Overlapping timestamps detected (Lines {current_line} and {next_line})", | |
| "warning", | |
| ) | |
| # Check for punctuation followed by lowercase | |
| last_char = None | |
| last_line_num = None | |
| for i, line in enumerate(lines): | |
| if "-->" not in line: | |
| continue | |
| # Get text lines for this cue | |
| j = i + 1 | |
| while j < len(lines): | |
| content_line = lines[j] | |
| if "-->" in content_line: | |
| break | |
| if content_line.strip() == "": | |
| break | |
| # Process text line | |
| # Remove speaker tag for validation | |
| clean_text = re.sub(r"<v\s+[^>]+>", "", content_line).strip() | |
| if clean_text: | |
| # Check internal to the line | |
| match = re.search(r"([.!?])\s+([a-z])", clean_text) | |
| if match: | |
| return ( | |
| f"π‘ Warning: Punctuation followed by lowercase at line {j + 1}", | |
| "warning", | |
| ) | |
| # Check across boundary | |
| if last_char and last_char in ".!?": | |
| if clean_text[0].islower(): | |
| return ( | |
| f"π‘ Warning: Punctuation followed by lowercase across lines {last_line_num} and {j + 1}", | |
| "warning", | |
| ) | |
| last_char = clean_text[-1] | |
| last_line_num = j + 1 | |
| j += 1 | |
| return "π’ Valid", "success" | |
| except Exception as e: | |
| return f"π΄ Validation error: {str(e)}", "error" | |