Spaces:

lucamartinelli
/

whisper-diarization

Sleeping

File size: 4,680 Bytes

dd5bcef
6672a34
dd5bcef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6672a34
dd5bcef
 
6672a34
dd5bcef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
379a259
dd5bcef
6672a34
dd5bcef
 
 
6672a34
 
dd5bcef
 
 
 
 
6672a34
 
 
dd5bcef
6672a34
dd5bcef
 
 
 
 
 
 
6672a34
dd5bcef
 
6672a34
 
 
 
dd5bcef
6672a34
dd5bcef
 
379a259
dd5bcef
 
 
 
 
6672a34
 
 
dd5bcef
6672a34
 
 
 
dd5bcef
ef51d6b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
379a259
dd5bcef
6672a34

"""Utilities for VTT validation and cleaning."""

import re
from typing import Tuple


def parse_timestamp(timestamp_str: str) -> int | None:
    """
    Parse timestamp string to milliseconds.

    Args:
        timestamp_str: Timestamp in format HH:MM:SS.mmm

    Returns:
        Milliseconds as integer, or None if parsing fails
    """
    try:
        parts = timestamp_str.strip().split(":")
        hours = int(parts[0])
        minutes = int(parts[1])
        seconds_parts = parts[2].split(".")
        seconds = int(seconds_parts[0])
        milliseconds = int(seconds_parts[1])

        total_ms = (hours * 3600 + minutes * 60 + seconds) * 1000 + milliseconds
        return total_ms
    except (ValueError, IndexError, AttributeError):
        return None


def validate_vtt(vtt_content: str) -> Tuple[str, str]:
    """
    Validate VTT format and return status message.

    Args:
        vtt_content: VTT file content as string

    Returns:
        Tuple of (status_message, status_type) where status_type is "error", "warning", "success", or ""
    """
    if not vtt_content or vtt_content.strip() == "":
        return "⚪ No content", ""

    try:
        # Check if starts with WEBVTT
        if not vtt_content.strip().startswith("WEBVTT"):
            return "🔴 Invalid: Missing WEBVTT header", "error"

        lines = vtt_content.split("\n")
        has_timestamps = False
        timestamps = []

        for i, line in enumerate(lines, 1):
            if "-->" not in line:
                continue

            has_timestamps = True

            # Validate timestamp format
            match = re.match(
                r"(\d{2}:\d{2}:\d{2}\.\d{3})\s*-->\s*(\d{2}:\d{2}:\d{2}\.\d{3})", line
            )
            if not match:
                return f"🟡 Warning: Malformed timestamp found at line {i}", "warning"

            # Parse and validate timestamps
            start_str, end_str = match.groups()
            start_ms = parse_timestamp(start_str)
            end_ms = parse_timestamp(end_str)

            if start_ms is None or end_ms is None:
                return f"🟡 Warning: Invalid timestamp values at line {i}", "warning"

            if start_ms >= end_ms:
                return (
                    f"🟡 Warning: Start timestamp >= end timestamp at line {i}",
                    "warning",
                )

            timestamps.append((start_ms, end_ms, i))

        if not has_timestamps:
            return "🔴 Invalid: No timestamps found", "error"

        # Check for overlapping timestamps
        for i in range(len(timestamps) - 1):
            current_end = timestamps[i][1]
            next_start = timestamps[i + 1][0]
            current_line = timestamps[i][2]
            next_line = timestamps[i + 1][2]

            if current_end > next_start:
                return (
                    f"🟡 Warning: Overlapping timestamps detected (Lines {current_line} and {next_line})",
                    "warning",
                )

        # Check for punctuation followed by lowercase
        last_char = None
        last_line_num = None

        for i, line in enumerate(lines):
            if "-->" not in line:
                continue

            # Get text lines for this cue
            j = i + 1
            while j < len(lines):
                content_line = lines[j]
                if "-->" in content_line:
                    break
                if content_line.strip() == "":
                    break

                # Process text line
                # Remove speaker tag for validation
                clean_text = re.sub(r"<v\s+[^>]+>", "", content_line).strip()

                if clean_text:
                    # Check internal to the line
                    match = re.search(r"([.!?])\s+([a-z])", clean_text)
                    if match:
                        return (
                            f"🟡 Warning: Punctuation followed by lowercase at line {j + 1}",
                            "warning",
                        )

                    # Check across boundary
                    if last_char and last_char in ".!?":
                        if clean_text[0].islower():
                            return (
                                f"🟡 Warning: Punctuation followed by lowercase across lines {last_line_num} and {j + 1}",
                                "warning",
                            )

                    last_char = clean_text[-1]
                    last_line_num = j + 1

                j += 1

        return "🟢 Valid", "success"
    except Exception as e:
        return f"🔴 Validation error: {str(e)}", "error"