"""Utilities for VTT validation and cleaning."""

import re
from typing import Tuple


def parse_timestamp(timestamp_str: str) -> int | None:
    """
    Parse timestamp string to milliseconds.

    Args:
        timestamp_str: Timestamp in format HH:MM:SS.mmm

    Returns:
        Milliseconds as integer, or None if parsing fails
    """
    try:
        parts = timestamp_str.strip().split(":")
        hours = int(parts[0])
        minutes = int(parts[1])
        seconds_parts = parts[2].split(".")
        seconds = int(seconds_parts[0])
        milliseconds = int(seconds_parts[1])

        total_ms = (hours * 3600 + minutes * 60 + seconds) * 1000 + milliseconds
        return total_ms
    except (ValueError, IndexError, AttributeError):
        return None


def validate_vtt(vtt_content: str) -> Tuple[str, str]:
    """
    Validate VTT format and return status message.

    Args:
        vtt_content: VTT file content as string

    Returns:
        Tuple of (status_message, status_type) where status_type is "error", "warning", "success", or ""
    """
    if not vtt_content or vtt_content.strip() == "":
        return "⚪ No content", ""

    try:
        # Check if starts with WEBVTT
        if not vtt_content.strip().startswith("WEBVTT"):
            return "🔴 Invalid: Missing WEBVTT header", "error"

        lines = vtt_content.split("\n")
        has_timestamps = False
        timestamps = []

        for i, line in enumerate(lines, 1):
            if "-->" not in line:
                continue

            has_timestamps = True

            # Validate timestamp format
            match = re.match(
                r"(\d{2}:\d{2}:\d{2}\.\d{3})\s*-->\s*(\d{2}:\d{2}:\d{2}\.\d{3})", line
            )
            if not match:
                return f"🟡 Warning: Malformed timestamp found at line {i}", "warning"

            # Parse and validate timestamps
            start_str, end_str = match.groups()
            start_ms = parse_timestamp(start_str)
            end_ms = parse_timestamp(end_str)

            if start_ms is None or end_ms is None:
                return f"🟡 Warning: Invalid timestamp values at line {i}", "warning"

            if start_ms >= end_ms:
                return (
                    f"🟡 Warning: Start timestamp >= end timestamp at line {i}",
                    "warning",
                )

            timestamps.append((start_ms, end_ms, i))

        if not has_timestamps:
            return "🔴 Invalid: No timestamps found", "error"

        # Check for overlapping timestamps
        for i in range(len(timestamps) - 1):
            current_end = timestamps[i][1]
            next_start = timestamps[i + 1][0]
            current_line = timestamps[i][2]
            next_line = timestamps[i + 1][2]

            if current_end > next_start:
                return (
                    f"🟡 Warning: Overlapping timestamps detected (Lines {current_line} and {next_line})",
                    "warning",
                )

        # Check for punctuation followed by lowercase
        last_char = None
        last_line_num = None

        for i, line in enumerate(lines):
            if "-->" not in line:
                continue

            # Get text lines for this cue
            j = i + 1
            while j < len(lines):
                content_line = lines[j]
                if "-->" in content_line:
                    break
                if content_line.strip() == "":
                    break

                # Process text line
                # Remove speaker tag for validation
                clean_text = re.sub(r"<v\s+[^>]+>", "", content_line).strip()

                if clean_text:
                    # Check internal to the line
                    match = re.search(r"([.!?])\s+([a-z])", clean_text)
                    if match:
                        return (
                            f"🟡 Warning: Punctuation followed by lowercase at line {j + 1}",
                            "warning",
                        )

                    # Check across boundary
                    if last_char and last_char in ".!?":
                        if clean_text[0].islower():
                            return (
                                f"🟡 Warning: Punctuation followed by lowercase across lines {last_line_num} and {j + 1}",
                                "warning",
                            )

                    last_char = clean_text[-1]
                    last_line_num = j + 1

                j += 1

        return "🟢 Valid", "success"
    except Exception as e:
        return f"🔴 Validation error: {str(e)}", "error"