Spaces:

lucamartinelli
/

whisper-diarization

Sleeping

App Files Files Community

whisper-diarization / src /vtt_utils.py

lucamartinelli

Validazione

ef51d6b 18 days ago

raw

history blame contribute delete

4.68 kB

	"""Utilities for VTT validation and cleaning."""

	import re
	from typing import Tuple


	def parse_timestamp(timestamp_str: str) -> int \| None:
	"""
	Parse timestamp string to milliseconds.

	Args:
	timestamp_str: Timestamp in format HH:MM:SS.mmm

	Returns:
	Milliseconds as integer, or None if parsing fails
	"""
	try:
	parts = timestamp_str.strip().split(":")
	hours = int(parts[0])
	minutes = int(parts[1])
	seconds_parts = parts[2].split(".")
	seconds = int(seconds_parts[0])
	milliseconds = int(seconds_parts[1])

	total_ms = (hours * 3600 + minutes * 60 + seconds) * 1000 + milliseconds
	return total_ms
	except (ValueError, IndexError, AttributeError):
	return None


	def validate_vtt(vtt_content: str) -> Tuple[str, str]:
	"""
	Validate VTT format and return status message.

	Args:
	vtt_content: VTT file content as string

	Returns:
	Tuple of (status_message, status_type) where status_type is "error", "warning", "success", or ""
	"""
	if not vtt_content or vtt_content.strip() == "":
	return "⚪ No content", ""

	try:
	# Check if starts with WEBVTT
	if not vtt_content.strip().startswith("WEBVTT"):
	return "🔴 Invalid: Missing WEBVTT header", "error"

	lines = vtt_content.split("\n")
	has_timestamps = False
	timestamps = []

	for i, line in enumerate(lines, 1):
	if "-->" not in line:
	continue

	has_timestamps = True

	# Validate timestamp format
	match = re.match(
	r"(\d{2}:\d{2}:\d{2}\.\d{3})\s-->\s(\d{2}:\d{2}:\d{2}\.\d{3})", line
	)
	if not match:
	return f"🟡 Warning: Malformed timestamp found at line {i}", "warning"

	# Parse and validate timestamps
	start_str, end_str = match.groups()
	start_ms = parse_timestamp(start_str)
	end_ms = parse_timestamp(end_str)

	if start_ms is None or end_ms is None:
	return f"🟡 Warning: Invalid timestamp values at line {i}", "warning"

	if start_ms >= end_ms:
	return (
	f"🟡 Warning: Start timestamp >= end timestamp at line {i}",
	"warning",
	)

	timestamps.append((start_ms, end_ms, i))

	if not has_timestamps:
	return "🔴 Invalid: No timestamps found", "error"

	# Check for overlapping timestamps
	for i in range(len(timestamps) - 1):
	current_end = timestamps[i][1]
	next_start = timestamps[i + 1][0]
	current_line = timestamps[i][2]
	next_line = timestamps[i + 1][2]

	if current_end > next_start:
	return (
	f"🟡 Warning: Overlapping timestamps detected (Lines {current_line} and {next_line})",
	"warning",
	)

	# Check for punctuation followed by lowercase
	last_char = None
	last_line_num = None

	for i, line in enumerate(lines):
	if "-->" not in line:
	continue

	# Get text lines for this cue
	j = i + 1
	while j < len(lines):
	content_line = lines[j]
	if "-->" in content_line:
	break
	if content_line.strip() == "":
	break

	# Process text line
	# Remove speaker tag for validation
	clean_text = re.sub(r"<v\s+[^>]+>", "", content_line).strip()

	if clean_text:
	# Check internal to the line
	match = re.search(r"([.!?])\s+([a-z])", clean_text)
	if match:
	return (
	f"🟡 Warning: Punctuation followed by lowercase at line {j + 1}",
	"warning",
	)

	# Check across boundary
	if last_char and last_char in ".!?":
	if clean_text[0].islower():
	return (
	f"🟡 Warning: Punctuation followed by lowercase across lines {last_line_num} and {j + 1}",
	"warning",
	)

	last_char = clean_text[-1]
	last_line_num = j + 1

	j += 1

	return "🟢 Valid", "success"
	except Exception as e:
	return f"🔴 Validation error: {str(e)}", "error"