"""Text processing utilities.""" import re from typing import List, Optional def normalize_text(text: Optional[str]) -> str: """Normalize text for comparison and matching. Converts to lowercase, strips whitespace, removes punctuation, and collapses multiple spaces into single spaces. Args: text: Input text to normalize Returns: Normalized text string """ if text is None: return "" text = str(text).lower().strip() text = re.sub(r"[^\w\s]", " ", text) text = re.sub(r"\s+", " ", text) return text def tokenize_text(text: Optional[str]) -> List[str]: """Tokenize text into words. Args: text: Input text to tokenize Returns: List of word tokens """ clean = normalize_text(text) return clean.split() if clean else [] def extract_number(value: Optional[str]) -> Optional[int]: """Extract first integer from a string. Args: value: String potentially containing a number Returns: Extracted integer or None if no number found """ if value is None: return None match = re.search(r"(\d+)", str(value)) return int(match.group(1)) if match else None def get_fuzzy_threshold(token_count: int) -> float: """Get fuzzy matching threshold based on token count. Higher thresholds for shorter sequences to reduce false positives. Args: token_count: Number of tokens in the sequence Returns: Fuzzy matching threshold (0.0 to 1.0) """ if token_count <= 2: return 0.96 if token_count <= 5: return 0.92 return 0.88 def clamp01(value: Optional[float]) -> float: """Clamp value to [0.0, 1.0] range. Args: value: Input value Returns: Value clamped to [0.0, 1.0] """ try: v = float(value) # type: ignore except (TypeError, ValueError): v = 0.5 return max(0.0, min(1.0, v)) def safe_int(value: Optional[float], default: int) -> int: """Safely convert value to int with fallback. Args: value: Value to convert default: Default if conversion fails Returns: Integer value """ try: return int(round(float(value))) # type: ignore except (TypeError, ValueError): return int(default) def apply_case_style(text: Optional[str], case_style: Optional[str]) -> str: """Apply case transformation to text. Args: text: Input text case_style: One of "original", "upper", "lower", "title" Returns: Text with applied case style """ text = "" if text is None else str(text) case_style = (case_style or "original").strip().lower() if case_style == "upper": return text.upper() if case_style == "lower": return text.lower() if case_style == "title": return text.title() return text def safe_output_name(name: Optional[str], default: str = "output.mp4") -> str: """Sanitize output filename. Args: name: Desired output name default: Default name if none provided Returns: Safe output filename ending in .mp4 """ name = str(name).strip() if name else default if not name.lower().endswith(".mp4"): name += ".mp4" # Replace unsafe characters with underscore name = re.sub(r"[^a-zA-Z0-9._-]", "_", name) return name