Spaces:
Sleeping
Sleeping
| """Text processing utilities.""" | |
| import re | |
| from typing import List, Optional | |
def normalize_text(text: Optional[str]) -> str:
    """Normalize text for comparison and matching.

    Lowercases the input, replaces every character that is neither a word
    character nor whitespace with a space, collapses whitespace runs into a
    single space, and trims both ends.

    Note that ``\\w`` treats the underscore as a word character, so
    underscores are preserved.

    Args:
        text: Input text to normalize. Non-string values are converted
            with ``str()``.

    Returns:
        Normalized text string; empty string when ``text`` is None.
    """
    if text is None:
        return ""
    lowered = str(text).lower()
    without_punct = re.sub(r"[^\w\s]", " ", lowered)
    collapsed = re.sub(r"\s+", " ", without_punct)
    # Trim last: punctuation at either end has just been turned into a space,
    # so stripping before the substitution would leave a stray blank behind.
    return collapsed.strip()
def tokenize_text(text: Optional[str]) -> List[str]:
    """Split text into a list of normalized word tokens.

    The text is first passed through ``normalize_text`` and then split on
    whitespace.

    Args:
        text: Input text to tokenize

    Returns:
        List of word tokens; empty list when nothing remains after
        normalization
    """
    normalized = normalize_text(text)
    if not normalized:
        return []
    return normalized.split()
def extract_number(value: Optional[str]) -> Optional[int]:
    """Return the first run of digits found in a value as an integer.

    Only the digits themselves are matched, so a leading minus sign is not
    part of the result.

    Args:
        value: String potentially containing a number. Non-string values
            are converted with ``str()`` before searching.

    Returns:
        Extracted integer, or None if ``value`` is None or holds no digits
    """
    if value is None:
        return None
    found = re.search(r"(\d+)", str(value))
    if found is None:
        return None
    return int(found.group(1))
def get_fuzzy_threshold(token_count: int) -> float:
    """Pick the fuzzy matching threshold for a sequence of a given length.

    Shorter sequences get a stricter (higher) threshold to reduce false
    positives.

    Args:
        token_count: Number of tokens in the sequence

    Returns:
        0.96 for up to 2 tokens, 0.92 for up to 5 tokens, 0.88 otherwise
    """
    # (inclusive upper bound on token count, threshold), checked in order.
    tiers = ((2, 0.96), (5, 0.92))
    for upper_bound, threshold in tiers:
        if token_count <= upper_bound:
            return threshold
    return 0.88
def clamp01(value: Optional[float]) -> float:
    """Clamp value to [0.0, 1.0] range.

    Args:
        value: Input value; anything ``float()`` accepts (numbers, numeric
            strings).

    Returns:
        Value clamped to [0.0, 1.0]. Values that cannot be interpreted as a
        number (None, non-numeric strings, NaN) yield the neutral midpoint
        0.5.
    """
    try:
        v = float(value)  # type: ignore
    except (TypeError, ValueError):
        return 0.5
    except OverflowError:
        # Raised for exact numbers too large for a float (e.g. 10**400);
        # the sign alone decides which bound they clamp to.
        return 1.0 if value > 0 else 0.0  # type: ignore
    if v != v:
        # NaN is the only float unequal to itself. Without this check the
        # result would depend on min()/max() argument order, so treat it
        # like any other invalid input.
        return 0.5
    return max(0.0, min(1.0, v))
def safe_int(value: Optional[float], default: int) -> int:
    """Safely convert value to int with fallback.

    Non-integer input is converted with ``float()`` and rounded with the
    built-in ``round`` (ties go to the nearest even integer, so 2.5 -> 2).

    Args:
        value: Value to convert (number or numeric string)
        default: Default if conversion fails

    Returns:
        Integer value, or ``int(default)`` when ``value`` is None,
        non-numeric, NaN, or infinite
    """
    if isinstance(value, int):
        # Already integral: skip the float round-trip, which loses precision
        # above 2**53 and overflows for very large ints.
        return int(value)
    try:
        return int(round(float(value)))  # type: ignore
    except (TypeError, ValueError, OverflowError):
        # round() raises ValueError for NaN and OverflowError for infinity.
        return int(default)
def apply_case_style(text: Optional[str], case_style: Optional[str]) -> str:
    """Return text converted to the requested letter case.

    The style name is matched case-insensitively after trimming surrounding
    whitespace. A missing, empty, or unrecognized style leaves the text
    unchanged.

    Args:
        text: Input text; None is treated as an empty string
        case_style: One of "original", "upper", "lower", "title"

    Returns:
        Text with applied case style
    """
    source = str(text) if text is not None else ""
    style_key = (case_style or "original").strip().lower()
    transforms = {
        "upper": str.upper,
        "lower": str.lower,
        "title": str.title,
    }
    transform = transforms.get(style_key)
    if transform is None:
        return source
    return transform(source)
def safe_output_name(name: Optional[str], default: str = "output.mp4") -> str:
    """Sanitize output filename.

    Args:
        name: Desired output name. Surrounding whitespace is ignored.
        default: Name used when ``name`` is missing, empty, or consists
            only of whitespace

    Returns:
        Safe output filename ending in .mp4 (extension matched
        case-insensitively) in which every character other than ASCII
        letters, digits, ".", "_" and "-" is replaced by "_"
    """
    candidate = str(name).strip() if name else ""
    if not candidate:
        # Covers None, "" and whitespace-only names; without this a blank
        # name would turn into the stem-less filename ".mp4".
        candidate = default
    if not candidate.lower().endswith(".mp4"):
        candidate += ".mp4"
    # Replace unsafe characters (including path separators) with underscore
    return re.sub(r"[^a-zA-Z0-9._-]", "_", candidate)