Nanny7's picture
Initial commit: Audio Video Generator v1.0.0
929f41f
Raw
History Blame Contribute Delete
3.52 kB
"""Text processing utilities."""
import re
from typing import List, Optional
def normalize_text(text: Optional[str]) -> str:
    """Normalize text for comparison and matching.

    Lowercases the input, replaces every character that is neither a word
    character nor whitespace with a space, collapses whitespace runs into a
    single space, and trims the result.

    Args:
        text: Input text to normalize. ``None`` yields an empty string;
            non-string values are converted with ``str()``.

    Returns:
        Normalized text with no leading or trailing whitespace.
    """
    if text is None:
        return ""
    lowered = str(text).lower()
    # Punctuation is replaced by a space (not deleted) so that "foo-bar"
    # becomes two words rather than "foobar".
    spaced = re.sub(r"[^\w\s]", " ", lowered)
    # Trim *after* the substitutions: punctuation at either end of the input
    # has just been turned into a space that must not survive in the result.
    return re.sub(r"\s+", " ", spaced).strip()
def tokenize_text(text: Optional[str]) -> List[str]:
    """Split text into normalized word tokens.

    Args:
        text: Input text to tokenize

    Returns:
        List of tokens obtained by normalizing the text and splitting it on
        whitespace; empty when there is nothing to split.
    """
    # str.split() with no arguments already returns [] for an empty or
    # whitespace-only string, so no separate emptiness check is required.
    return normalize_text(text).split()
def extract_number(value: Optional[str]) -> Optional[int]:
    """Return the first run of digits found in a value, as an integer.

    Args:
        value: Value to search; it is converted with ``str()`` first

    Returns:
        The first digit run parsed as an int, or None when the value is
        None or contains no digits
    """
    if value is not None:
        found = re.search(r"(\d+)", str(value))
        if found:
            return int(found.group(1))
    return None
def get_fuzzy_threshold(token_count: int) -> float:
    """Pick the fuzzy matching threshold for a sequence of a given length.

    Shorter sequences get a stricter (higher) threshold to reduce false
    positives.

    Args:
        token_count: Number of tokens in the sequence

    Returns:
        0.96 for up to 2 tokens, 0.92 for up to 5 tokens, 0.88 otherwise
    """
    # (inclusive upper bound on token count, threshold), checked in order.
    tiers = ((2, 0.96), (5, 0.92))
    for max_tokens, threshold in tiers:
        if token_count <= max_tokens:
            return threshold
    return 0.88
def clamp01(value: Optional[float]) -> float:
    """Clamp value to [0.0, 1.0] range.

    Args:
        value: Input value; anything ``float()`` accepts (numbers, numeric
            strings). ``None``, non-numeric input and NaN are treated as
            invalid.

    Returns:
        Value clamped to [0.0, 1.0], or 0.5 when the input is invalid
    """
    try:
        number = float(value)  # type: ignore
    except (TypeError, ValueError):
        return 0.5
    except OverflowError:
        # Raised for integers too large to represent as a float. Such a value
        # is far outside the range, so clamp it by sign instead of crashing.
        return 1.0 if value > 0 else 0.0  # type: ignore
    # NaN is the only float that is not equal to itself. Without this check
    # it would slip through min()/max() and come out as 1.0.
    if number != number:
        return 0.5
    return max(0.0, min(1.0, number))
def safe_int(value: Optional[float], default: int) -> int:
    """Safely convert value to int with fallback.

    The value is converted with ``float()`` and then rounded with the
    built-in ``round()``, which rounds exact halves to the nearest even
    integer (2.5 -> 2, 3.5 -> 4).

    Args:
        value: Value to convert (number or numeric string)
        default: Default if conversion fails

    Returns:
        Rounded integer value, or ``int(default)`` when the value is None,
        non-numeric, NaN, infinite, or too large to represent as a float
    """
    try:
        return int(round(float(value)))  # type: ignore
    except (TypeError, ValueError, OverflowError):
        # TypeError: None / unsupported type. ValueError: unparsable string
        # or NaN. OverflowError: infinity, or an int too big for float().
        return int(default)
def apply_case_style(text: Optional[str], case_style: Optional[str]) -> str:
    """Apply case transformation to text.

    Args:
        text: Input text; ``None`` is treated as an empty string and other
            values are converted with ``str()``
        case_style: One of "original", "upper", "lower", "title"
            (case-insensitive, surrounding whitespace ignored). ``None``,
            empty, or unrecognized values leave the text unchanged.

    Returns:
        Text with applied case style
    """
    text = "" if text is None else str(text)
    style = (case_style or "original").strip().lower()
    if style == "upper":
        return text.upper()
    if style == "lower":
        return text.lower()
    if style == "title":
        # str.title() treats an apostrophe as a word boundary, producing
        # "Don'T" and "It'S". Lower-case the common English contraction and
        # possessive suffixes again; anything else after an apostrophe
        # (e.g. "O'Neil") keeps the capital that title() gave it.
        return re.sub(
            r"(?<=[^\W\d_])(['\u2019])(S|T|D|M|Ll|Re|Ve)\b",
            lambda m: m.group(1) + m.group(2).lower(),
            text.title(),
        )
    return text
def safe_output_name(name: Optional[str], default: str = "output.mp4") -> str:
    """Sanitize output filename.

    Args:
        name: Desired output name. ``None``, empty, or whitespace-only
            values fall back to ``default``.
        default: Default name if none provided

    Returns:
        Safe output filename ending in .mp4 (extension check is
        case-insensitive), containing only ASCII letters, digits, ".",
        "_" and "-"; every other character is replaced with "_"
    """
    candidate = str(name).strip() if name else ""
    if not candidate:
        # Also covers names that were only whitespace, which would otherwise
        # produce the bare filename ".mp4".
        candidate = default
    if not candidate.lower().endswith(".mp4"):
        candidate += ".mp4"
    # Replace unsafe characters (including path separators) with underscore
    return re.sub(r"[^a-zA-Z0-9._-]", "_", candidate)