Spaces:

areebsa
/

imageaudiosync

Sleeping

App Files Files Community

imageaudiosync / src /audio_video_generator /utils /text.py

Nanny7

Initial commit: Audio Video Generator v1.0.0

929f41f 2 months ago

Raw

History Blame Contribute Delete

3.52 kB

	"""Text processing utilities."""

	import re
	from typing import List, Optional


	def normalize_text(text: Optional[str]) -> str:
	    """Normalize text for comparison and matching.

	    Lowercases the input, replaces every character that is neither a word
	    character (letter, digit, underscore) nor whitespace with a space,
	    collapses whitespace runs into single spaces, and strips the ends.

	    Args:
	        text: Input text to normalize. ``None`` yields an empty string;
	            non-string values are converted with ``str()``.

	    Returns:
	        Normalized text with no leading or trailing whitespace.
	    """
	    if text is None:
	        return ""
	    lowered = str(text).lower()
	    without_punctuation = re.sub(r"[^\w\s]", " ", lowered)
	    collapsed = re.sub(r"\s+", " ", without_punctuation)
	    # Strip last: punctuation at either end has just been turned into a
	    # space, so stripping earlier would leave "hello!" as "hello ".
	    return collapsed.strip()


	def tokenize_text(text: Optional[str]) -> List[str]:
	    """Split text into normalized word tokens.

	    The text is first passed through ``normalize_text`` and the result is
	    split on whitespace.

	    Args:
	        text: Input text to tokenize

	    Returns:
	        List of word tokens; empty when the normalized text is empty
	    """
	    # str.split() with no arguments already returns [] for an empty string.
	    return normalize_text(text).split()


	def extract_number(value: Optional[str]) -> Optional[int]:
	    """Return the first run of digits in a string as an integer.

	    Args:
	        value: String potentially containing a number; non-string values
	            are converted with ``str()`` before searching

	    Returns:
	        The first digit run as an int, or None when ``value`` is None or
	        contains no digits
	    """
	    if value is None:
	        return None

	    digits = re.search(r"(\d+)", str(value))
	    if digits is None:
	        return None
	    return int(digits.group(1))


	def get_fuzzy_threshold(token_count: int) -> float:
	    """Pick the fuzzy matching threshold for a sequence of a given length.

	    Shorter sequences get a stricter (higher) threshold to reduce false
	    positives.

	    Args:
	        token_count: Number of tokens in the sequence

	    Returns:
	        0.96 for up to 2 tokens, 0.92 for up to 5 tokens, 0.88 otherwise
	    """
	    # (inclusive upper bound on token count, threshold), strictest first.
	    tiers = ((2, 0.96), (5, 0.92))
	    for max_tokens, threshold in tiers:
	        if token_count <= max_tokens:
	            return threshold
	    return 0.88


	def clamp01(value: Optional[float]) -> float:
	    """Clamp value to [0.0, 1.0] range.

	    Args:
	        value: Input value; anything ``float()`` accepts (numbers, numeric
	            strings)

	    Returns:
	        Value clamped to [0.0, 1.0]. Values that cannot be interpreted as
	        a number (``None``, non-numeric strings, NaN) yield 0.5.
	    """
	    import math  # local import keeps this fix self-contained

	    try:
	        v = float(value)  # type: ignore
	    except OverflowError:
	        # Raised for integers too large for a float; such a value is far
	        # outside [0, 1], so clamp it by sign instead of crashing.
	        return 0.0 if value < 0 else 1.0  # type: ignore
	    except (TypeError, ValueError):
	        return 0.5

	    # NaN compares false with everything, so min()/max() would otherwise
	    # turn it into 1.0; treat it like any other invalid input.
	    if math.isnan(v):
	        return 0.5
	    return max(0.0, min(1.0, v))


	def safe_int(value: Optional[float], default: int) -> int:
	    """Safely convert value to int with fallback.

	    The value is converted with ``float()`` and rounded to the nearest
	    integer using Python's ``round()`` (ties go to the even integer).

	    Args:
	        value: Value to convert
	        default: Default if conversion fails

	    Returns:
	        Rounded integer value, or ``int(default)`` when ``value`` is
	        ``None``, not numeric, NaN, infinite, or too large for a float
	    """
	    try:
	        return int(round(float(value)))  # type: ignore
	    except (TypeError, ValueError, OverflowError):
	        # OverflowError: round() of +/-infinity, or float() of an integer
	        # too large to represent. ValueError also covers round() of NaN.
	        return int(default)


	# Matches a title-cased English contraction/possessive suffix directly after
	# an apostrophe (straight or typographic) that follows a word character,
	# e.g. the "'T" in "Don'T" or the "'Ll" in "We'Ll". The trailing \b keeps
	# names such as "O'Reilly" from matching.
	_TITLE_SUFFIX_RE = re.compile(r"(?<=\w)(['\u2019])(S|T|D|M|Ll|Re|Ve)\b")


	def apply_case_style(text: Optional[str], case_style: Optional[str]) -> str:
	    """Apply case transformation to text.

	    Args:
	        text: Input text; ``None`` is treated as an empty string
	        case_style: One of "original", "upper", "lower", "title". Matched
	            case-insensitively after trimming whitespace; ``None`` or an
	            empty value means "original".

	    Returns:
	        Text with applied case style. Unrecognized styles return the text
	        unchanged.
	    """
	    text = "" if text is None else str(text)
	    case_style = (case_style or "original").strip().lower()

	    if case_style == "upper":
	        return text.upper()
	    if case_style == "lower":
	        return text.lower()
	    if case_style == "title":
	        # str.title() treats an apostrophe as a word boundary and so
	        # produces "Don'T" / "It'S"; put those suffixes back to lowercase.
	        return _TITLE_SUFFIX_RE.sub(
	            lambda m: m.group(1) + m.group(2).lower(), text.title()
	        )

	    return text


	def safe_output_name(name: Optional[str], default: str = "output.mp4") -> str:
	    """Sanitize output filename.

	    Args:
	        name: Desired output name; surrounding whitespace is ignored
	        default: Name used when ``name`` is missing, empty, or
	            whitespace-only

	    Returns:
	        Safe output filename ending in .mp4, containing only ASCII
	        letters, digits, ".", "_" and "-"
	    """
	    candidate = str(name).strip() if name else ""
	    if not candidate:
	        # Also covers whitespace-only input, which would otherwise strip
	        # down to "" and produce the nameless hidden file ".mp4".
	        candidate = default
	    if not candidate.lower().endswith(".mp4"):
	        candidate += ".mp4"
	    # Replace unsafe characters (including path separators) with underscore
	    return re.sub(r"[^a-zA-Z0-9._-]", "_", candidate)