"""Shared text helpers for lightweight semantic scoring.""" from __future__ import annotations import hashlib import re import numpy as np TOKEN_RE = re.compile(r"[a-z0-9_]+") def clamp(value: float, low: float = 0.0, high: float = 1.0) -> float: """Clamp a scalar value into a closed interval.""" return max(low, min(high, value)) def normalize_text(text: str) -> str: """Lower-case and collapse whitespace for stable comparisons.""" return " ".join(text.lower().split()) def tokenize(text: str) -> list[str]: """Tokenize text into a lightweight alphanumeric bag.""" return TOKEN_RE.findall(normalize_text(text)) def hashed_embedding(text: str, dim: int = 256) -> np.ndarray: """Create a deterministic hashed bag-of-words embedding.""" vector = np.zeros(dim, dtype=np.float32) for token in tokenize(text): digest = hashlib.blake2b(token.encode("utf-8"), digest_size=2).hexdigest() index = int(digest, 16) % dim vector[index] += 1.0 norm = np.linalg.norm(vector) if norm > 0: vector /= norm return vector def cosine_similarity(left: np.ndarray, right: np.ndarray) -> float: """Compute cosine similarity with safe zero-vector handling.""" left_norm = np.linalg.norm(left) right_norm = np.linalg.norm(right) if left_norm == 0.0 or right_norm == 0.0: return 0.0 return float(np.dot(left, right) / (left_norm * right_norm)) def text_similarity(left: str, right: str) -> float: """Compute deterministic cosine similarity between two texts.""" return cosine_similarity(hashed_embedding(left), hashed_embedding(right)) def stable_noise(text: str, low: float = -0.05, high: float = 0.05) -> float: """Map a text fingerprint to a stable uniform noise value.""" digest = hashlib.blake2b(text.encode("utf-8"), digest_size=8).hexdigest() value = int(digest, 16) / float(16**16 - 1) return low + (high - low) * value