File size: 1,955 Bytes
410276d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
"""Shared text helpers for lightweight semantic scoring."""

from __future__ import annotations

import hashlib
import re

import numpy as np

TOKEN_RE = re.compile(r"[a-z0-9_]+")


def clamp(value: float, low: float = 0.0, high: float = 1.0) -> float:
    """Bound *value* so the result never leaves the closed interval [low, high]."""
    if value < low:
        return low
    if value > high:
        return high
    return value


def normalize_text(text: str) -> str:
    """Fold case and squeeze all whitespace runs down to single spaces."""
    words = text.lower().split()
    return " ".join(words)


def tokenize(text: str) -> list[str]:
    """Break *text* into a bag of lowercase alphanumeric/underscore tokens."""
    cleaned = normalize_text(text)
    return TOKEN_RE.findall(cleaned)


def hashed_embedding(text: str, dim: int = 256) -> np.ndarray:
    """Build a deterministic, L2-normalized hashed bag-of-words vector.

    Each token is hashed with a 2-byte blake2b digest and folded into one of
    ``dim`` buckets; the resulting count vector is unit-normalized unless empty.
    """
    counts = np.zeros(dim, dtype=np.float32)
    for token in tokenize(text):
        fingerprint = hashlib.blake2b(token.encode("utf-8"), digest_size=2).hexdigest()
        bucket = int(fingerprint, 16) % dim
        counts[bucket] += 1.0
    magnitude = np.linalg.norm(counts)
    if magnitude > 0:
        return counts / magnitude
    return counts


def cosine_similarity(left: np.ndarray, right: np.ndarray) -> float:
    """Return the cosine of the angle between two vectors, 0.0 if either is zero."""
    norms = (np.linalg.norm(left), np.linalg.norm(right))
    # A zero vector has no direction; report no similarity rather than NaN.
    if 0.0 in norms:
        return 0.0
    return float(np.dot(left, right) / (norms[0] * norms[1]))


def text_similarity(left: str, right: str) -> float:
    """Score two texts by the cosine of their deterministic hashed embeddings."""
    left_vec = hashed_embedding(left)
    right_vec = hashed_embedding(right)
    return cosine_similarity(left_vec, right_vec)


def stable_noise(text: str, low: float = -0.05, high: float = 0.05) -> float:
    """Derive a deterministic pseudo-noise value in [low, high] from *text*.

    The 8-byte blake2b fingerprint is interpreted as an unsigned integer and
    scaled onto the requested interval, so the same text always maps to the
    same value.
    """
    fingerprint = hashlib.blake2b(text.encode("utf-8"), digest_size=8).digest()
    # 2**64 - 1 is the maximum 8-byte value, so fraction lies in [0, 1].
    fraction = int.from_bytes(fingerprint, "big") / float(2**64 - 1)
    return low + (high - low) * fraction