# elder-care-copilot/src/app_kit/embedding.py
# Abhishek
# Add all folders and files
# f9a9b47
# Raw
# History Blame Contribute Delete
# 1.55 kB
from __future__ import annotations
from collections import Counter
import math
import re
from dataclasses import dataclass, field
from typing import Iterable
# A token is a maximal run of ASCII letters, digits and apostrophes;
# every other character acts as a separator.
TOKEN_RE = re.compile(r"[A-Za-z0-9']+")


def tokenize(text: str) -> list[str]:
    """Split *text* into lowercase tokens.

    Tokens are the non-overlapping matches of ``TOKEN_RE``, lowered one by
    one. A falsy *text* (``None`` or the empty string) yields an empty list.
    """
    if not text:
        return []
    return list(map(str.lower, TOKEN_RE.findall(text)))
def vectorize(text: str) -> Counter[str]:
    """Return a bag-of-words vector mapping each token of *text* to its count."""
    term_counts: Counter[str] = Counter()
    term_counts.update(tokenize(text))
    return term_counts
def cosine_similarity(left: Counter[str], right: Counter[str]) -> float:
    """Return the cosine similarity of two term-count vectors.

    Args:
        left: Term -> count mapping of the first text.
        right: Term -> count mapping of the second text.

    Returns:
        ``dot(left, right) / (|left| * |right|)``, or ``0.0`` when either
        vector is empty or the vectors share no term with a non-zero product.
    """
    if not left or not right:
        return 0.0

    # Only terms present in both vectors contribute to the dot product, so
    # walk the smaller vector and look each term up in the larger one.
    if len(left) <= len(right):
        small, large = left, right
    else:
        small, large = right, left
    dot = sum(count * large.get(term, 0) for term, count in small.items())
    if dot == 0:
        return 0.0

    left_sq = sum(v * v for v in left.values())
    right_sq = sum(v * v for v in right.values())
    # Take a single square root of the product of the squared norms instead of
    # multiplying two separately rounded roots: with integer counts the product
    # is exact, so identical vectors score exactly 1.0 rather than
    # 0.9999999999999998.
    denom = math.sqrt(left_sq * right_sq)
    if not denom:
        # Defensive guard; not expected for integer counts once dot != 0.
        return 0.0
    return dot / denom
@dataclass
class SimpleEmbeddingIndex:
    """In-memory index of bag-of-words vectors keyed by record id."""

    # Maps a record id to the term-count vector built from that record's text.
    entries: dict[str, Counter[str]] = field(default_factory=dict)

    def add(self, record_id: str, text: str) -> None:
        """Vectorize *text* and store it under *record_id*.

        An existing entry with the same id is overwritten.
        """
        self.entries[record_id] = vectorize(text)

    def search(self, query: str, limit: int = 5) -> list[tuple[str, float]]:
        """Rank stored records by cosine similarity to *query*.

        Args:
            query: Free text to compare against every stored record.
            limit: Maximum number of results to return. Zero or a negative
                value yields an empty list.

        Returns:
            ``(record_id, score)`` pairs sorted by descending score. Ties keep
            the insertion order of ``entries``. Records scoring ``0.0`` are
            included.
        """
        # A negative limit would otherwise reach the slice below as ``[:-k]``
        # and silently drop the *last* k results instead of returning none.
        if limit is not None and limit < 0:
            limit = 0
        qvec = vectorize(query)
        scored = [
            (record_id, cosine_similarity(qvec, vec))
            for record_id, vec in self.entries.items()
        ]
        scored.sort(key=lambda item: item[1], reverse=True)
        return scored[:limit]
def extract_keywords(text: str, limit: int = 6) -> list[str]:
    """Return up to *limit* of the most frequent tokens in *text*.

    Tokens of one or two characters are ignored. Results are ordered by
    descending frequency, as reported by ``Counter.most_common``.
    """
    frequencies: Counter[str] = Counter()
    for token in tokenize(text):
        if len(token) <= 2:
            continue
        frequencies[token] += 1
    top_terms = frequencies.most_common(limit)
    return [keyword for keyword, _count in top_terms]