Spaces:
Runtime error
Runtime error
| from __future__ import annotations | |
| from collections import Counter | |
| import math | |
| import re | |
| from dataclasses import dataclass, field | |
| from typing import Iterable | |
# Matches one maximal run of ASCII letters, digits, and apostrophes.
TOKEN_RE = re.compile(r"[A-Za-z0-9']+")


def tokenize(text: str) -> list[str]:
    """Split *text* into lowercase tokens.

    A token is a maximal run of ASCII letters, digits, and apostrophes, as
    matched by ``TOKEN_RE``; every other character acts as a separator.

    Args:
        text: The string to split. Falsy values (``None``, ``""``) are
            treated as empty text.

    Returns:
        The tokens in order of appearance, each lowercased.
    """
    if not text:
        return []
    return list(map(str.lower, TOKEN_RE.findall(text)))
def vectorize(text: str) -> Counter[str]:
    """Turn *text* into a bag-of-words vector.

    Args:
        text: The string to tokenize with ``tokenize``.

    Returns:
        A ``Counter`` mapping each token to the number of times it occurs.
    """
    term_counts: Counter[str] = Counter()
    term_counts.update(tokenize(text))
    return term_counts
def cosine_similarity(left: Counter[str], right: Counter[str]) -> float:
    """Return the cosine similarity of two token-count vectors.

    Args:
        left: Token counts of the first text.
        right: Token counts of the second text.

    Returns:
        ``dot(left, right) / (|left| * |right|)``, limited to the range
        ``[-1.0, 1.0]``. Returns ``0.0`` when either vector is empty, when
        the vectors share no token with a non-zero product, or when the
        norm product evaluates to zero.
    """
    if not left or not right:
        return 0.0

    # Only tokens present in both vectors contribute to the dot product, so
    # walk the smaller vector and look each token up in the larger one.
    if len(left) <= len(right):
        smaller, larger = left, right
    else:
        smaller, larger = right, left
    dot = sum(count * larger.get(token, 0) for token, count in smaller.items())
    if dot == 0:
        return 0.0

    left_sq = sum(count * count for count in left.values())
    right_sq = sum(count * count for count in right.values())
    # Take a single square root of the product rather than multiplying two
    # separately rounded roots: sqrt(3) * sqrt(3) is 2.9999999999999996,
    # which would push the score of identical vectors above 1.0.
    denominator = math.sqrt(left_sq * right_sq)
    if not denominator:
        return 0.0

    score = dot / denominator
    # Guard against residual rounding so callers can rely on the range.
    if score > 1.0:
        return 1.0
    if score < -1.0:
        return -1.0
    return score
@dataclass
class SimpleEmbeddingIndex:
    """In-memory index that ranks records by bag-of-words cosine similarity.

    The ``@dataclass`` decorator is required: ``entries`` is declared with
    ``field(default_factory=dict)``, which only becomes a per-instance dict
    once the dataclass machinery processes it. Without the decorator the
    attribute is a bare ``Field`` object and ``add`` fails on assignment.
    """

    # Maps record id -> token-count vector built by ``vectorize``.
    entries: dict[str, Counter[str]] = field(default_factory=dict)

    def add(self, record_id: str, text: str) -> None:
        """Index *text* under *record_id*, replacing any existing entry."""
        self.entries[record_id] = vectorize(text)

    def search(self, query: str, limit: int = 5) -> list[tuple[str, float]]:
        """Score every indexed record against *query*.

        Args:
            query: Free text to compare against the indexed records.
            limit: Upper slice bound applied to the ranked results.

        Returns:
            ``(record_id, score)`` pairs sorted by descending cosine
            similarity, truncated with ``[:limit]``. Records with equal
            scores keep their insertion order because the sort is stable.
        """
        qvec = vectorize(query)
        scored = [
            (record_id, cosine_similarity(qvec, vec))
            for record_id, vec in self.entries.items()
        ]
        return sorted(scored, key=lambda item: item[1], reverse=True)[:limit]
def extract_keywords(text: str, limit: int = 6) -> list[str]:
    """Return the most frequent tokens of *text* that are longer than two characters.

    Args:
        text: The string to tokenize with ``tokenize``.
        limit: Passed to ``Counter.most_common`` as the number of entries
            to keep.

    Returns:
        The selected tokens, most frequent first.
    """
    frequencies: Counter[str] = Counter()
    for token in tokenize(text):
        # Tokens of one or two characters are ignored.
        if len(token) >= 3:
            frequencies[token] += 1
    ranked = frequencies.most_common(limit)
    return [keyword for keyword, _count in ranked]