mm1 / src /memory /extractor.py
TheRealHubertus's picture
Upload extractor.py
9a43107 verified
Raw
History Blame Contribute Delete
2.68 kB
from __future__ import annotations
import re
import uuid
from src.schemas import MemoryExtractionResult, MemoryFact
# Ordered (kind, regex) pairs used to pull candidate memory facts out of a
# user message.  The extraction loop in this module applies every pattern with
# ``re.IGNORECASE``, so keywords such as "my name is" match in any casing.
#
# Under that flag a plain ``[A-ZÄÖÜ]`` class also matches lower-case letters,
# which made the bare "I am <word>" pattern fire on ordinary sentences
# ("I am called ..." -> name "called", "I am comfortable ..." -> name
# "comfortable").  The scoped ``(?-i:...)`` group switches case-insensitivity
# off for the first letter of the name only, so that pattern now requires a
# capitalised word.  The explicit "my name is" / "I am called" patterns are
# left as they were and still accept a lower-case name.
PATTERNS: list[tuple[str, str]] = [
    # --- profile: statements that introduce the user's name ---
    ("profile", r"\bmy name is\s+[A-ZÄÖÜ][A-Za-zÄÖÜäöüß -]{1,60}"),
    ("profile", r"\bI am called\s+[A-ZÄÖÜ][A-Za-zÄÖÜäöüß -]{1,60}"),
    ("profile", r"\bI am\s+(?-i:[A-ZÄÖÜ])[A-Za-zÄÖÜäöüß-]{1,60}\b"),
    # --- everything below captures from the trigger phrase up to the next
    # sentence terminator (".", "!" or "?") ---
    ("preference", r"\bI (?:like|prefer|love|enjoy|want)\b[^.!?]*"),
    ("avoid", r"\bI (?:dislike|hate|avoid|do not want|don't want)\b[^.!?]*"),
    ("constraint", r"\b(?:I do not have|I don't have|no local GPU|without a local GPU|my constraint is)\b[^.!?]*"),
    ("skill", r"\bI (?:can|know|am comfortable with|work with)\b[^.!?]*"),
    ("goal", r"\b(?:my goal is|I want to build|I want the internet|I need to)\b[^.!?]*"),
    ("value", r"\b(?:matters to me|I care about|free again|open internet|local-first)\b[^.!?]*"),
]
def _clean(text: str) -> str:
return re.sub(r"\s+", " ", text).strip(" .")
def _normalize_fact_text(kind: str, text: str) -> str:
if kind == "profile":
name_match = re.search(r"\b(?:my name is|I am called|I am)\s+(.+)$", text, flags=re.IGNORECASE)
if name_match:
return f"User name: {name_match.group(1).strip()}"
return text
def extract_memory_candidates(message: str) -> MemoryExtractionResult:
    """Scan *message* for statements worth remembering about the user.

    Two passes are made:

    1. If the whole message is a single capitalised word (and not one of a
       few common greetings/acknowledgements) it is recorded as the user's
       name with confidence 0.68.
    2. Every regex in ``PATTERNS`` is searched case-insensitively; each hit
       is cleaned, normalised, and recorded with confidence 0.72 unless it
       is shorter than 8 characters or its lower-cased text was already
       recorded.

    Each fact carries the first 240 characters of *message* as evidence.

    Args:
        message: Raw user message text.

    Returns:
        A ``MemoryExtractionResult`` built from the list of candidate facts,
        in discovery order.
    """
    not_names = {"hello", "hi", "hey", "yes", "no", "ok", "okay"}
    facts: list[MemoryFact] = []
    known: set[str] = set()

    # Pass 1: a message consisting of one capitalised word is treated as a name.
    lone_word = re.fullmatch(r"\s*([A-ZÄÖÜ][A-Za-zÄÖÜäöüß-]{1,60})\s*", message)
    excerpt = message[:240]
    if lone_word is not None:
        word = lone_word.group(1)
        if word.lower() not in not_names:
            name_fact = MemoryFact(
                id=f"mem_{uuid.uuid4().hex[:10]}",
                kind="profile",
                text=f"User name: {word}",
                evidence=excerpt,
                confidence=0.68,
            )
            facts.append(name_fact)
            known.add(name_fact.text.lower())

    # Pass 2: phrase patterns, de-duplicated on the lower-cased fact text.
    for kind, pattern in PATTERNS:
        for hit in re.finditer(pattern, message, flags=re.IGNORECASE):
            snippet = _clean(hit.group(0))
            if len(snippet) >= 8:
                fact_text = _normalize_fact_text(kind, snippet)
                dedupe_key = fact_text.lower()
                if dedupe_key not in known:
                    known.add(dedupe_key)
                    facts.append(
                        MemoryFact(
                            id=f"mem_{uuid.uuid4().hex[:10]}",
                            kind=kind,
                            text=fact_text,
                            evidence=excerpt,
                            confidence=0.72,
                        )
                    )

    return MemoryExtractionResult(facts)