Spaces:

build-small-hackathon
/

mm1

Sleeping

App Files Files Community

mm1 / src /memory /extractor.py

TheRealHubertus

Upload extractor.py

9a43107 verified 12 days ago

Raw

History Blame Contribute Delete

2.68 kB

	from __future__ import annotations

	import re
	import uuid

	from src.schemas import MemoryExtractionResult, MemoryFact


	# Ordered (kind, regex) pairs used to find memory candidates in a message.
	# The extractor applies every regex with re.IGNORECASE, so the keyword
	# alternatives match regardless of capitalisation. Most patterns capture
	# up to the next sentence terminator (".", "!" or "?").
	PATTERNS: list[tuple[str, str]] = [
	    ("profile", r"\bmy name is\s+[A-ZÄÖÜ][A-Za-zÄÖÜäöüß -]{1,60}"),
	    ("profile", r"\bI am called\s+[A-ZÄÖÜ][A-Za-zÄÖÜäöüß -]{1,60}"),
	    # The bare "I am X" form is ambiguous ("I am tired", "I am called ..."),
	    # so the initial letter stays case-sensitive via a scoped (?-i:...) group
	    # even though the caller passes re.IGNORECASE.
	    ("profile", r"\bI am\s+(?-i:[A-ZÄÖÜ])[A-Za-zÄÖÜäöüß-]{1,60}\b"),
	    ("preference", r"\bI (?:like|prefer|love|enjoy|want)\b[^.!?]*"),
	    ("avoid", r"\bI (?:dislike|hate|avoid|do not want|don't want)\b[^.!?]*"),
	    ("constraint", r"\b(?:I do not have|I don't have|no local GPU|without a local GPU|my constraint is)\b[^.!?]*"),
	    ("skill", r"\bI (?:can|know|am comfortable with|work with)\b[^.!?]*"),
	    ("goal", r"\b(?:my goal is|I want to build|I want the internet|I need to)\b[^.!?]*"),
	    ("value", r"\b(?:matters to me|I care about|free again|open internet|local-first)\b[^.!?]*"),
	]


	def _clean(text: str) -> str:
	    """Collapse whitespace runs into single spaces and trim the ends.

	    Leading and trailing spaces and periods are removed after collapsing.
	    """
	    collapsed = re.sub(r"\s+", " ", text)
	    return collapsed.strip(" .")


	def _normalize_fact_text(kind: str, text: str) -> str:
	    """Rewrite a matched fact into its stored form.

	    For ``kind == "profile"`` the self-introduction prefix ("my name is",
	    "I am called", "I am") is dropped and the remainder is returned as
	    ``"User name: <name>"``. Any other kind, or a profile text without one
	    of those prefixes, is returned unchanged.
	    """
	    if kind != "profile":
	        return text
	    # "I am called" is listed before "I am" so the longer prefix wins.
	    name_match = re.search(
	        r"\b(?:my name is|I am called|I am)\s+(.+)$",
	        text,
	        flags=re.IGNORECASE,
	    )
	    if name_match is None:
	        return text
	    return f"User name: {name_match.group(1).strip()}"


	def extract_memory_candidates(message: str) -> MemoryExtractionResult:
	    """Extract candidate memory facts from a single message.

	    Two passes are made over ``message``:

	    1. If the whole message is one capitalised word (optionally surrounded
	       by whitespace) and is not a common greeting or acknowledgement, it
	       is recorded as the user's name with confidence 0.68.
	    2. Every regex in ``PATTERNS`` is applied case-insensitively; each
	       match is cleaned, normalised and recorded with confidence 0.72.

	    Matches shorter than 8 characters after cleaning are skipped, and facts
	    whose lower-cased text was already recorded are not added again. The
	    first 240 characters of the message are stored as evidence on each fact.
	    """
	    candidates: list[MemoryFact] = []
	    seen: set[str] = set()
	    evidence = message[:240]

	    # Surrounding whitespace is optional: a plain "Alice" must match just
	    # like " Alice\n" does.
	    bare_name = re.fullmatch(r"\s*([A-ZÄÖÜ][A-Za-zÄÖÜäöüß-]{1,60})\s*", message)
	    if bare_name and bare_name.group(1).lower() not in {"hello", "hi", "hey", "yes", "no", "ok", "okay"}:
	        candidates.append(
	            MemoryFact(
	                id=f"mem_{uuid.uuid4().hex[:10]}",
	                kind="profile",
	                text=f"User name: {bare_name.group(1)}",
	                evidence=evidence,
	                confidence=0.68,
	            )
	        )
	        seen.add(candidates[-1].text.lower())

	    for kind, pattern in PATTERNS:
	        for match in re.finditer(pattern, message, flags=re.IGNORECASE):
	            text = _clean(match.group(0))
	            # Skip fragments too short to be useful; checked before
	            # normalisation adds the "User name: " prefix.
	            if len(text) < 8:
	                continue
	            text = _normalize_fact_text(kind, text)
	            normalized = text.lower()
	            if normalized in seen:
	                continue
	            seen.add(normalized)
	            candidates.append(
	                MemoryFact(
	                    id=f"mem_{uuid.uuid4().hex[:10]}",
	                    kind=kind,
	                    text=text,
	                    evidence=evidence,
	                    confidence=0.72,
	                )
	            )
	    return MemoryExtractionResult(candidates)