Spaces:

build-small-hackathon
/

mm1

Running

File size: 2,676 Bytes

from __future__ import annotations

import re
import uuid

from src.schemas import MemoryExtractionResult, MemoryFact


# Ordered (kind, regex) pairs scanned by extract_memory_candidates, which
# applies every pattern with re.IGNORECASE. Each regex matches a cue phrase
# plus the text that follows it; the non-profile patterns run up to the next
# '.', '!' or '?'.
PATTERNS: list[tuple[str, str]] = [
    # Explicit name cues. Because of IGNORECASE the leading [A-ZÄÖÜ] class
    # also accepts lower-case letters here, so "my name is bob" still matches.
    ("profile", r"\bmy name is\s+[A-ZÄÖÜ][A-Za-zÄÖÜäöüß -]{1,60}"),
    ("profile", r"\bI am called\s+[A-ZÄÖÜ][A-Za-zÄÖÜäöüß -]{1,60}"),
    # A bare "I am <word>" is ambiguous ("I am tired"), so the first letter
    # must really be upper-case: (?-i:...) switches IGNORECASE off for that
    # one character. The (?!called\b) lookahead keeps this pattern from
    # re-matching the "I am called ..." cue handled above, which previously
    # produced a bogus name "called".
    ("profile", r"\bI am\s+(?!called\b)(?-i:[A-ZÄÖÜ])[A-Za-zÄÖÜäöüß-]{1,60}\b"),
    ("preference", r"\bI (?:like|prefer|love|enjoy|want)\b[^.!?]*"),
    ("avoid", r"\bI (?:dislike|hate|avoid|do not want|don't want)\b[^.!?]*"),
    ("constraint", r"\b(?:I do not have|I don't have|no local GPU|without a local GPU|my constraint is)\b[^.!?]*"),
    ("skill", r"\bI (?:can|know|am comfortable with|work with)\b[^.!?]*"),
    ("goal", r"\b(?:my goal is|I want to build|I want the internet|I need to)\b[^.!?]*"),
    ("value", r"\b(?:matters to me|I care about|free again|open internet|local-first)\b[^.!?]*"),
]


def _clean(text: str) -> str:
    return re.sub(r"\s+", " ", text).strip(" .")


def _normalize_fact_text(kind: str, text: str) -> str:
    if kind == "profile":
        name_match = re.search(r"\b(?:my name is|I am called|I am)\s+(.+)$", text, flags=re.IGNORECASE)
        if name_match:
            return f"User name: {name_match.group(1).strip()}"
    return text


def extract_memory_candidates(message: str) -> MemoryExtractionResult:
    """Scan a message for memory-worthy facts using the regexes in PATTERNS.

    A message consisting of a single capitalised word (other than a short
    list of greetings/acknowledgements) is recorded as the user's name with
    confidence 0.68. Every PATTERNS match of at least 8 cleaned characters is
    then recorded with confidence 0.72, skipping texts already seen
    (compared case-insensitively).

    Args:
        message: The raw message text; its first 240 characters are stored
            as evidence on every fact.

    Returns:
        A MemoryExtractionResult built from the collected MemoryFact list.
    """
    evidence = message[:240]
    facts: list[MemoryFact] = []
    known: set[str] = set()

    def _make_fact(kind: str, text: str, confidence: float) -> MemoryFact:
        # Every fact gets a fresh short random id and shares the same evidence.
        return MemoryFact(
            id=f"mem_{uuid.uuid4().hex[:10]}",
            kind=kind,
            text=text,
            evidence=evidence,
            confidence=confidence,
        )

    # A lone capitalised word is treated as a name unless it is a greeting/ack.
    not_names = {"hello", "hi", "hey", "yes", "no", "ok", "okay"}
    lone_word = re.fullmatch(r"\s*([A-ZÄÖÜ][A-Za-zÄÖÜäöüß-]{1,60})\s*", message)
    if lone_word and lone_word.group(1).lower() not in not_names:
        facts.append(_make_fact("profile", f"User name: {lone_word.group(1)}", 0.68))
        known.add(facts[-1].text.lower())

    for kind, pattern in PATTERNS:
        for hit in re.finditer(pattern, message, flags=re.IGNORECASE):
            snippet = _clean(hit.group(0))
            # Very short matches are dropped before normalisation.
            if len(snippet) >= 8:
                fact_text = _normalize_fact_text(kind, snippet)
                dedupe_key = fact_text.lower()
                if dedupe_key not in known:
                    known.add(dedupe_key)
                    facts.append(_make_fact(kind, fact_text, 0.72))

    return MemoryExtractionResult(facts)