Spaces:

AthelaPerk
/

mnemo

Sleeping

App Files Files Community

AthelaPerk committed on Mar 9

Commit

c08de39

verified ·

1 Parent(s): c49ff1f

v6.3: Case-insensitive NER in graph_search

Browse files

Files changed (1) hide show

mnemo_core.py +70 -36

mnemo_core.py CHANGED Viewed

@@ -1093,31 +1093,70 @@ class MnemoEngine:
     def graph_search(self, query: str, top_k: int = 15,
                      active_sessions: Optional[List[str]] = None) -> List[dict]:
-        """v6.2: Fixed entity retrieval for multi-word names from auto-extract.
-        v6.2 changes:
-        - EntityIndex now supports word-level lookup (main fix)
-        - Added multi-word entity detection from query ("Alistair Fitzroy")
-        - Added value content scan — CPs mentioning entity names in their
-          value field get a small graph boost even without entity field match
-        - Lowered semantic threshold from 0.25 to 0.20 for better recall
         v6.1: both signals always contribute.
         """
         # Computed outside lock to prevent blocking concurrent API requests
         query_emb = self._get_embedding(query)
         with self._lock:
             self.stats["graph_searches"] += 1
-            # === SIGNAL 1: Entity graph traversal (fast, precise) ===
             entity_candidates = set(re.findall(r'\b[A-Z][a-z]{2,}\b', query))
-            query_words = set(w.lower() for w in query.split() if len(w) > 2)
-            # v6.2: Also detect multi-word entity names from adjacent capitalized words
-            # "Tell me about Alistair Fitzroy" → also try "Alistair Fitzroy" as one entity
-            words = query.split()
             multi_word_entities = set()
             i = 0
             while i < len(words):
                 if re.match(r'^[A-Z][a-z]{2,}$', words[i]):
@@ -1127,14 +1166,14 @@ class MnemoEngine:
                         parts.append(words[j])
                         j += 1
                     if len(parts) >= 2:
-                        multi_word_entities.add(" ".join(parts))
                     i = j
                 else:
                     i += 1
             graph_results: Dict[str, float] = {}
-            # Single-word entity lookups (now also hits word-level index via v6.2 EntityIndex)
             for candidate in entity_candidates:
                 cp_ids = self.entity_index.lookup_entity(candidate)
                 for cp_id in cp_ids:
@@ -1146,26 +1185,24 @@ class MnemoEngine:
                         for cp_id in conn_ids:
                             graph_results[cp_id] = graph_results.get(cp_id, 0) + 0.6
-            # v6.2: Multi-word entity lookups (exact match on full name)
             for mw_entity in multi_word_entities:
-                cp_ids = self.entity_index.by_entity.get(mw_entity.lower(), [])
                 for cp_id in cp_ids:
-                    graph_results[cp_id] = graph_results.get(cp_id, 0) + 0.5  # higher than single-word
-            # v6.2: Value content scan — if entity names appear in CP value text,
-            # give a small graph boost. Catches CPs where entity field differs
-            # but the content mentions the character.
-            if entity_candidates:
-                entity_lower = {e.lower() for e in entity_candidates if len(e) >= 4}
-                if entity_lower:
-                    for cp_id, cp in self.connection_points.items():
-                        if cp_id in graph_results:
-                            continue  # already found via entity index
-                        val_lower = cp.value.lower()
-                        for ent in entity_lower:
-                            if ent in val_lower:
-                                graph_results[cp_id] = graph_results.get(cp_id, 0) + 0.25
-                                break  # one match per CP is enough
             type_keywords = {
                 "relationship": ["relationship", "brother", "sister", "friend", "rival",
@@ -1183,7 +1220,6 @@ class MnemoEngine:
                             graph_results[cp_id] = graph_results.get(cp_id, 0) + 0.2
             # === SIGNAL 2: Semantic search on ALL CP embeddings (always runs) ===
-            # v6.2: Lowered threshold from 0.25 to 0.20 for better recall
             semantic_results: Dict[str, float] = {}
             for cp_id, cp in self.connection_points.items():
                 if active_sessions and cp.session_id and cp.session_id not in active_sessions:
@@ -1195,8 +1231,6 @@ class MnemoEngine:
                         semantic_results[cp_id] = sim
             # === COMBINE: semantic * 0.6 + graph * 0.4 ===
-            # Semantic is primary (finds cross-entity relevance)
-            # Graph boosts entity-matched results
             if active_sessions:
                 graph_results = {
                     cp_id: score for cp_id, score in graph_results.items()

     def graph_search(self, query: str, top_k: int = 15,
                      active_sessions: Optional[List[str]] = None) -> List[dict]:
+        """v6.3: Case-insensitive entity recognition.
+        v6.3 changes:
+        - NER no longer requires Title Case. "tell me about alistair" now works.
+        - Three-signal entity extraction:
+          1a. Title Case regex (original, high confidence)
+          1b. Case-insensitive word matching against EntityIndex keys
+          1c. Case-insensitive multi-word matching against full entity names
+        - Stopword list prevents false positives ("tell", "about", "write")
+        v6.2: word-level EntityIndex, value content scan, lowered threshold
         v6.1: both signals always contribute.
         """
         # Computed outside lock to prevent blocking concurrent API requests
         query_emb = self._get_embedding(query)
+        # NER stopwords — common words that should never be treated as entities
+        _NER_STOP = {
+            'the','this','that','these','those','what','which','who','whom',
+            'where','when','how','why','will','would','could','should','can',
+            'may','might','shall','must','about','with','from','into','through',
+            'during','before','after','between','under','above','does','have',
+            'has','had','was','were','been','being','are','not','but','and',
+            'for','nor','yet','also','just','very','too','some','any','all',
+            'each','every','both','few','more','most','other','only','own',
+            'than','then','now','here','there','tell','show','give','get',
+            'find','know','remember','recall','write','create','describe',
+            'make','help','please','scene','chapter','story','book',
+            'character','plot','setting','like','want','need','think',
+            'said','says','going','come','came','take','took','keep',
+        }
         with self._lock:
             self.stats["graph_searches"] += 1
+            # === SIGNAL 1: Entity graph traversal ===
+            # 1a. Title Case regex (original — still useful for proper nouns)
             entity_candidates = set(re.findall(r'\b[A-Z][a-z]{2,}\b', query))
+            # 1b. Case-insensitive: check ALL query words against EntityIndex
+            # This catches "tell me about alistair" where "alistair" is lowercase
+            query_words_raw = re.findall(r"\b\w{3,}\b", query)
+            query_words_lower = {w.lower() for w in query_words_raw} - _NER_STOP
+            # Collect all known entity keys (full names + individual words)
+            known_full = set(self.entity_index.by_entity.keys())      # "alistair fitzroy", "sebastian carlisle"
+            known_words = set(self.entity_index.by_entity_word.keys()) # "alistair", "fitzroy", "sebastian"
+            known_all = known_full | known_words
+            # Match query words against known entities (case-insensitive)
+            for qw in query_words_lower:
+                if qw in known_all:
+                    entity_candidates.add(qw.title())  # Add as Title Case for uniform processing
+            # 1c. Multi-word entity detection (case-insensitive)
+            query_lower = query.lower()
             multi_word_entities = set()
+            for full_name in known_full:
+                if ' ' in full_name and full_name in query_lower:
+                    multi_word_entities.add(full_name)
+            # Also detect adjacent capitalized words (original v6.2 logic)
+            words = query.split()
             i = 0
             while i < len(words):
                 if re.match(r'^[A-Z][a-z]{2,}$', words[i]):
                         parts.append(words[j])
                         j += 1
                     if len(parts) >= 2:
+                        multi_word_entities.add(" ".join(parts).lower())
                     i = j
                 else:
                     i += 1
             graph_results: Dict[str, float] = {}
+            # Single-word entity lookups
             for candidate in entity_candidates:
                 cp_ids = self.entity_index.lookup_entity(candidate)
                 for cp_id in cp_ids:
                         for cp_id in conn_ids:
                             graph_results[cp_id] = graph_results.get(cp_id, 0) + 0.6
+            # Multi-word entity lookups (exact match on full name)
             for mw_entity in multi_word_entities:
+                mw_key = mw_entity.lower()
+                cp_ids = self.entity_index.by_entity.get(mw_key, [])
                 for cp_id in cp_ids:
+                    graph_results[cp_id] = graph_results.get(cp_id, 0) + 0.5
+            # Value content scan — entity names in CP value text
+            entity_lower = {e.lower() for e in entity_candidates if len(e) >= 4}
+            if entity_lower:
+                for cp_id, cp in self.connection_points.items():
+                    if cp_id in graph_results:
+                        continue
+                    val_lower = cp.value.lower()
+                    for ent in entity_lower:
+                        if ent in val_lower:
+                            graph_results[cp_id] = graph_results.get(cp_id, 0) + 0.25
+                            break
             type_keywords = {
                 "relationship": ["relationship", "brother", "sister", "friend", "rival",
                             graph_results[cp_id] = graph_results.get(cp_id, 0) + 0.2
             # === SIGNAL 2: Semantic search on ALL CP embeddings (always runs) ===
             semantic_results: Dict[str, float] = {}
             for cp_id, cp in self.connection_points.items():
                 if active_sessions and cp.session_id and cp.session_id not in active_sessions:
                         semantic_results[cp_id] = sim
             # === COMBINE: semantic * 0.6 + graph * 0.4 ===
             if active_sessions:
                 graph_results = {
                     cp_id: score for cp_id, score in graph_results.items()