AthelaPerk committed on
Commit
c08de39
·
verified ·
1 Parent(s): c49ff1f

v6.3: Case-insensitive NER in graph_search

Browse files
Files changed (1) hide show
  1. mnemo_core.py +70 -36
mnemo_core.py CHANGED
@@ -1093,31 +1093,70 @@ class MnemoEngine:
1093
 
1094
  def graph_search(self, query: str, top_k: int = 15,
1095
  active_sessions: Optional[List[str]] = None) -> List[dict]:
1096
- """v6.2: Fixed entity retrieval for multi-word names from auto-extract.
1097
 
1098
- v6.2 changes:
1099
- - EntityIndex now supports word-level lookup (main fix)
1100
- - Added multi-word entity detection from query ("Alistair Fitzroy")
1101
- - Added value content scan CPs mentioning entity names in their
1102
- value field get a small graph boost even without entity field match
1103
- - Lowered semantic threshold from 0.25 to 0.20 for better recall
 
1104
 
 
1105
  v6.1: both signals always contribute.
1106
  """
1107
  # Computed outside lock to prevent blocking concurrent API requests
1108
  query_emb = self._get_embedding(query)
1109
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1110
  with self._lock:
1111
  self.stats["graph_searches"] += 1
1112
 
1113
- # === SIGNAL 1: Entity graph traversal (fast, precise) ===
 
 
1114
  entity_candidates = set(re.findall(r'\b[A-Z][a-z]{2,}\b', query))
1115
- query_words = set(w.lower() for w in query.split() if len(w) > 2)
1116
-
1117
- # v6.2: Also detect multi-word entity names from adjacent capitalized words
1118
- # "Tell me about Alistair Fitzroy" → also try "Alistair Fitzroy" as one entity
1119
- words = query.split()
 
 
 
 
 
 
 
 
 
 
 
 
 
1120
  multi_word_entities = set()
 
 
 
 
 
 
1121
  i = 0
1122
  while i < len(words):
1123
  if re.match(r'^[A-Z][a-z]{2,}$', words[i]):
@@ -1127,14 +1166,14 @@ class MnemoEngine:
1127
  parts.append(words[j])
1128
  j += 1
1129
  if len(parts) >= 2:
1130
- multi_word_entities.add(" ".join(parts))
1131
  i = j
1132
  else:
1133
  i += 1
1134
 
1135
  graph_results: Dict[str, float] = {}
1136
 
1137
- # Single-word entity lookups (now also hits word-level index via v6.2 EntityIndex)
1138
  for candidate in entity_candidates:
1139
  cp_ids = self.entity_index.lookup_entity(candidate)
1140
  for cp_id in cp_ids:
@@ -1146,26 +1185,24 @@ class MnemoEngine:
1146
  for cp_id in conn_ids:
1147
  graph_results[cp_id] = graph_results.get(cp_id, 0) + 0.6
1148
 
1149
- # v6.2: Multi-word entity lookups (exact match on full name)
1150
  for mw_entity in multi_word_entities:
1151
- cp_ids = self.entity_index.by_entity.get(mw_entity.lower(), [])
 
1152
  for cp_id in cp_ids:
1153
- graph_results[cp_id] = graph_results.get(cp_id, 0) + 0.5 # higher than single-word
1154
-
1155
- # v6.2: Value content scan — if entity names appear in CP value text,
1156
- # give a small graph boost. Catches CPs where entity field differs
1157
- # but the content mentions the character.
1158
- if entity_candidates:
1159
- entity_lower = {e.lower() for e in entity_candidates if len(e) >= 4}
1160
- if entity_lower:
1161
- for cp_id, cp in self.connection_points.items():
1162
- if cp_id in graph_results:
1163
- continue # already found via entity index
1164
- val_lower = cp.value.lower()
1165
- for ent in entity_lower:
1166
- if ent in val_lower:
1167
- graph_results[cp_id] = graph_results.get(cp_id, 0) + 0.25
1168
- break # one match per CP is enough
1169
 
1170
  type_keywords = {
1171
  "relationship": ["relationship", "brother", "sister", "friend", "rival",
@@ -1183,7 +1220,6 @@ class MnemoEngine:
1183
  graph_results[cp_id] = graph_results.get(cp_id, 0) + 0.2
1184
 
1185
  # === SIGNAL 2: Semantic search on ALL CP embeddings (always runs) ===
1186
- # v6.2: Lowered threshold from 0.25 to 0.20 for better recall
1187
  semantic_results: Dict[str, float] = {}
1188
  for cp_id, cp in self.connection_points.items():
1189
  if active_sessions and cp.session_id and cp.session_id not in active_sessions:
@@ -1195,8 +1231,6 @@ class MnemoEngine:
1195
  semantic_results[cp_id] = sim
1196
 
1197
  # === COMBINE: semantic * 0.6 + graph * 0.4 ===
1198
- # Semantic is primary (finds cross-entity relevance)
1199
- # Graph boosts entity-matched results
1200
  if active_sessions:
1201
  graph_results = {
1202
  cp_id: score for cp_id, score in graph_results.items()
 
1093
 
1094
  def graph_search(self, query: str, top_k: int = 15,
1095
  active_sessions: Optional[List[str]] = None) -> List[dict]:
1096
+ """v6.3: Case-insensitive entity recognition.
1097
 
1098
+ v6.3 changes:
1099
+ - NER no longer requires Title Case. "tell me about alistair" now works.
1100
+ - Three-signal entity extraction:
1101
+ 1a. Title Case regex (original, high confidence)
1102
+ 1b. Case-insensitive word matching against EntityIndex keys
1103
+ 1c. Case-insensitive multi-word matching against full entity names
1104
+ - Stopword list prevents false positives ("tell", "about", "write")
1105
 
1106
+ v6.2: word-level EntityIndex, value content scan, lowered threshold
1107
  v6.1: both signals always contribute.
1108
  """
1109
  # Computed outside lock to prevent blocking concurrent API requests
1110
  query_emb = self._get_embedding(query)
1111
 
1112
+ # NER stopwords — common words that should never be treated as entities
1113
+ _NER_STOP = {
1114
+ 'the','this','that','these','those','what','which','who','whom',
1115
+ 'where','when','how','why','will','would','could','should','can',
1116
+ 'may','might','shall','must','about','with','from','into','through',
1117
+ 'during','before','after','between','under','above','does','have',
1118
+ 'has','had','was','were','been','being','are','not','but','and',
1119
+ 'for','nor','yet','also','just','very','too','some','any','all',
1120
+ 'each','every','both','few','more','most','other','only','own',
1121
+ 'than','then','now','here','there','tell','show','give','get',
1122
+ 'find','know','remember','recall','write','create','describe',
1123
+ 'make','help','please','scene','chapter','story','book',
1124
+ 'character','plot','setting','like','want','need','think',
1125
+ 'said','says','going','come','came','take','took','keep',
1126
+ }
1127
+
1128
  with self._lock:
1129
  self.stats["graph_searches"] += 1
1130
 
1131
+ # === SIGNAL 1: Entity graph traversal ===
1132
+
1133
+ # 1a. Title Case regex (original — still useful for proper nouns)
1134
  entity_candidates = set(re.findall(r'\b[A-Z][a-z]{2,}\b', query))
1135
+
1136
+ # 1b. Case-insensitive: check ALL query words against EntityIndex
1137
+ # This catches "tell me about alistair" where "alistair" is lowercase
1138
+ query_words_raw = re.findall(r"\b\w{3,}\b", query)
1139
+ query_words_lower = {w.lower() for w in query_words_raw} - _NER_STOP
1140
+
1141
+ # Collect all known entity keys (full names + individual words)
1142
+ known_full = set(self.entity_index.by_entity.keys()) # "alistair fitzroy", "sebastian carlisle"
1143
+ known_words = set(self.entity_index.by_entity_word.keys()) # "alistair", "fitzroy", "sebastian"
1144
+ known_all = known_full | known_words
1145
+
1146
+ # Match query words against known entities (case-insensitive)
1147
+ for qw in query_words_lower:
1148
+ if qw in known_all:
1149
+ entity_candidates.add(qw.title()) # Add as Title Case for uniform processing
1150
+
1151
+ # 1c. Multi-word entity detection (case-insensitive)
1152
+ query_lower = query.lower()
1153
  multi_word_entities = set()
1154
+ for full_name in known_full:
1155
+ if ' ' in full_name and full_name in query_lower:
1156
+ multi_word_entities.add(full_name)
1157
+
1158
+ # Also detect adjacent capitalized words (original v6.2 logic)
1159
+ words = query.split()
1160
  i = 0
1161
  while i < len(words):
1162
  if re.match(r'^[A-Z][a-z]{2,}$', words[i]):
 
1166
  parts.append(words[j])
1167
  j += 1
1168
  if len(parts) >= 2:
1169
+ multi_word_entities.add(" ".join(parts).lower())
1170
  i = j
1171
  else:
1172
  i += 1
1173
 
1174
  graph_results: Dict[str, float] = {}
1175
 
1176
+ # Single-word entity lookups
1177
  for candidate in entity_candidates:
1178
  cp_ids = self.entity_index.lookup_entity(candidate)
1179
  for cp_id in cp_ids:
 
1185
  for cp_id in conn_ids:
1186
  graph_results[cp_id] = graph_results.get(cp_id, 0) + 0.6
1187
 
1188
+ # Multi-word entity lookups (exact match on full name)
1189
  for mw_entity in multi_word_entities:
1190
+ mw_key = mw_entity.lower()
1191
+ cp_ids = self.entity_index.by_entity.get(mw_key, [])
1192
  for cp_id in cp_ids:
1193
+ graph_results[cp_id] = graph_results.get(cp_id, 0) + 0.5
1194
+
1195
+ # Value content scan — entity names in CP value text
1196
+ entity_lower = {e.lower() for e in entity_candidates if len(e) >= 4}
1197
+ if entity_lower:
1198
+ for cp_id, cp in self.connection_points.items():
1199
+ if cp_id in graph_results:
1200
+ continue
1201
+ val_lower = cp.value.lower()
1202
+ for ent in entity_lower:
1203
+ if ent in val_lower:
1204
+ graph_results[cp_id] = graph_results.get(cp_id, 0) + 0.25
1205
+ break
 
 
 
1206
 
1207
  type_keywords = {
1208
  "relationship": ["relationship", "brother", "sister", "friend", "rival",
 
1220
  graph_results[cp_id] = graph_results.get(cp_id, 0) + 0.2
1221
 
1222
  # === SIGNAL 2: Semantic search on ALL CP embeddings (always runs) ===
 
1223
  semantic_results: Dict[str, float] = {}
1224
  for cp_id, cp in self.connection_points.items():
1225
  if active_sessions and cp.session_id and cp.session_id not in active_sessions:
 
1231
  semantic_results[cp_id] = sim
1232
 
1233
  # === COMBINE: semantic * 0.6 + graph * 0.4 ===
 
 
1234
  if active_sessions:
1235
  graph_results = {
1236
  cp_id: score for cp_id, score in graph_results.items()