Spaces:
Sleeping
Sleeping
v6.3: Case-insensitive NER in graph_search
Browse files: mnemo_core.py (+70 -36)
mnemo_core.py
CHANGED
|
@@ -1093,31 +1093,70 @@ class MnemoEngine:
|
|
| 1093 |
|
| 1094 |
def graph_search(self, query: str, top_k: int = 15,
|
| 1095 |
active_sessions: Optional[List[str]] = None) -> List[dict]:
|
| 1096 |
-
"""v6.
|
| 1097 |
|
| 1098 |
-
v6.
|
| 1099 |
-
-
|
| 1100 |
-
-
|
| 1101 |
-
|
| 1102 |
-
|
| 1103 |
-
|
|
|
|
| 1104 |
|
|
|
|
| 1105 |
v6.1: both signals always contribute.
|
| 1106 |
"""
|
| 1107 |
# Computed outside lock to prevent blocking concurrent API requests
|
| 1108 |
query_emb = self._get_embedding(query)
|
| 1109 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1110 |
with self._lock:
|
| 1111 |
self.stats["graph_searches"] += 1
|
| 1112 |
|
| 1113 |
-
# === SIGNAL 1: Entity graph traversal
|
|
|
|
|
|
|
| 1114 |
entity_candidates = set(re.findall(r'\b[A-Z][a-z]{2,}\b', query))
|
| 1115 |
-
|
| 1116 |
-
|
| 1117 |
-
#
|
| 1118 |
-
|
| 1119 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1120 |
multi_word_entities = set()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1121 |
i = 0
|
| 1122 |
while i < len(words):
|
| 1123 |
if re.match(r'^[A-Z][a-z]{2,}$', words[i]):
|
|
@@ -1127,14 +1166,14 @@ class MnemoEngine:
|
|
| 1127 |
parts.append(words[j])
|
| 1128 |
j += 1
|
| 1129 |
if len(parts) >= 2:
|
| 1130 |
-
multi_word_entities.add(" ".join(parts))
|
| 1131 |
i = j
|
| 1132 |
else:
|
| 1133 |
i += 1
|
| 1134 |
|
| 1135 |
graph_results: Dict[str, float] = {}
|
| 1136 |
|
| 1137 |
-
# Single-word entity lookups
|
| 1138 |
for candidate in entity_candidates:
|
| 1139 |
cp_ids = self.entity_index.lookup_entity(candidate)
|
| 1140 |
for cp_id in cp_ids:
|
|
@@ -1146,26 +1185,24 @@ class MnemoEngine:
|
|
| 1146 |
for cp_id in conn_ids:
|
| 1147 |
graph_results[cp_id] = graph_results.get(cp_id, 0) + 0.6
|
| 1148 |
|
| 1149 |
-
#
|
| 1150 |
for mw_entity in multi_word_entities:
|
| 1151 |
-
|
|
|
|
| 1152 |
for cp_id in cp_ids:
|
| 1153 |
-
graph_results[cp_id] = graph_results.get(cp_id, 0) + 0.5
|
| 1154 |
-
|
| 1155 |
-
#
|
| 1156 |
-
|
| 1157 |
-
|
| 1158 |
-
|
| 1159 |
-
|
| 1160 |
-
|
| 1161 |
-
|
| 1162 |
-
|
| 1163 |
-
|
| 1164 |
-
|
| 1165 |
-
|
| 1166 |
-
if ent in val_lower:
|
| 1167 |
-
graph_results[cp_id] = graph_results.get(cp_id, 0) + 0.25
|
| 1168 |
-
break # one match per CP is enough
|
| 1169 |
|
| 1170 |
type_keywords = {
|
| 1171 |
"relationship": ["relationship", "brother", "sister", "friend", "rival",
|
|
@@ -1183,7 +1220,6 @@ class MnemoEngine:
|
|
| 1183 |
graph_results[cp_id] = graph_results.get(cp_id, 0) + 0.2
|
| 1184 |
|
| 1185 |
# === SIGNAL 2: Semantic search on ALL CP embeddings (always runs) ===
|
| 1186 |
-
# v6.2: Lowered threshold from 0.25 to 0.20 for better recall
|
| 1187 |
semantic_results: Dict[str, float] = {}
|
| 1188 |
for cp_id, cp in self.connection_points.items():
|
| 1189 |
if active_sessions and cp.session_id and cp.session_id not in active_sessions:
|
|
@@ -1195,8 +1231,6 @@ class MnemoEngine:
|
|
| 1195 |
semantic_results[cp_id] = sim
|
| 1196 |
|
| 1197 |
# === COMBINE: semantic * 0.6 + graph * 0.4 ===
|
| 1198 |
-
# Semantic is primary (finds cross-entity relevance)
|
| 1199 |
-
# Graph boosts entity-matched results
|
| 1200 |
if active_sessions:
|
| 1201 |
graph_results = {
|
| 1202 |
cp_id: score for cp_id, score in graph_results.items()
|
|
|
|
| 1093 |
|
| 1094 |
def graph_search(self, query: str, top_k: int = 15,
|
| 1095 |
active_sessions: Optional[List[str]] = None) -> List[dict]:
|
| 1096 |
+
"""v6.3: Case-insensitive entity recognition.
|
| 1097 |
|
| 1098 |
+
v6.3 changes:
|
| 1099 |
+
- NER no longer requires Title Case. "tell me about alistair" now works.
|
| 1100 |
+
- Three-signal entity extraction:
|
| 1101 |
+
1a. Title Case regex (original, high confidence)
|
| 1102 |
+
1b. Case-insensitive word matching against EntityIndex keys
|
| 1103 |
+
1c. Case-insensitive multi-word matching against full entity names
|
| 1104 |
+
- Stopword list prevents false positives ("tell", "about", "write")
|
| 1105 |
|
| 1106 |
+
v6.2: word-level EntityIndex, value content scan, lowered threshold
|
| 1107 |
v6.1: both signals always contribute.
|
| 1108 |
"""
|
| 1109 |
# Computed outside lock to prevent blocking concurrent API requests
|
| 1110 |
query_emb = self._get_embedding(query)
|
| 1111 |
|
| 1112 |
+
# NER stopwords — common words that should never be treated as entities
|
| 1113 |
+
_NER_STOP = {
|
| 1114 |
+
'the','this','that','these','those','what','which','who','whom',
|
| 1115 |
+
'where','when','how','why','will','would','could','should','can',
|
| 1116 |
+
'may','might','shall','must','about','with','from','into','through',
|
| 1117 |
+
'during','before','after','between','under','above','does','have',
|
| 1118 |
+
'has','had','was','were','been','being','are','not','but','and',
|
| 1119 |
+
'for','nor','yet','also','just','very','too','some','any','all',
|
| 1120 |
+
'each','every','both','few','more','most','other','only','own',
|
| 1121 |
+
'than','then','now','here','there','tell','show','give','get',
|
| 1122 |
+
'find','know','remember','recall','write','create','describe',
|
| 1123 |
+
'make','help','please','scene','chapter','story','book',
|
| 1124 |
+
'character','plot','setting','like','want','need','think',
|
| 1125 |
+
'said','says','going','come','came','take','took','keep',
|
| 1126 |
+
}
|
| 1127 |
+
|
| 1128 |
with self._lock:
|
| 1129 |
self.stats["graph_searches"] += 1
|
| 1130 |
|
| 1131 |
+
# === SIGNAL 1: Entity graph traversal ===
|
| 1132 |
+
|
| 1133 |
+
# 1a. Title Case regex (original — still useful for proper nouns)
|
| 1134 |
entity_candidates = set(re.findall(r'\b[A-Z][a-z]{2,}\b', query))
|
| 1135 |
+
|
| 1136 |
+
# 1b. Case-insensitive: check ALL query words against EntityIndex
|
| 1137 |
+
# This catches "tell me about alistair" where "alistair" is lowercase
|
| 1138 |
+
query_words_raw = re.findall(r"\b\w{3,}\b", query)
|
| 1139 |
+
query_words_lower = {w.lower() for w in query_words_raw} - _NER_STOP
|
| 1140 |
+
|
| 1141 |
+
# Collect all known entity keys (full names + individual words)
|
| 1142 |
+
known_full = set(self.entity_index.by_entity.keys()) # "alistair fitzroy", "sebastian carlisle"
|
| 1143 |
+
known_words = set(self.entity_index.by_entity_word.keys()) # "alistair", "fitzroy", "sebastian"
|
| 1144 |
+
known_all = known_full | known_words
|
| 1145 |
+
|
| 1146 |
+
# Match query words against known entities (case-insensitive)
|
| 1147 |
+
for qw in query_words_lower:
|
| 1148 |
+
if qw in known_all:
|
| 1149 |
+
entity_candidates.add(qw.title()) # Add as Title Case for uniform processing
|
| 1150 |
+
|
| 1151 |
+
# 1c. Multi-word entity detection (case-insensitive)
|
| 1152 |
+
query_lower = query.lower()
|
| 1153 |
multi_word_entities = set()
|
| 1154 |
+
for full_name in known_full:
|
| 1155 |
+
if ' ' in full_name and full_name in query_lower:
|
| 1156 |
+
multi_word_entities.add(full_name)
|
| 1157 |
+
|
| 1158 |
+
# Also detect adjacent capitalized words (original v6.2 logic)
|
| 1159 |
+
words = query.split()
|
| 1160 |
i = 0
|
| 1161 |
while i < len(words):
|
| 1162 |
if re.match(r'^[A-Z][a-z]{2,}$', words[i]):
|
|
|
|
| 1166 |
parts.append(words[j])
|
| 1167 |
j += 1
|
| 1168 |
if len(parts) >= 2:
|
| 1169 |
+
multi_word_entities.add(" ".join(parts).lower())
|
| 1170 |
i = j
|
| 1171 |
else:
|
| 1172 |
i += 1
|
| 1173 |
|
| 1174 |
graph_results: Dict[str, float] = {}
|
| 1175 |
|
| 1176 |
+
# Single-word entity lookups
|
| 1177 |
for candidate in entity_candidates:
|
| 1178 |
cp_ids = self.entity_index.lookup_entity(candidate)
|
| 1179 |
for cp_id in cp_ids:
|
|
|
|
| 1185 |
for cp_id in conn_ids:
|
| 1186 |
graph_results[cp_id] = graph_results.get(cp_id, 0) + 0.6
|
| 1187 |
|
| 1188 |
+
# Multi-word entity lookups (exact match on full name)
|
| 1189 |
for mw_entity in multi_word_entities:
|
| 1190 |
+
mw_key = mw_entity.lower()
|
| 1191 |
+
cp_ids = self.entity_index.by_entity.get(mw_key, [])
|
| 1192 |
for cp_id in cp_ids:
|
| 1193 |
+
graph_results[cp_id] = graph_results.get(cp_id, 0) + 0.5
|
| 1194 |
+
|
| 1195 |
+
# Value content scan — entity names in CP value text
|
| 1196 |
+
entity_lower = {e.lower() for e in entity_candidates if len(e) >= 4}
|
| 1197 |
+
if entity_lower:
|
| 1198 |
+
for cp_id, cp in self.connection_points.items():
|
| 1199 |
+
if cp_id in graph_results:
|
| 1200 |
+
continue
|
| 1201 |
+
val_lower = cp.value.lower()
|
| 1202 |
+
for ent in entity_lower:
|
| 1203 |
+
if ent in val_lower:
|
| 1204 |
+
graph_results[cp_id] = graph_results.get(cp_id, 0) + 0.25
|
| 1205 |
+
break
|
|
|
|
|
|
|
|
|
|
| 1206 |
|
| 1207 |
type_keywords = {
|
| 1208 |
"relationship": ["relationship", "brother", "sister", "friend", "rival",
|
|
|
|
| 1220 |
graph_results[cp_id] = graph_results.get(cp_id, 0) + 0.2
|
| 1221 |
|
| 1222 |
# === SIGNAL 2: Semantic search on ALL CP embeddings (always runs) ===
|
|
|
|
| 1223 |
semantic_results: Dict[str, float] = {}
|
| 1224 |
for cp_id, cp in self.connection_points.items():
|
| 1225 |
if active_sessions and cp.session_id and cp.session_id not in active_sessions:
|
|
|
|
| 1231 |
semantic_results[cp_id] = sim
|
| 1232 |
|
| 1233 |
# === COMBINE: semantic * 0.6 + graph * 0.4 ===
|
|
|
|
|
|
|
| 1234 |
if active_sessions:
|
| 1235 |
graph_results = {
|
| 1236 |
cp_id: score for cp_id, score in graph_results.items()
|