Fabuilds committed on
Commit
66363ae
·
verified ·
1 Parent(s): 3eb08b0

Delete in_memory_index.py

Browse files
Files changed (1) hide show
  1. in_memory_index.py +0 -471
in_memory_index.py DELETED
@@ -1,471 +0,0 @@
1
- """
2
- IN-MEMORY PATTERN INDEX
3
- Fast lookup without HDD writes - merge existing + conversation + Gemini chat patterns
4
- """
5
- import sys
6
- import os
7
- import json
8
- import time
9
- import re
10
-
11
- try:
12
- from System.semantic_embedder import SemanticEmbedder
13
- except ImportError:
14
- try:
15
- from semantic_embedder import SemanticEmbedder
16
- except ImportError:
17
- # Final fallback for scripts in Shop/
18
- sys.path.append(os.path.dirname(os.path.abspath(__file__)))
19
- from semantic_embedder import SemanticEmbedder
20
- # Existing 5 lattice patterns
21
- LATTICE_PATTERNS = {
22
- "PATTERN_SINGLETON_DATABASE": {
23
- "lba": 8534859776,
24
- "domain": "SOFTWARE_ARCHITECTURE",
25
- "problem": "Need to ensure only one database connection exists",
26
- "solution": "Singleton pattern with thread-safe initialization",
27
- "reusability": 9,
28
- "confidence": 0.82
29
- },
30
- "PATTERN_REACT_HOOKS_DEPS": {
31
- "lba": 3371401216,
32
- "domain": "WEB_DEVELOPMENT",
33
- "problem": "React component not re-rendering when props change",
34
- "solution": "Add dependency array to useEffect",
35
- "reusability": 10,
36
- "confidence": 0.85
37
- }
38
- }
39
-
40
- CONVERSATION_PATTERNS = {
41
- "AGENT_IS_LATTICE": {
42
- "domain": "CONCEPTUAL",
43
- "problem": "Separation between agent and data structure",
44
- "solution": "Agent is non-orientable surface - no inside/outside separation",
45
- "confidence": 0.95
46
- }
47
- }
48
-
49
- class InMemoryIndex:
50
- """
51
- Adaptive Distillation Index.
52
-
53
- Tracks pattern hit counts to distinguish signal from noise:
54
- - Once-patterns (1 hit) = UNCONFIRMED (might be noise)
55
- - Twice-patterns (2 hits) = PLAUSIBLE
56
- - Multi-patterns (3+ hits) = CONFIRMED (logic)
57
-
58
- The lattice self-cleans through use. Signal persists, noise decays.
59
- """
60
-
61
- # Hit tracking file handled dynamically in __init__
62
- HIT_LOG_PATH = None
63
-
64
- # Magnitude layers: logic exists in layers
65
- # Layer 0: Surface (keyword substring match) = low magnitude
66
- # Layer 1: Structural (multi-word + domain match) = medium magnitude
67
- # Layer 2: Conceptual (phrase match in problem/solution) = high magnitude
68
- # Decay: magnitude halves every DECAY_HALF_LIFE seconds without a hit
69
- DECAY_HALF_LIFE = 86400 # 24 hours
70
-
71
- MAGNITUDE_LAYERS = {
72
- "surface": 0.3, # keyword substring match (low relevance)
73
- "structural": 0.6, # multi-word + domain match (medium)
74
- "conceptual": 1.0, # full phrase match in problem/solution (high)
75
- }
76
-
77
- def __init__(self):
78
- # Handle relative pathing for portability
79
- BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
80
- self.LATTICE_DB_DIR = os.path.join(BASE_DIR, "Lattice_DB")
81
- self.HIT_LOG_PATH = os.path.join(self.LATTICE_DB_DIR, "pattern_hits.json")
82
-
83
- index_path = os.path.join(self.LATTICE_DB_DIR, "dual_anchor_index.json")
84
-
85
- if os.path.exists(index_path):
86
- with open(index_path, 'r') as f:
87
- data = json.load(f)
88
- self.patterns = data.get('patterns', {})
89
- sources = data.get('sources', {})
90
- print(f"[INDEX] Loaded {len(self.patterns)} dual-anchor patterns")
91
- else:
92
- # Fallback to original patterns
93
- self.patterns = {}
94
- self.load_lattice_patterns()
95
- self.load_conversation_patterns()
96
- print("[INDEX] Dual-anchor index not found, using original 16 patterns")
97
-
98
- # Load hit tracking (magnitude-weighted)
99
- self.hits = self._load_hits()
100
-
101
- # Calculate adaptive threshold based on pattern count
102
- self.base_threshold = 0.3 + (0.4 * min(len(self.patterns) / 200, 1.0))
103
-
104
- # Initialize Semantic Engine
105
- print("[INDEX] Initializing Semantic Manifold...")
106
- self.embedder = SemanticEmbedder()
107
- self.pattern_vectors = {}
108
- self._reindex_vectors()
109
-
110
- confirmed = sum(1 for h in self.hits.values() if self._total_magnitude(h) >= 2.0)
111
- unconfirmed = sum(1 for h in self.hits.values() if 0 < self._total_magnitude(h) < 1.0)
112
- print(f"[DISTILLER] Confirmed: {confirmed} | Unconfirmed: {unconfirmed} | Threshold: {self.base_threshold:.2f}")
113
- self.word_freq = self._calculate_word_freq()
114
-
115
- def _reindex_vectors(self):
116
- """Pre-calculates semantic embeddings for all known patterns."""
117
- print(f"[INDEX]: Generating embeddings for {len(self.patterns)} patterns...")
118
- for label, p in self.patterns.items():
119
- # Combine problem + solution for semantic context
120
- context = f"{p.get('problem', '')} {p.get('solution', '')} {label}"
121
- self.pattern_vectors[label] = self.embedder.embed_text(context)
122
- print(f"[INDEX]: ✅ Semantic manifold mapped ({len(self.pattern_vectors)} vectors).")
123
-
124
- def _calculate_word_freq(self):
125
- """Calculate inverse pattern frequency (IPF) for lean semantic weighting."""
126
- freq = {}
127
- for p in self.patterns.values():
128
- text = (p.get('problem','') + " " + p.get('solution','')).lower()
129
- words = set(re.findall(r'\w+', text))
130
- for w in words:
131
- freq[w] = freq.get(w, 0) + 1
132
- return freq
133
-
134
- def _get_word_weight(self, word, structural_weight):
135
- """Calculate semantic weight: rare words matter more."""
136
- count = self.word_freq.get(word, 0)
137
- if count == 0: return structural_weight
138
- # Logarithmic scale for IPF: weight = 1 + log(total / count)
139
- import math
140
- ipf = 1.0 + math.log(len(self.patterns) / count)
141
- return structural_weight * ipf
142
-
143
- def _fuzzy_match(self, w1, w2):
144
- """Lightweight Jaccard similarity for fuzzy matching."""
145
- if w1 == w2: return 1.0
146
- if len(w1) < 4 or len(w2) < 4: return 0.0
147
- s1, s2 = set(w1), set(w2)
148
- intersection = len(s1 & s2)
149
- union = len(s1 | s2)
150
- score = intersection / union
151
- return score if score > 0.7 else 0.0
152
-
153
- def _load_hits(self):
154
- """Load magnitude-weighted hit data from disk."""
155
- if os.path.exists(self.HIT_LOG_PATH):
156
- with open(self.HIT_LOG_PATH, 'r') as f:
157
- data = json.load(f)
158
- # Backward compat: convert flat counts to magnitude format
159
- for label, val in data.items():
160
- if isinstance(val, (int, float)):
161
- data[label] = {"count": int(val), "magnitude": float(val) * 0.5, "layers": []}
162
- return data
163
- return {}
164
-
165
- def _save_hits(self):
166
- """Persist hit data to disk."""
167
- with open(self.HIT_LOG_PATH, 'w') as f:
168
- json.dump(self.hits, f, indent=2)
169
-
170
- def _total_magnitude(self, hit_data):
171
- """Get current magnitude with decay applied."""
172
- if isinstance(hit_data, dict):
173
- raw_mag = hit_data.get('magnitude', 0)
174
- last_hit = hit_data.get('last_hit', 0)
175
- if last_hit > 0 and raw_mag > 0:
176
- elapsed = time.time() - last_hit
177
- # Halve every DECAY_HALF_LIFE seconds
178
- decay_factor = 0.5 ** (elapsed / self.DECAY_HALF_LIFE)
179
- return raw_mag * decay_factor
180
- return raw_mag
181
- return float(hit_data) * 0.5 # backward compat
182
-
183
- def _classify_relevance(self, relevance):
184
- """Classify match into magnitude layer based on relevance score."""
185
- if relevance >= 0.7:
186
- return "conceptual", self.MAGNITUDE_LAYERS["conceptual"]
187
- elif relevance >= 0.4:
188
- return "structural", self.MAGNITUDE_LAYERS["structural"]
189
- else:
190
- return "surface", self.MAGNITUDE_LAYERS["surface"]
191
-
192
- def _record_hit(self, label, relevance):
193
- """Record a hit. Re-mention restores magnitude to peak."""
194
- layer_name, magnitude = self._classify_relevance(relevance)
195
-
196
- if label not in self.hits:
197
- self.hits[label] = {"count": 0, "magnitude": 0.0, "peak": 0.0, "layers": [], "last_hit": 0}
198
-
199
- h = self.hits[label]
200
- h["count"] += 1
201
- h["last_hit"] = time.time()
202
-
203
- # Restore to peak first (re-mention recovery), then add new magnitude
204
- current_peak = h.get("peak", h["magnitude"])
205
- h["magnitude"] = current_peak + magnitude
206
- h["peak"] = h["magnitude"] # new peak
207
-
208
- # Track which layers have been hit
209
- if layer_name not in h["layers"]:
210
- h["layers"].append(layer_name)
211
-
212
- def get_status(self, label):
213
- """Get distillation status based on decayed magnitude."""
214
- hit_data = self.hits.get(label, {})
215
- mag = self._total_magnitude(hit_data) # applies decay
216
- layers = hit_data.get('layers', []) if isinstance(hit_data, dict) else []
217
-
218
- if mag == 0:
219
- return "NEW"
220
- elif mag < 1.0:
221
- return "UNCONFIRMED" # surface-only = might be noise
222
- elif mag < 2.0:
223
- return "PLAUSIBLE"
224
- elif len(layers) >= 2:
225
- return "DEEP_LOGIC" # hit at multiple layers = real
226
- else:
227
- return "CONFIRMED" # high magnitude single layer
228
-
229
- def add_note(self, text, domain="NOTE", forced_label=None):
230
- """Add a new pattern from freeform text. Self-organizing entry point."""
231
- if forced_label:
232
- label = forced_label
233
- else:
234
- # Auto-generate label from text
235
- words = re.sub(r'[^a-zA-Z0-9\s]', '', text).upper().split()
236
- # Take first 4 meaningful words for label
237
- label_words = [w for w in words if len(w) > 2][:4]
238
- label = "_".join(label_words) if label_words else "NOTE_" + str(int(time.time()))
239
-
240
- # Don't overwrite existing patterns unless forced
241
- if label in self.patterns and not forced_label:
242
- label = label + "_" + str(int(time.time()) % 10000)
243
-
244
- self.patterns[label] = {
245
- "problem": text,
246
- "solution": text,
247
- "domain": domain,
248
- "confidence": 0.5, # starts neutral
249
- "source": "notepad",
250
- "type": "NOTE",
251
- "created": time.time(),
252
- }
253
-
254
- # Initial hit at conceptual layer (you wrote it = you meant it)
255
- self._record_hit(label, 1.0)
256
- self._save_hits()
257
-
258
- # Update threshold for new pattern count
259
- self.base_threshold = 0.3 + (0.4 * min(len(self.patterns) / 200, 1.0))
260
-
261
- return label
262
-
263
- def load_lattice_patterns(self):
264
- """Load existing 5 patterns from lattice."""
265
- for label, data in LATTICE_PATTERNS.items():
266
- self.patterns[label] = {
267
- **data,
268
- "source": "lattice",
269
- "type": "CODE_PATTERN"
270
- }
271
-
272
- def load_conversation_patterns(self):
273
- """Load 11 patterns from this conversation."""
274
- for label, data in CONVERSATION_PATTERNS.items():
275
- self.patterns[label] = {
276
- **data,
277
- "source": "conversation_0938ac6c",
278
- "type": "INSIGHT"
279
- }
280
-
281
- def search(self, query, threshold=None, record=True):
282
- """
283
- Adaptive distillation search.
284
-
285
- - Matches patterns using phrase + word relevance
286
- - Integrates 384-dim semantic similarity from manifolds
287
- - Records hits for matched patterns
288
- """
289
- if threshold is None:
290
- threshold = self.base_threshold
291
-
292
- results = []
293
- query_lower = query.lower()
294
-
295
- # 1. Generate Query Vector
296
- query_vector = self.embedder.embed_text(query)
297
-
298
- # 2. Hard matching patterns
299
- STRUCTURAL_WORDS = { 'a', 'an', 'the', 'is', 'it', 'in', 'on', 'at', 'to', 'of', 'and', 'or', 'but' }
300
- query_words = [(w, self._get_word_weight(w, 0.3 if w in STRUCTURAL_WORDS else 1.0)) for w in query_lower.split()]
301
- links = re.findall(r'\[\[(\w+)\]\]', query_lower)
302
-
303
- for label, pattern in self.patterns.items():
304
- problem = pattern.get('problem', '').lower()
305
- solution = pattern.get('solution', '').lower()
306
- label_text = label.lower()
307
-
308
- relevance = 0
309
-
310
- # Semantic Boost (Manifold Pathfinding)
311
- pattern_vector = self.pattern_vectors.get(label)
312
- semantic_score = 0 # Initialize semantic_score
313
- if pattern_vector:
314
- semantic_score = self.embedder.cosine_similarity(query_vector, pattern_vector)
315
- # Apply high weight to semantic resonance (The "LOVE" Anchor)
316
- relevance += (semantic_score * 0.8)
317
-
318
- # Exact phrase match (The 0x52 Anchor)
319
- if query_lower in problem: relevance += 0.4
320
- if query_lower in solution: relevance += 0.3
321
- if query_lower in label_text: relevance += 0.5
322
-
323
- # Link boost
324
- if label.lower() in links: relevance += 2.0
325
-
326
- # Combine logic
327
- if relevance >= threshold:
328
- status = self.get_status(label)
329
-
330
- # Record magnitude-weighted hit
331
- if record:
332
- self._record_hit(label, relevance)
333
-
334
- hit_data = self.hits.get(label, {})
335
- results.append({
336
- "label": label,
337
- "relevance": relevance,
338
- "confidence": pattern.get('confidence', 0.5),
339
- "status": status,
340
- "hits": hit_data.get('count', 0) if isinstance(hit_data, dict) else 0,
341
- "magnitude": self._total_magnitude(hit_data),
342
- "layers": hit_data.get('layers', []) if isinstance(hit_data, dict) else [],
343
- **pattern
344
- })
345
-
346
- # Sort by: confirmed first, then relevance, then confidence
347
- status_order = {"DEEP_LOGIC": 4, "CONFIRMED": 3, "PLAUSIBLE": 2, "UNCONFIRMED": 1, "NEW": 0}
348
- results.sort(key=lambda x: (
349
- status_order.get(x.get('status', 'NEW'), 0),
350
- x['relevance'],
351
- x['confidence']
352
- ), reverse=True)
353
-
354
- # Save hits after search
355
- if record:
356
- self._save_hits()
357
-
358
- return results
359
-
360
- def distillation_report(self):
361
- """Report on pattern distillation with magnitude layers."""
362
- deep_logic = []
363
- confirmed = []
364
- plausible = []
365
- unconfirmed = []
366
- new_patterns = []
367
-
368
- for label in self.patterns:
369
- status = self.get_status(label)
370
- hit_data = self.hits.get(label, {})
371
- mag = self._total_magnitude(hit_data)
372
- layers = hit_data.get('layers', []) if isinstance(hit_data, dict) else []
373
-
374
- entry = (label, mag, layers)
375
- if status == "DEEP_LOGIC":
376
- deep_logic.append(entry)
377
- elif status == "CONFIRMED":
378
- confirmed.append(entry)
379
- elif status == "PLAUSIBLE":
380
- plausible.append(entry)
381
- elif status == "UNCONFIRMED":
382
- unconfirmed.append(entry)
383
- else:
384
- new_patterns.append(entry)
385
-
386
- print(f"\n{'='*60}")
387
- print(f"DISTILLATION REPORT (Magnitude Layers)")
388
- print(f"{'='*60}")
389
- print(f"Total patterns: {len(self.patterns)}")
390
- print(f" DEEP_LOGIC (multi-layer): {len(deep_logic)} = verified across layers")
391
- print(f" CONFIRMED (mag >= 2.0): {len(confirmed)} = strong signal")
392
- print(f" PLAUSIBLE (mag 1.0-2.0): {len(plausible)} = growing")
393
- print(f" UNCONFIRMED (mag < 1.0): {len(unconfirmed)} = potential noise")
394
- print(f" NEW (untested): {len(new_patterns)}")
395
- print(f"\nAdaptive threshold: {self.base_threshold:.2f}")
396
-
397
- if deep_logic:
398
- print(f"\nDEEP LOGIC (multi-layer verified):")
399
- for label, mag, layers in sorted(deep_logic, key=lambda x: x[1], reverse=True):
400
- print(f" [mag:{mag:.1f}] [{'+'.join(layers)}] {label}")
401
-
402
- if confirmed:
403
- print(f"\nCONFIRMED (strong signal):")
404
- for label, mag, layers in sorted(confirmed, key=lambda x: x[1], reverse=True):
405
- print(f" [mag:{mag:.1f}] [{'+'.join(layers)}] {label}")
406
-
407
- if unconfirmed:
408
- print(f"\nUNCONFIRMED (potential noise):")
409
- for label, mag, layers in unconfirmed:
410
- print(f" [mag:{mag:.1f}] [{'+'.join(layers)}] {label}")
411
-
412
- return {
413
- "confirmed": len(confirmed),
414
- "plausible": len(plausible),
415
- "unconfirmed": len(unconfirmed),
416
- "new": len(new_patterns),
417
- "threshold": self.base_threshold
418
- }
419
-
420
- def save_to_json(self, path):
421
- """Persist to JSON for inspection."""
422
- with open(path, 'w') as f:
423
- json.dump({
424
- "total_patterns": len(self.patterns),
425
- "sources": {
426
- "lattice": len(LATTICE_PATTERNS),
427
- "conversation": len(CONVERSATION_PATTERNS)
428
- },
429
- "patterns": self.patterns
430
- }, f, indent=2)
431
- print(f"\n💾 Saved index to: {path}")
432
-
433
- def stats(self):
434
- """Print statistics."""
435
- print(f"\n{'='*60}")
436
- print(f"IN-MEMORY PATTERN INDEX")
437
- print(f"{'='*60}")
438
- print(f"Total patterns: {len(self.patterns)}")
439
- print(f" From lattice: {len(LATTICE_PATTERNS)}")
440
- print(f" From conversation: {len(CONVERSATION_PATTERNS)}")
441
- print(f"Average confidence: {sum(p.get('confidence', 0.5) for p in self.patterns.values()) / len(self.patterns):.0%}")
442
-
443
- # Domain breakdown
444
- domains = {}
445
- for p in self.patterns.values():
446
- d = p.get('domain', 'UNKNOWN')
447
- domains[d] = domains.get(d, 0) + 1
448
-
449
- print(f"\nDomains:")
450
- for domain, count in sorted(domains.items(), key=lambda x: x[1], reverse=True):
451
- print(f" {domain}: {count}")
452
-
453
- if __name__ == "__main__":
454
- index = InMemoryIndex()
455
- index.stats()
456
-
457
- # Save to JSON
458
- save_path = os.path.join(index.LATTICE_DB_DIR, "in_memory_index.json")
459
- index.save_to_json(save_path)
460
-
461
- # Test search
462
- print(f"\n{'='*60}")
463
- print(f"TEST SEARCHES")
464
- print(f"{'='*60}\n")
465
-
466
- for query in ["singleton", "react", "lattice", "honest"]:
467
- results = index.search(query)
468
- print(f"Query: '{query}' → {len(results)} results")
469
- if results:
470
- print(f" Top: {results[0]['label']} ({results[0]['confidence']:.0%})")
471
- print()