Executor-Tyrant-Framework committed
Commit 899326d · verified · 1 Parent(s): 89e75d3

Delete recursive_context.py

Files changed (1): recursive_context.py (+0, -979)
recursive_context.py DELETED
@@ -1,979 +0,0 @@
"""
Recursive Context Manager for Clawdbot

CHANGELOG [2025-01-28 - Josh]
Implements MIT's Recursive Language Model technique for unlimited context.

CHANGELOG [2025-01-30 - Claude]
Added HuggingFace Dataset persistence layer.
PROBLEM: /workspace gets wiped on Space restart, killing ChromaDB data.
SOLUTION: Sync ChromaDB collections to a private HF Dataset repo.
- On startup: Pull from Dataset -> restore to ChromaDB
- On save: Also push to Dataset (debounced to avoid spam)
- Periodic backup every N conversation turns
This gives us FREE, VERSIONED, PERSISTENT storage that survives restarts.

CHANGELOG [2025-01-31 - Claude]
FIXED: Multiple persistence failures causing "Conversations Saved: 0"
ROOT CAUSES FOUND:
1. ChromaDB path was /workspace/chroma_db - EPHEMERAL on HF Spaces Docker.
   Container filesystem gets wiped on every restart. Only /data survives.
2. Cloud backup (HF Dataset) silently did nothing when MEMORY_REPO wasn't set.
   No errors, no warnings in UI - just quiet failure.
3. Debounce timer (30s) could prevent saves if the Space sleeps quickly.
4. HF Spaces sometimes SIGKILL containers without sending SIGTERM,
   so shutdown hooks never fire and pending saves are lost.

CHANGELOG [2025-01-31 - Claude + Gemini]
FIXED: PermissionError on /.cache during ChromaDB embedding model download.
ROOT CAUSE: ChromaDB's ONNXMiniLM_L6_V2 embedding function ignores env vars
like XDG_CACHE_HOME and hardcodes its download path based on ~/.cache.
In Docker containers where HOME isn't set or is /, this resolves to /.cache,
which is owned by root and not writable by UID 1000 (the HF Spaces runtime user).
FIX (Gemini's approach): Import ONNXMiniLM_L6_V2 directly, override its
DOWNLOAD_PATH attribute to point at CHROMA_CACHE_DIR, and pass the configured
embedding function explicitly to every get_or_create_collection() call.
ALSO: Switched from separate get_collection/create_collection to atomic
get_or_create_collection() to avoid race conditions on half-built collections.

PERSISTENCE ARCHITECTURE:
    /data/chroma_db (survives restarts if persistent storage enabled)
        |
        v
    ChromaDB (fast local queries) <--> HF Dataset (durable cloud storage)
                                           ^
                                           Private repo: username/clawdbot-memory
                                           Contains: conversations.json

REFERENCE: https://www.youtube.com/watch?v=huszaaJPjU8
"MIT basically solved unlimited context windows"

APPROACH:
Instead of cramming everything into context (hits limits) or summarizing
(lossy compression), we:

1. Store the entire codebase in a searchable environment
2. Give the model TOOLS to query what it needs
3. The model recursively retrieves relevant pieces
4. No summarization loss - full-fidelity access

This is like RAG, but IN-ENVIRONMENT, with the model actively deciding
what context it needs rather than us guessing upfront.

EXAMPLE FLOW:
User: "How does Genesis handle surprise?"
Model: search_code("Genesis surprise detection")
    -> Finds: genesis/substrate.py, genesis/attention.py
Model: read_file("genesis/substrate.py", lines 145-167)
    -> Gets actual implementation
Model: search_testament("surprise detection rationale")
    -> Gets design decision
Model: Synthesizes answer from retrieved pieces

NO CONTEXT WINDOW LIMIT - just selective retrieval.
"""
from pathlib import Path
from typing import List, Dict, Optional
import chromadb
from chromadb.config import Settings
from chromadb.utils.embedding_functions import ONNXMiniLM_L6_V2
import hashlib
import json
import os
import time
import threading


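# =============================================================================
# ILLUSTRATIVE SKETCH (not part of the original module): one way the
# docstring's EXAMPLE FLOW could be driven. `model` is a hypothetical callable
# that, given the transcript so far, returns either {"answer": ...} or a tool
# call like {"tool": "search_code", "args": {"query": ...}}; only the tool
# names correspond to real RecursiveContextManager methods defined below.
# =============================================================================
def _example_recursive_answer(model, ctx: "RecursiveContextManager",
                              question: str, max_steps: int = 8) -> str:
    tools = {
        "search_code": ctx.search_code,
        "read_file": ctx.read_file,
        "search_testament": ctx.search_testament,
        "list_files": ctx.list_files,
        "search_conversations": ctx.search_conversations,
    }
    transcript = [f"QUESTION: {question}"]
    for _ in range(max_steps):
        step = model("\n".join(transcript))  # model decides the next action
        if "answer" in step:
            return step["answer"]  # model is done retrieving
        result = tools[step["tool"]](**step["args"])  # recursive retrieval
        transcript.append(f"{step['tool']} -> {json.dumps(result, default=str)}")
    return "Step budget exhausted without a final answer."

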
# =============================================================================
# PERSISTENT STORAGE PATH SELECTION
# =============================================================================
# CHANGELOG [2025-01-31 - Claude]
# HF Spaces Docker containers wipe everything EXCEPT /data on restart.
# We try /data first (persistent), falling back to /workspace (ephemeral).
# This decision is made once at module load and logged clearly.
# =============================================================================

def _select_chroma_path():
    """
    Choose the best available path for ChromaDB storage.

    CHANGELOG [2025-01-31 - Claude]
    PRIORITY ORDER:
    1. /data/chroma_db      - HF Spaces persistent volume (survives restarts)
    2. /workspace/chroma_db - Container filesystem (wiped on restart)

    WHY /data:
    HuggingFace Spaces with the Docker SDK provide /data as persistent storage.
    It must be enabled in Space settings (Settings -> Persistent Storage).
    The free tier gets 20GB. This is the ONLY path that survives container restarts.

    WHY FALLBACK:
    If /data doesn't exist or isn't writable (persistent storage not enabled),
    we still need ChromaDB to work for the current session. /workspace works
    fine within a single session; it just doesn't survive restarts.
    """
    data_path = Path("/data/chroma_db")
    try:
        data_path.mkdir(parents=True, exist_ok=True)
        # Test write access by creating and removing a temp file
        test_file = data_path / ".write_test"
        test_file.write_text("test")
        test_file.unlink()
        print("=" * 60)
        print("STORAGE: Using /data/chroma_db (PERSISTENT - survives restarts)")
        print("=" * 60)
        return str(data_path)
    except (OSError, PermissionError) as e:
        print("=" * 60)
        print(f"STORAGE WARNING: /data not available ({e})")
        print("STORAGE: Falling back to /workspace/chroma_db (EPHEMERAL)")
        print("STORAGE: Memory will be lost on restart!")
        print("STORAGE: Enable persistent storage in Space Settings,")
        print("STORAGE: or set the MEMORY_REPO secret for cloud backup.")
        print("=" * 60)

    workspace_path = Path("/workspace/chroma_db")
    workspace_path.mkdir(parents=True, exist_ok=True)
    return str(workspace_path)


# Resolve once at import time so it's consistent throughout the session
CHROMA_DB_PATH = _select_chroma_path()

class HFDatasetPersistence:
    """
    Handles syncing ChromaDB data to/from HuggingFace Datasets.

    CHANGELOG [2025-01-30 - Claude]
    Created to solve the Space restart problem.

    CHANGELOG [2025-01-31 - Claude]
    FIXED: Now logs clear warnings when MEMORY_REPO isn't configured.
    Previously failed silently, making it impossible to tell why memory
    wasn't persisting. Also reduced debounce from 30s to 10s.

    CHANGELOG [2025-01-31 - Claude + Gemini]
    Added a _repo_ready guard to save_conversations() to prevent a race
    condition where saves fire before repo initialization finishes.

    WHY HF DATASETS:
    - Free storage (up to 50GB on free tier)
    - Version controlled (can roll back if corrupted)
    - Private repos available
    - Native HF integration (no extra auth needed in Spaces)
    - JSON files work great for conversation data

    ALTERNATIVES CONSIDERED:
    - Supabase: Good, but adds an external dependency
    - /data mount: Requires the persistent storage setting (now our primary!)
    - External S3: More complex, costs money
    """

    def __init__(self, repo_id: Optional[str] = None):
        """
        Initialize the persistence layer.

        Args:
            repo_id: HF Dataset repo (e.g., "username/clawdbot-memory").
                     If None, uses the MEMORY_REPO env var.
        """
        from huggingface_hub import HfApi

        self.api = HfApi()
        self.repo_id = repo_id or os.getenv("MEMORY_REPO")
        self.token = (
            os.getenv("HF_TOKEN") or
            os.getenv("HUGGING_FACE_HUB_TOKEN") or
            os.getenv("HUGGINGFACE_TOKEN")
        )

        # Track whether we've initialized the repo
        self._repo_ready = False

        # Debounce saves to avoid hammering the HF API.
        # RATIONALE: A user might send 10 messages quickly; we don't want 10 uploads.
        # CHANGELOG [2025-01-31 - Claude]: Reduced from 30s to 10s. 30s was too
        # long - Spaces can sleep after 15 minutes of inactivity, and if a user
        # sends a few messages then leaves, the debounce could eat the last save.
        self._save_lock = threading.Lock()
        self._pending_save = False
        self._last_save_time = 0
        self.SAVE_DEBOUNCE_SECONDS = 10  # Min time between cloud saves

        # CHANGELOG [2025-01-31 - Claude]
        # Log configuration status clearly on startup so it's visible in logs
        if self.repo_id and self.token:
            self._ensure_repo_exists()
            # Verify the token has write permissions.
            # CHANGELOG [2025-01-31 - Claude]
            # Gemini caught this: a read-only token will let the app start,
            # but all upload_file calls will fail with 403. Check early.
            self._verify_write_permissions()
            print(f"CLOUD BACKUP: Configured -> {self.repo_id}")
        elif not self.repo_id:
            print("=" * 60)
            print("CLOUD BACKUP: NOT CONFIGURED")
            print("Add MEMORY_REPO secret to Space settings.")
            print("Value should be: your-username/clawdbot-memory")
            print("Without this, conversations won't survive restarts")
            print("(unless /data persistent storage is enabled).")
            print("=" * 60)
        elif not self.token:
            print("CLOUD BACKUP: No HF_TOKEN found - cloud backup disabled")

    def _ensure_repo_exists(self):
        """
        Create the memory repo if it doesn't exist.

        CHANGELOG [2025-01-30 - Claude]
        Auto-creates a private Dataset repo for memory storage.

        CHANGELOG [2025-01-31 - Claude]
        Added detailed error logging. Previously just silently passed on failure,
        making it impossible to tell if the repo existed or creation failed.
        """
        if self._repo_ready:
            return

        try:
            self.api.repo_info(
                repo_id=self.repo_id,
                repo_type="dataset",
                token=self.token
            )
            print(f"Memory repo exists: {self.repo_id}")
            self._repo_ready = True
        except Exception:
            # Repo doesn't exist - try to create it
            try:
                self.api.create_repo(
                    repo_id=self.repo_id,
                    repo_type="dataset",
                    private=True,  # Keep conversations private!
                    token=self.token
                )
                print(f"Created memory repo: {self.repo_id}")
                self._repo_ready = True
            except Exception as e:
                print(f"Could not create memory repo: {e}")
                print("  Memory will not persist across restarts!")

    @property
    def is_configured(self):
        """
        Check if cloud backup is properly configured.

        CHANGELOG [2025-01-31 - Claude]
        Added so callers can check before relying on cloud backup.
        """
        return bool(self.repo_id and self.token)

    def _verify_write_permissions(self):
        """
        Check that the HF_TOKEN has write permissions.

        CHANGELOG [2025-01-31 - Claude]
        Added per Gemini's feedback: a read-only token lets the app start
        but causes all cloud saves to fail with 403. Better to catch this
        at startup and warn loudly than to discover it after losing data.

        NOTE: We don't fail hard here because the app can still function
        without cloud backup (using /data persistent storage). Just warn.
        """
        try:
            user_info = self.api.whoami(token=self.token)
            token_info = user_info.get("auth", {}).get("accessToken", {})
            token_name = token_info.get("displayName", "unknown")
            # whoami reports the token's role for classic tokens; warn
            # early if this is a read-only token.
            if token_info.get("role") == "read":
                print("CLOUD BACKUP WARNING: HF_TOKEN is READ-ONLY - cloud saves will fail with 403")
            print(f"CLOUD BACKUP: Token verified (name: {token_name})")
        except Exception as e:
            print(f"CLOUD BACKUP WARNING: Could not verify token permissions: {e}")
            print("CLOUD BACKUP WARNING: If saves fail, check that HF_TOKEN has WRITE access")

    def save_conversations(self, conversations_data: List[Dict], force: bool = False):
        """
        Save conversations to the HF Dataset.

        CHANGELOG [2025-01-30 - Claude]
        Debounced save to avoid API spam. Use force=True for shutdown saves.

        CHANGELOG [2025-01-31 - Claude]
        Now logs when a save is skipped due to missing config (was silent before).

        CHANGELOG [2025-01-31 - Claude + Gemini]
        Added a _repo_ready guard per Gemini's race-condition catch: if the repo
        hasn't finished initializing (or failed to initialize), skip the save
        rather than letting it throw an opaque HfApi error.

        Args:
            conversations_data: List of conversation dicts to save
            force: If True, save immediately, ignoring the debounce
        """
        if not self.is_configured:
            print("Cloud save skipped: MEMORY_REPO not configured")
            return False

        # CHANGELOG [2025-01-31 - Claude + Gemini]
        # Race condition guard: _ensure_repo_exists() runs in __init__ but
        # could fail (network issue, bad token, etc). If the repo isn't ready,
        # retry once, then give up for this save cycle.
        if not self._repo_ready:
            print("Cloud save skipped: memory repo not ready (retrying init...)")
            self._ensure_repo_exists()
            if not self._repo_ready:
                return False

        current_time = time.time()

        # Check debounce (unless forced)
        if not force:
            if current_time - self._last_save_time < self.SAVE_DEBOUNCE_SECONDS:
                self._pending_save = True
                return False

        with self._save_lock:
            try:
                # Save to a local temp file first
                temp_path = Path("/tmp/conversations_backup.json")
                temp_path.write_text(json.dumps(conversations_data, indent=2))

                # Upload to the HF Dataset
                self.api.upload_file(
                    path_or_fileobj=str(temp_path),
                    path_in_repo="conversations.json",
                    repo_id=self.repo_id,
                    repo_type="dataset",
                    token=self.token,
                    commit_message=f"Backup {len(conversations_data)} conversations"
                )

                self._last_save_time = current_time
                self._pending_save = False
                print(f"Cloud saved {len(conversations_data)} conversations to {self.repo_id}")
                return True

            except Exception as e:
                print(f"Failed to save conversations to cloud: {e}")
                return False

    def load_conversations(self) -> List[Dict]:
        """
        Load conversations from the HF Dataset.

        CHANGELOG [2025-01-30 - Claude]
        Called on startup to restore conversation history.

        Returns:
            List of conversation dicts, or an empty list if none found
        """
        if not self.is_configured:
            print("Cloud load skipped: MEMORY_REPO not configured")
            return []

        try:
            from huggingface_hub import hf_hub_download

            local_path = hf_hub_download(
                repo_id=self.repo_id,
                filename="conversations.json",
                repo_type="dataset",
                token=self.token
            )

            with open(local_path, 'r') as f:
                data = json.load(f)

            print(f"Cloud loaded {len(data)} conversations from {self.repo_id}")
            return data

        except Exception as e:
            # The file might not exist yet (first run)
            if "404" in str(e) or "not found" in str(e).lower():
                print(f"No existing conversations found in {self.repo_id} (first run)")
            else:
                print(f"Failed to load conversations from cloud: {e}")
            return []

    def has_pending_save(self) -> bool:
        """Check if there's a pending save that was debounced."""
        return self._pending_save


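# =============================================================================
# ILLUSTRATIVE SKETCH (not part of the original module): the debounce above
# can leave a save pending forever if no further turns arrive. A minimal
# background flusher could poll has_pending_save() from a daemon thread.
# `get_latest_data` is a hypothetical zero-arg callable that returns the
# current conversation list; the interval is an assumption.
# =============================================================================
def _example_start_pending_flusher(persistence: HFDatasetPersistence,
                                   get_latest_data, interval: float = 15.0):
    def _loop():
        while True:
            time.sleep(interval)
            if persistence.has_pending_save():
                # By now the 10s debounce window has usually elapsed, so this
                # retry will go through without needing force=True.
                persistence.save_conversations(get_latest_data())

    threading.Thread(target=_loop, daemon=True).start()

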
class RecursiveContextManager:
    """
    Manages unlimited context via recursive retrieval.

    The model has TOOLS to search and read the codebase selectively,
    rather than loading everything upfront.

    CHANGELOG [2025-01-30 - Claude]
    Added HF Dataset persistence. Conversations now survive Space restarts.

    CHANGELOG [2025-01-31 - Claude]
    FIXED: ChromaDB path now uses /data (persistent) instead of /workspace (ephemeral).
    FIXED: Cloud backup logs clear warnings when not configured.
    FIXED: The first conversation turn always triggers an immediate cloud save.

    CHANGELOG [2025-01-31 - Claude + Gemini]
    FIXED: PermissionError on /.cache by overriding ONNXMiniLM_L6_V2.DOWNLOAD_PATH.
    FIXED: Switched to get_or_create_collection() for atomic collection init.
    FIXED: BACKUP_EVERY_N_SAVES set to 1 while validating that persistence works.
    """

    def __init__(self, repo_path: str):
        """
        Initialize the context manager for a repository.

        Args:
            repo_path: Path to the code repository
        """
        self.repo_path = Path(repo_path)

        # Initialize the persistence layer FIRST.
        # RATIONALE: We need this before ChromaDB so we can restore data.
        self.persistence = HFDatasetPersistence()

        # =================================================================
        # EXPLICIT EMBEDDING FUNCTION WITH WRITABLE CACHE PATH
        # =================================================================
        # CHANGELOG [2025-01-31 - Claude + Gemini]
        # PROBLEM: ChromaDB's default ONNX MiniLM embedding function ignores
        # XDG_CACHE_HOME and other env vars. It hardcodes its download path
        # based on ~/.cache, which resolves to /.cache in containers where
        # HOME isn't set properly. UID 1000 can't write to /.cache.
        # This crashed the app with: PermissionError: [Errno 13] /.cache
        #
        # FIX (Gemini's approach): Import ONNXMiniLM_L6_V2 directly, override
        # its DOWNLOAD_PATH to our writable CHROMA_CACHE_DIR, then pass it
        # explicitly to every get_or_create_collection() call.
        #
        # WHY NOT JUST ENV VARS: We tried XDG_CACHE_HOME, HF_HOME, HOME=/tmp
        # in the Dockerfile. ChromaDB's ONNX code doesn't read them.
        # The DOWNLOAD_PATH override is the only reliable fix.
        #
        # BONUS: The embedding model download persists in /data/.cache across
        # restarts (if persistent storage is enabled), so subsequent startups
        # skip the download entirely.
        # =================================================================
        self.embedding_function = ONNXMiniLM_L6_V2()
        cache_dir = os.getenv("CHROMA_CACHE_DIR", "/tmp/.cache/chroma")
        os.makedirs(cache_dir, exist_ok=True)
        self.embedding_function.DOWNLOAD_PATH = cache_dir
        print(f"Embedding model cache: {cache_dir}")

        # Initialize ChromaDB for semantic search.
        # CHANGELOG [2025-01-31 - Claude]
        # Uses CHROMA_DB_PATH, resolved at module load to either
        # /data/chroma_db (persistent) or /workspace/chroma_db (ephemeral).
        # See _select_chroma_path() at the top of the file for the selection logic.
        self.chroma_client = chromadb.PersistentClient(
            path=CHROMA_DB_PATH,
            settings=Settings(
                anonymized_telemetry=False,
                allow_reset=True
            )
        )
        print(f"ChromaDB initialized at: {CHROMA_DB_PATH}")

        # Create or get the CODEBASE collection.
        # CHANGELOG [2025-01-31 - Claude + Gemini]
        # Switched to get_or_create_collection with an explicit embedding function.
        # Previous approach: try get_collection, except -> create_collection.
        # Problem: If create succeeded but _index_codebase crashed (e.g. the
        # /.cache error), the next restart would try get_collection on a
        # half-built collection, fail, try create again, and fail because the
        # name conflicts. get_or_create_collection handles all of this atomically.
        #
        # CRITICAL: embedding_function MUST be passed here. Without it,
        # ChromaDB falls back to its default embedding function, which tries
        # to download to /.cache and crashes. This was the root cause of the
        # PermissionError that blocked all indexing.
        collection_name = self._get_collection_name()
        self.collection = self.chroma_client.get_or_create_collection(
            name=collection_name,
            embedding_function=self.embedding_function,
            metadata={"description": "E-T Systems codebase"}
        )
        existing_count = self.collection.count()
        if existing_count > 0:
            print(f"Loaded existing index: {existing_count} files")
        else:
            print(f"Created new collection: {collection_name}")
            self._index_codebase()

        # Create or get the CONVERSATION collection for persistence.
        # CHANGELOG [2025-01-30 - Josh]: Added conversation persistence
        # CHANGELOG [2025-01-30 - Claude]: Added HF Dataset restore on startup
        # CHANGELOG [2025-01-31 - Claude + Gemini]: Now uses an explicit
        # embedding function and atomic get_or_create_collection
        conversations_name = f"conversations_{collection_name.split('_')[1]}"
        self.conversations = self.chroma_client.get_or_create_collection(
            name=conversations_name,
            embedding_function=self.embedding_function,
            metadata={"description": "Clawdbot conversation history"}
        )
        conv_count = self.conversations.count()
        if conv_count > 0:
            print(f"Loaded conversation history: {conv_count} exchanges")
        else:
            print(f"Created conversation collection: {conversations_name}")

        # RESTORE FROM CLOUD if local is empty but the cloud has data.
        # RATIONALE: Space restarted, ChromaDB wiped, but the HF Dataset has our history.
        if self.conversations.count() == 0:
            self._restore_from_cloud()

        # Track saves for periodic backup.
        # CHANGELOG [2025-01-31 - Gemini]: Set to 1 for reliability during validation.
        # Once persistence is confirmed working, this can go back to 3.
        # CHANGELOG [2025-01-31 - Claude]: Added the _is_first_save flag for an
        # immediate first-turn backup so even single-message sessions persist.
        self._saves_since_backup = 0
        self.BACKUP_EVERY_N_SAVES = 1  # Sync every turn while validating persistence
        self._is_first_save = True  # The first save always goes to cloud immediately

    def _restore_from_cloud(self):
        """
        Restore conversations from the HF Dataset to ChromaDB.

        CHANGELOG [2025-01-30 - Claude]
        Called when local ChromaDB is empty but the cloud might have data.
        This is the magic that makes memory survive restarts.
        """
        cloud_data = self.persistence.load_conversations()

        if not cloud_data:
            print("No cloud conversations to restore")
            return

        print(f"Restoring {len(cloud_data)} conversations from cloud...")

        restored = 0
        for conv in cloud_data:
            try:
                self.conversations.add(
                    documents=[conv["document"]],
                    metadatas=[conv["metadata"]],
                    ids=[conv["id"]]
                )
                restored += 1
            except Exception as e:
                # Might fail if the ID already exists (shouldn't happen, but safety first)
                print(f"Skipping conversation {conv.get('id')}: {e}")

        print(f"Restored {restored} conversations (total: {self.conversations.count()})")

    def _backup_to_cloud(self, force: bool = False):
        """
        Back up all conversations to the HF Dataset.

        CHANGELOG [2025-01-30 - Claude]
        Called periodically and on shutdown to ensure durability.

        Args:
            force: If True, save immediately, ignoring the debounce
        """
        if self.conversations.count() == 0:
            return

        # Get all conversations from ChromaDB
        all_convs = self.conversations.get(
            include=["documents", "metadatas"]
        )

        # Format for JSON storage
        backup_data = [
            {"id": id_, "document": doc, "metadata": meta}
            for doc, meta, id_ in zip(
                all_convs["documents"],
                all_convs["metadatas"],
                all_convs["ids"]
            )
        ]

        # Save to the cloud
        self.persistence.save_conversations(backup_data, force=force)

    def _get_collection_name(self) -> str:
        """Generate a unique collection name based on the repo path."""
        path_hash = hashlib.md5(str(self.repo_path).encode()).hexdigest()[:8]
        return f"codebase_{path_hash}"

    def _index_codebase(self):
        """
        Index all code files for semantic search.

        This creates the "environment" that the model can search through.
        We index with metadata so search results include file paths.
        """
        print(f"Indexing codebase at {self.repo_path}...")

        # File types to index
        code_extensions = {
            '.py', '.js', '.ts', '.tsx', '.jsx',
            '.md', '.txt', '.json', '.yaml', '.yml'
        }

        # Skip these directories
        skip_dirs = {
            'node_modules', '.git', '__pycache__', 'venv',
            'env', '.venv', 'dist', 'build'
        }

        documents = []
        metadatas = []
        ids = []

        for file_path in self.repo_path.rglob('*'):
            # Skip directories and non-code files
            if file_path.is_dir():
                continue
            if any(skip in file_path.parts for skip in skip_dirs):
                continue
            if file_path.suffix not in code_extensions:
                continue

            try:
                content = file_path.read_text(encoding='utf-8', errors='ignore')

                # Don't index empty files or massive files
                if not content.strip() or len(content) > 100000:
                    continue

                relative_path = str(file_path.relative_to(self.repo_path))

                documents.append(content)
                metadatas.append({
                    "path": relative_path,
                    "type": file_path.suffix[1:],  # Remove the leading dot
                    "size": len(content)
                })
                ids.append(relative_path)

            except Exception as e:
                print(f"Skipping {file_path.name}: {e}")
                continue

        if documents:
            # Add to the collection in batches
            batch_size = 100
            for i in range(0, len(documents), batch_size):
                self.collection.add(
                    documents=documents[i:i+batch_size],
                    metadatas=metadatas[i:i+batch_size],
                    ids=ids[i:i+batch_size]
                )

            print(f"Indexed {len(documents)} files")
        else:
            print("No files found to index")

    def search_code(self, query: str, n_results: int = 5) -> List[Dict]:
        """
        Search the codebase semantically.

        This is a TOOL available to the model for recursive retrieval.
        The model can search for concepts without knowing exact file names.

        Args:
            query: What to search for (e.g. "surprise detection", "vector embedding")
            n_results: How many results to return

        Returns:
            List of dicts with {file, snippet, relevance}
        """
        if self.collection.count() == 0:
            return [{"error": "No files indexed yet"}]

        results = self.collection.query(
            query_texts=[query],
            n_results=min(n_results, self.collection.count())
        )

        # Format results for the model.
        # Truncate to 500 chars for search results - the model can read_file()
        # for the full content.
        formatted = []
        for doc, meta, dist in zip(
            results['documents'][0],
            results['metadatas'][0],
            results['distances'][0]
        ):
            snippet = doc[:500]
            if len(doc) > 500:
                snippet += "... [truncated, use read_file to see more]"

            formatted.append({
                "file": meta['path'],
                "snippet": snippet,
                "relevance": round(1 - dist, 3),
                "type": meta['type']
            })

        return formatted

    def read_file(self, path: str, start_line: Optional[int] = None,
                  end_line: Optional[int] = None) -> str:
        """
        Read a specific file or line range.

        This is a TOOL available to the model.
        After searching, the model can read full files as needed.

        Args:
            path: Relative path to the file
            start_line: Optional starting line number (1-indexed)
            end_line: Optional ending line number (1-indexed, inclusive)

        Returns:
            File content or the specified lines
        """
        full_path = (self.repo_path / path).resolve()

        # Resolve before checking, so "../" components can't escape the repo
        if not full_path.is_relative_to(self.repo_path.resolve()):
            return "Error: Path outside repository"

        if not full_path.exists():
            return f"Error: File not found: {path}"

        try:
            content = full_path.read_text(encoding='utf-8', errors='ignore')

            if start_line is not None:
                content_lines = content.split('\n')
                # Adjust for 1-indexing; a missing end_line means "to EOF"
                selected_lines = content_lines[start_line - 1:end_line]
                return '\n'.join(selected_lines)

            return content

        except Exception as e:
            return f"Error reading file: {str(e)}"

    def search_testament(self, query: str) -> str:
        """
        Search architectural decisions in the Testament.

        This is a TOOL available to the model.
        It helps the model understand design rationale.

        Args:
            query: What decision to look for

        Returns:
            Relevant Testament sections
        """
        testament_path = self.repo_path / "TESTAMENT.md"

        if not testament_path.exists():
            return "Testament not found. No architectural decisions recorded yet."

        try:
            content = testament_path.read_text(encoding='utf-8')

            # Split into sections (marked by ## headers)
            sections = content.split('\n## ')

            # Simple relevance: sections that contain the query terms
            query_lower = query.lower()
            relevant = []

            for section in sections:
                if query_lower in section.lower():
                    # Include the section with its header
                    if not section.startswith('#'):
                        section = '## ' + section
                    relevant.append(section)

            if relevant:
                return '\n\n'.join(relevant)
            else:
                return f"No Testament entries found matching '{query}'"

        except Exception as e:
            return f"Error searching Testament: {str(e)}"

    def list_files(self, directory: str = ".") -> List[str]:
        """
        List files in a directory.

        This is a TOOL available to the model.
        It helps the model explore the repository structure.

        Args:
            directory: Directory to list (relative path)

        Returns:
            List of file/directory names
        """
        dir_path = (self.repo_path / directory).resolve()

        # Resolve before checking, so "../" components can't escape the repo
        if not dir_path.is_relative_to(self.repo_path.resolve()):
            return ["Error: Path outside repository"]

        if not dir_path.exists():
            return [f"Error: Directory not found: {directory}"]

        try:
            items = []
            for item in sorted(dir_path.iterdir()):
                # Skip hidden and system directories
                if item.name.startswith('.'):
                    continue
                if item.name in {'node_modules', '__pycache__', 'venv'}:
                    continue

                # Mark directories with /
                if item.is_dir():
                    items.append(f"{item.name}/")
                else:
                    items.append(item.name)

            return items

        except Exception as e:
            return [f"Error listing directory: {str(e)}"]

    def save_conversation_turn(self, user_message: str, assistant_message: str, turn_id: int):
        """
        Save a conversation turn to persistent storage.

        CHANGELOG [2025-01-30 - Josh]
        Implements the MIT recursive technique for conversations.
        Chat history becomes searchable context that persists across sessions.

        CHANGELOG [2025-01-30 - Claude]
        Added cloud backup integration. Every N saves triggers an HF Dataset backup.

        CHANGELOG [2025-01-31 - Claude]
        FIXED: The first conversation turn now triggers an immediate cloud backup.
        Previously, a user could have a single exchange and leave, and the
        debounce timer would prevent the cloud save from ever firing.

        CHANGELOG [2025-01-31 - Gemini]
        BACKUP_EVERY_N_SAVES set to 1 for reliability while validating persistence.

        Args:
            user_message: What the user said
            assistant_message: What Clawdbot responded
            turn_id: Unique ID for this turn (timestamp-based)
        """
        # Create a combined document for semantic search
        combined = f"USER: {user_message}\n\nASSISTANT: {assistant_message}"

        # Generate a unique ID with a timestamp to avoid collisions
        unique_id = f"turn_{int(time.time())}_{turn_id}"

        # Save to ChromaDB (fast local access)
        self.conversations.add(
            documents=[combined],
            metadatas=[{
                "user": user_message[:500],  # Truncate for metadata
                "assistant": assistant_message[:500],
                "timestamp": int(time.time()),
                "turn": turn_id
            }],
            ids=[unique_id]
        )

        print(f"Saved conversation turn {turn_id} (total: {self.conversations.count()})")

        # CLOUD BACKUP LOGIC
        # CHANGELOG [2025-01-31 - Claude]
        # The first save always goes to the cloud immediately (force=True).
        # This ensures even single-message sessions persist.
        # Subsequent saves follow the periodic backup schedule.
        if self._is_first_save:
            print("First conversation turn - forcing immediate cloud backup")
            self._backup_to_cloud(force=True)
            self._is_first_save = False
            self._saves_since_backup = 0
        else:
            # Periodic cloud backup.
            # RATIONALE: Don't back up every message (API spam), but don't wait
            # too long. Currently set to 1 for validation. Bump to 3 once
            # persistence is confirmed.
            self._saves_since_backup += 1
            if self._saves_since_backup >= self.BACKUP_EVERY_N_SAVES:
                self._backup_to_cloud()
                self._saves_since_backup = 0

    def search_conversations(self, query: str, n_results: int = 5) -> List[Dict]:
        """
        Search past conversations for relevant context.

        This enables TRUE unlimited context - Clawdbot can remember
        everything ever discussed by searching its own conversation history.

        Args:
            query: What to search for in past conversations
            n_results: How many results to return

        Returns:
            List of past conversation turns with user/assistant messages
        """
        if self.conversations.count() == 0:
            return []

        results = self.conversations.query(
            query_texts=[query],
            n_results=min(n_results, self.conversations.count())
        )

        formatted = []
        for doc, metadata, dist in zip(
            results['documents'][0],
            results['metadatas'][0],
            results['distances'][0]
        ):
            formatted.append({
                "turn": metadata.get("turn", "unknown"),
                "user": metadata.get("user", ""),
                "assistant": metadata.get("assistant", ""),
                "full_text": doc,
                # Same scoring as search_code, so the model sees one scale
                "relevance": round(1 - dist, 3)
            })

        return formatted

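    # -----------------------------------------------------------------
    # ILLUSTRATIVE USAGE SKETCH (not part of the original module): a
    # save/search round-trip, assuming a chat handler that calls
    # save_conversation_turn() once per exchange. The repo path and
    # messages below are placeholders.
    #
    #   ctx = RecursiveContextManager(repo_path="/workspace/repo")
    #   ctx.save_conversation_turn(
    #       user_message="How does Genesis handle surprise?",
    #       assistant_message="It spikes attention in genesis/substrate.py ...",
    #       turn_id=1,
    #   )
    #   # Later - even after a restart, once _restore_from_cloud() has run:
    #   for hit in ctx.search_conversations("surprise handling"):
    #       print(hit["turn"], hit["relevance"], hit["user"])
    # -----------------------------------------------------------------
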
    def get_conversation_count(self) -> int:
        """Get the total number of saved conversation turns."""
        return self.conversations.count()

    def get_stats(self) -> Dict:
        """
        Get statistics about the indexed codebase.

        CHANGELOG [2025-01-31 - Claude]
        Added storage_path and cloud backup status for better diagnostics.

        Returns:
            Dict with file counts, sizes, etc.
        """
        return {
            "total_files": self.collection.count(),
            "repo_path": str(self.repo_path),
            "collection_name": self.collection.name,
            "conversations": self.conversations.count(),
            "storage_path": CHROMA_DB_PATH,
            "cloud_backup_configured": self.persistence.is_configured,
            "cloud_backup_repo": self.persistence.repo_id or "Not set"
        }

    def force_backup(self):
        """
        Force an immediate backup to the cloud.

        CHANGELOG [2025-01-30 - Claude]
        Call this on app shutdown to ensure no data loss.
        """
        print("Forcing cloud backup...")
        self._backup_to_cloud(force=True)
        print("Backup complete")

    def shutdown(self):
        """
        Clean shutdown - ensure all data is saved.

        CHANGELOG [2025-01-30 - Claude]
        Call this when the Space is shutting down.
        """
        print("Shutting down RecursiveContextManager...")
        self.force_backup()
        print("Shutdown complete")